docling-core 1.7.2__tar.gz → 2.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (66)
  1. {docling_core-1.7.2 → docling_core-2.0.1}/PKG-INFO +17 -17
  2. {docling_core-1.7.2 → docling_core-2.0.1}/README.md +15 -15
  3. {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/transforms/chunker/__init__.py +2 -8
  4. docling_core-2.0.1/docling_core/transforms/chunker/base.py +61 -0
  5. docling_core-2.0.1/docling_core/transforms/chunker/hierarchical_chunker.py +186 -0
  6. docling_core-2.0.1/docling_core/types/__init__.py +10 -0
  7. {docling_core-1.7.2/docling_core/types/experimental → docling_core-2.0.1/docling_core/types/doc}/__init__.py +4 -3
  8. {docling_core-1.7.2/docling_core/types/experimental → docling_core-2.0.1/docling_core/types/doc}/base.py +4 -1
  9. {docling_core-1.7.2/docling_core/types/experimental → docling_core-2.0.1/docling_core/types/doc}/document.py +332 -83
  10. {docling_core-1.7.2/docling_core/types/experimental → docling_core-2.0.1/docling_core/types/doc}/labels.py +4 -1
  11. {docling_core-1.7.2/docling_core/types/doc → docling_core-2.0.1/docling_core/types/legacy_doc}/base.py +1 -1
  12. {docling_core-1.7.2/docling_core/types/doc → docling_core-2.0.1/docling_core/types/legacy_doc}/doc_ann.py +1 -1
  13. {docling_core-1.7.2/docling_core/types/doc → docling_core-2.0.1/docling_core/types/legacy_doc}/doc_ocr.py +1 -1
  14. {docling_core-1.7.2/docling_core/types/doc → docling_core-2.0.1/docling_core/types/legacy_doc}/doc_raw.py +1 -1
  15. {docling_core-1.7.2/docling_core/types/doc → docling_core-2.0.1/docling_core/types/legacy_doc}/document.py +8 -4
  16. {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/types/rec/subject.py +1 -1
  17. docling_core-2.0.1/docling_core/utils/generate_docs.py +82 -0
  18. docling_core-1.7.2/docling_core/utils/ds_generate_jsonschema.py → docling_core-2.0.1/docling_core/utils/generate_jsonschema.py +4 -4
  19. {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/utils/validators.py +3 -3
  20. {docling_core-1.7.2 → docling_core-2.0.1}/pyproject.toml +4 -5
  21. docling_core-1.7.2/docling_core/transforms/chunker/base.py +0 -74
  22. docling_core-1.7.2/docling_core/transforms/chunker/hierarchical_chunker.py +0 -354
  23. docling_core-1.7.2/docling_core/transforms/id_generator/__init__.py +0 -12
  24. docling_core-1.7.2/docling_core/transforms/id_generator/base.py +0 -30
  25. docling_core-1.7.2/docling_core/transforms/id_generator/doc_hash_id_generator.py +0 -27
  26. docling_core-1.7.2/docling_core/transforms/id_generator/uuid_generator.py +0 -34
  27. docling_core-1.7.2/docling_core/transforms/metadata_extractor/__init__.py +0 -13
  28. docling_core-1.7.2/docling_core/transforms/metadata_extractor/base.py +0 -59
  29. docling_core-1.7.2/docling_core/transforms/metadata_extractor/simple_metadata_extractor.py +0 -59
  30. docling_core-1.7.2/docling_core/types/__init__.py +0 -25
  31. docling_core-1.7.2/docling_core/utils/ds_generate_docs.py +0 -144
  32. {docling_core-1.7.2 → docling_core-2.0.1}/LICENSE +0 -0
  33. {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/__init__.py +0 -0
  34. {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/py.typed +0 -0
  35. {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/resources/schemas/doc/ANN.json +0 -0
  36. {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/resources/schemas/doc/DOC.json +0 -0
  37. {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  38. {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/resources/schemas/doc/RAW.json +0 -0
  39. {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  40. {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  41. {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  42. {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  43. {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/search/__init__.py +0 -0
  44. {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  45. {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/search/mapping.py +0 -0
  46. {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/search/meta.py +0 -0
  47. {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/search/package.py +0 -0
  48. {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/transforms/__init__.py +0 -0
  49. {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/types/base.py +0 -0
  50. {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/types/gen/__init__.py +0 -0
  51. {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/types/gen/generic.py +0 -0
  52. {docling_core-1.7.2/docling_core/types/doc → docling_core-2.0.1/docling_core/types/legacy_doc}/__init__.py +0 -0
  53. {docling_core-1.7.2/docling_core/types/doc → docling_core-2.0.1/docling_core/types/legacy_doc}/tokens.py +0 -0
  54. {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/types/nlp/__init__.py +0 -0
  55. {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/types/nlp/qa.py +0 -0
  56. {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/types/nlp/qa_labels.py +0 -0
  57. {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/types/rec/__init__.py +0 -0
  58. {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/types/rec/attribute.py +0 -0
  59. {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/types/rec/base.py +0 -0
  60. {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/types/rec/predicate.py +0 -0
  61. {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/types/rec/record.py +0 -0
  62. {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/types/rec/statement.py +0 -0
  63. {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/utils/__init__.py +0 -0
  64. {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/utils/alias.py +0 -0
  65. {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/utils/file.py +0 -0
  66. {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/utils/validate.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 1.7.2
3
+ Version: 2.0.1
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -25,10 +25,10 @@ Classifier: Topic :: Database
25
25
  Classifier: Topic :: Scientific/Engineering :: Information Analysis
26
26
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
27
27
  Classifier: Typing :: Typed
28
- Requires-Dist: json-schema-for-humans (>=1.0.0,<2.0.0)
29
28
  Requires-Dist: jsonref (>=1.1.0,<2.0.0)
30
29
  Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
31
30
  Requires-Dist: pandas (>=2.1.4,<3.0.0)
31
+ Requires-Dist: pillow (>=10.3.0,<11.0.0)
32
32
  Requires-Dist: pydantic (>=2.6.0,<3.0.0)
33
33
  Requires-Dist: tabulate (>=0.9.0,<0.10.0)
34
34
  Project-URL: Repository, https://github.com/DS4SD/docling-core
@@ -37,7 +37,7 @@ Description-Content-Type: text/markdown
37
37
  # Docling Core
38
38
 
39
39
  [![PyPI version](https://img.shields.io/pypi/v/docling-core)](https://pypi.org/project/docling-core/)
40
- ![Python](https://img.shields.io/badge/python-3.9%20%7C%203.10%20%7C%203.11%20%7C%203.12-blue)
40
+ ![Python](https://img.shields.io/badge/python-3.9%20%7C%203.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue)
41
41
  [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
42
42
  [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
43
43
  [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
@@ -57,7 +57,7 @@ pip install docling-core
57
57
 
58
58
  ### Development setup
59
59
 
60
- To develop for Docling Core, you need Python 3.9 / 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
60
+ To develop for Docling Core, you need Python 3.9 / 3.10 / 3.11 / 3.12 / 3.13 and Poetry. You can then install from your local clone's root dir:
61
61
  ```bash
62
62
  poetry install
63
63
  ```
@@ -72,37 +72,37 @@ poetry run pytest test
72
72
  - You can validate your JSON objects using the pydantic class definition.
73
73
 
74
74
  ```py
75
- from docling_core.types import Document
75
+ from docling_core.types import DoclingDocument
76
76
 
77
77
  data_dict = {...} # here the object you want to validate, as a dictionary
78
- Document.model_validate(data_dict)
78
+ DoclingDocument.model_validate(data_dict)
79
79
 
80
80
  data_str = {...} # here the object as a JSON string
81
- Document.model_validate_json(data_str)
81
+ DoclingDocument.model_validate_json(data_str)
82
82
  ```
83
83
 
84
- - You can generate the JSON schema of a model with the script `ds_generate_jsonschema`.
84
+ - You can generate the JSON schema of a model with the script `generate_jsonschema`.
85
85
 
86
86
  ```py
87
- # for the `Document` type
88
- ds_generate_jsonschema Document
87
+ # for the `DoclingDocument` type
88
+ generate_jsonschema DoclingDocument
89
89
 
90
90
  # for the use `Record` type
91
- ds_generate_jsonschema Record
91
+ generate_jsonschema Record
92
92
  ```
93
93
 
94
94
  ## Documentation
95
95
 
96
- Docling supports 3 main data types:
96
+ Docling Core contains 3 top-level data types:
97
97
 
98
- - **Document** for publications like books, articles, reports, or patents. When Docling converts an unstructured PDF document, the generated JSON follows this schema.
99
- The Document type also models the metadata that may be attached to the converted document.
100
- Check [Document](docs/Document.md) for the full JSON schema.
98
+ - **DoclingDocument** for publications like books, articles, reports, or patents. When Docling converts an unstructured PDF document, the generated JSON follows this schema.
99
+ The DoclingDocument type also models the metadata that may be attached to the converted document.
100
+ Check [DoclingDocument](docs/DoclingDocument.json) for the full JSON schema.
101
101
  - **Record** for structured database records, centered on an entity or _subject_ that is provided with a list of attributes.
102
102
  Related to records, the statements can represent annotations on text by Natural Language Processing (NLP) tools.
103
- Check [Record](docs/Record.md) for the full JSON schema.
103
+ Check [Record](docs/Record.json) for the full JSON schema.
104
104
  - **Generic** for any data representation, ensuring minimal configuration and maximum flexibility.
105
- Check [Generic](docs/Generic.md) for the full JSON schema.
105
+ Check [Generic](docs/Generic.json) for the full JSON schema.
106
106
 
107
107
  The data schemas are defined using [pydantic](https://pydantic-docs.helpmanual.io/) models, which provide built-in processes to support the creation of data that adhere to those models.
108
108
 
@@ -1,7 +1,7 @@
1
1
  # Docling Core
2
2
 
3
3
  [![PyPI version](https://img.shields.io/pypi/v/docling-core)](https://pypi.org/project/docling-core/)
4
- ![Python](https://img.shields.io/badge/python-3.9%20%7C%203.10%20%7C%203.11%20%7C%203.12-blue)
4
+ ![Python](https://img.shields.io/badge/python-3.9%20%7C%203.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue)
5
5
  [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
6
6
  [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
7
7
  [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
@@ -21,7 +21,7 @@ pip install docling-core
21
21
 
22
22
  ### Development setup
23
23
 
24
- To develop for Docling Core, you need Python 3.9 / 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
24
+ To develop for Docling Core, you need Python 3.9 / 3.10 / 3.11 / 3.12 / 3.13 and Poetry. You can then install from your local clone's root dir:
25
25
  ```bash
26
26
  poetry install
27
27
  ```
@@ -36,37 +36,37 @@ poetry run pytest test
36
36
  - You can validate your JSON objects using the pydantic class definition.
37
37
 
38
38
  ```py
39
- from docling_core.types import Document
39
+ from docling_core.types import DoclingDocument
40
40
 
41
41
  data_dict = {...} # here the object you want to validate, as a dictionary
42
- Document.model_validate(data_dict)
42
+ DoclingDocument.model_validate(data_dict)
43
43
 
44
44
  data_str = {...} # here the object as a JSON string
45
- Document.model_validate_json(data_str)
45
+ DoclingDocument.model_validate_json(data_str)
46
46
  ```
47
47
 
48
- - You can generate the JSON schema of a model with the script `ds_generate_jsonschema`.
48
+ - You can generate the JSON schema of a model with the script `generate_jsonschema`.
49
49
 
50
50
  ```py
51
- # for the `Document` type
52
- ds_generate_jsonschema Document
51
+ # for the `DoclingDocument` type
52
+ generate_jsonschema DoclingDocument
53
53
 
54
54
  # for the use `Record` type
55
- ds_generate_jsonschema Record
55
+ generate_jsonschema Record
56
56
  ```
57
57
 
58
58
  ## Documentation
59
59
 
60
- Docling supports 3 main data types:
60
+ Docling Core contains 3 top-level data types:
61
61
 
62
- - **Document** for publications like books, articles, reports, or patents. When Docling converts an unstructured PDF document, the generated JSON follows this schema.
63
- The Document type also models the metadata that may be attached to the converted document.
64
- Check [Document](docs/Document.md) for the full JSON schema.
62
+ - **DoclingDocument** for publications like books, articles, reports, or patents. When Docling converts an unstructured PDF document, the generated JSON follows this schema.
63
+ The DoclingDocument type also models the metadata that may be attached to the converted document.
64
+ Check [DoclingDocument](docs/DoclingDocument.json) for the full JSON schema.
65
65
  - **Record** for structured database records, centered on an entity or _subject_ that is provided with a list of attributes.
66
66
  Related to records, the statements can represent annotations on text by Natural Language Processing (NLP) tools.
67
- Check [Record](docs/Record.md) for the full JSON schema.
67
+ Check [Record](docs/Record.json) for the full JSON schema.
68
68
  - **Generic** for any data representation, ensuring minimal configuration and maximum flexibility.
69
- Check [Generic](docs/Generic.md) for the full JSON schema.
69
+ Check [Generic](docs/Generic.json) for the full JSON schema.
70
70
 
71
71
  The data schemas are defined using [pydantic](https://pydantic-docs.helpmanual.io/) models, which provide built-in processes to support the creation of data that adhere to those models.
72
72
 
@@ -5,11 +5,5 @@
5
5
 
6
6
  """Define the chunker types."""
7
7
 
8
- from docling_core.transforms.chunker.base import ( # noqa
9
- BaseChunker,
10
- Chunk,
11
- ChunkWithMetadata,
12
- )
13
- from docling_core.transforms.chunker.hierarchical_chunker import ( # noqa
14
- HierarchicalChunker,
15
- )
8
+ from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta
9
+ from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
@@ -0,0 +1,61 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Define base classes for chunking."""
7
+ from abc import ABC, abstractmethod
8
+ from typing import Any, ClassVar, Iterator
9
+
10
+ from pydantic import BaseModel
11
+
12
+ from docling_core.types.doc import DoclingDocument as DLDocument
13
+
14
+
15
+ class BaseMeta(BaseModel):
16
+ """Metadata base class."""
17
+
18
+ excluded_embed: ClassVar[list[str]] = []
19
+ excluded_llm: ClassVar[list[str]] = []
20
+
21
+ def export_json_dict(self) -> dict[str, Any]:
22
+ """Helper method for exporting non-None keys to JSON mode.
23
+
24
+ Returns:
25
+ dict[str, Any]: The exported dictionary.
26
+ """
27
+ return self.model_dump(mode="json", by_alias=True, exclude_none=True)
28
+
29
+
30
+ class BaseChunk(BaseModel):
31
+ """Chunk base class."""
32
+
33
+ text: str
34
+ meta: BaseMeta
35
+
36
+ def export_json_dict(self) -> dict[str, Any]:
37
+ """Helper method for exporting non-None keys to JSON mode.
38
+
39
+ Returns:
40
+ dict[str, Any]: The exported dictionary.
41
+ """
42
+ return self.model_dump(mode="json", by_alias=True, exclude_none=True)
43
+
44
+
45
+ class BaseChunker(BaseModel, ABC):
46
+ """Chunker base class."""
47
+
48
+ @abstractmethod
49
+ def chunk(self, dl_doc: DLDocument, **kwargs) -> Iterator[BaseChunk]:
50
+ """Chunk the provided document.
51
+
52
+ Args:
53
+ dl_doc (DLDocument): document to chunk
54
+
55
+ Raises:
56
+ NotImplementedError: in this abstract implementation
57
+
58
+ Yields:
59
+ Iterator[BaseChunk]: iterator over extracted chunks
60
+ """
61
+ raise NotImplementedError()
@@ -0,0 +1,186 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Chunker implementation leveraging the document structure."""
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ from typing import Any, ClassVar, Iterator, Optional
12
+
13
+ from pandas import DataFrame
14
+ from pydantic import Field
15
+
16
+ from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
17
+ from docling_core.types.doc import DoclingDocument as DLDocument
18
+ from docling_core.types.doc.document import (
19
+ DocItem,
20
+ LevelNumber,
21
+ ListItem,
22
+ SectionHeaderItem,
23
+ TableItem,
24
+ TextItem,
25
+ )
26
+ from docling_core.types.doc.labels import DocItemLabel
27
+
28
+ _KEY_DOC_ITEMS = "doc_items"
29
+ _KEY_HEADINGS = "headings"
30
+ _KEY_CAPTIONS = "captions"
31
+
32
+ _logger = logging.getLogger(__name__)
33
+
34
+
35
+ class DocMeta(BaseMeta):
36
+ """Data model for Hierarchical Chunker metadata."""
37
+
38
+ doc_items: list[DocItem] = Field(
39
+ alias=_KEY_DOC_ITEMS,
40
+ min_length=1,
41
+ )
42
+ headings: Optional[list[str]] = Field(
43
+ default=None,
44
+ alias=_KEY_HEADINGS,
45
+ min_length=1,
46
+ )
47
+ captions: Optional[list[str]] = Field(
48
+ default=None,
49
+ alias=_KEY_CAPTIONS,
50
+ min_length=1,
51
+ )
52
+
53
+ excluded_embed: ClassVar[list[str]] = [_KEY_DOC_ITEMS]
54
+ excluded_llm: ClassVar[list[str]] = [_KEY_DOC_ITEMS]
55
+
56
+
57
+ class DocChunk(BaseChunk):
58
+ """Data model for Hierarchical Chunker chunks."""
59
+
60
+ meta: DocMeta
61
+
62
+
63
+ class HierarchicalChunker(BaseChunker):
64
+ r"""Chunker implementation leveraging the document layout.
65
+
66
+ Args:
67
+ merge_list_items (bool): Whether to merge successive list items.
68
+ Defaults to True.
69
+ delim (str): Delimiter to use for merging text. Defaults to "\n".
70
+ """
71
+
72
+ merge_list_items: bool = True
73
+ delim: str = "\n"
74
+
75
+ @classmethod
76
+ def _triplet_serialize(cls, table_df: DataFrame) -> str:
77
+
78
+ # copy header as first row and shift all rows by one
79
+ table_df.loc[-1] = table_df.columns # type: ignore[call-overload]
80
+ table_df.index = table_df.index + 1
81
+ table_df = table_df.sort_index()
82
+
83
+ rows = [item.strip() for item in table_df.iloc[:, 0].to_list()]
84
+ cols = [item.strip() for item in table_df.iloc[0, :].to_list()]
85
+
86
+ nrows = table_df.shape[0]
87
+ ncols = table_df.shape[1]
88
+ texts = [
89
+ f"{rows[i]}, {cols[j]} = {str(table_df.iloc[i, j]).strip()}"
90
+ for i in range(1, nrows)
91
+ for j in range(1, ncols)
92
+ ]
93
+ output_text = ". ".join(texts)
94
+
95
+ return output_text
96
+
97
+ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
98
+ r"""Chunk the provided document.
99
+
100
+ Args:
101
+ dl_doc (DLDocument): document to chunk
102
+
103
+ Yields:
104
+ Iterator[Chunk]: iterator over extracted chunks
105
+ """
106
+ heading_by_level: dict[LevelNumber, str] = {}
107
+ list_items: list[TextItem] = []
108
+ for item, level in dl_doc.iterate_items():
109
+ captions = None
110
+ if isinstance(item, DocItem):
111
+
112
+ # first handle any merging needed
113
+ if self.merge_list_items:
114
+ if isinstance(
115
+ item, ListItem
116
+ ) or ( # TODO remove when all captured as ListItem:
117
+ isinstance(item, TextItem)
118
+ and item.label == DocItemLabel.LIST_ITEM
119
+ ):
120
+ list_items.append(item)
121
+ continue
122
+ elif list_items: # need to yield
123
+ yield DocChunk(
124
+ text=self.delim.join([i.text for i in list_items]),
125
+ meta=DocMeta(
126
+ doc_items=list_items,
127
+ headings=[
128
+ heading_by_level[k]
129
+ for k in sorted(heading_by_level)
130
+ ]
131
+ or None,
132
+ ),
133
+ )
134
+ list_items = [] # reset
135
+
136
+ if isinstance(
137
+ item, SectionHeaderItem
138
+ ) or ( # TODO remove when all captured as SectionHeaderItem:
139
+ isinstance(item, TextItem)
140
+ and item.label == DocItemLabel.SECTION_HEADER
141
+ ):
142
+ # TODO second branch not needed once cleanup above complete:
143
+ level = item.level if isinstance(item, SectionHeaderItem) else 1
144
+ heading_by_level[level] = item.text
145
+
146
+ # remove headings of higher level as they just went out of scope
147
+ keys_to_del = [k for k in heading_by_level if k > level]
148
+ for k in keys_to_del:
149
+ heading_by_level.pop(k, None)
150
+ continue
151
+
152
+ if isinstance(item, TextItem) or (
153
+ (not self.merge_list_items) and isinstance(item, ListItem)
154
+ ):
155
+ text = item.text
156
+ elif isinstance(item, TableItem):
157
+ table_df = item.export_to_dataframe()
158
+ if table_df.shape[0] < 1 or table_df.shape[1] < 2:
159
+ # at least two cols needed, as first column contains row headers
160
+ continue
161
+ text = self._triplet_serialize(table_df=table_df)
162
+ captions = [
163
+ c.text for c in [r.resolve(dl_doc) for r in item.captions]
164
+ ] or None
165
+ else:
166
+ continue
167
+ c = DocChunk(
168
+ text=text,
169
+ meta=DocMeta(
170
+ doc_items=[item],
171
+ headings=[heading_by_level[k] for k in sorted(heading_by_level)]
172
+ or None,
173
+ captions=captions,
174
+ ),
175
+ )
176
+ yield c
177
+
178
+ if self.merge_list_items and list_items: # need to yield
179
+ yield DocChunk(
180
+ text=self.delim.join([i.text for i in list_items]),
181
+ meta=DocMeta(
182
+ doc_items=list_items,
183
+ headings=[heading_by_level[k] for k in sorted(heading_by_level)]
184
+ or None,
185
+ ),
186
+ )
@@ -0,0 +1,10 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Define the main types."""
7
+
8
+ from docling_core.types.doc.document import DoclingDocument
9
+ from docling_core.types.gen.generic import Generic
10
+ from docling_core.types.rec.record import Record
@@ -7,9 +7,6 @@
7
7
 
8
8
  from .base import BoundingBox, CoordOrigin, Size
9
9
  from .document import (
10
- BasePictureData,
11
- BaseTableData,
12
- DescriptionItem,
13
10
  DocItem,
14
11
  DoclingDocument,
15
12
  DocumentOrigin,
@@ -19,11 +16,15 @@ from .document import (
19
16
  KeyValueItem,
20
17
  NodeItem,
21
18
  PageItem,
19
+ PictureClassificationClass,
20
+ PictureClassificationData,
21
+ PictureDataType,
22
22
  PictureItem,
23
23
  ProvenanceItem,
24
24
  RefItem,
25
25
  SectionHeaderItem,
26
26
  TableCell,
27
+ TableData,
27
28
  TableItem,
28
29
  TextItem,
29
30
  )
@@ -108,7 +108,10 @@ class BoundingBox(BaseModel):
108
108
 
109
109
  def area(self) -> float:
110
110
  """area."""
111
- return (self.r - self.l) * (self.b - self.t)
111
+ area = (self.r - self.l) * (self.b - self.t)
112
+ if self.coord_origin == CoordOrigin.BOTTOMLEFT:
113
+ area = -area
114
+ return area
112
115
 
113
116
  def intersection_area_with(self, other: "BoundingBox") -> float:
114
117
  """intersection_area_with.