docling-core 1.7.2__tar.gz → 2.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of docling-core might be problematic.
- {docling_core-1.7.2 → docling_core-2.0.1}/PKG-INFO +17 -17
- {docling_core-1.7.2 → docling_core-2.0.1}/README.md +15 -15
- {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/transforms/chunker/__init__.py +2 -8
- docling_core-2.0.1/docling_core/transforms/chunker/base.py +61 -0
- docling_core-2.0.1/docling_core/transforms/chunker/hierarchical_chunker.py +186 -0
- docling_core-2.0.1/docling_core/types/__init__.py +10 -0
- {docling_core-1.7.2/docling_core/types/experimental → docling_core-2.0.1/docling_core/types/doc}/__init__.py +4 -3
- {docling_core-1.7.2/docling_core/types/experimental → docling_core-2.0.1/docling_core/types/doc}/base.py +4 -1
- {docling_core-1.7.2/docling_core/types/experimental → docling_core-2.0.1/docling_core/types/doc}/document.py +332 -83
- {docling_core-1.7.2/docling_core/types/experimental → docling_core-2.0.1/docling_core/types/doc}/labels.py +4 -1
- {docling_core-1.7.2/docling_core/types/doc → docling_core-2.0.1/docling_core/types/legacy_doc}/base.py +1 -1
- {docling_core-1.7.2/docling_core/types/doc → docling_core-2.0.1/docling_core/types/legacy_doc}/doc_ann.py +1 -1
- {docling_core-1.7.2/docling_core/types/doc → docling_core-2.0.1/docling_core/types/legacy_doc}/doc_ocr.py +1 -1
- {docling_core-1.7.2/docling_core/types/doc → docling_core-2.0.1/docling_core/types/legacy_doc}/doc_raw.py +1 -1
- {docling_core-1.7.2/docling_core/types/doc → docling_core-2.0.1/docling_core/types/legacy_doc}/document.py +8 -4
- {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/types/rec/subject.py +1 -1
- docling_core-2.0.1/docling_core/utils/generate_docs.py +82 -0
- docling_core-1.7.2/docling_core/utils/ds_generate_jsonschema.py → docling_core-2.0.1/docling_core/utils/generate_jsonschema.py +4 -4
- {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/utils/validators.py +3 -3
- {docling_core-1.7.2 → docling_core-2.0.1}/pyproject.toml +4 -5
- docling_core-1.7.2/docling_core/transforms/chunker/base.py +0 -74
- docling_core-1.7.2/docling_core/transforms/chunker/hierarchical_chunker.py +0 -354
- docling_core-1.7.2/docling_core/transforms/id_generator/__init__.py +0 -12
- docling_core-1.7.2/docling_core/transforms/id_generator/base.py +0 -30
- docling_core-1.7.2/docling_core/transforms/id_generator/doc_hash_id_generator.py +0 -27
- docling_core-1.7.2/docling_core/transforms/id_generator/uuid_generator.py +0 -34
- docling_core-1.7.2/docling_core/transforms/metadata_extractor/__init__.py +0 -13
- docling_core-1.7.2/docling_core/transforms/metadata_extractor/base.py +0 -59
- docling_core-1.7.2/docling_core/transforms/metadata_extractor/simple_metadata_extractor.py +0 -59
- docling_core-1.7.2/docling_core/types/__init__.py +0 -25
- docling_core-1.7.2/docling_core/utils/ds_generate_docs.py +0 -144
- {docling_core-1.7.2 → docling_core-2.0.1}/LICENSE +0 -0
- {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/__init__.py +0 -0
- {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/py.typed +0 -0
- {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/search/__init__.py +0 -0
- {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/search/mapping.py +0 -0
- {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/search/meta.py +0 -0
- {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/search/package.py +0 -0
- {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/transforms/__init__.py +0 -0
- {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/types/base.py +0 -0
- {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/types/gen/generic.py +0 -0
- {docling_core-1.7.2/docling_core/types/doc → docling_core-2.0.1/docling_core/types/legacy_doc}/__init__.py +0 -0
- {docling_core-1.7.2/docling_core/types/doc → docling_core-2.0.1/docling_core/types/legacy_doc}/tokens.py +0 -0
- {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/types/rec/base.py +0 -0
- {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/types/rec/record.py +0 -0
- {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/types/rec/statement.py +0 -0
- {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/utils/__init__.py +0 -0
- {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/utils/alias.py +0 -0
- {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/utils/file.py +0 -0
- {docling_core-1.7.2 → docling_core-2.0.1}/docling_core/utils/validate.py +0 -0
````diff
--- docling_core-1.7.2/PKG-INFO
+++ docling_core-2.0.1/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling-core
-Version: 1.7.2
+Version: 2.0.1
 Summary: A python library to define and validate data types in Docling.
 Home-page: https://ds4sd.github.io/
 License: MIT
@@ -25,10 +25,10 @@ Classifier: Topic :: Database
 Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Classifier: Typing :: Typed
-Requires-Dist: json-schema-for-humans (>=1.0.0,<2.0.0)
 Requires-Dist: jsonref (>=1.1.0,<2.0.0)
 Requires-Dist: jsonschema (>=4.16.0,<5.0.0)
 Requires-Dist: pandas (>=2.1.4,<3.0.0)
+Requires-Dist: pillow (>=10.3.0,<11.0.0)
 Requires-Dist: pydantic (>=2.6.0,<3.0.0)
 Requires-Dist: tabulate (>=0.9.0,<0.10.0)
 Project-URL: Repository, https://github.com/DS4SD/docling-core
@@ -37,7 +37,7 @@ Description-Content-Type: text/markdown
 # Docling Core
 
 [](https://pypi.org/project/docling-core/)
-![Python badge]
+![Python badge]
 [](https://python-poetry.org/)
 [](https://github.com/psf/black)
 [](https://pycqa.github.io/isort/)
@@ -57,7 +57,7 @@ pip install docling-core
 
 ### Development setup
 
-To develop for Docling Core, you need Python 3.9 / 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
+To develop for Docling Core, you need Python 3.9 / 3.10 / 3.11 / 3.12 / 3.13 and Poetry. You can then install from your local clone's root dir:
 ```bash
 poetry install
 ```
@@ -72,37 +72,37 @@ poetry run pytest test
 - You can validate your JSON objects using the pydantic class definition.
 
 ```py
-from docling_core.types import
+from docling_core.types import DoclingDocument
 
 data_dict = {...} # here the object you want to validate, as a dictionary
-
+DoclingDocument.model_validate(data_dict)
 
 data_str = {...} # here the object as a JSON string
-
+DoclingDocument.model_validate_json(data_str)
 ```
 
-- You can generate the JSON schema of a model with the script `
+- You can generate the JSON schema of a model with the script `generate_jsonschema`.
 
 ```py
-# for the `
-
+# for the `DoclingDocument` type
+generate_jsonschema DoclingDocument
 
 # for the use `Record` type
-
+generate_jsonschema Record
 ```
 
 ## Documentation
 
-Docling
+Docling Core contains 3 top-level data types:
 
-- **
-The
-Check [
+- **DoclingDocument** for publications like books, articles, reports, or patents. When Docling converts an unstructured PDF document, the generated JSON follows this schema.
+  The DoclingDocument type also models the metadata that may be attached to the converted document.
+  Check [DoclingDocument](docs/DoclingDocument.json) for the full JSON schema.
 - **Record** for structured database records, centered on an entity or _subject_ that is provided with a list of attributes.
   Related to records, the statements can represent annotations on text by Natural Language Processing (NLP) tools.
-  Check [Record](docs/Record.
+  Check [Record](docs/Record.json) for the full JSON schema.
 - **Generic** for any data representation, ensuring minimal configuration and maximum flexibility.
-  Check [Generic](docs/Generic.
+  Check [Generic](docs/Generic.json) for the full JSON schema.
 
 The data schemas are defined using [pydantic](https://pydantic-docs.helpmanual.io/) models, which provide built-in processes to support the creation of data that adhere to those models.
 
````
````diff
--- docling_core-1.7.2/README.md
+++ docling_core-2.0.1/README.md
@@ -1,7 +1,7 @@
 # Docling Core
 
 [](https://pypi.org/project/docling-core/)
-![Python badge]
+![Python badge]
 [](https://python-poetry.org/)
 [](https://github.com/psf/black)
 [](https://pycqa.github.io/isort/)
@@ -21,7 +21,7 @@ pip install docling-core
 
 ### Development setup
 
-To develop for Docling Core, you need Python 3.9 / 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
+To develop for Docling Core, you need Python 3.9 / 3.10 / 3.11 / 3.12 / 3.13 and Poetry. You can then install from your local clone's root dir:
 ```bash
 poetry install
 ```
@@ -36,37 +36,37 @@ poetry run pytest test
 - You can validate your JSON objects using the pydantic class definition.
 
 ```py
-from docling_core.types import
+from docling_core.types import DoclingDocument
 
 data_dict = {...} # here the object you want to validate, as a dictionary
-
+DoclingDocument.model_validate(data_dict)
 
 data_str = {...} # here the object as a JSON string
-
+DoclingDocument.model_validate_json(data_str)
 ```
 
-- You can generate the JSON schema of a model with the script `
+- You can generate the JSON schema of a model with the script `generate_jsonschema`.
 
 ```py
-# for the `
-
+# for the `DoclingDocument` type
+generate_jsonschema DoclingDocument
 
 # for the use `Record` type
-
+generate_jsonschema Record
 ```
 
 ## Documentation
 
-Docling
+Docling Core contains 3 top-level data types:
 
-- **
-The
-Check [
+- **DoclingDocument** for publications like books, articles, reports, or patents. When Docling converts an unstructured PDF document, the generated JSON follows this schema.
+  The DoclingDocument type also models the metadata that may be attached to the converted document.
+  Check [DoclingDocument](docs/DoclingDocument.json) for the full JSON schema.
 - **Record** for structured database records, centered on an entity or _subject_ that is provided with a list of attributes.
   Related to records, the statements can represent annotations on text by Natural Language Processing (NLP) tools.
-  Check [Record](docs/Record.
+  Check [Record](docs/Record.json) for the full JSON schema.
 - **Generic** for any data representation, ensuring minimal configuration and maximum flexibility.
-  Check [Generic](docs/Generic.
+  Check [Generic](docs/Generic.json) for the full JSON schema.
 
 The data schemas are defined using [pydantic](https://pydantic-docs.helpmanual.io/) models, which provide built-in processes to support the creation of data that adhere to those models.
 
````
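The README's validation snippet is schematic (`{...}` placeholders). A runnable sketch of the same flow is below; the minimal payload with only a `name` field is an assumption about `DoclingDocument`'s required fields in 2.0.1, not something this diff confirms.

```py
# Hedged sketch of the validation flow described in the README above.
# Assumption: "name" is the only required field of DoclingDocument here.
from pydantic import ValidationError

from docling_core.types import DoclingDocument

data_dict = {"name": "sample-doc"}  # hypothetical minimal document
try:
    doc = DoclingDocument.model_validate(data_dict)
    print(doc.name)
except ValidationError as err:
    print(err)

data_str = '{"name": "sample-doc"}'
doc = DoclingDocument.model_validate_json(data_str)
```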
````diff
--- docling_core-1.7.2/docling_core/transforms/chunker/__init__.py
+++ docling_core-2.0.1/docling_core/transforms/chunker/__init__.py
@@ -5,11 +5,5 @@
 
 """Define the chunker types."""
 
-from docling_core.transforms.chunker.base import
-
-    Chunk,
-    ChunkWithMetadata,
-)
-from docling_core.transforms.chunker.hierarchical_chunker import ( # noqa
-    HierarchicalChunker,
-)
+from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta
+from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
````
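The practical effect of this change: `Chunk` and `ChunkWithMetadata` no longer exist in 2.0.1, and code importing them must move to the new names. The new import surface, taken directly from the diff above:

```py
# 2.0.1 import surface of docling_core.transforms.chunker;
# Chunk and ChunkWithMetadata from 1.7.2 are gone.
from docling_core.transforms.chunker import (
    BaseChunk,
    BaseChunker,
    BaseMeta,
    HierarchicalChunker,
)
```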
````diff
--- /dev/null
+++ docling_core-2.0.1/docling_core/transforms/chunker/base.py
@@ -0,0 +1,61 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Define base classes for chunking."""
+from abc import ABC, abstractmethod
+from typing import Any, ClassVar, Iterator
+
+from pydantic import BaseModel
+
+from docling_core.types.doc import DoclingDocument as DLDocument
+
+
+class BaseMeta(BaseModel):
+    """Metadata base class."""
+
+    excluded_embed: ClassVar[list[str]] = []
+    excluded_llm: ClassVar[list[str]] = []
+
+    def export_json_dict(self) -> dict[str, Any]:
+        """Helper method for exporting non-None keys to JSON mode.
+
+        Returns:
+            dict[str, Any]: The exported dictionary.
+        """
+        return self.model_dump(mode="json", by_alias=True, exclude_none=True)
+
+
+class BaseChunk(BaseModel):
+    """Chunk base class."""
+
+    text: str
+    meta: BaseMeta
+
+    def export_json_dict(self) -> dict[str, Any]:
+        """Helper method for exporting non-None keys to JSON mode.
+
+        Returns:
+            dict[str, Any]: The exported dictionary.
+        """
+        return self.model_dump(mode="json", by_alias=True, exclude_none=True)
+
+
+class BaseChunker(BaseModel, ABC):
+    """Chunker base class."""
+
+    @abstractmethod
+    def chunk(self, dl_doc: DLDocument, **kwargs) -> Iterator[BaseChunk]:
+        """Chunk the provided document.
+
+        Args:
+            dl_doc (DLDocument): document to chunk
+
+        Raises:
+            NotImplementedError: in this abstract implementation
+
+        Yields:
+            Iterator[BaseChunk]: iterator over extracted chunks
+        """
+        raise NotImplementedError()
````
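`BaseChunker` is an abstract pydantic model, so a custom chunker only has to implement `chunk()`. A minimal sketch under that contract follows; the text-only filtering logic is illustrative, not part of docling-core:

```py
# Hedged sketch of a custom chunker on the new 2.0.1 base classes.
from typing import Any, Iterator

from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
from docling_core.types.doc import DoclingDocument as DLDocument


class TextItemChunker(BaseChunker):
    """Hypothetical chunker: one chunk per item that carries text."""

    def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
        for item, _level in dl_doc.iterate_items():
            text = getattr(item, "text", "")
            if text:
                yield BaseChunk(text=text, meta=BaseMeta())
```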
````diff
--- /dev/null
+++ docling_core-2.0.1/docling_core/transforms/chunker/hierarchical_chunker.py
@@ -0,0 +1,186 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Chunker implementation leveraging the document structure."""
+
+from __future__ import annotations
+
+import logging
+from typing import Any, ClassVar, Iterator, Optional
+
+from pandas import DataFrame
+from pydantic import Field
+
+from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
+from docling_core.types.doc import DoclingDocument as DLDocument
+from docling_core.types.doc.document import (
+    DocItem,
+    LevelNumber,
+    ListItem,
+    SectionHeaderItem,
+    TableItem,
+    TextItem,
+)
+from docling_core.types.doc.labels import DocItemLabel
+
+_KEY_DOC_ITEMS = "doc_items"
+_KEY_HEADINGS = "headings"
+_KEY_CAPTIONS = "captions"
+
+_logger = logging.getLogger(__name__)
+
+
+class DocMeta(BaseMeta):
+    """Data model for Hierarchical Chunker metadata."""
+
+    doc_items: list[DocItem] = Field(
+        alias=_KEY_DOC_ITEMS,
+        min_length=1,
+    )
+    headings: Optional[list[str]] = Field(
+        default=None,
+        alias=_KEY_HEADINGS,
+        min_length=1,
+    )
+    captions: Optional[list[str]] = Field(
+        default=None,
+        alias=_KEY_CAPTIONS,
+        min_length=1,
+    )
+
+    excluded_embed: ClassVar[list[str]] = [_KEY_DOC_ITEMS]
+    excluded_llm: ClassVar[list[str]] = [_KEY_DOC_ITEMS]
+
+
+class DocChunk(BaseChunk):
+    """Data model for Hierarchical Chunker chunks."""
+
+    meta: DocMeta
+
+
+class HierarchicalChunker(BaseChunker):
+    r"""Chunker implementation leveraging the document layout.
+
+    Args:
+        merge_list_items (bool): Whether to merge successive list items.
+            Defaults to True.
+        delim (str): Delimiter to use for merging text. Defaults to "\n".
+    """
+
+    merge_list_items: bool = True
+    delim: str = "\n"
+
+    @classmethod
+    def _triplet_serialize(cls, table_df: DataFrame) -> str:
+
+        # copy header as first row and shift all rows by one
+        table_df.loc[-1] = table_df.columns # type: ignore[call-overload]
+        table_df.index = table_df.index + 1
+        table_df = table_df.sort_index()
+
+        rows = [item.strip() for item in table_df.iloc[:, 0].to_list()]
+        cols = [item.strip() for item in table_df.iloc[0, :].to_list()]
+
+        nrows = table_df.shape[0]
+        ncols = table_df.shape[1]
+        texts = [
+            f"{rows[i]}, {cols[j]} = {str(table_df.iloc[i, j]).strip()}"
+            for i in range(1, nrows)
+            for j in range(1, ncols)
+        ]
+        output_text = ". ".join(texts)
+
+        return output_text
+
+    def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
+        r"""Chunk the provided document.
+
+        Args:
+            dl_doc (DLDocument): document to chunk
+
+        Yields:
+            Iterator[Chunk]: iterator over extracted chunks
+        """
+        heading_by_level: dict[LevelNumber, str] = {}
+        list_items: list[TextItem] = []
+        for item, level in dl_doc.iterate_items():
+            captions = None
+            if isinstance(item, DocItem):
+
+                # first handle any merging needed
+                if self.merge_list_items:
+                    if isinstance(
+                        item, ListItem
+                    ) or (  # TODO remove when all captured as ListItem:
+                        isinstance(item, TextItem)
+                        and item.label == DocItemLabel.LIST_ITEM
+                    ):
+                        list_items.append(item)
+                        continue
+                    elif list_items: # need to yield
+                        yield DocChunk(
+                            text=self.delim.join([i.text for i in list_items]),
+                            meta=DocMeta(
+                                doc_items=list_items,
+                                headings=[
+                                    heading_by_level[k]
+                                    for k in sorted(heading_by_level)
+                                ]
+                                or None,
+                            ),
+                        )
+                        list_items = [] # reset
+
+                if isinstance(
+                    item, SectionHeaderItem
+                ) or (  # TODO remove when all captured as SectionHeaderItem:
+                    isinstance(item, TextItem)
+                    and item.label == DocItemLabel.SECTION_HEADER
+                ):
+                    # TODO second branch not needed once cleanup above complete:
+                    level = item.level if isinstance(item, SectionHeaderItem) else 1
+                    heading_by_level[level] = item.text
+
+                    # remove headings of higher level as they just went out of scope
+                    keys_to_del = [k for k in heading_by_level if k > level]
+                    for k in keys_to_del:
+                        heading_by_level.pop(k, None)
+                    continue
+
+                if isinstance(item, TextItem) or (
+                    (not self.merge_list_items) and isinstance(item, ListItem)
+                ):
+                    text = item.text
+                elif isinstance(item, TableItem):
+                    table_df = item.export_to_dataframe()
+                    if table_df.shape[0] < 1 or table_df.shape[1] < 2:
+                        # at least two cols needed, as first column contains row headers
+                        continue
+                    text = self._triplet_serialize(table_df=table_df)
+                    captions = [
+                        c.text for c in [r.resolve(dl_doc) for r in item.captions]
+                    ] or None
+                else:
+                    continue
+                c = DocChunk(
+                    text=text,
+                    meta=DocMeta(
+                        doc_items=[item],
+                        headings=[heading_by_level[k] for k in sorted(heading_by_level)]
+                        or None,
+                        captions=captions,
+                    ),
+                )
+                yield c
+
+        if self.merge_list_items and list_items: # need to yield
+            yield DocChunk(
+                text=self.delim.join([i.text for i in list_items]),
+                meta=DocMeta(
+                    doc_items=list_items,
+                    headings=[heading_by_level[k] for k in sorted(heading_by_level)]
+                    or None,
+                ),
+            )
````
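End-to-end usage of the new chunker could look as follows; `doc.json` is a hypothetical `DoclingDocument` export (e.g. from a Docling conversion):

```py
# Hedged usage sketch for HierarchicalChunker; "doc.json" is hypothetical.
from pathlib import Path

from docling_core.transforms.chunker import HierarchicalChunker
from docling_core.types import DoclingDocument

doc = DoclingDocument.model_validate_json(Path("doc.json").read_text())

chunker = HierarchicalChunker(merge_list_items=True, delim="\n")
for chunk in chunker.chunk(dl_doc=doc):
    print(chunk.text)                     # list items merged, tables triplet-serialized
    print(chunk.meta.export_json_dict())  # doc_items, headings, captions (None keys dropped)
```

Note that tables reach the chunk text as `row, column = value` triplets via `_triplet_serialize`, and `DocMeta.excluded_embed`/`excluded_llm` flag `doc_items` as metadata to drop when building embedding or LLM inputs.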
````diff
--- /dev/null
+++ docling_core-2.0.1/docling_core/types/__init__.py
@@ -0,0 +1,10 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Define the main types."""
+
+from docling_core.types.doc.document import DoclingDocument
+from docling_core.types.gen.generic import Generic
+from docling_core.types.rec.record import Record
````
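These three re-exports restore a flat `docling_core.types` namespace after the 1.7.2 `types/__init__.py` was removed. Since all three are pydantic models, their JSON schemas can also be produced programmatically; this is the pydantic-native counterpart of the `generate_jsonschema` script mentioned in the README, using only standard pydantic v2 API:

```py
# The 2.0.1 top-level types, plus schema generation via standard pydantic v2.
import json

from docling_core.types import DoclingDocument, Generic, Record

for model in (DoclingDocument, Record, Generic):
    schema = model.model_json_schema()  # plain dict, standard pydantic v2 API
    print(model.__name__, len(json.dumps(schema)))
```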
````diff
--- docling_core-1.7.2/docling_core/types/experimental/__init__.py
+++ docling_core-2.0.1/docling_core/types/doc/__init__.py
@@ -7,9 +7,6 @@
 
 from .base import BoundingBox, CoordOrigin, Size
 from .document import (
-    BasePictureData,
-    BaseTableData,
-    DescriptionItem,
     DocItem,
     DoclingDocument,
     DocumentOrigin,
@@ -19,11 +16,15 @@ from .document import (
     KeyValueItem,
     NodeItem,
     PageItem,
+    PictureClassificationClass,
+    PictureClassificationData,
+    PictureDataType,
     PictureItem,
     ProvenanceItem,
     RefItem,
     SectionHeaderItem,
     TableCell,
+    TableData,
     TableItem,
     TextItem,
 )
````
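For callers, the module moved from `docling_core.types.experimental` to `docling_core.types.doc`, and `BasePictureData`/`BaseTableData`/`DescriptionItem` gave way to the picture-classification and table types added above:

```py
# 2.0.1 imports, names taken from the diff above; the 1.7.2 equivalents
# lived under docling_core.types.experimental.
from docling_core.types.doc import (
    BoundingBox,
    CoordOrigin,
    PictureClassificationClass,
    PictureClassificationData,
    PictureDataType,
    TableData,
)
```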
````diff
--- docling_core-1.7.2/docling_core/types/experimental/base.py
+++ docling_core-2.0.1/docling_core/types/doc/base.py
@@ -108,7 +108,10 @@ class BoundingBox(BaseModel):
 
     def area(self) -> float:
         """area."""
-
+        area = (self.r - self.l) * (self.b - self.t)
+        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            area = -area
+        return area
 
     def intersection_area_with(self, other: "BoundingBox") -> float:
         """intersection_area_with.
````