docling-core 2.0.0__tar.gz → 2.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of docling-core might be problematic.
- {docling_core-2.0.0 → docling_core-2.1.0}/PKG-INFO +12 -12
- {docling_core-2.0.0 → docling_core-2.1.0}/README.md +11 -11
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/transforms/chunker/__init__.py +4 -1
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/transforms/chunker/base.py +1 -1
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/transforms/chunker/hierarchical_chunker.py +58 -6
- docling_core-2.1.0/docling_core/types/__init__.py +10 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/doc/__init__.py +1 -1
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/doc/base.py +7 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/doc/document.py +246 -111
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/utils/generate_docs.py +1 -1
- {docling_core-2.0.0 → docling_core-2.1.0}/pyproject.toml +1 -1
- docling_core-2.0.0/docling_core/types/__init__.py +0 -29
- {docling_core-2.0.0 → docling_core-2.1.0}/LICENSE +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/__init__.py +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/py.typed +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/search/__init__.py +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/search/mapping.py +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/search/meta.py +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/search/package.py +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/base.py +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/doc/labels.py +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/utils/alias.py +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/utils/file.py +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/utils/validate.py +0 -0
- {docling_core-2.0.0 → docling_core-2.1.0}/docling_core/utils/validators.py +0 -0
{docling_core-2.0.0 → docling_core-2.1.0}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling-core
-Version: 2.0.0
+Version: 2.1.0
 Summary: A python library to define and validate data types in Docling.
 Home-page: https://ds4sd.github.io/
 License: MIT
@@ -72,20 +72,20 @@ poetry run pytest test
 - You can validate your JSON objects using the pydantic class definition.
 
   ```py
-  from docling_core.types import Document
+  from docling_core.types import DoclingDocument
 
   data_dict = {...}  # here the object you want to validate, as a dictionary
-  Document.model_validate(data_dict)
+  DoclingDocument.model_validate(data_dict)
 
   data_str = {...}  # here the object as a JSON string
-  Document.model_validate_json(data_str)
+  DoclingDocument.model_validate_json(data_str)
   ```
 
 - You can generate the JSON schema of a model with the script `generate_jsonschema`.
 
   ```py
-  # for the `Document` type
-  generate_jsonschema Document
+  # for the `DoclingDocument` type
+  generate_jsonschema DoclingDocument
 
   # for the use `Record` type
   generate_jsonschema Record
@@ -93,16 +93,16 @@ poetry run pytest test
 
 ## Documentation
 
-Docling Core contains 3 top-level data types:
+Docling Core contains 3 top-level data types:
 
-- **Document** for publications like books, articles, reports, or patents. The JSON that can be exported using Docling follows this schema.
-  The Document type also models the metadata that may be attached to the converted document.
-  Check [Document](docs/Document.json) for the full JSON schema.
+- **DoclingDocument** for publications like books, articles, reports, or patents. The JSON that can be exported using Docling follows this schema.
+  The DoclingDocument type also models the metadata that may be attached to the converted document.
+  Check [DoclingDocument](docs/DoclingDocument.json) for the full JSON schema.
 - **Record** for structured database records, centered on an entity or _subject_ that is provided with a list of attributes.
   Related to records, the statements can represent annotations on text by Natural Language Processing (NLP) tools.
-  Check [Record](docs/Record.json) for the full JSON schema.
+  Check [Record](docs/Record.json) for the full JSON schema.
 - **Generic** for any data representation, ensuring minimal configuration and maximum flexibility.
-  Check [Generic](docs/Generic.json) for the full JSON schema.
+  Check [Generic](docs/Generic.json) for the full JSON schema.
 
 The data schemas are defined using [pydantic](https://pydantic-docs.helpmanual.io/) models, which provide built-in processes to support the creation of data that adhere to those models.
 
{docling_core-2.0.0 → docling_core-2.1.0}/README.md
RENAMED
@@ -36,20 +36,20 @@ poetry run pytest test
 - You can validate your JSON objects using the pydantic class definition.
 
   ```py
-  from docling_core.types import Document
+  from docling_core.types import DoclingDocument
 
   data_dict = {...}  # here the object you want to validate, as a dictionary
-  Document.model_validate(data_dict)
+  DoclingDocument.model_validate(data_dict)
 
   data_str = {...}  # here the object as a JSON string
-  Document.model_validate_json(data_str)
+  DoclingDocument.model_validate_json(data_str)
   ```
 
 - You can generate the JSON schema of a model with the script `generate_jsonschema`.
 
   ```py
-  # for the `Document` type
-  generate_jsonschema Document
+  # for the `DoclingDocument` type
+  generate_jsonschema DoclingDocument
 
   # for the use `Record` type
   generate_jsonschema Record
@@ -57,16 +57,16 @@ poetry run pytest test
 
 ## Documentation
 
-Docling Core contains 3 top-level data types:
+Docling Core contains 3 top-level data types:
 
-- **Document** for publications like books, articles, reports, or patents. The JSON that can be exported using Docling follows this schema.
-  The Document type also models the metadata that may be attached to the converted document.
-  Check [Document](docs/Document.json) for the full JSON schema.
+- **DoclingDocument** for publications like books, articles, reports, or patents. The JSON that can be exported using Docling follows this schema.
+  The DoclingDocument type also models the metadata that may be attached to the converted document.
+  Check [DoclingDocument](docs/DoclingDocument.json) for the full JSON schema.
 - **Record** for structured database records, centered on an entity or _subject_ that is provided with a list of attributes.
   Related to records, the statements can represent annotations on text by Natural Language Processing (NLP) tools.
-  Check [Record](docs/Record.json) for the full JSON schema.
+  Check [Record](docs/Record.json) for the full JSON schema.
 - **Generic** for any data representation, ensuring minimal configuration and maximum flexibility.
-  Check [Generic](docs/Generic.json) for the full JSON schema.
+  Check [Generic](docs/Generic.json) for the full JSON schema.
 
 The data schemas are defined using [pydantic](https://pydantic-docs.helpmanual.io/) models, which provide built-in processes to support the creation of data that adhere to those models.
 
{docling_core-2.0.0 → docling_core-2.1.0}/docling_core/transforms/chunker/__init__.py
RENAMED
@@ -6,4 +6,7 @@
 """Define the chunker types."""
 
 from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta
-from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
+from docling_core.transforms.chunker.hierarchical_chunker import (
+    DocMeta,
+    HierarchicalChunker,
+)
{docling_core-2.0.0 → docling_core-2.1.0}/docling_core/transforms/chunker/hierarchical_chunker.py
RENAMED
@@ -8,15 +8,19 @@
 from __future__ import annotations
 
 import logging
+import re
+from typing import Any, ClassVar, Final, Iterator, Literal, Optional
 
 from pandas import DataFrame
-from pydantic import Field
+from pydantic import Field, StringConstraints, field_validator
+from typing_extensions import Annotated
 
+from docling_core.search.package import VERSION_PATTERN
 from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
-from docling_core.types.doc import DoclingDocument as DLDocument
+from docling_core.types import DoclingDocument as DLDocument
 from docling_core.types.doc.document import (
     DocItem,
+    DocumentOrigin,
     LevelNumber,
     ListItem,
     SectionHeaderItem,
@@ -25,16 +29,31 @@ from docling_core.types.doc.document import (
 )
 from docling_core.types.doc.labels import DocItemLabel
 
+_VERSION: Final = "1.0.0"
+
+_KEY_SCHEMA_NAME = "schema_name"
+_KEY_VERSION = "version"
 _KEY_DOC_ITEMS = "doc_items"
 _KEY_HEADINGS = "headings"
 _KEY_CAPTIONS = "captions"
+_KEY_ORIGIN = "origin"
 
 _logger = logging.getLogger(__name__)
 
 
 class DocMeta(BaseMeta):
-    """Data model for Hierarchical Chunker metadata."""
+    """Data model for Hierarchical Chunker chunk metadata."""
 
+    schema_name: Literal["docling_core.transforms.chunker.DocMeta"] = Field(
+        default="docling_core.transforms.chunker.DocMeta",
+        alias=_KEY_SCHEMA_NAME,
+    )
+    version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = (
+        Field(
+            default=_VERSION,
+            alias=_KEY_VERSION,
+        )
+    )
     doc_items: list[DocItem] = Field(
         alias=_KEY_DOC_ITEMS,
         min_length=1,
@@ -49,9 +68,39 @@ class DocMeta(BaseMeta):
         alias=_KEY_CAPTIONS,
         min_length=1,
     )
+    origin: Optional[DocumentOrigin] = Field(
+        default=None,
+        alias=_KEY_ORIGIN,
+    )
 
-    excluded_embed: ClassVar[list[str]] = [_KEY_DOC_ITEMS]
-    excluded_llm: ClassVar[list[str]] = [_KEY_DOC_ITEMS]
+    excluded_embed: ClassVar[list[str]] = [
+        _KEY_SCHEMA_NAME,
+        _KEY_VERSION,
+        _KEY_DOC_ITEMS,
+        _KEY_ORIGIN,
+    ]
+    excluded_llm: ClassVar[list[str]] = [
+        _KEY_SCHEMA_NAME,
+        _KEY_VERSION,
+        _KEY_DOC_ITEMS,
+        _KEY_ORIGIN,
+    ]
+
+    @field_validator(_KEY_VERSION)
+    @classmethod
+    def check_version_is_compatible(cls, v: str) -> str:
+        """Check if this meta item version is compatible with current version."""
+        current_match = re.match(VERSION_PATTERN, _VERSION)
+        doc_match = re.match(VERSION_PATTERN, v)
+        if (
+            doc_match is None
+            or current_match is None
+            or doc_match["major"] != current_match["major"]
+            or doc_match["minor"] > current_match["minor"]
+        ):
+            raise ValueError(f"incompatible version {v} with schema version {_VERSION}")
+        else:
+            return _VERSION
 
 
 class DocChunk(BaseChunk):
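The validator above pins chunk metadata to the current DocMeta schema version ("1.0.0"): the major version must match and the minor version must not be newer. A standalone sketch of the same rule, reusing VERSION_PATTERN from the package (the sample version strings are illustrative):

```python
import re

from docling_core.search.package import VERSION_PATTERN

CURRENT = "1.0.0"  # the chunker's DocMeta schema version, per _VERSION above


def is_compatible(v: str) -> bool:
    # Mirrors DocMeta.check_version_is_compatible: same major, minor not newer.
    doc = re.match(VERSION_PATTERN, v)
    cur = re.match(VERSION_PATTERN, CURRENT)
    return (
        doc is not None
        and cur is not None
        and doc["major"] == cur["major"]
        and doc["minor"] <= cur["minor"]
    )


print(is_compatible("1.0.0"))  # True
print(is_compatible("1.1.0"))  # False: minor version is newer than the schema's
print(is_compatible("2.0.0"))  # False: different major version
```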
@@ -129,6 +178,7 @@ class HierarchicalChunker(BaseChunker):
                             for k in sorted(heading_by_level)
                         ]
                         or None,
+                        origin=dl_doc.origin,
                     ),
                 )
                 list_items = []  # reset
@@ -171,6 +221,7 @@ class HierarchicalChunker(BaseChunker):
                     headings=[heading_by_level[k] for k in sorted(heading_by_level)]
                     or None,
                     captions=captions,
+                    origin=dl_doc.origin,
                 ),
             )
             yield c
@@ -182,5 +233,6 @@ class HierarchicalChunker(BaseChunker):
                 doc_items=list_items,
                 headings=[heading_by_level[k] for k in sorted(heading_by_level)]
                 or None,
+                origin=dl_doc.origin,
             ),
         )
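Each emitted chunk now carries the originating document's DocumentOrigin alongside the schema name and version. A minimal usage sketch of how that metadata surfaces downstream, assuming a DoclingDocument loaded from a Docling JSON export (the file name is illustrative):

```python
from pathlib import Path

from docling_core.transforms.chunker import HierarchicalChunker
from docling_core.types import DoclingDocument

# Illustrative input: a JSON file previously exported by Docling.
doc = DoclingDocument.model_validate_json(Path("report.json").read_text())

chunker = HierarchicalChunker()
for chunk in chunker.chunk(doc):
    meta = chunk.meta  # DocMeta, now carrying schema_name, version, and origin
    print(meta.schema_name, meta.version)
    print(meta.headings, meta.origin)  # origin mirrors doc.origin (may be None)
    print(chunk.text[:80])
```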
docling_core-2.1.0/docling_core/types/__init__.py
@@ -0,0 +1,10 @@
+#
+# Copyright IBM Corp. 2024 - 2024
+# SPDX-License-Identifier: MIT
+#
+
+"""Define the main types."""
+
+from docling_core.types.doc.document import DoclingDocument
+from docling_core.types.gen.generic import Generic
+from docling_core.types.rec.record import Record
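Together with the deletion of the 2.0.0 module shown further below, this changes the package's top-level import surface. A short sketch of what resolves in 2.1.0 (the legacy module path itself is unchanged):

```python
# Top-level exports in 2.1.0, per the new module above:
from docling_core.types import DoclingDocument, Generic, Record

# Legacy aliases such as `Document` are no longer re-exported at this level;
# the underlying class remains importable from the legacy module:
from docling_core.types.legacy_doc.document import ExportedCCSDocument as Document
```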
{docling_core-2.0.0 → docling_core-2.1.0}/docling_core/types/doc/document.py
RENAMED
@@ -3,6 +3,7 @@
 import base64
 import mimetypes
 import re
+import sys
 import typing
 from io import BytesIO
 from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
@@ -25,6 +26,7 @@ from typing_extensions import Annotated, Self
 from docling_core.search.package import VERSION_PATTERN
 from docling_core.types.base import _JSON_POINTER_REGEX
 from docling_core.types.doc import BoundingBox, Size
+from docling_core.types.doc.base import ImageRefMode
 from docling_core.types.doc.labels import DocItemLabel, GroupLabel
 from docling_core.types.legacy_doc.tokens import DocumentToken
 
@@ -214,6 +216,8 @@ class DocumentOrigin(BaseModel):
         "application/vnd.openxmlformats-officedocument.presentationml.template",
         "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
         "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+        "text/asciidoc",
+        "text/markdown",
     ]
 
     @field_validator("binary_hash", mode="before")
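With the two added entries, origins describing Markdown and AsciiDoc inputs now pass validation. A sketch, assuming DocumentOrigin's other fields are binary_hash and filename as elsewhere in docling-core (all values illustrative):

```python
from docling_core.types.doc.document import DocumentOrigin

# "text/markdown" (and "text/asciidoc") are accepted mimetypes as of this release.
origin = DocumentOrigin(
    mimetype="text/markdown",
    binary_hash=11297583875018770733,  # illustrative 64-bit content hash
    filename="notes.md",
)
print(origin.mimetype)
```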
@@ -1107,12 +1111,14 @@ class DoclingDocument(BaseModel):
 
     def export_to_markdown(  # noqa: C901
         self,
-        delim: str = "\n\n",
+        delim: str = "\n",
         from_element: int = 0,
-        to_element: int = 1000000,
+        to_element: int = sys.maxsize,
         labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
         strict_text: bool = False,
         image_placeholder: str = "<!-- image -->",
+        image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
+        indent: int = 4,
     ) -> str:
         r"""Serialize to Markdown.
 
@@ -1142,136 +1148,150 @@
         :param strict_text: bool: (Default value = False)
         :param image_placeholder str: (Default value = "<!-- image -->")
             the placeholder to include to position images in the markdown.
+        :param indent: int (default=4): indent of the nested lists
         :returns: The exported Markdown representation.
         :rtype: str
         """
+        mdtexts: list[str] = []
+        list_nesting_level = 0  # Track the current list nesting level
+        previous_level = 0  # Track the previous item's level
+        in_list = False  # Track if we're currently processing list items
 
-            if to_element and ix >= to_element:
-                break
+        for ix, (item, level) in enumerate(
+            self.iterate_items(self.body, with_groups=True)
+        ):
+            # If we've moved to a lower level, we're exiting one or more groups
+            if level < previous_level:
+                # Calculate how many levels we've exited
+                level_difference = previous_level - level
+                # Decrement list_nesting_level for each list group we've exited
+                list_nesting_level = max(0, list_nesting_level - level_difference)
 
-            if (
-                isinstance(item, (TableItem, PictureItem))
-                and len(item.captions) > 0
-                and item.label in labels
-            ):
-                caption = item.caption_text(self)
-                if caption:
-                    embedded_captions.add(caption)
+            previous_level = level  # Update previous_level for next iteration
 
-        for ix, (item, level) in enumerate(self.iterate_items(self.body)):
-            if skip_count < from_element:
-                skip_count += 1
+            if ix < from_element and to_element <= ix:
                 continue  # skip as many items as you want
 
-            if isinstance(item, TextItem) and item_type in labels:
-                text = item.text
-
-                # skip captions of they are embedded in the actual
-                # floating object
-                if item_type == DocItemLabel.CAPTION and text in embedded_captions:
-                    continue
-
-                # ignore repeated text
-                if prev_text == text or text is None:
-                    continue
-                else:
-                    prev_text = text
-
-                # first title match
-                if item_type == DocItemLabel.TITLE and not has_title:
-                    if strict_text:
-                        markdown_text = f"{text}"
-                    else:
-                        markdown_text = f"# {text}"
-                    has_title = True
-
-                # secondary titles
-                elif item_type in {
-                    DocItemLabel.TITLE,
-                    DocItemLabel.SECTION_HEADER,
-                } or (has_title and item_type == DocItemLabel.TITLE):
-                    if strict_text:
-                        markdown_text = f"{text}"
-                    else:
-                        markdown_text = f"## {text}"
-
-                # secondary titles
-                elif isinstance(item, ListItem):
-                    if item.enumerated:
-                        marker = item.marker
-                    else:
-                        marker = "-"
-
-                    markdown_text = f"{marker} {text}"
-
-                # normal text
-                else:
-                    markdown_text = text
-
-            elif isinstance(item, TableItem) and item.data and item_type in labels:
-                parts = []
-
-                # Compute the caption
-                if caption := item.caption_text(self):
-                    parts.append(caption)
-                    parts.append("\n")
+            # Handle newlines between different types of content
+            if (
+                len(mdtexts) > 0
+                and not isinstance(item, (ListItem, GroupItem))
+                and in_list
+            ):
+                mdtexts[-1] += "\n"
+                in_list = False
 
-                parts.append(item.export_to_markdown())
+            if isinstance(item, GroupItem) and item.label in [
+                GroupLabel.LIST,
+                GroupLabel.ORDERED_LIST,
+            ]:
 
+                if list_nesting_level == 0:  # Check if we're on the top level.
+                    # In that case a new list starts directly after another list.
+                    mdtexts.append("\n")  # Add a blank line
 
+                # Increment list nesting level when entering a new list
+                list_nesting_level += 1
+                in_list = True
+                continue
 
-                    parts.append(caption)
-                    parts.append("\n")
+            elif isinstance(item, GroupItem):
+                continue
 
+            elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
+                in_list = False
+                marker = "" if strict_text else "#"
+                text = f"{marker} {item.text}\n"
+                mdtexts.append(text.strip())
+
+            elif (
+                isinstance(item, TextItem)
+                and item.label in [DocItemLabel.SECTION_HEADER]
+            ) or isinstance(item, SectionHeaderItem):
+                in_list = False
+                marker = ""
+                if not strict_text:
+                    marker = "#" * level
+                    if len(marker) < 2:
+                        marker = "##"
+                text = f"{marker} {item.text}\n"
+                mdtexts.append(text.strip() + "\n")
+
+            elif isinstance(item, TextItem) and item.label in [DocItemLabel.CODE]:
+                in_list = False
+                text = f"```\n{item.text}\n```\n"
+                mdtexts.append(text)
+
+            elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
+                # captions are printed in picture and table ... skipping for now
+                continue
 
+            elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
+                in_list = True
+                # Calculate indent based on list_nesting_level
+                # -1 because level 1 needs no indent
+                list_indent = " " * (indent * (list_nesting_level - 1))
+
+                marker = ""
+                if strict_text:
+                    marker = ""
+                elif item.enumerated:
+                    marker = item.marker
+                else:
+                    marker = "-"  # Markdown needs only dash as item marker.
+
+                text = f"{list_indent}{marker} {item.text}"
+                mdtexts.append(text)
+
+            elif isinstance(item, TextItem) and item.label in labels:
+                in_list = False
+                if len(item.text):
+                    text = f"{item.text}\n"
+                    mdtexts.append(text)
+
+            elif isinstance(item, TableItem) and not strict_text:
+                in_list = False
+                mdtexts.append(item.caption_text(self))
+                md_table = item.export_to_markdown()
+                mdtexts.append("\n" + md_table + "\n")
+
+            elif isinstance(item, PictureItem) and not strict_text:
+                in_list = False
+                mdtexts.append(item.caption_text(self))
+
+                if image_mode == ImageRefMode.PLACEHOLDER:
+                    mdtexts.append("\n" + image_placeholder + "\n")
+                elif image_mode == ImageRefMode.EMBEDDED and isinstance(
+                    item.image, ImageRef
+                ):
+                    text = f"![Image]({item.image.uri})\n"
+                    mdtexts.append(text)
+                elif image_mode == ImageRefMode.EMBEDDED and not isinstance(
+                    item.image, ImageRef
+                ):
+                    text = (
+                        "<!-- 🖼️❌ Image not available. "
+                        "Please use `PdfPipelineOptions(generate_picture_images=True)`"
+                        " --> "
+                    )
+                    mdtexts.append(text)
 
+            elif isinstance(item, DocItem) and item.label in labels:
+                in_list = False
+                text = "<missing-text>"
+                mdtexts.append(text)
 
+        mdtext = (delim.join(mdtexts)).strip()
+        mdtext = re.sub(
+            r"\n\n\n+", "\n\n", mdtext
+        )  # remove cases of double or more empty lines.
+        return mdtext
 
     def export_to_text(  # noqa: C901
         self,
         delim: str = "\n\n",
         from_element: int = 0,
-        to_element:
+        to_element: int = 1000000,
         labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
     ) -> str:
         """export_to_text."""
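A short sketch of calling the reworked export with the new image_mode and indent parameters; the wrapper functions are illustrative and the document is assumed to come from a Docling conversion or a validated JSON export:

```python
from docling_core.types.doc.base import ImageRefMode
from docling_core.types.doc.document import DoclingDocument


def render_markdown(doc: DoclingDocument) -> str:
    # Inline images where an ImageRef is available; 2 spaces per list nesting level.
    return doc.export_to_markdown(image_mode=ImageRefMode.EMBEDDED, indent=2)


def render_plain(doc: DoclingDocument) -> str:
    # strict_text drops heading markers and skips tables and image placeholders.
    return doc.export_to_markdown(strict_text=True)
```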
@@ -1398,6 +1418,121 @@
 
         return doctags
 
+    def _export_to_indented_text(
+        self, indent=" ", max_text_len: int = -1, explicit_tables: bool = False
+    ):
+        """Export the document to indented text to expose hierarchy."""
+        result = []
+
+        def get_text(text: str, max_text_len: int):
+
+            middle = " ... "
+
+            if max_text_len == -1:
+                return text
+            elif len(text) < max_text_len + len(middle):
+                return text
+            else:
+                tbeg = int((max_text_len - len(middle)) / 2)
+                tend = int(max_text_len - tbeg)
+
+                return text[0:tbeg] + middle + text[-tend:]
+
+        for i, (item, level) in enumerate(self.iterate_items(with_groups=True)):
+            if isinstance(item, GroupItem):
+                result.append(
+                    indent * level
+                    + f"item-{i} at level {level}: {item.label}: group {item.name}"
+                )
+
+            elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
+                text = get_text(text=item.text, max_text_len=max_text_len)
+
+                result.append(
+                    indent * level + f"item-{i} at level {level}: {item.label}: {text}"
+                )
+
+            elif isinstance(item, SectionHeaderItem):
+                text = get_text(text=item.text, max_text_len=max_text_len)
+
+                result.append(
+                    indent * level + f"item-{i} at level {level}: {item.label}: {text}"
+                )
+
+            elif isinstance(item, TextItem) and item.label in [DocItemLabel.CODE]:
+                text = get_text(text=item.text, max_text_len=max_text_len)
+
+                result.append(
+                    indent * level + f"item-{i} at level {level}: {item.label}: {text}"
+                )
+
+            elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
+                # captions are printed in picture and table ... skipping for now
+                continue
+
+            elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
+                text = get_text(text=item.text, max_text_len=max_text_len)
+
+                result.append(
+                    indent * level + f"item-{i} at level {level}: {item.label}: {text}"
+                )
+
+            elif isinstance(item, TextItem):
+                text = get_text(text=item.text, max_text_len=max_text_len)
+
+                result.append(
+                    indent * level + f"item-{i} at level {level}: {item.label}: {text}"
+                )
+
+            elif isinstance(item, TableItem):
+
+                result.append(
+                    indent * level
+                    + f"item-{i} at level {level}: {item.label} with "
+                    + f"[{item.data.num_rows}x{item.data.num_cols}]"
+                )
+
+                for _ in item.captions:
+                    caption = _.resolve(self)
+                    result.append(
+                        indent * (level + 1)
+                        + f"item-{i} at level {level + 1}: {caption.label}: "
+                        + f"{caption.text}"
+                    )
+
+                if explicit_tables:
+                    grid: list[list[str]] = []
+                    for i, row in enumerate(item.data.grid):
+                        grid.append([])
+                        for j, cell in enumerate(row):
+                            if j < 10:
+                                text = get_text(text=cell.text, max_text_len=16)
+                                grid[-1].append(text)
+
+                    result.append("\n" + tabulate(grid) + "\n")
+
+            elif isinstance(item, PictureItem):
+
+                result.append(
+                    indent * level + f"item-{i} at level {level}: {item.label}"
+                )
+
+                for _ in item.captions:
+                    caption = _.resolve(self)
+                    result.append(
+                        indent * (level + 1)
+                        + f"item-{i} at level {level + 1}: {caption.label}: "
+                        + f"{caption.text}"
+                    )
+
+            elif isinstance(item, DocItem):
+                result.append(
+                    indent * (level + 1)
+                    + f"item-{i} at level {level}: {item.label}: ignored"
+                )
+
+        return "\n".join(result)
+
     def add_page(
         self, page_no: int, size: Size, image: Optional[ImageRef] = None
     ) -> PageItem:
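The new helper is private but useful for inspecting document structure while debugging. A sketch of invoking it next to a hand-rolled outline built on the public iterate_items(with_groups=True) iterator it relies on (the wrapper function is illustrative):

```python
from docling_core.types.doc.document import DoclingDocument


def dump_structure(doc: DoclingDocument) -> None:
    # Internal helper: one line per item with its index, level, and label.
    print(doc._export_to_indented_text(max_text_len=40))

    # Roughly equivalent outline via the public iterator used above:
    for i, (item, level) in enumerate(doc.iterate_items(with_groups=True)):
        print("  " * level + f"item-{i} at level {level}: {item.label}")
```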
{docling_core-2.0.0 → docling_core-2.1.0}/docling_core/utils/generate_docs.py
RENAMED
@@ -18,7 +18,7 @@ from typing import Final
 
 from docling_core.utils.generate_jsonschema import generate_json_schema
 
-MODELS: Final = ["Document", "Record", "Generic"]
+MODELS: Final = ["DoclingDocument", "Record", "Generic"]
 
 
 def _prepare_directory(folder: str, clean: bool = False) -> None:
docling_core-2.0.0/docling_core/types/__init__.py
@@ -1,29 +0,0 @@
-#
-# Copyright IBM Corp. 2024 - 2024
-# SPDX-License-Identifier: MIT
-#
-
-"""Define the main types."""
-
-from docling_core.types.gen.generic import Generic  # noqa
-from docling_core.types.legacy_doc.base import BoundingBox  # noqa
-from docling_core.types.legacy_doc.base import Table  # noqa
-from docling_core.types.legacy_doc.base import TableCell  # noqa
-from docling_core.types.legacy_doc.base import (  # noqa
-    BaseCell,
-    BaseText,
-    PageDimensions,
-    PageReference,
-    Prov,
-    Ref,
-)
-from docling_core.types.legacy_doc.document import (  # noqa
-    CCSDocumentDescription as DocumentDescription,
-)
-from docling_core.types.legacy_doc.document import (  # noqa
-    CCSFileInfoObject as FileInfoObject,
-)
-from docling_core.types.legacy_doc.document import (  # noqa
-    ExportedCCSDocument as Document,
-)
-from docling_core.types.rec.record import Record  # noqa