docling-core 2.0.1__tar.gz → 2.1.0__tar.gz
This diff shows the changes between two publicly released versions of the package, exactly as they appear in the public registry, and is provided for informational purposes only.
- {docling_core-2.0.1 → docling_core-2.1.0}/PKG-INFO +2 -2
- {docling_core-2.0.1 → docling_core-2.1.0}/README.md +1 -1
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/transforms/chunker/__init__.py +4 -1
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/transforms/chunker/base.py +1 -1
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/transforms/chunker/hierarchical_chunker.py +58 -6
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/doc/__init__.py +1 -1
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/doc/base.py +7 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/doc/document.py +245 -111
- {docling_core-2.0.1 → docling_core-2.1.0}/pyproject.toml +1 -1
- {docling_core-2.0.1 → docling_core-2.1.0}/LICENSE +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/__init__.py +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/py.typed +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/search/__init__.py +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/search/mapping.py +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/search/meta.py +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/search/package.py +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/transforms/__init__.py +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/__init__.py +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/base.py +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/doc/labels.py +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/gen/__init__.py +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/gen/generic.py +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/legacy_doc/__init__.py +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/legacy_doc/base.py +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/legacy_doc/document.py +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/legacy_doc/tokens.py +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/nlp/__init__.py +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/nlp/qa.py +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/nlp/qa_labels.py +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/rec/__init__.py +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/rec/attribute.py +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/rec/base.py +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/rec/predicate.py +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/rec/record.py +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/rec/statement.py +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/rec/subject.py +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/utils/__init__.py +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/utils/alias.py +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/utils/file.py +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/utils/generate_docs.py +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/utils/generate_jsonschema.py +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/utils/validate.py +0 -0
- {docling_core-2.0.1 → docling_core-2.1.0}/docling_core/utils/validators.py +0 -0
{docling_core-2.0.1 → docling_core-2.1.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling-core
-Version: 2.0.1
+Version: 2.1.0
 Summary: A python library to define and validate data types in Docling.
 Home-page: https://ds4sd.github.io/
 License: MIT
@@ -95,7 +95,7 @@ poetry run pytest test
 
 Docling Core contains 3 top-level data types:
 
-- **DoclingDocument** for publications like books, articles, reports, or patents.
+- **DoclingDocument** for publications like books, articles, reports, or patents. The JSON that can be exported using Docling follows this schema.
 The DoclingDocument type also models the metadata that may be attached to the converted document.
 Check [DoclingDocument](docs/DoclingDocument.json) for the full JSON schema.
 - **Record** for structured database records, centered on an entity or _subject_ that is provided with a list of attributes.
{docling_core-2.0.1 → docling_core-2.1.0}/README.md
@@ -59,7 +59,7 @@ poetry run pytest test
 
 Docling Core contains 3 top-level data types:
 
-- **DoclingDocument** for publications like books, articles, reports, or patents.
+- **DoclingDocument** for publications like books, articles, reports, or patents. The JSON that can be exported using Docling follows this schema.
 The DoclingDocument type also models the metadata that may be attached to the converted document.
 Check [DoclingDocument](docs/DoclingDocument.json) for the full JSON schema.
 - **Record** for structured database records, centered on an entity or _subject_ that is provided with a list of attributes.
{docling_core-2.0.1 → docling_core-2.1.0}/docling_core/transforms/chunker/__init__.py
@@ -6,4 +6,7 @@
 """Define the chunker types."""
 
 from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta
-from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
+from docling_core.transforms.chunker.hierarchical_chunker import (
+    DocMeta,
+    HierarchicalChunker,
+)
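Net effect of this change: `DocMeta` is now re-exported next to `HierarchicalChunker`, so downstream code no longer has to reach into the longer `hierarchical_chunker` module path. A minimal illustration (nothing beyond the import is assumed):

```python
# Both names are now importable from the package-level chunker module.
from docling_core.transforms.chunker import DocMeta, HierarchicalChunker
```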
{docling_core-2.0.1 → docling_core-2.1.0}/docling_core/transforms/chunker/hierarchical_chunker.py RENAMED
@@ -8,15 +8,19 @@
 from __future__ import annotations
 
 import logging
-
+import re
+from typing import Any, ClassVar, Final, Iterator, Literal, Optional
 
 from pandas import DataFrame
-from pydantic import Field
+from pydantic import Field, StringConstraints, field_validator
+from typing_extensions import Annotated
 
+from docling_core.search.package import VERSION_PATTERN
 from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
-from docling_core.types
+from docling_core.types import DoclingDocument as DLDocument
 from docling_core.types.doc.document import (
     DocItem,
+    DocumentOrigin,
     LevelNumber,
     ListItem,
     SectionHeaderItem,
@@ -25,16 +29,31 @@ from docling_core.types.doc.document import (
 )
 from docling_core.types.doc.labels import DocItemLabel
 
+_VERSION: Final = "1.0.0"
+
+_KEY_SCHEMA_NAME = "schema_name"
+_KEY_VERSION = "version"
 _KEY_DOC_ITEMS = "doc_items"
 _KEY_HEADINGS = "headings"
 _KEY_CAPTIONS = "captions"
+_KEY_ORIGIN = "origin"
 
 _logger = logging.getLogger(__name__)
 
 
 class DocMeta(BaseMeta):
-    """Data model for Hierarchical Chunker metadata."""
+    """Data model for Hierarchical Chunker chunk metadata."""
 
+    schema_name: Literal["docling_core.transforms.chunker.DocMeta"] = Field(
+        default="docling_core.transforms.chunker.DocMeta",
+        alias=_KEY_SCHEMA_NAME,
+    )
+    version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = (
+        Field(
+            default=_VERSION,
+            alias=_KEY_VERSION,
+        )
+    )
     doc_items: list[DocItem] = Field(
         alias=_KEY_DOC_ITEMS,
         min_length=1,
@@ -49,9 +68,39 @@ class DocMeta(BaseMeta):
         alias=_KEY_CAPTIONS,
         min_length=1,
     )
+    origin: Optional[DocumentOrigin] = Field(
+        default=None,
+        alias=_KEY_ORIGIN,
+    )
 
-    excluded_embed: ClassVar[list[str]] = [
-
+    excluded_embed: ClassVar[list[str]] = [
+        _KEY_SCHEMA_NAME,
+        _KEY_VERSION,
+        _KEY_DOC_ITEMS,
+        _KEY_ORIGIN,
+    ]
+    excluded_llm: ClassVar[list[str]] = [
+        _KEY_SCHEMA_NAME,
+        _KEY_VERSION,
+        _KEY_DOC_ITEMS,
+        _KEY_ORIGIN,
+    ]
+
+    @field_validator(_KEY_VERSION)
+    @classmethod
+    def check_version_is_compatible(cls, v: str) -> str:
+        """Check if this meta item version is compatible with current version."""
+        current_match = re.match(VERSION_PATTERN, _VERSION)
+        doc_match = re.match(VERSION_PATTERN, v)
+        if (
+            doc_match is None
+            or current_match is None
+            or doc_match["major"] != current_match["major"]
+            or doc_match["minor"] > current_match["minor"]
+        ):
+            raise ValueError(f"incompatible version {v} with schema version {_VERSION}")
+        else:
+            return _VERSION
 
 
 class DocChunk(BaseChunk):
@@ -129,6 +178,7 @@ class HierarchicalChunker(BaseChunker):
                             for k in sorted(heading_by_level)
                         ]
                         or None,
+                        origin=dl_doc.origin,
                     ),
                 )
                 list_items = []  # reset
@@ -171,6 +221,7 @@ class HierarchicalChunker(BaseChunker):
                         headings=[heading_by_level[k] for k in sorted(heading_by_level)]
                         or None,
                         captions=captions,
+                        origin=dl_doc.origin,
                     ),
                 )
                 yield c
@@ -182,5 +233,6 @@ class HierarchicalChunker(BaseChunker):
                     doc_items=list_items,
                     headings=[heading_by_level[k] for k in sorted(heading_by_level)]
                     or None,
+                    origin=dl_doc.origin,
                 ),
             )
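Taken together, the hierarchical_chunker.py changes mean every chunk's metadata now carries a fixed `schema_name`, a semver-checked `version` (validated against `_VERSION = "1.0.0"`), and the source document's `origin`, while those fields plus `doc_items` are excluded from embedding/LLM serialization. A minimal usage sketch follows; it assumes `doc` is an already-converted `DoclingDocument`, which is not constructed here.

```python
from docling_core.transforms.chunker import DocMeta, HierarchicalChunker

chunker = HierarchicalChunker()
for chunk in chunker.chunk(doc):  # `doc`: an assumed, pre-built DoclingDocument
    meta: DocMeta = chunk.meta
    print(meta.schema_name)  # "docling_core.transforms.chunker.DocMeta"
    print(meta.version)      # "1.0.0" unless deserialized from older metadata
    print(meta.origin)       # DocumentOrigin of the converted source, or None
    print(meta.headings, len(meta.doc_items))
```

Per the `check_version_is_compatible` validator above, deserializing metadata whose major version differs (or whose minor version is newer) raises a validation error, so persisted chunks from an incompatible schema version fail fast.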
{docling_core-2.0.1 → docling_core-2.1.0}/docling_core/types/doc/document.py
@@ -3,6 +3,7 @@
 import base64
 import mimetypes
 import re
+import sys
 import typing
 from io import BytesIO
 from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
@@ -25,6 +26,7 @@ from typing_extensions import Annotated, Self
 from docling_core.search.package import VERSION_PATTERN
 from docling_core.types.base import _JSON_POINTER_REGEX
 from docling_core.types.doc import BoundingBox, Size
+from docling_core.types.doc.base import ImageRefMode
 from docling_core.types.doc.labels import DocItemLabel, GroupLabel
 from docling_core.types.legacy_doc.tokens import DocumentToken
 
@@ -215,6 +217,7 @@ class DocumentOrigin(BaseModel):
         "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
         "application/vnd.openxmlformats-officedocument.presentationml.presentation",
         "text/asciidoc",
+        "text/markdown",
     ]
 
     @field_validator("binary_hash", mode="before")
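The `DocumentOrigin` hunk simply whitelists `text/markdown` as an accepted source mimetype. A small sketch; the hash and filename are made-up illustration values, not taken from the diff:

```python
from docling_core.types.doc.document import DocumentOrigin

# "text/markdown" now passes the mimetype check; binary_hash and filename
# below are arbitrary example values.
origin = DocumentOrigin(
    mimetype="text/markdown",
    binary_hash=42,
    filename="example.md",
)
```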
@@ -1108,12 +1111,14 @@ class DoclingDocument(BaseModel):
 
     def export_to_markdown(  # noqa: C901
         self,
-        delim: str = "\n
+        delim: str = "\n",
         from_element: int = 0,
-        to_element:
+        to_element: int = sys.maxsize,
         labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
         strict_text: bool = False,
         image_placeholder: str = "<!-- image -->",
+        image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
+        indent: int = 4,
     ) -> str:
         r"""Serialize to Markdown.
 
@@ -1143,136 +1148,150 @@ class DoclingDocument(BaseModel):
         :param strict_text: bool: (Default value = False)
         :param image_placeholder str: (Default value = "<!-- image -->")
             the placeholder to include to position images in the markdown.
+        :param indent: int (default=4): indent of the nested lists
         :returns: The exported Markdown representation.
         :rtype: str
         """
-
-
-
+        mdtexts: list[str] = []
+        list_nesting_level = 0  # Track the current list nesting level
+        previous_level = 0  # Track the previous item's level
+        in_list = False  # Track if we're currently processing list items
 
-
-
-
-
-
-
-
-
-            if to_element and ix >= to_element:
-                break
+        for ix, (item, level) in enumerate(
+            self.iterate_items(self.body, with_groups=True)
+        ):
+            # If we've moved to a lower level, we're exiting one or more groups
+            if level < previous_level:
+                # Calculate how many levels we've exited
+                level_difference = previous_level - level
+                # Decrement list_nesting_level for each list group we've exited
+                list_nesting_level = max(0, list_nesting_level - level_difference)
 
-
-                isinstance(item, (TableItem, PictureItem))
-                and len(item.captions) > 0
-                and item.label in labels
-            ):
-                caption = item.caption_text(self)
-                if caption:
-                    embedded_captions.add(caption)
+            previous_level = level  # Update previous_level for next iteration
 
-
-        for ix, (item, level) in enumerate(self.iterate_items(self.body)):
-            if skip_count < from_element:
-                skip_count += 1
+            if ix < from_element and to_element <= ix:
                 continue  # skip as many items as you want
 
-
-
-
-
-
-
-
-
-            if isinstance(item, TextItem) and item_type in labels:
-                text = item.text
-
-                # skip captions of they are embedded in the actual
-                # floating object
-                if item_type == DocItemLabel.CAPTION and text in embedded_captions:
-                    continue
-
-                # ignore repeated text
-                if prev_text == text or text is None:
-                    continue
-                else:
-                    prev_text = text
-
-                # first title match
-                if item_type == DocItemLabel.TITLE and not has_title:
-                    if strict_text:
-                        markdown_text = f"{text}"
-                    else:
-                        markdown_text = f"# {text}"
-                    has_title = True
-
-                # secondary titles
-                elif item_type in {
-                    DocItemLabel.TITLE,
-                    DocItemLabel.SECTION_HEADER,
-                } or (has_title and item_type == DocItemLabel.TITLE):
-                    if strict_text:
-                        markdown_text = f"{text}"
-                    else:
-                        markdown_text = f"## {text}"
-
-                # secondary titles
-                elif isinstance(item, ListItem):
-                    if item.enumerated:
-                        marker = item.marker
-                    else:
-                        marker = "-"
-
-                    markdown_text = f"{marker} {text}"
-
-                # normal text
-                else:
-                    markdown_text = text
-
-            elif isinstance(item, TableItem) and item.data and item_type in labels:
-                parts = []
-
-                # Compute the caption
-                if caption := item.caption_text(self):
-                    parts.append(caption)
-                    parts.append("\n")
+            # Handle newlines between different types of content
+            if (
+                len(mdtexts) > 0
+                and not isinstance(item, (ListItem, GroupItem))
+                and in_list
+            ):
+                mdtexts[-1] += "\n"
+                in_list = False
 
-
-
-
-
-                parts.append(item.export_to_markdown())
+            if isinstance(item, GroupItem) and item.label in [
+                GroupLabel.LIST,
+                GroupLabel.ORDERED_LIST,
+            ]:
 
-
-
+                if list_nesting_level == 0:  # Check if we're on the top level.
+                    # In that case a new list starts directly after another list.
+                    mdtexts.append("\n")  # Add a blank line
 
-
-
+                # Increment list nesting level when entering a new list
+                list_nesting_level += 1
+                in_list = True
+                continue
 
-
-
-                    parts.append(caption)
-                    parts.append("\n")
+            elif isinstance(item, GroupItem):
+                continue
 
-
-
-
+            elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
+                in_list = False
+                marker = "" if strict_text else "#"
+                text = f"{marker} {item.text}\n"
+                mdtexts.append(text.strip())
+
+            elif (
+                isinstance(item, TextItem)
+                and item.label in [DocItemLabel.SECTION_HEADER]
+            ) or isinstance(item, SectionHeaderItem):
+                in_list = False
+                marker = ""
+                if not strict_text:
+                    marker = "#" * level
+                    if len(marker) < 2:
+                        marker = "##"
+                text = f"{marker} {item.text}\n"
+                mdtexts.append(text.strip() + "\n")
+
+            elif isinstance(item, TextItem) and item.label in [DocItemLabel.CODE]:
+                in_list = False
+                text = f"```\n{item.text}\n```\n"
+                mdtexts.append(text)
+
+            elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
+                # captions are printed in picture and table ... skipping for now
+                continue
 
-
-
+            elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
+                in_list = True
+                # Calculate indent based on list_nesting_level
+                # -1 because level 1 needs no indent
+                list_indent = " " * (indent * (list_nesting_level - 1))
+
+                marker = ""
+                if strict_text:
+                    marker = ""
+                elif item.enumerated:
+                    marker = item.marker
+                else:
+                    marker = "-"  # Markdown needs only dash as item marker.
+
+                text = f"{list_indent}{marker} {item.text}"
+                mdtexts.append(text)
+
+            elif isinstance(item, TextItem) and item.label in labels:
+                in_list = False
+                if len(item.text):
+                    text = f"{item.text}\n"
+                    mdtexts.append(text)
+
+            elif isinstance(item, TableItem) and not strict_text:
+                in_list = False
+                mdtexts.append(item.caption_text(self))
+                md_table = item.export_to_markdown()
+                mdtexts.append("\n" + md_table + "\n")
+
+            elif isinstance(item, PictureItem) and not strict_text:
+                in_list = False
+                mdtexts.append(item.caption_text(self))
+
+                if image_mode == ImageRefMode.PLACEHOLDER:
+                    mdtexts.append("\n" + image_placeholder + "\n")
+                elif image_mode == ImageRefMode.EMBEDDED and isinstance(
+                    item.image, ImageRef
+                ):
+                    text = f"\n"
+                    mdtexts.append(text)
+                elif image_mode == ImageRefMode.EMBEDDED and not isinstance(
+                    item.image, ImageRef
+                ):
+                    text = (
+                        "<!-- 🖼️❌ Image not available. "
+                        "Please use `PdfPipelineOptions(generate_picture_images=True)`"
+                        " --> "
+                    )
+                    mdtexts.append(text)
 
-
-
+            elif isinstance(item, DocItem) and item.label in labels:
+                in_list = False
+                text = "<missing-text>"
+                mdtexts.append(text)
 
-
-
+        mdtext = (delim.join(mdtexts)).strip()
+        mdtext = re.sub(
+            r"\n\n\n+", "\n\n", mdtext
+        )  # remove cases of double or more empty lines.
+        return mdtext
 
     def export_to_text(  # noqa: C901
         self,
         delim: str = "\n\n",
         from_element: int = 0,
-        to_element:
+        to_element: int = 1000000,
         labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
     ) -> str:
         """export_to_text."""
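The rewritten `export_to_markdown` walks the body with `iterate_items(..., with_groups=True)`, tracks list nesting to emit indented Markdown lists, and gains two keyword arguments: `image_mode` and `indent`. A usage sketch, assuming `doc` is a `DoclingDocument` obtained elsewhere:

```python
from docling_core.types.doc.base import ImageRefMode

md = doc.export_to_markdown(              # `doc`: an assumed DoclingDocument
    image_mode=ImageRefMode.PLACEHOLDER,  # or ImageRefMode.EMBEDDED to inline images
    image_placeholder="<!-- image -->",
    indent=4,                             # spaces per nested-list level
)
print(md)
```

With `ImageRefMode.EMBEDDED`, pictures that carry no `ImageRef` fall back to the "🖼️❌ Image not available" comment shown in the diff.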
@@ -1399,6 +1418,121 @@ class DoclingDocument(BaseModel):
 
         return doctags
 
+    def _export_to_indented_text(
+        self, indent=" ", max_text_len: int = -1, explicit_tables: bool = False
+    ):
+        """Export the document to indented text to expose hierarchy."""
+        result = []
+
+        def get_text(text: str, max_text_len: int):
+
+            middle = " ... "
+
+            if max_text_len == -1:
+                return text
+            elif len(text) < max_text_len + len(middle):
+                return text
+            else:
+                tbeg = int((max_text_len - len(middle)) / 2)
+                tend = int(max_text_len - tbeg)
+
+                return text[0:tbeg] + middle + text[-tend:]
+
+        for i, (item, level) in enumerate(self.iterate_items(with_groups=True)):
+            if isinstance(item, GroupItem):
+                result.append(
+                    indent * level
+                    + f"item-{i} at level {level}: {item.label}: group {item.name}"
+                )
+
+            elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
+                text = get_text(text=item.text, max_text_len=max_text_len)
+
+                result.append(
+                    indent * level + f"item-{i} at level {level}: {item.label}: {text}"
+                )
+
+            elif isinstance(item, SectionHeaderItem):
+                text = get_text(text=item.text, max_text_len=max_text_len)
+
+                result.append(
+                    indent * level + f"item-{i} at level {level}: {item.label}: {text}"
+                )
+
+            elif isinstance(item, TextItem) and item.label in [DocItemLabel.CODE]:
+                text = get_text(text=item.text, max_text_len=max_text_len)
+
+                result.append(
+                    indent * level + f"item-{i} at level {level}: {item.label}: {text}"
+                )
+
+            elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
+                # captions are printed in picture and table ... skipping for now
+                continue
+
+            elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
+                text = get_text(text=item.text, max_text_len=max_text_len)
+
+                result.append(
+                    indent * level + f"item-{i} at level {level}: {item.label}: {text}"
+                )
+
+            elif isinstance(item, TextItem):
+                text = get_text(text=item.text, max_text_len=max_text_len)
+
+                result.append(
+                    indent * level + f"item-{i} at level {level}: {item.label}: {text}"
+                )
+
+            elif isinstance(item, TableItem):
+
+                result.append(
+                    indent * level
+                    + f"item-{i} at level {level}: {item.label} with "
+                    + f"[{item.data.num_rows}x{item.data.num_cols}]"
+                )
+
+                for _ in item.captions:
+                    caption = _.resolve(self)
+                    result.append(
+                        indent * (level + 1)
+                        + f"item-{i} at level {level + 1}: {caption.label}: "
+                        + f"{caption.text}"
+                    )
+
+                if explicit_tables:
+                    grid: list[list[str]] = []
+                    for i, row in enumerate(item.data.grid):
+                        grid.append([])
+                        for j, cell in enumerate(row):
+                            if j < 10:
+                                text = get_text(text=cell.text, max_text_len=16)
+                                grid[-1].append(text)
+
+                    result.append("\n" + tabulate(grid) + "\n")
+
+            elif isinstance(item, PictureItem):
+
+                result.append(
+                    indent * level + f"item-{i} at level {level}: {item.label}"
+                )
+
+                for _ in item.captions:
+                    caption = _.resolve(self)
+                    result.append(
+                        indent * (level + 1)
+                        + f"item-{i} at level {level + 1}: {caption.label}: "
+                        + f"{caption.text}"
+                    )
+
+            elif isinstance(item, DocItem):
+                result.append(
+                    indent * (level + 1)
+                    + f"item-{i} at level {level}: {item.label}: ignored"
+                )
+
+        return "\n".join(result)
+
     def add_page(
         self, page_no: int, size: Size, image: Optional[ImageRef] = None
     ) -> PageItem:
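`_export_to_indented_text` is a private debugging helper: it prints one line per item (`item-{i} at level {level}: ...`), truncating long texts around a " ... " marker. A quick sketch, again assuming an existing `doc`; being underscore-prefixed, it is internal API and may change:

```python
# One line per document item, indented by tree level; texts clipped to ~64 chars.
print(doc._export_to_indented_text(max_text_len=64, explicit_tables=False))
```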
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.0.1 → docling_core-2.1.0}/docling_core/resources/schemas/doc/OCR-output.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{docling_core-2.0.1 → docling_core-2.1.0}/docling_core/search/json_schema_to_search_mapper.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|