docling-core 2.0.1__py3-none-any.whl → 2.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/transforms/chunker/__init__.py +4 -1
- docling_core/transforms/chunker/base.py +1 -1
- docling_core/transforms/chunker/hierarchical_chunker.py +58 -6
- docling_core/types/doc/__init__.py +1 -1
- docling_core/types/doc/base.py +7 -0
- docling_core/types/doc/document.py +252 -112
- docling_core/utils/file.py +17 -4
- {docling_core-2.0.1.dist-info → docling_core-2.2.0.dist-info}/METADATA +2 -2
- {docling_core-2.0.1.dist-info → docling_core-2.2.0.dist-info}/RECORD +12 -12
- {docling_core-2.0.1.dist-info → docling_core-2.2.0.dist-info}/LICENSE +0 -0
- {docling_core-2.0.1.dist-info → docling_core-2.2.0.dist-info}/WHEEL +0 -0
- {docling_core-2.0.1.dist-info → docling_core-2.2.0.dist-info}/entry_points.txt +0 -0
|
@@ -6,4 +6,7 @@
|
|
|
6
6
|
"""Define the chunker types."""
|
|
7
7
|
|
|
8
8
|
from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta
|
|
9
|
-
from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
|
|
9
|
+
from docling_core.transforms.chunker.hierarchical_chunker import (
|
|
10
|
+
DocMeta,
|
|
11
|
+
HierarchicalChunker,
|
|
12
|
+
)
|
|
@@ -8,15 +8,19 @@
|
|
|
8
8
|
from __future__ import annotations
|
|
9
9
|
|
|
10
10
|
import logging
|
|
11
|
-
|
|
11
|
+
import re
|
|
12
|
+
from typing import Any, ClassVar, Final, Iterator, Literal, Optional
|
|
12
13
|
|
|
13
14
|
from pandas import DataFrame
|
|
14
|
-
from pydantic import Field
|
|
15
|
+
from pydantic import Field, StringConstraints, field_validator
|
|
16
|
+
from typing_extensions import Annotated
|
|
15
17
|
|
|
18
|
+
from docling_core.search.package import VERSION_PATTERN
|
|
16
19
|
from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
|
|
17
|
-
from docling_core.types.doc import DoclingDocument as DLDocument
|
|
20
|
+
from docling_core.types import DoclingDocument as DLDocument
|
|
18
21
|
from docling_core.types.doc.document import (
|
|
19
22
|
DocItem,
|
|
23
|
+
DocumentOrigin,
|
|
20
24
|
LevelNumber,
|
|
21
25
|
ListItem,
|
|
22
26
|
SectionHeaderItem,
|
|
@@ -25,16 +29,31 @@ from docling_core.types.doc.document import (
|
|
|
25
29
|
)
|
|
26
30
|
from docling_core.types.doc.labels import DocItemLabel
|
|
27
31
|
|
|
32
|
+
_VERSION: Final = "1.0.0"
|
|
33
|
+
|
|
34
|
+
_KEY_SCHEMA_NAME = "schema_name"
|
|
35
|
+
_KEY_VERSION = "version"
|
|
28
36
|
_KEY_DOC_ITEMS = "doc_items"
|
|
29
37
|
_KEY_HEADINGS = "headings"
|
|
30
38
|
_KEY_CAPTIONS = "captions"
|
|
39
|
+
_KEY_ORIGIN = "origin"
|
|
31
40
|
|
|
32
41
|
_logger = logging.getLogger(__name__)
|
|
33
42
|
|
|
34
43
|
|
|
35
44
|
class DocMeta(BaseMeta):
|
|
36
|
-
"""Data model for Hierarchical Chunker metadata."""
|
|
45
|
+
"""Data model for Hierarchical Chunker chunk metadata."""
|
|
37
46
|
|
|
47
|
+
schema_name: Literal["docling_core.transforms.chunker.DocMeta"] = Field(
|
|
48
|
+
default="docling_core.transforms.chunker.DocMeta",
|
|
49
|
+
alias=_KEY_SCHEMA_NAME,
|
|
50
|
+
)
|
|
51
|
+
version: Annotated[str, StringConstraints(pattern=VERSION_PATTERN, strict=True)] = (
|
|
52
|
+
Field(
|
|
53
|
+
default=_VERSION,
|
|
54
|
+
alias=_KEY_VERSION,
|
|
55
|
+
)
|
|
56
|
+
)
|
|
38
57
|
doc_items: list[DocItem] = Field(
|
|
39
58
|
alias=_KEY_DOC_ITEMS,
|
|
40
59
|
min_length=1,
|
|
@@ -49,9 +68,39 @@ class DocMeta(BaseMeta):
|
|
|
49
68
|
alias=_KEY_CAPTIONS,
|
|
50
69
|
min_length=1,
|
|
51
70
|
)
|
|
71
|
+
origin: Optional[DocumentOrigin] = Field(
|
|
72
|
+
default=None,
|
|
73
|
+
alias=_KEY_ORIGIN,
|
|
74
|
+
)
|
|
52
75
|
|
|
53
|
-
excluded_embed: ClassVar[list[str]] = [
|
|
54
|
-
|
|
76
|
+
excluded_embed: ClassVar[list[str]] = [
|
|
77
|
+
_KEY_SCHEMA_NAME,
|
|
78
|
+
_KEY_VERSION,
|
|
79
|
+
_KEY_DOC_ITEMS,
|
|
80
|
+
_KEY_ORIGIN,
|
|
81
|
+
]
|
|
82
|
+
excluded_llm: ClassVar[list[str]] = [
|
|
83
|
+
_KEY_SCHEMA_NAME,
|
|
84
|
+
_KEY_VERSION,
|
|
85
|
+
_KEY_DOC_ITEMS,
|
|
86
|
+
_KEY_ORIGIN,
|
|
87
|
+
]
|
|
88
|
+
|
|
89
|
+
@field_validator(_KEY_VERSION)
|
|
90
|
+
@classmethod
|
|
91
|
+
def check_version_is_compatible(cls, v: str) -> str:
|
|
92
|
+
"""Check if this meta item version is compatible with current version."""
|
|
93
|
+
current_match = re.match(VERSION_PATTERN, _VERSION)
|
|
94
|
+
doc_match = re.match(VERSION_PATTERN, v)
|
|
95
|
+
if (
|
|
96
|
+
doc_match is None
|
|
97
|
+
or current_match is None
|
|
98
|
+
or doc_match["major"] != current_match["major"]
|
|
99
|
+
or doc_match["minor"] > current_match["minor"]
|
|
100
|
+
):
|
|
101
|
+
raise ValueError(f"incompatible version {v} with schema version {_VERSION}")
|
|
102
|
+
else:
|
|
103
|
+
return _VERSION
|
|
55
104
|
|
|
56
105
|
|
|
57
106
|
class DocChunk(BaseChunk):
|
|
@@ -129,6 +178,7 @@ class HierarchicalChunker(BaseChunker):
|
|
|
129
178
|
for k in sorted(heading_by_level)
|
|
130
179
|
]
|
|
131
180
|
or None,
|
|
181
|
+
origin=dl_doc.origin,
|
|
132
182
|
),
|
|
133
183
|
)
|
|
134
184
|
list_items = [] # reset
|
|
@@ -171,6 +221,7 @@ class HierarchicalChunker(BaseChunker):
|
|
|
171
221
|
headings=[heading_by_level[k] for k in sorted(heading_by_level)]
|
|
172
222
|
or None,
|
|
173
223
|
captions=captions,
|
|
224
|
+
origin=dl_doc.origin,
|
|
174
225
|
),
|
|
175
226
|
)
|
|
176
227
|
yield c
|
|
@@ -182,5 +233,6 @@ class HierarchicalChunker(BaseChunker):
|
|
|
182
233
|
doc_items=list_items,
|
|
183
234
|
headings=[heading_by_level[k] for k in sorted(heading_by_level)]
|
|
184
235
|
or None,
|
|
236
|
+
origin=dl_doc.origin,
|
|
185
237
|
),
|
|
186
238
|
)
|
docling_core/types/doc/base.py
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
import base64
|
|
4
4
|
import mimetypes
|
|
5
5
|
import re
|
|
6
|
+
import sys
|
|
6
7
|
import typing
|
|
7
8
|
from io import BytesIO
|
|
8
9
|
from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
|
|
@@ -25,6 +26,7 @@ from typing_extensions import Annotated, Self
|
|
|
25
26
|
from docling_core.search.package import VERSION_PATTERN
|
|
26
27
|
from docling_core.types.base import _JSON_POINTER_REGEX
|
|
27
28
|
from docling_core.types.doc import BoundingBox, Size
|
|
29
|
+
from docling_core.types.doc.base import ImageRefMode
|
|
28
30
|
from docling_core.types.doc.labels import DocItemLabel, GroupLabel
|
|
29
31
|
from docling_core.types.legacy_doc.tokens import DocumentToken
|
|
30
32
|
|
|
@@ -215,6 +217,7 @@ class DocumentOrigin(BaseModel):
|
|
|
215
217
|
"application/vnd.openxmlformats-officedocument.presentationml.slideshow",
|
|
216
218
|
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
|
217
219
|
"text/asciidoc",
|
|
220
|
+
"text/markdown",
|
|
218
221
|
]
|
|
219
222
|
|
|
220
223
|
@field_validator("binary_hash", mode="before")
|
|
@@ -588,7 +591,13 @@ class TableItem(FloatingItem):
|
|
|
588
591
|
for row in self.data.grid:
|
|
589
592
|
tmp = []
|
|
590
593
|
for col in row:
|
|
591
|
-
|
|
594
|
+
|
|
595
|
+
# make sure that md tables are not broken
|
|
596
|
+
# due to newline chars in the text
|
|
597
|
+
text = col.text
|
|
598
|
+
text = text.replace("\n", " ")
|
|
599
|
+
tmp.append(text)
|
|
600
|
+
|
|
592
601
|
table.append(tmp)
|
|
593
602
|
|
|
594
603
|
md_table = ""
|
|
@@ -1108,12 +1117,14 @@ class DoclingDocument(BaseModel):
|
|
|
1108
1117
|
|
|
1109
1118
|
def export_to_markdown( # noqa: C901
|
|
1110
1119
|
self,
|
|
1111
|
-
delim: str = "\n\n",
|
|
1120
|
+
delim: str = "\n",
|
|
1112
1121
|
from_element: int = 0,
|
|
1113
|
-
to_element: Optional[int] = None,
|
|
1122
|
+
to_element: int = sys.maxsize,
|
|
1114
1123
|
labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
|
|
1115
1124
|
strict_text: bool = False,
|
|
1116
1125
|
image_placeholder: str = "<!-- image -->",
|
|
1126
|
+
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
|
|
1127
|
+
indent: int = 4,
|
|
1117
1128
|
) -> str:
|
|
1118
1129
|
r"""Serialize to Markdown.
|
|
1119
1130
|
|
|
@@ -1143,136 +1154,150 @@ class DoclingDocument(BaseModel):
|
|
|
1143
1154
|
:param strict_text: bool: (Default value = False)
|
|
1144
1155
|
:param image_placeholder str: (Default value = "<!-- image -->")
|
|
1145
1156
|
the placeholder to include to position images in the markdown.
|
|
1157
|
+
:param indent: int (default=4): indent of the nested lists
|
|
1146
1158
|
:returns: The exported Markdown representation.
|
|
1147
1159
|
:rtype: str
|
|
1148
1160
|
"""
|
|
1149
|
-
|
|
1150
|
-
|
|
1151
|
-
|
|
1161
|
+
mdtexts: list[str] = []
|
|
1162
|
+
list_nesting_level = 0 # Track the current list nesting level
|
|
1163
|
+
previous_level = 0 # Track the previous item's level
|
|
1164
|
+
in_list = False # Track if we're currently processing list items
|
|
1152
1165
|
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
if to_element and ix >= to_element:
|
|
1163
|
-
break
|
|
1166
|
+
for ix, (item, level) in enumerate(
|
|
1167
|
+
self.iterate_items(self.body, with_groups=True)
|
|
1168
|
+
):
|
|
1169
|
+
# If we've moved to a lower level, we're exiting one or more groups
|
|
1170
|
+
if level < previous_level:
|
|
1171
|
+
# Calculate how many levels we've exited
|
|
1172
|
+
level_difference = previous_level - level
|
|
1173
|
+
# Decrement list_nesting_level for each list group we've exited
|
|
1174
|
+
list_nesting_level = max(0, list_nesting_level - level_difference)
|
|
1164
1175
|
|
|
1165
|
-
|
|
1166
|
-
isinstance(item, (TableItem, PictureItem))
|
|
1167
|
-
and len(item.captions) > 0
|
|
1168
|
-
and item.label in labels
|
|
1169
|
-
):
|
|
1170
|
-
caption = item.caption_text(self)
|
|
1171
|
-
if caption:
|
|
1172
|
-
embedded_captions.add(caption)
|
|
1176
|
+
previous_level = level # Update previous_level for next iteration
|
|
1173
1177
|
|
|
1174
|
-
|
|
1175
|
-
for ix, (item, level) in enumerate(self.iterate_items(self.body)):
|
|
1176
|
-
if skip_count < from_element:
|
|
1177
|
-
skip_count += 1
|
|
1178
|
+
if ix < from_element and to_element <= ix:
|
|
1178
1179
|
continue # skip as many items as you want
|
|
1179
1180
|
|
|
1180
|
-
|
|
1181
|
-
|
|
1182
|
-
|
|
1183
|
-
|
|
1184
|
-
|
|
1185
|
-
|
|
1186
|
-
|
|
1187
|
-
|
|
1188
|
-
if isinstance(item, TextItem) and item_type in labels:
|
|
1189
|
-
text = item.text
|
|
1190
|
-
|
|
1191
|
-
# skip captions of they are embedded in the actual
|
|
1192
|
-
# floating object
|
|
1193
|
-
if item_type == DocItemLabel.CAPTION and text in embedded_captions:
|
|
1194
|
-
continue
|
|
1195
|
-
|
|
1196
|
-
# ignore repeated text
|
|
1197
|
-
if prev_text == text or text is None:
|
|
1198
|
-
continue
|
|
1199
|
-
else:
|
|
1200
|
-
prev_text = text
|
|
1201
|
-
|
|
1202
|
-
# first title match
|
|
1203
|
-
if item_type == DocItemLabel.TITLE and not has_title:
|
|
1204
|
-
if strict_text:
|
|
1205
|
-
markdown_text = f"{text}"
|
|
1206
|
-
else:
|
|
1207
|
-
markdown_text = f"# {text}"
|
|
1208
|
-
has_title = True
|
|
1209
|
-
|
|
1210
|
-
# secondary titles
|
|
1211
|
-
elif item_type in {
|
|
1212
|
-
DocItemLabel.TITLE,
|
|
1213
|
-
DocItemLabel.SECTION_HEADER,
|
|
1214
|
-
} or (has_title and item_type == DocItemLabel.TITLE):
|
|
1215
|
-
if strict_text:
|
|
1216
|
-
markdown_text = f"{text}"
|
|
1217
|
-
else:
|
|
1218
|
-
markdown_text = f"## {text}"
|
|
1219
|
-
|
|
1220
|
-
# secondary titles
|
|
1221
|
-
elif isinstance(item, ListItem):
|
|
1222
|
-
if item.enumerated:
|
|
1223
|
-
marker = item.marker
|
|
1224
|
-
else:
|
|
1225
|
-
marker = "-"
|
|
1226
|
-
|
|
1227
|
-
markdown_text = f"{marker} {text}"
|
|
1228
|
-
|
|
1229
|
-
# normal text
|
|
1230
|
-
else:
|
|
1231
|
-
markdown_text = text
|
|
1232
|
-
|
|
1233
|
-
elif isinstance(item, TableItem) and item.data and item_type in labels:
|
|
1234
|
-
parts = []
|
|
1235
|
-
|
|
1236
|
-
# Compute the caption
|
|
1237
|
-
if caption := item.caption_text(self):
|
|
1238
|
-
parts.append(caption)
|
|
1239
|
-
parts.append("\n")
|
|
1181
|
+
# Handle newlines between different types of content
|
|
1182
|
+
if (
|
|
1183
|
+
len(mdtexts) > 0
|
|
1184
|
+
and not isinstance(item, (ListItem, GroupItem))
|
|
1185
|
+
and in_list
|
|
1186
|
+
):
|
|
1187
|
+
mdtexts[-1] += "\n"
|
|
1188
|
+
in_list = False
|
|
1240
1189
|
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
1244
|
-
|
|
1245
|
-
parts.append(item.export_to_markdown())
|
|
1190
|
+
if isinstance(item, GroupItem) and item.label in [
|
|
1191
|
+
GroupLabel.LIST,
|
|
1192
|
+
GroupLabel.ORDERED_LIST,
|
|
1193
|
+
]:
|
|
1246
1194
|
|
|
1247
|
-
|
|
1248
|
-
|
|
1195
|
+
if list_nesting_level == 0: # Check if we're on the top level.
|
|
1196
|
+
# In that case a new list starts directly after another list.
|
|
1197
|
+
mdtexts.append("\n") # Add a blank line
|
|
1249
1198
|
|
|
1250
|
-
|
|
1251
|
-
|
|
1199
|
+
# Increment list nesting level when entering a new list
|
|
1200
|
+
list_nesting_level += 1
|
|
1201
|
+
in_list = True
|
|
1202
|
+
continue
|
|
1252
1203
|
|
|
1253
|
-
|
|
1254
|
-
|
|
1255
|
-
parts.append(caption)
|
|
1256
|
-
parts.append("\n")
|
|
1204
|
+
elif isinstance(item, GroupItem):
|
|
1205
|
+
continue
|
|
1257
1206
|
|
|
1258
|
-
|
|
1259
|
-
|
|
1260
|
-
|
|
1207
|
+
elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
|
|
1208
|
+
in_list = False
|
|
1209
|
+
marker = "" if strict_text else "#"
|
|
1210
|
+
text = f"{marker} {item.text}\n"
|
|
1211
|
+
mdtexts.append(text.strip())
|
|
1212
|
+
|
|
1213
|
+
elif (
|
|
1214
|
+
isinstance(item, TextItem)
|
|
1215
|
+
and item.label in [DocItemLabel.SECTION_HEADER]
|
|
1216
|
+
) or isinstance(item, SectionHeaderItem):
|
|
1217
|
+
in_list = False
|
|
1218
|
+
marker = ""
|
|
1219
|
+
if not strict_text:
|
|
1220
|
+
marker = "#" * level
|
|
1221
|
+
if len(marker) < 2:
|
|
1222
|
+
marker = "##"
|
|
1223
|
+
text = f"{marker} {item.text}\n"
|
|
1224
|
+
mdtexts.append(text.strip() + "\n")
|
|
1225
|
+
|
|
1226
|
+
elif isinstance(item, TextItem) and item.label in [DocItemLabel.CODE]:
|
|
1227
|
+
in_list = False
|
|
1228
|
+
text = f"```\n{item.text}\n```\n"
|
|
1229
|
+
mdtexts.append(text)
|
|
1230
|
+
|
|
1231
|
+
elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
|
|
1232
|
+
# captions are printed in picture and table ... skipping for now
|
|
1233
|
+
continue
|
|
1261
1234
|
|
|
1262
|
-
|
|
1263
|
-
|
|
1235
|
+
elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
|
|
1236
|
+
in_list = True
|
|
1237
|
+
# Calculate indent based on list_nesting_level
|
|
1238
|
+
# -1 because level 1 needs no indent
|
|
1239
|
+
list_indent = " " * (indent * (list_nesting_level - 1))
|
|
1240
|
+
|
|
1241
|
+
marker = ""
|
|
1242
|
+
if strict_text:
|
|
1243
|
+
marker = ""
|
|
1244
|
+
elif item.enumerated:
|
|
1245
|
+
marker = item.marker
|
|
1246
|
+
else:
|
|
1247
|
+
marker = "-" # Markdown needs only dash as item marker.
|
|
1248
|
+
|
|
1249
|
+
text = f"{list_indent}{marker} {item.text}"
|
|
1250
|
+
mdtexts.append(text)
|
|
1251
|
+
|
|
1252
|
+
elif isinstance(item, TextItem) and item.label in labels:
|
|
1253
|
+
in_list = False
|
|
1254
|
+
if len(item.text):
|
|
1255
|
+
text = f"{item.text}\n"
|
|
1256
|
+
mdtexts.append(text)
|
|
1257
|
+
|
|
1258
|
+
elif isinstance(item, TableItem) and not strict_text:
|
|
1259
|
+
in_list = False
|
|
1260
|
+
mdtexts.append(item.caption_text(self))
|
|
1261
|
+
md_table = item.export_to_markdown()
|
|
1262
|
+
mdtexts.append("\n" + md_table + "\n")
|
|
1263
|
+
|
|
1264
|
+
elif isinstance(item, PictureItem) and not strict_text:
|
|
1265
|
+
in_list = False
|
|
1266
|
+
mdtexts.append(item.caption_text(self))
|
|
1267
|
+
|
|
1268
|
+
if image_mode == ImageRefMode.PLACEHOLDER:
|
|
1269
|
+
mdtexts.append("\n" + image_placeholder + "\n")
|
|
1270
|
+
elif image_mode == ImageRefMode.EMBEDDED and isinstance(
|
|
1271
|
+
item.image, ImageRef
|
|
1272
|
+
):
|
|
1273
|
+
text = f"![Local Image]({item.image.uri})\n"
|
|
1274
|
+
mdtexts.append(text)
|
|
1275
|
+
elif image_mode == ImageRefMode.EMBEDDED and not isinstance(
|
|
1276
|
+
item.image, ImageRef
|
|
1277
|
+
):
|
|
1278
|
+
text = (
|
|
1279
|
+
"<!-- 🖼️❌ Image not available. "
|
|
1280
|
+
"Please use `PdfPipelineOptions(generate_picture_images=True)`"
|
|
1281
|
+
" --> "
|
|
1282
|
+
)
|
|
1283
|
+
mdtexts.append(text)
|
|
1264
1284
|
|
|
1265
|
-
|
|
1266
|
-
|
|
1285
|
+
elif isinstance(item, DocItem) and item.label in labels:
|
|
1286
|
+
in_list = False
|
|
1287
|
+
text = "<missing-text>"
|
|
1288
|
+
mdtexts.append(text)
|
|
1267
1289
|
|
|
1268
|
-
|
|
1269
|
-
|
|
1290
|
+
mdtext = (delim.join(mdtexts)).strip()
|
|
1291
|
+
mdtext = re.sub(
|
|
1292
|
+
r"\n\n\n+", "\n\n", mdtext
|
|
1293
|
+
) # remove cases of double or more empty lines.
|
|
1294
|
+
return mdtext
|
|
1270
1295
|
|
|
1271
1296
|
def export_to_text( # noqa: C901
|
|
1272
1297
|
self,
|
|
1273
1298
|
delim: str = "\n\n",
|
|
1274
1299
|
from_element: int = 0,
|
|
1275
|
-
to_element: Optional[int] = None,
|
|
1300
|
+
to_element: int = 1000000,
|
|
1276
1301
|
labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
|
|
1277
1302
|
) -> str:
|
|
1278
1303
|
"""export_to_text."""
|
|
@@ -1399,6 +1424,121 @@ class DoclingDocument(BaseModel):
|
|
|
1399
1424
|
|
|
1400
1425
|
return doctags
|
|
1401
1426
|
|
|
1427
|
+
def _export_to_indented_text(
|
|
1428
|
+
self, indent=" ", max_text_len: int = -1, explicit_tables: bool = False
|
|
1429
|
+
):
|
|
1430
|
+
"""Export the document to indented text to expose hierarchy."""
|
|
1431
|
+
result = []
|
|
1432
|
+
|
|
1433
|
+
def get_text(text: str, max_text_len: int):
|
|
1434
|
+
|
|
1435
|
+
middle = " ... "
|
|
1436
|
+
|
|
1437
|
+
if max_text_len == -1:
|
|
1438
|
+
return text
|
|
1439
|
+
elif len(text) < max_text_len + len(middle):
|
|
1440
|
+
return text
|
|
1441
|
+
else:
|
|
1442
|
+
tbeg = int((max_text_len - len(middle)) / 2)
|
|
1443
|
+
tend = int(max_text_len - tbeg)
|
|
1444
|
+
|
|
1445
|
+
return text[0:tbeg] + middle + text[-tend:]
|
|
1446
|
+
|
|
1447
|
+
for i, (item, level) in enumerate(self.iterate_items(with_groups=True)):
|
|
1448
|
+
if isinstance(item, GroupItem):
|
|
1449
|
+
result.append(
|
|
1450
|
+
indent * level
|
|
1451
|
+
+ f"item-{i} at level {level}: {item.label}: group {item.name}"
|
|
1452
|
+
)
|
|
1453
|
+
|
|
1454
|
+
elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
|
|
1455
|
+
text = get_text(text=item.text, max_text_len=max_text_len)
|
|
1456
|
+
|
|
1457
|
+
result.append(
|
|
1458
|
+
indent * level + f"item-{i} at level {level}: {item.label}: {text}"
|
|
1459
|
+
)
|
|
1460
|
+
|
|
1461
|
+
elif isinstance(item, SectionHeaderItem):
|
|
1462
|
+
text = get_text(text=item.text, max_text_len=max_text_len)
|
|
1463
|
+
|
|
1464
|
+
result.append(
|
|
1465
|
+
indent * level + f"item-{i} at level {level}: {item.label}: {text}"
|
|
1466
|
+
)
|
|
1467
|
+
|
|
1468
|
+
elif isinstance(item, TextItem) and item.label in [DocItemLabel.CODE]:
|
|
1469
|
+
text = get_text(text=item.text, max_text_len=max_text_len)
|
|
1470
|
+
|
|
1471
|
+
result.append(
|
|
1472
|
+
indent * level + f"item-{i} at level {level}: {item.label}: {text}"
|
|
1473
|
+
)
|
|
1474
|
+
|
|
1475
|
+
elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
|
|
1476
|
+
# captions are printed in picture and table ... skipping for now
|
|
1477
|
+
continue
|
|
1478
|
+
|
|
1479
|
+
elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
|
|
1480
|
+
text = get_text(text=item.text, max_text_len=max_text_len)
|
|
1481
|
+
|
|
1482
|
+
result.append(
|
|
1483
|
+
indent * level + f"item-{i} at level {level}: {item.label}: {text}"
|
|
1484
|
+
)
|
|
1485
|
+
|
|
1486
|
+
elif isinstance(item, TextItem):
|
|
1487
|
+
text = get_text(text=item.text, max_text_len=max_text_len)
|
|
1488
|
+
|
|
1489
|
+
result.append(
|
|
1490
|
+
indent * level + f"item-{i} at level {level}: {item.label}: {text}"
|
|
1491
|
+
)
|
|
1492
|
+
|
|
1493
|
+
elif isinstance(item, TableItem):
|
|
1494
|
+
|
|
1495
|
+
result.append(
|
|
1496
|
+
indent * level
|
|
1497
|
+
+ f"item-{i} at level {level}: {item.label} with "
|
|
1498
|
+
+ f"[{item.data.num_rows}x{item.data.num_cols}]"
|
|
1499
|
+
)
|
|
1500
|
+
|
|
1501
|
+
for _ in item.captions:
|
|
1502
|
+
caption = _.resolve(self)
|
|
1503
|
+
result.append(
|
|
1504
|
+
indent * (level + 1)
|
|
1505
|
+
+ f"item-{i} at level {level + 1}: {caption.label}: "
|
|
1506
|
+
+ f"{caption.text}"
|
|
1507
|
+
)
|
|
1508
|
+
|
|
1509
|
+
if explicit_tables:
|
|
1510
|
+
grid: list[list[str]] = []
|
|
1511
|
+
for i, row in enumerate(item.data.grid):
|
|
1512
|
+
grid.append([])
|
|
1513
|
+
for j, cell in enumerate(row):
|
|
1514
|
+
if j < 10:
|
|
1515
|
+
text = get_text(text=cell.text, max_text_len=16)
|
|
1516
|
+
grid[-1].append(text)
|
|
1517
|
+
|
|
1518
|
+
result.append("\n" + tabulate(grid) + "\n")
|
|
1519
|
+
|
|
1520
|
+
elif isinstance(item, PictureItem):
|
|
1521
|
+
|
|
1522
|
+
result.append(
|
|
1523
|
+
indent * level + f"item-{i} at level {level}: {item.label}"
|
|
1524
|
+
)
|
|
1525
|
+
|
|
1526
|
+
for _ in item.captions:
|
|
1527
|
+
caption = _.resolve(self)
|
|
1528
|
+
result.append(
|
|
1529
|
+
indent * (level + 1)
|
|
1530
|
+
+ f"item-{i} at level {level + 1}: {caption.label}: "
|
|
1531
|
+
+ f"{caption.text}"
|
|
1532
|
+
)
|
|
1533
|
+
|
|
1534
|
+
elif isinstance(item, DocItem):
|
|
1535
|
+
result.append(
|
|
1536
|
+
indent * (level + 1)
|
|
1537
|
+
+ f"item-{i} at level {level}: {item.label}: ignored"
|
|
1538
|
+
)
|
|
1539
|
+
|
|
1540
|
+
return "\n".join(result)
|
|
1541
|
+
|
|
1402
1542
|
def add_page(
|
|
1403
1543
|
self, page_no: int, size: Size, image: Optional[ImageRef] = None
|
|
1404
1544
|
) -> PageItem:
|
docling_core/utils/file.py
CHANGED
|
@@ -5,15 +5,18 @@
|
|
|
5
5
|
|
|
6
6
|
"""File-related utilities."""
|
|
7
7
|
|
|
8
|
+
import importlib
|
|
8
9
|
import tempfile
|
|
9
10
|
from pathlib import Path
|
|
10
|
-
from typing import Union
|
|
11
|
+
from typing import Dict, Optional, Union
|
|
11
12
|
|
|
12
13
|
import requests
|
|
13
14
|
from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
|
|
14
15
|
|
|
15
16
|
|
|
16
|
-
def resolve_file_source(
|
|
17
|
+
def resolve_file_source(
|
|
18
|
+
source: Union[Path, AnyHttpUrl, str], headers: Optional[Dict[str, str]] = None
|
|
19
|
+
) -> Path:
|
|
17
20
|
"""Resolves the source (URL, path) of a file to a local file path.
|
|
18
21
|
|
|
19
22
|
If a URL is provided, the content is first downloaded to a temporary local file.
|
|
@@ -29,7 +32,17 @@ def resolve_file_source(source: Union[Path, AnyHttpUrl, str]) -> Path:
|
|
|
29
32
|
"""
|
|
30
33
|
try:
|
|
31
34
|
http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
|
|
32
|
-
|
|
35
|
+
|
|
36
|
+
# make all header keys lower case
|
|
37
|
+
_headers = headers or {}
|
|
38
|
+
req_headers = {k.lower(): v for k, v in _headers.items()}
|
|
39
|
+
# add user-agent is not set
|
|
40
|
+
if "user-agent" not in req_headers:
|
|
41
|
+
agent_name = f"docling-core/{importlib.metadata.version('docling-core')}"
|
|
42
|
+
req_headers["user-agent"] = agent_name
|
|
43
|
+
|
|
44
|
+
# fetch the page
|
|
45
|
+
res = requests.get(http_url, stream=True, headers=req_headers)
|
|
33
46
|
res.raise_for_status()
|
|
34
47
|
fname = None
|
|
35
48
|
# try to get filename from response header
|
|
@@ -41,7 +54,7 @@ def resolve_file_source(source: Union[Path, AnyHttpUrl, str]) -> Path:
|
|
|
41
54
|
break
|
|
42
55
|
# otherwise, use name from URL:
|
|
43
56
|
if fname is None:
|
|
44
|
-
fname = Path(http_url.path or "file").name
|
|
57
|
+
fname = Path(http_url.path or "").name or "file"
|
|
45
58
|
local_path = Path(tempfile.mkdtemp()) / fname
|
|
46
59
|
with open(local_path, "wb") as f:
|
|
47
60
|
for chunk in res.iter_content(chunk_size=1024): # using 1-KB chunks
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.0.1
|
|
3
|
+
Version: 2.2.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Home-page: https://ds4sd.github.io/
|
|
6
6
|
License: MIT
|
|
@@ -95,7 +95,7 @@ poetry run pytest test
|
|
|
95
95
|
|
|
96
96
|
Docling Core contains 3 top-level data types:
|
|
97
97
|
|
|
98
|
-
- **DoclingDocument** for publications like books, articles, reports, or patents.
|
|
98
|
+
- **DoclingDocument** for publications like books, articles, reports, or patents. The JSON that can be exported using Docling follows this schema.
|
|
99
99
|
The DoclingDocument type also models the metadata that may be attached to the converted document.
|
|
100
100
|
Check [DoclingDocument](docs/DoclingDocument.json) for the full JSON schema.
|
|
101
101
|
- **Record** for structured database records, centered on an entity or _subject_ that is provided with a list of attributes.
|
|
@@ -14,14 +14,14 @@ docling_core/search/mapping.py,sha256=6rqG7LgYSeWmooKNEcRa5gFDLp1ZdzPqDGlwTA5gpO
|
|
|
14
14
|
docling_core/search/meta.py,sha256=wSurrsqdP1N3gQKx027fVdzVmc33a7Y6rPl-FClQvtA,3318
|
|
15
15
|
docling_core/search/package.py,sha256=Lz2ml2eDy5t0ZimnGTq-DXHAn-f18w0bn4H5xrhs75A,1841
|
|
16
16
|
docling_core/transforms/__init__.py,sha256=P81y_oqkiTN4Ld5crh1gQ6BbHqqR6C6nBt9ACDd57ds,106
|
|
17
|
-
docling_core/transforms/chunker/__init__.py,sha256=
|
|
18
|
-
docling_core/transforms/chunker/base.py,sha256=
|
|
19
|
-
docling_core/transforms/chunker/hierarchical_chunker.py,sha256
|
|
17
|
+
docling_core/transforms/chunker/__init__.py,sha256=sSSTnt7ZCt8Og1e0jhApNTtA0pyyHyzwcl8yXFLb2J8,292
|
|
18
|
+
docling_core/transforms/chunker/base.py,sha256=iPouZOJ3cYWvai4P0Gpd3QmsTKQuY5fFUXzTMk_XNmE,1571
|
|
19
|
+
docling_core/transforms/chunker/hierarchical_chunker.py,sha256=-wZrQAPMiROb9yKiSN3eGqtD13tjcDVAiDUz971Q8PI,8087
|
|
20
20
|
docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
|
|
21
21
|
docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
|
|
22
|
-
docling_core/types/doc/__init__.py,sha256=
|
|
23
|
-
docling_core/types/doc/base.py,sha256=
|
|
24
|
-
docling_core/types/doc/document.py,sha256=
|
|
22
|
+
docling_core/types/doc/__init__.py,sha256=bEL4zKVOG7Wxm6xQrgF58mu-Teds9aSavuEAKVNhrTU,639
|
|
23
|
+
docling_core/types/doc/base.py,sha256=zvx631U_yQCcJam83hNdDanXEYnO3eN-CCw9vDr6S-I,4442
|
|
24
|
+
docling_core/types/doc/document.py,sha256=RMG_iCgKJm71TFVRBrRKzST-Nb9DCgluji9O0BhPt4M,51373
|
|
25
25
|
docling_core/types/doc/labels.py,sha256=mzmSd072A-qW3IThswHxwIHV8IoyTCbHHlNOrisinRA,1335
|
|
26
26
|
docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
|
|
27
27
|
docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
|
|
@@ -44,13 +44,13 @@ docling_core/types/rec/statement.py,sha256=YwcV4CbVaAbzNwh14yJ_6Py3Ww0XnUJrEEUiK
|
|
|
44
44
|
docling_core/types/rec/subject.py,sha256=PRCERGTMs4YhR3_Ne6jogkm41zYg8uUWb1yFpM7atm4,2572
|
|
45
45
|
docling_core/utils/__init__.py,sha256=VauNNpWRHG0_ISKrsy5-gTxicrdQZSau6qMfuMl3iqk,120
|
|
46
46
|
docling_core/utils/alias.py,sha256=B6Lqvss8CbaNARHLR4qSmNh9OkB6LvqTpxfsFmkLAFo,874
|
|
47
|
-
docling_core/utils/file.py,sha256=
|
|
47
|
+
docling_core/utils/file.py,sha256=rZ3kaIpX2ZGxtaSXtqjcrivtXvsbeUolLXT-nntQ5yE,2388
|
|
48
48
|
docling_core/utils/generate_docs.py,sha256=BdKAoduWXOc7YMvcmlhjoJOFlUxij1ybxglj6LZDtC8,2290
|
|
49
49
|
docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2tyi_OhHepHYtZg,1654
|
|
50
50
|
docling_core/utils/validate.py,sha256=3FmnxnKTDZC5J9OGxCL3U3DGRl0t0bBV1NcySXswdas,2031
|
|
51
51
|
docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
|
|
52
|
-
docling_core-2.0.
|
|
53
|
-
docling_core-2.0.
|
|
54
|
-
docling_core-2.0.
|
|
55
|
-
docling_core-2.0.
|
|
56
|
-
docling_core-2.0.
|
|
52
|
+
docling_core-2.2.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
53
|
+
docling_core-2.2.0.dist-info/METADATA,sha256=3iAaT-ENDYeRWqBtefYohekCqiiXM-AFkJw3lFi3s8o,5432
|
|
54
|
+
docling_core-2.2.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
55
|
+
docling_core-2.2.0.dist-info/entry_points.txt,sha256=jIxlWv3tnO04irlZc0zfhqJIgz1bg9Hha4AkaLWSdUA,177
|
|
56
|
+
docling_core-2.2.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|