docling-core 2.19.1__py3-none-any.whl → 2.21.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/types/doc/base.py +28 -1
- docling_core/types/doc/document.py +330 -115
- docling_core/types/doc/labels.py +24 -0
- {docling_core-2.19.1.dist-info → docling_core-2.21.0.dist-info}/METADATA +1 -1
- {docling_core-2.19.1.dist-info → docling_core-2.21.0.dist-info}/RECORD +8 -8
- {docling_core-2.19.1.dist-info → docling_core-2.21.0.dist-info}/LICENSE +0 -0
- {docling_core-2.19.1.dist-info → docling_core-2.21.0.dist-info}/WHEEL +0 -0
- {docling_core-2.19.1.dist-info → docling_core-2.21.0.dist-info}/entry_points.txt +0 -0
docling_core/types/doc/base.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""Models for the base data types."""
|
|
2
2
|
|
|
3
3
|
from enum import Enum
|
|
4
|
-
from typing import Tuple
|
|
4
|
+
from typing import List, Tuple
|
|
5
5
|
|
|
6
6
|
from pydantic import BaseModel
|
|
7
7
|
|
|
@@ -365,3 +365,30 @@ class BoundingBox(BaseModel):
|
|
|
365
365
|
raise ValueError("BoundingBoxes have different CoordOrigin")
|
|
366
366
|
|
|
367
367
|
return False
|
|
368
|
+
|
|
369
|
+
@classmethod
|
|
370
|
+
def enclosing_bbox(cls, boxes: List["BoundingBox"]) -> "BoundingBox":
|
|
371
|
+
"""Create a bounding box that covers all of the given boxes."""
|
|
372
|
+
if not boxes:
|
|
373
|
+
raise ValueError("No bounding boxes provided for union.")
|
|
374
|
+
|
|
375
|
+
origin = boxes[0].coord_origin
|
|
376
|
+
if any(box.coord_origin != origin for box in boxes):
|
|
377
|
+
raise ValueError(
|
|
378
|
+
"All bounding boxes must have the same \
|
|
379
|
+
CoordOrigin to compute their union."
|
|
380
|
+
)
|
|
381
|
+
|
|
382
|
+
left = min(box.l for box in boxes)
|
|
383
|
+
right = max(box.r for box in boxes)
|
|
384
|
+
|
|
385
|
+
if origin == CoordOrigin.TOPLEFT:
|
|
386
|
+
top = min(box.t for box in boxes)
|
|
387
|
+
bottom = max(box.b for box in boxes)
|
|
388
|
+
elif origin == CoordOrigin.BOTTOMLEFT:
|
|
389
|
+
top = max(box.t for box in boxes)
|
|
390
|
+
bottom = min(box.b for box in boxes)
|
|
391
|
+
else:
|
|
392
|
+
raise ValueError("BoundingBoxes have different CoordOrigin")
|
|
393
|
+
|
|
394
|
+
return cls(l=left, t=top, r=right, b=bottom, coord_origin=origin)
|
|
@@ -43,7 +43,13 @@ from docling_core.search.package import VERSION_PATTERN
|
|
|
43
43
|
from docling_core.types.base import _JSON_POINTER_REGEX
|
|
44
44
|
from docling_core.types.doc import BoundingBox, Size
|
|
45
45
|
from docling_core.types.doc.base import ImageRefMode
|
|
46
|
-
from docling_core.types.doc.labels import
|
|
46
|
+
from docling_core.types.doc.labels import (
|
|
47
|
+
CodeLanguageLabel,
|
|
48
|
+
DocItemLabel,
|
|
49
|
+
GraphCellLabel,
|
|
50
|
+
GraphLinkLabel,
|
|
51
|
+
GroupLabel,
|
|
52
|
+
)
|
|
47
53
|
from docling_core.types.doc.tokens import DocumentToken, TableToken
|
|
48
54
|
from docling_core.types.doc.utils import (
|
|
49
55
|
get_html_tag_with_text_direction,
|
|
@@ -55,7 +61,7 @@ _logger = logging.getLogger(__name__)
|
|
|
55
61
|
|
|
56
62
|
Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
|
|
57
63
|
LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
|
|
58
|
-
CURRENT_VERSION: Final = "1.
|
|
64
|
+
CURRENT_VERSION: Final = "1.2.0"
|
|
59
65
|
|
|
60
66
|
DEFAULT_EXPORT_LABELS = {
|
|
61
67
|
DocItemLabel.TITLE,
|
|
@@ -855,11 +861,11 @@ class PictureItem(FloatingItem):
|
|
|
855
861
|
image_placeholder: str = "<!-- image -->",
|
|
856
862
|
) -> str:
|
|
857
863
|
"""Export picture to Markdown format."""
|
|
858
|
-
default_response =
|
|
864
|
+
default_response = image_placeholder
|
|
859
865
|
error_response = (
|
|
860
|
-
"
|
|
866
|
+
"<!-- 🖼️❌ Image not available. "
|
|
861
867
|
"Please use `PdfPipelineOptions(generate_picture_images=True)`"
|
|
862
|
-
" -->
|
|
868
|
+
" -->"
|
|
863
869
|
)
|
|
864
870
|
|
|
865
871
|
if image_mode == ImageRefMode.PLACEHOLDER:
|
|
@@ -873,7 +879,7 @@ class PictureItem(FloatingItem):
|
|
|
873
879
|
and isinstance(self.image.uri, AnyUrl)
|
|
874
880
|
and self.image.uri.scheme == "data"
|
|
875
881
|
):
|
|
876
|
-
text = f"
|
|
882
|
+
text = f""
|
|
877
883
|
return text
|
|
878
884
|
|
|
879
885
|
# get the self.image._pil or crop it out of the page-image
|
|
@@ -881,7 +887,7 @@ class PictureItem(FloatingItem):
|
|
|
881
887
|
|
|
882
888
|
if img is not None:
|
|
883
889
|
imgb64 = self._image_to_base64(img)
|
|
884
|
-
text = f"
|
|
890
|
+
text = f""
|
|
885
891
|
|
|
886
892
|
return text
|
|
887
893
|
else:
|
|
@@ -893,7 +899,7 @@ class PictureItem(FloatingItem):
|
|
|
893
899
|
):
|
|
894
900
|
return default_response
|
|
895
901
|
|
|
896
|
-
text = f"
|
|
902
|
+
text = f")})"
|
|
897
903
|
return text
|
|
898
904
|
|
|
899
905
|
else:
|
|
@@ -1101,7 +1107,9 @@ class TableItem(FloatingItem):
|
|
|
1101
1107
|
return md_table
|
|
1102
1108
|
|
|
1103
1109
|
def export_to_html(
|
|
1104
|
-
self,
|
|
1110
|
+
self,
|
|
1111
|
+
doc: Optional["DoclingDocument"] = None,
|
|
1112
|
+
add_caption: bool = True,
|
|
1105
1113
|
) -> str:
|
|
1106
1114
|
"""Export the table as html."""
|
|
1107
1115
|
if doc is None:
|
|
@@ -1330,11 +1338,81 @@ class TableItem(FloatingItem):
|
|
|
1330
1338
|
return body
|
|
1331
1339
|
|
|
1332
1340
|
|
|
1333
|
-
class
|
|
1341
|
+
class GraphCell(BaseModel):
|
|
1342
|
+
"""GraphCell."""
|
|
1343
|
+
|
|
1344
|
+
label: GraphCellLabel
|
|
1345
|
+
|
|
1346
|
+
cell_id: int
|
|
1347
|
+
|
|
1348
|
+
text: str # sanitized text
|
|
1349
|
+
orig: str # text as seen on document
|
|
1350
|
+
|
|
1351
|
+
prov: Optional[ProvenanceItem] = None
|
|
1352
|
+
|
|
1353
|
+
# in case you have a text, table or picture item
|
|
1354
|
+
item_ref: Optional[RefItem] = None
|
|
1355
|
+
|
|
1356
|
+
|
|
1357
|
+
class GraphLink(BaseModel):
|
|
1358
|
+
"""GraphLink."""
|
|
1359
|
+
|
|
1360
|
+
label: GraphLinkLabel
|
|
1361
|
+
|
|
1362
|
+
source_cell_id: int
|
|
1363
|
+
target_cell_id: int
|
|
1364
|
+
|
|
1365
|
+
|
|
1366
|
+
class GraphData(BaseModel):
|
|
1367
|
+
"""GraphData."""
|
|
1368
|
+
|
|
1369
|
+
cells: List[GraphCell] = Field(default_factory=list)
|
|
1370
|
+
links: List[GraphLink] = Field(default_factory=list)
|
|
1371
|
+
|
|
1372
|
+
@field_validator("links")
|
|
1373
|
+
@classmethod
|
|
1374
|
+
def validate_links(cls, links, info):
|
|
1375
|
+
"""Ensure that each link is valid."""
|
|
1376
|
+
cells = info.data.get("cells", [])
|
|
1377
|
+
|
|
1378
|
+
valid_cell_ids = {cell.cell_id for cell in cells}
|
|
1379
|
+
|
|
1380
|
+
for link in links:
|
|
1381
|
+
if link.source_cell_id not in valid_cell_ids:
|
|
1382
|
+
raise ValueError(
|
|
1383
|
+
f"Invalid source_cell_id {link.source_cell_id} in GraphLink"
|
|
1384
|
+
)
|
|
1385
|
+
if link.target_cell_id not in valid_cell_ids:
|
|
1386
|
+
raise ValueError(
|
|
1387
|
+
f"Invalid target_cell_id {link.target_cell_id} in GraphLink"
|
|
1388
|
+
)
|
|
1389
|
+
|
|
1390
|
+
return links
|
|
1391
|
+
|
|
1392
|
+
|
|
1393
|
+
class KeyValueItem(FloatingItem):
|
|
1334
1394
|
"""KeyValueItem."""
|
|
1335
1395
|
|
|
1336
1396
|
label: typing.Literal[DocItemLabel.KEY_VALUE_REGION] = DocItemLabel.KEY_VALUE_REGION
|
|
1337
1397
|
|
|
1398
|
+
graph: GraphData
|
|
1399
|
+
|
|
1400
|
+
def _export_to_markdown(self) -> str:
|
|
1401
|
+
# TODO add actual implementation
|
|
1402
|
+
return "<!-- missing-key-value-item -->"
|
|
1403
|
+
|
|
1404
|
+
|
|
1405
|
+
class FormItem(FloatingItem):
|
|
1406
|
+
"""FormItem."""
|
|
1407
|
+
|
|
1408
|
+
label: typing.Literal[DocItemLabel.FORM] = DocItemLabel.FORM
|
|
1409
|
+
|
|
1410
|
+
graph: GraphData
|
|
1411
|
+
|
|
1412
|
+
def _export_to_markdown(self) -> str:
|
|
1413
|
+
# TODO add actual implementation
|
|
1414
|
+
return "<!-- missing-form-item -->"
|
|
1415
|
+
|
|
1338
1416
|
|
|
1339
1417
|
ContentItem = Annotated[
|
|
1340
1418
|
Union[
|
|
@@ -1446,7 +1524,9 @@ class DoclingDocument(BaseModel):
|
|
|
1446
1524
|
)
|
|
1447
1525
|
|
|
1448
1526
|
furniture: Annotated[GroupItem, Field(deprecated=True)] = GroupItem(
|
|
1449
|
-
name="_root_",
|
|
1527
|
+
name="_root_",
|
|
1528
|
+
self_ref="#/furniture",
|
|
1529
|
+
content_layer=ContentLayer.FURNITURE,
|
|
1450
1530
|
) # List[RefItem] = []
|
|
1451
1531
|
body: GroupItem = GroupItem(name="_root_", self_ref="#/body") # List[RefItem] = []
|
|
1452
1532
|
|
|
@@ -1455,6 +1535,7 @@ class DoclingDocument(BaseModel):
|
|
|
1455
1535
|
pictures: List[PictureItem] = []
|
|
1456
1536
|
tables: List[TableItem] = []
|
|
1457
1537
|
key_value_items: List[KeyValueItem] = []
|
|
1538
|
+
form_items: List[FormItem] = []
|
|
1458
1539
|
|
|
1459
1540
|
pages: Dict[int, PageItem] = {} # empty as default
|
|
1460
1541
|
|
|
@@ -1851,6 +1932,68 @@ class DoclingDocument(BaseModel):
|
|
|
1851
1932
|
|
|
1852
1933
|
return section_header_item
|
|
1853
1934
|
|
|
1935
|
+
def add_key_values(
|
|
1936
|
+
self,
|
|
1937
|
+
graph: GraphData,
|
|
1938
|
+
prov: Optional[ProvenanceItem] = None,
|
|
1939
|
+
parent: Optional[NodeItem] = None,
|
|
1940
|
+
):
|
|
1941
|
+
"""add_key_values.
|
|
1942
|
+
|
|
1943
|
+
:param graph: GraphData:
|
|
1944
|
+
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
1945
|
+
:param parent: Optional[NodeItem]: (Default value = None)
|
|
1946
|
+
"""
|
|
1947
|
+
if not parent:
|
|
1948
|
+
parent = self.body
|
|
1949
|
+
|
|
1950
|
+
key_value_index = len(self.key_value_items)
|
|
1951
|
+
cref = f"#/key_value_items/{key_value_index}"
|
|
1952
|
+
|
|
1953
|
+
kv_item = KeyValueItem(
|
|
1954
|
+
graph=graph,
|
|
1955
|
+
self_ref=cref,
|
|
1956
|
+
parent=parent.get_ref(),
|
|
1957
|
+
)
|
|
1958
|
+
if prov:
|
|
1959
|
+
kv_item.prov.append(prov)
|
|
1960
|
+
|
|
1961
|
+
self.key_value_items.append(kv_item)
|
|
1962
|
+
parent.children.append(RefItem(cref=cref))
|
|
1963
|
+
|
|
1964
|
+
return kv_item
|
|
1965
|
+
|
|
1966
|
+
def add_form(
|
|
1967
|
+
self,
|
|
1968
|
+
graph: GraphData,
|
|
1969
|
+
prov: Optional[ProvenanceItem] = None,
|
|
1970
|
+
parent: Optional[NodeItem] = None,
|
|
1971
|
+
):
|
|
1972
|
+
"""add_form.
|
|
1973
|
+
|
|
1974
|
+
:param graph: GraphData:
|
|
1975
|
+
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
1976
|
+
:param parent: Optional[NodeItem]: (Default value = None)
|
|
1977
|
+
"""
|
|
1978
|
+
if not parent:
|
|
1979
|
+
parent = self.body
|
|
1980
|
+
|
|
1981
|
+
form_index = len(self.form_items)
|
|
1982
|
+
cref = f"#/form_items/{form_index}"
|
|
1983
|
+
|
|
1984
|
+
form_item = FormItem(
|
|
1985
|
+
graph=graph,
|
|
1986
|
+
self_ref=cref,
|
|
1987
|
+
parent=parent.get_ref(),
|
|
1988
|
+
)
|
|
1989
|
+
if prov:
|
|
1990
|
+
form_item.prov.append(prov)
|
|
1991
|
+
|
|
1992
|
+
self.form_items.append(form_item)
|
|
1993
|
+
parent.children.append(RefItem(cref=cref))
|
|
1994
|
+
|
|
1995
|
+
return form_item
|
|
1996
|
+
|
|
1854
1997
|
def num_pages(self):
|
|
1855
1998
|
"""num_pages."""
|
|
1856
1999
|
return len(self.pages.values())
|
|
@@ -2009,7 +2152,8 @@ class DoclingDocument(BaseModel):
|
|
|
2009
2152
|
img.save(loc_path)
|
|
2010
2153
|
if reference_path is not None:
|
|
2011
2154
|
obj_path = relative_path(
|
|
2012
|
-
reference_path.resolve(),
|
|
2155
|
+
reference_path.resolve(),
|
|
2156
|
+
loc_path.resolve(),
|
|
2013
2157
|
)
|
|
2014
2158
|
else:
|
|
2015
2159
|
obj_path = loc_path
|
|
@@ -2027,7 +2171,10 @@ class DoclingDocument(BaseModel):
|
|
|
2027
2171
|
"""Print_element_tree."""
|
|
2028
2172
|
for ix, (item, level) in enumerate(self.iterate_items(with_groups=True)):
|
|
2029
2173
|
if isinstance(item, GroupItem):
|
|
2030
|
-
print(
|
|
2174
|
+
print(
|
|
2175
|
+
" " * level,
|
|
2176
|
+
f"{ix}: {item.label.value} with name={item.name}",
|
|
2177
|
+
)
|
|
2031
2178
|
elif isinstance(item, DocItem):
|
|
2032
2179
|
print(" " * level, f"{ix}: {item.label.value}")
|
|
2033
2180
|
|
|
@@ -2100,6 +2247,20 @@ class DoclingDocument(BaseModel):
|
|
|
2100
2247
|
with open(filename, "w", encoding="utf-8") as fw:
|
|
2101
2248
|
yaml.dump(out, fw, default_flow_style=default_flow_style)
|
|
2102
2249
|
|
|
2250
|
+
@classmethod
|
|
2251
|
+
def load_from_yaml(cls, filename: Path) -> "DoclingDocument":
|
|
2252
|
+
"""load_from_yaml.
|
|
2253
|
+
|
|
2254
|
+
Args:
|
|
2255
|
+
filename: The filename to load a YAML-serialized DoclingDocument from.
|
|
2256
|
+
|
|
2257
|
+
Returns:
|
|
2258
|
+
DoclingDocument: the loaded DoclingDocument
|
|
2259
|
+
"""
|
|
2260
|
+
with open(filename, encoding="utf-8") as f:
|
|
2261
|
+
data = yaml.load(f, Loader=yaml.FullLoader)
|
|
2262
|
+
return DoclingDocument.model_validate(data)
|
|
2263
|
+
|
|
2103
2264
|
def export_to_dict(
|
|
2104
2265
|
self,
|
|
2105
2266
|
mode: str = "json",
|
|
@@ -2115,7 +2276,7 @@ class DoclingDocument(BaseModel):
|
|
|
2115
2276
|
self,
|
|
2116
2277
|
filename: Path,
|
|
2117
2278
|
artifacts_dir: Optional[Path] = None,
|
|
2118
|
-
delim: str = "\n",
|
|
2279
|
+
delim: str = "\n\n", # TODO: deprecate
|
|
2119
2280
|
from_element: int = 0,
|
|
2120
2281
|
to_element: int = sys.maxsize,
|
|
2121
2282
|
labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
|
|
@@ -2158,7 +2319,7 @@ class DoclingDocument(BaseModel):
|
|
|
2158
2319
|
|
|
2159
2320
|
def export_to_markdown( # noqa: C901
|
|
2160
2321
|
self,
|
|
2161
|
-
delim: str = "\n",
|
|
2322
|
+
delim: str = "\n\n", # TODO deprecate
|
|
2162
2323
|
from_element: int = 0,
|
|
2163
2324
|
to_element: int = sys.maxsize,
|
|
2164
2325
|
labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
|
|
@@ -2205,10 +2366,44 @@ class DoclingDocument(BaseModel):
|
|
|
2205
2366
|
:returns: The exported Markdown representation.
|
|
2206
2367
|
:rtype: str
|
|
2207
2368
|
"""
|
|
2208
|
-
|
|
2209
|
-
|
|
2210
|
-
|
|
2211
|
-
|
|
2369
|
+
comps = self._get_markdown_components(
|
|
2370
|
+
node=self.body,
|
|
2371
|
+
from_element=from_element,
|
|
2372
|
+
to_element=to_element,
|
|
2373
|
+
labels=labels,
|
|
2374
|
+
strict_text=strict_text,
|
|
2375
|
+
escaping_underscores=escaping_underscores,
|
|
2376
|
+
image_placeholder=image_placeholder,
|
|
2377
|
+
image_mode=image_mode,
|
|
2378
|
+
indent=indent,
|
|
2379
|
+
text_width=text_width,
|
|
2380
|
+
page_no=page_no,
|
|
2381
|
+
included_content_layers=included_content_layers,
|
|
2382
|
+
list_level=0,
|
|
2383
|
+
is_inline_scope=False,
|
|
2384
|
+
visited=set(),
|
|
2385
|
+
)
|
|
2386
|
+
return delim.join(comps)
|
|
2387
|
+
|
|
2388
|
+
def _get_markdown_components( # noqa: C901
|
|
2389
|
+
self,
|
|
2390
|
+
node: NodeItem,
|
|
2391
|
+
from_element: int,
|
|
2392
|
+
to_element: int,
|
|
2393
|
+
labels: set[DocItemLabel],
|
|
2394
|
+
strict_text: bool,
|
|
2395
|
+
escaping_underscores: bool,
|
|
2396
|
+
image_placeholder: str,
|
|
2397
|
+
image_mode: ImageRefMode,
|
|
2398
|
+
indent: int,
|
|
2399
|
+
text_width: int,
|
|
2400
|
+
page_no: Optional[int],
|
|
2401
|
+
included_content_layers: set[ContentLayer],
|
|
2402
|
+
list_level: int,
|
|
2403
|
+
is_inline_scope: bool,
|
|
2404
|
+
visited: set[str], # refs of visited items
|
|
2405
|
+
) -> list[str]:
|
|
2406
|
+
components: list[str] = [] # components to concatenate
|
|
2212
2407
|
|
|
2213
2408
|
# Our export markdown doesn't contain any emphasis styling:
|
|
2214
2409
|
# Bold, Italic, or Bold-Italic
|
|
@@ -2243,137 +2438,138 @@ class DoclingDocument(BaseModel):
|
|
|
2243
2438
|
|
|
2244
2439
|
return "".join(parts)
|
|
2245
2440
|
|
|
2246
|
-
def
|
|
2441
|
+
def _ingest_text(text: str, do_escape_html=True, do_escape_underscores=True):
|
|
2247
2442
|
if do_escape_underscores and escaping_underscores:
|
|
2248
2443
|
text = _escape_underscores(text)
|
|
2249
2444
|
if do_escape_html:
|
|
2250
2445
|
text = html.escape(text, quote=False)
|
|
2251
|
-
|
|
2446
|
+
if text:
|
|
2447
|
+
components.append(text)
|
|
2252
2448
|
|
|
2253
2449
|
for ix, (item, level) in enumerate(
|
|
2254
2450
|
self.iterate_items(
|
|
2255
|
-
|
|
2451
|
+
node,
|
|
2256
2452
|
with_groups=True,
|
|
2257
2453
|
page_no=page_no,
|
|
2258
2454
|
included_content_layers=included_content_layers,
|
|
2259
2455
|
)
|
|
2260
2456
|
):
|
|
2261
|
-
|
|
2262
|
-
|
|
2263
|
-
|
|
2264
|
-
|
|
2265
|
-
# Decrement list_nesting_level for each list group we've exited
|
|
2266
|
-
list_nesting_level = max(0, list_nesting_level - level_difference)
|
|
2267
|
-
|
|
2268
|
-
previous_level = level # Update previous_level for next iteration
|
|
2457
|
+
if item.self_ref in visited:
|
|
2458
|
+
continue
|
|
2459
|
+
else:
|
|
2460
|
+
visited.add(item.self_ref)
|
|
2269
2461
|
|
|
2270
2462
|
if ix < from_element or to_element <= ix:
|
|
2271
2463
|
continue # skip as many items as you want
|
|
2272
2464
|
|
|
2273
|
-
|
|
2465
|
+
elif (isinstance(item, DocItem)) and (item.label not in labels):
|
|
2274
2466
|
continue # skip any label that is not whitelisted
|
|
2275
2467
|
|
|
2276
|
-
# Handle newlines between different types of content
|
|
2277
|
-
if (
|
|
2278
|
-
len(mdtexts) > 0
|
|
2279
|
-
and not isinstance(item, (ListItem, GroupItem))
|
|
2280
|
-
and in_list
|
|
2281
|
-
):
|
|
2282
|
-
mdtexts[-1] += "\n"
|
|
2283
|
-
in_list = False
|
|
2284
|
-
|
|
2285
|
-
if isinstance(item, GroupItem) and item.label in [
|
|
2286
|
-
GroupLabel.LIST,
|
|
2287
|
-
GroupLabel.ORDERED_LIST,
|
|
2288
|
-
]:
|
|
2289
|
-
|
|
2290
|
-
if list_nesting_level == 0: # Check if we're on the top level.
|
|
2291
|
-
# In that case a new list starts directly after another list.
|
|
2292
|
-
mdtexts.append("\n") # Add a blank line
|
|
2293
|
-
|
|
2294
|
-
# Increment list nesting level when entering a new list
|
|
2295
|
-
list_nesting_level += 1
|
|
2296
|
-
in_list = True
|
|
2297
|
-
continue
|
|
2298
|
-
|
|
2299
2468
|
elif isinstance(item, GroupItem):
|
|
2300
|
-
|
|
2469
|
+
if item.label in [
|
|
2470
|
+
GroupLabel.LIST,
|
|
2471
|
+
GroupLabel.ORDERED_LIST,
|
|
2472
|
+
]:
|
|
2473
|
+
comps = self._get_markdown_components(
|
|
2474
|
+
node=item,
|
|
2475
|
+
from_element=from_element,
|
|
2476
|
+
to_element=to_element,
|
|
2477
|
+
labels=labels,
|
|
2478
|
+
strict_text=strict_text,
|
|
2479
|
+
escaping_underscores=escaping_underscores,
|
|
2480
|
+
image_placeholder=image_placeholder,
|
|
2481
|
+
image_mode=image_mode,
|
|
2482
|
+
indent=indent,
|
|
2483
|
+
text_width=text_width,
|
|
2484
|
+
page_no=page_no,
|
|
2485
|
+
included_content_layers=included_content_layers,
|
|
2486
|
+
list_level=list_level + 1,
|
|
2487
|
+
is_inline_scope=is_inline_scope,
|
|
2488
|
+
visited=visited,
|
|
2489
|
+
)
|
|
2490
|
+
# NOTE: assumes unordered (flag & marker currently in ListItem)
|
|
2491
|
+
indent_str = list_level * indent * " "
|
|
2492
|
+
text = "\n".join(
|
|
2493
|
+
[
|
|
2494
|
+
# avoid additional marker on already evaled sublists
|
|
2495
|
+
cpt if cpt and cpt[0] == " " else f"{indent_str}- {cpt}"
|
|
2496
|
+
for cpt in comps
|
|
2497
|
+
]
|
|
2498
|
+
)
|
|
2499
|
+
_ingest_text(text=text)
|
|
2500
|
+
elif item.label == GroupLabel.INLINE:
|
|
2501
|
+
comps = self._get_markdown_components(
|
|
2502
|
+
node=item,
|
|
2503
|
+
from_element=from_element,
|
|
2504
|
+
to_element=to_element,
|
|
2505
|
+
labels=labels,
|
|
2506
|
+
strict_text=strict_text,
|
|
2507
|
+
escaping_underscores=escaping_underscores,
|
|
2508
|
+
image_placeholder=image_placeholder,
|
|
2509
|
+
image_mode=image_mode,
|
|
2510
|
+
indent=indent,
|
|
2511
|
+
text_width=text_width,
|
|
2512
|
+
page_no=page_no,
|
|
2513
|
+
included_content_layers=included_content_layers,
|
|
2514
|
+
list_level=list_level,
|
|
2515
|
+
is_inline_scope=True,
|
|
2516
|
+
visited=visited,
|
|
2517
|
+
)
|
|
2518
|
+
_ingest_text(" ".join(comps))
|
|
2519
|
+
else:
|
|
2520
|
+
continue
|
|
2301
2521
|
|
|
2302
2522
|
elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
|
|
2303
|
-
in_list = False
|
|
2304
2523
|
marker = "" if strict_text else "#"
|
|
2305
2524
|
text = f"{marker} {item.text}"
|
|
2306
|
-
|
|
2525
|
+
_ingest_text(text.strip())
|
|
2307
2526
|
|
|
2308
2527
|
elif (
|
|
2309
2528
|
isinstance(item, TextItem)
|
|
2310
2529
|
and item.label in [DocItemLabel.SECTION_HEADER]
|
|
2311
2530
|
) or isinstance(item, SectionHeaderItem):
|
|
2312
|
-
in_list = False
|
|
2313
2531
|
marker = ""
|
|
2314
2532
|
if not strict_text:
|
|
2315
2533
|
marker = "#" * level
|
|
2316
2534
|
if len(marker) < 2:
|
|
2317
2535
|
marker = "##"
|
|
2318
|
-
text = f"{marker} {item.text}
|
|
2319
|
-
|
|
2320
|
-
|
|
2321
|
-
elif isinstance(item, CodeItem) and item.label in labels:
|
|
2322
|
-
in_list = False
|
|
2323
|
-
text = f"```\n{item.text}\n```\n"
|
|
2324
|
-
_append_text(text, do_escape_underscores=False, do_escape_html=False)
|
|
2325
|
-
|
|
2326
|
-
elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
|
|
2327
|
-
in_list = True
|
|
2328
|
-
# Calculate indent based on list_nesting_level
|
|
2329
|
-
# -1 because level 1 needs no indent
|
|
2330
|
-
list_indent = " " * (indent * (list_nesting_level - 1))
|
|
2331
|
-
|
|
2332
|
-
marker = ""
|
|
2333
|
-
if strict_text:
|
|
2334
|
-
marker = ""
|
|
2335
|
-
elif item.enumerated:
|
|
2336
|
-
marker = item.marker
|
|
2337
|
-
else:
|
|
2338
|
-
marker = "-" # Markdown needs only dash as item marker.
|
|
2536
|
+
text = f"{marker} {item.text}"
|
|
2537
|
+
_ingest_text(text.strip())
|
|
2339
2538
|
|
|
2340
|
-
|
|
2341
|
-
|
|
2539
|
+
elif isinstance(item, CodeItem):
|
|
2540
|
+
text = f"`{item.text}`" if is_inline_scope else f"```\n{item.text}\n```"
|
|
2541
|
+
_ingest_text(text, do_escape_underscores=False, do_escape_html=False)
|
|
2342
2542
|
|
|
2343
2543
|
elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
|
|
2344
|
-
in_list = False
|
|
2345
2544
|
if item.text != "":
|
|
2346
|
-
|
|
2347
|
-
f"$${item.text}
|
|
2545
|
+
_ingest_text(
|
|
2546
|
+
f"${item.text}$" if is_inline_scope else f"$${item.text}$$",
|
|
2348
2547
|
do_escape_underscores=False,
|
|
2349
2548
|
do_escape_html=False,
|
|
2350
2549
|
)
|
|
2351
2550
|
elif item.orig != "":
|
|
2352
|
-
|
|
2353
|
-
"<!-- formula-not-decoded
|
|
2551
|
+
_ingest_text(
|
|
2552
|
+
"<!-- formula-not-decoded -->",
|
|
2354
2553
|
do_escape_underscores=False,
|
|
2355
2554
|
do_escape_html=False,
|
|
2356
2555
|
)
|
|
2357
2556
|
|
|
2358
|
-
elif isinstance(item, TextItem)
|
|
2359
|
-
in_list = False
|
|
2557
|
+
elif isinstance(item, TextItem):
|
|
2360
2558
|
if len(item.text) and text_width > 0:
|
|
2361
2559
|
text = item.text
|
|
2362
2560
|
wrapped_text = textwrap.fill(text, width=text_width)
|
|
2363
|
-
|
|
2561
|
+
_ingest_text(wrapped_text)
|
|
2364
2562
|
elif len(item.text):
|
|
2365
|
-
|
|
2366
|
-
_append_text(text)
|
|
2563
|
+
_ingest_text(item.text)
|
|
2367
2564
|
|
|
2368
2565
|
elif isinstance(item, TableItem) and not strict_text:
|
|
2369
|
-
|
|
2370
|
-
|
|
2566
|
+
if caption_text := item.caption_text(self):
|
|
2567
|
+
_ingest_text(caption_text)
|
|
2371
2568
|
md_table = item.export_to_markdown()
|
|
2372
|
-
|
|
2569
|
+
_ingest_text(md_table)
|
|
2373
2570
|
|
|
2374
2571
|
elif isinstance(item, PictureItem) and not strict_text:
|
|
2375
|
-
|
|
2376
|
-
_append_text(item.caption_text(self))
|
|
2572
|
+
_ingest_text(item.caption_text(self))
|
|
2377
2573
|
|
|
2378
2574
|
line = item.export_to_markdown(
|
|
2379
2575
|
doc=self,
|
|
@@ -2381,19 +2577,17 @@ class DoclingDocument(BaseModel):
|
|
|
2381
2577
|
image_mode=image_mode,
|
|
2382
2578
|
)
|
|
2383
2579
|
|
|
2384
|
-
|
|
2580
|
+
_ingest_text(line, do_escape_html=False, do_escape_underscores=False)
|
|
2385
2581
|
|
|
2386
|
-
elif isinstance(item,
|
|
2387
|
-
|
|
2388
|
-
text =
|
|
2389
|
-
_append_text(text, do_escape_html=False, do_escape_underscores=False)
|
|
2582
|
+
elif isinstance(item, (KeyValueItem, FormItem)):
|
|
2583
|
+
text = item._export_to_markdown()
|
|
2584
|
+
_ingest_text(text, do_escape_html=False, do_escape_underscores=False)
|
|
2390
2585
|
|
|
2391
|
-
|
|
2392
|
-
|
|
2393
|
-
|
|
2394
|
-
) # remove cases of double or more empty lines.
|
|
2586
|
+
elif isinstance(item, DocItem):
|
|
2587
|
+
text = "<!-- missing-text -->"
|
|
2588
|
+
_ingest_text(text, do_escape_html=False, do_escape_underscores=False)
|
|
2395
2589
|
|
|
2396
|
-
return
|
|
2590
|
+
return components
|
|
2397
2591
|
|
|
2398
2592
|
def export_to_text( # noqa: C901
|
|
2399
2593
|
self,
|
|
@@ -2519,7 +2713,11 @@ class DoclingDocument(BaseModel):
|
|
|
2519
2713
|
|
|
2520
2714
|
return (in_ordered_list, html_texts)
|
|
2521
2715
|
|
|
2522
|
-
head_lines = [
|
|
2716
|
+
head_lines = [
|
|
2717
|
+
"<!DOCTYPE html>",
|
|
2718
|
+
f'<html lang="{html_lang}">',
|
|
2719
|
+
html_head,
|
|
2720
|
+
]
|
|
2523
2721
|
html_texts: list[str] = []
|
|
2524
2722
|
|
|
2525
2723
|
prev_level = 0 # Track the previous item's level
|
|
@@ -2599,7 +2797,8 @@ class DoclingDocument(BaseModel):
|
|
|
2599
2797
|
section_level: int = min(item.level + 1, 6)
|
|
2600
2798
|
|
|
2601
2799
|
text = get_html_tag_with_text_direction(
|
|
2602
|
-
html_tag=f"h{section_level}",
|
|
2800
|
+
html_tag=f"h{section_level}",
|
|
2801
|
+
text=_prepare_tag_content(item.text),
|
|
2603
2802
|
)
|
|
2604
2803
|
html_texts.append(text)
|
|
2605
2804
|
|
|
@@ -2620,14 +2819,17 @@ class DoclingDocument(BaseModel):
|
|
|
2620
2819
|
"</figure>"
|
|
2621
2820
|
)
|
|
2622
2821
|
|
|
2822
|
+
img_fallback = _image_fallback(item)
|
|
2823
|
+
|
|
2623
2824
|
# If the formula is not processed correcty, use its image
|
|
2624
2825
|
if (
|
|
2625
2826
|
item.text == ""
|
|
2626
2827
|
and item.orig != ""
|
|
2627
2828
|
and image_mode == ImageRefMode.EMBEDDED
|
|
2628
2829
|
and len(item.prov) > 0
|
|
2830
|
+
and img_fallback is not None
|
|
2629
2831
|
):
|
|
2630
|
-
text =
|
|
2832
|
+
text = img_fallback
|
|
2631
2833
|
|
|
2632
2834
|
# Building a math equation in MathML format
|
|
2633
2835
|
# ref https://www.w3.org/TR/wai-aria-1.1/#math
|
|
@@ -2647,9 +2849,13 @@ class DoclingDocument(BaseModel):
|
|
|
2647
2849
|
"Malformed formula cannot be rendered. "
|
|
2648
2850
|
f"Error {err.__class__.__name__}, formula={math_formula}"
|
|
2649
2851
|
)
|
|
2650
|
-
if
|
|
2651
|
-
|
|
2652
|
-
|
|
2852
|
+
if (
|
|
2853
|
+
image_mode == ImageRefMode.EMBEDDED
|
|
2854
|
+
and len(item.prov) > 0
|
|
2855
|
+
and img_fallback is not None
|
|
2856
|
+
):
|
|
2857
|
+
text = img_fallback
|
|
2858
|
+
elif len(math_formula) > 0:
|
|
2653
2859
|
text = f"<pre>{math_formula}</pre>"
|
|
2654
2860
|
|
|
2655
2861
|
elif math_formula != "":
|
|
@@ -2856,13 +3062,19 @@ class DoclingDocument(BaseModel):
|
|
|
2856
3062
|
self.iterate_items(
|
|
2857
3063
|
self.body,
|
|
2858
3064
|
with_groups=True,
|
|
2859
|
-
included_content_layers={
|
|
3065
|
+
included_content_layers={
|
|
3066
|
+
ContentLayer.BODY,
|
|
3067
|
+
ContentLayer.FURNITURE,
|
|
3068
|
+
},
|
|
2860
3069
|
)
|
|
2861
3070
|
):
|
|
2862
3071
|
# Close lists if we've moved to a lower nesting level
|
|
2863
3072
|
if current_level < previous_level and ordered_list_stack:
|
|
2864
3073
|
ordered_list_stack = _close_lists(
|
|
2865
|
-
current_level,
|
|
3074
|
+
current_level,
|
|
3075
|
+
previous_level,
|
|
3076
|
+
ordered_list_stack,
|
|
3077
|
+
output_parts,
|
|
2866
3078
|
)
|
|
2867
3079
|
previous_level = current_level
|
|
2868
3080
|
|
|
@@ -2970,7 +3182,10 @@ class DoclingDocument(BaseModel):
|
|
|
2970
3182
|
return "".join(output_parts)
|
|
2971
3183
|
|
|
2972
3184
|
def _export_to_indented_text(
|
|
2973
|
-
self,
|
|
3185
|
+
self,
|
|
3186
|
+
indent=" ",
|
|
3187
|
+
max_text_len: int = -1,
|
|
3188
|
+
explicit_tables: bool = False,
|
|
2974
3189
|
):
|
|
2975
3190
|
"""Export the document to indented text to expose hierarchy."""
|
|
2976
3191
|
result = []
|
docling_core/types/doc/labels.py
CHANGED
|
@@ -75,6 +75,7 @@ class GroupLabel(str, Enum):
|
|
|
75
75
|
FORM_AREA = "form_area"
|
|
76
76
|
KEY_VALUE_AREA = "key_value_area"
|
|
77
77
|
COMMENT_SECTION = "comment_section"
|
|
78
|
+
INLINE = "inline"
|
|
78
79
|
|
|
79
80
|
def __str__(self):
|
|
80
81
|
"""Get string value."""
|
|
@@ -140,6 +141,29 @@ class TableCellLabel(str, Enum):
|
|
|
140
141
|
return str(self.value)
|
|
141
142
|
|
|
142
143
|
|
|
144
|
+
class GraphCellLabel(str, Enum):
|
|
145
|
+
"""GraphCellLabel."""
|
|
146
|
+
|
|
147
|
+
UNSPECIFIED = "unspecified"
|
|
148
|
+
|
|
149
|
+
KEY = "key"
|
|
150
|
+
VALUE = "value"
|
|
151
|
+
|
|
152
|
+
CHECKBOX = "checkbox"
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
class GraphLinkLabel(str, Enum):
|
|
156
|
+
"""GraphLinkLabel."""
|
|
157
|
+
|
|
158
|
+
UNSPECIFIED = "unspecified"
|
|
159
|
+
|
|
160
|
+
TO_VALUE = "to_value"
|
|
161
|
+
TO_KEY = "to_key"
|
|
162
|
+
|
|
163
|
+
TO_PARENT = "to_parent"
|
|
164
|
+
TO_CHILD = "to_child"
|
|
165
|
+
|
|
166
|
+
|
|
143
167
|
class CodeLanguageLabel(str, Enum):
|
|
144
168
|
"""CodeLanguageLabel."""
|
|
145
169
|
|
|
@@ -23,9 +23,9 @@ docling_core/transforms/chunker/hybrid_chunker.py,sha256=kokjDdxjc_gygOokQwYFVnH
|
|
|
23
23
|
docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
|
|
24
24
|
docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
|
|
25
25
|
docling_core/types/doc/__init__.py,sha256=bysJn2iwjAHwThSWDPXEdVUUij7p_ax12_nx2_0CMdg,653
|
|
26
|
-
docling_core/types/doc/base.py,sha256=
|
|
27
|
-
docling_core/types/doc/document.py,sha256=
|
|
28
|
-
docling_core/types/doc/labels.py,sha256=
|
|
26
|
+
docling_core/types/doc/base.py,sha256=22U1qDlD-2ICmgzbdZrjNayoPHnq4S1ks1GRoqB7y1Q,12542
|
|
27
|
+
docling_core/types/doc/document.py,sha256=oaa8QetSQTiPOQIZEMpvzoxwPVBOn9DLUwE62tB707w,110223
|
|
28
|
+
docling_core/types/doc/labels.py,sha256=0J9Gsqz-jQ4FP2yxs9wOxoTr3qg97BniFX7MJVziUmk,5684
|
|
29
29
|
docling_core/types/doc/tokens.py,sha256=i73PXkmqXCLsQ5SddnJX8L9e_Ub2_K_DYSE-VE8NDq0,3925
|
|
30
30
|
docling_core/types/doc/utils.py,sha256=SaiQD-WMMooFm1bMqwatU-IGhtG048iKJb-ppnJit_k,2250
|
|
31
31
|
docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
|
|
@@ -56,8 +56,8 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
|
|
|
56
56
|
docling_core/utils/legacy.py,sha256=SqNQAxl97aHfoJEsC9vZcMJg5FNkmqKPFi-wdSrnfI0,24442
|
|
57
57
|
docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
|
|
58
58
|
docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
|
|
59
|
-
docling_core-2.
|
|
60
|
-
docling_core-2.
|
|
61
|
-
docling_core-2.
|
|
62
|
-
docling_core-2.
|
|
63
|
-
docling_core-2.
|
|
59
|
+
docling_core-2.21.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
60
|
+
docling_core-2.21.0.dist-info/METADATA,sha256=gyy6KPX1dAJel3ysIg3Zt73Opw8WtKW2nSXxonkkRKc,5803
|
|
61
|
+
docling_core-2.21.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
62
|
+
docling_core-2.21.0.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
|
|
63
|
+
docling_core-2.21.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|