docling-core 2.19.1__py3-none-any.whl → 2.20.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/types/doc/base.py +28 -1
- docling_core/types/doc/document.py +164 -11
- docling_core/types/doc/labels.py +23 -0
- {docling_core-2.19.1.dist-info → docling_core-2.20.0.dist-info}/METADATA +1 -1
- {docling_core-2.19.1.dist-info → docling_core-2.20.0.dist-info}/RECORD +8 -8
- {docling_core-2.19.1.dist-info → docling_core-2.20.0.dist-info}/LICENSE +0 -0
- {docling_core-2.19.1.dist-info → docling_core-2.20.0.dist-info}/WHEEL +0 -0
- {docling_core-2.19.1.dist-info → docling_core-2.20.0.dist-info}/entry_points.txt +0 -0
docling_core/types/doc/base.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""Models for the base data types."""
|
|
2
2
|
|
|
3
3
|
from enum import Enum
|
|
4
|
-
from typing import Tuple
|
|
4
|
+
from typing import List, Tuple
|
|
5
5
|
|
|
6
6
|
from pydantic import BaseModel
|
|
7
7
|
|
|
@@ -365,3 +365,30 @@ class BoundingBox(BaseModel):
|
|
|
365
365
|
raise ValueError("BoundingBoxes have different CoordOrigin")
|
|
366
366
|
|
|
367
367
|
return False
|
|
368
|
+
|
|
369
|
+
@classmethod
|
|
370
|
+
def enclosing_bbox(cls, boxes: List["BoundingBox"]) -> "BoundingBox":
|
|
371
|
+
"""Create a bounding box that covers all of the given boxes."""
|
|
372
|
+
if not boxes:
|
|
373
|
+
raise ValueError("No bounding boxes provided for union.")
|
|
374
|
+
|
|
375
|
+
origin = boxes[0].coord_origin
|
|
376
|
+
if any(box.coord_origin != origin for box in boxes):
|
|
377
|
+
raise ValueError(
|
|
378
|
+
"All bounding boxes must have the same \
|
|
379
|
+
CoordOrigin to compute their union."
|
|
380
|
+
)
|
|
381
|
+
|
|
382
|
+
left = min(box.l for box in boxes)
|
|
383
|
+
right = max(box.r for box in boxes)
|
|
384
|
+
|
|
385
|
+
if origin == CoordOrigin.TOPLEFT:
|
|
386
|
+
top = min(box.t for box in boxes)
|
|
387
|
+
bottom = max(box.b for box in boxes)
|
|
388
|
+
elif origin == CoordOrigin.BOTTOMLEFT:
|
|
389
|
+
top = max(box.t for box in boxes)
|
|
390
|
+
bottom = min(box.b for box in boxes)
|
|
391
|
+
else:
|
|
392
|
+
raise ValueError("BoundingBoxes have different CoordOrigin")
|
|
393
|
+
|
|
394
|
+
return cls(l=left, t=top, r=right, b=bottom, coord_origin=origin)
|
|
docling_core/types/doc/document.py
CHANGED
|
@@ -43,7 +43,13 @@ from docling_core.search.package import VERSION_PATTERN
|
|
|
43
43
|
from docling_core.types.base import _JSON_POINTER_REGEX
|
|
44
44
|
from docling_core.types.doc import BoundingBox, Size
|
|
45
45
|
from docling_core.types.doc.base import ImageRefMode
|
|
46
|
-
from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel, GroupLabel
|
|
46
|
+
from docling_core.types.doc.labels import (
|
|
47
|
+
CodeLanguageLabel,
|
|
48
|
+
DocItemLabel,
|
|
49
|
+
GraphCellLabel,
|
|
50
|
+
GraphLinkLabel,
|
|
51
|
+
GroupLabel,
|
|
52
|
+
)
|
|
47
53
|
from docling_core.types.doc.tokens import DocumentToken, TableToken
|
|
48
54
|
from docling_core.types.doc.utils import (
|
|
49
55
|
get_html_tag_with_text_direction,
|
|
@@ -1101,7 +1107,9 @@ class TableItem(FloatingItem):
|
|
|
1101
1107
|
return md_table
|
|
1102
1108
|
|
|
1103
1109
|
def export_to_html(
|
|
1104
|
-
self,
|
|
1110
|
+
self,
|
|
1111
|
+
doc: Optional["DoclingDocument"] = None,
|
|
1112
|
+
add_caption: bool = True,
|
|
1105
1113
|
) -> str:
|
|
1106
1114
|
"""Export the table as html."""
|
|
1107
1115
|
if doc is None:
|
|
@@ -1330,11 +1338,73 @@ class TableItem(FloatingItem):
|
|
|
1330
1338
|
return body
|
|
1331
1339
|
|
|
1332
1340
|
|
|
1333
|
-
class
|
|
1341
|
+
class GraphCell(BaseModel):
|
|
1342
|
+
"""GraphCell."""
|
|
1343
|
+
|
|
1344
|
+
label: GraphCellLabel
|
|
1345
|
+
|
|
1346
|
+
cell_id: int
|
|
1347
|
+
|
|
1348
|
+
text: str # sanitized text
|
|
1349
|
+
orig: str # text as seen on document
|
|
1350
|
+
|
|
1351
|
+
prov: Optional[ProvenanceItem] = None
|
|
1352
|
+
|
|
1353
|
+
# in case you have a text, table or picture item
|
|
1354
|
+
item_ref: Optional[RefItem] = None
|
|
1355
|
+
|
|
1356
|
+
|
|
1357
|
+
class GraphLink(BaseModel):
|
|
1358
|
+
"""GraphLink."""
|
|
1359
|
+
|
|
1360
|
+
label: GraphLinkLabel
|
|
1361
|
+
|
|
1362
|
+
source_cell_id: int
|
|
1363
|
+
target_cell_id: int
|
|
1364
|
+
|
|
1365
|
+
|
|
1366
|
+
class GraphData(BaseModel):
|
|
1367
|
+
"""GraphData."""
|
|
1368
|
+
|
|
1369
|
+
cells: List[GraphCell] = Field(default_factory=list)
|
|
1370
|
+
links: List[GraphLink] = Field(default_factory=list)
|
|
1371
|
+
|
|
1372
|
+
@field_validator("links")
|
|
1373
|
+
@classmethod
|
|
1374
|
+
def validate_links(cls, links, info):
|
|
1375
|
+
"""Ensure that each link is valid."""
|
|
1376
|
+
cells = info.data.get("cells", [])
|
|
1377
|
+
|
|
1378
|
+
valid_cell_ids = {cell.cell_id for cell in cells}
|
|
1379
|
+
|
|
1380
|
+
for link in links:
|
|
1381
|
+
if link.source_cell_id not in valid_cell_ids:
|
|
1382
|
+
raise ValueError(
|
|
1383
|
+
f"Invalid source_cell_id {link.source_cell_id} in GraphLink"
|
|
1384
|
+
)
|
|
1385
|
+
if link.target_cell_id not in valid_cell_ids:
|
|
1386
|
+
raise ValueError(
|
|
1387
|
+
f"Invalid target_cell_id {link.target_cell_id} in GraphLink"
|
|
1388
|
+
)
|
|
1389
|
+
|
|
1390
|
+
return links
|
|
1391
|
+
|
|
1392
|
+
|
|
1393
|
+
class KeyValueItem(FloatingItem):
|
|
1334
1394
|
"""KeyValueItem."""
|
|
1335
1395
|
|
|
1336
1396
|
label: typing.Literal[DocItemLabel.KEY_VALUE_REGION] = DocItemLabel.KEY_VALUE_REGION
|
|
1337
1397
|
|
|
1398
|
+
graph: GraphData
|
|
1399
|
+
|
|
1400
|
+
|
|
1401
|
+
class FormItem(FloatingItem):
|
|
1402
|
+
"""FormItem."""
|
|
1403
|
+
|
|
1404
|
+
label: typing.Literal[DocItemLabel.FORM] = DocItemLabel.FORM
|
|
1405
|
+
|
|
1406
|
+
graph: GraphData
|
|
1407
|
+
|
|
1338
1408
|
|
|
1339
1409
|
ContentItem = Annotated[
|
|
1340
1410
|
Union[
|
|
@@ -1446,7 +1516,9 @@ class DoclingDocument(BaseModel):
|
|
|
1446
1516
|
)
|
|
1447
1517
|
|
|
1448
1518
|
furniture: Annotated[GroupItem, Field(deprecated=True)] = GroupItem(
|
|
1449
|
-
name="_root_",
|
|
1519
|
+
name="_root_",
|
|
1520
|
+
self_ref="#/furniture",
|
|
1521
|
+
content_layer=ContentLayer.FURNITURE,
|
|
1450
1522
|
) # List[RefItem] = []
|
|
1451
1523
|
body: GroupItem = GroupItem(name="_root_", self_ref="#/body") # List[RefItem] = []
|
|
1452
1524
|
|
|
@@ -1455,6 +1527,7 @@ class DoclingDocument(BaseModel):
|
|
|
1455
1527
|
pictures: List[PictureItem] = []
|
|
1456
1528
|
tables: List[TableItem] = []
|
|
1457
1529
|
key_value_items: List[KeyValueItem] = []
|
|
1530
|
+
form_items: List[FormItem] = []
|
|
1458
1531
|
|
|
1459
1532
|
pages: Dict[int, PageItem] = {} # empty as default
|
|
1460
1533
|
|
|
@@ -1851,6 +1924,68 @@ class DoclingDocument(BaseModel):
|
|
|
1851
1924
|
|
|
1852
1925
|
return section_header_item
|
|
1853
1926
|
|
|
1927
|
+
def add_key_values(
|
|
1928
|
+
self,
|
|
1929
|
+
graph: GraphData,
|
|
1930
|
+
prov: Optional[ProvenanceItem] = None,
|
|
1931
|
+
parent: Optional[NodeItem] = None,
|
|
1932
|
+
):
|
|
1933
|
+
"""add_key_values.
|
|
1934
|
+
|
|
1935
|
+
:param graph: GraphData:
|
|
1936
|
+
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
1937
|
+
:param parent: Optional[NodeItem]: (Default value = None)
|
|
1938
|
+
"""
|
|
1939
|
+
if not parent:
|
|
1940
|
+
parent = self.body
|
|
1941
|
+
|
|
1942
|
+
key_value_index = len(self.key_value_items)
|
|
1943
|
+
cref = f"#/key_value_items/{key_value_index}"
|
|
1944
|
+
|
|
1945
|
+
kv_item = KeyValueItem(
|
|
1946
|
+
graph=graph,
|
|
1947
|
+
self_ref=cref,
|
|
1948
|
+
parent=parent.get_ref(),
|
|
1949
|
+
)
|
|
1950
|
+
if prov:
|
|
1951
|
+
kv_item.prov.append(prov)
|
|
1952
|
+
|
|
1953
|
+
self.key_value_items.append(kv_item)
|
|
1954
|
+
parent.children.append(RefItem(cref=cref))
|
|
1955
|
+
|
|
1956
|
+
return kv_item
|
|
1957
|
+
|
|
1958
|
+
def add_form(
|
|
1959
|
+
self,
|
|
1960
|
+
graph: GraphData,
|
|
1961
|
+
prov: Optional[ProvenanceItem] = None,
|
|
1962
|
+
parent: Optional[NodeItem] = None,
|
|
1963
|
+
):
|
|
1964
|
+
"""add_form.
|
|
1965
|
+
|
|
1966
|
+
:param graph: GraphData:
|
|
1967
|
+
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
1968
|
+
:param parent: Optional[NodeItem]: (Default value = None)
|
|
1969
|
+
"""
|
|
1970
|
+
if not parent:
|
|
1971
|
+
parent = self.body
|
|
1972
|
+
|
|
1973
|
+
form_index = len(self.form_items)
|
|
1974
|
+
cref = f"#/form_items/{form_index}"
|
|
1975
|
+
|
|
1976
|
+
form_item = FormItem(
|
|
1977
|
+
graph=graph,
|
|
1978
|
+
self_ref=cref,
|
|
1979
|
+
parent=parent.get_ref(),
|
|
1980
|
+
)
|
|
1981
|
+
if prov:
|
|
1982
|
+
form_item.prov.append(prov)
|
|
1983
|
+
|
|
1984
|
+
self.form_items.append(form_item)
|
|
1985
|
+
parent.children.append(RefItem(cref=cref))
|
|
1986
|
+
|
|
1987
|
+
return form_item
|
|
1988
|
+
|
|
1854
1989
|
def num_pages(self):
|
|
1855
1990
|
"""num_pages."""
|
|
1856
1991
|
return len(self.pages.values())
|
|
@@ -2009,7 +2144,8 @@ class DoclingDocument(BaseModel):
|
|
|
2009
2144
|
img.save(loc_path)
|
|
2010
2145
|
if reference_path is not None:
|
|
2011
2146
|
obj_path = relative_path(
|
|
2012
|
-
reference_path.resolve(),
|
|
2147
|
+
reference_path.resolve(),
|
|
2148
|
+
loc_path.resolve(),
|
|
2013
2149
|
)
|
|
2014
2150
|
else:
|
|
2015
2151
|
obj_path = loc_path
|
|
@@ -2027,7 +2163,10 @@ class DoclingDocument(BaseModel):
|
|
|
2027
2163
|
"""Print_element_tree."""
|
|
2028
2164
|
for ix, (item, level) in enumerate(self.iterate_items(with_groups=True)):
|
|
2029
2165
|
if isinstance(item, GroupItem):
|
|
2030
|
-
print(
|
|
2166
|
+
print(
|
|
2167
|
+
" " * level,
|
|
2168
|
+
f"{ix}: {item.label.value} with name={item.name}",
|
|
2169
|
+
)
|
|
2031
2170
|
elif isinstance(item, DocItem):
|
|
2032
2171
|
print(" " * level, f"{ix}: {item.label.value}")
|
|
2033
2172
|
|
|
@@ -2519,7 +2658,11 @@ class DoclingDocument(BaseModel):
|
|
|
2519
2658
|
|
|
2520
2659
|
return (in_ordered_list, html_texts)
|
|
2521
2660
|
|
|
2522
|
-
head_lines = [
|
|
2661
|
+
head_lines = [
|
|
2662
|
+
"<!DOCTYPE html>",
|
|
2663
|
+
f'<html lang="{html_lang}">',
|
|
2664
|
+
html_head,
|
|
2665
|
+
]
|
|
2523
2666
|
html_texts: list[str] = []
|
|
2524
2667
|
|
|
2525
2668
|
prev_level = 0 # Track the previous item's level
|
|
@@ -2599,7 +2742,8 @@ class DoclingDocument(BaseModel):
|
|
|
2599
2742
|
section_level: int = min(item.level + 1, 6)
|
|
2600
2743
|
|
|
2601
2744
|
text = get_html_tag_with_text_direction(
|
|
2602
|
-
html_tag=f"h{section_level}",
|
|
2745
|
+
html_tag=f"h{section_level}",
|
|
2746
|
+
text=_prepare_tag_content(item.text),
|
|
2603
2747
|
)
|
|
2604
2748
|
html_texts.append(text)
|
|
2605
2749
|
|
|
@@ -2856,13 +3000,19 @@ class DoclingDocument(BaseModel):
|
|
|
2856
3000
|
self.iterate_items(
|
|
2857
3001
|
self.body,
|
|
2858
3002
|
with_groups=True,
|
|
2859
|
-
included_content_layers={
|
|
3003
|
+
included_content_layers={
|
|
3004
|
+
ContentLayer.BODY,
|
|
3005
|
+
ContentLayer.FURNITURE,
|
|
3006
|
+
},
|
|
2860
3007
|
)
|
|
2861
3008
|
):
|
|
2862
3009
|
# Close lists if we've moved to a lower nesting level
|
|
2863
3010
|
if current_level < previous_level and ordered_list_stack:
|
|
2864
3011
|
ordered_list_stack = _close_lists(
|
|
2865
|
-
current_level,
|
|
3012
|
+
current_level,
|
|
3013
|
+
previous_level,
|
|
3014
|
+
ordered_list_stack,
|
|
3015
|
+
output_parts,
|
|
2866
3016
|
)
|
|
2867
3017
|
previous_level = current_level
|
|
2868
3018
|
|
|
@@ -2970,7 +3120,10 @@ class DoclingDocument(BaseModel):
|
|
|
2970
3120
|
return "".join(output_parts)
|
|
2971
3121
|
|
|
2972
3122
|
def _export_to_indented_text(
|
|
2973
|
-
self,
|
|
3123
|
+
self,
|
|
3124
|
+
indent=" ",
|
|
3125
|
+
max_text_len: int = -1,
|
|
3126
|
+
explicit_tables: bool = False,
|
|
2974
3127
|
):
|
|
2975
3128
|
"""Export the document to indented text to expose hierarchy."""
|
|
2976
3129
|
result = []
|
docling_core/types/doc/labels.py
CHANGED
|
@@ -140,6 +140,29 @@ class TableCellLabel(str, Enum):
|
|
|
140
140
|
return str(self.value)
|
|
141
141
|
|
|
142
142
|
|
|
143
|
+
class GraphCellLabel(str, Enum):
|
|
144
|
+
"""GraphCellLabel."""
|
|
145
|
+
|
|
146
|
+
UNSPECIFIED = "unspecified"
|
|
147
|
+
|
|
148
|
+
KEY = "key"
|
|
149
|
+
VALUE = "value"
|
|
150
|
+
|
|
151
|
+
CHECKBOX = "checkbox"
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
class GraphLinkLabel(str, Enum):
|
|
155
|
+
"""GraphLinkLabel."""
|
|
156
|
+
|
|
157
|
+
UNSPECIFIED = "unspecified"
|
|
158
|
+
|
|
159
|
+
TO_VALUE = "to_value"
|
|
160
|
+
TO_KEY = "to_key"
|
|
161
|
+
|
|
162
|
+
TO_PARENT = "to_parent"
|
|
163
|
+
TO_CHILD = "to_child"
|
|
164
|
+
|
|
165
|
+
|
|
143
166
|
class CodeLanguageLabel(str, Enum):
|
|
144
167
|
"""CodeLanguageLabel."""
|
|
145
168
|
|
|
{docling_core-2.19.1.dist-info → docling_core-2.20.0.dist-info}/RECORD
CHANGED
|
@@ -23,9 +23,9 @@ docling_core/transforms/chunker/hybrid_chunker.py,sha256=kokjDdxjc_gygOokQwYFVnH
|
|
|
23
23
|
docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
|
|
24
24
|
docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
|
|
25
25
|
docling_core/types/doc/__init__.py,sha256=bysJn2iwjAHwThSWDPXEdVUUij7p_ax12_nx2_0CMdg,653
|
|
26
|
-
docling_core/types/doc/base.py,sha256=
|
|
27
|
-
docling_core/types/doc/document.py,sha256=
|
|
28
|
-
docling_core/types/doc/labels.py,sha256=
|
|
26
|
+
docling_core/types/doc/base.py,sha256=22U1qDlD-2ICmgzbdZrjNayoPHnq4S1ks1GRoqB7y1Q,12542
|
|
27
|
+
docling_core/types/doc/document.py,sha256=1tL321QdbE5ljnZjaat0yEbLcdmnHzy1EBsEAnXMj3o,107897
|
|
28
|
+
docling_core/types/doc/labels.py,sha256=aJ-vcCNzAEFj3NxVKKiGUCit-2ra43st8xlpeWkSOqc,5662
|
|
29
29
|
docling_core/types/doc/tokens.py,sha256=i73PXkmqXCLsQ5SddnJX8L9e_Ub2_K_DYSE-VE8NDq0,3925
|
|
30
30
|
docling_core/types/doc/utils.py,sha256=SaiQD-WMMooFm1bMqwatU-IGhtG048iKJb-ppnJit_k,2250
|
|
31
31
|
docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
|
|
@@ -56,8 +56,8 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
|
|
|
56
56
|
docling_core/utils/legacy.py,sha256=SqNQAxl97aHfoJEsC9vZcMJg5FNkmqKPFi-wdSrnfI0,24442
|
|
57
57
|
docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
|
|
58
58
|
docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
|
|
59
|
-
docling_core-2.19.1.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
60
|
-
docling_core-2.
|
|
61
|
-
docling_core-2.19.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
62
|
-
docling_core-2.19.1.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
|
|
63
|
-
docling_core-2.19.1.dist-info/RECORD,,
|
|
59
|
+
docling_core-2.20.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
60
|
+
docling_core-2.20.0.dist-info/METADATA,sha256=KCJ0MWOUOYFy-JP_sBk2wa_qmqLnvWokiuRP436c0fQ,5803
|
|
61
|
+
docling_core-2.20.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
62
|
+
docling_core-2.20.0.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
|
|
63
|
+
docling_core-2.20.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|