docling-core: 2.19.1 (py3-none-any wheel) → 2.20.0 (py3-none-any wheel)

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -1,7 +1,7 @@
1
1
  """Models for the base data types."""
2
2
 
3
3
  from enum import Enum
4
- from typing import Tuple
4
+ from typing import List, Tuple
5
5
 
6
6
  from pydantic import BaseModel
7
7
 
@@ -365,3 +365,30 @@ class BoundingBox(BaseModel):
365
365
  raise ValueError("BoundingBoxes have different CoordOrigin")
366
366
 
367
367
  return False
368
+
369
+ @classmethod
370
+ def enclosing_bbox(cls, boxes: List["BoundingBox"]) -> "BoundingBox":
371
+ """Create a bounding box that covers all of the given boxes."""
372
+ if not boxes:
373
+ raise ValueError("No bounding boxes provided for union.")
374
+
375
+ origin = boxes[0].coord_origin
376
+ if any(box.coord_origin != origin for box in boxes):
377
+ raise ValueError(
378
+ "All bounding boxes must have the same \
379
+ CoordOrigin to compute their union."
380
+ )
381
+
382
+ left = min(box.l for box in boxes)
383
+ right = max(box.r for box in boxes)
384
+
385
+ if origin == CoordOrigin.TOPLEFT:
386
+ top = min(box.t for box in boxes)
387
+ bottom = max(box.b for box in boxes)
388
+ elif origin == CoordOrigin.BOTTOMLEFT:
389
+ top = max(box.t for box in boxes)
390
+ bottom = min(box.b for box in boxes)
391
+ else:
392
+ raise ValueError("BoundingBoxes have different CoordOrigin")
393
+
394
+ return cls(l=left, t=top, r=right, b=bottom, coord_origin=origin)
@@ -43,7 +43,13 @@ from docling_core.search.package import VERSION_PATTERN
43
43
  from docling_core.types.base import _JSON_POINTER_REGEX
44
44
  from docling_core.types.doc import BoundingBox, Size
45
45
  from docling_core.types.doc.base import ImageRefMode
46
- from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel, GroupLabel
46
+ from docling_core.types.doc.labels import (
47
+ CodeLanguageLabel,
48
+ DocItemLabel,
49
+ GraphCellLabel,
50
+ GraphLinkLabel,
51
+ GroupLabel,
52
+ )
47
53
  from docling_core.types.doc.tokens import DocumentToken, TableToken
48
54
  from docling_core.types.doc.utils import (
49
55
  get_html_tag_with_text_direction,
@@ -1101,7 +1107,9 @@ class TableItem(FloatingItem):
1101
1107
  return md_table
1102
1108
 
1103
1109
  def export_to_html(
1104
- self, doc: Optional["DoclingDocument"] = None, add_caption: bool = True
1110
+ self,
1111
+ doc: Optional["DoclingDocument"] = None,
1112
+ add_caption: bool = True,
1105
1113
  ) -> str:
1106
1114
  """Export the table as html."""
1107
1115
  if doc is None:
@@ -1330,11 +1338,73 @@ class TableItem(FloatingItem):
1330
1338
  return body
1331
1339
 
1332
1340
 
1333
- class KeyValueItem(DocItem):
1341
+ class GraphCell(BaseModel):
1342
+ """GraphCell."""
1343
+
1344
+ label: GraphCellLabel
1345
+
1346
+ cell_id: int
1347
+
1348
+ text: str # sanitized text
1349
+ orig: str # text as seen on document
1350
+
1351
+ prov: Optional[ProvenanceItem] = None
1352
+
1353
+ # in case you have a text, table or picture item
1354
+ item_ref: Optional[RefItem] = None
1355
+
1356
+
1357
+ class GraphLink(BaseModel):
1358
+ """GraphLink."""
1359
+
1360
+ label: GraphLinkLabel
1361
+
1362
+ source_cell_id: int
1363
+ target_cell_id: int
1364
+
1365
+
1366
+ class GraphData(BaseModel):
1367
+ """GraphData."""
1368
+
1369
+ cells: List[GraphCell] = Field(default_factory=list)
1370
+ links: List[GraphLink] = Field(default_factory=list)
1371
+
1372
+ @field_validator("links")
1373
+ @classmethod
1374
+ def validate_links(cls, links, info):
1375
+ """Ensure that each link is valid."""
1376
+ cells = info.data.get("cells", [])
1377
+
1378
+ valid_cell_ids = {cell.cell_id for cell in cells}
1379
+
1380
+ for link in links:
1381
+ if link.source_cell_id not in valid_cell_ids:
1382
+ raise ValueError(
1383
+ f"Invalid source_cell_id {link.source_cell_id} in GraphLink"
1384
+ )
1385
+ if link.target_cell_id not in valid_cell_ids:
1386
+ raise ValueError(
1387
+ f"Invalid target_cell_id {link.target_cell_id} in GraphLink"
1388
+ )
1389
+
1390
+ return links
1391
+
1392
+
1393
+ class KeyValueItem(FloatingItem):
1334
1394
  """KeyValueItem."""
1335
1395
 
1336
1396
  label: typing.Literal[DocItemLabel.KEY_VALUE_REGION] = DocItemLabel.KEY_VALUE_REGION
1337
1397
 
1398
+ graph: GraphData
1399
+
1400
+
1401
+ class FormItem(FloatingItem):
1402
+ """FormItem."""
1403
+
1404
+ label: typing.Literal[DocItemLabel.FORM] = DocItemLabel.FORM
1405
+
1406
+ graph: GraphData
1407
+
1338
1408
 
1339
1409
  ContentItem = Annotated[
1340
1410
  Union[
@@ -1446,7 +1516,9 @@ class DoclingDocument(BaseModel):
1446
1516
  )
1447
1517
 
1448
1518
  furniture: Annotated[GroupItem, Field(deprecated=True)] = GroupItem(
1449
- name="_root_", self_ref="#/furniture", content_layer=ContentLayer.FURNITURE
1519
+ name="_root_",
1520
+ self_ref="#/furniture",
1521
+ content_layer=ContentLayer.FURNITURE,
1450
1522
  ) # List[RefItem] = []
1451
1523
  body: GroupItem = GroupItem(name="_root_", self_ref="#/body") # List[RefItem] = []
1452
1524
 
@@ -1455,6 +1527,7 @@ class DoclingDocument(BaseModel):
1455
1527
  pictures: List[PictureItem] = []
1456
1528
  tables: List[TableItem] = []
1457
1529
  key_value_items: List[KeyValueItem] = []
1530
+ form_items: List[FormItem] = []
1458
1531
 
1459
1532
  pages: Dict[int, PageItem] = {} # empty as default
1460
1533
 
@@ -1851,6 +1924,68 @@ class DoclingDocument(BaseModel):
1851
1924
 
1852
1925
  return section_header_item
1853
1926
 
1927
+ def add_key_values(
1928
+ self,
1929
+ graph: GraphData,
1930
+ prov: Optional[ProvenanceItem] = None,
1931
+ parent: Optional[NodeItem] = None,
1932
+ ):
1933
+ """add_key_values.
1934
+
1935
+ :param graph: GraphData:
1936
+ :param prov: Optional[ProvenanceItem]: (Default value = None)
1937
+ :param parent: Optional[NodeItem]: (Default value = None)
1938
+ """
1939
+ if not parent:
1940
+ parent = self.body
1941
+
1942
+ key_value_index = len(self.key_value_items)
1943
+ cref = f"#/key_value_items/{key_value_index}"
1944
+
1945
+ kv_item = KeyValueItem(
1946
+ graph=graph,
1947
+ self_ref=cref,
1948
+ parent=parent.get_ref(),
1949
+ )
1950
+ if prov:
1951
+ kv_item.prov.append(prov)
1952
+
1953
+ self.key_value_items.append(kv_item)
1954
+ parent.children.append(RefItem(cref=cref))
1955
+
1956
+ return kv_item
1957
+
1958
+ def add_form(
1959
+ self,
1960
+ graph: GraphData,
1961
+ prov: Optional[ProvenanceItem] = None,
1962
+ parent: Optional[NodeItem] = None,
1963
+ ):
1964
+ """add_form.
1965
+
1966
+ :param graph: GraphData:
1967
+ :param prov: Optional[ProvenanceItem]: (Default value = None)
1968
+ :param parent: Optional[NodeItem]: (Default value = None)
1969
+ """
1970
+ if not parent:
1971
+ parent = self.body
1972
+
1973
+ form_index = len(self.form_items)
1974
+ cref = f"#/form_items/{form_index}"
1975
+
1976
+ form_item = FormItem(
1977
+ graph=graph,
1978
+ self_ref=cref,
1979
+ parent=parent.get_ref(),
1980
+ )
1981
+ if prov:
1982
+ form_item.prov.append(prov)
1983
+
1984
+ self.form_items.append(form_item)
1985
+ parent.children.append(RefItem(cref=cref))
1986
+
1987
+ return form_item
1988
+
1854
1989
  def num_pages(self):
1855
1990
  """num_pages."""
1856
1991
  return len(self.pages.values())
@@ -2009,7 +2144,8 @@ class DoclingDocument(BaseModel):
2009
2144
  img.save(loc_path)
2010
2145
  if reference_path is not None:
2011
2146
  obj_path = relative_path(
2012
- reference_path.resolve(), loc_path.resolve()
2147
+ reference_path.resolve(),
2148
+ loc_path.resolve(),
2013
2149
  )
2014
2150
  else:
2015
2151
  obj_path = loc_path
@@ -2027,7 +2163,10 @@ class DoclingDocument(BaseModel):
2027
2163
  """Print_element_tree."""
2028
2164
  for ix, (item, level) in enumerate(self.iterate_items(with_groups=True)):
2029
2165
  if isinstance(item, GroupItem):
2030
- print(" " * level, f"{ix}: {item.label.value} with name={item.name}")
2166
+ print(
2167
+ " " * level,
2168
+ f"{ix}: {item.label.value} with name={item.name}",
2169
+ )
2031
2170
  elif isinstance(item, DocItem):
2032
2171
  print(" " * level, f"{ix}: {item.label.value}")
2033
2172
 
@@ -2519,7 +2658,11 @@ class DoclingDocument(BaseModel):
2519
2658
 
2520
2659
  return (in_ordered_list, html_texts)
2521
2660
 
2522
- head_lines = ["<!DOCTYPE html>", f'<html lang="{html_lang}">', html_head]
2661
+ head_lines = [
2662
+ "<!DOCTYPE html>",
2663
+ f'<html lang="{html_lang}">',
2664
+ html_head,
2665
+ ]
2523
2666
  html_texts: list[str] = []
2524
2667
 
2525
2668
  prev_level = 0 # Track the previous item's level
@@ -2599,7 +2742,8 @@ class DoclingDocument(BaseModel):
2599
2742
  section_level: int = min(item.level + 1, 6)
2600
2743
 
2601
2744
  text = get_html_tag_with_text_direction(
2602
- html_tag=f"h{section_level}", text=_prepare_tag_content(item.text)
2745
+ html_tag=f"h{section_level}",
2746
+ text=_prepare_tag_content(item.text),
2603
2747
  )
2604
2748
  html_texts.append(text)
2605
2749
 
@@ -2856,13 +3000,19 @@ class DoclingDocument(BaseModel):
2856
3000
  self.iterate_items(
2857
3001
  self.body,
2858
3002
  with_groups=True,
2859
- included_content_layers={ContentLayer.BODY, ContentLayer.FURNITURE},
3003
+ included_content_layers={
3004
+ ContentLayer.BODY,
3005
+ ContentLayer.FURNITURE,
3006
+ },
2860
3007
  )
2861
3008
  ):
2862
3009
  # Close lists if we've moved to a lower nesting level
2863
3010
  if current_level < previous_level and ordered_list_stack:
2864
3011
  ordered_list_stack = _close_lists(
2865
- current_level, previous_level, ordered_list_stack, output_parts
3012
+ current_level,
3013
+ previous_level,
3014
+ ordered_list_stack,
3015
+ output_parts,
2866
3016
  )
2867
3017
  previous_level = current_level
2868
3018
 
@@ -2970,7 +3120,10 @@ class DoclingDocument(BaseModel):
2970
3120
  return "".join(output_parts)
2971
3121
 
2972
3122
  def _export_to_indented_text(
2973
- self, indent=" ", max_text_len: int = -1, explicit_tables: bool = False
3123
+ self,
3124
+ indent=" ",
3125
+ max_text_len: int = -1,
3126
+ explicit_tables: bool = False,
2974
3127
  ):
2975
3128
  """Export the document to indented text to expose hierarchy."""
2976
3129
  result = []
@@ -140,6 +140,29 @@ class TableCellLabel(str, Enum):
140
140
  return str(self.value)
141
141
 
142
142
 
143
+ class GraphCellLabel(str, Enum):
144
+ """GraphCellLabel."""
145
+
146
+ UNSPECIFIED = "unspecified"
147
+
148
+ KEY = "key"
149
+ VALUE = "value"
150
+
151
+ CHECKBOX = "checkbox"
152
+
153
+
154
+ class GraphLinkLabel(str, Enum):
155
+ """GraphLinkLabel."""
156
+
157
+ UNSPECIFIED = "unspecified"
158
+
159
+ TO_VALUE = "to_value"
160
+ TO_KEY = "to_key"
161
+
162
+ TO_PARENT = "to_parent"
163
+ TO_CHILD = "to_child"
164
+
165
+
143
166
  class CodeLanguageLabel(str, Enum):
144
167
  """CodeLanguageLabel."""
145
168
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.19.1
3
+ Version: 2.20.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -23,9 +23,9 @@ docling_core/transforms/chunker/hybrid_chunker.py,sha256=kokjDdxjc_gygOokQwYFVnH
23
23
  docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
24
24
  docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
25
25
  docling_core/types/doc/__init__.py,sha256=bysJn2iwjAHwThSWDPXEdVUUij7p_ax12_nx2_0CMdg,653
26
- docling_core/types/doc/base.py,sha256=lMRNq1DUK7K26L2VNZRqFaItCSZ6m9BdYTVaJA98PZQ,11495
27
- docling_core/types/doc/document.py,sha256=t1nk1GeR5_YvZhuWUVZkkBekp89vFB4RBtMuwD3Acw4,104373
28
- docling_core/types/doc/labels.py,sha256=cqH4DGN9lgZns6gOtL5urzZzUPGOjHJ75xQbIKSh_h8,5306
26
+ docling_core/types/doc/base.py,sha256=22U1qDlD-2ICmgzbdZrjNayoPHnq4S1ks1GRoqB7y1Q,12542
27
+ docling_core/types/doc/document.py,sha256=1tL321QdbE5ljnZjaat0yEbLcdmnHzy1EBsEAnXMj3o,107897
28
+ docling_core/types/doc/labels.py,sha256=aJ-vcCNzAEFj3NxVKKiGUCit-2ra43st8xlpeWkSOqc,5662
29
29
  docling_core/types/doc/tokens.py,sha256=i73PXkmqXCLsQ5SddnJX8L9e_Ub2_K_DYSE-VE8NDq0,3925
30
30
  docling_core/types/doc/utils.py,sha256=SaiQD-WMMooFm1bMqwatU-IGhtG048iKJb-ppnJit_k,2250
31
31
  docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
@@ -56,8 +56,8 @@ docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2ty
56
56
  docling_core/utils/legacy.py,sha256=SqNQAxl97aHfoJEsC9vZcMJg5FNkmqKPFi-wdSrnfI0,24442
57
57
  docling_core/utils/validate.py,sha256=aQ11UbFyl8iD_N7yTTZmm_VVeXz8KcCyn3GLXgkfYRM,2049
58
58
  docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
59
- docling_core-2.19.1.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
60
- docling_core-2.19.1.dist-info/METADATA,sha256=Uz-AUOD2_itxSEVxatsPbCQ0pFBE3fMX-gXx0YLmsKw,5803
61
- docling_core-2.19.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
62
- docling_core-2.19.1.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
63
- docling_core-2.19.1.dist-info/RECORD,,
59
+ docling_core-2.20.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
60
+ docling_core-2.20.0.dist-info/METADATA,sha256=KCJ0MWOUOYFy-JP_sBk2wa_qmqLnvWokiuRP436c0fQ,5803
61
+ docling_core-2.20.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
62
+ docling_core-2.20.0.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
63
+ docling_core-2.20.0.dist-info/RECORD,,