docling-core 2.19.1__tar.gz → 2.21.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

Files changed (62)
  1. {docling_core-2.19.1 → docling_core-2.21.0}/PKG-INFO +1 -1
  2. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/types/doc/base.py +28 -1
  3. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/types/doc/document.py +330 -115
  4. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/types/doc/labels.py +24 -0
  5. {docling_core-2.19.1 → docling_core-2.21.0}/pyproject.toml +1 -1
  6. {docling_core-2.19.1 → docling_core-2.21.0}/LICENSE +0 -0
  7. {docling_core-2.19.1 → docling_core-2.21.0}/README.md +0 -0
  8. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/__init__.py +0 -0
  9. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/cli/__init__.py +0 -0
  10. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/cli/view.py +0 -0
  11. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/py.typed +0 -0
  12. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/resources/schemas/doc/ANN.json +0 -0
  13. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/resources/schemas/doc/DOC.json +0 -0
  14. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/resources/schemas/doc/OCR-output.json +0 -0
  15. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/resources/schemas/doc/RAW.json +0 -0
  16. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/resources/schemas/generated/ccs_document_schema.json +0 -0
  17. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/resources/schemas/generated/minimal_document_schema_flat.json +0 -0
  18. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/resources/schemas/search/search_doc_mapping.json +0 -0
  19. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/resources/schemas/search/search_doc_mapping_v2.json +0 -0
  20. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/search/__init__.py +0 -0
  21. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/search/json_schema_to_search_mapper.py +0 -0
  22. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/search/mapping.py +0 -0
  23. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/search/meta.py +0 -0
  24. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/search/package.py +0 -0
  25. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/transforms/__init__.py +0 -0
  26. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/transforms/chunker/__init__.py +0 -0
  27. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/transforms/chunker/base.py +0 -0
  28. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/transforms/chunker/hierarchical_chunker.py +0 -0
  29. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/transforms/chunker/hybrid_chunker.py +0 -0
  30. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/types/__init__.py +0 -0
  31. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/types/base.py +0 -0
  32. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/types/doc/__init__.py +0 -0
  33. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/types/doc/tokens.py +0 -0
  34. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/types/doc/utils.py +0 -0
  35. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/types/gen/__init__.py +0 -0
  36. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/types/gen/generic.py +0 -0
  37. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/types/io/__init__.py +0 -0
  38. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/types/legacy_doc/__init__.py +0 -0
  39. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/types/legacy_doc/base.py +0 -0
  40. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/types/legacy_doc/doc_ann.py +0 -0
  41. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/types/legacy_doc/doc_ocr.py +0 -0
  42. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/types/legacy_doc/doc_raw.py +0 -0
  43. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/types/legacy_doc/document.py +0 -0
  44. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/types/legacy_doc/tokens.py +0 -0
  45. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/types/nlp/__init__.py +0 -0
  46. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/types/nlp/qa.py +0 -0
  47. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/types/nlp/qa_labels.py +0 -0
  48. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/types/rec/__init__.py +0 -0
  49. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/types/rec/attribute.py +0 -0
  50. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/types/rec/base.py +0 -0
  51. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/types/rec/predicate.py +0 -0
  52. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/types/rec/record.py +0 -0
  53. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/types/rec/statement.py +0 -0
  54. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/types/rec/subject.py +0 -0
  55. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/utils/__init__.py +0 -0
  56. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/utils/alias.py +0 -0
  57. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/utils/file.py +0 -0
  58. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/utils/generate_docs.py +0 -0
  59. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/utils/generate_jsonschema.py +0 -0
  60. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/utils/legacy.py +0 -0
  61. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/utils/validate.py +0 -0
  62. {docling_core-2.19.1 → docling_core-2.21.0}/docling_core/utils/validators.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.19.1
3
+ Version: 2.21.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -1,7 +1,7 @@
1
1
  """Models for the base data types."""
2
2
 
3
3
  from enum import Enum
4
- from typing import Tuple
4
+ from typing import List, Tuple
5
5
 
6
6
  from pydantic import BaseModel
7
7
 
@@ -365,3 +365,30 @@ class BoundingBox(BaseModel):
365
365
  raise ValueError("BoundingBoxes have different CoordOrigin")
366
366
 
367
367
  return False
368
+
369
+ @classmethod
370
+ def enclosing_bbox(cls, boxes: List["BoundingBox"]) -> "BoundingBox":
371
+ """Create a bounding box that covers all of the given boxes."""
372
+ if not boxes:
373
+ raise ValueError("No bounding boxes provided for union.")
374
+
375
+ origin = boxes[0].coord_origin
376
+ if any(box.coord_origin != origin for box in boxes):
377
+ raise ValueError(
378
+ "All bounding boxes must have the same \
379
+ CoordOrigin to compute their union."
380
+ )
381
+
382
+ left = min(box.l for box in boxes)
383
+ right = max(box.r for box in boxes)
384
+
385
+ if origin == CoordOrigin.TOPLEFT:
386
+ top = min(box.t for box in boxes)
387
+ bottom = max(box.b for box in boxes)
388
+ elif origin == CoordOrigin.BOTTOMLEFT:
389
+ top = max(box.t for box in boxes)
390
+ bottom = min(box.b for box in boxes)
391
+ else:
392
+ raise ValueError("BoundingBoxes have different CoordOrigin")
393
+
394
+ return cls(l=left, t=top, r=right, b=bottom, coord_origin=origin)
@@ -43,7 +43,13 @@ from docling_core.search.package import VERSION_PATTERN
43
43
  from docling_core.types.base import _JSON_POINTER_REGEX
44
44
  from docling_core.types.doc import BoundingBox, Size
45
45
  from docling_core.types.doc.base import ImageRefMode
46
- from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel, GroupLabel
46
+ from docling_core.types.doc.labels import (
47
+ CodeLanguageLabel,
48
+ DocItemLabel,
49
+ GraphCellLabel,
50
+ GraphLinkLabel,
51
+ GroupLabel,
52
+ )
47
53
  from docling_core.types.doc.tokens import DocumentToken, TableToken
48
54
  from docling_core.types.doc.utils import (
49
55
  get_html_tag_with_text_direction,
@@ -55,7 +61,7 @@ _logger = logging.getLogger(__name__)
55
61
 
56
62
  Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
57
63
  LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
58
- CURRENT_VERSION: Final = "1.1.0"
64
+ CURRENT_VERSION: Final = "1.2.0"
59
65
 
60
66
  DEFAULT_EXPORT_LABELS = {
61
67
  DocItemLabel.TITLE,
@@ -855,11 +861,11 @@ class PictureItem(FloatingItem):
855
861
  image_placeholder: str = "<!-- image -->",
856
862
  ) -> str:
857
863
  """Export picture to Markdown format."""
858
- default_response = "\n" + image_placeholder + "\n"
864
+ default_response = image_placeholder
859
865
  error_response = (
860
- "\n<!-- 🖼️❌ Image not available. "
866
+ "<!-- 🖼️❌ Image not available. "
861
867
  "Please use `PdfPipelineOptions(generate_picture_images=True)`"
862
- " --> \n"
868
+ " -->"
863
869
  )
864
870
 
865
871
  if image_mode == ImageRefMode.PLACEHOLDER:
@@ -873,7 +879,7 @@ class PictureItem(FloatingItem):
873
879
  and isinstance(self.image.uri, AnyUrl)
874
880
  and self.image.uri.scheme == "data"
875
881
  ):
876
- text = f"\n![Image]({self.image.uri})\n"
882
+ text = f"![Image]({self.image.uri})"
877
883
  return text
878
884
 
879
885
  # get the self.image._pil or crop it out of the page-image
@@ -881,7 +887,7 @@ class PictureItem(FloatingItem):
881
887
 
882
888
  if img is not None:
883
889
  imgb64 = self._image_to_base64(img)
884
- text = f"\n![Image](data:image/png;base64,{imgb64})\n"
890
+ text = f"![Image](data:image/png;base64,{imgb64})"
885
891
 
886
892
  return text
887
893
  else:
@@ -893,7 +899,7 @@ class PictureItem(FloatingItem):
893
899
  ):
894
900
  return default_response
895
901
 
896
- text = f"\n![Image]({quote(str(self.image.uri))})\n"
902
+ text = f"![Image]({quote(str(self.image.uri))})"
897
903
  return text
898
904
 
899
905
  else:
@@ -1101,7 +1107,9 @@ class TableItem(FloatingItem):
1101
1107
  return md_table
1102
1108
 
1103
1109
  def export_to_html(
1104
- self, doc: Optional["DoclingDocument"] = None, add_caption: bool = True
1110
+ self,
1111
+ doc: Optional["DoclingDocument"] = None,
1112
+ add_caption: bool = True,
1105
1113
  ) -> str:
1106
1114
  """Export the table as html."""
1107
1115
  if doc is None:
@@ -1330,11 +1338,81 @@ class TableItem(FloatingItem):
1330
1338
  return body
1331
1339
 
1332
1340
 
1333
- class KeyValueItem(DocItem):
1341
+ class GraphCell(BaseModel):
1342
+ """GraphCell."""
1343
+
1344
+ label: GraphCellLabel
1345
+
1346
+ cell_id: int
1347
+
1348
+ text: str # sanitized text
1349
+ orig: str # text as seen on document
1350
+
1351
+ prov: Optional[ProvenanceItem] = None
1352
+
1353
+ # in case you have a text, table or picture item
1354
+ item_ref: Optional[RefItem] = None
1355
+
1356
+
1357
+ class GraphLink(BaseModel):
1358
+ """GraphLink."""
1359
+
1360
+ label: GraphLinkLabel
1361
+
1362
+ source_cell_id: int
1363
+ target_cell_id: int
1364
+
1365
+
1366
+ class GraphData(BaseModel):
1367
+ """GraphData."""
1368
+
1369
+ cells: List[GraphCell] = Field(default_factory=list)
1370
+ links: List[GraphLink] = Field(default_factory=list)
1371
+
1372
+ @field_validator("links")
1373
+ @classmethod
1374
+ def validate_links(cls, links, info):
1375
+ """Ensure that each link is valid."""
1376
+ cells = info.data.get("cells", [])
1377
+
1378
+ valid_cell_ids = {cell.cell_id for cell in cells}
1379
+
1380
+ for link in links:
1381
+ if link.source_cell_id not in valid_cell_ids:
1382
+ raise ValueError(
1383
+ f"Invalid source_cell_id {link.source_cell_id} in GraphLink"
1384
+ )
1385
+ if link.target_cell_id not in valid_cell_ids:
1386
+ raise ValueError(
1387
+ f"Invalid target_cell_id {link.target_cell_id} in GraphLink"
1388
+ )
1389
+
1390
+ return links
1391
+
1392
+
1393
+ class KeyValueItem(FloatingItem):
1334
1394
  """KeyValueItem."""
1335
1395
 
1336
1396
  label: typing.Literal[DocItemLabel.KEY_VALUE_REGION] = DocItemLabel.KEY_VALUE_REGION
1337
1397
 
1398
+ graph: GraphData
1399
+
1400
+ def _export_to_markdown(self) -> str:
1401
+ # TODO add actual implementation
1402
+ return "<!-- missing-key-value-item -->"
1403
+
1404
+
1405
+ class FormItem(FloatingItem):
1406
+ """FormItem."""
1407
+
1408
+ label: typing.Literal[DocItemLabel.FORM] = DocItemLabel.FORM
1409
+
1410
+ graph: GraphData
1411
+
1412
+ def _export_to_markdown(self) -> str:
1413
+ # TODO add actual implementation
1414
+ return "<!-- missing-form-item -->"
1415
+
1338
1416
 
1339
1417
  ContentItem = Annotated[
1340
1418
  Union[
@@ -1446,7 +1524,9 @@ class DoclingDocument(BaseModel):
1446
1524
  )
1447
1525
 
1448
1526
  furniture: Annotated[GroupItem, Field(deprecated=True)] = GroupItem(
1449
- name="_root_", self_ref="#/furniture", content_layer=ContentLayer.FURNITURE
1527
+ name="_root_",
1528
+ self_ref="#/furniture",
1529
+ content_layer=ContentLayer.FURNITURE,
1450
1530
  ) # List[RefItem] = []
1451
1531
  body: GroupItem = GroupItem(name="_root_", self_ref="#/body") # List[RefItem] = []
1452
1532
 
@@ -1455,6 +1535,7 @@ class DoclingDocument(BaseModel):
1455
1535
  pictures: List[PictureItem] = []
1456
1536
  tables: List[TableItem] = []
1457
1537
  key_value_items: List[KeyValueItem] = []
1538
+ form_items: List[FormItem] = []
1458
1539
 
1459
1540
  pages: Dict[int, PageItem] = {} # empty as default
1460
1541
 
@@ -1851,6 +1932,68 @@ class DoclingDocument(BaseModel):
1851
1932
 
1852
1933
  return section_header_item
1853
1934
 
1935
+ def add_key_values(
1936
+ self,
1937
+ graph: GraphData,
1938
+ prov: Optional[ProvenanceItem] = None,
1939
+ parent: Optional[NodeItem] = None,
1940
+ ):
1941
+ """add_key_values.
1942
+
1943
+ :param graph: GraphData:
1944
+ :param prov: Optional[ProvenanceItem]: (Default value = None)
1945
+ :param parent: Optional[NodeItem]: (Default value = None)
1946
+ """
1947
+ if not parent:
1948
+ parent = self.body
1949
+
1950
+ key_value_index = len(self.key_value_items)
1951
+ cref = f"#/key_value_items/{key_value_index}"
1952
+
1953
+ kv_item = KeyValueItem(
1954
+ graph=graph,
1955
+ self_ref=cref,
1956
+ parent=parent.get_ref(),
1957
+ )
1958
+ if prov:
1959
+ kv_item.prov.append(prov)
1960
+
1961
+ self.key_value_items.append(kv_item)
1962
+ parent.children.append(RefItem(cref=cref))
1963
+
1964
+ return kv_item
1965
+
1966
+ def add_form(
1967
+ self,
1968
+ graph: GraphData,
1969
+ prov: Optional[ProvenanceItem] = None,
1970
+ parent: Optional[NodeItem] = None,
1971
+ ):
1972
+ """add_form.
1973
+
1974
+ :param graph: GraphData:
1975
+ :param prov: Optional[ProvenanceItem]: (Default value = None)
1976
+ :param parent: Optional[NodeItem]: (Default value = None)
1977
+ """
1978
+ if not parent:
1979
+ parent = self.body
1980
+
1981
+ form_index = len(self.form_items)
1982
+ cref = f"#/form_items/{form_index}"
1983
+
1984
+ form_item = FormItem(
1985
+ graph=graph,
1986
+ self_ref=cref,
1987
+ parent=parent.get_ref(),
1988
+ )
1989
+ if prov:
1990
+ form_item.prov.append(prov)
1991
+
1992
+ self.form_items.append(form_item)
1993
+ parent.children.append(RefItem(cref=cref))
1994
+
1995
+ return form_item
1996
+
1854
1997
  def num_pages(self):
1855
1998
  """num_pages."""
1856
1999
  return len(self.pages.values())
@@ -2009,7 +2152,8 @@ class DoclingDocument(BaseModel):
2009
2152
  img.save(loc_path)
2010
2153
  if reference_path is not None:
2011
2154
  obj_path = relative_path(
2012
- reference_path.resolve(), loc_path.resolve()
2155
+ reference_path.resolve(),
2156
+ loc_path.resolve(),
2013
2157
  )
2014
2158
  else:
2015
2159
  obj_path = loc_path
@@ -2027,7 +2171,10 @@ class DoclingDocument(BaseModel):
2027
2171
  """Print_element_tree."""
2028
2172
  for ix, (item, level) in enumerate(self.iterate_items(with_groups=True)):
2029
2173
  if isinstance(item, GroupItem):
2030
- print(" " * level, f"{ix}: {item.label.value} with name={item.name}")
2174
+ print(
2175
+ " " * level,
2176
+ f"{ix}: {item.label.value} with name={item.name}",
2177
+ )
2031
2178
  elif isinstance(item, DocItem):
2032
2179
  print(" " * level, f"{ix}: {item.label.value}")
2033
2180
 
@@ -2100,6 +2247,20 @@ class DoclingDocument(BaseModel):
2100
2247
  with open(filename, "w", encoding="utf-8") as fw:
2101
2248
  yaml.dump(out, fw, default_flow_style=default_flow_style)
2102
2249
 
2250
+ @classmethod
2251
+ def load_from_yaml(cls, filename: Path) -> "DoclingDocument":
2252
+ """load_from_yaml.
2253
+
2254
+ Args:
2255
+ filename: The filename to load a YAML-serialized DoclingDocument from.
2256
+
2257
+ Returns:
2258
+ DoclingDocument: the loaded DoclingDocument
2259
+ """
2260
+ with open(filename, encoding="utf-8") as f:
2261
+ data = yaml.load(f, Loader=yaml.FullLoader)
2262
+ return DoclingDocument.model_validate(data)
2263
+
2103
2264
  def export_to_dict(
2104
2265
  self,
2105
2266
  mode: str = "json",
@@ -2115,7 +2276,7 @@ class DoclingDocument(BaseModel):
2115
2276
  self,
2116
2277
  filename: Path,
2117
2278
  artifacts_dir: Optional[Path] = None,
2118
- delim: str = "\n",
2279
+ delim: str = "\n\n", # TODO: deprecate
2119
2280
  from_element: int = 0,
2120
2281
  to_element: int = sys.maxsize,
2121
2282
  labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
@@ -2158,7 +2319,7 @@ class DoclingDocument(BaseModel):
2158
2319
 
2159
2320
  def export_to_markdown( # noqa: C901
2160
2321
  self,
2161
- delim: str = "\n",
2322
+ delim: str = "\n\n", # TODO deprecate
2162
2323
  from_element: int = 0,
2163
2324
  to_element: int = sys.maxsize,
2164
2325
  labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
@@ -2205,10 +2366,44 @@ class DoclingDocument(BaseModel):
2205
2366
  :returns: The exported Markdown representation.
2206
2367
  :rtype: str
2207
2368
  """
2208
- mdtexts: list[str] = []
2209
- list_nesting_level = 0 # Track the current list nesting level
2210
- previous_level = 0 # Track the previous item's level
2211
- in_list = False # Track if we're currently processing list items
2369
+ comps = self._get_markdown_components(
2370
+ node=self.body,
2371
+ from_element=from_element,
2372
+ to_element=to_element,
2373
+ labels=labels,
2374
+ strict_text=strict_text,
2375
+ escaping_underscores=escaping_underscores,
2376
+ image_placeholder=image_placeholder,
2377
+ image_mode=image_mode,
2378
+ indent=indent,
2379
+ text_width=text_width,
2380
+ page_no=page_no,
2381
+ included_content_layers=included_content_layers,
2382
+ list_level=0,
2383
+ is_inline_scope=False,
2384
+ visited=set(),
2385
+ )
2386
+ return delim.join(comps)
2387
+
2388
+ def _get_markdown_components( # noqa: C901
2389
+ self,
2390
+ node: NodeItem,
2391
+ from_element: int,
2392
+ to_element: int,
2393
+ labels: set[DocItemLabel],
2394
+ strict_text: bool,
2395
+ escaping_underscores: bool,
2396
+ image_placeholder: str,
2397
+ image_mode: ImageRefMode,
2398
+ indent: int,
2399
+ text_width: int,
2400
+ page_no: Optional[int],
2401
+ included_content_layers: set[ContentLayer],
2402
+ list_level: int,
2403
+ is_inline_scope: bool,
2404
+ visited: set[str], # refs of visited items
2405
+ ) -> list[str]:
2406
+ components: list[str] = [] # components to concatenate
2212
2407
 
2213
2408
  # Our export markdown doesn't contain any emphasis styling:
2214
2409
  # Bold, Italic, or Bold-Italic
@@ -2243,137 +2438,138 @@ class DoclingDocument(BaseModel):
2243
2438
 
2244
2439
  return "".join(parts)
2245
2440
 
2246
- def _append_text(text: str, do_escape_html=True, do_escape_underscores=True):
2441
+ def _ingest_text(text: str, do_escape_html=True, do_escape_underscores=True):
2247
2442
  if do_escape_underscores and escaping_underscores:
2248
2443
  text = _escape_underscores(text)
2249
2444
  if do_escape_html:
2250
2445
  text = html.escape(text, quote=False)
2251
- mdtexts.append(text)
2446
+ if text:
2447
+ components.append(text)
2252
2448
 
2253
2449
  for ix, (item, level) in enumerate(
2254
2450
  self.iterate_items(
2255
- self.body,
2451
+ node,
2256
2452
  with_groups=True,
2257
2453
  page_no=page_no,
2258
2454
  included_content_layers=included_content_layers,
2259
2455
  )
2260
2456
  ):
2261
- # If we've moved to a lower level, we're exiting one or more groups
2262
- if level < previous_level:
2263
- # Calculate how many levels we've exited
2264
- level_difference = previous_level - level
2265
- # Decrement list_nesting_level for each list group we've exited
2266
- list_nesting_level = max(0, list_nesting_level - level_difference)
2267
-
2268
- previous_level = level # Update previous_level for next iteration
2457
+ if item.self_ref in visited:
2458
+ continue
2459
+ else:
2460
+ visited.add(item.self_ref)
2269
2461
 
2270
2462
  if ix < from_element or to_element <= ix:
2271
2463
  continue # skip as many items as you want
2272
2464
 
2273
- if (isinstance(item, DocItem)) and (item.label not in labels):
2465
+ elif (isinstance(item, DocItem)) and (item.label not in labels):
2274
2466
  continue # skip any label that is not whitelisted
2275
2467
 
2276
- # Handle newlines between different types of content
2277
- if (
2278
- len(mdtexts) > 0
2279
- and not isinstance(item, (ListItem, GroupItem))
2280
- and in_list
2281
- ):
2282
- mdtexts[-1] += "\n"
2283
- in_list = False
2284
-
2285
- if isinstance(item, GroupItem) and item.label in [
2286
- GroupLabel.LIST,
2287
- GroupLabel.ORDERED_LIST,
2288
- ]:
2289
-
2290
- if list_nesting_level == 0: # Check if we're on the top level.
2291
- # In that case a new list starts directly after another list.
2292
- mdtexts.append("\n") # Add a blank line
2293
-
2294
- # Increment list nesting level when entering a new list
2295
- list_nesting_level += 1
2296
- in_list = True
2297
- continue
2298
-
2299
2468
  elif isinstance(item, GroupItem):
2300
- continue
2469
+ if item.label in [
2470
+ GroupLabel.LIST,
2471
+ GroupLabel.ORDERED_LIST,
2472
+ ]:
2473
+ comps = self._get_markdown_components(
2474
+ node=item,
2475
+ from_element=from_element,
2476
+ to_element=to_element,
2477
+ labels=labels,
2478
+ strict_text=strict_text,
2479
+ escaping_underscores=escaping_underscores,
2480
+ image_placeholder=image_placeholder,
2481
+ image_mode=image_mode,
2482
+ indent=indent,
2483
+ text_width=text_width,
2484
+ page_no=page_no,
2485
+ included_content_layers=included_content_layers,
2486
+ list_level=list_level + 1,
2487
+ is_inline_scope=is_inline_scope,
2488
+ visited=visited,
2489
+ )
2490
+ # NOTE: assumes unordered (flag & marker currently in ListItem)
2491
+ indent_str = list_level * indent * " "
2492
+ text = "\n".join(
2493
+ [
2494
+ # avoid additional marker on already evaled sublists
2495
+ cpt if cpt and cpt[0] == " " else f"{indent_str}- {cpt}"
2496
+ for cpt in comps
2497
+ ]
2498
+ )
2499
+ _ingest_text(text=text)
2500
+ elif item.label == GroupLabel.INLINE:
2501
+ comps = self._get_markdown_components(
2502
+ node=item,
2503
+ from_element=from_element,
2504
+ to_element=to_element,
2505
+ labels=labels,
2506
+ strict_text=strict_text,
2507
+ escaping_underscores=escaping_underscores,
2508
+ image_placeholder=image_placeholder,
2509
+ image_mode=image_mode,
2510
+ indent=indent,
2511
+ text_width=text_width,
2512
+ page_no=page_no,
2513
+ included_content_layers=included_content_layers,
2514
+ list_level=list_level,
2515
+ is_inline_scope=True,
2516
+ visited=visited,
2517
+ )
2518
+ _ingest_text(" ".join(comps))
2519
+ else:
2520
+ continue
2301
2521
 
2302
2522
  elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
2303
- in_list = False
2304
2523
  marker = "" if strict_text else "#"
2305
2524
  text = f"{marker} {item.text}"
2306
- _append_text(text.strip() + "\n")
2525
+ _ingest_text(text.strip())
2307
2526
 
2308
2527
  elif (
2309
2528
  isinstance(item, TextItem)
2310
2529
  and item.label in [DocItemLabel.SECTION_HEADER]
2311
2530
  ) or isinstance(item, SectionHeaderItem):
2312
- in_list = False
2313
2531
  marker = ""
2314
2532
  if not strict_text:
2315
2533
  marker = "#" * level
2316
2534
  if len(marker) < 2:
2317
2535
  marker = "##"
2318
- text = f"{marker} {item.text}\n"
2319
- _append_text(text.strip() + "\n")
2320
-
2321
- elif isinstance(item, CodeItem) and item.label in labels:
2322
- in_list = False
2323
- text = f"```\n{item.text}\n```\n"
2324
- _append_text(text, do_escape_underscores=False, do_escape_html=False)
2325
-
2326
- elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
2327
- in_list = True
2328
- # Calculate indent based on list_nesting_level
2329
- # -1 because level 1 needs no indent
2330
- list_indent = " " * (indent * (list_nesting_level - 1))
2331
-
2332
- marker = ""
2333
- if strict_text:
2334
- marker = ""
2335
- elif item.enumerated:
2336
- marker = item.marker
2337
- else:
2338
- marker = "-" # Markdown needs only dash as item marker.
2536
+ text = f"{marker} {item.text}"
2537
+ _ingest_text(text.strip())
2339
2538
 
2340
- text = f"{list_indent}{marker} {item.text}"
2341
- _append_text(text)
2539
+ elif isinstance(item, CodeItem):
2540
+ text = f"`{item.text}`" if is_inline_scope else f"```\n{item.text}\n```"
2541
+ _ingest_text(text, do_escape_underscores=False, do_escape_html=False)
2342
2542
 
2343
2543
  elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
2344
- in_list = False
2345
2544
  if item.text != "":
2346
- _append_text(
2347
- f"$${item.text}$$\n",
2545
+ _ingest_text(
2546
+ f"${item.text}$" if is_inline_scope else f"$${item.text}$$",
2348
2547
  do_escape_underscores=False,
2349
2548
  do_escape_html=False,
2350
2549
  )
2351
2550
  elif item.orig != "":
2352
- _append_text(
2353
- "<!-- formula-not-decoded -->\n",
2551
+ _ingest_text(
2552
+ "<!-- formula-not-decoded -->",
2354
2553
  do_escape_underscores=False,
2355
2554
  do_escape_html=False,
2356
2555
  )
2357
2556
 
2358
- elif isinstance(item, TextItem) and item.label in labels:
2359
- in_list = False
2557
+ elif isinstance(item, TextItem):
2360
2558
  if len(item.text) and text_width > 0:
2361
2559
  text = item.text
2362
2560
  wrapped_text = textwrap.fill(text, width=text_width)
2363
- _append_text(wrapped_text + "\n")
2561
+ _ingest_text(wrapped_text)
2364
2562
  elif len(item.text):
2365
- text = f"{item.text}\n"
2366
- _append_text(text)
2563
+ _ingest_text(item.text)
2367
2564
 
2368
2565
  elif isinstance(item, TableItem) and not strict_text:
2369
- in_list = False
2370
- _append_text(item.caption_text(self))
2566
+ if caption_text := item.caption_text(self):
2567
+ _ingest_text(caption_text)
2371
2568
  md_table = item.export_to_markdown()
2372
- _append_text("\n" + md_table + "\n")
2569
+ _ingest_text(md_table)
2373
2570
 
2374
2571
  elif isinstance(item, PictureItem) and not strict_text:
2375
- in_list = False
2376
- _append_text(item.caption_text(self))
2572
+ _ingest_text(item.caption_text(self))
2377
2573
 
2378
2574
  line = item.export_to_markdown(
2379
2575
  doc=self,
@@ -2381,19 +2577,17 @@ class DoclingDocument(BaseModel):
2381
2577
  image_mode=image_mode,
2382
2578
  )
2383
2579
 
2384
- _append_text(line, do_escape_html=False, do_escape_underscores=False)
2580
+ _ingest_text(line, do_escape_html=False, do_escape_underscores=False)
2385
2581
 
2386
- elif isinstance(item, DocItem) and item.label in labels:
2387
- in_list = False
2388
- text = "<!-- missing-text -->"
2389
- _append_text(text, do_escape_html=False, do_escape_underscores=False)
2582
+ elif isinstance(item, (KeyValueItem, FormItem)):
2583
+ text = item._export_to_markdown()
2584
+ _ingest_text(text, do_escape_html=False, do_escape_underscores=False)
2390
2585
 
2391
- mdtext = (delim.join(mdtexts)).strip()
2392
- mdtext = re.sub(
2393
- r"\n\n\n+", "\n\n", mdtext
2394
- ) # remove cases of double or more empty lines.
2586
+ elif isinstance(item, DocItem):
2587
+ text = "<!-- missing-text -->"
2588
+ _ingest_text(text, do_escape_html=False, do_escape_underscores=False)
2395
2589
 
2396
- return mdtext
2590
+ return components
2397
2591
 
2398
2592
  def export_to_text( # noqa: C901
2399
2593
  self,
@@ -2519,7 +2713,11 @@ class DoclingDocument(BaseModel):
2519
2713
 
2520
2714
  return (in_ordered_list, html_texts)
2521
2715
 
2522
- head_lines = ["<!DOCTYPE html>", f'<html lang="{html_lang}">', html_head]
2716
+ head_lines = [
2717
+ "<!DOCTYPE html>",
2718
+ f'<html lang="{html_lang}">',
2719
+ html_head,
2720
+ ]
2523
2721
  html_texts: list[str] = []
2524
2722
 
2525
2723
  prev_level = 0 # Track the previous item's level
@@ -2599,7 +2797,8 @@ class DoclingDocument(BaseModel):
2599
2797
  section_level: int = min(item.level + 1, 6)
2600
2798
 
2601
2799
  text = get_html_tag_with_text_direction(
2602
- html_tag=f"h{section_level}", text=_prepare_tag_content(item.text)
2800
+ html_tag=f"h{section_level}",
2801
+ text=_prepare_tag_content(item.text),
2603
2802
  )
2604
2803
  html_texts.append(text)
2605
2804
 
@@ -2620,14 +2819,17 @@ class DoclingDocument(BaseModel):
2620
2819
  "</figure>"
2621
2820
  )
2622
2821
 
2822
+ img_fallback = _image_fallback(item)
2823
+
2623
2824
  # If the formula is not processed correctly, use its image
2624
2825
  if (
2625
2826
  item.text == ""
2626
2827
  and item.orig != ""
2627
2828
  and image_mode == ImageRefMode.EMBEDDED
2628
2829
  and len(item.prov) > 0
2830
+ and img_fallback is not None
2629
2831
  ):
2630
- text = _image_fallback(item)
2832
+ text = img_fallback
2631
2833
 
2632
2834
  # Building a math equation in MathML format
2633
2835
  # ref https://www.w3.org/TR/wai-aria-1.1/#math
@@ -2647,9 +2849,13 @@ class DoclingDocument(BaseModel):
2647
2849
  "Malformed formula cannot be rendered. "
2648
2850
  f"Error {err.__class__.__name__}, formula={math_formula}"
2649
2851
  )
2650
- if image_mode == ImageRefMode.EMBEDDED and len(item.prov) > 0:
2651
- text = _image_fallback(item)
2652
- else:
2852
+ if (
2853
+ image_mode == ImageRefMode.EMBEDDED
2854
+ and len(item.prov) > 0
2855
+ and img_fallback is not None
2856
+ ):
2857
+ text = img_fallback
2858
+ elif len(math_formula) > 0:
2653
2859
  text = f"<pre>{math_formula}</pre>"
2654
2860
 
2655
2861
  elif math_formula != "":
@@ -2856,13 +3062,19 @@ class DoclingDocument(BaseModel):
2856
3062
  self.iterate_items(
2857
3063
  self.body,
2858
3064
  with_groups=True,
2859
- included_content_layers={ContentLayer.BODY, ContentLayer.FURNITURE},
3065
+ included_content_layers={
3066
+ ContentLayer.BODY,
3067
+ ContentLayer.FURNITURE,
3068
+ },
2860
3069
  )
2861
3070
  ):
2862
3071
  # Close lists if we've moved to a lower nesting level
2863
3072
  if current_level < previous_level and ordered_list_stack:
2864
3073
  ordered_list_stack = _close_lists(
2865
- current_level, previous_level, ordered_list_stack, output_parts
3074
+ current_level,
3075
+ previous_level,
3076
+ ordered_list_stack,
3077
+ output_parts,
2866
3078
  )
2867
3079
  previous_level = current_level
2868
3080
 
@@ -2970,7 +3182,10 @@ class DoclingDocument(BaseModel):
2970
3182
  return "".join(output_parts)
2971
3183
 
2972
3184
  def _export_to_indented_text(
2973
- self, indent=" ", max_text_len: int = -1, explicit_tables: bool = False
3185
+ self,
3186
+ indent=" ",
3187
+ max_text_len: int = -1,
3188
+ explicit_tables: bool = False,
2974
3189
  ):
2975
3190
  """Export the document to indented text to expose hierarchy."""
2976
3191
  result = []
@@ -75,6 +75,7 @@ class GroupLabel(str, Enum):
75
75
  FORM_AREA = "form_area"
76
76
  KEY_VALUE_AREA = "key_value_area"
77
77
  COMMENT_SECTION = "comment_section"
78
+ INLINE = "inline"
78
79
 
79
80
  def __str__(self):
80
81
  """Get string value."""
@@ -140,6 +141,29 @@ class TableCellLabel(str, Enum):
140
141
  return str(self.value)
141
142
 
142
143
 
144
+ class GraphCellLabel(str, Enum):
145
+ """GraphCellLabel."""
146
+
147
+ UNSPECIFIED = "unspecified"
148
+
149
+ KEY = "key"
150
+ VALUE = "value"
151
+
152
+ CHECKBOX = "checkbox"
153
+
154
+
155
+ class GraphLinkLabel(str, Enum):
156
+ """GraphLinkLabel."""
157
+
158
+ UNSPECIFIED = "unspecified"
159
+
160
+ TO_VALUE = "to_value"
161
+ TO_KEY = "to_key"
162
+
163
+ TO_PARENT = "to_parent"
164
+ TO_CHILD = "to_child"
165
+
166
+
143
167
  class CodeLanguageLabel(str, Enum):
144
168
  """CodeLanguageLabel."""
145
169
 
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling-core"
3
- version = "2.19.1"
3
+ version = "2.21.0"
4
4
  description = "A python library to define and validate data types in Docling."
5
5
  license = "MIT"
6
6
  authors = [
File without changes
File without changes