docling-core 2.8.0__py3-none-any.whl → 2.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. Click here for more details.

@@ -0,0 +1 @@
1
+ """CLI package."""
@@ -0,0 +1,68 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """CLI for docling viewer."""
7
+ import importlib
8
+ import tempfile
9
+ import webbrowser
10
+ from pathlib import Path
11
+ from typing import Annotated, Optional
12
+
13
+ import typer
14
+
15
+ from docling_core.types.doc import DoclingDocument
16
+ from docling_core.types.doc.base import ImageRefMode
17
+ from docling_core.utils.file import resolve_source_to_path
18
+
19
+ app = typer.Typer(
20
+ name="Docling",
21
+ no_args_is_help=True,
22
+ add_completion=False,
23
+ pretty_exceptions_enable=False,
24
+ )
25
+
26
+
27
+ def version_callback(value: bool):
28
+ """Callback for version inspection."""
29
+ if value:
30
+ docling_core_version = importlib.metadata.version("docling-core")
31
+ print(f"Docling Core version: {docling_core_version}")
32
+ raise typer.Exit()
33
+
34
+
35
+ @app.command(no_args_is_help=True)
36
+ def view(
37
+ source: Annotated[
38
+ str,
39
+ typer.Argument(
40
+ ...,
41
+ metavar="source",
42
+ help="Docling JSON file to view.",
43
+ ),
44
+ ],
45
+ version: Annotated[
46
+ Optional[bool],
47
+ typer.Option(
48
+ "--version",
49
+ callback=version_callback,
50
+ is_eager=True,
51
+ help="Show version information.",
52
+ ),
53
+ ] = None,
54
+ ):
55
+ """Display a Docling JSON file on the default browser."""
56
+ path = resolve_source_to_path(source=source)
57
+ doc = DoclingDocument.load_from_json(filename=path)
58
+ target_path = Path(tempfile.mkdtemp()) / "out.html"
59
+ html_output = doc.export_to_html(image_mode=ImageRefMode.EMBEDDED)
60
+ with open(target_path, "w") as f:
61
+ f.write(html_output)
62
+ webbrowser.open(url=f"file://{target_path.absolute().resolve()}")
63
+
64
+
65
+ click_app = typer.main.get_command(app)
66
+
67
+ if __name__ == "__main__":
68
+ app()
@@ -44,7 +44,9 @@ class HybridChunker(BaseChunker):
44
44
 
45
45
  model_config = ConfigDict(arbitrary_types_allowed=True)
46
46
 
47
- tokenizer: Union[PreTrainedTokenizerBase, str]
47
+ tokenizer: Union[PreTrainedTokenizerBase, str] = (
48
+ "sentence-transformers/all-MiniLM-L6-v2"
49
+ )
48
50
  max_tokens: int = None # type: ignore[assignment]
49
51
  merge_peers: bool = True
50
52
 
@@ -96,6 +98,7 @@ class HybridChunker(BaseChunker):
96
98
  doc_items=doc_chunk.meta.doc_items[window_start : window_end + 1],
97
99
  headings=doc_chunk.meta.headings,
98
100
  captions=doc_chunk.meta.captions,
101
+ origin=doc_chunk.meta.origin,
99
102
  )
100
103
  new_chunk = DocChunk(text=window_text, meta=meta)
101
104
  return new_chunk
@@ -242,6 +245,7 @@ class HybridChunker(BaseChunker):
242
245
  doc_items=window_items,
243
246
  headings=current_headings_and_captions[0],
244
247
  captions=current_headings_and_captions[1],
248
+ origin=chunk.meta.origin,
245
249
  )
246
250
  new_chunk = DocChunk(
247
251
  text=window_text,
@@ -49,7 +49,6 @@ DEFAULT_EXPORT_LABELS = {
49
49
  DocItemLabel.DOCUMENT_INDEX,
50
50
  DocItemLabel.SECTION_HEADER,
51
51
  DocItemLabel.PARAGRAPH,
52
- DocItemLabel.CAPTION,
53
52
  DocItemLabel.TABLE,
54
53
  DocItemLabel.PICTURE,
55
54
  DocItemLabel.FORMULA,
@@ -58,6 +57,7 @@ DEFAULT_EXPORT_LABELS = {
58
57
  DocItemLabel.TEXT,
59
58
  DocItemLabel.LIST_ITEM,
60
59
  DocItemLabel.CODE,
60
+ DocItemLabel.REFERENCE,
61
61
  }
62
62
 
63
63
 
@@ -380,6 +380,7 @@ class DocumentOrigin(BaseModel):
380
380
  "application/vnd.openxmlformats-officedocument.presentationml.template",
381
381
  "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
382
382
  "application/vnd.openxmlformats-officedocument.presentationml.presentation",
383
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
383
384
  "text/asciidoc",
384
385
  "text/markdown",
385
386
  ]
@@ -445,7 +446,7 @@ class ImageRef(BaseModel):
445
446
  mimetype: str
446
447
  dpi: int
447
448
  size: Size
448
- uri: Union[AnyUrl, Path]
449
+ uri: Union[AnyUrl, Path] = Field(union_mode="left_to_right")
449
450
  _pil: Optional[PILImage.Image] = None
450
451
 
451
452
  @property
@@ -592,6 +593,21 @@ class DocItem(
592
593
  class TextItem(DocItem):
593
594
  """TextItem."""
594
595
 
596
+ label: typing.Literal[
597
+ DocItemLabel.CAPTION,
598
+ DocItemLabel.CHECKBOX_SELECTED,
599
+ DocItemLabel.CHECKBOX_UNSELECTED,
600
+ DocItemLabel.CODE,
601
+ DocItemLabel.FOOTNOTE,
602
+ DocItemLabel.FORMULA,
603
+ DocItemLabel.PAGE_FOOTER,
604
+ DocItemLabel.PAGE_HEADER,
605
+ DocItemLabel.PARAGRAPH,
606
+ DocItemLabel.REFERENCE,
607
+ DocItemLabel.TEXT,
608
+ DocItemLabel.TITLE,
609
+ ]
610
+
595
611
  orig: str # untreated representation
596
612
  text: str # sanitized representation
597
613
 
@@ -643,8 +659,10 @@ class TextItem(DocItem):
643
659
  class SectionHeaderItem(TextItem):
644
660
  """SectionItem."""
645
661
 
646
- label: typing.Literal[DocItemLabel.SECTION_HEADER] = DocItemLabel.SECTION_HEADER
647
- level: LevelNumber
662
+ label: typing.Literal[DocItemLabel.SECTION_HEADER] = (
663
+ DocItemLabel.SECTION_HEADER # type: ignore[assignment]
664
+ )
665
+ level: LevelNumber = 1
648
666
 
649
667
  def export_to_document_tokens(
650
668
  self,
@@ -694,9 +712,11 @@ class SectionHeaderItem(TextItem):
694
712
  class ListItem(TextItem):
695
713
  """SectionItem."""
696
714
 
697
- label: typing.Literal[DocItemLabel.LIST_ITEM] = DocItemLabel.LIST_ITEM
715
+ label: typing.Literal[DocItemLabel.LIST_ITEM] = (
716
+ DocItemLabel.LIST_ITEM # type: ignore[assignment]
717
+ )
698
718
  enumerated: bool = False
699
- marker: str # The bullet or number symbol that prefixes this list item
719
+ marker: str = "-" # The bullet or number symbol that prefixes this list item
700
720
 
701
721
 
702
722
  class FloatingItem(DocItem):
@@ -922,7 +942,10 @@ class TableItem(FloatingItem):
922
942
  """TableItem."""
923
943
 
924
944
  data: TableData
925
- label: typing.Literal[DocItemLabel.TABLE] = DocItemLabel.TABLE
945
+ label: typing.Literal[
946
+ DocItemLabel.DOCUMENT_INDEX,
947
+ DocItemLabel.TABLE,
948
+ ] = DocItemLabel.TABLE
926
949
 
927
950
  def export_to_dataframe(self) -> pd.DataFrame:
928
951
  """Export the table as a Pandas DataFrame."""
@@ -1271,9 +1294,19 @@ class TableItem(FloatingItem):
1271
1294
  class KeyValueItem(DocItem):
1272
1295
  """KeyValueItem."""
1273
1296
 
1297
+ label: typing.Literal[DocItemLabel.KEY_VALUE_REGION] = DocItemLabel.KEY_VALUE_REGION
1274
1298
 
1275
- ContentItem = Union[
1276
- TextItem, SectionHeaderItem, ListItem, PictureItem, TableItem, KeyValueItem
1299
+
1300
+ ContentItem = Annotated[
1301
+ Union[
1302
+ TextItem,
1303
+ SectionHeaderItem,
1304
+ ListItem,
1305
+ PictureItem,
1306
+ TableItem,
1307
+ KeyValueItem,
1308
+ ],
1309
+ Field(discriminator="label"),
1277
1310
  ]
1278
1311
 
1279
1312
 
@@ -1375,13 +1408,13 @@ class DoclingDocument(BaseModel):
1375
1408
  self,
1376
1409
  label: Optional[GroupLabel] = None,
1377
1410
  name: Optional[str] = None,
1378
- parent: Optional[GroupItem] = None,
1411
+ parent: Optional[NodeItem] = None,
1379
1412
  ) -> GroupItem:
1380
1413
  """add_group.
1381
1414
 
1382
1415
  :param label: Optional[GroupLabel]: (Default value = None)
1383
1416
  :param name: Optional[str]: (Default value = None)
1384
- :param parent: Optional[GroupItem]: (Default value = None)
1417
+ :param parent: Optional[NodeItem]: (Default value = None)
1385
1418
 
1386
1419
  """
1387
1420
  if not parent:
@@ -1408,7 +1441,7 @@ class DoclingDocument(BaseModel):
1408
1441
  marker: Optional[str] = None,
1409
1442
  orig: Optional[str] = None,
1410
1443
  prov: Optional[ProvenanceItem] = None,
1411
- parent: Optional[GroupItem] = None,
1444
+ parent: Optional[NodeItem] = None,
1412
1445
  ):
1413
1446
  """add_list_item.
1414
1447
 
@@ -1416,7 +1449,7 @@ class DoclingDocument(BaseModel):
1416
1449
  :param text: str:
1417
1450
  :param orig: Optional[str]: (Default value = None)
1418
1451
  :param prov: Optional[ProvenanceItem]: (Default value = None)
1419
- :param parent: Optional[GroupItem]: (Default value = None)
1452
+ :param parent: Optional[NodeItem]: (Default value = None)
1420
1453
 
1421
1454
  """
1422
1455
  if not parent:
@@ -1451,7 +1484,7 @@ class DoclingDocument(BaseModel):
1451
1484
  text: str,
1452
1485
  orig: Optional[str] = None,
1453
1486
  prov: Optional[ProvenanceItem] = None,
1454
- parent: Optional[GroupItem] = None,
1487
+ parent: Optional[NodeItem] = None,
1455
1488
  ):
1456
1489
  """add_text.
1457
1490
 
@@ -1459,7 +1492,7 @@ class DoclingDocument(BaseModel):
1459
1492
  :param text: str:
1460
1493
  :param orig: Optional[str]: (Default value = None)
1461
1494
  :param prov: Optional[ProvenanceItem]: (Default value = None)
1462
- :param parent: Optional[GroupItem]: (Default value = None)
1495
+ :param parent: Optional[NodeItem]: (Default value = None)
1463
1496
 
1464
1497
  """
1465
1498
  # Catch a few cases that are in principle allowed
@@ -1503,15 +1536,16 @@ class DoclingDocument(BaseModel):
1503
1536
  data: TableData,
1504
1537
  caption: Optional[Union[TextItem, RefItem]] = None, # This is not cool yet.
1505
1538
  prov: Optional[ProvenanceItem] = None,
1506
- parent: Optional[GroupItem] = None,
1539
+ parent: Optional[NodeItem] = None,
1540
+ label: DocItemLabel = DocItemLabel.TABLE,
1507
1541
  ):
1508
1542
  """add_table.
1509
1543
 
1510
- :param data: BaseTableData:
1511
- :param caption: Optional[Union[TextItem:
1512
- :param RefItem]]: (Default value = None)
1513
- :param # This is not cool yet.prov: Optional[ProvenanceItem]
1514
- :param parent: Optional[GroupItem]: (Default value = None)
1544
+ :param data: TableData:
1545
+ :param caption: Optional[Union[TextItem, RefItem]]: (Default value = None)
1546
+ :param prov: Optional[ProvenanceItem]: (Default value = None)
1547
+ :param parent: Optional[NodeItem]: (Default value = None)
1548
+ :param label: DocItemLabel: (Default value = DocItemLabel.TABLE)
1515
1549
 
1516
1550
  """
1517
1551
  if not parent:
@@ -1521,7 +1555,7 @@ class DoclingDocument(BaseModel):
1521
1555
  cref = f"#/tables/{table_index}"
1522
1556
 
1523
1557
  tbl_item = TableItem(
1524
- label=DocItemLabel.TABLE, data=data, self_ref=cref, parent=parent.get_ref()
1558
+ label=label, data=data, self_ref=cref, parent=parent.get_ref()
1525
1559
  )
1526
1560
  if prov:
1527
1561
  tbl_item.prov.append(prov)
@@ -1539,7 +1573,7 @@ class DoclingDocument(BaseModel):
1539
1573
  image: Optional[ImageRef] = None,
1540
1574
  caption: Optional[Union[TextItem, RefItem]] = None,
1541
1575
  prov: Optional[ProvenanceItem] = None,
1542
- parent: Optional[GroupItem] = None,
1576
+ parent: Optional[NodeItem] = None,
1543
1577
  ):
1544
1578
  """add_picture.
1545
1579
 
@@ -1547,7 +1581,7 @@ class DoclingDocument(BaseModel):
1547
1581
  :param caption: Optional[Union[TextItem:
1548
1582
  :param RefItem]]: (Default value = None)
1549
1583
  :param prov: Optional[ProvenanceItem]: (Default value = None)
1550
- :param parent: Optional[GroupItem]: (Default value = None)
1584
+ :param parent: Optional[NodeItem]: (Default value = None)
1551
1585
  """
1552
1586
  if not parent:
1553
1587
  parent = self.body
@@ -1577,14 +1611,14 @@ class DoclingDocument(BaseModel):
1577
1611
  text: str,
1578
1612
  orig: Optional[str] = None,
1579
1613
  prov: Optional[ProvenanceItem] = None,
1580
- parent: Optional[GroupItem] = None,
1614
+ parent: Optional[NodeItem] = None,
1581
1615
  ):
1582
1616
  """add_title.
1583
1617
 
1584
1618
  :param text: str:
1585
1619
  :param orig: Optional[str]: (Default value = None)
1586
1620
  :param prov: Optional[ProvenanceItem]: (Default value = None)
1587
- :param parent: Optional[GroupItem]: (Default value = None)
1621
+ :param parent: Optional[NodeItem]: (Default value = None)
1588
1622
  """
1589
1623
  if not parent:
1590
1624
  parent = self.body
@@ -1615,7 +1649,7 @@ class DoclingDocument(BaseModel):
1615
1649
  orig: Optional[str] = None,
1616
1650
  level: LevelNumber = 1,
1617
1651
  prov: Optional[ProvenanceItem] = None,
1618
- parent: Optional[GroupItem] = None,
1652
+ parent: Optional[NodeItem] = None,
1619
1653
  ):
1620
1654
  """add_heading.
1621
1655
 
@@ -1624,7 +1658,7 @@ class DoclingDocument(BaseModel):
1624
1658
  :param orig: Optional[str]: (Default value = None)
1625
1659
  :param level: LevelNumber: (Default value = 1)
1626
1660
  :param prov: Optional[ProvenanceItem]: (Default value = None)
1627
- :param parent: Optional[GroupItem]: (Default value = None)
1661
+ :param parent: Optional[NodeItem]: (Default value = None)
1628
1662
  """
1629
1663
  if not parent:
1630
1664
  parent = self.body
@@ -1668,7 +1702,7 @@ class DoclingDocument(BaseModel):
1668
1702
  self,
1669
1703
  root: Optional[NodeItem] = None,
1670
1704
  with_groups: bool = False,
1671
- traverse_pictures: bool = True,
1705
+ traverse_pictures: bool = False,
1672
1706
  page_no: Optional[int] = None,
1673
1707
  _level: int = 0, # fixed parameter, carries through the node nesting level
1674
1708
  ) -> typing.Iterable[Tuple[NodeItem, int]]: # tuple of node and level
@@ -1685,30 +1719,31 @@ class DoclingDocument(BaseModel):
1685
1719
  if not root:
1686
1720
  root = self.body
1687
1721
 
1722
+ # Yield non-group items or group items when with_groups=True
1688
1723
  if not isinstance(root, GroupItem) or with_groups:
1689
1724
  if isinstance(root, DocItem):
1690
- if page_no is not None:
1691
- for prov in root.prov:
1692
- if prov.page_no == page_no:
1693
- yield root, _level
1694
- else:
1725
+ if page_no is None or any(
1726
+ prov.page_no == page_no for prov in root.prov
1727
+ ):
1695
1728
  yield root, _level
1696
1729
  else:
1697
1730
  yield root, _level
1698
1731
 
1732
+ # Handle picture traversal - only traverse children if requested
1733
+ if isinstance(root, PictureItem) and not traverse_pictures:
1734
+ return
1735
+
1699
1736
  # Traverse children
1700
1737
  for child_ref in root.children:
1701
1738
  child = child_ref.resolve(self)
1702
-
1703
1739
  if isinstance(child, NodeItem):
1704
- # If the child is a NodeItem, recursively traverse it
1705
- if not isinstance(child, PictureItem) or traverse_pictures:
1706
- yield from self.iterate_items(
1707
- child,
1708
- _level=_level + 1,
1709
- with_groups=with_groups,
1710
- page_no=page_no,
1711
- )
1740
+ yield from self.iterate_items(
1741
+ child,
1742
+ with_groups=with_groups,
1743
+ traverse_pictures=traverse_pictures,
1744
+ page_no=page_no,
1745
+ _level=_level + 1,
1746
+ )
1712
1747
 
1713
1748
  def _clear_picture_pil_cache(self):
1714
1749
  """Clear cache storage of all images."""
@@ -1864,7 +1899,7 @@ class DoclingDocument(BaseModel):
1864
1899
 
1865
1900
  """
1866
1901
  with open(filename, "r") as f:
1867
- return cls.model_validate(json.loads(f.read()))
1902
+ return cls.model_validate_json(f.read())
1868
1903
 
1869
1904
  def save_as_yaml(
1870
1905
  self,
@@ -2053,10 +2088,6 @@ class DoclingDocument(BaseModel):
2053
2088
  text = f"```\n{item.text}\n```\n"
2054
2089
  mdtexts.append(text)
2055
2090
 
2056
- elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
2057
- # captions are printed in picture and table ... skipping for now
2058
- continue
2059
-
2060
2091
  elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
2061
2092
  in_list = True
2062
2093
  # Calculate indent based on list_nesting_level
@@ -2115,10 +2146,30 @@ class DoclingDocument(BaseModel):
2115
2146
  # Bold, Italic, or Bold-Italic
2116
2147
  # Hence, any underscore that we print into Markdown is coming from document text
2117
2148
  # That means we need to escape it, to properly reflect content in the markdown
2149
+ # However, we need to preserve underscores in image URLs
2150
+ # to maintain their validity
2151
+ # For example: ![image](path/to_image.png) should remain unchanged
2118
2152
  def escape_underscores(text):
2119
- # Replace "_" with "\_" only if it's not already escaped
2120
- escaped_text = re.sub(r"(?<!\\)_", r"\_", text)
2121
- return escaped_text
2153
+ """Escape underscores but leave them intact in the URL.."""
2154
+ # Firstly, identify all the URL patterns.
2155
+ url_pattern = r"!\[.*?\]\((.*?)\)"
2156
+ parts = []
2157
+ last_end = 0
2158
+
2159
+ for match in re.finditer(url_pattern, text):
2160
+ # Text to add before the URL (needs to be escaped)
2161
+ before_url = text[last_end : match.start()]
2162
+ parts.append(re.sub(r"(?<!\\)_", r"\_", before_url))
2163
+
2164
+ # Add the full URL part (do not escape)
2165
+ parts.append(match.group(0))
2166
+ last_end = match.end()
2167
+
2168
+ # Add the final part of the text (which needs to be escaped)
2169
+ if last_end < len(text):
2170
+ parts.append(re.sub(r"(?<!\\)_", r"\_", text[last_end:]))
2171
+
2172
+ return "".join(parts)
2122
2173
 
2123
2174
  mdtext = escape_underscores(mdtext)
2124
2175
 
@@ -2328,10 +2379,6 @@ class DoclingDocument(BaseModel):
2328
2379
  text = f"<pre>{item.text}</pre>"
2329
2380
  html_texts.append(text)
2330
2381
 
2331
- elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
2332
- # captions are printed in picture and table ... skipping for now
2333
- continue
2334
-
2335
2382
  elif isinstance(item, ListItem):
2336
2383
 
2337
2384
  text = f"<li>{item.text}</li>"
@@ -2533,10 +2580,6 @@ class DoclingDocument(BaseModel):
2533
2580
  result += f"<unordered_list>{delim}"
2534
2581
  in_ordered_list.append(False)
2535
2582
 
2536
- elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
2537
- # captions are printed in picture and table ... skipping for now
2538
- continue
2539
-
2540
2583
  elif isinstance(item, SectionHeaderItem):
2541
2584
 
2542
2585
  result += item.export_to_document_tokens(
@@ -2642,10 +2685,6 @@ class DoclingDocument(BaseModel):
2642
2685
  indent * level + f"item-{i} at level {level}: {item.label}: {text}"
2643
2686
  )
2644
2687
 
2645
- elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
2646
- # captions are printed in picture and table ... skipping for now
2647
- continue
2648
-
2649
2688
  elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
2650
2689
  text = get_text(text=item.text, max_text_len=max_text_len)
2651
2690
 
@@ -140,6 +140,7 @@ class BaseCell(AliasModel):
140
140
  obj_type: str = Field(
141
141
  alias="type", json_schema_extra=es_field(type="keyword", ignore_above=8191)
142
142
  )
143
+ payload: Optional[dict] = None
143
144
 
144
145
  def get_location_tokens(
145
146
  self,
@@ -0,0 +1,633 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """Utilities for converting between legacy and new document format."""
7
+
8
+ import hashlib
9
+ import uuid
10
+ from pathlib import Path
11
+ from typing import Dict, Optional, Union
12
+
13
+ from docling_core.types.doc import (
14
+ BoundingBox,
15
+ CoordOrigin,
16
+ DocItem,
17
+ DocItemLabel,
18
+ DoclingDocument,
19
+ DocumentOrigin,
20
+ PictureItem,
21
+ ProvenanceItem,
22
+ SectionHeaderItem,
23
+ Size,
24
+ TableCell,
25
+ TableItem,
26
+ TextItem,
27
+ )
28
+ from docling_core.types.doc.document import GroupItem, ListItem, TableData
29
+ from docling_core.types.doc.labels import GroupLabel
30
+ from docling_core.types.legacy_doc.base import (
31
+ BaseCell,
32
+ BaseText,
33
+ Figure,
34
+ GlmTableCell,
35
+ PageDimensions,
36
+ PageReference,
37
+ Prov,
38
+ Ref,
39
+ )
40
+ from docling_core.types.legacy_doc.base import Table as DsSchemaTable
41
+ from docling_core.types.legacy_doc.base import TableCell as DsTableCell
42
+ from docling_core.types.legacy_doc.document import (
43
+ CCSDocumentDescription as DsDocumentDescription,
44
+ )
45
+ from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
46
+ from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
47
+
48
+
49
+ def _create_hash(string: str):
50
+ hasher = hashlib.sha256()
51
+ hasher.update(string.encode("utf-8"))
52
+
53
+ return hasher.hexdigest()
54
+
55
+
56
+ def doc_item_label_to_legacy_type(label: DocItemLabel):
57
+ """Convert the DocItemLabel to the legacy type."""
58
+ _label_to_ds_type = {
59
+ DocItemLabel.TITLE: "title",
60
+ DocItemLabel.DOCUMENT_INDEX: "table-of-contents",
61
+ DocItemLabel.SECTION_HEADER: "subtitle-level-1",
62
+ DocItemLabel.CHECKBOX_SELECTED: "checkbox-selected",
63
+ DocItemLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
64
+ DocItemLabel.CAPTION: "caption",
65
+ DocItemLabel.PAGE_HEADER: "page-header",
66
+ DocItemLabel.PAGE_FOOTER: "page-footer",
67
+ DocItemLabel.FOOTNOTE: "footnote",
68
+ DocItemLabel.TABLE: "table",
69
+ DocItemLabel.FORMULA: "equation",
70
+ DocItemLabel.LIST_ITEM: "paragraph",
71
+ DocItemLabel.CODE: "paragraph",
72
+ DocItemLabel.PICTURE: "figure",
73
+ DocItemLabel.TEXT: "paragraph",
74
+ DocItemLabel.PARAGRAPH: "paragraph",
75
+ }
76
+ if label in _label_to_ds_type:
77
+ return _label_to_ds_type[label]
78
+ return label.value
79
+
80
+
81
+ def doc_item_label_to_legacy_name(label: DocItemLabel):
82
+ """Convert the DocItemLabel to the legacy name."""
83
+ _reverse_label_name_mapping = {
84
+ DocItemLabel.CAPTION: "Caption",
85
+ DocItemLabel.FOOTNOTE: "Footnote",
86
+ DocItemLabel.FORMULA: "Formula",
87
+ DocItemLabel.LIST_ITEM: "List-item",
88
+ DocItemLabel.PAGE_FOOTER: "Page-footer",
89
+ DocItemLabel.PAGE_HEADER: "Page-header",
90
+ DocItemLabel.PICTURE: "Picture",
91
+ DocItemLabel.SECTION_HEADER: "Section-header",
92
+ DocItemLabel.TABLE: "Table",
93
+ DocItemLabel.TEXT: "Text",
94
+ DocItemLabel.TITLE: "Title",
95
+ DocItemLabel.DOCUMENT_INDEX: "Document Index",
96
+ DocItemLabel.CODE: "Code",
97
+ DocItemLabel.CHECKBOX_SELECTED: "Checkbox-Selected",
98
+ DocItemLabel.CHECKBOX_UNSELECTED: "Checkbox-Unselected",
99
+ DocItemLabel.FORM: "Form",
100
+ DocItemLabel.KEY_VALUE_REGION: "Key-Value Region",
101
+ DocItemLabel.PARAGRAPH: "paragraph",
102
+ }
103
+ if label in _reverse_label_name_mapping:
104
+ return _reverse_label_name_mapping[label]
105
+ return label.value
106
+
107
+
108
+ def docling_document_to_legacy(doc: DoclingDocument, fallback_filaname: str = "file"):
109
+ """Convert a DoclingDocument to the legacy format."""
110
+ title = ""
111
+ desc: DsDocumentDescription = DsDocumentDescription(logs=[])
112
+
113
+ if doc.origin is not None:
114
+ document_hash = _create_hash(str(doc.origin.binary_hash))
115
+ filename = doc.origin.filename
116
+ else:
117
+ document_hash = _create_hash(str(uuid.uuid4()))
118
+ filename = fallback_filaname
119
+
120
+ page_hashes = [
121
+ PageReference(
122
+ hash=_create_hash(document_hash + ":" + str(p.page_no - 1)),
123
+ page=p.page_no,
124
+ model="default",
125
+ )
126
+ for p in doc.pages.values()
127
+ ]
128
+
129
+ file_info = DsFileInfoObject(
130
+ filename=filename,
131
+ document_hash=document_hash,
132
+ num_pages=len(doc.pages),
133
+ page_hashes=page_hashes,
134
+ )
135
+
136
+ main_text: list[Union[Ref, BaseText]] = []
137
+ tables: list[DsSchemaTable] = []
138
+ figures: list[Figure] = []
139
+ equations: list[BaseCell] = []
140
+ footnotes: list[BaseText] = []
141
+ page_headers: list[BaseText] = []
142
+ page_footers: list[BaseText] = []
143
+
144
+ # TODO: populate page_headers page_footers from doc.furniture
145
+
146
+ embedded_captions = set()
147
+ for ix, (item, level) in enumerate(doc.iterate_items(doc.body)):
148
+
149
+ if isinstance(item, (TableItem, PictureItem)) and len(item.captions) > 0:
150
+ caption = item.caption_text(doc)
151
+ if caption:
152
+ embedded_captions.add(caption)
153
+
154
+ for item, level in doc.iterate_items():
155
+ if isinstance(item, DocItem):
156
+ item_type = item.label
157
+
158
+ if isinstance(item, (TextItem, ListItem, SectionHeaderItem)):
159
+
160
+ if isinstance(item, ListItem) and item.marker:
161
+ text = f"{item.marker} {item.text}"
162
+ else:
163
+ text = item.text
164
+
165
+ # Can be empty.
166
+ prov = [
167
+ Prov(
168
+ bbox=p.bbox.as_tuple(),
169
+ page=p.page_no,
170
+ span=[0, len(item.text)],
171
+ )
172
+ for p in item.prov
173
+ ]
174
+ main_text.append(
175
+ BaseText(
176
+ text=text,
177
+ obj_type=doc_item_label_to_legacy_type(item.label),
178
+ name=doc_item_label_to_legacy_name(item.label),
179
+ prov=prov,
180
+ )
181
+ )
182
+
183
+ # skip captions of they are embedded in the actual
184
+ # floating object
185
+ if item_type == DocItemLabel.CAPTION and text in embedded_captions:
186
+ continue
187
+
188
+ elif isinstance(item, TableItem) and item.data:
189
+ index = len(tables)
190
+ ref_str = f"#/tables/{index}"
191
+ main_text.append(
192
+ Ref(
193
+ name=doc_item_label_to_legacy_name(item.label),
194
+ obj_type=doc_item_label_to_legacy_type(item.label),
195
+ ref=ref_str,
196
+ ),
197
+ )
198
+
199
+ # Initialise empty table data grid (only empty cells)
200
+ table_data = [
201
+ [
202
+ DsTableCell(
203
+ text="",
204
+ # bbox=[0,0,0,0],
205
+ spans=[[i, j]],
206
+ obj_type="body",
207
+ )
208
+ for j in range(item.data.num_cols)
209
+ ]
210
+ for i in range(item.data.num_rows)
211
+ ]
212
+
213
+ # Overwrite cells in table data for which there is actual cell content.
214
+ for cell in item.data.table_cells:
215
+ for i in range(
216
+ min(cell.start_row_offset_idx, item.data.num_rows),
217
+ min(cell.end_row_offset_idx, item.data.num_rows),
218
+ ):
219
+ for j in range(
220
+ min(cell.start_col_offset_idx, item.data.num_cols),
221
+ min(cell.end_col_offset_idx, item.data.num_cols),
222
+ ):
223
+ celltype = "body"
224
+ if cell.column_header:
225
+ celltype = "col_header"
226
+ elif cell.row_header:
227
+ celltype = "row_header"
228
+ elif cell.row_section:
229
+ celltype = "row_section"
230
+
231
+ def _make_spans(cell: TableCell, table_item: TableItem):
232
+ for rspan in range(
233
+ min(
234
+ cell.start_row_offset_idx,
235
+ table_item.data.num_rows,
236
+ ),
237
+ min(
238
+ cell.end_row_offset_idx,
239
+ table_item.data.num_rows,
240
+ ),
241
+ ):
242
+ for cspan in range(
243
+ min(
244
+ cell.start_col_offset_idx,
245
+ table_item.data.num_cols,
246
+ ),
247
+ min(
248
+ cell.end_col_offset_idx,
249
+ table_item.data.num_cols,
250
+ ),
251
+ ):
252
+ yield [rspan, cspan]
253
+
254
+ spans = list(_make_spans(cell, item))
255
+ table_data[i][j] = GlmTableCell(
256
+ text=cell.text,
257
+ bbox=(
258
+ cell.bbox.as_tuple()
259
+ if cell.bbox is not None
260
+ else None
261
+ ), # check if this is bottom-left
262
+ spans=spans,
263
+ obj_type=celltype,
264
+ col=j,
265
+ row=i,
266
+ row_header=cell.row_header,
267
+ row_section=cell.row_section,
268
+ col_header=cell.column_header,
269
+ row_span=[
270
+ cell.start_row_offset_idx,
271
+ cell.end_row_offset_idx,
272
+ ],
273
+ col_span=[
274
+ cell.start_col_offset_idx,
275
+ cell.end_col_offset_idx,
276
+ ],
277
+ )
278
+
279
+ # Compute the caption
280
+ caption = item.caption_text(doc)
281
+
282
+ tables.append(
283
+ DsSchemaTable(
284
+ text=caption,
285
+ num_cols=item.data.num_cols,
286
+ num_rows=item.data.num_rows,
287
+ obj_type=doc_item_label_to_legacy_type(item.label),
288
+ data=table_data,
289
+ prov=[
290
+ Prov(
291
+ bbox=p.bbox.as_tuple(),
292
+ page=p.page_no,
293
+ span=[0, 0],
294
+ )
295
+ for p in item.prov
296
+ ],
297
+ )
298
+ )
299
+
300
+ elif isinstance(item, PictureItem):
301
+ index = len(figures)
302
+ ref_str = f"#/figures/{index}"
303
+ main_text.append(
304
+ Ref(
305
+ name=doc_item_label_to_legacy_name(item.label),
306
+ obj_type=doc_item_label_to_legacy_type(item.label),
307
+ ref=ref_str,
308
+ ),
309
+ )
310
+
311
+ # Compute the caption
312
+ caption = item.caption_text(doc)
313
+
314
+ figures.append(
315
+ Figure(
316
+ prov=[
317
+ Prov(
318
+ bbox=p.bbox.as_tuple(),
319
+ page=p.page_no,
320
+ span=[0, len(caption)],
321
+ )
322
+ for p in item.prov
323
+ ],
324
+ obj_type=doc_item_label_to_legacy_type(item.label),
325
+ text=caption,
326
+ # data=[[]],
327
+ )
328
+ )
329
+
330
+ page_dimensions = [
331
+ PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
332
+ for p in doc.pages.values()
333
+ ]
334
+
335
+ legacy_doc: DsDocument = DsDocument(
336
+ name=title,
337
+ description=desc,
338
+ file_info=file_info,
339
+ main_text=main_text,
340
+ equations=equations,
341
+ footnotes=footnotes,
342
+ page_headers=page_headers,
343
+ page_footers=page_footers,
344
+ tables=tables,
345
+ figures=figures,
346
+ page_dimensions=page_dimensions,
347
+ )
348
+
349
+ return legacy_doc
350
+
351
+
352
+ def legacy_to_docling_document(legacy_doc: DsDocument) -> DoclingDocument: # noqa: C901
353
+ """Convert a legacy document to DoclingDocument.
354
+
355
+ It is known that the following content will not be preserved in the transformation:
356
+ - name of labels (upper vs lower case)
357
+ - caption of figures are not in main-text anymore
358
+ - s3_data removed
359
+ - model metadata removed
360
+ - logs removed
361
+ - document hash cannot be preserved
362
+ """
363
+
364
+ def _transform_prov(item: BaseCell) -> Optional[ProvenanceItem]:
365
+ """Create a new provenance from a legacy item."""
366
+ prov: Optional[ProvenanceItem] = None
367
+ if item.prov is not None and len(item.prov) > 0:
368
+ prov = ProvenanceItem(
369
+ page_no=int(item.prov[0].page),
370
+ charspan=tuple(item.prov[0].span),
371
+ bbox=BoundingBox.from_tuple(
372
+ tuple(item.prov[0].bbox), origin=CoordOrigin.BOTTOMLEFT
373
+ ),
374
+ )
375
+ return prov
376
+
377
+ origin = DocumentOrigin(
378
+ mimetype="application/pdf",
379
+ filename=legacy_doc.file_info.filename,
380
+ binary_hash=legacy_doc.file_info.document_hash,
381
+ )
382
+ doc_name = Path(origin.filename).stem
383
+
384
+ doc: DoclingDocument = DoclingDocument(name=doc_name, origin=origin)
385
+
386
+ # define pages
387
+ if legacy_doc.page_dimensions is not None:
388
+ for page_dim in legacy_doc.page_dimensions:
389
+ page_no = int(page_dim.page)
390
+ size = Size(width=page_dim.width, height=page_dim.height)
391
+
392
+ doc.add_page(page_no=page_no, size=size)
393
+
394
+ # page headers
395
+ if legacy_doc.page_headers is not None:
396
+ for text_item in legacy_doc.page_headers:
397
+ if text_item.text is None:
398
+ continue
399
+ prov = _transform_prov(text_item)
400
+ doc.add_text(
401
+ label=DocItemLabel.PAGE_HEADER,
402
+ text=text_item.text,
403
+ parent=doc.furniture,
404
+ )
405
+
406
+ # page footers
407
+ if legacy_doc.page_footers is not None:
408
+ for text_item in legacy_doc.page_footers:
409
+ if text_item.text is None:
410
+ continue
411
+ prov = _transform_prov(text_item)
412
+ doc.add_text(
413
+ label=DocItemLabel.PAGE_FOOTER,
414
+ text=text_item.text,
415
+ parent=doc.furniture,
416
+ )
417
+
418
+ # footnotes
419
+ if legacy_doc.footnotes is not None:
420
+ for text_item in legacy_doc.footnotes:
421
+ if text_item.text is None:
422
+ continue
423
+ prov = _transform_prov(text_item)
424
+ doc.add_text(
425
+ label=DocItemLabel.FOOTNOTE, text=text_item.text, parent=doc.furniture
426
+ )
427
+
428
+ # main-text content
429
+ if legacy_doc.main_text is not None:
430
+ item: Optional[Union[BaseCell, BaseText]]
431
+
432
+ # collect all captions embedded in table and figure objects
433
+ # to avoid repeating them
434
+ embedded_captions: Dict[str, int] = {}
435
+ for ix, orig_item in enumerate(legacy_doc.main_text):
436
+ item = (
437
+ legacy_doc._resolve_ref(orig_item)
438
+ if isinstance(orig_item, Ref)
439
+ else orig_item
440
+ )
441
+ if item is None:
442
+ continue
443
+
444
+ if isinstance(item, (DsSchemaTable, Figure)) and item.text:
445
+ embedded_captions[item.text] = ix
446
+
447
+ # build lookup from floating objects to their caption item
448
+ floating_to_caption: Dict[int, BaseText] = {}
449
+ for ix, orig_item in enumerate(legacy_doc.main_text):
450
+ item = (
451
+ legacy_doc._resolve_ref(orig_item)
452
+ if isinstance(orig_item, Ref)
453
+ else orig_item
454
+ )
455
+ if item is None:
456
+ continue
457
+
458
+ item_type = item.obj_type.lower()
459
+ if (
460
+ isinstance(item, BaseText)
461
+ and (
462
+ item_type == "caption"
463
+ or (item.name is not None and item.name.lower() == "caption")
464
+ )
465
+ and item.text in embedded_captions
466
+ ):
467
+ floating_ix = embedded_captions[item.text]
468
+ floating_to_caption[floating_ix] = item
469
+
470
+ # main loop iteration
471
+ current_list: Optional[GroupItem] = None
472
+ for ix, orig_item in enumerate(legacy_doc.main_text):
473
+ item = (
474
+ legacy_doc._resolve_ref(orig_item)
475
+ if isinstance(orig_item, Ref)
476
+ else orig_item
477
+ )
478
+ if item is None:
479
+ continue
480
+
481
+ prov = _transform_prov(item)
482
+ item_type = item.obj_type.lower()
483
+
484
+ # if a group is needed, add it
485
+ if isinstance(item, BaseText) and (
486
+ item_type in "list-item-level-1" or item.name in {"list", "list-item"}
487
+ ):
488
+ if current_list is None:
489
+ current_list = doc.add_group(label=GroupLabel.LIST, name="list")
490
+ else:
491
+ current_list = None
492
+
493
+ # add the document item in the document
494
+ if isinstance(item, BaseText):
495
+ text = item.text if item.text is not None else ""
496
+ label_name = item.name if item.name is not None else "text"
497
+
498
+ if item_type == "caption":
499
+ if text in embedded_captions:
500
+ # skip captions if they are embedded in the actual
501
+ # floating objects
502
+ continue
503
+ else:
504
+ # captions without a related object are inserted as text
505
+ doc.add_text(label=DocItemLabel.TEXT, text=text, prov=prov)
506
+
507
+ # first title match
508
+ if item_type == "title":
509
+ doc.add_title(text=text, prov=prov)
510
+
511
+ # secondary titles
512
+ elif item_type in {
513
+ "subtitle-level-1",
514
+ }:
515
+ doc.add_heading(text=text, prov=prov)
516
+
517
+ # list item
518
+ elif item_type in "list-item-level-1" or label_name in {
519
+ "list",
520
+ "list-item",
521
+ }:
522
+ # TODO: Infer if this is a numbered or a bullet list item
523
+ doc.add_list_item(
524
+ text=text, enumerated=False, prov=prov, parent=current_list
525
+ )
526
+
527
+ # normal text
528
+ else:
529
+ label = DocItemLabel.TEXT
530
+ normalized_label_name = label_name.replace("-", "_")
531
+ if normalized_label_name is not None:
532
+ try:
533
+ label = DocItemLabel(normalized_label_name)
534
+ except ValueError:
535
+ pass
536
+ doc.add_text(label=label, text=text, prov=prov)
537
+
538
+ elif isinstance(item, DsSchemaTable):
539
+
540
+ table_data = TableData(num_cols=item.num_cols, num_rows=item.num_rows)
541
+ if item.data is not None:
542
+ seen_spans = set()
543
+ for row_ix, row in enumerate(item.data):
544
+ for col_ix, orig_cell_data in enumerate(row):
545
+
546
+ cell_bbox: Optional[BoundingBox] = (
547
+ BoundingBox.from_tuple(
548
+ tuple(orig_cell_data.bbox),
549
+ origin=CoordOrigin.BOTTOMLEFT,
550
+ )
551
+ if orig_cell_data.bbox is not None
552
+ else None
553
+ )
554
+ cell = TableCell(
555
+ start_row_offset_idx=row_ix,
556
+ end_row_offset_idx=row_ix + 1,
557
+ start_col_offset_idx=col_ix,
558
+ end_col_offset_idx=col_ix + 1,
559
+ text=orig_cell_data.text,
560
+ bbox=cell_bbox,
561
+ column_header=(orig_cell_data.obj_type == "col_header"),
562
+ row_header=(orig_cell_data.obj_type == "row_header"),
563
+ row_section=(orig_cell_data.obj_type == "row_section"),
564
+ )
565
+
566
+ if orig_cell_data.spans is not None:
567
+ # convert to a tuple of tuples for hashing
568
+ spans_tuple = tuple(
569
+ tuple(span) for span in orig_cell_data.spans
570
+ )
571
+
572
+ # skip repeated spans
573
+ if spans_tuple in seen_spans:
574
+ continue
575
+
576
+ seen_spans.add(spans_tuple)
577
+
578
+ cell.start_row_offset_idx = min(
579
+ s[0] for s in spans_tuple
580
+ )
581
+ cell.end_row_offset_idx = (
582
+ max(s[0] for s in spans_tuple) + 1
583
+ )
584
+ cell.start_col_offset_idx = min(
585
+ s[1] for s in spans_tuple
586
+ )
587
+ cell.end_col_offset_idx = (
588
+ max(s[1] for s in spans_tuple) + 1
589
+ )
590
+
591
+ cell.row_span = (
592
+ cell.end_row_offset_idx - cell.start_row_offset_idx
593
+ )
594
+ cell.col_span = (
595
+ cell.end_col_offset_idx - cell.start_col_offset_idx
596
+ )
597
+
598
+ table_data.table_cells.append(cell)
599
+
600
+ new_item = doc.add_table(data=table_data, prov=prov)
601
+ if (caption_item := floating_to_caption.get(ix)) is not None:
602
+ if caption_item.text is not None:
603
+ caption_prov = _transform_prov(caption_item)
604
+ caption = doc.add_text(
605
+ label=DocItemLabel.CAPTION,
606
+ text=caption_item.text,
607
+ prov=caption_prov,
608
+ parent=new_item,
609
+ )
610
+ new_item.captions.append(caption.get_ref())
611
+
612
+ elif isinstance(item, Figure):
613
+ new_item = doc.add_picture(prov=prov)
614
+ if (caption_item := floating_to_caption.get(ix)) is not None:
615
+ if caption_item.text is not None:
616
+ caption_prov = _transform_prov(caption_item)
617
+ caption = doc.add_text(
618
+ label=DocItemLabel.CAPTION,
619
+ text=caption_item.text,
620
+ prov=caption_prov,
621
+ parent=new_item,
622
+ )
623
+ new_item.captions.append(caption.get_ref())
624
+
625
+ # equations
626
+ elif (
627
+ isinstance(item, BaseCell)
628
+ and item.text is not None
629
+ and item_type in {"formula", "equation"}
630
+ ):
631
+ doc.add_text(label=DocItemLabel.FORMULA, text=item.text, prov=prov)
632
+
633
+ return doc
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.8.0
3
+ Version: 2.10.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -35,6 +35,7 @@ Requires-Dist: pyyaml (>=5.1,<7.0.0)
35
35
  Requires-Dist: semchunk (>=2.2.0,<3.0.0) ; extra == "chunking"
36
36
  Requires-Dist: tabulate (>=0.9.0,<0.10.0)
37
37
  Requires-Dist: transformers (>=4.34.0,<5.0.0) ; extra == "chunking"
38
+ Requires-Dist: typer (>=0.12.5,<0.13.0)
38
39
  Requires-Dist: typing-extensions (>=4.12.2,<5.0.0)
39
40
  Project-URL: Repository, https://github.com/DS4SD/docling-core
40
41
  Description-Content-Type: text/markdown
@@ -1,4 +1,6 @@
1
1
  docling_core/__init__.py,sha256=D0afxif-BMUrgx2cYk1cwxiwATRYaGXsIMk_z4nw1Vs,90
2
+ docling_core/cli/__init__.py,sha256=C63yWifzpA0IV7YWDatpAdrhoV8zjqxAKv0xMf09VdM,19
3
+ docling_core/cli/view.py,sha256=bhxvPQWIJVo2g_pRL0GjQwjDw-jdiRXp1-BTbG849go,1746
2
4
  docling_core/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
5
  docling_core/resources/schemas/doc/ANN.json,sha256=04U5j-PU9m5w7IagJ_rHcAx7qUtLkUuaWZO9GuYHnTA,4202
4
6
  docling_core/resources/schemas/doc/DOC.json,sha256=9tVKpCqDGGq3074Nn5qlUCdTN-5k1Q0ri_scJblwnLE,6686
@@ -17,12 +19,12 @@ docling_core/transforms/__init__.py,sha256=P81y_oqkiTN4Ld5crh1gQ6BbHqqR6C6nBt9AC
17
19
  docling_core/transforms/chunker/__init__.py,sha256=YdizSKXLmmK9eyYBsarHWr8Mx_AoA0PT7c0absibZMk,306
18
20
  docling_core/transforms/chunker/base.py,sha256=PZl6QN41cZseTPkTwPzysDHYYFb6DwDSKw0QVSiFfG0,2541
19
21
  docling_core/transforms/chunker/hierarchical_chunker.py,sha256=cy3sE9w_7l-uoIEUcfnZlQweDHUoyAJTQ6IkzxxVjFY,8052
20
- docling_core/transforms/chunker/hybrid_chunker.py,sha256=LUzlqtTbXfhY40bhBVGtjEMZXFWRz1XH53OGqBh2Z3Y,11224
22
+ docling_core/transforms/chunker/hybrid_chunker.py,sha256=9bGhjr4vzpXbOMLCydCl81r1HbzMuMlo9ABfXyLRtd4,11375
21
23
  docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
22
24
  docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
23
25
  docling_core/types/doc/__init__.py,sha256=bEL4zKVOG7Wxm6xQrgF58mu-Teds9aSavuEAKVNhrTU,639
24
26
  docling_core/types/doc/base.py,sha256=_ttU8QI8wXDTQRUnN5n7L6D9wYFVLSAibxlFoMbgAsk,4557
25
- docling_core/types/doc/document.py,sha256=FoEm1GFV2JeXdxtj-ZINe7S_b_rZZjSKOSa72J16ork,90522
27
+ docling_core/types/doc/document.py,sha256=9t6FPvrxT9gKtUaYMP_Kyhz_izo2p6TQX_LlG2Fj5hY,91593
26
28
  docling_core/types/doc/labels.py,sha256=A8vWP82VAeXO1rlCO0oDKo_Hb8uDeQe0myOTY3P03hk,1596
27
29
  docling_core/types/doc/tokens.py,sha256=uU_MYW_p7ypf7eYICFBvxdnVaPZ7CQnvZmbJ6oPrtEA,6134
28
30
  docling_core/types/doc/utils.py,sha256=YDOh_ZD1Y7OmCEDdCLJ_MO5K3HA67nc_acfhOK6WztU,1439
@@ -30,7 +32,7 @@ docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6
30
32
  docling_core/types/gen/generic.py,sha256=l4CZ4_Lb8ONG36WNJWbKX5hGKvTh_yU-hXp5hsm7uVU,844
31
33
  docling_core/types/io/__init__.py,sha256=7QYvFRaDE0AzBg8e7tvsVNlLBbCbAbQ9rP2TU8aXR1k,350
32
34
  docling_core/types/legacy_doc/__init__.py,sha256=Pzj_8rft6SJTVTCHgXRwHtuZjL6LK_6dcBWjikL9biY,125
33
- docling_core/types/legacy_doc/base.py,sha256=l8NKCuORUQ1ebjdGWpj6b30oQEvtErLsIHKQHbbJiPg,14683
35
+ docling_core/types/legacy_doc/base.py,sha256=aBKBunw6M6nvEq4lqP1cfFWK3GpGa6PXwNQqbvcJ3dU,14718
34
36
  docling_core/types/legacy_doc/doc_ann.py,sha256=CIQHW8yzu70bsMR9gtu7dqe4oz603Tq2eDDt9sh-tYo,1203
35
37
  docling_core/types/legacy_doc/doc_ocr.py,sha256=FfFqHAyMSbFt5cKeE7QLcxS0qUweBilBJoN9CH2TsQs,1394
36
38
  docling_core/types/legacy_doc/doc_raw.py,sha256=LrvQ9DhNjBRy98p_F9PUyHZeTGAxMKWqJzY4WJ7v-xs,3895
@@ -51,10 +53,11 @@ docling_core/utils/alias.py,sha256=B6Lqvss8CbaNARHLR4qSmNh9OkB6LvqTpxfsFmkLAFo,8
51
53
  docling_core/utils/file.py,sha256=GzX0pclvewwPoqHJSaVUuULzSJwJgkCUwgKgJ7G5ohQ,5628
52
54
  docling_core/utils/generate_docs.py,sha256=BdKAoduWXOc7YMvcmlhjoJOFlUxij1ybxglj6LZDtC8,2290
53
55
  docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2tyi_OhHepHYtZg,1654
56
+ docling_core/utils/legacy.py,sha256=xfp7U0JqjI60K3loWiNTk8w08_KfCUzTb2MNULBOIz4,24396
54
57
  docling_core/utils/validate.py,sha256=3FmnxnKTDZC5J9OGxCL3U3DGRl0t0bBV1NcySXswdas,2031
55
58
  docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
56
- docling_core-2.8.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
57
- docling_core-2.8.0.dist-info/METADATA,sha256=HNRaSRjkC-DkeOvguUK82YRbCUDYir4cuSG6-qqKT1U,5703
58
- docling_core-2.8.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
59
- docling_core-2.8.0.dist-info/entry_points.txt,sha256=jIxlWv3tnO04irlZc0zfhqJIgz1bg9Hha4AkaLWSdUA,177
60
- docling_core-2.8.0.dist-info/RECORD,,
59
+ docling_core-2.10.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
60
+ docling_core-2.10.0.dist-info/METADATA,sha256=2Xr2MRaXihKpNdNhAwfZT973ffbX7GGs19ylGCBwfe4,5744
61
+ docling_core-2.10.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
62
+ docling_core-2.10.0.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
63
+ docling_core-2.10.0.dist-info/RECORD,,
@@ -1,4 +1,5 @@
1
1
  [console_scripts]
2
+ docling-view=docling_core.cli.view:app
2
3
  generate_docs=docling_core.utils.generate_docs:main
3
4
  generate_jsonschema=docling_core.utils.generate_jsonschema:main
4
5
  validate=docling_core.utils.validate:main