docling-core 2.9.0__py3-none-any.whl → 2.11.0__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling-core might be problematic. The original page links to more details.

@@ -0,0 +1 @@
1
+ """CLI package."""
@@ -0,0 +1,68 @@
1
+ #
2
+ # Copyright IBM Corp. 2024 - 2024
3
+ # SPDX-License-Identifier: MIT
4
+ #
5
+
6
+ """CLI for docling viewer."""
7
+ import importlib
8
+ import tempfile
9
+ import webbrowser
10
+ from pathlib import Path
11
+ from typing import Annotated, Optional
12
+
13
+ import typer
14
+
15
+ from docling_core.types.doc import DoclingDocument
16
+ from docling_core.types.doc.base import ImageRefMode
17
+ from docling_core.utils.file import resolve_source_to_path
18
+
19
+ app = typer.Typer(
20
+ name="Docling",
21
+ no_args_is_help=True,
22
+ add_completion=False,
23
+ pretty_exceptions_enable=False,
24
+ )
25
+
26
+
27
+ def version_callback(value: bool):
28
+ """Callback for version inspection."""
29
+ if value:
30
+ docling_core_version = importlib.metadata.version("docling-core")
31
+ print(f"Docling Core version: {docling_core_version}")
32
+ raise typer.Exit()
33
+
34
+
35
+ @app.command(no_args_is_help=True)
36
+ def view(
37
+ source: Annotated[
38
+ str,
39
+ typer.Argument(
40
+ ...,
41
+ metavar="source",
42
+ help="Docling JSON file to view.",
43
+ ),
44
+ ],
45
+ version: Annotated[
46
+ Optional[bool],
47
+ typer.Option(
48
+ "--version",
49
+ callback=version_callback,
50
+ is_eager=True,
51
+ help="Show version information.",
52
+ ),
53
+ ] = None,
54
+ ):
55
+ """Display a Docling JSON file on the default browser."""
56
+ path = resolve_source_to_path(source=source)
57
+ doc = DoclingDocument.load_from_json(filename=path)
58
+ target_path = Path(tempfile.mkdtemp()) / "out.html"
59
+ html_output = doc.export_to_html(image_mode=ImageRefMode.EMBEDDED)
60
+ with open(target_path, "w") as f:
61
+ f.write(html_output)
62
+ webbrowser.open(url=f"file://{target_path.absolute().resolve()}")
63
+
64
+
65
+ click_app = typer.main.get_command(app)
66
+
67
+ if __name__ == "__main__":
68
+ app()
@@ -44,7 +44,9 @@ class HybridChunker(BaseChunker):
44
44
 
45
45
  model_config = ConfigDict(arbitrary_types_allowed=True)
46
46
 
47
- tokenizer: Union[PreTrainedTokenizerBase, str]
47
+ tokenizer: Union[PreTrainedTokenizerBase, str] = (
48
+ "sentence-transformers/all-MiniLM-L6-v2"
49
+ )
48
50
  max_tokens: int = None # type: ignore[assignment]
49
51
  merge_peers: bool = True
50
52
 
@@ -96,6 +98,7 @@ class HybridChunker(BaseChunker):
96
98
  doc_items=doc_chunk.meta.doc_items[window_start : window_end + 1],
97
99
  headings=doc_chunk.meta.headings,
98
100
  captions=doc_chunk.meta.captions,
101
+ origin=doc_chunk.meta.origin,
99
102
  )
100
103
  new_chunk = DocChunk(text=window_text, meta=meta)
101
104
  return new_chunk
@@ -242,6 +245,7 @@ class HybridChunker(BaseChunker):
242
245
  doc_items=window_items,
243
246
  headings=current_headings_and_captions[0],
244
247
  captions=current_headings_and_captions[1],
248
+ origin=chunk.meta.origin,
245
249
  )
246
250
  new_chunk = DocChunk(
247
251
  text=window_text,
@@ -49,7 +49,6 @@ DEFAULT_EXPORT_LABELS = {
49
49
  DocItemLabel.DOCUMENT_INDEX,
50
50
  DocItemLabel.SECTION_HEADER,
51
51
  DocItemLabel.PARAGRAPH,
52
- DocItemLabel.CAPTION,
53
52
  DocItemLabel.TABLE,
54
53
  DocItemLabel.PICTURE,
55
54
  DocItemLabel.FORMULA,
@@ -58,6 +57,7 @@ DEFAULT_EXPORT_LABELS = {
58
57
  DocItemLabel.TEXT,
59
58
  DocItemLabel.LIST_ITEM,
60
59
  DocItemLabel.CODE,
60
+ DocItemLabel.REFERENCE,
61
61
  }
62
62
 
63
63
 
@@ -593,6 +593,21 @@ class DocItem(
593
593
  class TextItem(DocItem):
594
594
  """TextItem."""
595
595
 
596
+ label: typing.Literal[
597
+ DocItemLabel.CAPTION,
598
+ DocItemLabel.CHECKBOX_SELECTED,
599
+ DocItemLabel.CHECKBOX_UNSELECTED,
600
+ DocItemLabel.CODE,
601
+ DocItemLabel.FOOTNOTE,
602
+ DocItemLabel.FORMULA,
603
+ DocItemLabel.PAGE_FOOTER,
604
+ DocItemLabel.PAGE_HEADER,
605
+ DocItemLabel.PARAGRAPH,
606
+ DocItemLabel.REFERENCE,
607
+ DocItemLabel.TEXT,
608
+ DocItemLabel.TITLE,
609
+ ]
610
+
596
611
  orig: str # untreated representation
597
612
  text: str # sanitized representation
598
613
 
@@ -644,8 +659,10 @@ class TextItem(DocItem):
644
659
  class SectionHeaderItem(TextItem):
645
660
  """SectionItem."""
646
661
 
647
- label: typing.Literal[DocItemLabel.SECTION_HEADER] = DocItemLabel.SECTION_HEADER
648
- level: LevelNumber
662
+ label: typing.Literal[DocItemLabel.SECTION_HEADER] = (
663
+ DocItemLabel.SECTION_HEADER # type: ignore[assignment]
664
+ )
665
+ level: LevelNumber = 1
649
666
 
650
667
  def export_to_document_tokens(
651
668
  self,
@@ -695,9 +712,11 @@ class SectionHeaderItem(TextItem):
695
712
  class ListItem(TextItem):
696
713
  """SectionItem."""
697
714
 
698
- label: typing.Literal[DocItemLabel.LIST_ITEM] = DocItemLabel.LIST_ITEM
715
+ label: typing.Literal[DocItemLabel.LIST_ITEM] = (
716
+ DocItemLabel.LIST_ITEM # type: ignore[assignment]
717
+ )
699
718
  enumerated: bool = False
700
- marker: str # The bullet or number symbol that prefixes this list item
719
+ marker: str = "-" # The bullet or number symbol that prefixes this list item
701
720
 
702
721
 
703
722
  class FloatingItem(DocItem):
@@ -923,7 +942,10 @@ class TableItem(FloatingItem):
923
942
  """TableItem."""
924
943
 
925
944
  data: TableData
926
- label: typing.Literal[DocItemLabel.TABLE] = DocItemLabel.TABLE
945
+ label: typing.Literal[
946
+ DocItemLabel.DOCUMENT_INDEX,
947
+ DocItemLabel.TABLE,
948
+ ] = DocItemLabel.TABLE
927
949
 
928
950
  def export_to_dataframe(self) -> pd.DataFrame:
929
951
  """Export the table as a Pandas DataFrame."""
@@ -1272,9 +1294,19 @@ class TableItem(FloatingItem):
1272
1294
  class KeyValueItem(DocItem):
1273
1295
  """KeyValueItem."""
1274
1296
 
1297
+ label: typing.Literal[DocItemLabel.KEY_VALUE_REGION] = DocItemLabel.KEY_VALUE_REGION
1298
+
1275
1299
 
1276
- ContentItem = Union[
1277
- TextItem, SectionHeaderItem, ListItem, PictureItem, TableItem, KeyValueItem
1300
+ ContentItem = Annotated[
1301
+ Union[
1302
+ TextItem,
1303
+ SectionHeaderItem,
1304
+ ListItem,
1305
+ PictureItem,
1306
+ TableItem,
1307
+ KeyValueItem,
1308
+ ],
1309
+ Field(discriminator="label"),
1278
1310
  ]
1279
1311
 
1280
1312
 
@@ -1376,13 +1408,13 @@ class DoclingDocument(BaseModel):
1376
1408
  self,
1377
1409
  label: Optional[GroupLabel] = None,
1378
1410
  name: Optional[str] = None,
1379
- parent: Optional[GroupItem] = None,
1411
+ parent: Optional[NodeItem] = None,
1380
1412
  ) -> GroupItem:
1381
1413
  """add_group.
1382
1414
 
1383
1415
  :param label: Optional[GroupLabel]: (Default value = None)
1384
1416
  :param name: Optional[str]: (Default value = None)
1385
- :param parent: Optional[GroupItem]: (Default value = None)
1417
+ :param parent: Optional[NodeItem]: (Default value = None)
1386
1418
 
1387
1419
  """
1388
1420
  if not parent:
@@ -1409,7 +1441,7 @@ class DoclingDocument(BaseModel):
1409
1441
  marker: Optional[str] = None,
1410
1442
  orig: Optional[str] = None,
1411
1443
  prov: Optional[ProvenanceItem] = None,
1412
- parent: Optional[GroupItem] = None,
1444
+ parent: Optional[NodeItem] = None,
1413
1445
  ):
1414
1446
  """add_list_item.
1415
1447
 
@@ -1417,7 +1449,7 @@ class DoclingDocument(BaseModel):
1417
1449
  :param text: str:
1418
1450
  :param orig: Optional[str]: (Default value = None)
1419
1451
  :param prov: Optional[ProvenanceItem]: (Default value = None)
1420
- :param parent: Optional[GroupItem]: (Default value = None)
1452
+ :param parent: Optional[NodeItem]: (Default value = None)
1421
1453
 
1422
1454
  """
1423
1455
  if not parent:
@@ -1452,7 +1484,7 @@ class DoclingDocument(BaseModel):
1452
1484
  text: str,
1453
1485
  orig: Optional[str] = None,
1454
1486
  prov: Optional[ProvenanceItem] = None,
1455
- parent: Optional[GroupItem] = None,
1487
+ parent: Optional[NodeItem] = None,
1456
1488
  ):
1457
1489
  """add_text.
1458
1490
 
@@ -1460,7 +1492,7 @@ class DoclingDocument(BaseModel):
1460
1492
  :param text: str:
1461
1493
  :param orig: Optional[str]: (Default value = None)
1462
1494
  :param prov: Optional[ProvenanceItem]: (Default value = None)
1463
- :param parent: Optional[GroupItem]: (Default value = None)
1495
+ :param parent: Optional[NodeItem]: (Default value = None)
1464
1496
 
1465
1497
  """
1466
1498
  # Catch a few cases that are in principle allowed
@@ -1504,15 +1536,16 @@ class DoclingDocument(BaseModel):
1504
1536
  data: TableData,
1505
1537
  caption: Optional[Union[TextItem, RefItem]] = None, # This is not cool yet.
1506
1538
  prov: Optional[ProvenanceItem] = None,
1507
- parent: Optional[GroupItem] = None,
1539
+ parent: Optional[NodeItem] = None,
1540
+ label: DocItemLabel = DocItemLabel.TABLE,
1508
1541
  ):
1509
1542
  """add_table.
1510
1543
 
1511
- :param data: BaseTableData:
1512
- :param caption: Optional[Union[TextItem:
1513
- :param RefItem]]: (Default value = None)
1514
- :param # This is not cool yet.prov: Optional[ProvenanceItem]
1515
- :param parent: Optional[GroupItem]: (Default value = None)
1544
+ :param data: TableData:
1545
+ :param caption: Optional[Union[TextItem, RefItem]]: (Default value = None)
1546
+ :param prov: Optional[ProvenanceItem]: (Default value = None)
1547
+ :param parent: Optional[NodeItem]: (Default value = None)
1548
+ :param label: DocItemLabel: (Default value = DocItemLabel.TABLE)
1516
1549
 
1517
1550
  """
1518
1551
  if not parent:
@@ -1522,7 +1555,7 @@ class DoclingDocument(BaseModel):
1522
1555
  cref = f"#/tables/{table_index}"
1523
1556
 
1524
1557
  tbl_item = TableItem(
1525
- label=DocItemLabel.TABLE, data=data, self_ref=cref, parent=parent.get_ref()
1558
+ label=label, data=data, self_ref=cref, parent=parent.get_ref()
1526
1559
  )
1527
1560
  if prov:
1528
1561
  tbl_item.prov.append(prov)
@@ -1540,7 +1573,7 @@ class DoclingDocument(BaseModel):
1540
1573
  image: Optional[ImageRef] = None,
1541
1574
  caption: Optional[Union[TextItem, RefItem]] = None,
1542
1575
  prov: Optional[ProvenanceItem] = None,
1543
- parent: Optional[GroupItem] = None,
1576
+ parent: Optional[NodeItem] = None,
1544
1577
  ):
1545
1578
  """add_picture.
1546
1579
 
@@ -1548,7 +1581,7 @@ class DoclingDocument(BaseModel):
1548
1581
  :param caption: Optional[Union[TextItem:
1549
1582
  :param RefItem]]: (Default value = None)
1550
1583
  :param prov: Optional[ProvenanceItem]: (Default value = None)
1551
- :param parent: Optional[GroupItem]: (Default value = None)
1584
+ :param parent: Optional[NodeItem]: (Default value = None)
1552
1585
  """
1553
1586
  if not parent:
1554
1587
  parent = self.body
@@ -1578,14 +1611,14 @@ class DoclingDocument(BaseModel):
1578
1611
  text: str,
1579
1612
  orig: Optional[str] = None,
1580
1613
  prov: Optional[ProvenanceItem] = None,
1581
- parent: Optional[GroupItem] = None,
1614
+ parent: Optional[NodeItem] = None,
1582
1615
  ):
1583
1616
  """add_title.
1584
1617
 
1585
1618
  :param text: str:
1586
1619
  :param orig: Optional[str]: (Default value = None)
1587
1620
  :param prov: Optional[ProvenanceItem]: (Default value = None)
1588
- :param parent: Optional[GroupItem]: (Default value = None)
1621
+ :param parent: Optional[NodeItem]: (Default value = None)
1589
1622
  """
1590
1623
  if not parent:
1591
1624
  parent = self.body
@@ -1616,7 +1649,7 @@ class DoclingDocument(BaseModel):
1616
1649
  orig: Optional[str] = None,
1617
1650
  level: LevelNumber = 1,
1618
1651
  prov: Optional[ProvenanceItem] = None,
1619
- parent: Optional[GroupItem] = None,
1652
+ parent: Optional[NodeItem] = None,
1620
1653
  ):
1621
1654
  """add_heading.
1622
1655
 
@@ -1625,7 +1658,7 @@ class DoclingDocument(BaseModel):
1625
1658
  :param orig: Optional[str]: (Default value = None)
1626
1659
  :param level: LevelNumber: (Default value = 1)
1627
1660
  :param prov: Optional[ProvenanceItem]: (Default value = None)
1628
- :param parent: Optional[GroupItem]: (Default value = None)
1661
+ :param parent: Optional[NodeItem]: (Default value = None)
1629
1662
  """
1630
1663
  if not parent:
1631
1664
  parent = self.body
@@ -2055,10 +2088,6 @@ class DoclingDocument(BaseModel):
2055
2088
  text = f"```\n{item.text}\n```\n"
2056
2089
  mdtexts.append(text)
2057
2090
 
2058
- elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
2059
- # captions are printed in picture and table ... skipping for now
2060
- continue
2061
-
2062
2091
  elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
2063
2092
  in_list = True
2064
2093
  # Calculate indent based on list_nesting_level
@@ -2350,10 +2379,6 @@ class DoclingDocument(BaseModel):
2350
2379
  text = f"<pre>{item.text}</pre>"
2351
2380
  html_texts.append(text)
2352
2381
 
2353
- elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
2354
- # captions are printed in picture and table ... skipping for now
2355
- continue
2356
-
2357
2382
  elif isinstance(item, ListItem):
2358
2383
 
2359
2384
  text = f"<li>{item.text}</li>"
@@ -2555,10 +2580,6 @@ class DoclingDocument(BaseModel):
2555
2580
  result += f"<unordered_list>{delim}"
2556
2581
  in_ordered_list.append(False)
2557
2582
 
2558
- elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
2559
- # captions are printed in picture and table ... skipping for now
2560
- continue
2561
-
2562
2583
  elif isinstance(item, SectionHeaderItem):
2563
2584
 
2564
2585
  result += item.export_to_document_tokens(
@@ -2664,10 +2685,6 @@ class DoclingDocument(BaseModel):
2664
2685
  indent * level + f"item-{i} at level {level}: {item.label}: {text}"
2665
2686
  )
2666
2687
 
2667
- elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
2668
- # captions are printed in picture and table ... skipping for now
2669
- continue
2670
-
2671
2688
  elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
2672
2689
  text = get_text(text=item.text, max_text_len=max_text_len)
2673
2690
 
@@ -46,6 +46,8 @@ class GroupLabel(str, Enum):
46
46
  SECTION = "section"
47
47
  SHEET = "sheet"
48
48
  SLIDE = "slide"
49
+ FORM_AREA = "form_area"
50
+ KEY_VALUE_AREA = "key_value_area"
49
51
 
50
52
  def __str__(self):
51
53
  """Get string value."""
@@ -7,19 +7,26 @@
7
7
 
8
8
  import hashlib
9
9
  import uuid
10
- from typing import Union
10
+ from pathlib import Path
11
+ from typing import Dict, Optional, Union
11
12
 
12
13
  from docling_core.types.doc import (
14
+ BoundingBox,
15
+ CoordOrigin,
13
16
  DocItem,
14
17
  DocItemLabel,
15
18
  DoclingDocument,
19
+ DocumentOrigin,
16
20
  PictureItem,
21
+ ProvenanceItem,
17
22
  SectionHeaderItem,
23
+ Size,
18
24
  TableCell,
19
25
  TableItem,
20
26
  TextItem,
21
27
  )
22
- from docling_core.types.doc.document import ListItem
28
+ from docling_core.types.doc.document import GroupItem, ListItem, TableData
29
+ from docling_core.types.doc.labels import GroupLabel
23
30
  from docling_core.types.legacy_doc.base import (
24
31
  BaseCell,
25
32
  BaseText,
@@ -342,5 +349,285 @@ def docling_document_to_legacy(doc: DoclingDocument, fallback_filaname: str = "f
342
349
  return legacy_doc
343
350
 
344
351
 
345
- # def legacy_to_docling_document(legacy_doc: DsDocument) -> DoclingDocument:
346
- # """Convert a legacy document to DoclingDocument."""
352
+ def legacy_to_docling_document(legacy_doc: DsDocument) -> DoclingDocument: # noqa: C901
353
+ """Convert a legacy document to DoclingDocument.
354
+
355
+ It is known that the following content will not be preserved in the transformation:
356
+ - name of labels (upper vs lower case)
357
+ - caption of figures are not in main-text anymore
358
+ - s3_data removed
359
+ - model metadata removed
360
+ - logs removed
361
+ - document hash cannot be preserved
362
+ """
363
+
364
+ def _transform_prov(item: BaseCell) -> Optional[ProvenanceItem]:
365
+ """Create a new provenance from a legacy item."""
366
+ prov: Optional[ProvenanceItem] = None
367
+ if item.prov is not None and len(item.prov) > 0:
368
+ prov = ProvenanceItem(
369
+ page_no=int(item.prov[0].page),
370
+ charspan=tuple(item.prov[0].span),
371
+ bbox=BoundingBox.from_tuple(
372
+ tuple(item.prov[0].bbox), origin=CoordOrigin.BOTTOMLEFT
373
+ ),
374
+ )
375
+ return prov
376
+
377
+ origin = DocumentOrigin(
378
+ mimetype="application/pdf",
379
+ filename=legacy_doc.file_info.filename,
380
+ binary_hash=legacy_doc.file_info.document_hash,
381
+ )
382
+ doc_name = Path(origin.filename).stem
383
+
384
+ doc: DoclingDocument = DoclingDocument(name=doc_name, origin=origin)
385
+
386
+ # define pages
387
+ if legacy_doc.page_dimensions is not None:
388
+ for page_dim in legacy_doc.page_dimensions:
389
+ page_no = int(page_dim.page)
390
+ size = Size(width=page_dim.width, height=page_dim.height)
391
+
392
+ doc.add_page(page_no=page_no, size=size)
393
+
394
+ # page headers
395
+ if legacy_doc.page_headers is not None:
396
+ for text_item in legacy_doc.page_headers:
397
+ if text_item.text is None:
398
+ continue
399
+ prov = _transform_prov(text_item)
400
+ doc.add_text(
401
+ label=DocItemLabel.PAGE_HEADER,
402
+ text=text_item.text,
403
+ parent=doc.furniture,
404
+ )
405
+
406
+ # page footers
407
+ if legacy_doc.page_footers is not None:
408
+ for text_item in legacy_doc.page_footers:
409
+ if text_item.text is None:
410
+ continue
411
+ prov = _transform_prov(text_item)
412
+ doc.add_text(
413
+ label=DocItemLabel.PAGE_FOOTER,
414
+ text=text_item.text,
415
+ parent=doc.furniture,
416
+ )
417
+
418
+ # footnotes
419
+ if legacy_doc.footnotes is not None:
420
+ for text_item in legacy_doc.footnotes:
421
+ if text_item.text is None:
422
+ continue
423
+ prov = _transform_prov(text_item)
424
+ doc.add_text(
425
+ label=DocItemLabel.FOOTNOTE, text=text_item.text, parent=doc.furniture
426
+ )
427
+
428
+ # main-text content
429
+ if legacy_doc.main_text is not None:
430
+ item: Optional[Union[BaseCell, BaseText]]
431
+
432
+ # collect all captions embedded in table and figure objects
433
+ # to avoid repeating them
434
+ embedded_captions: Dict[str, int] = {}
435
+ for ix, orig_item in enumerate(legacy_doc.main_text):
436
+ item = (
437
+ legacy_doc._resolve_ref(orig_item)
438
+ if isinstance(orig_item, Ref)
439
+ else orig_item
440
+ )
441
+ if item is None:
442
+ continue
443
+
444
+ if isinstance(item, (DsSchemaTable, Figure)) and item.text:
445
+ embedded_captions[item.text] = ix
446
+
447
+ # build lookup from floating objects to their caption item
448
+ floating_to_caption: Dict[int, BaseText] = {}
449
+ for ix, orig_item in enumerate(legacy_doc.main_text):
450
+ item = (
451
+ legacy_doc._resolve_ref(orig_item)
452
+ if isinstance(orig_item, Ref)
453
+ else orig_item
454
+ )
455
+ if item is None:
456
+ continue
457
+
458
+ item_type = item.obj_type.lower()
459
+ if (
460
+ isinstance(item, BaseText)
461
+ and (
462
+ item_type == "caption"
463
+ or (item.name is not None and item.name.lower() == "caption")
464
+ )
465
+ and item.text in embedded_captions
466
+ ):
467
+ floating_ix = embedded_captions[item.text]
468
+ floating_to_caption[floating_ix] = item
469
+
470
+ # main loop iteration
471
+ current_list: Optional[GroupItem] = None
472
+ for ix, orig_item in enumerate(legacy_doc.main_text):
473
+ item = (
474
+ legacy_doc._resolve_ref(orig_item)
475
+ if isinstance(orig_item, Ref)
476
+ else orig_item
477
+ )
478
+ if item is None:
479
+ continue
480
+
481
+ prov = _transform_prov(item)
482
+ item_type = item.obj_type.lower()
483
+
484
+ # if a group is needed, add it
485
+ if isinstance(item, BaseText) and (
486
+ item_type in "list-item-level-1" or item.name in {"list", "list-item"}
487
+ ):
488
+ if current_list is None:
489
+ current_list = doc.add_group(label=GroupLabel.LIST, name="list")
490
+ else:
491
+ current_list = None
492
+
493
+ # add the document item in the document
494
+ if isinstance(item, BaseText):
495
+ text = item.text if item.text is not None else ""
496
+ label_name = item.name if item.name is not None else "text"
497
+
498
+ if item_type == "caption":
499
+ if text in embedded_captions:
500
+ # skip captions if they are embedded in the actual
501
+ # floating objects
502
+ continue
503
+ else:
504
+ # captions without a related object are inserted as text
505
+ doc.add_text(label=DocItemLabel.TEXT, text=text, prov=prov)
506
+
507
+ # first title match
508
+ if item_type == "title":
509
+ doc.add_title(text=text, prov=prov)
510
+
511
+ # secondary titles
512
+ elif item_type in {
513
+ "subtitle-level-1",
514
+ }:
515
+ doc.add_heading(text=text, prov=prov)
516
+
517
+ # list item
518
+ elif item_type in "list-item-level-1" or label_name in {
519
+ "list",
520
+ "list-item",
521
+ }:
522
+ # TODO: Infer if this is a numbered or a bullet list item
523
+ doc.add_list_item(
524
+ text=text, enumerated=False, prov=prov, parent=current_list
525
+ )
526
+
527
+ # normal text
528
+ else:
529
+ label = DocItemLabel.TEXT
530
+ normalized_label_name = label_name.replace("-", "_")
531
+ if normalized_label_name is not None:
532
+ try:
533
+ label = DocItemLabel(normalized_label_name)
534
+ except ValueError:
535
+ pass
536
+ doc.add_text(label=label, text=text, prov=prov)
537
+
538
+ elif isinstance(item, DsSchemaTable):
539
+
540
+ table_data = TableData(num_cols=item.num_cols, num_rows=item.num_rows)
541
+ if item.data is not None:
542
+ seen_spans = set()
543
+ for row_ix, row in enumerate(item.data):
544
+ for col_ix, orig_cell_data in enumerate(row):
545
+
546
+ cell_bbox: Optional[BoundingBox] = (
547
+ BoundingBox.from_tuple(
548
+ tuple(orig_cell_data.bbox),
549
+ origin=CoordOrigin.BOTTOMLEFT,
550
+ )
551
+ if orig_cell_data.bbox is not None
552
+ else None
553
+ )
554
+ cell = TableCell(
555
+ start_row_offset_idx=row_ix,
556
+ end_row_offset_idx=row_ix + 1,
557
+ start_col_offset_idx=col_ix,
558
+ end_col_offset_idx=col_ix + 1,
559
+ text=orig_cell_data.text,
560
+ bbox=cell_bbox,
561
+ column_header=(orig_cell_data.obj_type == "col_header"),
562
+ row_header=(orig_cell_data.obj_type == "row_header"),
563
+ row_section=(orig_cell_data.obj_type == "row_section"),
564
+ )
565
+
566
+ if orig_cell_data.spans is not None:
567
+ # convert to a tuple of tuples for hashing
568
+ spans_tuple = tuple(
569
+ tuple(span) for span in orig_cell_data.spans
570
+ )
571
+
572
+ # skip repeated spans
573
+ if spans_tuple in seen_spans:
574
+ continue
575
+
576
+ seen_spans.add(spans_tuple)
577
+
578
+ cell.start_row_offset_idx = min(
579
+ s[0] for s in spans_tuple
580
+ )
581
+ cell.end_row_offset_idx = (
582
+ max(s[0] for s in spans_tuple) + 1
583
+ )
584
+ cell.start_col_offset_idx = min(
585
+ s[1] for s in spans_tuple
586
+ )
587
+ cell.end_col_offset_idx = (
588
+ max(s[1] for s in spans_tuple) + 1
589
+ )
590
+
591
+ cell.row_span = (
592
+ cell.end_row_offset_idx - cell.start_row_offset_idx
593
+ )
594
+ cell.col_span = (
595
+ cell.end_col_offset_idx - cell.start_col_offset_idx
596
+ )
597
+
598
+ table_data.table_cells.append(cell)
599
+
600
+ new_item = doc.add_table(data=table_data, prov=prov)
601
+ if (caption_item := floating_to_caption.get(ix)) is not None:
602
+ if caption_item.text is not None:
603
+ caption_prov = _transform_prov(caption_item)
604
+ caption = doc.add_text(
605
+ label=DocItemLabel.CAPTION,
606
+ text=caption_item.text,
607
+ prov=caption_prov,
608
+ parent=new_item,
609
+ )
610
+ new_item.captions.append(caption.get_ref())
611
+
612
+ elif isinstance(item, Figure):
613
+ new_item = doc.add_picture(prov=prov)
614
+ if (caption_item := floating_to_caption.get(ix)) is not None:
615
+ if caption_item.text is not None:
616
+ caption_prov = _transform_prov(caption_item)
617
+ caption = doc.add_text(
618
+ label=DocItemLabel.CAPTION,
619
+ text=caption_item.text,
620
+ prov=caption_prov,
621
+ parent=new_item,
622
+ )
623
+ new_item.captions.append(caption.get_ref())
624
+
625
+ # equations
626
+ elif (
627
+ isinstance(item, BaseCell)
628
+ and item.text is not None
629
+ and item_type in {"formula", "equation"}
630
+ ):
631
+ doc.add_text(label=DocItemLabel.FORMULA, text=item.text, prov=prov)
632
+
633
+ return doc
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling-core
3
- Version: 2.9.0
3
+ Version: 2.11.0
4
4
  Summary: A python library to define and validate data types in Docling.
5
5
  Home-page: https://ds4sd.github.io/
6
6
  License: MIT
@@ -35,6 +35,7 @@ Requires-Dist: pyyaml (>=5.1,<7.0.0)
35
35
  Requires-Dist: semchunk (>=2.2.0,<3.0.0) ; extra == "chunking"
36
36
  Requires-Dist: tabulate (>=0.9.0,<0.10.0)
37
37
  Requires-Dist: transformers (>=4.34.0,<5.0.0) ; extra == "chunking"
38
+ Requires-Dist: typer (>=0.12.5,<0.13.0)
38
39
  Requires-Dist: typing-extensions (>=4.12.2,<5.0.0)
39
40
  Project-URL: Repository, https://github.com/DS4SD/docling-core
40
41
  Description-Content-Type: text/markdown
@@ -1,4 +1,6 @@
1
1
  docling_core/__init__.py,sha256=D0afxif-BMUrgx2cYk1cwxiwATRYaGXsIMk_z4nw1Vs,90
2
+ docling_core/cli/__init__.py,sha256=C63yWifzpA0IV7YWDatpAdrhoV8zjqxAKv0xMf09VdM,19
3
+ docling_core/cli/view.py,sha256=bhxvPQWIJVo2g_pRL0GjQwjDw-jdiRXp1-BTbG849go,1746
2
4
  docling_core/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
5
  docling_core/resources/schemas/doc/ANN.json,sha256=04U5j-PU9m5w7IagJ_rHcAx7qUtLkUuaWZO9GuYHnTA,4202
4
6
  docling_core/resources/schemas/doc/DOC.json,sha256=9tVKpCqDGGq3074Nn5qlUCdTN-5k1Q0ri_scJblwnLE,6686
@@ -17,13 +19,13 @@ docling_core/transforms/__init__.py,sha256=P81y_oqkiTN4Ld5crh1gQ6BbHqqR6C6nBt9AC
17
19
  docling_core/transforms/chunker/__init__.py,sha256=YdizSKXLmmK9eyYBsarHWr8Mx_AoA0PT7c0absibZMk,306
18
20
  docling_core/transforms/chunker/base.py,sha256=PZl6QN41cZseTPkTwPzysDHYYFb6DwDSKw0QVSiFfG0,2541
19
21
  docling_core/transforms/chunker/hierarchical_chunker.py,sha256=cy3sE9w_7l-uoIEUcfnZlQweDHUoyAJTQ6IkzxxVjFY,8052
20
- docling_core/transforms/chunker/hybrid_chunker.py,sha256=LUzlqtTbXfhY40bhBVGtjEMZXFWRz1XH53OGqBh2Z3Y,11224
22
+ docling_core/transforms/chunker/hybrid_chunker.py,sha256=9bGhjr4vzpXbOMLCydCl81r1HbzMuMlo9ABfXyLRtd4,11375
21
23
  docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
22
24
  docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
23
25
  docling_core/types/doc/__init__.py,sha256=bEL4zKVOG7Wxm6xQrgF58mu-Teds9aSavuEAKVNhrTU,639
24
26
  docling_core/types/doc/base.py,sha256=_ttU8QI8wXDTQRUnN5n7L6D9wYFVLSAibxlFoMbgAsk,4557
25
- docling_core/types/doc/document.py,sha256=nyyQWikflk2XRJSB2b-V2MEMMvEok0g35v9iEyIODj8,91521
26
- docling_core/types/doc/labels.py,sha256=A8vWP82VAeXO1rlCO0oDKo_Hb8uDeQe0myOTY3P03hk,1596
27
+ docling_core/types/doc/document.py,sha256=9t6FPvrxT9gKtUaYMP_Kyhz_izo2p6TQX_LlG2Fj5hY,91593
28
+ docling_core/types/doc/labels.py,sha256=4BG_wNG1qDc5E3qQHixPjM_IAxGjGo14hobNyfTycZw,1662
27
29
  docling_core/types/doc/tokens.py,sha256=uU_MYW_p7ypf7eYICFBvxdnVaPZ7CQnvZmbJ6oPrtEA,6134
28
30
  docling_core/types/doc/utils.py,sha256=YDOh_ZD1Y7OmCEDdCLJ_MO5K3HA67nc_acfhOK6WztU,1439
29
31
  docling_core/types/gen/__init__.py,sha256=C6TuCfvpSnSL5XDOFMcYHUY2-i08vvfOGRcdu6Af0pI,124
@@ -51,11 +53,11 @@ docling_core/utils/alias.py,sha256=B6Lqvss8CbaNARHLR4qSmNh9OkB6LvqTpxfsFmkLAFo,8
51
53
  docling_core/utils/file.py,sha256=GzX0pclvewwPoqHJSaVUuULzSJwJgkCUwgKgJ7G5ohQ,5628
52
54
  docling_core/utils/generate_docs.py,sha256=BdKAoduWXOc7YMvcmlhjoJOFlUxij1ybxglj6LZDtC8,2290
53
55
  docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2tyi_OhHepHYtZg,1654
54
- docling_core/utils/legacy.py,sha256=mncL2r2PL5rVXTXhgOArYGVwXs0PWaJ4RxuCRMfNxac,12814
56
+ docling_core/utils/legacy.py,sha256=xfp7U0JqjI60K3loWiNTk8w08_KfCUzTb2MNULBOIz4,24396
55
57
  docling_core/utils/validate.py,sha256=3FmnxnKTDZC5J9OGxCL3U3DGRl0t0bBV1NcySXswdas,2031
56
58
  docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
57
- docling_core-2.9.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
58
- docling_core-2.9.0.dist-info/METADATA,sha256=P7s_dSFfZ_lvmwRFJRCOnxvR3iavYGX-3kzthwAs2vk,5703
59
- docling_core-2.9.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
60
- docling_core-2.9.0.dist-info/entry_points.txt,sha256=jIxlWv3tnO04irlZc0zfhqJIgz1bg9Hha4AkaLWSdUA,177
61
- docling_core-2.9.0.dist-info/RECORD,,
59
+ docling_core-2.11.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
60
+ docling_core-2.11.0.dist-info/METADATA,sha256=4Xb7VqXg4dAxRWiT-KThSn4i_TiIsoIXdhyN8eZOWSk,5744
61
+ docling_core-2.11.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
62
+ docling_core-2.11.0.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
63
+ docling_core-2.11.0.dist-info/RECORD,,
@@ -1,4 +1,5 @@
1
1
  [console_scripts]
2
+ docling-view=docling_core.cli.view:app
2
3
  generate_docs=docling_core.utils.generate_docs:main
3
4
  generate_jsonschema=docling_core.utils.generate_jsonschema:main
4
5
  validate=docling_core.utils.validate:main