docling-core 2.9.0__py3-none-any.whl → 2.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling-core might be problematic. Click here for more details.
- docling_core/cli/__init__.py +1 -0
- docling_core/cli/view.py +68 -0
- docling_core/transforms/chunker/hybrid_chunker.py +5 -1
- docling_core/types/doc/document.py +60 -43
- docling_core/utils/legacy.py +291 -4
- {docling_core-2.9.0.dist-info → docling_core-2.10.0.dist-info}/METADATA +2 -1
- {docling_core-2.9.0.dist-info → docling_core-2.10.0.dist-info}/RECORD +10 -8
- {docling_core-2.9.0.dist-info → docling_core-2.10.0.dist-info}/entry_points.txt +1 -0
- {docling_core-2.9.0.dist-info → docling_core-2.10.0.dist-info}/LICENSE +0 -0
- {docling_core-2.9.0.dist-info → docling_core-2.10.0.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""CLI package."""
|
docling_core/cli/view.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright IBM Corp. 2024 - 2024
|
|
3
|
+
# SPDX-License-Identifier: MIT
|
|
4
|
+
#
|
|
5
|
+
|
|
6
|
+
"""CLI for docling viewer."""
|
|
7
|
+
import importlib
|
|
8
|
+
import tempfile
|
|
9
|
+
import webbrowser
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Annotated, Optional
|
|
12
|
+
|
|
13
|
+
import typer
|
|
14
|
+
|
|
15
|
+
from docling_core.types.doc import DoclingDocument
|
|
16
|
+
from docling_core.types.doc.base import ImageRefMode
|
|
17
|
+
from docling_core.utils.file import resolve_source_to_path
|
|
18
|
+
|
|
19
|
+
# Typer application object; the viewer command below is registered on it.
app = typer.Typer(
    name="Docling",
    add_completion=False,
    pretty_exceptions_enable=False,
    no_args_is_help=True,
)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def version_callback(value: bool):
    """Print the installed docling-core version and exit when requested.

    When ``value`` is falsy the function does nothing and returns ``None``.

    :param value: bool: Whether version information was requested.
    :raises typer.Exit: After the version has been printed.
    """
    if not value:
        return

    # ``import importlib`` alone does not guarantee that the ``metadata``
    # submodule has been loaded, so import the function explicitly.
    from importlib.metadata import version as _distribution_version

    docling_core_version = _distribution_version("docling-core")
    print(f"Docling Core version: {docling_core_version}")
    raise typer.Exit()
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@app.command(no_args_is_help=True)
def view(
    source: Annotated[
        str,
        typer.Argument(
            ...,
            metavar="source",
            help="Docling JSON file to view.",
        ),
    ],
    version: Annotated[
        Optional[bool],
        typer.Option(
            "--version",
            callback=version_callback,
            is_eager=True,
            help="Show version information.",
        ),
    ] = None,
):
    """Display a Docling JSON file on the default browser."""
    # NOTE: the docstring above is kept on a single line on purpose; it is the
    # text shown as the command help.
    # ``version`` is not read here: it is handled by its eager callback.
    path = resolve_source_to_path(source=source)
    doc = DoclingDocument.load_from_json(filename=path)
    html_output = doc.export_to_html(image_mode=ImageRefMode.EMBEDDED)

    # The temporary directory is intentionally left in place: the browser
    # reads the file after this function has returned.
    target_path = Path(tempfile.mkdtemp()) / "out.html"

    # Write with an explicit encoding instead of the platform default, which
    # can fail on (or mangle) non-ASCII document text.
    # NOTE(review): assumes the exported HTML is meant to be UTF-8 - confirm
    # against export_to_html.
    target_path.write_text(html_output, encoding="utf-8")

    # ``as_uri`` builds a well-formed, percent-encoded ``file://`` URL on every
    # platform (hand-built ``file://{path}`` breaks on Windows drive letters
    # and on paths containing spaces or other reserved characters).
    webbrowser.open(url=target_path.resolve().as_uri())
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# Click command object obtained from the Typer application.
click_app = typer.main.get_command(app)

# Run the Typer application when this module is executed as a script.
if __name__ == "__main__":
    app()
|
|
@@ -44,7 +44,9 @@ class HybridChunker(BaseChunker):
|
|
|
44
44
|
|
|
45
45
|
model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
46
46
|
|
|
47
|
-
tokenizer: Union[PreTrainedTokenizerBase, str]
|
|
47
|
+
tokenizer: Union[PreTrainedTokenizerBase, str] = (
|
|
48
|
+
"sentence-transformers/all-MiniLM-L6-v2"
|
|
49
|
+
)
|
|
48
50
|
max_tokens: int = None # type: ignore[assignment]
|
|
49
51
|
merge_peers: bool = True
|
|
50
52
|
|
|
@@ -96,6 +98,7 @@ class HybridChunker(BaseChunker):
|
|
|
96
98
|
doc_items=doc_chunk.meta.doc_items[window_start : window_end + 1],
|
|
97
99
|
headings=doc_chunk.meta.headings,
|
|
98
100
|
captions=doc_chunk.meta.captions,
|
|
101
|
+
origin=doc_chunk.meta.origin,
|
|
99
102
|
)
|
|
100
103
|
new_chunk = DocChunk(text=window_text, meta=meta)
|
|
101
104
|
return new_chunk
|
|
@@ -242,6 +245,7 @@ class HybridChunker(BaseChunker):
|
|
|
242
245
|
doc_items=window_items,
|
|
243
246
|
headings=current_headings_and_captions[0],
|
|
244
247
|
captions=current_headings_and_captions[1],
|
|
248
|
+
origin=chunk.meta.origin,
|
|
245
249
|
)
|
|
246
250
|
new_chunk = DocChunk(
|
|
247
251
|
text=window_text,
|
|
@@ -49,7 +49,6 @@ DEFAULT_EXPORT_LABELS = {
|
|
|
49
49
|
DocItemLabel.DOCUMENT_INDEX,
|
|
50
50
|
DocItemLabel.SECTION_HEADER,
|
|
51
51
|
DocItemLabel.PARAGRAPH,
|
|
52
|
-
DocItemLabel.CAPTION,
|
|
53
52
|
DocItemLabel.TABLE,
|
|
54
53
|
DocItemLabel.PICTURE,
|
|
55
54
|
DocItemLabel.FORMULA,
|
|
@@ -58,6 +57,7 @@ DEFAULT_EXPORT_LABELS = {
|
|
|
58
57
|
DocItemLabel.TEXT,
|
|
59
58
|
DocItemLabel.LIST_ITEM,
|
|
60
59
|
DocItemLabel.CODE,
|
|
60
|
+
DocItemLabel.REFERENCE,
|
|
61
61
|
}
|
|
62
62
|
|
|
63
63
|
|
|
@@ -593,6 +593,21 @@ class DocItem(
|
|
|
593
593
|
class TextItem(DocItem):
|
|
594
594
|
"""TextItem."""
|
|
595
595
|
|
|
596
|
+
label: typing.Literal[
|
|
597
|
+
DocItemLabel.CAPTION,
|
|
598
|
+
DocItemLabel.CHECKBOX_SELECTED,
|
|
599
|
+
DocItemLabel.CHECKBOX_UNSELECTED,
|
|
600
|
+
DocItemLabel.CODE,
|
|
601
|
+
DocItemLabel.FOOTNOTE,
|
|
602
|
+
DocItemLabel.FORMULA,
|
|
603
|
+
DocItemLabel.PAGE_FOOTER,
|
|
604
|
+
DocItemLabel.PAGE_HEADER,
|
|
605
|
+
DocItemLabel.PARAGRAPH,
|
|
606
|
+
DocItemLabel.REFERENCE,
|
|
607
|
+
DocItemLabel.TEXT,
|
|
608
|
+
DocItemLabel.TITLE,
|
|
609
|
+
]
|
|
610
|
+
|
|
596
611
|
orig: str # untreated representation
|
|
597
612
|
text: str # sanitized representation
|
|
598
613
|
|
|
@@ -644,8 +659,10 @@ class TextItem(DocItem):
|
|
|
644
659
|
class SectionHeaderItem(TextItem):
|
|
645
660
|
"""SectionItem."""
|
|
646
661
|
|
|
647
|
-
label: typing.Literal[DocItemLabel.SECTION_HEADER] =
|
|
648
|
-
|
|
662
|
+
label: typing.Literal[DocItemLabel.SECTION_HEADER] = (
|
|
663
|
+
DocItemLabel.SECTION_HEADER # type: ignore[assignment]
|
|
664
|
+
)
|
|
665
|
+
level: LevelNumber = 1
|
|
649
666
|
|
|
650
667
|
def export_to_document_tokens(
|
|
651
668
|
self,
|
|
@@ -695,9 +712,11 @@ class SectionHeaderItem(TextItem):
|
|
|
695
712
|
class ListItem(TextItem):
|
|
696
713
|
"""SectionItem."""
|
|
697
714
|
|
|
698
|
-
label: typing.Literal[DocItemLabel.LIST_ITEM] =
|
|
715
|
+
label: typing.Literal[DocItemLabel.LIST_ITEM] = (
|
|
716
|
+
DocItemLabel.LIST_ITEM # type: ignore[assignment]
|
|
717
|
+
)
|
|
699
718
|
enumerated: bool = False
|
|
700
|
-
marker: str # The bullet or number symbol that prefixes this list item
|
|
719
|
+
marker: str = "-" # The bullet or number symbol that prefixes this list item
|
|
701
720
|
|
|
702
721
|
|
|
703
722
|
class FloatingItem(DocItem):
|
|
@@ -923,7 +942,10 @@ class TableItem(FloatingItem):
|
|
|
923
942
|
"""TableItem."""
|
|
924
943
|
|
|
925
944
|
data: TableData
|
|
926
|
-
label: typing.Literal[
|
|
945
|
+
label: typing.Literal[
|
|
946
|
+
DocItemLabel.DOCUMENT_INDEX,
|
|
947
|
+
DocItemLabel.TABLE,
|
|
948
|
+
] = DocItemLabel.TABLE
|
|
927
949
|
|
|
928
950
|
def export_to_dataframe(self) -> pd.DataFrame:
|
|
929
951
|
"""Export the table as a Pandas DataFrame."""
|
|
@@ -1272,9 +1294,19 @@ class TableItem(FloatingItem):
|
|
|
1272
1294
|
class KeyValueItem(DocItem):
|
|
1273
1295
|
"""KeyValueItem."""
|
|
1274
1296
|
|
|
1297
|
+
label: typing.Literal[DocItemLabel.KEY_VALUE_REGION] = DocItemLabel.KEY_VALUE_REGION
|
|
1298
|
+
|
|
1275
1299
|
|
|
1276
|
-
ContentItem =
|
|
1277
|
-
|
|
1300
|
+
ContentItem = Annotated[
|
|
1301
|
+
Union[
|
|
1302
|
+
TextItem,
|
|
1303
|
+
SectionHeaderItem,
|
|
1304
|
+
ListItem,
|
|
1305
|
+
PictureItem,
|
|
1306
|
+
TableItem,
|
|
1307
|
+
KeyValueItem,
|
|
1308
|
+
],
|
|
1309
|
+
Field(discriminator="label"),
|
|
1278
1310
|
]
|
|
1279
1311
|
|
|
1280
1312
|
|
|
@@ -1376,13 +1408,13 @@ class DoclingDocument(BaseModel):
|
|
|
1376
1408
|
self,
|
|
1377
1409
|
label: Optional[GroupLabel] = None,
|
|
1378
1410
|
name: Optional[str] = None,
|
|
1379
|
-
parent: Optional[
|
|
1411
|
+
parent: Optional[NodeItem] = None,
|
|
1380
1412
|
) -> GroupItem:
|
|
1381
1413
|
"""add_group.
|
|
1382
1414
|
|
|
1383
1415
|
:param label: Optional[GroupLabel]: (Default value = None)
|
|
1384
1416
|
:param name: Optional[str]: (Default value = None)
|
|
1385
|
-
:param parent: Optional[
|
|
1417
|
+
:param parent: Optional[NodeItem]: (Default value = None)
|
|
1386
1418
|
|
|
1387
1419
|
"""
|
|
1388
1420
|
if not parent:
|
|
@@ -1409,7 +1441,7 @@ class DoclingDocument(BaseModel):
|
|
|
1409
1441
|
marker: Optional[str] = None,
|
|
1410
1442
|
orig: Optional[str] = None,
|
|
1411
1443
|
prov: Optional[ProvenanceItem] = None,
|
|
1412
|
-
parent: Optional[
|
|
1444
|
+
parent: Optional[NodeItem] = None,
|
|
1413
1445
|
):
|
|
1414
1446
|
"""add_list_item.
|
|
1415
1447
|
|
|
@@ -1417,7 +1449,7 @@ class DoclingDocument(BaseModel):
|
|
|
1417
1449
|
:param text: str:
|
|
1418
1450
|
:param orig: Optional[str]: (Default value = None)
|
|
1419
1451
|
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
1420
|
-
:param parent: Optional[
|
|
1452
|
+
:param parent: Optional[NodeItem]: (Default value = None)
|
|
1421
1453
|
|
|
1422
1454
|
"""
|
|
1423
1455
|
if not parent:
|
|
@@ -1452,7 +1484,7 @@ class DoclingDocument(BaseModel):
|
|
|
1452
1484
|
text: str,
|
|
1453
1485
|
orig: Optional[str] = None,
|
|
1454
1486
|
prov: Optional[ProvenanceItem] = None,
|
|
1455
|
-
parent: Optional[
|
|
1487
|
+
parent: Optional[NodeItem] = None,
|
|
1456
1488
|
):
|
|
1457
1489
|
"""add_text.
|
|
1458
1490
|
|
|
@@ -1460,7 +1492,7 @@ class DoclingDocument(BaseModel):
|
|
|
1460
1492
|
:param text: str:
|
|
1461
1493
|
:param orig: Optional[str]: (Default value = None)
|
|
1462
1494
|
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
1463
|
-
:param parent: Optional[
|
|
1495
|
+
:param parent: Optional[NodeItem]: (Default value = None)
|
|
1464
1496
|
|
|
1465
1497
|
"""
|
|
1466
1498
|
# Catch a few cases that are in principle allowed
|
|
@@ -1504,15 +1536,16 @@ class DoclingDocument(BaseModel):
|
|
|
1504
1536
|
data: TableData,
|
|
1505
1537
|
caption: Optional[Union[TextItem, RefItem]] = None, # This is not cool yet.
|
|
1506
1538
|
prov: Optional[ProvenanceItem] = None,
|
|
1507
|
-
parent: Optional[
|
|
1539
|
+
parent: Optional[NodeItem] = None,
|
|
1540
|
+
label: DocItemLabel = DocItemLabel.TABLE,
|
|
1508
1541
|
):
|
|
1509
1542
|
"""add_table.
|
|
1510
1543
|
|
|
1511
|
-
:param data:
|
|
1512
|
-
:param caption: Optional[Union[TextItem:
|
|
1513
|
-
:param
|
|
1514
|
-
:param
|
|
1515
|
-
:param
|
|
1544
|
+
:param data: TableData:
|
|
1545
|
+
:param caption: Optional[Union[TextItem, RefItem]]: (Default value = None)
|
|
1546
|
+
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
1547
|
+
:param parent: Optional[NodeItem]: (Default value = None)
|
|
1548
|
+
:param label: DocItemLabel: (Default value = DocItemLabel.TABLE)
|
|
1516
1549
|
|
|
1517
1550
|
"""
|
|
1518
1551
|
if not parent:
|
|
@@ -1522,7 +1555,7 @@ class DoclingDocument(BaseModel):
|
|
|
1522
1555
|
cref = f"#/tables/{table_index}"
|
|
1523
1556
|
|
|
1524
1557
|
tbl_item = TableItem(
|
|
1525
|
-
label=
|
|
1558
|
+
label=label, data=data, self_ref=cref, parent=parent.get_ref()
|
|
1526
1559
|
)
|
|
1527
1560
|
if prov:
|
|
1528
1561
|
tbl_item.prov.append(prov)
|
|
@@ -1540,7 +1573,7 @@ class DoclingDocument(BaseModel):
|
|
|
1540
1573
|
image: Optional[ImageRef] = None,
|
|
1541
1574
|
caption: Optional[Union[TextItem, RefItem]] = None,
|
|
1542
1575
|
prov: Optional[ProvenanceItem] = None,
|
|
1543
|
-
parent: Optional[
|
|
1576
|
+
parent: Optional[NodeItem] = None,
|
|
1544
1577
|
):
|
|
1545
1578
|
"""add_picture.
|
|
1546
1579
|
|
|
@@ -1548,7 +1581,7 @@ class DoclingDocument(BaseModel):
|
|
|
1548
1581
|
:param caption: Optional[Union[TextItem:
|
|
1549
1582
|
:param RefItem]]: (Default value = None)
|
|
1550
1583
|
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
1551
|
-
:param parent: Optional[
|
|
1584
|
+
:param parent: Optional[NodeItem]: (Default value = None)
|
|
1552
1585
|
"""
|
|
1553
1586
|
if not parent:
|
|
1554
1587
|
parent = self.body
|
|
@@ -1578,14 +1611,14 @@ class DoclingDocument(BaseModel):
|
|
|
1578
1611
|
text: str,
|
|
1579
1612
|
orig: Optional[str] = None,
|
|
1580
1613
|
prov: Optional[ProvenanceItem] = None,
|
|
1581
|
-
parent: Optional[
|
|
1614
|
+
parent: Optional[NodeItem] = None,
|
|
1582
1615
|
):
|
|
1583
1616
|
"""add_title.
|
|
1584
1617
|
|
|
1585
1618
|
:param text: str:
|
|
1586
1619
|
:param orig: Optional[str]: (Default value = None)
|
|
1587
1620
|
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
1588
|
-
:param parent: Optional[
|
|
1621
|
+
:param parent: Optional[NodeItem]: (Default value = None)
|
|
1589
1622
|
"""
|
|
1590
1623
|
if not parent:
|
|
1591
1624
|
parent = self.body
|
|
@@ -1616,7 +1649,7 @@ class DoclingDocument(BaseModel):
|
|
|
1616
1649
|
orig: Optional[str] = None,
|
|
1617
1650
|
level: LevelNumber = 1,
|
|
1618
1651
|
prov: Optional[ProvenanceItem] = None,
|
|
1619
|
-
parent: Optional[
|
|
1652
|
+
parent: Optional[NodeItem] = None,
|
|
1620
1653
|
):
|
|
1621
1654
|
"""add_heading.
|
|
1622
1655
|
|
|
@@ -1625,7 +1658,7 @@ class DoclingDocument(BaseModel):
|
|
|
1625
1658
|
:param orig: Optional[str]: (Default value = None)
|
|
1626
1659
|
:param level: LevelNumber: (Default value = 1)
|
|
1627
1660
|
:param prov: Optional[ProvenanceItem]: (Default value = None)
|
|
1628
|
-
:param parent: Optional[
|
|
1661
|
+
:param parent: Optional[NodeItem]: (Default value = None)
|
|
1629
1662
|
"""
|
|
1630
1663
|
if not parent:
|
|
1631
1664
|
parent = self.body
|
|
@@ -2055,10 +2088,6 @@ class DoclingDocument(BaseModel):
|
|
|
2055
2088
|
text = f"```\n{item.text}\n```\n"
|
|
2056
2089
|
mdtexts.append(text)
|
|
2057
2090
|
|
|
2058
|
-
elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
|
|
2059
|
-
# captions are printed in picture and table ... skipping for now
|
|
2060
|
-
continue
|
|
2061
|
-
|
|
2062
2091
|
elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
|
|
2063
2092
|
in_list = True
|
|
2064
2093
|
# Calculate indent based on list_nesting_level
|
|
@@ -2350,10 +2379,6 @@ class DoclingDocument(BaseModel):
|
|
|
2350
2379
|
text = f"<pre>{item.text}</pre>"
|
|
2351
2380
|
html_texts.append(text)
|
|
2352
2381
|
|
|
2353
|
-
elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
|
|
2354
|
-
# captions are printed in picture and table ... skipping for now
|
|
2355
|
-
continue
|
|
2356
|
-
|
|
2357
2382
|
elif isinstance(item, ListItem):
|
|
2358
2383
|
|
|
2359
2384
|
text = f"<li>{item.text}</li>"
|
|
@@ -2555,10 +2580,6 @@ class DoclingDocument(BaseModel):
|
|
|
2555
2580
|
result += f"<unordered_list>{delim}"
|
|
2556
2581
|
in_ordered_list.append(False)
|
|
2557
2582
|
|
|
2558
|
-
elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
|
|
2559
|
-
# captions are printed in picture and table ... skipping for now
|
|
2560
|
-
continue
|
|
2561
|
-
|
|
2562
2583
|
elif isinstance(item, SectionHeaderItem):
|
|
2563
2584
|
|
|
2564
2585
|
result += item.export_to_document_tokens(
|
|
@@ -2664,10 +2685,6 @@ class DoclingDocument(BaseModel):
|
|
|
2664
2685
|
indent * level + f"item-{i} at level {level}: {item.label}: {text}"
|
|
2665
2686
|
)
|
|
2666
2687
|
|
|
2667
|
-
elif isinstance(item, TextItem) and item.label in [DocItemLabel.CAPTION]:
|
|
2668
|
-
# captions are printed in picture and table ... skipping for now
|
|
2669
|
-
continue
|
|
2670
|
-
|
|
2671
2688
|
elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
|
|
2672
2689
|
text = get_text(text=item.text, max_text_len=max_text_len)
|
|
2673
2690
|
|
docling_core/utils/legacy.py
CHANGED
|
@@ -7,19 +7,26 @@
|
|
|
7
7
|
|
|
8
8
|
import hashlib
|
|
9
9
|
import uuid
|
|
10
|
-
from
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Dict, Optional, Union
|
|
11
12
|
|
|
12
13
|
from docling_core.types.doc import (
|
|
14
|
+
BoundingBox,
|
|
15
|
+
CoordOrigin,
|
|
13
16
|
DocItem,
|
|
14
17
|
DocItemLabel,
|
|
15
18
|
DoclingDocument,
|
|
19
|
+
DocumentOrigin,
|
|
16
20
|
PictureItem,
|
|
21
|
+
ProvenanceItem,
|
|
17
22
|
SectionHeaderItem,
|
|
23
|
+
Size,
|
|
18
24
|
TableCell,
|
|
19
25
|
TableItem,
|
|
20
26
|
TextItem,
|
|
21
27
|
)
|
|
22
|
-
from docling_core.types.doc.document import ListItem
|
|
28
|
+
from docling_core.types.doc.document import GroupItem, ListItem, TableData
|
|
29
|
+
from docling_core.types.doc.labels import GroupLabel
|
|
23
30
|
from docling_core.types.legacy_doc.base import (
|
|
24
31
|
BaseCell,
|
|
25
32
|
BaseText,
|
|
@@ -342,5 +349,285 @@ def docling_document_to_legacy(doc: DoclingDocument, fallback_filaname: str = "f
|
|
|
342
349
|
return legacy_doc
|
|
343
350
|
|
|
344
351
|
|
|
345
|
-
|
|
346
|
-
|
|
352
|
+
def legacy_to_docling_document(legacy_doc: DsDocument) -> DoclingDocument:  # noqa: C901
    """Convert a legacy document to DoclingDocument.

    It is known that the following content will not be preserved in the transformation:
    - name of labels (upper vs lower case)
    - caption of figures are not in main-text anymore
    - s3_data removed
    - model metadata removed
    - logs removed
    - document hash cannot be preserved

    :param legacy_doc: DsDocument: The legacy document to convert.
    :returns: A new DoclingDocument built from the pages, page headers/footers,
        footnotes and main-text items of ``legacy_doc``.
    """
    # Legacy ``obj_type`` / ``name`` values that denote a list item.
    # NOTE(review): the type check used to be the substring test
    # ``item_type in "list-item-level-1"``, which also matched "", "1", "item",
    # etc. The plausible intended values are listed explicitly instead.
    list_item_types = {"list-item-level-1", "list-item", "list"}
    list_item_names = {"list", "list-item"}

    def _transform_prov(item: BaseCell) -> Optional[ProvenanceItem]:
        """Create a new provenance from the first provenance of a legacy item."""
        if not item.prov:
            return None
        first_prov = item.prov[0]
        return ProvenanceItem(
            page_no=int(first_prov.page),
            charspan=tuple(first_prov.span),
            bbox=BoundingBox.from_tuple(
                tuple(first_prov.bbox), origin=CoordOrigin.BOTTOMLEFT
            ),
        )

    def _add_furniture_texts(text_items, label: DocItemLabel) -> None:
        """Add legacy text items (skipping those without text) to the furniture."""
        if text_items is None:
            return
        for text_item in text_items:
            if text_item.text is None:
                continue
            doc.add_text(
                label=label,
                text=text_item.text,
                # the provenance used to be computed but never passed on
                prov=_transform_prov(text_item),
                parent=doc.furniture,
            )

    def _attach_caption(floating_item, caption_item: Optional[BaseText]) -> None:
        """Add ``caption_item`` as a caption child of a table or picture item."""
        if caption_item is None or caption_item.text is None:
            return
        caption = doc.add_text(
            label=DocItemLabel.CAPTION,
            text=caption_item.text,
            prov=_transform_prov(caption_item),
            parent=floating_item,
        )
        floating_item.captions.append(caption.get_ref())

    origin = DocumentOrigin(
        mimetype="application/pdf",
        filename=legacy_doc.file_info.filename,
        binary_hash=legacy_doc.file_info.document_hash,
    )
    doc_name = Path(origin.filename).stem

    doc: DoclingDocument = DoclingDocument(name=doc_name, origin=origin)

    # define pages
    if legacy_doc.page_dimensions is not None:
        for page_dim in legacy_doc.page_dimensions:
            doc.add_page(
                page_no=int(page_dim.page),
                size=Size(width=page_dim.width, height=page_dim.height),
            )

    # page headers, page footers and footnotes all go to the furniture
    _add_furniture_texts(legacy_doc.page_headers, DocItemLabel.PAGE_HEADER)
    _add_furniture_texts(legacy_doc.page_footers, DocItemLabel.PAGE_FOOTER)
    _add_furniture_texts(legacy_doc.footnotes, DocItemLabel.FOOTNOTE)

    # main-text content
    if legacy_doc.main_text is not None:
        item: Optional[Union[BaseCell, BaseText]]

        # Resolve references once and keep the original main-text index of each
        # item; unresolvable entries are dropped.
        # NOTE(review): assumes _resolve_ref has no side effects, so resolving
        # once is equivalent to resolving in every pass - confirm.
        resolved_items = []
        for ix, orig_item in enumerate(legacy_doc.main_text):
            item = (
                legacy_doc._resolve_ref(orig_item)
                if isinstance(orig_item, Ref)
                else orig_item
            )
            if item is not None:
                resolved_items.append((ix, item))

        # collect all captions embedded in table and figure objects
        # to avoid repeating them (caption text -> main-text index)
        embedded_captions: Dict[str, int] = {}
        for ix, item in resolved_items:
            if isinstance(item, (DsSchemaTable, Figure)) and item.text:
                embedded_captions[item.text] = ix

        # build lookup from floating objects to their caption item
        floating_to_caption: Dict[int, BaseText] = {}
        for ix, item in resolved_items:
            item_type = item.obj_type.lower()
            if (
                isinstance(item, BaseText)
                and (
                    item_type == "caption"
                    or (item.name is not None and item.name.lower() == "caption")
                )
                and item.text in embedded_captions
            ):
                floating_to_caption[embedded_captions[item.text]] = item

        # main loop iteration
        current_list: Optional[GroupItem] = None
        for ix, item in resolved_items:
            prov = _transform_prov(item)
            item_type = item.obj_type.lower()

            is_list_item = isinstance(item, BaseText) and (
                item_type in list_item_types or item.name in list_item_names
            )

            # if a group is needed, add it; any non-list item closes the list
            if is_list_item:
                if current_list is None:
                    current_list = doc.add_group(label=GroupLabel.LIST, name="list")
            else:
                current_list = None

            # add the document item in the document
            if isinstance(item, BaseText):
                text = item.text if item.text is not None else ""
                label_name = item.name if item.name is not None else "text"

                if item_type == "caption":
                    if text in embedded_captions:
                        # skip captions if they are embedded in the actual
                        # floating objects
                        continue
                    # captions without a related object are inserted as text;
                    # the branches below are part of the same chain so the
                    # caption is not added a second time
                    doc.add_text(label=DocItemLabel.TEXT, text=text, prov=prov)

                # first title match
                elif item_type == "title":
                    doc.add_title(text=text, prov=prov)

                # secondary titles
                elif item_type == "subtitle-level-1":
                    doc.add_heading(text=text, prov=prov)

                # list item
                elif is_list_item:
                    # TODO: Infer if this is a numbered or a bullet list item
                    doc.add_list_item(
                        text=text, enumerated=False, prov=prov, parent=current_list
                    )

                # normal text
                else:
                    try:
                        label = DocItemLabel(label_name.replace("-", "_"))
                    except ValueError:
                        # unknown legacy label name: fall back to plain text
                        label = DocItemLabel.TEXT
                    doc.add_text(label=label, text=text, prov=prov)

            elif isinstance(item, DsSchemaTable):

                table_data = TableData(num_cols=item.num_cols, num_rows=item.num_rows)
                if item.data is not None:
                    seen_spans = set()
                    for row_ix, row in enumerate(item.data):
                        for col_ix, orig_cell_data in enumerate(row):

                            cell_bbox: Optional[BoundingBox] = (
                                BoundingBox.from_tuple(
                                    tuple(orig_cell_data.bbox),
                                    origin=CoordOrigin.BOTTOMLEFT,
                                )
                                if orig_cell_data.bbox is not None
                                else None
                            )
                            cell = TableCell(
                                start_row_offset_idx=row_ix,
                                end_row_offset_idx=row_ix + 1,
                                start_col_offset_idx=col_ix,
                                end_col_offset_idx=col_ix + 1,
                                text=orig_cell_data.text,
                                bbox=cell_bbox,
                                column_header=(orig_cell_data.obj_type == "col_header"),
                                row_header=(orig_cell_data.obj_type == "row_header"),
                                row_section=(orig_cell_data.obj_type == "row_section"),
                            )

                            # An empty span list carries no information (and
                            # would make min()/max() fail), so it is treated
                            # like a missing one: the cell keeps its 1x1 extent.
                            if orig_cell_data.spans:
                                # convert to a tuple of tuples for hashing
                                spans_tuple = tuple(
                                    tuple(span) for span in orig_cell_data.spans
                                )

                                # skip repeated spans
                                if spans_tuple in seen_spans:
                                    continue

                                seen_spans.add(spans_tuple)

                                span_rows = [s[0] for s in spans_tuple]
                                span_cols = [s[1] for s in spans_tuple]
                                cell.start_row_offset_idx = min(span_rows)
                                cell.end_row_offset_idx = max(span_rows) + 1
                                cell.start_col_offset_idx = min(span_cols)
                                cell.end_col_offset_idx = max(span_cols) + 1

                                cell.row_span = (
                                    cell.end_row_offset_idx - cell.start_row_offset_idx
                                )
                                cell.col_span = (
                                    cell.end_col_offset_idx - cell.start_col_offset_idx
                                )

                            table_data.table_cells.append(cell)

                new_item = doc.add_table(data=table_data, prov=prov)
                _attach_caption(new_item, floating_to_caption.get(ix))

            elif isinstance(item, Figure):
                new_item = doc.add_picture(prov=prov)
                _attach_caption(new_item, floating_to_caption.get(ix))

            # equations
            elif (
                isinstance(item, BaseCell)
                and item.text is not None
                and item_type in {"formula", "equation"}
            ):
                doc.add_text(label=DocItemLabel.FORMULA, text=item.text, prov=prov)

    return doc
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: docling-core
|
|
3
|
-
Version: 2.
|
|
3
|
+
Version: 2.10.0
|
|
4
4
|
Summary: A python library to define and validate data types in Docling.
|
|
5
5
|
Home-page: https://ds4sd.github.io/
|
|
6
6
|
License: MIT
|
|
@@ -35,6 +35,7 @@ Requires-Dist: pyyaml (>=5.1,<7.0.0)
|
|
|
35
35
|
Requires-Dist: semchunk (>=2.2.0,<3.0.0) ; extra == "chunking"
|
|
36
36
|
Requires-Dist: tabulate (>=0.9.0,<0.10.0)
|
|
37
37
|
Requires-Dist: transformers (>=4.34.0,<5.0.0) ; extra == "chunking"
|
|
38
|
+
Requires-Dist: typer (>=0.12.5,<0.13.0)
|
|
38
39
|
Requires-Dist: typing-extensions (>=4.12.2,<5.0.0)
|
|
39
40
|
Project-URL: Repository, https://github.com/DS4SD/docling-core
|
|
40
41
|
Description-Content-Type: text/markdown
|
|
@@ -1,4 +1,6 @@
|
|
|
1
1
|
docling_core/__init__.py,sha256=D0afxif-BMUrgx2cYk1cwxiwATRYaGXsIMk_z4nw1Vs,90
|
|
2
|
+
docling_core/cli/__init__.py,sha256=C63yWifzpA0IV7YWDatpAdrhoV8zjqxAKv0xMf09VdM,19
|
|
3
|
+
docling_core/cli/view.py,sha256=bhxvPQWIJVo2g_pRL0GjQwjDw-jdiRXp1-BTbG849go,1746
|
|
2
4
|
docling_core/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
5
|
docling_core/resources/schemas/doc/ANN.json,sha256=04U5j-PU9m5w7IagJ_rHcAx7qUtLkUuaWZO9GuYHnTA,4202
|
|
4
6
|
docling_core/resources/schemas/doc/DOC.json,sha256=9tVKpCqDGGq3074Nn5qlUCdTN-5k1Q0ri_scJblwnLE,6686
|
|
@@ -17,12 +19,12 @@ docling_core/transforms/__init__.py,sha256=P81y_oqkiTN4Ld5crh1gQ6BbHqqR6C6nBt9AC
|
|
|
17
19
|
docling_core/transforms/chunker/__init__.py,sha256=YdizSKXLmmK9eyYBsarHWr8Mx_AoA0PT7c0absibZMk,306
|
|
18
20
|
docling_core/transforms/chunker/base.py,sha256=PZl6QN41cZseTPkTwPzysDHYYFb6DwDSKw0QVSiFfG0,2541
|
|
19
21
|
docling_core/transforms/chunker/hierarchical_chunker.py,sha256=cy3sE9w_7l-uoIEUcfnZlQweDHUoyAJTQ6IkzxxVjFY,8052
|
|
20
|
-
docling_core/transforms/chunker/hybrid_chunker.py,sha256=
|
|
22
|
+
docling_core/transforms/chunker/hybrid_chunker.py,sha256=9bGhjr4vzpXbOMLCydCl81r1HbzMuMlo9ABfXyLRtd4,11375
|
|
21
23
|
docling_core/types/__init__.py,sha256=MVRSgsk5focwGyAplh_TRR3dEecIXpd98g_u3zZ5HXo,260
|
|
22
24
|
docling_core/types/base.py,sha256=PusJskRVL19y-hq0BgXr5e8--QEqSqLnFNJ8UbOqW88,8318
|
|
23
25
|
docling_core/types/doc/__init__.py,sha256=bEL4zKVOG7Wxm6xQrgF58mu-Teds9aSavuEAKVNhrTU,639
|
|
24
26
|
docling_core/types/doc/base.py,sha256=_ttU8QI8wXDTQRUnN5n7L6D9wYFVLSAibxlFoMbgAsk,4557
|
|
25
|
-
docling_core/types/doc/document.py,sha256=
|
|
27
|
+
docling_core/types/doc/document.py,sha256=9t6FPvrxT9gKtUaYMP_Kyhz_izo2p6TQX_LlG2Fj5hY,91593
|
|
26
28
|
docling_core/types/doc/labels.py,sha256=A8vWP82VAeXO1rlCO0oDKo_Hb8uDeQe0myOTY3P03hk,1596
|
|
27
29
|
docling_core/types/doc/tokens.py,sha256=uU_MYW_p7ypf7eYICFBvxdnVaPZ7CQnvZmbJ6oPrtEA,6134
|
|
28
30
|
docling_core/types/doc/utils.py,sha256=YDOh_ZD1Y7OmCEDdCLJ_MO5K3HA67nc_acfhOK6WztU,1439
|
|
@@ -51,11 +53,11 @@ docling_core/utils/alias.py,sha256=B6Lqvss8CbaNARHLR4qSmNh9OkB6LvqTpxfsFmkLAFo,8
|
|
|
51
53
|
docling_core/utils/file.py,sha256=GzX0pclvewwPoqHJSaVUuULzSJwJgkCUwgKgJ7G5ohQ,5628
|
|
52
54
|
docling_core/utils/generate_docs.py,sha256=BdKAoduWXOc7YMvcmlhjoJOFlUxij1ybxglj6LZDtC8,2290
|
|
53
55
|
docling_core/utils/generate_jsonschema.py,sha256=uNX1O5XnjyB5nA66XqZXTt3YbGuR2tyi_OhHepHYtZg,1654
|
|
54
|
-
docling_core/utils/legacy.py,sha256=
|
|
56
|
+
docling_core/utils/legacy.py,sha256=xfp7U0JqjI60K3loWiNTk8w08_KfCUzTb2MNULBOIz4,24396
|
|
55
57
|
docling_core/utils/validate.py,sha256=3FmnxnKTDZC5J9OGxCL3U3DGRl0t0bBV1NcySXswdas,2031
|
|
56
58
|
docling_core/utils/validators.py,sha256=azcrndLzhNkTWnbFSu9shJ5D3j_znnLrIFA5R8hzmGU,2798
|
|
57
|
-
docling_core-2.
|
|
58
|
-
docling_core-2.
|
|
59
|
-
docling_core-2.
|
|
60
|
-
docling_core-2.
|
|
61
|
-
docling_core-2.
|
|
59
|
+
docling_core-2.10.0.dist-info/LICENSE,sha256=2M9-6EoQ1sxFztTOkXGAtwUDJvnWaAHdB9BYWVwGkIw,1087
|
|
60
|
+
docling_core-2.10.0.dist-info/METADATA,sha256=2Xr2MRaXihKpNdNhAwfZT973ffbX7GGs19ylGCBwfe4,5744
|
|
61
|
+
docling_core-2.10.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
62
|
+
docling_core-2.10.0.dist-info/entry_points.txt,sha256=oClcdb2L2RKx4jdqUykY16Kum_f0_whwWhGzIodyidc,216
|
|
63
|
+
docling_core-2.10.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|