docling 2.23.0__py3-none-any.whl → 2.24.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/html_backend.py +152 -149
- docling/backend/xml/jats_backend.py +6 -68
- docling/backend/xml/uspto_backend.py +48 -27
- docling/models/page_assemble_model.py +8 -0
- docling/models/readingorder_model.py +389 -0
- docling/models/tesseract_ocr_cli_model.py +3 -1
- docling/pipeline/standard_pdf_pipeline.py +2 -2
- {docling-2.23.0.dist-info → docling-2.24.0.dist-info}/METADATA +3 -4
- {docling-2.23.0.dist-info → docling-2.24.0.dist-info}/RECORD +12 -12
- docling/models/ds_glm_model.py +0 -386
- {docling-2.23.0.dist-info → docling-2.24.0.dist-info}/LICENSE +0 -0
- {docling-2.23.0.dist-info → docling-2.24.0.dist-info}/WHEEL +0 -0
- {docling-2.23.0.dist-info → docling-2.24.0.dist-info}/entry_points.txt +0 -0
@@ -14,7 +14,7 @@ from abc import ABC, abstractmethod
|
|
14
14
|
from enum import Enum, unique
|
15
15
|
from io import BytesIO
|
16
16
|
from pathlib import Path
|
17
|
-
from typing import
|
17
|
+
from typing import Final, Optional, Union
|
18
18
|
|
19
19
|
from bs4 import BeautifulSoup, Tag
|
20
20
|
from docling_core.types.doc import (
|
@@ -1406,6 +1406,10 @@ class XmlTable:
|
|
1406
1406
|
http://oasis-open.org/specs/soextblx.dtd
|
1407
1407
|
"""
|
1408
1408
|
|
1409
|
+
class ColInfo(TypedDict):
|
1410
|
+
ncols: int
|
1411
|
+
colinfo: list[dict]
|
1412
|
+
|
1409
1413
|
class MinColInfoType(TypedDict):
|
1410
1414
|
offset: list[int]
|
1411
1415
|
colwidth: list[int]
|
@@ -1425,7 +1429,7 @@ class XmlTable:
|
|
1425
1429
|
self.empty_text = ""
|
1426
1430
|
self._soup = BeautifulSoup(input, features="xml")
|
1427
1431
|
|
1428
|
-
def _create_tg_range(self, tgs: list[
|
1432
|
+
def _create_tg_range(self, tgs: list[ColInfo]) -> dict[int, ColInfoType]:
|
1429
1433
|
"""Create a unified range along the table groups.
|
1430
1434
|
|
1431
1435
|
Args:
|
@@ -1532,19 +1536,26 @@ class XmlTable:
|
|
1532
1536
|
Returns:
|
1533
1537
|
A docling table object.
|
1534
1538
|
"""
|
1535
|
-
tgs_align = []
|
1536
|
-
tg_secs = table
|
1539
|
+
tgs_align: list[XmlTable.ColInfo] = []
|
1540
|
+
tg_secs = table("tgroup")
|
1537
1541
|
if tg_secs:
|
1538
1542
|
for tg_sec in tg_secs:
|
1539
|
-
|
1540
|
-
|
1541
|
-
|
1542
|
-
|
1543
|
-
|
1543
|
+
if not isinstance(tg_sec, Tag):
|
1544
|
+
continue
|
1545
|
+
col_val = tg_sec.get("cols")
|
1546
|
+
ncols = (
|
1547
|
+
int(col_val)
|
1548
|
+
if isinstance(col_val, str) and col_val.isnumeric()
|
1549
|
+
else 1
|
1550
|
+
)
|
1551
|
+
tg_align: XmlTable.ColInfo = {"ncols": ncols, "colinfo": []}
|
1552
|
+
cs_secs = tg_sec("colspec")
|
1544
1553
|
if cs_secs:
|
1545
1554
|
for cs_sec in cs_secs:
|
1546
|
-
|
1547
|
-
|
1555
|
+
if not isinstance(cs_sec, Tag):
|
1556
|
+
continue
|
1557
|
+
colname = cs_sec.get("colname")
|
1558
|
+
colwidth = cs_sec.get("colwidth")
|
1548
1559
|
tg_align["colinfo"].append(
|
1549
1560
|
{"colname": colname, "colwidth": colwidth}
|
1550
1561
|
)
|
@@ -1565,16 +1576,23 @@ class XmlTable:
|
|
1565
1576
|
table_data: list[TableCell] = []
|
1566
1577
|
i_row_global = 0
|
1567
1578
|
is_row_empty: bool = True
|
1568
|
-
tg_secs = table
|
1579
|
+
tg_secs = table("tgroup")
|
1569
1580
|
if tg_secs:
|
1570
1581
|
for itg, tg_sec in enumerate(tg_secs):
|
1582
|
+
if not isinstance(tg_sec, Tag):
|
1583
|
+
continue
|
1571
1584
|
tg_range = tgs_range[itg]
|
1572
|
-
row_secs = tg_sec
|
1585
|
+
row_secs = tg_sec(["row", "tr"])
|
1573
1586
|
|
1574
1587
|
if row_secs:
|
1575
1588
|
for row_sec in row_secs:
|
1576
|
-
|
1577
|
-
|
1589
|
+
if not isinstance(row_sec, Tag):
|
1590
|
+
continue
|
1591
|
+
entry_secs = row_sec(["entry", "td"])
|
1592
|
+
is_header: bool = (
|
1593
|
+
row_sec.parent is not None
|
1594
|
+
and row_sec.parent.name == "thead"
|
1595
|
+
)
|
1578
1596
|
|
1579
1597
|
ncols = 0
|
1580
1598
|
local_row: list[TableCell] = []
|
@@ -1582,23 +1600,26 @@ class XmlTable:
|
|
1582
1600
|
if entry_secs:
|
1583
1601
|
wrong_nbr_cols = False
|
1584
1602
|
for ientry, entry_sec in enumerate(entry_secs):
|
1603
|
+
if not isinstance(entry_sec, Tag):
|
1604
|
+
continue
|
1585
1605
|
text = entry_sec.get_text().strip()
|
1586
1606
|
|
1587
1607
|
# start-end
|
1588
|
-
namest = entry_sec.
|
1589
|
-
nameend = entry_sec.
|
1590
|
-
|
1591
|
-
|
1592
|
-
|
1593
|
-
|
1608
|
+
namest = entry_sec.get("namest")
|
1609
|
+
nameend = entry_sec.get("nameend")
|
1610
|
+
start = (
|
1611
|
+
int(namest)
|
1612
|
+
if isinstance(namest, str) and namest.isnumeric()
|
1613
|
+
else ientry + 1
|
1614
|
+
)
|
1594
1615
|
if isinstance(nameend, str) and nameend.isnumeric():
|
1595
|
-
|
1616
|
+
end = int(nameend)
|
1596
1617
|
shift = 0
|
1597
1618
|
else:
|
1598
|
-
|
1619
|
+
end = ientry + 2
|
1599
1620
|
shift = 1
|
1600
1621
|
|
1601
|
-
if
|
1622
|
+
if end > len(tg_range["cell_offst"]):
|
1602
1623
|
wrong_nbr_cols = True
|
1603
1624
|
self.nbr_messages += 1
|
1604
1625
|
if self.nbr_messages <= self.max_nbr_messages:
|
@@ -1608,8 +1629,8 @@ class XmlTable:
|
|
1608
1629
|
break
|
1609
1630
|
|
1610
1631
|
range_ = [
|
1611
|
-
tg_range["cell_offst"][
|
1612
|
-
tg_range["cell_offst"][
|
1632
|
+
tg_range["cell_offst"][start - 1],
|
1633
|
+
tg_range["cell_offst"][end - 1] - shift,
|
1613
1634
|
]
|
1614
1635
|
|
1615
1636
|
# add row and replicate cell if needed
|
@@ -1668,7 +1689,7 @@ class XmlTable:
|
|
1668
1689
|
A docling table data.
|
1669
1690
|
"""
|
1670
1691
|
section = self._soup.find("table")
|
1671
|
-
if section
|
1692
|
+
if isinstance(section, Tag):
|
1672
1693
|
table = self._parse_table(section)
|
1673
1694
|
if table.num_rows == 0 or table.num_cols == 0:
|
1674
1695
|
_log.warning("The parsed USPTO table is empty")
|
@@ -52,6 +52,14 @@ class PageAssembleModel(BasePageModel):
|
|
52
52
|
|
53
53
|
sanitized_text = "".join(lines)
|
54
54
|
|
55
|
+
# Text normalization
|
56
|
+
sanitized_text = sanitized_text.replace("⁄", "/")
|
57
|
+
sanitized_text = sanitized_text.replace("’", "'")
|
58
|
+
sanitized_text = sanitized_text.replace("‘", "'")
|
59
|
+
sanitized_text = sanitized_text.replace("“", '"')
|
60
|
+
sanitized_text = sanitized_text.replace("”", '"')
|
61
|
+
sanitized_text = sanitized_text.replace("•", "·")
|
62
|
+
|
55
63
|
return sanitized_text.strip() # Strip any leading or trailing whitespace
|
56
64
|
|
57
65
|
def __call__(
|
@@ -0,0 +1,389 @@
|
|
1
|
+
import copy
|
2
|
+
import random
|
3
|
+
from pathlib import Path
|
4
|
+
from typing import Dict, List
|
5
|
+
|
6
|
+
from docling_core.types.doc import (
|
7
|
+
BoundingBox,
|
8
|
+
CoordOrigin,
|
9
|
+
DocItem,
|
10
|
+
DocItemLabel,
|
11
|
+
DoclingDocument,
|
12
|
+
DocumentOrigin,
|
13
|
+
GroupLabel,
|
14
|
+
NodeItem,
|
15
|
+
ProvenanceItem,
|
16
|
+
RefItem,
|
17
|
+
TableData,
|
18
|
+
)
|
19
|
+
from docling_core.types.doc.document import ContentLayer
|
20
|
+
from docling_core.types.legacy_doc.base import Ref
|
21
|
+
from docling_core.types.legacy_doc.document import BaseText
|
22
|
+
from docling_ibm_models.reading_order.reading_order_rb import (
|
23
|
+
PageElement as ReadingOrderPageElement,
|
24
|
+
)
|
25
|
+
from docling_ibm_models.reading_order.reading_order_rb import ReadingOrderPredictor
|
26
|
+
from PIL import ImageDraw
|
27
|
+
from pydantic import BaseModel, ConfigDict
|
28
|
+
|
29
|
+
from docling.datamodel.base_models import (
|
30
|
+
BasePageElement,
|
31
|
+
Cluster,
|
32
|
+
ContainerElement,
|
33
|
+
FigureElement,
|
34
|
+
Table,
|
35
|
+
TextElement,
|
36
|
+
)
|
37
|
+
from docling.datamodel.document import ConversionResult
|
38
|
+
from docling.datamodel.settings import settings
|
39
|
+
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
40
|
+
|
41
|
+
|
42
|
+
class ReadingOrderOptions(BaseModel):
|
43
|
+
model_config = ConfigDict(protected_namespaces=())
|
44
|
+
|
45
|
+
model_names: str = "" # e.g. "language;term;reference"
|
46
|
+
|
47
|
+
|
48
|
+
class ReadingOrderModel:
|
49
|
+
def __init__(self, options: ReadingOrderOptions):
|
50
|
+
self.options = options
|
51
|
+
self.ro_model = ReadingOrderPredictor()
|
52
|
+
|
53
|
+
def _assembled_to_readingorder_elements(
|
54
|
+
self, conv_res: ConversionResult
|
55
|
+
) -> List[ReadingOrderPageElement]:
|
56
|
+
|
57
|
+
elements: List[ReadingOrderPageElement] = []
|
58
|
+
page_no_to_pages = {p.page_no: p for p in conv_res.pages}
|
59
|
+
|
60
|
+
for element in conv_res.assembled.elements:
|
61
|
+
|
62
|
+
page_height = page_no_to_pages[element.page_no].size.height # type: ignore
|
63
|
+
bbox = element.cluster.bbox.to_bottom_left_origin(page_height)
|
64
|
+
text = element.text or ""
|
65
|
+
|
66
|
+
elements.append(
|
67
|
+
ReadingOrderPageElement(
|
68
|
+
cid=len(elements),
|
69
|
+
ref=RefItem(cref=f"#/{element.page_no}/{element.cluster.id}"),
|
70
|
+
text=text,
|
71
|
+
page_no=element.page_no,
|
72
|
+
page_size=page_no_to_pages[element.page_no].size,
|
73
|
+
label=element.label,
|
74
|
+
l=bbox.l,
|
75
|
+
r=bbox.r,
|
76
|
+
b=bbox.b,
|
77
|
+
t=bbox.t,
|
78
|
+
coord_origin=bbox.coord_origin,
|
79
|
+
)
|
80
|
+
)
|
81
|
+
|
82
|
+
return elements
|
83
|
+
|
84
|
+
def _add_child_elements(
|
85
|
+
self, element: BasePageElement, doc_item: NodeItem, doc: DoclingDocument
|
86
|
+
):
|
87
|
+
|
88
|
+
child: Cluster
|
89
|
+
for child in element.cluster.children:
|
90
|
+
c_label = child.label
|
91
|
+
c_bbox = child.bbox.to_bottom_left_origin(
|
92
|
+
doc.pages[element.page_no + 1].size.height
|
93
|
+
)
|
94
|
+
c_text = " ".join(
|
95
|
+
[
|
96
|
+
cell.text.replace("\x02", "-").strip()
|
97
|
+
for cell in child.cells
|
98
|
+
if len(cell.text.strip()) > 0
|
99
|
+
]
|
100
|
+
)
|
101
|
+
|
102
|
+
c_prov = ProvenanceItem(
|
103
|
+
page_no=element.page_no + 1, charspan=(0, len(c_text)), bbox=c_bbox
|
104
|
+
)
|
105
|
+
if c_label == DocItemLabel.LIST_ITEM:
|
106
|
+
# TODO: Infer if this is a numbered or a bullet list item
|
107
|
+
doc.add_list_item(parent=doc_item, text=c_text, prov=c_prov)
|
108
|
+
elif c_label == DocItemLabel.SECTION_HEADER:
|
109
|
+
doc.add_heading(parent=doc_item, text=c_text, prov=c_prov)
|
110
|
+
else:
|
111
|
+
doc.add_text(parent=doc_item, label=c_label, text=c_text, prov=c_prov)
|
112
|
+
|
113
|
+
def _readingorder_elements_to_docling_doc(
|
114
|
+
self,
|
115
|
+
conv_res: ConversionResult,
|
116
|
+
ro_elements: List[ReadingOrderPageElement],
|
117
|
+
el_to_captions_mapping: Dict[int, List[int]],
|
118
|
+
el_to_footnotes_mapping: Dict[int, List[int]],
|
119
|
+
el_merges_mapping: Dict[int, List[int]],
|
120
|
+
) -> DoclingDocument:
|
121
|
+
|
122
|
+
id_to_elem = {
|
123
|
+
RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem
|
124
|
+
for elem in conv_res.assembled.elements
|
125
|
+
}
|
126
|
+
cid_to_rels = {rel.cid: rel for rel in ro_elements}
|
127
|
+
|
128
|
+
origin = DocumentOrigin(
|
129
|
+
mimetype="application/pdf",
|
130
|
+
filename=conv_res.input.file.name,
|
131
|
+
binary_hash=conv_res.input.document_hash,
|
132
|
+
)
|
133
|
+
doc_name = Path(origin.filename).stem
|
134
|
+
out_doc: DoclingDocument = DoclingDocument(name=doc_name, origin=origin)
|
135
|
+
|
136
|
+
for page in conv_res.pages:
|
137
|
+
page_no = page.page_no + 1
|
138
|
+
size = page.size
|
139
|
+
|
140
|
+
assert size is not None
|
141
|
+
|
142
|
+
out_doc.add_page(page_no=page_no, size=size)
|
143
|
+
|
144
|
+
current_list = None
|
145
|
+
skippable_cids = {
|
146
|
+
cid
|
147
|
+
for mapping in (
|
148
|
+
el_to_captions_mapping,
|
149
|
+
el_to_footnotes_mapping,
|
150
|
+
el_merges_mapping,
|
151
|
+
)
|
152
|
+
for lst in mapping.values()
|
153
|
+
for cid in lst
|
154
|
+
}
|
155
|
+
|
156
|
+
page_no_to_pages = {p.page_no: p for p in conv_res.pages}
|
157
|
+
|
158
|
+
for rel in ro_elements:
|
159
|
+
if rel.cid in skippable_cids:
|
160
|
+
continue
|
161
|
+
element = id_to_elem[rel.ref.cref]
|
162
|
+
|
163
|
+
page_height = page_no_to_pages[element.page_no].size.height # type: ignore
|
164
|
+
|
165
|
+
if isinstance(element, TextElement):
|
166
|
+
if element.label == DocItemLabel.CODE:
|
167
|
+
cap_text = element.text
|
168
|
+
prov = ProvenanceItem(
|
169
|
+
page_no=element.page_no + 1,
|
170
|
+
charspan=(0, len(cap_text)),
|
171
|
+
bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
|
172
|
+
)
|
173
|
+
code_item = out_doc.add_code(text=cap_text, prov=prov)
|
174
|
+
|
175
|
+
if rel.cid in el_to_captions_mapping.keys():
|
176
|
+
for caption_cid in el_to_captions_mapping[rel.cid]:
|
177
|
+
caption_elem = id_to_elem[cid_to_rels[caption_cid].ref.cref]
|
178
|
+
new_cap_item = self._add_caption_or_footnote(
|
179
|
+
caption_elem, out_doc, code_item, page_height
|
180
|
+
)
|
181
|
+
|
182
|
+
code_item.captions.append(new_cap_item.get_ref())
|
183
|
+
|
184
|
+
if rel.cid in el_to_footnotes_mapping.keys():
|
185
|
+
for footnote_cid in el_to_footnotes_mapping[rel.cid]:
|
186
|
+
footnote_elem = id_to_elem[
|
187
|
+
cid_to_rels[footnote_cid].ref.cref
|
188
|
+
]
|
189
|
+
new_footnote_item = self._add_caption_or_footnote(
|
190
|
+
footnote_elem, out_doc, code_item, page_height
|
191
|
+
)
|
192
|
+
|
193
|
+
code_item.footnotes.append(new_footnote_item.get_ref())
|
194
|
+
else:
|
195
|
+
|
196
|
+
new_item, current_list = self._handle_text_element(
|
197
|
+
element, out_doc, current_list, page_height
|
198
|
+
)
|
199
|
+
|
200
|
+
if rel.cid in el_merges_mapping.keys():
|
201
|
+
for merged_cid in el_merges_mapping[rel.cid]:
|
202
|
+
merged_elem = id_to_elem[cid_to_rels[merged_cid].ref.cref]
|
203
|
+
|
204
|
+
self._merge_elements(
|
205
|
+
element, merged_elem, new_item, page_height
|
206
|
+
)
|
207
|
+
|
208
|
+
elif isinstance(element, Table):
|
209
|
+
|
210
|
+
tbl_data = TableData(
|
211
|
+
num_rows=element.num_rows,
|
212
|
+
num_cols=element.num_cols,
|
213
|
+
table_cells=element.table_cells,
|
214
|
+
)
|
215
|
+
|
216
|
+
prov = ProvenanceItem(
|
217
|
+
page_no=element.page_no + 1,
|
218
|
+
charspan=(0, 0),
|
219
|
+
bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
|
220
|
+
)
|
221
|
+
|
222
|
+
tbl = out_doc.add_table(
|
223
|
+
data=tbl_data, prov=prov, label=element.cluster.label
|
224
|
+
)
|
225
|
+
|
226
|
+
if rel.cid in el_to_captions_mapping.keys():
|
227
|
+
for caption_cid in el_to_captions_mapping[rel.cid]:
|
228
|
+
caption_elem = id_to_elem[cid_to_rels[caption_cid].ref.cref]
|
229
|
+
new_cap_item = self._add_caption_or_footnote(
|
230
|
+
caption_elem, out_doc, tbl, page_height
|
231
|
+
)
|
232
|
+
|
233
|
+
tbl.captions.append(new_cap_item.get_ref())
|
234
|
+
|
235
|
+
if rel.cid in el_to_footnotes_mapping.keys():
|
236
|
+
for footnote_cid in el_to_footnotes_mapping[rel.cid]:
|
237
|
+
footnote_elem = id_to_elem[cid_to_rels[footnote_cid].ref.cref]
|
238
|
+
new_footnote_item = self._add_caption_or_footnote(
|
239
|
+
footnote_elem, out_doc, tbl, page_height
|
240
|
+
)
|
241
|
+
|
242
|
+
tbl.footnotes.append(new_footnote_item.get_ref())
|
243
|
+
|
244
|
+
# TODO: Consider adding children of Table.
|
245
|
+
|
246
|
+
elif isinstance(element, FigureElement):
|
247
|
+
cap_text = ""
|
248
|
+
prov = ProvenanceItem(
|
249
|
+
page_no=element.page_no + 1,
|
250
|
+
charspan=(0, len(cap_text)),
|
251
|
+
bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
|
252
|
+
)
|
253
|
+
pic = out_doc.add_picture(prov=prov)
|
254
|
+
|
255
|
+
if rel.cid in el_to_captions_mapping.keys():
|
256
|
+
for caption_cid in el_to_captions_mapping[rel.cid]:
|
257
|
+
caption_elem = id_to_elem[cid_to_rels[caption_cid].ref.cref]
|
258
|
+
new_cap_item = self._add_caption_or_footnote(
|
259
|
+
caption_elem, out_doc, pic, page_height
|
260
|
+
)
|
261
|
+
|
262
|
+
pic.captions.append(new_cap_item.get_ref())
|
263
|
+
|
264
|
+
if rel.cid in el_to_footnotes_mapping.keys():
|
265
|
+
for footnote_cid in el_to_footnotes_mapping[rel.cid]:
|
266
|
+
footnote_elem = id_to_elem[cid_to_rels[footnote_cid].ref.cref]
|
267
|
+
new_footnote_item = self._add_caption_or_footnote(
|
268
|
+
footnote_elem, out_doc, pic, page_height
|
269
|
+
)
|
270
|
+
|
271
|
+
pic.footnotes.append(new_footnote_item.get_ref())
|
272
|
+
|
273
|
+
self._add_child_elements(element, pic, out_doc)
|
274
|
+
|
275
|
+
elif isinstance(element, ContainerElement): # Form, KV region
|
276
|
+
label = element.label
|
277
|
+
group_label = GroupLabel.UNSPECIFIED
|
278
|
+
if label == DocItemLabel.FORM:
|
279
|
+
group_label = GroupLabel.FORM_AREA
|
280
|
+
elif label == DocItemLabel.KEY_VALUE_REGION:
|
281
|
+
group_label = GroupLabel.KEY_VALUE_AREA
|
282
|
+
|
283
|
+
container_el = out_doc.add_group(label=group_label)
|
284
|
+
|
285
|
+
self._add_child_elements(element, container_el, out_doc)
|
286
|
+
|
287
|
+
return out_doc
|
288
|
+
|
289
|
+
def _add_caption_or_footnote(self, elem, out_doc, parent, page_height):
|
290
|
+
assert isinstance(elem, TextElement)
|
291
|
+
text = elem.text
|
292
|
+
prov = ProvenanceItem(
|
293
|
+
page_no=elem.page_no + 1,
|
294
|
+
charspan=(0, len(text)),
|
295
|
+
bbox=elem.cluster.bbox.to_bottom_left_origin(page_height),
|
296
|
+
)
|
297
|
+
new_item = out_doc.add_text(
|
298
|
+
label=elem.label, text=text, prov=prov, parent=parent
|
299
|
+
)
|
300
|
+
return new_item
|
301
|
+
|
302
|
+
def _handle_text_element(self, element, out_doc, current_list, page_height):
|
303
|
+
cap_text = element.text
|
304
|
+
|
305
|
+
prov = ProvenanceItem(
|
306
|
+
page_no=element.page_no + 1,
|
307
|
+
charspan=(0, len(cap_text)),
|
308
|
+
bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
|
309
|
+
)
|
310
|
+
label = element.label
|
311
|
+
if label == DocItemLabel.LIST_ITEM:
|
312
|
+
if current_list is None:
|
313
|
+
current_list = out_doc.add_group(label=GroupLabel.LIST, name="list")
|
314
|
+
|
315
|
+
# TODO: Infer if this is a numbered or a bullet list item
|
316
|
+
new_item = out_doc.add_list_item(
|
317
|
+
text=cap_text, enumerated=False, prov=prov, parent=current_list
|
318
|
+
)
|
319
|
+
elif label == DocItemLabel.SECTION_HEADER:
|
320
|
+
current_list = None
|
321
|
+
|
322
|
+
new_item = out_doc.add_heading(text=cap_text, prov=prov)
|
323
|
+
elif label == DocItemLabel.FORMULA:
|
324
|
+
current_list = None
|
325
|
+
|
326
|
+
new_item = out_doc.add_text(
|
327
|
+
label=DocItemLabel.FORMULA, text="", orig=cap_text, prov=prov
|
328
|
+
)
|
329
|
+
else:
|
330
|
+
current_list = None
|
331
|
+
|
332
|
+
content_layer = ContentLayer.BODY
|
333
|
+
if element.label in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
|
334
|
+
content_layer = ContentLayer.FURNITURE
|
335
|
+
|
336
|
+
new_item = out_doc.add_text(
|
337
|
+
label=element.label,
|
338
|
+
text=cap_text,
|
339
|
+
prov=prov,
|
340
|
+
content_layer=content_layer,
|
341
|
+
)
|
342
|
+
return new_item, current_list
|
343
|
+
|
344
|
+
def _merge_elements(self, element, merged_elem, new_item, page_height):
|
345
|
+
assert isinstance(
|
346
|
+
merged_elem, type(element)
|
347
|
+
), "Merged element must be of same type as element."
|
348
|
+
assert (
|
349
|
+
merged_elem.label == new_item.label
|
350
|
+
), "Labels of merged elements must match."
|
351
|
+
prov = ProvenanceItem(
|
352
|
+
page_no=element.page_no + 1,
|
353
|
+
charspan=(
|
354
|
+
len(new_item.text) + 1,
|
355
|
+
len(new_item.text) + 1 + len(merged_elem.text),
|
356
|
+
),
|
357
|
+
bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
|
358
|
+
)
|
359
|
+
new_item.text += f" {merged_elem.text}"
|
360
|
+
new_item.orig += f" {merged_elem.text}" # TODO: This is incomplete, we don't have the `orig` field of the merged element.
|
361
|
+
new_item.prov.append(prov)
|
362
|
+
|
363
|
+
def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
|
364
|
+
with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
|
365
|
+
page_elements = self._assembled_to_readingorder_elements(conv_res)
|
366
|
+
|
367
|
+
# Apply reading order
|
368
|
+
sorted_elements = self.ro_model.predict_reading_order(
|
369
|
+
page_elements=page_elements
|
370
|
+
)
|
371
|
+
el_to_captions_mapping = self.ro_model.predict_to_captions(
|
372
|
+
sorted_elements=sorted_elements
|
373
|
+
)
|
374
|
+
el_to_footnotes_mapping = self.ro_model.predict_to_footnotes(
|
375
|
+
sorted_elements=sorted_elements
|
376
|
+
)
|
377
|
+
el_merges_mapping = self.ro_model.predict_merges(
|
378
|
+
sorted_elements=sorted_elements
|
379
|
+
)
|
380
|
+
|
381
|
+
docling_doc: DoclingDocument = self._readingorder_elements_to_docling_doc(
|
382
|
+
conv_res,
|
383
|
+
sorted_elements,
|
384
|
+
el_to_captions_mapping,
|
385
|
+
el_to_footnotes_mapping,
|
386
|
+
el_merges_mapping,
|
387
|
+
)
|
388
|
+
|
389
|
+
return docling_doc
|
@@ -114,7 +114,9 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
114
114
|
# _log.info("df: ", df.head())
|
115
115
|
|
116
116
|
# Filter rows that contain actual text (ignore header or empty rows)
|
117
|
-
df_filtered = df[
|
117
|
+
df_filtered = df[
|
118
|
+
df["text"].notnull() & (df["text"].apply(str).str.strip() != "")
|
119
|
+
]
|
118
120
|
|
119
121
|
return df_filtered
|
120
122
|
|
@@ -27,7 +27,6 @@ from docling.models.document_picture_classifier import (
|
|
27
27
|
DocumentPictureClassifier,
|
28
28
|
DocumentPictureClassifierOptions,
|
29
29
|
)
|
30
|
-
from docling.models.ds_glm_model import GlmModel, GlmOptions
|
31
30
|
from docling.models.easyocr_model import EasyOcrModel
|
32
31
|
from docling.models.layout_model import LayoutModel
|
33
32
|
from docling.models.ocr_mac_model import OcrMacModel
|
@@ -40,6 +39,7 @@ from docling.models.picture_description_api_model import PictureDescriptionApiMo
|
|
40
39
|
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
|
41
40
|
from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
|
42
41
|
from docling.models.rapid_ocr_model import RapidOcrModel
|
42
|
+
from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
|
43
43
|
from docling.models.table_structure_model import TableStructureModel
|
44
44
|
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
45
45
|
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
@@ -76,7 +76,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
76
76
|
or self.pipeline_options.generate_table_images
|
77
77
|
)
|
78
78
|
|
79
|
-
self.glm_model =
|
79
|
+
self.glm_model = ReadingOrderModel(options=ReadingOrderOptions())
|
80
80
|
|
81
81
|
if (ocr_model := self.get_ocr_model(artifacts_path=artifacts_path)) is None:
|
82
82
|
raise RuntimeError(
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.
|
3
|
+
Version: 2.24.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -25,11 +25,10 @@ Provides-Extra: ocrmac
|
|
25
25
|
Provides-Extra: rapidocr
|
26
26
|
Provides-Extra: tesserocr
|
27
27
|
Provides-Extra: vlm
|
28
|
-
Requires-Dist: beautifulsoup4 (>=4.12.3,<
|
28
|
+
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
29
29
|
Requires-Dist: certifi (>=2024.7.4)
|
30
|
-
Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
|
31
30
|
Requires-Dist: docling-core[chunking] (>=2.19.0,<3.0.0)
|
32
|
-
Requires-Dist: docling-ibm-models (>=3.
|
31
|
+
Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
|
33
32
|
Requires-Dist: docling-parse (>=3.3.0,<4.0.0)
|
34
33
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
35
34
|
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
@@ -5,7 +5,7 @@ docling/backend/asciidoc_backend.py,sha256=zyHxlG_BvlLwvpdNca3P6aopxOJZw8wbDFkJQ
|
|
5
5
|
docling/backend/csv_backend.py,sha256=xuId4JGEXjoyPgO9Fy9hQ5C-ezXvJwv0TGB8fyFHgWM,4533
|
6
6
|
docling/backend/docling_parse_backend.py,sha256=hEEJibI1oJS0LAnFoIs6gMshS3bCqGtVxHnDNvBGZuA,7649
|
7
7
|
docling/backend/docling_parse_v2_backend.py,sha256=IpwrBrtLGwNRl5AYO-o3NjEfNRsAkuMhzvDt2HXb9Ko,8655
|
8
|
-
docling/backend/html_backend.py,sha256=
|
8
|
+
docling/backend/html_backend.py,sha256=BxYvYmgcio6IqROMFKgyYyoankcNUccalCeYlmTE4fk,16094
|
9
9
|
docling/backend/json/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
10
|
docling/backend/json/docling_json_backend.py,sha256=LlFMVoZrrCfVwbDuRbNN4Xg96Lujh4xxrTBt9jGhY9I,1984
|
11
11
|
docling/backend/md_backend.py,sha256=NaVfcnEH-5bwVovjn76EobF6B6Wm8AhaTZ4E8k0TUPo,16826
|
@@ -15,8 +15,8 @@ docling/backend/msword_backend.py,sha256=V4miLIcOH8DDlSCm25F_DALBW60Uf9JoSS0TB4y
|
|
15
15
|
docling/backend/pdf_backend.py,sha256=17Pr8dWsD1C4FYUprrwMM9trDGW-JYLjrcScx1Ul4io,2048
|
16
16
|
docling/backend/pypdfium2_backend.py,sha256=QSPfp903ZtSpoNqPmcIek0HmvETrJ1kkwrdxnF5pjS0,9014
|
17
17
|
docling/backend/xml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
|
-
docling/backend/xml/jats_backend.py,sha256=
|
19
|
-
docling/backend/xml/uspto_backend.py,sha256=
|
18
|
+
docling/backend/xml/jats_backend.py,sha256=HXailrDjiwu4swwFnXy3lNfRtLZmkBBp4yqafCvdr7s,24945
|
19
|
+
docling/backend/xml/uspto_backend.py,sha256=IGUNeF2xpLeaVrX6nKb-jXgtSYD2ozULsrDPcrI1IbQ,71040
|
20
20
|
docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
|
21
21
|
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
22
22
|
docling/cli/main.py,sha256=pCJ_GFgxsgZ0soz32OhMl-CWi7YXIrvax_m9Qw4UhMs,16839
|
@@ -34,23 +34,23 @@ docling/models/base_model.py,sha256=q_lKeQ0FT70idXlZ3JgyAv8dA8J3bZWBSDBkqTzy0lo,
|
|
34
34
|
docling/models/base_ocr_model.py,sha256=YiUMvdjnHw9SHjnfJKT5INrPMoIGEf_Z2OApfl_VRTE,6919
|
35
35
|
docling/models/code_formula_model.py,sha256=6grbRPWaLljadheT5s4omdT6hmXfin4gJU17csWvhjY,8611
|
36
36
|
docling/models/document_picture_classifier.py,sha256=6I_j6fG5fnhIV6rqN31LYikNTZyg5isXrVs0GIqHDaY,6235
|
37
|
-
docling/models/ds_glm_model.py,sha256=1jLEM-B_oHFevKq23zDQpdifE3eJL7qiLr5YLpEf1kQ,15217
|
38
37
|
docling/models/easyocr_model.py,sha256=ePg1exAXeOzkBRBT-6PBSmqKFmnNFkCEd4HNDsGVgLM,6860
|
39
38
|
docling/models/layout_model.py,sha256=7fQWipGV1HDrvbP4uOKa9QAicQl89jp7lailQmbFL3w,7804
|
40
39
|
docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpan0,4502
|
41
|
-
docling/models/page_assemble_model.py,sha256=
|
40
|
+
docling/models/page_assemble_model.py,sha256=ivkCdbZJpFcGl7CazLegcP1tLK8ZixDfVhQXqsdW_UA,6359
|
42
41
|
docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
|
43
42
|
docling/models/picture_description_api_model.py,sha256=SKNoHpqzbfM8iO-DJJ4ccyNVqO0B2d9neLBnXqt50FY,3186
|
44
43
|
docling/models/picture_description_base_model.py,sha256=rZLIW1_CaRAw_EP3zuI8ktC0ZxwO7yubhh2RkaC_8e8,1910
|
45
44
|
docling/models/picture_description_vlm_model.py,sha256=a2vYUdlcA0--_8neY0tTiU8reCf29NCbVMKwWdMy2QQ,3653
|
46
45
|
docling/models/rapid_ocr_model.py,sha256=2HXmurNRPP6qyqn7U5h9NQIs8zi0TMHf56CpcKQk0fU,5038
|
46
|
+
docling/models/readingorder_model.py,sha256=hNWbBX3uZv1FxMwKNKn2JFQuQqTspBLsJBVEidXr6Wk,14869
|
47
47
|
docling/models/table_structure_model.py,sha256=UIqWlw_9JNfGsO86c00rPb4GCg-yNliKEwyhCqlsZbM,11225
|
48
|
-
docling/models/tesseract_ocr_cli_model.py,sha256=
|
48
|
+
docling/models/tesseract_ocr_cli_model.py,sha256=F5EhS4NDEmLkPq-a0P7o2LrzjmJgACzlYXTDvtD3NtY,9343
|
49
49
|
docling/models/tesseract_ocr_model.py,sha256=ikGu6QNknLG64c9yYIb0Ix6MGhBzOoa1ODbNc8MT5r8,8508
|
50
50
|
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
51
51
|
docling/pipeline/base_pipeline.py,sha256=9ABK-Cr235bxE5vweoIA5rgBZV_EF8qFxAqLI27H_Pg,8749
|
52
52
|
docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
|
53
|
-
docling/pipeline/standard_pdf_pipeline.py,sha256=
|
53
|
+
docling/pipeline/standard_pdf_pipeline.py,sha256=IQHktVYvueTrYnIgLonaMvfYKKsU3L-hC9dqrR-Lw8g,12904
|
54
54
|
docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
55
55
|
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
56
56
|
docling/utils/accelerator_utils.py,sha256=ONNRrC8fH-8E93WUCNhfOq1t7WrQ1T7-YsmExTOY5f0,2292
|
@@ -62,8 +62,8 @@ docling/utils/ocr_utils.py,sha256=F7iOOjqolUcImUzir4qjDQd4QWSO3s6JC4WRn3U7uY4,26
|
|
62
62
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
63
63
|
docling/utils/utils.py,sha256=0ozCk7zUkYzxRVmYoIB2zA1lqjQOuaQzxfGuf1wmKW4,1866
|
64
64
|
docling/utils/visualization.py,sha256=4pn-80fVuE04ken7hUg5Ar47ndRSL9MWBgdHM-1g1zU,2735
|
65
|
-
docling-2.
|
66
|
-
docling-2.
|
67
|
-
docling-2.
|
68
|
-
docling-2.
|
69
|
-
docling-2.
|
65
|
+
docling-2.24.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
66
|
+
docling-2.24.0.dist-info/METADATA,sha256=0MJ5mBt0GwsZotaSpHnAWzdzWcu_BQFGqGzNR3gRpG4,8672
|
67
|
+
docling-2.24.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
68
|
+
docling-2.24.0.dist-info/entry_points.txt,sha256=cFrINXsORijdm2EWJzf1m9_rDxH9G9W1fP385-9atY4,84
|
69
|
+
docling-2.24.0.dist-info/RECORD,,
|