docling 2.23.1__py3-none-any.whl → 2.24.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/models/page_assemble_model.py +8 -0
- docling/models/readingorder_model.py +389 -0
- docling/pipeline/standard_pdf_pipeline.py +2 -2
- {docling-2.23.1.dist-info → docling-2.24.0.dist-info}/METADATA +2 -3
- {docling-2.23.1.dist-info → docling-2.24.0.dist-info}/RECORD +8 -8
- docling/models/ds_glm_model.py +0 -386
- {docling-2.23.1.dist-info → docling-2.24.0.dist-info}/LICENSE +0 -0
- {docling-2.23.1.dist-info → docling-2.24.0.dist-info}/WHEEL +0 -0
- {docling-2.23.1.dist-info → docling-2.24.0.dist-info}/entry_points.txt +0 -0
@@ -52,6 +52,14 @@ class PageAssembleModel(BasePageModel):
|
|
52
52
|
|
53
53
|
sanitized_text = "".join(lines)
|
54
54
|
|
55
|
+
# Text normalization
|
56
|
+
sanitized_text = sanitized_text.replace("⁄", "/")
|
57
|
+
sanitized_text = sanitized_text.replace("’", "'")
|
58
|
+
sanitized_text = sanitized_text.replace("‘", "'")
|
59
|
+
sanitized_text = sanitized_text.replace("“", '"')
|
60
|
+
sanitized_text = sanitized_text.replace("”", '"')
|
61
|
+
sanitized_text = sanitized_text.replace("•", "·")
|
62
|
+
|
55
63
|
return sanitized_text.strip() # Strip any leading or trailing whitespace
|
56
64
|
|
57
65
|
def __call__(
|
@@ -0,0 +1,389 @@
|
|
1
|
+
import copy
|
2
|
+
import random
|
3
|
+
from pathlib import Path
|
4
|
+
from typing import Dict, List
|
5
|
+
|
6
|
+
from docling_core.types.doc import (
|
7
|
+
BoundingBox,
|
8
|
+
CoordOrigin,
|
9
|
+
DocItem,
|
10
|
+
DocItemLabel,
|
11
|
+
DoclingDocument,
|
12
|
+
DocumentOrigin,
|
13
|
+
GroupLabel,
|
14
|
+
NodeItem,
|
15
|
+
ProvenanceItem,
|
16
|
+
RefItem,
|
17
|
+
TableData,
|
18
|
+
)
|
19
|
+
from docling_core.types.doc.document import ContentLayer
|
20
|
+
from docling_core.types.legacy_doc.base import Ref
|
21
|
+
from docling_core.types.legacy_doc.document import BaseText
|
22
|
+
from docling_ibm_models.reading_order.reading_order_rb import (
|
23
|
+
PageElement as ReadingOrderPageElement,
|
24
|
+
)
|
25
|
+
from docling_ibm_models.reading_order.reading_order_rb import ReadingOrderPredictor
|
26
|
+
from PIL import ImageDraw
|
27
|
+
from pydantic import BaseModel, ConfigDict
|
28
|
+
|
29
|
+
from docling.datamodel.base_models import (
|
30
|
+
BasePageElement,
|
31
|
+
Cluster,
|
32
|
+
ContainerElement,
|
33
|
+
FigureElement,
|
34
|
+
Table,
|
35
|
+
TextElement,
|
36
|
+
)
|
37
|
+
from docling.datamodel.document import ConversionResult
|
38
|
+
from docling.datamodel.settings import settings
|
39
|
+
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
40
|
+
|
41
|
+
|
42
|
+
class ReadingOrderOptions(BaseModel):
|
43
|
+
model_config = ConfigDict(protected_namespaces=())
|
44
|
+
|
45
|
+
model_names: str = "" # e.g. "language;term;reference"
|
46
|
+
|
47
|
+
|
48
|
+
class ReadingOrderModel:
|
49
|
+
def __init__(self, options: ReadingOrderOptions):
|
50
|
+
self.options = options
|
51
|
+
self.ro_model = ReadingOrderPredictor()
|
52
|
+
|
53
|
+
def _assembled_to_readingorder_elements(
|
54
|
+
self, conv_res: ConversionResult
|
55
|
+
) -> List[ReadingOrderPageElement]:
|
56
|
+
|
57
|
+
elements: List[ReadingOrderPageElement] = []
|
58
|
+
page_no_to_pages = {p.page_no: p for p in conv_res.pages}
|
59
|
+
|
60
|
+
for element in conv_res.assembled.elements:
|
61
|
+
|
62
|
+
page_height = page_no_to_pages[element.page_no].size.height # type: ignore
|
63
|
+
bbox = element.cluster.bbox.to_bottom_left_origin(page_height)
|
64
|
+
text = element.text or ""
|
65
|
+
|
66
|
+
elements.append(
|
67
|
+
ReadingOrderPageElement(
|
68
|
+
cid=len(elements),
|
69
|
+
ref=RefItem(cref=f"#/{element.page_no}/{element.cluster.id}"),
|
70
|
+
text=text,
|
71
|
+
page_no=element.page_no,
|
72
|
+
page_size=page_no_to_pages[element.page_no].size,
|
73
|
+
label=element.label,
|
74
|
+
l=bbox.l,
|
75
|
+
r=bbox.r,
|
76
|
+
b=bbox.b,
|
77
|
+
t=bbox.t,
|
78
|
+
coord_origin=bbox.coord_origin,
|
79
|
+
)
|
80
|
+
)
|
81
|
+
|
82
|
+
return elements
|
83
|
+
|
84
|
+
def _add_child_elements(
|
85
|
+
self, element: BasePageElement, doc_item: NodeItem, doc: DoclingDocument
|
86
|
+
):
|
87
|
+
|
88
|
+
child: Cluster
|
89
|
+
for child in element.cluster.children:
|
90
|
+
c_label = child.label
|
91
|
+
c_bbox = child.bbox.to_bottom_left_origin(
|
92
|
+
doc.pages[element.page_no + 1].size.height
|
93
|
+
)
|
94
|
+
c_text = " ".join(
|
95
|
+
[
|
96
|
+
cell.text.replace("\x02", "-").strip()
|
97
|
+
for cell in child.cells
|
98
|
+
if len(cell.text.strip()) > 0
|
99
|
+
]
|
100
|
+
)
|
101
|
+
|
102
|
+
c_prov = ProvenanceItem(
|
103
|
+
page_no=element.page_no + 1, charspan=(0, len(c_text)), bbox=c_bbox
|
104
|
+
)
|
105
|
+
if c_label == DocItemLabel.LIST_ITEM:
|
106
|
+
# TODO: Infer if this is a numbered or a bullet list item
|
107
|
+
doc.add_list_item(parent=doc_item, text=c_text, prov=c_prov)
|
108
|
+
elif c_label == DocItemLabel.SECTION_HEADER:
|
109
|
+
doc.add_heading(parent=doc_item, text=c_text, prov=c_prov)
|
110
|
+
else:
|
111
|
+
doc.add_text(parent=doc_item, label=c_label, text=c_text, prov=c_prov)
|
112
|
+
|
113
|
+
def _readingorder_elements_to_docling_doc(
|
114
|
+
self,
|
115
|
+
conv_res: ConversionResult,
|
116
|
+
ro_elements: List[ReadingOrderPageElement],
|
117
|
+
el_to_captions_mapping: Dict[int, List[int]],
|
118
|
+
el_to_footnotes_mapping: Dict[int, List[int]],
|
119
|
+
el_merges_mapping: Dict[int, List[int]],
|
120
|
+
) -> DoclingDocument:
|
121
|
+
|
122
|
+
id_to_elem = {
|
123
|
+
RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem
|
124
|
+
for elem in conv_res.assembled.elements
|
125
|
+
}
|
126
|
+
cid_to_rels = {rel.cid: rel for rel in ro_elements}
|
127
|
+
|
128
|
+
origin = DocumentOrigin(
|
129
|
+
mimetype="application/pdf",
|
130
|
+
filename=conv_res.input.file.name,
|
131
|
+
binary_hash=conv_res.input.document_hash,
|
132
|
+
)
|
133
|
+
doc_name = Path(origin.filename).stem
|
134
|
+
out_doc: DoclingDocument = DoclingDocument(name=doc_name, origin=origin)
|
135
|
+
|
136
|
+
for page in conv_res.pages:
|
137
|
+
page_no = page.page_no + 1
|
138
|
+
size = page.size
|
139
|
+
|
140
|
+
assert size is not None
|
141
|
+
|
142
|
+
out_doc.add_page(page_no=page_no, size=size)
|
143
|
+
|
144
|
+
current_list = None
|
145
|
+
skippable_cids = {
|
146
|
+
cid
|
147
|
+
for mapping in (
|
148
|
+
el_to_captions_mapping,
|
149
|
+
el_to_footnotes_mapping,
|
150
|
+
el_merges_mapping,
|
151
|
+
)
|
152
|
+
for lst in mapping.values()
|
153
|
+
for cid in lst
|
154
|
+
}
|
155
|
+
|
156
|
+
page_no_to_pages = {p.page_no: p for p in conv_res.pages}
|
157
|
+
|
158
|
+
for rel in ro_elements:
|
159
|
+
if rel.cid in skippable_cids:
|
160
|
+
continue
|
161
|
+
element = id_to_elem[rel.ref.cref]
|
162
|
+
|
163
|
+
page_height = page_no_to_pages[element.page_no].size.height # type: ignore
|
164
|
+
|
165
|
+
if isinstance(element, TextElement):
|
166
|
+
if element.label == DocItemLabel.CODE:
|
167
|
+
cap_text = element.text
|
168
|
+
prov = ProvenanceItem(
|
169
|
+
page_no=element.page_no + 1,
|
170
|
+
charspan=(0, len(cap_text)),
|
171
|
+
bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
|
172
|
+
)
|
173
|
+
code_item = out_doc.add_code(text=cap_text, prov=prov)
|
174
|
+
|
175
|
+
if rel.cid in el_to_captions_mapping.keys():
|
176
|
+
for caption_cid in el_to_captions_mapping[rel.cid]:
|
177
|
+
caption_elem = id_to_elem[cid_to_rels[caption_cid].ref.cref]
|
178
|
+
new_cap_item = self._add_caption_or_footnote(
|
179
|
+
caption_elem, out_doc, code_item, page_height
|
180
|
+
)
|
181
|
+
|
182
|
+
code_item.captions.append(new_cap_item.get_ref())
|
183
|
+
|
184
|
+
if rel.cid in el_to_footnotes_mapping.keys():
|
185
|
+
for footnote_cid in el_to_footnotes_mapping[rel.cid]:
|
186
|
+
footnote_elem = id_to_elem[
|
187
|
+
cid_to_rels[footnote_cid].ref.cref
|
188
|
+
]
|
189
|
+
new_footnote_item = self._add_caption_or_footnote(
|
190
|
+
footnote_elem, out_doc, code_item, page_height
|
191
|
+
)
|
192
|
+
|
193
|
+
code_item.footnotes.append(new_footnote_item.get_ref())
|
194
|
+
else:
|
195
|
+
|
196
|
+
new_item, current_list = self._handle_text_element(
|
197
|
+
element, out_doc, current_list, page_height
|
198
|
+
)
|
199
|
+
|
200
|
+
if rel.cid in el_merges_mapping.keys():
|
201
|
+
for merged_cid in el_merges_mapping[rel.cid]:
|
202
|
+
merged_elem = id_to_elem[cid_to_rels[merged_cid].ref.cref]
|
203
|
+
|
204
|
+
self._merge_elements(
|
205
|
+
element, merged_elem, new_item, page_height
|
206
|
+
)
|
207
|
+
|
208
|
+
elif isinstance(element, Table):
|
209
|
+
|
210
|
+
tbl_data = TableData(
|
211
|
+
num_rows=element.num_rows,
|
212
|
+
num_cols=element.num_cols,
|
213
|
+
table_cells=element.table_cells,
|
214
|
+
)
|
215
|
+
|
216
|
+
prov = ProvenanceItem(
|
217
|
+
page_no=element.page_no + 1,
|
218
|
+
charspan=(0, 0),
|
219
|
+
bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
|
220
|
+
)
|
221
|
+
|
222
|
+
tbl = out_doc.add_table(
|
223
|
+
data=tbl_data, prov=prov, label=element.cluster.label
|
224
|
+
)
|
225
|
+
|
226
|
+
if rel.cid in el_to_captions_mapping.keys():
|
227
|
+
for caption_cid in el_to_captions_mapping[rel.cid]:
|
228
|
+
caption_elem = id_to_elem[cid_to_rels[caption_cid].ref.cref]
|
229
|
+
new_cap_item = self._add_caption_or_footnote(
|
230
|
+
caption_elem, out_doc, tbl, page_height
|
231
|
+
)
|
232
|
+
|
233
|
+
tbl.captions.append(new_cap_item.get_ref())
|
234
|
+
|
235
|
+
if rel.cid in el_to_footnotes_mapping.keys():
|
236
|
+
for footnote_cid in el_to_footnotes_mapping[rel.cid]:
|
237
|
+
footnote_elem = id_to_elem[cid_to_rels[footnote_cid].ref.cref]
|
238
|
+
new_footnote_item = self._add_caption_or_footnote(
|
239
|
+
footnote_elem, out_doc, tbl, page_height
|
240
|
+
)
|
241
|
+
|
242
|
+
tbl.footnotes.append(new_footnote_item.get_ref())
|
243
|
+
|
244
|
+
# TODO: Consider adding children of Table.
|
245
|
+
|
246
|
+
elif isinstance(element, FigureElement):
|
247
|
+
cap_text = ""
|
248
|
+
prov = ProvenanceItem(
|
249
|
+
page_no=element.page_no + 1,
|
250
|
+
charspan=(0, len(cap_text)),
|
251
|
+
bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
|
252
|
+
)
|
253
|
+
pic = out_doc.add_picture(prov=prov)
|
254
|
+
|
255
|
+
if rel.cid in el_to_captions_mapping.keys():
|
256
|
+
for caption_cid in el_to_captions_mapping[rel.cid]:
|
257
|
+
caption_elem = id_to_elem[cid_to_rels[caption_cid].ref.cref]
|
258
|
+
new_cap_item = self._add_caption_or_footnote(
|
259
|
+
caption_elem, out_doc, pic, page_height
|
260
|
+
)
|
261
|
+
|
262
|
+
pic.captions.append(new_cap_item.get_ref())
|
263
|
+
|
264
|
+
if rel.cid in el_to_footnotes_mapping.keys():
|
265
|
+
for footnote_cid in el_to_footnotes_mapping[rel.cid]:
|
266
|
+
footnote_elem = id_to_elem[cid_to_rels[footnote_cid].ref.cref]
|
267
|
+
new_footnote_item = self._add_caption_or_footnote(
|
268
|
+
footnote_elem, out_doc, pic, page_height
|
269
|
+
)
|
270
|
+
|
271
|
+
pic.footnotes.append(new_footnote_item.get_ref())
|
272
|
+
|
273
|
+
self._add_child_elements(element, pic, out_doc)
|
274
|
+
|
275
|
+
elif isinstance(element, ContainerElement): # Form, KV region
|
276
|
+
label = element.label
|
277
|
+
group_label = GroupLabel.UNSPECIFIED
|
278
|
+
if label == DocItemLabel.FORM:
|
279
|
+
group_label = GroupLabel.FORM_AREA
|
280
|
+
elif label == DocItemLabel.KEY_VALUE_REGION:
|
281
|
+
group_label = GroupLabel.KEY_VALUE_AREA
|
282
|
+
|
283
|
+
container_el = out_doc.add_group(label=group_label)
|
284
|
+
|
285
|
+
self._add_child_elements(element, container_el, out_doc)
|
286
|
+
|
287
|
+
return out_doc
|
288
|
+
|
289
|
+
def _add_caption_or_footnote(self, elem, out_doc, parent, page_height):
|
290
|
+
assert isinstance(elem, TextElement)
|
291
|
+
text = elem.text
|
292
|
+
prov = ProvenanceItem(
|
293
|
+
page_no=elem.page_no + 1,
|
294
|
+
charspan=(0, len(text)),
|
295
|
+
bbox=elem.cluster.bbox.to_bottom_left_origin(page_height),
|
296
|
+
)
|
297
|
+
new_item = out_doc.add_text(
|
298
|
+
label=elem.label, text=text, prov=prov, parent=parent
|
299
|
+
)
|
300
|
+
return new_item
|
301
|
+
|
302
|
+
def _handle_text_element(self, element, out_doc, current_list, page_height):
|
303
|
+
cap_text = element.text
|
304
|
+
|
305
|
+
prov = ProvenanceItem(
|
306
|
+
page_no=element.page_no + 1,
|
307
|
+
charspan=(0, len(cap_text)),
|
308
|
+
bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
|
309
|
+
)
|
310
|
+
label = element.label
|
311
|
+
if label == DocItemLabel.LIST_ITEM:
|
312
|
+
if current_list is None:
|
313
|
+
current_list = out_doc.add_group(label=GroupLabel.LIST, name="list")
|
314
|
+
|
315
|
+
# TODO: Infer if this is a numbered or a bullet list item
|
316
|
+
new_item = out_doc.add_list_item(
|
317
|
+
text=cap_text, enumerated=False, prov=prov, parent=current_list
|
318
|
+
)
|
319
|
+
elif label == DocItemLabel.SECTION_HEADER:
|
320
|
+
current_list = None
|
321
|
+
|
322
|
+
new_item = out_doc.add_heading(text=cap_text, prov=prov)
|
323
|
+
elif label == DocItemLabel.FORMULA:
|
324
|
+
current_list = None
|
325
|
+
|
326
|
+
new_item = out_doc.add_text(
|
327
|
+
label=DocItemLabel.FORMULA, text="", orig=cap_text, prov=prov
|
328
|
+
)
|
329
|
+
else:
|
330
|
+
current_list = None
|
331
|
+
|
332
|
+
content_layer = ContentLayer.BODY
|
333
|
+
if element.label in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
|
334
|
+
content_layer = ContentLayer.FURNITURE
|
335
|
+
|
336
|
+
new_item = out_doc.add_text(
|
337
|
+
label=element.label,
|
338
|
+
text=cap_text,
|
339
|
+
prov=prov,
|
340
|
+
content_layer=content_layer,
|
341
|
+
)
|
342
|
+
return new_item, current_list
|
343
|
+
|
344
|
+
def _merge_elements(self, element, merged_elem, new_item, page_height):
|
345
|
+
assert isinstance(
|
346
|
+
merged_elem, type(element)
|
347
|
+
), "Merged element must be of same type as element."
|
348
|
+
assert (
|
349
|
+
merged_elem.label == new_item.label
|
350
|
+
), "Labels of merged elements must match."
|
351
|
+
prov = ProvenanceItem(
|
352
|
+
page_no=element.page_no + 1,
|
353
|
+
charspan=(
|
354
|
+
len(new_item.text) + 1,
|
355
|
+
len(new_item.text) + 1 + len(merged_elem.text),
|
356
|
+
),
|
357
|
+
bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
|
358
|
+
)
|
359
|
+
new_item.text += f" {merged_elem.text}"
|
360
|
+
new_item.orig += f" {merged_elem.text}" # TODO: This is incomplete, we don't have the `orig` field of the merged element.
|
361
|
+
new_item.prov.append(prov)
|
362
|
+
|
363
|
+
def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
|
364
|
+
with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
|
365
|
+
page_elements = self._assembled_to_readingorder_elements(conv_res)
|
366
|
+
|
367
|
+
# Apply reading order
|
368
|
+
sorted_elements = self.ro_model.predict_reading_order(
|
369
|
+
page_elements=page_elements
|
370
|
+
)
|
371
|
+
el_to_captions_mapping = self.ro_model.predict_to_captions(
|
372
|
+
sorted_elements=sorted_elements
|
373
|
+
)
|
374
|
+
el_to_footnotes_mapping = self.ro_model.predict_to_footnotes(
|
375
|
+
sorted_elements=sorted_elements
|
376
|
+
)
|
377
|
+
el_merges_mapping = self.ro_model.predict_merges(
|
378
|
+
sorted_elements=sorted_elements
|
379
|
+
)
|
380
|
+
|
381
|
+
docling_doc: DoclingDocument = self._readingorder_elements_to_docling_doc(
|
382
|
+
conv_res,
|
383
|
+
sorted_elements,
|
384
|
+
el_to_captions_mapping,
|
385
|
+
el_to_footnotes_mapping,
|
386
|
+
el_merges_mapping,
|
387
|
+
)
|
388
|
+
|
389
|
+
return docling_doc
|
@@ -27,7 +27,6 @@ from docling.models.document_picture_classifier import (
|
|
27
27
|
DocumentPictureClassifier,
|
28
28
|
DocumentPictureClassifierOptions,
|
29
29
|
)
|
30
|
-
from docling.models.ds_glm_model import GlmModel, GlmOptions
|
31
30
|
from docling.models.easyocr_model import EasyOcrModel
|
32
31
|
from docling.models.layout_model import LayoutModel
|
33
32
|
from docling.models.ocr_mac_model import OcrMacModel
|
@@ -40,6 +39,7 @@ from docling.models.picture_description_api_model import PictureDescriptionApiMo
|
|
40
39
|
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
|
41
40
|
from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
|
42
41
|
from docling.models.rapid_ocr_model import RapidOcrModel
|
42
|
+
from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
|
43
43
|
from docling.models.table_structure_model import TableStructureModel
|
44
44
|
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
45
45
|
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
@@ -76,7 +76,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
76
76
|
or self.pipeline_options.generate_table_images
|
77
77
|
)
|
78
78
|
|
79
|
-
self.glm_model = GlmModel(options=GlmOptions())
|
79
|
+
self.glm_model = ReadingOrderModel(options=ReadingOrderOptions())
|
80
80
|
|
81
81
|
if (ocr_model := self.get_ocr_model(artifacts_path=artifacts_path)) is None:
|
82
82
|
raise RuntimeError(
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.23.1
|
3
|
+
Version: 2.24.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -27,9 +27,8 @@ Provides-Extra: tesserocr
|
|
27
27
|
Provides-Extra: vlm
|
28
28
|
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
29
29
|
Requires-Dist: certifi (>=2024.7.4)
|
30
|
-
Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
|
31
30
|
Requires-Dist: docling-core[chunking] (>=2.19.0,<3.0.0)
|
32
|
-
Requires-Dist: docling-ibm-models (>=3.
|
31
|
+
Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
|
33
32
|
Requires-Dist: docling-parse (>=3.3.0,<4.0.0)
|
34
33
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
35
34
|
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
@@ -34,23 +34,23 @@ docling/models/base_model.py,sha256=q_lKeQ0FT70idXlZ3JgyAv8dA8J3bZWBSDBkqTzy0lo,
|
|
34
34
|
docling/models/base_ocr_model.py,sha256=YiUMvdjnHw9SHjnfJKT5INrPMoIGEf_Z2OApfl_VRTE,6919
|
35
35
|
docling/models/code_formula_model.py,sha256=6grbRPWaLljadheT5s4omdT6hmXfin4gJU17csWvhjY,8611
|
36
36
|
docling/models/document_picture_classifier.py,sha256=6I_j6fG5fnhIV6rqN31LYikNTZyg5isXrVs0GIqHDaY,6235
|
37
|
-
docling/models/ds_glm_model.py,sha256=1jLEM-B_oHFevKq23zDQpdifE3eJL7qiLr5YLpEf1kQ,15217
|
38
37
|
docling/models/easyocr_model.py,sha256=ePg1exAXeOzkBRBT-6PBSmqKFmnNFkCEd4HNDsGVgLM,6860
|
39
38
|
docling/models/layout_model.py,sha256=7fQWipGV1HDrvbP4uOKa9QAicQl89jp7lailQmbFL3w,7804
|
40
39
|
docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpan0,4502
|
41
|
-
docling/models/page_assemble_model.py,sha256=
|
40
|
+
docling/models/page_assemble_model.py,sha256=ivkCdbZJpFcGl7CazLegcP1tLK8ZixDfVhQXqsdW_UA,6359
|
42
41
|
docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
|
43
42
|
docling/models/picture_description_api_model.py,sha256=SKNoHpqzbfM8iO-DJJ4ccyNVqO0B2d9neLBnXqt50FY,3186
|
44
43
|
docling/models/picture_description_base_model.py,sha256=rZLIW1_CaRAw_EP3zuI8ktC0ZxwO7yubhh2RkaC_8e8,1910
|
45
44
|
docling/models/picture_description_vlm_model.py,sha256=a2vYUdlcA0--_8neY0tTiU8reCf29NCbVMKwWdMy2QQ,3653
|
46
45
|
docling/models/rapid_ocr_model.py,sha256=2HXmurNRPP6qyqn7U5h9NQIs8zi0TMHf56CpcKQk0fU,5038
|
46
|
+
docling/models/readingorder_model.py,sha256=hNWbBX3uZv1FxMwKNKn2JFQuQqTspBLsJBVEidXr6Wk,14869
|
47
47
|
docling/models/table_structure_model.py,sha256=UIqWlw_9JNfGsO86c00rPb4GCg-yNliKEwyhCqlsZbM,11225
|
48
48
|
docling/models/tesseract_ocr_cli_model.py,sha256=F5EhS4NDEmLkPq-a0P7o2LrzjmJgACzlYXTDvtD3NtY,9343
|
49
49
|
docling/models/tesseract_ocr_model.py,sha256=ikGu6QNknLG64c9yYIb0Ix6MGhBzOoa1ODbNc8MT5r8,8508
|
50
50
|
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
51
51
|
docling/pipeline/base_pipeline.py,sha256=9ABK-Cr235bxE5vweoIA5rgBZV_EF8qFxAqLI27H_Pg,8749
|
52
52
|
docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
|
53
|
-
docling/pipeline/standard_pdf_pipeline.py,sha256=
|
53
|
+
docling/pipeline/standard_pdf_pipeline.py,sha256=IQHktVYvueTrYnIgLonaMvfYKKsU3L-hC9dqrR-Lw8g,12904
|
54
54
|
docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
55
55
|
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
56
56
|
docling/utils/accelerator_utils.py,sha256=ONNRrC8fH-8E93WUCNhfOq1t7WrQ1T7-YsmExTOY5f0,2292
|
@@ -62,8 +62,8 @@ docling/utils/ocr_utils.py,sha256=F7iOOjqolUcImUzir4qjDQd4QWSO3s6JC4WRn3U7uY4,26
|
|
62
62
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
63
63
|
docling/utils/utils.py,sha256=0ozCk7zUkYzxRVmYoIB2zA1lqjQOuaQzxfGuf1wmKW4,1866
|
64
64
|
docling/utils/visualization.py,sha256=4pn-80fVuE04ken7hUg5Ar47ndRSL9MWBgdHM-1g1zU,2735
|
65
|
-
docling-2.23.1.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
66
|
-
docling-2.
|
67
|
-
docling-2.23.1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
68
|
-
docling-2.23.1.dist-info/entry_points.txt,sha256=cFrINXsORijdm2EWJzf1m9_rDxH9G9W1fP385-9atY4,84
|
69
|
-
docling-2.23.1.dist-info/RECORD,,
|
65
|
+
docling-2.24.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
66
|
+
docling-2.24.0.dist-info/METADATA,sha256=0MJ5mBt0GwsZotaSpHnAWzdzWcu_BQFGqGzNR3gRpG4,8672
|
67
|
+
docling-2.24.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
68
|
+
docling-2.24.0.dist-info/entry_points.txt,sha256=cFrINXsORijdm2EWJzf1m9_rDxH9G9W1fP385-9atY4,84
|
69
|
+
docling-2.24.0.dist-info/RECORD,,
|
docling/models/ds_glm_model.py
DELETED
@@ -1,386 +0,0 @@
|
|
1
|
-
import copy
|
2
|
-
import random
|
3
|
-
from pathlib import Path
|
4
|
-
from typing import List, Union
|
5
|
-
|
6
|
-
from deepsearch_glm.andromeda_nlp import nlp_model
|
7
|
-
from docling_core.types.doc import (
|
8
|
-
BoundingBox,
|
9
|
-
CoordOrigin,
|
10
|
-
DocItemLabel,
|
11
|
-
DoclingDocument,
|
12
|
-
)
|
13
|
-
from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
|
14
|
-
from docling_core.types.legacy_doc.base import (
|
15
|
-
Figure,
|
16
|
-
PageDimensions,
|
17
|
-
PageReference,
|
18
|
-
Prov,
|
19
|
-
Ref,
|
20
|
-
)
|
21
|
-
from docling_core.types.legacy_doc.base import Table as DsSchemaTable
|
22
|
-
from docling_core.types.legacy_doc.base import TableCell
|
23
|
-
from docling_core.types.legacy_doc.document import BaseText
|
24
|
-
from docling_core.types.legacy_doc.document import (
|
25
|
-
CCSDocumentDescription as DsDocumentDescription,
|
26
|
-
)
|
27
|
-
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
|
28
|
-
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
|
29
|
-
from PIL import ImageDraw
|
30
|
-
from pydantic import BaseModel, ConfigDict, TypeAdapter
|
31
|
-
|
32
|
-
from docling.datamodel.base_models import (
|
33
|
-
Cluster,
|
34
|
-
ContainerElement,
|
35
|
-
FigureElement,
|
36
|
-
Table,
|
37
|
-
TextElement,
|
38
|
-
)
|
39
|
-
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
|
40
|
-
from docling.datamodel.settings import settings
|
41
|
-
from docling.utils.glm_utils import to_docling_document
|
42
|
-
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
43
|
-
from docling.utils.utils import create_hash
|
44
|
-
|
45
|
-
|
46
|
-
class GlmOptions(BaseModel):
|
47
|
-
model_config = ConfigDict(protected_namespaces=())
|
48
|
-
|
49
|
-
model_names: str = "" # e.g. "language;term;reference"
|
50
|
-
|
51
|
-
|
52
|
-
class GlmModel:
|
53
|
-
def __init__(self, options: GlmOptions):
|
54
|
-
self.options = options
|
55
|
-
|
56
|
-
self.model = nlp_model(loglevel="error", text_ordering=True)
|
57
|
-
|
58
|
-
def _to_legacy_document(self, conv_res) -> DsDocument:
|
59
|
-
title = ""
|
60
|
-
desc: DsDocumentDescription = DsDocumentDescription(logs=[])
|
61
|
-
|
62
|
-
page_hashes = [
|
63
|
-
PageReference(
|
64
|
-
hash=create_hash(conv_res.input.document_hash + ":" + str(p.page_no)),
|
65
|
-
page=p.page_no + 1,
|
66
|
-
model="default",
|
67
|
-
)
|
68
|
-
for p in conv_res.pages
|
69
|
-
]
|
70
|
-
|
71
|
-
file_info = DsFileInfoObject(
|
72
|
-
filename=conv_res.input.file.name,
|
73
|
-
document_hash=conv_res.input.document_hash,
|
74
|
-
num_pages=conv_res.input.page_count,
|
75
|
-
page_hashes=page_hashes,
|
76
|
-
)
|
77
|
-
|
78
|
-
main_text: List[Union[Ref, BaseText]] = []
|
79
|
-
page_headers: List[Union[Ref, BaseText]] = []
|
80
|
-
page_footers: List[Union[Ref, BaseText]] = []
|
81
|
-
|
82
|
-
tables: List[DsSchemaTable] = []
|
83
|
-
figures: List[Figure] = []
|
84
|
-
|
85
|
-
page_no_to_page = {p.page_no: p for p in conv_res.pages}
|
86
|
-
|
87
|
-
for element in conv_res.assembled.body:
|
88
|
-
# Convert bboxes to lower-left origin.
|
89
|
-
target_bbox = DsBoundingBox(
|
90
|
-
element.cluster.bbox.to_bottom_left_origin(
|
91
|
-
page_no_to_page[element.page_no].size.height
|
92
|
-
).as_tuple()
|
93
|
-
)
|
94
|
-
|
95
|
-
if isinstance(element, TextElement):
|
96
|
-
main_text.append(
|
97
|
-
BaseText(
|
98
|
-
text=element.text,
|
99
|
-
obj_type=layout_label_to_ds_type.get(element.label),
|
100
|
-
name=element.label,
|
101
|
-
prov=[
|
102
|
-
Prov(
|
103
|
-
bbox=target_bbox,
|
104
|
-
page=element.page_no + 1,
|
105
|
-
span=[0, len(element.text)],
|
106
|
-
)
|
107
|
-
],
|
108
|
-
)
|
109
|
-
)
|
110
|
-
elif isinstance(element, Table):
|
111
|
-
index = len(tables)
|
112
|
-
ref_str = f"#/tables/{index}"
|
113
|
-
main_text.append(
|
114
|
-
Ref(
|
115
|
-
name=element.label,
|
116
|
-
obj_type=layout_label_to_ds_type.get(element.label),
|
117
|
-
ref=ref_str,
|
118
|
-
),
|
119
|
-
)
|
120
|
-
|
121
|
-
# Initialise empty table data grid (only empty cells)
|
122
|
-
table_data = [
|
123
|
-
[
|
124
|
-
TableCell(
|
125
|
-
text="",
|
126
|
-
# bbox=[0,0,0,0],
|
127
|
-
spans=[[i, j]],
|
128
|
-
obj_type="body",
|
129
|
-
)
|
130
|
-
for j in range(element.num_cols)
|
131
|
-
]
|
132
|
-
for i in range(element.num_rows)
|
133
|
-
]
|
134
|
-
|
135
|
-
# Overwrite cells in table data for which there is actual cell content.
|
136
|
-
for cell in element.table_cells:
|
137
|
-
for i in range(
|
138
|
-
min(cell.start_row_offset_idx, element.num_rows),
|
139
|
-
min(cell.end_row_offset_idx, element.num_rows),
|
140
|
-
):
|
141
|
-
for j in range(
|
142
|
-
min(cell.start_col_offset_idx, element.num_cols),
|
143
|
-
min(cell.end_col_offset_idx, element.num_cols),
|
144
|
-
):
|
145
|
-
celltype = "body"
|
146
|
-
if cell.column_header:
|
147
|
-
celltype = "col_header"
|
148
|
-
elif cell.row_header:
|
149
|
-
celltype = "row_header"
|
150
|
-
elif cell.row_section:
|
151
|
-
celltype = "row_section"
|
152
|
-
|
153
|
-
def make_spans(cell):
|
154
|
-
for rspan in range(
|
155
|
-
min(cell.start_row_offset_idx, element.num_rows),
|
156
|
-
min(cell.end_row_offset_idx, element.num_rows),
|
157
|
-
):
|
158
|
-
for cspan in range(
|
159
|
-
min(
|
160
|
-
cell.start_col_offset_idx, element.num_cols
|
161
|
-
),
|
162
|
-
min(cell.end_col_offset_idx, element.num_cols),
|
163
|
-
):
|
164
|
-
yield [rspan, cspan]
|
165
|
-
|
166
|
-
spans = list(make_spans(cell))
|
167
|
-
if cell.bbox is not None:
|
168
|
-
bbox = cell.bbox.to_bottom_left_origin(
|
169
|
-
page_no_to_page[element.page_no].size.height
|
170
|
-
).as_tuple()
|
171
|
-
else:
|
172
|
-
bbox = None
|
173
|
-
|
174
|
-
table_data[i][j] = TableCell(
|
175
|
-
text=cell.text,
|
176
|
-
bbox=bbox,
|
177
|
-
# col=j,
|
178
|
-
# row=i,
|
179
|
-
spans=spans,
|
180
|
-
obj_type=celltype,
|
181
|
-
# col_span=[cell.start_col_offset_idx, cell.end_col_offset_idx],
|
182
|
-
# row_span=[cell.start_row_offset_idx, cell.end_row_offset_idx]
|
183
|
-
)
|
184
|
-
|
185
|
-
tables.append(
|
186
|
-
DsSchemaTable(
|
187
|
-
num_cols=element.num_cols,
|
188
|
-
num_rows=element.num_rows,
|
189
|
-
obj_type=layout_label_to_ds_type.get(element.label),
|
190
|
-
data=table_data,
|
191
|
-
prov=[
|
192
|
-
Prov(
|
193
|
-
bbox=target_bbox,
|
194
|
-
page=element.page_no + 1,
|
195
|
-
span=[0, 0],
|
196
|
-
)
|
197
|
-
],
|
198
|
-
)
|
199
|
-
)
|
200
|
-
|
201
|
-
elif isinstance(element, FigureElement):
|
202
|
-
index = len(figures)
|
203
|
-
ref_str = f"#/figures/{index}"
|
204
|
-
main_text.append(
|
205
|
-
Ref(
|
206
|
-
name=element.label,
|
207
|
-
obj_type=layout_label_to_ds_type.get(element.label),
|
208
|
-
ref=ref_str,
|
209
|
-
),
|
210
|
-
)
|
211
|
-
figures.append(
|
212
|
-
Figure(
|
213
|
-
prov=[
|
214
|
-
Prov(
|
215
|
-
bbox=target_bbox,
|
216
|
-
page=element.page_no + 1,
|
217
|
-
span=[0, 0],
|
218
|
-
)
|
219
|
-
],
|
220
|
-
obj_type=layout_label_to_ds_type.get(element.label),
|
221
|
-
payload={
|
222
|
-
"children": TypeAdapter(List[Cluster]).dump_python(
|
223
|
-
element.cluster.children
|
224
|
-
)
|
225
|
-
}, # hack to channel child clusters through GLM
|
226
|
-
)
|
227
|
-
)
|
228
|
-
elif isinstance(element, ContainerElement):
|
229
|
-
main_text.append(
|
230
|
-
BaseText(
|
231
|
-
text="",
|
232
|
-
payload={
|
233
|
-
"children": TypeAdapter(List[Cluster]).dump_python(
|
234
|
-
element.cluster.children
|
235
|
-
)
|
236
|
-
}, # hack to channel child clusters through GLM
|
237
|
-
obj_type=layout_label_to_ds_type.get(element.label),
|
238
|
-
name=element.label,
|
239
|
-
prov=[
|
240
|
-
Prov(
|
241
|
-
bbox=target_bbox,
|
242
|
-
page=element.page_no + 1,
|
243
|
-
span=[0, 0],
|
244
|
-
)
|
245
|
-
],
|
246
|
-
)
|
247
|
-
)
|
248
|
-
|
249
|
-
# We can throw in headers and footers at the end of the legacy doc
|
250
|
-
# since the reading-order will re-sort it later.
|
251
|
-
for element in conv_res.assembled.headers:
|
252
|
-
# Convert bboxes to lower-left origin.
|
253
|
-
target_bbox = DsBoundingBox(
|
254
|
-
element.cluster.bbox.to_bottom_left_origin(
|
255
|
-
page_no_to_page[element.page_no].size.height
|
256
|
-
).as_tuple()
|
257
|
-
)
|
258
|
-
|
259
|
-
if isinstance(element, TextElement):
|
260
|
-
|
261
|
-
tel = BaseText(
|
262
|
-
text=element.text,
|
263
|
-
obj_type=layout_label_to_ds_type.get(element.label),
|
264
|
-
name=element.label,
|
265
|
-
prov=[
|
266
|
-
Prov(
|
267
|
-
bbox=target_bbox,
|
268
|
-
page=element.page_no + 1,
|
269
|
-
span=[0, len(element.text)],
|
270
|
-
)
|
271
|
-
],
|
272
|
-
)
|
273
|
-
if element.label == DocItemLabel.PAGE_HEADER:
|
274
|
-
index = len(page_headers)
|
275
|
-
ref_str = f"#/page-headers/{index}"
|
276
|
-
main_text.append(
|
277
|
-
Ref(
|
278
|
-
name=element.label,
|
279
|
-
obj_type=layout_label_to_ds_type.get(element.label),
|
280
|
-
ref=ref_str,
|
281
|
-
),
|
282
|
-
)
|
283
|
-
page_headers.append(tel)
|
284
|
-
elif element.label == DocItemLabel.PAGE_FOOTER:
|
285
|
-
index = len(page_footers)
|
286
|
-
ref_str = f"#/page-footers/{index}"
|
287
|
-
main_text.append(
|
288
|
-
Ref(
|
289
|
-
name=element.label,
|
290
|
-
obj_type=layout_label_to_ds_type.get(element.label),
|
291
|
-
ref=ref_str,
|
292
|
-
),
|
293
|
-
)
|
294
|
-
page_footers.append(tel)
|
295
|
-
|
296
|
-
page_dimensions = [
|
297
|
-
PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
|
298
|
-
for p in conv_res.pages
|
299
|
-
if p.size is not None
|
300
|
-
]
|
301
|
-
|
302
|
-
ds_doc: DsDocument = DsDocument(
|
303
|
-
name=title,
|
304
|
-
description=desc,
|
305
|
-
file_info=file_info,
|
306
|
-
main_text=main_text,
|
307
|
-
tables=tables,
|
308
|
-
figures=figures,
|
309
|
-
page_dimensions=page_dimensions,
|
310
|
-
page_headers=page_headers,
|
311
|
-
page_footers=page_footers,
|
312
|
-
)
|
313
|
-
|
314
|
-
return ds_doc
|
315
|
-
|
316
|
-
def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
|
317
|
-
with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
|
318
|
-
ds_doc = self._to_legacy_document(conv_res)
|
319
|
-
ds_doc_dict = ds_doc.model_dump(by_alias=True, exclude_none=True)
|
320
|
-
|
321
|
-
glm_doc = self.model.apply_on_doc(ds_doc_dict)
|
322
|
-
|
323
|
-
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
|
324
|
-
1 == 1
|
325
|
-
|
326
|
-
# DEBUG code:
|
327
|
-
def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
|
328
|
-
clusters_to_draw = []
|
329
|
-
image = copy.deepcopy(conv_res.pages[page_no].image)
|
330
|
-
for ix, elem in enumerate(ds_document.main_text):
|
331
|
-
if isinstance(elem, BaseText):
|
332
|
-
prov = elem.prov[0] # type: ignore
|
333
|
-
elif isinstance(elem, Ref):
|
334
|
-
_, arr, index = elem.ref.split("/")
|
335
|
-
index = int(index) # type: ignore
|
336
|
-
if arr == "tables":
|
337
|
-
prov = ds_document.tables[index].prov[0]
|
338
|
-
elif arr == "figures":
|
339
|
-
prov = ds_document.pictures[index].prov[0]
|
340
|
-
else:
|
341
|
-
prov = None
|
342
|
-
|
343
|
-
if prov and prov.page == page_no:
|
344
|
-
clusters_to_draw.append(
|
345
|
-
Cluster(
|
346
|
-
id=ix,
|
347
|
-
label=elem.name,
|
348
|
-
bbox=BoundingBox.from_tuple(
|
349
|
-
coord=prov.bbox, # type: ignore
|
350
|
-
origin=CoordOrigin.BOTTOMLEFT,
|
351
|
-
).to_top_left_origin(conv_res.pages[page_no].size.height),
|
352
|
-
)
|
353
|
-
)
|
354
|
-
|
355
|
-
draw = ImageDraw.Draw(image)
|
356
|
-
for c in clusters_to_draw:
|
357
|
-
x0, y0, x1, y1 = c.bbox.as_tuple()
|
358
|
-
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
|
359
|
-
draw.text((x0 + 2, y0 + 2), f"{c.id}:{c.label}", fill=(255, 0, 0, 255))
|
360
|
-
|
361
|
-
cell_color = (
|
362
|
-
random.randint(30, 140),
|
363
|
-
random.randint(30, 140),
|
364
|
-
random.randint(30, 140),
|
365
|
-
)
|
366
|
-
for tc in c.cells: # [:1]:
|
367
|
-
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
368
|
-
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
369
|
-
|
370
|
-
if show:
|
371
|
-
image.show()
|
372
|
-
else:
|
373
|
-
out_path: Path = (
|
374
|
-
Path(settings.debug.debug_output_path)
|
375
|
-
/ f"debug_{conv_res.input.file.stem}"
|
376
|
-
)
|
377
|
-
out_path.mkdir(parents=True, exist_ok=True)
|
378
|
-
|
379
|
-
out_file = out_path / f"doc_page_{page_no:05}.png"
|
380
|
-
image.save(str(out_file), format="png")
|
381
|
-
|
382
|
-
# for item in ds_doc.page_dimensions:
|
383
|
-
# page_no = item.page
|
384
|
-
# draw_clusters_and_cells(ds_doc, page_no)
|
385
|
-
|
386
|
-
return docling_doc
|
File without changes
|
File without changes
|
File without changes
|