docling 2.69.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling might be problematic. Click here for more details.
- docling/__init__.py +0 -0
- docling/backend/__init__.py +0 -0
- docling/backend/abstract_backend.py +84 -0
- docling/backend/asciidoc_backend.py +443 -0
- docling/backend/csv_backend.py +125 -0
- docling/backend/docling_parse_backend.py +237 -0
- docling/backend/docling_parse_v2_backend.py +276 -0
- docling/backend/docling_parse_v4_backend.py +260 -0
- docling/backend/docx/__init__.py +0 -0
- docling/backend/docx/drawingml/utils.py +131 -0
- docling/backend/docx/latex/__init__.py +0 -0
- docling/backend/docx/latex/latex_dict.py +274 -0
- docling/backend/docx/latex/omml.py +459 -0
- docling/backend/html_backend.py +1502 -0
- docling/backend/image_backend.py +188 -0
- docling/backend/json/__init__.py +0 -0
- docling/backend/json/docling_json_backend.py +58 -0
- docling/backend/md_backend.py +618 -0
- docling/backend/mets_gbs_backend.py +399 -0
- docling/backend/msexcel_backend.py +686 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +1663 -0
- docling/backend/noop_backend.py +51 -0
- docling/backend/pdf_backend.py +82 -0
- docling/backend/pypdfium2_backend.py +417 -0
- docling/backend/webvtt_backend.py +572 -0
- docling/backend/xml/__init__.py +0 -0
- docling/backend/xml/jats_backend.py +819 -0
- docling/backend/xml/uspto_backend.py +1905 -0
- docling/chunking/__init__.py +12 -0
- docling/cli/__init__.py +0 -0
- docling/cli/main.py +974 -0
- docling/cli/models.py +196 -0
- docling/cli/tools.py +17 -0
- docling/datamodel/__init__.py +0 -0
- docling/datamodel/accelerator_options.py +69 -0
- docling/datamodel/asr_model_specs.py +494 -0
- docling/datamodel/backend_options.py +102 -0
- docling/datamodel/base_models.py +493 -0
- docling/datamodel/document.py +699 -0
- docling/datamodel/extraction.py +39 -0
- docling/datamodel/layout_model_specs.py +91 -0
- docling/datamodel/pipeline_options.py +457 -0
- docling/datamodel/pipeline_options_asr_model.py +78 -0
- docling/datamodel/pipeline_options_vlm_model.py +136 -0
- docling/datamodel/settings.py +65 -0
- docling/datamodel/vlm_model_specs.py +365 -0
- docling/document_converter.py +559 -0
- docling/document_extractor.py +327 -0
- docling/exceptions.py +10 -0
- docling/experimental/__init__.py +5 -0
- docling/experimental/datamodel/__init__.py +1 -0
- docling/experimental/datamodel/table_crops_layout_options.py +13 -0
- docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
- docling/experimental/models/__init__.py +3 -0
- docling/experimental/models/table_crops_layout_model.py +114 -0
- docling/experimental/pipeline/__init__.py +1 -0
- docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
- docling/models/__init__.py +0 -0
- docling/models/base_layout_model.py +39 -0
- docling/models/base_model.py +230 -0
- docling/models/base_ocr_model.py +241 -0
- docling/models/base_table_model.py +45 -0
- docling/models/extraction/__init__.py +0 -0
- docling/models/extraction/nuextract_transformers_model.py +305 -0
- docling/models/factories/__init__.py +47 -0
- docling/models/factories/base_factory.py +122 -0
- docling/models/factories/layout_factory.py +7 -0
- docling/models/factories/ocr_factory.py +11 -0
- docling/models/factories/picture_description_factory.py +11 -0
- docling/models/factories/table_factory.py +7 -0
- docling/models/picture_description_base_model.py +149 -0
- docling/models/plugins/__init__.py +0 -0
- docling/models/plugins/defaults.py +60 -0
- docling/models/stages/__init__.py +0 -0
- docling/models/stages/code_formula/__init__.py +0 -0
- docling/models/stages/code_formula/code_formula_model.py +342 -0
- docling/models/stages/layout/__init__.py +0 -0
- docling/models/stages/layout/layout_model.py +249 -0
- docling/models/stages/ocr/__init__.py +0 -0
- docling/models/stages/ocr/auto_ocr_model.py +132 -0
- docling/models/stages/ocr/easyocr_model.py +200 -0
- docling/models/stages/ocr/ocr_mac_model.py +145 -0
- docling/models/stages/ocr/rapid_ocr_model.py +328 -0
- docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
- docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
- docling/models/stages/page_assemble/__init__.py +0 -0
- docling/models/stages/page_assemble/page_assemble_model.py +156 -0
- docling/models/stages/page_preprocessing/__init__.py +0 -0
- docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
- docling/models/stages/picture_classifier/__init__.py +0 -0
- docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
- docling/models/stages/picture_description/__init__.py +0 -0
- docling/models/stages/picture_description/picture_description_api_model.py +66 -0
- docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
- docling/models/stages/reading_order/__init__.py +0 -0
- docling/models/stages/reading_order/readingorder_model.py +431 -0
- docling/models/stages/table_structure/__init__.py +0 -0
- docling/models/stages/table_structure/table_structure_model.py +305 -0
- docling/models/utils/__init__.py +0 -0
- docling/models/utils/generation_utils.py +157 -0
- docling/models/utils/hf_model_download.py +45 -0
- docling/models/vlm_pipeline_models/__init__.py +1 -0
- docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
- docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
- docling/models/vlm_pipeline_models/mlx_model.py +325 -0
- docling/models/vlm_pipeline_models/vllm_model.py +344 -0
- docling/pipeline/__init__.py +0 -0
- docling/pipeline/asr_pipeline.py +431 -0
- docling/pipeline/base_extraction_pipeline.py +72 -0
- docling/pipeline/base_pipeline.py +326 -0
- docling/pipeline/extraction_vlm_pipeline.py +207 -0
- docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
- docling/pipeline/simple_pipeline.py +55 -0
- docling/pipeline/standard_pdf_pipeline.py +859 -0
- docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
- docling/pipeline/vlm_pipeline.py +416 -0
- docling/py.typed +1 -0
- docling/utils/__init__.py +0 -0
- docling/utils/accelerator_utils.py +97 -0
- docling/utils/api_image_request.py +205 -0
- docling/utils/deepseekocr_utils.py +388 -0
- docling/utils/export.py +146 -0
- docling/utils/glm_utils.py +361 -0
- docling/utils/layout_postprocessor.py +683 -0
- docling/utils/locks.py +3 -0
- docling/utils/model_downloader.py +168 -0
- docling/utils/ocr_utils.py +69 -0
- docling/utils/orientation.py +65 -0
- docling/utils/profiling.py +65 -0
- docling/utils/utils.py +65 -0
- docling/utils/visualization.py +85 -0
- docling-2.69.0.dist-info/METADATA +237 -0
- docling-2.69.0.dist-info/RECORD +138 -0
- docling-2.69.0.dist-info/WHEEL +5 -0
- docling-2.69.0.dist-info/entry_points.txt +6 -0
- docling-2.69.0.dist-info/licenses/LICENSE +21 -0
- docling-2.69.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,431 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from docling_core.types.doc import (
|
|
4
|
+
DocItemLabel,
|
|
5
|
+
DoclingDocument,
|
|
6
|
+
DocumentOrigin,
|
|
7
|
+
GroupLabel,
|
|
8
|
+
NodeItem,
|
|
9
|
+
ProvenanceItem,
|
|
10
|
+
RefItem,
|
|
11
|
+
RichTableCell,
|
|
12
|
+
TableData,
|
|
13
|
+
)
|
|
14
|
+
from docling_core.types.doc.document import ContentLayer
|
|
15
|
+
from docling_ibm_models.list_item_normalizer.list_marker_processor import (
|
|
16
|
+
ListItemMarkerProcessor,
|
|
17
|
+
)
|
|
18
|
+
from docling_ibm_models.reading_order.reading_order_rb import (
|
|
19
|
+
PageElement as ReadingOrderPageElement,
|
|
20
|
+
ReadingOrderPredictor,
|
|
21
|
+
)
|
|
22
|
+
from pydantic import BaseModel, ConfigDict
|
|
23
|
+
|
|
24
|
+
from docling.datamodel.base_models import (
|
|
25
|
+
BasePageElement,
|
|
26
|
+
Cluster,
|
|
27
|
+
ContainerElement,
|
|
28
|
+
FigureElement,
|
|
29
|
+
Table,
|
|
30
|
+
TextElement,
|
|
31
|
+
)
|
|
32
|
+
from docling.datamodel.document import ConversionResult
|
|
33
|
+
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class ReadingOrderOptions(BaseModel):
    """Options accepted by ``ReadingOrderModel``."""

    # Pydantic reserves the ``model_`` attribute prefix by default; clearing
    # ``protected_namespaces`` lets the ``model_names`` field below be declared
    # without a namespace-conflict warning.
    model_config = ConfigDict(protected_namespaces=())

    # Names of the models to use, packed into a single string.
    model_names: str = ""  # e.g. "language;term;reference"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class ReadingOrderModel:
    """Put assembled page elements into reading order and build the document.

    The model wraps a ``ReadingOrderPredictor`` (element ordering plus
    caption, footnote and merge prediction) and a ``ListItemMarkerProcessor``
    that is applied to every list item added to the output document.
    Calling an instance with a ``ConversionResult`` returns the resulting
    ``DoclingDocument``.
    """

    def __init__(self, options: ReadingOrderOptions):
        """Store ``options`` and create the predictor and list-item processor."""
        self.options = options
        self.ro_model = ReadingOrderPredictor()
        self.list_item_processor = ListItemMarkerProcessor()

    def _assembled_to_readingorder_elements(
        self, conv_res: ConversionResult
    ) -> list[ReadingOrderPageElement]:
        """Wrap every assembled element in a ``ReadingOrderPageElement``.

        Args:
            conv_res: Conversion result whose ``assembled.elements`` and
                ``pages`` are read.

        Returns:
            One predictor element per assembled element, in input order.
            ``cid`` is the element's position in that list and ``ref`` encodes
            ``#/<page_no>/<cluster id>`` so the element can be found again.
        """
        page_no_to_pages = {p.page_no: p for p in conv_res.pages}
        elements: list[ReadingOrderPageElement] = []

        for cid, element in enumerate(conv_res.assembled.elements):
            page_height = page_no_to_pages[element.page_no].size.height  # type: ignore
            bbox = element.cluster.bbox.to_bottom_left_origin(page_height)

            elements.append(
                ReadingOrderPageElement(
                    cid=cid,
                    ref=RefItem(cref=f"#/{element.page_no}/{element.cluster.id}"),
                    # Elements without text are passed on with an empty string.
                    text=element.text or "",
                    page_no=element.page_no,
                    page_size=page_no_to_pages[element.page_no].size,
                    label=element.label,
                    l=bbox.l,
                    r=bbox.r,
                    b=bbox.b,
                    t=bbox.t,
                    coord_origin=bbox.coord_origin,
                )
            )

        return elements

    def _add_child_elements(
        self, element: BasePageElement, doc_item: NodeItem, doc: DoclingDocument
    ):
        """Add the child clusters of ``element`` to ``doc`` below ``doc_item``.

        Args:
            element: Page element whose ``cluster.children`` are added.
            doc_item: Node that becomes the parent of every created item.
            doc: Document being built; it must already contain the page
                ``element.page_no`` because that page's height is read.
        """
        child: Cluster
        for child in element.cluster.children:
            c_bbox = child.bbox.to_bottom_left_origin(
                doc.pages[element.page_no].size.height
            )
            # Join the non-blank cell texts with single spaces; the "\x02"
            # control character is replaced by a hyphen.
            c_text = " ".join(
                cell.text.replace("\x02", "-").strip()
                for cell in child.cells
                if cell.text.strip()
            )
            c_prov = ProvenanceItem(
                page_no=element.page_no, charspan=(0, len(c_text)), bbox=c_bbox
            )

            if child.label == DocItemLabel.LIST_ITEM:
                # TODO: Infer if this is a numbered or a bullet list item
                l_item = doc.add_list_item(parent=doc_item, text=c_text, prov=c_prov)
                self.list_item_processor.process_list_item(l_item)
            elif child.label == DocItemLabel.SECTION_HEADER:
                doc.add_heading(parent=doc_item, text=c_text, prov=c_prov)
            else:
                doc.add_text(
                    parent=doc_item, label=child.label, text=c_text, prov=c_prov
                )

    def _create_rich_cell_group(
        self, element: BasePageElement, doc: DoclingDocument, table_item: NodeItem
    ) -> RefItem:
        """Create a group containing all child elements for a rich table cell.

        Args:
            element: Table element whose children populate the group.
            doc: Document the group and its children are added to.
            table_item: Table item that becomes the parent of the group.

        Returns:
            Reference to the newly created group.
        """
        group_element = doc.add_group(
            label=GroupLabel.UNSPECIFIED,
            name=f"rich_cell_group_{len(doc.tables)}_0_0",
            parent=table_item,
        )

        # Add all child elements to the group
        self._add_child_elements(element, group_element, doc)

        return group_element.get_ref()

    def _readingorder_elements_to_docling_doc(
        self,
        conv_res: ConversionResult,
        ro_elements: list[ReadingOrderPageElement],
        el_to_captions_mapping: dict[int, list[int]],
        el_to_footnotes_mapping: dict[int, list[int]],
        el_merges_mapping: dict[int, list[int]],
    ) -> DoclingDocument:
        """Build the output document from the ordered elements.

        Args:
            conv_res: Conversion result providing pages, assembled elements
                and the input file's name and hash.
            ro_elements: Predictor elements in the order they are emitted.
            el_to_captions_mapping: cid of an element -> cids of its captions.
            el_to_footnotes_mapping: cid of an element -> cids of its footnotes.
            el_merges_mapping: cid of a text element -> cids of the elements
                whose text is merged into it.

        Returns:
            The populated ``DoclingDocument``.

        Raises:
            AssertionError: If a page in ``conv_res.pages`` has no size.
        """
        id_to_elem = {
            RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem
            for elem in conv_res.assembled.elements
        }
        cid_to_rels = {rel.cid: rel for rel in ro_elements}
        page_no_to_pages = {p.page_no: p for p in conv_res.pages}

        origin = DocumentOrigin(
            mimetype="application/pdf",
            filename=conv_res.input.file.name,
            binary_hash=conv_res.input.document_hash,
        )
        doc_name = Path(origin.filename).stem
        out_doc: DoclingDocument = DoclingDocument(name=doc_name, origin=origin)

        for page in conv_res.pages:
            assert page.size is not None, "Page size is not initialized."
            out_doc.add_page(page_no=page.page_no, size=page.size)

        # Every cid that appears as a caption, footnote or merge target is
        # emitted together with its owner, so the main loop must skip it.
        skippable_cids = {
            cid
            for mapping in (
                el_to_captions_mapping,
                el_to_footnotes_mapping,
                el_merges_mapping,
            )
            for lst in mapping.values()
            for cid in lst
        }

        def element_for(cid):
            """Resolve a reading-order cid back to its assembled element."""
            return id_to_elem[cid_to_rels[cid].ref.cref]

        def height_of(elem):
            """Return the height of the page that ``elem`` lies on."""
            return page_no_to_pages[elem.page_no].size.height  # type: ignore

        def attach_captions_and_footnotes(owner_cid, owner_item):
            """Add the captions and footnotes predicted for ``owner_cid``.

            Each caption/footnote is converted with the height of its *own*
            page: its provenance records its own ``page_no``, which may
            differ from the owner's page.
            """
            for caption_cid in el_to_captions_mapping.get(owner_cid, ()):
                caption_elem = element_for(caption_cid)
                new_cap_item = self._add_caption_or_footnote(
                    caption_elem, out_doc, owner_item, height_of(caption_elem)
                )
                owner_item.captions.append(new_cap_item.get_ref())

            for footnote_cid in el_to_footnotes_mapping.get(owner_cid, ()):
                footnote_elem = element_for(footnote_cid)
                new_footnote_item = self._add_caption_or_footnote(
                    footnote_elem, out_doc, owner_item, height_of(footnote_elem)
                )
                owner_item.footnotes.append(new_footnote_item.get_ref())

        current_list = None

        for rel in ro_elements:
            if rel.cid in skippable_cids:
                continue
            element = id_to_elem[rel.ref.cref]
            page_height = height_of(element)

            if isinstance(element, TextElement):
                if element.label == DocItemLabel.CODE:
                    code_text = element.text
                    prov = ProvenanceItem(
                        page_no=element.page_no,
                        charspan=(0, len(code_text)),
                        bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
                    )
                    code_item = out_doc.add_code(text=code_text, prov=prov)
                    attach_captions_and_footnotes(rel.cid, code_item)
                else:
                    new_item, current_list = self._handle_text_element(
                        element, out_doc, current_list, page_height
                    )

                    for merged_cid in el_merges_mapping.get(rel.cid, ()):
                        merged_elem = element_for(merged_cid)
                        # The merged element may sit on another page, so its
                        # bbox is converted with that page's height.
                        self._merge_elements(
                            element, merged_elem, new_item, height_of(merged_elem)
                        )

            elif isinstance(element, Table):
                # A table without structure prediction has zero rows and cols.
                lacks_structure = element.num_rows == 0 and element.num_cols == 0
                # Only create a 1x1 table if there are children to put in it.
                needs_rich_cell = lacks_structure and bool(element.cluster.children)

                if needs_rich_cell:
                    # Minimal 1x1 table; the rich cell is added further down.
                    tbl_data = TableData(num_rows=1, num_cols=1, table_cells=[])
                elif lacks_structure:
                    # Empty table with no structure
                    tbl_data = TableData(num_rows=0, num_cols=0, table_cells=[])
                else:
                    tbl_data = TableData(
                        num_rows=element.num_rows,
                        num_cols=element.num_cols,
                        table_cells=element.table_cells,
                    )

                prov = ProvenanceItem(
                    page_no=element.page_no,
                    charspan=(0, 0),
                    bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
                )

                tbl = out_doc.add_table(
                    data=tbl_data, prov=prov, label=element.cluster.label
                )
                attach_captions_and_footnotes(rel.cid, tbl)

                if needs_rich_cell:
                    # Create rich cell containing all child elements
                    rich_cell_ref = self._create_rich_cell_group(element, out_doc, tbl)

                    # Create rich table cell spanning the entire 1x1 table
                    rich_cell = RichTableCell(
                        text="",  # Empty text since content is in the group
                        row_span=1,
                        col_span=1,
                        start_row_offset_idx=0,
                        end_row_offset_idx=1,
                        start_col_offset_idx=0,
                        end_col_offset_idx=1,
                        column_header=False,
                        row_header=False,
                        ref=rich_cell_ref,
                    )
                    out_doc.add_table_cell(table_item=tbl, cell=rich_cell)

                # TODO: Consider adding children of Table.

            elif isinstance(element, FigureElement):
                # Pictures carry no text of their own, hence the empty span.
                prov = ProvenanceItem(
                    page_no=element.page_no,
                    charspan=(0, 0),
                    bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
                )
                pic = out_doc.add_picture(prov=prov)
                attach_captions_and_footnotes(rel.cid, pic)

                self._add_child_elements(element, pic, out_doc)

            elif isinstance(element, ContainerElement):  # Form, KV region
                if element.label == DocItemLabel.FORM:
                    group_label = GroupLabel.FORM_AREA
                elif element.label == DocItemLabel.KEY_VALUE_REGION:
                    group_label = GroupLabel.KEY_VALUE_AREA
                else:
                    group_label = GroupLabel.UNSPECIFIED

                container_el = out_doc.add_group(label=group_label)

                self._add_child_elements(element, container_el, out_doc)

        return out_doc

    def _add_caption_or_footnote(self, elem, out_doc, parent, page_height):
        """Add ``elem`` as a text item below ``parent`` and return the item.

        Args:
            elem: Caption or footnote element; must be a ``TextElement``.
            out_doc: Document the text item is added to.
            parent: Item the new text item is attached to.
            page_height: Page height used to convert ``elem``'s bounding box
                to bottom-left origin.

        Returns:
            The text item created by ``out_doc.add_text``.

        Raises:
            AssertionError: If ``elem`` is not a ``TextElement``.
        """
        assert isinstance(elem, TextElement)
        text = elem.text
        prov = ProvenanceItem(
            page_no=elem.page_no,
            charspan=(0, len(text)),
            bbox=elem.cluster.bbox.to_bottom_left_origin(page_height),
        )
        return out_doc.add_text(label=elem.label, text=text, prov=prov, parent=parent)

    def _handle_text_element(self, element, out_doc, current_list, page_height):
        """Add a non-code text element to ``out_doc``.

        List items are collected in a list group: a new group is opened when
        ``current_list`` is ``None``; any other label closes the running list.
        Formulas are stored with empty ``text`` and the element text in
        ``orig``. Page headers and footers go to the furniture content layer.

        Args:
            element: Text element to add.
            out_doc: Document being built.
            current_list: List group currently being filled, or ``None``.
            page_height: Height of the page ``element`` lies on.

        Returns:
            Tuple ``(new_item, current_list)`` with the created item and the
            list group to carry over to the next element (``None`` if closed).
        """
        cap_text = element.text

        prov = ProvenanceItem(
            page_no=element.page_no,
            charspan=(0, len(cap_text)),
            bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
        )
        label = element.label

        if label == DocItemLabel.LIST_ITEM:
            if current_list is None:
                current_list = out_doc.add_group(label=GroupLabel.LIST, name="list")

            # TODO: Infer if this is a numbered or a bullet list item
            new_item = out_doc.add_list_item(
                text=cap_text, enumerated=False, prov=prov, parent=current_list
            )
            self.list_item_processor.process_list_item(new_item)
            return new_item, current_list

        # Every other label ends the list that was being collected.
        current_list = None

        if label == DocItemLabel.SECTION_HEADER:
            new_item = out_doc.add_heading(text=cap_text, prov=prov)
        elif label == DocItemLabel.FORMULA:
            new_item = out_doc.add_text(
                label=DocItemLabel.FORMULA, text="", orig=cap_text, prov=prov
            )
        else:
            if label in (DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER):
                content_layer = ContentLayer.FURNITURE
            else:
                content_layer = ContentLayer.BODY

            new_item = out_doc.add_text(
                label=label,
                text=cap_text,
                prov=prov,
                content_layer=content_layer,
            )
        return new_item, current_list

    def _merge_elements(self, element, merged_elem, new_item, page_height):
        """Append the text of ``merged_elem`` to ``new_item``.

        Args:
            element: Element that ``new_item`` was created from.
            merged_elem: Element whose text is appended; must have the same
                type as ``element`` and the same label as ``new_item``.
            new_item: Document item that is extended in place (``text``,
                ``orig`` and ``prov``).
            page_height: Page height used to convert ``merged_elem``'s
                bounding box to bottom-left origin.

        Raises:
            AssertionError: If the type or label precondition does not hold.
        """
        assert isinstance(merged_elem, type(element)), (
            "Merged element must be of same type as element."
        )
        assert merged_elem.label == new_item.label, (
            "Labels of merged elements must match."
        )
        # The text is appended after one separating space, so its span starts
        # one character past the current end of the item's text.
        span_start = len(new_item.text) + 1
        prov = ProvenanceItem(
            page_no=merged_elem.page_no,
            charspan=(span_start, span_start + len(merged_elem.text)),
            bbox=merged_elem.cluster.bbox.to_bottom_left_origin(page_height),
        )
        new_item.text += f" {merged_elem.text}"
        new_item.orig += f" {merged_elem.text}"  # TODO: This is incomplete, we don't have the `orig` field of the merged element.
        new_item.prov.append(prov)

    def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
        """Run reading-order prediction on ``conv_res`` and build the document.

        The work is timed under the ``"reading_order"`` key at document scope.

        Args:
            conv_res: Conversion result with assembled elements and pages.

        Returns:
            The ``DoclingDocument`` built from the ordered elements.
        """
        with TimeRecorder(conv_res, "reading_order", scope=ProfilingScope.DOCUMENT):
            page_elements = self._assembled_to_readingorder_elements(conv_res)

            # Apply reading order
            sorted_elements = self.ro_model.predict_reading_order(
                page_elements=page_elements
            )
            el_to_captions_mapping = self.ro_model.predict_to_captions(
                sorted_elements=sorted_elements
            )
            el_to_footnotes_mapping = self.ro_model.predict_to_footnotes(
                sorted_elements=sorted_elements
            )
            el_merges_mapping = self.ro_model.predict_merges(
                sorted_elements=sorted_elements
            )

            docling_doc: DoclingDocument = self._readingorder_elements_to_docling_doc(
                conv_res,
                sorted_elements,
                el_to_captions_mapping,
                el_to_footnotes_mapping,
                el_merges_mapping,
            )

        return docling_doc
|
|
File without changes
|