docling 2.69.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (138)
  1. docling/__init__.py +0 -0
  2. docling/backend/__init__.py +0 -0
  3. docling/backend/abstract_backend.py +84 -0
  4. docling/backend/asciidoc_backend.py +443 -0
  5. docling/backend/csv_backend.py +125 -0
  6. docling/backend/docling_parse_backend.py +237 -0
  7. docling/backend/docling_parse_v2_backend.py +276 -0
  8. docling/backend/docling_parse_v4_backend.py +260 -0
  9. docling/backend/docx/__init__.py +0 -0
  10. docling/backend/docx/drawingml/utils.py +131 -0
  11. docling/backend/docx/latex/__init__.py +0 -0
  12. docling/backend/docx/latex/latex_dict.py +274 -0
  13. docling/backend/docx/latex/omml.py +459 -0
  14. docling/backend/html_backend.py +1502 -0
  15. docling/backend/image_backend.py +188 -0
  16. docling/backend/json/__init__.py +0 -0
  17. docling/backend/json/docling_json_backend.py +58 -0
  18. docling/backend/md_backend.py +618 -0
  19. docling/backend/mets_gbs_backend.py +399 -0
  20. docling/backend/msexcel_backend.py +686 -0
  21. docling/backend/mspowerpoint_backend.py +398 -0
  22. docling/backend/msword_backend.py +1663 -0
  23. docling/backend/noop_backend.py +51 -0
  24. docling/backend/pdf_backend.py +82 -0
  25. docling/backend/pypdfium2_backend.py +417 -0
  26. docling/backend/webvtt_backend.py +572 -0
  27. docling/backend/xml/__init__.py +0 -0
  28. docling/backend/xml/jats_backend.py +819 -0
  29. docling/backend/xml/uspto_backend.py +1905 -0
  30. docling/chunking/__init__.py +12 -0
  31. docling/cli/__init__.py +0 -0
  32. docling/cli/main.py +974 -0
  33. docling/cli/models.py +196 -0
  34. docling/cli/tools.py +17 -0
  35. docling/datamodel/__init__.py +0 -0
  36. docling/datamodel/accelerator_options.py +69 -0
  37. docling/datamodel/asr_model_specs.py +494 -0
  38. docling/datamodel/backend_options.py +102 -0
  39. docling/datamodel/base_models.py +493 -0
  40. docling/datamodel/document.py +699 -0
  41. docling/datamodel/extraction.py +39 -0
  42. docling/datamodel/layout_model_specs.py +91 -0
  43. docling/datamodel/pipeline_options.py +457 -0
  44. docling/datamodel/pipeline_options_asr_model.py +78 -0
  45. docling/datamodel/pipeline_options_vlm_model.py +136 -0
  46. docling/datamodel/settings.py +65 -0
  47. docling/datamodel/vlm_model_specs.py +365 -0
  48. docling/document_converter.py +559 -0
  49. docling/document_extractor.py +327 -0
  50. docling/exceptions.py +10 -0
  51. docling/experimental/__init__.py +5 -0
  52. docling/experimental/datamodel/__init__.py +1 -0
  53. docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  54. docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  55. docling/experimental/models/__init__.py +3 -0
  56. docling/experimental/models/table_crops_layout_model.py +114 -0
  57. docling/experimental/pipeline/__init__.py +1 -0
  58. docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  59. docling/models/__init__.py +0 -0
  60. docling/models/base_layout_model.py +39 -0
  61. docling/models/base_model.py +230 -0
  62. docling/models/base_ocr_model.py +241 -0
  63. docling/models/base_table_model.py +45 -0
  64. docling/models/extraction/__init__.py +0 -0
  65. docling/models/extraction/nuextract_transformers_model.py +305 -0
  66. docling/models/factories/__init__.py +47 -0
  67. docling/models/factories/base_factory.py +122 -0
  68. docling/models/factories/layout_factory.py +7 -0
  69. docling/models/factories/ocr_factory.py +11 -0
  70. docling/models/factories/picture_description_factory.py +11 -0
  71. docling/models/factories/table_factory.py +7 -0
  72. docling/models/picture_description_base_model.py +149 -0
  73. docling/models/plugins/__init__.py +0 -0
  74. docling/models/plugins/defaults.py +60 -0
  75. docling/models/stages/__init__.py +0 -0
  76. docling/models/stages/code_formula/__init__.py +0 -0
  77. docling/models/stages/code_formula/code_formula_model.py +342 -0
  78. docling/models/stages/layout/__init__.py +0 -0
  79. docling/models/stages/layout/layout_model.py +249 -0
  80. docling/models/stages/ocr/__init__.py +0 -0
  81. docling/models/stages/ocr/auto_ocr_model.py +132 -0
  82. docling/models/stages/ocr/easyocr_model.py +200 -0
  83. docling/models/stages/ocr/ocr_mac_model.py +145 -0
  84. docling/models/stages/ocr/rapid_ocr_model.py +328 -0
  85. docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
  86. docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
  87. docling/models/stages/page_assemble/__init__.py +0 -0
  88. docling/models/stages/page_assemble/page_assemble_model.py +156 -0
  89. docling/models/stages/page_preprocessing/__init__.py +0 -0
  90. docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
  91. docling/models/stages/picture_classifier/__init__.py +0 -0
  92. docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
  93. docling/models/stages/picture_description/__init__.py +0 -0
  94. docling/models/stages/picture_description/picture_description_api_model.py +66 -0
  95. docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
  96. docling/models/stages/reading_order/__init__.py +0 -0
  97. docling/models/stages/reading_order/readingorder_model.py +431 -0
  98. docling/models/stages/table_structure/__init__.py +0 -0
  99. docling/models/stages/table_structure/table_structure_model.py +305 -0
  100. docling/models/utils/__init__.py +0 -0
  101. docling/models/utils/generation_utils.py +157 -0
  102. docling/models/utils/hf_model_download.py +45 -0
  103. docling/models/vlm_pipeline_models/__init__.py +1 -0
  104. docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
  105. docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
  106. docling/models/vlm_pipeline_models/mlx_model.py +325 -0
  107. docling/models/vlm_pipeline_models/vllm_model.py +344 -0
  108. docling/pipeline/__init__.py +0 -0
  109. docling/pipeline/asr_pipeline.py +431 -0
  110. docling/pipeline/base_extraction_pipeline.py +72 -0
  111. docling/pipeline/base_pipeline.py +326 -0
  112. docling/pipeline/extraction_vlm_pipeline.py +207 -0
  113. docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
  114. docling/pipeline/simple_pipeline.py +55 -0
  115. docling/pipeline/standard_pdf_pipeline.py +859 -0
  116. docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  117. docling/pipeline/vlm_pipeline.py +416 -0
  118. docling/py.typed +1 -0
  119. docling/utils/__init__.py +0 -0
  120. docling/utils/accelerator_utils.py +97 -0
  121. docling/utils/api_image_request.py +205 -0
  122. docling/utils/deepseekocr_utils.py +388 -0
  123. docling/utils/export.py +146 -0
  124. docling/utils/glm_utils.py +361 -0
  125. docling/utils/layout_postprocessor.py +683 -0
  126. docling/utils/locks.py +3 -0
  127. docling/utils/model_downloader.py +168 -0
  128. docling/utils/ocr_utils.py +69 -0
  129. docling/utils/orientation.py +65 -0
  130. docling/utils/profiling.py +65 -0
  131. docling/utils/utils.py +65 -0
  132. docling/utils/visualization.py +85 -0
  133. docling-2.69.0.dist-info/METADATA +237 -0
  134. docling-2.69.0.dist-info/RECORD +138 -0
  135. docling-2.69.0.dist-info/WHEEL +5 -0
  136. docling-2.69.0.dist-info/entry_points.txt +6 -0
  137. docling-2.69.0.dist-info/licenses/LICENSE +21 -0
  138. docling-2.69.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,146 @@
1
+ import logging
2
+ from collections.abc import Iterable
3
+ from typing import Any, Dict, List, Tuple, Union
4
+
5
+ from docling_core.types.doc import BoundingBox, CoordOrigin
6
+ from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
7
+
8
+ from docling.datamodel.document import ConversionResult, Page
9
+
10
+ _log = logging.getLogger(__name__)
11
+
12
+
13
def generate_multimodal_pages(
    doc_result: ConversionResult,
) -> Iterable[
    Tuple[str, str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]
]:
    """Yield one multimodal record per page of a converted document.

    The items of ``doc_result.legacy_document.main_text`` are walked in order
    and grouped by the page of their first provenance entry. A group is
    emitted as soon as an item on a later page is seen, and once more at the
    end for the last group.

    Args:
        doc_result: Conversion result; its ``legacy_document`` and ``pages``
            attributes are read.

    Yields:
        A 6-tuple ``(content_text, content_md, content_dt, page_cells,
        page_segments, page)``:

        * ``content_text``: the non-empty item texts of the page, each
          followed by a single space.
        * ``content_md``: result of ``export_to_markdown`` restricted to the
          page's ``main_text`` index range.
        * ``content_dt``: result of ``export_to_document_tokens`` for the
          same range, without page index tags.
        * ``page_cells``: one dict per page cell with the keys ``text``,
          ``bbox``, ``ocr`` and ``ocr_confidence``.
        * ``page_segments``: one dict per recognised item with the keys
          ``index_in_doc``, ``label``, ``text``, ``bbox`` and ``data``.
        * ``page``: the entry of ``doc_result.pages`` at ``page_no - 1``.

        Nothing is yielded when ``main_text`` is ``None``.
    """
    label_to_doclaynet = {
        "title": "title",
        "table-of-contents": "document_index",
        "subtitle-level-1": "section_header",
        "checkbox-selected": "checkbox_selected",
        "checkbox-unselected": "checkbox_unselected",
        "caption": "caption",
        "page-header": "page_header",
        "page-footer": "page_footer",
        "footnote": "footnote",
        "table": "table",
        "formula": "formula",
        "list-item": "list_item",
        "code": "code",
        "figure": "picture",
        "picture": "picture",
        "reference": "text",
        "paragraph": "text",
        "text": "text",
    }

    # State of the page currently being accumulated; the nested helpers read
    # these names from the enclosing scope at call time.
    content_text = ""
    page_no = 0
    start_ix = 0
    end_ix = 0
    doc_items: List[Tuple[int, Union[BaseCell, BaseText]]] = []

    doc = doc_result.legacy_document

    def _process_page_segments(doc_items: list[Tuple[int, BaseCell]], page: Page):
        """Build the segment dicts for the items collected on ``page``."""
        segments = []

        for ix, item in doc_items:
            item_type = item.obj_type
            label = label_to_doclaynet.get(item_type)

            # Items with an unmapped type, no provenance, or on a page of
            # unknown size cannot be turned into a normalized segment.
            if label is None or item.prov is None or page.size is None:
                continue

            bbox = BoundingBox.from_tuple(
                tuple(item.prov[0].bbox), origin=CoordOrigin.BOTTOMLEFT
            )
            new_bbox = bbox.to_top_left_origin(page_height=page.size.height).normalized(
                page_size=page.size
            )

            new_segment = {
                "index_in_doc": ix,
                "label": label,
                "text": item.text if item.text is not None else "",
                "bbox": new_bbox.as_tuple(),
                "data": [],
            }

            if isinstance(item, Table):
                table_html = item.export_to_html()
                new_segment["data"].append(
                    {
                        "html_seq": table_html,
                        "otsl_seq": "",
                    }
                )

            segments.append(new_segment)

        return segments

    def _process_page_cells(page: Page):
        """Build the cell dicts for ``page``; empty when the size is unknown."""
        cells: List[dict] = []
        if page.size is None:
            return cells
        for cell in page.cells:
            new_bbox = (
                cell.rect.to_bounding_box()
                .to_top_left_origin(page_height=page.size.height)
                .normalized(page_size=page.size)
            )
            is_ocr = cell.from_ocr
            ocr_confidence = cell.confidence
            cells.append(
                {
                    "text": cell.text,
                    "bbox": new_bbox.as_tuple(),
                    "ocr": is_ocr,
                    "ocr_confidence": ocr_confidence,
                }
            )
        return cells

    def _process_page():
        """Assemble the 6-tuple for the page currently being accumulated."""
        page_ix = page_no - 1
        page = doc_result.pages[page_ix]

        page_cells = _process_page_cells(page=page)
        page_segments = _process_page_segments(doc_items=doc_items, page=page)
        content_md = doc.export_to_markdown(
            main_text_start=start_ix, main_text_stop=end_ix
        )
        # No page-tagging since we only do 1 page at the time
        content_dt = doc.export_to_document_tokens(
            main_text_start=start_ix, main_text_stop=end_ix, add_page_index=False
        )

        return content_text, content_md, content_dt, page_cells, page_segments, page

    if doc.main_text is None:
        return
    for ix, orig_item in enumerate(doc.main_text):
        item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item
        if item is None or item.prov is None or len(item.prov) == 0:
            _log.debug("Skipping item %s", orig_item)
            continue

        item_page = item.prov[0].page

        # Page is complete
        if page_no > 0 and item_page > page_no:
            yield _process_page()

            start_ix = ix
            doc_items = []
            content_text = ""

        page_no = item_page
        end_ix = ix
        doc_items.append((ix, item))
        if item.text is not None and item.text != "":
            content_text += item.text + " "

    # Flush the last accumulated page.
    if len(doc_items) > 0:
        yield _process_page()
@@ -0,0 +1,361 @@
1
+ import re
2
+ from pathlib import Path
3
+ from typing import List
4
+
5
+ import pandas as pd
6
+ from docling_core.types.doc import (
7
+ BoundingBox,
8
+ CoordOrigin,
9
+ DocItemLabel,
10
+ DoclingDocument,
11
+ DocumentOrigin,
12
+ GroupLabel,
13
+ ProvenanceItem,
14
+ Size,
15
+ TableCell,
16
+ TableData,
17
+ )
18
+ from docling_core.types.doc.document import ContentLayer
19
+
20
+
21
def resolve_item(paths, obj):
    """Find item in document from a reference path.

    Args:
        paths: Sequence of path segments, typically ``iref.split("/")``.
            Segments equal to ``"#"`` are skipped wherever they occur.
        obj: Root container (nested mappings and sequences) to walk.

    Returns:
        The object the path points to, or ``None`` when any segment cannot be
        resolved. An empty path (or one made only of ``"#"``) returns ``obj``.

    A segment is looked up in a mapping first as given and then, if it parses
    as an integer, by that integer. In any other container a segment is used
    as a non-negative index only; negative and out-of-range indices resolve to
    ``None`` rather than wrapping around or raising.
    """
    from collections.abc import Mapping

    current = obj
    for segment in paths:
        if segment == "#":
            continue

        try:
            index = int(segment)
        except (TypeError, ValueError):
            index = None

        if isinstance(current, Mapping):
            if segment in current:
                current = current[segment]
            elif index is not None and index in current:
                current = current[index]
            else:
                return None
            continue

        # Non-mapping containers can only be addressed by position.
        if index is None:
            return None
        try:
            if not 0 <= index < len(current):
                return None
            current = current[index]
        except TypeError:
            # `current` is a leaf (e.g. None or a number) that cannot be indexed.
            return None

    return current
53
+
54
+
55
+ def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]:
56
+ unique_objects = []
57
+ seen_spans = set()
58
+
59
+ for sublist in grid:
60
+ for obj in sublist:
61
+ # Convert the spans list to a tuple of tuples for hashing
62
+ spans_tuple = tuple(tuple(span) for span in obj["spans"])
63
+ if spans_tuple not in seen_spans:
64
+ seen_spans.add(spans_tuple)
65
+ unique_objects.append(obj)
66
+
67
+ return unique_objects
68
+
69
+
70
# Page elements that point at a figure or table caption are skipped in the
# main loop; their text is added while the owning figure/table is processed.
_CAPTION_IREF_PATTERN = re.compile(r"#/(?:figures|tables)/(\d+)/captions/(.+)")


def _add_caption_items(doc, doc_glm, captions):
    """Add the captions of one figure/table to ``doc`` as CAPTION text items.

    Returns a ``(text, caption_refs)`` pair: the concatenation of every
    caption's ``text`` and the refs of the caption items that were added.
    Provenance references that cannot be resolved in ``doc_glm`` are skipped.
    """
    text = ""
    caption_refs = []

    for caption in captions:
        text += caption["text"]

        for nprov in caption["prov"]:
            npaths = nprov["$ref"].split("/")
            nelem = resolve_item(npaths, doc_glm)

            if nelem is None:
                continue

            span_i = nelem["span"][0]
            span_j = nelem["span"][1]

            cap_text = caption["text"][span_i:span_j]

            prov = ProvenanceItem(
                page_no=nelem["page"],
                charspan=tuple(nelem["span"]),
                bbox=BoundingBox.from_tuple(
                    nelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
                ),
            )

            caption_obj = doc.add_text(
                label=DocItemLabel.CAPTION, text=cap_text, prov=prov
            )
            caption_refs.append(caption_obj.get_ref())

    return text, caption_refs


def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:  # noqa: C901
    """Convert a GLM-style document dict into a ``DoclingDocument``.

    Args:
        doc_glm: Mapping read with the keys ``file-info``, ``page-dimensions``,
            ``page-elements`` and optionally ``properties``; page elements
            reference their content through an ``iref`` path into this mapping.
        update_name_label: When true and ``properties`` is present, a
            ``paragraph`` element takes its label from the single ``semantic``
            property row matching its ``iref`` if that row's confidence is
            above 0.85.

    Returns:
        The populated document. The origin mimetype is always
        ``application/pdf`` and all bounding boxes are created with a
        bottom-left coordinate origin.

    Page elements without an ``iref``, caption elements, and elements whose
    ``iref`` cannot be resolved are skipped (the latter with a printed
    warning).
    """
    origin = DocumentOrigin(
        mimetype="application/pdf",
        filename=doc_glm["file-info"]["filename"],
        binary_hash=doc_glm["file-info"]["document-hash"],
    )
    doc_name = Path(origin.filename).stem

    doc: DoclingDocument = DoclingDocument(name=doc_name, origin=origin)

    for page_dim in doc_glm["page-dimensions"]:
        page_no = int(page_dim["page"])
        size = Size(width=page_dim["width"], height=page_dim["height"])

        doc.add_page(page_no=page_no, size=size)

    if "properties" in doc_glm:
        props = pd.DataFrame(
            doc_glm["properties"]["data"], columns=doc_glm["properties"]["headers"]
        )
    else:
        props = pd.DataFrame()

    # Group that consecutive list items are attached to; reset by the
    # branches below whenever a non-list element interrupts the run.
    current_list = None

    for pelem in doc_glm["page-elements"]:
        ptype = pelem["type"]
        span_i = pelem["span"][0]
        span_j = pelem["span"][1]

        if "iref" not in pelem:
            continue

        iref = pelem["iref"]

        if _CAPTION_IREF_PATTERN.match(iref):
            continue

        path = iref.split("/")
        obj = resolve_item(path, doc_glm)

        if obj is None:
            current_list = None
            print(f"warning: undefined {path}")
            continue

        if ptype == "figure":
            current_list = None
            text, caption_refs = _add_caption_items(doc, doc_glm, obj["captions"])

            prov = ProvenanceItem(
                page_no=pelem["page"],
                charspan=(0, len(text)),
                bbox=BoundingBox.from_tuple(
                    pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
                ),
            )

            pic = doc.add_picture(prov=prov)
            pic.captions.extend(caption_refs)
            _add_child_elements(pic, doc, obj, pelem)

        elif ptype == "table":
            current_list = None
            # Validate the label before any caption item is added.
            item_label = DocItemLabel(pelem["name"])
            _, caption_refs = _add_caption_items(doc, doc_glm, obj["captions"])

            table_cells_glm = _flatten_table_grid(obj["data"])

            table_cells = []
            for tbl_cell_glm in table_cells_glm:
                if tbl_cell_glm["bbox"] is not None:
                    bbox = BoundingBox.from_tuple(
                        tbl_cell_glm["bbox"], origin=CoordOrigin.BOTTOMLEFT
                    )
                else:
                    bbox = None

                is_col_header = False
                is_row_header = False
                is_row_section = False

                if tbl_cell_glm["type"] == "col_header":
                    is_col_header = True
                elif tbl_cell_glm["type"] == "row_header":
                    is_row_header = True
                elif tbl_cell_glm["type"] == "row_section":
                    is_row_section = True

                table_cells.append(
                    TableCell(
                        row_span=tbl_cell_glm["row-span"][1]
                        - tbl_cell_glm["row-span"][0],
                        col_span=tbl_cell_glm["col-span"][1]
                        - tbl_cell_glm["col-span"][0],
                        start_row_offset_idx=tbl_cell_glm["row-span"][0],
                        end_row_offset_idx=tbl_cell_glm["row-span"][1],
                        start_col_offset_idx=tbl_cell_glm["col-span"][0],
                        end_col_offset_idx=tbl_cell_glm["col-span"][1],
                        text=tbl_cell_glm["text"],
                        bbox=bbox,
                        column_header=is_col_header,
                        row_header=is_row_header,
                        row_section=is_row_section,
                    )
                )

            tbl_data = TableData(
                num_rows=obj.get("#-rows", 0),
                num_cols=obj.get("#-cols", 0),
                table_cells=table_cells,
            )

            prov = ProvenanceItem(
                page_no=pelem["page"],
                charspan=(0, 0),
                bbox=BoundingBox.from_tuple(
                    pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
                ),
            )

            tbl = doc.add_table(data=tbl_data, prov=prov, label=item_label)
            tbl.captions.extend(caption_refs)

        elif ptype in [DocItemLabel.FORM.value, DocItemLabel.KEY_VALUE_REGION.value]:
            # NOTE(review): unlike the other branches this one does not reset
            # `current_list` — confirm whether that is intended.
            label = DocItemLabel(ptype)
            group_label = GroupLabel.UNSPECIFIED
            if label == DocItemLabel.FORM:
                group_label = GroupLabel.FORM_AREA
            elif label == DocItemLabel.KEY_VALUE_REGION:
                group_label = GroupLabel.KEY_VALUE_AREA

            container_el = doc.add_group(label=group_label)

            _add_child_elements(container_el, doc, obj, pelem)
        elif "text" in obj:
            text = obj["text"][span_i:span_j]

            type_label = pelem["type"]
            name_label = pelem["name"]
            if update_name_label and len(props) > 0 and type_label == "paragraph":
                prop = props[
                    (props["type"] == "semantic") & (props["subj_path"] == iref)
                ]
                if len(prop) == 1 and prop.iloc[0]["confidence"] > 0.85:
                    name_label = prop.iloc[0]["label"]

            prov = ProvenanceItem(
                page_no=pelem["page"],
                charspan=(0, len(text)),
                bbox=BoundingBox.from_tuple(
                    pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
                ),
            )
            label = DocItemLabel(name_label)

            if label == DocItemLabel.LIST_ITEM:
                if current_list is None:
                    current_list = doc.add_group(label=GroupLabel.LIST, name="list")

                # TODO: Infer if this is a numbered or a bullet list item
                doc.add_list_item(
                    text=text, enumerated=False, prov=prov, parent=current_list
                )
            elif label == DocItemLabel.SECTION_HEADER:
                current_list = None

                doc.add_heading(text=text, prov=prov)
            elif label == DocItemLabel.CODE:
                current_list = None

                doc.add_code(text=text, prov=prov)
            elif label == DocItemLabel.FORMULA:
                current_list = None

                # The source text is kept as `orig`; `text` is left empty.
                doc.add_text(label=DocItemLabel.FORMULA, text="", orig=text, prov=prov)
            elif label in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
                current_list = None

                doc.add_text(
                    label=DocItemLabel(name_label),
                    text=text,
                    prov=prov,
                    content_layer=ContentLayer.FURNITURE,
                )
            else:
                current_list = None

                doc.add_text(label=DocItemLabel(name_label), text=text, prov=prov)

    return doc
330
+
331
+
332
+ def _add_child_elements(container_el, doc, obj, pelem):
333
+ payload = obj.get("payload")
334
+ if payload is not None:
335
+ children = payload.get("children", [])
336
+
337
+ for child in children:
338
+ c_label = DocItemLabel(child["label"])
339
+ c_bbox = BoundingBox.model_validate(child["bbox"]).to_bottom_left_origin(
340
+ doc.pages[pelem["page"]].size.height
341
+ )
342
+ c_text = " ".join(
343
+ [
344
+ cell["text"].replace("\x02", "-").strip()
345
+ for cell in child["cells"]
346
+ if len(cell["text"].strip()) > 0
347
+ ]
348
+ )
349
+
350
+ c_prov = ProvenanceItem(
351
+ page_no=pelem["page"], charspan=(0, len(c_text)), bbox=c_bbox
352
+ )
353
+ if c_label == DocItemLabel.LIST_ITEM:
354
+ # TODO: Infer if this is a numbered or a bullet list item
355
+ doc.add_list_item(parent=container_el, text=c_text, prov=c_prov)
356
+ elif c_label == DocItemLabel.SECTION_HEADER:
357
+ doc.add_heading(parent=container_el, text=c_text, prov=c_prov)
358
+ else:
359
+ doc.add_text(
360
+ parent=container_el, label=c_label, text=c_text, prov=c_prov
361
+ )