docling 2.69.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling might be problematic. Click here for more details.
- docling/__init__.py +0 -0
- docling/backend/__init__.py +0 -0
- docling/backend/abstract_backend.py +84 -0
- docling/backend/asciidoc_backend.py +443 -0
- docling/backend/csv_backend.py +125 -0
- docling/backend/docling_parse_backend.py +237 -0
- docling/backend/docling_parse_v2_backend.py +276 -0
- docling/backend/docling_parse_v4_backend.py +260 -0
- docling/backend/docx/__init__.py +0 -0
- docling/backend/docx/drawingml/utils.py +131 -0
- docling/backend/docx/latex/__init__.py +0 -0
- docling/backend/docx/latex/latex_dict.py +274 -0
- docling/backend/docx/latex/omml.py +459 -0
- docling/backend/html_backend.py +1502 -0
- docling/backend/image_backend.py +188 -0
- docling/backend/json/__init__.py +0 -0
- docling/backend/json/docling_json_backend.py +58 -0
- docling/backend/md_backend.py +618 -0
- docling/backend/mets_gbs_backend.py +399 -0
- docling/backend/msexcel_backend.py +686 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +1663 -0
- docling/backend/noop_backend.py +51 -0
- docling/backend/pdf_backend.py +82 -0
- docling/backend/pypdfium2_backend.py +417 -0
- docling/backend/webvtt_backend.py +572 -0
- docling/backend/xml/__init__.py +0 -0
- docling/backend/xml/jats_backend.py +819 -0
- docling/backend/xml/uspto_backend.py +1905 -0
- docling/chunking/__init__.py +12 -0
- docling/cli/__init__.py +0 -0
- docling/cli/main.py +974 -0
- docling/cli/models.py +196 -0
- docling/cli/tools.py +17 -0
- docling/datamodel/__init__.py +0 -0
- docling/datamodel/accelerator_options.py +69 -0
- docling/datamodel/asr_model_specs.py +494 -0
- docling/datamodel/backend_options.py +102 -0
- docling/datamodel/base_models.py +493 -0
- docling/datamodel/document.py +699 -0
- docling/datamodel/extraction.py +39 -0
- docling/datamodel/layout_model_specs.py +91 -0
- docling/datamodel/pipeline_options.py +457 -0
- docling/datamodel/pipeline_options_asr_model.py +78 -0
- docling/datamodel/pipeline_options_vlm_model.py +136 -0
- docling/datamodel/settings.py +65 -0
- docling/datamodel/vlm_model_specs.py +365 -0
- docling/document_converter.py +559 -0
- docling/document_extractor.py +327 -0
- docling/exceptions.py +10 -0
- docling/experimental/__init__.py +5 -0
- docling/experimental/datamodel/__init__.py +1 -0
- docling/experimental/datamodel/table_crops_layout_options.py +13 -0
- docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
- docling/experimental/models/__init__.py +3 -0
- docling/experimental/models/table_crops_layout_model.py +114 -0
- docling/experimental/pipeline/__init__.py +1 -0
- docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
- docling/models/__init__.py +0 -0
- docling/models/base_layout_model.py +39 -0
- docling/models/base_model.py +230 -0
- docling/models/base_ocr_model.py +241 -0
- docling/models/base_table_model.py +45 -0
- docling/models/extraction/__init__.py +0 -0
- docling/models/extraction/nuextract_transformers_model.py +305 -0
- docling/models/factories/__init__.py +47 -0
- docling/models/factories/base_factory.py +122 -0
- docling/models/factories/layout_factory.py +7 -0
- docling/models/factories/ocr_factory.py +11 -0
- docling/models/factories/picture_description_factory.py +11 -0
- docling/models/factories/table_factory.py +7 -0
- docling/models/picture_description_base_model.py +149 -0
- docling/models/plugins/__init__.py +0 -0
- docling/models/plugins/defaults.py +60 -0
- docling/models/stages/__init__.py +0 -0
- docling/models/stages/code_formula/__init__.py +0 -0
- docling/models/stages/code_formula/code_formula_model.py +342 -0
- docling/models/stages/layout/__init__.py +0 -0
- docling/models/stages/layout/layout_model.py +249 -0
- docling/models/stages/ocr/__init__.py +0 -0
- docling/models/stages/ocr/auto_ocr_model.py +132 -0
- docling/models/stages/ocr/easyocr_model.py +200 -0
- docling/models/stages/ocr/ocr_mac_model.py +145 -0
- docling/models/stages/ocr/rapid_ocr_model.py +328 -0
- docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
- docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
- docling/models/stages/page_assemble/__init__.py +0 -0
- docling/models/stages/page_assemble/page_assemble_model.py +156 -0
- docling/models/stages/page_preprocessing/__init__.py +0 -0
- docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
- docling/models/stages/picture_classifier/__init__.py +0 -0
- docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
- docling/models/stages/picture_description/__init__.py +0 -0
- docling/models/stages/picture_description/picture_description_api_model.py +66 -0
- docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
- docling/models/stages/reading_order/__init__.py +0 -0
- docling/models/stages/reading_order/readingorder_model.py +431 -0
- docling/models/stages/table_structure/__init__.py +0 -0
- docling/models/stages/table_structure/table_structure_model.py +305 -0
- docling/models/utils/__init__.py +0 -0
- docling/models/utils/generation_utils.py +157 -0
- docling/models/utils/hf_model_download.py +45 -0
- docling/models/vlm_pipeline_models/__init__.py +1 -0
- docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
- docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
- docling/models/vlm_pipeline_models/mlx_model.py +325 -0
- docling/models/vlm_pipeline_models/vllm_model.py +344 -0
- docling/pipeline/__init__.py +0 -0
- docling/pipeline/asr_pipeline.py +431 -0
- docling/pipeline/base_extraction_pipeline.py +72 -0
- docling/pipeline/base_pipeline.py +326 -0
- docling/pipeline/extraction_vlm_pipeline.py +207 -0
- docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
- docling/pipeline/simple_pipeline.py +55 -0
- docling/pipeline/standard_pdf_pipeline.py +859 -0
- docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
- docling/pipeline/vlm_pipeline.py +416 -0
- docling/py.typed +1 -0
- docling/utils/__init__.py +0 -0
- docling/utils/accelerator_utils.py +97 -0
- docling/utils/api_image_request.py +205 -0
- docling/utils/deepseekocr_utils.py +388 -0
- docling/utils/export.py +146 -0
- docling/utils/glm_utils.py +361 -0
- docling/utils/layout_postprocessor.py +683 -0
- docling/utils/locks.py +3 -0
- docling/utils/model_downloader.py +168 -0
- docling/utils/ocr_utils.py +69 -0
- docling/utils/orientation.py +65 -0
- docling/utils/profiling.py +65 -0
- docling/utils/utils.py +65 -0
- docling/utils/visualization.py +85 -0
- docling-2.69.0.dist-info/METADATA +237 -0
- docling-2.69.0.dist-info/RECORD +138 -0
- docling-2.69.0.dist-info/WHEEL +5 -0
- docling-2.69.0.dist-info/entry_points.txt +6 -0
- docling-2.69.0.dist-info/licenses/LICENSE +21 -0
- docling-2.69.0.dist-info/top_level.txt +1 -0
docling/utils/export.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from collections.abc import Iterable
|
|
3
|
+
from typing import Any, Dict, List, Tuple, Union
|
|
4
|
+
|
|
5
|
+
from docling_core.types.doc import BoundingBox, CoordOrigin
|
|
6
|
+
from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
|
|
7
|
+
|
|
8
|
+
from docling.datamodel.document import ConversionResult, Page
|
|
9
|
+
|
|
10
|
+
_log = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def generate_multimodal_pages(
    doc_result: ConversionResult,
) -> Iterable[
    Tuple[str, str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]
]:
    """Yield one multimodal record per page of a converted document.

    The items of ``doc_result.legacy_document.main_text`` are walked in order
    and grouped by the page of their first provenance entry. A page is emitted
    as soon as an item belonging to a later page is encountered, and once more
    after the last item if anything is still pending.

    Args:
        doc_result: Conversion result; ``legacy_document`` and ``pages`` are
            read from it.

    Yields:
        A 6-tuple ``(content_text, content_md, content_dt, page_cells,
        page_segments, page)`` where ``content_text`` is the space-joined text
        of the page's items, ``content_md`` / ``content_dt`` are the markdown
        and document-token exports of the page's main-text range,
        ``page_cells`` and ``page_segments`` are lists of dicts with
        normalized top-left-origin bounding boxes, and ``page`` is the
        corresponding entry of ``doc_result.pages``.
    """
    # Maps legacy ``obj_type`` values to the label names emitted in segments.
    # Items whose type is not listed here are left out of ``page_segments``.
    label_to_doclaynet = {
        "title": "title",
        "table-of-contents": "document_index",
        "subtitle-level-1": "section_header",
        "checkbox-selected": "checkbox_selected",
        "checkbox-unselected": "checkbox_unselected",
        "caption": "caption",
        "page-header": "page_header",
        "page-footer": "page_footer",
        "footnote": "footnote",
        "table": "table",
        "formula": "formula",
        "list-item": "list_item",
        "code": "code",
        "figure": "picture",
        "picture": "picture",
        "reference": "text",
        "paragraph": "text",
        "text": "text",
    }

    # Accumulators for the page currently being collected. The nested helpers
    # read them through the closure at call time.
    content_text = ""
    page_no = 0
    start_ix = 0
    end_ix = 0
    doc_items: List[Tuple[int, Union[BaseCell, BaseText]]] = []

    doc = doc_result.legacy_document

    def _process_page_segments(
        doc_items: List[Tuple[int, Union[BaseCell, BaseText]]], page: Page
    ) -> List[Dict[str, Any]]:
        """Build one segment dict per labelled item located on ``page``."""
        segments: List[Dict[str, Any]] = []

        # Without a page size no bounding box can be normalized, so no segment
        # can be produced for any item.
        if page.size is None:
            return segments

        for ix, item in doc_items:
            label = label_to_doclaynet.get(item.obj_type)

            if label is None or item.prov is None:
                continue

            bbox = BoundingBox.from_tuple(
                tuple(item.prov[0].bbox), origin=CoordOrigin.BOTTOMLEFT
            )
            new_bbox = bbox.to_top_left_origin(page_height=page.size.height).normalized(
                page_size=page.size
            )

            new_segment: Dict[str, Any] = {
                "index_in_doc": ix,
                "label": label,
                "text": item.text if item.text is not None else "",
                "bbox": new_bbox.as_tuple(),
                "data": [],
            }

            if isinstance(item, Table):
                # Tables additionally carry their HTML rendering; the OTSL
                # sequence is left empty.
                new_segment["data"].append(
                    {
                        "html_seq": item.export_to_html(),
                        "otsl_seq": "",
                    }
                )

            segments.append(new_segment)

        return segments

    def _process_page_cells(page: Page) -> List[dict]:
        """Build one dict per text cell of ``page`` with a normalized bbox."""
        cells: List[dict] = []
        if page.size is None:
            return cells
        for cell in page.cells:
            new_bbox = (
                cell.rect.to_bounding_box()
                .to_top_left_origin(page_height=page.size.height)
                .normalized(page_size=page.size)
            )
            cells.append(
                {
                    "text": cell.text,
                    "bbox": new_bbox.as_tuple(),
                    "ocr": cell.from_ocr,
                    "ocr_confidence": cell.confidence,
                }
            )
        return cells

    def _process_page():
        """Assemble the output tuple for the page currently accumulated."""
        # NOTE(review): assumes page numbers are 1-based and that
        # ``doc_result.pages`` is indexable by ``page_no - 1`` - confirm.
        page_ix = page_no - 1
        page = doc_result.pages[page_ix]

        page_cells = _process_page_cells(page=page)
        page_segments = _process_page_segments(doc_items=doc_items, page=page)
        # NOTE(review): ``end_ix`` is the index of the last item collected for
        # this page; whether ``main_text_stop`` is inclusive is not visible
        # here - confirm against the export implementation.
        content_md = doc.export_to_markdown(
            main_text_start=start_ix, main_text_stop=end_ix
        )
        # No page-tagging since we only do 1 page at the time
        content_dt = doc.export_to_document_tokens(
            main_text_start=start_ix, main_text_stop=end_ix, add_page_index=False
        )

        return content_text, content_md, content_dt, page_cells, page_segments, page

    if doc.main_text is None:
        return

    for ix, orig_item in enumerate(doc.main_text):
        item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item
        if item is None or not item.prov:
            # Unresolvable references and items without provenance cannot be
            # assigned to a page.
            _log.debug("Skipping item %s", orig_item)
            continue

        item_page = item.prov[0].page

        # Page is complete
        if page_no > 0 and item_page > page_no:
            yield _process_page()

            # Start collecting the next page from the current item.
            start_ix = ix
            doc_items = []
            content_text = ""

        page_no = item_page
        end_ix = ix
        doc_items.append((ix, item))
        if item.text:
            content_text += item.text + " "

    # Emit the trailing page, which no later item triggered.
    if doc_items:
        yield _process_page()
|
|
@@ -0,0 +1,361 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from docling_core.types.doc import (
|
|
7
|
+
BoundingBox,
|
|
8
|
+
CoordOrigin,
|
|
9
|
+
DocItemLabel,
|
|
10
|
+
DoclingDocument,
|
|
11
|
+
DocumentOrigin,
|
|
12
|
+
GroupLabel,
|
|
13
|
+
ProvenanceItem,
|
|
14
|
+
Size,
|
|
15
|
+
TableCell,
|
|
16
|
+
TableData,
|
|
17
|
+
)
|
|
18
|
+
from docling_core.types.doc.document import ContentLayer
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def resolve_item(paths, obj):
    """Find item in document from a reference path.

    Args:
        paths: Sequence of path components, typically the result of splitting
            a reference such as ``"#/texts/3"`` on ``"/"``. ``"#"`` components
            are skipped.
        obj: Root container (nested dicts / lists) to walk.

    Returns:
        The referenced item, or ``None`` when any component cannot be
        resolved (missing key, index out of range, or an attempt to descend
        into something that is neither a dict nor a list/tuple).
    """
    node = obj

    for part in paths:
        if part == "#":
            continue

        # Numeric-looking components address list positions; anything else is
        # used verbatim as a mapping key.
        try:
            key = int(part)
        except (TypeError, ValueError):
            key = part

        if isinstance(node, dict):
            # Prefer the raw component so that string keys like "0" resolve,
            # then fall back to the converted integer key.
            if part in node:
                node = node[part]
            elif key in node:
                node = node[key]
            else:
                return None
        elif (
            isinstance(node, (list, tuple))
            and isinstance(key, int)
            and 0 <= key < len(node)
        ):
            node = node[key]
        else:
            # Out-of-range or negative index, non-integer key on a sequence,
            # or a path that continues past a scalar / None value.
            return None

    return node
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]:
|
|
56
|
+
unique_objects = []
|
|
57
|
+
seen_spans = set()
|
|
58
|
+
|
|
59
|
+
for sublist in grid:
|
|
60
|
+
for obj in sublist:
|
|
61
|
+
# Convert the spans list to a tuple of tuples for hashing
|
|
62
|
+
spans_tuple = tuple(tuple(span) for span in obj["spans"])
|
|
63
|
+
if spans_tuple not in seen_spans:
|
|
64
|
+
seen_spans.add(spans_tuple)
|
|
65
|
+
unique_objects.append(obj)
|
|
66
|
+
|
|
67
|
+
return unique_objects
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:  # noqa: C901
    """Convert a GLM-style document dictionary into a ``DoclingDocument``.

    Args:
        doc_glm: Dictionary describing the document. The keys read here are
            ``"file-info"``, ``"page-dimensions"``, ``"page-elements"`` and,
            optionally, ``"properties"``; page elements reference other parts
            of the dictionary through their ``"iref"`` path.
        update_name_label: When true, the label of a ``"paragraph"`` element
            is replaced by the label of its single ``"semantic"`` property,
            provided that property's confidence exceeds 0.85.

    Returns:
        The populated ``DoclingDocument``.
    """
    # The mimetype is hard-coded; only filename and hash come from the input.
    origin = DocumentOrigin(
        mimetype="application/pdf",
        filename=doc_glm["file-info"]["filename"],
        binary_hash=doc_glm["file-info"]["document-hash"],
    )
    doc_name = Path(origin.filename).stem

    doc: DoclingDocument = DoclingDocument(name=doc_name, origin=origin)

    # Register every page with its dimensions before adding content.
    for page_dim in doc_glm["page-dimensions"]:
        page_no = int(page_dim["page"])
        size = Size(width=page_dim["width"], height=page_dim["height"])

        doc.add_page(page_no=page_no, size=size)

    # Properties are only consulted for the optional label override below.
    if "properties" in doc_glm:
        props = pd.DataFrame(
            doc_glm["properties"]["data"], columns=doc_glm["properties"]["headers"]
        )
    else:
        props = pd.DataFrame()

    # Group that consecutive list items are attached to; reset to None by most
    # non-list-item branches so that the next list item opens a new group.
    current_list = None

    # NOTE(review): ``ix`` is not used inside the loop.
    for ix, pelem in enumerate(doc_glm["page-elements"]):
        ptype = pelem["type"]
        span_i = pelem["span"][0]
        span_j = pelem["span"][1]

        # Elements without an internal reference cannot be resolved; skip.
        if "iref" not in pelem:
            # print(json.dumps(pelem, indent=2))
            continue

        iref = pelem["iref"]

        # Caption elements are skipped here; captions are added from the
        # figure / table branches below via the owning object's "captions".
        if re.match("#/figures/(\\d+)/captions/(.+)", iref):
            # print(f"skip {iref}")
            continue

        if re.match("#/tables/(\\d+)/captions/(.+)", iref):
            # print(f"skip {iref}")
            continue

        path = iref.split("/")
        obj = resolve_item(path, doc_glm)

        if obj is None:
            current_list = None
            # NOTE(review): the warning goes to stdout via print, not a logger.
            print(f"warning: undefined {path}")
            continue

        if ptype == "figure":
            current_list = None
            # Concatenated caption text; only its length is used, for the
            # picture's charspan.
            text = ""
            caption_refs = []
            for caption in obj["captions"]:
                text += caption["text"]

                # Each provenance entry of a caption becomes its own caption
                # text item in the document.
                for nprov in caption["prov"]:
                    npaths = nprov["$ref"].split("/")
                    nelem = resolve_item(npaths, doc_glm)

                    if nelem is None:
                        # print(f"warning: undefined caption {npaths}")
                        continue

                    # Overwrites the element-level span values; they are not
                    # read again in this branch after the caption loop.
                    span_i = nelem["span"][0]
                    span_j = nelem["span"][1]

                    cap_text = caption["text"][span_i:span_j]

                    # doc_glm["page-elements"].remove(nelem)

                    prov = ProvenanceItem(
                        page_no=nelem["page"],
                        charspan=tuple(nelem["span"]),
                        bbox=BoundingBox.from_tuple(
                            nelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
                        ),
                    )

                    caption_obj = doc.add_text(
                        label=DocItemLabel.CAPTION, text=cap_text, prov=prov
                    )
                    caption_refs.append(caption_obj.get_ref())

            prov = ProvenanceItem(
                page_no=pelem["page"],
                charspan=(0, len(text)),
                bbox=BoundingBox.from_tuple(
                    pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
                ),
            )

            pic = doc.add_picture(prov=prov)
            pic.captions.extend(caption_refs)
            _add_child_elements(pic, doc, obj, pelem)

        elif ptype == "table":
            current_list = None
            # NOTE(review): ``text`` is accumulated below but never read in
            # this branch (the table charspan is fixed to (0, 0)).
            text = ""
            caption_refs = []
            item_label = DocItemLabel(pelem["name"])

            # Same caption handling as in the figure branch.
            for caption in obj["captions"]:
                text += caption["text"]

                for nprov in caption["prov"]:
                    npaths = nprov["$ref"].split("/")
                    nelem = resolve_item(npaths, doc_glm)

                    if nelem is None:
                        # print(f"warning: undefined caption {npaths}")
                        continue

                    span_i = nelem["span"][0]
                    span_j = nelem["span"][1]

                    cap_text = caption["text"][span_i:span_j]

                    # doc_glm["page-elements"].remove(nelem)

                    prov = ProvenanceItem(
                        page_no=nelem["page"],
                        charspan=tuple(nelem["span"]),
                        bbox=BoundingBox.from_tuple(
                            nelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
                        ),
                    )

                    caption_obj = doc.add_text(
                        label=DocItemLabel.CAPTION, text=cap_text, prov=prov
                    )
                    caption_refs.append(caption_obj.get_ref())

            # One entry per distinct cell (de-duplicated on the "spans" value).
            table_cells_glm = _flatten_table_grid(obj["data"])

            table_cells = []
            for tbl_cell_glm in table_cells_glm:
                if tbl_cell_glm["bbox"] is not None:
                    bbox = BoundingBox.from_tuple(
                        tbl_cell_glm["bbox"], origin=CoordOrigin.BOTTOMLEFT
                    )
                else:
                    bbox = None

                # At most one of the three flags is set, chosen by the
                # cell's "type" value.
                is_col_header = False
                is_row_header = False
                is_row_section = False

                if tbl_cell_glm["type"] == "col_header":
                    is_col_header = True
                elif tbl_cell_glm["type"] == "row_header":
                    is_row_header = True
                elif tbl_cell_glm["type"] == "row_section":
                    is_row_section = True

                # Span sizes are the difference between the end and start
                # offsets stored in "row-span" / "col-span".
                table_cells.append(
                    TableCell(
                        row_span=tbl_cell_glm["row-span"][1]
                        - tbl_cell_glm["row-span"][0],
                        col_span=tbl_cell_glm["col-span"][1]
                        - tbl_cell_glm["col-span"][0],
                        start_row_offset_idx=tbl_cell_glm["row-span"][0],
                        end_row_offset_idx=tbl_cell_glm["row-span"][1],
                        start_col_offset_idx=tbl_cell_glm["col-span"][0],
                        end_col_offset_idx=tbl_cell_glm["col-span"][1],
                        text=tbl_cell_glm["text"],
                        bbox=bbox,
                        column_header=is_col_header,
                        row_header=is_row_header,
                        row_section=is_row_section,
                    )
                )

            # Row / column counts default to 0 when absent from the input.
            tbl_data = TableData(
                num_rows=obj.get("#-rows", 0),
                num_cols=obj.get("#-cols", 0),
                table_cells=table_cells,
            )

            prov = ProvenanceItem(
                page_no=pelem["page"],
                charspan=(0, 0),
                bbox=BoundingBox.from_tuple(
                    pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
                ),
            )

            tbl = doc.add_table(data=tbl_data, prov=prov, label=item_label)
            tbl.captions.extend(caption_refs)

        elif ptype in [DocItemLabel.FORM.value, DocItemLabel.KEY_VALUE_REGION.value]:
            # Forms and key-value regions become groups whose content comes
            # from the object's payload children.
            # NOTE(review): unlike the other branches, ``current_list`` is not
            # reset here - confirm this is intended.
            label = DocItemLabel(ptype)
            group_label = GroupLabel.UNSPECIFIED
            if label == DocItemLabel.FORM:
                group_label = GroupLabel.FORM_AREA
            elif label == DocItemLabel.KEY_VALUE_REGION:
                group_label = GroupLabel.KEY_VALUE_AREA

            container_el = doc.add_group(label=group_label)

            _add_child_elements(container_el, doc, obj, pelem)
        elif "text" in obj:
            # Plain text-bearing element: take the slice of the referenced
            # object's text given by the page element's span.
            text = obj["text"][span_i:span_j]

            type_label = pelem["type"]
            name_label = pelem["name"]
            # Optional override of the label from the properties table.
            if update_name_label and len(props) > 0 and type_label == "paragraph":
                prop = props[
                    (props["type"] == "semantic") & (props["subj_path"] == iref)
                ]
                if len(prop) == 1 and prop.iloc[0]["confidence"] > 0.85:
                    name_label = prop.iloc[0]["label"]

            prov = ProvenanceItem(
                page_no=pelem["page"],
                charspan=(0, len(text)),
                bbox=BoundingBox.from_tuple(
                    pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT
                ),
            )
            label = DocItemLabel(name_label)

            if label == DocItemLabel.LIST_ITEM:
                # Open a new list group unless one is already in progress.
                if current_list is None:
                    current_list = doc.add_group(label=GroupLabel.LIST, name="list")

                # TODO: Infer if this is a numbered or a bullet list item
                doc.add_list_item(
                    text=text, enumerated=False, prov=prov, parent=current_list
                )
            elif label == DocItemLabel.SECTION_HEADER:
                current_list = None

                doc.add_heading(text=text, prov=prov)
            elif label == DocItemLabel.CODE:
                current_list = None

                doc.add_code(text=text, prov=prov)
            elif label == DocItemLabel.FORMULA:
                current_list = None

                # The source text is passed as ``orig``; ``text`` is left empty.
                doc.add_text(label=DocItemLabel.FORMULA, text="", orig=text, prov=prov)
            elif label in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
                current_list = None

                # Page headers / footers are placed on the furniture layer.
                doc.add_text(
                    label=DocItemLabel(name_label),
                    text=text,
                    prov=prov,
                    content_layer=ContentLayer.FURNITURE,
                )
            else:
                current_list = None

                doc.add_text(label=DocItemLabel(name_label), text=text, prov=prov)

    return doc
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
def _add_child_elements(container_el, doc, obj, pelem):
|
|
333
|
+
payload = obj.get("payload")
|
|
334
|
+
if payload is not None:
|
|
335
|
+
children = payload.get("children", [])
|
|
336
|
+
|
|
337
|
+
for child in children:
|
|
338
|
+
c_label = DocItemLabel(child["label"])
|
|
339
|
+
c_bbox = BoundingBox.model_validate(child["bbox"]).to_bottom_left_origin(
|
|
340
|
+
doc.pages[pelem["page"]].size.height
|
|
341
|
+
)
|
|
342
|
+
c_text = " ".join(
|
|
343
|
+
[
|
|
344
|
+
cell["text"].replace("\x02", "-").strip()
|
|
345
|
+
for cell in child["cells"]
|
|
346
|
+
if len(cell["text"].strip()) > 0
|
|
347
|
+
]
|
|
348
|
+
)
|
|
349
|
+
|
|
350
|
+
c_prov = ProvenanceItem(
|
|
351
|
+
page_no=pelem["page"], charspan=(0, len(c_text)), bbox=c_bbox
|
|
352
|
+
)
|
|
353
|
+
if c_label == DocItemLabel.LIST_ITEM:
|
|
354
|
+
# TODO: Infer if this is a numbered or a bullet list item
|
|
355
|
+
doc.add_list_item(parent=container_el, text=c_text, prov=c_prov)
|
|
356
|
+
elif c_label == DocItemLabel.SECTION_HEADER:
|
|
357
|
+
doc.add_heading(parent=container_el, text=c_text, prov=c_prov)
|
|
358
|
+
else:
|
|
359
|
+
doc.add_text(
|
|
360
|
+
parent=container_el, label=c_label, text=c_text, prov=c_prov
|
|
361
|
+
)
|