docling 1.6.2__py3-none-any.whl → 1.19.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/abstract_backend.py +17 -8
- docling/backend/docling_parse_backend.py +42 -26
- docling/backend/pypdfium2_backend.py +33 -11
- docling/cli/__init__.py +0 -0
- docling/cli/main.py +253 -0
- docling/datamodel/base_models.py +39 -27
- docling/datamodel/document.py +115 -17
- docling/datamodel/pipeline_options.py +67 -0
- docling/document_converter.py +65 -44
- docling/models/base_ocr_model.py +4 -4
- docling/models/ds_glm_model.py +11 -7
- docling/models/easyocr_model.py +19 -4
- docling/models/layout_model.py +3 -3
- docling/models/table_structure_model.py +18 -2
- docling/models/tesseract_ocr_cli_model.py +167 -0
- docling/models/tesseract_ocr_model.py +122 -0
- docling/pipeline/base_model_pipeline.py +4 -3
- docling/pipeline/standard_model_pipeline.py +36 -8
- docling/utils/export.py +145 -0
- {docling-1.6.2.dist-info → docling-1.19.1.dist-info}/LICENSE +1 -1
- docling-1.19.1.dist-info/METADATA +380 -0
- docling-1.19.1.dist-info/RECORD +34 -0
- docling-1.19.1.dist-info/entry_points.txt +3 -0
- docling-1.6.2.dist-info/METADATA +0 -192
- docling-1.6.2.dist-info/RECORD +0 -27
- {docling-1.6.2.dist-info → docling-1.19.1.dist-info}/WHEEL +0 -0
@@ -1,4 +1,5 @@
|
|
1
1
|
import copy
|
2
|
+
from pathlib import Path
|
2
3
|
from typing import Iterable, List
|
3
4
|
|
4
5
|
import numpy
|
@@ -12,16 +13,22 @@ from docling.datamodel.base_models import (
|
|
12
13
|
TableElement,
|
13
14
|
TableStructurePrediction,
|
14
15
|
)
|
16
|
+
from docling.datamodel.pipeline_options import TableFormerMode
|
15
17
|
|
16
18
|
|
17
19
|
class TableStructureModel:
|
18
20
|
def __init__(self, config):
|
19
21
|
self.config = config
|
20
22
|
self.do_cell_matching = config["do_cell_matching"]
|
23
|
+
self.mode = config["mode"]
|
21
24
|
|
22
25
|
self.enabled = config["enabled"]
|
23
26
|
if self.enabled:
|
24
|
-
artifacts_path = config["artifacts_path"]
|
27
|
+
artifacts_path: Path = config["artifacts_path"]
|
28
|
+
|
29
|
+
if self.mode == TableFormerMode.ACCURATE:
|
30
|
+
artifacts_path = artifacts_path / "fat"
|
31
|
+
|
25
32
|
# Third Party
|
26
33
|
import docling_ibm_models.tableformer.common as c
|
27
34
|
|
@@ -44,7 +51,16 @@ class TableStructureModel:
|
|
44
51
|
|
45
52
|
for tc in table_element.table_cells:
|
46
53
|
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
47
|
-
|
54
|
+
if tc.column_header:
|
55
|
+
width = 3
|
56
|
+
else:
|
57
|
+
width = 1
|
58
|
+
draw.rectangle([(x0, y0), (x1, y1)], outline="blue", width=width)
|
59
|
+
draw.text(
|
60
|
+
(x0 + 3, y0 + 3),
|
61
|
+
text=f"{tc.start_row_offset_idx}, {tc.start_col_offset_idx}",
|
62
|
+
fill="black",
|
63
|
+
)
|
48
64
|
|
49
65
|
image.show()
|
50
66
|
|
@@ -0,0 +1,167 @@
|
|
1
|
+
import io
|
2
|
+
import logging
|
3
|
+
import tempfile
|
4
|
+
from subprocess import DEVNULL, PIPE, Popen
|
5
|
+
from typing import Iterable, Tuple
|
6
|
+
|
7
|
+
import pandas as pd
|
8
|
+
|
9
|
+
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
|
10
|
+
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
|
11
|
+
from docling.models.base_ocr_model import BaseOcrModel
|
12
|
+
|
13
|
+
_log = logging.getLogger(__name__)
|
14
|
+
|
15
|
+
|
16
|
+
class TesseractOcrCliModel(BaseOcrModel):
|
17
|
+
|
18
|
+
def __init__(self, enabled: bool, options: TesseractCliOcrOptions):
|
19
|
+
super().__init__(enabled=enabled, options=options)
|
20
|
+
self.options: TesseractCliOcrOptions
|
21
|
+
|
22
|
+
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
23
|
+
|
24
|
+
self._name = None
|
25
|
+
self._version = None
|
26
|
+
|
27
|
+
if self.enabled:
|
28
|
+
try:
|
29
|
+
self._get_name_and_version()
|
30
|
+
|
31
|
+
except Exception as exc:
|
32
|
+
raise RuntimeError(
|
33
|
+
f"Tesseract is not available, aborting: {exc} "
|
34
|
+
"Install tesseract on your system and the tesseract binary is discoverable. "
|
35
|
+
"The actual command for Tesseract can be specified in `pipeline_options.ocr_options.tesseract_cmd='tesseract'`. "
|
36
|
+
"Alternatively, Docling has support for other OCR engines. See the documentation."
|
37
|
+
)
|
38
|
+
|
39
|
+
def _get_name_and_version(self) -> Tuple[str, str]:
|
40
|
+
|
41
|
+
if self._name != None and self._version != None:
|
42
|
+
return self._name, self._version
|
43
|
+
|
44
|
+
cmd = [self.options.tesseract_cmd, "--version"]
|
45
|
+
|
46
|
+
proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
|
47
|
+
stdout, stderr = proc.communicate()
|
48
|
+
|
49
|
+
proc.wait()
|
50
|
+
|
51
|
+
# HACK: Windows versions of Tesseract output the version to stdout, Linux versions
|
52
|
+
# to stderr, so check both.
|
53
|
+
version_line = (
|
54
|
+
(stdout.decode("utf8").strip() or stderr.decode("utf8").strip())
|
55
|
+
.split("\n")[0]
|
56
|
+
.strip()
|
57
|
+
)
|
58
|
+
|
59
|
+
# If everything else fails...
|
60
|
+
if not version_line:
|
61
|
+
version_line = "tesseract XXX"
|
62
|
+
|
63
|
+
name, version = version_line.split(" ")
|
64
|
+
|
65
|
+
self._name = name
|
66
|
+
self._version = version
|
67
|
+
|
68
|
+
return name, version
|
69
|
+
|
70
|
+
def _run_tesseract(self, ifilename: str):
|
71
|
+
|
72
|
+
cmd = [self.options.tesseract_cmd]
|
73
|
+
|
74
|
+
if self.options.lang is not None and len(self.options.lang) > 0:
|
75
|
+
cmd.append("-l")
|
76
|
+
cmd.append("+".join(self.options.lang))
|
77
|
+
if self.options.path is not None:
|
78
|
+
cmd.append("--tessdata-dir")
|
79
|
+
cmd.append(self.options.path)
|
80
|
+
|
81
|
+
cmd += [ifilename, "stdout", "tsv"]
|
82
|
+
_log.info("command: {}".format(" ".join(cmd)))
|
83
|
+
|
84
|
+
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
|
85
|
+
output, _ = proc.communicate()
|
86
|
+
|
87
|
+
# _log.info(output)
|
88
|
+
|
89
|
+
# Decode the byte string to a regular string
|
90
|
+
decoded_data = output.decode("utf-8")
|
91
|
+
# _log.info(decoded_data)
|
92
|
+
|
93
|
+
# Read the TSV file generated by Tesseract
|
94
|
+
df = pd.read_csv(io.StringIO(decoded_data), sep="\t")
|
95
|
+
|
96
|
+
# Display the dataframe (optional)
|
97
|
+
# _log.info("df: ", df.head())
|
98
|
+
|
99
|
+
# Filter rows that contain actual text (ignore header or empty rows)
|
100
|
+
df_filtered = df[df["text"].notnull() & (df["text"].str.strip() != "")]
|
101
|
+
|
102
|
+
return df_filtered
|
103
|
+
|
104
|
+
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
105
|
+
|
106
|
+
if not self.enabled:
|
107
|
+
yield from page_batch
|
108
|
+
return
|
109
|
+
|
110
|
+
for page in page_batch:
|
111
|
+
ocr_rects = self.get_ocr_rects(page)
|
112
|
+
|
113
|
+
all_ocr_cells = []
|
114
|
+
for ocr_rect in ocr_rects:
|
115
|
+
# Skip zero area boxes
|
116
|
+
if ocr_rect.area() == 0:
|
117
|
+
continue
|
118
|
+
high_res_image = page._backend.get_page_image(
|
119
|
+
scale=self.scale, cropbox=ocr_rect
|
120
|
+
)
|
121
|
+
|
122
|
+
with tempfile.NamedTemporaryFile(suffix=".png", mode="w") as image_file:
|
123
|
+
fname = image_file.name
|
124
|
+
high_res_image.save(fname)
|
125
|
+
|
126
|
+
df = self._run_tesseract(fname)
|
127
|
+
|
128
|
+
# _log.info(df)
|
129
|
+
|
130
|
+
# Print relevant columns (bounding box and text)
|
131
|
+
for ix, row in df.iterrows():
|
132
|
+
text = row["text"]
|
133
|
+
conf = row["conf"]
|
134
|
+
|
135
|
+
l = float(row["left"])
|
136
|
+
b = float(row["top"])
|
137
|
+
w = float(row["width"])
|
138
|
+
h = float(row["height"])
|
139
|
+
|
140
|
+
t = b + h
|
141
|
+
r = l + w
|
142
|
+
|
143
|
+
cell = OcrCell(
|
144
|
+
id=ix,
|
145
|
+
text=text,
|
146
|
+
confidence=conf / 100.0,
|
147
|
+
bbox=BoundingBox.from_tuple(
|
148
|
+
coord=(
|
149
|
+
(l / self.scale) + ocr_rect.l,
|
150
|
+
(b / self.scale) + ocr_rect.t,
|
151
|
+
(r / self.scale) + ocr_rect.l,
|
152
|
+
(t / self.scale) + ocr_rect.t,
|
153
|
+
),
|
154
|
+
origin=CoordOrigin.TOPLEFT,
|
155
|
+
),
|
156
|
+
)
|
157
|
+
all_ocr_cells.append(cell)
|
158
|
+
|
159
|
+
## Remove OCR cells which overlap with programmatic cells.
|
160
|
+
filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
|
161
|
+
|
162
|
+
page.cells.extend(filtered_ocr_cells)
|
163
|
+
|
164
|
+
# DEBUG code:
|
165
|
+
# self.draw_ocr_rects_and_cells(page, ocr_rects)
|
166
|
+
|
167
|
+
yield page
|
@@ -0,0 +1,122 @@
|
|
1
|
+
import logging
|
2
|
+
from typing import Iterable
|
3
|
+
|
4
|
+
import numpy
|
5
|
+
|
6
|
+
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
|
7
|
+
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
|
8
|
+
from docling.models.base_ocr_model import BaseOcrModel
|
9
|
+
|
10
|
+
_log = logging.getLogger(__name__)
|
11
|
+
|
12
|
+
|
13
|
+
class TesseractOcrModel(BaseOcrModel):
|
14
|
+
def __init__(self, enabled: bool, options: TesseractCliOcrOptions):
|
15
|
+
super().__init__(enabled=enabled, options=options)
|
16
|
+
self.options: TesseractCliOcrOptions
|
17
|
+
|
18
|
+
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
19
|
+
self.reader = None
|
20
|
+
|
21
|
+
if self.enabled:
|
22
|
+
setup_errmsg = (
|
23
|
+
"tesserocr is not correctly installed. "
|
24
|
+
"Please install it via `pip install tesserocr` to use this OCR engine. "
|
25
|
+
"Note that tesserocr might have to be manually compiled for working with"
|
26
|
+
"your Tesseract installation. The Docling documentation provides examples for it. "
|
27
|
+
"Alternatively, Docling has support for other OCR engines. See the documentation."
|
28
|
+
)
|
29
|
+
try:
|
30
|
+
import tesserocr
|
31
|
+
except ImportError:
|
32
|
+
raise ImportError(setup_errmsg)
|
33
|
+
|
34
|
+
try:
|
35
|
+
tesseract_version = tesserocr.tesseract_version()
|
36
|
+
_log.debug("Initializing TesserOCR: %s", tesseract_version)
|
37
|
+
except:
|
38
|
+
raise ImportError(setup_errmsg)
|
39
|
+
|
40
|
+
# Initialize the tesseractAPI
|
41
|
+
lang = "+".join(self.options.lang)
|
42
|
+
if self.options.path is not None:
|
43
|
+
self.reader = tesserocr.PyTessBaseAPI(
|
44
|
+
path=self.options.path,
|
45
|
+
lang=lang,
|
46
|
+
psm=tesserocr.PSM.AUTO,
|
47
|
+
init=True,
|
48
|
+
oem=tesserocr.OEM.DEFAULT,
|
49
|
+
)
|
50
|
+
else:
|
51
|
+
self.reader = tesserocr.PyTessBaseAPI(
|
52
|
+
lang=lang,
|
53
|
+
psm=tesserocr.PSM.AUTO,
|
54
|
+
init=True,
|
55
|
+
oem=tesserocr.OEM.DEFAULT,
|
56
|
+
)
|
57
|
+
self.reader_RIL = tesserocr.RIL
|
58
|
+
|
59
|
+
def __del__(self):
|
60
|
+
if self.reader is not None:
|
61
|
+
# Finalize the tesseractAPI
|
62
|
+
self.reader.End()
|
63
|
+
|
64
|
+
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
65
|
+
|
66
|
+
if not self.enabled:
|
67
|
+
yield from page_batch
|
68
|
+
return
|
69
|
+
|
70
|
+
for page in page_batch:
|
71
|
+
ocr_rects = self.get_ocr_rects(page)
|
72
|
+
|
73
|
+
all_ocr_cells = []
|
74
|
+
for ocr_rect in ocr_rects:
|
75
|
+
# Skip zero area boxes
|
76
|
+
if ocr_rect.area() == 0:
|
77
|
+
continue
|
78
|
+
high_res_image = page._backend.get_page_image(
|
79
|
+
scale=self.scale, cropbox=ocr_rect
|
80
|
+
)
|
81
|
+
|
82
|
+
# Retrieve text snippets with their bounding boxes
|
83
|
+
self.reader.SetImage(high_res_image)
|
84
|
+
boxes = self.reader.GetComponentImages(self.reader_RIL.TEXTLINE, True)
|
85
|
+
|
86
|
+
cells = []
|
87
|
+
for ix, (im, box, _, _) in enumerate(boxes):
|
88
|
+
# Set the area of interest. Tesseract uses Bottom-Left for the origin
|
89
|
+
self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
|
90
|
+
|
91
|
+
# Extract text within the bounding box
|
92
|
+
text = self.reader.GetUTF8Text().strip()
|
93
|
+
confidence = self.reader.MeanTextConf()
|
94
|
+
left = box["x"] / self.scale
|
95
|
+
bottom = box["y"] / self.scale
|
96
|
+
right = (box["x"] + box["w"]) / self.scale
|
97
|
+
top = (box["y"] + box["h"]) / self.scale
|
98
|
+
|
99
|
+
cells.append(
|
100
|
+
OcrCell(
|
101
|
+
id=ix,
|
102
|
+
text=text,
|
103
|
+
confidence=confidence,
|
104
|
+
bbox=BoundingBox.from_tuple(
|
105
|
+
coord=(left, top, right, bottom),
|
106
|
+
origin=CoordOrigin.TOPLEFT,
|
107
|
+
),
|
108
|
+
)
|
109
|
+
)
|
110
|
+
|
111
|
+
# del high_res_image
|
112
|
+
all_ocr_cells.extend(cells)
|
113
|
+
|
114
|
+
## Remove OCR cells which overlap with programmatic cells.
|
115
|
+
filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
|
116
|
+
|
117
|
+
page.cells.extend(filtered_ocr_cells)
|
118
|
+
|
119
|
+
# DEBUG code:
|
120
|
+
# self.draw_ocr_rects_and_cells(page, ocr_rects)
|
121
|
+
|
122
|
+
yield page
|
@@ -1,12 +1,13 @@
|
|
1
1
|
from pathlib import Path
|
2
|
-
from typing import Iterable
|
2
|
+
from typing import Callable, Iterable, List
|
3
3
|
|
4
|
-
from docling.datamodel.base_models import Page
|
4
|
+
from docling.datamodel.base_models import Page
|
5
|
+
from docling.datamodel.pipeline_options import PipelineOptions
|
5
6
|
|
6
7
|
|
7
8
|
class BaseModelPipeline:
|
8
9
|
def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
|
9
|
-
self.model_pipe = []
|
10
|
+
self.model_pipe: List[Callable] = []
|
10
11
|
self.artifacts_path = artifacts_path
|
11
12
|
self.pipeline_options = pipeline_options
|
12
13
|
|
@@ -1,37 +1,65 @@
|
|
1
1
|
from pathlib import Path
|
2
2
|
|
3
|
-
from docling.datamodel.
|
3
|
+
from docling.datamodel.pipeline_options import (
|
4
|
+
EasyOcrOptions,
|
5
|
+
PipelineOptions,
|
6
|
+
TesseractCliOcrOptions,
|
7
|
+
TesseractOcrOptions,
|
8
|
+
)
|
9
|
+
from docling.models.base_ocr_model import BaseOcrModel
|
4
10
|
from docling.models.easyocr_model import EasyOcrModel
|
5
11
|
from docling.models.layout_model import LayoutModel
|
6
12
|
from docling.models.table_structure_model import TableStructureModel
|
13
|
+
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
14
|
+
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
7
15
|
from docling.pipeline.base_model_pipeline import BaseModelPipeline
|
8
16
|
|
9
17
|
|
10
18
|
class StandardModelPipeline(BaseModelPipeline):
|
11
|
-
_layout_model_path = "model_artifacts/layout/beehive_v0.0.
|
19
|
+
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5_pt"
|
12
20
|
_table_model_path = "model_artifacts/tableformer"
|
13
21
|
|
14
22
|
def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
|
15
23
|
super().__init__(artifacts_path, pipeline_options)
|
16
24
|
|
25
|
+
ocr_model: BaseOcrModel
|
26
|
+
if isinstance(pipeline_options.ocr_options, EasyOcrOptions):
|
27
|
+
ocr_model = EasyOcrModel(
|
28
|
+
enabled=pipeline_options.do_ocr,
|
29
|
+
options=pipeline_options.ocr_options,
|
30
|
+
)
|
31
|
+
elif isinstance(pipeline_options.ocr_options, TesseractCliOcrOptions):
|
32
|
+
ocr_model = TesseractOcrCliModel(
|
33
|
+
enabled=pipeline_options.do_ocr,
|
34
|
+
options=pipeline_options.ocr_options,
|
35
|
+
)
|
36
|
+
elif isinstance(pipeline_options.ocr_options, TesseractOcrOptions):
|
37
|
+
ocr_model = TesseractOcrModel(
|
38
|
+
enabled=pipeline_options.do_ocr,
|
39
|
+
options=pipeline_options.ocr_options,
|
40
|
+
)
|
41
|
+
else:
|
42
|
+
raise RuntimeError(
|
43
|
+
f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
|
44
|
+
)
|
45
|
+
|
17
46
|
self.model_pipe = [
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
"enabled": pipeline_options.do_ocr,
|
22
|
-
}
|
23
|
-
),
|
47
|
+
# OCR
|
48
|
+
ocr_model,
|
49
|
+
# Layout
|
24
50
|
LayoutModel(
|
25
51
|
config={
|
26
52
|
"artifacts_path": artifacts_path
|
27
53
|
/ StandardModelPipeline._layout_model_path
|
28
54
|
}
|
29
55
|
),
|
56
|
+
# Table structure
|
30
57
|
TableStructureModel(
|
31
58
|
config={
|
32
59
|
"artifacts_path": artifacts_path
|
33
60
|
/ StandardModelPipeline._table_model_path,
|
34
61
|
"enabled": pipeline_options.do_table_structure,
|
62
|
+
"mode": pipeline_options.table_structure_options.mode,
|
35
63
|
"do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
|
36
64
|
}
|
37
65
|
),
|
docling/utils/export.py
ADDED
@@ -0,0 +1,145 @@
|
|
1
|
+
import logging
|
2
|
+
from typing import Any, Dict, Iterable, List, Tuple, Union
|
3
|
+
|
4
|
+
from docling_core.types.doc.base import BaseCell, BaseText, Ref, Table, TableCell
|
5
|
+
|
6
|
+
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell
|
7
|
+
from docling.datamodel.document import ConversionResult, Page
|
8
|
+
|
9
|
+
_log = logging.getLogger(__name__)
|
10
|
+
|
11
|
+
|
12
|
+
def generate_multimodal_pages(
|
13
|
+
doc_result: ConversionResult,
|
14
|
+
) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:
|
15
|
+
|
16
|
+
label_to_doclaynet = {
|
17
|
+
"title": "title",
|
18
|
+
"table-of-contents": "document_index",
|
19
|
+
"subtitle-level-1": "section_header",
|
20
|
+
"checkbox-selected": "checkbox_selected",
|
21
|
+
"checkbox-unselected": "checkbox_unselected",
|
22
|
+
"caption": "caption",
|
23
|
+
"page-header": "page_header",
|
24
|
+
"page-footer": "page_footer",
|
25
|
+
"footnote": "footnote",
|
26
|
+
"table": "table",
|
27
|
+
"formula": "formula",
|
28
|
+
"list-item": "list_item",
|
29
|
+
"code": "code",
|
30
|
+
"figure": "picture",
|
31
|
+
"picture": "picture",
|
32
|
+
"reference": "text",
|
33
|
+
"paragraph": "text",
|
34
|
+
"text": "text",
|
35
|
+
}
|
36
|
+
|
37
|
+
content_text = ""
|
38
|
+
page_no = 0
|
39
|
+
start_ix = 0
|
40
|
+
end_ix = 0
|
41
|
+
doc_items: List[Tuple[int, Union[BaseCell, BaseText]]] = []
|
42
|
+
|
43
|
+
doc = doc_result.output
|
44
|
+
|
45
|
+
def _process_page_segments(doc_items: list[Tuple[int, BaseCell]], page: Page):
|
46
|
+
segments = []
|
47
|
+
|
48
|
+
for ix, item in doc_items:
|
49
|
+
item_type = item.obj_type
|
50
|
+
label = label_to_doclaynet.get(item_type, None)
|
51
|
+
|
52
|
+
if label is None or item.prov is None or page.size is None:
|
53
|
+
continue
|
54
|
+
|
55
|
+
bbox = BoundingBox.from_tuple(
|
56
|
+
tuple(item.prov[0].bbox), origin=CoordOrigin.BOTTOMLEFT
|
57
|
+
)
|
58
|
+
new_bbox = bbox.to_top_left_origin(page_height=page.size.height).normalized(
|
59
|
+
page_size=page.size
|
60
|
+
)
|
61
|
+
|
62
|
+
new_segment = {
|
63
|
+
"index_in_doc": ix,
|
64
|
+
"label": label,
|
65
|
+
"text": item.text if item.text is not None else "",
|
66
|
+
"bbox": new_bbox.as_tuple(),
|
67
|
+
"data": [],
|
68
|
+
}
|
69
|
+
|
70
|
+
if isinstance(item, Table):
|
71
|
+
table_html = item.export_to_html()
|
72
|
+
new_segment["data"].append(
|
73
|
+
{
|
74
|
+
"html_seq": table_html,
|
75
|
+
"otsl_seq": "",
|
76
|
+
}
|
77
|
+
)
|
78
|
+
|
79
|
+
segments.append(new_segment)
|
80
|
+
|
81
|
+
return segments
|
82
|
+
|
83
|
+
def _process_page_cells(page: Page):
|
84
|
+
cells: List[dict] = []
|
85
|
+
if page.size is None:
|
86
|
+
return cells
|
87
|
+
for cell in page.cells:
|
88
|
+
new_bbox = cell.bbox.to_top_left_origin(
|
89
|
+
page_height=page.size.height
|
90
|
+
).normalized(page_size=page.size)
|
91
|
+
is_ocr = isinstance(cell, OcrCell)
|
92
|
+
ocr_confidence = cell.confidence if isinstance(cell, OcrCell) else 1.0
|
93
|
+
cells.append(
|
94
|
+
{
|
95
|
+
"text": cell.text,
|
96
|
+
"bbox": new_bbox.as_tuple(),
|
97
|
+
"ocr": is_ocr,
|
98
|
+
"ocr_confidence": ocr_confidence,
|
99
|
+
}
|
100
|
+
)
|
101
|
+
return cells
|
102
|
+
|
103
|
+
def _process_page():
|
104
|
+
page_ix = page_no - 1
|
105
|
+
page = doc_result.pages[page_ix]
|
106
|
+
|
107
|
+
page_cells = _process_page_cells(page=page)
|
108
|
+
page_segments = _process_page_segments(doc_items=doc_items, page=page)
|
109
|
+
content_md = doc.export_to_markdown(
|
110
|
+
main_text_start=start_ix, main_text_stop=end_ix
|
111
|
+
)
|
112
|
+
# No page-tagging since we only do 1 page at the time
|
113
|
+
content_dt = doc.export_to_document_tokens(
|
114
|
+
main_text_start=start_ix, main_text_stop=end_ix, add_page_index=False
|
115
|
+
)
|
116
|
+
|
117
|
+
return content_text, content_md, content_dt, page_cells, page_segments, page
|
118
|
+
|
119
|
+
if doc.main_text is None:
|
120
|
+
return
|
121
|
+
for ix, orig_item in enumerate(doc.main_text):
|
122
|
+
|
123
|
+
item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item
|
124
|
+
if item is None or item.prov is None or len(item.prov) == 0:
|
125
|
+
_log.debug(f"Skipping item {orig_item}")
|
126
|
+
continue
|
127
|
+
|
128
|
+
item_page = item.prov[0].page
|
129
|
+
|
130
|
+
# Page is complete
|
131
|
+
if page_no > 0 and item_page > page_no:
|
132
|
+
yield _process_page()
|
133
|
+
|
134
|
+
start_ix = ix
|
135
|
+
doc_items = []
|
136
|
+
content_text = ""
|
137
|
+
|
138
|
+
page_no = item_page
|
139
|
+
end_ix = ix
|
140
|
+
doc_items.append((ix, item))
|
141
|
+
if item.text is not None and item.text != "":
|
142
|
+
content_text += item.text + " "
|
143
|
+
|
144
|
+
if len(doc_items) > 0:
|
145
|
+
yield _process_page()
|