docling 1.19.1__py3-none-any.whl → 2.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/abstract_backend.py +33 -37
- docling/backend/asciidoc_backend.py +431 -0
- docling/backend/docling_parse_backend.py +20 -16
- docling/backend/docling_parse_v2_backend.py +248 -0
- docling/backend/html_backend.py +429 -0
- docling/backend/md_backend.py +346 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +496 -0
- docling/backend/pdf_backend.py +78 -0
- docling/backend/pypdfium2_backend.py +16 -11
- docling/cli/main.py +96 -65
- docling/datamodel/base_models.py +79 -193
- docling/datamodel/document.py +405 -320
- docling/datamodel/pipeline_options.py +19 -3
- docling/datamodel/settings.py +16 -1
- docling/document_converter.py +240 -251
- docling/models/base_model.py +28 -0
- docling/models/base_ocr_model.py +40 -10
- docling/models/ds_glm_model.py +244 -30
- docling/models/easyocr_model.py +57 -42
- docling/models/layout_model.py +158 -116
- docling/models/page_assemble_model.py +127 -101
- docling/models/page_preprocessing_model.py +79 -0
- docling/models/table_structure_model.py +162 -116
- docling/models/tesseract_ocr_cli_model.py +76 -59
- docling/models/tesseract_ocr_model.py +90 -58
- docling/pipeline/base_pipeline.py +189 -0
- docling/pipeline/simple_pipeline.py +56 -0
- docling/pipeline/standard_pdf_pipeline.py +201 -0
- docling/utils/export.py +4 -3
- docling/utils/layout_utils.py +17 -11
- docling/utils/profiling.py +62 -0
- docling-2.4.1.dist-info/METADATA +154 -0
- docling-2.4.1.dist-info/RECORD +45 -0
- docling/pipeline/base_model_pipeline.py +0 -18
- docling/pipeline/standard_model_pipeline.py +0 -66
- docling-1.19.1.dist-info/METADATA +0 -380
- docling-1.19.1.dist-info/RECORD +0 -34
- {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/LICENSE +0 -0
- {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/WHEEL +0 -0
- {docling-1.19.1.dist-info → docling-2.4.1.dist-info}/entry_points.txt +0 -0
docling/cli/main.py
CHANGED
@@ -5,22 +5,31 @@ import time
|
|
5
5
|
import warnings
|
6
6
|
from enum import Enum
|
7
7
|
from pathlib import Path
|
8
|
-
from typing import Annotated, Iterable, List, Optional
|
8
|
+
from typing import Annotated, Dict, Iterable, List, Optional, Type
|
9
9
|
|
10
10
|
import typer
|
11
11
|
from docling_core.utils.file import resolve_file_source
|
12
12
|
|
13
13
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
14
|
+
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
15
|
+
from docling.backend.pdf_backend import PdfDocumentBackend
|
14
16
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
15
|
-
from docling.datamodel.base_models import
|
16
|
-
|
17
|
+
from docling.datamodel.base_models import (
|
18
|
+
ConversionStatus,
|
19
|
+
FormatToExtensions,
|
20
|
+
InputFormat,
|
21
|
+
OutputFormat,
|
22
|
+
)
|
23
|
+
from docling.datamodel.document import ConversionResult
|
17
24
|
from docling.datamodel.pipeline_options import (
|
18
25
|
EasyOcrOptions,
|
19
|
-
|
26
|
+
OcrOptions,
|
27
|
+
PdfPipelineOptions,
|
28
|
+
TableFormerMode,
|
20
29
|
TesseractCliOcrOptions,
|
21
30
|
TesseractOcrOptions,
|
22
31
|
)
|
23
|
-
from docling.document_converter import DocumentConverter
|
32
|
+
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
24
33
|
|
25
34
|
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
26
35
|
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
@@ -53,9 +62,10 @@ def version_callback(value: bool):
|
|
53
62
|
|
54
63
|
|
55
64
|
# Define an enum for the backend options
|
56
|
-
class
|
65
|
+
class PdfBackend(str, Enum):
|
57
66
|
PYPDFIUM2 = "pypdfium2"
|
58
|
-
|
67
|
+
DLPARSE_V1 = "dlparse_v1"
|
68
|
+
DLPARSE_V2 = "dlparse_v2"
|
59
69
|
|
60
70
|
|
61
71
|
# Define an enum for the ocr engines
|
@@ -85,30 +95,30 @@ def export_documents(
|
|
85
95
|
# Export Deep Search document JSON format:
|
86
96
|
if export_json:
|
87
97
|
fname = output_dir / f"{doc_filename}.json"
|
88
|
-
with fname.open("w") as fp:
|
98
|
+
with fname.open("w", encoding="utf8") as fp:
|
89
99
|
_log.info(f"writing JSON output to {fname}")
|
90
|
-
fp.write(json.dumps(conv_res.
|
100
|
+
fp.write(json.dumps(conv_res.document.export_to_dict()))
|
91
101
|
|
92
102
|
# Export Text format:
|
93
103
|
if export_txt:
|
94
104
|
fname = output_dir / f"{doc_filename}.txt"
|
95
|
-
with fname.open("w") as fp:
|
105
|
+
with fname.open("w", encoding="utf8") as fp:
|
96
106
|
_log.info(f"writing Text output to {fname}")
|
97
|
-
fp.write(conv_res.
|
107
|
+
fp.write(conv_res.document.export_to_markdown(strict_text=True))
|
98
108
|
|
99
109
|
# Export Markdown format:
|
100
110
|
if export_md:
|
101
111
|
fname = output_dir / f"{doc_filename}.md"
|
102
|
-
with fname.open("w") as fp:
|
112
|
+
with fname.open("w", encoding="utf8") as fp:
|
103
113
|
_log.info(f"writing Markdown output to {fname}")
|
104
|
-
fp.write(conv_res.
|
114
|
+
fp.write(conv_res.document.export_to_markdown())
|
105
115
|
|
106
116
|
# Export Document Tags format:
|
107
117
|
if export_doctags:
|
108
118
|
fname = output_dir / f"{doc_filename}.doctags"
|
109
|
-
with fname.open("w") as fp:
|
119
|
+
with fname.open("w", encoding="utf8") as fp:
|
110
120
|
_log.info(f"writing Doc Tags output to {fname}")
|
111
|
-
fp.write(conv_res.
|
121
|
+
fp.write(conv_res.document.export_to_document_tokens())
|
112
122
|
|
113
123
|
else:
|
114
124
|
_log.warning(f"Document {conv_res.input.file} failed to convert.")
|
@@ -129,44 +139,42 @@ def convert(
|
|
129
139
|
help="PDF files to convert. Can be local file / directory paths or URL.",
|
130
140
|
),
|
131
141
|
],
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
] =
|
138
|
-
|
139
|
-
|
140
|
-
typer.Option(
|
141
|
-
..., "--md/--no-md", help="If enabled the document is exported as Markdown."
|
142
|
-
),
|
143
|
-
] = True,
|
144
|
-
export_txt: Annotated[
|
145
|
-
bool,
|
146
|
-
typer.Option(
|
147
|
-
..., "--txt/--no-txt", help="If enabled the document is exported as Text."
|
148
|
-
),
|
149
|
-
] = False,
|
150
|
-
export_doctags: Annotated[
|
151
|
-
bool,
|
152
|
-
typer.Option(
|
153
|
-
...,
|
154
|
-
"--doctags/--no-doctags",
|
155
|
-
help="If enabled the document is exported as Doc Tags.",
|
156
|
-
),
|
157
|
-
] = False,
|
142
|
+
from_formats: List[InputFormat] = typer.Option(
|
143
|
+
None,
|
144
|
+
"--from",
|
145
|
+
help="Specify input formats to convert from. Defaults to all formats.",
|
146
|
+
),
|
147
|
+
to_formats: List[OutputFormat] = typer.Option(
|
148
|
+
None, "--to", help="Specify output formats. Defaults to Markdown."
|
149
|
+
),
|
158
150
|
ocr: Annotated[
|
159
151
|
bool,
|
160
152
|
typer.Option(
|
161
153
|
..., help="If enabled, the bitmap content will be processed using OCR."
|
162
154
|
),
|
163
155
|
] = True,
|
164
|
-
backend: Annotated[
|
165
|
-
Backend, typer.Option(..., help="The PDF backend to use.")
|
166
|
-
] = Backend.DOCLING,
|
167
156
|
ocr_engine: Annotated[
|
168
157
|
OcrEngine, typer.Option(..., help="The OCR engine to use.")
|
169
158
|
] = OcrEngine.EASYOCR,
|
159
|
+
pdf_backend: Annotated[
|
160
|
+
PdfBackend, typer.Option(..., help="The PDF backend to use.")
|
161
|
+
] = PdfBackend.DLPARSE_V1,
|
162
|
+
table_mode: Annotated[
|
163
|
+
TableFormerMode,
|
164
|
+
typer.Option(..., help="The mode to use in the table structure model."),
|
165
|
+
] = TableFormerMode.FAST,
|
166
|
+
artifacts_path: Annotated[
|
167
|
+
Optional[Path],
|
168
|
+
typer.Option(..., help="If provided, the location of the model artifacts."),
|
169
|
+
] = None,
|
170
|
+
abort_on_error: Annotated[
|
171
|
+
bool,
|
172
|
+
typer.Option(
|
173
|
+
...,
|
174
|
+
"--abort-on-error/--no-abort-on-error",
|
175
|
+
help="If enabled, the bitmap content will be processed using OCR.",
|
176
|
+
),
|
177
|
+
] = False,
|
170
178
|
output: Annotated[
|
171
179
|
Path, typer.Option(..., help="Output directory where results are saved.")
|
172
180
|
] = Path("."),
|
@@ -182,6 +190,9 @@ def convert(
|
|
182
190
|
):
|
183
191
|
logging.basicConfig(level=logging.INFO)
|
184
192
|
|
193
|
+
if from_formats is None:
|
194
|
+
from_formats = [e for e in InputFormat]
|
195
|
+
|
185
196
|
input_doc_paths: List[Path] = []
|
186
197
|
for src in input_sources:
|
187
198
|
source = resolve_file_source(source=src)
|
@@ -191,48 +202,68 @@ def convert(
|
|
191
202
|
)
|
192
203
|
raise typer.Abort()
|
193
204
|
elif source.is_dir():
|
194
|
-
|
195
|
-
|
205
|
+
for fmt in from_formats:
|
206
|
+
for ext in FormatToExtensions[fmt]:
|
207
|
+
input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
|
208
|
+
input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
|
196
209
|
else:
|
197
210
|
input_doc_paths.append(source)
|
198
211
|
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
case _:
|
207
|
-
raise RuntimeError(f"Unexpected backend type {backend}")
|
212
|
+
if to_formats is None:
|
213
|
+
to_formats = [OutputFormat.MARKDOWN]
|
214
|
+
|
215
|
+
export_json = OutputFormat.JSON in to_formats
|
216
|
+
export_md = OutputFormat.MARKDOWN in to_formats
|
217
|
+
export_txt = OutputFormat.TEXT in to_formats
|
218
|
+
export_doctags = OutputFormat.DOCTAGS in to_formats
|
208
219
|
|
209
220
|
match ocr_engine:
|
210
221
|
case OcrEngine.EASYOCR:
|
211
|
-
ocr_options = EasyOcrOptions()
|
222
|
+
ocr_options: OcrOptions = EasyOcrOptions()
|
212
223
|
case OcrEngine.TESSERACT_CLI:
|
213
224
|
ocr_options = TesseractCliOcrOptions()
|
214
225
|
case OcrEngine.TESSERACT:
|
215
226
|
ocr_options = TesseractOcrOptions()
|
216
227
|
case _:
|
217
|
-
raise RuntimeError(f"Unexpected
|
228
|
+
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
|
218
229
|
|
219
|
-
pipeline_options =
|
230
|
+
pipeline_options = PdfPipelineOptions(
|
220
231
|
do_ocr=ocr,
|
221
232
|
ocr_options=ocr_options,
|
222
233
|
do_table_structure=True,
|
223
234
|
)
|
224
|
-
pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
|
235
|
+
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
|
236
|
+
pipeline_options.table_structure_options.mode = table_mode
|
237
|
+
|
238
|
+
if artifacts_path is not None:
|
239
|
+
pipeline_options.artifacts_path = artifacts_path
|
240
|
+
|
241
|
+
match pdf_backend:
|
242
|
+
case PdfBackend.DLPARSE_V1:
|
243
|
+
backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
|
244
|
+
case PdfBackend.DLPARSE_V2:
|
245
|
+
backend = DoclingParseV2DocumentBackend
|
246
|
+
case PdfBackend.PYPDFIUM2:
|
247
|
+
backend = PyPdfiumDocumentBackend
|
248
|
+
case _:
|
249
|
+
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
|
250
|
+
|
251
|
+
format_options: Dict[InputFormat, FormatOption] = {
|
252
|
+
InputFormat.PDF: PdfFormatOption(
|
253
|
+
pipeline_options=pipeline_options,
|
254
|
+
backend=backend, # pdf_backend
|
255
|
+
)
|
256
|
+
}
|
225
257
|
doc_converter = DocumentConverter(
|
226
|
-
|
227
|
-
|
258
|
+
allowed_formats=from_formats,
|
259
|
+
format_options=format_options,
|
228
260
|
)
|
229
261
|
|
230
|
-
# Define input files
|
231
|
-
input = DocumentConversionInput.from_paths(input_doc_paths)
|
232
|
-
|
233
262
|
start_time = time.time()
|
234
263
|
|
235
|
-
conv_results = doc_converter.
|
264
|
+
conv_results = doc_converter.convert_all(
|
265
|
+
input_doc_paths, raises_on_error=abort_on_error
|
266
|
+
)
|
236
267
|
|
237
268
|
output.mkdir(parents=True, exist_ok=True)
|
238
269
|
export_documents(
|
docling/datamodel/base_models.py
CHANGED
@@ -1,18 +1,19 @@
|
|
1
|
-
import copy
|
2
|
-
import warnings
|
3
1
|
from enum import Enum, auto
|
4
2
|
from io import BytesIO
|
5
|
-
from typing import
|
6
|
-
|
3
|
+
from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
4
|
+
|
5
|
+
from docling_core.types.doc import (
|
6
|
+
BoundingBox,
|
7
|
+
DocItemLabel,
|
8
|
+
PictureDataType,
|
9
|
+
Size,
|
10
|
+
TableCell,
|
11
|
+
)
|
7
12
|
from PIL.Image import Image
|
8
|
-
from pydantic import BaseModel, ConfigDict
|
9
|
-
from typing_extensions import Self
|
13
|
+
from pydantic import BaseModel, ConfigDict
|
10
14
|
|
11
|
-
|
12
|
-
from docling.
|
13
|
-
PipelineOptions,
|
14
|
-
TableStructureOptions,
|
15
|
-
)
|
15
|
+
if TYPE_CHECKING:
|
16
|
+
from docling.backend.pdf_backend import PdfPageBackend
|
16
17
|
|
17
18
|
|
18
19
|
class ConversionStatus(str, Enum):
|
@@ -23,18 +24,67 @@ class ConversionStatus(str, Enum):
|
|
23
24
|
PARTIAL_SUCCESS = auto()
|
24
25
|
|
25
26
|
|
27
|
+
class InputFormat(str, Enum):
|
28
|
+
DOCX = "docx"
|
29
|
+
PPTX = "pptx"
|
30
|
+
HTML = "html"
|
31
|
+
IMAGE = "image"
|
32
|
+
PDF = "pdf"
|
33
|
+
ASCIIDOC = "asciidoc"
|
34
|
+
MD = "md"
|
35
|
+
|
36
|
+
|
37
|
+
class OutputFormat(str, Enum):
|
38
|
+
MARKDOWN = "md"
|
39
|
+
JSON = "json"
|
40
|
+
TEXT = "text"
|
41
|
+
DOCTAGS = "doctags"
|
42
|
+
|
43
|
+
|
44
|
+
FormatToExtensions: Dict[InputFormat, List[str]] = {
|
45
|
+
InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
|
46
|
+
InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
|
47
|
+
InputFormat.PDF: ["pdf"],
|
48
|
+
InputFormat.MD: ["md"],
|
49
|
+
InputFormat.HTML: ["html", "htm", "xhtml"],
|
50
|
+
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
51
|
+
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
|
52
|
+
}
|
53
|
+
|
54
|
+
FormatToMimeType: Dict[InputFormat, List[str]] = {
|
55
|
+
InputFormat.DOCX: [
|
56
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
57
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.template",
|
58
|
+
],
|
59
|
+
InputFormat.PPTX: [
|
60
|
+
"application/vnd.openxmlformats-officedocument.presentationml.template",
|
61
|
+
"application/vnd.openxmlformats-officedocument.presentationml.slideshow",
|
62
|
+
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
63
|
+
],
|
64
|
+
InputFormat.HTML: ["text/html", "application/xhtml+xml"],
|
65
|
+
InputFormat.IMAGE: [
|
66
|
+
"image/png",
|
67
|
+
"image/jpeg",
|
68
|
+
"image/tiff",
|
69
|
+
"image/gif",
|
70
|
+
"image/bmp",
|
71
|
+
],
|
72
|
+
InputFormat.PDF: ["application/pdf"],
|
73
|
+
InputFormat.ASCIIDOC: ["text/asciidoc"],
|
74
|
+
InputFormat.MD: ["text/markdown", "text/x-markdown"],
|
75
|
+
}
|
76
|
+
MimeTypeToFormat = {
|
77
|
+
mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
|
78
|
+
}
|
79
|
+
|
80
|
+
|
26
81
|
class DocInputType(str, Enum):
|
27
82
|
PATH = auto()
|
28
83
|
STREAM = auto()
|
29
84
|
|
30
85
|
|
31
|
-
class CoordOrigin(str, Enum):
|
32
|
-
TOPLEFT = auto()
|
33
|
-
BOTTOMLEFT = auto()
|
34
|
-
|
35
|
-
|
36
86
|
class DoclingComponentType(str, Enum):
|
37
|
-
|
87
|
+
DOCUMENT_BACKEND = auto()
|
38
88
|
MODEL = auto()
|
39
89
|
DOC_ASSEMBLER = auto()
|
40
90
|
|
@@ -45,118 +95,6 @@ class ErrorItem(BaseModel):
|
|
45
95
|
error_message: str
|
46
96
|
|
47
97
|
|
48
|
-
class PageSize(BaseModel):
|
49
|
-
width: float = 0.0
|
50
|
-
height: float = 0.0
|
51
|
-
|
52
|
-
|
53
|
-
class BoundingBox(BaseModel):
|
54
|
-
l: float # left
|
55
|
-
t: float # top
|
56
|
-
r: float # right
|
57
|
-
b: float # bottom
|
58
|
-
|
59
|
-
coord_origin: CoordOrigin = CoordOrigin.TOPLEFT
|
60
|
-
|
61
|
-
@property
|
62
|
-
def width(self):
|
63
|
-
return self.r - self.l
|
64
|
-
|
65
|
-
@property
|
66
|
-
def height(self):
|
67
|
-
return abs(self.t - self.b)
|
68
|
-
|
69
|
-
def scaled(self, scale: float) -> "BoundingBox":
|
70
|
-
out_bbox = copy.deepcopy(self)
|
71
|
-
out_bbox.l *= scale
|
72
|
-
out_bbox.r *= scale
|
73
|
-
out_bbox.t *= scale
|
74
|
-
out_bbox.b *= scale
|
75
|
-
|
76
|
-
return out_bbox
|
77
|
-
|
78
|
-
def normalized(self, page_size: PageSize) -> "BoundingBox":
|
79
|
-
out_bbox = copy.deepcopy(self)
|
80
|
-
out_bbox.l /= page_size.width
|
81
|
-
out_bbox.r /= page_size.width
|
82
|
-
out_bbox.t /= page_size.height
|
83
|
-
out_bbox.b /= page_size.height
|
84
|
-
|
85
|
-
return out_bbox
|
86
|
-
|
87
|
-
def as_tuple(self):
|
88
|
-
if self.coord_origin == CoordOrigin.TOPLEFT:
|
89
|
-
return (self.l, self.t, self.r, self.b)
|
90
|
-
elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
|
91
|
-
return (self.l, self.b, self.r, self.t)
|
92
|
-
|
93
|
-
@classmethod
|
94
|
-
def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
|
95
|
-
if origin == CoordOrigin.TOPLEFT:
|
96
|
-
l, t, r, b = coord[0], coord[1], coord[2], coord[3]
|
97
|
-
if r < l:
|
98
|
-
l, r = r, l
|
99
|
-
if b < t:
|
100
|
-
b, t = t, b
|
101
|
-
|
102
|
-
return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
|
103
|
-
elif origin == CoordOrigin.BOTTOMLEFT:
|
104
|
-
l, b, r, t = coord[0], coord[1], coord[2], coord[3]
|
105
|
-
if r < l:
|
106
|
-
l, r = r, l
|
107
|
-
if b > t:
|
108
|
-
b, t = t, b
|
109
|
-
|
110
|
-
return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
|
111
|
-
|
112
|
-
def area(self) -> float:
|
113
|
-
area = (self.r - self.l) * (self.b - self.t)
|
114
|
-
if self.coord_origin == CoordOrigin.BOTTOMLEFT:
|
115
|
-
area = -area
|
116
|
-
return area
|
117
|
-
|
118
|
-
def intersection_area_with(self, other: "BoundingBox") -> float:
|
119
|
-
# Calculate intersection coordinates
|
120
|
-
left = max(self.l, other.l)
|
121
|
-
top = max(self.t, other.t)
|
122
|
-
right = min(self.r, other.r)
|
123
|
-
bottom = min(self.b, other.b)
|
124
|
-
|
125
|
-
# Calculate intersection dimensions
|
126
|
-
width = right - left
|
127
|
-
height = bottom - top
|
128
|
-
|
129
|
-
# If the bounding boxes do not overlap, width or height will be negative
|
130
|
-
if width <= 0 or height <= 0:
|
131
|
-
return 0.0
|
132
|
-
|
133
|
-
return width * height
|
134
|
-
|
135
|
-
def to_bottom_left_origin(self, page_height) -> "BoundingBox":
|
136
|
-
if self.coord_origin == CoordOrigin.BOTTOMLEFT:
|
137
|
-
return self
|
138
|
-
elif self.coord_origin == CoordOrigin.TOPLEFT:
|
139
|
-
return BoundingBox(
|
140
|
-
l=self.l,
|
141
|
-
r=self.r,
|
142
|
-
t=page_height - self.t,
|
143
|
-
b=page_height - self.b,
|
144
|
-
coord_origin=CoordOrigin.BOTTOMLEFT,
|
145
|
-
)
|
146
|
-
|
147
|
-
def to_top_left_origin(self, page_height):
|
148
|
-
if self.coord_origin == CoordOrigin.TOPLEFT:
|
149
|
-
return self
|
150
|
-
elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
|
151
|
-
return BoundingBox(
|
152
|
-
l=self.l,
|
153
|
-
r=self.r,
|
154
|
-
t=page_height - self.t, # self.b
|
155
|
-
b=page_height - self.b, # self.t
|
156
|
-
coord_origin=CoordOrigin.TOPLEFT,
|
157
|
-
)
|
158
|
-
|
159
|
-
|
160
98
|
class Cell(BaseModel):
|
161
99
|
id: int
|
162
100
|
text: str
|
@@ -169,14 +107,14 @@ class OcrCell(Cell):
|
|
169
107
|
|
170
108
|
class Cluster(BaseModel):
|
171
109
|
id: int
|
172
|
-
label:
|
110
|
+
label: DocItemLabel
|
173
111
|
bbox: BoundingBox
|
174
112
|
confidence: float = 1.0
|
175
113
|
cells: List[Cell] = []
|
176
114
|
|
177
115
|
|
178
116
|
class BasePageElement(BaseModel):
|
179
|
-
label:
|
117
|
+
label: DocItemLabel
|
180
118
|
id: int
|
181
119
|
page_no: int
|
182
120
|
cluster: Cluster
|
@@ -187,37 +125,7 @@ class LayoutPrediction(BaseModel):
|
|
187
125
|
clusters: List[Cluster] = []
|
188
126
|
|
189
127
|
|
190
|
-
class
|
191
|
-
bbox: BoundingBox
|
192
|
-
row_span: int
|
193
|
-
col_span: int
|
194
|
-
start_row_offset_idx: int
|
195
|
-
end_row_offset_idx: int
|
196
|
-
start_col_offset_idx: int
|
197
|
-
end_col_offset_idx: int
|
198
|
-
text: str
|
199
|
-
column_header: bool = False
|
200
|
-
row_header: bool = False
|
201
|
-
row_section: bool = False
|
202
|
-
|
203
|
-
@model_validator(mode="before")
|
204
|
-
@classmethod
|
205
|
-
def from_dict_format(cls, data: Any) -> Any:
|
206
|
-
if isinstance(data, Dict):
|
207
|
-
text = data["bbox"].get("token", "")
|
208
|
-
if not len(text):
|
209
|
-
text_cells = data.pop("text_cell_bboxes", None)
|
210
|
-
if text_cells:
|
211
|
-
for el in text_cells:
|
212
|
-
text += el["token"] + " "
|
213
|
-
|
214
|
-
text = text.strip()
|
215
|
-
data["text"] = text
|
216
|
-
|
217
|
-
return data
|
218
|
-
|
219
|
-
|
220
|
-
class TableElement(BasePageElement):
|
128
|
+
class Table(BasePageElement):
|
221
129
|
otsl_seq: List[str]
|
222
130
|
num_rows: int = 0
|
223
131
|
num_cols: int = 0
|
@@ -225,18 +133,15 @@ class TableElement(BasePageElement):
|
|
225
133
|
|
226
134
|
|
227
135
|
class TableStructurePrediction(BaseModel):
|
228
|
-
table_map: Dict[int,
|
229
|
-
|
230
|
-
|
231
|
-
class TextElement(BasePageElement): ...
|
136
|
+
table_map: Dict[int, Table] = {}
|
232
137
|
|
233
138
|
|
234
|
-
class
|
235
|
-
|
139
|
+
class TextElement(BasePageElement):
|
140
|
+
text: str
|
236
141
|
|
237
142
|
|
238
143
|
class FigureElement(BasePageElement):
|
239
|
-
|
144
|
+
annotations: List[PictureDataType] = []
|
240
145
|
provenance: Optional[str] = None
|
241
146
|
predicted_class: Optional[str] = None
|
242
147
|
confidence: Optional[float] = None
|
@@ -259,7 +164,7 @@ class PagePredictions(BaseModel):
|
|
259
164
|
equations_prediction: Optional[EquationPrediction] = None
|
260
165
|
|
261
166
|
|
262
|
-
PageElement = Union[TextElement,
|
167
|
+
PageElement = Union[TextElement, Table, FigureElement]
|
263
168
|
|
264
169
|
|
265
170
|
class AssembledUnit(BaseModel):
|
@@ -272,13 +177,13 @@ class Page(BaseModel):
|
|
272
177
|
model_config = ConfigDict(arbitrary_types_allowed=True)
|
273
178
|
|
274
179
|
page_no: int
|
275
|
-
page_hash: Optional[str] = None
|
276
|
-
size: Optional[
|
180
|
+
# page_hash: Optional[str] = None
|
181
|
+
size: Optional[Size] = None
|
277
182
|
cells: List[Cell] = []
|
278
183
|
predictions: PagePredictions = PagePredictions()
|
279
184
|
assembled: Optional[AssembledUnit] = None
|
280
185
|
|
281
|
-
_backend: Optional[PdfPageBackend] = (
|
186
|
+
_backend: Optional["PdfPageBackend"] = (
|
282
187
|
None # Internal PDF backend. By default it is cleared during assembling.
|
283
188
|
)
|
284
189
|
_default_image_scale: float = 1.0 # Default image scale for external usage.
|
@@ -301,24 +206,5 @@ class Page(BaseModel):
|
|
301
206
|
class DocumentStream(BaseModel):
|
302
207
|
model_config = ConfigDict(arbitrary_types_allowed=True)
|
303
208
|
|
304
|
-
|
209
|
+
name: str
|
305
210
|
stream: BytesIO
|
306
|
-
|
307
|
-
|
308
|
-
class AssembleOptions(BaseModel):
|
309
|
-
keep_page_images: Annotated[
|
310
|
-
bool,
|
311
|
-
Field(
|
312
|
-
deprecated="`keep_page_images` is depreacted, set the value of `images_scale` instead"
|
313
|
-
),
|
314
|
-
] = False # False: page images are removed in the assemble step
|
315
|
-
images_scale: Optional[float] = None # if set, the scale for generated images
|
316
|
-
|
317
|
-
@model_validator(mode="after")
|
318
|
-
def set_page_images_from_deprecated(self) -> Self:
|
319
|
-
with warnings.catch_warnings():
|
320
|
-
warnings.simplefilter("ignore", DeprecationWarning)
|
321
|
-
default_scale = 1.0
|
322
|
-
if self.keep_page_images and self.images_scale is None:
|
323
|
-
self.images_scale = default_scale
|
324
|
-
return self
|