docling 1.19.0__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/abstract_backend.py +32 -37
- docling/backend/docling_parse_backend.py +16 -12
- docling/backend/docling_parse_v2_backend.py +240 -0
- docling/backend/html_backend.py +425 -0
- docling/backend/mspowerpoint_backend.py +375 -0
- docling/backend/msword_backend.py +509 -0
- docling/backend/pdf_backend.py +78 -0
- docling/backend/pypdfium2_backend.py +15 -10
- docling/cli/main.py +61 -60
- docling/datamodel/base_models.py +73 -193
- docling/datamodel/document.py +379 -324
- docling/datamodel/pipeline_options.py +16 -0
- docling/datamodel/settings.py +1 -0
- docling/document_converter.py +215 -252
- docling/models/base_model.py +25 -0
- docling/models/base_ocr_model.py +19 -6
- docling/models/ds_glm_model.py +220 -22
- docling/models/easyocr_model.py +45 -40
- docling/models/layout_model.py +130 -114
- docling/models/page_assemble_model.py +119 -95
- docling/models/page_preprocessing_model.py +61 -0
- docling/models/table_structure_model.py +122 -111
- docling/models/tesseract_ocr_cli_model.py +65 -58
- docling/models/tesseract_ocr_model.py +58 -50
- docling/pipeline/base_pipeline.py +190 -0
- docling/pipeline/simple_pipeline.py +59 -0
- docling/pipeline/standard_pdf_pipeline.py +198 -0
- docling/utils/export.py +4 -3
- docling/utils/layout_utils.py +17 -11
- docling-2.1.0.dist-info/METADATA +149 -0
- docling-2.1.0.dist-info/RECORD +42 -0
- docling/pipeline/base_model_pipeline.py +0 -18
- docling/pipeline/standard_model_pipeline.py +0 -66
- docling-1.19.0.dist-info/METADATA +0 -380
- docling-1.19.0.dist-info/RECORD +0 -34
- {docling-1.19.0.dist-info → docling-2.1.0.dist-info}/LICENSE +0 -0
- {docling-1.19.0.dist-info → docling-2.1.0.dist-info}/WHEEL +0 -0
- {docling-1.19.0.dist-info → docling-2.1.0.dist-info}/entry_points.txt +0 -0
docling/cli/main.py
CHANGED
@@ -5,22 +5,27 @@ import time
|
|
5
5
|
import warnings
|
6
6
|
from enum import Enum
|
7
7
|
from pathlib import Path
|
8
|
-
from typing import Annotated, Iterable, List, Optional
|
8
|
+
from typing import Annotated, Dict, Iterable, List, Optional
|
9
9
|
|
10
10
|
import typer
|
11
11
|
from docling_core.utils.file import resolve_file_source
|
12
12
|
|
13
13
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
14
|
-
from docling.
|
15
|
-
|
16
|
-
|
14
|
+
from docling.datamodel.base_models import (
|
15
|
+
ConversionStatus,
|
16
|
+
FormatToExtensions,
|
17
|
+
InputFormat,
|
18
|
+
OutputFormat,
|
19
|
+
)
|
20
|
+
from docling.datamodel.document import ConversionResult
|
17
21
|
from docling.datamodel.pipeline_options import (
|
18
22
|
EasyOcrOptions,
|
19
|
-
|
23
|
+
OcrOptions,
|
24
|
+
PdfPipelineOptions,
|
20
25
|
TesseractCliOcrOptions,
|
21
26
|
TesseractOcrOptions,
|
22
27
|
)
|
23
|
-
from docling.document_converter import DocumentConverter
|
28
|
+
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
24
29
|
|
25
30
|
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
26
31
|
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
@@ -87,28 +92,28 @@ def export_documents(
|
|
87
92
|
fname = output_dir / f"{doc_filename}.json"
|
88
93
|
with fname.open("w") as fp:
|
89
94
|
_log.info(f"writing JSON output to {fname}")
|
90
|
-
fp.write(json.dumps(conv_res.
|
95
|
+
fp.write(json.dumps(conv_res.document.export_to_dict()))
|
91
96
|
|
92
97
|
# Export Text format:
|
93
98
|
if export_txt:
|
94
99
|
fname = output_dir / f"{doc_filename}.txt"
|
95
100
|
with fname.open("w") as fp:
|
96
101
|
_log.info(f"writing Text output to {fname}")
|
97
|
-
fp.write(conv_res.
|
102
|
+
fp.write(conv_res.document.export_to_markdown(strict_text=True))
|
98
103
|
|
99
104
|
# Export Markdown format:
|
100
105
|
if export_md:
|
101
106
|
fname = output_dir / f"{doc_filename}.md"
|
102
107
|
with fname.open("w") as fp:
|
103
108
|
_log.info(f"writing Markdown output to {fname}")
|
104
|
-
fp.write(conv_res.
|
109
|
+
fp.write(conv_res.document.export_to_markdown())
|
105
110
|
|
106
111
|
# Export Document Tags format:
|
107
112
|
if export_doctags:
|
108
113
|
fname = output_dir / f"{doc_filename}.doctags"
|
109
114
|
with fname.open("w") as fp:
|
110
115
|
_log.info(f"writing Doc Tags output to {fname}")
|
111
|
-
fp.write(conv_res.
|
116
|
+
fp.write(conv_res.document.export_to_document_tokens())
|
112
117
|
|
113
118
|
else:
|
114
119
|
_log.warning(f"Document {conv_res.input.file} failed to convert.")
|
@@ -129,44 +134,31 @@ def convert(
|
|
129
134
|
help="PDF files to convert. Can be local file / directory paths or URL.",
|
130
135
|
),
|
131
136
|
],
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
] =
|
138
|
-
|
139
|
-
|
140
|
-
typer.Option(
|
141
|
-
..., "--md/--no-md", help="If enabled the document is exported as Markdown."
|
142
|
-
),
|
143
|
-
] = True,
|
144
|
-
export_txt: Annotated[
|
145
|
-
bool,
|
146
|
-
typer.Option(
|
147
|
-
..., "--txt/--no-txt", help="If enabled the document is exported as Text."
|
148
|
-
),
|
149
|
-
] = False,
|
150
|
-
export_doctags: Annotated[
|
151
|
-
bool,
|
152
|
-
typer.Option(
|
153
|
-
...,
|
154
|
-
"--doctags/--no-doctags",
|
155
|
-
help="If enabled the document is exported as Doc Tags.",
|
156
|
-
),
|
157
|
-
] = False,
|
137
|
+
from_formats: List[InputFormat] = typer.Option(
|
138
|
+
None,
|
139
|
+
"--from",
|
140
|
+
help="Specify input formats to convert from. Defaults to all formats.",
|
141
|
+
),
|
142
|
+
to_formats: List[OutputFormat] = typer.Option(
|
143
|
+
None, "--to", help="Specify output formats. Defaults to Markdown."
|
144
|
+
),
|
158
145
|
ocr: Annotated[
|
159
146
|
bool,
|
160
147
|
typer.Option(
|
161
148
|
..., help="If enabled, the bitmap content will be processed using OCR."
|
162
149
|
),
|
163
150
|
] = True,
|
164
|
-
backend: Annotated[
|
165
|
-
Backend, typer.Option(..., help="The PDF backend to use.")
|
166
|
-
] = Backend.DOCLING,
|
167
151
|
ocr_engine: Annotated[
|
168
152
|
OcrEngine, typer.Option(..., help="The OCR engine to use.")
|
169
153
|
] = OcrEngine.EASYOCR,
|
154
|
+
abort_on_error: Annotated[
|
155
|
+
bool,
|
156
|
+
typer.Option(
|
157
|
+
...,
|
158
|
+
"--abort-on-error/--no-abort-on-error",
|
159
|
+
help="If enabled, the bitmap content will be processed using OCR.",
|
160
|
+
),
|
161
|
+
] = False,
|
170
162
|
output: Annotated[
|
171
163
|
Path, typer.Option(..., help="Output directory where results are saved.")
|
172
164
|
] = Path("."),
|
@@ -182,6 +174,9 @@ def convert(
|
|
182
174
|
):
|
183
175
|
logging.basicConfig(level=logging.INFO)
|
184
176
|
|
177
|
+
if from_formats is None:
|
178
|
+
from_formats = [e for e in InputFormat]
|
179
|
+
|
185
180
|
input_doc_paths: List[Path] = []
|
186
181
|
for src in input_sources:
|
187
182
|
source = resolve_file_source(source=src)
|
@@ -191,48 +186,54 @@ def convert(
|
|
191
186
|
)
|
192
187
|
raise typer.Abort()
|
193
188
|
elif source.is_dir():
|
194
|
-
|
195
|
-
|
189
|
+
for fmt in from_formats:
|
190
|
+
for ext in FormatToExtensions[fmt]:
|
191
|
+
input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
|
192
|
+
input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
|
196
193
|
else:
|
197
194
|
input_doc_paths.append(source)
|
198
195
|
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
case _:
|
207
|
-
raise RuntimeError(f"Unexpected backend type {backend}")
|
196
|
+
if to_formats is None:
|
197
|
+
to_formats = [OutputFormat.MARKDOWN]
|
198
|
+
|
199
|
+
export_json = OutputFormat.JSON in to_formats
|
200
|
+
export_md = OutputFormat.MARKDOWN in to_formats
|
201
|
+
export_txt = OutputFormat.TEXT in to_formats
|
202
|
+
export_doctags = OutputFormat.DOCTAGS in to_formats
|
208
203
|
|
209
204
|
match ocr_engine:
|
210
205
|
case OcrEngine.EASYOCR:
|
211
|
-
ocr_options = EasyOcrOptions()
|
206
|
+
ocr_options: OcrOptions = EasyOcrOptions()
|
212
207
|
case OcrEngine.TESSERACT_CLI:
|
213
208
|
ocr_options = TesseractCliOcrOptions()
|
214
209
|
case OcrEngine.TESSERACT:
|
215
210
|
ocr_options = TesseractOcrOptions()
|
216
211
|
case _:
|
217
|
-
raise RuntimeError(f"Unexpected
|
212
|
+
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
|
218
213
|
|
219
|
-
pipeline_options =
|
214
|
+
pipeline_options = PdfPipelineOptions(
|
220
215
|
do_ocr=ocr,
|
221
216
|
ocr_options=ocr_options,
|
222
217
|
do_table_structure=True,
|
223
218
|
)
|
224
|
-
pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
|
219
|
+
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
|
220
|
+
|
221
|
+
format_options: Dict[InputFormat, FormatOption] = {
|
222
|
+
InputFormat.PDF: PdfFormatOption(
|
223
|
+
pipeline_options=pipeline_options,
|
224
|
+
backend=DoclingParseDocumentBackend, # pdf_backend
|
225
|
+
)
|
226
|
+
}
|
225
227
|
doc_converter = DocumentConverter(
|
226
|
-
|
227
|
-
|
228
|
+
allowed_formats=from_formats,
|
229
|
+
format_options=format_options,
|
228
230
|
)
|
229
231
|
|
230
|
-
# Define input files
|
231
|
-
input = DocumentConversionInput.from_paths(input_doc_paths)
|
232
|
-
|
233
232
|
start_time = time.time()
|
234
233
|
|
235
|
-
conv_results = doc_converter.
|
234
|
+
conv_results = doc_converter.convert_all(
|
235
|
+
input_doc_paths, raises_on_error=abort_on_error
|
236
|
+
)
|
236
237
|
|
237
238
|
output.mkdir(parents=True, exist_ok=True)
|
238
239
|
export_documents(
|
docling/datamodel/base_models.py
CHANGED
@@ -1,18 +1,19 @@
|
|
1
|
-
import copy
|
2
|
-
import warnings
|
3
1
|
from enum import Enum, auto
|
4
2
|
from io import BytesIO
|
5
|
-
from typing import
|
6
|
-
|
3
|
+
from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union
|
4
|
+
|
5
|
+
from docling_core.types.doc import (
|
6
|
+
BoundingBox,
|
7
|
+
DocItemLabel,
|
8
|
+
PictureDataType,
|
9
|
+
Size,
|
10
|
+
TableCell,
|
11
|
+
)
|
7
12
|
from PIL.Image import Image
|
8
|
-
from pydantic import BaseModel, ConfigDict
|
9
|
-
from typing_extensions import Self
|
13
|
+
from pydantic import BaseModel, ConfigDict
|
10
14
|
|
11
|
-
|
12
|
-
from docling.
|
13
|
-
PipelineOptions,
|
14
|
-
TableStructureOptions,
|
15
|
-
)
|
15
|
+
if TYPE_CHECKING:
|
16
|
+
from docling.backend.pdf_backend import PdfPageBackend
|
16
17
|
|
17
18
|
|
18
19
|
class ConversionStatus(str, Enum):
|
@@ -23,18 +24,61 @@ class ConversionStatus(str, Enum):
|
|
23
24
|
PARTIAL_SUCCESS = auto()
|
24
25
|
|
25
26
|
|
27
|
+
class InputFormat(str, Enum):
|
28
|
+
DOCX = "docx"
|
29
|
+
PPTX = "pptx"
|
30
|
+
HTML = "html"
|
31
|
+
IMAGE = "image"
|
32
|
+
PDF = "pdf"
|
33
|
+
|
34
|
+
|
35
|
+
class OutputFormat(str, Enum):
|
36
|
+
MARKDOWN = "md"
|
37
|
+
JSON = "json"
|
38
|
+
TEXT = "text"
|
39
|
+
DOCTAGS = "doctags"
|
40
|
+
|
41
|
+
|
42
|
+
FormatToExtensions: Dict[InputFormat, List[str]] = {
|
43
|
+
InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
|
44
|
+
InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
|
45
|
+
InputFormat.PDF: ["pdf"],
|
46
|
+
InputFormat.HTML: ["html", "htm", "xhtml"],
|
47
|
+
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
48
|
+
}
|
49
|
+
|
50
|
+
FormatToMimeType: Dict[InputFormat, Set[str]] = {
|
51
|
+
InputFormat.DOCX: {
|
52
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
53
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.template",
|
54
|
+
},
|
55
|
+
InputFormat.PPTX: {
|
56
|
+
"application/vnd.openxmlformats-officedocument.presentationml.template",
|
57
|
+
"application/vnd.openxmlformats-officedocument.presentationml.slideshow",
|
58
|
+
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
59
|
+
},
|
60
|
+
InputFormat.HTML: {"text/html", "application/xhtml+xml"},
|
61
|
+
InputFormat.IMAGE: {
|
62
|
+
"image/png",
|
63
|
+
"image/jpeg",
|
64
|
+
"image/tiff",
|
65
|
+
"image/gif",
|
66
|
+
"image/bmp",
|
67
|
+
},
|
68
|
+
InputFormat.PDF: {"application/pdf"},
|
69
|
+
}
|
70
|
+
MimeTypeToFormat = {
|
71
|
+
mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
|
72
|
+
}
|
73
|
+
|
74
|
+
|
26
75
|
class DocInputType(str, Enum):
|
27
76
|
PATH = auto()
|
28
77
|
STREAM = auto()
|
29
78
|
|
30
79
|
|
31
|
-
class CoordOrigin(str, Enum):
|
32
|
-
TOPLEFT = auto()
|
33
|
-
BOTTOMLEFT = auto()
|
34
|
-
|
35
|
-
|
36
80
|
class DoclingComponentType(str, Enum):
|
37
|
-
|
81
|
+
DOCUMENT_BACKEND = auto()
|
38
82
|
MODEL = auto()
|
39
83
|
DOC_ASSEMBLER = auto()
|
40
84
|
|
@@ -45,118 +89,6 @@ class ErrorItem(BaseModel):
|
|
45
89
|
error_message: str
|
46
90
|
|
47
91
|
|
48
|
-
class PageSize(BaseModel):
|
49
|
-
width: float = 0.0
|
50
|
-
height: float = 0.0
|
51
|
-
|
52
|
-
|
53
|
-
class BoundingBox(BaseModel):
|
54
|
-
l: float # left
|
55
|
-
t: float # top
|
56
|
-
r: float # right
|
57
|
-
b: float # bottom
|
58
|
-
|
59
|
-
coord_origin: CoordOrigin = CoordOrigin.TOPLEFT
|
60
|
-
|
61
|
-
@property
|
62
|
-
def width(self):
|
63
|
-
return self.r - self.l
|
64
|
-
|
65
|
-
@property
|
66
|
-
def height(self):
|
67
|
-
return abs(self.t - self.b)
|
68
|
-
|
69
|
-
def scaled(self, scale: float) -> "BoundingBox":
|
70
|
-
out_bbox = copy.deepcopy(self)
|
71
|
-
out_bbox.l *= scale
|
72
|
-
out_bbox.r *= scale
|
73
|
-
out_bbox.t *= scale
|
74
|
-
out_bbox.b *= scale
|
75
|
-
|
76
|
-
return out_bbox
|
77
|
-
|
78
|
-
def normalized(self, page_size: PageSize) -> "BoundingBox":
|
79
|
-
out_bbox = copy.deepcopy(self)
|
80
|
-
out_bbox.l /= page_size.width
|
81
|
-
out_bbox.r /= page_size.width
|
82
|
-
out_bbox.t /= page_size.height
|
83
|
-
out_bbox.b /= page_size.height
|
84
|
-
|
85
|
-
return out_bbox
|
86
|
-
|
87
|
-
def as_tuple(self):
|
88
|
-
if self.coord_origin == CoordOrigin.TOPLEFT:
|
89
|
-
return (self.l, self.t, self.r, self.b)
|
90
|
-
elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
|
91
|
-
return (self.l, self.b, self.r, self.t)
|
92
|
-
|
93
|
-
@classmethod
|
94
|
-
def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
|
95
|
-
if origin == CoordOrigin.TOPLEFT:
|
96
|
-
l, t, r, b = coord[0], coord[1], coord[2], coord[3]
|
97
|
-
if r < l:
|
98
|
-
l, r = r, l
|
99
|
-
if b < t:
|
100
|
-
b, t = t, b
|
101
|
-
|
102
|
-
return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
|
103
|
-
elif origin == CoordOrigin.BOTTOMLEFT:
|
104
|
-
l, b, r, t = coord[0], coord[1], coord[2], coord[3]
|
105
|
-
if r < l:
|
106
|
-
l, r = r, l
|
107
|
-
if b > t:
|
108
|
-
b, t = t, b
|
109
|
-
|
110
|
-
return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
|
111
|
-
|
112
|
-
def area(self) -> float:
|
113
|
-
area = (self.r - self.l) * (self.b - self.t)
|
114
|
-
if self.coord_origin == CoordOrigin.BOTTOMLEFT:
|
115
|
-
area = -area
|
116
|
-
return area
|
117
|
-
|
118
|
-
def intersection_area_with(self, other: "BoundingBox") -> float:
|
119
|
-
# Calculate intersection coordinates
|
120
|
-
left = max(self.l, other.l)
|
121
|
-
top = max(self.t, other.t)
|
122
|
-
right = min(self.r, other.r)
|
123
|
-
bottom = min(self.b, other.b)
|
124
|
-
|
125
|
-
# Calculate intersection dimensions
|
126
|
-
width = right - left
|
127
|
-
height = bottom - top
|
128
|
-
|
129
|
-
# If the bounding boxes do not overlap, width or height will be negative
|
130
|
-
if width <= 0 or height <= 0:
|
131
|
-
return 0.0
|
132
|
-
|
133
|
-
return width * height
|
134
|
-
|
135
|
-
def to_bottom_left_origin(self, page_height) -> "BoundingBox":
|
136
|
-
if self.coord_origin == CoordOrigin.BOTTOMLEFT:
|
137
|
-
return self
|
138
|
-
elif self.coord_origin == CoordOrigin.TOPLEFT:
|
139
|
-
return BoundingBox(
|
140
|
-
l=self.l,
|
141
|
-
r=self.r,
|
142
|
-
t=page_height - self.t,
|
143
|
-
b=page_height - self.b,
|
144
|
-
coord_origin=CoordOrigin.BOTTOMLEFT,
|
145
|
-
)
|
146
|
-
|
147
|
-
def to_top_left_origin(self, page_height):
|
148
|
-
if self.coord_origin == CoordOrigin.TOPLEFT:
|
149
|
-
return self
|
150
|
-
elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
|
151
|
-
return BoundingBox(
|
152
|
-
l=self.l,
|
153
|
-
r=self.r,
|
154
|
-
t=page_height - self.t, # self.b
|
155
|
-
b=page_height - self.b, # self.t
|
156
|
-
coord_origin=CoordOrigin.TOPLEFT,
|
157
|
-
)
|
158
|
-
|
159
|
-
|
160
92
|
class Cell(BaseModel):
|
161
93
|
id: int
|
162
94
|
text: str
|
@@ -169,14 +101,14 @@ class OcrCell(Cell):
|
|
169
101
|
|
170
102
|
class Cluster(BaseModel):
|
171
103
|
id: int
|
172
|
-
label:
|
104
|
+
label: DocItemLabel
|
173
105
|
bbox: BoundingBox
|
174
106
|
confidence: float = 1.0
|
175
107
|
cells: List[Cell] = []
|
176
108
|
|
177
109
|
|
178
110
|
class BasePageElement(BaseModel):
|
179
|
-
label:
|
111
|
+
label: DocItemLabel
|
180
112
|
id: int
|
181
113
|
page_no: int
|
182
114
|
cluster: Cluster
|
@@ -187,37 +119,7 @@ class LayoutPrediction(BaseModel):
|
|
187
119
|
clusters: List[Cluster] = []
|
188
120
|
|
189
121
|
|
190
|
-
class
|
191
|
-
bbox: BoundingBox
|
192
|
-
row_span: int
|
193
|
-
col_span: int
|
194
|
-
start_row_offset_idx: int
|
195
|
-
end_row_offset_idx: int
|
196
|
-
start_col_offset_idx: int
|
197
|
-
end_col_offset_idx: int
|
198
|
-
text: str
|
199
|
-
column_header: bool = False
|
200
|
-
row_header: bool = False
|
201
|
-
row_section: bool = False
|
202
|
-
|
203
|
-
@model_validator(mode="before")
|
204
|
-
@classmethod
|
205
|
-
def from_dict_format(cls, data: Any) -> Any:
|
206
|
-
if isinstance(data, Dict):
|
207
|
-
text = data["bbox"].get("token", "")
|
208
|
-
if not len(text):
|
209
|
-
text_cells = data.pop("text_cell_bboxes", None)
|
210
|
-
if text_cells:
|
211
|
-
for el in text_cells:
|
212
|
-
text += el["token"] + " "
|
213
|
-
|
214
|
-
text = text.strip()
|
215
|
-
data["text"] = text
|
216
|
-
|
217
|
-
return data
|
218
|
-
|
219
|
-
|
220
|
-
class TableElement(BasePageElement):
|
122
|
+
class Table(BasePageElement):
|
221
123
|
otsl_seq: List[str]
|
222
124
|
num_rows: int = 0
|
223
125
|
num_cols: int = 0
|
@@ -225,18 +127,15 @@ class TableElement(BasePageElement):
|
|
225
127
|
|
226
128
|
|
227
129
|
class TableStructurePrediction(BaseModel):
|
228
|
-
table_map: Dict[int,
|
229
|
-
|
230
|
-
|
231
|
-
class TextElement(BasePageElement): ...
|
130
|
+
table_map: Dict[int, Table] = {}
|
232
131
|
|
233
132
|
|
234
|
-
class
|
235
|
-
|
133
|
+
class TextElement(BasePageElement):
|
134
|
+
text: str
|
236
135
|
|
237
136
|
|
238
137
|
class FigureElement(BasePageElement):
|
239
|
-
|
138
|
+
annotations: List[PictureDataType] = []
|
240
139
|
provenance: Optional[str] = None
|
241
140
|
predicted_class: Optional[str] = None
|
242
141
|
confidence: Optional[float] = None
|
@@ -259,7 +158,7 @@ class PagePredictions(BaseModel):
|
|
259
158
|
equations_prediction: Optional[EquationPrediction] = None
|
260
159
|
|
261
160
|
|
262
|
-
PageElement = Union[TextElement,
|
161
|
+
PageElement = Union[TextElement, Table, FigureElement]
|
263
162
|
|
264
163
|
|
265
164
|
class AssembledUnit(BaseModel):
|
@@ -272,13 +171,13 @@ class Page(BaseModel):
|
|
272
171
|
model_config = ConfigDict(arbitrary_types_allowed=True)
|
273
172
|
|
274
173
|
page_no: int
|
275
|
-
page_hash: Optional[str] = None
|
276
|
-
size: Optional[
|
174
|
+
# page_hash: Optional[str] = None
|
175
|
+
size: Optional[Size] = None
|
277
176
|
cells: List[Cell] = []
|
278
177
|
predictions: PagePredictions = PagePredictions()
|
279
178
|
assembled: Optional[AssembledUnit] = None
|
280
179
|
|
281
|
-
_backend: Optional[PdfPageBackend] = (
|
180
|
+
_backend: Optional["PdfPageBackend"] = (
|
282
181
|
None # Internal PDF backend. By default it is cleared during assembling.
|
283
182
|
)
|
284
183
|
_default_image_scale: float = 1.0 # Default image scale for external usage.
|
@@ -301,24 +200,5 @@ class Page(BaseModel):
|
|
301
200
|
class DocumentStream(BaseModel):
|
302
201
|
model_config = ConfigDict(arbitrary_types_allowed=True)
|
303
202
|
|
304
|
-
|
203
|
+
name: str
|
305
204
|
stream: BytesIO
|
306
|
-
|
307
|
-
|
308
|
-
class AssembleOptions(BaseModel):
|
309
|
-
keep_page_images: Annotated[
|
310
|
-
bool,
|
311
|
-
Field(
|
312
|
-
deprecated="`keep_page_images` is depreacted, set the value of `images_scale` instead"
|
313
|
-
),
|
314
|
-
] = False # False: page images are removed in the assemble step
|
315
|
-
images_scale: Optional[float] = None # if set, the scale for generated images
|
316
|
-
|
317
|
-
@model_validator(mode="after")
|
318
|
-
def set_page_images_from_deprecated(self) -> Self:
|
319
|
-
with warnings.catch_warnings():
|
320
|
-
warnings.simplefilter("ignore", DeprecationWarning)
|
321
|
-
default_scale = 1.0
|
322
|
-
if self.keep_page_images and self.images_scale is None:
|
323
|
-
self.images_scale = default_scale
|
324
|
-
return self
|