docling 2.69.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling might be problematic. Click here for more details.
- docling/__init__.py +0 -0
- docling/backend/__init__.py +0 -0
- docling/backend/abstract_backend.py +84 -0
- docling/backend/asciidoc_backend.py +443 -0
- docling/backend/csv_backend.py +125 -0
- docling/backend/docling_parse_backend.py +237 -0
- docling/backend/docling_parse_v2_backend.py +276 -0
- docling/backend/docling_parse_v4_backend.py +260 -0
- docling/backend/docx/__init__.py +0 -0
- docling/backend/docx/drawingml/utils.py +131 -0
- docling/backend/docx/latex/__init__.py +0 -0
- docling/backend/docx/latex/latex_dict.py +274 -0
- docling/backend/docx/latex/omml.py +459 -0
- docling/backend/html_backend.py +1502 -0
- docling/backend/image_backend.py +188 -0
- docling/backend/json/__init__.py +0 -0
- docling/backend/json/docling_json_backend.py +58 -0
- docling/backend/md_backend.py +618 -0
- docling/backend/mets_gbs_backend.py +399 -0
- docling/backend/msexcel_backend.py +686 -0
- docling/backend/mspowerpoint_backend.py +398 -0
- docling/backend/msword_backend.py +1663 -0
- docling/backend/noop_backend.py +51 -0
- docling/backend/pdf_backend.py +82 -0
- docling/backend/pypdfium2_backend.py +417 -0
- docling/backend/webvtt_backend.py +572 -0
- docling/backend/xml/__init__.py +0 -0
- docling/backend/xml/jats_backend.py +819 -0
- docling/backend/xml/uspto_backend.py +1905 -0
- docling/chunking/__init__.py +12 -0
- docling/cli/__init__.py +0 -0
- docling/cli/main.py +974 -0
- docling/cli/models.py +196 -0
- docling/cli/tools.py +17 -0
- docling/datamodel/__init__.py +0 -0
- docling/datamodel/accelerator_options.py +69 -0
- docling/datamodel/asr_model_specs.py +494 -0
- docling/datamodel/backend_options.py +102 -0
- docling/datamodel/base_models.py +493 -0
- docling/datamodel/document.py +699 -0
- docling/datamodel/extraction.py +39 -0
- docling/datamodel/layout_model_specs.py +91 -0
- docling/datamodel/pipeline_options.py +457 -0
- docling/datamodel/pipeline_options_asr_model.py +78 -0
- docling/datamodel/pipeline_options_vlm_model.py +136 -0
- docling/datamodel/settings.py +65 -0
- docling/datamodel/vlm_model_specs.py +365 -0
- docling/document_converter.py +559 -0
- docling/document_extractor.py +327 -0
- docling/exceptions.py +10 -0
- docling/experimental/__init__.py +5 -0
- docling/experimental/datamodel/__init__.py +1 -0
- docling/experimental/datamodel/table_crops_layout_options.py +13 -0
- docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
- docling/experimental/models/__init__.py +3 -0
- docling/experimental/models/table_crops_layout_model.py +114 -0
- docling/experimental/pipeline/__init__.py +1 -0
- docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
- docling/models/__init__.py +0 -0
- docling/models/base_layout_model.py +39 -0
- docling/models/base_model.py +230 -0
- docling/models/base_ocr_model.py +241 -0
- docling/models/base_table_model.py +45 -0
- docling/models/extraction/__init__.py +0 -0
- docling/models/extraction/nuextract_transformers_model.py +305 -0
- docling/models/factories/__init__.py +47 -0
- docling/models/factories/base_factory.py +122 -0
- docling/models/factories/layout_factory.py +7 -0
- docling/models/factories/ocr_factory.py +11 -0
- docling/models/factories/picture_description_factory.py +11 -0
- docling/models/factories/table_factory.py +7 -0
- docling/models/picture_description_base_model.py +149 -0
- docling/models/plugins/__init__.py +0 -0
- docling/models/plugins/defaults.py +60 -0
- docling/models/stages/__init__.py +0 -0
- docling/models/stages/code_formula/__init__.py +0 -0
- docling/models/stages/code_formula/code_formula_model.py +342 -0
- docling/models/stages/layout/__init__.py +0 -0
- docling/models/stages/layout/layout_model.py +249 -0
- docling/models/stages/ocr/__init__.py +0 -0
- docling/models/stages/ocr/auto_ocr_model.py +132 -0
- docling/models/stages/ocr/easyocr_model.py +200 -0
- docling/models/stages/ocr/ocr_mac_model.py +145 -0
- docling/models/stages/ocr/rapid_ocr_model.py +328 -0
- docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
- docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
- docling/models/stages/page_assemble/__init__.py +0 -0
- docling/models/stages/page_assemble/page_assemble_model.py +156 -0
- docling/models/stages/page_preprocessing/__init__.py +0 -0
- docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
- docling/models/stages/picture_classifier/__init__.py +0 -0
- docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
- docling/models/stages/picture_description/__init__.py +0 -0
- docling/models/stages/picture_description/picture_description_api_model.py +66 -0
- docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
- docling/models/stages/reading_order/__init__.py +0 -0
- docling/models/stages/reading_order/readingorder_model.py +431 -0
- docling/models/stages/table_structure/__init__.py +0 -0
- docling/models/stages/table_structure/table_structure_model.py +305 -0
- docling/models/utils/__init__.py +0 -0
- docling/models/utils/generation_utils.py +157 -0
- docling/models/utils/hf_model_download.py +45 -0
- docling/models/vlm_pipeline_models/__init__.py +1 -0
- docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
- docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
- docling/models/vlm_pipeline_models/mlx_model.py +325 -0
- docling/models/vlm_pipeline_models/vllm_model.py +344 -0
- docling/pipeline/__init__.py +0 -0
- docling/pipeline/asr_pipeline.py +431 -0
- docling/pipeline/base_extraction_pipeline.py +72 -0
- docling/pipeline/base_pipeline.py +326 -0
- docling/pipeline/extraction_vlm_pipeline.py +207 -0
- docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
- docling/pipeline/simple_pipeline.py +55 -0
- docling/pipeline/standard_pdf_pipeline.py +859 -0
- docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
- docling/pipeline/vlm_pipeline.py +416 -0
- docling/py.typed +1 -0
- docling/utils/__init__.py +0 -0
- docling/utils/accelerator_utils.py +97 -0
- docling/utils/api_image_request.py +205 -0
- docling/utils/deepseekocr_utils.py +388 -0
- docling/utils/export.py +146 -0
- docling/utils/glm_utils.py +361 -0
- docling/utils/layout_postprocessor.py +683 -0
- docling/utils/locks.py +3 -0
- docling/utils/model_downloader.py +168 -0
- docling/utils/ocr_utils.py +69 -0
- docling/utils/orientation.py +65 -0
- docling/utils/profiling.py +65 -0
- docling/utils/utils.py +65 -0
- docling/utils/visualization.py +85 -0
- docling-2.69.0.dist-info/METADATA +237 -0
- docling-2.69.0.dist-info/RECORD +138 -0
- docling-2.69.0.dist-info/WHEEL +5 -0
- docling-2.69.0.dist-info/entry_points.txt +6 -0
- docling-2.69.0.dist-info/licenses/LICENSE +21 -0
- docling-2.69.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,493 @@
|
|
|
1
|
+
from collections import defaultdict
|
|
2
|
+
from enum import Enum
|
|
3
|
+
from typing import TYPE_CHECKING, Optional, Type, Union
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
from docling_core.types.doc import (
|
|
7
|
+
BoundingBox,
|
|
8
|
+
DocItemLabel,
|
|
9
|
+
NodeItem,
|
|
10
|
+
PictureDataType,
|
|
11
|
+
Size,
|
|
12
|
+
TableCell,
|
|
13
|
+
)
|
|
14
|
+
from docling_core.types.doc.base import PydanticSerCtxKey, round_pydantic_float
|
|
15
|
+
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
|
16
|
+
from docling_core.types.io import DocumentStream
|
|
17
|
+
|
|
18
|
+
# DO NOT REMOVE; explicitly exposed from this location
|
|
19
|
+
from PIL.Image import Image
|
|
20
|
+
from pydantic import (
|
|
21
|
+
BaseModel,
|
|
22
|
+
ConfigDict,
|
|
23
|
+
Field,
|
|
24
|
+
FieldSerializationInfo,
|
|
25
|
+
computed_field,
|
|
26
|
+
field_serializer,
|
|
27
|
+
field_validator,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
if TYPE_CHECKING:
|
|
31
|
+
from docling.backend.pdf_backend import PdfPageBackend
|
|
32
|
+
|
|
33
|
+
from docling.backend.abstract_backend import AbstractDocumentBackend
|
|
34
|
+
from docling.datamodel.pipeline_options import PipelineOptions
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class BaseFormatOption(BaseModel):
    """Base class for format options used by _DocumentConversionInput."""

    # Optional pipeline configuration; None when the caller supplies none.
    pipeline_options: Optional[PipelineOptions] = None
    # Required: the backend *class* (a subclass of AbstractDocumentBackend),
    # not an instance.
    backend: Type[AbstractDocumentBackend]

    # Allow field types that pydantic has no built-in validator for.
    model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class ConversionStatus(str, Enum):
    # Status values for a conversion. The `str` mixin makes every member
    # compare equal to its plain-string value.
    PENDING = "pending"
    STARTED = "started"
    FAILURE = "failure"
    SUCCESS = "success"
    PARTIAL_SUCCESS = "partial_success"
    SKIPPED = "skipped"
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class InputFormat(str, Enum):
    """A document format supported by document backend parsers."""

    # Members are string-valued; these are the keys used by
    # FormatToExtensions and FormatToMimeType in this module.
    DOCX = "docx"
    PPTX = "pptx"
    HTML = "html"
    IMAGE = "image"
    PDF = "pdf"
    ASCIIDOC = "asciidoc"
    MD = "md"
    CSV = "csv"
    XLSX = "xlsx"
    XML_USPTO = "xml_uspto"
    XML_JATS = "xml_jats"
    METS_GBS = "mets_gbs"
    JSON_DOCLING = "json_docling"
    AUDIO = "audio"
    VTT = "vtt"
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class OutputFormat(str, Enum):
    # String-valued identifiers for output formats. Note that the MARKDOWN
    # member's value is "md", not "markdown".
    MARKDOWN = "md"
    JSON = "json"
    YAML = "yaml"
    HTML = "html"
    HTML_SPLIT_PAGE = "html_split_page"
    TEXT = "text"
    DOCTAGS = "doctags"
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
# File extensions (lower-case, no leading dot) listed for each InputFormat.
# An extension is not always unique to one format: "xml" is listed under both
# XML_JATS and XML_USPTO, so the extension alone cannot pick between them.
FormatToExtensions: dict[InputFormat, list[str]] = {
    InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
    InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
    InputFormat.PDF: ["pdf"],
    InputFormat.MD: ["md"],
    InputFormat.HTML: ["html", "htm", "xhtml"],
    InputFormat.XML_JATS: ["xml", "nxml"],
    InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp", "webp"],
    InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
    InputFormat.CSV: ["csv"],
    InputFormat.XLSX: ["xlsx", "xlsm"],
    InputFormat.XML_USPTO: ["xml", "txt"],
    # Compound extension: contains a dot, unlike every other entry.
    InputFormat.METS_GBS: ["tar.gz"],
    InputFormat.JSON_DOCLING: ["json"],
    InputFormat.AUDIO: ["wav", "mp3", "m4a", "aac", "ogg", "flac", "mp4", "avi", "mov"],
    InputFormat.VTT: ["vtt"],
}
|
|
102
|
+
|
|
103
|
+
# MIME types listed for each InputFormat. A MIME type may appear under more
# than one format ("application/xml" is listed for both XML_JATS and
# XML_USPTO); MimeTypeToFormat below is the reverse index built from this table.
FormatToMimeType: dict[InputFormat, list[str]] = {
    InputFormat.DOCX: [
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
    ],
    InputFormat.PPTX: [
        "application/vnd.openxmlformats-officedocument.presentationml.template",
        "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    ],
    InputFormat.HTML: ["text/html", "application/xhtml+xml"],
    InputFormat.XML_JATS: ["application/xml"],
    # NOTE(review): "image/gif" is listed here although FormatToExtensions has
    # no "gif" extension for IMAGE — confirm whether that asymmetry is intended.
    InputFormat.IMAGE: [
        "image/png",
        "image/jpeg",
        "image/tiff",
        "image/gif",
        "image/bmp",
        "image/webp",
    ],
    InputFormat.PDF: ["application/pdf"],
    InputFormat.ASCIIDOC: ["text/asciidoc"],
    InputFormat.MD: ["text/markdown", "text/x-markdown"],
    InputFormat.CSV: ["text/csv"],
    InputFormat.XLSX: [
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    ],
    InputFormat.XML_USPTO: ["application/xml", "text/plain"],
    InputFormat.METS_GBS: ["application/mets+xml"],
    InputFormat.JSON_DOCLING: ["application/json"],
    # The AUDIO format also lists video container MIME types.
    InputFormat.AUDIO: [
        "audio/x-wav",
        "audio/mpeg",
        "audio/wav",
        "audio/mp3",
        "audio/mp4",
        "audio/m4a",
        "audio/aac",
        "audio/ogg",
        "audio/flac",
        "audio/x-flac",
        "video/mp4",
        "video/avi",
        "video/x-msvideo",
        "video/quicktime",
    ],
    InputFormat.VTT: ["text/vtt"],
}
|
|
151
|
+
|
|
152
|
+
def _invert_mime_table(by_format):
    """Build the reverse index: MIME type -> list of formats that list it.

    Keys appear in order of first occurrence while walking ``by_format``;
    each value lists the matching formats in ``by_format`` key order, with
    every format appearing at most once.
    """
    inverted = {}
    for fmt, mime_types in by_format.items():
        for mime in mime_types:
            owners = inverted.setdefault(mime, [])
            # A format is recorded once even if it lists the MIME type twice.
            if fmt not in owners:
                owners.append(fmt)
    return inverted


# Reverse lookup of FormatToMimeType. A MIME type shared by several formats
# maps to all of them.
MimeTypeToFormat: dict[str, list[InputFormat]] = _invert_mime_table(FormatToMimeType)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
class DocInputType(str, Enum):
    # String-valued enum with two members naming an input kind:
    # "path" and "stream".
    PATH = "path"
    STREAM = "stream"
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
class DoclingComponentType(str, Enum):
    # String-valued component identifiers; used as the type of
    # ErrorItem.component_type in this module.
    DOCUMENT_BACKEND = "document_backend"
    MODEL = "model"
    DOC_ASSEMBLER = "doc_assembler"
    USER_INPUT = "user_input"
    PIPELINE = "pipeline"
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
class VlmStopReason(str, Enum):
    # Reasons a VLM generation ended; the type of VlmPrediction.stop_reason.
    LENGTH = "length"  # max tokens reached
    STOP_SEQUENCE = "stop_sequence"  # Custom stopping criteria met
    END_OF_SEQUENCE = "end_of_sequence"  # Model generated end-of-text token
    UNSPECIFIED = "unspecified"  # Default value when no reason is known
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
class ErrorItem(BaseModel):
    # Record of a single error: the component type, the module name and a
    # message. All three fields are required.
    component_type: DoclingComponentType
    module_name: str
    error_message: str
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
class Cluster(BaseModel):
    # A labelled bounding box together with the text cells and child clusters
    # attached to it.
    id: int
    label: DocItemLabel
    bbox: BoundingBox
    confidence: float = 1.0
    cells: list[TextCell] = []
    children: list["Cluster"] = []  # Add child cluster support

    @field_serializer("confidence")
    def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
        # Delegates to round_pydantic_float with the serialization context and
        # the CONFID_PREC key — presumably rounding to a precision supplied by
        # the caller through that context; confirm against docling_core.
        return round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC)
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
class BasePageElement(BaseModel):
    # Fields shared by the page element types defined in this module
    # (TextElement, Table, FigureElement, ContainerElement).
    label: DocItemLabel
    id: int
    page_no: int
    cluster: Cluster
    # Optional here; TextElement narrows it to a required str.
    text: Optional[str] = None
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
class LayoutPrediction(BaseModel):
    # Container for a list of Cluster objects; empty by default.
    clusters: list[Cluster] = []
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
class VlmPredictionToken(BaseModel):
    # One generated token. The defaults ("" / -1 / -1) look like "not set"
    # placeholders — NOTE(review): confirm, since -1 is also a representable
    # log-probability.
    text: str = ""
    token: int = -1
    logprob: float = -1
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
class VlmPrediction(BaseModel):
    # Result of one VLM generation; every field has a default.
    text: str = ""
    generated_tokens: list[VlmPredictionToken] = []
    # Defaults to -1; the unit is not stated in this file (presumably seconds).
    generation_time: float = -1
    num_tokens: Optional[int] = None
    stop_reason: VlmStopReason = VlmStopReason.UNSPECIFIED
    input_prompt: Optional[str] = None
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
# Adds no fields or behaviour to BasePageElement; it exists so that container
# elements have a distinct type (it is one member of the PageElement union).
class ContainerElement(
    BasePageElement
):  # Used for Form and Key-Value-Regions, only for typing.
    pass
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
class Table(BasePageElement):
    # Page element for a table: a token sequence, the grid dimensions and the
    # table cells.
    # Required list of string tokens — presumably the OTSL structure tokens
    # of the table; confirm against the producer.
    otsl_seq: list[str]
    num_rows: int = 0
    num_cols: int = 0
    # Required list of cells.
    table_cells: list[TableCell]
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
class TableStructurePrediction(BaseModel):
    # Tables keyed by an integer (presumably the id of the table's cluster —
    # confirm against the code that fills it); empty by default.
    table_map: dict[int, Table] = {}
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
class TextElement(BasePageElement):
    # Overrides BasePageElement.text (Optional[str] = None) as a required,
    # non-optional string.
    text: str
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
class FigureElement(BasePageElement):
    # Page element for a figure: optional annotations plus an optional
    # predicted class and its confidence.
    annotations: list[PictureDataType] = []
    provenance: Optional[str] = None
    predicted_class: Optional[str] = None
    confidence: Optional[float] = None

    @field_serializer("confidence")
    def _serialize(
        self, value: Optional[float], info: FieldSerializationInfo
    ) -> Optional[float]:
        """Serialize ``confidence``: None is passed through unchanged, any
        other value goes through ``round_pydantic_float``."""
        if value is None:
            return None
        return round_pydantic_float(
            value, info.context, PydanticSerCtxKey.CONFID_PREC
        )
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
class FigureClassificationPrediction(BaseModel):
    # A count plus a map of FigureElement objects keyed by an integer. The
    # model does not keep the two in sync; callers set both.
    figure_count: int = 0
    figure_map: dict[int, FigureElement] = {}
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
class EquationPrediction(BaseModel):
    # A count plus a map of TextElement objects keyed by an integer. The model
    # does not keep the two in sync; callers set both.
    equation_count: int = 0
    equation_map: dict[int, TextElement] = {}
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
class PagePredictions(BaseModel):
    # Per-page prediction slots; each one is None until it is assigned.
    layout: Optional[LayoutPrediction] = None
    tablestructure: Optional[TableStructurePrediction] = None
    figures_classification: Optional[FigureClassificationPrediction] = None
    equations_prediction: Optional[EquationPrediction] = None
    vlm_response: Optional[VlmPrediction] = None
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
# Union of the concrete page element types; used as the item type of the
# AssembledUnit lists.
PageElement = Union[TextElement, Table, FigureElement, ContainerElement]
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
class AssembledUnit(BaseModel):
    # Three lists of page elements, all empty by default. How elements are
    # split between `body` and `headers` is decided by the code that fills
    # them, which is not in this file.
    elements: list[PageElement] = []
    body: list[PageElement] = []
    headers: list[PageElement] = []
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
class ItemAndImageEnrichmentElement(BaseModel):
    # Pairs a document node item with a PIL image.
    # Required because PIL's Image is not a type pydantic validates natively.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    item: NodeItem
    image: Image
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
class Page(BaseModel):
    # A single page: its number, size, parsed content, predictions and
    # assembled result, plus private rendering state (backend and image cache).
    model_config = ConfigDict(arbitrary_types_allowed=True)

    page_no: int
    # page_hash: Optional[str] = None
    size: Optional[Size] = None
    parsed_page: Optional[SegmentedPdfPage] = None
    predictions: PagePredictions = PagePredictions()
    assembled: Optional[AssembledUnit] = None

    # Internal PDF backend. By default it is cleared during assembling.
    _backend: Optional["PdfPageBackend"] = None
    # Default image scale for external usage.
    _default_image_scale: float = 1.0
    # Cache of images in different scales. By default it is cleared during
    # assembling.
    _image_cache: dict[float, Image] = {}

    @property
    def cells(self) -> list[TextCell]:
        """Return text cells as a read-only view of parsed_page.textline_cells."""
        parsed = self.parsed_page
        return parsed.textline_cells if parsed is not None else []

    def get_image(
        self,
        scale: float = 1.0,
        max_size: Optional[int] = None,
        cropbox: Optional[BoundingBox] = None,
    ) -> Optional[Image]:
        """Return the page image at ``scale``, optionally cropped.

        Args:
            scale: Requested rendering scale.
            max_size: If truthy, ``scale`` is capped so that the larger page
                dimension times the scale does not exceed this value.
            cropbox: Region to return instead of the full page.

        Returns:
            The image, or None when there is no backend and nothing is cached
            for ``scale``.
        """
        backend = self._backend
        if backend is None:
            # Without a backend only the cache can answer. ``max_size`` and
            # ``cropbox`` are not applied on this path.
            return self._image_cache.get(scale, None)

        if max_size:
            assert self.size is not None
            largest_side = max(self.size.as_tuple())
            scale = min(scale, max_size / largest_side)

        if scale not in self._image_cache:
            if cropbox is not None:
                # Cache miss with a crop request: render only the crop and do
                # not cache it.
                return backend.get_page_image(scale=scale, cropbox=cropbox)
            self._image_cache[scale] = backend.get_page_image(scale=scale)

        full_page = self._image_cache[scale]
        if cropbox is None:
            return full_page

        # Crop from the cached full-page image: convert the box to top-left
        # origin, then scale it to image pixels.
        assert self.size is not None
        top_left_box = cropbox.to_top_left_origin(page_height=self.size.height)
        return full_page.crop(top_left_box.scaled(scale=scale).as_tuple())

    @property
    def image(self) -> Optional[Image]:
        """Page image at the default image scale (see ``get_image``)."""
        return self.get_image(scale=self._default_image_scale)
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
## OpenAI API Request / Response Models ##
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
class OpenAiChatMessage(BaseModel):
    # A chat message: a role and its text content, both required strings.
    role: str
    content: str
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
class OpenAiResponseChoice(BaseModel):
    # One entry of OpenAiApiResponse.choices.
    index: int
    message: OpenAiChatMessage
    # May be None, but has no default, so the key must still be supplied.
    finish_reason: Optional[str]
|
|
370
|
+
|
|
371
|
+
|
|
372
|
+
class OpenAiResponseUsage(BaseModel):
    # Token counts of a response; all three are required.
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
class OpenAiApiResponse(BaseModel):
    # Top-level response body: id, optional model name, choices, creation
    # stamp and usage.
    # Empty protected_namespaces reserves no field-name prefixes — presumably
    # set to avoid pydantic's "model_" namespace warning; confirm.
    model_config = ConfigDict(
        protected_namespaces=(),
    )

    id: str
    model: Optional[str] = None  # returned by openai
    choices: list[OpenAiResponseChoice]
    created: int
    usage: OpenAiResponseUsage
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
# Type alias for score values. It is also called as a constructor
# (ScoreValue(...)) to turn numpy results into plain floats.
ScoreValue = float
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
class QualityGrade(str, Enum):
    # Grades returned by PageConfidenceScores._score_to_grade; UNSPECIFIED is
    # what it returns for a NaN score.
    POOR = "poor"
    FAIR = "fair"
    GOOD = "good"
    EXCELLENT = "excellent"
    UNSPECIFIED = "unspecified"
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
class PageConfidenceScores(BaseModel):
    # Four component scores for one page. NaN (the default) means "no score
    # available" and is ignored by the aggregates below.
    parse_score: ScoreValue = np.nan
    layout_score: ScoreValue = np.nan
    table_score: ScoreValue = np.nan
    ocr_score: ScoreValue = np.nan

    # Accept null/None or string "NaN" values on input and coerce to np.nan
    @field_validator(
        "parse_score", "layout_score", "table_score", "ocr_score", mode="before"
    )
    @classmethod
    def _coerce_none_or_nan_str(cls, v):
        """Map None and the strings "nan", "null", "none", "" (case and
        surrounding whitespace ignored) to np.nan; pass anything else through
        to normal float validation."""
        if v is None:
            return np.nan
        if isinstance(v, str) and v.strip().lower() in {"nan", "null", "none", ""}:
            return np.nan
        return v

    def _component_scores(self) -> list[ScoreValue]:
        """Return the four component scores in a fixed order."""
        return [
            self.ocr_score,
            self.table_score,
            self.layout_score,
            self.parse_score,
        ]

    def _score_to_grade(self, score: ScoreValue) -> QualityGrade:
        """Bucket a score: <0.5 POOR, <0.8 FAIR, <0.9 GOOD, >=0.9 EXCELLENT."""
        if score < 0.5:
            return QualityGrade.POOR
        elif score < 0.8:
            return QualityGrade.FAIR
        elif score < 0.9:
            return QualityGrade.GOOD
        elif score >= 0.9:
            return QualityGrade.EXCELLENT

        # Only NaN reaches this point: every comparison against NaN is False.
        return QualityGrade.UNSPECIFIED

    # Grade of mean_score.
    @computed_field  # type: ignore
    @property
    def mean_grade(self) -> QualityGrade:
        return self._score_to_grade(self.mean_score)

    # Grade of low_score.
    @computed_field  # type: ignore
    @property
    def low_grade(self) -> QualityGrade:
        return self._score_to_grade(self.low_score)

    # NaN-ignoring mean of the component scores; NaN when none is available.
    @computed_field  # type: ignore
    @property
    def mean_score(self) -> ScoreValue:
        scores = self._component_scores()
        # np.nanmean raises a RuntimeWarning on all-NaN input, which is the
        # state of a default instance. Return NaN directly; same value, no
        # warning.
        if np.isnan(scores).all():
            return ScoreValue(np.nan)
        return ScoreValue(np.nanmean(scores))

    # NaN-ignoring 5% quantile of the component scores; NaN when none is
    # available.
    @computed_field  # type: ignore
    @property
    def low_score(self) -> ScoreValue:
        scores = self._component_scores()
        # np.nanquantile raises a RuntimeWarning on all-NaN input; return NaN
        # directly instead.
        if np.isnan(scores).all():
            return ScoreValue(np.nan)
        return ScoreValue(np.nanquantile(scores, q=0.05))
|
|
470
|
+
|
|
471
|
+
|
|
472
|
+
class ConfidenceReport(PageConfidenceScores):
    # Document-level report: the inherited component scores plus per-page
    # scores. mean_score and low_score are overridden to aggregate over
    # `pages` rather than over this object's own component scores.

    # Keyed by an integer page identifier. The default is a defaultdict, so
    # reading a missing key creates a fresh PageConfidenceScores entry.
    pages: dict[int, PageConfidenceScores] = Field(
        default_factory=lambda: defaultdict(PageConfidenceScores)
    )

    # NaN-ignoring mean of the per-page mean scores; NaN when there are no
    # pages or no page has a score.
    @computed_field  # type: ignore
    @property
    def mean_score(self) -> ScoreValue:
        per_page = [c.mean_score for c in self.pages.values()]
        # np.nanmean raises a RuntimeWarning for an empty or all-NaN list
        # (a default report has no pages). Return NaN directly in that case;
        # the value is the same.
        if np.isnan(per_page).all():
            return ScoreValue(np.nan)
        return ScoreValue(np.nanmean(per_page))

    # NaN-ignoring mean (not a quantile) of the per-page low scores; NaN when
    # there are no pages or no page has a score.
    @computed_field  # type: ignore
    @property
    def low_score(self) -> ScoreValue:
        per_page = [c.low_score for c in self.pages.values()]
        # Same guard as mean_score: skip the np.nanmean call that would warn.
        if np.isnan(per_page).all():
            return ScoreValue(np.nan)
        return ScoreValue(np.nanmean(per_page))
|