docling 2.69.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (138) hide show
  1. docling/__init__.py +0 -0
  2. docling/backend/__init__.py +0 -0
  3. docling/backend/abstract_backend.py +84 -0
  4. docling/backend/asciidoc_backend.py +443 -0
  5. docling/backend/csv_backend.py +125 -0
  6. docling/backend/docling_parse_backend.py +237 -0
  7. docling/backend/docling_parse_v2_backend.py +276 -0
  8. docling/backend/docling_parse_v4_backend.py +260 -0
  9. docling/backend/docx/__init__.py +0 -0
  10. docling/backend/docx/drawingml/utils.py +131 -0
  11. docling/backend/docx/latex/__init__.py +0 -0
  12. docling/backend/docx/latex/latex_dict.py +274 -0
  13. docling/backend/docx/latex/omml.py +459 -0
  14. docling/backend/html_backend.py +1502 -0
  15. docling/backend/image_backend.py +188 -0
  16. docling/backend/json/__init__.py +0 -0
  17. docling/backend/json/docling_json_backend.py +58 -0
  18. docling/backend/md_backend.py +618 -0
  19. docling/backend/mets_gbs_backend.py +399 -0
  20. docling/backend/msexcel_backend.py +686 -0
  21. docling/backend/mspowerpoint_backend.py +398 -0
  22. docling/backend/msword_backend.py +1663 -0
  23. docling/backend/noop_backend.py +51 -0
  24. docling/backend/pdf_backend.py +82 -0
  25. docling/backend/pypdfium2_backend.py +417 -0
  26. docling/backend/webvtt_backend.py +572 -0
  27. docling/backend/xml/__init__.py +0 -0
  28. docling/backend/xml/jats_backend.py +819 -0
  29. docling/backend/xml/uspto_backend.py +1905 -0
  30. docling/chunking/__init__.py +12 -0
  31. docling/cli/__init__.py +0 -0
  32. docling/cli/main.py +974 -0
  33. docling/cli/models.py +196 -0
  34. docling/cli/tools.py +17 -0
  35. docling/datamodel/__init__.py +0 -0
  36. docling/datamodel/accelerator_options.py +69 -0
  37. docling/datamodel/asr_model_specs.py +494 -0
  38. docling/datamodel/backend_options.py +102 -0
  39. docling/datamodel/base_models.py +493 -0
  40. docling/datamodel/document.py +699 -0
  41. docling/datamodel/extraction.py +39 -0
  42. docling/datamodel/layout_model_specs.py +91 -0
  43. docling/datamodel/pipeline_options.py +457 -0
  44. docling/datamodel/pipeline_options_asr_model.py +78 -0
  45. docling/datamodel/pipeline_options_vlm_model.py +136 -0
  46. docling/datamodel/settings.py +65 -0
  47. docling/datamodel/vlm_model_specs.py +365 -0
  48. docling/document_converter.py +559 -0
  49. docling/document_extractor.py +327 -0
  50. docling/exceptions.py +10 -0
  51. docling/experimental/__init__.py +5 -0
  52. docling/experimental/datamodel/__init__.py +1 -0
  53. docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  54. docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  55. docling/experimental/models/__init__.py +3 -0
  56. docling/experimental/models/table_crops_layout_model.py +114 -0
  57. docling/experimental/pipeline/__init__.py +1 -0
  58. docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  59. docling/models/__init__.py +0 -0
  60. docling/models/base_layout_model.py +39 -0
  61. docling/models/base_model.py +230 -0
  62. docling/models/base_ocr_model.py +241 -0
  63. docling/models/base_table_model.py +45 -0
  64. docling/models/extraction/__init__.py +0 -0
  65. docling/models/extraction/nuextract_transformers_model.py +305 -0
  66. docling/models/factories/__init__.py +47 -0
  67. docling/models/factories/base_factory.py +122 -0
  68. docling/models/factories/layout_factory.py +7 -0
  69. docling/models/factories/ocr_factory.py +11 -0
  70. docling/models/factories/picture_description_factory.py +11 -0
  71. docling/models/factories/table_factory.py +7 -0
  72. docling/models/picture_description_base_model.py +149 -0
  73. docling/models/plugins/__init__.py +0 -0
  74. docling/models/plugins/defaults.py +60 -0
  75. docling/models/stages/__init__.py +0 -0
  76. docling/models/stages/code_formula/__init__.py +0 -0
  77. docling/models/stages/code_formula/code_formula_model.py +342 -0
  78. docling/models/stages/layout/__init__.py +0 -0
  79. docling/models/stages/layout/layout_model.py +249 -0
  80. docling/models/stages/ocr/__init__.py +0 -0
  81. docling/models/stages/ocr/auto_ocr_model.py +132 -0
  82. docling/models/stages/ocr/easyocr_model.py +200 -0
  83. docling/models/stages/ocr/ocr_mac_model.py +145 -0
  84. docling/models/stages/ocr/rapid_ocr_model.py +328 -0
  85. docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
  86. docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
  87. docling/models/stages/page_assemble/__init__.py +0 -0
  88. docling/models/stages/page_assemble/page_assemble_model.py +156 -0
  89. docling/models/stages/page_preprocessing/__init__.py +0 -0
  90. docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
  91. docling/models/stages/picture_classifier/__init__.py +0 -0
  92. docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
  93. docling/models/stages/picture_description/__init__.py +0 -0
  94. docling/models/stages/picture_description/picture_description_api_model.py +66 -0
  95. docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
  96. docling/models/stages/reading_order/__init__.py +0 -0
  97. docling/models/stages/reading_order/readingorder_model.py +431 -0
  98. docling/models/stages/table_structure/__init__.py +0 -0
  99. docling/models/stages/table_structure/table_structure_model.py +305 -0
  100. docling/models/utils/__init__.py +0 -0
  101. docling/models/utils/generation_utils.py +157 -0
  102. docling/models/utils/hf_model_download.py +45 -0
  103. docling/models/vlm_pipeline_models/__init__.py +1 -0
  104. docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
  105. docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
  106. docling/models/vlm_pipeline_models/mlx_model.py +325 -0
  107. docling/models/vlm_pipeline_models/vllm_model.py +344 -0
  108. docling/pipeline/__init__.py +0 -0
  109. docling/pipeline/asr_pipeline.py +431 -0
  110. docling/pipeline/base_extraction_pipeline.py +72 -0
  111. docling/pipeline/base_pipeline.py +326 -0
  112. docling/pipeline/extraction_vlm_pipeline.py +207 -0
  113. docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
  114. docling/pipeline/simple_pipeline.py +55 -0
  115. docling/pipeline/standard_pdf_pipeline.py +859 -0
  116. docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  117. docling/pipeline/vlm_pipeline.py +416 -0
  118. docling/py.typed +1 -0
  119. docling/utils/__init__.py +0 -0
  120. docling/utils/accelerator_utils.py +97 -0
  121. docling/utils/api_image_request.py +205 -0
  122. docling/utils/deepseekocr_utils.py +388 -0
  123. docling/utils/export.py +146 -0
  124. docling/utils/glm_utils.py +361 -0
  125. docling/utils/layout_postprocessor.py +683 -0
  126. docling/utils/locks.py +3 -0
  127. docling/utils/model_downloader.py +168 -0
  128. docling/utils/ocr_utils.py +69 -0
  129. docling/utils/orientation.py +65 -0
  130. docling/utils/profiling.py +65 -0
  131. docling/utils/utils.py +65 -0
  132. docling/utils/visualization.py +85 -0
  133. docling-2.69.0.dist-info/METADATA +237 -0
  134. docling-2.69.0.dist-info/RECORD +138 -0
  135. docling-2.69.0.dist-info/WHEEL +5 -0
  136. docling-2.69.0.dist-info/entry_points.txt +6 -0
  137. docling-2.69.0.dist-info/licenses/LICENSE +21 -0
  138. docling-2.69.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,493 @@
1
+ from collections import defaultdict
2
+ from enum import Enum
3
+ from typing import TYPE_CHECKING, Optional, Type, Union
4
+
5
+ import numpy as np
6
+ from docling_core.types.doc import (
7
+ BoundingBox,
8
+ DocItemLabel,
9
+ NodeItem,
10
+ PictureDataType,
11
+ Size,
12
+ TableCell,
13
+ )
14
+ from docling_core.types.doc.base import PydanticSerCtxKey, round_pydantic_float
15
+ from docling_core.types.doc.page import SegmentedPdfPage, TextCell
16
+ from docling_core.types.io import DocumentStream
17
+
18
+ # DO NOT REMOVE; explicitly exposed from this location
19
+ from PIL.Image import Image
20
+ from pydantic import (
21
+ BaseModel,
22
+ ConfigDict,
23
+ Field,
24
+ FieldSerializationInfo,
25
+ computed_field,
26
+ field_serializer,
27
+ field_validator,
28
+ )
29
+
30
+ if TYPE_CHECKING:
31
+ from docling.backend.pdf_backend import PdfPageBackend
32
+
33
+ from docling.backend.abstract_backend import AbstractDocumentBackend
34
+ from docling.datamodel.pipeline_options import PipelineOptions
35
+
36
+
37
class BaseFormatOption(BaseModel):
    """Base class for format options used by _DocumentConversionInput."""

    # Optional pipeline configuration; None when the caller supplies nothing.
    pipeline_options: Optional[PipelineOptions] = None
    # The backend *class* (a subclass of AbstractDocumentBackend), not an instance.
    backend: Type[AbstractDocumentBackend]

    # Allow field types that pydantic has no built-in validator for.
    model_config = ConfigDict(arbitrary_types_allowed=True)
44
+
45
+
46
# Status values for a document conversion. Members are also `str` instances,
# so they compare equal to their plain string values.
class ConversionStatus(str, Enum):
    PENDING = "pending"
    STARTED = "started"
    FAILURE = "failure"
    SUCCESS = "success"
    PARTIAL_SUCCESS = "partial_success"
    SKIPPED = "skipped"
53
+
54
+
55
class InputFormat(str, Enum):
    """A document format supported by document backend parsers."""

    # Values are format identifiers, not file extensions; the extension and
    # MIME-type lists per format live in FormatToExtensions / FormatToMimeType.
    DOCX = "docx"
    PPTX = "pptx"
    HTML = "html"
    IMAGE = "image"
    PDF = "pdf"
    ASCIIDOC = "asciidoc"
    MD = "md"
    CSV = "csv"
    XLSX = "xlsx"
    XML_USPTO = "xml_uspto"
    XML_JATS = "xml_jats"
    METS_GBS = "mets_gbs"
    JSON_DOCLING = "json_docling"
    AUDIO = "audio"
    VTT = "vtt"
73
+
74
+
75
# Identifiers for the supported export formats. Members are also `str`
# instances, so they compare equal to their plain string values.
class OutputFormat(str, Enum):
    MARKDOWN = "md"
    JSON = "json"
    YAML = "yaml"
    HTML = "html"
    HTML_SPLIT_PAGE = "html_split_page"
    TEXT = "text"
    DOCTAGS = "doctags"
83
+
84
+
85
# File extensions (lowercase, without the leading dot) associated with each
# input format. An extension may map to more than one format: "xml" is listed
# for both XML_JATS and XML_USPTO, so an extension alone cannot always decide
# the format.
# NOTE(review): FormatToMimeType lists "image/gif" for IMAGE, but "gif" is not
# among the IMAGE extensions here — confirm whether that is intentional.
FormatToExtensions: dict[InputFormat, list[str]] = {
    InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
    InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
    InputFormat.PDF: ["pdf"],
    InputFormat.MD: ["md"],
    InputFormat.HTML: ["html", "htm", "xhtml"],
    InputFormat.XML_JATS: ["xml", "nxml"],
    InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp", "webp"],
    InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
    InputFormat.CSV: ["csv"],
    InputFormat.XLSX: ["xlsx", "xlsm"],
    InputFormat.XML_USPTO: ["xml", "txt"],
    # The only multi-part extension in this table.
    InputFormat.METS_GBS: ["tar.gz"],
    InputFormat.JSON_DOCLING: ["json"],
    # Includes video container extensions (mp4, avi, mov) alongside audio ones.
    InputFormat.AUDIO: ["wav", "mp3", "m4a", "aac", "ogg", "flac", "mp4", "avi", "mov"],
    InputFormat.VTT: ["vtt"],
}
102
+
103
# MIME types associated with each input format. A MIME type may appear under
# several formats ("application/xml" is listed for both XML_JATS and
# XML_USPTO); MimeTypeToFormat below is the inverse mapping and therefore maps
# each MIME type to a *list* of formats.
FormatToMimeType: dict[InputFormat, list[str]] = {
    InputFormat.DOCX: [
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
    ],
    InputFormat.PPTX: [
        "application/vnd.openxmlformats-officedocument.presentationml.template",
        "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    ],
    InputFormat.HTML: ["text/html", "application/xhtml+xml"],
    InputFormat.XML_JATS: ["application/xml"],
    InputFormat.IMAGE: [
        "image/png",
        "image/jpeg",
        "image/tiff",
        "image/gif",
        "image/bmp",
        "image/webp",
    ],
    InputFormat.PDF: ["application/pdf"],
    InputFormat.ASCIIDOC: ["text/asciidoc"],
    InputFormat.MD: ["text/markdown", "text/x-markdown"],
    InputFormat.CSV: ["text/csv"],
    InputFormat.XLSX: [
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    ],
    InputFormat.XML_USPTO: ["application/xml", "text/plain"],
    InputFormat.METS_GBS: ["application/mets+xml"],
    InputFormat.JSON_DOCLING: ["application/json"],
    # Both audio/* and video/* types are accepted for the AUDIO format.
    InputFormat.AUDIO: [
        "audio/x-wav",
        "audio/mpeg",
        "audio/wav",
        "audio/mp3",
        "audio/mp4",
        "audio/m4a",
        "audio/aac",
        "audio/ogg",
        "audio/flac",
        "audio/x-flac",
        "video/mp4",
        "video/avi",
        "video/x-msvideo",
        "video/quicktime",
    ],
    InputFormat.VTT: ["text/vtt"],
}
151
+
152
def _formats_by_mime_type(
    format_to_mimes: dict[InputFormat, list[str]],
) -> dict[str, list[InputFormat]]:
    # Invert a format -> MIME-types table into MIME-type -> formats.
    # Keys appear in order of first occurrence; each list holds the formats
    # that mention the MIME type, in the table's own order, without repeats.
    inverted: dict[str, list[InputFormat]] = {}
    for fmt, mimes in format_to_mimes.items():
        for mime in mimes:
            owners = inverted.setdefault(mime, [])
            if fmt not in owners:
                owners.append(fmt)
    return inverted


# Inverse of FormatToMimeType. A MIME type shared by several formats
# (e.g. "application/xml") maps to all of them.
MimeTypeToFormat: dict[str, list[InputFormat]] = _formats_by_mime_type(
    FormatToMimeType
)
157
+
158
+
159
# How an input document is supplied: "path" or "stream".
class DocInputType(str, Enum):
    PATH = "path"
    STREAM = "stream"
162
+
163
+
164
# Component categories; used as the `component_type` of an ErrorItem.
class DoclingComponentType(str, Enum):
    DOCUMENT_BACKEND = "document_backend"
    MODEL = "model"
    DOC_ASSEMBLER = "doc_assembler"
    USER_INPUT = "user_input"
    PIPELINE = "pipeline"
170
+
171
+
172
# Reason a VLM generation ended; stored in VlmPrediction.stop_reason.
class VlmStopReason(str, Enum):
    LENGTH = "length"  # max tokens reached
    STOP_SEQUENCE = "stop_sequence"  # Custom stopping criteria met
    END_OF_SEQUENCE = "end_of_sequence"  # Model generated end-of-text token
    UNSPECIFIED = "unspecified"  # Default value when no stop reason is given
177
+
178
+
179
# A single recorded error: a component category, a module name, and a message.
class ErrorItem(BaseModel):
    component_type: DoclingComponentType
    module_name: str
    error_message: str
183
+
184
+
185
# A labelled region with a bounding box, a confidence value, the text cells
# assigned to it, and optional nested child clusters.
class Cluster(BaseModel):
    id: int
    label: DocItemLabel
    bbox: BoundingBox
    confidence: float = 1.0
    cells: list[TextCell] = []
    children: list["Cluster"] = []  # Add child cluster support

    # On serialization, `confidence` is passed through round_pydantic_float
    # together with the serialization context and the CONFID_PREC key.
    @field_serializer("confidence")
    def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
        return round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC)
196
+
197
+
198
# Common base for page elements (TextElement, Table, FigureElement,
# ContainerElement): a label, an id, a page number and the backing cluster.
class BasePageElement(BaseModel):
    label: DocItemLabel
    id: int
    page_no: int
    cluster: Cluster
    # Optional here; TextElement overrides it as a required `str`.
    text: Optional[str] = None
204
+
205
+
206
# Layout prediction for a page: a list of clusters (empty by default).
class LayoutPrediction(BaseModel):
    clusters: list[Cluster] = []
208
+
209
+
210
# One generated token of a VLM prediction. The defaults ("" / -1 / -1) are
# placeholder values used when a field is not supplied.
class VlmPredictionToken(BaseModel):
    text: str = ""
    token: int = -1
    logprob: float = -1
214
+
215
+
216
# Result of a VLM generation: the generated text plus optional metadata.
class VlmPrediction(BaseModel):
    text: str = ""
    generated_tokens: list[VlmPredictionToken] = []
    # -1 is the default when no generation time is supplied.
    generation_time: float = -1
    num_tokens: Optional[int] = None
    stop_reason: VlmStopReason = VlmStopReason.UNSPECIFIED
    input_prompt: Optional[str] = None
223
+
224
+
225
# Adds no fields of its own; exists as a distinct type within PageElement.
class ContainerElement(
    BasePageElement
):  # Used for Form and Key-Value-Regions, only for typing.
    pass
229
+
230
+
231
# A table page element: a sequence of OTSL strings, row/column counts
# (0 by default) and the table's cells.
class Table(BasePageElement):
    otsl_seq: list[str]
    num_rows: int = 0
    num_cols: int = 0
    table_cells: list[TableCell]
236
+
237
+
238
# Table-structure prediction: a mapping from an int key to a Table.
# presumably keyed by element/cluster id — verify against the producer.
class TableStructurePrediction(BaseModel):
    table_map: dict[int, Table] = {}
240
+
241
+
242
# A page element whose `text` is required (the base class allows None).
class TextElement(BasePageElement):
    text: str
244
+
245
+
246
# A figure page element with optional annotations and classification data.
class FigureElement(BasePageElement):
    annotations: list[PictureDataType] = []
    provenance: Optional[str] = None
    predicted_class: Optional[str] = None
    confidence: Optional[float] = None

    # On serialization, a non-None `confidence` is passed through
    # round_pydantic_float with the serialization context and the CONFID_PREC
    # key; None is emitted unchanged.
    @field_serializer("confidence")
    def _serialize(
        self, value: Optional[float], info: FieldSerializationInfo
    ) -> Optional[float]:
        return (
            round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC)
            if value is not None
            else None
        )
261
+
262
+
263
# Figure classification prediction: a count and an int-keyed map of figures.
# presumably keyed by element/cluster id — verify against the producer.
class FigureClassificationPrediction(BaseModel):
    figure_count: int = 0
    figure_map: dict[int, FigureElement] = {}
266
+
267
+
268
# Equation prediction: a count and an int-keyed map of text elements.
# presumably keyed by element/cluster id — verify against the producer.
class EquationPrediction(BaseModel):
    equation_count: int = 0
    equation_map: dict[int, TextElement] = {}
271
+
272
+
273
# Container for the prediction results attached to one page. Every slot is
# None until it is set.
class PagePredictions(BaseModel):
    layout: Optional[LayoutPrediction] = None
    tablestructure: Optional[TableStructurePrediction] = None
    figures_classification: Optional[FigureClassificationPrediction] = None
    equations_prediction: Optional[EquationPrediction] = None
    vlm_response: Optional[VlmPrediction] = None
279
+
280
+
281
# Union of the concrete page element types; used by AssembledUnit's lists.
PageElement = Union[TextElement, Table, FigureElement, ContainerElement]
282
+
283
+
284
# Assembled page content, held as three lists of page elements.
# presumably `elements` is the full list and `body` / `headers` are subsets
# of it — verify against the page assembly code.
class AssembledUnit(BaseModel):
    elements: list[PageElement] = []
    body: list[PageElement] = []
    headers: list[PageElement] = []
288
+
289
+
290
# Pairs a document node item with a PIL image.
class ItemAndImageEnrichmentElement(BaseModel):
    # Allow field types that pydantic has no built-in validator for (PIL Image).
    model_config = ConfigDict(arbitrary_types_allowed=True)

    item: NodeItem
    image: Image
295
+
296
+
297
# One document page: its number, optional size and parsed content, the
# predictions attached to it, and the assembled result. Page images are
# rendered on demand through the private backend and cached per scale.
class Page(BaseModel):
    # Allow field types that pydantic has no built-in validator for.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    page_no: int
    # page_hash: Optional[str] = None
    size: Optional[Size] = None
    parsed_page: Optional[SegmentedPdfPage] = None
    predictions: PagePredictions = PagePredictions()
    assembled: Optional[AssembledUnit] = None

    _backend: Optional["PdfPageBackend"] = (
        None  # Internal PDF backend. By default it is cleared during assembling.
    )
    _default_image_scale: float = 1.0  # Default image scale for external usage.
    _image_cache: dict[
        float, Image
    ] = {}  # Cache of images in different scales. By default it is cleared during assembling.

    @property
    def cells(self) -> list[TextCell]:
        """Return text cells as a read-only view of parsed_page.textline_cells."""
        if self.parsed_page is not None:
            return self.parsed_page.textline_cells
        else:
            return []

    def get_image(
        self,
        scale: float = 1.0,
        max_size: Optional[int] = None,
        cropbox: Optional[BoundingBox] = None,
    ) -> Optional[Image]:
        """Return the page image at ``scale``, optionally size-limited and cropped.

        Full-page images are cached per effective scale. The same lookup and
        cropping rules apply whether or not a backend is attached, so a call
        that worked while the backend was alive returns the equivalent image
        from the cache after the backend has been cleared.

        Args:
            scale: Requested image scale; the effective scale is the cache key.
            max_size: If truthy, the effective scale is lowered to
                ``max_size / max(self.size.as_tuple())`` when that is smaller
                than ``scale``. Requires ``size`` when a backend is attached.
            cropbox: Region to return instead of the whole page.

        Returns:
            The requested image, or ``None`` when no backend is attached and
            the image cannot be served from the cache (no cached image at the
            effective scale, or a crop was requested but ``size`` is unknown).
        """
        backend = self._backend

        if max_size:
            # With a backend the size must be known (original invariant).
            # Without one, adjust the scale only if the size is available so
            # the lookup uses the same key the image was cached under.
            if backend is not None:
                assert self.size is not None
            if self.size is not None:
                scale = min(scale, max_size / max(self.size.as_tuple()))

        if scale not in self._image_cache:
            if backend is None:
                return None
            if cropbox is not None:
                # Render only the requested region; crops are not cached.
                return backend.get_page_image(scale=scale, cropbox=cropbox)
            self._image_cache[scale] = backend.get_page_image(scale=scale)

        page_im = self._image_cache[scale]
        if cropbox is None:
            return page_im

        if backend is None and self.size is None:
            # The crop needs the page height; without it, returning the whole
            # page would silently hand back the wrong region.
            return None
        assert self.size is not None
        return page_im.crop(
            cropbox.to_top_left_origin(page_height=self.size.height)
            .scaled(scale=scale)
            .as_tuple()
        )

    @property
    def image(self) -> Optional[Image]:
        # Full-page image at the page's default scale.
        return self.get_image(scale=self._default_image_scale)
356
+
357
+
358
+ ## OpenAI API Request / Response Models ##
359
+
360
+
361
# A chat message: a role and its text content.
class OpenAiChatMessage(BaseModel):
    role: str
    content: str
364
+
365
+
366
# One entry of an API response's `choices` list.
class OpenAiResponseChoice(BaseModel):
    index: int
    message: OpenAiChatMessage
    # No default: the key must be present, but its value may be null.
    finish_reason: Optional[str]
370
+
371
+
372
# Token counts reported in an API response's `usage` field.
class OpenAiResponseUsage(BaseModel):
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int
376
+
377
+
378
# Top-level response model for an OpenAI-style chat completion.
class OpenAiApiResponse(BaseModel):
    # Clear pydantic's protected namespaces so that the field named `model`
    # (which starts with the reserved "model" prefix) does not raise a warning.
    model_config = ConfigDict(
        protected_namespaces=(),
    )

    id: str
    model: Optional[str] = None  # returned by openai
    choices: list[OpenAiResponseChoice]
    created: int
    usage: OpenAiResponseUsage
388
+
389
+
390
# Type alias for score values. It is also called as a constructor
# (ScoreValue(x) is float(x)) by the confidence-score classes below.
ScoreValue = float
392
+
393
+
394
# Coarse quality labels derived from numeric scores. The thresholds are
# defined in PageConfidenceScores._score_to_grade.
class QualityGrade(str, Enum):
    POOR = "poor"
    FAIR = "fair"
    GOOD = "good"
    EXCELLENT = "excellent"
    UNSPECIFIED = "unspecified"
400
+
401
+
402
# Confidence scores for one page. Each score defaults to NaN, which this class
# treats as "not available": NaN scores are left out of the aggregates and map
# to QualityGrade.UNSPECIFIED.
class PageConfidenceScores(BaseModel):
    parse_score: ScoreValue = np.nan
    layout_score: ScoreValue = np.nan
    table_score: ScoreValue = np.nan
    ocr_score: ScoreValue = np.nan

    # Accept null/None or string "NaN" values on input and coerce to np.nan
    @field_validator(
        "parse_score", "layout_score", "table_score", "ocr_score", mode="before"
    )
    @classmethod
    def _coerce_none_or_nan_str(cls, v):
        """Map None and the strings "nan"/"null"/"none"/"" to NaN.

        String matching ignores case and surrounding whitespace. Any other
        value is returned unchanged for the regular float validation.
        """
        if v is None:
            return np.nan
        if isinstance(v, str) and v.strip().lower() in {"nan", "null", "none", ""}:
            return np.nan
        return v

    def _score_to_grade(self, score: ScoreValue) -> QualityGrade:
        """Translate a numeric score into a quality grade.

        NaN gives UNSPECIFIED. Otherwise: below 0.5 is POOR, below 0.8 is
        FAIR, below 0.9 is GOOD, and 0.9 or above is EXCELLENT.
        """
        # Stated explicitly instead of relying on every comparison with NaN
        # evaluating to False.
        if np.isnan(score):
            return QualityGrade.UNSPECIFIED
        if score < 0.5:
            return QualityGrade.POOR
        if score < 0.8:
            return QualityGrade.FAIR
        if score < 0.9:
            return QualityGrade.GOOD
        return QualityGrade.EXCELLENT

    def _known_scores(self) -> list[ScoreValue]:
        """Return the four scores that are not NaN, in ocr/table/layout/parse order."""
        candidates = (
            self.ocr_score,
            self.table_score,
            self.layout_score,
            self.parse_score,
        )
        return [score for score in candidates if not np.isnan(score)]

    # Grade of `mean_score`.
    @computed_field  # type: ignore
    @property
    def mean_grade(self) -> QualityGrade:
        return self._score_to_grade(self.mean_score)

    # Grade of `low_score`.
    @computed_field  # type: ignore
    @property
    def low_grade(self) -> QualityGrade:
        return self._score_to_grade(self.low_score)

    # Mean of the available (non-NaN) scores; NaN when none is available.
    @computed_field  # type: ignore
    @property
    def mean_score(self) -> ScoreValue:
        known = self._known_scores()
        if not known:
            # numpy would return NaN here too, but with a RuntimeWarning on
            # every access, and all-NaN is the default state of this model.
            return ScoreValue(np.nan)
        return ScoreValue(np.nanmean(known))

    # 5% quantile of the available (non-NaN) scores; NaN when none is available.
    @computed_field  # type: ignore
    @property
    def low_score(self) -> ScoreValue:
        known = self._known_scores()
        if not known:
            # Same reasoning as in mean_score: avoid numpy's all-NaN warning.
            return ScoreValue(np.nan)
        return ScoreValue(np.nanquantile(known, q=0.05))
470
+
471
+
472
# Document-level confidence report: the inherited score fields plus per-page
# scores. `mean_score` and `low_score` are overridden to aggregate over pages.
class ConfidenceReport(PageConfidenceScores):
    # Backed by a defaultdict, so reading a missing page number creates a
    # default (all-NaN) PageConfidenceScores entry for it.
    pages: dict[int, PageConfidenceScores] = Field(
        default_factory=lambda: defaultdict(PageConfidenceScores)
    )

    # Mean of the per-page mean scores, skipping NaN pages; NaN when there are
    # no pages or no page has a usable score.
    @computed_field  # type: ignore
    @property
    def mean_score(self) -> ScoreValue:
        page_means = [
            value
            for value in (c.mean_score for c in self.pages.values())
            if not np.isnan(value)
        ]
        if not page_means:
            # numpy would return NaN here too, but with a RuntimeWarning for
            # the empty / all-NaN input.
            return ScoreValue(np.nan)
        return ScoreValue(np.nanmean(page_means))

    # Mean (not a quantile) of the per-page low scores, skipping NaN pages;
    # NaN when there are no pages or no page has a usable score.
    @computed_field  # type: ignore
    @property
    def low_score(self) -> ScoreValue:
        page_lows = [
            value
            for value in (c.low_score for c in self.pages.values())
            if not np.isnan(value)
        ]
        if not page_lows:
            # Same reasoning as in mean_score: avoid numpy's warning.
            return ScoreValue(np.nan)
        return ScoreValue(np.nanmean(page_lows))