docling 2.69.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (138) hide show
  1. docling/__init__.py +0 -0
  2. docling/backend/__init__.py +0 -0
  3. docling/backend/abstract_backend.py +84 -0
  4. docling/backend/asciidoc_backend.py +443 -0
  5. docling/backend/csv_backend.py +125 -0
  6. docling/backend/docling_parse_backend.py +237 -0
  7. docling/backend/docling_parse_v2_backend.py +276 -0
  8. docling/backend/docling_parse_v4_backend.py +260 -0
  9. docling/backend/docx/__init__.py +0 -0
  10. docling/backend/docx/drawingml/utils.py +131 -0
  11. docling/backend/docx/latex/__init__.py +0 -0
  12. docling/backend/docx/latex/latex_dict.py +274 -0
  13. docling/backend/docx/latex/omml.py +459 -0
  14. docling/backend/html_backend.py +1502 -0
  15. docling/backend/image_backend.py +188 -0
  16. docling/backend/json/__init__.py +0 -0
  17. docling/backend/json/docling_json_backend.py +58 -0
  18. docling/backend/md_backend.py +618 -0
  19. docling/backend/mets_gbs_backend.py +399 -0
  20. docling/backend/msexcel_backend.py +686 -0
  21. docling/backend/mspowerpoint_backend.py +398 -0
  22. docling/backend/msword_backend.py +1663 -0
  23. docling/backend/noop_backend.py +51 -0
  24. docling/backend/pdf_backend.py +82 -0
  25. docling/backend/pypdfium2_backend.py +417 -0
  26. docling/backend/webvtt_backend.py +572 -0
  27. docling/backend/xml/__init__.py +0 -0
  28. docling/backend/xml/jats_backend.py +819 -0
  29. docling/backend/xml/uspto_backend.py +1905 -0
  30. docling/chunking/__init__.py +12 -0
  31. docling/cli/__init__.py +0 -0
  32. docling/cli/main.py +974 -0
  33. docling/cli/models.py +196 -0
  34. docling/cli/tools.py +17 -0
  35. docling/datamodel/__init__.py +0 -0
  36. docling/datamodel/accelerator_options.py +69 -0
  37. docling/datamodel/asr_model_specs.py +494 -0
  38. docling/datamodel/backend_options.py +102 -0
  39. docling/datamodel/base_models.py +493 -0
  40. docling/datamodel/document.py +699 -0
  41. docling/datamodel/extraction.py +39 -0
  42. docling/datamodel/layout_model_specs.py +91 -0
  43. docling/datamodel/pipeline_options.py +457 -0
  44. docling/datamodel/pipeline_options_asr_model.py +78 -0
  45. docling/datamodel/pipeline_options_vlm_model.py +136 -0
  46. docling/datamodel/settings.py +65 -0
  47. docling/datamodel/vlm_model_specs.py +365 -0
  48. docling/document_converter.py +559 -0
  49. docling/document_extractor.py +327 -0
  50. docling/exceptions.py +10 -0
  51. docling/experimental/__init__.py +5 -0
  52. docling/experimental/datamodel/__init__.py +1 -0
  53. docling/experimental/datamodel/table_crops_layout_options.py +13 -0
  54. docling/experimental/datamodel/threaded_layout_vlm_pipeline_options.py +45 -0
  55. docling/experimental/models/__init__.py +3 -0
  56. docling/experimental/models/table_crops_layout_model.py +114 -0
  57. docling/experimental/pipeline/__init__.py +1 -0
  58. docling/experimental/pipeline/threaded_layout_vlm_pipeline.py +439 -0
  59. docling/models/__init__.py +0 -0
  60. docling/models/base_layout_model.py +39 -0
  61. docling/models/base_model.py +230 -0
  62. docling/models/base_ocr_model.py +241 -0
  63. docling/models/base_table_model.py +45 -0
  64. docling/models/extraction/__init__.py +0 -0
  65. docling/models/extraction/nuextract_transformers_model.py +305 -0
  66. docling/models/factories/__init__.py +47 -0
  67. docling/models/factories/base_factory.py +122 -0
  68. docling/models/factories/layout_factory.py +7 -0
  69. docling/models/factories/ocr_factory.py +11 -0
  70. docling/models/factories/picture_description_factory.py +11 -0
  71. docling/models/factories/table_factory.py +7 -0
  72. docling/models/picture_description_base_model.py +149 -0
  73. docling/models/plugins/__init__.py +0 -0
  74. docling/models/plugins/defaults.py +60 -0
  75. docling/models/stages/__init__.py +0 -0
  76. docling/models/stages/code_formula/__init__.py +0 -0
  77. docling/models/stages/code_formula/code_formula_model.py +342 -0
  78. docling/models/stages/layout/__init__.py +0 -0
  79. docling/models/stages/layout/layout_model.py +249 -0
  80. docling/models/stages/ocr/__init__.py +0 -0
  81. docling/models/stages/ocr/auto_ocr_model.py +132 -0
  82. docling/models/stages/ocr/easyocr_model.py +200 -0
  83. docling/models/stages/ocr/ocr_mac_model.py +145 -0
  84. docling/models/stages/ocr/rapid_ocr_model.py +328 -0
  85. docling/models/stages/ocr/tesseract_ocr_cli_model.py +331 -0
  86. docling/models/stages/ocr/tesseract_ocr_model.py +262 -0
  87. docling/models/stages/page_assemble/__init__.py +0 -0
  88. docling/models/stages/page_assemble/page_assemble_model.py +156 -0
  89. docling/models/stages/page_preprocessing/__init__.py +0 -0
  90. docling/models/stages/page_preprocessing/page_preprocessing_model.py +145 -0
  91. docling/models/stages/picture_classifier/__init__.py +0 -0
  92. docling/models/stages/picture_classifier/document_picture_classifier.py +246 -0
  93. docling/models/stages/picture_description/__init__.py +0 -0
  94. docling/models/stages/picture_description/picture_description_api_model.py +66 -0
  95. docling/models/stages/picture_description/picture_description_vlm_model.py +123 -0
  96. docling/models/stages/reading_order/__init__.py +0 -0
  97. docling/models/stages/reading_order/readingorder_model.py +431 -0
  98. docling/models/stages/table_structure/__init__.py +0 -0
  99. docling/models/stages/table_structure/table_structure_model.py +305 -0
  100. docling/models/utils/__init__.py +0 -0
  101. docling/models/utils/generation_utils.py +157 -0
  102. docling/models/utils/hf_model_download.py +45 -0
  103. docling/models/vlm_pipeline_models/__init__.py +1 -0
  104. docling/models/vlm_pipeline_models/api_vlm_model.py +180 -0
  105. docling/models/vlm_pipeline_models/hf_transformers_model.py +391 -0
  106. docling/models/vlm_pipeline_models/mlx_model.py +325 -0
  107. docling/models/vlm_pipeline_models/vllm_model.py +344 -0
  108. docling/pipeline/__init__.py +0 -0
  109. docling/pipeline/asr_pipeline.py +431 -0
  110. docling/pipeline/base_extraction_pipeline.py +72 -0
  111. docling/pipeline/base_pipeline.py +326 -0
  112. docling/pipeline/extraction_vlm_pipeline.py +207 -0
  113. docling/pipeline/legacy_standard_pdf_pipeline.py +262 -0
  114. docling/pipeline/simple_pipeline.py +55 -0
  115. docling/pipeline/standard_pdf_pipeline.py +859 -0
  116. docling/pipeline/threaded_standard_pdf_pipeline.py +5 -0
  117. docling/pipeline/vlm_pipeline.py +416 -0
  118. docling/py.typed +1 -0
  119. docling/utils/__init__.py +0 -0
  120. docling/utils/accelerator_utils.py +97 -0
  121. docling/utils/api_image_request.py +205 -0
  122. docling/utils/deepseekocr_utils.py +388 -0
  123. docling/utils/export.py +146 -0
  124. docling/utils/glm_utils.py +361 -0
  125. docling/utils/layout_postprocessor.py +683 -0
  126. docling/utils/locks.py +3 -0
  127. docling/utils/model_downloader.py +168 -0
  128. docling/utils/ocr_utils.py +69 -0
  129. docling/utils/orientation.py +65 -0
  130. docling/utils/profiling.py +65 -0
  131. docling/utils/utils.py +65 -0
  132. docling/utils/visualization.py +85 -0
  133. docling-2.69.0.dist-info/METADATA +237 -0
  134. docling-2.69.0.dist-info/RECORD +138 -0
  135. docling-2.69.0.dist-info/WHEEL +5 -0
  136. docling-2.69.0.dist-info/entry_points.txt +6 -0
  137. docling-2.69.0.dist-info/licenses/LICENSE +21 -0
  138. docling-2.69.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,262 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from pathlib import Path
5
+ from typing import Iterable, Optional, Type
6
+
7
+ from docling_core.types.doc import BoundingBox, CoordOrigin
8
+ from docling_core.types.doc.page import TextCell
9
+
10
+ from docling.datamodel.accelerator_options import AcceleratorOptions
11
+ from docling.datamodel.base_models import Page
12
+ from docling.datamodel.document import ConversionResult
13
+ from docling.datamodel.pipeline_options import (
14
+ OcrOptions,
15
+ TesseractOcrOptions,
16
+ )
17
+ from docling.datamodel.settings import settings
18
+ from docling.models.base_ocr_model import BaseOcrModel
19
+ from docling.utils.ocr_utils import (
20
+ map_tesseract_script,
21
+ parse_tesseract_orientation,
22
+ tesseract_box_to_bounding_rectangle,
23
+ )
24
+ from docling.utils.profiling import TimeRecorder
25
+
26
+ _log = logging.getLogger(__name__)
27
+
28
+
29
class TesseractOcrModel(BaseOcrModel):
    """OCR page model that drives Tesseract through the ``tesserocr`` bindings.

    Three kinds of ``PyTessBaseAPI`` readers are kept on the instance:

    * ``reader`` - the main reader, configured from ``options.lang``;
    * ``osd_reader`` - a reader in ``PSM.OSD_ONLY`` mode used to detect the
      orientation (and script) of every OCR rectangle;
    * ``script_readers`` - a cache of per-script readers that is filled lazily
      when ``"auto"`` is part of ``options.lang``.
    """

    def __init__(
        self,
        enabled: bool,
        artifacts_path: Optional[Path],
        options: TesseractOcrOptions,
        accelerator_options: AcceleratorOptions,
    ):
        """Create the Tesseract readers if the model is enabled.

        Raises:
            ImportError: If ``tesserocr`` cannot be imported, if its Tesseract
                version cannot be queried, or if no language models are found.
        """
        super().__init__(
            enabled=enabled,
            artifacts_path=artifacts_path,
            options=options,
            accelerator_options=accelerator_options,
        )
        self.options: TesseractOcrOptions
        self._is_auto: bool = "auto" in self.options.lang
        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
        self.reader = None
        # Defined unconditionally so that __del__ can always inspect it.
        self.osd_reader = None
        self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}

        if not self.enabled:
            return

        install_errmsg = (
            "tesserocr is not correctly installed. "
            "Please install it via `pip install tesserocr` to use this OCR engine. "
            "Note that tesserocr might have to be manually compiled for working with "
            "your Tesseract installation. The Docling documentation provides examples for it. "
            "Alternatively, Docling has support for other OCR engines. See the documentation: "
            "https://docling-project.github.io/docling/installation/"
        )
        missing_langs_errmsg = (
            "tesserocr is not correctly configured. No language models have been detected. "
            "Please ensure that the TESSDATA_PREFIX envvar points to tesseract languages dir. "
            "You can find more information how to setup other OCR engines in Docling "
            "documentation: "
            "https://docling-project.github.io/docling/installation/"
        )

        try:
            import tesserocr
        except ImportError as err:
            raise ImportError(install_errmsg) from err
        try:
            tesseract_version = tesserocr.tesseract_version()
        except Exception as err:
            # A broken native installation can fail in arbitrary ways here;
            # report all of them as an installation problem.
            raise ImportError(install_errmsg) from err

        _, self._tesserocr_languages = tesserocr.get_languages()
        if not self._tesserocr_languages:
            raise ImportError(missing_langs_errmsg)

        # Initialize the tesseractAPI
        _log.debug("Initializing TesserOCR: %s", tesseract_version)
        lang = "+".join(self.options.lang)

        # Some installations expose script models as "script/<Name>".
        has_script_dir = any(
            installed.startswith("script/") for installed in self._tesserocr_languages
        )
        self.script_prefix = "script/" if has_script_dir else ""

        tesserocr_kwargs = {
            "init": True,
            "oem": tesserocr.OEM.DEFAULT,
        }
        if self.options.path is not None:
            tesserocr_kwargs["path"] = self.options.path

        # Set main OCR reader with configurable PSM
        main_psm = (
            self.options.psm if self.options.psm is not None else tesserocr.PSM.AUTO
        )
        # NOTE(review): `_is_auto` is true whenever "auto" appears in the list,
        # but the language is only left unset when the list is exactly
        # ["auto"]; a mixed list is passed to Tesseract as e.g. "auto+eng".
        if lang == "auto":
            self.reader = tesserocr.PyTessBaseAPI(psm=main_psm, **tesserocr_kwargs)
        else:
            self.reader = tesserocr.PyTessBaseAPI(
                lang=lang,
                psm=main_psm,
                **tesserocr_kwargs,
            )
        # OSD reader must use PSM.OSD_ONLY for orientation detection
        self.osd_reader = tesserocr.PyTessBaseAPI(
            lang="osd", psm=tesserocr.PSM.OSD_ONLY, **tesserocr_kwargs
        )
        self.reader_RIL = tesserocr.RIL

    def __del__(self):
        """Finalize every Tesseract reader owned by this instance.

        ``getattr`` is used because ``__del__`` also runs on instances whose
        ``__init__`` failed before the attributes were assigned.
        """
        readers = [
            getattr(self, "reader", None),
            getattr(self, "osd_reader", None),
        ]
        readers.extend(getattr(self, "script_readers", {}).values())
        for reader in readers:
            if reader is not None:
                # Finalize the tesseractAPI
                reader.End()

    def _get_script_reader(self, script: str, lang: str):
        """Return the cached reader for *script*, creating it on first use."""
        if script not in self.script_readers:
            import tesserocr

            psm = (
                self.options.psm
                if self.options.psm is not None
                else tesserocr.PSM.AUTO
            )
            self.script_readers[script] = tesserocr.PyTessBaseAPI(
                path=self.reader.GetDatapath(),
                lang=lang,
                psm=psm,
                init=True,
                oem=tesserocr.OEM.DEFAULT,
            )
        return self.script_readers[script]

    def _read_text_cells(self, reader, image, ocr_rect, orientation):
        """Run *reader* on *image* and return one ``TextCell`` per text line.

        ``ocr_rect`` and ``orientation`` are forwarded to
        ``tesseract_box_to_bounding_rectangle`` to place each detected box.
        """
        reader.SetImage(image)
        boxes = reader.GetComponentImages(self.reader_RIL.TEXTLINE, True)

        cells = []
        for ix, (_, box, _, _) in enumerate(boxes):
            # Restrict recognition to the box of this text line.
            reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])

            # Extract text within the bounding box
            text = reader.GetUTF8Text().strip()
            # NOTE(review): the value is stored unchanged; confirm that the
            # scale returned by MeanTextConf matches what TextCell expects.
            confidence = reader.MeanTextConf()

            left, top = box["x"], box["y"]
            bbox = BoundingBox(
                l=left,
                t=top,
                r=left + box["w"],
                b=top + box["h"],
                coord_origin=CoordOrigin.TOPLEFT,
            )
            rect = tesseract_box_to_bounding_rectangle(
                bbox,
                original_offset=ocr_rect,
                scale=self.scale,
                orientation=orientation,
                im_size=image.size,
            )
            cells.append(
                TextCell(
                    index=ix,
                    text=text,
                    orig=text,
                    from_ocr=True,
                    confidence=confidence,
                    rect=rect,
                )
            )
        return cells

    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        """Run OCR on every valid page of *page_batch* and yield the pages.

        Pages are passed through untouched when the model is disabled or the
        page backend is not valid.
        """
        if not self.enabled:
            yield from page_batch
            return

        for page_i, page in enumerate(page_batch):
            assert page._backend is not None
            if not page._backend.is_valid():
                yield page
                continue

            with TimeRecorder(conv_res, "ocr"):
                assert self.reader is not None
                assert self.osd_reader is not None
                assert self._tesserocr_languages is not None

                ocr_rects = self.get_ocr_rects(page)

                all_ocr_cells = []
                for ocr_rect_i, ocr_rect in enumerate(ocr_rects):
                    # Skip zero area boxes
                    if ocr_rect.area() == 0:
                        continue
                    high_res_image = page._backend.get_page_image(
                        scale=self.scale, cropbox=ocr_rect
                    )

                    local_reader = self.reader
                    doc_orientation = 0

                    self.osd_reader.SetImage(high_res_image)
                    osd = self.osd_reader.DetectOrientationScript()

                    # No text, or Orientation and Script detection failure
                    if osd is None:
                        _log.error(
                            "OSD failed for doc (doc %s, page: %s, "
                            "OCR rectangle: %s)",
                            conv_res.input.file,
                            page_i,
                            ocr_rect_i,
                        )
                        # Skipping if OSD fail when in auto mode, otherwise proceed
                        # to OCR in the hope OCR will succeed while OSD failed
                        if self._is_auto:
                            continue
                    else:
                        doc_orientation = parse_tesseract_orientation(
                            osd["orient_deg"]
                        )
                        if doc_orientation != 0:
                            high_res_image = high_res_image.rotate(
                                -doc_orientation, expand=True
                            )
                        if self._is_auto:
                            script = map_tesseract_script(osd["script_name"])
                            lang = f"{self.script_prefix}{script}"

                            # Check if the detected language is present in the system
                            if lang in self._tesserocr_languages:
                                local_reader = self._get_script_reader(script, lang)
                            else:
                                msg = f"Tesseract detected the script '{script}' and language '{lang}'."
                                msg += " However this language is not installed in your system and will be ignored."
                                _log.warning(msg)

                    all_ocr_cells.extend(
                        self._read_text_cells(
                            local_reader, high_res_image, ocr_rect, doc_orientation
                        )
                    )

                # Post-process the cells
                self.post_process_cells(all_ocr_cells, page)

            # DEBUG code:
            if settings.debug.visualize_ocr:
                self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)

            yield page

    @classmethod
    def get_options_type(cls) -> Type[OcrOptions]:
        """Return the options class this model is configured with."""
        return TesseractOcrOptions
File without changes
@@ -0,0 +1,156 @@
1
+ import logging
2
+ import re
3
+ from collections.abc import Iterable
4
+ from typing import List
5
+
6
+ import numpy as np
7
+ from pydantic import BaseModel
8
+
9
+ from docling.datamodel.base_models import (
10
+ AssembledUnit,
11
+ ContainerElement,
12
+ FigureElement,
13
+ Page,
14
+ PageElement,
15
+ Table,
16
+ TextElement,
17
+ )
18
+ from docling.datamodel.document import ConversionResult
19
+ from docling.models.base_model import BasePageModel
20
+ from docling.models.stages.layout.layout_model import LayoutModel
21
+ from docling.utils.profiling import TimeRecorder
22
+
23
+ _log = logging.getLogger(__name__)
24
+
25
+
26
# Options container for PageAssembleModel; it currently declares no fields.
class PageAssembleOptions(BaseModel):
    pass
28
+
29
+
30
class PageAssembleModel(BasePageModel):
    """Turn the layout clusters of a page into an ``AssembledUnit``.

    Each cluster becomes a text, table, figure or container element; the
    elements are collected in ``page.assembled``.
    """

    def __init__(self, options: PageAssembleOptions):
        self.options = options

    def sanitize_text(self, lines):
        """Join text lines into one string, undoing end-of-line hyphenation.

        A trailing ``-`` is dropped when the word before it and the first word
        of the next line are both alphanumeric; lines that do not end with
        ``-`` are joined with a single space. The input sequence is not
        modified.

        Args:
            lines: The text lines of one element, in reading order.

        Returns:
            The joined text. For two or more lines the text is additionally
            normalized (quotes, fraction slash, bullet) and stripped.
        """
        # NOTE(review): zero or one line is returned as-is, i.e. without the
        # character normalization applied below - confirm this is intended.
        if len(lines) <= 1:
            return " ".join(lines)

        parts = []
        for prev_line, line in zip(lines, lines[1:]):
            if not prev_line.endswith("-"):
                parts.append(prev_line + " ")
                continue

            prev_words = re.findall(r"\b[\w]+\b", prev_line)
            line_words = re.findall(r"\b[\w]+\b", line)
            is_split_word = (
                prev_words
                and line_words
                and prev_words[-1].isalnum()
                and line_words[0].isalnum()
            )
            # Otherwise keep the dash and join without a separating space.
            parts.append(prev_line[:-1] if is_split_word else prev_line)
        parts.append(lines[-1])

        sanitized_text = "".join(parts)

        # Text normalization
        replacements = (
            ("⁄", "/"),  # noqa: RUF001
            ("’", "'"),  # noqa: RUF001
            ("‘", "'"),  # noqa: RUF001
            ("“", '"'),
            ("”", '"'),
            ("•", "·"),
        )
        for old, new in replacements:
            sanitized_text = sanitized_text.replace(old, new)

        return sanitized_text.strip()  # Strip any leading or trailing whitespace

    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        """Assemble every valid page of *page_batch* and yield the pages.

        Pages whose backend is not valid are yielded unchanged.
        """
        for page in page_batch:
            assert page._backend is not None
            if not page._backend.is_valid():
                yield page
                continue

            with TimeRecorder(conv_res, "page_assemble"):
                assert page.predictions.layout is not None

                # assembles some JSON output page by page.

                elements: List[PageElement] = []
                headers: List[PageElement] = []
                body: List[PageElement] = []

                for cluster in page.predictions.layout.clusters:
                    label = cluster.label

                    if label in LayoutModel.TEXT_ELEM_LABELS:
                        textlines = [
                            cell.text.replace("\x02", "-").strip()
                            for cell in cluster.cells
                            if len(cell.text.strip()) > 0
                        ]
                        text_el = TextElement(
                            label=label,
                            id=cluster.id,
                            text=self.sanitize_text(textlines),
                            page_no=page.page_no,
                            cluster=cluster,
                        )
                        elements.append(text_el)

                        # Page headers are kept apart from the body.
                        if label in LayoutModel.PAGE_HEADER_LABELS:
                            headers.append(text_el)
                        else:
                            body.append(text_el)

                    elif label in LayoutModel.TABLE_LABELS:
                        tbl = None
                        if page.predictions.tablestructure:
                            tbl = page.predictions.tablestructure.table_map.get(
                                cluster.id, None
                            )
                        if not tbl:
                            # fallback: add table without structure, if it isn't present
                            tbl = Table(
                                label=label,
                                id=cluster.id,
                                text="",
                                otsl_seq=[],
                                table_cells=[],
                                cluster=cluster,
                                page_no=page.page_no,
                            )

                        elements.append(tbl)
                        body.append(tbl)

                    elif label == LayoutModel.FIGURE_LABEL:
                        fig = None
                        if page.predictions.figures_classification:
                            fig = (
                                page.predictions.figures_classification.figure_map.get(
                                    cluster.id, None
                                )
                            )
                        if not fig:
                            # fallback: add figure without classification, if it isn't present
                            fig = FigureElement(
                                label=label,
                                id=cluster.id,
                                text="",
                                data=None,
                                cluster=cluster,
                                page_no=page.page_no,
                            )
                        elements.append(fig)
                        body.append(fig)

                    elif label in LayoutModel.CONTAINER_LABELS:
                        container_el = ContainerElement(
                            label=label,
                            id=cluster.id,
                            page_no=page.page_no,
                            cluster=cluster,
                        )
                        elements.append(container_el)
                        body.append(container_el)

                page.assembled = AssembledUnit(
                    elements=elements, headers=headers, body=body
                )

            yield page
File without changes
@@ -0,0 +1,145 @@
1
+ import re
2
+ import warnings
3
+ from collections.abc import Iterable
4
+ from pathlib import Path
5
+ from typing import Literal, Optional
6
+
7
+ import numpy as np
8
+ from PIL import ImageDraw
9
+ from pydantic import BaseModel
10
+
11
+ from docling.datamodel.base_models import Page
12
+ from docling.datamodel.document import ConversionResult
13
+ from docling.datamodel.settings import settings
14
+ from docling.models.base_model import BasePageModel
15
+ from docling.utils.profiling import TimeRecorder
16
+
17
+
18
# Options for PagePreprocessingModel.
class PagePreprocessingOptions(BaseModel):
    # Extra scale at which the page image is rendered and cached; None keeps
    # only the default scale of 1.0.
    images_scale: Optional[float]
    # Skip text cell extraction for VLM-only processing
    skip_cell_extraction: bool = False
23
+
24
+
25
class PagePreprocessingModel(BasePageModel):
    """Render page images and extract the parsed text cells of each page.

    While parsing, the text of every cell is rated with
    ``rate_text_quality`` and a per-page ``parse_score`` is stored in
    ``conv_res.confidence``.
    """

    def __init__(self, options: PagePreprocessingOptions):
        self.options = options

        # Pre-compiled regex patterns for efficiency
        self.GLYPH_RE = re.compile(r"GLYPH<[0-9A-Fa-f]+>")
        self.SLASH_G_RE = re.compile(r"(?:/G\d+){2,}")
        self.FRAG_RE = re.compile(r"\b[A-Za-z](?:/[a-z]{1,3}\.[a-z]{1,3}){2,}\b")
        self.SLASH_NUMBER_GARBAGE_RE = re.compile(
            r"(?:/\w+\s*){2,}"
        )  # Two or more "/token " sequences

    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        """Preprocess every valid page of *page_batch* and yield the pages.

        Pages whose backend is not valid are yielded unchanged.
        """
        for page in page_batch:
            assert page._backend is not None
            if not page._backend.is_valid():
                yield page
                continue

            with TimeRecorder(conv_res, "page_parse"):
                page = self._populate_page_images(page)
                if not self.options.skip_cell_extraction:
                    page = self._parse_page_cells(conv_res, page)
            yield page

    # Generate the page image and store it in the page object
    def _populate_page_images(self, page: Page) -> Page:
        """Request the page image at the default and the configured scale."""
        # default scale
        page.get_image(
            scale=1.0
        )  # puts the page image on the image cache at default scale

        # user requested scales
        images_scale = self.options.images_scale
        if images_scale is not None:
            page._default_image_scale = images_scale
            page.get_image(
                scale=images_scale
            )  # this will trigger storing the image in the internal cache

        return page

    # Extract and populate the page cells and store it in the page object
    def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
        """Parse the page, score its text cells and optionally dump a debug image."""
        assert page._backend is not None

        page.parsed_page = page._backend.get_segmented_page()
        assert page.parsed_page is not None

        # Rate the text quality from the PDF parser, and aggregate on page
        text_scores = [self.rate_text_quality(c.text) for c in page.cells]

        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore", "Mean of empty slice", RuntimeWarning, "numpy"
            )
            # To emphasise problems in the parse_score, we take the 10%
            # percentile score of all text cells.
            conv_res.confidence.pages[page.page_no].parse_score = float(
                np.nanquantile(text_scores, q=0.10)
            )

        # DEBUG code:
        def draw_text_boxes(image, cells, show: bool = False):
            # Draw on a copy so the image passed in is left untouched, and
            # show/save that annotated copy (not the original) so the
            # rectangles actually end up in the output.
            annotated = image.copy()
            draw = ImageDraw.Draw(annotated)
            for c in cells:
                box = c.to_bounding_box()
                draw.rectangle([(box.l, box.t), (box.r, box.b)], outline="red")

            if show:
                annotated.show()
            else:
                out_path: Path = (
                    Path(settings.debug.debug_output_path)
                    / f"debug_{conv_res.input.file.stem}"
                )
                out_path.mkdir(parents=True, exist_ok=True)

                out_file = out_path / f"cells_page_{page.page_no:05}.png"
                annotated.save(str(out_file), format="png")

        if settings.debug.visualize_cells:
            draw_text_boxes(page.get_image(scale=1.0), page.cells)

        return page

    def rate_text_quality(self, text: str) -> float:
        """Rate how clean *text* looks, from 0.0 (garbage) to 1.0 (clean).

        Returns 0.0 immediately when the text contains a blacklisted
        character, a ``GLYPH<..>`` marker, a run of ``/G<number>`` tokens, or
        starts with two or more ``/token`` sequences. Otherwise the score is
        1.0 minus 0.1 per fragmented-word match, applied only when there are
        at least three such matches, and never below 0.0.
        """
        # Hard errors: if any of these patterns are found, return 0.0 immediately.
        blacklist_chars = ["�"]
        if (
            any(c in text for c in blacklist_chars)
            or self.GLYPH_RE.search(text)
            or self.SLASH_G_RE.search(text)
            # `match` anchors at the start: text that begins with the
            # slash-token pattern is treated as garbage.
            or self.SLASH_NUMBER_GARBAGE_RE.match(text)
        ):
            return 0.0

        penalty = 0.0

        # Apply a penalty only if the fragmented words pattern occurs at least three times.
        frag_matches = self.FRAG_RE.findall(text)
        if len(frag_matches) >= 3:
            penalty += 0.1 * len(frag_matches)

        # Additional heuristic: if the average token length is below 2, add a penalty.
        # tokens = text.split()
        # if tokens and (sum(map(len, tokens)) / len(tokens)) < 2:
        #     penalty += 0.2

        return max(1.0 - penalty, 0.0)
File without changes