docling 2.26.0__py3-none-any.whl → 2.28.0__py3-none-any.whl

This diff compares the contents of two publicly available versions of a package, each released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (52)
  1. docling/backend/asciidoc_backend.py +1 -1
  2. docling/backend/csv_backend.py +1 -1
  3. docling/backend/docling_parse_backend.py +21 -13
  4. docling/backend/docling_parse_v2_backend.py +20 -12
  5. docling/backend/docling_parse_v4_backend.py +192 -0
  6. docling/backend/docx/__init__.py +0 -0
  7. docling/backend/docx/latex/__init__.py +0 -0
  8. docling/backend/docx/latex/latex_dict.py +271 -0
  9. docling/backend/docx/latex/omml.py +453 -0
  10. docling/backend/html_backend.py +7 -7
  11. docling/backend/md_backend.py +1 -1
  12. docling/backend/msexcel_backend.py +2 -45
  13. docling/backend/mspowerpoint_backend.py +19 -1
  14. docling/backend/msword_backend.py +68 -3
  15. docling/backend/pdf_backend.py +7 -2
  16. docling/backend/pypdfium2_backend.py +52 -30
  17. docling/backend/xml/uspto_backend.py +1 -1
  18. docling/cli/main.py +135 -53
  19. docling/cli/models.py +1 -1
  20. docling/datamodel/base_models.py +8 -10
  21. docling/datamodel/pipeline_options.py +54 -32
  22. docling/document_converter.py +5 -5
  23. docling/models/base_model.py +9 -1
  24. docling/models/base_ocr_model.py +27 -16
  25. docling/models/easyocr_model.py +28 -13
  26. docling/models/factories/__init__.py +27 -0
  27. docling/models/factories/base_factory.py +122 -0
  28. docling/models/factories/ocr_factory.py +11 -0
  29. docling/models/factories/picture_description_factory.py +11 -0
  30. docling/models/hf_mlx_model.py +137 -0
  31. docling/models/ocr_mac_model.py +39 -11
  32. docling/models/page_preprocessing_model.py +4 -0
  33. docling/models/picture_description_api_model.py +20 -3
  34. docling/models/picture_description_base_model.py +19 -3
  35. docling/models/picture_description_vlm_model.py +14 -2
  36. docling/models/plugins/__init__.py +0 -0
  37. docling/models/plugins/defaults.py +28 -0
  38. docling/models/rapid_ocr_model.py +34 -13
  39. docling/models/table_structure_model.py +13 -4
  40. docling/models/tesseract_ocr_cli_model.py +40 -15
  41. docling/models/tesseract_ocr_model.py +37 -12
  42. docling/pipeline/standard_pdf_pipeline.py +25 -78
  43. docling/pipeline/vlm_pipeline.py +78 -398
  44. docling/utils/export.py +8 -6
  45. docling/utils/layout_postprocessor.py +26 -23
  46. docling/utils/visualization.py +1 -1
  47. {docling-2.26.0.dist-info → docling-2.28.0.dist-info}/METADATA +47 -23
  48. docling-2.28.0.dist-info/RECORD +84 -0
  49. {docling-2.26.0.dist-info → docling-2.28.0.dist-info}/entry_points.txt +3 -0
  50. docling-2.26.0.dist-info/RECORD +0 -72
  51. {docling-2.26.0.dist-info → docling-2.28.0.dist-info}/LICENSE +0 -0
  52. {docling-2.26.0.dist-info → docling-2.28.0.dist-info}/WHEEL +0 -0
@@ -1,11 +1,17 @@
1
1
  import logging
2
- from typing import Iterable
2
+ from pathlib import Path
3
+ from typing import Iterable, Optional, Type
3
4
 
4
5
  from docling_core.types.doc import BoundingBox, CoordOrigin
6
+ from docling_core.types.doc.page import BoundingRectangle, TextCell
5
7
 
6
- from docling.datamodel.base_models import Cell, OcrCell, Page
8
+ from docling.datamodel.base_models import Page
7
9
  from docling.datamodel.document import ConversionResult
8
- from docling.datamodel.pipeline_options import TesseractOcrOptions
10
+ from docling.datamodel.pipeline_options import (
11
+ AcceleratorOptions,
12
+ OcrOptions,
13
+ TesseractOcrOptions,
14
+ )
9
15
  from docling.datamodel.settings import settings
10
16
  from docling.models.base_ocr_model import BaseOcrModel
11
17
  from docling.utils.ocr_utils import map_tesseract_script
@@ -15,8 +21,19 @@ _log = logging.getLogger(__name__)
15
21
 
16
22
 
17
23
  class TesseractOcrModel(BaseOcrModel):
18
- def __init__(self, enabled: bool, options: TesseractOcrOptions):
19
- super().__init__(enabled=enabled, options=options)
24
+ def __init__(
25
+ self,
26
+ enabled: bool,
27
+ artifacts_path: Optional[Path],
28
+ options: TesseractOcrOptions,
29
+ accelerator_options: AcceleratorOptions,
30
+ ):
31
+ super().__init__(
32
+ enabled=enabled,
33
+ artifacts_path=artifacts_path,
34
+ options=options,
35
+ accelerator_options=accelerator_options,
36
+ )
20
37
  self.options: TesseractOcrOptions
21
38
 
22
39
  self.scale = 3 # multiplier for 72 dpi == 216 dpi.
@@ -31,14 +48,14 @@ class TesseractOcrModel(BaseOcrModel):
31
48
  "Note that tesserocr might have to be manually compiled for working with "
32
49
  "your Tesseract installation. The Docling documentation provides examples for it. "
33
50
  "Alternatively, Docling has support for other OCR engines. See the documentation: "
34
- "https://ds4sd.github.io/docling/installation/"
51
+ "https://docling-project.github.io/docling/installation/"
35
52
  )
36
53
  missing_langs_errmsg = (
37
54
  "tesserocr is not correctly configured. No language models have been detected. "
38
55
  "Please ensure that the TESSDATA_PREFIX envvar points to tesseract languages dir. "
39
56
  "You can find more information how to setup other OCR engines in Docling "
40
57
  "documentation: "
41
- "https://ds4sd.github.io/docling/installation/"
58
+ "https://docling-project.github.io/docling/installation/"
42
59
  )
43
60
 
44
61
  try:
@@ -173,13 +190,17 @@ class TesseractOcrModel(BaseOcrModel):
173
190
  top = (box["y"] + box["h"]) / self.scale
174
191
 
175
192
  cells.append(
176
- OcrCell(
177
- id=ix,
193
+ TextCell(
194
+ index=ix,
178
195
  text=text,
196
+ orig=text,
197
+ from_ocr=True,
179
198
  confidence=confidence,
180
- bbox=BoundingBox.from_tuple(
181
- coord=(left, top, right, bottom),
182
- origin=CoordOrigin.TOPLEFT,
199
+ rect=BoundingRectangle.from_bounding_box(
200
+ BoundingBox.from_tuple(
201
+ coord=(left, top, right, bottom),
202
+ origin=CoordOrigin.TOPLEFT,
203
+ ),
183
204
  ),
184
205
  )
185
206
  )
@@ -195,3 +216,7 @@ class TesseractOcrModel(BaseOcrModel):
195
216
  self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
196
217
 
197
218
  yield page
219
+
220
+ @classmethod
221
+ def get_options_type(cls) -> Type[OcrOptions]:
222
+ return TesseractOcrOptions
@@ -10,16 +10,7 @@ from docling.backend.abstract_backend import AbstractDocumentBackend
10
10
  from docling.backend.pdf_backend import PdfDocumentBackend
11
11
  from docling.datamodel.base_models import AssembledUnit, Page
12
12
  from docling.datamodel.document import ConversionResult
13
- from docling.datamodel.pipeline_options import (
14
- EasyOcrOptions,
15
- OcrMacOptions,
16
- PdfPipelineOptions,
17
- PictureDescriptionApiOptions,
18
- PictureDescriptionVlmOptions,
19
- RapidOcrOptions,
20
- TesseractCliOcrOptions,
21
- TesseractOcrOptions,
22
- )
13
+ from docling.datamodel.pipeline_options import PdfPipelineOptions
23
14
  from docling.datamodel.settings import settings
24
15
  from docling.models.base_ocr_model import BaseOcrModel
25
16
  from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
@@ -27,22 +18,16 @@ from docling.models.document_picture_classifier import (
27
18
  DocumentPictureClassifier,
28
19
  DocumentPictureClassifierOptions,
29
20
  )
30
- from docling.models.easyocr_model import EasyOcrModel
21
+ from docling.models.factories import get_ocr_factory, get_picture_description_factory
31
22
  from docling.models.layout_model import LayoutModel
32
- from docling.models.ocr_mac_model import OcrMacModel
33
23
  from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
34
24
  from docling.models.page_preprocessing_model import (
35
25
  PagePreprocessingModel,
36
26
  PagePreprocessingOptions,
37
27
  )
38
- from docling.models.picture_description_api_model import PictureDescriptionApiModel
39
28
  from docling.models.picture_description_base_model import PictureDescriptionBaseModel
40
- from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
41
- from docling.models.rapid_ocr_model import RapidOcrModel
42
29
  from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
43
30
  from docling.models.table_structure_model import TableStructureModel
44
- from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
45
- from docling.models.tesseract_ocr_model import TesseractOcrModel
46
31
  from docling.pipeline.base_pipeline import PaginatedPipeline
47
32
  from docling.utils.model_downloader import download_models
48
33
  from docling.utils.profiling import ProfilingScope, TimeRecorder
@@ -78,16 +63,14 @@ class StandardPdfPipeline(PaginatedPipeline):
78
63
 
79
64
  self.glm_model = ReadingOrderModel(options=ReadingOrderOptions())
80
65
 
81
- if (ocr_model := self.get_ocr_model(artifacts_path=artifacts_path)) is None:
82
- raise RuntimeError(
83
- f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
84
- )
66
+ ocr_model = self.get_ocr_model(artifacts_path=artifacts_path)
85
67
 
86
68
  self.build_pipe = [
87
69
  # Pre-processing
88
70
  PagePreprocessingModel(
89
71
  options=PagePreprocessingOptions(
90
- images_scale=pipeline_options.images_scale
72
+ images_scale=pipeline_options.images_scale,
73
+ create_parsed_page=pipeline_options.generate_parsed_pages,
91
74
  )
92
75
  ),
93
76
  # OCR
@@ -163,66 +146,30 @@ class StandardPdfPipeline(PaginatedPipeline):
163
146
  output_dir = download_models(output_dir=local_dir, force=force, progress=False)
164
147
  return output_dir
165
148
 
166
- def get_ocr_model(
167
- self, artifacts_path: Optional[Path] = None
168
- ) -> Optional[BaseOcrModel]:
169
- if isinstance(self.pipeline_options.ocr_options, EasyOcrOptions):
170
- return EasyOcrModel(
171
- enabled=self.pipeline_options.do_ocr,
172
- artifacts_path=artifacts_path,
173
- options=self.pipeline_options.ocr_options,
174
- accelerator_options=self.pipeline_options.accelerator_options,
175
- )
176
- elif isinstance(self.pipeline_options.ocr_options, TesseractCliOcrOptions):
177
- return TesseractOcrCliModel(
178
- enabled=self.pipeline_options.do_ocr,
179
- options=self.pipeline_options.ocr_options,
180
- )
181
- elif isinstance(self.pipeline_options.ocr_options, TesseractOcrOptions):
182
- return TesseractOcrModel(
183
- enabled=self.pipeline_options.do_ocr,
184
- options=self.pipeline_options.ocr_options,
185
- )
186
- elif isinstance(self.pipeline_options.ocr_options, RapidOcrOptions):
187
- return RapidOcrModel(
188
- enabled=self.pipeline_options.do_ocr,
189
- options=self.pipeline_options.ocr_options,
190
- accelerator_options=self.pipeline_options.accelerator_options,
191
- )
192
- elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions):
193
- if "darwin" != sys.platform:
194
- raise RuntimeError(
195
- f"The specified OCR type is only supported on Mac: {self.pipeline_options.ocr_options.kind}."
196
- )
197
- return OcrMacModel(
198
- enabled=self.pipeline_options.do_ocr,
199
- options=self.pipeline_options.ocr_options,
200
- )
201
- return None
149
+ def get_ocr_model(self, artifacts_path: Optional[Path] = None) -> BaseOcrModel:
150
+ factory = get_ocr_factory(
151
+ allow_external_plugins=self.pipeline_options.allow_external_plugins
152
+ )
153
+ return factory.create_instance(
154
+ options=self.pipeline_options.ocr_options,
155
+ enabled=self.pipeline_options.do_ocr,
156
+ artifacts_path=artifacts_path,
157
+ accelerator_options=self.pipeline_options.accelerator_options,
158
+ )
202
159
 
203
160
  def get_picture_description_model(
204
161
  self, artifacts_path: Optional[Path] = None
205
162
  ) -> Optional[PictureDescriptionBaseModel]:
206
- if isinstance(
207
- self.pipeline_options.picture_description_options,
208
- PictureDescriptionApiOptions,
209
- ):
210
- return PictureDescriptionApiModel(
211
- enabled=self.pipeline_options.do_picture_description,
212
- enable_remote_services=self.pipeline_options.enable_remote_services,
213
- options=self.pipeline_options.picture_description_options,
214
- )
215
- elif isinstance(
216
- self.pipeline_options.picture_description_options,
217
- PictureDescriptionVlmOptions,
218
- ):
219
- return PictureDescriptionVlmModel(
220
- enabled=self.pipeline_options.do_picture_description,
221
- artifacts_path=artifacts_path,
222
- options=self.pipeline_options.picture_description_options,
223
- accelerator_options=self.pipeline_options.accelerator_options,
224
- )
225
- return None
163
+ factory = get_picture_description_factory(
164
+ allow_external_plugins=self.pipeline_options.allow_external_plugins
165
+ )
166
+ return factory.create_instance(
167
+ options=self.pipeline_options.picture_description_options,
168
+ enabled=self.pipeline_options.do_picture_description,
169
+ enable_remote_services=self.pipeline_options.enable_remote_services,
170
+ artifacts_path=artifacts_path,
171
+ accelerator_options=self.pipeline_options.accelerator_options,
172
+ )
226
173
 
227
174
  def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
228
175
  with TimeRecorder(conv_res, "page_init"):