docling 2.26.0__py3-none-any.whl → 2.28.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52):
  1. docling/backend/asciidoc_backend.py +1 -1
  2. docling/backend/csv_backend.py +1 -1
  3. docling/backend/docling_parse_backend.py +21 -13
  4. docling/backend/docling_parse_v2_backend.py +20 -12
  5. docling/backend/docling_parse_v4_backend.py +192 -0
  6. docling/backend/docx/__init__.py +0 -0
  7. docling/backend/docx/latex/__init__.py +0 -0
  8. docling/backend/docx/latex/latex_dict.py +271 -0
  9. docling/backend/docx/latex/omml.py +453 -0
  10. docling/backend/html_backend.py +7 -7
  11. docling/backend/md_backend.py +1 -1
  12. docling/backend/msexcel_backend.py +2 -45
  13. docling/backend/mspowerpoint_backend.py +19 -1
  14. docling/backend/msword_backend.py +68 -3
  15. docling/backend/pdf_backend.py +7 -2
  16. docling/backend/pypdfium2_backend.py +52 -30
  17. docling/backend/xml/uspto_backend.py +1 -1
  18. docling/cli/main.py +135 -53
  19. docling/cli/models.py +1 -1
  20. docling/datamodel/base_models.py +8 -10
  21. docling/datamodel/pipeline_options.py +54 -32
  22. docling/document_converter.py +5 -5
  23. docling/models/base_model.py +9 -1
  24. docling/models/base_ocr_model.py +27 -16
  25. docling/models/easyocr_model.py +28 -13
  26. docling/models/factories/__init__.py +27 -0
  27. docling/models/factories/base_factory.py +122 -0
  28. docling/models/factories/ocr_factory.py +11 -0
  29. docling/models/factories/picture_description_factory.py +11 -0
  30. docling/models/hf_mlx_model.py +137 -0
  31. docling/models/ocr_mac_model.py +39 -11
  32. docling/models/page_preprocessing_model.py +4 -0
  33. docling/models/picture_description_api_model.py +20 -3
  34. docling/models/picture_description_base_model.py +19 -3
  35. docling/models/picture_description_vlm_model.py +14 -2
  36. docling/models/plugins/__init__.py +0 -0
  37. docling/models/plugins/defaults.py +28 -0
  38. docling/models/rapid_ocr_model.py +34 -13
  39. docling/models/table_structure_model.py +13 -4
  40. docling/models/tesseract_ocr_cli_model.py +40 -15
  41. docling/models/tesseract_ocr_model.py +37 -12
  42. docling/pipeline/standard_pdf_pipeline.py +25 -78
  43. docling/pipeline/vlm_pipeline.py +78 -398
  44. docling/utils/export.py +8 -6
  45. docling/utils/layout_postprocessor.py +26 -23
  46. docling/utils/visualization.py +1 -1
  47. {docling-2.26.0.dist-info → docling-2.28.0.dist-info}/METADATA +47 -23
  48. docling-2.28.0.dist-info/RECORD +84 -0
  49. {docling-2.26.0.dist-info → docling-2.28.0.dist-info}/entry_points.txt +3 -0
  50. docling-2.26.0.dist-info/RECORD +0 -72
  51. {docling-2.26.0.dist-info → docling-2.28.0.dist-info}/LICENSE +0 -0
  52. {docling-2.26.0.dist-info → docling-2.28.0.dist-info}/WHEEL +0 -0
@@ -1,10 +1,9 @@
1
1
  import logging
2
2
  import os
3
3
  import re
4
- import warnings
5
4
  from enum import Enum
6
5
  from pathlib import Path
7
- from typing import Annotated, Any, Dict, List, Literal, Optional, Union
6
+ from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
8
7
 
9
8
  from pydantic import (
10
9
  AnyUrl,
@@ -13,13 +12,8 @@ from pydantic import (
13
12
  Field,
14
13
  field_validator,
15
14
  model_validator,
16
- validator,
17
- )
18
- from pydantic_settings import (
19
- BaseSettings,
20
- PydanticBaseSettingsSource,
21
- SettingsConfigDict,
22
15
  )
16
+ from pydantic_settings import BaseSettings, SettingsConfigDict
23
17
  from typing_extensions import deprecated
24
18
 
25
19
  _log = logging.getLogger(__name__)
@@ -83,6 +77,12 @@ class AcceleratorOptions(BaseSettings):
83
77
  return data
84
78
 
85
79
 
80
+ class BaseOptions(BaseModel):
81
+ """Base class for options."""
82
+
83
+ kind: ClassVar[str]
84
+
85
+
86
86
  class TableFormerMode(str, Enum):
87
87
  """Modes for the TableFormer model."""
88
88
 
@@ -102,10 +102,9 @@ class TableStructureOptions(BaseModel):
102
102
  mode: TableFormerMode = TableFormerMode.ACCURATE
103
103
 
104
104
 
105
- class OcrOptions(BaseModel):
105
+ class OcrOptions(BaseOptions):
106
106
  """OCR options."""
107
107
 
108
- kind: str
109
108
  lang: List[str]
110
109
  force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
111
110
  bitmap_area_threshold: float = (
@@ -116,7 +115,7 @@ class OcrOptions(BaseModel):
116
115
  class RapidOcrOptions(OcrOptions):
117
116
  """Options for the RapidOCR engine."""
118
117
 
119
- kind: Literal["rapidocr"] = "rapidocr"
118
+ kind: ClassVar[Literal["rapidocr"]] = "rapidocr"
120
119
 
121
120
  # English and chinese are the most commly used models and have been tested with RapidOCR.
122
121
  lang: List[str] = [
@@ -155,7 +154,7 @@ class RapidOcrOptions(OcrOptions):
155
154
  class EasyOcrOptions(OcrOptions):
156
155
  """Options for the EasyOCR engine."""
157
156
 
158
- kind: Literal["easyocr"] = "easyocr"
157
+ kind: ClassVar[Literal["easyocr"]] = "easyocr"
159
158
  lang: List[str] = ["fr", "de", "es", "en"]
160
159
 
161
160
  use_gpu: Optional[bool] = None
@@ -175,7 +174,7 @@ class EasyOcrOptions(OcrOptions):
175
174
  class TesseractCliOcrOptions(OcrOptions):
176
175
  """Options for the TesseractCli engine."""
177
176
 
178
- kind: Literal["tesseract"] = "tesseract"
177
+ kind: ClassVar[Literal["tesseract"]] = "tesseract"
179
178
  lang: List[str] = ["fra", "deu", "spa", "eng"]
180
179
  tesseract_cmd: str = "tesseract"
181
180
  path: Optional[str] = None
@@ -188,7 +187,7 @@ class TesseractCliOcrOptions(OcrOptions):
188
187
  class TesseractOcrOptions(OcrOptions):
189
188
  """Options for the Tesseract engine."""
190
189
 
191
- kind: Literal["tesserocr"] = "tesserocr"
190
+ kind: ClassVar[Literal["tesserocr"]] = "tesserocr"
192
191
  lang: List[str] = ["fra", "deu", "spa", "eng"]
193
192
  path: Optional[str] = None
194
193
 
@@ -200,7 +199,7 @@ class TesseractOcrOptions(OcrOptions):
200
199
  class OcrMacOptions(OcrOptions):
201
200
  """Options for the Mac OCR engine."""
202
201
 
203
- kind: Literal["ocrmac"] = "ocrmac"
202
+ kind: ClassVar[Literal["ocrmac"]] = "ocrmac"
204
203
  lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
205
204
  recognition: str = "accurate"
206
205
  framework: str = "vision"
@@ -210,8 +209,7 @@ class OcrMacOptions(OcrOptions):
210
209
  )
211
210
 
212
211
 
213
- class PictureDescriptionBaseOptions(BaseModel):
214
- kind: str
212
+ class PictureDescriptionBaseOptions(BaseOptions):
215
213
  batch_size: int = 8
216
214
  scale: float = 2
217
215
 
@@ -221,7 +219,7 @@ class PictureDescriptionBaseOptions(BaseModel):
221
219
 
222
220
 
223
221
  class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
224
- kind: Literal["api"] = "api"
222
+ kind: ClassVar[Literal["api"]] = "api"
225
223
 
226
224
  url: AnyUrl = AnyUrl("http://localhost:8000/v1/chat/completions")
227
225
  headers: Dict[str, str] = {}
@@ -233,7 +231,7 @@ class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
233
231
 
234
232
 
235
233
  class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
236
- kind: Literal["vlm"] = "vlm"
234
+ kind: ClassVar[Literal["vlm"]] = "vlm"
237
235
 
238
236
  repo_id: str
239
237
  prompt: str = "Describe this image in a few sentences."
@@ -265,6 +263,11 @@ class ResponseFormat(str, Enum):
265
263
  MARKDOWN = "markdown"
266
264
 
267
265
 
266
+ class InferenceFramework(str, Enum):
267
+ MLX = "mlx"
268
+ TRANSFORMERS = "transformers"
269
+
270
+
268
271
  class HuggingFaceVlmOptions(BaseVlmOptions):
269
272
  kind: Literal["hf_model_options"] = "hf_model_options"
270
273
 
@@ -273,6 +276,7 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
273
276
  llm_int8_threshold: float = 6.0
274
277
  quantized: bool = False
275
278
 
279
+ inference_framework: InferenceFramework
276
280
  response_format: ResponseFormat
277
281
 
278
282
  @property
@@ -280,10 +284,19 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
280
284
  return self.repo_id.replace("/", "--")
281
285
 
282
286
 
287
+ smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
288
+ repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
289
+ prompt="Convert this page to docling.",
290
+ response_format=ResponseFormat.DOCTAGS,
291
+ inference_framework=InferenceFramework.MLX,
292
+ )
293
+
294
+
283
295
  smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
284
296
  repo_id="ds4sd/SmolDocling-256M-preview",
285
297
  prompt="Convert this page to docling.",
286
298
  response_format=ResponseFormat.DOCTAGS,
299
+ inference_framework=InferenceFramework.TRANSFORMERS,
287
300
  )
288
301
 
289
302
  granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
@@ -291,9 +304,15 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
291
304
  # prompt="OCR the full page to markdown.",
292
305
  prompt="OCR this image.",
293
306
  response_format=ResponseFormat.MARKDOWN,
307
+ inference_framework=InferenceFramework.TRANSFORMERS,
294
308
  )
295
309
 
296
310
 
311
+ class VlmModelType(str, Enum):
312
+ SMOLDOCLING = "smoldocling"
313
+ GRANITE_VISION = "granite_vision"
314
+
315
+
297
316
  # Define an enum for the backend options
298
317
  class PdfBackend(str, Enum):
299
318
  """Enum of valid PDF backends."""
@@ -301,9 +320,11 @@ class PdfBackend(str, Enum):
301
320
  PYPDFIUM2 = "pypdfium2"
302
321
  DLPARSE_V1 = "dlparse_v1"
303
322
  DLPARSE_V2 = "dlparse_v2"
323
+ DLPARSE_V4 = "dlparse_v4"
304
324
 
305
325
 
306
326
  # Define an enum for the ocr engines
327
+ @deprecated("Use ocr_factory.registered_enum")
307
328
  class OcrEngine(str, Enum):
308
329
  """Enum of valid OCR engines."""
309
330
 
@@ -323,16 +344,18 @@ class PipelineOptions(BaseModel):
323
344
  document_timeout: Optional[float] = None
324
345
  accelerator_options: AcceleratorOptions = AcceleratorOptions()
325
346
  enable_remote_services: bool = False
347
+ allow_external_plugins: bool = False
326
348
 
327
349
 
328
350
  class PaginatedPipelineOptions(PipelineOptions):
351
+ artifacts_path: Optional[Union[Path, str]] = None
352
+
329
353
  images_scale: float = 1.0
330
354
  generate_page_images: bool = False
331
355
  generate_picture_images: bool = False
332
356
 
333
357
 
334
358
  class VlmPipelineOptions(PaginatedPipelineOptions):
335
- artifacts_path: Optional[Union[Path, str]] = None
336
359
 
337
360
  generate_page_images: bool = True
338
361
  force_backend_text: bool = (
@@ -345,7 +368,6 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
345
368
  class PdfPipelineOptions(PaginatedPipelineOptions):
346
369
  """Options for the PDF pipeline."""
347
370
 
348
- artifacts_path: Optional[Union[Path, str]] = None
349
371
  do_table_structure: bool = True # True: perform table structure extraction
350
372
  do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
351
373
  do_code_enrichment: bool = False # True: perform code OCR
@@ -358,17 +380,10 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
358
380
  # If True, text from backend will be used instead of generated text
359
381
 
360
382
  table_structure_options: TableStructureOptions = TableStructureOptions()
361
- ocr_options: Union[
362
- EasyOcrOptions,
363
- TesseractCliOcrOptions,
364
- TesseractOcrOptions,
365
- OcrMacOptions,
366
- RapidOcrOptions,
367
- ] = Field(EasyOcrOptions(), discriminator="kind")
368
- picture_description_options: Annotated[
369
- Union[PictureDescriptionApiOptions, PictureDescriptionVlmOptions],
370
- Field(discriminator="kind"),
371
- ] = smolvlm_picture_description
383
+ ocr_options: OcrOptions = EasyOcrOptions()
384
+ picture_description_options: PictureDescriptionBaseOptions = (
385
+ smolvlm_picture_description
386
+ )
372
387
 
373
388
  images_scale: float = 1.0
374
389
  generate_page_images: bool = False
@@ -381,3 +396,10 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
381
396
  "before conversion and then use the `TableItem.get_image` function."
382
397
  ),
383
398
  )
399
+
400
+ generate_parsed_pages: bool = False
401
+
402
+
403
+ class PdfPipeline(str, Enum):
404
+ STANDARD = "standard"
405
+ VLM = "vlm"
@@ -11,7 +11,7 @@ from pydantic import BaseModel, ConfigDict, model_validator, validate_call
11
11
  from docling.backend.abstract_backend import AbstractDocumentBackend
12
12
  from docling.backend.asciidoc_backend import AsciiDocBackend
13
13
  from docling.backend.csv_backend import CsvDocumentBackend
14
- from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
14
+ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
15
15
  from docling.backend.html_backend import HTMLDocumentBackend
16
16
  from docling.backend.json.docling_json_backend import DoclingJSONBackend
17
17
  from docling.backend.md_backend import MarkdownDocumentBackend
@@ -109,12 +109,12 @@ class XMLJatsFormatOption(FormatOption):
109
109
 
110
110
  class ImageFormatOption(FormatOption):
111
111
  pipeline_cls: Type = StandardPdfPipeline
112
- backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
112
+ backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
113
113
 
114
114
 
115
115
  class PdfFormatOption(FormatOption):
116
116
  pipeline_cls: Type = StandardPdfPipeline
117
- backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
117
+ backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
118
118
 
119
119
 
120
120
  def _get_default_option(format: InputFormat) -> FormatOption:
@@ -147,10 +147,10 @@ def _get_default_option(format: InputFormat) -> FormatOption:
147
147
  pipeline_cls=SimplePipeline, backend=JatsDocumentBackend
148
148
  ),
149
149
  InputFormat.IMAGE: FormatOption(
150
- pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
150
+ pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend
151
151
  ),
152
152
  InputFormat.PDF: FormatOption(
153
- pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
153
+ pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend
154
154
  ),
155
155
  InputFormat.JSON_DOCLING: FormatOption(
156
156
  pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
@@ -1,14 +1,22 @@
1
1
  from abc import ABC, abstractmethod
2
- from typing import Any, Generic, Iterable, Optional
2
+ from typing import Any, Generic, Iterable, Optional, Protocol, Type
3
3
 
4
4
  from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
5
5
  from typing_extensions import TypeVar
6
6
 
7
7
  from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
8
8
  from docling.datamodel.document import ConversionResult
9
+ from docling.datamodel.pipeline_options import BaseOptions
9
10
  from docling.datamodel.settings import settings
10
11
 
11
12
 
13
+ class BaseModelWithOptions(Protocol):
14
+ @classmethod
15
+ def get_options_type(cls) -> Type[BaseOptions]: ...
16
+
17
+ def __init__(self, *, options: BaseOptions, **kwargs): ...
18
+
19
+
12
20
  class BasePageModel(ABC):
13
21
  @abstractmethod
14
22
  def __call__(
@@ -2,25 +2,33 @@ import copy
2
2
  import logging
3
3
  from abc import abstractmethod
4
4
  from pathlib import Path
5
- from typing import Iterable, List
5
+ from typing import Iterable, List, Optional, Type
6
6
 
7
7
  import numpy as np
8
8
  from docling_core.types.doc import BoundingBox, CoordOrigin
9
+ from docling_core.types.doc.page import BoundingRectangle, PdfTextCell, TextCell
9
10
  from PIL import Image, ImageDraw
10
11
  from rtree import index
11
12
  from scipy.ndimage import binary_dilation, find_objects, label
12
13
 
13
- from docling.datamodel.base_models import Cell, OcrCell, Page
14
+ from docling.datamodel.base_models import Page
14
15
  from docling.datamodel.document import ConversionResult
15
- from docling.datamodel.pipeline_options import OcrOptions
16
+ from docling.datamodel.pipeline_options import AcceleratorOptions, OcrOptions
16
17
  from docling.datamodel.settings import settings
17
- from docling.models.base_model import BasePageModel
18
+ from docling.models.base_model import BaseModelWithOptions, BasePageModel
18
19
 
19
20
  _log = logging.getLogger(__name__)
20
21
 
21
22
 
22
- class BaseOcrModel(BasePageModel):
23
- def __init__(self, enabled: bool, options: OcrOptions):
23
+ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
24
+ def __init__(
25
+ self,
26
+ *,
27
+ enabled: bool,
28
+ artifacts_path: Optional[Path],
29
+ options: OcrOptions,
30
+ accelerator_options: AcceleratorOptions,
31
+ ):
24
32
  self.enabled = enabled
25
33
  self.options = options
26
34
 
@@ -104,11 +112,13 @@ class BaseOcrModel(BasePageModel):
104
112
  p.dimension = 2
105
113
  idx = index.Index(properties=p)
106
114
  for i, cell in enumerate(programmatic_cells):
107
- idx.insert(i, cell.bbox.as_tuple())
115
+ idx.insert(i, cell.rect.to_bounding_box().as_tuple())
108
116
 
109
117
  def is_overlapping_with_existing_cells(ocr_cell):
110
118
  # Query the R-tree to get overlapping rectangles
111
- possible_matches_index = list(idx.intersection(ocr_cell.bbox.as_tuple()))
119
+ possible_matches_index = list(
120
+ idx.intersection(ocr_cell.rect.to_bounding_box().as_tuple())
121
+ )
112
122
 
113
123
  return (
114
124
  len(possible_matches_index) > 0
@@ -125,10 +135,7 @@ class BaseOcrModel(BasePageModel):
125
135
  """
126
136
  if self.options.force_full_page_ocr:
127
137
  # If a full page OCR is forced, use only the OCR cells
128
- cells = [
129
- Cell(id=c_ocr.id, text=c_ocr.text, bbox=c_ocr.bbox)
130
- for c_ocr in ocr_cells
131
- ]
138
+ cells = ocr_cells
132
139
  return cells
133
140
 
134
141
  ## Remove OCR cells which overlap with programmatic cells.
@@ -156,7 +163,7 @@ class BaseOcrModel(BasePageModel):
156
163
 
157
164
  # Draw OCR and programmatic cells
158
165
  for tc in page.cells:
159
- x0, y0, x1, y1 = tc.bbox.as_tuple()
166
+ x0, y0, x1, y1 = tc.rect.to_bounding_box().as_tuple()
160
167
  y0 *= scale_x
161
168
  y1 *= scale_y
162
169
  x0 *= scale_x
@@ -165,9 +172,8 @@ class BaseOcrModel(BasePageModel):
165
172
  if y1 <= y0:
166
173
  y1, y0 = y0, y1
167
174
 
168
- color = "gray"
169
- if isinstance(tc, OcrCell):
170
- color = "magenta"
175
+ color = "magenta" if tc.from_ocr else "gray"
176
+
171
177
  draw.rectangle([(x0, y0), (x1, y1)], outline=color)
172
178
 
173
179
  if show:
@@ -187,3 +193,8 @@ class BaseOcrModel(BasePageModel):
187
193
  self, conv_res: ConversionResult, page_batch: Iterable[Page]
188
194
  ) -> Iterable[Page]:
189
195
  pass
196
+
197
+ @classmethod
198
+ @abstractmethod
199
+ def get_options_type(cls) -> Type[OcrOptions]:
200
+ pass
@@ -2,17 +2,19 @@ import logging
2
2
  import warnings
3
3
  import zipfile
4
4
  from pathlib import Path
5
- from typing import Iterable, List, Optional
5
+ from typing import Iterable, List, Optional, Type
6
6
 
7
7
  import numpy
8
8
  from docling_core.types.doc import BoundingBox, CoordOrigin
9
+ from docling_core.types.doc.page import BoundingRectangle, TextCell
9
10
 
10
- from docling.datamodel.base_models import Cell, OcrCell, Page
11
+ from docling.datamodel.base_models import Page
11
12
  from docling.datamodel.document import ConversionResult
12
13
  from docling.datamodel.pipeline_options import (
13
14
  AcceleratorDevice,
14
15
  AcceleratorOptions,
15
16
  EasyOcrOptions,
17
+ OcrOptions,
16
18
  )
17
19
  from docling.datamodel.settings import settings
18
20
  from docling.models.base_ocr_model import BaseOcrModel
@@ -33,7 +35,12 @@ class EasyOcrModel(BaseOcrModel):
33
35
  options: EasyOcrOptions,
34
36
  accelerator_options: AcceleratorOptions,
35
37
  ):
36
- super().__init__(enabled=enabled, options=options)
38
+ super().__init__(
39
+ enabled=enabled,
40
+ artifacts_path=artifacts_path,
41
+ options=options,
42
+ accelerator_options=accelerator_options,
43
+ )
37
44
  self.options: EasyOcrOptions
38
45
 
39
46
  self.scale = 3 # multiplier for 72 dpi == 216 dpi.
@@ -148,18 +155,22 @@ class EasyOcrModel(BaseOcrModel):
148
155
  del im
149
156
 
150
157
  cells = [
151
- OcrCell(
152
- id=ix,
158
+ TextCell(
159
+ index=ix,
153
160
  text=line[1],
161
+ orig=line[1],
162
+ from_ocr=True,
154
163
  confidence=line[2],
155
- bbox=BoundingBox.from_tuple(
156
- coord=(
157
- (line[0][0][0] / self.scale) + ocr_rect.l,
158
- (line[0][0][1] / self.scale) + ocr_rect.t,
159
- (line[0][2][0] / self.scale) + ocr_rect.l,
160
- (line[0][2][1] / self.scale) + ocr_rect.t,
161
- ),
162
- origin=CoordOrigin.TOPLEFT,
164
+ rect=BoundingRectangle.from_bounding_box(
165
+ BoundingBox.from_tuple(
166
+ coord=(
167
+ (line[0][0][0] / self.scale) + ocr_rect.l,
168
+ (line[0][0][1] / self.scale) + ocr_rect.t,
169
+ (line[0][2][0] / self.scale) + ocr_rect.l,
170
+ (line[0][2][1] / self.scale) + ocr_rect.t,
171
+ ),
172
+ origin=CoordOrigin.TOPLEFT,
173
+ )
163
174
  ),
164
175
  )
165
176
  for ix, line in enumerate(result)
@@ -175,3 +186,7 @@ class EasyOcrModel(BaseOcrModel):
175
186
  self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
176
187
 
177
188
  yield page
189
+
190
+ @classmethod
191
+ def get_options_type(cls) -> Type[OcrOptions]:
192
+ return EasyOcrOptions
@@ -0,0 +1,27 @@
1
+ import logging
2
+ from functools import lru_cache
3
+
4
+ from docling.models.factories.ocr_factory import OcrFactory
5
+ from docling.models.factories.picture_description_factory import (
6
+ PictureDescriptionFactory,
7
+ )
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
+ @lru_cache()
13
+ def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
14
+ factory = OcrFactory()
15
+ factory.load_from_plugins(allow_external_plugins=allow_external_plugins)
16
+ logger.info("Registered ocr engines: %r", factory.registered_kind)
17
+ return factory
18
+
19
+
20
+ @lru_cache()
21
+ def get_picture_description_factory(
22
+ allow_external_plugins: bool = False,
23
+ ) -> PictureDescriptionFactory:
24
+ factory = PictureDescriptionFactory()
25
+ factory.load_from_plugins(allow_external_plugins=allow_external_plugins)
26
+ logger.info("Registered picture descriptions: %r", factory.registered_kind)
27
+ return factory
@@ -0,0 +1,122 @@
1
+ import enum
2
+ import logging
3
+ from abc import ABCMeta
4
+ from typing import Generic, Optional, Type, TypeVar
5
+
6
+ from pluggy import PluginManager
7
+ from pydantic import BaseModel
8
+
9
+ from docling.datamodel.pipeline_options import BaseOptions
10
+ from docling.models.base_model import BaseModelWithOptions
11
+
12
+ A = TypeVar("A", bound=BaseModelWithOptions)
13
+
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
+ class FactoryMeta(BaseModel):
19
+ kind: str
20
+ plugin_name: str
21
+ module: str
22
+
23
+
24
+ class BaseFactory(Generic[A], metaclass=ABCMeta):
25
+ default_plugin_name = "docling"
26
+
27
+ def __init__(self, plugin_attr_name: str, plugin_name=default_plugin_name):
28
+ self.plugin_name = plugin_name
29
+ self.plugin_attr_name = plugin_attr_name
30
+
31
+ self._classes: dict[Type[BaseOptions], Type[A]] = {}
32
+ self._meta: dict[Type[BaseOptions], FactoryMeta] = {}
33
+
34
+ @property
35
+ def registered_kind(self) -> list[str]:
36
+ return list(opt.kind for opt in self._classes.keys())
37
+
38
+ def get_enum(self) -> enum.Enum:
39
+ return enum.Enum(
40
+ self.plugin_attr_name + "_enum",
41
+ names={kind: kind for kind in self.registered_kind},
42
+ type=str,
43
+ module=__name__,
44
+ )
45
+
46
+ @property
47
+ def classes(self):
48
+ return self._classes
49
+
50
+ @property
51
+ def registered_meta(self):
52
+ return self._meta
53
+
54
+ def create_instance(self, options: BaseOptions, **kwargs) -> A:
55
+ try:
56
+ _cls = self._classes[type(options)]
57
+ return _cls(options=options, **kwargs)
58
+ except KeyError:
59
+ raise RuntimeError(self._err_msg_on_class_not_found(options.kind))
60
+
61
+ def create_options(self, kind: str, *args, **kwargs) -> BaseOptions:
62
+ for opt_cls, _ in self._classes.items():
63
+ if opt_cls.kind == kind:
64
+ return opt_cls(*args, **kwargs)
65
+ raise RuntimeError(self._err_msg_on_class_not_found(kind))
66
+
67
+ def _err_msg_on_class_not_found(self, kind: str):
68
+ msg = []
69
+
70
+ for opt, cls in self._classes.items():
71
+ msg.append(f"\t{opt.kind!r} => {cls!r}")
72
+
73
+ msg_str = "\n".join(msg)
74
+
75
+ return f"No class found with the name {kind!r}, known classes are:\n{msg_str}"
76
+
77
+ def register(self, cls: Type[A], plugin_name: str, plugin_module_name: str):
78
+ opt_type = cls.get_options_type()
79
+
80
+ if opt_type in self._classes:
81
+ raise ValueError(
82
+ f"{opt_type.kind!r} already registered to class {self._classes[opt_type]!r}"
83
+ )
84
+
85
+ self._classes[opt_type] = cls
86
+ self._meta[opt_type] = FactoryMeta(
87
+ kind=opt_type.kind, plugin_name=plugin_name, module=plugin_module_name
88
+ )
89
+
90
+ def load_from_plugins(
91
+ self, plugin_name: Optional[str] = None, allow_external_plugins: bool = False
92
+ ):
93
+ plugin_name = plugin_name or self.plugin_name
94
+
95
+ plugin_manager = PluginManager(plugin_name)
96
+ plugin_manager.load_setuptools_entrypoints(plugin_name)
97
+
98
+ for plugin_name, plugin_module in plugin_manager.list_name_plugin():
99
+ plugin_module_name = str(plugin_module.__name__) # type: ignore
100
+
101
+ if not allow_external_plugins and not plugin_module_name.startswith(
102
+ "docling."
103
+ ):
104
+ logger.warning(
105
+ f"The plugin {plugin_name} will not be loaded because Docling is being executed with allow_external_plugins=false."
106
+ )
107
+ continue
108
+
109
+ attr = getattr(plugin_module, self.plugin_attr_name, None)
110
+
111
+ if callable(attr):
112
+ logger.info("Loading plugin %r", plugin_name)
113
+
114
+ config = attr()
115
+ self.process_plugin(config, plugin_name, plugin_module_name)
116
+
117
+ def process_plugin(self, config, plugin_name: str, plugin_module_name: str):
118
+ for item in config[self.plugin_attr_name]:
119
+ try:
120
+ self.register(item, plugin_name, plugin_module_name)
121
+ except ValueError:
122
+ logger.warning("%r already registered", item)
@@ -0,0 +1,11 @@
1
+ import logging
2
+
3
+ from docling.models.base_ocr_model import BaseOcrModel
4
+ from docling.models.factories.base_factory import BaseFactory
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+
9
+ class OcrFactory(BaseFactory[BaseOcrModel]):
10
+ def __init__(self, *args, **kwargs):
11
+ super().__init__("ocr_engines", *args, **kwargs)
@@ -0,0 +1,11 @@
1
+ import logging
2
+
3
+ from docling.models.factories.base_factory import BaseFactory
4
+ from docling.models.picture_description_base_model import PictureDescriptionBaseModel
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+
9
+ class PictureDescriptionFactory(BaseFactory[PictureDescriptionBaseModel]):
10
+ def __init__(self, *args, **kwargs):
11
+ super().__init__("picture_description", *args, **kwargs)