docling 2.26.0__py3-none-any.whl → 2.27.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. docling/backend/asciidoc_backend.py +1 -1
  2. docling/backend/csv_backend.py +1 -1
  3. docling/backend/docling_parse_backend.py +21 -13
  4. docling/backend/docling_parse_v2_backend.py +20 -12
  5. docling/backend/docling_parse_v4_backend.py +185 -0
  6. docling/backend/docx/__init__.py +0 -0
  7. docling/backend/docx/latex/__init__.py +0 -0
  8. docling/backend/docx/latex/latex_dict.py +271 -0
  9. docling/backend/docx/latex/omml.py +453 -0
  10. docling/backend/html_backend.py +7 -7
  11. docling/backend/md_backend.py +1 -1
  12. docling/backend/msexcel_backend.py +2 -45
  13. docling/backend/mspowerpoint_backend.py +1 -1
  14. docling/backend/msword_backend.py +65 -3
  15. docling/backend/pdf_backend.py +7 -2
  16. docling/backend/pypdfium2_backend.py +52 -30
  17. docling/backend/xml/uspto_backend.py +1 -1
  18. docling/cli/main.py +60 -21
  19. docling/cli/models.py +1 -1
  20. docling/datamodel/base_models.py +8 -10
  21. docling/datamodel/pipeline_options.py +26 -30
  22. docling/document_converter.py +5 -5
  23. docling/models/base_model.py +9 -1
  24. docling/models/base_ocr_model.py +27 -16
  25. docling/models/easyocr_model.py +28 -13
  26. docling/models/factories/__init__.py +27 -0
  27. docling/models/factories/base_factory.py +122 -0
  28. docling/models/factories/ocr_factory.py +11 -0
  29. docling/models/factories/picture_description_factory.py +11 -0
  30. docling/models/ocr_mac_model.py +39 -11
  31. docling/models/page_preprocessing_model.py +4 -0
  32. docling/models/picture_description_api_model.py +20 -3
  33. docling/models/picture_description_base_model.py +19 -3
  34. docling/models/picture_description_vlm_model.py +14 -2
  35. docling/models/plugins/__init__.py +0 -0
  36. docling/models/plugins/defaults.py +28 -0
  37. docling/models/rapid_ocr_model.py +34 -13
  38. docling/models/table_structure_model.py +13 -4
  39. docling/models/tesseract_ocr_cli_model.py +40 -15
  40. docling/models/tesseract_ocr_model.py +37 -12
  41. docling/pipeline/standard_pdf_pipeline.py +25 -78
  42. docling/utils/export.py +8 -6
  43. docling/utils/layout_postprocessor.py +26 -23
  44. docling/utils/visualization.py +1 -1
  45. {docling-2.26.0.dist-info → docling-2.27.0.dist-info}/METADATA +48 -19
  46. docling-2.27.0.dist-info/RECORD +83 -0
  47. {docling-2.26.0.dist-info → docling-2.27.0.dist-info}/entry_points.txt +3 -0
  48. docling-2.26.0.dist-info/RECORD +0 -72
  49. {docling-2.26.0.dist-info → docling-2.27.0.dist-info}/LICENSE +0 -0
  50. {docling-2.26.0.dist-info → docling-2.27.0.dist-info}/WHEEL +0 -0
@@ -2,25 +2,33 @@ import copy
2
2
  import logging
3
3
  from abc import abstractmethod
4
4
  from pathlib import Path
5
- from typing import Iterable, List
5
+ from typing import Iterable, List, Optional, Type
6
6
 
7
7
  import numpy as np
8
8
  from docling_core.types.doc import BoundingBox, CoordOrigin
9
+ from docling_core.types.doc.page import BoundingRectangle, PdfTextCell, TextCell
9
10
  from PIL import Image, ImageDraw
10
11
  from rtree import index
11
12
  from scipy.ndimage import binary_dilation, find_objects, label
12
13
 
13
- from docling.datamodel.base_models import Cell, OcrCell, Page
14
+ from docling.datamodel.base_models import Page
14
15
  from docling.datamodel.document import ConversionResult
15
- from docling.datamodel.pipeline_options import OcrOptions
16
+ from docling.datamodel.pipeline_options import AcceleratorOptions, OcrOptions
16
17
  from docling.datamodel.settings import settings
17
- from docling.models.base_model import BasePageModel
18
+ from docling.models.base_model import BaseModelWithOptions, BasePageModel
18
19
 
19
20
  _log = logging.getLogger(__name__)
20
21
 
21
22
 
22
- class BaseOcrModel(BasePageModel):
23
- def __init__(self, enabled: bool, options: OcrOptions):
23
+ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
24
+ def __init__(
25
+ self,
26
+ *,
27
+ enabled: bool,
28
+ artifacts_path: Optional[Path],
29
+ options: OcrOptions,
30
+ accelerator_options: AcceleratorOptions,
31
+ ):
24
32
  self.enabled = enabled
25
33
  self.options = options
26
34
 
@@ -104,11 +112,13 @@ class BaseOcrModel(BasePageModel):
104
112
  p.dimension = 2
105
113
  idx = index.Index(properties=p)
106
114
  for i, cell in enumerate(programmatic_cells):
107
- idx.insert(i, cell.bbox.as_tuple())
115
+ idx.insert(i, cell.rect.to_bounding_box().as_tuple())
108
116
 
109
117
  def is_overlapping_with_existing_cells(ocr_cell):
110
118
  # Query the R-tree to get overlapping rectangles
111
- possible_matches_index = list(idx.intersection(ocr_cell.bbox.as_tuple()))
119
+ possible_matches_index = list(
120
+ idx.intersection(ocr_cell.rect.to_bounding_box().as_tuple())
121
+ )
112
122
 
113
123
  return (
114
124
  len(possible_matches_index) > 0
@@ -125,10 +135,7 @@ class BaseOcrModel(BasePageModel):
125
135
  """
126
136
  if self.options.force_full_page_ocr:
127
137
  # If a full page OCR is forced, use only the OCR cells
128
- cells = [
129
- Cell(id=c_ocr.id, text=c_ocr.text, bbox=c_ocr.bbox)
130
- for c_ocr in ocr_cells
131
- ]
138
+ cells = ocr_cells
132
139
  return cells
133
140
 
134
141
  ## Remove OCR cells which overlap with programmatic cells.
@@ -156,7 +163,7 @@ class BaseOcrModel(BasePageModel):
156
163
 
157
164
  # Draw OCR and programmatic cells
158
165
  for tc in page.cells:
159
- x0, y0, x1, y1 = tc.bbox.as_tuple()
166
+ x0, y0, x1, y1 = tc.rect.to_bounding_box().as_tuple()
160
167
  y0 *= scale_x
161
168
  y1 *= scale_y
162
169
  x0 *= scale_x
@@ -165,9 +172,8 @@ class BaseOcrModel(BasePageModel):
165
172
  if y1 <= y0:
166
173
  y1, y0 = y0, y1
167
174
 
168
- color = "gray"
169
- if isinstance(tc, OcrCell):
170
- color = "magenta"
175
+ color = "magenta" if tc.from_ocr else "gray"
176
+
171
177
  draw.rectangle([(x0, y0), (x1, y1)], outline=color)
172
178
 
173
179
  if show:
@@ -187,3 +193,8 @@ class BaseOcrModel(BasePageModel):
187
193
  self, conv_res: ConversionResult, page_batch: Iterable[Page]
188
194
  ) -> Iterable[Page]:
189
195
  pass
196
+
197
+ @classmethod
198
+ @abstractmethod
199
+ def get_options_type(cls) -> Type[OcrOptions]:
200
+ pass
@@ -2,17 +2,19 @@ import logging
2
2
  import warnings
3
3
  import zipfile
4
4
  from pathlib import Path
5
- from typing import Iterable, List, Optional
5
+ from typing import Iterable, List, Optional, Type
6
6
 
7
7
  import numpy
8
8
  from docling_core.types.doc import BoundingBox, CoordOrigin
9
+ from docling_core.types.doc.page import BoundingRectangle, TextCell
9
10
 
10
- from docling.datamodel.base_models import Cell, OcrCell, Page
11
+ from docling.datamodel.base_models import Page
11
12
  from docling.datamodel.document import ConversionResult
12
13
  from docling.datamodel.pipeline_options import (
13
14
  AcceleratorDevice,
14
15
  AcceleratorOptions,
15
16
  EasyOcrOptions,
17
+ OcrOptions,
16
18
  )
17
19
  from docling.datamodel.settings import settings
18
20
  from docling.models.base_ocr_model import BaseOcrModel
@@ -33,7 +35,12 @@ class EasyOcrModel(BaseOcrModel):
33
35
  options: EasyOcrOptions,
34
36
  accelerator_options: AcceleratorOptions,
35
37
  ):
36
- super().__init__(enabled=enabled, options=options)
38
+ super().__init__(
39
+ enabled=enabled,
40
+ artifacts_path=artifacts_path,
41
+ options=options,
42
+ accelerator_options=accelerator_options,
43
+ )
37
44
  self.options: EasyOcrOptions
38
45
 
39
46
  self.scale = 3 # multiplier for 72 dpi == 216 dpi.
@@ -148,18 +155,22 @@ class EasyOcrModel(BaseOcrModel):
148
155
  del im
149
156
 
150
157
  cells = [
151
- OcrCell(
152
- id=ix,
158
+ TextCell(
159
+ index=ix,
153
160
  text=line[1],
161
+ orig=line[1],
162
+ from_ocr=True,
154
163
  confidence=line[2],
155
- bbox=BoundingBox.from_tuple(
156
- coord=(
157
- (line[0][0][0] / self.scale) + ocr_rect.l,
158
- (line[0][0][1] / self.scale) + ocr_rect.t,
159
- (line[0][2][0] / self.scale) + ocr_rect.l,
160
- (line[0][2][1] / self.scale) + ocr_rect.t,
161
- ),
162
- origin=CoordOrigin.TOPLEFT,
164
+ rect=BoundingRectangle.from_bounding_box(
165
+ BoundingBox.from_tuple(
166
+ coord=(
167
+ (line[0][0][0] / self.scale) + ocr_rect.l,
168
+ (line[0][0][1] / self.scale) + ocr_rect.t,
169
+ (line[0][2][0] / self.scale) + ocr_rect.l,
170
+ (line[0][2][1] / self.scale) + ocr_rect.t,
171
+ ),
172
+ origin=CoordOrigin.TOPLEFT,
173
+ )
163
174
  ),
164
175
  )
165
176
  for ix, line in enumerate(result)
@@ -175,3 +186,7 @@ class EasyOcrModel(BaseOcrModel):
175
186
  self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
176
187
 
177
188
  yield page
189
+
190
+ @classmethod
191
+ def get_options_type(cls) -> Type[OcrOptions]:
192
+ return EasyOcrOptions
@@ -0,0 +1,27 @@
1
+ import logging
2
+ from functools import lru_cache
3
+
4
+ from docling.models.factories.ocr_factory import OcrFactory
5
+ from docling.models.factories.picture_description_factory import (
6
+ PictureDescriptionFactory,
7
+ )
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
@lru_cache()
def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
    """Build an ``OcrFactory`` and fill it from the installed plugins.

    The function is wrapped in ``lru_cache``, so identical calls reuse the
    factory created by the first one instead of loading plugins again.

    Args:
        allow_external_plugins: Passed through to
            ``OcrFactory.load_from_plugins``.

    Returns:
        The populated factory.
    """
    ocr_factory = OcrFactory()
    ocr_factory.load_from_plugins(allow_external_plugins=allow_external_plugins)
    # Log once per cache miss which engine kinds ended up registered.
    logger.info("Registered ocr engines: %r", ocr_factory.registered_kind)
    return ocr_factory
18
+
19
+
20
@lru_cache()
def get_picture_description_factory(
    allow_external_plugins: bool = False,
) -> PictureDescriptionFactory:
    """Build a ``PictureDescriptionFactory`` and fill it from the installed plugins.

    The function is wrapped in ``lru_cache``, so identical calls reuse the
    factory created by the first one instead of loading plugins again.

    Args:
        allow_external_plugins: Passed through to
            ``PictureDescriptionFactory.load_from_plugins``.

    Returns:
        The populated factory.
    """
    description_factory = PictureDescriptionFactory()
    description_factory.load_from_plugins(
        allow_external_plugins=allow_external_plugins
    )
    # Log once per cache miss which description kinds ended up registered.
    logger.info(
        "Registered picture descriptions: %r", description_factory.registered_kind
    )
    return description_factory
@@ -0,0 +1,122 @@
1
+ import enum
2
+ import logging
3
+ from abc import ABCMeta
4
+ from typing import Generic, Optional, Type, TypeVar
5
+
6
+ from pluggy import PluginManager
7
+ from pydantic import BaseModel
8
+
9
+ from docling.datamodel.pipeline_options import BaseOptions
10
+ from docling.models.base_model import BaseModelWithOptions
11
+
12
+ A = TypeVar("A", bound=BaseModelWithOptions)
13
+
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class FactoryMeta(BaseModel):
    """Provenance record kept by ``BaseFactory`` for each registered class."""

    # ``kind`` of the options class the entry was registered under.
    kind: str
    # Name of the plugin that supplied the entry.
    plugin_name: str
    # Module name of the plugin that supplied the entry.
    module: str
22
+
23
+
24
class BaseFactory(Generic[A], metaclass=ABCMeta):
    """Registry that maps options classes to the model classes consuming them.

    Entries are added directly through ``register`` or discovered from
    plugins via ``load_from_plugins``. Each entry is keyed by the options
    type returned by the model class's ``get_options_type()``.
    """

    default_plugin_name = "docling"

    def __init__(self, plugin_attr_name: str, plugin_name=default_plugin_name):
        """Create an empty factory.

        Args:
            plugin_attr_name: Name of the callable looked up on each plugin
                module; it is also the key read from the mapping that
                callable returns.
            plugin_name: Default name handed to the plugin manager when
                ``load_from_plugins`` is called without one.
        """
        self.plugin_name = plugin_name
        self.plugin_attr_name = plugin_attr_name

        self._classes: dict[Type[BaseOptions], Type[A]] = {}
        self._meta: dict[Type[BaseOptions], FactoryMeta] = {}

    @property
    def registered_kind(self) -> list[str]:
        """Return the ``kind`` of every registered options class."""
        return [opt.kind for opt in self._classes]

    def get_enum(self) -> Type[enum.Enum]:
        """Build a ``str``-based enum class with one member per registered kind.

        The enum is named ``<plugin_attr_name>_enum``; member names and values
        are both the kind string.
        """
        return enum.Enum(
            self.plugin_attr_name + "_enum",
            names={kind: kind for kind in self.registered_kind},
            type=str,
            module=__name__,
        )

    @property
    def classes(self):
        """Return the mapping from options class to registered model class."""
        return self._classes

    @property
    def registered_meta(self):
        """Return the mapping from options class to its ``FactoryMeta``."""
        return self._meta

    def create_instance(self, options: BaseOptions, **kwargs) -> A:
        """Instantiate the model class registered for ``type(options)``.

        Args:
            options: Options instance; its exact type selects the class.
            **kwargs: Extra keyword arguments forwarded to the constructor.

        Returns:
            The new model instance, built with ``options=options``.

        Raises:
            RuntimeError: If no class is registered for ``type(options)``.
        """
        # Only the registry lookup is guarded: a KeyError raised inside the
        # model constructor must propagate as-is rather than be reported as
        # a missing registration.
        try:
            _cls = self._classes[type(options)]
        except KeyError as err:
            raise RuntimeError(
                self._err_msg_on_class_not_found(options.kind)
            ) from err
        return _cls(options=options, **kwargs)

    def create_options(self, kind: str, *args, **kwargs) -> BaseOptions:
        """Instantiate the registered options class whose ``kind`` matches.

        Args:
            kind: Kind string to look for.
            *args: Positional arguments forwarded to the options class.
            **kwargs: Keyword arguments forwarded to the options class.

        Raises:
            RuntimeError: If no registered options class has that kind.
        """
        for opt_cls in self._classes:
            if opt_cls.kind == kind:
                return opt_cls(*args, **kwargs)
        raise RuntimeError(self._err_msg_on_class_not_found(kind))

    def _err_msg_on_class_not_found(self, kind: str) -> str:
        """Format the "not found" message, listing every known kind/class pair."""
        msg_str = "\n".join(
            f"\t{opt.kind!r} => {cls!r}" for opt, cls in self._classes.items()
        )
        return f"No class found with the name {kind!r}, known classes are:\n{msg_str}"

    def register(self, cls: Type[A], plugin_name: str, plugin_module_name: str):
        """Register ``cls`` under the options type it declares.

        Args:
            cls: Model class; ``cls.get_options_type()`` supplies the key.
            plugin_name: Name of the contributing plugin (stored as metadata).
            plugin_module_name: Module name of the contributing plugin
                (stored as metadata).

        Raises:
            ValueError: If the options type is already registered.
        """
        opt_type = cls.get_options_type()

        if opt_type in self._classes:
            raise ValueError(
                f"{opt_type.kind!r} already registered to class {self._classes[opt_type]!r}"
            )

        self._classes[opt_type] = cls
        self._meta[opt_type] = FactoryMeta(
            kind=opt_type.kind, plugin_name=plugin_name, module=plugin_module_name
        )

    def load_from_plugins(
        self, plugin_name: Optional[str] = None, allow_external_plugins: bool = False
    ):
        """Discover plugins through setuptools entry points and register them.

        Args:
            plugin_name: Name handed to the plugin manager and used as the
                entry-point group; falls back to ``self.plugin_name``.
            allow_external_plugins: When false, plugins whose module name
                does not start with ``"docling."`` are skipped with a warning.
        """
        plugin_name = plugin_name or self.plugin_name

        plugin_manager = PluginManager(plugin_name)
        plugin_manager.load_setuptools_entrypoints(plugin_name)

        # A distinct loop name keeps the ``plugin_name`` argument intact.
        for loaded_name, plugin_module in plugin_manager.list_name_plugin():
            plugin_module_name = str(plugin_module.__name__)  # type: ignore

            if not allow_external_plugins and not plugin_module_name.startswith(
                "docling."
            ):
                logger.warning(
                    "The plugin %s will not be loaded because Docling is being "
                    "executed with allow_external_plugins=false.",
                    loaded_name,
                )
                continue

            attr = getattr(plugin_module, self.plugin_attr_name, None)

            # Plugins that do not expose this factory's hook are ignored.
            if callable(attr):
                logger.info("Loading plugin %r", loaded_name)

                config = attr()
                self.process_plugin(config, loaded_name, plugin_module_name)

    def process_plugin(self, config, plugin_name: str, plugin_module_name: str):
        """Register every class listed under ``config[self.plugin_attr_name]``.

        Registration failures signalled by ``ValueError`` are logged as
        warnings and the remaining items are still processed.
        """
        for item in config[self.plugin_attr_name]:
            try:
                self.register(item, plugin_name, plugin_module_name)
            except ValueError:
                logger.warning("%r already registered", item)
@@ -0,0 +1,11 @@
1
+ import logging
2
+
3
+ from docling.models.base_ocr_model import BaseOcrModel
4
+ from docling.models.factories.base_factory import BaseFactory
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+
9
class OcrFactory(BaseFactory[BaseOcrModel]):
    """``BaseFactory`` bound to the ``"ocr_engines"`` plugin attribute."""

    def __init__(self, *args, **kwargs):
        # The attribute name is fixed here; every other argument is passed
        # on to BaseFactory untouched.
        plugin_attr = "ocr_engines"
        super().__init__(plugin_attr, *args, **kwargs)
@@ -0,0 +1,11 @@
1
+ import logging
2
+
3
+ from docling.models.factories.base_factory import BaseFactory
4
+ from docling.models.picture_description_base_model import PictureDescriptionBaseModel
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+
9
class PictureDescriptionFactory(BaseFactory[PictureDescriptionBaseModel]):
    """``BaseFactory`` bound to the ``"picture_description"`` plugin attribute."""

    def __init__(self, *args, **kwargs):
        # The attribute name is fixed here; every other argument is passed
        # on to BaseFactory untouched.
        plugin_attr = "picture_description"
        super().__init__(plugin_attr, *args, **kwargs)
@@ -1,12 +1,19 @@
1
1
  import logging
2
+ import sys
2
3
  import tempfile
3
- from typing import Iterable, Optional, Tuple
4
+ from pathlib import Path
5
+ from typing import Iterable, Optional, Tuple, Type
4
6
 
5
7
  from docling_core.types.doc import BoundingBox, CoordOrigin
8
+ from docling_core.types.doc.page import BoundingRectangle, TextCell
6
9
 
7
- from docling.datamodel.base_models import OcrCell, Page
10
+ from docling.datamodel.base_models import Page
8
11
  from docling.datamodel.document import ConversionResult
9
- from docling.datamodel.pipeline_options import OcrMacOptions
12
+ from docling.datamodel.pipeline_options import (
13
+ AcceleratorOptions,
14
+ OcrMacOptions,
15
+ OcrOptions,
16
+ )
10
17
  from docling.datamodel.settings import settings
11
18
  from docling.models.base_ocr_model import BaseOcrModel
12
19
  from docling.utils.profiling import TimeRecorder
@@ -15,18 +22,31 @@ _log = logging.getLogger(__name__)
15
22
 
16
23
 
17
24
  class OcrMacModel(BaseOcrModel):
18
- def __init__(self, enabled: bool, options: OcrMacOptions):
19
- super().__init__(enabled=enabled, options=options)
25
+ def __init__(
26
+ self,
27
+ enabled: bool,
28
+ artifacts_path: Optional[Path],
29
+ options: OcrMacOptions,
30
+ accelerator_options: AcceleratorOptions,
31
+ ):
32
+ super().__init__(
33
+ enabled=enabled,
34
+ artifacts_path=artifacts_path,
35
+ options=options,
36
+ accelerator_options=accelerator_options,
37
+ )
20
38
  self.options: OcrMacOptions
21
39
 
22
40
  self.scale = 3 # multiplier for 72 dpi == 216 dpi.
23
41
 
24
42
  if self.enabled:
43
+ if "darwin" != sys.platform:
44
+ raise RuntimeError(f"OcrMac is only supported on Mac.")
25
45
  install_errmsg = (
26
46
  "ocrmac is not correctly installed. "
27
47
  "Please install it via `pip install ocrmac` to use this OCR engine. "
28
48
  "Alternatively, Docling has support for other OCR engines. See the documentation: "
29
- "https://ds4sd.github.io/docling/installation/"
49
+ "https://docling-project.github.io/docling/installation/"
30
50
  )
31
51
  try:
32
52
  from ocrmac import ocrmac
@@ -94,13 +114,17 @@ class OcrMacModel(BaseOcrModel):
94
114
  bottom = y2 / self.scale
95
115
 
96
116
  cells.append(
97
- OcrCell(
98
- id=ix,
117
+ TextCell(
118
+ index=ix,
99
119
  text=text,
120
+ orig=text,
121
+ from_ocr=True,
100
122
  confidence=confidence,
101
- bbox=BoundingBox.from_tuple(
102
- coord=(left, top, right, bottom),
103
- origin=CoordOrigin.TOPLEFT,
123
+ rect=BoundingRectangle.from_bounding_box(
124
+ BoundingBox.from_tuple(
125
+ coord=(left, top, right, bottom),
126
+ origin=CoordOrigin.TOPLEFT,
127
+ )
104
128
  ),
105
129
  )
106
130
  )
@@ -116,3 +140,7 @@ class OcrMacModel(BaseOcrModel):
116
140
  self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
117
141
 
118
142
  yield page
143
+
144
+ @classmethod
145
+ def get_options_type(cls) -> Type[OcrOptions]:
146
+ return OcrMacOptions
@@ -13,6 +13,7 @@ from docling.utils.profiling import TimeRecorder
13
13
 
14
14
  class PagePreprocessingOptions(BaseModel):
15
15
  images_scale: Optional[float]
16
+ create_parsed_page: bool
16
17
 
17
18
 
18
19
  class PagePreprocessingModel(BasePageModel):
@@ -55,6 +56,9 @@ class PagePreprocessingModel(BasePageModel):
55
56
 
56
57
  page.cells = list(page._backend.get_text_cells())
57
58
 
59
+ if self.options.create_parsed_page:
60
+ page.parsed_page = page._backend.get_segmented_page()
61
+
58
62
  # DEBUG code:
59
63
  def draw_text_boxes(image, cells, show: bool = False):
60
64
  draw = ImageDraw.Draw(image)
@@ -1,13 +1,18 @@
1
1
  import base64
2
2
  import io
3
3
  import logging
4
- from typing import Iterable, List, Optional
4
+ from pathlib import Path
5
+ from typing import Iterable, List, Optional, Type, Union
5
6
 
6
7
  import requests
7
8
  from PIL import Image
8
9
  from pydantic import BaseModel, ConfigDict
9
10
 
10
- from docling.datamodel.pipeline_options import PictureDescriptionApiOptions
11
+ from docling.datamodel.pipeline_options import (
12
+ AcceleratorOptions,
13
+ PictureDescriptionApiOptions,
14
+ PictureDescriptionBaseOptions,
15
+ )
11
16
  from docling.exceptions import OperationNotAllowed
12
17
  from docling.models.picture_description_base_model import PictureDescriptionBaseModel
13
18
 
@@ -46,13 +51,25 @@ class ApiResponse(BaseModel):
46
51
  class PictureDescriptionApiModel(PictureDescriptionBaseModel):
47
52
  # elements_batch_size = 4
48
53
 
54
+ @classmethod
55
+ def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
56
+ return PictureDescriptionApiOptions
57
+
49
58
  def __init__(
50
59
  self,
51
60
  enabled: bool,
52
61
  enable_remote_services: bool,
62
+ artifacts_path: Optional[Union[Path, str]],
53
63
  options: PictureDescriptionApiOptions,
64
+ accelerator_options: AcceleratorOptions,
54
65
  ):
55
- super().__init__(enabled=enabled, options=options)
66
+ super().__init__(
67
+ enabled=enabled,
68
+ enable_remote_services=enable_remote_services,
69
+ artifacts_path=artifacts_path,
70
+ options=options,
71
+ accelerator_options=accelerator_options,
72
+ )
56
73
  self.options: PictureDescriptionApiOptions
57
74
 
58
75
  if self.enabled:
@@ -1,6 +1,7 @@
1
1
  import logging
2
+ from abc import abstractmethod
2
3
  from pathlib import Path
3
- from typing import Any, Iterable, List, Optional, Union
4
+ from typing import Any, Iterable, List, Optional, Type, Union
4
5
 
5
6
  from docling_core.types.doc import (
6
7
  DoclingDocument,
@@ -13,20 +14,30 @@ from docling_core.types.doc.document import ( # TODO: move import to docling_co
13
14
  )
14
15
  from PIL import Image
15
16
 
16
- from docling.datamodel.pipeline_options import PictureDescriptionBaseOptions
17
+ from docling.datamodel.pipeline_options import (
18
+ AcceleratorOptions,
19
+ PictureDescriptionBaseOptions,
20
+ )
17
21
  from docling.models.base_model import (
18
22
  BaseItemAndImageEnrichmentModel,
23
+ BaseModelWithOptions,
19
24
  ItemAndImageEnrichmentElement,
20
25
  )
21
26
 
22
27
 
23
- class PictureDescriptionBaseModel(BaseItemAndImageEnrichmentModel):
28
+ class PictureDescriptionBaseModel(
29
+ BaseItemAndImageEnrichmentModel, BaseModelWithOptions
30
+ ):
24
31
  images_scale: float = 2.0
25
32
 
26
33
  def __init__(
27
34
  self,
35
+ *,
28
36
  enabled: bool,
37
+ enable_remote_services: bool,
38
+ artifacts_path: Optional[Union[Path, str]],
29
39
  options: PictureDescriptionBaseOptions,
40
+ accelerator_options: AcceleratorOptions,
30
41
  ):
31
42
  self.enabled = enabled
32
43
  self.options = options
@@ -62,3 +73,8 @@ class PictureDescriptionBaseModel(BaseItemAndImageEnrichmentModel):
62
73
  PictureDescriptionData(text=output, provenance=self.provenance)
63
74
  )
64
75
  yield item
76
+
77
+ @classmethod
78
+ @abstractmethod
79
+ def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
80
+ pass
@@ -1,10 +1,11 @@
1
1
  from pathlib import Path
2
- from typing import Iterable, Optional, Union
2
+ from typing import Iterable, Optional, Type, Union
3
3
 
4
4
  from PIL import Image
5
5
 
6
6
  from docling.datamodel.pipeline_options import (
7
7
  AcceleratorOptions,
8
+ PictureDescriptionBaseOptions,
8
9
  PictureDescriptionVlmOptions,
9
10
  )
10
11
  from docling.models.picture_description_base_model import PictureDescriptionBaseModel
@@ -13,14 +14,25 @@ from docling.utils.accelerator_utils import decide_device
13
14
 
14
15
  class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
15
16
 
17
+ @classmethod
18
+ def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
19
+ return PictureDescriptionVlmOptions
20
+
16
21
  def __init__(
17
22
  self,
18
23
  enabled: bool,
24
+ enable_remote_services: bool,
19
25
  artifacts_path: Optional[Union[Path, str]],
20
26
  options: PictureDescriptionVlmOptions,
21
27
  accelerator_options: AcceleratorOptions,
22
28
  ):
23
- super().__init__(enabled=enabled, options=options)
29
+ super().__init__(
30
+ enabled=enabled,
31
+ enable_remote_services=enable_remote_services,
32
+ artifacts_path=artifacts_path,
33
+ options=options,
34
+ accelerator_options=accelerator_options,
35
+ )
24
36
  self.options: PictureDescriptionVlmOptions
25
37
 
26
38
  if self.enabled:
File without changes
@@ -0,0 +1,28 @@
1
+ from docling.models.easyocr_model import EasyOcrModel
2
+ from docling.models.ocr_mac_model import OcrMacModel
3
+ from docling.models.picture_description_api_model import PictureDescriptionApiModel
4
+ from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
5
+ from docling.models.rapid_ocr_model import RapidOcrModel
6
+ from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
7
+ from docling.models.tesseract_ocr_model import TesseractOcrModel
8
+
9
+
10
def ocr_engines():
    """Return the OCR model classes shipped with this package.

    Returns:
        A dict with the single key ``"ocr_engines"`` whose value is the list
        of model classes, in the order they should be registered.
    """
    bundled_engines = [
        EasyOcrModel,
        OcrMacModel,
        RapidOcrModel,
        TesseractOcrModel,
        TesseractOcrCliModel,
    ]
    return {"ocr_engines": bundled_engines}
20
+
21
+
22
def picture_description():
    """Return the picture-description model classes shipped with this package.

    Returns:
        A dict with the single key ``"picture_description"`` whose value is
        the list of model classes, in the order they should be registered.
    """
    bundled_models = [
        PictureDescriptionVlmModel,
        PictureDescriptionApiModel,
    ]
    return {"picture_description": bundled_models}