docling 2.26.0__py3-none-any.whl → 2.28.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. docling/backend/asciidoc_backend.py +1 -1
  2. docling/backend/csv_backend.py +1 -1
  3. docling/backend/docling_parse_backend.py +21 -13
  4. docling/backend/docling_parse_v2_backend.py +20 -12
  5. docling/backend/docling_parse_v4_backend.py +192 -0
  6. docling/backend/docx/__init__.py +0 -0
  7. docling/backend/docx/latex/__init__.py +0 -0
  8. docling/backend/docx/latex/latex_dict.py +271 -0
  9. docling/backend/docx/latex/omml.py +453 -0
  10. docling/backend/html_backend.py +7 -7
  11. docling/backend/md_backend.py +1 -1
  12. docling/backend/msexcel_backend.py +2 -45
  13. docling/backend/mspowerpoint_backend.py +19 -1
  14. docling/backend/msword_backend.py +68 -3
  15. docling/backend/pdf_backend.py +7 -2
  16. docling/backend/pypdfium2_backend.py +52 -30
  17. docling/backend/xml/uspto_backend.py +1 -1
  18. docling/cli/main.py +135 -53
  19. docling/cli/models.py +1 -1
  20. docling/datamodel/base_models.py +8 -10
  21. docling/datamodel/pipeline_options.py +54 -32
  22. docling/document_converter.py +5 -5
  23. docling/models/base_model.py +9 -1
  24. docling/models/base_ocr_model.py +27 -16
  25. docling/models/easyocr_model.py +28 -13
  26. docling/models/factories/__init__.py +27 -0
  27. docling/models/factories/base_factory.py +122 -0
  28. docling/models/factories/ocr_factory.py +11 -0
  29. docling/models/factories/picture_description_factory.py +11 -0
  30. docling/models/hf_mlx_model.py +137 -0
  31. docling/models/ocr_mac_model.py +39 -11
  32. docling/models/page_preprocessing_model.py +4 -0
  33. docling/models/picture_description_api_model.py +20 -3
  34. docling/models/picture_description_base_model.py +19 -3
  35. docling/models/picture_description_vlm_model.py +14 -2
  36. docling/models/plugins/__init__.py +0 -0
  37. docling/models/plugins/defaults.py +28 -0
  38. docling/models/rapid_ocr_model.py +34 -13
  39. docling/models/table_structure_model.py +13 -4
  40. docling/models/tesseract_ocr_cli_model.py +40 -15
  41. docling/models/tesseract_ocr_model.py +37 -12
  42. docling/pipeline/standard_pdf_pipeline.py +25 -78
  43. docling/pipeline/vlm_pipeline.py +78 -398
  44. docling/utils/export.py +8 -6
  45. docling/utils/layout_postprocessor.py +26 -23
  46. docling/utils/visualization.py +1 -1
  47. {docling-2.26.0.dist-info → docling-2.28.0.dist-info}/METADATA +47 -23
  48. docling-2.28.0.dist-info/RECORD +84 -0
  49. {docling-2.26.0.dist-info → docling-2.28.0.dist-info}/entry_points.txt +3 -0
  50. docling-2.26.0.dist-info/RECORD +0 -72
  51. {docling-2.26.0.dist-info → docling-2.28.0.dist-info}/LICENSE +0 -0
  52. {docling-2.26.0.dist-info → docling-2.28.0.dist-info}/WHEEL +0 -0
@@ -0,0 +1,137 @@
1
+ import logging
2
+ import time
3
+ from pathlib import Path
4
+ from typing import Iterable, List, Optional
5
+
6
+ from docling.datamodel.base_models import Page, VlmPrediction
7
+ from docling.datamodel.document import ConversionResult
8
+ from docling.datamodel.pipeline_options import (
9
+ AcceleratorDevice,
10
+ AcceleratorOptions,
11
+ HuggingFaceVlmOptions,
12
+ )
13
+ from docling.datamodel.settings import settings
14
+ from docling.models.base_model import BasePageModel
15
+ from docling.utils.accelerator_utils import decide_device
16
+ from docling.utils.profiling import TimeRecorder
17
+
18
+ _log = logging.getLogger(__name__)
19
+
20
+
21
class HuggingFaceMlxModel(BasePageModel):
    """Page model that runs a Hugging Face vision-language model via ``mlx-vlm``.

    For every valid page in a batch, a page image is rendered, the model is
    prompted with ``vlm_options.prompt`` and the generated text is stored in
    ``page.predictions.vlm_response``.
    """

    # Closing tag after which generation is stopped early.
    _END_OF_DOCUMENT_TAG = "</doctag>"
    # Upper bound on the number of tokens generated per page.
    _MAX_TOKENS = 4096

    def __init__(
        self,
        enabled: bool,
        artifacts_path: Optional[Path],
        accelerator_options: AcceleratorOptions,
        vlm_options: HuggingFaceVlmOptions,
    ):
        """Load the MLX model and processor when the model is enabled.

        Args:
            enabled: When False, nothing is imported or loaded.
            artifacts_path: Directory with already downloaded model files.
                ``None`` triggers a download of ``vlm_options.repo_id``. If the
                directory contains a sub-folder named after the repo id (with
                "/" replaced by "--"), that sub-folder is used instead.
            accelerator_options: Currently not used by this model.
            vlm_options: Supplies the repo id and the prompt.

        Raises:
            ImportError: If the model is enabled but ``mlx-vlm`` is missing.
        """
        self.enabled = enabled
        self.vlm_options = vlm_options

        if not self.enabled:
            return

        try:
            from mlx_vlm import load  # type: ignore
            from mlx_vlm.prompt_utils import apply_chat_template  # type: ignore
            from mlx_vlm.utils import load_config, stream_generate  # type: ignore
        except ImportError as err:
            raise ImportError(
                "mlx-vlm is not installed. Please install it via `pip install mlx-vlm` to use MLX VLM models."
            ) from err

        repo_cache_folder = vlm_options.repo_id.replace("/", "--")
        self.apply_chat_template = apply_chat_template
        self.stream_generate = stream_generate

        if artifacts_path is None:
            artifacts_path = self.download_models(self.vlm_options.repo_id)
        elif (artifacts_path / repo_cache_folder).exists():
            artifacts_path = artifacts_path / repo_cache_folder

        self.param_question = vlm_options.prompt  # e.g. "Perform Layout Analysis."

        # Load the model, its processor and its configuration.
        self.vlm_model, self.processor = load(artifacts_path)
        self.config = load_config(artifacts_path)

    @staticmethod
    def download_models(
        repo_id: str,
        local_dir: Optional[Path] = None,
        force: bool = False,
        progress: bool = False,
    ) -> Path:
        """Download a model snapshot from the Hugging Face Hub.

        Args:
            repo_id: Repository to download.
            local_dir: Forwarded to ``snapshot_download`` as ``local_dir``.
            force: Forwarded to ``snapshot_download`` as ``force_download``.
            progress: When False, Hugging Face progress bars are disabled
                before downloading.

        Returns:
            The path returned by ``snapshot_download``, as a ``Path``.
        """
        from huggingface_hub import snapshot_download
        from huggingface_hub.utils import disable_progress_bars

        if not progress:
            disable_progress_bars()
        download_path = snapshot_download(
            repo_id=repo_id,
            force_download=force,
            local_dir=local_dir,
        )

        return Path(download_path)

    def _generate_page_tags(self, image) -> str:
        """Run the model on one page image and return the generated text."""
        if image.mode != "RGB":
            image = image.convert("RGB")

        prompt = self.apply_chat_template(
            self.processor, self.config, self.param_question, num_images=1
        )

        end_tag = self._END_OF_DOCUMENT_TAG
        output = ""
        for token in self.stream_generate(
            self.vlm_model,
            self.processor,
            prompt,
            [image],
            max_tokens=self._MAX_TOKENS,
            verbose=False,
        ):
            output += token.text
            # The closing tag can be split across several tokens, so inspect
            # the tail of the accumulated output instead of only the newest
            # token. The window is just large enough to contain any tag that
            # ends inside the newest token.
            tail_len = len(token.text) + len(end_tag) - 1
            if end_tag in output[-tail_len:]:
                break

        return output

    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        """Attach a VLM prediction to every valid page and yield all pages.

        Pages whose backend is not valid are yielded unchanged. For the other
        pages the generated text is stored in ``page.predictions.vlm_response``;
        the text is empty when no page image could be rendered.
        """
        for page in page_batch:
            assert page._backend is not None
            if not page._backend.is_valid():
                yield page
                continue

            with TimeRecorder(conv_res, "vlm"):
                assert page.size is not None

                hi_res_image = page.get_image(scale=2.0)  # 144dpi

                if hi_res_image is None:
                    # Without an image there is nothing to feed to the model;
                    # record an empty prediction rather than passing None on.
                    _log.warning(
                        "No page image available, skipping VLM generation."
                    )
                    page_tags = ""
                else:
                    page_tags = self._generate_page_tags(hi_res_image)

                page.predictions.vlm_response = VlmPrediction(text=page_tags)

            yield page
@@ -1,12 +1,19 @@
1
1
  import logging
2
+ import sys
2
3
  import tempfile
3
- from typing import Iterable, Optional, Tuple
4
+ from pathlib import Path
5
+ from typing import Iterable, Optional, Tuple, Type
4
6
 
5
7
  from docling_core.types.doc import BoundingBox, CoordOrigin
8
+ from docling_core.types.doc.page import BoundingRectangle, TextCell
6
9
 
7
- from docling.datamodel.base_models import OcrCell, Page
10
+ from docling.datamodel.base_models import Page
8
11
  from docling.datamodel.document import ConversionResult
9
- from docling.datamodel.pipeline_options import OcrMacOptions
12
+ from docling.datamodel.pipeline_options import (
13
+ AcceleratorOptions,
14
+ OcrMacOptions,
15
+ OcrOptions,
16
+ )
10
17
  from docling.datamodel.settings import settings
11
18
  from docling.models.base_ocr_model import BaseOcrModel
12
19
  from docling.utils.profiling import TimeRecorder
@@ -15,18 +22,31 @@ _log = logging.getLogger(__name__)
15
22
 
16
23
 
17
24
  class OcrMacModel(BaseOcrModel):
18
- def __init__(self, enabled: bool, options: OcrMacOptions):
19
- super().__init__(enabled=enabled, options=options)
25
+ def __init__(
26
+ self,
27
+ enabled: bool,
28
+ artifacts_path: Optional[Path],
29
+ options: OcrMacOptions,
30
+ accelerator_options: AcceleratorOptions,
31
+ ):
32
+ super().__init__(
33
+ enabled=enabled,
34
+ artifacts_path=artifacts_path,
35
+ options=options,
36
+ accelerator_options=accelerator_options,
37
+ )
20
38
  self.options: OcrMacOptions
21
39
 
22
40
  self.scale = 3 # multiplier for 72 dpi == 216 dpi.
23
41
 
24
42
  if self.enabled:
43
+ if "darwin" != sys.platform:
44
+ raise RuntimeError(f"OcrMac is only supported on Mac.")
25
45
  install_errmsg = (
26
46
  "ocrmac is not correctly installed. "
27
47
  "Please install it via `pip install ocrmac` to use this OCR engine. "
28
48
  "Alternatively, Docling has support for other OCR engines. See the documentation: "
29
- "https://ds4sd.github.io/docling/installation/"
49
+ "https://docling-project.github.io/docling/installation/"
30
50
  )
31
51
  try:
32
52
  from ocrmac import ocrmac
@@ -94,13 +114,17 @@ class OcrMacModel(BaseOcrModel):
94
114
  bottom = y2 / self.scale
95
115
 
96
116
  cells.append(
97
- OcrCell(
98
- id=ix,
117
+ TextCell(
118
+ index=ix,
99
119
  text=text,
120
+ orig=text,
121
+ from_ocr=True,
100
122
  confidence=confidence,
101
- bbox=BoundingBox.from_tuple(
102
- coord=(left, top, right, bottom),
103
- origin=CoordOrigin.TOPLEFT,
123
+ rect=BoundingRectangle.from_bounding_box(
124
+ BoundingBox.from_tuple(
125
+ coord=(left, top, right, bottom),
126
+ origin=CoordOrigin.TOPLEFT,
127
+ )
104
128
  ),
105
129
  )
106
130
  )
@@ -116,3 +140,7 @@ class OcrMacModel(BaseOcrModel):
116
140
  self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
117
141
 
118
142
  yield page
143
+
144
+ @classmethod
145
+ def get_options_type(cls) -> Type[OcrOptions]:
146
+ return OcrMacOptions
@@ -13,6 +13,7 @@ from docling.utils.profiling import TimeRecorder
13
13
 
14
14
  class PagePreprocessingOptions(BaseModel):
15
15
  images_scale: Optional[float]
16
+ create_parsed_page: bool
16
17
 
17
18
 
18
19
  class PagePreprocessingModel(BasePageModel):
@@ -55,6 +56,9 @@ class PagePreprocessingModel(BasePageModel):
55
56
 
56
57
  page.cells = list(page._backend.get_text_cells())
57
58
 
59
+ if self.options.create_parsed_page:
60
+ page.parsed_page = page._backend.get_segmented_page()
61
+
58
62
  # DEBUG code:
59
63
  def draw_text_boxes(image, cells, show: bool = False):
60
64
  draw = ImageDraw.Draw(image)
@@ -1,13 +1,18 @@
1
1
  import base64
2
2
  import io
3
3
  import logging
4
- from typing import Iterable, List, Optional
4
+ from pathlib import Path
5
+ from typing import Iterable, List, Optional, Type, Union
5
6
 
6
7
  import requests
7
8
  from PIL import Image
8
9
  from pydantic import BaseModel, ConfigDict
9
10
 
10
- from docling.datamodel.pipeline_options import PictureDescriptionApiOptions
11
+ from docling.datamodel.pipeline_options import (
12
+ AcceleratorOptions,
13
+ PictureDescriptionApiOptions,
14
+ PictureDescriptionBaseOptions,
15
+ )
11
16
  from docling.exceptions import OperationNotAllowed
12
17
  from docling.models.picture_description_base_model import PictureDescriptionBaseModel
13
18
 
@@ -46,13 +51,25 @@ class ApiResponse(BaseModel):
46
51
  class PictureDescriptionApiModel(PictureDescriptionBaseModel):
47
52
  # elements_batch_size = 4
48
53
 
54
+ @classmethod
55
+ def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
56
+ return PictureDescriptionApiOptions
57
+
49
58
  def __init__(
50
59
  self,
51
60
  enabled: bool,
52
61
  enable_remote_services: bool,
62
+ artifacts_path: Optional[Union[Path, str]],
53
63
  options: PictureDescriptionApiOptions,
64
+ accelerator_options: AcceleratorOptions,
54
65
  ):
55
- super().__init__(enabled=enabled, options=options)
66
+ super().__init__(
67
+ enabled=enabled,
68
+ enable_remote_services=enable_remote_services,
69
+ artifacts_path=artifacts_path,
70
+ options=options,
71
+ accelerator_options=accelerator_options,
72
+ )
56
73
  self.options: PictureDescriptionApiOptions
57
74
 
58
75
  if self.enabled:
@@ -1,6 +1,7 @@
1
1
  import logging
2
+ from abc import abstractmethod
2
3
  from pathlib import Path
3
- from typing import Any, Iterable, List, Optional, Union
4
+ from typing import Any, Iterable, List, Optional, Type, Union
4
5
 
5
6
  from docling_core.types.doc import (
6
7
  DoclingDocument,
@@ -13,20 +14,30 @@ from docling_core.types.doc.document import ( # TODO: move import to docling_co
13
14
  )
14
15
  from PIL import Image
15
16
 
16
- from docling.datamodel.pipeline_options import PictureDescriptionBaseOptions
17
+ from docling.datamodel.pipeline_options import (
18
+ AcceleratorOptions,
19
+ PictureDescriptionBaseOptions,
20
+ )
17
21
  from docling.models.base_model import (
18
22
  BaseItemAndImageEnrichmentModel,
23
+ BaseModelWithOptions,
19
24
  ItemAndImageEnrichmentElement,
20
25
  )
21
26
 
22
27
 
23
- class PictureDescriptionBaseModel(BaseItemAndImageEnrichmentModel):
28
+ class PictureDescriptionBaseModel(
29
+ BaseItemAndImageEnrichmentModel, BaseModelWithOptions
30
+ ):
24
31
  images_scale: float = 2.0
25
32
 
26
33
  def __init__(
27
34
  self,
35
+ *,
28
36
  enabled: bool,
37
+ enable_remote_services: bool,
38
+ artifacts_path: Optional[Union[Path, str]],
29
39
  options: PictureDescriptionBaseOptions,
40
+ accelerator_options: AcceleratorOptions,
30
41
  ):
31
42
  self.enabled = enabled
32
43
  self.options = options
@@ -62,3 +73,8 @@ class PictureDescriptionBaseModel(BaseItemAndImageEnrichmentModel):
62
73
  PictureDescriptionData(text=output, provenance=self.provenance)
63
74
  )
64
75
  yield item
76
+
77
+ @classmethod
78
+ @abstractmethod
79
+ def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
80
+ pass
@@ -1,10 +1,11 @@
1
1
  from pathlib import Path
2
- from typing import Iterable, Optional, Union
2
+ from typing import Iterable, Optional, Type, Union
3
3
 
4
4
  from PIL import Image
5
5
 
6
6
  from docling.datamodel.pipeline_options import (
7
7
  AcceleratorOptions,
8
+ PictureDescriptionBaseOptions,
8
9
  PictureDescriptionVlmOptions,
9
10
  )
10
11
  from docling.models.picture_description_base_model import PictureDescriptionBaseModel
@@ -13,14 +14,25 @@ from docling.utils.accelerator_utils import decide_device
13
14
 
14
15
  class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
15
16
 
17
+ @classmethod
18
+ def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
19
+ return PictureDescriptionVlmOptions
20
+
16
21
  def __init__(
17
22
  self,
18
23
  enabled: bool,
24
+ enable_remote_services: bool,
19
25
  artifacts_path: Optional[Union[Path, str]],
20
26
  options: PictureDescriptionVlmOptions,
21
27
  accelerator_options: AcceleratorOptions,
22
28
  ):
23
- super().__init__(enabled=enabled, options=options)
29
+ super().__init__(
30
+ enabled=enabled,
31
+ enable_remote_services=enable_remote_services,
32
+ artifacts_path=artifacts_path,
33
+ options=options,
34
+ accelerator_options=accelerator_options,
35
+ )
24
36
  self.options: PictureDescriptionVlmOptions
25
37
 
26
38
  if self.enabled:
File without changes
@@ -0,0 +1,28 @@
1
+ from docling.models.easyocr_model import EasyOcrModel
2
+ from docling.models.ocr_mac_model import OcrMacModel
3
+ from docling.models.picture_description_api_model import PictureDescriptionApiModel
4
+ from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
5
+ from docling.models.rapid_ocr_model import RapidOcrModel
6
+ from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
7
+ from docling.models.tesseract_ocr_model import TesseractOcrModel
8
+
9
+
10
def ocr_engines():
    """Return the built-in OCR model classes under the "ocr_engines" key."""
    engine_classes = [
        EasyOcrModel,
        OcrMacModel,
        RapidOcrModel,
        TesseractOcrModel,
        TesseractOcrCliModel,
    ]
    return dict(ocr_engines=engine_classes)
20
+
21
+
22
def picture_description():
    """Return the built-in picture description model classes under the "picture_description" key."""
    description_classes = [
        PictureDescriptionVlmModel,
        PictureDescriptionApiModel,
    ]
    return dict(picture_description=description_classes)
@@ -1,14 +1,17 @@
1
1
  import logging
2
- from typing import Iterable
2
+ from pathlib import Path
3
+ from typing import Iterable, Optional, Type
3
4
 
4
5
  import numpy
5
6
  from docling_core.types.doc import BoundingBox, CoordOrigin
7
+ from docling_core.types.doc.page import BoundingRectangle, TextCell
6
8
 
7
- from docling.datamodel.base_models import OcrCell, Page
9
+ from docling.datamodel.base_models import Page
8
10
  from docling.datamodel.document import ConversionResult
9
11
  from docling.datamodel.pipeline_options import (
10
12
  AcceleratorDevice,
11
13
  AcceleratorOptions,
14
+ OcrOptions,
12
15
  RapidOcrOptions,
13
16
  )
14
17
  from docling.datamodel.settings import settings
@@ -23,10 +26,16 @@ class RapidOcrModel(BaseOcrModel):
23
26
  def __init__(
24
27
  self,
25
28
  enabled: bool,
29
+ artifacts_path: Optional[Path],
26
30
  options: RapidOcrOptions,
27
31
  accelerator_options: AcceleratorOptions,
28
32
  ):
29
- super().__init__(enabled=enabled, options=options)
33
+ super().__init__(
34
+ enabled=enabled,
35
+ artifacts_path=artifacts_path,
36
+ options=options,
37
+ accelerator_options=accelerator_options,
38
+ )
30
39
  self.options: RapidOcrOptions
31
40
 
32
41
  self.scale = 3 # multiplier for 72 dpi == 216 dpi.
@@ -100,18 +109,26 @@ class RapidOcrModel(BaseOcrModel):
100
109
 
101
110
  if result is not None:
102
111
  cells = [
103
- OcrCell(
104
- id=ix,
112
+ TextCell(
113
+ index=ix,
105
114
  text=line[1],
115
+ orig=line[1],
106
116
  confidence=line[2],
107
- bbox=BoundingBox.from_tuple(
108
- coord=(
109
- (line[0][0][0] / self.scale) + ocr_rect.l,
110
- (line[0][0][1] / self.scale) + ocr_rect.t,
111
- (line[0][2][0] / self.scale) + ocr_rect.l,
112
- (line[0][2][1] / self.scale) + ocr_rect.t,
113
- ),
114
- origin=CoordOrigin.TOPLEFT,
117
+ from_ocr=True,
118
+ rect=BoundingRectangle.from_bounding_box(
119
+ BoundingBox.from_tuple(
120
+ coord=(
121
+ (line[0][0][0] / self.scale)
122
+ + ocr_rect.l,
123
+ (line[0][0][1] / self.scale)
124
+ + ocr_rect.t,
125
+ (line[0][2][0] / self.scale)
126
+ + ocr_rect.l,
127
+ (line[0][2][1] / self.scale)
128
+ + ocr_rect.t,
129
+ ),
130
+ origin=CoordOrigin.TOPLEFT,
131
+ )
115
132
  ),
116
133
  )
117
134
  for ix, line in enumerate(result)
@@ -126,3 +143,7 @@ class RapidOcrModel(BaseOcrModel):
126
143
  self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
127
144
 
128
145
  yield page
146
+
147
+ @classmethod
148
+ def get_options_type(cls) -> Type[OcrOptions]:
149
+ return RapidOcrOptions
@@ -5,6 +5,7 @@ from typing import Iterable, Optional, Union
5
5
 
6
6
  import numpy
7
7
  from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
8
+ from docling_core.types.doc.page import BoundingRectangle
8
9
  from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
9
10
  from PIL import ImageDraw
10
11
 
@@ -129,7 +130,7 @@ class TableStructureModel(BasePageModel):
129
130
  draw.rectangle([(x0, y0), (x1, y1)], outline="red")
130
131
 
131
132
  for cell in table_element.cluster.cells:
132
- x0, y0, x1, y1 = cell.bbox.as_tuple()
133
+ x0, y0, x1, y1 = cell.rect.to_bounding_box().as_tuple()
133
134
  x0 *= scale_x
134
135
  x1 *= scale_x
135
136
  y0 *= scale_x
@@ -223,11 +224,19 @@ class TableStructureModel(BasePageModel):
223
224
  # Only allow non empty stings (spaces) into the cells of a table
224
225
  if len(c.text.strip()) > 0:
225
226
  new_cell = copy.deepcopy(c)
226
- new_cell.bbox = new_cell.bbox.scaled(
227
- scale=self.scale
227
+ new_cell.rect = BoundingRectangle.from_bounding_box(
228
+ new_cell.rect.to_bounding_box().scaled(
229
+ scale=self.scale
230
+ )
228
231
  )
229
232
 
230
- tokens.append(new_cell.model_dump())
233
+ tokens.append(
234
+ {
235
+ "id": new_cell.index,
236
+ "text": new_cell.text,
237
+ "bbox": new_cell.rect.to_bounding_box().model_dump(),
238
+ }
239
+ )
231
240
  page_input["tokens"] = tokens
232
241
 
233
242
  tf_output = self.tf_predictor.multi_table_predict(
@@ -3,15 +3,21 @@ import io
3
3
  import logging
4
4
  import os
5
5
  import tempfile
6
+ from pathlib import Path
6
7
  from subprocess import DEVNULL, PIPE, Popen
7
- from typing import Iterable, List, Optional, Tuple
8
+ from typing import Iterable, List, Optional, Tuple, Type
8
9
 
9
10
  import pandas as pd
10
11
  from docling_core.types.doc import BoundingBox, CoordOrigin
12
+ from docling_core.types.doc.page import BoundingRectangle, TextCell
11
13
 
12
- from docling.datamodel.base_models import Cell, OcrCell, Page
14
+ from docling.datamodel.base_models import Page
13
15
  from docling.datamodel.document import ConversionResult
14
- from docling.datamodel.pipeline_options import TesseractCliOcrOptions
16
+ from docling.datamodel.pipeline_options import (
17
+ AcceleratorOptions,
18
+ OcrOptions,
19
+ TesseractCliOcrOptions,
20
+ )
15
21
  from docling.datamodel.settings import settings
16
22
  from docling.models.base_ocr_model import BaseOcrModel
17
23
  from docling.utils.ocr_utils import map_tesseract_script
@@ -21,8 +27,19 @@ _log = logging.getLogger(__name__)
21
27
 
22
28
 
23
29
  class TesseractOcrCliModel(BaseOcrModel):
24
- def __init__(self, enabled: bool, options: TesseractCliOcrOptions):
25
- super().__init__(enabled=enabled, options=options)
30
+ def __init__(
31
+ self,
32
+ enabled: bool,
33
+ artifacts_path: Optional[Path],
34
+ options: TesseractCliOcrOptions,
35
+ accelerator_options: AcceleratorOptions,
36
+ ):
37
+ super().__init__(
38
+ enabled=enabled,
39
+ artifacts_path=artifacts_path,
40
+ options=options,
41
+ accelerator_options=accelerator_options,
42
+ )
26
43
  self.options: TesseractCliOcrOptions
27
44
 
28
45
  self.scale = 3 # multiplier for 72 dpi == 216 dpi.
@@ -228,18 +245,22 @@ class TesseractOcrCliModel(BaseOcrModel):
228
245
  t = b + h
229
246
  r = l + w
230
247
 
231
- cell = OcrCell(
232
- id=ix,
248
+ cell = TextCell(
249
+ index=ix,
233
250
  text=text,
251
+ orig=text,
252
+ from_ocr=True,
234
253
  confidence=conf / 100.0,
235
- bbox=BoundingBox.from_tuple(
236
- coord=(
237
- (l / self.scale) + ocr_rect.l,
238
- (b / self.scale) + ocr_rect.t,
239
- (r / self.scale) + ocr_rect.l,
240
- (t / self.scale) + ocr_rect.t,
241
- ),
242
- origin=CoordOrigin.TOPLEFT,
254
+ rect=BoundingRectangle.from_bounding_box(
255
+ BoundingBox.from_tuple(
256
+ coord=(
257
+ (l / self.scale) + ocr_rect.l,
258
+ (b / self.scale) + ocr_rect.t,
259
+ (r / self.scale) + ocr_rect.l,
260
+ (t / self.scale) + ocr_rect.t,
261
+ ),
262
+ origin=CoordOrigin.TOPLEFT,
263
+ )
243
264
  ),
244
265
  )
245
266
  all_ocr_cells.append(cell)
@@ -252,3 +273,7 @@ class TesseractOcrCliModel(BaseOcrModel):
252
273
  self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
253
274
 
254
275
  yield page
276
+
277
+ @classmethod
278
+ def get_options_type(cls) -> Type[OcrOptions]:
279
+ return TesseractCliOcrOptions