docling 1.18.0__tar.gz → 1.19.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. {docling-1.18.0 → docling-1.19.0}/PKG-INFO +76 -1
  2. {docling-1.18.0 → docling-1.19.0}/README.md +73 -0
  3. {docling-1.18.0 → docling-1.19.0}/docling/cli/main.py +27 -1
  4. {docling-1.18.0 → docling-1.19.0}/docling/datamodel/base_models.py +4 -1
  5. docling-1.19.0/docling/datamodel/pipeline_options.py +67 -0
  6. {docling-1.18.0 → docling-1.19.0}/docling/document_converter.py +5 -3
  7. {docling-1.18.0 → docling-1.19.0}/docling/models/base_ocr_model.py +4 -4
  8. {docling-1.18.0 → docling-1.19.0}/docling/models/easyocr_model.py +19 -4
  9. docling-1.19.0/docling/models/tesseract_ocr_cli_model.py +167 -0
  10. docling-1.19.0/docling/models/tesseract_ocr_model.py +122 -0
  11. docling-1.19.0/docling/pipeline/standard_model_pipeline.py +66 -0
  12. {docling-1.18.0 → docling-1.19.0}/pyproject.toml +5 -1
  13. docling-1.18.0/docling/datamodel/pipeline_options.py +0 -25
  14. docling-1.18.0/docling/pipeline/standard_model_pipeline.py +0 -39
  15. {docling-1.18.0 → docling-1.19.0}/LICENSE +0 -0
  16. {docling-1.18.0 → docling-1.19.0}/docling/__init__.py +0 -0
  17. {docling-1.18.0 → docling-1.19.0}/docling/backend/__init__.py +0 -0
  18. {docling-1.18.0 → docling-1.19.0}/docling/backend/abstract_backend.py +0 -0
  19. {docling-1.18.0 → docling-1.19.0}/docling/backend/docling_parse_backend.py +0 -0
  20. {docling-1.18.0 → docling-1.19.0}/docling/backend/pypdfium2_backend.py +0 -0
  21. {docling-1.18.0 → docling-1.19.0}/docling/cli/__init__.py +0 -0
  22. {docling-1.18.0 → docling-1.19.0}/docling/datamodel/__init__.py +0 -0
  23. {docling-1.18.0 → docling-1.19.0}/docling/datamodel/document.py +0 -0
  24. {docling-1.18.0 → docling-1.19.0}/docling/datamodel/settings.py +0 -0
  25. {docling-1.18.0 → docling-1.19.0}/docling/models/__init__.py +0 -0
  26. {docling-1.18.0 → docling-1.19.0}/docling/models/ds_glm_model.py +0 -0
  27. {docling-1.18.0 → docling-1.19.0}/docling/models/layout_model.py +0 -0
  28. {docling-1.18.0 → docling-1.19.0}/docling/models/page_assemble_model.py +0 -0
  29. {docling-1.18.0 → docling-1.19.0}/docling/models/table_structure_model.py +0 -0
  30. {docling-1.18.0 → docling-1.19.0}/docling/pipeline/__init__.py +0 -0
  31. {docling-1.18.0 → docling-1.19.0}/docling/pipeline/base_model_pipeline.py +0 -0
  32. {docling-1.18.0 → docling-1.19.0}/docling/utils/__init__.py +0 -0
  33. {docling-1.18.0 → docling-1.19.0}/docling/utils/export.py +0 -0
  34. {docling-1.18.0 → docling-1.19.0}/docling/utils/layout_utils.py +0 -0
  35. {docling-1.18.0 → docling-1.19.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 1.18.0
3
+ Version: 1.19.0
4
4
  Summary: Docling PDF conversion package
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -19,6 +19,7 @@ Classifier: Programming Language :: Python :: 3.10
19
19
  Classifier: Programming Language :: Python :: 3.11
20
20
  Classifier: Programming Language :: Python :: 3.12
21
21
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Provides-Extra: tesserocr
22
23
  Requires-Dist: certifi (>=2024.7.4)
23
24
  Requires-Dist: deepsearch-glm (>=0.22.0,<0.23.0)
24
25
  Requires-Dist: docling-core (>=1.6.2,<2.0.0)
@@ -34,6 +35,7 @@ Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
34
35
  Requires-Dist: requests (>=2.32.3,<3.0.0)
35
36
  Requires-Dist: rtree (>=1.3.0,<2.0.0)
36
37
  Requires-Dist: scipy (>=1.14.1,<2.0.0)
38
+ Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
37
39
  Requires-Dist: torch (>=2.2.2,<2.3.0) ; sys_platform == "darwin" and platform_machine == "x86_64"
38
40
  Requires-Dist: torch (>=2.2.2,<3.0.0) ; sys_platform != "darwin" or platform_machine != "x86_64"
39
41
  Requires-Dist: torchvision (>=0,<1) ; sys_platform != "darwin" or platform_machine != "x86_64"
@@ -96,6 +98,79 @@ Works on macOS, Linux and Windows environments. Both x86_64 and arm64 architectu
96
98
  ```
97
99
  </details>
98
100
 
101
+ <details>
102
+ <summary><b>Alternative OCR engines</b></summary>
103
+
104
+ Docling supports multiple OCR engines for processing scanned documents. The current version provides
105
+ the following engines.
106
+
107
+ | Engine | Installation | Usage |
108
+ | ------ | ------------ | ----- |
109
+ | [EasyOCR](https://github.com/JaidedAI/EasyOCR) | Default in Docling or via `pip install easyocr`. | `EasyOcrOptions` |
110
+ | Tesseract | System dependency. See description for Tesseract and Tesserocr below. | `TesseractOcrOptions` |
111
+ | Tesseract CLI | System dependency. See description below. | `TesseractCliOcrOptions` |
112
+
113
+ The Docling `DocumentConverter` allows choosing the OCR engine via the `ocr_options` setting. For example:
114
+
115
+ ```python
116
+ from docling.datamodel.base_models import ConversionStatus, PipelineOptions
117
+ from docling.datamodel.pipeline_options import PipelineOptions, EasyOcrOptions, TesseractOcrOptions
118
+ from docling.document_converter import DocumentConverter
119
+
120
+ pipeline_options = PipelineOptions()
121
+ pipeline_options.do_ocr = True
122
+ pipeline_options.ocr_options = TesseractOcrOptions() # Use Tesseract
123
+
124
+ doc_converter = DocumentConverter(
125
+ pipeline_options=pipeline_options,
126
+ )
127
+ ```
128
+
129
+ #### Tesseract installation
130
+
131
+ [Tesseract](https://github.com/tesseract-ocr/tesseract) is a popular OCR engine which is available
132
+ on most operating systems. To use this engine with Docling, Tesseract must be installed on your
133
+ system, using the packaging tool of your choice. Below we provide example commands.
134
+ After installing Tesseract you are expected to provide the path to its language files using the
135
+ `TESSDATA_PREFIX` environment variable (note that it must terminate with a slash `/`).
136
+
137
+ For macOS, we recommend using [Homebrew](https://brew.sh/).
138
+
139
+ ```console
140
+ brew install tesseract leptonica pkg-config
141
+ TESSDATA_PREFIX=/opt/homebrew/share/tessdata/
142
+ echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
143
+ ```
144
+
145
+ For Debian-based systems.
146
+
147
+ ```console
148
+ apt-get install tesseract-ocr tesseract-ocr-eng libtesseract-dev libleptonica-dev pkg-config
149
+ TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)
150
+ echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
151
+ ```
152
+
153
+ For RHEL systems.
154
+
155
+ ```console
156
+ dnf install tesseract tesseract-devel tesseract-langpack-eng leptonica-devel
157
+ TESSDATA_PREFIX=/usr/share/tesseract/tessdata/
158
+ echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
159
+ ```
160
+
161
+ #### Linking to Tesseract
162
+ The most efficient way to use the Tesseract library is via linking. Docling uses
163
+ the [Tesserocr](https://github.com/sirfz/tesserocr) package for this.
164
+
165
+ If you run into issues installing Tesserocr, we suggest using the following
166
+ installation options:
167
+
168
+ ```console
169
+ pip uninstall tesserocr
170
+ pip install --no-binary :all: tesserocr
171
+ ```
172
+ </details>
173
+
99
174
  <details>
100
175
  <summary><b>Docling development setup</b></summary>
101
176
 
@@ -52,6 +52,79 @@ Works on macOS, Linux and Windows environments. Both x86_64 and arm64 architectu
52
52
  ```
53
53
  </details>
54
54
 
55
+ <details>
56
+ <summary><b>Alternative OCR engines</b></summary>
57
+
58
+ Docling supports multiple OCR engines for processing scanned documents. The current version provides
59
+ the following engines.
60
+
61
+ | Engine | Installation | Usage |
62
+ | ------ | ------------ | ----- |
63
+ | [EasyOCR](https://github.com/JaidedAI/EasyOCR) | Default in Docling or via `pip install easyocr`. | `EasyOcrOptions` |
64
+ | Tesseract | System dependency. See description for Tesseract and Tesserocr below. | `TesseractOcrOptions` |
65
+ | Tesseract CLI | System dependency. See description below. | `TesseractCliOcrOptions` |
66
+
67
+ The Docling `DocumentConverter` allows choosing the OCR engine via the `ocr_options` setting. For example:
68
+
69
+ ```python
70
+ from docling.datamodel.base_models import ConversionStatus, PipelineOptions
71
+ from docling.datamodel.pipeline_options import PipelineOptions, EasyOcrOptions, TesseractOcrOptions
72
+ from docling.document_converter import DocumentConverter
73
+
74
+ pipeline_options = PipelineOptions()
75
+ pipeline_options.do_ocr = True
76
+ pipeline_options.ocr_options = TesseractOcrOptions() # Use Tesseract
77
+
78
+ doc_converter = DocumentConverter(
79
+ pipeline_options=pipeline_options,
80
+ )
81
+ ```
82
+
83
+ #### Tesseract installation
84
+
85
+ [Tesseract](https://github.com/tesseract-ocr/tesseract) is a popular OCR engine which is available
86
+ on most operating systems. To use this engine with Docling, Tesseract must be installed on your
87
+ system, using the packaging tool of your choice. Below we provide example commands.
88
+ After installing Tesseract you are expected to provide the path to its language files using the
89
+ `TESSDATA_PREFIX` environment variable (note that it must terminate with a slash `/`).
90
+
91
+ For macOS, we recommend using [Homebrew](https://brew.sh/).
92
+
93
+ ```console
94
+ brew install tesseract leptonica pkg-config
95
+ TESSDATA_PREFIX=/opt/homebrew/share/tessdata/
96
+ echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
97
+ ```
98
+
99
+ For Debian-based systems.
100
+
101
+ ```console
102
+ apt-get install tesseract-ocr tesseract-ocr-eng libtesseract-dev libleptonica-dev pkg-config
103
+ TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)
104
+ echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
105
+ ```
106
+
107
+ For RHEL systems.
108
+
109
+ ```console
110
+ dnf install tesseract tesseract-devel tesseract-langpack-eng leptonica-devel
111
+ TESSDATA_PREFIX=/usr/share/tesseract/tessdata/
112
+ echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
113
+ ```
114
+
115
+ #### Linking to Tesseract
116
+ The most efficient way to use the Tesseract library is via linking. Docling uses
117
+ the [Tesserocr](https://github.com/sirfz/tesserocr) package for this.
118
+
119
+ If you run into issues installing Tesserocr, we suggest using the following
120
+ installation options:
121
+
122
+ ```console
123
+ pip uninstall tesserocr
124
+ pip install --no-binary :all: tesserocr
125
+ ```
126
+ </details>
127
+
55
128
  <details>
56
129
  <summary><b>Docling development setup</b></summary>
57
130
 
@@ -14,7 +14,12 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
14
14
  from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
15
15
  from docling.datamodel.base_models import ConversionStatus
16
16
  from docling.datamodel.document import ConversionResult, DocumentConversionInput
17
- from docling.datamodel.pipeline_options import PipelineOptions
17
+ from docling.datamodel.pipeline_options import (
18
+ EasyOcrOptions,
19
+ PipelineOptions,
20
+ TesseractCliOcrOptions,
21
+ TesseractOcrOptions,
22
+ )
18
23
  from docling.document_converter import DocumentConverter
19
24
 
20
25
  warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
@@ -53,6 +58,13 @@ class Backend(str, Enum):
53
58
  DOCLING = "docling"
54
59
 
55
60
 
61
+ # Define an enum for the ocr engines
62
+ class OcrEngine(str, Enum):
63
+ EASYOCR = "easyocr"
64
+ TESSERACT_CLI = "tesseract_cli"
65
+ TESSERACT = "tesseract"
66
+
67
+
56
68
  def export_documents(
57
69
  conv_results: Iterable[ConversionResult],
58
70
  output_dir: Path,
@@ -152,6 +164,9 @@ def convert(
152
164
  backend: Annotated[
153
165
  Backend, typer.Option(..., help="The PDF backend to use.")
154
166
  ] = Backend.DOCLING,
167
+ ocr_engine: Annotated[
168
+ OcrEngine, typer.Option(..., help="The OCR engine to use.")
169
+ ] = OcrEngine.EASYOCR,
155
170
  output: Annotated[
156
171
  Path, typer.Option(..., help="Output directory where results are saved.")
157
172
  ] = Path("."),
@@ -191,8 +206,19 @@ def convert(
191
206
  case _:
192
207
  raise RuntimeError(f"Unexpected backend type {backend}")
193
208
 
209
+ match ocr_engine:
210
+ case OcrEngine.EASYOCR:
211
+ ocr_options = EasyOcrOptions()
212
+ case OcrEngine.TESSERACT_CLI:
213
+ ocr_options = TesseractCliOcrOptions()
214
+ case OcrEngine.TESSERACT:
215
+ ocr_options = TesseractOcrOptions()
216
+ case _:
217
+ raise RuntimeError(f"Unexpected backend type {backend}")
218
+
194
219
  pipeline_options = PipelineOptions(
195
220
  do_ocr=ocr,
221
+ ocr_options=ocr_options,
196
222
  do_table_structure=True,
197
223
  )
198
224
  pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
@@ -110,7 +110,10 @@ class BoundingBox(BaseModel):
110
110
  return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
111
111
 
112
112
  def area(self) -> float:
113
- return (self.r - self.l) * (self.b - self.t)
113
+ area = (self.r - self.l) * (self.b - self.t)
114
+ if self.coord_origin == CoordOrigin.BOTTOMLEFT:
115
+ area = -area
116
+ return area
114
117
 
115
118
  def intersection_area_with(self, other: "BoundingBox") -> float:
116
119
  # Calculate intersection coordinates
@@ -0,0 +1,67 @@
1
+ from enum import Enum, auto
2
+ from typing import List, Literal, Optional, Union
3
+
4
+ from pydantic import BaseModel, ConfigDict, Field
5
+
6
+
7
+ class TableFormerMode(str, Enum):
8
+ FAST = auto()
9
+ ACCURATE = auto()
10
+
11
+
12
+ class TableStructureOptions(BaseModel):
13
+ do_cell_matching: bool = (
14
+ True
15
+ # True: Matches predictions back to PDF cells. Can break table output if PDF cells
16
+ # are merged across table columns.
17
+ # False: Let table structure model define the text cells, ignore PDF cells.
18
+ )
19
+ mode: TableFormerMode = TableFormerMode.FAST
20
+
21
+
22
+ class OcrOptions(BaseModel):
23
+ kind: str
24
+
25
+
26
+ class EasyOcrOptions(OcrOptions):
27
+ kind: Literal["easyocr"] = "easyocr"
28
+ lang: List[str] = ["fr", "de", "es", "en"]
29
+ use_gpu: bool = True # same default as easyocr.Reader
30
+ model_storage_directory: Optional[str] = None
31
+ download_enabled: bool = True # same default as easyocr.Reader
32
+
33
+ model_config = ConfigDict(
34
+ extra="forbid",
35
+ protected_namespaces=(),
36
+ )
37
+
38
+
39
+ class TesseractCliOcrOptions(OcrOptions):
40
+ kind: Literal["tesseract"] = "tesseract"
41
+ lang: List[str] = ["fra", "deu", "spa", "eng"]
42
+ tesseract_cmd: str = "tesseract"
43
+ path: Optional[str] = None
44
+
45
+ model_config = ConfigDict(
46
+ extra="forbid",
47
+ )
48
+
49
+
50
+ class TesseractOcrOptions(OcrOptions):
51
+ kind: Literal["tesserocr"] = "tesserocr"
52
+ lang: List[str] = ["fra", "deu", "spa", "eng"]
53
+ path: Optional[str] = None
54
+
55
+ model_config = ConfigDict(
56
+ extra="forbid",
57
+ )
58
+
59
+
60
+ class PipelineOptions(BaseModel):
61
+ do_table_structure: bool = True # True: perform table structure extraction
62
+ do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
63
+
64
+ table_structure_options: TableStructureOptions = TableStructureOptions()
65
+ ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = (
66
+ Field(EasyOcrOptions(), discriminator="kind")
67
+ )
@@ -199,9 +199,6 @@ class DocumentConverter:
199
199
  end_pb_time = time.time() - start_pb_time
200
200
  _log.info(f"Finished converting page batch time={end_pb_time:.3f}")
201
201
 
202
- # Free up mem resources of PDF backend
203
- in_doc._backend.unload()
204
-
205
202
  conv_res.pages = all_assembled_pages
206
203
  self._assemble_doc(conv_res)
207
204
 
@@ -227,6 +224,11 @@ class DocumentConverter:
227
224
  f"{trace}"
228
225
  )
229
226
 
227
+ finally:
228
+ # Always unload the PDF backend, even in case of failure
229
+ if in_doc._backend:
230
+ in_doc._backend.unload()
231
+
230
232
  end_doc_time = time.time() - start_doc_time
231
233
  _log.info(
232
234
  f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}"
@@ -3,21 +3,21 @@ import logging
3
3
  from abc import abstractmethod
4
4
  from typing import Iterable, List, Tuple
5
5
 
6
- import numpy
7
6
  import numpy as np
8
7
  from PIL import Image, ImageDraw
9
8
  from rtree import index
10
9
  from scipy.ndimage import find_objects, label
11
10
 
12
11
  from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
12
+ from docling.datamodel.pipeline_options import OcrOptions
13
13
 
14
14
  _log = logging.getLogger(__name__)
15
15
 
16
16
 
17
17
  class BaseOcrModel:
18
- def __init__(self, config):
19
- self.config = config
20
- self.enabled = config["enabled"]
18
+ def __init__(self, enabled: bool, options: OcrOptions):
19
+ self.enabled = enabled
20
+ self.options = options
21
21
 
22
22
  # Computes the optimum amount and coordinates of rectangles to OCR on a given page
23
23
  def get_ocr_rects(self, page: Page) -> Tuple[bool, List[BoundingBox]]:
@@ -4,21 +4,33 @@ from typing import Iterable
4
4
  import numpy
5
5
 
6
6
  from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
7
+ from docling.datamodel.pipeline_options import EasyOcrOptions
7
8
  from docling.models.base_ocr_model import BaseOcrModel
8
9
 
9
10
  _log = logging.getLogger(__name__)
10
11
 
11
12
 
12
13
  class EasyOcrModel(BaseOcrModel):
13
- def __init__(self, config):
14
- super().__init__(config)
14
+ def __init__(self, enabled: bool, options: EasyOcrOptions):
15
+ super().__init__(enabled=enabled, options=options)
16
+ self.options: EasyOcrOptions
15
17
 
16
18
  self.scale = 3 # multiplier for 72 dpi == 216 dpi.
17
19
 
18
20
  if self.enabled:
19
- import easyocr
21
+ try:
22
+ import easyocr
23
+ except ImportError:
24
+ raise ImportError(
25
+ "EasyOCR is not installed. Please install it via `pip install easyocr` to use this OCR engine. "
26
+ "Alternatively, Docling has support for other OCR engines. See the documentation."
27
+ )
20
28
 
21
- self.reader = easyocr.Reader(config["lang"])
29
+ self.reader = easyocr.Reader(
30
+ lang_list=self.options.lang,
31
+ model_storage_directory=self.options.model_storage_directory,
32
+ download_enabled=self.options.download_enabled,
33
+ )
22
34
 
23
35
  def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
24
36
 
@@ -31,6 +43,9 @@ class EasyOcrModel(BaseOcrModel):
31
43
 
32
44
  all_ocr_cells = []
33
45
  for ocr_rect in ocr_rects:
46
+ # Skip zero area boxes
47
+ if ocr_rect.area() == 0:
48
+ continue
34
49
  high_res_image = page._backend.get_page_image(
35
50
  scale=self.scale, cropbox=ocr_rect
36
51
  )
@@ -0,0 +1,167 @@
1
+ import io
2
+ import logging
3
+ import tempfile
4
+ from subprocess import PIPE, Popen
5
+ from typing import Iterable, Tuple
6
+
7
+ import pandas as pd
8
+
9
+ from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
10
+ from docling.datamodel.pipeline_options import TesseractCliOcrOptions
11
+ from docling.models.base_ocr_model import BaseOcrModel
12
+
13
+ _log = logging.getLogger(__name__)
14
+
15
+
16
+ class TesseractOcrCliModel(BaseOcrModel):
17
+
18
+ def __init__(self, enabled: bool, options: TesseractCliOcrOptions):
19
+ super().__init__(enabled=enabled, options=options)
20
+ self.options: TesseractCliOcrOptions
21
+
22
+ self.scale = 3 # multiplier for 72 dpi == 216 dpi.
23
+
24
+ self._name = None
25
+ self._version = None
26
+
27
+ if self.enabled:
28
+ try:
29
+ self._get_name_and_version()
30
+
31
+ except Exception as exc:
32
+ raise RuntimeError(
33
+ f"Tesseract is not available, aborting: {exc} "
34
+ "Install tesseract on your system and the tesseract binary is discoverable. "
35
+ "The actual command for Tesseract can be specified in `pipeline_options.ocr_options.tesseract_cmd='tesseract'`. "
36
+ "Alternatively, Docling has support for other OCR engines. See the documentation."
37
+ )
38
+
39
+ def _get_name_and_version(self) -> Tuple[str, str]:
40
+
41
+ if self._name != None and self._version != None:
42
+ return self._name, self._version
43
+
44
+ cmd = [self.options.tesseract_cmd, "--version"]
45
+
46
+ proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
47
+ stdout, stderr = proc.communicate()
48
+
49
+ proc.wait()
50
+
51
+ # HACK: Windows versions of Tesseract output the version to stdout, Linux versions
52
+ # to stderr, so check both.
53
+ version_line = (
54
+ (stdout.decode("utf8").strip() or stderr.decode("utf8").strip())
55
+ .split("\n")[0]
56
+ .strip()
57
+ )
58
+
59
+ # If everything else fails...
60
+ if not version_line:
61
+ version_line = "tesseract XXX"
62
+
63
+ name, version = version_line.split(" ")
64
+
65
+ self._name = name
66
+ self._version = version
67
+
68
+ return name, version
69
+
70
+ def _run_tesseract(self, ifilename: str):
71
+
72
+ cmd = [self.options.tesseract_cmd]
73
+
74
+ if self.options.lang is not None and len(self.options.lang) > 0:
75
+ cmd.append("-l")
76
+ cmd.append("+".join(self.options.lang))
77
+ if self.options.path is not None:
78
+ cmd.append("--tessdata-dir")
79
+ cmd.append(self.options.path)
80
+
81
+ cmd += [ifilename, "stdout", "tsv"]
82
+ _log.info("command: {}".format(" ".join(cmd)))
83
+
84
+ proc = Popen(cmd, stdout=PIPE)
85
+ output, _ = proc.communicate()
86
+
87
+ # _log.info(output)
88
+
89
+ # Decode the byte string to a regular string
90
+ decoded_data = output.decode("utf-8")
91
+ # _log.info(decoded_data)
92
+
93
+ # Read the TSV file generated by Tesseract
94
+ df = pd.read_csv(io.StringIO(decoded_data), sep="\t")
95
+
96
+ # Display the dataframe (optional)
97
+ # _log.info("df: ", df.head())
98
+
99
+ # Filter rows that contain actual text (ignore header or empty rows)
100
+ df_filtered = df[df["text"].notnull() & (df["text"].str.strip() != "")]
101
+
102
+ return df_filtered
103
+
104
+ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
105
+
106
+ if not self.enabled:
107
+ yield from page_batch
108
+ return
109
+
110
+ for page in page_batch:
111
+ ocr_rects = self.get_ocr_rects(page)
112
+
113
+ all_ocr_cells = []
114
+ for ocr_rect in ocr_rects:
115
+ # Skip zero area boxes
116
+ if ocr_rect.area() == 0:
117
+ continue
118
+ high_res_image = page._backend.get_page_image(
119
+ scale=self.scale, cropbox=ocr_rect
120
+ )
121
+
122
+ with tempfile.NamedTemporaryFile(suffix=".png", mode="w") as image_file:
123
+ fname = image_file.name
124
+ high_res_image.save(fname)
125
+
126
+ df = self._run_tesseract(fname)
127
+
128
+ # _log.info(df)
129
+
130
+ # Print relevant columns (bounding box and text)
131
+ for ix, row in df.iterrows():
132
+ text = row["text"]
133
+ conf = row["conf"]
134
+
135
+ l = float(row["left"])
136
+ b = float(row["top"])
137
+ w = float(row["width"])
138
+ h = float(row["height"])
139
+
140
+ t = b + h
141
+ r = l + w
142
+
143
+ cell = OcrCell(
144
+ id=ix,
145
+ text=text,
146
+ confidence=conf / 100.0,
147
+ bbox=BoundingBox.from_tuple(
148
+ coord=(
149
+ (l / self.scale) + ocr_rect.l,
150
+ (b / self.scale) + ocr_rect.t,
151
+ (r / self.scale) + ocr_rect.l,
152
+ (t / self.scale) + ocr_rect.t,
153
+ ),
154
+ origin=CoordOrigin.TOPLEFT,
155
+ ),
156
+ )
157
+ all_ocr_cells.append(cell)
158
+
159
+ ## Remove OCR cells which overlap with programmatic cells.
160
+ filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
161
+
162
+ page.cells.extend(filtered_ocr_cells)
163
+
164
+ # DEBUG code:
165
+ # self.draw_ocr_rects_and_cells(page, ocr_rects)
166
+
167
+ yield page
@@ -0,0 +1,122 @@
1
+ import logging
2
+ from typing import Iterable
3
+
4
+ import numpy
5
+
6
+ from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
7
+ from docling.datamodel.pipeline_options import TesseractCliOcrOptions
8
+ from docling.models.base_ocr_model import BaseOcrModel
9
+
10
+ _log = logging.getLogger(__name__)
11
+
12
+
13
+ class TesseractOcrModel(BaseOcrModel):
14
+ def __init__(self, enabled: bool, options: TesseractCliOcrOptions):
15
+ super().__init__(enabled=enabled, options=options)
16
+ self.options: TesseractCliOcrOptions
17
+
18
+ self.scale = 3 # multiplier for 72 dpi == 216 dpi.
19
+ self.reader = None
20
+
21
+ if self.enabled:
22
+ setup_errmsg = (
23
+ "tesserocr is not correctly installed. "
24
+ "Please install it via `pip install tesserocr` to use this OCR engine. "
25
+ "Note that tesserocr might have to be manually compiled for working with"
26
+ "your Tesseract installation. The Docling documentation provides examples for it. "
27
+ "Alternatively, Docling has support for other OCR engines. See the documentation."
28
+ )
29
+ try:
30
+ import tesserocr
31
+ except ImportError:
32
+ raise ImportError(setup_errmsg)
33
+
34
+ try:
35
+ tesseract_version = tesserocr.tesseract_version()
36
+ _log.debug("Initializing TesserOCR: %s", tesseract_version)
37
+ except:
38
+ raise ImportError(setup_errmsg)
39
+
40
+ # Initialize the tesseractAPI
41
+ lang = "+".join(self.options.lang)
42
+ if self.options.path is not None:
43
+ self.reader = tesserocr.PyTessBaseAPI(
44
+ path=self.options.path,
45
+ lang=lang,
46
+ psm=tesserocr.PSM.AUTO,
47
+ init=True,
48
+ oem=tesserocr.OEM.DEFAULT,
49
+ )
50
+ else:
51
+ self.reader = tesserocr.PyTessBaseAPI(
52
+ lang=lang,
53
+ psm=tesserocr.PSM.AUTO,
54
+ init=True,
55
+ oem=tesserocr.OEM.DEFAULT,
56
+ )
57
+ self.reader_RIL = tesserocr.RIL
58
+
59
+ def __del__(self):
60
+ if self.reader is not None:
61
+ # Finalize the tesseractAPI
62
+ self.reader.End()
63
+
64
+ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
65
+
66
+ if not self.enabled:
67
+ yield from page_batch
68
+ return
69
+
70
+ for page in page_batch:
71
+ ocr_rects = self.get_ocr_rects(page)
72
+
73
+ all_ocr_cells = []
74
+ for ocr_rect in ocr_rects:
75
+ # Skip zero area boxes
76
+ if ocr_rect.area() == 0:
77
+ continue
78
+ high_res_image = page._backend.get_page_image(
79
+ scale=self.scale, cropbox=ocr_rect
80
+ )
81
+
82
+ # Retrieve text snippets with their bounding boxes
83
+ self.reader.SetImage(high_res_image)
84
+ boxes = self.reader.GetComponentImages(self.reader_RIL.TEXTLINE, True)
85
+
86
+ cells = []
87
+ for ix, (im, box, _, _) in enumerate(boxes):
88
+ # Set the area of interest. Tesseract uses Bottom-Left for the origin
89
+ self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
90
+
91
+ # Extract text within the bounding box
92
+ text = self.reader.GetUTF8Text().strip()
93
+ confidence = self.reader.MeanTextConf()
94
+ left = box["x"] / self.scale
95
+ bottom = box["y"] / self.scale
96
+ right = (box["x"] + box["w"]) / self.scale
97
+ top = (box["y"] + box["h"]) / self.scale
98
+
99
+ cells.append(
100
+ OcrCell(
101
+ id=ix,
102
+ text=text,
103
+ confidence=confidence,
104
+ bbox=BoundingBox.from_tuple(
105
+ coord=(left, top, right, bottom),
106
+ origin=CoordOrigin.TOPLEFT,
107
+ ),
108
+ )
109
+ )
110
+
111
+ # del high_res_image
112
+ all_ocr_cells.extend(cells)
113
+
114
+ ## Remove OCR cells which overlap with programmatic cells.
115
+ filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
116
+
117
+ page.cells.extend(filtered_ocr_cells)
118
+
119
+ # DEBUG code:
120
+ # self.draw_ocr_rects_and_cells(page, ocr_rects)
121
+
122
+ yield page
@@ -0,0 +1,66 @@
1
+ from pathlib import Path
2
+
3
+ from docling.datamodel.pipeline_options import (
4
+ EasyOcrOptions,
5
+ PipelineOptions,
6
+ TesseractCliOcrOptions,
7
+ TesseractOcrOptions,
8
+ )
9
+ from docling.models.base_ocr_model import BaseOcrModel
10
+ from docling.models.easyocr_model import EasyOcrModel
11
+ from docling.models.layout_model import LayoutModel
12
+ from docling.models.table_structure_model import TableStructureModel
13
+ from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
14
+ from docling.models.tesseract_ocr_model import TesseractOcrModel
15
+ from docling.pipeline.base_model_pipeline import BaseModelPipeline
16
+
17
+
18
+ class StandardModelPipeline(BaseModelPipeline):
19
+ _layout_model_path = "model_artifacts/layout/beehive_v0.0.5_pt"
20
+ _table_model_path = "model_artifacts/tableformer"
21
+
22
+ def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
23
+ super().__init__(artifacts_path, pipeline_options)
24
+
25
+ ocr_model: BaseOcrModel
26
+ if isinstance(pipeline_options.ocr_options, EasyOcrOptions):
27
+ ocr_model = EasyOcrModel(
28
+ enabled=pipeline_options.do_ocr,
29
+ options=pipeline_options.ocr_options,
30
+ )
31
+ elif isinstance(pipeline_options.ocr_options, TesseractCliOcrOptions):
32
+ ocr_model = TesseractOcrCliModel(
33
+ enabled=pipeline_options.do_ocr,
34
+ options=pipeline_options.ocr_options,
35
+ )
36
+ elif isinstance(pipeline_options.ocr_options, TesseractOcrOptions):
37
+ ocr_model = TesseractOcrModel(
38
+ enabled=pipeline_options.do_ocr,
39
+ options=pipeline_options.ocr_options,
40
+ )
41
+ else:
42
+ raise RuntimeError(
43
+ f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
44
+ )
45
+
46
+ self.model_pipe = [
47
+ # OCR
48
+ ocr_model,
49
+ # Layout
50
+ LayoutModel(
51
+ config={
52
+ "artifacts_path": artifacts_path
53
+ / StandardModelPipeline._layout_model_path
54
+ }
55
+ ),
56
+ # Table structure
57
+ TableStructureModel(
58
+ config={
59
+ "artifacts_path": artifacts_path
60
+ / StandardModelPipeline._table_model_path,
61
+ "enabled": pipeline_options.do_table_structure,
62
+ "mode": pipeline_options.table_structure_options.mode,
63
+ "do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
64
+ }
65
+ ),
66
+ ]
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "1.18.0" # DO NOT EDIT, updated automatically
3
+ version = "1.19.0" # DO NOT EDIT, updated automatically
4
4
  description = "Docling PDF conversion package"
5
5
  authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
6
6
  license = "MIT"
@@ -46,6 +46,7 @@ pydantic-settings = "^2.3.0"
46
46
  huggingface_hub = ">=0.23,<1"
47
47
  requests = "^2.32.3"
48
48
  easyocr = "^1.7"
49
+ tesserocr = { version = "^2.7.1", optional = true }
49
50
  docling-parse = "^1.4.1"
50
51
  certifi = ">=2024.7.4"
51
52
  rtree = "^1.3.0"
@@ -81,6 +82,9 @@ langchain-huggingface = "^0.0.3"
81
82
  langchain-milvus = "^0.1.4"
82
83
  langchain-text-splitters = "^0.2.4"
83
84
 
85
+ [tool.poetry.extras]
86
+ tesserocr = ["tesserocr"]
87
+
84
88
  [tool.poetry.scripts]
85
89
  docling = "docling.cli.main:app"
86
90
 
@@ -1,25 +0,0 @@
1
- from enum import Enum, auto
2
-
3
- from pydantic import BaseModel
4
-
5
-
6
- class TableFormerMode(str, Enum):
7
- FAST = auto()
8
- ACCURATE = auto()
9
-
10
-
11
- class TableStructureOptions(BaseModel):
12
- do_cell_matching: bool = (
13
- True
14
- # True: Matches predictions back to PDF cells. Can break table output if PDF cells
15
- # are merged across table columns.
16
- # False: Let table structure model define the text cells, ignore PDF cells.
17
- )
18
- mode: TableFormerMode = TableFormerMode.FAST
19
-
20
-
21
- class PipelineOptions(BaseModel):
22
- do_table_structure: bool = True # True: perform table structure extraction
23
- do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
24
-
25
- table_structure_options: TableStructureOptions = TableStructureOptions()
@@ -1,39 +0,0 @@
1
- from pathlib import Path
2
-
3
- from docling.datamodel.pipeline_options import PipelineOptions
4
- from docling.models.easyocr_model import EasyOcrModel
5
- from docling.models.layout_model import LayoutModel
6
- from docling.models.table_structure_model import TableStructureModel
7
- from docling.pipeline.base_model_pipeline import BaseModelPipeline
8
-
9
-
10
- class StandardModelPipeline(BaseModelPipeline):
11
- _layout_model_path = "model_artifacts/layout/beehive_v0.0.5_pt"
12
- _table_model_path = "model_artifacts/tableformer"
13
-
14
- def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
15
- super().__init__(artifacts_path, pipeline_options)
16
-
17
- self.model_pipe = [
18
- EasyOcrModel(
19
- config={
20
- "lang": ["fr", "de", "es", "en"],
21
- "enabled": pipeline_options.do_ocr,
22
- }
23
- ),
24
- LayoutModel(
25
- config={
26
- "artifacts_path": artifacts_path
27
- / StandardModelPipeline._layout_model_path
28
- }
29
- ),
30
- TableStructureModel(
31
- config={
32
- "artifacts_path": artifacts_path
33
- / StandardModelPipeline._table_model_path,
34
- "enabled": pipeline_options.do_table_structure,
35
- "mode": pipeline_options.table_structure_options.mode,
36
- "do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
37
- }
38
- ),
39
- ]
File without changes
File without changes