docling 1.18.0__tar.gz → 1.19.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-1.18.0 → docling-1.19.0}/PKG-INFO +76 -1
- {docling-1.18.0 → docling-1.19.0}/README.md +73 -0
- {docling-1.18.0 → docling-1.19.0}/docling/cli/main.py +27 -1
- {docling-1.18.0 → docling-1.19.0}/docling/datamodel/base_models.py +4 -1
- docling-1.19.0/docling/datamodel/pipeline_options.py +67 -0
- {docling-1.18.0 → docling-1.19.0}/docling/document_converter.py +5 -3
- {docling-1.18.0 → docling-1.19.0}/docling/models/base_ocr_model.py +4 -4
- {docling-1.18.0 → docling-1.19.0}/docling/models/easyocr_model.py +19 -4
- docling-1.19.0/docling/models/tesseract_ocr_cli_model.py +167 -0
- docling-1.19.0/docling/models/tesseract_ocr_model.py +122 -0
- docling-1.19.0/docling/pipeline/standard_model_pipeline.py +66 -0
- {docling-1.18.0 → docling-1.19.0}/pyproject.toml +5 -1
- docling-1.18.0/docling/datamodel/pipeline_options.py +0 -25
- docling-1.18.0/docling/pipeline/standard_model_pipeline.py +0 -39
- {docling-1.18.0 → docling-1.19.0}/LICENSE +0 -0
- {docling-1.18.0 → docling-1.19.0}/docling/__init__.py +0 -0
- {docling-1.18.0 → docling-1.19.0}/docling/backend/__init__.py +0 -0
- {docling-1.18.0 → docling-1.19.0}/docling/backend/abstract_backend.py +0 -0
- {docling-1.18.0 → docling-1.19.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-1.18.0 → docling-1.19.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-1.18.0 → docling-1.19.0}/docling/cli/__init__.py +0 -0
- {docling-1.18.0 → docling-1.19.0}/docling/datamodel/__init__.py +0 -0
- {docling-1.18.0 → docling-1.19.0}/docling/datamodel/document.py +0 -0
- {docling-1.18.0 → docling-1.19.0}/docling/datamodel/settings.py +0 -0
- {docling-1.18.0 → docling-1.19.0}/docling/models/__init__.py +0 -0
- {docling-1.18.0 → docling-1.19.0}/docling/models/ds_glm_model.py +0 -0
- {docling-1.18.0 → docling-1.19.0}/docling/models/layout_model.py +0 -0
- {docling-1.18.0 → docling-1.19.0}/docling/models/page_assemble_model.py +0 -0
- {docling-1.18.0 → docling-1.19.0}/docling/models/table_structure_model.py +0 -0
- {docling-1.18.0 → docling-1.19.0}/docling/pipeline/__init__.py +0 -0
- {docling-1.18.0 → docling-1.19.0}/docling/pipeline/base_model_pipeline.py +0 -0
- {docling-1.18.0 → docling-1.19.0}/docling/utils/__init__.py +0 -0
- {docling-1.18.0 → docling-1.19.0}/docling/utils/export.py +0 -0
- {docling-1.18.0 → docling-1.19.0}/docling/utils/layout_utils.py +0 -0
- {docling-1.18.0 → docling-1.19.0}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 1.
|
3
|
+
Version: 1.19.0
|
4
4
|
Summary: Docling PDF conversion package
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -19,6 +19,7 @@ Classifier: Programming Language :: Python :: 3.10
|
|
19
19
|
Classifier: Programming Language :: Python :: 3.11
|
20
20
|
Classifier: Programming Language :: Python :: 3.12
|
21
21
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
22
|
+
Provides-Extra: tesserocr
|
22
23
|
Requires-Dist: certifi (>=2024.7.4)
|
23
24
|
Requires-Dist: deepsearch-glm (>=0.22.0,<0.23.0)
|
24
25
|
Requires-Dist: docling-core (>=1.6.2,<2.0.0)
|
@@ -34,6 +35,7 @@ Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
|
|
34
35
|
Requires-Dist: requests (>=2.32.3,<3.0.0)
|
35
36
|
Requires-Dist: rtree (>=1.3.0,<2.0.0)
|
36
37
|
Requires-Dist: scipy (>=1.14.1,<2.0.0)
|
38
|
+
Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
|
37
39
|
Requires-Dist: torch (>=2.2.2,<2.3.0) ; sys_platform == "darwin" and platform_machine == "x86_64"
|
38
40
|
Requires-Dist: torch (>=2.2.2,<3.0.0) ; sys_platform != "darwin" or platform_machine != "x86_64"
|
39
41
|
Requires-Dist: torchvision (>=0,<1) ; sys_platform != "darwin" or platform_machine != "x86_64"
|
@@ -96,6 +98,79 @@ Works on macOS, Linux and Windows environments. Both x86_64 and arm64 architectu
|
|
96
98
|
```
|
97
99
|
</details>
|
98
100
|
|
101
|
+
<details>
|
102
|
+
<summary><b>Alternative OCR engines</b></summary>
|
103
|
+
|
104
|
+
Docling supports multiple OCR engines for processing scanned documents. The current version provides
|
105
|
+
the following engines.
|
106
|
+
|
107
|
+
| Engine | Installation | Usage |
|
108
|
+
| ------ | ------------ | ----- |
|
109
|
+
| [EasyOCR](https://github.com/JaidedAI/EasyOCR) | Default in Docling or via `pip install easyocr`. | `EasyOcrOptions` |
|
110
|
+
| Tesseract | System dependency. See description for Tesseract and Tesserocr below. | `TesseractOcrOptions` |
|
111
|
+
| Tesseract CLI | System dependency. See description below. | `TesseractCliOcrOptions` |
|
112
|
+
|
113
|
+
The Docling `DocumentConverter` lets you choose the OCR engine through the `ocr_options` setting. For example:
|
114
|
+
|
115
|
+
```python
|
116
|
+
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
|
117
|
+
from docling.datamodel.pipeline_options import PipelineOptions, EasyOcrOptions, TesseractOcrOptions
|
118
|
+
from docling.document_converter import DocumentConverter
|
119
|
+
|
120
|
+
pipeline_options = PipelineOptions()
|
121
|
+
pipeline_options.do_ocr = True
|
122
|
+
pipeline_options.ocr_options = TesseractOcrOptions() # Use Tesseract
|
123
|
+
|
124
|
+
doc_converter = DocumentConverter(
|
125
|
+
pipeline_options=pipeline_options,
|
126
|
+
)
|
127
|
+
```
|
128
|
+
|
129
|
+
#### Tesseract installation
|
130
|
+
|
131
|
+
[Tesseract](https://github.com/tesseract-ocr/tesseract) is a popular OCR engine which is available
|
132
|
+
on most operating systems. For using this engine with Docling, Tesseract must be installed on your
|
133
|
+
system, using the packaging tool of your choice. Below we provide example commands.
|
134
|
+
After installing Tesseract you are expected to provide the path to its language files using the
|
135
|
+
`TESSDATA_PREFIX` environment variable (note that it must terminate with a slash `/`).
|
136
|
+
|
137
|
+
For macOS, we recommend using [Homebrew](https://brew.sh/).
|
138
|
+
|
139
|
+
```console
|
140
|
+
brew install tesseract leptonica pkg-config
|
141
|
+
TESSDATA_PREFIX=/opt/homebrew/share/tessdata/
|
142
|
+
echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
|
143
|
+
```
|
144
|
+
|
145
|
+
For Debian-based systems.
|
146
|
+
|
147
|
+
```console
|
148
|
+
apt-get install tesseract-ocr tesseract-ocr-eng libtesseract-dev libleptonica-dev pkg-config
|
149
|
+
TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)
|
150
|
+
echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
|
151
|
+
```
|
152
|
+
|
153
|
+
For RHEL systems.
|
154
|
+
|
155
|
+
```console
|
156
|
+
dnf install tesseract tesseract-devel tesseract-langpack-eng leptonica-devel
|
157
|
+
TESSDATA_PREFIX=/usr/share/tesseract/tessdata/
|
158
|
+
echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
|
159
|
+
```
|
160
|
+
|
161
|
+
#### Linking to Tesseract
|
162
|
+
The most efficient usage of the Tesseract library is via linking. Docling is using
|
163
|
+
the [Tesserocr](https://github.com/sirfz/tesserocr) package for this.
|
164
|
+
|
165
|
+
If you run into installation issues with Tesserocr, we suggest using the following
|
166
|
+
installation options:
|
167
|
+
|
168
|
+
```console
|
169
|
+
pip uninstall tesserocr
|
170
|
+
pip install --no-binary :all: tesserocr
|
171
|
+
```
|
172
|
+
</details>
|
173
|
+
|
99
174
|
<details>
|
100
175
|
<summary><b>Docling development setup</b></summary>
|
101
176
|
|
@@ -52,6 +52,79 @@ Works on macOS, Linux and Windows environments. Both x86_64 and arm64 architectu
|
|
52
52
|
```
|
53
53
|
</details>
|
54
54
|
|
55
|
+
<details>
|
56
|
+
<summary><b>Alternative OCR engines</b></summary>
|
57
|
+
|
58
|
+
Docling supports multiple OCR engines for processing scanned documents. The current version provides
|
59
|
+
the following engines.
|
60
|
+
|
61
|
+
| Engine | Installation | Usage |
|
62
|
+
| ------ | ------------ | ----- |
|
63
|
+
| [EasyOCR](https://github.com/JaidedAI/EasyOCR) | Default in Docling or via `pip install easyocr`. | `EasyOcrOptions` |
|
64
|
+
| Tesseract | System dependency. See description for Tesseract and Tesserocr below. | `TesseractOcrOptions` |
|
65
|
+
| Tesseract CLI | System dependency. See description below. | `TesseractCliOcrOptions` |
|
66
|
+
|
67
|
+
The Docling `DocumentConverter` lets you choose the OCR engine through the `ocr_options` setting. For example:
|
68
|
+
|
69
|
+
```python
|
70
|
+
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
|
71
|
+
from docling.datamodel.pipeline_options import PipelineOptions, EasyOcrOptions, TesseractOcrOptions
|
72
|
+
from docling.document_converter import DocumentConverter
|
73
|
+
|
74
|
+
pipeline_options = PipelineOptions()
|
75
|
+
pipeline_options.do_ocr = True
|
76
|
+
pipeline_options.ocr_options = TesseractOcrOptions() # Use Tesseract
|
77
|
+
|
78
|
+
doc_converter = DocumentConverter(
|
79
|
+
pipeline_options=pipeline_options,
|
80
|
+
)
|
81
|
+
```
|
82
|
+
|
83
|
+
#### Tesseract installation
|
84
|
+
|
85
|
+
[Tesseract](https://github.com/tesseract-ocr/tesseract) is a popular OCR engine which is available
|
86
|
+
on most operating systems. For using this engine with Docling, Tesseract must be installed on your
|
87
|
+
system, using the packaging tool of your choice. Below we provide example commands.
|
88
|
+
After installing Tesseract you are expected to provide the path to its language files using the
|
89
|
+
`TESSDATA_PREFIX` environment variable (note that it must terminate with a slash `/`).
|
90
|
+
|
91
|
+
For macOS, we recommend using [Homebrew](https://brew.sh/).
|
92
|
+
|
93
|
+
```console
|
94
|
+
brew install tesseract leptonica pkg-config
|
95
|
+
TESSDATA_PREFIX=/opt/homebrew/share/tessdata/
|
96
|
+
echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
|
97
|
+
```
|
98
|
+
|
99
|
+
For Debian-based systems.
|
100
|
+
|
101
|
+
```console
|
102
|
+
apt-get install tesseract-ocr tesseract-ocr-eng libtesseract-dev libleptonica-dev pkg-config
|
103
|
+
TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)
|
104
|
+
echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
|
105
|
+
```
|
106
|
+
|
107
|
+
For RHEL systems.
|
108
|
+
|
109
|
+
```console
|
110
|
+
dnf install tesseract tesseract-devel tesseract-langpack-eng leptonica-devel
|
111
|
+
TESSDATA_PREFIX=/usr/share/tesseract/tessdata/
|
112
|
+
echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
|
113
|
+
```
|
114
|
+
|
115
|
+
#### Linking to Tesseract
|
116
|
+
The most efficient usage of the Tesseract library is via linking. Docling is using
|
117
|
+
the [Tesserocr](https://github.com/sirfz/tesserocr) package for this.
|
118
|
+
|
119
|
+
If you run into installation issues with Tesserocr, we suggest using the following
|
120
|
+
installation options:
|
121
|
+
|
122
|
+
```console
|
123
|
+
pip uninstall tesserocr
|
124
|
+
pip install --no-binary :all: tesserocr
|
125
|
+
```
|
126
|
+
</details>
|
127
|
+
|
55
128
|
<details>
|
56
129
|
<summary><b>Docling development setup</b></summary>
|
57
130
|
|
@@ -14,7 +14,12 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
|
14
14
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
15
15
|
from docling.datamodel.base_models import ConversionStatus
|
16
16
|
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
17
|
-
from docling.datamodel.pipeline_options import
|
17
|
+
from docling.datamodel.pipeline_options import (
|
18
|
+
EasyOcrOptions,
|
19
|
+
PipelineOptions,
|
20
|
+
TesseractCliOcrOptions,
|
21
|
+
TesseractOcrOptions,
|
22
|
+
)
|
18
23
|
from docling.document_converter import DocumentConverter
|
19
24
|
|
20
25
|
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
@@ -53,6 +58,13 @@ class Backend(str, Enum):
|
|
53
58
|
DOCLING = "docling"
|
54
59
|
|
55
60
|
|
61
|
+
# Define an enum for the ocr engines
|
62
|
+
class OcrEngine(str, Enum):
    """OCR engines that can be selected with the ``ocr_engine`` CLI option.

    Every member is also a ``str``, so it compares equal to its lowercase
    value (for example ``OcrEngine.EASYOCR == "easyocr"``).
    """

    EASYOCR = "easyocr"
    TESSERACT_CLI = "tesseract_cli"
    TESSERACT = "tesseract"
|
66
|
+
|
67
|
+
|
56
68
|
def export_documents(
|
57
69
|
conv_results: Iterable[ConversionResult],
|
58
70
|
output_dir: Path,
|
@@ -152,6 +164,9 @@ def convert(
|
|
152
164
|
backend: Annotated[
|
153
165
|
Backend, typer.Option(..., help="The PDF backend to use.")
|
154
166
|
] = Backend.DOCLING,
|
167
|
+
ocr_engine: Annotated[
|
168
|
+
OcrEngine, typer.Option(..., help="The OCR engine to use.")
|
169
|
+
] = OcrEngine.EASYOCR,
|
155
170
|
output: Annotated[
|
156
171
|
Path, typer.Option(..., help="Output directory where results are saved.")
|
157
172
|
] = Path("."),
|
@@ -191,8 +206,19 @@ def convert(
|
|
191
206
|
case _:
|
192
207
|
raise RuntimeError(f"Unexpected backend type {backend}")
|
193
208
|
|
209
|
+
match ocr_engine:
|
210
|
+
case OcrEngine.EASYOCR:
|
211
|
+
ocr_options = EasyOcrOptions()
|
212
|
+
case OcrEngine.TESSERACT_CLI:
|
213
|
+
ocr_options = TesseractCliOcrOptions()
|
214
|
+
case OcrEngine.TESSERACT:
|
215
|
+
ocr_options = TesseractOcrOptions()
|
216
|
+
case _:
|
217
|
+
raise RuntimeError(f"Unexpected backend type {backend}")
|
218
|
+
|
194
219
|
pipeline_options = PipelineOptions(
|
195
220
|
do_ocr=ocr,
|
221
|
+
ocr_options=ocr_options,
|
196
222
|
do_table_structure=True,
|
197
223
|
)
|
198
224
|
pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
|
@@ -110,7 +110,10 @@ class BoundingBox(BaseModel):
|
|
110
110
|
return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
|
111
111
|
|
112
112
|
def area(self) -> float:
    """Return the signed area ``(r - l) * (b - t)`` of this box.

    The product is negated when the box uses a bottom-left coordinate
    origin.
    """
    signed_area = (self.r - self.l) * (self.b - self.t)
    if self.coord_origin == CoordOrigin.BOTTOMLEFT:
        return -signed_area
    return signed_area
|
114
117
|
|
115
118
|
def intersection_area_with(self, other: "BoundingBox") -> float:
|
116
119
|
# Calculate intersection coordinates
|
@@ -0,0 +1,67 @@
|
|
1
|
+
from enum import Enum, auto
|
2
|
+
from typing import List, Literal, Optional, Union
|
3
|
+
|
4
|
+
from pydantic import BaseModel, ConfigDict, Field
|
5
|
+
|
6
|
+
|
7
|
+
class TableFormerMode(str, Enum):
    """Modes selectable through ``TableStructureOptions.mode``.

    Member values are generated with ``auto()``; only the member names are
    meaningful to callers.
    """

    FAST = auto()
    ACCURATE = auto()
|
10
|
+
|
11
|
+
|
12
|
+
class TableStructureOptions(BaseModel):
    """Settings for the table structure extraction step."""

    # True: match predictions back to PDF cells. Can break table output if
    #       PDF cells are merged across table columns.
    # False: let the table structure model define the text cells and ignore
    #        the PDF cells.
    do_cell_matching: bool = True

    # Defaults to the FAST mode.
    mode: TableFormerMode = TableFormerMode.FAST
|
20
|
+
|
21
|
+
|
22
|
+
class OcrOptions(BaseModel):
    """Common base class for the per-engine OCR option models."""

    # Engine identifier. The concrete option classes pin it to a literal,
    # and PipelineOptions.ocr_options uses it as the union discriminator.
    kind: str
|
24
|
+
|
25
|
+
|
26
|
+
class EasyOcrOptions(OcrOptions):
    """Options for the EasyOCR engine."""

    # Unknown fields are rejected. protected_namespaces is emptied,
    # presumably so the `model_storage_directory` field name does not clash
    # with pydantic's reserved `model_` prefix -- TODO confirm.
    model_config = ConfigDict(
        extra="forbid",
        protected_namespaces=(),
    )

    kind: Literal["easyocr"] = "easyocr"
    lang: List[str] = ["fr", "de", "es", "en"]
    use_gpu: bool = True  # same default as easyocr.Reader
    model_storage_directory: Optional[str] = None
    download_enabled: bool = True  # same default as easyocr.Reader
|
37
|
+
|
38
|
+
|
39
|
+
class TesseractCliOcrOptions(OcrOptions):
    """Options for running Tesseract through its command-line program."""

    # Unknown fields are rejected.
    model_config = ConfigDict(
        extra="forbid",
    )

    kind: Literal["tesseract"] = "tesseract"
    # Language codes; the CLI model joins them with "+" for the `-l` flag.
    lang: List[str] = ["fra", "deu", "spa", "eng"]
    # Executable to invoke.
    tesseract_cmd: str = "tesseract"
    # When set, passed to the program as `--tessdata-dir`.
    path: Optional[str] = None
|
48
|
+
|
49
|
+
|
50
|
+
class TesseractOcrOptions(OcrOptions):
    """Options for the Tesseract engine used through the tesserocr binding."""

    # Unknown fields are rejected.
    model_config = ConfigDict(
        extra="forbid",
    )

    kind: Literal["tesserocr"] = "tesserocr"
    # Language codes; the model joins them with "+" before initialising the API.
    lang: List[str] = ["fra", "deu", "spa", "eng"]
    # When set, forwarded as the `path` argument of the tesserocr API object.
    path: Optional[str] = None
|
58
|
+
|
59
|
+
|
60
|
+
class PipelineOptions(BaseModel):
    """Top-level switches and per-step options of the conversion pipeline."""

    do_table_structure: bool = True  # True: perform table structure extraction
    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text

    table_structure_options: TableStructureOptions = TableStructureOptions()

    # The concrete options class is selected by its `kind` field; EasyOCR is
    # the default engine.
    ocr_options: Union[
        EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions
    ] = Field(default=EasyOcrOptions(), discriminator="kind")
|
@@ -199,9 +199,6 @@ class DocumentConverter:
|
|
199
199
|
end_pb_time = time.time() - start_pb_time
|
200
200
|
_log.info(f"Finished converting page batch time={end_pb_time:.3f}")
|
201
201
|
|
202
|
-
# Free up mem resources of PDF backend
|
203
|
-
in_doc._backend.unload()
|
204
|
-
|
205
202
|
conv_res.pages = all_assembled_pages
|
206
203
|
self._assemble_doc(conv_res)
|
207
204
|
|
@@ -227,6 +224,11 @@ class DocumentConverter:
|
|
227
224
|
f"{trace}"
|
228
225
|
)
|
229
226
|
|
227
|
+
finally:
|
228
|
+
# Always unload the PDF backend, even in case of failure
|
229
|
+
if in_doc._backend:
|
230
|
+
in_doc._backend.unload()
|
231
|
+
|
230
232
|
end_doc_time = time.time() - start_doc_time
|
231
233
|
_log.info(
|
232
234
|
f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}"
|
@@ -3,21 +3,21 @@ import logging
|
|
3
3
|
from abc import abstractmethod
|
4
4
|
from typing import Iterable, List, Tuple
|
5
5
|
|
6
|
-
import numpy
|
7
6
|
import numpy as np
|
8
7
|
from PIL import Image, ImageDraw
|
9
8
|
from rtree import index
|
10
9
|
from scipy.ndimage import find_objects, label
|
11
10
|
|
12
11
|
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
|
12
|
+
from docling.datamodel.pipeline_options import OcrOptions
|
13
13
|
|
14
14
|
_log = logging.getLogger(__name__)
|
15
15
|
|
16
16
|
|
17
17
|
class BaseOcrModel:
|
18
|
-
def __init__(self,
|
19
|
-
self.
|
20
|
-
self.
|
18
|
+
def __init__(self, enabled: bool, options: OcrOptions):
|
19
|
+
self.enabled = enabled
|
20
|
+
self.options = options
|
21
21
|
|
22
22
|
# Computes the optimum amount and coordinates of rectangles to OCR on a given page
|
23
23
|
def get_ocr_rects(self, page: Page) -> Tuple[bool, List[BoundingBox]]:
|
@@ -4,21 +4,33 @@ from typing import Iterable
|
|
4
4
|
import numpy
|
5
5
|
|
6
6
|
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
|
7
|
+
from docling.datamodel.pipeline_options import EasyOcrOptions
|
7
8
|
from docling.models.base_ocr_model import BaseOcrModel
|
8
9
|
|
9
10
|
_log = logging.getLogger(__name__)
|
10
11
|
|
11
12
|
|
12
13
|
class EasyOcrModel(BaseOcrModel):
|
13
|
-
def __init__(self,
|
14
|
-
super().__init__(
|
14
|
+
def __init__(self, enabled: bool, options: EasyOcrOptions):
|
15
|
+
super().__init__(enabled=enabled, options=options)
|
16
|
+
self.options: EasyOcrOptions
|
15
17
|
|
16
18
|
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
17
19
|
|
18
20
|
if self.enabled:
|
19
|
-
|
21
|
+
try:
|
22
|
+
import easyocr
|
23
|
+
except ImportError:
|
24
|
+
raise ImportError(
|
25
|
+
"EasyOCR is not installed. Please install it via `pip install easyocr` to use this OCR engine. "
|
26
|
+
"Alternatively, Docling has support for other OCR engines. See the documentation."
|
27
|
+
)
|
20
28
|
|
21
|
-
self.reader = easyocr.Reader(
|
29
|
+
self.reader = easyocr.Reader(
|
30
|
+
lang_list=self.options.lang,
|
31
|
+
model_storage_directory=self.options.model_storage_directory,
|
32
|
+
download_enabled=self.options.download_enabled,
|
33
|
+
)
|
22
34
|
|
23
35
|
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
24
36
|
|
@@ -31,6 +43,9 @@ class EasyOcrModel(BaseOcrModel):
|
|
31
43
|
|
32
44
|
all_ocr_cells = []
|
33
45
|
for ocr_rect in ocr_rects:
|
46
|
+
# Skip zero area boxes
|
47
|
+
if ocr_rect.area() == 0:
|
48
|
+
continue
|
34
49
|
high_res_image = page._backend.get_page_image(
|
35
50
|
scale=self.scale, cropbox=ocr_rect
|
36
51
|
)
|
@@ -0,0 +1,167 @@
|
|
1
|
+
import io
|
2
|
+
import logging
|
3
|
+
import tempfile
|
4
|
+
from subprocess import PIPE, Popen
|
5
|
+
from typing import Iterable, Tuple
|
6
|
+
|
7
|
+
import pandas as pd
|
8
|
+
|
9
|
+
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
|
10
|
+
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
|
11
|
+
from docling.models.base_ocr_model import BaseOcrModel
|
12
|
+
|
13
|
+
_log = logging.getLogger(__name__)
|
14
|
+
|
15
|
+
|
16
|
+
class TesseractOcrCliModel(BaseOcrModel):
    """OCR model that shells out to the Tesseract command-line program.

    Each OCR rectangle of a page is rendered at ``self.scale`` times the base
    resolution, written to a temporary PNG file and passed to
    ``<tesseract_cmd> ... stdout tsv``. The TSV output is parsed with pandas
    and turned into ``OcrCell`` objects in page coordinates.
    """

    def __init__(self, enabled: bool, options: TesseractCliOcrOptions):
        """Store the options and, when enabled, verify Tesseract can be run.

        Raises:
            RuntimeError: if OCR is enabled and querying the Tesseract
                version fails for any reason.
        """
        super().__init__(enabled=enabled, options=options)
        self.options: TesseractCliOcrOptions

        self.scale = 3  # multiplier for 72 dpi == 216 dpi.

        # Filled lazily by _get_name_and_version().
        self._name = None
        self._version = None

        if self.enabled:
            try:
                self._get_name_and_version()

            except Exception as exc:
                raise RuntimeError(
                    f"Tesseract is not available, aborting: {exc} "
                    "Install tesseract on your system and the tesseract binary is discoverable. "
                    "The actual command for Tesseract can be specified in `pipeline_options.ocr_options.tesseract_cmd='tesseract'`. "
                    "Alternatively, Docling has support for other OCR engines. See the documentation."
                ) from exc

    def _get_name_and_version(self) -> Tuple[str, str]:
        """Return ``(name, version)`` as reported by ``<tesseract_cmd> --version``.

        The result is cached on the instance after the first successful call.

        Raises:
            ValueError: if the first output line has fewer than two
                whitespace-separated tokens.
        """
        if self._name is not None and self._version is not None:
            return self._name, self._version

        cmd = [self.options.tesseract_cmd, "--version"]

        proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
        # communicate() already waits for the process to terminate.
        stdout, stderr = proc.communicate()

        # HACK: Windows versions of Tesseract output the version to stdout, Linux versions
        # to stderr, so check both.
        version_line = (
            (stdout.decode("utf8").strip() or stderr.decode("utf8").strip())
            .split("\n")[0]
            .strip()
        )

        # If everything else fails...
        if not version_line:
            version_line = "tesseract XXX"

        # Split on any whitespace and tolerate trailing tokens instead of
        # requiring exactly one single space in the line.
        tokens = version_line.split()
        if len(tokens) < 2:
            raise ValueError(
                f"Unexpected output of `{' '.join(cmd)}`: {version_line!r}"
            )
        name, version = tokens[0], tokens[1]

        self._name = name
        self._version = version

        return name, version

    def _run_tesseract(self, ifilename: str):
        """Run Tesseract on the image file and return the recognised rows.

        Args:
            ifilename: path of the image file handed to Tesseract.

        Returns:
            A pandas DataFrame built from Tesseract's TSV output, restricted
            to rows whose ``text`` column is non-empty after stripping.
        """
        import csv  # local import: only needed to name the quoting mode

        cmd = [self.options.tesseract_cmd]

        if self.options.lang is not None and len(self.options.lang) > 0:
            cmd += ["-l", "+".join(self.options.lang)]
        if self.options.path is not None:
            cmd += ["--tessdata-dir", self.options.path]

        cmd += [ifilename, "stdout", "tsv"]
        _log.info("command: {}".format(" ".join(cmd)))

        proc = Popen(cmd, stdout=PIPE)
        output, _ = proc.communicate()

        # Decode the byte string to a regular string
        decoded_data = output.decode("utf-8")

        # Read the TSV data generated by Tesseract.
        # - QUOTE_NONE: recognised text may contain quote characters, which
        #   must not be interpreted as field quoting.
        # - dtype str for "text": otherwise an all-numeric column is inferred
        #   as a number and the `.str` accessor below fails.
        # - keep_default_na=False: recognised words such as "NA" or "null"
        #   must not be turned into missing values.
        df = pd.read_csv(
            io.StringIO(decoded_data),
            sep="\t",
            quoting=csv.QUOTE_NONE,
            dtype={"text": str},
            keep_default_na=False,
        )

        # Filter rows that contain actual text (ignore header or empty rows)
        has_text = df["text"].notnull() & (df["text"].str.strip() != "")
        return df[has_text]

    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        """Run OCR on every page of the batch and yield the pages.

        When the model is disabled the pages are yielded unchanged.
        Otherwise the OCR cells that survive ``filter_ocr_cells`` are
        appended to ``page.cells``.
        """
        import os  # local import: only needed to build the temp file path

        if not self.enabled:
            yield from page_batch
            return

        for page in page_batch:
            ocr_rects = self.get_ocr_rects(page)

            all_ocr_cells = []
            for ocr_rect in ocr_rects:
                # Skip zero area boxes
                if ocr_rect.area() == 0:
                    continue
                high_res_image = page._backend.get_page_image(
                    scale=self.scale, cropbox=ocr_rect
                )

                # Write the image into a private temporary directory rather
                # than re-opening a NamedTemporaryFile by name while it is
                # still open, which is not possible on Windows.
                with tempfile.TemporaryDirectory() as tmp_dir:
                    fname = os.path.join(tmp_dir, "ocr_rect.png")
                    high_res_image.save(fname)

                    df = self._run_tesseract(fname)

                for ix, row in df.iterrows():
                    text = row["text"]
                    conf = row["conf"]

                    l = float(row["left"])
                    b = float(row["top"])
                    w = float(row["width"])
                    h = float(row["height"])

                    t = b + h
                    r = l + w

                    # Box values are divided by the render scale and shifted
                    # by the rectangle origin to map them back onto the page.
                    cell = OcrCell(
                        id=ix,
                        text=text,
                        confidence=conf / 100.0,
                        bbox=BoundingBox.from_tuple(
                            coord=(
                                (l / self.scale) + ocr_rect.l,
                                (b / self.scale) + ocr_rect.t,
                                (r / self.scale) + ocr_rect.l,
                                (t / self.scale) + ocr_rect.t,
                            ),
                            origin=CoordOrigin.TOPLEFT,
                        ),
                    )
                    all_ocr_cells.append(cell)

            ## Remove OCR cells which overlap with programmatic cells.
            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)

            page.cells.extend(filtered_ocr_cells)

            # DEBUG code:
            # self.draw_ocr_rects_and_cells(page, ocr_rects)

            yield page
|
@@ -0,0 +1,122 @@
|
|
1
|
+
import logging
|
2
|
+
from typing import Iterable
|
3
|
+
|
4
|
+
import numpy
|
5
|
+
|
6
|
+
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
|
7
|
+
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
|
8
|
+
from docling.models.base_ocr_model import BaseOcrModel
|
9
|
+
|
10
|
+
_log = logging.getLogger(__name__)
|
11
|
+
|
12
|
+
|
13
|
+
class TesseractOcrModel(BaseOcrModel):
    """OCR model that drives Tesseract in-process through ``tesserocr``.

    When enabled, a ``tesserocr.PyTessBaseAPI`` object is created once in the
    constructor and reused for every OCR rectangle of every page.
    """

    # NOTE(review): the options are annotated as TesseractCliOcrOptions, but
    # StandardModelPipeline passes a TesseractOcrOptions instance to this
    # model -- the annotation presumably should be TesseractOcrOptions.
    # Only `lang` and `path` are read here; both classes define them.
    def __init__(self, enabled: bool, options: TesseractCliOcrOptions):
        """Store the options and, when enabled, initialise the Tesseract API.

        Raises:
            ImportError: if OCR is enabled and ``tesserocr`` cannot be
                imported or cannot report the Tesseract version.
        """
        super().__init__(enabled=enabled, options=options)
        self.options: TesseractCliOcrOptions

        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
        self.reader = None

        if self.enabled:
            setup_errmsg = (
                "tesserocr is not correctly installed. "
                "Please install it via `pip install tesserocr` to use this OCR engine. "
                "Note that tesserocr might have to be manually compiled for working with "
                "your Tesseract installation. The Docling documentation provides examples for it. "
                "Alternatively, Docling has support for other OCR engines. See the documentation."
            )
            try:
                import tesserocr
            except ImportError as exc:
                raise ImportError(setup_errmsg) from exc

            try:
                tesseract_version = tesserocr.tesseract_version()
                _log.debug("Initializing TesserOCR: %s", tesseract_version)
            except Exception as exc:
                # `Exception` rather than a bare `except`, so that
                # KeyboardInterrupt/SystemExit are not converted into an
                # ImportError.
                raise ImportError(setup_errmsg) from exc

            # Initialize the tesseractAPI; `path` is only passed when set.
            api_kwargs = {
                "lang": "+".join(self.options.lang),
                "psm": tesserocr.PSM.AUTO,
                "init": True,
                "oem": tesserocr.OEM.DEFAULT,
            }
            if self.options.path is not None:
                api_kwargs["path"] = self.options.path
            self.reader = tesserocr.PyTessBaseAPI(**api_kwargs)
            self.reader_RIL = tesserocr.RIL

    def __del__(self):
        # `reader` may be missing if __init__ failed before assigning it.
        reader = getattr(self, "reader", None)
        if reader is not None:
            # Finalize the tesseractAPI
            reader.End()

    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        """Run OCR on every page of the batch and yield the pages.

        When the model is disabled the pages are yielded unchanged.
        Otherwise the OCR cells that survive ``filter_ocr_cells`` are
        appended to ``page.cells``.
        """
        if not self.enabled:
            yield from page_batch
            return

        for page in page_batch:
            ocr_rects = self.get_ocr_rects(page)

            all_ocr_cells = []
            for ocr_rect in ocr_rects:
                # Skip zero area boxes
                if ocr_rect.area() == 0:
                    continue
                high_res_image = page._backend.get_page_image(
                    scale=self.scale, cropbox=ocr_rect
                )

                # Retrieve text snippets with their bounding boxes
                self.reader.SetImage(high_res_image)
                boxes = self.reader.GetComponentImages(self.reader_RIL.TEXTLINE, True)

                cells = []
                for ix, (_, box, _, _) in enumerate(boxes):
                    # Set the area of interest. The original author notes that
                    # Tesseract uses Bottom-Left for the origin.
                    # NOTE(review): the cell below is nevertheless tagged
                    # TOPLEFT -- confirm.
                    self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])

                    # Extract text within the bounding box
                    text = self.reader.GetUTF8Text().strip()
                    confidence = self.reader.MeanTextConf()
                    left = box["x"] / self.scale
                    bottom = box["y"] / self.scale
                    right = (box["x"] + box["w"]) / self.scale
                    top = (box["y"] + box["h"]) / self.scale

                    # NOTE(review): unlike TesseractOcrCliModel, the
                    # coordinates are not shifted by ocr_rect.l / ocr_rect.t
                    # and the confidence is not divided by 100 -- confirm
                    # whether that difference is intended.
                    cells.append(
                        OcrCell(
                            id=ix,
                            text=text,
                            confidence=confidence,
                            bbox=BoundingBox.from_tuple(
                                coord=(left, top, right, bottom),
                                origin=CoordOrigin.TOPLEFT,
                            ),
                        )
                    )

                # del high_res_image
                all_ocr_cells.extend(cells)

            ## Remove OCR cells which overlap with programmatic cells.
            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)

            page.cells.extend(filtered_ocr_cells)

            # DEBUG code:
            # self.draw_ocr_rects_and_cells(page, ocr_rects)

            yield page
|
@@ -0,0 +1,66 @@
|
|
1
|
+
from pathlib import Path
|
2
|
+
|
3
|
+
from docling.datamodel.pipeline_options import (
|
4
|
+
EasyOcrOptions,
|
5
|
+
PipelineOptions,
|
6
|
+
TesseractCliOcrOptions,
|
7
|
+
TesseractOcrOptions,
|
8
|
+
)
|
9
|
+
from docling.models.base_ocr_model import BaseOcrModel
|
10
|
+
from docling.models.easyocr_model import EasyOcrModel
|
11
|
+
from docling.models.layout_model import LayoutModel
|
12
|
+
from docling.models.table_structure_model import TableStructureModel
|
13
|
+
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
14
|
+
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
15
|
+
from docling.pipeline.base_model_pipeline import BaseModelPipeline
|
16
|
+
|
17
|
+
|
18
|
+
class StandardModelPipeline(BaseModelPipeline):
    """Pipeline chaining an OCR model, the layout model and the table structure model."""

    _layout_model_path = "model_artifacts/layout/beehive_v0.0.5_pt"
    _table_model_path = "model_artifacts/tableformer"

    def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
        """Build ``self.model_pipe`` from the artifacts path and the options.

        Raises:
            RuntimeError: if ``pipeline_options.ocr_options`` is not an
                instance of one of the supported option classes.
        """
        super().__init__(artifacts_path, pipeline_options)

        # Options class -> model class that consumes it, probed in order.
        ocr_model_by_options = (
            (EasyOcrOptions, EasyOcrModel),
            (TesseractCliOcrOptions, TesseractOcrCliModel),
            (TesseractOcrOptions, TesseractOcrModel),
        )
        ocr_model_cls = next(
            (
                model_cls
                for options_cls, model_cls in ocr_model_by_options
                if isinstance(pipeline_options.ocr_options, options_cls)
            ),
            None,
        )
        if ocr_model_cls is None:
            raise RuntimeError(
                f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
            )

        ocr_model: BaseOcrModel = ocr_model_cls(
            enabled=pipeline_options.do_ocr,
            options=pipeline_options.ocr_options,
        )

        layout_model = LayoutModel(
            config={
                "artifacts_path": artifacts_path
                / StandardModelPipeline._layout_model_path
            }
        )

        table_options = pipeline_options.table_structure_options
        table_model = TableStructureModel(
            config={
                "artifacts_path": artifacts_path
                / StandardModelPipeline._table_model_path,
                "enabled": pipeline_options.do_table_structure,
                "mode": table_options.mode,
                "do_cell_matching": table_options.do_cell_matching,
            }
        )

        # Order of the pipe: OCR, layout, table structure.
        self.model_pipe = [ocr_model, layout_model, table_model]
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "docling"
|
3
|
-
version = "1.18.0" # DO NOT EDIT, updated automatically
|
3
|
+
version = "1.19.0" # DO NOT EDIT, updated automatically
|
4
4
|
description = "Docling PDF conversion package"
|
5
5
|
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
|
6
6
|
license = "MIT"
|
@@ -46,6 +46,7 @@ pydantic-settings = "^2.3.0"
|
|
46
46
|
huggingface_hub = ">=0.23,<1"
|
47
47
|
requests = "^2.32.3"
|
48
48
|
easyocr = "^1.7"
|
49
|
+
tesserocr = { version = "^2.7.1", optional = true }
|
49
50
|
docling-parse = "^1.4.1"
|
50
51
|
certifi = ">=2024.7.4"
|
51
52
|
rtree = "^1.3.0"
|
@@ -81,6 +82,9 @@ langchain-huggingface = "^0.0.3"
|
|
81
82
|
langchain-milvus = "^0.1.4"
|
82
83
|
langchain-text-splitters = "^0.2.4"
|
83
84
|
|
85
|
+
[tool.poetry.extras]
|
86
|
+
tesserocr = ["tesserocr"]
|
87
|
+
|
84
88
|
[tool.poetry.scripts]
|
85
89
|
docling = "docling.cli.main:app"
|
86
90
|
|
@@ -1,25 +0,0 @@
|
|
1
|
-
from enum import Enum, auto
|
2
|
-
|
3
|
-
from pydantic import BaseModel
|
4
|
-
|
5
|
-
|
6
|
-
class TableFormerMode(str, Enum):
|
7
|
-
FAST = auto()
|
8
|
-
ACCURATE = auto()
|
9
|
-
|
10
|
-
|
11
|
-
class TableStructureOptions(BaseModel):
|
12
|
-
do_cell_matching: bool = (
|
13
|
-
True
|
14
|
-
# True: Matches predictions back to PDF cells. Can break table output if PDF cells
|
15
|
-
# are merged across table columns.
|
16
|
-
# False: Let table structure model define the text cells, ignore PDF cells.
|
17
|
-
)
|
18
|
-
mode: TableFormerMode = TableFormerMode.FAST
|
19
|
-
|
20
|
-
|
21
|
-
class PipelineOptions(BaseModel):
|
22
|
-
do_table_structure: bool = True # True: perform table structure extraction
|
23
|
-
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
24
|
-
|
25
|
-
table_structure_options: TableStructureOptions = TableStructureOptions()
|
@@ -1,39 +0,0 @@
|
|
1
|
-
from pathlib import Path
|
2
|
-
|
3
|
-
from docling.datamodel.pipeline_options import PipelineOptions
|
4
|
-
from docling.models.easyocr_model import EasyOcrModel
|
5
|
-
from docling.models.layout_model import LayoutModel
|
6
|
-
from docling.models.table_structure_model import TableStructureModel
|
7
|
-
from docling.pipeline.base_model_pipeline import BaseModelPipeline
|
8
|
-
|
9
|
-
|
10
|
-
class StandardModelPipeline(BaseModelPipeline):
|
11
|
-
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5_pt"
|
12
|
-
_table_model_path = "model_artifacts/tableformer"
|
13
|
-
|
14
|
-
def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
|
15
|
-
super().__init__(artifacts_path, pipeline_options)
|
16
|
-
|
17
|
-
self.model_pipe = [
|
18
|
-
EasyOcrModel(
|
19
|
-
config={
|
20
|
-
"lang": ["fr", "de", "es", "en"],
|
21
|
-
"enabled": pipeline_options.do_ocr,
|
22
|
-
}
|
23
|
-
),
|
24
|
-
LayoutModel(
|
25
|
-
config={
|
26
|
-
"artifacts_path": artifacts_path
|
27
|
-
/ StandardModelPipeline._layout_model_path
|
28
|
-
}
|
29
|
-
),
|
30
|
-
TableStructureModel(
|
31
|
-
config={
|
32
|
-
"artifacts_path": artifacts_path
|
33
|
-
/ StandardModelPipeline._table_model_path,
|
34
|
-
"enabled": pipeline_options.do_table_structure,
|
35
|
-
"mode": pipeline_options.table_structure_options.mode,
|
36
|
-
"do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
|
37
|
-
}
|
38
|
-
),
|
39
|
-
]
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|