docling 2.2.1__tar.gz → 2.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.2.1 → docling-2.3.1}/PKG-INFO +7 -10
- {docling-2.2.1 → docling-2.3.1}/README.md +3 -2
- {docling-2.2.1 → docling-2.3.1}/docling/backend/asciidoc_backend.py +0 -4
- {docling-2.2.1 → docling-2.3.1}/docling/backend/html_backend.py +25 -25
- {docling-2.2.1 → docling-2.3.1}/docling/datamodel/base_models.py +1 -1
- {docling-2.2.1 → docling-2.3.1}/docling/datamodel/document.py +3 -1
- {docling-2.2.1 → docling-2.3.1}/docling/datamodel/settings.py +15 -1
- {docling-2.2.1 → docling-2.3.1}/docling/document_converter.py +20 -12
- {docling-2.2.1 → docling-2.3.1}/docling/models/base_model.py +4 -1
- {docling-2.2.1 → docling-2.3.1}/docling/models/base_ocr_model.py +21 -4
- {docling-2.2.1 → docling-2.3.1}/docling/models/ds_glm_model.py +27 -11
- docling-2.3.1/docling/models/easyocr_model.py +100 -0
- {docling-2.2.1 → docling-2.3.1}/docling/models/layout_model.py +87 -61
- docling-2.3.1/docling/models/page_assemble_model.py +174 -0
- {docling-2.2.1 → docling-2.3.1}/docling/models/page_preprocessing_model.py +25 -7
- docling-2.3.1/docling/models/table_structure_model.py +206 -0
- {docling-2.2.1 → docling-2.3.1}/docling/models/tesseract_ocr_cli_model.py +62 -52
- docling-2.3.1/docling/models/tesseract_ocr_model.py +142 -0
- docling-2.3.1/docling/pipeline/base_pipeline.py +189 -0
- {docling-2.2.1 → docling-2.3.1}/docling/pipeline/simple_pipeline.py +8 -11
- {docling-2.2.1 → docling-2.3.1}/docling/pipeline/standard_pdf_pipeline.py +59 -56
- docling-2.3.1/docling/utils/profiling.py +62 -0
- {docling-2.2.1 → docling-2.3.1}/pyproject.toml +17 -15
- docling-2.2.1/docling/models/easyocr_model.py +0 -90
- docling-2.2.1/docling/models/page_assemble_model.py +0 -172
- docling-2.2.1/docling/models/table_structure_model.py +0 -171
- docling-2.2.1/docling/models/tesseract_ocr_model.py +0 -130
- docling-2.2.1/docling/pipeline/base_pipeline.py +0 -190
- {docling-2.2.1 → docling-2.3.1}/LICENSE +0 -0
- {docling-2.2.1 → docling-2.3.1}/docling/__init__.py +0 -0
- {docling-2.2.1 → docling-2.3.1}/docling/backend/__init__.py +0 -0
- {docling-2.2.1 → docling-2.3.1}/docling/backend/abstract_backend.py +0 -0
- {docling-2.2.1 → docling-2.3.1}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.2.1 → docling-2.3.1}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.2.1 → docling-2.3.1}/docling/backend/md_backend.py +0 -0
- {docling-2.2.1 → docling-2.3.1}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.2.1 → docling-2.3.1}/docling/backend/msword_backend.py +0 -0
- {docling-2.2.1 → docling-2.3.1}/docling/backend/pdf_backend.py +0 -0
- {docling-2.2.1 → docling-2.3.1}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.2.1 → docling-2.3.1}/docling/cli/__init__.py +0 -0
- {docling-2.2.1 → docling-2.3.1}/docling/cli/main.py +0 -0
- {docling-2.2.1 → docling-2.3.1}/docling/datamodel/__init__.py +0 -0
- {docling-2.2.1 → docling-2.3.1}/docling/datamodel/pipeline_options.py +0 -0
- {docling-2.2.1 → docling-2.3.1}/docling/models/__init__.py +0 -0
- {docling-2.2.1 → docling-2.3.1}/docling/pipeline/__init__.py +0 -0
- {docling-2.2.1 → docling-2.3.1}/docling/utils/__init__.py +0 -0
- {docling-2.2.1 → docling-2.3.1}/docling/utils/export.py +0 -0
- {docling-2.2.1 → docling-2.3.1}/docling/utils/layout_utils.py +0 -0
- {docling-2.2.1 → docling-2.3.1}/docling/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.2.1
|
3
|
+
Version: 2.3.1
|
4
4
|
Summary: Docling PDF conversion package
|
5
5
|
Home-page: https://github.com/DS4SD/docling
|
6
6
|
License: MIT
|
@@ -23,9 +23,9 @@ Provides-Extra: tesserocr
|
|
23
23
|
Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
|
24
24
|
Requires-Dist: certifi (>=2024.7.4)
|
25
25
|
Requires-Dist: deepsearch-glm (>=0.26.1,<0.27.0)
|
26
|
-
Requires-Dist: docling-core (>=2.
|
27
|
-
Requires-Dist: docling-ibm-models (>=2.0.
|
28
|
-
Requires-Dist: docling-parse (>=2.0.
|
26
|
+
Requires-Dist: docling-core (>=2.3.0,<3.0.0)
|
27
|
+
Requires-Dist: docling-ibm-models (>=2.0.3,<3.0.0)
|
28
|
+
Requires-Dist: docling-parse (>=2.0.2,<3.0.0)
|
29
29
|
Requires-Dist: easyocr (>=1.7,<2.0)
|
30
30
|
Requires-Dist: filetype (>=1.2.0,<2.0.0)
|
31
31
|
Requires-Dist: huggingface_hub (>=0.23,<1)
|
@@ -41,10 +41,6 @@ Requires-Dist: requests (>=2.32.3,<3.0.0)
|
|
41
41
|
Requires-Dist: rtree (>=1.3.0,<2.0.0)
|
42
42
|
Requires-Dist: scipy (>=1.14.1,<2.0.0)
|
43
43
|
Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
|
44
|
-
Requires-Dist: torch (>=2.2.2,<2.3.0) ; sys_platform == "darwin" and platform_machine == "x86_64"
|
45
|
-
Requires-Dist: torch (>=2.2.2,<3.0.0) ; sys_platform != "darwin" or platform_machine != "x86_64"
|
46
|
-
Requires-Dist: torchvision (>=0,<1) ; sys_platform != "darwin" or platform_machine != "x86_64"
|
47
|
-
Requires-Dist: torchvision (>=0.17.2,<0.18.0) ; sys_platform == "darwin" and platform_machine == "x86_64"
|
48
44
|
Requires-Dist: typer (>=0.12.5,<0.13.0)
|
49
45
|
Project-URL: Repository, https://github.com/DS4SD/docling
|
50
46
|
Description-Content-Type: text/markdown
|
@@ -73,8 +69,9 @@ Docling parses documents and exports them to the desired format with ease and sp
|
|
73
69
|
|
74
70
|
## Features
|
75
71
|
|
76
|
-
* 🗂️
|
77
|
-
* 📑 Advanced PDF document understanding
|
72
|
+
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, Images, HTML, AsciiDoc, Markdown) and exports to Markdown and JSON
|
73
|
+
* 📑 Advanced PDF document understanding including page layout, reading order & table structures
|
74
|
+
* 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
|
78
75
|
* 📝 Metadata extraction, including title, authors, references & language
|
79
76
|
* 🤖 Seamless LlamaIndex 🦙 & LangChain 🦜🔗 integration for powerful RAG / QA applications
|
80
77
|
* 🔍 OCR support for scanned PDFs
|
@@ -22,8 +22,9 @@ Docling parses documents and exports them to the desired format with ease and sp
|
|
22
22
|
|
23
23
|
## Features
|
24
24
|
|
25
|
-
* 🗂️
|
26
|
-
* 📑 Advanced PDF document understanding
|
25
|
+
* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, Images, HTML, AsciiDoc, Markdown) and exports to Markdown and JSON
|
26
|
+
* 📑 Advanced PDF document understanding including page layout, reading order & table structures
|
27
|
+
* 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format
|
27
28
|
* 📝 Metadata extraction, including title, authors, references & language
|
28
29
|
* 🤖 Seamless LlamaIndex 🦙 & LangChain 🦜🔗 integration for powerful RAG / QA applications
|
29
30
|
* 🔍 OCR support for scanned PDFs
|
@@ -1,24 +1,20 @@
|
|
1
1
|
import logging
|
2
|
-
import os
|
3
2
|
import re
|
4
3
|
from io import BytesIO
|
5
4
|
from pathlib import Path
|
6
5
|
from typing import Set, Union
|
7
6
|
|
8
7
|
from docling_core.types.doc import (
|
9
|
-
DocItem,
|
10
8
|
DocItemLabel,
|
11
9
|
DoclingDocument,
|
12
10
|
DocumentOrigin,
|
13
11
|
GroupItem,
|
14
12
|
GroupLabel,
|
15
13
|
ImageRef,
|
16
|
-
NodeItem,
|
17
14
|
Size,
|
18
15
|
TableCell,
|
19
16
|
TableData,
|
20
17
|
)
|
21
|
-
from pydantic import AnyUrl
|
22
18
|
|
23
19
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
24
20
|
from docling.datamodel.base_models import InputFormat
|
@@ -179,31 +179,31 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
179
179
|
self.parents[self.level] = doc.add_text(
|
180
180
|
parent=self.parents[0], label=DocItemLabel.TITLE, text=text
|
181
181
|
)
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
182
|
+
else:
|
183
|
+
if hlevel > self.level:
|
184
|
+
|
185
|
+
# add invisible group
|
186
|
+
for i in range(self.level + 1, hlevel):
|
187
|
+
self.parents[i] = doc.add_group(
|
188
|
+
name=f"header-{i}",
|
189
|
+
label=GroupLabel.SECTION,
|
190
|
+
parent=self.parents[i - 1],
|
191
|
+
)
|
192
|
+
self.level = hlevel
|
193
|
+
|
194
|
+
elif hlevel < self.level:
|
195
|
+
|
196
|
+
# remove the tail
|
197
|
+
for key, val in self.parents.items():
|
198
|
+
if key > hlevel:
|
199
|
+
self.parents[key] = None
|
200
|
+
self.level = hlevel
|
201
|
+
|
202
|
+
self.parents[hlevel] = doc.add_heading(
|
203
|
+
parent=self.parents[hlevel - 1],
|
204
|
+
text=text,
|
205
|
+
level=hlevel,
|
206
|
+
)
|
207
207
|
|
208
208
|
def handle_paragraph(self, element, idx, doc):
|
209
209
|
"""Handles paragraph tags (p)."""
|
@@ -3,7 +3,7 @@ import re
|
|
3
3
|
from enum import Enum
|
4
4
|
from io import BytesIO
|
5
5
|
from pathlib import Path, PurePath
|
6
|
-
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Union
|
6
|
+
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
|
7
7
|
|
8
8
|
import filetype
|
9
9
|
from docling_core.types.doc import (
|
@@ -52,6 +52,7 @@ from docling.datamodel.base_models import (
|
|
52
52
|
Page,
|
53
53
|
)
|
54
54
|
from docling.datamodel.settings import DocumentLimits
|
55
|
+
from docling.utils.profiling import ProfilingItem
|
55
56
|
from docling.utils.utils import create_file_hash, create_hash
|
56
57
|
|
57
58
|
if TYPE_CHECKING:
|
@@ -187,6 +188,7 @@ class ConversionResult(BaseModel):
|
|
187
188
|
|
188
189
|
pages: List[Page] = []
|
189
190
|
assembled: AssembledUnit = AssembledUnit()
|
191
|
+
timings: Dict[str, ProfilingItem] = {}
|
190
192
|
|
191
193
|
document: DoclingDocument = _EMPTY_DOCLING_DOC
|
192
194
|
|
@@ -1,4 +1,5 @@
|
|
1
1
|
import sys
|
2
|
+
from pathlib import Path
|
2
3
|
|
3
4
|
from pydantic import BaseModel
|
4
5
|
from pydantic_settings import BaseSettings
|
@@ -26,8 +27,21 @@ class BatchConcurrencySettings(BaseModel):
|
|
26
27
|
# To force models into single core: export OMP_NUM_THREADS=1
|
27
28
|
|
28
29
|
|
30
|
+
class DebugSettings(BaseModel):
|
31
|
+
visualize_cells: bool = False
|
32
|
+
visualize_ocr: bool = False
|
33
|
+
visualize_layout: bool = False
|
34
|
+
visualize_tables: bool = False
|
35
|
+
|
36
|
+
profile_pipeline_timings: bool = False
|
37
|
+
|
38
|
+
# Path used to output debug information.
|
39
|
+
debug_output_path: str = str(Path.cwd() / "debug")
|
40
|
+
|
41
|
+
|
29
42
|
class AppSettings(BaseSettings):
|
30
43
|
perf: BatchConcurrencySettings
|
44
|
+
debug: DebugSettings
|
31
45
|
|
32
46
|
|
33
|
-
settings = AppSettings(perf=BatchConcurrencySettings())
|
47
|
+
settings = AppSettings(perf=BatchConcurrencySettings(), debug=DebugSettings())
|
@@ -139,6 +139,10 @@ class DocumentConverter:
|
|
139
139
|
|
140
140
|
self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
|
141
141
|
|
142
|
+
def initialize_pipeline(self, format: InputFormat):
|
143
|
+
"""Initialize the conversion pipeline for the selected format."""
|
144
|
+
self._get_pipeline(doc_format=format)
|
145
|
+
|
142
146
|
@validate_call(config=ConfigDict(strict=True))
|
143
147
|
def convert(
|
144
148
|
self,
|
@@ -189,32 +193,43 @@ class DocumentConverter:
|
|
189
193
|
) -> Iterator[ConversionResult]:
|
190
194
|
assert self.format_to_options is not None
|
191
195
|
|
196
|
+
start_time = time.monotonic()
|
197
|
+
|
192
198
|
for input_batch in chunkify(
|
193
199
|
conv_input.docs(self.format_to_options),
|
194
200
|
settings.perf.doc_batch_size, # pass format_options
|
195
201
|
):
|
196
202
|
_log.info(f"Going to convert document batch...")
|
203
|
+
|
197
204
|
# parallel processing only within input_batch
|
198
205
|
# with ThreadPoolExecutor(
|
199
206
|
# max_workers=settings.perf.doc_batch_concurrency
|
200
207
|
# ) as pool:
|
201
208
|
# yield from pool.map(self.process_document, input_batch)
|
202
|
-
|
203
209
|
# Note: PDF backends are not thread-safe, thread pool usage was disabled.
|
210
|
+
|
204
211
|
for item in map(
|
205
212
|
partial(self._process_document, raises_on_error=raises_on_error),
|
206
213
|
input_batch,
|
207
214
|
):
|
215
|
+
elapsed = time.monotonic() - start_time
|
216
|
+
start_time = time.monotonic()
|
217
|
+
|
208
218
|
if item is not None:
|
219
|
+
_log.info(
|
220
|
+
f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
|
221
|
+
)
|
209
222
|
yield item
|
223
|
+
else:
|
224
|
+
_log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
|
210
225
|
|
211
|
-
def _get_pipeline(self,
|
226
|
+
def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
|
212
227
|
assert self.format_to_options is not None
|
213
228
|
|
214
|
-
fopt = self.format_to_options.get(
|
229
|
+
fopt = self.format_to_options.get(doc_format)
|
215
230
|
|
216
231
|
if fopt is None:
|
217
|
-
raise RuntimeError(f"Could not get pipeline for
|
232
|
+
raise RuntimeError(f"Could not get pipeline for {doc_format}")
|
218
233
|
else:
|
219
234
|
pipeline_class = fopt.pipeline_cls
|
220
235
|
pipeline_options = fopt.pipeline_options
|
@@ -237,22 +252,15 @@ class DocumentConverter:
|
|
237
252
|
assert self.allowed_formats is not None
|
238
253
|
assert in_doc.format in self.allowed_formats
|
239
254
|
|
240
|
-
start_doc_time = time.time()
|
241
|
-
|
242
255
|
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
|
243
256
|
|
244
|
-
end_doc_time = time.time() - start_doc_time
|
245
|
-
_log.info(
|
246
|
-
f"Finished converting document {in_doc.file.name} in {end_doc_time:.2f} seconds."
|
247
|
-
)
|
248
|
-
|
249
257
|
return conv_res
|
250
258
|
|
251
259
|
def _execute_pipeline(
|
252
260
|
self, in_doc: InputDocument, raises_on_error: bool
|
253
261
|
) -> ConversionResult:
|
254
262
|
if in_doc.valid:
|
255
|
-
pipeline = self._get_pipeline(in_doc)
|
263
|
+
pipeline = self._get_pipeline(in_doc.format)
|
256
264
|
if pipeline is None: # Can't find a default pipeline. Should this raise?
|
257
265
|
if raises_on_error:
|
258
266
|
raise RuntimeError(
|
@@ -4,11 +4,14 @@ from typing import Any, Iterable
|
|
4
4
|
from docling_core.types.doc import DoclingDocument, NodeItem
|
5
5
|
|
6
6
|
from docling.datamodel.base_models import Page
|
7
|
+
from docling.datamodel.document import ConversionResult
|
7
8
|
|
8
9
|
|
9
10
|
class BasePageModel(ABC):
|
10
11
|
@abstractmethod
|
11
|
-
def __call__(
|
12
|
+
def __call__(
|
13
|
+
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
14
|
+
) -> Iterable[Page]:
|
12
15
|
pass
|
13
16
|
|
14
17
|
|
@@ -1,6 +1,7 @@
|
|
1
1
|
import copy
|
2
2
|
import logging
|
3
3
|
from abc import abstractmethod
|
4
|
+
from pathlib import Path
|
4
5
|
from typing import Iterable, List
|
5
6
|
|
6
7
|
import numpy as np
|
@@ -10,12 +11,15 @@ from rtree import index
|
|
10
11
|
from scipy.ndimage import find_objects, label
|
11
12
|
|
12
13
|
from docling.datamodel.base_models import OcrCell, Page
|
14
|
+
from docling.datamodel.document import ConversionResult
|
13
15
|
from docling.datamodel.pipeline_options import OcrOptions
|
16
|
+
from docling.datamodel.settings import settings
|
17
|
+
from docling.models.base_model import BasePageModel
|
14
18
|
|
15
19
|
_log = logging.getLogger(__name__)
|
16
20
|
|
17
21
|
|
18
|
-
class BaseOcrModel:
|
22
|
+
class BaseOcrModel(BasePageModel):
|
19
23
|
def __init__(self, enabled: bool, options: OcrOptions):
|
20
24
|
self.enabled = enabled
|
21
25
|
self.options = options
|
@@ -113,7 +117,7 @@ class BaseOcrModel:
|
|
113
117
|
]
|
114
118
|
return filtered_ocr_cells
|
115
119
|
|
116
|
-
def draw_ocr_rects_and_cells(self, page, ocr_rects):
|
120
|
+
def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
|
117
121
|
image = copy.deepcopy(page.image)
|
118
122
|
draw = ImageDraw.Draw(image, "RGBA")
|
119
123
|
|
@@ -130,8 +134,21 @@ class BaseOcrModel:
|
|
130
134
|
if isinstance(tc, OcrCell):
|
131
135
|
color = "magenta"
|
132
136
|
draw.rectangle([(x0, y0), (x1, y1)], outline=color)
|
133
|
-
|
137
|
+
|
138
|
+
if show:
|
139
|
+
image.show()
|
140
|
+
else:
|
141
|
+
out_path: Path = (
|
142
|
+
Path(settings.debug.debug_output_path)
|
143
|
+
/ f"debug_{conv_res.input.file.stem}"
|
144
|
+
)
|
145
|
+
out_path.mkdir(parents=True, exist_ok=True)
|
146
|
+
|
147
|
+
out_file = out_path / f"ocr_page_{page.page_no:05}.png"
|
148
|
+
image.save(str(out_file), format="png")
|
134
149
|
|
135
150
|
@abstractmethod
|
136
|
-
def __call__(
|
151
|
+
def __call__(
|
152
|
+
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
153
|
+
) -> Iterable[Page]:
|
137
154
|
pass
|
@@ -1,5 +1,6 @@
|
|
1
1
|
import copy
|
2
2
|
import random
|
3
|
+
from pathlib import Path
|
3
4
|
from typing import List, Union
|
4
5
|
|
5
6
|
from deepsearch_glm.nlp_utils import init_nlp_model
|
@@ -27,6 +28,8 @@ from pydantic import BaseModel, ConfigDict
|
|
27
28
|
|
28
29
|
from docling.datamodel.base_models import Cluster, FigureElement, Table, TextElement
|
29
30
|
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
|
31
|
+
from docling.datamodel.settings import settings
|
32
|
+
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
30
33
|
from docling.utils.utils import create_hash
|
31
34
|
|
32
35
|
|
@@ -226,23 +229,24 @@ class GlmModel:
|
|
226
229
|
return ds_doc
|
227
230
|
|
228
231
|
def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
|
229
|
-
|
230
|
-
|
232
|
+
with TimeRecorder(conv_res, "glm", scope=ProfilingScope.DOCUMENT):
|
233
|
+
ds_doc = self._to_legacy_document(conv_res)
|
234
|
+
ds_doc_dict = ds_doc.model_dump(by_alias=True)
|
231
235
|
|
232
|
-
|
236
|
+
glm_doc = self.model.apply_on_doc(ds_doc_dict)
|
233
237
|
|
234
|
-
|
238
|
+
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
|
235
239
|
|
236
240
|
# DEBUG code:
|
237
|
-
def draw_clusters_and_cells(ds_document, page_no):
|
241
|
+
def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
|
238
242
|
clusters_to_draw = []
|
239
243
|
image = copy.deepcopy(conv_res.pages[page_no].image)
|
240
244
|
for ix, elem in enumerate(ds_document.main_text):
|
241
245
|
if isinstance(elem, BaseText):
|
242
|
-
prov = elem.prov[0]
|
246
|
+
prov = elem.prov[0] # type: ignore
|
243
247
|
elif isinstance(elem, Ref):
|
244
248
|
_, arr, index = elem.ref.split("/")
|
245
|
-
index = int(index)
|
249
|
+
index = int(index) # type: ignore
|
246
250
|
if arr == "tables":
|
247
251
|
prov = ds_document.tables[index].prov[0]
|
248
252
|
elif arr == "figures":
|
@@ -256,7 +260,7 @@ class GlmModel:
|
|
256
260
|
id=ix,
|
257
261
|
label=elem.name,
|
258
262
|
bbox=BoundingBox.from_tuple(
|
259
|
-
coord=prov.bbox,
|
263
|
+
coord=prov.bbox, # type: ignore
|
260
264
|
origin=CoordOrigin.BOTTOMLEFT,
|
261
265
|
).to_top_left_origin(conv_res.pages[page_no].size.height),
|
262
266
|
)
|
@@ -276,9 +280,21 @@ class GlmModel:
|
|
276
280
|
for tc in c.cells: # [:1]:
|
277
281
|
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
278
282
|
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
279
|
-
image.show()
|
280
283
|
|
281
|
-
|
282
|
-
|
284
|
+
if show:
|
285
|
+
image.show()
|
286
|
+
else:
|
287
|
+
out_path: Path = (
|
288
|
+
Path(settings.debug.debug_output_path)
|
289
|
+
/ f"debug_{conv_res.input.file.stem}"
|
290
|
+
)
|
291
|
+
out_path.mkdir(parents=True, exist_ok=True)
|
292
|
+
|
293
|
+
out_file = out_path / f"doc_page_{page_no:05}.png"
|
294
|
+
image.save(str(out_file), format="png")
|
295
|
+
|
296
|
+
# for item in ds_doc.page_dimensions:
|
297
|
+
# page_no = item.page
|
298
|
+
# draw_clusters_and_cells(ds_doc, page_no)
|
283
299
|
|
284
300
|
return docling_doc
|
@@ -0,0 +1,100 @@
|
|
1
|
+
import logging
|
2
|
+
from typing import Iterable
|
3
|
+
|
4
|
+
import numpy
|
5
|
+
from docling_core.types.doc import BoundingBox, CoordOrigin
|
6
|
+
|
7
|
+
from docling.datamodel.base_models import OcrCell, Page
|
8
|
+
from docling.datamodel.document import ConversionResult
|
9
|
+
from docling.datamodel.pipeline_options import EasyOcrOptions
|
10
|
+
from docling.datamodel.settings import settings
|
11
|
+
from docling.models.base_ocr_model import BaseOcrModel
|
12
|
+
from docling.utils.profiling import TimeRecorder
|
13
|
+
|
14
|
+
_log = logging.getLogger(__name__)
|
15
|
+
|
16
|
+
|
17
|
+
class EasyOcrModel(BaseOcrModel):
|
18
|
+
def __init__(self, enabled: bool, options: EasyOcrOptions):
|
19
|
+
super().__init__(enabled=enabled, options=options)
|
20
|
+
self.options: EasyOcrOptions
|
21
|
+
|
22
|
+
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
23
|
+
|
24
|
+
if self.enabled:
|
25
|
+
try:
|
26
|
+
import easyocr
|
27
|
+
except ImportError:
|
28
|
+
raise ImportError(
|
29
|
+
"EasyOCR is not installed. Please install it via `pip install easyocr` to use this OCR engine. "
|
30
|
+
"Alternatively, Docling has support for other OCR engines. See the documentation."
|
31
|
+
)
|
32
|
+
|
33
|
+
self.reader = easyocr.Reader(
|
34
|
+
lang_list=self.options.lang,
|
35
|
+
model_storage_directory=self.options.model_storage_directory,
|
36
|
+
download_enabled=self.options.download_enabled,
|
37
|
+
)
|
38
|
+
|
39
|
+
def __call__(
|
40
|
+
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
41
|
+
) -> Iterable[Page]:
|
42
|
+
|
43
|
+
if not self.enabled:
|
44
|
+
yield from page_batch
|
45
|
+
return
|
46
|
+
|
47
|
+
for page in page_batch:
|
48
|
+
|
49
|
+
assert page._backend is not None
|
50
|
+
if not page._backend.is_valid():
|
51
|
+
yield page
|
52
|
+
else:
|
53
|
+
with TimeRecorder(conv_res, "ocr"):
|
54
|
+
ocr_rects = self.get_ocr_rects(page)
|
55
|
+
|
56
|
+
all_ocr_cells = []
|
57
|
+
for ocr_rect in ocr_rects:
|
58
|
+
# Skip zero area boxes
|
59
|
+
if ocr_rect.area() == 0:
|
60
|
+
continue
|
61
|
+
high_res_image = page._backend.get_page_image(
|
62
|
+
scale=self.scale, cropbox=ocr_rect
|
63
|
+
)
|
64
|
+
im = numpy.array(high_res_image)
|
65
|
+
result = self.reader.readtext(im)
|
66
|
+
|
67
|
+
del high_res_image
|
68
|
+
del im
|
69
|
+
|
70
|
+
cells = [
|
71
|
+
OcrCell(
|
72
|
+
id=ix,
|
73
|
+
text=line[1],
|
74
|
+
confidence=line[2],
|
75
|
+
bbox=BoundingBox.from_tuple(
|
76
|
+
coord=(
|
77
|
+
(line[0][0][0] / self.scale) + ocr_rect.l,
|
78
|
+
(line[0][0][1] / self.scale) + ocr_rect.t,
|
79
|
+
(line[0][2][0] / self.scale) + ocr_rect.l,
|
80
|
+
(line[0][2][1] / self.scale) + ocr_rect.t,
|
81
|
+
),
|
82
|
+
origin=CoordOrigin.TOPLEFT,
|
83
|
+
),
|
84
|
+
)
|
85
|
+
for ix, line in enumerate(result)
|
86
|
+
]
|
87
|
+
all_ocr_cells.extend(cells)
|
88
|
+
|
89
|
+
## Remove OCR cells which overlap with programmatic cells.
|
90
|
+
filtered_ocr_cells = self.filter_ocr_cells(
|
91
|
+
all_ocr_cells, page.cells
|
92
|
+
)
|
93
|
+
|
94
|
+
page.cells.extend(filtered_ocr_cells)
|
95
|
+
|
96
|
+
# DEBUG code:
|
97
|
+
if settings.debug.visualize_ocr:
|
98
|
+
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
|
99
|
+
|
100
|
+
yield page
|