docling 2.11.0__py3-none-any.whl → 2.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/xml/__init__.py +0 -0
- docling/backend/xml/uspto_backend.py +1888 -0
- docling/cli/main.py +8 -0
- docling/datamodel/base_models.py +18 -4
- docling/datamodel/document.py +77 -13
- docling/datamodel/pipeline_options.py +68 -4
- docling/datamodel/settings.py +1 -0
- docling/document_converter.py +11 -2
- docling/models/ds_glm_model.py +34 -4
- docling/models/easyocr_model.py +37 -3
- docling/models/layout_model.py +144 -280
- docling/models/page_assemble_model.py +11 -1
- docling/models/rapid_ocr_model.py +24 -45
- docling/models/table_structure_model.py +49 -33
- docling/pipeline/base_pipeline.py +3 -1
- docling/pipeline/standard_pdf_pipeline.py +7 -3
- docling/utils/accelerator_utils.py +42 -0
- docling/utils/glm_utils.py +11 -3
- docling/utils/layout_postprocessor.py +666 -0
- {docling-2.11.0.dist-info → docling-2.13.0.dist-info}/METADATA +3 -3
- {docling-2.11.0.dist-info → docling-2.13.0.dist-info}/RECORD +24 -21
- docling/utils/layout_utils.py +0 -812
- {docling-2.11.0.dist-info → docling-2.13.0.dist-info}/LICENSE +0 -0
- {docling-2.11.0.dist-info → docling-2.13.0.dist-info}/WHEEL +0 -0
- {docling-2.11.0.dist-info → docling-2.13.0.dist-info}/entry_points.txt +0 -0
docling/cli/main.py
CHANGED
@@ -26,6 +26,8 @@ from docling.datamodel.base_models import (
|
|
26
26
|
)
|
27
27
|
from docling.datamodel.document import ConversionResult
|
28
28
|
from docling.datamodel.pipeline_options import (
|
29
|
+
AcceleratorDevice,
|
30
|
+
AcceleratorOptions,
|
29
31
|
EasyOcrOptions,
|
30
32
|
OcrEngine,
|
31
33
|
OcrMacOptions,
|
@@ -257,6 +259,10 @@ def convert(
|
|
257
259
|
help="The timeout for processing each document, in seconds.",
|
258
260
|
),
|
259
261
|
] = None,
|
262
|
+
num_threads: Annotated[int, typer.Option(..., help="Number of threads")] = 4,
|
263
|
+
device: Annotated[
|
264
|
+
AcceleratorDevice, typer.Option(..., help="Accelerator device")
|
265
|
+
] = AcceleratorDevice.AUTO,
|
260
266
|
):
|
261
267
|
if verbose == 0:
|
262
268
|
logging.basicConfig(level=logging.WARNING)
|
@@ -336,7 +342,9 @@ def convert(
|
|
336
342
|
if ocr_lang_list is not None:
|
337
343
|
ocr_options.lang = ocr_lang_list
|
338
344
|
|
345
|
+
accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
|
339
346
|
pipeline_options = PdfPipelineOptions(
|
347
|
+
accelerator_options=accelerator_options,
|
340
348
|
do_ocr=ocr,
|
341
349
|
ocr_options=ocr_options,
|
342
350
|
do_table_structure=True,
|
docling/datamodel/base_models.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
from enum import Enum
|
1
|
+
from enum import Enum
|
2
2
|
from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
3
3
|
|
4
4
|
from docling_core.types.doc import (
|
@@ -28,6 +28,8 @@ class ConversionStatus(str, Enum):
|
|
28
28
|
|
29
29
|
|
30
30
|
class InputFormat(str, Enum):
|
31
|
+
"""A document format supported by document backend parsers."""
|
32
|
+
|
31
33
|
DOCX = "docx"
|
32
34
|
PPTX = "pptx"
|
33
35
|
HTML = "html"
|
@@ -36,6 +38,7 @@ class InputFormat(str, Enum):
|
|
36
38
|
ASCIIDOC = "asciidoc"
|
37
39
|
MD = "md"
|
38
40
|
XLSX = "xlsx"
|
41
|
+
XML_USPTO = "xml_uspto"
|
39
42
|
|
40
43
|
|
41
44
|
class OutputFormat(str, Enum):
|
@@ -55,6 +58,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
|
55
58
|
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
56
59
|
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
|
57
60
|
InputFormat.XLSX: ["xlsx"],
|
61
|
+
InputFormat.XML_USPTO: ["xml", "txt"],
|
58
62
|
}
|
59
63
|
|
60
64
|
FormatToMimeType: Dict[InputFormat, List[str]] = {
|
@@ -81,10 +85,13 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
|
|
81
85
|
InputFormat.XLSX: [
|
82
86
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
83
87
|
],
|
88
|
+
InputFormat.XML_USPTO: ["application/xml", "text/plain"],
|
84
89
|
}
|
85
90
|
|
86
|
-
MimeTypeToFormat = {
|
87
|
-
mime: fmt for fmt
|
91
|
+
MimeTypeToFormat: dict[str, list[InputFormat]] = {
|
92
|
+
mime: [fmt for fmt in FormatToMimeType if mime in FormatToMimeType[fmt]]
|
93
|
+
for value in FormatToMimeType.values()
|
94
|
+
for mime in value
|
88
95
|
}
|
89
96
|
|
90
97
|
|
@@ -122,6 +129,7 @@ class Cluster(BaseModel):
|
|
122
129
|
bbox: BoundingBox
|
123
130
|
confidence: float = 1.0
|
124
131
|
cells: List[Cell] = []
|
132
|
+
children: List["Cluster"] = [] # Add child cluster support
|
125
133
|
|
126
134
|
|
127
135
|
class BasePageElement(BaseModel):
|
@@ -136,6 +144,12 @@ class LayoutPrediction(BaseModel):
|
|
136
144
|
clusters: List[Cluster] = []
|
137
145
|
|
138
146
|
|
147
|
+
class ContainerElement(
|
148
|
+
BasePageElement
|
149
|
+
): # Used for Form and Key-Value-Regions, only for typing.
|
150
|
+
pass
|
151
|
+
|
152
|
+
|
139
153
|
class Table(BasePageElement):
|
140
154
|
otsl_seq: List[str]
|
141
155
|
num_rows: int = 0
|
@@ -175,7 +189,7 @@ class PagePredictions(BaseModel):
|
|
175
189
|
equations_prediction: Optional[EquationPrediction] = None
|
176
190
|
|
177
191
|
|
178
|
-
PageElement = Union[TextElement, Table, FigureElement]
|
192
|
+
PageElement = Union[TextElement, Table, FigureElement, ContainerElement]
|
179
193
|
|
180
194
|
|
181
195
|
class AssembledUnit(BaseModel):
|
docling/datamodel/document.py
CHANGED
@@ -3,7 +3,17 @@ import re
|
|
3
3
|
from enum import Enum
|
4
4
|
from io import BytesIO
|
5
5
|
from pathlib import Path, PurePath
|
6
|
-
from typing import
|
6
|
+
from typing import (
|
7
|
+
TYPE_CHECKING,
|
8
|
+
Dict,
|
9
|
+
Iterable,
|
10
|
+
List,
|
11
|
+
Literal,
|
12
|
+
Optional,
|
13
|
+
Set,
|
14
|
+
Type,
|
15
|
+
Union,
|
16
|
+
)
|
7
17
|
|
8
18
|
import filetype
|
9
19
|
from docling_core.types.doc import (
|
@@ -63,7 +73,7 @@ _log = logging.getLogger(__name__)
|
|
63
73
|
|
64
74
|
layout_label_to_ds_type = {
|
65
75
|
DocItemLabel.TITLE: "title",
|
66
|
-
DocItemLabel.DOCUMENT_INDEX: "table
|
76
|
+
DocItemLabel.DOCUMENT_INDEX: "table",
|
67
77
|
DocItemLabel.SECTION_HEADER: "subtitle-level-1",
|
68
78
|
DocItemLabel.CHECKBOX_SELECTED: "checkbox-selected",
|
69
79
|
DocItemLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
|
@@ -78,6 +88,8 @@ layout_label_to_ds_type = {
|
|
78
88
|
DocItemLabel.PICTURE: "figure",
|
79
89
|
DocItemLabel.TEXT: "paragraph",
|
80
90
|
DocItemLabel.PARAGRAPH: "paragraph",
|
91
|
+
DocItemLabel.FORM: DocItemLabel.FORM.value,
|
92
|
+
DocItemLabel.KEY_VALUE_REGION: DocItemLabel.KEY_VALUE_REGION.value,
|
81
93
|
}
|
82
94
|
|
83
95
|
_EMPTY_DOCLING_DOC = DoclingDocument(name="dummy")
|
@@ -235,7 +247,7 @@ class _DocumentConversionInput(BaseModel):
|
|
235
247
|
if isinstance(obj, Path):
|
236
248
|
yield InputDocument(
|
237
249
|
path_or_stream=obj,
|
238
|
-
format=format,
|
250
|
+
format=format, # type: ignore[arg-type]
|
239
251
|
filename=obj.name,
|
240
252
|
limits=self.limits,
|
241
253
|
backend=backend,
|
@@ -243,7 +255,7 @@ class _DocumentConversionInput(BaseModel):
|
|
243
255
|
elif isinstance(obj, DocumentStream):
|
244
256
|
yield InputDocument(
|
245
257
|
path_or_stream=obj.stream,
|
246
|
-
format=format,
|
258
|
+
format=format, # type: ignore[arg-type]
|
247
259
|
filename=obj.name,
|
248
260
|
limits=self.limits,
|
249
261
|
backend=backend,
|
@@ -251,15 +263,15 @@ class _DocumentConversionInput(BaseModel):
|
|
251
263
|
else:
|
252
264
|
raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
|
253
265
|
|
254
|
-
def _guess_format(self, obj: Union[Path, DocumentStream]):
|
266
|
+
def _guess_format(self, obj: Union[Path, DocumentStream]) -> Optional[InputFormat]:
|
255
267
|
content = b"" # empty binary blob
|
256
|
-
|
268
|
+
formats: list[InputFormat] = []
|
257
269
|
|
258
270
|
if isinstance(obj, Path):
|
259
271
|
mime = filetype.guess_mime(str(obj))
|
260
272
|
if mime is None:
|
261
273
|
ext = obj.suffix[1:]
|
262
|
-
mime =
|
274
|
+
mime = _DocumentConversionInput._mime_from_extension(ext)
|
263
275
|
if mime is None: # must guess from
|
264
276
|
with obj.open("rb") as f:
|
265
277
|
content = f.read(1024) # Read first 1KB
|
@@ -274,15 +286,53 @@ class _DocumentConversionInput(BaseModel):
|
|
274
286
|
if ("." in obj.name and not obj.name.startswith("."))
|
275
287
|
else ""
|
276
288
|
)
|
277
|
-
mime =
|
289
|
+
mime = _DocumentConversionInput._mime_from_extension(ext)
|
278
290
|
|
279
|
-
mime = mime or
|
291
|
+
mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
|
280
292
|
mime = mime or "text/plain"
|
293
|
+
formats = MimeTypeToFormat.get(mime, [])
|
294
|
+
if formats:
|
295
|
+
# TODO: remove application/xml case after adding another XML parse
|
296
|
+
if len(formats) == 1 and mime not in ("text/plain", "application/xml"):
|
297
|
+
return formats[0]
|
298
|
+
else: # ambiguity in formats
|
299
|
+
return _DocumentConversionInput._guess_from_content(
|
300
|
+
content, mime, formats
|
301
|
+
)
|
302
|
+
else:
|
303
|
+
return None
|
304
|
+
|
305
|
+
@staticmethod
|
306
|
+
def _guess_from_content(
|
307
|
+
content: bytes, mime: str, formats: list[InputFormat]
|
308
|
+
) -> Optional[InputFormat]:
|
309
|
+
"""Guess the input format of a document by checking part of its content."""
|
310
|
+
input_format: Optional[InputFormat] = None
|
311
|
+
content_str = content.decode("utf-8")
|
312
|
+
|
313
|
+
if mime == "application/xml":
|
314
|
+
match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
|
315
|
+
if match_doctype:
|
316
|
+
xml_doctype = match_doctype.group()
|
317
|
+
if InputFormat.XML_USPTO in formats and any(
|
318
|
+
item in xml_doctype
|
319
|
+
for item in (
|
320
|
+
"us-patent-application-v4",
|
321
|
+
"us-patent-grant-v4",
|
322
|
+
"us-grant-025",
|
323
|
+
"patent-application-publication",
|
324
|
+
)
|
325
|
+
):
|
326
|
+
input_format = InputFormat.XML_USPTO
|
327
|
+
|
328
|
+
elif mime == "text/plain":
|
329
|
+
if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
|
330
|
+
input_format = InputFormat.XML_USPTO
|
281
331
|
|
282
|
-
|
283
|
-
return format
|
332
|
+
return input_format
|
284
333
|
|
285
|
-
|
334
|
+
@staticmethod
|
335
|
+
def _mime_from_extension(ext):
|
286
336
|
mime = None
|
287
337
|
if ext in FormatToExtensions[InputFormat.ASCIIDOC]:
|
288
338
|
mime = FormatToMimeType[InputFormat.ASCIIDOC][0]
|
@@ -293,7 +343,19 @@ class _DocumentConversionInput(BaseModel):
|
|
293
343
|
|
294
344
|
return mime
|
295
345
|
|
296
|
-
|
346
|
+
@staticmethod
|
347
|
+
def _detect_html_xhtml(
|
348
|
+
content: bytes,
|
349
|
+
) -> Optional[Literal["application/xhtml+xml", "application/xml", "text/html"]]:
|
350
|
+
"""Guess the mime type of an XHTML, HTML, or XML file from its content.
|
351
|
+
|
352
|
+
Args:
|
353
|
+
content: A short piece of a document from its beginning.
|
354
|
+
|
355
|
+
Returns:
|
356
|
+
The mime type of an XHTML, HTML, or XML file, or None if the content does
|
357
|
+
not match any of these formats.
|
358
|
+
"""
|
297
359
|
content_str = content.decode("ascii", errors="ignore").lower()
|
298
360
|
# Remove XML comments
|
299
361
|
content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
|
@@ -302,6 +364,8 @@ class _DocumentConversionInput(BaseModel):
|
|
302
364
|
if re.match(r"<\?xml", content_str):
|
303
365
|
if "xhtml" in content_str[:1000]:
|
304
366
|
return "application/xhtml+xml"
|
367
|
+
else:
|
368
|
+
return "application/xml"
|
305
369
|
|
306
370
|
if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
|
307
371
|
return "text/html"
|
docling/datamodel/pipeline_options.py
CHANGED
@@ -1,8 +1,66 @@
|
|
1
|
+
import logging
|
2
|
+
import os
|
3
|
+
import warnings
|
1
4
|
from enum import Enum
|
2
5
|
from pathlib import Path
|
3
|
-
from typing import List, Literal, Optional, Union
|
6
|
+
from typing import Annotated, Any, Dict, List, Literal, Optional, Tuple, Type, Union
|
4
7
|
|
5
|
-
from pydantic import BaseModel, ConfigDict, Field
|
8
|
+
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
|
9
|
+
from pydantic_settings import (
|
10
|
+
BaseSettings,
|
11
|
+
PydanticBaseSettingsSource,
|
12
|
+
SettingsConfigDict,
|
13
|
+
)
|
14
|
+
from typing_extensions import deprecated
|
15
|
+
|
16
|
+
_log = logging.getLogger(__name__)
|
17
|
+
|
18
|
+
|
19
|
+
class AcceleratorDevice(str, Enum):
|
20
|
+
"""Devices to run model inference"""
|
21
|
+
|
22
|
+
AUTO = "auto"
|
23
|
+
CPU = "cpu"
|
24
|
+
CUDA = "cuda"
|
25
|
+
MPS = "mps"
|
26
|
+
|
27
|
+
|
28
|
+
class AcceleratorOptions(BaseSettings):
|
29
|
+
model_config = SettingsConfigDict(
|
30
|
+
env_prefix="DOCLING_", env_nested_delimiter="_", populate_by_name=True
|
31
|
+
)
|
32
|
+
|
33
|
+
num_threads: int = 4
|
34
|
+
device: AcceleratorDevice = AcceleratorDevice.AUTO
|
35
|
+
|
36
|
+
@model_validator(mode="before")
|
37
|
+
@classmethod
|
38
|
+
def check_alternative_envvars(cls, data: Any) -> Any:
|
39
|
+
r"""
|
40
|
+
Set num_threads from the "alternative" envvar OMP_NUM_THREADS.
|
41
|
+
The alternative envvar is used only if it is valid and the regular envvar is not set.
|
42
|
+
|
43
|
+
Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide
|
44
|
+
the same functionality. In case the alias envvar is set and the user tries to override the
|
45
|
+
parameter in settings initialization, Pydantic treats the parameter provided in __init__()
|
46
|
+
as an extra input instead of simply overwriting the evvar value for that parameter.
|
47
|
+
"""
|
48
|
+
if isinstance(data, dict):
|
49
|
+
input_num_threads = data.get("num_threads")
|
50
|
+
|
51
|
+
# Check if to set the num_threads from the alternative envvar
|
52
|
+
if input_num_threads is None:
|
53
|
+
docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
|
54
|
+
omp_num_threads = os.getenv("OMP_NUM_THREADS")
|
55
|
+
if docling_num_threads is None and omp_num_threads is not None:
|
56
|
+
try:
|
57
|
+
data["num_threads"] = int(omp_num_threads)
|
58
|
+
except ValueError:
|
59
|
+
_log.error(
|
60
|
+
"Ignoring misformatted envvar OMP_NUM_THREADS '%s'",
|
61
|
+
omp_num_threads,
|
62
|
+
)
|
63
|
+
return data
|
6
64
|
|
7
65
|
|
8
66
|
class TableFormerMode(str, Enum):
|
@@ -78,9 +136,14 @@ class EasyOcrOptions(OcrOptions):
|
|
78
136
|
|
79
137
|
kind: Literal["easyocr"] = "easyocr"
|
80
138
|
lang: List[str] = ["fr", "de", "es", "en"]
|
81
|
-
|
139
|
+
|
140
|
+
use_gpu: Optional[bool] = None
|
141
|
+
|
142
|
+
confidence_threshold: float = 0.65
|
143
|
+
|
82
144
|
model_storage_directory: Optional[str] = None
|
83
|
-
|
145
|
+
recog_network: Optional[str] = "standard"
|
146
|
+
download_enabled: bool = True
|
84
147
|
|
85
148
|
model_config = ConfigDict(
|
86
149
|
extra="forbid",
|
@@ -153,6 +216,7 @@ class PipelineOptions(BaseModel):
|
|
153
216
|
True # This default will be set to False on a future version of docling
|
154
217
|
)
|
155
218
|
document_timeout: Optional[float] = None
|
219
|
+
accelerator_options: AcceleratorOptions = AcceleratorOptions()
|
156
220
|
|
157
221
|
|
158
222
|
class PdfPipelineOptions(PipelineOptions):
|
docling/datamodel/settings.py
CHANGED
docling/document_converter.py
CHANGED
@@ -15,6 +15,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
|
|
15
15
|
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
16
16
|
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
17
17
|
from docling.backend.msword_backend import MsWordDocumentBackend
|
18
|
+
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
|
18
19
|
from docling.datamodel.base_models import (
|
19
20
|
ConversionStatus,
|
20
21
|
DoclingComponentType,
|
@@ -82,12 +83,17 @@ class HTMLFormatOption(FormatOption):
|
|
82
83
|
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
|
83
84
|
|
84
85
|
|
85
|
-
class
|
86
|
+
class PatentUsptoFormatOption(FormatOption):
|
87
|
+
pipeline_cls: Type = SimplePipeline
|
88
|
+
backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
|
89
|
+
|
90
|
+
|
91
|
+
class ImageFormatOption(FormatOption):
|
86
92
|
pipeline_cls: Type = StandardPdfPipeline
|
87
93
|
backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
|
88
94
|
|
89
95
|
|
90
|
-
class
|
96
|
+
class PdfFormatOption(FormatOption):
|
91
97
|
pipeline_cls: Type = StandardPdfPipeline
|
92
98
|
backend: Type[AbstractDocumentBackend] = DoclingParseV2DocumentBackend
|
93
99
|
|
@@ -112,6 +118,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
|
|
112
118
|
InputFormat.HTML: FormatOption(
|
113
119
|
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
|
114
120
|
),
|
121
|
+
InputFormat.XML_USPTO: FormatOption(
|
122
|
+
pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
|
123
|
+
),
|
115
124
|
InputFormat.IMAGE: FormatOption(
|
116
125
|
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
|
117
126
|
),
|
docling/models/ds_glm_model.py
CHANGED
@@ -22,9 +22,15 @@ from docling_core.types.legacy_doc.document import (
|
|
22
22
|
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
|
23
23
|
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
|
24
24
|
from PIL import ImageDraw
|
25
|
-
from pydantic import BaseModel, ConfigDict
|
26
|
-
|
27
|
-
from docling.datamodel.base_models import
|
25
|
+
from pydantic import BaseModel, ConfigDict, TypeAdapter
|
26
|
+
|
27
|
+
from docling.datamodel.base_models import (
|
28
|
+
Cluster,
|
29
|
+
ContainerElement,
|
30
|
+
FigureElement,
|
31
|
+
Table,
|
32
|
+
TextElement,
|
33
|
+
)
|
28
34
|
from docling.datamodel.document import ConversionResult, layout_label_to_ds_type
|
29
35
|
from docling.datamodel.settings import settings
|
30
36
|
from docling.utils.glm_utils import to_docling_document
|
@@ -204,7 +210,31 @@ class GlmModel:
|
|
204
210
|
)
|
205
211
|
],
|
206
212
|
obj_type=layout_label_to_ds_type.get(element.label),
|
207
|
-
|
213
|
+
payload={
|
214
|
+
"children": TypeAdapter(List[Cluster]).dump_python(
|
215
|
+
element.cluster.children
|
216
|
+
)
|
217
|
+
}, # hack to channel child clusters through GLM
|
218
|
+
)
|
219
|
+
)
|
220
|
+
elif isinstance(element, ContainerElement):
|
221
|
+
main_text.append(
|
222
|
+
BaseText(
|
223
|
+
text="",
|
224
|
+
payload={
|
225
|
+
"children": TypeAdapter(List[Cluster]).dump_python(
|
226
|
+
element.cluster.children
|
227
|
+
)
|
228
|
+
}, # hack to channel child clusters through GLM
|
229
|
+
obj_type=layout_label_to_ds_type.get(element.label),
|
230
|
+
name=element.label,
|
231
|
+
prov=[
|
232
|
+
Prov(
|
233
|
+
bbox=target_bbox,
|
234
|
+
page=element.page_no + 1,
|
235
|
+
span=[0, 0],
|
236
|
+
)
|
237
|
+
],
|
208
238
|
)
|
209
239
|
)
|
210
240
|
|
docling/models/easyocr_model.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
import logging
|
2
|
+
import warnings
|
2
3
|
from typing import Iterable
|
3
4
|
|
4
5
|
import numpy
|
@@ -7,16 +8,26 @@ from docling_core.types.doc import BoundingBox, CoordOrigin
|
|
7
8
|
|
8
9
|
from docling.datamodel.base_models import Cell, OcrCell, Page
|
9
10
|
from docling.datamodel.document import ConversionResult
|
10
|
-
from docling.datamodel.pipeline_options import
|
11
|
+
from docling.datamodel.pipeline_options import (
|
12
|
+
AcceleratorDevice,
|
13
|
+
AcceleratorOptions,
|
14
|
+
EasyOcrOptions,
|
15
|
+
)
|
11
16
|
from docling.datamodel.settings import settings
|
12
17
|
from docling.models.base_ocr_model import BaseOcrModel
|
18
|
+
from docling.utils.accelerator_utils import decide_device
|
13
19
|
from docling.utils.profiling import TimeRecorder
|
14
20
|
|
15
21
|
_log = logging.getLogger(__name__)
|
16
22
|
|
17
23
|
|
18
24
|
class EasyOcrModel(BaseOcrModel):
|
19
|
-
def __init__(
|
25
|
+
def __init__(
|
26
|
+
self,
|
27
|
+
enabled: bool,
|
28
|
+
options: EasyOcrOptions,
|
29
|
+
accelerator_options: AcceleratorOptions,
|
30
|
+
):
|
20
31
|
super().__init__(enabled=enabled, options=options)
|
21
32
|
self.options: EasyOcrOptions
|
22
33
|
|
@@ -31,11 +42,33 @@ class EasyOcrModel(BaseOcrModel):
|
|
31
42
|
"Alternatively, Docling has support for other OCR engines. See the documentation."
|
32
43
|
)
|
33
44
|
|
45
|
+
if self.options.use_gpu is None:
|
46
|
+
device = decide_device(accelerator_options.device)
|
47
|
+
# Enable easyocr GPU if running on CUDA, MPS
|
48
|
+
use_gpu = any(
|
49
|
+
[
|
50
|
+
device.startswith(x)
|
51
|
+
for x in [
|
52
|
+
AcceleratorDevice.CUDA.value,
|
53
|
+
AcceleratorDevice.MPS.value,
|
54
|
+
]
|
55
|
+
]
|
56
|
+
)
|
57
|
+
else:
|
58
|
+
warnings.warn(
|
59
|
+
"Deprecated field. Better to set the `accelerator_options.device` in `pipeline_options`. "
|
60
|
+
"When `use_gpu and accelerator_options.device == AcceleratorDevice.CUDA` the GPU is used "
|
61
|
+
"to run EasyOCR. Otherwise, EasyOCR runs in CPU."
|
62
|
+
)
|
63
|
+
use_gpu = self.options.use_gpu
|
64
|
+
|
34
65
|
self.reader = easyocr.Reader(
|
35
66
|
lang_list=self.options.lang,
|
36
|
-
gpu=
|
67
|
+
gpu=use_gpu,
|
37
68
|
model_storage_directory=self.options.model_storage_directory,
|
69
|
+
recog_network=self.options.recog_network,
|
38
70
|
download_enabled=self.options.download_enabled,
|
71
|
+
verbose=False,
|
39
72
|
)
|
40
73
|
|
41
74
|
def __call__(
|
@@ -85,6 +118,7 @@ class EasyOcrModel(BaseOcrModel):
|
|
85
118
|
),
|
86
119
|
)
|
87
120
|
for ix, line in enumerate(result)
|
121
|
+
if line[2] >= self.options.confidence_threshold
|
88
122
|
]
|
89
123
|
all_ocr_cells.extend(cells)
|
90
124
|
|