docling 2.26.0__py3-none-any.whl → 2.28.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/asciidoc_backend.py +1 -1
- docling/backend/csv_backend.py +1 -1
- docling/backend/docling_parse_backend.py +21 -13
- docling/backend/docling_parse_v2_backend.py +20 -12
- docling/backend/docling_parse_v4_backend.py +192 -0
- docling/backend/docx/__init__.py +0 -0
- docling/backend/docx/latex/__init__.py +0 -0
- docling/backend/docx/latex/latex_dict.py +271 -0
- docling/backend/docx/latex/omml.py +453 -0
- docling/backend/html_backend.py +7 -7
- docling/backend/md_backend.py +1 -1
- docling/backend/msexcel_backend.py +2 -45
- docling/backend/mspowerpoint_backend.py +19 -1
- docling/backend/msword_backend.py +68 -3
- docling/backend/pdf_backend.py +7 -2
- docling/backend/pypdfium2_backend.py +52 -30
- docling/backend/xml/uspto_backend.py +1 -1
- docling/cli/main.py +135 -53
- docling/cli/models.py +1 -1
- docling/datamodel/base_models.py +8 -10
- docling/datamodel/pipeline_options.py +54 -32
- docling/document_converter.py +5 -5
- docling/models/base_model.py +9 -1
- docling/models/base_ocr_model.py +27 -16
- docling/models/easyocr_model.py +28 -13
- docling/models/factories/__init__.py +27 -0
- docling/models/factories/base_factory.py +122 -0
- docling/models/factories/ocr_factory.py +11 -0
- docling/models/factories/picture_description_factory.py +11 -0
- docling/models/hf_mlx_model.py +137 -0
- docling/models/ocr_mac_model.py +39 -11
- docling/models/page_preprocessing_model.py +4 -0
- docling/models/picture_description_api_model.py +20 -3
- docling/models/picture_description_base_model.py +19 -3
- docling/models/picture_description_vlm_model.py +14 -2
- docling/models/plugins/__init__.py +0 -0
- docling/models/plugins/defaults.py +28 -0
- docling/models/rapid_ocr_model.py +34 -13
- docling/models/table_structure_model.py +13 -4
- docling/models/tesseract_ocr_cli_model.py +40 -15
- docling/models/tesseract_ocr_model.py +37 -12
- docling/pipeline/standard_pdf_pipeline.py +25 -78
- docling/pipeline/vlm_pipeline.py +78 -398
- docling/utils/export.py +8 -6
- docling/utils/layout_postprocessor.py +26 -23
- docling/utils/visualization.py +1 -1
- {docling-2.26.0.dist-info → docling-2.28.0.dist-info}/METADATA +47 -23
- docling-2.28.0.dist-info/RECORD +84 -0
- {docling-2.26.0.dist-info → docling-2.28.0.dist-info}/entry_points.txt +3 -0
- docling-2.26.0.dist-info/RECORD +0 -72
- {docling-2.26.0.dist-info → docling-2.28.0.dist-info}/LICENSE +0 -0
- {docling-2.26.0.dist-info → docling-2.28.0.dist-info}/WHEEL +0 -0
@@ -1,10 +1,9 @@
|
|
1
1
|
import logging
|
2
2
|
import os
|
3
3
|
import re
|
4
|
-
import warnings
|
5
4
|
from enum import Enum
|
6
5
|
from pathlib import Path
|
7
|
-
from typing import
|
6
|
+
from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
|
8
7
|
|
9
8
|
from pydantic import (
|
10
9
|
AnyUrl,
|
@@ -13,13 +12,8 @@ from pydantic import (
|
|
13
12
|
Field,
|
14
13
|
field_validator,
|
15
14
|
model_validator,
|
16
|
-
validator,
|
17
|
-
)
|
18
|
-
from pydantic_settings import (
|
19
|
-
BaseSettings,
|
20
|
-
PydanticBaseSettingsSource,
|
21
|
-
SettingsConfigDict,
|
22
15
|
)
|
16
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
23
17
|
from typing_extensions import deprecated
|
24
18
|
|
25
19
|
_log = logging.getLogger(__name__)
|
@@ -83,6 +77,12 @@ class AcceleratorOptions(BaseSettings):
|
|
83
77
|
return data
|
84
78
|
|
85
79
|
|
80
|
+
class BaseOptions(BaseModel):
|
81
|
+
"""Base class for options."""
|
82
|
+
|
83
|
+
kind: ClassVar[str]
|
84
|
+
|
85
|
+
|
86
86
|
class TableFormerMode(str, Enum):
|
87
87
|
"""Modes for the TableFormer model."""
|
88
88
|
|
@@ -102,10 +102,9 @@ class TableStructureOptions(BaseModel):
|
|
102
102
|
mode: TableFormerMode = TableFormerMode.ACCURATE
|
103
103
|
|
104
104
|
|
105
|
-
class OcrOptions(
|
105
|
+
class OcrOptions(BaseOptions):
|
106
106
|
"""OCR options."""
|
107
107
|
|
108
|
-
kind: str
|
109
108
|
lang: List[str]
|
110
109
|
force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
|
111
110
|
bitmap_area_threshold: float = (
|
@@ -116,7 +115,7 @@ class OcrOptions(BaseModel):
|
|
116
115
|
class RapidOcrOptions(OcrOptions):
|
117
116
|
"""Options for the RapidOCR engine."""
|
118
117
|
|
119
|
-
kind: Literal["rapidocr"] = "rapidocr"
|
118
|
+
kind: ClassVar[Literal["rapidocr"]] = "rapidocr"
|
120
119
|
|
121
120
|
# English and chinese are the most commly used models and have been tested with RapidOCR.
|
122
121
|
lang: List[str] = [
|
@@ -155,7 +154,7 @@ class RapidOcrOptions(OcrOptions):
|
|
155
154
|
class EasyOcrOptions(OcrOptions):
|
156
155
|
"""Options for the EasyOCR engine."""
|
157
156
|
|
158
|
-
kind: Literal["easyocr"] = "easyocr"
|
157
|
+
kind: ClassVar[Literal["easyocr"]] = "easyocr"
|
159
158
|
lang: List[str] = ["fr", "de", "es", "en"]
|
160
159
|
|
161
160
|
use_gpu: Optional[bool] = None
|
@@ -175,7 +174,7 @@ class EasyOcrOptions(OcrOptions):
|
|
175
174
|
class TesseractCliOcrOptions(OcrOptions):
|
176
175
|
"""Options for the TesseractCli engine."""
|
177
176
|
|
178
|
-
kind: Literal["tesseract"] = "tesseract"
|
177
|
+
kind: ClassVar[Literal["tesseract"]] = "tesseract"
|
179
178
|
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
180
179
|
tesseract_cmd: str = "tesseract"
|
181
180
|
path: Optional[str] = None
|
@@ -188,7 +187,7 @@ class TesseractCliOcrOptions(OcrOptions):
|
|
188
187
|
class TesseractOcrOptions(OcrOptions):
|
189
188
|
"""Options for the Tesseract engine."""
|
190
189
|
|
191
|
-
kind: Literal["tesserocr"] = "tesserocr"
|
190
|
+
kind: ClassVar[Literal["tesserocr"]] = "tesserocr"
|
192
191
|
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
193
192
|
path: Optional[str] = None
|
194
193
|
|
@@ -200,7 +199,7 @@ class TesseractOcrOptions(OcrOptions):
|
|
200
199
|
class OcrMacOptions(OcrOptions):
|
201
200
|
"""Options for the Mac OCR engine."""
|
202
201
|
|
203
|
-
kind: Literal["ocrmac"] = "ocrmac"
|
202
|
+
kind: ClassVar[Literal["ocrmac"]] = "ocrmac"
|
204
203
|
lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
|
205
204
|
recognition: str = "accurate"
|
206
205
|
framework: str = "vision"
|
@@ -210,8 +209,7 @@ class OcrMacOptions(OcrOptions):
|
|
210
209
|
)
|
211
210
|
|
212
211
|
|
213
|
-
class PictureDescriptionBaseOptions(
|
214
|
-
kind: str
|
212
|
+
class PictureDescriptionBaseOptions(BaseOptions):
|
215
213
|
batch_size: int = 8
|
216
214
|
scale: float = 2
|
217
215
|
|
@@ -221,7 +219,7 @@ class PictureDescriptionBaseOptions(BaseModel):
|
|
221
219
|
|
222
220
|
|
223
221
|
class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
|
224
|
-
kind: Literal["api"] = "api"
|
222
|
+
kind: ClassVar[Literal["api"]] = "api"
|
225
223
|
|
226
224
|
url: AnyUrl = AnyUrl("http://localhost:8000/v1/chat/completions")
|
227
225
|
headers: Dict[str, str] = {}
|
@@ -233,7 +231,7 @@ class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
|
|
233
231
|
|
234
232
|
|
235
233
|
class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
|
236
|
-
kind: Literal["vlm"] = "vlm"
|
234
|
+
kind: ClassVar[Literal["vlm"]] = "vlm"
|
237
235
|
|
238
236
|
repo_id: str
|
239
237
|
prompt: str = "Describe this image in a few sentences."
|
@@ -265,6 +263,11 @@ class ResponseFormat(str, Enum):
|
|
265
263
|
MARKDOWN = "markdown"
|
266
264
|
|
267
265
|
|
266
|
+
class InferenceFramework(str, Enum):
|
267
|
+
MLX = "mlx"
|
268
|
+
TRANSFORMERS = "transformers"
|
269
|
+
|
270
|
+
|
268
271
|
class HuggingFaceVlmOptions(BaseVlmOptions):
|
269
272
|
kind: Literal["hf_model_options"] = "hf_model_options"
|
270
273
|
|
@@ -273,6 +276,7 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
|
|
273
276
|
llm_int8_threshold: float = 6.0
|
274
277
|
quantized: bool = False
|
275
278
|
|
279
|
+
inference_framework: InferenceFramework
|
276
280
|
response_format: ResponseFormat
|
277
281
|
|
278
282
|
@property
|
@@ -280,10 +284,19 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
|
|
280
284
|
return self.repo_id.replace("/", "--")
|
281
285
|
|
282
286
|
|
287
|
+
smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
|
288
|
+
repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
|
289
|
+
prompt="Convert this page to docling.",
|
290
|
+
response_format=ResponseFormat.DOCTAGS,
|
291
|
+
inference_framework=InferenceFramework.MLX,
|
292
|
+
)
|
293
|
+
|
294
|
+
|
283
295
|
smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
|
284
296
|
repo_id="ds4sd/SmolDocling-256M-preview",
|
285
297
|
prompt="Convert this page to docling.",
|
286
298
|
response_format=ResponseFormat.DOCTAGS,
|
299
|
+
inference_framework=InferenceFramework.TRANSFORMERS,
|
287
300
|
)
|
288
301
|
|
289
302
|
granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
|
@@ -291,9 +304,15 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
|
|
291
304
|
# prompt="OCR the full page to markdown.",
|
292
305
|
prompt="OCR this image.",
|
293
306
|
response_format=ResponseFormat.MARKDOWN,
|
307
|
+
inference_framework=InferenceFramework.TRANSFORMERS,
|
294
308
|
)
|
295
309
|
|
296
310
|
|
311
|
+
class VlmModelType(str, Enum):
|
312
|
+
SMOLDOCLING = "smoldocling"
|
313
|
+
GRANITE_VISION = "granite_vision"
|
314
|
+
|
315
|
+
|
297
316
|
# Define an enum for the backend options
|
298
317
|
class PdfBackend(str, Enum):
|
299
318
|
"""Enum of valid PDF backends."""
|
@@ -301,9 +320,11 @@ class PdfBackend(str, Enum):
|
|
301
320
|
PYPDFIUM2 = "pypdfium2"
|
302
321
|
DLPARSE_V1 = "dlparse_v1"
|
303
322
|
DLPARSE_V2 = "dlparse_v2"
|
323
|
+
DLPARSE_V4 = "dlparse_v4"
|
304
324
|
|
305
325
|
|
306
326
|
# Define an enum for the ocr engines
|
327
|
+
@deprecated("Use ocr_factory.registered_enum")
|
307
328
|
class OcrEngine(str, Enum):
|
308
329
|
"""Enum of valid OCR engines."""
|
309
330
|
|
@@ -323,16 +344,18 @@ class PipelineOptions(BaseModel):
|
|
323
344
|
document_timeout: Optional[float] = None
|
324
345
|
accelerator_options: AcceleratorOptions = AcceleratorOptions()
|
325
346
|
enable_remote_services: bool = False
|
347
|
+
allow_external_plugins: bool = False
|
326
348
|
|
327
349
|
|
328
350
|
class PaginatedPipelineOptions(PipelineOptions):
|
351
|
+
artifacts_path: Optional[Union[Path, str]] = None
|
352
|
+
|
329
353
|
images_scale: float = 1.0
|
330
354
|
generate_page_images: bool = False
|
331
355
|
generate_picture_images: bool = False
|
332
356
|
|
333
357
|
|
334
358
|
class VlmPipelineOptions(PaginatedPipelineOptions):
|
335
|
-
artifacts_path: Optional[Union[Path, str]] = None
|
336
359
|
|
337
360
|
generate_page_images: bool = True
|
338
361
|
force_backend_text: bool = (
|
@@ -345,7 +368,6 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
|
|
345
368
|
class PdfPipelineOptions(PaginatedPipelineOptions):
|
346
369
|
"""Options for the PDF pipeline."""
|
347
370
|
|
348
|
-
artifacts_path: Optional[Union[Path, str]] = None
|
349
371
|
do_table_structure: bool = True # True: perform table structure extraction
|
350
372
|
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
351
373
|
do_code_enrichment: bool = False # True: perform code OCR
|
@@ -358,17 +380,10 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
|
|
358
380
|
# If True, text from backend will be used instead of generated text
|
359
381
|
|
360
382
|
table_structure_options: TableStructureOptions = TableStructureOptions()
|
361
|
-
ocr_options:
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
OcrMacOptions,
|
366
|
-
RapidOcrOptions,
|
367
|
-
] = Field(EasyOcrOptions(), discriminator="kind")
|
368
|
-
picture_description_options: Annotated[
|
369
|
-
Union[PictureDescriptionApiOptions, PictureDescriptionVlmOptions],
|
370
|
-
Field(discriminator="kind"),
|
371
|
-
] = smolvlm_picture_description
|
383
|
+
ocr_options: OcrOptions = EasyOcrOptions()
|
384
|
+
picture_description_options: PictureDescriptionBaseOptions = (
|
385
|
+
smolvlm_picture_description
|
386
|
+
)
|
372
387
|
|
373
388
|
images_scale: float = 1.0
|
374
389
|
generate_page_images: bool = False
|
@@ -381,3 +396,10 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
|
|
381
396
|
"before conversion and then use the `TableItem.get_image` function."
|
382
397
|
),
|
383
398
|
)
|
399
|
+
|
400
|
+
generate_parsed_pages: bool = False
|
401
|
+
|
402
|
+
|
403
|
+
class PdfPipeline(str, Enum):
|
404
|
+
STANDARD = "standard"
|
405
|
+
VLM = "vlm"
|
docling/document_converter.py
CHANGED
@@ -11,7 +11,7 @@ from pydantic import BaseModel, ConfigDict, model_validator, validate_call
|
|
11
11
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
12
12
|
from docling.backend.asciidoc_backend import AsciiDocBackend
|
13
13
|
from docling.backend.csv_backend import CsvDocumentBackend
|
14
|
-
from docling.backend.
|
14
|
+
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
15
15
|
from docling.backend.html_backend import HTMLDocumentBackend
|
16
16
|
from docling.backend.json.docling_json_backend import DoclingJSONBackend
|
17
17
|
from docling.backend.md_backend import MarkdownDocumentBackend
|
@@ -109,12 +109,12 @@ class XMLJatsFormatOption(FormatOption):
|
|
109
109
|
|
110
110
|
class ImageFormatOption(FormatOption):
|
111
111
|
pipeline_cls: Type = StandardPdfPipeline
|
112
|
-
backend: Type[AbstractDocumentBackend] =
|
112
|
+
backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
|
113
113
|
|
114
114
|
|
115
115
|
class PdfFormatOption(FormatOption):
|
116
116
|
pipeline_cls: Type = StandardPdfPipeline
|
117
|
-
backend: Type[AbstractDocumentBackend] =
|
117
|
+
backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
|
118
118
|
|
119
119
|
|
120
120
|
def _get_default_option(format: InputFormat) -> FormatOption:
|
@@ -147,10 +147,10 @@ def _get_default_option(format: InputFormat) -> FormatOption:
|
|
147
147
|
pipeline_cls=SimplePipeline, backend=JatsDocumentBackend
|
148
148
|
),
|
149
149
|
InputFormat.IMAGE: FormatOption(
|
150
|
-
pipeline_cls=StandardPdfPipeline, backend=
|
150
|
+
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend
|
151
151
|
),
|
152
152
|
InputFormat.PDF: FormatOption(
|
153
|
-
pipeline_cls=StandardPdfPipeline, backend=
|
153
|
+
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend
|
154
154
|
),
|
155
155
|
InputFormat.JSON_DOCLING: FormatOption(
|
156
156
|
pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
|
docling/models/base_model.py
CHANGED
@@ -1,14 +1,22 @@
|
|
1
1
|
from abc import ABC, abstractmethod
|
2
|
-
from typing import Any, Generic, Iterable, Optional
|
2
|
+
from typing import Any, Generic, Iterable, Optional, Protocol, Type
|
3
3
|
|
4
4
|
from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
|
5
5
|
from typing_extensions import TypeVar
|
6
6
|
|
7
7
|
from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
|
8
8
|
from docling.datamodel.document import ConversionResult
|
9
|
+
from docling.datamodel.pipeline_options import BaseOptions
|
9
10
|
from docling.datamodel.settings import settings
|
10
11
|
|
11
12
|
|
13
|
+
class BaseModelWithOptions(Protocol):
|
14
|
+
@classmethod
|
15
|
+
def get_options_type(cls) -> Type[BaseOptions]: ...
|
16
|
+
|
17
|
+
def __init__(self, *, options: BaseOptions, **kwargs): ...
|
18
|
+
|
19
|
+
|
12
20
|
class BasePageModel(ABC):
|
13
21
|
@abstractmethod
|
14
22
|
def __call__(
|
docling/models/base_ocr_model.py
CHANGED
@@ -2,25 +2,33 @@ import copy
|
|
2
2
|
import logging
|
3
3
|
from abc import abstractmethod
|
4
4
|
from pathlib import Path
|
5
|
-
from typing import Iterable, List
|
5
|
+
from typing import Iterable, List, Optional, Type
|
6
6
|
|
7
7
|
import numpy as np
|
8
8
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
9
|
+
from docling_core.types.doc.page import BoundingRectangle, PdfTextCell, TextCell
|
9
10
|
from PIL import Image, ImageDraw
|
10
11
|
from rtree import index
|
11
12
|
from scipy.ndimage import binary_dilation, find_objects, label
|
12
13
|
|
13
|
-
from docling.datamodel.base_models import
|
14
|
+
from docling.datamodel.base_models import Page
|
14
15
|
from docling.datamodel.document import ConversionResult
|
15
|
-
from docling.datamodel.pipeline_options import OcrOptions
|
16
|
+
from docling.datamodel.pipeline_options import AcceleratorOptions, OcrOptions
|
16
17
|
from docling.datamodel.settings import settings
|
17
|
-
from docling.models.base_model import BasePageModel
|
18
|
+
from docling.models.base_model import BaseModelWithOptions, BasePageModel
|
18
19
|
|
19
20
|
_log = logging.getLogger(__name__)
|
20
21
|
|
21
22
|
|
22
|
-
class BaseOcrModel(BasePageModel):
|
23
|
-
def __init__(
|
23
|
+
class BaseOcrModel(BasePageModel, BaseModelWithOptions):
|
24
|
+
def __init__(
|
25
|
+
self,
|
26
|
+
*,
|
27
|
+
enabled: bool,
|
28
|
+
artifacts_path: Optional[Path],
|
29
|
+
options: OcrOptions,
|
30
|
+
accelerator_options: AcceleratorOptions,
|
31
|
+
):
|
24
32
|
self.enabled = enabled
|
25
33
|
self.options = options
|
26
34
|
|
@@ -104,11 +112,13 @@ class BaseOcrModel(BasePageModel):
|
|
104
112
|
p.dimension = 2
|
105
113
|
idx = index.Index(properties=p)
|
106
114
|
for i, cell in enumerate(programmatic_cells):
|
107
|
-
idx.insert(i, cell.
|
115
|
+
idx.insert(i, cell.rect.to_bounding_box().as_tuple())
|
108
116
|
|
109
117
|
def is_overlapping_with_existing_cells(ocr_cell):
|
110
118
|
# Query the R-tree to get overlapping rectangles
|
111
|
-
possible_matches_index = list(
|
119
|
+
possible_matches_index = list(
|
120
|
+
idx.intersection(ocr_cell.rect.to_bounding_box().as_tuple())
|
121
|
+
)
|
112
122
|
|
113
123
|
return (
|
114
124
|
len(possible_matches_index) > 0
|
@@ -125,10 +135,7 @@ class BaseOcrModel(BasePageModel):
|
|
125
135
|
"""
|
126
136
|
if self.options.force_full_page_ocr:
|
127
137
|
# If a full page OCR is forced, use only the OCR cells
|
128
|
-
cells =
|
129
|
-
Cell(id=c_ocr.id, text=c_ocr.text, bbox=c_ocr.bbox)
|
130
|
-
for c_ocr in ocr_cells
|
131
|
-
]
|
138
|
+
cells = ocr_cells
|
132
139
|
return cells
|
133
140
|
|
134
141
|
## Remove OCR cells which overlap with programmatic cells.
|
@@ -156,7 +163,7 @@ class BaseOcrModel(BasePageModel):
|
|
156
163
|
|
157
164
|
# Draw OCR and programmatic cells
|
158
165
|
for tc in page.cells:
|
159
|
-
x0, y0, x1, y1 = tc.
|
166
|
+
x0, y0, x1, y1 = tc.rect.to_bounding_box().as_tuple()
|
160
167
|
y0 *= scale_x
|
161
168
|
y1 *= scale_y
|
162
169
|
x0 *= scale_x
|
@@ -165,9 +172,8 @@ class BaseOcrModel(BasePageModel):
|
|
165
172
|
if y1 <= y0:
|
166
173
|
y1, y0 = y0, y1
|
167
174
|
|
168
|
-
color = "gray"
|
169
|
-
|
170
|
-
color = "magenta"
|
175
|
+
color = "magenta" if tc.from_ocr else "gray"
|
176
|
+
|
171
177
|
draw.rectangle([(x0, y0), (x1, y1)], outline=color)
|
172
178
|
|
173
179
|
if show:
|
@@ -187,3 +193,8 @@ class BaseOcrModel(BasePageModel):
|
|
187
193
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
188
194
|
) -> Iterable[Page]:
|
189
195
|
pass
|
196
|
+
|
197
|
+
@classmethod
|
198
|
+
@abstractmethod
|
199
|
+
def get_options_type(cls) -> Type[OcrOptions]:
|
200
|
+
pass
|
docling/models/easyocr_model.py
CHANGED
@@ -2,17 +2,19 @@ import logging
|
|
2
2
|
import warnings
|
3
3
|
import zipfile
|
4
4
|
from pathlib import Path
|
5
|
-
from typing import Iterable, List, Optional
|
5
|
+
from typing import Iterable, List, Optional, Type
|
6
6
|
|
7
7
|
import numpy
|
8
8
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
9
|
+
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
9
10
|
|
10
|
-
from docling.datamodel.base_models import
|
11
|
+
from docling.datamodel.base_models import Page
|
11
12
|
from docling.datamodel.document import ConversionResult
|
12
13
|
from docling.datamodel.pipeline_options import (
|
13
14
|
AcceleratorDevice,
|
14
15
|
AcceleratorOptions,
|
15
16
|
EasyOcrOptions,
|
17
|
+
OcrOptions,
|
16
18
|
)
|
17
19
|
from docling.datamodel.settings import settings
|
18
20
|
from docling.models.base_ocr_model import BaseOcrModel
|
@@ -33,7 +35,12 @@ class EasyOcrModel(BaseOcrModel):
|
|
33
35
|
options: EasyOcrOptions,
|
34
36
|
accelerator_options: AcceleratorOptions,
|
35
37
|
):
|
36
|
-
super().__init__(
|
38
|
+
super().__init__(
|
39
|
+
enabled=enabled,
|
40
|
+
artifacts_path=artifacts_path,
|
41
|
+
options=options,
|
42
|
+
accelerator_options=accelerator_options,
|
43
|
+
)
|
37
44
|
self.options: EasyOcrOptions
|
38
45
|
|
39
46
|
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
@@ -148,18 +155,22 @@ class EasyOcrModel(BaseOcrModel):
|
|
148
155
|
del im
|
149
156
|
|
150
157
|
cells = [
|
151
|
-
|
152
|
-
|
158
|
+
TextCell(
|
159
|
+
index=ix,
|
153
160
|
text=line[1],
|
161
|
+
orig=line[1],
|
162
|
+
from_ocr=True,
|
154
163
|
confidence=line[2],
|
155
|
-
|
156
|
-
|
157
|
-
(
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
164
|
+
rect=BoundingRectangle.from_bounding_box(
|
165
|
+
BoundingBox.from_tuple(
|
166
|
+
coord=(
|
167
|
+
(line[0][0][0] / self.scale) + ocr_rect.l,
|
168
|
+
(line[0][0][1] / self.scale) + ocr_rect.t,
|
169
|
+
(line[0][2][0] / self.scale) + ocr_rect.l,
|
170
|
+
(line[0][2][1] / self.scale) + ocr_rect.t,
|
171
|
+
),
|
172
|
+
origin=CoordOrigin.TOPLEFT,
|
173
|
+
)
|
163
174
|
),
|
164
175
|
)
|
165
176
|
for ix, line in enumerate(result)
|
@@ -175,3 +186,7 @@ class EasyOcrModel(BaseOcrModel):
|
|
175
186
|
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
|
176
187
|
|
177
188
|
yield page
|
189
|
+
|
190
|
+
@classmethod
|
191
|
+
def get_options_type(cls) -> Type[OcrOptions]:
|
192
|
+
return EasyOcrOptions
|
@@ -0,0 +1,27 @@
|
|
1
|
+
import logging
|
2
|
+
from functools import lru_cache
|
3
|
+
|
4
|
+
from docling.models.factories.ocr_factory import OcrFactory
|
5
|
+
from docling.models.factories.picture_description_factory import (
|
6
|
+
PictureDescriptionFactory,
|
7
|
+
)
|
8
|
+
|
9
|
+
logger = logging.getLogger(__name__)
|
10
|
+
|
11
|
+
|
12
|
+
@lru_cache()
|
13
|
+
def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
|
14
|
+
factory = OcrFactory()
|
15
|
+
factory.load_from_plugins(allow_external_plugins=allow_external_plugins)
|
16
|
+
logger.info("Registered ocr engines: %r", factory.registered_kind)
|
17
|
+
return factory
|
18
|
+
|
19
|
+
|
20
|
+
@lru_cache()
|
21
|
+
def get_picture_description_factory(
|
22
|
+
allow_external_plugins: bool = False,
|
23
|
+
) -> PictureDescriptionFactory:
|
24
|
+
factory = PictureDescriptionFactory()
|
25
|
+
factory.load_from_plugins(allow_external_plugins=allow_external_plugins)
|
26
|
+
logger.info("Registered picture descriptions: %r", factory.registered_kind)
|
27
|
+
return factory
|
@@ -0,0 +1,122 @@
|
|
1
|
+
import enum
|
2
|
+
import logging
|
3
|
+
from abc import ABCMeta
|
4
|
+
from typing import Generic, Optional, Type, TypeVar
|
5
|
+
|
6
|
+
from pluggy import PluginManager
|
7
|
+
from pydantic import BaseModel
|
8
|
+
|
9
|
+
from docling.datamodel.pipeline_options import BaseOptions
|
10
|
+
from docling.models.base_model import BaseModelWithOptions
|
11
|
+
|
12
|
+
A = TypeVar("A", bound=BaseModelWithOptions)
|
13
|
+
|
14
|
+
|
15
|
+
logger = logging.getLogger(__name__)
|
16
|
+
|
17
|
+
|
18
|
+
class FactoryMeta(BaseModel):
|
19
|
+
kind: str
|
20
|
+
plugin_name: str
|
21
|
+
module: str
|
22
|
+
|
23
|
+
|
24
|
+
class BaseFactory(Generic[A], metaclass=ABCMeta):
|
25
|
+
default_plugin_name = "docling"
|
26
|
+
|
27
|
+
def __init__(self, plugin_attr_name: str, plugin_name=default_plugin_name):
|
28
|
+
self.plugin_name = plugin_name
|
29
|
+
self.plugin_attr_name = plugin_attr_name
|
30
|
+
|
31
|
+
self._classes: dict[Type[BaseOptions], Type[A]] = {}
|
32
|
+
self._meta: dict[Type[BaseOptions], FactoryMeta] = {}
|
33
|
+
|
34
|
+
@property
|
35
|
+
def registered_kind(self) -> list[str]:
|
36
|
+
return list(opt.kind for opt in self._classes.keys())
|
37
|
+
|
38
|
+
def get_enum(self) -> enum.Enum:
|
39
|
+
return enum.Enum(
|
40
|
+
self.plugin_attr_name + "_enum",
|
41
|
+
names={kind: kind for kind in self.registered_kind},
|
42
|
+
type=str,
|
43
|
+
module=__name__,
|
44
|
+
)
|
45
|
+
|
46
|
+
@property
|
47
|
+
def classes(self):
|
48
|
+
return self._classes
|
49
|
+
|
50
|
+
@property
|
51
|
+
def registered_meta(self):
|
52
|
+
return self._meta
|
53
|
+
|
54
|
+
def create_instance(self, options: BaseOptions, **kwargs) -> A:
|
55
|
+
try:
|
56
|
+
_cls = self._classes[type(options)]
|
57
|
+
return _cls(options=options, **kwargs)
|
58
|
+
except KeyError:
|
59
|
+
raise RuntimeError(self._err_msg_on_class_not_found(options.kind))
|
60
|
+
|
61
|
+
def create_options(self, kind: str, *args, **kwargs) -> BaseOptions:
|
62
|
+
for opt_cls, _ in self._classes.items():
|
63
|
+
if opt_cls.kind == kind:
|
64
|
+
return opt_cls(*args, **kwargs)
|
65
|
+
raise RuntimeError(self._err_msg_on_class_not_found(kind))
|
66
|
+
|
67
|
+
def _err_msg_on_class_not_found(self, kind: str):
|
68
|
+
msg = []
|
69
|
+
|
70
|
+
for opt, cls in self._classes.items():
|
71
|
+
msg.append(f"\t{opt.kind!r} => {cls!r}")
|
72
|
+
|
73
|
+
msg_str = "\n".join(msg)
|
74
|
+
|
75
|
+
return f"No class found with the name {kind!r}, known classes are:\n{msg_str}"
|
76
|
+
|
77
|
+
def register(self, cls: Type[A], plugin_name: str, plugin_module_name: str):
|
78
|
+
opt_type = cls.get_options_type()
|
79
|
+
|
80
|
+
if opt_type in self._classes:
|
81
|
+
raise ValueError(
|
82
|
+
f"{opt_type.kind!r} already registered to class {self._classes[opt_type]!r}"
|
83
|
+
)
|
84
|
+
|
85
|
+
self._classes[opt_type] = cls
|
86
|
+
self._meta[opt_type] = FactoryMeta(
|
87
|
+
kind=opt_type.kind, plugin_name=plugin_name, module=plugin_module_name
|
88
|
+
)
|
89
|
+
|
90
|
+
def load_from_plugins(
|
91
|
+
self, plugin_name: Optional[str] = None, allow_external_plugins: bool = False
|
92
|
+
):
|
93
|
+
plugin_name = plugin_name or self.plugin_name
|
94
|
+
|
95
|
+
plugin_manager = PluginManager(plugin_name)
|
96
|
+
plugin_manager.load_setuptools_entrypoints(plugin_name)
|
97
|
+
|
98
|
+
for plugin_name, plugin_module in plugin_manager.list_name_plugin():
|
99
|
+
plugin_module_name = str(plugin_module.__name__) # type: ignore
|
100
|
+
|
101
|
+
if not allow_external_plugins and not plugin_module_name.startswith(
|
102
|
+
"docling."
|
103
|
+
):
|
104
|
+
logger.warning(
|
105
|
+
f"The plugin {plugin_name} will not be loaded because Docling is being executed with allow_external_plugins=false."
|
106
|
+
)
|
107
|
+
continue
|
108
|
+
|
109
|
+
attr = getattr(plugin_module, self.plugin_attr_name, None)
|
110
|
+
|
111
|
+
if callable(attr):
|
112
|
+
logger.info("Loading plugin %r", plugin_name)
|
113
|
+
|
114
|
+
config = attr()
|
115
|
+
self.process_plugin(config, plugin_name, plugin_module_name)
|
116
|
+
|
117
|
+
def process_plugin(self, config, plugin_name: str, plugin_module_name: str):
|
118
|
+
for item in config[self.plugin_attr_name]:
|
119
|
+
try:
|
120
|
+
self.register(item, plugin_name, plugin_module_name)
|
121
|
+
except ValueError:
|
122
|
+
logger.warning("%r already registered", item)
|
@@ -0,0 +1,11 @@
|
|
1
|
+
import logging
|
2
|
+
|
3
|
+
from docling.models.base_ocr_model import BaseOcrModel
|
4
|
+
from docling.models.factories.base_factory import BaseFactory
|
5
|
+
|
6
|
+
logger = logging.getLogger(__name__)
|
7
|
+
|
8
|
+
|
9
|
+
class OcrFactory(BaseFactory[BaseOcrModel]):
|
10
|
+
def __init__(self, *args, **kwargs):
|
11
|
+
super().__init__("ocr_engines", *args, **kwargs)
|
@@ -0,0 +1,11 @@
|
|
1
|
+
import logging
|
2
|
+
|
3
|
+
from docling.models.factories.base_factory import BaseFactory
|
4
|
+
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
|
5
|
+
|
6
|
+
logger = logging.getLogger(__name__)
|
7
|
+
|
8
|
+
|
9
|
+
class PictureDescriptionFactory(BaseFactory[PictureDescriptionBaseModel]):
|
10
|
+
def __init__(self, *args, **kwargs):
|
11
|
+
super().__init__("picture_description", *args, **kwargs)
|