docling 2.26.0__py3-none-any.whl → 2.27.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/asciidoc_backend.py +1 -1
- docling/backend/csv_backend.py +1 -1
- docling/backend/docling_parse_backend.py +21 -13
- docling/backend/docling_parse_v2_backend.py +20 -12
- docling/backend/docling_parse_v4_backend.py +185 -0
- docling/backend/docx/__init__.py +0 -0
- docling/backend/docx/latex/__init__.py +0 -0
- docling/backend/docx/latex/latex_dict.py +271 -0
- docling/backend/docx/latex/omml.py +453 -0
- docling/backend/html_backend.py +7 -7
- docling/backend/md_backend.py +1 -1
- docling/backend/msexcel_backend.py +2 -45
- docling/backend/mspowerpoint_backend.py +1 -1
- docling/backend/msword_backend.py +65 -3
- docling/backend/pdf_backend.py +7 -2
- docling/backend/pypdfium2_backend.py +52 -30
- docling/backend/xml/uspto_backend.py +1 -1
- docling/cli/main.py +60 -21
- docling/cli/models.py +1 -1
- docling/datamodel/base_models.py +8 -10
- docling/datamodel/pipeline_options.py +26 -30
- docling/document_converter.py +5 -5
- docling/models/base_model.py +9 -1
- docling/models/base_ocr_model.py +27 -16
- docling/models/easyocr_model.py +28 -13
- docling/models/factories/__init__.py +27 -0
- docling/models/factories/base_factory.py +122 -0
- docling/models/factories/ocr_factory.py +11 -0
- docling/models/factories/picture_description_factory.py +11 -0
- docling/models/ocr_mac_model.py +39 -11
- docling/models/page_preprocessing_model.py +4 -0
- docling/models/picture_description_api_model.py +20 -3
- docling/models/picture_description_base_model.py +19 -3
- docling/models/picture_description_vlm_model.py +14 -2
- docling/models/plugins/__init__.py +0 -0
- docling/models/plugins/defaults.py +28 -0
- docling/models/rapid_ocr_model.py +34 -13
- docling/models/table_structure_model.py +13 -4
- docling/models/tesseract_ocr_cli_model.py +40 -15
- docling/models/tesseract_ocr_model.py +37 -12
- docling/pipeline/standard_pdf_pipeline.py +25 -78
- docling/utils/export.py +8 -6
- docling/utils/layout_postprocessor.py +26 -23
- docling/utils/visualization.py +1 -1
- {docling-2.26.0.dist-info → docling-2.27.0.dist-info}/METADATA +48 -19
- docling-2.27.0.dist-info/RECORD +83 -0
- {docling-2.26.0.dist-info → docling-2.27.0.dist-info}/entry_points.txt +3 -0
- docling-2.26.0.dist-info/RECORD +0 -72
- {docling-2.26.0.dist-info → docling-2.27.0.dist-info}/LICENSE +0 -0
- {docling-2.26.0.dist-info → docling-2.27.0.dist-info}/WHEEL +0 -0
@@ -7,12 +7,12 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
|
|
7
7
|
import pypdfium2 as pdfium
|
8
8
|
import pypdfium2.raw as pdfium_c
|
9
9
|
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
10
|
+
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
|
10
11
|
from PIL import Image, ImageDraw
|
11
12
|
from pypdfium2 import PdfTextPage
|
12
13
|
from pypdfium2._helpers.misc import PdfiumError
|
13
14
|
|
14
15
|
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
15
|
-
from docling.datamodel.base_models import Cell
|
16
16
|
from docling.utils.locks import pypdfium2_lock
|
17
17
|
|
18
18
|
if TYPE_CHECKING:
|
@@ -68,7 +68,10 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
68
68
|
|
69
69
|
return text_piece
|
70
70
|
|
71
|
-
def
|
71
|
+
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
|
72
|
+
return None
|
73
|
+
|
74
|
+
def get_text_cells(self) -> Iterable[TextCell]:
|
72
75
|
with pypdfium2_lock:
|
73
76
|
if not self.text_page:
|
74
77
|
self.text_page = self._ppage.get_textpage()
|
@@ -84,11 +87,19 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
84
87
|
text_piece = self.text_page.get_text_bounded(*rect)
|
85
88
|
x0, y0, x1, y1 = rect
|
86
89
|
cells.append(
|
87
|
-
|
88
|
-
|
90
|
+
TextCell(
|
91
|
+
index=cell_counter,
|
89
92
|
text=text_piece,
|
90
|
-
|
91
|
-
|
93
|
+
orig=text_piece,
|
94
|
+
from_ocr=False,
|
95
|
+
rect=BoundingRectangle.from_bounding_box(
|
96
|
+
BoundingBox(
|
97
|
+
l=x0,
|
98
|
+
b=y0,
|
99
|
+
r=x1,
|
100
|
+
t=y1,
|
101
|
+
coord_origin=CoordOrigin.BOTTOMLEFT,
|
102
|
+
)
|
92
103
|
).to_top_left_origin(page_size.height),
|
93
104
|
)
|
94
105
|
)
|
@@ -97,51 +108,56 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
97
108
|
# PyPdfium2 produces very fragmented cells, with sub-word level boundaries, in many PDFs.
|
98
109
|
# The cell merging code below is to clean this up.
|
99
110
|
def merge_horizontal_cells(
|
100
|
-
cells: List[
|
111
|
+
cells: List[TextCell],
|
101
112
|
horizontal_threshold_factor: float = 1.0,
|
102
113
|
vertical_threshold_factor: float = 0.5,
|
103
|
-
) -> List[
|
114
|
+
) -> List[TextCell]:
|
104
115
|
if not cells:
|
105
116
|
return []
|
106
117
|
|
107
|
-
def group_rows(cells: List[
|
118
|
+
def group_rows(cells: List[TextCell]) -> List[List[TextCell]]:
|
108
119
|
rows = []
|
109
120
|
current_row = [cells[0]]
|
110
|
-
row_top = cells[0].
|
111
|
-
row_bottom = cells[0].
|
112
|
-
row_height = cells[0].
|
121
|
+
row_top = cells[0].rect.to_bounding_box().t
|
122
|
+
row_bottom = cells[0].rect.to_bounding_box().b
|
123
|
+
row_height = cells[0].rect.to_bounding_box().height
|
113
124
|
|
114
125
|
for cell in cells[1:]:
|
115
126
|
vertical_threshold = row_height * vertical_threshold_factor
|
116
127
|
if (
|
117
|
-
abs(cell.
|
118
|
-
|
128
|
+
abs(cell.rect.to_bounding_box().t - row_top)
|
129
|
+
<= vertical_threshold
|
130
|
+
and abs(cell.rect.to_bounding_box().b - row_bottom)
|
131
|
+
<= vertical_threshold
|
119
132
|
):
|
120
133
|
current_row.append(cell)
|
121
|
-
row_top = min(row_top, cell.
|
122
|
-
row_bottom = max(row_bottom, cell.
|
134
|
+
row_top = min(row_top, cell.rect.to_bounding_box().t)
|
135
|
+
row_bottom = max(row_bottom, cell.rect.to_bounding_box().b)
|
123
136
|
row_height = row_bottom - row_top
|
124
137
|
else:
|
125
138
|
rows.append(current_row)
|
126
139
|
current_row = [cell]
|
127
|
-
row_top = cell.
|
128
|
-
row_bottom = cell.
|
129
|
-
row_height = cell.
|
140
|
+
row_top = cell.rect.to_bounding_box().t
|
141
|
+
row_bottom = cell.rect.to_bounding_box().b
|
142
|
+
row_height = cell.rect.to_bounding_box().height
|
130
143
|
|
131
144
|
if current_row:
|
132
145
|
rows.append(current_row)
|
133
146
|
|
134
147
|
return rows
|
135
148
|
|
136
|
-
def merge_row(row: List[
|
149
|
+
def merge_row(row: List[TextCell]) -> List[TextCell]:
|
137
150
|
merged = []
|
138
151
|
current_group = [row[0]]
|
139
152
|
|
140
153
|
for cell in row[1:]:
|
141
154
|
prev_cell = current_group[-1]
|
142
|
-
avg_height = (
|
155
|
+
avg_height = (
|
156
|
+
prev_cell.rect.height + cell.rect.to_bounding_box().height
|
157
|
+
) / 2
|
143
158
|
if (
|
144
|
-
cell.
|
159
|
+
cell.rect.to_bounding_box().l
|
160
|
+
- prev_cell.rect.to_bounding_box().r
|
145
161
|
<= avg_height * horizontal_threshold_factor
|
146
162
|
):
|
147
163
|
current_group.append(cell)
|
@@ -154,24 +170,30 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
154
170
|
|
155
171
|
return merged
|
156
172
|
|
157
|
-
def merge_group(group: List[
|
173
|
+
def merge_group(group: List[TextCell]) -> TextCell:
|
158
174
|
if len(group) == 1:
|
159
175
|
return group[0]
|
160
176
|
|
161
177
|
merged_text = "".join(cell.text for cell in group)
|
162
178
|
merged_bbox = BoundingBox(
|
163
|
-
l=min(cell.
|
164
|
-
t=min(cell.
|
165
|
-
r=max(cell.
|
166
|
-
b=max(cell.
|
179
|
+
l=min(cell.rect.to_bounding_box().l for cell in group),
|
180
|
+
t=min(cell.rect.to_bounding_box().t for cell in group),
|
181
|
+
r=max(cell.rect.to_bounding_box().r for cell in group),
|
182
|
+
b=max(cell.rect.to_bounding_box().b for cell in group),
|
183
|
+
)
|
184
|
+
return TextCell(
|
185
|
+
index=group[0].index,
|
186
|
+
text=merged_text,
|
187
|
+
orig=merged_text,
|
188
|
+
rect=BoundingRectangle.from_bounding_box(merged_bbox),
|
189
|
+
from_ocr=False,
|
167
190
|
)
|
168
|
-
return Cell(id=group[0].id, text=merged_text, bbox=merged_bbox)
|
169
191
|
|
170
192
|
rows = group_rows(cells)
|
171
193
|
merged_cells = [cell for row in rows for cell in merge_row(row)]
|
172
194
|
|
173
195
|
for i, cell in enumerate(merged_cells, 1):
|
174
|
-
cell.
|
196
|
+
cell.index = i
|
175
197
|
|
176
198
|
return merged_cells
|
177
199
|
|
@@ -181,7 +203,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
181
203
|
) # make new image to avoid drawing on the saved ones
|
182
204
|
draw = ImageDraw.Draw(image)
|
183
205
|
for c in cells:
|
184
|
-
x0, y0, x1, y1 = c.
|
206
|
+
x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
|
185
207
|
cell_color = (
|
186
208
|
random.randint(30, 140),
|
187
209
|
random.randint(30, 140),
|
@@ -999,7 +999,7 @@ class PatentUsptoGrantAps(PatentUspto):
|
|
999
999
|
parent=self.parents[self.level],
|
1000
1000
|
)
|
1001
1001
|
|
1002
|
-
last_claim.text += f" {value}" if last_claim.text else value
|
1002
|
+
last_claim.text += f" {value.strip()}" if last_claim.text else value.strip()
|
1003
1003
|
|
1004
1004
|
elif field == self.Field.CAPTION.value and section in (
|
1005
1005
|
self.Section.SUMMARY.value,
|
docling/cli/main.py
CHANGED
@@ -9,6 +9,7 @@ import warnings
|
|
9
9
|
from pathlib import Path
|
10
10
|
from typing import Annotated, Dict, Iterable, List, Optional, Type
|
11
11
|
|
12
|
+
import rich.table
|
12
13
|
import typer
|
13
14
|
from docling_core.types.doc import ImageRefMode
|
14
15
|
from docling_core.utils.file import resolve_source_to_path
|
@@ -16,6 +17,7 @@ from pydantic import TypeAdapter
|
|
16
17
|
|
17
18
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
18
19
|
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
20
|
+
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
19
21
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
20
22
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
21
23
|
from docling.datamodel.base_models import (
|
@@ -29,18 +31,14 @@ from docling.datamodel.pipeline_options import (
|
|
29
31
|
AcceleratorDevice,
|
30
32
|
AcceleratorOptions,
|
31
33
|
EasyOcrOptions,
|
32
|
-
OcrEngine,
|
33
|
-
OcrMacOptions,
|
34
34
|
OcrOptions,
|
35
35
|
PdfBackend,
|
36
36
|
PdfPipelineOptions,
|
37
|
-
RapidOcrOptions,
|
38
37
|
TableFormerMode,
|
39
|
-
TesseractCliOcrOptions,
|
40
|
-
TesseractOcrOptions,
|
41
38
|
)
|
42
39
|
from docling.datamodel.settings import settings
|
43
40
|
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
41
|
+
from docling.models.factories import get_ocr_factory
|
44
42
|
|
45
43
|
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
46
44
|
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
@@ -48,8 +46,11 @@ warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr
|
|
48
46
|
_log = logging.getLogger(__name__)
|
49
47
|
from rich.console import Console
|
50
48
|
|
49
|
+
console = Console()
|
51
50
|
err_console = Console(stderr=True)
|
52
51
|
|
52
|
+
ocr_factory_internal = get_ocr_factory(allow_external_plugins=False)
|
53
|
+
ocr_engines_enum_internal = ocr_factory_internal.get_enum()
|
53
54
|
|
54
55
|
app = typer.Typer(
|
55
56
|
name="Docling",
|
@@ -77,6 +78,24 @@ def version_callback(value: bool):
|
|
77
78
|
raise typer.Exit()
|
78
79
|
|
79
80
|
|
81
|
+
def show_external_plugins_callback(value: bool):
|
82
|
+
if value:
|
83
|
+
ocr_factory_all = get_ocr_factory(allow_external_plugins=True)
|
84
|
+
table = rich.table.Table(title="Available OCR engines")
|
85
|
+
table.add_column("Name", justify="right")
|
86
|
+
table.add_column("Plugin")
|
87
|
+
table.add_column("Package")
|
88
|
+
for meta in ocr_factory_all.registered_meta.values():
|
89
|
+
if not meta.module.startswith("docling."):
|
90
|
+
table.add_row(
|
91
|
+
f"[bold]{meta.kind}[/bold]",
|
92
|
+
meta.plugin_name,
|
93
|
+
meta.module.split(".")[0],
|
94
|
+
)
|
95
|
+
rich.print(table)
|
96
|
+
raise typer.Exit()
|
97
|
+
|
98
|
+
|
80
99
|
def export_documents(
|
81
100
|
conv_results: Iterable[ConversionResult],
|
82
101
|
output_dir: Path,
|
@@ -195,8 +214,16 @@ def convert(
|
|
195
214
|
),
|
196
215
|
] = False,
|
197
216
|
ocr_engine: Annotated[
|
198
|
-
|
199
|
-
|
217
|
+
str,
|
218
|
+
typer.Option(
|
219
|
+
...,
|
220
|
+
help=(
|
221
|
+
f"The OCR engine to use. When --allow-external-plugins is *not* set, the available values are: "
|
222
|
+
f"{', '.join((o.value for o in ocr_engines_enum_internal))}. "
|
223
|
+
f"Use the option --show-external-plugins to see the options allowed with external plugins."
|
224
|
+
),
|
225
|
+
),
|
226
|
+
] = EasyOcrOptions.kind,
|
200
227
|
ocr_lang: Annotated[
|
201
228
|
Optional[str],
|
202
229
|
typer.Option(
|
@@ -240,6 +267,21 @@ def convert(
|
|
240
267
|
..., help="Must be enabled when using models connecting to remote services."
|
241
268
|
),
|
242
269
|
] = False,
|
270
|
+
allow_external_plugins: Annotated[
|
271
|
+
bool,
|
272
|
+
typer.Option(
|
273
|
+
..., help="Must be enabled for loading modules from third-party plugins."
|
274
|
+
),
|
275
|
+
] = False,
|
276
|
+
show_external_plugins: Annotated[
|
277
|
+
bool,
|
278
|
+
typer.Option(
|
279
|
+
...,
|
280
|
+
help="List the third-party plugins which are available when the option --allow-external-plugins is set.",
|
281
|
+
callback=show_external_plugins_callback,
|
282
|
+
is_eager=True,
|
283
|
+
),
|
284
|
+
] = False,
|
243
285
|
abort_on_error: Annotated[
|
244
286
|
bool,
|
245
287
|
typer.Option(
|
@@ -367,18 +409,11 @@ def convert(
|
|
367
409
|
export_txt = OutputFormat.TEXT in to_formats
|
368
410
|
export_doctags = OutputFormat.DOCTAGS in to_formats
|
369
411
|
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
|
376
|
-
elif ocr_engine == OcrEngine.OCRMAC:
|
377
|
-
ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
|
378
|
-
elif ocr_engine == OcrEngine.RAPIDOCR:
|
379
|
-
ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
|
380
|
-
else:
|
381
|
-
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
|
412
|
+
ocr_factory = get_ocr_factory(allow_external_plugins=allow_external_plugins)
|
413
|
+
ocr_options: OcrOptions = ocr_factory.create_options( # type: ignore
|
414
|
+
kind=ocr_engine,
|
415
|
+
force_full_page_ocr=force_ocr,
|
416
|
+
)
|
382
417
|
|
383
418
|
ocr_lang_list = _split_list(ocr_lang)
|
384
419
|
if ocr_lang_list is not None:
|
@@ -386,6 +421,7 @@ def convert(
|
|
386
421
|
|
387
422
|
accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
|
388
423
|
pipeline_options = PdfPipelineOptions(
|
424
|
+
allow_external_plugins=allow_external_plugins,
|
389
425
|
enable_remote_services=enable_remote_services,
|
390
426
|
accelerator_options=accelerator_options,
|
391
427
|
do_ocr=ocr,
|
@@ -412,12 +448,15 @@ def convert(
|
|
412
448
|
if artifacts_path is not None:
|
413
449
|
pipeline_options.artifacts_path = artifacts_path
|
414
450
|
|
451
|
+
backend: Type[PdfDocumentBackend]
|
415
452
|
if pdf_backend == PdfBackend.DLPARSE_V1:
|
416
|
-
backend
|
453
|
+
backend = DoclingParseDocumentBackend
|
417
454
|
elif pdf_backend == PdfBackend.DLPARSE_V2:
|
418
455
|
backend = DoclingParseV2DocumentBackend
|
456
|
+
elif pdf_backend == PdfBackend.DLPARSE_V4:
|
457
|
+
backend = DoclingParseV4DocumentBackend # type: ignore
|
419
458
|
elif pdf_backend == PdfBackend.PYPDFIUM2:
|
420
|
-
backend = PyPdfiumDocumentBackend
|
459
|
+
backend = PyPdfiumDocumentBackend # type: ignore
|
421
460
|
else:
|
422
461
|
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
|
423
462
|
|
docling/cli/models.py
CHANGED
@@ -121,7 +121,7 @@ def download(
|
|
121
121
|
"Using the CLI:",
|
122
122
|
f"`docling --artifacts-path={output_dir} FILE`",
|
123
123
|
"\n",
|
124
|
-
"Using Python: see the documentation at <https://
|
124
|
+
"Using Python: see the documentation at <https://docling-project.github.io/docling/usage>.",
|
125
125
|
)
|
126
126
|
|
127
127
|
|
docling/datamodel/base_models.py
CHANGED
@@ -9,6 +9,7 @@ from docling_core.types.doc import (
|
|
9
9
|
Size,
|
10
10
|
TableCell,
|
11
11
|
)
|
12
|
+
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
12
13
|
from docling_core.types.io import ( # DO ΝΟΤ REMOVE; explicitly exposed from this location
|
13
14
|
DocumentStream,
|
14
15
|
)
|
@@ -123,14 +124,10 @@ class ErrorItem(BaseModel):
|
|
123
124
|
error_message: str
|
124
125
|
|
125
126
|
|
126
|
-
class Cell(BaseModel):
|
127
|
-
id: int
|
128
|
-
text: str
|
129
|
-
bbox: BoundingBox
|
130
|
-
|
131
|
-
|
132
|
-
class OcrCell(Cell):
|
133
|
-
confidence: float
|
127
|
+
# class Cell(BaseModel):
|
128
|
+
# id: int
|
129
|
+
# text: str
|
130
|
+
# bbox: BoundingBox
|
134
131
|
|
135
132
|
|
136
133
|
class Cluster(BaseModel):
|
@@ -138,7 +135,7 @@ class Cluster(BaseModel):
|
|
138
135
|
label: DocItemLabel
|
139
136
|
bbox: BoundingBox
|
140
137
|
confidence: float = 1.0
|
141
|
-
cells: List[
|
138
|
+
cells: List[TextCell] = []
|
142
139
|
children: List["Cluster"] = [] # Add child cluster support
|
143
140
|
|
144
141
|
|
@@ -226,7 +223,8 @@ class Page(BaseModel):
|
|
226
223
|
page_no: int
|
227
224
|
# page_hash: Optional[str] = None
|
228
225
|
size: Optional[Size] = None
|
229
|
-
cells: List[
|
226
|
+
cells: List[TextCell] = []
|
227
|
+
parsed_page: Optional[SegmentedPdfPage] = None
|
230
228
|
predictions: PagePredictions = PagePredictions()
|
231
229
|
assembled: Optional[AssembledUnit] = None
|
232
230
|
|
@@ -1,10 +1,9 @@
|
|
1
1
|
import logging
|
2
2
|
import os
|
3
3
|
import re
|
4
|
-
import warnings
|
5
4
|
from enum import Enum
|
6
5
|
from pathlib import Path
|
7
|
-
from typing import
|
6
|
+
from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
|
8
7
|
|
9
8
|
from pydantic import (
|
10
9
|
AnyUrl,
|
@@ -13,13 +12,8 @@ from pydantic import (
|
|
13
12
|
Field,
|
14
13
|
field_validator,
|
15
14
|
model_validator,
|
16
|
-
validator,
|
17
|
-
)
|
18
|
-
from pydantic_settings import (
|
19
|
-
BaseSettings,
|
20
|
-
PydanticBaseSettingsSource,
|
21
|
-
SettingsConfigDict,
|
22
15
|
)
|
16
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
23
17
|
from typing_extensions import deprecated
|
24
18
|
|
25
19
|
_log = logging.getLogger(__name__)
|
@@ -83,6 +77,12 @@ class AcceleratorOptions(BaseSettings):
|
|
83
77
|
return data
|
84
78
|
|
85
79
|
|
80
|
+
class BaseOptions(BaseModel):
|
81
|
+
"""Base class for options."""
|
82
|
+
|
83
|
+
kind: ClassVar[str]
|
84
|
+
|
85
|
+
|
86
86
|
class TableFormerMode(str, Enum):
|
87
87
|
"""Modes for the TableFormer model."""
|
88
88
|
|
@@ -102,10 +102,9 @@ class TableStructureOptions(BaseModel):
|
|
102
102
|
mode: TableFormerMode = TableFormerMode.ACCURATE
|
103
103
|
|
104
104
|
|
105
|
-
class OcrOptions(
|
105
|
+
class OcrOptions(BaseOptions):
|
106
106
|
"""OCR options."""
|
107
107
|
|
108
|
-
kind: str
|
109
108
|
lang: List[str]
|
110
109
|
force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
|
111
110
|
bitmap_area_threshold: float = (
|
@@ -116,7 +115,7 @@ class OcrOptions(BaseModel):
|
|
116
115
|
class RapidOcrOptions(OcrOptions):
|
117
116
|
"""Options for the RapidOCR engine."""
|
118
117
|
|
119
|
-
kind: Literal["rapidocr"] = "rapidocr"
|
118
|
+
kind: ClassVar[Literal["rapidocr"]] = "rapidocr"
|
120
119
|
|
121
120
|
# English and chinese are the most commly used models and have been tested with RapidOCR.
|
122
121
|
lang: List[str] = [
|
@@ -155,7 +154,7 @@ class RapidOcrOptions(OcrOptions):
|
|
155
154
|
class EasyOcrOptions(OcrOptions):
|
156
155
|
"""Options for the EasyOCR engine."""
|
157
156
|
|
158
|
-
kind: Literal["easyocr"] = "easyocr"
|
157
|
+
kind: ClassVar[Literal["easyocr"]] = "easyocr"
|
159
158
|
lang: List[str] = ["fr", "de", "es", "en"]
|
160
159
|
|
161
160
|
use_gpu: Optional[bool] = None
|
@@ -175,7 +174,7 @@ class EasyOcrOptions(OcrOptions):
|
|
175
174
|
class TesseractCliOcrOptions(OcrOptions):
|
176
175
|
"""Options for the TesseractCli engine."""
|
177
176
|
|
178
|
-
kind: Literal["tesseract"] = "tesseract"
|
177
|
+
kind: ClassVar[Literal["tesseract"]] = "tesseract"
|
179
178
|
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
180
179
|
tesseract_cmd: str = "tesseract"
|
181
180
|
path: Optional[str] = None
|
@@ -188,7 +187,7 @@ class TesseractCliOcrOptions(OcrOptions):
|
|
188
187
|
class TesseractOcrOptions(OcrOptions):
|
189
188
|
"""Options for the Tesseract engine."""
|
190
189
|
|
191
|
-
kind: Literal["tesserocr"] = "tesserocr"
|
190
|
+
kind: ClassVar[Literal["tesserocr"]] = "tesserocr"
|
192
191
|
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
193
192
|
path: Optional[str] = None
|
194
193
|
|
@@ -200,7 +199,7 @@ class TesseractOcrOptions(OcrOptions):
|
|
200
199
|
class OcrMacOptions(OcrOptions):
|
201
200
|
"""Options for the Mac OCR engine."""
|
202
201
|
|
203
|
-
kind: Literal["ocrmac"] = "ocrmac"
|
202
|
+
kind: ClassVar[Literal["ocrmac"]] = "ocrmac"
|
204
203
|
lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
|
205
204
|
recognition: str = "accurate"
|
206
205
|
framework: str = "vision"
|
@@ -210,8 +209,7 @@ class OcrMacOptions(OcrOptions):
|
|
210
209
|
)
|
211
210
|
|
212
211
|
|
213
|
-
class PictureDescriptionBaseOptions(
|
214
|
-
kind: str
|
212
|
+
class PictureDescriptionBaseOptions(BaseOptions):
|
215
213
|
batch_size: int = 8
|
216
214
|
scale: float = 2
|
217
215
|
|
@@ -221,7 +219,7 @@ class PictureDescriptionBaseOptions(BaseModel):
|
|
221
219
|
|
222
220
|
|
223
221
|
class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
|
224
|
-
kind: Literal["api"] = "api"
|
222
|
+
kind: ClassVar[Literal["api"]] = "api"
|
225
223
|
|
226
224
|
url: AnyUrl = AnyUrl("http://localhost:8000/v1/chat/completions")
|
227
225
|
headers: Dict[str, str] = {}
|
@@ -233,7 +231,7 @@ class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
|
|
233
231
|
|
234
232
|
|
235
233
|
class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
|
236
|
-
kind: Literal["vlm"] = "vlm"
|
234
|
+
kind: ClassVar[Literal["vlm"]] = "vlm"
|
237
235
|
|
238
236
|
repo_id: str
|
239
237
|
prompt: str = "Describe this image in a few sentences."
|
@@ -301,9 +299,11 @@ class PdfBackend(str, Enum):
|
|
301
299
|
PYPDFIUM2 = "pypdfium2"
|
302
300
|
DLPARSE_V1 = "dlparse_v1"
|
303
301
|
DLPARSE_V2 = "dlparse_v2"
|
302
|
+
DLPARSE_V4 = "dlparse_v4"
|
304
303
|
|
305
304
|
|
306
305
|
# Define an enum for the ocr engines
|
306
|
+
@deprecated("Use ocr_factory.registered_enum")
|
307
307
|
class OcrEngine(str, Enum):
|
308
308
|
"""Enum of valid OCR engines."""
|
309
309
|
|
@@ -323,6 +323,7 @@ class PipelineOptions(BaseModel):
|
|
323
323
|
document_timeout: Optional[float] = None
|
324
324
|
accelerator_options: AcceleratorOptions = AcceleratorOptions()
|
325
325
|
enable_remote_services: bool = False
|
326
|
+
allow_external_plugins: bool = False
|
326
327
|
|
327
328
|
|
328
329
|
class PaginatedPipelineOptions(PipelineOptions):
|
@@ -358,17 +359,10 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
|
|
358
359
|
# If True, text from backend will be used instead of generated text
|
359
360
|
|
360
361
|
table_structure_options: TableStructureOptions = TableStructureOptions()
|
361
|
-
ocr_options:
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
OcrMacOptions,
|
366
|
-
RapidOcrOptions,
|
367
|
-
] = Field(EasyOcrOptions(), discriminator="kind")
|
368
|
-
picture_description_options: Annotated[
|
369
|
-
Union[PictureDescriptionApiOptions, PictureDescriptionVlmOptions],
|
370
|
-
Field(discriminator="kind"),
|
371
|
-
] = smolvlm_picture_description
|
362
|
+
ocr_options: OcrOptions = EasyOcrOptions()
|
363
|
+
picture_description_options: PictureDescriptionBaseOptions = (
|
364
|
+
smolvlm_picture_description
|
365
|
+
)
|
372
366
|
|
373
367
|
images_scale: float = 1.0
|
374
368
|
generate_page_images: bool = False
|
@@ -381,3 +375,5 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
|
|
381
375
|
"before conversion and then use the `TableItem.get_image` function."
|
382
376
|
),
|
383
377
|
)
|
378
|
+
|
379
|
+
generate_parsed_pages: bool = False
|
docling/document_converter.py
CHANGED
@@ -11,7 +11,7 @@ from pydantic import BaseModel, ConfigDict, model_validator, validate_call
|
|
11
11
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
12
12
|
from docling.backend.asciidoc_backend import AsciiDocBackend
|
13
13
|
from docling.backend.csv_backend import CsvDocumentBackend
|
14
|
-
from docling.backend.
|
14
|
+
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
15
15
|
from docling.backend.html_backend import HTMLDocumentBackend
|
16
16
|
from docling.backend.json.docling_json_backend import DoclingJSONBackend
|
17
17
|
from docling.backend.md_backend import MarkdownDocumentBackend
|
@@ -109,12 +109,12 @@ class XMLJatsFormatOption(FormatOption):
|
|
109
109
|
|
110
110
|
class ImageFormatOption(FormatOption):
|
111
111
|
pipeline_cls: Type = StandardPdfPipeline
|
112
|
-
backend: Type[AbstractDocumentBackend] =
|
112
|
+
backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
|
113
113
|
|
114
114
|
|
115
115
|
class PdfFormatOption(FormatOption):
|
116
116
|
pipeline_cls: Type = StandardPdfPipeline
|
117
|
-
backend: Type[AbstractDocumentBackend] =
|
117
|
+
backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
|
118
118
|
|
119
119
|
|
120
120
|
def _get_default_option(format: InputFormat) -> FormatOption:
|
@@ -147,10 +147,10 @@ def _get_default_option(format: InputFormat) -> FormatOption:
|
|
147
147
|
pipeline_cls=SimplePipeline, backend=JatsDocumentBackend
|
148
148
|
),
|
149
149
|
InputFormat.IMAGE: FormatOption(
|
150
|
-
pipeline_cls=StandardPdfPipeline, backend=
|
150
|
+
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend
|
151
151
|
),
|
152
152
|
InputFormat.PDF: FormatOption(
|
153
|
-
pipeline_cls=StandardPdfPipeline, backend=
|
153
|
+
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend
|
154
154
|
),
|
155
155
|
InputFormat.JSON_DOCLING: FormatOption(
|
156
156
|
pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
|
docling/models/base_model.py
CHANGED
@@ -1,14 +1,22 @@
|
|
1
1
|
from abc import ABC, abstractmethod
|
2
|
-
from typing import Any, Generic, Iterable, Optional
|
2
|
+
from typing import Any, Generic, Iterable, Optional, Protocol, Type
|
3
3
|
|
4
4
|
from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
|
5
5
|
from typing_extensions import TypeVar
|
6
6
|
|
7
7
|
from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
|
8
8
|
from docling.datamodel.document import ConversionResult
|
9
|
+
from docling.datamodel.pipeline_options import BaseOptions
|
9
10
|
from docling.datamodel.settings import settings
|
10
11
|
|
11
12
|
|
13
|
+
class BaseModelWithOptions(Protocol):
|
14
|
+
@classmethod
|
15
|
+
def get_options_type(cls) -> Type[BaseOptions]: ...
|
16
|
+
|
17
|
+
def __init__(self, *, options: BaseOptions, **kwargs): ...
|
18
|
+
|
19
|
+
|
12
20
|
class BasePageModel(ABC):
|
13
21
|
@abstractmethod
|
14
22
|
def __call__(
|