docling 2.26.0__py3-none-any.whl → 2.28.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/asciidoc_backend.py +1 -1
- docling/backend/csv_backend.py +1 -1
- docling/backend/docling_parse_backend.py +21 -13
- docling/backend/docling_parse_v2_backend.py +20 -12
- docling/backend/docling_parse_v4_backend.py +192 -0
- docling/backend/docx/__init__.py +0 -0
- docling/backend/docx/latex/__init__.py +0 -0
- docling/backend/docx/latex/latex_dict.py +271 -0
- docling/backend/docx/latex/omml.py +453 -0
- docling/backend/html_backend.py +7 -7
- docling/backend/md_backend.py +1 -1
- docling/backend/msexcel_backend.py +2 -45
- docling/backend/mspowerpoint_backend.py +19 -1
- docling/backend/msword_backend.py +68 -3
- docling/backend/pdf_backend.py +7 -2
- docling/backend/pypdfium2_backend.py +52 -30
- docling/backend/xml/uspto_backend.py +1 -1
- docling/cli/main.py +135 -53
- docling/cli/models.py +1 -1
- docling/datamodel/base_models.py +8 -10
- docling/datamodel/pipeline_options.py +54 -32
- docling/document_converter.py +5 -5
- docling/models/base_model.py +9 -1
- docling/models/base_ocr_model.py +27 -16
- docling/models/easyocr_model.py +28 -13
- docling/models/factories/__init__.py +27 -0
- docling/models/factories/base_factory.py +122 -0
- docling/models/factories/ocr_factory.py +11 -0
- docling/models/factories/picture_description_factory.py +11 -0
- docling/models/hf_mlx_model.py +137 -0
- docling/models/ocr_mac_model.py +39 -11
- docling/models/page_preprocessing_model.py +4 -0
- docling/models/picture_description_api_model.py +20 -3
- docling/models/picture_description_base_model.py +19 -3
- docling/models/picture_description_vlm_model.py +14 -2
- docling/models/plugins/__init__.py +0 -0
- docling/models/plugins/defaults.py +28 -0
- docling/models/rapid_ocr_model.py +34 -13
- docling/models/table_structure_model.py +13 -4
- docling/models/tesseract_ocr_cli_model.py +40 -15
- docling/models/tesseract_ocr_model.py +37 -12
- docling/pipeline/standard_pdf_pipeline.py +25 -78
- docling/pipeline/vlm_pipeline.py +78 -398
- docling/utils/export.py +8 -6
- docling/utils/layout_postprocessor.py +26 -23
- docling/utils/visualization.py +1 -1
- {docling-2.26.0.dist-info → docling-2.28.0.dist-info}/METADATA +47 -23
- docling-2.28.0.dist-info/RECORD +84 -0
- {docling-2.26.0.dist-info → docling-2.28.0.dist-info}/entry_points.txt +3 -0
- docling-2.26.0.dist-info/RECORD +0 -72
- {docling-2.26.0.dist-info → docling-2.28.0.dist-info}/LICENSE +0 -0
- {docling-2.26.0.dist-info → docling-2.28.0.dist-info}/WHEEL +0 -0
@@ -26,6 +26,7 @@ from PIL import Image, UnidentifiedImageError
|
|
26
26
|
from typing_extensions import override
|
27
27
|
|
28
28
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
29
|
+
from docling.backend.docx.latex.omml import oMath2Latex
|
29
30
|
from docling.datamodel.base_models import InputFormat
|
30
31
|
from docling.datamodel.document import InputDocument
|
31
32
|
|
@@ -260,6 +261,27 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
260
261
|
else:
|
261
262
|
return label, None
|
262
263
|
|
264
|
+
def handle_equations_in_text(self, element, text):
|
265
|
+
only_texts = []
|
266
|
+
only_equations = []
|
267
|
+
texts_and_equations = []
|
268
|
+
for subt in element.iter():
|
269
|
+
tag_name = etree.QName(subt).localname
|
270
|
+
if tag_name == "t" and "math" not in subt.tag:
|
271
|
+
only_texts.append(subt.text)
|
272
|
+
texts_and_equations.append(subt.text)
|
273
|
+
elif "oMath" in subt.tag and "oMathPara" not in subt.tag:
|
274
|
+
latex_equation = str(oMath2Latex(subt))
|
275
|
+
only_equations.append(latex_equation)
|
276
|
+
texts_and_equations.append(latex_equation)
|
277
|
+
|
278
|
+
if "".join(only_texts).strip() != text.strip():
|
279
|
+
# If we are not able to reconstruct the initial raw text
|
280
|
+
# do not try to parse equations and return the original
|
281
|
+
return text, []
|
282
|
+
|
283
|
+
return "".join(texts_and_equations), only_equations
|
284
|
+
|
263
285
|
def handle_text_elements(
|
264
286
|
self,
|
265
287
|
element: BaseOxmlElement,
|
@@ -268,9 +290,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
268
290
|
) -> None:
|
269
291
|
paragraph = Paragraph(element, docx_obj)
|
270
292
|
|
271
|
-
|
293
|
+
raw_text = paragraph.text
|
294
|
+
text, equations = self.handle_equations_in_text(element=element, text=raw_text)
|
295
|
+
|
296
|
+
if text is None:
|
272
297
|
return
|
273
|
-
text =
|
298
|
+
text = text.strip()
|
274
299
|
|
275
300
|
# Common styles for bullet and numbered lists.
|
276
301
|
# "List Bullet", "List Number", "List Paragraph"
|
@@ -323,6 +348,46 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
323
348
|
elif "Heading" in p_style_id:
|
324
349
|
self.add_header(doc, p_level, text)
|
325
350
|
|
351
|
+
elif len(equations) > 0:
|
352
|
+
if (raw_text is None or len(raw_text) == 0) and len(text) > 0:
|
353
|
+
# Standalone equation
|
354
|
+
level = self.get_level()
|
355
|
+
doc.add_text(
|
356
|
+
label=DocItemLabel.FORMULA,
|
357
|
+
parent=self.parents[level - 1],
|
358
|
+
text=text,
|
359
|
+
)
|
360
|
+
else:
|
361
|
+
# Inline equation
|
362
|
+
level = self.get_level()
|
363
|
+
inline_equation = doc.add_group(
|
364
|
+
label=GroupLabel.INLINE, parent=self.parents[level - 1]
|
365
|
+
)
|
366
|
+
text_tmp = text
|
367
|
+
for eq in equations:
|
368
|
+
if len(text_tmp) == 0:
|
369
|
+
break
|
370
|
+
|
371
|
+
pre_eq_text = text_tmp.split(eq, maxsplit=1)[0]
|
372
|
+
text_tmp = text_tmp.split(eq, maxsplit=1)[1]
|
373
|
+
if len(pre_eq_text) > 0:
|
374
|
+
doc.add_text(
|
375
|
+
label=DocItemLabel.PARAGRAPH,
|
376
|
+
parent=inline_equation,
|
377
|
+
text=pre_eq_text,
|
378
|
+
)
|
379
|
+
doc.add_text(
|
380
|
+
label=DocItemLabel.FORMULA,
|
381
|
+
parent=inline_equation,
|
382
|
+
text=eq,
|
383
|
+
)
|
384
|
+
if len(text_tmp) > 0:
|
385
|
+
doc.add_text(
|
386
|
+
label=DocItemLabel.PARAGRAPH,
|
387
|
+
parent=inline_equation,
|
388
|
+
text=text_tmp,
|
389
|
+
)
|
390
|
+
|
326
391
|
elif p_style_id in [
|
327
392
|
"Paragraph",
|
328
393
|
"Normal",
|
@@ -539,7 +604,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
539
604
|
end_row_offset_idx=row.grid_cols_before + spanned_idx,
|
540
605
|
start_col_offset_idx=col_idx,
|
541
606
|
end_col_offset_idx=col_idx + cell.grid_span,
|
542
|
-
|
607
|
+
column_header=row.grid_cols_before + row_idx == 0,
|
543
608
|
row_header=False,
|
544
609
|
)
|
545
610
|
data.table_cells.append(table_cell)
|
docling/backend/pdf_backend.py
CHANGED
@@ -4,10 +4,11 @@ from pathlib import Path
|
|
4
4
|
from typing import Iterable, Optional, Set, Union
|
5
5
|
|
6
6
|
from docling_core.types.doc import BoundingBox, Size
|
7
|
+
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
7
8
|
from PIL import Image
|
8
9
|
|
9
10
|
from docling.backend.abstract_backend import PaginatedDocumentBackend
|
10
|
-
from docling.datamodel.base_models import
|
11
|
+
from docling.datamodel.base_models import InputFormat
|
11
12
|
from docling.datamodel.document import InputDocument
|
12
13
|
|
13
14
|
|
@@ -17,7 +18,11 @@ class PdfPageBackend(ABC):
|
|
17
18
|
pass
|
18
19
|
|
19
20
|
@abstractmethod
|
20
|
-
def
|
21
|
+
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
|
22
|
+
pass
|
23
|
+
|
24
|
+
@abstractmethod
|
25
|
+
def get_text_cells(self) -> Iterable[TextCell]:
|
21
26
|
pass
|
22
27
|
|
23
28
|
@abstractmethod
|
@@ -7,12 +7,12 @@ from typing import TYPE_CHECKING, Iterable, List, Optional, Union
|
|
7
7
|
import pypdfium2 as pdfium
|
8
8
|
import pypdfium2.raw as pdfium_c
|
9
9
|
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
10
|
+
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
|
10
11
|
from PIL import Image, ImageDraw
|
11
12
|
from pypdfium2 import PdfTextPage
|
12
13
|
from pypdfium2._helpers.misc import PdfiumError
|
13
14
|
|
14
15
|
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
15
|
-
from docling.datamodel.base_models import Cell
|
16
16
|
from docling.utils.locks import pypdfium2_lock
|
17
17
|
|
18
18
|
if TYPE_CHECKING:
|
@@ -68,7 +68,10 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
68
68
|
|
69
69
|
return text_piece
|
70
70
|
|
71
|
-
def
|
71
|
+
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
|
72
|
+
return None
|
73
|
+
|
74
|
+
def get_text_cells(self) -> Iterable[TextCell]:
|
72
75
|
with pypdfium2_lock:
|
73
76
|
if not self.text_page:
|
74
77
|
self.text_page = self._ppage.get_textpage()
|
@@ -84,11 +87,19 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
84
87
|
text_piece = self.text_page.get_text_bounded(*rect)
|
85
88
|
x0, y0, x1, y1 = rect
|
86
89
|
cells.append(
|
87
|
-
|
88
|
-
|
90
|
+
TextCell(
|
91
|
+
index=cell_counter,
|
89
92
|
text=text_piece,
|
90
|
-
|
91
|
-
|
93
|
+
orig=text_piece,
|
94
|
+
from_ocr=False,
|
95
|
+
rect=BoundingRectangle.from_bounding_box(
|
96
|
+
BoundingBox(
|
97
|
+
l=x0,
|
98
|
+
b=y0,
|
99
|
+
r=x1,
|
100
|
+
t=y1,
|
101
|
+
coord_origin=CoordOrigin.BOTTOMLEFT,
|
102
|
+
)
|
92
103
|
).to_top_left_origin(page_size.height),
|
93
104
|
)
|
94
105
|
)
|
@@ -97,51 +108,56 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
97
108
|
# PyPdfium2 produces very fragmented cells, with sub-word level boundaries, in many PDFs.
|
98
109
|
# The cell merging code below is to clean this up.
|
99
110
|
def merge_horizontal_cells(
|
100
|
-
cells: List[
|
111
|
+
cells: List[TextCell],
|
101
112
|
horizontal_threshold_factor: float = 1.0,
|
102
113
|
vertical_threshold_factor: float = 0.5,
|
103
|
-
) -> List[
|
114
|
+
) -> List[TextCell]:
|
104
115
|
if not cells:
|
105
116
|
return []
|
106
117
|
|
107
|
-
def group_rows(cells: List[
|
118
|
+
def group_rows(cells: List[TextCell]) -> List[List[TextCell]]:
|
108
119
|
rows = []
|
109
120
|
current_row = [cells[0]]
|
110
|
-
row_top = cells[0].
|
111
|
-
row_bottom = cells[0].
|
112
|
-
row_height = cells[0].
|
121
|
+
row_top = cells[0].rect.to_bounding_box().t
|
122
|
+
row_bottom = cells[0].rect.to_bounding_box().b
|
123
|
+
row_height = cells[0].rect.to_bounding_box().height
|
113
124
|
|
114
125
|
for cell in cells[1:]:
|
115
126
|
vertical_threshold = row_height * vertical_threshold_factor
|
116
127
|
if (
|
117
|
-
abs(cell.
|
118
|
-
|
128
|
+
abs(cell.rect.to_bounding_box().t - row_top)
|
129
|
+
<= vertical_threshold
|
130
|
+
and abs(cell.rect.to_bounding_box().b - row_bottom)
|
131
|
+
<= vertical_threshold
|
119
132
|
):
|
120
133
|
current_row.append(cell)
|
121
|
-
row_top = min(row_top, cell.
|
122
|
-
row_bottom = max(row_bottom, cell.
|
134
|
+
row_top = min(row_top, cell.rect.to_bounding_box().t)
|
135
|
+
row_bottom = max(row_bottom, cell.rect.to_bounding_box().b)
|
123
136
|
row_height = row_bottom - row_top
|
124
137
|
else:
|
125
138
|
rows.append(current_row)
|
126
139
|
current_row = [cell]
|
127
|
-
row_top = cell.
|
128
|
-
row_bottom = cell.
|
129
|
-
row_height = cell.
|
140
|
+
row_top = cell.rect.to_bounding_box().t
|
141
|
+
row_bottom = cell.rect.to_bounding_box().b
|
142
|
+
row_height = cell.rect.to_bounding_box().height
|
130
143
|
|
131
144
|
if current_row:
|
132
145
|
rows.append(current_row)
|
133
146
|
|
134
147
|
return rows
|
135
148
|
|
136
|
-
def merge_row(row: List[
|
149
|
+
def merge_row(row: List[TextCell]) -> List[TextCell]:
|
137
150
|
merged = []
|
138
151
|
current_group = [row[0]]
|
139
152
|
|
140
153
|
for cell in row[1:]:
|
141
154
|
prev_cell = current_group[-1]
|
142
|
-
avg_height = (
|
155
|
+
avg_height = (
|
156
|
+
prev_cell.rect.height + cell.rect.to_bounding_box().height
|
157
|
+
) / 2
|
143
158
|
if (
|
144
|
-
cell.
|
159
|
+
cell.rect.to_bounding_box().l
|
160
|
+
- prev_cell.rect.to_bounding_box().r
|
145
161
|
<= avg_height * horizontal_threshold_factor
|
146
162
|
):
|
147
163
|
current_group.append(cell)
|
@@ -154,24 +170,30 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
154
170
|
|
155
171
|
return merged
|
156
172
|
|
157
|
-
def merge_group(group: List[
|
173
|
+
def merge_group(group: List[TextCell]) -> TextCell:
|
158
174
|
if len(group) == 1:
|
159
175
|
return group[0]
|
160
176
|
|
161
177
|
merged_text = "".join(cell.text for cell in group)
|
162
178
|
merged_bbox = BoundingBox(
|
163
|
-
l=min(cell.
|
164
|
-
t=min(cell.
|
165
|
-
r=max(cell.
|
166
|
-
b=max(cell.
|
179
|
+
l=min(cell.rect.to_bounding_box().l for cell in group),
|
180
|
+
t=min(cell.rect.to_bounding_box().t for cell in group),
|
181
|
+
r=max(cell.rect.to_bounding_box().r for cell in group),
|
182
|
+
b=max(cell.rect.to_bounding_box().b for cell in group),
|
183
|
+
)
|
184
|
+
return TextCell(
|
185
|
+
index=group[0].index,
|
186
|
+
text=merged_text,
|
187
|
+
orig=merged_text,
|
188
|
+
rect=BoundingRectangle.from_bounding_box(merged_bbox),
|
189
|
+
from_ocr=False,
|
167
190
|
)
|
168
|
-
return Cell(id=group[0].id, text=merged_text, bbox=merged_bbox)
|
169
191
|
|
170
192
|
rows = group_rows(cells)
|
171
193
|
merged_cells = [cell for row in rows for cell in merge_row(row)]
|
172
194
|
|
173
195
|
for i, cell in enumerate(merged_cells, 1):
|
174
|
-
cell.
|
196
|
+
cell.index = i
|
175
197
|
|
176
198
|
return merged_cells
|
177
199
|
|
@@ -181,7 +203,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
|
181
203
|
) # make new image to avoid drawing on the saved ones
|
182
204
|
draw = ImageDraw.Draw(image)
|
183
205
|
for c in cells:
|
184
|
-
x0, y0, x1, y1 = c.
|
206
|
+
x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
|
185
207
|
cell_color = (
|
186
208
|
random.randint(30, 140),
|
187
209
|
random.randint(30, 140),
|
@@ -999,7 +999,7 @@ class PatentUsptoGrantAps(PatentUspto):
|
|
999
999
|
parent=self.parents[self.level],
|
1000
1000
|
)
|
1001
1001
|
|
1002
|
-
last_claim.text += f" {value}" if last_claim.text else value
|
1002
|
+
last_claim.text += f" {value.strip()}" if last_claim.text else value.strip()
|
1003
1003
|
|
1004
1004
|
elif field == self.Field.CAPTION.value and section in (
|
1005
1005
|
self.Section.SUMMARY.value,
|
docling/cli/main.py
CHANGED
@@ -9,6 +9,7 @@ import warnings
|
|
9
9
|
from pathlib import Path
|
10
10
|
from typing import Annotated, Dict, Iterable, List, Optional, Type
|
11
11
|
|
12
|
+
import rich.table
|
12
13
|
import typer
|
13
14
|
from docling_core.types.doc import ImageRefMode
|
14
15
|
from docling_core.utils.file import resolve_source_to_path
|
@@ -16,6 +17,7 @@ from pydantic import TypeAdapter
|
|
16
17
|
|
17
18
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
18
19
|
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
20
|
+
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
19
21
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
20
22
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
21
23
|
from docling.datamodel.base_models import (
|
@@ -29,18 +31,22 @@ from docling.datamodel.pipeline_options import (
|
|
29
31
|
AcceleratorDevice,
|
30
32
|
AcceleratorOptions,
|
31
33
|
EasyOcrOptions,
|
32
|
-
OcrEngine,
|
33
|
-
OcrMacOptions,
|
34
34
|
OcrOptions,
|
35
|
+
PaginatedPipelineOptions,
|
35
36
|
PdfBackend,
|
37
|
+
PdfPipeline,
|
36
38
|
PdfPipelineOptions,
|
37
|
-
RapidOcrOptions,
|
38
39
|
TableFormerMode,
|
39
|
-
|
40
|
-
|
40
|
+
VlmModelType,
|
41
|
+
VlmPipelineOptions,
|
42
|
+
granite_vision_vlm_conversion_options,
|
43
|
+
smoldocling_vlm_conversion_options,
|
44
|
+
smoldocling_vlm_mlx_conversion_options,
|
41
45
|
)
|
42
46
|
from docling.datamodel.settings import settings
|
43
47
|
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
48
|
+
from docling.models.factories import get_ocr_factory
|
49
|
+
from docling.pipeline.vlm_pipeline import VlmPipeline
|
44
50
|
|
45
51
|
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
46
52
|
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
@@ -48,8 +54,11 @@ warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr
|
|
48
54
|
_log = logging.getLogger(__name__)
|
49
55
|
from rich.console import Console
|
50
56
|
|
57
|
+
console = Console()
|
51
58
|
err_console = Console(stderr=True)
|
52
59
|
|
60
|
+
ocr_factory_internal = get_ocr_factory(allow_external_plugins=False)
|
61
|
+
ocr_engines_enum_internal = ocr_factory_internal.get_enum()
|
53
62
|
|
54
63
|
app = typer.Typer(
|
55
64
|
name="Docling",
|
@@ -77,6 +86,24 @@ def version_callback(value: bool):
|
|
77
86
|
raise typer.Exit()
|
78
87
|
|
79
88
|
|
89
|
+
def show_external_plugins_callback(value: bool):
|
90
|
+
if value:
|
91
|
+
ocr_factory_all = get_ocr_factory(allow_external_plugins=True)
|
92
|
+
table = rich.table.Table(title="Available OCR engines")
|
93
|
+
table.add_column("Name", justify="right")
|
94
|
+
table.add_column("Plugin")
|
95
|
+
table.add_column("Package")
|
96
|
+
for meta in ocr_factory_all.registered_meta.values():
|
97
|
+
if not meta.module.startswith("docling."):
|
98
|
+
table.add_row(
|
99
|
+
f"[bold]{meta.kind}[/bold]",
|
100
|
+
meta.plugin_name,
|
101
|
+
meta.module.split(".")[0],
|
102
|
+
)
|
103
|
+
rich.print(table)
|
104
|
+
raise typer.Exit()
|
105
|
+
|
106
|
+
|
80
107
|
def export_documents(
|
81
108
|
conv_results: Iterable[ConversionResult],
|
82
109
|
output_dir: Path,
|
@@ -181,6 +208,14 @@ def convert(
|
|
181
208
|
help="Image export mode for the document (only in case of JSON, Markdown or HTML). With `placeholder`, only the position of the image is marked in the output. In `embedded` mode, the image is embedded as base64 encoded string. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.",
|
182
209
|
),
|
183
210
|
] = ImageRefMode.EMBEDDED,
|
211
|
+
pipeline: Annotated[
|
212
|
+
PdfPipeline,
|
213
|
+
typer.Option(..., help="Choose the pipeline to process PDF or image files."),
|
214
|
+
] = PdfPipeline.STANDARD,
|
215
|
+
vlm_model: Annotated[
|
216
|
+
VlmModelType,
|
217
|
+
typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
|
218
|
+
] = VlmModelType.SMOLDOCLING,
|
184
219
|
ocr: Annotated[
|
185
220
|
bool,
|
186
221
|
typer.Option(
|
@@ -195,8 +230,16 @@ def convert(
|
|
195
230
|
),
|
196
231
|
] = False,
|
197
232
|
ocr_engine: Annotated[
|
198
|
-
|
199
|
-
|
233
|
+
str,
|
234
|
+
typer.Option(
|
235
|
+
...,
|
236
|
+
help=(
|
237
|
+
f"The OCR engine to use. When --allow-external-plugins is *not* set, the available values are: "
|
238
|
+
f"{', '.join((o.value for o in ocr_engines_enum_internal))}. "
|
239
|
+
f"Use the option --show-external-plugins to see the options allowed with external plugins."
|
240
|
+
),
|
241
|
+
),
|
242
|
+
] = EasyOcrOptions.kind,
|
200
243
|
ocr_lang: Annotated[
|
201
244
|
Optional[str],
|
202
245
|
typer.Option(
|
@@ -240,6 +283,21 @@ def convert(
|
|
240
283
|
..., help="Must be enabled when using models connecting to remote services."
|
241
284
|
),
|
242
285
|
] = False,
|
286
|
+
allow_external_plugins: Annotated[
|
287
|
+
bool,
|
288
|
+
typer.Option(
|
289
|
+
..., help="Must be enabled for loading modules from third-party plugins."
|
290
|
+
),
|
291
|
+
] = False,
|
292
|
+
show_external_plugins: Annotated[
|
293
|
+
bool,
|
294
|
+
typer.Option(
|
295
|
+
...,
|
296
|
+
help="List the third-party plugins which are available when the option --allow-external-plugins is set.",
|
297
|
+
callback=show_external_plugins_callback,
|
298
|
+
is_eager=True,
|
299
|
+
),
|
300
|
+
] = False,
|
243
301
|
abort_on_error: Annotated[
|
244
302
|
bool,
|
245
303
|
typer.Option(
|
@@ -367,64 +425,88 @@ def convert(
|
|
367
425
|
export_txt = OutputFormat.TEXT in to_formats
|
368
426
|
export_doctags = OutputFormat.DOCTAGS in to_formats
|
369
427
|
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
|
376
|
-
elif ocr_engine == OcrEngine.OCRMAC:
|
377
|
-
ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
|
378
|
-
elif ocr_engine == OcrEngine.RAPIDOCR:
|
379
|
-
ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
|
380
|
-
else:
|
381
|
-
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
|
428
|
+
ocr_factory = get_ocr_factory(allow_external_plugins=allow_external_plugins)
|
429
|
+
ocr_options: OcrOptions = ocr_factory.create_options( # type: ignore
|
430
|
+
kind=ocr_engine,
|
431
|
+
force_full_page_ocr=force_ocr,
|
432
|
+
)
|
382
433
|
|
383
434
|
ocr_lang_list = _split_list(ocr_lang)
|
384
435
|
if ocr_lang_list is not None:
|
385
436
|
ocr_options.lang = ocr_lang_list
|
386
437
|
|
387
438
|
accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
|
388
|
-
pipeline_options
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
439
|
+
pipeline_options: PaginatedPipelineOptions
|
440
|
+
|
441
|
+
if pipeline == PdfPipeline.STANDARD:
|
442
|
+
pipeline_options = PdfPipelineOptions(
|
443
|
+
allow_external_plugins=allow_external_plugins,
|
444
|
+
enable_remote_services=enable_remote_services,
|
445
|
+
accelerator_options=accelerator_options,
|
446
|
+
do_ocr=ocr,
|
447
|
+
ocr_options=ocr_options,
|
448
|
+
do_table_structure=True,
|
449
|
+
do_code_enrichment=enrich_code,
|
450
|
+
do_formula_enrichment=enrich_formula,
|
451
|
+
do_picture_description=enrich_picture_description,
|
452
|
+
do_picture_classification=enrich_picture_classes,
|
453
|
+
document_timeout=document_timeout,
|
454
|
+
)
|
455
|
+
pipeline_options.table_structure_options.do_cell_matching = (
|
456
|
+
True # do_cell_matching
|
457
|
+
)
|
458
|
+
pipeline_options.table_structure_options.mode = table_mode
|
404
459
|
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
|
460
|
+
if image_export_mode != ImageRefMode.PLACEHOLDER:
|
461
|
+
pipeline_options.generate_page_images = True
|
462
|
+
pipeline_options.generate_picture_images = (
|
463
|
+
True # FIXME: to be deprecated in verson 3
|
464
|
+
)
|
465
|
+
pipeline_options.images_scale = 2
|
466
|
+
|
467
|
+
backend: Type[PdfDocumentBackend]
|
468
|
+
if pdf_backend == PdfBackend.DLPARSE_V1:
|
469
|
+
backend = DoclingParseDocumentBackend
|
470
|
+
elif pdf_backend == PdfBackend.DLPARSE_V2:
|
471
|
+
backend = DoclingParseV2DocumentBackend
|
472
|
+
elif pdf_backend == PdfBackend.DLPARSE_V4:
|
473
|
+
backend = DoclingParseV4DocumentBackend # type: ignore
|
474
|
+
elif pdf_backend == PdfBackend.PYPDFIUM2:
|
475
|
+
backend = PyPdfiumDocumentBackend # type: ignore
|
476
|
+
else:
|
477
|
+
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
|
478
|
+
|
479
|
+
pdf_format_option = PdfFormatOption(
|
480
|
+
pipeline_options=pipeline_options,
|
481
|
+
backend=backend, # pdf_backend
|
482
|
+
)
|
483
|
+
elif pipeline == PdfPipeline.VLM:
|
484
|
+
pipeline_options = VlmPipelineOptions()
|
485
|
+
|
486
|
+
if vlm_model == VlmModelType.GRANITE_VISION:
|
487
|
+
pipeline_options.vlm_options = granite_vision_vlm_conversion_options
|
488
|
+
elif vlm_model == VlmModelType.SMOLDOCLING:
|
489
|
+
pipeline_options.vlm_options = smoldocling_vlm_conversion_options
|
490
|
+
if sys.platform == "darwin":
|
491
|
+
try:
|
492
|
+
import mlx_vlm
|
493
|
+
|
494
|
+
pipeline_options.vlm_options = (
|
495
|
+
smoldocling_vlm_mlx_conversion_options
|
496
|
+
)
|
497
|
+
except ImportError:
|
498
|
+
_log.warning(
|
499
|
+
"To run SmolDocling faster, please install mlx-vlm:\n"
|
500
|
+
"pip install mlx-vlm"
|
501
|
+
)
|
502
|
+
|
503
|
+
pdf_format_option = PdfFormatOption(
|
504
|
+
pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
|
409
505
|
)
|
410
|
-
pipeline_options.images_scale = 2
|
411
506
|
|
412
507
|
if artifacts_path is not None:
|
413
508
|
pipeline_options.artifacts_path = artifacts_path
|
414
509
|
|
415
|
-
if pdf_backend == PdfBackend.DLPARSE_V1:
|
416
|
-
backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
|
417
|
-
elif pdf_backend == PdfBackend.DLPARSE_V2:
|
418
|
-
backend = DoclingParseV2DocumentBackend
|
419
|
-
elif pdf_backend == PdfBackend.PYPDFIUM2:
|
420
|
-
backend = PyPdfiumDocumentBackend
|
421
|
-
else:
|
422
|
-
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
|
423
|
-
|
424
|
-
pdf_format_option = PdfFormatOption(
|
425
|
-
pipeline_options=pipeline_options,
|
426
|
-
backend=backend, # pdf_backend
|
427
|
-
)
|
428
510
|
format_options: Dict[InputFormat, FormatOption] = {
|
429
511
|
InputFormat.PDF: pdf_format_option,
|
430
512
|
InputFormat.IMAGE: pdf_format_option,
|
docling/cli/models.py
CHANGED
@@ -121,7 +121,7 @@ def download(
|
|
121
121
|
"Using the CLI:",
|
122
122
|
f"`docling --artifacts-path={output_dir} FILE`",
|
123
123
|
"\n",
|
124
|
-
"Using Python: see the documentation at <https://
|
124
|
+
"Using Python: see the documentation at <https://docling-project.github.io/docling/usage>.",
|
125
125
|
)
|
126
126
|
|
127
127
|
|
docling/datamodel/base_models.py
CHANGED
@@ -9,6 +9,7 @@ from docling_core.types.doc import (
|
|
9
9
|
Size,
|
10
10
|
TableCell,
|
11
11
|
)
|
12
|
+
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
12
13
|
from docling_core.types.io import ( # DO NOT REMOVE; explicitly exposed from this location
|
13
14
|
DocumentStream,
|
14
15
|
)
|
@@ -123,14 +124,10 @@ class ErrorItem(BaseModel):
|
|
123
124
|
error_message: str
|
124
125
|
|
125
126
|
|
126
|
-
class Cell(BaseModel):
|
127
|
-
id: int
|
128
|
-
text: str
|
129
|
-
bbox: BoundingBox
|
130
|
-
|
131
|
-
|
132
|
-
class OcrCell(Cell):
|
133
|
-
confidence: float
|
127
|
+
# class Cell(BaseModel):
|
128
|
+
# id: int
|
129
|
+
# text: str
|
130
|
+
# bbox: BoundingBox
|
134
131
|
|
135
132
|
|
136
133
|
class Cluster(BaseModel):
|
@@ -138,7 +135,7 @@ class Cluster(BaseModel):
|
|
138
135
|
label: DocItemLabel
|
139
136
|
bbox: BoundingBox
|
140
137
|
confidence: float = 1.0
|
141
|
-
cells: List[
|
138
|
+
cells: List[TextCell] = []
|
142
139
|
children: List["Cluster"] = [] # Add child cluster support
|
143
140
|
|
144
141
|
|
@@ -226,7 +223,8 @@ class Page(BaseModel):
|
|
226
223
|
page_no: int
|
227
224
|
# page_hash: Optional[str] = None
|
228
225
|
size: Optional[Size] = None
|
229
|
-
cells: List[
|
226
|
+
cells: List[TextCell] = []
|
227
|
+
parsed_page: Optional[SegmentedPdfPage] = None
|
230
228
|
predictions: PagePredictions = PagePredictions()
|
231
229
|
assembled: Optional[AssembledUnit] = None
|
232
230
|
|