docling 2.27.0__py3-none-any.whl → 2.28.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/docling_parse_v4_backend.py +20 -13
- docling/backend/mspowerpoint_backend.py +18 -0
- docling/backend/msword_backend.py +56 -14
- docling/cli/main.py +81 -38
- docling/datamodel/pipeline_options.py +28 -2
- docling/document_converter.py +29 -17
- docling/models/hf_mlx_model.py +137 -0
- docling/models/page_preprocessing_model.py +7 -1
- docling/pipeline/vlm_pipeline.py +78 -398
- {docling-2.27.0.dist-info → docling-2.28.1.dist-info}/METADATA +27 -32
- {docling-2.27.0.dist-info → docling-2.28.1.dist-info}/RECORD +14 -13
- {docling-2.27.0.dist-info → docling-2.28.1.dist-info}/LICENSE +0 -0
- {docling-2.27.0.dist-info → docling-2.28.1.dist-info}/WHEEL +0 -0
- {docling-2.27.0.dist-info → docling-2.28.1.dist-info}/entry_points.txt +0 -0
docling/backend/docling_parse_v4_backend.py
CHANGED
@@ -112,23 +112,30 @@ class DoclingParseV4PageBackend(PdfPageBackend):
         padbox.r = page_size.width - padbox.r
         padbox.t = page_size.height - padbox.t

-
-
-
-
-
-
-
-
-
+        with pypdfium2_lock:
+            image = (
+                self._ppage.render(
+                    scale=scale * 1.5,
+                    rotation=0,  # no additional rotation
+                    crop=padbox.as_tuple(),
+                )
+                .to_pil()
+                .resize(
+                    size=(round(cropbox.width * scale), round(cropbox.height * scale))
+                )
+            )  # We resize the image from 1.5x the given scale to make it sharper.

         return image

     def get_size(self) -> Size:
-
-        width=self.
-
-
+        with pypdfium2_lock:
+            return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
+
+        # TODO: Take width and height from docling-parse.
+        # return Size(
+        #     width=self._dpage.dimension.width,
+        #     height=self._dpage.dimension.height,
+        # )

     def unload(self):
         self._ppage = None
docling/backend/mspowerpoint_backend.py
CHANGED
@@ -16,6 +16,7 @@ from docling_core.types.doc import (
     TableCell,
     TableData,
 )
+from docling_core.types.doc.document import ContentLayer
 from PIL import Image, UnidentifiedImageError
 from pptx import Presentation
 from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
@@ -421,4 +422,21 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
         for shape in slide.shapes:
             handle_shapes(shape, parent_slide, slide_ind, doc, slide_size)

+        # Handle notes slide
+        if slide.has_notes_slide:
+            notes_slide = slide.notes_slide
+            notes_text = notes_slide.notes_text_frame.text.strip()
+            if notes_text:
+                bbox = BoundingBox(l=0, t=0, r=0, b=0)
+                prov = ProvenanceItem(
+                    page_no=slide_ind + 1, charspan=[0, len(notes_text)], bbox=bbox
+                )
+                doc.add_text(
+                    label=DocItemLabel.TEXT,
+                    parent=parent_slide,
+                    text=notes_text,
+                    prov=prov,
+                    content_layer=ContentLayer.FURNITURE,
+                )
+
         return doc
docling/backend/msword_backend.py
CHANGED
@@ -53,6 +53,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         self.max_levels: int = 10
         self.level_at_new_list: Optional[int] = None
         self.parents: dict[int, Optional[NodeItem]] = {}
+        self.numbered_headers: dict[int, int] = {}
         for i in range(-1, self.max_levels):
             self.parents[i] = None

@@ -275,8 +276,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             only_equations.append(latex_equation)
             texts_and_equations.append(latex_equation)

-        if "".join(only_texts) != text:
-
+        if "".join(only_texts).strip() != text.strip():
+            # If we are not able to reconstruct the initial raw text
+            # do not try to parse equations and return the original
+            return text, []

         return "".join(texts_and_equations), only_equations

@@ -344,7 +347,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 parent=None, label=DocItemLabel.TITLE, text=text
             )
         elif "Heading" in p_style_id:
-
+            style_element = getattr(paragraph.style, "element", None)
+            if style_element:
+                is_numbered_style = (
+                    "<w:numPr>" in style_element.xml or "<w:numPr>" in element.xml
+                )
+            else:
+                is_numbered_style = False
+            self.add_header(doc, p_level, text, is_numbered_style)

         elif len(equations) > 0:
             if (raw_text is None or len(raw_text) == 0) and len(text) > 0:

@@ -365,6 +375,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             for eq in equations:
                 if len(text_tmp) == 0:
                     break
+
                 pre_eq_text = text_tmp.split(eq, maxsplit=1)[0]
                 text_tmp = text_tmp.split(eq, maxsplit=1)[1]
                 if len(pre_eq_text) > 0:

@@ -412,7 +423,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             return

     def add_header(
-        self,
+        self,
+        doc: DoclingDocument,
+        curr_level: Optional[int],
+        text: str,
+        is_numbered_style: bool = False,
     ) -> None:
         level = self.get_level()
         if isinstance(curr_level, int):

@@ -430,17 +445,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 if key >= curr_level:
                     self.parents[key] = None

-
-
-
-                level=curr_level,
-            )
+            current_level = curr_level
+            parent_level = curr_level - 1
+            add_level = curr_level
         else:
-
-
-
-
-
+            current_level = self.level
+            parent_level = self.level - 1
+            add_level = 1
+
+        if is_numbered_style:
+            if add_level in self.numbered_headers:
+                self.numbered_headers[add_level] += 1
+            else:
+                self.numbered_headers[add_level] = 1
+            text = f"{self.numbered_headers[add_level]} {text}"
+
+            # Reset deeper levels
+            next_level = add_level + 1
+            while next_level in self.numbered_headers:
+                self.numbered_headers[next_level] = 0
+                next_level += 1
+
+            # Scan upper levels
+            previous_level = add_level - 1
+            while previous_level in self.numbered_headers:
+                # MSWord convention: no empty sublevels
+                # I.e., sub-sub section (2.0.1) without a sub-section (2.1)
+                # is processed as 2.1.1
+                if self.numbered_headers[previous_level] == 0:
+                    self.numbered_headers[previous_level] += 1

+                text = f"{self.numbered_headers[previous_level]}.{text}"
+                previous_level -= 1
+
+        self.parents[current_level] = doc.add_heading(
+            parent=self.parents[parent_level],
+            text=text,
+            level=add_level,
+        )
         return

     def add_listitem(
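To illustrate what the new `numbered_headers` bookkeeping produces, here is a simplified, standalone sketch of the numbering scheme; `number_heading` is a hypothetical helper written for illustration, not part of the backend's API.

```python
numbered_headers: dict[int, int] = {}


def number_heading(text: str, level: int) -> str:
    # Bump the counter for this level and reset all deeper levels.
    numbered_headers[level] = numbered_headers.get(level, 0) + 1
    deeper = level + 1
    while deeper in numbered_headers:
        numbered_headers[deeper] = 0
        deeper += 1

    # Prepend counters of upper levels; an upper level still at zero is bumped
    # to 1 first (MSWord convention: no empty sublevels).
    prefix = str(numbered_headers[level])
    upper = level - 1
    while upper in numbered_headers:
        if numbered_headers[upper] == 0:
            numbered_headers[upper] += 1
        prefix = f"{numbered_headers[upper]}.{prefix}"
        upper -= 1
    return f"{prefix} {text}"


print(number_heading("Introduction", 1))  # 1 Introduction
print(number_heading("Background", 2))    # 1.1 Background
print(number_heading("Methods", 1))       # 2 Methods
print(number_heading("Data", 2))          # 2.1 Data
```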
docling/cli/main.py
CHANGED
@@ -32,13 +32,21 @@ from docling.datamodel.pipeline_options import (
     AcceleratorOptions,
     EasyOcrOptions,
     OcrOptions,
+    PaginatedPipelineOptions,
     PdfBackend,
+    PdfPipeline,
     PdfPipelineOptions,
     TableFormerMode,
+    VlmModelType,
+    VlmPipelineOptions,
+    granite_vision_vlm_conversion_options,
+    smoldocling_vlm_conversion_options,
+    smoldocling_vlm_mlx_conversion_options,
 )
 from docling.datamodel.settings import settings
 from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
 from docling.models.factories import get_ocr_factory
+from docling.pipeline.vlm_pipeline import VlmPipeline

 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
 warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")

@@ -200,6 +208,14 @@ def convert(
             help="Image export mode for the document (only in case of JSON, Markdown or HTML). With `placeholder`, only the position of the image is marked in the output. In `embedded` mode, the image is embedded as base64 encoded string. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.",
         ),
     ] = ImageRefMode.EMBEDDED,
+    pipeline: Annotated[
+        PdfPipeline,
+        typer.Option(..., help="Choose the pipeline to process PDF or image files."),
+    ] = PdfPipeline.STANDARD,
+    vlm_model: Annotated[
+        VlmModelType,
+        typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
+    ] = VlmModelType.SMOLDOCLING,
     ocr: Annotated[
         bool,
         typer.Option(

@@ -420,50 +436,77 @@ def convert(
     ocr_options.lang = ocr_lang_list

     accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
-    pipeline_options
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    pipeline_options: PaginatedPipelineOptions
+
+    if pipeline == PdfPipeline.STANDARD:
+        pipeline_options = PdfPipelineOptions(
+            allow_external_plugins=allow_external_plugins,
+            enable_remote_services=enable_remote_services,
+            accelerator_options=accelerator_options,
+            do_ocr=ocr,
+            ocr_options=ocr_options,
+            do_table_structure=True,
+            do_code_enrichment=enrich_code,
+            do_formula_enrichment=enrich_formula,
+            do_picture_description=enrich_picture_description,
+            do_picture_classification=enrich_picture_classes,
+            document_timeout=document_timeout,
+        )
+        pipeline_options.table_structure_options.do_cell_matching = (
+            True  # do_cell_matching
+        )
+        pipeline_options.table_structure_options.mode = table_mode
+
+        if image_export_mode != ImageRefMode.PLACEHOLDER:
+            pipeline_options.generate_page_images = True
+            pipeline_options.generate_picture_images = (
+                True  # FIXME: to be deprecated in verson 3
+            )
+            pipeline_options.images_scale = 2
+
+        backend: Type[PdfDocumentBackend]
+        if pdf_backend == PdfBackend.DLPARSE_V1:
+            backend = DoclingParseDocumentBackend
+        elif pdf_backend == PdfBackend.DLPARSE_V2:
+            backend = DoclingParseV2DocumentBackend
+        elif pdf_backend == PdfBackend.DLPARSE_V4:
+            backend = DoclingParseV4DocumentBackend  # type: ignore
+        elif pdf_backend == PdfBackend.PYPDFIUM2:
+            backend = PyPdfiumDocumentBackend  # type: ignore
+        else:
+            raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
+
+        pdf_format_option = PdfFormatOption(
+            pipeline_options=pipeline_options,
+            backend=backend,  # pdf_backend
+        )
+    elif pipeline == PdfPipeline.VLM:
+        pipeline_options = VlmPipelineOptions()
+
+        if vlm_model == VlmModelType.GRANITE_VISION:
+            pipeline_options.vlm_options = granite_vision_vlm_conversion_options
+        elif vlm_model == VlmModelType.SMOLDOCLING:
+            pipeline_options.vlm_options = smoldocling_vlm_conversion_options
+            if sys.platform == "darwin":
+                try:
+                    import mlx_vlm
+
+                    pipeline_options.vlm_options = (
+                        smoldocling_vlm_mlx_conversion_options
+                    )
+                except ImportError:
+                    _log.warning(
+                        "To run SmolDocling faster, please install mlx-vlm:\n"
+                        "pip install mlx-vlm"
+                    )

-
-
-        pipeline_options.generate_picture_images = (
-            True  # FIXME: to be deprecated in verson 3
+        pdf_format_option = PdfFormatOption(
+            pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
         )
-    pipeline_options.images_scale = 2

     if artifacts_path is not None:
         pipeline_options.artifacts_path = artifacts_path

-    backend: Type[PdfDocumentBackend]
-    if pdf_backend == PdfBackend.DLPARSE_V1:
-        backend = DoclingParseDocumentBackend
-    elif pdf_backend == PdfBackend.DLPARSE_V2:
-        backend = DoclingParseV2DocumentBackend
-    elif pdf_backend == PdfBackend.DLPARSE_V4:
-        backend = DoclingParseV4DocumentBackend  # type: ignore
-    elif pdf_backend == PdfBackend.PYPDFIUM2:
-        backend = PyPdfiumDocumentBackend  # type: ignore
-    else:
-        raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
-
-    pdf_format_option = PdfFormatOption(
-        pipeline_options=pipeline_options,
-        backend=backend,  # pdf_backend
-    )
     format_options: Dict[InputFormat, FormatOption] = {
         InputFormat.PDF: pdf_format_option,
         InputFormat.IMAGE: pdf_format_option,
docling/datamodel/pipeline_options.py
CHANGED
@@ -263,6 +263,11 @@ class ResponseFormat(str, Enum):
     MARKDOWN = "markdown"


+class InferenceFramework(str, Enum):
+    MLX = "mlx"
+    TRANSFORMERS = "transformers"
+
+
 class HuggingFaceVlmOptions(BaseVlmOptions):
     kind: Literal["hf_model_options"] = "hf_model_options"

@@ -271,6 +276,7 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
     llm_int8_threshold: float = 6.0
     quantized: bool = False

+    inference_framework: InferenceFramework
     response_format: ResponseFormat

     @property

@@ -278,10 +284,19 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
         return self.repo_id.replace("/", "--")


+smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
+    repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
+    prompt="Convert this page to docling.",
+    response_format=ResponseFormat.DOCTAGS,
+    inference_framework=InferenceFramework.MLX,
+)
+
+
 smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
     repo_id="ds4sd/SmolDocling-256M-preview",
     prompt="Convert this page to docling.",
     response_format=ResponseFormat.DOCTAGS,
+    inference_framework=InferenceFramework.TRANSFORMERS,
 )

 granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(

@@ -289,9 +304,15 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
     # prompt="OCR the full page to markdown.",
     prompt="OCR this image.",
     response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.TRANSFORMERS,
 )


+class VlmModelType(str, Enum):
+    SMOLDOCLING = "smoldocling"
+    GRANITE_VISION = "granite_vision"
+
+
 # Define an enum for the backend options
 class PdfBackend(str, Enum):
     """Enum of valid PDF backends."""

@@ -327,13 +348,14 @@ class PipelineOptions(BaseModel):


 class PaginatedPipelineOptions(PipelineOptions):
+    artifacts_path: Optional[Union[Path, str]] = None
+
     images_scale: float = 1.0
     generate_page_images: bool = False
     generate_picture_images: bool = False


 class VlmPipelineOptions(PaginatedPipelineOptions):
-    artifacts_path: Optional[Union[Path, str]] = None

     generate_page_images: bool = True
     force_backend_text: bool = (

@@ -346,7 +368,6 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
 class PdfPipelineOptions(PaginatedPipelineOptions):
     """Options for the PDF pipeline."""

-    artifacts_path: Optional[Union[Path, str]] = None
     do_table_structure: bool = True  # True: perform table structure extraction
     do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
     do_code_enrichment: bool = False  # True: perform code OCR

@@ -377,3 +398,8 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
     )

     generate_parsed_pages: bool = False
+
+
+class PdfPipeline(str, Enum):
+    STANDARD = "standard"
+    VLM = "vlm"
docling/document_converter.py
CHANGED
@@ -1,3 +1,4 @@
+import hashlib
 import logging
 import math
 import sys

@@ -181,7 +182,14 @@ class DocumentConverter:
             )
             for format in self.allowed_formats
         }
-        self.initialized_pipelines: Dict[
+        self.initialized_pipelines: Dict[
+            Tuple[Type[BasePipeline], str], BasePipeline
+        ] = {}
+
+    def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str:
+        """Generate a hash of pipeline options to use as part of the cache key."""
+        options_str = str(pipeline_options.model_dump())
+        return hashlib.md5(options_str.encode("utf-8")).hexdigest()

     def initialize_pipeline(self, format: InputFormat):
         """Initialize the conversion pipeline for the selected format."""

@@ -279,31 +287,36 @@ class DocumentConverter:
             yield item

     def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
+        """Retrieve or initialize a pipeline, reusing instances based on class and options."""
         fopt = self.format_to_options.get(doc_format)

-        if fopt is None:
+        if fopt is None or fopt.pipeline_options is None:
             return None
-        else:
-            pipeline_class = fopt.pipeline_cls
-            pipeline_options = fopt.pipeline_options

-
-
-
-
-
-
-
-
-
+        pipeline_class = fopt.pipeline_cls
+        pipeline_options = fopt.pipeline_options
+        options_hash = self._get_pipeline_options_hash(pipeline_options)
+
+        # Use a composite key to cache pipelines
+        cache_key = (pipeline_class, options_hash)
+
+        if cache_key not in self.initialized_pipelines:
+            _log.info(
+                f"Initializing pipeline for {pipeline_class.__name__} with options hash {options_hash}"
+            )
+            self.initialized_pipelines[cache_key] = pipeline_class(
                 pipeline_options=pipeline_options
             )
-
+        else:
+            _log.debug(
+                f"Reusing cached pipeline for {pipeline_class.__name__} with options hash {options_hash}"
+            )
+
+        return self.initialized_pipelines[cache_key]

     def _process_document(
         self, in_doc: InputDocument, raises_on_error: bool
     ) -> ConversionResult:
-
         valid = (
             self.allowed_formats is not None and in_doc.format in self.allowed_formats
         )

@@ -345,7 +358,6 @@ class DocumentConverter:
         else:
             if raises_on_error:
                 raise ConversionError(f"Input document {in_doc.file} is not valid.")
-
         else:
             # invalid doc or not of desired format
             conv_res = ConversionResult(
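A minimal, self-contained sketch of the caching idea introduced here: pipelines are now keyed by the pipeline class plus an MD5 hash of the serialized options, so converters whose formats share identical options reuse one pipeline instance. The `Opts` model below is a hypothetical stand-in, not docling's `PipelineOptions`.

```python
import hashlib

from pydantic import BaseModel


class Opts(BaseModel):  # hypothetical stand-in for docling's PipelineOptions
    do_ocr: bool = True
    images_scale: float = 1.0


def options_hash(opts: BaseModel) -> str:
    # Same approach as the new _get_pipeline_options_hash helper above.
    return hashlib.md5(str(opts.model_dump()).encode("utf-8")).hexdigest()


# Identical options produce identical cache keys ...
assert options_hash(Opts()) == options_hash(Opts())
# ... while any change to an option yields a new key, hence a new pipeline.
assert options_hash(Opts()) != options_hash(Opts(do_ocr=False))
```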
docling/models/hf_mlx_model.py
ADDED
@@ -0,0 +1,137 @@
+import logging
+import time
+from pathlib import Path
+from typing import Iterable, List, Optional
+
+from docling.datamodel.base_models import Page, VlmPrediction
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import (
+    AcceleratorDevice,
+    AcceleratorOptions,
+    HuggingFaceVlmOptions,
+)
+from docling.datamodel.settings import settings
+from docling.models.base_model import BasePageModel
+from docling.utils.accelerator_utils import decide_device
+from docling.utils.profiling import TimeRecorder
+
+_log = logging.getLogger(__name__)
+
+
+class HuggingFaceMlxModel(BasePageModel):
+
+    def __init__(
+        self,
+        enabled: bool,
+        artifacts_path: Optional[Path],
+        accelerator_options: AcceleratorOptions,
+        vlm_options: HuggingFaceVlmOptions,
+    ):
+        self.enabled = enabled
+
+        self.vlm_options = vlm_options
+
+        if self.enabled:
+
+            try:
+                from mlx_vlm import generate, load  # type: ignore
+                from mlx_vlm.prompt_utils import apply_chat_template  # type: ignore
+                from mlx_vlm.utils import load_config, stream_generate  # type: ignore
+            except ImportError:
+                raise ImportError(
+                    "mlx-vlm is not installed. Please install it via `pip install mlx-vlm` to use MLX VLM models."
+                )
+
+            repo_cache_folder = vlm_options.repo_id.replace("/", "--")
+            self.apply_chat_template = apply_chat_template
+            self.stream_generate = stream_generate
+
+            # PARAMETERS:
+            if artifacts_path is None:
+                artifacts_path = self.download_models(self.vlm_options.repo_id)
+            elif (artifacts_path / repo_cache_folder).exists():
+                artifacts_path = artifacts_path / repo_cache_folder
+
+            self.param_question = vlm_options.prompt  # "Perform Layout Analysis."
+
+            ## Load the model
+            self.vlm_model, self.processor = load(artifacts_path)
+            self.config = load_config(artifacts_path)
+
+    @staticmethod
+    def download_models(
+        repo_id: str,
+        local_dir: Optional[Path] = None,
+        force: bool = False,
+        progress: bool = False,
+    ) -> Path:
+        from huggingface_hub import snapshot_download
+        from huggingface_hub.utils import disable_progress_bars
+
+        if not progress:
+            disable_progress_bars()
+        download_path = snapshot_download(
+            repo_id=repo_id,
+            force_download=force,
+            local_dir=local_dir,
+            # revision="v0.0.1",
+        )
+
+        return Path(download_path)
+
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        for page in page_batch:
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                with TimeRecorder(conv_res, "vlm"):
+                    assert page.size is not None
+
+                    hi_res_image = page.get_image(scale=2.0)  # 144dpi
+                    # hi_res_image = page.get_image(scale=1.0)  # 72dpi
+
+                    if hi_res_image is not None:
+                        im_width, im_height = hi_res_image.size
+
+                    # populate page_tags with predicted doc tags
+                    page_tags = ""
+
+                    if hi_res_image:
+                        if hi_res_image.mode != "RGB":
+                            hi_res_image = hi_res_image.convert("RGB")
+
+                    prompt = self.apply_chat_template(
+                        self.processor, self.config, self.param_question, num_images=1
+                    )
+
+                    start_time = time.time()
+                    # Call model to generate:
+                    output = ""
+                    for token in self.stream_generate(
+                        self.vlm_model,
+                        self.processor,
+                        prompt,
+                        [hi_res_image],
+                        max_tokens=4096,
+                        verbose=False,
+                    ):
+                        output += token.text
+                        if "</doctag>" in token.text:
+                            break
+
+                    generation_time = time.time() - start_time
+                    page_tags = output
+
+                    # inference_time = time.time() - start_time
+                    # tokens_per_second = num_tokens / generation_time
+                    # print("")
+                    # print(f"Page Inference Time: {inference_time:.2f} seconds")
+                    # print(f"Total tokens on page: {num_tokens:.2f}")
+                    # print(f"Tokens/sec: {tokens_per_second:.2f}")
+                    # print("")
+                    page.predictions.vlm_response = VlmPrediction(text=page_tags)
+
+            yield page
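Note that `HuggingFaceMlxModel` imports `mlx_vlm` lazily and raises an `ImportError` with an install hint if it is missing, so this code path only works where `pip install mlx-vlm` succeeds (in practice Apple Silicon); the CLI change above selects the MLX options automatically on `darwin` when the package is importable.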
docling/models/page_preprocessing_model.py
CHANGED
@@ -63,7 +63,13 @@ class PagePreprocessingModel(BasePageModel):
         def draw_text_boxes(image, cells, show: bool = False):
             draw = ImageDraw.Draw(image)
             for c in cells:
-                x0, y0, x1, y1 =
+                x0, y0, x1, y1 = (
+                    c.to_bounding_box().l,
+                    c.to_bounding_box().t,
+                    c.to_bounding_box().r,
+                    c.to_bounding_box().b,
+                )
+
                 draw.rectangle([(x0, y0), (x1, y1)], outline="red")
             if show:
                 image.show()