docling 2.27.0__py3-none-any.whl → 2.28.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/docling_parse_v4_backend.py +20 -13
- docling/backend/mspowerpoint_backend.py +18 -0
- docling/backend/msword_backend.py +5 -2
- docling/cli/main.py +81 -38
- docling/datamodel/pipeline_options.py +28 -2
- docling/models/hf_mlx_model.py +137 -0
- docling/pipeline/vlm_pipeline.py +78 -398
- {docling-2.27.0.dist-info → docling-2.28.0.dist-info}/METADATA +27 -32
- {docling-2.27.0.dist-info → docling-2.28.0.dist-info}/RECORD +12 -11
- {docling-2.27.0.dist-info → docling-2.28.0.dist-info}/LICENSE +0 -0
- {docling-2.27.0.dist-info → docling-2.28.0.dist-info}/WHEEL +0 -0
- {docling-2.27.0.dist-info → docling-2.28.0.dist-info}/entry_points.txt +0 -0
docling/backend/docling_parse_v4_backend.py
CHANGED
@@ -112,23 +112,30 @@ class DoclingParseV4PageBackend(PdfPageBackend):
         padbox.r = page_size.width - padbox.r
         padbox.t = page_size.height - padbox.t
 
-        image = (
-            self._ppage.render(
-                scale=scale * 1.5,
-                rotation=0,  # no additional rotation
-                crop=padbox.as_tuple(),
-            )
-            .to_pil()
-            .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
-        )  # We resize the image from 1.5x the given scale to make it sharper.
+        with pypdfium2_lock:
+            image = (
+                self._ppage.render(
+                    scale=scale * 1.5,
+                    rotation=0,  # no additional rotation
+                    crop=padbox.as_tuple(),
+                )
+                .to_pil()
+                .resize(
+                    size=(round(cropbox.width * scale), round(cropbox.height * scale))
+                )
+            )  # We resize the image from 1.5x the given scale to make it sharper.
 
         return image
 
     def get_size(self) -> Size:
-        return Size(
-            width=self._ppage.get_width(), height=self._ppage.get_height()
-        )
-
+        with pypdfium2_lock:
+            return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
+
+        # TODO: Take width and height from docling-parse.
+        # return Size(
+        #     width=self._dpage.dimension.width,
+        #     height=self._dpage.dimension.height,
+        # )
 
     def unload(self):
         self._ppage = None
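The change above serializes all pypdfium2 calls behind the module-level `pypdfium2_lock` (pypdfium2 is not thread-safe) and supersamples at 1.5x the requested scale before resizing down. A minimal standalone sketch of that render-then-downscale trick, assuming `pip install pypdfium2`; the file path and target scale are illustrative:

```python
import pypdfium2 as pdfium

pdf = pdfium.PdfDocument("sample.pdf")  # illustrative input file
page = pdf[0]
scale = 2.0  # scale 2.0 corresponds to 144 dpi

# Render at 1.5x the requested scale, then resize down to the target size;
# the extra supersampling makes the final bitmap noticeably sharper.
image = (
    page.render(scale=scale * 1.5, rotation=0)
    .to_pil()
    .resize(size=(round(page.get_width() * scale), round(page.get_height() * scale)))
)
image.save("page_0.png")
```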
docling/backend/mspowerpoint_backend.py
CHANGED
@@ -16,6 +16,7 @@ from docling_core.types.doc import (
     TableCell,
     TableData,
 )
+from docling_core.types.doc.document import ContentLayer
 from PIL import Image, UnidentifiedImageError
 from pptx import Presentation
 from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
@@ -421,4 +422,21 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
         for shape in slide.shapes:
             handle_shapes(shape, parent_slide, slide_ind, doc, slide_size)
 
+        # Handle notes slide
+        if slide.has_notes_slide:
+            notes_slide = slide.notes_slide
+            notes_text = notes_slide.notes_text_frame.text.strip()
+            if notes_text:
+                bbox = BoundingBox(l=0, t=0, r=0, b=0)
+                prov = ProvenanceItem(
+                    page_no=slide_ind + 1, charspan=[0, len(notes_text)], bbox=bbox
+                )
+                doc.add_text(
+                    label=DocItemLabel.TEXT,
+                    parent=parent_slide,
+                    text=notes_text,
+                    prov=prov,
+                    content_layer=ContentLayer.FURNITURE,
+                )
+
         return doc
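The notes handling above is a thin layer over python-pptx. A minimal sketch of the underlying calls, with `deck.pptx` as an illustrative path:

```python
from pptx import Presentation

prs = Presentation("deck.pptx")
for slide_ind, slide in enumerate(prs.slides):
    if slide.has_notes_slide:
        notes_text = slide.notes_slide.notes_text_frame.text.strip()
        if notes_text:
            # The backend stores this text on the FURNITURE content layer,
            # keeping speaker notes out of the main document body by default.
            print(f"slide {slide_ind + 1}: {notes_text}")
```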
docling/backend/msword_backend.py
CHANGED
@@ -275,8 +275,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 only_equations.append(latex_equation)
                 texts_and_equations.append(latex_equation)
 
-        if "".join(only_texts) != text:
-
+        if "".join(only_texts).strip() != text.strip():
+            # If we are not able to reconstruct the initial raw text
+            # do not try to parse equations and return the original
+            return text, []
 
         return "".join(texts_and_equations), only_equations
 
@@ -365,6 +367,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
         for eq in equations:
             if len(text_tmp) == 0:
                 break
+
             pre_eq_text = text_tmp.split(eq, maxsplit=1)[0]
             text_tmp = text_tmp.split(eq, maxsplit=1)[1]
             if len(pre_eq_text) > 0:
docling/cli/main.py
CHANGED
@@ -32,13 +32,21 @@ from docling.datamodel.pipeline_options import (
     AcceleratorOptions,
     EasyOcrOptions,
     OcrOptions,
+    PaginatedPipelineOptions,
     PdfBackend,
+    PdfPipeline,
     PdfPipelineOptions,
     TableFormerMode,
+    VlmModelType,
+    VlmPipelineOptions,
+    granite_vision_vlm_conversion_options,
+    smoldocling_vlm_conversion_options,
+    smoldocling_vlm_mlx_conversion_options,
 )
 from docling.datamodel.settings import settings
 from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
 from docling.models.factories import get_ocr_factory
+from docling.pipeline.vlm_pipeline import VlmPipeline
 
 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
 warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
@@ -200,6 +208,14 @@ def convert(
             help="Image export mode for the document (only in case of JSON, Markdown or HTML). With `placeholder`, only the position of the image is marked in the output. In `embedded` mode, the image is embedded as base64 encoded string. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.",
         ),
     ] = ImageRefMode.EMBEDDED,
+    pipeline: Annotated[
+        PdfPipeline,
+        typer.Option(..., help="Choose the pipeline to process PDF or image files."),
+    ] = PdfPipeline.STANDARD,
+    vlm_model: Annotated[
+        VlmModelType,
+        typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
+    ] = VlmModelType.SMOLDOCLING,
     ocr: Annotated[
         bool,
         typer.Option(
@@ -420,50 +436,77 @@ def convert(
         ocr_options.lang = ocr_lang_list
 
     accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
-    pipeline_options = PdfPipelineOptions(
-        allow_external_plugins=allow_external_plugins,
-        enable_remote_services=enable_remote_services,
-        accelerator_options=accelerator_options,
-        do_ocr=ocr,
-        ocr_options=ocr_options,
-        do_table_structure=True,
-        do_code_enrichment=enrich_code,
-        do_formula_enrichment=enrich_formula,
-        do_picture_description=enrich_picture_description,
-        do_picture_classification=enrich_picture_classes,
-        document_timeout=document_timeout,
-    )
-    pipeline_options.table_structure_options.do_cell_matching = (
-        True  # do_cell_matching
-    )
-    pipeline_options.table_structure_options.mode = table_mode
+    pipeline_options: PaginatedPipelineOptions
+
+    if pipeline == PdfPipeline.STANDARD:
+        pipeline_options = PdfPipelineOptions(
+            allow_external_plugins=allow_external_plugins,
+            enable_remote_services=enable_remote_services,
+            accelerator_options=accelerator_options,
+            do_ocr=ocr,
+            ocr_options=ocr_options,
+            do_table_structure=True,
+            do_code_enrichment=enrich_code,
+            do_formula_enrichment=enrich_formula,
+            do_picture_description=enrich_picture_description,
+            do_picture_classification=enrich_picture_classes,
+            document_timeout=document_timeout,
+        )
+        pipeline_options.table_structure_options.do_cell_matching = (
+            True  # do_cell_matching
+        )
+        pipeline_options.table_structure_options.mode = table_mode
+
+        if image_export_mode != ImageRefMode.PLACEHOLDER:
+            pipeline_options.generate_page_images = True
+            pipeline_options.generate_picture_images = (
+                True  # FIXME: to be deprecated in verson 3
+            )
+            pipeline_options.images_scale = 2
+
+        backend: Type[PdfDocumentBackend]
+        if pdf_backend == PdfBackend.DLPARSE_V1:
+            backend = DoclingParseDocumentBackend
+        elif pdf_backend == PdfBackend.DLPARSE_V2:
+            backend = DoclingParseV2DocumentBackend
+        elif pdf_backend == PdfBackend.DLPARSE_V4:
+            backend = DoclingParseV4DocumentBackend  # type: ignore
+        elif pdf_backend == PdfBackend.PYPDFIUM2:
+            backend = PyPdfiumDocumentBackend  # type: ignore
+        else:
+            raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
+
+        pdf_format_option = PdfFormatOption(
+            pipeline_options=pipeline_options,
+            backend=backend,  # pdf_backend
+        )
+    elif pipeline == PdfPipeline.VLM:
+        pipeline_options = VlmPipelineOptions()
+
+        if vlm_model == VlmModelType.GRANITE_VISION:
+            pipeline_options.vlm_options = granite_vision_vlm_conversion_options
+        elif vlm_model == VlmModelType.SMOLDOCLING:
+            pipeline_options.vlm_options = smoldocling_vlm_conversion_options
+            if sys.platform == "darwin":
+                try:
+                    import mlx_vlm
+
+                    pipeline_options.vlm_options = (
+                        smoldocling_vlm_mlx_conversion_options
+                    )
+                except ImportError:
+                    _log.warning(
+                        "To run SmolDocling faster, please install mlx-vlm:\n"
+                        "pip install mlx-vlm"
+                    )
 
-    if image_export_mode != ImageRefMode.PLACEHOLDER:
-        pipeline_options.generate_page_images = True
-        pipeline_options.generate_picture_images = (
-            True  # FIXME: to be deprecated in verson 3
+        pdf_format_option = PdfFormatOption(
+            pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
         )
-    pipeline_options.images_scale = 2
 
     if artifacts_path is not None:
         pipeline_options.artifacts_path = artifacts_path
 
-    backend: Type[PdfDocumentBackend]
-    if pdf_backend == PdfBackend.DLPARSE_V1:
-        backend = DoclingParseDocumentBackend
-    elif pdf_backend == PdfBackend.DLPARSE_V2:
-        backend = DoclingParseV2DocumentBackend
-    elif pdf_backend == PdfBackend.DLPARSE_V4:
-        backend = DoclingParseV4DocumentBackend  # type: ignore
-    elif pdf_backend == PdfBackend.PYPDFIUM2:
-        backend = PyPdfiumDocumentBackend  # type: ignore
-    else:
-        raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
-
-    pdf_format_option = PdfFormatOption(
-        pipeline_options=pipeline_options,
-        backend=backend,  # pdf_backend
-    )
     format_options: Dict[InputFormat, FormatOption] = {
         InputFormat.PDF: pdf_format_option,
         InputFormat.IMAGE: pdf_format_option,
docling/datamodel/pipeline_options.py
CHANGED
@@ -263,6 +263,11 @@ class ResponseFormat(str, Enum):
     MARKDOWN = "markdown"
 
 
+class InferenceFramework(str, Enum):
+    MLX = "mlx"
+    TRANSFORMERS = "transformers"
+
+
 class HuggingFaceVlmOptions(BaseVlmOptions):
     kind: Literal["hf_model_options"] = "hf_model_options"
 
@@ -271,6 +276,7 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
     llm_int8_threshold: float = 6.0
     quantized: bool = False
 
+    inference_framework: InferenceFramework
     response_format: ResponseFormat
 
     @property
@@ -278,10 +284,19 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
         return self.repo_id.replace("/", "--")
 
 
+smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
+    repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
+    prompt="Convert this page to docling.",
+    response_format=ResponseFormat.DOCTAGS,
+    inference_framework=InferenceFramework.MLX,
+)
+
+
 smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
     repo_id="ds4sd/SmolDocling-256M-preview",
     prompt="Convert this page to docling.",
     response_format=ResponseFormat.DOCTAGS,
+    inference_framework=InferenceFramework.TRANSFORMERS,
 )
 
 granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
@@ -289,9 +304,15 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
     # prompt="OCR the full page to markdown.",
     prompt="OCR this image.",
     response_format=ResponseFormat.MARKDOWN,
+    inference_framework=InferenceFramework.TRANSFORMERS,
 )
 
 
+class VlmModelType(str, Enum):
+    SMOLDOCLING = "smoldocling"
+    GRANITE_VISION = "granite_vision"
+
+
 # Define an enum for the backend options
 class PdfBackend(str, Enum):
     """Enum of valid PDF backends."""
@@ -327,13 +348,14 @@ class PipelineOptions(BaseModel):
 
 
 class PaginatedPipelineOptions(PipelineOptions):
+    artifacts_path: Optional[Union[Path, str]] = None
+
     images_scale: float = 1.0
     generate_page_images: bool = False
     generate_picture_images: bool = False
 
 
 class VlmPipelineOptions(PaginatedPipelineOptions):
-    artifacts_path: Optional[Union[Path, str]] = None
 
     generate_page_images: bool = True
     force_backend_text: bool = (
@@ -346,7 +368,6 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
 class PdfPipelineOptions(PaginatedPipelineOptions):
     """Options for the PDF pipeline."""
 
-    artifacts_path: Optional[Union[Path, str]] = None
     do_table_structure: bool = True  # True: perform table structure extraction
     do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
     do_code_enrichment: bool = False  # True: perform code OCR
@@ -377,3 +398,8 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
     )
 
     generate_parsed_pages: bool = False
+
+
+class PdfPipeline(str, Enum):
+    STANDARD = "standard"
+    VLM = "vlm"
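Together with the new `PdfPipeline` enum, these options also let callers select the VLM pipeline programmatically. A minimal sketch mirroring what the updated CLI does; the source URL is illustrative:

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    VlmPipelineOptions,
    smoldocling_vlm_mlx_conversion_options,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

pipeline_options = VlmPipelineOptions()
# On Apple Silicon with mlx-vlm installed, the MLX build of SmolDocling
# can be selected explicitly:
pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
        )
    }
)
result = converter.convert("https://arxiv.org/pdf/2206.01062")
print(result.document.export_to_markdown())
```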
docling/models/hf_mlx_model.py
ADDED
@@ -0,0 +1,137 @@
+import logging
+import time
+from pathlib import Path
+from typing import Iterable, List, Optional
+
+from docling.datamodel.base_models import Page, VlmPrediction
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import (
+    AcceleratorDevice,
+    AcceleratorOptions,
+    HuggingFaceVlmOptions,
+)
+from docling.datamodel.settings import settings
+from docling.models.base_model import BasePageModel
+from docling.utils.accelerator_utils import decide_device
+from docling.utils.profiling import TimeRecorder
+
+_log = logging.getLogger(__name__)
+
+
+class HuggingFaceMlxModel(BasePageModel):
+
+    def __init__(
+        self,
+        enabled: bool,
+        artifacts_path: Optional[Path],
+        accelerator_options: AcceleratorOptions,
+        vlm_options: HuggingFaceVlmOptions,
+    ):
+        self.enabled = enabled
+
+        self.vlm_options = vlm_options
+
+        if self.enabled:
+
+            try:
+                from mlx_vlm import generate, load  # type: ignore
+                from mlx_vlm.prompt_utils import apply_chat_template  # type: ignore
+                from mlx_vlm.utils import load_config, stream_generate  # type: ignore
+            except ImportError:
+                raise ImportError(
+                    "mlx-vlm is not installed. Please install it via `pip install mlx-vlm` to use MLX VLM models."
+                )
+
+            repo_cache_folder = vlm_options.repo_id.replace("/", "--")
+            self.apply_chat_template = apply_chat_template
+            self.stream_generate = stream_generate
+
+            # PARAMETERS:
+            if artifacts_path is None:
+                artifacts_path = self.download_models(self.vlm_options.repo_id)
+            elif (artifacts_path / repo_cache_folder).exists():
+                artifacts_path = artifacts_path / repo_cache_folder
+
+            self.param_question = vlm_options.prompt  # "Perform Layout Analysis."
+
+            ## Load the model
+            self.vlm_model, self.processor = load(artifacts_path)
+            self.config = load_config(artifacts_path)
+
+    @staticmethod
+    def download_models(
+        repo_id: str,
+        local_dir: Optional[Path] = None,
+        force: bool = False,
+        progress: bool = False,
+    ) -> Path:
+        from huggingface_hub import snapshot_download
+        from huggingface_hub.utils import disable_progress_bars
+
+        if not progress:
+            disable_progress_bars()
+        download_path = snapshot_download(
+            repo_id=repo_id,
+            force_download=force,
+            local_dir=local_dir,
+            # revision="v0.0.1",
+        )
+
+        return Path(download_path)
+
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        for page in page_batch:
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                with TimeRecorder(conv_res, "vlm"):
+                    assert page.size is not None
+
+                    hi_res_image = page.get_image(scale=2.0)  # 144dpi
+                    # hi_res_image = page.get_image(scale=1.0)  # 72dpi
+
+                    if hi_res_image is not None:
+                        im_width, im_height = hi_res_image.size
+
+                    # populate page_tags with predicted doc tags
+                    page_tags = ""
+
+                    if hi_res_image:
+                        if hi_res_image.mode != "RGB":
+                            hi_res_image = hi_res_image.convert("RGB")
+
+                    prompt = self.apply_chat_template(
+                        self.processor, self.config, self.param_question, num_images=1
+                    )
+
+                    start_time = time.time()
+                    # Call model to generate:
+                    output = ""
+                    for token in self.stream_generate(
+                        self.vlm_model,
+                        self.processor,
+                        prompt,
+                        [hi_res_image],
+                        max_tokens=4096,
+                        verbose=False,
+                    ):
+                        output += token.text
+                        if "</doctag>" in token.text:
+                            break
+
+                    generation_time = time.time() - start_time
+                    page_tags = output
+
+                    # inference_time = time.time() - start_time
+                    # tokens_per_second = num_tokens / generation_time
+                    # print("")
+                    # print(f"Page Inference Time: {inference_time:.2f} seconds")
+                    # print(f"Total tokens on page: {num_tokens:.2f}")
+                    # print(f"Tokens/sec: {tokens_per_second:.2f}")
+                    # print("")
+                    page.predictions.vlm_response = VlmPrediction(text=page_tags)
+
+                yield page
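For context, the mlx-vlm calls this wrapper makes can be exercised standalone. A sketch assuming Apple Silicon and `pip install mlx-vlm`; the page-image path is illustrative:

```python
from mlx_vlm import load
from mlx_vlm.prompt_utils import apply_chat_template
from mlx_vlm.utils import load_config, stream_generate

model_path = "ds4sd/SmolDocling-256M-preview-mlx-bf16"
model, processor = load(model_path)
config = load_config(model_path)
prompt = apply_chat_template(
    processor, config, "Convert this page to docling.", num_images=1
)

# Stream tokens and stop as soon as the DocTags stream closes.
output = ""
for token in stream_generate(
    model, processor, prompt, ["page.png"], max_tokens=4096, verbose=False
):
    output += token.text
    if "</doctag>" in token.text:
        break
print(output)
```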
docling/pipeline/vlm_pipeline.py
CHANGED
@@ -1,30 +1,13 @@
-import itertools
 import logging
-import re
 import warnings
 from io import BytesIO
-
-# from io import BytesIO
 from pathlib import Path
-from typing import Optional
+from typing import List, Optional, Union, cast
 
-from docling_core.types import DoclingDocument
-from docling_core.types.doc import (
-    BoundingBox,
-    DocItem,
-    DocItemLabel,
-    DoclingDocument,
-    GroupLabel,
-    ImageRef,
-    ImageRefMode,
-    PictureItem,
-    ProvenanceItem,
-    Size,
-    TableCell,
-    TableData,
-    TableItem,
-)
-from docling_core.types.doc.tokens import DocumentToken, TableToken
+# from docling_core.types import DoclingDocument
+from docling_core.types.doc import BoundingBox, DocItem, ImageRef, PictureItem, TextItem
+from docling_core.types.doc.document import DocTagsDocument
+from PIL import Image as PILImage
 
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.md_backend import MarkdownDocumentBackend
@@ -32,11 +15,12 @@ from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.datamodel.base_models import InputFormat, Page
 from docling.datamodel.document import ConversionResult, InputDocument
 from docling.datamodel.pipeline_options import (
-
+    InferenceFramework,
     ResponseFormat,
     VlmPipelineOptions,
 )
 from docling.datamodel.settings import settings
+from docling.models.hf_mlx_model import HuggingFaceMlxModel
 from docling.models.hf_vlm_model import HuggingFaceVlmModel
 from docling.pipeline.base_pipeline import PaginatedPipeline
 from docling.utils.profiling import ProfilingScope, TimeRecorder
@@ -50,12 +34,6 @@ class VlmPipeline(PaginatedPipeline):
         super().__init__(pipeline_options)
         self.keep_backend = True
 
-        warnings.warn(
-            "The VlmPipeline is currently experimental and may change in upcoming versions without notice.",
-            category=UserWarning,
-            stacklevel=2,
-        )
-
         self.pipeline_options: VlmPipelineOptions
 
         artifacts_path: Optional[Path] = None
@@ -79,14 +57,27 @@ class VlmPipeline(PaginatedPipeline):
 
         self.keep_images = self.pipeline_options.generate_page_images
 
-        self.build_pipe = [
-            HuggingFaceVlmModel(
-                enabled=True,  # must be always enabled for this pipeline to make sense.
-                artifacts_path=artifacts_path,
-                accelerator_options=pipeline_options.accelerator_options,
-                vlm_options=self.pipeline_options.vlm_options,
-            ),
-        ]
+        if (
+            self.pipeline_options.vlm_options.inference_framework
+            == InferenceFramework.MLX
+        ):
+            self.build_pipe = [
+                HuggingFaceMlxModel(
+                    enabled=True,  # must be always enabled for this pipeline to make sense.
+                    artifacts_path=artifacts_path,
+                    accelerator_options=pipeline_options.accelerator_options,
+                    vlm_options=self.pipeline_options.vlm_options,
+                ),
+            ]
+        else:
+            self.build_pipe = [
+                HuggingFaceVlmModel(
+                    enabled=True,  # must be always enabled for this pipeline to make sense.
+                    artifacts_path=artifacts_path,
+                    accelerator_options=pipeline_options.accelerator_options,
+                    vlm_options=self.pipeline_options.vlm_options,
+                ),
+            ]
 
         self.enrichment_pipe = [
             # Other models working on `NodeItem` elements in the DoclingDocument
@@ -100,6 +91,17 @@ class VlmPipeline(PaginatedPipeline):
 
         return page
 
+    def extract_text_from_backend(
+        self, page: Page, bbox: Union[BoundingBox, None]
+    ) -> str:
+        # Convert bounding box normalized to 0-100 into page coordinates for cropping
+        text = ""
+        if bbox:
+            if page.size:
+                if page._backend:
+                    text = page._backend.get_text_in_rect(bbox)
+        return text
+
     def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
         with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
 
@@ -107,7 +109,45 @@ class VlmPipeline(PaginatedPipeline):
                 self.pipeline_options.vlm_options.response_format
                 == ResponseFormat.DOCTAGS
             ):
-                conv_res.document = self._turn_tags_into_doc(conv_res.pages)
+                doctags_list = []
+                image_list = []
+                for page in conv_res.pages:
+                    predicted_doctags = ""
+                    img = PILImage.new("RGB", (1, 1), "rgb(255,255,255)")
+                    if page.predictions.vlm_response:
+                        predicted_doctags = page.predictions.vlm_response.text
+                    if page.image:
+                        img = page.image
+                    image_list.append(img)
+                    doctags_list.append(predicted_doctags)
+
+                doctags_list_c = cast(List[Union[Path, str]], doctags_list)
+                image_list_c = cast(List[Union[Path, PILImage.Image]], image_list)
+                doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
+                    doctags_list_c, image_list_c
+                )
+                conv_res.document.load_from_doctags(doctags_doc)
+
+                # If forced backend text, replace model predicted text with backend one
+                if page.size:
+                    if self.force_backend_text:
+                        scale = self.pipeline_options.images_scale
+                        for element, _level in conv_res.document.iterate_items():
+                            if (
+                                not isinstance(element, TextItem)
+                                or len(element.prov) == 0
+                            ):
+                                continue
+                            crop_bbox = (
+                                element.prov[0]
+                                .bbox.scaled(scale=scale)
+                                .to_top_left_origin(
+                                    page_height=page.size.height * scale
+                                )
+                            )
+                            txt = self.extract_text_from_backend(page, crop_bbox)
+                            element.text = txt
+                            element.orig = txt
             elif (
                 self.pipeline_options.vlm_options.response_format
                 == ResponseFormat.MARKDOWN
@@ -165,366 +205,6 @@ class VlmPipeline(PaginatedPipeline):
         )
         return backend.convert()
 
-    def _turn_tags_into_doc(self, pages: list[Page]) -> DoclingDocument:
-        ###############################################
-        # Tag definitions and color mappings
-        ###############################################
-
-        # Maps the recognized tag to a Docling label.
-        # Code items will be given DocItemLabel.CODE
-        tag_to_doclabel = {
-            "title": DocItemLabel.TITLE,
-            "document_index": DocItemLabel.DOCUMENT_INDEX,
-            "otsl": DocItemLabel.TABLE,
-            "section_header_level_1": DocItemLabel.SECTION_HEADER,
-            "checkbox_selected": DocItemLabel.CHECKBOX_SELECTED,
-            "checkbox_unselected": DocItemLabel.CHECKBOX_UNSELECTED,
-            "text": DocItemLabel.TEXT,
-            "page_header": DocItemLabel.PAGE_HEADER,
-            "page_footer": DocItemLabel.PAGE_FOOTER,
-            "formula": DocItemLabel.FORMULA,
-            "caption": DocItemLabel.CAPTION,
-            "picture": DocItemLabel.PICTURE,
-            "list_item": DocItemLabel.LIST_ITEM,
-            "footnote": DocItemLabel.FOOTNOTE,
-            "code": DocItemLabel.CODE,
-        }
-
-        # Maps each tag to an associated bounding box color.
-        tag_to_color = {
-            "title": "blue",
-            "document_index": "darkblue",
-            "otsl": "green",
-            "section_header_level_1": "purple",
-            "checkbox_selected": "black",
-            "checkbox_unselected": "gray",
-            "text": "red",
-            "page_header": "orange",
-            "page_footer": "cyan",
-            "formula": "pink",
-            "caption": "magenta",
-            "picture": "yellow",
-            "list_item": "brown",
-            "footnote": "darkred",
-            "code": "lightblue",
-        }
-
-        def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
-            """Extracts <loc_...> bounding box coords from the chunk, normalized by / 500."""
-            coords = re.findall(r"<loc_(\d+)>", text_chunk)
-            if len(coords) == 4:
-                l, t, r, b = map(float, coords)
-                return BoundingBox(l=l / 500, t=t / 500, r=r / 500, b=b / 500)
-            return None
-
-        def extract_inner_text(text_chunk: str) -> str:
-            """Strips all <...> tags inside the chunk to get the raw text content."""
-            return re.sub(r"<.*?>", "", text_chunk, flags=re.DOTALL).strip()
-
-        def extract_text_from_backend(page: Page, bbox: BoundingBox | None) -> str:
-            # Convert bounding box normalized to 0-100 into page coordinates for cropping
-            text = ""
-            if bbox:
-                if page.size:
-                    bbox.l = bbox.l * page.size.width
-                    bbox.t = bbox.t * page.size.height
-                    bbox.r = bbox.r * page.size.width
-                    bbox.b = bbox.b * page.size.height
-                    if page._backend:
-                        text = page._backend.get_text_in_rect(bbox)
-            return text
-
-        def otsl_parse_texts(texts, tokens):
-            split_word = TableToken.OTSL_NL.value
-            split_row_tokens = [
-                list(y)
-                for x, y in itertools.groupby(tokens, lambda z: z == split_word)
-                if not x
-            ]
-            table_cells = []
-            r_idx = 0
-            c_idx = 0
-
-            def count_right(tokens, c_idx, r_idx, which_tokens):
-                span = 0
-                c_idx_iter = c_idx
-                while tokens[r_idx][c_idx_iter] in which_tokens:
-                    c_idx_iter += 1
-                    span += 1
-                    if c_idx_iter >= len(tokens[r_idx]):
-                        return span
-                return span
-
-            def count_down(tokens, c_idx, r_idx, which_tokens):
-                span = 0
-                r_idx_iter = r_idx
-                while tokens[r_idx_iter][c_idx] in which_tokens:
-                    r_idx_iter += 1
-                    span += 1
-                    if r_idx_iter >= len(tokens):
-                        return span
-                return span
-
-            for i, text in enumerate(texts):
-                cell_text = ""
-                if text in [
-                    TableToken.OTSL_FCEL.value,
-                    TableToken.OTSL_ECEL.value,
-                    TableToken.OTSL_CHED.value,
-                    TableToken.OTSL_RHED.value,
-                    TableToken.OTSL_SROW.value,
-                ]:
-                    row_span = 1
-                    col_span = 1
-                    right_offset = 1
-                    if text != TableToken.OTSL_ECEL.value:
-                        cell_text = texts[i + 1]
-                        right_offset = 2
-
-                    # Check next element(s) for lcel / ucel / xcel, set properly row_span, col_span
-                    next_right_cell = ""
-                    if i + right_offset < len(texts):
-                        next_right_cell = texts[i + right_offset]
-
-                    next_bottom_cell = ""
-                    if r_idx + 1 < len(split_row_tokens):
-                        if c_idx < len(split_row_tokens[r_idx + 1]):
-                            next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]
-
-                    if next_right_cell in [
-                        TableToken.OTSL_LCEL.value,
-                        TableToken.OTSL_XCEL.value,
-                    ]:
-                        # we have horisontal spanning cell or 2d spanning cell
-                        col_span += count_right(
-                            split_row_tokens,
-                            c_idx + 1,
-                            r_idx,
-                            [TableToken.OTSL_LCEL.value, TableToken.OTSL_XCEL.value],
-                        )
-                    if next_bottom_cell in [
-                        TableToken.OTSL_UCEL.value,
-                        TableToken.OTSL_XCEL.value,
-                    ]:
-                        # we have a vertical spanning cell or 2d spanning cell
-                        row_span += count_down(
-                            split_row_tokens,
-                            c_idx,
-                            r_idx + 1,
-                            [TableToken.OTSL_UCEL.value, TableToken.OTSL_XCEL.value],
-                        )
-
-                    table_cells.append(
-                        TableCell(
-                            text=cell_text.strip(),
-                            row_span=row_span,
-                            col_span=col_span,
-                            start_row_offset_idx=r_idx,
-                            end_row_offset_idx=r_idx + row_span,
-                            start_col_offset_idx=c_idx,
-                            end_col_offset_idx=c_idx + col_span,
-                        )
-                    )
-                if text in [
-                    TableToken.OTSL_FCEL.value,
-                    TableToken.OTSL_ECEL.value,
-                    TableToken.OTSL_CHED.value,
-                    TableToken.OTSL_RHED.value,
-                    TableToken.OTSL_SROW.value,
-                    TableToken.OTSL_LCEL.value,
-                    TableToken.OTSL_UCEL.value,
-                    TableToken.OTSL_XCEL.value,
-                ]:
-                    c_idx += 1
-                if text == TableToken.OTSL_NL.value:
-                    r_idx += 1
-                    c_idx = 0
-            return table_cells, split_row_tokens
-
-        def otsl_extract_tokens_and_text(s: str):
-            # Pattern to match anything enclosed by < > (including the angle brackets themselves)
-            pattern = r"(<[^>]+>)"
-            # Find all tokens (e.g. "<otsl>", "<loc_140>", etc.)
-            tokens = re.findall(pattern, s)
-            # Remove any tokens that start with "<loc_"
-            tokens = [
-                token
-                for token in tokens
-                if not (
-                    token.startswith(rf"<{DocumentToken.LOC.value}")
-                    or token
-                    in [
-                        rf"<{DocumentToken.OTSL.value}>",
-                        rf"</{DocumentToken.OTSL.value}>",
-                    ]
-                )
-            ]
-            # Split the string by those tokens to get the in-between text
-            text_parts = re.split(pattern, s)
-            text_parts = [
-                token
-                for token in text_parts
-                if not (
-                    token.startswith(rf"<{DocumentToken.LOC.value}")
-                    or token
-                    in [
-                        rf"<{DocumentToken.OTSL.value}>",
-                        rf"</{DocumentToken.OTSL.value}>",
-                    ]
-                )
-            ]
-            # Remove any empty or purely whitespace strings from text_parts
-            text_parts = [part for part in text_parts if part.strip()]
-
-            return tokens, text_parts
-
-        def parse_table_content(otsl_content: str) -> TableData:
-            tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
-            table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)
-
-            return TableData(
-                num_rows=len(split_row_tokens),
-                num_cols=(
-                    max(len(row) for row in split_row_tokens) if split_row_tokens else 0
-                ),
-                table_cells=table_cells,
-            )
-
-        doc = DoclingDocument(name="Document")
-        for pg_idx, page in enumerate(pages):
-            xml_content = ""
-            predicted_text = ""
-            if page.predictions.vlm_response:
-                predicted_text = page.predictions.vlm_response.text
-            image = page.image
-
-            page_no = pg_idx + 1
-            bounding_boxes = []
-
-            if page.size:
-                pg_width = page.size.width
-                pg_height = page.size.height
-                size = Size(width=pg_width, height=pg_height)
-                parent_page = doc.add_page(page_no=page_no, size=size)
-
-            """
-            1. Finds all <tag>...</tag> blocks in the entire string (multi-line friendly) in the order they appear.
-            2. For each chunk, extracts bounding box (if any) and inner text.
-            3. Adds the item to a DoclingDocument structure with the right label.
-            4. Tracks bounding boxes + color in a separate list for later visualization.
-            """
-
-            # Regex for all recognized tags
-            tag_pattern = (
-                rf"<(?P<tag>{DocItemLabel.TITLE}|{DocItemLabel.DOCUMENT_INDEX}|"
-                rf"{DocItemLabel.CHECKBOX_UNSELECTED}|{DocItemLabel.CHECKBOX_SELECTED}|"
-                rf"{DocItemLabel.TEXT}|{DocItemLabel.PAGE_HEADER}|"
-                rf"{DocItemLabel.PAGE_FOOTER}|{DocItemLabel.FORMULA}|"
-                rf"{DocItemLabel.CAPTION}|{DocItemLabel.PICTURE}|"
-                rf"{DocItemLabel.LIST_ITEM}|{DocItemLabel.FOOTNOTE}|{DocItemLabel.CODE}|"
-                rf"{DocItemLabel.SECTION_HEADER}_level_1|{DocumentToken.OTSL.value})>.*?</(?P=tag)>"
-            )
-
-            # DocumentToken.OTSL
-            pattern = re.compile(tag_pattern, re.DOTALL)
-
-            # Go through each match in order
-            for match in pattern.finditer(predicted_text):
-                full_chunk = match.group(0)
-                tag_name = match.group("tag")
-
-                bbox = extract_bounding_box(full_chunk)
-                doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.PARAGRAPH)
-                color = tag_to_color.get(tag_name, "white")
-
-                # Store bounding box + color
-                if bbox:
-                    bounding_boxes.append((bbox, color))
-
-                if tag_name == DocumentToken.OTSL.value:
-                    table_data = parse_table_content(full_chunk)
-                    bbox = extract_bounding_box(full_chunk)
-
-                    if bbox:
-                        prov = ProvenanceItem(
-                            bbox=bbox.resize_by_scale(pg_width, pg_height),
-                            charspan=(0, 0),
-                            page_no=page_no,
-                        )
-                        doc.add_table(data=table_data, prov=prov)
-                    else:
-                        doc.add_table(data=table_data)
-
-                elif tag_name == DocItemLabel.PICTURE:
-                    text_caption_content = extract_inner_text(full_chunk)
-                    if image:
-                        if bbox:
-                            im_width, im_height = image.size
-
-                            crop_box = (
-                                int(bbox.l * im_width),
-                                int(bbox.t * im_height),
-                                int(bbox.r * im_width),
-                                int(bbox.b * im_height),
-                            )
-                            cropped_image = image.crop(crop_box)
-                            pic = doc.add_picture(
-                                parent=None,
-                                image=ImageRef.from_pil(image=cropped_image, dpi=72),
-                                prov=(
-                                    ProvenanceItem(
-                                        bbox=bbox.resize_by_scale(pg_width, pg_height),
-                                        charspan=(0, 0),
-                                        page_no=page_no,
-                                    )
-                                ),
-                            )
-                            # If there is a caption to an image, add it as well
-                            if len(text_caption_content) > 0:
-                                caption_item = doc.add_text(
-                                    label=DocItemLabel.CAPTION,
-                                    text=text_caption_content,
-                                    parent=None,
-                                )
-                                pic.captions.append(caption_item.get_ref())
-                    else:
-                        if bbox:
-                            # In case we don't have access to an binary of an image
-                            doc.add_picture(
-                                parent=None,
-                                prov=ProvenanceItem(
-                                    bbox=bbox, charspan=(0, 0), page_no=page_no
-                                ),
-                            )
-                            # If there is a caption to an image, add it as well
-                            if len(text_caption_content) > 0:
-                                caption_item = doc.add_text(
-                                    label=DocItemLabel.CAPTION,
-                                    text=text_caption_content,
-                                    parent=None,
-                                )
-                                pic.captions.append(caption_item.get_ref())
-                else:
-                    # For everything else, treat as text
-                    if self.force_backend_text:
-                        text_content = extract_text_from_backend(page, bbox)
-                    else:
-                        text_content = extract_inner_text(full_chunk)
-                    doc.add_text(
-                        label=doc_label,
-                        text=text_content,
-                        prov=(
-                            ProvenanceItem(
-                                bbox=bbox.resize_by_scale(pg_width, pg_height),
-                                charspan=(0, len(text_content)),
-                                page_no=page_no,
-                            )
-                            if bbox
-                            else None
-                        ),
-                    )
-        return doc
-
     @classmethod
     def get_default_options(cls) -> VlmPipelineOptions:
         return VlmPipelineOptions()
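The hand-rolled DocTags parser removed above is superseded by docling-core's `DocTagsDocument`, which the reworked `_assemble_document` delegates to. A sketch of that API; the DocTags string and blank image are illustrative stand-ins for real model output:

```python
from docling_core.types.doc import DoclingDocument
from docling_core.types.doc.document import DocTagsDocument
from PIL import Image as PILImage

# Illustrative DocTags output for a single page.
doctags = "<doctag><text><loc_100><loc_100><loc_400><loc_120>Hello Docling</text></doctag>"
page_image = PILImage.new("RGB", (640, 480), "white")

doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [page_image])
doc = DoclingDocument(name="Document")
doc.load_from_doctags(doctags_doc)
print(doc.export_to_markdown())
```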
{docling-2.27.0.dist-info → docling-2.28.0.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.27.0
+Version: 2.28.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/docling-project/docling
 License: MIT
@@ -28,7 +28,7 @@ Provides-Extra: vlm
 Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
 Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
 Requires-Dist: certifi (>=2024.7.4)
-Requires-Dist: docling-core[chunking] (>=2.23.
+Requires-Dist: docling-core[chunking] (>=2.23.1,<3.0.0)
 Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
 Requires-Dist: docling-parse (>=4.0.0,<5.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)
@@ -86,6 +86,7 @@ Description-Content-Type: text/markdown
 [](https://opensource.org/licenses/MIT)
 [](https://pepy.tech/projects/docling)
 [](https://apify.com/vancura/docling)
+[](https://lfaidata.foundation/projects/)
 
 Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.
 
@@ -98,12 +99,12 @@ Docling simplifies document processing, parsing diverse formats — including ad
 * 🔒 Local execution capabilities for sensitive data and air-gapped environments
 * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
 * 🔍 Extensive OCR support for scanned PDFs and images
+* 🥚 Support of Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) 🆕
 * 💻 Simple and convenient CLI
 
 ### Coming soon
 
 * 📝 Metadata extraction, including title, authors, references & language
-* 📝 Inclusion of Visual Language Models ([SmolDocling](https://huggingface.co/blog/smolervlm#smoldocling))
 * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
 * 📝 Complex chemistry understanding (Molecular structures)
 
@@ -120,7 +121,7 @@ More [detailed installation instructions](https://docling-project.github.io/docl
 
 ## Getting started
 
-To convert individual documents, use `convert()`, for example:
+To convert individual documents with python, use `convert()`, for example:
 
 ```python
 from docling.document_converter import DocumentConverter
@@ -134,6 +135,22 @@ print(result.document.export_to_markdown())  # output: "## Docling Technical Rep
 More [advanced usage options](https://docling-project.github.io/docling/usage/) are available in
 the docs.
 
+## CLI
+
+Docling has a built-in CLI to run conversions.
+
+```bash
+docling https://arxiv.org/pdf/2206.01062
+```
+
+You can also use 🥚[SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview) and other VLMs via Docling CLI:
+```bash
+docling --pipeline vlm --vlm-model smoldocling https://arxiv.org/pdf/2206.01062
+```
+This will use MLX acceleration on supported Apple Silicon hardware.
+
+Read more [here](https://docling-project.github.io/docling/usage/)
+
 ## Documentation
 
 Check out Docling's [documentation](https://docling-project.github.io/docling/), for details on
@@ -150,32 +167,6 @@ To further accelerate your AI application development, check out Docling's nativ
 [integrations](https://docling-project.github.io/docling/integrations/) with popular frameworks
 and tools.
 
-## Apify Actor
-
-<a href="https://apify.com/vancura/docling?fpr=docling"><img src="https://apify.com/ext/run-on-apify.png" alt="Run Docling Actor on Apify" width="176" height="39" /></a>
-
-You can run Docling in the cloud without installation using the [Docling Actor](https://apify.com/vancura/docling?fpr=docling) on Apify platform. Simply provide a document URL and get the processed result:
-
-```bash
-apify call vancura/docling -i '{
-  "options": {
-    "to_formats": ["md", "json", "html", "text", "doctags"]
-  },
-  "http_sources": [
-    {"url": "https://vancura.dev/assets/actor-test/facial-hairstyles-and-filtering-facepiece-respirators.pdf"},
-    {"url": "https://arxiv.org/pdf/2408.09869"}
-  ]
-}'
-```
-
-The Actor stores results in:
-
-* Processed document in key-value store (`OUTPUT_RESULT`)
-* Processing logs (`DOCLING_LOG`)
-* Dataset record with result URL and status
-
-Read more about the [Docling Actor](.actor/README.md), including how to use it via the Apify API and CLI.
-
 ## Get help and support
 
 Please feel free to connect with us using the [discussion section](https://github.com/docling-project/docling/discussions).
@@ -210,9 +201,13 @@ If you use Docling in your projects, please consider citing the following:
 The Docling codebase is under MIT license.
 For individual model usage, please refer to the model licenses found in the original packages.
 
-##
+## LF AI & Data
+
+Docling is hosted as a project in the [LF AI & Data Foundation](https://lfaidata.foundation/projects/).
+
+### IBM ❤️ Open Source AI
 
-
+The project was started by the AI for knowledge team at IBM Research Zurich.
 
 [supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
 [docling_document]: https://docling-project.github.io/docling/concepts/docling_document/
{docling-2.27.0.dist-info → docling-2.28.0.dist-info}/RECORD
RENAMED
@@ -5,7 +5,7 @@ docling/backend/asciidoc_backend.py,sha256=xBtmYkRkPICIfMbB8AFIw_or4IZGB17mP_LhX
 docling/backend/csv_backend.py,sha256=lCNSkgB55IbAig7w4IyXRkX23aM3Nojj6GdXNoaNjY4,4536
 docling/backend/docling_parse_backend.py,sha256=tcy4cPD_dtGD37CjivbFvwzwXVcrb3HVmofyasxLum8,7991
 docling/backend/docling_parse_v2_backend.py,sha256=70kXqYhht-A8zb9z5emMe_1i0l9dyQGrM8lg1cmAvqc,9369
-docling/backend/docling_parse_v4_backend.py,sha256=
+docling/backend/docling_parse_v4_backend.py,sha256=IECMJQWEvYqQv043_1Ho6dLkCbuaK8cMUsqcxwqruXo,6287
 docling/backend/docx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/backend/docx/latex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/backend/docx/latex/latex_dict.py,sha256=a0UC3VLmG1BLN-hGmEaQamzKbDB10fCz0U8qRU--aBw,6613
@@ -15,8 +15,8 @@ docling/backend/json/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
 docling/backend/json/docling_json_backend.py,sha256=LlFMVoZrrCfVwbDuRbNN4Xg96Lujh4xxrTBt9jGhY9I,1984
 docling/backend/md_backend.py,sha256=v230PXShYJo2QaabwUHiBpE-EGScHIerjL78zPaJpZM,16837
 docling/backend/msexcel_backend.py,sha256=_ZVZFKRRijpg-Xz10xNxu2m-NpDaYvoiBqEZP6GbrgE,11095
-docling/backend/mspowerpoint_backend.py,sha256=
-docling/backend/msword_backend.py,sha256=
+docling/backend/mspowerpoint_backend.py,sha256=zXdXr8nGJJbPGTgR5_dqq5WmNL1wDCaK0RqFqtuHPqs,17213
+docling/backend/msword_backend.py,sha256=VjTvJe249FjHJDBpK0RC4iyosMzmpJLTuFIAPNEdReU,23259
 docling/backend/pdf_backend.py,sha256=odWb1rxk3WCUIEJMhq-dYFNUQ1pSDuNHbU9wlTZIRAs,2211
 docling/backend/pypdfium2_backend.py,sha256=wRwhA5XHRqL7vyNhCAHM6P-ONkwtyjKG9LgC4NJ-4i8,10784
 docling/backend/xml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -24,13 +24,13 @@ docling/backend/xml/jats_backend.py,sha256=HXailrDjiwu4swwFnXy3lNfRtLZmkBBp4yqaf
 docling/backend/xml/uspto_backend.py,sha256=H0jwIt2skOke_yEUk0wfXCtodrB-hrj2ygLtB3jMWaI,71056
 docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
 docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/cli/main.py,sha256=
+docling/cli/main.py,sha256=zr36i-itYkX013g_DK6aNiNe8UPaD27_A7UtG5qwLUo,20174
 docling/cli/models.py,sha256=tM_qbMM3YOPxFU7JlME96MLbtd1CX_bOAK7FS-NhJvY,3979
 docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
 docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/datamodel/base_models.py,sha256=MAHr8LlffZ2uIXZ3AXOsikh_-oQIEYTiwwjsz-dQW9U,7287
 docling/datamodel/document.py,sha256=DbJifyMgBEkAk80BMYXTuSgqH2vijDENDkU7Fmr6j_g,14567
-docling/datamodel/pipeline_options.py,sha256=
+docling/datamodel/pipeline_options.py,sha256=TpRf_-7UuCjjaytFWA0nL2m-KP4no9jeAjaXRjBLMLE,12593
 docling/datamodel/settings.py,sha256=bNMdowIKv7RUchabQTo4rFNEsxfB6pGg2LoZSY634zo,1869
 docling/document_converter.py,sha256=LwbnfGzma937EmSrNWMzM-dldI9Cbu4DUgY8gL1OVHo,13184
 docling/exceptions.py,sha256=K1WnCS1leK2JtMB5ewZWKkb0EaijFgl-tRzrO9ntgPM,134
@@ -44,6 +44,7 @@ docling/models/factories/__init__.py,sha256=e4lFmRfmW5hWqvJjY5xaVFbvCQhDBCrVeSq8
 docling/models/factories/base_factory.py,sha256=pNR9-B_BKs2sYNyHnp2ON2l3r6Dy9lcof4qmwHlAryI,4032
 docling/models/factories/ocr_factory.py,sha256=G5RkmkKvkl-ihpo6qSj8WC77VdlVSQ1s0ekwUX2ILts,316
 docling/models/factories/picture_description_factory.py,sha256=Ru3-TnVVEKf5O07C_UpGf2HCOHc7j20AJzfficw3agM,385
+docling/models/hf_mlx_model.py,sha256=2eSHphJm5LAfiSA24blVMc2znJlKMYrtmmzq8ffc-rU,4924
 docling/models/hf_vlm_model.py,sha256=NUtLEuG-kNGJeDHWmQKAAOZG4WF0a5hn-KXUUM1mHBQ,6820
 docling/models/layout_model.py,sha256=7fQWipGV1HDrvbP4uOKa9QAicQl89jp7lailQmbFL3w,7804
 docling/models/ocr_mac_model.py,sha256=2pZaUWg19go_u88mKWr5y_52PAYEN__GsbyUYLdY4zo,5353
@@ -63,7 +64,7 @@ docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
 docling/pipeline/base_pipeline.py,sha256=9ABK-Cr235bxE5vweoIA5rgBZV_EF8qFxAqLI27H_Pg,8749
 docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
 docling/pipeline/standard_pdf_pipeline.py,sha256=tHOHFyJajX6IAhm4y3I27uqn5jfMTuCaSaFOKT5JM2M,10593
-docling/pipeline/vlm_pipeline.py,sha256=
+docling/pipeline/vlm_pipeline.py,sha256=1eKt3gqWf6PxGvYZuqhKi2BFljJGJWIyHemzOAwa39Y,9065
 docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
 docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/utils/accelerator_utils.py,sha256=ONNRrC8fH-8E93WUCNhfOq1t7WrQ1T7-YsmExTOY5f0,2292
@@ -76,8 +77,8 @@ docling/utils/ocr_utils.py,sha256=F7iOOjqolUcImUzir4qjDQd4QWSO3s6JC4WRn3U7uY4,26
 docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
 docling/utils/utils.py,sha256=0ozCk7zUkYzxRVmYoIB2zA1lqjQOuaQzxfGuf1wmKW4,1866
 docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
-docling-2.
-docling-2.
-docling-2.
-docling-2.
-docling-2.
+docling-2.28.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
+docling-2.28.0.dist-info/METADATA,sha256=miIkWRX5hgrOeGbyYDAiQaymAR6PxK6Qdlss5DR1YhM,9982
+docling-2.28.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+docling-2.28.0.dist-info/entry_points.txt,sha256=pIxel-UeVo1S7FhoNG5xgEfPjLZfBLi_N9TsGPtJSLo,144
+docling-2.28.0.dist-info/RECORD,,
{docling-2.27.0.dist-info → docling-2.28.0.dist-info}/LICENSE
File without changes
{docling-2.27.0.dist-info → docling-2.28.0.dist-info}/WHEEL
File without changes
{docling-2.27.0.dist-info → docling-2.28.0.dist-info}/entry_points.txt
File without changes