docling 2.27.0__py3-none-any.whl → 2.28.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -112,23 +112,30 @@ class DoclingParseV4PageBackend(PdfPageBackend):
112
112
  padbox.r = page_size.width - padbox.r
113
113
  padbox.t = page_size.height - padbox.t
114
114
 
115
- image = (
116
- self._ppage.render(
117
- scale=scale * 1.5,
118
- rotation=0, # no additional rotation
119
- crop=padbox.as_tuple(),
120
- )
121
- .to_pil()
122
- .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
123
- ) # We resize the image from 1.5x the given scale to make it sharper.
115
+ with pypdfium2_lock:
116
+ image = (
117
+ self._ppage.render(
118
+ scale=scale * 1.5,
119
+ rotation=0, # no additional rotation
120
+ crop=padbox.as_tuple(),
121
+ )
122
+ .to_pil()
123
+ .resize(
124
+ size=(round(cropbox.width * scale), round(cropbox.height * scale))
125
+ )
126
+ ) # We resize the image from 1.5x the given scale to make it sharper.
124
127
 
125
128
  return image
126
129
 
127
130
  def get_size(self) -> Size:
128
- return Size(
129
- width=self._dpage.dimension.width,
130
- height=self._dpage.dimension.height,
131
- )
131
+ with pypdfium2_lock:
132
+ return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
133
+
134
+ # TODO: Take width and height from docling-parse.
135
+ # return Size(
136
+ # width=self._dpage.dimension.width,
137
+ # height=self._dpage.dimension.height,
138
+ # )
132
139
 
133
140
  def unload(self):
134
141
  self._ppage = None
@@ -16,6 +16,7 @@ from docling_core.types.doc import (
16
16
  TableCell,
17
17
  TableData,
18
18
  )
19
+ from docling_core.types.doc.document import ContentLayer
19
20
  from PIL import Image, UnidentifiedImageError
20
21
  from pptx import Presentation
21
22
  from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
@@ -421,4 +422,21 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
421
422
  for shape in slide.shapes:
422
423
  handle_shapes(shape, parent_slide, slide_ind, doc, slide_size)
423
424
 
425
+ # Handle notes slide
426
+ if slide.has_notes_slide:
427
+ notes_slide = slide.notes_slide
428
+ notes_text = notes_slide.notes_text_frame.text.strip()
429
+ if notes_text:
430
+ bbox = BoundingBox(l=0, t=0, r=0, b=0)
431
+ prov = ProvenanceItem(
432
+ page_no=slide_ind + 1, charspan=[0, len(notes_text)], bbox=bbox
433
+ )
434
+ doc.add_text(
435
+ label=DocItemLabel.TEXT,
436
+ parent=parent_slide,
437
+ text=notes_text,
438
+ prov=prov,
439
+ content_layer=ContentLayer.FURNITURE,
440
+ )
441
+
424
442
  return doc
@@ -275,8 +275,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
275
275
  only_equations.append(latex_equation)
276
276
  texts_and_equations.append(latex_equation)
277
277
 
278
- if "".join(only_texts) != text:
279
- return text
278
+ if "".join(only_texts).strip() != text.strip():
279
+ # If we are not able to reconstruct the initial raw text
280
+ # do not try to parse equations and return the original
281
+ return text, []
280
282
 
281
283
  return "".join(texts_and_equations), only_equations
282
284
 
@@ -365,6 +367,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
365
367
  for eq in equations:
366
368
  if len(text_tmp) == 0:
367
369
  break
370
+
368
371
  pre_eq_text = text_tmp.split(eq, maxsplit=1)[0]
369
372
  text_tmp = text_tmp.split(eq, maxsplit=1)[1]
370
373
  if len(pre_eq_text) > 0:
docling/cli/main.py CHANGED
@@ -32,13 +32,21 @@ from docling.datamodel.pipeline_options import (
32
32
  AcceleratorOptions,
33
33
  EasyOcrOptions,
34
34
  OcrOptions,
35
+ PaginatedPipelineOptions,
35
36
  PdfBackend,
37
+ PdfPipeline,
36
38
  PdfPipelineOptions,
37
39
  TableFormerMode,
40
+ VlmModelType,
41
+ VlmPipelineOptions,
42
+ granite_vision_vlm_conversion_options,
43
+ smoldocling_vlm_conversion_options,
44
+ smoldocling_vlm_mlx_conversion_options,
38
45
  )
39
46
  from docling.datamodel.settings import settings
40
47
  from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
41
48
  from docling.models.factories import get_ocr_factory
49
+ from docling.pipeline.vlm_pipeline import VlmPipeline
42
50
 
43
51
  warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
44
52
  warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
@@ -200,6 +208,14 @@ def convert(
200
208
  help="Image export mode for the document (only in case of JSON, Markdown or HTML). With `placeholder`, only the position of the image is marked in the output. In `embedded` mode, the image is embedded as base64 encoded string. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.",
201
209
  ),
202
210
  ] = ImageRefMode.EMBEDDED,
211
+ pipeline: Annotated[
212
+ PdfPipeline,
213
+ typer.Option(..., help="Choose the pipeline to process PDF or image files."),
214
+ ] = PdfPipeline.STANDARD,
215
+ vlm_model: Annotated[
216
+ VlmModelType,
217
+ typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
218
+ ] = VlmModelType.SMOLDOCLING,
203
219
  ocr: Annotated[
204
220
  bool,
205
221
  typer.Option(
@@ -420,50 +436,77 @@ def convert(
420
436
  ocr_options.lang = ocr_lang_list
421
437
 
422
438
  accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
423
- pipeline_options = PdfPipelineOptions(
424
- allow_external_plugins=allow_external_plugins,
425
- enable_remote_services=enable_remote_services,
426
- accelerator_options=accelerator_options,
427
- do_ocr=ocr,
428
- ocr_options=ocr_options,
429
- do_table_structure=True,
430
- do_code_enrichment=enrich_code,
431
- do_formula_enrichment=enrich_formula,
432
- do_picture_description=enrich_picture_description,
433
- do_picture_classification=enrich_picture_classes,
434
- document_timeout=document_timeout,
435
- )
436
- pipeline_options.table_structure_options.do_cell_matching = (
437
- True # do_cell_matching
438
- )
439
- pipeline_options.table_structure_options.mode = table_mode
439
+ pipeline_options: PaginatedPipelineOptions
440
+
441
+ if pipeline == PdfPipeline.STANDARD:
442
+ pipeline_options = PdfPipelineOptions(
443
+ allow_external_plugins=allow_external_plugins,
444
+ enable_remote_services=enable_remote_services,
445
+ accelerator_options=accelerator_options,
446
+ do_ocr=ocr,
447
+ ocr_options=ocr_options,
448
+ do_table_structure=True,
449
+ do_code_enrichment=enrich_code,
450
+ do_formula_enrichment=enrich_formula,
451
+ do_picture_description=enrich_picture_description,
452
+ do_picture_classification=enrich_picture_classes,
453
+ document_timeout=document_timeout,
454
+ )
455
+ pipeline_options.table_structure_options.do_cell_matching = (
456
+ True # do_cell_matching
457
+ )
458
+ pipeline_options.table_structure_options.mode = table_mode
459
+
460
+ if image_export_mode != ImageRefMode.PLACEHOLDER:
461
+ pipeline_options.generate_page_images = True
462
+ pipeline_options.generate_picture_images = (
463
+ True # FIXME: to be deprecated in verson 3
464
+ )
465
+ pipeline_options.images_scale = 2
466
+
467
+ backend: Type[PdfDocumentBackend]
468
+ if pdf_backend == PdfBackend.DLPARSE_V1:
469
+ backend = DoclingParseDocumentBackend
470
+ elif pdf_backend == PdfBackend.DLPARSE_V2:
471
+ backend = DoclingParseV2DocumentBackend
472
+ elif pdf_backend == PdfBackend.DLPARSE_V4:
473
+ backend = DoclingParseV4DocumentBackend # type: ignore
474
+ elif pdf_backend == PdfBackend.PYPDFIUM2:
475
+ backend = PyPdfiumDocumentBackend # type: ignore
476
+ else:
477
+ raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
478
+
479
+ pdf_format_option = PdfFormatOption(
480
+ pipeline_options=pipeline_options,
481
+ backend=backend, # pdf_backend
482
+ )
483
+ elif pipeline == PdfPipeline.VLM:
484
+ pipeline_options = VlmPipelineOptions()
485
+
486
+ if vlm_model == VlmModelType.GRANITE_VISION:
487
+ pipeline_options.vlm_options = granite_vision_vlm_conversion_options
488
+ elif vlm_model == VlmModelType.SMOLDOCLING:
489
+ pipeline_options.vlm_options = smoldocling_vlm_conversion_options
490
+ if sys.platform == "darwin":
491
+ try:
492
+ import mlx_vlm
493
+
494
+ pipeline_options.vlm_options = (
495
+ smoldocling_vlm_mlx_conversion_options
496
+ )
497
+ except ImportError:
498
+ _log.warning(
499
+ "To run SmolDocling faster, please install mlx-vlm:\n"
500
+ "pip install mlx-vlm"
501
+ )
440
502
 
441
- if image_export_mode != ImageRefMode.PLACEHOLDER:
442
- pipeline_options.generate_page_images = True
443
- pipeline_options.generate_picture_images = (
444
- True # FIXME: to be deprecated in verson 3
503
+ pdf_format_option = PdfFormatOption(
504
+ pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
445
505
  )
446
- pipeline_options.images_scale = 2
447
506
 
448
507
  if artifacts_path is not None:
449
508
  pipeline_options.artifacts_path = artifacts_path
450
509
 
451
- backend: Type[PdfDocumentBackend]
452
- if pdf_backend == PdfBackend.DLPARSE_V1:
453
- backend = DoclingParseDocumentBackend
454
- elif pdf_backend == PdfBackend.DLPARSE_V2:
455
- backend = DoclingParseV2DocumentBackend
456
- elif pdf_backend == PdfBackend.DLPARSE_V4:
457
- backend = DoclingParseV4DocumentBackend # type: ignore
458
- elif pdf_backend == PdfBackend.PYPDFIUM2:
459
- backend = PyPdfiumDocumentBackend # type: ignore
460
- else:
461
- raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
462
-
463
- pdf_format_option = PdfFormatOption(
464
- pipeline_options=pipeline_options,
465
- backend=backend, # pdf_backend
466
- )
467
510
  format_options: Dict[InputFormat, FormatOption] = {
468
511
  InputFormat.PDF: pdf_format_option,
469
512
  InputFormat.IMAGE: pdf_format_option,
@@ -263,6 +263,11 @@ class ResponseFormat(str, Enum):
263
263
  MARKDOWN = "markdown"
264
264
 
265
265
 
266
+ class InferenceFramework(str, Enum):
267
+ MLX = "mlx"
268
+ TRANSFORMERS = "transformers"
269
+
270
+
266
271
  class HuggingFaceVlmOptions(BaseVlmOptions):
267
272
  kind: Literal["hf_model_options"] = "hf_model_options"
268
273
 
@@ -271,6 +276,7 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
271
276
  llm_int8_threshold: float = 6.0
272
277
  quantized: bool = False
273
278
 
279
+ inference_framework: InferenceFramework
274
280
  response_format: ResponseFormat
275
281
 
276
282
  @property
@@ -278,10 +284,19 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
278
284
  return self.repo_id.replace("/", "--")
279
285
 
280
286
 
287
+ smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
288
+ repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
289
+ prompt="Convert this page to docling.",
290
+ response_format=ResponseFormat.DOCTAGS,
291
+ inference_framework=InferenceFramework.MLX,
292
+ )
293
+
294
+
281
295
  smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
282
296
  repo_id="ds4sd/SmolDocling-256M-preview",
283
297
  prompt="Convert this page to docling.",
284
298
  response_format=ResponseFormat.DOCTAGS,
299
+ inference_framework=InferenceFramework.TRANSFORMERS,
285
300
  )
286
301
 
287
302
  granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
@@ -289,9 +304,15 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
289
304
  # prompt="OCR the full page to markdown.",
290
305
  prompt="OCR this image.",
291
306
  response_format=ResponseFormat.MARKDOWN,
307
+ inference_framework=InferenceFramework.TRANSFORMERS,
292
308
  )
293
309
 
294
310
 
311
+ class VlmModelType(str, Enum):
312
+ SMOLDOCLING = "smoldocling"
313
+ GRANITE_VISION = "granite_vision"
314
+
315
+
295
316
  # Define an enum for the backend options
296
317
  class PdfBackend(str, Enum):
297
318
  """Enum of valid PDF backends."""
@@ -327,13 +348,14 @@ class PipelineOptions(BaseModel):
327
348
 
328
349
 
329
350
  class PaginatedPipelineOptions(PipelineOptions):
351
+ artifacts_path: Optional[Union[Path, str]] = None
352
+
330
353
  images_scale: float = 1.0
331
354
  generate_page_images: bool = False
332
355
  generate_picture_images: bool = False
333
356
 
334
357
 
335
358
  class VlmPipelineOptions(PaginatedPipelineOptions):
336
- artifacts_path: Optional[Union[Path, str]] = None
337
359
 
338
360
  generate_page_images: bool = True
339
361
  force_backend_text: bool = (
@@ -346,7 +368,6 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
346
368
  class PdfPipelineOptions(PaginatedPipelineOptions):
347
369
  """Options for the PDF pipeline."""
348
370
 
349
- artifacts_path: Optional[Union[Path, str]] = None
350
371
  do_table_structure: bool = True # True: perform table structure extraction
351
372
  do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
352
373
  do_code_enrichment: bool = False # True: perform code OCR
@@ -377,3 +398,8 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
377
398
  )
378
399
 
379
400
  generate_parsed_pages: bool = False
401
+
402
+
403
+ class PdfPipeline(str, Enum):
404
+ STANDARD = "standard"
405
+ VLM = "vlm"
@@ -0,0 +1,137 @@
1
+ import logging
2
+ import time
3
+ from pathlib import Path
4
+ from typing import Iterable, List, Optional
5
+
6
+ from docling.datamodel.base_models import Page, VlmPrediction
7
+ from docling.datamodel.document import ConversionResult
8
+ from docling.datamodel.pipeline_options import (
9
+ AcceleratorDevice,
10
+ AcceleratorOptions,
11
+ HuggingFaceVlmOptions,
12
+ )
13
+ from docling.datamodel.settings import settings
14
+ from docling.models.base_model import BasePageModel
15
+ from docling.utils.accelerator_utils import decide_device
16
+ from docling.utils.profiling import TimeRecorder
17
+
18
+ _log = logging.getLogger(__name__)
19
+
20
+
21
+ class HuggingFaceMlxModel(BasePageModel):
22
+
23
+ def __init__(
24
+ self,
25
+ enabled: bool,
26
+ artifacts_path: Optional[Path],
27
+ accelerator_options: AcceleratorOptions,
28
+ vlm_options: HuggingFaceVlmOptions,
29
+ ):
30
+ self.enabled = enabled
31
+
32
+ self.vlm_options = vlm_options
33
+
34
+ if self.enabled:
35
+
36
+ try:
37
+ from mlx_vlm import generate, load # type: ignore
38
+ from mlx_vlm.prompt_utils import apply_chat_template # type: ignore
39
+ from mlx_vlm.utils import load_config, stream_generate # type: ignore
40
+ except ImportError:
41
+ raise ImportError(
42
+ "mlx-vlm is not installed. Please install it via `pip install mlx-vlm` to use MLX VLM models."
43
+ )
44
+
45
+ repo_cache_folder = vlm_options.repo_id.replace("/", "--")
46
+ self.apply_chat_template = apply_chat_template
47
+ self.stream_generate = stream_generate
48
+
49
+ # PARAMETERS:
50
+ if artifacts_path is None:
51
+ artifacts_path = self.download_models(self.vlm_options.repo_id)
52
+ elif (artifacts_path / repo_cache_folder).exists():
53
+ artifacts_path = artifacts_path / repo_cache_folder
54
+
55
+ self.param_question = vlm_options.prompt # "Perform Layout Analysis."
56
+
57
+ ## Load the model
58
+ self.vlm_model, self.processor = load(artifacts_path)
59
+ self.config = load_config(artifacts_path)
60
+
61
+ @staticmethod
62
+ def download_models(
63
+ repo_id: str,
64
+ local_dir: Optional[Path] = None,
65
+ force: bool = False,
66
+ progress: bool = False,
67
+ ) -> Path:
68
+ from huggingface_hub import snapshot_download
69
+ from huggingface_hub.utils import disable_progress_bars
70
+
71
+ if not progress:
72
+ disable_progress_bars()
73
+ download_path = snapshot_download(
74
+ repo_id=repo_id,
75
+ force_download=force,
76
+ local_dir=local_dir,
77
+ # revision="v0.0.1",
78
+ )
79
+
80
+ return Path(download_path)
81
+
82
+ def __call__(
83
+ self, conv_res: ConversionResult, page_batch: Iterable[Page]
84
+ ) -> Iterable[Page]:
85
+ for page in page_batch:
86
+ assert page._backend is not None
87
+ if not page._backend.is_valid():
88
+ yield page
89
+ else:
90
+ with TimeRecorder(conv_res, "vlm"):
91
+ assert page.size is not None
92
+
93
+ hi_res_image = page.get_image(scale=2.0) # 144dpi
94
+ # hi_res_image = page.get_image(scale=1.0) # 72dpi
95
+
96
+ if hi_res_image is not None:
97
+ im_width, im_height = hi_res_image.size
98
+
99
+ # populate page_tags with predicted doc tags
100
+ page_tags = ""
101
+
102
+ if hi_res_image:
103
+ if hi_res_image.mode != "RGB":
104
+ hi_res_image = hi_res_image.convert("RGB")
105
+
106
+ prompt = self.apply_chat_template(
107
+ self.processor, self.config, self.param_question, num_images=1
108
+ )
109
+
110
+ start_time = time.time()
111
+ # Call model to generate:
112
+ output = ""
113
+ for token in self.stream_generate(
114
+ self.vlm_model,
115
+ self.processor,
116
+ prompt,
117
+ [hi_res_image],
118
+ max_tokens=4096,
119
+ verbose=False,
120
+ ):
121
+ output += token.text
122
+ if "</doctag>" in token.text:
123
+ break
124
+
125
+ generation_time = time.time() - start_time
126
+ page_tags = output
127
+
128
+ # inference_time = time.time() - start_time
129
+ # tokens_per_second = num_tokens / generation_time
130
+ # print("")
131
+ # print(f"Page Inference Time: {inference_time:.2f} seconds")
132
+ # print(f"Total tokens on page: {num_tokens:.2f}")
133
+ # print(f"Tokens/sec: {tokens_per_second:.2f}")
134
+ # print("")
135
+ page.predictions.vlm_response = VlmPrediction(text=page_tags)
136
+
137
+ yield page
@@ -1,30 +1,13 @@
1
- import itertools
2
1
  import logging
3
- import re
4
2
  import warnings
5
3
  from io import BytesIO
6
-
7
- # from io import BytesIO
8
4
  from pathlib import Path
9
- from typing import Optional
5
+ from typing import List, Optional, Union, cast
10
6
 
11
- from docling_core.types import DoclingDocument
12
- from docling_core.types.doc import (
13
- BoundingBox,
14
- DocItem,
15
- DocItemLabel,
16
- DoclingDocument,
17
- GroupLabel,
18
- ImageRef,
19
- ImageRefMode,
20
- PictureItem,
21
- ProvenanceItem,
22
- Size,
23
- TableCell,
24
- TableData,
25
- TableItem,
26
- )
27
- from docling_core.types.doc.tokens import DocumentToken, TableToken
7
+ # from docling_core.types import DoclingDocument
8
+ from docling_core.types.doc import BoundingBox, DocItem, ImageRef, PictureItem, TextItem
9
+ from docling_core.types.doc.document import DocTagsDocument
10
+ from PIL import Image as PILImage
28
11
 
29
12
  from docling.backend.abstract_backend import AbstractDocumentBackend
30
13
  from docling.backend.md_backend import MarkdownDocumentBackend
@@ -32,11 +15,12 @@ from docling.backend.pdf_backend import PdfDocumentBackend
32
15
  from docling.datamodel.base_models import InputFormat, Page
33
16
  from docling.datamodel.document import ConversionResult, InputDocument
34
17
  from docling.datamodel.pipeline_options import (
35
- PdfPipelineOptions,
18
+ InferenceFramework,
36
19
  ResponseFormat,
37
20
  VlmPipelineOptions,
38
21
  )
39
22
  from docling.datamodel.settings import settings
23
+ from docling.models.hf_mlx_model import HuggingFaceMlxModel
40
24
  from docling.models.hf_vlm_model import HuggingFaceVlmModel
41
25
  from docling.pipeline.base_pipeline import PaginatedPipeline
42
26
  from docling.utils.profiling import ProfilingScope, TimeRecorder
@@ -50,12 +34,6 @@ class VlmPipeline(PaginatedPipeline):
50
34
  super().__init__(pipeline_options)
51
35
  self.keep_backend = True
52
36
 
53
- warnings.warn(
54
- "The VlmPipeline is currently experimental and may change in upcoming versions without notice.",
55
- category=UserWarning,
56
- stacklevel=2,
57
- )
58
-
59
37
  self.pipeline_options: VlmPipelineOptions
60
38
 
61
39
  artifacts_path: Optional[Path] = None
@@ -79,14 +57,27 @@ class VlmPipeline(PaginatedPipeline):
79
57
 
80
58
  self.keep_images = self.pipeline_options.generate_page_images
81
59
 
82
- self.build_pipe = [
83
- HuggingFaceVlmModel(
84
- enabled=True, # must be always enabled for this pipeline to make sense.
85
- artifacts_path=artifacts_path,
86
- accelerator_options=pipeline_options.accelerator_options,
87
- vlm_options=self.pipeline_options.vlm_options,
88
- ),
89
- ]
60
+ if (
61
+ self.pipeline_options.vlm_options.inference_framework
62
+ == InferenceFramework.MLX
63
+ ):
64
+ self.build_pipe = [
65
+ HuggingFaceMlxModel(
66
+ enabled=True, # must be always enabled for this pipeline to make sense.
67
+ artifacts_path=artifacts_path,
68
+ accelerator_options=pipeline_options.accelerator_options,
69
+ vlm_options=self.pipeline_options.vlm_options,
70
+ ),
71
+ ]
72
+ else:
73
+ self.build_pipe = [
74
+ HuggingFaceVlmModel(
75
+ enabled=True, # must be always enabled for this pipeline to make sense.
76
+ artifacts_path=artifacts_path,
77
+ accelerator_options=pipeline_options.accelerator_options,
78
+ vlm_options=self.pipeline_options.vlm_options,
79
+ ),
80
+ ]
90
81
 
91
82
  self.enrichment_pipe = [
92
83
  # Other models working on `NodeItem` elements in the DoclingDocument
@@ -100,6 +91,17 @@ class VlmPipeline(PaginatedPipeline):
100
91
 
101
92
  return page
102
93
 
94
+ def extract_text_from_backend(
95
+ self, page: Page, bbox: Union[BoundingBox, None]
96
+ ) -> str:
97
+ # Convert bounding box normalized to 0-100 into page coordinates for cropping
98
+ text = ""
99
+ if bbox:
100
+ if page.size:
101
+ if page._backend:
102
+ text = page._backend.get_text_in_rect(bbox)
103
+ return text
104
+
103
105
  def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
104
106
  with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
105
107
 
@@ -107,7 +109,45 @@ class VlmPipeline(PaginatedPipeline):
107
109
  self.pipeline_options.vlm_options.response_format
108
110
  == ResponseFormat.DOCTAGS
109
111
  ):
110
- conv_res.document = self._turn_tags_into_doc(conv_res.pages)
112
+ doctags_list = []
113
+ image_list = []
114
+ for page in conv_res.pages:
115
+ predicted_doctags = ""
116
+ img = PILImage.new("RGB", (1, 1), "rgb(255,255,255)")
117
+ if page.predictions.vlm_response:
118
+ predicted_doctags = page.predictions.vlm_response.text
119
+ if page.image:
120
+ img = page.image
121
+ image_list.append(img)
122
+ doctags_list.append(predicted_doctags)
123
+
124
+ doctags_list_c = cast(List[Union[Path, str]], doctags_list)
125
+ image_list_c = cast(List[Union[Path, PILImage.Image]], image_list)
126
+ doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
127
+ doctags_list_c, image_list_c
128
+ )
129
+ conv_res.document.load_from_doctags(doctags_doc)
130
+
131
+ # If forced backend text, replace model predicted text with backend one
132
+ if page.size:
133
+ if self.force_backend_text:
134
+ scale = self.pipeline_options.images_scale
135
+ for element, _level in conv_res.document.iterate_items():
136
+ if (
137
+ not isinstance(element, TextItem)
138
+ or len(element.prov) == 0
139
+ ):
140
+ continue
141
+ crop_bbox = (
142
+ element.prov[0]
143
+ .bbox.scaled(scale=scale)
144
+ .to_top_left_origin(
145
+ page_height=page.size.height * scale
146
+ )
147
+ )
148
+ txt = self.extract_text_from_backend(page, crop_bbox)
149
+ element.text = txt
150
+ element.orig = txt
111
151
  elif (
112
152
  self.pipeline_options.vlm_options.response_format
113
153
  == ResponseFormat.MARKDOWN
@@ -165,366 +205,6 @@ class VlmPipeline(PaginatedPipeline):
165
205
  )
166
206
  return backend.convert()
167
207
 
168
- def _turn_tags_into_doc(self, pages: list[Page]) -> DoclingDocument:
169
- ###############################################
170
- # Tag definitions and color mappings
171
- ###############################################
172
-
173
- # Maps the recognized tag to a Docling label.
174
- # Code items will be given DocItemLabel.CODE
175
- tag_to_doclabel = {
176
- "title": DocItemLabel.TITLE,
177
- "document_index": DocItemLabel.DOCUMENT_INDEX,
178
- "otsl": DocItemLabel.TABLE,
179
- "section_header_level_1": DocItemLabel.SECTION_HEADER,
180
- "checkbox_selected": DocItemLabel.CHECKBOX_SELECTED,
181
- "checkbox_unselected": DocItemLabel.CHECKBOX_UNSELECTED,
182
- "text": DocItemLabel.TEXT,
183
- "page_header": DocItemLabel.PAGE_HEADER,
184
- "page_footer": DocItemLabel.PAGE_FOOTER,
185
- "formula": DocItemLabel.FORMULA,
186
- "caption": DocItemLabel.CAPTION,
187
- "picture": DocItemLabel.PICTURE,
188
- "list_item": DocItemLabel.LIST_ITEM,
189
- "footnote": DocItemLabel.FOOTNOTE,
190
- "code": DocItemLabel.CODE,
191
- }
192
-
193
- # Maps each tag to an associated bounding box color.
194
- tag_to_color = {
195
- "title": "blue",
196
- "document_index": "darkblue",
197
- "otsl": "green",
198
- "section_header_level_1": "purple",
199
- "checkbox_selected": "black",
200
- "checkbox_unselected": "gray",
201
- "text": "red",
202
- "page_header": "orange",
203
- "page_footer": "cyan",
204
- "formula": "pink",
205
- "caption": "magenta",
206
- "picture": "yellow",
207
- "list_item": "brown",
208
- "footnote": "darkred",
209
- "code": "lightblue",
210
- }
211
-
212
- def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
213
- """Extracts <loc_...> bounding box coords from the chunk, normalized by / 500."""
214
- coords = re.findall(r"<loc_(\d+)>", text_chunk)
215
- if len(coords) == 4:
216
- l, t, r, b = map(float, coords)
217
- return BoundingBox(l=l / 500, t=t / 500, r=r / 500, b=b / 500)
218
- return None
219
-
220
- def extract_inner_text(text_chunk: str) -> str:
221
- """Strips all <...> tags inside the chunk to get the raw text content."""
222
- return re.sub(r"<.*?>", "", text_chunk, flags=re.DOTALL).strip()
223
-
224
- def extract_text_from_backend(page: Page, bbox: BoundingBox | None) -> str:
225
- # Convert bounding box normalized to 0-100 into page coordinates for cropping
226
- text = ""
227
- if bbox:
228
- if page.size:
229
- bbox.l = bbox.l * page.size.width
230
- bbox.t = bbox.t * page.size.height
231
- bbox.r = bbox.r * page.size.width
232
- bbox.b = bbox.b * page.size.height
233
- if page._backend:
234
- text = page._backend.get_text_in_rect(bbox)
235
- return text
236
-
237
- def otsl_parse_texts(texts, tokens):
238
- split_word = TableToken.OTSL_NL.value
239
- split_row_tokens = [
240
- list(y)
241
- for x, y in itertools.groupby(tokens, lambda z: z == split_word)
242
- if not x
243
- ]
244
- table_cells = []
245
- r_idx = 0
246
- c_idx = 0
247
-
248
- def count_right(tokens, c_idx, r_idx, which_tokens):
249
- span = 0
250
- c_idx_iter = c_idx
251
- while tokens[r_idx][c_idx_iter] in which_tokens:
252
- c_idx_iter += 1
253
- span += 1
254
- if c_idx_iter >= len(tokens[r_idx]):
255
- return span
256
- return span
257
-
258
- def count_down(tokens, c_idx, r_idx, which_tokens):
259
- span = 0
260
- r_idx_iter = r_idx
261
- while tokens[r_idx_iter][c_idx] in which_tokens:
262
- r_idx_iter += 1
263
- span += 1
264
- if r_idx_iter >= len(tokens):
265
- return span
266
- return span
267
-
268
- for i, text in enumerate(texts):
269
- cell_text = ""
270
- if text in [
271
- TableToken.OTSL_FCEL.value,
272
- TableToken.OTSL_ECEL.value,
273
- TableToken.OTSL_CHED.value,
274
- TableToken.OTSL_RHED.value,
275
- TableToken.OTSL_SROW.value,
276
- ]:
277
- row_span = 1
278
- col_span = 1
279
- right_offset = 1
280
- if text != TableToken.OTSL_ECEL.value:
281
- cell_text = texts[i + 1]
282
- right_offset = 2
283
-
284
- # Check next element(s) for lcel / ucel / xcel, set properly row_span, col_span
285
- next_right_cell = ""
286
- if i + right_offset < len(texts):
287
- next_right_cell = texts[i + right_offset]
288
-
289
- next_bottom_cell = ""
290
- if r_idx + 1 < len(split_row_tokens):
291
- if c_idx < len(split_row_tokens[r_idx + 1]):
292
- next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]
293
-
294
- if next_right_cell in [
295
- TableToken.OTSL_LCEL.value,
296
- TableToken.OTSL_XCEL.value,
297
- ]:
298
- # we have horisontal spanning cell or 2d spanning cell
299
- col_span += count_right(
300
- split_row_tokens,
301
- c_idx + 1,
302
- r_idx,
303
- [TableToken.OTSL_LCEL.value, TableToken.OTSL_XCEL.value],
304
- )
305
- if next_bottom_cell in [
306
- TableToken.OTSL_UCEL.value,
307
- TableToken.OTSL_XCEL.value,
308
- ]:
309
- # we have a vertical spanning cell or 2d spanning cell
310
- row_span += count_down(
311
- split_row_tokens,
312
- c_idx,
313
- r_idx + 1,
314
- [TableToken.OTSL_UCEL.value, TableToken.OTSL_XCEL.value],
315
- )
316
-
317
- table_cells.append(
318
- TableCell(
319
- text=cell_text.strip(),
320
- row_span=row_span,
321
- col_span=col_span,
322
- start_row_offset_idx=r_idx,
323
- end_row_offset_idx=r_idx + row_span,
324
- start_col_offset_idx=c_idx,
325
- end_col_offset_idx=c_idx + col_span,
326
- )
327
- )
328
- if text in [
329
- TableToken.OTSL_FCEL.value,
330
- TableToken.OTSL_ECEL.value,
331
- TableToken.OTSL_CHED.value,
332
- TableToken.OTSL_RHED.value,
333
- TableToken.OTSL_SROW.value,
334
- TableToken.OTSL_LCEL.value,
335
- TableToken.OTSL_UCEL.value,
336
- TableToken.OTSL_XCEL.value,
337
- ]:
338
- c_idx += 1
339
- if text == TableToken.OTSL_NL.value:
340
- r_idx += 1
341
- c_idx = 0
342
- return table_cells, split_row_tokens
343
-
344
- def otsl_extract_tokens_and_text(s: str):
345
- # Pattern to match anything enclosed by < > (including the angle brackets themselves)
346
- pattern = r"(<[^>]+>)"
347
- # Find all tokens (e.g. "<otsl>", "<loc_140>", etc.)
348
- tokens = re.findall(pattern, s)
349
- # Remove any tokens that start with "<loc_"
350
- tokens = [
351
- token
352
- for token in tokens
353
- if not (
354
- token.startswith(rf"<{DocumentToken.LOC.value}")
355
- or token
356
- in [
357
- rf"<{DocumentToken.OTSL.value}>",
358
- rf"</{DocumentToken.OTSL.value}>",
359
- ]
360
- )
361
- ]
362
- # Split the string by those tokens to get the in-between text
363
- text_parts = re.split(pattern, s)
364
- text_parts = [
365
- token
366
- for token in text_parts
367
- if not (
368
- token.startswith(rf"<{DocumentToken.LOC.value}")
369
- or token
370
- in [
371
- rf"<{DocumentToken.OTSL.value}>",
372
- rf"</{DocumentToken.OTSL.value}>",
373
- ]
374
- )
375
- ]
376
- # Remove any empty or purely whitespace strings from text_parts
377
- text_parts = [part for part in text_parts if part.strip()]
378
-
379
- return tokens, text_parts
380
-
381
- def parse_table_content(otsl_content: str) -> TableData:
382
- tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
383
- table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)
384
-
385
- return TableData(
386
- num_rows=len(split_row_tokens),
387
- num_cols=(
388
- max(len(row) for row in split_row_tokens) if split_row_tokens else 0
389
- ),
390
- table_cells=table_cells,
391
- )
392
-
393
- doc = DoclingDocument(name="Document")
394
- for pg_idx, page in enumerate(pages):
395
- xml_content = ""
396
- predicted_text = ""
397
- if page.predictions.vlm_response:
398
- predicted_text = page.predictions.vlm_response.text
399
- image = page.image
400
-
401
- page_no = pg_idx + 1
402
- bounding_boxes = []
403
-
404
- if page.size:
405
- pg_width = page.size.width
406
- pg_height = page.size.height
407
- size = Size(width=pg_width, height=pg_height)
408
- parent_page = doc.add_page(page_no=page_no, size=size)
409
-
410
- """
411
- 1. Finds all <tag>...</tag> blocks in the entire string (multi-line friendly) in the order they appear.
412
- 2. For each chunk, extracts bounding box (if any) and inner text.
413
- 3. Adds the item to a DoclingDocument structure with the right label.
414
- 4. Tracks bounding boxes + color in a separate list for later visualization.
415
- """
416
-
417
- # Regex for all recognized tags
418
- tag_pattern = (
419
- rf"<(?P<tag>{DocItemLabel.TITLE}|{DocItemLabel.DOCUMENT_INDEX}|"
420
- rf"{DocItemLabel.CHECKBOX_UNSELECTED}|{DocItemLabel.CHECKBOX_SELECTED}|"
421
- rf"{DocItemLabel.TEXT}|{DocItemLabel.PAGE_HEADER}|"
422
- rf"{DocItemLabel.PAGE_FOOTER}|{DocItemLabel.FORMULA}|"
423
- rf"{DocItemLabel.CAPTION}|{DocItemLabel.PICTURE}|"
424
- rf"{DocItemLabel.LIST_ITEM}|{DocItemLabel.FOOTNOTE}|{DocItemLabel.CODE}|"
425
- rf"{DocItemLabel.SECTION_HEADER}_level_1|{DocumentToken.OTSL.value})>.*?</(?P=tag)>"
426
- )
427
-
428
- # DocumentToken.OTSL
429
- pattern = re.compile(tag_pattern, re.DOTALL)
430
-
431
- # Go through each match in order
432
- for match in pattern.finditer(predicted_text):
433
- full_chunk = match.group(0)
434
- tag_name = match.group("tag")
435
-
436
- bbox = extract_bounding_box(full_chunk)
437
- doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.PARAGRAPH)
438
- color = tag_to_color.get(tag_name, "white")
439
-
440
- # Store bounding box + color
441
- if bbox:
442
- bounding_boxes.append((bbox, color))
443
-
444
- if tag_name == DocumentToken.OTSL.value:
445
- table_data = parse_table_content(full_chunk)
446
- bbox = extract_bounding_box(full_chunk)
447
-
448
- if bbox:
449
- prov = ProvenanceItem(
450
- bbox=bbox.resize_by_scale(pg_width, pg_height),
451
- charspan=(0, 0),
452
- page_no=page_no,
453
- )
454
- doc.add_table(data=table_data, prov=prov)
455
- else:
456
- doc.add_table(data=table_data)
457
-
458
- elif tag_name == DocItemLabel.PICTURE:
459
- text_caption_content = extract_inner_text(full_chunk)
460
- if image:
461
- if bbox:
462
- im_width, im_height = image.size
463
-
464
- crop_box = (
465
- int(bbox.l * im_width),
466
- int(bbox.t * im_height),
467
- int(bbox.r * im_width),
468
- int(bbox.b * im_height),
469
- )
470
- cropped_image = image.crop(crop_box)
471
- pic = doc.add_picture(
472
- parent=None,
473
- image=ImageRef.from_pil(image=cropped_image, dpi=72),
474
- prov=(
475
- ProvenanceItem(
476
- bbox=bbox.resize_by_scale(pg_width, pg_height),
477
- charspan=(0, 0),
478
- page_no=page_no,
479
- )
480
- ),
481
- )
482
- # If there is a caption to an image, add it as well
483
- if len(text_caption_content) > 0:
484
- caption_item = doc.add_text(
485
- label=DocItemLabel.CAPTION,
486
- text=text_caption_content,
487
- parent=None,
488
- )
489
- pic.captions.append(caption_item.get_ref())
490
- else:
491
- if bbox:
492
- # In case we don't have access to an binary of an image
493
- doc.add_picture(
494
- parent=None,
495
- prov=ProvenanceItem(
496
- bbox=bbox, charspan=(0, 0), page_no=page_no
497
- ),
498
- )
499
- # If there is a caption to an image, add it as well
500
- if len(text_caption_content) > 0:
501
- caption_item = doc.add_text(
502
- label=DocItemLabel.CAPTION,
503
- text=text_caption_content,
504
- parent=None,
505
- )
506
- pic.captions.append(caption_item.get_ref())
507
- else:
508
- # For everything else, treat as text
509
- if self.force_backend_text:
510
- text_content = extract_text_from_backend(page, bbox)
511
- else:
512
- text_content = extract_inner_text(full_chunk)
513
- doc.add_text(
514
- label=doc_label,
515
- text=text_content,
516
- prov=(
517
- ProvenanceItem(
518
- bbox=bbox.resize_by_scale(pg_width, pg_height),
519
- charspan=(0, len(text_content)),
520
- page_no=page_no,
521
- )
522
- if bbox
523
- else None
524
- ),
525
- )
526
- return doc
527
-
528
208
  @classmethod
529
209
  def get_default_options(cls) -> VlmPipelineOptions:
530
210
  return VlmPipelineOptions()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.27.0
3
+ Version: 2.28.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/docling-project/docling
6
6
  License: MIT
@@ -28,7 +28,7 @@ Provides-Extra: vlm
28
28
  Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
29
29
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
30
30
  Requires-Dist: certifi (>=2024.7.4)
31
- Requires-Dist: docling-core[chunking] (>=2.23.0,<3.0.0)
31
+ Requires-Dist: docling-core[chunking] (>=2.23.1,<3.0.0)
32
32
  Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
33
33
  Requires-Dist: docling-parse (>=4.0.0,<5.0.0)
34
34
  Requires-Dist: easyocr (>=1.7,<2.0)
@@ -86,6 +86,7 @@ Description-Content-Type: text/markdown
86
86
  [![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT)
87
87
  [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
88
88
  [![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling)
89
+ [![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/)
89
90
 
90
91
  Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.
91
92
 
@@ -98,12 +99,12 @@ Docling simplifies document processing, parsing diverse formats — including ad
98
99
  * 🔒 Local execution capabilities for sensitive data and air-gapped environments
99
100
  * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
100
101
  * 🔍 Extensive OCR support for scanned PDFs and images
102
+ * 🥚 Support for Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) 🆕
101
103
  * 💻 Simple and convenient CLI
102
104
 
103
105
  ### Coming soon
104
106
 
105
107
  * 📝 Metadata extraction, including title, authors, references & language
106
- * 📝 Inclusion of Visual Language Models ([SmolDocling](https://huggingface.co/blog/smolervlm#smoldocling))
107
108
  * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
108
109
  * 📝 Complex chemistry understanding (Molecular structures)
109
110
 
@@ -120,7 +121,7 @@ More [detailed installation instructions](https://docling-project.github.io/docl
120
121
 
121
122
  ## Getting started
122
123
 
123
- To convert individual documents, use `convert()`, for example:
124
+ To convert individual documents with Python, use `convert()`, for example:
124
125
 
125
126
  ```python
126
127
  from docling.document_converter import DocumentConverter
@@ -134,6 +135,22 @@ print(result.document.export_to_markdown()) # output: "## Docling Technical Rep
134
135
  More [advanced usage options](https://docling-project.github.io/docling/usage/) are available in
135
136
  the docs.
136
137
 
138
+ ## CLI
139
+
140
+ Docling has a built-in CLI to run conversions.
141
+
142
+ ```bash
143
+ docling https://arxiv.org/pdf/2206.01062
144
+ ```
145
+
146
+ You can also use 🥚[SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview) and other VLMs via the Docling CLI:
147
+ ```bash
148
+ docling --pipeline vlm --vlm-model smoldocling https://arxiv.org/pdf/2206.01062
149
+ ```
150
+ This will use MLX acceleration on supported Apple Silicon hardware.
151
+
152
+ Read more in the [usage documentation](https://docling-project.github.io/docling/usage/).
153
+
137
154
  ## Documentation
138
155
 
139
156
  Check out Docling's [documentation](https://docling-project.github.io/docling/), for details on
@@ -150,32 +167,6 @@ To further accelerate your AI application development, check out Docling's nativ
150
167
  [integrations](https://docling-project.github.io/docling/integrations/) with popular frameworks
151
168
  and tools.
152
169
 
153
- ## Apify Actor
154
-
155
- <a href="https://apify.com/vancura/docling?fpr=docling"><img src="https://apify.com/ext/run-on-apify.png" alt="Run Docling Actor on Apify" width="176" height="39" /></a>
156
-
157
- You can run Docling in the cloud without installation using the [Docling Actor](https://apify.com/vancura/docling?fpr=docling) on Apify platform. Simply provide a document URL and get the processed result:
158
-
159
- ```bash
160
- apify call vancura/docling -i '{
161
- "options": {
162
- "to_formats": ["md", "json", "html", "text", "doctags"]
163
- },
164
- "http_sources": [
165
- {"url": "https://vancura.dev/assets/actor-test/facial-hairstyles-and-filtering-facepiece-respirators.pdf"},
166
- {"url": "https://arxiv.org/pdf/2408.09869"}
167
- ]
168
- }'
169
- ```
170
-
171
- The Actor stores results in:
172
-
173
- * Processed document in key-value store (`OUTPUT_RESULT`)
174
- * Processing logs (`DOCLING_LOG`)
175
- * Dataset record with result URL and status
176
-
177
- Read more about the [Docling Actor](.actor/README.md), including how to use it via the Apify API and CLI.
178
-
179
170
  ## Get help and support
180
171
 
181
172
  Please feel free to connect with us using the [discussion section](https://github.com/docling-project/docling/discussions).
@@ -210,9 +201,13 @@ If you use Docling in your projects, please consider citing the following:
210
201
  The Docling codebase is under MIT license.
211
202
  For individual model usage, please refer to the model licenses found in the original packages.
212
203
 
213
- ## IBM ❤️ Open Source AI
204
+ ## LF AI & Data
205
+
206
+ Docling is hosted as a project in the [LF AI & Data Foundation](https://lfaidata.foundation/projects/).
207
+
208
+ ### IBM ❤️ Open Source AI
214
209
 
215
- Docling has been brought to you by IBM.
210
+ The project was started by the AI for knowledge team at IBM Research Zurich.
216
211
 
217
212
  [supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/
218
213
  [docling_document]: https://docling-project.github.io/docling/concepts/docling_document/
@@ -5,7 +5,7 @@ docling/backend/asciidoc_backend.py,sha256=xBtmYkRkPICIfMbB8AFIw_or4IZGB17mP_LhX
5
5
  docling/backend/csv_backend.py,sha256=lCNSkgB55IbAig7w4IyXRkX23aM3Nojj6GdXNoaNjY4,4536
6
6
  docling/backend/docling_parse_backend.py,sha256=tcy4cPD_dtGD37CjivbFvwzwXVcrb3HVmofyasxLum8,7991
7
7
  docling/backend/docling_parse_v2_backend.py,sha256=70kXqYhht-A8zb9z5emMe_1i0l9dyQGrM8lg1cmAvqc,9369
8
- docling/backend/docling_parse_v4_backend.py,sha256=sUjcgD62n2Z15gOYhLNAnwkzqSAnlQ8eKkDuVrlK_rk,6002
8
+ docling/backend/docling_parse_v4_backend.py,sha256=IECMJQWEvYqQv043_1Ho6dLkCbuaK8cMUsqcxwqruXo,6287
9
9
  docling/backend/docx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  docling/backend/docx/latex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
11
  docling/backend/docx/latex/latex_dict.py,sha256=a0UC3VLmG1BLN-hGmEaQamzKbDB10fCz0U8qRU--aBw,6613
@@ -15,8 +15,8 @@ docling/backend/json/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hS
15
15
  docling/backend/json/docling_json_backend.py,sha256=LlFMVoZrrCfVwbDuRbNN4Xg96Lujh4xxrTBt9jGhY9I,1984
16
16
  docling/backend/md_backend.py,sha256=v230PXShYJo2QaabwUHiBpE-EGScHIerjL78zPaJpZM,16837
17
17
  docling/backend/msexcel_backend.py,sha256=_ZVZFKRRijpg-Xz10xNxu2m-NpDaYvoiBqEZP6GbrgE,11095
18
- docling/backend/mspowerpoint_backend.py,sha256=wUriELF9wHwThITXxSyseVASe6W6Sw0E7Qg_U-Q3JNU,16434
19
- docling/backend/msword_backend.py,sha256=uSQJ5PHoTIlw2bcAe8NGWutjgceNYWfg4N1ze17F4D0,23101
18
+ docling/backend/mspowerpoint_backend.py,sha256=zXdXr8nGJJbPGTgR5_dqq5WmNL1wDCaK0RqFqtuHPqs,17213
19
+ docling/backend/msword_backend.py,sha256=VjTvJe249FjHJDBpK0RC4iyosMzmpJLTuFIAPNEdReU,23259
20
20
  docling/backend/pdf_backend.py,sha256=odWb1rxk3WCUIEJMhq-dYFNUQ1pSDuNHbU9wlTZIRAs,2211
21
21
  docling/backend/pypdfium2_backend.py,sha256=wRwhA5XHRqL7vyNhCAHM6P-ONkwtyjKG9LgC4NJ-4i8,10784
22
22
  docling/backend/xml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -24,13 +24,13 @@ docling/backend/xml/jats_backend.py,sha256=HXailrDjiwu4swwFnXy3lNfRtLZmkBBp4yqaf
24
24
  docling/backend/xml/uspto_backend.py,sha256=H0jwIt2skOke_yEUk0wfXCtodrB-hrj2ygLtB3jMWaI,71056
25
25
  docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
26
26
  docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
27
- docling/cli/main.py,sha256=1N4h1HrNCWEymkqb4_mXyplcdVgVNAR7lRAZFXTiRKk,18310
27
+ docling/cli/main.py,sha256=zr36i-itYkX013g_DK6aNiNe8UPaD27_A7UtG5qwLUo,20174
28
28
  docling/cli/models.py,sha256=tM_qbMM3YOPxFU7JlME96MLbtd1CX_bOAK7FS-NhJvY,3979
29
29
  docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
30
30
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
31
31
  docling/datamodel/base_models.py,sha256=MAHr8LlffZ2uIXZ3AXOsikh_-oQIEYTiwwjsz-dQW9U,7287
32
32
  docling/datamodel/document.py,sha256=DbJifyMgBEkAk80BMYXTuSgqH2vijDENDkU7Fmr6j_g,14567
33
- docling/datamodel/pipeline_options.py,sha256=n45Xgl1qnrHZxztd4CyhdDPYa8FygADJ8EpfbUuIlmc,11963
33
+ docling/datamodel/pipeline_options.py,sha256=TpRf_-7UuCjjaytFWA0nL2m-KP4no9jeAjaXRjBLMLE,12593
34
34
  docling/datamodel/settings.py,sha256=bNMdowIKv7RUchabQTo4rFNEsxfB6pGg2LoZSY634zo,1869
35
35
  docling/document_converter.py,sha256=LwbnfGzma937EmSrNWMzM-dldI9Cbu4DUgY8gL1OVHo,13184
36
36
  docling/exceptions.py,sha256=K1WnCS1leK2JtMB5ewZWKkb0EaijFgl-tRzrO9ntgPM,134
@@ -44,6 +44,7 @@ docling/models/factories/__init__.py,sha256=e4lFmRfmW5hWqvJjY5xaVFbvCQhDBCrVeSq8
44
44
  docling/models/factories/base_factory.py,sha256=pNR9-B_BKs2sYNyHnp2ON2l3r6Dy9lcof4qmwHlAryI,4032
45
45
  docling/models/factories/ocr_factory.py,sha256=G5RkmkKvkl-ihpo6qSj8WC77VdlVSQ1s0ekwUX2ILts,316
46
46
  docling/models/factories/picture_description_factory.py,sha256=Ru3-TnVVEKf5O07C_UpGf2HCOHc7j20AJzfficw3agM,385
47
+ docling/models/hf_mlx_model.py,sha256=2eSHphJm5LAfiSA24blVMc2znJlKMYrtmmzq8ffc-rU,4924
47
48
  docling/models/hf_vlm_model.py,sha256=NUtLEuG-kNGJeDHWmQKAAOZG4WF0a5hn-KXUUM1mHBQ,6820
48
49
  docling/models/layout_model.py,sha256=7fQWipGV1HDrvbP4uOKa9QAicQl89jp7lailQmbFL3w,7804
49
50
  docling/models/ocr_mac_model.py,sha256=2pZaUWg19go_u88mKWr5y_52PAYEN__GsbyUYLdY4zo,5353
@@ -63,7 +64,7 @@ docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
63
64
  docling/pipeline/base_pipeline.py,sha256=9ABK-Cr235bxE5vweoIA5rgBZV_EF8qFxAqLI27H_Pg,8749
64
65
  docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
65
66
  docling/pipeline/standard_pdf_pipeline.py,sha256=tHOHFyJajX6IAhm4y3I27uqn5jfMTuCaSaFOKT5JM2M,10593
66
- docling/pipeline/vlm_pipeline.py,sha256=glPwNH1QEuHj35L3tdPyuCX0CGlJn81ZDFrj3WwLa7o,22265
67
+ docling/pipeline/vlm_pipeline.py,sha256=1eKt3gqWf6PxGvYZuqhKi2BFljJGJWIyHemzOAwa39Y,9065
67
68
  docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
68
69
  docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
69
70
  docling/utils/accelerator_utils.py,sha256=ONNRrC8fH-8E93WUCNhfOq1t7WrQ1T7-YsmExTOY5f0,2292
@@ -76,8 +77,8 @@ docling/utils/ocr_utils.py,sha256=F7iOOjqolUcImUzir4qjDQd4QWSO3s6JC4WRn3U7uY4,26
76
77
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
77
78
  docling/utils/utils.py,sha256=0ozCk7zUkYzxRVmYoIB2zA1lqjQOuaQzxfGuf1wmKW4,1866
78
79
  docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
79
- docling-2.27.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
80
- docling-2.27.0.dist-info/METADATA,sha256=bjSjck82ddDda67NwQaZwW_s9T_jTHw9lE3RhhXf1Y4,10142
81
- docling-2.27.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
82
- docling-2.27.0.dist-info/entry_points.txt,sha256=pIxel-UeVo1S7FhoNG5xgEfPjLZfBLi_N9TsGPtJSLo,144
83
- docling-2.27.0.dist-info/RECORD,,
80
+ docling-2.28.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
81
+ docling-2.28.0.dist-info/METADATA,sha256=miIkWRX5hgrOeGbyYDAiQaymAR6PxK6Qdlss5DR1YhM,9982
82
+ docling-2.28.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
83
+ docling-2.28.0.dist-info/entry_points.txt,sha256=pIxel-UeVo1S7FhoNG5xgEfPjLZfBLi_N9TsGPtJSLo,144
84
+ docling-2.28.0.dist-info/RECORD,,