docling 2.24.0__py3-none-any.whl → 2.25.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,10 @@
1
1
  import logging
2
2
  from io import BytesIO
3
3
  from pathlib import Path
4
- from typing import Optional, Union, cast
4
+ from typing import Final, Optional, Union, cast
5
5
 
6
6
  from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
7
+ from bs4.element import PreformattedString
7
8
  from docling_core.types.doc import (
8
9
  DocItem,
9
10
  DocItemLabel,
@@ -22,12 +23,29 @@ from docling.datamodel.document import InputDocument
22
23
 
23
24
  _log = logging.getLogger(__name__)
24
25
 
26
# HTML tag names that generate NodeItem elements. `walk` checks the following
# sibling tags against this list to decide when accumulated floating text
# should be emitted.
TAGS_FOR_NODE_ITEMS: Final = [
    # headings
    "h1", "h2", "h3", "h4", "h5", "h6",
    # text blocks
    "p", "pre",
    # lists
    "ul", "ol", "li",
    # tables and pictures
    "table", "figure", "img",
]
43
+
25
44
 
26
45
  class HTMLDocumentBackend(DeclarativeDocumentBackend):
27
46
  @override
28
47
  def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
29
48
  super().__init__(in_doc, path_or_stream)
30
- _log.debug("About to init HTML backend...")
31
49
  self.soup: Optional[Tag] = None
32
50
  # HTML file:
33
51
  self.path_or_stream = path_or_stream
@@ -88,6 +106,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
88
106
  assert self.soup is not None
89
107
  content = self.soup.body or self.soup
90
108
  # Replace <br> tags with newline characters
109
+ # TODO: remove style to avoid losing text from tags like i, b, span, ...
91
110
  for br in content("br"):
92
111
  br.replace_with(NavigableString("\n"))
93
112
  self.walk(content, doc)
@@ -99,6 +118,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
99
118
 
100
119
  def walk(self, tag: Tag, doc: DoclingDocument) -> None:
101
120
  # Iterate over elements in the body of the document
121
+ text: str = ""
102
122
  for element in tag.children:
103
123
  if isinstance(element, Tag):
104
124
  try:
@@ -108,6 +128,25 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
108
128
  f"Error processing child from tag{tag.name}: {exc_child}"
109
129
  )
110
130
  raise exc_child
131
+ elif isinstance(element, NavigableString) and not isinstance(
132
+ element, PreformattedString
133
+ ):
134
+ # Floating text outside paragraphs or analyzed tags
135
+ text += element
136
+ siblings: list[Tag] = [
137
+ item for item in element.next_siblings if isinstance(item, Tag)
138
+ ]
139
+ if element.next_sibling is None or any(
140
+ [item.name in TAGS_FOR_NODE_ITEMS for item in siblings]
141
+ ):
142
+ text = text.strip()
143
+ if text and tag.name in ["div"]:
144
+ doc.add_text(
145
+ parent=self.parents[self.level],
146
+ label=DocItemLabel.PARAGRAPH,
147
+ text=text,
148
+ )
149
+ text = ""
111
150
 
112
151
  return
113
152
 
@@ -158,7 +197,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
158
197
  text = element.text.strip()
159
198
 
160
199
  if hlevel == 1:
161
- for key, val in self.parents.items():
200
+ for key in self.parents.keys():
162
201
  self.parents[key] = None
163
202
 
164
203
  self.level = 1
docling/cli/models.py CHANGED
@@ -32,9 +32,19 @@ class _AvailableModels(str, Enum):
32
32
  CODE_FORMULA = "code_formula"
33
33
  PICTURE_CLASSIFIER = "picture_classifier"
34
34
  SMOLVLM = "smolvlm"
35
+ GRANITE_VISION = "granite_vision"
35
36
  EASYOCR = "easyocr"
36
37
 
37
38
 
39
# Models fetched by `download` when neither explicit models nor `--all` are
# given. SMOLVLM and GRANITE_VISION are not part of this set.
_default_models = [
    _AvailableModels.LAYOUT,
    _AvailableModels.TABLEFORMER,
    _AvailableModels.CODE_FORMULA,
    _AvailableModels.PICTURE_CLASSIFIER,
    _AvailableModels.EASYOCR,
]
46
+
47
+
38
48
  @app.command("download")
39
49
  def download(
40
50
  output_dir: Annotated[
@@ -43,18 +53,27 @@ def download(
43
53
  ...,
44
54
  "-o",
45
55
  "--output-dir",
46
- help="The directory where all the models are downloaded.",
56
+ help="The directory where to download the models.",
47
57
  ),
48
58
  ] = (settings.cache_dir / "models"),
49
59
  force: Annotated[
50
- bool, typer.Option(..., help="If true, the download will be forced")
60
+ bool, typer.Option(..., help="If true, the download will be forced.")
51
61
  ] = False,
52
62
  models: Annotated[
53
63
  Optional[list[_AvailableModels]],
54
64
  typer.Argument(
55
- help=f"Models to download (default behavior: all will be downloaded)",
65
+ help=f"Models to download (default behavior: a predefined set of models will be downloaded).",
56
66
  ),
57
67
  ] = None,
68
+ all: Annotated[
69
+ bool,
70
+ typer.Option(
71
+ ...,
72
+ "--all",
73
+ help="If true, all available models will be downloaded (mutually exclusive with passing specific models).",
74
+ show_default=True,
75
+ ),
76
+ ] = False,
58
77
  quiet: Annotated[
59
78
  bool,
60
79
  typer.Option(
@@ -65,6 +84,10 @@ def download(
65
84
  ),
66
85
  ] = False,
67
86
  ):
87
+ if models and all:
88
+ raise typer.BadParameter(
89
+ "Cannot simultaneously set 'all' parameter and specify models to download."
90
+ )
68
91
  if not quiet:
69
92
  FORMAT = "%(message)s"
70
93
  logging.basicConfig(
@@ -73,7 +96,7 @@ def download(
73
96
  datefmt="[%X]",
74
97
  handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
75
98
  )
76
- to_download = models or [m for m in _AvailableModels]
99
+ to_download = models or ([m for m in _AvailableModels] if all else _default_models)
77
100
  output_dir = download_models(
78
101
  output_dir=output_dir,
79
102
  force=force,
@@ -83,6 +106,7 @@ def download(
83
106
  with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
84
107
  with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
85
108
  with_smolvlm=_AvailableModels.SMOLVLM in to_download,
109
+ with_granite_vision=_AvailableModels.GRANITE_VISION in to_download,
86
110
  with_easyocr=_AvailableModels.EASYOCR in to_download,
87
111
  )
88
112
 
@@ -154,6 +154,10 @@ class LayoutPrediction(BaseModel):
154
154
  clusters: List[Cluster] = []
155
155
 
156
156
 
157
class VlmPrediction(BaseModel):
    """Text generated by a vision-language model for a single page."""

    # Decoded model output; stays empty when nothing was generated.
    text: str = ""
159
+
160
+
157
161
  class ContainerElement(
158
162
  BasePageElement
159
163
  ): # Used for Form and Key-Value-Regions, only for typing.
@@ -197,6 +201,7 @@ class PagePredictions(BaseModel):
197
201
  tablestructure: Optional[TableStructurePrediction] = None
198
202
  figures_classification: Optional[FigureClassificationPrediction] = None
199
203
  equations_prediction: Optional[EquationPrediction] = None
204
+ vlm_response: Optional[VlmPrediction] = None
200
205
 
201
206
 
202
207
  PageElement = Union[TextElement, Table, FigureElement, ContainerElement]
@@ -41,6 +41,7 @@ class AcceleratorOptions(BaseSettings):
41
41
 
42
42
  num_threads: int = 4
43
43
  device: Union[str, AcceleratorDevice] = "auto"
44
+ cuda_use_flash_attention2: bool = False
44
45
 
45
46
  @field_validator("device")
46
47
  def validate_device(cls, value):
@@ -254,6 +255,45 @@ granite_picture_description = PictureDescriptionVlmOptions(
254
255
  )
255
256
 
256
257
 
258
class BaseVlmOptions(BaseModel):
    """Fields shared by all vision-language model option sets."""

    # Discriminator naming the concrete option type.
    kind: str
    # Instruction text sent to the model together with the page image.
    prompt: str
261
+
262
+
263
class ResponseFormat(str, Enum):
    """Output formats a VLM response can be declared to use."""

    DOCTAGS = "doctags"  # tag-based markup, assembled by the doctags parser
    MARKDOWN = "markdown"  # plain Markdown, assembled by the Markdown backend
266
+
267
+
268
class HuggingFaceVlmOptions(BaseVlmOptions):
    """Options for a vision-language model loaded from a Hugging Face repo."""

    kind: Literal["hf_model_options"] = "hf_model_options"

    # Hugging Face repository identifier, e.g. "org/model-name".
    repo_id: str
    # Values forwarded to the quantization config; it is only applied to the
    # model when `quantized` is True.
    load_in_8bit: bool = True
    llm_int8_threshold: float = 6.0
    quantized: bool = False

    # Format in which the model is expected to answer.
    response_format: ResponseFormat

    @property
    def repo_cache_folder(self) -> str:
        """Return the repo id with every "/" replaced by "--"."""
        return "--".join(self.repo_id.split("/"))
281
+
282
+
283
# Preset: SmolDocling preview model answering in doctags.
smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
    repo_id="ds4sd/SmolDocling-256M-preview",
    response_format=ResponseFormat.DOCTAGS,
    prompt="Convert this page to docling.",
)

# Preset: Granite Vision preview model answering in Markdown.
# An alternative prompt that was left disabled: "OCR the full page to markdown."
granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
    repo_id="ibm-granite/granite-vision-3.1-2b-preview",
    response_format=ResponseFormat.MARKDOWN,
    prompt="OCR this image.",
)
295
+
296
+
257
297
  # Define an enum for the backend options
258
298
  class PdfBackend(str, Enum):
259
299
  """Enum of valid PDF backends."""
@@ -285,7 +325,24 @@ class PipelineOptions(BaseModel):
285
325
  enable_remote_services: bool = False
286
326
 
287
327
 
288
- class PdfPipelineOptions(PipelineOptions):
328
class PaginatedPipelineOptions(PipelineOptions):
    """Image-related options shared by page-based pipelines."""

    # Scale factor used when cropping element images from page images.
    images_scale: float = 1.0
    # Whether page images are generated / kept.
    generate_page_images: bool = False
    # Whether an image is generated for each picture element.
    generate_picture_images: bool = False
332
+
333
+
334
class VlmPipelineOptions(PaginatedPipelineOptions):
    """Options for the vision-language-model pipeline."""

    # Optional folder holding pre-downloaded model artifacts.
    artifacts_path: Optional[Union[Path, str]] = None

    # Page images are generated by default for this pipeline.
    generate_page_images: bool = True

    # (To be used with vlms, or other generative models)
    # If True, text from backend will be used instead of generated text
    force_backend_text: bool = False

    vlm_options: Union[HuggingFaceVlmOptions] = smoldocling_vlm_conversion_options
343
+
344
+
345
+ class PdfPipelineOptions(PaginatedPipelineOptions):
289
346
  """Options for the PDF pipeline."""
290
347
 
291
348
  artifacts_path: Optional[Union[Path, str]] = None
@@ -295,6 +352,10 @@ class PdfPipelineOptions(PipelineOptions):
295
352
  do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code
296
353
  do_picture_classification: bool = False # True: classify pictures in documents
297
354
  do_picture_description: bool = False # True: run describe pictures in documents
355
+ force_backend_text: bool = (
356
+ False # (To be used with vlms, or other generative models)
357
+ )
358
+ # If True, text from backend will be used instead of generated text
298
359
 
299
360
  table_structure_options: TableStructureOptions = TableStructureOptions()
300
361
  ocr_options: Union[
@@ -0,0 +1,180 @@
1
+ import logging
2
+ import time
3
+ from pathlib import Path
4
+ from typing import Iterable, List, Optional
5
+
6
+ from docling.datamodel.base_models import Page, VlmPrediction
7
+ from docling.datamodel.document import ConversionResult
8
+ from docling.datamodel.pipeline_options import (
9
+ AcceleratorDevice,
10
+ AcceleratorOptions,
11
+ HuggingFaceVlmOptions,
12
+ )
13
+ from docling.datamodel.settings import settings
14
+ from docling.models.base_model import BasePageModel
15
+ from docling.utils.accelerator_utils import decide_device
16
+ from docling.utils.profiling import TimeRecorder
17
+
18
+ _log = logging.getLogger(__name__)
19
+
20
+
21
class HuggingFaceVlmModel(BasePageModel):
    """Page model that runs a Hugging Face vision-language model on page images.

    For each valid page, the page image is rendered and sent to the model
    together with the configured prompt. The decoded answer is stored in
    ``page.predictions.vlm_response``.
    """

    def __init__(
        self,
        enabled: bool,
        artifacts_path: Optional[Path],
        accelerator_options: AcceleratorOptions,
        vlm_options: HuggingFaceVlmOptions,
    ) -> None:
        """Load the processor and model weights when the model is enabled.

        Args:
            enabled: When False nothing is loaded and pages pass through
                unchanged.
            artifacts_path: Folder with pre-downloaded models. When None the
                model is downloaded from the Hugging Face hub. When the folder
                contains a sub-folder named after the repo cache folder, that
                sub-folder is used.
            accelerator_options: Device selection and flash-attention switch.
            vlm_options: Repository id, prompt and quantization settings.
        """
        self.enabled = enabled
        self.vlm_options = vlm_options

        if not self.enabled:
            return

        # Heavy dependencies are imported lazily, only when the model is used.
        import torch
        from transformers import (  # type: ignore
            AutoModelForVision2Seq,
            AutoProcessor,
            BitsAndBytesConfig,
        )

        device = decide_device(accelerator_options.device)
        self.device = device
        _log.debug("Available device for HuggingFace VLM: %s", device)

        # Resolve where the model files live. The options object already
        # knows its cache-folder name, so it is not recomputed here.
        repo_cache_folder = vlm_options.repo_cache_folder
        if artifacts_path is None:
            artifacts_path = self.download_models(vlm_options.repo_id)
        elif (artifacts_path / repo_cache_folder).exists():
            artifacts_path = artifacts_path / repo_cache_folder

        self.param_question = vlm_options.prompt
        self.param_quantization_config = BitsAndBytesConfig(
            load_in_8bit=vlm_options.load_in_8bit,
            llm_int8_threshold=vlm_options.llm_int8_threshold,
        )
        self.param_quantized = vlm_options.quantized

        # Flash attention is opt-in and only considered on CUDA devices.
        use_flash_attention = (
            device.startswith("cuda")
            and accelerator_options.cuda_use_flash_attention2
        )
        attn_implementation = "flash_attention_2" if use_flash_attention else "eager"

        self.processor = AutoProcessor.from_pretrained(artifacts_path)
        if self.param_quantized:
            self.vlm_model = AutoModelForVision2Seq.from_pretrained(
                artifacts_path,
                device_map=device,
                torch_dtype="auto",
                quantization_config=self.param_quantization_config,
                _attn_implementation=attn_implementation,
            )
        else:
            self.vlm_model = AutoModelForVision2Seq.from_pretrained(
                artifacts_path,
                device_map=device,
                torch_dtype=torch.bfloat16,
                _attn_implementation=attn_implementation,
            )

    @staticmethod
    def download_models(
        repo_id: str,
        local_dir: Optional[Path] = None,
        force: bool = False,
        progress: bool = False,
    ) -> Path:
        """Download a model snapshot from the Hugging Face hub.

        Args:
            repo_id: Repository to download.
            local_dir: Optional target folder passed to ``snapshot_download``.
            force: Passed as ``force_download``.
            progress: When False, hub progress bars are disabled.

        Returns:
            Path of the downloaded snapshot.
        """
        from huggingface_hub import snapshot_download
        from huggingface_hub.utils import disable_progress_bars

        if not progress:
            disable_progress_bars()
        download_path = snapshot_download(
            repo_id=repo_id,
            force_download=force,
            local_dir=local_dir,
        )
        return Path(download_path)

    def _generate_page_text(self, page_image) -> str:
        """Run the model on one page image and return the decoded answer."""
        if page_image.mode != "RGB":
            page_image = page_image.convert("RGB")

        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "This is a page from a document.",
                    },
                    {"type": "image"},
                    {"type": "text", "text": self.param_question},
                ],
            }
        ]
        prompt = self.processor.apply_chat_template(
            messages, add_generation_prompt=False
        )
        inputs = self.processor(
            text=prompt, images=[page_image], return_tensors="pt"
        )
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        generated_ids = self.vlm_model.generate(
            **inputs, max_new_tokens=4096, use_cache=True
        )
        # Drop the prompt tokens so only the newly generated part is decoded.
        prompt_length = inputs["input_ids"].shape[1]
        return self.processor.batch_decode(
            generated_ids[:, prompt_length:],
            skip_special_tokens=False,
        )[0]

    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        """Attach a ``VlmPrediction`` to every valid page of the batch.

        Pages with an invalid backend are yielded untouched. When the model is
        disabled all pages pass through unchanged.
        """
        if not self.enabled:
            # Nothing was loaded in __init__, so there is nothing to run.
            yield from page_batch
            return

        for page in page_batch:
            assert page._backend is not None
            if not page._backend.is_valid():
                yield page
                continue

            with TimeRecorder(conv_res, "vlm"):
                assert page.size is not None

                hi_res_image = page.get_image(scale=2.0)  # 144dpi

                page_tags = ""
                if hi_res_image is not None:
                    page_tags = self._generate_page_text(hi_res_image)
                else:
                    # Without an image there is nothing to send to the model;
                    # record an empty prediction instead of failing.
                    _log.warning(
                        "No image available for page %s; skipping VLM inference.",
                        page.page_no,
                    )

                page.predictions.vlm_response = VlmPrediction(text=page_tags)

            yield page
@@ -41,9 +41,9 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
41
41
  )
42
42
 
43
43
  # Initialize processor and model
44
- self.processor = AutoProcessor.from_pretrained(self.options.repo_id)
44
+ self.processor = AutoProcessor.from_pretrained(artifacts_path)
45
45
  self.model = AutoModelForVision2Seq.from_pretrained(
46
- self.options.repo_id,
46
+ artifacts_path,
47
47
  torch_dtype=torch.bfloat16,
48
48
  _attn_implementation=(
49
49
  "flash_attention_2" if self.device.startswith("cuda") else "eager"
@@ -0,0 +1,534 @@
1
+ import itertools
2
+ import logging
3
+ import re
4
+ import warnings
5
+ from io import BytesIO
6
+
7
+ # from io import BytesIO
8
+ from pathlib import Path
9
+ from typing import Optional
10
+
11
+ from docling_core.types import DoclingDocument
12
+ from docling_core.types.doc import (
13
+ BoundingBox,
14
+ DocItem,
15
+ DocItemLabel,
16
+ DoclingDocument,
17
+ GroupLabel,
18
+ ImageRef,
19
+ ImageRefMode,
20
+ PictureItem,
21
+ ProvenanceItem,
22
+ Size,
23
+ TableCell,
24
+ TableData,
25
+ TableItem,
26
+ )
27
+ from docling_core.types.doc.tokens import DocumentToken, TableToken
28
+
29
+ from docling.backend.abstract_backend import AbstractDocumentBackend
30
+ from docling.backend.md_backend import MarkdownDocumentBackend
31
+ from docling.backend.pdf_backend import PdfDocumentBackend
32
+ from docling.datamodel.base_models import InputFormat, Page
33
+ from docling.datamodel.document import ConversionResult, InputDocument
34
+ from docling.datamodel.pipeline_options import (
35
+ PdfPipelineOptions,
36
+ ResponseFormat,
37
+ VlmPipelineOptions,
38
+ )
39
+ from docling.datamodel.settings import settings
40
+ from docling.models.hf_vlm_model import HuggingFaceVlmModel
41
+ from docling.pipeline.base_pipeline import PaginatedPipeline
42
+ from docling.utils.profiling import ProfilingScope, TimeRecorder
43
+
44
+ _log = logging.getLogger(__name__)
45
+
46
+
47
class VlmPipeline(PaginatedPipeline):
    """Experimental paginated pipeline driven by a vision-language model.

    Each page is passed through ``HuggingFaceVlmModel``. The per-page model
    output is then assembled into a ``DoclingDocument``, either by parsing
    doctags or by feeding Markdown to the Markdown backend.
    """

    def __init__(self, pipeline_options: VlmPipelineOptions):
        """Resolve the artifacts folder and build the model pipe.

        Raises:
            RuntimeError: If an artifacts path is configured but is not a
                directory.
        """
        super().__init__(pipeline_options)
        self.keep_backend = True

        warnings.warn(
            "The VlmPipeline is currently experimental and may change in upcoming versions without notice.",
            category=UserWarning,
            stacklevel=2,
        )

        self.pipeline_options: VlmPipelineOptions

        # Pipeline options take precedence over the global settings.
        artifacts_path: Optional[Path] = None
        if pipeline_options.artifacts_path is not None:
            artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
        elif settings.artifacts_path is not None:
            artifacts_path = Path(settings.artifacts_path).expanduser()

        if artifacts_path is not None and not artifacts_path.is_dir():
            raise RuntimeError(
                f"The value of {artifacts_path=} is not valid. "
                "When defined, it must point to a folder containing all models required by the pipeline."
            )

        # force_backend_text = False - use text that is coming from VLM response
        # force_backend_text = True - get text from backend using bounding boxes
        # predicted in the doctags (only meaningful for the DOCTAGS format)
        self.force_backend_text = (
            pipeline_options.force_backend_text
            and pipeline_options.vlm_options.response_format == ResponseFormat.DOCTAGS
        )

        self.keep_images = self.pipeline_options.generate_page_images

        self.build_pipe = [
            HuggingFaceVlmModel(
                enabled=True,  # must be always enabled for this pipeline to make sense.
                artifacts_path=artifacts_path,
                accelerator_options=pipeline_options.accelerator_options,
                vlm_options=self.pipeline_options.vlm_options,
            ),
        ]

        self.enrichment_pipe = [
            # Other models working on `NodeItem` elements in the DoclingDocument
        ]

    def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
        """Load the page backend and record the page size when it is valid."""
        with TimeRecorder(conv_res, "page_init"):
            page._backend = conv_res.input._backend.load_page(page.page_no)  # type: ignore
            if page._backend is not None and page._backend.is_valid():
                page.size = page._backend.get_size()

        return page

    def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
        """Build ``conv_res.document`` from the per-page VLM responses.

        Raises:
            RuntimeError: If the configured response format is not supported.
        """
        with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
            response_format = self.pipeline_options.vlm_options.response_format
            if response_format == ResponseFormat.DOCTAGS:
                conv_res.document = self._turn_tags_into_doc(conv_res.pages)
            elif response_format == ResponseFormat.MARKDOWN:
                conv_res.document = self._turn_md_into_doc(conv_res)
            else:
                raise RuntimeError(
                    f"Unsupported VLM response format {self.pipeline_options.vlm_options.response_format}"
                )

            # Generate images of the requested element types
            if self.pipeline_options.generate_picture_images:
                scale = self.pipeline_options.images_scale
                for element, _level in conv_res.document.iterate_items():
                    if not isinstance(element, DocItem) or not element.prov:
                        continue
                    if not isinstance(element, PictureItem):
                        continue

                    page_ix = element.prov[0].page_no - 1
                    page = conv_res.pages[page_ix]
                    assert page.size is not None
                    assert page.image is not None

                    crop_bbox = (
                        element.prov[0]
                        .bbox.scaled(scale=scale)
                        .to_top_left_origin(page_height=page.size.height * scale)
                    )

                    cropped_im = page.image.crop(crop_bbox.as_tuple())
                    element.image = ImageRef.from_pil(cropped_im, dpi=int(72 * scale))

        return conv_res

    def _turn_md_into_doc(self, conv_res):
        """Concatenate the page responses and convert them with the Markdown backend."""
        predicted_text = "".join(
            page.predictions.vlm_response.text + "\n\n"
            for page in conv_res.pages
            if page.predictions.vlm_response
        )
        response_bytes = BytesIO(predicted_text.encode("utf8"))
        out_doc = InputDocument(
            path_or_stream=response_bytes,
            filename=conv_res.input.file.name,
            format=InputFormat.MD,
            backend=MarkdownDocumentBackend,
        )
        backend = MarkdownDocumentBackend(
            in_doc=out_doc,
            path_or_stream=response_bytes,
        )
        return backend.convert()

    def _turn_tags_into_doc(self, pages: list[Page]) -> DoclingDocument:
        """Parse the doctags predicted for each page into a DoclingDocument.

        All ``<tag>...</tag>`` blocks are processed in order of appearance.
        For each block the bounding box (if any) and the inner text are
        extracted and the matching item is added to the document.
        """

        # Maps the recognized tag to a Docling label.
        tag_to_doclabel = {
            "title": DocItemLabel.TITLE,
            "document_index": DocItemLabel.DOCUMENT_INDEX,
            "otsl": DocItemLabel.TABLE,
            "section_header_level_1": DocItemLabel.SECTION_HEADER,
            "checkbox_selected": DocItemLabel.CHECKBOX_SELECTED,
            "checkbox_unselected": DocItemLabel.CHECKBOX_UNSELECTED,
            "text": DocItemLabel.TEXT,
            "page_header": DocItemLabel.PAGE_HEADER,
            "page_footer": DocItemLabel.PAGE_FOOTER,
            "formula": DocItemLabel.FORMULA,
            "caption": DocItemLabel.CAPTION,
            "picture": DocItemLabel.PICTURE,
            "list_item": DocItemLabel.LIST_ITEM,
            "footnote": DocItemLabel.FOOTNOTE,
            "code": DocItemLabel.CODE,
        }

        any_tag = re.compile(r"<[^>]+>")

        def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
            """Extracts <loc_...> bounding box coords from the chunk, normalized by / 500."""
            coords = re.findall(r"<loc_(\d+)>", text_chunk)
            if len(coords) == 4:
                l, t, r, b = map(float, coords)
                return BoundingBox(l=l / 500, t=t / 500, r=r / 500, b=b / 500)
            return None

        def extract_inner_text(text_chunk: str) -> str:
            """Strips all <...> tags inside the chunk to get the raw text content."""
            return re.sub(r"<.*?>", "", text_chunk, flags=re.DOTALL).strip()

        def extract_text_from_backend(page: Page, bbox: Optional[BoundingBox]) -> str:
            """Return the backend text inside ``bbox`` (given in normalized coordinates).

            The box is scaled to page coordinates on a copy, so the caller's
            box stays normalized and can still be scaled for provenance.
            """
            if not bbox or not page._backend:
                return ""
            rect = bbox
            if page.size:
                rect = bbox.resize_by_scale(page.size.width, page.size.height)
            return page._backend.get_text_in_rect(rect)

        def make_prov(
            bbox: Optional[BoundingBox],
            page_size: Optional[Size],
            page_no: int,
            charspan: tuple[int, int],
        ) -> Optional[ProvenanceItem]:
            """Build a provenance item with the box scaled to page coordinates.

            Returns None without a box. When the page size is unknown the
            normalized box is kept as is.
            """
            if bbox is None:
                return None
            if page_size is not None:
                bbox = bbox.resize_by_scale(page_size.width, page_size.height)
            return ProvenanceItem(bbox=bbox, charspan=charspan, page_no=page_no)

        def otsl_parse_texts(texts, tokens):
            """Turn the OTSL token/text stream into table cells.

            Returns the list of cells and the tokens split into rows.
            """
            split_word = TableToken.OTSL_NL.value
            split_row_tokens = [
                list(y)
                for x, y in itertools.groupby(tokens, lambda z: z == split_word)
                if not x
            ]
            content_cells = [
                TableToken.OTSL_FCEL.value,
                TableToken.OTSL_ECEL.value,
                TableToken.OTSL_CHED.value,
                TableToken.OTSL_RHED.value,
                TableToken.OTSL_SROW.value,
            ]
            horizontal_span = [TableToken.OTSL_LCEL.value, TableToken.OTSL_XCEL.value]
            vertical_span = [TableToken.OTSL_UCEL.value, TableToken.OTSL_XCEL.value]
            column_tokens = content_cells + [
                TableToken.OTSL_LCEL.value,
                TableToken.OTSL_UCEL.value,
                TableToken.OTSL_XCEL.value,
            ]

            table_cells = []
            r_idx = 0
            c_idx = 0

            def count_right(rows, c_idx, r_idx, which_tokens):
                """Count consecutive matching tokens to the right, starting at c_idx."""
                row = rows[r_idx]
                span = 0
                while c_idx + span < len(row) and row[c_idx + span] in which_tokens:
                    span += 1
                return span

            def count_down(rows, c_idx, r_idx, which_tokens):
                """Count consecutive matching tokens downwards, starting at r_idx."""
                span = 0
                while (
                    r_idx + span < len(rows)
                    and c_idx < len(rows[r_idx + span])
                    and rows[r_idx + span][c_idx] in which_tokens
                ):
                    span += 1
                return span

            for i, text in enumerate(texts):
                if text in content_cells:
                    row_span = 1
                    col_span = 1
                    right_offset = 1
                    cell_text = ""
                    # A non-empty cell is followed by its text. Guard against a
                    # truncated stream and against a cell whose text is missing
                    # (the next entry is then already another tag).
                    if (
                        text != TableToken.OTSL_ECEL.value
                        and i + 1 < len(texts)
                        and not any_tag.fullmatch(texts[i + 1])
                    ):
                        cell_text = texts[i + 1]
                        right_offset = 2

                    # Check next element(s) for lcel / ucel / xcel, set properly row_span, col_span
                    next_right_cell = ""
                    if i + right_offset < len(texts):
                        next_right_cell = texts[i + right_offset]

                    next_bottom_cell = ""
                    if r_idx + 1 < len(split_row_tokens):
                        if c_idx < len(split_row_tokens[r_idx + 1]):
                            next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]

                    if next_right_cell in horizontal_span:
                        # we have a horizontal spanning cell or 2d spanning cell
                        col_span += count_right(
                            split_row_tokens, c_idx + 1, r_idx, horizontal_span
                        )
                    if next_bottom_cell in vertical_span:
                        # we have a vertical spanning cell or 2d spanning cell
                        row_span += count_down(
                            split_row_tokens, c_idx, r_idx + 1, vertical_span
                        )

                    table_cells.append(
                        TableCell(
                            text=cell_text.strip(),
                            row_span=row_span,
                            col_span=col_span,
                            start_row_offset_idx=r_idx,
                            end_row_offset_idx=r_idx + row_span,
                            start_col_offset_idx=c_idx,
                            end_col_offset_idx=c_idx + col_span,
                        )
                    )
                if text in column_tokens:
                    c_idx += 1
                if text == TableToken.OTSL_NL.value:
                    r_idx += 1
                    c_idx = 0
            return table_cells, split_row_tokens

        def otsl_extract_tokens_and_text(s: str):
            """Split an OTSL chunk into its tags and the mixed tag/text sequence.

            Location tags and the enclosing otsl tags are dropped from both
            results.
            """
            # Pattern to match anything enclosed by < > (including the angle brackets themselves)
            pattern = r"(<[^>]+>)"
            ignored_tags = [
                rf"<{DocumentToken.OTSL.value}>",
                rf"</{DocumentToken.OTSL.value}>",
            ]
            loc_prefix = rf"<{DocumentToken.LOC.value}"

            def keep(part: str) -> bool:
                return not (part.startswith(loc_prefix) or part in ignored_tags)

            # Find all tokens (e.g. "<otsl>", "<loc_140>", etc.)
            tokens = [token for token in re.findall(pattern, s) if keep(token)]
            # Split the string by those tokens to get the in-between text;
            # empty or purely whitespace parts are removed.
            text_parts = [
                part for part in re.split(pattern, s) if keep(part) and part.strip()
            ]
            return tokens, text_parts

        def parse_table_content(otsl_content: str) -> TableData:
            """Build the table structure from an <otsl>...</otsl> chunk."""
            tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
            table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)

            return TableData(
                num_rows=len(split_row_tokens),
                num_cols=(
                    max(len(row) for row in split_row_tokens) if split_row_tokens else 0
                ),
                table_cells=table_cells,
            )

        # Regex for all recognized tags. It does not depend on the page, so it
        # is compiled once. `.value` is used so the pattern does not rely on
        # how the enum members are formatted.
        tag_pattern = (
            rf"<(?P<tag>{DocItemLabel.TITLE.value}|{DocItemLabel.DOCUMENT_INDEX.value}|"
            rf"{DocItemLabel.CHECKBOX_UNSELECTED.value}|{DocItemLabel.CHECKBOX_SELECTED.value}|"
            rf"{DocItemLabel.TEXT.value}|{DocItemLabel.PAGE_HEADER.value}|"
            rf"{DocItemLabel.PAGE_FOOTER.value}|{DocItemLabel.FORMULA.value}|"
            rf"{DocItemLabel.CAPTION.value}|{DocItemLabel.PICTURE.value}|"
            rf"{DocItemLabel.LIST_ITEM.value}|{DocItemLabel.FOOTNOTE.value}|{DocItemLabel.CODE.value}|"
            rf"{DocItemLabel.SECTION_HEADER.value}_level_1|{DocumentToken.OTSL.value})>.*?</(?P=tag)>"
        )
        pattern = re.compile(tag_pattern, re.DOTALL)

        doc = DoclingDocument(name="Document")
        for pg_idx, page in enumerate(pages):
            predicted_text = ""
            if page.predictions.vlm_response:
                predicted_text = page.predictions.vlm_response.text
            image = page.image
            page_no = pg_idx + 1
            page_size = page.size

            if page_size:
                doc.add_page(
                    page_no=page_no,
                    size=Size(width=page_size.width, height=page_size.height),
                )

            # Go through each match in order
            for match in pattern.finditer(predicted_text):
                full_chunk = match.group(0)
                tag_name = match.group("tag")

                bbox = extract_bounding_box(full_chunk)
                doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.PARAGRAPH)

                if tag_name == DocumentToken.OTSL.value:
                    table_data = parse_table_content(full_chunk)
                    doc.add_table(
                        data=table_data,
                        prov=make_prov(bbox, page_size, page_no, (0, 0)),
                    )

                elif tag_name == DocItemLabel.PICTURE.value:
                    text_caption_content = extract_inner_text(full_chunk)
                    if not bbox:
                        # Pictures without a location are not added.
                        continue

                    image_ref = None
                    if image:
                        im_width, im_height = image.size
                        crop_box = (
                            int(bbox.l * im_width),
                            int(bbox.t * im_height),
                            int(bbox.r * im_width),
                            int(bbox.b * im_height),
                        )
                        image_ref = ImageRef.from_pil(image=image.crop(crop_box), dpi=72)

                    # Without a page image the picture is added with its
                    # provenance only.
                    pic = doc.add_picture(
                        parent=None,
                        image=image_ref,
                        prov=make_prov(bbox, page_size, page_no, (0, 0)),
                    )
                    # If there is a caption to an image, add it as well
                    if text_caption_content:
                        caption_item = doc.add_text(
                            label=DocItemLabel.CAPTION,
                            text=text_caption_content,
                            parent=None,
                        )
                        pic.captions.append(caption_item.get_ref())
                else:
                    # For everything else, treat as text
                    if self.force_backend_text:
                        text_content = extract_text_from_backend(page, bbox)
                    else:
                        text_content = extract_inner_text(full_chunk)
                    doc.add_text(
                        label=doc_label,
                        text=text_content,
                        prov=make_prov(
                            bbox, page_size, page_no, (0, len(text_content))
                        ),
                    )
        return doc

    @classmethod
    def get_default_options(cls) -> VlmPipelineOptions:
        """Return the default options for this pipeline."""
        return VlmPipelineOptions()

    @classmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend):
        """Return True when ``backend`` is a PDF document backend."""
        return isinstance(backend, PdfDocumentBackend)
@@ -2,7 +2,10 @@ import logging
2
2
  from pathlib import Path
3
3
  from typing import Optional
4
4
 
5
- from docling.datamodel.pipeline_options import smolvlm_picture_description
5
+ from docling.datamodel.pipeline_options import (
6
+ granite_picture_description,
7
+ smolvlm_picture_description,
8
+ )
6
9
  from docling.datamodel.settings import settings
7
10
  from docling.models.code_formula_model import CodeFormulaModel
8
11
  from docling.models.document_picture_classifier import DocumentPictureClassifier
@@ -23,7 +26,8 @@ def download_models(
23
26
  with_tableformer: bool = True,
24
27
  with_code_formula: bool = True,
25
28
  with_picture_classifier: bool = True,
26
- with_smolvlm: bool = True,
29
+ with_smolvlm: bool = False,
30
+ with_granite_vision: bool = False,
27
31
  with_easyocr: bool = True,
28
32
  ):
29
33
  if output_dir is None:
@@ -73,6 +77,15 @@ def download_models(
73
77
  progress=progress,
74
78
  )
75
79
 
80
+ if with_granite_vision:
81
+ _log.info(f"Downloading Granite Vision model...")
82
+ PictureDescriptionVlmModel.download_models(
83
+ repo_id=granite_picture_description.repo_id,
84
+ local_dir=output_dir / granite_picture_description.repo_cache_folder,
85
+ force=force,
86
+ progress=progress,
87
+ )
88
+
76
89
  if with_easyocr:
77
90
  _log.info(f"Downloading easyocr models...")
78
91
  EasyOcrModel.download_models(
@@ -43,6 +43,11 @@ def draw_clusters(
43
43
  y0 *= scale_x
44
44
  y1 *= scale_y
45
45
 
46
+ if y1 <= y0:
47
+ y1, y0 = y0, y1
48
+ if x1 <= x0:
49
+ x1, x0 = x0, x1
50
+
46
51
  cluster_fill_color = (*list(DocItemLabel.get_color(c.label)), 70)
47
52
  cluster_outline_color = (
48
53
  *list(DocItemLabel.get_color(c.label)),
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.24.0
3
+ Version: 2.25.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/DS4SD/docling
6
6
  License: MIT
@@ -25,6 +25,7 @@ Provides-Extra: ocrmac
25
25
  Provides-Extra: rapidocr
26
26
  Provides-Extra: tesserocr
27
27
  Provides-Extra: vlm
28
+ Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
28
29
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
29
30
  Requires-Dist: certifi (>=2024.7.4)
30
31
  Requires-Dist: docling-core[chunking] (>=2.19.0,<3.0.0)
@@ -5,7 +5,7 @@ docling/backend/asciidoc_backend.py,sha256=zyHxlG_BvlLwvpdNca3P6aopxOJZw8wbDFkJQ
5
5
  docling/backend/csv_backend.py,sha256=xuId4JGEXjoyPgO9Fy9hQ5C-ezXvJwv0TGB8fyFHgWM,4533
6
6
  docling/backend/docling_parse_backend.py,sha256=hEEJibI1oJS0LAnFoIs6gMshS3bCqGtVxHnDNvBGZuA,7649
7
7
  docling/backend/docling_parse_v2_backend.py,sha256=IpwrBrtLGwNRl5AYO-o3NjEfNRsAkuMhzvDt2HXb9Ko,8655
8
- docling/backend/html_backend.py,sha256=BxYvYmgcio6IqROMFKgyYyoankcNUccalCeYlmTE4fk,16094
8
+ docling/backend/html_backend.py,sha256=j5ivNBDMM0bs24GxTHGGcsA7Z0pnb3iEZ2QKS0Xxdrc,17286
9
9
  docling/backend/json/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  docling/backend/json/docling_json_backend.py,sha256=LlFMVoZrrCfVwbDuRbNN4Xg96Lujh4xxrTBt9jGhY9I,1984
11
11
  docling/backend/md_backend.py,sha256=NaVfcnEH-5bwVovjn76EobF6B6Wm8AhaTZ4E8k0TUPo,16826
@@ -20,12 +20,12 @@ docling/backend/xml/uspto_backend.py,sha256=IGUNeF2xpLeaVrX6nKb-jXgtSYD2ozULsrDP
20
20
  docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
21
21
  docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
22
22
  docling/cli/main.py,sha256=pCJ_GFgxsgZ0soz32OhMl-CWi7YXIrvax_m9Qw4UhMs,16839
23
- docling/cli/models.py,sha256=Z4IEuaXE9el5PuI6_6mR4D5Sn3y8WZzBtoIJPi6jL_s,3188
23
+ docling/cli/models.py,sha256=DDnz-boX2MexPxC8OnOMPgSPG0iwseT3xkkCfgPrZis,3969
24
24
  docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
25
25
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
- docling/datamodel/base_models.py,sha256=b_8LiDCC4MkpqnKfsJjduH2DSsjADCllBLNB83Tpamw,7099
26
+ docling/datamodel/base_models.py,sha256=kMDT-rFhtJUFOOOry4wd2PzCMTLFixFklgSgmRDMS64,7201
27
27
  docling/datamodel/document.py,sha256=DbJifyMgBEkAk80BMYXTuSgqH2vijDENDkU7Fmr6j_g,14567
28
- docling/datamodel/pipeline_options.py,sha256=5jXSVNGyOy6Ha18Wd80e7pYFmvRZk-2Lkgx0bwMOuq8,10234
28
+ docling/datamodel/pipeline_options.py,sha256=YpWqCqkA44YUFPhiBg_LYcfOAXxNhv10vZKrkfLtJ_I,11987
29
29
  docling/datamodel/settings.py,sha256=bNMdowIKv7RUchabQTo4rFNEsxfB6pGg2LoZSY634zo,1869
30
30
  docling/document_converter.py,sha256=AeiSmKzWcnOkZm8O-KIBG72g3l4W2CAsq3yEbfC1tiE,13184
31
31
  docling/exceptions.py,sha256=K1WnCS1leK2JtMB5ewZWKkb0EaijFgl-tRzrO9ntgPM,134
@@ -35,13 +35,14 @@ docling/models/base_ocr_model.py,sha256=YiUMvdjnHw9SHjnfJKT5INrPMoIGEf_Z2OApfl_V
35
35
  docling/models/code_formula_model.py,sha256=6grbRPWaLljadheT5s4omdT6hmXfin4gJU17csWvhjY,8611
36
36
  docling/models/document_picture_classifier.py,sha256=6I_j6fG5fnhIV6rqN31LYikNTZyg5isXrVs0GIqHDaY,6235
37
37
  docling/models/easyocr_model.py,sha256=ePg1exAXeOzkBRBT-6PBSmqKFmnNFkCEd4HNDsGVgLM,6860
38
+ docling/models/hf_vlm_model.py,sha256=NUtLEuG-kNGJeDHWmQKAAOZG4WF0a5hn-KXUUM1mHBQ,6820
38
39
  docling/models/layout_model.py,sha256=7fQWipGV1HDrvbP4uOKa9QAicQl89jp7lailQmbFL3w,7804
39
40
  docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpan0,4502
40
41
  docling/models/page_assemble_model.py,sha256=ivkCdbZJpFcGl7CazLegcP1tLK8ZixDfVhQXqsdW_UA,6359
41
42
  docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
42
43
  docling/models/picture_description_api_model.py,sha256=SKNoHpqzbfM8iO-DJJ4ccyNVqO0B2d9neLBnXqt50FY,3186
43
44
  docling/models/picture_description_base_model.py,sha256=rZLIW1_CaRAw_EP3zuI8ktC0ZxwO7yubhh2RkaC_8e8,1910
44
- docling/models/picture_description_vlm_model.py,sha256=a2vYUdlcA0--_8neY0tTiU8reCf29NCbVMKwWdMy2QQ,3653
45
+ docling/models/picture_description_vlm_model.py,sha256=EvKn4zWgTsQnbMFEoDhU3Ox4Pu5DkPqd2QewsGoXULU,3641
45
46
  docling/models/rapid_ocr_model.py,sha256=2HXmurNRPP6qyqn7U5h9NQIs8zi0TMHf56CpcKQk0fU,5038
46
47
  docling/models/readingorder_model.py,sha256=hNWbBX3uZv1FxMwKNKn2JFQuQqTspBLsJBVEidXr6Wk,14869
47
48
  docling/models/table_structure_model.py,sha256=UIqWlw_9JNfGsO86c00rPb4GCg-yNliKEwyhCqlsZbM,11225
@@ -51,19 +52,20 @@ docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
51
52
  docling/pipeline/base_pipeline.py,sha256=9ABK-Cr235bxE5vweoIA5rgBZV_EF8qFxAqLI27H_Pg,8749
52
53
  docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
53
54
  docling/pipeline/standard_pdf_pipeline.py,sha256=IQHktVYvueTrYnIgLonaMvfYKKsU3L-hC9dqrR-Lw8g,12904
55
+ docling/pipeline/vlm_pipeline.py,sha256=glPwNH1QEuHj35L3tdPyuCX0CGlJn81ZDFrj3WwLa7o,22265
54
56
  docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
55
57
  docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
56
58
  docling/utils/accelerator_utils.py,sha256=ONNRrC8fH-8E93WUCNhfOq1t7WrQ1T7-YsmExTOY5f0,2292
57
59
  docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
58
60
  docling/utils/glm_utils.py,sha256=W4JRoP0xQ6SJmhhIoAfcKxm5dr1CFvLHp8pqI1kdhxs,12250
59
61
  docling/utils/layout_postprocessor.py,sha256=urRzeF9PrKiMBvA6DdHHwyLxG06CMhelgJeV5B1l6l0,24258
60
- docling/utils/model_downloader.py,sha256=XK3ozGXyQcNPvrSsevTwR9VnY41JWovlsGk_ZBnu6FU,2787
62
+ docling/utils/model_downloader.py,sha256=sxAQvjiIu9m2Ur5Ot5C5SATmgWJAHi0xSjzxj8QXYJk,3213
61
63
  docling/utils/ocr_utils.py,sha256=F7iOOjqolUcImUzir4qjDQd4QWSO3s6JC4WRn3U7uY4,263
62
64
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
63
65
  docling/utils/utils.py,sha256=0ozCk7zUkYzxRVmYoIB2zA1lqjQOuaQzxfGuf1wmKW4,1866
64
- docling/utils/visualization.py,sha256=4pn-80fVuE04ken7hUg5Ar47ndRSL9MWBgdHM-1g1zU,2735
65
- docling-2.24.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
66
- docling-2.24.0.dist-info/METADATA,sha256=0MJ5mBt0GwsZotaSpHnAWzdzWcu_BQFGqGzNR3gRpG4,8672
67
- docling-2.24.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
68
- docling-2.24.0.dist-info/entry_points.txt,sha256=cFrINXsORijdm2EWJzf1m9_rDxH9G9W1fP385-9atY4,84
69
- docling-2.24.0.dist-info/RECORD,,
66
+ docling/utils/visualization.py,sha256=cmbIroPQXPmJdFrNIfpC26WpijBwx05qmpu3QhiG1EI,2850
67
+ docling-2.25.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
68
+ docling-2.25.0.dist-info/METADATA,sha256=9k71yJWmZHMXgiGxqsmh6KhItKh5kvIDG5TpX2-1vgI,8797
69
+ docling-2.25.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
70
+ docling-2.25.0.dist-info/entry_points.txt,sha256=cFrINXsORijdm2EWJzf1m9_rDxH9G9W1fP385-9atY4,84
71
+ docling-2.25.0.dist-info/RECORD,,