docling 2.28.4__py3-none-any.whl → 2.30.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docling/cli/main.py CHANGED
@@ -40,6 +40,7 @@ from docling.datamodel.pipeline_options import (
40
40
  VlmModelType,
41
41
  VlmPipelineOptions,
42
42
  granite_vision_vlm_conversion_options,
43
+ granite_vision_vlm_ollama_conversion_options,
43
44
  smoldocling_vlm_conversion_options,
44
45
  smoldocling_vlm_mlx_conversion_options,
45
46
  )
@@ -60,6 +61,44 @@ err_console = Console(stderr=True)
60
61
  ocr_factory_internal = get_ocr_factory(allow_external_plugins=False)
61
62
  ocr_engines_enum_internal = ocr_factory_internal.get_enum()
62
63
 
64
+ DOCLING_ASCII_ART = r"""
65
+ ████ ██████
66
+ ███░░██░░░░░██████
67
+ ████████░░░░░░░░████████████
68
+ ████████░░░░░░░░░░░░░░░░░░████████
69
+ ██████░░░░░░░░░░░░░░░░░░░░░░░░░░██████
70
+ ██████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░█████
71
+ ██████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░█████
72
+ ██████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░██████
73
+ ██████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░██████
74
+ ██████░░░░░░░ ░░░░░░░░░░░░░░░░░░░░░░ ░░░░░░░██████
75
+ ██████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░██████
76
+ ██████░░░░░░ ░░░░░░░░░░░░░░░ ░░░░░░██████
77
+ ███▒██░░░░░ ████ ░░░░░░░░░░░░ ████ ░░░░░██▒███
78
+ ███▒██░░░░░░ ████ ░░░░░░░░░░░░ ████ ░░░░░██▒████
79
+ ███▒██░░░░░░ ██ ██ ░░░░░░░░░░░░ ██ ██ ░░░░░██▒▒███
80
+ ███▒███░░░░░ ██ ░░░░████░░░░ ██ ░░░░░██▒▒███
81
+ ████▒▒██░░░░░░ ░░░███▒▒▒▒███░░░ ░░░░░░░██▒▒████
82
+ ████▒▒██░░░░░░░░░░░░░░░░░█▒▒▒▒▒▒▒▒▒▒█░░░░░░░░░░░░░░░░███▒▒████
83
+ ████▒▒▒██░░░░░░░░░░░░█████ ▒▒▒▒▒▒ ██████░░░░░░░░░░░██▒▒▒████
84
+ ███▒▒▒▒██░░░░░░░░███▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒███░░░░░░░░██▒▒▒▒███
85
+ ███▒▒▒▒▒███░░░░░░██▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒██░░░░░░███▒▒▒▒▒███
86
+ ████▒▒▒▒▒████░░░░░░██████████████████████░░░░░░████▒▒▒▒▒████
87
+ ███▒▒▒▒▒▒▒▒████░░░░░░░░░░░░░░░░░░░░░░░░░░░████▒▒▒▒▒▒▒▒▒███
88
+ ████▒▒▒▒▒▒▒▒███░░░░░████████████████████████▒▒▒▒▒▒▒▒▒████
89
+ ████▒▒▒▒▒▒██░░░░░░█ █░░░░░██▒▒▒▒▒▒████
90
+ ████▒▒▒▒█░░░░░░░█ D O C L I N G █░░░░░░░░██▒▒▒████
91
+ ████▒▒██░░░░░░█ █░░░░░░░░░░█▒▒████
92
+ ██████░░░░░░█ D O C L I N G █░░░░░░░░░░░██████
93
+ ████░░░░░█ █░░░░░░░░░░░░████
94
+ █████░░█ D O C L I N G █░░░░░░░░░░░█████
95
+ █████ █░░░░░░░░████████
96
+ ██ D O C L I N G █░░░░░░░░█████
97
+ █ █░░░████████
98
+ █████████████████████████████
99
+ """
100
+
101
+
63
102
  app = typer.Typer(
64
103
  name="Docling",
65
104
  no_args_is_help=True,
@@ -68,6 +107,12 @@ app = typer.Typer(
68
107
  )
69
108
 
70
109
 
110
def logo_callback(value: bool):
    """Typer callback for the ``--logo`` option.

    When the flag is set, print the Docling ASCII art and stop the CLI via
    ``typer.Exit``; otherwise do nothing.

    Args:
        value: The parsed value of the ``--logo`` flag.

    Raises:
        typer.Exit: After the logo has been printed.
    """
    if not value:
        return
    print(DOCLING_ASCII_ART)
    raise typer.Exit()
114
+
115
+
71
116
  def version_callback(value: bool):
72
117
  if value:
73
118
  docling_version = importlib.metadata.version("docling")
@@ -109,6 +154,7 @@ def export_documents(
109
154
  output_dir: Path,
110
155
  export_json: bool,
111
156
  export_html: bool,
157
+ export_html_split_page: bool,
112
158
  export_md: bool,
113
159
  export_txt: bool,
114
160
  export_doctags: bool,
@@ -136,7 +182,15 @@ def export_documents(
136
182
  fname = output_dir / f"{doc_filename}.html"
137
183
  _log.info(f"writing HTML output to {fname}")
138
184
  conv_res.document.save_as_html(
139
- filename=fname, image_mode=image_export_mode
185
+ filename=fname, image_mode=image_export_mode, split_page_view=False
186
+ )
187
+
188
+ # Export HTML format:
189
+ if export_html_split_page:
190
+ fname = output_dir / f"{doc_filename}.html"
191
+ _log.info(f"writing HTML output to {fname}")
192
+ conv_res.document.save_as_html(
193
+ filename=fname, image_mode=image_export_mode, split_page_view=True
140
194
  )
141
195
 
142
196
  # Export Text format:
@@ -356,6 +410,12 @@ def convert(
356
410
  device: Annotated[
357
411
  AcceleratorDevice, typer.Option(..., help="Accelerator device")
358
412
  ] = AcceleratorDevice.AUTO,
413
+ docling_logo: Annotated[
414
+ Optional[bool],
415
+ typer.Option(
416
+ "--logo", callback=logo_callback, is_eager=True, help="Docling logo"
417
+ ),
418
+ ] = None,
359
419
  ):
360
420
  if verbose == 0:
361
421
  logging.basicConfig(level=logging.WARNING)
@@ -421,6 +481,7 @@ def convert(
421
481
 
422
482
  export_json = OutputFormat.JSON in to_formats
423
483
  export_html = OutputFormat.HTML in to_formats
484
+ export_html_split_page = OutputFormat.HTML_SPLIT_PAGE in to_formats
424
485
  export_md = OutputFormat.MARKDOWN in to_formats
425
486
  export_txt = OutputFormat.TEXT in to_formats
426
487
  export_doctags = OutputFormat.DOCTAGS in to_formats
@@ -481,10 +542,16 @@ def convert(
481
542
  backend=backend, # pdf_backend
482
543
  )
483
544
  elif pipeline == PdfPipeline.VLM:
484
- pipeline_options = VlmPipelineOptions()
545
+ pipeline_options = VlmPipelineOptions(
546
+ enable_remote_services=enable_remote_services,
547
+ )
485
548
 
486
549
  if vlm_model == VlmModelType.GRANITE_VISION:
487
550
  pipeline_options.vlm_options = granite_vision_vlm_conversion_options
551
+ elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
552
+ pipeline_options.vlm_options = (
553
+ granite_vision_vlm_ollama_conversion_options
554
+ )
488
555
  elif vlm_model == VlmModelType.SMOLDOCLING:
489
556
  pipeline_options.vlm_options = smoldocling_vlm_conversion_options
490
557
  if sys.platform == "darwin":
@@ -528,6 +595,7 @@ def convert(
528
595
  output_dir=output,
529
596
  export_json=export_json,
530
597
  export_html=export_html,
598
+ export_html_split_page=export_html_split_page,
531
599
  export_md=export_md,
532
600
  export_txt=export_txt,
533
601
  export_doctags=export_doctags,
@@ -50,6 +50,7 @@ class OutputFormat(str, Enum):
50
50
  MARKDOWN = "md"
51
51
  JSON = "json"
52
52
  HTML = "html"
53
+ HTML_SPLIT_PAGE = "html_split_page"
53
54
  TEXT = "text"
54
55
  DOCTAGS = "doctags"
55
56
 
@@ -262,3 +263,35 @@ class Page(BaseModel):
262
263
  @property
263
264
  def image(self) -> Optional[Image]:
264
265
  return self.get_image(scale=self._default_image_scale)
266
+
267
+
268
+ ## OpenAI API Request / Response Models ##
269
+
270
+
271
class OpenAiChatMessage(BaseModel):
    """A single chat message: the speaker role and its text content."""

    # Role label of the message author.
    role: str
    # Text of the message; api_image_request returns this value stripped.
    content: str
274
+
275
+
276
class OpenAiResponseChoice(BaseModel):
    """One entry of the ``choices`` list in a chat-completion response."""

    # Position of this choice within the response's ``choices`` list.
    index: int
    # The generated message for this choice.
    message: OpenAiChatMessage
    # Reason string reported by the API for ending the generation.
    finish_reason: str
280
+
281
+
282
class OpenAiResponseUsage(BaseModel):
    """Token counters reported in the ``usage`` section of a response."""

    prompt_tokens: int
    completion_tokens: int
    total_tokens: int
286
+
287
+
288
class OpenAiApiResponse(BaseModel):
    """Top-level body of a chat-completion response.

    Parsed from the raw response text with ``model_validate_json``.
    """

    # NOTE(review): presumably disables pydantic's protected-namespace check
    # because of the ``model`` field below — confirm.
    model_config = ConfigDict(protected_namespaces=())

    id: str
    model: Optional[str] = None  # returned by openai
    # Callers read the first entry: ``choices[0].message.content``.
    choices: List[OpenAiResponseChoice]
    created: int
    usage: OpenAiResponseUsage
@@ -283,6 +283,13 @@ class _DocumentConversionInput(BaseModel):
283
283
  if mime is None: # must guess from
284
284
  with obj.open("rb") as f:
285
285
  content = f.read(1024) # Read first 1KB
286
+ if mime is not None and mime.lower() == "application/zip":
287
+ if obj.suffixes[-1].lower() == ".xlsx":
288
+ mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
289
+ elif obj.suffixes[-1].lower() == ".docx":
290
+ mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
291
+ elif obj.suffixes[-1].lower() == ".pptx":
292
+ mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
286
293
 
287
294
  elif isinstance(obj, DocumentStream):
288
295
  content = obj.stream.read(8192)
@@ -213,8 +213,8 @@ class PictureDescriptionBaseOptions(BaseOptions):
213
213
  batch_size: int = 8
214
214
  scale: float = 2
215
215
 
216
- bitmap_area_threshold: float = (
217
- 0.2 # percentage of the area for a bitmap to processed with the models
216
+ picture_area_threshold: float = (
217
+ 0.05 # percentage of the area for a picture to processed with the models
218
218
  )
219
219
 
220
220
 
@@ -266,6 +266,7 @@ class ResponseFormat(str, Enum):
266
266
  class InferenceFramework(str, Enum):
267
267
  MLX = "mlx"
268
268
  TRANSFORMERS = "transformers"
269
+ OPENAI = "openai"
269
270
 
270
271
 
271
272
  class HuggingFaceVlmOptions(BaseVlmOptions):
@@ -284,6 +285,19 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
284
285
  return self.repo_id.replace("/", "--")
285
286
 
286
287
 
288
class ApiVlmOptions(BaseVlmOptions):
    """Options for running the VLM step against an HTTP chat-completions API."""

    kind: Literal["api_model_options"] = "api_model_options"

    # Endpoint that receives the request. Default to ollama.
    url: AnyUrl = AnyUrl("http://localhost:11434/v1/chat/completions")
    # HTTP headers sent with every request.
    headers: Dict[str, str] = {}
    # Extra key/value pairs merged into the request payload.
    params: Dict[str, Any] = {}
    # Scale factor used when rendering the page image.
    scale: float = 2.0
    # Timeout handed to the HTTP request.
    timeout: float = 60
    # Required: format the model is expected to answer in.
    response_format: ResponseFormat
299
+
300
+
287
301
  smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
288
302
  repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
289
303
  prompt="Convert this page to docling.",
@@ -307,10 +321,20 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
307
321
  inference_framework=InferenceFramework.TRANSFORMERS,
308
322
  )
309
323
 
324
# Preset: Granite Vision via a local chat-completions endpoint, answering in markdown.
granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
    # What to ask and which answer format to expect.
    prompt="OCR the full page to markdown.",
    response_format=ResponseFormat.MARKDOWN,
    # Where and how to send the request.
    url=AnyUrl("http://localhost:11434/v1/chat/completions"),
    params={"model": "granite3.2-vision:2b"},
    timeout=120,
    # Page image rendering scale.
    scale=1.0,
)
332
+
310
333
 
311
334
  class VlmModelType(str, Enum):
312
335
  SMOLDOCLING = "smoldocling"
313
336
  GRANITE_VISION = "granite_vision"
337
+ GRANITE_VISION_OLLAMA = "granite_vision_ollama"
314
338
 
315
339
 
316
340
  # Define an enum for the backend options
@@ -362,7 +386,9 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
362
386
  False # (To be used with vlms, or other generative models)
363
387
  )
364
388
  # If True, text from backend will be used instead of generated text
365
- vlm_options: Union[HuggingFaceVlmOptions] = smoldocling_vlm_conversion_options
389
+ vlm_options: Union[HuggingFaceVlmOptions, ApiVlmOptions] = (
390
+ smoldocling_vlm_conversion_options
391
+ )
366
392
 
367
393
 
368
394
  class PdfPipelineOptions(PaginatedPipelineOptions):
@@ -0,0 +1,67 @@
1
+ from typing import Iterable
2
+
3
+ from docling.datamodel.base_models import Page, VlmPrediction
4
+ from docling.datamodel.document import ConversionResult
5
+ from docling.datamodel.pipeline_options import ApiVlmOptions
6
+ from docling.exceptions import OperationNotAllowed
7
+ from docling.models.base_model import BasePageModel
8
+ from docling.utils.api_image_request import api_image_request
9
+ from docling.utils.profiling import TimeRecorder
10
+
11
+
12
class ApiVlmModel(BasePageModel):
    """Page model that gets the VLM prediction for each page from an HTTP API.

    Every valid page is rendered to an image and sent, together with the
    configured prompt, through ``api_image_request``. The returned text is
    stored on the page as ``predictions.vlm_response``.
    """

    def __init__(
        self,
        enabled: bool,
        enable_remote_services: bool,
        vlm_options: ApiVlmOptions,
    ):
        """Store the options and, when enabled, prepare the request settings.

        Args:
            enabled: Whether this model is active.
            enable_remote_services: Explicit opt-in for contacting remote
                services; required when the model is enabled.
            vlm_options: API options (url, headers, params, scale, timeout,
                prompt).

        Raises:
            OperationNotAllowed: If the model is enabled but remote services
                were not explicitly allowed.
        """
        self.enabled = enabled
        self.vlm_options = vlm_options
        if not self.enabled:
            # Request settings are only prepared for an enabled model.
            return

        if not enable_remote_services:
            raise OperationNotAllowed(
                "Connections to remote services is only allowed when set explicitly. "
                "pipeline_options.enable_remote_services=True, or using the CLI "
                "--enable-remote-services."
            )

        options = self.vlm_options
        self.timeout = options.timeout
        self.prompt_content = f"This is a page from a document.\n{options.prompt}"
        # "temperature" comes last, so it replaces any value in options.params.
        self.params = {**options.params, "temperature": 0}

    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        """Yield each page of the batch, adding a VLM prediction to valid ones.

        Pages whose backend is not valid are yielded unchanged.
        """
        for page in page_batch:
            assert page._backend is not None
            if not page._backend.is_valid():
                yield page
                continue

            with TimeRecorder(conv_res, "vlm"):
                assert page.size is not None

                page_image = page.get_image(scale=self.vlm_options.scale)
                assert page_image is not None
                if page_image and page_image.mode != "RGB":
                    page_image = page_image.convert("RGB")

                response_text = api_image_request(
                    image=page_image,
                    prompt=self.prompt_content,
                    url=self.vlm_options.url,
                    timeout=self.timeout,
                    headers=self.vlm_options.headers,
                    **self.params,
                )

                page.predictions.vlm_response = VlmPrediction(text=response_text)

            yield page
@@ -1,12 +1,7 @@
1
- import base64
2
- import io
3
- import logging
4
1
  from pathlib import Path
5
- from typing import Iterable, List, Optional, Type, Union
2
+ from typing import Iterable, Optional, Type, Union
6
3
 
7
- import requests
8
4
  from PIL import Image
9
- from pydantic import BaseModel, ConfigDict
10
5
 
11
6
  from docling.datamodel.pipeline_options import (
12
7
  AcceleratorOptions,
@@ -15,37 +10,7 @@ from docling.datamodel.pipeline_options import (
15
10
  )
16
11
  from docling.exceptions import OperationNotAllowed
17
12
  from docling.models.picture_description_base_model import PictureDescriptionBaseModel
18
-
19
- _log = logging.getLogger(__name__)
20
-
21
-
22
- class ChatMessage(BaseModel):
23
- role: str
24
- content: str
25
-
26
-
27
- class ResponseChoice(BaseModel):
28
- index: int
29
- message: ChatMessage
30
- finish_reason: str
31
-
32
-
33
- class ResponseUsage(BaseModel):
34
- prompt_tokens: int
35
- completion_tokens: int
36
- total_tokens: int
37
-
38
-
39
- class ApiResponse(BaseModel):
40
- model_config = ConfigDict(
41
- protected_namespaces=(),
42
- )
43
-
44
- id: str
45
- model: Optional[str] = None # returned by openai
46
- choices: List[ResponseChoice]
47
- created: int
48
- usage: ResponseUsage
13
+ from docling.utils.api_image_request import api_image_request
49
14
 
50
15
 
51
16
  class PictureDescriptionApiModel(PictureDescriptionBaseModel):
@@ -83,43 +48,11 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
83
48
  # Note: technically we could make a batch request here,
84
49
  # but not all APIs will allow for it. For example, vllm won't allow more than 1.
85
50
  for image in images:
86
- img_io = io.BytesIO()
87
- image.save(img_io, "PNG")
88
- image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
89
-
90
- messages = [
91
- {
92
- "role": "user",
93
- "content": [
94
- {
95
- "type": "text",
96
- "text": self.options.prompt,
97
- },
98
- {
99
- "type": "image_url",
100
- "image_url": {
101
- "url": f"data:image/png;base64,{image_base64}"
102
- },
103
- },
104
- ],
105
- }
106
- ]
107
-
108
- payload = {
109
- "messages": messages,
110
- **self.options.params,
111
- }
112
-
113
- r = requests.post(
114
- str(self.options.url),
115
- headers=self.options.headers,
116
- json=payload,
51
+ yield api_image_request(
52
+ image=image,
53
+ prompt=self.options.prompt,
54
+ url=self.options.url,
117
55
  timeout=self.options.timeout,
56
+ headers=self.options.headers,
57
+ **self.options.params,
118
58
  )
119
- if not r.ok:
120
- _log.error(f"Error calling the API. Reponse was {r.text}")
121
- r.raise_for_status()
122
-
123
- api_resp = ApiResponse.model_validate_json(r.text)
124
- generated_text = api_resp.choices[0].message.content.strip()
125
- yield generated_text
@@ -63,8 +63,20 @@ class PictureDescriptionBaseModel(
63
63
  elements: List[PictureItem] = []
64
64
  for el in element_batch:
65
65
  assert isinstance(el.item, PictureItem)
66
- elements.append(el.item)
67
- images.append(el.image)
66
+ describe_image = True
67
+ # Don't describe the image if it's smaller than the threshold
68
+ if len(el.item.prov) > 0:
69
+ prov = el.item.prov[0] # PictureItems have at most a single provenance
70
+ page = doc.pages.get(prov.page_no)
71
+ if page is not None:
72
+ page_area = page.size.width * page.size.height
73
+ if page_area > 0:
74
+ area_fraction = prov.bbox.area() / page_area
75
+ if area_fraction < self.options.picture_area_threshold:
76
+ describe_image = False
77
+ if describe_image:
78
+ elements.append(el.item)
79
+ images.append(el.image)
68
80
 
69
81
  outputs = self._annotate_images(images)
70
82
 
@@ -247,7 +247,7 @@ class TesseractOcrCliModel(BaseOcrModel):
247
247
 
248
248
  cell = TextCell(
249
249
  index=ix,
250
- text=text,
250
+ text=str(text),
251
251
  orig=text,
252
252
  from_ocr=True,
253
253
  confidence=conf / 100.0,
@@ -2,7 +2,7 @@ import logging
2
2
  import sys
3
3
  import warnings
4
4
  from pathlib import Path
5
- from typing import Optional
5
+ from typing import Optional, cast
6
6
 
7
7
  from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
8
8
 
@@ -226,7 +226,11 @@ class StandardPdfPipeline(PaginatedPipeline):
226
226
  and self.pipeline_options.generate_table_images
227
227
  ):
228
228
  page_ix = element.prov[0].page_no - 1
229
- page = conv_res.pages[page_ix]
229
+ page = next(
230
+ (p for p in conv_res.pages if p.page_no == page_ix),
231
+ cast("Page", None),
232
+ )
233
+ assert page is not None
230
234
  assert page.size is not None
231
235
  assert page.image is not None
232
236
 
@@ -15,11 +15,14 @@ from docling.backend.pdf_backend import PdfDocumentBackend
15
15
  from docling.datamodel.base_models import InputFormat, Page
16
16
  from docling.datamodel.document import ConversionResult, InputDocument
17
17
  from docling.datamodel.pipeline_options import (
18
+ ApiVlmOptions,
19
+ HuggingFaceVlmOptions,
18
20
  InferenceFramework,
19
21
  ResponseFormat,
20
22
  VlmPipelineOptions,
21
23
  )
22
24
  from docling.datamodel.settings import settings
25
+ from docling.models.api_vlm_model import ApiVlmModel
23
26
  from docling.models.hf_mlx_model import HuggingFaceMlxModel
24
27
  from docling.models.hf_vlm_model import HuggingFaceVlmModel
25
28
  from docling.pipeline.base_pipeline import PaginatedPipeline
@@ -57,27 +60,34 @@ class VlmPipeline(PaginatedPipeline):
57
60
 
58
61
  self.keep_images = self.pipeline_options.generate_page_images
59
62
 
60
- if (
61
- self.pipeline_options.vlm_options.inference_framework
62
- == InferenceFramework.MLX
63
- ):
63
+ if isinstance(pipeline_options.vlm_options, ApiVlmOptions):
64
64
  self.build_pipe = [
65
- HuggingFaceMlxModel(
65
+ ApiVlmModel(
66
66
  enabled=True, # must be always enabled for this pipeline to make sense.
67
- artifacts_path=artifacts_path,
68
- accelerator_options=pipeline_options.accelerator_options,
69
- vlm_options=self.pipeline_options.vlm_options,
70
- ),
71
- ]
72
- else:
73
- self.build_pipe = [
74
- HuggingFaceVlmModel(
75
- enabled=True, # must be always enabled for this pipeline to make sense.
76
- artifacts_path=artifacts_path,
77
- accelerator_options=pipeline_options.accelerator_options,
78
- vlm_options=self.pipeline_options.vlm_options,
67
+ enable_remote_services=self.pipeline_options.enable_remote_services,
68
+ vlm_options=cast(ApiVlmOptions, self.pipeline_options.vlm_options),
79
69
  ),
80
70
  ]
71
+ elif isinstance(self.pipeline_options.vlm_options, HuggingFaceVlmOptions):
72
+ vlm_options = cast(HuggingFaceVlmOptions, self.pipeline_options.vlm_options)
73
+ if vlm_options.inference_framework == InferenceFramework.MLX:
74
+ self.build_pipe = [
75
+ HuggingFaceMlxModel(
76
+ enabled=True, # must be always enabled for this pipeline to make sense.
77
+ artifacts_path=artifacts_path,
78
+ accelerator_options=pipeline_options.accelerator_options,
79
+ vlm_options=vlm_options,
80
+ ),
81
+ ]
82
+ else:
83
+ self.build_pipe = [
84
+ HuggingFaceVlmModel(
85
+ enabled=True, # must be always enabled for this pipeline to make sense.
86
+ artifacts_path=artifacts_path,
87
+ accelerator_options=pipeline_options.accelerator_options,
88
+ vlm_options=vlm_options,
89
+ ),
90
+ ]
81
91
 
82
92
  self.enrichment_pipe = [
83
93
  # Other models working on `NodeItem` elements in the DoclingDocument
@@ -0,0 +1,61 @@
1
+ import base64
2
+ import logging
3
+ from io import BytesIO
4
+ from typing import Dict, Optional
5
+
6
+ import requests
7
+ from PIL import Image
8
+ from pydantic import AnyUrl
9
+
10
+ from docling.datamodel.base_models import OpenAiApiResponse
11
+
12
+ _log = logging.getLogger(__name__)
13
+
14
+
15
def api_image_request(
    image: Image.Image,
    prompt: str,
    url: AnyUrl,
    timeout: float = 20,
    headers: Optional[Dict[str, str]] = None,
    **params,
) -> str:
    """Send an image and a text prompt to a chat-completions endpoint.

    The image is PNG-encoded and embedded as a base64 data URL in a single
    user message, followed by the prompt text.

    Args:
        image: Image to send.
        prompt: Text placed after the image in the user message.
        url: Endpoint that receives the POST request.
        timeout: Timeout passed to ``requests.post``.
        headers: Optional HTTP headers; an empty dict is used when omitted.
        **params: Extra entries merged into the JSON payload after
            ``"messages"``, so a ``messages`` key here would replace it.

    Returns:
        The content of the first choice's message, stripped of surrounding
        whitespace.

    Raises:
        requests.HTTPError: If the response status indicates an error (the
            response text is logged first).
    """
    png_buffer = BytesIO()
    image.save(png_buffer, "PNG")
    encoded_png = base64.b64encode(png_buffer.getvalue()).decode("utf-8")

    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/png;base64,{encoded_png}"},
    }
    text_part = {"type": "text", "text": prompt}

    payload = {"messages": [{"role": "user", "content": [image_part, text_part]}]}
    payload.update(params)

    response = requests.post(
        str(url),
        headers=headers or {},
        json=payload,
        timeout=timeout,
    )
    if not response.ok:
        _log.error(f"Error calling the API. Response was {response.text}")
    response.raise_for_status()

    parsed = OpenAiApiResponse.model_validate_json(response.text)
    return parsed.choices[0].message.content.strip()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.28.4
3
+ Version: 2.30.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/docling-project/docling
6
6
  License: MIT
@@ -28,7 +28,7 @@ Provides-Extra: vlm
28
28
  Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
29
29
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
30
30
  Requires-Dist: certifi (>=2024.7.4)
31
- Requires-Dist: docling-core[chunking] (>=2.24.1,<3.0.0)
31
+ Requires-Dist: docling-core[chunking] (>=2.26.0,<3.0.0)
32
32
  Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
33
33
  Requires-Dist: docling-parse (>=4.0.0,<5.0.0)
34
34
  Requires-Dist: easyocr (>=1.7,<2.0)
@@ -58,7 +58,7 @@ Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
58
58
  Requires-Dist: tqdm (>=4.65.0,<5.0.0)
59
59
  Requires-Dist: transformers (>=4.42.0,<4.43.0) ; (sys_platform == "darwin" and platform_machine == "x86_64") and (extra == "vlm")
60
60
  Requires-Dist: transformers (>=4.46.0,<5.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
61
- Requires-Dist: typer (>=0.12.5,<0.13.0)
61
+ Requires-Dist: typer (>=0.12.5,<0.16.0)
62
62
  Project-URL: Repository, https://github.com/docling-project/docling
63
63
  Description-Content-Type: text/markdown
64
64