docling 2.28.4__py3-none-any.whl → 2.30.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/docx/latex/latex_dict.py +3 -0
- docling/backend/docx/latex/omml.py +14 -14
- docling/backend/html_backend.py +2 -1
- docling/backend/msexcel_backend.py +272 -90
- docling/backend/mspowerpoint_backend.py +4 -3
- docling/backend/msword_backend.py +320 -118
- docling/cli/main.py +70 -2
- docling/datamodel/base_models.py +33 -0
- docling/datamodel/document.py +7 -0
- docling/datamodel/pipeline_options.py +29 -3
- docling/models/api_vlm_model.py +67 -0
- docling/models/picture_description_api_model.py +8 -75
- docling/models/picture_description_base_model.py +14 -2
- docling/models/tesseract_ocr_cli_model.py +1 -1
- docling/pipeline/standard_pdf_pipeline.py +6 -2
- docling/pipeline/vlm_pipeline.py +27 -17
- docling/utils/api_image_request.py +61 -0
- {docling-2.28.4.dist-info → docling-2.30.0.dist-info}/METADATA +3 -3
- {docling-2.28.4.dist-info → docling-2.30.0.dist-info}/RECORD +22 -20
- {docling-2.28.4.dist-info → docling-2.30.0.dist-info}/LICENSE +0 -0
- {docling-2.28.4.dist-info → docling-2.30.0.dist-info}/WHEEL +0 -0
- {docling-2.28.4.dist-info → docling-2.30.0.dist-info}/entry_points.txt +0 -0
docling/cli/main.py
CHANGED
@@ -40,6 +40,7 @@ from docling.datamodel.pipeline_options import (
     VlmModelType,
     VlmPipelineOptions,
     granite_vision_vlm_conversion_options,
+    granite_vision_vlm_ollama_conversion_options,
     smoldocling_vlm_conversion_options,
     smoldocling_vlm_mlx_conversion_options,
 )
@@ -60,6 +61,44 @@ err_console = Console(stderr=True)
 ocr_factory_internal = get_ocr_factory(allow_external_plugins=False)
 ocr_engines_enum_internal = ocr_factory_internal.get_enum()
 
+DOCLING_ASCII_ART = r"""
+████ ██████
+███░░██░░░░░██████
+████████░░░░░░░░████████████
+████████░░░░░░░░░░░░░░░░░░████████
+██████░░░░░░░░░░░░░░░░░░░░░░░░░░██████
+██████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░█████
+██████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░█████
+██████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░██████
+██████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░██████
+██████░░░░░░░ ░░░░░░░░░░░░░░░░░░░░░░ ░░░░░░░██████
+██████░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░██████
+██████░░░░░░ ░░░░░░░░░░░░░░░ ░░░░░░██████
+███▒██░░░░░ ████ ░░░░░░░░░░░░ ████ ░░░░░██▒███
+███▒██░░░░░░ ████ ░░░░░░░░░░░░ ████ ░░░░░██▒████
+███▒██░░░░░░ ██ ██ ░░░░░░░░░░░░ ██ ██ ░░░░░██▒▒███
+███▒███░░░░░ ██ ░░░░████░░░░ ██ ░░░░░██▒▒███
+████▒▒██░░░░░░ ░░░███▒▒▒▒███░░░ ░░░░░░░██▒▒████
+████▒▒██░░░░░░░░░░░░░░░░░█▒▒▒▒▒▒▒▒▒▒█░░░░░░░░░░░░░░░░███▒▒████
+████▒▒▒██░░░░░░░░░░░░█████ ▒▒▒▒▒▒ ██████░░░░░░░░░░░██▒▒▒████
+███▒▒▒▒██░░░░░░░░███▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒███░░░░░░░░██▒▒▒▒███
+███▒▒▒▒▒███░░░░░░██▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒▒██░░░░░░███▒▒▒▒▒███
+████▒▒▒▒▒████░░░░░░██████████████████████░░░░░░████▒▒▒▒▒████
+███▒▒▒▒▒▒▒▒████░░░░░░░░░░░░░░░░░░░░░░░░░░░████▒▒▒▒▒▒▒▒▒███
+████▒▒▒▒▒▒▒▒███░░░░░████████████████████████▒▒▒▒▒▒▒▒▒████
+████▒▒▒▒▒▒██░░░░░░█ █░░░░░██▒▒▒▒▒▒████
+████▒▒▒▒█░░░░░░░█ D O C L I N G █░░░░░░░░██▒▒▒████
+████▒▒██░░░░░░█ █░░░░░░░░░░█▒▒████
+██████░░░░░░█ D O C L I N G █░░░░░░░░░░░██████
+████░░░░░█ █░░░░░░░░░░░░████
+█████░░█ D O C L I N G █░░░░░░░░░░░█████
+█████ █░░░░░░░░████████
+██ D O C L I N G █░░░░░░░░█████
+█ █░░░████████
+█████████████████████████████
+"""
+
+
 app = typer.Typer(
     name="Docling",
     no_args_is_help=True,
@@ -68,6 +107,12 @@ app = typer.Typer(
 )
 
 
+def logo_callback(value: bool):
+    if value:
+        print(DOCLING_ASCII_ART)
+        raise typer.Exit()
+
+
 def version_callback(value: bool):
     if value:
         docling_version = importlib.metadata.version("docling")
@@ -109,6 +154,7 @@ def export_documents(
     output_dir: Path,
     export_json: bool,
     export_html: bool,
+    export_html_split_page: bool,
     export_md: bool,
     export_txt: bool,
     export_doctags: bool,
@@ -136,7 +182,15 @@ def export_documents(
            fname = output_dir / f"{doc_filename}.html"
            _log.info(f"writing HTML output to {fname}")
            conv_res.document.save_as_html(
-                filename=fname, image_mode=image_export_mode
+                filename=fname, image_mode=image_export_mode, split_page_view=False
+            )
+
+        # Export HTML format:
+        if export_html_split_page:
+            fname = output_dir / f"{doc_filename}.html"
+            _log.info(f"writing HTML output to {fname}")
+            conv_res.document.save_as_html(
+                filename=fname, image_mode=image_export_mode, split_page_view=True
             )
 
         # Export Text format:
@@ -356,6 +410,12 @@ def convert(
     device: Annotated[
         AcceleratorDevice, typer.Option(..., help="Accelerator device")
     ] = AcceleratorDevice.AUTO,
+    docling_logo: Annotated[
+        Optional[bool],
+        typer.Option(
+            "--logo", callback=logo_callback, is_eager=True, help="Docling logo"
+        ),
+    ] = None,
 ):
     if verbose == 0:
         logging.basicConfig(level=logging.WARNING)
@@ -421,6 +481,7 @@ def convert(
 
     export_json = OutputFormat.JSON in to_formats
     export_html = OutputFormat.HTML in to_formats
+    export_html_split_page = OutputFormat.HTML_SPLIT_PAGE in to_formats
     export_md = OutputFormat.MARKDOWN in to_formats
     export_txt = OutputFormat.TEXT in to_formats
     export_doctags = OutputFormat.DOCTAGS in to_formats
@@ -481,10 +542,16 @@ def convert(
             backend=backend,  # pdf_backend
         )
     elif pipeline == PdfPipeline.VLM:
-        pipeline_options = VlmPipelineOptions(
+        pipeline_options = VlmPipelineOptions(
+            enable_remote_services=enable_remote_services,
+        )
 
         if vlm_model == VlmModelType.GRANITE_VISION:
            pipeline_options.vlm_options = granite_vision_vlm_conversion_options
+        elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
+            pipeline_options.vlm_options = (
+                granite_vision_vlm_ollama_conversion_options
+            )
         elif vlm_model == VlmModelType.SMOLDOCLING:
             pipeline_options.vlm_options = smoldocling_vlm_conversion_options
             if sys.platform == "darwin":
@@ -528,6 +595,7 @@ def convert(
         output_dir=output,
         export_json=export_json,
        export_html=export_html,
+        export_html_split_page=export_html_split_page,
        export_md=export_md,
        export_txt=export_txt,
        export_doctags=export_doctags,
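The split-page export reuses `save_as_html` with the new `split_page_view` flag. A minimal sketch of the equivalent call from Python (the input filename is illustrative, `image_mode` is left at its default, and the `DocumentConverter` wiring is the pre-existing docling API rather than anything added in this release):

```python
from pathlib import Path

from docling.document_converter import DocumentConverter

conv_res = DocumentConverter().convert("report.pdf")  # illustrative input file

# Same call the CLI now makes for the html_split_page output format.
conv_res.document.save_as_html(
    filename=Path("report.html"),
    split_page_view=True,
)
```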
docling/datamodel/base_models.py
CHANGED
@@ -50,6 +50,7 @@ class OutputFormat(str, Enum):
     MARKDOWN = "md"
     JSON = "json"
     HTML = "html"
+    HTML_SPLIT_PAGE = "html_split_page"
     TEXT = "text"
     DOCTAGS = "doctags"
 
@@ -262,3 +263,35 @@ class Page(BaseModel):
     @property
     def image(self) -> Optional[Image]:
         return self.get_image(scale=self._default_image_scale)
+
+
+## OpenAI API Request / Response Models ##
+
+
+class OpenAiChatMessage(BaseModel):
+    role: str
+    content: str
+
+
+class OpenAiResponseChoice(BaseModel):
+    index: int
+    message: OpenAiChatMessage
+    finish_reason: str
+
+
+class OpenAiResponseUsage(BaseModel):
+    prompt_tokens: int
+    completion_tokens: int
+    total_tokens: int
+
+
+class OpenAiApiResponse(BaseModel):
+    model_config = ConfigDict(
+        protected_namespaces=(),
+    )
+
+    id: str
+    model: Optional[str] = None  # returned by openai
+    choices: List[OpenAiResponseChoice]
+    created: int
+    usage: OpenAiResponseUsage
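These pydantic models mirror the shape of an OpenAI-style chat-completions response; the new `api_image_request` helper (added further below) validates the raw JSON it receives against `OpenAiApiResponse`. A small sketch with a made-up payload:

```python
from docling.datamodel.base_models import OpenAiApiResponse

sample = """{
  "id": "chatcmpl-123",
  "model": "granite3.2-vision:2b",
  "created": 1712345678,
  "choices": [
    {
      "index": 0,
      "message": {"role": "assistant", "content": "# Page 1\\n\\nRecognized text..."},
      "finish_reason": "stop"
    }
  ],
  "usage": {"prompt_tokens": 1100, "completion_tokens": 420, "total_tokens": 1520}
}"""

resp = OpenAiApiResponse.model_validate_json(sample)
print(resp.choices[0].message.content.strip())
```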
docling/datamodel/document.py
CHANGED
@@ -283,6 +283,13 @@ class _DocumentConversionInput(BaseModel):
             if mime is None:  # must guess from
                 with obj.open("rb") as f:
                     content = f.read(1024)  # Read first 1KB
+            if mime is not None and mime.lower() == "application/zip":
+                if obj.suffixes[-1].lower() == ".xlsx":
+                    mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+                elif obj.suffixes[-1].lower() == ".docx":
+                    mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+                elif obj.suffixes[-1].lower() == ".pptx":
+                    mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
 
         elif isinstance(obj, DocumentStream):
             content = obj.stream.read(8192)
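OOXML files are ZIP containers, so content sniffing alone reports `application/zip`; the new branch falls back to the file suffix to pick the precise Office MIME type. A standalone illustration of the same idea (this helper is not part of docling, just a sketch of the mapping):

```python
from pathlib import Path

OOXML_MIME_BY_SUFFIX = {
    ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
}


def refine_mime(detected_mime: str, filename: str) -> str:
    # Content sniffing only sees the ZIP container of an OOXML file,
    # so use the suffix to choose the specific Office MIME type.
    suffix = Path(filename).suffix.lower()
    if detected_mime.lower() == "application/zip" and suffix in OOXML_MIME_BY_SUFFIX:
        return OOXML_MIME_BY_SUFFIX[suffix]
    return detected_mime


print(refine_mime("application/zip", "quarterly-report.xlsx"))
```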
docling/datamodel/pipeline_options.py
CHANGED
@@ -213,8 +213,8 @@ class PictureDescriptionBaseOptions(BaseOptions):
     batch_size: int = 8
     scale: float = 2
 
-
-        0.
+    picture_area_threshold: float = (
+        0.05  # percentage of the area for a picture to processed with the models
     )
 
 
@@ -266,6 +266,7 @@ class ResponseFormat(str, Enum):
 class InferenceFramework(str, Enum):
     MLX = "mlx"
     TRANSFORMERS = "transformers"
+    OPENAI = "openai"
 
 
 class HuggingFaceVlmOptions(BaseVlmOptions):
@@ -284,6 +285,19 @@ class HuggingFaceVlmOptions(BaseVlmOptions):
         return self.repo_id.replace("/", "--")
 
 
+class ApiVlmOptions(BaseVlmOptions):
+    kind: Literal["api_model_options"] = "api_model_options"
+
+    url: AnyUrl = AnyUrl(
+        "http://localhost:11434/v1/chat/completions"
+    )  # Default to ollama
+    headers: Dict[str, str] = {}
+    params: Dict[str, Any] = {}
+    scale: float = 2.0
+    timeout: float = 60
+    response_format: ResponseFormat
+
+
 smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
     repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
     prompt="Convert this page to docling.",
@@ -307,10 +321,20 @@ granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
     inference_framework=InferenceFramework.TRANSFORMERS,
 )
 
+granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
+    url=AnyUrl("http://localhost:11434/v1/chat/completions"),
+    params={"model": "granite3.2-vision:2b"},
+    prompt="OCR the full page to markdown.",
+    scale=1.0,
+    timeout=120,
+    response_format=ResponseFormat.MARKDOWN,
+)
+
 
 class VlmModelType(str, Enum):
     SMOLDOCLING = "smoldocling"
     GRANITE_VISION = "granite_vision"
+    GRANITE_VISION_OLLAMA = "granite_vision_ollama"
 
 
 # Define an enum for the backend options
@@ -362,7 +386,9 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
         False  # (To be used with vlms, or other generative models)
     )
     # If True, text from backend will be used instead of generated text
-    vlm_options: Union[HuggingFaceVlmOptions] =
+    vlm_options: Union[HuggingFaceVlmOptions, ApiVlmOptions] = (
+        smoldocling_vlm_conversion_options
+    )
 
 
 class PdfPipelineOptions(PaginatedPipelineOptions):
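A sketch of wiring the new Ollama preset into the VLM pipeline from Python. The `DocumentConverter`/`PdfFormatOption` plumbing is the pre-existing docling API rather than something introduced in this diff, and the input filename is illustrative:

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    VlmPipelineOptions,
    granite_vision_vlm_ollama_conversion_options,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

pipeline_options = VlmPipelineOptions(
    enable_remote_services=True,  # required: ApiVlmOptions calls out to a server
)
pipeline_options.vlm_options = granite_vision_vlm_ollama_conversion_options

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        )
    }
)

result = converter.convert("document.pdf")  # illustrative input
print(result.document.export_to_markdown())
```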
docling/models/api_vlm_model.py
ADDED
@@ -0,0 +1,67 @@
+from typing import Iterable
+
+from docling.datamodel.base_models import Page, VlmPrediction
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import ApiVlmOptions
+from docling.exceptions import OperationNotAllowed
+from docling.models.base_model import BasePageModel
+from docling.utils.api_image_request import api_image_request
+from docling.utils.profiling import TimeRecorder
+
+
+class ApiVlmModel(BasePageModel):
+
+    def __init__(
+        self,
+        enabled: bool,
+        enable_remote_services: bool,
+        vlm_options: ApiVlmOptions,
+    ):
+        self.enabled = enabled
+        self.vlm_options = vlm_options
+        if self.enabled:
+            if not enable_remote_services:
+                raise OperationNotAllowed(
+                    "Connections to remote services is only allowed when set explicitly. "
+                    "pipeline_options.enable_remote_services=True, or using the CLI "
+                    "--enable-remote-services."
+                )
+
+            self.timeout = self.vlm_options.timeout
+            self.prompt_content = (
+                f"This is a page from a document.\n{self.vlm_options.prompt}"
+            )
+            self.params = {
+                **self.vlm_options.params,
+                "temperature": 0,
+            }
+
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        for page in page_batch:
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                with TimeRecorder(conv_res, "vlm"):
+                    assert page.size is not None
+
+                    hi_res_image = page.get_image(scale=self.vlm_options.scale)
+                    assert hi_res_image is not None
+                    if hi_res_image:
+                        if hi_res_image.mode != "RGB":
+                            hi_res_image = hi_res_image.convert("RGB")
+
+                    page_tags = api_image_request(
+                        image=hi_res_image,
+                        prompt=self.prompt_content,
+                        url=self.vlm_options.url,
+                        timeout=self.timeout,
+                        headers=self.vlm_options.headers,
+                        **self.params,
+                    )
+
+                    page.predictions.vlm_response = VlmPrediction(text=page_tags)
+
+                yield page
docling/models/picture_description_api_model.py
CHANGED
@@ -1,12 +1,7 @@
-import base64
-import io
-import logging
 from pathlib import Path
-from typing import Iterable,
+from typing import Iterable, Optional, Type, Union
 
-import requests
 from PIL import Image
-from pydantic import BaseModel, ConfigDict
 
 from docling.datamodel.pipeline_options import (
     AcceleratorOptions,
@@ -15,37 +10,7 @@ from docling.datamodel.pipeline_options import (
 )
 from docling.exceptions import OperationNotAllowed
 from docling.models.picture_description_base_model import PictureDescriptionBaseModel
-
-_log = logging.getLogger(__name__)
-
-
-class ChatMessage(BaseModel):
-    role: str
-    content: str
-
-
-class ResponseChoice(BaseModel):
-    index: int
-    message: ChatMessage
-    finish_reason: str
-
-
-class ResponseUsage(BaseModel):
-    prompt_tokens: int
-    completion_tokens: int
-    total_tokens: int
-
-
-class ApiResponse(BaseModel):
-    model_config = ConfigDict(
-        protected_namespaces=(),
-    )
-
-    id: str
-    model: Optional[str] = None  # returned by openai
-    choices: List[ResponseChoice]
-    created: int
-    usage: ResponseUsage
+from docling.utils.api_image_request import api_image_request
 
 
 class PictureDescriptionApiModel(PictureDescriptionBaseModel):
@@ -83,43 +48,11 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
         # Note: technically we could make a batch request here,
         # but not all APIs will allow for it. For example, vllm won't allow more than 1.
         for image in images:
-
-
-
-
-            messages = [
-                {
-                    "role": "user",
-                    "content": [
-                        {
-                            "type": "text",
-                            "text": self.options.prompt,
-                        },
-                        {
-                            "type": "image_url",
-                            "image_url": {
-                                "url": f"data:image/png;base64,{image_base64}"
-                            },
-                        },
-                    ],
-                }
-            ]
-
-            payload = {
-                "messages": messages,
-                **self.options.params,
-            }
-
-            r = requests.post(
-                str(self.options.url),
-                headers=self.options.headers,
-                json=payload,
+            yield api_image_request(
+                image=image,
+                prompt=self.options.prompt,
+                url=self.options.url,
                 timeout=self.options.timeout,
+                headers=self.options.headers,
+                **self.options.params,
             )
-            if not r.ok:
-                _log.error(f"Error calling the API. Reponse was {r.text}")
-                r.raise_for_status()
-
-            api_resp = ApiResponse.model_validate_json(r.text)
-            generated_text = api_resp.choices[0].message.content.strip()
-            yield generated_text
docling/models/picture_description_base_model.py
CHANGED
@@ -63,8 +63,20 @@ class PictureDescriptionBaseModel(
         elements: List[PictureItem] = []
         for el in element_batch:
             assert isinstance(el.item, PictureItem)
-
-
+            describe_image = True
+            # Don't describe the image if it's smaller than the threshold
+            if len(el.item.prov) > 0:
+                prov = el.item.prov[0]  # PictureItems have at most a single provenance
+                page = doc.pages.get(prov.page_no)
+                if page is not None:
+                    page_area = page.size.width * page.size.height
+                    if page_area > 0:
+                        area_fraction = prov.bbox.area() / page_area
+                        if area_fraction < self.options.picture_area_threshold:
+                            describe_image = False
+            if describe_image:
+                elements.append(el.item)
+                images.append(el.image)
 
         outputs = self._annotate_images(images)
 
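The filter compares each picture's provenance bounding box to the page area against the new `picture_area_threshold` (0.05 by default). A quick back-of-the-envelope check with illustrative sizes:

```python
page_area = 612 * 792  # a US Letter page, in points
bbox_area = 100 * 150  # a small inline figure
area_fraction = bbox_area / page_area

print(round(area_fraction, 3))  # 0.031
print(area_fraction < 0.05)     # True -> this picture is skipped by the description model
```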
docling/pipeline/standard_pdf_pipeline.py
CHANGED
@@ -2,7 +2,7 @@ import logging
 import sys
 import warnings
 from pathlib import Path
-from typing import Optional
+from typing import Optional, cast
 
 from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
 
@@ -226,7 +226,11 @@ class StandardPdfPipeline(PaginatedPipeline):
                 and self.pipeline_options.generate_table_images
             ):
                 page_ix = element.prov[0].page_no - 1
-                page =
+                page = next(
+                    (p for p in conv_res.pages if p.page_no == page_ix),
+                    cast("Page", None),
+                )
+                assert page is not None
                 assert page.size is not None
                 assert page.image is not None
 
docling/pipeline/vlm_pipeline.py
CHANGED
@@ -15,11 +15,14 @@ from docling.backend.pdf_backend import PdfDocumentBackend
 from docling.datamodel.base_models import InputFormat, Page
 from docling.datamodel.document import ConversionResult, InputDocument
 from docling.datamodel.pipeline_options import (
+    ApiVlmOptions,
+    HuggingFaceVlmOptions,
     InferenceFramework,
     ResponseFormat,
     VlmPipelineOptions,
 )
 from docling.datamodel.settings import settings
+from docling.models.api_vlm_model import ApiVlmModel
 from docling.models.hf_mlx_model import HuggingFaceMlxModel
 from docling.models.hf_vlm_model import HuggingFaceVlmModel
 from docling.pipeline.base_pipeline import PaginatedPipeline
@@ -57,27 +60,34 @@ class VlmPipeline(PaginatedPipeline):
 
         self.keep_images = self.pipeline_options.generate_page_images
 
-        if (
-            self.pipeline_options.vlm_options.inference_framework
-            == InferenceFramework.MLX
-        ):
+        if isinstance(pipeline_options.vlm_options, ApiVlmOptions):
             self.build_pipe = [
-
+                ApiVlmModel(
                     enabled=True,  # must be always enabled for this pipeline to make sense.
-
-
-                    vlm_options=self.pipeline_options.vlm_options,
-                ),
-            ]
-        else:
-            self.build_pipe = [
-                HuggingFaceVlmModel(
-                    enabled=True,  # must be always enabled for this pipeline to make sense.
-                    artifacts_path=artifacts_path,
-                    accelerator_options=pipeline_options.accelerator_options,
-                    vlm_options=self.pipeline_options.vlm_options,
+                    enable_remote_services=self.pipeline_options.enable_remote_services,
+                    vlm_options=cast(ApiVlmOptions, self.pipeline_options.vlm_options),
                 ),
             ]
+        elif isinstance(self.pipeline_options.vlm_options, HuggingFaceVlmOptions):
+            vlm_options = cast(HuggingFaceVlmOptions, self.pipeline_options.vlm_options)
+            if vlm_options.inference_framework == InferenceFramework.MLX:
+                self.build_pipe = [
+                    HuggingFaceMlxModel(
+                        enabled=True,  # must be always enabled for this pipeline to make sense.
+                        artifacts_path=artifacts_path,
+                        accelerator_options=pipeline_options.accelerator_options,
+                        vlm_options=vlm_options,
+                    ),
+                ]
+            else:
+                self.build_pipe = [
+                    HuggingFaceVlmModel(
+                        enabled=True,  # must be always enabled for this pipeline to make sense.
+                        artifacts_path=artifacts_path,
+                        accelerator_options=pipeline_options.accelerator_options,
+                        vlm_options=vlm_options,
+                    ),
+                ]
 
         self.enrichment_pipe = [
             # Other models working on `NodeItem` elements in the DoclingDocument
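Because dispatch is now driven by the options type, pointing the pipeline at any other OpenAI-compatible endpoint only requires an `ApiVlmOptions` instance; a hedged sketch in which the URL, credential, and model name are placeholders:

```python
from pydantic import AnyUrl

from docling.datamodel.pipeline_options import (
    ApiVlmOptions,
    ResponseFormat,
    VlmPipelineOptions,
)

pipeline_options = VlmPipelineOptions(enable_remote_services=True)
pipeline_options.vlm_options = ApiVlmOptions(
    url=AnyUrl("https://vlm.example.com/v1/chat/completions"),  # placeholder endpoint
    headers={"Authorization": "Bearer <token>"},                # placeholder credential
    params={"model": "my-vision-model"},                        # placeholder model name
    prompt="OCR the full page to markdown.",
    timeout=90,
    response_format=ResponseFormat.MARKDOWN,
)
```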
docling/utils/api_image_request.py
ADDED
@@ -0,0 +1,61 @@
+import base64
+import logging
+from io import BytesIO
+from typing import Dict, Optional
+
+import requests
+from PIL import Image
+from pydantic import AnyUrl
+
+from docling.datamodel.base_models import OpenAiApiResponse
+
+_log = logging.getLogger(__name__)
+
+
+def api_image_request(
+    image: Image.Image,
+    prompt: str,
+    url: AnyUrl,
+    timeout: float = 20,
+    headers: Optional[Dict[str, str]] = None,
+    **params,
+) -> str:
+    img_io = BytesIO()
+    image.save(img_io, "PNG")
+    image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/png;base64,{image_base64}"},
+                },
+                {
+                    "type": "text",
+                    "text": prompt,
+                },
+            ],
+        }
+    ]
+
+    payload = {
+        "messages": messages,
+        **params,
+    }
+
+    headers = headers or {}
+
+    r = requests.post(
+        str(url),
+        headers=headers,
+        json=payload,
+        timeout=timeout,
+    )
+    if not r.ok:
+        _log.error(f"Error calling the API. Response was {r.text}")
+        r.raise_for_status()
+
+    api_resp = OpenAiApiResponse.model_validate_json(r.text)
+    generated_text = api_resp.choices[0].message.content.strip()
+    return generated_text
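A standalone sketch of calling the new helper directly against a local Ollama endpoint. The endpoint, model name, and image file are illustrative (they mirror the defaults used elsewhere in this release); any OpenAI-compatible chat-completions server should respond in the expected shape:

```python
from PIL import Image
from pydantic import AnyUrl

from docling.utils.api_image_request import api_image_request

image = Image.open("page.png")  # illustrative page rendering

text = api_image_request(
    image=image,
    prompt="OCR the full page to markdown.",
    url=AnyUrl("http://localhost:11434/v1/chat/completions"),
    timeout=120,
    headers={},
    model="granite3.2-vision:2b",  # forwarded into the JSON payload via **params
    temperature=0,
)
print(text)
```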
{docling-2.28.4.dist-info → docling-2.30.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.28.4
+Version: 2.30.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/docling-project/docling
 License: MIT
@@ -28,7 +28,7 @@ Provides-Extra: vlm
 Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
 Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
 Requires-Dist: certifi (>=2024.7.4)
-Requires-Dist: docling-core[chunking] (>=2.
+Requires-Dist: docling-core[chunking] (>=2.26.0,<3.0.0)
 Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
 Requires-Dist: docling-parse (>=4.0.0,<5.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)
@@ -58,7 +58,7 @@ Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
 Requires-Dist: tqdm (>=4.65.0,<5.0.0)
 Requires-Dist: transformers (>=4.42.0,<4.43.0) ; (sys_platform == "darwin" and platform_machine == "x86_64") and (extra == "vlm")
 Requires-Dist: transformers (>=4.46.0,<5.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
-Requires-Dist: typer (>=0.12.5,<0.
+Requires-Dist: typer (>=0.12.5,<0.16.0)
 Project-URL: Repository, https://github.com/docling-project/docling
 Description-Content-Type: text/markdown
 