docling 2.53.0__py3-none-any.whl → 2.55.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/asciidoc_backend.py +1 -1
- docling/backend/html_backend.py +254 -136
- docling/backend/md_backend.py +4 -1
- docling/backend/msword_backend.py +177 -76
- docling/backend/webvtt_backend.py +572 -0
- docling/backend/xml/jats_backend.py +111 -7
- docling/backend/xml/uspto_backend.py +1 -1
- docling/cli/main.py +5 -0
- docling/datamodel/base_models.py +23 -23
- docling/datamodel/document.py +2 -0
- docling/datamodel/pipeline_options_vlm_model.py +13 -2
- docling/datamodel/vlm_model_specs.py +9 -0
- docling/document_converter.py +4 -0
- docling/models/api_vlm_model.py +45 -16
- docling/models/base_model.py +2 -1
- docling/models/readingorder_model.py +1 -1
- docling/models/table_structure_model.py +3 -3
- docling/models/utils/generation_utils.py +157 -0
- docling/models/utils/hf_model_download.py +6 -1
- docling/models/vlm_models_inline/hf_transformers_model.py +75 -14
- docling/models/vlm_models_inline/mlx_model.py +58 -1
- docling/models/vlm_models_inline/vllm_model.py +189 -124
- docling/utils/api_image_request.py +107 -1
- {docling-2.53.0.dist-info → docling-2.55.0.dist-info}/METADATA +5 -5
- {docling-2.53.0.dist-info → docling-2.55.0.dist-info}/RECORD +29 -27
- {docling-2.53.0.dist-info → docling-2.55.0.dist-info}/WHEEL +0 -0
- {docling-2.53.0.dist-info → docling-2.55.0.dist-info}/entry_points.txt +0 -0
- {docling-2.53.0.dist-info → docling-2.55.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.53.0.dist-info → docling-2.55.0.dist-info}/top_level.txt +0 -0
docling/models/vlm_models_inline/vllm_model.py

@@ -7,9 +7,7 @@ from typing import Any, Dict, Optional, Union
 import numpy as np
 from PIL.Image import Image
 
-from docling.datamodel.accelerator_options import (
-    AcceleratorOptions,
-)
+from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options_vlm_model import (
@@ -17,9 +15,7 @@ from docling.datamodel.pipeline_options_vlm_model import (
     TransformersPromptStyle,
 )
 from docling.models.base_model import BaseVlmPageModel
-from docling.models.utils.hf_model_download import (
-    HuggingFaceModelDownloadMixin,
-)
+from docling.models.utils.hf_model_download import HuggingFaceModelDownloadMixin
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder
 
@@ -27,6 +23,62 @@ _log = logging.getLogger(__name__)
 
 
 class VllmVlmModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
+    """
+    vLLM-backed vision-language model that accepts PIL images (or numpy arrays)
+    via vLLM's multi_modal_data, with prompt formatting handled by formulate_prompt().
+    """
+
+    # --------- Allowlist of vLLM args ---------
+    # SamplingParams (runtime generation controls)
+    _VLLM_SAMPLING_KEYS = {
+        # Core
+        "max_tokens",
+        "temperature",
+        "top_p",
+        "top_k",
+        # Penalties
+        "presence_penalty",
+        "frequency_penalty",
+        "repetition_penalty",
+        # Stops / outputs
+        "stop",
+        "stop_token_ids",
+        "skip_special_tokens",
+        "spaces_between_special_tokens",
+        # Search / length
+        "n",
+        "best_of",
+        "length_penalty",
+        "early_stopping",
+        # Misc
+        "logprobs",
+        "prompt_logprobs",
+        "min_p",
+        "seed",
+    }
+
+    # LLM(...) / EngineArgs (engine/load-time controls)
+    _VLLM_ENGINE_KEYS = {
+        # Model/tokenizer/impl
+        "tokenizer",
+        "tokenizer_mode",
+        "download_dir",
+        # Parallelism / memory / lengths
+        "tensor_parallel_size",
+        "pipeline_parallel_size",
+        "gpu_memory_utilization",
+        "max_model_len",
+        "max_num_batched_tokens",
+        "kv_cache_dtype",
+        "dtype",
+        # Quantization (coarse switch)
+        "quantization",
+        # Multimodal limits
+        "limit_mm_per_prompt",
+        # Execution toggles
+        "enforce_eager",
+    }
+
     def __init__(
         self,
         enabled: bool,
@@ -35,120 +87,147 @@ class VllmVlmModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
         vlm_options: InlineVlmOptions,
     ):
         self.enabled = enabled
-
         self.vlm_options = vlm_options
 
-
-
-
-
-
-
-            supported_devices=vlm_options.supported_devices,
-        )
-        _log.debug(f"Available device for VLM: {self.device}")
-
-        self.max_new_tokens = vlm_options.max_new_tokens
-        self.temperature = vlm_options.temperature
-
-        repo_cache_folder = vlm_options.repo_id.replace("/", "--")
+        self.llm = None
+        self.sampling_params = None
+        self.processor = None  # used for CHAT templating in formulate_prompt()
+        self.device = "cpu"
+        self.max_new_tokens = vlm_options.max_new_tokens
+        self.temperature = vlm_options.temperature
 
-
-
-        elif (artifacts_path / repo_cache_folder).exists():
-            artifacts_path = artifacts_path / repo_cache_folder
-
-        # Initialize VLLM LLM
-        llm_kwargs: Dict[str, Any] = {
-            "model": str(artifacts_path),
-            "limit_mm_per_prompt": {"image": 1},
-            "trust_remote_code": vlm_options.trust_remote_code,
-            "model_impl": "transformers",
-            "gpu_memory_utilization": 0.3,  # hardcoded for now, leaves room for ~3 different models.
-        }
-
-        # Add device-specific configurations
-
-        if self.device == "cpu":
-            llm_kwargs["device"] = "cpu"
+        if not self.enabled:
+            return
 
-
-
-        if vlm_options.load_in_8bit:
-            llm_kwargs["quantization"] = "bitsandbytes"
+        from transformers import AutoProcessor
+        from vllm import LLM, SamplingParams
 
-
+        # Device selection
+        self.device = decide_device(
+            accelerator_options.device, supported_devices=vlm_options.supported_devices
+        )
+        _log.debug(f"Available device for VLM: {self.device}")
 
-
-
-
-
+        # Resolve artifacts path / cache folder
+        repo_cache_folder = vlm_options.repo_id.replace("/", "--")
+        if artifacts_path is None:
+            artifacts_path = self.download_models(
+                self.vlm_options.repo_id, revision=self.vlm_options.revision
            )
-
-
-
-
-
-
-
+        elif (artifacts_path / repo_cache_folder).exists():
+            artifacts_path = artifacts_path / repo_cache_folder
+
+        # --------- Strict split & validation of extra_generation_config ---------
+        extra_cfg = self.vlm_options.extra_generation_config
+
+        load_cfg = {k: v for k, v in extra_cfg.items() if k in self._VLLM_ENGINE_KEYS}
+        gen_cfg = {k: v for k, v in extra_cfg.items() if k in self._VLLM_SAMPLING_KEYS}
+
+        unknown = sorted(
+            k
+            for k in extra_cfg.keys()
+            if k not in self._VLLM_ENGINE_KEYS and k not in self._VLLM_SAMPLING_KEYS
+        )
+        if unknown:
+            _log.warning(
+                "Ignoring unknown extra_generation_config keys for vLLM: %s", unknown
            )
 
+        # --------- Construct LLM kwargs (engine/load-time) ---------
+        llm_kwargs: Dict[str, Any] = {
+            "model": str(artifacts_path),
+            "model_impl": "transformers",
+            "limit_mm_per_prompt": {"image": 1},
+            "revision": self.vlm_options.revision,
+            "trust_remote_code": self.vlm_options.trust_remote_code,
+            **load_cfg,
+        }
+
+        if self.device == "cpu":
+            llm_kwargs.setdefault("enforce_eager", True)
+        else:
+            llm_kwargs.setdefault(
+                "gpu_memory_utilization", 0.3
+            )  # room for other models
+
+        # Quantization (kept as-is; coarse)
+        if self.vlm_options.quantized and self.vlm_options.load_in_8bit:
+            llm_kwargs.setdefault("quantization", "bitsandbytes")
+
+        # Initialize vLLM LLM
+        self.llm = LLM(**llm_kwargs)
+
+        # Initialize processor for prompt templating (needed for CHAT style)
+        self.processor = AutoProcessor.from_pretrained(
+            artifacts_path,
+            trust_remote_code=self.vlm_options.trust_remote_code,
+            revision=self.vlm_options.revision,
+        )
+
+        # --------- SamplingParams (runtime) ---------
+        self.sampling_params = SamplingParams(
+            temperature=self.temperature,
+            max_tokens=self.max_new_tokens,
+            stop=(self.vlm_options.stop_strings or None),
+            **gen_cfg,
+        )
+
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
+        # If disabled, pass-through
+        if not self.enabled:
+            for page in page_batch:
+                yield page
+            return
+
         page_list = list(page_batch)
         if not page_list:
             return
 
-
-
+        # Preserve original order
+        original_order = page_list[:]
 
+        # Separate valid/invalid
+        valid_pages: list[Page] = []
+        invalid_pages: list[Page] = []
         for page in page_list:
             assert page._backend is not None
-            if
-                invalid_pages.append(page)
-            else:
+            if page._backend.is_valid():
                 valid_pages.append(page)
+            else:
+                invalid_pages.append(page)
 
-        # Process valid pages in batch
         if valid_pages:
             with TimeRecorder(conv_res, "vlm"):
-
-
-
-                pages_with_images = []
+                images: list[Image] = []
+                user_prompts: list[str] = []
+                pages_with_images: list[Page] = []
 
                 for page in valid_pages:
                     assert page.size is not None
                     hi_res_image = page.get_image(
-                        scale=self.vlm_options.scale,
+                        scale=self.vlm_options.scale,
+                        max_size=self.vlm_options.max_size,
                     )
+                    if hi_res_image is None:
+                        continue
 
-
-                    if hi_res_image is not None:
-                        images.append(hi_res_image)
+                    images.append(hi_res_image)
 
-
-
-                        user_prompt = self.vlm_options.prompt(page.parsed_page)
-                    else:
-                        user_prompt = self.vlm_options.prompt
+                    # Define prompt structure
+                    user_prompt = self.vlm_options.build_prompt(page.parsed_page)
 
-
-
+                    user_prompts.append(user_prompt)
+                    pages_with_images.append(page)
 
-
-                if images:  # Only if we have valid images
+                if images:
                     predictions = list(self.process_images(images, user_prompts))
-
-                    # Attach results to pages
                     for page, prediction in zip(pages_with_images, predictions):
                         page.predictions.vlm_response = prediction
 
-        # Yield
-        for page in
-            yield page
-        for page in valid_pages:
+        # Yield in original order
+        for page in original_order:
             yield page
 
     def process_images(
@@ -156,50 +235,33 @@ class VllmVlmModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
         image_batch: Iterable[Union[Image, np.ndarray]],
         prompt: Union[str, list[str]],
     ) -> Iterable[VlmPrediction]:
-        """Process
-
-
-            image_batch: Iterable of PIL Images or numpy arrays
-            prompt: Either:
-                - str: Single prompt used for all images
-                - list[str]: List of prompts (one per image, must match image count)
+        """Process images in a single batched vLLM inference call."""
+        import numpy as np
+        from PIL import Image as PILImage
 
-
-            ValueError: If prompt list length doesn't match image count.
-        """
+        # -- Normalize images to RGB PIL
         pil_images: list[Image] = []
-
         for img in image_batch:
-            # Convert numpy array to PIL Image if needed
             if isinstance(img, np.ndarray):
-                if img.ndim == 3 and img.shape[2] in
-                    from PIL import Image as PILImage
-
+                if img.ndim == 3 and img.shape[2] in (3, 4):
                     pil_img = PILImage.fromarray(img.astype(np.uint8))
                 elif img.ndim == 2:
-                    from PIL import Image as PILImage
-
                     pil_img = PILImage.fromarray(img.astype(np.uint8), mode="L")
                 else:
                     raise ValueError(f"Unsupported numpy array shape: {img.shape}")
             else:
                 pil_img = img
-
-            # Ensure image is in RGB mode (handles RGBA, L, etc.)
             if pil_img.mode != "RGB":
                 pil_img = pil_img.convert("RGB")
-
             pil_images.append(pil_img)
 
-        if
+        if not pil_images:
             return
 
-        #
+        # Normalize prompts
         if isinstance(prompt, str):
-            # Single prompt for all images
             user_prompts = [prompt] * len(pil_images)
         elif isinstance(prompt, list):
-            # List of prompts (one per image)
             if len(prompt) != len(pil_images):
                 raise ValueError(
                     f"Number of prompts ({len(prompt)}) must match number of images ({len(pil_images)})"
@@ -208,28 +270,31 @@ class VllmVlmModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
         else:
             raise ValueError(f"prompt must be str or list[str], got {type(prompt)}")
 
-        # Format prompts
-        prompts: list[str] = [
-            self.formulate_prompt(user_prompt) for user_prompt in user_prompts
-        ]
+        # Format prompts
+        prompts: list[str] = [self.formulate_prompt(up) for up in user_prompts]
 
-        #
-        llm_inputs = [
-
-
+        # Build vLLM inputs
+        llm_inputs = [
+            {"prompt": p, "multi_modal_data": {"image": im}}
+            for p, im in zip(prompts, pil_images)
+        ]
 
+        # Generate
+        assert self.llm is not None and self.sampling_params is not None
         start_time = time.time()
         outputs = self.llm.generate(llm_inputs, sampling_params=self.sampling_params)  # type: ignore
         generation_time = time.time() - start_time
 
-        #
-        if
-
-
-                f"Generated {num_tokens} tokens in
-
+        # Optional debug
+        if outputs:
+            try:
+                num_tokens = len(outputs[0].outputs[0].token_ids)
+                _log.debug(f"Generated {num_tokens} tokens in {generation_time:.2f}s.")
+            except Exception:
+                pass
 
+        # Emit predictions
         for output in outputs:
-
-            decoded_text = self.vlm_options.decode_response(
+            text = output.outputs[0].text if output.outputs else ""
+            decoded_text = self.vlm_options.decode_response(text)
             yield VlmPrediction(text=decoded_text, generation_time=generation_time)
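The rewritten constructor above routes `vlm_options.extra_generation_config` through the two allowlists: engine keys go into the `vllm.LLM(...)` kwargs at load time, sampling keys go into the shared `vllm.SamplingParams(...)` used for every `generate()` call, and anything else is logged and dropped. A minimal sketch of what a caller could pass (the concrete values below are illustrative, not package defaults):

```python
# Illustrative only: keys are routed by _VLLM_ENGINE_KEYS / _VLLM_SAMPLING_KEYS above.
extra_generation_config = {
    # engine/load-time -> forwarded to vllm.LLM(...)
    "max_model_len": 8192,
    "gpu_memory_utilization": 0.5,
    "enforce_eager": True,
    # runtime -> forwarded to vllm.SamplingParams(...)
    "temperature": 0.0,
    "top_p": 0.9,
    "seed": 42,
    # unknown keys are ignored with a warning:
    # "Ignoring unknown extra_generation_config keys for vLLM: ['typo_key']"
    "typo_key": 1,
}
```

Engine keys take effect once when the model is loaded; sampling keys apply to every batched generation through the single `SamplingParams` instance built in `__init__`.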
docling/utils/api_image_request.py

@@ -1,13 +1,15 @@
 import base64
+import json
 import logging
 from io import BytesIO
-from typing import Dict, Optional
+from typing import Dict, List, Optional
 
 import requests
 from PIL import Image
 from pydantic import AnyUrl
 
 from docling.datamodel.base_models import OpenAiApiResponse
+from docling.models.utils.generation_utils import GenerationStopper
 
 _log = logging.getLogger(__name__)
 
@@ -59,3 +61,107 @@ def api_image_request(
     api_resp = OpenAiApiResponse.model_validate_json(r.text)
     generated_text = api_resp.choices[0].message.content.strip()
     return generated_text
+
+
+def api_image_request_streaming(
+    image: Image.Image,
+    prompt: str,
+    url: AnyUrl,
+    *,
+    timeout: float = 20,
+    headers: Optional[Dict[str, str]] = None,
+    generation_stoppers: List[GenerationStopper] = [],
+    **params,
+) -> str:
+    """
+    Stream a chat completion from an OpenAI-compatible server (e.g., vLLM).
+    Parses SSE lines: 'data: {json}\n\n', terminated by 'data: [DONE]'.
+    Accumulates text and calls stopper.should_stop(window) as chunks arrive.
+    If stopper triggers, the HTTP connection is closed to abort server-side generation.
+    """
+    img_io = BytesIO()
+    image.save(img_io, "PNG")
+    image_b64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/png;base64,{image_b64}"},
+                },
+                {"type": "text", "text": prompt},
+            ],
+        }
+    ]
+
+    payload = {
+        "messages": messages,
+        "stream": True,  # <-- critical for SSE streaming
+        **params,
+    }
+
+    # Debug: Log the payload to verify temperature is included
+    _log.debug(f"API streaming request payload: {json.dumps(payload, indent=2)}")
+
+    # Some servers require Accept: text/event-stream for SSE.
+    # It's safe to set it; OpenAI-compatible servers tolerate it.
+    hdrs = {"Accept": "text/event-stream", **(headers or {})}
+
+    # Try to force temperature via header if server ignores payload parameter
+    if "temperature" in params:
+        hdrs["X-Temperature"] = str(params["temperature"])
+
+    # Stream the HTTP response
+    with requests.post(
+        str(url), headers=hdrs, json=payload, timeout=timeout, stream=True
+    ) as r:
+        if not r.ok:
+            _log.error(
+                f"Error calling the API {url} in streaming mode. Response was {r.text}"
+            )
+        r.raise_for_status()
+
+        full_text = []
+        for raw_line in r.iter_lines(decode_unicode=True):
+            if not raw_line:  # keep-alives / blank lines
+                continue
+            if not raw_line.startswith("data:"):
+                # Some proxies inject comments; ignore anything not starting with 'data:'
+                continue
+
+            data = raw_line[len("data:") :].strip()
+            if data == "[DONE]":
+                break
+
+            try:
+                obj = json.loads(data)
+            except json.JSONDecodeError:
+                _log.debug("Skipping non-JSON SSE chunk: %r", data[:200])
+                continue
+
+            # OpenAI-compatible delta format
+            # obj["choices"][0]["delta"]["content"] may be None or missing (e.g., tool calls)
+            try:
+                delta = obj["choices"][0].get("delta") or {}
+                piece = delta.get("content") or ""
+            except (KeyError, IndexError) as e:
+                _log.debug("Unexpected SSE chunk shape: %s", e)
+                piece = ""
+
+            if piece:
+                full_text.append(piece)
+                for stopper in generation_stoppers:
+                    # Respect stopper's lookback window. We use a simple string window which
+                    # works with the GenerationStopper interface.
+                    lookback = max(1, stopper.lookback_tokens())
+                    window = "".join(full_text)[-lookback:]
+                    if stopper.should_stop(window):
+                        # Break out of the loop cleanly. The context manager will handle
+                        # closing the connection when we exit the 'with' block.
+                        # vLLM/OpenAI-compatible servers will detect the client disconnect
+                        # and abort the request server-side.
+                        return "".join(full_text)
+
+    return "".join(full_text)
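The new streaming helper only relies on two methods of a stopper, `lookback_tokens()` and `should_stop(window)`, so a custom stopper can be as small as the sketch below. The class name and stop criterion are made up for illustration; the actual `GenerationStopper` base class lives in the new `docling/models/utils/generation_utils.py`, which is not shown in this diff.

```python
# Hypothetical stopper compatible with the calls api_image_request_streaming() makes.
class PhraseStopper:
    """Stops streaming once a given phrase appears in the recent output."""

    def __init__(self, phrase: str, lookback: int = 200):
        self._phrase = phrase
        self._lookback = lookback

    def lookback_tokens(self) -> int:
        # Size of the trailing text window the caller passes to should_stop().
        return self._lookback

    def should_stop(self, window: str) -> bool:
        return self._phrase in window


# Illustrative call; extra keyword arguments land in the request payload:
# text = api_image_request_streaming(
#     image, prompt, url,
#     generation_stoppers=[PhraseStopper("</doctag>")],
#     temperature=0.0, max_tokens=4096,
# )
```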
docling-2.55.0.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling
-Version: 2.53.0
+Version: 2.55.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
 License-Expression: MIT
@@ -26,7 +26,7 @@ Requires-Python: <4.0,>=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: pydantic<3.0.0,>=2.0.0
-Requires-Dist: docling-core[chunking]<3.0.0,>=2.48.
+Requires-Dist: docling-core[chunking]<3.0.0,>=2.48.2
 Requires-Dist: docling-parse<5.0.0,>=4.4.0
 Requires-Dist: docling-ibm-models<4,>=3.9.1
 Requires-Dist: filetype<2.0.0,>=1.2.0
@@ -37,7 +37,7 @@ Requires-Dist: requests<3.0.0,>=2.32.2
 Requires-Dist: easyocr<2.0,>=1.7
 Requires-Dist: certifi>=2024.7.4
 Requires-Dist: rtree<2.0.0,>=1.3.0
-Requires-Dist: typer<0.
+Requires-Dist: typer<0.20.0,>=0.12.5
 Requires-Dist: python-docx<2.0.0,>=1.1.2
 Requires-Dist: python-pptx<2.0.0,>=1.0.2
 Requires-Dist: beautifulsoup4<5.0.0,>=4.12.3
@@ -101,7 +101,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
 
 ## Features
 
-* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
+* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, VTT, images (PNG, TIFF, JPEG, ...), and more
 * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
 * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
 * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
@@ -117,13 +117,13 @@ Docling simplifies document processing, parsing diverse formats — including ad
 * 📤 Structured [information extraction][extraction] \[🧪 beta\]
 * 📑 New layout model (**Heron**) by default, for faster PDF parsing
 * 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
+* 💬 Parsing of Web Video Text Tracks (WebVTT) files
 
 ### Coming soon
 
 * 📝 Metadata extraction, including title, authors, references & language
 * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
 * 📝 Complex chemistry understanding (Molecular structures)
-* 📝 Parsing of Web Video Text Tracks (WebVTT) files
 
 ## Installation
 