docling 2.53.0__py3-none-any.whl → 2.55.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
@@ -7,9 +7,7 @@ from typing import Any, Dict, Optional, Union
 import numpy as np
 from PIL.Image import Image
 
-from docling.datamodel.accelerator_options import (
-    AcceleratorOptions,
-)
+from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options_vlm_model import (
@@ -17,9 +15,7 @@ from docling.datamodel.pipeline_options_vlm_model import (
     TransformersPromptStyle,
 )
 from docling.models.base_model import BaseVlmPageModel
-from docling.models.utils.hf_model_download import (
-    HuggingFaceModelDownloadMixin,
-)
+from docling.models.utils.hf_model_download import HuggingFaceModelDownloadMixin
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder
 
@@ -27,6 +23,62 @@ _log = logging.getLogger(__name__)
 
 
 class VllmVlmModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
+    """
+    vLLM-backed vision-language model that accepts PIL images (or numpy arrays)
+    via vLLM's multi_modal_data, with prompt formatting handled by formulate_prompt().
+    """
+
+    # --------- Allowlist of vLLM args ---------
+    # SamplingParams (runtime generation controls)
+    _VLLM_SAMPLING_KEYS = {
+        # Core
+        "max_tokens",
+        "temperature",
+        "top_p",
+        "top_k",
+        # Penalties
+        "presence_penalty",
+        "frequency_penalty",
+        "repetition_penalty",
+        # Stops / outputs
+        "stop",
+        "stop_token_ids",
+        "skip_special_tokens",
+        "spaces_between_special_tokens",
+        # Search / length
+        "n",
+        "best_of",
+        "length_penalty",
+        "early_stopping",
+        # Misc
+        "logprobs",
+        "prompt_logprobs",
+        "min_p",
+        "seed",
+    }
+
+    # LLM(...) / EngineArgs (engine/load-time controls)
+    _VLLM_ENGINE_KEYS = {
+        # Model/tokenizer/impl
+        "tokenizer",
+        "tokenizer_mode",
+        "download_dir",
+        # Parallelism / memory / lengths
+        "tensor_parallel_size",
+        "pipeline_parallel_size",
+        "gpu_memory_utilization",
+        "max_model_len",
+        "max_num_batched_tokens",
+        "kv_cache_dtype",
+        "dtype",
+        # Quantization (coarse switch)
+        "quantization",
+        # Multimodal limits
+        "limit_mm_per_prompt",
+        # Execution toggles
+        "enforce_eager",
+    }
+
     def __init__(
         self,
         enabled: bool,
@@ -35,120 +87,147 @@ class VllmVlmModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
         vlm_options: InlineVlmOptions,
     ):
         self.enabled = enabled
-
         self.vlm_options = vlm_options
 
-        if self.enabled:
-            from transformers import AutoProcessor
-            from vllm import LLM, SamplingParams
-
-            self.device = decide_device(
-                accelerator_options.device,
-                supported_devices=vlm_options.supported_devices,
-            )
-            _log.debug(f"Available device for VLM: {self.device}")
-
-            self.max_new_tokens = vlm_options.max_new_tokens
-            self.temperature = vlm_options.temperature
-
-            repo_cache_folder = vlm_options.repo_id.replace("/", "--")
+        self.llm = None
+        self.sampling_params = None
+        self.processor = None  # used for CHAT templating in formulate_prompt()
+        self.device = "cpu"
+        self.max_new_tokens = vlm_options.max_new_tokens
+        self.temperature = vlm_options.temperature
 
-            if artifacts_path is None:
-                artifacts_path = self.download_models(self.vlm_options.repo_id)
-            elif (artifacts_path / repo_cache_folder).exists():
-                artifacts_path = artifacts_path / repo_cache_folder
-
-            # Initialize VLLM LLM
-            llm_kwargs: Dict[str, Any] = {
-                "model": str(artifacts_path),
-                "limit_mm_per_prompt": {"image": 1},
-                "trust_remote_code": vlm_options.trust_remote_code,
-                "model_impl": "transformers",
-                "gpu_memory_utilization": 0.3,  # hardcoded for now, leaves room for ~3 different models.
-            }
-
-            # Add device-specific configurations
-
-            if self.device == "cpu":
-                llm_kwargs["device"] = "cpu"
+        if not self.enabled:
+            return
 
-            # Add quantization if specified
-            if vlm_options.quantized:
-                if vlm_options.load_in_8bit:
-                    llm_kwargs["quantization"] = "bitsandbytes"
+        from transformers import AutoProcessor
+        from vllm import LLM, SamplingParams
 
-            self.llm = LLM(**llm_kwargs)
+        # Device selection
+        self.device = decide_device(
+            accelerator_options.device, supported_devices=vlm_options.supported_devices
+        )
+        _log.debug(f"Available device for VLM: {self.device}")
 
-            # Initialize processor for prompt formatting
-            self.processor = AutoProcessor.from_pretrained(
-                artifacts_path,
-                trust_remote_code=vlm_options.trust_remote_code,
+        # Resolve artifacts path / cache folder
+        repo_cache_folder = vlm_options.repo_id.replace("/", "--")
+        if artifacts_path is None:
+            artifacts_path = self.download_models(
+                self.vlm_options.repo_id, revision=self.vlm_options.revision
             )
-
-            # Set up sampling parameters
-            self.sampling_params = SamplingParams(
-                temperature=self.temperature,
-                max_tokens=self.max_new_tokens,
-                stop=vlm_options.stop_strings if vlm_options.stop_strings else None,
-                **vlm_options.extra_generation_config,
+        elif (artifacts_path / repo_cache_folder).exists():
+            artifacts_path = artifacts_path / repo_cache_folder
+
+        # --------- Strict split & validation of extra_generation_config ---------
+        extra_cfg = self.vlm_options.extra_generation_config
+
+        load_cfg = {k: v for k, v in extra_cfg.items() if k in self._VLLM_ENGINE_KEYS}
+        gen_cfg = {k: v for k, v in extra_cfg.items() if k in self._VLLM_SAMPLING_KEYS}
+
+        unknown = sorted(
+            k
+            for k in extra_cfg.keys()
+            if k not in self._VLLM_ENGINE_KEYS and k not in self._VLLM_SAMPLING_KEYS
+        )
+        if unknown:
+            _log.warning(
+                "Ignoring unknown extra_generation_config keys for vLLM: %s", unknown
             )
 
+        # --------- Construct LLM kwargs (engine/load-time) ---------
+        llm_kwargs: Dict[str, Any] = {
+            "model": str(artifacts_path),
+            "model_impl": "transformers",
+            "limit_mm_per_prompt": {"image": 1},
+            "revision": self.vlm_options.revision,
+            "trust_remote_code": self.vlm_options.trust_remote_code,
+            **load_cfg,
+        }
+
+        if self.device == "cpu":
+            llm_kwargs.setdefault("enforce_eager", True)
+        else:
+            llm_kwargs.setdefault(
+                "gpu_memory_utilization", 0.3
+            )  # room for other models
+
+        # Quantization (kept as-is; coarse)
+        if self.vlm_options.quantized and self.vlm_options.load_in_8bit:
+            llm_kwargs.setdefault("quantization", "bitsandbytes")
+
+        # Initialize vLLM LLM
+        self.llm = LLM(**llm_kwargs)
+
+        # Initialize processor for prompt templating (needed for CHAT style)
+        self.processor = AutoProcessor.from_pretrained(
+            artifacts_path,
+            trust_remote_code=self.vlm_options.trust_remote_code,
+            revision=self.vlm_options.revision,
+        )
+
+        # --------- SamplingParams (runtime) ---------
+        self.sampling_params = SamplingParams(
+            temperature=self.temperature,
+            max_tokens=self.max_new_tokens,
+            stop=(self.vlm_options.stop_strings or None),
+            **gen_cfg,
+        )
+
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
+        # If disabled, pass-through
+        if not self.enabled:
+            for page in page_batch:
+                yield page
+            return
+
         page_list = list(page_batch)
         if not page_list:
             return
 
-        valid_pages = []
-        invalid_pages = []
+        # Preserve original order
+        original_order = page_list[:]
 
+        # Separate valid/invalid
+        valid_pages: list[Page] = []
+        invalid_pages: list[Page] = []
         for page in page_list:
             assert page._backend is not None
-            if not page._backend.is_valid():
-                invalid_pages.append(page)
-            else:
+            if page._backend.is_valid():
                 valid_pages.append(page)
+            else:
+                invalid_pages.append(page)
 
-        # Process valid pages in batch
         if valid_pages:
             with TimeRecorder(conv_res, "vlm"):
-                # Prepare images and prompts for batch processing
-                images = []
-                user_prompts = []
-                pages_with_images = []
+                images: list[Image] = []
+                user_prompts: list[str] = []
+                pages_with_images: list[Page] = []
 
                 for page in valid_pages:
                     assert page.size is not None
                     hi_res_image = page.get_image(
-                        scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
+                        scale=self.vlm_options.scale,
+                        max_size=self.vlm_options.max_size,
                     )
+                    if hi_res_image is None:
+                        continue
 
-                    # Only process pages with valid images
-                    if hi_res_image is not None:
-                        images.append(hi_res_image)
+                    images.append(hi_res_image)
 
-                        # Define prompt structure
-                        if callable(self.vlm_options.prompt):
-                            user_prompt = self.vlm_options.prompt(page.parsed_page)
-                        else:
-                            user_prompt = self.vlm_options.prompt
+                    # Define prompt structure
+                    user_prompt = self.vlm_options.build_prompt(page.parsed_page)
 
-                        user_prompts.append(user_prompt)
-                        pages_with_images.append(page)
+                    user_prompts.append(user_prompt)
+                    pages_with_images.append(page)
 
-                # Use process_images for the actual inference
-                if images:  # Only if we have valid images
+                if images:
                     predictions = list(self.process_images(images, user_prompts))
-
-                    # Attach results to pages
                     for page, prediction in zip(pages_with_images, predictions):
                         page.predictions.vlm_response = prediction
 
-        # Yield all pages (valid and invalid)
-        for page in invalid_pages:
-            yield page
-        for page in valid_pages:
+        # Yield in original order
+        for page in original_order:
             yield page
 
     def process_images(
@@ -156,50 +235,33 @@ class VllmVlmModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
         image_batch: Iterable[Union[Image, np.ndarray]],
         prompt: Union[str, list[str]],
     ) -> Iterable[VlmPrediction]:
-        """Process raw images without page metadata in a single batched inference call.
-
-        Args:
-            image_batch: Iterable of PIL Images or numpy arrays
-            prompt: Either:
-                - str: Single prompt used for all images
-                - list[str]: List of prompts (one per image, must match image count)
+        """Process images in a single batched vLLM inference call."""
+        import numpy as np
+        from PIL import Image as PILImage
 
-        Raises:
-            ValueError: If prompt list length doesn't match image count.
-        """
+        # -- Normalize images to RGB PIL
         pil_images: list[Image] = []
-
         for img in image_batch:
-            # Convert numpy array to PIL Image if needed
             if isinstance(img, np.ndarray):
-                if img.ndim == 3 and img.shape[2] in [3, 4]:
-                    from PIL import Image as PILImage
-
+                if img.ndim == 3 and img.shape[2] in (3, 4):
                     pil_img = PILImage.fromarray(img.astype(np.uint8))
                 elif img.ndim == 2:
-                    from PIL import Image as PILImage
-
                     pil_img = PILImage.fromarray(img.astype(np.uint8), mode="L")
                 else:
                     raise ValueError(f"Unsupported numpy array shape: {img.shape}")
             else:
                 pil_img = img
-
-            # Ensure image is in RGB mode (handles RGBA, L, etc.)
             if pil_img.mode != "RGB":
                 pil_img = pil_img.convert("RGB")
-
             pil_images.append(pil_img)
 
-        if len(pil_images) == 0:
+        if not pil_images:
             return
 
-        # Handle prompt parameter
+        # Normalize prompts
        if isinstance(prompt, str):
-            # Single prompt for all images
            user_prompts = [prompt] * len(pil_images)
        elif isinstance(prompt, list):
-            # List of prompts (one per image)
            if len(prompt) != len(pil_images):
                raise ValueError(
                    f"Number of prompts ({len(prompt)}) must match number of images ({len(pil_images)})"
@@ -208,28 +270,31 @@ class VllmVlmModel(BaseVlmPageModel, HuggingFaceModelDownloadMixin):
         else:
             raise ValueError(f"prompt must be str or list[str], got {type(prompt)}")
 
-        # Format prompts individually
-        prompts: list[str] = [
-            self.formulate_prompt(user_prompt) for user_prompt in user_prompts
-        ]
+        # Format prompts
+        prompts: list[str] = [self.formulate_prompt(up) for up in user_prompts]
 
-        # Prepare VLLM inputs
-        llm_inputs = []
-        for prompt, image in zip(prompts, pil_images):
-            llm_inputs.append({"prompt": prompt, "multi_modal_data": {"image": image}})
+        # Build vLLM inputs
+        llm_inputs = [
+            {"prompt": p, "multi_modal_data": {"image": im}}
+            for p, im in zip(prompts, pil_images)
+        ]
 
+        # Generate
+        assert self.llm is not None and self.sampling_params is not None
         start_time = time.time()
         outputs = self.llm.generate(llm_inputs, sampling_params=self.sampling_params)  # type: ignore
         generation_time = time.time() - start_time
 
-        # Logging tokens count for the first sample as a representative metric
-        if len(outputs) > 0:
-            num_tokens = len(outputs[0].outputs[0].token_ids)
-            _log.debug(
-                f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
-            )
+        # Optional debug
+        if outputs:
+            try:
+                num_tokens = len(outputs[0].outputs[0].token_ids)
+                _log.debug(f"Generated {num_tokens} tokens in {generation_time:.2f}s.")
+            except Exception:
+                pass
 
+        # Emit predictions
         for output in outputs:
-            # Apply decode_response to the output text
-            decoded_text = self.vlm_options.decode_response(output.outputs[0].text)
+            text = output.outputs[0].text if output.outputs else ""
+            decoded_text = self.vlm_options.decode_response(text)
             yield VlmPrediction(text=decoded_text, generation_time=generation_time)
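
The net effect of the `__init__` rewrite above is that options passed through `InlineVlmOptions.extra_generation_config` are no longer forwarded wholesale to `SamplingParams`: keys in `_VLLM_ENGINE_KEYS` go into the `LLM(...)` constructor, keys in `_VLLM_SAMPLING_KEYS` go into `SamplingParams`, and anything else is logged and dropped. A minimal standalone sketch of that routing, with illustrative key subsets and option values (not taken from the package):

```python
# Illustrative subsets of the allowlists defined in VllmVlmModel above.
_VLLM_ENGINE_KEYS = {"max_model_len", "gpu_memory_utilization", "dtype"}
_VLLM_SAMPLING_KEYS = {"max_tokens", "temperature", "top_p", "seed"}

extra_cfg = {
    "max_model_len": 8192,  # engine/load-time -> forwarded to LLM(**llm_kwargs)
    "top_p": 0.9,           # runtime -> forwarded to SamplingParams(...)
    "banana": 1,            # unknown -> warned about and ignored
}

load_cfg = {k: v for k, v in extra_cfg.items() if k in _VLLM_ENGINE_KEYS}
gen_cfg = {k: v for k, v in extra_cfg.items() if k in _VLLM_SAMPLING_KEYS}
unknown = sorted(k for k in extra_cfg if k not in _VLLM_ENGINE_KEYS | _VLLM_SAMPLING_KEYS)

print(load_cfg)  # {'max_model_len': 8192}
print(gen_cfg)   # {'top_p': 0.9}
print(unknown)   # ['banana']
```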
@@ -1,13 +1,15 @@
 import base64
+import json
 import logging
 from io import BytesIO
-from typing import Dict, Optional
+from typing import Dict, List, Optional
 
 import requests
 from PIL import Image
 from pydantic import AnyUrl
 
 from docling.datamodel.base_models import OpenAiApiResponse
+from docling.models.utils.generation_utils import GenerationStopper
 
 _log = logging.getLogger(__name__)
 
@@ -59,3 +61,107 @@ def api_image_request(
     api_resp = OpenAiApiResponse.model_validate_json(r.text)
     generated_text = api_resp.choices[0].message.content.strip()
     return generated_text
+
+
+def api_image_request_streaming(
+    image: Image.Image,
+    prompt: str,
+    url: AnyUrl,
+    *,
+    timeout: float = 20,
+    headers: Optional[Dict[str, str]] = None,
+    generation_stoppers: List[GenerationStopper] = [],
+    **params,
+) -> str:
+    """
+    Stream a chat completion from an OpenAI-compatible server (e.g., vLLM).
+    Parses SSE lines: 'data: {json}\\n\\n', terminated by 'data: [DONE]'.
+    Accumulates text and calls stopper.should_stop(window) as chunks arrive.
+    If stopper triggers, the HTTP connection is closed to abort server-side generation.
+    """
+    img_io = BytesIO()
+    image.save(img_io, "PNG")
+    image_b64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/png;base64,{image_b64}"},
+                },
+                {"type": "text", "text": prompt},
+            ],
+        }
+    ]
+
+    payload = {
+        "messages": messages,
+        "stream": True,  # <-- critical for SSE streaming
+        **params,
+    }
+
+    # Debug: Log the payload to verify temperature is included
+    _log.debug(f"API streaming request payload: {json.dumps(payload, indent=2)}")
+
+    # Some servers require Accept: text/event-stream for SSE.
+    # It's safe to set it; OpenAI-compatible servers tolerate it.
+    hdrs = {"Accept": "text/event-stream", **(headers or {})}
+
+    # Try to force temperature via header if server ignores payload parameter
+    if "temperature" in params:
+        hdrs["X-Temperature"] = str(params["temperature"])
+
+    # Stream the HTTP response
+    with requests.post(
+        str(url), headers=hdrs, json=payload, timeout=timeout, stream=True
+    ) as r:
+        if not r.ok:
+            _log.error(
+                f"Error calling the API {url} in streaming mode. Response was {r.text}"
+            )
+        r.raise_for_status()
+
+        full_text = []
+        for raw_line in r.iter_lines(decode_unicode=True):
+            if not raw_line:  # keep-alives / blank lines
+                continue
+            if not raw_line.startswith("data:"):
+                # Some proxies inject comments; ignore anything not starting with 'data:'
+                continue
+
+            data = raw_line[len("data:") :].strip()
+            if data == "[DONE]":
+                break
+
+            try:
+                obj = json.loads(data)
+            except json.JSONDecodeError:
+                _log.debug("Skipping non-JSON SSE chunk: %r", data[:200])
+                continue
+
+            # OpenAI-compatible delta format
+            # obj["choices"][0]["delta"]["content"] may be None or missing (e.g., tool calls)
+            try:
+                delta = obj["choices"][0].get("delta") or {}
+                piece = delta.get("content") or ""
+            except (KeyError, IndexError) as e:
+                _log.debug("Unexpected SSE chunk shape: %s", e)
+                piece = ""
+
+            if piece:
+                full_text.append(piece)
+            for stopper in generation_stoppers:
+                # Respect stopper's lookback window. We use a simple string window which
+                # works with the GenerationStopper interface.
+                lookback = max(1, stopper.lookback_tokens())
+                window = "".join(full_text)[-lookback:]
+                if stopper.should_stop(window):
+                    # Break out of the loop cleanly. The context manager will handle
+                    # closing the connection when we exit the 'with' block.
+                    # vLLM/OpenAI-compatible servers will detect the client disconnect
+                    # and abort the request server-side.
+                    return "".join(full_text)
+
+    return "".join(full_text)
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: docling
-Version: 2.53.0
+Version: 2.55.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
 License-Expression: MIT
@@ -26,7 +26,7 @@ Requires-Python: <4.0,>=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: pydantic<3.0.0,>=2.0.0
-Requires-Dist: docling-core[chunking]<3.0.0,>=2.48.0
+Requires-Dist: docling-core[chunking]<3.0.0,>=2.48.2
 Requires-Dist: docling-parse<5.0.0,>=4.4.0
 Requires-Dist: docling-ibm-models<4,>=3.9.1
 Requires-Dist: filetype<2.0.0,>=1.2.0
@@ -37,7 +37,7 @@ Requires-Dist: requests<3.0.0,>=2.32.2
 Requires-Dist: easyocr<2.0,>=1.7
 Requires-Dist: certifi>=2024.7.4
 Requires-Dist: rtree<2.0.0,>=1.3.0
-Requires-Dist: typer<0.17.0,>=0.12.5
+Requires-Dist: typer<0.20.0,>=0.12.5
 Requires-Dist: python-docx<2.0.0,>=1.1.2
 Requires-Dist: python-pptx<2.0.0,>=1.0.2
 Requires-Dist: beautifulsoup4<5.0.0,>=4.12.3
@@ -101,7 +101,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
 
 ## Features
 
-* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
+* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, VTT, images (PNG, TIFF, JPEG, ...), and more
 * 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
 * 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
 * ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
@@ -117,13 +117,13 @@ Docling simplifies document processing, parsing diverse formats — including ad
 * 📤 Structured [information extraction][extraction] \[🧪 beta\]
 * 📑 New layout model (**Heron**) by default, for faster PDF parsing
 * 🔌 [MCP server](https://docling-project.github.io/docling/usage/mcp/) for agentic applications
+* 💬 Parsing of Web Video Text Tracks (WebVTT) files
 
 ### Coming soon
 
 * 📝 Metadata extraction, including title, authors, references & language
 * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
 * 📝 Complex chemistry understanding (Molecular structures)
-* 📝 Parsing of Web Video Text Tracks (WebVTT) files
 
 ## Installation