docling 2.53.0__py3-none-any.whl → 2.55.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,9 +2,9 @@ import logging
2
2
  import traceback
3
3
  from io import BytesIO
4
4
  from pathlib import Path
5
- from typing import Final, Optional, Union
5
+ from typing import Final, Optional, Union, cast
6
6
 
7
- from bs4 import BeautifulSoup, Tag
7
+ from bs4 import BeautifulSoup, NavigableString, Tag
8
8
  from docling_core.types.doc import (
9
9
  DocItemLabel,
10
10
  DoclingDocument,
@@ -12,6 +12,8 @@ from docling_core.types.doc import (
12
12
  GroupItem,
13
13
  GroupLabel,
14
14
  NodeItem,
15
+ TableCell,
16
+ TableData,
15
17
  TextItem,
16
18
  )
17
19
  from lxml import etree
@@ -350,7 +352,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
350
352
 
351
353
  return
352
354
 
353
- def _parse_element_citation(self, node: etree._Element) -> str: # noqa: C901
355
+ def _parse_element_citation(self, node: etree._Element) -> str:
354
356
  citation: Citation = {
355
357
  "author_names": "",
356
358
  "title": "",
@@ -535,6 +537,110 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
535
537
 
536
538
  return
537
539
 
540
@staticmethod
def parse_table_data(element: Tag) -> Optional[TableData]:
    """Build a ``TableData`` grid from a ``<table>`` tag.

    The table is scanned twice: a first pass over the ``<tr>`` rows derives
    the grid dimensions (honouring column spans), and a second pass creates
    one ``TableCell`` per ``<td>``/``<th>`` with its row/column offsets.

    A row whose cells are all ``<th>`` with a row span different from 1 is
    treated as a "row header" row: it does not get a grid row of its own and
    its cells are placed relative to the last counted row instead.

    Args:
        element: The ``<table>`` tag to parse. Inline formulas inside its
            cells are replaced in place by their ``$$...$$`` text, so the
            tag is modified by this call.

    Returns:
        The parsed table, or ``None`` when the table contains a nested
        ``<table>`` (nested tables are skipped).
    """
    # TODO, see how to implement proper support for rich tables from HTML backend
    if element.find("table") is not None:
        _log.debug("Skipping nested table.")
        return None

    # Find the number of rows and columns (taking into account spans)
    num_rows = 0
    num_cols = 0
    for row in element("tr"):
        if not isinstance(row, Tag):
            continue
        col_count = 0
        is_row_header = True
        for cell in row(["td", "th"]):
            # Guard on the cell itself; the row was already checked above.
            if not isinstance(cell, Tag):
                continue
            col_span, row_span = HTMLDocumentBackend._get_cell_spans(cell)
            col_count += col_span
            if cell.name == "td" or row_span == 1:
                is_row_header = False
        num_cols = max(num_cols, col_count)
        if not is_row_header:
            num_rows += 1

    _log.debug(f"The table has {num_rows} rows and {num_cols} cols.")

    # Occupancy grid: a slot holds the text of the cell covering it, or None
    # while it is still free. Used to skip slots taken by spanning cells.
    grid: list[list[Optional[str]]] = [
        [None for _ in range(num_cols)] for _ in range(num_rows)
    ]

    data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])

    # Iterate over the rows in the table
    start_row_span = 0
    row_idx = -1
    for row in element("tr"):
        if not isinstance(row, Tag):
            continue

        # For each row, find all the column cells (both <td> and <th>)
        cells = row(["td", "th"])

        # Check if cell is in a column header or row header
        col_header = True
        row_header = True
        for html_cell in cells:
            if isinstance(html_cell, Tag):
                _, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
                if html_cell.name == "td":
                    col_header = False
                    row_header = False
                elif row_span == 1:
                    row_header = False
        if not row_header:
            row_idx += 1
            start_row_span = 0
        else:
            # Row-header rows do not advance row_idx; they are offset from
            # the last counted row instead.
            start_row_span += 1

        # Extract the text content of each cell
        col_idx = 0
        for html_cell in cells:
            if not isinstance(html_cell, Tag):
                continue

            # extract inline formulas
            for formula in html_cell("inline-formula"):
                math_parts = formula.text.split("$$")
                # Exactly three parts means one "$$...$$" pair; keep only it.
                if len(math_parts) == 3:
                    math_formula = f"$${math_parts[1]}$$"
                    formula.replace_with(NavigableString(math_formula))

            # TODO: extract content correctly from table-cells with lists
            text = HTMLDocumentBackend.get_text(html_cell).strip()
            col_span, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
            if row_header:
                row_span -= 1
            # Advance to the first column not already covered by a span.
            # NOTE(review): grid[row_idx + start_row_span] is not checked
            # against num_rows; a row-header row without enough following
            # data rows looks like it could raise IndexError - confirm.
            while (
                col_idx < num_cols
                and grid[row_idx + start_row_span][col_idx] is not None
            ):
                col_idx += 1
            for r in range(start_row_span, start_row_span + row_span):
                for c in range(col_span):
                    if row_idx + r < num_rows and col_idx + c < num_cols:
                        grid[row_idx + r][col_idx + c] = text

            table_cell = TableCell(
                text=text,
                row_span=row_span,
                col_span=col_span,
                start_row_offset_idx=start_row_span + row_idx,
                end_row_offset_idx=start_row_span + row_idx + row_span,
                start_col_offset_idx=col_idx,
                end_col_offset_idx=col_idx + col_span,
                column_header=col_header,
                row_header=((not col_header) and html_cell.name == "th"),
            )
            data.table_cells.append(table_cell)

    return data
643
+
538
644
  def _add_table(
539
645
  self, doc: DoclingDocument, parent: NodeItem, table_xml_component: Table
540
646
  ) -> None:
@@ -543,8 +649,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
543
649
  if not isinstance(table_tag, Tag):
544
650
  return
545
651
 
546
- data = HTMLDocumentBackend.parse_table_data(table_tag)
547
-
652
+ data = JatsDocumentBackend.parse_table_data(table_tag)
548
653
  # TODO: format label vs caption once styling is supported
549
654
  label = table_xml_component["label"]
550
655
  caption = table_xml_component["caption"]
@@ -554,7 +659,6 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
554
659
  if table_text
555
660
  else None
556
661
  )
557
-
558
662
  if data is not None:
559
663
  doc.add_table(data=data, parent=parent, caption=table_caption)
560
664
 
@@ -609,7 +713,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
609
713
  )
610
714
  return
611
715
 
612
- def _walk_linear( # noqa: C901
716
+ def _walk_linear(
613
717
  self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
614
718
  ) -> str:
615
719
  skip_tags = ["term"]
@@ -1523,7 +1523,7 @@ class XmlTable:
1523
1523
 
1524
1524
  return ncols_max
1525
1525
 
1526
- def _parse_table(self, table: Tag) -> TableData: # noqa: C901
1526
+ def _parse_table(self, table: Tag) -> TableData:
1527
1527
  """Parse the content of a table tag.
1528
1528
 
1529
1529
  Args:
docling/cli/main.py CHANGED
@@ -66,6 +66,7 @@ from docling.datamodel.vlm_model_specs import (
66
66
  GRANITE_VISION_TRANSFORMERS,
67
67
  GRANITEDOCLING_MLX,
68
68
  GRANITEDOCLING_TRANSFORMERS,
69
+ GRANITEDOCLING_VLLM,
69
70
  SMOLDOCLING_MLX,
70
71
  SMOLDOCLING_TRANSFORMERS,
71
72
  SMOLDOCLING_VLLM,
@@ -686,6 +687,7 @@ def convert( # noqa: C901
686
687
  "To run SmolDocling faster, please install mlx-vlm:\n"
687
688
  "pip install mlx-vlm"
688
689
  )
690
+
689
691
  elif vlm_model == VlmModelType.GRANITEDOCLING:
690
692
  pipeline_options.vlm_options = GRANITEDOCLING_TRANSFORMERS
691
693
  if sys.platform == "darwin":
@@ -701,6 +703,9 @@ def convert( # noqa: C901
701
703
  elif vlm_model == VlmModelType.SMOLDOCLING_VLLM:
702
704
  pipeline_options.vlm_options = SMOLDOCLING_VLLM
703
705
 
706
+ elif vlm_model == VlmModelType.GRANITEDOCLING_VLLM:
707
+ pipeline_options.vlm_options = GRANITEDOCLING_VLLM
708
+
704
709
  pdf_format_option = PdfFormatOption(
705
710
  pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
706
711
  )
@@ -1,7 +1,6 @@
1
- import math
2
1
  from collections import defaultdict
3
2
  from enum import Enum
4
- from typing import TYPE_CHECKING, Dict, List, Optional, Type, Union
3
+ from typing import TYPE_CHECKING, Optional, Type, Union
5
4
 
6
5
  import numpy as np
7
6
  from docling_core.types.doc import (
@@ -14,9 +13,7 @@ from docling_core.types.doc import (
14
13
  )
15
14
  from docling_core.types.doc.base import PydanticSerCtxKey, round_pydantic_float
16
15
  from docling_core.types.doc.page import SegmentedPdfPage, TextCell
17
- from docling_core.types.io import (
18
- DocumentStream,
19
- )
16
+ from docling_core.types.io import DocumentStream
20
17
 
21
18
  # DO NOT REMOVE; explicitly exposed from this location
22
19
  from PIL.Image import Image
@@ -71,6 +68,7 @@ class InputFormat(str, Enum):
71
68
  METS_GBS = "mets_gbs"
72
69
  JSON_DOCLING = "json_docling"
73
70
  AUDIO = "audio"
71
+ VTT = "vtt"
74
72
 
75
73
 
76
74
  class OutputFormat(str, Enum):
@@ -82,7 +80,7 @@ class OutputFormat(str, Enum):
82
80
  DOCTAGS = "doctags"
83
81
 
84
82
 
85
- FormatToExtensions: Dict[InputFormat, List[str]] = {
83
+ FormatToExtensions: dict[InputFormat, list[str]] = {
86
84
  InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
87
85
  InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
88
86
  InputFormat.PDF: ["pdf"],
@@ -97,9 +95,10 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
97
95
  InputFormat.METS_GBS: ["tar.gz"],
98
96
  InputFormat.JSON_DOCLING: ["json"],
99
97
  InputFormat.AUDIO: ["wav", "mp3"],
98
+ InputFormat.VTT: ["vtt"],
100
99
  }
101
100
 
102
- FormatToMimeType: Dict[InputFormat, List[str]] = {
101
+ FormatToMimeType: dict[InputFormat, list[str]] = {
103
102
  InputFormat.DOCX: [
104
103
  "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
105
104
  "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
@@ -130,6 +129,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
130
129
  InputFormat.METS_GBS: ["application/mets+xml"],
131
130
  InputFormat.JSON_DOCLING: ["application/json"],
132
131
  InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"],
132
+ InputFormat.VTT: ["text/vtt"],
133
133
  }
134
134
 
135
135
  MimeTypeToFormat: dict[str, list[InputFormat]] = {
@@ -162,8 +162,8 @@ class Cluster(BaseModel):
162
162
  label: DocItemLabel
163
163
  bbox: BoundingBox
164
164
  confidence: float = 1.0
165
- cells: List[TextCell] = []
166
- children: List["Cluster"] = [] # Add child cluster support
165
+ cells: list[TextCell] = []
166
+ children: list["Cluster"] = [] # Add child cluster support
167
167
 
168
168
  @field_serializer("confidence")
169
169
  def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
@@ -179,7 +179,7 @@ class BasePageElement(BaseModel):
179
179
 
180
180
 
181
181
  class LayoutPrediction(BaseModel):
182
- clusters: List[Cluster] = []
182
+ clusters: list[Cluster] = []
183
183
 
184
184
 
185
185
  class VlmPredictionToken(BaseModel):
@@ -201,14 +201,14 @@ class ContainerElement(
201
201
 
202
202
 
203
203
  class Table(BasePageElement):
204
- otsl_seq: List[str]
204
+ otsl_seq: list[str]
205
205
  num_rows: int = 0
206
206
  num_cols: int = 0
207
- table_cells: List[TableCell]
207
+ table_cells: list[TableCell]
208
208
 
209
209
 
210
210
  class TableStructurePrediction(BaseModel):
211
- table_map: Dict[int, Table] = {}
211
+ table_map: dict[int, Table] = {}
212
212
 
213
213
 
214
214
  class TextElement(BasePageElement):
@@ -216,7 +216,7 @@ class TextElement(BasePageElement):
216
216
 
217
217
 
218
218
  class FigureElement(BasePageElement):
219
- annotations: List[PictureDataType] = []
219
+ annotations: list[PictureDataType] = []
220
220
  provenance: Optional[str] = None
221
221
  predicted_class: Optional[str] = None
222
222
  confidence: Optional[float] = None
@@ -234,12 +234,12 @@ class FigureElement(BasePageElement):
234
234
 
235
235
  class FigureClassificationPrediction(BaseModel):
236
236
  figure_count: int = 0
237
- figure_map: Dict[int, FigureElement] = {}
237
+ figure_map: dict[int, FigureElement] = {}
238
238
 
239
239
 
240
240
  class EquationPrediction(BaseModel):
241
241
  equation_count: int = 0
242
- equation_map: Dict[int, TextElement] = {}
242
+ equation_map: dict[int, TextElement] = {}
243
243
 
244
244
 
245
245
  class PagePredictions(BaseModel):
@@ -254,9 +254,9 @@ PageElement = Union[TextElement, Table, FigureElement, ContainerElement]
254
254
 
255
255
 
256
256
  class AssembledUnit(BaseModel):
257
- elements: List[PageElement] = []
258
- body: List[PageElement] = []
259
- headers: List[PageElement] = []
257
+ elements: list[PageElement] = []
258
+ body: list[PageElement] = []
259
+ headers: list[PageElement] = []
260
260
 
261
261
 
262
262
  class ItemAndImageEnrichmentElement(BaseModel):
@@ -280,12 +280,12 @@ class Page(BaseModel):
280
280
  None # Internal PDF backend. By default it is cleared during assembling.
281
281
  )
282
282
  _default_image_scale: float = 1.0 # Default image scale for external usage.
283
- _image_cache: Dict[
283
+ _image_cache: dict[
284
284
  float, Image
285
285
  ] = {} # Cache of images in different scales. By default it is cleared during assembling.
286
286
 
287
287
  @property
288
- def cells(self) -> List[TextCell]:
288
+ def cells(self) -> list[TextCell]:
289
289
  """Return text cells as a read-only view of parsed_page.textline_cells."""
290
290
  if self.parsed_page is not None:
291
291
  return self.parsed_page.textline_cells
@@ -354,7 +354,7 @@ class OpenAiApiResponse(BaseModel):
354
354
 
355
355
  id: str
356
356
  model: Optional[str] = None # returned by openai
357
- choices: List[OpenAiResponseChoice]
357
+ choices: list[OpenAiResponseChoice]
358
358
  created: int
359
359
  usage: OpenAiResponseUsage
360
360
 
@@ -430,7 +430,7 @@ class PageConfidenceScores(BaseModel):
430
430
 
431
431
 
432
432
  class ConfidenceReport(PageConfidenceScores):
433
- pages: Dict[int, PageConfidenceScores] = Field(
433
+ pages: dict[int, PageConfidenceScores] = Field(
434
434
  default_factory=lambda: defaultdict(PageConfidenceScores)
435
435
  )
436
436
 
@@ -394,6 +394,8 @@ class _DocumentConversionInput(BaseModel):
394
394
  mime = FormatToMimeType[InputFormat.PPTX][0]
395
395
  elif ext in FormatToExtensions[InputFormat.XLSX]:
396
396
  mime = FormatToMimeType[InputFormat.XLSX][0]
397
+ elif ext in FormatToExtensions[InputFormat.VTT]:
398
+ mime = FormatToMimeType[InputFormat.VTT][0]
397
399
 
398
400
  return mime
399
401
 
@@ -1,11 +1,13 @@
1
1
  from enum import Enum
2
- from typing import Any, Dict, List, Literal, Optional
2
+ from typing import Any, Dict, List, Literal, Optional, Union
3
3
 
4
4
  from docling_core.types.doc.page import SegmentedPage
5
- from pydantic import AnyUrl, BaseModel
5
+ from pydantic import AnyUrl, BaseModel, ConfigDict
6
+ from transformers import StoppingCriteria
6
7
  from typing_extensions import deprecated
7
8
 
8
9
  from docling.datamodel.accelerator_options import AcceleratorDevice
10
+ from docling.models.utils.generation_utils import GenerationStopper
9
11
 
10
12
 
11
13
  class BaseVlmOptions(BaseModel):
@@ -50,9 +52,12 @@ class TransformersPromptStyle(str, Enum):
50
52
 
51
53
 
52
54
  class InlineVlmOptions(BaseVlmOptions):
55
+ model_config = ConfigDict(arbitrary_types_allowed=True)
56
+
53
57
  kind: Literal["inline_model_options"] = "inline_model_options"
54
58
 
55
59
  repo_id: str
60
+ revision: str = "main"
56
61
  trust_remote_code: bool = False
57
62
  load_in_8bit: bool = True
58
63
  llm_int8_threshold: float = 6.0
@@ -71,6 +76,7 @@ class InlineVlmOptions(BaseVlmOptions):
71
76
  ]
72
77
 
73
78
  stop_strings: List[str] = []
79
+ custom_stopping_criteria: List[Union[StoppingCriteria, GenerationStopper]] = []
74
80
  extra_generation_config: Dict[str, Any] = {}
75
81
  extra_processor_kwargs: Dict[str, Any] = {}
76
82
 
@@ -88,6 +94,8 @@ class HuggingFaceVlmOptions(InlineVlmOptions):
88
94
 
89
95
 
90
96
  class ApiVlmOptions(BaseVlmOptions):
97
+ model_config = ConfigDict(arbitrary_types_allowed=True)
98
+
91
99
  kind: Literal["api_model_options"] = "api_model_options"
92
100
 
93
101
  url: AnyUrl = AnyUrl(
@@ -98,3 +106,6 @@ class ApiVlmOptions(BaseVlmOptions):
98
106
  timeout: float = 60
99
107
  concurrency: int = 1
100
108
  response_format: ResponseFormat
109
+
110
+ stop_strings: List[str] = []
111
+ custom_stopping_criteria: List[Union[GenerationStopper]] = []
@@ -29,12 +29,20 @@ GRANITEDOCLING_TRANSFORMERS = InlineVlmOptions(
29
29
  AcceleratorDevice.CPU,
30
30
  AcceleratorDevice.CUDA,
31
31
  ],
32
+ extra_generation_config=dict(skip_special_tokens=False),
32
33
  scale=2.0,
33
34
  temperature=0.0,
34
35
  max_new_tokens=8192,
35
36
  stop_strings=["</doctag>", "<|end_of_text|>"],
36
37
  )
37
38
 
39
+ GRANITEDOCLING_VLLM = GRANITEDOCLING_TRANSFORMERS.model_copy()
40
+ GRANITEDOCLING_VLLM.inference_framework = InferenceFramework.VLLM
41
+ GRANITEDOCLING_VLLM.revision = (
42
+ "untied" # change back to "main" with next vllm release after 0.10.2
43
+ )
44
+
45
+
38
46
  GRANITEDOCLING_MLX = InlineVlmOptions(
39
47
  repo_id="ibm-granite/granite-docling-258M-mlx",
40
48
  prompt="Convert this page to docling.",
@@ -302,3 +310,4 @@ class VlmModelType(str, Enum):
302
310
  GRANITE_VISION_OLLAMA = "granite_vision_ollama"
303
311
  GOT_OCR_2 = "got_ocr_2"
304
312
  GRANITEDOCLING = "granite_docling"
313
+ GRANITEDOCLING_VLLM = "granite_docling_vllm"
@@ -25,6 +25,7 @@ from docling.backend.msexcel_backend import MsExcelDocumentBackend
25
25
  from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
26
26
  from docling.backend.msword_backend import MsWordDocumentBackend
27
27
  from docling.backend.noop_backend import NoOpBackend
28
+ from docling.backend.webvtt_backend import WebVTTDocumentBackend
28
29
  from docling.backend.xml.jats_backend import JatsDocumentBackend
29
30
  from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
30
31
  from docling.datamodel.base_models import (
@@ -170,6 +171,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
170
171
  pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
171
172
  ),
172
173
  InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=NoOpBackend),
174
+ InputFormat.VTT: FormatOption(
175
+ pipeline_cls=SimplePipeline, backend=WebVTTDocumentBackend
176
+ ),
173
177
  }
174
178
  if (options := format_to_default_options.get(format)) is not None:
175
179
  return options
@@ -1,12 +1,18 @@
1
1
  from collections.abc import Iterable
2
2
  from concurrent.futures import ThreadPoolExecutor
3
3
 
4
+ from transformers import StoppingCriteria
5
+
4
6
  from docling.datamodel.base_models import Page, VlmPrediction
5
7
  from docling.datamodel.document import ConversionResult
6
8
  from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions
7
9
  from docling.exceptions import OperationNotAllowed
8
10
  from docling.models.base_model import BasePageModel
9
- from docling.utils.api_image_request import api_image_request
11
+ from docling.models.utils.generation_utils import GenerationStopper
12
+ from docling.utils.api_image_request import (
13
+ api_image_request,
14
+ api_image_request_streaming,
15
+ )
10
16
  from docling.utils.profiling import TimeRecorder
11
17
 
12
18
 
@@ -41,19 +47,43 @@ class ApiVlmModel(BasePageModel):
41
47
  assert page._backend is not None
42
48
  if not page._backend.is_valid():
43
49
  return page
44
- else:
45
- with TimeRecorder(conv_res, "vlm"):
46
- assert page.size is not None
47
50
 
48
- hi_res_image = page.get_image(
49
- scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
50
- )
51
- assert hi_res_image is not None
52
- if hi_res_image:
53
- if hi_res_image.mode != "RGB":
54
- hi_res_image = hi_res_image.convert("RGB")
51
+ with TimeRecorder(conv_res, "vlm"):
52
+ assert page.size is not None
53
+
54
+ hi_res_image = page.get_image(
55
+ scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
56
+ )
57
+ assert hi_res_image is not None
58
+ if hi_res_image and hi_res_image.mode != "RGB":
59
+ hi_res_image = hi_res_image.convert("RGB")
55
60
 
56
- prompt = self.vlm_options.build_prompt(page.parsed_page)
61
+ prompt = self.vlm_options.build_prompt(page.parsed_page)
62
+
63
+ if self.vlm_options.custom_stopping_criteria:
64
+ # Instantiate any GenerationStopper classes before passing to streaming
65
+ instantiated_stoppers = []
66
+ for criteria in self.vlm_options.custom_stopping_criteria:
67
+ if isinstance(criteria, GenerationStopper):
68
+ instantiated_stoppers.append(criteria)
69
+ elif isinstance(criteria, type) and issubclass(
70
+ criteria, GenerationStopper
71
+ ):
72
+ instantiated_stoppers.append(criteria())
73
+ # Skip non-GenerationStopper criteria (should have been caught in validation)
74
+
75
+ # Streaming path with early abort support
76
+ page_tags = api_image_request_streaming(
77
+ image=hi_res_image,
78
+ prompt=prompt,
79
+ url=self.vlm_options.url,
80
+ timeout=self.timeout,
81
+ headers=self.vlm_options.headers,
82
+ generation_stoppers=instantiated_stoppers,
83
+ **self.params,
84
+ )
85
+ else:
86
+ # Non-streaming fallback (existing behavior)
57
87
  page_tags = api_image_request(
58
88
  image=hi_res_image,
59
89
  prompt=prompt,
@@ -63,10 +93,9 @@ class ApiVlmModel(BasePageModel):
63
93
  **self.params,
64
94
  )
65
95
 
66
- page_tags = self.vlm_options.decode_response(page_tags)
67
- page.predictions.vlm_response = VlmPrediction(text=page_tags)
68
-
69
- return page
96
+ page_tags = self.vlm_options.decode_response(page_tags)
97
+ page.predictions.vlm_response = VlmPrediction(text=page_tags)
98
+ return page
70
99
 
71
100
  with ThreadPoolExecutor(max_workers=self.concurrency) as executor:
72
101
  yield from executor.map(_vlm_request, page_batch)
@@ -88,7 +88,8 @@ class BaseVlmPageModel(BasePageModel, BaseVlmModel):
88
88
 
89
89
  if self.vlm_options.transformers_prompt_style == TransformersPromptStyle.RAW:
90
90
  return user_prompt
91
-
91
+ elif self.vlm_options.transformers_prompt_style == TransformersPromptStyle.NONE:
92
+ return ""
92
93
  elif self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
93
94
  _log.debug("Using specialized prompt for Phi-4")
94
95
  # Note: This might need adjustment for VLLM vs transformers
@@ -103,7 +103,7 @@ class ReadingOrderModel:
103
103
  else:
104
104
  doc.add_text(parent=doc_item, label=c_label, text=c_text, prov=c_prov)
105
105
 
106
- def _readingorder_elements_to_docling_doc( # noqa: C901
106
+ def _readingorder_elements_to_docling_doc(
107
107
  self,
108
108
  conv_res: ConversionResult,
109
109
  ro_elements: List[ReadingOrderPageElement],
@@ -121,7 +121,7 @@ class TableStructureModel(BasePageModel):
121
121
 
122
122
  for table_element in tbl_list:
123
123
  x0, y0, x1, y1 = table_element.cluster.bbox.as_tuple()
124
- y0 *= scale_x
124
+ y0 *= scale_y
125
125
  y1 *= scale_y
126
126
  x0 *= scale_x
127
127
  x1 *= scale_x
@@ -132,7 +132,7 @@ class TableStructureModel(BasePageModel):
132
132
  x0, y0, x1, y1 = cell.rect.to_bounding_box().as_tuple()
133
133
  x0 *= scale_x
134
134
  x1 *= scale_x
135
- y0 *= scale_x
135
+ y0 *= scale_y
136
136
  y1 *= scale_y
137
137
 
138
138
  draw.rectangle([(x0, y0), (x1, y1)], outline="green")
@@ -142,7 +142,7 @@ class TableStructureModel(BasePageModel):
142
142
  x0, y0, x1, y1 = tc.bbox.as_tuple()
143
143
  x0 *= scale_x
144
144
  x1 *= scale_x
145
- y0 *= scale_x
145
+ y0 *= scale_y
146
146
  y1 *= scale_y
147
147
 
148
148
  if tc.column_header: