docling 2.53.0__py3-none-any.whl → 2.55.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/asciidoc_backend.py +1 -1
- docling/backend/html_backend.py +254 -136
- docling/backend/md_backend.py +4 -1
- docling/backend/msword_backend.py +177 -76
- docling/backend/webvtt_backend.py +572 -0
- docling/backend/xml/jats_backend.py +111 -7
- docling/backend/xml/uspto_backend.py +1 -1
- docling/cli/main.py +5 -0
- docling/datamodel/base_models.py +23 -23
- docling/datamodel/document.py +2 -0
- docling/datamodel/pipeline_options_vlm_model.py +13 -2
- docling/datamodel/vlm_model_specs.py +9 -0
- docling/document_converter.py +4 -0
- docling/models/api_vlm_model.py +45 -16
- docling/models/base_model.py +2 -1
- docling/models/readingorder_model.py +1 -1
- docling/models/table_structure_model.py +3 -3
- docling/models/utils/generation_utils.py +157 -0
- docling/models/utils/hf_model_download.py +6 -1
- docling/models/vlm_models_inline/hf_transformers_model.py +75 -14
- docling/models/vlm_models_inline/mlx_model.py +58 -1
- docling/models/vlm_models_inline/vllm_model.py +189 -124
- docling/utils/api_image_request.py +107 -1
- {docling-2.53.0.dist-info → docling-2.55.0.dist-info}/METADATA +5 -5
- {docling-2.53.0.dist-info → docling-2.55.0.dist-info}/RECORD +29 -27
- {docling-2.53.0.dist-info → docling-2.55.0.dist-info}/WHEEL +0 -0
- {docling-2.53.0.dist-info → docling-2.55.0.dist-info}/entry_points.txt +0 -0
- {docling-2.53.0.dist-info → docling-2.55.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.53.0.dist-info → docling-2.55.0.dist-info}/top_level.txt +0 -0
|
@@ -2,9 +2,9 @@ import logging
|
|
|
2
2
|
import traceback
|
|
3
3
|
from io import BytesIO
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from typing import Final, Optional, Union
|
|
5
|
+
from typing import Final, Optional, Union, cast
|
|
6
6
|
|
|
7
|
-
from bs4 import BeautifulSoup, Tag
|
|
7
|
+
from bs4 import BeautifulSoup, NavigableString, Tag
|
|
8
8
|
from docling_core.types.doc import (
|
|
9
9
|
DocItemLabel,
|
|
10
10
|
DoclingDocument,
|
|
@@ -12,6 +12,8 @@ from docling_core.types.doc import (
|
|
|
12
12
|
GroupItem,
|
|
13
13
|
GroupLabel,
|
|
14
14
|
NodeItem,
|
|
15
|
+
TableCell,
|
|
16
|
+
TableData,
|
|
15
17
|
TextItem,
|
|
16
18
|
)
|
|
17
19
|
from lxml import etree
|
|
@@ -350,7 +352,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|
|
350
352
|
|
|
351
353
|
return
|
|
352
354
|
|
|
353
|
-
def _parse_element_citation(self, node: etree._Element) -> str:
|
|
355
|
+
def _parse_element_citation(self, node: etree._Element) -> str:
|
|
354
356
|
citation: Citation = {
|
|
355
357
|
"author_names": "",
|
|
356
358
|
"title": "",
|
|
@@ -535,6 +537,110 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|
|
535
537
|
|
|
536
538
|
return
|
|
537
539
|
|
|
540
|
+
@staticmethod
def parse_table_data(element: Tag) -> Optional[TableData]:
    """Convert an HTML-style ``<table>`` element into a ``TableData`` object.

    Rows are read from every ``<tr>`` found under ``element`` and cells from
    the ``<td>``/``<th>`` tags found under each row. A row is treated as a
    "row header" when it has no ``<td>`` cell and none of its ``<th>`` cells
    has a row span of 1; such a row is not counted in ``num_rows``, and its
    cells are emitted with their row span reduced by one.

    Side effect: ``<inline-formula>`` tags inside a cell whose text splits
    into exactly three parts on ``$$`` are replaced in place by the plain
    ``$$...$$`` string, so ``element`` is mutated.

    Args:
        element: The table tag to parse.

    Returns:
        The parsed table, or ``None`` when ``element`` contains a nested
        ``<table>`` (nested tables are not supported here).
    """
    # TODO, see how to implement proper support for rich tables from HTML backend
    nested_tables = element.find("table")
    if nested_tables is not None:
        _log.debug("Skipping nested table.")
        return None

    # Find the number of rows and columns (taking into account spans)
    num_rows = 0
    num_cols = 0
    for row in element("tr"):
        if not isinstance(row, Tag):
            continue
        col_count = 0
        is_row_header = True
        for cell in row(["td", "th"]):
            # Guard on the cell itself; the previous check re-tested ``row``,
            # which is always a Tag at this point, and then cast the cell.
            if not isinstance(cell, Tag):
                continue
            col_span, row_span = HTMLDocumentBackend._get_cell_spans(cell)
            col_count += col_span
            if cell.name == "td" or row_span == 1:
                is_row_header = False
        num_cols = max(num_cols, col_count)
        if not is_row_header:
            num_rows += 1

    _log.debug(f"The table has {num_rows} rows and {num_cols} cols.")

    # Occupancy grid: a non-None entry marks a slot already claimed by a
    # cell (possibly through a row/column span of an earlier cell).
    grid: list = [[None for _ in range(num_cols)] for _ in range(num_rows)]

    data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])

    # Iterate over the rows in the table
    start_row_span = 0  # header-only rows seen since the last counted row
    row_idx = -1  # index of the last counted (non row-header) row
    for row in element("tr"):
        if not isinstance(row, Tag):
            continue

        # For each row, find all the column cells (both <td> and <th>)
        cells = row(["td", "th"])

        # Check if cell is in a column header or row header
        col_header = True
        row_header = True
        for html_cell in cells:
            if isinstance(html_cell, Tag):
                _, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
                if html_cell.name == "td":
                    col_header = False
                    row_header = False
                elif row_span == 1:
                    row_header = False
        if not row_header:
            row_idx += 1
            start_row_span = 0
        else:
            start_row_span += 1

        # Grid row in which this row's cells start.
        row_offset = row_idx + start_row_span

        # Extract the text content of each cell
        col_idx = 0
        for html_cell in cells:
            if not isinstance(html_cell, Tag):
                continue

            # extract inline formulas
            for formula in html_cell("inline-formula"):
                math_parts = formula.text.split("$$")
                if len(math_parts) == 3:
                    math_formula = f"$${math_parts[1]}$$"
                    formula.replace_with(NavigableString(math_formula))

            # TODO: extract content correctly from table-cells with lists
            text = HTMLDocumentBackend.get_text(html_cell).strip()
            col_span, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
            if row_header:
                row_span -= 1

            # Skip over slots already occupied by spans of earlier cells.
            # NOTE(review): this lookup assumes row_offset < num_rows; a
            # header-only row after the last counted row would index past
            # the grid. Confirm whether such input can occur.
            while col_idx < num_cols and grid[row_offset][col_idx] is not None:
                col_idx += 1

            # Mark every in-bounds slot covered by this cell's spans.
            for r in range(row_offset, row_offset + row_span):
                for c in range(col_span):
                    if r < num_rows and col_idx + c < num_cols:
                        grid[r][col_idx + c] = text

            table_cell = TableCell(
                text=text,
                row_span=row_span,
                col_span=col_span,
                start_row_offset_idx=row_offset,
                end_row_offset_idx=row_offset + row_span,
                start_col_offset_idx=col_idx,
                end_col_offset_idx=col_idx + col_span,
                column_header=col_header,
                row_header=((not col_header) and html_cell.name == "th"),
            )
            data.table_cells.append(table_cell)

    return data
|
|
643
|
+
|
|
538
644
|
def _add_table(
|
|
539
645
|
self, doc: DoclingDocument, parent: NodeItem, table_xml_component: Table
|
|
540
646
|
) -> None:
|
|
@@ -543,8 +649,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|
|
543
649
|
if not isinstance(table_tag, Tag):
|
|
544
650
|
return
|
|
545
651
|
|
|
546
|
-
data =
|
|
547
|
-
|
|
652
|
+
data = JatsDocumentBackend.parse_table_data(table_tag)
|
|
548
653
|
# TODO: format label vs caption once styling is supported
|
|
549
654
|
label = table_xml_component["label"]
|
|
550
655
|
caption = table_xml_component["caption"]
|
|
@@ -554,7 +659,6 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|
|
554
659
|
if table_text
|
|
555
660
|
else None
|
|
556
661
|
)
|
|
557
|
-
|
|
558
662
|
if data is not None:
|
|
559
663
|
doc.add_table(data=data, parent=parent, caption=table_caption)
|
|
560
664
|
|
|
@@ -609,7 +713,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
|
|
609
713
|
)
|
|
610
714
|
return
|
|
611
715
|
|
|
612
|
-
def _walk_linear(
|
|
716
|
+
def _walk_linear(
|
|
613
717
|
self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
|
|
614
718
|
) -> str:
|
|
615
719
|
skip_tags = ["term"]
|
docling/cli/main.py
CHANGED
|
@@ -66,6 +66,7 @@ from docling.datamodel.vlm_model_specs import (
|
|
|
66
66
|
GRANITE_VISION_TRANSFORMERS,
|
|
67
67
|
GRANITEDOCLING_MLX,
|
|
68
68
|
GRANITEDOCLING_TRANSFORMERS,
|
|
69
|
+
GRANITEDOCLING_VLLM,
|
|
69
70
|
SMOLDOCLING_MLX,
|
|
70
71
|
SMOLDOCLING_TRANSFORMERS,
|
|
71
72
|
SMOLDOCLING_VLLM,
|
|
@@ -686,6 +687,7 @@ def convert( # noqa: C901
|
|
|
686
687
|
"To run SmolDocling faster, please install mlx-vlm:\n"
|
|
687
688
|
"pip install mlx-vlm"
|
|
688
689
|
)
|
|
690
|
+
|
|
689
691
|
elif vlm_model == VlmModelType.GRANITEDOCLING:
|
|
690
692
|
pipeline_options.vlm_options = GRANITEDOCLING_TRANSFORMERS
|
|
691
693
|
if sys.platform == "darwin":
|
|
@@ -701,6 +703,9 @@ def convert( # noqa: C901
|
|
|
701
703
|
elif vlm_model == VlmModelType.SMOLDOCLING_VLLM:
|
|
702
704
|
pipeline_options.vlm_options = SMOLDOCLING_VLLM
|
|
703
705
|
|
|
706
|
+
elif vlm_model == VlmModelType.GRANITEDOCLING_VLLM:
|
|
707
|
+
pipeline_options.vlm_options = GRANITEDOCLING_VLLM
|
|
708
|
+
|
|
704
709
|
pdf_format_option = PdfFormatOption(
|
|
705
710
|
pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
|
|
706
711
|
)
|
docling/datamodel/base_models.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
|
-
import math
|
|
2
1
|
from collections import defaultdict
|
|
3
2
|
from enum import Enum
|
|
4
|
-
from typing import TYPE_CHECKING,
|
|
3
|
+
from typing import TYPE_CHECKING, Optional, Type, Union
|
|
5
4
|
|
|
6
5
|
import numpy as np
|
|
7
6
|
from docling_core.types.doc import (
|
|
@@ -14,9 +13,7 @@ from docling_core.types.doc import (
|
|
|
14
13
|
)
|
|
15
14
|
from docling_core.types.doc.base import PydanticSerCtxKey, round_pydantic_float
|
|
16
15
|
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
|
17
|
-
from docling_core.types.io import
|
|
18
|
-
DocumentStream,
|
|
19
|
-
)
|
|
16
|
+
from docling_core.types.io import DocumentStream
|
|
20
17
|
|
|
21
18
|
# DO NOT REMOVE; explicitly exposed from this location
|
|
22
19
|
from PIL.Image import Image
|
|
@@ -71,6 +68,7 @@ class InputFormat(str, Enum):
|
|
|
71
68
|
METS_GBS = "mets_gbs"
|
|
72
69
|
JSON_DOCLING = "json_docling"
|
|
73
70
|
AUDIO = "audio"
|
|
71
|
+
VTT = "vtt"
|
|
74
72
|
|
|
75
73
|
|
|
76
74
|
class OutputFormat(str, Enum):
|
|
@@ -82,7 +80,7 @@ class OutputFormat(str, Enum):
|
|
|
82
80
|
DOCTAGS = "doctags"
|
|
83
81
|
|
|
84
82
|
|
|
85
|
-
FormatToExtensions:
|
|
83
|
+
FormatToExtensions: dict[InputFormat, list[str]] = {
|
|
86
84
|
InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
|
|
87
85
|
InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
|
|
88
86
|
InputFormat.PDF: ["pdf"],
|
|
@@ -97,9 +95,10 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
|
|
97
95
|
InputFormat.METS_GBS: ["tar.gz"],
|
|
98
96
|
InputFormat.JSON_DOCLING: ["json"],
|
|
99
97
|
InputFormat.AUDIO: ["wav", "mp3"],
|
|
98
|
+
InputFormat.VTT: ["vtt"],
|
|
100
99
|
}
|
|
101
100
|
|
|
102
|
-
FormatToMimeType:
|
|
101
|
+
FormatToMimeType: dict[InputFormat, list[str]] = {
|
|
103
102
|
InputFormat.DOCX: [
|
|
104
103
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
105
104
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.template",
|
|
@@ -130,6 +129,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
|
|
|
130
129
|
InputFormat.METS_GBS: ["application/mets+xml"],
|
|
131
130
|
InputFormat.JSON_DOCLING: ["application/json"],
|
|
132
131
|
InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"],
|
|
132
|
+
InputFormat.VTT: ["text/vtt"],
|
|
133
133
|
}
|
|
134
134
|
|
|
135
135
|
MimeTypeToFormat: dict[str, list[InputFormat]] = {
|
|
@@ -162,8 +162,8 @@ class Cluster(BaseModel):
|
|
|
162
162
|
label: DocItemLabel
|
|
163
163
|
bbox: BoundingBox
|
|
164
164
|
confidence: float = 1.0
|
|
165
|
-
cells:
|
|
166
|
-
children:
|
|
165
|
+
cells: list[TextCell] = []
|
|
166
|
+
children: list["Cluster"] = [] # Add child cluster support
|
|
167
167
|
|
|
168
168
|
@field_serializer("confidence")
|
|
169
169
|
def _serialize(self, value: float, info: FieldSerializationInfo) -> float:
|
|
@@ -179,7 +179,7 @@ class BasePageElement(BaseModel):
|
|
|
179
179
|
|
|
180
180
|
|
|
181
181
|
class LayoutPrediction(BaseModel):
|
|
182
|
-
clusters:
|
|
182
|
+
clusters: list[Cluster] = []
|
|
183
183
|
|
|
184
184
|
|
|
185
185
|
class VlmPredictionToken(BaseModel):
|
|
@@ -201,14 +201,14 @@ class ContainerElement(
|
|
|
201
201
|
|
|
202
202
|
|
|
203
203
|
class Table(BasePageElement):
|
|
204
|
-
otsl_seq:
|
|
204
|
+
otsl_seq: list[str]
|
|
205
205
|
num_rows: int = 0
|
|
206
206
|
num_cols: int = 0
|
|
207
|
-
table_cells:
|
|
207
|
+
table_cells: list[TableCell]
|
|
208
208
|
|
|
209
209
|
|
|
210
210
|
class TableStructurePrediction(BaseModel):
|
|
211
|
-
table_map:
|
|
211
|
+
table_map: dict[int, Table] = {}
|
|
212
212
|
|
|
213
213
|
|
|
214
214
|
class TextElement(BasePageElement):
|
|
@@ -216,7 +216,7 @@ class TextElement(BasePageElement):
|
|
|
216
216
|
|
|
217
217
|
|
|
218
218
|
class FigureElement(BasePageElement):
|
|
219
|
-
annotations:
|
|
219
|
+
annotations: list[PictureDataType] = []
|
|
220
220
|
provenance: Optional[str] = None
|
|
221
221
|
predicted_class: Optional[str] = None
|
|
222
222
|
confidence: Optional[float] = None
|
|
@@ -234,12 +234,12 @@ class FigureElement(BasePageElement):
|
|
|
234
234
|
|
|
235
235
|
class FigureClassificationPrediction(BaseModel):
|
|
236
236
|
figure_count: int = 0
|
|
237
|
-
figure_map:
|
|
237
|
+
figure_map: dict[int, FigureElement] = {}
|
|
238
238
|
|
|
239
239
|
|
|
240
240
|
class EquationPrediction(BaseModel):
|
|
241
241
|
equation_count: int = 0
|
|
242
|
-
equation_map:
|
|
242
|
+
equation_map: dict[int, TextElement] = {}
|
|
243
243
|
|
|
244
244
|
|
|
245
245
|
class PagePredictions(BaseModel):
|
|
@@ -254,9 +254,9 @@ PageElement = Union[TextElement, Table, FigureElement, ContainerElement]
|
|
|
254
254
|
|
|
255
255
|
|
|
256
256
|
class AssembledUnit(BaseModel):
|
|
257
|
-
elements:
|
|
258
|
-
body:
|
|
259
|
-
headers:
|
|
257
|
+
elements: list[PageElement] = []
|
|
258
|
+
body: list[PageElement] = []
|
|
259
|
+
headers: list[PageElement] = []
|
|
260
260
|
|
|
261
261
|
|
|
262
262
|
class ItemAndImageEnrichmentElement(BaseModel):
|
|
@@ -280,12 +280,12 @@ class Page(BaseModel):
|
|
|
280
280
|
None # Internal PDF backend. By default it is cleared during assembling.
|
|
281
281
|
)
|
|
282
282
|
_default_image_scale: float = 1.0 # Default image scale for external usage.
|
|
283
|
-
_image_cache:
|
|
283
|
+
_image_cache: dict[
|
|
284
284
|
float, Image
|
|
285
285
|
] = {} # Cache of images in different scales. By default it is cleared during assembling.
|
|
286
286
|
|
|
287
287
|
@property
|
|
288
|
-
def cells(self) ->
|
|
288
|
+
def cells(self) -> list[TextCell]:
|
|
289
289
|
"""Return text cells as a read-only view of parsed_page.textline_cells."""
|
|
290
290
|
if self.parsed_page is not None:
|
|
291
291
|
return self.parsed_page.textline_cells
|
|
@@ -354,7 +354,7 @@ class OpenAiApiResponse(BaseModel):
|
|
|
354
354
|
|
|
355
355
|
id: str
|
|
356
356
|
model: Optional[str] = None # returned by openai
|
|
357
|
-
choices:
|
|
357
|
+
choices: list[OpenAiResponseChoice]
|
|
358
358
|
created: int
|
|
359
359
|
usage: OpenAiResponseUsage
|
|
360
360
|
|
|
@@ -430,7 +430,7 @@ class PageConfidenceScores(BaseModel):
|
|
|
430
430
|
|
|
431
431
|
|
|
432
432
|
class ConfidenceReport(PageConfidenceScores):
|
|
433
|
-
pages:
|
|
433
|
+
pages: dict[int, PageConfidenceScores] = Field(
|
|
434
434
|
default_factory=lambda: defaultdict(PageConfidenceScores)
|
|
435
435
|
)
|
|
436
436
|
|
docling/datamodel/document.py
CHANGED
|
@@ -394,6 +394,8 @@ class _DocumentConversionInput(BaseModel):
|
|
|
394
394
|
mime = FormatToMimeType[InputFormat.PPTX][0]
|
|
395
395
|
elif ext in FormatToExtensions[InputFormat.XLSX]:
|
|
396
396
|
mime = FormatToMimeType[InputFormat.XLSX][0]
|
|
397
|
+
elif ext in FormatToExtensions[InputFormat.VTT]:
|
|
398
|
+
mime = FormatToMimeType[InputFormat.VTT][0]
|
|
397
399
|
|
|
398
400
|
return mime
|
|
399
401
|
|
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
from enum import Enum
|
|
2
|
-
from typing import Any, Dict, List, Literal, Optional
|
|
2
|
+
from typing import Any, Dict, List, Literal, Optional, Union
|
|
3
3
|
|
|
4
4
|
from docling_core.types.doc.page import SegmentedPage
|
|
5
|
-
from pydantic import AnyUrl, BaseModel
|
|
5
|
+
from pydantic import AnyUrl, BaseModel, ConfigDict
|
|
6
|
+
from transformers import StoppingCriteria
|
|
6
7
|
from typing_extensions import deprecated
|
|
7
8
|
|
|
8
9
|
from docling.datamodel.accelerator_options import AcceleratorDevice
|
|
10
|
+
from docling.models.utils.generation_utils import GenerationStopper
|
|
9
11
|
|
|
10
12
|
|
|
11
13
|
class BaseVlmOptions(BaseModel):
|
|
@@ -50,9 +52,12 @@ class TransformersPromptStyle(str, Enum):
|
|
|
50
52
|
|
|
51
53
|
|
|
52
54
|
class InlineVlmOptions(BaseVlmOptions):
|
|
55
|
+
model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
56
|
+
|
|
53
57
|
kind: Literal["inline_model_options"] = "inline_model_options"
|
|
54
58
|
|
|
55
59
|
repo_id: str
|
|
60
|
+
revision: str = "main"
|
|
56
61
|
trust_remote_code: bool = False
|
|
57
62
|
load_in_8bit: bool = True
|
|
58
63
|
llm_int8_threshold: float = 6.0
|
|
@@ -71,6 +76,7 @@ class InlineVlmOptions(BaseVlmOptions):
|
|
|
71
76
|
]
|
|
72
77
|
|
|
73
78
|
stop_strings: List[str] = []
|
|
79
|
+
custom_stopping_criteria: List[Union[StoppingCriteria, GenerationStopper]] = []
|
|
74
80
|
extra_generation_config: Dict[str, Any] = {}
|
|
75
81
|
extra_processor_kwargs: Dict[str, Any] = {}
|
|
76
82
|
|
|
@@ -88,6 +94,8 @@ class HuggingFaceVlmOptions(InlineVlmOptions):
|
|
|
88
94
|
|
|
89
95
|
|
|
90
96
|
class ApiVlmOptions(BaseVlmOptions):
|
|
97
|
+
model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
98
|
+
|
|
91
99
|
kind: Literal["api_model_options"] = "api_model_options"
|
|
92
100
|
|
|
93
101
|
url: AnyUrl = AnyUrl(
|
|
@@ -98,3 +106,6 @@ class ApiVlmOptions(BaseVlmOptions):
|
|
|
98
106
|
timeout: float = 60
|
|
99
107
|
concurrency: int = 1
|
|
100
108
|
response_format: ResponseFormat
|
|
109
|
+
|
|
110
|
+
stop_strings: List[str] = []
|
|
111
|
+
custom_stopping_criteria: List[Union[GenerationStopper]] = []
|
|
@@ -29,12 +29,20 @@ GRANITEDOCLING_TRANSFORMERS = InlineVlmOptions(
|
|
|
29
29
|
AcceleratorDevice.CPU,
|
|
30
30
|
AcceleratorDevice.CUDA,
|
|
31
31
|
],
|
|
32
|
+
extra_generation_config=dict(skip_special_tokens=False),
|
|
32
33
|
scale=2.0,
|
|
33
34
|
temperature=0.0,
|
|
34
35
|
max_new_tokens=8192,
|
|
35
36
|
stop_strings=["</doctag>", "<|end_of_text|>"],
|
|
36
37
|
)
|
|
37
38
|
|
|
39
|
+
GRANITEDOCLING_VLLM = GRANITEDOCLING_TRANSFORMERS.model_copy()
|
|
40
|
+
GRANITEDOCLING_VLLM.inference_framework = InferenceFramework.VLLM
|
|
41
|
+
GRANITEDOCLING_VLLM.revision = (
|
|
42
|
+
"untied" # change back to "main" with next vllm relase after 0.10.2
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
|
|
38
46
|
GRANITEDOCLING_MLX = InlineVlmOptions(
|
|
39
47
|
repo_id="ibm-granite/granite-docling-258M-mlx",
|
|
40
48
|
prompt="Convert this page to docling.",
|
|
@@ -302,3 +310,4 @@ class VlmModelType(str, Enum):
|
|
|
302
310
|
GRANITE_VISION_OLLAMA = "granite_vision_ollama"
|
|
303
311
|
GOT_OCR_2 = "got_ocr_2"
|
|
304
312
|
GRANITEDOCLING = "granite_docling"
|
|
313
|
+
GRANITEDOCLING_VLLM = "granite_docling_vllm"
|
docling/document_converter.py
CHANGED
|
@@ -25,6 +25,7 @@ from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
|
|
25
25
|
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
|
26
26
|
from docling.backend.msword_backend import MsWordDocumentBackend
|
|
27
27
|
from docling.backend.noop_backend import NoOpBackend
|
|
28
|
+
from docling.backend.webvtt_backend import WebVTTDocumentBackend
|
|
28
29
|
from docling.backend.xml.jats_backend import JatsDocumentBackend
|
|
29
30
|
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
|
|
30
31
|
from docling.datamodel.base_models import (
|
|
@@ -170,6 +171,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
|
|
|
170
171
|
pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
|
|
171
172
|
),
|
|
172
173
|
InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=NoOpBackend),
|
|
174
|
+
InputFormat.VTT: FormatOption(
|
|
175
|
+
pipeline_cls=SimplePipeline, backend=WebVTTDocumentBackend
|
|
176
|
+
),
|
|
173
177
|
}
|
|
174
178
|
if (options := format_to_default_options.get(format)) is not None:
|
|
175
179
|
return options
|
docling/models/api_vlm_model.py
CHANGED
|
@@ -1,12 +1,18 @@
|
|
|
1
1
|
from collections.abc import Iterable
|
|
2
2
|
from concurrent.futures import ThreadPoolExecutor
|
|
3
3
|
|
|
4
|
+
from transformers import StoppingCriteria
|
|
5
|
+
|
|
4
6
|
from docling.datamodel.base_models import Page, VlmPrediction
|
|
5
7
|
from docling.datamodel.document import ConversionResult
|
|
6
8
|
from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions
|
|
7
9
|
from docling.exceptions import OperationNotAllowed
|
|
8
10
|
from docling.models.base_model import BasePageModel
|
|
9
|
-
from docling.utils.
|
|
11
|
+
from docling.models.utils.generation_utils import GenerationStopper
|
|
12
|
+
from docling.utils.api_image_request import (
|
|
13
|
+
api_image_request,
|
|
14
|
+
api_image_request_streaming,
|
|
15
|
+
)
|
|
10
16
|
from docling.utils.profiling import TimeRecorder
|
|
11
17
|
|
|
12
18
|
|
|
@@ -41,19 +47,43 @@ class ApiVlmModel(BasePageModel):
|
|
|
41
47
|
assert page._backend is not None
|
|
42
48
|
if not page._backend.is_valid():
|
|
43
49
|
return page
|
|
44
|
-
else:
|
|
45
|
-
with TimeRecorder(conv_res, "vlm"):
|
|
46
|
-
assert page.size is not None
|
|
47
50
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
51
|
+
with TimeRecorder(conv_res, "vlm"):
|
|
52
|
+
assert page.size is not None
|
|
53
|
+
|
|
54
|
+
hi_res_image = page.get_image(
|
|
55
|
+
scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
|
|
56
|
+
)
|
|
57
|
+
assert hi_res_image is not None
|
|
58
|
+
if hi_res_image and hi_res_image.mode != "RGB":
|
|
59
|
+
hi_res_image = hi_res_image.convert("RGB")
|
|
55
60
|
|
|
56
|
-
|
|
61
|
+
prompt = self.vlm_options.build_prompt(page.parsed_page)
|
|
62
|
+
|
|
63
|
+
if self.vlm_options.custom_stopping_criteria:
|
|
64
|
+
# Instantiate any GenerationStopper classes before passing to streaming
|
|
65
|
+
instantiated_stoppers = []
|
|
66
|
+
for criteria in self.vlm_options.custom_stopping_criteria:
|
|
67
|
+
if isinstance(criteria, GenerationStopper):
|
|
68
|
+
instantiated_stoppers.append(criteria)
|
|
69
|
+
elif isinstance(criteria, type) and issubclass(
|
|
70
|
+
criteria, GenerationStopper
|
|
71
|
+
):
|
|
72
|
+
instantiated_stoppers.append(criteria())
|
|
73
|
+
# Skip non-GenerationStopper criteria (should have been caught in validation)
|
|
74
|
+
|
|
75
|
+
# Streaming path with early abort support
|
|
76
|
+
page_tags = api_image_request_streaming(
|
|
77
|
+
image=hi_res_image,
|
|
78
|
+
prompt=prompt,
|
|
79
|
+
url=self.vlm_options.url,
|
|
80
|
+
timeout=self.timeout,
|
|
81
|
+
headers=self.vlm_options.headers,
|
|
82
|
+
generation_stoppers=instantiated_stoppers,
|
|
83
|
+
**self.params,
|
|
84
|
+
)
|
|
85
|
+
else:
|
|
86
|
+
# Non-streaming fallback (existing behavior)
|
|
57
87
|
page_tags = api_image_request(
|
|
58
88
|
image=hi_res_image,
|
|
59
89
|
prompt=prompt,
|
|
@@ -63,10 +93,9 @@ class ApiVlmModel(BasePageModel):
|
|
|
63
93
|
**self.params,
|
|
64
94
|
)
|
|
65
95
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
return page
|
|
96
|
+
page_tags = self.vlm_options.decode_response(page_tags)
|
|
97
|
+
page.predictions.vlm_response = VlmPrediction(text=page_tags)
|
|
98
|
+
return page
|
|
70
99
|
|
|
71
100
|
with ThreadPoolExecutor(max_workers=self.concurrency) as executor:
|
|
72
101
|
yield from executor.map(_vlm_request, page_batch)
|
docling/models/base_model.py
CHANGED
|
@@ -88,7 +88,8 @@ class BaseVlmPageModel(BasePageModel, BaseVlmModel):
|
|
|
88
88
|
|
|
89
89
|
if self.vlm_options.transformers_prompt_style == TransformersPromptStyle.RAW:
|
|
90
90
|
return user_prompt
|
|
91
|
-
|
|
91
|
+
elif self.vlm_options.transformers_prompt_style == TransformersPromptStyle.NONE:
|
|
92
|
+
return ""
|
|
92
93
|
elif self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
|
|
93
94
|
_log.debug("Using specialized prompt for Phi-4")
|
|
94
95
|
# Note: This might need adjustment for VLLM vs transformers
|
|
@@ -103,7 +103,7 @@ class ReadingOrderModel:
|
|
|
103
103
|
else:
|
|
104
104
|
doc.add_text(parent=doc_item, label=c_label, text=c_text, prov=c_prov)
|
|
105
105
|
|
|
106
|
-
def _readingorder_elements_to_docling_doc(
|
|
106
|
+
def _readingorder_elements_to_docling_doc(
|
|
107
107
|
self,
|
|
108
108
|
conv_res: ConversionResult,
|
|
109
109
|
ro_elements: List[ReadingOrderPageElement],
|
|
@@ -121,7 +121,7 @@ class TableStructureModel(BasePageModel):
|
|
|
121
121
|
|
|
122
122
|
for table_element in tbl_list:
|
|
123
123
|
x0, y0, x1, y1 = table_element.cluster.bbox.as_tuple()
|
|
124
|
-
y0 *=
|
|
124
|
+
y0 *= scale_y
|
|
125
125
|
y1 *= scale_y
|
|
126
126
|
x0 *= scale_x
|
|
127
127
|
x1 *= scale_x
|
|
@@ -132,7 +132,7 @@ class TableStructureModel(BasePageModel):
|
|
|
132
132
|
x0, y0, x1, y1 = cell.rect.to_bounding_box().as_tuple()
|
|
133
133
|
x0 *= scale_x
|
|
134
134
|
x1 *= scale_x
|
|
135
|
-
y0 *=
|
|
135
|
+
y0 *= scale_y
|
|
136
136
|
y1 *= scale_y
|
|
137
137
|
|
|
138
138
|
draw.rectangle([(x0, y0), (x1, y1)], outline="green")
|
|
@@ -142,7 +142,7 @@ class TableStructureModel(BasePageModel):
|
|
|
142
142
|
x0, y0, x1, y1 = tc.bbox.as_tuple()
|
|
143
143
|
x0 *= scale_x
|
|
144
144
|
x1 *= scale_x
|
|
145
|
-
y0 *=
|
|
145
|
+
y0 *= scale_y
|
|
146
146
|
y1 *= scale_y
|
|
147
147
|
|
|
148
148
|
if tc.column_header:
|