docling 2.24.0__py3-none-any.whl → 2.25.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/html_backend.py +42 -3
- docling/cli/models.py +28 -4
- docling/datamodel/base_models.py +5 -0
- docling/datamodel/pipeline_options.py +62 -1
- docling/models/hf_vlm_model.py +180 -0
- docling/models/picture_description_vlm_model.py +2 -2
- docling/pipeline/vlm_pipeline.py +534 -0
- docling/utils/model_downloader.py +15 -2
- docling/utils/visualization.py +5 -0
- {docling-2.24.0.dist-info → docling-2.25.0.dist-info}/METADATA +2 -1
- {docling-2.24.0.dist-info → docling-2.25.0.dist-info}/RECORD +14 -12
- {docling-2.24.0.dist-info → docling-2.25.0.dist-info}/LICENSE +0 -0
- {docling-2.24.0.dist-info → docling-2.25.0.dist-info}/WHEEL +0 -0
- {docling-2.24.0.dist-info → docling-2.25.0.dist-info}/entry_points.txt +0 -0
docling/backend/html_backend.py
CHANGED
@@ -1,9 +1,10 @@
 import logging
 from io import BytesIO
 from pathlib import Path
-from typing import Optional, Union, cast
+from typing import Final, Optional, Union, cast
 
 from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
+from bs4.element import PreformattedString
 from docling_core.types.doc import (
     DocItem,
     DocItemLabel,
@@ -22,12 +23,29 @@ from docling.datamodel.document import InputDocument
 
 _log = logging.getLogger(__name__)
 
+# tags that generate NodeItem elements
+TAGS_FOR_NODE_ITEMS: Final = [
+    "h1",
+    "h2",
+    "h3",
+    "h4",
+    "h5",
+    "h6",
+    "p",
+    "pre",
+    "ul",
+    "ol",
+    "li",
+    "table",
+    "figure",
+    "img",
+]
+
 
 class HTMLDocumentBackend(DeclarativeDocumentBackend):
     @override
     def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)
-        _log.debug("About to init HTML backend...")
         self.soup: Optional[Tag] = None
         # HTML file:
         self.path_or_stream = path_or_stream
@@ -88,6 +106,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         assert self.soup is not None
         content = self.soup.body or self.soup
         # Replace <br> tags with newline characters
+        # TODO: remove style to avoid losing text from tags like i, b, span, ...
         for br in content("br"):
             br.replace_with(NavigableString("\n"))
         self.walk(content, doc)
@@ -99,6 +118,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
 
     def walk(self, tag: Tag, doc: DoclingDocument) -> None:
         # Iterate over elements in the body of the document
+        text: str = ""
         for element in tag.children:
             if isinstance(element, Tag):
                 try:
@@ -108,6 +128,25 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                         f"Error processing child from tag{tag.name}: {exc_child}"
                     )
                     raise exc_child
+            elif isinstance(element, NavigableString) and not isinstance(
+                element, PreformattedString
+            ):
+                # Floating text outside paragraphs or analyzed tags
+                text += element
+                siblings: list[Tag] = [
+                    item for item in element.next_siblings if isinstance(item, Tag)
+                ]
+                if element.next_sibling is None or any(
+                    [item.name in TAGS_FOR_NODE_ITEMS for item in siblings]
+                ):
+                    text = text.strip()
+                    if text and tag.name in ["div"]:
+                        doc.add_text(
+                            parent=self.parents[self.level],
+                            label=DocItemLabel.PARAGRAPH,
+                            text=text,
+                        )
+                    text = ""
 
         return
 
@@ -158,7 +197,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         text = element.text.strip()
 
         if hlevel == 1:
-            for key
+            for key in self.parents.keys():
                 self.parents[key] = None
 
             self.level = 1
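The practical effect of the new `TAGS_FOR_NODE_ITEMS` handling is that text sitting directly inside a `<div>`, outside `<p>` or any other analyzed tag, is now emitted as a paragraph instead of being silently dropped. A minimal sketch, assuming docling 2.25.0 is installed (the file path is illustrative):

```python
from pathlib import Path

from docling.document_converter import DocumentConverter

html = b"<html><body><div>Loose text before a list<ul><li>item</li></ul></div></body></html>"
path = Path("/tmp/loose_text.html")
path.write_bytes(html)

# With 2.25.0 the floating "Loose text before a list" is kept as a paragraph.
doc = DocumentConverter().convert(str(path)).document
print(doc.export_to_markdown())
```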
docling/cli/models.py
CHANGED
@@ -32,9 +32,19 @@ class _AvailableModels(str, Enum):
     CODE_FORMULA = "code_formula"
     PICTURE_CLASSIFIER = "picture_classifier"
     SMOLVLM = "smolvlm"
+    GRANITE_VISION = "granite_vision"
     EASYOCR = "easyocr"
 
 
+_default_models = [
+    _AvailableModels.LAYOUT,
+    _AvailableModels.TABLEFORMER,
+    _AvailableModels.CODE_FORMULA,
+    _AvailableModels.PICTURE_CLASSIFIER,
+    _AvailableModels.EASYOCR,
+]
+
+
 @app.command("download")
 def download(
     output_dir: Annotated[
@@ -43,18 +53,27 @@ def download(
             ...,
             "-o",
             "--output-dir",
-            help="The directory where
+            help="The directory where to download the models.",
         ),
     ] = (settings.cache_dir / "models"),
     force: Annotated[
-        bool, typer.Option(..., help="If true, the download will be forced")
+        bool, typer.Option(..., help="If true, the download will be forced.")
     ] = False,
     models: Annotated[
         Optional[list[_AvailableModels]],
         typer.Argument(
-            help=f"Models to download (default behavior:
+            help=f"Models to download (default behavior: a predefined set of models will be downloaded).",
         ),
     ] = None,
+    all: Annotated[
+        bool,
+        typer.Option(
+            ...,
+            "--all",
+            help="If true, all available models will be downloaded (mutually exclusive with passing specific models).",
+            show_default=True,
+        ),
+    ] = False,
     quiet: Annotated[
         bool,
         typer.Option(
@@ -65,6 +84,10 @@ def download(
         ),
     ] = False,
 ):
+    if models and all:
+        raise typer.BadParameter(
+            "Cannot simultaneously set 'all' parameter and specify models to download."
+        )
     if not quiet:
         FORMAT = "%(message)s"
         logging.basicConfig(
@@ -73,7 +96,7 @@ def download(
             datefmt="[%X]",
             handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
         )
-    to_download = models or [m for m in _AvailableModels]
+    to_download = models or ([m for m in _AvailableModels] if all else _default_models)
     output_dir = download_models(
         output_dir=output_dir,
         force=force,
@@ -83,6 +106,7 @@ def download(
         with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
         with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
         with_smolvlm=_AvailableModels.SMOLVLM in to_download,
+        with_granite_vision=_AvailableModels.GRANITE_VISION in to_download,
         with_easyocr=_AvailableModels.EASYOCR in to_download,
     )
 
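In short, the `download` command now fetches only a predefined default set, `--all` restores the old download-everything behavior, and the two forms are mutually exclusive. A hedged sketch of the new behavior driven through typer's test runner (assumes docling 2.25.0):

```python
from typer.testing import CliRunner

from docling.cli.models import app

runner = CliRunner()

# Default: downloads the predefined set (layout, tableformer, code_formula,
# picture_classifier, easyocr).
runner.invoke(app, ["download", "-o", "/tmp/docling-models"])

# Everything, including smolvlm and the new granite_vision:
runner.invoke(app, ["download", "--all", "-o", "/tmp/docling-models"])

# Mixing --all with explicit models raises typer.BadParameter:
result = runner.invoke(app, ["download", "--all", "granite_vision"])
print(result.exit_code != 0)  # True
```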
docling/datamodel/base_models.py
CHANGED
@@ -154,6 +154,10 @@ class LayoutPrediction(BaseModel):
     clusters: List[Cluster] = []
 
 
+class VlmPrediction(BaseModel):
+    text: str = ""
+
+
 class ContainerElement(
     BasePageElement
 ):  # Used for Form and Key-Value-Regions, only for typing.
@@ -197,6 +201,7 @@ class PagePredictions(BaseModel):
     tablestructure: Optional[TableStructurePrediction] = None
     figures_classification: Optional[FigureClassificationPrediction] = None
     equations_prediction: Optional[EquationPrediction] = None
+    vlm_response: Optional[VlmPrediction] = None
 
 
 PageElement = Union[TextElement, Table, FigureElement, ContainerElement]

docling/datamodel/pipeline_options.py
CHANGED
@@ -41,6 +41,7 @@ class AcceleratorOptions(BaseSettings):
 
     num_threads: int = 4
     device: Union[str, AcceleratorDevice] = "auto"
+    cuda_use_flash_attention2: bool = False
 
     @field_validator("device")
     def validate_device(cls, value):
@@ -254,6 +255,45 @@ granite_picture_description = PictureDescriptionVlmOptions(
 )
 
 
+class BaseVlmOptions(BaseModel):
+    kind: str
+    prompt: str
+
+
+class ResponseFormat(str, Enum):
+    DOCTAGS = "doctags"
+    MARKDOWN = "markdown"
+
+
+class HuggingFaceVlmOptions(BaseVlmOptions):
+    kind: Literal["hf_model_options"] = "hf_model_options"
+
+    repo_id: str
+    load_in_8bit: bool = True
+    llm_int8_threshold: float = 6.0
+    quantized: bool = False
+
+    response_format: ResponseFormat
+
+    @property
+    def repo_cache_folder(self) -> str:
+        return self.repo_id.replace("/", "--")
+
+
+smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
+    repo_id="ds4sd/SmolDocling-256M-preview",
+    prompt="Convert this page to docling.",
+    response_format=ResponseFormat.DOCTAGS,
+)
+
+granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
+    repo_id="ibm-granite/granite-vision-3.1-2b-preview",
+    # prompt="OCR the full page to markdown.",
+    prompt="OCR this image.",
+    response_format=ResponseFormat.MARKDOWN,
+)
+
+
 # Define an enum for the backend options
 class PdfBackend(str, Enum):
     """Enum of valid PDF backends."""
@@ -285,7 +325,24 @@ class PipelineOptions(BaseModel):
     enable_remote_services: bool = False
 
 
-class PdfPipelineOptions(PipelineOptions):
+class PaginatedPipelineOptions(PipelineOptions):
+    images_scale: float = 1.0
+    generate_page_images: bool = False
+    generate_picture_images: bool = False
+
+
+class VlmPipelineOptions(PaginatedPipelineOptions):
+    artifacts_path: Optional[Union[Path, str]] = None
+
+    generate_page_images: bool = True
+    force_backend_text: bool = (
+        False  # (To be used with vlms, or other generative models)
+    )
+    # If True, text from backend will be used instead of generated text
+    vlm_options: Union[HuggingFaceVlmOptions] = smoldocling_vlm_conversion_options
+
+
+class PdfPipelineOptions(PaginatedPipelineOptions):
     """Options for the PDF pipeline."""
 
     artifacts_path: Optional[Union[Path, str]] = None
@@ -295,6 +352,10 @@ class PdfPipelineOptions(PipelineOptions):
     do_formula_enrichment: bool = False  # True: perform formula OCR, return Latex code
     do_picture_classification: bool = False  # True: classify pictures in documents
     do_picture_description: bool = False  # True: run describe pictures in documents
+    force_backend_text: bool = (
+        False  # (To be used with vlms, or other generative models)
+    )
+    # If True, text from backend will be used instead of generated text
 
     table_structure_options: TableStructureOptions = TableStructureOptions()
     ocr_options: Union[
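A hedged sketch of wiring the new options together (assumes docling 2.25.0): `VlmPipelineOptions` defaults to the SmolDocling doctags preset and can be pointed at the Granite Vision preset instead.

```python
from docling.datamodel.pipeline_options import (
    VlmPipelineOptions,
    granite_vision_vlm_conversion_options,
)

opts = VlmPipelineOptions()  # vlm_options defaults to SmolDocling (doctags)
print(opts.vlm_options.repo_id)  # ds4sd/SmolDocling-256M-preview

# Swap in Granite Vision, which responds in Markdown instead of doctags:
opts.vlm_options = granite_vision_vlm_conversion_options
print(opts.vlm_options.repo_cache_folder)  # ibm-granite--granite-vision-3.1-2b-preview
```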
docling/models/hf_vlm_model.py
ADDED
@@ -0,0 +1,180 @@
+import logging
+import time
+from pathlib import Path
+from typing import Iterable, List, Optional
+
+from docling.datamodel.base_models import Page, VlmPrediction
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import (
+    AcceleratorDevice,
+    AcceleratorOptions,
+    HuggingFaceVlmOptions,
+)
+from docling.datamodel.settings import settings
+from docling.models.base_model import BasePageModel
+from docling.utils.accelerator_utils import decide_device
+from docling.utils.profiling import TimeRecorder
+
+_log = logging.getLogger(__name__)
+
+
+class HuggingFaceVlmModel(BasePageModel):
+
+    def __init__(
+        self,
+        enabled: bool,
+        artifacts_path: Optional[Path],
+        accelerator_options: AcceleratorOptions,
+        vlm_options: HuggingFaceVlmOptions,
+    ):
+        self.enabled = enabled
+
+        self.vlm_options = vlm_options
+
+        if self.enabled:
+            import torch
+            from transformers import (  # type: ignore
+                AutoModelForVision2Seq,
+                AutoProcessor,
+                BitsAndBytesConfig,
+            )
+
+            device = decide_device(accelerator_options.device)
+            self.device = device
+
+            _log.debug("Available device for HuggingFace VLM: {}".format(device))
+
+            repo_cache_folder = vlm_options.repo_id.replace("/", "--")
+
+            # PARAMETERS:
+            if artifacts_path is None:
+                artifacts_path = self.download_models(self.vlm_options.repo_id)
+            elif (artifacts_path / repo_cache_folder).exists():
+                artifacts_path = artifacts_path / repo_cache_folder
+
+            self.param_question = vlm_options.prompt  # "Perform Layout Analysis."
+            self.param_quantization_config = BitsAndBytesConfig(
+                load_in_8bit=vlm_options.load_in_8bit,  # True,
+                llm_int8_threshold=vlm_options.llm_int8_threshold,  # 6.0
+            )
+            self.param_quantized = vlm_options.quantized  # False
+
+            self.processor = AutoProcessor.from_pretrained(artifacts_path)
+            if not self.param_quantized:
+                self.vlm_model = AutoModelForVision2Seq.from_pretrained(
+                    artifacts_path,
+                    device_map=device,
+                    torch_dtype=torch.bfloat16,
+                    _attn_implementation=(
+                        "flash_attention_2"
+                        if self.device.startswith("cuda")
+                        and accelerator_options.cuda_use_flash_attention2
+                        else "eager"
+                    ),
+                )  # .to(self.device)
+
+            else:
+                self.vlm_model = AutoModelForVision2Seq.from_pretrained(
+                    artifacts_path,
+                    device_map=device,
+                    torch_dtype="auto",
+                    quantization_config=self.param_quantization_config,
+                    _attn_implementation=(
+                        "flash_attention_2"
+                        if self.device.startswith("cuda")
+                        and accelerator_options.cuda_use_flash_attention2
+                        else "eager"
+                    ),
+                )  # .to(self.device)
+
+    @staticmethod
+    def download_models(
+        repo_id: str,
+        local_dir: Optional[Path] = None,
+        force: bool = False,
+        progress: bool = False,
+    ) -> Path:
+        from huggingface_hub import snapshot_download
+        from huggingface_hub.utils import disable_progress_bars
+
+        if not progress:
+            disable_progress_bars()
+        download_path = snapshot_download(
+            repo_id=repo_id,
+            force_download=force,
+            local_dir=local_dir,
+            # revision="v0.0.1",
+        )
+
+        return Path(download_path)
+
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        for page in page_batch:
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                with TimeRecorder(conv_res, "vlm"):
+                    assert page.size is not None
+
+                    hi_res_image = page.get_image(scale=2.0)  # 144dpi
+                    # hi_res_image = page.get_image(scale=1.0)  # 72dpi
+
+                    if hi_res_image is not None:
+                        im_width, im_height = hi_res_image.size
+
+                    # populate page_tags with predicted doc tags
+                    page_tags = ""
+
+                    if hi_res_image:
+                        if hi_res_image.mode != "RGB":
+                            hi_res_image = hi_res_image.convert("RGB")
+
+                    messages = [
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "text",
+                                    "text": "This is a page from a document.",
+                                },
+                                {"type": "image"},
+                                {"type": "text", "text": self.param_question},
+                            ],
+                        }
+                    ]
+                    prompt = self.processor.apply_chat_template(
+                        messages, add_generation_prompt=False
+                    )
+                    inputs = self.processor(
+                        text=prompt, images=[hi_res_image], return_tensors="pt"
+                    )
+                    inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+                    start_time = time.time()
+                    # Call model to generate:
+                    generated_ids = self.vlm_model.generate(
+                        **inputs, max_new_tokens=4096, use_cache=True
+                    )
+
+                    generation_time = time.time() - start_time
+                    generated_texts = self.processor.batch_decode(
+                        generated_ids[:, inputs["input_ids"].shape[1] :],
+                        skip_special_tokens=False,
+                    )[0]
+
+                    num_tokens = len(generated_ids[0])
+                    page_tags = generated_texts
+
+                    # inference_time = time.time() - start_time
+                    # tokens_per_second = num_tokens / generation_time
+                    # print("")
+                    # print(f"Page Inference Time: {inference_time:.2f} seconds")
+                    # print(f"Total tokens on page: {num_tokens:.2f}")
+                    # print(f"Tokens/sec: {tokens_per_second:.2f}")
+                    # print("")
+                    page.predictions.vlm_response = VlmPrediction(text=page_tags)
+
+                yield page
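A hedged sketch of pre-fetching weights the way this model expects them on disk, under the artifacts folder with `/` in the repo id replaced by `--` (assumes docling 2.25.0):

```python
from pathlib import Path

from docling.models.hf_vlm_model import HuggingFaceVlmModel

artifacts = Path("~/.cache/docling/models").expanduser()
# Matches the repo_cache_folder convention used in __init__ above.
HuggingFaceVlmModel.download_models(
    repo_id="ds4sd/SmolDocling-256M-preview",
    local_dir=artifacts / "ds4sd--SmolDocling-256M-preview",
    progress=True,
)
```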
docling/models/picture_description_vlm_model.py
CHANGED
@@ -41,9 +41,9 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
         )
 
         # Initialize processor and model
-        self.processor = AutoProcessor.from_pretrained(
+        self.processor = AutoProcessor.from_pretrained(artifacts_path)
         self.model = AutoModelForVision2Seq.from_pretrained(
-
+            artifacts_path,
             torch_dtype=torch.bfloat16,
             _attn_implementation=(
                 "flash_attention_2" if self.device.startswith("cuda") else "eager"
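For context, a hedged sketch of enabling picture description with the Granite preset on the standard PDF pipeline; the `picture_description_options` field of `PdfPipelineOptions` is assumed here, since it is not shown in this diff:

```python
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    granite_picture_description,
)

opts = PdfPipelineOptions()
opts.do_picture_description = True
# granite_picture_description is the preset referenced by the model downloader below.
opts.picture_description_options = granite_picture_description  # assumed field name
```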
docling/pipeline/vlm_pipeline.py
ADDED
@@ -0,0 +1,534 @@
+import itertools
+import logging
+import re
+import warnings
+from io import BytesIO
+
+# from io import BytesIO
+from pathlib import Path
+from typing import Optional
+
+from docling_core.types import DoclingDocument
+from docling_core.types.doc import (
+    BoundingBox,
+    DocItem,
+    DocItemLabel,
+    DoclingDocument,
+    GroupLabel,
+    ImageRef,
+    ImageRefMode,
+    PictureItem,
+    ProvenanceItem,
+    Size,
+    TableCell,
+    TableData,
+    TableItem,
+)
+from docling_core.types.doc.tokens import DocumentToken, TableToken
+
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.backend.md_backend import MarkdownDocumentBackend
+from docling.backend.pdf_backend import PdfDocumentBackend
+from docling.datamodel.base_models import InputFormat, Page
+from docling.datamodel.document import ConversionResult, InputDocument
+from docling.datamodel.pipeline_options import (
+    PdfPipelineOptions,
+    ResponseFormat,
+    VlmPipelineOptions,
+)
+from docling.datamodel.settings import settings
+from docling.models.hf_vlm_model import HuggingFaceVlmModel
+from docling.pipeline.base_pipeline import PaginatedPipeline
+from docling.utils.profiling import ProfilingScope, TimeRecorder
+
+_log = logging.getLogger(__name__)
+
+
+class VlmPipeline(PaginatedPipeline):
+
+    def __init__(self, pipeline_options: VlmPipelineOptions):
+        super().__init__(pipeline_options)
+        self.keep_backend = True
+
+        warnings.warn(
+            "The VlmPipeline is currently experimental and may change in upcoming versions without notice.",
+            category=UserWarning,
+            stacklevel=2,
+        )
+
+        self.pipeline_options: VlmPipelineOptions
+
+        artifacts_path: Optional[Path] = None
+        if pipeline_options.artifacts_path is not None:
+            artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
+        elif settings.artifacts_path is not None:
+            artifacts_path = Path(settings.artifacts_path).expanduser()
+
+        if artifacts_path is not None and not artifacts_path.is_dir():
+            raise RuntimeError(
+                f"The value of {artifacts_path=} is not valid. "
+                "When defined, it must point to a folder containing all models required by the pipeline."
+            )
+
+        # force_backend_text = False - use text that is coming from VLM response
+        # force_backend_text = True - get text from backend using bounding boxes predicted by SmolDocling doctags
+        self.force_backend_text = (
+            pipeline_options.force_backend_text
+            and pipeline_options.vlm_options.response_format == ResponseFormat.DOCTAGS
+        )
+
+        self.keep_images = self.pipeline_options.generate_page_images
+
+        self.build_pipe = [
+            HuggingFaceVlmModel(
+                enabled=True,  # must be always enabled for this pipeline to make sense.
+                artifacts_path=artifacts_path,
+                accelerator_options=pipeline_options.accelerator_options,
+                vlm_options=self.pipeline_options.vlm_options,
+            ),
+        ]
+
+        self.enrichment_pipe = [
+            # Other models working on `NodeItem` elements in the DoclingDocument
+        ]
+
+    def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
+        with TimeRecorder(conv_res, "page_init"):
+            page._backend = conv_res.input._backend.load_page(page.page_no)  # type: ignore
+            if page._backend is not None and page._backend.is_valid():
+                page.size = page._backend.get_size()
+
+        return page
+
+    def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
+        with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
+
+            if (
+                self.pipeline_options.vlm_options.response_format
+                == ResponseFormat.DOCTAGS
+            ):
+                conv_res.document = self._turn_tags_into_doc(conv_res.pages)
+            elif (
+                self.pipeline_options.vlm_options.response_format
+                == ResponseFormat.MARKDOWN
+            ):
+                conv_res.document = self._turn_md_into_doc(conv_res)
+
+            else:
+                raise RuntimeError(
+                    f"Unsupported VLM response format {self.pipeline_options.vlm_options.response_format}"
+                )
+
+            # Generate images of the requested element types
+            if self.pipeline_options.generate_picture_images:
+                scale = self.pipeline_options.images_scale
+                for element, _level in conv_res.document.iterate_items():
+                    if not isinstance(element, DocItem) or len(element.prov) == 0:
+                        continue
+                    if (
+                        isinstance(element, PictureItem)
+                        and self.pipeline_options.generate_picture_images
+                    ):
+                        page_ix = element.prov[0].page_no - 1
+                        page = conv_res.pages[page_ix]
+                        assert page.size is not None
+                        assert page.image is not None
+
+                        crop_bbox = (
+                            element.prov[0]
+                            .bbox.scaled(scale=scale)
+                            .to_top_left_origin(page_height=page.size.height * scale)
+                        )
+
+                        cropped_im = page.image.crop(crop_bbox.as_tuple())
+                        element.image = ImageRef.from_pil(
+                            cropped_im, dpi=int(72 * scale)
+                        )
+
+        return conv_res
+
+    def _turn_md_into_doc(self, conv_res):
+        predicted_text = ""
+        for pg_idx, page in enumerate(conv_res.pages):
+            if page.predictions.vlm_response:
+                predicted_text += page.predictions.vlm_response.text + "\n\n"
+        response_bytes = BytesIO(predicted_text.encode("utf8"))
+        out_doc = InputDocument(
+            path_or_stream=response_bytes,
+            filename=conv_res.input.file.name,
+            format=InputFormat.MD,
+            backend=MarkdownDocumentBackend,
+        )
+        backend = MarkdownDocumentBackend(
+            in_doc=out_doc,
+            path_or_stream=response_bytes,
+        )
+        return backend.convert()
+
+    def _turn_tags_into_doc(self, pages: list[Page]) -> DoclingDocument:
+        ###############################################
+        # Tag definitions and color mappings
+        ###############################################
+
+        # Maps the recognized tag to a Docling label.
+        # Code items will be given DocItemLabel.CODE
+        tag_to_doclabel = {
+            "title": DocItemLabel.TITLE,
+            "document_index": DocItemLabel.DOCUMENT_INDEX,
+            "otsl": DocItemLabel.TABLE,
+            "section_header_level_1": DocItemLabel.SECTION_HEADER,
+            "checkbox_selected": DocItemLabel.CHECKBOX_SELECTED,
+            "checkbox_unselected": DocItemLabel.CHECKBOX_UNSELECTED,
+            "text": DocItemLabel.TEXT,
+            "page_header": DocItemLabel.PAGE_HEADER,
+            "page_footer": DocItemLabel.PAGE_FOOTER,
+            "formula": DocItemLabel.FORMULA,
+            "caption": DocItemLabel.CAPTION,
+            "picture": DocItemLabel.PICTURE,
+            "list_item": DocItemLabel.LIST_ITEM,
+            "footnote": DocItemLabel.FOOTNOTE,
+            "code": DocItemLabel.CODE,
+        }
+
+        # Maps each tag to an associated bounding box color.
+        tag_to_color = {
+            "title": "blue",
+            "document_index": "darkblue",
+            "otsl": "green",
+            "section_header_level_1": "purple",
+            "checkbox_selected": "black",
+            "checkbox_unselected": "gray",
+            "text": "red",
+            "page_header": "orange",
+            "page_footer": "cyan",
+            "formula": "pink",
+            "caption": "magenta",
+            "picture": "yellow",
+            "list_item": "brown",
+            "footnote": "darkred",
+            "code": "lightblue",
+        }
+
+        def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
+            """Extracts <loc_...> bounding box coords from the chunk, normalized by / 500."""
+            coords = re.findall(r"<loc_(\d+)>", text_chunk)
+            if len(coords) == 4:
+                l, t, r, b = map(float, coords)
+                return BoundingBox(l=l / 500, t=t / 500, r=r / 500, b=b / 500)
+            return None
+
+        def extract_inner_text(text_chunk: str) -> str:
+            """Strips all <...> tags inside the chunk to get the raw text content."""
+            return re.sub(r"<.*?>", "", text_chunk, flags=re.DOTALL).strip()
+
+        def extract_text_from_backend(page: Page, bbox: BoundingBox | None) -> str:
+            # Convert bounding box normalized to 0-100 into page coordinates for cropping
+            text = ""
+            if bbox:
+                if page.size:
+                    bbox.l = bbox.l * page.size.width
+                    bbox.t = bbox.t * page.size.height
+                    bbox.r = bbox.r * page.size.width
+                    bbox.b = bbox.b * page.size.height
+
+                    if page._backend:
+                        text = page._backend.get_text_in_rect(bbox)
+            return text
+
+        def otsl_parse_texts(texts, tokens):
+            split_word = TableToken.OTSL_NL.value
+            split_row_tokens = [
+                list(y)
+                for x, y in itertools.groupby(tokens, lambda z: z == split_word)
+                if not x
+            ]
+            table_cells = []
+            r_idx = 0
+            c_idx = 0
+
+            def count_right(tokens, c_idx, r_idx, which_tokens):
+                span = 0
+                c_idx_iter = c_idx
+                while tokens[r_idx][c_idx_iter] in which_tokens:
+                    c_idx_iter += 1
+                    span += 1
+                    if c_idx_iter >= len(tokens[r_idx]):
+                        return span
+                return span
+
+            def count_down(tokens, c_idx, r_idx, which_tokens):
+                span = 0
+                r_idx_iter = r_idx
+                while tokens[r_idx_iter][c_idx] in which_tokens:
+                    r_idx_iter += 1
+                    span += 1
+                    if r_idx_iter >= len(tokens):
+                        return span
+                return span
+
+            for i, text in enumerate(texts):
+                cell_text = ""
+                if text in [
+                    TableToken.OTSL_FCEL.value,
+                    TableToken.OTSL_ECEL.value,
+                    TableToken.OTSL_CHED.value,
+                    TableToken.OTSL_RHED.value,
+                    TableToken.OTSL_SROW.value,
+                ]:
+                    row_span = 1
+                    col_span = 1
+                    right_offset = 1
+                    if text != TableToken.OTSL_ECEL.value:
+                        cell_text = texts[i + 1]
+                        right_offset = 2
+
+                    # Check next element(s) for lcel / ucel / xcel, set properly row_span, col_span
+                    next_right_cell = ""
+                    if i + right_offset < len(texts):
+                        next_right_cell = texts[i + right_offset]
+
+                    next_bottom_cell = ""
+                    if r_idx + 1 < len(split_row_tokens):
+                        if c_idx < len(split_row_tokens[r_idx + 1]):
+                            next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]
+
+                    if next_right_cell in [
+                        TableToken.OTSL_LCEL.value,
+                        TableToken.OTSL_XCEL.value,
+                    ]:
+                        # we have horizontal spanning cell or 2d spanning cell
+                        col_span += count_right(
+                            split_row_tokens,
+                            c_idx + 1,
+                            r_idx,
+                            [TableToken.OTSL_LCEL.value, TableToken.OTSL_XCEL.value],
+                        )
+                    if next_bottom_cell in [
+                        TableToken.OTSL_UCEL.value,
+                        TableToken.OTSL_XCEL.value,
+                    ]:
+                        # we have a vertical spanning cell or 2d spanning cell
+                        row_span += count_down(
+                            split_row_tokens,
+                            c_idx,
+                            r_idx + 1,
+                            [TableToken.OTSL_UCEL.value, TableToken.OTSL_XCEL.value],
+                        )
+
+                    table_cells.append(
+                        TableCell(
+                            text=cell_text.strip(),
+                            row_span=row_span,
+                            col_span=col_span,
+                            start_row_offset_idx=r_idx,
+                            end_row_offset_idx=r_idx + row_span,
+                            start_col_offset_idx=c_idx,
+                            end_col_offset_idx=c_idx + col_span,
+                        )
+                    )
+                if text in [
+                    TableToken.OTSL_FCEL.value,
+                    TableToken.OTSL_ECEL.value,
+                    TableToken.OTSL_CHED.value,
+                    TableToken.OTSL_RHED.value,
+                    TableToken.OTSL_SROW.value,
+                    TableToken.OTSL_LCEL.value,
+                    TableToken.OTSL_UCEL.value,
+                    TableToken.OTSL_XCEL.value,
+                ]:
+                    c_idx += 1
+                if text == TableToken.OTSL_NL.value:
+                    r_idx += 1
+                    c_idx = 0
+            return table_cells, split_row_tokens
+
+        def otsl_extract_tokens_and_text(s: str):
+            # Pattern to match anything enclosed by < > (including the angle brackets themselves)
+            pattern = r"(<[^>]+>)"
+            # Find all tokens (e.g. "<otsl>", "<loc_140>", etc.)
+            tokens = re.findall(pattern, s)
+            # Remove any tokens that start with "<loc_"
+            tokens = [
+                token
+                for token in tokens
+                if not (
+                    token.startswith(rf"<{DocumentToken.LOC.value}")
+                    or token
+                    in [
+                        rf"<{DocumentToken.OTSL.value}>",
+                        rf"</{DocumentToken.OTSL.value}>",
+                    ]
+                )
+            ]
+            # Split the string by those tokens to get the in-between text
+            text_parts = re.split(pattern, s)
+            text_parts = [
+                token
+                for token in text_parts
+                if not (
+                    token.startswith(rf"<{DocumentToken.LOC.value}")
+                    or token
+                    in [
+                        rf"<{DocumentToken.OTSL.value}>",
+                        rf"</{DocumentToken.OTSL.value}>",
+                    ]
+                )
+            ]
+            # Remove any empty or purely whitespace strings from text_parts
+            text_parts = [part for part in text_parts if part.strip()]
+
+            return tokens, text_parts
+
+        def parse_table_content(otsl_content: str) -> TableData:
+            tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
+            table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)
+
+            return TableData(
+                num_rows=len(split_row_tokens),
+                num_cols=(
+                    max(len(row) for row in split_row_tokens) if split_row_tokens else 0
+                ),
+                table_cells=table_cells,
+            )
+
+        doc = DoclingDocument(name="Document")
+        for pg_idx, page in enumerate(pages):
+            xml_content = ""
+            predicted_text = ""
+            if page.predictions.vlm_response:
+                predicted_text = page.predictions.vlm_response.text
+            image = page.image
+
+            page_no = pg_idx + 1
+            bounding_boxes = []
+
+            if page.size:
+                pg_width = page.size.width
+                pg_height = page.size.height
+                size = Size(width=pg_width, height=pg_height)
+                parent_page = doc.add_page(page_no=page_no, size=size)
+
+            """
+            1. Finds all <tag>...</tag> blocks in the entire string (multi-line friendly) in the order they appear.
+            2. For each chunk, extracts bounding box (if any) and inner text.
+            3. Adds the item to a DoclingDocument structure with the right label.
+            4. Tracks bounding boxes + color in a separate list for later visualization.
+            """
+
+            # Regex for all recognized tags
+            tag_pattern = (
+                rf"<(?P<tag>{DocItemLabel.TITLE}|{DocItemLabel.DOCUMENT_INDEX}|"
+                rf"{DocItemLabel.CHECKBOX_UNSELECTED}|{DocItemLabel.CHECKBOX_SELECTED}|"
+                rf"{DocItemLabel.TEXT}|{DocItemLabel.PAGE_HEADER}|"
+                rf"{DocItemLabel.PAGE_FOOTER}|{DocItemLabel.FORMULA}|"
+                rf"{DocItemLabel.CAPTION}|{DocItemLabel.PICTURE}|"
+                rf"{DocItemLabel.LIST_ITEM}|{DocItemLabel.FOOTNOTE}|{DocItemLabel.CODE}|"
+                rf"{DocItemLabel.SECTION_HEADER}_level_1|{DocumentToken.OTSL.value})>.*?</(?P=tag)>"
+            )
+
+            # DocumentToken.OTSL
+            pattern = re.compile(tag_pattern, re.DOTALL)
+
+            # Go through each match in order
+            for match in pattern.finditer(predicted_text):
+                full_chunk = match.group(0)
+                tag_name = match.group("tag")
+
+                bbox = extract_bounding_box(full_chunk)
+                doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.PARAGRAPH)
+                color = tag_to_color.get(tag_name, "white")
+
+                # Store bounding box + color
+                if bbox:
+                    bounding_boxes.append((bbox, color))
+
+                if tag_name == DocumentToken.OTSL.value:
+                    table_data = parse_table_content(full_chunk)
+                    bbox = extract_bounding_box(full_chunk)
+
+                    if bbox:
+                        prov = ProvenanceItem(
+                            bbox=bbox.resize_by_scale(pg_width, pg_height),
+                            charspan=(0, 0),
+                            page_no=page_no,
+                        )
+                        doc.add_table(data=table_data, prov=prov)
+                    else:
+                        doc.add_table(data=table_data)
+
+                elif tag_name == DocItemLabel.PICTURE:
+                    text_caption_content = extract_inner_text(full_chunk)
+                    if image:
+                        if bbox:
+                            im_width, im_height = image.size
+
+                            crop_box = (
+                                int(bbox.l * im_width),
+                                int(bbox.t * im_height),
+                                int(bbox.r * im_width),
+                                int(bbox.b * im_height),
+                            )
+                            cropped_image = image.crop(crop_box)
+                            pic = doc.add_picture(
+                                parent=None,
+                                image=ImageRef.from_pil(image=cropped_image, dpi=72),
+                                prov=(
+                                    ProvenanceItem(
+                                        bbox=bbox.resize_by_scale(pg_width, pg_height),
+                                        charspan=(0, 0),
+                                        page_no=page_no,
+                                    )
+                                ),
+                            )
+                            # If there is a caption to an image, add it as well
+                            if len(text_caption_content) > 0:
+                                caption_item = doc.add_text(
+                                    label=DocItemLabel.CAPTION,
+                                    text=text_caption_content,
+                                    parent=None,
+                                )
+                                pic.captions.append(caption_item.get_ref())
+                    else:
+                        if bbox:
+                            # In case we don't have access to a binary of an image
+                            doc.add_picture(
+                                parent=None,
+                                prov=ProvenanceItem(
+                                    bbox=bbox, charspan=(0, 0), page_no=page_no
+                                ),
+                            )
+                            # If there is a caption to an image, add it as well
+                            if len(text_caption_content) > 0:
+                                caption_item = doc.add_text(
+                                    label=DocItemLabel.CAPTION,
+                                    text=text_caption_content,
+                                    parent=None,
+                                )
+                                pic.captions.append(caption_item.get_ref())
+                else:
+                    # For everything else, treat as text
+                    if self.force_backend_text:
+                        text_content = extract_text_from_backend(page, bbox)
+                    else:
+                        text_content = extract_inner_text(full_chunk)
+                    doc.add_text(
+                        label=doc_label,
+                        text=text_content,
+                        prov=(
+                            ProvenanceItem(
+                                bbox=bbox.resize_by_scale(pg_width, pg_height),
+                                charspan=(0, len(text_content)),
+                                page_no=page_no,
+                            )
+                            if bbox
+                            else None
+                        ),
+                    )
+        return doc
+
+    @classmethod
+    def get_default_options(cls) -> VlmPipelineOptions:
+        return VlmPipelineOptions()
+
+    @classmethod
+    def is_backend_supported(cls, backend: AbstractDocumentBackend):
+        return isinstance(backend, PdfDocumentBackend)
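A hedged end-to-end sketch (not part of the diff): `PdfFormatOption(pipeline_cls=...)` is the usual docling mechanism for swapping in a pipeline class, and the pipeline itself warns at construction time that it is experimental.

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=VlmPipelineOptions(),  # SmolDocling doctags by default
        )
    }
)
result = converter.convert("example.pdf")  # illustrative input file
print(result.document.export_to_markdown())
```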
docling/utils/model_downloader.py
CHANGED
@@ -2,7 +2,10 @@ import logging
 from pathlib import Path
 from typing import Optional
 
-from docling.datamodel.pipeline_options import
+from docling.datamodel.pipeline_options import (
+    granite_picture_description,
+    smolvlm_picture_description,
+)
 from docling.datamodel.settings import settings
 from docling.models.code_formula_model import CodeFormulaModel
 from docling.models.document_picture_classifier import DocumentPictureClassifier
@@ -23,7 +26,8 @@ def download_models(
     with_tableformer: bool = True,
     with_code_formula: bool = True,
     with_picture_classifier: bool = True,
-    with_smolvlm: bool =
+    with_smolvlm: bool = False,
+    with_granite_vision: bool = False,
     with_easyocr: bool = True,
 ):
     if output_dir is None:
@@ -73,6 +77,15 @@ def download_models(
         progress=progress,
     )
 
+    if with_granite_vision:
+        _log.info(f"Downloading Granite Vision model...")
+        PictureDescriptionVlmModel.download_models(
+            repo_id=granite_picture_description.repo_id,
+            local_dir=output_dir / granite_picture_description.repo_cache_folder,
+            force=force,
+            progress=progress,
+        )
+
     if with_easyocr:
         _log.info(f"Downloading easyocr models...")
         EasyOcrModel.download_models(
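A hedged sketch of the expanded downloader (assumes docling 2.25.0; note that `with_smolvlm` and `with_granite_vision` now default to False):

```python
from pathlib import Path

from docling.utils.model_downloader import download_models

# Fetch the defaults plus the new Granite Vision picture-description weights.
download_models(
    output_dir=Path("/tmp/docling-models"),
    with_granite_vision=True,
    progress=True,
)
```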
docling/utils/visualization.py
CHANGED
@@ -43,6 +43,11 @@ def draw_clusters(
         y0 *= scale_x
         y1 *= scale_y
 
+        if y1 <= y0:
+            y1, y0 = y0, y1
+        if x1 <= x0:
+            x1, x0 = x0, x1
+
         cluster_fill_color = (*list(DocItemLabel.get_color(c.label)), 70)
         cluster_outline_color = (
             *list(DocItemLabel.get_color(c.label)),
{docling-2.24.0.dist-info → docling-2.25.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.24.0
+Version: 2.25.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -25,6 +25,7 @@ Provides-Extra: ocrmac
 Provides-Extra: rapidocr
 Provides-Extra: tesserocr
 Provides-Extra: vlm
+Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
 Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
 Requires-Dist: certifi (>=2024.7.4)
 Requires-Dist: docling-core[chunking] (>=2.19.0,<3.0.0)
{docling-2.24.0.dist-info → docling-2.25.0.dist-info}/RECORD
CHANGED
@@ -5,7 +5,7 @@ docling/backend/asciidoc_backend.py,sha256=zyHxlG_BvlLwvpdNca3P6aopxOJZw8wbDFkJQ
 docling/backend/csv_backend.py,sha256=xuId4JGEXjoyPgO9Fy9hQ5C-ezXvJwv0TGB8fyFHgWM,4533
 docling/backend/docling_parse_backend.py,sha256=hEEJibI1oJS0LAnFoIs6gMshS3bCqGtVxHnDNvBGZuA,7649
 docling/backend/docling_parse_v2_backend.py,sha256=IpwrBrtLGwNRl5AYO-o3NjEfNRsAkuMhzvDt2HXb9Ko,8655
-docling/backend/html_backend.py,sha256=
+docling/backend/html_backend.py,sha256=j5ivNBDMM0bs24GxTHGGcsA7Z0pnb3iEZ2QKS0Xxdrc,17286
 docling/backend/json/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/backend/json/docling_json_backend.py,sha256=LlFMVoZrrCfVwbDuRbNN4Xg96Lujh4xxrTBt9jGhY9I,1984
 docling/backend/md_backend.py,sha256=NaVfcnEH-5bwVovjn76EobF6B6Wm8AhaTZ4E8k0TUPo,16826
@@ -20,12 +20,12 @@ docling/backend/xml/uspto_backend.py,sha256=IGUNeF2xpLeaVrX6nKb-jXgtSYD2ozULsrDP
 docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
 docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/cli/main.py,sha256=pCJ_GFgxsgZ0soz32OhMl-CWi7YXIrvax_m9Qw4UhMs,16839
-docling/cli/models.py,sha256=
+docling/cli/models.py,sha256=DDnz-boX2MexPxC8OnOMPgSPG0iwseT3xkkCfgPrZis,3969
 docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
 docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-docling/datamodel/base_models.py,sha256=
+docling/datamodel/base_models.py,sha256=kMDT-rFhtJUFOOOry4wd2PzCMTLFixFklgSgmRDMS64,7201
 docling/datamodel/document.py,sha256=DbJifyMgBEkAk80BMYXTuSgqH2vijDENDkU7Fmr6j_g,14567
-docling/datamodel/pipeline_options.py,sha256=
+docling/datamodel/pipeline_options.py,sha256=YpWqCqkA44YUFPhiBg_LYcfOAXxNhv10vZKrkfLtJ_I,11987
 docling/datamodel/settings.py,sha256=bNMdowIKv7RUchabQTo4rFNEsxfB6pGg2LoZSY634zo,1869
 docling/document_converter.py,sha256=AeiSmKzWcnOkZm8O-KIBG72g3l4W2CAsq3yEbfC1tiE,13184
 docling/exceptions.py,sha256=K1WnCS1leK2JtMB5ewZWKkb0EaijFgl-tRzrO9ntgPM,134
@@ -35,13 +35,14 @@ docling/models/base_ocr_model.py,sha256=YiUMvdjnHw9SHjnfJKT5INrPMoIGEf_Z2OApfl_V
 docling/models/code_formula_model.py,sha256=6grbRPWaLljadheT5s4omdT6hmXfin4gJU17csWvhjY,8611
 docling/models/document_picture_classifier.py,sha256=6I_j6fG5fnhIV6rqN31LYikNTZyg5isXrVs0GIqHDaY,6235
 docling/models/easyocr_model.py,sha256=ePg1exAXeOzkBRBT-6PBSmqKFmnNFkCEd4HNDsGVgLM,6860
+docling/models/hf_vlm_model.py,sha256=NUtLEuG-kNGJeDHWmQKAAOZG4WF0a5hn-KXUUM1mHBQ,6820
 docling/models/layout_model.py,sha256=7fQWipGV1HDrvbP4uOKa9QAicQl89jp7lailQmbFL3w,7804
 docling/models/ocr_mac_model.py,sha256=bLP14UUmZcSzjDe-HLj-mtksTuBmsCTg2C1wCxUpan0,4502
 docling/models/page_assemble_model.py,sha256=ivkCdbZJpFcGl7CazLegcP1tLK8ZixDfVhQXqsdW_UA,6359
 docling/models/page_preprocessing_model.py,sha256=1gVrZjObKxAvXkkKvXnIFApPOggzgiTFPtt1CGbMbSs,2763
 docling/models/picture_description_api_model.py,sha256=SKNoHpqzbfM8iO-DJJ4ccyNVqO0B2d9neLBnXqt50FY,3186
 docling/models/picture_description_base_model.py,sha256=rZLIW1_CaRAw_EP3zuI8ktC0ZxwO7yubhh2RkaC_8e8,1910
-docling/models/picture_description_vlm_model.py,sha256=
+docling/models/picture_description_vlm_model.py,sha256=EvKn4zWgTsQnbMFEoDhU3Ox4Pu5DkPqd2QewsGoXULU,3641
 docling/models/rapid_ocr_model.py,sha256=2HXmurNRPP6qyqn7U5h9NQIs8zi0TMHf56CpcKQk0fU,5038
 docling/models/readingorder_model.py,sha256=hNWbBX3uZv1FxMwKNKn2JFQuQqTspBLsJBVEidXr6Wk,14869
 docling/models/table_structure_model.py,sha256=UIqWlw_9JNfGsO86c00rPb4GCg-yNliKEwyhCqlsZbM,11225
@@ -51,19 +52,20 @@ docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
 docling/pipeline/base_pipeline.py,sha256=9ABK-Cr235bxE5vweoIA5rgBZV_EF8qFxAqLI27H_Pg,8749
 docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
 docling/pipeline/standard_pdf_pipeline.py,sha256=IQHktVYvueTrYnIgLonaMvfYKKsU3L-hC9dqrR-Lw8g,12904
+docling/pipeline/vlm_pipeline.py,sha256=glPwNH1QEuHj35L3tdPyuCX0CGlJn81ZDFrj3WwLa7o,22265
 docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
 docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 docling/utils/accelerator_utils.py,sha256=ONNRrC8fH-8E93WUCNhfOq1t7WrQ1T7-YsmExTOY5f0,2292
 docling/utils/export.py,sha256=KyGF1BVDHPFfHVXZc8vegsWlFfOgGPP2YckWpTadyI8,4694
 docling/utils/glm_utils.py,sha256=W4JRoP0xQ6SJmhhIoAfcKxm5dr1CFvLHp8pqI1kdhxs,12250
 docling/utils/layout_postprocessor.py,sha256=urRzeF9PrKiMBvA6DdHHwyLxG06CMhelgJeV5B1l6l0,24258
-docling/utils/model_downloader.py,sha256=
+docling/utils/model_downloader.py,sha256=sxAQvjiIu9m2Ur5Ot5C5SATmgWJAHi0xSjzxj8QXYJk,3213
 docling/utils/ocr_utils.py,sha256=F7iOOjqolUcImUzir4qjDQd4QWSO3s6JC4WRn3U7uY4,263
 docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
 docling/utils/utils.py,sha256=0ozCk7zUkYzxRVmYoIB2zA1lqjQOuaQzxfGuf1wmKW4,1866
-docling/utils/visualization.py,sha256=
-docling-2.
-docling-2.
-docling-2.
-docling-2.
-docling-2.
+docling/utils/visualization.py,sha256=cmbIroPQXPmJdFrNIfpC26WpijBwx05qmpu3QhiG1EI,2850
+docling-2.25.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
+docling-2.25.0.dist-info/METADATA,sha256=9k71yJWmZHMXgiGxqsmh6KhItKh5kvIDG5TpX2-1vgI,8797
+docling-2.25.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+docling-2.25.0.dist-info/entry_points.txt,sha256=cFrINXsORijdm2EWJzf1m9_rDxH9G9W1fP385-9atY4,84
+docling-2.25.0.dist-info/RECORD,,

{docling-2.24.0.dist-info → docling-2.25.0.dist-info}/LICENSE
File without changes
{docling-2.24.0.dist-info → docling-2.25.0.dist-info}/WHEEL
File without changes
{docling-2.24.0.dist-info → docling-2.25.0.dist-info}/entry_points.txt
File without changes