docling 2.19.0__tar.gz → 2.21.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docling-2.19.0 → docling-2.21.0}/PKG-INFO +5 -2
- {docling-2.19.0 → docling-2.21.0}/docling/cli/main.py +5 -0
- {docling-2.19.0 → docling-2.21.0}/docling/cli/models.py +2 -0
- {docling-2.19.0 → docling-2.21.0}/docling/datamodel/pipeline_options.py +52 -2
- {docling-2.19.0 → docling-2.21.0}/docling/models/base_model.py +2 -2
- {docling-2.19.0 → docling-2.21.0}/docling/models/ds_glm_model.py +60 -2
- {docling-2.19.0 → docling-2.21.0}/docling/models/easyocr_model.py +0 -2
- docling-2.21.0/docling/models/picture_description_api_model.py +101 -0
- docling-2.21.0/docling/models/picture_description_base_model.py +64 -0
- docling-2.21.0/docling/models/picture_description_vlm_model.py +109 -0
- {docling-2.19.0 → docling-2.21.0}/docling/pipeline/standard_pdf_pipeline.py +41 -1
- {docling-2.19.0 → docling-2.21.0}/docling/utils/glm_utils.py +10 -0
- {docling-2.19.0 → docling-2.21.0}/docling/utils/model_downloader.py +12 -0
- {docling-2.19.0 → docling-2.21.0}/pyproject.toml +9 -3
- {docling-2.19.0 → docling-2.21.0}/LICENSE +0 -0
- {docling-2.19.0 → docling-2.21.0}/README.md +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/__init__.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/backend/__init__.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/backend/abstract_backend.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/backend/asciidoc_backend.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/backend/docling_parse_backend.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/backend/docling_parse_v2_backend.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/backend/html_backend.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/backend/json/__init__.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/backend/json/docling_json_backend.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/backend/md_backend.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/backend/msexcel_backend.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/backend/mspowerpoint_backend.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/backend/msword_backend.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/backend/pdf_backend.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/backend/pypdfium2_backend.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/backend/xml/__init__.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/backend/xml/pubmed_backend.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/backend/xml/uspto_backend.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/chunking/__init__.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/cli/__init__.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/cli/tools.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/datamodel/__init__.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/datamodel/base_models.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/datamodel/document.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/datamodel/settings.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/document_converter.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/exceptions.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/models/__init__.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/models/base_ocr_model.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/models/code_formula_model.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/models/document_picture_classifier.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/models/layout_model.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/models/ocr_mac_model.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/models/page_assemble_model.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/models/page_preprocessing_model.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/models/rapid_ocr_model.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/models/table_structure_model.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/models/tesseract_ocr_cli_model.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/models/tesseract_ocr_model.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/pipeline/__init__.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/pipeline/base_pipeline.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/pipeline/simple_pipeline.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/py.typed +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/utils/__init__.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/utils/accelerator_utils.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/utils/export.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/utils/layout_postprocessor.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/utils/ocr_utils.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/utils/profiling.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/utils/utils.py +0 -0
- {docling-2.19.0 → docling-2.21.0}/docling/utils/visualization.py +0 -0
{docling-2.19.0 → docling-2.21.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: docling
-Version: 2.
+Version: 2.21.0
 Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
 Home-page: https://github.com/DS4SD/docling
 License: MIT
@@ -24,10 +24,11 @@ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Provides-Extra: ocrmac
 Provides-Extra: rapidocr
 Provides-Extra: tesserocr
+Provides-Extra: vlm
 Requires-Dist: beautifulsoup4 (>=4.12.3,<4.13.0)
 Requires-Dist: certifi (>=2024.7.4)
 Requires-Dist: deepsearch-glm (>=1.0.0,<2.0.0)
-Requires-Dist: docling-core[chunking] (>=2.
+Requires-Dist: docling-core[chunking] (>=2.18.0,<3.0.0)
 Requires-Dist: docling-ibm-models (>=3.3.0,<4.0.0)
 Requires-Dist: docling-parse (>=3.3.0,<4.0.0)
 Requires-Dist: easyocr (>=1.7,<2.0)
@@ -53,6 +54,8 @@ Requires-Dist: scipy (>=1.6.0,<1.14.0) ; python_version < "3.10"
 Requires-Dist: scipy (>=1.6.0,<2.0.0) ; python_version >= "3.10"
 Requires-Dist: tesserocr (>=2.7.1,<3.0.0) ; extra == "tesserocr"
 Requires-Dist: tqdm (>=4.65.0,<5.0.0)
+Requires-Dist: transformers (>=4.42.0,<4.43.0) ; (sys_platform == "darwin" and platform_machine == "x86_64") and (extra == "vlm")
+Requires-Dist: transformers (>=4.46.0,<5.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
 Requires-Dist: typer (>=0.12.5,<0.13.0)
 Project-URL: Repository, https://github.com/DS4SD/docling
 Description-Content-Type: text/markdown
{docling-2.19.0 → docling-2.21.0}/docling/cli/main.py

@@ -226,6 +226,10 @@ def convert(
             help="Enable the picture classification enrichment model in the pipeline.",
         ),
     ] = False,
+    enrich_picture_description: Annotated[
+        bool,
+        typer.Option(..., help="Enable the picture description model in the pipeline."),
+    ] = False,
     artifacts_path: Annotated[
         Optional[Path],
         typer.Option(..., help="If provided, the location of the model artifacts."),
@@ -382,6 +386,7 @@ def convert(
         do_table_structure=True,
         do_code_enrichment=enrich_code,
         do_formula_enrichment=enrich_formula,
+        do_picture_description=enrich_picture_description,
         do_picture_classification=enrich_picture_classes,
         document_timeout=document_timeout,
     )
{docling-2.19.0 → docling-2.21.0}/docling/cli/models.py

@@ -31,6 +31,7 @@ class _AvailableModels(str, Enum):
     TABLEFORMER = "tableformer"
     CODE_FORMULA = "code_formula"
     PICTURE_CLASSIFIER = "picture_classifier"
+    SMOLVLM = "smolvlm"
     EASYOCR = "easyocr"


@@ -81,6 +82,7 @@ def download(
         with_tableformer=_AvailableModels.TABLEFORMER in to_download,
         with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
         with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
+        with_smolvlm=_AvailableModels.SMOLVLM in to_download,
         with_easyocr=_AvailableModels.EASYOCR in to_download,
     )

{docling-2.19.0 → docling-2.21.0}/docling/datamodel/pipeline_options.py

@@ -2,9 +2,9 @@ import logging
 import os
 from enum import Enum
 from pathlib import Path
-from typing import Any, List, Literal, Optional, Union
+from typing import Annotated, Any, Dict, List, Literal, Optional, Union

-from pydantic import BaseModel, ConfigDict, Field, model_validator
+from pydantic import AnyUrl, BaseModel, ConfigDict, Field, model_validator
 from pydantic_settings import BaseSettings, SettingsConfigDict

 _log = logging.getLogger(__name__)
@@ -184,6 +184,51 @@ class OcrMacOptions(OcrOptions):
     )


+class PictureDescriptionBaseOptions(BaseModel):
+    kind: str
+    batch_size: int = 8
+    scale: float = 2
+
+    bitmap_area_threshold: float = (
+        0.2  # percentage of the area for a bitmap to processed with the models
+    )
+
+
+class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
+    kind: Literal["api"] = "api"
+
+    url: AnyUrl = AnyUrl("http://localhost:8000/v1/chat/completions")
+    headers: Dict[str, str] = {}
+    params: Dict[str, Any] = {}
+    timeout: float = 20
+
+    prompt: str = "Describe this image in a few sentences."
+    provenance: str = ""
+
+
+class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
+    kind: Literal["vlm"] = "vlm"
+
+    repo_id: str
+    prompt: str = "Describe this image in a few sentences."
+    # Config from here https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationConfig
+    generation_config: Dict[str, Any] = dict(max_new_tokens=200, do_sample=False)
+
+    @property
+    def repo_cache_folder(self) -> str:
+        return self.repo_id.replace("/", "--")
+
+
+smolvlm_picture_description = PictureDescriptionVlmOptions(
+    repo_id="HuggingFaceTB/SmolVLM-256M-Instruct"
+)
+# phi_picture_description = PictureDescriptionVlmOptions(repo_id="microsoft/Phi-3-vision-128k-instruct")
+granite_picture_description = PictureDescriptionVlmOptions(
+    repo_id="ibm-granite/granite-vision-3.1-2b-preview",
+    prompt="What is shown in this image?",
+)
+
+
 # Define an enum for the backend options
 class PdfBackend(str, Enum):
     """Enum of valid PDF backends."""
@@ -223,6 +268,7 @@ class PdfPipelineOptions(PipelineOptions):
     do_code_enrichment: bool = False  # True: perform code OCR
     do_formula_enrichment: bool = False  # True: perform formula OCR, return Latex code
     do_picture_classification: bool = False  # True: classify pictures in documents
+    do_picture_description: bool = False  # True: run describe pictures in documents

     table_structure_options: TableStructureOptions = TableStructureOptions()
     ocr_options: Union[
@@ -232,6 +278,10 @@ class PdfPipelineOptions(PipelineOptions):
         OcrMacOptions,
         RapidOcrOptions,
     ] = Field(EasyOcrOptions(), discriminator="kind")
+    picture_description_options: Annotated[
+        Union[PictureDescriptionApiOptions, PictureDescriptionVlmOptions],
+        Field(discriminator="kind"),
+    ] = smolvlm_picture_description

     images_scale: float = 1.0
     generate_page_images: bool = False
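To illustrate how the new options classes fit together, here is a minimal sketch (not part of the diff) that switches on picture description for the standard PDF pipeline. It assumes the existing `DocumentConverter` / `PdfFormatOption` entry points from `docling.document_converter` and uses the `granite_picture_description` preset introduced above; by default `picture_description_options` falls back to `smolvlm_picture_description`.

```python
# Hedged sketch: enable the new picture-description enrichment via PdfPipelineOptions.
# The DocumentConverter/PdfFormatOption wiring is the pre-existing docling API, not part of this diff.
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    granite_picture_description,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_description = True
# Defaults to smolvlm_picture_description; swap in the granite preset
# or a PictureDescriptionApiOptions instance as needed.
pipeline_options.picture_description_options = granite_picture_description

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
result = converter.convert("report.pdf")  # hypothetical input file
```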
{docling-2.19.0 → docling-2.21.0}/docling/models/base_model.py

@@ -1,7 +1,7 @@
 from abc import ABC, abstractmethod
 from typing import Any, Generic, Iterable, Optional

-from docling_core.types.doc import BoundingBox, DoclingDocument, NodeItem
+from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
 from typing_extensions import TypeVar

 from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
@@ -64,7 +64,7 @@ class BaseItemAndImageEnrichmentModel(
         if not self.is_processable(doc=conv_res.document, element=element):
             return None

-        assert isinstance(element,
+        assert isinstance(element, DocItem)
         element_prov = element.prov[0]

         bbox = element_prov.bbox
{docling-2.19.0 → docling-2.21.0}/docling/models/ds_glm_model.py

@@ -4,7 +4,12 @@ from pathlib import Path
 from typing import List, Union

 from deepsearch_glm.andromeda_nlp import nlp_model
-from docling_core.types.doc import
+from docling_core.types.doc import (
+    BoundingBox,
+    CoordOrigin,
+    DocItemLabel,
+    DoclingDocument,
+)
 from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
 from docling_core.types.legacy_doc.base import (
     Figure,
@@ -71,12 +76,15 @@ class GlmModel:
         )

         main_text: List[Union[Ref, BaseText]] = []
+        page_headers: List[Union[Ref, BaseText]] = []
+        page_footers: List[Union[Ref, BaseText]] = []
+
         tables: List[DsSchemaTable] = []
         figures: List[Figure] = []

         page_no_to_page = {p.page_no: p for p in conv_res.pages}

-        for element in conv_res.assembled.
+        for element in conv_res.assembled.body:
             # Convert bboxes to lower-left origin.
             target_bbox = DsBoundingBox(
                 element.cluster.bbox.to_bottom_left_origin(
@@ -238,6 +246,53 @@ class GlmModel:
                 )
             )

+        # We can throw in headers and footers at the end of the legacy doc
+        # since the reading-order will re-sort it later.
+        for element in conv_res.assembled.headers:
+            # Convert bboxes to lower-left origin.
+            target_bbox = DsBoundingBox(
+                element.cluster.bbox.to_bottom_left_origin(
+                    page_no_to_page[element.page_no].size.height
+                ).as_tuple()
+            )
+
+            if isinstance(element, TextElement):
+
+                tel = BaseText(
+                    text=element.text,
+                    obj_type=layout_label_to_ds_type.get(element.label),
+                    name=element.label,
+                    prov=[
+                        Prov(
+                            bbox=target_bbox,
+                            page=element.page_no + 1,
+                            span=[0, len(element.text)],
+                        )
+                    ],
+                )
+                if element.label == DocItemLabel.PAGE_HEADER:
+                    index = len(page_headers)
+                    ref_str = f"#/page-headers/{index}"
+                    main_text.append(
+                        Ref(
+                            name=element.label,
+                            obj_type=layout_label_to_ds_type.get(element.label),
+                            ref=ref_str,
+                        ),
+                    )
+                    page_headers.append(tel)
+                elif element.label == DocItemLabel.PAGE_FOOTER:
+                    index = len(page_footers)
+                    ref_str = f"#/page-footers/{index}"
+                    main_text.append(
+                        Ref(
+                            name=element.label,
+                            obj_type=layout_label_to_ds_type.get(element.label),
+                            ref=ref_str,
+                        ),
+                    )
+                    page_footers.append(tel)
+
         page_dimensions = [
             PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
             for p in conv_res.pages
@@ -252,6 +307,8 @@ class GlmModel:
             tables=tables,
             figures=figures,
             page_dimensions=page_dimensions,
+            page_headers=page_headers,
+            page_footers=page_footers,
         )

         return ds_doc
@@ -264,6 +321,7 @@ class GlmModel:
         glm_doc = self.model.apply_on_doc(ds_doc_dict)

         docling_doc: DoclingDocument = to_docling_document(glm_doc)  # Experimental
+        1 == 1

         # DEBUG code:
         def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
{docling-2.19.0 → docling-2.21.0}/docling/models/easyocr_model.py

@@ -4,9 +4,7 @@ import zipfile
 from pathlib import Path
 from typing import Iterable, List, Optional

-import httpx
 import numpy
-import torch
 from docling_core.types.doc import BoundingBox, CoordOrigin

 from docling.datamodel.base_models import Cell, OcrCell, Page
docling-2.21.0/docling/models/picture_description_api_model.py (new file)

@@ -0,0 +1,101 @@
+import base64
+import io
+import logging
+from typing import Iterable, List, Optional
+
+import requests
+from PIL import Image
+from pydantic import BaseModel, ConfigDict
+
+from docling.datamodel.pipeline_options import PictureDescriptionApiOptions
+from docling.models.picture_description_base_model import PictureDescriptionBaseModel
+
+_log = logging.getLogger(__name__)
+
+
+class ChatMessage(BaseModel):
+    role: str
+    content: str
+
+
+class ResponseChoice(BaseModel):
+    index: int
+    message: ChatMessage
+    finish_reason: str
+
+
+class ResponseUsage(BaseModel):
+    prompt_tokens: int
+    completion_tokens: int
+    total_tokens: int
+
+
+class ApiResponse(BaseModel):
+    model_config = ConfigDict(
+        protected_namespaces=(),
+    )
+
+    id: str
+    model: Optional[str] = None  # returned by openai
+    choices: List[ResponseChoice]
+    created: int
+    usage: ResponseUsage
+
+
+class PictureDescriptionApiModel(PictureDescriptionBaseModel):
+    # elements_batch_size = 4
+
+    def __init__(self, enabled: bool, options: PictureDescriptionApiOptions):
+        super().__init__(enabled=enabled, options=options)
+        self.options: PictureDescriptionApiOptions
+
+        if self.enabled:
+            if options.url.host != "localhost":
+                raise NotImplementedError(
+                    "The options try to connect to remote APIs which are not yet allowed."
+                )
+
+    def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
+        # Note: technically we could make a batch request here,
+        # but not all APIs will allow for it. For example, vllm won't allow more than 1.
+        for image in images:
+            img_io = io.BytesIO()
+            image.save(img_io, "PNG")
+            image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
+
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": self.options.prompt,
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/png;base64,{image_base64}"
+                            },
+                        },
+                    ],
+                }
+            ]
+
+            payload = {
+                "messages": messages,
+                **self.options.params,
+            }
+
+            r = requests.post(
+                str(self.options.url),
+                headers=self.options.headers,
+                json=payload,
+                timeout=self.options.timeout,
+            )
+            if not r.ok:
+                _log.error(f"Error calling the API. Reponse was {r.text}")
+                r.raise_for_status()
+
+            api_resp = ApiResponse.model_validate_json(r.text)
+            generated_text = api_resp.choices[0].message.content.strip()
+            yield generated_text
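As a usage sketch, the API variant can be pointed at a local OpenAI-compatible `/v1/chat/completions` endpoint (the constructor above currently rejects hosts other than `localhost`). Anything placed in `params` is merged verbatim into the request payload; the `model` key shown below is an assumption that depends on the serving backend, not something defined by this diff.

```python
# Hedged sketch: configure the API-based picture description against a locally served VLM.
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    PictureDescriptionApiOptions,
)

api_options = PictureDescriptionApiOptions(
    url="http://localhost:8000/v1/chat/completions",  # must be localhost for now
    params={"model": "local-vlm"},  # assumed key; whatever the serving backend expects
    prompt="Describe this figure in two sentences.",
    timeout=60,
)

pipeline_options = PdfPipelineOptions(
    do_picture_description=True,
    picture_description_options=api_options,
)
```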
docling-2.21.0/docling/models/picture_description_base_model.py (new file)

@@ -0,0 +1,64 @@
+import logging
+from pathlib import Path
+from typing import Any, Iterable, List, Optional, Union
+
+from docling_core.types.doc import (
+    DoclingDocument,
+    NodeItem,
+    PictureClassificationClass,
+    PictureItem,
+)
+from docling_core.types.doc.document import (  # TODO: move import to docling_core.types.doc
+    PictureDescriptionData,
+)
+from PIL import Image
+
+from docling.datamodel.pipeline_options import PictureDescriptionBaseOptions
+from docling.models.base_model import (
+    BaseItemAndImageEnrichmentModel,
+    ItemAndImageEnrichmentElement,
+)
+
+
+class PictureDescriptionBaseModel(BaseItemAndImageEnrichmentModel):
+    images_scale: float = 2.0
+
+    def __init__(
+        self,
+        enabled: bool,
+        options: PictureDescriptionBaseOptions,
+    ):
+        self.enabled = enabled
+        self.options = options
+        self.provenance = "not-implemented"
+
+    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
+        return self.enabled and isinstance(element, PictureItem)
+
+    def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
+        raise NotImplementedError
+
+    def __call__(
+        self,
+        doc: DoclingDocument,
+        element_batch: Iterable[ItemAndImageEnrichmentElement],
+    ) -> Iterable[NodeItem]:
+        if not self.enabled:
+            for element in element_batch:
+                yield element.item
+            return
+
+        images: List[Image.Image] = []
+        elements: List[PictureItem] = []
+        for el in element_batch:
+            assert isinstance(el.item, PictureItem)
+            elements.append(el.item)
+            images.append(el.image)
+
+        outputs = self._annotate_images(images)
+
+        for item, output in zip(elements, outputs):
+            item.annotations.append(
+                PictureDescriptionData(text=output, provenance=self.provenance)
+            )
+            yield item
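The base class reduces a custom backend to a single hook: `_annotate_images` receives the cropped `PIL` images and must yield one description per image, in order, while `__call__` filters to `PictureItem` elements and attaches each output as a `PictureDescriptionData` annotation. A minimal custom subclass might look like the following sketch; the constant-caption backend is purely illustrative and not part of the package.

```python
# Hedged sketch: a trivial custom backend built on PictureDescriptionBaseModel.
# Only _annotate_images is implemented; filtering and annotation handling are inherited.
from typing import Iterable

from PIL import Image

from docling.datamodel.pipeline_options import PictureDescriptionBaseOptions
from docling.models.picture_description_base_model import PictureDescriptionBaseModel


class EchoPictureDescriptionModel(PictureDescriptionBaseModel):
    def __init__(self, enabled: bool, options: PictureDescriptionBaseOptions):
        super().__init__(enabled=enabled, options=options)
        self.provenance = "echo-model"  # recorded on every PictureDescriptionData

    def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
        # Yield exactly one description per input image, in the same order.
        for image in images:
            yield f"Picture of size {image.width}x{image.height} px."
```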
docling-2.21.0/docling/models/picture_description_vlm_model.py (new file)

@@ -0,0 +1,109 @@
+from pathlib import Path
+from typing import Iterable, Optional, Union
+
+from PIL import Image
+
+from docling.datamodel.pipeline_options import (
+    AcceleratorOptions,
+    PictureDescriptionVlmOptions,
+)
+from docling.models.picture_description_base_model import PictureDescriptionBaseModel
+from docling.utils.accelerator_utils import decide_device
+
+
+class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
+
+    def __init__(
+        self,
+        enabled: bool,
+        artifacts_path: Optional[Union[Path, str]],
+        options: PictureDescriptionVlmOptions,
+        accelerator_options: AcceleratorOptions,
+    ):
+        super().__init__(enabled=enabled, options=options)
+        self.options: PictureDescriptionVlmOptions
+
+        if self.enabled:
+
+            if artifacts_path is None:
+                artifacts_path = self.download_models(repo_id=self.options.repo_id)
+            else:
+                artifacts_path = Path(artifacts_path) / self.options.repo_cache_folder
+
+            self.device = decide_device(accelerator_options.device)
+
+            try:
+                import torch
+                from transformers import AutoModelForVision2Seq, AutoProcessor
+            except ImportError:
+                raise ImportError(
+                    "transformers >=4.46 is not installed. Please install Docling with the required extras `pip install docling[vlm]`."
+                )
+
+            # Initialize processor and model
+            self.processor = AutoProcessor.from_pretrained(self.options.repo_id)
+            self.model = AutoModelForVision2Seq.from_pretrained(
+                self.options.repo_id,
+                torch_dtype=torch.bfloat16,
+                _attn_implementation=(
+                    "flash_attention_2" if self.device.startswith("cuda") else "eager"
+                ),
+            ).to(self.device)
+
+            self.provenance = f"{self.options.repo_id}"
+
+    @staticmethod
+    def download_models(
+        repo_id: str,
+        local_dir: Optional[Path] = None,
+        force: bool = False,
+        progress: bool = False,
+    ) -> Path:
+        from huggingface_hub import snapshot_download
+        from huggingface_hub.utils import disable_progress_bars
+
+        if not progress:
+            disable_progress_bars()
+        download_path = snapshot_download(
+            repo_id=repo_id,
+            force_download=force,
+            local_dir=local_dir,
+        )
+
+        return Path(download_path)
+
+    def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
+        from transformers import GenerationConfig
+
+        # Create input messages
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image"},
+                    {"type": "text", "text": self.options.prompt},
+                ],
+            },
+        ]
+
+        # TODO: do batch generation
+
+        for image in images:
+            # Prepare inputs
+            prompt = self.processor.apply_chat_template(
+                messages, add_generation_prompt=True
+            )
+            inputs = self.processor(text=prompt, images=[image], return_tensors="pt")
+            inputs = inputs.to(self.device)
+
+            # Generate outputs
+            generated_ids = self.model.generate(
+                **inputs,
+                generation_config=GenerationConfig(**self.options.generation_config),
+            )
+            generated_texts = self.processor.batch_decode(
+                generated_ids[:, inputs["input_ids"].shape[1] :],
+                skip_special_tokens=True,
+            )
+
+            yield generated_texts[0].strip()
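Once the enrichment has run, the generated text ends up as `PictureDescriptionData` entries on each `PictureItem`. A short sketch for reading them back, assuming `result` is the `ConversionResult` from the pipeline-options example earlier in this diff:

```python
# Hedged sketch: read back the picture descriptions produced by the enrichment.
# Assumes a converter configured with do_picture_description=True (see the earlier sketch).
from docling_core.types.doc.document import PictureDescriptionData

doc = result.document
for picture in doc.pictures:
    for annotation in picture.annotations:
        if isinstance(annotation, PictureDescriptionData):
            print(f"{picture.self_ref} ({annotation.provenance}): {annotation.text}")
```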
{docling-2.19.0 → docling-2.21.0}/docling/pipeline/standard_pdf_pipeline.py

@@ -14,6 +14,8 @@ from docling.datamodel.pipeline_options import (
     EasyOcrOptions,
     OcrMacOptions,
     PdfPipelineOptions,
+    PictureDescriptionApiOptions,
+    PictureDescriptionVlmOptions,
     RapidOcrOptions,
     TesseractCliOcrOptions,
     TesseractOcrOptions,
@@ -34,6 +36,9 @@ from docling.models.page_preprocessing_model import (
     PagePreprocessingModel,
     PagePreprocessingOptions,
 )
+from docling.models.picture_description_api_model import PictureDescriptionApiModel
+from docling.models.picture_description_base_model import PictureDescriptionBaseModel
+from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
 from docling.models.rapid_ocr_model import RapidOcrModel
 from docling.models.table_structure_model import TableStructureModel
 from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
@@ -95,8 +100,17 @@ class StandardPdfPipeline(PaginatedPipeline):
             PageAssembleModel(options=PageAssembleOptions()),
         ]

+        # Picture description model
+        if (
+            picture_description_model := self.get_picture_description_model(
+                artifacts_path=artifacts_path
+            )
+        ) is None:
+            raise RuntimeError(
+                f"The specified picture description kind is not supported: {pipeline_options.picture_description_options.kind}."
+            )
+
         self.enrichment_pipe = [
-            # Other models working on `NodeItem` elements in the DoclingDocument
             # Code Formula Enrichment Model
             CodeFormulaModel(
                 enabled=pipeline_options.do_code_enrichment
@@ -115,11 +129,14 @@ class StandardPdfPipeline(PaginatedPipeline):
                 options=DocumentPictureClassifierOptions(),
                 accelerator_options=pipeline_options.accelerator_options,
             ),
+            # Document Picture description
+            picture_description_model,
         ]

         if (
             self.pipeline_options.do_formula_enrichment
             or self.pipeline_options.do_code_enrichment
+            or self.pipeline_options.do_picture_description
         ):
             self.keep_backend = True

@@ -175,6 +192,29 @@ class StandardPdfPipeline(PaginatedPipeline):
             )
             return None

+    def get_picture_description_model(
+        self, artifacts_path: Optional[Path] = None
+    ) -> Optional[PictureDescriptionBaseModel]:
+        if isinstance(
+            self.pipeline_options.picture_description_options,
+            PictureDescriptionApiOptions,
+        ):
+            return PictureDescriptionApiModel(
+                enabled=self.pipeline_options.do_picture_description,
+                options=self.pipeline_options.picture_description_options,
+            )
+        elif isinstance(
+            self.pipeline_options.picture_description_options,
+            PictureDescriptionVlmOptions,
+        ):
+            return PictureDescriptionVlmModel(
+                enabled=self.pipeline_options.do_picture_description,
+                artifacts_path=artifacts_path,
+                options=self.pipeline_options.picture_description_options,
+                accelerator_options=self.pipeline_options.accelerator_options,
+            )
+        return None
+
     def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
         with TimeRecorder(conv_res, "page_init"):
             page._backend = conv_res.input._backend.load_page(page.page_no)  # type: ignore
{docling-2.19.0 → docling-2.21.0}/docling/utils/glm_utils.py

@@ -15,6 +15,7 @@ from docling_core.types.doc import (
     TableCell,
     TableData,
 )
+from docling_core.types.doc.document import ContentLayer


 def resolve_item(paths, obj):
@@ -311,6 +312,15 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
             current_list = None

             doc.add_text(label=DocItemLabel.FORMULA, text="", orig=text, prov=prov)
+        elif label in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
+            current_list = None
+
+            doc.add_text(
+                label=DocItemLabel(name_label),
+                text=text,
+                prov=prov,
+                content_layer=ContentLayer.FURNITURE,
+            )
         else:
             current_list = None

{docling-2.19.0 → docling-2.21.0}/docling/utils/model_downloader.py

@@ -2,11 +2,13 @@ import logging
 from pathlib import Path
 from typing import Optional

+from docling.datamodel.pipeline_options import smolvlm_picture_description
 from docling.datamodel.settings import settings
 from docling.models.code_formula_model import CodeFormulaModel
 from docling.models.document_picture_classifier import DocumentPictureClassifier
 from docling.models.easyocr_model import EasyOcrModel
 from docling.models.layout_model import LayoutModel
+from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
 from docling.models.table_structure_model import TableStructureModel

 _log = logging.getLogger(__name__)
@@ -21,6 +23,7 @@ def download_models(
     with_tableformer: bool = True,
     with_code_formula: bool = True,
     with_picture_classifier: bool = True,
+    with_smolvlm: bool = True,
     with_easyocr: bool = True,
 ):
     if output_dir is None:
@@ -61,6 +64,15 @@ def download_models(
             progress=progress,
         )

+    if with_smolvlm:
+        _log.info(f"Downloading SmolVlm model...")
+        PictureDescriptionVlmModel.download_models(
+            repo_id=smolvlm_picture_description.repo_id,
+            local_dir=output_dir / smolvlm_picture_description.repo_cache_folder,
+            force=force,
+            progress=progress,
+        )
+
     if with_easyocr:
         _log.info(f"Downloading easyocr models...")
         EasyOcrModel.download_models(
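For offline or air-gapped setups, the updated downloader can prefetch the SmolVLM weights together with the other models; the chosen directory can then be handed to the pipeline as `artifacts_path`. A minimal sketch, assuming only the parameters visible in this diff:

```python
# Hedged sketch: prefetch model weights, including the new SmolVLM download,
# into a local artifacts directory for later use as artifacts_path.
from pathlib import Path

from docling.utils.model_downloader import download_models

artifacts_dir = Path("./docling-artifacts")  # hypothetical target directory
download_models(
    output_dir=artifacts_dir,
    with_smolvlm=True,  # new in this release
)
```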
{docling-2.19.0 → docling-2.21.0}/pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling"
-version = "2.
+version = "2.21.0"  # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
 authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
 license = "MIT"
@@ -26,7 +26,7 @@ packages = [{include = "docling"}]
 ######################
 python = "^3.9"
 pydantic = "^2.0.0"
-docling-core = {extras = ["chunking"], version = "^2.
+docling-core = {extras = ["chunking"], version = "^2.18.0"}
 docling-ibm-models = "^3.3.0"
 deepsearch-glm = "^1.0.0"
 docling-parse = "^3.3.0"
@@ -59,6 +59,10 @@ onnxruntime = [
     { version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" },
     { version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" }
 ]
+transformers = [
+    {markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^4.46.0", optional = true },
+    {markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~4.42.0", optional = true }
+]
 pillow = "^10.0.0"
 tqdm = "^4.65.0"

@@ -121,6 +125,7 @@ torchvision = [
 [tool.poetry.extras]
 tesserocr = ["tesserocr"]
 ocrmac = ["ocrmac"]
+vlm = ["transformers"]
 rapidocr = ["rapidocr-onnxruntime", "onnxruntime"]

 [tool.poetry.scripts]
@@ -162,7 +167,8 @@ module = [
     "deepsearch_glm.*",
     "lxml.*",
     "bs4.*",
-    "huggingface_hub.*"
+    "huggingface_hub.*",
+    "transformers.*",
 ]
 ignore_missing_imports = true

|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|