docling 2.29.0__py3-none-any.whl → 2.31.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/asciidoc_backend.py +7 -15
- docling/backend/csv_backend.py +1 -1
- docling/backend/docling_parse_backend.py +2 -2
- docling/backend/docling_parse_v2_backend.py +2 -2
- docling/backend/docling_parse_v4_backend.py +3 -4
- docling/backend/docx/latex/latex_dict.py +0 -5
- docling/backend/docx/latex/omml.py +4 -7
- docling/backend/html_backend.py +26 -9
- docling/backend/md_backend.py +5 -7
- docling/backend/msexcel_backend.py +271 -95
- docling/backend/mspowerpoint_backend.py +4 -7
- docling/backend/msword_backend.py +23 -15
- docling/backend/pdf_backend.py +2 -1
- docling/backend/pypdfium2_backend.py +3 -3
- docling/backend/xml/jats_backend.py +10 -13
- docling/backend/xml/uspto_backend.py +15 -19
- docling/cli/main.py +27 -9
- docling/cli/models.py +2 -3
- docling/datamodel/base_models.py +40 -5
- docling/datamodel/document.py +18 -10
- docling/datamodel/pipeline_options.py +29 -4
- docling/document_converter.py +5 -5
- docling/models/api_vlm_model.py +66 -0
- docling/models/base_model.py +2 -4
- docling/models/base_ocr_model.py +2 -2
- docling/models/code_formula_model.py +2 -1
- docling/models/document_picture_classifier.py +2 -1
- docling/models/easyocr_model.py +10 -11
- docling/models/factories/__init__.py +2 -2
- docling/models/factories/base_factory.py +1 -1
- docling/models/hf_mlx_model.py +4 -6
- docling/models/hf_vlm_model.py +7 -5
- docling/models/layout_model.py +2 -2
- docling/models/ocr_mac_model.py +3 -4
- docling/models/page_assemble_model.py +7 -12
- docling/models/page_preprocessing_model.py +2 -1
- docling/models/picture_description_api_model.py +9 -75
- docling/models/picture_description_base_model.py +16 -5
- docling/models/picture_description_vlm_model.py +2 -3
- docling/models/rapid_ocr_model.py +2 -3
- docling/models/readingorder_model.py +8 -23
- docling/models/table_structure_model.py +2 -6
- docling/models/tesseract_ocr_cli_model.py +17 -16
- docling/models/tesseract_ocr_model.py +8 -6
- docling/pipeline/base_pipeline.py +4 -8
- docling/pipeline/simple_pipeline.py +0 -1
- docling/pipeline/standard_pdf_pipeline.py +6 -3
- docling/pipeline/vlm_pipeline.py +27 -20
- docling/utils/api_image_request.py +61 -0
- docling/utils/export.py +2 -4
- docling/utils/glm_utils.py +2 -2
- docling/utils/layout_postprocessor.py +4 -2
- docling/utils/model_downloader.py +7 -7
- docling/utils/utils.py +1 -1
- {docling-2.29.0.dist-info → docling-2.31.0.dist-info}/METADATA +4 -3
- docling-2.31.0.dist-info/RECORD +86 -0
- docling-2.29.0.dist-info/RECORD +0 -84
- {docling-2.29.0.dist-info → docling-2.31.0.dist-info}/LICENSE +0 -0
- {docling-2.29.0.dist-info → docling-2.31.0.dist-info}/WHEEL +0 -0
- {docling-2.29.0.dist-info → docling-2.31.0.dist-info}/entry_points.txt +0 -0
@@ -1,12 +1,7 @@
|
|
1
|
-
import copy
|
2
|
-
import random
|
3
1
|
from pathlib import Path
|
4
2
|
from typing import Dict, List
|
5
3
|
|
6
4
|
from docling_core.types.doc import (
|
7
|
-
BoundingBox,
|
8
|
-
CoordOrigin,
|
9
|
-
DocItem,
|
10
5
|
DocItemLabel,
|
11
6
|
DoclingDocument,
|
12
7
|
DocumentOrigin,
|
@@ -17,13 +12,10 @@ from docling_core.types.doc import (
|
|
17
12
|
TableData,
|
18
13
|
)
|
19
14
|
from docling_core.types.doc.document import ContentLayer
|
20
|
-
from docling_core.types.legacy_doc.base import Ref
|
21
|
-
from docling_core.types.legacy_doc.document import BaseText
|
22
15
|
from docling_ibm_models.reading_order.reading_order_rb import (
|
23
16
|
PageElement as ReadingOrderPageElement,
|
17
|
+
ReadingOrderPredictor,
|
24
18
|
)
|
25
|
-
from docling_ibm_models.reading_order.reading_order_rb import ReadingOrderPredictor
|
26
|
-
from PIL import ImageDraw
|
27
19
|
from pydantic import BaseModel, ConfigDict
|
28
20
|
|
29
21
|
from docling.datamodel.base_models import (
|
@@ -35,7 +27,6 @@ from docling.datamodel.base_models import (
|
|
35
27
|
TextElement,
|
36
28
|
)
|
37
29
|
from docling.datamodel.document import ConversionResult
|
38
|
-
from docling.datamodel.settings import settings
|
39
30
|
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
40
31
|
|
41
32
|
|
@@ -53,12 +44,10 @@ class ReadingOrderModel:
|
|
53
44
|
def _assembled_to_readingorder_elements(
|
54
45
|
self, conv_res: ConversionResult
|
55
46
|
) -> List[ReadingOrderPageElement]:
|
56
|
-
|
57
47
|
elements: List[ReadingOrderPageElement] = []
|
58
48
|
page_no_to_pages = {p.page_no: p for p in conv_res.pages}
|
59
49
|
|
60
50
|
for element in conv_res.assembled.elements:
|
61
|
-
|
62
51
|
page_height = page_no_to_pages[element.page_no].size.height # type: ignore
|
63
52
|
bbox = element.cluster.bbox.to_bottom_left_origin(page_height)
|
64
53
|
text = element.text or ""
|
@@ -84,7 +73,6 @@ class ReadingOrderModel:
|
|
84
73
|
def _add_child_elements(
|
85
74
|
self, element: BasePageElement, doc_item: NodeItem, doc: DoclingDocument
|
86
75
|
):
|
87
|
-
|
88
76
|
child: Cluster
|
89
77
|
for child in element.cluster.children:
|
90
78
|
c_label = child.label
|
@@ -110,7 +98,7 @@ class ReadingOrderModel:
|
|
110
98
|
else:
|
111
99
|
doc.add_text(parent=doc_item, label=c_label, text=c_text, prov=c_prov)
|
112
100
|
|
113
|
-
def _readingorder_elements_to_docling_doc(
|
101
|
+
def _readingorder_elements_to_docling_doc( # noqa: C901
|
114
102
|
self,
|
115
103
|
conv_res: ConversionResult,
|
116
104
|
ro_elements: List[ReadingOrderPageElement],
|
@@ -118,7 +106,6 @@ class ReadingOrderModel:
|
|
118
106
|
el_to_footnotes_mapping: Dict[int, List[int]],
|
119
107
|
el_merges_mapping: Dict[int, List[int]],
|
120
108
|
) -> DoclingDocument:
|
121
|
-
|
122
109
|
id_to_elem = {
|
123
110
|
RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem
|
124
111
|
for elem in conv_res.assembled.elements
|
@@ -192,7 +179,6 @@ class ReadingOrderModel:
|
|
192
179
|
|
193
180
|
code_item.footnotes.append(new_footnote_item.get_ref())
|
194
181
|
else:
|
195
|
-
|
196
182
|
new_item, current_list = self._handle_text_element(
|
197
183
|
element, out_doc, current_list, page_height
|
198
184
|
)
|
@@ -206,7 +192,6 @@ class ReadingOrderModel:
|
|
206
192
|
)
|
207
193
|
|
208
194
|
elif isinstance(element, Table):
|
209
|
-
|
210
195
|
tbl_data = TableData(
|
211
196
|
num_rows=element.num_rows,
|
212
197
|
num_cols=element.num_cols,
|
@@ -342,12 +327,12 @@ class ReadingOrderModel:
|
|
342
327
|
return new_item, current_list
|
343
328
|
|
344
329
|
def _merge_elements(self, element, merged_elem, new_item, page_height):
|
345
|
-
assert isinstance(
|
346
|
-
|
347
|
-
)
|
348
|
-
assert (
|
349
|
-
|
350
|
-
)
|
330
|
+
assert isinstance(merged_elem, type(element)), (
|
331
|
+
"Merged element must be of same type as element."
|
332
|
+
)
|
333
|
+
assert merged_elem.label == new_item.label, (
|
334
|
+
"Labels of merged elements must match."
|
335
|
+
)
|
351
336
|
prov = ProvenanceItem(
|
352
337
|
page_no=element.page_no + 1,
|
353
338
|
charspan=(
|
@@ -1,13 +1,13 @@
|
|
1
1
|
import copy
|
2
2
|
import warnings
|
3
|
+
from collections.abc import Iterable
|
3
4
|
from pathlib import Path
|
4
|
-
from typing import
|
5
|
+
from typing import Optional
|
5
6
|
|
6
7
|
import numpy
|
7
8
|
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
|
8
9
|
from docling_core.types.doc.page import (
|
9
10
|
BoundingRectangle,
|
10
|
-
SegmentedPdfPage,
|
11
11
|
TextCellUnit,
|
12
12
|
)
|
13
13
|
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
|
@@ -44,7 +44,6 @@ class TableStructureModel(BasePageModel):
|
|
44
44
|
|
45
45
|
self.enabled = enabled
|
46
46
|
if self.enabled:
|
47
|
-
|
48
47
|
if artifacts_path is None:
|
49
48
|
artifacts_path = self.download_models() / self._model_path
|
50
49
|
else:
|
@@ -175,7 +174,6 @@ class TableStructureModel(BasePageModel):
|
|
175
174
|
def __call__(
|
176
175
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
177
176
|
) -> Iterable[Page]:
|
178
|
-
|
179
177
|
if not self.enabled:
|
180
178
|
yield from page_batch
|
181
179
|
return
|
@@ -186,7 +184,6 @@ class TableStructureModel(BasePageModel):
|
|
186
184
|
yield page
|
187
185
|
else:
|
188
186
|
with TimeRecorder(conv_res, "table_structure"):
|
189
|
-
|
190
187
|
assert page.predictions.layout is not None
|
191
188
|
assert page.size is not None
|
192
189
|
|
@@ -260,7 +257,6 @@ class TableStructureModel(BasePageModel):
|
|
260
257
|
table_out = tf_output[0]
|
261
258
|
table_cells = []
|
262
259
|
for element in table_out["tf_responses"]:
|
263
|
-
|
264
260
|
if not self.do_cell_matching:
|
265
261
|
the_bbox = BoundingBox.model_validate(
|
266
262
|
element["bbox"]
|
@@ -3,9 +3,10 @@ import io
|
|
3
3
|
import logging
|
4
4
|
import os
|
5
5
|
import tempfile
|
6
|
+
from collections.abc import Iterable
|
6
7
|
from pathlib import Path
|
7
8
|
from subprocess import DEVNULL, PIPE, Popen
|
8
|
-
from typing import
|
9
|
+
from typing import List, Optional, Tuple, Type
|
9
10
|
|
10
11
|
import pandas as pd
|
11
12
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
@@ -63,8 +64,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
63
64
|
)
|
64
65
|
|
65
66
|
def _get_name_and_version(self) -> Tuple[str, str]:
|
66
|
-
|
67
|
-
if self._name != None and self._version != None:
|
67
|
+
if self._name is not None and self._version is not None:
|
68
68
|
return self._name, self._version # type: ignore
|
69
69
|
|
70
70
|
cmd = [self.options.tesseract_cmd, "--version"]
|
@@ -125,14 +125,16 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
125
125
|
# _log.info(decoded_data)
|
126
126
|
|
127
127
|
# Read the TSV file generated by Tesseract
|
128
|
-
|
128
|
+
df_result = pd.read_csv(
|
129
|
+
io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t"
|
130
|
+
)
|
129
131
|
|
130
132
|
# Display the dataframe (optional)
|
131
133
|
# _log.info("df: ", df.head())
|
132
134
|
|
133
135
|
# Filter rows that contain actual text (ignore header or empty rows)
|
134
|
-
df_filtered =
|
135
|
-
|
136
|
+
df_filtered = df_result[
|
137
|
+
df_result["text"].notna() & (df_result["text"].apply(str).str.strip() != "")
|
136
138
|
]
|
137
139
|
|
138
140
|
return df_filtered
|
@@ -149,10 +151,10 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
149
151
|
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
|
150
152
|
output, _ = proc.communicate()
|
151
153
|
decoded_data = output.decode("utf-8")
|
152
|
-
|
154
|
+
df_detected = pd.read_csv(
|
153
155
|
io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
|
154
156
|
)
|
155
|
-
scripts =
|
157
|
+
scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
|
156
158
|
if len(scripts) == 0:
|
157
159
|
_log.warning("Tesseract cannot detect the script of the page")
|
158
160
|
return None
|
@@ -183,11 +185,11 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
183
185
|
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
|
184
186
|
output, _ = proc.communicate()
|
185
187
|
decoded_data = output.decode("utf-8")
|
186
|
-
|
187
|
-
self._tesseract_languages =
|
188
|
+
df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
|
189
|
+
self._tesseract_languages = df_list[0].tolist()[1:]
|
188
190
|
|
189
191
|
# Decide the script prefix
|
190
|
-
if any(
|
192
|
+
if any(lang.startswith("script/") for lang in self._tesseract_languages):
|
191
193
|
script_prefix = "script/"
|
192
194
|
else:
|
193
195
|
script_prefix = ""
|
@@ -197,7 +199,6 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
197
199
|
def __call__(
|
198
200
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
199
201
|
) -> Iterable[Page]:
|
200
|
-
|
201
202
|
if not self.enabled:
|
202
203
|
yield from page_batch
|
203
204
|
return
|
@@ -225,19 +226,19 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
225
226
|
fname = image_file.name
|
226
227
|
high_res_image.save(image_file)
|
227
228
|
|
228
|
-
|
229
|
+
df_result = self._run_tesseract(fname)
|
229
230
|
finally:
|
230
231
|
if os.path.exists(fname):
|
231
232
|
os.remove(fname)
|
232
233
|
|
233
|
-
# _log.info(
|
234
|
+
# _log.info(df_result)
|
234
235
|
|
235
236
|
# Print relevant columns (bounding box and text)
|
236
|
-
for ix, row in
|
237
|
+
for ix, row in df_result.iterrows():
|
237
238
|
text = row["text"]
|
238
239
|
conf = row["conf"]
|
239
240
|
|
240
|
-
l = float(row["left"])
|
241
|
+
l = float(row["left"]) # noqa: E741
|
241
242
|
b = float(row["top"])
|
242
243
|
w = float(row["width"])
|
243
244
|
h = float(row["height"])
|
@@ -1,6 +1,7 @@
|
|
1
1
|
import logging
|
2
|
+
from collections.abc import Iterable
|
2
3
|
from pathlib import Path
|
3
|
-
from typing import
|
4
|
+
from typing import Optional, Type
|
4
5
|
|
5
6
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
6
7
|
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
@@ -37,9 +38,6 @@ class TesseractOcrModel(BaseOcrModel):
|
|
37
38
|
self.options: TesseractOcrOptions
|
38
39
|
|
39
40
|
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
40
|
-
self.reader = None
|
41
|
-
self.osd_reader = None
|
42
|
-
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
|
43
41
|
|
44
42
|
if self.enabled:
|
45
43
|
install_errmsg = (
|
@@ -64,7 +62,7 @@ class TesseractOcrModel(BaseOcrModel):
|
|
64
62
|
raise ImportError(install_errmsg)
|
65
63
|
try:
|
66
64
|
tesseract_version = tesserocr.tesseract_version()
|
67
|
-
except:
|
65
|
+
except Exception:
|
68
66
|
raise ImportError(install_errmsg)
|
69
67
|
|
70
68
|
_, self._tesserocr_languages = tesserocr.get_languages()
|
@@ -75,7 +73,7 @@ class TesseractOcrModel(BaseOcrModel):
|
|
75
73
|
_log.debug("Initializing TesserOCR: %s", tesseract_version)
|
76
74
|
lang = "+".join(self.options.lang)
|
77
75
|
|
78
|
-
if any(
|
76
|
+
if any(lang.startswith("script/") for lang in self._tesserocr_languages):
|
79
77
|
self.script_prefix = "script/"
|
80
78
|
else:
|
81
79
|
self.script_prefix = ""
|
@@ -86,6 +84,10 @@ class TesseractOcrModel(BaseOcrModel):
|
|
86
84
|
"oem": tesserocr.OEM.DEFAULT,
|
87
85
|
}
|
88
86
|
|
87
|
+
self.reader = None
|
88
|
+
self.osd_reader = None
|
89
|
+
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
|
90
|
+
|
89
91
|
if self.options.path is not None:
|
90
92
|
tesserocr_kwargs["path"] = self.options.path
|
91
93
|
|
@@ -3,9 +3,10 @@ import logging
|
|
3
3
|
import time
|
4
4
|
import traceback
|
5
5
|
from abc import ABC, abstractmethod
|
6
|
-
from
|
6
|
+
from collections.abc import Iterable
|
7
|
+
from typing import Any, Callable, List
|
7
8
|
|
8
|
-
from docling_core.types.doc import
|
9
|
+
from docling_core.types.doc import NodeItem
|
9
10
|
|
10
11
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
11
12
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
@@ -64,7 +65,6 @@ class BasePipeline(ABC):
|
|
64
65
|
return conv_res
|
65
66
|
|
66
67
|
def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
|
67
|
-
|
68
68
|
def _prepare_elements(
|
69
69
|
conv_res: ConversionResult, model: GenericEnrichmentModel[Any]
|
70
70
|
) -> Iterable[NodeItem]:
|
@@ -113,7 +113,6 @@ class BasePipeline(ABC):
|
|
113
113
|
|
114
114
|
|
115
115
|
class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
116
|
-
|
117
116
|
def __init__(self, pipeline_options: PipelineOptions):
|
118
117
|
super().__init__(pipeline_options)
|
119
118
|
self.keep_backend = False
|
@@ -127,7 +126,6 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
|
127
126
|
yield from page_batch
|
128
127
|
|
129
128
|
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
|
130
|
-
|
131
129
|
if not isinstance(conv_res.input._backend, PdfDocumentBackend):
|
132
130
|
raise RuntimeError(
|
133
131
|
f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. "
|
@@ -139,8 +137,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
|
139
137
|
|
140
138
|
total_elapsed_time = 0.0
|
141
139
|
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
|
142
|
-
|
143
|
-
for i in range(0, conv_res.input.page_count):
|
140
|
+
for i in range(conv_res.input.page_count):
|
144
141
|
start_page, end_page = conv_res.input.limits.page_range
|
145
142
|
if (start_page - 1) <= i <= (end_page - 1):
|
146
143
|
conv_res.pages.append(Page(page_no=i))
|
@@ -161,7 +158,6 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
|
161
158
|
pipeline_pages = self._apply_on_pages(conv_res, init_pages)
|
162
159
|
|
163
160
|
for p in pipeline_pages: # Must exhaust!
|
164
|
-
|
165
161
|
# Cleanup cached images
|
166
162
|
if not self.keep_images:
|
167
163
|
p._image_cache = {}
|
@@ -24,7 +24,6 @@ class SimplePipeline(BasePipeline):
|
|
24
24
|
super().__init__(pipeline_options)
|
25
25
|
|
26
26
|
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
|
27
|
-
|
28
27
|
if not isinstance(conv_res.input._backend, DeclarativeDocumentBackend):
|
29
28
|
raise RuntimeError(
|
30
29
|
f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a declarative backend. "
|
@@ -1,8 +1,7 @@
|
|
1
1
|
import logging
|
2
|
-
import sys
|
3
2
|
import warnings
|
4
3
|
from pathlib import Path
|
5
|
-
from typing import Optional
|
4
|
+
from typing import Optional, cast
|
6
5
|
|
7
6
|
from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
|
8
7
|
|
@@ -226,7 +225,11 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|
226
225
|
and self.pipeline_options.generate_table_images
|
227
226
|
):
|
228
227
|
page_ix = element.prov[0].page_no - 1
|
229
|
-
page =
|
228
|
+
page = next(
|
229
|
+
(p for p in conv_res.pages if p.page_no == page_ix),
|
230
|
+
cast("Page", None),
|
231
|
+
)
|
232
|
+
assert page is not None
|
230
233
|
assert page.size is not None
|
231
234
|
assert page.image is not None
|
232
235
|
|
docling/pipeline/vlm_pipeline.py
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
import logging
|
2
|
-
import warnings
|
3
2
|
from io import BytesIO
|
4
3
|
from pathlib import Path
|
5
4
|
from typing import List, Optional, Union, cast
|
@@ -15,11 +14,14 @@ from docling.backend.pdf_backend import PdfDocumentBackend
|
|
15
14
|
from docling.datamodel.base_models import InputFormat, Page
|
16
15
|
from docling.datamodel.document import ConversionResult, InputDocument
|
17
16
|
from docling.datamodel.pipeline_options import (
|
17
|
+
ApiVlmOptions,
|
18
|
+
HuggingFaceVlmOptions,
|
18
19
|
InferenceFramework,
|
19
20
|
ResponseFormat,
|
20
21
|
VlmPipelineOptions,
|
21
22
|
)
|
22
23
|
from docling.datamodel.settings import settings
|
24
|
+
from docling.models.api_vlm_model import ApiVlmModel
|
23
25
|
from docling.models.hf_mlx_model import HuggingFaceMlxModel
|
24
26
|
from docling.models.hf_vlm_model import HuggingFaceVlmModel
|
25
27
|
from docling.pipeline.base_pipeline import PaginatedPipeline
|
@@ -29,7 +31,6 @@ _log = logging.getLogger(__name__)
|
|
29
31
|
|
30
32
|
|
31
33
|
class VlmPipeline(PaginatedPipeline):
|
32
|
-
|
33
34
|
def __init__(self, pipeline_options: VlmPipelineOptions):
|
34
35
|
super().__init__(pipeline_options)
|
35
36
|
self.keep_backend = True
|
@@ -57,27 +58,34 @@ class VlmPipeline(PaginatedPipeline):
|
|
57
58
|
|
58
59
|
self.keep_images = self.pipeline_options.generate_page_images
|
59
60
|
|
60
|
-
if (
|
61
|
-
self.pipeline_options.vlm_options.inference_framework
|
62
|
-
== InferenceFramework.MLX
|
63
|
-
):
|
61
|
+
if isinstance(pipeline_options.vlm_options, ApiVlmOptions):
|
64
62
|
self.build_pipe = [
|
65
|
-
|
63
|
+
ApiVlmModel(
|
66
64
|
enabled=True, # must be always enabled for this pipeline to make sense.
|
67
|
-
|
68
|
-
|
69
|
-
vlm_options=self.pipeline_options.vlm_options,
|
70
|
-
),
|
71
|
-
]
|
72
|
-
else:
|
73
|
-
self.build_pipe = [
|
74
|
-
HuggingFaceVlmModel(
|
75
|
-
enabled=True, # must be always enabled for this pipeline to make sense.
|
76
|
-
artifacts_path=artifacts_path,
|
77
|
-
accelerator_options=pipeline_options.accelerator_options,
|
78
|
-
vlm_options=self.pipeline_options.vlm_options,
|
65
|
+
enable_remote_services=self.pipeline_options.enable_remote_services,
|
66
|
+
vlm_options=cast(ApiVlmOptions, self.pipeline_options.vlm_options),
|
79
67
|
),
|
80
68
|
]
|
69
|
+
elif isinstance(self.pipeline_options.vlm_options, HuggingFaceVlmOptions):
|
70
|
+
vlm_options = cast(HuggingFaceVlmOptions, self.pipeline_options.vlm_options)
|
71
|
+
if vlm_options.inference_framework == InferenceFramework.MLX:
|
72
|
+
self.build_pipe = [
|
73
|
+
HuggingFaceMlxModel(
|
74
|
+
enabled=True, # must be always enabled for this pipeline to make sense.
|
75
|
+
artifacts_path=artifacts_path,
|
76
|
+
accelerator_options=pipeline_options.accelerator_options,
|
77
|
+
vlm_options=vlm_options,
|
78
|
+
),
|
79
|
+
]
|
80
|
+
else:
|
81
|
+
self.build_pipe = [
|
82
|
+
HuggingFaceVlmModel(
|
83
|
+
enabled=True, # must be always enabled for this pipeline to make sense.
|
84
|
+
artifacts_path=artifacts_path,
|
85
|
+
accelerator_options=pipeline_options.accelerator_options,
|
86
|
+
vlm_options=vlm_options,
|
87
|
+
),
|
88
|
+
]
|
81
89
|
|
82
90
|
self.enrichment_pipe = [
|
83
91
|
# Other models working on `NodeItem` elements in the DoclingDocument
|
@@ -104,7 +112,6 @@ class VlmPipeline(PaginatedPipeline):
|
|
104
112
|
|
105
113
|
def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
|
106
114
|
with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
|
107
|
-
|
108
115
|
if (
|
109
116
|
self.pipeline_options.vlm_options.response_format
|
110
117
|
== ResponseFormat.DOCTAGS
|
@@ -0,0 +1,61 @@
|
|
1
|
+
import base64
|
2
|
+
import logging
|
3
|
+
from io import BytesIO
|
4
|
+
from typing import Dict, Optional
|
5
|
+
|
6
|
+
import requests
|
7
|
+
from PIL import Image
|
8
|
+
from pydantic import AnyUrl
|
9
|
+
|
10
|
+
from docling.datamodel.base_models import OpenAiApiResponse
|
11
|
+
|
12
|
+
_log = logging.getLogger(__name__)
|
13
|
+
|
14
|
+
|
15
|
+
def api_image_request(
|
16
|
+
image: Image.Image,
|
17
|
+
prompt: str,
|
18
|
+
url: AnyUrl,
|
19
|
+
timeout: float = 20,
|
20
|
+
headers: Optional[Dict[str, str]] = None,
|
21
|
+
**params,
|
22
|
+
) -> str:
|
23
|
+
img_io = BytesIO()
|
24
|
+
image.save(img_io, "PNG")
|
25
|
+
image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
|
26
|
+
messages = [
|
27
|
+
{
|
28
|
+
"role": "user",
|
29
|
+
"content": [
|
30
|
+
{
|
31
|
+
"type": "image_url",
|
32
|
+
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
|
33
|
+
},
|
34
|
+
{
|
35
|
+
"type": "text",
|
36
|
+
"text": prompt,
|
37
|
+
},
|
38
|
+
],
|
39
|
+
}
|
40
|
+
]
|
41
|
+
|
42
|
+
payload = {
|
43
|
+
"messages": messages,
|
44
|
+
**params,
|
45
|
+
}
|
46
|
+
|
47
|
+
headers = headers or {}
|
48
|
+
|
49
|
+
r = requests.post(
|
50
|
+
str(url),
|
51
|
+
headers=headers,
|
52
|
+
json=payload,
|
53
|
+
timeout=timeout,
|
54
|
+
)
|
55
|
+
if not r.ok:
|
56
|
+
_log.error(f"Error calling the API. Response was {r.text}")
|
57
|
+
r.raise_for_status()
|
58
|
+
|
59
|
+
api_resp = OpenAiApiResponse.model_validate_json(r.text)
|
60
|
+
generated_text = api_resp.choices[0].message.content.strip()
|
61
|
+
return generated_text
|
docling/utils/export.py
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
import logging
|
2
|
-
from
|
2
|
+
from collections.abc import Iterable
|
3
|
+
from typing import Any, Dict, List, Tuple, Union
|
3
4
|
|
4
5
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
5
|
-
from docling_core.types.doc.page import TextCell
|
6
6
|
from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
|
7
7
|
|
8
8
|
from docling.datamodel.document import ConversionResult, Page
|
@@ -13,7 +13,6 @@ _log = logging.getLogger(__name__)
|
|
13
13
|
def generate_multimodal_pages(
|
14
14
|
doc_result: ConversionResult,
|
15
15
|
) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:
|
16
|
-
|
17
16
|
label_to_doclaynet = {
|
18
17
|
"title": "title",
|
19
18
|
"table-of-contents": "document_index",
|
@@ -122,7 +121,6 @@ def generate_multimodal_pages(
|
|
122
121
|
if doc.main_text is None:
|
123
122
|
return
|
124
123
|
for ix, orig_item in enumerate(doc.main_text):
|
125
|
-
|
126
124
|
item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item
|
127
125
|
if item is None or item.prov is None or len(item.prov) == 0:
|
128
126
|
_log.debug(f"Skipping item {orig_item}")
|
docling/utils/glm_utils.py
CHANGED
@@ -29,7 +29,7 @@ def resolve_item(paths, obj):
|
|
29
29
|
|
30
30
|
try:
|
31
31
|
key = int(paths[0])
|
32
|
-
except:
|
32
|
+
except Exception:
|
33
33
|
key = paths[0]
|
34
34
|
|
35
35
|
if len(paths) == 1:
|
@@ -67,7 +67,7 @@ def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]:
|
|
67
67
|
return unique_objects
|
68
68
|
|
69
69
|
|
70
|
-
def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
|
70
|
+
def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument: # noqa: C901
|
71
71
|
origin = DocumentOrigin(
|
72
72
|
mimetype="application/pdf",
|
73
73
|
filename=doc_glm["file-info"]["filename"],
|
@@ -18,7 +18,7 @@ class UnionFind:
|
|
18
18
|
|
19
19
|
def __init__(self, elements):
|
20
20
|
self.parent = {elem: elem for elem in elements}
|
21
|
-
self.rank =
|
21
|
+
self.rank = dict.fromkeys(elements, 0)
|
22
22
|
|
23
23
|
def find(self, x):
|
24
24
|
if self.parent[x] != x:
|
@@ -484,7 +484,9 @@ class LayoutPostprocessor:
|
|
484
484
|
spatial_index = (
|
485
485
|
self.regular_index
|
486
486
|
if cluster_type == "regular"
|
487
|
-
else self.picture_index
|
487
|
+
else self.picture_index
|
488
|
+
if cluster_type == "picture"
|
489
|
+
else self.wrapper_index
|
488
490
|
)
|
489
491
|
|
490
492
|
# Map of currently valid clusters
|
@@ -37,7 +37,7 @@ def download_models(
|
|
37
37
|
output_dir.mkdir(exist_ok=True, parents=True)
|
38
38
|
|
39
39
|
if with_layout:
|
40
|
-
_log.info(
|
40
|
+
_log.info("Downloading layout model...")
|
41
41
|
LayoutModel.download_models(
|
42
42
|
local_dir=output_dir / LayoutModel._model_repo_folder,
|
43
43
|
force=force,
|
@@ -45,7 +45,7 @@ def download_models(
|
|
45
45
|
)
|
46
46
|
|
47
47
|
if with_tableformer:
|
48
|
-
_log.info(
|
48
|
+
_log.info("Downloading tableformer model...")
|
49
49
|
TableStructureModel.download_models(
|
50
50
|
local_dir=output_dir / TableStructureModel._model_repo_folder,
|
51
51
|
force=force,
|
@@ -53,7 +53,7 @@ def download_models(
|
|
53
53
|
)
|
54
54
|
|
55
55
|
if with_picture_classifier:
|
56
|
-
_log.info(
|
56
|
+
_log.info("Downloading picture classifier model...")
|
57
57
|
DocumentPictureClassifier.download_models(
|
58
58
|
local_dir=output_dir / DocumentPictureClassifier._model_repo_folder,
|
59
59
|
force=force,
|
@@ -61,7 +61,7 @@ def download_models(
|
|
61
61
|
)
|
62
62
|
|
63
63
|
if with_code_formula:
|
64
|
-
_log.info(
|
64
|
+
_log.info("Downloading code formula model...")
|
65
65
|
CodeFormulaModel.download_models(
|
66
66
|
local_dir=output_dir / CodeFormulaModel._model_repo_folder,
|
67
67
|
force=force,
|
@@ -69,7 +69,7 @@ def download_models(
|
|
69
69
|
)
|
70
70
|
|
71
71
|
if with_smolvlm:
|
72
|
-
_log.info(
|
72
|
+
_log.info("Downloading SmolVlm model...")
|
73
73
|
PictureDescriptionVlmModel.download_models(
|
74
74
|
repo_id=smolvlm_picture_description.repo_id,
|
75
75
|
local_dir=output_dir / smolvlm_picture_description.repo_cache_folder,
|
@@ -78,7 +78,7 @@ def download_models(
|
|
78
78
|
)
|
79
79
|
|
80
80
|
if with_granite_vision:
|
81
|
-
_log.info(
|
81
|
+
_log.info("Downloading Granite Vision model...")
|
82
82
|
PictureDescriptionVlmModel.download_models(
|
83
83
|
repo_id=granite_picture_description.repo_id,
|
84
84
|
local_dir=output_dir / granite_picture_description.repo_cache_folder,
|
@@ -87,7 +87,7 @@ def download_models(
|
|
87
87
|
)
|
88
88
|
|
89
89
|
if with_easyocr:
|
90
|
-
_log.info(
|
90
|
+
_log.info("Downloading easyocr models...")
|
91
91
|
EasyOcrModel.download_models(
|
92
92
|
local_dir=output_dir / EasyOcrModel._model_repo_folder,
|
93
93
|
force=force,
|
docling/utils/utils.py
CHANGED
@@ -13,7 +13,7 @@ def chunkify(iterator, chunk_size):
|
|
13
13
|
if isinstance(iterator, List):
|
14
14
|
iterator = iter(iterator)
|
15
15
|
for first in iterator: # Take the first element from the iterator
|
16
|
-
yield [first
|
16
|
+
yield [first, *list(islice(iterator, chunk_size - 1))]
|
17
17
|
|
18
18
|
|
19
19
|
def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str:
|