docling 2.44.0__py3-none-any.whl → 2.46.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/docling_parse_v4_backend.py +61 -27
- docling/backend/html_backend.py +356 -80
- docling/backend/mets_gbs_backend.py +399 -0
- docling/backend/pdf_backend.py +3 -3
- docling/cli/main.py +10 -0
- docling/datamodel/base_models.py +3 -0
- docling/datamodel/document.py +26 -0
- docling/datamodel/pipeline_options.py +1 -3
- docling/datamodel/pipeline_options_vlm_model.py +8 -2
- docling/document_converter.py +4 -0
- docling/models/api_vlm_model.py +2 -5
- docling/models/code_formula_model.py +87 -76
- docling/models/tesseract_ocr_cli_model.py +4 -2
- docling/models/vlm_models_inline/hf_transformers_model.py +2 -4
- docling/models/vlm_models_inline/mlx_model.py +2 -4
- docling/pipeline/base_pipeline.py +14 -5
- docling/pipeline/threaded_standard_pdf_pipeline.py +6 -4
- {docling-2.44.0.dist-info → docling-2.46.0.dist-info}/METADATA +2 -2
- {docling-2.44.0.dist-info → docling-2.46.0.dist-info}/RECORD +23 -22
- {docling-2.44.0.dist-info → docling-2.46.0.dist-info}/WHEEL +0 -0
- {docling-2.44.0.dist-info → docling-2.46.0.dist-info}/entry_points.txt +0 -0
- {docling-2.44.0.dist-info → docling-2.46.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.44.0.dist-info → docling-2.46.0.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,4 @@
|
|
1
1
|
import re
|
2
|
-
from collections import Counter
|
3
2
|
from collections.abc import Iterable
|
4
3
|
from pathlib import Path
|
5
4
|
from typing import List, Literal, Optional, Tuple, Union
|
@@ -13,10 +12,11 @@ from docling_core.types.doc import (
|
|
13
12
|
TextItem,
|
14
13
|
)
|
15
14
|
from docling_core.types.doc.labels import CodeLanguageLabel
|
16
|
-
from PIL import Image
|
15
|
+
from PIL import Image
|
17
16
|
from pydantic import BaseModel
|
17
|
+
from transformers import AutoModelForImageTextToText, AutoProcessor
|
18
18
|
|
19
|
-
from docling.datamodel.accelerator_options import AcceleratorOptions
|
19
|
+
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
20
20
|
from docling.datamodel.base_models import ItemAndImageEnrichmentElement
|
21
21
|
from docling.models.base_model import BaseItemAndImageEnrichmentModel
|
22
22
|
from docling.models.utils.hf_model_download import download_hf_model
|
@@ -65,9 +65,9 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
|
|
65
65
|
Processes the given batch of elements and enriches them with predictions.
|
66
66
|
"""
|
67
67
|
|
68
|
-
_model_repo_folder = "ds4sd--
|
68
|
+
_model_repo_folder = "ds4sd--CodeFormulaV2"
|
69
69
|
elements_batch_size = 5
|
70
|
-
images_scale = 1.
|
70
|
+
images_scale = 1.67 # = 120 dpi, aligned with training data resolution
|
71
71
|
expansion_factor = 0.18
|
72
72
|
|
73
73
|
def __init__(
|
@@ -95,10 +95,9 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
|
|
95
95
|
self.options = options
|
96
96
|
|
97
97
|
if self.enabled:
|
98
|
-
device = decide_device(
|
99
|
-
|
100
|
-
|
101
|
-
CodeFormulaPredictor,
|
98
|
+
self.device = decide_device(
|
99
|
+
accelerator_options.device,
|
100
|
+
supported_devices=[AcceleratorDevice.CPU, AcceleratorDevice.CUDA],
|
102
101
|
)
|
103
102
|
|
104
103
|
if artifacts_path is None:
|
@@ -106,11 +105,14 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
|
|
106
105
|
else:
|
107
106
|
artifacts_path = artifacts_path / self._model_repo_folder
|
108
107
|
|
109
|
-
self.
|
110
|
-
artifacts_path
|
111
|
-
|
112
|
-
|
108
|
+
self._processor = AutoProcessor.from_pretrained(
|
109
|
+
artifacts_path,
|
110
|
+
)
|
111
|
+
self._model_max_length = self._processor.tokenizer.model_max_length
|
112
|
+
self._model = AutoModelForImageTextToText.from_pretrained(
|
113
|
+
artifacts_path, device_map=self.device
|
113
114
|
)
|
115
|
+
self._model.eval()
|
114
116
|
|
115
117
|
@staticmethod
|
116
118
|
def download_models(
|
@@ -119,8 +121,8 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
|
|
119
121
|
progress: bool = False,
|
120
122
|
) -> Path:
|
121
123
|
return download_hf_model(
|
122
|
-
repo_id="ds4sd/
|
123
|
-
revision="
|
124
|
+
repo_id="ds4sd/CodeFormulaV2",
|
125
|
+
revision="main",
|
124
126
|
local_dir=local_dir,
|
125
127
|
force=force,
|
126
128
|
progress=progress,
|
@@ -172,7 +174,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
|
|
172
174
|
- The second element is the extracted language if a match is found;
|
173
175
|
otherwise, `None`.
|
174
176
|
"""
|
175
|
-
pattern = r"^<_([^_>]+)_>\s(.*)"
|
177
|
+
pattern = r"^<_([^_>]+)_>\s*(.*)"
|
176
178
|
match = re.match(pattern, input_string, flags=re.DOTALL)
|
177
179
|
if match:
|
178
180
|
language = str(match.group(1)) # the captured programming language
|
@@ -203,81 +205,74 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
|
|
203
205
|
except ValueError:
|
204
206
|
return CodeLanguageLabel.UNKNOWN
|
205
207
|
|
206
|
-
def
|
208
|
+
def _get_prompt(self, label: str) -> str:
|
207
209
|
"""
|
208
|
-
|
210
|
+
Constructs the prompt for the model based on the input label.
|
209
211
|
|
210
212
|
Parameters
|
211
213
|
----------
|
212
|
-
|
213
|
-
|
214
|
+
label : str
|
215
|
+
The type of input, either 'code' or 'formula'.
|
214
216
|
|
215
217
|
Returns
|
216
218
|
-------
|
217
|
-
|
218
|
-
|
219
|
+
str
|
220
|
+
The constructed prompt including necessary tokens and query.
|
221
|
+
|
222
|
+
Raises
|
223
|
+
------
|
224
|
+
NotImplementedError
|
225
|
+
If the label is not 'code' or 'formula'.
|
219
226
|
"""
|
220
|
-
|
221
|
-
|
227
|
+
if label == "code":
|
228
|
+
query = "<code>"
|
229
|
+
elif label == "formula":
|
230
|
+
query = "<formula>"
|
231
|
+
else:
|
232
|
+
raise NotImplementedError("Label must be either code or formula")
|
222
233
|
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
right = img_np[:, -1] # shape (H,)
|
234
|
+
messages = [
|
235
|
+
{
|
236
|
+
"role": "user",
|
237
|
+
"content": [{"type": "image"}, {"type": "text", "text": query}],
|
238
|
+
},
|
239
|
+
]
|
230
240
|
|
231
|
-
|
232
|
-
|
241
|
+
prompt = self._processor.apply_chat_template(
|
242
|
+
messages, add_generation_prompt=True
|
243
|
+
)
|
233
244
|
|
234
|
-
|
235
|
-
freq = Counter(edges.tolist())
|
236
|
-
most_common_value, _ = freq.most_common(1)[0]
|
237
|
-
return int(most_common_value) # single channel color
|
245
|
+
return prompt
|
238
246
|
|
239
|
-
|
240
|
-
# Color image: shape (H, W, C)
|
241
|
-
top = img_np[0, :, :] # shape (W, C)
|
242
|
-
bottom = img_np[-1, :, :] # shape (W, C)
|
243
|
-
left = img_np[:, 0, :] # shape (H, C)
|
244
|
-
right = img_np[:, -1, :] # shape (H, C)
|
245
|
-
|
246
|
-
# Concatenate edges along first axis
|
247
|
-
edges = np.concatenate([top, bottom, left, right], axis=0)
|
248
|
-
|
249
|
-
# Convert each color to a tuple for counting
|
250
|
-
edges_as_tuples = [tuple(pixel) for pixel in edges]
|
251
|
-
freq = Counter(edges_as_tuples)
|
252
|
-
most_common_value, _ = freq.most_common(1)[0]
|
253
|
-
return most_common_value # e.g. (R, G, B) or (R, G, B, A)
|
254
|
-
|
255
|
-
def _pad_with_most_frequent_edge_color(
|
256
|
-
self, img: Union[Image.Image, np.ndarray], padding: Tuple[int, int, int, int]
|
257
|
-
):
|
247
|
+
def _post_process(self, texts: list[str]) -> list[str]:
|
258
248
|
"""
|
259
|
-
|
249
|
+
Processes a list of text strings by truncating at '<end_of_utterance>' and
|
250
|
+
removing a predefined set of unwanted substrings.
|
260
251
|
|
261
252
|
Parameters
|
262
253
|
----------
|
263
|
-
|
264
|
-
|
265
|
-
padding : tuple
|
266
|
-
Padding (left, top, right, bottom) in pixels.
|
254
|
+
texts : list[str]
|
255
|
+
A list of strings to be post-processed.
|
267
256
|
|
268
257
|
Returns
|
269
258
|
-------
|
270
|
-
|
259
|
+
list[str]
|
260
|
+
A list of cleaned strings with specified substrings removed and truncated at
|
261
|
+
'<end_of_utterance>' if present.
|
271
262
|
"""
|
272
|
-
|
273
|
-
pil_img = Image.fromarray(img)
|
274
|
-
else:
|
275
|
-
pil_img = img
|
263
|
+
to_remove = ["</code>", "</formula>", "<loc_0><loc_0><loc_500><loc_500>"]
|
276
264
|
|
277
|
-
|
265
|
+
def clean_text(text: str) -> str:
|
266
|
+
idx = text.find("<end_of_utterance>")
|
267
|
+
if idx != -1:
|
268
|
+
text = text[:idx]
|
278
269
|
|
279
|
-
|
280
|
-
|
270
|
+
for token in to_remove:
|
271
|
+
if token in text:
|
272
|
+
text = text.replace(token, "")
|
273
|
+
return text.lstrip()
|
274
|
+
|
275
|
+
return [clean_text(t) for t in texts]
|
281
276
|
|
282
277
|
def __call__(
|
283
278
|
self,
|
@@ -308,14 +303,30 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
|
|
308
303
|
images: List[Union[Image.Image, np.ndarray]] = []
|
309
304
|
elements: List[TextItem] = []
|
310
305
|
for el in element_batch:
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
306
|
+
elements.append(el.item) # type: ignore[arg-type]
|
307
|
+
labels.append(el.item.label) # type: ignore[attr-defined]
|
308
|
+
images.append(el.image)
|
309
|
+
|
310
|
+
prompts = [self._get_prompt(label) for label in labels]
|
311
|
+
inputs = self._processor(
|
312
|
+
text=prompts,
|
313
|
+
images=images,
|
314
|
+
return_tensors="pt",
|
315
|
+
)
|
316
|
+
inputs = inputs.to(self.device)
|
317
317
|
|
318
|
-
|
318
|
+
gen_kwargs = dict(
|
319
|
+
max_new_tokens=self._model_max_length - inputs.input_ids.shape[1],
|
320
|
+
use_cache=True,
|
321
|
+
do_sample=False,
|
322
|
+
)
|
323
|
+
|
324
|
+
generated_ids = self._model.generate(**inputs, **gen_kwargs)
|
325
|
+
|
326
|
+
outputs = self._processor.batch_decode(
|
327
|
+
generated_ids[:, inputs.input_ids.shape[1] :], skip_special_tokens=False
|
328
|
+
)
|
329
|
+
outputs = self._post_process(outputs)
|
319
330
|
|
320
331
|
for item, output in zip(elements, outputs):
|
321
332
|
if isinstance(item, CodeItem):
|
@@ -320,6 +320,8 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
320
320
|
|
321
321
|
|
322
322
|
def _parse_orientation(df_osd: pd.DataFrame) -> int:
|
323
|
-
|
324
|
-
|
323
|
+
# For strictly optimal performance with invariant dataframe format:
|
324
|
+
mask = df_osd["key"].to_numpy() == "Orientation in degrees"
|
325
|
+
orientation_val = df_osd["value"].to_numpy()[mask][0]
|
326
|
+
orientation = parse_tesseract_orientation(orientation_val.strip())
|
325
327
|
return orientation
|
@@ -135,10 +135,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
|
|
135
135
|
)
|
136
136
|
|
137
137
|
# Define prompt structure
|
138
|
-
|
139
|
-
user_prompt = self.vlm_options.prompt(page.parsed_page)
|
140
|
-
else:
|
141
|
-
user_prompt = self.vlm_options.prompt
|
138
|
+
user_prompt = self.vlm_options.build_prompt(page.parsed_page)
|
142
139
|
prompt = self.formulate_prompt(user_prompt)
|
143
140
|
|
144
141
|
inputs = self.processor(
|
@@ -166,6 +163,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
|
|
166
163
|
_log.debug(
|
167
164
|
f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
|
168
165
|
)
|
166
|
+
generated_texts = self.vlm_options.decode_response(generated_texts)
|
169
167
|
page.predictions.vlm_response = VlmPrediction(
|
170
168
|
text=generated_texts,
|
171
169
|
generation_time=generation_time,
|
@@ -84,10 +84,7 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
|
|
84
84
|
if hi_res_image.mode != "RGB":
|
85
85
|
hi_res_image = hi_res_image.convert("RGB")
|
86
86
|
|
87
|
-
|
88
|
-
user_prompt = self.vlm_options.prompt(page.parsed_page)
|
89
|
-
else:
|
90
|
-
user_prompt = self.vlm_options.prompt
|
87
|
+
user_prompt = self.vlm_options.build_prompt(page.parsed_page)
|
91
88
|
prompt = self.apply_chat_template(
|
92
89
|
self.processor, self.config, user_prompt, num_images=1
|
93
90
|
)
|
@@ -142,6 +139,7 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
|
|
142
139
|
_log.debug(
|
143
140
|
f"{generation_time:.2f} seconds for {len(tokens)} tokens ({len(tokens) / generation_time} tokens/sec)."
|
144
141
|
)
|
142
|
+
page_tags = self.vlm_options.decode_response(page_tags)
|
145
143
|
page.predictions.vlm_response = VlmPrediction(
|
146
144
|
text=page_tags,
|
147
145
|
generation_time=generation_time,
|
@@ -8,7 +8,10 @@ from typing import Any, Callable, List
|
|
8
8
|
|
9
9
|
from docling_core.types.doc import NodeItem
|
10
10
|
|
11
|
-
from docling.backend.abstract_backend import
|
11
|
+
from docling.backend.abstract_backend import (
|
12
|
+
AbstractDocumentBackend,
|
13
|
+
PaginatedDocumentBackend,
|
14
|
+
)
|
12
15
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
13
16
|
from docling.datamodel.base_models import (
|
14
17
|
ConversionStatus,
|
@@ -17,7 +20,7 @@ from docling.datamodel.base_models import (
|
|
17
20
|
Page,
|
18
21
|
)
|
19
22
|
from docling.datamodel.document import ConversionResult, InputDocument
|
20
|
-
from docling.datamodel.pipeline_options import PipelineOptions
|
23
|
+
from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
|
21
24
|
from docling.datamodel.settings import settings
|
22
25
|
from docling.models.base_model import GenericEnrichmentModel
|
23
26
|
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
@@ -126,10 +129,10 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
|
126
129
|
yield from page_batch
|
127
130
|
|
128
131
|
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
|
129
|
-
if not isinstance(conv_res.input._backend,
|
132
|
+
if not isinstance(conv_res.input._backend, PaginatedDocumentBackend):
|
130
133
|
raise RuntimeError(
|
131
|
-
f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a
|
132
|
-
f"Can not convert this with a PDF pipeline. "
|
134
|
+
f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a paginated backend. "
|
135
|
+
f"Can not convert this with a paginated PDF pipeline. "
|
133
136
|
f"Please check your format configuration on DocumentConverter."
|
134
137
|
)
|
135
138
|
# conv_res.status = ConversionStatus.FAILURE
|
@@ -165,6 +168,12 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
|
165
168
|
# Cleanup page backends
|
166
169
|
if not self.keep_backend and p._backend is not None:
|
167
170
|
p._backend.unload()
|
171
|
+
if (
|
172
|
+
isinstance(self.pipeline_options, PdfPipelineOptions)
|
173
|
+
and not self.pipeline_options.generate_parsed_pages
|
174
|
+
):
|
175
|
+
del p.parsed_page
|
176
|
+
p.parsed_page = None
|
168
177
|
|
169
178
|
end_batch_time = time.monotonic()
|
170
179
|
total_elapsed_time += end_batch_time - start_batch_time
|
@@ -565,10 +565,12 @@ class ThreadedStandardPdfPipeline(BasePipeline):
|
|
565
565
|
if not self.keep_images:
|
566
566
|
for p in conv_res.pages:
|
567
567
|
p._image_cache = {}
|
568
|
-
|
569
|
-
|
570
|
-
|
571
|
-
|
568
|
+
for p in conv_res.pages:
|
569
|
+
if not self.keep_backend and p._backend is not None:
|
570
|
+
p._backend.unload()
|
571
|
+
if not self.pipeline_options.generate_parsed_pages:
|
572
|
+
del p.parsed_page
|
573
|
+
p.parsed_page = None
|
572
574
|
|
573
575
|
# ---------------------------------------------------------------- assemble
|
574
576
|
def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.
|
3
|
+
Version: 2.46.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
6
|
License-Expression: MIT
|
@@ -27,7 +27,7 @@ Description-Content-Type: text/markdown
|
|
27
27
|
License-File: LICENSE
|
28
28
|
Requires-Dist: pydantic<3.0.0,>=2.0.0
|
29
29
|
Requires-Dist: docling-core[chunking]<3.0.0,>=2.42.0
|
30
|
-
Requires-Dist: docling-parse<5.0.0,>=4.
|
30
|
+
Requires-Dist: docling-parse<5.0.0,>=4.2.2
|
31
31
|
Requires-Dist: docling-ibm-models<4,>=3.9.0
|
32
32
|
Requires-Dist: filetype<2.0.0,>=1.2.0
|
33
33
|
Requires-Dist: pypdfium2!=4.30.1,<5.0.0,>=4.30.0
|
@@ -1,5 +1,5 @@
|
|
1
1
|
docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
docling/document_converter.py,sha256=
|
2
|
+
docling/document_converter.py,sha256=7lid_uhGNuurYICweaA1jqtSbnhf3hpuUYUNleHh-Ww,15924
|
3
3
|
docling/exceptions.py,sha256=K1WnCS1leK2JtMB5ewZWKkb0EaijFgl-tRzrO9ntgPM,134
|
4
4
|
docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
5
5
|
docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -8,14 +8,15 @@ docling/backend/asciidoc_backend.py,sha256=RDNLrPJHxROiM7-NQdZn3DdvAyiPAndbSWcZo
|
|
8
8
|
docling/backend/csv_backend.py,sha256=2g9famYG2W-ID9jEdZPxc6O8QGv1vWQfjN8pL-QMBE0,4536
|
9
9
|
docling/backend/docling_parse_backend.py,sha256=9rUo1vPxX6QLzGqF-2B2iEYglZg6YQ3Uea00XrLluTg,7918
|
10
10
|
docling/backend/docling_parse_v2_backend.py,sha256=3ckTfke8IICjaImlIzc3TRhG7KDuxDDba0AuCEcjA-M,9500
|
11
|
-
docling/backend/docling_parse_v4_backend.py,sha256=
|
12
|
-
docling/backend/html_backend.py,sha256=
|
11
|
+
docling/backend/docling_parse_v4_backend.py,sha256=MbCMxNGmoW4iuev9tX1Vt4jtIeak2kC9Uac3xQSRxeo,7509
|
12
|
+
docling/backend/html_backend.py,sha256=zJH4wkcyftvoA-ixC4MH-xjwl-TGTN9BvZT7Hhla2mc,34701
|
13
13
|
docling/backend/md_backend.py,sha256=qCI7SD9hnWWGrkG_drpzQv2Z7DVBG4Tsq3hhTsYV790,22562
|
14
|
+
docling/backend/mets_gbs_backend.py,sha256=EA8sY6tbmGiysKGYPPZiNlK-i7Adn8bLTo-7Ym15hTU,12774
|
14
15
|
docling/backend/msexcel_backend.py,sha256=cq8MQ2RSh6pqCiVrldjOerSww7dOPTWmCQoCBI57i6w,18579
|
15
16
|
docling/backend/mspowerpoint_backend.py,sha256=wJgB2JStEPfD7MPpWQlpPN7bffPxaHFUnKD4wj8SLxU,15114
|
16
17
|
docling/backend/msword_backend.py,sha256=DxMgPcq-Ao1vq7X2v8qqWeMs9MryPw_Jw3YRAAdXBtM,44904
|
17
18
|
docling/backend/noop_backend.py,sha256=EOPbD86FzZPX-K_DpNrJh0_lC0bZz--4DpG-OagDNGY,1688
|
18
|
-
docling/backend/pdf_backend.py,sha256=
|
19
|
+
docling/backend/pdf_backend.py,sha256=Wcd1NSrAMjXK8VicTki5p-j-JLofklt07eF0kIG17_0,3361
|
19
20
|
docling/backend/pypdfium2_backend.py,sha256=8dVniLHgiTdJuDbYr66kPp6Ccv5ZDlqDMEbA2xIfS7U,13370
|
20
21
|
docling/backend/docx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
21
22
|
docling/backend/docx/latex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -28,25 +29,25 @@ docling/backend/xml/jats_backend.py,sha256=LPj33EFdi2MRCakkLWrRLlUAc-B-949f8zp5g
|
|
28
29
|
docling/backend/xml/uspto_backend.py,sha256=nyAMr5ht7dclxkVDwsKNeiOhLQrUtRLS8JdscB2AVJg,70924
|
29
30
|
docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
|
30
31
|
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
31
|
-
docling/cli/main.py,sha256
|
32
|
+
docling/cli/main.py,sha256=-W_vdKvSm5gZUZyvRpFH0YMI_1iJrP5sJOZ5_1bLorw,30359
|
32
33
|
docling/cli/models.py,sha256=9yLGp6QRJGpR86U3SjmWAXDt3MvBaJLLY4xDVdsu3O8,4160
|
33
34
|
docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
|
34
35
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
35
36
|
docling/datamodel/accelerator_options.py,sha256=wv6dOFTVAwr9onkE-0pfUqX_fDb6gX53iPPE6o8nKjI,2511
|
36
37
|
docling/datamodel/asr_model_specs.py,sha256=Wg7z3zm_wXIWu122iPVy0RMECsA_JCFHrlFF-xxHoVQ,2187
|
37
|
-
docling/datamodel/base_models.py,sha256=
|
38
|
-
docling/datamodel/document.py,sha256=
|
38
|
+
docling/datamodel/base_models.py,sha256=Ifd8PPHs4sW7ScwSqpa-y3rwgPbde_iw13Y2NUCPfU8,11944
|
39
|
+
docling/datamodel/document.py,sha256=zsxFYXvo6GtwGNogSDoBB1TFvkm7IOrP_VnqXNqBhJs,17329
|
39
40
|
docling/datamodel/layout_model_specs.py,sha256=GSkJ-Z_0PVgwWGi7C7TsxbzRjlrWS9ZrHJjHumv-Z5U,2339
|
40
|
-
docling/datamodel/pipeline_options.py,sha256=
|
41
|
+
docling/datamodel/pipeline_options.py,sha256=vOLpuVF-d4nmr-L16EmmhGFn25SDsgExCfX5kPiyISg,10470
|
41
42
|
docling/datamodel/pipeline_options_asr_model.py,sha256=7X068xl-qpbyPxC7-TwX7Q6tLyZXGT5h1osZ_xLNLM0,1454
|
42
|
-
docling/datamodel/pipeline_options_vlm_model.py,sha256=
|
43
|
+
docling/datamodel/pipeline_options_vlm_model.py,sha256=eH-Cj_8aic9FdX4xGlBcf5_R9e152JAL2LhtY8d0rhw,2498
|
43
44
|
docling/datamodel/settings.py,sha256=c0MTw6pO5be_BKxHKYl4SaBJAw_qL-aapxp-g5HHj1A,2084
|
44
45
|
docling/datamodel/vlm_model_specs.py,sha256=--jZexGeu-s_lWp7y_WwWEf6CD1J4XqADrS1-OY_pWM,4737
|
45
46
|
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
46
|
-
docling/models/api_vlm_model.py,sha256
|
47
|
+
docling/models/api_vlm_model.py,sha256=-zisU32pgDRbychyG6-neB0qweNbPaYnLXwiGT7SEdI,2859
|
47
48
|
docling/models/base_model.py,sha256=NNjIapqCruAEAWR-CCdsNgXc2QkwiPYAcaQ_ZYe1W28,2978
|
48
49
|
docling/models/base_ocr_model.py,sha256=kT8TylASOpPlY60rIG6VL6_eLVsfg5KvEVnZHzDWtR0,8193
|
49
|
-
docling/models/code_formula_model.py,sha256=
|
50
|
+
docling/models/code_formula_model.py,sha256=XRugm4EwifLRc-TrAk-glKlktJP-nAPneKh2EOovkJU,11308
|
50
51
|
docling/models/document_picture_classifier.py,sha256=9JvoWeH5uQBC7levjM8zptk7UT-b8EQnD-2EnxTjTT4,6202
|
51
52
|
docling/models/easyocr_model.py,sha256=ECPBd-48cCw5s935NsPJO_C_1QuK_yAUGloMM77WqIM,7387
|
52
53
|
docling/models/layout_model.py,sha256=Nfbo6keMB4vVjGoZdFMqD9CmZcWh-0bE3LkRjJTDJQ0,9146
|
@@ -59,7 +60,7 @@ docling/models/picture_description_vlm_model.py,sha256=yfyAFOy8RjxQJrafPMSAMrrpa
|
|
59
60
|
docling/models/rapid_ocr_model.py,sha256=AMdc66s_iWO4p6nQ0LNjQMUYVxrDSxMyLNPpjPYt6N8,5916
|
60
61
|
docling/models/readingorder_model.py,sha256=bZoXHaSwUsa8niSmJrbCuy784ixCeBXT-RQBUfgHJ4A,14925
|
61
62
|
docling/models/table_structure_model.py,sha256=RFXo73f2q4XuKyaSqbxpznh7JVtlLcT0FsOWl9oZbSg,12518
|
62
|
-
docling/models/tesseract_ocr_cli_model.py,sha256=
|
63
|
+
docling/models/tesseract_ocr_cli_model.py,sha256=I3Gn28Y-LD8OfvyCElN9fLiNgpo2sT0uMkVt258253s,12881
|
63
64
|
docling/models/tesseract_ocr_model.py,sha256=GdI5Cjfi87qcehVbM3wdKRvKkl_F9A4bwTUbjXZCJYA,10745
|
64
65
|
docling/models/factories/__init__.py,sha256=x_EM5dDg_A3HBcBYzOoqwmA2AFLtJ1IzYDPX-R1A-Sg,868
|
65
66
|
docling/models/factories/base_factory.py,sha256=MfWIljMETi5aaVR-6qLTelW8u1gwDAQsOwg3fu7O4Qc,4028
|
@@ -70,14 +71,14 @@ docling/models/plugins/defaults.py,sha256=OAHWW2tCcUXSyDMFxV_lXVRjSBJ1n6z-Eb3R8c
|
|
70
71
|
docling/models/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
71
72
|
docling/models/utils/hf_model_download.py,sha256=scBEfsM4yl7xPzqe7UtPvDh9RfQZQnuOhqQKilYBHls,984
|
72
73
|
docling/models/vlm_models_inline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
73
|
-
docling/models/vlm_models_inline/hf_transformers_model.py,sha256=
|
74
|
-
docling/models/vlm_models_inline/mlx_model.py,sha256=
|
74
|
+
docling/models/vlm_models_inline/hf_transformers_model.py,sha256=Rwdr7neDpn5ehtrp6n7G21fcPBK2m9Har_6BFNdyw-Q,8359
|
75
|
+
docling/models/vlm_models_inline/mlx_model.py,sha256=YYYmopsITlX17JVS5KhLlb1IQSEVoSECNx_fXLHNpAc,5880
|
75
76
|
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
76
77
|
docling/pipeline/asr_pipeline.py,sha256=tQkhu9fXdkSuYIL22xzV2YRUlQh-9qktHBbs2qeXhJI,9070
|
77
|
-
docling/pipeline/base_pipeline.py,sha256=
|
78
|
+
docling/pipeline/base_pipeline.py,sha256=VYVYndifTPSD2GWHKjfi4Y76M5qgt1DiygO-jowKsqM,9919
|
78
79
|
docling/pipeline/simple_pipeline.py,sha256=TXZOwR7hZRji462ZTIpte0VJjzbxvNVE8dbLFANDhSU,2253
|
79
80
|
docling/pipeline/standard_pdf_pipeline.py,sha256=yFishq4Cu01BiBGHk3Irr7ogcTQKeSC0QZImQVAhIaY,12740
|
80
|
-
docling/pipeline/threaded_standard_pdf_pipeline.py,sha256=
|
81
|
+
docling/pipeline/threaded_standard_pdf_pipeline.py,sha256=miPIyprtzPFYG94n6PmUgK4Nh7rqACYEGkWrlTbrZAc,26133
|
81
82
|
docling/pipeline/vlm_pipeline.py,sha256=0lj8tbXNpYF8OLBoLqP2BZfFpTHi40RoHVfvO_Nah4Q,15349
|
82
83
|
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
83
84
|
docling/utils/accelerator_utils.py,sha256=DSajLxVx1JEVT0zt5de26llciLNlVfIDfSa2zYCFJzQ,2909
|
@@ -92,9 +93,9 @@ docling/utils/orientation.py,sha256=jTyLxyT31FlOodZoBMlADHNQK2lAWKYVs5z7pXd_6Cg,
|
|
92
93
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
93
94
|
docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
|
94
95
|
docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
|
95
|
-
docling-2.
|
96
|
-
docling-2.
|
97
|
-
docling-2.
|
98
|
-
docling-2.
|
99
|
-
docling-2.
|
100
|
-
docling-2.
|
96
|
+
docling-2.46.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
97
|
+
docling-2.46.0.dist-info/METADATA,sha256=fm7KVaUwGryyuRk7R_AkNSHo1BogY8-ra9gpCWXbnCA,10459
|
98
|
+
docling-2.46.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
99
|
+
docling-2.46.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
|
100
|
+
docling-2.46.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
|
101
|
+
docling-2.46.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|