docling 2.30.0__py3-none-any.whl → 2.31.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/asciidoc_backend.py +7 -15
- docling/backend/csv_backend.py +1 -1
- docling/backend/docling_parse_backend.py +2 -2
- docling/backend/docling_parse_v2_backend.py +2 -2
- docling/backend/docling_parse_v4_backend.py +3 -4
- docling/backend/docx/latex/latex_dict.py +0 -5
- docling/backend/docx/latex/omml.py +4 -7
- docling/backend/html_backend.py +66 -25
- docling/backend/md_backend.py +6 -8
- docling/backend/msexcel_backend.py +1 -7
- docling/backend/mspowerpoint_backend.py +4 -7
- docling/backend/msword_backend.py +5 -5
- docling/backend/pdf_backend.py +2 -1
- docling/backend/pypdfium2_backend.py +3 -3
- docling/backend/xml/jats_backend.py +11 -14
- docling/backend/xml/uspto_backend.py +19 -23
- docling/cli/main.py +8 -8
- docling/cli/models.py +6 -3
- docling/datamodel/base_models.py +7 -5
- docling/datamodel/document.py +19 -10
- docling/datamodel/pipeline_options.py +0 -1
- docling/document_converter.py +8 -6
- docling/models/api_vlm_model.py +1 -2
- docling/models/base_model.py +2 -4
- docling/models/base_ocr_model.py +2 -2
- docling/models/code_formula_model.py +2 -1
- docling/models/document_picture_classifier.py +2 -1
- docling/models/easyocr_model.py +10 -11
- docling/models/factories/__init__.py +2 -2
- docling/models/factories/base_factory.py +1 -1
- docling/models/hf_mlx_model.py +4 -6
- docling/models/hf_vlm_model.py +7 -5
- docling/models/layout_model.py +2 -2
- docling/models/ocr_mac_model.py +3 -4
- docling/models/page_assemble_model.py +7 -12
- docling/models/page_preprocessing_model.py +2 -1
- docling/models/picture_description_api_model.py +2 -1
- docling/models/picture_description_base_model.py +2 -3
- docling/models/picture_description_vlm_model.py +6 -4
- docling/models/rapid_ocr_model.py +2 -3
- docling/models/readingorder_model.py +9 -24
- docling/models/table_structure_model.py +4 -8
- docling/models/tesseract_ocr_cli_model.py +17 -16
- docling/models/tesseract_ocr_model.py +9 -5
- docling/pipeline/base_pipeline.py +4 -8
- docling/pipeline/simple_pipeline.py +0 -1
- docling/pipeline/standard_pdf_pipeline.py +0 -1
- docling/pipeline/vlm_pipeline.py +0 -3
- docling/utils/export.py +2 -4
- docling/utils/glm_utils.py +2 -2
- docling/utils/layout_postprocessor.py +4 -2
- docling/utils/model_downloader.py +31 -7
- docling/utils/utils.py +3 -3
- {docling-2.30.0.dist-info → docling-2.31.1.dist-info}/METADATA +2 -1
- docling-2.31.1.dist-info/RECORD +86 -0
- docling-2.30.0.dist-info/RECORD +0 -86
- {docling-2.30.0.dist-info → docling-2.31.1.dist-info}/LICENSE +0 -0
- {docling-2.30.0.dist-info → docling-2.31.1.dist-info}/WHEEL +0 -0
- {docling-2.30.0.dist-info → docling-2.31.1.dist-info}/entry_points.txt +0 -0
@@ -3,9 +3,10 @@ import io
|
|
3
3
|
import logging
|
4
4
|
import os
|
5
5
|
import tempfile
|
6
|
+
from collections.abc import Iterable
|
6
7
|
from pathlib import Path
|
7
8
|
from subprocess import DEVNULL, PIPE, Popen
|
8
|
-
from typing import
|
9
|
+
from typing import List, Optional, Tuple, Type
|
9
10
|
|
10
11
|
import pandas as pd
|
11
12
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
@@ -63,8 +64,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
63
64
|
)
|
64
65
|
|
65
66
|
def _get_name_and_version(self) -> Tuple[str, str]:
|
66
|
-
|
67
|
-
if self._name != None and self._version != None:
|
67
|
+
if self._name is not None and self._version is not None:
|
68
68
|
return self._name, self._version # type: ignore
|
69
69
|
|
70
70
|
cmd = [self.options.tesseract_cmd, "--version"]
|
@@ -125,14 +125,16 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
125
125
|
# _log.info(decoded_data)
|
126
126
|
|
127
127
|
# Read the TSV file generated by Tesseract
|
128
|
-
|
128
|
+
df_result = pd.read_csv(
|
129
|
+
io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t"
|
130
|
+
)
|
129
131
|
|
130
132
|
# Display the dataframe (optional)
|
131
133
|
# _log.info("df: ", df.head())
|
132
134
|
|
133
135
|
# Filter rows that contain actual text (ignore header or empty rows)
|
134
|
-
df_filtered =
|
135
|
-
|
136
|
+
df_filtered = df_result[
|
137
|
+
df_result["text"].notna() & (df_result["text"].apply(str).str.strip() != "")
|
136
138
|
]
|
137
139
|
|
138
140
|
return df_filtered
|
@@ -149,10 +151,10 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
149
151
|
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
|
150
152
|
output, _ = proc.communicate()
|
151
153
|
decoded_data = output.decode("utf-8")
|
152
|
-
|
154
|
+
df_detected = pd.read_csv(
|
153
155
|
io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
|
154
156
|
)
|
155
|
-
scripts =
|
157
|
+
scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
|
156
158
|
if len(scripts) == 0:
|
157
159
|
_log.warning("Tesseract cannot detect the script of the page")
|
158
160
|
return None
|
@@ -183,11 +185,11 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
183
185
|
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
|
184
186
|
output, _ = proc.communicate()
|
185
187
|
decoded_data = output.decode("utf-8")
|
186
|
-
|
187
|
-
self._tesseract_languages =
|
188
|
+
df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
|
189
|
+
self._tesseract_languages = df_list[0].tolist()[1:]
|
188
190
|
|
189
191
|
# Decide the script prefix
|
190
|
-
if any(
|
192
|
+
if any(lang.startswith("script/") for lang in self._tesseract_languages):
|
191
193
|
script_prefix = "script/"
|
192
194
|
else:
|
193
195
|
script_prefix = ""
|
@@ -197,7 +199,6 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
197
199
|
def __call__(
|
198
200
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
199
201
|
) -> Iterable[Page]:
|
200
|
-
|
201
202
|
if not self.enabled:
|
202
203
|
yield from page_batch
|
203
204
|
return
|
@@ -225,19 +226,19 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|
225
226
|
fname = image_file.name
|
226
227
|
high_res_image.save(image_file)
|
227
228
|
|
228
|
-
|
229
|
+
df_result = self._run_tesseract(fname)
|
229
230
|
finally:
|
230
231
|
if os.path.exists(fname):
|
231
232
|
os.remove(fname)
|
232
233
|
|
233
|
-
# _log.info(
|
234
|
+
# _log.info(df_result)
|
234
235
|
|
235
236
|
# Print relevant columns (bounding box and text)
|
236
|
-
for ix, row in
|
237
|
+
for ix, row in df_result.iterrows():
|
237
238
|
text = row["text"]
|
238
239
|
conf = row["conf"]
|
239
240
|
|
240
|
-
l = float(row["left"])
|
241
|
+
l = float(row["left"]) # noqa: E741
|
241
242
|
b = float(row["top"])
|
242
243
|
w = float(row["width"])
|
243
244
|
h = float(row["height"])
|
@@ -1,6 +1,9 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
1
3
|
import logging
|
4
|
+
from collections.abc import Iterable
|
2
5
|
from pathlib import Path
|
3
|
-
from typing import
|
6
|
+
from typing import Optional, Type
|
4
7
|
|
5
8
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
6
9
|
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
@@ -38,7 +41,6 @@ class TesseractOcrModel(BaseOcrModel):
|
|
38
41
|
|
39
42
|
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
40
43
|
self.reader = None
|
41
|
-
self.osd_reader = None
|
42
44
|
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
|
43
45
|
|
44
46
|
if self.enabled:
|
@@ -64,7 +66,7 @@ class TesseractOcrModel(BaseOcrModel):
|
|
64
66
|
raise ImportError(install_errmsg)
|
65
67
|
try:
|
66
68
|
tesseract_version = tesserocr.tesseract_version()
|
67
|
-
except:
|
69
|
+
except Exception:
|
68
70
|
raise ImportError(install_errmsg)
|
69
71
|
|
70
72
|
_, self._tesserocr_languages = tesserocr.get_languages()
|
@@ -75,7 +77,7 @@ class TesseractOcrModel(BaseOcrModel):
|
|
75
77
|
_log.debug("Initializing TesserOCR: %s", tesseract_version)
|
76
78
|
lang = "+".join(self.options.lang)
|
77
79
|
|
78
|
-
if any(
|
80
|
+
if any(lang.startswith("script/") for lang in self._tesserocr_languages):
|
79
81
|
self.script_prefix = "script/"
|
80
82
|
else:
|
81
83
|
self.script_prefix = ""
|
@@ -86,6 +88,8 @@ class TesseractOcrModel(BaseOcrModel):
|
|
86
88
|
"oem": tesserocr.OEM.DEFAULT,
|
87
89
|
}
|
88
90
|
|
91
|
+
self.osd_reader = None
|
92
|
+
|
89
93
|
if self.options.path is not None:
|
90
94
|
tesserocr_kwargs["path"] = self.options.path
|
91
95
|
|
@@ -149,7 +153,7 @@ class TesseractOcrModel(BaseOcrModel):
|
|
149
153
|
script = map_tesseract_script(script)
|
150
154
|
lang = f"{self.script_prefix}{script}"
|
151
155
|
|
152
|
-
# Check if the detected
|
156
|
+
# Check if the detected language is present in the system
|
153
157
|
if lang not in self._tesserocr_languages:
|
154
158
|
msg = f"Tesseract detected the script '{script}' and language '{lang}'."
|
155
159
|
msg += " However this language is not installed in your system and will be ignored."
|
@@ -3,9 +3,10 @@ import logging
|
|
3
3
|
import time
|
4
4
|
import traceback
|
5
5
|
from abc import ABC, abstractmethod
|
6
|
-
from
|
6
|
+
from collections.abc import Iterable
|
7
|
+
from typing import Any, Callable, List
|
7
8
|
|
8
|
-
from docling_core.types.doc import
|
9
|
+
from docling_core.types.doc import NodeItem
|
9
10
|
|
10
11
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
11
12
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
@@ -64,7 +65,6 @@ class BasePipeline(ABC):
|
|
64
65
|
return conv_res
|
65
66
|
|
66
67
|
def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
|
67
|
-
|
68
68
|
def _prepare_elements(
|
69
69
|
conv_res: ConversionResult, model: GenericEnrichmentModel[Any]
|
70
70
|
) -> Iterable[NodeItem]:
|
@@ -113,7 +113,6 @@ class BasePipeline(ABC):
|
|
113
113
|
|
114
114
|
|
115
115
|
class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
116
|
-
|
117
116
|
def __init__(self, pipeline_options: PipelineOptions):
|
118
117
|
super().__init__(pipeline_options)
|
119
118
|
self.keep_backend = False
|
@@ -127,7 +126,6 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
|
127
126
|
yield from page_batch
|
128
127
|
|
129
128
|
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
|
130
|
-
|
131
129
|
if not isinstance(conv_res.input._backend, PdfDocumentBackend):
|
132
130
|
raise RuntimeError(
|
133
131
|
f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. "
|
@@ -139,8 +137,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
|
139
137
|
|
140
138
|
total_elapsed_time = 0.0
|
141
139
|
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
|
142
|
-
|
143
|
-
for i in range(0, conv_res.input.page_count):
|
140
|
+
for i in range(conv_res.input.page_count):
|
144
141
|
start_page, end_page = conv_res.input.limits.page_range
|
145
142
|
if (start_page - 1) <= i <= (end_page - 1):
|
146
143
|
conv_res.pages.append(Page(page_no=i))
|
@@ -161,7 +158,6 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
|
161
158
|
pipeline_pages = self._apply_on_pages(conv_res, init_pages)
|
162
159
|
|
163
160
|
for p in pipeline_pages: # Must exhaust!
|
164
|
-
|
165
161
|
# Cleanup cached images
|
166
162
|
if not self.keep_images:
|
167
163
|
p._image_cache = {}
|
@@ -24,7 +24,6 @@ class SimplePipeline(BasePipeline):
|
|
24
24
|
super().__init__(pipeline_options)
|
25
25
|
|
26
26
|
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
|
27
|
-
|
28
27
|
if not isinstance(conv_res.input._backend, DeclarativeDocumentBackend):
|
29
28
|
raise RuntimeError(
|
30
29
|
f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a declarative backend. "
|
docling/pipeline/vlm_pipeline.py
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
import logging
|
2
|
-
import warnings
|
3
2
|
from io import BytesIO
|
4
3
|
from pathlib import Path
|
5
4
|
from typing import List, Optional, Union, cast
|
@@ -32,7 +31,6 @@ _log = logging.getLogger(__name__)
|
|
32
31
|
|
33
32
|
|
34
33
|
class VlmPipeline(PaginatedPipeline):
|
35
|
-
|
36
34
|
def __init__(self, pipeline_options: VlmPipelineOptions):
|
37
35
|
super().__init__(pipeline_options)
|
38
36
|
self.keep_backend = True
|
@@ -114,7 +112,6 @@ class VlmPipeline(PaginatedPipeline):
|
|
114
112
|
|
115
113
|
def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
|
116
114
|
with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
|
117
|
-
|
118
115
|
if (
|
119
116
|
self.pipeline_options.vlm_options.response_format
|
120
117
|
== ResponseFormat.DOCTAGS
|
docling/utils/export.py
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
import logging
|
2
|
-
from
|
2
|
+
from collections.abc import Iterable
|
3
|
+
from typing import Any, Dict, List, Tuple, Union
|
3
4
|
|
4
5
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
5
|
-
from docling_core.types.doc.page import TextCell
|
6
6
|
from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
|
7
7
|
|
8
8
|
from docling.datamodel.document import ConversionResult, Page
|
@@ -13,7 +13,6 @@ _log = logging.getLogger(__name__)
|
|
13
13
|
def generate_multimodal_pages(
|
14
14
|
doc_result: ConversionResult,
|
15
15
|
) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:
|
16
|
-
|
17
16
|
label_to_doclaynet = {
|
18
17
|
"title": "title",
|
19
18
|
"table-of-contents": "document_index",
|
@@ -122,7 +121,6 @@ def generate_multimodal_pages(
|
|
122
121
|
if doc.main_text is None:
|
123
122
|
return
|
124
123
|
for ix, orig_item in enumerate(doc.main_text):
|
125
|
-
|
126
124
|
item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item
|
127
125
|
if item is None or item.prov is None or len(item.prov) == 0:
|
128
126
|
_log.debug(f"Skipping item {orig_item}")
|
docling/utils/glm_utils.py
CHANGED
@@ -29,7 +29,7 @@ def resolve_item(paths, obj):
|
|
29
29
|
|
30
30
|
try:
|
31
31
|
key = int(paths[0])
|
32
|
-
except:
|
32
|
+
except Exception:
|
33
33
|
key = paths[0]
|
34
34
|
|
35
35
|
if len(paths) == 1:
|
@@ -67,7 +67,7 @@ def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]:
|
|
67
67
|
return unique_objects
|
68
68
|
|
69
69
|
|
70
|
-
def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
|
70
|
+
def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument: # noqa: C901
|
71
71
|
origin = DocumentOrigin(
|
72
72
|
mimetype="application/pdf",
|
73
73
|
filename=doc_glm["file-info"]["filename"],
|
@@ -18,7 +18,7 @@ class UnionFind:
|
|
18
18
|
|
19
19
|
def __init__(self, elements):
|
20
20
|
self.parent = {elem: elem for elem in elements}
|
21
|
-
self.rank =
|
21
|
+
self.rank = dict.fromkeys(elements, 0)
|
22
22
|
|
23
23
|
def find(self, x):
|
24
24
|
if self.parent[x] != x:
|
@@ -484,7 +484,9 @@ class LayoutPostprocessor:
|
|
484
484
|
spatial_index = (
|
485
485
|
self.regular_index
|
486
486
|
if cluster_type == "regular"
|
487
|
-
else self.picture_index
|
487
|
+
else self.picture_index
|
488
|
+
if cluster_type == "picture"
|
489
|
+
else self.wrapper_index
|
488
490
|
)
|
489
491
|
|
490
492
|
# Map of currently valid clusters
|
@@ -4,12 +4,15 @@ from typing import Optional
|
|
4
4
|
|
5
5
|
from docling.datamodel.pipeline_options import (
|
6
6
|
granite_picture_description,
|
7
|
+
smoldocling_vlm_conversion_options,
|
8
|
+
smoldocling_vlm_mlx_conversion_options,
|
7
9
|
smolvlm_picture_description,
|
8
10
|
)
|
9
11
|
from docling.datamodel.settings import settings
|
10
12
|
from docling.models.code_formula_model import CodeFormulaModel
|
11
13
|
from docling.models.document_picture_classifier import DocumentPictureClassifier
|
12
14
|
from docling.models.easyocr_model import EasyOcrModel
|
15
|
+
from docling.models.hf_vlm_model import HuggingFaceVlmModel
|
13
16
|
from docling.models.layout_model import LayoutModel
|
14
17
|
from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
|
15
18
|
from docling.models.table_structure_model import TableStructureModel
|
@@ -27,6 +30,8 @@ def download_models(
|
|
27
30
|
with_code_formula: bool = True,
|
28
31
|
with_picture_classifier: bool = True,
|
29
32
|
with_smolvlm: bool = False,
|
33
|
+
with_smoldocling: bool = False,
|
34
|
+
with_smoldocling_mlx: bool = False,
|
30
35
|
with_granite_vision: bool = False,
|
31
36
|
with_easyocr: bool = True,
|
32
37
|
):
|
@@ -37,7 +42,7 @@ def download_models(
|
|
37
42
|
output_dir.mkdir(exist_ok=True, parents=True)
|
38
43
|
|
39
44
|
if with_layout:
|
40
|
-
_log.info(
|
45
|
+
_log.info("Downloading layout model...")
|
41
46
|
LayoutModel.download_models(
|
42
47
|
local_dir=output_dir / LayoutModel._model_repo_folder,
|
43
48
|
force=force,
|
@@ -45,7 +50,7 @@ def download_models(
|
|
45
50
|
)
|
46
51
|
|
47
52
|
if with_tableformer:
|
48
|
-
_log.info(
|
53
|
+
_log.info("Downloading tableformer model...")
|
49
54
|
TableStructureModel.download_models(
|
50
55
|
local_dir=output_dir / TableStructureModel._model_repo_folder,
|
51
56
|
force=force,
|
@@ -53,7 +58,7 @@ def download_models(
|
|
53
58
|
)
|
54
59
|
|
55
60
|
if with_picture_classifier:
|
56
|
-
_log.info(
|
61
|
+
_log.info("Downloading picture classifier model...")
|
57
62
|
DocumentPictureClassifier.download_models(
|
58
63
|
local_dir=output_dir / DocumentPictureClassifier._model_repo_folder,
|
59
64
|
force=force,
|
@@ -61,7 +66,7 @@ def download_models(
|
|
61
66
|
)
|
62
67
|
|
63
68
|
if with_code_formula:
|
64
|
-
_log.info(
|
69
|
+
_log.info("Downloading code formula model...")
|
65
70
|
CodeFormulaModel.download_models(
|
66
71
|
local_dir=output_dir / CodeFormulaModel._model_repo_folder,
|
67
72
|
force=force,
|
@@ -69,7 +74,7 @@ def download_models(
|
|
69
74
|
)
|
70
75
|
|
71
76
|
if with_smolvlm:
|
72
|
-
_log.info(
|
77
|
+
_log.info("Downloading SmolVlm model...")
|
73
78
|
PictureDescriptionVlmModel.download_models(
|
74
79
|
repo_id=smolvlm_picture_description.repo_id,
|
75
80
|
local_dir=output_dir / smolvlm_picture_description.repo_cache_folder,
|
@@ -77,8 +82,27 @@ def download_models(
|
|
77
82
|
progress=progress,
|
78
83
|
)
|
79
84
|
|
85
|
+
if with_smoldocling:
|
86
|
+
_log.info("Downloading SmolDocling model...")
|
87
|
+
HuggingFaceVlmModel.download_models(
|
88
|
+
repo_id=smoldocling_vlm_conversion_options.repo_id,
|
89
|
+
local_dir=output_dir / smoldocling_vlm_conversion_options.repo_cache_folder,
|
90
|
+
force=force,
|
91
|
+
progress=progress,
|
92
|
+
)
|
93
|
+
|
94
|
+
if with_smoldocling_mlx:
|
95
|
+
_log.info("Downloading SmolDocling MLX model...")
|
96
|
+
HuggingFaceVlmModel.download_models(
|
97
|
+
repo_id=smoldocling_vlm_mlx_conversion_options.repo_id,
|
98
|
+
local_dir=output_dir
|
99
|
+
/ smoldocling_vlm_mlx_conversion_options.repo_cache_folder,
|
100
|
+
force=force,
|
101
|
+
progress=progress,
|
102
|
+
)
|
103
|
+
|
80
104
|
if with_granite_vision:
|
81
|
-
_log.info(
|
105
|
+
_log.info("Downloading Granite Vision model...")
|
82
106
|
PictureDescriptionVlmModel.download_models(
|
83
107
|
repo_id=granite_picture_description.repo_id,
|
84
108
|
local_dir=output_dir / granite_picture_description.repo_cache_folder,
|
@@ -87,7 +111,7 @@ def download_models(
|
|
87
111
|
)
|
88
112
|
|
89
113
|
if with_easyocr:
|
90
|
-
_log.info(
|
114
|
+
_log.info("Downloading easyocr models...")
|
91
115
|
EasyOcrModel.download_models(
|
92
116
|
local_dir=output_dir / EasyOcrModel._model_repo_folder,
|
93
117
|
force=force,
|
docling/utils/utils.py
CHANGED
@@ -13,14 +13,14 @@ def chunkify(iterator, chunk_size):
|
|
13
13
|
if isinstance(iterator, List):
|
14
14
|
iterator = iter(iterator)
|
15
15
|
for first in iterator: # Take the first element from the iterator
|
16
|
-
yield [first
|
16
|
+
yield [first, *list(islice(iterator, chunk_size - 1))]
|
17
17
|
|
18
18
|
|
19
19
|
def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str:
|
20
20
|
"""Create a stable page_hash of the path_or_stream of a file"""
|
21
21
|
|
22
22
|
block_size = 65536
|
23
|
-
hasher = hashlib.sha256()
|
23
|
+
hasher = hashlib.sha256(usedforsecurity=False)
|
24
24
|
|
25
25
|
def _hash_buf(binary_stream):
|
26
26
|
buf = binary_stream.read(block_size) # read and page_hash in chunks
|
@@ -38,7 +38,7 @@ def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str:
|
|
38
38
|
|
39
39
|
|
40
40
|
def create_hash(string: str):
|
41
|
-
hasher = hashlib.sha256()
|
41
|
+
hasher = hashlib.sha256(usedforsecurity=False)
|
42
42
|
hasher.update(string.encode("utf-8"))
|
43
43
|
|
44
44
|
return hasher.hexdigest()
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: docling
|
3
|
-
Version: 2.
|
3
|
+
Version: 2.31.1
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Home-page: https://github.com/docling-project/docling
|
6
6
|
License: MIT
|
@@ -86,6 +86,7 @@ Description-Content-Type: text/markdown
|
|
86
86
|
[](https://opensource.org/licenses/MIT)
|
87
87
|
[](https://pepy.tech/projects/docling)
|
88
88
|
[](https://apify.com/vancura/docling)
|
89
|
+
[](https://www.bestpractices.dev/projects/10101)
|
89
90
|
[](https://lfaidata.foundation/projects/)
|
90
91
|
|
91
92
|
Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem.
|
@@ -0,0 +1,86 @@
|
|
1
|
+
docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
+
docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
+
docling/backend/abstract_backend.py,sha256=1lNxzwDTn303aXduPDVmTyXn-5ZIoWMLYqNxANGWmQQ,1658
|
4
|
+
docling/backend/asciidoc_backend.py,sha256=VZ8Xk1VHGHRqBo_TdtMzRAu1NFaFaJ8dk4CaEcBaEm0,14038
|
5
|
+
docling/backend/csv_backend.py,sha256=2g9famYG2W-ID9jEdZPxc6O8QGv1vWQfjN8pL-QMBE0,4536
|
6
|
+
docling/backend/docling_parse_backend.py,sha256=V_CsUdN5RkGQBBq7A_ReAiUW4CQVh0-1Ur157Ozurdg,8017
|
7
|
+
docling/backend/docling_parse_v2_backend.py,sha256=6fokgqb1hMbZua33gL46EFamrwPTC7ms6ZuEHw-Dv28,9395
|
8
|
+
docling/backend/docling_parse_v4_backend.py,sha256=-WJZs0IsdN6blhkvTS1eh_qhujYLyJ3XcOMqS6AaXxg,6282
|
9
|
+
docling/backend/docx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
|
+
docling/backend/docx/latex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
11
|
+
docling/backend/docx/latex/latex_dict.py,sha256=tFJp4ScT_AkY2ON7nLEa560p601Jq2glcZvMKxxjn7w,6593
|
12
|
+
docling/backend/docx/latex/omml.py,sha256=nEpcfyyrOucJyj6cD7wfThrIa-q0CQCoqMb3dkrhCRg,12094
|
13
|
+
docling/backend/html_backend.py,sha256=3K-l5SUAAyqISNEb7nPst_I51xzYOVOkgmwXh3lv9sw,21063
|
14
|
+
docling/backend/json/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
15
|
+
docling/backend/json/docling_json_backend.py,sha256=LlFMVoZrrCfVwbDuRbNN4Xg96Lujh4xxrTBt9jGhY9I,1984
|
16
|
+
docling/backend/md_backend.py,sha256=JkY1qTvQFXjKSZGfD-83d-fZelorUG_l6mpJdYGqvX8,17210
|
17
|
+
docling/backend/msexcel_backend.py,sha256=3j0WQfqDpgPXdPMCguefdv7arcNVDedPD6gl54cmLn8,18110
|
18
|
+
docling/backend/mspowerpoint_backend.py,sha256=RwqfvvzrtM56L9uf7PR9lvlHJ-LyYGpkS1iVxkTl72Q,17203
|
19
|
+
docling/backend/msword_backend.py,sha256=lVVMNwt0WIl4RD5wAf8pc8bJsb60x1BA8hTTkVmEVa8,32477
|
20
|
+
docling/backend/pdf_backend.py,sha256=KE9TMuFO5WX-o5A_DAd4tEaLi4HMZ4XjKdpllItVkWM,2238
|
21
|
+
docling/backend/pypdfium2_backend.py,sha256=pX8f0WbUb0KTDTKyQuLzP_lgHHubyGXWD33vmpefPy8,10805
|
22
|
+
docling/backend/xml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
23
|
+
docling/backend/xml/jats_backend.py,sha256=ghGi9bHjx3BvaOtmzLw86-wZy4UxpQPOPQL4e73-BI8,24927
|
24
|
+
docling/backend/xml/uspto_backend.py,sha256=nyAMr5ht7dclxkVDwsKNeiOhLQrUtRLS8JdscB2AVJg,70924
|
25
|
+
docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
|
26
|
+
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
27
|
+
docling/cli/main.py,sha256=D7WEY4x6pQCVFRy3peK9KUDOb0Y5IVc-vTDqPnHPK00,26138
|
28
|
+
docling/cli/models.py,sha256=9yLGp6QRJGpR86U3SjmWAXDt3MvBaJLLY4xDVdsu3O8,4160
|
29
|
+
docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
|
30
|
+
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
31
|
+
docling/datamodel/base_models.py,sha256=DRE_XoldtCreWF4ucO0iK0l8uOnfvnhQaYjV0z1Qe0M,7921
|
32
|
+
docling/datamodel/document.py,sha256=_0Z4zUgCB5677ZW8Y7C1fv75enLZJOJUjcUkGTSiTBA,15553
|
33
|
+
docling/datamodel/pipeline_options.py,sha256=-1QG8dY0RZkTJb66lXErEAnPq4F_1vgnk_5AcIr3cgU,13350
|
34
|
+
docling/datamodel/settings.py,sha256=bNMdowIKv7RUchabQTo4rFNEsxfB6pGg2LoZSY634zo,1869
|
35
|
+
docling/document_converter.py,sha256=PRRr65nigQ3LZDl4G2fBMkOtJyswT7xyGt7fpUeDO3w,13849
|
36
|
+
docling/exceptions.py,sha256=K1WnCS1leK2JtMB5ewZWKkb0EaijFgl-tRzrO9ntgPM,134
|
37
|
+
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
38
|
+
docling/models/api_vlm_model.py,sha256=w1SzdG3Ypz_0iZGiX-skMwV1E1JnOHH2BJiNkcEEIAA,2478
|
39
|
+
docling/models/base_model.py,sha256=Zx_nByGYkubTvvYiQxwiB6P8lc7wOD4ZTC2QIw6vCEg,2950
|
40
|
+
docling/models/base_ocr_model.py,sha256=_iD8QCKQdv2VWrIuSRPyGP4oCz94h84WriHg9F2k-Z0,7172
|
41
|
+
docling/models/code_formula_model.py,sha256=9cplJFvP7jcJGz-p-MmL8_lqUhmaXZu7wKyX2aOTujs,11504
|
42
|
+
docling/models/document_picture_classifier.py,sha256=tyOnyM0vh8-pjh9PiHa_67YpK-3pc_vGQKlnfAyraBs,6255
|
43
|
+
docling/models/easyocr_model.py,sha256=3rgXMeB7LbMjevCAVDMG3voe3PQhQ7B-RyYrXzefUlQ,7365
|
44
|
+
docling/models/factories/__init__.py,sha256=x_EM5dDg_A3HBcBYzOoqwmA2AFLtJ1IzYDPX-R1A-Sg,868
|
45
|
+
docling/models/factories/base_factory.py,sha256=MfWIljMETi5aaVR-6qLTelW8u1gwDAQsOwg3fu7O4Qc,4028
|
46
|
+
docling/models/factories/ocr_factory.py,sha256=G5RkmkKvkl-ihpo6qSj8WC77VdlVSQ1s0ekwUX2ILts,316
|
47
|
+
docling/models/factories/picture_description_factory.py,sha256=Ru3-TnVVEKf5O07C_UpGf2HCOHc7j20AJzfficw3agM,385
|
48
|
+
docling/models/hf_mlx_model.py,sha256=B_B4hFU-jU0g_DQtQD8w4Ejorn10mkDuFI93wR_WhGk,4897
|
49
|
+
docling/models/hf_vlm_model.py,sha256=SiPMTLghMUjJ66dA2yN4UujpLO6PiOhLEPInWtXV_5s,6912
|
50
|
+
docling/models/layout_model.py,sha256=0fiJXJ4aPmcMsYY7rbN9LJ2mZ0_8G0ODY9kyNTAN3Ws,7823
|
51
|
+
docling/models/ocr_mac_model.py,sha256=A3TlEbvvwhkWiq9YARos3Y9yNcpPYQ7JGc_4hFtAK-8,5370
|
52
|
+
docling/models/page_assemble_model.py,sha256=GO7JI1D6T6EkSW94cLQobPGNQUahkxQqTPRwj5CnmFE,6304
|
53
|
+
docling/models/page_preprocessing_model.py,sha256=6pOGXiFQ-oz06UmJdcaYMdVyfZ0YVLWS6efGcx7Mxws,3105
|
54
|
+
docling/models/picture_description_api_model.py,sha256=qs3n0smC9DXhzwJeK_iQG08Y6ZFHInKtdGPVhzgvxgU,2091
|
55
|
+
docling/models/picture_description_base_model.py,sha256=FbBVXzAOB87xpJN28tuGCxoAdcf6mZNUOqJR7ljUg5g,2946
|
56
|
+
docling/models/picture_description_vlm_model.py,sha256=DiTjnehVy1n0N04xPUvZl8rx4TiNHzHn9Cnzy_ePGts,4177
|
57
|
+
docling/models/plugins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
58
|
+
docling/models/plugins/defaults.py,sha256=qslXGnRX07Z3GGttNriqaox0v0vXp4zs4KLurHCZjp4,858
|
59
|
+
docling/models/rapid_ocr_model.py,sha256=Tq_1Egu5Hjx7Y69Vox17QTtRXztSyflB1fhN08CWQwY,5894
|
60
|
+
docling/models/readingorder_model.py,sha256=S9ru2ApY9sE-Uue3hptWHmbmElwo36bUbAikxCFpHYs,14574
|
61
|
+
docling/models/table_structure_model.py,sha256=1gxLaooK0IKMrnmS8nT1BItKqt1GAKghfpmLKb3i53g,12566
|
62
|
+
docling/models/tesseract_ocr_cli_model.py,sha256=iFdOud5ymoW9WV8bWLCDpd3LJBo9M5bTT5vc635zEDY,10229
|
63
|
+
docling/models/tesseract_ocr_model.py,sha256=72009TJL_7tXTEnhlsGRiw_KibrQ0LjZlCBtW8NtwUc,9339
|
64
|
+
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
65
|
+
docling/pipeline/base_pipeline.py,sha256=DnuxAf7EQusdSRae0QUVth-0f2mSff8JZjX-2vazk00,8751
|
66
|
+
docling/pipeline/simple_pipeline.py,sha256=TXZOwR7hZRji462ZTIpte0VJjzbxvNVE8dbLFANDhSU,2253
|
67
|
+
docling/pipeline/standard_pdf_pipeline.py,sha256=iNZMMGiHTwV6I4u_jjqXhVJ_DiPn_O9qnnee3PQxidc,10773
|
68
|
+
docling/pipeline/vlm_pipeline.py,sha256=g3bxPEqxK8x-B5S6pOpNNo5GxCMCRDZgPJUFqsBA1eg,9720
|
69
|
+
docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
70
|
+
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
71
|
+
docling/utils/accelerator_utils.py,sha256=ONNRrC8fH-8E93WUCNhfOq1t7WrQ1T7-YsmExTOY5f0,2292
|
72
|
+
docling/utils/api_image_request.py,sha256=_CgdzmPqdsyXmyYUFGLZcXcoH586qC6A1p5vsNbj1Q0,1416
|
73
|
+
docling/utils/export.py,sha256=VwVUnYDk3mhGmISDbVm306fwpGNnoojouStBD4UajXI,4673
|
74
|
+
docling/utils/glm_utils.py,sha256=TKOWQqWAHsX_w4fvoAA7_2xCi_urhnp1DsmjY8_sk5w,12274
|
75
|
+
docling/utils/layout_postprocessor.py,sha256=x7exVG3HYzV9M_O78FfyoG43Y2L7PPMMydvSNwjqh8s,24528
|
76
|
+
docling/utils/locks.py,sha256=RzqQtD5UispgV71pGN_nU6GYfeN11BN0Sh_Dq9ycqGo,52
|
77
|
+
docling/utils/model_downloader.py,sha256=ocvud3G3qlBQhzMo69Q3RJMnvq5HPZ2DwNbMuEp8RCs,4142
|
78
|
+
docling/utils/ocr_utils.py,sha256=F7iOOjqolUcImUzir4qjDQd4QWSO3s6JC4WRn3U7uY4,263
|
79
|
+
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
80
|
+
docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
|
81
|
+
docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
|
82
|
+
docling-2.31.1.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
83
|
+
docling-2.31.1.dist-info/METADATA,sha256=31fTxA8TvMdw_KdThEyn3Z5GAHAhNEtvFYlrPdzqV4w,10108
|
84
|
+
docling-2.31.1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
85
|
+
docling-2.31.1.dist-info/entry_points.txt,sha256=pIxel-UeVo1S7FhoNG5xgEfPjLZfBLi_N9TsGPtJSLo,144
|
86
|
+
docling-2.31.1.dist-info/RECORD,,
|
docling-2.30.0.dist-info/RECORD
DELETED
@@ -1,86 +0,0 @@
|
|
1
|
-
docling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
docling/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
3
|
-
docling/backend/abstract_backend.py,sha256=1lNxzwDTn303aXduPDVmTyXn-5ZIoWMLYqNxANGWmQQ,1658
|
4
|
-
docling/backend/asciidoc_backend.py,sha256=xBtmYkRkPICIfMbB8AFIw_or4IZGB17mP_LhXorvZ1k,14060
|
5
|
-
docling/backend/csv_backend.py,sha256=lCNSkgB55IbAig7w4IyXRkX23aM3Nojj6GdXNoaNjY4,4536
|
6
|
-
docling/backend/docling_parse_backend.py,sha256=tcy4cPD_dtGD37CjivbFvwzwXVcrb3HVmofyasxLum8,7991
|
7
|
-
docling/backend/docling_parse_v2_backend.py,sha256=70kXqYhht-A8zb9z5emMe_1i0l9dyQGrM8lg1cmAvqc,9369
|
8
|
-
docling/backend/docling_parse_v4_backend.py,sha256=IECMJQWEvYqQv043_1Ho6dLkCbuaK8cMUsqcxwqruXo,6287
|
9
|
-
docling/backend/docx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
10
|
-
docling/backend/docx/latex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
11
|
-
docling/backend/docx/latex/latex_dict.py,sha256=5pOMY_KyxYmgBZ40IrA4q0t5L6JvXOCx5cVwoQE1lls,6690
|
12
|
-
docling/backend/docx/latex/omml.py,sha256=5zuXYOQ10e9nSTKFURBjoU-XSQZVHsVyIiCsGYGVAk8,12127
|
13
|
-
docling/backend/html_backend.py,sha256=ghPLZfdBEPBzLIO9IWzzx0t1Os9B9r4VyGyEZtMsZVI,19468
|
14
|
-
docling/backend/json/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
15
|
-
docling/backend/json/docling_json_backend.py,sha256=LlFMVoZrrCfVwbDuRbNN4Xg96Lujh4xxrTBt9jGhY9I,1984
|
16
|
-
docling/backend/md_backend.py,sha256=lqDiKIBHGsA0u-H1n9oVpPlrcpVT4gYRuNXXcyGlftM,17219
|
17
|
-
docling/backend/msexcel_backend.py,sha256=KRPoHRDv-mqko9RUHGQCzdRrvDo7g7zSU2Z5zoL_Hzo,18106
|
18
|
-
docling/backend/mspowerpoint_backend.py,sha256=X55-1anXm562wxAuYn5uwQkqKjirmgrn1KfbeaKUbXw,17273
|
19
|
-
docling/backend/msword_backend.py,sha256=CgNPjU8SQ7rkAYH_BGiUyv568MGhoH3R0M39WBT8gkc,32468
|
20
|
-
docling/backend/pdf_backend.py,sha256=odWb1rxk3WCUIEJMhq-dYFNUQ1pSDuNHbU9wlTZIRAs,2211
|
21
|
-
docling/backend/pypdfium2_backend.py,sha256=wRwhA5XHRqL7vyNhCAHM6P-ONkwtyjKG9LgC4NJ-4i8,10784
|
22
|
-
docling/backend/xml/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
23
|
-
docling/backend/xml/jats_backend.py,sha256=HXailrDjiwu4swwFnXy3lNfRtLZmkBBp4yqafCvdr7s,24945
|
24
|
-
docling/backend/xml/uspto_backend.py,sha256=H0jwIt2skOke_yEUk0wfXCtodrB-hrj2ygLtB3jMWaI,71056
|
25
|
-
docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
|
26
|
-
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
27
|
-
docling/cli/main.py,sha256=TD-cEf4giuk1O5NPoB-heXHHteUqKoLsj4Rg4xsBUrs,26119
|
28
|
-
docling/cli/models.py,sha256=tM_qbMM3YOPxFU7JlME96MLbtd1CX_bOAK7FS-NhJvY,3979
|
29
|
-
docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
|
30
|
-
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
31
|
-
docling/datamodel/base_models.py,sha256=fJfFMaHXc-CUrAVfhPF8lKrdb-gaXr2tohx6dHldvRU,7926
|
32
|
-
docling/datamodel/document.py,sha256=V0iK1MYOkPIzd4eQa-G8unp-t01fktlG9wwQ1IwE6Zg,15109
|
33
|
-
docling/datamodel/pipeline_options.py,sha256=iGLijZR-YOtmg0RQs59pqoG_1uGsDYbg5wMDD0FWYx4,13351
|
34
|
-
docling/datamodel/settings.py,sha256=bNMdowIKv7RUchabQTo4rFNEsxfB6pGg2LoZSY634zo,1869
|
35
|
-
docling/document_converter.py,sha256=LCX92FzgmXNJLFVSQfjqH9SGe3zA7FGwARedSigFIpY,13798
|
36
|
-
docling/exceptions.py,sha256=K1WnCS1leK2JtMB5ewZWKkb0EaijFgl-tRzrO9ntgPM,134
|
37
|
-
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
38
|
-
docling/models/api_vlm_model.py,sha256=6SxMsFPf0SbT365P67KspdpF3TXZSeu5kmPE3lXAhW4,2470
|
39
|
-
docling/models/base_model.py,sha256=9xJ0VIlpR2BzqoEWMC8LYp5Y96QAEKip4b_HCwCDltY,2931
|
40
|
-
docling/models/base_ocr_model.py,sha256=xvKMhE4ZOGkL2GAhpDvrAHLLFps3ZUfxXZ5ctL1lXUw,7226
|
41
|
-
docling/models/code_formula_model.py,sha256=mOu5luYMzyrCCr8MRGOciNcSvULpQysDd_FXn96WPc8,11477
|
42
|
-
docling/models/document_picture_classifier.py,sha256=fz77RsTdlnA_yC47O-KUq2xVWMKX0_9jm_EGcHliw-E,6235
|
43
|
-
docling/models/easyocr_model.py,sha256=ezq3yv5lORe7T1bbSoTZALck2oHqyEHq57cRfhMYCCQ,7401
|
44
|
-
docling/models/factories/__init__.py,sha256=e4lFmRfmW5hWqvJjY5xaVFbvCQhDBCrVeSq85Q2K_aM,872
|
45
|
-
docling/models/factories/base_factory.py,sha256=pNR9-B_BKs2sYNyHnp2ON2l3r6Dy9lcof4qmwHlAryI,4032
|
46
|
-
docling/models/factories/ocr_factory.py,sha256=G5RkmkKvkl-ihpo6qSj8WC77VdlVSQ1s0ekwUX2ILts,316
|
47
|
-
docling/models/factories/picture_description_factory.py,sha256=Ru3-TnVVEKf5O07C_UpGf2HCOHc7j20AJzfficw3agM,385
|
48
|
-
docling/models/hf_mlx_model.py,sha256=2eSHphJm5LAfiSA24blVMc2znJlKMYrtmmzq8ffc-rU,4924
|
49
|
-
docling/models/hf_vlm_model.py,sha256=NUtLEuG-kNGJeDHWmQKAAOZG4WF0a5hn-KXUUM1mHBQ,6820
|
50
|
-
docling/models/layout_model.py,sha256=7fQWipGV1HDrvbP4uOKa9QAicQl89jp7lailQmbFL3w,7804
|
51
|
-
docling/models/ocr_mac_model.py,sha256=2pZaUWg19go_u88mKWr5y_52PAYEN__GsbyUYLdY4zo,5353
|
52
|
-
docling/models/page_assemble_model.py,sha256=ivkCdbZJpFcGl7CazLegcP1tLK8ZixDfVhQXqsdW_UA,6359
|
53
|
-
docling/models/page_preprocessing_model.py,sha256=Ja7RE1K-2fWxWrxOzNm6QDSGqFf-MY6_uY5OAZ7AQSo,3078
|
54
|
-
docling/models/picture_description_api_model.py,sha256=DowWOU93MXAjj3N1A9ex88Sa3Nic2c3dfoOYir5jZEA,2064
|
55
|
-
docling/models/picture_description_base_model.py,sha256=khuhQZDAZemZMe4BsrBMpjEwkY3nhMFXuczjQpSQrVY,2971
|
56
|
-
docling/models/picture_description_vlm_model.py,sha256=I2Un3vfhQVeWEyZ3Sd3Kygw9la2QSZCwDfl_7XVlMm4,4042
|
57
|
-
docling/models/plugins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
58
|
-
docling/models/plugins/defaults.py,sha256=qslXGnRX07Z3GGttNriqaox0v0vXp4zs4KLurHCZjp4,858
|
59
|
-
docling/models/rapid_ocr_model.py,sha256=C_I0Ek9mAPIyTFRHuNbqtXg1c15rLNDE1tJ6_hPIi4c,5869
|
60
|
-
docling/models/readingorder_model.py,sha256=hNWbBX3uZv1FxMwKNKn2JFQuQqTspBLsJBVEidXr6Wk,14869
|
61
|
-
docling/models/table_structure_model.py,sha256=pvTsqUa5QIANBUfot0XXG1UUeku-eaUi04EPE-Yh2g0,12597
|
62
|
-
docling/models/tesseract_ocr_cli_model.py,sha256=CZ1W0QbvveIpXO0qSXmXFqz71P4PfLfJBQIqU_Wlg_E,10072
|
63
|
-
docling/models/tesseract_ocr_model.py,sha256=UpLAgKgJtBgbKtJELmKBNMcejJJKBCyFK0q-WgZN1Eg,9256
|
64
|
-
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
65
|
-
docling/pipeline/base_pipeline.py,sha256=9ABK-Cr235bxE5vweoIA5rgBZV_EF8qFxAqLI27H_Pg,8749
|
66
|
-
docling/pipeline/simple_pipeline.py,sha256=mZqANqUtAOFAyqQEShErQnAUz6tJFOl6zVoazEDJ_wE,2254
|
67
|
-
docling/pipeline/standard_pdf_pipeline.py,sha256=gPNqUparhIONG4AyMekW9OfZ7t8YMs0odhtbE6Z-Hxw,10784
|
68
|
-
docling/pipeline/vlm_pipeline.py,sha256=dqQYAd3viW577TVSZltnB4P-f-ZUWQh0J8SSFDuQN6Q,9738
|
69
|
-
docling/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
70
|
-
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
71
|
-
docling/utils/accelerator_utils.py,sha256=ONNRrC8fH-8E93WUCNhfOq1t7WrQ1T7-YsmExTOY5f0,2292
|
72
|
-
docling/utils/api_image_request.py,sha256=_CgdzmPqdsyXmyYUFGLZcXcoH586qC6A1p5vsNbj1Q0,1416
|
73
|
-
docling/utils/export.py,sha256=4W-ptI1fLdVrtoqHdHY1RF9Xn2Yescs-hunITqxJ7Is,4697
|
74
|
-
docling/utils/glm_utils.py,sha256=W4JRoP0xQ6SJmhhIoAfcKxm5dr1CFvLHp8pqI1kdhxs,12250
|
75
|
-
docling/utils/layout_postprocessor.py,sha256=Q36DfcIYMuMfC6LzCBIrYtHK7pBE-Xyvjepz660s9UM,24508
|
76
|
-
docling/utils/locks.py,sha256=RzqQtD5UispgV71pGN_nU6GYfeN11BN0Sh_Dq9ycqGo,52
|
77
|
-
docling/utils/model_downloader.py,sha256=sxAQvjiIu9m2Ur5Ot5C5SATmgWJAHi0xSjzxj8QXYJk,3213
|
78
|
-
docling/utils/ocr_utils.py,sha256=F7iOOjqolUcImUzir4qjDQd4QWSO3s6JC4WRn3U7uY4,263
|
79
|
-
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
80
|
-
docling/utils/utils.py,sha256=0ozCk7zUkYzxRVmYoIB2zA1lqjQOuaQzxfGuf1wmKW4,1866
|
81
|
-
docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
|
82
|
-
docling-2.30.0.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
83
|
-
docling-2.30.0.dist-info/METADATA,sha256=HSI154YUnSDJE8BMMjOuu-U3EXQg0ksFuyuyzv7-UdU,9982
|
84
|
-
docling-2.30.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
85
|
-
docling-2.30.0.dist-info/entry_points.txt,sha256=pIxel-UeVo1S7FhoNG5xgEfPjLZfBLi_N9TsGPtJSLo,144
|
86
|
-
docling-2.30.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|