docling 2.39.0__py3-none-any.whl → 2.40.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/docling_parse_v4_backend.py +14 -4
- docling/backend/msexcel_backend.py +33 -14
- docling/datamodel/pipeline_options.py +8 -0
- docling/models/base_ocr_model.py +6 -2
- docling/models/layout_model.py +10 -3
- docling/models/picture_description_vlm_model.py +16 -11
- docling/models/plugins/defaults.py +9 -9
- docling/models/readingorder_model.py +8 -1
- docling/models/table_structure_model.py +3 -1
- docling/models/tesseract_ocr_model.py +10 -4
- docling/pipeline/standard_pdf_pipeline.py +1 -0
- docling/utils/accelerator_utils.py +2 -2
- docling/utils/layout_postprocessor.py +7 -2
- {docling-2.39.0.dist-info → docling-2.40.0.dist-info}/METADATA +3 -3
- {docling-2.39.0.dist-info → docling-2.40.0.dist-info}/RECORD +19 -19
- {docling-2.39.0.dist-info → docling-2.40.0.dist-info}/WHEEL +0 -0
- {docling-2.39.0.dist-info → docling-2.40.0.dist-info}/entry_points.txt +0 -0
- {docling-2.39.0.dist-info → docling-2.40.0.dist-info}/licenses/LICENSE +0 -0
- {docling-2.39.0.dist-info → docling-2.40.0.dist-info}/top_level.txt +0 -0
@@ -187,7 +187,17 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
|
|
187
187
|
|
188
188
|
def unload(self):
|
189
189
|
super().unload()
|
190
|
-
|
191
|
-
|
192
|
-
self.
|
193
|
-
|
190
|
+
# Unload docling-parse document first
|
191
|
+
if self.dp_doc is not None:
|
192
|
+
self.dp_doc.unload()
|
193
|
+
self.dp_doc = None
|
194
|
+
|
195
|
+
# Then close pypdfium2 document with proper locking
|
196
|
+
if self._pdoc is not None:
|
197
|
+
with pypdfium2_lock:
|
198
|
+
try:
|
199
|
+
self._pdoc.close()
|
200
|
+
except Exception:
|
201
|
+
# Ignore cleanup errors
|
202
|
+
pass
|
203
|
+
self._pdoc = None
|
@@ -337,10 +337,17 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
337
337
|
# Collect the data within the bounds
|
338
338
|
data = []
|
339
339
|
visited_cells: set[tuple[int, int]] = set()
|
340
|
-
for ri in
|
341
|
-
|
342
|
-
|
343
|
-
|
340
|
+
for ri, row in enumerate(
|
341
|
+
sheet.iter_rows(
|
342
|
+
min_row=start_row + 1, # start_row is 0-based but iter_rows is 1-based
|
343
|
+
max_row=max_row + 1,
|
344
|
+
min_col=start_col + 1,
|
345
|
+
max_col=max_col + 1,
|
346
|
+
values_only=False,
|
347
|
+
),
|
348
|
+
start_row,
|
349
|
+
):
|
350
|
+
for rj, cell in enumerate(row, start_col):
|
344
351
|
# Check if the cell belongs to a merged range
|
345
352
|
row_span = 1
|
346
353
|
col_span = 1
|
@@ -397,10 +404,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
397
404
|
"""
|
398
405
|
max_row: int = start_row
|
399
406
|
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
407
|
+
for ri, (cell,) in enumerate(
|
408
|
+
sheet.iter_rows(
|
409
|
+
min_row=start_row + 2,
|
410
|
+
max_row=sheet.max_row,
|
411
|
+
min_col=start_col + 1,
|
412
|
+
max_col=start_col + 1,
|
413
|
+
values_only=False,
|
414
|
+
),
|
415
|
+
start_row + 1,
|
416
|
+
):
|
404
417
|
# Check if the cell is part of a merged range
|
405
418
|
merged_range = next(
|
406
419
|
(mr for mr in sheet.merged_cells.ranges if cell.coordinate in mr),
|
@@ -414,7 +427,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
414
427
|
if merged_range:
|
415
428
|
max_row = max(max_row, merged_range.max_row - 1)
|
416
429
|
else:
|
417
|
-
max_row
|
430
|
+
max_row = ri
|
418
431
|
|
419
432
|
return max_row
|
420
433
|
|
@@ -433,10 +446,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
433
446
|
"""
|
434
447
|
max_col: int = start_col
|
435
448
|
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
449
|
+
for rj, (cell,) in enumerate(
|
450
|
+
sheet.iter_cols(
|
451
|
+
min_row=start_row + 1,
|
452
|
+
max_row=start_row + 1,
|
453
|
+
min_col=start_col + 2,
|
454
|
+
max_col=sheet.max_column,
|
455
|
+
values_only=False,
|
456
|
+
),
|
457
|
+
start_col + 1,
|
458
|
+
):
|
440
459
|
# Check if the cell is part of a merged range
|
441
460
|
merged_range = next(
|
442
461
|
(mr for mr in sheet.merged_cells.ranges if cell.coordinate in mr),
|
@@ -450,7 +469,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
|
450
469
|
if merged_range:
|
451
470
|
max_col = max(max_col, merged_range.max_col - 1)
|
452
471
|
else:
|
453
|
-
max_col
|
472
|
+
max_col = rj
|
454
473
|
|
455
474
|
return max_col
|
456
475
|
|
@@ -1,4 +1,5 @@
|
|
1
1
|
import logging
|
2
|
+
from datetime import datetime
|
2
3
|
from enum import Enum
|
3
4
|
from pathlib import Path
|
4
5
|
from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
|
@@ -265,6 +266,12 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
|
|
265
266
|
)
|
266
267
|
|
267
268
|
|
269
|
+
class LayoutOptions(BaseModel):
|
270
|
+
"""Options for layout processing."""
|
271
|
+
|
272
|
+
create_orphan_clusters: bool = True # Whether to create clusters for orphaned cells
|
273
|
+
|
274
|
+
|
268
275
|
class AsrPipelineOptions(PipelineOptions):
|
269
276
|
asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
|
270
277
|
artifacts_path: Optional[Union[Path, str]] = None
|
@@ -289,6 +296,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
|
|
289
296
|
picture_description_options: PictureDescriptionBaseOptions = (
|
290
297
|
smolvlm_picture_description
|
291
298
|
)
|
299
|
+
layout_options: LayoutOptions = LayoutOptions()
|
292
300
|
|
293
301
|
images_scale: float = 1.0
|
294
302
|
generate_page_images: bool = False
|
docling/models/base_ocr_model.py
CHANGED
@@ -3,14 +3,13 @@ import logging
|
|
3
3
|
from abc import abstractmethod
|
4
4
|
from collections.abc import Iterable
|
5
5
|
from pathlib import Path
|
6
|
-
from typing import List, Optional, Type
|
6
|
+
from typing import TYPE_CHECKING, List, Optional, Type
|
7
7
|
|
8
8
|
import numpy as np
|
9
9
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
10
10
|
from docling_core.types.doc.page import TextCell
|
11
11
|
from PIL import Image, ImageDraw
|
12
12
|
from rtree import index
|
13
|
-
from scipy.ndimage import binary_dilation, find_objects, label
|
14
13
|
|
15
14
|
from docling.datamodel.accelerator_options import AcceleratorOptions
|
16
15
|
from docling.datamodel.base_models import Page
|
@@ -31,11 +30,16 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
|
|
31
30
|
options: OcrOptions,
|
32
31
|
accelerator_options: AcceleratorOptions,
|
33
32
|
):
|
33
|
+
# Make sure any delay/error from import occurs on ocr model init and not first use
|
34
|
+
from scipy.ndimage import binary_dilation, find_objects, label
|
35
|
+
|
34
36
|
self.enabled = enabled
|
35
37
|
self.options = options
|
36
38
|
|
37
39
|
# Computes the optimum amount and coordinates of rectangles to OCR on a given page
|
38
40
|
def get_ocr_rects(self, page: Page) -> List[BoundingBox]:
|
41
|
+
from scipy.ndimage import binary_dilation, find_objects, label
|
42
|
+
|
39
43
|
BITMAP_COVERAGE_TRESHOLD = 0.75
|
40
44
|
assert page.size is not None
|
41
45
|
|
docling/models/layout_model.py
CHANGED
@@ -7,12 +7,12 @@ from typing import Optional
|
|
7
7
|
|
8
8
|
import numpy as np
|
9
9
|
from docling_core.types.doc import DocItemLabel
|
10
|
-
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
11
10
|
from PIL import Image
|
12
11
|
|
13
12
|
from docling.datamodel.accelerator_options import AcceleratorOptions
|
14
13
|
from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
|
15
14
|
from docling.datamodel.document import ConversionResult
|
15
|
+
from docling.datamodel.pipeline_options import LayoutOptions
|
16
16
|
from docling.datamodel.settings import settings
|
17
17
|
from docling.models.base_model import BasePageModel
|
18
18
|
from docling.models.utils.hf_model_download import download_hf_model
|
@@ -49,8 +49,15 @@ class LayoutModel(BasePageModel):
|
|
49
49
|
CONTAINER_LABELS = [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION]
|
50
50
|
|
51
51
|
def __init__(
|
52
|
-
self,
|
52
|
+
self,
|
53
|
+
artifacts_path: Optional[Path],
|
54
|
+
accelerator_options: AcceleratorOptions,
|
55
|
+
options: LayoutOptions,
|
53
56
|
):
|
57
|
+
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
58
|
+
|
59
|
+
self.options = options
|
60
|
+
|
54
61
|
device = decide_device(accelerator_options.device)
|
55
62
|
|
56
63
|
if artifacts_path is None:
|
@@ -176,7 +183,7 @@ class LayoutModel(BasePageModel):
|
|
176
183
|
# Apply postprocessing
|
177
184
|
|
178
185
|
processed_clusters, processed_cells = LayoutPostprocessor(
|
179
|
-
page, clusters
|
186
|
+
page, clusters, self.options
|
180
187
|
).postprocess()
|
181
188
|
# Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
|
182
189
|
|
@@ -1,3 +1,4 @@
|
|
1
|
+
import threading
|
1
2
|
from collections.abc import Iterable
|
2
3
|
from pathlib import Path
|
3
4
|
from typing import Optional, Type, Union
|
@@ -15,6 +16,9 @@ from docling.models.utils.hf_model_download import (
|
|
15
16
|
)
|
16
17
|
from docling.utils.accelerator_utils import decide_device
|
17
18
|
|
19
|
+
# Global lock for model initialization to prevent threading issues
|
20
|
+
_model_init_lock = threading.Lock()
|
21
|
+
|
18
22
|
|
19
23
|
class PictureDescriptionVlmModel(
|
20
24
|
PictureDescriptionBaseModel, HuggingFaceModelDownloadMixin
|
@@ -57,17 +61,18 @@ class PictureDescriptionVlmModel(
|
|
57
61
|
)
|
58
62
|
|
59
63
|
# Initialize processor and model
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
64
|
+
with _model_init_lock:
|
65
|
+
self.processor = AutoProcessor.from_pretrained(artifacts_path)
|
66
|
+
self.model = AutoModelForVision2Seq.from_pretrained(
|
67
|
+
artifacts_path,
|
68
|
+
torch_dtype=torch.bfloat16,
|
69
|
+
_attn_implementation=(
|
70
|
+
"flash_attention_2"
|
71
|
+
if self.device.startswith("cuda")
|
72
|
+
and accelerator_options.cuda_use_flash_attention2
|
73
|
+
else "eager"
|
74
|
+
),
|
75
|
+
).to(self.device)
|
71
76
|
|
72
77
|
self.provenance = f"{self.options.repo_id}"
|
73
78
|
|
@@ -1,13 +1,10 @@
|
|
1
|
-
from docling.models.easyocr_model import EasyOcrModel
|
2
|
-
from docling.models.ocr_mac_model import OcrMacModel
|
3
|
-
from docling.models.picture_description_api_model import PictureDescriptionApiModel
|
4
|
-
from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
|
5
|
-
from docling.models.rapid_ocr_model import RapidOcrModel
|
6
|
-
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
7
|
-
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
8
|
-
|
9
|
-
|
10
1
|
def ocr_engines():
|
2
|
+
from docling.models.easyocr_model import EasyOcrModel
|
3
|
+
from docling.models.ocr_mac_model import OcrMacModel
|
4
|
+
from docling.models.rapid_ocr_model import RapidOcrModel
|
5
|
+
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
6
|
+
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
7
|
+
|
11
8
|
return {
|
12
9
|
"ocr_engines": [
|
13
10
|
EasyOcrModel,
|
@@ -20,6 +17,9 @@ def ocr_engines():
|
|
20
17
|
|
21
18
|
|
22
19
|
def picture_description():
|
20
|
+
from docling.models.picture_description_api_model import PictureDescriptionApiModel
|
21
|
+
from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
|
22
|
+
|
23
23
|
return {
|
24
24
|
"picture_description": [
|
25
25
|
PictureDescriptionVlmModel,
|
@@ -12,6 +12,9 @@ from docling_core.types.doc import (
|
|
12
12
|
TableData,
|
13
13
|
)
|
14
14
|
from docling_core.types.doc.document import ContentLayer
|
15
|
+
from docling_ibm_models.list_item_normalizer.list_marker_processor import (
|
16
|
+
ListItemMarkerProcessor,
|
17
|
+
)
|
15
18
|
from docling_ibm_models.reading_order.reading_order_rb import (
|
16
19
|
PageElement as ReadingOrderPageElement,
|
17
20
|
ReadingOrderPredictor,
|
@@ -40,6 +43,7 @@ class ReadingOrderModel:
|
|
40
43
|
def __init__(self, options: ReadingOrderOptions):
|
41
44
|
self.options = options
|
42
45
|
self.ro_model = ReadingOrderPredictor()
|
46
|
+
self.list_item_processor = ListItemMarkerProcessor()
|
43
47
|
|
44
48
|
def _assembled_to_readingorder_elements(
|
45
49
|
self, conv_res: ConversionResult
|
@@ -92,7 +96,8 @@ class ReadingOrderModel:
|
|
92
96
|
)
|
93
97
|
if c_label == DocItemLabel.LIST_ITEM:
|
94
98
|
# TODO: Infer if this is a numbered or a bullet list item
|
95
|
-
doc.add_list_item(parent=doc_item, text=c_text, prov=c_prov)
|
99
|
+
l_item = doc.add_list_item(parent=doc_item, text=c_text, prov=c_prov)
|
100
|
+
self.list_item_processor.process_list_item(l_item)
|
96
101
|
elif c_label == DocItemLabel.SECTION_HEADER:
|
97
102
|
doc.add_heading(parent=doc_item, text=c_text, prov=c_prov)
|
98
103
|
else:
|
@@ -301,6 +306,8 @@ class ReadingOrderModel:
|
|
301
306
|
new_item = out_doc.add_list_item(
|
302
307
|
text=cap_text, enumerated=False, prov=prov, parent=current_list
|
303
308
|
)
|
309
|
+
self.list_item_processor.process_list_item(new_item)
|
310
|
+
|
304
311
|
elif label == DocItemLabel.SECTION_HEADER:
|
305
312
|
current_list = None
|
306
313
|
|
@@ -10,7 +10,6 @@ from docling_core.types.doc.page import (
|
|
10
10
|
BoundingRectangle,
|
11
11
|
TextCellUnit,
|
12
12
|
)
|
13
|
-
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
|
14
13
|
from PIL import ImageDraw
|
15
14
|
|
16
15
|
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
@@ -70,6 +69,9 @@ class TableStructureModel(BasePageModel):
|
|
70
69
|
|
71
70
|
# Third Party
|
72
71
|
import docling_ibm_models.tableformer.common as c
|
72
|
+
from docling_ibm_models.tableformer.data_management.tf_predictor import (
|
73
|
+
TFPredictor,
|
74
|
+
)
|
73
75
|
|
74
76
|
device = decide_device(accelerator_options.device)
|
75
77
|
|
@@ -144,7 +144,10 @@ class TesseractOcrModel(BaseOcrModel):
|
|
144
144
|
|
145
145
|
local_reader = self.reader
|
146
146
|
self.osd_reader.SetImage(high_res_image)
|
147
|
+
|
148
|
+
doc_orientation = 0
|
147
149
|
osd = self.osd_reader.DetectOrientationScript()
|
150
|
+
|
148
151
|
# No text, or Orientation and Script detection failure
|
149
152
|
if osd is None:
|
150
153
|
_log.error(
|
@@ -158,11 +161,14 @@ class TesseractOcrModel(BaseOcrModel):
|
|
158
161
|
# to OCR in the hope OCR will succeed while OSD failed
|
159
162
|
if self._is_auto:
|
160
163
|
continue
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
-doc_orientation, expand=True
|
164
|
+
else:
|
165
|
+
doc_orientation = parse_tesseract_orientation(
|
166
|
+
osd["orient_deg"]
|
165
167
|
)
|
168
|
+
if doc_orientation != 0:
|
169
|
+
high_res_image = high_res_image.rotate(
|
170
|
+
-doc_orientation, expand=True
|
171
|
+
)
|
166
172
|
if self._is_auto:
|
167
173
|
script = osd["script_name"]
|
168
174
|
script = map_tesseract_script(script)
|
@@ -1,8 +1,6 @@
|
|
1
1
|
import logging
|
2
2
|
from typing import List, Optional
|
3
3
|
|
4
|
-
import torch
|
5
|
-
|
6
4
|
from docling.datamodel.accelerator_options import AcceleratorDevice
|
7
5
|
|
8
6
|
_log = logging.getLogger(__name__)
|
@@ -18,6 +16,8 @@ def decide_device(
|
|
18
16
|
1. AUTO: Check for the best available device on the system.
|
19
17
|
2. User-defined: Check if the device actually exists, otherwise fall-back to CPU
|
20
18
|
"""
|
19
|
+
import torch
|
20
|
+
|
21
21
|
device = "cpu"
|
22
22
|
|
23
23
|
has_cuda = torch.backends.cuda.is_built() and torch.cuda.is_available()
|
@@ -9,6 +9,7 @@ from docling_core.types.doc.page import TextCell
|
|
9
9
|
from rtree import index
|
10
10
|
|
11
11
|
from docling.datamodel.base_models import BoundingBox, Cluster, Page
|
12
|
+
from docling.datamodel.pipeline_options import LayoutOptions
|
12
13
|
|
13
14
|
_log = logging.getLogger(__name__)
|
14
15
|
|
@@ -194,12 +195,16 @@ class LayoutPostprocessor:
|
|
194
195
|
DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
|
195
196
|
}
|
196
197
|
|
197
|
-
def __init__(
|
198
|
+
def __init__(
|
199
|
+
self, page: Page, clusters: List[Cluster], options: LayoutOptions
|
200
|
+
) -> None:
|
198
201
|
"""Initialize processor with page and clusters."""
|
202
|
+
|
199
203
|
self.cells = page.cells
|
200
204
|
self.page = page
|
201
205
|
self.page_size = page.size
|
202
206
|
self.all_clusters = clusters
|
207
|
+
self.options = options
|
203
208
|
self.regular_clusters = [
|
204
209
|
c for c in clusters if c.label not in self.SPECIAL_TYPES
|
205
210
|
]
|
@@ -267,7 +272,7 @@ class LayoutPostprocessor:
|
|
267
272
|
|
268
273
|
# Handle orphaned cells
|
269
274
|
unassigned = self._find_unassigned_cells(clusters)
|
270
|
-
if unassigned:
|
275
|
+
if unassigned and self.options.create_orphan_clusters:
|
271
276
|
next_id = max((c.id for c in self.all_clusters), default=0) + 1
|
272
277
|
orphan_clusters = []
|
273
278
|
for i, cell in enumerate(unassigned):
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.
|
3
|
+
Version: 2.40.0
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
6
|
License-Expression: MIT
|
@@ -27,8 +27,8 @@ Description-Content-Type: text/markdown
|
|
27
27
|
License-File: LICENSE
|
28
28
|
Requires-Dist: pydantic<3.0.0,>=2.0.0
|
29
29
|
Requires-Dist: docling-core[chunking]<3.0.0,>=2.39.0
|
30
|
-
Requires-Dist: docling-ibm-models<4.0.0,>=3.4.4
|
31
30
|
Requires-Dist: docling-parse<5.0.0,>=4.0.0
|
31
|
+
Requires-Dist: docling-ibm-models<4,>=3.6.0
|
32
32
|
Requires-Dist: filetype<2.0.0,>=1.2.0
|
33
33
|
Requires-Dist: pypdfium2<5.0.0,>=4.30.0
|
34
34
|
Requires-Dist: pydantic-settings<3.0.0,>=2.3.0
|
@@ -57,7 +57,7 @@ Requires-Dist: ocrmac<2.0.0,>=1.0.0; sys_platform == "darwin" and extra == "ocrm
|
|
57
57
|
Provides-Extra: vlm
|
58
58
|
Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
|
59
59
|
Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
|
60
|
-
Requires-Dist: mlx-vlm
|
60
|
+
Requires-Dist: mlx-vlm<0.2,>=0.1.22; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
|
61
61
|
Provides-Extra: rapidocr
|
62
62
|
Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
|
63
63
|
Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
|
@@ -8,10 +8,10 @@ docling/backend/asciidoc_backend.py,sha256=RDNLrPJHxROiM7-NQdZn3DdvAyiPAndbSWcZo
|
|
8
8
|
docling/backend/csv_backend.py,sha256=2g9famYG2W-ID9jEdZPxc6O8QGv1vWQfjN8pL-QMBE0,4536
|
9
9
|
docling/backend/docling_parse_backend.py,sha256=9rUo1vPxX6QLzGqF-2B2iEYglZg6YQ3Uea00XrLluTg,7918
|
10
10
|
docling/backend/docling_parse_v2_backend.py,sha256=3ckTfke8IICjaImlIzc3TRhG7KDuxDDba0AuCEcjA-M,9500
|
11
|
-
docling/backend/docling_parse_v4_backend.py,sha256=
|
11
|
+
docling/backend/docling_parse_v4_backend.py,sha256=qR_WRVq9JGtRioWCw6MnLWgbvXbC6Y1yds7Ol1-E6UQ,6550
|
12
12
|
docling/backend/html_backend.py,sha256=Z959dzqYQO2pPE4xgPRxC5MR9j3nFGtiD6_F_osQ2iI,20670
|
13
13
|
docling/backend/md_backend.py,sha256=mfwGj8g2hGC-Q_HREtl_Web65uMVXD-Ie1nRqWTXzF0,21013
|
14
|
-
docling/backend/msexcel_backend.py,sha256=
|
14
|
+
docling/backend/msexcel_backend.py,sha256=cq8MQ2RSh6pqCiVrldjOerSww7dOPTWmCQoCBI57i6w,18579
|
15
15
|
docling/backend/mspowerpoint_backend.py,sha256=wJgB2JStEPfD7MPpWQlpPN7bffPxaHFUnKD4wj8SLxU,15114
|
16
16
|
docling/backend/msword_backend.py,sha256=7mzPCF4bGWZPst5ntoV3aSxH5WUu2nBP-l8lgQT3tdw,44544
|
17
17
|
docling/backend/noop_backend.py,sha256=EOPbD86FzZPX-K_DpNrJh0_lC0bZz--4DpG-OagDNGY,1688
|
@@ -36,7 +36,7 @@ docling/datamodel/accelerator_options.py,sha256=wv6dOFTVAwr9onkE-0pfUqX_fDb6gX53
|
|
36
36
|
docling/datamodel/asr_model_specs.py,sha256=L7ETXsUKVbPsVcPLhEIMxQjd4UzMGZBVsy74CLsZBkU,2181
|
37
37
|
docling/datamodel/base_models.py,sha256=67o1ptOTT8tW7i-g6gM2JKEX_1CDbmKEMQ_B9ZYM2z0,11156
|
38
38
|
docling/datamodel/document.py,sha256=CA_dgt4V_phze5HXpfgfKNBKd1cPC1o3WE_IENX63EM,16252
|
39
|
-
docling/datamodel/pipeline_options.py,sha256=
|
39
|
+
docling/datamodel/pipeline_options.py,sha256=0uX3F5JLUfGgzXH_0SDCeBwxYmHGbFns5OWJjITRI98,9726
|
40
40
|
docling/datamodel/pipeline_options_asr_model.py,sha256=7X068xl-qpbyPxC7-TwX7Q6tLyZXGT5h1osZ_xLNLM0,1454
|
41
41
|
docling/datamodel/pipeline_options_vlm_model.py,sha256=rtDMVtKFZbgQD269w8FvHMXEhdRBrsA4rVYk6A-M-b4,2063
|
42
42
|
docling/datamodel/settings.py,sha256=ajMz7Ao2m0ZGYkfArqTDDbiF89O408mtgeh06PUi0MA,1900
|
@@ -44,28 +44,28 @@ docling/datamodel/vlm_model_specs.py,sha256=--jZexGeu-s_lWp7y_WwWEf6CD1J4XqADrS1
|
|
44
44
|
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
45
45
|
docling/models/api_vlm_model.py,sha256=GDDJGAia4SJjK7JFxsZy5oEU-D8yQo8Kb3NvvPbTvT0,2820
|
46
46
|
docling/models/base_model.py,sha256=NNjIapqCruAEAWR-CCdsNgXc2QkwiPYAcaQ_ZYe1W28,2978
|
47
|
-
docling/models/base_ocr_model.py,sha256=
|
47
|
+
docling/models/base_ocr_model.py,sha256=kT8TylASOpPlY60rIG6VL6_eLVsfg5KvEVnZHzDWtR0,8193
|
48
48
|
docling/models/code_formula_model.py,sha256=5uWh-eI-Ejmv3DujKJoKKgJBuvPLokt7AJ_ybt8VHEw,11373
|
49
49
|
docling/models/document_picture_classifier.py,sha256=fkJLV7pMy3v6iNwOzVb6zdBU1dGtBM1ARHLIRPfoAG4,6124
|
50
50
|
docling/models/easyocr_model.py,sha256=ECPBd-48cCw5s935NsPJO_C_1QuK_yAUGloMM77WqIM,7387
|
51
|
-
docling/models/layout_model.py,sha256=
|
51
|
+
docling/models/layout_model.py,sha256=P31JbnuhA7BHR_lquEmx9K7edK2986oO_YJ9z73A1iA,8625
|
52
52
|
docling/models/ocr_mac_model.py,sha256=y-1DSFDbACHpEwNTfQwzN9ab8r5j5rBFNPtQ48BzsrA,5396
|
53
53
|
docling/models/page_assemble_model.py,sha256=TvN1naez7dUodLxpUUBzpuMCpqZBTf6YSpewxgjzmrg,6323
|
54
54
|
docling/models/page_preprocessing_model.py,sha256=x8MI4mvjizqEqAb5511dtrNRCJSb-lSmwHw0tmHPFiI,5103
|
55
55
|
docling/models/picture_description_api_model.py,sha256=o3EkV5aHW_6WzE_fdj_VRnNCrS_btclO_ZCLAUqrfl0,2377
|
56
56
|
docling/models/picture_description_base_model.py,sha256=kLthLhdlgwhootQ4_xhhcAk6A-vso5-qcsFJ3TcYfO0,2991
|
57
|
-
docling/models/picture_description_vlm_model.py,sha256=
|
57
|
+
docling/models/picture_description_vlm_model.py,sha256=nAUt-eZOX2GvaCiV2BJO7VppxUbP7udVIF4oe_sEYXo,4000
|
58
58
|
docling/models/rapid_ocr_model.py,sha256=AMdc66s_iWO4p6nQ0LNjQMUYVxrDSxMyLNPpjPYt6N8,5916
|
59
|
-
docling/models/readingorder_model.py,sha256=
|
60
|
-
docling/models/table_structure_model.py,sha256=
|
59
|
+
docling/models/readingorder_model.py,sha256=bZoXHaSwUsa8niSmJrbCuy784ixCeBXT-RQBUfgHJ4A,14925
|
60
|
+
docling/models/table_structure_model.py,sha256=RFXo73f2q4XuKyaSqbxpznh7JVtlLcT0FsOWl9oZbSg,12518
|
61
61
|
docling/models/tesseract_ocr_cli_model.py,sha256=qcM3-n7Z_dm1CGBhVUcNr2XT41iXnU32zk4RqKHBl9I,12775
|
62
|
-
docling/models/tesseract_ocr_model.py,sha256=
|
62
|
+
docling/models/tesseract_ocr_model.py,sha256=GdI5Cjfi87qcehVbM3wdKRvKkl_F9A4bwTUbjXZCJYA,10745
|
63
63
|
docling/models/factories/__init__.py,sha256=x_EM5dDg_A3HBcBYzOoqwmA2AFLtJ1IzYDPX-R1A-Sg,868
|
64
64
|
docling/models/factories/base_factory.py,sha256=MfWIljMETi5aaVR-6qLTelW8u1gwDAQsOwg3fu7O4Qc,4028
|
65
65
|
docling/models/factories/ocr_factory.py,sha256=G5RkmkKvkl-ihpo6qSj8WC77VdlVSQ1s0ekwUX2ILts,316
|
66
66
|
docling/models/factories/picture_description_factory.py,sha256=Ru3-TnVVEKf5O07C_UpGf2HCOHc7j20AJzfficw3agM,385
|
67
67
|
docling/models/plugins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
68
|
-
docling/models/plugins/defaults.py,sha256=
|
68
|
+
docling/models/plugins/defaults.py,sha256=OAHWW2tCcUXSyDMFxV_lXVRjSBJ1n6z-Eb3R8cDucU4,886
|
69
69
|
docling/models/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
70
70
|
docling/models/utils/hf_model_download.py,sha256=scBEfsM4yl7xPzqe7UtPvDh9RfQZQnuOhqQKilYBHls,984
|
71
71
|
docling/models/vlm_models_inline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -75,14 +75,14 @@ docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
|
|
75
75
|
docling/pipeline/asr_pipeline.py,sha256=tQkhu9fXdkSuYIL22xzV2YRUlQh-9qktHBbs2qeXhJI,9070
|
76
76
|
docling/pipeline/base_pipeline.py,sha256=14yQrDjsojl4RgbBjKFSEfVBYR_sULZfBI1uDzFLi8Y,9331
|
77
77
|
docling/pipeline/simple_pipeline.py,sha256=TXZOwR7hZRji462ZTIpte0VJjzbxvNVE8dbLFANDhSU,2253
|
78
|
-
docling/pipeline/standard_pdf_pipeline.py,sha256=
|
78
|
+
docling/pipeline/standard_pdf_pipeline.py,sha256=F0AziGycSWXCmnZuER7-QyzPdMlCsEM9_uQw51RKKD0,12716
|
79
79
|
docling/pipeline/vlm_pipeline.py,sha256=IrjDbajCPmUPep_jATKNiABST4tQ8mvpkQz9mtBQ8qQ,15279
|
80
80
|
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
81
|
-
docling/utils/accelerator_utils.py,sha256=
|
81
|
+
docling/utils/accelerator_utils.py,sha256=DSajLxVx1JEVT0zt5de26llciLNlVfIDfSa2zYCFJzQ,2909
|
82
82
|
docling/utils/api_image_request.py,sha256=_CgdzmPqdsyXmyYUFGLZcXcoH586qC6A1p5vsNbj1Q0,1416
|
83
83
|
docling/utils/export.py,sha256=VwVUnYDk3mhGmISDbVm306fwpGNnoojouStBD4UajXI,4673
|
84
84
|
docling/utils/glm_utils.py,sha256=TKOWQqWAHsX_w4fvoAA7_2xCi_urhnp1DsmjY8_sk5w,12274
|
85
|
-
docling/utils/layout_postprocessor.py,sha256=
|
85
|
+
docling/utils/layout_postprocessor.py,sha256=QuTZZq4LNs1eM_n_2gubVfAuLBMkJiozfs3hp-jUpK4,24399
|
86
86
|
docling/utils/locks.py,sha256=RzqQtD5UispgV71pGN_nU6GYfeN11BN0Sh_Dq9ycqGo,52
|
87
87
|
docling/utils/model_downloader.py,sha256=6TDxFOvMRYT8JyYyaQS_wXMJzNga61ImY3sFdks66qM,4004
|
88
88
|
docling/utils/ocr_utils.py,sha256=AOaDAHr5S74d-IRVR_LKhKynUTIurAwLJ3wNeY58gPA,2326
|
@@ -90,9 +90,9 @@ docling/utils/orientation.py,sha256=xXlOfowL54FKwjsTFrM7y3ogk1wChLNn_-u74tYIf1s,
|
|
90
90
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
91
91
|
docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
|
92
92
|
docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
|
93
|
-
docling-2.
|
94
|
-
docling-2.
|
95
|
-
docling-2.
|
96
|
-
docling-2.
|
97
|
-
docling-2.
|
98
|
-
docling-2.
|
93
|
+
docling-2.40.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
94
|
+
docling-2.40.0.dist-info/METADATA,sha256=j4a3p3XDeiaIyAQfNiP__KPrvMa7sgBIv9LhG7E-IlI,10274
|
95
|
+
docling-2.40.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
96
|
+
docling-2.40.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
|
97
|
+
docling-2.40.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
|
98
|
+
docling-2.40.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|