docling 2.46.0__py3-none-any.whl → 2.47.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/html_backend.py +111 -13
- docling/backend/msword_backend.py +126 -16
- docling/cli/main.py +14 -0
- docling/cli/models.py +56 -0
- docling/datamodel/base_models.py +1 -1
- docling/datamodel/pipeline_options.py +3 -0
- docling/datamodel/pipeline_options_vlm_model.py +5 -0
- docling/datamodel/vlm_model_specs.py +114 -1
- docling/models/base_model.py +95 -2
- docling/models/page_preprocessing_model.py +5 -1
- docling/models/picture_description_vlm_model.py +4 -2
- docling/models/vlm_models_inline/__init__.py +1 -0
- docling/models/vlm_models_inline/hf_transformers_model.py +179 -79
- docling/models/vlm_models_inline/mlx_model.py +179 -68
- docling/models/vlm_models_inline/vllm_model.py +235 -0
- docling/pipeline/base_pipeline.py +3 -2
- docling/pipeline/threaded_standard_pdf_pipeline.py +1 -1
- docling/pipeline/vlm_pipeline.py +14 -1
- docling/utils/layout_postprocessor.py +51 -43
- {docling-2.46.0.dist-info → docling-2.47.1.dist-info}/METADATA +2 -1
- {docling-2.46.0.dist-info → docling-2.47.1.dist-info}/RECORD +25 -24
- {docling-2.46.0.dist-info → docling-2.47.1.dist-info}/WHEEL +0 -0
- {docling-2.46.0.dist-info → docling-2.47.1.dist-info}/entry_points.txt +0 -0
- {docling-2.46.0.dist-info → docling-2.47.1.dist-info}/licenses/LICENSE +0 -0
- {docling-2.46.0.dist-info → docling-2.47.1.dist-info}/top_level.txt +0 -0
@@ -239,15 +239,18 @@ class LayoutPostprocessor:
|
|
239
239
|
final_clusters = self._sort_clusters(
|
240
240
|
self.regular_clusters + self.special_clusters, mode="id"
|
241
241
|
)
|
242
|
-
for cluster in final_clusters:
|
243
|
-
cluster.cells = self._sort_cells(cluster.cells)
|
244
|
-
# Also sort cells in children if any
|
245
|
-
for child in cluster.children:
|
246
|
-
child.cells = self._sort_cells(child.cells)
|
247
242
|
|
248
|
-
|
249
|
-
|
250
|
-
|
243
|
+
# Conditionally process cells if not skipping cell assignment
|
244
|
+
if not self.options.skip_cell_assignment:
|
245
|
+
for cluster in final_clusters:
|
246
|
+
cluster.cells = self._sort_cells(cluster.cells)
|
247
|
+
# Also sort cells in children if any
|
248
|
+
for child in cluster.children:
|
249
|
+
child.cells = self._sort_cells(child.cells)
|
250
|
+
|
251
|
+
assert self.page.parsed_page is not None
|
252
|
+
self.page.parsed_page.textline_cells = self.cells
|
253
|
+
self.page.parsed_page.has_lines = len(self.cells) > 0
|
251
254
|
|
252
255
|
return final_clusters, self.cells
|
253
256
|
|
@@ -264,36 +267,38 @@ class LayoutPostprocessor:
|
|
264
267
|
if cluster.label in self.LABEL_REMAPPING:
|
265
268
|
cluster.label = self.LABEL_REMAPPING[cluster.label]
|
266
269
|
|
267
|
-
#
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
clusters
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
270
|
+
# Conditionally assign cells to clusters
|
271
|
+
if not self.options.skip_cell_assignment:
|
272
|
+
# Initial cell assignment
|
273
|
+
clusters = self._assign_cells_to_clusters(clusters)
|
274
|
+
|
275
|
+
# Remove clusters with no cells (if keep_empty_clusters is False),
|
276
|
+
# but always keep clusters with label DocItemLabel.FORMULA
|
277
|
+
if not self.options.keep_empty_clusters:
|
278
|
+
clusters = [
|
279
|
+
cluster
|
280
|
+
for cluster in clusters
|
281
|
+
if cluster.cells or cluster.label == DocItemLabel.FORMULA
|
282
|
+
]
|
283
|
+
|
284
|
+
# Handle orphaned cells
|
285
|
+
unassigned = self._find_unassigned_cells(clusters)
|
286
|
+
if unassigned and self.options.create_orphan_clusters:
|
287
|
+
next_id = max((c.id for c in self.all_clusters), default=0) + 1
|
288
|
+
orphan_clusters = []
|
289
|
+
for i, cell in enumerate(unassigned):
|
290
|
+
conf = cell.confidence
|
291
|
+
|
292
|
+
orphan_clusters.append(
|
293
|
+
Cluster(
|
294
|
+
id=next_id + i,
|
295
|
+
label=DocItemLabel.TEXT,
|
296
|
+
bbox=cell.to_bounding_box(),
|
297
|
+
confidence=conf,
|
298
|
+
cells=[cell],
|
299
|
+
)
|
294
300
|
)
|
295
|
-
)
|
296
|
-
clusters.extend(orphan_clusters)
|
301
|
+
clusters.extend(orphan_clusters)
|
297
302
|
|
298
303
|
# Iterative refinement
|
299
304
|
prev_count = len(clusters) + 1
|
@@ -350,12 +355,15 @@ class LayoutPostprocessor:
|
|
350
355
|
b=max(c.bbox.b for c in contained),
|
351
356
|
)
|
352
357
|
|
353
|
-
#
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
358
|
+
# Conditionally collect cells from children
|
359
|
+
if not self.options.skip_cell_assignment:
|
360
|
+
all_cells = []
|
361
|
+
for child in contained:
|
362
|
+
all_cells.extend(child.cells)
|
363
|
+
special.cells = self._deduplicate_cells(all_cells)
|
364
|
+
special.cells = self._sort_cells(special.cells)
|
365
|
+
else:
|
366
|
+
special.cells = []
|
359
367
|
|
360
368
|
picture_clusters = [
|
361
369
|
c for c in special_clusters if c.label == DocItemLabel.PICTURE
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.46.0
|
3
|
+
Version: 2.47.1
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
6
|
License-Expression: MIT
|
@@ -59,6 +59,7 @@ Provides-Extra: vlm
|
|
59
59
|
Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
|
60
60
|
Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
|
61
61
|
Requires-Dist: mlx-vlm<1.0.0,>=0.3.0; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
|
62
|
+
Requires-Dist: vllm<1.0.0,>=0.10.0; (python_version >= "3.10" and sys_platform == "linux" and platform_machine == "x86_64") and extra == "vlm"
|
62
63
|
Provides-Extra: rapidocr
|
63
64
|
Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
|
64
65
|
Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
|
@@ -9,12 +9,12 @@ docling/backend/csv_backend.py,sha256=2g9famYG2W-ID9jEdZPxc6O8QGv1vWQfjN8pL-QMBE
|
|
9
9
|
docling/backend/docling_parse_backend.py,sha256=9rUo1vPxX6QLzGqF-2B2iEYglZg6YQ3Uea00XrLluTg,7918
|
10
10
|
docling/backend/docling_parse_v2_backend.py,sha256=3ckTfke8IICjaImlIzc3TRhG7KDuxDDba0AuCEcjA-M,9500
|
11
11
|
docling/backend/docling_parse_v4_backend.py,sha256=MbCMxNGmoW4iuev9tX1Vt4jtIeak2kC9Uac3xQSRxeo,7509
|
12
|
-
docling/backend/html_backend.py,sha256=
|
12
|
+
docling/backend/html_backend.py,sha256=qPXmMiKxskSDJJK5e0a46xhkSiATgjyi02eMrY_ahR8,38323
|
13
13
|
docling/backend/md_backend.py,sha256=qCI7SD9hnWWGrkG_drpzQv2Z7DVBG4Tsq3hhTsYV790,22562
|
14
14
|
docling/backend/mets_gbs_backend.py,sha256=EA8sY6tbmGiysKGYPPZiNlK-i7Adn8bLTo-7Ym15hTU,12774
|
15
15
|
docling/backend/msexcel_backend.py,sha256=cq8MQ2RSh6pqCiVrldjOerSww7dOPTWmCQoCBI57i6w,18579
|
16
16
|
docling/backend/mspowerpoint_backend.py,sha256=wJgB2JStEPfD7MPpWQlpPN7bffPxaHFUnKD4wj8SLxU,15114
|
17
|
-
docling/backend/msword_backend.py,sha256=
|
17
|
+
docling/backend/msword_backend.py,sha256=fKeAMGGR5ABimedo_ofCQAybzdqmqWA3A3mpLl7X6qY,49129
|
18
18
|
docling/backend/noop_backend.py,sha256=EOPbD86FzZPX-K_DpNrJh0_lC0bZz--4DpG-OagDNGY,1688
|
19
19
|
docling/backend/pdf_backend.py,sha256=Wcd1NSrAMjXK8VicTki5p-j-JLofklt07eF0kIG17_0,3361
|
20
20
|
docling/backend/pypdfium2_backend.py,sha256=8dVniLHgiTdJuDbYr66kPp6Ccv5ZDlqDMEbA2xIfS7U,13370
|
@@ -29,23 +29,23 @@ docling/backend/xml/jats_backend.py,sha256=LPj33EFdi2MRCakkLWrRLlUAc-B-949f8zp5g
|
|
29
29
|
docling/backend/xml/uspto_backend.py,sha256=nyAMr5ht7dclxkVDwsKNeiOhLQrUtRLS8JdscB2AVJg,70924
|
30
30
|
docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
|
31
31
|
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
32
|
-
docling/cli/main.py,sha256
|
33
|
-
docling/cli/models.py,sha256=
|
32
|
+
docling/cli/main.py,sha256=yRgDoc60wm5py2ztq6-q5BPnxfLJZ1EA8Bce-EcpLPs,30952
|
33
|
+
docling/cli/models.py,sha256=5C3CZz3HZXoCrBl92Is62KMCtUqsZK-oygj1hqzJ8vo,6008
|
34
34
|
docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
|
35
35
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
36
36
|
docling/datamodel/accelerator_options.py,sha256=wv6dOFTVAwr9onkE-0pfUqX_fDb6gX53iPPE6o8nKjI,2511
|
37
37
|
docling/datamodel/asr_model_specs.py,sha256=Wg7z3zm_wXIWu122iPVy0RMECsA_JCFHrlFF-xxHoVQ,2187
|
38
|
-
docling/datamodel/base_models.py,sha256=
|
38
|
+
docling/datamodel/base_models.py,sha256=OI2-tBjH3PZMF_Zyyc4eezJ4gFXIBiKT4BYKYy6n81E,11924
|
39
39
|
docling/datamodel/document.py,sha256=zsxFYXvo6GtwGNogSDoBB1TFvkm7IOrP_VnqXNqBhJs,17329
|
40
40
|
docling/datamodel/layout_model_specs.py,sha256=GSkJ-Z_0PVgwWGi7C7TsxbzRjlrWS9ZrHJjHumv-Z5U,2339
|
41
|
-
docling/datamodel/pipeline_options.py,sha256=
|
41
|
+
docling/datamodel/pipeline_options.py,sha256=x0RlEdTiEU9gH27YDRov1ZVMpTlx4BnqEoEtmOHd08k,10584
|
42
42
|
docling/datamodel/pipeline_options_asr_model.py,sha256=7X068xl-qpbyPxC7-TwX7Q6tLyZXGT5h1osZ_xLNLM0,1454
|
43
|
-
docling/datamodel/pipeline_options_vlm_model.py,sha256=
|
43
|
+
docling/datamodel/pipeline_options_vlm_model.py,sha256=AcqqThSW74hwQ6x7pazzm57LnJiUqB7gQi5wFayGlbk,2628
|
44
44
|
docling/datamodel/settings.py,sha256=c0MTw6pO5be_BKxHKYl4SaBJAw_qL-aapxp-g5HHj1A,2084
|
45
|
-
docling/datamodel/vlm_model_specs.py,sha256
|
45
|
+
docling/datamodel/vlm_model_specs.py,sha256=dFObfYlPyN7AbTCudsubsWvWTTx4F4Xz9GEJPkEV2_M,8175
|
46
46
|
docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
47
47
|
docling/models/api_vlm_model.py,sha256=-zisU32pgDRbychyG6-neB0qweNbPaYnLXwiGT7SEdI,2859
|
48
|
-
docling/models/base_model.py,sha256=
|
48
|
+
docling/models/base_model.py,sha256=tXFM7zJwF6Kn2EhtaB4QmgK4O2ruv1C7SjdBgM5QKak,6225
|
49
49
|
docling/models/base_ocr_model.py,sha256=kT8TylASOpPlY60rIG6VL6_eLVsfg5KvEVnZHzDWtR0,8193
|
50
50
|
docling/models/code_formula_model.py,sha256=XRugm4EwifLRc-TrAk-glKlktJP-nAPneKh2EOovkJU,11308
|
51
51
|
docling/models/document_picture_classifier.py,sha256=9JvoWeH5uQBC7levjM8zptk7UT-b8EQnD-2EnxTjTT4,6202
|
@@ -53,10 +53,10 @@ docling/models/easyocr_model.py,sha256=ECPBd-48cCw5s935NsPJO_C_1QuK_yAUGloMM77Wq
|
|
53
53
|
docling/models/layout_model.py,sha256=Nfbo6keMB4vVjGoZdFMqD9CmZcWh-0bE3LkRjJTDJQ0,9146
|
54
54
|
docling/models/ocr_mac_model.py,sha256=y-1DSFDbACHpEwNTfQwzN9ab8r5j5rBFNPtQ48BzsrA,5396
|
55
55
|
docling/models/page_assemble_model.py,sha256=TvN1naez7dUodLxpUUBzpuMCpqZBTf6YSpewxgjzmrg,6323
|
56
|
-
docling/models/page_preprocessing_model.py,sha256=
|
56
|
+
docling/models/page_preprocessing_model.py,sha256=rHNX1uP1ScTjVUlsxZ0eamK2uNUqI94WpnyrP10Pj6k,5277
|
57
57
|
docling/models/picture_description_api_model.py,sha256=o3EkV5aHW_6WzE_fdj_VRnNCrS_btclO_ZCLAUqrfl0,2377
|
58
58
|
docling/models/picture_description_base_model.py,sha256=kLthLhdlgwhootQ4_xhhcAk6A-vso5-qcsFJ3TcYfO0,2991
|
59
|
-
docling/models/picture_description_vlm_model.py,sha256=
|
59
|
+
docling/models/picture_description_vlm_model.py,sha256=5BJvaF3PHuL9lCVYqPv9krh3h_7YwNSdKYw1EVEj13k,4156
|
60
60
|
docling/models/rapid_ocr_model.py,sha256=AMdc66s_iWO4p6nQ0LNjQMUYVxrDSxMyLNPpjPYt6N8,5916
|
61
61
|
docling/models/readingorder_model.py,sha256=bZoXHaSwUsa8niSmJrbCuy784ixCeBXT-RQBUfgHJ4A,14925
|
62
62
|
docling/models/table_structure_model.py,sha256=RFXo73f2q4XuKyaSqbxpznh7JVtlLcT0FsOWl9oZbSg,12518
|
@@ -70,22 +70,23 @@ docling/models/plugins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3
|
|
70
70
|
docling/models/plugins/defaults.py,sha256=OAHWW2tCcUXSyDMFxV_lXVRjSBJ1n6z-Eb3R8cDucU4,886
|
71
71
|
docling/models/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
72
72
|
docling/models/utils/hf_model_download.py,sha256=scBEfsM4yl7xPzqe7UtPvDh9RfQZQnuOhqQKilYBHls,984
|
73
|
-
docling/models/vlm_models_inline/__init__.py,sha256=
|
74
|
-
docling/models/vlm_models_inline/hf_transformers_model.py,sha256=
|
75
|
-
docling/models/vlm_models_inline/mlx_model.py,sha256=
|
73
|
+
docling/models/vlm_models_inline/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
74
|
+
docling/models/vlm_models_inline/hf_transformers_model.py,sha256=G0RpKwdzm5NiqIBHG5nWLwBsrDfDebzErzRkyXppZPw,12134
|
75
|
+
docling/models/vlm_models_inline/mlx_model.py,sha256=VP05v97mqzmaG4o9bOpJcxIlEqvNzAapJ15Zz3E3ACI,10169
|
76
|
+
docling/models/vlm_models_inline/vllm_model.py,sha256=_EnK1nfpAPJky7aRlyp8SUIghiZOQO8AkDN_hHqXLZg,8615
|
76
77
|
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
77
78
|
docling/pipeline/asr_pipeline.py,sha256=tQkhu9fXdkSuYIL22xzV2YRUlQh-9qktHBbs2qeXhJI,9070
|
78
|
-
docling/pipeline/base_pipeline.py,sha256=
|
79
|
+
docling/pipeline/base_pipeline.py,sha256=Tl_C3adFABNxtE7hX83VSdx-j7D8GRvoFcno5A3Z-YQ,10062
|
79
80
|
docling/pipeline/simple_pipeline.py,sha256=TXZOwR7hZRji462ZTIpte0VJjzbxvNVE8dbLFANDhSU,2253
|
80
81
|
docling/pipeline/standard_pdf_pipeline.py,sha256=yFishq4Cu01BiBGHk3Irr7ogcTQKeSC0QZImQVAhIaY,12740
|
81
|
-
docling/pipeline/threaded_standard_pdf_pipeline.py,sha256=
|
82
|
-
docling/pipeline/vlm_pipeline.py,sha256=
|
82
|
+
docling/pipeline/threaded_standard_pdf_pipeline.py,sha256=NgdZxpfpElnvCgGlrQ8kSvq44LNzJcc6wOqD-AMrKZ0,26132
|
83
|
+
docling/pipeline/vlm_pipeline.py,sha256=PlllB2ZbhuUQilJ5_W8RhuBUbLtXdSjnrYnTVzHF0Vc,15955
|
83
84
|
docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
84
85
|
docling/utils/accelerator_utils.py,sha256=DSajLxVx1JEVT0zt5de26llciLNlVfIDfSa2zYCFJzQ,2909
|
85
86
|
docling/utils/api_image_request.py,sha256=_CgdzmPqdsyXmyYUFGLZcXcoH586qC6A1p5vsNbj1Q0,1416
|
86
87
|
docling/utils/export.py,sha256=VwVUnYDk3mhGmISDbVm306fwpGNnoojouStBD4UajXI,4673
|
87
88
|
docling/utils/glm_utils.py,sha256=TKOWQqWAHsX_w4fvoAA7_2xCi_urhnp1DsmjY8_sk5w,12274
|
88
|
-
docling/utils/layout_postprocessor.py,sha256=
|
89
|
+
docling/utils/layout_postprocessor.py,sha256=sE9UR3Nv4iOk26uoIsN3bFioE7ScfAjj0orDBDneLXg,25166
|
89
90
|
docling/utils/locks.py,sha256=RzqQtD5UispgV71pGN_nU6GYfeN11BN0Sh_Dq9ycqGo,52
|
90
91
|
docling/utils/model_downloader.py,sha256=3vijCsAIVwWqehGBDRxRq7mJ3yRb9-zBsG00iqjqegU,4076
|
91
92
|
docling/utils/ocr_utils.py,sha256=nmresYyfin0raanpQc_GGeU3WoLsfExf6SEXNIQ7Djg,2325
|
@@ -93,9 +94,9 @@ docling/utils/orientation.py,sha256=jTyLxyT31FlOodZoBMlADHNQK2lAWKYVs5z7pXd_6Cg,
|
|
93
94
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
94
95
|
docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
|
95
96
|
docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
|
96
|
-
docling-2.
|
97
|
-
docling-2.
|
98
|
-
docling-2.
|
99
|
-
docling-2.
|
100
|
-
docling-2.
|
101
|
-
docling-2.
|
97
|
+
docling-2.47.1.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
98
|
+
docling-2.47.1.dist-info/METADATA,sha256=4n44qY1NuWCGZTAp2Zc68pxOOXAj4lurjWr503RAdU4,10602
|
99
|
+
docling-2.47.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
100
|
+
docling-2.47.1.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
|
101
|
+
docling-2.47.1.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
|
102
|
+
docling-2.47.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|