docling 2.46.0__py3-none-any.whl → 2.47.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -239,15 +239,18 @@ class LayoutPostprocessor:
239
239
  final_clusters = self._sort_clusters(
240
240
  self.regular_clusters + self.special_clusters, mode="id"
241
241
  )
242
- for cluster in final_clusters:
243
- cluster.cells = self._sort_cells(cluster.cells)
244
- # Also sort cells in children if any
245
- for child in cluster.children:
246
- child.cells = self._sort_cells(child.cells)
247
242
 
248
- assert self.page.parsed_page is not None
249
- self.page.parsed_page.textline_cells = self.cells
250
- self.page.parsed_page.has_lines = len(self.cells) > 0
243
+ # Conditionally process cells if not skipping cell assignment
244
+ if not self.options.skip_cell_assignment:
245
+ for cluster in final_clusters:
246
+ cluster.cells = self._sort_cells(cluster.cells)
247
+ # Also sort cells in children if any
248
+ for child in cluster.children:
249
+ child.cells = self._sort_cells(child.cells)
250
+
251
+ assert self.page.parsed_page is not None
252
+ self.page.parsed_page.textline_cells = self.cells
253
+ self.page.parsed_page.has_lines = len(self.cells) > 0
251
254
 
252
255
  return final_clusters, self.cells
253
256
 
@@ -264,36 +267,38 @@ class LayoutPostprocessor:
264
267
  if cluster.label in self.LABEL_REMAPPING:
265
268
  cluster.label = self.LABEL_REMAPPING[cluster.label]
266
269
 
267
- # Initial cell assignment
268
- clusters = self._assign_cells_to_clusters(clusters)
269
-
270
- # Remove clusters with no cells (if keep_empty_clusters is False),
271
- # but always keep clusters with label DocItemLabel.FORMULA
272
- if not self.options.keep_empty_clusters:
273
- clusters = [
274
- cluster
275
- for cluster in clusters
276
- if cluster.cells or cluster.label == DocItemLabel.FORMULA
277
- ]
278
-
279
- # Handle orphaned cells
280
- unassigned = self._find_unassigned_cells(clusters)
281
- if unassigned and self.options.create_orphan_clusters:
282
- next_id = max((c.id for c in self.all_clusters), default=0) + 1
283
- orphan_clusters = []
284
- for i, cell in enumerate(unassigned):
285
- conf = cell.confidence
286
-
287
- orphan_clusters.append(
288
- Cluster(
289
- id=next_id + i,
290
- label=DocItemLabel.TEXT,
291
- bbox=cell.to_bounding_box(),
292
- confidence=conf,
293
- cells=[cell],
270
+ # Conditionally assign cells to clusters
271
+ if not self.options.skip_cell_assignment:
272
+ # Initial cell assignment
273
+ clusters = self._assign_cells_to_clusters(clusters)
274
+
275
+ # Remove clusters with no cells (if keep_empty_clusters is False),
276
+ # but always keep clusters with label DocItemLabel.FORMULA
277
+ if not self.options.keep_empty_clusters:
278
+ clusters = [
279
+ cluster
280
+ for cluster in clusters
281
+ if cluster.cells or cluster.label == DocItemLabel.FORMULA
282
+ ]
283
+
284
+ # Handle orphaned cells
285
+ unassigned = self._find_unassigned_cells(clusters)
286
+ if unassigned and self.options.create_orphan_clusters:
287
+ next_id = max((c.id for c in self.all_clusters), default=0) + 1
288
+ orphan_clusters = []
289
+ for i, cell in enumerate(unassigned):
290
+ conf = cell.confidence
291
+
292
+ orphan_clusters.append(
293
+ Cluster(
294
+ id=next_id + i,
295
+ label=DocItemLabel.TEXT,
296
+ bbox=cell.to_bounding_box(),
297
+ confidence=conf,
298
+ cells=[cell],
299
+ )
294
300
  )
295
- )
296
- clusters.extend(orphan_clusters)
301
+ clusters.extend(orphan_clusters)
297
302
 
298
303
  # Iterative refinement
299
304
  prev_count = len(clusters) + 1
@@ -350,12 +355,15 @@ class LayoutPostprocessor:
350
355
  b=max(c.bbox.b for c in contained),
351
356
  )
352
357
 
353
- # Collect all cells from children
354
- all_cells = []
355
- for child in contained:
356
- all_cells.extend(child.cells)
357
- special.cells = self._deduplicate_cells(all_cells)
358
- special.cells = self._sort_cells(special.cells)
358
+ # Conditionally collect cells from children
359
+ if not self.options.skip_cell_assignment:
360
+ all_cells = []
361
+ for child in contained:
362
+ all_cells.extend(child.cells)
363
+ special.cells = self._deduplicate_cells(all_cells)
364
+ special.cells = self._sort_cells(special.cells)
365
+ else:
366
+ special.cells = []
359
367
 
360
368
  picture_clusters = [
361
369
  c for c in special_clusters if c.label == DocItemLabel.PICTURE
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.46.0
3
+ Version: 2.47.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -59,6 +59,7 @@ Provides-Extra: vlm
59
59
  Requires-Dist: transformers<5.0.0,>=4.46.0; extra == "vlm"
60
60
  Requires-Dist: accelerate<2.0.0,>=1.2.1; extra == "vlm"
61
61
  Requires-Dist: mlx-vlm<1.0.0,>=0.3.0; (python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64") and extra == "vlm"
62
+ Requires-Dist: vllm<1.0.0,>=0.10.0; (python_version >= "3.10" and sys_platform == "linux") and extra == "vlm"
62
63
  Provides-Extra: rapidocr
63
64
  Requires-Dist: rapidocr-onnxruntime<2.0.0,>=1.4.0; python_version < "3.13" and extra == "rapidocr"
64
65
  Requires-Dist: onnxruntime<2.0.0,>=1.7.0; extra == "rapidocr"
@@ -9,12 +9,12 @@ docling/backend/csv_backend.py,sha256=2g9famYG2W-ID9jEdZPxc6O8QGv1vWQfjN8pL-QMBE
9
9
  docling/backend/docling_parse_backend.py,sha256=9rUo1vPxX6QLzGqF-2B2iEYglZg6YQ3Uea00XrLluTg,7918
10
10
  docling/backend/docling_parse_v2_backend.py,sha256=3ckTfke8IICjaImlIzc3TRhG7KDuxDDba0AuCEcjA-M,9500
11
11
  docling/backend/docling_parse_v4_backend.py,sha256=MbCMxNGmoW4iuev9tX1Vt4jtIeak2kC9Uac3xQSRxeo,7509
12
- docling/backend/html_backend.py,sha256=zJH4wkcyftvoA-ixC4MH-xjwl-TGTN9BvZT7Hhla2mc,34701
12
+ docling/backend/html_backend.py,sha256=qPXmMiKxskSDJJK5e0a46xhkSiATgjyi02eMrY_ahR8,38323
13
13
  docling/backend/md_backend.py,sha256=qCI7SD9hnWWGrkG_drpzQv2Z7DVBG4Tsq3hhTsYV790,22562
14
14
  docling/backend/mets_gbs_backend.py,sha256=EA8sY6tbmGiysKGYPPZiNlK-i7Adn8bLTo-7Ym15hTU,12774
15
15
  docling/backend/msexcel_backend.py,sha256=cq8MQ2RSh6pqCiVrldjOerSww7dOPTWmCQoCBI57i6w,18579
16
16
  docling/backend/mspowerpoint_backend.py,sha256=wJgB2JStEPfD7MPpWQlpPN7bffPxaHFUnKD4wj8SLxU,15114
17
- docling/backend/msword_backend.py,sha256=DxMgPcq-Ao1vq7X2v8qqWeMs9MryPw_Jw3YRAAdXBtM,44904
17
+ docling/backend/msword_backend.py,sha256=fKeAMGGR5ABimedo_ofCQAybzdqmqWA3A3mpLl7X6qY,49129
18
18
  docling/backend/noop_backend.py,sha256=EOPbD86FzZPX-K_DpNrJh0_lC0bZz--4DpG-OagDNGY,1688
19
19
  docling/backend/pdf_backend.py,sha256=Wcd1NSrAMjXK8VicTki5p-j-JLofklt07eF0kIG17_0,3361
20
20
  docling/backend/pypdfium2_backend.py,sha256=8dVniLHgiTdJuDbYr66kPp6Ccv5ZDlqDMEbA2xIfS7U,13370
@@ -29,23 +29,23 @@ docling/backend/xml/jats_backend.py,sha256=LPj33EFdi2MRCakkLWrRLlUAc-B-949f8zp5g
29
29
  docling/backend/xml/uspto_backend.py,sha256=nyAMr5ht7dclxkVDwsKNeiOhLQrUtRLS8JdscB2AVJg,70924
30
30
  docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
31
31
  docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
32
- docling/cli/main.py,sha256=-W_vdKvSm5gZUZyvRpFH0YMI_1iJrP5sJOZ5_1bLorw,30359
33
- docling/cli/models.py,sha256=9yLGp6QRJGpR86U3SjmWAXDt3MvBaJLLY4xDVdsu3O8,4160
32
+ docling/cli/main.py,sha256=yRgDoc60wm5py2ztq6-q5BPnxfLJZ1EA8Bce-EcpLPs,30952
33
+ docling/cli/models.py,sha256=5C3CZz3HZXoCrBl92Is62KMCtUqsZK-oygj1hqzJ8vo,6008
34
34
  docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
35
35
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
36
  docling/datamodel/accelerator_options.py,sha256=wv6dOFTVAwr9onkE-0pfUqX_fDb6gX53iPPE6o8nKjI,2511
37
37
  docling/datamodel/asr_model_specs.py,sha256=Wg7z3zm_wXIWu122iPVy0RMECsA_JCFHrlFF-xxHoVQ,2187
38
- docling/datamodel/base_models.py,sha256=Ifd8PPHs4sW7ScwSqpa-y3rwgPbde_iw13Y2NUCPfU8,11944
38
+ docling/datamodel/base_models.py,sha256=OI2-tBjH3PZMF_Zyyc4eezJ4gFXIBiKT4BYKYy6n81E,11924
39
39
  docling/datamodel/document.py,sha256=zsxFYXvo6GtwGNogSDoBB1TFvkm7IOrP_VnqXNqBhJs,17329
40
40
  docling/datamodel/layout_model_specs.py,sha256=GSkJ-Z_0PVgwWGi7C7TsxbzRjlrWS9ZrHJjHumv-Z5U,2339
41
- docling/datamodel/pipeline_options.py,sha256=vOLpuVF-d4nmr-L16EmmhGFn25SDsgExCfX5kPiyISg,10470
41
+ docling/datamodel/pipeline_options.py,sha256=x0RlEdTiEU9gH27YDRov1ZVMpTlx4BnqEoEtmOHd08k,10584
42
42
  docling/datamodel/pipeline_options_asr_model.py,sha256=7X068xl-qpbyPxC7-TwX7Q6tLyZXGT5h1osZ_xLNLM0,1454
43
- docling/datamodel/pipeline_options_vlm_model.py,sha256=eH-Cj_8aic9FdX4xGlBcf5_R9e152JAL2LhtY8d0rhw,2498
43
+ docling/datamodel/pipeline_options_vlm_model.py,sha256=AcqqThSW74hwQ6x7pazzm57LnJiUqB7gQi5wFayGlbk,2628
44
44
  docling/datamodel/settings.py,sha256=c0MTw6pO5be_BKxHKYl4SaBJAw_qL-aapxp-g5HHj1A,2084
45
- docling/datamodel/vlm_model_specs.py,sha256=--jZexGeu-s_lWp7y_WwWEf6CD1J4XqADrS1-OY_pWM,4737
45
+ docling/datamodel/vlm_model_specs.py,sha256=dFObfYlPyN7AbTCudsubsWvWTTx4F4Xz9GEJPkEV2_M,8175
46
46
  docling/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
47
47
  docling/models/api_vlm_model.py,sha256=-zisU32pgDRbychyG6-neB0qweNbPaYnLXwiGT7SEdI,2859
48
- docling/models/base_model.py,sha256=NNjIapqCruAEAWR-CCdsNgXc2QkwiPYAcaQ_ZYe1W28,2978
48
+ docling/models/base_model.py,sha256=tXFM7zJwF6Kn2EhtaB4QmgK4O2ruv1C7SjdBgM5QKak,6225
49
49
  docling/models/base_ocr_model.py,sha256=kT8TylASOpPlY60rIG6VL6_eLVsfg5KvEVnZHzDWtR0,8193
50
50
  docling/models/code_formula_model.py,sha256=XRugm4EwifLRc-TrAk-glKlktJP-nAPneKh2EOovkJU,11308
51
51
  docling/models/document_picture_classifier.py,sha256=9JvoWeH5uQBC7levjM8zptk7UT-b8EQnD-2EnxTjTT4,6202
@@ -53,10 +53,10 @@ docling/models/easyocr_model.py,sha256=ECPBd-48cCw5s935NsPJO_C_1QuK_yAUGloMM77Wq
53
53
  docling/models/layout_model.py,sha256=Nfbo6keMB4vVjGoZdFMqD9CmZcWh-0bE3LkRjJTDJQ0,9146
54
54
  docling/models/ocr_mac_model.py,sha256=y-1DSFDbACHpEwNTfQwzN9ab8r5j5rBFNPtQ48BzsrA,5396
55
55
  docling/models/page_assemble_model.py,sha256=TvN1naez7dUodLxpUUBzpuMCpqZBTf6YSpewxgjzmrg,6323
56
- docling/models/page_preprocessing_model.py,sha256=x8MI4mvjizqEqAb5511dtrNRCJSb-lSmwHw0tmHPFiI,5103
56
+ docling/models/page_preprocessing_model.py,sha256=rHNX1uP1ScTjVUlsxZ0eamK2uNUqI94WpnyrP10Pj6k,5277
57
57
  docling/models/picture_description_api_model.py,sha256=o3EkV5aHW_6WzE_fdj_VRnNCrS_btclO_ZCLAUqrfl0,2377
58
58
  docling/models/picture_description_base_model.py,sha256=kLthLhdlgwhootQ4_xhhcAk6A-vso5-qcsFJ3TcYfO0,2991
59
- docling/models/picture_description_vlm_model.py,sha256=yfyAFOy8RjxQJrafPMSAMrrpaYu3anahjRX6tCnVcs0,4028
59
+ docling/models/picture_description_vlm_model.py,sha256=5BJvaF3PHuL9lCVYqPv9krh3h_7YwNSdKYw1EVEj13k,4156
60
60
  docling/models/rapid_ocr_model.py,sha256=AMdc66s_iWO4p6nQ0LNjQMUYVxrDSxMyLNPpjPYt6N8,5916
61
61
  docling/models/readingorder_model.py,sha256=bZoXHaSwUsa8niSmJrbCuy784ixCeBXT-RQBUfgHJ4A,14925
62
62
  docling/models/table_structure_model.py,sha256=RFXo73f2q4XuKyaSqbxpznh7JVtlLcT0FsOWl9oZbSg,12518
@@ -70,22 +70,23 @@ docling/models/plugins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3
70
70
  docling/models/plugins/defaults.py,sha256=OAHWW2tCcUXSyDMFxV_lXVRjSBJ1n6z-Eb3R8cDucU4,886
71
71
  docling/models/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
72
72
  docling/models/utils/hf_model_download.py,sha256=scBEfsM4yl7xPzqe7UtPvDh9RfQZQnuOhqQKilYBHls,984
73
- docling/models/vlm_models_inline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
74
- docling/models/vlm_models_inline/hf_transformers_model.py,sha256=Rwdr7neDpn5ehtrp6n7G21fcPBK2m9Har_6BFNdyw-Q,8359
75
- docling/models/vlm_models_inline/mlx_model.py,sha256=YYYmopsITlX17JVS5KhLlb1IQSEVoSECNx_fXLHNpAc,5880
73
+ docling/models/vlm_models_inline/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
74
+ docling/models/vlm_models_inline/hf_transformers_model.py,sha256=G0RpKwdzm5NiqIBHG5nWLwBsrDfDebzErzRkyXppZPw,12134
75
+ docling/models/vlm_models_inline/mlx_model.py,sha256=VP05v97mqzmaG4o9bOpJcxIlEqvNzAapJ15Zz3E3ACI,10169
76
+ docling/models/vlm_models_inline/vllm_model.py,sha256=_EnK1nfpAPJky7aRlyp8SUIghiZOQO8AkDN_hHqXLZg,8615
76
77
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
77
78
  docling/pipeline/asr_pipeline.py,sha256=tQkhu9fXdkSuYIL22xzV2YRUlQh-9qktHBbs2qeXhJI,9070
78
79
  docling/pipeline/base_pipeline.py,sha256=VYVYndifTPSD2GWHKjfi4Y76M5qgt1DiygO-jowKsqM,9919
79
80
  docling/pipeline/simple_pipeline.py,sha256=TXZOwR7hZRji462ZTIpte0VJjzbxvNVE8dbLFANDhSU,2253
80
81
  docling/pipeline/standard_pdf_pipeline.py,sha256=yFishq4Cu01BiBGHk3Irr7ogcTQKeSC0QZImQVAhIaY,12740
81
- docling/pipeline/threaded_standard_pdf_pipeline.py,sha256=miPIyprtzPFYG94n6PmUgK4Nh7rqACYEGkWrlTbrZAc,26133
82
- docling/pipeline/vlm_pipeline.py,sha256=0lj8tbXNpYF8OLBoLqP2BZfFpTHi40RoHVfvO_Nah4Q,15349
82
+ docling/pipeline/threaded_standard_pdf_pipeline.py,sha256=NgdZxpfpElnvCgGlrQ8kSvq44LNzJcc6wOqD-AMrKZ0,26132
83
+ docling/pipeline/vlm_pipeline.py,sha256=PlllB2ZbhuUQilJ5_W8RhuBUbLtXdSjnrYnTVzHF0Vc,15955
83
84
  docling/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
84
85
  docling/utils/accelerator_utils.py,sha256=DSajLxVx1JEVT0zt5de26llciLNlVfIDfSa2zYCFJzQ,2909
85
86
  docling/utils/api_image_request.py,sha256=_CgdzmPqdsyXmyYUFGLZcXcoH586qC6A1p5vsNbj1Q0,1416
86
87
  docling/utils/export.py,sha256=VwVUnYDk3mhGmISDbVm306fwpGNnoojouStBD4UajXI,4673
87
88
  docling/utils/glm_utils.py,sha256=TKOWQqWAHsX_w4fvoAA7_2xCi_urhnp1DsmjY8_sk5w,12274
88
- docling/utils/layout_postprocessor.py,sha256=LFLbBE-o3kWu79d8ZcyHlZPIqzQfCabZCIPTJ51lZsY,24657
89
+ docling/utils/layout_postprocessor.py,sha256=sE9UR3Nv4iOk26uoIsN3bFioE7ScfAjj0orDBDneLXg,25166
89
90
  docling/utils/locks.py,sha256=RzqQtD5UispgV71pGN_nU6GYfeN11BN0Sh_Dq9ycqGo,52
90
91
  docling/utils/model_downloader.py,sha256=3vijCsAIVwWqehGBDRxRq7mJ3yRb9-zBsG00iqjqegU,4076
91
92
  docling/utils/ocr_utils.py,sha256=nmresYyfin0raanpQc_GGeU3WoLsfExf6SEXNIQ7Djg,2325
@@ -93,9 +94,9 @@ docling/utils/orientation.py,sha256=jTyLxyT31FlOodZoBMlADHNQK2lAWKYVs5z7pXd_6Cg,
93
94
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
94
95
  docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
95
96
  docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
96
- docling-2.46.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
97
- docling-2.46.0.dist-info/METADATA,sha256=fm7KVaUwGryyuRk7R_AkNSHo1BogY8-ra9gpCWXbnCA,10459
98
- docling-2.46.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
99
- docling-2.46.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
100
- docling-2.46.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
101
- docling-2.46.0.dist-info/RECORD,,
97
+ docling-2.47.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
98
+ docling-2.47.0.dist-info/METADATA,sha256=4cBB6CG8LFZzsDFRenI4f09ypOTCswZk2mVwczxEcVs,10569
99
+ docling-2.47.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
100
+ docling-2.47.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
101
+ docling-2.47.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
102
+ docling-2.47.0.dist-info/RECORD,,