docling 2.28.2__py3-none-any.whl → 2.28.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,7 +5,11 @@ from typing import Iterable, Optional, Union
5
5
 
6
6
  import numpy
7
7
  from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
8
- from docling_core.types.doc.page import BoundingRectangle
8
+ from docling_core.types.doc.page import (
9
+ BoundingRectangle,
10
+ SegmentedPdfPage,
11
+ TextCellUnit,
12
+ )
9
13
  from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
10
14
  from PIL import ImageDraw
11
15
 
@@ -218,9 +222,21 @@ class TableStructureModel(BasePageModel):
218
222
 
219
223
  if len(table_bboxes):
220
224
  for table_cluster, tbl_box in in_tables:
221
-
225
+ # Check if word-level cells are available from backend:
226
+ sp = page._backend.get_segmented_page()
227
+ if sp is not None:
228
+ tcells = sp.get_cells_in_bbox(
229
+ cell_unit=TextCellUnit.WORD,
230
+ bbox=table_cluster.bbox,
231
+ )
232
+ if len(tcells) == 0:
233
+ # Fall back to the cluster's cells when the word-level lookup returns nothing
234
+ tcells = table_cluster.cells
235
+ else:
236
+ # Otherwise - we use normal (line/phrase) cells
237
+ tcells = table_cluster.cells
222
238
  tokens = []
223
- for c in table_cluster.cells:
239
+ for c in tcells:
224
240
  # Only allow non-empty strings (not just whitespace) into the cells of a table
225
241
  if len(c.text.strip()) > 0:
226
242
  new_cell = copy.deepcopy(c)
@@ -229,7 +245,6 @@ class TableStructureModel(BasePageModel):
229
245
  scale=self.scale
230
246
  )
231
247
  )
232
-
233
248
  tokens.append(
234
249
  {
235
250
  "id": new_cell.index,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.28.2
3
+ Version: 2.28.4
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/docling-project/docling
6
6
  License: MIT
@@ -28,7 +28,7 @@ Provides-Extra: vlm
28
28
  Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
29
29
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
30
30
  Requires-Dist: certifi (>=2024.7.4)
31
- Requires-Dist: docling-core[chunking] (>=2.23.1,<3.0.0)
31
+ Requires-Dist: docling-core[chunking] (>=2.24.1,<3.0.0)
32
32
  Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
33
33
  Requires-Dist: docling-parse (>=4.0.0,<5.0.0)
34
34
  Requires-Dist: easyocr (>=1.7,<2.0)
@@ -57,7 +57,7 @@ docling/models/plugins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3
57
57
  docling/models/plugins/defaults.py,sha256=qslXGnRX07Z3GGttNriqaox0v0vXp4zs4KLurHCZjp4,858
58
58
  docling/models/rapid_ocr_model.py,sha256=C_I0Ek9mAPIyTFRHuNbqtXg1c15rLNDE1tJ6_hPIi4c,5869
59
59
  docling/models/readingorder_model.py,sha256=hNWbBX3uZv1FxMwKNKn2JFQuQqTspBLsJBVEidXr6Wk,14869
60
- docling/models/table_structure_model.py,sha256=_b6-2alzhzI19-thDGpM3mww54mxbHLkEiTYMU84d30,11773
60
+ docling/models/table_structure_model.py,sha256=pvTsqUa5QIANBUfot0XXG1UUeku-eaUi04EPE-Yh2g0,12597
61
61
  docling/models/tesseract_ocr_cli_model.py,sha256=S-rCisPrVa3ASvOWycqQoria0PtmNqgdg8YxrLbG1ww,10067
62
62
  docling/models/tesseract_ocr_model.py,sha256=UpLAgKgJtBgbKtJELmKBNMcejJJKBCyFK0q-WgZN1Eg,9256
63
63
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -77,8 +77,8 @@ docling/utils/ocr_utils.py,sha256=F7iOOjqolUcImUzir4qjDQd4QWSO3s6JC4WRn3U7uY4,26
77
77
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
78
78
  docling/utils/utils.py,sha256=0ozCk7zUkYzxRVmYoIB2zA1lqjQOuaQzxfGuf1wmKW4,1866
79
79
  docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
80
- docling-2.28.2.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
81
- docling-2.28.2.dist-info/METADATA,sha256=ZeYjkP0ZzlpqoseGod2_iuJPW9d4B16JCeSo2b61KIw,9982
82
- docling-2.28.2.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
83
- docling-2.28.2.dist-info/entry_points.txt,sha256=pIxel-UeVo1S7FhoNG5xgEfPjLZfBLi_N9TsGPtJSLo,144
84
- docling-2.28.2.dist-info/RECORD,,
80
+ docling-2.28.4.dist-info/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
81
+ docling-2.28.4.dist-info/METADATA,sha256=bwT1X-5lpPBI8mpDI2-DeLloD2Rcf170xBdERgFsq-w,9982
82
+ docling-2.28.4.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
83
+ docling-2.28.4.dist-info/entry_points.txt,sha256=pIxel-UeVo1S7FhoNG5xgEfPjLZfBLi_N9TsGPtJSLo,144
84
+ docling-2.28.4.dist-info/RECORD,,