docling 2.28.2__tar.gz → 2.28.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. {docling-2.28.2 → docling-2.28.3}/PKG-INFO +2 -2
  2. {docling-2.28.2 → docling-2.28.3}/docling/models/table_structure_model.py +16 -4
  3. {docling-2.28.2 → docling-2.28.3}/pyproject.toml +2 -2
  4. {docling-2.28.2 → docling-2.28.3}/LICENSE +0 -0
  5. {docling-2.28.2 → docling-2.28.3}/README.md +0 -0
  6. {docling-2.28.2 → docling-2.28.3}/docling/__init__.py +0 -0
  7. {docling-2.28.2 → docling-2.28.3}/docling/backend/__init__.py +0 -0
  8. {docling-2.28.2 → docling-2.28.3}/docling/backend/abstract_backend.py +0 -0
  9. {docling-2.28.2 → docling-2.28.3}/docling/backend/asciidoc_backend.py +0 -0
  10. {docling-2.28.2 → docling-2.28.3}/docling/backend/csv_backend.py +0 -0
  11. {docling-2.28.2 → docling-2.28.3}/docling/backend/docling_parse_backend.py +0 -0
  12. {docling-2.28.2 → docling-2.28.3}/docling/backend/docling_parse_v2_backend.py +0 -0
  13. {docling-2.28.2 → docling-2.28.3}/docling/backend/docling_parse_v4_backend.py +0 -0
  14. {docling-2.28.2 → docling-2.28.3}/docling/backend/docx/__init__.py +0 -0
  15. {docling-2.28.2 → docling-2.28.3}/docling/backend/docx/latex/__init__.py +0 -0
  16. {docling-2.28.2 → docling-2.28.3}/docling/backend/docx/latex/latex_dict.py +0 -0
  17. {docling-2.28.2 → docling-2.28.3}/docling/backend/docx/latex/omml.py +0 -0
  18. {docling-2.28.2 → docling-2.28.3}/docling/backend/html_backend.py +0 -0
  19. {docling-2.28.2 → docling-2.28.3}/docling/backend/json/__init__.py +0 -0
  20. {docling-2.28.2 → docling-2.28.3}/docling/backend/json/docling_json_backend.py +0 -0
  21. {docling-2.28.2 → docling-2.28.3}/docling/backend/md_backend.py +0 -0
  22. {docling-2.28.2 → docling-2.28.3}/docling/backend/msexcel_backend.py +0 -0
  23. {docling-2.28.2 → docling-2.28.3}/docling/backend/mspowerpoint_backend.py +0 -0
  24. {docling-2.28.2 → docling-2.28.3}/docling/backend/msword_backend.py +0 -0
  25. {docling-2.28.2 → docling-2.28.3}/docling/backend/pdf_backend.py +0 -0
  26. {docling-2.28.2 → docling-2.28.3}/docling/backend/pypdfium2_backend.py +0 -0
  27. {docling-2.28.2 → docling-2.28.3}/docling/backend/xml/__init__.py +0 -0
  28. {docling-2.28.2 → docling-2.28.3}/docling/backend/xml/jats_backend.py +0 -0
  29. {docling-2.28.2 → docling-2.28.3}/docling/backend/xml/uspto_backend.py +0 -0
  30. {docling-2.28.2 → docling-2.28.3}/docling/chunking/__init__.py +0 -0
  31. {docling-2.28.2 → docling-2.28.3}/docling/cli/__init__.py +0 -0
  32. {docling-2.28.2 → docling-2.28.3}/docling/cli/main.py +0 -0
  33. {docling-2.28.2 → docling-2.28.3}/docling/cli/models.py +0 -0
  34. {docling-2.28.2 → docling-2.28.3}/docling/cli/tools.py +0 -0
  35. {docling-2.28.2 → docling-2.28.3}/docling/datamodel/__init__.py +0 -0
  36. {docling-2.28.2 → docling-2.28.3}/docling/datamodel/base_models.py +0 -0
  37. {docling-2.28.2 → docling-2.28.3}/docling/datamodel/document.py +0 -0
  38. {docling-2.28.2 → docling-2.28.3}/docling/datamodel/pipeline_options.py +0 -0
  39. {docling-2.28.2 → docling-2.28.3}/docling/datamodel/settings.py +0 -0
  40. {docling-2.28.2 → docling-2.28.3}/docling/document_converter.py +0 -0
  41. {docling-2.28.2 → docling-2.28.3}/docling/exceptions.py +0 -0
  42. {docling-2.28.2 → docling-2.28.3}/docling/models/__init__.py +0 -0
  43. {docling-2.28.2 → docling-2.28.3}/docling/models/base_model.py +0 -0
  44. {docling-2.28.2 → docling-2.28.3}/docling/models/base_ocr_model.py +0 -0
  45. {docling-2.28.2 → docling-2.28.3}/docling/models/code_formula_model.py +0 -0
  46. {docling-2.28.2 → docling-2.28.3}/docling/models/document_picture_classifier.py +0 -0
  47. {docling-2.28.2 → docling-2.28.3}/docling/models/easyocr_model.py +0 -0
  48. {docling-2.28.2 → docling-2.28.3}/docling/models/factories/__init__.py +0 -0
  49. {docling-2.28.2 → docling-2.28.3}/docling/models/factories/base_factory.py +0 -0
  50. {docling-2.28.2 → docling-2.28.3}/docling/models/factories/ocr_factory.py +0 -0
  51. {docling-2.28.2 → docling-2.28.3}/docling/models/factories/picture_description_factory.py +0 -0
  52. {docling-2.28.2 → docling-2.28.3}/docling/models/hf_mlx_model.py +0 -0
  53. {docling-2.28.2 → docling-2.28.3}/docling/models/hf_vlm_model.py +0 -0
  54. {docling-2.28.2 → docling-2.28.3}/docling/models/layout_model.py +0 -0
  55. {docling-2.28.2 → docling-2.28.3}/docling/models/ocr_mac_model.py +0 -0
  56. {docling-2.28.2 → docling-2.28.3}/docling/models/page_assemble_model.py +0 -0
  57. {docling-2.28.2 → docling-2.28.3}/docling/models/page_preprocessing_model.py +0 -0
  58. {docling-2.28.2 → docling-2.28.3}/docling/models/picture_description_api_model.py +0 -0
  59. {docling-2.28.2 → docling-2.28.3}/docling/models/picture_description_base_model.py +0 -0
  60. {docling-2.28.2 → docling-2.28.3}/docling/models/picture_description_vlm_model.py +0 -0
  61. {docling-2.28.2 → docling-2.28.3}/docling/models/plugins/__init__.py +0 -0
  62. {docling-2.28.2 → docling-2.28.3}/docling/models/plugins/defaults.py +0 -0
  63. {docling-2.28.2 → docling-2.28.3}/docling/models/rapid_ocr_model.py +0 -0
  64. {docling-2.28.2 → docling-2.28.3}/docling/models/readingorder_model.py +0 -0
  65. {docling-2.28.2 → docling-2.28.3}/docling/models/tesseract_ocr_cli_model.py +0 -0
  66. {docling-2.28.2 → docling-2.28.3}/docling/models/tesseract_ocr_model.py +0 -0
  67. {docling-2.28.2 → docling-2.28.3}/docling/pipeline/__init__.py +0 -0
  68. {docling-2.28.2 → docling-2.28.3}/docling/pipeline/base_pipeline.py +0 -0
  69. {docling-2.28.2 → docling-2.28.3}/docling/pipeline/simple_pipeline.py +0 -0
  70. {docling-2.28.2 → docling-2.28.3}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  71. {docling-2.28.2 → docling-2.28.3}/docling/pipeline/vlm_pipeline.py +0 -0
  72. {docling-2.28.2 → docling-2.28.3}/docling/py.typed +0 -0
  73. {docling-2.28.2 → docling-2.28.3}/docling/utils/__init__.py +0 -0
  74. {docling-2.28.2 → docling-2.28.3}/docling/utils/accelerator_utils.py +0 -0
  75. {docling-2.28.2 → docling-2.28.3}/docling/utils/export.py +0 -0
  76. {docling-2.28.2 → docling-2.28.3}/docling/utils/glm_utils.py +0 -0
  77. {docling-2.28.2 → docling-2.28.3}/docling/utils/layout_postprocessor.py +0 -0
  78. {docling-2.28.2 → docling-2.28.3}/docling/utils/locks.py +0 -0
  79. {docling-2.28.2 → docling-2.28.3}/docling/utils/model_downloader.py +0 -0
  80. {docling-2.28.2 → docling-2.28.3}/docling/utils/ocr_utils.py +0 -0
  81. {docling-2.28.2 → docling-2.28.3}/docling/utils/profiling.py +0 -0
  82. {docling-2.28.2 → docling-2.28.3}/docling/utils/utils.py +0 -0
  83. {docling-2.28.2 → docling-2.28.3}/docling/utils/visualization.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: docling
3
- Version: 2.28.2
3
+ Version: 2.28.3
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Home-page: https://github.com/docling-project/docling
6
6
  License: MIT
@@ -28,7 +28,7 @@ Provides-Extra: vlm
28
28
  Requires-Dist: accelerate (>=1.2.1,<2.0.0) ; (sys_platform != "darwin" or platform_machine != "x86_64") and (extra == "vlm")
29
29
  Requires-Dist: beautifulsoup4 (>=4.12.3,<5.0.0)
30
30
  Requires-Dist: certifi (>=2024.7.4)
31
- Requires-Dist: docling-core[chunking] (>=2.23.1,<3.0.0)
31
+ Requires-Dist: docling-core[chunking] (>=2.24.1,<3.0.0)
32
32
  Requires-Dist: docling-ibm-models (>=3.4.0,<4.0.0)
33
33
  Requires-Dist: docling-parse (>=4.0.0,<5.0.0)
34
34
  Requires-Dist: easyocr (>=1.7,<2.0)
@@ -5,7 +5,11 @@ from typing import Iterable, Optional, Union
5
5
 
6
6
  import numpy
7
7
  from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
8
- from docling_core.types.doc.page import BoundingRectangle
8
+ from docling_core.types.doc.page import (
9
+ BoundingRectangle,
10
+ SegmentedPdfPage,
11
+ TextCellUnit,
12
+ )
9
13
  from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
10
14
  from PIL import ImageDraw
11
15
 
@@ -218,9 +222,18 @@ class TableStructureModel(BasePageModel):
218
222
 
219
223
  if len(table_bboxes):
220
224
  for table_cluster, tbl_box in in_tables:
221
-
225
+ # Check if word-level cells are available from backend:
226
+ sp = page._backend.get_segmented_page()
227
+ if sp is not None:
228
+ tcells = sp.get_cells_in_bbox(
229
+ cell_unit=TextCellUnit.WORD,
230
+ bbox=table_cluster.bbox,
231
+ )
232
+ else:
233
+ # Otherwise - we use normal (line/phrase) cells
234
+ tcells = table_cluster.cells
222
235
  tokens = []
223
- for c in table_cluster.cells:
236
+ for c in tcells:
224
237
  # Only allow non-empty strings (i.e. not whitespace-only) into the cells of a table
225
238
  if len(c.text.strip()) > 0:
226
239
  new_cell = copy.deepcopy(c)
@@ -229,7 +242,6 @@ class TableStructureModel(BasePageModel):
229
242
  scale=self.scale
230
243
  )
231
244
  )
232
-
233
245
  tokens.append(
234
246
  {
235
247
  "id": new_cell.index,
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "docling"
3
- version = "2.28.2" # DO NOT EDIT, updated automatically
3
+ version = "2.28.3" # DO NOT EDIT, updated automatically
4
4
  description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
5
5
  authors = [
6
6
  "Christoph Auer <cau@zurich.ibm.com>",
@@ -46,7 +46,7 @@ packages = [{ include = "docling" }]
46
46
  ######################
47
47
  python = "^3.9"
48
48
  pydantic = "^2.0.0"
49
- docling-core = {extras = ["chunking"], version = "^2.23.1"}
49
+ docling-core = {extras = ["chunking"], version = "^2.24.1"}
50
50
  docling-ibm-models = "^3.4.0"
51
51
  docling-parse = "^4.0.0"
52
52
  filetype = "^1.2.0"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes