docling 2.55.0__tar.gz → 2.55.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

Files changed (147)
  1. {docling-2.55.0 → docling-2.55.1}/PKG-INFO +1 -1
  2. {docling-2.55.0 → docling-2.55.1}/docling/backend/md_backend.py +4 -1
  3. {docling-2.55.0 → docling-2.55.1}/docling/cli/main.py +8 -1
  4. {docling-2.55.0 → docling-2.55.1}/docling/models/readingorder_model.py +56 -5
  5. {docling-2.55.0 → docling-2.55.1}/docling.egg-info/PKG-INFO +1 -1
  6. {docling-2.55.0 → docling-2.55.1}/pyproject.toml +2 -1
  7. {docling-2.55.0 → docling-2.55.1}/LICENSE +0 -0
  8. {docling-2.55.0 → docling-2.55.1}/README.md +0 -0
  9. {docling-2.55.0 → docling-2.55.1}/docling/__init__.py +0 -0
  10. {docling-2.55.0 → docling-2.55.1}/docling/backend/__init__.py +0 -0
  11. {docling-2.55.0 → docling-2.55.1}/docling/backend/abstract_backend.py +0 -0
  12. {docling-2.55.0 → docling-2.55.1}/docling/backend/asciidoc_backend.py +0 -0
  13. {docling-2.55.0 → docling-2.55.1}/docling/backend/csv_backend.py +0 -0
  14. {docling-2.55.0 → docling-2.55.1}/docling/backend/docling_parse_backend.py +0 -0
  15. {docling-2.55.0 → docling-2.55.1}/docling/backend/docling_parse_v2_backend.py +0 -0
  16. {docling-2.55.0 → docling-2.55.1}/docling/backend/docling_parse_v4_backend.py +0 -0
  17. {docling-2.55.0 → docling-2.55.1}/docling/backend/docx/__init__.py +0 -0
  18. {docling-2.55.0 → docling-2.55.1}/docling/backend/docx/latex/__init__.py +0 -0
  19. {docling-2.55.0 → docling-2.55.1}/docling/backend/docx/latex/latex_dict.py +0 -0
  20. {docling-2.55.0 → docling-2.55.1}/docling/backend/docx/latex/omml.py +0 -0
  21. {docling-2.55.0 → docling-2.55.1}/docling/backend/html_backend.py +0 -0
  22. {docling-2.55.0 → docling-2.55.1}/docling/backend/json/__init__.py +0 -0
  23. {docling-2.55.0 → docling-2.55.1}/docling/backend/json/docling_json_backend.py +0 -0
  24. {docling-2.55.0 → docling-2.55.1}/docling/backend/mets_gbs_backend.py +0 -0
  25. {docling-2.55.0 → docling-2.55.1}/docling/backend/msexcel_backend.py +0 -0
  26. {docling-2.55.0 → docling-2.55.1}/docling/backend/mspowerpoint_backend.py +0 -0
  27. {docling-2.55.0 → docling-2.55.1}/docling/backend/msword_backend.py +0 -0
  28. {docling-2.55.0 → docling-2.55.1}/docling/backend/noop_backend.py +0 -0
  29. {docling-2.55.0 → docling-2.55.1}/docling/backend/pdf_backend.py +0 -0
  30. {docling-2.55.0 → docling-2.55.1}/docling/backend/pypdfium2_backend.py +0 -0
  31. {docling-2.55.0 → docling-2.55.1}/docling/backend/webvtt_backend.py +0 -0
  32. {docling-2.55.0 → docling-2.55.1}/docling/backend/xml/__init__.py +0 -0
  33. {docling-2.55.0 → docling-2.55.1}/docling/backend/xml/jats_backend.py +0 -0
  34. {docling-2.55.0 → docling-2.55.1}/docling/backend/xml/uspto_backend.py +0 -0
  35. {docling-2.55.0 → docling-2.55.1}/docling/chunking/__init__.py +0 -0
  36. {docling-2.55.0 → docling-2.55.1}/docling/cli/__init__.py +0 -0
  37. {docling-2.55.0 → docling-2.55.1}/docling/cli/models.py +0 -0
  38. {docling-2.55.0 → docling-2.55.1}/docling/cli/tools.py +0 -0
  39. {docling-2.55.0 → docling-2.55.1}/docling/datamodel/__init__.py +0 -0
  40. {docling-2.55.0 → docling-2.55.1}/docling/datamodel/accelerator_options.py +0 -0
  41. {docling-2.55.0 → docling-2.55.1}/docling/datamodel/asr_model_specs.py +0 -0
  42. {docling-2.55.0 → docling-2.55.1}/docling/datamodel/base_models.py +0 -0
  43. {docling-2.55.0 → docling-2.55.1}/docling/datamodel/document.py +0 -0
  44. {docling-2.55.0 → docling-2.55.1}/docling/datamodel/extraction.py +0 -0
  45. {docling-2.55.0 → docling-2.55.1}/docling/datamodel/layout_model_specs.py +0 -0
  46. {docling-2.55.0 → docling-2.55.1}/docling/datamodel/pipeline_options.py +0 -0
  47. {docling-2.55.0 → docling-2.55.1}/docling/datamodel/pipeline_options_asr_model.py +0 -0
  48. {docling-2.55.0 → docling-2.55.1}/docling/datamodel/pipeline_options_vlm_model.py +0 -0
  49. {docling-2.55.0 → docling-2.55.1}/docling/datamodel/settings.py +0 -0
  50. {docling-2.55.0 → docling-2.55.1}/docling/datamodel/vlm_model_specs.py +0 -0
  51. {docling-2.55.0 → docling-2.55.1}/docling/document_converter.py +0 -0
  52. {docling-2.55.0 → docling-2.55.1}/docling/document_extractor.py +0 -0
  53. {docling-2.55.0 → docling-2.55.1}/docling/exceptions.py +0 -0
  54. {docling-2.55.0 → docling-2.55.1}/docling/models/__init__.py +0 -0
  55. {docling-2.55.0 → docling-2.55.1}/docling/models/api_vlm_model.py +0 -0
  56. {docling-2.55.0 → docling-2.55.1}/docling/models/base_model.py +0 -0
  57. {docling-2.55.0 → docling-2.55.1}/docling/models/base_ocr_model.py +0 -0
  58. {docling-2.55.0 → docling-2.55.1}/docling/models/code_formula_model.py +0 -0
  59. {docling-2.55.0 → docling-2.55.1}/docling/models/document_picture_classifier.py +0 -0
  60. {docling-2.55.0 → docling-2.55.1}/docling/models/easyocr_model.py +0 -0
  61. {docling-2.55.0 → docling-2.55.1}/docling/models/factories/__init__.py +0 -0
  62. {docling-2.55.0 → docling-2.55.1}/docling/models/factories/base_factory.py +0 -0
  63. {docling-2.55.0 → docling-2.55.1}/docling/models/factories/ocr_factory.py +0 -0
  64. {docling-2.55.0 → docling-2.55.1}/docling/models/factories/picture_description_factory.py +0 -0
  65. {docling-2.55.0 → docling-2.55.1}/docling/models/layout_model.py +0 -0
  66. {docling-2.55.0 → docling-2.55.1}/docling/models/ocr_mac_model.py +0 -0
  67. {docling-2.55.0 → docling-2.55.1}/docling/models/page_assemble_model.py +0 -0
  68. {docling-2.55.0 → docling-2.55.1}/docling/models/page_preprocessing_model.py +0 -0
  69. {docling-2.55.0 → docling-2.55.1}/docling/models/picture_description_api_model.py +0 -0
  70. {docling-2.55.0 → docling-2.55.1}/docling/models/picture_description_base_model.py +0 -0
  71. {docling-2.55.0 → docling-2.55.1}/docling/models/picture_description_vlm_model.py +0 -0
  72. {docling-2.55.0 → docling-2.55.1}/docling/models/plugins/__init__.py +0 -0
  73. {docling-2.55.0 → docling-2.55.1}/docling/models/plugins/defaults.py +0 -0
  74. {docling-2.55.0 → docling-2.55.1}/docling/models/rapid_ocr_model.py +0 -0
  75. {docling-2.55.0 → docling-2.55.1}/docling/models/table_structure_model.py +0 -0
  76. {docling-2.55.0 → docling-2.55.1}/docling/models/tesseract_ocr_cli_model.py +0 -0
  77. {docling-2.55.0 → docling-2.55.1}/docling/models/tesseract_ocr_model.py +0 -0
  78. {docling-2.55.0 → docling-2.55.1}/docling/models/utils/__init__.py +0 -0
  79. {docling-2.55.0 → docling-2.55.1}/docling/models/utils/generation_utils.py +0 -0
  80. {docling-2.55.0 → docling-2.55.1}/docling/models/utils/hf_model_download.py +0 -0
  81. {docling-2.55.0 → docling-2.55.1}/docling/models/vlm_models_inline/__init__.py +0 -0
  82. {docling-2.55.0 → docling-2.55.1}/docling/models/vlm_models_inline/hf_transformers_model.py +0 -0
  83. {docling-2.55.0 → docling-2.55.1}/docling/models/vlm_models_inline/mlx_model.py +0 -0
  84. {docling-2.55.0 → docling-2.55.1}/docling/models/vlm_models_inline/nuextract_transformers_model.py +0 -0
  85. {docling-2.55.0 → docling-2.55.1}/docling/models/vlm_models_inline/vllm_model.py +0 -0
  86. {docling-2.55.0 → docling-2.55.1}/docling/pipeline/__init__.py +0 -0
  87. {docling-2.55.0 → docling-2.55.1}/docling/pipeline/asr_pipeline.py +0 -0
  88. {docling-2.55.0 → docling-2.55.1}/docling/pipeline/base_extraction_pipeline.py +0 -0
  89. {docling-2.55.0 → docling-2.55.1}/docling/pipeline/base_pipeline.py +0 -0
  90. {docling-2.55.0 → docling-2.55.1}/docling/pipeline/extraction_vlm_pipeline.py +0 -0
  91. {docling-2.55.0 → docling-2.55.1}/docling/pipeline/simple_pipeline.py +0 -0
  92. {docling-2.55.0 → docling-2.55.1}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  93. {docling-2.55.0 → docling-2.55.1}/docling/pipeline/threaded_standard_pdf_pipeline.py +0 -0
  94. {docling-2.55.0 → docling-2.55.1}/docling/pipeline/vlm_pipeline.py +0 -0
  95. {docling-2.55.0 → docling-2.55.1}/docling/py.typed +0 -0
  96. {docling-2.55.0 → docling-2.55.1}/docling/utils/__init__.py +0 -0
  97. {docling-2.55.0 → docling-2.55.1}/docling/utils/accelerator_utils.py +0 -0
  98. {docling-2.55.0 → docling-2.55.1}/docling/utils/api_image_request.py +0 -0
  99. {docling-2.55.0 → docling-2.55.1}/docling/utils/export.py +0 -0
  100. {docling-2.55.0 → docling-2.55.1}/docling/utils/glm_utils.py +0 -0
  101. {docling-2.55.0 → docling-2.55.1}/docling/utils/layout_postprocessor.py +0 -0
  102. {docling-2.55.0 → docling-2.55.1}/docling/utils/locks.py +0 -0
  103. {docling-2.55.0 → docling-2.55.1}/docling/utils/model_downloader.py +0 -0
  104. {docling-2.55.0 → docling-2.55.1}/docling/utils/ocr_utils.py +0 -0
  105. {docling-2.55.0 → docling-2.55.1}/docling/utils/orientation.py +0 -0
  106. {docling-2.55.0 → docling-2.55.1}/docling/utils/profiling.py +0 -0
  107. {docling-2.55.0 → docling-2.55.1}/docling/utils/utils.py +0 -0
  108. {docling-2.55.0 → docling-2.55.1}/docling/utils/visualization.py +0 -0
  109. {docling-2.55.0 → docling-2.55.1}/docling.egg-info/SOURCES.txt +0 -0
  110. {docling-2.55.0 → docling-2.55.1}/docling.egg-info/dependency_links.txt +0 -0
  111. {docling-2.55.0 → docling-2.55.1}/docling.egg-info/entry_points.txt +0 -0
  112. {docling-2.55.0 → docling-2.55.1}/docling.egg-info/requires.txt +0 -0
  113. {docling-2.55.0 → docling-2.55.1}/docling.egg-info/top_level.txt +0 -0
  114. {docling-2.55.0 → docling-2.55.1}/setup.cfg +0 -0
  115. {docling-2.55.0 → docling-2.55.1}/tests/test_asr_pipeline.py +0 -0
  116. {docling-2.55.0 → docling-2.55.1}/tests/test_backend_asciidoc.py +0 -0
  117. {docling-2.55.0 → docling-2.55.1}/tests/test_backend_csv.py +0 -0
  118. {docling-2.55.0 → docling-2.55.1}/tests/test_backend_docling_json.py +0 -0
  119. {docling-2.55.0 → docling-2.55.1}/tests/test_backend_docling_parse.py +0 -0
  120. {docling-2.55.0 → docling-2.55.1}/tests/test_backend_docling_parse_v2.py +0 -0
  121. {docling-2.55.0 → docling-2.55.1}/tests/test_backend_docling_parse_v4.py +0 -0
  122. {docling-2.55.0 → docling-2.55.1}/tests/test_backend_html.py +0 -0
  123. {docling-2.55.0 → docling-2.55.1}/tests/test_backend_jats.py +0 -0
  124. {docling-2.55.0 → docling-2.55.1}/tests/test_backend_markdown.py +0 -0
  125. {docling-2.55.0 → docling-2.55.1}/tests/test_backend_mets_gbs.py +0 -0
  126. {docling-2.55.0 → docling-2.55.1}/tests/test_backend_msexcel.py +0 -0
  127. {docling-2.55.0 → docling-2.55.1}/tests/test_backend_msword.py +0 -0
  128. {docling-2.55.0 → docling-2.55.1}/tests/test_backend_patent_uspto.py +0 -0
  129. {docling-2.55.0 → docling-2.55.1}/tests/test_backend_pdfium.py +0 -0
  130. {docling-2.55.0 → docling-2.55.1}/tests/test_backend_pptx.py +0 -0
  131. {docling-2.55.0 → docling-2.55.1}/tests/test_backend_vtt.py +0 -0
  132. {docling-2.55.0 → docling-2.55.1}/tests/test_backend_webp.py +0 -0
  133. {docling-2.55.0 → docling-2.55.1}/tests/test_cli.py +0 -0
  134. {docling-2.55.0 → docling-2.55.1}/tests/test_code_formula.py +0 -0
  135. {docling-2.55.0 → docling-2.55.1}/tests/test_data_gen_flag.py +0 -0
  136. {docling-2.55.0 → docling-2.55.1}/tests/test_document_picture_classifier.py +0 -0
  137. {docling-2.55.0 → docling-2.55.1}/tests/test_e2e_conversion.py +0 -0
  138. {docling-2.55.0 → docling-2.55.1}/tests/test_e2e_ocr_conversion.py +0 -0
  139. {docling-2.55.0 → docling-2.55.1}/tests/test_extraction.py +0 -0
  140. {docling-2.55.0 → docling-2.55.1}/tests/test_input_doc.py +0 -0
  141. {docling-2.55.0 → docling-2.55.1}/tests/test_interfaces.py +0 -0
  142. {docling-2.55.0 → docling-2.55.1}/tests/test_invalid_input.py +0 -0
  143. {docling-2.55.0 → docling-2.55.1}/tests/test_legacy_format_transform.py +0 -0
  144. {docling-2.55.0 → docling-2.55.1}/tests/test_ocr_utils.py +0 -0
  145. {docling-2.55.0 → docling-2.55.1}/tests/test_options.py +0 -0
  146. {docling-2.55.0 → docling-2.55.1}/tests/test_settings_load.py +0 -0
  147. {docling-2.55.0 → docling-2.55.1}/tests/test_threaded_pipeline.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.55.0
3
+ Version: 2.55.1
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -249,7 +249,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
249
249
 
250
250
  # Iterates over all elements in the AST
251
251
  # Check for different element types and process relevant details
252
- if isinstance(element, marko.block.Heading) and len(element.children) > 0:
252
+ if (
253
+ isinstance(element, marko.block.Heading)
254
+ or isinstance(element, marko.block.SetextHeading)
255
+ ) and len(element.children) > 0:
253
256
  self._close_table(doc)
254
257
  _log.debug(
255
258
  f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
@@ -355,6 +355,13 @@ def convert( # noqa: C901
355
355
  help="Replace any existing text with OCR generated text over the full content.",
356
356
  ),
357
357
  ] = False,
358
+ tables: Annotated[
359
+ bool,
360
+ typer.Option(
361
+ ...,
362
+ help="If enabled, the table structure model will be used to extract table information.",
363
+ ),
364
+ ] = True,
358
365
  ocr_engine: Annotated[
359
366
  str,
360
367
  typer.Option(
@@ -591,7 +598,7 @@ def convert( # noqa: C901
591
598
  accelerator_options=accelerator_options,
592
599
  do_ocr=ocr,
593
600
  ocr_options=ocr_options,
594
- do_table_structure=True,
601
+ do_table_structure=tables,
595
602
  do_code_enrichment=enrich_code,
596
603
  do_formula_enrichment=enrich_formula,
597
604
  do_picture_description=enrich_picture_description,
@@ -9,6 +9,7 @@ from docling_core.types.doc import (
9
9
  NodeItem,
10
10
  ProvenanceItem,
11
11
  RefItem,
12
+ RichTableCell,
12
13
  TableData,
13
14
  )
14
15
  from docling_core.types.doc.document import ContentLayer
@@ -103,6 +104,22 @@ class ReadingOrderModel:
103
104
  else:
104
105
  doc.add_text(parent=doc_item, label=c_label, text=c_text, prov=c_prov)
105
106
 
107
+ def _create_rich_cell_group(
108
+ self, element: BasePageElement, doc: DoclingDocument, table_item: NodeItem
109
+ ) -> RefItem:
110
+ """Create a group containing all child elements for a rich table cell."""
111
+ group_name = f"rich_cell_group_{len(doc.tables)}_0_0"
112
+ group_element = doc.add_group(
113
+ label=GroupLabel.UNSPECIFIED,
114
+ name=group_name,
115
+ parent=table_item,
116
+ )
117
+
118
+ # Add all child elements to the group
119
+ self._add_child_elements(element, group_element, doc)
120
+
121
+ return group_element.get_ref()
122
+
106
123
  def _readingorder_elements_to_docling_doc(
107
124
  self,
108
125
  conv_res: ConversionResult,
@@ -197,11 +214,21 @@ class ReadingOrderModel:
197
214
  )
198
215
 
199
216
  elif isinstance(element, Table):
200
- tbl_data = TableData(
201
- num_rows=element.num_rows,
202
- num_cols=element.num_cols,
203
- table_cells=element.table_cells,
204
- )
217
+ # Check if table has no structure prediction
218
+ if element.num_rows == 0 and element.num_cols == 0:
219
+ # Only create 1x1 table if there are children to put in it
220
+ if element.cluster.children:
221
+ # Create minimal 1x1 table with rich cell containing all children
222
+ tbl_data = TableData(num_rows=1, num_cols=1, table_cells=[])
223
+ else:
224
+ # Create empty table with no structure
225
+ tbl_data = TableData(num_rows=0, num_cols=0, table_cells=[])
226
+ else:
227
+ tbl_data = TableData(
228
+ num_rows=element.num_rows,
229
+ num_cols=element.num_cols,
230
+ table_cells=element.table_cells,
231
+ )
205
232
 
206
233
  prov = ProvenanceItem(
207
234
  page_no=element.page_no + 1,
@@ -231,6 +258,30 @@ class ReadingOrderModel:
231
258
 
232
259
  tbl.footnotes.append(new_footnote_item.get_ref())
233
260
 
261
+ # Handle case where table has no structure prediction but has children
262
+ if (
263
+ element.num_rows == 0
264
+ and element.num_cols == 0
265
+ and element.cluster.children
266
+ ):
267
+ # Create rich cell containing all child elements
268
+ rich_cell_ref = self._create_rich_cell_group(element, out_doc, tbl)
269
+
270
+ # Create rich table cell spanning the entire 1x1 table
271
+ rich_cell = RichTableCell(
272
+ text="", # Empty text since content is in the group
273
+ row_span=1,
274
+ col_span=1,
275
+ start_row_offset_idx=0,
276
+ end_row_offset_idx=1,
277
+ start_col_offset_idx=0,
278
+ end_col_offset_idx=1,
279
+ column_header=False,
280
+ row_header=False,
281
+ ref=rich_cell_ref,
282
+ )
283
+ out_doc.add_table_cell(table_item=tbl, cell=rich_cell)
284
+
234
285
  # TODO: Consider adding children of Table.
235
286
 
236
287
  elif isinstance(element, FigureElement):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.55.0
3
+ Version: 2.55.1
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "docling"
3
- version = "2.55.0" # DO NOT EDIT, updated automatically
3
+ version = "2.55.1" # DO NOT EDIT, updated automatically
4
4
  description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
5
5
  license = "MIT"
6
6
  keywords = [
@@ -123,6 +123,7 @@ dev = [
123
123
  "pytest~=8.3",
124
124
  "pytest-cov>=6.1.1",
125
125
  "pytest-dependency~=0.6",
126
+ "pytest-durations~=1.6.1",
126
127
  "pytest-xdist~=3.3",
127
128
  "ipykernel~=6.29",
128
129
  "ipywidgets~=8.1",
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes