docling 2.55.0__py3-none-any.whl → 2.55.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of docling might be problematic. Click here for more details.

docling/backend/md_backend.py CHANGED
@@ -249,7 +249,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
249
249
 
250
250
  # Iterates over all elements in the AST
251
251
  # Check for different element types and process relevant details
252
- if isinstance(element, marko.block.Heading) and len(element.children) > 0:
252
+ if (
253
+ isinstance(element, marko.block.Heading)
254
+ or isinstance(element, marko.block.SetextHeading)
255
+ ) and len(element.children) > 0:
253
256
  self._close_table(doc)
254
257
  _log.debug(
255
258
  f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
docling/cli/main.py CHANGED
@@ -355,6 +355,13 @@ def convert( # noqa: C901
355
355
  help="Replace any existing text with OCR generated text over the full content.",
356
356
  ),
357
357
  ] = False,
358
+ tables: Annotated[
359
+ bool,
360
+ typer.Option(
361
+ ...,
362
+ help="If enabled, the table structure model will be used to extract table information.",
363
+ ),
364
+ ] = True,
358
365
  ocr_engine: Annotated[
359
366
  str,
360
367
  typer.Option(
@@ -591,7 +598,7 @@ def convert( # noqa: C901
591
598
  accelerator_options=accelerator_options,
592
599
  do_ocr=ocr,
593
600
  ocr_options=ocr_options,
594
- do_table_structure=True,
601
+ do_table_structure=tables,
595
602
  do_code_enrichment=enrich_code,
596
603
  do_formula_enrichment=enrich_formula,
597
604
  do_picture_description=enrich_picture_description,
docling/models/readingorder_model.py CHANGED
@@ -9,6 +9,7 @@ from docling_core.types.doc import (
9
9
  NodeItem,
10
10
  ProvenanceItem,
11
11
  RefItem,
12
+ RichTableCell,
12
13
  TableData,
13
14
  )
14
15
  from docling_core.types.doc.document import ContentLayer
@@ -103,6 +104,22 @@ class ReadingOrderModel:
103
104
  else:
104
105
  doc.add_text(parent=doc_item, label=c_label, text=c_text, prov=c_prov)
105
106
 
107
+ def _create_rich_cell_group(
108
+ self, element: BasePageElement, doc: DoclingDocument, table_item: NodeItem
109
+ ) -> RefItem:
110
+ """Create a group containing all child elements for a rich table cell."""
111
+ group_name = f"rich_cell_group_{len(doc.tables)}_0_0"
112
+ group_element = doc.add_group(
113
+ label=GroupLabel.UNSPECIFIED,
114
+ name=group_name,
115
+ parent=table_item,
116
+ )
117
+
118
+ # Add all child elements to the group
119
+ self._add_child_elements(element, group_element, doc)
120
+
121
+ return group_element.get_ref()
122
+
106
123
  def _readingorder_elements_to_docling_doc(
107
124
  self,
108
125
  conv_res: ConversionResult,
@@ -197,11 +214,21 @@ class ReadingOrderModel:
197
214
  )
198
215
 
199
216
  elif isinstance(element, Table):
200
- tbl_data = TableData(
201
- num_rows=element.num_rows,
202
- num_cols=element.num_cols,
203
- table_cells=element.table_cells,
204
- )
217
+ # Check if table has no structure prediction
218
+ if element.num_rows == 0 and element.num_cols == 0:
219
+ # Only create 1x1 table if there are children to put in it
220
+ if element.cluster.children:
221
+ # Create minimal 1x1 table with rich cell containing all children
222
+ tbl_data = TableData(num_rows=1, num_cols=1, table_cells=[])
223
+ else:
224
+ # Create empty table with no structure
225
+ tbl_data = TableData(num_rows=0, num_cols=0, table_cells=[])
226
+ else:
227
+ tbl_data = TableData(
228
+ num_rows=element.num_rows,
229
+ num_cols=element.num_cols,
230
+ table_cells=element.table_cells,
231
+ )
205
232
 
206
233
  prov = ProvenanceItem(
207
234
  page_no=element.page_no + 1,
@@ -231,6 +258,30 @@ class ReadingOrderModel:
231
258
 
232
259
  tbl.footnotes.append(new_footnote_item.get_ref())
233
260
 
261
+ # Handle case where table has no structure prediction but has children
262
+ if (
263
+ element.num_rows == 0
264
+ and element.num_cols == 0
265
+ and element.cluster.children
266
+ ):
267
+ # Create rich cell containing all child elements
268
+ rich_cell_ref = self._create_rich_cell_group(element, out_doc, tbl)
269
+
270
+ # Create rich table cell spanning the entire 1x1 table
271
+ rich_cell = RichTableCell(
272
+ text="", # Empty text since content is in the group
273
+ row_span=1,
274
+ col_span=1,
275
+ start_row_offset_idx=0,
276
+ end_row_offset_idx=1,
277
+ start_col_offset_idx=0,
278
+ end_col_offset_idx=1,
279
+ column_header=False,
280
+ row_header=False,
281
+ ref=rich_cell_ref,
282
+ )
283
+ out_doc.add_table_cell(table_item=tbl, cell=rich_cell)
284
+
234
285
  # TODO: Consider adding children of Table.
235
286
 
236
287
  elif isinstance(element, FigureElement):
docling-2.55.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.55.0
3
+ Version: 2.55.1
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
docling-2.55.1.dist-info/RECORD CHANGED
@@ -11,7 +11,7 @@ docling/backend/docling_parse_backend.py,sha256=9rUo1vPxX6QLzGqF-2B2iEYglZg6YQ3U
11
11
  docling/backend/docling_parse_v2_backend.py,sha256=3ckTfke8IICjaImlIzc3TRhG7KDuxDDba0AuCEcjA-M,9500
12
12
  docling/backend/docling_parse_v4_backend.py,sha256=xCBbaaXjNNrOaod9tmBuCbe5mL_ipmTNG2XOxVbGG3w,7891
13
13
  docling/backend/html_backend.py,sha256=r2m3aIKwwr8Vv2Fxri1FaZFvd4EWvTQlmSPwXeD79zg,47796
14
- docling/backend/md_backend.py,sha256=zrOUYoIYudUfigwnXRQocb_M4G_ptYfblNgr6BNTYQw,22678
14
+ docling/backend/md_backend.py,sha256=TWboEPHl93pqI_Go1a3XpP-KpzI3d17xo5ZW42Ul0kY,22764
15
15
  docling/backend/mets_gbs_backend.py,sha256=EA8sY6tbmGiysKGYPPZiNlK-i7Adn8bLTo-7Ym15hTU,12774
16
16
  docling/backend/msexcel_backend.py,sha256=5JRbPwOjR1r45AMeIts1rj6InbOgLBf_CtAhvNPVmsQ,19157
17
17
  docling/backend/mspowerpoint_backend.py,sha256=wJgB2JStEPfD7MPpWQlpPN7bffPxaHFUnKD4wj8SLxU,15114
@@ -31,7 +31,7 @@ docling/backend/xml/jats_backend.py,sha256=_BWpQQg3SlsHAOOj0v2qRJoVqaQzL91GqN1tK
31
31
  docling/backend/xml/uspto_backend.py,sha256=Tv4CE7V5_QwxTNJPl90CAd_mAbwaLGy8S6s6evh1Xow,70910
32
32
  docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
33
33
  docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
34
- docling/cli/main.py,sha256=UX-5fRGVP_yGxTQez0x1PNnaNKRgWdcXGoPCHy-0uFM,32887
34
+ docling/cli/main.py,sha256=glci4i1KYphr_8WueEnrSQF08xvtGxXBi6EvWDGewHU,33091
35
35
  docling/cli/models.py,sha256=rw_2JfeJ-k_iOLpz3JfgL1QbJY__W9nE23nHdov6VfU,6252
36
36
  docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
37
37
  docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -61,7 +61,7 @@ docling/models/picture_description_api_model.py,sha256=o3EkV5aHW_6WzE_fdj_VRnNCr
61
61
  docling/models/picture_description_base_model.py,sha256=kLthLhdlgwhootQ4_xhhcAk6A-vso5-qcsFJ3TcYfO0,2991
62
62
  docling/models/picture_description_vlm_model.py,sha256=Uja_BQSk7F-U1J2hm4yeLguirUzKYv1K8zRyw1IYomY,4150
63
63
  docling/models/rapid_ocr_model.py,sha256=anUVUwaj9Wubgu4FnHdYMuOVkQP_hJiLY1qRToelBoc,7700
64
- docling/models/readingorder_model.py,sha256=_usJdpM4GMWeGGneEwLLxa9grIGQb0XnNMugV72jGbY,14911
64
+ docling/models/readingorder_model.py,sha256=-j-UuvnsYWqZvY0gByKz0bjcBwOhWQTHerCopig_jVs,17266
65
65
  docling/models/table_structure_model.py,sha256=7g_mFf1YzfF8PXQfefNu6XYZu7TzJAn86zKb6IEUdCg,12518
66
66
  docling/models/tesseract_ocr_cli_model.py,sha256=I3Gn28Y-LD8OfvyCElN9fLiNgpo2sT0uMkVt258253s,12881
67
67
  docling/models/tesseract_ocr_model.py,sha256=GdI5Cjfi87qcehVbM3wdKRvKkl_F9A4bwTUbjXZCJYA,10745
@@ -101,9 +101,9 @@ docling/utils/orientation.py,sha256=jTyLxyT31FlOodZoBMlADHNQK2lAWKYVs5z7pXd_6Cg,
101
101
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
102
102
  docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
103
103
  docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
104
- docling-2.55.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
105
- docling-2.55.0.dist-info/METADATA,sha256=e1RK_bATZ2Q_Ie9kC6uHFCj99D7pkW678jxk_l0CHxk,11252
106
- docling-2.55.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
107
- docling-2.55.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
108
- docling-2.55.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
109
- docling-2.55.0.dist-info/RECORD,,
104
+ docling-2.55.1.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
105
+ docling-2.55.1.dist-info/METADATA,sha256=Q0vntiJGbmAM9ONc6x8a-CeCJWpLDJPCN5k7_hhn6sA,11252
106
+ docling-2.55.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
107
+ docling-2.55.1.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
108
+ docling-2.55.1.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
109
+ docling-2.55.1.dist-info/RECORD,,