docling 2.55.0__py3-none-any.whl → 2.55.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of docling might be problematic. Click here for more details.
- docling/backend/md_backend.py +4 -1
- docling/cli/main.py +8 -1
- docling/models/readingorder_model.py +56 -5
- {docling-2.55.0.dist-info → docling-2.55.1.dist-info}/METADATA +1 -1
- {docling-2.55.0.dist-info → docling-2.55.1.dist-info}/RECORD +9 -9
- {docling-2.55.0.dist-info → docling-2.55.1.dist-info}/WHEEL +0 -0
- {docling-2.55.0.dist-info → docling-2.55.1.dist-info}/entry_points.txt +0 -0
- {docling-2.55.0.dist-info → docling-2.55.1.dist-info}/licenses/LICENSE +0 -0
- {docling-2.55.0.dist-info → docling-2.55.1.dist-info}/top_level.txt +0 -0
docling/backend/md_backend.py
CHANGED
|
@@ -249,7 +249,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|
|
249
249
|
|
|
250
250
|
# Iterates over all elements in the AST
|
|
251
251
|
# Check for different element types and process relevant details
|
|
252
|
-
if isinstance(element, marko.block.Heading) and len(element.children) > 0:
|
|
252
|
+
if (
|
|
253
|
+
isinstance(element, marko.block.Heading)
|
|
254
|
+
or isinstance(element, marko.block.SetextHeading)
|
|
255
|
+
) and len(element.children) > 0:
|
|
253
256
|
self._close_table(doc)
|
|
254
257
|
_log.debug(
|
|
255
258
|
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
|
docling/cli/main.py
CHANGED
|
@@ -355,6 +355,13 @@ def convert( # noqa: C901
|
|
|
355
355
|
help="Replace any existing text with OCR generated text over the full content.",
|
|
356
356
|
),
|
|
357
357
|
] = False,
|
|
358
|
+
tables: Annotated[
|
|
359
|
+
bool,
|
|
360
|
+
typer.Option(
|
|
361
|
+
...,
|
|
362
|
+
help="If enabled, the table structure model will be used to extract table information.",
|
|
363
|
+
),
|
|
364
|
+
] = True,
|
|
358
365
|
ocr_engine: Annotated[
|
|
359
366
|
str,
|
|
360
367
|
typer.Option(
|
|
@@ -591,7 +598,7 @@ def convert( # noqa: C901
|
|
|
591
598
|
accelerator_options=accelerator_options,
|
|
592
599
|
do_ocr=ocr,
|
|
593
600
|
ocr_options=ocr_options,
|
|
594
|
-
do_table_structure=True,
|
|
601
|
+
do_table_structure=tables,
|
|
595
602
|
do_code_enrichment=enrich_code,
|
|
596
603
|
do_formula_enrichment=enrich_formula,
|
|
597
604
|
do_picture_description=enrich_picture_description,
|
|
docling/models/readingorder_model.py
CHANGED
|
@@ -9,6 +9,7 @@ from docling_core.types.doc import (
|
|
|
9
9
|
NodeItem,
|
|
10
10
|
ProvenanceItem,
|
|
11
11
|
RefItem,
|
|
12
|
+
RichTableCell,
|
|
12
13
|
TableData,
|
|
13
14
|
)
|
|
14
15
|
from docling_core.types.doc.document import ContentLayer
|
|
@@ -103,6 +104,22 @@ class ReadingOrderModel:
|
|
|
103
104
|
else:
|
|
104
105
|
doc.add_text(parent=doc_item, label=c_label, text=c_text, prov=c_prov)
|
|
105
106
|
|
|
107
|
+
def _create_rich_cell_group(
|
|
108
|
+
self, element: BasePageElement, doc: DoclingDocument, table_item: NodeItem
|
|
109
|
+
) -> RefItem:
|
|
110
|
+
"""Create a group containing all child elements for a rich table cell."""
|
|
111
|
+
group_name = f"rich_cell_group_{len(doc.tables)}_0_0"
|
|
112
|
+
group_element = doc.add_group(
|
|
113
|
+
label=GroupLabel.UNSPECIFIED,
|
|
114
|
+
name=group_name,
|
|
115
|
+
parent=table_item,
|
|
116
|
+
)
|
|
117
|
+
|
|
118
|
+
# Add all child elements to the group
|
|
119
|
+
self._add_child_elements(element, group_element, doc)
|
|
120
|
+
|
|
121
|
+
return group_element.get_ref()
|
|
122
|
+
|
|
106
123
|
def _readingorder_elements_to_docling_doc(
|
|
107
124
|
self,
|
|
108
125
|
conv_res: ConversionResult,
|
|
@@ -197,11 +214,21 @@ class ReadingOrderModel:
|
|
|
197
214
|
)
|
|
198
215
|
|
|
199
216
|
elif isinstance(element, Table):
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
217
|
+
# Check if table has no structure prediction
|
|
218
|
+
if element.num_rows == 0 and element.num_cols == 0:
|
|
219
|
+
# Only create 1x1 table if there are children to put in it
|
|
220
|
+
if element.cluster.children:
|
|
221
|
+
# Create minimal 1x1 table with rich cell containing all children
|
|
222
|
+
tbl_data = TableData(num_rows=1, num_cols=1, table_cells=[])
|
|
223
|
+
else:
|
|
224
|
+
# Create empty table with no structure
|
|
225
|
+
tbl_data = TableData(num_rows=0, num_cols=0, table_cells=[])
|
|
226
|
+
else:
|
|
227
|
+
tbl_data = TableData(
|
|
228
|
+
num_rows=element.num_rows,
|
|
229
|
+
num_cols=element.num_cols,
|
|
230
|
+
table_cells=element.table_cells,
|
|
231
|
+
)
|
|
205
232
|
|
|
206
233
|
prov = ProvenanceItem(
|
|
207
234
|
page_no=element.page_no + 1,
|
|
@@ -231,6 +258,30 @@ class ReadingOrderModel:
|
|
|
231
258
|
|
|
232
259
|
tbl.footnotes.append(new_footnote_item.get_ref())
|
|
233
260
|
|
|
261
|
+
# Handle case where table has no structure prediction but has children
|
|
262
|
+
if (
|
|
263
|
+
element.num_rows == 0
|
|
264
|
+
and element.num_cols == 0
|
|
265
|
+
and element.cluster.children
|
|
266
|
+
):
|
|
267
|
+
# Create rich cell containing all child elements
|
|
268
|
+
rich_cell_ref = self._create_rich_cell_group(element, out_doc, tbl)
|
|
269
|
+
|
|
270
|
+
# Create rich table cell spanning the entire 1x1 table
|
|
271
|
+
rich_cell = RichTableCell(
|
|
272
|
+
text="", # Empty text since content is in the group
|
|
273
|
+
row_span=1,
|
|
274
|
+
col_span=1,
|
|
275
|
+
start_row_offset_idx=0,
|
|
276
|
+
end_row_offset_idx=1,
|
|
277
|
+
start_col_offset_idx=0,
|
|
278
|
+
end_col_offset_idx=1,
|
|
279
|
+
column_header=False,
|
|
280
|
+
row_header=False,
|
|
281
|
+
ref=rich_cell_ref,
|
|
282
|
+
)
|
|
283
|
+
out_doc.add_table_cell(table_item=tbl, cell=rich_cell)
|
|
284
|
+
|
|
234
285
|
# TODO: Consider adding children of Table.
|
|
235
286
|
|
|
236
287
|
elif isinstance(element, FigureElement):
|
|
{docling-2.55.0.dist-info → docling-2.55.1.dist-info}/METADATA
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docling
|
|
3
|
-
Version: 2.55.0
|
|
3
|
+
Version: 2.55.1
|
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
|
6
6
|
License-Expression: MIT
|
|
{docling-2.55.0.dist-info → docling-2.55.1.dist-info}/RECORD
CHANGED
|
@@ -11,7 +11,7 @@ docling/backend/docling_parse_backend.py,sha256=9rUo1vPxX6QLzGqF-2B2iEYglZg6YQ3U
|
|
|
11
11
|
docling/backend/docling_parse_v2_backend.py,sha256=3ckTfke8IICjaImlIzc3TRhG7KDuxDDba0AuCEcjA-M,9500
|
|
12
12
|
docling/backend/docling_parse_v4_backend.py,sha256=xCBbaaXjNNrOaod9tmBuCbe5mL_ipmTNG2XOxVbGG3w,7891
|
|
13
13
|
docling/backend/html_backend.py,sha256=r2m3aIKwwr8Vv2Fxri1FaZFvd4EWvTQlmSPwXeD79zg,47796
|
|
14
|
-
docling/backend/md_backend.py,sha256=
|
|
14
|
+
docling/backend/md_backend.py,sha256=TWboEPHl93pqI_Go1a3XpP-KpzI3d17xo5ZW42Ul0kY,22764
|
|
15
15
|
docling/backend/mets_gbs_backend.py,sha256=EA8sY6tbmGiysKGYPPZiNlK-i7Adn8bLTo-7Ym15hTU,12774
|
|
16
16
|
docling/backend/msexcel_backend.py,sha256=5JRbPwOjR1r45AMeIts1rj6InbOgLBf_CtAhvNPVmsQ,19157
|
|
17
17
|
docling/backend/mspowerpoint_backend.py,sha256=wJgB2JStEPfD7MPpWQlpPN7bffPxaHFUnKD4wj8SLxU,15114
|
|
@@ -31,7 +31,7 @@ docling/backend/xml/jats_backend.py,sha256=_BWpQQg3SlsHAOOj0v2qRJoVqaQzL91GqN1tK
|
|
|
31
31
|
docling/backend/xml/uspto_backend.py,sha256=Tv4CE7V5_QwxTNJPl90CAd_mAbwaLGy8S6s6evh1Xow,70910
|
|
32
32
|
docling/chunking/__init__.py,sha256=h83TDs0AuOV6oEPLAPrn9dpGKiU-2Vg6IRNo4cv6GDA,346
|
|
33
33
|
docling/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
34
|
-
docling/cli/main.py,sha256=
|
|
34
|
+
docling/cli/main.py,sha256=glci4i1KYphr_8WueEnrSQF08xvtGxXBi6EvWDGewHU,33091
|
|
35
35
|
docling/cli/models.py,sha256=rw_2JfeJ-k_iOLpz3JfgL1QbJY__W9nE23nHdov6VfU,6252
|
|
36
36
|
docling/cli/tools.py,sha256=QhtRxQG0TVrfsMqdv5i7J0_qQy1ZZyWYnHPwJl7b5oY,322
|
|
37
37
|
docling/datamodel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -61,7 +61,7 @@ docling/models/picture_description_api_model.py,sha256=o3EkV5aHW_6WzE_fdj_VRnNCr
|
|
|
61
61
|
docling/models/picture_description_base_model.py,sha256=kLthLhdlgwhootQ4_xhhcAk6A-vso5-qcsFJ3TcYfO0,2991
|
|
62
62
|
docling/models/picture_description_vlm_model.py,sha256=Uja_BQSk7F-U1J2hm4yeLguirUzKYv1K8zRyw1IYomY,4150
|
|
63
63
|
docling/models/rapid_ocr_model.py,sha256=anUVUwaj9Wubgu4FnHdYMuOVkQP_hJiLY1qRToelBoc,7700
|
|
64
|
-
docling/models/readingorder_model.py,sha256
|
|
64
|
+
docling/models/readingorder_model.py,sha256=-j-UuvnsYWqZvY0gByKz0bjcBwOhWQTHerCopig_jVs,17266
|
|
65
65
|
docling/models/table_structure_model.py,sha256=7g_mFf1YzfF8PXQfefNu6XYZu7TzJAn86zKb6IEUdCg,12518
|
|
66
66
|
docling/models/tesseract_ocr_cli_model.py,sha256=I3Gn28Y-LD8OfvyCElN9fLiNgpo2sT0uMkVt258253s,12881
|
|
67
67
|
docling/models/tesseract_ocr_model.py,sha256=GdI5Cjfi87qcehVbM3wdKRvKkl_F9A4bwTUbjXZCJYA,10745
|
|
@@ -101,9 +101,9 @@ docling/utils/orientation.py,sha256=jTyLxyT31FlOodZoBMlADHNQK2lAWKYVs5z7pXd_6Cg,
|
|
|
101
101
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
|
102
102
|
docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
|
|
103
103
|
docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
|
|
104
|
-
docling-2.55.0.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
|
105
|
-
docling-2.55.
|
|
106
|
-
docling-2.55.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
107
|
-
docling-2.55.0.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
|
|
108
|
-
docling-2.55.0.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
|
|
109
|
-
docling-2.55.0.dist-info/RECORD,,
|
|
104
|
+
docling-2.55.1.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
|
105
|
+
docling-2.55.1.dist-info/METADATA,sha256=Q0vntiJGbmAM9ONc6x8a-CeCJWpLDJPCN5k7_hhn6sA,11252
|
|
106
|
+
docling-2.55.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
107
|
+
docling-2.55.1.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
|
|
108
|
+
docling-2.55.1.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
|
|
109
|
+
docling-2.55.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|