docling 2.42.1__py3-none-any.whl → 2.42.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling/backend/html_backend.py +33 -5
- docling/backend/msword_backend.py +10 -1
- docling/backend/pdf_backend.py +25 -1
- docling/pipeline/base_pipeline.py +7 -1
- {docling-2.42.1.dist-info → docling-2.42.2.dist-info}/METADATA +2 -1
- {docling-2.42.1.dist-info → docling-2.42.2.dist-info}/RECORD +10 -10
- {docling-2.42.1.dist-info → docling-2.42.2.dist-info}/WHEEL +0 -0
- {docling-2.42.1.dist-info → docling-2.42.2.dist-info}/entry_points.txt +0 -0
- {docling-2.42.1.dist-info → docling-2.42.2.dist-info}/licenses/LICENSE +0 -0
- {docling-2.42.1.dist-info → docling-2.42.2.dist-info}/top_level.txt +0 -0
docling/backend/html_backend.py
CHANGED
@@ -5,7 +5,7 @@ from io import BytesIO
|
|
5
5
|
from pathlib import Path
|
6
6
|
from typing import Final, Optional, Union, cast
|
7
7
|
|
8
|
-
from bs4 import BeautifulSoup, NavigableString, Tag
|
8
|
+
from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
|
9
9
|
from bs4.element import PreformattedString
|
10
10
|
from docling_core.types.doc import (
|
11
11
|
DocItem,
|
@@ -297,7 +297,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
297
297
|
):
|
298
298
|
parts.append(child)
|
299
299
|
elif isinstance(child, Tag) and child.name not in ("ul", "ol"):
|
300
|
-
text_part =
|
300
|
+
text_part = HTMLDocumentBackend.get_text(child)
|
301
301
|
if text_part:
|
302
302
|
parts.append(text_part)
|
303
303
|
li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip()
|
@@ -417,6 +417,36 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
417
417
|
content_layer=self.content_layer,
|
418
418
|
)
|
419
419
|
|
420
|
+
@staticmethod
|
421
|
+
def get_text(item: PageElement) -> str:
|
422
|
+
"""Concatenate all child strings of a PageElement.
|
423
|
+
|
424
|
+
This method is equivalent to `PageElement.get_text()` but also considers
|
425
|
+
certain tags. When called on a <p> or <li> tags, it returns the text with a
|
426
|
+
trailing space, otherwise the text is concatenated without separators.
|
427
|
+
"""
|
428
|
+
|
429
|
+
def _extract_text_recursively(item: PageElement) -> list[str]:
|
430
|
+
"""Recursively extract text from all child nodes."""
|
431
|
+
result: list[str] = []
|
432
|
+
|
433
|
+
if isinstance(item, NavigableString):
|
434
|
+
result = [item]
|
435
|
+
elif isinstance(item, Tag):
|
436
|
+
tag = cast(Tag, item)
|
437
|
+
parts: list[str] = []
|
438
|
+
for child in tag:
|
439
|
+
parts.extend(_extract_text_recursively(child))
|
440
|
+
result.append(
|
441
|
+
"".join(parts) + " " if tag.name in {"p", "li"} else "".join(parts)
|
442
|
+
)
|
443
|
+
|
444
|
+
return result
|
445
|
+
|
446
|
+
parts: list[str] = _extract_text_recursively(item)
|
447
|
+
|
448
|
+
return "".join(parts)
|
449
|
+
|
420
450
|
@staticmethod
|
421
451
|
def _get_cell_spans(cell: Tag) -> tuple[int, int]:
|
422
452
|
"""Extract colspan and rowspan values from a table cell tag.
|
@@ -510,9 +540,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|
510
540
|
formula.replace_with(NavigableString(math_formula))
|
511
541
|
|
512
542
|
# TODO: extract content correctly from table-cells with lists
|
513
|
-
text = html_cell.
|
514
|
-
|
515
|
-
# label = html_cell.name
|
543
|
+
text = HTMLDocumentBackend.get_text(html_cell).strip()
|
516
544
|
col_span, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
|
517
545
|
if row_header:
|
518
546
|
row_span -= 1
|
docling/backend/msword_backend.py
CHANGED
@@ -1104,8 +1104,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|
1104
1104
|
)
|
1105
1105
|
_log.debug(f" spanned before row {spanned_idx}")
|
1106
1106
|
|
1107
|
+
# Detect equations in cell text
|
1108
|
+
text, equations = self._handle_equations_in_text(
|
1109
|
+
element=cell._element, text=cell.text
|
1110
|
+
)
|
1111
|
+
if len(equations) == 0:
|
1112
|
+
text = cell.text
|
1113
|
+
else:
|
1114
|
+
text = text.replace("<eq>", "$").replace("</eq>", "$")
|
1115
|
+
|
1107
1116
|
table_cell = TableCell(
|
1108
|
-
text=
|
1117
|
+
text=text,
|
1109
1118
|
row_span=spanned_idx - row_idx,
|
1110
1119
|
col_span=cell.grid_span,
|
1111
1120
|
start_row_offset_idx=row.grid_cols_before + row_idx,
|
docling/backend/pdf_backend.py
CHANGED
@@ -57,7 +57,31 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
|
|
57
57
|
if self.input_format is InputFormat.IMAGE:
|
58
58
|
buf = BytesIO()
|
59
59
|
img = Image.open(self.path_or_stream)
|
60
|
-
|
60
|
+
|
61
|
+
# Handle multi-page TIFF images
|
62
|
+
if hasattr(img, "n_frames") and img.n_frames > 1:
|
63
|
+
# Extract all frames from multi-page image
|
64
|
+
frames = []
|
65
|
+
try:
|
66
|
+
for i in range(img.n_frames):
|
67
|
+
img.seek(i)
|
68
|
+
frame = img.copy().convert("RGB")
|
69
|
+
frames.append(frame)
|
70
|
+
except EOFError:
|
71
|
+
pass
|
72
|
+
|
73
|
+
# Save as multi-page PDF
|
74
|
+
if frames:
|
75
|
+
frames[0].save(
|
76
|
+
buf, "PDF", save_all=True, append_images=frames[1:]
|
77
|
+
)
|
78
|
+
else:
|
79
|
+
# Fallback to single page if frame extraction fails
|
80
|
+
img.convert("RGB").save(buf, "PDF")
|
81
|
+
else:
|
82
|
+
# Single page image - convert to RGB and save
|
83
|
+
img.convert("RGB").save(buf, "PDF")
|
84
|
+
|
61
85
|
buf.seek(0)
|
62
86
|
self.path_or_stream = buf
|
63
87
|
else:
|
docling/pipeline/base_pipeline.py
CHANGED
@@ -217,7 +217,13 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
|
217
217
|
return conv_res
|
218
218
|
|
219
219
|
def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
|
220
|
-
status =
|
220
|
+
status = conv_res.status
|
221
|
+
if status in [
|
222
|
+
ConversionStatus.PENDING,
|
223
|
+
ConversionStatus.STARTED,
|
224
|
+
]: # preserves ConversionStatus.PARTIAL_SUCCESS
|
225
|
+
status = ConversionStatus.SUCCESS
|
226
|
+
|
221
227
|
for page in conv_res.pages:
|
222
228
|
if page._backend is None or not page._backend.is_valid():
|
223
229
|
conv_res.errors.append(
|
{docling-2.42.1.dist-info → docling-2.42.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: docling
|
3
|
-
Version: 2.42.1
|
3
|
+
Version: 2.42.2
|
4
4
|
Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
|
5
5
|
Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
|
6
6
|
License-Expression: MIT
|
@@ -89,6 +89,7 @@ Dynamic: license-file
|
|
89
89
|
[](https://opensource.org/licenses/MIT)
|
90
90
|
[](https://pepy.tech/projects/docling)
|
91
91
|
[](https://apify.com/vancura/docling)
|
92
|
+
[](https://app.dosu.dev/097760a8-135e-4789-8234-90c8837d7f1c/ask?utm_source=github)
|
92
93
|
[](https://www.bestpractices.dev/projects/10101)
|
93
94
|
[](https://lfaidata.foundation/projects/)
|
94
95
|
|
{docling-2.42.1.dist-info → docling-2.42.2.dist-info}/RECORD
CHANGED
@@ -9,13 +9,13 @@ docling/backend/csv_backend.py,sha256=2g9famYG2W-ID9jEdZPxc6O8QGv1vWQfjN8pL-QMBE
|
|
9
9
|
docling/backend/docling_parse_backend.py,sha256=9rUo1vPxX6QLzGqF-2B2iEYglZg6YQ3Uea00XrLluTg,7918
|
10
10
|
docling/backend/docling_parse_v2_backend.py,sha256=3ckTfke8IICjaImlIzc3TRhG7KDuxDDba0AuCEcjA-M,9500
|
11
11
|
docling/backend/docling_parse_v4_backend.py,sha256=qR_WRVq9JGtRioWCw6MnLWgbvXbC6Y1yds7Ol1-E6UQ,6550
|
12
|
-
docling/backend/html_backend.py,sha256=
|
12
|
+
docling/backend/html_backend.py,sha256=1Sohqc1xQETx6qPw27nT0QR4EdpDQg5DlrsK3rrgv7A,20413
|
13
13
|
docling/backend/md_backend.py,sha256=mfwGj8g2hGC-Q_HREtl_Web65uMVXD-Ie1nRqWTXzF0,21013
|
14
14
|
docling/backend/msexcel_backend.py,sha256=cq8MQ2RSh6pqCiVrldjOerSww7dOPTWmCQoCBI57i6w,18579
|
15
15
|
docling/backend/mspowerpoint_backend.py,sha256=wJgB2JStEPfD7MPpWQlpPN7bffPxaHFUnKD4wj8SLxU,15114
|
16
|
-
docling/backend/msword_backend.py,sha256=
|
16
|
+
docling/backend/msword_backend.py,sha256=DxMgPcq-Ao1vq7X2v8qqWeMs9MryPw_Jw3YRAAdXBtM,44904
|
17
17
|
docling/backend/noop_backend.py,sha256=EOPbD86FzZPX-K_DpNrJh0_lC0bZz--4DpG-OagDNGY,1688
|
18
|
-
docling/backend/pdf_backend.py,sha256=
|
18
|
+
docling/backend/pdf_backend.py,sha256=sUBrCz1zvt6E7sVl4xHtrkpTBClOK0vBV2lLi_TRHNg,3237
|
19
19
|
docling/backend/pypdfium2_backend.py,sha256=8dVniLHgiTdJuDbYr66kPp6Ccv5ZDlqDMEbA2xIfS7U,13370
|
20
20
|
docling/backend/docx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
21
21
|
docling/backend/docx/latex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -74,7 +74,7 @@ docling/models/vlm_models_inline/hf_transformers_model.py,sha256=LAnWFIHGblWln6D
|
|
74
74
|
docling/models/vlm_models_inline/mlx_model.py,sha256=p-H6wG31iVRoOjsqYaCVa4pEzxMP3vzLcsUatMjDJDQ,5948
|
75
75
|
docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
76
76
|
docling/pipeline/asr_pipeline.py,sha256=tQkhu9fXdkSuYIL22xzV2YRUlQh-9qktHBbs2qeXhJI,9070
|
77
|
-
docling/pipeline/base_pipeline.py,sha256=
|
77
|
+
docling/pipeline/base_pipeline.py,sha256=iwUqmttXF9D2myXyCAaIqFuGjBFhPkjAybcSAGpww-Q,9525
|
78
78
|
docling/pipeline/simple_pipeline.py,sha256=TXZOwR7hZRji462ZTIpte0VJjzbxvNVE8dbLFANDhSU,2253
|
79
79
|
docling/pipeline/standard_pdf_pipeline.py,sha256=yFishq4Cu01BiBGHk3Irr7ogcTQKeSC0QZImQVAhIaY,12740
|
80
80
|
docling/pipeline/vlm_pipeline.py,sha256=0lj8tbXNpYF8OLBoLqP2BZfFpTHi40RoHVfvO_Nah4Q,15349
|
@@ -91,9 +91,9 @@ docling/utils/orientation.py,sha256=jTyLxyT31FlOodZoBMlADHNQK2lAWKYVs5z7pXd_6Cg,
|
|
91
91
|
docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
|
92
92
|
docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
|
93
93
|
docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
|
94
|
-
docling-2.42.1.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
95
|
-
docling-2.42.
|
96
|
-
docling-2.42.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
97
|
-
docling-2.42.1.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
|
98
|
-
docling-2.42.1.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
|
99
|
-
docling-2.42.1.dist-info/RECORD,,
|
94
|
+
docling-2.42.2.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
|
95
|
+
docling-2.42.2.dist-info/METADATA,sha256=1u5N4PTeuTbyxNgK9QK5DuqVf4cmSHOMcHlCeV7j5Do,10449
|
96
|
+
docling-2.42.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
97
|
+
docling-2.42.2.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
|
98
|
+
docling-2.42.2.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
|
99
|
+
docling-2.42.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|