docling 2.42.1__py3-none-any.whl → 2.42.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,7 +5,7 @@ from io import BytesIO
5
5
  from pathlib import Path
6
6
  from typing import Final, Optional, Union, cast
7
7
 
8
- from bs4 import BeautifulSoup, NavigableString, Tag
8
+ from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
9
9
  from bs4.element import PreformattedString
10
10
  from docling_core.types.doc import (
11
11
  DocItem,
@@ -297,7 +297,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
297
297
  ):
298
298
  parts.append(child)
299
299
  elif isinstance(child, Tag) and child.name not in ("ul", "ol"):
300
- text_part = child.get_text()
300
+ text_part = HTMLDocumentBackend.get_text(child)
301
301
  if text_part:
302
302
  parts.append(text_part)
303
303
  li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip()
@@ -417,6 +417,36 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
417
417
  content_layer=self.content_layer,
418
418
  )
419
419
 
420
@staticmethod
def get_text(item: PageElement) -> str:
    """Concatenate the text of a PageElement and all of its descendants.

    The strings are joined without any separator, except that the text
    collected from a ``<p>`` or ``<li>`` tag is followed by a single space so
    that adjacent paragraphs or list items do not run into each other.

    Comments, doctypes, declarations and processing instructions (every
    ``PreformattedString`` other than ``CData``) are markup rather than
    content and are skipped.

    Args:
        item: The node to read; either a string node or a tag.

    Returns:
        The concatenated text. It is empty when ``item`` is neither a
        ``NavigableString`` nor a ``Tag``, or when it holds no text.
    """
    # Function-scope imports: these names are only needed by this helper.
    from collections.abc import Iterator

    from bs4.element import CData, PreformattedString

    def _iter_text(node: PageElement) -> Iterator[str]:
        """Yield the text fragments below ``node`` in document order."""
        if isinstance(node, NavigableString):
            # CData carries real content; the other preformatted strings
            # (comments, doctype, ...) must not end up in the output.
            if isinstance(node, CData) or not isinstance(node, PreformattedString):
                yield node
        elif isinstance(node, Tag):
            for child in node:
                yield from _iter_text(child)
            # Block-like tags get a trailing space as a word boundary.
            if node.name in {"p", "li"}:
                yield " "

    return "".join(_iter_text(item))
449
+
420
450
  @staticmethod
421
451
  def _get_cell_spans(cell: Tag) -> tuple[int, int]:
422
452
  """Extract colspan and rowspan values from a table cell tag.
@@ -510,9 +540,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
510
540
  formula.replace_with(NavigableString(math_formula))
511
541
 
512
542
  # TODO: extract content correctly from table-cells with lists
513
- text = html_cell.text
514
-
515
- # label = html_cell.name
543
+ text = HTMLDocumentBackend.get_text(html_cell).strip()
516
544
  col_span, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
517
545
  if row_header:
518
546
  row_span -= 1
@@ -1104,8 +1104,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
1104
1104
  )
1105
1105
  _log.debug(f" spanned before row {spanned_idx}")
1106
1106
 
1107
+ # Detect equations in cell text
1108
+ text, equations = self._handle_equations_in_text(
1109
+ element=cell._element, text=cell.text
1110
+ )
1111
+ if len(equations) == 0:
1112
+ text = cell.text
1113
+ else:
1114
+ text = text.replace("<eq>", "$").replace("</eq>", "$")
1115
+
1107
1116
  table_cell = TableCell(
1108
- text=cell.text,
1117
+ text=text,
1109
1118
  row_span=spanned_idx - row_idx,
1110
1119
  col_span=cell.grid_span,
1111
1120
  start_row_offset_idx=row.grid_cols_before + row_idx,
@@ -57,7 +57,31 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
57
57
  if self.input_format is InputFormat.IMAGE:
58
58
  buf = BytesIO()
59
59
  img = Image.open(self.path_or_stream)
60
- img.save(buf, "PDF")
60
+
61
+ # Handle multi-page TIFF images
62
+ if hasattr(img, "n_frames") and img.n_frames > 1:
63
+ # Extract all frames from multi-page image
64
+ frames = []
65
+ try:
66
+ for i in range(img.n_frames):
67
+ img.seek(i)
68
+ frame = img.copy().convert("RGB")
69
+ frames.append(frame)
70
+ except EOFError:
71
+ pass
72
+
73
+ # Save as multi-page PDF
74
+ if frames:
75
+ frames[0].save(
76
+ buf, "PDF", save_all=True, append_images=frames[1:]
77
+ )
78
+ else:
79
+ # Fallback to single page if frame extraction fails
80
+ img.convert("RGB").save(buf, "PDF")
81
+ else:
82
+ # Single page image - convert to RGB and save
83
+ img.convert("RGB").save(buf, "PDF")
84
+
61
85
  buf.seek(0)
62
86
  self.path_or_stream = buf
63
87
  else:
@@ -217,7 +217,13 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
217
217
  return conv_res
218
218
 
219
219
  def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
220
- status = ConversionStatus.SUCCESS
220
+ status = conv_res.status
221
+ if status in [
222
+ ConversionStatus.PENDING,
223
+ ConversionStatus.STARTED,
224
+ ]: # preserves ConversionStatus.PARTIAL_SUCCESS
225
+ status = ConversionStatus.SUCCESS
226
+
221
227
  for page in conv_res.pages:
222
228
  if page._backend is None or not page._backend.is_valid():
223
229
  conv_res.errors.append(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.42.1
3
+ Version: 2.42.2
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -89,6 +89,7 @@ Dynamic: license-file
89
89
  [![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT)
90
90
  [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
91
91
  [![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling)
92
+ [![Chat with Dosu](https://dosu.dev/dosu-chat-badge.svg)](https://app.dosu.dev/097760a8-135e-4789-8234-90c8837d7f1c/ask?utm_source=github)
92
93
  [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/10101/badge)](https://www.bestpractices.dev/projects/10101)
93
94
  [![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/)
94
95
 
@@ -9,13 +9,13 @@ docling/backend/csv_backend.py,sha256=2g9famYG2W-ID9jEdZPxc6O8QGv1vWQfjN8pL-QMBE
9
9
  docling/backend/docling_parse_backend.py,sha256=9rUo1vPxX6QLzGqF-2B2iEYglZg6YQ3Uea00XrLluTg,7918
10
10
  docling/backend/docling_parse_v2_backend.py,sha256=3ckTfke8IICjaImlIzc3TRhG7KDuxDDba0AuCEcjA-M,9500
11
11
  docling/backend/docling_parse_v4_backend.py,sha256=qR_WRVq9JGtRioWCw6MnLWgbvXbC6Y1yds7Ol1-E6UQ,6550
12
- docling/backend/html_backend.py,sha256=gGkm3i7FpW2WCJ-_GPpOJNh1LUq1_-vRGyGURuPagck,19284
12
+ docling/backend/html_backend.py,sha256=1Sohqc1xQETx6qPw27nT0QR4EdpDQg5DlrsK3rrgv7A,20413
13
13
  docling/backend/md_backend.py,sha256=mfwGj8g2hGC-Q_HREtl_Web65uMVXD-Ie1nRqWTXzF0,21013
14
14
  docling/backend/msexcel_backend.py,sha256=cq8MQ2RSh6pqCiVrldjOerSww7dOPTWmCQoCBI57i6w,18579
15
15
  docling/backend/mspowerpoint_backend.py,sha256=wJgB2JStEPfD7MPpWQlpPN7bffPxaHFUnKD4wj8SLxU,15114
16
- docling/backend/msword_backend.py,sha256=7mzPCF4bGWZPst5ntoV3aSxH5WUu2nBP-l8lgQT3tdw,44544
16
+ docling/backend/msword_backend.py,sha256=DxMgPcq-Ao1vq7X2v8qqWeMs9MryPw_Jw3YRAAdXBtM,44904
17
17
  docling/backend/noop_backend.py,sha256=EOPbD86FzZPX-K_DpNrJh0_lC0bZz--4DpG-OagDNGY,1688
18
- docling/backend/pdf_backend.py,sha256=KE9TMuFO5WX-o5A_DAd4tEaLi4HMZ4XjKdpllItVkWM,2238
18
+ docling/backend/pdf_backend.py,sha256=sUBrCz1zvt6E7sVl4xHtrkpTBClOK0vBV2lLi_TRHNg,3237
19
19
  docling/backend/pypdfium2_backend.py,sha256=8dVniLHgiTdJuDbYr66kPp6Ccv5ZDlqDMEbA2xIfS7U,13370
20
20
  docling/backend/docx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
21
  docling/backend/docx/latex/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -74,7 +74,7 @@ docling/models/vlm_models_inline/hf_transformers_model.py,sha256=LAnWFIHGblWln6D
74
74
  docling/models/vlm_models_inline/mlx_model.py,sha256=p-H6wG31iVRoOjsqYaCVa4pEzxMP3vzLcsUatMjDJDQ,5948
75
75
  docling/pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
76
76
  docling/pipeline/asr_pipeline.py,sha256=tQkhu9fXdkSuYIL22xzV2YRUlQh-9qktHBbs2qeXhJI,9070
77
- docling/pipeline/base_pipeline.py,sha256=14yQrDjsojl4RgbBjKFSEfVBYR_sULZfBI1uDzFLi8Y,9331
77
+ docling/pipeline/base_pipeline.py,sha256=iwUqmttXF9D2myXyCAaIqFuGjBFhPkjAybcSAGpww-Q,9525
78
78
  docling/pipeline/simple_pipeline.py,sha256=TXZOwR7hZRji462ZTIpte0VJjzbxvNVE8dbLFANDhSU,2253
79
79
  docling/pipeline/standard_pdf_pipeline.py,sha256=yFishq4Cu01BiBGHk3Irr7ogcTQKeSC0QZImQVAhIaY,12740
80
80
  docling/pipeline/vlm_pipeline.py,sha256=0lj8tbXNpYF8OLBoLqP2BZfFpTHi40RoHVfvO_Nah4Q,15349
@@ -91,9 +91,9 @@ docling/utils/orientation.py,sha256=jTyLxyT31FlOodZoBMlADHNQK2lAWKYVs5z7pXd_6Cg,
91
91
  docling/utils/profiling.py,sha256=YaMGoB9MMZpagF9mb5ndoHj8Lpb9aIdb7El-Pl7IcFs,1753
92
92
  docling/utils/utils.py,sha256=kJtIYuzXeOyJHYlxmLAo7dGM5rEsDa1i84qEsUj1nio,1908
93
93
  docling/utils/visualization.py,sha256=tY2ylE2aiQKkmzlSLnFW-HTfFyqUUMguW18ldd1PLfo,2868
94
- docling-2.42.1.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
95
- docling-2.42.1.dist-info/METADATA,sha256=d46NOPDEps6dVLLMh3tWBCEQv7b_bwQQ46ndyqVO-ag,10310
96
- docling-2.42.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
97
- docling-2.42.1.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
98
- docling-2.42.1.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
99
- docling-2.42.1.dist-info/RECORD,,
94
+ docling-2.42.2.dist-info/licenses/LICENSE,sha256=mBb7ErEcM8VS9OhiGHnQ2kk75HwPhr54W1Oiz3965MY,1088
95
+ docling-2.42.2.dist-info/METADATA,sha256=1u5N4PTeuTbyxNgK9QK5DuqVf4cmSHOMcHlCeV7j5Do,10449
96
+ docling-2.42.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
97
+ docling-2.42.2.dist-info/entry_points.txt,sha256=hzVlbeE0aMSTQ9S0-NTYN0Hmgsn6qL_EA2qX4UbkAuY,149
98
+ docling-2.42.2.dist-info/top_level.txt,sha256=vkIywP-USjFyYo1AIRQbWQQaL3xB5jf8vkCYdTIfNic,8
99
+ docling-2.42.2.dist-info/RECORD,,