docling 2.42.1__tar.gz → 2.42.2__tar.gz

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (133)
  1. {docling-2.42.1 → docling-2.42.2}/PKG-INFO +2 -1
  2. {docling-2.42.1 → docling-2.42.2}/README.md +1 -0
  3. {docling-2.42.1 → docling-2.42.2}/docling/backend/html_backend.py +33 -5
  4. {docling-2.42.1 → docling-2.42.2}/docling/backend/msword_backend.py +10 -1
  5. {docling-2.42.1 → docling-2.42.2}/docling/backend/pdf_backend.py +25 -1
  6. {docling-2.42.1 → docling-2.42.2}/docling/pipeline/base_pipeline.py +7 -1
  7. {docling-2.42.1 → docling-2.42.2}/docling.egg-info/PKG-INFO +2 -1
  8. {docling-2.42.1 → docling-2.42.2}/pyproject.toml +1 -1
  9. {docling-2.42.1 → docling-2.42.2}/tests/test_input_doc.py +23 -0
  10. {docling-2.42.1 → docling-2.42.2}/LICENSE +0 -0
  11. {docling-2.42.1 → docling-2.42.2}/docling/__init__.py +0 -0
  12. {docling-2.42.1 → docling-2.42.2}/docling/backend/__init__.py +0 -0
  13. {docling-2.42.1 → docling-2.42.2}/docling/backend/abstract_backend.py +0 -0
  14. {docling-2.42.1 → docling-2.42.2}/docling/backend/asciidoc_backend.py +0 -0
  15. {docling-2.42.1 → docling-2.42.2}/docling/backend/csv_backend.py +0 -0
  16. {docling-2.42.1 → docling-2.42.2}/docling/backend/docling_parse_backend.py +0 -0
  17. {docling-2.42.1 → docling-2.42.2}/docling/backend/docling_parse_v2_backend.py +0 -0
  18. {docling-2.42.1 → docling-2.42.2}/docling/backend/docling_parse_v4_backend.py +0 -0
  19. {docling-2.42.1 → docling-2.42.2}/docling/backend/docx/__init__.py +0 -0
  20. {docling-2.42.1 → docling-2.42.2}/docling/backend/docx/latex/__init__.py +0 -0
  21. {docling-2.42.1 → docling-2.42.2}/docling/backend/docx/latex/latex_dict.py +0 -0
  22. {docling-2.42.1 → docling-2.42.2}/docling/backend/docx/latex/omml.py +0 -0
  23. {docling-2.42.1 → docling-2.42.2}/docling/backend/json/__init__.py +0 -0
  24. {docling-2.42.1 → docling-2.42.2}/docling/backend/json/docling_json_backend.py +0 -0
  25. {docling-2.42.1 → docling-2.42.2}/docling/backend/md_backend.py +0 -0
  26. {docling-2.42.1 → docling-2.42.2}/docling/backend/msexcel_backend.py +0 -0
  27. {docling-2.42.1 → docling-2.42.2}/docling/backend/mspowerpoint_backend.py +0 -0
  28. {docling-2.42.1 → docling-2.42.2}/docling/backend/noop_backend.py +0 -0
  29. {docling-2.42.1 → docling-2.42.2}/docling/backend/pypdfium2_backend.py +0 -0
  30. {docling-2.42.1 → docling-2.42.2}/docling/backend/xml/__init__.py +0 -0
  31. {docling-2.42.1 → docling-2.42.2}/docling/backend/xml/jats_backend.py +0 -0
  32. {docling-2.42.1 → docling-2.42.2}/docling/backend/xml/uspto_backend.py +0 -0
  33. {docling-2.42.1 → docling-2.42.2}/docling/chunking/__init__.py +0 -0
  34. {docling-2.42.1 → docling-2.42.2}/docling/cli/__init__.py +0 -0
  35. {docling-2.42.1 → docling-2.42.2}/docling/cli/main.py +0 -0
  36. {docling-2.42.1 → docling-2.42.2}/docling/cli/models.py +0 -0
  37. {docling-2.42.1 → docling-2.42.2}/docling/cli/tools.py +0 -0
  38. {docling-2.42.1 → docling-2.42.2}/docling/datamodel/__init__.py +0 -0
  39. {docling-2.42.1 → docling-2.42.2}/docling/datamodel/accelerator_options.py +0 -0
  40. {docling-2.42.1 → docling-2.42.2}/docling/datamodel/asr_model_specs.py +0 -0
  41. {docling-2.42.1 → docling-2.42.2}/docling/datamodel/base_models.py +0 -0
  42. {docling-2.42.1 → docling-2.42.2}/docling/datamodel/document.py +0 -0
  43. {docling-2.42.1 → docling-2.42.2}/docling/datamodel/layout_model_specs.py +0 -0
  44. {docling-2.42.1 → docling-2.42.2}/docling/datamodel/pipeline_options.py +0 -0
  45. {docling-2.42.1 → docling-2.42.2}/docling/datamodel/pipeline_options_asr_model.py +0 -0
  46. {docling-2.42.1 → docling-2.42.2}/docling/datamodel/pipeline_options_vlm_model.py +0 -0
  47. {docling-2.42.1 → docling-2.42.2}/docling/datamodel/settings.py +0 -0
  48. {docling-2.42.1 → docling-2.42.2}/docling/datamodel/vlm_model_specs.py +0 -0
  49. {docling-2.42.1 → docling-2.42.2}/docling/document_converter.py +0 -0
  50. {docling-2.42.1 → docling-2.42.2}/docling/exceptions.py +0 -0
  51. {docling-2.42.1 → docling-2.42.2}/docling/models/__init__.py +0 -0
  52. {docling-2.42.1 → docling-2.42.2}/docling/models/api_vlm_model.py +0 -0
  53. {docling-2.42.1 → docling-2.42.2}/docling/models/base_model.py +0 -0
  54. {docling-2.42.1 → docling-2.42.2}/docling/models/base_ocr_model.py +0 -0
  55. {docling-2.42.1 → docling-2.42.2}/docling/models/code_formula_model.py +0 -0
  56. {docling-2.42.1 → docling-2.42.2}/docling/models/document_picture_classifier.py +0 -0
  57. {docling-2.42.1 → docling-2.42.2}/docling/models/easyocr_model.py +0 -0
  58. {docling-2.42.1 → docling-2.42.2}/docling/models/factories/__init__.py +0 -0
  59. {docling-2.42.1 → docling-2.42.2}/docling/models/factories/base_factory.py +0 -0
  60. {docling-2.42.1 → docling-2.42.2}/docling/models/factories/ocr_factory.py +0 -0
  61. {docling-2.42.1 → docling-2.42.2}/docling/models/factories/picture_description_factory.py +0 -0
  62. {docling-2.42.1 → docling-2.42.2}/docling/models/layout_model.py +0 -0
  63. {docling-2.42.1 → docling-2.42.2}/docling/models/ocr_mac_model.py +0 -0
  64. {docling-2.42.1 → docling-2.42.2}/docling/models/page_assemble_model.py +0 -0
  65. {docling-2.42.1 → docling-2.42.2}/docling/models/page_preprocessing_model.py +0 -0
  66. {docling-2.42.1 → docling-2.42.2}/docling/models/picture_description_api_model.py +0 -0
  67. {docling-2.42.1 → docling-2.42.2}/docling/models/picture_description_base_model.py +0 -0
  68. {docling-2.42.1 → docling-2.42.2}/docling/models/picture_description_vlm_model.py +0 -0
  69. {docling-2.42.1 → docling-2.42.2}/docling/models/plugins/__init__.py +0 -0
  70. {docling-2.42.1 → docling-2.42.2}/docling/models/plugins/defaults.py +0 -0
  71. {docling-2.42.1 → docling-2.42.2}/docling/models/rapid_ocr_model.py +0 -0
  72. {docling-2.42.1 → docling-2.42.2}/docling/models/readingorder_model.py +0 -0
  73. {docling-2.42.1 → docling-2.42.2}/docling/models/table_structure_model.py +0 -0
  74. {docling-2.42.1 → docling-2.42.2}/docling/models/tesseract_ocr_cli_model.py +0 -0
  75. {docling-2.42.1 → docling-2.42.2}/docling/models/tesseract_ocr_model.py +0 -0
  76. {docling-2.42.1 → docling-2.42.2}/docling/models/utils/__init__.py +0 -0
  77. {docling-2.42.1 → docling-2.42.2}/docling/models/utils/hf_model_download.py +0 -0
  78. {docling-2.42.1 → docling-2.42.2}/docling/models/vlm_models_inline/__init__.py +0 -0
  79. {docling-2.42.1 → docling-2.42.2}/docling/models/vlm_models_inline/hf_transformers_model.py +0 -0
  80. {docling-2.42.1 → docling-2.42.2}/docling/models/vlm_models_inline/mlx_model.py +0 -0
  81. {docling-2.42.1 → docling-2.42.2}/docling/pipeline/__init__.py +0 -0
  82. {docling-2.42.1 → docling-2.42.2}/docling/pipeline/asr_pipeline.py +0 -0
  83. {docling-2.42.1 → docling-2.42.2}/docling/pipeline/simple_pipeline.py +0 -0
  84. {docling-2.42.1 → docling-2.42.2}/docling/pipeline/standard_pdf_pipeline.py +0 -0
  85. {docling-2.42.1 → docling-2.42.2}/docling/pipeline/vlm_pipeline.py +0 -0
  86. {docling-2.42.1 → docling-2.42.2}/docling/py.typed +0 -0
  87. {docling-2.42.1 → docling-2.42.2}/docling/utils/__init__.py +0 -0
  88. {docling-2.42.1 → docling-2.42.2}/docling/utils/accelerator_utils.py +0 -0
  89. {docling-2.42.1 → docling-2.42.2}/docling/utils/api_image_request.py +0 -0
  90. {docling-2.42.1 → docling-2.42.2}/docling/utils/export.py +0 -0
  91. {docling-2.42.1 → docling-2.42.2}/docling/utils/glm_utils.py +0 -0
  92. {docling-2.42.1 → docling-2.42.2}/docling/utils/layout_postprocessor.py +0 -0
  93. {docling-2.42.1 → docling-2.42.2}/docling/utils/locks.py +0 -0
  94. {docling-2.42.1 → docling-2.42.2}/docling/utils/model_downloader.py +0 -0
  95. {docling-2.42.1 → docling-2.42.2}/docling/utils/ocr_utils.py +0 -0
  96. {docling-2.42.1 → docling-2.42.2}/docling/utils/orientation.py +0 -0
  97. {docling-2.42.1 → docling-2.42.2}/docling/utils/profiling.py +0 -0
  98. {docling-2.42.1 → docling-2.42.2}/docling/utils/utils.py +0 -0
  99. {docling-2.42.1 → docling-2.42.2}/docling/utils/visualization.py +0 -0
  100. {docling-2.42.1 → docling-2.42.2}/docling.egg-info/SOURCES.txt +0 -0
  101. {docling-2.42.1 → docling-2.42.2}/docling.egg-info/dependency_links.txt +0 -0
  102. {docling-2.42.1 → docling-2.42.2}/docling.egg-info/entry_points.txt +0 -0
  103. {docling-2.42.1 → docling-2.42.2}/docling.egg-info/requires.txt +0 -0
  104. {docling-2.42.1 → docling-2.42.2}/docling.egg-info/top_level.txt +0 -0
  105. {docling-2.42.1 → docling-2.42.2}/setup.cfg +0 -0
  106. {docling-2.42.1 → docling-2.42.2}/tests/test_asr_pipeline.py +0 -0
  107. {docling-2.42.1 → docling-2.42.2}/tests/test_backend_asciidoc.py +0 -0
  108. {docling-2.42.1 → docling-2.42.2}/tests/test_backend_csv.py +0 -0
  109. {docling-2.42.1 → docling-2.42.2}/tests/test_backend_docling_json.py +0 -0
  110. {docling-2.42.1 → docling-2.42.2}/tests/test_backend_docling_parse.py +0 -0
  111. {docling-2.42.1 → docling-2.42.2}/tests/test_backend_docling_parse_v2.py +0 -0
  112. {docling-2.42.1 → docling-2.42.2}/tests/test_backend_docling_parse_v4.py +0 -0
  113. {docling-2.42.1 → docling-2.42.2}/tests/test_backend_html.py +0 -0
  114. {docling-2.42.1 → docling-2.42.2}/tests/test_backend_jats.py +0 -0
  115. {docling-2.42.1 → docling-2.42.2}/tests/test_backend_markdown.py +0 -0
  116. {docling-2.42.1 → docling-2.42.2}/tests/test_backend_msexcel.py +0 -0
  117. {docling-2.42.1 → docling-2.42.2}/tests/test_backend_msword.py +0 -0
  118. {docling-2.42.1 → docling-2.42.2}/tests/test_backend_patent_uspto.py +0 -0
  119. {docling-2.42.1 → docling-2.42.2}/tests/test_backend_pdfium.py +0 -0
  120. {docling-2.42.1 → docling-2.42.2}/tests/test_backend_pptx.py +0 -0
  121. {docling-2.42.1 → docling-2.42.2}/tests/test_backend_webp.py +0 -0
  122. {docling-2.42.1 → docling-2.42.2}/tests/test_cli.py +0 -0
  123. {docling-2.42.1 → docling-2.42.2}/tests/test_code_formula.py +0 -0
  124. {docling-2.42.1 → docling-2.42.2}/tests/test_data_gen_flag.py +0 -0
  125. {docling-2.42.1 → docling-2.42.2}/tests/test_document_picture_classifier.py +0 -0
  126. {docling-2.42.1 → docling-2.42.2}/tests/test_e2e_conversion.py +0 -0
  127. {docling-2.42.1 → docling-2.42.2}/tests/test_e2e_ocr_conversion.py +0 -0
  128. {docling-2.42.1 → docling-2.42.2}/tests/test_interfaces.py +0 -0
  129. {docling-2.42.1 → docling-2.42.2}/tests/test_invalid_input.py +0 -0
  130. {docling-2.42.1 → docling-2.42.2}/tests/test_legacy_format_transform.py +0 -0
  131. {docling-2.42.1 → docling-2.42.2}/tests/test_ocr_utils.py +0 -0
  132. {docling-2.42.1 → docling-2.42.2}/tests/test_options.py +0 -0
  133. {docling-2.42.1 → docling-2.42.2}/tests/test_settings_load.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.42.1
3
+ Version: 2.42.2
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -89,6 +89,7 @@ Dynamic: license-file
89
89
  [![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT)
90
90
  [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
91
91
  [![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling)
92
+ [![Chat with Dosu](https://dosu.dev/dosu-chat-badge.svg)](https://app.dosu.dev/097760a8-135e-4789-8234-90c8837d7f1c/ask?utm_source=github)
92
93
  [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/10101/badge)](https://www.bestpractices.dev/projects/10101)
93
94
  [![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/)
94
95
 
@@ -21,6 +21,7 @@
21
21
  [![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT)
22
22
  [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
23
23
  [![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling)
24
+ [![Chat with Dosu](https://dosu.dev/dosu-chat-badge.svg)](https://app.dosu.dev/097760a8-135e-4789-8234-90c8837d7f1c/ask?utm_source=github)
24
25
  [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/10101/badge)](https://www.bestpractices.dev/projects/10101)
25
26
  [![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/)
26
27
 
@@ -5,7 +5,7 @@ from io import BytesIO
5
5
  from pathlib import Path
6
6
  from typing import Final, Optional, Union, cast
7
7
 
8
- from bs4 import BeautifulSoup, NavigableString, Tag
8
+ from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
9
9
  from bs4.element import PreformattedString
10
10
  from docling_core.types.doc import (
11
11
  DocItem,
@@ -297,7 +297,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
297
297
  ):
298
298
  parts.append(child)
299
299
  elif isinstance(child, Tag) and child.name not in ("ul", "ol"):
300
- text_part = child.get_text()
300
+ text_part = HTMLDocumentBackend.get_text(child)
301
301
  if text_part:
302
302
  parts.append(text_part)
303
303
  li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip()
@@ -417,6 +417,36 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
417
417
  content_layer=self.content_layer,
418
418
  )
419
419
 
420
+ @staticmethod
421
+ def get_text(item: PageElement) -> str:
422
+ """Concatenate all child strings of a PageElement.
423
+
424
+ This method is equivalent to `PageElement.get_text()` but also considers
425
+ certain tags. When called on a <p> or <li> tags, it returns the text with a
426
+ trailing space, otherwise the text is concatenated without separators.
427
+ """
428
+
429
+ def _extract_text_recursively(item: PageElement) -> list[str]:
430
+ """Recursively extract text from all child nodes."""
431
+ result: list[str] = []
432
+
433
+ if isinstance(item, NavigableString):
434
+ result = [item]
435
+ elif isinstance(item, Tag):
436
+ tag = cast(Tag, item)
437
+ parts: list[str] = []
438
+ for child in tag:
439
+ parts.extend(_extract_text_recursively(child))
440
+ result.append(
441
+ "".join(parts) + " " if tag.name in {"p", "li"} else "".join(parts)
442
+ )
443
+
444
+ return result
445
+
446
+ parts: list[str] = _extract_text_recursively(item)
447
+
448
+ return "".join(parts)
449
+
420
450
  @staticmethod
421
451
  def _get_cell_spans(cell: Tag) -> tuple[int, int]:
422
452
  """Extract colspan and rowspan values from a table cell tag.
@@ -510,9 +540,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
510
540
  formula.replace_with(NavigableString(math_formula))
511
541
 
512
542
  # TODO: extract content correctly from table-cells with lists
513
- text = html_cell.text
514
-
515
- # label = html_cell.name
543
+ text = HTMLDocumentBackend.get_text(html_cell).strip()
516
544
  col_span, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
517
545
  if row_header:
518
546
  row_span -= 1
@@ -1104,8 +1104,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
1104
1104
  )
1105
1105
  _log.debug(f" spanned before row {spanned_idx}")
1106
1106
 
1107
+ # Detect equations in cell text
1108
+ text, equations = self._handle_equations_in_text(
1109
+ element=cell._element, text=cell.text
1110
+ )
1111
+ if len(equations) == 0:
1112
+ text = cell.text
1113
+ else:
1114
+ text = text.replace("<eq>", "$").replace("</eq>", "$")
1115
+
1107
1116
  table_cell = TableCell(
1108
- text=cell.text,
1117
+ text=text,
1109
1118
  row_span=spanned_idx - row_idx,
1110
1119
  col_span=cell.grid_span,
1111
1120
  start_row_offset_idx=row.grid_cols_before + row_idx,
@@ -57,7 +57,31 @@ class PdfDocumentBackend(PaginatedDocumentBackend):
57
57
  if self.input_format is InputFormat.IMAGE:
58
58
  buf = BytesIO()
59
59
  img = Image.open(self.path_or_stream)
60
- img.save(buf, "PDF")
60
+
61
+ # Handle multi-page TIFF images
62
+ if hasattr(img, "n_frames") and img.n_frames > 1:
63
+ # Extract all frames from multi-page image
64
+ frames = []
65
+ try:
66
+ for i in range(img.n_frames):
67
+ img.seek(i)
68
+ frame = img.copy().convert("RGB")
69
+ frames.append(frame)
70
+ except EOFError:
71
+ pass
72
+
73
+ # Save as multi-page PDF
74
+ if frames:
75
+ frames[0].save(
76
+ buf, "PDF", save_all=True, append_images=frames[1:]
77
+ )
78
+ else:
79
+ # Fallback to single page if frame extraction fails
80
+ img.convert("RGB").save(buf, "PDF")
81
+ else:
82
+ # Single page image - convert to RGB and save
83
+ img.convert("RGB").save(buf, "PDF")
84
+
61
85
  buf.seek(0)
62
86
  self.path_or_stream = buf
63
87
  else:
@@ -217,7 +217,13 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
217
217
  return conv_res
218
218
 
219
219
  def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
220
- status = ConversionStatus.SUCCESS
220
+ status = conv_res.status
221
+ if status in [
222
+ ConversionStatus.PENDING,
223
+ ConversionStatus.STARTED,
224
+ ]: # preserves ConversionStatus.PARTIAL_SUCCESS
225
+ status = ConversionStatus.SUCCESS
226
+
221
227
  for page in conv_res.pages:
222
228
  if page._backend is None or not page._backend.is_valid():
223
229
  conv_res.errors.append(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.42.1
3
+ Version: 2.42.2
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -89,6 +89,7 @@ Dynamic: license-file
89
89
  [![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT)
90
90
  [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling)
91
91
  [![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling)
92
+ [![Chat with Dosu](https://dosu.dev/dosu-chat-badge.svg)](https://app.dosu.dev/097760a8-135e-4789-8234-90c8837d7f1c/ask?utm_source=github)
92
93
  [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/10101/badge)](https://www.bestpractices.dev/projects/10101)
93
94
  [![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/)
94
95
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "docling"
3
- version = "2.42.1" # DO NOT EDIT, updated automatically
3
+ version = "2.42.2" # DO NOT EDIT, updated automatically
4
4
  description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
5
5
  license = "MIT"
6
6
  keywords = [
@@ -243,3 +243,26 @@ def _make_input_doc_from_stream(doc_stream):
243
243
  backend=PdfFormatOption().backend, # use default
244
244
  )
245
245
  return in_doc
246
+
247
+
248
+ def test_tiff_two_pages():
249
+ tiff_path = Path("./tests/data/tiff/2206.01062.tif")
250
+ doc = InputDocument(
251
+ path_or_stream=tiff_path,
252
+ format=InputFormat.IMAGE,
253
+ backend=PdfFormatOption().backend, # use default backend
254
+ )
255
+ assert doc.valid is True
256
+ assert doc.page_count == 2
257
+
258
+ # Expect two full-page rectangles
259
+ rects_page1 = doc._backend.load_page(0).get_bitmap_rects()
260
+ rects_page2 = doc._backend.load_page(1).get_bitmap_rects()
261
+
262
+ page1_rect = next(rects_page1)
263
+ page2_rect = next(rects_page2)
264
+
265
+ assert page1_rect.t == page2_rect.t == 0
266
+ assert page1_rect.l == page2_rect.l == 0
267
+ assert page1_rect.r == page2_rect.r == 612.0
268
+ assert page1_rect.b == page2_rect.b == 792.0
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes