docling 2.36.0__tar.gz → 2.37.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126)
  1. {docling-2.36.0 → docling-2.37.0}/PKG-INFO +2 -3
  2. {docling-2.36.0 → docling-2.37.0}/docling/backend/asciidoc_backend.py +39 -18
  3. {docling-2.36.0 → docling-2.37.0}/docling/backend/docling_parse_backend.py +61 -59
  4. {docling-2.36.0 → docling-2.37.0}/docling/backend/docling_parse_v2_backend.py +72 -62
  5. {docling-2.36.0 → docling-2.37.0}/docling/backend/docling_parse_v4_backend.py +21 -19
  6. {docling-2.36.0 → docling-2.37.0}/docling/backend/mspowerpoint_backend.py +72 -113
  7. {docling-2.36.0 → docling-2.37.0}/docling/backend/msword_backend.py +28 -18
  8. {docling-2.36.0 → docling-2.37.0}/docling/backend/pypdfium2_backend.py +127 -53
  9. {docling-2.36.0 → docling-2.37.0}/docling/datamodel/base_models.py +10 -3
  10. {docling-2.36.0 → docling-2.37.0}/docling/datamodel/pipeline_options.py +3 -1
  11. {docling-2.36.0 → docling-2.37.0}/docling/datamodel/pipeline_options_vlm_model.py +2 -1
  12. {docling-2.36.0 → docling-2.37.0}/docling/models/base_ocr_model.py +33 -11
  13. {docling-2.36.0 → docling-2.37.0}/docling/models/easyocr_model.py +1 -1
  14. {docling-2.36.0 → docling-2.37.0}/docling/models/layout_model.py +2 -3
  15. {docling-2.36.0 → docling-2.37.0}/docling/models/ocr_mac_model.py +1 -1
  16. {docling-2.36.0 → docling-2.37.0}/docling/models/page_preprocessing_model.py +3 -6
  17. {docling-2.36.0 → docling-2.37.0}/docling/models/rapid_ocr_model.py +1 -1
  18. {docling-2.36.0 → docling-2.37.0}/docling/models/readingorder_model.py +2 -2
  19. {docling-2.36.0 → docling-2.37.0}/docling/models/tesseract_ocr_cli_model.py +4 -3
  20. {docling-2.36.0 → docling-2.37.0}/docling/models/tesseract_ocr_model.py +1 -1
  21. {docling-2.36.0 → docling-2.37.0}/docling/models/vlm_models_inline/hf_transformers_model.py +1 -0
  22. {docling-2.36.0 → docling-2.37.0}/docling/pipeline/standard_pdf_pipeline.py +0 -1
  23. {docling-2.36.0 → docling-2.37.0}/docling/utils/layout_postprocessor.py +11 -6
  24. {docling-2.36.0 → docling-2.37.0}/docling.egg-info/PKG-INFO +2 -3
  25. {docling-2.36.0 → docling-2.37.0}/docling.egg-info/requires.txt +1 -2
  26. {docling-2.36.0 → docling-2.37.0}/pyproject.toml +2 -3
  27. {docling-2.36.0 → docling-2.37.0}/tests/test_backend_asciidoc.py +23 -1
  28. {docling-2.36.0 → docling-2.37.0}/tests/test_backend_msexcel.py +11 -11
  29. {docling-2.36.0 → docling-2.37.0}/LICENSE +0 -0
  30. {docling-2.36.0 → docling-2.37.0}/README.md +0 -0
  31. {docling-2.36.0 → docling-2.37.0}/docling/__init__.py +0 -0
  32. {docling-2.36.0 → docling-2.37.0}/docling/backend/__init__.py +0 -0
  33. {docling-2.36.0 → docling-2.37.0}/docling/backend/abstract_backend.py +0 -0
  34. {docling-2.36.0 → docling-2.37.0}/docling/backend/csv_backend.py +0 -0
  35. {docling-2.36.0 → docling-2.37.0}/docling/backend/docx/__init__.py +0 -0
  36. {docling-2.36.0 → docling-2.37.0}/docling/backend/docx/latex/__init__.py +0 -0
  37. {docling-2.36.0 → docling-2.37.0}/docling/backend/docx/latex/latex_dict.py +0 -0
  38. {docling-2.36.0 → docling-2.37.0}/docling/backend/docx/latex/omml.py +0 -0
  39. {docling-2.36.0 → docling-2.37.0}/docling/backend/html_backend.py +0 -0
  40. {docling-2.36.0 → docling-2.37.0}/docling/backend/json/__init__.py +0 -0
  41. {docling-2.36.0 → docling-2.37.0}/docling/backend/json/docling_json_backend.py +0 -0
  42. {docling-2.36.0 → docling-2.37.0}/docling/backend/md_backend.py +0 -0
  43. {docling-2.36.0 → docling-2.37.0}/docling/backend/msexcel_backend.py +0 -0
  44. {docling-2.36.0 → docling-2.37.0}/docling/backend/pdf_backend.py +0 -0
  45. {docling-2.36.0 → docling-2.37.0}/docling/backend/xml/__init__.py +0 -0
  46. {docling-2.36.0 → docling-2.37.0}/docling/backend/xml/jats_backend.py +0 -0
  47. {docling-2.36.0 → docling-2.37.0}/docling/backend/xml/uspto_backend.py +0 -0
  48. {docling-2.36.0 → docling-2.37.0}/docling/chunking/__init__.py +0 -0
  49. {docling-2.36.0 → docling-2.37.0}/docling/cli/__init__.py +0 -0
  50. {docling-2.36.0 → docling-2.37.0}/docling/cli/main.py +0 -0
  51. {docling-2.36.0 → docling-2.37.0}/docling/cli/models.py +0 -0
  52. {docling-2.36.0 → docling-2.37.0}/docling/cli/tools.py +0 -0
  53. {docling-2.36.0 → docling-2.37.0}/docling/datamodel/__init__.py +0 -0
  54. {docling-2.36.0 → docling-2.37.0}/docling/datamodel/accelerator_options.py +0 -0
  55. {docling-2.36.0 → docling-2.37.0}/docling/datamodel/document.py +0 -0
  56. {docling-2.36.0 → docling-2.37.0}/docling/datamodel/settings.py +0 -0
  57. {docling-2.36.0 → docling-2.37.0}/docling/datamodel/vlm_model_specs.py +0 -0
  58. {docling-2.36.0 → docling-2.37.0}/docling/document_converter.py +0 -0
  59. {docling-2.36.0 → docling-2.37.0}/docling/exceptions.py +0 -0
  60. {docling-2.36.0 → docling-2.37.0}/docling/models/__init__.py +0 -0
  61. {docling-2.36.0 → docling-2.37.0}/docling/models/api_vlm_model.py +0 -0
  62. {docling-2.36.0 → docling-2.37.0}/docling/models/base_model.py +0 -0
  63. {docling-2.36.0 → docling-2.37.0}/docling/models/code_formula_model.py +0 -0
  64. {docling-2.36.0 → docling-2.37.0}/docling/models/document_picture_classifier.py +0 -0
  65. {docling-2.36.0 → docling-2.37.0}/docling/models/factories/__init__.py +0 -0
  66. {docling-2.36.0 → docling-2.37.0}/docling/models/factories/base_factory.py +0 -0
  67. {docling-2.36.0 → docling-2.37.0}/docling/models/factories/ocr_factory.py +0 -0
  68. {docling-2.36.0 → docling-2.37.0}/docling/models/factories/picture_description_factory.py +0 -0
  69. {docling-2.36.0 → docling-2.37.0}/docling/models/page_assemble_model.py +0 -0
  70. {docling-2.36.0 → docling-2.37.0}/docling/models/picture_description_api_model.py +0 -0
  71. {docling-2.36.0 → docling-2.37.0}/docling/models/picture_description_base_model.py +0 -0
  72. {docling-2.36.0 → docling-2.37.0}/docling/models/picture_description_vlm_model.py +0 -0
  73. {docling-2.36.0 → docling-2.37.0}/docling/models/plugins/__init__.py +0 -0
  74. {docling-2.36.0 → docling-2.37.0}/docling/models/plugins/defaults.py +0 -0
  75. {docling-2.36.0 → docling-2.37.0}/docling/models/table_structure_model.py +0 -0
  76. {docling-2.36.0 → docling-2.37.0}/docling/models/utils/__init__.py +0 -0
  77. {docling-2.36.0 → docling-2.37.0}/docling/models/utils/hf_model_download.py +0 -0
  78. {docling-2.36.0 → docling-2.37.0}/docling/models/vlm_models_inline/__init__.py +0 -0
  79. {docling-2.36.0 → docling-2.37.0}/docling/models/vlm_models_inline/mlx_model.py +0 -0
  80. {docling-2.36.0 → docling-2.37.0}/docling/pipeline/__init__.py +0 -0
  81. {docling-2.36.0 → docling-2.37.0}/docling/pipeline/base_pipeline.py +0 -0
  82. {docling-2.36.0 → docling-2.37.0}/docling/pipeline/simple_pipeline.py +0 -0
  83. {docling-2.36.0 → docling-2.37.0}/docling/pipeline/vlm_pipeline.py +0 -0
  84. {docling-2.36.0 → docling-2.37.0}/docling/py.typed +0 -0
  85. {docling-2.36.0 → docling-2.37.0}/docling/utils/__init__.py +0 -0
  86. {docling-2.36.0 → docling-2.37.0}/docling/utils/accelerator_utils.py +0 -0
  87. {docling-2.36.0 → docling-2.37.0}/docling/utils/api_image_request.py +0 -0
  88. {docling-2.36.0 → docling-2.37.0}/docling/utils/export.py +0 -0
  89. {docling-2.36.0 → docling-2.37.0}/docling/utils/glm_utils.py +0 -0
  90. {docling-2.36.0 → docling-2.37.0}/docling/utils/locks.py +0 -0
  91. {docling-2.36.0 → docling-2.37.0}/docling/utils/model_downloader.py +0 -0
  92. {docling-2.36.0 → docling-2.37.0}/docling/utils/ocr_utils.py +0 -0
  93. {docling-2.36.0 → docling-2.37.0}/docling/utils/orientation.py +0 -0
  94. {docling-2.36.0 → docling-2.37.0}/docling/utils/profiling.py +0 -0
  95. {docling-2.36.0 → docling-2.37.0}/docling/utils/utils.py +0 -0
  96. {docling-2.36.0 → docling-2.37.0}/docling/utils/visualization.py +0 -0
  97. {docling-2.36.0 → docling-2.37.0}/docling.egg-info/SOURCES.txt +0 -0
  98. {docling-2.36.0 → docling-2.37.0}/docling.egg-info/dependency_links.txt +0 -0
  99. {docling-2.36.0 → docling-2.37.0}/docling.egg-info/entry_points.txt +0 -0
  100. {docling-2.36.0 → docling-2.37.0}/docling.egg-info/top_level.txt +0 -0
  101. {docling-2.36.0 → docling-2.37.0}/setup.cfg +0 -0
  102. {docling-2.36.0 → docling-2.37.0}/tests/test_backend_csv.py +0 -0
  103. {docling-2.36.0 → docling-2.37.0}/tests/test_backend_docling_json.py +0 -0
  104. {docling-2.36.0 → docling-2.37.0}/tests/test_backend_docling_parse.py +0 -0
  105. {docling-2.36.0 → docling-2.37.0}/tests/test_backend_docling_parse_v2.py +0 -0
  106. {docling-2.36.0 → docling-2.37.0}/tests/test_backend_docling_parse_v4.py +0 -0
  107. {docling-2.36.0 → docling-2.37.0}/tests/test_backend_html.py +0 -0
  108. {docling-2.36.0 → docling-2.37.0}/tests/test_backend_jats.py +0 -0
  109. {docling-2.36.0 → docling-2.37.0}/tests/test_backend_markdown.py +0 -0
  110. {docling-2.36.0 → docling-2.37.0}/tests/test_backend_msword.py +0 -0
  111. {docling-2.36.0 → docling-2.37.0}/tests/test_backend_patent_uspto.py +0 -0
  112. {docling-2.36.0 → docling-2.37.0}/tests/test_backend_pdfium.py +0 -0
  113. {docling-2.36.0 → docling-2.37.0}/tests/test_backend_pptx.py +0 -0
  114. {docling-2.36.0 → docling-2.37.0}/tests/test_backend_webp.py +0 -0
  115. {docling-2.36.0 → docling-2.37.0}/tests/test_cli.py +0 -0
  116. {docling-2.36.0 → docling-2.37.0}/tests/test_code_formula.py +0 -0
  117. {docling-2.36.0 → docling-2.37.0}/tests/test_data_gen_flag.py +0 -0
  118. {docling-2.36.0 → docling-2.37.0}/tests/test_document_picture_classifier.py +0 -0
  119. {docling-2.36.0 → docling-2.37.0}/tests/test_e2e_conversion.py +0 -0
  120. {docling-2.36.0 → docling-2.37.0}/tests/test_e2e_ocr_conversion.py +2 -2
  121. {docling-2.36.0 → docling-2.37.0}/tests/test_input_doc.py +0 -0
  122. {docling-2.36.0 → docling-2.37.0}/tests/test_interfaces.py +0 -0
  123. {docling-2.36.0 → docling-2.37.0}/tests/test_invalid_input.py +0 -0
  124. {docling-2.36.0 → docling-2.37.0}/tests/test_legacy_format_transform.py +0 -0
  125. {docling-2.36.0 → docling-2.37.0}/tests/test_options.py +0 -0
  126. {docling-2.36.0 → docling-2.37.0}/tests/test_settings_load.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docling
3
- Version: 2.36.0
3
+ Version: 2.37.0
4
4
  Summary: SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications.
5
5
  Author-email: Christoph Auer <cau@zurich.ibm.com>, Michele Dolfi <dol@zurich.ibm.com>, Maxim Lysak <mly@zurich.ibm.com>, Nikos Livathinos <nli@zurich.ibm.com>, Ahmed Nassar <ahn@zurich.ibm.com>, Panos Vagenas <pva@zurich.ibm.com>, Peter Staar <taa@zurich.ibm.com>
6
6
  License-Expression: MIT
@@ -37,7 +37,7 @@ Requires-Dist: requests<3.0.0,>=2.32.2
37
37
  Requires-Dist: easyocr<2.0,>=1.7
38
38
  Requires-Dist: certifi>=2024.7.4
39
39
  Requires-Dist: rtree<2.0.0,>=1.3.0
40
- Requires-Dist: typer<0.16.0,>=0.12.5
40
+ Requires-Dist: typer<0.17.0,>=0.12.5
41
41
  Requires-Dist: python-docx<2.0.0,>=1.1.2
42
42
  Requires-Dist: python-pptx<2.0.0,>=1.0.2
43
43
  Requires-Dist: beautifulsoup4<5.0.0,>=4.12.3
@@ -49,7 +49,6 @@ Requires-Dist: pillow<12.0.0,>=10.0.0
49
49
  Requires-Dist: tqdm<5.0.0,>=4.65.0
50
50
  Requires-Dist: pluggy<2.0.0,>=1.0.0
51
51
  Requires-Dist: pylatexenc<3.0,>=2.10
52
- Requires-Dist: click<8.2.0
53
52
  Requires-Dist: scipy<2.0.0,>=1.6.0
54
53
  Provides-Extra: tesserocr
55
54
  Requires-Dist: tesserocr<3.0.0,>=2.7.1; extra == "tesserocr"
@@ -2,7 +2,7 @@ import logging
2
2
  import re
3
3
  from io import BytesIO
4
4
  from pathlib import Path
5
- from typing import Set, Union
5
+ from typing import Final, Set, Union
6
6
 
7
7
  from docling_core.types.doc import (
8
8
  DocItemLabel,
@@ -22,6 +22,9 @@ from docling.datamodel.document import InputDocument
22
22
 
23
23
  _log = logging.getLogger(__name__)
24
24
 
25
+ DEFAULT_IMAGE_WIDTH: Final = 128
26
+ DEFAULT_IMAGE_HEIGHT: Final = 128
27
+
25
28
 
26
29
  class AsciiDocBackend(DeclarativeDocumentBackend):
27
30
  def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
@@ -200,9 +203,11 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
200
203
 
201
204
  item = self._parse_picture(line)
202
205
 
203
- size = None
206
+ size: Size
204
207
  if "width" in item and "height" in item:
205
208
  size = Size(width=int(item["width"]), height=int(item["height"]))
209
+ else:
210
+ size = Size(width=DEFAULT_IMAGE_WIDTH, height=DEFAULT_IMAGE_HEIGHT)
206
211
 
207
212
  uri = None
208
213
  if (
@@ -264,14 +269,16 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
264
269
 
265
270
  return doc
266
271
 
267
- def _get_current_level(self, parents):
272
+ @staticmethod
273
+ def _get_current_level(parents):
268
274
  for k, v in parents.items():
269
275
  if v is None and k > 0:
270
276
  return k - 1
271
277
 
272
278
  return 0
273
279
 
274
- def _get_current_parent(self, parents):
280
+ @staticmethod
281
+ def _get_current_parent(parents):
275
282
  for k, v in parents.items():
276
283
  if v is None and k > 0:
277
284
  return parents[k - 1]
@@ -279,17 +286,21 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
279
286
  return None
280
287
 
281
288
  # ========= Title
282
- def _is_title(self, line):
289
+ @staticmethod
290
+ def _is_title(line):
283
291
  return re.match(r"^= ", line)
284
292
 
285
- def _parse_title(self, line):
293
+ @staticmethod
294
+ def _parse_title(line):
286
295
  return {"type": "title", "text": line[2:].strip(), "level": 0}
287
296
 
288
297
  # ========= Section headers
289
- def _is_section_header(self, line):
298
+ @staticmethod
299
+ def _is_section_header(line):
290
300
  return re.match(r"^==+\s+", line)
291
301
 
292
- def _parse_section_header(self, line):
302
+ @staticmethod
303
+ def _parse_section_header(line):
293
304
  match = re.match(r"^(=+)\s+(.*)", line)
294
305
 
295
306
  marker = match.group(1) # The list marker (e.g., "*", "-", "1.")
@@ -303,10 +314,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
303
314
  }
304
315
 
305
316
  # ========= Lists
306
- def _is_list_item(self, line):
317
+ @staticmethod
318
+ def _is_list_item(line):
307
319
  return re.match(r"^(\s)*(\*|-|\d+\.|\w+\.) ", line)
308
320
 
309
- def _parse_list_item(self, line):
321
+ @staticmethod
322
+ def _parse_list_item(line):
310
323
  """Extract the item marker (number or bullet symbol) and the text of the item."""
311
324
 
312
325
  match = re.match(r"^(\s*)(\*|-|\d+\.)\s+(.*)", line)
@@ -342,14 +355,17 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
342
355
  }
343
356
 
344
357
  # ========= Tables
345
- def _is_table_line(self, line):
358
+ @staticmethod
359
+ def _is_table_line(line):
346
360
  return re.match(r"^\|.*\|", line)
347
361
 
348
- def _parse_table_line(self, line):
362
+ @staticmethod
363
+ def _parse_table_line(line):
349
364
  # Split table cells and trim extra spaces
350
365
  return [cell.strip() for cell in line.split("|") if cell.strip()]
351
366
 
352
- def _populate_table_as_grid(self, table_data):
367
+ @staticmethod
368
+ def _populate_table_as_grid(table_data):
353
369
  num_rows = len(table_data)
354
370
 
355
371
  # Adjust the table data into a grid format
@@ -380,10 +396,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
380
396
  return data
381
397
 
382
398
  # ========= Pictures
383
- def _is_picture(self, line):
399
+ @staticmethod
400
+ def _is_picture(line):
384
401
  return re.match(r"^image::", line)
385
402
 
386
- def _parse_picture(self, line):
403
+ @staticmethod
404
+ def _parse_picture(line):
387
405
  """
388
406
  Parse an image macro, extracting its path and attributes.
389
407
  Syntax: image::path/to/image.png[Alt Text, width=200, height=150, align=center]
@@ -406,10 +424,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
406
424
  return {"type": "picture", "uri": line}
407
425
 
408
426
  # ========= Captions
409
- def _is_caption(self, line):
427
+ @staticmethod
428
+ def _is_caption(line):
410
429
  return re.match(r"^\.(.+)", line)
411
430
 
412
- def _parse_caption(self, line):
431
+ @staticmethod
432
+ def _parse_caption(line):
413
433
  mtch = re.match(r"^\.(.+)", line)
414
434
  if mtch:
415
435
  text = mtch.group(1)
@@ -418,5 +438,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
418
438
  return {"type": "caption", "text": ""}
419
439
 
420
440
  # ========= Plain text
421
- def _parse_text(self, line):
441
+ @staticmethod
442
+ def _parse_text(line):
422
443
  return {"type": "text", "text": line.strip()}
@@ -7,12 +7,17 @@ from typing import List, Optional, Union
7
7
 
8
8
  import pypdfium2 as pdfium
9
9
  from docling_core.types.doc import BoundingBox, CoordOrigin, Size
10
- from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
10
+ from docling_core.types.doc.page import (
11
+ BoundingRectangle,
12
+ SegmentedPdfPage,
13
+ TextCell,
14
+ )
11
15
  from docling_parse.pdf_parsers import pdf_parser_v1
12
16
  from PIL import Image, ImageDraw
13
17
  from pypdfium2 import PdfPage
14
18
 
15
19
  from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
20
+ from docling.backend.pypdfium2_backend import get_pdf_page_geometry
16
21
  from docling.datamodel.document import InputDocument
17
22
 
18
23
  _log = logging.getLogger(__name__)
@@ -36,43 +41,8 @@ class DoclingParsePageBackend(PdfPageBackend):
36
41
  def is_valid(self) -> bool:
37
42
  return self.valid
38
43
 
39
- def get_text_in_rect(self, bbox: BoundingBox) -> str:
40
- if not self.valid:
41
- return ""
42
- # Find intersecting cells on the page
43
- text_piece = ""
44
- page_size = self.get_size()
45
- parser_width = self._dpage["width"]
46
- parser_height = self._dpage["height"]
47
-
48
- scale = (
49
- 1 # FIX - Replace with param in get_text_in_rect across backends (optional)
50
- )
51
-
52
- for i in range(len(self._dpage["cells"])):
53
- rect = self._dpage["cells"][i]["box"]["device"]
54
- x0, y0, x1, y1 = rect
55
- cell_bbox = BoundingBox(
56
- l=x0 * scale * page_size.width / parser_width,
57
- b=y0 * scale * page_size.height / parser_height,
58
- r=x1 * scale * page_size.width / parser_width,
59
- t=y1 * scale * page_size.height / parser_height,
60
- coord_origin=CoordOrigin.BOTTOMLEFT,
61
- ).to_top_left_origin(page_height=page_size.height * scale)
62
-
63
- overlap_frac = cell_bbox.intersection_over_self(bbox)
64
-
65
- if overlap_frac > 0.5:
66
- if len(text_piece) > 0:
67
- text_piece += " "
68
- text_piece += self._dpage["cells"][i]["content"]["rnormalized"]
69
-
70
- return text_piece
71
-
72
- def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
73
- return None
74
-
75
- def get_text_cells(self) -> Iterable[TextCell]:
44
+ def _compute_text_cells(self) -> List[TextCell]:
45
+ """Compute text cells from docling-parse data."""
76
46
  cells: List[TextCell] = []
77
47
  cell_counter = 0
78
48
 
@@ -102,7 +72,6 @@ class DoclingParsePageBackend(PdfPageBackend):
102
72
  from_ocr=False,
103
73
  rect=BoundingRectangle.from_bounding_box(
104
74
  BoundingBox(
105
- # l=x0, b=y0, r=x1, t=y1,
106
75
  l=x0 * page_size.width / parser_width,
107
76
  b=y0 * page_size.height / parser_height,
108
77
  r=x1 * page_size.width / parser_width,
@@ -115,30 +84,63 @@ class DoclingParsePageBackend(PdfPageBackend):
115
84
 
116
85
  cell_counter += 1
117
86
 
118
- def draw_clusters_and_cells():
119
- image = (
120
- self.get_page_image()
121
- ) # make new image to avoid drawing on the saved ones
122
- draw = ImageDraw.Draw(image)
123
- for c in cells:
124
- x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
125
- cell_color = (
126
- random.randint(30, 140),
127
- random.randint(30, 140),
128
- random.randint(30, 140),
129
- )
130
- draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
131
- image.show()
87
+ return cells
132
88
 
133
- # before merge:
134
- # draw_clusters_and_cells()
89
+ def get_text_in_rect(self, bbox: BoundingBox) -> str:
90
+ if not self.valid:
91
+ return ""
92
+ # Find intersecting cells on the page
93
+ text_piece = ""
94
+ page_size = self.get_size()
95
+ parser_width = self._dpage["width"]
96
+ parser_height = self._dpage["height"]
135
97
 
136
- # cells = merge_horizontal_cells(cells)
98
+ scale = (
99
+ 1 # FIX - Replace with param in get_text_in_rect across backends (optional)
100
+ )
137
101
 
138
- # after merge:
139
- # draw_clusters_and_cells()
102
+ for i in range(len(self._dpage["cells"])):
103
+ rect = self._dpage["cells"][i]["box"]["device"]
104
+ x0, y0, x1, y1 = rect
105
+ cell_bbox = BoundingBox(
106
+ l=x0 * scale * page_size.width / parser_width,
107
+ b=y0 * scale * page_size.height / parser_height,
108
+ r=x1 * scale * page_size.width / parser_width,
109
+ t=y1 * scale * page_size.height / parser_height,
110
+ coord_origin=CoordOrigin.BOTTOMLEFT,
111
+ ).to_top_left_origin(page_height=page_size.height * scale)
140
112
 
141
- return cells
113
+ overlap_frac = cell_bbox.intersection_over_self(bbox)
114
+
115
+ if overlap_frac > 0.5:
116
+ if len(text_piece) > 0:
117
+ text_piece += " "
118
+ text_piece += self._dpage["cells"][i]["content"]["rnormalized"]
119
+
120
+ return text_piece
121
+
122
+ def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
123
+ if not self.valid:
124
+ return None
125
+
126
+ text_cells = self._compute_text_cells()
127
+
128
+ # Get the PDF page geometry from pypdfium2
129
+ dimension = get_pdf_page_geometry(self._ppage)
130
+
131
+ # Create SegmentedPdfPage
132
+ return SegmentedPdfPage(
133
+ dimension=dimension,
134
+ textline_cells=text_cells,
135
+ char_cells=[],
136
+ word_cells=[],
137
+ has_lines=len(text_cells) > 0,
138
+ has_words=False,
139
+ has_chars=False,
140
+ )
141
+
142
+ def get_text_cells(self) -> Iterable[TextCell]:
143
+ return self._compute_text_cells()
142
144
 
143
145
  def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
144
146
  AREA_THRESHOLD = 0 # 32 * 32
@@ -7,12 +7,19 @@ from typing import TYPE_CHECKING, List, Optional, Union
7
7
 
8
8
  import pypdfium2 as pdfium
9
9
  from docling_core.types.doc import BoundingBox, CoordOrigin
10
- from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
10
+ from docling_core.types.doc.page import (
11
+ BoundingRectangle,
12
+ PdfPageBoundaryType,
13
+ PdfPageGeometry,
14
+ SegmentedPdfPage,
15
+ TextCell,
16
+ )
11
17
  from docling_parse.pdf_parsers import pdf_parser_v2
12
18
  from PIL import Image, ImageDraw
13
19
  from pypdfium2 import PdfPage
14
20
 
15
21
  from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
22
+ from docling.backend.pypdfium2_backend import get_pdf_page_geometry
16
23
  from docling.datamodel.base_models import Size
17
24
  from docling.utils.locks import pypdfium2_lock
18
25
 
@@ -40,50 +47,8 @@ class DoclingParseV2PageBackend(PdfPageBackend):
40
47
  def is_valid(self) -> bool:
41
48
  return self.valid
42
49
 
43
- def get_text_in_rect(self, bbox: BoundingBox) -> str:
44
- if not self.valid:
45
- return ""
46
- # Find intersecting cells on the page
47
- text_piece = ""
48
- page_size = self.get_size()
49
-
50
- parser_width = self._dpage["sanitized"]["dimension"]["width"]
51
- parser_height = self._dpage["sanitized"]["dimension"]["height"]
52
-
53
- scale = (
54
- 1 # FIX - Replace with param in get_text_in_rect across backends (optional)
55
- )
56
-
57
- cells_data = self._dpage["sanitized"]["cells"]["data"]
58
- cells_header = self._dpage["sanitized"]["cells"]["header"]
59
-
60
- for i, cell_data in enumerate(cells_data):
61
- x0 = cell_data[cells_header.index("x0")]
62
- y0 = cell_data[cells_header.index("y0")]
63
- x1 = cell_data[cells_header.index("x1")]
64
- y1 = cell_data[cells_header.index("y1")]
65
-
66
- cell_bbox = BoundingBox(
67
- l=x0 * scale * page_size.width / parser_width,
68
- b=y0 * scale * page_size.height / parser_height,
69
- r=x1 * scale * page_size.width / parser_width,
70
- t=y1 * scale * page_size.height / parser_height,
71
- coord_origin=CoordOrigin.BOTTOMLEFT,
72
- ).to_top_left_origin(page_height=page_size.height * scale)
73
-
74
- overlap_frac = cell_bbox.intersection_over_self(bbox)
75
-
76
- if overlap_frac > 0.5:
77
- if len(text_piece) > 0:
78
- text_piece += " "
79
- text_piece += cell_data[cells_header.index("text")]
80
-
81
- return text_piece
82
-
83
- def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
84
- return None
85
-
86
- def get_text_cells(self) -> Iterable[TextCell]:
50
+ def _compute_text_cells(self) -> List[TextCell]:
51
+ """Compute text cells from docling-parse v2 data."""
87
52
  cells: List[TextCell] = []
88
53
  cell_counter = 0
89
54
 
@@ -118,7 +83,6 @@ class DoclingParseV2PageBackend(PdfPageBackend):
118
83
  from_ocr=False,
119
84
  rect=BoundingRectangle.from_bounding_box(
120
85
  BoundingBox(
121
- # l=x0, b=y0, r=x1, t=y1,
122
86
  l=x0 * page_size.width / parser_width,
123
87
  b=y0 * page_size.height / parser_height,
124
88
  r=x1 * page_size.width / parser_width,
@@ -130,24 +94,70 @@ class DoclingParseV2PageBackend(PdfPageBackend):
130
94
  )
131
95
  cell_counter += 1
132
96
 
133
- def draw_clusters_and_cells():
134
- image = (
135
- self.get_page_image()
136
- ) # make new image to avoid drawing on the saved ones
137
- draw = ImageDraw.Draw(image)
138
- for c in cells:
139
- x0, y0, x1, y1 = c.bbox.as_tuple()
140
- cell_color = (
141
- random.randint(30, 140),
142
- random.randint(30, 140),
143
- random.randint(30, 140),
144
- )
145
- draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
146
- image.show()
97
+ return cells
147
98
 
148
- # draw_clusters_and_cells()
99
+ def get_text_in_rect(self, bbox: BoundingBox) -> str:
100
+ if not self.valid:
101
+ return ""
102
+ # Find intersecting cells on the page
103
+ text_piece = ""
104
+ page_size = self.get_size()
149
105
 
150
- return cells
106
+ parser_width = self._dpage["sanitized"]["dimension"]["width"]
107
+ parser_height = self._dpage["sanitized"]["dimension"]["height"]
108
+
109
+ scale = (
110
+ 1 # FIX - Replace with param in get_text_in_rect across backends (optional)
111
+ )
112
+
113
+ cells_data = self._dpage["sanitized"]["cells"]["data"]
114
+ cells_header = self._dpage["sanitized"]["cells"]["header"]
115
+
116
+ for i, cell_data in enumerate(cells_data):
117
+ x0 = cell_data[cells_header.index("x0")]
118
+ y0 = cell_data[cells_header.index("y0")]
119
+ x1 = cell_data[cells_header.index("x1")]
120
+ y1 = cell_data[cells_header.index("y1")]
121
+
122
+ cell_bbox = BoundingBox(
123
+ l=x0 * scale * page_size.width / parser_width,
124
+ b=y0 * scale * page_size.height / parser_height,
125
+ r=x1 * scale * page_size.width / parser_width,
126
+ t=y1 * scale * page_size.height / parser_height,
127
+ coord_origin=CoordOrigin.BOTTOMLEFT,
128
+ ).to_top_left_origin(page_height=page_size.height * scale)
129
+
130
+ overlap_frac = cell_bbox.intersection_over_self(bbox)
131
+
132
+ if overlap_frac > 0.5:
133
+ if len(text_piece) > 0:
134
+ text_piece += " "
135
+ text_piece += cell_data[cells_header.index("text")]
136
+
137
+ return text_piece
138
+
139
+ def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
140
+ if not self.valid:
141
+ return None
142
+
143
+ text_cells = self._compute_text_cells()
144
+
145
+ # Get the PDF page geometry from pypdfium2
146
+ dimension = get_pdf_page_geometry(self._ppage)
147
+
148
+ # Create SegmentedPdfPage
149
+ return SegmentedPdfPage(
150
+ dimension=dimension,
151
+ textline_cells=text_cells,
152
+ char_cells=[],
153
+ word_cells=[],
154
+ has_textlines=len(text_cells) > 0,
155
+ has_words=False,
156
+ has_chars=False,
157
+ )
158
+
159
+ def get_text_cells(self) -> Iterable[TextCell]:
160
+ return self._compute_text_cells()
151
161
 
152
162
  def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
153
163
  AREA_THRESHOLD = 0 # 32 * 32
@@ -59,20 +59,6 @@ class DoclingParseV4PageBackend(PdfPageBackend):
59
59
  return self._dpage
60
60
 
61
61
  def get_text_cells(self) -> Iterable[TextCell]:
62
- page_size = self.get_size()
63
-
64
- [tc.to_top_left_origin(page_size.height) for tc in self._dpage.textline_cells]
65
-
66
- # for cell in self._dpage.textline_cells:
67
- # rect = cell.rect
68
- #
69
- # assert (
70
- # rect.to_bounding_box().l <= rect.to_bounding_box().r
71
- # ), f"left is > right on bounding box {rect.to_bounding_box()} of rect {rect}"
72
- # assert (
73
- # rect.to_bounding_box().t <= rect.to_bounding_box().b
74
- # ), f"top is > bottom on bounding box {rect.to_bounding_box()} of rect {rect}"
75
-
76
62
  return self._dpage.textline_cells
77
63
 
78
64
  def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
@@ -171,12 +157,28 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
171
157
  self, page_no: int, create_words: bool = True, create_textlines: bool = True
172
158
  ) -> DoclingParseV4PageBackend:
173
159
  with pypdfium2_lock:
160
+ seg_page = self.dp_doc.get_page(
161
+ page_no + 1,
162
+ create_words=create_words,
163
+ create_textlines=create_textlines,
164
+ )
165
+
166
+ # In Docling, all TextCell instances are expected with top-left origin.
167
+ [
168
+ tc.to_top_left_origin(seg_page.dimension.height)
169
+ for tc in seg_page.textline_cells
170
+ ]
171
+ [
172
+ tc.to_top_left_origin(seg_page.dimension.height)
173
+ for tc in seg_page.char_cells
174
+ ]
175
+ [
176
+ tc.to_top_left_origin(seg_page.dimension.height)
177
+ for tc in seg_page.word_cells
178
+ ]
179
+
174
180
  return DoclingParseV4PageBackend(
175
- self.dp_doc.get_page(
176
- page_no + 1,
177
- create_words=create_words,
178
- create_textlines=create_textlines,
179
- ),
181
+ seg_page,
180
182
  self._pdoc[page_no],
181
183
  )
182
184