docling 2.36.1__py3-none-any.whl → 2.38.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. docling/backend/asciidoc_backend.py +39 -18
  2. docling/backend/docling_parse_backend.py +61 -59
  3. docling/backend/docling_parse_v2_backend.py +72 -62
  4. docling/backend/docling_parse_v4_backend.py +21 -19
  5. docling/backend/md_backend.py +101 -81
  6. docling/backend/mspowerpoint_backend.py +72 -113
  7. docling/backend/msword_backend.py +99 -80
  8. docling/backend/noop_backend.py +51 -0
  9. docling/backend/pypdfium2_backend.py +127 -53
  10. docling/cli/main.py +82 -14
  11. docling/datamodel/asr_model_specs.py +92 -0
  12. docling/datamodel/base_models.py +21 -4
  13. docling/datamodel/document.py +3 -1
  14. docling/datamodel/pipeline_options.py +15 -2
  15. docling/datamodel/pipeline_options_asr_model.py +57 -0
  16. docling/datamodel/pipeline_options_vlm_model.py +4 -4
  17. docling/document_converter.py +8 -0
  18. docling/models/api_vlm_model.py +3 -1
  19. docling/models/base_model.py +1 -1
  20. docling/models/base_ocr_model.py +33 -11
  21. docling/models/easyocr_model.py +1 -1
  22. docling/models/layout_model.py +2 -3
  23. docling/models/ocr_mac_model.py +1 -1
  24. docling/models/page_preprocessing_model.py +3 -6
  25. docling/models/rapid_ocr_model.py +1 -1
  26. docling/models/readingorder_model.py +3 -3
  27. docling/models/tesseract_ocr_cli_model.py +4 -3
  28. docling/models/tesseract_ocr_model.py +1 -1
  29. docling/models/vlm_models_inline/hf_transformers_model.py +4 -1
  30. docling/models/vlm_models_inline/mlx_model.py +3 -1
  31. docling/pipeline/asr_pipeline.py +253 -0
  32. docling/pipeline/base_pipeline.py +11 -0
  33. docling/pipeline/standard_pdf_pipeline.py +0 -1
  34. docling/utils/layout_postprocessor.py +11 -6
  35. {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/METADATA +7 -4
  36. {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/RECORD +40 -36
  37. {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/WHEEL +0 -0
  38. {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/entry_points.txt +0 -0
  39. {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/licenses/LICENSE +0 -0
  40. {docling-2.36.1.dist-info → docling-2.38.0.dist-info}/top_level.txt +0 -0
@@ -14,7 +14,7 @@ from docling_core.types.doc import (
14
14
  TableCell,
15
15
  TableData,
16
16
  )
17
- from docling_core.types.doc.document import Formatting
17
+ from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
18
18
  from docx import Document
19
19
  from docx.document import Document as DocxDocument
20
20
  from docx.oxml.table import CT_Tc
@@ -60,8 +60,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
60
60
  self.equation_bookends: str = "<eq>{EQ}</eq>"
61
61
  # Track processed textbox elements to avoid duplication
62
62
  self.processed_textbox_elements: List[int] = []
63
- # Track content hash of processed paragraphs to avoid duplicate content
64
- self.processed_paragraph_content: List[str] = []
65
63
 
66
64
  for i in range(-1, self.max_levels):
67
65
  self.parents[i] = None
@@ -86,7 +84,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
86
84
  self.valid = True
87
85
  except Exception as e:
88
86
  raise RuntimeError(
89
- f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
87
+ f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
90
88
  ) from e
91
89
 
92
90
  @override
@@ -253,9 +251,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
253
251
  self._handle_tables(element, docx_obj, doc)
254
252
  except Exception:
255
253
  _log.debug("could not parse a table, broken docx table")
256
-
254
+ # Check for Image
257
255
  elif drawing_blip:
258
256
  self._handle_pictures(docx_obj, drawing_blip, doc)
257
+ # Check for Text after the Image
258
+ if (
259
+ tag_name in ["p"]
260
+ and element.find(".//w:t", namespaces=namespaces) is not None
261
+ ):
262
+ self._handle_text_elements(element, docx_obj, doc)
259
263
  # Check for the sdt containers, like table of contents
260
264
  elif tag_name in ["sdt"]:
261
265
  sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
@@ -270,6 +274,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
270
274
  self._handle_text_elements(element, docx_obj, doc)
271
275
  else:
272
276
  _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
277
+
273
278
  return doc
274
279
 
275
280
  def _str_to_int(
@@ -580,7 +585,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
580
585
  all_paragraphs = []
581
586
 
582
587
  # Sort paragraphs within each container, then process containers
583
- for container_id, paragraphs in container_paragraphs.items():
588
+ for paragraphs in container_paragraphs.values():
584
589
  # Sort by vertical position within each container
585
590
  sorted_container_paragraphs = sorted(
586
591
  paragraphs,
@@ -593,9 +598,29 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
593
598
  # Add the sorted paragraphs to our processing list
594
599
  all_paragraphs.extend(sorted_container_paragraphs)
595
600
 
601
+ # Track processed paragraphs to avoid duplicates (same content and position)
602
+ processed_paragraphs = set()
603
+
596
604
  # Process all the paragraphs
597
- for p, _ in all_paragraphs:
598
- self._handle_text_elements(p, docx_obj, doc, is_from_textbox=True)
605
+ for p, position in all_paragraphs:
606
+ # Create paragraph object to get text content
607
+ paragraph = Paragraph(p, docx_obj)
608
+ text_content = paragraph.text
609
+
610
+ # Create a unique identifier based on content and position
611
+ paragraph_id = (text_content, position)
612
+
613
+ # Skip if this paragraph (same content and position) was already processed
614
+ if paragraph_id in processed_paragraphs:
615
+ _log.debug(
616
+ f"Skipping duplicate paragraph: content='{text_content[:50]}...', position={position}"
617
+ )
618
+ continue
619
+
620
+ # Mark this paragraph as processed
621
+ processed_paragraphs.add(paragraph_id)
622
+
623
+ self._handle_text_elements(p, docx_obj, doc)
599
624
 
600
625
  # Restore original parent
601
626
  self.parents[level] = original_parent
@@ -669,26 +694,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
669
694
  element: BaseOxmlElement,
670
695
  docx_obj: DocxDocument,
671
696
  doc: DoclingDocument,
672
- is_from_textbox: bool = False,
673
697
  ) -> None:
674
698
  paragraph = Paragraph(element, docx_obj)
675
-
676
- # Skip if from a textbox and this exact paragraph content was already processed
677
- # Skip if from a textbox and this exact paragraph content was already processed
678
- raw_text = paragraph.text
679
- if is_from_textbox and raw_text:
680
- # Create a simple hash of content to detect duplicates
681
- content_hash = f"{len(raw_text)}:{raw_text[:50]}"
682
- if content_hash in self.processed_paragraph_content:
683
- _log.debug(f"Skipping duplicate paragraph content: {content_hash}")
684
- return
685
- self.processed_paragraph_content.append(content_hash)
686
-
687
- text, equations = self._handle_equations_in_text(element=element, text=raw_text)
699
+ paragraph_elements = self._get_paragraph_elements(paragraph)
700
+ text, equations = self._handle_equations_in_text(
701
+ element=element, text=paragraph.text
702
+ )
688
703
 
689
704
  if text is None:
690
705
  return
691
- paragraph_elements = self._get_paragraph_elements(paragraph)
692
706
  text = text.strip()
693
707
 
694
708
  # Common styles for bullet and numbered lists.
@@ -750,7 +764,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
750
764
  self._add_header(doc, p_level, text, is_numbered_style)
751
765
 
752
766
  elif len(equations) > 0:
753
- if (raw_text is None or len(raw_text.strip()) == 0) and len(text) > 0:
767
+ if (paragraph.text is None or len(paragraph.text.strip()) == 0) and len(
768
+ text
769
+ ) > 0:
754
770
  # Standalone equation
755
771
  level = self._get_level()
756
772
  doc.add_text(
@@ -902,6 +918,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
902
918
  )
903
919
  return
904
920
 
921
+ def _add_formatted_list_item(
922
+ self,
923
+ doc: DoclingDocument,
924
+ elements: list,
925
+ marker: str,
926
+ enumerated: bool,
927
+ level: int,
928
+ ) -> None:
929
+ # This should not happen by construction
930
+ if not isinstance(self.parents[level], (OrderedList, UnorderedList)):
931
+ return
932
+ if len(elements) == 1:
933
+ text, format, hyperlink = elements[0]
934
+ doc.add_list_item(
935
+ marker=marker,
936
+ enumerated=enumerated,
937
+ parent=self.parents[level],
938
+ text=text,
939
+ formatting=format,
940
+ hyperlink=hyperlink,
941
+ )
942
+ else:
943
+ new_item = doc.add_list_item(
944
+ marker=marker,
945
+ enumerated=enumerated,
946
+ parent=self.parents[level],
947
+ text="",
948
+ )
949
+ new_parent = doc.add_group(label=GroupLabel.INLINE, parent=new_item)
950
+ for text, format, hyperlink in elements:
951
+ doc.add_text(
952
+ label=DocItemLabel.TEXT,
953
+ parent=new_parent,
954
+ text=text,
955
+ formatting=format,
956
+ hyperlink=hyperlink,
957
+ )
958
+
905
959
  def _add_list_item(
906
960
  self,
907
961
  *,
@@ -911,6 +965,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
911
965
  elements: list,
912
966
  is_numbered: bool = False,
913
967
  ) -> None:
968
+ # TODO: this method is always called with is_numbered. Numbered lists should be properly addressed.
969
+ if not elements:
970
+ return None
914
971
  enum_marker = ""
915
972
 
916
973
  level = self._get_level()
@@ -927,21 +984,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
927
984
  if is_numbered:
928
985
  enum_marker = str(self.listIter) + "."
929
986
  is_numbered = True
930
- new_parent = self._create_or_reuse_parent(
931
- doc=doc,
932
- prev_parent=self.parents[level],
933
- paragraph_elements=elements,
987
+ self._add_formatted_list_item(
988
+ doc, elements, enum_marker, is_numbered, level
934
989
  )
935
- for text, format, hyperlink in elements:
936
- doc.add_list_item(
937
- marker=enum_marker,
938
- enumerated=is_numbered,
939
- parent=new_parent,
940
- text=text,
941
- formatting=format,
942
- hyperlink=hyperlink,
943
- )
944
-
945
990
  elif (
946
991
  self._prev_numid() == numid
947
992
  and self.level_at_new_list is not None
@@ -971,28 +1016,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
971
1016
  if is_numbered:
972
1017
  enum_marker = str(self.listIter) + "."
973
1018
  is_numbered = True
974
-
975
- new_parent = self._create_or_reuse_parent(
976
- doc=doc,
977
- prev_parent=self.parents[self.level_at_new_list + ilevel],
978
- paragraph_elements=elements,
1019
+ self._add_formatted_list_item(
1020
+ doc,
1021
+ elements,
1022
+ enum_marker,
1023
+ is_numbered,
1024
+ self.level_at_new_list + ilevel,
979
1025
  )
980
- for text, format, hyperlink in elements:
981
- doc.add_list_item(
982
- marker=enum_marker,
983
- enumerated=is_numbered,
984
- parent=new_parent,
985
- text=text,
986
- formatting=format,
987
- hyperlink=hyperlink,
988
- )
989
1026
  elif (
990
1027
  self._prev_numid() == numid
991
1028
  and self.level_at_new_list is not None
992
1029
  and prev_indent is not None
993
1030
  and ilevel < prev_indent
994
1031
  ): # Close list
995
- for k, v in self.parents.items():
1032
+ for k in self.parents:
996
1033
  if k > self.level_at_new_list + ilevel:
997
1034
  self.parents[k] = None
998
1035
 
@@ -1001,20 +1038,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
1001
1038
  if is_numbered:
1002
1039
  enum_marker = str(self.listIter) + "."
1003
1040
  is_numbered = True
1004
- new_parent = self._create_or_reuse_parent(
1005
- doc=doc,
1006
- prev_parent=self.parents[self.level_at_new_list + ilevel],
1007
- paragraph_elements=elements,
1041
+ self._add_formatted_list_item(
1042
+ doc,
1043
+ elements,
1044
+ enum_marker,
1045
+ is_numbered,
1046
+ self.level_at_new_list + ilevel,
1008
1047
  )
1009
- for text, format, hyperlink in elements:
1010
- doc.add_list_item(
1011
- marker=enum_marker,
1012
- enumerated=is_numbered,
1013
- parent=new_parent,
1014
- text=text,
1015
- formatting=format,
1016
- hyperlink=hyperlink,
1017
- )
1018
1048
  self.listIter = 0
1019
1049
 
1020
1050
  elif self._prev_numid() == numid or prev_indent == ilevel:
@@ -1023,21 +1053,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
1023
1053
  if is_numbered:
1024
1054
  enum_marker = str(self.listIter) + "."
1025
1055
  is_numbered = True
1026
- new_parent = self._create_or_reuse_parent(
1027
- doc=doc,
1028
- prev_parent=self.parents[level - 1],
1029
- paragraph_elements=elements,
1056
+ self._add_formatted_list_item(
1057
+ doc, elements, enum_marker, is_numbered, level - 1
1030
1058
  )
1031
- for text, format, hyperlink in elements:
1032
- # Add the list item to the parent group
1033
- doc.add_list_item(
1034
- marker=enum_marker,
1035
- enumerated=is_numbered,
1036
- parent=new_parent,
1037
- text=text,
1038
- formatting=format,
1039
- hyperlink=hyperlink,
1040
- )
1059
+
1041
1060
  return
1042
1061
 
1043
1062
  def _handle_tables(
@@ -0,0 +1,51 @@
1
+ import logging
2
+ from io import BytesIO
3
+ from pathlib import Path
4
+ from typing import Set, Union
5
+
6
+ from docling.backend.abstract_backend import AbstractDocumentBackend
7
+ from docling.datamodel.base_models import InputFormat
8
+ from docling.datamodel.document import InputDocument
9
+
10
+ _log = logging.getLogger(__name__)
11
+
12
+
13
+ class NoOpBackend(AbstractDocumentBackend):
14
+ """
15
+ A no-op backend that only validates input existence.
16
+ Used e.g. for audio files where actual processing is handled by the ASR pipeline.
17
+ """
18
+
19
+ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
20
+ super().__init__(in_doc, path_or_stream)
21
+
22
+ _log.debug(f"NoOpBackend initialized for: {path_or_stream}")
23
+
24
+ # Validate input
25
+ try:
26
+ if isinstance(self.path_or_stream, BytesIO):
27
+ # Check if stream has content
28
+ self.valid = len(self.path_or_stream.getvalue()) > 0
29
+ _log.debug(
30
+ f"BytesIO stream length: {len(self.path_or_stream.getvalue())}"
31
+ )
32
+ elif isinstance(self.path_or_stream, Path):
33
+ # Check if file exists
34
+ self.valid = self.path_or_stream.exists()
35
+ _log.debug(f"File exists: {self.valid}")
36
+ else:
37
+ self.valid = False
38
+ except Exception as e:
39
+ _log.error(f"NoOpBackend validation failed: {e}")
40
+ self.valid = False
41
+
42
+ def is_valid(self) -> bool:
43
+ return self.valid
44
+
45
+ @classmethod
46
+ def supports_pagination(cls) -> bool:
47
+ return False
48
+
49
+ @classmethod
50
+ def supported_formats(cls) -> Set[InputFormat]:
51
+ return set(InputFormat)
@@ -8,7 +8,13 @@ from typing import TYPE_CHECKING, List, Optional, Union
8
8
  import pypdfium2 as pdfium
9
9
  import pypdfium2.raw as pdfium_c
10
10
  from docling_core.types.doc import BoundingBox, CoordOrigin, Size
11
- from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
11
+ from docling_core.types.doc.page import (
12
+ BoundingRectangle,
13
+ PdfPageBoundaryType,
14
+ PdfPageGeometry,
15
+ SegmentedPdfPage,
16
+ TextCell,
17
+ )
12
18
  from PIL import Image, ImageDraw
13
19
  from pypdfium2 import PdfTextPage
14
20
  from pypdfium2._helpers.misc import PdfiumError
@@ -16,6 +22,76 @@ from pypdfium2._helpers.misc import PdfiumError
16
22
  from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
17
23
  from docling.utils.locks import pypdfium2_lock
18
24
 
25
+
26
+ def get_pdf_page_geometry(
27
+ ppage: pdfium.PdfPage,
28
+ angle: float = 0.0,
29
+ boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX,
30
+ ) -> PdfPageGeometry:
31
+ """
32
+ Create PdfPageGeometry from a pypdfium2 PdfPage object.
33
+
34
+ Args:
35
+ ppage: pypdfium2 PdfPage object
36
+ angle: Page rotation angle in degrees (default: 0.0)
37
+ boundary_type: The boundary type for the page (default: CROP_BOX)
38
+
39
+ Returns:
40
+ PdfPageGeometry with all the different bounding boxes properly set
41
+ """
42
+ with pypdfium2_lock:
43
+ # Get the main bounding box (intersection of crop_box and media_box)
44
+ bbox_tuple = ppage.get_bbox()
45
+ bbox = BoundingBox.from_tuple(bbox_tuple, CoordOrigin.BOTTOMLEFT)
46
+
47
+ # Get all the different page boxes from pypdfium2
48
+ media_box_tuple = ppage.get_mediabox()
49
+ crop_box_tuple = ppage.get_cropbox()
50
+ art_box_tuple = ppage.get_artbox()
51
+ bleed_box_tuple = ppage.get_bleedbox()
52
+ trim_box_tuple = ppage.get_trimbox()
53
+
54
+ # Convert to BoundingBox objects using existing from_tuple method
55
+ # pypdfium2 returns (x0, y0, x1, y1) in PDF coordinate system (bottom-left origin)
56
+ # Use bbox as fallback when specific box types are not defined
57
+ media_bbox = (
58
+ BoundingBox.from_tuple(media_box_tuple, CoordOrigin.BOTTOMLEFT)
59
+ if media_box_tuple
60
+ else bbox
61
+ )
62
+ crop_bbox = (
63
+ BoundingBox.from_tuple(crop_box_tuple, CoordOrigin.BOTTOMLEFT)
64
+ if crop_box_tuple
65
+ else bbox
66
+ )
67
+ art_bbox = (
68
+ BoundingBox.from_tuple(art_box_tuple, CoordOrigin.BOTTOMLEFT)
69
+ if art_box_tuple
70
+ else bbox
71
+ )
72
+ bleed_bbox = (
73
+ BoundingBox.from_tuple(bleed_box_tuple, CoordOrigin.BOTTOMLEFT)
74
+ if bleed_box_tuple
75
+ else bbox
76
+ )
77
+ trim_bbox = (
78
+ BoundingBox.from_tuple(trim_box_tuple, CoordOrigin.BOTTOMLEFT)
79
+ if trim_box_tuple
80
+ else bbox
81
+ )
82
+
83
+ return PdfPageGeometry(
84
+ angle=angle,
85
+ rect=BoundingRectangle.from_bounding_box(bbox),
86
+ boundary_type=boundary_type,
87
+ art_bbox=art_bbox,
88
+ bleed_bbox=bleed_bbox,
89
+ crop_bbox=crop_bbox,
90
+ media_bbox=media_bbox,
91
+ trim_bbox=trim_bbox,
92
+ )
93
+
94
+
19
95
  if TYPE_CHECKING:
20
96
  from docling.datamodel.document import InputDocument
21
97
 
@@ -41,38 +117,8 @@ class PyPdfiumPageBackend(PdfPageBackend):
41
117
  def is_valid(self) -> bool:
42
118
  return self.valid
43
119
 
44
- def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
45
- AREA_THRESHOLD = 0 # 32 * 32
46
- page_size = self.get_size()
47
- with pypdfium2_lock:
48
- for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
49
- pos = obj.get_pos()
50
- cropbox = BoundingBox.from_tuple(
51
- pos, origin=CoordOrigin.BOTTOMLEFT
52
- ).to_top_left_origin(page_height=page_size.height)
53
-
54
- if cropbox.area() > AREA_THRESHOLD:
55
- cropbox = cropbox.scaled(scale=scale)
56
-
57
- yield cropbox
58
-
59
- def get_text_in_rect(self, bbox: BoundingBox) -> str:
60
- with pypdfium2_lock:
61
- if not self.text_page:
62
- self.text_page = self._ppage.get_textpage()
63
-
64
- if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
65
- bbox = bbox.to_bottom_left_origin(self.get_size().height)
66
-
67
- with pypdfium2_lock:
68
- text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
69
-
70
- return text_piece
71
-
72
- def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
73
- return None
74
-
75
- def get_text_cells(self) -> Iterable[TextCell]:
120
+ def _compute_text_cells(self) -> List[TextCell]:
121
+ """Compute text cells from pypdfium."""
76
122
  with pypdfium2_lock:
77
123
  if not self.text_page:
78
124
  self.text_page = self._ppage.get_textpage()
@@ -203,30 +249,58 @@ class PyPdfiumPageBackend(PdfPageBackend):
203
249
 
204
250
  return merged_cells
205
251
 
206
- def draw_clusters_and_cells():
207
- image = (
208
- self.get_page_image()
209
- ) # make new image to avoid drawing on the saved ones
210
- draw = ImageDraw.Draw(image)
211
- for c in cells:
212
- x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
213
- cell_color = (
214
- random.randint(30, 140),
215
- random.randint(30, 140),
216
- random.randint(30, 140),
217
- )
218
- draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
219
- image.show()
252
+ return merge_horizontal_cells(cells)
220
253
 
221
- # before merge:
222
- # draw_clusters_and_cells()
254
+ def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
255
+ AREA_THRESHOLD = 0 # 32 * 32
256
+ page_size = self.get_size()
257
+ with pypdfium2_lock:
258
+ for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
259
+ pos = obj.get_pos()
260
+ cropbox = BoundingBox.from_tuple(
261
+ pos, origin=CoordOrigin.BOTTOMLEFT
262
+ ).to_top_left_origin(page_height=page_size.height)
223
263
 
224
- cells = merge_horizontal_cells(cells)
264
+ if cropbox.area() > AREA_THRESHOLD:
265
+ cropbox = cropbox.scaled(scale=scale)
225
266
 
226
- # after merge:
227
- # draw_clusters_and_cells()
267
+ yield cropbox
228
268
 
229
- return cells
269
+ def get_text_in_rect(self, bbox: BoundingBox) -> str:
270
+ with pypdfium2_lock:
271
+ if not self.text_page:
272
+ self.text_page = self._ppage.get_textpage()
273
+
274
+ if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
275
+ bbox = bbox.to_bottom_left_origin(self.get_size().height)
276
+
277
+ with pypdfium2_lock:
278
+ text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
279
+
280
+ return text_piece
281
+
282
+ def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
283
+ if not self.valid:
284
+ return None
285
+
286
+ text_cells = self._compute_text_cells()
287
+
288
+ # Get the PDF page geometry from pypdfium2
289
+ dimension = get_pdf_page_geometry(self._ppage)
290
+
291
+ # Create SegmentedPdfPage
292
+ return SegmentedPdfPage(
293
+ dimension=dimension,
294
+ textline_cells=text_cells,
295
+ char_cells=[],
296
+ word_cells=[],
297
+ has_textlines=len(text_cells) > 0,
298
+ has_words=False,
299
+ has_chars=False,
300
+ )
301
+
302
+ def get_text_cells(self) -> Iterable[TextCell]:
303
+ return self._compute_text_cells()
230
304
 
231
305
  def get_page_image(
232
306
  self, scale: float = 1, cropbox: Optional[BoundingBox] = None