natural-pdf 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. natural_pdf/__init__.py +1 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +230 -151
  13. natural_pdf/classification/mixin.py +49 -35
  14. natural_pdf/classification/results.py +64 -46
  15. natural_pdf/collections/mixins.py +68 -20
  16. natural_pdf/collections/pdf_collection.py +177 -64
  17. natural_pdf/core/element_manager.py +30 -14
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +423 -101
  20. natural_pdf/core/pdf.py +633 -190
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +503 -131
  23. natural_pdf/elements/region.py +659 -90
  24. natural_pdf/elements/text.py +1 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +4 -3
  28. natural_pdf/extraction/manager.py +50 -49
  29. natural_pdf/extraction/mixin.py +90 -57
  30. natural_pdf/extraction/result.py +9 -23
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/ocr_factory.py +24 -4
  34. natural_pdf/ocr/ocr_manager.py +61 -25
  35. natural_pdf/ocr/ocr_options.py +70 -10
  36. natural_pdf/ocr/utils.py +6 -4
  37. natural_pdf/search/__init__.py +20 -34
  38. natural_pdf/search/haystack_search_service.py +309 -265
  39. natural_pdf/search/haystack_utils.py +99 -75
  40. natural_pdf/search/search_service_protocol.py +11 -12
  41. natural_pdf/selectors/parser.py +219 -143
  42. natural_pdf/utils/debug.py +3 -3
  43. natural_pdf/utils/identifiers.py +1 -1
  44. natural_pdf/utils/locks.py +1 -1
  45. natural_pdf/utils/packaging.py +8 -6
  46. natural_pdf/utils/text_extraction.py +24 -16
  47. natural_pdf/utils/tqdm_utils.py +18 -10
  48. natural_pdf/utils/visualization.py +18 -0
  49. natural_pdf/widgets/viewer.py +4 -25
  50. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +12 -3
  51. natural_pdf-0.1.9.dist-info/RECORD +80 -0
  52. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
  53. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
  54. docs/api/index.md +0 -386
  55. docs/assets/favicon.png +0 -3
  56. docs/assets/favicon.svg +0 -3
  57. docs/assets/javascripts/custom.js +0 -17
  58. docs/assets/logo.svg +0 -3
  59. docs/assets/sample-screen.png +0 -0
  60. docs/assets/social-preview.png +0 -17
  61. docs/assets/social-preview.svg +0 -17
  62. docs/assets/stylesheets/custom.css +0 -65
  63. docs/categorizing-documents/index.md +0 -168
  64. docs/data-extraction/index.md +0 -87
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -969
  68. docs/element-selection/index.md +0 -249
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -189
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -256
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -417
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -152
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -119
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -275
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -337
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -293
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -414
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -513
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2439
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -517
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -3712
  112. docs/tutorials/12-ocr-integration.md +0 -137
  113. docs/tutorials/13-semantic-search.ipynb +0 -1718
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.8.dist-info/RECORD +0 -156
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
@@ -1,28 +1,37 @@
1
1
  import logging
2
- from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
2
+ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union, overload
3
3
 
4
4
  from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
5
5
 
6
6
  # New Imports
7
7
  from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
8
8
 
9
+ from natural_pdf.analyzers.layout.pdfplumber_table_finder import find_text_based_tables
10
+ from natural_pdf.classification.manager import ClassificationManager # Keep for type hint
11
+
12
+ # --- Classification Imports --- #
13
+ from natural_pdf.classification.mixin import ClassificationMixin
9
14
  from natural_pdf.elements.base import DirectionalMixin
15
+ from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
16
+ from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
17
+ from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
18
+ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
10
19
 
11
20
  # Import new utils
12
21
  from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
13
22
 
14
- from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
23
+ # --- NEW: Import tqdm utility --- #
24
+ from natural_pdf.utils.tqdm_utils import get_tqdm
15
25
 
16
- # --- Classification Imports --- #
17
- from natural_pdf.classification.mixin import ClassificationMixin
18
- from natural_pdf.classification.manager import ClassificationManager # Keep for type hint
19
26
  # --- End Classification Imports --- #
20
27
 
21
- from natural_pdf.utils.locks import pdf_render_lock # Import the lock
22
- from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
23
28
 
24
29
  if TYPE_CHECKING:
30
+ # --- NEW: Add Image type hint for classification --- #
31
+ from PIL.Image import Image
32
+
25
33
  from natural_pdf.core.page import Page
34
+ from natural_pdf.elements.collections import ElementCollection
26
35
  from natural_pdf.elements.text import TextElement
27
36
 
28
37
  # Import OCRManager conditionally to avoid circular imports
@@ -68,7 +77,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
68
77
  # --- ADDED --- Metadata store for mixins
69
78
  self.metadata: Dict[str, Any] = {}
70
79
  # --- NEW --- Central registry for analysis results
71
- self.analyses: Dict[str, Any] = {}
80
+ self.analyses: Dict[str, Any] = {}
72
81
  # --- END ADDED ---
73
82
 
74
83
  # Standard attributes for all elements
@@ -504,9 +513,37 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
504
513
 
505
514
  return inside
506
515
 
516
+ def is_element_center_inside(self, element: "Element") -> bool:
517
+ """
518
+ Check if the center point of an element's bounding box is inside this region.
519
+
520
+ Args:
521
+ element: Element to check
522
+
523
+ Returns:
524
+ True if the element's center point is inside the region, False otherwise.
525
+ """
526
+ # Check if element is on the same page
527
+ if not hasattr(element, "page") or element.page != self._page:
528
+ return False
529
+
530
+ # Ensure element has necessary attributes
531
+ if not all(hasattr(element, attr) for attr in ["x0", "x1", "top", "bottom"]):
532
+ logger.warning(
533
+ f"Element {element} lacks bounding box attributes. Cannot check center point."
534
+ )
535
+ return False # Cannot determine position
536
+
537
+ # Calculate center point
538
+ center_x = (element.x0 + element.x1) / 2
539
+ center_y = (element.top + element.bottom) / 2
540
+
541
+ # Use the existing is_point_inside check
542
+ return self.is_point_inside(center_x, center_y)
543
+
507
544
  def _is_element_in_region(self, element: "Element", use_boundary_tolerance=True) -> bool:
508
545
  """
509
- Check if an element is within this region.
546
+ Check if an element intersects or is contained within this region.
510
547
 
511
548
  Args:
512
549
  element: Element to check
@@ -523,16 +560,101 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
523
560
  if not hasattr(element, "page") or element.page != self._page:
524
561
  return False
525
562
 
526
- # Calculate element center
563
+ return self.is_element_center_inside(element)
564
+ # return self.intersects(element)
565
+
566
+ def contains(self, element: "Element") -> bool:
567
+ """
568
+ Check if this region completely contains an element.
569
+
570
+ Args:
571
+ element: Element to check
572
+
573
+ Returns:
574
+ True if the element is completely contained within the region, False otherwise
575
+ """
576
+ # Check if element is on the same page
577
+ if not hasattr(element, "page") or element.page != self._page:
578
+ return False
579
+
580
+ # Ensure element has necessary attributes
581
+ if not all(hasattr(element, attr) for attr in ["x0", "x1", "top", "bottom"]):
582
+ return False # Cannot determine position
583
+
584
+ # For rectangular regions, check if element's bbox is fully inside region's bbox
585
+ if not self.has_polygon:
586
+ return (
587
+ self.x0 <= element.x0
588
+ and element.x1 <= self.x1
589
+ and self.top <= element.top
590
+ and element.bottom <= self.bottom
591
+ )
592
+
593
+ # For polygon regions, check if all corners of the element are inside the polygon
594
+ element_corners = [
595
+ (element.x0, element.top), # top-left
596
+ (element.x1, element.top), # top-right
597
+ (element.x1, element.bottom), # bottom-right
598
+ (element.x0, element.bottom), # bottom-left
599
+ ]
600
+
601
+ return all(self.is_point_inside(x, y) for x, y in element_corners)
602
+
603
+ def intersects(self, element: "Element") -> bool:
604
+ """
605
+ Check if this region intersects with an element (any overlap).
606
+
607
+ Args:
608
+ element: Element to check
609
+
610
+ Returns:
611
+ True if the element overlaps with the region at all, False otherwise
612
+ """
613
+ # Check if element is on the same page
614
+ if not hasattr(element, "page") or element.page != self._page:
615
+ return False
616
+
527
617
  # Ensure element has necessary attributes
528
618
  if not all(hasattr(element, attr) for attr in ["x0", "x1", "top", "bottom"]):
529
619
  return False # Cannot determine position
530
620
 
531
- element_center_x = (element.x0 + element.x1) / 2
532
- element_center_y = (element.top + element.bottom) / 2
621
+ # For rectangular regions, check for bbox overlap
622
+ if not self.has_polygon:
623
+ return (
624
+ self.x0 < element.x1
625
+ and self.x1 > element.x0
626
+ and self.top < element.bottom
627
+ and self.bottom > element.top
628
+ )
533
629
 
534
- # Check if center point is inside the region's geometry
535
- return self.is_point_inside(element_center_x, element_center_y)
630
+ # For polygon regions, check if any corner of the element is inside the polygon
631
+ element_corners = [
632
+ (element.x0, element.top), # top-left
633
+ (element.x1, element.top), # top-right
634
+ (element.x1, element.bottom), # bottom-right
635
+ (element.x0, element.bottom), # bottom-left
636
+ ]
637
+
638
+ # First check if any element corner is inside the polygon
639
+ if any(self.is_point_inside(x, y) for x, y in element_corners):
640
+ return True
641
+
642
+ # Also check if any polygon corner is inside the element's rectangle
643
+ for x, y in self.polygon:
644
+ if element.x0 <= x <= element.x1 and element.top <= y <= element.bottom:
645
+ return True
646
+
647
+ # Also check if any polygon edge intersects with any rectangle edge
648
+ # This is a simplification - for complex cases, we'd need a full polygon-rectangle
649
+ # intersection algorithm
650
+
651
+ # For now, return True if bounding boxes overlap (approximation for polygon-rectangle case)
652
+ return (
653
+ self.x0 < element.x1
654
+ and self.x1 > element.x0
655
+ and self.top < element.bottom
656
+ and self.bottom > element.top
657
+ )
536
658
 
537
659
  def highlight(
538
660
  self,
@@ -616,15 +738,15 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
616
738
 
617
739
  # Ensure coords are valid for cropping (left < right, top < bottom)
618
740
  if x0 >= x1:
619
- logger.warning(
620
- f"Region {self.bbox} resulted in non-positive width after scaling ({x0} >= {x1}). Cannot create image."
621
- )
622
- return None
741
+ logger.warning(
742
+ f"Region {self.bbox} resulted in non-positive width after scaling ({x0} >= {x1}). Cannot create image."
743
+ )
744
+ return None
623
745
  if top >= bottom:
624
- logger.warning(
625
- f"Region {self.bbox} resulted in non-positive height after scaling ({top} >= {bottom}). Cannot create image."
626
- )
627
- return None
746
+ logger.warning(
747
+ f"Region {self.bbox} resulted in non-positive height after scaling ({top} >= {bottom}). Cannot create image."
748
+ )
749
+ return None
628
750
 
629
751
  # Crop the image to just this region
630
752
  region_image = page_image.crop((x0, top, x1, bottom))
@@ -850,7 +972,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
850
972
  result = generate_text_layout(
851
973
  char_dicts=filtered_chars,
852
974
  layout_context_bbox=self.bbox, # Use region's bbox for context
853
- user_kwargs=kwargs, # Pass original kwargs to layout generator
975
+ user_kwargs=kwargs, # Pass original kwargs to layout generator
854
976
  )
855
977
 
856
978
  logger.debug(f"Region {self.bbox}: extract_text finished, result length: {len(result)}.")
@@ -858,40 +980,65 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
858
980
 
859
981
  def extract_table(
860
982
  self,
861
- method: str = None,
862
- table_settings: dict = None,
983
+ method: Optional[str] = None, # Make method optional
984
+ table_settings: Optional[dict] = None, # Use Optional
863
985
  use_ocr: bool = False,
864
- ocr_config: dict = None,
865
- ) -> List[List[str]]:
986
+ ocr_config: Optional[dict] = None, # Use Optional
987
+ text_options: Optional[Dict] = None,
988
+ cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
989
+ # --- NEW: Add tqdm control option --- #
990
+ show_progress: bool = False, # Controls progress bar for text method
991
+ ) -> List[List[Optional[str]]]: # Return type allows Optional[str] for cells
866
992
  """
867
993
  Extract a table from this region.
868
994
 
869
995
  Args:
870
- method: Method to use for extraction ('tatr', 'plumber', or None for auto-detection)
871
- table_settings: Settings for pdfplumber table extraction (used only with 'plumber' method)
872
- use_ocr: Whether to use OCR for text extraction (only applicable with 'tatr' method)
873
- ocr_config: OCR configuration parameters
996
+ method: Method to use: 'tatr', 'plumber', 'text', or None (auto-detect).
997
+ table_settings: Settings for pdfplumber table extraction (used only with 'plumber' method).
998
+ use_ocr: Whether to use OCR for text extraction (currently only applicable with 'tatr' method).
999
+ ocr_config: OCR configuration parameters.
1000
+ text_options: Dictionary of options for the 'text' method, corresponding to arguments
1001
+ of analyze_text_table_structure (e.g., snap_tolerance, expand_bbox).
1002
+ cell_extraction_func: Optional callable function that takes a cell Region object
1003
+ and returns its string content. Overrides default text extraction
1004
+ for the 'text' method.
1005
+ show_progress: If True, display a progress bar during cell text extraction for the 'text' method.
874
1006
 
875
1007
  Returns:
876
- Table data as a list of rows, where each row is a list of cell values
1008
+ Table data as a list of rows, where each row is a list of cell values (str or None).
877
1009
  """
878
1010
  # Default settings if none provided
879
1011
  if table_settings is None:
880
1012
  table_settings = {}
1013
+ if text_options is None:
1014
+ text_options = {} # Initialize empty dict
881
1015
 
882
1016
  # Auto-detect method if not specified
883
- if method is None:
1017
+ effective_method = method
1018
+ if effective_method is None:
884
1019
  # If this is a TATR-detected region, use TATR method
885
1020
  if hasattr(self, "model") and self.model == "tatr" and self.region_type == "table":
886
- method = "tatr"
1021
+ effective_method = "tatr"
887
1022
  else:
888
- method = "plumber"
1023
+ effective_method = "text"
1024
+
1025
+ logger.debug(f"Region {self.bbox}: Extracting table using method '{effective_method}'")
889
1026
 
890
1027
  # Use the selected method
891
- if method == "tatr":
1028
+ if effective_method == "tatr":
892
1029
  return self._extract_table_tatr(use_ocr=use_ocr, ocr_config=ocr_config)
893
- else: # Default to pdfplumber
1030
+ elif effective_method == "text":
1031
+ current_text_options = text_options.copy()
1032
+ current_text_options["cell_extraction_func"] = cell_extraction_func
1033
+ # --- Pass show_progress to the helper --- #
1034
+ current_text_options["show_progress"] = show_progress
1035
+ return self._extract_table_text(**current_text_options)
1036
+ elif effective_method == "plumber":
894
1037
  return self._extract_table_plumber(table_settings)
1038
+ else:
1039
+ raise ValueError(
1040
+ f"Unknown table extraction method: '{effective_method}'. Choose from 'tatr', 'plumber', 'text'."
1041
+ )
895
1042
 
896
1043
  def _extract_table_plumber(self, table_settings: dict) -> List[List[str]]:
897
1044
  """
@@ -1052,46 +1199,273 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1052
1199
 
1053
1200
  return table_data
1054
1201
 
1055
- def find(self, selector: str, apply_exclusions=True, **kwargs) -> Optional["Element"]:
1202
+ def _extract_table_text(self, **text_options) -> List[List[Optional[str]]]:
1056
1203
  """
1057
- Find the first element in this region matching the selector.
1204
+ Extracts table content based on text alignment analysis.
1058
1205
 
1059
1206
  Args:
1060
- selector: CSS-like selector string
1061
- apply_exclusions: Whether to apply exclusion regions
1062
- **kwargs: Additional parameters for element filtering
1207
+ **text_options: Options passed to analyze_text_table_structure,
1208
+ plus optional 'cell_extraction_func', 'coordinate_grouping_tolerance',
1209
+ and 'show_progress'.
1210
+
1211
+ Returns:
1212
+ Table data as list of lists of strings (or None for empty cells).
1213
+ """
1214
+ cell_extraction_func = text_options.pop("cell_extraction_func", None)
1215
+ # --- Get show_progress option --- #
1216
+ show_progress = text_options.pop("show_progress", False)
1217
+
1218
+ # Analyze structure first (or use cached results)
1219
+ if "text_table_structure" in self.analyses:
1220
+ analysis_results = self.analyses["text_table_structure"]
1221
+ logger.debug("Using cached text table structure analysis results.")
1222
+ else:
1223
+ analysis_results = self.analyze_text_table_structure(**text_options)
1224
+
1225
+ if analysis_results is None or not analysis_results.get("cells"):
1226
+ logger.warning(f"Region {self.bbox}: No cells found using 'text' method.")
1227
+ return []
1228
+
1229
+ cell_dicts = analysis_results["cells"]
1230
+
1231
+ # --- Grid Reconstruction Logic --- #
1232
+ if not cell_dicts:
1233
+ return []
1234
+
1235
+ # 1. Get unique sorted top and left coordinates (cell boundaries)
1236
+ coord_tolerance = text_options.get("coordinate_grouping_tolerance", 1)
1237
+ tops = sorted(
1238
+ list(set(round(c["top"] / coord_tolerance) * coord_tolerance for c in cell_dicts))
1239
+ )
1240
+ lefts = sorted(
1241
+ list(set(round(c["left"] / coord_tolerance) * coord_tolerance for c in cell_dicts))
1242
+ )
1243
+
1244
+ # Refine boundaries (cluster_coords helper remains the same)
1245
+ def cluster_coords(coords):
1246
+ if not coords:
1247
+ return []
1248
+ clustered = []
1249
+ current_cluster = [coords[0]]
1250
+ for c in coords[1:]:
1251
+ if abs(c - current_cluster[-1]) <= coord_tolerance:
1252
+ current_cluster.append(c)
1253
+ else:
1254
+ clustered.append(min(current_cluster))
1255
+ current_cluster = [c]
1256
+ clustered.append(min(current_cluster))
1257
+ return clustered
1258
+
1259
+ unique_tops = cluster_coords(tops)
1260
+ unique_lefts = cluster_coords(lefts)
1261
+
1262
+ # --- Setup tqdm --- #
1263
+ tqdm = get_tqdm()
1264
+ # Determine iterable for tqdm
1265
+ cell_iterator = cell_dicts
1266
+ if show_progress:
1267
+ # Only wrap if progress should be shown
1268
+ cell_iterator = tqdm(
1269
+ cell_dicts,
1270
+ desc=f"Extracting text from {len(cell_dicts)} cells (text method)",
1271
+ unit="cell",
1272
+ leave=False, # Optional: Keep bar after completion
1273
+ )
1274
+ # --- End tqdm Setup --- #
1275
+
1276
+ # 2. Create a lookup map for cell text: {(rounded_top, rounded_left): cell_text}
1277
+ cell_text_map = {}
1278
+ # --- Use the potentially wrapped iterator --- #
1279
+ for cell_data in cell_iterator:
1280
+ try:
1281
+ cell_region = self.page.region(**cell_data)
1282
+ cell_value = None # Initialize
1283
+ if callable(cell_extraction_func):
1284
+ try:
1285
+ cell_value = cell_extraction_func(cell_region)
1286
+ if not isinstance(cell_value, (str, type(None))):
1287
+ logger.warning(
1288
+ f"Custom cell_extraction_func returned non-string/None type ({type(cell_value)}) for cell {cell_data}. Treating as None."
1289
+ )
1290
+ cell_value = None
1291
+ except Exception as func_err:
1292
+ logger.error(
1293
+ f"Error executing custom cell_extraction_func for cell {cell_data}: {func_err}",
1294
+ exc_info=True,
1295
+ )
1296
+ cell_value = None
1297
+ else:
1298
+ cell_value = cell_region.extract_text(
1299
+ layout=False, apply_exclusions=False
1300
+ ).strip()
1301
+
1302
+ rounded_top = round(cell_data["top"] / coord_tolerance) * coord_tolerance
1303
+ rounded_left = round(cell_data["left"] / coord_tolerance) * coord_tolerance
1304
+ cell_text_map[(rounded_top, rounded_left)] = cell_value
1305
+ except Exception as e:
1306
+ logger.warning(f"Could not process cell {cell_data} for text extraction: {e}")
1307
+
1308
+ # 3. Build the final list-of-lists table (loop remains the same)
1309
+ final_table = []
1310
+ for row_top in unique_tops:
1311
+ row_data = []
1312
+ for col_left in unique_lefts:
1313
+ best_match_key = None
1314
+ min_dist_sq = float("inf")
1315
+ for map_top, map_left in cell_text_map.keys():
1316
+ if (
1317
+ abs(map_top - row_top) <= coord_tolerance
1318
+ and abs(map_left - col_left) <= coord_tolerance
1319
+ ):
1320
+ dist_sq = (map_top - row_top) ** 2 + (map_left - col_left) ** 2
1321
+ if dist_sq < min_dist_sq:
1322
+ min_dist_sq = dist_sq
1323
+ best_match_key = (map_top, map_left)
1324
+ cell_value = cell_text_map.get(best_match_key)
1325
+ row_data.append(cell_value)
1326
+ final_table.append(row_data)
1327
+
1328
+ return final_table
1329
+
1330
+ # --- END MODIFIED METHOD --- #
1331
+
1332
+ @overload
1333
+ def find(
1334
+ self,
1335
+ *,
1336
+ text: str,
1337
+ apply_exclusions: bool = True,
1338
+ regex: bool = False,
1339
+ case: bool = True,
1340
+ **kwargs,
1341
+ ) -> Optional["Element"]: ...
1342
+
1343
+ @overload
1344
+ def find(
1345
+ self,
1346
+ selector: str,
1347
+ *,
1348
+ apply_exclusions: bool = True,
1349
+ regex: bool = False,
1350
+ case: bool = True,
1351
+ **kwargs,
1352
+ ) -> Optional["Element"]: ...
1353
+
1354
+ def find(
1355
+ self,
1356
+ selector: Optional[str] = None, # Now optional
1357
+ *,
1358
+ text: Optional[str] = None, # New text parameter
1359
+ apply_exclusions: bool = True,
1360
+ regex: bool = False,
1361
+ case: bool = True,
1362
+ **kwargs,
1363
+ ) -> Optional["Element"]:
1364
+ """
1365
+ Find the first element in this region matching the selector OR text content.
1366
+
1367
+ Provide EITHER `selector` OR `text`, but not both.
1368
+
1369
+ Args:
1370
+ selector: CSS-like selector string.
1371
+ text: Text content to search for (equivalent to 'text:contains(...)').
1372
+ apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
1373
+ regex: Whether to use regex for text search (`selector` or `text`) (default: False).
1374
+ case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
1375
+ **kwargs: Additional parameters for element filtering.
1063
1376
 
1064
1377
  Returns:
1065
- First matching element or None
1378
+ First matching element or None.
1066
1379
  """
1067
- elements = self.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
1068
- return elements.first if elements else None # Use .first property
1380
+ # Delegate validation and selector construction to find_all
1381
+ elements = self.find_all(
1382
+ selector=selector,
1383
+ text=text,
1384
+ apply_exclusions=apply_exclusions,
1385
+ regex=regex,
1386
+ case=case,
1387
+ **kwargs,
1388
+ )
1389
+ return elements.first if elements else None
1390
+
1391
+ @overload
1392
+ def find_all(
1393
+ self,
1394
+ *,
1395
+ text: str,
1396
+ apply_exclusions: bool = True,
1397
+ regex: bool = False,
1398
+ case: bool = True,
1399
+ **kwargs,
1400
+ ) -> "ElementCollection": ...
1401
+
1402
+ @overload
1403
+ def find_all(
1404
+ self,
1405
+ selector: str,
1406
+ *,
1407
+ apply_exclusions: bool = True,
1408
+ regex: bool = False,
1409
+ case: bool = True,
1410
+ **kwargs,
1411
+ ) -> "ElementCollection": ...
1069
1412
 
1070
1413
  def find_all(
1071
- self, selector: str, apply_exclusions=True, **kwargs
1072
- ) -> "ElementCollection": # Changed from _find_all
1414
+ self,
1415
+ selector: Optional[str] = None, # Now optional
1416
+ *,
1417
+ text: Optional[str] = None, # New text parameter
1418
+ apply_exclusions: bool = True,
1419
+ regex: bool = False,
1420
+ case: bool = True,
1421
+ **kwargs,
1422
+ ) -> "ElementCollection":
1073
1423
  """
1074
- Find all elements in this region matching the selector.
1424
+ Find all elements in this region matching the selector OR text content.
1425
+
1426
+ Provide EITHER `selector` OR `text`, but not both.
1075
1427
 
1076
1428
  Args:
1077
- selector: CSS-like selector string
1078
- apply_exclusions: Whether to apply exclusion regions
1079
- **kwargs: Additional parameters for element filtering
1429
+ selector: CSS-like selector string.
1430
+ text: Text content to search for (equivalent to 'text:contains(...)').
1431
+ apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
1432
+ regex: Whether to use regex for text search (`selector` or `text`) (default: False).
1433
+ case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
1434
+ **kwargs: Additional parameters for element filtering.
1080
1435
 
1081
1436
  Returns:
1082
- ElementCollection with matching elements
1437
+ ElementCollection with matching elements.
1083
1438
  """
1084
1439
  from natural_pdf.elements.collections import ElementCollection
1085
1440
 
1441
+ if selector is not None and text is not None:
1442
+ raise ValueError("Provide either 'selector' or 'text', not both.")
1443
+ if selector is None and text is None:
1444
+ raise ValueError("Provide either 'selector' or 'text'.")
1445
+
1446
+ # Construct selector if 'text' is provided
1447
+ effective_selector = ""
1448
+ if text is not None:
1449
+ escaped_text = text.replace('"', '\\"').replace("'", "\\'")
1450
+ effective_selector = f'text:contains("{escaped_text}")'
1451
+ logger.debug(
1452
+ f"Using text shortcut: find_all(text='{text}') -> find_all('{effective_selector}')"
1453
+ )
1454
+ elif selector is not None:
1455
+ effective_selector = selector
1456
+ else:
1457
+ raise ValueError("Internal error: No selector or text provided.")
1458
+
1086
1459
  # If we span multiple pages, filter our elements
1087
1460
  # TODO: Revisit multi-page region logic
1088
1461
  if self._spans_pages and self._multi_page_elements is not None:
1089
1462
  logger.warning("find_all on multi-page regions is not fully implemented.")
1090
1463
  # Temporary: Apply filter directly to cached elements
1091
- from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
1092
-
1093
1464
  try:
1094
- selector_obj = parse_selector(selector)
1465
+ selector_obj = parse_selector(effective_selector)
1466
+ # Pass regex/case flags down
1467
+ kwargs["regex"] = regex
1468
+ kwargs["case"] = case
1095
1469
  filter_func = selector_to_filter_func(selector_obj, **kwargs)
1096
1470
  matching = [el for el in self._multi_page_elements if filter_func(el)]
1097
1471
  return ElementCollection(matching)
@@ -1099,11 +1473,37 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1099
1473
  logger.error(f"Error applying selector to multi-page region elements: {e}")
1100
1474
  return ElementCollection([])
1101
1475
 
1102
- # Otherwise, get elements from the page and filter by selector and region
1103
- page_elements = self.page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
1104
- # Use the precise _is_element_in_region check
1105
- filtered_elements = [e for e in page_elements if self._is_element_in_region(e)]
1106
- return ElementCollection(filtered_elements)
1476
+ # Normal case: Region is on a single page
1477
+ try:
1478
+ # Parse the final selector string
1479
+ selector_obj = parse_selector(effective_selector)
1480
+
1481
+ # Get all potentially relevant elements from the page
1482
+ # Let the page handle its exclusion logic if needed
1483
+ potential_elements = self.page.find_all(
1484
+ selector=effective_selector,
1485
+ apply_exclusions=False, # Apply exclusions LATER based on region bbox
1486
+ regex=regex,
1487
+ case=case,
1488
+ **kwargs,
1489
+ )
1490
+
1491
+ # Filter these elements to those strictly within the region's bounds
1492
+ region_bbox = self.bbox
1493
+ matching_elements = [
1494
+ el
1495
+ for el in potential_elements
1496
+ if el.x0 >= region_bbox[0]
1497
+ and el.top >= region_bbox[1]
1498
+ and el.x1 <= region_bbox[2]
1499
+ and el.bottom <= region_bbox[3]
1500
+ ]
1501
+
1502
+ return ElementCollection(matching_elements)
1503
+
1504
+ except Exception as e:
1505
+ logger.error(f"Error during find_all in region: {e}", exc_info=True)
1506
+ return ElementCollection([])
1107
1507
 
1108
1508
  def apply_ocr(self, replace=True, **ocr_params) -> "Region":
1109
1509
  """
@@ -1111,7 +1511,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1111
1511
 
1112
1512
  Args:
1113
1513
  replace: If True (default), removes existing OCR elements in the region
1114
- before adding new ones. If False, adds new OCR elements without
1514
+ before adding new ones. If False, adds new OCR elements without
1115
1515
  removing existing ones.
1116
1516
  **ocr_params: Keyword arguments passed to the OCR Manager.
1117
1517
  Common parameters like `engine`, `languages`, `min_confidence`,
@@ -1131,13 +1531,17 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1131
1531
 
1132
1532
  # If replace is True, find and remove existing OCR elements in this region
1133
1533
  if replace:
1134
- logger.info(f"Region {self.bbox}: Removing existing OCR elements before applying new OCR.")
1534
+ logger.info(
1535
+ f"Region {self.bbox}: Removing existing OCR elements before applying new OCR."
1536
+ )
1135
1537
  # Find all OCR elements in this region
1136
1538
  ocr_selector = "text[source=ocr]"
1137
1539
  ocr_elements = self.find_all(ocr_selector)
1138
-
1540
+
1139
1541
  if ocr_elements:
1140
- logger.info(f"Region {self.bbox}: Found {len(ocr_elements)} existing OCR elements to remove.")
1542
+ logger.info(
1543
+ f"Region {self.bbox}: Found {len(ocr_elements)} existing OCR elements to remove."
1544
+ )
1141
1545
  # Remove these elements from their page
1142
1546
  removed_count = ocr_elements.remove()
1143
1547
  logger.info(f"Region {self.bbox}: Removed {removed_count} OCR elements.")
@@ -1661,8 +2065,6 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1661
2065
  return self.child_regions
1662
2066
 
1663
2067
  # Use existing selector parser to filter
1664
- from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
1665
-
1666
2068
  try:
1667
2069
  selector_obj = parse_selector(selector)
1668
2070
  filter_func = selector_to_filter_func(selector_obj) # Removed region=self
@@ -1703,8 +2105,6 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1703
2105
 
1704
2106
  # Filter by selector if provided
1705
2107
  if selector is not None:
1706
- from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
1707
-
1708
2108
  try:
1709
2109
  selector_obj = parse_selector(selector)
1710
2110
  filter_func = selector_to_filter_func(selector_obj) # Removed region=self
@@ -1717,11 +2117,6 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1717
2117
 
1718
2118
  return all_descendants
1719
2119
 
1720
- # Removed recursive=True, find_all on region shouldn't be recursive by default
1721
- # Renamed _find_all back to find_all
1722
- # def find_all(self, selector, apply_exclusions=True, **kwargs):
1723
- # See implementation above near get_elements
1724
-
1725
2120
  def __repr__(self) -> str:
1726
2121
  """String representation of the region."""
1727
2122
  poly_info = " (Polygon)" if self.has_polygon else ""
@@ -1772,44 +2167,218 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
1772
2167
 
1773
2168
  # --- Classification Mixin Implementation --- #
1774
2169
  def _get_classification_manager(self) -> "ClassificationManager":
1775
- if not hasattr(self, 'page') or not hasattr(self.page, 'pdf') or not hasattr(self.page.pdf, 'get_manager'):
1776
- raise AttributeError("ClassificationManager cannot be accessed: Parent Page, PDF, or get_manager method missing.")
2170
+ if (
2171
+ not hasattr(self, "page")
2172
+ or not hasattr(self.page, "pdf")
2173
+ or not hasattr(self.page.pdf, "get_manager")
2174
+ ):
2175
+ raise AttributeError(
2176
+ "ClassificationManager cannot be accessed: Parent Page, PDF, or get_manager method missing."
2177
+ )
1777
2178
  try:
1778
- # Use the PDF's manager registry accessor via page
1779
- return self.page.pdf.get_manager('classification')
2179
+ # Use the PDF's manager registry accessor via page
2180
+ return self.page.pdf.get_manager("classification")
1780
2181
  except (ValueError, RuntimeError, AttributeError) as e:
1781
- # Wrap potential errors from get_manager for clarity
1782
- raise AttributeError(f"Failed to get ClassificationManager from PDF via Page: {e}") from e
1783
-
1784
- def _get_classification_content(self, model_type: str, **kwargs) -> Union[str, "Image"]: # Use "Image" for lazy import
1785
- if model_type == 'text':
1786
- text_content = self.extract_text(layout=False) # Simple join for classification
2182
+ # Wrap potential errors from get_manager for clarity
2183
+ raise AttributeError(
2184
+ f"Failed to get ClassificationManager from PDF via Page: {e}"
2185
+ ) from e
2186
+
2187
+ def _get_classification_content(
2188
+ self, model_type: str, **kwargs
2189
+ ) -> Union[str, "Image"]: # Use "Image" for lazy import
2190
+ if model_type == "text":
2191
+ text_content = self.extract_text(layout=False) # Simple join for classification
1787
2192
  if not text_content or text_content.isspace():
1788
2193
  raise ValueError("Cannot classify region with 'text' model: No text content found.")
1789
2194
  return text_content
1790
- elif model_type == 'vision':
2195
+ elif model_type == "vision":
1791
2196
  # Get resolution from manager/kwargs if possible, else default
1792
2197
  # We access manager via the method to ensure it's available
1793
2198
  manager = self._get_classification_manager()
1794
- default_resolution = 150 # Manager doesn't store default res, set here
2199
+ default_resolution = 150 # Manager doesn't store default res, set here
1795
2200
  # Note: classify() passes resolution via **kwargs if user specifies
1796
- resolution = kwargs.get('resolution', default_resolution) if 'kwargs' in locals() else default_resolution
2201
+ resolution = (
2202
+ kwargs.get("resolution", default_resolution)
2203
+ if "kwargs" in locals()
2204
+ else default_resolution
2205
+ )
1797
2206
 
1798
2207
  img = self.to_image(
1799
2208
  resolution=resolution,
1800
- include_highlights=False, # No highlights for classification input
1801
- crop_only=True # Just the region content
2209
+ include_highlights=False, # No highlights for classification input
2210
+ crop_only=True, # Just the region content
1802
2211
  )
1803
2212
  if img is None:
1804
- raise ValueError("Cannot classify region with 'vision' model: Failed to render image.")
2213
+ raise ValueError(
2214
+ "Cannot classify region with 'vision' model: Failed to render image."
2215
+ )
1805
2216
  return img
1806
2217
  else:
1807
2218
  raise ValueError(f"Unsupported model_type for classification: {model_type}")
1808
2219
 
1809
2220
  def _get_metadata_storage(self) -> Dict[str, Any]:
1810
2221
  # Ensure metadata exists
1811
- if not hasattr(self, 'metadata') or self.metadata is None:
2222
+ if not hasattr(self, "metadata") or self.metadata is None:
1812
2223
  self.metadata = {}
1813
2224
  return self.metadata
1814
2225
 
1815
2226
  # --- End Classification Mixin Implementation --- #
2227
+
2228
+ # --- NEW METHOD: analyze_text_table_structure ---
2229
+ def analyze_text_table_structure(
2230
+ self,
2231
+ snap_tolerance: int = 10,
2232
+ join_tolerance: int = 3,
2233
+ min_words_vertical: int = 3,
2234
+ min_words_horizontal: int = 1,
2235
+ intersection_tolerance: int = 3,
2236
+ expand_bbox: Optional[Dict[str, int]] = None,
2237
+ **kwargs,
2238
+ ) -> Optional[Dict]:
2239
+ """
2240
+ Analyzes the text elements within the region (or slightly expanded area)
2241
+ to find potential table structure (lines, cells) using text alignment logic
2242
+ adapted from pdfplumber.
2243
+
2244
+ Args:
2245
+ snap_tolerance: Tolerance for snapping parallel lines.
2246
+ join_tolerance: Tolerance for joining collinear lines.
2247
+ min_words_vertical: Minimum words needed to define a vertical line.
2248
+ min_words_horizontal: Minimum words needed to define a horizontal line.
2249
+ intersection_tolerance: Tolerance for detecting line intersections.
2250
+ expand_bbox: Optional dictionary to expand the search area slightly beyond
2251
+ the region's exact bounds (e.g., {'left': 5, 'right': 5}).
2252
+ **kwargs: Additional keyword arguments passed to
2253
+ find_text_based_tables (e.g., specific x/y tolerances).
2254
+
2255
+ Returns:
2256
+ A dictionary containing 'horizontal_edges', 'vertical_edges', 'cells' (list of dicts),
2257
+ and 'intersections', or None if pdfplumber is unavailable or an error occurs.
2258
+ """
2259
+
2260
+ # Determine the search region (expand if requested)
2261
+ search_region = self
2262
+ if expand_bbox and isinstance(expand_bbox, dict):
2263
+ try:
2264
+ search_region = self.expand(**expand_bbox)
2265
+ logger.debug(
2266
+ f"Expanded search region for text table analysis to: {search_region.bbox}"
2267
+ )
2268
+ except Exception as e:
2269
+ logger.warning(f"Could not expand region bbox: {e}. Using original region.")
2270
+ search_region = self
2271
+
2272
+ # Find text elements within the search region
2273
+ text_elements = search_region.find_all(
2274
+ "text", apply_exclusions=False
2275
+ ) # Use unfiltered text
2276
+ if not text_elements:
2277
+ logger.info(f"Region {self.bbox}: No text elements found for text table analysis.")
2278
+ return {"horizontal_edges": [], "vertical_edges": [], "cells": [], "intersections": {}}
2279
+
2280
+ # Extract bounding boxes
2281
+ bboxes = [element.bbox for element in text_elements if hasattr(element, "bbox")]
2282
+ if not bboxes:
2283
+ logger.info(f"Region {self.bbox}: No bboxes extracted from text elements.")
2284
+ return {"horizontal_edges": [], "vertical_edges": [], "cells": [], "intersections": {}}
2285
+
2286
+ # Call the utility function
2287
+ try:
2288
+ analysis_results = find_text_based_tables(
2289
+ bboxes=bboxes,
2290
+ snap_tolerance=snap_tolerance,
2291
+ join_tolerance=join_tolerance,
2292
+ min_words_vertical=min_words_vertical,
2293
+ min_words_horizontal=min_words_horizontal,
2294
+ intersection_tolerance=intersection_tolerance,
2295
+ **kwargs, # Pass through any extra specific tolerance args
2296
+ )
2297
+ # Store results in the region's analyses cache
2298
+ self.analyses["text_table_structure"] = analysis_results
2299
+ return analysis_results
2300
+ except ImportError:
2301
+ logger.error("pdfplumber library is required for 'text' table analysis but not found.")
2302
+ return None
2303
+ except Exception as e:
2304
+ logger.error(f"Error during text-based table analysis: {e}", exc_info=True)
2305
+ return None
2306
+
2307
+ # --- END NEW METHOD ---
2308
+
2309
+ # --- NEW METHOD: get_text_table_cells ---
2310
+ def get_text_table_cells(
2311
+ self,
2312
+ snap_tolerance: int = 10,
2313
+ join_tolerance: int = 3,
2314
+ min_words_vertical: int = 3,
2315
+ min_words_horizontal: int = 1,
2316
+ intersection_tolerance: int = 3,
2317
+ expand_bbox: Optional[Dict[str, int]] = None,
2318
+ **kwargs,
2319
+ ) -> "ElementCollection[Region]":
2320
+ """
2321
+ Analyzes text alignment to find table cells and returns them as
2322
+ temporary Region objects without adding them to the page.
2323
+
2324
+ Args:
2325
+ snap_tolerance: Tolerance for snapping parallel lines.
2326
+ join_tolerance: Tolerance for joining collinear lines.
2327
+ min_words_vertical: Minimum words needed to define a vertical line.
2328
+ min_words_horizontal: Minimum words needed to define a horizontal line.
2329
+ intersection_tolerance: Tolerance for detecting line intersections.
2330
+ expand_bbox: Optional dictionary to expand the search area slightly beyond
2331
+ the region's exact bounds (e.g., {'left': 5, 'right': 5}).
2332
+ **kwargs: Additional keyword arguments passed to
2333
+ find_text_based_tables (e.g., specific x/y tolerances).
2334
+
2335
+ Returns:
2336
+ An ElementCollection containing temporary Region objects for each detected cell,
2337
+ or an empty ElementCollection if no cells are found or an error occurs.
2338
+ """
2339
+ from natural_pdf.elements.collections import ElementCollection
2340
+
2341
+ # 1. Perform the analysis (or use cached results)
2342
+ if "text_table_structure" in self.analyses:
2343
+ analysis_results = self.analyses["text_table_structure"]
2344
+ logger.debug("get_text_table_cells: Using cached analysis results.")
2345
+ else:
2346
+ analysis_results = self.analyze_text_table_structure(
2347
+ snap_tolerance=snap_tolerance,
2348
+ join_tolerance=join_tolerance,
2349
+ min_words_vertical=min_words_vertical,
2350
+ min_words_horizontal=min_words_horizontal,
2351
+ intersection_tolerance=intersection_tolerance,
2352
+ expand_bbox=expand_bbox,
2353
+ **kwargs,
2354
+ )
2355
+
2356
+ # 2. Check if analysis was successful and cells were found
2357
+ if analysis_results is None or not analysis_results.get("cells"):
2358
+ logger.info(f"Region {self.bbox}: No cells found by text table analysis.")
2359
+ return ElementCollection([]) # Return empty collection
2360
+
2361
+ # 3. Create temporary Region objects for each cell dictionary
2362
+ cell_regions = []
2363
+ for cell_data in analysis_results["cells"]:
2364
+ try:
2365
+ # Use page.region to create the region object
2366
+ # It expects left, top, right, bottom keys
2367
+ cell_region = self.page.region(**cell_data)
2368
+
2369
+ # Set metadata on the temporary region
2370
+ cell_region.region_type = "table-cell"
2371
+ cell_region.normalized_type = "table-cell"
2372
+ cell_region.model = "pdfplumber-text"
2373
+ cell_region.source = "volatile" # Indicate it's not managed/persistent
2374
+ cell_region.parent_region = self # Link back to the region it came from
2375
+
2376
+ cell_regions.append(cell_region)
2377
+ except Exception as e:
2378
+ logger.warning(f"Could not create Region object for cell data {cell_data}: {e}")
2379
+
2380
+ # 4. Return the list wrapped in an ElementCollection
2381
+ logger.debug(f"get_text_table_cells: Created {len(cell_regions)} temporary cell regions.")
2382
+ return ElementCollection(cell_regions)
2383
+
2384
+ # --- END NEW METHOD ---