natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. natural_pdf/__init__.py +3 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +422 -0
  13. natural_pdf/classification/mixin.py +163 -0
  14. natural_pdf/classification/results.py +80 -0
  15. natural_pdf/collections/mixins.py +111 -0
  16. natural_pdf/collections/pdf_collection.py +434 -15
  17. natural_pdf/core/element_manager.py +83 -0
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +578 -93
  20. natural_pdf/core/pdf.py +912 -460
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +712 -109
  23. natural_pdf/elements/region.py +722 -69
  24. natural_pdf/elements/text.py +4 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +5 -4
  28. natural_pdf/extraction/manager.py +135 -0
  29. natural_pdf/extraction/mixin.py +279 -0
  30. natural_pdf/extraction/result.py +23 -0
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/engine_easyocr.py +6 -3
  34. natural_pdf/ocr/ocr_factory.py +24 -4
  35. natural_pdf/ocr/ocr_manager.py +122 -26
  36. natural_pdf/ocr/ocr_options.py +94 -11
  37. natural_pdf/ocr/utils.py +19 -6
  38. natural_pdf/qa/document_qa.py +0 -4
  39. natural_pdf/search/__init__.py +20 -34
  40. natural_pdf/search/haystack_search_service.py +309 -265
  41. natural_pdf/search/haystack_utils.py +99 -75
  42. natural_pdf/search/search_service_protocol.py +11 -12
  43. natural_pdf/selectors/parser.py +431 -230
  44. natural_pdf/utils/debug.py +3 -3
  45. natural_pdf/utils/identifiers.py +1 -1
  46. natural_pdf/utils/locks.py +8 -0
  47. natural_pdf/utils/packaging.py +8 -6
  48. natural_pdf/utils/text_extraction.py +60 -1
  49. natural_pdf/utils/tqdm_utils.py +51 -0
  50. natural_pdf/utils/visualization.py +18 -0
  51. natural_pdf/widgets/viewer.py +4 -25
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
  53. natural_pdf-0.1.9.dist-info/RECORD +80 -0
  54. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
  55. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
  56. docs/api/index.md +0 -386
  57. docs/assets/favicon.png +0 -3
  58. docs/assets/favicon.svg +0 -3
  59. docs/assets/javascripts/custom.js +0 -17
  60. docs/assets/logo.svg +0 -3
  61. docs/assets/sample-screen.png +0 -0
  62. docs/assets/social-preview.png +0 -17
  63. docs/assets/social-preview.svg +0 -17
  64. docs/assets/stylesheets/custom.css +0 -65
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -915
  68. docs/element-selection/index.md +0 -229
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -170
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -209
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -194
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -340
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -147
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -114
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -270
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -332
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -288
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -413
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -508
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2434
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -512
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -604
  112. docs/tutorials/12-ocr-integration.md +0 -175
  113. docs/tutorials/13-semantic-search.ipynb +0 -1328
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.7.dist-info/RECORD +0 -145
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
@@ -1,20 +1,37 @@
1
1
  import logging
2
- from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
2
+ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union, overload
3
3
 
4
4
  from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
5
5
 
6
6
  # New Imports
7
7
  from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
8
8
 
9
+ from natural_pdf.analyzers.layout.pdfplumber_table_finder import find_text_based_tables
10
+ from natural_pdf.classification.manager import ClassificationManager # Keep for type hint
11
+
12
+ # --- Classification Imports --- #
13
+ from natural_pdf.classification.mixin import ClassificationMixin
9
14
  from natural_pdf.elements.base import DirectionalMixin
15
+ from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
16
+ from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
17
+ from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
18
+ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
10
19
 
11
20
  # Import new utils
12
21
  from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
13
22
 
14
- from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
23
+ # --- NEW: Import tqdm utility --- #
24
+ from natural_pdf.utils.tqdm_utils import get_tqdm
25
+
26
+ # --- End Classification Imports --- #
27
+
15
28
 
16
29
  if TYPE_CHECKING:
30
+ # --- NEW: Add Image type hint for classification --- #
31
+ from PIL.Image import Image
32
+
17
33
  from natural_pdf.core.page import Page
34
+ from natural_pdf.elements.collections import ElementCollection
18
35
  from natural_pdf.elements.text import TextElement
19
36
 
20
37
  # Import OCRManager conditionally to avoid circular imports
@@ -27,7 +44,7 @@ except ImportError:
27
44
  logger = logging.getLogger(__name__)
28
45
 
29
46
 
30
- class Region(DirectionalMixin):
47
+ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
31
48
  """
32
49
  Represents a rectangular region on a page.
33
50
  """
@@ -57,6 +74,12 @@ class Region(DirectionalMixin):
57
74
  self.start_element = None
58
75
  self.end_element = None
59
76
 
77
+ # --- ADDED --- Metadata store for mixins
78
+ self.metadata: Dict[str, Any] = {}
79
+ # --- NEW --- Central registry for analysis results
80
+ self.analyses: Dict[str, Any] = {}
81
+ # --- END ADDED ---
82
+
60
83
  # Standard attributes for all elements
61
84
  self.object_type = "region" # For selector compatibility
62
85
 
@@ -490,9 +513,37 @@ class Region(DirectionalMixin):
490
513
 
491
514
  return inside
492
515
 
516
+ def is_element_center_inside(self, element: "Element") -> bool:
517
+ """
518
+ Check if the center point of an element's bounding box is inside this region.
519
+
520
+ Args:
521
+ element: Element to check
522
+
523
+ Returns:
524
+ True if the element's center point is inside the region, False otherwise.
525
+ """
526
+ # Check if element is on the same page
527
+ if not hasattr(element, "page") or element.page != self._page:
528
+ return False
529
+
530
+ # Ensure element has necessary attributes
531
+ if not all(hasattr(element, attr) for attr in ["x0", "x1", "top", "bottom"]):
532
+ logger.warning(
533
+ f"Element {element} lacks bounding box attributes. Cannot check center point."
534
+ )
535
+ return False # Cannot determine position
536
+
537
+ # Calculate center point
538
+ center_x = (element.x0 + element.x1) / 2
539
+ center_y = (element.top + element.bottom) / 2
540
+
541
+ # Use the existing is_point_inside check
542
+ return self.is_point_inside(center_x, center_y)
543
+
493
544
  def _is_element_in_region(self, element: "Element", use_boundary_tolerance=True) -> bool:
494
545
  """
495
- Check if an element is within this region.
546
+ Check if an element intersects or is contained within this region.
496
547
 
497
548
  Args:
498
549
  element: Element to check
@@ -509,16 +560,101 @@ class Region(DirectionalMixin):
509
560
  if not hasattr(element, "page") or element.page != self._page:
510
561
  return False
511
562
 
512
- # Calculate element center
563
+ return self.is_element_center_inside(element)
564
+ # return self.intersects(element)
565
+
566
+ def contains(self, element: "Element") -> bool:
567
+ """
568
+ Check if this region completely contains an element.
569
+
570
+ Args:
571
+ element: Element to check
572
+
573
+ Returns:
574
+ True if the element is completely contained within the region, False otherwise
575
+ """
576
+ # Check if element is on the same page
577
+ if not hasattr(element, "page") or element.page != self._page:
578
+ return False
579
+
513
580
  # Ensure element has necessary attributes
514
581
  if not all(hasattr(element, attr) for attr in ["x0", "x1", "top", "bottom"]):
515
582
  return False # Cannot determine position
516
583
 
517
- element_center_x = (element.x0 + element.x1) / 2
518
- element_center_y = (element.top + element.bottom) / 2
584
+ # For rectangular regions, check if element's bbox is fully inside region's bbox
585
+ if not self.has_polygon:
586
+ return (
587
+ self.x0 <= element.x0
588
+ and element.x1 <= self.x1
589
+ and self.top <= element.top
590
+ and element.bottom <= self.bottom
591
+ )
592
+
593
+ # For polygon regions, check if all corners of the element are inside the polygon
594
+ element_corners = [
595
+ (element.x0, element.top), # top-left
596
+ (element.x1, element.top), # top-right
597
+ (element.x1, element.bottom), # bottom-right
598
+ (element.x0, element.bottom), # bottom-left
599
+ ]
600
+
601
+ return all(self.is_point_inside(x, y) for x, y in element_corners)
602
+
603
+ def intersects(self, element: "Element") -> bool:
604
+ """
605
+ Check if this region intersects with an element (any overlap).
519
606
 
520
- # Check if center point is inside the region's geometry
521
- return self.is_point_inside(element_center_x, element_center_y)
607
+ Args:
608
+ element: Element to check
609
+
610
+ Returns:
611
+ True if the element overlaps with the region at all, False otherwise
612
+ """
613
+ # Check if element is on the same page
614
+ if not hasattr(element, "page") or element.page != self._page:
615
+ return False
616
+
617
+ # Ensure element has necessary attributes
618
+ if not all(hasattr(element, attr) for attr in ["x0", "x1", "top", "bottom"]):
619
+ return False # Cannot determine position
620
+
621
+ # For rectangular regions, check for bbox overlap
622
+ if not self.has_polygon:
623
+ return (
624
+ self.x0 < element.x1
625
+ and self.x1 > element.x0
626
+ and self.top < element.bottom
627
+ and self.bottom > element.top
628
+ )
629
+
630
+ # For polygon regions, check if any corner of the element is inside the polygon
631
+ element_corners = [
632
+ (element.x0, element.top), # top-left
633
+ (element.x1, element.top), # top-right
634
+ (element.x1, element.bottom), # bottom-right
635
+ (element.x0, element.bottom), # bottom-left
636
+ ]
637
+
638
+ # First check if any element corner is inside the polygon
639
+ if any(self.is_point_inside(x, y) for x, y in element_corners):
640
+ return True
641
+
642
+ # Also check if any polygon corner is inside the element's rectangle
643
+ for x, y in self.polygon:
644
+ if element.x0 <= x <= element.x1 and element.top <= y <= element.bottom:
645
+ return True
646
+
647
+ # Also check if any polygon edge intersects with any rectangle edge
648
+ # This is a simplification - for complex cases, we'd need a full polygon-rectangle
649
+ # intersection algorithm
650
+
651
+ # For now, return True if bounding boxes overlap (approximation for polygon-rectangle case)
652
+ return (
653
+ self.x0 < element.x1
654
+ and self.x1 > element.x0
655
+ and self.top < element.bottom
656
+ and self.bottom > element.top
657
+ )
522
658
 
523
659
  def highlight(
524
660
  self,
@@ -600,6 +736,18 @@ class Region(DirectionalMixin):
600
736
  x1 = int(self.x1 * scale_factor)
601
737
  bottom = int(self.bottom * scale_factor)
602
738
 
739
+ # Ensure coords are valid for cropping (left < right, top < bottom)
740
+ if x0 >= x1:
741
+ logger.warning(
742
+ f"Region {self.bbox} resulted in non-positive width after scaling ({x0} >= {x1}). Cannot create image."
743
+ )
744
+ return None
745
+ if top >= bottom:
746
+ logger.warning(
747
+ f"Region {self.bbox} resulted in non-positive height after scaling ({top} >= {bottom}). Cannot create image."
748
+ )
749
+ return None
750
+
603
751
  # Crop the image to just this region
604
752
  region_image = page_image.crop((x0, top, x1, bottom))
605
753
 
@@ -776,11 +924,6 @@ class Region(DirectionalMixin):
776
924
  debug = kwargs.get("debug", debug or kwargs.get("debug_exclusions", False))
777
925
  logger.debug(f"Region {self.bbox}: extract_text called with kwargs: {kwargs}")
778
926
 
779
- # --- Handle Docling source (priority) --- DEPRECATED or Adapt?
780
- # For now, let's bypass this and always use the standard extraction flow
781
- # based on contained elements to ensure consistency.
782
- # if self.model == 'docling' or hasattr(self, 'text_content'): ...
783
-
784
927
  # 1. Get Word Elements potentially within this region (initial broad phase)
785
928
  # Optimization: Could use spatial query if page elements were indexed
786
929
  page_words = self.page.words # Get all words from the page
@@ -829,7 +972,7 @@ class Region(DirectionalMixin):
829
972
  result = generate_text_layout(
830
973
  char_dicts=filtered_chars,
831
974
  layout_context_bbox=self.bbox, # Use region's bbox for context
832
- user_kwargs=kwargs,
975
+ user_kwargs=kwargs, # Pass original kwargs to layout generator
833
976
  )
834
977
 
835
978
  logger.debug(f"Region {self.bbox}: extract_text finished, result length: {len(result)}.")
@@ -837,40 +980,65 @@ class Region(DirectionalMixin):
837
980
 
838
981
  def extract_table(
839
982
  self,
840
- method: str = None,
841
- table_settings: dict = None,
983
+ method: Optional[str] = None, # Make method optional
984
+ table_settings: Optional[dict] = None, # Use Optional
842
985
  use_ocr: bool = False,
843
- ocr_config: dict = None,
844
- ) -> List[List[str]]:
986
+ ocr_config: Optional[dict] = None, # Use Optional
987
+ text_options: Optional[Dict] = None,
988
+ cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
989
+ # --- NEW: Add tqdm control option --- #
990
+ show_progress: bool = False, # Controls progress bar for text method
991
+ ) -> List[List[Optional[str]]]: # Return type allows Optional[str] for cells
845
992
  """
846
993
  Extract a table from this region.
847
994
 
848
995
  Args:
849
- method: Method to use for extraction ('tatr', 'plumber', or None for auto-detection)
850
- table_settings: Settings for pdfplumber table extraction (used only with 'plumber' method)
851
- use_ocr: Whether to use OCR for text extraction (only applicable with 'tatr' method)
852
- ocr_config: OCR configuration parameters
996
+ method: Method to use: 'tatr', 'plumber', 'text', or None (auto-detect).
997
+ table_settings: Settings for pdfplumber table extraction (used only with 'plumber' method).
998
+ use_ocr: Whether to use OCR for text extraction (currently only applicable with 'tatr' method).
999
+ ocr_config: OCR configuration parameters.
1000
+ text_options: Dictionary of options for the 'text' method, corresponding to arguments
1001
+ of analyze_text_table_structure (e.g., snap_tolerance, expand_bbox).
1002
+ cell_extraction_func: Optional callable function that takes a cell Region object
1003
+ and returns its string content. Overrides default text extraction
1004
+ for the 'text' method.
1005
+ show_progress: If True, display a progress bar during cell text extraction for the 'text' method.
853
1006
 
854
1007
  Returns:
855
- Table data as a list of rows, where each row is a list of cell values
1008
+ Table data as a list of rows, where each row is a list of cell values (str or None).
856
1009
  """
857
1010
  # Default settings if none provided
858
1011
  if table_settings is None:
859
1012
  table_settings = {}
1013
+ if text_options is None:
1014
+ text_options = {} # Initialize empty dict
860
1015
 
861
1016
  # Auto-detect method if not specified
862
- if method is None:
1017
+ effective_method = method
1018
+ if effective_method is None:
863
1019
  # If this is a TATR-detected region, use TATR method
864
1020
  if hasattr(self, "model") and self.model == "tatr" and self.region_type == "table":
865
- method = "tatr"
1021
+ effective_method = "tatr"
866
1022
  else:
867
- method = "plumber"
1023
+ effective_method = "text"
1024
+
1025
+ logger.debug(f"Region {self.bbox}: Extracting table using method '{effective_method}'")
868
1026
 
869
1027
  # Use the selected method
870
- if method == "tatr":
1028
+ if effective_method == "tatr":
871
1029
  return self._extract_table_tatr(use_ocr=use_ocr, ocr_config=ocr_config)
872
- else: # Default to pdfplumber
1030
+ elif effective_method == "text":
1031
+ current_text_options = text_options.copy()
1032
+ current_text_options["cell_extraction_func"] = cell_extraction_func
1033
+ # --- Pass show_progress to the helper --- #
1034
+ current_text_options["show_progress"] = show_progress
1035
+ return self._extract_table_text(**current_text_options)
1036
+ elif effective_method == "plumber":
873
1037
  return self._extract_table_plumber(table_settings)
1038
+ else:
1039
+ raise ValueError(
1040
+ f"Unknown table extraction method: '{effective_method}'. Choose from 'tatr', 'plumber', 'text'."
1041
+ )
874
1042
 
875
1043
  def _extract_table_plumber(self, table_settings: dict) -> List[List[str]]:
876
1044
  """
@@ -1031,46 +1199,273 @@ class Region(DirectionalMixin):
1031
1199
 
1032
1200
  return table_data
1033
1201
 
1034
- def find(self, selector: str, apply_exclusions=True, **kwargs) -> Optional["Element"]:
1202
+ def _extract_table_text(self, **text_options) -> List[List[Optional[str]]]:
1035
1203
  """
1036
- Find the first element in this region matching the selector.
1204
+ Extracts table content based on text alignment analysis.
1037
1205
 
1038
1206
  Args:
1039
- selector: CSS-like selector string
1040
- apply_exclusions: Whether to apply exclusion regions
1041
- **kwargs: Additional parameters for element filtering
1207
+ **text_options: Options passed to analyze_text_table_structure,
1208
+ plus optional 'cell_extraction_func', 'coordinate_grouping_tolerance',
1209
+ and 'show_progress'.
1042
1210
 
1043
1211
  Returns:
1044
- First matching element or None
1212
+ Table data as list of lists of strings (or None for empty cells).
1045
1213
  """
1046
- elements = self.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
1047
- return elements.first if elements else None # Use .first property
1214
+ cell_extraction_func = text_options.pop("cell_extraction_func", None)
1215
+ # --- Get show_progress option --- #
1216
+ show_progress = text_options.pop("show_progress", False)
1217
+
1218
+ # Analyze structure first (or use cached results)
1219
+ if "text_table_structure" in self.analyses:
1220
+ analysis_results = self.analyses["text_table_structure"]
1221
+ logger.debug("Using cached text table structure analysis results.")
1222
+ else:
1223
+ analysis_results = self.analyze_text_table_structure(**text_options)
1224
+
1225
+ if analysis_results is None or not analysis_results.get("cells"):
1226
+ logger.warning(f"Region {self.bbox}: No cells found using 'text' method.")
1227
+ return []
1228
+
1229
+ cell_dicts = analysis_results["cells"]
1230
+
1231
+ # --- Grid Reconstruction Logic --- #
1232
+ if not cell_dicts:
1233
+ return []
1234
+
1235
+ # 1. Get unique sorted top and left coordinates (cell boundaries)
1236
+ coord_tolerance = text_options.get("coordinate_grouping_tolerance", 1)
1237
+ tops = sorted(
1238
+ list(set(round(c["top"] / coord_tolerance) * coord_tolerance for c in cell_dicts))
1239
+ )
1240
+ lefts = sorted(
1241
+ list(set(round(c["left"] / coord_tolerance) * coord_tolerance for c in cell_dicts))
1242
+ )
1243
+
1244
+ # Refine boundaries (cluster_coords helper remains the same)
1245
+ def cluster_coords(coords):
1246
+ if not coords:
1247
+ return []
1248
+ clustered = []
1249
+ current_cluster = [coords[0]]
1250
+ for c in coords[1:]:
1251
+ if abs(c - current_cluster[-1]) <= coord_tolerance:
1252
+ current_cluster.append(c)
1253
+ else:
1254
+ clustered.append(min(current_cluster))
1255
+ current_cluster = [c]
1256
+ clustered.append(min(current_cluster))
1257
+ return clustered
1258
+
1259
+ unique_tops = cluster_coords(tops)
1260
+ unique_lefts = cluster_coords(lefts)
1261
+
1262
+ # --- Setup tqdm --- #
1263
+ tqdm = get_tqdm()
1264
+ # Determine iterable for tqdm
1265
+ cell_iterator = cell_dicts
1266
+ if show_progress:
1267
+ # Only wrap if progress should be shown
1268
+ cell_iterator = tqdm(
1269
+ cell_dicts,
1270
+ desc=f"Extracting text from {len(cell_dicts)} cells (text method)",
1271
+ unit="cell",
1272
+ leave=False, # Optional: Keep bar after completion
1273
+ )
1274
+ # --- End tqdm Setup --- #
1048
1275
 
1276
+ # 2. Create a lookup map for cell text: {(rounded_top, rounded_left): cell_text}
1277
+ cell_text_map = {}
1278
+ # --- Use the potentially wrapped iterator --- #
1279
+ for cell_data in cell_iterator:
1280
+ try:
1281
+ cell_region = self.page.region(**cell_data)
1282
+ cell_value = None # Initialize
1283
+ if callable(cell_extraction_func):
1284
+ try:
1285
+ cell_value = cell_extraction_func(cell_region)
1286
+ if not isinstance(cell_value, (str, type(None))):
1287
+ logger.warning(
1288
+ f"Custom cell_extraction_func returned non-string/None type ({type(cell_value)}) for cell {cell_data}. Treating as None."
1289
+ )
1290
+ cell_value = None
1291
+ except Exception as func_err:
1292
+ logger.error(
1293
+ f"Error executing custom cell_extraction_func for cell {cell_data}: {func_err}",
1294
+ exc_info=True,
1295
+ )
1296
+ cell_value = None
1297
+ else:
1298
+ cell_value = cell_region.extract_text(
1299
+ layout=False, apply_exclusions=False
1300
+ ).strip()
1301
+
1302
+ rounded_top = round(cell_data["top"] / coord_tolerance) * coord_tolerance
1303
+ rounded_left = round(cell_data["left"] / coord_tolerance) * coord_tolerance
1304
+ cell_text_map[(rounded_top, rounded_left)] = cell_value
1305
+ except Exception as e:
1306
+ logger.warning(f"Could not process cell {cell_data} for text extraction: {e}")
1307
+
1308
+ # 3. Build the final list-of-lists table (loop remains the same)
1309
+ final_table = []
1310
+ for row_top in unique_tops:
1311
+ row_data = []
1312
+ for col_left in unique_lefts:
1313
+ best_match_key = None
1314
+ min_dist_sq = float("inf")
1315
+ for map_top, map_left in cell_text_map.keys():
1316
+ if (
1317
+ abs(map_top - row_top) <= coord_tolerance
1318
+ and abs(map_left - col_left) <= coord_tolerance
1319
+ ):
1320
+ dist_sq = (map_top - row_top) ** 2 + (map_left - col_left) ** 2
1321
+ if dist_sq < min_dist_sq:
1322
+ min_dist_sq = dist_sq
1323
+ best_match_key = (map_top, map_left)
1324
+ cell_value = cell_text_map.get(best_match_key)
1325
+ row_data.append(cell_value)
1326
+ final_table.append(row_data)
1327
+
1328
+ return final_table
1329
+
1330
+ # --- END MODIFIED METHOD --- #
1331
+
1332
+ @overload
1333
+ def find(
1334
+ self,
1335
+ *,
1336
+ text: str,
1337
+ apply_exclusions: bool = True,
1338
+ regex: bool = False,
1339
+ case: bool = True,
1340
+ **kwargs,
1341
+ ) -> Optional["Element"]: ...
1342
+
1343
+ @overload
1344
+ def find(
1345
+ self,
1346
+ selector: str,
1347
+ *,
1348
+ apply_exclusions: bool = True,
1349
+ regex: bool = False,
1350
+ case: bool = True,
1351
+ **kwargs,
1352
+ ) -> Optional["Element"]: ...
1353
+
1354
+ def find(
1355
+ self,
1356
+ selector: Optional[str] = None, # Now optional
1357
+ *,
1358
+ text: Optional[str] = None, # New text parameter
1359
+ apply_exclusions: bool = True,
1360
+ regex: bool = False,
1361
+ case: bool = True,
1362
+ **kwargs,
1363
+ ) -> Optional["Element"]:
1364
+ """
1365
+ Find the first element in this region matching the selector OR text content.
1366
+
1367
+ Provide EITHER `selector` OR `text`, but not both.
1368
+
1369
+ Args:
1370
+ selector: CSS-like selector string.
1371
+ text: Text content to search for (equivalent to 'text:contains(...)').
1372
+ apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
1373
+ regex: Whether to use regex for text search (`selector` or `text`) (default: False).
1374
+ case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
1375
+ **kwargs: Additional parameters for element filtering.
1376
+
1377
+ Returns:
1378
+ First matching element or None.
1379
+ """
1380
+ # Delegate validation and selector construction to find_all
1381
+ elements = self.find_all(
1382
+ selector=selector,
1383
+ text=text,
1384
+ apply_exclusions=apply_exclusions,
1385
+ regex=regex,
1386
+ case=case,
1387
+ **kwargs,
1388
+ )
1389
+ return elements.first if elements else None
1390
+
1391
+ @overload
1049
1392
  def find_all(
1050
- self, selector: str, apply_exclusions=True, **kwargs
1051
- ) -> "ElementCollection": # Changed from _find_all
1393
+ self,
1394
+ *,
1395
+ text: str,
1396
+ apply_exclusions: bool = True,
1397
+ regex: bool = False,
1398
+ case: bool = True,
1399
+ **kwargs,
1400
+ ) -> "ElementCollection": ...
1401
+
1402
+ @overload
1403
+ def find_all(
1404
+ self,
1405
+ selector: str,
1406
+ *,
1407
+ apply_exclusions: bool = True,
1408
+ regex: bool = False,
1409
+ case: bool = True,
1410
+ **kwargs,
1411
+ ) -> "ElementCollection": ...
1412
+
1413
+ def find_all(
1414
+ self,
1415
+ selector: Optional[str] = None, # Now optional
1416
+ *,
1417
+ text: Optional[str] = None, # New text parameter
1418
+ apply_exclusions: bool = True,
1419
+ regex: bool = False,
1420
+ case: bool = True,
1421
+ **kwargs,
1422
+ ) -> "ElementCollection":
1052
1423
  """
1053
- Find all elements in this region matching the selector.
1424
+ Find all elements in this region matching the selector OR text content.
1425
+
1426
+ Provide EITHER `selector` OR `text`, but not both.
1054
1427
 
1055
1428
  Args:
1056
- selector: CSS-like selector string
1057
- apply_exclusions: Whether to apply exclusion regions
1058
- **kwargs: Additional parameters for element filtering
1429
+ selector: CSS-like selector string.
1430
+ text: Text content to search for (equivalent to 'text:contains(...)').
1431
+ apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
1432
+ regex: Whether to use regex for text search (`selector` or `text`) (default: False).
1433
+ case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
1434
+ **kwargs: Additional parameters for element filtering.
1059
1435
 
1060
1436
  Returns:
1061
- ElementCollection with matching elements
1437
+ ElementCollection with matching elements.
1062
1438
  """
1063
1439
  from natural_pdf.elements.collections import ElementCollection
1064
1440
 
1441
+ if selector is not None and text is not None:
1442
+ raise ValueError("Provide either 'selector' or 'text', not both.")
1443
+ if selector is None and text is None:
1444
+ raise ValueError("Provide either 'selector' or 'text'.")
1445
+
1446
+ # Construct selector if 'text' is provided
1447
+ effective_selector = ""
1448
+ if text is not None:
1449
+ escaped_text = text.replace('"', '\\"').replace("'", "\\'")
1450
+ effective_selector = f'text:contains("{escaped_text}")'
1451
+ logger.debug(
1452
+ f"Using text shortcut: find_all(text='{text}') -> find_all('{effective_selector}')"
1453
+ )
1454
+ elif selector is not None:
1455
+ effective_selector = selector
1456
+ else:
1457
+ raise ValueError("Internal error: No selector or text provided.")
1458
+
1065
1459
  # If we span multiple pages, filter our elements
1066
1460
  # TODO: Revisit multi-page region logic
1067
1461
  if self._spans_pages and self._multi_page_elements is not None:
1068
1462
  logger.warning("find_all on multi-page regions is not fully implemented.")
1069
1463
  # Temporary: Apply filter directly to cached elements
1070
- from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
1071
-
1072
1464
  try:
1073
- selector_obj = parse_selector(selector)
1465
+ selector_obj = parse_selector(effective_selector)
1466
+ # Pass regex/case flags down
1467
+ kwargs["regex"] = regex
1468
+ kwargs["case"] = case
1074
1469
  filter_func = selector_to_filter_func(selector_obj, **kwargs)
1075
1470
  matching = [el for el in self._multi_page_elements if filter_func(el)]
1076
1471
  return ElementCollection(matching)
@@ -1078,17 +1473,46 @@ class Region(DirectionalMixin):
1078
1473
  logger.error(f"Error applying selector to multi-page region elements: {e}")
1079
1474
  return ElementCollection([])
1080
1475
 
1081
- # Otherwise, get elements from the page and filter by selector and region
1082
- page_elements = self.page.find_all(selector, apply_exclusions=apply_exclusions, **kwargs)
1083
- # Use the precise _is_element_in_region check
1084
- filtered_elements = [e for e in page_elements if self._is_element_in_region(e)]
1085
- return ElementCollection(filtered_elements)
1476
+ # Normal case: Region is on a single page
1477
+ try:
1478
+ # Parse the final selector string
1479
+ selector_obj = parse_selector(effective_selector)
1480
+
1481
+ # Get all potentially relevant elements from the page
1482
+ # Let the page handle its exclusion logic if needed
1483
+ potential_elements = self.page.find_all(
1484
+ selector=effective_selector,
1485
+ apply_exclusions=False, # Apply exclusions LATER based on region bbox
1486
+ regex=regex,
1487
+ case=case,
1488
+ **kwargs,
1489
+ )
1086
1490
 
1087
- def apply_ocr(self, **ocr_params) -> "Region":
1491
+ # Filter these elements to those strictly within the region's bounds
1492
+ region_bbox = self.bbox
1493
+ matching_elements = [
1494
+ el
1495
+ for el in potential_elements
1496
+ if el.x0 >= region_bbox[0]
1497
+ and el.top >= region_bbox[1]
1498
+ and el.x1 <= region_bbox[2]
1499
+ and el.bottom <= region_bbox[3]
1500
+ ]
1501
+
1502
+ return ElementCollection(matching_elements)
1503
+
1504
+ except Exception as e:
1505
+ logger.error(f"Error during find_all in region: {e}", exc_info=True)
1506
+ return ElementCollection([])
1507
+
1508
+ def apply_ocr(self, replace=True, **ocr_params) -> "Region":
1088
1509
  """
1089
1510
  Apply OCR to this region and return the created text elements.
1090
1511
 
1091
1512
  Args:
1513
+ replace: If True (default), removes existing OCR elements in the region
1514
+ before adding new ones. If False, adds new OCR elements without
1515
+ removing existing ones.
1092
1516
  **ocr_params: Keyword arguments passed to the OCR Manager.
1093
1517
  Common parameters like `engine`, `languages`, `min_confidence`,
1094
1518
  `device`, and `resolution` (for image rendering) should be
@@ -1098,12 +1522,32 @@ class Region(DirectionalMixin):
1098
1522
  an `options` object (e.g., `options=EasyOCROptions(...)`).
1099
1523
 
1100
1524
  Returns:
1101
- List of created TextElement objects representing OCR words/lines.
1525
+ Self for method chaining.
1102
1526
  """
1103
1527
  # Ensure OCRManager is available
1104
1528
  if not hasattr(self.page._parent, "_ocr_manager") or self.page._parent._ocr_manager is None:
1105
1529
  logger.error("OCRManager not available on parent PDF. Cannot apply OCR to region.")
1106
- return []
1530
+ return self
1531
+
1532
+ # If replace is True, find and remove existing OCR elements in this region
1533
+ if replace:
1534
+ logger.info(
1535
+ f"Region {self.bbox}: Removing existing OCR elements before applying new OCR."
1536
+ )
1537
+ # Find all OCR elements in this region
1538
+ ocr_selector = "text[source=ocr]"
1539
+ ocr_elements = self.find_all(ocr_selector)
1540
+
1541
+ if ocr_elements:
1542
+ logger.info(
1543
+ f"Region {self.bbox}: Found {len(ocr_elements)} existing OCR elements to remove."
1544
+ )
1545
+ # Remove these elements from their page
1546
+ removed_count = ocr_elements.remove()
1547
+ logger.info(f"Region {self.bbox}: Removed {removed_count} OCR elements.")
1548
+ else:
1549
+ logger.info(f"Region {self.bbox}: No existing OCR elements found to remove.")
1550
+
1107
1551
  ocr_mgr = self.page._parent._ocr_manager
1108
1552
 
1109
1553
  # Determine rendering resolution from parameters
@@ -1123,11 +1567,11 @@ class Region(DirectionalMixin):
1123
1567
  )
1124
1568
  if not region_image:
1125
1569
  logger.error("Failed to render region to image for OCR.")
1126
- return []
1570
+ return self
1127
1571
  logger.debug(f"Region rendered to image size: {region_image.size}")
1128
1572
  except Exception as e:
1129
1573
  logger.error(f"Error rendering region to image for OCR: {e}", exc_info=True)
1130
- return []
1574
+ return self
1131
1575
 
1132
1576
  # Prepare args for the OCR Manager
1133
1577
  manager_args = {
@@ -1148,11 +1592,11 @@ class Region(DirectionalMixin):
1148
1592
  logger.error(
1149
1593
  f"OCRManager returned unexpected type for single region image: {type(results)}"
1150
1594
  )
1151
- return []
1595
+ return self
1152
1596
  logger.debug(f"Region OCR processing returned {len(results)} results.")
1153
1597
  except Exception as e:
1154
1598
  logger.error(f"Error during OCRManager processing for region: {e}", exc_info=True)
1155
- return []
1599
+ return self
1156
1600
 
1157
1601
  # Convert results to TextElements
1158
1602
  scale_x = self.width / region_image.width if region_image.width > 0 else 1.0
@@ -1621,8 +2065,6 @@ class Region(DirectionalMixin):
1621
2065
  return self.child_regions
1622
2066
 
1623
2067
  # Use existing selector parser to filter
1624
- from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
1625
-
1626
2068
  try:
1627
2069
  selector_obj = parse_selector(selector)
1628
2070
  filter_func = selector_to_filter_func(selector_obj) # Removed region=self
@@ -1663,8 +2105,6 @@ class Region(DirectionalMixin):
1663
2105
 
1664
2106
  # Filter by selector if provided
1665
2107
  if selector is not None:
1666
- from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
1667
-
1668
2108
  try:
1669
2109
  selector_obj = parse_selector(selector)
1670
2110
  filter_func = selector_to_filter_func(selector_obj) # Removed region=self
@@ -1677,11 +2117,6 @@ class Region(DirectionalMixin):
1677
2117
 
1678
2118
  return all_descendants
1679
2119
 
1680
- # Removed recursive=True, find_all on region shouldn't be recursive by default
1681
- # Renamed _find_all back to find_all
1682
- # def find_all(self, selector, apply_exclusions=True, **kwargs):
1683
- # See implementation above near get_elements
1684
-
1685
2120
  def __repr__(self) -> str:
1686
2121
  """String representation of the region."""
1687
2122
  poly_info = " (Polygon)" if self.has_polygon else ""
@@ -1719,7 +2154,7 @@ class Region(DirectionalMixin):
1719
2154
  """
1720
2155
  # Find OCR elements specifically within this region
1721
2156
  # Note: We typically want to correct even if the element falls in an excluded area
1722
- target_elements = self.find_all(selector="text[source^=ocr]", apply_exclusions=False)
2157
+ target_elements = self.find_all(selector="text[source=ocr]", apply_exclusions=False)
1723
2158
 
1724
2159
  # Delegate to the utility function
1725
2160
  _apply_ocr_correction_to_elements(
@@ -1729,3 +2164,221 @@ class Region(DirectionalMixin):
1729
2164
  )
1730
2165
 
1731
2166
  return self # Return self for chaining
2167
+
2168
+ # --- Classification Mixin Implementation --- #
2169
def _get_classification_manager(self) -> "ClassificationManager":
    """
    Return the classification manager registered on the parent PDF.

    The manager is obtained by calling
    ``self.page.pdf.get_manager("classification")``.

    Returns:
        Whatever ``get_manager("classification")`` returns.

    Raises:
        AttributeError: If ``self.page``, ``self.page.pdf`` or the PDF's
            ``get_manager`` attribute is missing, or if ``get_manager``
            raises ValueError, RuntimeError or AttributeError (the original
            error is chained as the cause).
    """
    accessor_reachable = (
        hasattr(self, "page")
        and hasattr(self.page, "pdf")
        and hasattr(self.page.pdf, "get_manager")
    )
    if not accessor_reachable:
        raise AttributeError(
            "ClassificationManager cannot be accessed: Parent Page, PDF, or get_manager method missing."
        )

    try:
        manager = self.page.pdf.get_manager("classification")
    except (ValueError, RuntimeError, AttributeError) as err:
        # Surface every lookup failure as one exception type for callers.
        raise AttributeError(
            f"Failed to get ClassificationManager from PDF via Page: {err}"
        ) from err
    return manager
2186
+
2187
def _get_classification_content(
    self, model_type: str, **kwargs
) -> Union[str, "Image"]:  # "Image" stays a string so the image type is not imported here
    """
    Build the input handed to a classifier for this region.

    Args:
        model_type: ``"text"`` to classify the region's extracted text, or
            ``"vision"`` to classify a rendered image of the region.
        **kwargs: Optional settings. Only ``resolution`` is read, and only
            for vision models; it defaults to 150.

    Returns:
        The extracted text for ``"text"`` models, or the rendered image for
        ``"vision"`` models.

    Raises:
        ValueError: If the extracted text is empty or whitespace-only
            (text models), if rendering returns None (vision models), or if
            ``model_type`` is neither ``"text"`` nor ``"vision"``.
        AttributeError: Propagated from ``_get_classification_manager`` when
            the manager cannot be reached (vision models only).
    """
    if model_type == "text":
        # layout=False: a simple join of the text is enough for classification.
        text_content = self.extract_text(layout=False)
        if not text_content or text_content.isspace():
            raise ValueError("Cannot classify region with 'text' model: No text content found.")
        return text_content

    if model_type == "vision":
        # Called only for its availability check; the manager does not store
        # a default resolution, so the return value is not needed here.
        self._get_classification_manager()

        # classify() forwards a user-specified resolution through **kwargs.
        resolution = kwargs.get("resolution", 150)

        img = self.to_image(
            resolution=resolution,
            include_highlights=False,  # No highlights for classification input
            crop_only=True,  # Just the region content
        )
        if img is None:
            raise ValueError(
                "Cannot classify region with 'vision' model: Failed to render image."
            )
        return img

    raise ValueError(f"Unsupported model_type for classification: {model_type}")
2219
+
2220
def _get_metadata_storage(self) -> Dict[str, Any]:
    """
    Return the object stored in ``self.metadata``.

    If the ``metadata`` attribute is missing or is None, it is first set to
    a new empty dict, and that dict is returned.
    """
    if getattr(self, "metadata", None) is None:
        self.metadata = {}
    return self.metadata
2225
+
2226
+ # --- End Classification Mixin Implementation --- #
2227
+
2228
+ # --- NEW METHOD: analyze_text_table_structure ---
2229
def analyze_text_table_structure(
    self,
    snap_tolerance: int = 10,
    join_tolerance: int = 3,
    min_words_vertical: int = 3,
    min_words_horizontal: int = 1,
    intersection_tolerance: int = 3,
    expand_bbox: Optional[Dict[str, int]] = None,
    **kwargs,
) -> Optional[Dict]:
    """
    Look for table structure in the text elements of this region.

    The bounding boxes of the text elements found in the region (or in an
    expanded copy of it) are passed to ``find_text_based_tables``. A
    successful result is stored in ``self.analyses["text_table_structure"]``.
    The empty result returned when there is nothing to analyse is not stored.

    Args:
        snap_tolerance: Tolerance for snapping parallel lines.
        join_tolerance: Tolerance for joining collinear lines.
        min_words_vertical: Minimum words needed to define a vertical line.
        min_words_horizontal: Minimum words needed to define a horizontal line.
        intersection_tolerance: Tolerance for detecting line intersections.
        expand_bbox: Optional non-empty dict passed as keyword arguments to
            ``self.expand`` to widen the search area
            (e.g., {'left': 5, 'right': 5}). If expanding fails, a warning
            is logged and the original region is searched.
        **kwargs: Additional keyword arguments passed to
            find_text_based_tables (e.g., specific x/y tolerances).

    Returns:
        The value returned by ``find_text_based_tables``; a dictionary with
        empty 'horizontal_edges', 'vertical_edges', 'cells' and
        'intersections' when no text elements or bounding boxes are found;
        or None if the analysis raises (including ImportError).
    """

    def _empty_structure() -> Dict:
        # A new dict per call so callers never share mutable state.
        return {"horizontal_edges": [], "vertical_edges": [], "cells": [], "intersections": {}}

    # Decide where to look: the region itself unless expansion succeeds.
    search_region = self
    if expand_bbox and isinstance(expand_bbox, dict):
        try:
            expanded = self.expand(**expand_bbox)
            logger.debug(
                f"Expanded search region for text table analysis to: {expanded.bbox}"
            )
            search_region = expanded
        except Exception as exc:
            logger.warning(f"Could not expand region bbox: {exc}. Using original region.")

    # Exclusions are deliberately not applied: all text is considered.
    text_elements = search_region.find_all("text", apply_exclusions=False)
    if not text_elements:
        logger.info(f"Region {self.bbox}: No text elements found for text table analysis.")
        return _empty_structure()

    bboxes = [item.bbox for item in text_elements if hasattr(item, "bbox")]
    if not bboxes:
        logger.info(f"Region {self.bbox}: No bboxes extracted from text elements.")
        return _empty_structure()

    tolerances = {
        "snap_tolerance": snap_tolerance,
        "join_tolerance": join_tolerance,
        "min_words_vertical": min_words_vertical,
        "min_words_horizontal": min_words_horizontal,
        "intersection_tolerance": intersection_tolerance,
    }
    try:
        analysis_results = find_text_based_tables(bboxes=bboxes, **tolerances, **kwargs)
        # Cached inside the try so a failure to store is also reported as None.
        self.analyses["text_table_structure"] = analysis_results
        return analysis_results
    except ImportError:
        logger.error("pdfplumber library is required for 'text' table analysis but not found.")
        return None
    except Exception as exc:
        logger.error(f"Error during text-based table analysis: {exc}", exc_info=True)
        return None
2306
+
2307
+ # --- END NEW METHOD ---
2308
+
2309
+ # --- NEW METHOD: get_text_table_cells ---
2310
def get_text_table_cells(
    self,
    snap_tolerance: int = 10,
    join_tolerance: int = 3,
    min_words_vertical: int = 3,
    min_words_horizontal: int = 1,
    intersection_tolerance: int = 3,
    expand_bbox: Optional[Dict[str, int]] = None,
    **kwargs,
) -> "ElementCollection[Region]":
    """
    Return the table cells found by text-alignment analysis as Region objects.

    The cell regions are created with ``self.page.region`` and tagged as
    volatile; this method does not add them to the page itself.

    If ``self.analyses`` already holds a "text_table_structure" entry, that
    entry is reused as-is and the arguments below are not consulted.
    Otherwise ``analyze_text_table_structure`` is called with them.

    Args:
        snap_tolerance: Tolerance for snapping parallel lines.
        join_tolerance: Tolerance for joining collinear lines.
        min_words_vertical: Minimum words needed to define a vertical line.
        min_words_horizontal: Minimum words needed to define a horizontal line.
        intersection_tolerance: Tolerance for detecting line intersections.
        expand_bbox: Optional dictionary to expand the search area slightly beyond
            the region's exact bounds (e.g., {'left': 5, 'right': 5}).
        **kwargs: Additional keyword arguments forwarded to
            analyze_text_table_structure.

    Returns:
        An ElementCollection with one Region per detected cell, or an empty
        ElementCollection if the analysis failed or found no cells. Cells
        whose Region cannot be created are skipped with a warning.
    """
    from natural_pdf.elements.collections import ElementCollection

    # 1. Obtain analysis results, preferring what is already cached
    if "text_table_structure" not in self.analyses:
        analysis_results = self.analyze_text_table_structure(
            snap_tolerance=snap_tolerance,
            join_tolerance=join_tolerance,
            min_words_vertical=min_words_vertical,
            min_words_horizontal=min_words_horizontal,
            intersection_tolerance=intersection_tolerance,
            expand_bbox=expand_bbox,
            **kwargs,
        )
    else:
        analysis_results = self.analyses["text_table_structure"]
        logger.debug("get_text_table_cells: Using cached analysis results.")

    # 2. Bail out when the analysis failed or produced no cells
    cells = None if analysis_results is None else analysis_results.get("cells")
    if not cells:
        logger.info(f"Region {self.bbox}: No cells found by text table analysis.")
        return ElementCollection([])

    # 3. Turn each cell dictionary into a temporary Region
    cell_tags = (
        ("region_type", "table-cell"),
        ("normalized_type", "table-cell"),
        ("model", "pdfplumber-text"),
        ("source", "volatile"),  # Indicate it's not managed/persistent
        ("parent_region", self),  # Link back to the region it came from
    )
    cell_regions = []
    for cell in cells:
        try:
            # The cell dictionary is unpacked as keyword arguments to page.region
            new_region = self.page.region(**cell)
            for attr_name, attr_value in cell_tags:
                setattr(new_region, attr_name, attr_value)
            cell_regions.append(new_region)
        except Exception as exc:
            logger.warning(f"Could not create Region object for cell data {cell}: {exc}")

    # 4. Wrap the regions in a collection
    logger.debug(f"get_text_table_cells: Created {len(cell_regions)} temporary cell regions.")
    return ElementCollection(cell_regions)
2383
+
2384
+ # --- END NEW METHOD ---