natural-pdf 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +1 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +241 -158
- natural_pdf/classification/mixin.py +52 -38
- natural_pdf/classification/results.py +71 -45
- natural_pdf/collections/mixins.py +85 -20
- natural_pdf/collections/pdf_collection.py +245 -100
- natural_pdf/core/element_manager.py +30 -14
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +423 -101
- natural_pdf/core/pdf.py +694 -195
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +610 -134
- natural_pdf/elements/region.py +659 -90
- natural_pdf/elements/text.py +1 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +4 -3
- natural_pdf/extraction/manager.py +50 -49
- natural_pdf/extraction/mixin.py +90 -57
- natural_pdf/extraction/result.py +9 -23
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +61 -25
- natural_pdf/ocr/ocr_options.py +70 -10
- natural_pdf/ocr/utils.py +6 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +219 -143
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +1 -1
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +24 -16
- natural_pdf/utils/tqdm_utils.py +18 -10
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/METADATA +12 -3
- natural_pdf-0.1.10.dist-info/RECORD +80 -0
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/categorizing-documents/index.md +0 -168
- docs/data-extraction/index.md +0 -87
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -969
- docs/element-selection/index.md +0 -249
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -189
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -256
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -417
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -152
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -119
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -275
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -337
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -293
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -414
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -513
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2439
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -517
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -3712
- docs/tutorials/12-ocr-integration.md +0 -137
- docs/tutorials/13-semantic-search.ipynb +0 -1718
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.8.dist-info/RECORD +0 -156
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/licenses/LICENSE +0 -0
natural_pdf/elements/region.py
CHANGED
@@ -1,28 +1,37 @@
|
|
1
1
|
import logging
|
2
|
-
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
|
2
|
+
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union, overload
|
3
3
|
|
4
4
|
from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
|
5
5
|
|
6
6
|
# New Imports
|
7
7
|
from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
|
8
8
|
|
9
|
+
from natural_pdf.analyzers.layout.pdfplumber_table_finder import find_text_based_tables
|
10
|
+
from natural_pdf.classification.manager import ClassificationManager # Keep for type hint
|
11
|
+
|
12
|
+
# --- Classification Imports --- #
|
13
|
+
from natural_pdf.classification.mixin import ClassificationMixin
|
9
14
|
from natural_pdf.elements.base import DirectionalMixin
|
15
|
+
from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
|
16
|
+
from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
|
17
|
+
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
18
|
+
from natural_pdf.utils.locks import pdf_render_lock # Import the lock
|
10
19
|
|
11
20
|
# Import new utils
|
12
21
|
from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
|
13
22
|
|
14
|
-
|
23
|
+
# --- NEW: Import tqdm utility --- #
|
24
|
+
from natural_pdf.utils.tqdm_utils import get_tqdm
|
15
25
|
|
16
|
-
# --- Classification Imports --- #
|
17
|
-
from natural_pdf.classification.mixin import ClassificationMixin
|
18
|
-
from natural_pdf.classification.manager import ClassificationManager # Keep for type hint
|
19
26
|
# --- End Classification Imports --- #
|
20
27
|
|
21
|
-
from natural_pdf.utils.locks import pdf_render_lock # Import the lock
|
22
|
-
from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
|
23
28
|
|
24
29
|
if TYPE_CHECKING:
|
30
|
+
# --- NEW: Add Image type hint for classification --- #
|
31
|
+
from PIL.Image import Image
|
32
|
+
|
25
33
|
from natural_pdf.core.page import Page
|
34
|
+
from natural_pdf.elements.collections import ElementCollection
|
26
35
|
from natural_pdf.elements.text import TextElement
|
27
36
|
|
28
37
|
# Import OCRManager conditionally to avoid circular imports
|
@@ -68,7 +77,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
68
77
|
# --- ADDED --- Metadata store for mixins
|
69
78
|
self.metadata: Dict[str, Any] = {}
|
70
79
|
# --- NEW --- Central registry for analysis results
|
71
|
-
self.analyses: Dict[str, Any] = {}
|
80
|
+
self.analyses: Dict[str, Any] = {}
|
72
81
|
# --- END ADDED ---
|
73
82
|
|
74
83
|
# Standard attributes for all elements
|
@@ -504,9 +513,37 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
504
513
|
|
505
514
|
return inside
|
506
515
|
|
516
|
+
def is_element_center_inside(self, element: "Element") -> bool:
    """
    Report whether the midpoint of an element's bounding box lies inside this region.

    Args:
        element: Element whose bounding-box center is tested.

    Returns:
        True when the element is on this region's page, exposes ``x0``/``x1``/
        ``top``/``bottom``, and ``is_point_inside`` accepts its center point.
        False in every other case.
    """
    # Elements without a page, or on a different page, can never match.
    on_other_page = not hasattr(element, "page") or element.page != self._page
    if on_other_page:
        return False

    # Without a full bounding box there is no center to compute.
    if any(not hasattr(element, name) for name in ["x0", "x1", "top", "bottom"]):
        logger.warning(
            f"Element {element} lacks bounding box attributes. Cannot check center point."
        )
        return False

    mid_x = (element.x0 + element.x1) / 2
    mid_y = (element.top + element.bottom) / 2

    # The point test is delegated to is_point_inside.
    return self.is_point_inside(mid_x, mid_y)
|
543
|
+
|
507
544
|
def _is_element_in_region(self, element: "Element", use_boundary_tolerance=True) -> bool:
|
508
545
|
"""
|
509
|
-
Check if an element is within this region.
|
546
|
+
Check if an element intersects or is contained within this region.
|
510
547
|
|
511
548
|
Args:
|
512
549
|
element: Element to check
|
@@ -523,16 +560,101 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
523
560
|
if not hasattr(element, "page") or element.page != self._page:
|
524
561
|
return False
|
525
562
|
|
526
|
-
|
563
|
+
return self.is_element_center_inside(element)
|
564
|
+
# return self.intersects(element)
|
565
|
+
|
566
|
+
def contains(self, element: "Element") -> bool:
    """
    Tell whether an element lies entirely within this region.

    Args:
        element: Element to test.

    Returns:
        True if the element is on this region's page and fully enclosed by it;
        False otherwise, including when the element has no ``page`` or lacks
        any of ``x0``/``x1``/``top``/``bottom``.
    """
    # Different page (or no page at all): nothing to compare.
    if not hasattr(element, "page") or element.page != self._page:
        return False

    # A complete bounding box is required to decide containment.
    for name in ["x0", "x1", "top", "bottom"]:
        if not hasattr(element, name):
            return False

    if self.has_polygon:
        # Polygon region: every corner of the element's box must pass the point test.
        corners = [
            (element.x0, element.top),  # top-left
            (element.x1, element.top),  # top-right
            (element.x1, element.bottom),  # bottom-right
            (element.x0, element.bottom),  # bottom-left
        ]
        for corner_x, corner_y in corners:
            if not self.is_point_inside(corner_x, corner_y):
                return False
        return True

    # Rectangular region: the element's box must sit inside the region's box
    # (shared edges count as contained).
    return (
        self.x0 <= element.x0
        and element.x1 <= self.x1
        and self.top <= element.top
        and element.bottom <= self.bottom
    )
|
602
|
+
|
603
|
+
def intersects(self, element: "Element") -> bool:
    """
    Check if this region intersects with an element (any overlap).

    Rectangular regions use a strict bounding-box overlap test, so boxes that
    only share an edge do not intersect. Polygon regions are tested against the
    element's rectangle in three steps: a rectangle corner inside the polygon,
    a polygon vertex inside the rectangle, or a polygon edge properly crossing
    a rectangle edge.

    Args:
        element: Element to check

    Returns:
        True if the element overlaps with the region at all, False otherwise
        (including when the element is on another page or has no bounding box).
    """
    # Check if element is on the same page
    if not hasattr(element, "page") or element.page != self._page:
        return False

    # Ensure element has necessary attributes
    if not all(hasattr(element, attr) for attr in ("x0", "x1", "top", "bottom")):
        return False  # Cannot determine position

    # For rectangular regions, check for bbox overlap
    if not self.has_polygon:
        return (
            self.x0 < element.x1
            and self.x1 > element.x0
            and self.top < element.bottom
            and self.bottom > element.top
        )

    element_corners = [
        (element.x0, element.top),  # top-left
        (element.x1, element.top),  # top-right
        (element.x1, element.bottom),  # bottom-right
        (element.x0, element.bottom),  # bottom-left
    ]

    # 1. Any element corner inside the polygon
    if any(self.is_point_inside(x, y) for x, y in element_corners):
        return True

    # 2. Any polygon vertex inside the element's rectangle
    polygon = list(self.polygon)
    for x, y in polygon:
        if element.x0 <= x <= element.x1 and element.top <= y <= element.bottom:
            return True

    # 3. Any polygon edge crossing any rectangle edge. This replaces the former
    #    bounding-box approximation, which reported an intersection whenever the
    #    element sat in an empty part of the polygon's bounding box.
    def _orientation(a, b, c):
        # Cross product of (b - a) and (c - a): its sign says which side of a->b c is on.
        return (b[0] - a[0]) * (c[1] - a[1]) - (b[1] - a[1]) * (c[0] - a[0])

    def _properly_cross(p1, p2, q1, q2):
        # Two segments cross when each one's endpoints straddle the other's line.
        # Mere touching or collinear overlap is not counted as a crossing.
        return (
            _orientation(q1, q2, p1) * _orientation(q1, q2, p2) < 0
            and _orientation(p1, p2, q1) * _orientation(p1, p2, q2) < 0
        )

    # Pair every vertex with its successor; the last one wraps to the first.
    rect_edges = list(zip(element_corners, element_corners[1:] + element_corners[:1]))
    polygon_edges = zip(polygon, polygon[1:] + polygon[:1])

    return any(
        _properly_cross(poly_start, poly_end, rect_start, rect_end)
        for poly_start, poly_end in polygon_edges
        for rect_start, rect_end in rect_edges
    )
|
536
658
|
|
537
659
|
def highlight(
|
538
660
|
self,
|
@@ -616,15 +738,15 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
616
738
|
|
617
739
|
# Ensure coords are valid for cropping (left < right, top < bottom)
|
618
740
|
if x0 >= x1:
|
619
|
-
|
620
|
-
|
621
|
-
|
622
|
-
|
741
|
+
logger.warning(
|
742
|
+
f"Region {self.bbox} resulted in non-positive width after scaling ({x0} >= {x1}). Cannot create image."
|
743
|
+
)
|
744
|
+
return None
|
623
745
|
if top >= bottom:
|
624
|
-
|
625
|
-
|
626
|
-
|
627
|
-
|
746
|
+
logger.warning(
|
747
|
+
f"Region {self.bbox} resulted in non-positive height after scaling ({top} >= {bottom}). Cannot create image."
|
748
|
+
)
|
749
|
+
return None
|
628
750
|
|
629
751
|
# Crop the image to just this region
|
630
752
|
region_image = page_image.crop((x0, top, x1, bottom))
|
@@ -850,7 +972,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
850
972
|
result = generate_text_layout(
|
851
973
|
char_dicts=filtered_chars,
|
852
974
|
layout_context_bbox=self.bbox, # Use region's bbox for context
|
853
|
-
user_kwargs=kwargs,
|
975
|
+
user_kwargs=kwargs, # Pass original kwargs to layout generator
|
854
976
|
)
|
855
977
|
|
856
978
|
logger.debug(f"Region {self.bbox}: extract_text finished, result length: {len(result)}.")
|
@@ -858,40 +980,65 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
858
980
|
|
859
981
|
def extract_table(
    self,
    method: Optional[str] = None,
    table_settings: Optional[dict] = None,
    use_ocr: bool = False,
    ocr_config: Optional[dict] = None,
    text_options: Optional[Dict] = None,
    cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
    show_progress: bool = False,
) -> List[List[Optional[str]]]:
    """
    Pull a table out of this region with the chosen extraction strategy.

    Args:
        method: 'tatr', 'plumber' or 'text'. When None, 'tatr' is picked for a
            region whose ``model`` is "tatr" and ``region_type`` is "table";
            every other region falls back to 'text'.
        table_settings: Settings forwarded to the 'plumber' extractor.
        use_ocr: Forwarded to the 'tatr' extractor.
        ocr_config: OCR configuration forwarded to the 'tatr' extractor.
        text_options: Keyword options forwarded to the 'text' extractor. The
            caller's dict is copied, never modified.
        cell_extraction_func: Optional callable taking a cell Region and returning
            its string content; forwarded to the 'text' extractor.
        show_progress: Forwarded to the 'text' extractor to toggle its progress bar.

    Returns:
        Table data as a list of rows, each row a list of cell values (str or None).

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    table_settings = {} if table_settings is None else table_settings
    text_options = {} if text_options is None else text_options

    # Resolve the strategy: an explicit choice wins, otherwise auto-detect.
    if method is not None:
        effective_method = method
    elif hasattr(self, "model") and self.model == "tatr" and self.region_type == "table":
        effective_method = "tatr"
    else:
        effective_method = "text"

    logger.debug(f"Region {self.bbox}: Extracting table using method '{effective_method}'")

    if effective_method == "tatr":
        return self._extract_table_tatr(use_ocr=use_ocr, ocr_config=ocr_config)

    if effective_method == "plumber":
        return self._extract_table_plumber(table_settings)

    if effective_method == "text":
        # Merge into a fresh dict so the caller's text_options stays untouched.
        helper_options = {
            **text_options,
            "cell_extraction_func": cell_extraction_func,
            "show_progress": show_progress,
        }
        return self._extract_table_text(**helper_options)

    raise ValueError(
        f"Unknown table extraction method: '{effective_method}'. Choose from 'tatr', 'plumber', 'text'."
    )
|
895
1042
|
|
896
1043
|
def _extract_table_plumber(self, table_settings: dict) -> List[List[str]]:
|
897
1044
|
"""
|
@@ -1052,46 +1199,273 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1052
1199
|
|
1053
1200
|
return table_data
|
1054
1201
|
|
1055
|
-
def
|
1202
|
+
def _extract_table_text(self, **text_options) -> List[List[Optional[str]]]:
|
1056
1203
|
"""
|
1057
|
-
|
1204
|
+
Extracts table content based on text alignment analysis.
|
1058
1205
|
|
1059
1206
|
Args:
|
1060
|
-
|
1061
|
-
|
1062
|
-
|
1207
|
+
**text_options: Options passed to analyze_text_table_structure,
|
1208
|
+
plus optional 'cell_extraction_func', 'coordinate_grouping_tolerance',
|
1209
|
+
and 'show_progress'.
|
1210
|
+
|
1211
|
+
Returns:
|
1212
|
+
Table data as list of lists of strings (or None for empty cells).
|
1213
|
+
"""
|
1214
|
+
cell_extraction_func = text_options.pop("cell_extraction_func", None)
|
1215
|
+
# --- Get show_progress option --- #
|
1216
|
+
show_progress = text_options.pop("show_progress", False)
|
1217
|
+
|
1218
|
+
# Analyze structure first (or use cached results)
|
1219
|
+
if "text_table_structure" in self.analyses:
|
1220
|
+
analysis_results = self.analyses["text_table_structure"]
|
1221
|
+
logger.debug("Using cached text table structure analysis results.")
|
1222
|
+
else:
|
1223
|
+
analysis_results = self.analyze_text_table_structure(**text_options)
|
1224
|
+
|
1225
|
+
if analysis_results is None or not analysis_results.get("cells"):
|
1226
|
+
logger.warning(f"Region {self.bbox}: No cells found using 'text' method.")
|
1227
|
+
return []
|
1228
|
+
|
1229
|
+
cell_dicts = analysis_results["cells"]
|
1230
|
+
|
1231
|
+
# --- Grid Reconstruction Logic --- #
|
1232
|
+
if not cell_dicts:
|
1233
|
+
return []
|
1234
|
+
|
1235
|
+
# 1. Get unique sorted top and left coordinates (cell boundaries)
|
1236
|
+
coord_tolerance = text_options.get("coordinate_grouping_tolerance", 1)
|
1237
|
+
tops = sorted(
|
1238
|
+
list(set(round(c["top"] / coord_tolerance) * coord_tolerance for c in cell_dicts))
|
1239
|
+
)
|
1240
|
+
lefts = sorted(
|
1241
|
+
list(set(round(c["left"] / coord_tolerance) * coord_tolerance for c in cell_dicts))
|
1242
|
+
)
|
1243
|
+
|
1244
|
+
# Refine boundaries (cluster_coords helper remains the same)
|
1245
|
+
def cluster_coords(coords):
|
1246
|
+
if not coords:
|
1247
|
+
return []
|
1248
|
+
clustered = []
|
1249
|
+
current_cluster = [coords[0]]
|
1250
|
+
for c in coords[1:]:
|
1251
|
+
if abs(c - current_cluster[-1]) <= coord_tolerance:
|
1252
|
+
current_cluster.append(c)
|
1253
|
+
else:
|
1254
|
+
clustered.append(min(current_cluster))
|
1255
|
+
current_cluster = [c]
|
1256
|
+
clustered.append(min(current_cluster))
|
1257
|
+
return clustered
|
1258
|
+
|
1259
|
+
unique_tops = cluster_coords(tops)
|
1260
|
+
unique_lefts = cluster_coords(lefts)
|
1261
|
+
|
1262
|
+
# --- Setup tqdm --- #
|
1263
|
+
tqdm = get_tqdm()
|
1264
|
+
# Determine iterable for tqdm
|
1265
|
+
cell_iterator = cell_dicts
|
1266
|
+
if show_progress:
|
1267
|
+
# Only wrap if progress should be shown
|
1268
|
+
cell_iterator = tqdm(
|
1269
|
+
cell_dicts,
|
1270
|
+
desc=f"Extracting text from {len(cell_dicts)} cells (text method)",
|
1271
|
+
unit="cell",
|
1272
|
+
leave=False, # Optional: Keep bar after completion
|
1273
|
+
)
|
1274
|
+
# --- End tqdm Setup --- #
|
1275
|
+
|
1276
|
+
# 2. Create a lookup map for cell text: {(rounded_top, rounded_left): cell_text}
|
1277
|
+
cell_text_map = {}
|
1278
|
+
# --- Use the potentially wrapped iterator --- #
|
1279
|
+
for cell_data in cell_iterator:
|
1280
|
+
try:
|
1281
|
+
cell_region = self.page.region(**cell_data)
|
1282
|
+
cell_value = None # Initialize
|
1283
|
+
if callable(cell_extraction_func):
|
1284
|
+
try:
|
1285
|
+
cell_value = cell_extraction_func(cell_region)
|
1286
|
+
if not isinstance(cell_value, (str, type(None))):
|
1287
|
+
logger.warning(
|
1288
|
+
f"Custom cell_extraction_func returned non-string/None type ({type(cell_value)}) for cell {cell_data}. Treating as None."
|
1289
|
+
)
|
1290
|
+
cell_value = None
|
1291
|
+
except Exception as func_err:
|
1292
|
+
logger.error(
|
1293
|
+
f"Error executing custom cell_extraction_func for cell {cell_data}: {func_err}",
|
1294
|
+
exc_info=True,
|
1295
|
+
)
|
1296
|
+
cell_value = None
|
1297
|
+
else:
|
1298
|
+
cell_value = cell_region.extract_text(
|
1299
|
+
layout=False, apply_exclusions=False
|
1300
|
+
).strip()
|
1301
|
+
|
1302
|
+
rounded_top = round(cell_data["top"] / coord_tolerance) * coord_tolerance
|
1303
|
+
rounded_left = round(cell_data["left"] / coord_tolerance) * coord_tolerance
|
1304
|
+
cell_text_map[(rounded_top, rounded_left)] = cell_value
|
1305
|
+
except Exception as e:
|
1306
|
+
logger.warning(f"Could not process cell {cell_data} for text extraction: {e}")
|
1307
|
+
|
1308
|
+
# 3. Build the final list-of-lists table (loop remains the same)
|
1309
|
+
final_table = []
|
1310
|
+
for row_top in unique_tops:
|
1311
|
+
row_data = []
|
1312
|
+
for col_left in unique_lefts:
|
1313
|
+
best_match_key = None
|
1314
|
+
min_dist_sq = float("inf")
|
1315
|
+
for map_top, map_left in cell_text_map.keys():
|
1316
|
+
if (
|
1317
|
+
abs(map_top - row_top) <= coord_tolerance
|
1318
|
+
and abs(map_left - col_left) <= coord_tolerance
|
1319
|
+
):
|
1320
|
+
dist_sq = (map_top - row_top) ** 2 + (map_left - col_left) ** 2
|
1321
|
+
if dist_sq < min_dist_sq:
|
1322
|
+
min_dist_sq = dist_sq
|
1323
|
+
best_match_key = (map_top, map_left)
|
1324
|
+
cell_value = cell_text_map.get(best_match_key)
|
1325
|
+
row_data.append(cell_value)
|
1326
|
+
final_table.append(row_data)
|
1327
|
+
|
1328
|
+
return final_table
|
1329
|
+
|
1330
|
+
# --- END MODIFIED METHOD --- #
|
1331
|
+
|
1332
|
+
@overload
def find(
    self,
    *,
    text: str,
    apply_exclusions: bool = True,
    regex: bool = False,
    case: bool = True,
    **kwargs,
) -> Optional["Element"]: ...


@overload
def find(
    self,
    selector: str,
    *,
    apply_exclusions: bool = True,
    regex: bool = False,
    case: bool = True,
    **kwargs,
) -> Optional["Element"]: ...


def find(
    self,
    selector: Optional[str] = None,
    *,
    text: Optional[str] = None,
    apply_exclusions: bool = True,
    regex: bool = False,
    case: bool = True,
    **kwargs,
) -> Optional["Element"]:
    """
    Return the first element in this region that matches a selector or a text query.

    Pass exactly one of `selector` or `text`. All arguments are forwarded
    unchanged to `find_all`, which performs the search.

    Args:
        selector: CSS-like selector string.
        text: Text content to search for (equivalent to 'text:contains(...)').
        apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
        regex: Whether the text search is a regex search (default: False).
        case: Whether the text search is case-sensitive (default: True).
        **kwargs: Additional parameters for element filtering.

    Returns:
        The `first` item of the collection returned by `find_all`, or None when
        that collection is empty.
    """
    # Everything, including argument validation, is handled by find_all.
    forwarded = {
        "selector": selector,
        "text": text,
        "apply_exclusions": apply_exclusions,
        "regex": regex,
        "case": case,
    }
    matches = self.find_all(**forwarded, **kwargs)
    if not matches:
        return None
    return matches.first
|
1390
|
+
|
1391
|
+
@overload
|
1392
|
+
def find_all(
|
1393
|
+
self,
|
1394
|
+
*,
|
1395
|
+
text: str,
|
1396
|
+
apply_exclusions: bool = True,
|
1397
|
+
regex: bool = False,
|
1398
|
+
case: bool = True,
|
1399
|
+
**kwargs,
|
1400
|
+
) -> "ElementCollection": ...
|
1401
|
+
|
1402
|
+
@overload
|
1403
|
+
def find_all(
|
1404
|
+
self,
|
1405
|
+
selector: str,
|
1406
|
+
*,
|
1407
|
+
apply_exclusions: bool = True,
|
1408
|
+
regex: bool = False,
|
1409
|
+
case: bool = True,
|
1410
|
+
**kwargs,
|
1411
|
+
) -> "ElementCollection": ...
|
1069
1412
|
|
1070
1413
|
def find_all(
|
1071
|
-
self,
|
1072
|
-
|
1414
|
+
self,
|
1415
|
+
selector: Optional[str] = None, # Now optional
|
1416
|
+
*,
|
1417
|
+
text: Optional[str] = None, # New text parameter
|
1418
|
+
apply_exclusions: bool = True,
|
1419
|
+
regex: bool = False,
|
1420
|
+
case: bool = True,
|
1421
|
+
**kwargs,
|
1422
|
+
) -> "ElementCollection":
|
1073
1423
|
"""
|
1074
|
-
Find all elements in this region matching the selector.
|
1424
|
+
Find all elements in this region matching the selector OR text content.
|
1425
|
+
|
1426
|
+
Provide EITHER `selector` OR `text`, but not both.
|
1075
1427
|
|
1076
1428
|
Args:
|
1077
|
-
selector: CSS-like selector string
|
1078
|
-
|
1079
|
-
|
1429
|
+
selector: CSS-like selector string.
|
1430
|
+
text: Text content to search for (equivalent to 'text:contains(...)').
|
1431
|
+
apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
|
1432
|
+
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
1433
|
+
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
1434
|
+
**kwargs: Additional parameters for element filtering.
|
1080
1435
|
|
1081
1436
|
Returns:
|
1082
|
-
ElementCollection with matching elements
|
1437
|
+
ElementCollection with matching elements.
|
1083
1438
|
"""
|
1084
1439
|
from natural_pdf.elements.collections import ElementCollection
|
1085
1440
|
|
1441
|
+
if selector is not None and text is not None:
|
1442
|
+
raise ValueError("Provide either 'selector' or 'text', not both.")
|
1443
|
+
if selector is None and text is None:
|
1444
|
+
raise ValueError("Provide either 'selector' or 'text'.")
|
1445
|
+
|
1446
|
+
# Construct selector if 'text' is provided
|
1447
|
+
effective_selector = ""
|
1448
|
+
if text is not None:
|
1449
|
+
escaped_text = text.replace('"', '\\"').replace("'", "\\'")
|
1450
|
+
effective_selector = f'text:contains("{escaped_text}")'
|
1451
|
+
logger.debug(
|
1452
|
+
f"Using text shortcut: find_all(text='{text}') -> find_all('{effective_selector}')"
|
1453
|
+
)
|
1454
|
+
elif selector is not None:
|
1455
|
+
effective_selector = selector
|
1456
|
+
else:
|
1457
|
+
raise ValueError("Internal error: No selector or text provided.")
|
1458
|
+
|
1086
1459
|
# If we span multiple pages, filter our elements
|
1087
1460
|
# TODO: Revisit multi-page region logic
|
1088
1461
|
if self._spans_pages and self._multi_page_elements is not None:
|
1089
1462
|
logger.warning("find_all on multi-page regions is not fully implemented.")
|
1090
1463
|
# Temporary: Apply filter directly to cached elements
|
1091
|
-
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
1092
|
-
|
1093
1464
|
try:
|
1094
|
-
selector_obj = parse_selector(
|
1465
|
+
selector_obj = parse_selector(effective_selector)
|
1466
|
+
# Pass regex/case flags down
|
1467
|
+
kwargs["regex"] = regex
|
1468
|
+
kwargs["case"] = case
|
1095
1469
|
filter_func = selector_to_filter_func(selector_obj, **kwargs)
|
1096
1470
|
matching = [el for el in self._multi_page_elements if filter_func(el)]
|
1097
1471
|
return ElementCollection(matching)
|
@@ -1099,11 +1473,37 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1099
1473
|
logger.error(f"Error applying selector to multi-page region elements: {e}")
|
1100
1474
|
return ElementCollection([])
|
1101
1475
|
|
1102
|
-
#
|
1103
|
-
|
1104
|
-
|
1105
|
-
|
1106
|
-
|
1476
|
+
# Normal case: Region is on a single page
|
1477
|
+
try:
|
1478
|
+
# Parse the final selector string
|
1479
|
+
selector_obj = parse_selector(effective_selector)
|
1480
|
+
|
1481
|
+
# Get all potentially relevant elements from the page
|
1482
|
+
# Let the page handle its exclusion logic if needed
|
1483
|
+
potential_elements = self.page.find_all(
|
1484
|
+
selector=effective_selector,
|
1485
|
+
apply_exclusions=False, # Apply exclusions LATER based on region bbox
|
1486
|
+
regex=regex,
|
1487
|
+
case=case,
|
1488
|
+
**kwargs,
|
1489
|
+
)
|
1490
|
+
|
1491
|
+
# Filter these elements to those strictly within the region's bounds
|
1492
|
+
region_bbox = self.bbox
|
1493
|
+
matching_elements = [
|
1494
|
+
el
|
1495
|
+
for el in potential_elements
|
1496
|
+
if el.x0 >= region_bbox[0]
|
1497
|
+
and el.top >= region_bbox[1]
|
1498
|
+
and el.x1 <= region_bbox[2]
|
1499
|
+
and el.bottom <= region_bbox[3]
|
1500
|
+
]
|
1501
|
+
|
1502
|
+
return ElementCollection(matching_elements)
|
1503
|
+
|
1504
|
+
except Exception as e:
|
1505
|
+
logger.error(f"Error during find_all in region: {e}", exc_info=True)
|
1506
|
+
return ElementCollection([])
|
1107
1507
|
|
1108
1508
|
def apply_ocr(self, replace=True, **ocr_params) -> "Region":
|
1109
1509
|
"""
|
@@ -1111,7 +1511,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1111
1511
|
|
1112
1512
|
Args:
|
1113
1513
|
replace: If True (default), removes existing OCR elements in the region
|
1114
|
-
before adding new ones. If False, adds new OCR elements without
|
1514
|
+
before adding new ones. If False, adds new OCR elements without
|
1115
1515
|
removing existing ones.
|
1116
1516
|
**ocr_params: Keyword arguments passed to the OCR Manager.
|
1117
1517
|
Common parameters like `engine`, `languages`, `min_confidence`,
|
@@ -1131,13 +1531,17 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1131
1531
|
|
1132
1532
|
# If replace is True, find and remove existing OCR elements in this region
|
1133
1533
|
if replace:
|
1134
|
-
logger.info(
|
1534
|
+
logger.info(
|
1535
|
+
f"Region {self.bbox}: Removing existing OCR elements before applying new OCR."
|
1536
|
+
)
|
1135
1537
|
# Find all OCR elements in this region
|
1136
1538
|
ocr_selector = "text[source=ocr]"
|
1137
1539
|
ocr_elements = self.find_all(ocr_selector)
|
1138
|
-
|
1540
|
+
|
1139
1541
|
if ocr_elements:
|
1140
|
-
logger.info(
|
1542
|
+
logger.info(
|
1543
|
+
f"Region {self.bbox}: Found {len(ocr_elements)} existing OCR elements to remove."
|
1544
|
+
)
|
1141
1545
|
# Remove these elements from their page
|
1142
1546
|
removed_count = ocr_elements.remove()
|
1143
1547
|
logger.info(f"Region {self.bbox}: Removed {removed_count} OCR elements.")
|
@@ -1661,8 +2065,6 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1661
2065
|
return self.child_regions
|
1662
2066
|
|
1663
2067
|
# Use existing selector parser to filter
|
1664
|
-
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
1665
|
-
|
1666
2068
|
try:
|
1667
2069
|
selector_obj = parse_selector(selector)
|
1668
2070
|
filter_func = selector_to_filter_func(selector_obj) # Removed region=self
|
@@ -1703,8 +2105,6 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1703
2105
|
|
1704
2106
|
# Filter by selector if provided
|
1705
2107
|
if selector is not None:
|
1706
|
-
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
1707
|
-
|
1708
2108
|
try:
|
1709
2109
|
selector_obj = parse_selector(selector)
|
1710
2110
|
filter_func = selector_to_filter_func(selector_obj) # Removed region=self
|
@@ -1717,11 +2117,6 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1717
2117
|
|
1718
2118
|
return all_descendants
|
1719
2119
|
|
1720
|
-
# Removed recursive=True, find_all on region shouldn't be recursive by default
|
1721
|
-
# Renamed _find_all back to find_all
|
1722
|
-
# def find_all(self, selector, apply_exclusions=True, **kwargs):
|
1723
|
-
# See implementation above near get_elements
|
1724
|
-
|
1725
2120
|
def __repr__(self) -> str:
|
1726
2121
|
"""String representation of the region."""
|
1727
2122
|
poly_info = " (Polygon)" if self.has_polygon else ""
|
@@ -1772,44 +2167,218 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
|
1772
2167
|
|
1773
2168
|
# --- Classification Mixin Implementation --- #
|
1774
2169
|
def _get_classification_manager(self) -> "ClassificationManager":
    """Look up the classification manager through ``self.page.pdf``.

    Returns:
        Whatever ``self.page.pdf.get_manager("classification")`` returns.

    Raises:
        AttributeError: If ``page``, ``page.pdf`` or ``page.pdf.get_manager``
            is missing, or if ``get_manager`` itself raises ``ValueError``,
            ``RuntimeError`` or ``AttributeError`` (re-raised with the
            original exception chained as the cause).
    """
    # Walk page -> pdf -> get_manager; a private sentinel distinguishes a
    # missing attribute from one that exists but holds None.
    absent = object()
    holder = self
    for attr_name in ("page", "pdf", "get_manager"):
        holder = getattr(holder, attr_name, absent)
        if holder is absent:
            raise AttributeError(
                "ClassificationManager cannot be accessed: Parent Page, PDF, or get_manager method missing."
            )

    try:
        # Go through the PDF's manager registry accessor, reached via the page.
        return self.page.pdf.get_manager("classification")
    except (ValueError, RuntimeError, AttributeError) as e:
        # Normalize lookup failures to AttributeError while keeping the cause.
        raise AttributeError(
            f"Failed to get ClassificationManager from PDF via Page: {e}"
        ) from e
|
2186
|
+
|
2187
|
+
def _get_classification_content(
    self, model_type: str, **kwargs
) -> Union[str, "Image"]:  # "Image" stays a string so the import can be lazy
    """Produce the classifier input for this region.

    Args:
        model_type: ``"text"`` to use the region's extracted text, or
            ``"vision"`` to use a rendered crop of the region.
        **kwargs: Only ``resolution`` is read, and only for ``"vision"``;
            it defaults to 150 when not supplied.

    Returns:
        The extracted text for ``"text"``, or the object returned by
        ``self.to_image(...)`` for ``"vision"``.

    Raises:
        ValueError: If the extracted text is empty or whitespace-only, if
            ``to_image`` returns ``None``, or if ``model_type`` is neither
            ``"text"`` nor ``"vision"``.
    """
    if model_type == "text":
        # layout=False: a simple join of the text is enough for classification.
        text_content = self.extract_text(layout=False)
        if not text_content or text_content.isspace():
            raise ValueError("Cannot classify region with 'text' model: No text content found.")
        return text_content

    if model_type == "vision":
        # The return value is not needed; the call is kept so that an
        # unreachable manager still fails here, before any rendering work.
        self._get_classification_manager()

        # The default lives here; classify() forwards a user-specified
        # resolution via **kwargs. (**kwargs is always bound, so no
        # locals() check is needed.)
        resolution = kwargs.get("resolution", 150)

        img = self.to_image(
            resolution=resolution,
            include_highlights=False,  # No highlights for classification input
            crop_only=True,  # Just the region content
        )
        if img is None:
            raise ValueError(
                "Cannot classify region with 'vision' model: Failed to render image."
            )
        return img

    raise ValueError(f"Unsupported model_type for classification: {model_type}")
|
1808
2219
|
|
1809
2220
|
def _get_metadata_storage(self) -> Dict[str, Any]:
    """Return ``self.metadata``, first replacing a missing or ``None`` value with ``{}``."""
    # A single lookup covers both "attribute absent" and "attribute is None".
    if getattr(self, "metadata", None) is None:
        self.metadata = {}
    return self.metadata
|
1814
2225
|
|
1815
2226
|
# --- End Classification Mixin Implementation --- #
|
2227
|
+
|
2228
|
+
# --- NEW METHOD: analyze_text_table_structure ---
|
2229
|
+
def analyze_text_table_structure(
    self,
    snap_tolerance: int = 10,
    join_tolerance: int = 3,
    min_words_vertical: int = 3,
    min_words_horizontal: int = 1,
    intersection_tolerance: int = 3,
    expand_bbox: Optional[Dict[str, int]] = None,
    **kwargs,
) -> Optional[Dict]:
    """
    Infer a table structure (edges, cells, intersections) from how the text
    elements inside this region, or a slightly expanded copy of it, line up.

    The bounding boxes of the text elements are handed to
    ``find_text_based_tables``; a successful result is also stored under
    ``self.analyses["text_table_structure"]``.

    Args:
        snap_tolerance: Tolerance for snapping parallel lines.
        join_tolerance: Tolerance for joining collinear lines.
        min_words_vertical: Minimum words needed to define a vertical line.
        min_words_horizontal: Minimum words needed to define a horizontal line.
        intersection_tolerance: Tolerance for detecting line intersections.
        expand_bbox: Optional dict of keyword arguments for ``self.expand``
            (e.g., {'left': 5, 'right': 5}) to widen the search area. If the
            expansion fails, the original region is used.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``find_text_based_tables``.

    Returns:
        The dictionary returned by ``find_text_based_tables``; a dictionary
        with empty 'horizontal_edges', 'vertical_edges', 'cells' and
        'intersections' when no text elements or bounding boxes are found
        (this empty result is not cached); or None when an ImportError or any
        other exception is raised during the analysis step.
    """

    def _no_structure() -> Dict:
        # Built fresh on every call so callers never share the containers.
        return {"horizontal_edges": [], "vertical_edges": [], "cells": [], "intersections": {}}

    # Pick the area to search: the region itself unless an expansion is requested.
    area = self
    if expand_bbox and isinstance(expand_bbox, dict):
        try:
            area = self.expand(**expand_bbox)
            logger.debug(
                f"Expanded search region for text table analysis to: {area.bbox}"
            )
        except Exception as e:
            logger.warning(f"Could not expand region bbox: {e}. Using original region.")
            area = self

    # Collect text elements with exclusions disabled.
    words = area.find_all("text", apply_exclusions=False)
    if not words:
        logger.info(f"Region {self.bbox}: No text elements found for text table analysis.")
        return _no_structure()

    # Only elements that actually expose a bbox take part in the analysis.
    word_boxes = [word.bbox for word in words if hasattr(word, "bbox")]
    if not word_boxes:
        logger.info(f"Region {self.bbox}: No bboxes extracted from text elements.")
        return _no_structure()

    try:
        structure = find_text_based_tables(
            bboxes=word_boxes,
            snap_tolerance=snap_tolerance,
            join_tolerance=join_tolerance,
            min_words_vertical=min_words_vertical,
            min_words_horizontal=min_words_horizontal,
            intersection_tolerance=intersection_tolerance,
            **kwargs,  # caller-supplied extras go straight through
        )
        # Remember the outcome in this region's analyses cache.
        self.analyses["text_table_structure"] = structure
        return structure
    except ImportError:
        logger.error("pdfplumber library is required for 'text' table analysis but not found.")
        return None
    except Exception as e:
        logger.error(f"Error during text-based table analysis: {e}", exc_info=True)
        return None
|
2306
|
+
|
2307
|
+
# --- END NEW METHOD ---
|
2308
|
+
|
2309
|
+
# --- NEW METHOD: get_text_table_cells ---
|
2310
|
+
def get_text_table_cells(
    self,
    snap_tolerance: int = 10,
    join_tolerance: int = 3,
    min_words_vertical: int = 3,
    min_words_horizontal: int = 1,
    intersection_tolerance: int = 3,
    expand_bbox: Optional[Dict[str, int]] = None,
    **kwargs,
) -> "ElementCollection[Region]":
    """
    Return the table cells found by text-alignment analysis as temporary
    Region objects created through ``self.page.region``.

    If ``self.analyses`` already holds a "text_table_structure" entry, that
    entry is reused as-is and the arguments below are ignored; otherwise they
    are forwarded to ``analyze_text_table_structure``.

    Args:
        snap_tolerance: Tolerance for snapping parallel lines.
        join_tolerance: Tolerance for joining collinear lines.
        min_words_vertical: Minimum words needed to define a vertical line.
        min_words_horizontal: Minimum words needed to define a horizontal line.
        intersection_tolerance: Tolerance for detecting line intersections.
        expand_bbox: Optional dictionary to expand the search area slightly
            beyond the region's exact bounds (e.g., {'left': 5, 'right': 5}).
        **kwargs: Additional keyword arguments forwarded to the analysis.

    Returns:
        An ElementCollection with one Region per detected cell. The collection
        is empty when the analysis yields None or no cells. A cell whose
        Region cannot be created is logged and left out.
    """
    from natural_pdf.elements.collections import ElementCollection

    # Step 1: obtain the structure, running the analysis only when nothing is cached.
    if "text_table_structure" not in self.analyses:
        structure = self.analyze_text_table_structure(
            snap_tolerance=snap_tolerance,
            join_tolerance=join_tolerance,
            min_words_vertical=min_words_vertical,
            min_words_horizontal=min_words_horizontal,
            intersection_tolerance=intersection_tolerance,
            expand_bbox=expand_bbox,
            **kwargs,
        )
    else:
        structure = self.analyses["text_table_structure"]
        logger.debug("get_text_table_cells: Using cached analysis results.")

    # Step 2: nothing to build when the analysis failed or produced no cells.
    if structure is None or not structure.get("cells"):
        logger.info(f"Region {self.bbox}: No cells found by text table analysis.")
        return ElementCollection([])

    # Step 3: wrap every cell dict in a Region tagged as a volatile table cell.
    built = []
    for cell_bounds in structure["cells"]:
        try:
            # Each cell dict is passed as keyword arguments to page.region.
            cell = self.page.region(**cell_bounds)

            cell.region_type = "table-cell"
            cell.normalized_type = "table-cell"
            cell.model = "pdfplumber-text"
            cell.source = "volatile"  # not managed/persistent
            cell.parent_region = self  # link back to the originating region
        except Exception as e:
            logger.warning(f"Could not create Region object for cell data {cell_bounds}: {e}")
        else:
            built.append(cell)

    # Step 4: hand the regions back as a collection.
    logger.debug(f"get_text_table_cells: Created {len(built)} temporary cell regions.")
    return ElementCollection(built)
|
2383
|
+
|
2384
|
+
# --- END NEW METHOD ---
|