natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +422 -0
- natural_pdf/classification/mixin.py +163 -0
- natural_pdf/classification/results.py +80 -0
- natural_pdf/collections/mixins.py +111 -0
- natural_pdf/collections/pdf_collection.py +434 -15
- natural_pdf/core/element_manager.py +83 -0
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +578 -93
- natural_pdf/core/pdf.py +912 -460
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +712 -109
- natural_pdf/elements/region.py +722 -69
- natural_pdf/elements/text.py +4 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +5 -4
- natural_pdf/extraction/manager.py +135 -0
- natural_pdf/extraction/mixin.py +279 -0
- natural_pdf/extraction/result.py +23 -0
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +122 -26
- natural_pdf/ocr/ocr_options.py +94 -11
- natural_pdf/ocr/utils.py +19 -6
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +431 -230
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +60 -1
- natural_pdf/utils/tqdm_utils.py +51 -0
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -915
- docs/element-selection/index.md +0 -229
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -170
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -209
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -194
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -340
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -147
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -114
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -270
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -332
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -288
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -413
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -508
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2434
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -512
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -604
- docs/tutorials/12-ocr-integration.md +0 -175
- docs/tutorials/13-semantic-search.ipynb +0 -1328
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.7.dist-info/RECORD +0 -145
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
natural_pdf/elements/region.py
CHANGED
@@ -1,20 +1,37 @@
|
|
1
1
|
import logging
|
2
|
-
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
|
2
|
+
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union, overload
|
3
3
|
|
4
4
|
from pdfplumber.utils.geometry import get_bbox_overlap, merge_bboxes, objects_to_bbox
|
5
5
|
|
6
6
|
# New Imports
|
7
7
|
from pdfplumber.utils.text import TEXTMAP_KWARGS, WORD_EXTRACTOR_KWARGS, chars_to_textmap
|
8
8
|
|
9
|
+
from natural_pdf.analyzers.layout.pdfplumber_table_finder import find_text_based_tables
|
10
|
+
from natural_pdf.classification.manager import ClassificationManager # Keep for type hint
|
11
|
+
|
12
|
+
# --- Classification Imports --- #
|
13
|
+
from natural_pdf.classification.mixin import ClassificationMixin
|
9
14
|
from natural_pdf.elements.base import DirectionalMixin
|
15
|
+
from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
|
16
|
+
from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
|
17
|
+
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
18
|
+
from natural_pdf.utils.locks import pdf_render_lock # Import the lock
|
10
19
|
|
11
20
|
# Import new utils
|
12
21
|
from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
|
13
22
|
|
14
|
-
|
23
|
+
# --- NEW: Import tqdm utility --- #
|
24
|
+
from natural_pdf.utils.tqdm_utils import get_tqdm
|
25
|
+
|
26
|
+
# --- End Classification Imports --- #
|
27
|
+
|
15
28
|
|
16
29
|
if TYPE_CHECKING:
|
30
|
+
# --- NEW: Add Image type hint for classification --- #
|
31
|
+
from PIL.Image import Image
|
32
|
+
|
17
33
|
from natural_pdf.core.page import Page
|
34
|
+
from natural_pdf.elements.collections import ElementCollection
|
18
35
|
from natural_pdf.elements.text import TextElement
|
19
36
|
|
20
37
|
# Import OCRManager conditionally to avoid circular imports
|
@@ -27,7 +44,7 @@ except ImportError:
|
|
27
44
|
logger = logging.getLogger(__name__)
|
28
45
|
|
29
46
|
|
30
|
-
class Region(DirectionalMixin):
|
47
|
+
class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
31
48
|
"""
|
32
49
|
Represents a rectangular region on a page.
|
33
50
|
"""
|
@@ -57,6 +74,12 @@ class Region(DirectionalMixin):
|
|
57
74
|
self.start_element = None
|
58
75
|
self.end_element = None
|
59
76
|
|
77
|
+
# --- ADDED --- Metadata store for mixins
|
78
|
+
self.metadata: Dict[str, Any] = {}
|
79
|
+
# --- NEW --- Central registry for analysis results
|
80
|
+
self.analyses: Dict[str, Any] = {}
|
81
|
+
# --- END ADDED ---
|
82
|
+
|
60
83
|
# Standard attributes for all elements
|
61
84
|
self.object_type = "region" # For selector compatibility
|
62
85
|
|
@@ -490,9 +513,37 @@ class Region(DirectionalMixin):
|
|
490
513
|
|
491
514
|
return inside
|
492
515
|
|
516
|
+
def is_element_center_inside(self, element: "Element") -> bool:
|
517
|
+
"""
|
518
|
+
Check if the center point of an element's bounding box is inside this region.
|
519
|
+
|
520
|
+
Args:
|
521
|
+
element: Element to check
|
522
|
+
|
523
|
+
Returns:
|
524
|
+
True if the element's center point is inside the region, False otherwise.
|
525
|
+
"""
|
526
|
+
# Check if element is on the same page
|
527
|
+
if not hasattr(element, "page") or element.page != self._page:
|
528
|
+
return False
|
529
|
+
|
530
|
+
# Ensure element has necessary attributes
|
531
|
+
if not all(hasattr(element, attr) for attr in ["x0", "x1", "top", "bottom"]):
|
532
|
+
logger.warning(
|
533
|
+
f"Element {element} lacks bounding box attributes. Cannot check center point."
|
534
|
+
)
|
535
|
+
return False # Cannot determine position
|
536
|
+
|
537
|
+
# Calculate center point
|
538
|
+
center_x = (element.x0 + element.x1) / 2
|
539
|
+
center_y = (element.top + element.bottom) / 2
|
540
|
+
|
541
|
+
# Use the existing is_point_inside check
|
542
|
+
return self.is_point_inside(center_x, center_y)
|
543
|
+
|
493
544
|
def _is_element_in_region(self, element: "Element", use_boundary_tolerance=True) -> bool:
|
494
545
|
"""
|
495
|
-
Check if an element is within this region.
|
546
|
+
Check if an element intersects or is contained within this region.
|
496
547
|
|
497
548
|
Args:
|
498
549
|
element: Element to check
|
@@ -509,16 +560,101 @@ class Region(DirectionalMixin):
|
|
509
560
|
if not hasattr(element, "page") or element.page != self._page:
|
510
561
|
return False
|
511
562
|
|
512
|
-
|
563
|
+
return self.is_element_center_inside(element)
|
564
|
+
# return self.intersects(element)
|
565
|
+
|
566
|
+
def contains(self, element: "Element") -> bool:
|
567
|
+
"""
|
568
|
+
Check if this region completely contains an element.
|
569
|
+
|
570
|
+
Args:
|
571
|
+
element: Element to check
|
572
|
+
|
573
|
+
Returns:
|
574
|
+
True if the element is completely contained within the region, False otherwise
|
575
|
+
"""
|
576
|
+
# Check if element is on the same page
|
577
|
+
if not hasattr(element, "page") or element.page != self._page:
|
578
|
+
return False
|
579
|
+
|
513
580
|
# Ensure element has necessary attributes
|
514
581
|
if not all(hasattr(element, attr) for attr in ["x0", "x1", "top", "bottom"]):
|
515
582
|
return False # Cannot determine position
|
516
583
|
|
517
|
-
|
518
|
-
|
584
|
+
# For rectangular regions, check if element's bbox is fully inside region's bbox
|
585
|
+
if not self.has_polygon:
|
586
|
+
return (
|
587
|
+
self.x0 <= element.x0
|
588
|
+
and element.x1 <= self.x1
|
589
|
+
and self.top <= element.top
|
590
|
+
and element.bottom <= self.bottom
|
591
|
+
)
|
592
|
+
|
593
|
+
# For polygon regions, check if all corners of the element are inside the polygon
|
594
|
+
element_corners = [
|
595
|
+
(element.x0, element.top), # top-left
|
596
|
+
(element.x1, element.top), # top-right
|
597
|
+
(element.x1, element.bottom), # bottom-right
|
598
|
+
(element.x0, element.bottom), # bottom-left
|
599
|
+
]
|
600
|
+
|
601
|
+
return all(self.is_point_inside(x, y) for x, y in element_corners)
|
602
|
+
|
603
|
+
def intersects(self, element: "Element") -> bool:
|
604
|
+
"""
|
605
|
+
Check if this region intersects with an element (any overlap).
|
519
606
|
|
520
|
-
|
521
|
-
|
607
|
+
Args:
|
608
|
+
element: Element to check
|
609
|
+
|
610
|
+
Returns:
|
611
|
+
True if the element overlaps with the region at all, False otherwise
|
612
|
+
"""
|
613
|
+
# Check if element is on the same page
|
614
|
+
if not hasattr(element, "page") or element.page != self._page:
|
615
|
+
return False
|
616
|
+
|
617
|
+
# Ensure element has necessary attributes
|
618
|
+
if not all(hasattr(element, attr) for attr in ["x0", "x1", "top", "bottom"]):
|
619
|
+
return False # Cannot determine position
|
620
|
+
|
621
|
+
# For rectangular regions, check for bbox overlap
|
622
|
+
if not self.has_polygon:
|
623
|
+
return (
|
624
|
+
self.x0 < element.x1
|
625
|
+
and self.x1 > element.x0
|
626
|
+
and self.top < element.bottom
|
627
|
+
and self.bottom > element.top
|
628
|
+
)
|
629
|
+
|
630
|
+
# For polygon regions, check if any corner of the element is inside the polygon
|
631
|
+
element_corners = [
|
632
|
+
(element.x0, element.top), # top-left
|
633
|
+
(element.x1, element.top), # top-right
|
634
|
+
(element.x1, element.bottom), # bottom-right
|
635
|
+
(element.x0, element.bottom), # bottom-left
|
636
|
+
]
|
637
|
+
|
638
|
+
# First check if any element corner is inside the polygon
|
639
|
+
if any(self.is_point_inside(x, y) for x, y in element_corners):
|
640
|
+
return True
|
641
|
+
|
642
|
+
# Also check if any polygon corner is inside the element's rectangle
|
643
|
+
for x, y in self.polygon:
|
644
|
+
if element.x0 <= x <= element.x1 and element.top <= y <= element.bottom:
|
645
|
+
return True
|
646
|
+
|
647
|
+
# Also check if any polygon edge intersects with any rectangle edge
|
648
|
+
# This is a simplification - for complex cases, we'd need a full polygon-rectangle
|
649
|
+
# intersection algorithm
|
650
|
+
|
651
|
+
# For now, return True if bounding boxes overlap (approximation for polygon-rectangle case)
|
652
|
+
return (
|
653
|
+
self.x0 < element.x1
|
654
|
+
and self.x1 > element.x0
|
655
|
+
and self.top < element.bottom
|
656
|
+
and self.bottom > element.top
|
657
|
+
)
|
522
658
|
|
523
659
|
def highlight(
|
524
660
|
self,
|
@@ -600,6 +736,18 @@ class Region(DirectionalMixin):
|
|
600
736
|
x1 = int(self.x1 * scale_factor)
|
601
737
|
bottom = int(self.bottom * scale_factor)
|
602
738
|
|
739
|
+
# Ensure coords are valid for cropping (left < right, top < bottom)
|
740
|
+
if x0 >= x1:
|
741
|
+
logger.warning(
|
742
|
+
f"Region {self.bbox} resulted in non-positive width after scaling ({x0} >= {x1}). Cannot create image."
|
743
|
+
)
|
744
|
+
return None
|
745
|
+
if top >= bottom:
|
746
|
+
logger.warning(
|
747
|
+
f"Region {self.bbox} resulted in non-positive height after scaling ({top} >= {bottom}). Cannot create image."
|
748
|
+
)
|
749
|
+
return None
|
750
|
+
|
603
751
|
# Crop the image to just this region
|
604
752
|
region_image = page_image.crop((x0, top, x1, bottom))
|
605
753
|
|
@@ -776,11 +924,6 @@ class Region(DirectionalMixin):
|
|
776
924
|
debug = kwargs.get("debug", debug or kwargs.get("debug_exclusions", False))
|
777
925
|
logger.debug(f"Region {self.bbox}: extract_text called with kwargs: {kwargs}")
|
778
926
|
|
779
|
-
# --- Handle Docling source (priority) --- DEPRECATED or Adapt?
|
780
|
-
# For now, let's bypass this and always use the standard extraction flow
|
781
|
-
# based on contained elements to ensure consistency.
|
782
|
-
# if self.model == 'docling' or hasattr(self, 'text_content'): ...
|
783
|
-
|
784
927
|
# 1. Get Word Elements potentially within this region (initial broad phase)
|
785
928
|
# Optimization: Could use spatial query if page elements were indexed
|
786
929
|
page_words = self.page.words # Get all words from the page
|
@@ -829,7 +972,7 @@ class Region(DirectionalMixin):
|
|
829
972
|
result = generate_text_layout(
|
830
973
|
char_dicts=filtered_chars,
|
831
974
|
layout_context_bbox=self.bbox, # Use region's bbox for context
|
832
|
-
user_kwargs=kwargs,
|
975
|
+
user_kwargs=kwargs, # Pass original kwargs to layout generator
|
833
976
|
)
|
834
977
|
|
835
978
|
logger.debug(f"Region {self.bbox}: extract_text finished, result length: {len(result)}.")
|
@@ -837,40 +980,65 @@ class Region(DirectionalMixin):
|
|
837
980
|
|
838
981
|
def extract_table(
|
839
982
|
self,
|
840
|
-
method: str = None,
|
841
|
-
table_settings: dict = None,
|
983
|
+
method: Optional[str] = None, # Make method optional
|
984
|
+
table_settings: Optional[dict] = None, # Use Optional
|
842
985
|
use_ocr: bool = False,
|
843
|
-
ocr_config: dict = None,
|
844
|
-
|
986
|
+
ocr_config: Optional[dict] = None, # Use Optional
|
987
|
+
text_options: Optional[Dict] = None,
|
988
|
+
cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
|
989
|
+
# --- NEW: Add tqdm control option --- #
|
990
|
+
show_progress: bool = False, # Controls progress bar for text method
|
991
|
+
) -> List[List[Optional[str]]]: # Return type allows Optional[str] for cells
|
845
992
|
"""
|
846
993
|
Extract a table from this region.
|
847
994
|
|
848
995
|
Args:
|
849
|
-
method: Method to use
|
850
|
-
table_settings: Settings for pdfplumber table extraction (used only with 'plumber' method)
|
851
|
-
use_ocr: Whether to use OCR for text extraction (only applicable with 'tatr' method)
|
852
|
-
ocr_config: OCR configuration parameters
|
996
|
+
method: Method to use: 'tatr', 'plumber', 'text', or None (auto-detect).
|
997
|
+
table_settings: Settings for pdfplumber table extraction (used only with 'plumber' method).
|
998
|
+
use_ocr: Whether to use OCR for text extraction (currently only applicable with 'tatr' method).
|
999
|
+
ocr_config: OCR configuration parameters.
|
1000
|
+
text_options: Dictionary of options for the 'text' method, corresponding to arguments
|
1001
|
+
of analyze_text_table_structure (e.g., snap_tolerance, expand_bbox).
|
1002
|
+
cell_extraction_func: Optional callable function that takes a cell Region object
|
1003
|
+
and returns its string content. Overrides default text extraction
|
1004
|
+
for the 'text' method.
|
1005
|
+
show_progress: If True, display a progress bar during cell text extraction for the 'text' method.
|
853
1006
|
|
854
1007
|
Returns:
|
855
|
-
Table data as a list of rows, where each row is a list of cell values
|
1008
|
+
Table data as a list of rows, where each row is a list of cell values (str or None).
|
856
1009
|
"""
|
857
1010
|
# Default settings if none provided
|
858
1011
|
if table_settings is None:
|
859
1012
|
table_settings = {}
|
1013
|
+
if text_options is None:
|
1014
|
+
text_options = {} # Initialize empty dict
|
860
1015
|
|
861
1016
|
# Auto-detect method if not specified
|
862
|
-
|
1017
|
+
effective_method = method
|
1018
|
+
if effective_method is None:
|
863
1019
|
# If this is a TATR-detected region, use TATR method
|
864
1020
|
if hasattr(self, "model") and self.model == "tatr" and self.region_type == "table":
|
865
|
-
|
1021
|
+
effective_method = "tatr"
|
866
1022
|
else:
|
867
|
-
|
1023
|
+
effective_method = "text"
|
1024
|
+
|
1025
|
+
logger.debug(f"Region {self.bbox}: Extracting table using method '{effective_method}'")
|
868
1026
|
|
869
1027
|
# Use the selected method
|
870
|
-
if
|
1028
|
+
if effective_method == "tatr":
|
871
1029
|
return self._extract_table_tatr(use_ocr=use_ocr, ocr_config=ocr_config)
|
872
|
-
|
1030
|
+
elif effective_method == "text":
|
1031
|
+
current_text_options = text_options.copy()
|
1032
|
+
current_text_options["cell_extraction_func"] = cell_extraction_func
|
1033
|
+
# --- Pass show_progress to the helper --- #
|
1034
|
+
current_text_options["show_progress"] = show_progress
|
1035
|
+
return self._extract_table_text(**current_text_options)
|
1036
|
+
elif effective_method == "plumber":
|
873
1037
|
return self._extract_table_plumber(table_settings)
|
1038
|
+
else:
|
1039
|
+
raise ValueError(
|
1040
|
+
f"Unknown table extraction method: '{effective_method}'. Choose from 'tatr', 'plumber', 'text'."
|
1041
|
+
)
|
874
1042
|
|
875
1043
|
def _extract_table_plumber(self, table_settings: dict) -> List[List[str]]:
|
876
1044
|
"""
|
@@ -1031,46 +1199,273 @@ class Region(DirectionalMixin):
|
|
1031
1199
|
|
1032
1200
|
return table_data
|
1033
1201
|
|
1034
|
-
def
|
1202
|
+
def _extract_table_text(self, **text_options) -> List[List[Optional[str]]]:
|
1035
1203
|
"""
|
1036
|
-
|
1204
|
+
Extracts table content based on text alignment analysis.
|
1037
1205
|
|
1038
1206
|
Args:
|
1039
|
-
|
1040
|
-
|
1041
|
-
|
1207
|
+
**text_options: Options passed to analyze_text_table_structure,
|
1208
|
+
plus optional 'cell_extraction_func', 'coordinate_grouping_tolerance',
|
1209
|
+
and 'show_progress'.
|
1042
1210
|
|
1043
1211
|
Returns:
|
1044
|
-
|
1212
|
+
Table data as list of lists of strings (or None for empty cells).
|
1045
1213
|
"""
|
1046
|
-
|
1047
|
-
|
1214
|
+
cell_extraction_func = text_options.pop("cell_extraction_func", None)
|
1215
|
+
# --- Get show_progress option --- #
|
1216
|
+
show_progress = text_options.pop("show_progress", False)
|
1217
|
+
|
1218
|
+
# Analyze structure first (or use cached results)
|
1219
|
+
if "text_table_structure" in self.analyses:
|
1220
|
+
analysis_results = self.analyses["text_table_structure"]
|
1221
|
+
logger.debug("Using cached text table structure analysis results.")
|
1222
|
+
else:
|
1223
|
+
analysis_results = self.analyze_text_table_structure(**text_options)
|
1224
|
+
|
1225
|
+
if analysis_results is None or not analysis_results.get("cells"):
|
1226
|
+
logger.warning(f"Region {self.bbox}: No cells found using 'text' method.")
|
1227
|
+
return []
|
1228
|
+
|
1229
|
+
cell_dicts = analysis_results["cells"]
|
1230
|
+
|
1231
|
+
# --- Grid Reconstruction Logic --- #
|
1232
|
+
if not cell_dicts:
|
1233
|
+
return []
|
1234
|
+
|
1235
|
+
# 1. Get unique sorted top and left coordinates (cell boundaries)
|
1236
|
+
coord_tolerance = text_options.get("coordinate_grouping_tolerance", 1)
|
1237
|
+
tops = sorted(
|
1238
|
+
list(set(round(c["top"] / coord_tolerance) * coord_tolerance for c in cell_dicts))
|
1239
|
+
)
|
1240
|
+
lefts = sorted(
|
1241
|
+
list(set(round(c["left"] / coord_tolerance) * coord_tolerance for c in cell_dicts))
|
1242
|
+
)
|
1243
|
+
|
1244
|
+
# Refine boundaries (cluster_coords helper remains the same)
|
1245
|
+
def cluster_coords(coords):
|
1246
|
+
if not coords:
|
1247
|
+
return []
|
1248
|
+
clustered = []
|
1249
|
+
current_cluster = [coords[0]]
|
1250
|
+
for c in coords[1:]:
|
1251
|
+
if abs(c - current_cluster[-1]) <= coord_tolerance:
|
1252
|
+
current_cluster.append(c)
|
1253
|
+
else:
|
1254
|
+
clustered.append(min(current_cluster))
|
1255
|
+
current_cluster = [c]
|
1256
|
+
clustered.append(min(current_cluster))
|
1257
|
+
return clustered
|
1258
|
+
|
1259
|
+
unique_tops = cluster_coords(tops)
|
1260
|
+
unique_lefts = cluster_coords(lefts)
|
1261
|
+
|
1262
|
+
# --- Setup tqdm --- #
|
1263
|
+
tqdm = get_tqdm()
|
1264
|
+
# Determine iterable for tqdm
|
1265
|
+
cell_iterator = cell_dicts
|
1266
|
+
if show_progress:
|
1267
|
+
# Only wrap if progress should be shown
|
1268
|
+
cell_iterator = tqdm(
|
1269
|
+
cell_dicts,
|
1270
|
+
desc=f"Extracting text from {len(cell_dicts)} cells (text method)",
|
1271
|
+
unit="cell",
|
1272
|
+
leave=False, # Optional: Keep bar after completion
|
1273
|
+
)
|
1274
|
+
# --- End tqdm Setup --- #
|
1048
1275
|
|
1276
|
+
# 2. Create a lookup map for cell text: {(rounded_top, rounded_left): cell_text}
|
1277
|
+
cell_text_map = {}
|
1278
|
+
# --- Use the potentially wrapped iterator --- #
|
1279
|
+
for cell_data in cell_iterator:
|
1280
|
+
try:
|
1281
|
+
cell_region = self.page.region(**cell_data)
|
1282
|
+
cell_value = None # Initialize
|
1283
|
+
if callable(cell_extraction_func):
|
1284
|
+
try:
|
1285
|
+
cell_value = cell_extraction_func(cell_region)
|
1286
|
+
if not isinstance(cell_value, (str, type(None))):
|
1287
|
+
logger.warning(
|
1288
|
+
f"Custom cell_extraction_func returned non-string/None type ({type(cell_value)}) for cell {cell_data}. Treating as None."
|
1289
|
+
)
|
1290
|
+
cell_value = None
|
1291
|
+
except Exception as func_err:
|
1292
|
+
logger.error(
|
1293
|
+
f"Error executing custom cell_extraction_func for cell {cell_data}: {func_err}",
|
1294
|
+
exc_info=True,
|
1295
|
+
)
|
1296
|
+
cell_value = None
|
1297
|
+
else:
|
1298
|
+
cell_value = cell_region.extract_text(
|
1299
|
+
layout=False, apply_exclusions=False
|
1300
|
+
).strip()
|
1301
|
+
|
1302
|
+
rounded_top = round(cell_data["top"] / coord_tolerance) * coord_tolerance
|
1303
|
+
rounded_left = round(cell_data["left"] / coord_tolerance) * coord_tolerance
|
1304
|
+
cell_text_map[(rounded_top, rounded_left)] = cell_value
|
1305
|
+
except Exception as e:
|
1306
|
+
logger.warning(f"Could not process cell {cell_data} for text extraction: {e}")
|
1307
|
+
|
1308
|
+
# 3. Build the final list-of-lists table (loop remains the same)
|
1309
|
+
final_table = []
|
1310
|
+
for row_top in unique_tops:
|
1311
|
+
row_data = []
|
1312
|
+
for col_left in unique_lefts:
|
1313
|
+
best_match_key = None
|
1314
|
+
min_dist_sq = float("inf")
|
1315
|
+
for map_top, map_left in cell_text_map.keys():
|
1316
|
+
if (
|
1317
|
+
abs(map_top - row_top) <= coord_tolerance
|
1318
|
+
and abs(map_left - col_left) <= coord_tolerance
|
1319
|
+
):
|
1320
|
+
dist_sq = (map_top - row_top) ** 2 + (map_left - col_left) ** 2
|
1321
|
+
if dist_sq < min_dist_sq:
|
1322
|
+
min_dist_sq = dist_sq
|
1323
|
+
best_match_key = (map_top, map_left)
|
1324
|
+
cell_value = cell_text_map.get(best_match_key)
|
1325
|
+
row_data.append(cell_value)
|
1326
|
+
final_table.append(row_data)
|
1327
|
+
|
1328
|
+
return final_table
|
1329
|
+
|
1330
|
+
# --- END MODIFIED METHOD --- #
|
1331
|
+
|
1332
|
+
@overload
|
1333
|
+
def find(
|
1334
|
+
self,
|
1335
|
+
*,
|
1336
|
+
text: str,
|
1337
|
+
apply_exclusions: bool = True,
|
1338
|
+
regex: bool = False,
|
1339
|
+
case: bool = True,
|
1340
|
+
**kwargs,
|
1341
|
+
) -> Optional["Element"]: ...
|
1342
|
+
|
1343
|
+
@overload
|
1344
|
+
def find(
|
1345
|
+
self,
|
1346
|
+
selector: str,
|
1347
|
+
*,
|
1348
|
+
apply_exclusions: bool = True,
|
1349
|
+
regex: bool = False,
|
1350
|
+
case: bool = True,
|
1351
|
+
**kwargs,
|
1352
|
+
) -> Optional["Element"]: ...
|
1353
|
+
|
1354
|
+
def find(
|
1355
|
+
self,
|
1356
|
+
selector: Optional[str] = None, # Now optional
|
1357
|
+
*,
|
1358
|
+
text: Optional[str] = None, # New text parameter
|
1359
|
+
apply_exclusions: bool = True,
|
1360
|
+
regex: bool = False,
|
1361
|
+
case: bool = True,
|
1362
|
+
**kwargs,
|
1363
|
+
) -> Optional["Element"]:
|
1364
|
+
"""
|
1365
|
+
Find the first element in this region matching the selector OR text content.
|
1366
|
+
|
1367
|
+
Provide EITHER `selector` OR `text`, but not both.
|
1368
|
+
|
1369
|
+
Args:
|
1370
|
+
selector: CSS-like selector string.
|
1371
|
+
text: Text content to search for (equivalent to 'text:contains(...)').
|
1372
|
+
apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
|
1373
|
+
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
1374
|
+
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
1375
|
+
**kwargs: Additional parameters for element filtering.
|
1376
|
+
|
1377
|
+
Returns:
|
1378
|
+
First matching element or None.
|
1379
|
+
"""
|
1380
|
+
# Delegate validation and selector construction to find_all
|
1381
|
+
elements = self.find_all(
|
1382
|
+
selector=selector,
|
1383
|
+
text=text,
|
1384
|
+
apply_exclusions=apply_exclusions,
|
1385
|
+
regex=regex,
|
1386
|
+
case=case,
|
1387
|
+
**kwargs,
|
1388
|
+
)
|
1389
|
+
return elements.first if elements else None
|
1390
|
+
|
1391
|
+
@overload
|
1049
1392
|
def find_all(
|
1050
|
-
self,
|
1051
|
-
|
1393
|
+
self,
|
1394
|
+
*,
|
1395
|
+
text: str,
|
1396
|
+
apply_exclusions: bool = True,
|
1397
|
+
regex: bool = False,
|
1398
|
+
case: bool = True,
|
1399
|
+
**kwargs,
|
1400
|
+
) -> "ElementCollection": ...
|
1401
|
+
|
1402
|
+
@overload
|
1403
|
+
def find_all(
|
1404
|
+
self,
|
1405
|
+
selector: str,
|
1406
|
+
*,
|
1407
|
+
apply_exclusions: bool = True,
|
1408
|
+
regex: bool = False,
|
1409
|
+
case: bool = True,
|
1410
|
+
**kwargs,
|
1411
|
+
) -> "ElementCollection": ...
|
1412
|
+
|
1413
|
+
def find_all(
|
1414
|
+
self,
|
1415
|
+
selector: Optional[str] = None, # Now optional
|
1416
|
+
*,
|
1417
|
+
text: Optional[str] = None, # New text parameter
|
1418
|
+
apply_exclusions: bool = True,
|
1419
|
+
regex: bool = False,
|
1420
|
+
case: bool = True,
|
1421
|
+
**kwargs,
|
1422
|
+
) -> "ElementCollection":
|
1052
1423
|
"""
|
1053
|
-
Find all elements in this region matching the selector.
|
1424
|
+
Find all elements in this region matching the selector OR text content.
|
1425
|
+
|
1426
|
+
Provide EITHER `selector` OR `text`, but not both.
|
1054
1427
|
|
1055
1428
|
Args:
|
1056
|
-
selector: CSS-like selector string
|
1057
|
-
|
1058
|
-
|
1429
|
+
selector: CSS-like selector string.
|
1430
|
+
text: Text content to search for (equivalent to 'text:contains(...)').
|
1431
|
+
apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
|
1432
|
+
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
1433
|
+
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
1434
|
+
**kwargs: Additional parameters for element filtering.
|
1059
1435
|
|
1060
1436
|
Returns:
|
1061
|
-
ElementCollection with matching elements
|
1437
|
+
ElementCollection with matching elements.
|
1062
1438
|
"""
|
1063
1439
|
from natural_pdf.elements.collections import ElementCollection
|
1064
1440
|
|
1441
|
+
if selector is not None and text is not None:
|
1442
|
+
raise ValueError("Provide either 'selector' or 'text', not both.")
|
1443
|
+
if selector is None and text is None:
|
1444
|
+
raise ValueError("Provide either 'selector' or 'text'.")
|
1445
|
+
|
1446
|
+
# Construct selector if 'text' is provided
|
1447
|
+
effective_selector = ""
|
1448
|
+
if text is not None:
|
1449
|
+
escaped_text = text.replace('"', '\\"').replace("'", "\\'")
|
1450
|
+
effective_selector = f'text:contains("{escaped_text}")'
|
1451
|
+
logger.debug(
|
1452
|
+
f"Using text shortcut: find_all(text='{text}') -> find_all('{effective_selector}')"
|
1453
|
+
)
|
1454
|
+
elif selector is not None:
|
1455
|
+
effective_selector = selector
|
1456
|
+
else:
|
1457
|
+
raise ValueError("Internal error: No selector or text provided.")
|
1458
|
+
|
1065
1459
|
# If we span multiple pages, filter our elements
|
1066
1460
|
# TODO: Revisit multi-page region logic
|
1067
1461
|
if self._spans_pages and self._multi_page_elements is not None:
|
1068
1462
|
logger.warning("find_all on multi-page regions is not fully implemented.")
|
1069
1463
|
# Temporary: Apply filter directly to cached elements
|
1070
|
-
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
1071
|
-
|
1072
1464
|
try:
|
1073
|
-
selector_obj = parse_selector(
|
1465
|
+
selector_obj = parse_selector(effective_selector)
|
1466
|
+
# Pass regex/case flags down
|
1467
|
+
kwargs["regex"] = regex
|
1468
|
+
kwargs["case"] = case
|
1074
1469
|
filter_func = selector_to_filter_func(selector_obj, **kwargs)
|
1075
1470
|
matching = [el for el in self._multi_page_elements if filter_func(el)]
|
1076
1471
|
return ElementCollection(matching)
|
@@ -1078,17 +1473,46 @@ class Region(DirectionalMixin):
|
|
1078
1473
|
logger.error(f"Error applying selector to multi-page region elements: {e}")
|
1079
1474
|
return ElementCollection([])
|
1080
1475
|
|
1081
|
-
#
|
1082
|
-
|
1083
|
-
|
1084
|
-
|
1085
|
-
|
1476
|
+
# Normal case: Region is on a single page
|
1477
|
+
try:
|
1478
|
+
# Parse the final selector string
|
1479
|
+
selector_obj = parse_selector(effective_selector)
|
1480
|
+
|
1481
|
+
# Get all potentially relevant elements from the page
|
1482
|
+
# Let the page handle its exclusion logic if needed
|
1483
|
+
potential_elements = self.page.find_all(
|
1484
|
+
selector=effective_selector,
|
1485
|
+
apply_exclusions=False, # Apply exclusions LATER based on region bbox
|
1486
|
+
regex=regex,
|
1487
|
+
case=case,
|
1488
|
+
**kwargs,
|
1489
|
+
)
|
1086
1490
|
|
1087
|
-
|
1491
|
+
# Filter these elements to those strictly within the region's bounds
|
1492
|
+
region_bbox = self.bbox
|
1493
|
+
matching_elements = [
|
1494
|
+
el
|
1495
|
+
for el in potential_elements
|
1496
|
+
if el.x0 >= region_bbox[0]
|
1497
|
+
and el.top >= region_bbox[1]
|
1498
|
+
and el.x1 <= region_bbox[2]
|
1499
|
+
and el.bottom <= region_bbox[3]
|
1500
|
+
]
|
1501
|
+
|
1502
|
+
return ElementCollection(matching_elements)
|
1503
|
+
|
1504
|
+
except Exception as e:
|
1505
|
+
logger.error(f"Error during find_all in region: {e}", exc_info=True)
|
1506
|
+
return ElementCollection([])
|
1507
|
+
|
1508
|
+
def apply_ocr(self, replace=True, **ocr_params) -> "Region":
|
1088
1509
|
"""
|
1089
1510
|
Apply OCR to this region and return the created text elements.
|
1090
1511
|
|
1091
1512
|
Args:
|
1513
|
+
replace: If True (default), removes existing OCR elements in the region
|
1514
|
+
before adding new ones. If False, adds new OCR elements without
|
1515
|
+
removing existing ones.
|
1092
1516
|
**ocr_params: Keyword arguments passed to the OCR Manager.
|
1093
1517
|
Common parameters like `engine`, `languages`, `min_confidence`,
|
1094
1518
|
`device`, and `resolution` (for image rendering) should be
|
@@ -1098,12 +1522,32 @@ class Region(DirectionalMixin):
|
|
1098
1522
|
an `options` object (e.g., `options=EasyOCROptions(...)`).
|
1099
1523
|
|
1100
1524
|
Returns:
|
1101
|
-
|
1525
|
+
Self for method chaining.
|
1102
1526
|
"""
|
1103
1527
|
# Ensure OCRManager is available
|
1104
1528
|
if not hasattr(self.page._parent, "_ocr_manager") or self.page._parent._ocr_manager is None:
|
1105
1529
|
logger.error("OCRManager not available on parent PDF. Cannot apply OCR to region.")
|
1106
|
-
return
|
1530
|
+
return self
|
1531
|
+
|
1532
|
+
# If replace is True, find and remove existing OCR elements in this region
|
1533
|
+
if replace:
|
1534
|
+
logger.info(
|
1535
|
+
f"Region {self.bbox}: Removing existing OCR elements before applying new OCR."
|
1536
|
+
)
|
1537
|
+
# Find all OCR elements in this region
|
1538
|
+
ocr_selector = "text[source=ocr]"
|
1539
|
+
ocr_elements = self.find_all(ocr_selector)
|
1540
|
+
|
1541
|
+
if ocr_elements:
|
1542
|
+
logger.info(
|
1543
|
+
f"Region {self.bbox}: Found {len(ocr_elements)} existing OCR elements to remove."
|
1544
|
+
)
|
1545
|
+
# Remove these elements from their page
|
1546
|
+
removed_count = ocr_elements.remove()
|
1547
|
+
logger.info(f"Region {self.bbox}: Removed {removed_count} OCR elements.")
|
1548
|
+
else:
|
1549
|
+
logger.info(f"Region {self.bbox}: No existing OCR elements found to remove.")
|
1550
|
+
|
1107
1551
|
ocr_mgr = self.page._parent._ocr_manager
|
1108
1552
|
|
1109
1553
|
# Determine rendering resolution from parameters
|
@@ -1123,11 +1567,11 @@ class Region(DirectionalMixin):
|
|
1123
1567
|
)
|
1124
1568
|
if not region_image:
|
1125
1569
|
logger.error("Failed to render region to image for OCR.")
|
1126
|
-
return
|
1570
|
+
return self
|
1127
1571
|
logger.debug(f"Region rendered to image size: {region_image.size}")
|
1128
1572
|
except Exception as e:
|
1129
1573
|
logger.error(f"Error rendering region to image for OCR: {e}", exc_info=True)
|
1130
|
-
return
|
1574
|
+
return self
|
1131
1575
|
|
1132
1576
|
# Prepare args for the OCR Manager
|
1133
1577
|
manager_args = {
|
@@ -1148,11 +1592,11 @@ class Region(DirectionalMixin):
|
|
1148
1592
|
logger.error(
|
1149
1593
|
f"OCRManager returned unexpected type for single region image: {type(results)}"
|
1150
1594
|
)
|
1151
|
-
return
|
1595
|
+
return self
|
1152
1596
|
logger.debug(f"Region OCR processing returned {len(results)} results.")
|
1153
1597
|
except Exception as e:
|
1154
1598
|
logger.error(f"Error during OCRManager processing for region: {e}", exc_info=True)
|
1155
|
-
return
|
1599
|
+
return self
|
1156
1600
|
|
1157
1601
|
# Convert results to TextElements
|
1158
1602
|
scale_x = self.width / region_image.width if region_image.width > 0 else 1.0
|
@@ -1621,8 +2065,6 @@ class Region(DirectionalMixin):
|
|
1621
2065
|
return self.child_regions
|
1622
2066
|
|
1623
2067
|
# Use existing selector parser to filter
|
1624
|
-
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
1625
|
-
|
1626
2068
|
try:
|
1627
2069
|
selector_obj = parse_selector(selector)
|
1628
2070
|
filter_func = selector_to_filter_func(selector_obj) # Removed region=self
|
@@ -1663,8 +2105,6 @@ class Region(DirectionalMixin):
|
|
1663
2105
|
|
1664
2106
|
# Filter by selector if provided
|
1665
2107
|
if selector is not None:
|
1666
|
-
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
1667
|
-
|
1668
2108
|
try:
|
1669
2109
|
selector_obj = parse_selector(selector)
|
1670
2110
|
filter_func = selector_to_filter_func(selector_obj) # Removed region=self
|
@@ -1677,11 +2117,6 @@ class Region(DirectionalMixin):
|
|
1677
2117
|
|
1678
2118
|
return all_descendants
|
1679
2119
|
|
1680
|
-
# Removed recursive=True, find_all on region shouldn't be recursive by default
|
1681
|
-
# Renamed _find_all back to find_all
|
1682
|
-
# def find_all(self, selector, apply_exclusions=True, **kwargs):
|
1683
|
-
# See implementation above near get_elements
|
1684
|
-
|
1685
2120
|
def __repr__(self) -> str:
|
1686
2121
|
"""String representation of the region."""
|
1687
2122
|
poly_info = " (Polygon)" if self.has_polygon else ""
|
@@ -1719,7 +2154,7 @@ class Region(DirectionalMixin):
|
|
1719
2154
|
"""
|
1720
2155
|
# Find OCR elements specifically within this region
|
1721
2156
|
# Note: We typically want to correct even if the element falls in an excluded area
|
1722
|
-
target_elements = self.find_all(selector="text[source
|
2157
|
+
target_elements = self.find_all(selector="text[source=ocr]", apply_exclusions=False)
|
1723
2158
|
|
1724
2159
|
# Delegate to the utility function
|
1725
2160
|
_apply_ocr_correction_to_elements(
|
@@ -1729,3 +2164,221 @@ class Region(DirectionalMixin):
|
|
1729
2164
|
)
|
1730
2165
|
|
1731
2166
|
return self # Return self for chaining
|
2167
|
+
|
2168
|
+
# --- Classification Mixin Implementation --- #
|
2169
|
+
def _get_classification_manager(self) -> "ClassificationManager":
|
2170
|
+
if (
|
2171
|
+
not hasattr(self, "page")
|
2172
|
+
or not hasattr(self.page, "pdf")
|
2173
|
+
or not hasattr(self.page.pdf, "get_manager")
|
2174
|
+
):
|
2175
|
+
raise AttributeError(
|
2176
|
+
"ClassificationManager cannot be accessed: Parent Page, PDF, or get_manager method missing."
|
2177
|
+
)
|
2178
|
+
try:
|
2179
|
+
# Use the PDF's manager registry accessor via page
|
2180
|
+
return self.page.pdf.get_manager("classification")
|
2181
|
+
except (ValueError, RuntimeError, AttributeError) as e:
|
2182
|
+
# Wrap potential errors from get_manager for clarity
|
2183
|
+
raise AttributeError(
|
2184
|
+
f"Failed to get ClassificationManager from PDF via Page: {e}"
|
2185
|
+
) from e
|
2186
|
+
|
2187
|
+
def _get_classification_content(
|
2188
|
+
self, model_type: str, **kwargs
|
2189
|
+
) -> Union[str, "Image"]: # Use "Image" for lazy import
|
2190
|
+
if model_type == "text":
|
2191
|
+
text_content = self.extract_text(layout=False) # Simple join for classification
|
2192
|
+
if not text_content or text_content.isspace():
|
2193
|
+
raise ValueError("Cannot classify region with 'text' model: No text content found.")
|
2194
|
+
return text_content
|
2195
|
+
elif model_type == "vision":
|
2196
|
+
# Get resolution from manager/kwargs if possible, else default
|
2197
|
+
# We access manager via the method to ensure it's available
|
2198
|
+
manager = self._get_classification_manager()
|
2199
|
+
default_resolution = 150 # Manager doesn't store default res, set here
|
2200
|
+
# Note: classify() passes resolution via **kwargs if user specifies
|
2201
|
+
resolution = (
|
2202
|
+
kwargs.get("resolution", default_resolution)
|
2203
|
+
if "kwargs" in locals()
|
2204
|
+
else default_resolution
|
2205
|
+
)
|
2206
|
+
|
2207
|
+
img = self.to_image(
|
2208
|
+
resolution=resolution,
|
2209
|
+
include_highlights=False, # No highlights for classification input
|
2210
|
+
crop_only=True, # Just the region content
|
2211
|
+
)
|
2212
|
+
if img is None:
|
2213
|
+
raise ValueError(
|
2214
|
+
"Cannot classify region with 'vision' model: Failed to render image."
|
2215
|
+
)
|
2216
|
+
return img
|
2217
|
+
else:
|
2218
|
+
raise ValueError(f"Unsupported model_type for classification: {model_type}")
|
2219
|
+
|
2220
|
+
def _get_metadata_storage(self) -> Dict[str, Any]:
|
2221
|
+
# Ensure metadata exists
|
2222
|
+
if not hasattr(self, "metadata") or self.metadata is None:
|
2223
|
+
self.metadata = {}
|
2224
|
+
return self.metadata
|
2225
|
+
|
2226
|
+
# --- End Classification Mixin Implementation --- #
|
2227
|
+
|
2228
|
+
# --- NEW METHOD: analyze_text_table_structure ---
|
2229
|
+
def analyze_text_table_structure(
    self,
    snap_tolerance: int = 10,
    join_tolerance: int = 3,
    min_words_vertical: int = 3,
    min_words_horizontal: int = 1,
    intersection_tolerance: int = 3,
    expand_bbox: Optional[Dict[str, int]] = None,
    **kwargs,
) -> Optional[Dict]:
    """
    Infer table structure (edges, cells, intersections) from text alignment.

    The bounding boxes of the text elements inside this region (optionally
    expanded) are handed to ``find_text_based_tables``. On success the result
    is also stored in ``self.analyses["text_table_structure"]``.

    Args:
        snap_tolerance: Tolerance for snapping parallel lines.
        join_tolerance: Tolerance for joining collinear lines.
        min_words_vertical: Minimum words needed to define a vertical line.
        min_words_horizontal: Minimum words needed to define a horizontal line.
        intersection_tolerance: Tolerance for detecting line intersections.
        expand_bbox: Optional dict of keyword arguments for ``self.expand``
            (e.g., {'left': 5, 'right': 5}) used to widen the search area.
            If expansion fails, the unexpanded region is used.
        **kwargs: Extra keyword arguments forwarded to
            ``find_text_based_tables`` (e.g., specific x/y tolerances).

    Returns:
        A dictionary with 'horizontal_edges', 'vertical_edges', 'cells' and
        'intersections'. All four are empty (and nothing is cached) when no
        text elements or bounding boxes are found. Returns None when an
        ImportError or any other exception is raised during the analysis.
    """

    def _no_structure() -> Dict:
        # A fresh object each time so callers never share mutable state.
        return {"horizontal_edges": [], "vertical_edges": [], "cells": [], "intersections": {}}

    # Pick the area to scan: the region itself unless an expansion was requested.
    scan_area = self
    if expand_bbox and isinstance(expand_bbox, dict):
        try:
            scan_area = self.expand(**expand_bbox)
            logger.debug(
                f"Expanded search region for text table analysis to: {scan_area.bbox}"
            )
        except Exception as e:
            logger.warning(f"Could not expand region bbox: {e}. Using original region.")
            scan_area = self

    # Collect text elements without applying exclusions (unfiltered text).
    words = scan_area.find_all("text", apply_exclusions=False)
    if not words:
        logger.info(f"Region {self.bbox}: No text elements found for text table analysis.")
        return _no_structure()

    # Keep only the elements that actually expose a bbox.
    word_boxes = []
    for word in words:
        if hasattr(word, "bbox"):
            word_boxes.append(word.bbox)
    if not word_boxes:
        logger.info(f"Region {self.bbox}: No bboxes extracted from text elements.")
        return _no_structure()

    # Delegate the geometry work to the utility function.
    try:
        structure = find_text_based_tables(
            bboxes=word_boxes,
            snap_tolerance=snap_tolerance,
            join_tolerance=join_tolerance,
            min_words_vertical=min_words_vertical,
            min_words_horizontal=min_words_horizontal,
            intersection_tolerance=intersection_tolerance,
            **kwargs,  # Pass through any extra specific tolerance args
        )
        # Remember the outcome in the region's analyses cache.
        self.analyses["text_table_structure"] = structure
        return structure
    except ImportError:
        logger.error("pdfplumber library is required for 'text' table analysis but not found.")
        return None
    except Exception as e:
        logger.error(f"Error during text-based table analysis: {e}", exc_info=True)
        return None
|
2306
|
+
|
2307
|
+
# --- END NEW METHOD ---
|
2308
|
+
|
2309
|
+
# --- NEW METHOD: get_text_table_cells ---
|
2310
|
+
def get_text_table_cells(
    self,
    snap_tolerance: int = 10,
    join_tolerance: int = 3,
    min_words_vertical: int = 3,
    min_words_horizontal: int = 1,
    intersection_tolerance: int = 3,
    expand_bbox: Optional[Dict[str, int]] = None,
    **kwargs,
) -> "ElementCollection[Region]":
    """
    Return the cells found by text-alignment table analysis as temporary
    Region objects. The regions are built with ``self.page.region(...)`` and
    are marked with ``source = "volatile"``.

    Args:
        snap_tolerance: Tolerance for snapping parallel lines.
        join_tolerance: Tolerance for joining collinear lines.
        min_words_vertical: Minimum words needed to define a vertical line.
        min_words_horizontal: Minimum words needed to define a horizontal line.
        intersection_tolerance: Tolerance for detecting line intersections.
        expand_bbox: Optional dictionary to expand the search area slightly beyond
                     the region's exact bounds (e.g., {'left': 5, 'right': 5}).
        **kwargs: Additional keyword arguments passed on to
                  analyze_text_table_structure (e.g., specific x/y tolerances).

    Returns:
        An ElementCollection with one temporary Region per detected cell, or an
        empty ElementCollection if the analysis failed or produced no cells.
        Cells whose Region cannot be created are skipped with a warning.
    """
    from natural_pdf.elements.collections import ElementCollection

    # 1. Obtain the analysis, preferring a previously stored one.
    # NOTE(review): a cached result is reused whenever the key is present, without
    # comparing the tolerance arguments of this call to those that produced it.
    if "text_table_structure" not in self.analyses:
        structure = self.analyze_text_table_structure(
            snap_tolerance=snap_tolerance,
            join_tolerance=join_tolerance,
            min_words_vertical=min_words_vertical,
            min_words_horizontal=min_words_horizontal,
            intersection_tolerance=intersection_tolerance,
            expand_bbox=expand_bbox,
            **kwargs,
        )
    else:
        structure = self.analyses["text_table_structure"]
        logger.debug("get_text_table_cells: Using cached analysis results.")

    # 2. Bail out when the analysis failed (None) or reported no cells.
    found_cells = None if structure is None else structure.get("cells")
    if not found_cells:
        logger.info(f"Region {self.bbox}: No cells found by text table analysis.")
        return ElementCollection([])  # Return empty collection

    # 3. Turn each cell dictionary into a temporary Region.
    temp_cells = []
    for cell_spec in structure["cells"]:
        try:
            # page.region receives the cell dictionary's entries as keyword arguments.
            cell = self.page.region(**cell_spec)

            # Tag the temporary region so its origin is recognisable.
            cell.region_type = "table-cell"
            cell.normalized_type = "table-cell"
            cell.model = "pdfplumber-text"
            cell.source = "volatile"  # Indicate it's not managed/persistent
            cell.parent_region = self  # Link back to the region it came from
        except Exception as e:
            logger.warning(f"Could not create Region object for cell data {cell_spec}: {e}")
            continue
        temp_cells.append(cell)

    # 4. Wrap the list in an ElementCollection for the caller.
    logger.debug(f"get_text_table_cells: Created {len(temp_cells)} temporary cell regions.")
    return ElementCollection(temp_cells)
|
2383
|
+
|
2384
|
+
# --- END NEW METHOD ---
|