natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff shows the changes between two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +422 -0
- natural_pdf/classification/mixin.py +163 -0
- natural_pdf/classification/results.py +80 -0
- natural_pdf/collections/mixins.py +111 -0
- natural_pdf/collections/pdf_collection.py +434 -15
- natural_pdf/core/element_manager.py +83 -0
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +578 -93
- natural_pdf/core/pdf.py +912 -460
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +712 -109
- natural_pdf/elements/region.py +722 -69
- natural_pdf/elements/text.py +4 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +5 -4
- natural_pdf/extraction/manager.py +135 -0
- natural_pdf/extraction/mixin.py +279 -0
- natural_pdf/extraction/result.py +23 -0
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +122 -26
- natural_pdf/ocr/ocr_options.py +94 -11
- natural_pdf/ocr/utils.py +19 -6
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +431 -230
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +60 -1
- natural_pdf/utils/tqdm_utils.py +51 -0
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -915
- docs/element-selection/index.md +0 -229
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -170
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -209
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -194
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -340
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -147
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -114
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -270
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -332
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -288
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -413
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -508
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2434
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -512
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -604
- docs/tutorials/12-ocr-integration.md +0 -175
- docs/tutorials/13-semantic-search.ipynb +0 -1328
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.7.dist-info/RECORD +0 -145
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
natural_pdf/elements/base.py
CHANGED
@@ -2,13 +2,15 @@
|
|
2
2
|
Base Element class for natural-pdf.
|
3
3
|
"""
|
4
4
|
|
5
|
-
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
|
5
|
+
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union, overload
|
6
6
|
|
7
7
|
from PIL import Image
|
8
8
|
|
9
|
+
# Import selector parsing functions
|
10
|
+
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
11
|
+
|
9
12
|
if TYPE_CHECKING:
|
10
13
|
from natural_pdf.core.page import Page
|
11
|
-
from natural_pdf.elements.base import Element
|
12
14
|
from natural_pdf.elements.collections import ElementCollection
|
13
15
|
from natural_pdf.elements.region import Region
|
14
16
|
|
@@ -135,19 +137,11 @@ class DirectionalMixin:
|
|
135
137
|
# Adjust cross boundaries if cross_size is 'element'
|
136
138
|
if cross_size == "element":
|
137
139
|
if is_horizontal: # Adjust y0, y1
|
138
|
-
|
139
|
-
|
140
|
-
) # Use opposite boundary if excluding
|
141
|
-
target_y1 = target.bottom if include_endpoint else target.top
|
142
|
-
y0 = min(y0, target_y0)
|
143
|
-
y1 = max(y1, target_y1)
|
140
|
+
y0 = min(y0, self.y0)
|
141
|
+
y1 = max(y1, self.y1)
|
144
142
|
else: # Adjust x0, x1
|
145
|
-
|
146
|
-
|
147
|
-
) # Use opposite boundary if excluding
|
148
|
-
target_x1 = target.x1 if include_endpoint else target.x0
|
149
|
-
x0 = min(x0, target_x0)
|
150
|
-
x1 = max(x1, target_x1)
|
143
|
+
x0 = min(x0, self.x0)
|
144
|
+
x1 = max(x1, self.x1)
|
151
145
|
|
152
146
|
# 4. Finalize bbox coordinates
|
153
147
|
if is_horizontal:
|
@@ -525,7 +519,7 @@ class Element(DirectionalMixin):
|
|
525
519
|
selector: Optional selector to filter by
|
526
520
|
limit: Maximum number of elements to search through (default: 10)
|
527
521
|
apply_exclusions: Whether to apply exclusion regions (default: True)
|
528
|
-
**kwargs: Additional parameters
|
522
|
+
**kwargs: Additional parameters for selector filtering (e.g., regex, case)
|
529
523
|
|
530
524
|
Returns:
|
531
525
|
Next element or None if not found
|
@@ -548,13 +542,19 @@ class Element(DirectionalMixin):
|
|
548
542
|
# Limit search range for performance
|
549
543
|
candidates = candidates[:limit] if limit else candidates
|
550
544
|
|
551
|
-
#
|
552
|
-
|
545
|
+
# Parse the selector and create a filter function
|
546
|
+
parsed_selector = parse_selector(selector)
|
547
|
+
# Pass relevant kwargs (like regex, case) to the filter function builder
|
548
|
+
filter_func = selector_to_filter_func(parsed_selector, **kwargs)
|
549
|
+
|
550
|
+
# Iterate and return the first match
|
551
|
+
for candidate in candidates:
|
552
|
+
if filter_func(candidate):
|
553
|
+
return candidate
|
554
|
+
return None # No match found
|
553
555
|
|
554
|
-
|
555
|
-
return matches[0] if matches else None
|
556
|
+
# No selector, just return the next element if it exists
|
556
557
|
elif idx + 1 < len(all_elements):
|
557
|
-
# No selector, just return the next element
|
558
558
|
return all_elements[idx + 1]
|
559
559
|
|
560
560
|
return None
|
@@ -573,7 +573,7 @@ class Element(DirectionalMixin):
|
|
573
573
|
selector: Optional selector to filter by
|
574
574
|
limit: Maximum number of elements to search through (default: 10)
|
575
575
|
apply_exclusions: Whether to apply exclusion regions (default: True)
|
576
|
-
**kwargs: Additional parameters
|
576
|
+
**kwargs: Additional parameters for selector filtering (e.g., regex, case)
|
577
577
|
|
578
578
|
Returns:
|
579
579
|
Previous element or None if not found
|
@@ -598,13 +598,19 @@ class Element(DirectionalMixin):
|
|
598
598
|
# Limit search range for performance
|
599
599
|
candidates = candidates[:limit] if limit else candidates
|
600
600
|
|
601
|
-
#
|
602
|
-
|
601
|
+
# Parse the selector and create a filter function
|
602
|
+
parsed_selector = parse_selector(selector)
|
603
|
+
# Pass relevant kwargs (like regex, case) to the filter function builder
|
604
|
+
filter_func = selector_to_filter_func(parsed_selector, **kwargs)
|
605
|
+
|
606
|
+
# Iterate and return the first match (from reversed list)
|
607
|
+
for candidate in candidates:
|
608
|
+
if filter_func(candidate):
|
609
|
+
return candidate
|
610
|
+
return None # No match found
|
603
611
|
|
604
|
-
|
605
|
-
return matches[0] if matches else None # find_all returns a collection
|
612
|
+
# No selector, just return the previous element if it exists
|
606
613
|
elif idx > 0:
|
607
|
-
# No selector, just return the previous element
|
608
614
|
return all_elements[idx - 1]
|
609
615
|
|
610
616
|
return None
|
@@ -887,40 +893,128 @@ class Element(DirectionalMixin):
|
|
887
893
|
"""String representation of the element."""
|
888
894
|
return f"<{self.__class__.__name__} bbox={self.bbox}>"
|
889
895
|
|
890
|
-
|
896
|
+
@overload
|
897
|
+
def find(
|
898
|
+
self,
|
899
|
+
*,
|
900
|
+
text: str,
|
901
|
+
apply_exclusions: bool = True,
|
902
|
+
regex: bool = False,
|
903
|
+
case: bool = True,
|
904
|
+
**kwargs,
|
905
|
+
) -> Optional["Element"]: ...
|
906
|
+
|
907
|
+
@overload
|
908
|
+
def find(
|
909
|
+
self,
|
910
|
+
selector: str,
|
911
|
+
*,
|
912
|
+
apply_exclusions: bool = True,
|
913
|
+
regex: bool = False,
|
914
|
+
case: bool = True,
|
915
|
+
**kwargs,
|
916
|
+
) -> Optional["Element"]: ...
|
917
|
+
|
918
|
+
def find(
|
919
|
+
self,
|
920
|
+
selector: Optional[str] = None,
|
921
|
+
*,
|
922
|
+
text: Optional[str] = None,
|
923
|
+
apply_exclusions: bool = True,
|
924
|
+
regex: bool = False,
|
925
|
+
case: bool = True,
|
926
|
+
**kwargs,
|
927
|
+
) -> Optional["Element"]:
|
891
928
|
"""
|
892
|
-
Find first element within this element's bounds matching the selector.
|
929
|
+
Find first element within this element's bounds matching the selector OR text.
|
893
930
|
Creates a temporary region to perform the search.
|
894
931
|
|
932
|
+
Provide EITHER `selector` OR `text`, but not both.
|
933
|
+
|
895
934
|
Args:
|
896
|
-
selector: CSS-like selector string
|
897
|
-
|
898
|
-
|
935
|
+
selector: CSS-like selector string.
|
936
|
+
text: Text content to search for (equivalent to 'text:contains(...)').
|
937
|
+
apply_exclusions: Whether to apply exclusion regions (default: True).
|
938
|
+
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
939
|
+
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
940
|
+
**kwargs: Additional parameters for element filtering.
|
899
941
|
|
900
942
|
Returns:
|
901
|
-
First matching element or None
|
943
|
+
First matching element or None.
|
902
944
|
"""
|
903
945
|
from natural_pdf.elements.region import Region
|
904
946
|
|
905
947
|
# Create a temporary region from this element's bounds
|
906
948
|
temp_region = Region(self.page, self.bbox)
|
907
|
-
|
949
|
+
# Delegate to the region's find method
|
950
|
+
return temp_region.find(
|
951
|
+
selector=selector,
|
952
|
+
text=text,
|
953
|
+
apply_exclusions=apply_exclusions,
|
954
|
+
regex=regex,
|
955
|
+
case=case,
|
956
|
+
**kwargs,
|
957
|
+
)
|
958
|
+
|
959
|
+
@overload
|
960
|
+
def find_all(
|
961
|
+
self,
|
962
|
+
*,
|
963
|
+
text: str,
|
964
|
+
apply_exclusions: bool = True,
|
965
|
+
regex: bool = False,
|
966
|
+
case: bool = True,
|
967
|
+
**kwargs,
|
968
|
+
) -> "ElementCollection": ...
|
969
|
+
|
970
|
+
@overload
|
971
|
+
def find_all(
|
972
|
+
self,
|
973
|
+
selector: str,
|
974
|
+
*,
|
975
|
+
apply_exclusions: bool = True,
|
976
|
+
regex: bool = False,
|
977
|
+
case: bool = True,
|
978
|
+
**kwargs,
|
979
|
+
) -> "ElementCollection": ...
|
908
980
|
|
909
|
-
def find_all(
|
981
|
+
def find_all(
|
982
|
+
self,
|
983
|
+
selector: Optional[str] = None,
|
984
|
+
*,
|
985
|
+
text: Optional[str] = None,
|
986
|
+
apply_exclusions: bool = True,
|
987
|
+
regex: bool = False,
|
988
|
+
case: bool = True,
|
989
|
+
**kwargs,
|
990
|
+
) -> "ElementCollection":
|
910
991
|
"""
|
911
|
-
Find all elements within this element's bounds matching the selector.
|
992
|
+
Find all elements within this element's bounds matching the selector OR text.
|
912
993
|
Creates a temporary region to perform the search.
|
913
994
|
|
995
|
+
Provide EITHER `selector` OR `text`, but not both.
|
996
|
+
|
914
997
|
Args:
|
915
|
-
selector: CSS-like selector string
|
916
|
-
|
917
|
-
|
998
|
+
selector: CSS-like selector string.
|
999
|
+
text: Text content to search for (equivalent to 'text:contains(...)').
|
1000
|
+
apply_exclusions: Whether to apply exclusion regions (default: True).
|
1001
|
+
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
1002
|
+
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
1003
|
+
**kwargs: Additional parameters for element filtering.
|
918
1004
|
|
919
1005
|
Returns:
|
920
|
-
ElementCollection with matching elements
|
1006
|
+
ElementCollection with matching elements.
|
921
1007
|
"""
|
922
1008
|
from natural_pdf.elements.region import Region
|
923
1009
|
|
924
1010
|
# Create a temporary region from this element's bounds
|
925
1011
|
temp_region = Region(self.page, self.bbox)
|
926
|
-
|
1012
|
+
# Delegate to the region's find_all method
|
1013
|
+
return temp_region.find_all(
|
1014
|
+
selector=selector,
|
1015
|
+
text=text,
|
1016
|
+
apply_exclusions=apply_exclusions,
|
1017
|
+
regex=regex,
|
1018
|
+
case=case,
|
1019
|
+
**kwargs,
|
1020
|
+
)
|