natural-pdf 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +7 -2
- natural_pdf/analyzers/shape_detection_mixin.py +1092 -0
- natural_pdf/analyzers/text_options.py +9 -1
- natural_pdf/analyzers/text_structure.py +371 -58
- natural_pdf/classification/manager.py +3 -4
- natural_pdf/collections/pdf_collection.py +19 -39
- natural_pdf/core/element_manager.py +11 -1
- natural_pdf/core/highlighting_service.py +146 -75
- natural_pdf/core/page.py +287 -188
- natural_pdf/core/pdf.py +57 -42
- natural_pdf/elements/base.py +51 -0
- natural_pdf/elements/collections.py +362 -67
- natural_pdf/elements/line.py +5 -0
- natural_pdf/elements/region.py +396 -23
- natural_pdf/exporters/data/__init__.py +0 -0
- natural_pdf/exporters/data/pdf.ttf +0 -0
- natural_pdf/exporters/data/sRGB.icc +0 -0
- natural_pdf/exporters/hocr.py +40 -61
- natural_pdf/exporters/hocr_font.py +7 -13
- natural_pdf/exporters/original_pdf.py +10 -13
- natural_pdf/exporters/paddleocr.py +51 -11
- natural_pdf/exporters/searchable_pdf.py +0 -10
- natural_pdf/flows/__init__.py +12 -0
- natural_pdf/flows/collections.py +533 -0
- natural_pdf/flows/element.py +382 -0
- natural_pdf/flows/flow.py +216 -0
- natural_pdf/flows/region.py +458 -0
- natural_pdf/search/__init__.py +65 -52
- natural_pdf/search/lancedb_search_service.py +325 -0
- natural_pdf/search/numpy_search_service.py +255 -0
- natural_pdf/search/searchable_mixin.py +25 -71
- natural_pdf/selectors/parser.py +163 -8
- natural_pdf/widgets/viewer.py +22 -31
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/METADATA +55 -49
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/RECORD +38 -30
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/WHEEL +1 -1
- natural_pdf/search/haystack_search_service.py +0 -687
- natural_pdf/search/haystack_utils.py +0 -474
- natural_pdf/utils/tqdm_utils.py +0 -51
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/top_level.txt +0 -0
natural_pdf/core/pdf.py
CHANGED
@@ -38,7 +38,7 @@ from natural_pdf.extraction.mixin import ExtractionMixin
|
|
38
38
|
from natural_pdf.ocr import OCRManager, OCROptions
|
39
39
|
from natural_pdf.selectors.parser import parse_selector
|
40
40
|
from natural_pdf.utils.locks import pdf_render_lock
|
41
|
-
from
|
41
|
+
from tqdm.auto import tqdm
|
42
42
|
|
43
43
|
try:
|
44
44
|
from typing import Any as TypingAny
|
@@ -60,6 +60,7 @@ except ImportError:
|
|
60
60
|
"Search dependencies are not installed. Install with: pip install natural-pdf[search]"
|
61
61
|
)
|
62
62
|
|
63
|
+
|
63
64
|
try:
|
64
65
|
from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
|
65
66
|
except ImportError:
|
@@ -70,7 +71,6 @@ except ImportError:
|
|
70
71
|
create_original_pdf = None
|
71
72
|
|
72
73
|
logger = logging.getLogger("natural_pdf.core.pdf")
|
73
|
-
tqdm = get_tqdm()
|
74
74
|
|
75
75
|
DEFAULT_MANAGERS = {
|
76
76
|
"classification": ClassificationManager,
|
@@ -791,10 +791,10 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
791
791
|
"PDF.save_searchable() is deprecated. Use PDF.save_pdf(..., ocr=True) instead."
|
792
792
|
)
|
793
793
|
if create_searchable_pdf is None:
|
794
|
-
|
795
|
-
|
796
|
-
|
797
|
-
|
794
|
+
raise ImportError(
|
795
|
+
"Saving searchable PDF requires 'pikepdf'. "
|
796
|
+
'Install with: pip install "natural-pdf[ocr-export]"'
|
797
|
+
)
|
798
798
|
output_path_str = str(output_path)
|
799
799
|
# Call the exporter directly, passing self (the PDF instance)
|
800
800
|
create_searchable_pdf(self, output_path_str, dpi=dpi, **kwargs)
|
@@ -842,55 +842,59 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
842
842
|
output_path_str = str(output_path_obj)
|
843
843
|
|
844
844
|
if ocr:
|
845
|
-
if create_searchable_pdf is None:
|
846
|
-
raise ImportError(
|
847
|
-
"Saving with ocr=True requires 'pikepdf' and 'Pillow'. "
|
848
|
-
"Install with: pip install \"natural-pdf[ocr-export]\""
|
849
|
-
)
|
850
|
-
|
851
|
-
# Optional: Add warning about vector data loss similar to PageCollection
|
852
845
|
has_vector_elements = False
|
853
846
|
for page in self.pages:
|
854
|
-
if (
|
855
|
-
hasattr(page,
|
856
|
-
|
857
|
-
|
858
|
-
|
847
|
+
if (
|
848
|
+
hasattr(page, "rects")
|
849
|
+
and page.rects
|
850
|
+
or hasattr(page, "lines")
|
851
|
+
and page.lines
|
852
|
+
or hasattr(page, "curves")
|
853
|
+
and page.curves
|
854
|
+
or (
|
855
|
+
hasattr(page, "chars")
|
856
|
+
and any(getattr(el, "source", None) != "ocr" for el in page.chars)
|
857
|
+
)
|
858
|
+
or (
|
859
|
+
hasattr(page, "words")
|
860
|
+
and any(getattr(el, "source", None) != "ocr" for el in page.words)
|
861
|
+
)
|
862
|
+
):
|
859
863
|
has_vector_elements = True
|
860
864
|
break
|
861
865
|
if has_vector_elements:
|
862
|
-
|
863
|
-
|
864
|
-
|
865
|
-
|
866
|
-
|
866
|
+
logger.warning(
|
867
|
+
"Warning: Saving with ocr=True creates an image-based PDF. "
|
868
|
+
"Original vector elements (rects, lines, non-OCR text/chars) "
|
869
|
+
"will not be preserved in the output file."
|
870
|
+
)
|
867
871
|
|
868
872
|
logger.info(f"Saving searchable PDF (OCR text layer) to: {output_path_str}")
|
869
873
|
try:
|
870
874
|
# Delegate to the searchable PDF exporter, passing self (PDF instance)
|
871
875
|
create_searchable_pdf(self, output_path_str, dpi=dpi)
|
872
876
|
except Exception as e:
|
873
|
-
|
877
|
+
raise RuntimeError(f"Failed to create searchable PDF: {e}") from e
|
874
878
|
|
875
879
|
elif original:
|
876
880
|
if create_original_pdf is None:
|
877
881
|
raise ImportError(
|
878
882
|
"Saving with original=True requires 'pikepdf'. "
|
879
|
-
|
883
|
+
'Install with: pip install "natural-pdf[ocr-export]"'
|
880
884
|
)
|
881
885
|
|
882
|
-
|
886
|
+
# Optional: Add warning about losing OCR data similar to PageCollection
|
883
887
|
has_ocr_elements = False
|
884
888
|
for page in self.pages:
|
885
|
-
|
886
|
-
|
887
|
-
|
888
|
-
|
889
|
-
|
890
|
-
|
891
|
-
|
892
|
-
|
893
|
-
|
889
|
+
if hasattr(page, "find_all"):
|
890
|
+
ocr_text_elements = page.find_all("text[source=ocr]")
|
891
|
+
if ocr_text_elements:
|
892
|
+
has_ocr_elements = True
|
893
|
+
break
|
894
|
+
elif hasattr(page, "words"): # Fallback
|
895
|
+
if any(getattr(el, "source", None) == "ocr" for el in page.words):
|
896
|
+
has_ocr_elements = True
|
897
|
+
break
|
894
898
|
if has_ocr_elements:
|
895
899
|
logger.warning(
|
896
900
|
"Warning: Saving with original=True preserves original page content. "
|
@@ -899,11 +903,11 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
899
903
|
|
900
904
|
logger.info(f"Saving original PDF content to: {output_path_str}")
|
901
905
|
try:
|
902
|
-
|
903
|
-
|
906
|
+
# Delegate to the original PDF exporter, passing self (PDF instance)
|
907
|
+
create_original_pdf(self, output_path_str)
|
904
908
|
except Exception as e:
|
905
|
-
|
906
|
-
|
909
|
+
# Re-raise exception from exporter
|
910
|
+
raise e
|
907
911
|
|
908
912
|
def ask(
|
909
913
|
self,
|
@@ -1227,6 +1231,16 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
1227
1231
|
"""Context manager exit."""
|
1228
1232
|
self.close()
|
1229
1233
|
|
1234
|
+
def __repr__(self) -> str:
    """Build a short, human-readable description of this PDF.

    The result has the form ``<PDF source='...' pages=N>``. If ``_pages`` has
    not been set on the instance, the page count reads ``uninitialized``; if
    ``source_path`` is missing, the source reads ``unknown source``.

    Returns:
        The formatted description string.
    """
    # Probe for the attributes instead of assuming them, so repr() still
    # works on an instance that does not have them set.
    page_count_str = str(len(self._pages)) if hasattr(self, "_pages") else "uninitialized"
    source_info = getattr(self, "source_path", "unknown source")
    return f"<PDF source='{source_info}' pages={page_count_str}>"
|
1243
|
+
|
1230
1244
|
def get_id(self) -> str:
|
1231
1245
|
"""Get unique identifier for this PDF."""
|
1232
1246
|
"""Get unique identifier for this PDF."""
|
@@ -1238,6 +1252,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
1238
1252
|
self,
|
1239
1253
|
pages: Optional[Union[Iterable[int], range, slice]] = None,
|
1240
1254
|
resolution: int = 300,
|
1255
|
+
angle: Optional[float] = None,
|
1241
1256
|
detection_resolution: int = 72,
|
1242
1257
|
force_overwrite: bool = False,
|
1243
1258
|
**deskew_kwargs,
|
@@ -1256,6 +1271,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
1256
1271
|
Args:
|
1257
1272
|
pages: Page indices/slice to include (0-based). If None, processes all pages.
|
1258
1273
|
resolution: DPI resolution for rendering the output deskewed pages.
|
1274
|
+
angle: The specific angle (in degrees) to rotate by. If None, detects automatically.
|
1259
1275
|
detection_resolution: DPI resolution used for skew detection if angles are not
|
1260
1276
|
already cached on the page objects.
|
1261
1277
|
force_overwrite: If False (default), raises a ValueError if any target page
|
@@ -1300,14 +1316,13 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
1300
1316
|
deskewed_images_bytes = []
|
1301
1317
|
logger.info(f"Deskewing {len(target_pages)} pages (output resolution={resolution} DPI)...")
|
1302
1318
|
|
1303
|
-
# Use tqdm via get_tqdm
|
1304
1319
|
for page in tqdm(target_pages, desc="Deskewing Pages", leave=False):
|
1305
1320
|
try:
|
1306
1321
|
# Use page.deskew to get the corrected PIL image
|
1307
1322
|
# Pass down resolutions and kwargs
|
1308
1323
|
deskewed_img = page.deskew(
|
1309
1324
|
resolution=resolution,
|
1310
|
-
angle=
|
1325
|
+
angle=angle, # Let page.deskew handle detection/caching
|
1311
1326
|
detection_resolution=detection_resolution,
|
1312
1327
|
**deskew_kwargs,
|
1313
1328
|
)
|
@@ -1400,7 +1415,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
|
|
1400
1415
|
except ImportError:
|
1401
1416
|
raise ImportError(
|
1402
1417
|
"Classification dependencies missing. "
|
1403
|
-
'Install with: pip install "natural-pdf[
|
1418
|
+
'Install with: pip install "natural-pdf[core-ml]"'
|
1404
1419
|
)
|
1405
1420
|
raise ClassificationError("ClassificationManager not available.")
|
1406
1421
|
|
natural_pdf/elements/base.py
CHANGED
@@ -15,6 +15,40 @@ if TYPE_CHECKING:
|
|
15
15
|
from natural_pdf.elements.region import Region
|
16
16
|
|
17
17
|
|
18
|
+
def extract_bbox(obj: Any) -> Optional[Tuple[float, float, float, float]]:
    """
    Extract bounding box coordinates from any object that has bbox properties.

    Three sources are tried in order; the first one that yields four values
    convertible to ``float`` wins:

    1. a ``bbox`` attribute holding a 4-item tuple or list,
    2. individual ``x0`` / ``top`` / ``x1`` / ``bottom`` attributes,
    3. ``x0`` / ``top`` / ``x1`` / ``bottom`` keys when ``obj`` is a dict.

    A source whose values cannot be converted (e.g. ``None`` or a non-numeric
    string) is skipped instead of raising, so this function never propagates
    ``TypeError`` / ``ValueError`` from the conversion.

    Args:
        obj: Object that might have bbox coordinates (Element, Region, etc.)

    Returns:
        Tuple of (x0, top, x1, bottom) as floats, or None if no source
        provides four usable coordinates.
    """
    coord_names = ("x0", "top", "x1", "bottom")

    # Try bbox property first (most common). Read it only once: it may be a
    # computed property, and a second read could return a different object.
    bbox = getattr(obj, "bbox", None)
    if isinstance(bbox, (tuple, list)) and len(bbox) == 4:
        coords = _coerce_bbox(bbox)
        if coords is not None:
            return coords
        # Malformed bbox: fall through and try the individual coordinates.

    # Try individual coordinate properties
    if all(hasattr(obj, name) for name in coord_names):
        coords = _coerce_bbox(getattr(obj, name) for name in coord_names)
        if coords is not None:
            return coords

    # If object is a dict with bbox keys
    if isinstance(obj, dict) and all(name in obj for name in coord_names):
        return _coerce_bbox(obj[name] for name in coord_names)

    return None


def _coerce_bbox(values) -> Optional[Tuple[float, float, float, float]]:
    """Convert an iterable of four coordinates to a float 4-tuple, or None on failure."""
    try:
        x0, top, x1, bottom = (float(value) for value in values)
    except (ValueError, TypeError):
        # Non-numeric entry (or wrong item count): report "no bbox" to the caller.
        return None
    return (x0, top, x1, bottom)
|
50
|
+
|
51
|
+
|
18
52
|
class DirectionalMixin:
|
19
53
|
"""
|
20
54
|
Mixin class providing directional methods for both Element and Region classes.
|
@@ -814,6 +848,7 @@ class Element(DirectionalMixin):
|
|
814
848
|
legend_position: str = "right",
|
815
849
|
color: Optional[Union[Tuple, str]] = "red", # Default color for single element
|
816
850
|
label: Optional[str] = None,
|
851
|
+
width: Optional[int] = None, # Add width parameter
|
817
852
|
) -> Optional["Image.Image"]:
|
818
853
|
"""
|
819
854
|
Show the page with only this element highlighted temporarily.
|
@@ -824,6 +859,7 @@ class Element(DirectionalMixin):
|
|
824
859
|
legend_position: Position of the legend
|
825
860
|
color: Color to highlight this element (default: red)
|
826
861
|
label: Optional label for this element in the legend
|
862
|
+
width: Optional width for the output image in pixels
|
827
863
|
|
828
864
|
Returns:
|
829
865
|
PIL Image of the page with only this element highlighted, or None if error.
|
@@ -861,6 +897,7 @@ class Element(DirectionalMixin):
|
|
861
897
|
page_index=self.page.index,
|
862
898
|
temporary_highlights=[temp_highlight_data],
|
863
899
|
scale=scale,
|
900
|
+
width=width, # Pass the width parameter
|
864
901
|
labels=labels,
|
865
902
|
legend_position=legend_position,
|
866
903
|
)
|
@@ -898,6 +935,7 @@ class Element(DirectionalMixin):
|
|
898
935
|
self,
|
899
936
|
*,
|
900
937
|
text: str,
|
938
|
+
contains: str = "all",
|
901
939
|
apply_exclusions: bool = True,
|
902
940
|
regex: bool = False,
|
903
941
|
case: bool = True,
|
@@ -909,6 +947,7 @@ class Element(DirectionalMixin):
|
|
909
947
|
self,
|
910
948
|
selector: str,
|
911
949
|
*,
|
950
|
+
contains: str = "all",
|
912
951
|
apply_exclusions: bool = True,
|
913
952
|
regex: bool = False,
|
914
953
|
case: bool = True,
|
@@ -920,6 +959,7 @@ class Element(DirectionalMixin):
|
|
920
959
|
selector: Optional[str] = None,
|
921
960
|
*,
|
922
961
|
text: Optional[str] = None,
|
962
|
+
contains: str = "all",
|
923
963
|
apply_exclusions: bool = True,
|
924
964
|
regex: bool = False,
|
925
965
|
case: bool = True,
|
@@ -934,6 +974,9 @@ class Element(DirectionalMixin):
|
|
934
974
|
Args:
|
935
975
|
selector: CSS-like selector string.
|
936
976
|
text: Text content to search for (equivalent to 'text:contains(...)').
|
977
|
+
contains: How to determine if elements are inside: 'all' (fully inside),
|
978
|
+
'any' (any overlap), or 'center' (center point inside).
|
979
|
+
(default: "all")
|
937
980
|
apply_exclusions: Whether to apply exclusion regions (default: True).
|
938
981
|
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
939
982
|
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
@@ -950,6 +993,7 @@ class Element(DirectionalMixin):
|
|
950
993
|
return temp_region.find(
|
951
994
|
selector=selector,
|
952
995
|
text=text,
|
996
|
+
contains=contains,
|
953
997
|
apply_exclusions=apply_exclusions,
|
954
998
|
regex=regex,
|
955
999
|
case=case,
|
@@ -961,6 +1005,7 @@ class Element(DirectionalMixin):
|
|
961
1005
|
self,
|
962
1006
|
*,
|
963
1007
|
text: str,
|
1008
|
+
contains: str = "all",
|
964
1009
|
apply_exclusions: bool = True,
|
965
1010
|
regex: bool = False,
|
966
1011
|
case: bool = True,
|
@@ -972,6 +1017,7 @@ class Element(DirectionalMixin):
|
|
972
1017
|
self,
|
973
1018
|
selector: str,
|
974
1019
|
*,
|
1020
|
+
contains: str = "all",
|
975
1021
|
apply_exclusions: bool = True,
|
976
1022
|
regex: bool = False,
|
977
1023
|
case: bool = True,
|
@@ -983,6 +1029,7 @@ class Element(DirectionalMixin):
|
|
983
1029
|
selector: Optional[str] = None,
|
984
1030
|
*,
|
985
1031
|
text: Optional[str] = None,
|
1032
|
+
contains: str = "all",
|
986
1033
|
apply_exclusions: bool = True,
|
987
1034
|
regex: bool = False,
|
988
1035
|
case: bool = True,
|
@@ -997,6 +1044,9 @@ class Element(DirectionalMixin):
|
|
997
1044
|
Args:
|
998
1045
|
selector: CSS-like selector string.
|
999
1046
|
text: Text content to search for (equivalent to 'text:contains(...)').
|
1047
|
+
contains: How to determine if elements are inside: 'all' (fully inside),
|
1048
|
+
'any' (any overlap), or 'center' (center point inside).
|
1049
|
+
(default: "all")
|
1000
1050
|
apply_exclusions: Whether to apply exclusion regions (default: True).
|
1001
1051
|
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
1002
1052
|
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
@@ -1013,6 +1063,7 @@ class Element(DirectionalMixin):
|
|
1013
1063
|
return temp_region.find_all(
|
1014
1064
|
selector=selector,
|
1015
1065
|
text=text,
|
1066
|
+
contains=contains,
|
1016
1067
|
apply_exclusions=apply_exclusions,
|
1017
1068
|
regex=regex,
|
1018
1069
|
case=case,
|