natural-pdf 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. natural_pdf/__init__.py +7 -2
  2. natural_pdf/analyzers/shape_detection_mixin.py +1092 -0
  3. natural_pdf/analyzers/text_options.py +9 -1
  4. natural_pdf/analyzers/text_structure.py +371 -58
  5. natural_pdf/classification/manager.py +3 -4
  6. natural_pdf/collections/pdf_collection.py +19 -39
  7. natural_pdf/core/element_manager.py +11 -1
  8. natural_pdf/core/highlighting_service.py +146 -75
  9. natural_pdf/core/page.py +287 -188
  10. natural_pdf/core/pdf.py +57 -42
  11. natural_pdf/elements/base.py +51 -0
  12. natural_pdf/elements/collections.py +362 -67
  13. natural_pdf/elements/line.py +5 -0
  14. natural_pdf/elements/region.py +396 -23
  15. natural_pdf/exporters/data/__init__.py +0 -0
  16. natural_pdf/exporters/data/pdf.ttf +0 -0
  17. natural_pdf/exporters/data/sRGB.icc +0 -0
  18. natural_pdf/exporters/hocr.py +40 -61
  19. natural_pdf/exporters/hocr_font.py +7 -13
  20. natural_pdf/exporters/original_pdf.py +10 -13
  21. natural_pdf/exporters/paddleocr.py +51 -11
  22. natural_pdf/exporters/searchable_pdf.py +0 -10
  23. natural_pdf/flows/__init__.py +12 -0
  24. natural_pdf/flows/collections.py +533 -0
  25. natural_pdf/flows/element.py +382 -0
  26. natural_pdf/flows/flow.py +216 -0
  27. natural_pdf/flows/region.py +458 -0
  28. natural_pdf/search/__init__.py +65 -52
  29. natural_pdf/search/lancedb_search_service.py +325 -0
  30. natural_pdf/search/numpy_search_service.py +255 -0
  31. natural_pdf/search/searchable_mixin.py +25 -71
  32. natural_pdf/selectors/parser.py +163 -8
  33. natural_pdf/widgets/viewer.py +22 -31
  34. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/METADATA +55 -49
  35. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/RECORD +38 -30
  36. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/WHEEL +1 -1
  37. natural_pdf/search/haystack_search_service.py +0 -687
  38. natural_pdf/search/haystack_utils.py +0 -474
  39. natural_pdf/utils/tqdm_utils.py +0 -51
  40. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/licenses/LICENSE +0 -0
  41. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/top_level.txt +0 -0
natural_pdf/core/pdf.py CHANGED
@@ -38,7 +38,7 @@ from natural_pdf.extraction.mixin import ExtractionMixin
38
38
  from natural_pdf.ocr import OCRManager, OCROptions
39
39
  from natural_pdf.selectors.parser import parse_selector
40
40
  from natural_pdf.utils.locks import pdf_render_lock
41
- from natural_pdf.utils.tqdm_utils import get_tqdm
41
+ from tqdm.auto import tqdm
42
42
 
43
43
  try:
44
44
  from typing import Any as TypingAny
@@ -60,6 +60,7 @@ except ImportError:
60
60
  "Search dependencies are not installed. Install with: pip install natural-pdf[search]"
61
61
  )
62
62
 
63
+
63
64
  try:
64
65
  from natural_pdf.exporters.searchable_pdf import create_searchable_pdf
65
66
  except ImportError:
@@ -70,7 +71,6 @@ except ImportError:
70
71
  create_original_pdf = None
71
72
 
72
73
  logger = logging.getLogger("natural_pdf.core.pdf")
73
- tqdm = get_tqdm()
74
74
 
75
75
  DEFAULT_MANAGERS = {
76
76
  "classification": ClassificationManager,
@@ -791,10 +791,10 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
791
791
  "PDF.save_searchable() is deprecated. Use PDF.save_pdf(..., ocr=True) instead."
792
792
  )
793
793
  if create_searchable_pdf is None:
794
- raise ImportError(
795
- "Saving searchable PDF requires 'pikepdf' and 'Pillow'. "
796
- "Install with: pip install \"natural-pdf[ocr-export]\""
797
- )
794
+ raise ImportError(
795
+ "Saving searchable PDF requires 'pikepdf'. "
796
+ 'Install with: pip install "natural-pdf[ocr-export]"'
797
+ )
798
798
  output_path_str = str(output_path)
799
799
  # Call the exporter directly, passing self (the PDF instance)
800
800
  create_searchable_pdf(self, output_path_str, dpi=dpi, **kwargs)
@@ -842,55 +842,59 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
842
842
  output_path_str = str(output_path_obj)
843
843
 
844
844
  if ocr:
845
- if create_searchable_pdf is None:
846
- raise ImportError(
847
- "Saving with ocr=True requires 'pikepdf' and 'Pillow'. "
848
- "Install with: pip install \"natural-pdf[ocr-export]\""
849
- )
850
-
851
- # Optional: Add warning about vector data loss similar to PageCollection
852
845
  has_vector_elements = False
853
846
  for page in self.pages:
854
- if (hasattr(page, 'rects') and page.rects or
855
- hasattr(page, 'lines') and page.lines or
856
- hasattr(page, 'curves') and page.curves or
857
- (hasattr(page, 'chars') and any(getattr(el, 'source', None) != 'ocr' for el in page.chars)) or
858
- (hasattr(page, 'words') and any(getattr(el, 'source', None) != 'ocr' for el in page.words))):
847
+ if (
848
+ hasattr(page, "rects")
849
+ and page.rects
850
+ or hasattr(page, "lines")
851
+ and page.lines
852
+ or hasattr(page, "curves")
853
+ and page.curves
854
+ or (
855
+ hasattr(page, "chars")
856
+ and any(getattr(el, "source", None) != "ocr" for el in page.chars)
857
+ )
858
+ or (
859
+ hasattr(page, "words")
860
+ and any(getattr(el, "source", None) != "ocr" for el in page.words)
861
+ )
862
+ ):
859
863
  has_vector_elements = True
860
864
  break
861
865
  if has_vector_elements:
862
- logger.warning(
863
- "Warning: Saving with ocr=True creates an image-based PDF. "
864
- "Original vector elements (rects, lines, non-OCR text/chars) "
865
- "will not be preserved in the output file."
866
- )
866
+ logger.warning(
867
+ "Warning: Saving with ocr=True creates an image-based PDF. "
868
+ "Original vector elements (rects, lines, non-OCR text/chars) "
869
+ "will not be preserved in the output file."
870
+ )
867
871
 
868
872
  logger.info(f"Saving searchable PDF (OCR text layer) to: {output_path_str}")
869
873
  try:
870
874
  # Delegate to the searchable PDF exporter, passing self (PDF instance)
871
875
  create_searchable_pdf(self, output_path_str, dpi=dpi)
872
876
  except Exception as e:
873
- raise RuntimeError(f"Failed to create searchable PDF: {e}") from e
877
+ raise RuntimeError(f"Failed to create searchable PDF: {e}") from e
874
878
 
875
879
  elif original:
876
880
  if create_original_pdf is None:
877
881
  raise ImportError(
878
882
  "Saving with original=True requires 'pikepdf'. "
879
- "Install with: pip install \"natural-pdf[ocr-export]\""
883
+ 'Install with: pip install "natural-pdf[ocr-export]"'
880
884
  )
881
885
 
882
- # Optional: Add warning about losing OCR data similar to PageCollection
886
+ # Optional: Add warning about losing OCR data similar to PageCollection
883
887
  has_ocr_elements = False
884
888
  for page in self.pages:
885
- if hasattr(page, 'find_all'):
886
- ocr_text_elements = page.find_all("text[source=ocr]")
887
- if ocr_text_elements:
888
- has_ocr_elements = True
889
- break
890
- elif hasattr(page, 'words'): # Fallback
891
- if any(getattr(el, 'source', None) == 'ocr' for el in page.words):
892
- has_ocr_elements = True
893
- break
889
+ if hasattr(page, "find_all"):
890
+ ocr_text_elements = page.find_all("text[source=ocr]")
891
+ if ocr_text_elements:
892
+ has_ocr_elements = True
893
+ break
894
+ elif hasattr(page, "words"): # Fallback
895
+ if any(getattr(el, "source", None) == "ocr" for el in page.words):
896
+ has_ocr_elements = True
897
+ break
894
898
  if has_ocr_elements:
895
899
  logger.warning(
896
900
  "Warning: Saving with original=True preserves original page content. "
@@ -899,11 +903,11 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
899
903
 
900
904
  logger.info(f"Saving original PDF content to: {output_path_str}")
901
905
  try:
902
- # Delegate to the original PDF exporter, passing self (PDF instance)
903
- create_original_pdf(self, output_path_str)
906
+ # Delegate to the original PDF exporter, passing self (PDF instance)
907
+ create_original_pdf(self, output_path_str)
904
908
  except Exception as e:
905
- # Re-raise exception from exporter
906
- raise e
909
+ # Re-raise exception from exporter
910
+ raise e
907
911
 
908
912
  def ask(
909
913
  self,
@@ -1227,6 +1231,16 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1227
1231
  """Context manager exit."""
1228
1232
  self.close()
1229
1233
 
1234
def __repr__(self) -> str:
    """
    Return a short, human-readable representation of the PDF object.

    The representation must never raise, because it is used while debugging
    objects that may be only partially constructed. A missing ``_pages``
    attribute, or one that is present but ``None``, is therefore reported
    as ``uninitialized`` instead of failing on ``len()``.

    Returns:
        A string of the form ``<PDF source='...' pages=N>``.
    """
    # A single getattr covers both "attribute missing" and "attribute is None".
    pages = getattr(self, "_pages", None)
    page_count_str = "uninitialized" if pages is None else str(len(pages))

    # Fall back to a placeholder when the source path was never recorded.
    source_info = getattr(self, "source_path", "unknown source")
    return f"<PDF source='{source_info}' pages={page_count_str}>"
1243
+
1230
1244
  def get_id(self) -> str:
1231
1245
  """Get unique identifier for this PDF."""
1232
1246
  """Get unique identifier for this PDF."""
@@ -1238,6 +1252,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1238
1252
  self,
1239
1253
  pages: Optional[Union[Iterable[int], range, slice]] = None,
1240
1254
  resolution: int = 300,
1255
+ angle: Optional[float] = None,
1241
1256
  detection_resolution: int = 72,
1242
1257
  force_overwrite: bool = False,
1243
1258
  **deskew_kwargs,
@@ -1256,6 +1271,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1256
1271
  Args:
1257
1272
  pages: Page indices/slice to include (0-based). If None, processes all pages.
1258
1273
  resolution: DPI resolution for rendering the output deskewed pages.
1274
+ angle: The specific angle (in degrees) to rotate by. If None, detects automatically.
1259
1275
  detection_resolution: DPI resolution used for skew detection if angles are not
1260
1276
  already cached on the page objects.
1261
1277
  force_overwrite: If False (default), raises a ValueError if any target page
@@ -1300,14 +1316,13 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1300
1316
  deskewed_images_bytes = []
1301
1317
  logger.info(f"Deskewing {len(target_pages)} pages (output resolution={resolution} DPI)...")
1302
1318
 
1303
- # Use tqdm via get_tqdm
1304
1319
  for page in tqdm(target_pages, desc="Deskewing Pages", leave=False):
1305
1320
  try:
1306
1321
  # Use page.deskew to get the corrected PIL image
1307
1322
  # Pass down resolutions and kwargs
1308
1323
  deskewed_img = page.deskew(
1309
1324
  resolution=resolution,
1310
- angle=None, # Let page.deskew handle detection/caching
1325
+ angle=angle, # Explicit angle if given; None lets page.deskew handle detection/caching
1311
1326
  detection_resolution=detection_resolution,
1312
1327
  **deskew_kwargs,
1313
1328
  )
@@ -1400,7 +1415,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1400
1415
  except ImportError:
1401
1416
  raise ImportError(
1402
1417
  "Classification dependencies missing. "
1403
- 'Install with: pip install "natural-pdf[classification]"'
1418
+ 'Install with: pip install "natural-pdf[core-ml]"'
1404
1419
  )
1405
1420
  raise ClassificationError("ClassificationManager not available.")
1406
1421
 
@@ -15,6 +15,40 @@ if TYPE_CHECKING:
15
15
  from natural_pdf.elements.region import Region
16
16
 
17
17
 
18
def extract_bbox(obj: Any) -> Optional[Tuple[float, float, float, float]]:
    """
    Extract bounding box coordinates from any object that has bbox properties.

    Three strategies are tried in order, and the first one that yields four
    float-convertible values wins:

    1. A ``bbox`` attribute holding a 4-item tuple or list.
    2. Individual ``x0``, ``top``, ``x1`` and ``bottom`` attributes.
    3. A dict with ``x0``, ``top``, ``x1`` and ``bottom`` keys.

    A strategy whose values cannot be converted to ``float`` is skipped and
    the next one is tried, so malformed input never raises.

    Args:
        obj: Object that might have bbox coordinates (Element, Region, etc.)

    Returns:
        Tuple of (x0, top, x1, bottom) or None if object doesn't have bbox properties
    """
    coord_names = ("x0", "top", "x1", "bottom")

    # Try bbox property first (most common). Read it once so that a computed
    # property is not evaluated twice.
    bbox = getattr(obj, "bbox", None)
    if isinstance(bbox, (tuple, list)) and len(bbox) == 4:
        try:
            return tuple(float(coord) for coord in bbox)
        except (ValueError, TypeError):
            # Malformed bbox values: fall through to the other strategies,
            # consistent with the handling in the branches below.
            pass

    # Try individual coordinate properties
    if all(hasattr(obj, attr) for attr in coord_names):
        try:
            return (float(obj.x0), float(obj.top), float(obj.x1), float(obj.bottom))
        except (ValueError, TypeError):
            pass

    # If object is a dict with bbox keys
    if isinstance(obj, dict) and all(key in obj for key in coord_names):
        try:
            return (
                float(obj["x0"]),
                float(obj["top"]),
                float(obj["x1"]),
                float(obj["bottom"]),
            )
        except (ValueError, TypeError):
            pass

    return None
50
+
51
+
18
52
  class DirectionalMixin:
19
53
  """
20
54
  Mixin class providing directional methods for both Element and Region classes.
@@ -814,6 +848,7 @@ class Element(DirectionalMixin):
814
848
  legend_position: str = "right",
815
849
  color: Optional[Union[Tuple, str]] = "red", # Default color for single element
816
850
  label: Optional[str] = None,
851
+ width: Optional[int] = None, # Add width parameter
817
852
  ) -> Optional["Image.Image"]:
818
853
  """
819
854
  Show the page with only this element highlighted temporarily.
@@ -824,6 +859,7 @@ class Element(DirectionalMixin):
824
859
  legend_position: Position of the legend
825
860
  color: Color to highlight this element (default: red)
826
861
  label: Optional label for this element in the legend
862
+ width: Optional width for the output image in pixels
827
863
 
828
864
  Returns:
829
865
  PIL Image of the page with only this element highlighted, or None if error.
@@ -861,6 +897,7 @@ class Element(DirectionalMixin):
861
897
  page_index=self.page.index,
862
898
  temporary_highlights=[temp_highlight_data],
863
899
  scale=scale,
900
+ width=width, # Pass the width parameter
864
901
  labels=labels,
865
902
  legend_position=legend_position,
866
903
  )
@@ -898,6 +935,7 @@ class Element(DirectionalMixin):
898
935
  self,
899
936
  *,
900
937
  text: str,
938
+ contains: str = "all",
901
939
  apply_exclusions: bool = True,
902
940
  regex: bool = False,
903
941
  case: bool = True,
@@ -909,6 +947,7 @@ class Element(DirectionalMixin):
909
947
  self,
910
948
  selector: str,
911
949
  *,
950
+ contains: str = "all",
912
951
  apply_exclusions: bool = True,
913
952
  regex: bool = False,
914
953
  case: bool = True,
@@ -920,6 +959,7 @@ class Element(DirectionalMixin):
920
959
  selector: Optional[str] = None,
921
960
  *,
922
961
  text: Optional[str] = None,
962
+ contains: str = "all",
923
963
  apply_exclusions: bool = True,
924
964
  regex: bool = False,
925
965
  case: bool = True,
@@ -934,6 +974,9 @@ class Element(DirectionalMixin):
934
974
  Args:
935
975
  selector: CSS-like selector string.
936
976
  text: Text content to search for (equivalent to 'text:contains(...)').
977
+ contains: How to determine if elements are inside: 'all' (fully inside),
978
+ 'any' (any overlap), or 'center' (center point inside).
979
+ (default: "all")
937
980
  apply_exclusions: Whether to apply exclusion regions (default: True).
938
981
  regex: Whether to use regex for text search (`selector` or `text`) (default: False).
939
982
  case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -950,6 +993,7 @@ class Element(DirectionalMixin):
950
993
  return temp_region.find(
951
994
  selector=selector,
952
995
  text=text,
996
+ contains=contains,
953
997
  apply_exclusions=apply_exclusions,
954
998
  regex=regex,
955
999
  case=case,
@@ -961,6 +1005,7 @@ class Element(DirectionalMixin):
961
1005
  self,
962
1006
  *,
963
1007
  text: str,
1008
+ contains: str = "all",
964
1009
  apply_exclusions: bool = True,
965
1010
  regex: bool = False,
966
1011
  case: bool = True,
@@ -972,6 +1017,7 @@ class Element(DirectionalMixin):
972
1017
  self,
973
1018
  selector: str,
974
1019
  *,
1020
+ contains: str = "all",
975
1021
  apply_exclusions: bool = True,
976
1022
  regex: bool = False,
977
1023
  case: bool = True,
@@ -983,6 +1029,7 @@ class Element(DirectionalMixin):
983
1029
  selector: Optional[str] = None,
984
1030
  *,
985
1031
  text: Optional[str] = None,
1032
+ contains: str = "all",
986
1033
  apply_exclusions: bool = True,
987
1034
  regex: bool = False,
988
1035
  case: bool = True,
@@ -997,6 +1044,9 @@ class Element(DirectionalMixin):
997
1044
  Args:
998
1045
  selector: CSS-like selector string.
999
1046
  text: Text content to search for (equivalent to 'text:contains(...)').
1047
+ contains: How to determine if elements are inside: 'all' (fully inside),
1048
+ 'any' (any overlap), or 'center' (center point inside).
1049
+ (default: "all")
1000
1050
  apply_exclusions: Whether to apply exclusion regions (default: True).
1001
1051
  regex: Whether to use regex for text search (`selector` or `text`) (default: False).
1002
1052
  case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
@@ -1013,6 +1063,7 @@ class Element(DirectionalMixin):
1013
1063
  return temp_region.find_all(
1014
1064
  selector=selector,
1015
1065
  text=text,
1066
+ contains=contains,
1016
1067
  apply_exclusions=apply_exclusions,
1017
1068
  regex=regex,
1018
1069
  case=case,