natural-pdf 0.1.28__py3-none-any.whl → 0.1.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. bad_pdf_analysis/analyze_10_more.py +300 -0
  2. bad_pdf_analysis/analyze_final_10.py +552 -0
  3. bad_pdf_analysis/analyze_specific_pages.py +394 -0
  4. bad_pdf_analysis/analyze_specific_pages_direct.py +382 -0
  5. natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
  6. natural_pdf/analyzers/layout/layout_manager.py +44 -0
  7. natural_pdf/analyzers/layout/surya.py +1 -1
  8. natural_pdf/analyzers/shape_detection_mixin.py +228 -0
  9. natural_pdf/classification/manager.py +67 -0
  10. natural_pdf/core/element_manager.py +578 -27
  11. natural_pdf/core/highlighting_service.py +98 -43
  12. natural_pdf/core/page.py +86 -20
  13. natural_pdf/core/pdf.py +0 -2
  14. natural_pdf/describe/base.py +40 -9
  15. natural_pdf/describe/elements.py +11 -6
  16. natural_pdf/elements/base.py +134 -20
  17. natural_pdf/elements/collections.py +43 -11
  18. natural_pdf/elements/image.py +43 -0
  19. natural_pdf/elements/region.py +64 -19
  20. natural_pdf/elements/text.py +118 -11
  21. natural_pdf/flows/collections.py +4 -4
  22. natural_pdf/flows/region.py +17 -2
  23. natural_pdf/ocr/ocr_manager.py +50 -0
  24. natural_pdf/selectors/parser.py +27 -7
  25. natural_pdf/tables/__init__.py +5 -0
  26. natural_pdf/tables/result.py +101 -0
  27. natural_pdf/utils/bidi_mirror.py +36 -0
  28. natural_pdf/utils/visualization.py +15 -1
  29. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/METADATA +2 -1
  30. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/RECORD +48 -26
  31. natural_pdf-0.1.31.dist-info/top_level.txt +6 -0
  32. optimization/memory_comparison.py +172 -0
  33. optimization/pdf_analyzer.py +410 -0
  34. optimization/performance_analysis.py +397 -0
  35. optimization/test_cleanup_methods.py +155 -0
  36. optimization/test_memory_fix.py +162 -0
  37. tools/bad_pdf_eval/__init__.py +1 -0
  38. tools/bad_pdf_eval/analyser.py +302 -0
  39. tools/bad_pdf_eval/collate_summaries.py +130 -0
  40. tools/bad_pdf_eval/eval_suite.py +116 -0
  41. tools/bad_pdf_eval/export_enrichment_csv.py +62 -0
  42. tools/bad_pdf_eval/llm_enrich.py +273 -0
  43. tools/bad_pdf_eval/reporter.py +17 -0
  44. tools/bad_pdf_eval/utils.py +127 -0
  45. tools/rtl_smoke_test.py +80 -0
  46. natural_pdf-0.1.28.dist-info/top_level.txt +0 -2
  47. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/WHEEL +0 -0
  48. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/entry_points.txt +0 -0
  49. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/licenses/LICENSE +0 -0
@@ -279,7 +279,7 @@ def _analyze_typography(elements: List["Element"]) -> Dict[str, Any]:
279
279
  """Analyze typography patterns in text elements."""
280
280
  fonts = Counter()
281
281
  sizes = Counter()
282
- styles = {'bold': 0, 'italic': 0}
282
+ styles = {'bold': 0, 'italic': 0, 'strikeout': 0, 'underline': 0, 'highlight': 0}
283
283
  colors = Counter()
284
284
 
285
285
  for element in elements:
@@ -302,6 +302,12 @@ def _analyze_typography(elements: List["Element"]) -> Dict[str, Any]:
302
302
  styles['bold'] += 1
303
303
  if getattr(element, 'italic', False):
304
304
  styles['italic'] += 1
305
+ if getattr(element, 'strikeout', False):
306
+ styles['strikeout'] += 1
307
+ if getattr(element, 'underline', False):
308
+ styles['underline'] += 1
309
+ if getattr(element, 'highlight', False):
310
+ styles['highlight'] += 1
305
311
 
306
312
  # Color - use TextElement's color property
307
313
  color = getattr(element, 'color', None)
@@ -328,13 +334,12 @@ def _analyze_typography(elements: List["Element"]) -> Dict[str, Any]:
328
334
 
329
335
  # Styles
330
336
  style_list = []
331
- if styles['bold']:
332
- style_list.append(f"{styles['bold']} bold")
333
- if styles['italic']:
334
- style_list.append(f"{styles['italic']} italic")
337
+ for style, count in styles.items():
338
+ if count > 0:
339
+ style_list.append(f"{count} {style}")
335
340
  if style_list:
336
341
  result['styles'] = ", ".join(style_list)
337
-
342
+
338
343
  # Colors
339
344
  if colors and len(colors) > 1: # Only show if there are multiple colors
340
345
  result['colors'] = dict(colors.most_common())
@@ -414,6 +414,114 @@ class DirectionalMixin:
414
414
 
415
415
  return new_region
416
416
 
417
+ # ------------------------------------------------------------------
418
+ # Spatial parent lookup
419
+ # ------------------------------------------------------------------
420
+
421
+ def parent(
422
+ self,
423
+ selector: Optional[str] = None,
424
+ *,
425
+ mode: str = "contains", # "contains" | "center" | "overlap"
426
+ ) -> Optional["Element"]:
427
+ """Return the *smallest* element/region that encloses this one.
428
+
429
+ The search is purely geometric – no pre-existing hierarchy is assumed.
430
+
431
+ Parameters
432
+ ----------
433
+ selector : str, optional
434
+ CSS-style selector used to filter candidate containers first.
435
+ mode : str, default "contains"
436
+ How to decide if a candidate encloses this element.
437
+
438
+ • ``"contains"`` – candidate bbox fully contains *self* bbox.
439
+ • ``"center"`` – candidate contains the centroid of *self*.
440
+ • ``"overlap"`` – any bbox intersection > 0 pt².
441
+
442
+ Returns
443
+ -------
444
+ Element | Region | None
445
+ The smallest-area container that matches, or *None* if none found.
446
+ """
447
+
448
+ from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
449
+
450
+ # --- Gather candidates ------------------------------------------------
451
+ page = getattr(self, "page", None)
452
+ if page is None:
453
+ return None
454
+
455
+ # All basic elements
456
+ try:
457
+ candidates: List["Element"] = list(page.get_elements(apply_exclusions=False))
458
+ except Exception:
459
+ candidates = []
460
+
461
+ # Add detected regions if present
462
+ if hasattr(page, "_element_mgr") and hasattr(page._element_mgr, "regions"):
463
+ candidates.extend(list(page._element_mgr.regions))
464
+
465
+ # Remove self from pool
466
+ candidates = [c for c in candidates if c is not self]
467
+
468
+ # Apply selector filtering early if provided
469
+ if selector:
470
+ sel_obj = parse_selector(selector)
471
+ filt = selector_to_filter_func(sel_obj)
472
+ candidates = [c for c in candidates if filt(c)]
473
+
474
+ if not candidates:
475
+ return None
476
+
477
+ # Helper to extract bbox (x0, top, x1, bottom)
478
+ def _bbox(obj):
479
+ return extract_bbox(obj)
480
+
481
+ # Self metrics
482
+ self_bbox = _bbox(self)
483
+ if self_bbox is None:
484
+ return None
485
+ s_x0, s_y0, s_x1, s_y1 = self_bbox
486
+ s_cx = (s_x0 + s_x1) / 2
487
+ s_cy = (s_y0 + s_y1) / 2
488
+
489
+ matches: List["Element"] = []
490
+
491
+ for cand in candidates:
492
+ c_bbox = _bbox(cand)
493
+ if c_bbox is None:
494
+ continue
495
+ c_x0, c_y0, c_x1, c_y1 = c_bbox
496
+
497
+ if mode == "contains":
498
+ if c_x0 <= s_x0 and c_y0 <= s_y0 and c_x1 >= s_x1 and c_y1 >= s_y1:
499
+ matches.append(cand)
500
+ elif mode == "center":
501
+ if c_x0 <= s_cx <= c_x1 and c_y0 <= s_cy <= c_y1:
502
+ matches.append(cand)
503
+ elif mode == "overlap":
504
+ # Compute overlap rectangle
505
+ ox0 = max(c_x0, s_x0)
506
+ oy0 = max(c_y0, s_y0)
507
+ ox1 = min(c_x1, s_x1)
508
+ oy1 = min(c_y1, s_y1)
509
+ if ox1 > ox0 and oy1 > oy0:
510
+ matches.append(cand)
511
+
512
+ if not matches:
513
+ return None
514
+
515
+ # Pick the smallest-area match
516
+ def _area(obj):
517
+ bb = _bbox(obj)
518
+ if bb is None:
519
+ return float("inf")
520
+ return (bb[2] - bb[0]) * (bb[3] - bb[1])
521
+
522
+ matches.sort(key=_area)
523
+ return matches[0]
524
+
417
525
 
418
526
  class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
419
527
  """
@@ -805,25 +913,17 @@ class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
805
913
 
806
914
  def highlight(
807
915
  self,
808
- label: Optional[str] = None,
809
- color: Optional[Union[Tuple, str]] = None, # Allow string color
810
- use_color_cycling: bool = False,
916
+ label: str = "",
917
+ color: Optional[Tuple[float, float, float]] = None,
918
+ use_color_cycling: bool = True,
811
919
  include_attrs: Optional[List[str]] = None,
812
920
  existing: str = "append",
813
921
  ) -> "Element":
814
- """
815
- Highlight this element on the page.
816
-
817
- Args:
818
- label: Optional label for the highlight
819
- color: Color tuple/string for the highlight, or None to use automatic color
820
- use_color_cycling: Force color cycling even with no label (default: False)
821
- include_attrs: List of attribute names to display on the highlight (e.g., ['confidence', 'type'])
822
- existing: How to handle existing highlights - 'append' (default) or 'replace'
922
+ """Highlight the element with the specified colour.
823
923
 
824
- Returns:
825
- Self for method chaining
924
+ Highlight the element on the page.
826
925
  """
926
+
827
927
  # Access the correct highlighter service
828
928
  highlighter = self.page._highlighter
829
929
 
@@ -850,7 +950,7 @@ class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
850
950
 
851
951
  def show(
852
952
  self,
853
- scale: float = 2.0,
953
+ resolution: Optional[float] = None,
854
954
  labels: bool = True,
855
955
  legend_position: str = "right",
856
956
  color: Optional[Union[Tuple, str]] = "red", # Default color for single element
@@ -862,7 +962,7 @@ class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
862
962
  Show the page with only this element highlighted temporarily.
863
963
 
864
964
  Args:
865
- scale: Scale factor for rendering
965
+ resolution: Resolution in DPI for rendering (default: uses global options, fallback to 144 DPI)
866
966
  labels: Whether to include a legend for the highlight
867
967
  legend_position: Position of the legend
868
968
  color: Color to highlight this element (default: red)
@@ -874,6 +974,13 @@ class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
874
974
  Returns:
875
975
  PIL Image of the page with only this element highlighted, or None if error.
876
976
  """
977
+ # Apply global options as defaults
978
+ import natural_pdf
979
+ if resolution is None:
980
+ if natural_pdf.options.image.resolution is not None:
981
+ resolution = natural_pdf.options.image.resolution
982
+ else:
983
+ resolution = 144 # Default resolution when none specified
877
984
  if not hasattr(self, "page") or not self.page:
878
985
  logger.warning(f"Cannot show element, missing 'page' attribute: {self}")
879
986
  return None
@@ -909,7 +1016,7 @@ class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
909
1016
  return service.render_preview(
910
1017
  page_index=self.page.index,
911
1018
  temporary_highlights=[temp_highlight_data],
912
- scale=scale,
1019
+ resolution=resolution,
913
1020
  width=width, # Pass the width parameter
914
1021
  labels=labels,
915
1022
  legend_position=legend_position,
@@ -920,22 +1027,29 @@ class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
920
1027
  return None
921
1028
 
922
1029
  def save(
923
- self, filename: str, scale: float = 2.0, labels: bool = True, legend_position: str = "right"
1030
+ self, filename: str, resolution: Optional[float] = None, labels: bool = True, legend_position: str = "right"
924
1031
  ) -> None:
925
1032
  """
926
1033
  Save the page with this element highlighted to an image file.
927
1034
 
928
1035
  Args:
929
1036
  filename: Path to save the image to
930
- scale: Scale factor for rendering
1037
+ resolution: Resolution in DPI for rendering (default: uses global options, fallback to 144 DPI)
931
1038
  labels: Whether to include a legend for labels
932
1039
  legend_position: Position of the legend
933
1040
 
934
1041
  Returns:
935
1042
  Self for method chaining
936
1043
  """
1044
+ # Apply global options as defaults
1045
+ import natural_pdf
1046
+ if resolution is None:
1047
+ if natural_pdf.options.image.resolution is not None:
1048
+ resolution = natural_pdf.options.image.resolution
1049
+ else:
1050
+ resolution = 144 # Default resolution when none specified
937
1051
  # Save the highlighted image
938
- self.page.save_image(filename, scale=scale, labels=labels, legend_position=legend_position)
1052
+ self.page.save_image(filename, resolution=resolution, labels=labels, legend_position=legend_position)
939
1053
  return self
940
1054
 
941
1055
  # Note: save_image method removed in favor of save()
@@ -859,7 +859,7 @@ class ElementCollection(
859
859
  distinct: bool = False,
860
860
  include_attrs: Optional[List[str]] = None,
861
861
  # --- Rendering Parameters ---
862
- scale: float = 2.0,
862
+ resolution: Optional[float] = None,
863
863
  labels: bool = True, # Use 'labels' consistent with service
864
864
  legend_position: str = "right",
865
865
  render_ocr: bool = False,
@@ -884,7 +884,7 @@ class ElementCollection(
884
884
  label_format: F-string to format group labels if group_by is used.
885
885
  distinct: Highlight each element distinctly (overrides group_by/label).
886
886
  include_attrs: Attributes to display on individual highlights.
887
- scale: Scale factor for rendering image.
887
+ resolution: Resolution in DPI for rendering (uses global options if not specified, defaults to 144 DPI).
888
888
  labels: Whether to include a legend for the temporary highlights.
889
889
  legend_position: Position of the legend ('right', 'left', 'top', 'bottom').
890
890
  render_ocr: Whether to render OCR text.
@@ -900,6 +900,18 @@ class ElementCollection(
900
900
  Raises:
901
901
  ValueError: If the collection is empty or elements are on different pages/PDFs.
902
902
  """
903
+ # Apply global options as defaults, but allow explicit parameters to override
904
+ import natural_pdf
905
+
906
+ # Use global options if parameters are not explicitly set
907
+ if width is None:
908
+ width = natural_pdf.options.image.width
909
+ if resolution is None:
910
+ if natural_pdf.options.image.resolution is not None:
911
+ resolution = natural_pdf.options.image.resolution
912
+ else:
913
+ resolution = 144 # Default resolution when none specified
914
+
903
915
  if not self._elements:
904
916
  raise ValueError("Cannot show an empty collection.")
905
917
 
@@ -967,7 +979,7 @@ class ElementCollection(
967
979
  img = service.render_preview(
968
980
  page_index=page.index,
969
981
  temporary_highlights=highlight_data_list,
970
- scale=scale,
982
+ resolution=resolution,
971
983
  width=width, # Pass the width parameter
972
984
  labels=labels, # Use 'labels'
973
985
  legend_position=legend_position,
@@ -982,7 +994,7 @@ class ElementCollection(
982
994
  def save(
983
995
  self,
984
996
  filename: str,
985
- scale: float = 2.0,
997
+ resolution: Optional[float] = None,
986
998
  width: Optional[int] = None,
987
999
  labels: bool = True,
988
1000
  legend_position: str = "right",
@@ -993,7 +1005,7 @@ class ElementCollection(
993
1005
 
994
1006
  Args:
995
1007
  filename: Path to save the image to
996
- scale: Scale factor for rendering
1008
+ resolution: Resolution in DPI for rendering (uses global options if not specified, defaults to 144 DPI)
997
1009
  width: Optional width for the output image in pixels
998
1010
  labels: Whether to include a legend for labels
999
1011
  legend_position: Position of the legend
@@ -1002,10 +1014,22 @@ class ElementCollection(
1002
1014
  Returns:
1003
1015
  Self for method chaining
1004
1016
  """
1017
+ # Apply global options as defaults, but allow explicit parameters to override
1018
+ import natural_pdf
1019
+
1020
+ # Use global options if parameters are not explicitly set
1021
+ if width is None:
1022
+ width = natural_pdf.options.image.width
1023
+ if resolution is None:
1024
+ if natural_pdf.options.image.resolution is not None:
1025
+ resolution = natural_pdf.options.image.resolution
1026
+ else:
1027
+ resolution = 144 # Default resolution when none specified
1028
+
1005
1029
  # Use to_image to generate and save the image
1006
1030
  self.to_image(
1007
1031
  path=filename,
1008
- scale=scale,
1032
+ resolution=resolution,
1009
1033
  width=width,
1010
1034
  labels=labels,
1011
1035
  legend_position=legend_position,
@@ -1016,7 +1040,7 @@ class ElementCollection(
1016
1040
  def to_image(
1017
1041
  self,
1018
1042
  path: Optional[str] = None,
1019
- scale: float = 2.0,
1043
+ resolution: Optional[float] = None,
1020
1044
  width: Optional[int] = None,
1021
1045
  labels: bool = True,
1022
1046
  legend_position: str = "right",
@@ -1028,7 +1052,7 @@ class ElementCollection(
1028
1052
 
1029
1053
  Args:
1030
1054
  path: Optional path to save the image to
1031
- scale: Scale factor for rendering
1055
+ resolution: Resolution in DPI for rendering (uses global options if not specified, defaults to 144 DPI)
1032
1056
  width: Optional width for the output image in pixels (height calculated to maintain aspect ratio)
1033
1057
  labels: Whether to include a legend for labels
1034
1058
  legend_position: Position of the legend
@@ -1043,7 +1067,7 @@ class ElementCollection(
1043
1067
  # Generate the image using to_image
1044
1068
  return page.to_image(
1045
1069
  path=path,
1046
- scale=scale,
1070
+ resolution=resolution,
1047
1071
  width=width,
1048
1072
  labels=labels,
1049
1073
  legend_position=legend_position,
@@ -1774,7 +1798,7 @@ class ElementCollection(
1774
1798
  self,
1775
1799
  padding: int = 1,
1776
1800
  threshold: float = 0.95,
1777
- resolution: float = 150,
1801
+ resolution: Optional[float] = None,
1778
1802
  show_progress: bool = True,
1779
1803
  ) -> "ElementCollection":
1780
1804
  """
@@ -1786,12 +1810,20 @@ class ElementCollection(
1786
1810
  Args:
1787
1811
  padding: Number of pixels to keep as padding after trimming (default: 1)
1788
1812
  threshold: Threshold for considering a row/column as whitespace (0.0-1.0, default: 0.95)
1789
- resolution: Resolution for image rendering in DPI (default: 150)
1813
+ resolution: Resolution for image rendering in DPI (default: uses global options, fallback to 144 DPI)
1790
1814
  show_progress: Whether to show a progress bar for the trimming operation
1791
1815
 
1792
1816
  Returns:
1793
1817
  New ElementCollection with trimmed regions
1794
1818
  """
1819
+ # Apply global options as defaults
1820
+ import natural_pdf
1821
+ if resolution is None:
1822
+ if natural_pdf.options.image.resolution is not None:
1823
+ resolution = natural_pdf.options.image.resolution
1824
+ else:
1825
+ resolution = 144 # Default resolution when none specified
1826
+
1795
1827
  return self.apply(
1796
1828
  lambda element: element.trim(
1797
1829
  padding=padding, threshold=threshold, resolution=resolution
@@ -0,0 +1,43 @@
1
+ from typing import TYPE_CHECKING, Any, Dict, Tuple
2
+
3
+ from natural_pdf.elements.base import Element
4
+
5
+ if TYPE_CHECKING:
6
+ from natural_pdf.core.page import Page
7
+
8
+ class ImageElement(Element):
9
+ """Represents a raster XObject (embedded image) on a PDF page."""
10
+
11
+ def __init__(self, obj: Dict[str, Any], page: "Page"):
12
+ super().__init__(obj, page)
13
+
14
+ # ------------------------------------------------------------------
15
+ # Simple attribute proxies
16
+ # ------------------------------------------------------------------
17
+ @property
18
+ def type(self) -> str: # noqa: D401 – short description already given
19
+ return "image"
20
+
21
+ @property
22
+ def width(self) -> float: # override just to use dict value directly
23
+ return float(self._obj.get("width", 0))
24
+
25
+ @property
26
+ def height(self) -> float:
27
+ return float(self._obj.get("height", 0))
28
+
29
+ @property
30
+ def srcsize(self) -> Tuple[float, float]:
31
+ """Original pixel dimensions of the embedded image (width, height)."""
32
+ return self._obj.get("srcsize", (None, None))
33
+
34
+ @property
35
+ def colorspace(self): # raw pdfminer data
36
+ return self._obj.get("colorspace")
37
+
38
+ # No text extraction for images
39
+ def extract_text(self, *args, **kwargs) -> str: # noqa: D401 – consistent signature
40
+ return ""
41
+
42
+ def __repr__(self):
43
+ return f"<ImageElement bbox={self.bbox} srcsize={self.srcsize}>"
@@ -26,6 +26,11 @@ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
26
26
  # Import new utils
27
27
  from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
28
28
 
29
+ # ------------------------------------------------------------------
30
+ # Table utilities
31
+ # ------------------------------------------------------------------
32
+ from natural_pdf.tables import TableResult
33
+
29
34
  # --- End Classification Imports --- #
30
35
 
31
36
 
@@ -590,8 +595,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
590
595
 
591
596
  def to_image(
592
597
  self,
593
- scale: float = 2.0,
594
- resolution: float = 150,
598
+ resolution: Optional[float] = None,
595
599
  crop: bool = False,
596
600
  include_highlights: bool = True,
597
601
  **kwargs,
@@ -600,7 +604,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
600
604
  Generate an image of just this region.
601
605
 
602
606
  Args:
603
- resolution: Resolution in DPI for rendering (default: 150)
607
+ resolution: Resolution in DPI for rendering (default: uses global options, fallback to 144 DPI)
604
608
  crop: If True, only crop the region without highlighting its boundaries
605
609
  include_highlights: Whether to include existing highlights (default: True)
606
610
  **kwargs: Additional parameters for page.to_image()
@@ -608,6 +612,14 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
608
612
  Returns:
609
613
  PIL Image of just this region
610
614
  """
615
+ # Apply global options as defaults
616
+ import natural_pdf
617
+ if resolution is None:
618
+ if natural_pdf.options.image.resolution is not None:
619
+ resolution = natural_pdf.options.image.resolution
620
+ else:
621
+ resolution = 144 # Default resolution when none specified
622
+
611
623
  # Handle the case where user wants the cropped region to have a specific width
612
624
  page_kwargs = kwargs.copy()
613
625
  effective_resolution = resolution # Start with the provided resolution
@@ -633,7 +645,6 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
633
645
 
634
646
  # First get the full page image with highlights if requested
635
647
  page_image = self._page.to_image(
636
- scale=scale,
637
648
  resolution=effective_resolution,
638
649
  include_highlights=include_highlights,
639
650
  **page_kwargs,
@@ -683,7 +694,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
683
694
 
684
695
  def show(
685
696
  self,
686
- scale: float = 2.0,
697
+ resolution: Optional[float] = None,
687
698
  labels: bool = True,
688
699
  legend_position: str = "right",
689
700
  # Add a default color for standalone show
@@ -696,7 +707,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
696
707
  Show the page with just this region highlighted temporarily.
697
708
 
698
709
  Args:
699
- scale: Scale factor for rendering
710
+ resolution: Resolution in DPI for rendering (default: uses global options, fallback to 144 DPI)
700
711
  labels: Whether to include a legend for labels
701
712
  legend_position: Position of the legend
702
713
  color: Color to highlight this region (default: blue)
@@ -709,6 +720,14 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
709
720
  Returns:
710
721
  PIL Image of the page with only this region highlighted
711
722
  """
723
+ # Apply global options as defaults
724
+ import natural_pdf
725
+ if resolution is None:
726
+ if natural_pdf.options.image.resolution is not None:
727
+ resolution = natural_pdf.options.image.resolution
728
+ else:
729
+ resolution = 144 # Default resolution when none specified
730
+
712
731
  if not self._page:
713
732
  raise ValueError("Region must be associated with a page to show.")
714
733
 
@@ -737,7 +756,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
737
756
  return service.render_preview(
738
757
  page_index=self._page.index,
739
758
  temporary_highlights=[temp_highlight_data],
740
- scale=scale,
759
+ resolution=resolution,
741
760
  width=width, # Pass the width parameter
742
761
  labels=labels,
743
762
  legend_position=legend_position,
@@ -745,31 +764,39 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
745
764
  )
746
765
 
747
766
  def save(
748
- self, filename: str, scale: float = 2.0, labels: bool = True, legend_position: str = "right"
767
+ self, filename: str, resolution: Optional[float] = None, labels: bool = True, legend_position: str = "right"
749
768
  ) -> "Region":
750
769
  """
751
770
  Save the page with this region highlighted to an image file.
752
771
 
753
772
  Args:
754
773
  filename: Path to save the image to
755
- scale: Scale factor for rendering
774
+ resolution: Resolution in DPI for rendering (default: uses global options, fallback to 144 DPI)
756
775
  labels: Whether to include a legend for labels
757
776
  legend_position: Position of the legend
758
777
 
759
778
  Returns:
760
779
  Self for method chaining
761
780
  """
781
+ # Apply global options as defaults
782
+ import natural_pdf
783
+ if resolution is None:
784
+ if natural_pdf.options.image.resolution is not None:
785
+ resolution = natural_pdf.options.image.resolution
786
+ else:
787
+ resolution = 144 # Default resolution when none specified
788
+
762
789
  # Highlight this region if not already highlighted
763
790
  self.highlight()
764
791
 
765
792
  # Save the highlighted image
766
- self._page.save_image(filename, scale=scale, labels=labels, legend_position=legend_position)
793
+ self._page.save_image(filename, resolution=resolution, labels=labels, legend_position=legend_position)
767
794
  return self
768
795
 
769
796
  def save_image(
770
797
  self,
771
798
  filename: str,
772
- resolution: float = 150,
799
+ resolution: Optional[float] = None,
773
800
  crop: bool = False,
774
801
  include_highlights: bool = True,
775
802
  **kwargs,
@@ -779,7 +806,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
779
806
 
780
807
  Args:
781
808
  filename: Path to save the image to
782
- resolution: Resolution in DPI for rendering (default: 150)
809
+ resolution: Resolution in DPI for rendering (default: uses global options, fallback to 144 DPI)
783
810
  crop: If True, only crop the region without highlighting its boundaries
784
811
  include_highlights: Whether to include existing highlights (default: True)
785
812
  **kwargs: Additional parameters for page.to_image()
@@ -787,6 +814,14 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
787
814
  Returns:
788
815
  Self for method chaining
789
816
  """
817
+ # Apply global options as defaults
818
+ import natural_pdf
819
+ if resolution is None:
820
+ if natural_pdf.options.image.resolution is not None:
821
+ resolution = natural_pdf.options.image.resolution
822
+ else:
823
+ resolution = 144 # Default resolution when none specified
824
+
790
825
  # Get the region image
791
826
  image = self.to_image(
792
827
  resolution=resolution,
@@ -803,7 +838,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
803
838
  self,
804
839
  padding: int = 1,
805
840
  threshold: float = 0.95,
806
- resolution: float = 150,
841
+ resolution: Optional[float] = None,
807
842
  pre_shrink: float = 0.5,
808
843
  ) -> "Region":
809
844
  """
@@ -817,7 +852,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
817
852
  threshold: Threshold for considering a row/column as whitespace (0.0-1.0, default: 0.95)
818
853
  Higher values mean more strict whitespace detection.
819
854
  E.g., 0.95 means if 95% of pixels in a row/column are white, consider it whitespace.
820
- resolution: Resolution for image rendering in DPI (default: 150)
855
+ resolution: Resolution for image rendering in DPI (default: uses global options, fallback to 144 DPI)
821
856
  pre_shrink: Amount to shrink region before trimming, then expand back after (default: 0.5)
822
857
  This helps avoid detecting box borders/slivers as content.
823
858
 
@@ -834,6 +869,14 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
834
869
  # Conservative trimming with more padding
835
870
  loose = region.trim(padding=3, threshold=0.98)
836
871
  """
872
+ # Apply global options as defaults
873
+ import natural_pdf
874
+ if resolution is None:
875
+ if natural_pdf.options.image.resolution is not None:
876
+ resolution = natural_pdf.options.image.resolution
877
+ else:
878
+ resolution = 144 # Default resolution when none specified
879
+
837
880
  # Pre-shrink the region to avoid box slivers
838
881
  work_region = (
839
882
  self.expand(left=-pre_shrink, right=-pre_shrink, top=-pre_shrink, bottom=-pre_shrink)
@@ -1172,7 +1215,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1172
1215
  cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
1173
1216
  # --- NEW: Add tqdm control option --- #
1174
1217
  show_progress: bool = False, # Controls progress bar for text method
1175
- ) -> List[List[Optional[str]]]: # Return type allows Optional[str] for cells
1218
+ ) -> TableResult: # Return type allows Optional[str] for cells
1176
1219
  """
1177
1220
  Extract a table from this region.
1178
1221
 
@@ -1224,7 +1267,7 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1224
1267
  logger.debug(
1225
1268
  f"Region {self.bbox}: Found {len(cell_regions_in_table)} pre-computed table_cell regions – using 'cells' method."
1226
1269
  )
1227
- return self._extract_table_from_cells(cell_regions_in_table)
1270
+ return TableResult(self._extract_table_from_cells(cell_regions_in_table))
1228
1271
 
1229
1272
  # --------------------------------------------------------------- #
1230
1273
 
@@ -1280,19 +1323,21 @@ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin, ShapeDetect
1280
1323
 
1281
1324
  # Use the selected method
1282
1325
  if effective_method == "tatr":
1283
- return self._extract_table_tatr(use_ocr=use_ocr, ocr_config=ocr_config)
1326
+ table_rows = self._extract_table_tatr(use_ocr=use_ocr, ocr_config=ocr_config)
1284
1327
  elif effective_method == "text":
1285
1328
  current_text_options = text_options.copy()
1286
1329
  current_text_options["cell_extraction_func"] = cell_extraction_func
1287
1330
  current_text_options["show_progress"] = show_progress
1288
- return self._extract_table_text(**current_text_options)
1331
+ table_rows = self._extract_table_text(**current_text_options)
1289
1332
  elif effective_method == "pdfplumber":
1290
- return self._extract_table_plumber(table_settings)
1333
+ table_rows = self._extract_table_plumber(table_settings)
1291
1334
  else:
1292
1335
  raise ValueError(
1293
1336
  f"Unknown table extraction method: '{method}'. Choose from 'tatr', 'pdfplumber', 'text', 'stream', 'lattice'."
1294
1337
  )
1295
1338
 
1339
+ return TableResult(table_rows)
1340
+
1296
1341
  def extract_tables(
1297
1342
  self,
1298
1343
  method: Optional[str] = None,