natural-pdf 0.1.28__py3-none-any.whl → 0.1.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. bad_pdf_analysis/analyze_10_more.py +300 -0
  2. bad_pdf_analysis/analyze_final_10.py +552 -0
  3. bad_pdf_analysis/analyze_specific_pages.py +394 -0
  4. bad_pdf_analysis/analyze_specific_pages_direct.py +382 -0
  5. natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
  6. natural_pdf/analyzers/layout/layout_manager.py +44 -0
  7. natural_pdf/analyzers/layout/surya.py +1 -1
  8. natural_pdf/analyzers/shape_detection_mixin.py +228 -0
  9. natural_pdf/classification/manager.py +67 -0
  10. natural_pdf/core/element_manager.py +578 -27
  11. natural_pdf/core/highlighting_service.py +98 -43
  12. natural_pdf/core/page.py +86 -20
  13. natural_pdf/core/pdf.py +0 -2
  14. natural_pdf/describe/base.py +40 -9
  15. natural_pdf/describe/elements.py +11 -6
  16. natural_pdf/elements/base.py +134 -20
  17. natural_pdf/elements/collections.py +43 -11
  18. natural_pdf/elements/image.py +43 -0
  19. natural_pdf/elements/region.py +64 -19
  20. natural_pdf/elements/text.py +118 -11
  21. natural_pdf/flows/collections.py +4 -4
  22. natural_pdf/flows/region.py +17 -2
  23. natural_pdf/ocr/ocr_manager.py +50 -0
  24. natural_pdf/selectors/parser.py +27 -7
  25. natural_pdf/tables/__init__.py +5 -0
  26. natural_pdf/tables/result.py +101 -0
  27. natural_pdf/utils/bidi_mirror.py +36 -0
  28. natural_pdf/utils/visualization.py +15 -1
  29. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/METADATA +2 -1
  30. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/RECORD +48 -26
  31. natural_pdf-0.1.31.dist-info/top_level.txt +6 -0
  32. optimization/memory_comparison.py +172 -0
  33. optimization/pdf_analyzer.py +410 -0
  34. optimization/performance_analysis.py +397 -0
  35. optimization/test_cleanup_methods.py +155 -0
  36. optimization/test_memory_fix.py +162 -0
  37. tools/bad_pdf_eval/__init__.py +1 -0
  38. tools/bad_pdf_eval/analyser.py +302 -0
  39. tools/bad_pdf_eval/collate_summaries.py +130 -0
  40. tools/bad_pdf_eval/eval_suite.py +116 -0
  41. tools/bad_pdf_eval/export_enrichment_csv.py +62 -0
  42. tools/bad_pdf_eval/llm_enrich.py +273 -0
  43. tools/bad_pdf_eval/reporter.py +17 -0
  44. tools/bad_pdf_eval/utils.py +127 -0
  45. tools/rtl_smoke_test.py +80 -0
  46. natural_pdf-0.1.28.dist-info/top_level.txt +0 -2
  47. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/WHEEL +0 -0
  48. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/entry_points.txt +0 -0
  49. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/licenses/LICENSE +0 -0
@@ -70,16 +70,16 @@ class HighlightRenderer:
70
70
  page: Page,
71
71
  base_image: Image.Image,
72
72
  highlights: List[Highlight],
73
- scale: float,
73
+ scale_factor: float,
74
74
  render_ocr: bool,
75
75
  ):
76
76
  self.page = page # Keep page reference for OCR rendering
77
77
  self.base_image = base_image.convert("RGBA") # Ensure RGBA
78
78
  self.highlights = highlights
79
- self.scale = scale
79
+ self.scale_factor = scale_factor # Renamed from scale to scale_factor for clarity
80
80
  self.render_ocr = render_ocr
81
81
  self.result_image = self.base_image.copy()
82
- self.vertex_size = max(3, int(2 * self.scale)) # Size of corner markers
82
+ self.vertex_size = max(3, int(2 * self.scale_factor)) # Size of corner markers
83
83
 
84
84
  def render(self) -> Image.Image:
85
85
  """Executes the rendering process."""
@@ -98,7 +98,7 @@ class HighlightRenderer:
98
98
  scaled_bbox = None
99
99
 
100
100
  if highlight.is_polygon:
101
- scaled_polygon = [(p[0] * self.scale, p[1] * self.scale) for p in highlight.polygon]
101
+ scaled_polygon = [(p[0] * self.scale_factor, p[1] * self.scale_factor) for p in highlight.polygon]
102
102
  # Draw polygon fill and border
103
103
  draw.polygon(
104
104
  scaled_polygon, fill=highlight.color, outline=highlight.border_color, width=2
@@ -113,10 +113,10 @@ class HighlightRenderer:
113
113
  else: # Rectangle
114
114
  x0, top, x1, bottom = highlight.bbox
115
115
  x0_s, top_s, x1_s, bottom_s = (
116
- x0 * self.scale,
117
- top * self.scale,
118
- x1 * self.scale,
119
- bottom * self.scale,
116
+ x0 * self.scale_factor,
117
+ top * self.scale_factor,
118
+ x1 * self.scale_factor,
119
+ bottom * self.scale_factor,
120
120
  )
121
121
  scaled_bbox = [x0_s, top_s, x1_s, bottom_s]
122
122
  # Draw rectangle fill and border
@@ -159,15 +159,15 @@ class HighlightRenderer:
159
159
  """Draws attribute key-value pairs on the highlight."""
160
160
  try:
161
161
  # Slightly larger font, scaled
162
- font_size = max(10, int(8 * self.scale))
162
+ font_size = max(10, int(8 * self.scale_factor))
163
163
  # Prioritize monospace fonts for better alignment
164
164
  font = ImageFont.truetype("Arial.ttf", font_size) # Fallback sans-serif
165
165
  except IOError:
166
166
  font = ImageFont.load_default()
167
167
  font_size = 10 # Reset size for default font
168
168
 
169
- line_height = font_size + int(4 * self.scale) # Scaled line spacing
170
- bg_padding = int(3 * self.scale)
169
+ line_height = font_size + int(4 * self.scale_factor) # Scaled line spacing
170
+ bg_padding = int(3 * self.scale_factor)
171
171
  max_width = 0
172
172
  text_lines = []
173
173
 
@@ -191,8 +191,8 @@ class HighlightRenderer:
191
191
  total_height = line_height * len(text_lines)
192
192
 
193
193
  # Position near top-right corner with padding
194
- x = bbox_scaled[2] - int(2 * self.scale) - max_width
195
- y = bbox_scaled[1] + int(2 * self.scale)
194
+ x = bbox_scaled[2] - int(2 * self.scale_factor) - max_width
195
+ y = bbox_scaled[1] + int(2 * self.scale_factor)
196
196
 
197
197
  # Draw background rectangle (semi-transparent white)
198
198
  bg_x0 = x - bg_padding
@@ -244,10 +244,10 @@ class HighlightRenderer:
244
244
  for element in ocr_elements:
245
245
  x0, top, x1, bottom = element.bbox
246
246
  x0_s, top_s, x1_s, bottom_s = (
247
- x0 * self.scale,
248
- top * self.scale,
249
- x1 * self.scale,
250
- bottom * self.scale,
247
+ x0 * self.scale_factor,
248
+ top * self.scale_factor,
249
+ x1 * self.scale_factor,
250
+ bottom * self.scale_factor,
251
251
  )
252
252
  box_w, box_h = x1_s - x0_s, bottom_s - top_s
253
253
 
@@ -556,19 +556,62 @@ class HighlightingService:
556
556
  self._highlights_by_page[page_index].append(highlight)
557
557
  logger.debug(f"Added highlight to page {page_index}: {highlight}")
558
558
 
559
+ # --- Invalidate page-level image cache --------------------------------
560
+ # The Page.to_image method maintains an internal cache keyed by rendering
561
+ # parameters. Because the cache key currently does **not** incorporate
562
+ # any information about the highlights themselves, it can return stale
563
+ # images after highlights are added or removed. To ensure the next
564
+ # render reflects the new highlights, we clear the cache for the
565
+ # affected page here.
566
+ try:
567
+ page_obj = self._pdf[page_index]
568
+ if hasattr(page_obj, "_to_image_cache"):
569
+ page_obj._to_image_cache.clear()
570
+ logger.debug(
571
+ f"Cleared cached to_image renders for page {page_index} after adding a highlight."
572
+ )
573
+ except Exception as cache_err: # pragma: no cover – never fail highlight creation
574
+ logger.warning(
575
+ f"Failed to invalidate to_image cache for page {page_index}: {cache_err}",
576
+ exc_info=True,
577
+ )
578
+
559
579
  def clear_all(self):
560
580
  """Clears all highlights from all pages and resets the color manager."""
561
581
  self._highlights_by_page = {}
562
582
  self._color_manager.reset()
563
583
  logger.info("Cleared all highlights and reset ColorManager.")
564
584
 
585
+ # Clear cached images for *all* pages because their visual state may
586
+ # depend on highlight visibility.
587
+ for idx, page in enumerate(self._pdf.pages):
588
+ try:
589
+ if hasattr(page, "_to_image_cache"):
590
+ page._to_image_cache.clear()
591
+ except Exception:
592
+ # Non-critical – keep going for remaining pages
593
+ continue
594
+
565
595
  def clear_page(self, page_index: int):
566
596
  """Clears all highlights from a specific page."""
567
597
  if page_index in self._highlights_by_page:
568
598
  del self._highlights_by_page[page_index]
569
599
  logger.debug(f"Cleared highlights for page {page_index}.")
570
- # Note: We typically don't reset the color manager when clearing a single page
571
- # to maintain color consistency if highlights are added back.
600
+
601
+ # Also clear any cached rendered images for this page so the next render
602
+ # reflects the removal of highlights.
603
+ try:
604
+ page_obj = self._pdf[page_index]
605
+ if hasattr(page_obj, "_to_image_cache"):
606
+ page_obj._to_image_cache.clear()
607
+ logger.debug(
608
+ f"Cleared cached to_image renders for page {page_index} after removing highlights."
609
+ )
610
+ except Exception as cache_err: # pragma: no cover
611
+ logger.warning(
612
+ f"Failed to invalidate to_image cache for page {page_index}: {cache_err}",
613
+ exc_info=True,
614
+ )
572
615
 
573
616
  def get_highlights_for_page(self, page_index: int) -> List[Highlight]:
574
617
  """Returns a list of Highlight objects for a specific page."""
@@ -581,11 +624,10 @@ class HighlightingService:
581
624
  def render_page(
582
625
  self,
583
626
  page_index: int,
584
- scale: float = 2.0,
627
+ resolution: float = 144,
585
628
  labels: bool = True,
586
629
  legend_position: str = "right",
587
630
  render_ocr: bool = False,
588
- resolution: Optional[float] = None,
589
631
  **kwargs, # Pass other args to pdfplumber.page.to_image if needed
590
632
  ) -> Optional[Image.Image]:
591
633
  """
@@ -594,12 +636,11 @@ class HighlightingService:
594
636
 
595
637
  Args:
596
638
  page_index: The 0-based index of the page to render.
597
- scale: Scale factor for rendering highlights if width/height/resolution not in kwargs.
639
+ resolution: Resolution (DPI) for the base page image if width/height not in kwargs.
640
+ Defaults to 144 DPI (equivalent to previous scale=2.0).
598
641
  labels: Whether to include a legend for highlights.
599
642
  legend_position: Position of the legend.
600
643
  render_ocr: Whether to render OCR text on the image.
601
- resolution: Optional resolution (DPI) for the base page image if width/height not in kwargs.
602
- Defaults to scale * 72 if not otherwise specified.
603
644
  kwargs: Additional keyword arguments for pdfplumber's page.to_image (e.g., width, height).
604
645
 
605
646
  Returns:
@@ -625,13 +666,16 @@ class HighlightingService:
625
666
  logger.debug(f"Rendering page {page_index} with height={to_image_args['height']}.")
626
667
  # Actual scale will be calculated after image creation
627
668
  else:
628
- # Use explicit resolution from kwargs if present, then the resolution param, then scale
629
- render_resolution = to_image_args.pop(
630
- "resolution", resolution
631
- ) # Use and remove from kwargs if present
669
+ # Use explicit resolution if provided via kwargs, otherwise fallback to the
670
+ # `resolution` parameter (which might be None). If we still end up with
671
+ # `None`, default to 144 DPI to avoid downstream errors.
672
+ render_resolution = to_image_args.pop("resolution", resolution)
632
673
  if render_resolution is None:
633
- render_resolution = scale * 72
634
- to_image_args["resolution"] = render_resolution # Add it back for the call
674
+ render_resolution = 144
675
+
676
+ # Reinstate into kwargs for pdfplumber
677
+ to_image_args["resolution"] = render_resolution
678
+
635
679
  actual_scale_x = render_resolution / 72.0
636
680
  actual_scale_y = render_resolution / 72.0
637
681
  logger.debug(
@@ -657,11 +701,11 @@ class HighlightingService:
657
701
  if page_obj.width > 0:
658
702
  actual_scale_x = base_image_pil.width / page_obj.width
659
703
  else:
660
- actual_scale_x = scale # Fallback
704
+ actual_scale_x = resolution / 72.0 # Fallback to resolution-based scale
661
705
  if page_obj.height > 0:
662
706
  actual_scale_y = base_image_pil.height / page_obj.height
663
707
  else:
664
- actual_scale_y = scale # Fallback
708
+ actual_scale_y = resolution / 72.0 # Fallback to resolution-based scale
665
709
  logger.debug(
666
710
  f"Calculated actual scales for page {page_index}: x={actual_scale_x:.2f}, y={actual_scale_y:.2f}"
667
711
  )
@@ -682,14 +726,20 @@ class HighlightingService:
682
726
  page=page_obj,
683
727
  base_image=base_image_pil,
684
728
  highlights=highlights_on_page,
685
- scale=renderer_scale, # Use the determined actual scale
729
+ scale_factor=renderer_scale, # Use the determined actual scale
686
730
  render_ocr=render_ocr,
687
731
  )
688
732
  rendered_image = renderer.render()
689
733
  else:
690
734
  if render_ocr:
691
735
  # Still render OCR even if no highlights, using the determined actual scale
692
- renderer = HighlightRenderer(page_obj, base_image_pil, [], renderer_scale, True)
736
+ renderer = HighlightRenderer(
737
+ page=page_obj,
738
+ base_image=base_image_pil,
739
+ highlights=[],
740
+ scale_factor=renderer_scale,
741
+ render_ocr=True,
742
+ )
693
743
  rendered_image = renderer.render()
694
744
  else:
695
745
  rendered_image = base_image_pil # No highlights, no OCR requested
@@ -722,11 +772,10 @@ class HighlightingService:
722
772
  self,
723
773
  page_index: int,
724
774
  temporary_highlights: List[Dict],
725
- scale: float = 2.0,
775
+ resolution: float = 144,
726
776
  labels: bool = True,
727
777
  legend_position: str = "right",
728
778
  render_ocr: bool = False,
729
- resolution: Optional[float] = None,
730
779
  crop_bbox: Optional[Tuple[float, float, float, float]] = None,
731
780
  **kwargs,
732
781
  ) -> Optional[Image.Image]:
@@ -737,11 +786,11 @@ class HighlightingService:
737
786
  Args:
738
787
  page_index: Index of the page to render.
739
788
  temporary_highlights: List of highlight data dicts (from ElementCollection._prepare).
740
- scale: Original scale factor for rendering, used if width/height are not provided.
789
+ resolution: Resolution (DPI) for base page image rendering if width/height not used.
790
+ Defaults to 144 DPI (equivalent to previous scale=2.0).
741
791
  labels: Whether to include a legend.
742
792
  legend_position: Position of the legend.
743
793
  render_ocr: Whether to render OCR text.
744
- resolution: Resolution for base page image rendering if width/height not used.
745
794
  crop_bbox: Optional bounding box (x0, top, x1, bottom) in PDF coordinate
746
795
  space to crop the output image to, before legends or other overlays are
747
796
  applied. If None, no cropping is performed.
@@ -777,9 +826,11 @@ class HighlightingService:
777
826
  # Resolution is implicitly handled by pdfplumber when height is set
778
827
  # after image is created, we will calculate actual_scale_x and actual_scale_y
779
828
  else:
780
- # Neither width nor height is provided, use resolution or scale.
781
- render_resolution = resolution if resolution is not None else scale * 72
829
+ # Neither width nor height is provided, rely on `resolution`.
830
+ # If `resolution` was explicitly passed as `None`, fall back to 144 DPI.
831
+ render_resolution = 144 if resolution is None else resolution
782
832
  to_image_args["resolution"] = render_resolution
833
+
783
834
  actual_scale_x = render_resolution / 72.0
784
835
  actual_scale_y = render_resolution / 72.0
785
836
  logger.debug(
@@ -804,11 +855,11 @@ class HighlightingService:
804
855
  if page_obj.width > 0:
805
856
  actual_scale_x = base_image_pil.width / page_obj.width
806
857
  else:
807
- actual_scale_x = scale # Fallback to original scale
858
+ actual_scale_x = resolution / 72.0 # Fallback to resolution-based scale
808
859
  if page_obj.height > 0:
809
860
  actual_scale_y = base_image_pil.height / page_obj.height
810
861
  else:
811
- actual_scale_y = scale # Fallback to original scale
862
+ actual_scale_y = resolution / 72.0 # Fallback to resolution-based scale
812
863
  logger.debug(
813
864
  f"Calculated actual scales for page {page_index}: x={actual_scale_x:.2f}, y={actual_scale_y:.2f} from image size {base_image_pil.size} and page size ({page_obj.width}, {page_obj.height})"
814
865
  )
@@ -855,7 +906,11 @@ class HighlightingService:
855
906
  renderer_scale = actual_scale_x
856
907
 
857
908
  renderer = HighlightRenderer(
858
- page_obj, base_image_pil, preview_highlights, renderer_scale, render_ocr
909
+ page=page_obj,
910
+ base_image=base_image_pil,
911
+ highlights=preview_highlights,
912
+ scale_factor=renderer_scale,
913
+ render_ocr=render_ocr,
859
914
  )
860
915
  rendered_image = renderer.render()
861
916
 
natural_pdf/core/page.py CHANGED
@@ -867,6 +867,28 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
867
867
  >>> page.region(right=200, width=50) # Region from x=150 to x=200
868
868
  >>> page.region(top=100, bottom=200, width="full") # Explicit full width
869
869
  """
870
+ # ------------------------------------------------------------------
871
+ # Percentage support – convert strings like "30%" to absolute values
872
+ # based on page dimensions. X-axis params (left, right, width) use
873
+ # page.width; Y-axis params (top, bottom, height) use page.height.
874
+ # ------------------------------------------------------------------
875
+
876
+ def _pct_to_abs(val, axis: str):
877
+ if isinstance(val, str) and val.strip().endswith("%"):
878
+ try:
879
+ pct = float(val.strip()[:-1]) / 100.0
880
+ except ValueError:
881
+ return val # leave unchanged if not a number
882
+ return pct * (self.width if axis == "x" else self.height)
883
+ return val
884
+
885
+ left = _pct_to_abs(left, "x")
886
+ right = _pct_to_abs(right, "x")
887
+ width = _pct_to_abs(width, "x")
888
+ top = _pct_to_abs(top, "y")
889
+ bottom = _pct_to_abs(bottom, "y")
890
+ height = _pct_to_abs(height, "y")
891
+
870
892
  # --- Type checking and basic validation ---
871
893
  is_width_numeric = isinstance(width, (int, float))
872
894
  is_width_string = isinstance(width, str)
@@ -1137,6 +1159,40 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
1137
1159
  user_kwargs=kwargs, # Pass original user kwargs
1138
1160
  )
1139
1161
 
1162
+ # --- Optional: apply Unicode BiDi algorithm for mixed RTL/LTR correctness ---
1163
+ apply_bidi = kwargs.get("bidi", True)
1164
+ if apply_bidi and result:
1165
+ # Quick check for any RTL character
1166
+ import unicodedata
1167
+
1168
+ def _contains_rtl(s):
1169
+ return any(unicodedata.bidirectional(ch) in ("R", "AL", "AN") for ch in s)
1170
+
1171
+ if _contains_rtl(result):
1172
+ try:
1173
+ from bidi.algorithm import get_display # type: ignore
1174
+ from natural_pdf.utils.bidi_mirror import mirror_brackets
1175
+
1176
+ result = "\n".join(
1177
+ mirror_brackets(
1178
+ get_display(
1179
+ line,
1180
+ base_dir=(
1181
+ "R"
1182
+ if any(
1183
+ unicodedata.bidirectional(ch)
1184
+ in ("R", "AL", "AN")
1185
+ for ch in line
1186
+ )
1187
+ else "L"
1188
+ ),
1189
+ )
1190
+ )
1191
+ for line in result.split("\n")
1192
+ )
1193
+ except ModuleNotFoundError:
1194
+ pass # silently skip if python-bidi not available
1195
+
1140
1196
  logger.debug(f"Page {self.number}: extract_text finished, result length: {len(result)}.")
1141
1197
  return result
1142
1198
 
@@ -1440,7 +1496,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
1440
1496
 
1441
1497
  def show(
1442
1498
  self,
1443
- scale: float = 2.0,
1499
+ resolution: float = 144,
1444
1500
  width: Optional[int] = None,
1445
1501
  labels: bool = True,
1446
1502
  legend_position: str = "right",
@@ -1450,7 +1506,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
1450
1506
  Generates and returns an image of the page with persistent highlights rendered.
1451
1507
 
1452
1508
  Args:
1453
- scale: Scale factor for rendering.
1509
+ resolution: Resolution in DPI for rendering (default: 144 DPI, equivalent to previous scale=2.0).
1454
1510
  width: Optional width for the output image.
1455
1511
  labels: Whether to include a legend for labels.
1456
1512
  legend_position: Position of the legend.
@@ -1460,7 +1516,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
1460
1516
  PIL Image object of the page with highlights, or None if rendering fails.
1461
1517
  """
1462
1518
  return self.to_image(
1463
- scale=scale,
1519
+ resolution=resolution,
1464
1520
  width=width,
1465
1521
  labels=labels,
1466
1522
  legend_position=legend_position,
@@ -1471,13 +1527,12 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
1471
1527
  def save_image(
1472
1528
  self,
1473
1529
  filename: str,
1474
- scale: float = 2.0,
1475
1530
  width: Optional[int] = None,
1476
1531
  labels: bool = True,
1477
1532
  legend_position: str = "right",
1478
1533
  render_ocr: bool = False,
1479
1534
  include_highlights: bool = True, # Allow saving without highlights
1480
- resolution: Optional[float] = None,
1535
+ resolution: float = 144,
1481
1536
  **kwargs,
1482
1537
  ) -> "Page":
1483
1538
  """
@@ -1485,13 +1540,12 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
1485
1540
 
1486
1541
  Args:
1487
1542
  filename: Path to save the image to.
1488
- scale: Scale factor for rendering highlights.
1489
1543
  width: Optional width for the output image.
1490
1544
  labels: Whether to include a legend.
1491
1545
  legend_position: Position of the legend.
1492
1546
  render_ocr: Whether to render OCR text.
1493
1547
  include_highlights: Whether to render highlights.
1494
- resolution: Resolution for base image rendering.
1548
+ resolution: Resolution in DPI for base image rendering (default: 144 DPI, equivalent to previous scale=2.0).
1495
1549
  **kwargs: Additional args for pdfplumber's to_image.
1496
1550
 
1497
1551
  Returns:
@@ -1500,7 +1554,6 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
1500
1554
  # Use to_image to generate and save the image
1501
1555
  self.to_image(
1502
1556
  path=filename,
1503
- scale=scale,
1504
1557
  width=width,
1505
1558
  labels=labels,
1506
1559
  legend_position=legend_position,
@@ -1554,7 +1607,6 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
1554
1607
  def to_image(
1555
1608
  self,
1556
1609
  path: Optional[str] = None,
1557
- scale: float = 2.0,
1558
1610
  width: Optional[int] = None,
1559
1611
  labels: bool = True,
1560
1612
  legend_position: str = "right",
@@ -1569,12 +1621,11 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
1569
1621
 
1570
1622
  Args:
1571
1623
  path: Optional path to save the image to.
1572
- scale: Scale factor for rendering highlights.
1573
1624
  width: Optional width for the output image.
1574
1625
  labels: Whether to include a legend for highlights.
1575
1626
  legend_position: Position of the legend.
1576
1627
  render_ocr: Whether to render OCR text on highlights.
1577
- resolution: Resolution in DPI for base page image (default: scale * 72).
1628
+ resolution: Resolution in DPI for base page image. If None, uses global setting or defaults to 144 DPI.
1578
1629
  include_highlights: Whether to render highlights.
1579
1630
  exclusions: Accepts one of the following:
1580
1631
  • None – no masking (default)
@@ -1593,11 +1644,13 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
1593
1644
  # Use global options if parameters are not explicitly set
1594
1645
  if width is None:
1595
1646
  width = natural_pdf.options.image.width
1596
- if resolution is None and natural_pdf.options.image.resolution is not None:
1597
- resolution = natural_pdf.options.image.resolution
1647
+ if resolution is None:
1648
+ if natural_pdf.options.image.resolution is not None:
1649
+ resolution = natural_pdf.options.image.resolution
1650
+ else:
1651
+ resolution = 144 # Default resolution when none specified
1598
1652
  # 1. Create cache key (excluding path)
1599
1653
  cache_key_parts = [
1600
- scale,
1601
1654
  width,
1602
1655
  labels,
1603
1656
  legend_position,
@@ -1641,7 +1694,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
1641
1694
  rendered_image_component: Optional[Image.Image] = (
1642
1695
  None # Renamed from 'image' in original
1643
1696
  )
1644
- render_resolution = resolution if resolution is not None else scale * 72
1697
+ render_resolution = resolution
1645
1698
  thread_id = threading.current_thread().name
1646
1699
  logger.debug(
1647
1700
  f"[{thread_id}] Page {self.index}: Attempting to acquire pdf_render_lock for to_image..."
@@ -1658,11 +1711,10 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
1658
1711
  # Delegate rendering to the central service
1659
1712
  rendered_image_component = self._highlighter.render_page(
1660
1713
  page_index=self.index,
1661
- scale=scale,
1714
+ resolution=render_resolution,
1662
1715
  labels=labels,
1663
1716
  legend_position=legend_position,
1664
1717
  render_ocr=render_ocr,
1665
- resolution=render_resolution, # Pass the calculated resolution
1666
1718
  **kwargs,
1667
1719
  )
1668
1720
  else:
@@ -2336,7 +2388,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
2336
2388
  def show_preview(
2337
2389
  self,
2338
2390
  temporary_highlights: List[Dict],
2339
- scale: float = 2.0,
2391
+ resolution: float = 144,
2340
2392
  width: Optional[int] = None,
2341
2393
  labels: bool = True,
2342
2394
  legend_position: str = "right",
@@ -2349,7 +2401,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
2349
2401
  Args:
2350
2402
  temporary_highlights: List of highlight data dictionaries (as prepared by
2351
2403
  ElementCollection._prepare_highlight_data).
2352
- scale: Scale factor for rendering.
2404
+ resolution: Resolution in DPI for rendering (default: 144 DPI, equivalent to previous scale=2.0).
2353
2405
  width: Optional width for the output image.
2354
2406
  labels: Whether to include a legend.
2355
2407
  legend_position: Position of the legend.
@@ -2363,7 +2415,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
2363
2415
  img = self._highlighter.render_preview(
2364
2416
  page_index=self.index,
2365
2417
  temporary_highlights=temporary_highlights,
2366
- scale=scale,
2418
+ resolution=resolution,
2367
2419
  labels=labels,
2368
2420
  legend_position=legend_position,
2369
2421
  render_ocr=render_ocr,
@@ -2897,3 +2949,17 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMi
2897
2949
  properties, and other details for each element
2898
2950
  """
2899
2951
  return self.find_all('*').inspect(limit=limit)
2952
+
2953
+ @property
2954
+ def lines(self) -> List[Any]:
2955
+ """Get all line elements on this page."""
2956
+ return self._element_mgr.lines
2957
+
2958
+ # ------------------------------------------------------------------
2959
+ # Image elements
2960
+ # ------------------------------------------------------------------
2961
+
2962
+ @property
2963
+ def images(self) -> List[Any]:
2964
+ """Get all embedded raster images on this page."""
2965
+ return self._element_mgr.images
natural_pdf/core/pdf.py CHANGED
@@ -653,8 +653,6 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
653
653
  raise ValueError("Internal error: No selector or text provided.")
654
654
 
655
655
  selector_obj = parse_selector(effective_selector)
656
- kwargs["regex"] = regex
657
- kwargs["case"] = case
658
656
 
659
657
  # Search page by page
660
658
  for page in self.pages:
@@ -269,15 +269,28 @@ def _get_columns_for_type(element_type: str, show_page_column: bool) -> List[str
269
269
  base_columns = ['x0', 'top', 'x1', 'bottom']
270
270
 
271
271
  if element_type == 'word':
272
- columns = ['text'] + base_columns + ['font_family', 'font_variant', 'size', 'bold', 'italic', 'source', 'confidence']
273
- # Add color for text elements
272
+ columns = ['text'] + base_columns + [
273
+ 'font_family',
274
+ 'font_variant',
275
+ 'size',
276
+ 'bold',
277
+ 'italic',
278
+ 'strike',
279
+ 'underline',
280
+ 'highlight',
281
+ 'source',
282
+ 'confidence',
283
+ ]
284
+ # Add foreground text colour too
274
285
  columns.append('color')
275
286
  elif element_type == 'rect':
276
287
  columns = base_columns + ['width', 'height', 'stroke', 'fill', 'stroke_width']
277
288
  elif element_type == 'line':
278
289
  columns = base_columns + ['width', 'is_horizontal', 'is_vertical'] # LineElement properties
279
290
  elif element_type == 'region':
280
- columns = base_columns + ['width', 'height', 'type']
291
+ columns = base_columns + ['width', 'height', 'type', 'color']
292
+ elif element_type == 'blob':
293
+ columns = base_columns + ['width', 'height', 'color']
281
294
  else:
282
295
  columns = base_columns + ['type']
283
296
 
@@ -325,19 +338,37 @@ def _extract_element_value(element: "Element", column: str) -> Any:
325
338
  return fontname.split("+", 1)[0]
326
339
  return ''
327
340
 
328
- elif column in ['bold', 'italic']:
341
+ elif column in ['bold', 'italic', 'strike', 'underline']:
329
342
  value = getattr(element, column, False)
330
343
  return value if isinstance(value, bool) else False
331
344
 
345
+ elif column == 'highlight':
346
+ # If element is highlighted, return its colour; otherwise blank
347
+ if getattr(element, 'highlight', False):
348
+ col_val = getattr(element, 'highlight_color', None)
349
+ if col_val is None:
350
+ return 'True' # fallback if colour missing
351
+ # Convert tuple to hex
352
+ if isinstance(col_val, (tuple, list)) and len(col_val) >= 3:
353
+ try:
354
+ r, g, b = [int(v * 255) if v <= 1 else int(v) for v in col_val[:3]]
355
+ return f"#{r:02x}{g:02x}{b:02x}"
356
+ except Exception:
357
+ return str(col_val)
358
+ return str(col_val)
359
+ return ''
360
+
332
361
  elif column in ['stroke', 'fill', 'color']:
333
- # For rectangles and text, these return color tuples
334
362
  value = getattr(element, column, None)
363
+ # If already a string (e.g. '#ff00aa' or 'red') return as is
364
+ if isinstance(value, str):
365
+ return value
366
+ # If tuple/list convert to hex
335
367
  if value and isinstance(value, (tuple, list)) and len(value) >= 3:
336
- # Convert to hex color for display
337
368
  try:
338
369
  r, g, b = [int(v * 255) if v <= 1 else int(v) for v in value[:3]]
339
370
  return f"#{r:02x}{g:02x}{b:02x}"
340
- except:
371
+ except Exception:
341
372
  return str(value)
342
373
  return ""
343
374
 
@@ -406,7 +437,7 @@ def describe_element(element: "Element") -> "ElementSummary":
406
437
 
407
438
  # Add common text properties - use dict structure for proper list formatting
408
439
  text_props = {}
409
- for prop in ['font_family', 'size', 'bold', 'italic', 'source', 'confidence']:
440
+ for prop in ['font_family', 'size', 'bold', 'italic', 'strike', 'underline', 'highlight', 'source', 'confidence']:
410
441
  if hasattr(element, prop):
411
442
  value = getattr(element, prop)
412
443
  if value is not None:
@@ -414,7 +445,7 @@ def describe_element(element: "Element") -> "ElementSummary":
414
445
  text_props[prop] = round(value, 3)
415
446
  elif prop == 'size' and isinstance(value, (int, float)):
416
447
  text_props[prop] = round(value, 1)
417
- elif prop in ['bold', 'italic']:
448
+ elif prop in ['bold', 'italic', 'strike', 'underline']:
418
449
  text_props[prop] = value
419
450
  else:
420
451
  text_props[prop] = value