natural-pdf 0.1.38__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. natural_pdf/__init__.py +11 -6
  2. natural_pdf/analyzers/__init__.py +6 -1
  3. natural_pdf/analyzers/guides.py +354 -258
  4. natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -4
  6. natural_pdf/analyzers/layout/paddle.py +11 -0
  7. natural_pdf/analyzers/layout/surya.py +2 -3
  8. natural_pdf/analyzers/shape_detection_mixin.py +25 -34
  9. natural_pdf/analyzers/text_structure.py +2 -2
  10. natural_pdf/classification/manager.py +1 -1
  11. natural_pdf/collections/mixins.py +3 -2
  12. natural_pdf/core/highlighting_service.py +743 -32
  13. natural_pdf/core/page.py +252 -399
  14. natural_pdf/core/page_collection.py +1249 -0
  15. natural_pdf/core/pdf.py +231 -89
  16. natural_pdf/{collections → core}/pdf_collection.py +18 -11
  17. natural_pdf/core/render_spec.py +335 -0
  18. natural_pdf/describe/base.py +1 -1
  19. natural_pdf/elements/__init__.py +1 -0
  20. natural_pdf/elements/base.py +108 -83
  21. natural_pdf/elements/{collections.py → element_collection.py} +575 -1372
  22. natural_pdf/elements/line.py +0 -1
  23. natural_pdf/elements/rect.py +0 -1
  24. natural_pdf/elements/region.py +405 -280
  25. natural_pdf/elements/text.py +9 -7
  26. natural_pdf/exporters/base.py +2 -2
  27. natural_pdf/exporters/original_pdf.py +1 -1
  28. natural_pdf/exporters/paddleocr.py +2 -4
  29. natural_pdf/exporters/searchable_pdf.py +3 -2
  30. natural_pdf/extraction/mixin.py +1 -3
  31. natural_pdf/flows/collections.py +1 -69
  32. natural_pdf/flows/element.py +25 -0
  33. natural_pdf/flows/flow.py +1658 -19
  34. natural_pdf/flows/region.py +757 -263
  35. natural_pdf/ocr/ocr_options.py +0 -2
  36. natural_pdf/ocr/utils.py +2 -1
  37. natural_pdf/qa/document_qa.py +21 -5
  38. natural_pdf/search/search_service_protocol.py +1 -1
  39. natural_pdf/selectors/parser.py +35 -2
  40. natural_pdf/tables/result.py +35 -1
  41. natural_pdf/text_mixin.py +101 -0
  42. natural_pdf/utils/debug.py +2 -1
  43. natural_pdf/utils/highlighting.py +1 -0
  44. natural_pdf/utils/layout.py +2 -2
  45. natural_pdf/utils/packaging.py +4 -3
  46. natural_pdf/utils/text_extraction.py +15 -12
  47. natural_pdf/utils/visualization.py +385 -0
  48. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
  49. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -52
  50. optimization/memory_comparison.py +1 -1
  51. optimization/pdf_analyzer.py +2 -2
  52. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
  53. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
  54. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
  55. {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0
@@ -230,7 +230,9 @@ class TextElement(Element):
230
230
  # Default to black
231
231
  return (0, 0, 0)
232
232
 
233
- def extract_text(self, keep_blank_chars=True, strip: Optional[bool] = True, content_filter=None, **kwargs) -> str:
233
+ def extract_text(
234
+ self, keep_blank_chars=True, strip: Optional[bool] = True, content_filter=None, **kwargs
235
+ ) -> str:
234
236
  """
235
237
  Extract text from this element.
236
238
 
@@ -253,22 +255,22 @@ class TextElement(Element):
253
255
  # Apply content filtering if provided
254
256
  if content_filter is not None and result:
255
257
  import re
256
-
258
+
257
259
  if isinstance(content_filter, str):
258
260
  # Single regex pattern - remove matching characters
259
261
  try:
260
- result = re.sub(content_filter, '', result)
262
+ result = re.sub(content_filter, "", result)
261
263
  except re.error:
262
264
  pass # Invalid regex, skip filtering
263
-
265
+
264
266
  elif isinstance(content_filter, list):
265
267
  # List of regex patterns - remove characters matching ANY pattern
266
268
  try:
267
269
  for pattern in content_filter:
268
- result = re.sub(pattern, '', result)
270
+ result = re.sub(pattern, "", result)
269
271
  except re.error:
270
272
  pass # Invalid regex, skip filtering
271
-
273
+
272
274
  elif callable(content_filter):
273
275
  # Callable filter - apply to individual characters
274
276
  try:
@@ -276,7 +278,7 @@ class TextElement(Element):
276
278
  for char in result:
277
279
  if content_filter(char):
278
280
  filtered_chars.append(char)
279
- result = ''.join(filtered_chars)
281
+ result = "".join(filtered_chars)
280
282
  except Exception:
281
283
  pass # Function error, skip filtering
282
284
 
@@ -3,8 +3,8 @@ import logging
3
3
  from typing import TYPE_CHECKING, List, Union
4
4
 
5
5
  if TYPE_CHECKING:
6
- from natural_pdf.collections.pdf_collection import PDFCollection
7
6
  from natural_pdf.core.pdf import PDF
7
+ from natural_pdf.core.pdf_collection import PDFCollection
8
8
 
9
9
  logger = logging.getLogger(__name__)
10
10
 
@@ -40,8 +40,8 @@ class FinetuneExporter(abc.ABC):
40
40
  """
41
41
  Helper to consistently resolve the input source to a list of PDF objects.
42
42
  """
43
- from natural_pdf.collections.pdf_collection import PDFCollection # Avoid circular import
44
43
  from natural_pdf.core.pdf import PDF # Avoid circular import at module level
44
+ from natural_pdf.core.pdf_collection import PDFCollection # Avoid circular import
45
45
 
46
46
  pdfs_to_process: List["PDF"] = []
47
47
  if isinstance(source, PDF):
@@ -17,8 +17,8 @@ except ImportError:
17
17
 
18
18
  if TYPE_CHECKING:
19
19
  from natural_pdf.core.page import Page
20
+ from natural_pdf.core.page_collection import PageCollection
20
21
  from natural_pdf.core.pdf import PDF
21
- from natural_pdf.elements.collections import PageCollection
22
22
 
23
23
  logger = logging.getLogger(__name__)
24
24
 
@@ -13,8 +13,8 @@ from natural_pdf.exporters.base import FinetuneExporter
13
13
  from natural_pdf.utils.identifiers import generate_short_path_hash
14
14
 
15
15
  if TYPE_CHECKING:
16
- from natural_pdf.collections.pdf_collection import PDFCollection
17
16
  from natural_pdf.core.pdf import PDF
17
+ from natural_pdf.core.pdf_collection import PDFCollection
18
18
  from natural_pdf.elements.text import TextElement
19
19
 
20
20
  logger = logging.getLogger(__name__)
@@ -216,9 +216,7 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
216
216
  try:
217
217
  # Expand region, render, and save image
218
218
  region = element.expand(self.padding)
219
- img = region.to_image(
220
- resolution=self.resolution, crop=True, include_highlights=False
221
- )
219
+ img = region.render(resolution=self.resolution, crop=True)
222
220
  img.save(absolute_image_path, "PNG")
223
221
 
224
222
  # Add to labels and character set
@@ -28,8 +28,8 @@ except ImportError:
28
28
 
29
29
  if TYPE_CHECKING:
30
30
  from natural_pdf.core.page import Page
31
+ from natural_pdf.core.page_collection import PageCollection
31
32
  from natural_pdf.core.pdf import PDF
32
- from natural_pdf.elements.collections import PageCollection
33
33
 
34
34
 
35
35
  logger = logging.getLogger(__name__)
@@ -345,7 +345,8 @@ def create_searchable_pdf(
345
345
  # 1. Render page image at target DPI
346
346
  logger.debug(f" Rendering page {i} to image ({dpi} DPI)...")
347
347
  # Use the Page's to_image method
348
- pil_image = page.to_image(resolution=dpi, include_highlights=False)
348
+ # Use render() for clean image without highlights
349
+ pil_image = page.render(resolution=dpi)
349
350
  pil_image.save(img_path, format="PNG")
350
351
  img_width, img_height = pil_image.size
351
352
  logger.debug(f" Image saved to {img_path} ({img_width}x{img_height})")
@@ -94,10 +94,8 @@ class ExtractionMixin(ABC):
94
94
  resolution = kwargs.pop("resolution", 72)
95
95
  include_highlights = kwargs.pop("include_highlights", False)
96
96
  labels = kwargs.pop("labels", False)
97
- return self.to_image(
97
+ return self.render(
98
98
  resolution=resolution,
99
- include_highlights=include_highlights,
100
- labels=labels,
101
99
  **kwargs,
102
100
  )
103
101
  else:
@@ -8,7 +8,7 @@ if TYPE_CHECKING:
8
8
  # from PIL.Image import Image as PIL_Image # No longer needed with Image.Image type hint
9
9
  from natural_pdf.core.page import Page as PhysicalPage
10
10
  from natural_pdf.elements.base import Element as PhysicalElement
11
- from natural_pdf.elements.collections import ElementCollection
11
+ from natural_pdf.elements.element_collection import ElementCollection
12
12
 
13
13
  from .element import FlowElement
14
14
  from .flow import Flow # Though not directly used in __init__, FlowRegion needs it.
@@ -632,73 +632,5 @@ class FlowRegionCollection(MutableSequence[T_FRC]):
632
632
  all_cropped_images.extend(fr.to_images(resolution=resolution, **kwargs))
633
633
  return all_cropped_images
634
634
 
635
- def to_image(
636
- self,
637
- stack_direction: str = "vertical",
638
- background_color=(255, 255, 255),
639
- gap: int = 5,
640
- **kwargs_for_constituent_to_image,
641
- ) -> Optional[Image.Image]:
642
- """
643
- Creates a single composite image by stacking the composite images of each FlowRegion.
644
- Each FlowRegion's composite is generated by its own .to_image() method.
645
- These are then stacked.
646
-
647
- Args:
648
- stack_direction: "vertical" or "horizontal".
649
- background_color: Background for the final composite.
650
- gap: Gap in pixels between stacked FlowRegion images.
651
- **kwargs_for_constituent_to_image: Passed to each FlowRegion.to_image().
652
- """
653
- if not self._flow_regions:
654
- return None
655
-
656
- region_composites: List[Image.Image] = []
657
- for fr in self._flow_regions:
658
- img = fr.to_image(background_color=background_color, **kwargs_for_constituent_to_image)
659
- if img:
660
- region_composites.append(img)
661
-
662
- if not region_composites:
663
- return None
664
- if len(region_composites) == 1:
665
- return region_composites[0]
666
-
667
- if stack_direction == "vertical":
668
- final_width = max(img.width for img in region_composites)
669
- final_height = (
670
- sum(img.height for img in region_composites) + (len(region_composites) - 1) * gap
671
- )
672
- if final_width == 0 or final_height == 0:
673
- return None
674
-
675
- new_image = Image.new("RGB", (final_width, final_height), background_color)
676
- current_y = 0
677
- for img in region_composites:
678
- # Align to left for vertical stacking
679
- new_image.paste(img, (0, current_y))
680
- current_y += img.height + gap
681
- return new_image
682
- elif stack_direction == "horizontal":
683
- final_width = (
684
- sum(img.width for img in region_composites) + (len(region_composites) - 1) * gap
685
- )
686
- final_height = max(img.height for img in region_composites)
687
- if final_width == 0 or final_height == 0:
688
- return None
689
-
690
- new_image = Image.new("RGB", (final_width, final_height), background_color)
691
- current_x = 0
692
- for img in region_composites:
693
- # Align to top for horizontal stacking
694
- new_image.paste(img, (current_x, 0))
695
- current_x += img.width + gap
696
- return new_image
697
- else:
698
- logger.warning(
699
- f"Invalid stack_direction: {stack_direction}. Must be 'vertical' or 'horizontal'."
700
- )
701
- return None # Or perhaps return the list of images?
702
-
703
635
  def apply(self, func: Callable[["FlowRegion"], Any]) -> List[Any]:
704
636
  return [func(fr) for fr in self._flow_regions]
@@ -73,6 +73,31 @@ class FlowElement:
73
73
  """Returns the physical page of the underlying element."""
74
74
  return getattr(self.physical_object, "page", None)
75
75
 
76
+ def __getattr__(self, name: str) -> Any:
77
+ """
78
+ Delegate unknown attribute access to the physical_object.
79
+
80
+ This ensures that attributes like 'type', 'region_type', 'source', 'model', etc.
81
+ from the physical element are accessible on the FlowElement wrapper.
82
+
83
+ Args:
84
+ name: The attribute name being accessed
85
+
86
+ Returns:
87
+ The attribute value from physical_object
88
+
89
+ Raises:
90
+ AttributeError: If the attribute doesn't exist on physical_object either
91
+ """
92
+ try:
93
+ return getattr(self.physical_object, name)
94
+ except AttributeError:
95
+ # Provide a helpful error message that mentions both FlowElement and physical_object
96
+ raise AttributeError(
97
+ f"'{type(self).__name__}' object has no attribute '{name}' "
98
+ f"(also not found on underlying {type(self.physical_object).__name__})"
99
+ )
100
+
76
101
  def _flow_direction(
77
102
  self,
78
103
  direction: str, # "above", "below", "left", "right"