natural-pdf 0.1.38__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +11 -6
- natural_pdf/analyzers/__init__.py +6 -1
- natural_pdf/analyzers/guides.py +354 -258
- natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
- natural_pdf/analyzers/layout/layout_manager.py +18 -4
- natural_pdf/analyzers/layout/paddle.py +11 -0
- natural_pdf/analyzers/layout/surya.py +2 -3
- natural_pdf/analyzers/shape_detection_mixin.py +25 -34
- natural_pdf/analyzers/text_structure.py +2 -2
- natural_pdf/classification/manager.py +1 -1
- natural_pdf/collections/mixins.py +3 -2
- natural_pdf/core/highlighting_service.py +743 -32
- natural_pdf/core/page.py +252 -399
- natural_pdf/core/page_collection.py +1249 -0
- natural_pdf/core/pdf.py +231 -89
- natural_pdf/{collections → core}/pdf_collection.py +18 -11
- natural_pdf/core/render_spec.py +335 -0
- natural_pdf/describe/base.py +1 -1
- natural_pdf/elements/__init__.py +1 -0
- natural_pdf/elements/base.py +108 -83
- natural_pdf/elements/{collections.py → element_collection.py} +575 -1372
- natural_pdf/elements/line.py +0 -1
- natural_pdf/elements/rect.py +0 -1
- natural_pdf/elements/region.py +405 -280
- natural_pdf/elements/text.py +9 -7
- natural_pdf/exporters/base.py +2 -2
- natural_pdf/exporters/original_pdf.py +1 -1
- natural_pdf/exporters/paddleocr.py +2 -4
- natural_pdf/exporters/searchable_pdf.py +3 -2
- natural_pdf/extraction/mixin.py +1 -3
- natural_pdf/flows/collections.py +1 -69
- natural_pdf/flows/element.py +25 -0
- natural_pdf/flows/flow.py +1658 -19
- natural_pdf/flows/region.py +757 -263
- natural_pdf/ocr/ocr_options.py +0 -2
- natural_pdf/ocr/utils.py +2 -1
- natural_pdf/qa/document_qa.py +21 -5
- natural_pdf/search/search_service_protocol.py +1 -1
- natural_pdf/selectors/parser.py +35 -2
- natural_pdf/tables/result.py +35 -1
- natural_pdf/text_mixin.py +101 -0
- natural_pdf/utils/debug.py +2 -1
- natural_pdf/utils/highlighting.py +1 -0
- natural_pdf/utils/layout.py +2 -2
- natural_pdf/utils/packaging.py +4 -3
- natural_pdf/utils/text_extraction.py +15 -12
- natural_pdf/utils/visualization.py +385 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -52
- optimization/memory_comparison.py +1 -1
- optimization/pdf_analyzer.py +2 -2
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0
natural_pdf/elements/text.py
CHANGED
@@ -230,7 +230,9 @@ class TextElement(Element):
|
|
230
230
|
# Default to black
|
231
231
|
return (0, 0, 0)
|
232
232
|
|
233
|
-
def extract_text(
|
233
|
+
def extract_text(
|
234
|
+
self, keep_blank_chars=True, strip: Optional[bool] = True, content_filter=None, **kwargs
|
235
|
+
) -> str:
|
234
236
|
"""
|
235
237
|
Extract text from this element.
|
236
238
|
|
@@ -253,22 +255,22 @@ class TextElement(Element):
|
|
253
255
|
# Apply content filtering if provided
|
254
256
|
if content_filter is not None and result:
|
255
257
|
import re
|
256
|
-
|
258
|
+
|
257
259
|
if isinstance(content_filter, str):
|
258
260
|
# Single regex pattern - remove matching characters
|
259
261
|
try:
|
260
|
-
result = re.sub(content_filter,
|
262
|
+
result = re.sub(content_filter, "", result)
|
261
263
|
except re.error:
|
262
264
|
pass # Invalid regex, skip filtering
|
263
|
-
|
265
|
+
|
264
266
|
elif isinstance(content_filter, list):
|
265
267
|
# List of regex patterns - remove characters matching ANY pattern
|
266
268
|
try:
|
267
269
|
for pattern in content_filter:
|
268
|
-
result = re.sub(pattern,
|
270
|
+
result = re.sub(pattern, "", result)
|
269
271
|
except re.error:
|
270
272
|
pass # Invalid regex, skip filtering
|
271
|
-
|
273
|
+
|
272
274
|
elif callable(content_filter):
|
273
275
|
# Callable filter - apply to individual characters
|
274
276
|
try:
|
@@ -276,7 +278,7 @@ class TextElement(Element):
|
|
276
278
|
for char in result:
|
277
279
|
if content_filter(char):
|
278
280
|
filtered_chars.append(char)
|
279
|
-
result =
|
281
|
+
result = "".join(filtered_chars)
|
280
282
|
except Exception:
|
281
283
|
pass # Function error, skip filtering
|
282
284
|
|
natural_pdf/exporters/base.py
CHANGED
@@ -3,8 +3,8 @@ import logging
|
|
3
3
|
from typing import TYPE_CHECKING, List, Union
|
4
4
|
|
5
5
|
if TYPE_CHECKING:
|
6
|
-
from natural_pdf.collections.pdf_collection import PDFCollection
|
7
6
|
from natural_pdf.core.pdf import PDF
|
7
|
+
from natural_pdf.core.pdf_collection import PDFCollection
|
8
8
|
|
9
9
|
logger = logging.getLogger(__name__)
|
10
10
|
|
@@ -40,8 +40,8 @@ class FinetuneExporter(abc.ABC):
|
|
40
40
|
"""
|
41
41
|
Helper to consistently resolve the input source to a list of PDF objects.
|
42
42
|
"""
|
43
|
-
from natural_pdf.collections.pdf_collection import PDFCollection # Avoid circular import
|
44
43
|
from natural_pdf.core.pdf import PDF # Avoid circular import at module level
|
44
|
+
from natural_pdf.core.pdf_collection import PDFCollection # Avoid circular import
|
45
45
|
|
46
46
|
pdfs_to_process: List["PDF"] = []
|
47
47
|
if isinstance(source, PDF):
|
@@ -17,8 +17,8 @@ except ImportError:
|
|
17
17
|
|
18
18
|
if TYPE_CHECKING:
|
19
19
|
from natural_pdf.core.page import Page
|
20
|
+
from natural_pdf.core.page_collection import PageCollection
|
20
21
|
from natural_pdf.core.pdf import PDF
|
21
|
-
from natural_pdf.elements.collections import PageCollection
|
22
22
|
|
23
23
|
logger = logging.getLogger(__name__)
|
24
24
|
|
@@ -13,8 +13,8 @@ from natural_pdf.exporters.base import FinetuneExporter
|
|
13
13
|
from natural_pdf.utils.identifiers import generate_short_path_hash
|
14
14
|
|
15
15
|
if TYPE_CHECKING:
|
16
|
-
from natural_pdf.collections.pdf_collection import PDFCollection
|
17
16
|
from natural_pdf.core.pdf import PDF
|
17
|
+
from natural_pdf.core.pdf_collection import PDFCollection
|
18
18
|
from natural_pdf.elements.text import TextElement
|
19
19
|
|
20
20
|
logger = logging.getLogger(__name__)
|
@@ -216,9 +216,7 @@ class PaddleOCRRecognitionExporter(FinetuneExporter):
|
|
216
216
|
try:
|
217
217
|
# Expand region, render, and save image
|
218
218
|
region = element.expand(self.padding)
|
219
|
-
img = region.
|
220
|
-
resolution=self.resolution, crop=True, include_highlights=False
|
221
|
-
)
|
219
|
+
img = region.render(resolution=self.resolution, crop=True)
|
222
220
|
img.save(absolute_image_path, "PNG")
|
223
221
|
|
224
222
|
# Add to labels and character set
|
@@ -28,8 +28,8 @@ except ImportError:
|
|
28
28
|
|
29
29
|
if TYPE_CHECKING:
|
30
30
|
from natural_pdf.core.page import Page
|
31
|
+
from natural_pdf.core.page_collection import PageCollection
|
31
32
|
from natural_pdf.core.pdf import PDF
|
32
|
-
from natural_pdf.elements.collections import PageCollection
|
33
33
|
|
34
34
|
|
35
35
|
logger = logging.getLogger(__name__)
|
@@ -345,7 +345,8 @@ def create_searchable_pdf(
|
|
345
345
|
# 1. Render page image at target DPI
|
346
346
|
logger.debug(f" Rendering page {i} to image ({dpi} DPI)...")
|
347
347
|
# Use the Page's to_image method
|
348
|
-
|
348
|
+
# Use render() for clean image without highlights
|
349
|
+
pil_image = page.render(resolution=dpi)
|
349
350
|
pil_image.save(img_path, format="PNG")
|
350
351
|
img_width, img_height = pil_image.size
|
351
352
|
logger.debug(f" Image saved to {img_path} ({img_width}x{img_height})")
|
natural_pdf/extraction/mixin.py
CHANGED
@@ -94,10 +94,8 @@ class ExtractionMixin(ABC):
|
|
94
94
|
resolution = kwargs.pop("resolution", 72)
|
95
95
|
include_highlights = kwargs.pop("include_highlights", False)
|
96
96
|
labels = kwargs.pop("labels", False)
|
97
|
-
return self.
|
97
|
+
return self.render(
|
98
98
|
resolution=resolution,
|
99
|
-
include_highlights=include_highlights,
|
100
|
-
labels=labels,
|
101
99
|
**kwargs,
|
102
100
|
)
|
103
101
|
else:
|
natural_pdf/flows/collections.py
CHANGED
@@ -8,7 +8,7 @@ if TYPE_CHECKING:
|
|
8
8
|
# from PIL.Image import Image as PIL_Image # No longer needed with Image.Image type hint
|
9
9
|
from natural_pdf.core.page import Page as PhysicalPage
|
10
10
|
from natural_pdf.elements.base import Element as PhysicalElement
|
11
|
-
from natural_pdf.elements.
|
11
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
12
12
|
|
13
13
|
from .element import FlowElement
|
14
14
|
from .flow import Flow # Though not directly used in __init__, FlowRegion needs it.
|
@@ -632,73 +632,5 @@ class FlowRegionCollection(MutableSequence[T_FRC]):
|
|
632
632
|
all_cropped_images.extend(fr.to_images(resolution=resolution, **kwargs))
|
633
633
|
return all_cropped_images
|
634
634
|
|
635
|
-
def to_image(
|
636
|
-
self,
|
637
|
-
stack_direction: str = "vertical",
|
638
|
-
background_color=(255, 255, 255),
|
639
|
-
gap: int = 5,
|
640
|
-
**kwargs_for_constituent_to_image,
|
641
|
-
) -> Optional[Image.Image]:
|
642
|
-
"""
|
643
|
-
Creates a single composite image by stacking the composite images of each FlowRegion.
|
644
|
-
Each FlowRegion's composite is generated by its own .to_image() method.
|
645
|
-
These are then stacked.
|
646
|
-
|
647
|
-
Args:
|
648
|
-
stack_direction: "vertical" or "horizontal".
|
649
|
-
background_color: Background for the final composite.
|
650
|
-
gap: Gap in pixels between stacked FlowRegion images.
|
651
|
-
**kwargs_for_constituent_to_image: Passed to each FlowRegion.to_image().
|
652
|
-
"""
|
653
|
-
if not self._flow_regions:
|
654
|
-
return None
|
655
|
-
|
656
|
-
region_composites: List[Image.Image] = []
|
657
|
-
for fr in self._flow_regions:
|
658
|
-
img = fr.to_image(background_color=background_color, **kwargs_for_constituent_to_image)
|
659
|
-
if img:
|
660
|
-
region_composites.append(img)
|
661
|
-
|
662
|
-
if not region_composites:
|
663
|
-
return None
|
664
|
-
if len(region_composites) == 1:
|
665
|
-
return region_composites[0]
|
666
|
-
|
667
|
-
if stack_direction == "vertical":
|
668
|
-
final_width = max(img.width for img in region_composites)
|
669
|
-
final_height = (
|
670
|
-
sum(img.height for img in region_composites) + (len(region_composites) - 1) * gap
|
671
|
-
)
|
672
|
-
if final_width == 0 or final_height == 0:
|
673
|
-
return None
|
674
|
-
|
675
|
-
new_image = Image.new("RGB", (final_width, final_height), background_color)
|
676
|
-
current_y = 0
|
677
|
-
for img in region_composites:
|
678
|
-
# Align to left for vertical stacking
|
679
|
-
new_image.paste(img, (0, current_y))
|
680
|
-
current_y += img.height + gap
|
681
|
-
return new_image
|
682
|
-
elif stack_direction == "horizontal":
|
683
|
-
final_width = (
|
684
|
-
sum(img.width for img in region_composites) + (len(region_composites) - 1) * gap
|
685
|
-
)
|
686
|
-
final_height = max(img.height for img in region_composites)
|
687
|
-
if final_width == 0 or final_height == 0:
|
688
|
-
return None
|
689
|
-
|
690
|
-
new_image = Image.new("RGB", (final_width, final_height), background_color)
|
691
|
-
current_x = 0
|
692
|
-
for img in region_composites:
|
693
|
-
# Align to top for horizontal stacking
|
694
|
-
new_image.paste(img, (current_x, 0))
|
695
|
-
current_x += img.width + gap
|
696
|
-
return new_image
|
697
|
-
else:
|
698
|
-
logger.warning(
|
699
|
-
f"Invalid stack_direction: {stack_direction}. Must be 'vertical' or 'horizontal'."
|
700
|
-
)
|
701
|
-
return None # Or perhaps return the list of images?
|
702
|
-
|
703
635
|
def apply(self, func: Callable[["FlowRegion"], Any]) -> List[Any]:
|
704
636
|
return [func(fr) for fr in self._flow_regions]
|
natural_pdf/flows/element.py
CHANGED
@@ -73,6 +73,31 @@ class FlowElement:
|
|
73
73
|
"""Returns the physical page of the underlying element."""
|
74
74
|
return getattr(self.physical_object, "page", None)
|
75
75
|
|
76
|
+
def __getattr__(self, name: str) -> Any:
|
77
|
+
"""
|
78
|
+
Delegate unknown attribute access to the physical_object.
|
79
|
+
|
80
|
+
This ensures that attributes like 'type', 'region_type', 'source', 'model', etc.
|
81
|
+
from the physical element are accessible on the FlowElement wrapper.
|
82
|
+
|
83
|
+
Args:
|
84
|
+
name: The attribute name being accessed
|
85
|
+
|
86
|
+
Returns:
|
87
|
+
The attribute value from physical_object
|
88
|
+
|
89
|
+
Raises:
|
90
|
+
AttributeError: If the attribute doesn't exist on physical_object either
|
91
|
+
"""
|
92
|
+
try:
|
93
|
+
return getattr(self.physical_object, name)
|
94
|
+
except AttributeError:
|
95
|
+
# Provide a helpful error message that mentions both FlowElement and physical_object
|
96
|
+
raise AttributeError(
|
97
|
+
f"'{type(self).__name__}' object has no attribute '{name}' "
|
98
|
+
f"(also not found on underlying {type(self.physical_object).__name__})"
|
99
|
+
)
|
100
|
+
|
76
101
|
def _flow_direction(
|
77
102
|
self,
|
78
103
|
direction: str, # "above", "below", "left", "right"
|