natural-pdf 0.1.38__py3-none-any.whl → 0.1.40__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +6 -0
- natural_pdf/core/page.py +21 -21
- natural_pdf/core/pdf.py +77 -24
- natural_pdf/elements/collections.py +164 -40
- natural_pdf/elements/region.py +90 -40
- natural_pdf/flows/element.py +25 -0
- natural_pdf/flows/flow.py +702 -20
- natural_pdf/flows/region.py +52 -4
- natural_pdf/selectors/parser.py +34 -1
- natural_pdf/text_mixin.py +97 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.1.40.dist-info}/METADATA +1 -1
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.1.40.dist-info}/RECORD +16 -15
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.1.40.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.1.40.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.1.40.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.1.40.dist-info}/top_level.txt +0 -0
natural_pdf/__init__.py
CHANGED
@@ -76,6 +76,9 @@ from natural_pdf.core.page import Page
 from natural_pdf.core.pdf import PDF
 from natural_pdf.elements.collections import ElementCollection
 from natural_pdf.elements.region import Region
+from natural_pdf.flows.flow import Flow
+from natural_pdf.flows.region import FlowRegion
+from natural_pdf.analyzers.guides import Guides
 
 ElementCollection = None
 
@@ -116,6 +119,9 @@ __all__ = [
     "Page",
     "Region",
     "ElementCollection",
+    "Flow",
+    "FlowRegion",
+    "Guides",
     "TextSearchOptions",
     "MultiModalSearchOptions",
     "BaseSearchOptions",
natural_pdf/core/page.py
CHANGED
@@ -64,7 +64,6 @@ from natural_pdf.core.element_manager import ElementManager
 from natural_pdf.describe.mixin import DescribeMixin  # Import describe mixin
 from natural_pdf.elements.base import Element  # Import base element
 from natural_pdf.elements.text import TextElement
-from natural_pdf.extraction.mixin import ExtractionMixin  # Import extraction mixin
 from natural_pdf.ocr import OCRManager, OCROptions
 from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
 from natural_pdf.qa import DocumentQA, get_qa_engine
@@ -76,8 +75,9 @@ from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerWidget
 
 # --- End Classification Imports --- #
 
-
-
+# --- Text update mixin import --- #
+from natural_pdf.text_mixin import TextMixin
+from natural_pdf.extraction.mixin import ExtractionMixin  # Import extraction mixin
 
 
 try:
@@ -92,7 +92,7 @@ except ImportError:
 logger = logging.getLogger(__name__)
 
 
-class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
+class Page(TextMixin, ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
     """Enhanced Page wrapper built on top of pdfplumber.Page.
 
     This class provides a fluent interface for working with PDF pages,
@@ -2886,25 +2886,25 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
         logger.info(f"Searchable PDF saved to: {output_path_str}")
 
     # --- Added correct_ocr method ---
-    def …
+    def update_text(
         self,
-        …
-        selector: …
+        transform: Callable[[Any], Optional[str]],
+        selector: str = "text",
         max_workers: Optional[int] = None,
         progress_callback: Optional[Callable[[], None]] = None,  # Added progress callback
     ) -> "Page":  # Return self for chaining
         """
-        Applies corrections to …
+        Applies corrections to text elements on this page
         using a user-provided callback function, potentially in parallel.
 
-        Finds text elements on this page …
-        …
-        …
-        … a new string.
+        Finds text elements on this page matching the *selector* argument and
+        calls the ``transform`` for each, passing the element itself.
+        Updates the element's text if the callback returns a new string.
 
         Args:
-            …
-            …
+            transform: A function accepting an element and returning
+                `Optional[str]` (new text or None).
+            selector: CSS-like selector string to match text elements.
             max_workers: The maximum number of threads to use for parallel execution.
                 If None or 0 or 1, runs sequentially.
             progress_callback: Optional callback function to call after processing each element.
@@ -2913,21 +2913,21 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
             Self for method chaining.
         """
         logger.info(
-            f"Page {self.number}: Starting …
+            f"Page {self.number}: Starting text update with callback '{transform.__name__}' (max_workers={max_workers}) and selector='{selector}'"
         )
 
         target_elements_collection = self.find_all(selector=selector, apply_exclusions=False)
         target_elements = target_elements_collection.elements  # Get the list
 
         if not target_elements:
-            logger.info(f"Page {self.number}: No …
+            logger.info(f"Page {self.number}: No text elements found to update.")
             return self
 
         element_pbar = None
         try:
             element_pbar = tqdm(
                 total=len(target_elements),
-                desc=f"…
+                desc=f"Updating text Page {self.number}",
                 unit="element",
                 leave=False,
             )
@@ -2941,7 +2941,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
             try:
                 current_text = getattr(element, "text", None)
                 # Call the user-provided callback
-                corrected_text = …
+                corrected_text = transform(element)
 
                 # Validate result type
                 if corrected_text is not None and not isinstance(corrected_text, str):
@@ -2976,7 +2976,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
         if max_workers is not None and max_workers > 1:
             # --- Parallel execution --- #
             logger.info(
-                f"Page {self.number}: Running …
+                f"Page {self.number}: Running text update in parallel with {max_workers} workers."
             )
             futures = []
             with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
@@ -3012,7 +3012,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
 
         else:
             # --- Sequential execution --- #
-            logger.info(f"Page {self.number}: Running …
+            logger.info(f"Page {self.number}: Running text update sequentially.")
             for element in target_elements:
                 # Call the task function directly (it handles progress_callback)
                 processed_count += 1
@@ -3027,7 +3027,7 @@ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin, DescribeMixin):
                 updated_count += 1
 
         logger.info(
-            f"Page {self.number}: …
+            f"Page {self.number}: Text update finished. Processed: {processed_count}/{len(target_elements)}, Updated: {updated_count}, Errors: {error_count}."
        )
 
         return self  # Return self for chaining
natural_pdf/core/pdf.py
CHANGED
@@ -39,6 +39,10 @@ from natural_pdf.extraction.mixin import ExtractionMixin
 from natural_pdf.ocr import OCRManager, OCROptions
 from natural_pdf.selectors.parser import parse_selector
 from natural_pdf.utils.locks import pdf_render_lock
+from natural_pdf.text_mixin import TextMixin
+
+if TYPE_CHECKING:
+    from natural_pdf.elements.collections import ElementCollection
 
 try:
     from typing import Any as TypingAny
@@ -247,7 +251,7 @@
 # --- End Lazy Page List Helper --- #
 
 
-class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
+class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin):
     """Enhanced PDF wrapper built on top of pdfplumber.
 
     This class provides a fluent interface for working with PDF documents,
@@ -1229,6 +1233,62 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
 
         return all_tables
 
+    def get_sections(
+        self,
+        start_elements=None,
+        end_elements=None,
+        new_section_on_page_break=False,
+        boundary_inclusion="both",
+    ) -> "ElementCollection":
+        """
+        Extract sections from the entire PDF based on start/end elements.
+
+        This method delegates to the PageCollection.get_sections() method,
+        providing a convenient way to extract document sections across all pages.
+
+        Args:
+            start_elements: Elements or selector string that mark the start of sections (optional)
+            end_elements: Elements or selector string that mark the end of sections (optional)
+            new_section_on_page_break: Whether to start a new section at page boundaries (default: False)
+            boundary_inclusion: How to include boundary elements: 'start', 'end', 'both', or 'none' (default: 'both')
+
+        Returns:
+            ElementCollection of Region objects representing the extracted sections
+
+        Example:
+            Extract sections between headers:
+            ```python
+            pdf = npdf.PDF("document.pdf")
+
+            # Get sections between headers
+            sections = pdf.get_sections(
+                start_elements='text[size>14]:bold',
+                end_elements='text[size>14]:bold'
+            )
+
+            # Get sections that break at page boundaries
+            sections = pdf.get_sections(
+                start_elements='text:contains("Chapter")',
+                new_section_on_page_break=True
+            )
+            ```
+
+        Note:
+            You can provide only start_elements, only end_elements, or both.
+            - With only start_elements: sections go from each start to the next start (or end of document)
+            - With only end_elements: sections go from beginning of document to each end
+            - With both: sections go from each start to the corresponding end
+        """
+        if not hasattr(self, "_pages"):
+            raise AttributeError("PDF pages not yet initialized.")
+
+        return self.pages.get_sections(
+            start_elements=start_elements,
+            end_elements=end_elements,
+            new_section_on_page_break=new_section_on_page_break,
+            boundary_inclusion=boundary_inclusion,
+        )
+
     def save_searchable(self, output_path: Union[str, "Path"], dpi: int = 300, **kwargs):
         """
         DEPRECATED: Use save_pdf(..., ocr=True) instead.
@@ -1703,32 +1763,28 @@
             logger.error(f"Failed to export correction task: {e}")
             raise
 
-    def …
+    def update_text(
         self,
-        …
+        transform: Callable[[Any], Optional[str]],
         pages: Optional[Union[Iterable[int], range, slice]] = None,
+        selector: str = "text",
         max_workers: Optional[int] = None,
         progress_callback: Optional[Callable[[], None]] = None,
     ) -> "PDF":
         """
-        Applies corrections to …
-        Applies corrections to OCR text elements using a callback function.
+        Applies corrections to text elements using a callback function.
 
         Args:
-            correction_callback: Function that takes an element and returns corrected text or None
             correction_callback: Function that takes an element and returns corrected text or None
             pages: Optional page indices/slice to limit the scope of correction
-            …
-            progress_callback: Optional callback function for progress updates
+            selector: Selector to apply corrections to (default: "text")
             max_workers: Maximum number of threads to use for parallel execution
             progress_callback: Optional callback function for progress updates
 
         Returns:
             Self for method chaining
-            Self for method chaining
         """
         target_page_indices = []
-        target_page_indices = []
         if pages is None:
             target_page_indices = list(range(len(self._pages)))
         elif isinstance(pages, slice):
@@ -1741,32 +1797,29 @@
                     raise IndexError(f"Page index {idx} out of range (0-{len(self._pages)-1}).")
             except (IndexError, TypeError, ValueError) as e:
                 raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
-                raise ValueError(f"Invalid page index in 'pages': {pages}. Error: {e}") from e
         else:
             raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
-            raise TypeError("'pages' must be None, a slice, or an iterable of page indices.")
 
         if not target_page_indices:
-            logger.warning("No pages selected for …
+            logger.warning("No pages selected for text update.")
             return self
 
-        logger.info(f"Starting …
-        logger.info(f"Starting OCR correction for pages: {target_page_indices}")
+        logger.info(f"Starting text update for pages: {target_page_indices} with selector='{selector}'")
 
         for page_idx in target_page_indices:
             page = self._pages[page_idx]
             try:
-                …
-                …
-                …
-                …
-                …
+                page.update_text(
+                    transform=transform,
+                    selector=selector,
+                    max_workers=max_workers,
+                    progress_callback=progress_callback,
+                )
             except Exception as e:
-                logger.error(f"Error during …
-                logger.error(f"Error during …
+                logger.error(f"Error during text update on page {page_idx}: {e}")
+                logger.error(f"Error during text update on page {page_idx}: {e}")
 
-        logger.info("…
-        logger.info("OCR correction process finished.")
+        logger.info("Text update process finished.")
         return self
 
     def __len__(self) -> int:
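At the document level, PDF.update_text fans the same callback out over the selected pages via each page's update_text. A minimal sketch; the whitespace-normalizing transform is purely illustrative:

```python
import natural_pdf as npdf

pdf = npdf.PDF("document.pdf")  # hypothetical input file

# Collapse runs of whitespace in every text element on the first three
# pages; returning None leaves an element's text untouched.
pdf.update_text(
    lambda el: " ".join(el.text.split()) if el.text else None,
    pages=range(3),
    selector="text",
)
```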
|