natural-pdf 0.1.33__py3-none-any.whl → 0.1.35__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/__init__.py +2 -2
- natural_pdf/analyzers/guides.py +751 -607
- natural_pdf/analyzers/layout/base.py +53 -6
- natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
- natural_pdf/analyzers/layout/layout_manager.py +18 -14
- natural_pdf/analyzers/layout/layout_options.py +1 -0
- natural_pdf/analyzers/layout/paddle.py +102 -64
- natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
- natural_pdf/analyzers/layout/yolo.py +2 -6
- natural_pdf/analyzers/shape_detection_mixin.py +15 -6
- natural_pdf/classification/manager.py +92 -77
- natural_pdf/classification/mixin.py +49 -5
- natural_pdf/classification/results.py +1 -1
- natural_pdf/cli.py +7 -3
- natural_pdf/collections/pdf_collection.py +96 -101
- natural_pdf/core/element_manager.py +131 -45
- natural_pdf/core/highlighting_service.py +5 -6
- natural_pdf/core/page.py +120 -23
- natural_pdf/core/pdf.py +477 -75
- natural_pdf/describe/__init__.py +18 -12
- natural_pdf/describe/base.py +179 -172
- natural_pdf/describe/elements.py +155 -155
- natural_pdf/describe/mixin.py +27 -19
- natural_pdf/describe/summary.py +44 -55
- natural_pdf/elements/base.py +134 -18
- natural_pdf/elements/collections.py +90 -18
- natural_pdf/elements/image.py +2 -1
- natural_pdf/elements/line.py +0 -31
- natural_pdf/elements/rect.py +0 -14
- natural_pdf/elements/region.py +222 -108
- natural_pdf/elements/text.py +18 -12
- natural_pdf/exporters/__init__.py +4 -1
- natural_pdf/exporters/original_pdf.py +12 -4
- natural_pdf/extraction/mixin.py +66 -10
- natural_pdf/extraction/result.py +1 -1
- natural_pdf/flows/flow.py +63 -4
- natural_pdf/flows/region.py +4 -4
- natural_pdf/ocr/engine.py +83 -2
- natural_pdf/ocr/engine_paddle.py +5 -5
- natural_pdf/ocr/ocr_factory.py +2 -1
- natural_pdf/ocr/ocr_manager.py +24 -13
- natural_pdf/ocr/ocr_options.py +3 -10
- natural_pdf/qa/document_qa.py +21 -8
- natural_pdf/qa/qa_result.py +3 -7
- natural_pdf/search/__init__.py +3 -2
- natural_pdf/search/lancedb_search_service.py +5 -6
- natural_pdf/search/numpy_search_service.py +5 -2
- natural_pdf/selectors/parser.py +51 -6
- natural_pdf/tables/__init__.py +2 -2
- natural_pdf/tables/result.py +7 -6
- natural_pdf/utils/bidi_mirror.py +2 -1
- natural_pdf/utils/reading_order.py +3 -2
- natural_pdf/utils/visualization.py +3 -3
- natural_pdf/widgets/viewer.py +0 -1
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/METADATA +1 -1
- natural_pdf-0.1.35.dist-info/RECORD +121 -0
- optimization/memory_comparison.py +73 -58
- optimization/pdf_analyzer.py +141 -96
- optimization/performance_analysis.py +111 -110
- optimization/test_cleanup_methods.py +47 -36
- optimization/test_memory_fix.py +40 -39
- tools/bad_pdf_eval/__init__.py +0 -1
- tools/bad_pdf_eval/analyser.py +35 -18
- tools/bad_pdf_eval/collate_summaries.py +22 -18
- tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
- tools/bad_pdf_eval/eval_suite.py +21 -9
- tools/bad_pdf_eval/evaluate_quality.py +198 -0
- tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
- tools/bad_pdf_eval/llm_enrich.py +71 -39
- tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
- tools/bad_pdf_eval/reporter.py +1 -1
- tools/bad_pdf_eval/utils.py +7 -4
- natural_pdf-0.1.33.dist-info/RECORD +0 -118
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.35.dist-info}/top_level.txt +0 -0
natural_pdf/describe/summary.py
CHANGED
@@ -8,53 +8,53 @@ from typing import Any, Dict, List, Union
|
|
8
8
|
class ElementSummary:
|
9
9
|
"""
|
10
10
|
Container for element summary data with markdown rendering.
|
11
|
-
|
11
|
+
|
12
12
|
Automatically renders as markdown in Jupyter notebooks and provides
|
13
13
|
access to underlying data as dictionaries.
|
14
14
|
"""
|
15
|
-
|
15
|
+
|
16
16
|
def __init__(self, data: Dict[str, Any], title: str = "Summary"):
|
17
17
|
"""
|
18
18
|
Initialize summary with data and optional title.
|
19
|
-
|
19
|
+
|
20
20
|
Args:
|
21
21
|
data: Dictionary containing summary sections
|
22
22
|
title: Title for the summary display
|
23
23
|
"""
|
24
24
|
self.data = data
|
25
25
|
self.title = title
|
26
|
-
|
26
|
+
|
27
27
|
def __str__(self) -> str:
|
28
28
|
"""String representation as markdown."""
|
29
29
|
return self._to_markdown()
|
30
|
-
|
30
|
+
|
31
31
|
def __repr__(self) -> str:
|
32
32
|
"""Repr as markdown for better display."""
|
33
33
|
return self._to_markdown()
|
34
|
-
|
34
|
+
|
35
35
|
def _repr_markdown_(self) -> str:
|
36
36
|
"""Jupyter notebook markdown rendering."""
|
37
37
|
return self._to_markdown()
|
38
|
-
|
38
|
+
|
39
39
|
def to_dict(self) -> Dict[str, Any]:
|
40
40
|
"""Return underlying data as dictionary."""
|
41
41
|
return self.data.copy()
|
42
|
-
|
42
|
+
|
43
43
|
def _to_markdown(self) -> str:
|
44
44
|
"""Convert data to markdown format."""
|
45
45
|
lines = [f"## {self.title}", ""]
|
46
|
-
|
46
|
+
|
47
47
|
for section_name, section_data in self.data.items():
|
48
48
|
lines.extend(self._format_section(section_name, section_data))
|
49
49
|
lines.append("") # Empty line between sections
|
50
|
-
|
50
|
+
|
51
51
|
return "\n".join(lines).rstrip()
|
52
|
-
|
52
|
+
|
53
53
|
def _format_section(self, name: str, data: Any) -> List[str]:
|
54
54
|
"""Format a single section as markdown."""
|
55
55
|
# Use bold text instead of headers for more compact display
|
56
|
-
section_title = name.replace(
|
57
|
-
|
56
|
+
section_title = name.replace("_", " ").title()
|
57
|
+
|
58
58
|
if isinstance(data, dict):
|
59
59
|
lines = [f"**{section_title}**:", ""]
|
60
60
|
lines.extend(self._format_dict(data, indent=""))
|
@@ -62,26 +62,26 @@ class ElementSummary:
|
|
62
62
|
lines = [f"**{section_title}**: {', '.join(str(item) for item in data)}"]
|
63
63
|
else:
|
64
64
|
lines = [f"**{section_title}**: {data}"]
|
65
|
-
|
65
|
+
|
66
66
|
return lines
|
67
|
-
|
67
|
+
|
68
68
|
def _format_dict(self, data: Dict[str, Any], indent: str = "") -> List[str]:
|
69
69
|
"""Format dictionary as markdown list."""
|
70
70
|
lines = []
|
71
|
-
|
71
|
+
|
72
72
|
for key, value in data.items():
|
73
|
-
key_display = key.replace(
|
74
|
-
|
73
|
+
key_display = key.replace("_", " ")
|
74
|
+
|
75
75
|
if isinstance(value, dict):
|
76
76
|
# Nested dict - always format as list items
|
77
77
|
lines.append(f"{indent}- **{key_display}**:")
|
78
78
|
for subkey, subvalue in value.items():
|
79
|
-
subkey_display = subkey.replace(
|
79
|
+
subkey_display = subkey.replace("_", " ")
|
80
80
|
if isinstance(subvalue, dict):
|
81
81
|
# Another level of nesting
|
82
82
|
lines.append(f"{indent} - **{subkey_display}**:")
|
83
83
|
for subsubkey, subsubvalue in subvalue.items():
|
84
|
-
subsubkey_display = subsubkey.replace(
|
84
|
+
subsubkey_display = subsubkey.replace("_", " ")
|
85
85
|
lines.append(f"{indent} - {subsubkey_display}: {subsubvalue}")
|
86
86
|
else:
|
87
87
|
lines.append(f"{indent} - {subkey_display}: {subvalue}")
|
@@ -93,9 +93,9 @@ class ElementSummary:
|
|
93
93
|
lines.append(f"{indent}- **{key_display}**: {len(value)} items")
|
94
94
|
else:
|
95
95
|
lines.append(f"{indent}- **{key_display}**: {value}")
|
96
|
-
|
96
|
+
|
97
97
|
return lines
|
98
|
-
|
98
|
+
|
99
99
|
def _format_list(self, data: List[Any]) -> List[str]:
|
100
100
|
"""Format list as markdown."""
|
101
101
|
lines = []
|
@@ -106,27 +106,18 @@ class ElementSummary:
|
|
106
106
|
else:
|
107
107
|
lines.append(f"- {item}")
|
108
108
|
return lines
|
109
|
-
|
110
109
|
|
111
|
-
|
112
110
|
def _format_horizontal_table(self, title: str, data: Dict[str, Any]) -> List[str]:
|
113
111
|
"""Format dict as horizontal table."""
|
114
112
|
headers = list(data.keys())
|
115
113
|
values = list(data.values())
|
116
|
-
|
114
|
+
|
117
115
|
# Create table
|
118
116
|
header_row = "| " + " | ".join(headers) + " |"
|
119
117
|
separator = "|" + "|".join("------" for _ in headers) + "|"
|
120
118
|
value_row = "| " + " | ".join(str(v) for v in values) + " |"
|
121
|
-
|
122
|
-
return [
|
123
|
-
f"- **{title}**:",
|
124
|
-
"",
|
125
|
-
header_row,
|
126
|
-
separator,
|
127
|
-
value_row,
|
128
|
-
""
|
129
|
-
]
|
119
|
+
|
120
|
+
return [f"- **{title}**:", "", header_row, separator, value_row, ""]
|
130
121
|
|
131
122
|
# Added for better VS Code and other frontends support
|
132
123
|
def _repr_html_(self) -> str: # type: ignore
|
@@ -147,11 +138,7 @@ class ElementSummary:
|
|
147
138
|
return _markdown.markdown(md_source, extensions=["tables"])
|
148
139
|
except Exception: # noqa: BLE001, broad-except
|
149
140
|
# Fallback: present the Markdown as-is inside a <pre> block.
|
150
|
-
escaped = (
|
151
|
-
md_source.replace("&", "&amp;")
|
152
|
-
.replace("<", "&lt;")
|
153
|
-
.replace(">", "&gt;")
|
154
|
-
)
|
141
|
+
escaped = md_source.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
|
155
142
|
return f"<pre>{escaped}</pre>"
|
156
143
|
|
157
144
|
|
@@ -159,40 +146,42 @@ class InspectionSummary(ElementSummary):
|
|
159
146
|
"""
|
160
147
|
Summary for element inspection with tabular data.
|
161
148
|
"""
|
162
|
-
|
149
|
+
|
163
150
|
def _format_section(self, name: str, data: Any) -> List[str]:
|
164
151
|
"""Format inspection section with element tables."""
|
165
|
-
section_title = name.replace(
|
166
|
-
|
167
|
-
if isinstance(data, dict) and
|
152
|
+
section_title = name.replace("_", " ").title()
|
153
|
+
|
154
|
+
if isinstance(data, dict) and "elements" in data:
|
168
155
|
# This is an element table section - use ### header for inspect
|
169
|
-
elements = data[
|
156
|
+
elements = data["elements"]
|
170
157
|
lines = [f"### {section_title}"]
|
171
158
|
if elements:
|
172
|
-
lines.extend(self._format_element_table(elements, data.get(
|
159
|
+
lines.extend(self._format_element_table(elements, data.get("columns", [])))
|
173
160
|
# Add note if truncated
|
174
|
-
if
|
161
|
+
if "note" in data:
|
175
162
|
lines.append(f"_{data['note']}_")
|
176
163
|
else:
|
177
164
|
lines.append("No elements found.")
|
178
165
|
else:
|
179
166
|
# Regular section formatting
|
180
167
|
lines = [f"**{section_title}**: {data}"]
|
181
|
-
|
168
|
+
|
182
169
|
return lines
|
183
|
-
|
184
|
-
def _format_element_table(
|
170
|
+
|
171
|
+
def _format_element_table(
|
172
|
+
self, elements: List[Dict[str, Any]], columns: List[str]
|
173
|
+
) -> List[str]:
|
185
174
|
"""Format elements as markdown table."""
|
186
175
|
if not elements or not columns:
|
187
176
|
return ["No elements to display."]
|
188
|
-
|
177
|
+
|
189
178
|
lines = [""] # Empty line before table
|
190
|
-
|
179
|
+
|
191
180
|
# Table header
|
192
181
|
header_row = "| " + " | ".join(columns) + " |"
|
193
182
|
separator = "|" + "|".join("------" for _ in columns) + "|"
|
194
183
|
lines.extend([header_row, separator])
|
195
|
-
|
184
|
+
|
196
185
|
# Table rows
|
197
186
|
for element in elements:
|
198
187
|
row_values = []
|
@@ -205,8 +194,8 @@ class InspectionSummary(ElementSummary):
|
|
205
194
|
elif isinstance(value, str) and len(value) > 50:
|
206
195
|
value = value[:50] + "..."
|
207
196
|
row_values.append(str(value))
|
208
|
-
|
197
|
+
|
209
198
|
row = "| " + " | ".join(row_values) + " |"
|
210
199
|
lines.append(row)
|
211
|
-
|
212
|
-
return lines
|
200
|
+
|
201
|
+
return lines
|
natural_pdf/elements/base.py
CHANGED
@@ -6,27 +6,49 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union, overl
|
|
6
6
|
|
7
7
|
from PIL import Image
|
8
8
|
|
9
|
+
from natural_pdf.classification.mixin import ClassificationMixin
|
10
|
+
from natural_pdf.describe.mixin import DescribeMixin
|
11
|
+
|
9
12
|
# Import selector parsing functions
|
10
13
|
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
11
|
-
from natural_pdf.describe.mixin import DescribeMixin
|
12
|
-
from natural_pdf.classification.mixin import ClassificationMixin
|
13
14
|
|
14
15
|
if TYPE_CHECKING:
|
16
|
+
from natural_pdf.classification.manager import ClassificationManager # noqa: F401
|
15
17
|
from natural_pdf.core.page import Page
|
16
18
|
from natural_pdf.elements.collections import ElementCollection
|
17
19
|
from natural_pdf.elements.region import Region
|
18
|
-
from natural_pdf.classification.manager import ClassificationManager # noqa: F401
|
19
20
|
|
20
21
|
|
21
22
|
def extract_bbox(obj: Any) -> Optional[Tuple[float, float, float, float]]:
|
22
|
-
"""
|
23
|
-
|
23
|
+
"""Extract bounding box coordinates from any object that has bbox properties.
|
24
|
+
|
25
|
+
This utility function provides a standardized way to extract bounding box
|
26
|
+
coordinates from various object types that may store bbox information in
|
27
|
+
different formats (properties, attributes, or dictionary keys).
|
24
28
|
|
25
29
|
Args:
|
26
|
-
obj: Object that might have bbox coordinates
|
30
|
+
obj: Object that might have bbox coordinates. Can be an Element, Region,
|
31
|
+
dictionary, or any object with bbox-related attributes.
|
27
32
|
|
28
33
|
Returns:
|
29
|
-
Tuple of (x0, top, x1, bottom) or None if
|
34
|
+
Tuple of (x0, top, x1, bottom) coordinates as floats, or None if the
|
35
|
+
object doesn't have valid bbox properties. Coordinates are in PDF
|
36
|
+
coordinate system (points, with origin at bottom-left).
|
37
|
+
|
38
|
+
Example:
|
39
|
+
```python
|
40
|
+
# Works with various object types
|
41
|
+
element_bbox = extract_bbox(text_element) # From Element
|
42
|
+
region_bbox = extract_bbox(region) # From Region
|
43
|
+
dict_bbox = extract_bbox({ # From dictionary
|
44
|
+
'x0': 100, 'top': 200, 'x1': 300, 'bottom': 250
|
45
|
+
})
|
46
|
+
|
47
|
+
if element_bbox:
|
48
|
+
x0, top, x1, bottom = element_bbox
|
49
|
+
width = x1 - x0
|
50
|
+
height = bottom - top
|
51
|
+
```
|
30
52
|
"""
|
31
53
|
# Try bbox property first (most common)
|
32
54
|
if hasattr(obj, "bbox") and obj.bbox is not None:
|
@@ -53,8 +75,26 @@ def extract_bbox(obj: Any) -> Optional[Tuple[float, float, float, float]]:
|
|
53
75
|
|
54
76
|
|
55
77
|
class DirectionalMixin:
|
56
|
-
"""
|
57
|
-
|
78
|
+
"""Mixin class providing directional methods for both Element and Region classes.
|
79
|
+
|
80
|
+
This mixin provides spatial navigation capabilities that allow elements and regions
|
81
|
+
to create new regions in specific directions (left, right, above, below) relative
|
82
|
+
to themselves. This forms the foundation of natural-pdf's spatial navigation system.
|
83
|
+
|
84
|
+
The directional methods use the PDF coordinate system where:
|
85
|
+
- x increases from left to right
|
86
|
+
- y increases from bottom to top (PDF standard)
|
87
|
+
- Origin (0, 0) is at the bottom-left of the page
|
88
|
+
|
89
|
+
Methods provided:
|
90
|
+
- left(): Create region to the left
|
91
|
+
- right(): Create region to the right
|
92
|
+
- above(): Create region above
|
93
|
+
- below(): Create region below
|
94
|
+
|
95
|
+
Note:
|
96
|
+
This mixin requires the implementing class to have 'page', 'x0', 'top',
|
97
|
+
'x1', and 'bottom' attributes for coordinate calculations.
|
58
98
|
"""
|
59
99
|
|
60
100
|
def _direction(
|
@@ -524,20 +564,88 @@ class DirectionalMixin:
|
|
524
564
|
|
525
565
|
|
526
566
|
class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
|
527
|
-
"""
|
528
|
-
Base class for all PDF elements.
|
567
|
+
"""Base class for all PDF elements.
|
529
568
|
|
530
569
|
This class provides common properties and methods for all PDF elements,
|
531
|
-
|
570
|
+
including text elements, rectangles, lines, images, and other geometric shapes.
|
571
|
+
It serves as the foundation for natural-pdf's element system and provides
|
572
|
+
spatial navigation, classification, and description capabilities through mixins.
|
573
|
+
|
574
|
+
The Element class wraps underlying pdfplumber objects and extends them with:
|
575
|
+
- Spatial navigation methods (left, right, above, below)
|
576
|
+
- Bounding box and coordinate properties
|
577
|
+
- Classification and description capabilities
|
578
|
+
- Polygon support for complex shapes
|
579
|
+
- Metadata storage for analysis results
|
580
|
+
|
581
|
+
All coordinates use the PDF coordinate system where:
|
582
|
+
- Origin (0, 0) is at the bottom-left of the page
|
583
|
+
- x increases from left to right
|
584
|
+
- y increases from bottom to top
|
585
|
+
|
586
|
+
Attributes:
|
587
|
+
type: Element type (e.g., 'char', 'line', 'rect', 'image').
|
588
|
+
bbox: Bounding box tuple (x0, top, x1, bottom).
|
589
|
+
x0: Left x-coordinate.
|
590
|
+
top: Top y-coordinate (minimum y).
|
591
|
+
x1: Right x-coordinate.
|
592
|
+
bottom: Bottom y-coordinate (maximum y).
|
593
|
+
width: Element width (x1 - x0).
|
594
|
+
height: Element height (bottom - top).
|
595
|
+
page: Reference to the parent Page object.
|
596
|
+
metadata: Dictionary for storing analysis results and custom data.
|
597
|
+
|
598
|
+
Example:
|
599
|
+
```python
|
600
|
+
pdf = npdf.PDF("document.pdf")
|
601
|
+
page = pdf.pages[0]
|
602
|
+
|
603
|
+
# Get text elements
|
604
|
+
text_elements = page.chars
|
605
|
+
for element in text_elements:
|
606
|
+
print(f"Text '{element.get_text()}' at {element.bbox}")
|
607
|
+
|
608
|
+
# Spatial navigation
|
609
|
+
first_char = page.chars[0]
|
610
|
+
region_to_right = first_char.right(size=100)
|
611
|
+
|
612
|
+
# Classification
|
613
|
+
element.classify("document_type", model="clip")
|
614
|
+
```
|
615
|
+
|
616
|
+
Note:
|
617
|
+
Element objects are typically created automatically when accessing page
|
618
|
+
collections (page.chars, page.words, page.rects, etc.). Direct instantiation
|
619
|
+
is rarely needed in normal usage.
|
532
620
|
"""
|
533
621
|
|
534
622
|
def __init__(self, obj: Dict[str, Any], page: "Page"):
|
535
|
-
"""
|
536
|
-
|
623
|
+
"""Initialize base element.
|
624
|
+
|
625
|
+
Creates an Element object that wraps a pdfplumber data object with enhanced
|
626
|
+
functionality for spatial navigation, analysis, and classification.
|
537
627
|
|
538
628
|
Args:
|
539
|
-
obj: The underlying pdfplumber object
|
540
|
-
|
629
|
+
obj: The underlying pdfplumber object dictionary containing element
|
630
|
+
properties like coordinates, text, fonts, etc. This typically comes
|
631
|
+
from pdfplumber's chars, words, rects, lines, or images collections.
|
632
|
+
page: The parent Page object that contains this element and provides
|
633
|
+
access to document-level functionality and other elements.
|
634
|
+
|
635
|
+
Note:
|
636
|
+
This constructor is typically called automatically when accessing element
|
637
|
+
collections through page properties. Direct instantiation is rarely needed.
|
638
|
+
|
639
|
+
Example:
|
640
|
+
```python
|
641
|
+
# Elements are usually accessed through page collections
|
642
|
+
page = pdf.pages[0]
|
643
|
+
chars = page.chars # Elements created automatically
|
644
|
+
|
645
|
+
# Direct construction (advanced usage)
|
646
|
+
pdfplumber_char = page._page.chars[0] # Raw pdfplumber data
|
647
|
+
element = Element(pdfplumber_char, page)
|
648
|
+
```
|
541
649
|
"""
|
542
650
|
self._obj = obj
|
543
651
|
self._page = page
|
@@ -976,6 +1084,7 @@ class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
|
|
976
1084
|
"""
|
977
1085
|
# Apply global options as defaults
|
978
1086
|
import natural_pdf
|
1087
|
+
|
979
1088
|
if resolution is None:
|
980
1089
|
if natural_pdf.options.image.resolution is not None:
|
981
1090
|
resolution = natural_pdf.options.image.resolution
|
@@ -1027,7 +1136,11 @@ class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
|
|
1027
1136
|
return None
|
1028
1137
|
|
1029
1138
|
def save(
|
1030
|
-
self,
|
1139
|
+
self,
|
1140
|
+
filename: str,
|
1141
|
+
resolution: Optional[float] = None,
|
1142
|
+
labels: bool = True,
|
1143
|
+
legend_position: str = "right",
|
1031
1144
|
) -> None:
|
1032
1145
|
"""
|
1033
1146
|
Save the page with this element highlighted to an image file.
|
@@ -1043,13 +1156,16 @@ class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
|
|
1043
1156
|
"""
|
1044
1157
|
# Apply global options as defaults
|
1045
1158
|
import natural_pdf
|
1159
|
+
|
1046
1160
|
if resolution is None:
|
1047
1161
|
if natural_pdf.options.image.resolution is not None:
|
1048
1162
|
resolution = natural_pdf.options.image.resolution
|
1049
1163
|
else:
|
1050
1164
|
resolution = 144 # Default resolution when none specified
|
1051
1165
|
# Save the highlighted image
|
1052
|
-
self.page.save_image(
|
1166
|
+
self.page.save_image(
|
1167
|
+
filename, resolution=resolution, labels=labels, legend_position=legend_position
|
1168
|
+
)
|
1053
1169
|
return self
|
1054
1170
|
|
1055
1171
|
# Note: save_image method removed in favor of save()
|
@@ -30,9 +30,9 @@ from tqdm.auto import tqdm
|
|
30
30
|
from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
|
31
31
|
from natural_pdf.classification.manager import ClassificationManager
|
32
32
|
from natural_pdf.classification.mixin import ClassificationMixin
|
33
|
-
from natural_pdf.describe.mixin import DescribeMixin, InspectMixin
|
34
33
|
from natural_pdf.collections.mixins import ApplyMixin, DirectionalCollectionMixin
|
35
34
|
from natural_pdf.core.pdf import PDF
|
35
|
+
from natural_pdf.describe.mixin import DescribeMixin, InspectMixin
|
36
36
|
from natural_pdf.elements.base import Element
|
37
37
|
from natural_pdf.elements.region import Region
|
38
38
|
from natural_pdf.elements.text import TextElement
|
@@ -81,16 +81,90 @@ class ElementCollection(
|
|
81
81
|
InspectMixin,
|
82
82
|
MutableSequence,
|
83
83
|
):
|
84
|
-
"""
|
85
|
-
|
84
|
+
"""Collection of PDF elements with batch operations.
|
85
|
+
|
86
|
+
ElementCollection provides a powerful interface for working with groups of
|
87
|
+
PDF elements (text, rectangles, lines, etc.) with batch processing capabilities.
|
88
|
+
It implements the MutableSequence protocol for list-like behavior while adding
|
89
|
+
specialized functionality for document analysis workflows.
|
90
|
+
|
91
|
+
The collection integrates multiple capabilities through mixins:
|
92
|
+
- Batch processing with .apply() method
|
93
|
+
- Export functionality for various formats
|
94
|
+
- AI-powered classification of element groups
|
95
|
+
- Spatial navigation for creating related regions
|
96
|
+
- Description and inspection capabilities
|
97
|
+
- Element filtering and selection
|
98
|
+
|
99
|
+
Collections support functional programming patterns and method chaining,
|
100
|
+
making it easy to build complex document processing pipelines.
|
101
|
+
|
102
|
+
Attributes:
|
103
|
+
elements: List of Element objects in the collection.
|
104
|
+
first: First element in the collection (None if empty).
|
105
|
+
last: Last element in the collection (None if empty).
|
106
|
+
|
107
|
+
Example:
|
108
|
+
Basic usage:
|
109
|
+
```python
|
110
|
+
pdf = npdf.PDF("document.pdf")
|
111
|
+
page = pdf.pages[0]
|
112
|
+
|
113
|
+
# Get collections of elements
|
114
|
+
all_text = page.chars
|
115
|
+
headers = page.find_all('text[size>12]:bold')
|
116
|
+
|
117
|
+
# Collection operations
|
118
|
+
print(f"Found {len(headers)} headers")
|
119
|
+
header_text = headers.get_text()
|
120
|
+
|
121
|
+
# Batch processing
|
122
|
+
results = headers.apply(lambda el: el.fontname)
|
123
|
+
```
|
124
|
+
|
125
|
+
Advanced workflows:
|
126
|
+
```python
|
127
|
+
# Functional programming style
|
128
|
+
important_text = (page.chars
|
129
|
+
.filter('text:contains("IMPORTANT")')
|
130
|
+
.apply(lambda el: el.text.upper())
|
131
|
+
.classify("urgency_level"))
|
132
|
+
|
133
|
+
# Spatial navigation from collections
|
134
|
+
content_region = headers.below(until='rect[height>2]')
|
135
|
+
|
136
|
+
# Export functionality
|
137
|
+
headers.save_pdf("headers_only.pdf")
|
138
|
+
```
|
139
|
+
|
140
|
+
Note:
|
141
|
+
Collections are typically created by page methods (page.chars, page.find_all())
|
142
|
+
or by filtering existing collections. Direct instantiation is less common.
|
86
143
|
"""
|
87
144
|
|
88
145
|
def __init__(self, elements: List[T]):
|
89
|
-
"""
|
90
|
-
|
146
|
+
"""Initialize a collection of elements.
|
147
|
+
|
148
|
+
Creates an ElementCollection that wraps a list of PDF elements and provides
|
149
|
+
enhanced functionality for batch operations, filtering, and analysis.
|
91
150
|
|
92
151
|
Args:
|
93
|
-
elements: List of Element objects
|
152
|
+
elements: List of Element objects (TextElement, RectangleElement, etc.)
|
153
|
+
to include in the collection. Can be empty for an empty collection.
|
154
|
+
|
155
|
+
Example:
|
156
|
+
```python
|
157
|
+
# Collections are usually created by page methods
|
158
|
+
chars = page.chars # ElementCollection[TextElement]
|
159
|
+
rects = page.rects # ElementCollection[RectangleElement]
|
160
|
+
|
161
|
+
# Direct creation (advanced usage)
|
162
|
+
selected_elements = ElementCollection([element1, element2, element3])
|
163
|
+
```
|
164
|
+
|
165
|
+
Note:
|
166
|
+
ElementCollection implements MutableSequence, so it behaves like a list
|
167
|
+
with additional natural-pdf functionality for document processing.
|
94
168
|
"""
|
95
169
|
self._elements = elements or []
|
96
170
|
|
@@ -1426,7 +1500,6 @@ class ElementCollection(
|
|
1426
1500
|
analysis_key: str = "classification",
|
1427
1501
|
multi_label: bool = False,
|
1428
1502
|
batch_size: int = 8,
|
1429
|
-
max_workers: Optional[int] = None,
|
1430
1503
|
progress_bar: bool = True,
|
1431
1504
|
**kwargs,
|
1432
1505
|
):
|
@@ -1440,8 +1513,6 @@ class ElementCollection(
|
|
1440
1513
|
analysis_key: Key for storing results in element.analyses.
|
1441
1514
|
multi_label: Allow multiple labels per item.
|
1442
1515
|
batch_size: Size of batches passed to the inference pipeline.
|
1443
|
-
max_workers: (Not currently used for classification batching which is
|
1444
|
-
handled by the underlying pipeline).
|
1445
1516
|
progress_bar: Display a progress bar.
|
1446
1517
|
**kwargs: Additional arguments for the ClassificationManager.
|
1447
1518
|
"""
|
@@ -1818,12 +1889,13 @@ class ElementCollection(
|
|
1818
1889
|
"""
|
1819
1890
|
# Apply global options as defaults
|
1820
1891
|
import natural_pdf
|
1892
|
+
|
1821
1893
|
if resolution is None:
|
1822
1894
|
if natural_pdf.options.image.resolution is not None:
|
1823
1895
|
resolution = natural_pdf.options.image.resolution
|
1824
1896
|
else:
|
1825
1897
|
resolution = 144 # Default resolution when none specified
|
1826
|
-
|
1898
|
+
|
1827
1899
|
return self.apply(
|
1828
1900
|
lambda element: element.trim(
|
1829
1901
|
padding=padding, threshold=threshold, resolution=resolution
|
@@ -1896,9 +1968,7 @@ class ElementCollection(
|
|
1896
1968
|
|
1897
1969
|
# Fallback to original behaviour: apply same clipping parameters to all elements
|
1898
1970
|
return self.apply(
|
1899
|
-
lambda element: element.clip(
|
1900
|
-
obj=obj, left=left, top=top, right=right, bottom=bottom
|
1901
|
-
)
|
1971
|
+
lambda element: element.clip(obj=obj, left=left, top=top, right=right, bottom=bottom)
|
1902
1972
|
)
|
1903
1973
|
|
1904
1974
|
# ------------------------------------------------------------------
|
@@ -2439,8 +2509,8 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
|
|
2439
2509
|
page in this PageCollection."""
|
2440
2510
|
# Local imports to avoid top-level cycles
|
2441
2511
|
from natural_pdf.elements.region import Region
|
2442
|
-
from natural_pdf.flows.flow import Flow
|
2443
2512
|
from natural_pdf.flows.element import FlowElement
|
2513
|
+
from natural_pdf.flows.flow import Flow
|
2444
2514
|
from natural_pdf.flows.region import FlowRegion
|
2445
2515
|
|
2446
2516
|
start_pg = start_el.page
|
@@ -2462,10 +2532,12 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
|
|
2462
2532
|
|
2463
2533
|
flow = Flow(segments=parts, arrangement="vertical")
|
2464
2534
|
src_fe = FlowElement(physical_object=start_el, flow=flow)
|
2465
|
-
return FlowRegion(
|
2466
|
-
|
2467
|
-
|
2468
|
-
|
2535
|
+
return FlowRegion(
|
2536
|
+
flow=flow,
|
2537
|
+
constituent_regions=parts,
|
2538
|
+
source_flow_element=src_fe,
|
2539
|
+
boundary_element_found=end_el,
|
2540
|
+
)
|
2469
2541
|
|
2470
2542
|
# ------------------------------------------------------------------
|
2471
2543
|
|
natural_pdf/elements/image.py
CHANGED
@@ -5,6 +5,7 @@ from natural_pdf.elements.base import Element
|
|
5
5
|
if TYPE_CHECKING:
|
6
6
|
from natural_pdf.core.page import Page
|
7
7
|
|
8
|
+
|
8
9
|
class ImageElement(Element):
|
9
10
|
"""Represents a raster XObject (embedded image) on a PDF page."""
|
10
11
|
|
@@ -40,4 +41,4 @@ class ImageElement(Element):
|
|
40
41
|
return ""
|
41
42
|
|
42
43
|
def __repr__(self):
|
43
|
-
return f"<ImageElement bbox={self.bbox} srcsize={self.srcsize}>"
|
44
|
+
return f"<ImageElement bbox={self.bbox} srcsize={self.srcsize}>"
|
natural_pdf/elements/line.py
CHANGED
@@ -102,37 +102,6 @@ class LineElement(Element):
|
|
102
102
|
elif self.is_vertical:
|
103
103
|
return "vertical"
|
104
104
|
|
105
|
-
def text_above(self, distance: float = 5, **kwargs) -> Any:
|
106
|
-
"""
|
107
|
-
Get text elements above this line.
|
108
|
-
|
109
|
-
Args:
|
110
|
-
distance: Maximum distance above the line in points
|
111
|
-
**kwargs: Additional filter parameters
|
112
|
-
|
113
|
-
Returns:
|
114
|
-
ElementCollection of text elements above this line
|
115
|
-
"""
|
116
|
-
from natural_pdf.elements.collections import ElementCollection
|
117
|
-
|
118
|
-
# TODO: Implement proper filtering of elements above this line
|
119
|
-
return ElementCollection([]) # Placeholder
|
120
|
-
|
121
|
-
def text_below(self, distance: float = 5, **kwargs) -> Any:
|
122
|
-
"""
|
123
|
-
Get text elements below this line.
|
124
|
-
|
125
|
-
Args:
|
126
|
-
distance: Maximum distance below the line in points
|
127
|
-
**kwargs: Additional filter parameters
|
128
|
-
|
129
|
-
Returns:
|
130
|
-
ElementCollection of text elements below this line
|
131
|
-
"""
|
132
|
-
from natural_pdf.elements.collections import ElementCollection
|
133
|
-
|
134
|
-
# TODO: Implement proper filtering of elements below this line
|
135
|
-
return ElementCollection([]) # Placeholder
|
136
105
|
|
137
106
|
def extract_text(self, keep_blank_chars=True, apply_exclusions=True, **kwargs) -> str:
|
138
107
|
"""
|