natural-pdf 0.1.33__py3-none-any.whl → 0.1.34__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. natural_pdf/analyzers/__init__.py +2 -2
  2. natural_pdf/analyzers/guides.py +670 -595
  3. natural_pdf/analyzers/layout/base.py +53 -6
  4. natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -14
  6. natural_pdf/analyzers/layout/layout_options.py +1 -0
  7. natural_pdf/analyzers/layout/paddle.py +102 -64
  8. natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
  9. natural_pdf/analyzers/layout/yolo.py +2 -6
  10. natural_pdf/analyzers/shape_detection_mixin.py +15 -6
  11. natural_pdf/classification/manager.py +92 -77
  12. natural_pdf/classification/mixin.py +49 -5
  13. natural_pdf/classification/results.py +1 -1
  14. natural_pdf/cli.py +7 -3
  15. natural_pdf/collections/pdf_collection.py +96 -101
  16. natural_pdf/core/element_manager.py +131 -45
  17. natural_pdf/core/highlighting_service.py +5 -6
  18. natural_pdf/core/page.py +113 -22
  19. natural_pdf/core/pdf.py +477 -75
  20. natural_pdf/describe/__init__.py +18 -12
  21. natural_pdf/describe/base.py +179 -172
  22. natural_pdf/describe/elements.py +155 -155
  23. natural_pdf/describe/mixin.py +27 -19
  24. natural_pdf/describe/summary.py +44 -55
  25. natural_pdf/elements/base.py +134 -18
  26. natural_pdf/elements/collections.py +90 -18
  27. natural_pdf/elements/image.py +2 -1
  28. natural_pdf/elements/line.py +0 -31
  29. natural_pdf/elements/rect.py +0 -14
  30. natural_pdf/elements/region.py +222 -108
  31. natural_pdf/elements/text.py +18 -12
  32. natural_pdf/exporters/__init__.py +4 -1
  33. natural_pdf/exporters/original_pdf.py +12 -4
  34. natural_pdf/extraction/mixin.py +66 -10
  35. natural_pdf/extraction/result.py +1 -1
  36. natural_pdf/flows/flow.py +63 -4
  37. natural_pdf/flows/region.py +4 -4
  38. natural_pdf/ocr/engine.py +83 -2
  39. natural_pdf/ocr/engine_paddle.py +5 -5
  40. natural_pdf/ocr/ocr_factory.py +2 -1
  41. natural_pdf/ocr/ocr_manager.py +24 -13
  42. natural_pdf/ocr/ocr_options.py +3 -10
  43. natural_pdf/qa/document_qa.py +21 -8
  44. natural_pdf/qa/qa_result.py +3 -7
  45. natural_pdf/search/__init__.py +3 -2
  46. natural_pdf/search/lancedb_search_service.py +5 -6
  47. natural_pdf/search/numpy_search_service.py +5 -2
  48. natural_pdf/selectors/parser.py +51 -6
  49. natural_pdf/tables/__init__.py +2 -2
  50. natural_pdf/tables/result.py +7 -6
  51. natural_pdf/utils/bidi_mirror.py +2 -1
  52. natural_pdf/utils/reading_order.py +3 -2
  53. natural_pdf/utils/visualization.py +3 -3
  54. natural_pdf/widgets/viewer.py +0 -1
  55. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/METADATA +1 -1
  56. natural_pdf-0.1.34.dist-info/RECORD +121 -0
  57. optimization/memory_comparison.py +73 -58
  58. optimization/pdf_analyzer.py +141 -96
  59. optimization/performance_analysis.py +111 -110
  60. optimization/test_cleanup_methods.py +47 -36
  61. optimization/test_memory_fix.py +40 -39
  62. tools/bad_pdf_eval/__init__.py +0 -1
  63. tools/bad_pdf_eval/analyser.py +35 -18
  64. tools/bad_pdf_eval/collate_summaries.py +22 -18
  65. tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
  66. tools/bad_pdf_eval/eval_suite.py +21 -9
  67. tools/bad_pdf_eval/evaluate_quality.py +198 -0
  68. tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
  69. tools/bad_pdf_eval/llm_enrich.py +71 -39
  70. tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
  71. tools/bad_pdf_eval/reporter.py +1 -1
  72. tools/bad_pdf_eval/utils.py +7 -4
  73. natural_pdf-0.1.33.dist-info/RECORD +0 -118
  74. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/WHEEL +0 -0
  75. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/entry_points.txt +0 -0
  76. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/licenses/LICENSE +0 -0
  77. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/top_level.txt +0 -0
@@ -8,53 +8,53 @@ from typing import Any, Dict, List, Union
8
8
  class ElementSummary:
9
9
  """
10
10
  Container for element summary data with markdown rendering.
11
-
11
+
12
12
  Automatically renders as markdown in Jupyter notebooks and provides
13
13
  access to underlying data as dictionaries.
14
14
  """
15
-
15
+
16
16
  def __init__(self, data: Dict[str, Any], title: str = "Summary"):
17
17
  """
18
18
  Initialize summary with data and optional title.
19
-
19
+
20
20
  Args:
21
21
  data: Dictionary containing summary sections
22
22
  title: Title for the summary display
23
23
  """
24
24
  self.data = data
25
25
  self.title = title
26
-
26
+
27
27
  def __str__(self) -> str:
28
28
  """String representation as markdown."""
29
29
  return self._to_markdown()
30
-
30
+
31
31
  def __repr__(self) -> str:
32
32
  """Repr as markdown for better display."""
33
33
  return self._to_markdown()
34
-
34
+
35
35
  def _repr_markdown_(self) -> str:
36
36
  """Jupyter notebook markdown rendering."""
37
37
  return self._to_markdown()
38
-
38
+
39
39
  def to_dict(self) -> Dict[str, Any]:
40
40
  """Return underlying data as dictionary."""
41
41
  return self.data.copy()
42
-
42
+
43
43
  def _to_markdown(self) -> str:
44
44
  """Convert data to markdown format."""
45
45
  lines = [f"## {self.title}", ""]
46
-
46
+
47
47
  for section_name, section_data in self.data.items():
48
48
  lines.extend(self._format_section(section_name, section_data))
49
49
  lines.append("") # Empty line between sections
50
-
50
+
51
51
  return "\n".join(lines).rstrip()
52
-
52
+
53
53
  def _format_section(self, name: str, data: Any) -> List[str]:
54
54
  """Format a single section as markdown."""
55
55
  # Use bold text instead of headers for more compact display
56
- section_title = name.replace('_', ' ').title()
57
-
56
+ section_title = name.replace("_", " ").title()
57
+
58
58
  if isinstance(data, dict):
59
59
  lines = [f"**{section_title}**:", ""]
60
60
  lines.extend(self._format_dict(data, indent=""))
@@ -62,26 +62,26 @@ class ElementSummary:
62
62
  lines = [f"**{section_title}**: {', '.join(str(item) for item in data)}"]
63
63
  else:
64
64
  lines = [f"**{section_title}**: {data}"]
65
-
65
+
66
66
  return lines
67
-
67
+
68
68
  def _format_dict(self, data: Dict[str, Any], indent: str = "") -> List[str]:
69
69
  """Format dictionary as markdown list."""
70
70
  lines = []
71
-
71
+
72
72
  for key, value in data.items():
73
- key_display = key.replace('_', ' ')
74
-
73
+ key_display = key.replace("_", " ")
74
+
75
75
  if isinstance(value, dict):
76
76
  # Nested dict - always format as list items
77
77
  lines.append(f"{indent}- **{key_display}**:")
78
78
  for subkey, subvalue in value.items():
79
- subkey_display = subkey.replace('_', ' ')
79
+ subkey_display = subkey.replace("_", " ")
80
80
  if isinstance(subvalue, dict):
81
81
  # Another level of nesting
82
82
  lines.append(f"{indent} - **{subkey_display}**:")
83
83
  for subsubkey, subsubvalue in subvalue.items():
84
- subsubkey_display = subsubkey.replace('_', ' ')
84
+ subsubkey_display = subsubkey.replace("_", " ")
85
85
  lines.append(f"{indent} - {subsubkey_display}: {subsubvalue}")
86
86
  else:
87
87
  lines.append(f"{indent} - {subkey_display}: {subvalue}")
@@ -93,9 +93,9 @@ class ElementSummary:
93
93
  lines.append(f"{indent}- **{key_display}**: {len(value)} items")
94
94
  else:
95
95
  lines.append(f"{indent}- **{key_display}**: {value}")
96
-
96
+
97
97
  return lines
98
-
98
+
99
99
  def _format_list(self, data: List[Any]) -> List[str]:
100
100
  """Format list as markdown."""
101
101
  lines = []
@@ -106,27 +106,18 @@ class ElementSummary:
106
106
  else:
107
107
  lines.append(f"- {item}")
108
108
  return lines
109
-
110
109
 
111
-
112
110
  def _format_horizontal_table(self, title: str, data: Dict[str, Any]) -> List[str]:
113
111
  """Format dict as horizontal table."""
114
112
  headers = list(data.keys())
115
113
  values = list(data.values())
116
-
114
+
117
115
  # Create table
118
116
  header_row = "| " + " | ".join(headers) + " |"
119
117
  separator = "|" + "|".join("------" for _ in headers) + "|"
120
118
  value_row = "| " + " | ".join(str(v) for v in values) + " |"
121
-
122
- return [
123
- f"- **{title}**:",
124
- "",
125
- header_row,
126
- separator,
127
- value_row,
128
- ""
129
- ]
119
+
120
+ return [f"- **{title}**:", "", header_row, separator, value_row, ""]
130
121
 
131
122
  # Added for better VS Code and other frontends support
132
123
  def _repr_html_(self) -> str: # type: ignore
@@ -147,11 +138,7 @@ class ElementSummary:
147
138
  return _markdown.markdown(md_source, extensions=["tables"])
148
139
  except Exception: # noqa: BLE001, broad-except
149
140
  # Fallback: present the Markdown as-is inside a <pre> block.
150
- escaped = (
151
- md_source.replace("&", "&amp;")
152
- .replace("<", "&lt;")
153
- .replace(">", "&gt;")
154
- )
141
+ escaped = md_source.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
155
142
  return f"<pre>{escaped}</pre>"
156
143
 
157
144
 
@@ -159,40 +146,42 @@ class InspectionSummary(ElementSummary):
159
146
  """
160
147
  Summary for element inspection with tabular data.
161
148
  """
162
-
149
+
163
150
  def _format_section(self, name: str, data: Any) -> List[str]:
164
151
  """Format inspection section with element tables."""
165
- section_title = name.replace('_', ' ').title()
166
-
167
- if isinstance(data, dict) and 'elements' in data:
152
+ section_title = name.replace("_", " ").title()
153
+
154
+ if isinstance(data, dict) and "elements" in data:
168
155
  # This is an element table section - use ### header for inspect
169
- elements = data['elements']
156
+ elements = data["elements"]
170
157
  lines = [f"### {section_title}"]
171
158
  if elements:
172
- lines.extend(self._format_element_table(elements, data.get('columns', [])))
159
+ lines.extend(self._format_element_table(elements, data.get("columns", [])))
173
160
  # Add note if truncated
174
- if 'note' in data:
161
+ if "note" in data:
175
162
  lines.append(f"_{data['note']}_")
176
163
  else:
177
164
  lines.append("No elements found.")
178
165
  else:
179
166
  # Regular section formatting
180
167
  lines = [f"**{section_title}**: {data}"]
181
-
168
+
182
169
  return lines
183
-
184
- def _format_element_table(self, elements: List[Dict[str, Any]], columns: List[str]) -> List[str]:
170
+
171
+ def _format_element_table(
172
+ self, elements: List[Dict[str, Any]], columns: List[str]
173
+ ) -> List[str]:
185
174
  """Format elements as markdown table."""
186
175
  if not elements or not columns:
187
176
  return ["No elements to display."]
188
-
177
+
189
178
  lines = [""] # Empty line before table
190
-
179
+
191
180
  # Table header
192
181
  header_row = "| " + " | ".join(columns) + " |"
193
182
  separator = "|" + "|".join("------" for _ in columns) + "|"
194
183
  lines.extend([header_row, separator])
195
-
184
+
196
185
  # Table rows
197
186
  for element in elements:
198
187
  row_values = []
@@ -205,8 +194,8 @@ class InspectionSummary(ElementSummary):
205
194
  elif isinstance(value, str) and len(value) > 50:
206
195
  value = value[:50] + "..."
207
196
  row_values.append(str(value))
208
-
197
+
209
198
  row = "| " + " | ".join(row_values) + " |"
210
199
  lines.append(row)
211
-
212
- return lines
200
+
201
+ return lines
@@ -6,27 +6,49 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union, overl
6
6
 
7
7
  from PIL import Image
8
8
 
9
+ from natural_pdf.classification.mixin import ClassificationMixin
10
+ from natural_pdf.describe.mixin import DescribeMixin
11
+
9
12
  # Import selector parsing functions
10
13
  from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
11
- from natural_pdf.describe.mixin import DescribeMixin
12
- from natural_pdf.classification.mixin import ClassificationMixin
13
14
 
14
15
  if TYPE_CHECKING:
16
+ from natural_pdf.classification.manager import ClassificationManager # noqa: F401
15
17
  from natural_pdf.core.page import Page
16
18
  from natural_pdf.elements.collections import ElementCollection
17
19
  from natural_pdf.elements.region import Region
18
- from natural_pdf.classification.manager import ClassificationManager # noqa: F401
19
20
 
20
21
 
21
22
  def extract_bbox(obj: Any) -> Optional[Tuple[float, float, float, float]]:
22
- """
23
- Extract bounding box coordinates from any object that has bbox properties.
23
+ """Extract bounding box coordinates from any object that has bbox properties.
24
+
25
+ This utility function provides a standardized way to extract bounding box
26
+ coordinates from various object types that may store bbox information in
27
+ different formats (properties, attributes, or dictionary keys).
24
28
 
25
29
  Args:
26
- obj: Object that might have bbox coordinates (Element, Region, etc.)
30
+ obj: Object that might have bbox coordinates. Can be an Element, Region,
31
+ dictionary, or any object with bbox-related attributes.
27
32
 
28
33
  Returns:
29
- Tuple of (x0, top, x1, bottom) or None if object doesn't have bbox properties
34
+ Tuple of (x0, top, x1, bottom) coordinates as floats, or None if the
35
+ object doesn't have valid bbox properties. Coordinates are in points,
36
+ measured from the top-left of the page (top is smaller than bottom).
37
+
38
+ Example:
39
+ ```python
40
+ # Works with various object types
41
+ element_bbox = extract_bbox(text_element) # From Element
42
+ region_bbox = extract_bbox(region) # From Region
43
+ dict_bbox = extract_bbox({ # From dictionary
44
+ 'x0': 100, 'top': 200, 'x1': 300, 'bottom': 250
45
+ })
46
+
47
+ if element_bbox:
48
+ x0, top, x1, bottom = element_bbox
49
+ width = x1 - x0
50
+ height = bottom - top
51
+ ```
30
52
  """
31
53
  # Try bbox property first (most common)
32
54
  if hasattr(obj, "bbox") and obj.bbox is not None:
@@ -53,8 +75,26 @@ def extract_bbox(obj: Any) -> Optional[Tuple[float, float, float, float]]:
53
75
 
54
76
 
55
77
  class DirectionalMixin:
56
- """
57
- Mixin class providing directional methods for both Element and Region classes.
78
+ """Mixin class providing directional methods for both Element and Region classes.
79
+
80
+ This mixin provides spatial navigation capabilities that allow elements and regions
81
+ to create new regions in specific directions (left, right, above, below) relative
82
+ to themselves. This forms the foundation of natural-pdf's spatial navigation system.
83
+
84
+ The directional methods use the pdfplumber-style page coordinate system where:
85
+ - x increases from left to right
86
+ - y increases from top to bottom ('top' is smaller than 'bottom')
87
+ - Origin (0, 0) is at the top-left of the page
88
+
89
+ Methods provided:
90
+ - left(): Create region to the left
91
+ - right(): Create region to the right
92
+ - above(): Create region above
93
+ - below(): Create region below
94
+
95
+ Note:
96
+ This mixin requires the implementing class to have 'page', 'x0', 'top',
97
+ 'x1', and 'bottom' attributes for coordinate calculations.
58
98
  """
59
99
 
60
100
  def _direction(
@@ -524,20 +564,88 @@ class DirectionalMixin:
524
564
 
525
565
 
526
566
  class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
527
- """
528
- Base class for all PDF elements.
567
+ """Base class for all PDF elements.
529
568
 
530
569
  This class provides common properties and methods for all PDF elements,
531
- such as text, rectangles, lines, etc.
570
+ including text elements, rectangles, lines, images, and other geometric shapes.
571
+ It serves as the foundation for natural-pdf's element system and provides
572
+ spatial navigation, classification, and description capabilities through mixins.
573
+
574
+ The Element class wraps underlying pdfplumber objects and extends them with:
575
+ - Spatial navigation methods (left, right, above, below)
576
+ - Bounding box and coordinate properties
577
+ - Classification and description capabilities
578
+ - Polygon support for complex shapes
579
+ - Metadata storage for analysis results
580
+
581
+ All coordinates use the pdfplumber-style page coordinate system where:
82
+ - Origin (0, 0) is at the top-left of the page
83
+ - x increases from left to right
84
+ - y increases from top to bottom (so 'top' is the minimum y, 'bottom' the maximum)
585
+
586
+ Attributes:
587
+ type: Element type (e.g., 'char', 'line', 'rect', 'image').
588
+ bbox: Bounding box tuple (x0, top, x1, bottom).
589
+ x0: Left x-coordinate.
590
+ top: Top y-coordinate (minimum y).
591
+ x1: Right x-coordinate.
592
+ bottom: Bottom y-coordinate (maximum y).
593
+ width: Element width (x1 - x0).
594
+ height: Element height (bottom - top).
595
+ page: Reference to the parent Page object.
596
+ metadata: Dictionary for storing analysis results and custom data.
597
+
598
+ Example:
599
+ ```python
600
+ pdf = npdf.PDF("document.pdf")
601
+ page = pdf.pages[0]
602
+
603
+ # Get text elements
604
+ text_elements = page.chars
605
+ for element in text_elements:
606
+ print(f"Text '{element.get_text()}' at {element.bbox}")
607
+
608
+ # Spatial navigation
609
+ first_char = page.chars[0]
610
+ region_to_right = first_char.right(size=100)
611
+
612
+ # Classification
613
+ element.classify("document_type", model="clip")
614
+ ```
615
+
616
+ Note:
617
+ Element objects are typically created automatically when accessing page
618
+ collections (page.chars, page.words, page.rects, etc.). Direct instantiation
619
+ is rarely needed in normal usage.
532
620
  """
533
621
 
534
622
  def __init__(self, obj: Dict[str, Any], page: "Page"):
535
- """
536
- Initialize base element.
623
+ """Initialize base element.
624
+
625
+ Creates an Element object that wraps a pdfplumber data object with enhanced
626
+ functionality for spatial navigation, analysis, and classification.
537
627
 
538
628
  Args:
539
- obj: The underlying pdfplumber object
540
- page: The parent Page object
629
+ obj: The underlying pdfplumber object dictionary containing element
630
+ properties like coordinates, text, fonts, etc. This typically comes
631
+ from pdfplumber's chars, words, rects, lines, or images collections.
632
+ page: The parent Page object that contains this element and provides
633
+ access to document-level functionality and other elements.
634
+
635
+ Note:
636
+ This constructor is typically called automatically when accessing element
637
+ collections through page properties. Direct instantiation is rarely needed.
638
+
639
+ Example:
640
+ ```python
641
+ # Elements are usually accessed through page collections
642
+ page = pdf.pages[0]
643
+ chars = page.chars # Elements created automatically
644
+
645
+ # Direct construction (advanced usage)
646
+ pdfplumber_char = page._page.chars[0] # Raw pdfplumber data
647
+ element = Element(pdfplumber_char, page)
648
+ ```
541
649
  """
542
650
  self._obj = obj
543
651
  self._page = page
@@ -976,6 +1084,7 @@ class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
976
1084
  """
977
1085
  # Apply global options as defaults
978
1086
  import natural_pdf
1087
+
979
1088
  if resolution is None:
980
1089
  if natural_pdf.options.image.resolution is not None:
981
1090
  resolution = natural_pdf.options.image.resolution
@@ -1027,7 +1136,11 @@ class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
1027
1136
  return None
1028
1137
 
1029
1138
  def save(
1030
- self, filename: str, resolution: Optional[float] = None, labels: bool = True, legend_position: str = "right"
1139
+ self,
1140
+ filename: str,
1141
+ resolution: Optional[float] = None,
1142
+ labels: bool = True,
1143
+ legend_position: str = "right",
1031
1144
  ) -> None:
1032
1145
  """
1033
1146
  Save the page with this element highlighted to an image file.
@@ -1043,13 +1156,16 @@ class Element(DirectionalMixin, ClassificationMixin, DescribeMixin):
1043
1156
  """
1044
1157
  # Apply global options as defaults
1045
1158
  import natural_pdf
1159
+
1046
1160
  if resolution is None:
1047
1161
  if natural_pdf.options.image.resolution is not None:
1048
1162
  resolution = natural_pdf.options.image.resolution
1049
1163
  else:
1050
1164
  resolution = 144 # Default resolution when none specified
1051
1165
  # Save the highlighted image
1052
- self.page.save_image(filename, resolution=resolution, labels=labels, legend_position=legend_position)
1166
+ self.page.save_image(
1167
+ filename, resolution=resolution, labels=labels, legend_position=legend_position
1168
+ )
1053
1169
  return self
1054
1170
 
1055
1171
  # Note: save_image method removed in favor of save()
@@ -30,9 +30,9 @@ from tqdm.auto import tqdm
30
30
  from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
31
31
  from natural_pdf.classification.manager import ClassificationManager
32
32
  from natural_pdf.classification.mixin import ClassificationMixin
33
- from natural_pdf.describe.mixin import DescribeMixin, InspectMixin
34
33
  from natural_pdf.collections.mixins import ApplyMixin, DirectionalCollectionMixin
35
34
  from natural_pdf.core.pdf import PDF
35
+ from natural_pdf.describe.mixin import DescribeMixin, InspectMixin
36
36
  from natural_pdf.elements.base import Element
37
37
  from natural_pdf.elements.region import Region
38
38
  from natural_pdf.elements.text import TextElement
@@ -81,16 +81,90 @@ class ElementCollection(
81
81
  InspectMixin,
82
82
  MutableSequence,
83
83
  ):
84
- """
85
- Collection of PDF elements with batch operations.
84
+ """Collection of PDF elements with batch operations.
85
+
86
+ ElementCollection provides a powerful interface for working with groups of
87
+ PDF elements (text, rectangles, lines, etc.) with batch processing capabilities.
88
+ It implements the MutableSequence protocol for list-like behavior while adding
89
+ specialized functionality for document analysis workflows.
90
+
91
+ The collection integrates multiple capabilities through mixins:
92
+ - Batch processing with .apply() method
93
+ - Export functionality for various formats
94
+ - AI-powered classification of element groups
95
+ - Spatial navigation for creating related regions
96
+ - Description and inspection capabilities
97
+ - Element filtering and selection
98
+
99
+ Collections support functional programming patterns and method chaining,
100
+ making it easy to build complex document processing pipelines.
101
+
102
+ Attributes:
103
+ elements: List of Element objects in the collection.
104
+ first: First element in the collection (None if empty).
105
+ last: Last element in the collection (None if empty).
106
+
107
+ Example:
108
+ Basic usage:
109
+ ```python
110
+ pdf = npdf.PDF("document.pdf")
111
+ page = pdf.pages[0]
112
+
113
+ # Get collections of elements
114
+ all_text = page.chars
115
+ headers = page.find_all('text[size>12]:bold')
116
+
117
+ # Collection operations
118
+ print(f"Found {len(headers)} headers")
119
+ header_text = headers.get_text()
120
+
121
+ # Batch processing
122
+ results = headers.apply(lambda el: el.fontname)
123
+ ```
124
+
125
+ Advanced workflows:
126
+ ```python
127
+ # Functional programming style
128
+ important_text = (page.chars
129
+ .filter('text:contains("IMPORTANT")')
130
+ .apply(lambda el: el.text.upper())
131
+ .classify("urgency_level"))
132
+
133
+ # Spatial navigation from collections
134
+ content_region = headers.below(until='rect[height>2]')
135
+
136
+ # Export functionality
137
+ headers.save_pdf("headers_only.pdf")
138
+ ```
139
+
140
+ Note:
141
+ Collections are typically created by page methods (page.chars, page.find_all())
142
+ or by filtering existing collections. Direct instantiation is less common.
86
143
  """
87
144
 
88
145
  def __init__(self, elements: List[T]):
89
- """
90
- Initialize a collection of elements.
146
+ """Initialize a collection of elements.
147
+
148
+ Creates an ElementCollection that wraps a list of PDF elements and provides
149
+ enhanced functionality for batch operations, filtering, and analysis.
91
150
 
92
151
  Args:
93
- elements: List of Element objects
152
+ elements: List of Element objects (TextElement, RectangleElement, etc.)
153
+ to include in the collection. Can be empty for an empty collection.
154
+
155
+ Example:
156
+ ```python
157
+ # Collections are usually created by page methods
158
+ chars = page.chars # ElementCollection[TextElement]
159
+ rects = page.rects # ElementCollection[RectangleElement]
160
+
161
+ # Direct creation (advanced usage)
162
+ selected_elements = ElementCollection([element1, element2, element3])
163
+ ```
164
+
165
+ Note:
166
+ ElementCollection implements MutableSequence, so it behaves like a list
167
+ with additional natural-pdf functionality for document processing.
94
168
  """
95
169
  self._elements = elements or []
96
170
 
@@ -1426,7 +1500,6 @@ class ElementCollection(
1426
1500
  analysis_key: str = "classification",
1427
1501
  multi_label: bool = False,
1428
1502
  batch_size: int = 8,
1429
- max_workers: Optional[int] = None,
1430
1503
  progress_bar: bool = True,
1431
1504
  **kwargs,
1432
1505
  ):
@@ -1440,8 +1513,6 @@ class ElementCollection(
1440
1513
  analysis_key: Key for storing results in element.analyses.
1441
1514
  multi_label: Allow multiple labels per item.
1442
1515
  batch_size: Size of batches passed to the inference pipeline.
1443
- max_workers: (Not currently used for classification batching which is
1444
- handled by the underlying pipeline).
1445
1516
  progress_bar: Display a progress bar.
1446
1517
  **kwargs: Additional arguments for the ClassificationManager.
1447
1518
  """
@@ -1818,12 +1889,13 @@ class ElementCollection(
1818
1889
  """
1819
1890
  # Apply global options as defaults
1820
1891
  import natural_pdf
1892
+
1821
1893
  if resolution is None:
1822
1894
  if natural_pdf.options.image.resolution is not None:
1823
1895
  resolution = natural_pdf.options.image.resolution
1824
1896
  else:
1825
1897
  resolution = 144 # Default resolution when none specified
1826
-
1898
+
1827
1899
  return self.apply(
1828
1900
  lambda element: element.trim(
1829
1901
  padding=padding, threshold=threshold, resolution=resolution
@@ -1896,9 +1968,7 @@ class ElementCollection(
1896
1968
 
1897
1969
  # Fallback to original behaviour: apply same clipping parameters to all elements
1898
1970
  return self.apply(
1899
- lambda element: element.clip(
1900
- obj=obj, left=left, top=top, right=right, bottom=bottom
1901
- )
1971
+ lambda element: element.clip(obj=obj, left=left, top=top, right=right, bottom=bottom)
1902
1972
  )
1903
1973
 
1904
1974
  # ------------------------------------------------------------------
@@ -2439,8 +2509,8 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
2439
2509
  page in this PageCollection."""
2440
2510
  # Local imports to avoid top-level cycles
2441
2511
  from natural_pdf.elements.region import Region
2442
- from natural_pdf.flows.flow import Flow
2443
2512
  from natural_pdf.flows.element import FlowElement
2513
+ from natural_pdf.flows.flow import Flow
2444
2514
  from natural_pdf.flows.region import FlowRegion
2445
2515
 
2446
2516
  start_pg = start_el.page
@@ -2462,10 +2532,12 @@ class PageCollection(Generic[P], ApplyMixin, ShapeDetectionMixin):
2462
2532
 
2463
2533
  flow = Flow(segments=parts, arrangement="vertical")
2464
2534
  src_fe = FlowElement(physical_object=start_el, flow=flow)
2465
- return FlowRegion(flow=flow,
2466
- constituent_regions=parts,
2467
- source_flow_element=src_fe,
2468
- boundary_element_found=end_el)
2535
+ return FlowRegion(
2536
+ flow=flow,
2537
+ constituent_regions=parts,
2538
+ source_flow_element=src_fe,
2539
+ boundary_element_found=end_el,
2540
+ )
2469
2541
 
2470
2542
  # ------------------------------------------------------------------
2471
2543
 
@@ -5,6 +5,7 @@ from natural_pdf.elements.base import Element
5
5
  if TYPE_CHECKING:
6
6
  from natural_pdf.core.page import Page
7
7
 
8
+
8
9
  class ImageElement(Element):
9
10
  """Represents a raster XObject (embedded image) on a PDF page."""
10
11
 
@@ -40,4 +41,4 @@ class ImageElement(Element):
40
41
  return ""
41
42
 
42
43
  def __repr__(self):
43
- return f"<ImageElement bbox={self.bbox} srcsize={self.srcsize}>"
44
+ return f"<ImageElement bbox={self.bbox} srcsize={self.srcsize}>"
@@ -102,37 +102,6 @@ class LineElement(Element):
102
102
  elif self.is_vertical:
103
103
  return "vertical"
104
104
 
105
- def text_above(self, distance: float = 5, **kwargs) -> Any:
106
- """
107
- Get text elements above this line.
108
-
109
- Args:
110
- distance: Maximum distance above the line in points
111
- **kwargs: Additional filter parameters
112
-
113
- Returns:
114
- ElementCollection of text elements above this line
115
- """
116
- from natural_pdf.elements.collections import ElementCollection
117
-
118
- # TODO: Implement proper filtering of elements above this line
119
- return ElementCollection([]) # Placeholder
120
-
121
- def text_below(self, distance: float = 5, **kwargs) -> Any:
122
- """
123
- Get text elements below this line.
124
-
125
- Args:
126
- distance: Maximum distance below the line in points
127
- **kwargs: Additional filter parameters
128
-
129
- Returns:
130
- ElementCollection of text elements below this line
131
- """
132
- from natural_pdf.elements.collections import ElementCollection
133
-
134
- # TODO: Implement proper filtering of elements below this line
135
- return ElementCollection([]) # Placeholder
136
105
 
137
106
  def extract_text(self, keep_blank_chars=True, apply_exclusions=True, **kwargs) -> str:
138
107
  """