natural-pdf 25.3.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. examples/__init__.py +3 -0
  2. examples/another_exclusion_example.py +20 -0
  3. examples/basic_usage.py +190 -0
  4. examples/boundary_exclusion_test.py +137 -0
  5. examples/boundary_inclusion_fix_test.py +157 -0
  6. examples/chainable_layout_example.py +70 -0
  7. examples/color_basic_test.py +49 -0
  8. examples/color_name_example.py +71 -0
  9. examples/color_test.py +62 -0
  10. examples/debug_ocr.py +91 -0
  11. examples/direct_ocr_test.py +148 -0
  12. examples/direct_paddle_test.py +99 -0
  13. examples/direct_qa_example.py +165 -0
  14. examples/document_layout_analysis.py +123 -0
  15. examples/document_qa_example.py +185 -0
  16. examples/exclusion_count_debug.py +128 -0
  17. examples/exclusion_debug.py +107 -0
  18. examples/exclusion_example.py +150 -0
  19. examples/exclusion_optimization_example.py +190 -0
  20. examples/extract_text_test.py +128 -0
  21. examples/font_aware_example.py +101 -0
  22. examples/font_variant_example.py +124 -0
  23. examples/footer_overlap_test.py +124 -0
  24. examples/highlight_all_example.py +82 -0
  25. examples/highlight_attributes_test.py +114 -0
  26. examples/highlight_confidence_display.py +122 -0
  27. examples/highlight_demo.py +110 -0
  28. examples/highlight_float_test.py +71 -0
  29. examples/highlight_test.py +147 -0
  30. examples/highlighting_example.py +123 -0
  31. examples/image_width_example.py +84 -0
  32. examples/improved_api_example.py +128 -0
  33. examples/layout_confidence_display_test.py +65 -0
  34. examples/layout_confidence_test.py +82 -0
  35. examples/layout_coordinate_debug.py +258 -0
  36. examples/layout_highlight_test.py +77 -0
  37. examples/logging_example.py +70 -0
  38. examples/ocr_comprehensive.py +193 -0
  39. examples/ocr_debug_example.py +87 -0
  40. examples/ocr_default_test.py +97 -0
  41. examples/ocr_engine_comparison.py +235 -0
  42. examples/ocr_example.py +89 -0
  43. examples/ocr_simplified_params.py +79 -0
  44. examples/ocr_visualization.py +102 -0
  45. examples/ocr_visualization_test.py +121 -0
  46. examples/paddle_layout_example.py +315 -0
  47. examples/paddle_layout_simple.py +74 -0
  48. examples/paddleocr_example.py +224 -0
  49. examples/page_collection_example.py +103 -0
  50. examples/polygon_highlight_example.py +83 -0
  51. examples/position_methods_example.py +134 -0
  52. examples/region_boundary_test.py +73 -0
  53. examples/region_exclusion_test.py +149 -0
  54. examples/region_expand_example.py +109 -0
  55. examples/region_image_example.py +116 -0
  56. examples/region_ocr_test.py +119 -0
  57. examples/region_sections_example.py +115 -0
  58. examples/school_books.py +49 -0
  59. examples/school_books_all.py +52 -0
  60. examples/scouring.py +36 -0
  61. examples/section_extraction_example.py +232 -0
  62. examples/simple_document_qa.py +97 -0
  63. examples/spatial_navigation_example.py +108 -0
  64. examples/table_extraction_example.py +135 -0
  65. examples/table_structure_detection.py +155 -0
  66. examples/tatr_cells_test.py +56 -0
  67. examples/tatr_ocr_table_test.py +94 -0
  68. examples/text_search_example.py +122 -0
  69. examples/text_style_example.py +110 -0
  70. examples/tiny-text.py +61 -0
  71. examples/until_boundaries_example.py +156 -0
  72. examples/until_example.py +112 -0
  73. examples/very_basics.py +15 -0
  74. natural_pdf/__init__.py +55 -0
  75. natural_pdf/analyzers/__init__.py +9 -0
  76. natural_pdf/analyzers/document_layout.py +736 -0
  77. natural_pdf/analyzers/text_structure.py +153 -0
  78. natural_pdf/core/__init__.py +3 -0
  79. natural_pdf/core/page.py +2376 -0
  80. natural_pdf/core/pdf.py +572 -0
  81. natural_pdf/elements/__init__.py +3 -0
  82. natural_pdf/elements/base.py +553 -0
  83. natural_pdf/elements/collections.py +770 -0
  84. natural_pdf/elements/line.py +124 -0
  85. natural_pdf/elements/rect.py +122 -0
  86. natural_pdf/elements/region.py +1366 -0
  87. natural_pdf/elements/text.py +304 -0
  88. natural_pdf/ocr/__init__.py +62 -0
  89. natural_pdf/ocr/easyocr_engine.py +254 -0
  90. natural_pdf/ocr/engine.py +158 -0
  91. natural_pdf/ocr/paddleocr_engine.py +263 -0
  92. natural_pdf/qa/__init__.py +3 -0
  93. natural_pdf/qa/document_qa.py +405 -0
  94. natural_pdf/selectors/__init__.py +4 -0
  95. natural_pdf/selectors/parser.py +360 -0
  96. natural_pdf/templates/__init__.py +1 -0
  97. natural_pdf/templates/ocr_debug.html +517 -0
  98. natural_pdf/utils/__init__.py +4 -0
  99. natural_pdf/utils/highlighting.py +605 -0
  100. natural_pdf/utils/ocr.py +515 -0
  101. natural_pdf/utils/reading_order.py +227 -0
  102. natural_pdf/utils/visualization.py +151 -0
  103. natural_pdf-25.3.16.dist-info/LICENSE +21 -0
  104. natural_pdf-25.3.16.dist-info/METADATA +268 -0
  105. natural_pdf-25.3.16.dist-info/RECORD +109 -0
  106. natural_pdf-25.3.16.dist-info/WHEEL +5 -0
  107. natural_pdf-25.3.16.dist-info/top_level.txt +3 -0
  108. tests/__init__.py +3 -0
  109. tests/test_pdf.py +39 -0
@@ -0,0 +1,227 @@
1
+ """
2
+ Reading order utilities for natural-pdf.
3
+ """
4
+ from typing import List, Dict, Any, Callable, Optional
5
+
6
+
7
def establish_reading_order(elements: List[Dict[str, Any]],
                            algorithm: str = 'basic') -> List[Dict[str, Any]]:
    """
    Order a collection of elements the way a reader would encounter them.

    Args:
        elements: List of elements to order
        algorithm: Name of the ordering strategy: 'basic', 'column' or
            'complex'. Any other value is treated as 'basic'.

    Returns:
        List of elements in reading order
    """
    # Handle the specialised strategies first; everything else
    # (including unknown names) falls through to the basic ordering.
    if algorithm == 'column':
        return _column_reading_order(elements)
    if algorithm == 'complex':
        return _complex_reading_order(elements)
    return _basic_reading_order(elements)
28
+
29
+
30
def _basic_reading_order(elements: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Order elements top-to-bottom, breaking ties left-to-right.

    Args:
        elements: List of elements to order

    Returns:
        New list of the same elements in reading order
    """
    def position(element):
        # Vertical position comes from 'top', falling back to 'y0', then 0;
        # horizontal position comes from 'x0', falling back to 0.
        vertical = element.get('top', element.get('y0', 0))
        horizontal = element.get('x0', 0)
        return vertical, horizontal

    return sorted(elements, key=position)
45
+
46
+
47
def _ranges_overlap(range1, range2, threshold: float) -> bool:
    """Return True when two (x0, x1) ranges overlap by at least ``threshold``.

    The overlap is expressed as a fraction of the shorter of the two ranges.
    """
    overlap = max(0, min(range1[1], range2[1]) - max(range1[0], range2[0]))
    shorter_len = min(range1[1] - range1[0], range2[1] - range2[0])
    if shorter_len == 0:
        # Avoid dividing by zero; a zero-width range never counts as overlapping.
        return False
    return overlap / shorter_len >= threshold


def _cluster_column_bounds(x_ranges, threshold: float) -> List[tuple]:
    """Cluster (x0, x1) ranges into columns and return (left, right) bounds.

    Each range joins the first existing column containing a range it overlaps
    with; otherwise it starts a new column. A column's bounds are the averages
    of its members' x0 and x1 values. Bounds are returned sorted left to right.
    """
    columns = []
    for x_range in x_ranges:
        # Ranges without positive width carry no column information.
        if x_range[1] - x_range[0] <= 0:
            continue
        for col in columns:
            if any(_ranges_overlap(x_range, member, threshold) for member in col):
                col.append(x_range)
                break
        else:
            columns.append([x_range])

    bounds = [
        (sum(r[0] for r in col) / len(col), sum(r[1] for r in col) / len(col))
        for col in columns
    ]
    bounds.sort(key=lambda b: b[0])
    return bounds


def _column_index_for(center: float, column_bounds, x_tolerance: float) -> int:
    """Return the index of the column that ``center`` belongs to.

    The first column whose bounds (widened by ``x_tolerance`` on both sides)
    contain ``center`` wins. If none does, the column with the nearest edge is
    used. ``column_bounds`` must not be empty.
    """
    for index, (left, right) in enumerate(column_bounds):
        if left - x_tolerance <= center <= right + x_tolerance:
            return index

    def edge_distance(index):
        left, right = column_bounds[index]
        return min(abs(center - left), abs(center - right))

    # min() returns the first of equally near columns, i.e. the leftmost.
    return min(range(len(column_bounds)), key=edge_distance)


def _column_reading_order(elements: List[Dict[str, Any]],
                          column_threshold: float = 0.2,
                          x_tolerance: float = 10.0) -> List[Dict[str, Any]]:
    """
    Reading order that accounts for columns.

    Columns are detected by clustering the horizontal extents of the
    elements; the result lists each column left to right, with the elements
    of a column ordered top to bottom.

    Args:
        elements: List of elements to order
        column_threshold: Minimum overlap, as a fraction (0.0 to 1.0) of the
            shorter range, for two x-ranges to share a column
        x_tolerance: Horizontal tolerance for determining column edges

    Returns:
        List of elements in reading order. Falls back to the basic
        top-to-bottom, left-to-right order when there are fewer than three
        elements or when no column can be detected.
    """
    if not elements:
        return []

    # 1. Collect the x-range of every element, visiting them line by line.
    lines = group_elements_by_line(elements)
    line_x_ranges = [
        (el.get('x0', 0), el.get('x1', 0))
        for line in lines
        for el in line
    ]

    # Too few ranges to detect columns reliably.
    if len(line_x_ranges) < 3:
        return _basic_reading_order(elements)

    # 2. Detect the columns.
    column_bounds = _cluster_column_bounds(line_x_ranges, column_threshold)
    if not column_bounds:
        # Every range had zero (or negative) width, so there is nothing to
        # cluster; the basic order is the best available answer.
        return _basic_reading_order(elements)

    # 3. Drop each element into its column's bucket. Buckets are filled by
    #    position rather than by using the element as a dictionary key,
    #    because plain dict elements are not hashable.
    buckets = [[] for _ in column_bounds]
    for el in elements:
        el_center = (el.get('x0', 0) + el.get('x1', 0)) / 2
        buckets[_column_index_for(el_center, column_bounds, x_tolerance)].append(el)

    # 4. Read the columns left to right, each one top to bottom.
    sorted_elements = []
    for col_elements in buckets:
        col_elements.sort(key=lambda e: e.get('top', e.get('y0', 0)))
        sorted_elements.extend(col_elements)

    return sorted_elements
166
+
167
+
168
def _complex_reading_order(elements: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Reading order intended for documents with complex structure.

    The goal is to account for columns, text flowing around images,
    tables and similar layouts.

    Args:
        elements: List of elements to order

    Returns:
        List of elements in reading order
    """
    # TODO: Implement complex layout analysis
    # Until then the column-aware ordering does the work.
    ordered = _column_reading_order(elements)
    return ordered
183
+
184
+
185
def group_elements_by_line(elements: List[Dict[str, Any]],
                           tolerance: float = 3.0) -> List[List[Dict[str, Any]]]:
    """
    Split elements into lines according to their vertical position.

    Elements are visited from top to bottom. An element joins the current
    line when its top lies within ``tolerance`` of the top of the element
    that started that line; otherwise it starts a new line.

    Args:
        elements: List of elements to group
        tolerance: Maximum vertical distance for elements to be considered on the same line

    Returns:
        List of lines, top to bottom, each a list of elements sorted by x0
    """
    def top_of(item):
        return item.get('top', item.get('y0', 0))

    if not elements:
        return []

    ordered = sorted(elements, key=top_of)

    # The first element opens the first line and anchors its top.
    rows = [[ordered[0]]]
    anchor_top = top_of(ordered[0])

    for item in ordered[1:]:
        item_top = top_of(item)
        if abs(item_top - anchor_top) <= tolerance:
            rows[-1].append(item)
            continue
        # Too far below the anchor: open a new line anchored at this element.
        rows.append([item])
        anchor_top = item_top

    # Within a line, read left to right.
    for row in rows:
        row.sort(key=lambda item: item.get('x0', 0))

    return rows
@@ -0,0 +1,151 @@
1
+ """
2
+ Visualization utilities for natural-pdf.
3
+ """
4
+ from typing import List, Dict, Tuple, Optional, Union, Any
5
+ import io
6
+ import math
7
+ import random
8
+ from PIL import Image, ImageDraw, ImageFont
9
+
10
# Palette of visually distinct highlight colors.
# Each entry is (R, G, B, alpha); every color uses the same
# semi-transparent alpha of 100 (out of 255).
HIGHLIGHT_COLORS = [
    (red, green, blue, 100)
    for red, green, blue in (
        (255, 255, 0),    # Yellow
        (255, 0, 0),      # Red
        (0, 255, 0),      # Green
        (0, 0, 255),      # Blue
        (255, 0, 255),    # Magenta
        (0, 255, 255),    # Cyan
        (255, 165, 0),    # Orange
        (128, 0, 128),    # Purple
        (0, 128, 0),      # Dark Green
        (0, 0, 128),      # Navy
    )
]

# Position in the palette of the color handed out next
_next_color_index = 0
27
+
28
def get_next_highlight_color() -> Tuple[int, int, int, int]:
    """
    Hand out the next color of the highlight palette.

    Successive calls walk through HIGHLIGHT_COLORS in order and start
    over at the beginning once the palette is exhausted.

    Returns:
        Tuple of (R, G, B, alpha) values
    """
    global _next_color_index
    # Wrap the running counter onto the palette before advancing it.
    slot = _next_color_index % len(HIGHLIGHT_COLORS)
    _next_color_index += 1
    return HIGHLIGHT_COLORS[slot]
39
+
40
def reset_highlight_colors():
    """
    Restart the highlight color cycle.

    After this call, get_next_highlight_color() begins again with the
    first entry of HIGHLIGHT_COLORS.
    """
    global _next_color_index
    _next_color_index = 0
44
+
45
def get_random_highlight_color() -> Tuple[int, int, int, int]:
    """
    Pick one color of the highlight palette at random.

    The color cycle used by get_next_highlight_color() is not affected.

    Returns:
        Tuple of (R, G, B, alpha) values
    """
    palette = HIGHLIGHT_COLORS
    return random.choice(palette)
53
+
54
def create_legend(labels_colors: Dict[str, Tuple[int, int, int, int]],
                  width: int = 200,
                  item_height: int = 30) -> Image.Image:
    """
    Render a legend that maps highlight colors to their labels.

    Each entry is drawn as a color swatch followed by the label text, one
    entry per row, on a white background.

    Args:
        labels_colors: Dictionary mapping labels to (R, G, B, alpha) colors
        width: Width of the legend image
        item_height: Height of each legend item

    Returns:
        PIL Image with the legend
    """
    white = (255, 255, 255, 255)
    black = (0, 0, 0, 255)

    def on_white(channel, opacity):
        # Alpha blending of one channel over white:
        # result = source * alpha + destination * (1 - alpha)
        return int(channel * opacity + 255 * (1 - opacity))

    # One row per label plus 10px of vertical padding in total
    canvas_height = len(labels_colors) * item_height + 10
    legend = Image.new('RGBA', (width, canvas_height), white)
    draw = ImageDraw.Draw(legend)

    # Prefer Arial; fall back to PIL's built-in font when it cannot be loaded
    try:
        font = ImageFont.truetype("Arial", 12)
    except IOError:
        font = ImageFont.load_default()

    row_top = 5  # 5px padding above the first row
    for label, (r, g, b, alpha) in labels_colors.items():
        opacity = alpha / 255.0

        # The swatch is opaque, so pre-blend the semi-transparent color with
        # white to show it the way it appears on a white background.
        swatch = (on_white(r, opacity), on_white(g, opacity), on_white(b, opacity), 255)

        draw.rectangle([(10, row_top), (30, row_top + item_height - 5)], fill=swatch)
        draw.text((40, row_top + item_height // 4), label, fill=black, font=font)

        row_top += item_height

    return legend
108
+
109
def merge_images_with_legend(image: Image.Image,
                             legend: Image.Image,
                             position: str = 'right') -> Image.Image:
    """
    Combine an image and its legend on one white canvas.

    Args:
        image: Main image
        legend: Legend image
        position: Side of the main image on which the legend is placed
            ('right', 'bottom', 'top', 'left')

    Returns:
        Merged image, or the original image unchanged when position is
        not one of the supported values
    """
    if position not in ('right', 'bottom', 'top', 'left'):
        # Invalid position, hand back the original image
        return image

    # Side-by-side layouts add the widths; stacked layouts add the heights.
    if position in ('right', 'left'):
        canvas_size = (image.width + legend.width, max(image.height, legend.height))
    else:
        canvas_size = (max(image.width, legend.width), image.height + legend.height)

    # Whichever picture comes first sits at the origin; the other one is
    # shifted by the first picture's width or height.
    if position == 'right':
        placements = [(image, (0, 0)), (legend, (image.width, 0))]
    elif position == 'bottom':
        placements = [(image, (0, 0)), (legend, (0, image.height))]
    elif position == 'top':
        placements = [(legend, (0, 0)), (image, (0, legend.height))]
    else:
        placements = [(legend, (0, 0)), (image, (legend.width, 0))]

    merged = Image.new('RGBA', canvas_size, (255, 255, 255, 255))
    for picture, offset in placements:
        merged.paste(picture, offset)

    return merged
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023-2025 Jonathan Soma
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,268 @@
1
+ Metadata-Version: 2.2
2
+ Name: natural-pdf
3
+ Version: 25.3.16
4
+ Summary: A more intuitive interface for working with PDFs
5
+ Home-page: https://github.com/jsoma/natural-pdf
6
+ Author: Jonathan Soma
7
+ Author-email: jonathan.soma@gmail.com
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.7
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Requires-Dist: pdfplumber>=0.7.0
15
+ Requires-Dist: Pillow>=8.0.0
16
+ Requires-Dist: colour>=0.1.5
17
+ Requires-Dist: numpy>=1.20.0
18
+ Requires-Dist: doclayout_yolo>=0.0.3
19
+ Requires-Dist: torch>=2.0.0
20
+ Requires-Dist: torchvision>=0.15.0
21
+ Requires-Dist: transformers>=4.30.0
22
+ Requires-Dist: huggingface_hub>=0.19.0
23
+ Provides-Extra: easyocr
24
+ Requires-Dist: easyocr>=1.7.0; extra == "easyocr"
25
+ Provides-Extra: paddle
26
+ Requires-Dist: paddlepaddle>=2.5.0; extra == "paddle"
27
+ Requires-Dist: paddleocr>=2.7.0; extra == "paddle"
28
+ Provides-Extra: qa
29
+ Provides-Extra: core
30
+ Requires-Dist: pdfplumber>=0.7.0; extra == "core"
31
+ Requires-Dist: Pillow>=8.0.0; extra == "core"
32
+ Requires-Dist: colour>=0.1.5; extra == "core"
33
+ Requires-Dist: numpy>=1.20.0; extra == "core"
34
+ Provides-Extra: ai
35
+ Requires-Dist: doclayout_yolo>=0.0.3; extra == "ai"
36
+ Requires-Dist: torch>=2.0.0; extra == "ai"
37
+ Requires-Dist: torchvision>=0.15.0; extra == "ai"
38
+ Requires-Dist: transformers>=4.30.0; extra == "ai"
39
+ Requires-Dist: huggingface_hub>=0.19.0; extra == "ai"
40
+ Provides-Extra: all
41
+ Requires-Dist: easyocr>=1.7.0; extra == "all"
42
+ Requires-Dist: paddlepaddle>=2.5.0; extra == "all"
43
+ Requires-Dist: paddleocr>=2.7.0; extra == "all"
44
+ Dynamic: author
45
+ Dynamic: author-email
46
+ Dynamic: classifier
47
+ Dynamic: description
48
+ Dynamic: description-content-type
49
+ Dynamic: home-page
50
+ Dynamic: provides-extra
51
+ Dynamic: requires-dist
52
+ Dynamic: requires-python
53
+ Dynamic: summary
54
+
55
+ # Natural PDF
56
+
57
+ A friendly library for working with PDFs, built on top of [pdfplumber](https://github.com/jsvine/pdfplumber).
58
+
59
+ Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
60
+
61
+ [Complete documentation here](https://jsoma.github.io/natural-pdf)
62
+
63
+ ## Features
64
+
65
+ - **Fluent API** for chaining operations
66
+ - **CSS-like selectors** for finding elements
67
+ - **Spatial navigation** with intuitive methods like `above()`, `below()`, and `select_until()`
68
+ - **Element collections** for batch operations
69
+ - **Visual highlighting** for debugging
70
+ - **Region visualization** with direct image extraction of specific regions
71
+ - **Text style analysis** for document structure
72
+ - **Exclusion zones** for headers, footers, and other areas to ignore
73
+ - **OCR integration** for extracting text from scanned documents
74
+ - **Document layout analysis** for detecting document structure with ML models
75
+ - **Table extraction** with multiple detection methods
76
+ - **Structured logging** with configurable levels and handlers
77
+
78
+ ## Installation
79
+
80
+ ```bash
81
+ pip install natural-pdf
82
+ ```
83
+
84
+ Or, if you only need part of the functionality:
85
+
86
+ ```bash
87
+ # Minimal installation without AI models (faster, smaller)
88
+ pip install natural-pdf[core]
89
+
90
+ # With all OCR engines
91
+ pip install natural-pdf[easyocr,paddle]
92
+ ```
93
+
94
+ ## Quick Start
95
+
96
+ ```python
97
+ from natural_pdf import PDF
98
+
99
+ # Open a PDF
100
+ pdf = PDF('document.pdf')
101
+
102
+ # Get the first page
103
+ page = pdf.pages[0]
104
+
105
+ # Find elements using CSS-like selectors
106
+ heading = page.find('text:contains("Summary"):bold')
107
+
108
+ # Extract content below the heading
109
+ content = heading.below().extract_text()
110
+ print(content)
111
+
112
+ # Exclude headers and footers
113
+ page.add_exclusion(page.find('text:contains("CONFIDENTIAL")').above())
114
+ page.add_exclusion(page.find_all('line')[-1].below())
115
+
116
+ # Extract clean text
117
+ clean_text = page.extract_text()
118
+ print(clean_text)
119
+ ```
120
+
121
+ ## Selectors
122
+
123
+ The library supports CSS-like selectors for finding elements:
124
+
125
+ ```python
126
+ # Find text containing a specific string
127
+ element = page.find('text:contains("Revenue")')
128
+
129
+ # Find bold text with a specific font size
130
+ headings = page.find_all('text[size>=12]:bold')
131
+
132
+ # Find thick red lines
133
+ lines = page.find_all('line[width>=2][color~=(1,0,0)]')
134
+ ```
135
+
136
+ ## Spatial Navigation
137
+
138
+ Navigate through the document with intuitive spatial methods:
139
+
140
+ ```python
141
+ # Get content below a heading
142
+ heading = page.find('text:contains("Introduction")')
143
+ content = heading.below().extract_text()
144
+
145
+ # Get content from one element to another
146
+ start = page.find('text:contains("Start")')
147
+ end = page.find('text:contains("End")')
148
+ region = start.select_until(end)
149
+ content = region.extract_text()
150
+ ```
151
+
152
+ ## Exclusion Zones
153
+
154
+ Exclude headers, footers, or other areas from extraction:
155
+
156
+ ```python
157
+ # Page-level exclusion
158
+ page.add_exclusion(page.find('text:contains("Page")').above())
159
+ page.add_exclusion(page.find_all('line')[-1].below())
160
+
161
+ # PDF-level exclusion with lambdas
162
+ pdf.add_exclusion(
163
+ lambda page: page.find('text:contains("Header")').above(),
164
+ label="headers"
165
+ )
166
+
167
+ # Extract text with exclusions applied
168
+ text = pdf.extract_text()
169
+
170
+ # Extract from a specific region with exclusions
171
+ summary = page.find('text:contains("Summary")')
172
+ conclusion = page.find('text:contains("Conclusion")')
173
+ region = page.create_region(summary.x0, summary.top, conclusion.x1, conclusion.bottom)
174
+ region_text = region.extract_text(apply_exclusions=True) # Excludes headers/footers
175
+
176
+ # Disable exclusions for a specific extraction
177
+ full_text = page.extract_text(apply_exclusions=False)
178
+ ```
179
+
180
+ Exclusions work efficiently with different region types:
181
+ - Regions without intersection with exclusion zones → exclusions ignored entirely
182
+ - Rectangular regions with header/footer exclusions → optimized cropping
183
+ - Complex regions with partial exclusions → advanced filtering with warning
184
+
185
+ ## OCR Integration
186
+
187
+ Extract text from scanned documents using OCR with multiple engine options:
188
+
189
+ ```python
190
+ # Using the default EasyOCR engine
191
+ pdf = PDF('scanned_document.pdf', ocr={
192
+ 'enabled': 'auto', # Only use OCR when necessary
193
+ 'languages': ['en'],
194
+ 'min_confidence': 0.5
195
+ })
196
+
197
+ # Using PaddleOCR for better Asian language support
198
+ pdf = PDF('scanned_document.pdf',
199
+ ocr_engine='paddleocr',
200
+ ocr={
201
+ 'enabled': True,
202
+ 'languages': ['zh-cn', 'en'], # Chinese and English
203
+ 'min_confidence': 0.3,
204
+ 'model_settings': {
205
+ 'use_angle_cls': False, # PaddleOCR-specific setting
206
+ 'rec_batch_num': 6
207
+ }
208
+ })
209
+
210
+ # Extract text, OCR will be used if needed
211
+ text = page.extract_text()
212
+
213
+ # Force OCR regardless of existing text
214
+ ocr_text = page.extract_text(ocr=True)
215
+
216
+ # Find OCR-detected text with high confidence
217
+ high_confidence = page.find_all('text[source=ocr][confidence>=0.8]')
218
+
219
+ # Visualize OCR results with color-coded confidence levels
220
+ for elem in page.find_all('text[source=ocr]'):
221
+ if elem.confidence >= 0.8:
222
+ color = (0, 1, 0, 0.3) # Green for high confidence
223
+ elif elem.confidence >= 0.5:
224
+ color = (1, 1, 0, 0.3) # Yellow for medium confidence
225
+ else:
226
+ color = (1, 0, 0, 0.3) # Red for low confidence
227
+
228
+ elem.highlight(color=color, label=f"OCR ({elem.confidence:.2f})")
229
+ page.save_image('ocr_results.png', labels=True)
230
+ ```
231
+
232
+ ## Logging
233
+
234
+ The library includes a structured logging system to provide visibility into its operations:
235
+
236
+ ```python
237
+ import logging
238
+ from natural_pdf import PDF, configure_logging
239
+
240
+ # Configure logging with INFO level to console
241
+ configure_logging(level=logging.INFO)
242
+
243
+ # Or log to a file with DEBUG level
244
+ file_handler = logging.FileHandler("natural_pdf.log")
245
+ file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
246
+ configure_logging(level=logging.DEBUG, handler=file_handler)
247
+
248
+ # Now operations will generate logs
249
+ pdf = PDF("document.pdf")
250
+ # Log: natural_pdf.core.pdf - INFO - Initializing PDF from document.pdf
251
+
252
+ # Run layout detection with verbose logging
253
+ regions = pdf.pages[0].analyze_layout(
254
+ model="paddle",
255
+ model_params={"verbose": True}
256
+ )
257
+ # Log: natural_pdf.analyzers.layout.paddle - INFO - Starting PaddleLayout detection...
258
+ # Log: natural_pdf.analyzers.layout.paddle - DEBUG - Parameters: confidence=0.2...
259
+ ```
260
+
261
+ Logs follow a hierarchical structure matching the library's module organization:
262
+ - `natural_pdf.core` - Core PDF operations
263
+ - `natural_pdf.analyzers` - Layout analysis operations
264
+ - `natural_pdf.ocr` - OCR engine operations
265
+
266
+ ## More details
267
+
268
+ [Complete documentation here](https://jsoma.github.io/natural-pdf)