natural-pdf 25.3.16__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. natural_pdf-25.3.16/LICENSE +21 -0
  2. natural_pdf-25.3.16/MANIFEST.in +8 -0
  3. natural_pdf-25.3.16/PKG-INFO +268 -0
  4. natural_pdf-25.3.16/README.md +214 -0
  5. natural_pdf-25.3.16/docs/api/index.md +386 -0
  6. natural_pdf-25.3.16/docs/assets/favicon.png +3 -0
  7. natural_pdf-25.3.16/docs/assets/social-preview.png +17 -0
  8. natural_pdf-25.3.16/docs/document-qa/index.md +375 -0
  9. natural_pdf-25.3.16/docs/element-selection/index.md +213 -0
  10. natural_pdf-25.3.16/docs/explanations/index.md +28 -0
  11. natural_pdf-25.3.16/docs/explanations/ocr-challenges.md +221 -0
  12. natural_pdf-25.3.16/docs/explanations/pdf-extraction-challenges.md +203 -0
  13. natural_pdf-25.3.16/docs/explanations/pdf-fonts.md +214 -0
  14. natural_pdf-25.3.16/docs/index.md +303 -0
  15. natural_pdf-25.3.16/docs/installation/index.md +65 -0
  16. natural_pdf-25.3.16/docs/layout-analysis/index.md +314 -0
  17. natural_pdf-25.3.16/docs/ocr/index.md +222 -0
  18. natural_pdf-25.3.16/docs/pdf-navigation/index.md +255 -0
  19. natural_pdf-25.3.16/docs/regions/index.md +165 -0
  20. natural_pdf-25.3.16/docs/tables/index.md +359 -0
  21. natural_pdf-25.3.16/docs/text-extraction/index.md +426 -0
  22. natural_pdf-25.3.16/docs/visual-debugging/index.md +223 -0
  23. natural_pdf-25.3.16/examples/__init__.py +3 -0
  24. natural_pdf-25.3.16/examples/another_exclusion_example.py +20 -0
  25. natural_pdf-25.3.16/examples/basic_usage.py +190 -0
  26. natural_pdf-25.3.16/examples/boundary_exclusion_test.py +137 -0
  27. natural_pdf-25.3.16/examples/boundary_inclusion_fix_test.py +157 -0
  28. natural_pdf-25.3.16/examples/chainable_layout_example.py +70 -0
  29. natural_pdf-25.3.16/examples/color_basic_test.py +49 -0
  30. natural_pdf-25.3.16/examples/color_name_example.py +71 -0
  31. natural_pdf-25.3.16/examples/color_test.py +62 -0
  32. natural_pdf-25.3.16/examples/debug_ocr.py +91 -0
  33. natural_pdf-25.3.16/examples/direct_ocr_test.py +148 -0
  34. natural_pdf-25.3.16/examples/direct_paddle_test.py +99 -0
  35. natural_pdf-25.3.16/examples/direct_qa_example.py +165 -0
  36. natural_pdf-25.3.16/examples/document_layout_analysis.py +123 -0
  37. natural_pdf-25.3.16/examples/document_qa_example.py +185 -0
  38. natural_pdf-25.3.16/examples/exclusion_count_debug.py +128 -0
  39. natural_pdf-25.3.16/examples/exclusion_debug.py +107 -0
  40. natural_pdf-25.3.16/examples/exclusion_example.py +150 -0
  41. natural_pdf-25.3.16/examples/exclusion_optimization_example.py +190 -0
  42. natural_pdf-25.3.16/examples/extract_text_test.py +128 -0
  43. natural_pdf-25.3.16/examples/font_aware_example.py +101 -0
  44. natural_pdf-25.3.16/examples/font_variant_example.py +124 -0
  45. natural_pdf-25.3.16/examples/footer_overlap_test.py +124 -0
  46. natural_pdf-25.3.16/examples/highlight_all_example.py +82 -0
  47. natural_pdf-25.3.16/examples/highlight_attributes_test.py +114 -0
  48. natural_pdf-25.3.16/examples/highlight_confidence_display.py +122 -0
  49. natural_pdf-25.3.16/examples/highlight_demo.py +110 -0
  50. natural_pdf-25.3.16/examples/highlight_float_test.py +71 -0
  51. natural_pdf-25.3.16/examples/highlight_test.py +147 -0
  52. natural_pdf-25.3.16/examples/highlighting_example.py +123 -0
  53. natural_pdf-25.3.16/examples/image_width_example.py +84 -0
  54. natural_pdf-25.3.16/examples/improved_api_example.py +128 -0
  55. natural_pdf-25.3.16/examples/layout_confidence_display_test.py +65 -0
  56. natural_pdf-25.3.16/examples/layout_confidence_test.py +82 -0
  57. natural_pdf-25.3.16/examples/layout_coordinate_debug.py +258 -0
  58. natural_pdf-25.3.16/examples/layout_highlight_test.py +77 -0
  59. natural_pdf-25.3.16/examples/logging_example.py +70 -0
  60. natural_pdf-25.3.16/examples/ocr_comprehensive.py +193 -0
  61. natural_pdf-25.3.16/examples/ocr_debug_example.py +87 -0
  62. natural_pdf-25.3.16/examples/ocr_default_test.py +97 -0
  63. natural_pdf-25.3.16/examples/ocr_engine_comparison.py +235 -0
  64. natural_pdf-25.3.16/examples/ocr_example.py +89 -0
  65. natural_pdf-25.3.16/examples/ocr_simplified_params.py +79 -0
  66. natural_pdf-25.3.16/examples/ocr_visualization.py +102 -0
  67. natural_pdf-25.3.16/examples/ocr_visualization_test.py +121 -0
  68. natural_pdf-25.3.16/examples/paddle_layout_example.py +315 -0
  69. natural_pdf-25.3.16/examples/paddle_layout_simple.py +74 -0
  70. natural_pdf-25.3.16/examples/paddleocr_example.py +224 -0
  71. natural_pdf-25.3.16/examples/page_collection_example.py +103 -0
  72. natural_pdf-25.3.16/examples/polygon_highlight_example.py +83 -0
  73. natural_pdf-25.3.16/examples/position_methods_example.py +134 -0
  74. natural_pdf-25.3.16/examples/region_boundary_test.py +73 -0
  75. natural_pdf-25.3.16/examples/region_exclusion_test.py +149 -0
  76. natural_pdf-25.3.16/examples/region_expand_example.py +109 -0
  77. natural_pdf-25.3.16/examples/region_image_example.py +116 -0
  78. natural_pdf-25.3.16/examples/region_ocr_test.py +119 -0
  79. natural_pdf-25.3.16/examples/region_sections_example.py +115 -0
  80. natural_pdf-25.3.16/examples/school_books.py +49 -0
  81. natural_pdf-25.3.16/examples/school_books_all.py +52 -0
  82. natural_pdf-25.3.16/examples/scouring.py +36 -0
  83. natural_pdf-25.3.16/examples/section_extraction_example.py +232 -0
  84. natural_pdf-25.3.16/examples/simple_document_qa.py +97 -0
  85. natural_pdf-25.3.16/examples/spatial_navigation_example.py +108 -0
  86. natural_pdf-25.3.16/examples/table_extraction_example.py +135 -0
  87. natural_pdf-25.3.16/examples/table_structure_detection.py +155 -0
  88. natural_pdf-25.3.16/examples/tatr_cells_test.py +56 -0
  89. natural_pdf-25.3.16/examples/tatr_ocr_table_test.py +94 -0
  90. natural_pdf-25.3.16/examples/text_search_example.py +122 -0
  91. natural_pdf-25.3.16/examples/text_style_example.py +110 -0
  92. natural_pdf-25.3.16/examples/tiny-text.py +61 -0
  93. natural_pdf-25.3.16/examples/until_boundaries_example.py +156 -0
  94. natural_pdf-25.3.16/examples/until_example.py +112 -0
  95. natural_pdf-25.3.16/examples/very_basics.py +15 -0
  96. natural_pdf-25.3.16/natural_pdf/__init__.py +55 -0
  97. natural_pdf-25.3.16/natural_pdf/analyzers/__init__.py +9 -0
  98. natural_pdf-25.3.16/natural_pdf/analyzers/document_layout.py +736 -0
  99. natural_pdf-25.3.16/natural_pdf/analyzers/text_structure.py +153 -0
  100. natural_pdf-25.3.16/natural_pdf/core/__init__.py +3 -0
  101. natural_pdf-25.3.16/natural_pdf/core/page.py +2376 -0
  102. natural_pdf-25.3.16/natural_pdf/core/pdf.py +572 -0
  103. natural_pdf-25.3.16/natural_pdf/elements/__init__.py +3 -0
  104. natural_pdf-25.3.16/natural_pdf/elements/base.py +553 -0
  105. natural_pdf-25.3.16/natural_pdf/elements/collections.py +770 -0
  106. natural_pdf-25.3.16/natural_pdf/elements/line.py +124 -0
  107. natural_pdf-25.3.16/natural_pdf/elements/rect.py +122 -0
  108. natural_pdf-25.3.16/natural_pdf/elements/region.py +1366 -0
  109. natural_pdf-25.3.16/natural_pdf/elements/text.py +304 -0
  110. natural_pdf-25.3.16/natural_pdf/ocr/__init__.py +62 -0
  111. natural_pdf-25.3.16/natural_pdf/ocr/easyocr_engine.py +254 -0
  112. natural_pdf-25.3.16/natural_pdf/ocr/engine.py +158 -0
  113. natural_pdf-25.3.16/natural_pdf/ocr/paddleocr_engine.py +263 -0
  114. natural_pdf-25.3.16/natural_pdf/qa/__init__.py +3 -0
  115. natural_pdf-25.3.16/natural_pdf/qa/document_qa.py +405 -0
  116. natural_pdf-25.3.16/natural_pdf/selectors/__init__.py +4 -0
  117. natural_pdf-25.3.16/natural_pdf/selectors/parser.py +360 -0
  118. natural_pdf-25.3.16/natural_pdf/templates/__init__.py +1 -0
  119. natural_pdf-25.3.16/natural_pdf/templates/ocr_debug.html +517 -0
  120. natural_pdf-25.3.16/natural_pdf/utils/__init__.py +4 -0
  121. natural_pdf-25.3.16/natural_pdf/utils/highlighting.py +605 -0
  122. natural_pdf-25.3.16/natural_pdf/utils/ocr.py +515 -0
  123. natural_pdf-25.3.16/natural_pdf/utils/reading_order.py +227 -0
  124. natural_pdf-25.3.16/natural_pdf/utils/visualization.py +151 -0
  125. natural_pdf-25.3.16/natural_pdf.egg-info/PKG-INFO +268 -0
  126. natural_pdf-25.3.16/natural_pdf.egg-info/SOURCES.txt +132 -0
  127. natural_pdf-25.3.16/natural_pdf.egg-info/dependency_links.txt +1 -0
  128. natural_pdf-25.3.16/natural_pdf.egg-info/requires.txt +36 -0
  129. natural_pdf-25.3.16/natural_pdf.egg-info/top_level.txt +3 -0
  130. natural_pdf-25.3.16/pyproject.toml +22 -0
  131. natural_pdf-25.3.16/setup.cfg +4 -0
  132. natural_pdf-25.3.16/setup.py +63 -0
  133. natural_pdf-25.3.16/tests/__init__.py +3 -0
  134. natural_pdf-25.3.16/tests/test_pdf.py +39 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023-2025 Jonathan Soma
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,8 @@
1
+ include README.md
2
+ include LICENSE
3
+ recursive-include natural_pdf/templates *.html
4
+ recursive-include docs *.md *.png *.jpg *.gif
5
+ global-exclude __pycache__
6
+ global-exclude *.py[cod]
7
+ global-exclude *.so
8
+ global-exclude .DS_Store
@@ -0,0 +1,268 @@
1
+ Metadata-Version: 2.2
2
+ Name: natural-pdf
3
+ Version: 25.3.16
4
+ Summary: A more intuitive interface for working with PDFs
5
+ Home-page: https://github.com/jsoma/natural-pdf
6
+ Author: Jonathan Soma
7
+ Author-email: jonathan.soma@gmail.com
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.7
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Requires-Dist: pdfplumber>=0.7.0
15
+ Requires-Dist: Pillow>=8.0.0
16
+ Requires-Dist: colour>=0.1.5
17
+ Requires-Dist: numpy>=1.20.0
18
+ Requires-Dist: doclayout_yolo>=0.0.3
19
+ Requires-Dist: torch>=2.0.0
20
+ Requires-Dist: torchvision>=0.15.0
21
+ Requires-Dist: transformers>=4.30.0
22
+ Requires-Dist: huggingface_hub>=0.19.0
23
+ Provides-Extra: easyocr
24
+ Requires-Dist: easyocr>=1.7.0; extra == "easyocr"
25
+ Provides-Extra: paddle
26
+ Requires-Dist: paddlepaddle>=2.5.0; extra == "paddle"
27
+ Requires-Dist: paddleocr>=2.7.0; extra == "paddle"
28
+ Provides-Extra: qa
29
+ Provides-Extra: core
30
+ Requires-Dist: pdfplumber>=0.7.0; extra == "core"
31
+ Requires-Dist: Pillow>=8.0.0; extra == "core"
32
+ Requires-Dist: colour>=0.1.5; extra == "core"
33
+ Requires-Dist: numpy>=1.20.0; extra == "core"
34
+ Provides-Extra: ai
35
+ Requires-Dist: doclayout_yolo>=0.0.3; extra == "ai"
36
+ Requires-Dist: torch>=2.0.0; extra == "ai"
37
+ Requires-Dist: torchvision>=0.15.0; extra == "ai"
38
+ Requires-Dist: transformers>=4.30.0; extra == "ai"
39
+ Requires-Dist: huggingface_hub>=0.19.0; extra == "ai"
40
+ Provides-Extra: all
41
+ Requires-Dist: easyocr>=1.7.0; extra == "all"
42
+ Requires-Dist: paddlepaddle>=2.5.0; extra == "all"
43
+ Requires-Dist: paddleocr>=2.7.0; extra == "all"
44
+ Dynamic: author
45
+ Dynamic: author-email
46
+ Dynamic: classifier
47
+ Dynamic: description
48
+ Dynamic: description-content-type
49
+ Dynamic: home-page
50
+ Dynamic: provides-extra
51
+ Dynamic: requires-dist
52
+ Dynamic: requires-python
53
+ Dynamic: summary
54
+
55
+ # Natural PDF
56
+
57
+ A friendly library for working with PDFs, built on top of [pdfplumber](https://github.com/jsvine/pdfplumber).
58
+
59
+ Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
60
+
61
+ [Complete documentation here](https://jsoma.github.io/natural-pdf)
62
+
63
+ ## Features
64
+
65
+ - **Fluent API** for chaining operations
66
+ - **CSS-like selectors** for finding elements
67
+ - **Spatial navigation** with intuitive methods like `above()`, `below()`, and `select_until()`
68
+ - **Element collections** for batch operations
69
+ - **Visual highlighting** for debugging
70
+ - **Region visualization** with direct image extraction of specific regions
71
+ - **Text style analysis** for document structure
72
+ - **Exclusion zones** for headers, footers, and other areas to ignore
73
+ - **OCR integration** for extracting text from scanned documents
74
+ - **Document layout analysis** for detecting document structure with ML models
75
+ - **Table extraction** with multiple detection methods
76
+ - **Structured logging** with configurable levels and handlers
77
+
78
+ ## Installation
79
+
80
+ ```bash
81
+ pip install natural-pdf
82
+ ```
83
+
84
+ Or, if you want more control over what gets installed:
85
+
86
+ ```bash
87
+ # Minimal installation without AI models (faster, smaller)
88
+ pip install natural-pdf[core]
89
+
90
+ # With all OCR engines
91
+ pip install natural-pdf[easyocr,paddle]
92
+ ```
93
+
94
+ ## Quick Start
95
+
96
+ ```python
97
+ from natural_pdf import PDF
98
+
99
+ # Open a PDF
100
+ pdf = PDF('document.pdf')
101
+
102
+ # Get the first page
103
+ page = pdf.pages[0]
104
+
105
+ # Find elements using CSS-like selectors
106
+ heading = page.find('text:contains("Summary"):bold')
107
+
108
+ # Extract content below the heading
109
+ content = heading.below().extract_text()
110
+ print(content)
111
+
112
+ # Exclude headers and footers
113
+ page.add_exclusion(page.find('text:contains("CONFIDENTIAL")').above())
114
+ page.add_exclusion(page.find_all('line')[-1].below())
115
+
116
+ # Extract clean text
117
+ clean_text = page.extract_text()
118
+ print(clean_text)
119
+ ```
120
+
121
+ ## Selectors
122
+
123
+ The library supports CSS-like selectors for finding elements:
124
+
125
+ ```python
126
+ # Find text containing a specific string
127
+ element = page.find('text:contains("Revenue")')
128
+
129
+ # Find bold text with a specific font size
130
+ headings = page.find_all('text[size>=12]:bold')
131
+
132
+ # Find thick red lines
133
+ lines = page.find_all('line[width>=2][color~=(1,0,0)]')
134
+ ```
135
+
136
+ ## Spatial Navigation
137
+
138
+ Navigate through the document with intuitive spatial methods:
139
+
140
+ ```python
141
+ # Get content below a heading
142
+ heading = page.find('text:contains("Introduction")')
143
+ content = heading.below().extract_text()
144
+
145
+ # Get content from one element to another
146
+ start = page.find('text:contains("Start")')
147
+ end = page.find('text:contains("End")')
148
+ region = start.select_until(end)
149
+ content = region.extract_text()
150
+ ```
151
+
152
+ ## Exclusion Zones
153
+
154
+ Exclude headers, footers, or other areas from extraction:
155
+
156
+ ```python
157
+ # Page-level exclusion
158
+ page.add_exclusion(page.find('text:contains("Page")').above())
159
+ page.add_exclusion(page.find_all('line')[-1].below())
160
+
161
+ # PDF-level exclusion with lambdas
162
+ pdf.add_exclusion(
163
+ lambda page: page.find('text:contains("Header")').above(),
164
+ label="headers"
165
+ )
166
+
167
+ # Extract text with exclusions applied
168
+ text = pdf.extract_text()
169
+
170
+ # Extract from a specific region with exclusions
171
+ summary = page.find('text:contains("Summary")')
172
+ conclusion = page.find('text:contains("Conclusion")')
173
+ region = page.create_region(summary.x0, summary.top, conclusion.x1, conclusion.bottom)
174
+ region_text = region.extract_text(apply_exclusions=True) # Excludes headers/footers
175
+
176
+ # Disable exclusions for a specific extraction
177
+ full_text = page.extract_text(apply_exclusions=False)
178
+ ```
179
+
180
+ Exclusions are applied efficiently, depending on the region type:
181
+ - Regions that do not intersect any exclusion zone → exclusions are skipped entirely
182
+ - Rectangular regions with header/footer exclusions → optimized cropping
183
+ - Complex regions with partial exclusions → advanced filtering, with a warning
184
+
185
+ ## OCR Integration
186
+
187
+ Extract text from scanned documents using OCR with multiple engine options:
188
+
189
+ ```python
190
+ # Using the default EasyOCR engine
191
+ pdf = PDF('scanned_document.pdf', ocr={
192
+ 'enabled': 'auto', # Only use OCR when necessary
193
+ 'languages': ['en'],
194
+ 'min_confidence': 0.5
195
+ })
196
+
197
+ # Using PaddleOCR for better Asian language support
198
+ pdf = PDF('scanned_document.pdf',
199
+ ocr_engine='paddleocr',
200
+ ocr={
201
+ 'enabled': True,
202
+ 'languages': ['zh-cn', 'en'], # Chinese and English
203
+ 'min_confidence': 0.3,
204
+ 'model_settings': {
205
+ 'use_angle_cls': False, # PaddleOCR-specific setting
206
+ 'rec_batch_num': 6
207
+ }
208
+ })
209
+
210
+ # Extract text, OCR will be used if needed
211
+ text = page.extract_text()
212
+
213
+ # Force OCR regardless of existing text
214
+ ocr_text = page.extract_text(ocr=True)
215
+
216
+ # Find OCR-detected text with high confidence
217
+ high_confidence = page.find_all('text[source=ocr][confidence>=0.8]')
218
+
219
+ # Visualize OCR results with color-coded confidence levels
220
+ for elem in page.find_all('text[source=ocr]'):
221
+     if elem.confidence >= 0.8:
222
+         color = (0, 1, 0, 0.3) # Green for high confidence
223
+     elif elem.confidence >= 0.5:
224
+         color = (1, 1, 0, 0.3) # Yellow for medium confidence
225
+     else:
226
+         color = (1, 0, 0, 0.3) # Red for low confidence
227
+
228
+     elem.highlight(color=color, label=f"OCR ({elem.confidence:.2f})")
229
+ page.save_image('ocr_results.png', labels=True)
230
+ ```
231
+
232
+ ## Logging
233
+
234
+ The library includes a structured logging system to provide visibility into its operations:
235
+
236
+ ```python
237
+ import logging
238
+ from natural_pdf import PDF, configure_logging
239
+
240
+ # Configure logging with INFO level to console
241
+ configure_logging(level=logging.INFO)
242
+
243
+ # Or log to a file with DEBUG level
244
+ file_handler = logging.FileHandler("natural_pdf.log")
245
+ file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
246
+ configure_logging(level=logging.DEBUG, handler=file_handler)
247
+
248
+ # Now operations will generate logs
249
+ pdf = PDF("document.pdf")
250
+ # Log: natural_pdf.core.pdf - INFO - Initializing PDF from document.pdf
251
+
252
+ # Run layout detection with verbose logging
253
+ regions = pdf.pages[0].analyze_layout(
254
+ model="paddle",
255
+ model_params={"verbose": True}
256
+ )
257
+ # Log: natural_pdf.analyzers.layout.paddle - INFO - Starting PaddleLayout detection...
258
+ # Log: natural_pdf.analyzers.layout.paddle - DEBUG - Parameters: confidence=0.2...
259
+ ```
260
+
261
+ Logs follow a hierarchical structure matching the library's module organization:
262
+ - `natural_pdf.core` - Core PDF operations
263
+ - `natural_pdf.analyzers` - Layout analysis operations
264
+ - `natural_pdf.ocr` - OCR engine operations
265
+
266
+ ## More details
267
+
268
+ [Complete documentation here](https://jsoma.github.io/natural-pdf)
@@ -0,0 +1,214 @@
1
+ # Natural PDF
2
+
3
+ A friendly library for working with PDFs, built on top of [pdfplumber](https://github.com/jsvine/pdfplumber).
4
+
5
+ Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
6
+
7
+ [Complete documentation here](https://jsoma.github.io/natural-pdf)
8
+
9
+ ## Features
10
+
11
+ - **Fluent API** for chaining operations
12
+ - **CSS-like selectors** for finding elements
13
+ - **Spatial navigation** with intuitive methods like `above()`, `below()`, and `select_until()`
14
+ - **Element collections** for batch operations
15
+ - **Visual highlighting** for debugging
16
+ - **Region visualization** with direct image extraction of specific regions
17
+ - **Text style analysis** for document structure
18
+ - **Exclusion zones** for headers, footers, and other areas to ignore
19
+ - **OCR integration** for extracting text from scanned documents
20
+ - **Document layout analysis** for detecting document structure with ML models
21
+ - **Table extraction** with multiple detection methods
22
+ - **Structured logging** with configurable levels and handlers
23
+
24
+ ## Installation
25
+
26
+ ```bash
27
+ pip install natural-pdf
28
+ ```
29
+
30
+ Or, if you want more control over what gets installed:
31
+
32
+ ```bash
33
+ # Minimal installation without AI models (faster, smaller)
34
+ pip install natural-pdf[core]
35
+
36
+ # With all OCR engines
37
+ pip install natural-pdf[easyocr,paddle]
38
+ ```
39
+
40
+ ## Quick Start
41
+
42
+ ```python
43
+ from natural_pdf import PDF
44
+
45
+ # Open a PDF
46
+ pdf = PDF('document.pdf')
47
+
48
+ # Get the first page
49
+ page = pdf.pages[0]
50
+
51
+ # Find elements using CSS-like selectors
52
+ heading = page.find('text:contains("Summary"):bold')
53
+
54
+ # Extract content below the heading
55
+ content = heading.below().extract_text()
56
+ print(content)
57
+
58
+ # Exclude headers and footers
59
+ page.add_exclusion(page.find('text:contains("CONFIDENTIAL")').above())
60
+ page.add_exclusion(page.find_all('line')[-1].below())
61
+
62
+ # Extract clean text
63
+ clean_text = page.extract_text()
64
+ print(clean_text)
65
+ ```
66
+
67
+ ## Selectors
68
+
69
+ The library supports CSS-like selectors for finding elements:
70
+
71
+ ```python
72
+ # Find text containing a specific string
73
+ element = page.find('text:contains("Revenue")')
74
+
75
+ # Find bold text with a specific font size
76
+ headings = page.find_all('text[size>=12]:bold')
77
+
78
+ # Find thick red lines
79
+ lines = page.find_all('line[width>=2][color~=(1,0,0)]')
80
+ ```
81
+
82
+ ## Spatial Navigation
83
+
84
+ Navigate through the document with intuitive spatial methods:
85
+
86
+ ```python
87
+ # Get content below a heading
88
+ heading = page.find('text:contains("Introduction")')
89
+ content = heading.below().extract_text()
90
+
91
+ # Get content from one element to another
92
+ start = page.find('text:contains("Start")')
93
+ end = page.find('text:contains("End")')
94
+ region = start.select_until(end)
95
+ content = region.extract_text()
96
+ ```
97
+
98
+ ## Exclusion Zones
99
+
100
+ Exclude headers, footers, or other areas from extraction:
101
+
102
+ ```python
103
+ # Page-level exclusion
104
+ page.add_exclusion(page.find('text:contains("Page")').above())
105
+ page.add_exclusion(page.find_all('line')[-1].below())
106
+
107
+ # PDF-level exclusion with lambdas
108
+ pdf.add_exclusion(
109
+ lambda page: page.find('text:contains("Header")').above(),
110
+ label="headers"
111
+ )
112
+
113
+ # Extract text with exclusions applied
114
+ text = pdf.extract_text()
115
+
116
+ # Extract from a specific region with exclusions
117
+ summary = page.find('text:contains("Summary")')
118
+ conclusion = page.find('text:contains("Conclusion")')
119
+ region = page.create_region(summary.x0, summary.top, conclusion.x1, conclusion.bottom)
120
+ region_text = region.extract_text(apply_exclusions=True) # Excludes headers/footers
121
+
122
+ # Disable exclusions for a specific extraction
123
+ full_text = page.extract_text(apply_exclusions=False)
124
+ ```
125
+
126
+ Exclusions are applied efficiently, depending on the region type:
127
+ - Regions that do not intersect any exclusion zone → exclusions are skipped entirely
128
+ - Rectangular regions with header/footer exclusions → optimized cropping
129
+ - Complex regions with partial exclusions → advanced filtering, with a warning
130
+
131
+ ## OCR Integration
132
+
133
+ Extract text from scanned documents using OCR with multiple engine options:
134
+
135
+ ```python
136
+ # Using the default EasyOCR engine
137
+ pdf = PDF('scanned_document.pdf', ocr={
138
+ 'enabled': 'auto', # Only use OCR when necessary
139
+ 'languages': ['en'],
140
+ 'min_confidence': 0.5
141
+ })
142
+
143
+ # Using PaddleOCR for better Asian language support
144
+ pdf = PDF('scanned_document.pdf',
145
+ ocr_engine='paddleocr',
146
+ ocr={
147
+ 'enabled': True,
148
+ 'languages': ['zh-cn', 'en'], # Chinese and English
149
+ 'min_confidence': 0.3,
150
+ 'model_settings': {
151
+ 'use_angle_cls': False, # PaddleOCR-specific setting
152
+ 'rec_batch_num': 6
153
+ }
154
+ })
155
+
156
+ # Extract text, OCR will be used if needed
157
+ text = page.extract_text()
158
+
159
+ # Force OCR regardless of existing text
160
+ ocr_text = page.extract_text(ocr=True)
161
+
162
+ # Find OCR-detected text with high confidence
163
+ high_confidence = page.find_all('text[source=ocr][confidence>=0.8]')
164
+
165
+ # Visualize OCR results with color-coded confidence levels
166
+ for elem in page.find_all('text[source=ocr]'):
167
+     if elem.confidence >= 0.8:
168
+         color = (0, 1, 0, 0.3) # Green for high confidence
169
+     elif elem.confidence >= 0.5:
170
+         color = (1, 1, 0, 0.3) # Yellow for medium confidence
171
+     else:
172
+         color = (1, 0, 0, 0.3) # Red for low confidence
173
+
174
+     elem.highlight(color=color, label=f"OCR ({elem.confidence:.2f})")
175
+ page.save_image('ocr_results.png', labels=True)
176
+ ```
177
+
178
+ ## Logging
179
+
180
+ The library includes a structured logging system to provide visibility into its operations:
181
+
182
+ ```python
183
+ import logging
184
+ from natural_pdf import PDF, configure_logging
185
+
186
+ # Configure logging with INFO level to console
187
+ configure_logging(level=logging.INFO)
188
+
189
+ # Or log to a file with DEBUG level
190
+ file_handler = logging.FileHandler("natural_pdf.log")
191
+ file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
192
+ configure_logging(level=logging.DEBUG, handler=file_handler)
193
+
194
+ # Now operations will generate logs
195
+ pdf = PDF("document.pdf")
196
+ # Log: natural_pdf.core.pdf - INFO - Initializing PDF from document.pdf
197
+
198
+ # Run layout detection with verbose logging
199
+ regions = pdf.pages[0].analyze_layout(
200
+ model="paddle",
201
+ model_params={"verbose": True}
202
+ )
203
+ # Log: natural_pdf.analyzers.layout.paddle - INFO - Starting PaddleLayout detection...
204
+ # Log: natural_pdf.analyzers.layout.paddle - DEBUG - Parameters: confidence=0.2...
205
+ ```
206
+
207
+ Logs follow a hierarchical structure matching the library's module organization:
208
+ - `natural_pdf.core` - Core PDF operations
209
+ - `natural_pdf.analyzers` - Layout analysis operations
210
+ - `natural_pdf.ocr` - OCR engine operations
211
+
212
+ ## More details
213
+
214
+ [Complete documentation here](https://jsoma.github.io/natural-pdf)