natural-pdf 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- natural_pdf/__init__.py,sha256=hsSosbPnvDRCfyYAL9bf1haVS6oBxLAl7cbKTWRTHkU,1784
1
+ natural_pdf/__init__.py,sha256=hdqbTG3SHtu8jPIL7su6TpEhEbNsL89pgktCXPMKWCI,2825
2
2
  natural_pdf/analyzers/__init__.py,sha256=BkSmEqw5J76C2fvYHF86EXQJQWWFNIvjSwRMwfW-Ht0,140
3
3
  natural_pdf/analyzers/text_options.py,sha256=9IGRoem1O2mc1ZNGiM5-VPRZ3c8LLwEk1B3is9UxMoE,2777
4
4
  natural_pdf/analyzers/text_structure.py,sha256=e4G6v0bD7ZJCdo6DcuDD3iZt8KAwBfALMduwZHGh0wI,12415
@@ -6,25 +6,28 @@ natural_pdf/analyzers/utils.py,sha256=u5_FAUPmEG1ydPVuxpu7bVw507NB3WzisMNSUhsnuk
6
6
  natural_pdf/analyzers/layout/__init__.py,sha256=oq1uJ5UkGGMbBKGirV1aRKK3hxAUyjTLywYkPCQH1f0,33
7
7
  natural_pdf/analyzers/layout/base.py,sha256=D6KHDsbVKzZWCfW4vt0khPC3TA9JzQD3cF4VtTSyf28,6752
8
8
  natural_pdf/analyzers/layout/docling.py,sha256=iNeD10ZfolDVJNqayAUd0-Bs2tVr5INE7WK9c_Mll_8,11930
9
- natural_pdf/analyzers/layout/layout_analyzer.py,sha256=oQeqPDHL6vpj_3NHuzS5ja7KVAAL7PhQ7IOwustDBBo,8008
10
- natural_pdf/analyzers/layout/layout_manager.py,sha256=Qr5pxcv_Wk5IJRJr0IoYJJAz71RGJvQgqXONNBhNLOw,9221
11
- natural_pdf/analyzers/layout/layout_options.py,sha256=vZvTSg_M27OirZomcC5uWLSmPYXjvnnCEo5QKy9RjaQ,3503
9
+ natural_pdf/analyzers/layout/layout_analyzer.py,sha256=JJasXl7QEiP4DgAvf-zu1w7Uakdf8ypvITkpQ-OQDgA,13340
10
+ natural_pdf/analyzers/layout/layout_manager.py,sha256=6Zi9SBonpa0urWyeQBJnmxIL1hOn4xAx09ugkMrEhro,9555
11
+ natural_pdf/analyzers/layout/layout_options.py,sha256=EmvPEnDsVGMJkDNfn6ORLnX545gbmlo3kVcz4anVm5Q,3325
12
12
  natural_pdf/analyzers/layout/paddle.py,sha256=QCasH_Z9UITX6wRGlE_HjmwkBuANz9Yyw5Yk7QvRVcw,12519
13
- natural_pdf/analyzers/layout/surya.py,sha256=hmPDfXzTkF2PQPgvg1xjTJSBKFuCmjZB3GTCHa-kpA4,6477
13
+ natural_pdf/analyzers/layout/surya.py,sha256=Ibwo42TioJ-BZP3-2T13KCtH3kLSWQh7C9ZYuk1kUQo,12657
14
14
  natural_pdf/analyzers/layout/tatr.py,sha256=H0Xygk9jA46-vlPleoal94cuDyz-LHTSxVb3e6gpmV8,11956
15
15
  natural_pdf/analyzers/layout/yolo.py,sha256=NSQK3TcS1qN8D2MDxCvcwTpS_kvzGy3I2LepJDUceoQ,7699
16
+ natural_pdf/collections/pdf_collection.py,sha256=Da8saWBTguxk16pNzMxCrFwatrWk_qrcG0RVPQybro8,12159
16
17
  natural_pdf/core/__init__.py,sha256=GUuFtj2Apc9biAdUOlnL8leL3BQncEzubvpiAUaU3ss,37
17
18
  natural_pdf/core/element_manager.py,sha256=H1896JSt48ASLSmG22xEXMY-xSKcpYsUlYmYMD48i6Q,17117
18
19
  natural_pdf/core/highlighting_service.py,sha256=a-40UMohOglYrw4klW1GuQ_p3jZOxnAfPOXPORThr4U,31476
19
- natural_pdf/core/page.py,sha256=tnxG-5OhFVuFHt0p-a9YSLU-nXjA8fftg5ViQdH5sOU,68512
20
- natural_pdf/core/pdf.py,sha256=UzxVfVeCnhSN7rxdJresUj_UNFkcFkeaEjLvwZMJS-c,28532
20
+ natural_pdf/core/page.py,sha256=qhumZqmwHoBlGodiCvYE0z34Iu1WSs32V4_Iz_Sfaow,69350
21
+ natural_pdf/core/pdf.py,sha256=MLN-asJ_d5spmCjLz7SDp74t__vioszfKEFooBul7nU,41167
21
22
  natural_pdf/elements/__init__.py,sha256=6FGHZm2oONd8zErahMEawuB4AvJR5jOZPt4KtEwbj80,40
22
23
  natural_pdf/elements/base.py,sha256=9SQ-O2qbQe9Avbf9JI-p6vWlyThZVch-p1yqXWSrBHw,35750
23
24
  natural_pdf/elements/collections.py,sha256=RJf4cBZeLfCtfS0-SjzYFRCtbzYjWsgk3LrcTwJAYMs,62392
24
25
  natural_pdf/elements/line.py,sha256=QvVdhf_K6rwJkq3q67JmgdZpDhrBgWuSMF-Q25malP4,4783
25
26
  natural_pdf/elements/rect.py,sha256=dls9g-R213O78HvfAJMak3_eV14Zh654Zw7hqTTXxDQ,3949
26
- natural_pdf/elements/region.py,sha256=MXQK00LLMvwuq94NigeeCVFoGov_RWFe9ZylnIMpzB0,72453
27
+ natural_pdf/elements/region.py,sha256=5dXHYbbdO1QNgkD6b6I34ezHt-SHKx_aH1ubzbfMHQs,74370
27
28
  natural_pdf/elements/text.py,sha256=OAuy0ozaemj6yjMwhXPsJ76VZtRPeJbmrFTzpDJA2_U,11017
29
+ natural_pdf/exporters/__init__.py,sha256=Nqnn8clbgv-5l0PgxcTOldg8mkMKrFn4TvPL-rYUUGg,1
30
+ natural_pdf/exporters/searchable_pdf.py,sha256=PPkF64hFNNhPlZPuyJRvC_scAg3WCOiIvwgIP8nlZ9E,10225
28
31
  natural_pdf/ocr/__init__.py,sha256=mbUUsCfeU6yRsEqNn3I4Len-XY6FfjfKhTAoWDLA1f4,1943
29
32
  natural_pdf/ocr/engine.py,sha256=xDnvhnm4Lr7d83ezglDqOtl9xfx74zOOTyYW-fZHQEQ,4183
30
33
  natural_pdf/ocr/engine_easyocr.py,sha256=6srZhXqlH3UpNWw5iFq7u4TS5HQsMSTWYuuWo3oYZp8,8273
@@ -34,6 +37,12 @@ natural_pdf/ocr/ocr_manager.py,sha256=mAyCntdAnrNv8TIvGYlGs40G2tDAdMQ_Jqb3owiPWW
34
37
  natural_pdf/ocr/ocr_options.py,sha256=A2CQV172id-90zMpPZWb8CD09ZP0BuQnnCZGEFP4SaQ,3787
35
38
  natural_pdf/qa/__init__.py,sha256=kagdfqNMpTnyzjC2EFy_PBX5us38NnJL548ESSQVzfI,107
36
39
  natural_pdf/qa/document_qa.py,sha256=QYKKor0RqUQcEdFEBEUdq7L0ktq1WSMfQ-ynTc64cPU,15926
40
+ natural_pdf/search/__init__.py,sha256=sYv7-XrSohUgE2UH8sFpGfl66SG092jZoNokZaDdxsY,4125
41
+ natural_pdf/search/haystack_search_service.py,sha256=qhvqVJMxz4-KTnQF0MPO7YLQxTlYe27PCgKJgYeAels,27580
42
+ natural_pdf/search/haystack_utils.py,sha256=BXU5yIEcFIWliSX44slMYLlUMfwCXEfve-ZYmVcEt3k,18773
43
+ natural_pdf/search/search_options.py,sha256=PrIGkvM9A9wpqaz6tDB-9hWiSp9fqhi8mf7FQl1qoGI,3510
44
+ natural_pdf/search/search_service_protocol.py,sha256=5EYzHFUoFvaYw3khnQNz1dsOHqTvBChekvk_qf2mu5w,6811
45
+ natural_pdf/search/searchable_mixin.py,sha256=QPtPSJHCP5n0Twp4uHKSns8J6HuvGjyipTNbB66JFLg,24896
37
46
  natural_pdf/selectors/__init__.py,sha256=Jfk-JBZEpQ7V5FWVGuLJQLH-qOfqNLC2AdicncMhrmY,121
38
47
  natural_pdf/selectors/parser.py,sha256=JK1zDVISACkUhzmzWfQMMW8hvsV422lRBFKgDBWOWC4,24108
39
48
  natural_pdf/templates/__init__.py,sha256=i7N8epDxZoDDsK4p2iUiMwzKVs97i_KtNk8ATArqlC4,19
@@ -45,8 +54,8 @@ natural_pdf/utils/visualization.py,sha256=14BM-K4ovDqHniNbxbP_y9KaEYNlkbpELGAv9_
45
54
  natural_pdf/widgets/__init__.py,sha256=qckw3DjdVTsASPLJ8uUrGKg3MFhvzHndUpeNGlqwg6A,215
46
55
  natural_pdf/widgets/viewer.py,sha256=h_amj_uvf-vRqEsFg4P00fgKxawLAd9jjC1ohUza4BY,37479
47
56
  natural_pdf/widgets/frontend/viewer.js,sha256=w8ywfz_IOAAv2nP_qaf2VBUkF1KhjT3zorhJxM1-CfU,4371
48
- natural_pdf-0.1.1.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
49
- natural_pdf-0.1.1.dist-info/METADATA,sha256=8o22GEPtEqlSqexFQxy6tVoHTB35LmT63sjbjbjORRE,10009
50
- natural_pdf-0.1.1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
51
- natural_pdf-0.1.1.dist-info/top_level.txt,sha256=XtfS3IiR1fTjaQG9TjGDjZsB1Ih2GXQteDbJ2dXlLvQ,12
52
- natural_pdf-0.1.1.dist-info/RECORD,,
57
+ natural_pdf-0.1.3.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
58
+ natural_pdf-0.1.3.dist-info/METADATA,sha256=kBSb1SueOGQFw97pvHBxlJYcuNwxAB-lInLKows0BEs,5069
59
+ natural_pdf-0.1.3.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
60
+ natural_pdf-0.1.3.dist-info/top_level.txt,sha256=XtfS3IiR1fTjaQG9TjGDjZsB1Ih2GXQteDbJ2dXlLvQ,12
61
+ natural_pdf-0.1.3.dist-info/RECORD,,
@@ -1,295 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: natural-pdf
3
- Version: 0.1.1
4
- Summary: A more intuitive interface for working with PDFs
5
- Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
- License-Expression: MIT
7
- Project-URL: Homepage, https://github.com/jsoma/natural-pdf
8
- Project-URL: Repository, https://github.com/jsoma/natural-pdf
9
- Classifier: Programming Language :: Python :: 3
10
- Classifier: Operating System :: OS Independent
11
- Requires-Python: >=3.7
12
- Description-Content-Type: text/markdown
13
- License-File: LICENSE
14
- Requires-Dist: pdfplumber>=0.7.0
15
- Requires-Dist: Pillow>=8.0.0
16
- Requires-Dist: colour>=0.1.5
17
- Requires-Dist: numpy>=1.20.0
18
- Requires-Dist: urllib3>=1.26.0
19
- Requires-Dist: torch>=2.0.0
20
- Requires-Dist: torchvision>=0.15.0
21
- Requires-Dist: transformers>=4.30.0
22
- Requires-Dist: huggingface_hub>=0.19.0
23
- Provides-Extra: interactive
24
- Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "interactive"
25
- Provides-Extra: easyocr
26
- Requires-Dist: easyocr; extra == "easyocr"
27
- Provides-Extra: paddle
28
- Requires-Dist: paddlepaddle; extra == "paddle"
29
- Requires-Dist: paddleocr; extra == "paddle"
30
- Provides-Extra: layout-yolo
31
- Requires-Dist: doclayout_yolo; extra == "layout-yolo"
32
- Provides-Extra: surya
33
- Requires-Dist: surya-ocr; extra == "surya"
34
- Provides-Extra: qa
35
- Provides-Extra: all
36
- Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "all"
37
- Requires-Dist: easyocr; extra == "all"
38
- Requires-Dist: paddlepaddle; extra == "all"
39
- Requires-Dist: paddleocr; extra == "all"
40
- Requires-Dist: doclayout_yolo; extra == "all"
41
- Requires-Dist: surya-ocr; extra == "all"
42
- Dynamic: license-file
43
-
44
- # Natural PDF
45
-
46
- A friendly library for working with PDFs, built on top of [pdfplumber](https://github.com/jsvine/pdfplumber).
47
-
48
- Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
49
-
50
- - [Complete documentation here](https://jsoma.github.io/natural-pdf)
51
- - [Live demos here](https://colab.research.google.com/github/jsoma/natural-pdf/)
52
-
53
- ## Features
54
-
55
- - **Fluent API** for chaining operations
56
- - **CSS-like selectors** for finding elements
57
- - **Spatial navigation** with intuitive methods like `above()`, `below()`, and `select_until()`
58
- - **Element collections** for batch operations
59
- - **Visual highlighting** for debugging (persistent highlights)
60
- - **Interactive element viewer** for Jupyter environments (`.viewer()`)
61
- - **Region visualization** with direct image extraction of specific regions
62
- - **Text style analysis** for document structure
63
- - **Exclusion zones** for headers, footers, and other areas to ignore
64
- - **OCR integration** with multiple engines (EasyOCR, PaddleOCR, Surya)
65
- - **Document layout analysis** for detecting document structure with ML models
66
- - **Table extraction** with multiple detection methods
67
- - **Structured logging** with configurable levels and handlers
68
-
69
- ## Installation
70
-
71
- ```bash
72
- pip install natural-pdf
73
- ```
74
-
75
- This installs the core library along with the required AI dependencies (PyTorch, Transformers).
76
- ```bash
77
- # Install with support for specific OCR and layout engines
78
- pip install natural-pdf[easyocr]
79
- pip install natural-pdf[paddle]
80
- pip install natural-pdf[surya]
81
- pip install natural-pdf[layout-yolo]
82
-
83
- # Install with support for the interactive Jupyter widget
84
- pip install natural-pdf[interactive]
85
-
86
- # Just install everything
87
- pip install natural-pdf[all]
88
- ```
89
-
90
- ## Quick Start
91
-
92
- ```python
93
- from natural_pdf import PDF
94
-
95
- # Open a local PDF
96
- pdf = PDF('document.pdf')
97
-
98
- # Or open a PDF from a URL
99
- pdf = PDF('https://example.com/document.pdf')
100
-
101
- # Get the first page
102
- page = pdf.pages[0]
103
-
104
- # Find elements using CSS-like selectors
105
- heading = page.find('text:contains("Summary"):bold')
106
-
107
- # Extract content below the heading
108
- content = heading.below().extract_text()
109
- print(content)
110
-
111
- # Exclude headers and footers
112
- page.add_exclusion(page.find('text:contains("CONFIDENTIAL")').above())
113
- page.add_exclusion(page.find_all('line')[-1].below())
114
-
115
- # Extract clean text
116
- clean_text = page.extract_text()
117
- print(clean_text)
118
- ```
119
-
120
- - [Complete documentation here](https://jsoma.github.io/natural-pdf)
121
-
122
- ## Selectors
123
-
124
- The library supports CSS-like selectors for finding elements:
125
-
126
- ```python
127
- # Find text containing a specific string
128
- element = page.find('text:contains("Revenue")')
129
-
130
- # Find bold text with a specific font size
131
- headings = page.find_all('text[size>=12]:bold')
132
-
133
- # Find thick red lines
134
- lines = page.find_all('line[width>=2][color~=(1,0,0)]')
135
- ```
136
-
137
- ## Spatial Navigation
138
-
139
- Navigate through the document with intuitive spatial methods:
140
-
141
- ```python
142
- # Get content below a heading
143
- heading = page.find('text:contains("Introduction")')
144
- content = heading.below().extract_text()
145
-
146
- # Get content from one element to another
147
- start = page.find('text:contains("Start")')
148
- end = page.find('text:contains("End")')
149
- region = start.select_until(end)
150
- content = region.extract_text()
151
- ```
152
-
153
- ## Exclusion Zones
154
-
155
- Exclude headers, footers, or other areas from extraction:
156
-
157
- ```python
158
- # Page-level exclusion
159
- page.add_exclusion(page.find('text:contains("Page")').above())
160
- page.add_exclusion(page.find_all('line')[-1].below())
161
-
162
- # PDF-level exclusion with lambdas
163
- pdf.add_exclusion(
164
- lambda page: page.find('text:contains("Header")').above(),
165
- label="headers"
166
- )
167
-
168
- # Extract text with exclusions applied
169
- text = pdf.extract_text()
170
-
171
- # Extract from a specific region with exclusions
172
- summary = page.find('text:contains("Summary")')
173
- conclusion = page.find('text:contains("Conclusion")')
174
- region = page.create_region(summary.x0, summary.top, conclusion.x1, conclusion.bottom)
175
- region_text = region.extract_text(apply_exclusions=True) # Excludes headers/footers
176
-
177
- # Disable exclusions for a specific extraction
178
- full_text = page.extract_text(apply_exclusions=False)
179
- ```
180
-
181
- Exclusions work efficiently with different region types:
182
- - Regions without intersection with exclusion zones → exclusions ignored entirely
183
- - Rectangular regions with header/footer exclusions → optimized cropping
184
- - Complex regions with partial exclusions → advanced filtering with warning
185
-
186
- ## OCR Integration
187
-
188
- Extract text from scanned documents using OCR, with support for multiple engines ([EasyOCR](https://www.jaided.ai/easyocr/), [PaddleOCR](https://paddlepaddle.github.io/PaddleOCR/latest/en/index.html), [Surya](https://github.com/VikParuchuri/surya)):
189
-
190
- ```python
191
- # Apply OCR using a specific engine (e.g., PaddleOCR)
192
- ocr_elements = page.apply_ocr(engine='paddle', languages=['en', 'zh-cn'])
193
-
194
- # Extract text (will use previously applied OCR results if available)
195
- text = page.extract_text()
196
-
197
- # Configure advanced engine options using Options classes
198
- from natural_pdf.ocr import PaddleOCROptions
199
- paddle_opts = PaddleOCROptions(languages=['en'], use_angle_cls=False, rec_batch_num=8)
200
- ocr_elements = page.apply_ocr(engine='paddle', options=paddle_opts)
201
-
202
- # Force OCR regardless of existing text
203
- ocr_text = page.extract_text(ocr=True)
204
-
205
- # Find OCR-detected text with high confidence
206
- high_confidence = page.find_all('text[source=ocr][confidence>=0.8]')
207
-
208
- # Visualize OCR results with color-coded confidence levels
209
- for elem in page.find_all('text[source=ocr]'):
210
-     if elem.confidence >= 0.8:
211
-         color = (0, 1, 0, 0.3) # Green for high confidence
212
-     elif elem.confidence >= 0.5:
213
-         color = (1, 1, 0, 0.3) # Yellow for medium confidence
214
-     else:
215
-         color = (1, 0, 0, 0.3) # Red for low confidence
216
-
217
-     elem.highlight(color=color, label=f"OCR ({elem.confidence:.2f})")
218
- page.save_image('ocr_results.png', labels=True)
219
- ```
220
-
221
- ## Logging
222
-
223
- The library includes a structured logging system to provide visibility into its operations:
224
-
225
- ```python
226
- import logging
227
- from natural_pdf import PDF, configure_logging
228
-
229
- # Configure logging with INFO level to console
230
- configure_logging(level=logging.INFO)
231
-
232
- # Or log to a file with DEBUG level
233
- file_handler = logging.FileHandler("natural_pdf.log")
234
- file_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
235
- configure_logging(level=logging.DEBUG, handler=file_handler)
236
-
237
- # Now operations will generate logs
238
- pdf = PDF("document.pdf")
239
- # Log: natural_pdf.core.pdf - INFO - Initializing PDF from document.pdf
240
-
241
- # Run layout detection with verbose logging
242
- regions = pdf.pages[0].analyze_layout(
243
- model="paddle",
244
- model_params={"verbose": True}
245
- )
246
- # Log: natural_pdf.analyzers.layout.paddle - INFO - Starting PaddleLayout detection...
247
- # Log: natural_pdf.analyzers.layout.paddle - DEBUG - Parameters: confidence=0.2...
248
- ```
249
-
250
- Logs follow a hierarchical structure matching the library's module organization:
251
- - `natural_pdf.core` - Core PDF operations
252
- - `natural_pdf.analyzers` - Layout analysis operations
253
- - `natural_pdf.ocr` - OCR engine operations
254
-
255
- ## Document QA
256
-
257
- Ask questions directly to your documents:
258
-
259
- ```python
260
- # Ask questions about the document content
261
- result = pdf.ask("What was the company's revenue in 2022?")
262
- print(f"Answer: {result['answer']}")
263
- print(f"Confidence: {result['confidence']:.2f}")
264
-
265
- # Access more details in the result dictionary
266
- result = pdf.ask("Who is the CEO?")
267
- print(f"Answer: {result['answer']}")
268
- print(f"Found on page: {result['page_num']}")
269
- print(f"Source text: {result.get('source_text', 'N/A')}")
270
- ```
271
-
272
- ## More details
273
-
274
- [Complete documentation here](https://jsoma.github.io/natural-pdf)
275
-
276
- ## Visual Debugging & Interactive Viewer
277
-
278
- Use highlighting to understand element selection and analysis results. Add persistent highlights using `.highlight()` and view them with the interactive `.viewer()` or static `.save_image()`. You can also generate temporary previews of selected elements using `ElementCollection.show()`.
279
-
280
- ```python
281
- # Highlight selected elements persistently
282
- page.find_all('text:bold').highlight(label="Bold Text")
283
-
284
- # Launch the interactive widget in Jupyter (shows persistent highlights)
285
- # Requires: pip install natural-pdf[interactive]
286
- page.viewer()
287
-
288
- # Save a static image file with highlights and legend
289
- page.save_image("highlighted_page.png", labels=True)
290
-
291
- # Show a temporary preview image of specific elements, grouped by attribute
292
- preview_image = page.find_all('region[type*=table]').show(group_by='type')
293
- # In Jupyter, this image will display automatically
294
- preview_image
295
- ```