natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +422 -0
- natural_pdf/classification/mixin.py +163 -0
- natural_pdf/classification/results.py +80 -0
- natural_pdf/collections/mixins.py +111 -0
- natural_pdf/collections/pdf_collection.py +434 -15
- natural_pdf/core/element_manager.py +83 -0
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +578 -93
- natural_pdf/core/pdf.py +912 -460
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +712 -109
- natural_pdf/elements/region.py +722 -69
- natural_pdf/elements/text.py +4 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +5 -4
- natural_pdf/extraction/manager.py +135 -0
- natural_pdf/extraction/mixin.py +279 -0
- natural_pdf/extraction/result.py +23 -0
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +122 -26
- natural_pdf/ocr/ocr_options.py +94 -11
- natural_pdf/ocr/utils.py +19 -6
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +431 -230
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +60 -1
- natural_pdf/utils/tqdm_utils.py +51 -0
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -915
- docs/element-selection/index.md +0 -229
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -170
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -209
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -194
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -340
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -147
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -114
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -270
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -332
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -288
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -413
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -508
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2434
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -512
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -604
- docs/tutorials/12-ocr-integration.md +0 -175
- docs/tutorials/13-semantic-search.ipynb +0 -1328
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.7.dist-info/RECORD +0 -145
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
docs/pdf-navigation/index.md
DELETED
@@ -1,97 +0,0 @@
|
|
1
|
-
# PDF Navigation
|
2
|
-
|
3
|
-
This guide covers the basics of working with PDFs in Natural PDF: opening documents, accessing pages, and navigating through content.
|
4
|
-
|
5
|
-
## Opening a PDF
|
6
|
-
|
7
|
-
The main entry point to Natural PDF is the `PDF` class:
|
8
|
-
|
9
|
-
```python
|
10
|
-
from natural_pdf import PDF
|
11
|
-
|
12
|
-
# Open a PDF file
|
13
|
-
pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/0500000US42001.pdf")
|
14
|
-
```
|
15
|
-
|
16
|
-
## Accessing Pages
|
17
|
-
|
18
|
-
Once you have a PDF object, you can access its pages:
|
19
|
-
|
20
|
-
```python
|
21
|
-
# Get the total number of pages
|
22
|
-
num_pages = len(pdf)
|
23
|
-
print(f"This PDF has {num_pages} pages")
|
24
|
-
|
25
|
-
# Get a specific page (0-indexed)
|
26
|
-
first_page = pdf.pages[0]
|
27
|
-
last_page = pdf.pages[-1]
|
28
|
-
|
29
|
-
# Iterate through the first 20 pages
|
30
|
-
for page in pdf.pages[:20]:
|
31
|
-
    print(f"Page {page.number} has {len(page.extract_text())} characters")
|
32
|
-
```
|
33
|
-
|
34
|
-
## Page Properties
|
35
|
-
|
36
|
-
Each `Page` object has useful properties:
|
37
|
-
|
38
|
-
```python
|
39
|
-
# Page dimensions in points (1/72 inch)
|
40
|
-
print(page.width, page.height)
|
41
|
-
|
42
|
-
# Page number (1-indexed as shown in PDF viewers)
|
43
|
-
print(page.number)
|
44
|
-
|
45
|
-
# Page index (0-indexed position in the PDF)
|
46
|
-
print(page.index)
|
47
|
-
```
|
48
|
-
|
49
|
-
## Working Across Pages
|
50
|
-
|
51
|
-
Natural PDF makes it easy to work with content across multiple pages:
|
52
|
-
|
53
|
-
```python
|
54
|
-
# Extract text from all pages
|
55
|
-
all_text = pdf.extract_text()
|
56
|
-
|
57
|
-
# Find elements across all pages
|
58
|
-
all_headings = pdf.find_all('text[size>=14]:bold')
|
59
|
-
|
60
|
-
# Add exclusion zones to all pages (like headers/footers)
|
61
|
-
pdf.add_exclusion(
|
62
|
-
lambda page: page.find('text:contains("CONFIDENTIAL")').above() if page.find('text:contains("CONFIDENTIAL")') else None,
|
63
|
-
label="header"
|
64
|
-
)
|
65
|
-
```
|
66
|
-
|
67
|
-
## The Page Collection
|
68
|
-
|
69
|
-
The `pdf.pages` object is a `PageCollection` that allows batch operations on pages:
|
70
|
-
|
71
|
-
```python
|
72
|
-
# Extract text from specific pages
|
73
|
-
text = pdf.pages[2:5].extract_text()
|
74
|
-
|
75
|
-
# Find elements across specific pages
|
76
|
-
elements = pdf.pages[2:5].find_all('text:contains("Annual Report")')
|
77
|
-
```
|
78
|
-
|
79
|
-
## Document Sections Across Pages
|
80
|
-
|
81
|
-
You can extract sections that span multiple pages:
|
82
|
-
|
83
|
-
```python
|
84
|
-
# Get sections with headings as section starts
|
85
|
-
sections = pdf.pages.get_sections(
|
86
|
-
start_elements='text[size>=14]:bold',
|
87
|
-
new_section_on_page_break=False
|
88
|
-
)
|
89
|
-
```
|
90
|
-
|
91
|
-
## Next Steps
|
92
|
-
|
93
|
-
Now that you know how to navigate PDFs, you can:
|
94
|
-
|
95
|
-
- [Find elements using selectors](../element-selection/index.ipynb)
|
96
|
-
- [Extract text from your documents](../text-extraction/index.ipynb)
|
97
|
-
- [Work with specific regions](../regions/index.ipynb)
|