natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +422 -0
- natural_pdf/classification/mixin.py +163 -0
- natural_pdf/classification/results.py +80 -0
- natural_pdf/collections/mixins.py +111 -0
- natural_pdf/collections/pdf_collection.py +434 -15
- natural_pdf/core/element_manager.py +83 -0
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +578 -93
- natural_pdf/core/pdf.py +912 -460
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +712 -109
- natural_pdf/elements/region.py +722 -69
- natural_pdf/elements/text.py +4 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +5 -4
- natural_pdf/extraction/manager.py +135 -0
- natural_pdf/extraction/mixin.py +279 -0
- natural_pdf/extraction/result.py +23 -0
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +122 -26
- natural_pdf/ocr/ocr_options.py +94 -11
- natural_pdf/ocr/utils.py +19 -6
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +431 -230
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +60 -1
- natural_pdf/utils/tqdm_utils.py +51 -0
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -915
- docs/element-selection/index.md +0 -229
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -170
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -209
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -194
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -340
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -147
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -114
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -270
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -332
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -288
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -413
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -508
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2434
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -512
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -604
- docs/tutorials/12-ocr-integration.md +0 -175
- docs/tutorials/13-semantic-search.ipynb +0 -1328
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.7.dist-info/RECORD +0 -145
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
docs/api/index.md
DELETED
@@ -1,386 +0,0 @@
|
|
1
|
-
# API Reference
|
2
|
-
|
3
|
-
This section provides detailed documentation for all the classes and methods in Natural PDF.
|
4
|
-
|
5
|
-
## Core Classes
|
6
|
-
|
7
|
-
### PDF Class
|
8
|
-
|
9
|
-
The main entry point for working with PDFs.
|
10
|
-
|
11
|
-
```python
|
12
|
-
class PDF:
|
13
|
-
"""
|
14
|
-
The main entry point for working with PDFs.
|
15
|
-
|
16
|
-
Parameters:
|
17
|
-
path (str): Path to the PDF file.
|
18
|
-
password (str, optional): Password for encrypted PDFs. Default: None
|
19
|
-
reading_order (bool, optional): Sort elements in reading order. Default: True
|
20
|
-
keep_spaces (bool, optional): Keep spaces in word elements. Default: True
|
21
|
-
font_attrs (list, optional): Font attributes to use for text grouping.
|
22
|
-
Default: ['fontname', 'size']
|
23
|
-
ocr (bool/dict/str, optional): OCR configuration. Default: False
|
24
|
-
ocr_engine (str/Engine, optional): OCR engine to use. Default: "easyocr"
|
25
|
-
"""
|
26
|
-
```
|
27
|
-
|
28
|
-
**Main Methods**
|
29
|
-
|
30
|
-
| Method | Description | Parameters | Returns |
|
31
|
-
|--------|-------------|------------|---------|
|
32
|
-
| `pages` | Access pages in the document | N/A (property) | `PageCollection` |
|
33
|
-
| `extract_text(keep_blank_chars=True, apply_exclusions=True)` | Extract text from all pages | `keep_blank_chars`: Whether to keep blank characters<br>`apply_exclusions`: Whether to apply exclusion zones | `str`: Extracted text |
|
34
|
-
| `find(selector, case=True, regex=False, apply_exclusions=True)` | Find first element matching selector across all pages | `selector`: CSS-like selector string<br>`case`: Case-sensitive search<br>`regex`: Use regex for :contains()<br>`apply_exclusions`: Whether to apply exclusion zones | `Element` or `None` |
|
35
|
-
| `find_all(selector, case=True, regex=False, apply_exclusions=True)` | Find all elements matching selector across all pages | `selector`: CSS-like selector string<br>`case`: Case-sensitive search<br>`regex`: Use regex for :contains()<br>`apply_exclusions`: Whether to apply exclusion zones | `ElementCollection` |
|
36
|
-
| `add_exclusion(func, label=None)` | Add a document-wide exclusion zone | `func`: Function taking a page and returning region<br>`label`: Optional label for the exclusion | `None` |
|
37
|
-
| `get_sections(start_elements, end_elements=None, boundary_inclusion='start')` | Get sections across all pages | `start_elements`: Elements marking section starts<br>`end_elements`: Elements marking section ends<br>`boundary_inclusion`: How to include boundaries ('start', 'end', 'both', 'none') | `list[Region]` |
|
38
|
-
| `ask(question, min_confidence=0.0, model=None)` | Ask a question about the document content | `question`: Question to ask<br>`min_confidence`: Minimum confidence threshold<br>`model`: Optional model name or path | `dict`: Result with answer and metadata |
|
39
|
-
|
40
|
-
### Page Class
|
41
|
-
|
42
|
-
Represents a single page in a PDF document.
|
43
|
-
|
44
|
-
```python
|
45
|
-
class Page:
|
46
|
-
"""
|
47
|
-
Represents a single page in a PDF document.
|
48
|
-
|
49
|
-
Properties:
|
50
|
-
page_number (int): 1-indexed page number
|
51
|
-
page_index (int): 0-indexed page position
|
52
|
-
width (float): Page width in points
|
53
|
-
height (float): Page height in points
|
54
|
-
pdf (PDF): Parent PDF object
|
55
|
-
"""
|
56
|
-
```
|
57
|
-
|
58
|
-
**Main Methods**
|
59
|
-
|
60
|
-
| Method | Description | Parameters | Returns |
|
61
|
-
|--------|-------------|------------|---------|
|
62
|
-
| `extract_text(keep_blank_chars=True, apply_exclusions=True, ocr=None)` | Extract text from the page | `keep_blank_chars`: Whether to keep blank characters<br>`apply_exclusions`: Whether to apply exclusion zones<br>`ocr`: Whether to force OCR | `str`: Extracted text |
|
63
|
-
| `find(selector, case=True, regex=False, apply_exclusions=True)` | Find the first element matching selector | `selector`: CSS-like selector string<br>`case`: Case-sensitive search<br>`regex`: Use regex for :contains()<br>`apply_exclusions`: Whether to apply exclusion zones | `Element` or `None` |
|
64
|
-
| `find_all(selector, case=True, regex=False, apply_exclusions=True)` | Find all elements matching selector | `selector`: CSS-like selector string<br>`case`: Case-sensitive search<br>`regex`: Use regex for :contains()<br>`apply_exclusions`: Whether to apply exclusion zones | `ElementCollection` |
|
65
|
-
| `create_region(x0, top, x1, bottom)` | Create a region at specific coordinates | `x0`: Left coordinate<br>`top`: Top coordinate<br>`x1`: Right coordinate<br>`bottom`: Bottom coordinate | `Region` |
|
66
|
-
| `highlight(elements, color=None, label=None)` | Highlight elements on the page | `elements`: Elements to highlight<br>`color`: RGBA color tuple<br>`label`: Label for the highlight | `Page` (self) |
|
67
|
-
| `highlight_all(include_types=None, include_text_styles=False, include_layout_regions=False)` | Highlight all elements on the page | `include_types`: Element types to include<br>`include_text_styles`: Whether to include text styles<br>`include_layout_regions`: Whether to include layout regions | `Page` (self) |
|
68
|
-
| `save_image(path, resolution=72, labels=True)` | Save an image of the page with highlights | `path`: Path to save image<br>`resolution`: Image resolution in DPI<br>`labels`: Whether to include labels | `None` |
|
69
|
-
| `to_image(resolution=72, labels=True)` | Get a PIL Image of the page with highlights | `resolution`: Image resolution in DPI<br>`labels`: Whether to include labels | `PIL.Image` |
|
70
|
-
| `analyze_text_styles()` | Group text by visual style properties | None | `dict`: Mapping of style name to elements |
|
71
|
-
| `analyze_layout(engine="yolo", confidence=0.2, existing="replace")` | Detect layout regions using ML models | `model`: Model to use ("yolo", "tatr")<br>`confidence`: Confidence threshold<br>`existing`: How to handle existing regions | `ElementCollection`: Detected regions |
|
72
|
-
| `add_exclusion(region, label=None)` | Add an exclusion zone to the page | `region`: Region to exclude<br>`label`: Optional label for the exclusion | `Region`: The exclusion region |
|
73
|
-
| `get_sections(start_elements, end_elements=None, boundary_inclusion='start')` | Get sections from the page | `start_elements`: Elements marking section starts<br>`end_elements`: Elements marking section ends<br>`boundary_inclusion`: How to include boundaries | `list[Region]` |
|
74
|
-
| `ask(question, min_confidence=0.0, model=None, debug=False)` | Ask a question about the page content | `question`: Question to ask<br>`min_confidence`: Minimum confidence threshold<br>`model`: Optional model name or path<br>`debug`: Whether to save debug files | `dict`: Result with answer and metadata |
|
75
|
-
| `apply_ocr(languages=None, min_confidence=0.0, **kwargs)` | Apply OCR to the page | `languages`: Languages to use<br>`min_confidence`: Minimum confidence threshold<br>`**kwargs`: Additional OCR engine parameters | `ElementCollection`: OCR text elements |
|
76
|
-
|
77
|
-
### Region Class
|
78
|
-
|
79
|
-
Represents a rectangular area on a page.
|
80
|
-
|
81
|
-
```python
|
82
|
-
class Region:
|
83
|
-
"""
|
84
|
-
Represents a rectangular area on a page.
|
85
|
-
|
86
|
-
Properties:
|
87
|
-
x0 (float): Left coordinate
|
88
|
-
top (float): Top coordinate
|
89
|
-
x1 (float): Right coordinate
|
90
|
-
bottom (float): Bottom coordinate
|
91
|
-
width (float): Width of the region
|
92
|
-
height (float): Height of the region
|
93
|
-
page (Page): Parent page object
|
94
|
-
"""
|
95
|
-
```
|
96
|
-
|
97
|
-
**Main Methods**
|
98
|
-
|
99
|
-
| Method | Description | Parameters | Returns |
|
100
|
-
|--------|-------------|------------|---------|
|
101
|
-
| `extract_text(keep_blank_chars=True, apply_exclusions=True, ocr=None)` | Extract text from the region | `keep_blank_chars`: Whether to keep blank characters<br>`apply_exclusions`: Whether to apply exclusion zones<br>`ocr`: Whether to force OCR | `str`: Extracted text |
|
102
|
-
| `find(selector, case=True, regex=False, apply_exclusions=True)` | Find the first element matching selector within the region | `selector`: CSS-like selector string<br>`case`: Case-sensitive search<br>`regex`: Use regex for :contains()<br>`apply_exclusions`: Whether to apply exclusion zones | `Element` or `None` |
|
103
|
-
| `find_all(selector, case=True, regex=False, apply_exclusions=True)` | Find all elements matching selector within the region | `selector`: CSS-like selector string<br>`case`: Case-sensitive search<br>`regex`: Use regex for :contains()<br>`apply_exclusions`: Whether to apply exclusion zones | `ElementCollection` |
|
104
|
-
| `expand(left=0, top=0, right=0, bottom=0, width_factor=1.0, height_factor=1.0)` | Expand the region in specified directions | `left/top/right/bottom`: Points to expand in each direction<br>`width_factor/height_factor`: Scale width/height by this factor | `Region`: Expanded region |
|
105
|
-
| `highlight(color=None, label=None, include_attrs=None)` | Highlight the region | `color`: RGBA color tuple<br>`label`: Label for the highlight<br>`include_attrs`: Region attributes to display | `Region` (self) |
|
106
|
-
| `to_image(resolution=72, crop_only=False)` | Get a PIL Image of just the region | `resolution`: Image resolution in DPI<br>`crop_only`: Whether to exclude border | `PIL.Image` |
|
107
|
-
| `save_image(path, resolution=72, crop_only=False)` | Save an image of just the region | `path`: Path to save image<br>`resolution`: Image resolution in DPI<br>`crop_only`: Whether to exclude border | `None` |
|
108
|
-
| `get_sections(start_elements, end_elements=None, boundary_inclusion='start')` | Get sections within the region | `start_elements`: Elements marking section starts<br>`end_elements`: Elements marking section ends<br>`boundary_inclusion`: How to include boundaries | `list[Region]` |
|
109
|
-
| `ask(question, min_confidence=0.0, model=None, debug=False)` | Ask a question about the region content | `question`: Question to ask<br>`min_confidence`: Minimum confidence threshold<br>`model`: Optional model name or path<br>`debug`: Whether to save debug files | `dict`: Result with answer and metadata |
|
110
|
-
| `extract_table(method=None, table_settings=None, use_ocr=False)` | Extract table data from the region | `method`: Extraction method ("plumber", "tatr")<br>`table_settings`: Custom settings for extraction<br>`use_ocr`: Whether to use OCR text | `list`: Table data as rows and columns |
|
111
|
-
| `intersects(other)` | Check if this region intersects with another | `other`: Another region | `bool`: True if regions intersect |
|
112
|
-
| `contains(x, y)` | Check if a point is within the region | `x`: X coordinate<br>`y`: Y coordinate | `bool`: True if point is in region |
|
113
|
-
|
114
|
-
## Element Types
|
115
|
-
|
116
|
-
### Element Base Class
|
117
|
-
|
118
|
-
The base class for all PDF elements.
|
119
|
-
|
120
|
-
```python
|
121
|
-
class Element:
|
122
|
-
"""
|
123
|
-
Base class for all PDF elements.
|
124
|
-
|
125
|
-
Properties:
|
126
|
-
x0 (float): Left coordinate
|
127
|
-
top (float): Top coordinate
|
128
|
-
x1 (float): Right coordinate
|
129
|
-
bottom (float): Bottom coordinate
|
130
|
-
width (float): Width of the element
|
131
|
-
height (float): Height of the element
|
132
|
-
page (Page): Parent page object
|
133
|
-
"""
|
134
|
-
```
|
135
|
-
|
136
|
-
**Main Methods**
|
137
|
-
|
138
|
-
| Method | Description | Parameters | Returns |
|
139
|
-
|--------|-------------|------------|---------|
|
140
|
-
| `above(height=None, full_width=True, until=None, include_until=True)` | Create a region above the element | `height`: Height of region<br>`full_width`: Whether to span page width<br>`until`: Selector for boundary<br>`include_until`: Whether to include boundary | `Region` |
|
141
|
-
| `below(height=None, full_width=True, until=None, include_until=True)` | Create a region below the element | `height`: Height of region<br>`full_width`: Whether to span page width<br>`until`: Selector for boundary<br>`include_until`: Whether to include boundary | `Region` |
|
142
|
-
| `select_until(selector, include_endpoint=True, full_width=True)` | Create a region from this element to another | `selector`: Selector for endpoint<br>`include_endpoint`: Whether to include endpoint<br>`full_width`: Whether to span page width | `Region` |
|
143
|
-
| `highlight(color=None, label=None, include_attrs=None)` | Highlight this element | `color`: RGBA color tuple<br>`label`: Label for the highlight<br>`include_attrs`: Element attributes to display | `Element` (self) |
|
144
|
-
| `extract_text(keep_blank_chars=True, apply_exclusions=True)` | Extract text from this element | `keep_blank_chars`: Whether to keep blank characters<br>`apply_exclusions`: Whether to apply exclusion zones | `str`: Extracted text |
|
145
|
-
| `next(selector=None, limit=None, apply_exclusions=True)` | Get the next element in reading order | `selector`: Optional selector to filter<br>`limit`: How many elements to search<br>`apply_exclusions`: Whether to apply exclusion zones | `Element` or `None` |
|
146
|
-
| `prev(selector=None, limit=None, apply_exclusions=True)` | Get the previous element in reading order | `selector`: Optional selector to filter<br>`limit`: How many elements to search<br>`apply_exclusions`: Whether to apply exclusion zones | `Element` or `None` |
|
147
|
-
| `nearest(selector, max_distance=None, apply_exclusions=True)` | Get the nearest element matching selector | `selector`: Selector for elements<br>`max_distance`: Maximum distance in points<br>`apply_exclusions`: Whether to apply exclusion zones | `Element` or `None` |
|
148
|
-
|
149
|
-
### TextElement
|
150
|
-
|
151
|
-
Represents text elements in the PDF.
|
152
|
-
|
153
|
-
```python
|
154
|
-
class TextElement(Element):
|
155
|
-
"""
|
156
|
-
Represents text elements in the PDF.
|
157
|
-
|
158
|
-
Additional Properties:
|
159
|
-
text (str): The text content
|
160
|
-
fontname (str): The font name
|
161
|
-
size (float): The font size
|
162
|
-
bold (bool): Whether the text is bold
|
163
|
-
italic (bool): Whether the text is italic
|
164
|
-
color (tuple): The text color as RGB tuple
|
165
|
-
confidence (float): OCR confidence (for OCR text)
|
166
|
-
source (str): 'pdf' or 'ocr'
|
167
|
-
"""
|
168
|
-
```
|
169
|
-
|
170
|
-
**Main Properties**
|
171
|
-
|
172
|
-
| Property | Type | Description |
|
173
|
-
|----------|------|-------------|
|
174
|
-
| `text` | `str` | The text content |
|
175
|
-
| `fontname` | `str` | The font name |
|
176
|
-
| `size` | `float` | The font size |
|
177
|
-
| `bold` | `bool` | Whether the text is bold |
|
178
|
-
| `italic` | `bool` | Whether the text is italic |
|
179
|
-
| `color` | `tuple` | The text color as RGB tuple |
|
180
|
-
| `confidence` | `float` | OCR confidence (for OCR text) |
|
181
|
-
| `source` | `str` | 'pdf' or 'ocr' |
|
182
|
-
| `font_variant` | `str` | Font variant identifier (e.g., 'AAAAAB+') |
|
183
|
-
|
184
|
-
**Additional Methods**
|
185
|
-
|
186
|
-
| Method | Description | Parameters | Returns |
|
187
|
-
|--------|-------------|------------|---------|
|
188
|
-
| `font_info()` | Get detailed font information | None | `dict`: Font properties |
|
189
|
-
|
190
|
-
## Collections
|
191
|
-
|
192
|
-
### ElementCollection
|
193
|
-
|
194
|
-
A collection of elements with batch operations.
|
195
|
-
|
196
|
-
```python
|
197
|
-
class ElementCollection:
|
198
|
-
"""
|
199
|
-
A collection of elements with batch operations.
|
200
|
-
|
201
|
-
This class provides operations that can be applied to multiple elements at once.
|
202
|
-
"""
|
203
|
-
```
|
204
|
-
|
205
|
-
**Main Methods**
|
206
|
-
|
207
|
-
| Method | Description | Parameters | Returns |
|
208
|
-
|--------|-------------|------------|---------|
|
209
|
-
| `extract_text(keep_blank_chars=True, apply_exclusions=True)` | Extract text from all elements | `keep_blank_chars`: Whether to keep blank characters<br>`apply_exclusions`: Whether to apply exclusion zones | `str`: Extracted text |
|
210
|
-
| `filter(selector)` | Filter elements by selector | `selector`: CSS-like selector string | `ElementCollection` |
|
211
|
-
| `highlight(color=None, label=None, include_attrs=None)` | Highlight all elements | `color`: RGBA color tuple<br>`label`: Label for the highlight<br>`include_attrs`: Attributes to display | `ElementCollection` (self) |
|
212
|
-
| `first` | Get the first element in the collection | N/A (property) | `Element` or `None` |
|
213
|
-
| `last` | Get the last element in the collection | N/A (property) | `Element` or `None` |
|
214
|
-
| `highest()` | Get the highest element on the page | None | `Element` or `None` |
|
215
|
-
| `lowest()` | Get the lowest element on the page | None | `Element` or `None` |
|
216
|
-
| `leftmost()` | Get the leftmost element on the page | None | `Element` or `None` |
|
217
|
-
| `rightmost()` | Get the rightmost element on the page | None | `Element` or `None` |
|
218
|
-
| `__len__()` | Get the number of elements | None | `int` |
|
219
|
-
| `__getitem__(index)` | Get an element by index | `index`: Index or slice | `Element` or `ElementCollection` |
|
220
|
-
|
221
|
-
### PageCollection
|
222
|
-
|
223
|
-
A collection of pages with cross-page operations.
|
224
|
-
|
225
|
-
```python
|
226
|
-
class PageCollection:
|
227
|
-
"""
|
228
|
-
A collection of pages with cross-page operations.
|
229
|
-
|
230
|
-
This class provides operations that can be applied across multiple pages.
|
231
|
-
"""
|
232
|
-
```
|
233
|
-
|
234
|
-
**Main Methods**
|
235
|
-
|
236
|
-
| Method | Description | Parameters | Returns |
|
237
|
-
|--------|-------------|------------|---------|
|
238
|
-
| `extract_text(keep_blank_chars=True, apply_exclusions=True)` | Extract text from all pages | `keep_blank_chars`: Whether to keep blank characters<br>`apply_exclusions`: Whether to apply exclusion zones | `str`: Extracted text |
|
239
|
-
| `find(selector, case=True, regex=False, apply_exclusions=True)` | Find the first element matching selector across all pages | `selector`: CSS-like selector string<br>`case`: Case-sensitive search<br>`regex`: Use regex for :contains()<br>`apply_exclusions`: Whether to apply exclusion zones | `Element` or `None` |
|
240
|
-
| `find_all(selector, case=True, regex=False, apply_exclusions=True)` | Find all elements matching selector across all pages | `selector`: CSS-like selector string<br>`case`: Case-sensitive search<br>`regex`: Use regex for :contains()<br>`apply_exclusions`: Whether to apply exclusion zones | `ElementCollection` |
|
241
|
-
| `get_sections(start_elements, end_elements=None, boundary_inclusion='start', new_section_on_page_break=False)` | Get sections spanning multiple pages | `start_elements`: Elements marking section starts<br>`end_elements`: Elements marking section ends<br>`boundary_inclusion`: How to include boundaries<br>`new_section_on_page_break`: Whether to start new sections at page breaks | `list[Region]` |
|
242
|
-
| `__len__()` | Get the number of pages | None | `int` |
|
243
|
-
| `__getitem__(index)` | Get a page by index | `index`: Index or slice | `Page` or `PageCollection` |
|
244
|
-
|
245
|
-
## OCR Classes
|
246
|
-
|
247
|
-
### OCREngine
|
248
|
-
|
249
|
-
Base class for OCR engines.
|
250
|
-
|
251
|
-
```python
|
252
|
-
class OCREngine:
|
253
|
-
"""
|
254
|
-
Base class for OCR engines.
|
255
|
-
|
256
|
-
This class provides the interface for OCR engines.
|
257
|
-
"""
|
258
|
-
```
|
259
|
-
|
260
|
-
**Main Methods**
|
261
|
-
|
262
|
-
| Method | Description | Parameters | Returns |
|
263
|
-
|--------|-------------|------------|---------|
|
264
|
-
| `process_image(image, languages=None, min_confidence=0.0, **kwargs)` | Process an image with OCR | `image`: PIL Image<br>`languages`: Languages to use<br>`min_confidence`: Minimum confidence threshold | `list`: OCR results |
|
265
|
-
|
266
|
-
### EasyOCREngine
|
267
|
-
|
268
|
-
OCR engine using EasyOCR.
|
269
|
-
|
270
|
-
```python
|
271
|
-
class EasyOCREngine(OCREngine):
|
272
|
-
"""
|
273
|
-
OCR engine using EasyOCR.
|
274
|
-
|
275
|
-
Parameters:
|
276
|
-
model_dir (str, optional): Directory for models. Default: None
|
277
|
-
"""
|
278
|
-
```
|
279
|
-
|
280
|
-
### PaddleOCREngine
|
281
|
-
|
282
|
-
OCR engine using PaddleOCR.
|
283
|
-
|
284
|
-
```python
|
285
|
-
class PaddleOCREngine(OCREngine):
|
286
|
-
"""
|
287
|
-
OCR engine using PaddleOCR.
|
288
|
-
|
289
|
-
Parameters:
|
290
|
-
use_angle_cls (bool, optional): Use text direction classification. Default: False
|
291
|
-
lang (str, optional): Language code. Default: "en"
|
292
|
-
det (bool, optional): Use text detection. Default: True
|
293
|
-
rec (bool, optional): Use text recognition. Default: True
|
294
|
-
cls (bool, optional): Use text direction classification. Default: False
|
295
|
-
det_model_dir (str, optional): Detection model directory. Default: None
|
296
|
-
rec_model_dir (str, optional): Recognition model directory. Default: None
|
297
|
-
verbose (bool, optional): Enable verbose output. Default: False
|
298
|
-
"""
|
299
|
-
```
|
300
|
-
|
301
|
-
## Document QA Classes
|
302
|
-
|
303
|
-
### DocumentQA
|
304
|
-
|
305
|
-
Class for document question answering.
|
306
|
-
|
307
|
-
```python
|
308
|
-
class DocumentQA:
|
309
|
-
"""
|
310
|
-
Class for document question answering.
|
311
|
-
|
312
|
-
Parameters:
|
313
|
-
model (str, optional): Model name or path. Default: "microsoft/layoutlmv3-base"
|
314
|
-
device (str, optional): Device to use. Default: "cpu"
|
315
|
-
verbose (bool, optional): Enable verbose output. Default: False
|
316
|
-
"""
|
317
|
-
```
|
318
|
-
|
319
|
-
**Main Methods**
|
320
|
-
|
321
|
-
| Method | Description | Parameters | Returns |
|
322
|
-
|--------|-------------|------------|---------|
|
323
|
-
| `ask(question, image, word_boxes, min_confidence=0.0, max_answer_length=None, language=None)` | Ask a question about a document | `question`: Question to ask<br>`image`: Document image<br>`word_boxes`: Text positions<br>`min_confidence`: Minimum confidence threshold<br>`max_answer_length`: Maximum answer length<br>`language`: Language code | `dict`: Result with answer and metadata |
|
324
|
-
|
325
|
-
## Selector Syntax
|
326
|
-
|
327
|
-
Natural PDF uses a CSS-like selector syntax to find elements in PDFs.
|
328
|
-
|
329
|
-
### Basic Selectors
|
330
|
-
|
331
|
-
| Selector | Description | Example |
|
332
|
-
|----------|-------------|---------|
|
333
|
-
| `element_type` | Match elements of this type | `text`, `rect`, `line` |
|
334
|
-
| `[attribute=value]` | Match elements with this attribute value | `[fontname=Arial]`, `[size=12]` |
|
335
|
-
| `[attribute>=value]` | Match elements with attribute >= value | `[size>=12]` |
|
336
|
-
| `[attribute<=value]` | Match elements with attribute <= value | `[size<=10]` |
|
337
|
-
| `[attribute~=value]` | Match elements with attribute approximately equal | `[color~=red]`, `[color~=(1,0,0)]` |
|
338
|
-
| `[attribute*=value]` | Match elements with attribute containing value | `[fontname*=Arial]` |
|
339
|
-
|
340
|
-
### Pseudo-Classes
|
341
|
-
|
342
|
-
| Pseudo-Class | Description | Example |
|
343
|
-
|--------------|-------------|---------|
|
344
|
-
| `:contains("text")` | Match elements containing text | `text:contains("Summary")` |
|
345
|
-
| `:starts-with("text")` | Match elements starting with text | `text:starts-with("Summary")` |
|
346
|
-
| `:ends-with("text")` | Match elements ending with text | `text:ends-with("2023")` |
|
347
|
-
| `:bold` | Match bold text | `text:bold` |
|
348
|
-
| `:italic` | Match italic text | `text:italic` |
|
349
|
-
|
350
|
-
### Attribute Names
|
351
|
-
|
352
|
-
| Attribute | Element Types | Description |
|
353
|
-
|-----------|--------------|-------------|
|
354
|
-
| `fontname` | text | Font name |
|
355
|
-
| `size` | text | Font size |
|
356
|
-
| `color` | text, rect, line | Color |
|
357
|
-
| `width` | rect, line | Width |
|
358
|
-
| `height` | rect | Height |
|
359
|
-
| `confidence` | text (OCR) | OCR confidence score |
|
360
|
-
| `source` | text | Source ('pdf' or 'ocr') |
|
361
|
-
| `type` | region | Region type (e.g., 'table', 'title') |
|
362
|
-
| `model` | region | Layout model that detected the region |
|
363
|
-
| `font-variant` | text | Font variant identifier |
|
364
|
-
|
365
|
-
## Constants and Configuration
|
366
|
-
|
367
|
-
### Color Names
|
368
|
-
|
369
|
-
Natural PDF supports color names in selectors.
|
370
|
-
|
371
|
-
| Color Name | RGB Value | Example |
|
372
|
-
|------------|-----------|---------|
|
373
|
-
| `red` | (1, 0, 0) | `[color~=red]` |
|
374
|
-
| `green` | (0, 1, 0) | `[color~=green]` |
|
375
|
-
| `blue` | (0, 0, 1) | `[color~=blue]` |
|
376
|
-
| `black` | (0, 0, 0) | `[color~=black]` |
|
377
|
-
| `white` | (1, 1, 1) | `[color~=white]` |
|
378
|
-
|
379
|
-
### Region Types
|
380
|
-
|
381
|
-
Layout analysis models detect the following region types:
|
382
|
-
|
383
|
-
| Model | Region Types |
|
384
|
-
|-------|-------------|
|
385
|
-
| YOLO | `title`, `plain-text`, `table`, `figure`, `figure_caption`, `table_caption`, `table_footnote`, `isolate_formula`, `formula_caption`, `abandon` |
|
386
|
-
| TATR | `table`, `table-row`, `table-column`, `table-column-header` |
|
docs/assets/favicon.png
DELETED
@@ -1,3 +0,0 @@
|
|
1
|
-
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="#4051b5" width="32" height="32">
|
2
|
-
<path d="M14,2H6A2,2 0 0,0 4,4V20A2,2 0 0,0 6,22H18A2,2 0 0,0 20,20V8L14,2M18,20H6V4H13V9H18V20M16,11V18.1L13.9,16L11.1,18.8L8.3,16L11.1,13.2L8.9,11L11.1,8.8L16,13.7V11H16Z" />
|
3
|
-
</svg>
|
docs/assets/favicon.svg
DELETED
@@ -1,3 +0,0 @@
|
|
1
|
-
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="#4051b5" width="32" height="32">
|
2
|
-
<path d="M14,2H6A2,2 0 0,0 4,4V20A2,2 0 0,0 6,22H18A2,2 0 0,0 20,20V8L14,2M18,20H6V4H13V9H18V20M16,11V18.1L13.9,16L11.1,18.8L8.3,16L11.1,13.2L8.9,11L11.1,8.8L16,13.7V11H16Z" />
|
3
|
-
</svg>
|
@@ -1,17 +0,0 @@
|
|
1
|
-
// Natural PDF custom script
|
2
|
-
|
3
|
-
// Add homepage class for styling
|
4
|
-
document.addEventListener('DOMContentLoaded', function() {
|
5
|
-
const path = window.location.pathname;
|
6
|
-
if (path === '/' ||
|
7
|
-
path === '/index.html' ||
|
8
|
-
path.endsWith('/') && !path.endsWith('/index.html') && path.split('/').filter(Boolean).length <= 1) {
|
9
|
-
document.body.classList.add('homepage');
|
10
|
-
}
|
11
|
-
|
12
|
-
// Add animation classes to feature cards
|
13
|
-
document.querySelectorAll('.feature-card').forEach((card, index) => {
|
14
|
-
card.style.animationDelay = `${index * 0.1}s`;
|
15
|
-
card.classList.add('animate-in');
|
16
|
-
});
|
17
|
-
});
|
docs/assets/logo.svg
DELETED
@@ -1,3 +0,0 @@
|
|
1
|
-
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="#4051b5" width="240" height="240">
|
2
|
-
<path d="M14,2H6A2,2 0 0,0 4,4V20A2,2 0 0,0 6,22H18A2,2 0 0,0 20,20V8L14,2M18,20H6V4H13V9H18V20M16,11V18.1L13.9,16L11.1,18.8L8.3,16L11.1,13.2L8.9,11L11.1,8.8L16,13.7V11H16Z" />
|
3
|
-
</svg>
|
docs/assets/sample-screen.png
DELETED
Binary file
|
docs/assets/social-preview.png
DELETED
@@ -1,17 +0,0 @@
|
|
1
|
-
<svg xmlns="http://www.w3.org/2000/svg" width="1200" height="630" viewBox="0 0 1200 630">
|
2
|
-
<rect width="1200" height="630" fill="#3f51b5"/>
|
3
|
-
<rect x="50" y="50" width="1100" height="530" rx="15" fill="#fff" opacity="0.1"/>
|
4
|
-
|
5
|
-
<!-- Logo -->
|
6
|
-
<path d="M300,200 H200 Q180,200 180,220 V410 Q180,430 200,430 H300 Q320,430 320,410 V220 Q320,200 300,200 M300,430 H200 V200 H270 V250 H300 V430 M290,270 V350 L270,330 L240,360 L210,330 L240,300 L220,280 L240,260 L290,310 V270 H290Z" fill="#fff"/>
|
7
|
-
|
8
|
-
<!-- Text -->
|
9
|
-
<text x="380" y="290" font-family="'Roboto', sans-serif" font-size="100" font-weight="700" fill="#fff">Natural PDF</text>
|
10
|
-
<text x="380" y="360" font-family="'Roboto', sans-serif" font-size="40" font-weight="300" fill="#fff">A more intuitive way to work with PDFs</text>
|
11
|
-
|
12
|
-
<!-- Decorative elements -->
|
13
|
-
<rect x="380" y="380" width="500" height="4" rx="2" fill="#fff"/>
|
14
|
-
<circle cx="150" cy="150" r="30" fill="#fff" opacity="0.1"/>
|
15
|
-
<circle cx="1050" cy="480" r="50" fill="#fff" opacity="0.1"/>
|
16
|
-
<circle cx="950" cy="150" r="20" fill="#fff" opacity="0.1"/>
|
17
|
-
</svg>
|
docs/assets/social-preview.svg
DELETED
@@ -1,17 +0,0 @@
|
|
1
|
-
<svg xmlns="http://www.w3.org/2000/svg" width="1200" height="630" viewBox="0 0 1200 630">
|
2
|
-
<rect width="1200" height="630" fill="#3f51b5"/>
|
3
|
-
<rect x="50" y="50" width="1100" height="530" rx="15" fill="#fff" opacity="0.1"/>
|
4
|
-
|
5
|
-
<!-- Logo -->
|
6
|
-
<path d="M300,200 H200 Q180,200 180,220 V410 Q180,430 200,430 H300 Q320,430 320,410 V220 Q320,200 300,200 M300,430 H200 V200 H270 V250 H300 V430 M290,270 V350 L270,330 L240,360 L210,330 L240,300 L220,280 L240,260 L290,310 V270 H290Z" fill="#fff"/>
|
7
|
-
|
8
|
-
<!-- Text -->
|
9
|
-
<text x="380" y="290" font-family="'Roboto', sans-serif" font-size="100" font-weight="700" fill="#fff">Natural PDF</text>
|
10
|
-
<text x="380" y="360" font-family="'Roboto', sans-serif" font-size="40" font-weight="300" fill="#fff">A more intuitive way to work with PDFs</text>
|
11
|
-
|
12
|
-
<!-- Decorative elements -->
|
13
|
-
<rect x="380" y="380" width="500" height="4" rx="2" fill="#fff"/>
|
14
|
-
<circle cx="150" cy="150" r="30" fill="#fff" opacity="0.1"/>
|
15
|
-
<circle cx="1050" cy="480" r="50" fill="#fff" opacity="0.1"/>
|
16
|
-
<circle cx="950" cy="150" r="20" fill="#fff" opacity="0.1"/>
|
17
|
-
</svg>
|
@@ -1,65 +0,0 @@
|
|
1
|
-
/* Natural PDF - Minimal Custom Styling */
|
2
|
-
|
3
|
-
.jp-InputPrompt, .jp-OutputPrompt {
|
4
|
-
display: none !important;
|
5
|
-
}
|
6
|
-
|
7
|
-
.jupyter-wrapper .CodeMirror {
|
8
|
-
font-size: 0.85em !important;
|
9
|
-
}
|
10
|
-
|
11
|
-
.highlight-ipynb pre {
|
12
|
-
white-space: pre-wrap !important;
|
13
|
-
word-wrap: break-word !important;
|
14
|
-
}
|
15
|
-
|
16
|
-
.CodeMirror pre {
|
17
|
-
white-space: pre-wrap !important;
|
18
|
-
word-wrap: break-word !important;
|
19
|
-
}
|
20
|
-
|
21
|
-
.jp-CodeMirrorEditor {
|
22
|
-
max-width: 100%;
|
23
|
-
overflow-x: auto;
|
24
|
-
}
|
25
|
-
|
26
|
-
.jupyter-wrapper{
|
27
|
-
--jp-code-font-size: 0.85em !important;
|
28
|
-
}
|
29
|
-
|
30
|
-
/* Typography improvements */
|
31
|
-
.md-typeset h1 {
|
32
|
-
font-weight: 400;
|
33
|
-
}
|
34
|
-
|
35
|
-
.md-typeset h2 {
|
36
|
-
font-weight: 400;
|
37
|
-
margin-top: 1.5em;
|
38
|
-
border-bottom: 1px solid rgba(0,0,0,0.1);
|
39
|
-
padding-bottom: 0.3em;
|
40
|
-
}
|
41
|
-
|
42
|
-
.md-typeset h3 {
|
43
|
-
font-weight: 500;
|
44
|
-
}
|
45
|
-
|
46
|
-
/* Table improvements */
|
47
|
-
.md-typeset table:not([class]) {
|
48
|
-
box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1);
|
49
|
-
border-radius: 4px;
|
50
|
-
}
|
51
|
-
|
52
|
-
/* API Reference improvements */
|
53
|
-
.doc-method {
|
54
|
-
border-left: 3px solid var(--md-primary-fg-color);
|
55
|
-
padding-left: 1em;
|
56
|
-
margin: 1.5em 0;
|
57
|
-
}
|
58
|
-
|
59
|
-
/* Example images */
|
60
|
-
.pdf-screenshot {
|
61
|
-
border: 1px solid #ddd;
|
62
|
-
border-radius: 4px;
|
63
|
-
box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1);
|
64
|
-
margin: 1em 0;
|
65
|
-
}
|