natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. natural_pdf/__init__.py +3 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +422 -0
  13. natural_pdf/classification/mixin.py +163 -0
  14. natural_pdf/classification/results.py +80 -0
  15. natural_pdf/collections/mixins.py +111 -0
  16. natural_pdf/collections/pdf_collection.py +434 -15
  17. natural_pdf/core/element_manager.py +83 -0
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +578 -93
  20. natural_pdf/core/pdf.py +912 -460
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +712 -109
  23. natural_pdf/elements/region.py +722 -69
  24. natural_pdf/elements/text.py +4 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +5 -4
  28. natural_pdf/extraction/manager.py +135 -0
  29. natural_pdf/extraction/mixin.py +279 -0
  30. natural_pdf/extraction/result.py +23 -0
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/engine_easyocr.py +6 -3
  34. natural_pdf/ocr/ocr_factory.py +24 -4
  35. natural_pdf/ocr/ocr_manager.py +122 -26
  36. natural_pdf/ocr/ocr_options.py +94 -11
  37. natural_pdf/ocr/utils.py +19 -6
  38. natural_pdf/qa/document_qa.py +0 -4
  39. natural_pdf/search/__init__.py +20 -34
  40. natural_pdf/search/haystack_search_service.py +309 -265
  41. natural_pdf/search/haystack_utils.py +99 -75
  42. natural_pdf/search/search_service_protocol.py +11 -12
  43. natural_pdf/selectors/parser.py +431 -230
  44. natural_pdf/utils/debug.py +3 -3
  45. natural_pdf/utils/identifiers.py +1 -1
  46. natural_pdf/utils/locks.py +8 -0
  47. natural_pdf/utils/packaging.py +8 -6
  48. natural_pdf/utils/text_extraction.py +60 -1
  49. natural_pdf/utils/tqdm_utils.py +51 -0
  50. natural_pdf/utils/visualization.py +18 -0
  51. natural_pdf/widgets/viewer.py +4 -25
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
  53. natural_pdf-0.1.9.dist-info/RECORD +80 -0
  54. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
  55. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
  56. docs/api/index.md +0 -386
  57. docs/assets/favicon.png +0 -3
  58. docs/assets/favicon.svg +0 -3
  59. docs/assets/javascripts/custom.js +0 -17
  60. docs/assets/logo.svg +0 -3
  61. docs/assets/sample-screen.png +0 -0
  62. docs/assets/social-preview.png +0 -17
  63. docs/assets/social-preview.svg +0 -17
  64. docs/assets/stylesheets/custom.css +0 -65
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -915
  68. docs/element-selection/index.md +0 -229
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -170
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -209
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -194
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -340
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -147
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -114
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -270
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -332
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -288
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -413
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -508
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2434
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -512
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -604
  112. docs/tutorials/12-ocr-integration.md +0 -175
  113. docs/tutorials/13-semantic-search.ipynb +0 -1328
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.7.dist-info/RECORD +0 -145
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
docs/api/index.md DELETED
@@ -1,386 +0,0 @@
1
- # API Reference
2
-
3
- This section provides detailed documentation for all the classes and methods in Natural PDF.
4
-
5
- ## Core Classes
6
-
7
- ### PDF Class
8
-
9
- The main entry point for working with PDFs.
10
-
11
- ```python
12
- class PDF:
13
- """
14
- The main entry point for working with PDFs.
15
-
16
- Parameters:
17
- path (str): Path to the PDF file.
18
- password (str, optional): Password for encrypted PDFs. Default: None
19
- reading_order (bool, optional): Sort elements in reading order. Default: True
20
- keep_spaces (bool, optional): Keep spaces in word elements. Default: True
21
- font_attrs (list, optional): Font attributes to use for text grouping.
22
- Default: ['fontname', 'size']
23
- ocr (bool/dict/str, optional): OCR configuration. Default: False
24
- ocr_engine (str/Engine, optional): OCR engine to use. Default: "easyocr"
25
- """
26
- ```
27
-
28
- **Main Methods**
29
-
30
- | Method | Description | Parameters | Returns |
31
- |--------|-------------|------------|---------|
32
- | `pages` | Access pages in the document | N/A (property) | `PageCollection` |
33
- | `extract_text(keep_blank_chars=True, apply_exclusions=True)` | Extract text from all pages | `keep_blank_chars`: Whether to keep blank characters<br>`apply_exclusions`: Whether to apply exclusion zones | `str`: Extracted text |
34
- | `find(selector, case=True, regex=False, apply_exclusions=True)` | Find first element matching selector across all pages | `selector`: CSS-like selector string<br>`case`: Case-sensitive search<br>`regex`: Use regex for :contains()<br>`apply_exclusions`: Whether to apply exclusion zones | `Element` or `None` |
35
- | `find_all(selector, case=True, regex=False, apply_exclusions=True)` | Find all elements matching selector across all pages | `selector`: CSS-like selector string<br>`case`: Case-sensitive search<br>`regex`: Use regex for :contains()<br>`apply_exclusions`: Whether to apply exclusion zones | `ElementCollection` |
36
- | `add_exclusion(func, label=None)` | Add a document-wide exclusion zone | `func`: Function taking a page and returning region<br>`label`: Optional label for the exclusion | `None` |
37
- | `get_sections(start_elements, end_elements=None, boundary_inclusion='start')` | Get sections across all pages | `start_elements`: Elements marking section starts<br>`end_elements`: Elements marking section ends<br>`boundary_inclusion`: How to include boundaries ('start', 'end', 'both', 'none') | `list[Region]` |
38
- | `ask(question, min_confidence=0.0, model=None)` | Ask a question about the document content | `question`: Question to ask<br>`min_confidence`: Minimum confidence threshold<br>`model`: Optional model name or path | `dict`: Result with answer and metadata |
39
-
40
- ### Page Class
41
-
42
- Represents a single page in a PDF document.
43
-
44
- ```python
45
- class Page:
46
- """
47
- Represents a single page in a PDF document.
48
-
49
- Properties:
50
- page_number (int): 1-indexed page number
51
- page_index (int): 0-indexed page position
52
- width (float): Page width in points
53
- height (float): Page height in points
54
- pdf (PDF): Parent PDF object
55
- """
56
- ```
57
-
58
- **Main Methods**
59
-
60
- | Method | Description | Parameters | Returns |
61
- |--------|-------------|------------|---------|
62
- | `extract_text(keep_blank_chars=True, apply_exclusions=True, ocr=None)` | Extract text from the page | `keep_blank_chars`: Whether to keep blank characters<br>`apply_exclusions`: Whether to apply exclusion zones<br>`ocr`: Whether to force OCR | `str`: Extracted text |
63
- | `find(selector, case=True, regex=False, apply_exclusions=True)` | Find the first element matching selector | `selector`: CSS-like selector string<br>`case`: Case-sensitive search<br>`regex`: Use regex for :contains()<br>`apply_exclusions`: Whether to apply exclusion zones | `Element` or `None` |
64
- | `find_all(selector, case=True, regex=False, apply_exclusions=True)` | Find all elements matching selector | `selector`: CSS-like selector string<br>`case`: Case-sensitive search<br>`regex`: Use regex for :contains()<br>`apply_exclusions`: Whether to apply exclusion zones | `ElementCollection` |
65
- | `create_region(x0, top, x1, bottom)` | Create a region at specific coordinates | `x0`: Left coordinate<br>`top`: Top coordinate<br>`x1`: Right coordinate<br>`bottom`: Bottom coordinate | `Region` |
66
- | `highlight(elements, color=None, label=None)` | Highlight elements on the page | `elements`: Elements to highlight<br>`color`: RGBA color tuple<br>`label`: Label for the highlight | `Page` (self) |
67
- | `highlight_all(include_types=None, include_text_styles=False, include_layout_regions=False)` | Highlight all elements on the page | `include_types`: Element types to include<br>`include_text_styles`: Whether to include text styles<br>`include_layout_regions`: Whether to include layout regions | `Page` (self) |
68
- | `save_image(path, resolution=72, labels=True)` | Save an image of the page with highlights | `path`: Path to save image<br>`resolution`: Image resolution in DPI<br>`labels`: Whether to include labels | `None` |
69
- | `to_image(resolution=72, labels=True)` | Get a PIL Image of the page with highlights | `resolution`: Image resolution in DPI<br>`labels`: Whether to include labels | `PIL.Image` |
70
- | `analyze_text_styles()` | Group text by visual style properties | None | `dict`: Mapping of style name to elements |
71
- | `analyze_layout(engine="yolo", confidence=0.2, existing="replace")` | Detect layout regions using ML models | `engine`: Layout engine to use ("yolo", "tatr")<br>`confidence`: Confidence threshold<br>`existing`: How to handle existing regions | `ElementCollection`: Detected regions |
72
- | `add_exclusion(region, label=None)` | Add an exclusion zone to the page | `region`: Region to exclude<br>`label`: Optional label for the exclusion | `Region`: The exclusion region |
73
- | `get_sections(start_elements, end_elements=None, boundary_inclusion='start')` | Get sections from the page | `start_elements`: Elements marking section starts<br>`end_elements`: Elements marking section ends<br>`boundary_inclusion`: How to include boundaries | `list[Region]` |
74
- | `ask(question, min_confidence=0.0, model=None, debug=False)` | Ask a question about the page content | `question`: Question to ask<br>`min_confidence`: Minimum confidence threshold<br>`model`: Optional model name or path<br>`debug`: Whether to save debug files | `dict`: Result with answer and metadata |
75
- | `apply_ocr(languages=None, min_confidence=0.0, **kwargs)` | Apply OCR to the page | `languages`: Languages to use<br>`min_confidence`: Minimum confidence threshold<br>`**kwargs`: Additional OCR engine parameters | `ElementCollection`: OCR text elements |
76
-
77
- ### Region Class
78
-
79
- Represents a rectangular area on a page.
80
-
81
- ```python
82
- class Region:
83
- """
84
- Represents a rectangular area on a page.
85
-
86
- Properties:
87
- x0 (float): Left coordinate
88
- top (float): Top coordinate
89
- x1 (float): Right coordinate
90
- bottom (float): Bottom coordinate
91
- width (float): Width of the region
92
- height (float): Height of the region
93
- page (Page): Parent page object
94
- """
95
- ```
96
-
97
- **Main Methods**
98
-
99
- | Method | Description | Parameters | Returns |
100
- |--------|-------------|------------|---------|
101
- | `extract_text(keep_blank_chars=True, apply_exclusions=True, ocr=None)` | Extract text from the region | `keep_blank_chars`: Whether to keep blank characters<br>`apply_exclusions`: Whether to apply exclusion zones<br>`ocr`: Whether to force OCR | `str`: Extracted text |
102
- | `find(selector, case=True, regex=False, apply_exclusions=True)` | Find the first element matching selector within the region | `selector`: CSS-like selector string<br>`case`: Case-sensitive search<br>`regex`: Use regex for :contains()<br>`apply_exclusions`: Whether to apply exclusion zones | `Element` or `None` |
103
- | `find_all(selector, case=True, regex=False, apply_exclusions=True)` | Find all elements matching selector within the region | `selector`: CSS-like selector string<br>`case`: Case-sensitive search<br>`regex`: Use regex for :contains()<br>`apply_exclusions`: Whether to apply exclusion zones | `ElementCollection` |
104
- | `expand(left=0, top=0, right=0, bottom=0, width_factor=1.0, height_factor=1.0)` | Expand the region in specified directions | `left/top/right/bottom`: Points to expand in each direction<br>`width_factor/height_factor`: Scale width/height by this factor | `Region`: Expanded region |
105
- | `highlight(color=None, label=None, include_attrs=None)` | Highlight the region | `color`: RGBA color tuple<br>`label`: Label for the highlight<br>`include_attrs`: Region attributes to display | `Region` (self) |
106
- | `to_image(resolution=72, crop_only=False)` | Get a PIL Image of just the region | `resolution`: Image resolution in DPI<br>`crop_only`: Whether to exclude border | `PIL.Image` |
107
- | `save_image(path, resolution=72, crop_only=False)` | Save an image of just the region | `path`: Path to save image<br>`resolution`: Image resolution in DPI<br>`crop_only`: Whether to exclude border | `None` |
108
- | `get_sections(start_elements, end_elements=None, boundary_inclusion='start')` | Get sections within the region | `start_elements`: Elements marking section starts<br>`end_elements`: Elements marking section ends<br>`boundary_inclusion`: How to include boundaries | `list[Region]` |
109
- | `ask(question, min_confidence=0.0, model=None, debug=False)` | Ask a question about the region content | `question`: Question to ask<br>`min_confidence`: Minimum confidence threshold<br>`model`: Optional model name or path<br>`debug`: Whether to save debug files | `dict`: Result with answer and metadata |
110
- | `extract_table(method=None, table_settings=None, use_ocr=False)` | Extract table data from the region | `method`: Extraction method ("plumber", "tatr")<br>`table_settings`: Custom settings for extraction<br>`use_ocr`: Whether to use OCR text | `list`: Table data as rows and columns |
111
- | `intersects(other)` | Check if this region intersects with another | `other`: Another region | `bool`: True if regions intersect |
112
- | `contains(x, y)` | Check if a point is within the region | `x`: X coordinate<br>`y`: Y coordinate | `bool`: True if point is in region |
113
-
114
- ## Element Types
115
-
116
- ### Element Base Class
117
-
118
- The base class for all PDF elements.
119
-
120
- ```python
121
- class Element:
122
- """
123
- Base class for all PDF elements.
124
-
125
- Properties:
126
- x0 (float): Left coordinate
127
- top (float): Top coordinate
128
- x1 (float): Right coordinate
129
- bottom (float): Bottom coordinate
130
- width (float): Width of the element
131
- height (float): Height of the element
132
- page (Page): Parent page object
133
- """
134
- ```
135
-
136
- **Main Methods**
137
-
138
- | Method | Description | Parameters | Returns |
139
- |--------|-------------|------------|---------|
140
- | `above(height=None, full_width=True, until=None, include_until=True)` | Create a region above the element | `height`: Height of region<br>`full_width`: Whether to span page width<br>`until`: Selector for boundary<br>`include_until`: Whether to include boundary | `Region` |
141
- | `below(height=None, full_width=True, until=None, include_until=True)` | Create a region below the element | `height`: Height of region<br>`full_width`: Whether to span page width<br>`until`: Selector for boundary<br>`include_until`: Whether to include boundary | `Region` |
142
- | `select_until(selector, include_endpoint=True, full_width=True)` | Create a region from this element to another | `selector`: Selector for endpoint<br>`include_endpoint`: Whether to include endpoint<br>`full_width`: Whether to span page width | `Region` |
143
- | `highlight(color=None, label=None, include_attrs=None)` | Highlight this element | `color`: RGBA color tuple<br>`label`: Label for the highlight<br>`include_attrs`: Element attributes to display | `Element` (self) |
144
- | `extract_text(keep_blank_chars=True, apply_exclusions=True)` | Extract text from this element | `keep_blank_chars`: Whether to keep blank characters<br>`apply_exclusions`: Whether to apply exclusion zones | `str`: Extracted text |
145
- | `next(selector=None, limit=None, apply_exclusions=True)` | Get the next element in reading order | `selector`: Optional selector to filter<br>`limit`: How many elements to search<br>`apply_exclusions`: Whether to apply exclusion zones | `Element` or `None` |
146
- | `prev(selector=None, limit=None, apply_exclusions=True)` | Get the previous element in reading order | `selector`: Optional selector to filter<br>`limit`: How many elements to search<br>`apply_exclusions`: Whether to apply exclusion zones | `Element` or `None` |
147
- | `nearest(selector, max_distance=None, apply_exclusions=True)` | Get the nearest element matching selector | `selector`: Selector for elements<br>`max_distance`: Maximum distance in points<br>`apply_exclusions`: Whether to apply exclusion zones | `Element` or `None` |
148
-
149
- ### TextElement
150
-
151
- Represents text elements in the PDF.
152
-
153
- ```python
154
- class TextElement(Element):
155
- """
156
- Represents text elements in the PDF.
157
-
158
- Additional Properties:
159
- text (str): The text content
160
- fontname (str): The font name
161
- size (float): The font size
162
- bold (bool): Whether the text is bold
163
- italic (bool): Whether the text is italic
164
- color (tuple): The text color as RGB tuple
165
- confidence (float): OCR confidence (for OCR text)
166
- source (str): 'pdf' or 'ocr'
167
- """
168
- ```
169
-
170
- **Main Properties**
171
-
172
- | Property | Type | Description |
173
- |----------|------|-------------|
174
- | `text` | `str` | The text content |
175
- | `fontname` | `str` | The font name |
176
- | `size` | `float` | The font size |
177
- | `bold` | `bool` | Whether the text is bold |
178
- | `italic` | `bool` | Whether the text is italic |
179
- | `color` | `tuple` | The text color as RGB tuple |
180
- | `confidence` | `float` | OCR confidence (for OCR text) |
181
- | `source` | `str` | 'pdf' or 'ocr' |
182
- | `font_variant` | `str` | Font variant identifier (e.g., 'AAAAAB+') |
183
-
184
- **Additional Methods**
185
-
186
- | Method | Description | Parameters | Returns |
187
- |--------|-------------|------------|---------|
188
- | `font_info()` | Get detailed font information | None | `dict`: Font properties |
189
-
190
- ## Collections
191
-
192
- ### ElementCollection
193
-
194
- A collection of elements with batch operations.
195
-
196
- ```python
197
- class ElementCollection:
198
- """
199
- A collection of elements with batch operations.
200
-
201
- This class provides operations that can be applied to multiple elements at once.
202
- """
203
- ```
204
-
205
- **Main Methods**
206
-
207
- | Method | Description | Parameters | Returns |
208
- |--------|-------------|------------|---------|
209
- | `extract_text(keep_blank_chars=True, apply_exclusions=True)` | Extract text from all elements | `keep_blank_chars`: Whether to keep blank characters<br>`apply_exclusions`: Whether to apply exclusion zones | `str`: Extracted text |
210
- | `filter(selector)` | Filter elements by selector | `selector`: CSS-like selector string | `ElementCollection` |
211
- | `highlight(color=None, label=None, include_attrs=None)` | Highlight all elements | `color`: RGBA color tuple<br>`label`: Label for the highlight<br>`include_attrs`: Attributes to display | `ElementCollection` (self) |
212
- | `first` | Get the first element in the collection | N/A (property) | `Element` or `None` |
213
- | `last` | Get the last element in the collection | N/A (property) | `Element` or `None` |
214
- | `highest()` | Get the highest element on the page | None | `Element` or `None` |
215
- | `lowest()` | Get the lowest element on the page | None | `Element` or `None` |
216
- | `leftmost()` | Get the leftmost element on the page | None | `Element` or `None` |
217
- | `rightmost()` | Get the rightmost element on the page | None | `Element` or `None` |
218
- | `__len__()` | Get the number of elements | None | `int` |
219
- | `__getitem__(index)` | Get an element by index | `index`: Index or slice | `Element` or `ElementCollection` |
220
-
221
- ### PageCollection
222
-
223
- A collection of pages with cross-page operations.
224
-
225
- ```python
226
- class PageCollection:
227
- """
228
- A collection of pages with cross-page operations.
229
-
230
- This class provides operations that can be applied across multiple pages.
231
- """
232
- ```
233
-
234
- **Main Methods**
235
-
236
- | Method | Description | Parameters | Returns |
237
- |--------|-------------|------------|---------|
238
- | `extract_text(keep_blank_chars=True, apply_exclusions=True)` | Extract text from all pages | `keep_blank_chars`: Whether to keep blank characters<br>`apply_exclusions`: Whether to apply exclusion zones | `str`: Extracted text |
239
- | `find(selector, case=True, regex=False, apply_exclusions=True)` | Find the first element matching selector across all pages | `selector`: CSS-like selector string<br>`case`: Case-sensitive search<br>`regex`: Use regex for :contains()<br>`apply_exclusions`: Whether to apply exclusion zones | `Element` or `None` |
240
- | `find_all(selector, case=True, regex=False, apply_exclusions=True)` | Find all elements matching selector across all pages | `selector`: CSS-like selector string<br>`case`: Case-sensitive search<br>`regex`: Use regex for :contains()<br>`apply_exclusions`: Whether to apply exclusion zones | `ElementCollection` |
241
- | `get_sections(start_elements, end_elements=None, boundary_inclusion='start', new_section_on_page_break=False)` | Get sections spanning multiple pages | `start_elements`: Elements marking section starts<br>`end_elements`: Elements marking section ends<br>`boundary_inclusion`: How to include boundaries<br>`new_section_on_page_break`: Whether to start new sections at page breaks | `list[Region]` |
242
- | `__len__()` | Get the number of pages | None | `int` |
243
- | `__getitem__(index)` | Get a page by index | `index`: Index or slice | `Page` or `PageCollection` |
244
-
245
- ## OCR Classes
246
-
247
- ### OCREngine
248
-
249
- Base class for OCR engines.
250
-
251
- ```python
252
- class OCREngine:
253
- """
254
- Base class for OCR engines.
255
-
256
- This class provides the interface for OCR engines.
257
- """
258
- ```
259
-
260
- **Main Methods**
261
-
262
- | Method | Description | Parameters | Returns |
263
- |--------|-------------|------------|---------|
264
- | `process_image(image, languages=None, min_confidence=0.0, **kwargs)` | Process an image with OCR | `image`: PIL Image<br>`languages`: Languages to use<br>`min_confidence`: Minimum confidence threshold | `list`: OCR results |
265
-
266
- ### EasyOCREngine
267
-
268
- OCR engine using EasyOCR.
269
-
270
- ```python
271
- class EasyOCREngine(OCREngine):
272
- """
273
- OCR engine using EasyOCR.
274
-
275
- Parameters:
276
- model_dir (str, optional): Directory for models. Default: None
277
- """
278
- ```
279
-
280
- ### PaddleOCREngine
281
-
282
- OCR engine using PaddleOCR.
283
-
284
- ```python
285
- class PaddleOCREngine(OCREngine):
286
- """
287
- OCR engine using PaddleOCR.
288
-
289
- Parameters:
290
- use_angle_cls (bool, optional): Use text direction classification. Default: False
291
- lang (str, optional): Language code. Default: "en"
292
- det (bool, optional): Use text detection. Default: True
293
- rec (bool, optional): Use text recognition. Default: True
294
- cls (bool, optional): Use text direction classification. Default: False
295
- det_model_dir (str, optional): Detection model directory. Default: None
296
- rec_model_dir (str, optional): Recognition model directory. Default: None
297
- verbose (bool, optional): Enable verbose output. Default: False
298
- """
299
- ```
300
-
301
- ## Document QA Classes
302
-
303
- ### DocumentQA
304
-
305
- Class for document question answering.
306
-
307
- ```python
308
- class DocumentQA:
309
- """
310
- Class for document question answering.
311
-
312
- Parameters:
313
- model (str, optional): Model name or path. Default: "microsoft/layoutlmv3-base"
314
- device (str, optional): Device to use. Default: "cpu"
315
- verbose (bool, optional): Enable verbose output. Default: False
316
- """
317
- ```
318
-
319
- **Main Methods**
320
-
321
- | Method | Description | Parameters | Returns |
322
- |--------|-------------|------------|---------|
323
- | `ask(question, image, word_boxes, min_confidence=0.0, max_answer_length=None, language=None)` | Ask a question about a document | `question`: Question to ask<br>`image`: Document image<br>`word_boxes`: Text positions<br>`min_confidence`: Minimum confidence threshold<br>`max_answer_length`: Maximum answer length<br>`language`: Language code | `dict`: Result with answer and metadata |
324
-
325
- ## Selector Syntax
326
-
327
- Natural PDF uses a CSS-like selector syntax to find elements in PDFs.
328
-
329
- ### Basic Selectors
330
-
331
- | Selector | Description | Example |
332
- |----------|-------------|---------|
333
- | `element_type` | Match elements of this type | `text`, `rect`, `line` |
334
- | `[attribute=value]` | Match elements with this attribute value | `[fontname=Arial]`, `[size=12]` |
335
- | `[attribute>=value]` | Match elements with attribute >= value | `[size>=12]` |
336
- | `[attribute<=value]` | Match elements with attribute <= value | `[size<=10]` |
337
- | `[attribute~=value]` | Match elements with attribute approximately equal | `[color~=red]`, `[color~=(1,0,0)]` |
338
- | `[attribute*=value]` | Match elements with attribute containing value | `[fontname*=Arial]` |
339
-
340
- ### Pseudo-Classes
341
-
342
- | Pseudo-Class | Description | Example |
343
- |--------------|-------------|---------|
344
- | `:contains("text")` | Match elements containing text | `text:contains("Summary")` |
345
- | `:starts-with("text")` | Match elements starting with text | `text:starts-with("Summary")` |
346
- | `:ends-with("text")` | Match elements ending with text | `text:ends-with("2023")` |
347
- | `:bold` | Match bold text | `text:bold` |
348
- | `:italic` | Match italic text | `text:italic` |
349
-
350
- ### Attribute Names
351
-
352
- | Attribute | Element Types | Description |
353
- |-----------|--------------|-------------|
354
- | `fontname` | text | Font name |
355
- | `size` | text | Font size |
356
- | `color` | text, rect, line | Color |
357
- | `width` | rect, line | Width |
358
- | `height` | rect | Height |
359
- | `confidence` | text (OCR) | OCR confidence score |
360
- | `source` | text | Source ('pdf' or 'ocr') |
361
- | `type` | region | Region type (e.g., 'table', 'title') |
362
- | `model` | region | Layout model that detected the region |
363
- | `font-variant` | text | Font variant identifier |
364
-
365
- ## Constants and Configuration
366
-
367
- ### Color Names
368
-
369
- Natural PDF supports color names in selectors.
370
-
371
- | Color Name | RGB Value | Example |
372
- |------------|-----------|---------|
373
- | `red` | (1, 0, 0) | `[color~=red]` |
374
- | `green` | (0, 1, 0) | `[color~=green]` |
375
- | `blue` | (0, 0, 1) | `[color~=blue]` |
376
- | `black` | (0, 0, 0) | `[color~=black]` |
377
- | `white` | (1, 1, 1) | `[color~=white]` |
378
-
379
- ### Region Types
380
-
381
- Layout analysis models detect the following region types:
382
-
383
- | Model | Region Types |
384
- |-------|-------------|
385
- | YOLO | `title`, `plain-text`, `table`, `figure`, `figure_caption`, `table_caption`, `table_footnote`, `isolate_formula`, `formula_caption`, `abandon` |
386
- | TATR | `table`, `table-row`, `table-column`, `table-column-header` |
docs/assets/favicon.png DELETED
@@ -1,3 +0,0 @@
1
- <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="#4051b5" width="32" height="32">
2
- <path d="M14,2H6A2,2 0 0,0 4,4V20A2,2 0 0,0 6,22H18A2,2 0 0,0 20,20V8L14,2M18,20H6V4H13V9H18V20M16,11V18.1L13.9,16L11.1,18.8L8.3,16L11.1,13.2L8.9,11L11.1,8.8L16,13.7V11H16Z" />
3
- </svg>
docs/assets/favicon.svg DELETED
@@ -1,3 +0,0 @@
1
- <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="#4051b5" width="32" height="32">
2
- <path d="M14,2H6A2,2 0 0,0 4,4V20A2,2 0 0,0 6,22H18A2,2 0 0,0 20,20V8L14,2M18,20H6V4H13V9H18V20M16,11V18.1L13.9,16L11.1,18.8L8.3,16L11.1,13.2L8.9,11L11.1,8.8L16,13.7V11H16Z" />
3
- </svg>
docs/assets/javascripts/custom.js DELETED
@@ -1,17 +0,0 @@
1
- // Natural PDF custom script
2
-
3
- // Add homepage class for styling
4
- document.addEventListener('DOMContentLoaded', function() {
5
- const path = window.location.pathname;
6
- if (path === '/' ||
7
- path === '/index.html' ||
8
- path.endsWith('/') && !path.endsWith('/index.html') && path.split('/').filter(Boolean).length <= 1) {
9
- document.body.classList.add('homepage');
10
- }
11
-
12
- // Add animation classes to feature cards
13
- document.querySelectorAll('.feature-card').forEach((card, index) => {
14
- card.style.animationDelay = `${index * 0.1}s`;
15
- card.classList.add('animate-in');
16
- });
17
- });
docs/assets/logo.svg DELETED
@@ -1,3 +0,0 @@
1
- <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="#4051b5" width="240" height="240">
2
- <path d="M14,2H6A2,2 0 0,0 4,4V20A2,2 0 0,0 6,22H18A2,2 0 0,0 20,20V8L14,2M18,20H6V4H13V9H18V20M16,11V18.1L13.9,16L11.1,18.8L8.3,16L11.1,13.2L8.9,11L11.1,8.8L16,13.7V11H16Z" />
3
- </svg>
Binary file
@@ -1,17 +0,0 @@
1
- <svg xmlns="http://www.w3.org/2000/svg" width="1200" height="630" viewBox="0 0 1200 630">
2
- <rect width="1200" height="630" fill="#3f51b5"/>
3
- <rect x="50" y="50" width="1100" height="530" rx="15" fill="#fff" opacity="0.1"/>
4
-
5
- <!-- Logo -->
6
- <path d="M300,200 H200 Q180,200 180,220 V410 Q180,430 200,430 H300 Q320,430 320,410 V220 Q320,200 300,200 M300,430 H200 V200 H270 V250 H300 V430 M290,270 V350 L270,330 L240,360 L210,330 L240,300 L220,280 L240,260 L290,310 V270 H290Z" fill="#fff"/>
7
-
8
- <!-- Text -->
9
- <text x="380" y="290" font-family="'Roboto', sans-serif" font-size="100" font-weight="700" fill="#fff">Natural PDF</text>
10
- <text x="380" y="360" font-family="'Roboto', sans-serif" font-size="40" font-weight="300" fill="#fff">A more intuitive way to work with PDFs</text>
11
-
12
- <!-- Decorative elements -->
13
- <rect x="380" y="380" width="500" height="4" rx="2" fill="#fff"/>
14
- <circle cx="150" cy="150" r="30" fill="#fff" opacity="0.1"/>
15
- <circle cx="1050" cy="480" r="50" fill="#fff" opacity="0.1"/>
16
- <circle cx="950" cy="150" r="20" fill="#fff" opacity="0.1"/>
17
- </svg>
@@ -1,17 +0,0 @@
1
- <svg xmlns="http://www.w3.org/2000/svg" width="1200" height="630" viewBox="0 0 1200 630">
2
- <rect width="1200" height="630" fill="#3f51b5"/>
3
- <rect x="50" y="50" width="1100" height="530" rx="15" fill="#fff" opacity="0.1"/>
4
-
5
- <!-- Logo -->
6
- <path d="M300,200 H200 Q180,200 180,220 V410 Q180,430 200,430 H300 Q320,430 320,410 V220 Q320,200 300,200 M300,430 H200 V200 H270 V250 H300 V430 M290,270 V350 L270,330 L240,360 L210,330 L240,300 L220,280 L240,260 L290,310 V270 H290Z" fill="#fff"/>
7
-
8
- <!-- Text -->
9
- <text x="380" y="290" font-family="'Roboto', sans-serif" font-size="100" font-weight="700" fill="#fff">Natural PDF</text>
10
- <text x="380" y="360" font-family="'Roboto', sans-serif" font-size="40" font-weight="300" fill="#fff">A more intuitive way to work with PDFs</text>
11
-
12
- <!-- Decorative elements -->
13
- <rect x="380" y="380" width="500" height="4" rx="2" fill="#fff"/>
14
- <circle cx="150" cy="150" r="30" fill="#fff" opacity="0.1"/>
15
- <circle cx="1050" cy="480" r="50" fill="#fff" opacity="0.1"/>
16
- <circle cx="950" cy="150" r="20" fill="#fff" opacity="0.1"/>
17
- </svg>
@@ -1,65 +0,0 @@
1
- /* Natural PDF - Minimal Custom Styling */
2
-
3
- .jp-InputPrompt, .jp-OutputPrompt {
4
- display: none !important;
5
- }
6
-
7
- .jupyter-wrapper .CodeMirror {
8
- font-size: 0.85em !important;
9
- }
10
-
11
- .highlight-ipynb pre {
12
- white-space: pre-wrap !important;
13
- word-wrap: break-word !important;
14
- }
15
-
16
- .CodeMirror pre {
17
- white-space: pre-wrap !important;
18
- word-wrap: break-word !important;
19
- }
20
-
21
- .jp-CodeMirrorEditor {
22
- max-width: 100%;
23
- overflow-x: auto;
24
- }
25
-
26
- .jupyter-wrapper{
27
- --jp-code-font-size: 0.85em !important;
28
- }
29
-
30
- /* Typography improvements */
31
- .md-typeset h1 {
32
- font-weight: 400;
33
- }
34
-
35
- .md-typeset h2 {
36
- font-weight: 400;
37
- margin-top: 1.5em;
38
- border-bottom: 1px solid rgba(0,0,0,0.1);
39
- padding-bottom: 0.3em;
40
- }
41
-
42
- .md-typeset h3 {
43
- font-weight: 500;
44
- }
45
-
46
- /* Table improvements */
47
- .md-typeset table:not([class]) {
48
- box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1);
49
- border-radius: 4px;
50
- }
51
-
52
- /* API Reference improvements */
53
- .doc-method {
54
- border-left: 3px solid var(--md-primary-fg-color);
55
- padding-left: 1em;
56
- margin: 1.5em 0;
57
- }
58
-
59
- /* Example images */
60
- .pdf-screenshot {
61
- border: 1px solid #ddd;
62
- border-radius: 4px;
63
- box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1);
64
- margin: 1em 0;
65
- }