natural-pdf 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/categorizing-documents/index.md +168 -0
- docs/data-extraction/index.md +87 -0
- docs/element-selection/index.ipynb +218 -164
- docs/element-selection/index.md +20 -0
- docs/index.md +19 -0
- docs/ocr/index.md +63 -16
- docs/tutorials/01-loading-and-extraction.ipynb +1713 -34
- docs/tutorials/02-finding-elements.ipynb +123 -46
- docs/tutorials/03-extracting-blocks.ipynb +24 -19
- docs/tutorials/04-table-extraction.ipynb +17 -12
- docs/tutorials/05-excluding-content.ipynb +37 -32
- docs/tutorials/06-document-qa.ipynb +36 -31
- docs/tutorials/07-layout-analysis.ipynb +45 -40
- docs/tutorials/07-working-with-regions.ipynb +61 -60
- docs/tutorials/08-spatial-navigation.ipynb +76 -71
- docs/tutorials/09-section-extraction.ipynb +160 -155
- docs/tutorials/10-form-field-extraction.ipynb +71 -66
- docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
- docs/tutorials/12-ocr-integration.ipynb +3420 -312
- docs/tutorials/12-ocr-integration.md +68 -106
- docs/tutorials/13-semantic-search.ipynb +641 -251
- natural_pdf/__init__.py +2 -0
- natural_pdf/classification/manager.py +343 -0
- natural_pdf/classification/mixin.py +149 -0
- natural_pdf/classification/results.py +62 -0
- natural_pdf/collections/mixins.py +63 -0
- natural_pdf/collections/pdf_collection.py +321 -15
- natural_pdf/core/element_manager.py +67 -0
- natural_pdf/core/page.py +227 -64
- natural_pdf/core/pdf.py +387 -378
- natural_pdf/elements/collections.py +272 -41
- natural_pdf/elements/region.py +99 -15
- natural_pdf/elements/text.py +5 -2
- natural_pdf/exporters/paddleocr.py +1 -1
- natural_pdf/extraction/manager.py +134 -0
- natural_pdf/extraction/mixin.py +246 -0
- natural_pdf/extraction/result.py +37 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_manager.py +85 -25
- natural_pdf/ocr/ocr_options.py +33 -10
- natural_pdf/ocr/utils.py +14 -3
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/selectors/parser.py +363 -238
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +10 -5
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/text_extraction.py +52 -1
- natural_pdf/utils/tqdm_utils.py +43 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +6 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +52 -41
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
@@ -2,7 +2,7 @@
|
|
2
2
|
"cells": [
|
3
3
|
{
|
4
4
|
"cell_type": "markdown",
|
5
|
-
"id": "
|
5
|
+
"id": "d3e16d1e",
|
6
6
|
"metadata": {},
|
7
7
|
"source": [
|
8
8
|
"# Basic Table Extraction\n",
|
@@ -15,13 +15,13 @@
|
|
15
15
|
{
|
16
16
|
"cell_type": "code",
|
17
17
|
"execution_count": 1,
|
18
|
-
"id": "
|
18
|
+
"id": "34e8aa13",
|
19
19
|
"metadata": {
|
20
20
|
"execution": {
|
21
|
-
"iopub.execute_input": "2025-04-
|
22
|
-
"iopub.status.busy": "2025-04-
|
23
|
-
"iopub.status.idle": "2025-04-
|
24
|
-
"shell.execute_reply": "2025-04-
|
21
|
+
"iopub.execute_input": "2025-04-27T16:31:09.562901Z",
|
22
|
+
"iopub.status.busy": "2025-04-27T16:31:09.562589Z",
|
23
|
+
"iopub.status.idle": "2025-04-27T16:31:09.568933Z",
|
24
|
+
"shell.execute_reply": "2025-04-27T16:31:09.568165Z"
|
25
25
|
},
|
26
26
|
"lines_to_next_cell": 2
|
27
27
|
},
|
@@ -33,13 +33,13 @@
|
|
33
33
|
{
|
34
34
|
"cell_type": "code",
|
35
35
|
"execution_count": 2,
|
36
|
-
"id": "
|
36
|
+
"id": "1abe1fe6",
|
37
37
|
"metadata": {
|
38
38
|
"execution": {
|
39
|
-
"iopub.execute_input": "2025-04-
|
40
|
-
"iopub.status.busy": "2025-04-
|
41
|
-
"iopub.status.idle": "2025-04-
|
42
|
-
"shell.execute_reply": "2025-04-
|
39
|
+
"iopub.execute_input": "2025-04-27T16:31:09.571499Z",
|
40
|
+
"iopub.status.busy": "2025-04-27T16:31:09.571210Z",
|
41
|
+
"iopub.status.idle": "2025-04-27T16:31:15.218180Z",
|
42
|
+
"shell.execute_reply": "2025-04-27T16:31:15.217682Z"
|
43
43
|
}
|
44
44
|
},
|
45
45
|
"outputs": [],
|
@@ -70,7 +70,7 @@
|
|
70
70
|
},
|
71
71
|
{
|
72
72
|
"cell_type": "markdown",
|
73
|
-
"id": "
|
73
|
+
"id": "e02178e5",
|
74
74
|
"metadata": {},
|
75
75
|
"source": [
|
76
76
|
"This code uses `page.extract_tables()` which attempts to automatically detect tables based on visual cues like lines and whitespace. The result is a list of lists, representing the rows and cells of the table.\n",
|
@@ -96,6 +96,11 @@
|
|
96
96
|
"main_language": "python",
|
97
97
|
"notebook_metadata_filter": "-all"
|
98
98
|
},
|
99
|
+
"kernelspec": {
|
100
|
+
"display_name": "Python (natural-pdf)",
|
101
|
+
"language": "python",
|
102
|
+
"name": "natural-pdf"
|
103
|
+
},
|
99
104
|
"language_info": {
|
100
105
|
"codemirror_mode": {
|
101
106
|
"name": "ipython",
|