natural-pdf 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/categorizing-documents/index.md +168 -0
- docs/data-extraction/index.md +87 -0
- docs/element-selection/index.ipynb +218 -164
- docs/element-selection/index.md +20 -0
- docs/finetuning/index.md +176 -0
- docs/index.md +19 -0
- docs/ocr/index.md +63 -16
- docs/tutorials/01-loading-and-extraction.ipynb +411 -248
- docs/tutorials/02-finding-elements.ipynb +123 -46
- docs/tutorials/03-extracting-blocks.ipynb +24 -19
- docs/tutorials/04-table-extraction.ipynb +17 -12
- docs/tutorials/05-excluding-content.ipynb +37 -32
- docs/tutorials/06-document-qa.ipynb +36 -31
- docs/tutorials/07-layout-analysis.ipynb +45 -40
- docs/tutorials/07-working-with-regions.ipynb +61 -60
- docs/tutorials/08-spatial-navigation.ipynb +76 -71
- docs/tutorials/09-section-extraction.ipynb +160 -155
- docs/tutorials/10-form-field-extraction.ipynb +71 -66
- docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
- docs/tutorials/12-ocr-integration.ipynb +3420 -312
- docs/tutorials/12-ocr-integration.md +68 -106
- docs/tutorials/13-semantic-search.ipynb +641 -251
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/gemini.py +63 -47
- natural_pdf/classification/manager.py +343 -0
- natural_pdf/classification/mixin.py +149 -0
- natural_pdf/classification/results.py +62 -0
- natural_pdf/collections/mixins.py +63 -0
- natural_pdf/collections/pdf_collection.py +326 -17
- natural_pdf/core/element_manager.py +73 -4
- natural_pdf/core/page.py +255 -83
- natural_pdf/core/pdf.py +385 -367
- natural_pdf/elements/base.py +1 -3
- natural_pdf/elements/collections.py +279 -49
- natural_pdf/elements/region.py +106 -21
- natural_pdf/elements/text.py +5 -2
- natural_pdf/exporters/__init__.py +4 -0
- natural_pdf/exporters/base.py +61 -0
- natural_pdf/exporters/paddleocr.py +345 -0
- natural_pdf/extraction/manager.py +134 -0
- natural_pdf/extraction/mixin.py +246 -0
- natural_pdf/extraction/result.py +37 -0
- natural_pdf/ocr/__init__.py +16 -8
- natural_pdf/ocr/engine.py +46 -30
- natural_pdf/ocr/engine_easyocr.py +86 -42
- natural_pdf/ocr/engine_paddle.py +39 -28
- natural_pdf/ocr/engine_surya.py +32 -16
- natural_pdf/ocr/ocr_factory.py +34 -23
- natural_pdf/ocr/ocr_manager.py +98 -34
- natural_pdf/ocr/ocr_options.py +38 -10
- natural_pdf/ocr/utils.py +59 -33
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/selectors/parser.py +363 -238
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +420 -0
- natural_pdf/utils/debug.py +4 -2
- natural_pdf/utils/identifiers.py +9 -5
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +172 -105
- natural_pdf/utils/text_extraction.py +96 -65
- natural_pdf/utils/tqdm_utils.py +43 -0
- natural_pdf/utils/visualization.py +1 -1
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +10 -3
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +66 -51
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
@@ -2,7 +2,7 @@
|
|
2
2
|
"cells": [
|
3
3
|
{
|
4
4
|
"cell_type": "markdown",
|
5
|
-
"id": "
|
5
|
+
"id": "d3e16d1e",
|
6
6
|
"metadata": {},
|
7
7
|
"source": [
|
8
8
|
"# Basic Table Extraction\n",
|
@@ -15,13 +15,13 @@
|
|
15
15
|
{
|
16
16
|
"cell_type": "code",
|
17
17
|
"execution_count": 1,
|
18
|
-
"id": "
|
18
|
+
"id": "34e8aa13",
|
19
19
|
"metadata": {
|
20
20
|
"execution": {
|
21
|
-
"iopub.execute_input": "2025-04-
|
22
|
-
"iopub.status.busy": "2025-04-
|
23
|
-
"iopub.status.idle": "2025-04-
|
24
|
-
"shell.execute_reply": "2025-04-
|
21
|
+
"iopub.execute_input": "2025-04-27T16:31:09.562901Z",
|
22
|
+
"iopub.status.busy": "2025-04-27T16:31:09.562589Z",
|
23
|
+
"iopub.status.idle": "2025-04-27T16:31:09.568933Z",
|
24
|
+
"shell.execute_reply": "2025-04-27T16:31:09.568165Z"
|
25
25
|
},
|
26
26
|
"lines_to_next_cell": 2
|
27
27
|
},
|
@@ -33,13 +33,13 @@
|
|
33
33
|
{
|
34
34
|
"cell_type": "code",
|
35
35
|
"execution_count": 2,
|
36
|
-
"id": "
|
36
|
+
"id": "1abe1fe6",
|
37
37
|
"metadata": {
|
38
38
|
"execution": {
|
39
|
-
"iopub.execute_input": "2025-04-
|
40
|
-
"iopub.status.busy": "2025-04-
|
41
|
-
"iopub.status.idle": "2025-04-
|
42
|
-
"shell.execute_reply": "2025-04-
|
39
|
+
"iopub.execute_input": "2025-04-27T16:31:09.571499Z",
|
40
|
+
"iopub.status.busy": "2025-04-27T16:31:09.571210Z",
|
41
|
+
"iopub.status.idle": "2025-04-27T16:31:15.218180Z",
|
42
|
+
"shell.execute_reply": "2025-04-27T16:31:15.217682Z"
|
43
43
|
}
|
44
44
|
},
|
45
45
|
"outputs": [],
|
@@ -70,7 +70,7 @@
|
|
70
70
|
},
|
71
71
|
{
|
72
72
|
"cell_type": "markdown",
|
73
|
-
"id": "
|
73
|
+
"id": "e02178e5",
|
74
74
|
"metadata": {},
|
75
75
|
"source": [
|
76
76
|
"This code uses `page.extract_tables()` which attempts to automatically detect tables based on visual cues like lines and whitespace. The result is a list of lists, representing the rows and cells of the table.\n",
|
@@ -96,6 +96,11 @@
|
|
96
96
|
"main_language": "python",
|
97
97
|
"notebook_metadata_filter": "-all"
|
98
98
|
},
|
99
|
+
"kernelspec": {
|
100
|
+
"display_name": "Python (natural-pdf)",
|
101
|
+
"language": "python",
|
102
|
+
"name": "natural-pdf"
|
103
|
+
},
|
99
104
|
"language_info": {
|
100
105
|
"codemirror_mode": {
|
101
106
|
"name": "ipython",
|