natural-pdf 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. docs/categorizing-documents/index.md +168 -0
  2. docs/data-extraction/index.md +87 -0
  3. docs/element-selection/index.ipynb +218 -164
  4. docs/element-selection/index.md +20 -0
  5. docs/index.md +19 -0
  6. docs/ocr/index.md +63 -16
  7. docs/tutorials/01-loading-and-extraction.ipynb +1713 -34
  8. docs/tutorials/02-finding-elements.ipynb +123 -46
  9. docs/tutorials/03-extracting-blocks.ipynb +24 -19
  10. docs/tutorials/04-table-extraction.ipynb +17 -12
  11. docs/tutorials/05-excluding-content.ipynb +37 -32
  12. docs/tutorials/06-document-qa.ipynb +36 -31
  13. docs/tutorials/07-layout-analysis.ipynb +45 -40
  14. docs/tutorials/07-working-with-regions.ipynb +61 -60
  15. docs/tutorials/08-spatial-navigation.ipynb +76 -71
  16. docs/tutorials/09-section-extraction.ipynb +160 -155
  17. docs/tutorials/10-form-field-extraction.ipynb +71 -66
  18. docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
  19. docs/tutorials/12-ocr-integration.ipynb +3420 -312
  20. docs/tutorials/12-ocr-integration.md +68 -106
  21. docs/tutorials/13-semantic-search.ipynb +641 -251
  22. natural_pdf/__init__.py +2 -0
  23. natural_pdf/classification/manager.py +343 -0
  24. natural_pdf/classification/mixin.py +149 -0
  25. natural_pdf/classification/results.py +62 -0
  26. natural_pdf/collections/mixins.py +63 -0
  27. natural_pdf/collections/pdf_collection.py +321 -15
  28. natural_pdf/core/element_manager.py +67 -0
  29. natural_pdf/core/page.py +227 -64
  30. natural_pdf/core/pdf.py +387 -378
  31. natural_pdf/elements/collections.py +272 -41
  32. natural_pdf/elements/region.py +99 -15
  33. natural_pdf/elements/text.py +5 -2
  34. natural_pdf/exporters/paddleocr.py +1 -1
  35. natural_pdf/extraction/manager.py +134 -0
  36. natural_pdf/extraction/mixin.py +246 -0
  37. natural_pdf/extraction/result.py +37 -0
  38. natural_pdf/ocr/engine_easyocr.py +6 -3
  39. natural_pdf/ocr/ocr_manager.py +85 -25
  40. natural_pdf/ocr/ocr_options.py +33 -10
  41. natural_pdf/ocr/utils.py +14 -3
  42. natural_pdf/qa/document_qa.py +0 -4
  43. natural_pdf/selectors/parser.py +363 -238
  44. natural_pdf/templates/finetune/fine_tune_paddleocr.md +10 -5
  45. natural_pdf/utils/locks.py +8 -0
  46. natural_pdf/utils/text_extraction.py +52 -1
  47. natural_pdf/utils/tqdm_utils.py +43 -0
  48. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +6 -1
  49. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +52 -41
  50. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
  51. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
@@ -2,7 +2,7 @@
2
2
  "cells": [
3
3
  {
4
4
  "cell_type": "markdown",
5
- "id": "24111eee",
5
+ "id": "d3e16d1e",
6
6
  "metadata": {},
7
7
  "source": [
8
8
  "# Basic Table Extraction\n",
@@ -15,13 +15,13 @@
15
15
  {
16
16
  "cell_type": "code",
17
17
  "execution_count": 1,
18
- "id": "75f17900",
18
+ "id": "34e8aa13",
19
19
  "metadata": {
20
20
  "execution": {
21
- "iopub.execute_input": "2025-04-21T21:23:59.967091Z",
22
- "iopub.status.busy": "2025-04-21T21:23:59.966933Z",
23
- "iopub.status.idle": "2025-04-21T21:23:59.971753Z",
24
- "shell.execute_reply": "2025-04-21T21:23:59.970980Z"
21
+ "iopub.execute_input": "2025-04-27T16:31:09.562901Z",
22
+ "iopub.status.busy": "2025-04-27T16:31:09.562589Z",
23
+ "iopub.status.idle": "2025-04-27T16:31:09.568933Z",
24
+ "shell.execute_reply": "2025-04-27T16:31:09.568165Z"
25
25
  },
26
26
  "lines_to_next_cell": 2
27
27
  },
@@ -33,13 +33,13 @@
33
33
  {
34
34
  "cell_type": "code",
35
35
  "execution_count": 2,
36
- "id": "f1b71280",
36
+ "id": "1abe1fe6",
37
37
  "metadata": {
38
38
  "execution": {
39
- "iopub.execute_input": "2025-04-21T21:23:59.974183Z",
40
- "iopub.status.busy": "2025-04-21T21:23:59.973996Z",
41
- "iopub.status.idle": "2025-04-21T21:24:06.847197Z",
42
- "shell.execute_reply": "2025-04-21T21:24:06.846712Z"
39
+ "iopub.execute_input": "2025-04-27T16:31:09.571499Z",
40
+ "iopub.status.busy": "2025-04-27T16:31:09.571210Z",
41
+ "iopub.status.idle": "2025-04-27T16:31:15.218180Z",
42
+ "shell.execute_reply": "2025-04-27T16:31:15.217682Z"
43
43
  }
44
44
  },
45
45
  "outputs": [],
@@ -70,7 +70,7 @@
70
70
  },
71
71
  {
72
72
  "cell_type": "markdown",
73
- "id": "5c80e397",
73
+ "id": "e02178e5",
74
74
  "metadata": {},
75
75
  "source": [
76
76
  "This code uses `page.extract_tables()` which attempts to automatically detect tables based on visual cues like lines and whitespace. The result is a list of lists, representing the rows and cells of the table.\n",
@@ -96,6 +96,11 @@
96
96
  "main_language": "python",
97
97
  "notebook_metadata_filter": "-all"
98
98
  },
99
+ "kernelspec": {
100
+ "display_name": "Python (natural-pdf)",
101
+ "language": "python",
102
+ "name": "natural-pdf"
103
+ },
99
104
  "language_info": {
100
105
  "codemirror_mode": {
101
106
  "name": "ipython",