natural-pdf 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. docs/categorizing-documents/index.md +168 -0
  2. docs/data-extraction/index.md +87 -0
  3. docs/element-selection/index.ipynb +218 -164
  4. docs/element-selection/index.md +20 -0
  5. docs/finetuning/index.md +176 -0
  6. docs/index.md +19 -0
  7. docs/ocr/index.md +63 -16
  8. docs/tutorials/01-loading-and-extraction.ipynb +411 -248
  9. docs/tutorials/02-finding-elements.ipynb +123 -46
  10. docs/tutorials/03-extracting-blocks.ipynb +24 -19
  11. docs/tutorials/04-table-extraction.ipynb +17 -12
  12. docs/tutorials/05-excluding-content.ipynb +37 -32
  13. docs/tutorials/06-document-qa.ipynb +36 -31
  14. docs/tutorials/07-layout-analysis.ipynb +45 -40
  15. docs/tutorials/07-working-with-regions.ipynb +61 -60
  16. docs/tutorials/08-spatial-navigation.ipynb +76 -71
  17. docs/tutorials/09-section-extraction.ipynb +160 -155
  18. docs/tutorials/10-form-field-extraction.ipynb +71 -66
  19. docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
  20. docs/tutorials/12-ocr-integration.ipynb +3420 -312
  21. docs/tutorials/12-ocr-integration.md +68 -106
  22. docs/tutorials/13-semantic-search.ipynb +641 -251
  23. natural_pdf/__init__.py +3 -0
  24. natural_pdf/analyzers/layout/gemini.py +63 -47
  25. natural_pdf/classification/manager.py +343 -0
  26. natural_pdf/classification/mixin.py +149 -0
  27. natural_pdf/classification/results.py +62 -0
  28. natural_pdf/collections/mixins.py +63 -0
  29. natural_pdf/collections/pdf_collection.py +326 -17
  30. natural_pdf/core/element_manager.py +73 -4
  31. natural_pdf/core/page.py +255 -83
  32. natural_pdf/core/pdf.py +385 -367
  33. natural_pdf/elements/base.py +1 -3
  34. natural_pdf/elements/collections.py +279 -49
  35. natural_pdf/elements/region.py +106 -21
  36. natural_pdf/elements/text.py +5 -2
  37. natural_pdf/exporters/__init__.py +4 -0
  38. natural_pdf/exporters/base.py +61 -0
  39. natural_pdf/exporters/paddleocr.py +345 -0
  40. natural_pdf/extraction/manager.py +134 -0
  41. natural_pdf/extraction/mixin.py +246 -0
  42. natural_pdf/extraction/result.py +37 -0
  43. natural_pdf/ocr/__init__.py +16 -8
  44. natural_pdf/ocr/engine.py +46 -30
  45. natural_pdf/ocr/engine_easyocr.py +86 -42
  46. natural_pdf/ocr/engine_paddle.py +39 -28
  47. natural_pdf/ocr/engine_surya.py +32 -16
  48. natural_pdf/ocr/ocr_factory.py +34 -23
  49. natural_pdf/ocr/ocr_manager.py +98 -34
  50. natural_pdf/ocr/ocr_options.py +38 -10
  51. natural_pdf/ocr/utils.py +59 -33
  52. natural_pdf/qa/document_qa.py +0 -4
  53. natural_pdf/selectors/parser.py +363 -238
  54. natural_pdf/templates/finetune/fine_tune_paddleocr.md +420 -0
  55. natural_pdf/utils/debug.py +4 -2
  56. natural_pdf/utils/identifiers.py +9 -5
  57. natural_pdf/utils/locks.py +8 -0
  58. natural_pdf/utils/packaging.py +172 -105
  59. natural_pdf/utils/text_extraction.py +96 -65
  60. natural_pdf/utils/tqdm_utils.py +43 -0
  61. natural_pdf/utils/visualization.py +1 -1
  62. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +10 -3
  63. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +66 -51
  64. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
  65. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
  66. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
@@ -2,7 +2,7 @@
2
2
  "cells": [
3
3
  {
4
4
  "cell_type": "markdown",
5
- "id": "24111eee",
5
+ "id": "d3e16d1e",
6
6
  "metadata": {},
7
7
  "source": [
8
8
  "# Basic Table Extraction\n",
@@ -15,13 +15,13 @@
15
15
  {
16
16
  "cell_type": "code",
17
17
  "execution_count": 1,
18
- "id": "75f17900",
18
+ "id": "34e8aa13",
19
19
  "metadata": {
20
20
  "execution": {
21
- "iopub.execute_input": "2025-04-21T21:23:59.967091Z",
22
- "iopub.status.busy": "2025-04-21T21:23:59.966933Z",
23
- "iopub.status.idle": "2025-04-21T21:23:59.971753Z",
24
- "shell.execute_reply": "2025-04-21T21:23:59.970980Z"
21
+ "iopub.execute_input": "2025-04-27T16:31:09.562901Z",
22
+ "iopub.status.busy": "2025-04-27T16:31:09.562589Z",
23
+ "iopub.status.idle": "2025-04-27T16:31:09.568933Z",
24
+ "shell.execute_reply": "2025-04-27T16:31:09.568165Z"
25
25
  },
26
26
  "lines_to_next_cell": 2
27
27
  },
@@ -33,13 +33,13 @@
33
33
  {
34
34
  "cell_type": "code",
35
35
  "execution_count": 2,
36
- "id": "f1b71280",
36
+ "id": "1abe1fe6",
37
37
  "metadata": {
38
38
  "execution": {
39
- "iopub.execute_input": "2025-04-21T21:23:59.974183Z",
40
- "iopub.status.busy": "2025-04-21T21:23:59.973996Z",
41
- "iopub.status.idle": "2025-04-21T21:24:06.847197Z",
42
- "shell.execute_reply": "2025-04-21T21:24:06.846712Z"
39
+ "iopub.execute_input": "2025-04-27T16:31:09.571499Z",
40
+ "iopub.status.busy": "2025-04-27T16:31:09.571210Z",
41
+ "iopub.status.idle": "2025-04-27T16:31:15.218180Z",
42
+ "shell.execute_reply": "2025-04-27T16:31:15.217682Z"
43
43
  }
44
44
  },
45
45
  "outputs": [],
@@ -70,7 +70,7 @@
70
70
  },
71
71
  {
72
72
  "cell_type": "markdown",
73
- "id": "5c80e397",
73
+ "id": "e02178e5",
74
74
  "metadata": {},
75
75
  "source": [
76
76
  "This code uses `page.extract_tables()` which attempts to automatically detect tables based on visual cues like lines and whitespace. The result is a list of lists, representing the rows and cells of the table.\n",
@@ -96,6 +96,11 @@
96
96
  "main_language": "python",
97
97
  "notebook_metadata_filter": "-all"
98
98
  },
99
+ "kernelspec": {
100
+ "display_name": "Python (natural-pdf)",
101
+ "language": "python",
102
+ "name": "natural-pdf"
103
+ },
99
104
  "language_info": {
100
105
  "codemirror_mode": {
101
106
  "name": "ipython",