PyPI - natural-pdf - Versions diffs - 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl - Mend

natural-pdf 0.1.7py3-none-any.whl → 0.1.9py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (134) hide show

natural_pdf/__init__.py +3 -0
natural_pdf/analyzers/layout/base.py +1 -5
natural_pdf/analyzers/layout/gemini.py +61 -51
natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
natural_pdf/analyzers/layout/layout_manager.py +26 -84
natural_pdf/analyzers/layout/layout_options.py +7 -0
natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
natural_pdf/analyzers/layout/surya.py +46 -123
natural_pdf/analyzers/layout/tatr.py +51 -4
natural_pdf/analyzers/text_structure.py +3 -5
natural_pdf/analyzers/utils.py +3 -3
natural_pdf/classification/manager.py +422 -0
natural_pdf/classification/mixin.py +163 -0
natural_pdf/classification/results.py +80 -0
natural_pdf/collections/mixins.py +111 -0
natural_pdf/collections/pdf_collection.py +434 -15
natural_pdf/core/element_manager.py +83 -0
natural_pdf/core/highlighting_service.py +13 -22
natural_pdf/core/page.py +578 -93
natural_pdf/core/pdf.py +912 -460
natural_pdf/elements/base.py +134 -40
natural_pdf/elements/collections.py +712 -109
natural_pdf/elements/region.py +722 -69
natural_pdf/elements/text.py +4 -1
natural_pdf/export/mixin.py +137 -0
natural_pdf/exporters/base.py +3 -3
natural_pdf/exporters/paddleocr.py +5 -4
natural_pdf/extraction/manager.py +135 -0
natural_pdf/extraction/mixin.py +279 -0
natural_pdf/extraction/result.py +23 -0
natural_pdf/ocr/__init__.py +5 -5
natural_pdf/ocr/engine_doctr.py +346 -0
natural_pdf/ocr/engine_easyocr.py +6 -3
natural_pdf/ocr/ocr_factory.py +24 -4
natural_pdf/ocr/ocr_manager.py +122 -26
natural_pdf/ocr/ocr_options.py +94 -11
natural_pdf/ocr/utils.py +19 -6
natural_pdf/qa/document_qa.py +0 -4
natural_pdf/search/__init__.py +20 -34
natural_pdf/search/haystack_search_service.py +309 -265
natural_pdf/search/haystack_utils.py +99 -75
natural_pdf/search/search_service_protocol.py +11 -12
natural_pdf/selectors/parser.py +431 -230
natural_pdf/utils/debug.py +3 -3
natural_pdf/utils/identifiers.py +1 -1
natural_pdf/utils/locks.py +8 -0
natural_pdf/utils/packaging.py +8 -6
natural_pdf/utils/text_extraction.py +60 -1
natural_pdf/utils/tqdm_utils.py +51 -0
natural_pdf/utils/visualization.py +18 -0
natural_pdf/widgets/viewer.py +4 -25
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
natural_pdf-0.1.9.dist-info/RECORD +80 -0
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
docs/api/index.md +0 -386
docs/assets/favicon.png +0 -3
docs/assets/favicon.svg +0 -3
docs/assets/javascripts/custom.js +0 -17
docs/assets/logo.svg +0 -3
docs/assets/sample-screen.png +0 -0
docs/assets/social-preview.png +0 -17
docs/assets/social-preview.svg +0 -17
docs/assets/stylesheets/custom.css +0 -65
docs/document-qa/index.ipynb +0 -435
docs/document-qa/index.md +0 -79
docs/element-selection/index.ipynb +0 -915
docs/element-selection/index.md +0 -229
docs/finetuning/index.md +0 -176
docs/index.md +0 -170
docs/installation/index.md +0 -69
docs/interactive-widget/index.ipynb +0 -962
docs/interactive-widget/index.md +0 -12
docs/layout-analysis/index.ipynb +0 -818
docs/layout-analysis/index.md +0 -185
docs/ocr/index.md +0 -209
docs/pdf-navigation/index.ipynb +0 -314
docs/pdf-navigation/index.md +0 -97
docs/regions/index.ipynb +0 -816
docs/regions/index.md +0 -294
docs/tables/index.ipynb +0 -658
docs/tables/index.md +0 -144
docs/text-analysis/index.ipynb +0 -370
docs/text-analysis/index.md +0 -105
docs/text-extraction/index.ipynb +0 -1478
docs/text-extraction/index.md +0 -292
docs/tutorials/01-loading-and-extraction.ipynb +0 -194
docs/tutorials/01-loading-and-extraction.md +0 -95
docs/tutorials/02-finding-elements.ipynb +0 -340
docs/tutorials/02-finding-elements.md +0 -149
docs/tutorials/03-extracting-blocks.ipynb +0 -147
docs/tutorials/03-extracting-blocks.md +0 -48
docs/tutorials/04-table-extraction.ipynb +0 -114
docs/tutorials/04-table-extraction.md +0 -50
docs/tutorials/05-excluding-content.ipynb +0 -270
docs/tutorials/05-excluding-content.md +0 -109
docs/tutorials/06-document-qa.ipynb +0 -332
docs/tutorials/06-document-qa.md +0 -91
docs/tutorials/07-layout-analysis.ipynb +0 -288
docs/tutorials/07-layout-analysis.md +0 -66
docs/tutorials/07-working-with-regions.ipynb +0 -413
docs/tutorials/07-working-with-regions.md +0 -151
docs/tutorials/08-spatial-navigation.ipynb +0 -508
docs/tutorials/08-spatial-navigation.md +0 -190
docs/tutorials/09-section-extraction.ipynb +0 -2434
docs/tutorials/09-section-extraction.md +0 -256
docs/tutorials/10-form-field-extraction.ipynb +0 -512
docs/tutorials/10-form-field-extraction.md +0 -201
docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
docs/tutorials/11-enhanced-table-processing.md +0 -9
docs/tutorials/12-ocr-integration.ipynb +0 -604
docs/tutorials/12-ocr-integration.md +0 -175
docs/tutorials/13-semantic-search.ipynb +0 -1328
docs/tutorials/13-semantic-search.md +0 -77
docs/visual-debugging/index.ipynb +0 -2970
docs/visual-debugging/index.md +0 -157
docs/visual-debugging/region.png +0 -0
natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
natural_pdf/templates/spa/css/style.css +0 -334
natural_pdf/templates/spa/index.html +0 -31
natural_pdf/templates/spa/js/app.js +0 -472
natural_pdf/templates/spa/words.txt +0 -235976
natural_pdf/widgets/frontend/viewer.js +0 -88
natural_pdf-0.1.7.dist-info/RECORD +0 -145
notebooks/Examples.ipynb +0 -1293
pdfs/.gitkeep +0 -0
pdfs/01-practice.pdf +0 -543
pdfs/0500000US42001.pdf +0 -0
pdfs/0500000US42007.pdf +0 -0
pdfs/2014 Statistics.pdf +0 -0
pdfs/2019 Statistics.pdf +0 -0
pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
pdfs/needs-ocr.pdf +0 -0
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0

docs/tutorials/12-ocr-integration.ipynb DELETED Viewed

@@ -1,604 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "8b02fa9e",
-   "metadata": {},
-   "source": [
-    "# OCR Integration for Scanned Documents\n",
-    "\n",
-    "Optical Character Recognition (OCR) allows you to extract text from scanned documents where the text isn't embedded in the PDF. This tutorial demonstrates how to work with scanned documents."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "bde55ac1",
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2025-04-21T21:32:06.104226Z",
-     "iopub.status.busy": "2025-04-21T21:32:06.104019Z",
-     "iopub.status.idle": "2025-04-21T21:32:06.108232Z",
-     "shell.execute_reply": "2025-04-21T21:32:06.107754Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "#%pip install \"natural-pdf[all]\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "5c624a53",
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2025-04-21T21:32:06.110125Z",
-     "iopub.status.busy": "2025-04-21T21:32:06.109925Z",
-     "iopub.status.idle": "2025-04-21T21:32:14.008764Z",
-     "shell.execute_reply": "2025-04-21T21:32:14.008268Z"
-    }
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'Without OCR: 0 characters extracted'"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "from natural_pdf import PDF\n",
-    "\n",
-    "# Load a PDF\n",
-    "pdf = PDF(\"https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/needs-ocr.pdf\")\n",
-    "page = pdf.pages[0]\n",
-    "\n",
-    "# Try extracting text without OCR\n",
-    "text_without_ocr = page.extract_text()\n",
-    "f\"Without OCR: {len(text_without_ocr)} characters extracted\""
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "461a5090",
-   "metadata": {},
-   "source": [
-    "## Finding Text Elements with OCR"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "895e3c2c",
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2025-04-21T21:32:14.010745Z",
-     "iopub.status.busy": "2025-04-21T21:32:14.010324Z",
-     "iopub.status.idle": "2025-04-21T21:32:28.416856Z",
-     "shell.execute_reply": "2025-04-21T21:32:28.416360Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\u001b[2m2025-04-21T21:32:14.064078Z\u001b[0m [\u001b[33m\u001b[1mwarning  \u001b[0m] \u001b[1mUsing CPU. Note: This module is much faster with a GPU.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m71\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35measyocr.easyocr\u001b[0m\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "[2025-04-21 17:32:14,064] [ WARNING] easyocr.py:71 - Using CPU. Note: This module is much faster with a GPU.\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "<ElementCollection[TextElement](count=47)>"
-      ]
-     },
-     "execution_count": 3,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Convert text-as-image to text elements\n",
-    "page.apply_ocr()\n",
-    "\n",
-    "# Select all text pieces on the page\n",
-    "text_elements = page.find_all('text')\n",
-    "f\"Found {len(text_elements)} text elements\"\n",
-    "\n",
-    "# Visualize the elements\n",
-    "text_elements.highlight()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "36051d57",
-   "metadata": {},
-   "source": [
-    "## OCR Configuration Options"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "d4461746",
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2025-04-21T21:32:28.418763Z",
-     "iopub.status.busy": "2025-04-21T21:32:28.418565Z",
-     "iopub.status.idle": "2025-04-21T21:32:28.423024Z",
-     "shell.execute_reply": "2025-04-21T21:32:28.422671Z"
-    }
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'                                                                                    \\n                                                                                    \\n                              ...'"
-      ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Set OCR configuration for better results\n",
-    "page.ocr_config = {\n",
-    "    'language': 'eng',  # English\n",
-    "    'dpi': 300,         # Higher resolution\n",
-    "}\n",
-    "\n",
-    "# Extract text with the improved configuration\n",
-    "improved_text = page.extract_text()\n",
-    "\n",
-    "# Preview the text\n",
-    "improved_text[:200] + \"...\" if len(improved_text) > 200 else improved_text"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "d5a96ac7",
-   "metadata": {},
-   "source": [
-    "## Working with Multi-language Documents"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "9fa156f5",
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2025-04-21T21:32:28.424374Z",
-     "iopub.status.busy": "2025-04-21T21:32:28.424235Z",
-     "iopub.status.idle": "2025-04-21T21:32:28.428114Z",
-     "shell.execute_reply": "2025-04-21T21:32:28.427816Z"
-    }
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'                                                                                    \\n                                                                                    \\n                              '"
-      ]
-     },
-     "execution_count": 5,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Configure for multiple languages\n",
-    "page.ocr_config = {\n",
-    "    'language': 'eng+fra+deu',  # English, French, German\n",
-    "    'dpi': 300\n",
-    "}\n",
-    "\n",
-    "# Extract text with multi-language support\n",
-    "multilang_text = page.extract_text()\n",
-    "multilang_text[:200]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "d3ccf43f",
-   "metadata": {},
-   "source": [
-    "## Extracting Tables from Scanned Documents"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "ee7a7e7d",
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2025-04-21T21:32:28.429414Z",
-     "iopub.status.busy": "2025-04-21T21:32:28.429283Z",
-     "iopub.status.idle": "2025-04-21T21:32:30.754086Z",
-     "shell.execute_reply": "2025-04-21T21:32:30.753700Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\u001b[2m2025-04-21T21:32:28.446098Z\u001b[0m [\u001b[33m\u001b[1mwarning  \u001b[0m] \u001b[1mGOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m72\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.analyzers.layout.gemini\u001b[0m\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "[2025-04-21 17:32:28,446] [ WARNING] gemini.py:72 - GOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\u001b[2m2025-04-21T21:32:28.446834Z\u001b[0m [\u001b[33m\u001b[1mwarning  \u001b[0m] \u001b[1mGOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m72\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.analyzers.layout.gemini\u001b[0m\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "[2025-04-21 17:32:28,446] [ WARNING] gemini.py:72 - GOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "image 1/1 /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmpjbbxsx1v/temp_layout_image.png: 1024x800 2 titles, 2 plain texts, 3 abandons, 1 table, 1940.4ms\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Speed: 5.4ms preprocess, 1940.4ms inference, 1.0ms postprocess per image at shape (1, 3, 1024, 800)\n"
-     ]
-    }
-   ],
-   "source": [
-    "# Enable OCR and analyze the document layout\n",
-    "page.use_ocr = True\n",
-    "page.analyze_layout()\n",
-    "\n",
-    "# Find table regions\n",
-    "table_regions = page.find_all('region[type=table]')\n",
-    "\n",
-    "# Visualize any detected tables\n",
-    "table_regions.highlight()\n",
-    "\n",
-    "# Extract the first table if found\n",
-    "if table_regions:\n",
-    "    table_data = table_regions[0].extract_table()\n",
-    "    table_data\n",
-    "else:\n",
-    "    \"No tables found in the document\""
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "6a3c701e",
-   "metadata": {},
-   "source": [
-    "## Finding Form Fields in Scanned Documents"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "7180badd",
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2025-04-21T21:32:30.755960Z",
-     "iopub.status.busy": "2025-04-21T21:32:30.755766Z",
-     "iopub.status.idle": "2025-04-21T21:32:30.762760Z",
-     "shell.execute_reply": "2025-04-21T21:32:30.762434Z"
-    }
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{\"Site: Durham's Meatpacking Chicago, IIl.\": 'Jungle Health and Satety Inspection Service\\nINS-UPONSINCLAIR           \\n                           \\n                           \\n                           \\n                           \\n                           \\nSummary: Worst of any, however; were the fertilizer men, and those who served in the cooking rooms\\nThese people could not be shown to the visitor for the odor of a fertilizer man would scare any\\nvisitor at a hundred yards, and as for the other men, who worked in tank rooms full of steam, and in\\nsome of which there were open vats near the level of the floor; their peculiar trouble was that they fell\\ninlo the vats; and when they were fished out; there was never enough of them left to be worth\\nwould be overlooked for days, till all but the bones of them had gone out\\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\nLevel                      \\nUnsanitary Working Conditions Critical\\nInadequate Protective Equipment: Serious\\n                           \\nSerious                    \\nFailure to Properly Storc Hazardous Materials_ Critical\\nSafety Measures_ Serious   \\nInadequate Ventilation Systems Serious\\n                           \\nInsufficient Employee Training for Safe Work Practices Serious\\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\nJungle Health and Salety Irspection Service',\n",
-       " 'Date: February 3, 1905': \"Jungle Health and Satety Inspection Service\\n        INS-UPONSINCLAIR   \\n                           \\nSite: Durham's Meatpacking Chicago, IIl.\\n                           \\n                           \\n                           \\nSummary: Worst of any, however; were the fertilizer men, and those who served in the cooking rooms\\nThese people could not be shown to the visitor for the odor of a fertilizer man would scare any\\nvisitor at a hundred yards, and as for the other men, who worked in tank rooms full of steam, and in\\nsome of which there were open vats near the level of the floor; their peculiar trouble was that they fell\\ninlo the vats; and when they were fished out; there was never enough of them left to be worth\\ntheywould be overlooked for days, till all but the bones of them had gone out\\nto thc world as Durham's Purc Lcaf Lard!\\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\nDescription                \\n                           \\nUnsanitary Working Conditions\\nInadequate Protective Equipment:\\nIneffective Injury Prevention _\\n                           \\nFailure to Properly Storc Hazardous Materials_\\nLack of AdequateFireSafety Measures_\\nInadequate Ventilation Systems\\n                           \\nInsufficient Employee Training for Safe Work Practices\\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\nJungle Health and Salety Irspection Service\",\n",
-       " 'Violation Count': \"Site: Durham's Meatpacking Chicago, IIl.\\nDate: February 3, 1905     \\n                           \\n                           \\nSummary: Worst of any, however; were the fertilizer men, and those who served in the cooking rooms\\nThese people could not be shown to the visitor for the odor of a fertilizer man would scare any\\nvisitor at a hundred yards, and as for the other men, who worked in tank rooms full of steam, and in\\nsome of which there were open vats near the level of the floor; their peculiar trouble was that they fell\\ninlo the vats; and when they were fished out; there was never enough of them left to be worth\\nsometimestheywould be overlooked for days, till all but the bones of them had gone out\\nto thc world as Durham's Purc Lcaf Lard!\\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\nDescription                \\n                           \\nUnsanitary Working Conditions\\nInadequate Protective Equipment:\\nIneffective Injury Prevention _\\n                           \\nFailure to Properly Storc Hazardous Materials_\\nLack of AdequateFireSafety Measures_\\nInadequate Ventilation Systems\\n                           \\nInsufficient Employee Training for Safe Work Practices\\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\nJungle Health and Salety Irspection Service\",\n",
-       " 'Summary: Worst of any, however; were the fertilizer men, and those who served in the cooking rooms': 'Red (ZGB tuple]         \\n                        \\nJungle Health and Satety Inspection Service\\n                        \\n                        \\n                        \\n                        \\n                        \\n                        \\n                        \\nordinary                \\n                        \\n                        \\n                        \\n                        \\n                        \\n                        \\n                        \\n                        \\n                        \\n                        \\n                        \\n                        \\n                        \\n                        \\nRepeat?',\n",
-       " 'Inadequate Protective Equipment': 'Jungle Health and Satety Inspection Service\\nINS-UPONSINCLAIR           \\n                           \\n                           \\n                           \\n                           \\n                           \\nSummary: Worst of any, however; were the fertilizer men, and those who served in the cooking rooms\\nThese people could not be shown to the visitor for the odor of a fertilizer man would scare anyordinary\\nvisitor at a hundred yards, and as for the other men, who worked in tank rooms full of steam, and in\\nsome of which there were open vats near the level of the floor; their peculiar trouble was that they fell\\ninlo the vats; and when they were fished out; there was never enough of them left to be worth\\nwould be overlooked for days, till all but the bones of them had gone out\\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\nLevel                      \\nCritical                   \\nSerious                    \\n                           \\nSerious                    \\nFailure to Properly Storc Hazardous Materials_ Critical\\nSafety Measures_ Serious   \\nSerious                    \\n                           \\nInsufficient Employee Training for Safe Work Practices Serious\\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\n                           \\nJungle Health and Salety Irspection Service'}"
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Look for potential form labels (containing a colon)\n",
-    "labels = page.find_all('text:contains(\":\")') \n",
-    "\n",
-    "# Visualize the labels\n",
-    "labels.highlight()\n",
-    "\n",
-    "# Extract form data by looking to the right of each label\n",
-    "form_data = {}\n",
-    "for label in labels:\n",
-    "    # Clean the label text\n",
-    "    field_name = label.text.strip().rstrip(':')\n",
-    "    \n",
-    "    # Find the value to the right\n",
-    "    value_element = label.right(width=200)\n",
-    "    value = value_element.extract_text().strip()\n",
-    "    \n",
-    "    # Add to our dictionary\n",
-    "    form_data[field_name] = value\n",
-    "\n",
-    "# Display the extracted data\n",
-    "form_data"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "5495e93c",
-   "metadata": {},
-   "source": [
-    "## Combining OCR with Layout Analysis"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "20b489df",
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2025-04-21T21:32:30.764203Z",
-     "iopub.status.busy": "2025-04-21T21:32:30.764045Z",
-     "iopub.status.idle": "2025-04-21T21:32:32.790129Z",
-     "shell.execute_reply": "2025-04-21T21:32:32.789771Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\u001b[2m2025-04-21T21:32:30.782293Z\u001b[0m [\u001b[33m\u001b[1mwarning  \u001b[0m] \u001b[1mGOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m72\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.analyzers.layout.gemini\u001b[0m\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "[2025-04-21 17:32:30,782] [ WARNING] gemini.py:72 - GOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available.\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\u001b[2m2025-04-21T21:32:30.783192Z\u001b[0m [\u001b[33m\u001b[1mwarning  \u001b[0m] \u001b[1mGOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m72\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35mnatural_pdf.analyzers.layout.gemini\u001b[0m\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "[2025-04-21 17:32:30,783] [ WARNING] gemini.py:72 - GOOGLE_API_KEY environment variable not set. Gemini detector (via OpenAI lib) will not be available.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "image 1/1 /var/folders/25/h3prywj14qb0mlkl2s8bxq5m0000gn/T/tmprtsl29ey/temp_layout_image.png: 1024x800 2 titles, 2 plain texts, 3 abandons, 1 table, 1925.6ms\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Speed: 4.7ms preprocess, 1925.6ms inference, 1.2ms postprocess per image at shape (1, 3, 1024, 800)\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "[]"
-      ]
-     },
-     "execution_count": 8,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Apply OCR and analyze layout\n",
-    "page.use_ocr = True\n",
-    "page.analyze_layout()\n",
-    "\n",
-    "# Find document structure elements\n",
-    "headings = page.find_all('region[type=heading]')\n",
-    "paragraphs = page.find_all('region[type=paragraph]')\n",
-    "\n",
-    "# Visualize the structure\n",
-    "headings.highlight(color=\"red\", label=\"Headings\")\n",
-    "paragraphs.highlight(color=\"blue\", label=\"Paragraphs\")\n",
-    "\n",
-    "# Create a simple document outline\n",
-    "document_outline = []\n",
-    "for heading in headings:\n",
-    "    heading_text = heading.extract_text()\n",
-    "    document_outline.append(heading_text)\n",
-    "\n",
-    "document_outline"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "320bdfc4",
-   "metadata": {},
-   "source": [
-    "## Working with Multiple Pages"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "id": "9421a04d",
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2025-04-21T21:32:32.791525Z",
-     "iopub.status.busy": "2025-04-21T21:32:32.791398Z",
-     "iopub.status.idle": "2025-04-21T21:32:32.796295Z",
-     "shell.execute_reply": "2025-04-21T21:32:32.795973Z"
-    }
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['Page 1:                                                                                     \\n               ...']"
-      ]
-     },
-     "execution_count": 9,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Process all pages in the document\n",
-    "all_text = []\n",
-    "\n",
-    "for i, page in enumerate(pdf.pages):\n",
-    "    # Enable OCR for each page\n",
-    "    page.use_ocr = True\n",
-    "    \n",
-    "    # Extract text\n",
-    "    page_text = page.extract_text()\n",
-    "    \n",
-    "    # Add to our collection with page number\n",
-    "    all_text.append(f\"Page {i+1}: {page_text[:100]}...\")\n",
-    "\n",
-    "# Show the first few pages\n",
-    "all_text"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "d69c14d1",
-   "metadata": {},
-   "source": [
-    "## Saving PDFs with Searchable Text\n",
-    "\n",
-    "After applying OCR to a PDF, you can save a new version of the PDF where the recognized text is embedded as an invisible layer. This makes the text searchable and copyable in standard PDF viewers.\n",
-    "\n",
-    "Use the `save_searchable()` method on the `PDF` object:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "id": "e84f8946",
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2025-04-21T21:32:32.797789Z",
-     "iopub.status.busy": "2025-04-21T21:32:32.797610Z",
-     "iopub.status.idle": "2025-04-21T21:32:49.165749Z",
-     "shell.execute_reply": "2025-04-21T21:32:49.165293Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\u001b[2m2025-04-21T21:32:32.910436Z\u001b[0m [\u001b[33m\u001b[1mwarning  \u001b[0m] \u001b[1mUsing CPU. Note: This module is much faster with a GPU.\u001b[0m \u001b[36mlineno\u001b[0m=\u001b[35m71\u001b[0m \u001b[36mmodule\u001b[0m=\u001b[35measyocr.easyocr\u001b[0m\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "[2025-04-21 17:32:32,910] [ WARNING] easyocr.py:71 - Using CPU. Note: This module is much faster with a GPU.\n"
-     ]
-    }
-   ],
-   "source": [
-    "from natural_pdf import PDF\n",
-    "\n",
-    "input_pdf_path = \"https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/needs-ocr.pdf\"\n",
-    "\n",
-    "pdf = PDF(input_pdf_path)\n",
-    "pdf.apply_ocr() \n",
-    "\n",
-    "pdf.save_searchable(\"needs-ocr-searchable.pdf\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "cd0b43ed",
-   "metadata": {},
-   "source": [
-    "This creates `needs-ocr-searchable.pdf`, which looks identical to the original but now has a text layer corresponding to the OCR results. You can adjust the rendering resolution used during saving with the `dpi` parameter (default is 300).\n",
-    "\n",
-    "OCR integration enables you to work with scanned documents, historical archives, and image-based PDFs that don't have embedded text. By combining OCR with natural-pdf's layout analysis capabilities, you can turn any document into structured, searchable data. "
-   ]
-  }
- ],
- "metadata": {
-  "jupytext": {
-   "cell_metadata_filter": "-all",
-   "main_language": "python",
-   "notebook_metadata_filter": "-all",
-   "text_representation": {
-    "extension": ".md",
-    "format_name": "markdown"
-   }
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.13"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}

natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

natural-pdf 0.1.7py3-none-any.whl → 0.1.9py3-none-any.whl