natural-pdf 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. docs/categorizing-documents/index.md +168 -0
  2. docs/data-extraction/index.md +87 -0
  3. docs/element-selection/index.ipynb +218 -164
  4. docs/element-selection/index.md +20 -0
  5. docs/index.md +19 -0
  6. docs/ocr/index.md +63 -16
  7. docs/tutorials/01-loading-and-extraction.ipynb +1713 -34
  8. docs/tutorials/02-finding-elements.ipynb +123 -46
  9. docs/tutorials/03-extracting-blocks.ipynb +24 -19
  10. docs/tutorials/04-table-extraction.ipynb +17 -12
  11. docs/tutorials/05-excluding-content.ipynb +37 -32
  12. docs/tutorials/06-document-qa.ipynb +36 -31
  13. docs/tutorials/07-layout-analysis.ipynb +45 -40
  14. docs/tutorials/07-working-with-regions.ipynb +61 -60
  15. docs/tutorials/08-spatial-navigation.ipynb +76 -71
  16. docs/tutorials/09-section-extraction.ipynb +160 -155
  17. docs/tutorials/10-form-field-extraction.ipynb +71 -66
  18. docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
  19. docs/tutorials/12-ocr-integration.ipynb +3420 -312
  20. docs/tutorials/12-ocr-integration.md +68 -106
  21. docs/tutorials/13-semantic-search.ipynb +641 -251
  22. natural_pdf/__init__.py +2 -0
  23. natural_pdf/classification/manager.py +343 -0
  24. natural_pdf/classification/mixin.py +149 -0
  25. natural_pdf/classification/results.py +62 -0
  26. natural_pdf/collections/mixins.py +63 -0
  27. natural_pdf/collections/pdf_collection.py +321 -15
  28. natural_pdf/core/element_manager.py +67 -0
  29. natural_pdf/core/page.py +227 -64
  30. natural_pdf/core/pdf.py +387 -378
  31. natural_pdf/elements/collections.py +272 -41
  32. natural_pdf/elements/region.py +99 -15
  33. natural_pdf/elements/text.py +5 -2
  34. natural_pdf/exporters/paddleocr.py +1 -1
  35. natural_pdf/extraction/manager.py +134 -0
  36. natural_pdf/extraction/mixin.py +246 -0
  37. natural_pdf/extraction/result.py +37 -0
  38. natural_pdf/ocr/engine_easyocr.py +6 -3
  39. natural_pdf/ocr/ocr_manager.py +85 -25
  40. natural_pdf/ocr/ocr_options.py +33 -10
  41. natural_pdf/ocr/utils.py +14 -3
  42. natural_pdf/qa/document_qa.py +0 -4
  43. natural_pdf/selectors/parser.py +363 -238
  44. natural_pdf/templates/finetune/fine_tune_paddleocr.md +10 -5
  45. natural_pdf/utils/locks.py +8 -0
  46. natural_pdf/utils/text_extraction.py +52 -1
  47. natural_pdf/utils/tqdm_utils.py +43 -0
  48. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +6 -1
  49. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +52 -41
  50. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
  51. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
@@ -18,139 +18,97 @@ text_without_ocr = page.extract_text()
18
18
  f"Without OCR: {len(text_without_ocr)} characters extracted"
19
19
  ```
20
20
 
21
- ## Finding Text Elements with OCR
21
+ ## Applying OCR and Finding Elements
22
+
23
+ The core method is `page.apply_ocr()`. This runs the OCR process and adds `TextElement` objects to the page. You can specify the engine and languages.
24
+
25
+ **Note:** Re-applying OCR to the same page or region will automatically remove any previously generated OCR elements for that area before adding the new ones.
22
26
 
23
27
  ```python
24
- # Convert text-as-image to text elements
25
- page.apply_ocr()
28
+ # Apply OCR using the default engine (EasyOCR) for English
29
+ page.apply_ocr(languages=['en'])
26
30
 
27
- # Select all text pieces on the page
28
- text_elements = page.find_all('text')
29
- f"Found {len(text_elements)} text elements"
31
+ # Select all text pieces found by OCR
32
+ text_elements = page.find_all('text[source=ocr]')
33
+ print(f"Found {len(text_elements)} text elements using default OCR")
30
34
 
31
35
  # Visualize the elements
32
36
  text_elements.highlight()
33
- ```
34
37
 
35
- ## OCR Configuration Options
36
-
37
- ```python
38
- # Set OCR configuration for better results
39
- page.ocr_config = {
40
- 'language': 'eng', # English
41
- 'dpi': 300, # Higher resolution
42
- }
38
+ # Apply OCR using PaddleOCR for English and Chinese
39
+ page.apply_ocr(engine='paddle', languages=['en', 'ch_sim'])
43
40
 
44
- # Extract text with the improved configuration
45
- improved_text = page.extract_text()
41
+ # Apply OCR using SuryaOCR for English and German
42
+ page.apply_ocr(engine='surya', languages=['en', 'de'])
46
43
 
47
- # Preview the text
48
- improved_text[:200] + "..." if len(improved_text) > 200 else improved_text
44
+ text_with_ocr = page.extract_text()
45
+ print(f"\nExtracted text after OCR:\n{text_with_ocr[:150]}...")
49
46
  ```
50
47
 
51
- ## Working with Multi-language Documents
48
+ ## Advanced OCR Configuration
52
49
 
53
- ```python
54
- # Configure for multiple languages
55
- page.ocr_config = {
56
- 'language': 'eng+fra+deu', # English, French, German
57
- 'dpi': 300
58
- }
59
-
60
- # Extract text with multi-language support
61
- multilang_text = page.extract_text()
62
- multilang_text[:200]
63
- ```
64
-
65
- ## Extracting Tables from Scanned Documents
50
+ For more control, import and use the specific `Options` class for your chosen engine within the `apply_ocr` call.
66
51
 
67
52
  ```python
68
- # Enable OCR and analyze the document layout
69
- page.use_ocr = True
70
- page.analyze_layout()
71
-
72
- # Find table regions
73
- table_regions = page.find_all('region[type=table]')
74
-
75
- # Visualize any detected tables
76
- table_regions.highlight()
77
-
78
- # Extract the first table if found
79
- if table_regions:
80
- table_data = table_regions[0].extract_table()
81
- table_data
82
- else:
83
- "No tables found in the document"
53
+ from natural_pdf.ocr import PaddleOCROptions, EasyOCROptions, SuryaOCROptions
54
+
55
+ # Re-apply OCR using EasyOCR with specific options
56
+ easy_opts = EasyOCROptions(
57
+ paragraph=False,
58
+ )
59
+ page.apply_ocr(engine='easyocr', languages=['en'], min_confidence=0.1, options=easy_opts)
60
+
61
+ paddle_opts = PaddleOCROptions(
62
+ use_angle_cls=False,
63
+ det_db_thresh=0.3,
64
+ )
65
+ page.apply_ocr(engine='paddle', languages=['en'], options=paddle_opts)
66
+
67
+ surya_opts = SuryaOCROptions()
68
+ page.apply_ocr(engine='surya', languages=['en'], min_confidence=0.5, detect_only=True, options=surya_opts)
84
69
  ```
85
70
 
86
- ## Finding Form Fields in Scanned Documents
71
+ ## Interactive OCR Correction / Debugging
87
72
 
88
- ```python
89
- # Look for potential form labels (containing a colon)
90
- labels = page.find_all('text:contains(":")')
91
-
92
- # Visualize the labels
93
- labels.highlight()
94
-
95
- # Extract form data by looking to the right of each label
96
- form_data = {}
97
- for label in labels:
98
- # Clean the label text
99
- field_name = label.text.strip().rstrip(':')
100
-
101
- # Find the value to the right
102
- value_element = label.right(width=200)
103
- value = value_element.extract_text().strip()
104
-
105
- # Add to our dictionary
106
- form_data[field_name] = value
107
-
108
- # Display the extracted data
109
- form_data
110
- ```
73
+ If OCR results aren't perfect, you can use the bundled interactive web application (SPA) to review and correct them.
111
74
 
112
- ## Combining OCR with Layout Analysis
75
+ 1. **Package the data:**
76
+ After running `apply_ocr` (or `apply_layout`), use `create_correction_task_package` to create a zip file containing the PDF images and detected elements.
113
77
 
114
- ```python
115
- # Apply OCR and analyze layout
116
- page.use_ocr = True
117
- page.analyze_layout()
78
+ ```python
79
+ from natural_pdf.utils.packaging import create_correction_task_package
118
80
 
119
- # Find document structure elements
120
- headings = page.find_all('region[type=heading]')
121
- paragraphs = page.find_all('region[type=paragraph]')
81
+ page.apply_ocr()
122
82
 
123
- # Visualize the structure
124
- headings.highlight(color="red", label="Headings")
125
- paragraphs.highlight(color="blue", label="Paragraphs")
83
+ create_correction_task_package(pdf, "correction_package.zip", overwrite=True)
84
+ ```
126
85
 
127
- # Create a simple document outline
128
- document_outline = []
129
- for heading in headings:
130
- heading_text = heading.extract_text()
131
- document_outline.append(heading_text)
86
+ 2. **Run the SPA:**
87
+ Navigate to the SPA directory within the installed `natural_pdf` library in your terminal and start a simple web server.
88
+
89
+ 3. **Use the SPA:**
90
+ Open `http://localhost:8000` in your browser. Drag the `correction_package.zip` file onto the page to load the document. You can then click on text elements to correct the OCR results.
132
91
 
133
- document_outline
134
- ```
135
92
 
136
93
  ## Working with Multiple Pages
137
94
 
95
+ Apply OCR or layout analysis to all pages using the `PDF` object.
96
+
138
97
  ```python
139
98
  # Process all pages in the document
140
- all_text = []
141
-
142
- for i, page in enumerate(pdf.pages):
143
- # Enable OCR for each page
144
- page.use_ocr = True
145
-
146
- # Extract text
147
- page_text = page.extract_text()
148
-
149
- # Add to our collection with page number
150
- all_text.append(f"Page {i+1}: {page_text[:100]}...")
151
-
152
- # Show the first few pages
153
- all_text
99
+
100
+ # Apply OCR to all pages (example using EasyOCR)
101
+ pdf.apply_ocr(engine='easyocr', languages=['en'])
102
+ print(f"Applied OCR to {len(pdf.pages)} pages.")
103
+
104
+ # Or apply layout analysis to all pages (example using Paddle)
105
+ # pdf.apply_layout(engine='paddle')
106
+ # print(f"Applied Layout Analysis to {len(pdf.pages)} pages.")
107
+
108
+ # Extract text from all pages (uses OCR results if available)
109
+ all_text_content = pdf.extract_text(page_separator="\n\n---\n\n")
110
+
111
+ print(f"\nCombined text from all pages:\n{all_text_content[:500]}...")
154
112
  ```
155
113
 
156
114
  ## Saving PDFs with Searchable Text
@@ -165,9 +123,13 @@ from natural_pdf import PDF
165
123
  input_pdf_path = "https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/needs-ocr.pdf"
166
124
 
167
125
  pdf = PDF(input_pdf_path)
168
- pdf.apply_ocr()
126
+ # Apply OCR to all pages before saving
127
+ # Use desired engine and options
128
+ pdf.apply_ocr(engine='easyocr', languages=['en'])
169
129
 
170
130
  pdf.save_searchable("needs-ocr-searchable.pdf")
131
+
132
+ print("Saved searchable PDF to needs-ocr-searchable.pdf")
171
133
  ```
172
134
 
173
135
  This creates `needs-ocr-searchable.pdf`, which looks identical to the original but now has a text layer corresponding to the OCR results. You can adjust the rendering resolution used during saving with the `dpi` parameter (default is 300).