PyPI - natural-pdf - Versions diffs - 0.1.8__tar.gz → 0.1.10__tar.gz - Mend

natural-pdf 0.1.8tar.gz → 0.1.10tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (203) hide show

natural_pdf-0.1.10/MANIFEST.in ADDED Viewed

@@ -0,0 +1,48 @@
+include README.md
+include LICENSE
+# HTML templates
+recursive-include natural_pdf/templates *.html
+# Documentation assets
+recursive-include docs *.md *.png *.jpg *.gif
+# Remove common build garbage
+global-exclude __pycache__ *.py[cod] *.so .DS_Store
+global-exclude *hidden*
+# 💣 Critical: prevent recursion bugs
+prune build
+prune dist
+prune .nox
+prune .venv
+prune env
+prune venv
+# General junk
+exclude .notebook_cache.json
+exclude Untitled.ipynb
+exclude conversation.md
+exclude transcript.md
+exclude sample.py
+exclude sample2.py
+exclude requirements.lock
+exclude install.sh
+# Directories to exclude
+prune .venv
+prune output
+prune results
+prune natural_pdf_index
+prune hidden
+prune pdfs/hidden
+prune my_paddleocr_finetune_data
+prune notebooks
+prune docs/tutorials/pdfs
+# Individual files in nested directories
+exclude docs/tutorials/needs-ocr-searchable.pdf
+exclude notebooks/Examples.md
+# File patterns
+global-exclude *.hocr

{natural_pdf-0.1.8 → natural_pdf-0.1.10}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: natural-pdf
-Version: 0.1.8
+Version: 0.1.10
 Summary: A more intuitive interface for working with PDFs
 Author-email: Jonathan Soma <jonathan.soma@gmail.com>
 License-Expression: MIT
@@ -17,11 +17,13 @@ Requires-Dist: colour
 Requires-Dist: numpy
 Requires-Dist: urllib3
 Requires-Dist: tqdm
+Requires-Dist: pydantic
 Provides-Extra: interactive
 Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "interactive"
 Provides-Extra: haystack
 Requires-Dist: haystack-ai; extra == "haystack"
-Requires-Dist: chroma-haystack; extra == "haystack"
+Requires-Dist: lancedb-haystack; extra == "haystack"
+Requires-Dist: lancedb; extra == "haystack"
 Requires-Dist: sentence-transformers; extra == "haystack"
 Requires-Dist: natural-pdf[core-ml]; extra == "haystack"
 Provides-Extra: easyocr
@@ -36,6 +38,9 @@ Requires-Dist: natural-pdf[core-ml]; extra == "layout-yolo"
 Provides-Extra: surya
 Requires-Dist: surya-ocr; extra == "surya"
 Requires-Dist: natural-pdf[core-ml]; extra == "surya"
+Provides-Extra: doctr
+Requires-Dist: python-doctr[torch]; extra == "doctr"
+Requires-Dist: natural-pdf[core-ml]; extra == "doctr"
 Provides-Extra: qa
 Requires-Dist: natural-pdf[core-ml]; extra == "qa"
 Provides-Extra: docling
@@ -43,7 +48,6 @@ Requires-Dist: docling; extra == "docling"
 Requires-Dist: natural-pdf[core-ml]; extra == "docling"
 Provides-Extra: llm
 Requires-Dist: openai>=1.0; extra == "llm"
-Requires-Dist: pydantic; extra == "llm"
 Provides-Extra: classification
 Requires-Dist: sentence-transformers; extra == "classification"
 Requires-Dist: timm; extra == "classification"
@@ -63,6 +67,9 @@ Requires-Dist: pipdeptree; extra == "dev"
 Requires-Dist: nbformat; extra == "dev"
 Requires-Dist: jupytext; extra == "dev"
 Requires-Dist: nbclient; extra == "dev"
+Provides-Extra: deskew
+Requires-Dist: deskew>=1.5; extra == "deskew"
+Requires-Dist: img2pdf; extra == "deskew"
 Provides-Extra: all
 Requires-Dist: natural-pdf[interactive]; extra == "all"
 Requires-Dist: natural-pdf[haystack]; extra == "all"
@@ -70,11 +77,13 @@ Requires-Dist: natural-pdf[easyocr]; extra == "all"
 Requires-Dist: natural-pdf[paddle]; extra == "all"
 Requires-Dist: natural-pdf[layout_yolo]; extra == "all"
 Requires-Dist: natural-pdf[surya]; extra == "all"
+Requires-Dist: natural-pdf[doctr]; extra == "all"
 Requires-Dist: natural-pdf[qa]; extra == "all"
 Requires-Dist: natural-pdf[ocr-export]; extra == "all"
 Requires-Dist: natural-pdf[docling]; extra == "all"
 Requires-Dist: natural-pdf[llm]; extra == "all"
 Requires-Dist: natural-pdf[classification]; extra == "all"
+Requires-Dist: natural-pdf[deskew]; extra == "all"
 Requires-Dist: natural-pdf[test]; extra == "all"
 Provides-Extra: core-ml
 Requires-Dist: torch; extra == "core-ml"

natural_pdf-0.1.10/audit_packaging.py ADDED Viewed

@@ -0,0 +1,56 @@
+import subprocess
+import tarfile
+import zipfile
+from pathlib import Path
+# Directory searched for the built sdist (*.tar.gz) and wheel (*.whl) archives.
+DIST_DIR = Path("dist")
+def build_package():
+    subprocess.run(["python", "-m", "build", "--sdist", "--wheel"], check=True)
+def get_sdist_files():
+    sdist_path = next(DIST_DIR.glob("*.tar.gz"))
+    with tarfile.open(sdist_path, "r:gz") as tar:
+        return sorted(str(Path(m.name)) for m in tar.getmembers() if m.isfile())
+def get_wheel_files():
+    wheel_path = next(DIST_DIR.glob("*.whl"))
+    with zipfile.ZipFile(wheel_path, "r") as zipf:
+        return sorted(str(f) for f in zipf.namelist() if not f.endswith("/"))
+def get_gitignored_files():
+    proc = subprocess.run(
+        ["git", "ls-files", "--others", "-i", "--exclude-standard"],
+        check=True,
+        capture_output=True,
+        text=True,
+    )
+    return sorted(proc.stdout.strip().splitlines())
+def diff_lists(packaged, ignored):
+    return sorted(set(packaged) & set(ignored))
+def main():
+    build_package()
+    sdist_files = get_sdist_files()
+    wheel_files = get_wheel_files()
+    ignored_files = get_gitignored_files()
+    print("\n🚫 Files in *sdist* that are also .gitignored:")
+    for f in diff_lists(sdist_files, ignored_files):
+        print("  •", f)
+    print("\n🚫 Files in *wheel* that are also .gitignored:")
+    for f in diff_lists(wheel_files, ignored_files):
+        print("  •", f)
+if __name__ == "__main__":
+    main()

{natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/categorizing-documents/index.md RENAMED Viewed

@@ -22,32 +22,29 @@ from natural_pdf import PDF
 # Example: Classify a Page
 pdf = PDF("pdfs/01-practice.pdf")
 page = pdf.pages[0]
-categories = ["invoice", "letter", "report cover", "data table"]
-results = page.classify(categories=categories, model="text")
+labels = ["invoice", "letter", "report cover", "data table"]
+page.classify(labels, using="text")
 # Access the top result
 print(f"Top Category: {page.category}")
 print(f"Confidence: {page.category_confidence:.3f}")
-# Access all results
-# print(page.classification_results)
 ```
 **Key Arguments:**
-*   `categories` (required): A list of strings representing the potential categories you want to classify the item into.
-*   `model` (optional): Specifies which classification model or strategy to use. Defaults to `"text"`.
+*   `labels` (required): A list of strings representing the potential labels you want to classify the item into.
+*   `using` (optional): Specifies which classification model or strategy to use. Defaults to `"text"`.
     *   `"text"`: Uses a text-based model (default: `facebook/bart-large-mnli`) suitable for classifying based on language content.
     *   `"vision"`: Uses a vision-based model (default: `openai/clip-vit-base-patch32`) suitable for classifying based on visual layout and appearance.
     *   Specific Model ID: You can provide a Hugging Face model ID (e.g., `"google/siglip-base-patch16-224"`, `"MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"`) compatible with zero-shot text or image classification. The library attempts to infer whether it's text or vision, but you might need `using`.
-*   `using` (optional): Explicitly set to `"text"` or `"vision"` if the automatic inference based on the `model` ID fails or is ambiguous.
-*   `min_confidence` (optional): A float between 0.0 and 1.0. Only categories with a confidence score greater than or equal to this threshold will be included in the results (default: 0.0).
+*   `model` (optional): An explicit model ID (a Hugging Face repo name) to use.
+*   `min_confidence` (optional): A float between 0.0 and 1.0. Only labels with a confidence score greater than or equal to this threshold will be included in the results (default: 0.0).
 ## Text vs. Vision Classification
 Choosing the right model type depends on your goal:
-### Text Classification (`model="text"`)
+### Text Classification (`using="text"`)
 *   **How it works:** Extracts the text from the page or region and analyzes the language content.
 *   **Best for:**
@@ -57,12 +54,12 @@ Choosing the right model type depends on your goal:
 ```python
 # Find pages related to finance
-financial_categories = ["budget", "revenue", "expenditure", "forecast"]
-pdf.classify_pages(categories=financial_categories, model="text")
+financial_labels = ["budget", "revenue", "expenditure", "forecast"]
+pdf.classify_pages(financial_labels, using="text")
 budget_pages = [p for p in pdf.pages if p.category == "budget"]
 ```
-### Vision Classification (`model="vision"`)
+### Vision Classification (`using="vision"`)
 *   **How it works:** Renders the page or region as an image and analyzes its visual layout, structure, and appearance.
 *   **Best for:**
@@ -72,8 +69,8 @@ budget_pages = [p for p in pdf.pages if p.category == "budget"]
 ```python
 # Find pages that look like invoices or receipts
-visual_categories = ["invoice", "receipt", "letter", "form"]
-page.classify(categories=visual_categories, model="vision")
+visual_labels = ["invoice", "receipt", "letter", "form"]
+page.classify(visual_labels, using="vision")
 if page.category in ["invoice", "receipt"]:
     print(f"Page {page.number} looks like an invoice or receipt.")
 ```
@@ -88,7 +85,7 @@ Classifying a whole page is useful for sorting documents or identifying the over
 # Classify the first page
 page = pdf.pages[0]
 page_types = ["cover page", "table of contents", "chapter start", "appendix"]
-page.classify(categories=page_types, model="vision") # Vision often good for page structure
+page.classify(page_types, using="vision") # Vision often good for page structure
 print(f"Page 1 Type: {page.category}")
 ```
@@ -101,9 +98,9 @@ Classifying a specific region allows for more granular analysis within a page. Y
 paragraphs = page.find_all("region[type=paragraph]")
 if paragraphs:
     # Classify the topic of the first paragraph
-    topic_categories = ["introduction", "methodology", "results", "conclusion"]
+    topic_labels = ["introduction", "methodology", "results", "conclusion"]
     # Use text model for topic
-    paragraphs[0].classify(categories=topic_categories, model="text")
+    paragraphs[0].classify(topic_labels, using="text")
     print(f"First paragraph category: {paragraphs[0].category}")
 ```
@@ -113,10 +110,10 @@ After running `.classify()`, you can access the results:
 *   `page.category` or `region.category`: Returns the string label of the category with the highest confidence score from the *last* classification run. Returns `None` if no classification has been run or no category met the threshold.
 *   `page.category_confidence` or `region.category_confidence`: Returns the float confidence score (0.0-1.0) for the top category. Returns `None` otherwise.
-*   `page.classification_results` or `region.classification_results`: Returns the full result dictionary stored in the object's `.metadata['classification']`, containing the model used, engine type, categories provided, timestamp, and a list of all scores above the threshold sorted by confidence. Returns `None` if no classification has been run.
+*   `page.classification_results` or `region.classification_results`: Returns the full result dictionary stored in the object's `.metadata['classification']`, containing the model used, engine type, labels provided, timestamp, and a list of all scores above the threshold sorted by confidence. Returns `None` if no classification has been run.
 ```python
-results = page.classify(categories=["invoice", "letter"], model="text", min_confidence=0.5)
+results = page.classify(["invoice", "letter"], using="text", min_confidence=0.5)
 if page.category == "invoice":
     print(f"Found an invoice with confidence {page.category_confidence:.2f}")
@@ -135,10 +132,10 @@ Classifies pages across all PDFs in the collection. Use `max_workers` for parall
 ```python
 collection = natural_pdf.PDFCollection.from_directory("./documents/")
-categories = ["form", "datasheet", "image", "text document"]
+labels = ["form", "datasheet", "image", "text document"]
 # Classify all pages using vision model, processing 4 PDFs concurrently
-collection.classify_all(categories=categories, model="vision", max_workers=4)
+collection.classify_all(labels, using="vision", max_workers=4)
 # Filter PDFs containing forms
 form_pdfs = []
@@ -160,7 +157,7 @@ layout_regions = pdf.find_all("region")
 region_types = ["paragraph", "list", "table", "figure", "caption"]
 # Classify all detected regions based on vision
-layout_regions.classify_all(categories=region_types, model="vision")
+layout_regions.classify_all(region_types, using="vision")
 # Count table regions
 table_count = sum(1 for r in layout_regions if r.category == "table")

{natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/data-extraction/index.md RENAMED Viewed

@@ -1,42 +1,56 @@
 # Structured Data Extraction
-Extracting specific, structured information (like invoice numbers, dates, or addresses) from documents often requires more than simple text extraction. Natural PDF integrates with Large Language Models (LLMs) via Pydantic schemas to achieve this.
+Extracting specific, structured information (like invoice numbers, dates, or addresses) from documents often requires more than simple text extraction. Natural PDF integrates with LLMs to pull out [structured data](https://platform.openai.com/docs/guides/structured-outputs).
+You need to install more than just the tiny baby default `natural_pdf` for this:
+```
+# Install just the LLM portions
+pip install "natural_pdf[llm]"
+# Install eeeeeverything
+pip install "natural_pdf[all]"
+```
 ## Introduction
 This feature allows you to define the exact data structure you want using a Pydantic model and then instruct an LLM to populate that structure based on the content of a PDF element (like a `Page` or `Region`).
+> Not sure how to write a Pydantic schema? Just ask an LLM! "Write me a Pydantic schema to pull out an invoice number (an integer), a company name (string) and a date (string)." It'll go fine.
 ## Basic Extraction
 1.  **Define a Schema:** Create a Pydantic model for your desired data.
-2.  **Extract:** Use the `.extract()` method on a `PDF`, `Page`, or `Region` object.
-3.  **Access:** Use the `.extracted()` method to retrieve the results.
+2.  **Extract:** Use `.extract()` on a `PDF`, `Page`, or `Region` object.
+3.  **Access:** Use `.extracted()` to retrieve the results.
 ```python
+from typing import Optional
 from natural_pdf import PDF
 from pydantic import BaseModel, Field
-from openai import OpenAI # Example client
+from openai import OpenAI
-# Example: Initialize your LLM client
-client = OpenAI()
+# Initialize your LLM client
+# Anything OpenAI-compatible works!
+client = OpenAI(
+    api_key="ANTHROPIC_API_KEY",  # Your Anthropic API key
+    base_url="https://api.anthropic.com/v1/"  # Anthropic's API endpoint
+)
 # Load the PDF
 pdf = PDF("path/to/your/document.pdf")
 page = pdf.pages[0]
-# 1. Define your schema
+# Define your schema
 class InvoiceInfo(BaseModel):
     invoice_number: str = Field(description="The main invoice identifier")
     total_amount: float = Field(description="The final amount due")
     company_name: Optional[str] = Field(None, description="The name of the issuing company")
-# 2. Extract data (using default analysis_key="default-structured")
+# Extract data
 page.extract(schema=InvoiceInfo, client=client)
-# 3. Access the results
 # Access the full result object
 full_data = page.extracted()
-print(full_data)
+print(full_data)
 # Access a single field
 inv_num = page.extracted('invoice_number')
@@ -51,16 +65,23 @@ print(f"Invoice Number: {inv_num}")
 ```python
 # Extract using a specific key
-page.extract(InvoiceInfo, client, analysis_key="invoice_header")
+page.extract(InvoiceInfo, client=client, analysis_key="invoice_header")
 # Access using the specific key
 header_data = page.extracted(analysis_key="invoice_header")
 company = page.extracted('company_name', analysis_key="invoice_header")
 ```
-## Applying to Regions and Collections
+## Text vs vision
+When sending a page (or a region, etc.) to an LLM, you can choose either `using='text'` (default) or `using='vision'`.
+- `text` sends the text, somewhat respecting layout using `.extract_text(layout=True)`
+- `vision` sends an image of the page with `.to_image(resolution=72)` (no highlights or labels)
+## Batch and bulk extraction
-The `.extract()` and `.extracted()` methods work identically on `Region` objects, allowing you to target specific areas of a page for structured data extraction.
+If you have a lot of pages, PDFs, or anything else, the `.extract()` and `.extracted()` methods work identically on most parts of a PDF (regions, pages, collections of PDFs, etc.), giving you a lot of flexibility in what you analyze.
 ```python
 # Assuming 'header_region' is a Region object you defined
@@ -73,15 +94,16 @@ Furthermore, you can apply extraction to collections of elements (like `pdf.page
 ```python
 # Example: Extract InvoiceInfo from the first 5 pages
 results = pdf.pages[:5].apply(
-    'extract',
-    schema=InvoiceInfo,
-    client=client,
-    analysis_key="page_invoice_info", # Use a specific key for batch results
-    overwrite=True # Allow overwriting if run multiple times
+    lambda page: page.extract(
+        schema=InvoiceInfo,
+        client=client,
+        analysis_key="page_invoice_info",
+    )
 )
 # Access results for the first page in the collection
-first_page_company = results[0].extracted('company_name', analysis_key="page_invoice_info")
+pdf.pages[0].extracted('company_name', analysis_key="page_invoice_info")
 ```
 This provides a powerful way to turn unstructured PDF content into structured, usable data.

{natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/index.md RENAMED Viewed

@@ -140,14 +140,14 @@ Categorize pages or specific regions based on their content using text or vision
 ```python
 # Classify a page based on text
-categories = ["invoice", "scientific article", "presentation"]
-page.classify(categories=categories, model="text")
+labels = ["invoice", "scientific article", "presentation"]
+page.classify(labels, using="text")
 print(f"Page Category: {page.category} (Confidence: {page.category_confidence:.2f})")
 # Classify a page based on what it looks like
-categories = ["invoice", "scientific article", "presentation"]
-page.classify(categories=categories, model="vision")
+labels = ["invoice", "scientific article", "presentation"]
+page.classify(labels, using="vision")
 print(f"Page Category: {page.category} (Confidence: {page.category_confidence:.2f})")
 ```

natural-pdf 0.1.8__tar.gz → 0.1.10__tar.gz

natural-pdf 0.1.8tar.gz → 0.1.10tar.gz