PyPI - natural-pdf - Versions diffs - 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl - Mend

natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (134) hide show

natural_pdf/__init__.py +3 -0
natural_pdf/analyzers/layout/base.py +1 -5
natural_pdf/analyzers/layout/gemini.py +61 -51
natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
natural_pdf/analyzers/layout/layout_manager.py +26 -84
natural_pdf/analyzers/layout/layout_options.py +7 -0
natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
natural_pdf/analyzers/layout/surya.py +46 -123
natural_pdf/analyzers/layout/tatr.py +51 -4
natural_pdf/analyzers/text_structure.py +3 -5
natural_pdf/analyzers/utils.py +3 -3
natural_pdf/classification/manager.py +422 -0
natural_pdf/classification/mixin.py +163 -0
natural_pdf/classification/results.py +80 -0
natural_pdf/collections/mixins.py +111 -0
natural_pdf/collections/pdf_collection.py +434 -15
natural_pdf/core/element_manager.py +83 -0
natural_pdf/core/highlighting_service.py +13 -22
natural_pdf/core/page.py +578 -93
natural_pdf/core/pdf.py +912 -460
natural_pdf/elements/base.py +134 -40
natural_pdf/elements/collections.py +712 -109
natural_pdf/elements/region.py +722 -69
natural_pdf/elements/text.py +4 -1
natural_pdf/export/mixin.py +137 -0
natural_pdf/exporters/base.py +3 -3
natural_pdf/exporters/paddleocr.py +5 -4
natural_pdf/extraction/manager.py +135 -0
natural_pdf/extraction/mixin.py +279 -0
natural_pdf/extraction/result.py +23 -0
natural_pdf/ocr/__init__.py +5 -5
natural_pdf/ocr/engine_doctr.py +346 -0
natural_pdf/ocr/engine_easyocr.py +6 -3
natural_pdf/ocr/ocr_factory.py +24 -4
natural_pdf/ocr/ocr_manager.py +122 -26
natural_pdf/ocr/ocr_options.py +94 -11
natural_pdf/ocr/utils.py +19 -6
natural_pdf/qa/document_qa.py +0 -4
natural_pdf/search/__init__.py +20 -34
natural_pdf/search/haystack_search_service.py +309 -265
natural_pdf/search/haystack_utils.py +99 -75
natural_pdf/search/search_service_protocol.py +11 -12
natural_pdf/selectors/parser.py +431 -230
natural_pdf/utils/debug.py +3 -3
natural_pdf/utils/identifiers.py +1 -1
natural_pdf/utils/locks.py +8 -0
natural_pdf/utils/packaging.py +8 -6
natural_pdf/utils/text_extraction.py +60 -1
natural_pdf/utils/tqdm_utils.py +51 -0
natural_pdf/utils/visualization.py +18 -0
natural_pdf/widgets/viewer.py +4 -25
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
natural_pdf-0.1.9.dist-info/RECORD +80 -0
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
docs/api/index.md +0 -386
docs/assets/favicon.png +0 -3
docs/assets/favicon.svg +0 -3
docs/assets/javascripts/custom.js +0 -17
docs/assets/logo.svg +0 -3
docs/assets/sample-screen.png +0 -0
docs/assets/social-preview.png +0 -17
docs/assets/social-preview.svg +0 -17
docs/assets/stylesheets/custom.css +0 -65
docs/document-qa/index.ipynb +0 -435
docs/document-qa/index.md +0 -79
docs/element-selection/index.ipynb +0 -915
docs/element-selection/index.md +0 -229
docs/finetuning/index.md +0 -176
docs/index.md +0 -170
docs/installation/index.md +0 -69
docs/interactive-widget/index.ipynb +0 -962
docs/interactive-widget/index.md +0 -12
docs/layout-analysis/index.ipynb +0 -818
docs/layout-analysis/index.md +0 -185
docs/ocr/index.md +0 -209
docs/pdf-navigation/index.ipynb +0 -314
docs/pdf-navigation/index.md +0 -97
docs/regions/index.ipynb +0 -816
docs/regions/index.md +0 -294
docs/tables/index.ipynb +0 -658
docs/tables/index.md +0 -144
docs/text-analysis/index.ipynb +0 -370
docs/text-analysis/index.md +0 -105
docs/text-extraction/index.ipynb +0 -1478
docs/text-extraction/index.md +0 -292
docs/tutorials/01-loading-and-extraction.ipynb +0 -194
docs/tutorials/01-loading-and-extraction.md +0 -95
docs/tutorials/02-finding-elements.ipynb +0 -340
docs/tutorials/02-finding-elements.md +0 -149
docs/tutorials/03-extracting-blocks.ipynb +0 -147
docs/tutorials/03-extracting-blocks.md +0 -48
docs/tutorials/04-table-extraction.ipynb +0 -114
docs/tutorials/04-table-extraction.md +0 -50
docs/tutorials/05-excluding-content.ipynb +0 -270
docs/tutorials/05-excluding-content.md +0 -109
docs/tutorials/06-document-qa.ipynb +0 -332
docs/tutorials/06-document-qa.md +0 -91
docs/tutorials/07-layout-analysis.ipynb +0 -288
docs/tutorials/07-layout-analysis.md +0 -66
docs/tutorials/07-working-with-regions.ipynb +0 -413
docs/tutorials/07-working-with-regions.md +0 -151
docs/tutorials/08-spatial-navigation.ipynb +0 -508
docs/tutorials/08-spatial-navigation.md +0 -190
docs/tutorials/09-section-extraction.ipynb +0 -2434
docs/tutorials/09-section-extraction.md +0 -256
docs/tutorials/10-form-field-extraction.ipynb +0 -512
docs/tutorials/10-form-field-extraction.md +0 -201
docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
docs/tutorials/11-enhanced-table-processing.md +0 -9
docs/tutorials/12-ocr-integration.ipynb +0 -604
docs/tutorials/12-ocr-integration.md +0 -175
docs/tutorials/13-semantic-search.ipynb +0 -1328
docs/tutorials/13-semantic-search.md +0 -77
docs/visual-debugging/index.ipynb +0 -2970
docs/visual-debugging/index.md +0 -157
docs/visual-debugging/region.png +0 -0
natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
natural_pdf/templates/spa/css/style.css +0 -334
natural_pdf/templates/spa/index.html +0 -31
natural_pdf/templates/spa/js/app.js +0 -472
natural_pdf/templates/spa/words.txt +0 -235976
natural_pdf/widgets/frontend/viewer.js +0 -88
natural_pdf-0.1.7.dist-info/RECORD +0 -145
notebooks/Examples.ipynb +0 -1293
pdfs/.gitkeep +0 -0
pdfs/01-practice.pdf +0 -543
pdfs/0500000US42001.pdf +0 -0
pdfs/0500000US42007.pdf +0 -0
pdfs/2014 Statistics.pdf +0 -0
pdfs/2019 Statistics.pdf +0 -0
pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
pdfs/needs-ocr.pdf +0 -0
{natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0

docs/regions/index.md DELETED Viewed

@@ -1,294 +0,0 @@
-# Working with Regions
-Regions are rectangular areas on a page that define boundaries for operations like text extraction, element finding, or visualization. They're one of Natural PDF's most powerful features for working with specific parts of a document.
-## Setup
-Let's set up a PDF to experiment with regions.
-```python
-from natural_pdf import PDF
-# Load the PDF
-pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf")
-# Get the first page
-page = pdf.pages[0]
-# Display the page
-page.show(width=700)
-```
-## Creating Regions
-There are several ways to create regions in Natural PDF.
-### Using `create_region()` with Coordinates
-This is the most direct method - provide the coordinates directly.
-```python
-# Create a region by specifying (x0, top, x1, bottom) coordinates
-# Let's create a region in the middle of the page
-mid_region = page.create_region(
-    x0=100,         # Left edge
-    top=200,        # Top edge
-    x1=500,         # Right edge
-    bottom=400      # Bottom edge
-)
-# Highlight the region to see it
-mid_region.highlight(color="blue").show()
-```
-### Using Element Methods: `above()`, `below()`, `left()`, `right()`
-You can create regions relative to existing elements.
-```python
-# Find a heading-like element
-heading = page.find('text[size>=12]:bold')
-# Create a region below this heading element
-if heading:
-    region_below = heading.below()
-    # Highlight the heading and the region below it
-    heading.highlight(color="red")
-    region_below.highlight(color="blue")
-    page.show()
-```
-```python
-# Create a region with height limit
-if heading:
-    # Only include 100 points below the heading
-    small_region_below = heading.below(height=100)
-    page.clear_highlights()
-    heading.highlight(color="red")
-    small_region_below.highlight(color="green")
-    page.show()
-```
-```python
-# Find a line or other element to create a region above
-line = page.find('line')
-if line:
-    # Create a region above the line
-    region_above = line.above()
-    page.clear_highlights()
-    line.highlight(color="black")
-    region_above.highlight(color="purple")
-    page.show()
-```
-### Creating a Region Between Elements with the `until` Argument
-```python
-# Find two elements to use as boundaries
-first_heading = page.find('text[size>=11]:bold')
-next_heading = first_heading.next('text[size>=11]:bold') if first_heading else None
-if first_heading and next_heading:
-    # Create a region from the first heading until the next heading
-    section = first_heading.below(until=next_heading, include_endpoint=False)
-    # Highlight both elements and the region between them
-    page.clear_highlights()
-    first_heading.highlight(color="red")
-    next_heading.highlight(color="red")
-    section.highlight(color="yellow")
-    page.show()
-```
-## Using Regions
-Once you have a region, here's what you can do with it.
-### Extract Text from a Region
-```python
-# Find a region to work with (e.g., from a title to the next horizontal line)
-title = page.find('text:contains("Site")')  # Adjust if needed
-if title:
-    # Create a region from the title down to the next horizontal line
-    content_region = title.below(until='line:horizontal', include_endpoint=False)
-    # Extract text from just this region
-    region_text = content_region.extract_text()
-    # Show the region and the extracted text
-    page.clear_highlights()
-    content_region.highlight(color="green")
-    page.show()
-    # Displaying the text (first 300 chars if long)
-    print(region_text[:300] + "..." if len(region_text) > 300 else region_text)
-```
-### Find Elements Within a Region
-You can use a region as a "filter" to only find elements within its boundaries.
-```python
-# Create a region in an interesting part of the page
-test_region = page.create_region(
-    x0=page.width * 0.1,
-    top=page.height * 0.25,
-    x1=page.width * 0.9,
-    bottom=page.height * 0.75
-)
-# Find all text elements ONLY within this region
-text_in_region = test_region.find_all('text')
-# Display result
-page.clear_highlights()
-test_region.highlight(color="blue")
-text_in_region.highlight(color="red")
-page.show()
-len(text_in_region)  # Number of text elements found in region
-```
-### Generate an Image of a Region
-```python
-# Find a specific region to capture
-# (Could be a table, figure, or any significant area)
-region_for_image = page.create_region(
-    x0=100,
-    top=150,
-    x1=page.width - 100,
-    bottom=300
-)
-# Generate an image of just this region
-region_for_image.to_image(crop_only=True)  # Shows just the region
-```
-### Adjust and Expand Regions
-```python
-# Take an existing region and expand it
-region_a = page.create_region(200, 200, 400, 400)
-# Expand by a certain number of points in each direction
-expanded = region_a.expand(left=20, right=20, top=20, bottom=20)
-# Visualize original and expanded regions
-page.clear_highlights()
-region_a.highlight(color="blue", label="Original")
-expanded.highlight(color="red", label="Expanded")
-page.to_image()
-```
-## Using Exclusion Zones with Regions
-Exclusion zones are regions that you want to ignore during operations like text extraction.
-```python
-# Create a region for the whole page
-full_page_region = page.create_region(0, 0, page.width, page.height)
-# Extract text without exclusions as baseline
-full_text = full_page_region.extract_text()
-print(f"Full page text length: {len(full_text)} characters")
-```
-```python
-# Define an area we want to exclude (like a header)
-# Let's exclude the top 10% of the page
-header_zone = page.create_region(0, 0, page.width, page.height * 0.1)
-# Add this as an exclusion for the page
-page.add_exclusion(header_zone)
-# Visualize the exclusion
-page.clear_highlights()
-header_zone.highlight(color="red", label="Excluded")
-page.show()
-```
-```python
-# Now extract text again - the header should be excluded
-text_with_exclusion = full_page_region.extract_text() # Uses apply_exclusions=True by default
-# Compare text lengths
-print(f"Original text: {len(full_text)} chars\nText with exclusion: {len(text_with_exclusion)} chars")
-print(f"Difference: {len(full_text) - len(text_with_exclusion)} chars excluded")
-```
-```python
-# When done with this page, clear exclusions
-page.clear_exclusions()
-```
-## Document-Level Exclusions
-PDF-level exclusions apply to all pages and use functions to adapt to each page.
-```python
-# Define a PDF-level exclusion for headers
-# This will exclude the top 30% of every page
-pdf.add_exclusion(
-    lambda p: p.create_region(0, 0, p.width, p.height * 0.3),
-    label="Header zone"
-)
-# Define a PDF-level exclusion for footers
-# This will exclude the bottom 20% of every page
-pdf.add_exclusion(
-    lambda p: p.create_region(0, p.height * 0.8, p.width, p.height),
-    label="Footer zone"
-)
-# PDF-level exclusions are used whenever you extract text
-# Let's try on the first three pages
-for page in pdf.pages[:3]:
-    text = page.extract_text()
-    text_original = page.extract_text(use_exclusions=False)
-    print(f"Page {page.number} – Before: {len(text_original)} After: {len(text)}")
-```
-```python
-# Clear PDF-level exclusions when done
-pdf.clear_exclusions()
-print("Cleared all PDF-level exclusions")
-```
-## Working with Layout Analysis Regions
-When you run layout analysis, the detected regions (tables, titles, etc.) are also Region objects.
-```python
-# First, run layout analysis to detect regions
-page.analyze_layout()  # Uses 'yolo' engine by default
-# Find all detected regions
-detected_regions = page.find_all('region')
-print(f"Found {len(detected_regions)} layout regions")
-```
-```python
-# Highlight all detected regions by type
-detected_regions.highlight(group_by='region_type').show()
-```
-```python
-# Extract text from a specific region type (e.g., title)
-title_regions = page.find_all('region[type=title]')
-if title_regions:
-    titles_text = title_regions.extract_text()
-    print(f"Title text: {titles_text}")
-```
-## Next Steps
-Now that you understand regions, you can:
-- [Extract tables](../tables/index.ipynb) from table regions
-- [Ask questions](../document-qa/index.ipynb) about specific regions
-- [Exclude content](../text-extraction/index.md#filtering-out-headers-and-footers) from extraction

natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl