natural-pdf 0.2.18__py3-none-any.whl → 0.2.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +8 -0
- natural_pdf/analyzers/checkbox/__init__.py +6 -0
- natural_pdf/analyzers/checkbox/base.py +265 -0
- natural_pdf/analyzers/checkbox/checkbox_analyzer.py +329 -0
- natural_pdf/analyzers/checkbox/checkbox_manager.py +166 -0
- natural_pdf/analyzers/checkbox/checkbox_options.py +60 -0
- natural_pdf/analyzers/checkbox/mixin.py +95 -0
- natural_pdf/analyzers/checkbox/rtdetr.py +201 -0
- natural_pdf/analyzers/guides.py +26 -2
- natural_pdf/collections/mixins.py +14 -5
- natural_pdf/core/element_manager.py +5 -1
- natural_pdf/core/page.py +61 -0
- natural_pdf/core/page_collection.py +41 -1
- natural_pdf/core/pdf.py +24 -1
- natural_pdf/describe/base.py +20 -0
- natural_pdf/elements/base.py +152 -10
- natural_pdf/elements/element_collection.py +41 -2
- natural_pdf/elements/region.py +115 -2
- natural_pdf/judge.py +1509 -0
- natural_pdf/selectors/parser.py +42 -1
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.20.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.20.dist-info}/RECORD +42 -18
- temp/check_model.py +49 -0
- temp/check_pdf_content.py +9 -0
- temp/checkbox_checks.py +590 -0
- temp/checkbox_simple.py +117 -0
- temp/checkbox_ux_ideas.py +400 -0
- temp/context_manager_prototype.py +177 -0
- temp/convert_to_hf.py +60 -0
- temp/demo_text_closest.py +66 -0
- temp/inspect_model.py +43 -0
- temp/rtdetr_dinov2_test.py +49 -0
- temp/test_closest_debug.py +26 -0
- temp/test_closest_debug2.py +22 -0
- temp/test_context_exploration.py +85 -0
- temp/test_durham.py +30 -0
- temp/test_empty_string.py +16 -0
- temp/test_similarity.py +15 -0
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.20.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.20.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.20.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.20.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,66 @@
|
|
1
|
+
"""
|
2
|
+
Demo of the text:closest() selector for fuzzy text matching in Natural PDF.
|
3
|
+
|
4
|
+
This selector is designed to handle OCR errors and text variations by:
|
5
|
+
1. First finding exact substring matches
|
6
|
+
2. Then ranking other elements by similarity score
|
7
|
+
"""
|
8
|
+
|
9
|
+
from natural_pdf import PDF
|
10
|
+
|
11
|
+
# Open the practice document and work with its first page.
pdf = PDF("pdfs/01-practice.pdf")
page = pdf.pages[0]

print("=== text:closest() Selector Demo ===\n")

# Example 1: selector argument without an "@threshold" suffix.
print("1. Find elements closest to 'Durham' (no threshold):")
results = page.find_all('text:closest("Durham")')
print(f" Found {len(results)} elements (all text elements)")
first_three = [r.text.strip() for r in results[:3]]
print(f" First 3: {first_three}\n")

# Example 2: the "@0.4" suffix is passed as part of the selector argument.
print("2. Find elements with at least 40% similarity to 'Durham':")
results = page.find_all('text:closest("Durham@0.4")')
print(f" Found {len(results)} elements")
for hit in results:
    stripped = hit.text.strip()
    print(f" - '{stripped}'")
print()

# Example 3: search with a deliberately misspelled query and report only the
# hits whose text contains the correctly spelled word.
print("3. Simulate OCR errors:")
print(" Searching for 'Durharn' (OCR error: 'rn' instead of 'm'):")
results = page.find_all('text:closest("Durharn@0.4")')
durham_hits = [r.text.strip() for r in results if 'Durham' in r.text]
print(f" Found: {durham_hits}\n")

# Example 4: lowercase query, results filtered on the capitalised spelling.
print("4. Case insensitive search for 'chicago':")
results = page.find_all('text:closest("chicago@0.6")')
chicago_hits = [r.text.strip() for r in results if 'Chicago' in r.text]
print(f" Found: {chicago_hits}\n")

# Example 5: :closest() followed by an attribute filter in the same selector.
print("5. Find fuzzy matches with size constraints:")
results = page.find_all('text:closest("Violation@0.6")[size>10]')
print(f" Found {len(results)} elements with size > 10")
if results:
    top = results[0]
    print(f" Example: '{top.text.strip()}' (size={top.size})\n")

# Example 6: locate a label, then read whatever lies to its right.
print("6. Practical OCR use case - finding form labels:")
print(" Looking for 'Date:' even if OCR missed the colon:")
results = page.find_all('text:closest("Date@0.8")')
date_labels = [r for r in results if "Date" in r.text]
if date_labels:
    label = date_labels[0]
    print(f" Found: '{label.text.strip()}'")
    # Region to the right of the label, bounded by the next text element.
    value = label.right(until='text')
    print(f" Value: '{value.extract_text().strip()}'")

# Closing summary, printed one bullet per line.
print("\n=== Key Features ===")
for feature_line in (
    "- Default threshold is 0.0 (matches all elements, sorted by similarity)",
    "- Exact substring matches always come first",
    "- Case insensitive by default (use case=True for case sensitive)",
    "- Threshold specified with @ separator: 'search@0.8'",
    "- Uses Python's difflib.SequenceMatcher for similarity calculation",
    "- Empty search string returns no results",
):
    print(feature_line)
|
temp/inspect_model.py
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
import torch
|
2
|
+
|
3
|
+
# Checkpoint inspected by this script (absolute path on the author's machine).
model_path = "/Users/soma/Development/natural-pdf/model-weights/checkbox-nano.pt"

# NOTE(review): weights_only=False lets torch.load unpickle arbitrary Python
# objects, so this should only ever be pointed at a trusted checkpoint file.
checkpoint = torch.load(model_path, map_location='cpu', weights_only=False)

print("Checkpoint keys:", checkpoint.keys())
print("\nArgs:", checkpoint['args'])

# The 'model' entry is either a full module object or a plain state dict.
model = checkpoint['model']
print(f"\nModel type: {type(model)}")

if isinstance(model, dict):
    # A dict here means only the weights were saved; show the first ten names
    # as a hint at the architecture.
    print(f"Model has {len(model)} weight tensors")
    for key in list(model.keys())[:10]:
        print(f" {key}")

# Report whichever of the well-known training arguments are present.
args = checkpoint['args']
print("\nModel configuration from args:")
for attr in ('model', 'task', 'mode', 'imgsz', 'batch', 'device'):
    if not hasattr(args, attr):
        continue
    print(f" {attr}: {getattr(args, attr)}")

# Guess the RT-DETR backbone from the digits embedded in the model name.
if hasattr(args, 'model'):
    model_name = args.model
    print(f"\nModel name: {model_name}")

    name_text = str(model_name)
    if 'rtdetr' in name_text.lower():
        # Depth markers are tried in this order and the first hit wins.
        # Testing for "18" already covers "r18" (and likewise for the others).
        for depth in ('18', '34', '50', '101'):
            if depth in name_text:
                print(f"This appears to be RT-DETR with ResNet-{depth} backbone")
                break
|
@@ -0,0 +1,49 @@
|
|
1
|
+
"""Test using RT-DETR with DINOv2 backbone in transformers"""
|
2
|
+
from transformers import RTDetrConfig, RTDetrForObjectDetection
|
3
|
+
import torch
|
4
|
+
|
5
|
+
# Keyword arguments for the RT-DETR configuration, grouped by purpose and
# passed to RTDetrConfig in the same order as listed here.
# NOTE(review): hidden_dim, dim_feedforward, nheads and the *_loss_coefficient
# names look like DETR-style parameter names — confirm RTDetrConfig actually
# consumes them rather than just storing them as extra attributes.
backbone_settings = {
    "backbone": "facebook/dinov2-small",  # DINOv2 small variant
    "use_pretrained_backbone": False,  # weights come from our own checkpoint
    "backbone_kwargs": {
        "out_indices": [3, 6, 9, 12],  # match the indices from the checkpoint
    },
}
detection_head_settings = {
    "num_labels": 3,  # checked, unchecked, (background?)
    "hidden_dim": 256,
    "num_queries": 300,
    "decoder_layers": 2,
    "d_model": 256,
    "dim_feedforward": 2048,
    "dropout": 0.0,
    "nheads": 8,  # sa_nheads from the checkpoint
}
loss_settings = {
    "bbox_loss_coefficient": 5.0,
    "giou_loss_coefficient": 2.0,
    "cls_loss_coefficient": 1.0,
}
config = RTDetrConfig(
    **backbone_settings,
    **detection_head_settings,
    **loss_settings,
)

# Build a freshly initialised model from the config and report its size.
model = RTDetrForObjectDetection(config)
parameter_count = sum(p.numel() for p in model.parameters())
print(f"Model initialized with {parameter_count} parameters")

# Load the trained checkpoint so its key names can be compared with the
# names the transformers model uses.
# NOTE(review): weights_only=False unpickles arbitrary objects; only load
# checkpoints from a trusted source.
checkpoint = torch.load("model-weights/checkbox-nano.pt", map_location='cpu', weights_only=False)
state_dict = checkpoint['model']

print("\nYour model keys (first 5):")
checkpoint_keys = list(state_dict.keys())[:5]
for key in checkpoint_keys:
    print(f" {key}")

print("\nRT-DETR expects keys like:")
expected_keys = list(model.state_dict().keys())[:5]
for key in expected_keys:
    print(f" {key}")

print("\nWould need to create a mapping between these formats")
|
@@ -0,0 +1,26 @@
|
|
1
|
+
from natural_pdf import PDF
|
2
|
+
from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
|
3
|
+
|
4
|
+
# Parse the selector on its own first so the parsed structure can be inspected.
selector_str = 'text:closest("Durham@0.7")'
parsed = parse_selector(selector_str)
print("Parsed selector:", parsed)

# Open the practice document and take its first page.
pdf = PDF('pdfs/01-practice.pdf')
page = pdf.pages[0]

# Baseline: how many text elements the page has in total.
all_text = page.find_all('text')
print(f"\nTotal text elements: {len(all_text)}")

# Same selector string as parsed above, this time run through find_all.
results = page.find_all(selector_str)
print(f"Results with :closest selector: {len(results)}")

# Apply the filter built from the parsed selector to the first five elements
# and print what it returns for each one.
filter_func = selector_to_filter_func(parsed)
print("\nTesting filter function manually:")
sample = all_text[:5]
for i, el in enumerate(sample):
    outcome = filter_func(el)
    print(f" Element {i}: '{el.text}' -> {outcome}")
|
@@ -0,0 +1,22 @@
|
|
1
|
+
from natural_pdf import PDF
|
2
|
+
from natural_pdf.selectors.parser import parse_selector, _build_filter_list
|
3
|
+
import difflib
|
4
|
+
|
5
|
+
# Parse the selector and show both the whole structure and the raw argument
# of its first pseudo-class.
selector_str = 'text:closest("Durham@0.7")'
parsed = parse_selector(selector_str)
print("Parsed selector:", parsed)
first_pseudo_class = parsed['pseudo_classes'][0]
print("Args:", repr(first_pseudo_class['args']))

# Build the filter list from the parsed selector and dump it.
filters = _build_filter_list(parsed)
print("\nFilters built:", filters)

# Compute lowercase SequenceMatcher ratios by hand for a few sample strings.
search_text = "Durham"
test_texts = ["Durham's Meatpacking ", "Chicago, Ill.", "Violations"]

print(f"\nTesting similarity for '{search_text}':")
for text in test_texts:
    matcher = difflib.SequenceMatcher(None, search_text.lower(), text.lower())
    ratio = matcher.ratio()
    print(f" '{text}' -> ratio: {ratio:.3f}")
|
@@ -0,0 +1,85 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
Test to explore how context managers might work with Natural PDF
|
4
|
+
"""
|
5
|
+
|
6
|
+
import natural_pdf as npdf
|
7
|
+
from contextlib import contextmanager
|
8
|
+
import threading
|
9
|
+
|
10
|
+
# Test 1: print two values read from the global layout options.
print("=== Current Global Options ===")
layout_options = npdf.options.layout
print(f"Default directional_offset: {layout_options.directional_offset}")
print(f"Default auto_multipage: {layout_options.auto_multipage}")

# Test 2: section heading only; nothing is exercised here yet.
print("\n=== How Options Are Used ===")
|
17
|
+
# Let's trace through the code to see how offset is used
|
18
|
+
|
19
|
+
# Test 3: Prototype a context manager approach
|
20
|
+
def _resolve_option(dotted_key):
    """Return ``(owner, attribute_name)`` for a dotted path under ``npdf.options``.

    ``"layout.directional_offset"`` resolves to the ``layout`` object and the
    name ``"directional_offset"``; a key without dots resolves to
    ``npdf.options`` itself.
    """
    *section_names, attribute_name = dotted_key.split('.')
    owner = npdf.options
    for section_name in section_names:
        owner = getattr(owner, section_name)
    return owner, attribute_name


@contextmanager
def temporary_options(**kwargs):
    """Temporarily override attributes reachable from ``npdf.options``.

    Args:
        **kwargs: Maps dotted option paths (for example
            ``"layout.directional_offset"``) to the value to set for the
            duration of the ``with`` block. Dotted names are not valid
            identifiers, so pass them as ``**{"layout.x": value}``.

    Yields:
        None. The overrides are in effect while the ``with`` body runs.

    Raises:
        AttributeError: Propagated from ``getattr`` when a path does not
            exist. Overrides applied before the failure are rolled back.

    The overrides are applied inside the ``try`` so that a failure on a later
    key cannot leave earlier keys permanently changed. Values are restored in
    reverse order onto the exact objects that were modified, so an override
    that replaces a whole section cannot redirect the restoration of an
    option inside that section.
    """
    # (owner, attribute_name, previous_value) for every override that has
    # actually been applied so far.
    applied = []
    try:
        for dotted_key, new_value in kwargs.items():
            owner, attribute_name = _resolve_option(dotted_key)
            previous_value = getattr(owner, attribute_name)
            setattr(owner, attribute_name, new_value)
            applied.append((owner, attribute_name, previous_value))
        yield
    finally:
        # Undo in reverse so nested overrides unwind like a stack.
        for owner, attribute_name, previous_value in reversed(applied):
            setattr(owner, attribute_name, previous_value)
|
52
|
+
|
53
|
+
# Exercise the context manager: print the offset before, inside and after.
print("\n=== Testing Context Manager ===")
print(f"Before: offset={npdf.options.layout.directional_offset}")

# The dotted key is not a valid identifier, hence the dict unpacking.
override = {'layout.directional_offset': 5.0}
with temporary_options(**override):
    print(f"Inside context: offset={npdf.options.layout.directional_offset}")

print(f"After: offset={npdf.options.layout.directional_offset}")

# Test 4: heading for the threading experiment, plus the shared list the
# worker threads append their observations to.
print("\n=== Thread Safety Test ===")
results = []
|
65
|
+
|
66
|
+
def thread_func(thread_id, offset_value):
    """Record the offset one worker sees while its own override is active.

    Sets ``layout.directional_offset`` to ``offset_value`` through
    ``temporary_options``, sleeps for 0.1 seconds, then appends
    ``(thread_id, npdf.options.layout.directional_offset)`` to the
    module-level ``results`` list.
    """
    import time

    override = {'layout.directional_offset': offset_value}
    with temporary_options(**override):
        time.sleep(0.1)  # Simulate some work
        # Read the global option again rather than reusing offset_value, so
        # the recorded number is whatever the shared object holds right now.
        observed = npdf.options.layout.directional_offset
        results.append((thread_id, observed))
|
71
|
+
|
72
|
+
# Three workers, asking for offsets 0.0, 10.0 and 20.0 respectively.
threads = [
    threading.Thread(target=thread_func, args=(i, float(i * 10)))
    for i in range(3)
]
for worker in threads:
    worker.start()

# Wait for every worker before reading the shared results list.
for worker in threads:
    worker.join()

print("Thread results (thread_id, offset):")
for thread_id, offset in sorted(results):
    print(f" Thread {thread_id}: offset={offset}")

print("\nConclusion: Global options are NOT thread-safe!")
|
temp/test_durham.py
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
from natural_pdf import PDF
|
2
|
+
import difflib
|
3
|
+
|
4
|
+
# Open the practice document and take its first page.
pdf = PDF('pdfs/01-practice.pdf')
page = pdf.pages[0]

# Collect every text element on the page.
all_text = page.find_all('text')
print(f"Total text elements: {len(all_text)}")

# Plain substring scan for the correctly spelled word.
print("\nLooking for Durham-related text:")
for el in all_text:
    if "Durham" not in el.text:
        continue
    print(f" Found: '{el.text}'")

# Manual similarity check with a misspelled query ("rn" in place of "m"),
# limited to the first 20 elements and to ratios above 0.3.
search_text = "Durharn"
print(f"\nTesting similarity for '{search_text}':")
for el in all_text[:20]:
    if not el.text:
        continue
    matcher = difflib.SequenceMatcher(None, search_text.lower(), el.text.lower())
    ratio = matcher.ratio()
    if ratio > 0.3:
        print(f" '{el.text}' -> ratio: {ratio:.3f}")

# Run the same misspelled query through the selector and show up to five hits.
results = page.find_all('text:closest("Durharn@0.4")')
print(f"\nResults with threshold 0.4: {len(results)}")
for hit in results[:5]:
    print(f" - {hit.text}")
|
@@ -0,0 +1,16 @@
|
|
1
|
+
from natural_pdf import PDF
|
2
|
+
from natural_pdf.selectors.parser import parse_selector
|
3
|
+
|
4
|
+
# Parse a :closest() selector whose argument is the empty string and show
# both the whole structure and the raw argument of its first pseudo-class.
selector_str = 'text:closest("")'
parsed = parse_selector(selector_str)
print("Parsed selector:", parsed)
first_pseudo_class = parsed['pseudo_classes'][0]
print("Args:", repr(first_pseudo_class['args']))

# Open the practice document and take its first page.
pdf = PDF('pdfs/01-practice.pdf')
page = pdf.pages[0]

# Run the same empty-argument selector against the page and count the hits.
results = page.find_all(selector_str)
print(f"Results with empty string: {len(results)}")
|
temp/test_similarity.py
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
import difflib
|
2
|
+
|
3
|
+
# (query, candidate) pairs to score; the lowercase rows repeat the row above
# them with both strings lowercased.
pairs = [
    ("Durham", "Durham's Meatpacking "),
    ("durham", "durham's meatpacking "),  # lowercase
    ("Chicgo", "Chicago, Ill."),
    ("chicgo", "chicago, ill."),  # lowercase
    ("Chicago", "Chicago, Ill."),
]

# Score each pair exactly as written (no case folding here) and print the
# ratio to three decimals.
print("Similarity ratios:")
for search, text in pairs:
    matcher = difflib.SequenceMatcher(None, search, text)
    ratio = matcher.ratio()
    print(f" '{search}' vs '{text}' -> {ratio:.3f}")
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|