natural-pdf 0.2.18__py3-none-any.whl → 0.2.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. natural_pdf/__init__.py +8 -0
  2. natural_pdf/analyzers/checkbox/__init__.py +6 -0
  3. natural_pdf/analyzers/checkbox/base.py +265 -0
  4. natural_pdf/analyzers/checkbox/checkbox_analyzer.py +329 -0
  5. natural_pdf/analyzers/checkbox/checkbox_manager.py +166 -0
  6. natural_pdf/analyzers/checkbox/checkbox_options.py +60 -0
  7. natural_pdf/analyzers/checkbox/mixin.py +95 -0
  8. natural_pdf/analyzers/checkbox/rtdetr.py +201 -0
  9. natural_pdf/analyzers/guides.py +26 -2
  10. natural_pdf/collections/mixins.py +14 -5
  11. natural_pdf/core/element_manager.py +5 -1
  12. natural_pdf/core/page.py +61 -0
  13. natural_pdf/core/page_collection.py +41 -1
  14. natural_pdf/core/pdf.py +24 -1
  15. natural_pdf/describe/base.py +20 -0
  16. natural_pdf/elements/base.py +152 -10
  17. natural_pdf/elements/element_collection.py +41 -2
  18. natural_pdf/elements/region.py +115 -2
  19. natural_pdf/judge.py +1509 -0
  20. natural_pdf/selectors/parser.py +42 -1
  21. {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.20.dist-info}/METADATA +1 -1
  22. {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.20.dist-info}/RECORD +42 -18
  23. temp/check_model.py +49 -0
  24. temp/check_pdf_content.py +9 -0
  25. temp/checkbox_checks.py +590 -0
  26. temp/checkbox_simple.py +117 -0
  27. temp/checkbox_ux_ideas.py +400 -0
  28. temp/context_manager_prototype.py +177 -0
  29. temp/convert_to_hf.py +60 -0
  30. temp/demo_text_closest.py +66 -0
  31. temp/inspect_model.py +43 -0
  32. temp/rtdetr_dinov2_test.py +49 -0
  33. temp/test_closest_debug.py +26 -0
  34. temp/test_closest_debug2.py +22 -0
  35. temp/test_context_exploration.py +85 -0
  36. temp/test_durham.py +30 -0
  37. temp/test_empty_string.py +16 -0
  38. temp/test_similarity.py +15 -0
  39. {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.20.dist-info}/WHEEL +0 -0
  40. {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.20.dist-info}/entry_points.txt +0 -0
  41. {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.20.dist-info}/licenses/LICENSE +0 -0
  42. {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.20.dist-info}/top_level.txt +0 -0
temp/demo_text_closest.py ADDED
@@ -0,0 +1,66 @@
1
+ """
2
+ Demo of the text:closest() selector for fuzzy text matching in Natural PDF.
3
+
4
+ This selector is designed to handle OCR errors and text variations by:
5
+ 1. First finding exact substring matches
6
+ 2. Then ranking other elements by similarity score
7
+ """
8
+
9
+ from natural_pdf import PDF
10
+
11
+ # Load a PDF
12
+ pdf = PDF("pdfs/01-practice.pdf")
13
+ page = pdf.pages[0]
14
+
15
+ print("=== text:closest() Selector Demo ===\n")
16
+
17
+ # Example 1: Basic fuzzy matching (default threshold 0.0 - all elements)
18
+ print("1. Find elements closest to 'Durham' (no threshold):")
19
+ results = page.find_all('text:closest("Durham")')
20
+ print(f" Found {len(results)} elements (all text elements)")
21
+ print(f" First 3: {[r.text.strip() for r in results[:3]]}\n")
22
+
23
+ # Example 2: With similarity threshold
24
+ print("2. Find elements with at least 40% similarity to 'Durham':")
25
+ results = page.find_all('text:closest("Durham@0.4")')
26
+ print(f" Found {len(results)} elements")
27
+ for r in results:
28
+ print(f" - '{r.text.strip()}'")
29
+ print()
30
+
31
+ # Example 3: OCR error simulation
32
+ print("3. Simulate OCR errors:")
33
+ print(" Searching for 'Durharn' (OCR error: 'rn' instead of 'm'):")
34
+ results = page.find_all('text:closest("Durharn@0.4")')
35
+ print(f" Found: {[r.text.strip() for r in results if 'Durham' in r.text]}\n")
36
+
37
+ # Example 4: Case insensitive matching (default)
38
+ print("4. Case insensitive search for 'chicago':")
39
+ results = page.find_all('text:closest("chicago@0.6")')
40
+ print(f" Found: {[r.text.strip() for r in results if 'Chicago' in r.text]}\n")
41
+
42
+ # Example 5: Combining with other selectors
43
+ print("5. Find fuzzy matches with size constraints:")
44
+ results = page.find_all('text:closest("Violation@0.6")[size>10]')
45
+ print(f" Found {len(results)} elements with size > 10")
46
+ if results:
47
+ print(f" Example: '{results[0].text.strip()}' (size={results[0].size})\n")
48
+
49
+ # Example 6: Practical use case - finding labels
50
+ print("6. Practical OCR use case - finding form labels:")
51
+ print(" Looking for 'Date:' even if OCR missed the colon:")
52
+ results = page.find_all('text:closest("Date@0.8")')
53
+ date_labels = [r for r in results if "Date" in r.text]
54
+ if date_labels:
55
+ print(f" Found: '{date_labels[0].text.strip()}'")
56
+ # Now find the value to the right
57
+ value = date_labels[0].right(until='text')
58
+ print(f" Value: '{value.extract_text().strip()}'")
59
+
60
+ print("\n=== Key Features ===")
61
+ print("- Default threshold is 0.0 (matches all elements, sorted by similarity)")
62
+ print("- Exact substring matches always come first")
63
+ print("- Case insensitive by default (use case=True for case sensitive)")
64
+ print("- Threshold specified with @ separator: 'search@0.8'")
65
+ print("- Uses Python's difflib.SequenceMatcher for similarity calculation")
66
+ print("- Empty search string returns no results")
temp/inspect_model.py ADDED
@@ -0,0 +1,43 @@
1
+ import torch
2
+
3
+ model_path = "/Users/soma/Development/natural-pdf/model-weights/checkbox-nano.pt"
4
+
5
+ # Load checkpoint
6
+ checkpoint = torch.load(model_path, map_location='cpu', weights_only=False)
7
+
8
+ print("Checkpoint keys:", checkpoint.keys())
9
+ print("\nArgs:", checkpoint['args'])
10
+
11
+ # Look at model structure
12
+ model = checkpoint['model']
13
+ print(f"\nModel type: {type(model)}")
14
+
15
+ # If it's an OrderedDict, it's just the state dict
16
+ if isinstance(model, dict):
17
+ print(f"Model has {len(model)} weight tensors")
18
+ # Look at first few keys to understand architecture
19
+ for i, key in enumerate(list(model.keys())[:10]):
20
+ print(f" {key}")
21
+
22
+ # Check the args to understand what model this is
23
+ args = checkpoint['args']
24
+ print(f"\nModel configuration from args:")
25
+ for attr in ['model', 'task', 'mode', 'imgsz', 'batch', 'device']:
26
+ if hasattr(args, attr):
27
+ print(f" {attr}: {getattr(args, attr)}")
28
+
29
+ # Try to determine RT-DETR variant
30
+ if hasattr(args, 'model'):
31
+ model_name = getattr(args, 'model')
32
+ print(f"\nModel name: {model_name}")
33
+
34
+ # RT-DETR variants mapping
35
+ if 'rtdetr' in str(model_name).lower():
36
+ if '18' in str(model_name) or 'r18' in str(model_name):
37
+ print("This appears to be RT-DETR with ResNet-18 backbone")
38
+ elif '34' in str(model_name) or 'r34' in str(model_name):
39
+ print("This appears to be RT-DETR with ResNet-34 backbone")
40
+ elif '50' in str(model_name) or 'r50' in str(model_name):
41
+ print("This appears to be RT-DETR with ResNet-50 backbone")
42
+ elif '101' in str(model_name) or 'r101' in str(model_name):
43
+ print("This appears to be RT-DETR with ResNet-101 backbone")
temp/rtdetr_dinov2_test.py ADDED
@@ -0,0 +1,49 @@
1
+ """Test using RT-DETR with DINOv2 backbone in transformers"""
2
+ from transformers import RTDetrConfig, RTDetrForObjectDetection
3
+ import torch
4
+
5
+ # Create config with DINOv2 backbone
6
+ config = RTDetrConfig(
7
+ # Model architecture
8
+ backbone="facebook/dinov2-small", # DINOv2 small variant
9
+ use_pretrained_backbone=False, # We'll load our weights
10
+ backbone_kwargs={
11
+ "out_indices": [3, 6, 9, 12], # Match the indices from your model
12
+ },
13
+
14
+ # Detection head config (from your checkpoint)
15
+ num_labels=3, # checked, unchecked, (background?)
16
+ hidden_dim=256,
17
+ num_queries=300,
18
+ decoder_layers=2,
19
+ d_model=256,
20
+ dim_feedforward=2048,
21
+ dropout=0.0,
22
+ nheads=8, # sa_nheads from your model
23
+
24
+ # Loss coefficients
25
+ bbox_loss_coefficient=5.0,
26
+ giou_loss_coefficient=2.0,
27
+ cls_loss_coefficient=1.0,
28
+ )
29
+
30
+ # Initialize model
31
+ model = RTDetrForObjectDetection(config)
32
+
33
+ print(f"Model initialized with {sum(p.numel() for p in model.parameters())} parameters")
34
+
35
+ # Your checkpoint has these keys in the state dict
36
+ # We'd need to map them to the RT-DETR expected keys
37
+ checkpoint = torch.load("model-weights/checkbox-nano.pt", map_location='cpu', weights_only=False)
38
+ state_dict = checkpoint['model']
39
+
40
+ # Show some key mappings needed
41
+ print("\nYour model keys (first 5):")
42
+ for key in list(state_dict.keys())[:5]:
43
+ print(f" {key}")
44
+
45
+ print("\nRT-DETR expects keys like:")
46
+ for key in list(model.state_dict().keys())[:5]:
47
+ print(f" {key}")
48
+
49
+ print("\nWould need to create a mapping between these formats")
temp/test_closest_debug.py ADDED
@@ -0,0 +1,26 @@
1
+ from natural_pdf import PDF
2
+ from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func
3
+
4
+ # Test parsing
5
+ selector_str = 'text:closest("Durham@0.7")'
6
+ parsed = parse_selector(selector_str)
7
+ print("Parsed selector:", parsed)
8
+
9
+ # Test with actual PDF
10
+ pdf = PDF('pdfs/01-practice.pdf')
11
+ page = pdf.pages[0]
12
+
13
+ # Get all text elements
14
+ all_text = page.find_all('text')
15
+ print(f"\nTotal text elements: {len(all_text)}")
16
+
17
+ # Test the selector
18
+ results = page.find_all('text:closest("Durham@0.7")')
19
+ print(f"Results with :closest selector: {len(results)}")
20
+
21
+ # Let's manually test the filter function
22
+ filter_func = selector_to_filter_func(parsed)
23
+ print("\nTesting filter function manually:")
24
+ for i, el in enumerate(all_text[:5]):
25
+ match = filter_func(el)
26
+ print(f" Element {i}: '{el.text}' -> {match}")
temp/test_closest_debug2.py ADDED
@@ -0,0 +1,22 @@
1
+ from natural_pdf import PDF
2
+ from natural_pdf.selectors.parser import parse_selector, _build_filter_list
3
+ import difflib
4
+
5
+ # Test parsing
6
+ selector_str = 'text:closest("Durham@0.7")'
7
+ parsed = parse_selector(selector_str)
8
+ print("Parsed selector:", parsed)
9
+ print("Args:", repr(parsed['pseudo_classes'][0]['args']))
10
+
11
+ # Build filters
12
+ filters = _build_filter_list(parsed)
13
+ print("\nFilters built:", filters)
14
+
15
+ # Test similarity calculation manually
16
+ search_text = "Durham"
17
+ test_texts = ["Durham's Meatpacking ", "Chicago, Ill.", "Violations"]
18
+
19
+ print(f"\nTesting similarity for '{search_text}':")
20
+ for text in test_texts:
21
+ ratio = difflib.SequenceMatcher(None, search_text.lower(), text.lower()).ratio()
22
+ print(f" '{text}' -> ratio: {ratio:.3f}")
temp/test_context_exploration.py ADDED
@@ -0,0 +1,85 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test to explore how context managers might work with Natural PDF
4
+ """
5
+
6
+ import natural_pdf as npdf
7
+ from contextlib import contextmanager
8
+ import threading
9
+
10
+ # Test 1: Check current global option system
11
+ print("=== Current Global Options ===")
12
+ print(f"Default directional_offset: {npdf.options.layout.directional_offset}")
13
+ print(f"Default auto_multipage: {npdf.options.layout.auto_multipage}")
14
+
15
+ # Test 2: See how options are used in directional methods
16
+ print("\n=== How Options Are Used ===")
17
+ # Let's trace through the code to see how offset is used
18
+
19
+ # Test 3: Prototype a context manager approach
20
+ @contextmanager
21
+ def temporary_options(**kwargs):
22
+ """Context manager to temporarily change options."""
23
+ # Store original values
24
+ original_values = {}
25
+
26
+ for key, value in kwargs.items():
27
+ parts = key.split('.')
28
+ obj = npdf.options
29
+
30
+ # Navigate to the right section
31
+ for part in parts[:-1]:
32
+ obj = getattr(obj, part)
33
+
34
+ # Store original and set new value
35
+ final_key = parts[-1]
36
+ original_values[key] = getattr(obj, final_key)
37
+ setattr(obj, final_key, value)
38
+
39
+ try:
40
+ yield
41
+ finally:
42
+ # Restore original values
43
+ for key, original_value in original_values.items():
44
+ parts = key.split('.')
45
+ obj = npdf.options
46
+
47
+ for part in parts[:-1]:
48
+ obj = getattr(obj, part)
49
+
50
+ final_key = parts[-1]
51
+ setattr(obj, final_key, original_value)
52
+
53
+ # Test the context manager
54
+ print("\n=== Testing Context Manager ===")
55
+ print(f"Before: offset={npdf.options.layout.directional_offset}")
56
+
57
+ with temporary_options(**{'layout.directional_offset': 5.0}):
58
+ print(f"Inside context: offset={npdf.options.layout.directional_offset}")
59
+
60
+ print(f"After: offset={npdf.options.layout.directional_offset}")
61
+
62
+ # Test 4: Check thread safety
63
+ print("\n=== Thread Safety Test ===")
64
+ results = []
65
+
66
+ def thread_func(thread_id, offset_value):
67
+ with temporary_options(**{'layout.directional_offset': offset_value}):
68
+ import time
69
+ time.sleep(0.1) # Simulate some work
70
+ results.append((thread_id, npdf.options.layout.directional_offset))
71
+
72
+ threads = []
73
+ for i in range(3):
74
+ t = threading.Thread(target=thread_func, args=(i, float(i * 10)))
75
+ threads.append(t)
76
+ t.start()
77
+
78
+ for t in threads:
79
+ t.join()
80
+
81
+ print("Thread results (thread_id, offset):")
82
+ for result in sorted(results):
83
+ print(f" Thread {result[0]}: offset={result[1]}")
84
+
85
+ print("\nConclusion: Global options are NOT thread-safe!")
temp/test_durham.py ADDED
@@ -0,0 +1,30 @@
1
+ from natural_pdf import PDF
2
+ import difflib
3
+
4
+ pdf = PDF('pdfs/01-practice.pdf')
5
+ page = pdf.pages[0]
6
+
7
+ # Find all text elements
8
+ all_text = page.find_all('text')
9
+ print(f"Total text elements: {len(all_text)}")
10
+
11
+ # Look for Durham-related text
12
+ print("\nLooking for Durham-related text:")
13
+ for el in all_text:
14
+ if "Durham" in el.text:
15
+ print(f" Found: '{el.text}'")
16
+
17
+ # Test similarity
18
+ search_text = "Durharn" # OCR error: rn instead of m
19
+ print(f"\nTesting similarity for '{search_text}':")
20
+ for el in all_text[:20]: # Just check first 20
21
+ if el.text:
22
+ ratio = difflib.SequenceMatcher(None, search_text.lower(), el.text.lower()).ratio()
23
+ if ratio > 0.3:
24
+ print(f" '{el.text}' -> ratio: {ratio:.3f}")
25
+
26
+ # Test the actual selector
27
+ results = page.find_all('text:closest("Durharn@0.4")')
28
+ print(f"\nResults with threshold 0.4: {len(results)}")
29
+ for r in results[:5]:
30
+ print(f" - {r.text}")
temp/test_empty_string.py ADDED
@@ -0,0 +1,16 @@
1
+ from natural_pdf import PDF
2
+ from natural_pdf.selectors.parser import parse_selector
3
+
4
+ # Test parsing empty string
5
+ selector_str = 'text:closest("")'
6
+ parsed = parse_selector(selector_str)
7
+ print("Parsed selector:", parsed)
8
+ print("Args:", repr(parsed['pseudo_classes'][0]['args']))
9
+
10
+ # Test with actual PDF
11
+ pdf = PDF('pdfs/01-practice.pdf')
12
+ page = pdf.pages[0]
13
+
14
+ # Try the selector
15
+ results = page.find_all('text:closest("")')
16
+ print(f"Results with empty string: {len(results)}")
temp/test_similarity.py ADDED
@@ -0,0 +1,15 @@
1
+ import difflib
2
+
3
+ # Test similarity calculation
4
+ pairs = [
5
+ ("Durham", "Durham's Meatpacking "),
6
+ ("durham", "durham's meatpacking "), # lowercase
7
+ ("Chicgo", "Chicago, Ill."),
8
+ ("chicgo", "chicago, ill."), # lowercase
9
+ ("Chicago", "Chicago, Ill."),
10
+ ]
11
+
12
+ print("Similarity ratios:")
13
+ for search, text in pairs:
14
+ ratio = difflib.SequenceMatcher(None, search, text).ratio()
15
+ print(f" '{search}' vs '{text}' -> {ratio:.3f}")