natural-pdf 0.2.18__py3-none-any.whl → 0.2.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. natural_pdf/__init__.py +8 -0
  2. natural_pdf/analyzers/checkbox/__init__.py +6 -0
  3. natural_pdf/analyzers/checkbox/base.py +265 -0
  4. natural_pdf/analyzers/checkbox/checkbox_analyzer.py +329 -0
  5. natural_pdf/analyzers/checkbox/checkbox_manager.py +166 -0
  6. natural_pdf/analyzers/checkbox/checkbox_options.py +60 -0
  7. natural_pdf/analyzers/checkbox/mixin.py +95 -0
  8. natural_pdf/analyzers/checkbox/rtdetr.py +201 -0
  9. natural_pdf/collections/mixins.py +14 -5
  10. natural_pdf/core/element_manager.py +5 -1
  11. natural_pdf/core/page.py +61 -0
  12. natural_pdf/core/page_collection.py +41 -1
  13. natural_pdf/core/pdf.py +24 -1
  14. natural_pdf/describe/base.py +20 -0
  15. natural_pdf/elements/base.py +152 -10
  16. natural_pdf/elements/element_collection.py +41 -2
  17. natural_pdf/elements/region.py +115 -2
  18. natural_pdf/judge.py +1509 -0
  19. natural_pdf/selectors/parser.py +42 -1
  20. {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/METADATA +1 -1
  21. {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/RECORD +41 -17
  22. temp/check_model.py +49 -0
  23. temp/check_pdf_content.py +9 -0
  24. temp/checkbox_checks.py +590 -0
  25. temp/checkbox_simple.py +117 -0
  26. temp/checkbox_ux_ideas.py +400 -0
  27. temp/context_manager_prototype.py +177 -0
  28. temp/convert_to_hf.py +60 -0
  29. temp/demo_text_closest.py +66 -0
  30. temp/inspect_model.py +43 -0
  31. temp/rtdetr_dinov2_test.py +49 -0
  32. temp/test_closest_debug.py +26 -0
  33. temp/test_closest_debug2.py +22 -0
  34. temp/test_context_exploration.py +85 -0
  35. temp/test_durham.py +30 -0
  36. temp/test_empty_string.py +16 -0
  37. temp/test_similarity.py +15 -0
  38. {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/WHEEL +0 -0
  39. {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/entry_points.txt +0 -0
  40. {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/licenses/LICENSE +0 -0
  41. {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,400 @@
1
+ """
2
+ Practical UX patterns for checkbox detection in form processing.
3
+ """
4
+
5
+ import json
6
+ import pandas as pd
7
+ import numpy as np
8
+ from pathlib import Path
9
+ from typing import List, Dict, Tuple, Optional
10
+ from collections import defaultdict
11
+
12
+
13
class CheckboxCalibrator:
    """
    Learn per-metric decision thresholds from user-provided examples.

    Every labeled region is reduced to a dict of image metrics (see
    ``_extract_metrics``).  ``calibrate`` then picks, for each metric, the
    threshold *and comparison direction* that best separate the checked
    examples from the unchecked ones; ``predict`` combines the metrics in an
    accuracy-weighted vote.
    """

    # Percentiles of the pooled metric values that are tried as thresholds.
    _CANDIDATE_PERCENTILES = (10, 20, 30, 40, 50, 60, 70, 80, 90)

    def __init__(self):
        self.checked_examples = []    # metric dicts of regions labeled checked
        self.unchecked_examples = []  # metric dicts of regions labeled unchecked
        self.thresholds = {}          # metric name -> calibration info dict

    def add_example(self, region, is_checked: bool):
        """Extract metrics from ``region`` and store them under the given label."""
        metrics = self._extract_metrics(region)
        if is_checked:
            self.checked_examples.append(metrics)
        else:
            self.unchecked_examples.append(metrics)

    def calibrate(self):
        """
        Find, for every metric, the threshold that best separates the labels.

        Both comparison directions are scored: ``'below'`` (checked when
        ``value < threshold``) and ``'above'`` (checked when
        ``value >= threshold``).  Scoring only ``'below'`` would make metrics
        that grow when a box is ticked (e.g. ``dark_pixel_ratio``) look
        useless.  On ties the earlier candidate wins and ``'below'`` is tried
        first.

        Raises:
            ValueError: if either label has no examples yet.
        """
        if not self.checked_examples or not self.unchecked_examples:
            raise ValueError("Need both checked and unchecked examples")

        checked_df = pd.DataFrame(self.checked_examples)
        unchecked_df = pd.DataFrame(self.unchecked_examples)

        for metric in checked_df.columns:
            checked_vals = checked_df[metric].values
            unchecked_vals = unchecked_df[metric].values
            all_vals = np.concatenate([checked_vals, unchecked_vals])
            total = len(all_vals)

            best_threshold = None
            best_direction = 'below'
            best_score = 0

            for threshold in np.percentile(all_vals, self._CANDIDATE_PERCENTILES):
                correct_by_direction = (
                    ('below', np.sum(checked_vals < threshold)
                     + np.sum(unchecked_vals >= threshold)),
                    ('above', np.sum(checked_vals >= threshold)
                     + np.sum(unchecked_vals < threshold)),
                )
                for direction, correct in correct_by_direction:
                    score = float(correct) / total
                    if score > best_score:
                        best_score = score
                        best_threshold = float(threshold)
                        best_direction = direction

            self.thresholds[metric] = {
                'value': best_threshold,
                'direction': best_direction,
                'accuracy': best_score,
                'checked_mean': float(np.mean(checked_vals)),
                'unchecked_mean': float(np.mean(unchecked_vals)),
            }

    def predict(self, region, confidence_threshold=0.7):
        """
        Predict whether ``region`` is checked using the calibrated thresholds.

        Only metrics whose calibration accuracy exceeds
        ``confidence_threshold`` take part; each votes with a weight equal to
        its accuracy.

        Returns:
            ``(is_checked, confidence)`` where ``confidence`` is the weighted
            share of "checked" votes (0 when no metric qualifies) and
            ``is_checked`` is ``confidence > 0.5``.
        """
        metrics = self._extract_metrics(region)
        votes = 0
        total_weight = 0

        for metric, value in metrics.items():
            threshold_info = self.thresholds.get(metric)
            # Skip metrics that were never calibrated or found no threshold.
            if threshold_info is None or threshold_info.get('value') is None:
                continue
            weight = threshold_info['accuracy']
            if weight <= confidence_threshold:
                continue
            # Calibrations without a stored direction are treated as 'below'.
            if threshold_info.get('direction', 'below') == 'above':
                votes_checked = value >= threshold_info['value']
            else:
                votes_checked = value < threshold_info['value']
            if votes_checked:
                votes += weight
            total_weight += weight

        confidence = votes / total_weight if total_weight > 0 else 0
        return confidence > 0.5, confidence

    def save(self, path: str):
        """Write the thresholds and the example counts to ``path`` as JSON."""
        with open(path, 'w') as f:
            json.dump({
                'thresholds': self.thresholds,
                'n_checked': len(self.checked_examples),
                'n_unchecked': len(self.unchecked_examples)
            }, f, indent=2)

    def load(self, path: str):
        """Load thresholds from a JSON file written by ``save``.

        Only ``thresholds`` is restored; the example lists are left untouched.
        """
        with open(path, 'r') as f:
            data = json.load(f)
            self.thresholds = data['thresholds']

    def _extract_metrics(self, region):
        """
        Compute the metric dict for ``region``.

        The region is rendered with ``crop=True`` and converted with mode
        ``'L'`` (presumably a PIL grayscale image - confirm against
        ``region.render``), then turned into a 2-D array.

        Raises:
            ValueError: if the rendered image has no pixels.
        """
        img = np.array(region.render(crop=True).convert('L'))
        if img.size == 0:
            raise ValueError("Region rendered to an empty image")
        h, w = img.shape
        cy, cx = h // 2, w // 2

        # Up to 5x5 pixel sample around the image center
        center = img[max(0, cy-2):min(h, cy+3), max(0, cx-2):min(w, cx+3)]
        center_mean = np.mean(center)

        return {
            'center_darkness': center_mean,
            'dark_pixel_ratio': np.sum(img < 200) / img.size,
            'std_intensity': np.std(img),
            # Mean of the top two rows relative to the center; 1 avoids a
            # division by zero when the center sample is entirely black.
            'edge_center_ratio': np.mean(img[0:2, :]) / center_mean if center_mean > 0 else 1
        }
111
+
112
+
113
class FormCheckboxProcessor:
    """
    Process multiple instances of the same form with checkboxes.

    Each checkbox is located relative to a reference element found on the
    page, classified as checked/unchecked, and the per-form outcome is
    validated against optional constraints.  Every processed form is also
    appended to ``self.results``.
    """

    # Supported ``direction`` values -> keyword that receives the configured size.
    _SIZE_KEYWORD = {
        'left': 'width',
        'right': 'width',
        'above': 'height',
        'below': 'height',
    }

    def __init__(self, template_config: Dict):
        """
        template_config = {
            'checkboxes': {
                'option1': {'find': 'text=Acceptable', 'direction': 'left', 'width': 15},
                'option2': {'find': 'text=Deficient', 'direction': 'left', 'width': 15},
                'option3': {'find': 'text=At-Risk', 'direction': 'left', 'width': 15},
            },
            'constraints': {
                'exactly_one': True,  # Exactly one must be checked
                'min_checked': 1,     # At least this many
                'max_checked': 1,     # At most this many
            }
        }
        """
        self.config = template_config
        self.calibrator = CheckboxCalibrator()
        self.results = []

    def process_form(self, page, form_id: str) -> Dict:
        """
        Process a single form instance.

        Returns a dict with ``form_id``, per-checkbox ``checkboxes`` info,
        ``valid`` (False when any checkbox could not be located or analyzed)
        and ``confidence`` (minimum over the analyzed checkboxes).  When the
        template defines ``constraints``, ``checked_count`` and
        ``constraint_met`` are always present; ``constraint_met`` is False
        for invalid forms because the constraint cannot be verified.
        """
        results = {'form_id': form_id, 'checkboxes': {}, 'valid': True, 'confidence': 1.0}

        for name, config in self.config['checkboxes'].items():
            try:
                # Find the reference element the checkbox sits next to
                ref = page.find(config['find'])
                if not ref:
                    results['checkboxes'][name] = {'found': False}
                    results['valid'] = False
                    continue

                cb = self._locate_checkbox(ref, config)

                if self.calibrator.thresholds:
                    # Use calibrated prediction
                    is_checked, confidence = self.calibrator.predict(cb)
                else:
                    # Use default analysis
                    from temp.checkbox_checks import analyze_checkbox
                    is_checked = analyze_checkbox(cb, method='all')
                    confidence = 0.8  # Default confidence

                results['checkboxes'][name] = {
                    'found': True,
                    'checked': is_checked,
                    'confidence': confidence,
                    'bbox': cb.bbox
                }
                results['confidence'] = min(results['confidence'], confidence)

            except Exception as e:
                # Best effort per checkbox: record the failure and keep going.
                results['checkboxes'][name] = {'found': False, 'error': str(e)}
                results['valid'] = False

        if 'constraints' in self.config:
            checked_count = sum(1 for cb in results['checkboxes'].values()
                                if cb.get('found') and cb.get('checked'))
            results['checked_count'] = checked_count
            results['constraint_met'] = bool(
                results['valid']
                and self._constraints_met(checked_count, self.config['constraints'])
            )

        self.results.append(results)
        return results

    @classmethod
    def _locate_checkbox(cls, ref, config):
        """Return the region next to ``ref`` described by ``config``.

        Raises ValueError for an unsupported ``direction`` instead of
        silently reusing a region from a previous checkbox.
        """
        direction = config.get('direction', 'left')
        size_keyword = cls._SIZE_KEYWORD.get(direction)
        if size_keyword is None:
            raise ValueError(
                f"Unsupported direction {direction!r}; "
                f"expected one of {sorted(cls._SIZE_KEYWORD)}"
            )
        return getattr(ref, direction)(**{size_keyword: config.get('width', 15)})

    @staticmethod
    def _constraints_met(checked_count, constraints):
        """Evaluate the template constraints against ``checked_count``.

        A truthy ``exactly_one`` takes precedence; otherwise ``min_checked``
        and ``max_checked`` are applied independently when present.
        """
        if constraints.get('exactly_one'):
            return checked_count == 1
        met = True
        if 'min_checked' in constraints:
            met = met and checked_count >= constraints['min_checked']
        if 'max_checked' in constraints:
            met = met and checked_count <= constraints['max_checked']
        return met

    def process_batch(self, pdf_paths: List[str], page_num: int = 0):
        """Process multiple PDFs, yielding one result dict per path.

        Each PDF is closed as soon as its page has been processed, so open
        documents do not accumulate across a large batch.
        """
        import natural_pdf as npdf

        for pdf_path in pdf_paths:
            pdf = npdf.PDF(pdf_path)
            try:
                result = self.process_form(pdf[page_num], Path(pdf_path).stem)
            finally:
                pdf.close()
            yield result

    def get_summary(self) -> pd.DataFrame:
        """Get a one-row-per-form summary of all processed forms."""
        data = []
        for result in self.results:
            row = {'form_id': result['form_id'], 'valid': result['valid']}
            for name, info in result['checkboxes'].items():
                row[f'{name}_checked'] = info.get('checked', False)
                row[f'{name}_confidence'] = info.get('confidence', 0)
            row['constraint_met'] = result.get('constraint_met', False)
            row['overall_confidence'] = result.get('confidence', 0)
            data.append(row)

        return pd.DataFrame(data)

    def flag_for_review(self, confidence_threshold: float = 0.8) -> List[str]:
        """Get ids of forms that are invalid or below ``confidence_threshold``."""
        return [r['form_id'] for r in self.results
                if r['confidence'] < confidence_threshold or not r['valid']]
232
+
233
+
234
class InteractiveCheckboxReviewer:
    """
    Console helper that walks a human through low-confidence form results.

    Corrections typed by the reviewer are collected in ``self.corrections``,
    keyed by ``"<form_id>_<checkbox name>"`` with a boolean "is checked" value.
    """

    def __init__(self, processor: FormCheckboxProcessor):
        self.corrections = {}
        self.processor = processor

    def review_uncertain(self, confidence_threshold: float = 0.8):
        """Print every result below ``confidence_threshold`` and ask for fixes."""
        pending = []
        for candidate in self.processor.results:
            if candidate['confidence'] < confidence_threshold:
                pending.append(candidate)
        total = len(pending)

        print(f"Found {total} uncertain cases to review")

        for position, form in enumerate(pending, start=1):
            form_id = form['form_id']
            boxes = form['checkboxes']

            print(f"\n--- Form {position}/{total}: {form_id} ---")
            print(f"Overall confidence: {form['confidence']:.2f}")

            for box_name, details in boxes.items():
                if not details.get('found'):
                    continue
                mark = "✓" if details['checked'] else "✗"
                box_conf = details['confidence']
                print(f"{box_name}: {mark} (confidence: {box_conf:.2f})")

            # In a real implementation, show the actual checkbox images
            answer = input("Correct? (y/n/skip): ").lower()
            if answer != 'n':
                continue

            # The reviewer rejected the prediction: ask about each found box
            for box_name in boxes:
                if not boxes[box_name].get('found'):
                    continue
                reply = input(f"Is {box_name} checked? (y/n): ").lower()
                self.corrections[f"{form_id}_{box_name}"] = reply == 'y'

    def export_training_data(self, output_dir: str):
        """Export examples for future training."""
        # Placeholder: a full implementation would save checkbox images with labels
271
+
272
+
273
class CheckboxExampleManager:
    """
    Manage an on-disk library of checkbox examples.

    Images are stored as PNG files under ``<examples_dir>/checked`` and
    ``<examples_dir>/unchecked``, each with a JSON metadata file next to it.
    """

    def __init__(self, examples_dir: str):
        self.examples_dir = Path(examples_dir)
        # parents=True so a nested, not-yet-existing path works as well.
        self.examples_dir.mkdir(parents=True, exist_ok=True)

    def save_example(self, region, label: str, is_checked: bool):
        """Save the rendered ``region`` and its metadata under the right label.

        The file is named ``<label>_<n>.png`` where ``n`` is the number of PNG
        files already present in the target sub-directory; the metadata goes
        to ``<label>_<n>.png.json``.
        """
        subdir = self.examples_dir / ('checked' if is_checked else 'unchecked')
        subdir.mkdir(parents=True, exist_ok=True)

        # Save image
        img = region.render(crop=True)
        filename = f"{label}_{len(list(subdir.glob('*.png')))}.png"
        img.save(subdir / filename)

        # Save metadata
        meta = {
            'label': label,
            'is_checked': is_checked,
            'bbox': region.bbox,
            'timestamp': pd.Timestamp.now().isoformat()
        }
        with open(subdir / f"{filename}.json", 'w') as f:
            json.dump(meta, f)

    def load_examples(self) -> Tuple[List, List]:
        """Return ``(checked_paths, unchecked_paths)`` of all saved PNG files."""
        checked = list((self.examples_dir / 'checked').glob('*.png'))
        unchecked = list((self.examples_dir / 'unchecked').glob('*.png'))
        return checked, unchecked

    def create_calibrator(self) -> "CheckboxCalibrator":
        """Create a calibrator for the saved examples.

        Mock implementation: it only reports how many examples exist and
        returns an empty calibrator; the images are not loaded yet.
        """
        calibrator = CheckboxCalibrator()

        checked, unchecked = self.load_examples()
        print(f"Loading {len(checked)} checked and {len(unchecked)} unchecked examples")

        return calibrator
316
+
317
+
318
+ # Example usage patterns
319
if __name__ == "__main__":
    # Prints an overview of the five usage patterns.  The snippets are
    # illustrative text only; nothing below touches a PDF.
    print("UX Pattern Examples")
    print("=" * 60)

    # Pattern 1: Simple pairwise comparison
    print("\n1. PAIRWISE COMPARISON")
    print("Pros: Simple, no setup needed")
    print("Cons: No learning, must process each time")
    print("""
from temp.checkbox_checks import compare_checkboxes

df = compare_checkboxes([cb1, cb2, cb3], ['A', 'B', 'C'])
winner = df.loc['vote_score'].idxmax()
print(f"Checked: {winner}")
""")

    # Pattern 2: Calibration-based
    print("\n2. CALIBRATION-BASED")
    print("Pros: Learns from examples, improves over time")
    print("Cons: Requires initial setup")
    print("""
calibrator = CheckboxCalibrator()

# Add examples
calibrator.add_example(checked_cb, is_checked=True)
calibrator.add_example(unchecked_cb, is_checked=False)
calibrator.calibrate()

# Use on new checkboxes
is_checked, confidence = calibrator.predict(new_cb)
""")

    # Pattern 3: Template-based batch processing
    print("\n3. TEMPLATE-BASED BATCH")
    print("Pros: Handles many forms, constraint checking")
    print("Cons: Requires template definition")
    print("""
config = {
    'checkboxes': {
        'acceptable': {'find': 'text=Acceptable', 'direction': 'left'},
        'deficient': {'find': 'text=Deficient', 'direction': 'left'},
    },
    'constraints': {'exactly_one': True}
}

processor = FormCheckboxProcessor(config)
for result in processor.process_batch(pdf_files):
    if not result['constraint_met']:
        print(f"Invalid: {result['form_id']}")
""")

    # Pattern 4: Confidence-based triage
    print("\n4. CONFIDENCE-BASED TRIAGE")
    print("Pros: Focuses human review on uncertain cases")
    print("Cons: Still requires some manual review")
    print("""
# Process all forms
results = processor.process_batch(pdfs)

# Auto-accept high confidence
high_conf = [r for r in results if r['confidence'] > 0.9]

# Flag for review
needs_review = processor.flag_for_review(confidence_threshold=0.8)
print(f"Review needed: {len(needs_review)} forms")
""")

    # Pattern 5: Active learning
    # reviewer.corrections maps "<form_id>_<name>" -> bool, so the snippet
    # iterates .items(); iterating the dict itself would pass the (always
    # truthy) key string as the label.
    print("\n5. ACTIVE LEARNING")
    print("Pros: Improves accuracy with minimal human input")
    print("Cons: More complex implementation")
    print("""
reviewer = InteractiveCheckboxReviewer(processor)

# Review uncertain cases
reviewer.review_uncertain(confidence_threshold=0.7)

# Update calibrator with corrections
# (regions_by_key maps "<form_id>_<name>" to the reviewed checkbox region)
for key, is_checked in reviewer.corrections.items():
    calibrator.add_example(regions_by_key[key], is_checked=is_checked)
calibrator.calibrate()
""")
@@ -0,0 +1,177 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Prototype implementation of context manager for Natural PDF directional options.
4
+
5
+ This demonstrates how Option 1 (Global Context Manager) would work in practice.
6
+ """
7
+
8
+ import natural_pdf as npdf
9
+ from contextlib import contextmanager
10
+ from typing import Any, Dict, Optional
11
+
12
@contextmanager
def with_directional_options(
    directional_offset: Optional[float] = None,
    auto_multipage: Optional[bool] = None,
    **kwargs
):
    """
    Context manager to temporarily override directional method options.

    All option names are validated before anything is modified, so a typo in
    one option can no longer leave the other options permanently overridden.

    Parameters
    ----------
    directional_offset : float, optional
        Temporary offset in points for directional methods (default: keep current)
    auto_multipage : bool, optional
        Temporary setting for automatic multipage navigation (default: keep current)
    **kwargs : dict
        Additional layout options to override

    Raises
    ------
    ValueError
        If an option name is not an attribute of ``npdf.options.layout``.

    Examples
    --------
    >>> # Temporarily use larger offset
    >>> with with_directional_options(directional_offset=5.0):
    ...     region = element.below()  # Uses 5.0 offset

    >>> # Multiple options
    >>> with with_directional_options(directional_offset=10.0, auto_multipage=True):
    ...     region = element.below(until="Section 2")  # Can span pages

    >>> # Nested contexts
    >>> with with_directional_options(directional_offset=5.0):
    ...     with with_directional_options(auto_multipage=True):
    ...         # Both options are active here
    ...         region = element.below()

    Warning
    -------
    This context manager modifies global state and is NOT thread-safe.
    Do not use in multi-threaded applications.
    """
    # Build options dict
    options = {}
    if directional_offset is not None:
        options['directional_offset'] = directional_offset
    if auto_multipage is not None:
        options['auto_multipage'] = auto_multipage
    options.update(kwargs)

    layout_options = npdf.options.layout

    # Validate first: raising after some options were already set would leak
    # those overrides, because the restoring ``finally`` is not active yet.
    for key in options:
        if not hasattr(layout_options, key):
            raise ValueError(f"Unknown layout option: {key}")

    # Store original values
    original_values = {key: getattr(layout_options, key) for key in options}

    try:
        # Applied inside the try so a failing assignment is rolled back too.
        for key, value in options.items():
            setattr(layout_options, key, value)
        yield
    finally:
        # Restore original values
        for key, original_value in original_values.items():
            setattr(layout_options, key, original_value)
76
+
77
+
78
+ # Convenience functions for common use cases
79
@contextmanager
def with_offset(offset: float):
    """Use ``offset`` as the directional offset for the enclosed block."""
    scoped_override = with_directional_options(directional_offset=offset)
    with scoped_override:
        yield
84
+
85
+
86
@contextmanager
def with_multipage():
    """Switch ``auto_multipage`` on for the enclosed block."""
    scoped_override = with_directional_options(auto_multipage=True)
    with scoped_override:
        yield
91
+
92
+
93
@contextmanager
def no_offset():
    """Set the directional offset to 0 for the enclosed block."""
    scoped_override = with_directional_options(directional_offset=0.0)
    with scoped_override:
        yield
98
+
99
+
100
+ # Demo usage
101
if __name__ == "__main__":
    import os
    import tempfile
    from reportlab.pdfgen import canvas
    from reportlab.lib.pagesizes import letter

    # Reserve a temp path and release the handle before reportlab writes to
    # it; writing to a file that is still open elsewhere fails on Windows.
    with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as tmp:
        pdf_path = tmp.name

    try:
        # Create a test PDF
        c = canvas.Canvas(pdf_path, pagesize=letter)

        # Page 1
        c.drawString(100, 700, "Header 1")
        c.drawString(100, 600, "Content 1")
        c.drawString(100, 500, "Footer 1")

        # Page 2
        c.showPage()
        c.drawString(100, 700, "Header 2")
        c.drawString(100, 600, "Content 2")

        c.save()

        # Test the context managers
        print("=== Testing Context Managers ===")
        pdf = npdf.PDF(pdf_path)
        try:
            page = pdf.pages[0]

            # Find header
            header = page.find('text:contains("Header 1")')

            # Test 1: Default behavior
            print(f"\n1. Default offset: {npdf.options.layout.directional_offset}")
            region1 = header.below(height=50)
            print(f" Region bbox: {region1.bbox}")

            # Test 2: With custom offset
            print("\n2. Using with_offset(10.0):")
            with with_offset(10.0):
                print(f" Inside context: offset={npdf.options.layout.directional_offset}")
                region2 = header.below(height=50)
                print(f" Region bbox: {region2.bbox}")
                print(f" Y difference: {region2.top - region1.top}")
            print(f" After context: offset={npdf.options.layout.directional_offset}")

            # Test 3: No offset
            print("\n3. Using no_offset():")
            with no_offset():
                region3 = header.below(height=50)
                print(f" Region bbox: {region3.bbox}")
                print(f" Includes header: {region3.top <= header.bottom}")

            # Test 4: Multipage navigation
            print("\n4. Testing multipage:")
            print(f" Default auto_multipage: {npdf.options.layout.auto_multipage}")

            # This would normally stop at page boundary; the call is kept for
            # its demonstration value, the returned region is not used.
            header.below(until='text:contains("Header 2")')
            print(" Without multipage: stops at page boundary")

            # This can cross pages
            with with_multipage():
                print(f" Inside context: auto_multipage={npdf.options.layout.auto_multipage}")
                # Note: This would work if multipage was fully implemented
                # region5 = header.below(until="Header 2")
                # print(f" Region type: {type(region5).__name__}")  # Would be FlowRegion

            # Test 5: Nested contexts
            print("\n5. Nested contexts:")
            with with_offset(5.0):
                print(f" Outer: offset={npdf.options.layout.directional_offset}")
                with with_offset(10.0):
                    print(f" Inner: offset={npdf.options.layout.directional_offset}")
                print(f" Back to outer: offset={npdf.options.layout.directional_offset}")
            print(f" Back to default: offset={npdf.options.layout.directional_offset}")
        finally:
            # Release the file before it is deleted below.
            pdf.close()
    finally:
        # Cleanup runs even when the demo fails part-way through.
        os.unlink(pdf_path)
temp/convert_to_hf.py ADDED
@@ -0,0 +1,60 @@
1
+ """Convert RF-DETR checkpoint to HuggingFace format"""
2
import torch
import json
from pathlib import Path

# Load your checkpoint.
# SECURITY NOTE(review): weights_only=False unpickles arbitrary objects (the
# stored ``args`` needs it) and can execute code embedded in the file.  Only
# run this on checkpoints from a trusted source.
checkpoint = torch.load("model-weights/checkbox-nano.pt", map_location='cpu', weights_only=False)

# Extract model info
model_state = checkpoint['model']
args = checkpoint['args']

print(f"Model has {len(model_state)} parameters")
print(f"Classes: {args.class_names}")
print(f"Number of classes: {args.num_classes}")

# Create HF-style config
config = {
    "architectures": ["RFDetrForObjectDetection"],
    "model_type": "rf-detr",
    "num_labels": args.num_classes,
    # JSON object keys must be strings, so ids are stringified here ...
    "id2label": {str(i): label for i, label in enumerate(args.class_names)},
    # ... while label2id values stay integers, following the Hugging Face
    # config convention (label -> int id).
    "label2id": {label: i for i, label in enumerate(args.class_names)},

    # RF-DETR specific
    "encoder": args.encoder,  # dinov2_windowed_small
    "hidden_dim": args.hidden_dim,
    "num_queries": args.num_queries,
    "dec_layers": args.dec_layers,
    "dim_feedforward": args.dim_feedforward,
    "dropout": args.dropout,
    "sa_nheads": args.sa_nheads,
    "ca_nheads": args.ca_nheads,
    "two_stage": args.two_stage,

    # Detection specific
    "bbox_loss_coef": args.bbox_loss_coef,
    "giou_loss_coef": args.giou_loss_coef,
    "cls_loss_coef": args.cls_loss_coef,

    # Training config
    "resolution": args.resolution,
    "pretrained_encoder": args.encoder,
}

# Save config; parents=True so the script also works when "temp/" is missing
output_dir = Path("temp/checkbox-rf-detr-hf")
output_dir.mkdir(parents=True, exist_ok=True)

with open(output_dir / "config.json", "w") as f:
    json.dump(config, f, indent=2)

# Save model weights in HF format
torch.save(model_state, output_dir / "pytorch_model.bin")

print(f"\nSaved to {output_dir}")
print("Next steps:")
print("1. Copy the custom RF-DETR implementation files from Thastp/rf-detr-base")
print("2. Upload to HuggingFace Hub with these files")
print("3. Use with transformers library")
60
+ print("3. Use with transformers library")