natural-pdf 0.2.17__py3-none-any.whl → 0.2.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. natural_pdf/__init__.py +8 -0
  2. natural_pdf/analyzers/checkbox/__init__.py +6 -0
  3. natural_pdf/analyzers/checkbox/base.py +265 -0
  4. natural_pdf/analyzers/checkbox/checkbox_analyzer.py +329 -0
  5. natural_pdf/analyzers/checkbox/checkbox_manager.py +166 -0
  6. natural_pdf/analyzers/checkbox/checkbox_options.py +60 -0
  7. natural_pdf/analyzers/checkbox/mixin.py +95 -0
  8. natural_pdf/analyzers/checkbox/rtdetr.py +201 -0
  9. natural_pdf/collections/mixins.py +14 -5
  10. natural_pdf/core/element_manager.py +5 -1
  11. natural_pdf/core/page.py +103 -9
  12. natural_pdf/core/page_collection.py +41 -1
  13. natural_pdf/core/pdf.py +24 -1
  14. natural_pdf/describe/base.py +20 -0
  15. natural_pdf/elements/base.py +152 -10
  16. natural_pdf/elements/element_collection.py +41 -2
  17. natural_pdf/elements/region.py +115 -2
  18. natural_pdf/judge.py +1509 -0
  19. natural_pdf/selectors/parser.py +42 -1
  20. natural_pdf/utils/spatial.py +42 -39
  21. {natural_pdf-0.2.17.dist-info → natural_pdf-0.2.19.dist-info}/METADATA +1 -1
  22. {natural_pdf-0.2.17.dist-info → natural_pdf-0.2.19.dist-info}/RECORD +42 -18
  23. temp/check_model.py +49 -0
  24. temp/check_pdf_content.py +9 -0
  25. temp/checkbox_checks.py +590 -0
  26. temp/checkbox_simple.py +117 -0
  27. temp/checkbox_ux_ideas.py +400 -0
  28. temp/context_manager_prototype.py +177 -0
  29. temp/convert_to_hf.py +60 -0
  30. temp/demo_text_closest.py +66 -0
  31. temp/inspect_model.py +43 -0
  32. temp/rtdetr_dinov2_test.py +49 -0
  33. temp/test_closest_debug.py +26 -0
  34. temp/test_closest_debug2.py +22 -0
  35. temp/test_context_exploration.py +85 -0
  36. temp/test_durham.py +30 -0
  37. temp/test_empty_string.py +16 -0
  38. temp/test_similarity.py +15 -0
  39. {natural_pdf-0.2.17.dist-info → natural_pdf-0.2.19.dist-info}/WHEEL +0 -0
  40. {natural_pdf-0.2.17.dist-info → natural_pdf-0.2.19.dist-info}/entry_points.txt +0 -0
  41. {natural_pdf-0.2.17.dist-info → natural_pdf-0.2.19.dist-info}/licenses/LICENSE +0 -0
  42. {natural_pdf-0.2.17.dist-info → natural_pdf-0.2.19.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,590 @@
1
+ """Checkbox detection using flood fill from center - X vs empty box."""
2
+
3
+ import numpy as np
4
+ from collections import deque
5
+
6
+
7
+
8
+
9
def flood_fill_size(image, start_x, start_y, threshold=200):
    """
    Flood fill from a point and return the size of the filled region.

    Larger region = empty box. Smaller region = X or other mark.

    Args:
        image: 2-D grayscale array, or a 3-D array whose last axis is
            averaged down to a single channel before filling.
        start_x: Column index of the seed pixel.
        start_y: Row index of the seed pixel.
        threshold: Pixels with a value >= threshold count as "light"
            (fillable); anything below is treated as ink.

    Returns:
        Number of light pixels 8-connected to the seed. Returns 0 when the
        seed lies outside the image or sits on ink.
    """
    image = np.asarray(image)
    if image.ndim == 3:
        image = np.mean(image, axis=2).astype(np.uint8)

    height, width = image.shape

    # Reject seeds on either side of the image; a negative index would
    # otherwise silently wrap around to the opposite edge.
    if not (0 <= start_x < width and 0 <= start_y < height):
        return 0

    # A seed on ink has no light region of its own. Expanding from it would
    # bridge the light areas on both sides of a stroke.
    if image[start_y, start_x] < threshold:
        return 0

    # Track visited pixels
    visited = np.zeros(image.shape, dtype=bool)
    visited[start_y, start_x] = True
    queue = deque([(start_x, start_y)])

    # 8-directional flood fill
    directions = [(-1, -1), (-1, 0), (-1, 1), (0, -1),
                  (0, 1), (1, -1), (1, 0), (1, 1)]

    count = 0
    while queue:
        x, y = queue.popleft()
        count += 1

        for dx, dy in directions:
            nx, ny = x + dx, y + dy
            if not (0 <= nx < width and 0 <= ny < height):
                continue
            # Only spread into unvisited pixels that are light enough.
            if not visited[ny, nx] and image[ny, nx] >= threshold:
                visited[ny, nx] = True
                queue.append((nx, ny))

    return count
51
+
52
+
53
def detect_checkbox_flood(region, debug=False):
    """
    Detect if checkbox is checked using flood fill from center.

    Empty box = large flood region. X = small flood region.

    Args:
        region: Object whose ``render(crop=True)`` returns an image that
            supports ``convert('L')``.
        debug: When True, print the intermediate flood-fill numbers.

    Returns:
        dict with keys:
            'is_checked'  - True when the best flood covers < 20% of the box
            'is_empty'    - True when the best flood covers > 50% of the box
            'flood_ratio' - best flood size divided by total pixel count
            'confidence'  - distance of the ratio from 0.35, scaled by 0.35
                            and capped at 1.0
    """
    # Convert to grayscale numpy array
    img = np.array(region.render(crop=True).convert('L'))
    height, width = img.shape
    total_pixels = width * height

    # A zero-sized render has nothing to measure; report it as undecided
    # instead of dividing by zero below.
    if total_pixels == 0:
        return {
            'is_checked': False,
            'is_empty': False,
            'flood_ratio': 0.0,
            'confidence': 0.0,
        }

    # Start flood fill from center
    center_x, center_y = width // 2, height // 2
    flood_size = flood_fill_size(img, center_x, center_y)
    flood_ratio = flood_size / total_pixels

    # Also try a few points near center in case center pixel is dark
    max_flood_size = flood_size
    for dx, dy in ((-2, 0), (2, 0), (0, -2), (0, 2)):
        x, y = center_x + dx, center_y + dy
        if 0 <= x < width and 0 <= y < height:
            max_flood_size = max(max_flood_size, flood_fill_size(img, x, y))

    max_flood_ratio = max_flood_size / total_pixels

    # Decision logic:
    # - If flood fills >50% of box, it's likely empty
    # - If flood fills <20% of box, it's likely marked with X
    is_empty = max_flood_ratio > 0.5
    is_marked = max_flood_ratio < 0.2

    if debug:
        print("Flood fill results:")
        print(f" Center flood: {flood_size} pixels ({flood_ratio:.1%})")
        print(f" Max flood: {max_flood_size} pixels ({max_flood_ratio:.1%})")
        print(f" Decision: {'EMPTY' if is_empty else 'MARKED' if is_marked else 'UNCERTAIN'}")

    # Confidence grows with distance from the uncertain middle (0.35). The
    # raw value exceeds 1 for ratios above 0.7, so cap it at 1.0.
    confidence = min(1.0, abs(max_flood_ratio - 0.35) / 0.35)

    return {
        'is_checked': is_marked,
        'is_empty': is_empty,
        'flood_ratio': max_flood_ratio,
        'confidence': confidence,
    }
104
+
105
+
106
def simple_center_darkness(region, sample_size=5):
    """
    Even simpler: just check if center region is dark.

    Good for quick X detection.

    Args:
        region: Object whose ``render(crop=True)`` returns an image that
            supports ``convert('L')``.
        sample_size: Side length (in pixels) of the square patch sampled
            around the image center.

    Returns:
        True when the mean intensity of the center patch is below 200.
    """
    # Render the same way every other detector in this module does; the
    # previously used get_region_image helper is not defined in this file.
    img = np.array(region.render(crop=True).convert('L'))
    height, width = img.shape

    # Sample center region
    cy, cx = height // 2, width // 2
    half = sample_size // 2

    # Extract center patch, clipped to the image bounds
    center_patch = img[max(0, cy - half):min(height, cy + half + 1),
                       max(0, cx - half):min(width, cx + half + 1)]

    # X marks usually have dark center
    return bool(np.mean(center_patch) < 200)
129
+
130
+
131
def ink_blob_analysis(region, threshold=200):
    """
    Analyze connected components of ink.

    X pattern typically has 1 large connected component.
    Empty box has small scattered components (noise).

    Args:
        region: Object whose ``render(crop=True)`` returns an image that
            supports ``convert('L')``.
        threshold: Pixels with a value below this are counted as ink.

    Returns:
        dict with keys:
            'is_x_pattern'       - at most 3 blobs and the largest one holds
                                   more than 60% of all ink pixels
            'num_blobs'          - number of blobs larger than 10 pixels
            'largest_blob_ratio' - largest blob size / total ink (0 if no ink)
            'total_ink_ratio'    - ink pixels / all pixels (0 if image empty)
    """
    # Render the same way every other detector in this module does; the
    # previously used get_region_image helper is not defined in this file.
    img = np.array(region.render(crop=True).convert('L'))
    binary = img < threshold
    rows, cols = binary.shape

    # Simple connected component analysis without scipy
    visited = np.zeros(binary.shape, dtype=bool)
    blob_sizes = []
    neighbours = [(dy, dx) for dy in (-1, 0, 1) for dx in (-1, 0, 1)
                  if (dy, dx) != (0, 0)]

    for start_y in range(rows):
        for start_x in range(cols):
            if not binary[start_y, start_x] or visited[start_y, start_x]:
                continue

            # Iterative flood fill (explicit stack) to avoid recursion limit
            stack = [(start_y, start_x)]
            visited[start_y, start_x] = True
            size = 0
            while stack:
                y, x = stack.pop()
                size += 1
                for dy, dx in neighbours:
                    ny, nx = y + dy, x + dx
                    if (0 <= ny < rows and 0 <= nx < cols
                            and binary[ny, nx] and not visited[ny, nx]):
                        visited[ny, nx] = True
                        stack.append((ny, nx))

            if size > 10:  # Ignore tiny noise
                blob_sizes.append(size)

    # Analysis
    total_ink = int(np.sum(binary))
    largest_blob = max(blob_sizes, default=0)
    num_blobs = len(blob_sizes)

    # X typically has 1-2 large blobs, empty box has many small ones
    is_x_pattern = num_blobs <= 3 and largest_blob > total_ink * 0.6

    return {
        'is_x_pattern': is_x_pattern,
        'num_blobs': num_blobs,
        'largest_blob_ratio': largest_blob / total_ink if total_ink > 0 else 0,
        'total_ink_ratio': total_ink / binary.size if binary.size > 0 else 0,
    }
196
+
197
+
198
def analyze_checkbox(region, method='flood', debug=False):
    """
    Decide whether a checkbox region is marked, using the chosen detector.

    Methods:
    - 'flood': flood fill from center (default)
    - 'center': center darkness check
    - 'blob': connected component analysis
    - 'all': run all three detectors and take a majority vote

    Raises:
        ValueError: if ``method`` is not one of the names above.
    """
    # Single-detector modes return straight away.
    if method == 'flood':
        return detect_checkbox_flood(region, debug)['is_checked']
    if method == 'center':
        return simple_center_darkness(region)
    if method == 'blob':
        return ink_blob_analysis(region)['is_x_pattern']
    if method != 'all':
        raise ValueError(f"Unknown method: {method}")

    # 'all': gather one verdict per detector, then vote.
    flood_outcome = detect_checkbox_flood(region, debug=False)
    center_marked = simple_center_darkness(region)
    blob_outcome = ink_blob_analysis(region)

    flood_marked = flood_outcome['is_checked']
    x_like = blob_outcome['is_x_pattern']
    tally = sum([flood_marked, center_marked, x_like])

    if debug:
        print("All methods:")
        print(f" Flood fill: {'MARKED' if flood_marked else 'EMPTY'}")
        print(f" Center darkness: {'MARKED' if center_marked else 'EMPTY'}")
        print(f" Blob analysis: {'X PATTERN' if x_like else 'EMPTY/OTHER'}")
        print(f" Final vote: {tally}/3 say MARKED")

    # Marked when at least two of the three detectors agree.
    return tally >= 2
242
+
243
+
244
+
245
+
246
def compare_checkboxes(regions, labels=None):
    """
    Compare multiple checkbox regions across all metrics.

    Args:
        regions: List of checkbox regions to analyze. Each must provide
            ``render(crop=True)`` returning an image with ``convert('L')``.
        labels: Optional list of labels for each region
            (e.g., ['Acceptable', 'Deficient', 'At-Risk']). Labels that have
            no matching region are ignored.

    Returns:
        pandas DataFrame with metrics as rows and regions as columns. The
        final row, 'most_likely_checked', lists for each region the summary
        metrics on which that region looks the most marked (comma separated,
        empty string if none). An empty DataFrame is returned when there are
        no regions.
    """
    import pandas as pd

    if labels is None:
        labels = [f'Region_{i+1}' for i in range(len(regions))]

    # Only labels that actually have a region get a column, so the frame
    # never contains all-NaN columns.
    results = {}

    for region, label in zip(regions, labels):
        metrics = {}

        # Get grayscale image
        img = np.array(region.render(crop=True).convert('L'))
        height, width = img.shape
        total_pixels = height * width
        binary = img < 200

        # 1. Basic pixel metrics
        dark_pixels = np.sum(binary)
        mean_intensity = np.mean(img)
        metrics['dark_pixel_count'] = dark_pixels
        metrics['dark_pixel_ratio'] = dark_pixels / total_pixels
        metrics['mean_intensity'] = mean_intensity
        metrics['std_intensity'] = np.std(img)
        metrics['ink_score'] = (255 - mean_intensity) / 2.55

        # 2. Flood fill metrics
        flood_result = detect_checkbox_flood(region, debug=False)
        metrics['flood_ratio'] = flood_result['flood_ratio']
        metrics['flood_confidence'] = flood_result['confidence']
        metrics['is_empty_flood'] = flood_result['is_empty']
        metrics['is_marked_flood'] = flood_result['is_checked']

        # 3. Center darkness (5x5 patch around the middle)
        cy, cx = height // 2, width // 2
        center_patch = img[max(0, cy - 2):min(height, cy + 3),
                           max(0, cx - 2):min(width, cx + 3)]
        center_darkness = np.mean(center_patch)
        metrics['center_darkness'] = center_darkness
        metrics['is_marked_center'] = center_darkness < 200

        # 4. Blob analysis
        blob_result = ink_blob_analysis(region)
        metrics['num_blobs'] = blob_result['num_blobs']
        metrics['largest_blob_ratio'] = blob_result['largest_blob_ratio']
        metrics['total_ink_ratio'] = blob_result['total_ink_ratio']
        metrics['is_x_pattern'] = blob_result['is_x_pattern']

        # 5. Spatial distribution
        # Check if ink is concentrated in diagonal patterns (X shape)
        side = min(height, width)
        steps = np.arange(side)
        diag1_sum = np.sum(binary[steps, steps])              # main diagonal
        diag2_sum = np.sum(binary[steps, width - 1 - steps])  # anti-diagonal
        metrics['diagonal_ink_ratio'] = (
            (diag1_sum + diag2_sum) / (2 * side) if side > 0 else 0
        )

        # 6. Edge vs center distribution (3-pixel border counts as "edge")
        edge_mask = np.zeros_like(binary)
        edge_mask[0:3, :] = True
        edge_mask[-3:, :] = True
        edge_mask[:, 0:3] = True
        edge_mask[:, -3:] = True
        edge_total = np.sum(edge_mask)
        inner_total = np.sum(~edge_mask)
        edge_ink = np.sum(binary & edge_mask)
        center_ink = np.sum(binary & ~edge_mask)
        metrics['edge_ink_ratio'] = edge_ink / edge_total if edge_total > 0 else 0
        metrics['center_ink_ratio'] = center_ink / inner_total if inner_total > 0 else 0

        # 7. Final voting
        votes = sum([
            metrics['is_marked_flood'],
            metrics['is_marked_center'],
            metrics['is_x_pattern'],
        ])
        metrics['vote_score'] = votes
        metrics['is_checked'] = votes >= 2

        results[label] = metrics

    # Create DataFrame
    df = pd.DataFrame(results)
    if df.empty:
        return df

    # For each summary metric, work out which region looks the most marked.
    # The flag says whether the lowest value wins for that metric.
    summary_metrics = [
        ('dark_pixel_ratio', False),
        ('ink_score', False),
        ('flood_ratio', True),       # Lower is more marked
        ('center_darkness', True),   # Lower is darker
        ('diagonal_ink_ratio', False),
        ('vote_score', False),
    ]
    wins = {col: [] for col in df.columns}
    for metric, lowest_wins in summary_metrics:
        # Rows of the mixed-type frame are object dtype; compare as floats.
        row = df.loc[metric].astype(float).dropna()
        if row.empty:
            continue
        winner = row.idxmin() if lowest_wins else row.idxmax()
        wins[winner].append(metric)

    # The summary row is keyed by region (the frame's columns), so each cell
    # names the metrics that region won.
    df.loc['most_likely_checked'] = [', '.join(wins[col]) for col in df.columns]

    return df
346
+
347
+
348
+ # Example usage scaffold
349
def analyze_three_checkboxes():
    """
    Example scaffold for analyzing three checkbox regions.

    Loads "your_form.pdf", looks for the checkboxes to the left of the
    'Acceptable', 'Deficient' and 'At-Risk' labels, runs
    compare_checkboxes() on the ones that were found and prints a summary.

    Returns:
        The DataFrame produced by compare_checkboxes(), or an empty
        DataFrame when no checkbox region could be located.
    """
    import natural_pdf as npdf
    import pandas as pd

    # Set pandas display options for better viewing
    pd.set_option('display.max_rows', None)
    pd.set_option('display.float_format', '{:.3f}'.format)

    # Load your PDF
    pdf = npdf.PDF("your_form.pdf")
    page = pdf[0]

    # Method 1: If you have the regions already
    # regions = [region1, region2, region3]
    # labels = ['Acceptable', 'Deficient', 'At-Risk']

    # Method 2: Find them relative to text labels
    checkbox_area = page.find('text:contains("Housing")')  # Or however you identify the area

    # Keep regions and labels in step: a label whose checkbox is missing must
    # not shift the remaining labels onto the wrong regions.
    regions = []
    found_labels = []

    for label in ['Acceptable', 'Deficient', 'At-Risk']:
        label_element = None
        if checkbox_area is not None:
            label_element = checkbox_area.find(f'text={label}')
        cb = label_element.left(width=15) if label_element is not None else None
        if cb is not None:
            regions.append(cb)
            found_labels.append(label)
        else:
            print(f"Warning: Could not find checkbox for {label}")

    if not regions:
        print("No checkbox regions found - nothing to compare")
        return pd.DataFrame()

    # Run comparison
    df = compare_checkboxes(regions, found_labels)

    # Display results
    print("\nCheckbox Analysis Results:")
    print("=" * 80)
    print(df)

    print("\n\nKey Metrics Interpretation:")
    print("-" * 80)
    print("dark_pixel_ratio: Higher = more ink")
    print("ink_score: 0-100, higher = more marking")
    print("flood_ratio: Lower = more marked (ink blocks flood fill)")
    print("center_darkness: Lower = darker center (X pattern)")
    print("diagonal_ink_ratio: Higher = more X-like pattern")
    print("vote_score: 0-3, number of methods that think it's checked")

    print("\n\nConclusion:")
    print("-" * 80)
    checked_col = df.loc['is_checked']
    checked_labels = [col for col in checked_col.index if checked_col[col]]
    if checked_labels:
        print(f"Checked boxes: {', '.join(checked_labels)}")
    else:
        print("No boxes appear to be checked")

    # Find most likely based on vote
    vote_scores = df.loc['vote_score']
    max_vote = vote_scores.max()
    if max_vote > 0:
        most_likely = vote_scores.idxmax()
        print(f"Most likely checked (by vote): {most_likely}")

    return df
415
+
416
+
417
+ # Quick test function for three regions
418
def quick_compare(region1, region2, region3, labels=('Region 1', 'Region 2', 'Region 3')):
    """
    Quick comparison of three regions.

    Thin wrapper that forwards the three regions, in order, to
    compare_checkboxes().

    Args:
        region1, region2, region3: The checkbox regions to compare.
        labels: Labels for the three regions. ``None`` is forwarded as-is so
            compare_checkboxes() can generate its own labels.

    Returns:
        Whatever compare_checkboxes() returns for the three regions.

    Example:
        df = quick_compare(acceptable_cb, deficient_cb, at_risk_cb,
                           ['Acceptable', 'Deficient', 'At-Risk'])
    """
    # The default is an immutable tuple rather than a shared list; hand the
    # callee a fresh list so it is free to modify it.
    label_list = None if labels is None else list(labels)
    return compare_checkboxes([region1, region2, region3], label_list)
427
+
428
+
429
def which_is_checked(*regions, labels=None):
    """
    Simple function that tells you which checkbox is checked.

    Ranks the regions by the darkness of a 7x7 patch around their center.

    Args:
        *regions: Checkbox regions; each must provide ``render(crop=True)``
            returning an image with ``convert('L')``.
        labels: Optional labels, one per region. Defaults to
            'Checkbox 1', 'Checkbox 2', ...

    Returns:
        The label of the darkest region when it is more than 1.5x darker
        than the runner-up (or when it is the only region); otherwise a
        string naming the top two candidates as uncertain.

    Raises:
        ValueError: if no region (or no label to pair it with) is given.

    Usage:
        checked = which_is_checked(region1, region2, region3)
        print(checked)  # "Region 2"

        # With labels:
        checked = which_is_checked(accept_cb, deficient_cb, risk_cb,
                                   labels=['Acceptable', 'Deficient', 'At-Risk'])
        print(checked)  # "Deficient"
    """
    if labels is None:
        labels = [f'Checkbox {i+1}' for i in range(len(regions))]

    # Quick analysis of each
    scores = []
    for region, label in zip(regions, labels):
        img = np.array(region.render(crop=True).convert('L'))

        # Just check center darkness - simplest reliable method
        h, w = img.shape
        cy, cx = h // 2, w // 2
        center = img[max(0, cy - 3):min(h, cy + 4), max(0, cx - 3):min(w, cx + 4)]
        darkness = 255 - np.mean(center)  # Higher = darker

        scores.append((label, darkness))

    if not scores:
        raise ValueError("which_is_checked() needs at least one region")

    # Sort by darkness, darkest first
    scores.sort(key=lambda item: item[1], reverse=True)

    # With a single candidate there is no runner-up to compare against.
    if len(scores) == 1:
        return scores[0][0]

    (best_label, best_darkness), (next_label, next_darkness) = scores[0], scores[1]

    if best_darkness > next_darkness * 1.5:  # Clear winner
        return best_label

    # Not clear - return with uncertainty
    return f"{best_label} (uncertain - also check {next_label})"
467
+
468
+
469
def show_checkbox_comparison(*regions, labels=None):
    """
    Visual comparison of checkboxes - shows which is darkest.

    Prints a ranking of the regions by overall darkness (255 minus the mean
    grayscale value) together with a text bar chart and, when at least two
    regions are given, a confidence note. Nothing is returned.

    Args:
        *regions: Checkbox regions; each must provide ``render(crop=True)``
            returning an image with ``convert('L')``.
        labels: Optional labels, one per region. Defaults to
            'Option 1', 'Option 2', ...

    Usage:
        show_checkbox_comparison(cb1, cb2, cb3, labels=['A', 'B', 'C'])
    """
    if labels is None:
        labels = [f'Option {i+1}' for i in range(len(regions))]

    print("\nCheckbox Analysis:")
    print("-" * 40)

    results = []
    for region, label in zip(regions, labels):
        img = np.array(region.render(crop=True).convert('L'))

        # Simple metrics
        results.append({
            'label': label,
            'darkness': 255 - np.mean(img),
            'dark_pixels': np.sum(img < 200),
        })

    # Nothing to rank: say so instead of failing on results[0] / max().
    if not results:
        print("No checkboxes to compare")
        return

    # Sort by darkness, darkest first
    results.sort(key=lambda entry: entry['darkness'], reverse=True)

    # Show results
    print(f"Most likely checked: {results[0]['label']}")
    print()

    # Simple bar chart, scaled so the darkest region gets 30 blocks
    max_darkness = results[0]['darkness']
    for r in results:
        bar_length = int(30 * r['darkness'] / max_darkness) if max_darkness > 0 else 0
        bar = '█' * bar_length
        print(f"{r['label']:15} {bar} {r['darkness']:.0f}")

    # Confidence check
    if len(results) >= 2:
        first, second = results[0], results[1]
        # A runner-up with zero darkness cannot be divided by; use a ratio
        # of 10 so it counts as a clear win.
        ratio = first['darkness'] / second['darkness'] if second['darkness'] > 0 else 10
        if ratio < 1.3:
            print(f"\n⚠️ Low confidence - {second['label']} is almost as dark")
        elif ratio > 2:
            print(f"\n✓ High confidence - clearly {first['label']}")
517
+
518
+
519
def is_this_checked(region, reference_checked=None, reference_unchecked=None):
    """
    Check if a single checkbox is marked, optionally using reference examples.

    Darkness is 255 minus the mean grayscale value of the rendered region.
    Without references the region counts as checked when its darkness
    exceeds 40. With references, the cut-off is the midpoint between the
    darkness of the checked and unchecked examples; a missing reference
    falls back to 60 (checked) or 20 (unchecked).

    Args:
        region: Region to test; must provide ``render(crop=True)`` returning
            an image with ``convert('L')``.
        reference_checked: Optional region known to be checked.
        reference_unchecked: Optional region known to be empty.

    Returns:
        True when the region is darker than the applicable cut-off.

    Usage:
        # Simple check
        if is_this_checked(my_checkbox):
            print("It's checked!")

        # With references for better accuracy
        if is_this_checked(my_checkbox,
                           reference_checked=known_checked_cb,
                           reference_unchecked=known_empty_cb):
            print("It's checked!")
    """
    def _darkness(target):
        # Higher value = darker render.
        return 255 - np.mean(np.array(target.render(crop=True).convert('L')))

    darkness = _darkness(region)

    if reference_checked is None and reference_unchecked is None:
        # Simple threshold
        return darkness > 40  # Adjust based on your forms

    # Compare to references. Test against None explicitly: a reference object
    # that happens to evaluate as falsy must still be used.
    if reference_checked is not None:
        checked_darkness = _darkness(reference_checked)
    else:
        checked_darkness = 60  # Default

    if reference_unchecked is not None:
        unchecked_darkness = _darkness(reference_unchecked)
    else:
        unchecked_darkness = 20  # Default

    # Which is it closer to?
    threshold = (checked_darkness + unchecked_darkness) / 2
    return darkness > threshold
557
+
558
+
559
def debug_regions(regions, labels=None):
    """
    Debug function to check what's in each region.

    For every region, prints the object, its bbox (if it has one), basic
    statistics of the rendered grayscale image and a small sample around
    the center, then saves the grayscale image as a PNG in the system
    temporary directory.

    Args:
        regions: Regions to inspect; each must provide ``render(crop=True)``
            returning an image with ``convert('L')``.
        labels: Optional labels, one per region. Defaults to
            'Region_1', 'Region_2', ...
    """
    import os
    import tempfile

    from PIL import Image

    if labels is None:
        labels = [f'Region_{i+1}' for i in range(len(regions))]

    # Use the platform's temp directory rather than a hard-coded /tmp.
    output_dir = tempfile.gettempdir()

    print("Debugging regions:")
    print("=" * 60)

    for region, label in zip(regions, labels):
        print(f"\n{label}:")
        print(f" Region object: {region}")
        print(f" Bbox: {region.bbox if hasattr(region, 'bbox') else 'N/A'}")

        # Get the image
        img = np.array(region.render(crop=True).convert('L'))
        print(f" Image shape: {img.shape}")
        print(f" Image dtype: {img.dtype}")
        print(f" Unique values: {len(np.unique(img))}")
        print(f" Min/Max values: {img.min()}/{img.max()}")

        # Show a small sample of the center
        h, w = img.shape
        cy, cx = h // 2, w // 2
        sample = img[max(0, cy - 2):cy + 3, max(0, cx - 2):cx + 3]
        print(f" Center sample:\n{sample}")

        # Save images for inspection. No explicit mode is passed: the array
        # came from convert('L'), so Pillow infers a single-channel image.
        pil_img = Image.fromarray(img)
        safe_label = str(label).lower().replace(' ', '_').replace('-', '_')
        filename = os.path.join(output_dir, f"checkbox_{safe_label}.png")
        pil_img.save(filename)
        print(f" Saved to: {filename}")
@@ -0,0 +1,117 @@
1
+ """Simplified checkbox detection - just the most effective methods."""
2
+
3
+ import numpy as np
4
+
5
+
6
def is_checkbox_marked_simple(region, method='center'):
    """
    Decide whether a checkbox is marked with one of a few cheap heuristics.

    Methods:
    - 'center': mean of the 5x5 center patch is below 190
    - 'flood': fewer than 30% of the pixels are reachable from the center
      through pixels brighter than 200 (4-connected)
    - 'both': 'center' and 'flood' must both say marked
    - anything else: more than 15% of the pixels are darker than 200
    """
    gray = np.array(region.render(crop=True).convert('L'))

    if method == 'both':
        # Each sub-check renders the region again on its own.
        center_vote = is_checkbox_marked_simple(region, 'center')
        flood_vote = is_checkbox_marked_simple(region, 'flood')
        return center_vote and flood_vote

    if method == 'center':
        rows, cols = gray.shape
        mid_r, mid_c = rows // 2, cols // 2
        # 5x5 window around the middle, clipped to the image
        patch = gray[max(0, mid_r - 2):min(rows, mid_r + 3),
                     max(0, mid_c - 2):min(cols, mid_c + 3)]
        return np.mean(patch) < 190  # Threshold can be tuned

    if method == 'flood':
        rows, cols = gray.shape
        area = rows * cols
        seed = (rows // 2, cols // 2)

        seen = np.zeros_like(gray, dtype=bool)
        seen[seed] = True
        pending = [seed]
        reached = 0
        steps = ((0, 1), (1, 0), (0, -1), (-1, 0))  # 4-neighbourhood

        # The area bound is a safety limit on the number of popped pixels.
        while pending and reached < area:
            r, c = pending.pop()
            reached += 1
            for dr, dc in steps:
                rr, cc = r + dr, c + dc
                if not (0 <= rr < rows and 0 <= cc < cols):
                    continue
                if seen[rr, cc] or not gray[rr, cc] > 200:
                    continue
                seen[rr, cc] = True
                pending.append((rr, cc))

        # Less than 30% fillable = marked
        return reached / area < 0.3

    # Fallback for any other method name: plain dark-pixel share.
    dark_share = np.sum(gray < 200) / gray.size
    return dark_share > 0.15  # More than 15% dark pixels
63
+
64
+
65
def analyze_checkboxes_simple(regions, labels=None):
    """
    Quick analysis of multiple checkboxes.

    Args:
        regions: Checkbox regions; each must provide ``render(crop=True)``
            returning an image with ``convert('L')``.
        labels: Optional labels, one per region. Defaults to 'CB1', 'CB2', ...

    Returns:
        dict with keys:
            'results'            - per-label dict of 'center_darkness',
                                   'dark_pixel_ratio' and 'is_marked'
            'most_likely_marked' - label with the darkest (lowest mean)
                                   center, or None when no region was given
            'marked_checkboxes'  - labels whose 'is_marked' is true
    """
    if labels is None:
        labels = [f'CB{i+1}' for i in range(len(regions))]

    results = {}

    for region, label in zip(regions, labels):
        img = np.array(region.render(crop=True).convert('L'))
        h, w = img.shape

        # Get key metrics from the 5x5 center patch and the whole image
        cy, cx = h // 2, w // 2
        center_sample = img[max(0, cy - 2):min(h, cy + 3),
                            max(0, cx - 2):min(w, cx + 3)]

        results[label] = {
            'center_darkness': np.mean(center_sample),
            'dark_pixel_ratio': np.sum(img < 200) / img.size,
            'is_marked': is_checkbox_marked_simple(region, 'center'),
        }

    # Find which one is most likely marked (darkest center). default=None
    # keeps an empty input from raising inside min().
    most_likely = min(
        results,
        key=lambda name: results[name]['center_darkness'],
        default=None,
    )

    return {
        'results': results,
        'most_likely_marked': most_likely,
        'marked_checkboxes': [label for label, data in results.items() if data['is_marked']],
    }
98
+
99
+
100
+ # Usage example:
101
if __name__ == "__main__":
    # Running the module directly only prints a banner and the usage snippet
    # below; no detection is executed.
    # NOTE(review): the indentation inside the printed text is reconstructed
    # from a diff rendering that had lost it - confirm against the real file.
    print("Simple Checkbox Detection")
    print("=" * 50)
    print("""
    # Single checkbox
    is_marked = is_checkbox_marked_simple(checkbox_region)

    # Multiple checkboxes
    result = analyze_checkboxes_simple([cb1, cb2, cb3],
    ['Acceptable', 'Deficient', 'At-Risk'])
    print(result['most_likely_marked']) # 'Acceptable'
    print(result['marked_checkboxes']) # ['Acceptable']

    # Just check center (fastest)
    if is_checkbox_marked_simple(region, method='center'):
    print("Checkbox is marked!")
    """)