natural-pdf 0.2.18__py3-none-any.whl → 0.2.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +8 -0
- natural_pdf/analyzers/checkbox/__init__.py +6 -0
- natural_pdf/analyzers/checkbox/base.py +265 -0
- natural_pdf/analyzers/checkbox/checkbox_analyzer.py +329 -0
- natural_pdf/analyzers/checkbox/checkbox_manager.py +166 -0
- natural_pdf/analyzers/checkbox/checkbox_options.py +60 -0
- natural_pdf/analyzers/checkbox/mixin.py +95 -0
- natural_pdf/analyzers/checkbox/rtdetr.py +201 -0
- natural_pdf/analyzers/guides.py +26 -2
- natural_pdf/collections/mixins.py +14 -5
- natural_pdf/core/element_manager.py +5 -1
- natural_pdf/core/page.py +61 -0
- natural_pdf/core/page_collection.py +41 -1
- natural_pdf/core/pdf.py +24 -1
- natural_pdf/describe/base.py +20 -0
- natural_pdf/elements/base.py +152 -10
- natural_pdf/elements/element_collection.py +41 -2
- natural_pdf/elements/region.py +115 -2
- natural_pdf/judge.py +1509 -0
- natural_pdf/selectors/parser.py +42 -1
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.20.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.20.dist-info}/RECORD +42 -18
- temp/check_model.py +49 -0
- temp/check_pdf_content.py +9 -0
- temp/checkbox_checks.py +590 -0
- temp/checkbox_simple.py +117 -0
- temp/checkbox_ux_ideas.py +400 -0
- temp/context_manager_prototype.py +177 -0
- temp/convert_to_hf.py +60 -0
- temp/demo_text_closest.py +66 -0
- temp/inspect_model.py +43 -0
- temp/rtdetr_dinov2_test.py +49 -0
- temp/test_closest_debug.py +26 -0
- temp/test_closest_debug2.py +22 -0
- temp/test_context_exploration.py +85 -0
- temp/test_durham.py +30 -0
- temp/test_empty_string.py +16 -0
- temp/test_similarity.py +15 -0
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.20.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.20.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.20.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.20.dist-info}/top_level.txt +0 -0
temp/checkbox_checks.py
ADDED
@@ -0,0 +1,590 @@
|
|
1
|
+
"""Checkbox detection using flood fill from center - X vs empty box."""
|
2
|
+
|
3
|
+
import numpy as np
|
4
|
+
from collections import deque
|
5
|
+
|
6
|
+
|
7
|
+
|
8
|
+
|
9
|
+
def flood_fill_size(image, start_x, start_y, threshold=200):
    """
    Flood fill from a point and return the size of the filled region.

    Larger region = empty box. Smaller region = X or other mark.

    Args:
        image: 2-D grayscale array, or a 3-D array whose last axis is
            averaged down to a single uint8 channel before filling.
        start_x: Column index of the seed pixel.
        start_y: Row index of the seed pixel.
        threshold: Pixels with a value >= threshold count as "light"
            (fillable); anything below is treated as ink.

    Returns:
        Number of light pixels 8-connected to the seed. Returns 0 when the
        seed lies outside the image or sits on an ink pixel.
    """
    if isinstance(image, np.ndarray) and len(image.shape) == 3:
        image = np.mean(image, axis=2).astype(np.uint8)

    height, width = image.shape

    # Reject out-of-range seeds on BOTH sides: a negative index would not
    # raise, it would silently wrap around to the opposite image edge.
    if not (0 <= start_x < width and 0 <= start_y < height):
        return 0

    # An ink seed has no light region to fill. Without this check the seed
    # would be counted and the fill would leak into adjacent light pixels.
    if image[start_y, start_x] < threshold:
        return 0

    visited = np.zeros_like(image, dtype=bool)
    visited[start_y, start_x] = True
    queue = deque([(start_x, start_y)])
    count = 0

    # 8-directional flood fill
    directions = [(-1, -1), (-1, 0), (-1, 1), (0, -1),
                  (0, 1), (1, -1), (1, 0), (1, 1)]

    while queue:
        x, y = queue.popleft()
        count += 1

        for dx, dy in directions:
            nx, ny = x + dx, y + dy
            if not (0 <= nx < width and 0 <= ny < height):
                continue
            # Mark on enqueue so each pixel enters the queue at most once.
            if not visited[ny, nx] and image[ny, nx] >= threshold:
                visited[ny, nx] = True
                queue.append((nx, ny))

    return count
|
51
|
+
|
52
|
+
|
53
|
+
def detect_checkbox_flood(region, debug=False):
    """
    Detect if checkbox is checked using flood fill from center.

    Empty box = large flood region. X = small flood region.

    Args:
        region: Object exposing ``render(crop=True)`` whose result has
            ``convert('L')`` (presumably a PIL image - confirm against the
            region type in use).
        debug: When True, print the intermediate flood sizes and decision.

    Returns:
        dict with keys:
            'is_checked'  - True when the best flood covers < 20% of the box
            'is_empty'    - True when the best flood covers > 50% of the box
            'flood_ratio' - best flood size / total pixels
            'confidence'  - distance of flood_ratio from the uncertain
                            midpoint 0.35, scaled and clamped to [0, 1]
    """
    # Convert to grayscale numpy array
    img = np.array(region.render(crop=True).convert('L'))
    height, width = img.shape
    total_pixels = width * height

    # A zero-size render has nothing to classify; bail out before dividing.
    if total_pixels == 0:
        return {
            'is_checked': False,
            'is_empty': False,
            'flood_ratio': 0.0,
            'confidence': 0.0,
        }

    # Start flood fill from center
    center_x, center_y = width // 2, height // 2
    flood_size = flood_fill_size(img, center_x, center_y)
    flood_ratio = flood_size / total_pixels

    # Also try a few points near center in case center pixel is dark
    alt_points = [
        (center_x - 2, center_y),
        (center_x + 2, center_y),
        (center_x, center_y - 2),
        (center_x, center_y + 2),
    ]

    max_flood_size = flood_size
    for x, y in alt_points:
        if 0 <= x < width and 0 <= y < height:
            max_flood_size = max(max_flood_size, flood_fill_size(img, x, y))

    max_flood_ratio = max_flood_size / total_pixels

    # Decision logic:
    # - If flood fills >50% of box, it's likely empty
    # - If flood fills <20% of box, it's likely marked with X
    # Ratios between the two bounds are reported as neither.
    is_empty = max_flood_ratio > 0.5
    is_marked = max_flood_ratio < 0.2

    if debug:
        print("Flood fill results:")
        print(f" Center flood: {flood_size} pixels ({flood_ratio:.1%})")
        print(f" Max flood: {max_flood_size} pixels ({max_flood_ratio:.1%})")
        print(f" Decision: {'EMPTY' if is_empty else 'MARKED' if is_marked else 'UNCERTAIN'}")

    # Unclamped, a fully fillable box (ratio 1.0) would score about 1.86,
    # which is not a meaningful confidence; cap the score at 1.0.
    confidence = min(1.0, abs(max_flood_ratio - 0.35) / 0.35)

    return {
        'is_checked': is_marked,
        'is_empty': is_empty,
        'flood_ratio': max_flood_ratio,
        'confidence': confidence,
    }
|
104
|
+
|
105
|
+
|
106
|
+
def simple_center_darkness(region, sample_size=5):
    """
    Even simpler: just check if center region is dark.

    Good for quick X detection.

    Args:
        region: Object exposing ``render(crop=True)`` whose result has
            ``convert('L')``.
        sample_size: Side length, in pixels, of the square patch sampled
            around the image center (clipped to the image bounds).

    Returns:
        True when the mean gray value of the center patch is below 200.
    """
    # Render the same way every other helper in this module does; the
    # previously referenced get_region_image() helper is not defined here.
    img = np.array(region.render(crop=True).convert('L'))
    height, width = img.shape

    # Sample center region
    cy, cx = height // 2, width // 2
    half = sample_size // 2

    # Extract center patch, clipped so small images do not index out of range
    y1 = max(0, cy - half)
    y2 = min(height, cy + half + 1)
    x1 = max(0, cx - half)
    x2 = min(width, cx + half + 1)

    center_patch = img[y1:y2, x1:x2]
    center_darkness = np.mean(center_patch)

    # X marks usually have dark center
    return center_darkness < 200
|
129
|
+
|
130
|
+
|
131
|
+
def ink_blob_analysis(region, threshold=200):
    """
    Analyze connected components of ink.

    X pattern typically has 1 large connected component.
    Empty box has small scattered components (noise).

    Args:
        region: Object exposing ``render(crop=True)`` whose result has
            ``convert('L')``.
        threshold: Gray values strictly below this are treated as ink.

    Returns:
        dict with keys:
            'is_x_pattern'       - at most 3 blobs and the largest holds
                                   more than 60% of all ink pixels
            'num_blobs'          - number of blobs larger than 10 pixels
            'largest_blob_ratio' - largest blob size / total ink (0 if no ink)
            'total_ink_ratio'    - ink pixels / all pixels
    """
    # Render the same way every other helper in this module does; the
    # previously referenced get_region_image() helper is not defined here.
    img = np.array(region.render(crop=True).convert('L'))
    binary = img < threshold
    rows, cols = binary.shape

    # Simple connected component analysis without scipy
    visited = np.zeros_like(binary, dtype=bool)
    blob_sizes = []

    def get_blob_size(start_y, start_x):
        """Iterative flood fill to get connected component size."""
        # Iterative (explicit stack) to avoid hitting the recursion limit.
        if visited[start_y, start_x] or not binary[start_y, start_x]:
            return 0

        stack = [(start_y, start_x)]
        visited[start_y, start_x] = True
        size = 0

        while stack:
            y, x = stack.pop()
            size += 1

            # Check 8 neighbors
            for dy in (-1, 0, 1):
                for dx in (-1, 0, 1):
                    if dy == 0 and dx == 0:
                        continue
                    ny, nx = y + dy, x + dx
                    if (0 <= ny < rows and 0 <= nx < cols
                            and not visited[ny, nx] and binary[ny, nx]):
                        visited[ny, nx] = True
                        stack.append((ny, nx))

        return size

    # Find all blobs
    for y in range(rows):
        for x in range(cols):
            if binary[y, x] and not visited[y, x]:
                blob_size = get_blob_size(y, x)
                if blob_size > 10:  # Ignore tiny noise
                    blob_sizes.append(blob_size)

    # Analysis
    total_ink = np.sum(binary)
    largest_blob = max(blob_sizes) if blob_sizes else 0
    num_blobs = len(blob_sizes)

    # X typically has 1-2 large blobs, empty box has many small ones
    is_x_pattern = (num_blobs <= 3 and largest_blob > total_ink * 0.6)

    return {
        'is_x_pattern': is_x_pattern,
        'num_blobs': num_blobs,
        'largest_blob_ratio': largest_blob / total_ink if total_ink > 0 else 0,
        # Guard the zero-size image case so the ratio is 0 instead of NaN.
        'total_ink_ratio': total_ink / binary.size if binary.size else 0,
    }
|
196
|
+
|
197
|
+
|
198
|
+
def analyze_checkbox(region, method='flood', debug=False):
    """
    Decide whether a checkbox region is marked, using the chosen strategy.

    Methods:
    - 'flood': Flood fill from center (default)
    - 'center': Check center darkness
    - 'blob': Connected component analysis
    - 'all': Run the three methods above and take a majority vote

    Args:
        region: Checkbox region handed through to the detector functions.
        method: One of 'flood', 'center', 'blob', 'all'.
        debug: Passed to the flood detector in 'flood' mode; in 'all' mode
            it enables a printed summary of the individual verdicts.

    Returns:
        Truthy when the selected method (or at least 2 of 3 in 'all' mode)
        considers the box marked.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    # Single-method modes return straight away.
    if method == 'flood':
        return detect_checkbox_flood(region, debug)['is_checked']
    if method == 'center':
        return simple_center_darkness(region)
    if method == 'blob':
        return ink_blob_analysis(region)['is_x_pattern']
    if method != 'all':
        raise ValueError(f"Unknown method: {method}")

    # 'all': gather one verdict per detector, then vote.
    flood_marked = detect_checkbox_flood(region, debug=False)['is_checked']
    center_marked = simple_center_darkness(region)
    blob_marked = ink_blob_analysis(region)['is_x_pattern']

    tally = sum((flood_marked, center_marked, blob_marked))

    if debug:
        flood_text = 'MARKED' if flood_marked else 'EMPTY'
        center_text = 'MARKED' if center_marked else 'EMPTY'
        blob_text = 'X PATTERN' if blob_marked else 'EMPTY/OTHER'
        print("All methods:")
        print(f" Flood fill: {flood_text}")
        print(f" Center darkness: {center_text}")
        print(f" Blob analysis: {blob_text}")
        print(f" Final vote: {tally}/3 say MARKED")

    return tally >= 2
|
242
|
+
|
243
|
+
|
244
|
+
|
245
|
+
|
246
|
+
def compare_checkboxes(regions, labels=None):
    """
    Compare multiple checkbox regions across all metrics.

    Args:
        regions: List of checkbox regions to analyze
        labels: Optional list of labels for each region (e.g., ['Acceptable', 'Deficient', 'At-Risk'])

    Returns:
        pandas DataFrame with metrics as rows and regions as columns. A final
        'most_likely_checked' row lists, for each region, the metrics under
        which that region ranks as the most likely checked one (an empty
        string when it wins none). With no regions the DataFrame has no
        metric rows and no summary row.
    """
    import pandas as pd

    if labels is None:
        labels = [f'Region_{i+1}' for i in range(len(regions))]

    # Initialize results dictionary
    results = {label: {} for label in labels}

    for region, label in zip(regions, labels):
        metrics = results[label]

        # Get grayscale image
        img = np.array(region.render(crop=True).convert('L'))
        height, width = img.shape
        total_pixels = height * width
        binary = img < 200  # ink mask shared by several metrics below

        # 1. Basic pixel metrics
        dark_pixels = np.sum(binary)
        mean_intensity = np.mean(img)
        metrics['dark_pixel_count'] = dark_pixels
        metrics['dark_pixel_ratio'] = dark_pixels / total_pixels
        metrics['mean_intensity'] = mean_intensity
        metrics['std_intensity'] = np.std(img)
        metrics['ink_score'] = (255 - mean_intensity) / 2.55

        # 2. Flood fill metrics
        flood_result = detect_checkbox_flood(region, debug=False)
        metrics['flood_ratio'] = flood_result['flood_ratio']
        metrics['flood_confidence'] = flood_result['confidence']
        metrics['is_empty_flood'] = flood_result['is_empty']
        metrics['is_marked_flood'] = flood_result['is_checked']

        # 3. Center darkness (5x5 patch around the center, clipped to bounds)
        cy, cx = height // 2, width // 2
        center_patch = img[max(0, cy-2):min(height, cy+3), max(0, cx-2):min(width, cx+3)]
        center_mean = np.mean(center_patch)
        metrics['center_darkness'] = center_mean
        metrics['is_marked_center'] = center_mean < 200

        # 4. Blob analysis
        blob_result = ink_blob_analysis(region)
        metrics['num_blobs'] = blob_result['num_blobs']
        metrics['largest_blob_ratio'] = blob_result['largest_blob_ratio']
        metrics['total_ink_ratio'] = blob_result['total_ink_ratio']
        metrics['is_x_pattern'] = blob_result['is_x_pattern']

        # 5. Spatial distribution
        # Check if ink is concentrated in diagonal patterns (X shape)
        diag_len = min(height, width)
        idx = np.arange(diag_len)
        diag1_sum = np.sum(binary[idx, idx])              # main diagonal
        diag2_sum = np.sum(binary[idx, width - 1 - idx])  # anti-diagonal
        metrics['diagonal_ink_ratio'] = (
            (diag1_sum + diag2_sum) / (2 * diag_len) if diag_len else 0
        )

        # 6. Edge vs center distribution (3-pixel frame counts as "edge")
        edge_mask = np.zeros_like(binary)
        edge_mask[0:3, :] = True
        edge_mask[-3:, :] = True
        edge_mask[:, 0:3] = True
        edge_mask[:, -3:] = True
        edge_area = np.sum(edge_mask)
        inner_area = np.sum(~edge_mask)
        edge_ink = np.sum(binary & edge_mask)
        center_ink = np.sum(binary & ~edge_mask)
        metrics['edge_ink_ratio'] = edge_ink / edge_area if edge_area > 0 else 0
        metrics['center_ink_ratio'] = center_ink / inner_area if inner_area > 0 else 0

        # 7. Final voting
        votes = sum([
            metrics['is_marked_flood'],
            metrics['is_marked_center'],
            metrics['is_x_pattern']
        ])
        metrics['vote_score'] = votes
        metrics['is_checked'] = votes >= 2

    # Create DataFrame
    df = pd.DataFrame(results)

    # Nothing was analyzed (no regions): there are no metric rows to rank.
    if df.empty:
        return df

    def _winner(metric, lowest=False):
        # Columns mix bools and floats, so each metric row is object dtype;
        # cast to float so idxmax/idxmin work on every pandas version.
        row = df.loc[metric].astype(float)
        return row.idxmin() if lowest else row.idxmax()

    # For each ranking metric, the region label that looks most "checked"
    most_checked_scores = {
        'dark_pixel_ratio': _winner('dark_pixel_ratio'),
        'ink_score': _winner('ink_score'),
        'flood_ratio': _winner('flood_ratio', lowest=True),  # Lower is more marked
        'center_darkness': _winner('center_darkness', lowest=True),  # Lower is darker
        'diagonal_ink_ratio': _winner('diagonal_ink_ratio'),
        'vote_score': _winner('vote_score'),
    }

    # Add summary row. The dict above is keyed by metric name while the
    # DataFrame columns are region labels, so invert the mapping instead of
    # looking a label up as if it were a metric (which always came back empty).
    df.loc['most_likely_checked'] = [
        ', '.join(metric for metric, winner in most_checked_scores.items() if winner == col)
        for col in df.columns
    ]

    return df
|
346
|
+
|
347
|
+
|
348
|
+
# Example usage scaffold
|
349
|
+
def analyze_three_checkboxes():
    """
    Example scaffold for analyzing three checkbox regions.

    Loads "your_form.pdf", locates one checkbox to the left of each of the
    three option labels, runs compare_checkboxes() on the boxes that were
    found, and prints the metric table plus a short interpretation.

    Returns:
        The DataFrame produced by compare_checkboxes(), or an empty
        DataFrame when none of the checkboxes could be located.

    Raises:
        ValueError: If the anchor text for the checkbox area is not found.
    """
    import natural_pdf as npdf
    import pandas as pd

    # Set pandas display options for better viewing
    # NOTE: these are process-wide pandas settings, not scoped to this call.
    pd.set_option('display.max_rows', None)
    pd.set_option('display.float_format', '{:.3f}'.format)

    # Load your PDF
    pdf = npdf.PDF("your_form.pdf")
    page = pdf[0]

    # Method 1: If you have the regions already
    # regions = [region1, region2, region3]
    # labels = ['Acceptable', 'Deficient', 'At-Risk']

    # Method 2: Find them relative to text labels
    checkbox_area = page.find('text:contains("Housing")')  # Or however you identify the area
    # presumably find() returns None on a miss - confirm against natural_pdf
    if checkbox_area is None:
        raise ValueError('Could not find the checkbox area (text containing "Housing")')

    wanted_labels = ['Acceptable', 'Deficient', 'At-Risk']
    regions = []
    labels = []  # kept in lockstep with `regions`

    for label in wanted_labels:
        anchor = checkbox_area.find(f'text={label}')
        cb = anchor.left(width=15) if anchor is not None else None
        if cb:
            regions.append(cb)
            # Record the label only for boxes that were found; otherwise a
            # miss would shift every later region onto the wrong label.
            labels.append(label)
        else:
            print(f"Warning: Could not find checkbox for {label}")

    if not regions:
        print("No boxes appear to be checked")
        return pd.DataFrame()

    # Run comparison
    df = compare_checkboxes(regions, labels)

    # Display results
    print("\nCheckbox Analysis Results:")
    print("=" * 80)
    print(df)

    print("\n\nKey Metrics Interpretation:")
    print("-" * 80)
    print("dark_pixel_ratio: Higher = more ink")
    print("ink_score: 0-100, higher = more marking")
    print("flood_ratio: Lower = more marked (ink blocks flood fill)")
    print("center_darkness: Lower = darker center (X pattern)")
    print("diagonal_ink_ratio: Higher = more X-like pattern")
    print("vote_score: 0-3, number of methods that think it's checked")

    print("\n\nConclusion:")
    print("-" * 80)
    checked_col = df.loc['is_checked']
    checked_labels = [col for col in checked_col.index if checked_col[col]]
    if checked_labels:
        print(f"Checked boxes: {', '.join(checked_labels)}")
    else:
        print("No boxes appear to be checked")

    # Find most likely based on vote
    vote_scores = df.loc['vote_score']
    max_vote = vote_scores.max()
    if max_vote > 0:
        # Cast before idxmax: the row is object dtype because the table
        # mixes bools, numbers and the string summary row.
        most_likely = vote_scores.astype(float).idxmax()
        print(f"Most likely checked (by vote): {most_likely}")

    return df
|
415
|
+
|
416
|
+
|
417
|
+
# Quick test function for three regions
|
418
|
+
def quick_compare(region1, region2, region3, labels=('Region 1', 'Region 2', 'Region 3')):
    """
    Quick comparison of three regions.

    Thin wrapper that forwards the three regions, in order, to
    compare_checkboxes().

    Args:
        region1, region2, region3: The checkbox regions to compare.
        labels: One label per region. The default is an immutable tuple so
            it cannot be shared and mutated between calls; passing None
            lets compare_checkboxes() generate its own labels.

    Returns:
        Whatever compare_checkboxes() returns for the three regions.

    Example:
        df = quick_compare(acceptable_cb, deficient_cb, at_risk_cb,
                           ['Acceptable', 'Deficient', 'At-Risk'])
    """
    label_list = None if labels is None else list(labels)
    return compare_checkboxes([region1, region2, region3], label_list)
|
427
|
+
|
428
|
+
|
429
|
+
def which_is_checked(*regions, labels=None):
    """
    Simple function that tells you which checkbox is checked.

    Each region is scored by the darkness (255 - mean gray value) of a 7x7
    patch around its center, and the darkest one wins.

    Usage:
        checked = which_is_checked(region1, region2, region3)
        print(checked)  # "Region 2"

        # With labels:
        checked = which_is_checked(accept_cb, deficient_cb, risk_cb,
                                   labels=['Acceptable', 'Deficient', 'At-Risk'])
        print(checked)  # "Deficient"

    Args:
        *regions: Checkbox regions exposing ``render(crop=True)``.
        labels: Optional labels, one per region. Defaults to
            'Checkbox 1', 'Checkbox 2', ...

    Returns:
        The winning label when it is more than 1.5x darker than the
        runner-up (or is the only candidate); otherwise a string naming the
        winner together with the runner-up as an uncertain result.

    Raises:
        ValueError: If no region could be scored (nothing was passed in).
    """
    if labels is None:
        labels = [f'Checkbox {i+1}' for i in range(len(regions))]

    # Quick analysis of each
    scores = []
    for region, label in zip(regions, labels):
        img = np.array(region.render(crop=True).convert('L'))

        # Just check center darkness - simplest reliable method
        h, w = img.shape
        cy, cx = h // 2, w // 2
        center = img[max(0, cy-3):min(h, cy+4), max(0, cx-3):min(w, cx+4)]
        darkness = 255 - np.mean(center)  # Higher = darker

        scores.append((label, darkness))

    if not scores:
        raise ValueError("which_is_checked() needs at least one region")

    # Sort by darkness (darkest first; ties keep their input order)
    scores.sort(key=lambda x: x[1], reverse=True)

    # A lone candidate has no runner-up to compare against.
    if len(scores) == 1:
        return scores[0][0]

    # Simple decision
    if scores[0][1] > scores[1][1] * 1.5:  # Clear winner
        return scores[0][0]

    # Not clear - return with uncertainty
    return f"{scores[0][0]} (uncertain - also check {scores[1][0]})"
|
467
|
+
|
468
|
+
|
469
|
+
def show_checkbox_comparison(*regions, labels=None):
    """
    Visual comparison of checkboxes - shows which is darkest.

    Prints the most likely checked label, a text bar chart of overall
    darkness (255 - mean gray value) per region, and a confidence note
    based on how much darker the winner is than the runner-up.

    Usage:
        show_checkbox_comparison(cb1, cb2, cb3, labels=['A', 'B', 'C'])

    Args:
        *regions: Checkbox regions exposing ``render(crop=True)``.
        labels: Optional labels, one per region. Defaults to
            'Option 1', 'Option 2', ...

    Returns:
        None. All output goes to stdout.
    """
    if labels is None:
        labels = [f'Option {i+1}' for i in range(len(regions))]

    print("\nCheckbox Analysis:")
    print("-" * 40)

    results = []
    for region, label in zip(regions, labels):
        img = np.array(region.render(crop=True).convert('L'))

        # Simple metrics
        results.append({
            'label': label,
            'darkness': 255 - np.mean(img),
            'dark_pixels': np.sum(img < 200),
        })

    # Nothing to rank: say so instead of failing on results[0] below.
    if not results:
        print("No checkboxes to compare")
        return

    # Sort by darkness
    results.sort(key=lambda x: x['darkness'], reverse=True)

    # Show results
    print(f"Most likely checked: {results[0]['label']}")
    print()

    # Simple bar chart, scaled so the darkest region gets a 30-char bar
    max_darkness = max(r['darkness'] for r in results)
    for r in results:
        bar_length = int(30 * r['darkness'] / max_darkness) if max_darkness > 0 else 0
        bar = '█' * bar_length
        print(f"{r['label']:15} {bar} {r['darkness']:.0f}")

    # Confidence check
    if len(results) >= 2:
        # A runner-up with zero darkness is treated as a decisive 10x gap.
        ratio = results[0]['darkness'] / results[1]['darkness'] if results[1]['darkness'] > 0 else 10
        if ratio < 1.3:
            print(f"\n⚠️ Low confidence - {results[1]['label']} is almost as dark")
        elif ratio > 2:
            print(f"\n✓ High confidence - clearly {results[0]['label']}")
|
517
|
+
|
518
|
+
|
519
|
+
def is_this_checked(region, reference_checked=None, reference_unchecked=None):
    """
    Check if a single checkbox is marked, optionally using reference examples.

    Darkness is measured as 255 minus the mean gray value of the rendered
    region. Without references the region counts as checked when its
    darkness exceeds 40. With references the cut-off is the midpoint between
    the checked and unchecked reference darkness; a reference that is not
    supplied falls back to 60 (checked) or 20 (unchecked).

    Usage:
        # Simple check
        if is_this_checked(my_checkbox):
            print("It's checked!")

        # With references for better accuracy
        if is_this_checked(my_checkbox,
                           reference_checked=known_checked_cb,
                           reference_unchecked=known_empty_cb):
            print("It's checked!")

    Args:
        region: Checkbox region exposing ``render(crop=True)``.
        reference_checked: Optional region known to be checked.
        reference_unchecked: Optional region known to be empty.

    Returns:
        Truthy when the region is darker than the applicable threshold.
    """
    def _darkness(target):
        # Mean-based darkness of the whole rendered region (0 = pure white).
        return 255 - np.mean(np.array(target.render(crop=True).convert('L')))

    darkness = _darkness(region)

    if reference_checked is None and reference_unchecked is None:
        # Simple threshold
        return darkness > 40  # Adjust based on your forms

    # Compare to references. Test against None explicitly (matching the
    # check above): a reference object that happens to be falsy must still
    # be used rather than silently replaced by the default.
    checked_darkness = _darkness(reference_checked) if reference_checked is not None else 60
    unchecked_darkness = _darkness(reference_unchecked) if reference_unchecked is not None else 20

    # Which is it closer to?
    threshold = (checked_darkness + unchecked_darkness) / 2
    return darkness > threshold
|
557
|
+
|
558
|
+
|
559
|
+
def debug_regions(regions, labels=None):
    """
    Debug function to check what's in each region.

    For every region this prints its repr, its bbox (when the attribute
    exists), basic statistics of the grayscale render and a small sample of
    center pixels, then saves the grayscale render as a PNG in the system
    temporary directory.

    Args:
        regions: Regions exposing ``render(crop=True)``.
        labels: Optional labels, one per region. Defaults to
            'Region_1', 'Region_2', ...

    Returns:
        None. Output goes to stdout and to PNG files on disk.
    """
    import os
    import tempfile

    from PIL import Image

    if labels is None:
        labels = [f'Region_{i+1}' for i in range(len(regions))]

    # Resolve the temp directory once instead of hard-coding "/tmp", which
    # does not exist on every platform.
    out_dir = tempfile.gettempdir()

    print("Debugging regions:")
    print("=" * 60)

    for region, label in zip(regions, labels):
        print(f"\n{label}:")
        print(f" Region object: {region}")
        print(f" Bbox: {region.bbox if hasattr(region, 'bbox') else 'N/A'}")

        # Get the image
        img = np.array(region.render(crop=True).convert('L'))
        print(f" Image shape: {img.shape}")
        print(f" Image dtype: {img.dtype}")
        print(f" Unique values: {len(np.unique(img))}")
        print(f" Min/Max values: {img.min()}/{img.max()}")

        # Show a small sample of the center
        h, w = img.shape
        cy, cx = h//2, w//2
        sample = img[max(0,cy-2):cy+3, max(0,cx-2):cx+3]
        print(f" Center sample:\n{sample}")

        # Save images for inspection
        pil_img = Image.fromarray(img, mode='L')
        # str() keeps non-string labels (e.g. ints) from failing on .lower()
        safe_label = str(label).lower().replace(' ', '_').replace('-', '_')
        filename = os.path.join(out_dir, f"checkbox_{safe_label}.png")
        pil_img.save(filename)
        print(f" Saved to: {filename}")
|
temp/checkbox_simple.py
ADDED
@@ -0,0 +1,117 @@
|
|
1
|
+
"""Simplified checkbox detection - just the most effective methods."""
|
2
|
+
|
3
|
+
import numpy as np
|
4
|
+
|
5
|
+
|
6
|
+
def _center_is_dark(img, cutoff=190):
    """Return whether the mean of the 5x5 center patch of ``img`` is below ``cutoff``."""
    h, w = img.shape
    cy, cx = h // 2, w // 2
    # Sample 5x5 center region, clipped to the image bounds
    patch = img[max(0, cy-2):min(h, cy+3), max(0, cx-2):min(w, cx+3)]
    return np.mean(patch) < cutoff  # Threshold can be tuned


def _flood_is_blocked(img, light=200, max_ratio=0.3):
    """Return whether under ``max_ratio`` of ``img`` is 4-connected light area reachable from the center."""
    h, w = img.shape
    cy, cx = h // 2, w // 2

    # An ink pixel at the center means no light region starts there. If the
    # fill were seeded on it anyway, it would spill into all four light
    # neighbours and a thin X through the center would look like an empty box.
    if img[cy, cx] <= light:
        return True

    # Count pixels reachable from center
    visited = np.zeros_like(img, dtype=bool)
    visited[cy, cx] = True
    stack = [(cy, cx)]
    count = 0

    while stack:
        y, x = stack.pop()
        count += 1

        # Check 4 neighbors (simpler than 8)
        for dy, dx in ((0, 1), (1, 0), (0, -1), (-1, 0)):
            ny, nx = y + dy, x + dx
            if (0 <= ny < h and 0 <= nx < w and
                    not visited[ny, nx] and img[ny, nx] > light):
                # Mark on push so no pixel is counted twice.
                visited[ny, nx] = True
                stack.append((ny, nx))

    return count / (h * w) < max_ratio  # Less than 30% fillable = marked


def is_checkbox_marked_simple(region, method='center'):
    """
    Simplified checkbox detection using only the most effective methods.

    Methods:
    - 'center': Check if center is dark (fastest)
    - 'flood': Check flood fill ratio from the center
    - 'both': Use both and require agreement
    - anything else: fall back to the share of dark pixels (> 15% = marked)

    Args:
        region: Checkbox region exposing ``render(crop=True)`` whose result
            has ``convert('L')``.
        method: Detection strategy, see above.

    Returns:
        Truthy when the checkbox is considered marked.
    """
    # Render once and share the array between strategies; 'both' used to
    # re-render the region for each sub-check.
    img = np.array(region.render(crop=True).convert('L'))

    if method == 'center':
        return _center_is_dark(img)

    if method == 'flood':
        return _flood_is_blocked(img)

    if method == 'both':
        # Require both methods to agree
        return _center_is_dark(img) and _flood_is_blocked(img)

    # Ultra simple: just count dark pixels
    dark_ratio = np.sum(img < 200) / img.size
    return dark_ratio > 0.15  # More than 15% dark pixels
|
63
|
+
|
64
|
+
|
65
|
+
def analyze_checkboxes_simple(regions, labels=None):
    """
    Quick analysis of multiple checkboxes.

    Args:
        regions: Checkbox regions exposing ``render(crop=True)``.
        labels: Optional labels, one per region. Defaults to
            'CB1', 'CB2', ...

    Returns:
        dict with keys:
            'results'            - per-label dict of 'center_darkness' (mean
                                   gray value of the 5x5 center patch),
                                   'dark_pixel_ratio' and 'is_marked'
            'most_likely_marked' - label with the darkest center, or None
                                   when no region was analyzed
            'marked_checkboxes'  - labels whose 'is_marked' is truthy
    """
    if labels is None:
        labels = [f'CB{i+1}' for i in range(len(regions))]

    results = {}

    for region, label in zip(regions, labels):
        img = np.array(region.render(crop=True).convert('L'))
        h, w = img.shape

        # Get key metrics
        cy, cx = h // 2, w // 2
        center_sample = img[max(0, cy-2):min(h, cy+3),
                            max(0, cx-2):min(w, cx+3)]

        results[label] = {
            'center_darkness': np.mean(center_sample),
            'dark_pixel_ratio': np.sum(img < 200) / img.size,
            'is_marked': is_checkbox_marked_simple(region, 'center')
        }

    # Find which one is most likely marked (darkest center = lowest mean).
    # With no regions there is no candidate, so report None rather than
    # letting min() raise on an empty sequence.
    most_likely = (
        min(results, key=lambda name: results[name]['center_darkness'])
        if results else None
    )

    return {
        'results': results,
        'most_likely_marked': most_likely,
        'marked_checkboxes': [label for label, data in results.items() if data['is_marked']]
    }
|
98
|
+
|
99
|
+
|
100
|
+
# Usage example:
|
101
|
+
if __name__ == "__main__":
    # Running the module directly only prints a short how-to; nothing is analyzed.
    banner = "Simple Checkbox Detection"
    usage_text = """
# Single checkbox
is_marked = is_checkbox_marked_simple(checkbox_region)

# Multiple checkboxes
result = analyze_checkboxes_simple([cb1, cb2, cb3],
['Acceptable', 'Deficient', 'At-Risk'])
print(result['most_likely_marked']) # 'Acceptable'
print(result['marked_checkboxes']) # ['Acceptable']

# Just check center (fastest)
if is_checkbox_marked_simple(region, method='center'):
print("Checkbox is marked!")
"""
    print(banner)
    print("=" * 50)
    print(usage_text)
|