fleet-python 0.2.66b2__py3-none-any.whl → 0.2.105__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. examples/export_tasks.py +16 -5
  2. examples/export_tasks_filtered.py +245 -0
  3. examples/fetch_tasks.py +230 -0
  4. examples/import_tasks.py +140 -8
  5. examples/iterate_verifiers.py +725 -0
  6. fleet/__init__.py +128 -5
  7. fleet/_async/__init__.py +27 -3
  8. fleet/_async/base.py +24 -9
  9. fleet/_async/client.py +938 -41
  10. fleet/_async/env/client.py +60 -3
  11. fleet/_async/instance/client.py +52 -7
  12. fleet/_async/models.py +15 -0
  13. fleet/_async/resources/api.py +200 -0
  14. fleet/_async/resources/sqlite.py +1801 -46
  15. fleet/_async/tasks.py +122 -25
  16. fleet/_async/verifiers/bundler.py +22 -21
  17. fleet/_async/verifiers/verifier.py +25 -19
  18. fleet/agent/__init__.py +32 -0
  19. fleet/agent/gemini_cua/Dockerfile +45 -0
  20. fleet/agent/gemini_cua/__init__.py +10 -0
  21. fleet/agent/gemini_cua/agent.py +759 -0
  22. fleet/agent/gemini_cua/mcp/main.py +108 -0
  23. fleet/agent/gemini_cua/mcp_server/__init__.py +5 -0
  24. fleet/agent/gemini_cua/mcp_server/main.py +105 -0
  25. fleet/agent/gemini_cua/mcp_server/tools.py +178 -0
  26. fleet/agent/gemini_cua/requirements.txt +5 -0
  27. fleet/agent/gemini_cua/start.sh +30 -0
  28. fleet/agent/orchestrator.py +854 -0
  29. fleet/agent/types.py +49 -0
  30. fleet/agent/utils.py +34 -0
  31. fleet/base.py +34 -9
  32. fleet/cli.py +1061 -0
  33. fleet/client.py +1060 -48
  34. fleet/config.py +1 -1
  35. fleet/env/__init__.py +16 -0
  36. fleet/env/client.py +60 -3
  37. fleet/eval/__init__.py +15 -0
  38. fleet/eval/uploader.py +231 -0
  39. fleet/exceptions.py +8 -0
  40. fleet/instance/client.py +53 -8
  41. fleet/instance/models.py +1 -0
  42. fleet/models.py +303 -0
  43. fleet/proxy/__init__.py +25 -0
  44. fleet/proxy/proxy.py +453 -0
  45. fleet/proxy/whitelist.py +244 -0
  46. fleet/resources/api.py +200 -0
  47. fleet/resources/sqlite.py +1845 -46
  48. fleet/tasks.py +113 -20
  49. fleet/utils/__init__.py +7 -0
  50. fleet/utils/http_logging.py +178 -0
  51. fleet/utils/logging.py +13 -0
  52. fleet/utils/playwright.py +440 -0
  53. fleet/verifiers/bundler.py +22 -21
  54. fleet/verifiers/db.py +985 -1
  55. fleet/verifiers/decorator.py +1 -1
  56. fleet/verifiers/verifier.py +25 -19
  57. {fleet_python-0.2.66b2.dist-info → fleet_python-0.2.105.dist-info}/METADATA +28 -1
  58. fleet_python-0.2.105.dist-info/RECORD +115 -0
  59. {fleet_python-0.2.66b2.dist-info → fleet_python-0.2.105.dist-info}/WHEEL +1 -1
  60. fleet_python-0.2.105.dist-info/entry_points.txt +2 -0
  61. tests/test_app_method.py +85 -0
  62. tests/test_expect_exactly.py +4148 -0
  63. tests/test_expect_only.py +2593 -0
  64. tests/test_instance_dispatch.py +607 -0
  65. tests/test_sqlite_resource_dual_mode.py +263 -0
  66. tests/test_sqlite_shared_memory_behavior.py +117 -0
  67. fleet_python-0.2.66b2.dist-info/RECORD +0 -81
  68. tests/test_verifier_security.py +0 -427
  69. {fleet_python-0.2.66b2.dist-info → fleet_python-0.2.105.dist-info}/licenses/LICENSE +0 -0
  70. {fleet_python-0.2.66b2.dist-info → fleet_python-0.2.105.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,725 @@
1
import argparse
import json
import re
import sys
from typing import Dict, Tuple, Optional


# Marker comments used to round-trip "leading content" (imports, docstrings,
# helper definitions that precede a verifier function) when verifiers are
# written into a Python file and later parsed back out.
LEADING_CONTENT_START = "# @LEADING_CONTENT_START"
LEADING_CONTENT_END = "# @LEADING_CONTENT_END"
# Legacy markers for backwards compatibility with files produced by older
# versions of this tool (which only round-tripped docstrings).
LEADING_DOCSTRING_START = "# @LEADING_DOCSTRING_START"
LEADING_DOCSTRING_END = "# @LEADING_DOCSTRING_END"
14
+
15
+
16
def extract_function_info(function_code: str) -> Optional[Tuple[str, bool]]:
    """Locate the first function definition in *function_code*.

    Handles both regular functions (``def``) and async functions
    (``async def``), tolerating literal ``\\n`` escapes, Markdown code
    fences, and leading decorators.

    Args:
        function_code: Python function code as a string.

    Returns:
        ``(function_name, is_async)`` if a definition is found, else ``None``.
    """
    # Turn literal backslash-n escapes into real newlines, then trim.
    source = function_code.replace("\\n", "\n").strip()

    # If the snippet is wrapped in Markdown fences, work on the first block.
    if "```" in source:
        fenced = re.findall(r"```[a-zA-Z0-9_+-]*\n([\s\S]*?)\n```", source)
        if fenced:
            source = fenced[0].strip()

    # Anchored pattern tolerates optional decorators before the def;
    # the loose pattern is a last-resort scan anywhere in the text.
    anchored = r"^\s*(?:@[\w\.\n+() ,]*\n\s*)*(async\s+)?def\s+([A-Za-z_]\w*)\s*\("
    loose = r"(async\s+)?def\s+([A-Za-z_]\w*)\s*\("

    for pattern, flags in ((anchored, re.MULTILINE), (loose, 0)):
        found = re.search(pattern, source, flags=flags)
        if found:
            # group(1) is the optional "async " prefix; group(2) is the name.
            return (found.group(2), found.group(1) is not None)

    return None
54
+
55
+
56
def extract_leading_content(code: str) -> Tuple[Optional[str], str]:
    """Split off any preamble that precedes the first top-level function.

    The preamble is anything (docstrings, imports, helpers) appearing before
    the first unindented ``def``/``async def`` line.

    Args:
        code: The verifier code.

    Returns:
        ``(leading_content_or_None, function_code)``.
    """
    stripped = code.strip()
    all_lines = stripped.split("\n")

    # Index of the first column-0 function definition, or None if absent.
    def_index = next(
        (
            idx
            for idx, text in enumerate(all_lines)
            if text.startswith(("def ", "async def "))
        ),
        None,
    )

    # No function found, or the function starts on the first line:
    # there is no preamble to separate.
    if not def_index:
        return (None, stripped)

    preamble = all_lines[:def_index]
    # Drop blank lines trailing the preamble.
    while preamble and not preamble[-1].strip():
        preamble.pop()

    if not preamble:
        return (None, stripped)

    return ("\n".join(preamble), "\n".join(all_lines[def_index:]))
98
+
99
+
100
def clean_verifier_code(code: str) -> Tuple[str, Optional[str]]:
    """Strip Markdown fences from verifier code and split off its preamble.

    Args:
        code: Raw verifier code string.

    Returns:
        ``(function_code, leading_content_or_None)``.
    """
    cleaned = code.strip()

    # Prefer the contents of the first fenced block when fences are present.
    if "```" in cleaned:
        blocks = re.findall(r"```[a-zA-Z0-9_+-]*\n([\s\S]*?)\n```", cleaned)
        if blocks:
            cleaned = blocks[0].strip()

    # Separate imports/docstrings that appear before the function body.
    preamble, cleaned = extract_leading_content(cleaned)

    return (cleaned, preamble)
122
+
123
+
124
def format_leading_content_as_comment(content: str) -> str:
    """Encode preamble text as a marker-delimited comment block.

    Each line is prefixed with ``# |`` so that exact content — including
    blank lines — survives a round-trip through the generated Python file.

    Args:
        content: The leading content (docstrings, imports, etc.).

    Returns:
        Formatted comment block bounded by the content markers.
    """
    encoded = [f"# |{text}" for text in content.split("\n")]
    return "\n".join([LEADING_CONTENT_START, *encoded, LEADING_CONTENT_END])
142
+
143
+
144
def parse_leading_content_from_comments(comment_block: str) -> str:
    """Decode preamble text from a marker-delimited comment block.

    Args:
        comment_block: The comment block between markers.

    Returns:
        Reconstructed leading content.
    """
    recovered = []
    for raw in comment_block.split("\n"):
        if raw.startswith("# |"):
            # Current format: everything after "# |" is verbatim content.
            recovered.append(raw[len("# |") :])
        elif raw.startswith("# "):
            # Legacy format used a plain "# " prefix.
            recovered.append(raw[len("# ") :])
        elif raw == "#":
            # Legacy encoding of a blank line.
            recovered.append("")
        # Anything else (e.g. stray text) is silently dropped.
    return "\n".join(recovered)
166
+
167
+
168
def parse_legacy_docstring_from_comments(comment_block: str) -> str:
    """Rebuild a triple-quoted docstring from a legacy comment block.

    Args:
        comment_block: The comment block between legacy markers.

    Returns:
        Reconstructed docstring including surrounding triple quotes.
    """
    pieces = []
    for raw in comment_block.split("\n"):
        if raw.startswith("# "):
            # Strip the "# " comment prefix to recover the original text.
            pieces.append(raw[2:])
        elif raw == "#":
            # A bare "#" encoded an empty line.
            pieces.append("")
    return '"""{}"""'.format("\n".join(pieces))
187
+
188
+
189
def extract_verifiers_to_file(json_path: str, py_path: str) -> None:
    """
    Extract verifiers from JSON file and write them to a Python file with decorators.

    Each task's verifier is emitted under an @verifier / @verifier_sync
    decorator keyed by the task key; any leading content (imports, docstrings)
    is encoded as marker comments so the `apply` command can round-trip it.
    Exits the process with status 1 on unreadable/invalid input.

    Args:
        json_path: Path to input JSON file
        py_path: Path to output Python file
    """
    print(f"Reading tasks from: {json_path}")

    # Load JSON file
    try:
        with open(json_path, "r", encoding="utf-8") as f:
            tasks = json.load(f)
    except FileNotFoundError:
        print(f"✗ Error: File '{json_path}' not found")
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"✗ Error: Invalid JSON in '{json_path}': {e}")
        sys.exit(1)

    if not isinstance(tasks, list):
        print("✗ Error: JSON file must contain an array of tasks")
        sys.exit(1)

    print(f"Found {len(tasks)} task(s)")

    # Extract verifiers; collect diagnostics for tasks we have to skip.
    verifiers = []
    missing_verifier = []
    duplicate_keys = set()
    seen_keys = set()

    for i, task in enumerate(tasks):
        # A task is addressed by "key", falling back to "id".
        task_key = task.get("key") or task.get("id")
        if not task_key:
            print(f"⚠ Warning: Task at index {i} has no key or id, skipping")
            continue

        # Check for duplicate keys (later duplicates overwrite on apply).
        if task_key in seen_keys:
            duplicate_keys.add(task_key)
            print(f"⚠ Warning: Duplicate task key '{task_key}' found")
        seen_keys.add(task_key)

        # Get verifier code from multiple possible locations, in priority order.
        verifier_code = (
            task.get("verifier_func")
            or task.get("verifier_code")
            or task.get("metadata", {}).get("verifier_code")
        )

        if not verifier_code:
            missing_verifier.append(task_key)
            continue

        # Clean the code and extract leading content (docstrings, imports, etc.)
        cleaned_code, leading_content = clean_verifier_code(verifier_code)

        # Extract function info (name + async-ness) for decorator selection.
        func_info = extract_function_info(cleaned_code)
        if not func_info:
            print(
                f"⚠ Warning: Could not extract function name from verifier for task '{task_key}'"
            )
            continue

        function_name, is_async = func_info

        verifiers.append(
            {
                "task_key": task_key,
                "function_name": function_name,
                "is_async": is_async,
                "code": cleaned_code,
                "leading_content": leading_content,
            }
        )

    if missing_verifier:
        print(f"\n⚠ Warning: {len(missing_verifier)} task(s) missing verifier code:")
        for key in missing_verifier[:10]:  # Show first 10
            print(f" - {key}")
        if len(missing_verifier) > 10:
            print(f" ... and {len(missing_verifier) - 10} more")

    if duplicate_keys:
        print(f"\n⚠ Warning: {len(duplicate_keys)} duplicate task key(s) found:")
        for key in list(duplicate_keys)[:10]:
            print(f" - {key}")

    print(f"\n✓ Extracted {len(verifiers)} verifier(s)")

    # Count async vs sync
    async_count = sum(1 for v in verifiers if v["is_async"])
    sync_count = len(verifiers) - async_count
    print(f" - {async_count} async verifier(s)")
    print(f" - {sync_count} sync verifier(s)")

    # Write to Python file
    print(f"\nWriting verifiers to: {py_path}")

    with open(py_path, "w", encoding="utf-8") as f:
        # Write header docstring for the generated module.
        f.write('"""Auto-generated verifiers file.\n\n')
        f.write(f"Extracted from: {json_path}\n")
        f.write(f"Total verifiers: {len(verifiers)}\n")
        f.write(f" - Async: {async_count}\n")
        f.write(f" - Sync: {sync_count}\n")
        f.write('"""\n\n')

        # Write imports the generated verifiers rely on.
        f.write("# Import verifier decorators and dependencies\n")
        f.write("from fleet import (\n")
        f.write(" verifier,\n")
        f.write(" AsyncEnv,\n")
        f.write(" SyncEnv,\n")
        f.write(" SyncEnv as Environment,\n")
        f.write(" AsyncEnv as AsyncEnvironment,\n")
        f.write(" IgnoreConfig,\n")
        f.write(" TASK_FAILED_SCORE,\n")
        f.write(" TASK_SUCCESSFUL_SCORE,\n")
        f.write(")\n")
        f.write("from fleet.verifiers.verifier import verifier as verifier_sync\n")
        f.write("\n")
        f.write("# Standard library imports used in verifiers\n")
        f.write("import json\n")
        f.write("import re\n")
        f.write("import string\n")
        f.write("from typing import Any, Dict, List\n")
        f.write("\n")
        # Helper functions emitted into the generated module so verifiers can
        # call them without importing anything extra.
        f.write("# Helper functions available in verifier namespace\n")
        f.write(
            '_TRANSLATOR = str.maketrans(string.punctuation, " " * len(string.punctuation))\n'
        )
        f.write("\n")
        f.write("def _normalize_text(value: str) -> str:\n")
        f.write(" text = value.lower().translate(_TRANSLATOR)\n")
        f.write(' return "".join(text.split())\n')
        f.write("\n")
        f.write("def _stringify_content(content: Any) -> str:\n")
        f.write(" if isinstance(content, (dict, list)):\n")
        f.write(" return json.dumps(content, sort_keys=True)\n")
        f.write(" return str(content)\n")
        f.write("\n")
        f.write("def normalized_contains(target: str, blob: Any) -> bool:\n")
        f.write(" normalized_target = _normalize_text(target)\n")
        f.write(" normalized_blob = _normalize_text(_stringify_content(blob))\n")
        f.write(" return normalized_target in normalized_blob\n")
        f.write("\n")
        f.write("def extract_numbers(text: str) -> list:\n")
        f.write(" cleaned_text = text.replace(',', '')\n")
        f.write(" pattern = r'-?\\d+\\.?\\d*'\n")
        f.write(" matches = re.findall(pattern, cleaned_text)\n")
        f.write(" return [float(num) for num in matches]\n")
        f.write("\n")
        f.write("def contains_number(text: str, target_number) -> bool:\n")
        f.write(" numbers = extract_numbers(text)\n")
        f.write(" try:\n")
        f.write(" if isinstance(target_number, str):\n")
        f.write(" target_number = target_number.replace(',', '')\n")
        f.write(" target = float(target_number)\n")
        f.write(" except (ValueError, AttributeError):\n")
        f.write(" return False\n")
        f.write(" return target in numbers\n")
        f.write("\n")
        f.write("# " + "=" * 78 + "\n")
        f.write("# VERIFIERS\n")
        f.write("# " + "=" * 78 + "\n\n")

        # Write each verifier
        for i, ver in enumerate(verifiers):
            # Write separator comment between verifiers (not before the first).
            if i > 0:
                f.write("\n" + "# " + "-" * 78 + "\n\n")

            # Write task key comment — parse_verifiers_from_file keys on this.
            f.write(f"# Task: {ver['task_key']}\n")
            f.write(
                f"# Function: {ver['function_name']} ({'async' if ver['is_async'] else 'sync'})\n"
            )

            # Write leading content (docstrings, imports) as comments if present
            if ver["leading_content"]:
                f.write(format_leading_content_as_comment(ver["leading_content"]))
                f.write("\n")

            # Write decorator - use verifier for async, verifier_sync for sync
            decorator_name = "verifier" if ver["is_async"] else "verifier_sync"
            f.write(f'@{decorator_name}(key="{ver["task_key"]}")\n')

            # Write function code
            f.write(ver["code"])
            f.write("\n")

    print(f"✓ Successfully wrote {len(verifiers)} verifier(s) to '{py_path}'")
    print("\nNext steps:")
    print(f" 1. Edit the verifiers in '{py_path}'")
    print(f" 2. Run: python {sys.argv[0]} apply {json_path} {py_path}")
388
+
389
+
390
def parse_verifiers_from_file(python_path: str) -> Dict[str, dict]:
    """
    Parse verifiers from a Python file and extract them by task key.

    Primary strategy splits the file on the "# Task: <key>" comments written
    by extract_verifiers_to_file; a regex-only fallback is used if that yields
    nothing. Exits the process with status 1 if the file cannot be read.

    Args:
        python_path: Path to Python file containing verifiers

    Returns:
        Dictionary mapping task_key to dict with 'code' and 'leading_content'
    """
    print(f"Reading verifiers from: {python_path}")

    try:
        with open(python_path, "r", encoding="utf-8") as f:
            content = f.read()
    except FileNotFoundError:
        print(f"✗ Error: File '{python_path}' not found")
        sys.exit(1)

    verifiers = {}

    # Split by "# Task: " markers followed by a task key pattern (uuid or specific format)
    # This avoids splitting on "# Task: " that appears inside docstring comments
    # Task keys look like: task_uuid, task_xxx_timestamp_xxx, or send_xxx_xxx
    task_key_pattern = (
        r"(?:task_[a-f0-9-]+|task_[a-z0-9]+_\d+_[a-z0-9]+|[a-z_]+_[a-z0-9]+)"
    )
    # Lookahead keeps the key itself at the start of each split block.
    task_blocks = re.split(rf"\n# Task: (?={task_key_pattern})", content)

    for block in task_blocks[1:]:  # Skip the first block (header)
        # Extract task key from the first line
        lines = block.split("\n")
        if not lines:
            continue

        # First line should be the task key
        task_key = lines[0].strip()

        # Skip if this doesn't look like a task key (sanity check)
        if not re.match(task_key_pattern, task_key):
            continue

        # Find the @verifier or @verifier_sync decorator to extract the key
        # parameter — the decorator key is authoritative over the comment.
        verifier_match = re.search(
            r'@verifier(?:_sync)?\(key=["\']([^"\']+)["\']\s*(?:,\s*[^)]+)?\)', block
        )
        if verifier_match:
            task_key = verifier_match.group(1)

        # Check for leading content markers (new format)
        leading_content = None
        if LEADING_CONTENT_START in block:
            start_idx = block.find(LEADING_CONTENT_START)
            end_idx = block.find(LEADING_CONTENT_END)
            if start_idx != -1 and end_idx != -1:
                comment_block = block[
                    start_idx + len(LEADING_CONTENT_START) : end_idx
                ].strip()
                leading_content = parse_leading_content_from_comments(comment_block)
        # Fallback: check for legacy docstring markers
        elif LEADING_DOCSTRING_START in block:
            start_idx = block.find(LEADING_DOCSTRING_START)
            end_idx = block.find(LEADING_DOCSTRING_END)
            if start_idx != -1 and end_idx != -1:
                comment_block = block[
                    start_idx + len(LEADING_DOCSTRING_START) : end_idx
                ].strip()
                leading_content = parse_legacy_docstring_from_comments(comment_block)

        # Find the function definition (async def or def)
        # Extract from the function start until we hit the separator or end
        func_pattern = r"((async\s+)?def\s+\w+.*?)(?=\n# -+\n|\n# Task:|\Z)"
        func_match = re.search(func_pattern, block, re.DOTALL)

        if func_match:
            function_code = func_match.group(1).strip()
            verifiers[task_key] = {
                "code": function_code,
                "leading_content": leading_content,
            }

    # If the above approach didn't work, try a direct pattern match
    if not verifiers:
        # Pattern to match @verifier or @verifier_sync decorator with key and the following function
        pattern = r'@verifier(?:_sync)?\(key=["\']([^"\']+)["\']\s*(?:,\s*[^)]+)?\)\s*\n((?:async\s+)?def\s+[^\n]+:(?:\n(?: |\t).*)*(?:\n(?: |\t).*)*)'

        matches = re.findall(pattern, content, re.MULTILINE)

        for task_key, function_code in matches:
            verifiers[task_key] = {
                "code": function_code.strip(),
                "leading_content": None,  # fallback path cannot recover preamble
            }

    print(f"✓ Found {len(verifiers)} verifier(s)")

    # Analyze async vs sync (informational only).
    async_count = 0
    sync_count = 0
    for data in verifiers.values():
        func_info = extract_function_info(data["code"])
        if func_info:
            _, is_async = func_info
            if is_async:
                async_count += 1
            else:
                sync_count += 1

    print(f" - {async_count} async verifier(s)")
    print(f" - {sync_count} sync verifier(s)")

    return verifiers
502
+
503
+
504
def normalize_code_for_comparison(code: str) -> str:
    """Canonicalize code text so cosmetic differences don't look like edits.

    Strips outer whitespace, converts CRLF line endings to LF, removes
    trailing whitespace on every line, and collapses runs of newlines
    down to a single newline.
    """
    unified = code.strip().replace("\r\n", "\n")
    # Trailing whitespace on a line never changes meaning.
    trimmed = "\n".join(part.rstrip() for part in unified.split("\n"))
    # Collapse blank-line runs (2+ consecutive newlines become one).
    return re.sub(r"\n{2,}", "\n", trimmed)
518
+
519
+
520
def apply_verifiers_to_json(json_path: str, python_path: str) -> None:
    """
    Apply verifiers from Python file back into JSON task file (updates in-place).

    Only tasks whose verifier code actually changed (after whitespace
    normalization) are touched; their verifier_id/verifier_sha are cleared so
    a later upload re-registers them. Exits with status 1 on I/O/parse errors.

    Args:
        json_path: Path to JSON file to update
        python_path: Path to Python file with verifiers
    """
    # Parse verifiers from Python file
    verifiers = parse_verifiers_from_file(python_path)

    # Load JSON file
    print(f"\nReading tasks from: {json_path}")
    try:
        with open(json_path, "r", encoding="utf-8") as f:
            tasks = json.load(f)
    except FileNotFoundError:
        print(f"✗ Error: File '{json_path}' not found")
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"✗ Error: Invalid JSON in '{json_path}': {e}")
        sys.exit(1)

    if not isinstance(tasks, list):
        print("✗ Error: JSON file must contain an array of tasks")
        sys.exit(1)

    print(f"Found {len(tasks)} task(s)")

    # Update tasks with new verifiers (only if changed)
    updated_count = 0
    updated_keys = []
    not_found = []

    for task in tasks:
        task_key = task.get("key") or task.get("id")
        if not task_key:
            continue

        if task_key in verifiers:
            ver_data = verifiers[task_key]

            # Reconstruct the full verifier code with leading content if present
            if ver_data["leading_content"]:
                new_code = ver_data["leading_content"] + "\n" + ver_data["code"]
            else:
                new_code = ver_data["code"]

            old_code = task.get("verifier_func", "")

            # Normalize both for comparison so whitespace-only differences
            # don't trigger a rewrite.
            old_normalized = normalize_code_for_comparison(old_code)
            new_normalized = normalize_code_for_comparison(new_code)

            # Debug: show comparison info
            old_len = len(old_normalized)
            new_len = len(new_normalized)

            if old_normalized == new_normalized:
                if old_code != new_code:
                    print(
                        f" [DEBUG] {task_key}: Codes differ in whitespace only (normalized match)"
                    )
            else:
                # Find first difference position for debugging
                min_len = min(old_len, new_len)
                diff_pos = min_len
                for i in range(min_len):
                    if old_normalized[i] != new_normalized[i]:
                        diff_pos = i
                        break
                print(
                    f" [DEBUG] {task_key}: Code changed (old={old_len}, new={new_len}, first_diff@{diff_pos})"
                )

            # Only update if the code actually changed
            if old_normalized != new_normalized:
                # Update verifier_func with new code
                task["verifier_func"] = new_code

                # Also update metadata if it exists
                if "metadata" in task and isinstance(task["metadata"], dict):
                    task["metadata"]["verifier_code"] = new_code

                # Clear verifier_id and verifier_sha to force re-upload
                task["verifier_id"] = None
                task["verifier_sha"] = None

                updated_count += 1
                updated_keys.append(task_key)
        else:
            not_found.append(task_key)

    print(f"\n✓ Updated {updated_count} task(s) with new verifiers")

    if updated_keys:
        print("\nUpdated task keys:")
        for key in updated_keys:
            print(f" - {key}")

    if not_found:
        print(f"\n⚠ Warning: {len(not_found)} task(s) not found in Python file:")
        for key in not_found[:10]:
            print(f" - {key}")
        if len(not_found) > 10:
            print(f" ... and {len(not_found) - 10} more")

    # Write output back to the same JSON file
    print(f"\nWriting updated tasks to: {json_path}")

    try:
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(tasks, f, indent=2, ensure_ascii=False)
        print(f"✓ Successfully updated {len(tasks)} task(s) in '{json_path}'")
    except Exception as e:
        print(f"✗ Error writing JSON file: {e}")
        sys.exit(1)
637
+
638
+
639
def validate_verifiers_file(python_path: str) -> None:
    """Check that every verifier in a Python file parses into usable info.

    Prints a per-verifier summary line; exits with status 1 when any
    verifier's function signature cannot be recognized.

    Args:
        python_path: Path to Python file with verifiers.
    """
    parsed = parse_verifiers_from_file(python_path)

    print("\nValidating verifiers...")
    problems = []

    for key, entry in parsed.items():
        info = extract_function_info(entry["code"])
        if info is None:
            problems.append(f" - {key}: Could not extract function info")
            continue
        name, is_async = info
        has_leading = (
            " (has leading content)" if entry["leading_content"] else ""
        )
        print(
            f" ✓ {key}: {name} ({'async' if is_async else 'sync'}){has_leading}"
        )

    if not problems:
        print(f"\n✓ All {len(parsed)} verifier(s) are valid!")
        return

    print(f"\n✗ Found {len(problems)} error(s):")
    for problem in problems:
        print(problem)
    sys.exit(1)
671
+
672
+
673
def main():
    """CLI entry point: dispatch to extract, apply, or validate."""
    parser = argparse.ArgumentParser(
        description="Iterate on verifier code from JSON task files",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Extract verifiers from JSON to Python file
  %(prog)s extract xai-day-10-batch.json verifiers.py

  # Edit verifiers.py...

  # Apply changes back to JSON file (updates in-place)
  %(prog)s apply xai-day-10-batch.json verifiers.py

  # Validate verifiers file
  %(prog)s validate verifiers.py
""",
    )

    commands = parser.add_subparsers(dest="command", help="Command to run")
    commands.required = True

    # extract: JSON -> editable Python file
    extract_cmd = commands.add_parser(
        "extract", help="Extract verifiers from JSON to Python file"
    )
    extract_cmd.add_argument("json_file", help="Path to JSON file containing tasks")
    extract_cmd.add_argument("py_file", help="Path to output Python file")

    # apply: Python file -> JSON (in-place)
    apply_cmd = commands.add_parser(
        "apply", help="Apply verifiers from Python file back to JSON (updates in-place)"
    )
    apply_cmd.add_argument("json_file", help="Path to JSON file to update")
    apply_cmd.add_argument("py_file", help="Path to Python file with verifiers")

    # validate: sanity-check a Python verifiers file
    validate_cmd = commands.add_parser("validate", help="Validate verifiers file")
    validate_cmd.add_argument("py_file", help="Path to Python file with verifiers")

    args = parser.parse_args()

    # Dispatch to the selected subcommand.
    if args.command == "extract":
        extract_verifiers_to_file(args.json_file, args.py_file)
    elif args.command == "apply":
        apply_verifiers_to_json(args.json_file, args.py_file)
    elif args.command == "validate":
        validate_verifiers_file(args.py_file)
722
+
723
+
724
+ if __name__ == "__main__":
725
+ main()