microevals-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. config/judge_system_prompt.yaml +113 -0
  2. evals/nextjs/001-server-component.yaml +28 -0
  3. evals/nextjs/002-client-component.yaml +26 -0
  4. evals/nextjs/003-cookies.yaml +28 -0
  5. evals/nextjs/010-route-handlers.yaml +30 -0
  6. evals/nextjs/013-pathname-server.yaml +29 -0
  7. evals/nextjs/014-server-routing.yaml +28 -0
  8. evals/nextjs/018-use-router.yaml +28 -0
  9. evals/nextjs/020_no_use_effect.yaml +30 -0
  10. evals/nextjs/021-avoid-fetch-in-effect.yaml +28 -0
  11. evals/nextjs/022_prefer_server_actions.yaml +29 -0
  12. evals/nextjs/023_avoid_getserversideprops.yaml +27 -0
  13. evals/nextjs/024_avoid_redundant_usestate.yaml +29 -0
  14. evals/nextjs/025_no_async_client_components.yaml +29 -0
  15. evals/nextjs/026_no_serial_await.yaml +26 -0
  16. evals/nextjs/027-prefer-next-image.yaml +30 -0
  17. evals/nextjs/027_no_hooks_in_server_components.yaml +29 -0
  18. evals/nextjs/028-prefer-next-font.yaml +30 -0
  19. evals/nextjs/028_cookies_headers_context.yaml +29 -0
  20. evals/nextjs/029_no_catch_redirect.yaml +31 -0
  21. evals/nextjs/030_app_router_migration.yaml +30 -0
  22. evals/nextjs/031_no_non_serializable_props.yaml +31 -0
  23. evals/react/001_missing_useeffect_dependencies.yaml +29 -0
  24. evals/react/002_incorrect_event_handler.yaml +28 -0
  25. evals/react/003_missing_return_in_map.yaml +28 -0
  26. evals/react/004_async_useeffect.yaml +32 -0
  27. evals/react/005_direct_state_mutation.yaml +30 -0
  28. evals/react/006_index_as_key.yaml +31 -0
  29. evals/react/zustand_store_usage.yaml +25 -0
  30. evals/shadcn/001_cn_utility_function.yaml +31 -0
  31. evals/shadcn/002_css_variables.yaml +32 -0
  32. evals/shadcn/003_component_dependencies.yaml +33 -0
  33. evals/shadcn/004_path_aliases.yaml +32 -0
  34. evals/shadcn/005_client_directive.yaml +31 -0
  35. evals/shadcn/006_tailwind_config.yaml +36 -0
  36. evals/shadcn/007_components_json_config.yaml +35 -0
  37. evals/supabase/001_client_setup.yaml +47 -0
  38. evals/supabase/002_auth_context_setup.yaml +43 -0
  39. evals/supabase/003_auth_flow_implementation.yaml +46 -0
  40. evals/supabase/004_auth_flow_testing_WIP.yaml +52 -0
  41. evals/supabase/005_auth_google_oauth.yaml +55 -0
  42. evals/supabase/007_storage_client_setup.yaml +43 -0
  43. evals/supabase/008_storage_nextjs_config.yaml +45 -0
  44. evals/supabase/009_storage_image_upload.yaml +49 -0
  45. evals/supabase/010_security_rls_enabled.yaml +42 -0
  46. evals/supabase/011_security_rls_policies.yaml +43 -0
  47. evals/supabase/012_security_no_service_key_exposed.yaml +49 -0
  48. evals/supabase/013_database_read_data.yaml +44 -0
  49. evals/supabase/014_database_create_data.yaml +44 -0
  50. evals/supabase/015_database_update_data.yaml +47 -0
  51. evals/supabase/016_database_delete_data.yaml +47 -0
  52. evals/supabase/017_database_user_scoped_query.yaml +52 -0
  53. evals/tailwind/001_tailwind_v4_config.yaml +22 -0
  54. evals/tailwind/002_content_paths.yaml +27 -0
  55. evals/tailwind/003_no_dynamic_class_construction.yaml +28 -0
  56. evals/tailwind/tailwind_postcss_config.yaml +24 -0
  57. evals/typescript/001_unsafe_type_assertions.yaml +39 -0
  58. evals/typescript/002_missing_null_checks.yaml +33 -0
  59. evals/vercel/001_vercel_deployment.yaml +19 -0
  60. evals/vercel/002_environment_variables_handling.yaml +23 -0
  61. evals/vercel/003_seo_metadata.yaml +33 -0
  62. microevals/__init__.py +34 -0
  63. microevals/eval_registry.py +222 -0
  64. microevals/eval_runner.py +533 -0
  65. microevals/utils.py +490 -0
  66. microevals-0.1.0.dist-info/METADATA +575 -0
  67. microevals-0.1.0.dist-info/RECORD +71 -0
  68. microevals-0.1.0.dist-info/WHEEL +5 -0
  69. microevals-0.1.0.dist-info/entry_points.txt +2 -0
  70. microevals-0.1.0.dist-info/licenses/LICENSE +21 -0
  71. microevals-0.1.0.dist-info/top_level.txt +1 -0
microevals/eval_runner.py
@@ -0,0 +1,533 @@
+ #!/usr/bin/env python
+ """
+ Run Evaluations - Execute evaluations against a repository or local app.
+
+ Simplified eval runner that can:
+ 1. Run a single eval
+ 2. Run all evals in a category
+ 3. Run all evals
+ 4. Run specific eval IDs
+ """
+
+ import argparse
+ import json
+ import sys
+ import time
+ from pathlib import Path
+ from typing import List, Dict, Any
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ import shutil
+
+ from .eval_registry import EvalRegistry
+ from .utils import prepare_repo, build_prompt, run_eval, run_batch_eval, load_source, read_result, save_results, safe_cleanup_temp_dir
+
+
+ # ANSI color codes
+ class Colors:
+     GREEN = '\033[92m'
+     RED = '\033[91m'
+     YELLOW = '\033[93m'
+     BLUE = '\033[94m'
+     CYAN = '\033[96m'
+     BOLD = '\033[1m'
+     RESET = '\033[0m'
+
+
+ def run_single_eval(eval_file: Path, repo_url: str, timeout: int = 300, output_dir: str = "results", runtime_inputs: dict = None) -> Dict[str, Any]:
+     """Run a single evaluation against a repository."""
+     # Handle both relative and absolute paths
+     try:
+         eval_name = str(eval_file.relative_to(Path("evals")))
+     except ValueError:
+         # If not relative to evals/, try to extract from absolute path
+         parts = eval_file.parts
+         if "evals" in parts:
+             evals_index = parts.index("evals")
+             eval_name = str(Path(*parts[evals_index+1:]))
+         else:
+             eval_name = eval_file.name
+
+     try:
+         # Load evaluation spec
+         eval_spec = load_source("file", str(eval_file))
+
+         # Merge runtime inputs with YAML defaults
+         if runtime_inputs:
+             yaml_inputs = eval_spec.get('inputs', {})
+             merged_inputs = {**yaml_inputs, **runtime_inputs}
+             eval_spec['inputs'] = merged_inputs
+
+         # Prepare repository (clone or copy)
+         temp_dir = prepare_repo(repo_url)
+
+         try:
+             # Build and run evaluation
+             prompt = build_prompt(eval_spec)
+
+             start_time = time.time()
+             if not run_eval(temp_dir, prompt, timeout):
+                 return {
+                     "eval_name": eval_name,
+                     "eval_id": eval_spec.get("eval_id", "unknown"),
+                     "status": "timeout",
+                     "score": 0.0,
+                     "duration": timeout,
+                     "error": "Evaluation timed out"
+                 }
+
+             duration = time.time() - start_time
+
+             # Read results
+             result = read_result(temp_dir)
+
+             # Save results
+             save_results(result, eval_spec, repo_url, output_dir)
+
+             return {
+                 "eval_name": eval_name,
+                 "eval_id": eval_spec.get("eval_id", "unknown"),
+                 "status": "completed",
+                 "score": result.get("score", 0.0),
+                 "duration": duration,
+                 "summary": result.get("summary", "No summary provided")
+             }
+
+         finally:
+             # Cleanup temp directory (with safety checks)
+             safe_cleanup_temp_dir(temp_dir)
+
+     except Exception as e:
+         return {
+             "eval_name": eval_name,
+             "eval_id": "unknown",
+             "status": "error",
+             "score": 0.0,
+             "duration": 0,
+             "error": str(e)
+         }
+
+
+ def print_result_line(result: Dict[str, Any]):
+     """Print a single result line with color coding."""
+     status = result["status"]
+     score = result.get("score", 0.0)
+     eval_name = result["eval_name"]
+
+     # Determine status symbol and color
+     if status == "error":
+         symbol = "✗"
+         color = Colors.RED
+         status_text = "ERROR"
+     elif status == "timeout":
+         symbol = "⏱"
+         color = Colors.YELLOW
+         status_text = "TIMEOUT"
+     elif score == 1.0:
+         symbol = "✓"
+         color = Colors.GREEN
+         status_text = "PASS"
+     elif score == -1.0:
+         symbol = "○"
+         color = Colors.BLUE
+         status_text = "N/A"
+     else:
+         symbol = "✗"
+         color = Colors.RED
+         status_text = "FAIL"
+
+     # Format duration
+     duration = result.get("duration", 0)
+     duration_str = f"{duration:.1f}s"
+
+     print(f"{color}{symbol}{Colors.RESET} {status_text:8} {eval_name:50} {duration_str:>8}")
+
+     # Print error or summary if available
+     if result.get("error"):
+         print(f" {Colors.RED}Error: {result['error']}{Colors.RESET}")
+     elif result.get("summary"):
+         summary_text = result["summary"]
+         if score == 1.0:
+             print(f" {Colors.GREEN}✓ {summary_text}{Colors.RESET}")
+         elif score == -1.0:
+             print(f" {Colors.BLUE}○ {summary_text}{Colors.RESET}")
+         else:
+             print(f" {Colors.RED}✗ {summary_text}{Colors.RESET}")
+
+
+ def print_summary(results: List[Dict[str, Any]]):
+     """Print summary statistics."""
+     total = len(results)
+     passed = sum(1 for r in results if r.get("score") == 1.0)
+     failed = sum(1 for r in results if r.get("score") == 0.0)
+     na = sum(1 for r in results if r.get("score") == -1.0)
+     errors = sum(1 for r in results if r.get("status") == "error")
+     timeouts = sum(1 for r in results if r.get("status") == "timeout")
+
+     total_duration = sum(r.get("duration", 0) for r in results)
+
+     print(f"\n{Colors.BOLD}{'='*80}{Colors.RESET}")
+     print(f"{Colors.BOLD}SUMMARY{Colors.RESET}")
+     print(f"{Colors.BOLD}{'='*80}{Colors.RESET}")
+     print(f"Total evaluations: {total}")
+     print(f"{Colors.GREEN}✓ Passed: {passed}{Colors.RESET}")
+     print(f"{Colors.RED}✗ Failed: {failed}{Colors.RESET}")
+     print(f"{Colors.BLUE}○ Not Applicable: {na}{Colors.RESET}")
+     print(f"{Colors.YELLOW}⏱ Timeouts: {timeouts}{Colors.RESET}")
+     print(f"{Colors.RED}✗ Errors: {errors}{Colors.RESET}")
+     print(f"Total duration: {total_duration:.1f}s")
+
+     # Calculate pass rate (excluding N/A)
+     applicable = total - na
+     if applicable > 0:
+         pass_rate = (passed / applicable) * 100
+         print(f"{Colors.BOLD}Pass rate: {pass_rate:.1f}%{Colors.RESET} (excluding N/A)")
+     print()
+
+
+ def main():
+     parser = argparse.ArgumentParser(
+         description='Run evaluations against a repository',
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         epilog="""
+ Examples:
+   # Run in current directory
+   microeval --category nextjs
+
+   # Run in specific local path
+   microeval --repo /path/to/project --category react
+
+   # Run against remote repository
+   microeval --repo https://github.com/user/app --eval evals/nextjs/001_server_component_fetch.yaml
+
+   # Run a single eval with runtime input overrides
+   microeval --repo https://github.com/user/app --eval evals/supabase/001_client_setup.yaml \\
+     --input supabase_url "https://xyz.supabase.co" \\
+     --input supabase_anon_key "your_key_here"
+
+   # Run all evals in a category
+   microeval --category nextjs
+
+   # Run all evals in a category with runtime inputs (applies to all evals)
+   microeval --category supabase --input deployment_url "https://myapp.vercel.app"
+
+   # Run all evals
+   microeval --all
+
+   # Run specific eval IDs
+   microeval --ids nextjs_server_component_fetch_001 supabase_implementation
+
+   # Run with custom timeout and parallel execution
+   microeval --category nextjs --timeout 600 --parallel 3
+
+   # Run with batch mode (multiple evals in one Claude session)
+   microeval --category tailwind --batch-size 3
+
+   # Run all evals in large batches
+   microeval --all --batch-size 15
+
+   # Run specific eval files in batch
+   microeval --evals evals/tailwind/001_tailwind_v4_config.yaml evals/react/001_missing_useeffect_dependencies.yaml --batch-size 2
+         """
+     )
+
+     parser.add_argument('--repo', default='.', help='Repository URL or local path (default: current directory)')
+     parser.add_argument('--eval', help='Path to specific evaluation YAML file')
+     parser.add_argument('--evals', nargs='+', help='Paths to multiple evaluation YAML files')
+     parser.add_argument('--category', help='Run all evals in this category (e.g., nextjs, supabase)')
+     parser.add_argument('--ids', nargs='+', help='Run specific eval IDs')
+     parser.add_argument('--all', action='store_true', help='Run all evaluations')
+     parser.add_argument('--list', action='store_true', help='List all available evaluations and exit')
+     parser.add_argument('--timeout', type=int, default=300, help='Evaluation timeout in seconds')
+     parser.add_argument('--output-dir', default='results', help='Output directory for results')
+     parser.add_argument('--parallel', type=int, default=1, help='Number of parallel evaluations')
+     parser.add_argument('--evals-dir', default=None, help='Base directory containing evals (default: auto-detect from package installation)')
+     parser.add_argument('--input', '-i', action='append', nargs=2, metavar=('KEY', 'VALUE'),
+                         help='Runtime input override (can be used multiple times): --input key value')
+     parser.add_argument('--batch-size', type=int, default=1,
+                         help='Number of evals to run per Claude session (default: 1). Higher values are faster but less resilient.')
+     parser.add_argument('--print-prompt', action='store_true',
+                         help='Print the prompt before execution (useful for debugging batch mode)')
+
+     args = parser.parse_args()
+
+     # Initialize registry
+     registry = EvalRegistry(args.evals_dir)
+
+     # Handle --list option
+     if args.list:
+         print(f"\n{Colors.BOLD}{'='*80}{Colors.RESET}")
+         print(f"{Colors.BOLD}AVAILABLE EVALUATIONS{Colors.RESET}")
+         print(f"{Colors.BOLD}{'='*80}{Colors.RESET}\n")
+
+         if args.category:
+             # List specific category
+             evals = registry.get_by_category(args.category)
+             if not evals:
+                 print(f"{Colors.RED}No evals found in category '{args.category}'{Colors.RESET}")
+                 sys.exit(1)
+
+             print(f"{Colors.CYAN}{args.category.upper()}{Colors.RESET} ({len(evals)} evals)\n")
+             for eval_info in evals:
+                 print(f" {Colors.GREEN}•{Colors.RESET} {eval_info['eval_id']}")
+                 print(f" {eval_info['name']}")
+                 if eval_info.get('description'):
+                     print(f" {Colors.BLUE}{eval_info['description']}{Colors.RESET}")
+                 print()
+         else:
+             # List all categories
+             all_evals = registry.get_all()
+             print(f"Total evaluations: {Colors.BOLD}{len(all_evals)}{Colors.RESET}\n")
+
+             # Group by category
+             categories = {}
+             for eval_info in all_evals:
+                 cat = eval_info['category']
+                 if cat not in categories:
+                     categories[cat] = []
+                 categories[cat].append(eval_info)
+
+             for category, evals in sorted(categories.items()):
+                 print(f"{Colors.CYAN}{category.upper()}{Colors.RESET} ({len(evals)} evals)")
+                 for eval_info in evals[:3]:  # Show first 3
+                     print(f" {Colors.GREEN}•{Colors.RESET} {eval_info['eval_id']}: {eval_info['name']}")
+                 if len(evals) > 3:
+                     print(f" {Colors.YELLOW} ... and {len(evals) - 3} more{Colors.RESET}")
+                 print()
+
+             print(f"\n{Colors.CYAN}Tip:{Colors.RESET} Use --list --category <name> to see all evals in a category")
+
+         print(f"{Colors.BOLD}{'='*80}{Colors.RESET}\n")
+         sys.exit(0)
+
+     # Parse runtime inputs from --input arguments
+     runtime_inputs = {}
+     if args.input:
+         for key, value in args.input:
+             runtime_inputs[key] = value
+         print(f"Runtime inputs: {runtime_inputs}\n")
+
+     # Determine which evals to run
+     eval_files = []
+
+     if args.eval:
+         # Single eval file
+         eval_path = Path(args.eval)
+         if not eval_path.exists():
+             print(f"{Colors.RED}Error: Eval file '{args.eval}' not found{Colors.RESET}")
+             sys.exit(1)
+         eval_files = [eval_path]
+
+     elif args.evals:
+         # Multiple eval files
+         eval_files = []
+         for eval_file in args.evals:
+             eval_path = Path(eval_file)
+             if not eval_path.exists():
+                 print(f"{Colors.RED}Error: Eval file '{eval_file}' not found{Colors.RESET}")
+                 sys.exit(1)
+             eval_files.append(eval_path)
+         print(f"Running {len(eval_files)} specified evals")
+
+     elif args.category:
+         # All evals in category
+         evals = registry.get_by_category(args.category)
+         if not evals:
+             print(f"{Colors.RED}Error: No evals found in category '{args.category}'{Colors.RESET}")
+             sys.exit(1)
+         eval_files = [Path(e["path"]) for e in evals]
+         print(f"Running {len(eval_files)} evals in category '{args.category}'")
+
+     elif args.ids:
+         # Specific eval IDs
+         for eval_id in args.ids:
+             try:
+                 e = registry.get_by_id(eval_id)
+                 eval_files.append(Path(e["path"]))
+             except ValueError:
+                 print(f"{Colors.YELLOW}Warning: Eval ID '{eval_id}' not found, skipping{Colors.RESET}")
+
+         if not eval_files:
+             print(f"{Colors.RED}Error: None of the specified eval IDs were found{Colors.RESET}")
+             sys.exit(1)
+
+     elif args.all:
+         # All evals
+         evals = registry.get_all()
+         eval_files = [Path(e["path"]) for e in evals]
+         print(f"Running all {len(eval_files)} evals")
+
+     else:
+         print(f"{Colors.RED}Error: Must specify --eval, --evals, --category, --ids, or --all{Colors.RESET}")
+         sys.exit(1)
+
+     # Run evaluations
+     print(f"\n{Colors.BOLD}{Colors.CYAN}{'='*80}{Colors.RESET}")
+     print(f"{Colors.BOLD}{Colors.CYAN}Running evaluations for: {args.repo}{Colors.RESET}")
+     print(f"{Colors.BOLD}{Colors.CYAN}{'='*80}{Colors.RESET}\n")
+
+     results = []
+
+     # Batch mode takes precedence over parallel mode
+     if args.batch_size > 1 and len(eval_files) > 1:
+         # Batch mode: run multiple evals in single Claude sessions
+         print(f"{Colors.CYAN}Running in BATCH mode: {args.batch_size} evals per session{Colors.RESET}\n")
+
+         # Load all eval specs
+         eval_specs = []
+         eval_names = []
+         for eval_file in eval_files:
+             try:
+                 eval_name = str(eval_file.relative_to(Path("evals")))
+             except ValueError:
+                 parts = eval_file.parts
+                 if "evals" in parts:
+                     evals_index = parts.index("evals")
+                     eval_name = str(Path(*parts[evals_index+1:]))
+                 else:
+                     eval_name = eval_file.name
+
+             spec = load_source("file", str(eval_file))
+             if runtime_inputs:
+                 spec['inputs'] = {**spec.get('inputs', {}), **runtime_inputs}
+             eval_specs.append(spec)
+             eval_names.append(eval_name)
+
+         # Prepare repo once (clone or copy)
+         print(f"{Colors.CYAN}Preparing repository...{Colors.RESET}")
+         temp_dir = prepare_repo(args.repo)
+
+         try:
+             # Process evals in batches
+             for i in range(0, len(eval_specs), args.batch_size):
+                 batch = eval_specs[i:i+args.batch_size]
+                 batch_names = eval_names[i:i+args.batch_size]
+                 batch_num = (i // args.batch_size) + 1
+                 total_batches = (len(eval_specs) + args.batch_size - 1) // args.batch_size
+
+                 print(f"{Colors.CYAN}[Batch {batch_num}/{total_batches}]{Colors.RESET} Running {len(batch)} evals...")
+
+                 # Print prompt if requested
+                 if args.print_prompt:
+                     from .utils import build_prompt
+                     import yaml
+
+                     # Build the batch prompt for display
+                     batch_criteria = []
+                     for i, spec in enumerate(batch, 1):
+                         eval_id = spec['eval_id']
+                         criteria = spec['criteria']
+                         inputs = spec.get('inputs', {})
+                         if inputs:
+                             for key, value in inputs.items():
+                                 if value is not None:
+                                     criteria = criteria.replace(f"{{{key}}}", str(value))
+                         batch_criteria.append(f"""
+ {'='*80}
+ EVALUATION {i}/{len(batch)}
+ {'='*80}
+ EVAL ID: {eval_id}
+ FILENAME: eval_result_{eval_id}.json
+
+ CRITERIA:
+ {criteria}
+ """)
+
+                     # Load template
+                     from pathlib import Path as P
+                     judge_prompt_path = P(__file__).parent.parent / 'config' / 'judge_system_prompt.yaml'
+                     with open(judge_prompt_path, 'r') as f:
+                         prompts = yaml.safe_load(f)
+                     template = prompts['batch_judge_prompt']['instruction_template']
+
+                     prompt = template.format(
+                         eval_count=len(batch),
+                         batch_criteria='\n'.join(batch_criteria)
+                     )
+
+                     print(f"\n{Colors.YELLOW}{'='*80}{Colors.RESET}")
+                     print(f"{Colors.YELLOW}BATCH PROMPT (for review):{Colors.RESET}")
+                     print(f"{Colors.YELLOW}{'='*80}{Colors.RESET}")
+                     print(prompt)
+                     print(f"{Colors.YELLOW}{'='*80}{Colors.RESET}\n")
+
+                     # Ask for confirmation
+                     response = input(f"{Colors.CYAN}Press Enter to continue or Ctrl+C to cancel...{Colors.RESET}")
+
+                 start_time = time.time()
+                 batch_results = run_batch_eval(temp_dir, batch, timeout=args.timeout)
+                 duration = time.time() - start_time
+
+                 # Process results for this batch
+                 for spec, eval_name in zip(batch, batch_names):
+                     eval_id = spec['eval_id']
+
+                     if eval_id in batch_results:
+                         result = batch_results[eval_id]
+
+                         # Save results if successful
+                         if result.get('status') != 'error':
+                             save_results(result, spec, args.repo, args.output_dir)
+
+                         # Add to results list
+                         result_entry = {
+                             "eval_name": eval_name,
+                             "eval_id": eval_id,
+                             "status": result.get("status", "completed"),
+                             "score": result.get("score", 0.0),
+                             "duration": duration / len(batch),  # Approximate per-eval duration
+                             "summary": result.get("summary", result.get("error", ""))
+                         }
+                         if result.get("error"):
+                             result_entry["error"] = result["error"]
+
+                         results.append(result_entry)
+                         print_result_line(result_entry)
+                     else:
+                         # Eval not in results (shouldn't happen, but handle it)
+                         result_entry = {
+                             "eval_name": eval_name,
+                             "eval_id": eval_id,
+                             "status": "error",
+                             "score": 0.0,
+                             "duration": 0,
+                             "error": "Result not found in batch output"
+                         }
+                         results.append(result_entry)
+                         print_result_line(result_entry)
+
+                 print()  # Blank line between batches
+
+         finally:
+             # Cleanup temp directory (with safety checks)
+             print(f"{Colors.CYAN}Cleaning up...{Colors.RESET}")
+             safe_cleanup_temp_dir(temp_dir)
+
+     elif args.parallel > 1:
+         # Run evaluations in parallel
+         with ThreadPoolExecutor(max_workers=args.parallel) as executor:
+             futures = {
+                 executor.submit(run_single_eval, eval_file, args.repo, args.timeout, args.output_dir, runtime_inputs): eval_file
+                 for eval_file in eval_files
+             }
+
+             for future in as_completed(futures):
+                 result = future.result()
+                 results.append(result)
+                 print_result_line(result)
+     else:
+         # Run evaluations sequentially
+         for i, eval_file in enumerate(eval_files, 1):
+             print(f"{Colors.CYAN}[{i}/{len(eval_files)}]{Colors.RESET} Running {eval_file.name}...")
+             result = run_single_eval(eval_file, args.repo, args.timeout, args.output_dir, runtime_inputs)
+             results.append(result)
+             print_result_line(result)
+
+     # Print summary
+     print_summary(results)
+
+     print(f"{Colors.BOLD}{Colors.GREEN}All evaluations complete!{Colors.RESET}\n")
+
+
+ if __name__ == "__main__":
+     main()
+
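
Note: the runner above only ever reads a handful of fields from each evaluation result (score, summary, status, error), and the batch judge prompt asks for one eval_result_<eval_id>.json file per eval, which read_result/run_batch_eval in microevals/utils.py (not shown in this hunk) then pick up. The sketch below is a hypothetical illustration of such a result file under that assumption; the authoritative schema lives in utils.py, and the eval_id and summary values are example data only.

    import json

    # Hypothetical result payload; the field names are the ones eval_runner.py
    # reads via result.get(...): score (1.0 pass, 0.0 fail, -1.0 not applicable),
    # summary, and status ("completed", "error", or "timeout").
    example_result = {
        "eval_id": "nextjs_server_component_fetch_001",
        "score": 1.0,
        "status": "completed",
        "summary": "Data is fetched in a server component rather than in useEffect",
    }

    # The batch judge prompt names the file eval_result_<eval_id>.json.
    with open(f"eval_result_{example_result['eval_id']}.json", "w") as f:
        json.dump(example_result, f, indent=2)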