microevals-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- config/judge_system_prompt.yaml +113 -0
- evals/nextjs/001-server-component.yaml +28 -0
- evals/nextjs/002-client-component.yaml +26 -0
- evals/nextjs/003-cookies.yaml +28 -0
- evals/nextjs/010-route-handlers.yaml +30 -0
- evals/nextjs/013-pathname-server.yaml +29 -0
- evals/nextjs/014-server-routing.yaml +28 -0
- evals/nextjs/018-use-router.yaml +28 -0
- evals/nextjs/020_no_use_effect.yaml +30 -0
- evals/nextjs/021-avoid-fetch-in-effect.yaml +28 -0
- evals/nextjs/022_prefer_server_actions.yaml +29 -0
- evals/nextjs/023_avoid_getserversideprops.yaml +27 -0
- evals/nextjs/024_avoid_redundant_usestate.yaml +29 -0
- evals/nextjs/025_no_async_client_components.yaml +29 -0
- evals/nextjs/026_no_serial_await.yaml +26 -0
- evals/nextjs/027-prefer-next-image.yaml +30 -0
- evals/nextjs/027_no_hooks_in_server_components.yaml +29 -0
- evals/nextjs/028-prefer-next-font.yaml +30 -0
- evals/nextjs/028_cookies_headers_context.yaml +29 -0
- evals/nextjs/029_no_catch_redirect.yaml +31 -0
- evals/nextjs/030_app_router_migration.yaml +30 -0
- evals/nextjs/031_no_non_serializable_props.yaml +31 -0
- evals/react/001_missing_useeffect_dependencies.yaml +29 -0
- evals/react/002_incorrect_event_handler.yaml +28 -0
- evals/react/003_missing_return_in_map.yaml +28 -0
- evals/react/004_async_useeffect.yaml +32 -0
- evals/react/005_direct_state_mutation.yaml +30 -0
- evals/react/006_index_as_key.yaml +31 -0
- evals/react/zustand_store_usage.yaml +25 -0
- evals/shadcn/001_cn_utility_function.yaml +31 -0
- evals/shadcn/002_css_variables.yaml +32 -0
- evals/shadcn/003_component_dependencies.yaml +33 -0
- evals/shadcn/004_path_aliases.yaml +32 -0
- evals/shadcn/005_client_directive.yaml +31 -0
- evals/shadcn/006_tailwind_config.yaml +36 -0
- evals/shadcn/007_components_json_config.yaml +35 -0
- evals/supabase/001_client_setup.yaml +47 -0
- evals/supabase/002_auth_context_setup.yaml +43 -0
- evals/supabase/003_auth_flow_implementation.yaml +46 -0
- evals/supabase/004_auth_flow_testing_WIP.yaml +52 -0
- evals/supabase/005_auth_google_oauth.yaml +55 -0
- evals/supabase/007_storage_client_setup.yaml +43 -0
- evals/supabase/008_storage_nextjs_config.yaml +45 -0
- evals/supabase/009_storage_image_upload.yaml +49 -0
- evals/supabase/010_security_rls_enabled.yaml +42 -0
- evals/supabase/011_security_rls_policies.yaml +43 -0
- evals/supabase/012_security_no_service_key_exposed.yaml +49 -0
- evals/supabase/013_database_read_data.yaml +44 -0
- evals/supabase/014_database_create_data.yaml +44 -0
- evals/supabase/015_database_update_data.yaml +47 -0
- evals/supabase/016_database_delete_data.yaml +47 -0
- evals/supabase/017_database_user_scoped_query.yaml +52 -0
- evals/tailwind/001_tailwind_v4_config.yaml +22 -0
- evals/tailwind/002_content_paths.yaml +27 -0
- evals/tailwind/003_no_dynamic_class_construction.yaml +28 -0
- evals/tailwind/tailwind_postcss_config.yaml +24 -0
- evals/typescript/001_unsafe_type_assertions.yaml +39 -0
- evals/typescript/002_missing_null_checks.yaml +33 -0
- evals/vercel/001_vercel_deployment.yaml +19 -0
- evals/vercel/002_environment_variables_handling.yaml +23 -0
- evals/vercel/003_seo_metadata.yaml +33 -0
- microevals/__init__.py +34 -0
- microevals/eval_registry.py +222 -0
- microevals/eval_runner.py +533 -0
- microevals/utils.py +490 -0
- microevals-0.1.0.dist-info/METADATA +575 -0
- microevals-0.1.0.dist-info/RECORD +71 -0
- microevals-0.1.0.dist-info/WHEEL +5 -0
- microevals-0.1.0.dist-info/entry_points.txt +2 -0
- microevals-0.1.0.dist-info/licenses/LICENSE +21 -0
- microevals-0.1.0.dist-info/top_level.txt +1 -0
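The eval_runner.py diff below consumes each of the eval YAML files listed above as a plain dict. As a hypothetical sketch only (the field names are inferred from the runner code, not from a documented schema), the shape it relies on looks roughly like this:

# Illustrative only: the minimal fields eval_runner.py reads from a parsed eval spec.
# Field names come from the code below; everything else about the schema is an assumption.
example_spec = {
    "eval_id": "nextjs_server_component_fetch_001",  # used to name eval_result_<eval_id>.json
    "criteria": "Verify data fetching happens in a Server Component at {deployment_url}.",  # hypothetical text
    "inputs": {"deployment_url": None},  # placeholders substituted into criteria; overridable via --input KEY VALUE
}
# The judge's result file is read back as a dict with at least "score"
# (1.0 pass, 0.0 fail, -1.0 not applicable) and "summary".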
microevals/eval_runner.py
@@ -0,0 +1,533 @@
#!/usr/bin/env python
"""
Run Evaluations - Execute evaluations against a repository or local app.

Simplified eval runner that can:
1. Run a single eval
2. Run all evals in a category
3. Run all evals
4. Run specific eval IDs
"""

import argparse
import json
import sys
import time
from pathlib import Path
from typing import List, Dict, Any
from concurrent.futures import ThreadPoolExecutor, as_completed
import shutil

from .eval_registry import EvalRegistry
from .utils import prepare_repo, build_prompt, run_eval, run_batch_eval, load_source, read_result, save_results, safe_cleanup_temp_dir


# ANSI color codes
class Colors:
    GREEN = '\033[92m'
    RED = '\033[91m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    CYAN = '\033[96m'
    BOLD = '\033[1m'
    RESET = '\033[0m'


def run_single_eval(eval_file: Path, repo_url: str, timeout: int = 300, output_dir: str = "results", runtime_inputs: dict = None) -> Dict[str, Any]:
    """Run a single evaluation against a repository."""
    # Handle both relative and absolute paths
    try:
        eval_name = str(eval_file.relative_to(Path("evals")))
    except ValueError:
        # If not relative to evals/, try to extract from absolute path
        parts = eval_file.parts
        if "evals" in parts:
            evals_index = parts.index("evals")
            eval_name = str(Path(*parts[evals_index+1:]))
        else:
            eval_name = eval_file.name

    try:
        # Load evaluation spec
        eval_spec = load_source("file", str(eval_file))

        # Merge runtime inputs with YAML defaults
        if runtime_inputs:
            yaml_inputs = eval_spec.get('inputs', {})
            merged_inputs = {**yaml_inputs, **runtime_inputs}
            eval_spec['inputs'] = merged_inputs

        # Prepare repository (clone or copy)
        temp_dir = prepare_repo(repo_url)

        try:
            # Build and run evaluation
            prompt = build_prompt(eval_spec)

            start_time = time.time()
            if not run_eval(temp_dir, prompt, timeout):
                return {
                    "eval_name": eval_name,
                    "eval_id": eval_spec.get("eval_id", "unknown"),
                    "status": "timeout",
                    "score": 0.0,
                    "duration": timeout,
                    "error": "Evaluation timed out"
                }

            duration = time.time() - start_time

            # Read results
            result = read_result(temp_dir)

            # Save results
            save_results(result, eval_spec, repo_url, output_dir)

            return {
                "eval_name": eval_name,
                "eval_id": eval_spec.get("eval_id", "unknown"),
                "status": "completed",
                "score": result.get("score", 0.0),
                "duration": duration,
                "summary": result.get("summary", "No summary provided")
            }

        finally:
            # Cleanup temp directory (with safety checks)
            safe_cleanup_temp_dir(temp_dir)

    except Exception as e:
        return {
            "eval_name": eval_name,
            "eval_id": "unknown",
            "status": "error",
            "score": 0.0,
            "duration": 0,
            "error": str(e)
        }


def print_result_line(result: Dict[str, Any]):
    """Print a single result line with color coding."""
    status = result["status"]
    score = result.get("score", 0.0)
    eval_name = result["eval_name"]

    # Determine status symbol and color
    if status == "error":
        symbol = "✗"
        color = Colors.RED
        status_text = "ERROR"
    elif status == "timeout":
        symbol = "⏱"
        color = Colors.YELLOW
        status_text = "TIMEOUT"
    elif score == 1.0:
        symbol = "✓"
        color = Colors.GREEN
        status_text = "PASS"
    elif score == -1.0:
        symbol = "○"
        color = Colors.BLUE
        status_text = "N/A"
    else:
        symbol = "✗"
        color = Colors.RED
        status_text = "FAIL"

    # Format duration
    duration = result.get("duration", 0)
    duration_str = f"{duration:.1f}s"

    print(f"{color}{symbol}{Colors.RESET} {status_text:8} {eval_name:50} {duration_str:>8}")

    # Print error or summary if available
    if result.get("error"):
        print(f" {Colors.RED}Error: {result['error']}{Colors.RESET}")
    elif result.get("summary"):
        summary_text = result["summary"]
        if score == 1.0:
            print(f" {Colors.GREEN}✓ {summary_text}{Colors.RESET}")
        elif score == -1.0:
            print(f" {Colors.BLUE}○ {summary_text}{Colors.RESET}")
        else:
            print(f" {Colors.RED}✗ {summary_text}{Colors.RESET}")


def print_summary(results: List[Dict[str, Any]]):
    """Print summary statistics."""
    total = len(results)
    passed = sum(1 for r in results if r.get("score") == 1.0)
    failed = sum(1 for r in results if r.get("score") == 0.0)
    na = sum(1 for r in results if r.get("score") == -1.0)
    errors = sum(1 for r in results if r.get("status") == "error")
    timeouts = sum(1 for r in results if r.get("status") == "timeout")

    total_duration = sum(r.get("duration", 0) for r in results)

    print(f"\n{Colors.BOLD}{'='*80}{Colors.RESET}")
    print(f"{Colors.BOLD}SUMMARY{Colors.RESET}")
    print(f"{Colors.BOLD}{'='*80}{Colors.RESET}")
    print(f"Total evaluations: {total}")
    print(f"{Colors.GREEN}✓ Passed: {passed}{Colors.RESET}")
    print(f"{Colors.RED}✗ Failed: {failed}{Colors.RESET}")
    print(f"{Colors.BLUE}○ Not Applicable: {na}{Colors.RESET}")
    print(f"{Colors.YELLOW}⏱ Timeouts: {timeouts}{Colors.RESET}")
    print(f"{Colors.RED}✗ Errors: {errors}{Colors.RESET}")
    print(f"Total duration: {total_duration:.1f}s")

    # Calculate pass rate (excluding N/A)
    applicable = total - na
    if applicable > 0:
        pass_rate = (passed / applicable) * 100
        print(f"{Colors.BOLD}Pass rate: {pass_rate:.1f}%{Colors.RESET} (excluding N/A)")
    print()


def main():
    parser = argparse.ArgumentParser(
        description='Run evaluations against a repository',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Run in current directory
  microeval --category nextjs

  # Run in specific local path
  microeval --repo /path/to/project --category react

  # Run against remote repository
  microeval --repo https://github.com/user/app --eval evals/nextjs/001_server_component_fetch.yaml

  # Run a single eval with runtime input overrides
  microeval --repo https://github.com/user/app --eval evals/supabase/001_client_setup.yaml \\
    --input supabase_url "https://xyz.supabase.co" \\
    --input supabase_anon_key "your_key_here"

  # Run all evals in a category
  microeval --category nextjs

  # Run all evals in a category with runtime inputs (applies to all evals)
  microeval --category supabase --input deployment_url "https://myapp.vercel.app"

  # Run all evals
  microeval --all

  # Run specific eval IDs
  microeval --ids nextjs_server_component_fetch_001 supabase_implementation

  # Run with custom timeout and parallel execution
  microeval --category nextjs --timeout 600 --parallel 3

  # Run with batch mode (multiple evals in one Claude session)
  microeval --category tailwind --batch-size 3

  # Run all evals in large batches
  microeval --all --batch-size 15

  # Run specific eval files in batch
  microeval --evals evals/tailwind/001_tailwind_v4_config.yaml evals/react/001_missing_useeffect_dependencies.yaml --batch-size 2
        """
    )

    parser.add_argument('--repo', default='.', help='Repository URL or local path (default: current directory)')
    parser.add_argument('--eval', help='Path to specific evaluation YAML file')
    parser.add_argument('--evals', nargs='+', help='Paths to multiple evaluation YAML files')
    parser.add_argument('--category', help='Run all evals in this category (e.g., nextjs, supabase)')
    parser.add_argument('--ids', nargs='+', help='Run specific eval IDs')
    parser.add_argument('--all', action='store_true', help='Run all evaluations')
    parser.add_argument('--list', action='store_true', help='List all available evaluations and exit')
    parser.add_argument('--timeout', type=int, default=300, help='Evaluation timeout in seconds')
    parser.add_argument('--output-dir', default='results', help='Output directory for results')
    parser.add_argument('--parallel', type=int, default=1, help='Number of parallel evaluations')
    parser.add_argument('--evals-dir', default=None, help='Base directory containing evals (default: auto-detect from package installation)')
    parser.add_argument('--input', '-i', action='append', nargs=2, metavar=('KEY', 'VALUE'),
                        help='Runtime input override (can be used multiple times): --input key value')
    parser.add_argument('--batch-size', type=int, default=1,
                        help='Number of evals to run per Claude session (default: 1). Higher values are faster but less resilient.')
    parser.add_argument('--print-prompt', action='store_true',
                        help='Print the prompt before execution (useful for debugging batch mode)')

    args = parser.parse_args()

    # Initialize registry
    registry = EvalRegistry(args.evals_dir)

    # Handle --list option
    if args.list:
        print(f"\n{Colors.BOLD}{'='*80}{Colors.RESET}")
        print(f"{Colors.BOLD}AVAILABLE EVALUATIONS{Colors.RESET}")
        print(f"{Colors.BOLD}{'='*80}{Colors.RESET}\n")

        if args.category:
            # List specific category
            evals = registry.get_by_category(args.category)
            if not evals:
                print(f"{Colors.RED}No evals found in category '{args.category}'{Colors.RESET}")
                sys.exit(1)

            print(f"{Colors.CYAN}{args.category.upper()}{Colors.RESET} ({len(evals)} evals)\n")
            for eval_info in evals:
                print(f" {Colors.GREEN}•{Colors.RESET} {eval_info['eval_id']}")
                print(f" {eval_info['name']}")
                if eval_info.get('description'):
                    print(f" {Colors.BLUE}{eval_info['description']}{Colors.RESET}")
                print()
        else:
            # List all categories
            all_evals = registry.get_all()
            print(f"Total evaluations: {Colors.BOLD}{len(all_evals)}{Colors.RESET}\n")

            # Group by category
            categories = {}
            for eval_info in all_evals:
                cat = eval_info['category']
                if cat not in categories:
                    categories[cat] = []
                categories[cat].append(eval_info)

            for category, evals in sorted(categories.items()):
                print(f"{Colors.CYAN}{category.upper()}{Colors.RESET} ({len(evals)} evals)")
                for eval_info in evals[:3]:  # Show first 3
                    print(f" {Colors.GREEN}•{Colors.RESET} {eval_info['eval_id']}: {eval_info['name']}")
                if len(evals) > 3:
                    print(f" {Colors.YELLOW} ... and {len(evals) - 3} more{Colors.RESET}")
                print()

            print(f"\n{Colors.CYAN}Tip:{Colors.RESET} Use --list --category <name> to see all evals in a category")

        print(f"{Colors.BOLD}{'='*80}{Colors.RESET}\n")
        sys.exit(0)

    # Parse runtime inputs from --input arguments
    runtime_inputs = {}
    if args.input:
        for key, value in args.input:
            runtime_inputs[key] = value
        print(f"Runtime inputs: {runtime_inputs}\n")

    # Determine which evals to run
    eval_files = []

    if args.eval:
        # Single eval file
        eval_path = Path(args.eval)
        if not eval_path.exists():
            print(f"{Colors.RED}Error: Eval file '{args.eval}' not found{Colors.RESET}")
            sys.exit(1)
        eval_files = [eval_path]

    elif args.evals:
        # Multiple eval files
        eval_files = []
        for eval_file in args.evals:
            eval_path = Path(eval_file)
            if not eval_path.exists():
                print(f"{Colors.RED}Error: Eval file '{eval_file}' not found{Colors.RESET}")
                sys.exit(1)
            eval_files.append(eval_path)
        print(f"Running {len(eval_files)} specified evals")

    elif args.category:
        # All evals in category
        evals = registry.get_by_category(args.category)
        if not evals:
            print(f"{Colors.RED}Error: No evals found in category '{args.category}'{Colors.RESET}")
            sys.exit(1)
        eval_files = [Path(e["path"]) for e in evals]
        print(f"Running {len(eval_files)} evals in category '{args.category}'")

    elif args.ids:
        # Specific eval IDs
        for eval_id in args.ids:
            try:
                e = registry.get_by_id(eval_id)
                eval_files.append(Path(e["path"]))
            except ValueError:
                print(f"{Colors.YELLOW}Warning: Eval ID '{eval_id}' not found, skipping{Colors.RESET}")

        if not eval_files:
            print(f"{Colors.RED}Error: None of the specified eval IDs were found{Colors.RESET}")
            sys.exit(1)

    elif args.all:
        # All evals
        evals = registry.get_all()
        eval_files = [Path(e["path"]) for e in evals]
        print(f"Running all {len(eval_files)} evals")

    else:
        print(f"{Colors.RED}Error: Must specify --eval, --evals, --category, --ids, or --all{Colors.RESET}")
        sys.exit(1)

    # Run evaluations
    print(f"\n{Colors.BOLD}{Colors.CYAN}{'='*80}{Colors.RESET}")
    print(f"{Colors.BOLD}{Colors.CYAN}Running evaluations for: {args.repo}{Colors.RESET}")
    print(f"{Colors.BOLD}{Colors.CYAN}{'='*80}{Colors.RESET}\n")

    results = []

    # Batch mode takes precedence over parallel mode
    if args.batch_size > 1 and len(eval_files) > 1:
        # Batch mode: run multiple evals in single Claude sessions
        print(f"{Colors.CYAN}Running in BATCH mode: {args.batch_size} evals per session{Colors.RESET}\n")

        # Load all eval specs
        eval_specs = []
        eval_names = []
        for eval_file in eval_files:
            try:
                eval_name = str(eval_file.relative_to(Path("evals")))
            except ValueError:
                parts = eval_file.parts
                if "evals" in parts:
                    evals_index = parts.index("evals")
                    eval_name = str(Path(*parts[evals_index+1:]))
                else:
                    eval_name = eval_file.name

            spec = load_source("file", str(eval_file))
            if runtime_inputs:
                spec['inputs'] = {**spec.get('inputs', {}), **runtime_inputs}
            eval_specs.append(spec)
            eval_names.append(eval_name)

        # Prepare repo once (clone or copy)
        print(f"{Colors.CYAN}Preparing repository...{Colors.RESET}")
        temp_dir = prepare_repo(args.repo)

        try:
            # Process evals in batches
            for i in range(0, len(eval_specs), args.batch_size):
                batch = eval_specs[i:i+args.batch_size]
                batch_names = eval_names[i:i+args.batch_size]
                batch_num = (i // args.batch_size) + 1
                total_batches = (len(eval_specs) + args.batch_size - 1) // args.batch_size

                print(f"{Colors.CYAN}[Batch {batch_num}/{total_batches}]{Colors.RESET} Running {len(batch)} evals...")

                # Print prompt if requested
                if args.print_prompt:
                    from .utils import build_prompt
                    import yaml

                    # Build the batch prompt for display
                    batch_criteria = []
                    for i, spec in enumerate(batch, 1):
                        eval_id = spec['eval_id']
                        criteria = spec['criteria']
                        inputs = spec.get('inputs', {})
                        if inputs:
                            for key, value in inputs.items():
                                if value is not None:
                                    criteria = criteria.replace(f"{{{key}}}", str(value))
                        batch_criteria.append(f"""
{'='*80}
EVALUATION {i}/{len(batch)}
{'='*80}
EVAL ID: {eval_id}
FILENAME: eval_result_{eval_id}.json

CRITERIA:
{criteria}
""")

                    # Load template
                    from pathlib import Path as P
                    judge_prompt_path = P(__file__).parent.parent / 'config' / 'judge_system_prompt.yaml'
                    with open(judge_prompt_path, 'r') as f:
                        prompts = yaml.safe_load(f)
                    template = prompts['batch_judge_prompt']['instruction_template']

                    prompt = template.format(
                        eval_count=len(batch),
                        batch_criteria='\n'.join(batch_criteria)
                    )

                    print(f"\n{Colors.YELLOW}{'='*80}{Colors.RESET}")
                    print(f"{Colors.YELLOW}BATCH PROMPT (for review):{Colors.RESET}")
                    print(f"{Colors.YELLOW}{'='*80}{Colors.RESET}")
                    print(prompt)
                    print(f"{Colors.YELLOW}{'='*80}{Colors.RESET}\n")

                    # Ask for confirmation
                    response = input(f"{Colors.CYAN}Press Enter to continue or Ctrl+C to cancel...{Colors.RESET}")

                start_time = time.time()
                batch_results = run_batch_eval(temp_dir, batch, timeout=args.timeout)
                duration = time.time() - start_time

                # Process results for this batch
                for spec, eval_name in zip(batch, batch_names):
                    eval_id = spec['eval_id']

                    if eval_id in batch_results:
                        result = batch_results[eval_id]

                        # Save results if successful
                        if result.get('status') != 'error':
                            save_results(result, spec, args.repo, args.output_dir)

                        # Add to results list
                        result_entry = {
                            "eval_name": eval_name,
                            "eval_id": eval_id,
                            "status": result.get("status", "completed"),
                            "score": result.get("score", 0.0),
                            "duration": duration / len(batch),  # Approximate per-eval duration
                            "summary": result.get("summary", result.get("error", ""))
                        }
                        if result.get("error"):
                            result_entry["error"] = result["error"]

                        results.append(result_entry)
                        print_result_line(result_entry)
                    else:
                        # Eval not in results (shouldn't happen, but handle it)
                        result_entry = {
                            "eval_name": eval_name,
                            "eval_id": eval_id,
                            "status": "error",
                            "score": 0.0,
                            "duration": 0,
                            "error": "Result not found in batch output"
                        }
                        results.append(result_entry)
                        print_result_line(result_entry)

                print()  # Blank line between batches

        finally:
            # Cleanup temp directory (with safety checks)
            print(f"{Colors.CYAN}Cleaning up...{Colors.RESET}")
            safe_cleanup_temp_dir(temp_dir)

    elif args.parallel > 1:
        # Run evaluations in parallel
        with ThreadPoolExecutor(max_workers=args.parallel) as executor:
            futures = {
                executor.submit(run_single_eval, eval_file, args.repo, args.timeout, args.output_dir, runtime_inputs): eval_file
                for eval_file in eval_files
            }

            for future in as_completed(futures):
                result = future.result()
                results.append(result)
                print_result_line(result)
    else:
        # Run evaluations sequentially
        for i, eval_file in enumerate(eval_files, 1):
            print(f"{Colors.CYAN}[{i}/{len(eval_files)}]{Colors.RESET} Running {eval_file.name}...")
            result = run_single_eval(eval_file, args.repo, args.timeout, args.output_dir, runtime_inputs)
            results.append(result)
            print_result_line(result)

    # Print summary
    print_summary(results)

    print(f"{Colors.BOLD}{Colors.GREEN}All evaluations complete!{Colors.RESET}\n")


if __name__ == "__main__":
    main()
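The supported interface is the `microeval` console script registered in entry_points.txt, but the functions above can also be driven directly. A minimal sketch, assuming only the package layout shown in this diff (microevals.eval_runner, microevals.eval_registry) and the signatures visible above; this is not an API documented by the package:

# Hypothetical usage sketch; treat the microeval CLI as the real entry point.
from pathlib import Path

from microevals.eval_registry import EvalRegistry
from microevals.eval_runner import print_result_line, print_summary, run_single_eval

registry = EvalRegistry(None)  # None = auto-detect the packaged evals/ directory
results = []
for info in registry.get_by_category("nextjs"):  # dicts with eval_id, name, path, ...
    result = run_single_eval(
        Path(info["path"]),
        repo_url=".",          # local path or git URL, same as --repo
        timeout=300,
        output_dir="results",
        runtime_inputs=None,   # or a dict of --input style overrides
    )
    results.append(result)
    print_result_line(result)

print_summary(results)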