microevals-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- config/judge_system_prompt.yaml +113 -0
- evals/nextjs/001-server-component.yaml +28 -0
- evals/nextjs/002-client-component.yaml +26 -0
- evals/nextjs/003-cookies.yaml +28 -0
- evals/nextjs/010-route-handlers.yaml +30 -0
- evals/nextjs/013-pathname-server.yaml +29 -0
- evals/nextjs/014-server-routing.yaml +28 -0
- evals/nextjs/018-use-router.yaml +28 -0
- evals/nextjs/020_no_use_effect.yaml +30 -0
- evals/nextjs/021-avoid-fetch-in-effect.yaml +28 -0
- evals/nextjs/022_prefer_server_actions.yaml +29 -0
- evals/nextjs/023_avoid_getserversideprops.yaml +27 -0
- evals/nextjs/024_avoid_redundant_usestate.yaml +29 -0
- evals/nextjs/025_no_async_client_components.yaml +29 -0
- evals/nextjs/026_no_serial_await.yaml +26 -0
- evals/nextjs/027-prefer-next-image.yaml +30 -0
- evals/nextjs/027_no_hooks_in_server_components.yaml +29 -0
- evals/nextjs/028-prefer-next-font.yaml +30 -0
- evals/nextjs/028_cookies_headers_context.yaml +29 -0
- evals/nextjs/029_no_catch_redirect.yaml +31 -0
- evals/nextjs/030_app_router_migration.yaml +30 -0
- evals/nextjs/031_no_non_serializable_props.yaml +31 -0
- evals/react/001_missing_useeffect_dependencies.yaml +29 -0
- evals/react/002_incorrect_event_handler.yaml +28 -0
- evals/react/003_missing_return_in_map.yaml +28 -0
- evals/react/004_async_useeffect.yaml +32 -0
- evals/react/005_direct_state_mutation.yaml +30 -0
- evals/react/006_index_as_key.yaml +31 -0
- evals/react/zustand_store_usage.yaml +25 -0
- evals/shadcn/001_cn_utility_function.yaml +31 -0
- evals/shadcn/002_css_variables.yaml +32 -0
- evals/shadcn/003_component_dependencies.yaml +33 -0
- evals/shadcn/004_path_aliases.yaml +32 -0
- evals/shadcn/005_client_directive.yaml +31 -0
- evals/shadcn/006_tailwind_config.yaml +36 -0
- evals/shadcn/007_components_json_config.yaml +35 -0
- evals/supabase/001_client_setup.yaml +47 -0
- evals/supabase/002_auth_context_setup.yaml +43 -0
- evals/supabase/003_auth_flow_implementation.yaml +46 -0
- evals/supabase/004_auth_flow_testing_WIP.yaml +52 -0
- evals/supabase/005_auth_google_oauth.yaml +55 -0
- evals/supabase/007_storage_client_setup.yaml +43 -0
- evals/supabase/008_storage_nextjs_config.yaml +45 -0
- evals/supabase/009_storage_image_upload.yaml +49 -0
- evals/supabase/010_security_rls_enabled.yaml +42 -0
- evals/supabase/011_security_rls_policies.yaml +43 -0
- evals/supabase/012_security_no_service_key_exposed.yaml +49 -0
- evals/supabase/013_database_read_data.yaml +44 -0
- evals/supabase/014_database_create_data.yaml +44 -0
- evals/supabase/015_database_update_data.yaml +47 -0
- evals/supabase/016_database_delete_data.yaml +47 -0
- evals/supabase/017_database_user_scoped_query.yaml +52 -0
- evals/tailwind/001_tailwind_v4_config.yaml +22 -0
- evals/tailwind/002_content_paths.yaml +27 -0
- evals/tailwind/003_no_dynamic_class_construction.yaml +28 -0
- evals/tailwind/tailwind_postcss_config.yaml +24 -0
- evals/typescript/001_unsafe_type_assertions.yaml +39 -0
- evals/typescript/002_missing_null_checks.yaml +33 -0
- evals/vercel/001_vercel_deployment.yaml +19 -0
- evals/vercel/002_environment_variables_handling.yaml +23 -0
- evals/vercel/003_seo_metadata.yaml +33 -0
- microevals/__init__.py +34 -0
- microevals/eval_registry.py +222 -0
- microevals/eval_runner.py +533 -0
- microevals/utils.py +490 -0
- microevals-0.1.0.dist-info/METADATA +575 -0
- microevals-0.1.0.dist-info/RECORD +71 -0
- microevals-0.1.0.dist-info/WHEEL +5 -0
- microevals-0.1.0.dist-info/entry_points.txt +2 -0
- microevals-0.1.0.dist-info/licenses/LICENSE +21 -0
- microevals-0.1.0.dist-info/top_level.txt +1 -0
microevals/utils.py
ADDED
@@ -0,0 +1,490 @@
"""Utility functions for evaluations"""

import subprocess
import json
import tempfile
import shutil
from pathlib import Path
from typing import Dict, Any
from datetime import datetime
import yaml
import urllib.request
import urllib.error
import time
import threading


# Load judge system prompt at module initialization
_judge_prompt_path = Path(__file__).parent.parent / 'config' / 'judge_system_prompt.yaml'
with open(_judge_prompt_path, 'r') as f:
    _judge_prompts = yaml.safe_load(f)
JUDGE_PROMPT_TEMPLATE = _judge_prompts['judge_prompt']['instruction_template']


# Global rate limiter to prevent hitting Claude CLI limits
class RateLimiter:
    """Simple rate limiter with configurable delay between requests."""

    def __init__(self, min_interval: float = 2.0):
        """
        Initialize rate limiter.

        Args:
            min_interval: Minimum seconds between requests (default: 2.0)
        """
        self.min_interval = min_interval
        self.last_request_time = 0
        self.lock = threading.Lock()

    def wait(self):
        """Wait until enough time has passed since last request."""
        with self.lock:
            current_time = time.time()
            time_since_last = current_time - self.last_request_time

            if time_since_last < self.min_interval:
                sleep_time = self.min_interval - time_since_last
                time.sleep(sleep_time)

            self.last_request_time = time.time()


# Global rate limiter instance
_rate_limiter = RateLimiter(min_interval=2.0)

# SAFETY: Marker file to identify our temp directories
_MICROEVAL_TEMP_MARKER = ".microeval_temp_directory"


def safe_cleanup_temp_dir(temp_dir: Path) -> bool:
    """
    Safely remove temp directory with SIX independent safety checks.

    Returns True if deleted, False if any safety check failed.

    SAFETY CHECKS (ALL must pass):
    1. Directory exists and is a Path object
    2. Path is inside system temp directory
    3. Directory name starts with "eval-"
    4. Directory contains our safety marker file
    5. Path is not current working directory
    6. Path is not home directory or parent of home
    """
    # CHECK 0: Valid input
    if not temp_dir or not isinstance(temp_dir, Path):
        return False

    temp_dir = temp_dir.resolve()

    # CHECK 1: Directory exists
    if not temp_dir.exists() or not temp_dir.is_dir():
        return False

    # CHECK 2: Must be inside system temp directory
    system_temp = Path(tempfile.gettempdir()).resolve()
    try:
        temp_dir.relative_to(system_temp)
    except ValueError:
        print(f"⚠️ SAFETY: Refusing to delete {temp_dir} - not in system temp")
        return False

    # CHECK 3: Directory name must start with "eval-"
    if not temp_dir.name.startswith("eval-"):
        print(f"⚠️ SAFETY: Refusing to delete {temp_dir} - missing 'eval-' prefix")
        return False

    # CHECK 4: Must contain our safety marker
    marker_file = temp_dir / _MICROEVAL_TEMP_MARKER
    if not marker_file.exists():
        print(f"⚠️ SAFETY: Refusing to delete {temp_dir} - missing safety marker")
        return False

    # CHECK 5: Must not be current working directory
    try:
        if temp_dir == Path.cwd().resolve():
            print(f"⚠️ SAFETY: Refusing to delete {temp_dir} - is current directory")
            return False
    except:
        pass

    # CHECK 6: Must not be or contain home directory
    try:
        home_dir = Path.home().resolve()
        if temp_dir == home_dir:
            print(f"⚠️ SAFETY: Refusing to delete {temp_dir} - is home directory")
            return False
        # Check if home is inside temp_dir
        try:
            home_dir.relative_to(temp_dir)
            print(f"⚠️ SAFETY: Refusing to delete {temp_dir} - contains home directory")
            return False
        except ValueError:
            pass  # Good - home is not inside temp_dir
    except:
        pass

    # ALL CHECKS PASSED - safe to delete
    try:
        shutil.rmtree(temp_dir)
        return True
    except Exception as e:
        print(f"Warning: Failed to cleanup {temp_dir}: {e}")
        return False


def load_source(source_type: str, location: str, s3_client=None, bucket_name: str = None, base_path: str = None) -> dict:
    """Load evaluation YAML from various sources (url, s3_key, inline, or file)"""
    if source_type == "file":
        if base_path:
            file_path = Path(base_path) / location
        else:
            file_path = Path(location)
        with open(file_path, 'r') as f:
            return yaml.safe_load(f)

    elif source_type == "url":
        with urllib.request.urlopen(location) as response:
            return yaml.safe_load(response.read().decode('utf-8'))

    elif source_type == "s3_key":
        if not s3_client or not bucket_name:
            raise ValueError("S3 client and bucket_name required for s3_key source")
        response = s3_client.get_object(Bucket=bucket_name, Key=location)
        return yaml.safe_load(response['Body'].read().decode('utf-8'))

    elif source_type == "inline":
        return yaml.safe_load(location)

    raise ValueError(f"Unknown source type: {source_type}")


def prepare_repo(repo_url: str) -> Path:
    """
    Prepare repository for evaluation (clone remote or copy local).
    Always returns a MARKED temp directory that can be safely deleted.
    """
    temp_dir = Path(tempfile.mkdtemp(prefix="eval-"))

    try:
        # Detect if local path or remote URL
        if not repo_url.startswith(('http://', 'https://', 'git@')):
            # Local path - copy to temp for read-only safety
            local_path = Path(repo_url).resolve()
            if not local_path.exists():
                raise Exception(f"Local path does not exist: {repo_url}")

            # Copy with ignore patterns
            ignore = shutil.ignore_patterns(
                'node_modules', '.git', '__pycache__', 'venv', '.venv',
                '.next', 'dist', 'build', '.cache', 'coverage', '*.pyc', '.DS_Store'
            )

            for item in local_path.iterdir():
                if item.is_dir():
                    shutil.copytree(item, temp_dir / item.name, ignore=ignore, dirs_exist_ok=True)
                else:
                    shutil.copy2(item, temp_dir)
        else:
            # Remote URL - git clone
            result = subprocess.run(
                ['git', 'clone', '--depth', '1', repo_url, str(temp_dir)],
                capture_output=True, text=True, timeout=60
            )
            if result.returncode != 0:
                raise Exception(f"Failed to clone repository: {result.stderr}")

        # CRITICAL: Create safety marker file
        marker = temp_dir / _MICROEVAL_TEMP_MARKER
        marker.write_text(f"MicroEval temp directory created {datetime.now().isoformat()}\n")

        return temp_dir

    except Exception as e:
        # Clean up if preparation failed
        safe_cleanup_temp_dir(temp_dir)
        raise


# Keep clone_repo as an alias for backward compatibility
clone_repo = prepare_repo


def build_prompt(eval_dict: dict) -> str:
    """Build prompt for evaluator agent with variable substitution support"""

    # Get criteria
    criteria = eval_dict['criteria']
    inputs = eval_dict.get('inputs', {})

    # Perform variable substitution for {variable_name} syntax in criteria
    if inputs:
        for key, value in inputs.items():
            if value is not None:
                # Replace {key} with value in criteria
                placeholder = f"{{{key}}}"
                criteria = criteria.replace(placeholder, str(value))

    # Build inputs section for reference
    inputs_section = ""
    if inputs:
        inputs_section = "\n\nProvided Inputs:\n"
        for key, value in inputs.items():
            if value is not None:
                inputs_section += f"- {key}: {value}\n"

    # Use the judge prompt template from YAML
    prompt = JUDGE_PROMPT_TEMPLATE.format(
        criteria=criteria,
        inputs_section=inputs_section
    )

    return prompt


def run_eval(temp_dir: Path, prompt: str, timeout: int = 300, max_retries: int = 3) -> bool:
    """
    Run Claude CLI evaluator to analyze the work with rate limiting and retries.

    Args:
        temp_dir: Directory to run evaluation in
        prompt: Evaluation prompt
        timeout: Timeout in seconds
        max_retries: Maximum retry attempts for rate limits (default: 3)

    Returns:
        True if evaluation completed, False if timeout
    """
    for attempt in range(max_retries):
        try:
            # Wait for rate limiter (adds 2s delay between requests)
            _rate_limiter.wait()

            result = subprocess.run(
                ['claude', '-p', prompt, '--dangerously-skip-permissions'],
                cwd=str(temp_dir),
                capture_output=True,
                text=True,
                timeout=timeout,
                check=False
            )

            # Check for rate limits
            is_rate_limited = (
                (result.stdout and "Session limit reached" in result.stdout) or
                (result.stderr and "Session limit reached" in result.stderr)
            )

            if is_rate_limited:
                if attempt < max_retries - 1:
                    # Exponential backoff: 10s, 30s, 90s
                    wait_time = 10 * (3 ** attempt)
                    print(f"⚠️ Rate limit hit, waiting {wait_time}s before retry {attempt + 2}/{max_retries}...")
                    time.sleep(wait_time)
                    continue
                else:
                    raise RuntimeError(f"Claude CLI rate limit reached after {max_retries} attempts. Please wait and try again later.")

            # Success - no rate limit
            return True

        except subprocess.TimeoutExpired:
            return False
        except FileNotFoundError:
            raise RuntimeError("Claude CLI not found - ensure it's installed")

    return False


def run_batch_eval(
    temp_dir: Path,
    eval_specs: list,
    timeout: int = None,
    max_retries: int = 3
) -> dict:
    """
    Run multiple evaluations in a single Claude session.

    Args:
        temp_dir: Repository directory to evaluate
        eval_specs: List of eval specifications (loaded YAML dicts)
        timeout: Total timeout (default: 300s per eval)
        max_retries: Max retry attempts for rate limits

    Returns:
        Dict mapping eval_id -> result dict (or error dict if failed)

    Example:
        eval_specs = [
            load_source("file", "evals/nextjs/001.yaml"),
            load_source("file", "evals/react/001.yaml"),
        ]
        results = run_batch_eval(temp_dir, eval_specs)
        # Returns: {
        #   "nextjs_server_component_fetch_001": {...result...},
        #   "react_missing_useeffect_dependencies_001": {...result...}
        # }
    """
    if not eval_specs:
        return {}

    # Calculate timeout: 300s per eval if not specified
    if timeout is None:
        timeout = len(eval_specs) * 300

    # Build batch criteria section
    batch_criteria = []
    for i, spec in enumerate(eval_specs, 1):
        eval_id = spec['eval_id']
        criteria = spec['criteria']

        # Perform input substitution
        inputs = spec.get('inputs', {})
        if inputs:
            for key, value in inputs.items():
                if value is not None:
                    criteria = criteria.replace(f"{{{key}}}", str(value))

        batch_criteria.append(f"""
{'='*80}
EVALUATION {i}/{len(eval_specs)}
{'='*80}
EVAL ID: {eval_id}
FILENAME: eval_result_{eval_id}.json

CRITERIA:
{criteria}
""")

    # Load batch prompt template
    judge_prompt_path = Path(__file__).parent.parent / 'config' / 'judge_system_prompt.yaml'
    with open(judge_prompt_path, 'r') as f:
        prompts = yaml.safe_load(f)
    template = prompts['batch_judge_prompt']['instruction_template']

    # Build final prompt
    prompt = template.format(
        eval_count=len(eval_specs),
        batch_criteria='\n'.join(batch_criteria)
    )

    # Run Claude (with rate limiting and retries)
    for attempt in range(max_retries):
        try:
            _rate_limiter.wait()

            result = subprocess.run(
                ['claude', '-p', prompt, '--dangerously-skip-permissions'],
                cwd=str(temp_dir),
                capture_output=True,
                text=True,
                timeout=timeout,
                check=False
            )

            # Check for rate limits
            is_rate_limited = (
                (result.stdout and "Session limit reached" in result.stdout) or
                (result.stderr and "Session limit reached" in result.stderr)
            )

            if is_rate_limited and attempt < max_retries - 1:
                wait_time = 10 * (3 ** attempt)
                print(f"⚠️ Rate limit hit, waiting {wait_time}s before retry {attempt + 2}/{max_retries}...")
                time.sleep(wait_time)
                continue
            elif is_rate_limited:
                raise RuntimeError(f"Claude CLI rate limit reached after {max_retries} attempts")

            break

        except subprocess.TimeoutExpired:
            # Timeout - but some evals may have completed, collect what we can
            print(f"⚠️ Batch evaluation timed out after {timeout}s. Collecting partial results...")
            break
        except FileNotFoundError:
            raise RuntimeError("Claude CLI not found - ensure it's installed")

    # Collect results for each eval
    results = {}
    for spec in eval_specs:
        eval_id = spec['eval_id']
        result_file = temp_dir / f"eval_result_{eval_id}.json"

        if result_file.exists():
            try:
                with open(result_file, 'r') as f:
                    content = f.read()
                start = content.find('{')
                end = content.rfind('}')
                if start != -1 and end != -1:
                    json_str = content[start:end+1]
                    results[eval_id] = json.loads(json_str)
                else:
                    results[eval_id] = {
                        "status": "error",
                        "score": 0.0,
                        "error": "Invalid JSON format in result file"
                    }
            except Exception as e:
                results[eval_id] = {
                    "status": "error",
                    "score": 0.0,
                    "error": f"Failed to parse result: {str(e)}"
                }
        else:
            results[eval_id] = {
                "status": "error",
                "score": 0.0,
                "error": "Result file not created"
            }

    return results


def read_result(temp_dir: Path) -> dict:
    """Read evaluation result JSON from temp directory"""
    result_file = temp_dir / 'eval_result.json'
    if not result_file.exists():
        raise FileNotFoundError("No eval_result.json found")

    with open(result_file, 'r') as f:
        content = f.read()

    # Try to find JSON object in the content (handle extra text before/after)
    # Look for the first { and last }
    start = content.find('{')
    end = content.rfind('}')

    if start == -1 or end == -1 or start >= end:
        raise ValueError(f"No valid JSON object found in eval_result.json. Content: {content[:200]}")

    json_str = content[start:end+1]

    try:
        return json.loads(json_str)
    except json.JSONDecodeError as e:
        raise ValueError(f"Failed to parse JSON: {e}. Content: {json_str[:200]}")


def save_results(result: dict, eval_spec: dict, repo_url: str, output_dir: str = "results") -> Path:
    """Save results to file"""
    output_path = Path(output_dir)
    output_path.mkdir(exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{eval_spec['eval_id']}_{timestamp}.json"
    output_file = output_path / filename

    result['metadata'] = {
        'eval_id': eval_spec['eval_id'],
        'eval_name': eval_spec['name'],
        'repo_url': repo_url,
        'timestamp': datetime.now().isoformat(),
        'evaluator': 'claude'
    }

    with open(output_file, 'w') as f:
        json.dump(result, f, indent=2)

    return output_file