microevals-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. config/judge_system_prompt.yaml +113 -0
  2. evals/nextjs/001-server-component.yaml +28 -0
  3. evals/nextjs/002-client-component.yaml +26 -0
  4. evals/nextjs/003-cookies.yaml +28 -0
  5. evals/nextjs/010-route-handlers.yaml +30 -0
  6. evals/nextjs/013-pathname-server.yaml +29 -0
  7. evals/nextjs/014-server-routing.yaml +28 -0
  8. evals/nextjs/018-use-router.yaml +28 -0
  9. evals/nextjs/020_no_use_effect.yaml +30 -0
  10. evals/nextjs/021-avoid-fetch-in-effect.yaml +28 -0
  11. evals/nextjs/022_prefer_server_actions.yaml +29 -0
  12. evals/nextjs/023_avoid_getserversideprops.yaml +27 -0
  13. evals/nextjs/024_avoid_redundant_usestate.yaml +29 -0
  14. evals/nextjs/025_no_async_client_components.yaml +29 -0
  15. evals/nextjs/026_no_serial_await.yaml +26 -0
  16. evals/nextjs/027-prefer-next-image.yaml +30 -0
  17. evals/nextjs/027_no_hooks_in_server_components.yaml +29 -0
  18. evals/nextjs/028-prefer-next-font.yaml +30 -0
  19. evals/nextjs/028_cookies_headers_context.yaml +29 -0
  20. evals/nextjs/029_no_catch_redirect.yaml +31 -0
  21. evals/nextjs/030_app_router_migration.yaml +30 -0
  22. evals/nextjs/031_no_non_serializable_props.yaml +31 -0
  23. evals/react/001_missing_useeffect_dependencies.yaml +29 -0
  24. evals/react/002_incorrect_event_handler.yaml +28 -0
  25. evals/react/003_missing_return_in_map.yaml +28 -0
  26. evals/react/004_async_useeffect.yaml +32 -0
  27. evals/react/005_direct_state_mutation.yaml +30 -0
  28. evals/react/006_index_as_key.yaml +31 -0
  29. evals/react/zustand_store_usage.yaml +25 -0
  30. evals/shadcn/001_cn_utility_function.yaml +31 -0
  31. evals/shadcn/002_css_variables.yaml +32 -0
  32. evals/shadcn/003_component_dependencies.yaml +33 -0
  33. evals/shadcn/004_path_aliases.yaml +32 -0
  34. evals/shadcn/005_client_directive.yaml +31 -0
  35. evals/shadcn/006_tailwind_config.yaml +36 -0
  36. evals/shadcn/007_components_json_config.yaml +35 -0
  37. evals/supabase/001_client_setup.yaml +47 -0
  38. evals/supabase/002_auth_context_setup.yaml +43 -0
  39. evals/supabase/003_auth_flow_implementation.yaml +46 -0
  40. evals/supabase/004_auth_flow_testing_WIP.yaml +52 -0
  41. evals/supabase/005_auth_google_oauth.yaml +55 -0
  42. evals/supabase/007_storage_client_setup.yaml +43 -0
  43. evals/supabase/008_storage_nextjs_config.yaml +45 -0
  44. evals/supabase/009_storage_image_upload.yaml +49 -0
  45. evals/supabase/010_security_rls_enabled.yaml +42 -0
  46. evals/supabase/011_security_rls_policies.yaml +43 -0
  47. evals/supabase/012_security_no_service_key_exposed.yaml +49 -0
  48. evals/supabase/013_database_read_data.yaml +44 -0
  49. evals/supabase/014_database_create_data.yaml +44 -0
  50. evals/supabase/015_database_update_data.yaml +47 -0
  51. evals/supabase/016_database_delete_data.yaml +47 -0
  52. evals/supabase/017_database_user_scoped_query.yaml +52 -0
  53. evals/tailwind/001_tailwind_v4_config.yaml +22 -0
  54. evals/tailwind/002_content_paths.yaml +27 -0
  55. evals/tailwind/003_no_dynamic_class_construction.yaml +28 -0
  56. evals/tailwind/tailwind_postcss_config.yaml +24 -0
  57. evals/typescript/001_unsafe_type_assertions.yaml +39 -0
  58. evals/typescript/002_missing_null_checks.yaml +33 -0
  59. evals/vercel/001_vercel_deployment.yaml +19 -0
  60. evals/vercel/002_environment_variables_handling.yaml +23 -0
  61. evals/vercel/003_seo_metadata.yaml +33 -0
  62. microevals/__init__.py +34 -0
  63. microevals/eval_registry.py +222 -0
  64. microevals/eval_runner.py +533 -0
  65. microevals/utils.py +490 -0
  66. microevals-0.1.0.dist-info/METADATA +575 -0
  67. microevals-0.1.0.dist-info/RECORD +71 -0
  68. microevals-0.1.0.dist-info/WHEEL +5 -0
  69. microevals-0.1.0.dist-info/entry_points.txt +2 -0
  70. microevals-0.1.0.dist-info/licenses/LICENSE +21 -0
  71. microevals-0.1.0.dist-info/top_level.txt +1 -0
microevals/utils.py ADDED
@@ -0,0 +1,490 @@
"""Utility functions for evaluations"""

import subprocess
import json
import tempfile
import shutil
from pathlib import Path
from typing import Dict, Any
from datetime import datetime
import yaml
import urllib.request
import urllib.error
import time
import threading


# Load judge system prompt at module initialization
_judge_prompt_path = Path(__file__).parent.parent / 'config' / 'judge_system_prompt.yaml'
with open(_judge_prompt_path, 'r') as f:
    _judge_prompts = yaml.safe_load(f)
JUDGE_PROMPT_TEMPLATE = _judge_prompts['judge_prompt']['instruction_template']


# Global rate limiter to prevent hitting Claude CLI limits
class RateLimiter:
    """Simple rate limiter with configurable delay between requests."""

    def __init__(self, min_interval: float = 2.0):
        """
        Initialize rate limiter.

        Args:
            min_interval: Minimum seconds between requests (default: 2.0)
        """
        self.min_interval = min_interval
        self.last_request_time = 0
        self.lock = threading.Lock()

    def wait(self):
        """Wait until enough time has passed since last request."""
        with self.lock:
            current_time = time.time()
            time_since_last = current_time - self.last_request_time

            if time_since_last < self.min_interval:
                sleep_time = self.min_interval - time_since_last
                time.sleep(sleep_time)

            self.last_request_time = time.time()


# Global rate limiter instance
_rate_limiter = RateLimiter(min_interval=2.0)

# SAFETY: Marker file to identify our temp directories
_MICROEVAL_TEMP_MARKER = ".microeval_temp_directory"


def safe_cleanup_temp_dir(temp_dir: Path) -> bool:
    """
    Safely remove temp directory with SIX independent safety checks.

    Returns True if deleted, False if any safety check failed.

    SAFETY CHECKS (ALL must pass):
    1. Directory exists and is a Path object
    2. Path is inside system temp directory
    3. Directory name starts with "eval-"
    4. Directory contains our safety marker file
    5. Path is not current working directory
    6. Path is not home directory or parent of home
    """
    # CHECK 0: Valid input
    if not temp_dir or not isinstance(temp_dir, Path):
        return False

    temp_dir = temp_dir.resolve()

    # CHECK 1: Directory exists
    if not temp_dir.exists() or not temp_dir.is_dir():
        return False

    # CHECK 2: Must be inside system temp directory
    system_temp = Path(tempfile.gettempdir()).resolve()
    try:
        temp_dir.relative_to(system_temp)
    except ValueError:
        print(f"⚠️ SAFETY: Refusing to delete {temp_dir} - not in system temp")
        return False

    # CHECK 3: Directory name must start with "eval-"
    if not temp_dir.name.startswith("eval-"):
        print(f"⚠️ SAFETY: Refusing to delete {temp_dir} - missing 'eval-' prefix")
        return False

    # CHECK 4: Must contain our safety marker
    marker_file = temp_dir / _MICROEVAL_TEMP_MARKER
    if not marker_file.exists():
        print(f"⚠️ SAFETY: Refusing to delete {temp_dir} - missing safety marker")
        return False

    # CHECK 5: Must not be current working directory
    try:
        if temp_dir == Path.cwd().resolve():
            print(f"⚠️ SAFETY: Refusing to delete {temp_dir} - is current directory")
            return False
    except:
        pass

    # CHECK 6: Must not be or contain home directory
    try:
        home_dir = Path.home().resolve()
        if temp_dir == home_dir:
            print(f"⚠️ SAFETY: Refusing to delete {temp_dir} - is home directory")
            return False
        # Check if home is inside temp_dir
        try:
            home_dir.relative_to(temp_dir)
            print(f"⚠️ SAFETY: Refusing to delete {temp_dir} - contains home directory")
            return False
        except ValueError:
            pass  # Good - home is not inside temp_dir
    except:
        pass

    # ALL CHECKS PASSED - safe to delete
    try:
        shutil.rmtree(temp_dir)
        return True
    except Exception as e:
        print(f"Warning: Failed to cleanup {temp_dir}: {e}")
        return False


def load_source(source_type: str, location: str, s3_client=None, bucket_name: str = None, base_path: str = None) -> dict:
    """Load evaluation YAML from various sources (url, s3_key, inline, or file)"""
    if source_type == "file":
        if base_path:
            file_path = Path(base_path) / location
        else:
            file_path = Path(location)
        with open(file_path, 'r') as f:
            return yaml.safe_load(f)

    elif source_type == "url":
        with urllib.request.urlopen(location) as response:
            return yaml.safe_load(response.read().decode('utf-8'))

    elif source_type == "s3_key":
        if not s3_client or not bucket_name:
            raise ValueError("S3 client and bucket_name required for s3_key source")
        response = s3_client.get_object(Bucket=bucket_name, Key=location)
        return yaml.safe_load(response['Body'].read().decode('utf-8'))

    elif source_type == "inline":
        return yaml.safe_load(location)

    raise ValueError(f"Unknown source type: {source_type}")


def prepare_repo(repo_url: str) -> Path:
    """
    Prepare repository for evaluation (clone remote or copy local).
    Always returns a MARKED temp directory that can be safely deleted.
    """
    temp_dir = Path(tempfile.mkdtemp(prefix="eval-"))

    try:
        # Detect if local path or remote URL
        if not repo_url.startswith(('http://', 'https://', 'git@')):
            # Local path - copy to temp for read-only safety
            local_path = Path(repo_url).resolve()
            if not local_path.exists():
                raise Exception(f"Local path does not exist: {repo_url}")

            # Copy with ignore patterns
            ignore = shutil.ignore_patterns(
                'node_modules', '.git', '__pycache__', 'venv', '.venv',
                '.next', 'dist', 'build', '.cache', 'coverage', '*.pyc', '.DS_Store'
            )

            for item in local_path.iterdir():
                if item.is_dir():
                    shutil.copytree(item, temp_dir / item.name, ignore=ignore, dirs_exist_ok=True)
                else:
                    shutil.copy2(item, temp_dir)
        else:
            # Remote URL - git clone
            result = subprocess.run(
                ['git', 'clone', '--depth', '1', repo_url, str(temp_dir)],
                capture_output=True, text=True, timeout=60
            )
            if result.returncode != 0:
                raise Exception(f"Failed to clone repository: {result.stderr}")

        # CRITICAL: Create safety marker file
        marker = temp_dir / _MICROEVAL_TEMP_MARKER
        marker.write_text(f"MicroEval temp directory created {datetime.now().isoformat()}\n")

        return temp_dir

    except Exception as e:
        # Clean up if preparation failed
        safe_cleanup_temp_dir(temp_dir)
        raise


# Keep clone_repo as an alias for backward compatibility
clone_repo = prepare_repo


def build_prompt(eval_dict: dict) -> str:
    """Build prompt for evaluator agent with variable substitution support"""

    # Get criteria
    criteria = eval_dict['criteria']
    inputs = eval_dict.get('inputs', {})

    # Perform variable substitution for {variable_name} syntax in criteria
    if inputs:
        for key, value in inputs.items():
            if value is not None:
                # Replace {key} with value in criteria
                placeholder = f"{{{key}}}"
                criteria = criteria.replace(placeholder, str(value))

    # Build inputs section for reference
    inputs_section = ""
    if inputs:
        inputs_section = "\n\nProvided Inputs:\n"
        for key, value in inputs.items():
            if value is not None:
                inputs_section += f"- {key}: {value}\n"

    # Use the judge prompt template from YAML
    prompt = JUDGE_PROMPT_TEMPLATE.format(
        criteria=criteria,
        inputs_section=inputs_section
    )

    return prompt


def run_eval(temp_dir: Path, prompt: str, timeout: int = 300, max_retries: int = 3) -> bool:
    """
    Run Claude CLI evaluator to analyze the work with rate limiting and retries.

    Args:
        temp_dir: Directory to run evaluation in
        prompt: Evaluation prompt
        timeout: Timeout in seconds
        max_retries: Maximum retry attempts for rate limits (default: 3)

    Returns:
        True if evaluation completed, False if timeout
    """
    for attempt in range(max_retries):
        try:
            # Wait for rate limiter (adds 2s delay between requests)
            _rate_limiter.wait()

            result = subprocess.run(
                ['claude', '-p', prompt, '--dangerously-skip-permissions'],
                cwd=str(temp_dir),
                capture_output=True,
                text=True,
                timeout=timeout,
                check=False
            )

            # Check for rate limits
            is_rate_limited = (
                (result.stdout and "Session limit reached" in result.stdout) or
                (result.stderr and "Session limit reached" in result.stderr)
            )

            if is_rate_limited:
                if attempt < max_retries - 1:
                    # Exponential backoff: 10s, 30s, 90s
                    wait_time = 10 * (3 ** attempt)
                    print(f"⚠️ Rate limit hit, waiting {wait_time}s before retry {attempt + 2}/{max_retries}...")
                    time.sleep(wait_time)
                    continue
                else:
                    raise RuntimeError(f"Claude CLI rate limit reached after {max_retries} attempts. Please wait and try again later.")

            # Success - no rate limit
            return True

        except subprocess.TimeoutExpired:
            return False
        except FileNotFoundError:
            raise RuntimeError("Claude CLI not found - ensure it's installed")

    return False


def run_batch_eval(
    temp_dir: Path,
    eval_specs: list,
    timeout: int = None,
    max_retries: int = 3
) -> dict:
    """
    Run multiple evaluations in a single Claude session.

    Args:
        temp_dir: Repository directory to evaluate
        eval_specs: List of eval specifications (loaded YAML dicts)
        timeout: Total timeout (default: 300s per eval)
        max_retries: Max retry attempts for rate limits

    Returns:
        Dict mapping eval_id -> result dict (or error dict if failed)

    Example:
        eval_specs = [
            load_source("file", "evals/nextjs/001.yaml"),
            load_source("file", "evals/react/001.yaml"),
        ]
        results = run_batch_eval(temp_dir, eval_specs)
        # Returns: {
        #     "nextjs_server_component_fetch_001": {...result...},
        #     "react_missing_useeffect_dependencies_001": {...result...}
        # }
    """
    if not eval_specs:
        return {}

    # Calculate timeout: 300s per eval if not specified
    if timeout is None:
        timeout = len(eval_specs) * 300

    # Build batch criteria section
    batch_criteria = []
    for i, spec in enumerate(eval_specs, 1):
        eval_id = spec['eval_id']
        criteria = spec['criteria']

        # Perform input substitution
        inputs = spec.get('inputs', {})
        if inputs:
            for key, value in inputs.items():
                if value is not None:
                    criteria = criteria.replace(f"{{{key}}}", str(value))

        batch_criteria.append(f"""
{'='*80}
EVALUATION {i}/{len(eval_specs)}
{'='*80}
EVAL ID: {eval_id}
FILENAME: eval_result_{eval_id}.json

CRITERIA:
{criteria}
""")

    # Load batch prompt template
    judge_prompt_path = Path(__file__).parent.parent / 'config' / 'judge_system_prompt.yaml'
    with open(judge_prompt_path, 'r') as f:
        prompts = yaml.safe_load(f)
    template = prompts['batch_judge_prompt']['instruction_template']

    # Build final prompt
    prompt = template.format(
        eval_count=len(eval_specs),
        batch_criteria='\n'.join(batch_criteria)
    )

    # Run Claude (with rate limiting and retries)
    for attempt in range(max_retries):
        try:
            _rate_limiter.wait()

            result = subprocess.run(
                ['claude', '-p', prompt, '--dangerously-skip-permissions'],
                cwd=str(temp_dir),
                capture_output=True,
                text=True,
                timeout=timeout,
                check=False
            )

            # Check for rate limits
            is_rate_limited = (
                (result.stdout and "Session limit reached" in result.stdout) or
                (result.stderr and "Session limit reached" in result.stderr)
            )

            if is_rate_limited and attempt < max_retries - 1:
                wait_time = 10 * (3 ** attempt)
                print(f"⚠️ Rate limit hit, waiting {wait_time}s before retry {attempt + 2}/{max_retries}...")
                time.sleep(wait_time)
                continue
            elif is_rate_limited:
                raise RuntimeError(f"Claude CLI rate limit reached after {max_retries} attempts")

            break

        except subprocess.TimeoutExpired:
            # Timeout - but some evals may have completed, collect what we can
            print(f"⚠️ Batch evaluation timed out after {timeout}s. Collecting partial results...")
            break
        except FileNotFoundError:
            raise RuntimeError("Claude CLI not found - ensure it's installed")

    # Collect results for each eval
    results = {}
    for spec in eval_specs:
        eval_id = spec['eval_id']
        result_file = temp_dir / f"eval_result_{eval_id}.json"

        if result_file.exists():
            try:
                with open(result_file, 'r') as f:
                    content = f.read()
                start = content.find('{')
                end = content.rfind('}')
                if start != -1 and end != -1:
                    json_str = content[start:end+1]
                    results[eval_id] = json.loads(json_str)
                else:
                    results[eval_id] = {
                        "status": "error",
                        "score": 0.0,
                        "error": "Invalid JSON format in result file"
                    }
            except Exception as e:
                results[eval_id] = {
                    "status": "error",
                    "score": 0.0,
                    "error": f"Failed to parse result: {str(e)}"
                }
        else:
            results[eval_id] = {
                "status": "error",
                "score": 0.0,
                "error": "Result file not created"
            }

    return results


def read_result(temp_dir: Path) -> dict:
    """Read evaluation result JSON from temp directory"""
    result_file = temp_dir / 'eval_result.json'
    if not result_file.exists():
        raise FileNotFoundError("No eval_result.json found")

    with open(result_file, 'r') as f:
        content = f.read()

    # Try to find JSON object in the content (handle extra text before/after)
    # Look for the first { and last }
    start = content.find('{')
    end = content.rfind('}')

    if start == -1 or end == -1 or start >= end:
        raise ValueError(f"No valid JSON object found in eval_result.json. Content: {content[:200]}")

    json_str = content[start:end+1]

    try:
        return json.loads(json_str)
    except json.JSONDecodeError as e:
        raise ValueError(f"Failed to parse JSON: {e}. Content: {json_str[:200]}")


def save_results(result: dict, eval_spec: dict, repo_url: str, output_dir: str = "results") -> Path:
    """Save results to file"""
    output_path = Path(output_dir)
    output_path.mkdir(exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{eval_spec['eval_id']}_{timestamp}.json"
    output_file = output_path / filename

    result['metadata'] = {
        'eval_id': eval_spec['eval_id'],
        'eval_name': eval_spec['name'],
        'repo_url': repo_url,
        'timestamp': datetime.now().isoformat(),
        'evaluator': 'claude'
    }

    with open(output_file, 'w') as f:
        json.dump(result, f, indent=2)

    return output_file
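
Taken together, these helpers form a single-evaluation pipeline: prepare an isolated working copy of the target repository, build a judge prompt from an eval spec, run the Claude CLI judge inside that copy, read and persist the JSON verdict, and finally clean up the marked temp directory. The sketch below is illustrative only and is not part of the package; it assumes the Claude CLI is installed, that ./my-app is a hypothetical local project, and that the referenced eval spec exposes the eval_id, name, and criteria fields these helpers expect.

# Hypothetical driver script (not shipped with microevals) showing one way
# the utilities above could be combined for a single evaluation.
from microevals.utils import (
    prepare_repo, load_source, build_prompt, run_eval,
    read_result, save_results, safe_cleanup_temp_dir,
)

# Assumed inputs: a local project and one eval spec shaped like those in evals/.
eval_spec = load_source("file", "evals/nextjs/001-server-component.yaml")
temp_dir = prepare_repo("./my-app")  # local path is copied into a marked eval-* temp dir

try:
    prompt = build_prompt(eval_spec)                  # substitutes inputs into the criteria
    if run_eval(temp_dir, prompt, timeout=300):       # returns False on timeout
        result = read_result(temp_dir)                # parses eval_result.json written by the judge
        output_file = save_results(result, eval_spec, repo_url="./my-app")
        print(f"Result written to {output_file}")
    else:
        print("Evaluation timed out")
finally:
    safe_cleanup_temp_dir(temp_dir)                   # refuses anything outside the marked temp dir

For several specs against the same repository, run_batch_eval(temp_dir, eval_specs) runs them in one Claude session and returns a dict keyed by eval_id.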