microevals 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. config/judge_system_prompt.yaml +113 -0
  2. evals/nextjs/001-server-component.yaml +28 -0
  3. evals/nextjs/002-client-component.yaml +26 -0
  4. evals/nextjs/003-cookies.yaml +28 -0
  5. evals/nextjs/010-route-handlers.yaml +30 -0
  6. evals/nextjs/013-pathname-server.yaml +29 -0
  7. evals/nextjs/014-server-routing.yaml +28 -0
  8. evals/nextjs/018-use-router.yaml +28 -0
  9. evals/nextjs/020_no_use_effect.yaml +30 -0
  10. evals/nextjs/021-avoid-fetch-in-effect.yaml +28 -0
  11. evals/nextjs/022_prefer_server_actions.yaml +29 -0
  12. evals/nextjs/023_avoid_getserversideprops.yaml +27 -0
  13. evals/nextjs/024_avoid_redundant_usestate.yaml +29 -0
  14. evals/nextjs/025_no_async_client_components.yaml +29 -0
  15. evals/nextjs/026_no_serial_await.yaml +26 -0
  16. evals/nextjs/027-prefer-next-image.yaml +30 -0
  17. evals/nextjs/027_no_hooks_in_server_components.yaml +29 -0
  18. evals/nextjs/028-prefer-next-font.yaml +30 -0
  19. evals/nextjs/028_cookies_headers_context.yaml +29 -0
  20. evals/nextjs/029_no_catch_redirect.yaml +31 -0
  21. evals/nextjs/030_app_router_migration.yaml +30 -0
  22. evals/nextjs/031_no_non_serializable_props.yaml +31 -0
  23. evals/react/001_missing_useeffect_dependencies.yaml +29 -0
  24. evals/react/002_incorrect_event_handler.yaml +28 -0
  25. evals/react/003_missing_return_in_map.yaml +28 -0
  26. evals/react/004_async_useeffect.yaml +32 -0
  27. evals/react/005_direct_state_mutation.yaml +30 -0
  28. evals/react/006_index_as_key.yaml +31 -0
  29. evals/react/zustand_store_usage.yaml +25 -0
  30. evals/shadcn/001_cn_utility_function.yaml +31 -0
  31. evals/shadcn/002_css_variables.yaml +32 -0
  32. evals/shadcn/003_component_dependencies.yaml +33 -0
  33. evals/shadcn/004_path_aliases.yaml +32 -0
  34. evals/shadcn/005_client_directive.yaml +31 -0
  35. evals/shadcn/006_tailwind_config.yaml +36 -0
  36. evals/shadcn/007_components_json_config.yaml +35 -0
  37. evals/supabase/001_client_setup.yaml +47 -0
  38. evals/supabase/002_auth_context_setup.yaml +43 -0
  39. evals/supabase/003_auth_flow_implementation.yaml +46 -0
  40. evals/supabase/004_auth_flow_testing_WIP.yaml +52 -0
  41. evals/supabase/005_auth_google_oauth.yaml +55 -0
  42. evals/supabase/007_storage_client_setup.yaml +43 -0
  43. evals/supabase/008_storage_nextjs_config.yaml +45 -0
  44. evals/supabase/009_storage_image_upload.yaml +49 -0
  45. evals/supabase/010_security_rls_enabled.yaml +42 -0
  46. evals/supabase/011_security_rls_policies.yaml +43 -0
  47. evals/supabase/012_security_no_service_key_exposed.yaml +49 -0
  48. evals/supabase/013_database_read_data.yaml +44 -0
  49. evals/supabase/014_database_create_data.yaml +44 -0
  50. evals/supabase/015_database_update_data.yaml +47 -0
  51. evals/supabase/016_database_delete_data.yaml +47 -0
  52. evals/supabase/017_database_user_scoped_query.yaml +52 -0
  53. evals/tailwind/001_tailwind_v4_config.yaml +22 -0
  54. evals/tailwind/002_content_paths.yaml +27 -0
  55. evals/tailwind/003_no_dynamic_class_construction.yaml +28 -0
  56. evals/tailwind/tailwind_postcss_config.yaml +24 -0
  57. evals/typescript/001_unsafe_type_assertions.yaml +39 -0
  58. evals/typescript/002_missing_null_checks.yaml +33 -0
  59. evals/vercel/001_vercel_deployment.yaml +19 -0
  60. evals/vercel/002_environment_variables_handling.yaml +23 -0
  61. evals/vercel/003_seo_metadata.yaml +33 -0
  62. microevals/__init__.py +34 -0
  63. microevals/eval_registry.py +222 -0
  64. microevals/eval_runner.py +533 -0
  65. microevals/utils.py +490 -0
  66. microevals-0.1.0.dist-info/METADATA +575 -0
  67. microevals-0.1.0.dist-info/RECORD +71 -0
  68. microevals-0.1.0.dist-info/WHEEL +5 -0
  69. microevals-0.1.0.dist-info/entry_points.txt +2 -0
  70. microevals-0.1.0.dist-info/licenses/LICENSE +21 -0
  71. microevals-0.1.0.dist-info/top_level.txt +1 -0
evals/tailwind/001_tailwind_v4_config.yaml ADDED
@@ -0,0 +1,22 @@
+ eval_id: nextjs_tailwind_v4_config
+ name: "Tailwind CSS v4 Configuration"
+ description: "Verify the agent correctly recognizes and configures Tailwind CSS v4 syntax"
+ category: tailwind
+
+ criteria: |
+   Check Tailwind CSS v4 configuration.
+
+   REQUIREMENTS:
+   1. globals.css contains: @import "tailwindcss";
+   2. globals.css does NOT contain old v3 directives (@tailwind base/components/utilities)
+   3. postcss.config has: plugins: ["@tailwindcss/postcss"]
+   4. package.json has: "tailwindcss": "^4"
+
+   EVALUATION:
+   - If no Tailwind v4: score -1.0 (N/A)
+   - If all requirements met: score 1.0
+   - If using old v3 syntax: score 0.0
+
+ # Optional: Custom inputs for this specific eval
+ inputs:
+   deployment_url: "https://agentic-9d1c6c3d.vercel.app/"
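The criteria above boil down to three file checks. Judging in this package appears to be prompt-driven (note config/judge_system_prompt.yaml in the file list), so the following Python sketch is purely illustrative; `looks_like_tailwind_v4` and the `app/globals.css` location are assumptions, not package API:

```python
# Hypothetical static approximation of the Tailwind v4 checks above.
import json
from pathlib import Path

def looks_like_tailwind_v4(project: Path) -> bool:
    css_file = project / "app" / "globals.css"   # assumed location
    pkg_file = project / "package.json"
    if not css_file.exists() or not pkg_file.exists():
        return False                             # would map to -1.0 (N/A)
    css = css_file.read_text()
    pkg = json.loads(pkg_file.read_text())
    deps = {**pkg.get("dependencies", {}), **pkg.get("devDependencies", {})}
    return (
        '@import "tailwindcss"' in css           # v4 import present
        and "@tailwind" not in css               # no v3 directives remain
        and deps.get("tailwindcss", "").startswith("^4")
    )
```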
evals/tailwind/002_content_paths.yaml ADDED
@@ -0,0 +1,27 @@
+ eval_id: tailwind_content_paths_002
+ name: "Correct Content Paths in Tailwind Config"
+ description: "Checks if tailwind.config content paths include all files using Tailwind classes"
+ category: tailwind
+
+ criteria: |
+   Verify content paths in tailwind.config include all files with Tailwind classes.
+
+   ANTI-PATTERN:
+   - content: ['./app/**/*.{js,ts,jsx,tsx}'] but components are in /components
+   - Missing file extensions (.mdx, .vue, etc.)
+   - Hardcoded paths that don't match actual file structure
+
+   WHY IT'S WRONG:
+   - Classes in files not matched by content paths get purged in production
+   - Styles work in dev but break in production build
+   - Silent failure - no error, just missing styles
+
+   CORRECT:
+   - content: ['./app/**/*.{js,ts,jsx,tsx,mdx}', './components/**/*.{js,ts,jsx,tsx}']
+   - Include all directories and extensions where Tailwind is used
+
+   SCORING:
+   - Score 1.0 (PASS): Tailwind config exists AND content paths correctly include all files with Tailwind classes
+   - Score 0.0 (FAIL): Tailwind config exists BUT files with classes are outside content paths (anti-pattern)
+   - Score -1.0 (N/A): No Tailwind config found in project
+
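The purge failure described above is also checkable mechanically: expand the config's content globs and list class-bearing files they miss. A rough sketch, where `detect_uncovered_files`, the hand-rolled brace expansion, and the `className=` heuristic are all assumptions:

```python
# Hypothetical check: expand content globs by hand, then find .tsx files
# that use className= but are matched by none of the globs.
from pathlib import Path

def detect_uncovered_files(project: Path, content_globs: list[str]) -> list[Path]:
    covered: set[Path] = set()
    for pattern in content_globs:
        base, _, exts = pattern.partition("{")        # './app/**/*.' + 'js,ts,jsx,tsx}'
        for ext in exts.rstrip("}").split(","):
            covered.update(project.glob(base.lstrip("./") + ext))
    return [
        f for f in project.rglob("*.tsx")
        if f not in covered and "className=" in f.read_text(errors="ignore")
    ]

# detect_uncovered_files(Path("."), ["./app/**/*.{js,ts,jsx,tsx}"]) would flag
# components/Button.tsx -- exactly the anti-pattern described above.
```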
evals/tailwind/003_no_dynamic_class_construction.yaml ADDED
@@ -0,0 +1,28 @@
+ eval_id: tailwind_no_dynamic_class_construction_003
+ name: "No Dynamic Tailwind Class Construction"
+ description: "Checks for dynamic class string construction that breaks Tailwind's purging/scanning"
+ category: tailwind
+
+ criteria: |
+   Detect dynamic class string construction that breaks Tailwind purging.
+
+   ANTI-PATTERN:
+   - className={'text-' + color} or className={`bg-${color}-500`}
+   - Building class names from variables/props
+   - Concatenating partial class names
+
+   WHY IT'S WRONG:
+   - Tailwind scans for complete class strings at build time
+   - Dynamically constructed classes get purged
+   - Works in dev, breaks in production
+
+   CORRECT:
+   - Complete class strings: className={color === 'red' ? 'text-red-500' : 'text-blue-500'}
+   - Use safelist in config for truly dynamic classes
+   - Or use inline styles for dynamic values
+
+   SCORING:
+   - Score 1.0 (PASS): No dynamic class construction found - all Tailwind classes are complete strings
+   - Score 0.0 (FAIL): Dynamic class construction found (template literals or concatenation building classes)
+   - Score -1.0 (N/A): No Tailwind usage found in project
+
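A regex can approximate the scan these criteria describe, as in the sketch below; `DYNAMIC_CLASS` is a hypothetical heuristic, not the package's judge:

```python
# Hypothetical heuristic: flag template literals or concatenation that build
# partial Tailwind class names inside className={...}.
import re

DYNAMIC_CLASS = re.compile(
    r"""className=\{\s*
        (?: `[^`]*\$\{[^}]*\}[^`]*`     # template literal: `bg-${color}-500`
          | ['"][\w-]*['"]\s*\+         # concatenation: 'text-' + color
        )""",
    re.VERBOSE,
)

print(bool(DYNAMIC_CLASS.search('className={`bg-${color}-500`}')))  # True -> FAIL
print(bool(DYNAMIC_CLASS.search('className={on ? "bg-red-500" : "bg-blue-500"}')))  # False -> PASS
```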
evals/tailwind/tailwind_postcss_config.yaml ADDED
@@ -0,0 +1,24 @@
+ eval_id: nextjs_tailwind_postcss_config
+ name: "Tailwind CSS PostCSS Configuration"
+ description: "Verify the agent correctly diagnoses and fixes missing PostCSS configuration for Tailwind CSS"
+ category: tailwind
+
+ criteria: |
+   Check Tailwind CSS v3 PostCSS configuration (traditional setup).
+
+   IMPORTANT: This eval checks for Tailwind CSS v3 configuration ONLY.
+   If the project uses Tailwind v4 (indicated by @import "tailwindcss" or @tailwindcss/postcss),
+   score -1.0 (N/A) since v4 has different requirements.
+
+   REQUIREMENTS FOR V3:
+   1. postcss.config.js/mjs exists
+   2. Contains "tailwindcss" plugin (not @tailwindcss/postcss)
+   3. Contains autoprefixer plugin
+   4. tailwind.config.js exists with content paths
+   5. globals.css has @tailwind directives (not @import)
+
+   SCORING:
+   - If using Tailwind v4 syntax (@import "tailwindcss" OR @tailwindcss/postcss): score -1.0 (N/A)
+   - If no Tailwind at all: score -1.0 (N/A)
+   - If all v3 requirements met: score 1.0 (PASS)
+   - If v3 config incomplete/incorrect: score 0.0 (FAIL)
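The N/A-first branching above matters: v4 markers short-circuit before any v3 requirement is inspected. A minimal sketch of that decision order, with a hypothetical helper and simplified string inputs:

```python
# Hypothetical helper showing the decision order: v4 markers -> N/A,
# no Tailwind -> N/A, then the v3 requirements collapse to pass/fail.
def score_postcss_v3(globals_css: str, postcss_config: str, has_tailwind_config: bool) -> float:
    if '@import "tailwindcss"' in globals_css or "@tailwindcss/postcss" in postcss_config:
        return -1.0   # Tailwind v4 project: out of scope for this v3-only eval
    if "@tailwind" not in globals_css and "tailwindcss" not in postcss_config:
        return -1.0   # no Tailwind at all
    v3_complete = (
        "@tailwind" in globals_css
        and "tailwindcss" in postcss_config
        and "autoprefixer" in postcss_config
        and has_tailwind_config
    )
    return 1.0 if v3_complete else 0.0
```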
evals/typescript/001_unsafe_type_assertions.yaml ADDED
@@ -0,0 +1,39 @@
+ eval_id: typescript_unsafe_type_assertions_001
+ name: "Unsafe Type Assertions"
+ description: "Checks for type assertions (as) without runtime validation"
+ category: typescript
+
+ criteria: |
+   Detect type assertions that bypass type checking without validation.
+
+   ANTI-PATTERN:
+   - const user = data as User // no validation
+   - response as MyType // assuming type without checking
+   - (value as any) to bypass errors
+   - Using 'as' without runtime validation
+
+   WHY IT'S WRONG:
+   - Type assertions don't validate at runtime
+   - Causes crashes when data doesn't match assumed type
+   - TypeScript provides false sense of security
+   - Common LLM mistake
+
+   CORRECT:
+   - Validate before asserting: if (isUser(data)) { const user = data as User }
+   - Use type guards: function isUser(obj: any): obj is User { ... }
+   - Use zod or similar for runtime validation
+   - Avoid 'as' unless absolutely necessary
+
+   EVALUATION:
+   - Scan for 'as TypeName' assertions (excluding safe ones like 'as const')
+   - Check if there's validation before the assertion
+   - Particularly check API responses and external data
+
+   SCORING:
+   - Score 1.0 (PASS): No unsafe type assertions found, or all type assertions have proper validation
+   - Score 0.0 (FAIL): Unsafe type assertions found without runtime validation
+   - Score -1.0 (N/A): Not a TypeScript project, can't evaluate
+
+   Note: Finding ZERO type assertions is a GOOD thing and should score 1.0 (PASS).
+   The absence of unsafe patterns means the code is using proper TypeScript without bypassing type safety.
+
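The EVALUATION section describes a scan for `as TypeName` that excludes safe forms. A hypothetical regex approximation of that first step (the validation check around each hit would still be needed):

```python
# Hypothetical heuristic: match `as SomeType` / `as any`, skip `as const`
# and `as unknown` (both are safe widening/narrowing, not claims about data).
import re

ASSERTION = re.compile(r"\bas\s+(?:any\b|[A-Z]\w*)")

code = "const user = data as User; const cfg = opts as const; const x = y as unknown;"
print(ASSERTION.findall(code))   # ['as User'] -> would need validation nearby
```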
evals/typescript/002_missing_null_checks.yaml ADDED
@@ -0,0 +1,33 @@
+ eval_id: typescript_missing_null_checks_002
+ name: "Missing Null/Undefined Checks"
+ description: "Checks for missing null/undefined checks before accessing properties"
+ category: typescript
+
+ criteria: |
+   Detect property access without null/undefined checks.
+
+   ANTI-PATTERN:
+   - user.name when user might be null/undefined
+   - data.items[0].value without checking data exists
+   - Accessing nested properties without optional chaining
+
+   WHY IT'S WRONG:
+   - Runtime error: "Cannot read property 'name' of undefined"
+   - App crashes
+   - Common with API responses, optional props
+   - TypeScript might not catch if types are wrong
+
+   CORRECT:
+   - user?.name (optional chaining)
+   - if (user) { user.name }
+   - user && user.name
+   - data?.items?.[0]?.value (nested optional chaining)
+
+   SCORING:
+   - Score 1.0 (PASS): No missing null checks found, all property accesses are safe
+   - Score 0.0 (FAIL): Missing null checks found, unsafe property access detected
+   - Score -1.0 (N/A): Not a TypeScript project, can't evaluate
+
+   Note: Finding NO missing null checks is a GOOD thing and should score 1.0 (PASS).
+   The absence of unsafe property access means the code properly handles nullability.
+
evals/vercel/001_vercel_deployment.yaml ADDED
@@ -0,0 +1,19 @@
+ eval_id: vercel_deployment_check
+ name: "Vercel Deployment"
+ description: "Check if deployed site is live and matches the codebase"
+ category: vercel
+
+ criteria: |
+   Curl {deployment_url} and verify deployment is working. Check HTTP 200, valid HTML, no error pages.
+
+   If the repository has source code, also verify the deployed content matches the codebase - compare
+   HTML against components/routes/framework markers. If repo is empty or has no code, just verify
+   the site loads successfully.
+
+   SCORING:
+   - Score 1.0 (PASS): Site loads successfully (HTTP 200) AND content matches codebase (or no code to match)
+   - Score 0.0 (FAIL): Errors, broken deployment, or content completely different from codebase
+   - Score -1.0 (N/A): No deployment URL provided
+
+ inputs:
+   deployment_url: "" # Will be provided at runtime from generation metadata
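A minimal sketch of the curl-style liveness check, using the third-party `requests` library; `check_deployment` and the error-page heuristics are assumptions, and the codebase-comparison half of the criteria is left to the judge:

```python
# Hypothetical check mirroring the criteria: HTTP 200, HTML body, no obvious
# error page. Requires the third-party `requests` package.
import requests

def check_deployment(url: str) -> float:
    if not url:
        return -1.0   # no deployment URL provided: N/A
    try:
        resp = requests.get(url, timeout=15)
    except requests.RequestException:
        return 0.0    # unreachable or broken deployment
    body = resp.text.lower()
    if resp.status_code != 200 or "<html" not in body:
        return 0.0
    if "application error" in body:   # common platform error page text
        return 0.0
    return 1.0
```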
evals/vercel/002_environment_variables_handling.yaml ADDED
@@ -0,0 +1,23 @@
+ eval_id: vercel_environment_variables
+ name: "Environment Variables Configuration"
+ description: "Verify the application properly handles environment variables for Vercel deployment"
+ category: vercel
+
+ criteria: |
+   Evaluate how the application handles environment variables in a Vercel deployment context.
+
+   Check if the app demonstrates proper env var usage:
+   - Public vars prefixed with NEXT_PUBLIC_ for client access
+   - Sensitive vars (API keys, secrets) NOT prefixed with NEXT_PUBLIC_
+   - Graceful handling of missing environment variables
+   - No hardcoded secrets in the codebase
+   - Proper use of process.env with fallbacks where appropriate
+   - .env.example or documentation showing required vars
+
+   SCORING:
+   - Score 1.0 (PASS): Environment variables properly scoped, no exposed secrets, graceful handling, clear documentation
+   - Score 0.0 (FAIL): Secrets exposed with NEXT_PUBLIC_, hardcoded API keys, app crashes when vars missing, or no documentation
+   - Score -1.0 (N/A): No environment variables used in the application
+
+ inputs:
+   deployment_url: ""
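One of these checks, secrets exposed to the client via the NEXT_PUBLIC_ prefix, lends itself to a simple scan. A hypothetical sketch; the regex and `find_exposed_secrets` are assumptions:

```python
# Hypothetical scan for secret-sounding names exposed to the client bundle.
import re
from pathlib import Path

EXPOSED = re.compile(r"NEXT_PUBLIC_\w*(?:SECRET|PRIVATE|SERVICE_ROLE|API_KEY)\w*")

def find_exposed_secrets(project: Path) -> set[str]:
    hits: set[str] = set()
    for src in project.rglob("*.ts*"):   # .ts and .tsx sources
        hits.update(EXPOSED.findall(src.read_text(errors="ignore")))
    return hits   # non-empty matches the FAIL condition above
```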
evals/vercel/003_seo_metadata.yaml ADDED
@@ -0,0 +1,33 @@
+ eval_id: vercel_seo_metadata
+ name: "SEO and Metadata Configuration"
+ description: "Verify proper SEO metadata and Open Graph tags for production deployment"
+ category: vercel
+
+ criteria: |
+   Evaluate the application's SEO and social sharing metadata. Use a balanced scoring approach where having most SEO elements is sufficient for a passing score.
+
+   Check for these elements (award points for presence):
+   - Page titles using Metadata API or next/head [High priority]
+   - Meta descriptions for key pages [High priority]
+   - Open Graph tags (og:title, og:description, og:image) [Medium priority]
+   - Twitter Card tags for social sharing [Low priority]
+   - Canonical URLs to prevent duplicate content [Medium priority]
+   - Structured data (JSON-LD) [Low priority]
+   - Robots.txt and sitemap.xml generation [Medium priority]
+   - Alt text on images for accessibility [High priority]
+
+   Mark as PASS if the app has MOST of the good stuff:
+   - Has at least 4-5 of the 8 elements above (simple majority)
+   - Shows clear SEO effort (titles and descriptions present)
+   - Missing a few low-priority items is acceptable
+   - The positives clearly outweigh any gaps
+
+   Mark as FAIL only if severely lacking:
+   - Missing most high-priority items (no titles AND no descriptions)
+   - Has fewer than 3 total SEO elements
+   - Shows no SEO consideration whatsoever
+
+   When in doubt, favor PASS if reasonable SEO effort is evident.
+
+ inputs:
+   deployment_url: ""
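The majority rule above is mechanical enough to sketch: count which of the eight elements appear in the rendered HTML and pass on four or more. The marker strings are naive substring tests, `score_seo` is hypothetical, and `requests` is required:

```python
# Hypothetical substring markers for the eight elements listed above.
import requests

MARKERS = {
    "title": "<title",
    "description": 'name="description"',
    "open_graph": 'property="og:',
    "twitter_card": 'name="twitter:',
    "canonical": 'rel="canonical"',
    "json_ld": "application/ld+json",
    "sitemap_link": 'rel="sitemap"',
    "alt_text": "alt=",
}

def score_seo(url: str) -> float:
    html = requests.get(url, timeout=15).text.lower()
    found = [name for name, needle in MARKERS.items() if needle in html]
    if len(found) >= 4:       # simple majority of the eight: PASS
        return 1.0
    if len(found) < 3:        # severely lacking: FAIL
        return 0.0
    return 1.0                # borderline: "when in doubt, favor PASS"
```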
microevals/__init__.py ADDED
@@ -0,0 +1,34 @@
+ """
+ MicroEvals - A lightweight framework for evaluating code against specific criteria.
+
+ This package provides tools to run automated evaluations on codebases created by agents.
+ """
+
+ __version__ = "0.1.0"
+
+ from .eval_registry import EvalRegistry
+ from .utils import (
+     load_source,
+     clone_repo,
+     prepare_repo,
+     build_prompt,
+     run_eval,
+     run_batch_eval,
+     read_result,
+     save_results,
+     safe_cleanup_temp_dir
+ )
+
+ __all__ = [
+     'EvalRegistry',
+     'load_source',
+     'clone_repo',
+     'prepare_repo',
+     'build_prompt',
+     'run_eval',
+     'run_batch_eval',
+     'read_result',
+     'save_results',
+     'safe_cleanup_temp_dir'
+ ]
+
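For orientation, the re-exported surface in use. `EvalRegistry` is defined in eval_registry.py (next file); the utils helpers' signatures live in microevals/utils.py and are not reproduced in this section, so only the registry calls below are grounded:

```python
from microevals import EvalRegistry

registry = EvalRegistry()   # discovers every YAML under the bundled evals/ tree
registry.print_summary()    # per-category listing with requirement tags
```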
microevals/eval_registry.py ADDED
@@ -0,0 +1,222 @@
+ """
+ Eval Registry - Central catalog of all evaluations with applicability logic.
+
+ This module discovers all eval files and provides metadata about each eval,
+ including its category, dependencies, and applicability conditions.
+ """
+
+ from pathlib import Path
+ from typing import List, Dict, Any
+ import yaml
+ import sys
+
+
+ class EvalRegistry:
+     """Registry of all available evaluations."""
+
+     def __init__(self, evals_dir: str = None):
+         if evals_dir is None:
+             # Try to find evals relative to package installation
+             try:
+                 # Use importlib.resources for Python 3.9+ or fall back to __file__
+                 if sys.version_info >= (3, 9):
+                     from importlib.resources import files
+                     evals_dir = str(files('microevals').parent / 'evals')
+                 else:
+                     # Fallback for older Python versions
+                     evals_dir = str(Path(__file__).parent.parent / 'evals')
+             except:
+                 # Final fallback - look relative to this file
+                 evals_dir = str(Path(__file__).parent.parent / 'evals')
+         self.evals_dir = Path(evals_dir)
+         self.evals = self._discover_evals()
+
+     def _discover_evals(self) -> List[Dict[str, Any]]:
+         """Discover all eval YAML files and load their metadata."""
+         if not self.evals_dir.exists():
+             raise ValueError(f"Evals directory '{self.evals_dir}' not found")
+
+         evals = []
+         yaml_files = sorted(list(self.evals_dir.rglob("*.yaml")) + list(self.evals_dir.rglob("*.yml")))
+
+         for yaml_file in yaml_files:
+             with open(yaml_file, 'r') as f:
+                 eval_data = yaml.safe_load(f)
+
+             # Extract metadata
+             eval_info = {
+                 "path": str(yaml_file),
+                 "relative_path": str(yaml_file.relative_to(self.evals_dir)),
+                 "eval_id": eval_data.get("eval_id", "unknown"),
+                 "name": eval_data.get("name", ""),
+                 "category": eval_data.get("category", "general"),
+                 "description": eval_data.get("description", ""),
+                 "inputs": eval_data.get("inputs", {}),
+
+                 # Applicability metadata
+                 "requires": self._extract_requirements(eval_data),
+                 "keywords": self._extract_keywords(eval_data),
+             }
+
+             evals.append(eval_info)
+
+         return evals
+
+     def _extract_requirements(self, eval_data: Dict[str, Any]) -> List[str]:
+         """Extract technology/feature requirements from eval metadata."""
+         requirements = []
+
+         # Category-based requirements
+         category = eval_data.get("category", "").lower()
+         if category == "nextjs":
+             requirements.append("nextjs")
+         elif category == "supabase":
+             requirements.append("supabase")
+         elif category == "react":
+             requirements.append("react")
+         elif category == "tailwind":
+             requirements.append("tailwind")
+         elif category == "vercel":
+             requirements.append("vercel")
+         elif category == "shadcn":
+             requirements.extend(["shadcn", "react", "tailwind"])
+
+         # Explicit requirements from inputs
+         inputs = eval_data.get("inputs", {})
+         if "supabase_url" in inputs or "supabase_anon_key" in inputs:
+             requirements.append("supabase")
+
+         # Keyword-based requirements from eval_id and name
+         eval_id = eval_data.get("eval_id", "").lower()
+         name = eval_data.get("name", "").lower()
+
+         if "server" in eval_id or "server" in name:
+             requirements.append("server-component")
+         if "client" in eval_id or "client" in name:
+             requirements.append("client-component")
+         if "action" in eval_id or "action" in name:
+             requirements.append("server-action")
+         if "cookie" in eval_id or "cookie" in name:
+             requirements.append("cookies")
+         if "auth" in eval_id or "auth" in name:
+             requirements.append("authentication")
+         if "zustand" in eval_id or "zustand" in name:
+             requirements.append("zustand")
+         if "metadata" in eval_id or "metadata" in name:
+             requirements.append("metadata")
+         if "route" in eval_id or "route" in name:
+             requirements.append("routing")
+         if "middleware" in eval_id or "middleware" in name:
+             requirements.append("middleware")
+
+         return list(set(requirements))
+
+     def _extract_keywords(self, eval_data: Dict[str, Any]) -> List[str]:
+         """Extract keywords from eval for matching."""
+         keywords = []
+
+         # From description and name
+         description = eval_data.get("description", "").lower()
+         name = eval_data.get("name", "").lower()
+
+         # Handle criteria - it might be a string or list
+         criteria = eval_data.get("criteria", "")
+         if isinstance(criteria, str):
+             criteria = criteria.lower()
+         else:
+             criteria = ""
+
+         # Common keywords to extract
+         all_text = f"{description} {name} {criteria}"
+
+         keyword_list = [
+             "fetch", "api", "database", "auth", "login", "signup",
+             "cookie", "session", "server", "client", "component",
+             "action", "form", "upload", "image", "file",
+             "route", "router", "navigation", "link",
+             "metadata", "seo", "og", "opengraph",
+             "parallel", "intercepting", "loading", "error",
+             "middleware", "revalidate", "cache", "streaming",
+             "tailwind", "styling", "css", "ui", "shadcn",
+             "supabase", "nextjs", "react", "vercel",
+             "zustand", "state", "store", "hook"
+         ]
+
+         for keyword in keyword_list:
+             if keyword in all_text:
+                 keywords.append(keyword)
+
+         return list(set(keywords))
+
+     def get_all(self) -> List[Dict[str, Any]]:
+         """Get all registered evals."""
+         return self.evals
+
+     def get_by_category(self, category: str) -> List[Dict[str, Any]]:
+         """Get evals by category."""
+         return [e for e in self.evals if e["category"] == category]
+
+     def get_by_id(self, eval_id: str) -> Dict[str, Any]:
+         """Get eval by ID."""
+         for e in self.evals:
+             if e["eval_id"] == eval_id:
+                 return e
+         raise ValueError(f"Eval with ID '{eval_id}' not found")
+
+     def filter_by_requirements(self, requirements: List[str]) -> List[Dict[str, Any]]:
+         """Filter evals that match any of the given requirements."""
+         matching = []
+         for e in self.evals:
+             if any(req in e["requires"] for req in requirements):
+                 matching.append(e)
+         return matching
+
+     def print_summary(self):
+         """Print a summary of all registered evals."""
+         print(f"\n{'='*80}")
+         print(f"EVAL REGISTRY SUMMARY")
+         print(f"{'='*80}\n")
+         print(f"Total evals: {len(self.evals)}\n")
+
+         # Group by category
+         categories = {}
+         for e in self.evals:
+             cat = e["category"]
+             if cat not in categories:
+                 categories[cat] = []
+             categories[cat].append(e)
+
+         for category, evals in sorted(categories.items()):
+             print(f"{category.upper()} ({len(evals)} evals)")
+             for e in evals:
+                 reqs = ", ".join(e["requires"]) if e["requires"] else "general"
+                 print(f" - {e['eval_id']}: {e['name']}")
+                 print(f" Requirements: {reqs}")
+             print()
+
+
+ def main():
+     """CLI for exploring the eval registry."""
+     import argparse
+
+     parser = argparse.ArgumentParser(description="Explore the eval registry")
+     parser.add_argument("--category", help="Filter by category")
+     parser.add_argument("--list", action="store_true", help="List all evals")
+     args = parser.parse_args()
+
+     registry = EvalRegistry()
+
+     if args.list:
+         registry.print_summary()
+     elif args.category:
+         evals = registry.get_by_category(args.category)
+         print(f"\nEvals in category '{args.category}':")
+         for e in evals:
+             print(f" - {e['eval_id']}: {e['name']}")
+     else:
+         registry.print_summary()
+
+
+ if __name__ == "__main__":
+     main()
+
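A short usage sketch of the registry API defined above. The requirement tags ("supabase", "authentication", "tailwind") are the literal values produced by _extract_requirements; only the surrounding script is hypothetical:

```python
from microevals import EvalRegistry

registry = EvalRegistry()
print(f"{len(registry.get_all())} evals discovered")

for e in registry.get_by_category("tailwind"):
    print(e["eval_id"], "->", e["relative_path"])

# Evals applicable to a Supabase project with authentication:
for e in registry.filter_by_requirements(["supabase", "authentication"]):
    print(e["eval_id"])
```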