microevals-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- config/judge_system_prompt.yaml +113 -0
- evals/nextjs/001-server-component.yaml +28 -0
- evals/nextjs/002-client-component.yaml +26 -0
- evals/nextjs/003-cookies.yaml +28 -0
- evals/nextjs/010-route-handlers.yaml +30 -0
- evals/nextjs/013-pathname-server.yaml +29 -0
- evals/nextjs/014-server-routing.yaml +28 -0
- evals/nextjs/018-use-router.yaml +28 -0
- evals/nextjs/020_no_use_effect.yaml +30 -0
- evals/nextjs/021-avoid-fetch-in-effect.yaml +28 -0
- evals/nextjs/022_prefer_server_actions.yaml +29 -0
- evals/nextjs/023_avoid_getserversideprops.yaml +27 -0
- evals/nextjs/024_avoid_redundant_usestate.yaml +29 -0
- evals/nextjs/025_no_async_client_components.yaml +29 -0
- evals/nextjs/026_no_serial_await.yaml +26 -0
- evals/nextjs/027-prefer-next-image.yaml +30 -0
- evals/nextjs/027_no_hooks_in_server_components.yaml +29 -0
- evals/nextjs/028-prefer-next-font.yaml +30 -0
- evals/nextjs/028_cookies_headers_context.yaml +29 -0
- evals/nextjs/029_no_catch_redirect.yaml +31 -0
- evals/nextjs/030_app_router_migration.yaml +30 -0
- evals/nextjs/031_no_non_serializable_props.yaml +31 -0
- evals/react/001_missing_useeffect_dependencies.yaml +29 -0
- evals/react/002_incorrect_event_handler.yaml +28 -0
- evals/react/003_missing_return_in_map.yaml +28 -0
- evals/react/004_async_useeffect.yaml +32 -0
- evals/react/005_direct_state_mutation.yaml +30 -0
- evals/react/006_index_as_key.yaml +31 -0
- evals/react/zustand_store_usage.yaml +25 -0
- evals/shadcn/001_cn_utility_function.yaml +31 -0
- evals/shadcn/002_css_variables.yaml +32 -0
- evals/shadcn/003_component_dependencies.yaml +33 -0
- evals/shadcn/004_path_aliases.yaml +32 -0
- evals/shadcn/005_client_directive.yaml +31 -0
- evals/shadcn/006_tailwind_config.yaml +36 -0
- evals/shadcn/007_components_json_config.yaml +35 -0
- evals/supabase/001_client_setup.yaml +47 -0
- evals/supabase/002_auth_context_setup.yaml +43 -0
- evals/supabase/003_auth_flow_implementation.yaml +46 -0
- evals/supabase/004_auth_flow_testing_WIP.yaml +52 -0
- evals/supabase/005_auth_google_oauth.yaml +55 -0
- evals/supabase/007_storage_client_setup.yaml +43 -0
- evals/supabase/008_storage_nextjs_config.yaml +45 -0
- evals/supabase/009_storage_image_upload.yaml +49 -0
- evals/supabase/010_security_rls_enabled.yaml +42 -0
- evals/supabase/011_security_rls_policies.yaml +43 -0
- evals/supabase/012_security_no_service_key_exposed.yaml +49 -0
- evals/supabase/013_database_read_data.yaml +44 -0
- evals/supabase/014_database_create_data.yaml +44 -0
- evals/supabase/015_database_update_data.yaml +47 -0
- evals/supabase/016_database_delete_data.yaml +47 -0
- evals/supabase/017_database_user_scoped_query.yaml +52 -0
- evals/tailwind/001_tailwind_v4_config.yaml +22 -0
- evals/tailwind/002_content_paths.yaml +27 -0
- evals/tailwind/003_no_dynamic_class_construction.yaml +28 -0
- evals/tailwind/tailwind_postcss_config.yaml +24 -0
- evals/typescript/001_unsafe_type_assertions.yaml +39 -0
- evals/typescript/002_missing_null_checks.yaml +33 -0
- evals/vercel/001_vercel_deployment.yaml +19 -0
- evals/vercel/002_environment_variables_handling.yaml +23 -0
- evals/vercel/003_seo_metadata.yaml +33 -0
- microevals/__init__.py +34 -0
- microevals/eval_registry.py +222 -0
- microevals/eval_runner.py +533 -0
- microevals/utils.py +490 -0
- microevals-0.1.0.dist-info/METADATA +575 -0
- microevals-0.1.0.dist-info/RECORD +71 -0
- microevals-0.1.0.dist-info/WHEEL +5 -0
- microevals-0.1.0.dist-info/entry_points.txt +2 -0
- microevals-0.1.0.dist-info/licenses/LICENSE +21 -0
- microevals-0.1.0.dist-info/top_level.txt +1 -0
evals/tailwind/001_tailwind_v4_config.yaml
ADDED
@@ -0,0 +1,22 @@
eval_id: nextjs_tailwind_v4_config
name: "Tailwind CSS v4 Configuration"
description: "Verify the agent correctly recognizes and configures Tailwind CSS v4 syntax"
category: tailwind

criteria: |
  Check Tailwind CSS v4 configuration.

  REQUIREMENTS:
  1. globals.css contains: @import "tailwindcss";
  2. globals.css does NOT contain old v3 directives (@tailwind base/components/utilities)
  3. postcss.config has: plugins: ["@tailwindcss/postcss"]
  4. package.json has: "tailwindcss": "^4"

  EVALUATION:
  - If no Tailwind v4: score -1.0 (N/A)
  - If all requirements met: score 1.0
  - If using old v3 syntax: score 0.0

# Optional: Custom inputs for this specific eval
inputs:
  deployment_url: "https://agentic-9d1c6c3d.vercel.app/"
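For reference, the PostCSS side of the v4 setup this eval looks for can be as small as the sketch below; the file name and layout are assumptions, not content shipped in this package, and the CSS side is just the @import line from requirement 1.

// postcss.config.mjs - illustrative Tailwind v4 config matching requirement 3 above
export default {
  plugins: ["@tailwindcss/postcss"],
};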
evals/tailwind/002_content_paths.yaml
ADDED
@@ -0,0 +1,27 @@
eval_id: tailwind_content_paths_002
name: "Correct Content Paths in Tailwind Config"
description: "Checks if tailwind.config content paths include all files using Tailwind classes"
category: tailwind

criteria: |
  Verify content paths in tailwind.config include all files with Tailwind classes.

  ANTI-PATTERN:
  - content: ['./app/**/*.{js,ts,jsx,tsx}'] but components are in /components
  - Missing file extensions (.mdx, .vue, etc.)
  - Hardcoded paths that don't match actual file structure

  WHY IT'S WRONG:
  - Classes in files not matched by content paths get purged in production
  - Styles work in dev but break in production build
  - Silent failure - no error, just missing styles

  CORRECT:
  - content: ['./app/**/*.{js,ts,jsx,tsx,mdx}', './components/**/*.{js,ts,jsx,tsx}']
  - Include all directories and extensions where Tailwind is used

  SCORING:
  - Score 1.0 (PASS): Tailwind config exists AND content paths correctly include all files with Tailwind classes
  - Score 0.0 (FAIL): Tailwind config exists BUT files with classes are outside content paths (anti-pattern)
  - Score -1.0 (N/A): No Tailwind config found in project
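A minimal tailwind.config.ts sketch of the CORRECT pattern described above; the directory layout is assumed for illustration and is not taken from this package.

// tailwind.config.ts - every directory that uses Tailwind classes is listed,
// so none of those classes get purged from the production build
import type { Config } from "tailwindcss";

const config: Config = {
  content: [
    "./app/**/*.{js,ts,jsx,tsx,mdx}",
    "./components/**/*.{js,ts,jsx,tsx}",
  ],
  theme: { extend: {} },
  plugins: [],
};

export default config;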
evals/tailwind/003_no_dynamic_class_construction.yaml
ADDED
@@ -0,0 +1,28 @@
eval_id: tailwind_no_dynamic_class_construction_003
name: "No Dynamic Tailwind Class Construction"
description: "Checks for dynamic class string construction that breaks Tailwind's purging/scanning"
category: tailwind

criteria: |
  Detect dynamic class string construction that breaks Tailwind purging.

  ANTI-PATTERN:
  - className={'text-' + color} or className={`bg-${color}-500`}
  - Building class names from variables/props
  - Concatenating partial class names

  WHY IT'S WRONG:
  - Tailwind scans for complete class strings at build time
  - Dynamically constructed classes get purged
  - Works in dev, breaks in production

  CORRECT:
  - Complete class strings: className={color === 'red' ? 'text-red-500' : 'text-blue-500'}
  - Use safelist in config for truly dynamic classes
  - Or use inline styles for dynamic values

  SCORING:
  - Score 1.0 (PASS): No dynamic class construction found - all Tailwind classes are complete strings
  - Score 0.0 (FAIL): Dynamic class construction found (template literals or concatenation building classes)
  - Score -1.0 (N/A): No Tailwind usage found in project
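An illustrative TSX sketch of the anti-pattern and its fix; the component and prop names are hypothetical.

// Anti-pattern: Tailwind never sees the literal "text-red-500", so it is purged.
function BadLabel({ color }: { color: string }) {
  return <span className={`text-${color}-500`}>label</span>;
}

// Correct: complete class strings the build-time scanner can find.
function GoodLabel({ color }: { color: "red" | "blue" }) {
  return (
    <span className={color === "red" ? "text-red-500" : "text-blue-500"}>
      label
    </span>
  );
}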
evals/tailwind/tailwind_postcss_config.yaml
ADDED
@@ -0,0 +1,24 @@
eval_id: nextjs_tailwind_postcss_config
name: "Tailwind CSS PostCSS Configuration"
description: "Verify the agent correctly diagnoses and fixes missing PostCSS configuration for Tailwind CSS"
category: tailwind

criteria: |
  Check Tailwind CSS v3 PostCSS configuration (traditional setup).

  IMPORTANT: This eval checks for Tailwind CSS v3 configuration ONLY.
  If the project uses Tailwind v4 (indicated by @import "tailwindcss" or @tailwindcss/postcss),
  score -1.0 (N/A) since v4 has different requirements.

  REQUIREMENTS FOR V3:
  1. postcss.config.js/mjs exists
  2. Contains "tailwindcss" plugin (not @tailwindcss/postcss)
  3. Contains autoprefixer plugin
  4. tailwind.config.js exists with content paths
  5. globals.css has @tailwind directives (not @import)

  SCORING:
  - If using Tailwind v4 syntax (@import "tailwindcss" OR @tailwindcss/postcss): score -1.0 (N/A)
  - If no Tailwind at all: score -1.0 (N/A)
  - If all v3 requirements met: score 1.0 (PASS)
  - If v3 config incomplete/incorrect: score 0.0 (FAIL)
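For contrast with the v4 eval earlier in this package, a hypothetical v3-style PostCSS config that would satisfy requirements 1-3 above; globals.css would carry the three @tailwind directives.

// postcss.config.js - traditional Tailwind v3 + autoprefixer setup (illustrative)
module.exports = {
  plugins: {
    tailwindcss: {},
    autoprefixer: {},
  },
};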
evals/typescript/001_unsafe_type_assertions.yaml
ADDED
@@ -0,0 +1,39 @@
eval_id: typescript_unsafe_type_assertions_001
name: "Unsafe Type Assertions"
description: "Checks for type assertions (as) without runtime validation"
category: typescript

criteria: |
  Detect type assertions that bypass type checking without validation.

  ANTI-PATTERN:
  - const user = data as User // no validation
  - response as MyType // assuming type without checking
  - (value as any) to bypass errors
  - Using 'as' without runtime validation

  WHY IT'S WRONG:
  - Type assertions don't validate at runtime
  - Causes crashes when data doesn't match assumed type
  - TypeScript provides false sense of security
  - Common LLM mistake

  CORRECT:
  - Validate before asserting: if (isUser(data)) { const user = data as User }
  - Use type guards: function isUser(obj: any): obj is User { ... }
  - Use zod or similar for runtime validation
  - Avoid 'as' unless absolutely necessary

  EVALUATION:
  - Scan for 'as TypeName' assertions (excluding safe ones like 'as const')
  - Check if there's validation before the assertion
  - Particularly check API responses and external data

  SCORING:
  - Score 1.0 (PASS): No unsafe type assertions found, or all type assertions have proper validation
  - Score 0.0 (FAIL): Unsafe type assertions found without runtime validation
  - Score -1.0 (N/A): Not a TypeScript project, can't evaluate

  Note: Finding ZERO type assertions is a GOOD thing and should score 1.0 (PASS).
  The absence of unsafe patterns means the code is using proper TypeScript without bypassing type safety.
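A TypeScript sketch of the validated pattern this eval rewards; the User shape, the isUser guard, and the /api/user endpoint are assumptions for illustration only.

interface User {
  id: string;
  name: string;
}

// Type guard: a runtime check that narrows `unknown` to `User`.
function isUser(value: unknown): value is User {
  if (typeof value !== "object" || value === null) return false;
  const candidate = value as Record<string, unknown>; // checked field by field below
  return typeof candidate.id === "string" && typeof candidate.name === "string";
}

async function loadUser(): Promise<User> {
  const data: unknown = await fetch("/api/user").then((res) => res.json());
  if (!isUser(data)) {
    throw new Error("Unexpected /api/user response shape");
  }
  return data; // already narrowed to User - no bare `as User` needed
}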
evals/typescript/002_missing_null_checks.yaml
ADDED
@@ -0,0 +1,33 @@
eval_id: typescript_missing_null_checks_002
name: "Missing Null/Undefined Checks"
description: "Checks for missing null/undefined checks before accessing properties"
category: typescript

criteria: |
  Detect property access without null/undefined checks.

  ANTI-PATTERN:
  - user.name when user might be null/undefined
  - data.items[0].value without checking data exists
  - Accessing nested properties without optional chaining

  WHY IT'S WRONG:
  - Runtime error: "Cannot read property 'name' of undefined"
  - App crashes
  - Common with API responses, optional props
  - TypeScript might not catch if types are wrong

  CORRECT:
  - user?.name (optional chaining)
  - if (user) { user.name }
  - user && user.name
  - data?.items?.[0]?.value (nested optional chaining)

  SCORING:
  - Score 1.0 (PASS): No missing null checks found, all property accesses are safe
  - Score 0.0 (FAIL): Missing null checks found, unsafe property access detected
  - Score -1.0 (N/A): Not a TypeScript project, can't evaluate

  Note: Finding NO missing null checks is a GOOD thing and should score 1.0 (PASS).
  The absence of unsafe property access means the code properly handles nullability.
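A short TypeScript sketch of the safe access patterns listed above; the Order shape is hypothetical.

interface Order {
  items?: { value: number }[];
}

function firstItemValue(order: Order | null | undefined): number {
  // Optional chaining short-circuits to undefined instead of throwing
  // "Cannot read properties of undefined", and ?? supplies a fallback value.
  return order?.items?.[0]?.value ?? 0;
}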
evals/vercel/001_vercel_deployment.yaml
ADDED
@@ -0,0 +1,19 @@
eval_id: vercel_deployment_check
name: "Vercel Deployment"
description: "Check if deployed site is live and matches the codebase"
category: vercel

criteria: |
  Curl {deployment_url} and verify deployment is working. Check HTTP 200, valid HTML, no error pages.

  If the repository has source code, also verify the deployed content matches the codebase - compare
  HTML against components/routes/framework markers. If repo is empty or has no code, just verify
  the site loads successfully.

  SCORING:
  - Score 1.0 (PASS): Site loads successfully (HTTP 200) AND content matches codebase (or no code to match)
  - Score 0.0 (FAIL): Errors, broken deployment, or content completely different from codebase
  - Score -1.0 (N/A): No deployment URL provided

inputs:
  deployment_url: "" # Will be provided at runtime from generation metadata
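Expressed as code, the check the judge is asked to perform amounts to something like the following sketch; the judge actually uses curl, and this function is purely illustrative.

async function checkDeployment(deploymentUrl: string): Promise<boolean> {
  const res = await fetch(deploymentUrl);
  if (res.status !== 200) return false;        // error page or broken deployment
  const html = await res.text();
  return html.toLowerCase().includes("<html"); // crude "valid HTML" sanity check
}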
evals/vercel/002_environment_variables_handling.yaml
ADDED
@@ -0,0 +1,23 @@
eval_id: vercel_environment_variables
name: "Environment Variables Configuration"
description: "Verify the application properly handles environment variables for Vercel deployment"
category: vercel

criteria: |
  Evaluate how the application handles environment variables in a Vercel deployment context.

  Check if the app demonstrates proper env var usage:
  - Public vars prefixed with NEXT_PUBLIC_ for client access
  - Sensitive vars (API keys, secrets) NOT prefixed with NEXT_PUBLIC_
  - Graceful handling of missing environment variables
  - No hardcoded secrets in the codebase
  - Proper use of process.env with fallbacks where appropriate
  - .env.example or documentation showing required vars

  SCORING:
  - Score 1.0 (PASS): Environment variables properly scoped, no exposed secrets, graceful handling, clear documentation
  - Score 0.0 (FAIL): Secrets exposed with NEXT_PUBLIC_, hardcoded API keys, app crashes when vars missing, or no documentation
  - Score -1.0 (N/A): No environment variables used in the application

inputs:
  deployment_url: ""
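An illustrative TypeScript module showing the scoping rules above; the variable names, file path, and .env.example reference are assumptions, not content from this package.

// lib/env.ts - hypothetical example of properly scoped environment variables
// Safe for the browser: only NEXT_PUBLIC_* values are inlined into client bundles.
export const siteUrl =
  process.env.NEXT_PUBLIC_SITE_URL ?? "http://localhost:3000";

// Server-only secret: no NEXT_PUBLIC_ prefix, so it never ships to the client.
export function getPaymentApiKey(): string {
  const key = process.env.PAYMENT_API_KEY;
  if (!key) {
    // Fail loudly with guidance instead of crashing deep inside a request handler.
    throw new Error("PAYMENT_API_KEY is not set - see .env.example");
  }
  return key;
}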
evals/vercel/003_seo_metadata.yaml
ADDED
@@ -0,0 +1,33 @@
eval_id: vercel_seo_metadata
name: "SEO and Metadata Configuration"
description: "Verify proper SEO metadata and Open Graph tags for production deployment"
category: vercel

criteria: |
  Evaluate the application's SEO and social sharing metadata. Use a balanced scoring approach where having most SEO elements is sufficient for a passing score.

  Check for these elements (award points for presence):
  - Page titles using Metadata API or next/head [High priority]
  - Meta descriptions for key pages [High priority]
  - Open Graph tags (og:title, og:description, og:image) [Medium priority]
  - Twitter Card tags for social sharing [Low priority]
  - Canonical URLs to prevent duplicate content [Medium priority]
  - Structured data (JSON-LD) [Low priority]
  - Robots.txt and sitemap.xml generation [Medium priority]
  - Alt text on images for accessibility [High priority]

  Mark as PASS if the app has MOST of the good stuff:
  - Has at least 4-5 of the 8 elements above (simple majority)
  - Shows clear SEO effort (titles and descriptions present)
  - Missing a few low-priority items is acceptable
  - The positives clearly outweigh any gaps

  Mark as FAIL only if severely lacking:
  - Missing most high-priority items (no titles AND no descriptions)
  - Has fewer than 3 total SEO elements
  - Shows no SEO consideration whatsoever

  When in doubt, favor PASS if reasonable SEO effort is evident.

inputs:
  deployment_url: ""
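A hypothetical app/layout.tsx metadata export covering the high- and medium-priority items above (title, description, Open Graph, canonical URL, Twitter card); the site name and URLs are placeholders.

import type { Metadata } from "next";

export const metadata: Metadata = {
  title: "Acme Store",
  description: "Hand-picked goods, shipped fast.",
  alternates: { canonical: "https://example.com" },
  openGraph: {
    title: "Acme Store",
    description: "Hand-picked goods, shipped fast.",
    images: ["/og.png"],
  },
  twitter: { card: "summary_large_image" },
};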
microevals/__init__.py
ADDED
@@ -0,0 +1,34 @@
"""
MicroEvals - A lightweight framework for evaluating code against specific criteria.

This package provides tools to run automated evaluations on codebases created by agents.
"""

__version__ = "0.1.0"

from .eval_registry import EvalRegistry
from .utils import (
    load_source,
    clone_repo,
    prepare_repo,
    build_prompt,
    run_eval,
    run_batch_eval,
    read_result,
    save_results,
    safe_cleanup_temp_dir
)

__all__ = [
    'EvalRegistry',
    'load_source',
    'clone_repo',
    'prepare_repo',
    'build_prompt',
    'run_eval',
    'run_batch_eval',
    'read_result',
    'save_results',
    'safe_cleanup_temp_dir'
]
microevals/eval_registry.py
ADDED
@@ -0,0 +1,222 @@
"""
Eval Registry - Central catalog of all evaluations with applicability logic.

This module discovers all eval files and provides metadata about each eval,
including its category, dependencies, and applicability conditions.
"""

from pathlib import Path
from typing import List, Dict, Any
import yaml
import sys


class EvalRegistry:
    """Registry of all available evaluations."""

    def __init__(self, evals_dir: str = None):
        if evals_dir is None:
            # Try to find evals relative to package installation
            try:
                # Use importlib.resources for Python 3.9+ or fall back to __file__
                if sys.version_info >= (3, 9):
                    from importlib.resources import files
                    evals_dir = str(files('microevals').parent / 'evals')
                else:
                    # Fallback for older Python versions
                    evals_dir = str(Path(__file__).parent.parent / 'evals')
            except:
                # Final fallback - look relative to this file
                evals_dir = str(Path(__file__).parent.parent / 'evals')
        self.evals_dir = Path(evals_dir)
        self.evals = self._discover_evals()

    def _discover_evals(self) -> List[Dict[str, Any]]:
        """Discover all eval YAML files and load their metadata."""
        if not self.evals_dir.exists():
            raise ValueError(f"Evals directory '{self.evals_dir}' not found")

        evals = []
        yaml_files = sorted(list(self.evals_dir.rglob("*.yaml")) + list(self.evals_dir.rglob("*.yml")))

        for yaml_file in yaml_files:
            with open(yaml_file, 'r') as f:
                eval_data = yaml.safe_load(f)

            # Extract metadata
            eval_info = {
                "path": str(yaml_file),
                "relative_path": str(yaml_file.relative_to(self.evals_dir)),
                "eval_id": eval_data.get("eval_id", "unknown"),
                "name": eval_data.get("name", ""),
                "category": eval_data.get("category", "general"),
                "description": eval_data.get("description", ""),
                "inputs": eval_data.get("inputs", {}),

                # Applicability metadata
                "requires": self._extract_requirements(eval_data),
                "keywords": self._extract_keywords(eval_data),
            }

            evals.append(eval_info)

        return evals

    def _extract_requirements(self, eval_data: Dict[str, Any]) -> List[str]:
        """Extract technology/feature requirements from eval metadata."""
        requirements = []

        # Category-based requirements
        category = eval_data.get("category", "").lower()
        if category == "nextjs":
            requirements.append("nextjs")
        elif category == "supabase":
            requirements.append("supabase")
        elif category == "react":
            requirements.append("react")
        elif category == "tailwind":
            requirements.append("tailwind")
        elif category == "vercel":
            requirements.append("vercel")
        elif category == "shadcn":
            requirements.extend(["shadcn", "react", "tailwind"])

        # Explicit requirements from inputs
        inputs = eval_data.get("inputs", {})
        if "supabase_url" in inputs or "supabase_anon_key" in inputs:
            requirements.append("supabase")

        # Keyword-based requirements from eval_id and name
        eval_id = eval_data.get("eval_id", "").lower()
        name = eval_data.get("name", "").lower()

        if "server" in eval_id or "server" in name:
            requirements.append("server-component")
        if "client" in eval_id or "client" in name:
            requirements.append("client-component")
        if "action" in eval_id or "action" in name:
            requirements.append("server-action")
        if "cookie" in eval_id or "cookie" in name:
            requirements.append("cookies")
        if "auth" in eval_id or "auth" in name:
            requirements.append("authentication")
        if "zustand" in eval_id or "zustand" in name:
            requirements.append("zustand")
        if "metadata" in eval_id or "metadata" in name:
            requirements.append("metadata")
        if "route" in eval_id or "route" in name:
            requirements.append("routing")
        if "middleware" in eval_id or "middleware" in name:
            requirements.append("middleware")

        return list(set(requirements))

    def _extract_keywords(self, eval_data: Dict[str, Any]) -> List[str]:
        """Extract keywords from eval for matching."""
        keywords = []

        # From description and name
        description = eval_data.get("description", "").lower()
        name = eval_data.get("name", "").lower()

        # Handle criteria - it might be a string or list
        criteria = eval_data.get("criteria", "")
        if isinstance(criteria, str):
            criteria = criteria.lower()
        else:
            criteria = ""

        # Common keywords to extract
        all_text = f"{description} {name} {criteria}"

        keyword_list = [
            "fetch", "api", "database", "auth", "login", "signup",
            "cookie", "session", "server", "client", "component",
            "action", "form", "upload", "image", "file",
            "route", "router", "navigation", "link",
            "metadata", "seo", "og", "opengraph",
            "parallel", "intercepting", "loading", "error",
            "middleware", "revalidate", "cache", "streaming",
            "tailwind", "styling", "css", "ui", "shadcn",
            "supabase", "nextjs", "react", "vercel",
            "zustand", "state", "store", "hook"
        ]

        for keyword in keyword_list:
            if keyword in all_text:
                keywords.append(keyword)

        return list(set(keywords))

    def get_all(self) -> List[Dict[str, Any]]:
        """Get all registered evals."""
        return self.evals

    def get_by_category(self, category: str) -> List[Dict[str, Any]]:
        """Get evals by category."""
        return [e for e in self.evals if e["category"] == category]

    def get_by_id(self, eval_id: str) -> Dict[str, Any]:
        """Get eval by ID."""
        for e in self.evals:
            if e["eval_id"] == eval_id:
                return e
        raise ValueError(f"Eval with ID '{eval_id}' not found")

    def filter_by_requirements(self, requirements: List[str]) -> List[Dict[str, Any]]:
        """Filter evals that match any of the given requirements."""
        matching = []
        for e in self.evals:
            if any(req in e["requires"] for req in requirements):
                matching.append(e)
        return matching

    def print_summary(self):
        """Print a summary of all registered evals."""
        print(f"\n{'='*80}")
        print(f"EVAL REGISTRY SUMMARY")
        print(f"{'='*80}\n")
        print(f"Total evals: {len(self.evals)}\n")

        # Group by category
        categories = {}
        for e in self.evals:
            cat = e["category"]
            if cat not in categories:
                categories[cat] = []
            categories[cat].append(e)

        for category, evals in sorted(categories.items()):
            print(f"{category.upper()} ({len(evals)} evals)")
            for e in evals:
                reqs = ", ".join(e["requires"]) if e["requires"] else "general"
                print(f"  - {e['eval_id']}: {e['name']}")
                print(f"    Requirements: {reqs}")
            print()


def main():
    """CLI for exploring the eval registry."""
    import argparse

    parser = argparse.ArgumentParser(description="Explore the eval registry")
    parser.add_argument("--category", help="Filter by category")
    parser.add_argument("--list", action="store_true", help="List all evals")
    args = parser.parse_args()

    registry = EvalRegistry()

    if args.list:
        registry.print_summary()
    elif args.category:
        evals = registry.get_by_category(args.category)
        print(f"\nEvals in category '{args.category}':")
        for e in evals:
            print(f"  - {e['eval_id']}: {e['name']}")
    else:
        registry.print_summary()


if __name__ == "__main__":
    main()