microevals 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- config/judge_system_prompt.yaml +113 -0
- evals/nextjs/001-server-component.yaml +28 -0
- evals/nextjs/002-client-component.yaml +26 -0
- evals/nextjs/003-cookies.yaml +28 -0
- evals/nextjs/010-route-handlers.yaml +30 -0
- evals/nextjs/013-pathname-server.yaml +29 -0
- evals/nextjs/014-server-routing.yaml +28 -0
- evals/nextjs/018-use-router.yaml +28 -0
- evals/nextjs/020_no_use_effect.yaml +30 -0
- evals/nextjs/021-avoid-fetch-in-effect.yaml +28 -0
- evals/nextjs/022_prefer_server_actions.yaml +29 -0
- evals/nextjs/023_avoid_getserversideprops.yaml +27 -0
- evals/nextjs/024_avoid_redundant_usestate.yaml +29 -0
- evals/nextjs/025_no_async_client_components.yaml +29 -0
- evals/nextjs/026_no_serial_await.yaml +26 -0
- evals/nextjs/027-prefer-next-image.yaml +30 -0
- evals/nextjs/027_no_hooks_in_server_components.yaml +29 -0
- evals/nextjs/028-prefer-next-font.yaml +30 -0
- evals/nextjs/028_cookies_headers_context.yaml +29 -0
- evals/nextjs/029_no_catch_redirect.yaml +31 -0
- evals/nextjs/030_app_router_migration.yaml +30 -0
- evals/nextjs/031_no_non_serializable_props.yaml +31 -0
- evals/react/001_missing_useeffect_dependencies.yaml +29 -0
- evals/react/002_incorrect_event_handler.yaml +28 -0
- evals/react/003_missing_return_in_map.yaml +28 -0
- evals/react/004_async_useeffect.yaml +32 -0
- evals/react/005_direct_state_mutation.yaml +30 -0
- evals/react/006_index_as_key.yaml +31 -0
- evals/react/zustand_store_usage.yaml +25 -0
- evals/shadcn/001_cn_utility_function.yaml +31 -0
- evals/shadcn/002_css_variables.yaml +32 -0
- evals/shadcn/003_component_dependencies.yaml +33 -0
- evals/shadcn/004_path_aliases.yaml +32 -0
- evals/shadcn/005_client_directive.yaml +31 -0
- evals/shadcn/006_tailwind_config.yaml +36 -0
- evals/shadcn/007_components_json_config.yaml +35 -0
- evals/supabase/001_client_setup.yaml +47 -0
- evals/supabase/002_auth_context_setup.yaml +43 -0
- evals/supabase/003_auth_flow_implementation.yaml +46 -0
- evals/supabase/004_auth_flow_testing_WIP.yaml +52 -0
- evals/supabase/005_auth_google_oauth.yaml +55 -0
- evals/supabase/007_storage_client_setup.yaml +43 -0
- evals/supabase/008_storage_nextjs_config.yaml +45 -0
- evals/supabase/009_storage_image_upload.yaml +49 -0
- evals/supabase/010_security_rls_enabled.yaml +42 -0
- evals/supabase/011_security_rls_policies.yaml +43 -0
- evals/supabase/012_security_no_service_key_exposed.yaml +49 -0
- evals/supabase/013_database_read_data.yaml +44 -0
- evals/supabase/014_database_create_data.yaml +44 -0
- evals/supabase/015_database_update_data.yaml +47 -0
- evals/supabase/016_database_delete_data.yaml +47 -0
- evals/supabase/017_database_user_scoped_query.yaml +52 -0
- evals/tailwind/001_tailwind_v4_config.yaml +22 -0
- evals/tailwind/002_content_paths.yaml +27 -0
- evals/tailwind/003_no_dynamic_class_construction.yaml +28 -0
- evals/tailwind/tailwind_postcss_config.yaml +24 -0
- evals/typescript/001_unsafe_type_assertions.yaml +39 -0
- evals/typescript/002_missing_null_checks.yaml +33 -0
- evals/vercel/001_vercel_deployment.yaml +19 -0
- evals/vercel/002_environment_variables_handling.yaml +23 -0
- evals/vercel/003_seo_metadata.yaml +33 -0
- microevals/__init__.py +34 -0
- microevals/eval_registry.py +222 -0
- microevals/eval_runner.py +533 -0
- microevals/utils.py +490 -0
- microevals-0.1.0.dist-info/METADATA +575 -0
- microevals-0.1.0.dist-info/RECORD +71 -0
- microevals-0.1.0.dist-info/WHEEL +5 -0
- microevals-0.1.0.dist-info/entry_points.txt +2 -0
- microevals-0.1.0.dist-info/licenses/LICENSE +21 -0
- microevals-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,575 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: microevals
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Automated evaluation framework for AI-generated code quality
|
|
5
|
+
Author-email: Design Arena <contact@designarena.ai>, Kamryn Ohly <kamryn@arcada.dev>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/Design-Arena/MicroEvals
|
|
8
|
+
Project-URL: Repository, https://github.com/Design-Arena/MicroEvals
|
|
9
|
+
Project-URL: Documentation, https://github.com/Design-Arena/MicroEvals#readme
|
|
10
|
+
Project-URL: Issues, https://github.com/Design-Arena/MicroEvals/issues
|
|
11
|
+
Keywords: evaluation,agents,ai,evals,design,benchmarks
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
20
|
+
Classifier: Operating System :: OS Independent
|
|
21
|
+
Requires-Python: >=3.8
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Requires-Dist: requests>=2.31.0
|
|
25
|
+
Requires-Dist: pyyaml>=6.0
|
|
26
|
+
Requires-Dist: python-dotenv>=1.0.0
|
|
27
|
+
Dynamic: license-file
|
|
28
|
+
|
|
29
|
+
# MicroEvals
|
|
30
|
+
|
|
31
|
+
**Automated evaluation framework for AI-generated code quality and best practices.**
|
|
32
|
+
|
|
33
|
+
MicroEvals is a collection of focused, automated tests that evaluate whether AI-generated code (or any codebase) follows framework-specific best practices and avoids common anti-patterns. Each evaluation uses Claude to analyze your codebase against specific criteria.
|
|
34
|
+
|
|
35
|
+
## What Are MicroEvals?
|
|
36
|
+
|
|
37
|
+
MicroEvals are **micro-evaluations** - small, focused tests that check for specific patterns or anti-patterns in your code. Unlike traditional linters that check syntax, MicroEvals use an LLM as a judge to understand context and evaluate architectural decisions.
|
|
38
|
+
|
|
39
|
+
**Example Use Cases:**
|
|
40
|
+
- Verify Next.js App Router best practices (server components, data fetching)
|
|
41
|
+
- Catch React anti-patterns (missing dependencies, incorrect hooks usage)
|
|
42
|
+
- Validate Supabase security (RLS policies, proper auth setup)
|
|
43
|
+
- Check TypeScript type safety (unsafe assertions, missing null checks)
|
|
44
|
+
- Ensure proper shadcn/ui integration
|
|
45
|
+
- Audit deployment configurations
|
|
46
|
+
|
|
47
|
+
## Quick Start
|
|
48
|
+
|
|
49
|
+
### Installation
|
|
50
|
+
|
|
51
|
+
#### Option 1: Install from PyPI (Recommended)
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install microevals
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
#### Option 2: Install from Source (For Development)
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
# Clone the repository
|
|
61
|
+
git clone https://github.com/Design-Arena/MicroEvals
|
|
62
|
+
cd MicroEvals
|
|
63
|
+
|
|
64
|
+
# Install in development mode
|
|
65
|
+
pip install -e .
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### Prerequisites
|
|
69
|
+
|
|
70
|
+
1. **Python 3.8+** installed
|
|
71
|
+
2. **Claude CLI** installed and authenticated:
|
|
72
|
+
```bash
|
|
73
|
+
# Install Claude CLI (if not already installed)
|
|
74
|
+
# See: https://docs.anthropic.com/en/docs/build-with-claude/cli
|
|
75
|
+
|
|
76
|
+
# Verify installation
|
|
77
|
+
claude --version
|
|
78
|
+
|
|
79
|
+
# If command not found, add the directory that contains the claude executable to your PATH:
|
|
80
|
+
export PATH="$PATH:/path/to/claude"
|
|
81
|
+
# Add the export line to your ~/.bashrc or ~/.zshrc to make it permanent
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
3. **Git** installed (for remote repositories)
|
|
85
|
+
|
|
86
|
+
### Run Your First Eval
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
# Navigate to your project
|
|
90
|
+
cd your-nextjs-app
|
|
91
|
+
|
|
92
|
+
# Run evaluations on current directory
|
|
93
|
+
microeval --category nextjs
|
|
94
|
+
|
|
95
|
+
# Check the results
|
|
96
|
+
cat results/*.json
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
**🔒 Safety Note:** When running on local directories, your code is **copied** to a temporary directory before evaluation. Your original files are **never modified or deleted**. The framework has 6 independent safety checks to prevent accidental file deletion.
|
|
100
|
+
|
|
101
|
+
### Alternative: Run Against Remote Repository
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
# Run against a GitHub repository
|
|
105
|
+
microeval --repo https://github.com/user/app --category nextjs
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## Available Eval Categories
|
|
109
|
+
|
|
110
|
+
| Category | Count | Description |
|
|
111
|
+
|----------|-------|-------------|
|
|
112
|
+
| **nextjs** | 20+ | Next.js App Router patterns, server/client components, routing |
|
|
113
|
+
| **react** | 7+ | React hooks, state management, component patterns |
|
|
114
|
+
| **supabase** | 16+ | Supabase auth, database, storage, RLS security |
|
|
115
|
+
| **tailwind** | 4+ | Tailwind CSS configuration and usage |
|
|
116
|
+
| **typescript** | 2+ | TypeScript type safety and best practices |
|
|
117
|
+
| **vercel** | 3+ | Vercel deployment and configuration |
|
|
118
|
+
| **shadcn** | 7+ | shadcn/ui component library integration |
|
|
119
|
+
|
|
120
|
+
**See all available evals:**
|
|
121
|
+
```bash
|
|
122
|
+
# List all evals (recommended)
|
|
123
|
+
microeval --list
|
|
124
|
+
|
|
125
|
+
# List evals in a specific category
|
|
126
|
+
microeval --list --category nextjs
|
|
127
|
+
|
|
128
|
+
# Or using Python module
|
|
129
|
+
python -m microevals.eval_registry --list
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
## Running Evals
|
|
133
|
+
|
|
134
|
+
### Local Directory (Recommended)
|
|
135
|
+
|
|
136
|
+
Run evaluations on your current project:
|
|
137
|
+
|
|
138
|
+
```bash
|
|
139
|
+
# Using the microeval command (recommended)
|
|
140
|
+
microeval --category nextjs
|
|
141
|
+
|
|
142
|
+
# Or using Python module directly
|
|
143
|
+
python -m microevals.eval_runner --category nextjs
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
**More examples:**
|
|
147
|
+
|
|
148
|
+
```bash
|
|
149
|
+
# Run a specific eval
|
|
150
|
+
microeval --eval evals/nextjs/001-server-component.yaml
|
|
151
|
+
|
|
152
|
+
# Run all evals
|
|
153
|
+
microeval --all
|
|
154
|
+
|
|
155
|
+
# Run with batch mode for speed
|
|
156
|
+
microeval --category nextjs --batch-size 10
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
### Remote Repository
|
|
160
|
+
|
|
161
|
+
Run evaluations against a GitHub repository:
|
|
162
|
+
|
|
163
|
+
```bash
|
|
164
|
+
# Using the microeval command
|
|
165
|
+
microeval --repo https://github.com/user/app --category nextjs
|
|
166
|
+
|
|
167
|
+
# Or using Python module directly
|
|
168
|
+
python -m microevals.eval_runner --repo https://github.com/user/app --category nextjs
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
**More examples:**
|
|
172
|
+
|
|
173
|
+
```bash
|
|
174
|
+
# Run specific eval
|
|
175
|
+
microeval --repo https://github.com/user/app --eval evals/nextjs/001-server-component.yaml
|
|
176
|
+
|
|
177
|
+
# Run all evals
|
|
178
|
+
microeval --repo https://github.com/user/app --all
|
|
179
|
+
|
|
180
|
+
# Run with batch mode
|
|
181
|
+
microeval --repo https://github.com/user/app --all --batch-size 15
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
### Specific Eval IDs
|
|
185
|
+
|
|
186
|
+
Run evaluations by their IDs:
|
|
187
|
+
|
|
188
|
+
```bash
|
|
189
|
+
# Using microeval command
|
|
190
|
+
microeval --ids nextjs_server_component_001 react_missing_useeffect_dependencies_001
|
|
191
|
+
|
|
192
|
+
# Or using Python module
|
|
193
|
+
python -m microevals.eval_runner \
|
|
194
|
+
--repo https://github.com/user/app \
|
|
195
|
+
--ids nextjs_server_component_001 react_missing_useeffect_dependencies_001
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
### Multiple Specific Evals
|
|
199
|
+
|
|
200
|
+
Run multiple specific eval files:
|
|
201
|
+
|
|
202
|
+
```bash
|
|
203
|
+
# Using microeval command
|
|
204
|
+
microeval --evals evals/nextjs/001-server-component.yaml evals/react/001_missing_useeffect_dependencies.yaml
|
|
205
|
+
|
|
206
|
+
# Or using Python module
|
|
207
|
+
python -m microevals.eval_runner \
|
|
208
|
+
--repo https://github.com/user/app \
|
|
209
|
+
--evals evals/nextjs/001-server-component.yaml evals/react/001_missing_useeffect_dependencies.yaml
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
## Advanced Usage
|
|
213
|
+
|
|
214
|
+
### Runtime Input Overrides
|
|
215
|
+
|
|
216
|
+
Override default values from eval YAML files:
|
|
217
|
+
|
|
218
|
+
```bash
|
|
219
|
+
# Using microeval command
|
|
220
|
+
microeval --eval evals/supabase/001_client_setup.yaml \
|
|
221
|
+
--input supabase_url "https://xyz.supabase.co" \
|
|
222
|
+
--input supabase_anon_key "your_key_here"
|
|
223
|
+
|
|
224
|
+
# Or using Python module
|
|
225
|
+
python -m microevals.eval_runner \
|
|
226
|
+
--repo https://github.com/user/app \
|
|
227
|
+
--eval evals/supabase/001_client_setup.yaml \
|
|
228
|
+
--input supabase_url "https://xyz.supabase.co" \
|
|
229
|
+
--input supabase_anon_key "your_key_here"
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
### Parallel Execution
|
|
233
|
+
|
|
234
|
+
Run multiple evals in parallel (faster but uses more resources):
|
|
235
|
+
|
|
236
|
+
```bash
|
|
237
|
+
# Using microeval command
|
|
238
|
+
microeval --category nextjs --parallel 3
|
|
239
|
+
|
|
240
|
+
# Or using Python module
|
|
241
|
+
python -m microevals.eval_runner \
|
|
242
|
+
--repo https://github.com/user/app \
|
|
243
|
+
--category nextjs \
|
|
244
|
+
--parallel 3
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
### Batch Mode
|
|
248
|
+
|
|
249
|
+
Run multiple evals in a single Claude session (most efficient):
|
|
250
|
+
|
|
251
|
+
```bash
|
|
252
|
+
# Using microeval command - Run 5 evals per Claude session
|
|
253
|
+
microeval --category tailwind --batch-size 5
|
|
254
|
+
|
|
255
|
+
# Run all evals in large batches
|
|
256
|
+
microeval --all --batch-size 15
|
|
257
|
+
|
|
258
|
+
# Or using Python module
|
|
259
|
+
python -m microevals.eval_runner \
|
|
260
|
+
--repo https://github.com/user/app \
|
|
261
|
+
--category tailwind \
|
|
262
|
+
--batch-size 5
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
**Batch mode benefits:**
|
|
266
|
+
- Faster execution (single context for multiple evals)
|
|
267
|
+
- More efficient Claude usage
|
|
268
|
+
- Better for related evaluations
|
|
269
|
+
|
|
270
|
+
**Preview batch prompt before running:**
|
|
271
|
+
|
|
272
|
+
```bash
|
|
273
|
+
microeval --category tailwind --batch-size 3 --print-prompt
|
|
274
|
+
|
|
275
|
+
# Or using Python module
|
|
276
|
+
python -m microevals.eval_runner \
|
|
277
|
+
--repo https://github.com/user/app \
|
|
278
|
+
--category tailwind \
|
|
279
|
+
--batch-size 3 \
|
|
280
|
+
--print-prompt
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
### Custom Timeout
|
|
284
|
+
|
|
285
|
+
Increase timeout for slower evaluations:
|
|
286
|
+
|
|
287
|
+
```bash
|
|
288
|
+
# Using microeval command
|
|
289
|
+
microeval --eval evals/nextjs/030_app_router_migration.yaml --timeout 600
|
|
290
|
+
|
|
291
|
+
# Or using Python module
|
|
292
|
+
python -m microevals.eval_runner \
|
|
293
|
+
--repo https://github.com/user/app \
|
|
294
|
+
--eval evals/nextjs/030_app_router_migration.yaml \
|
|
295
|
+
--timeout 600 # 10 minutes
|
|
296
|
+
```
|
|
297
|
+
|
|
298
|
+
### Custom Output Directory
|
|
299
|
+
|
|
300
|
+
Save results to a specific directory:
|
|
301
|
+
|
|
302
|
+
```bash
|
|
303
|
+
# Using microeval command
|
|
304
|
+
microeval --category nextjs --output-dir my_results
|
|
305
|
+
|
|
306
|
+
# Or using Python module
|
|
307
|
+
python -m microevals.eval_runner \
|
|
308
|
+
--repo https://github.com/user/app \
|
|
309
|
+
--category nextjs \
|
|
310
|
+
--output-dir my_results
|
|
311
|
+
```
|
|
312
|
+
|
|
313
|
+
## Understanding Results
|
|
314
|
+
|
|
315
|
+
### Score System
|
|
316
|
+
|
|
317
|
+
Each eval returns a score:
|
|
318
|
+
|
|
319
|
+
| Score | Status | Meaning |
|
|
320
|
+
|-------|--------|---------|
|
|
321
|
+
| **1.0** | PASS | Code follows best practices, no issues found |
|
|
322
|
+
| **0.0** | FAIL | Anti-pattern detected or criteria not met |
|
|
323
|
+
| **-1.0** | N/A | Pattern/feature not present in codebase |
|
|
324
|
+
|
|
325
|
+
### Result Output
|
|
326
|
+
|
|
327
|
+
Results are saved to `results/` as JSON files:
|
|
328
|
+
|
|
329
|
+
```json
|
|
330
|
+
{
|
|
331
|
+
"passed": true,
|
|
332
|
+
"score": 1.0,
|
|
333
|
+
"summary": "Server components properly use async/await for data fetching",
|
|
334
|
+
"evidence": [
|
|
335
|
+
"app/page.tsx:15 - Correct async server component implementation",
|
|
336
|
+
"app/posts/page.tsx:20 - Proper await on fetch and response.json()"
|
|
337
|
+
],
|
|
338
|
+
"issues": [],
|
|
339
|
+
"metadata": {
|
|
340
|
+
"eval_id": "nextjs_server_component_001",
|
|
341
|
+
"eval_name": "Server Component Data Fetching",
|
|
342
|
+
"repo_url": "https://github.com/user/app",
|
|
343
|
+
"timestamp": "2025-11-10T10:30:45",
|
|
344
|
+
"evaluator": "claude"
|
|
345
|
+
}
|
|
346
|
+
}
|
|
347
|
+
```
|
|
348
|
+
|
|
349
|
+
### Terminal Output
|
|
350
|
+
|
|
351
|
+
Live results show in terminal with color coding:
|
|
352
|
+
|
|
353
|
+
```
|
|
354
|
+
Running evaluations for: https://github.com/user/my-app
|
|
355
|
+
================================================================================
|
|
356
|
+
|
|
357
|
+
[1/5] Running 001-server-component.yaml...
|
|
358
|
+
PASS nextjs/001-server-component.yaml 12.3s
|
|
359
|
+
Server components properly use async/await for data fetching
|
|
360
|
+
|
|
361
|
+
[2/5] Running 002-client-component.yaml...
|
|
362
|
+
FAIL nextjs/002-client-component.yaml 8.7s
|
|
363
|
+
Found 'use client' components with hooks that should be server components
|
|
364
|
+
|
|
365
|
+
[3/5] Running 003-cookies.yaml...
|
|
366
|
+
N/A nextjs/003-cookies.yaml 5.2s
|
|
367
|
+
No cookie usage found in codebase
|
|
368
|
+
|
|
369
|
+
================================================================================
|
|
370
|
+
SUMMARY
|
|
371
|
+
================================================================================
|
|
372
|
+
Total evaluations: 5
|
|
373
|
+
Passed: 3
|
|
374
|
+
Failed: 1
|
|
375
|
+
Not Applicable: 1
|
|
376
|
+
Timeouts: 0
|
|
377
|
+
Errors: 0
|
|
378
|
+
Total duration: 45.2s
|
|
379
|
+
Pass rate: 75.0% (excluding N/A)
|
|
380
|
+
```
|
|
381
|
+
|
|
382
|
+
## Project Structure
|
|
383
|
+
|
|
384
|
+
```
|
|
385
|
+
MicroEvals/
|
|
386
|
+
├── microevals/ # Main package
|
|
387
|
+
│ ├── __init__.py # Package initialization
|
|
388
|
+
│ ├── eval_runner.py # Main CLI for running evals
|
|
389
|
+
│ ├── eval_registry.py # Registry and discovery of evals
|
|
390
|
+
│ └── utils.py # Utility functions
|
|
391
|
+
│
|
|
392
|
+
├── evals/ # Evaluation definitions
|
|
393
|
+
│ ├── nextjs/ # Next.js-specific evals
|
|
394
|
+
│ │ ├── 001-server-component.yaml
|
|
395
|
+
│ │ ├── 002-client-component.yaml
|
|
396
|
+
│ │ └── ...
|
|
397
|
+
│ ├── react/ # React-specific evals
|
|
398
|
+
│ ├── supabase/ # Supabase-specific evals
|
|
399
|
+
│ ├── tailwind/ # Tailwind-specific evals
|
|
400
|
+
│ ├── typescript/ # TypeScript-specific evals
|
|
401
|
+
│ ├── vercel/ # Vercel-specific evals
|
|
402
|
+
│ └── shadcn/ # shadcn/ui-specific evals
|
|
403
|
+
│
|
|
404
|
+
├── config/ # Configuration files
|
|
405
|
+
│ ├── judge_system_prompt.yaml # Claude judge prompt templates
|
|
406
|
+
│ └── example_repos.json # Example repositories
|
|
407
|
+
│
|
|
408
|
+
├── results/ # Evaluation results (auto-generated)
|
|
409
|
+
│ └── *.json # Result files
|
|
410
|
+
│
|
|
411
|
+
├── requirements.txt # Python dependencies
|
|
412
|
+
├── CONTRIBUTING.md # Contribution guidelines
|
|
413
|
+
├── LICENSE # License file
|
|
414
|
+
└── README.md # This file
|
|
415
|
+
```
|
|
416
|
+
|
|
417
|
+
## Creating Custom Evals
|
|
418
|
+
|
|
419
|
+
Want to add your own evaluations? See [CONTRIBUTING.md](CONTRIBUTING.md) for:
|
|
420
|
+
|
|
421
|
+
- Eval template and format
|
|
422
|
+
- Naming conventions
|
|
423
|
+
- Testing guidelines
|
|
424
|
+
- Submission process
|
|
425
|
+
|
|
426
|
+
**Quick template:**
|
|
427
|
+
|
|
428
|
+
```yaml
|
|
429
|
+
eval_id: category_descriptive_name_001
|
|
430
|
+
name: "Human-Readable Name"
|
|
431
|
+
description: "What this eval checks"
|
|
432
|
+
category: nextjs # or react, supabase, etc.
|
|
433
|
+
|
|
434
|
+
# Optional runtime inputs
|
|
435
|
+
inputs:
|
|
436
|
+
custom_variable: "default_value"
|
|
437
|
+
|
|
438
|
+
criteria: |
|
|
439
|
+
You have access to the entire codebase. Evaluate [what to check].
|
|
440
|
+
|
|
441
|
+
WHAT TO LOOK FOR:
|
|
442
|
+
- [Specific patterns to search for]
|
|
443
|
+
|
|
444
|
+
ANTI-PATTERN (mark as failed):
|
|
445
|
+
- [Bad pattern 1]
|
|
446
|
+
- [Bad pattern 2]
|
|
447
|
+
|
|
448
|
+
CORRECT PATTERN (mark as passed):
|
|
449
|
+
- [Good pattern 1]
|
|
450
|
+
- [Good pattern 2]
|
|
451
|
+
|
|
452
|
+
MARK AS N/A if:
|
|
453
|
+
- [Condition for not applicable]
|
|
454
|
+
|
|
455
|
+
Return JSON with: passed, score, summary, evidence, issues
|
|
456
|
+
```
|
|
457
|
+
|
|
458
|
+
## Use Cases
|
|
459
|
+
|
|
460
|
+
### 1. CI/CD Integration
|
|
461
|
+
|
|
462
|
+
Add to your CI pipeline to catch anti-patterns:
|
|
463
|
+
|
|
464
|
+
```yaml
|
|
465
|
+
# .github/workflows/evals.yml
|
|
466
|
+
name: Code Quality Evals
|
|
467
|
+
on: [push, pull_request]
|
|
468
|
+
|
|
469
|
+
jobs:
|
|
470
|
+
evals:
|
|
471
|
+
runs-on: ubuntu-latest
|
|
472
|
+
steps:
|
|
473
|
+
- uses: actions/checkout@v3
|
|
474
|
+
- name: Run MicroEvals
|
|
475
|
+
run: |
|
|
476
|
+
pip install microevals
|
|
477
|
+
python -m microevals.eval_runner \
|
|
478
|
+
--repo . \
|
|
479
|
+
--category nextjs \
|
|
480
|
+
--batch-size 10
|
|
481
|
+
```
|
|
482
|
+
|
|
483
|
+
### 2. Audit Existing Projects
|
|
484
|
+
|
|
485
|
+
Evaluate multiple repositories:
|
|
486
|
+
|
|
487
|
+
```bash
|
|
488
|
+
#!/bin/bash
|
|
489
|
+
repos=(
|
|
490
|
+
"https://github.com/org/app1"
|
|
491
|
+
"https://github.com/org/app2"
|
|
492
|
+
"https://github.com/org/app3"
|
|
493
|
+
)
|
|
494
|
+
|
|
495
|
+
for repo in "${repos[@]}"; do
|
|
496
|
+
echo "Evaluating $repo..."
|
|
497
|
+
python -m microevals.eval_runner --repo "$repo" --all --batch-size 20
|
|
498
|
+
done
|
|
499
|
+
```
|
|
500
|
+
|
|
501
|
+
### 3. Pre-deployment Checks
|
|
502
|
+
|
|
503
|
+
Validate before deploying to production:
|
|
504
|
+
|
|
505
|
+
```bash
|
|
506
|
+
# Check production-critical patterns
|
|
507
|
+
python -m microevals.eval_runner \
|
|
508
|
+
--repo https://github.com/org/production-app \
|
|
509
|
+
--category vercel \
|
|
510
|
+
--category supabase \
|
|
511
|
+
--input deployment_url "https://app.vercel.app"
|
|
512
|
+
```
|
|
513
|
+
|
|
514
|
+
## Troubleshooting
|
|
515
|
+
|
|
516
|
+
### Claude CLI Not Found
|
|
517
|
+
|
|
518
|
+
```bash
|
|
519
|
+
# Ensure Claude CLI is installed and in PATH
|
|
520
|
+
which claude
|
|
521
|
+
|
|
522
|
+
# If not installed, see: https://docs.anthropic.com/en/docs/build-with-claude/cli
|
|
523
|
+
```
|
|
524
|
+
|
|
525
|
+
### Rate Limiting
|
|
526
|
+
|
|
527
|
+
If you hit Claude rate limits:
|
|
528
|
+
|
|
529
|
+
```bash
|
|
530
|
+
# Use batch mode to reduce API calls
|
|
531
|
+
python -m microevals.eval_runner --repo URL --all --batch-size 15
|
|
532
|
+
|
|
533
|
+
# Or add delays with single eval mode (automatic 2s delay)
|
|
534
|
+
python -m microevals.eval_runner --repo URL --all --parallel 1
|
|
535
|
+
```
|
|
536
|
+
|
|
537
|
+
### Timeout Issues
|
|
538
|
+
|
|
539
|
+
For large codebases, increase timeout:
|
|
540
|
+
|
|
541
|
+
```bash
|
|
542
|
+
python -m microevals.eval_runner \
|
|
543
|
+
--repo URL \
|
|
544
|
+
--all \
|
|
545
|
+
--timeout 600 \
|
|
546
|
+
--batch-size 10
|
|
547
|
+
```
|
|
548
|
+
|
|
549
|
+
## Contributing
|
|
550
|
+
|
|
551
|
+
We welcome contributions! See [CONTRIBUTING.md](CONTRIBUTING.md) for:
|
|
552
|
+
|
|
553
|
+
- How to submit new evals
|
|
554
|
+
- Testing requirements
|
|
555
|
+
- PR guidelines
|
|
556
|
+
|
|
557
|
+
**Quick contribution:**
|
|
558
|
+
1. Fork the repo
|
|
559
|
+
2. Create new eval in `evals/[category]/`
|
|
560
|
+
3. Test locally: `python -m microevals.eval_runner --eval your-eval.yaml --repo test-repo`
|
|
561
|
+
4. Submit PR
|
|
562
|
+
|
|
563
|
+
## License
|
|
564
|
+
|
|
565
|
+
MicroEvals is released under the MIT License. See [LICENSE](LICENSE) for details.
|
|
566
|
+
|
|
567
|
+
## Support
|
|
568
|
+
|
|
569
|
+
- [Issues](https://github.com/Design-Arena/MicroEvals/issues)
|
|
570
|
+
- Email: contact@designarena.ai
|
|
571
|
+
|
|
572
|
+
---
|
|
573
|
+
|
|
574
|
+
Built for better agent code quality.
|
|
575
|
+
See more and try the evals live at [designarena.ai/evals](https://designarena.ai/evals).
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
config/judge_system_prompt.yaml,sha256=ZsnWvg3IlXcNmF75MXo2fKEacRq_32ycab7bYPZgEd0,5088
|
|
2
|
+
evals/nextjs/001-server-component.yaml,sha256=5OUlzDEjDl8GhzEsFPWxaPTGIWh6Va9iQsFoNdHJBdo,1254
|
|
3
|
+
evals/nextjs/002-client-component.yaml,sha256=OSS4wceHg6gt38B1ViCD5EyQHeD-pm71rGsHW5jmySE,1187
|
|
4
|
+
evals/nextjs/003-cookies.yaml,sha256=7-o71TwWxf_1zypbMafBsBgbCCvehmv_QdIIEVocbtc,1327
|
|
5
|
+
evals/nextjs/010-route-handlers.yaml,sha256=aKNoU6aaqcYBK70nD7NwaUhJoTMbbGAKKV9ncZ-BsRE,1388
|
|
6
|
+
evals/nextjs/013-pathname-server.yaml,sha256=VhcdlfMp8kKnJny2RDddyJpFlnCtel9MPk2gv-4Zzjg,1384
|
|
7
|
+
evals/nextjs/014-server-routing.yaml,sha256=WSd9RiOf5qQjc8syBXg2feJO3Si6AIfg2tkpHlEOAcg,1406
|
|
8
|
+
evals/nextjs/018-use-router.yaml,sha256=tg_e_OX5YFBI1OszF-0vYLNc1Jp-1uxoXuABPWTKvKw,1229
|
|
9
|
+
evals/nextjs/020_no_use_effect.yaml,sha256=wQ1Gv4d4DfbF3WHWt6E9kayx1t8oUGy3tjlqmdFXWVw,1379
|
|
10
|
+
evals/nextjs/021-avoid-fetch-in-effect.yaml,sha256=-Yi1T6O2u7ZmR3w1jpkUvJ-SPMnxovQq6VEgNzk4YsU,1497
|
|
11
|
+
evals/nextjs/022_prefer_server_actions.yaml,sha256=3N_GIl216M3eL8leR5SUSmsxACjJp70LfDFS-d-cULg,1229
|
|
12
|
+
evals/nextjs/023_avoid_getserversideprops.yaml,sha256=bsxl6cu4S9quaCFFxyNYeQIC3_6aoGmtEjaXVPchYjM,1143
|
|
13
|
+
evals/nextjs/024_avoid_redundant_usestate.yaml,sha256=m2Iuwj03xM6PES2_nJA2Vk8SN4o3dImCwR1OFvyMgok,1251
|
|
14
|
+
evals/nextjs/025_no_async_client_components.yaml,sha256=sOLWLSEfg1AXrK4xYyVLt-xvbqzWYcmHItPdbUn1a0I,1299
|
|
15
|
+
evals/nextjs/026_no_serial_await.yaml,sha256=i-RPE8dsxo52IfXrb_FIVyJ0F4PexVwFWBak4yZpg7I,1118
|
|
16
|
+
evals/nextjs/027-prefer-next-image.yaml,sha256=_HqYl90rA1DdUH_59CIygMvdImKZEPAZFnUsgTrGiNI,1235
|
|
17
|
+
evals/nextjs/027_no_hooks_in_server_components.yaml,sha256=OCtNLX1Y2xPdPJSK-SmTtKFW2TbpJYF3g6ol3SPFQpY,1319
|
|
18
|
+
evals/nextjs/028-prefer-next-font.yaml,sha256=6zFqwC0Et8CgJh2TWhdg5bgaE1t5QayK3AJDTBJ5pac,1339
|
|
19
|
+
evals/nextjs/028_cookies_headers_context.yaml,sha256=CWByrwvBgU9S3u5sWbn7QaUAvoFMiEUj5Yw7KOKs82g,1379
|
|
20
|
+
evals/nextjs/029_no_catch_redirect.yaml,sha256=bzbKtQV93ldhDKjOWPGSMn2LgQlFmifxp4iOnMfThmM,1380
|
|
21
|
+
evals/nextjs/030_app_router_migration.yaml,sha256=d35_G6J-TydYO9Ehw-s8ETjDI5TBJK3827PEVOtLQSo,1201
|
|
22
|
+
evals/nextjs/031_no_non_serializable_props.yaml,sha256=kKb4zDbUmhBj9kEq2cFzO7QEx61LE16HfVkwmO7Btmc,1590
|
|
23
|
+
evals/react/001_missing_useeffect_dependencies.yaml,sha256=dvulLqelmdheGINvruWgM02LzxQz_7UlApLSr9YNlE4,1132
|
|
24
|
+
evals/react/002_incorrect_event_handler.yaml,sha256=BfSrp1Q887ZIUvWtG5s6B4kJnUBaQoJBArEg85SjRWw,1060
|
|
25
|
+
evals/react/003_missing_return_in_map.yaml,sha256=lJfeEVKkB28CgJizLTLIifhqzPXnfDpfxAlIwKK9M9A,1119
|
|
26
|
+
evals/react/004_async_useeffect.yaml,sha256=5iD_lilT9RZKe_Ql0JoDpiyEOWgR6bN8mhDuFSV9F3s,1071
|
|
27
|
+
evals/react/005_direct_state_mutation.yaml,sha256=bGIc6W9z3sIy_4fh04olnH2MDPGJJ3tVpnzFQM-SwzQ,1080
|
|
28
|
+
evals/react/006_index_as_key.yaml,sha256=Y-x6f6P0fMQHQdDHmeP0XUc-41dNqcIQzxOQhQlgJzk,1107
|
|
29
|
+
evals/react/zustand_store_usage.yaml,sha256=hWd5t2VEb7sqSZ0VILdTFps2zpB8QoIfrKCnhsHcD7w,886
|
|
30
|
+
evals/shadcn/001_cn_utility_function.yaml,sha256=p2m3vW8jQbsOtGPkdq58c4lNBt5lLjhJWKnEwgeqxB4,1050
|
|
31
|
+
evals/shadcn/002_css_variables.yaml,sha256=8nLAn5AuFTtJrIjztyrkNIFWlvCUR5ThcBpVbi4l9yI,1116
|
|
32
|
+
evals/shadcn/003_component_dependencies.yaml,sha256=9wcCMOrY09plqT-qdJ4Y0H_cu73NWBOCz47o7OFUdXc,1118
|
|
33
|
+
evals/shadcn/004_path_aliases.yaml,sha256=mJPsZoCsqLIVgU4-U-G1gZCiaVwXi8mLh5cgK3f5aCQ,1111
|
|
34
|
+
evals/shadcn/005_client_directive.yaml,sha256=Rh3KiGDILuhF5fFPd7O9MxG9aH2XDQ483rw6EvyZt-Y,1208
|
|
35
|
+
evals/shadcn/006_tailwind_config.yaml,sha256=yeTnRhwdzO4fxxh6S7v1KUVoRUYWBNOgw2J9RR_wWF8,1253
|
|
36
|
+
evals/shadcn/007_components_json_config.yaml,sha256=b1dQbAvA_3-fR2iBB9oAua-GAYlcAg3SBRksaLQc9QM,1154
|
|
37
|
+
evals/supabase/001_client_setup.yaml,sha256=P00X9Zovq-szyvftkRA_eW_RDbpbZH7JrMWzxQd_CsU,2102
|
|
38
|
+
evals/supabase/002_auth_context_setup.yaml,sha256=3_YmBmQSxMdw14DEtORUQaavYnzPlOryiGeIScPqZas,1854
|
|
39
|
+
evals/supabase/003_auth_flow_implementation.yaml,sha256=Rzea6GCxT9V7cEEjSa8pZj69BjRiJ3EaDqkX-P6UGn8,1634
|
|
40
|
+
evals/supabase/004_auth_flow_testing_WIP.yaml,sha256=JdqJ5p_YnNTgWDjvRBG4B3fkqUtww3hPbSBpPjc9tsU,1787
|
|
41
|
+
evals/supabase/005_auth_google_oauth.yaml,sha256=6scsmEaNmLCa27MhGdn3KsTQfX0jtuPio6cPdnelr8w,2136
|
|
42
|
+
evals/supabase/007_storage_client_setup.yaml,sha256=WsdZ_W2MHSqbgcPZfnd_gpBJvCiBLb5P6_yUz6fF6C4,1701
|
|
43
|
+
evals/supabase/008_storage_nextjs_config.yaml,sha256=drp8H6hnZZsIZvzunU_fHrxeNhwcp5t4hKLgW3qUNjU,1698
|
|
44
|
+
evals/supabase/009_storage_image_upload.yaml,sha256=yLh2vFAAaZAzTfcBmJ4v8IZtm3nhwru2e6M3eMEcpYU,1831
|
|
45
|
+
evals/supabase/010_security_rls_enabled.yaml,sha256=q9iglE_s5WEVRhteSr9WInPO-x5ZkE0RFx_89YkBmYA,1713
|
|
46
|
+
evals/supabase/011_security_rls_policies.yaml,sha256=f1LeJDcFEoNu0OfkcOlBXxiSnFfdEADHhtH6qjZhmKA,1805
|
|
47
|
+
evals/supabase/012_security_no_service_key_exposed.yaml,sha256=VEhOO9OQG2XimJ1yXhi-aYU3h4M9gBzIQaR69JsybTE,1950
|
|
48
|
+
evals/supabase/013_database_read_data.yaml,sha256=x9dDAIJEc1YDh7QS1nwQKZ4z7niCPb-m54_klnJpiGg,1660
|
|
49
|
+
evals/supabase/014_database_create_data.yaml,sha256=qyT850fWv-lDF4rQunOwSyHWyF1AisKVcbnZp0JMHzQ,1648
|
|
50
|
+
evals/supabase/015_database_update_data.yaml,sha256=rHStxm0UhyaXFeY9SdhCp3MTCBaUnIrdIkbn_kNIuhQ,1931
|
|
51
|
+
evals/supabase/016_database_delete_data.yaml,sha256=jkrNElfHgyXk13GBBdtwjTAkOtJsVaS_AbsfLaBYLiE,1879
|
|
52
|
+
evals/supabase/017_database_user_scoped_query.yaml,sha256=ZK0E3rN22RCk9S9d2AipKGGhK8WXvtEdrI5a9GAUG8Y,1993
|
|
53
|
+
evals/tailwind/001_tailwind_v4_config.yaml,sha256=RN7XV7zbAFKSeiXzKUPyNm7TfIz6Xdn6s4yxOgyUfj4,745
|
|
54
|
+
evals/tailwind/002_content_paths.yaml,sha256=2F7HS4Tnb3GQOLcde2fDMiWKPTKRE3MBduU4nMJtshI,1159
|
|
55
|
+
evals/tailwind/003_no_dynamic_class_construction.yaml,sha256=03p1Y58-rP0eAC4lRPXb2MFPSrFKBDU0rqm1RUKqITk,1124
|
|
56
|
+
evals/tailwind/tailwind_postcss_config.yaml,sha256=Mfts7oOzFlLGhrOfJ0TKsLGAWqnxEVKYX0UIvJsjXIs,1039
|
|
57
|
+
evals/typescript/001_unsafe_type_assertions.yaml,sha256=AFFcRRcNlShvXfZnq4EH47EGdSC_uM9hu5TfVtGPdpM,1551
|
|
58
|
+
evals/typescript/002_missing_null_checks.yaml,sha256=n_B9c1gCG9Qt9IgGJ-vryQjb7nAtGx_hGa7VWTpUqXg,1197
|
|
59
|
+
evals/vercel/001_vercel_deployment.yaml,sha256=BUBS_V_0MWQOR9a3j_PeTfDKK37Y-3tSBtcBtUJLm8c,845
|
|
60
|
+
evals/vercel/002_environment_variables_handling.yaml,sha256=s5_7u9FKXtgxJhNgRhUTBzBknMbTbnuwndWY5UPK7Uw,1069
|
|
61
|
+
evals/vercel/003_seo_metadata.yaml,sha256=IPMyNdU6dVdWznwb2dY2VtldYFOVbIdBTb8ho6KjO5U,1448
|
|
62
|
+
microevals/__init__.py,sha256=6OZMz6BNNbsEaLcoeooRXgREZ-UjwJkFfQWxuKx6JLM,651
|
|
63
|
+
microevals/eval_registry.py,sha256=8r-57uqcorPUuf2m_4gliI6RGQpgLYz0DaYJf-LOkng,8489
|
|
64
|
+
microevals/eval_runner.py,sha256=2VMyDUnPKWz_TWVg7hWyQNxK3Lpil0Q4d58S_CreUZs,21473
|
|
65
|
+
microevals/utils.py,sha256=qvZk09CDwM4_pym1hSM_W-ZDIfYCMUeNIC8K5rhI9x4,16884
|
|
66
|
+
microevals-0.1.0.dist-info/licenses/LICENSE,sha256=KrbtaYayZGcets1iOs45DzSQ29eEDn25Vk9yzIfA-TA,1069
|
|
67
|
+
microevals-0.1.0.dist-info/METADATA,sha256=623qIjGrAGJnjbHPLxGuqGlxzGtFnOH-MR1ERnA7jls,15671
|
|
68
|
+
microevals-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
69
|
+
microevals-0.1.0.dist-info/entry_points.txt,sha256=sTpr74DUzhPIMVcfzx1IyLwZ_wNuf5BJNYhNwxamm_c,58
|
|
70
|
+
microevals-0.1.0.dist-info/top_level.txt,sha256=8LDwu8cEm-bLrA_lUj1ARssge0Db7oHgu0cXnysx5UM,11
|
|
71
|
+
microevals-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Design-Arena
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
microevals
|