microevals 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. config/judge_system_prompt.yaml +113 -0
  2. evals/nextjs/001-server-component.yaml +28 -0
  3. evals/nextjs/002-client-component.yaml +26 -0
  4. evals/nextjs/003-cookies.yaml +28 -0
  5. evals/nextjs/010-route-handlers.yaml +30 -0
  6. evals/nextjs/013-pathname-server.yaml +29 -0
  7. evals/nextjs/014-server-routing.yaml +28 -0
  8. evals/nextjs/018-use-router.yaml +28 -0
  9. evals/nextjs/020_no_use_effect.yaml +30 -0
  10. evals/nextjs/021-avoid-fetch-in-effect.yaml +28 -0
  11. evals/nextjs/022_prefer_server_actions.yaml +29 -0
  12. evals/nextjs/023_avoid_getserversideprops.yaml +27 -0
  13. evals/nextjs/024_avoid_redundant_usestate.yaml +29 -0
  14. evals/nextjs/025_no_async_client_components.yaml +29 -0
  15. evals/nextjs/026_no_serial_await.yaml +26 -0
  16. evals/nextjs/027-prefer-next-image.yaml +30 -0
  17. evals/nextjs/027_no_hooks_in_server_components.yaml +29 -0
  18. evals/nextjs/028-prefer-next-font.yaml +30 -0
  19. evals/nextjs/028_cookies_headers_context.yaml +29 -0
  20. evals/nextjs/029_no_catch_redirect.yaml +31 -0
  21. evals/nextjs/030_app_router_migration.yaml +30 -0
  22. evals/nextjs/031_no_non_serializable_props.yaml +31 -0
  23. evals/react/001_missing_useeffect_dependencies.yaml +29 -0
  24. evals/react/002_incorrect_event_handler.yaml +28 -0
  25. evals/react/003_missing_return_in_map.yaml +28 -0
  26. evals/react/004_async_useeffect.yaml +32 -0
  27. evals/react/005_direct_state_mutation.yaml +30 -0
  28. evals/react/006_index_as_key.yaml +31 -0
  29. evals/react/zustand_store_usage.yaml +25 -0
  30. evals/shadcn/001_cn_utility_function.yaml +31 -0
  31. evals/shadcn/002_css_variables.yaml +32 -0
  32. evals/shadcn/003_component_dependencies.yaml +33 -0
  33. evals/shadcn/004_path_aliases.yaml +32 -0
  34. evals/shadcn/005_client_directive.yaml +31 -0
  35. evals/shadcn/006_tailwind_config.yaml +36 -0
  36. evals/shadcn/007_components_json_config.yaml +35 -0
  37. evals/supabase/001_client_setup.yaml +47 -0
  38. evals/supabase/002_auth_context_setup.yaml +43 -0
  39. evals/supabase/003_auth_flow_implementation.yaml +46 -0
  40. evals/supabase/004_auth_flow_testing_WIP.yaml +52 -0
  41. evals/supabase/005_auth_google_oauth.yaml +55 -0
  42. evals/supabase/007_storage_client_setup.yaml +43 -0
  43. evals/supabase/008_storage_nextjs_config.yaml +45 -0
  44. evals/supabase/009_storage_image_upload.yaml +49 -0
  45. evals/supabase/010_security_rls_enabled.yaml +42 -0
  46. evals/supabase/011_security_rls_policies.yaml +43 -0
  47. evals/supabase/012_security_no_service_key_exposed.yaml +49 -0
  48. evals/supabase/013_database_read_data.yaml +44 -0
  49. evals/supabase/014_database_create_data.yaml +44 -0
  50. evals/supabase/015_database_update_data.yaml +47 -0
  51. evals/supabase/016_database_delete_data.yaml +47 -0
  52. evals/supabase/017_database_user_scoped_query.yaml +52 -0
  53. evals/tailwind/001_tailwind_v4_config.yaml +22 -0
  54. evals/tailwind/002_content_paths.yaml +27 -0
  55. evals/tailwind/003_no_dynamic_class_construction.yaml +28 -0
  56. evals/tailwind/tailwind_postcss_config.yaml +24 -0
  57. evals/typescript/001_unsafe_type_assertions.yaml +39 -0
  58. evals/typescript/002_missing_null_checks.yaml +33 -0
  59. evals/vercel/001_vercel_deployment.yaml +19 -0
  60. evals/vercel/002_environment_variables_handling.yaml +23 -0
  61. evals/vercel/003_seo_metadata.yaml +33 -0
  62. microevals/__init__.py +34 -0
  63. microevals/eval_registry.py +222 -0
  64. microevals/eval_runner.py +533 -0
  65. microevals/utils.py +490 -0
  66. microevals-0.1.0.dist-info/METADATA +575 -0
  67. microevals-0.1.0.dist-info/RECORD +71 -0
  68. microevals-0.1.0.dist-info/WHEEL +5 -0
  69. microevals-0.1.0.dist-info/entry_points.txt +2 -0
  70. microevals-0.1.0.dist-info/licenses/LICENSE +21 -0
  71. microevals-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,46 @@
1
+
2
+ eval_id: supabase_auth_flow_implementation
3
+ name: "Supabase Auth Flow Implementation"
4
+ description: "Check if agent implemented signup, login, and signout functionality"
5
+ category: supabase
6
+
7
+ # What was the agent asked to do?
8
+ task_description: |
9
+ Create a login page and authentication system
10
+
11
+ # How to evaluate success?
12
+ criteria: |
13
+ The agent was asked to create a login page and authentication system.
14
+
15
+ DO NOT MODIFY ANY FILES. Only read and evaluate the existing code.
16
+
17
+ Evaluate if they completed the task successfully by checking:
18
+
19
+ 1. **Sign Up Implementation**
20
+ - Uses supabase.auth.signUp() or similar
21
+ - Accepts email and password
22
+
23
+ 2. **Sign In Implementation**
24
+ - Uses supabase.auth.signInWithPassword() or similar
25
+ - Accepts email and password
26
+
27
+ 3. **Sign Out Implementation**
28
+ - Uses supabase.auth.signOut()
29
+ - Clears session properly
30
+
31
+ Look at the repo and check for evidence like:
32
+ - Auth context or auth service with signUp method
33
+ - Auth context or auth service with signIn method
34
+ - Auth context or auth service with signOut method
35
+ - Proper Supabase auth API usage
36
+
37
+ SCORING:
38
+ - 1.0 (PASS): All three auth methods (signUp, signIn, signOut) are properly implemented with correct Supabase API usage
39
+ - 0.0 (FAIL): Missing any of the three auth methods OR incorrect implementation OR anti-patterns found
40
+ - -1.0 (N/A): Supabase auth is NOT USED (no @supabase/supabase-js OR uses different auth provider like Firebase, Auth0, NextAuth)
41
+
42
+ # Optional: Custom inputs for this specific eval
43
+ inputs:
44
+ supabase_url: "SUPABASE_URL"
45
+ supabase_anon_key: "SUPABASE_ANON_KEY"
46
+
@@ -0,0 +1,52 @@
1
+
2
+ eval_id: supabase_auth_flow_testing
3
+ name: "Supabase Auth Flow Testing"
4
+ description: "Test the actual authentication flow with live credentials"
5
+ category: supabase
6
+
7
+ # What was the agent asked to do?
8
+ task_description: |
9
+ Create a login page and authentication system
10
+
11
+ # How to evaluate success?
12
+ criteria: |
13
+ The agent was asked to create a login page and authentication system. Test if it actually works.
14
+
15
+ DO NOT MODIFY ANY FILES. Evaluate the existing code as-is; starting the application to test it is allowed, but do not change the code.
16
+
17
+ Evaluate if the authentication works by testing:
18
+
19
+ 1. **Sign Up Works**
20
+ - Test account creation with new email/password
21
+ - Verify account is created in Supabase
22
+ - Use provided credentials: URL={supabase_url}, Key={supabase_admin_key}
23
+
24
+ 2. **Sign In Works**
25
+ - Test login with test account: {test_email} / {test_password}
26
+ - Verify session is created
27
+ - User state is updated
28
+
29
+ 3. **Sign Out Works**
30
+ - Test logout functionality
31
+ - Verify session is cleared
32
+ - User state is reset
33
+
34
+ Look at the repo and check for evidence like:
35
+ - Start the application
36
+ - Test the complete authentication flow
37
+ - Verify auth state changes correctly
38
+ - Check Supabase dashboard for user creation
39
+
40
+ SCORING:
41
+ - 1.0 (PASS): All three operations work (signUp, signIn, signOut) when tested live with provided credentials
42
+ - 0.0 (FAIL): Any of the three operations fail OR return authentication errors when tested (if the flow cannot be tested at all, score -1.0 instead)
43
+ - -1.0 (N/A): Cannot test authentication flow (app doesn't run, no test environment, or auth not implemented)
44
+
45
+ # Optional: Custom inputs for this specific eval
46
+ inputs:
47
+ supabase_url: "SUPABASE_URL"
48
+ supabase_anon_key: "SUPABASE_ANON_KEY"
49
+ supabase_admin_key: "SUPABASE_ADMIN_KEY"
50
+ test_email: "test@example.com"
51
+ test_password: "testpassword123"
52
+
@@ -0,0 +1,55 @@
1
+
2
+ eval_id: supabase_auth_google_oauth
3
+ name: "Supabase Google OAuth Implementation"
4
+ description: "Check if agent implemented Google OAuth login correctly"
5
+ category: supabase
6
+
7
+ # What was the agent asked to do?
8
+ task_description: |
9
+ Add Google OAuth login
10
+
11
+ # How to evaluate success?
12
+ criteria: |
13
+ The agent was asked to add Google OAuth login functionality.
14
+
15
+ DO NOT MODIFY ANY FILES. Only read and evaluate the existing code.
16
+
17
+ Evaluate if they completed the task successfully by checking:
18
+
19
+ 1. **OAuth Sign In Implementation**
20
+ - Uses supabase.auth.signInWithOAuth({ provider: 'google' }) or similar
21
+ - Correct provider specified as 'google'
22
+ - Proper async/await or promise handling
23
+
24
+ 2. **Redirect Configuration**
25
+ - Redirect URLs configured (e.g., redirectTo option)
26
+ - Uses environment variables for redirect URLs (not hard-coded)
27
+ - Check for NEXT_PUBLIC_SITE_URL or similar env var usage
28
+
29
+ 3. **No Hard-Coded Secrets**
30
+ - No Google client secrets in the code
31
+ - No hard-coded OAuth credentials
32
+ - Credentials should be in .env or .env.local (not committed)
33
+
34
+ 4. **Proper Redirect Handling**
35
+ - Auth callback route exists (e.g., /auth/callback)
36
+ - Exchanges code for session using supabase.auth.exchangeCodeForSession() or similar
37
+ - Handles redirect after successful authentication
38
+
39
+ Look at the repo and check for evidence like:
40
+ - signInWithOAuth method with 'google' provider
41
+ - Environment variable usage for URLs/redirects
42
+ - Auth callback route handler
43
+ - No hard-coded secrets or credentials
44
+ - Proper error handling for OAuth flow
45
+
46
+ SCORING:
47
+ - 1.0 (PASS): All requirements met (correct OAuth call, env vars for redirect, no secrets, callback handler exists)
48
+ - 0.0 (FAIL): Missing any requirement OR incorrect implementation OR security issues (hardcoded secrets/credentials)
49
+ - -1.0 (N/A): Supabase auth is NOT USED (no @supabase/supabase-js OR uses different auth provider) OR OAuth is not implemented at all
50
+
51
+ # Optional: Custom inputs for this specific eval
52
+ inputs:
53
+ supabase_url: "SUPABASE_URL"
54
+ supabase_anon_key: "SUPABASE_ANON_KEY"
55
+
@@ -0,0 +1,43 @@
1
+
2
+ eval_id: supabase_storage_client_setup
3
+ name: "Supabase Storage Client Setup"
4
+ description: "Check if agent set up Supabase storage client and packages"
5
+ category: supabase
6
+
7
+ # What was the agent asked to do?
8
+ task_description: |
9
+ Implement image upload functionality using Supabase storage
10
+
11
+ # How to evaluate success?
12
+ criteria: |
13
+ The agent was asked to implement image upload functionality using Supabase storage.
14
+
15
+ DO NOT MODIFY ANY FILES. Only read and evaluate the existing code.
16
+
17
+ Evaluate if they completed the task successfully by checking:
18
+
19
+ 1. **Storage Client Available**
20
+ - Supabase client is configured (from 001_client_setup)
21
+ - Can access storage methods (.storage.from())
22
+
23
+ 2. **Connection Test**
24
+ - Test connection to storage using provided credentials: URL={supabase_url}, Key={supabase_anon_key}
25
+ - Verify storage bucket exists or can be accessed
26
+ - Use admin key if needed: {supabase_admin_key}
27
+
28
+ Look at the repo and check for evidence like:
29
+ - Supabase client with storage access
30
+ - Storage methods being used (.storage.from().upload(), .storage.from().getPublicUrl())
31
+ - Successful connection to storage
32
+
33
+ SCORING:
34
+ - 1.0 (PASS): Storage client is available AND connection to storage works with provided credentials
35
+ - 0.0 (FAIL): Missing storage client OR connection fails OR storage methods not accessible
36
+ - -1.0 (N/A): Supabase is NOT USED (no @supabase/supabase-js) OR no file storage feature exists (using different provider like AWS S3, Cloudinary, etc.)
37
+
38
+ # Optional: Custom inputs for this specific eval
39
+ inputs:
40
+ supabase_url: "SUPABASE_URL"
41
+ supabase_anon_key: "SUPABASE_ANON_KEY"
42
+ supabase_admin_key: "SUPABASE_ADMIN_KEY"
43
+
@@ -0,0 +1,45 @@
1
+
2
+ eval_id: supabase_storage_nextjs_config
3
+ name: "Next.js Image Config for Supabase Storage"
4
+ description: "Check if Next.js is configured to allow images from Supabase storage"
5
+ category: supabase
6
+ framework: nextjs
7
+
8
+ # What was the agent asked to do?
9
+ task_description: |
10
+ Implement image upload functionality using Supabase storage
11
+
12
+ # How to evaluate success?
13
+ criteria: |
14
+ The agent was asked to implement image upload functionality using Supabase storage.
15
+
16
+ DO NOT MODIFY ANY FILES. Only read and evaluate the existing code.
17
+
18
+ Evaluate if they completed the task successfully by checking:
19
+
20
+ 1. **Next.js Config Exists**
21
+ - next.config.js or next.config.ts file exists
22
+ - File is properly formatted
23
+
24
+ 2. **Images RemotePatterns Configured**
25
+ - Config includes images.remotePatterns array
26
+ - Supabase storage hostname is in remotePatterns (extract from {supabase_url})
27
+ - Protocol is set to 'https'
28
+ - Pathname pattern covers storage paths (e.g., '/storage/v1/object/public/**')
29
+
30
+ Look at the repo and check for evidence like:
31
+ - next.config.js or next.config.ts file
32
+ - images.remotePatterns configuration
33
+ - Supabase hostname in remotePatterns
34
+ - Correct protocol and pathname patterns
35
+
36
+ SCORING:
37
+ - 1.0 (PASS): Next.js config exists AND Supabase storage hostname is properly configured in remotePatterns
38
+ - 0.0 (FAIL): Missing config file OR missing/incorrect remotePatterns configuration OR anti-patterns found
39
+ - -1.0 (N/A): Supabase storage is NOT USED (no @supabase/supabase-js OR uses different storage provider) OR not a Next.js app
40
+
41
+ # Optional: Custom inputs for this specific eval
42
+ inputs:
43
+ supabase_url: "SUPABASE_URL"
44
+ project_root: "."
45
+
@@ -0,0 +1,49 @@
1
+
2
+ eval_id: supabase_storage_image_upload
3
+ name: "Supabase Storage Image Upload Implementation"
4
+ description: "Check if image upload and display functionality is properly implemented"
5
+ category: supabase
6
+ framework: nextjs
7
+
8
+ # What was the agent asked to do?
9
+ task_description: |
10
+ Implement image upload functionality using Supabase storage
11
+
12
+ # How to evaluate success?
13
+ criteria: |
14
+ The agent was asked to implement image upload functionality using Supabase storage.
15
+
16
+ DO NOT MODIFY ANY FILES. Only read and evaluate the existing code.
17
+
18
+ Evaluate if they completed the task successfully by checking:
19
+
20
+ 1. **Upload Implementation**
21
+ - Uses .storage.from().upload() method
22
+ - Handles file selection and upload
23
+ - Proper error handling for upload
24
+
25
+ 2. **Image Display with Next.js Image Component**
26
+ - Uses Next.js Image component (next/image)
27
+ - Image src points to Supabase storage URLs
28
+ - Proper width/height or fill mode
29
+
30
+ 3. **Public URL Retrieval**
31
+ - Uses .storage.from().getPublicUrl() or similar
32
+ - Correctly constructs image URLs from storage
33
+
34
+ Look at the repo and check for evidence like:
35
+ - Upload functionality using Supabase storage API
36
+ - Next.js Image component with Supabase URLs
37
+ - Public URL retrieval from storage
38
+ - Proper implementation in components or API routes
39
+
40
+ SCORING:
41
+ - 1.0 (PASS): Upload is implemented AND Next.js Image component is used with Supabase storage URLs
42
+ - 0.0 (FAIL): Missing upload implementation OR missing/incorrect Image component usage OR anti-patterns found
43
+ - -1.0 (N/A): Supabase storage is NOT USED (no @supabase/supabase-js OR uses different storage provider) OR no image upload feature exists
44
+
45
+ # Optional: Custom inputs for this specific eval
46
+ inputs:
47
+ supabase_url: "SUPABASE_URL"
48
+ supabase_anon_key: "SUPABASE_ANON_KEY"
49
+
@@ -0,0 +1,42 @@
1
+
2
+ eval_id: supabase_security_rls_enabled
3
+ name: "Supabase RLS Enabled on Tables"
4
+ description: "Check if Row Level Security is enabled on database tables"
5
+ category: supabase
6
+
7
+ # What was the agent asked to do?
8
+ task_description: |
9
+ Make sure the database and Supabase are secure
10
+
11
+ # How to evaluate success?
12
+ criteria: |
13
+ The agent was asked to make sure the database and Supabase are secure.
14
+
15
+ DO NOT MODIFY ANY FILES. Only read and evaluate the existing code.
16
+
17
+ Evaluate if they completed the task successfully by checking:
18
+
19
+ 1. **RLS Enabled on Public Tables**
20
+ - Query Supabase to check RLS status on all public tables
21
+ - Use admin credentials: URL={supabase_url}, Key={supabase_admin_key}
22
+ - Run SQL: SELECT tablename, rowsecurity FROM pg_tables WHERE schemaname = 'public'
23
+
24
+ 2. **All Data Tables Have RLS**
25
+ - Verify that all tables (excluding system tables) have rowsecurity = true
26
+ - Tables without RLS are a critical security vulnerability
27
+
28
+ Look at the repo and check for evidence like:
29
+ - Database tables have RLS enabled
30
+ - No public tables accessible without RLS
31
+ - SQL queries or migrations that enable RLS (ALTER TABLE ... ENABLE ROW LEVEL SECURITY)
32
+
33
+ SCORING:
34
+ - 1.0 (PASS): RLS is enabled on ALL public data tables (rowsecurity = true for all tables)
35
+ - 0.0 (FAIL): Any public data table missing RLS OR security vulnerability found (tables without RLS)
36
+ - -1.0 (N/A): Supabase is NOT USED (no @supabase/supabase-js OR uses different database like MongoDB, Prisma+PostgreSQL, Firebase) OR cannot connect to verify RLS status
37
+
38
+ # Optional: Custom inputs for this specific eval
39
+ inputs:
40
+ supabase_url: "SUPABASE_URL"
41
+ supabase_admin_key: "SUPABASE_ADMIN_KEY"
42
+
@@ -0,0 +1,43 @@
1
+
2
+ eval_id: supabase_security_rls_policies
3
+ name: "Supabase RLS Policies Defined"
4
+ description: "Check if Row Level Security policies are properly defined"
5
+ category: supabase
6
+
7
+ # What was the agent asked to do?
8
+ task_description: |
9
+ Make sure the database and Supabase are secure
10
+
11
+ # How to evaluate success?
12
+ criteria: |
13
+ The agent was asked to make sure the database and Supabase are secure.
14
+
15
+ DO NOT MODIFY ANY FILES. Only read and evaluate the existing code.
16
+
17
+ Evaluate if they completed the task successfully by checking:
18
+
19
+ 1. **RLS Policies Exist**
20
+ - Query Supabase to check for RLS policies
21
+ - Use admin credentials: URL={supabase_url}, Key={supabase_admin_key}
22
+ - Run SQL: SELECT schemaname, tablename, policyname, cmd FROM pg_policies WHERE schemaname = 'public'
23
+
24
+ 2. **Policies Cover CRUD Operations**
25
+ - Each table should have policies for operations being used (SELECT, INSERT, UPDATE, DELETE)
26
+ - Policies should use auth.uid() or similar for user-based access control
27
+ - At minimum, SELECT policies should exist for user data protection
28
+
29
+ Look at the repo and check for evidence like:
30
+ - RLS policies exist in database
31
+ - Policies use authentication checks (auth.uid(), roles, etc.)
32
+ - SQL queries or migrations that create policies (CREATE POLICY ... ON table_name)
33
+
34
+ SCORING:
35
+ - 1.0 (PASS): RLS policies exist for all tables AND use proper authentication checks (auth.uid() or similar)
36
+ - 0.0 (FAIL): Missing policies OR policies don't use authentication OR security vulnerability found
37
+ - -1.0 (N/A): Supabase is NOT USED (no @supabase/supabase-js OR uses different database) OR cannot connect to verify policies
38
+
39
+ # Optional: Custom inputs for this specific eval
40
+ inputs:
41
+ supabase_url: "SUPABASE_URL"
42
+ supabase_admin_key: "SUPABASE_ADMIN_KEY"
43
+
@@ -0,0 +1,49 @@
1
+
2
+ eval_id: supabase_security_no_service_key_exposed
3
+ name: "Supabase Service Key Not Exposed"
4
+ description: "Check that service role key is not exposed in client-side code"
5
+ category: supabase
6
+
7
+ # What was the agent asked to do?
8
+ task_description: |
9
+ Make sure the database and Supabase are secure
10
+
11
+ # How to evaluate success?
12
+ criteria: |
13
+ The agent was asked to make sure the database and Supabase are secure.
14
+
15
+ DO NOT MODIFY ANY FILES. Only read and evaluate the existing code.
16
+
17
+ Evaluate if they completed the task successfully by checking:
18
+
19
+ 1. **No Service Key in Client Code**
20
+ - Search all client-side code for service role key patterns
21
+ - Check for hardcoded service_role keys
22
+ - Verify only anon key is used in client-side code
23
+
24
+ 2. **Service Key Only in Server**
25
+ - If service key is used, it should only be in server-side code
26
+ - Examples: API routes, server actions, backend services
27
+ - Never in frontend components or public environment variables
28
+
29
+ 3. **Environment Variables Named Correctly**
30
+ - Client variables should use the framework's public prefix, such as NEXT_PUBLIC_ or VITE_ (e.g., NEXT_PUBLIC_SUPABASE_ANON_KEY)
31
+ - Service key should NOT have public prefix
32
+
33
+ Look at the repo and check for evidence like:
34
+ - Only anon key in client-side code
35
+ - Service key only in server-side code (if used at all)
36
+ - Proper environment variable naming
37
+ - No hardcoded keys anywhere
38
+
39
+ SCORING:
40
+ - 1.0 (PASS): Service key is NOT exposed in client code AND only anon key is used client-side with proper env variable naming
41
+ - 0.0 (FAIL): Service key found in client code OR hardcoded keys OR improper env variable usage (CRITICAL SECURITY ISSUE)
42
+ - -1.0 (N/A): Supabase is NOT USED (no @supabase/supabase-js OR no Supabase environment variables found)
43
+
44
+ # Optional: Custom inputs for this specific eval
45
+ inputs:
46
+ supabase_url: "SUPABASE_URL"
47
+ supabase_anon_key: "SUPABASE_ANON_KEY"
48
+ supabase_service_key: "SUPABASE_SERVICE_KEY"
49
+
@@ -0,0 +1,44 @@
1
+
2
+ eval_id: supabase_database_read_data
3
+ name: "Supabase Database Read/Query Data"
4
+ description: "Check if agent implements SELECT operations to read data from database"
5
+ category: supabase
6
+
7
+ # What was the agent asked to do?
8
+ task_description: |
9
+ Build an application that reads and displays data from the database
10
+
11
+ # How to evaluate success?
12
+ criteria: |
13
+ The agent was asked to build an application that reads and displays data from the database.
14
+
15
+ DO NOT MODIFY ANY FILES. Only read and evaluate the existing code.
16
+
17
+ Evaluate if they completed the task successfully by checking:
18
+
19
+ 1. **SELECT Query Exists**
20
+ - Find at least one SELECT operation in the code
21
+ - Uses .from().select() or similar query method
22
+ - Located in components, API routes, or server actions
23
+
24
+ 2. **Query Works**
25
+ - Test the query against the database
26
+ - Use provided credentials: URL={supabase_url}, Key={supabase_anon_key}
27
+ - Verify data can be retrieved from the expected table: {expected_table}
28
+
29
+ Look at the repo and check for evidence like:
30
+ - .from('table').select() queries
31
+ - Data fetching in components or API routes
32
+ - Successful query execution
33
+
34
+ SCORING:
35
+ - 1.0 (PASS): SELECT operation exists in code AND query successfully retrieves data from database
36
+ - 0.0 (FAIL): Missing SELECT operation OR query fails OR no data can be retrieved
37
+ - -1.0 (N/A): Supabase is NOT USED (no @supabase/supabase-js OR uses different database like MongoDB, Firebase, etc.)
38
+
39
+ # Optional: Custom inputs for this specific eval
40
+ inputs:
41
+ supabase_url: "SUPABASE_URL"
42
+ supabase_anon_key: "SUPABASE_ANON_KEY"
43
+ supabase_admin_key: "SUPABASE_ADMIN_KEY"
44
+
@@ -0,0 +1,44 @@
1
+
2
+ eval_id: supabase_database_create_data
3
+ name: "Supabase Database Create/Insert Data"
4
+ description: "Check if agent implements INSERT operations to store data in database"
5
+ category: supabase
6
+
7
+ # What was the agent asked to do?
8
+ task_description: |
9
+ Build an application that stores/saves data to the database
10
+
11
+ # How to evaluate success?
12
+ criteria: |
13
+ The agent was asked to build an application that stores/saves data to the database.
14
+
15
+ DO NOT MODIFY ANY FILES. Only read and evaluate the existing code.
16
+
17
+ Evaluate if they completed the task successfully by checking:
18
+
19
+ 1. **INSERT Operation Exists**
20
+ - Find at least one INSERT operation in the code
21
+ - Uses .from().insert() or similar method
22
+ - Located in forms, API routes, or server actions
23
+
24
+ 2. **Insert Works**
25
+ - Test the insert operation against the database
26
+ - Use provided credentials: URL={supabase_url}, Key={supabase_anon_key}
27
+ - Verify data can be inserted into the expected table: {expected_table}
28
+
29
+ Look at the repo and check for evidence like:
30
+ - .from('table').insert() operations
31
+ - Form submissions or data creation logic
32
+ - Successful insert execution
33
+
34
+ SCORING:
35
+ - 1.0 (PASS): INSERT operation exists in code AND successfully adds data to database
36
+ - 0.0 (FAIL): Missing INSERT operation OR insert fails OR data cannot be created
37
+ - -1.0 (N/A): Supabase is NOT USED (no @supabase/supabase-js OR uses different database) OR no data creation operations exist
38
+
39
+ # Optional: Custom inputs for this specific eval
40
+ inputs:
41
+ supabase_url: "SUPABASE_URL"
42
+ supabase_anon_key: "SUPABASE_ANON_KEY"
43
+ supabase_admin_key: "SUPABASE_ADMIN_KEY"
44
+
@@ -0,0 +1,47 @@
1
+
2
+ eval_id: supabase_database_update_data
3
+ name: "Supabase Database Update Data"
4
+ description: "Check if agent implements UPDATE operations to modify existing data"
5
+ category: supabase
6
+
7
+ # What was the agent asked to do?
8
+ task_description: |
9
+ Build an application that updates existing data in the database
10
+
11
+ # How to evaluate success?
12
+ criteria: |
13
+ The agent was asked to build an application that updates existing data in the database.
14
+
15
+ DO NOT MODIFY ANY FILES. Only read and evaluate the existing code.
16
+
17
+ Evaluate if they completed the task successfully by checking:
18
+
19
+ 1. **UPDATE Operation Exists**
20
+ - Find at least one UPDATE operation in the code
21
+ - Uses .from().update() or similar method
22
+ - Includes proper WHERE clause (.eq(), .match(), etc.) to target specific rows
23
+ - Located in API routes, server actions, or update handlers
24
+
25
+ 2. **Update Works**
26
+ - Test the update operation against the database
27
+ - Use provided credentials: URL={supabase_url}, Key={supabase_anon_key}
28
+ - Verify data can be updated in the expected table: {expected_table}
29
+ - Examples: incrementing counters, editing records, toggling states
30
+
31
+ Look at the repo and check for evidence like:
32
+ - .from('table').update() operations
33
+ - Update logic (edit forms, increment/decrement, state changes)
34
+ - Proper row targeting with .eq() or similar
35
+ - Successful update execution
36
+
37
+ SCORING:
38
+ - 1.0 (PASS): UPDATE operation exists in code with proper WHERE clause AND successfully modifies database data
39
+ - 0.0 (FAIL): Missing UPDATE operation OR update fails OR no proper WHERE clause (missing .eq() or similar)
40
+ - -1.0 (N/A): Supabase is NOT USED (no @supabase/supabase-js OR uses different database) OR no data update operations exist
41
+
42
+ # Optional: Custom inputs for this specific eval
43
+ inputs:
44
+ supabase_url: "SUPABASE_URL"
45
+ supabase_anon_key: "SUPABASE_ANON_KEY"
46
+ supabase_admin_key: "SUPABASE_ADMIN_KEY"
47
+
@@ -0,0 +1,47 @@
1
+
2
+ eval_id: supabase_database_delete_data
3
+ name: "Supabase Database Delete Data"
4
+ description: "Check if agent implements DELETE operations to remove data"
5
+ category: supabase
6
+
7
+ # What was the agent asked to do?
8
+ task_description: |
9
+ Build an application that can delete data from the database
10
+
11
+ # How to evaluate success?
12
+ criteria: |
13
+ The agent was asked to build an application that can delete data from the database.
14
+
15
+ DO NOT MODIFY ANY FILES. Only read and evaluate the existing code.
16
+
17
+ Evaluate if they completed the task successfully by checking:
18
+
19
+ 1. **DELETE Operation Exists**
20
+ - Find at least one DELETE operation in the code
21
+ - Uses .from().delete() or similar method
22
+ - Includes proper WHERE clause (.eq(), .match(), etc.) to target specific rows
23
+ - Located in API routes, server actions, or delete handlers
24
+
25
+ 2. **Delete Works**
26
+ - Test the delete operation against the database
27
+ - Use provided credentials: URL={supabase_url}, Key={supabase_anon_key}
28
+ - Verify data can be deleted from the expected table: {expected_table}
29
+ - Create test data first, then verify deletion works
30
+
31
+ Look at the repo and check for evidence like:
32
+ - .from('table').delete() operations
33
+ - Delete buttons or remove functionality
34
+ - Proper row targeting with .eq() or similar
35
+ - Successful delete execution
36
+
37
+ SCORING:
38
+ - 1.0 (PASS): DELETE operation exists in code with proper WHERE clause AND successfully removes database data
39
+ - 0.0 (FAIL): Missing DELETE operation OR delete fails OR no proper WHERE clause (missing .eq() or similar)
40
+ - -1.0 (N/A): Supabase is NOT USED (no @supabase/supabase-js OR uses different database) OR no data deletion operations exist
41
+
42
+ # Optional: Custom inputs for this specific eval
43
+ inputs:
44
+ supabase_url: "SUPABASE_URL"
45
+ supabase_anon_key: "SUPABASE_ANON_KEY"
46
+ supabase_admin_key: "SUPABASE_ADMIN_KEY"
47
+
@@ -0,0 +1,52 @@
1
+
2
+ eval_id: supabase_database_user_scoped_query
3
+ name: "Supabase User-Scoped Data Query"
4
+ description: "Check if agent implements queries filtered by current authenticated user"
5
+ category: supabase
6
+
7
+ # What was the agent asked to do?
8
+ task_description: |
9
+ Fetch all records for the current user
10
+
11
+ # How to evaluate success?
12
+ criteria: |
13
+ The agent was asked to fetch records that belong to the current authenticated user.
14
+
15
+ DO NOT MODIFY ANY FILES. Only read and evaluate the existing code.
16
+
17
+ Evaluate if they completed the task successfully by checking:
18
+
19
+ 1. **User Authentication Check**
20
+ - Gets current user from supabase.auth.getUser() or similar
21
+ - Checks if user is authenticated before querying
22
+ - Handles unauthenticated state appropriately
23
+
24
+ 2. **User-Scoped Query**
25
+ - Query filters by user_id or auth.uid()
26
+ - Uses .eq('user_id', user.id) or similar filter
27
+ - OR uses RLS policies that automatically filter by user
28
+ - Does NOT fetch all records without user filtering
29
+
30
+ 3. **Proper Query Structure**
31
+ - Uses .from().select().eq() pattern or similar
32
+ - Filters data to only show current user's records
33
+ - Could be in server component, server action, or API route
34
+
35
+ Look at the repo and check for evidence like:
36
+ - supabase.auth.getUser() to get current user
37
+ - .eq('user_id', user.id) or similar filtering
38
+ - Query that respects user ownership
39
+ - RLS policies enabled (optional but good practice)
40
+ - Not fetching unfiltered data
41
+
42
+ SCORING:
43
+ - 1.0 (PASS): Query exists AND filters by current user (either via .eq('user_id', user.id) in code or via RLS policies)
44
+ - 0.0 (FAIL): Missing query OR no user filtering OR fetches all data without user scope (SECURITY ISSUE)
45
+ - -1.0 (N/A): Supabase is NOT USED (no @supabase/supabase-js OR uses different database) OR no user-scoped queries exist
46
+
47
+ # Optional: Custom inputs for this specific eval
48
+ inputs:
49
+ supabase_url: "SUPABASE_URL"
50
+ supabase_anon_key: "SUPABASE_ANON_KEY"
51
+
52
+