agentic-loop 3.19.0 → 3.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. package/.claude/commands/tour.md +11 -7
  2. package/.claude/commands/vibe-help.md +5 -2
  3. package/.claude/commands/vibe-list.md +17 -2
  4. package/.claude/skills/prd/SKILL.md +21 -6
  5. package/.claude/skills/setup-review/SKILL.md +56 -0
  6. package/.claude/skills/tour/SKILL.md +11 -7
  7. package/.claude/skills/vibe-help/SKILL.md +2 -1
  8. package/.claude/skills/vibe-list/SKILL.md +5 -2
  9. package/.pre-commit-hooks.yaml +8 -0
  10. package/README.md +4 -0
  11. package/bin/agentic-loop.sh +7 -0
  12. package/bin/ralph.sh +29 -0
  13. package/dist/checks/check-signs-secrets.d.ts +9 -0
  14. package/dist/checks/check-signs-secrets.d.ts.map +1 -0
  15. package/dist/checks/check-signs-secrets.js +57 -0
  16. package/dist/checks/check-signs-secrets.js.map +1 -0
  17. package/dist/checks/index.d.ts +2 -5
  18. package/dist/checks/index.d.ts.map +1 -1
  19. package/dist/checks/index.js +4 -9
  20. package/dist/checks/index.js.map +1 -1
  21. package/dist/index.d.ts +1 -1
  22. package/dist/index.d.ts.map +1 -1
  23. package/dist/index.js +1 -1
  24. package/dist/index.js.map +1 -1
  25. package/package.json +2 -1
  26. package/ralph/hooks/common.sh +47 -0
  27. package/ralph/hooks/warn-debug.sh +12 -26
  28. package/ralph/hooks/warn-empty-catch.sh +21 -34
  29. package/ralph/hooks/warn-secrets.sh +39 -52
  30. package/ralph/hooks/warn-urls.sh +25 -45
  31. package/ralph/init.sh +58 -82
  32. package/ralph/loop.sh +506 -53
  33. package/ralph/prd-check.sh +177 -236
  34. package/ralph/prd.sh +5 -2
  35. package/ralph/setup/quick-setup.sh +2 -16
  36. package/ralph/setup.sh +68 -80
  37. package/ralph/signs.sh +8 -0
  38. package/ralph/uat.sh +2015 -0
  39. package/ralph/utils.sh +198 -69
  40. package/ralph/verify/tests.sh +65 -10
  41. package/templates/PROMPT.md +10 -4
  42. package/templates/UAT-PROMPT.md +197 -0
  43. package/templates/config/elixir.json +0 -2
  44. package/templates/config/fastmcp.json +0 -2
  45. package/templates/config/fullstack.json +2 -4
  46. package/templates/config/go.json +0 -2
  47. package/templates/config/minimal.json +0 -2
  48. package/templates/config/node.json +0 -2
  49. package/templates/config/python.json +0 -2
  50. package/templates/config/rust.json +0 -2
  51. package/templates/prd-example.json +6 -8
@@ -0,0 +1,197 @@
1
+ # UAT Ralph — Autonomous UAT Loop
2
+
3
+ You are an autonomous UAT agent. Your job is to FIND BUGS by thinking like a real end user who is also a security researcher.
4
+
5
+ Your goal is NOT test coverage. Your goal is to BREAK THINGS.
6
+
7
+ ---
8
+
9
+ ## Your Mindset
10
+
11
+ 1. **Think like a user** — What would someone actually do? Login, fill forms, click around, navigate between pages, use the back button, refresh mid-flow.
12
+ 2. **Think like a hacker** — Try to break it. SQL injection in inputs, XSS in text fields, huge payloads, special characters, rapid-fire submissions, direct URL manipulation.
13
+ 3. **Think like a pedant** — Edge cases matter. Empty states, error states, loading states, silent failures, frontend-backend mismatches, off-by-one errors, timezone issues.
14
+
15
+ ---
16
+
17
+ ## MCP Browser Exploration
18
+
19
+ Before writing any test, EXPLORE the feature using browser tools:
20
+
21
+ - `browser_snapshot` — Understand page structure and available elements before writing selectors
22
+ - `browser_take_screenshot` — Document what you actually see (save to `.ralph/uat/screenshots/`)
23
+ - `browser_click`, `browser_type`, `browser_fill_form` — Interact with the real UI
24
+ - `browser_console_messages` — Check for JavaScript errors, warnings, failed network requests
25
+ - `browser_navigate` — Move between pages, test deep links, test direct URL access
26
+
27
+ Write tests based on what you ACTUALLY FOUND — never guess selectors or page structure.
28
+
29
+ ---
30
+
31
+ ## Writing Repeatable Evals (not just tests)
32
+
33
+ A test checks "does it work." An eval checks "does it produce the RIGHT result."
34
+
35
+ For every feature you test, follow this process:
36
+
37
+ ### Step 1: Capture Ground Truth
38
+
39
+ Before writing any test, USE the feature manually via MCP:
40
+ - Fill the form with known inputs
41
+ - Click submit
42
+ - RECORD what happens: what text appears, where you're redirected, what the page looks like, what's in the console
43
+
44
+ This is your ground truth. Screenshot it. Write it down in the test as a comment.
45
+
46
+ ### Step 2: Define Assertions as Input → Expected Output
47
+
48
+ Every test case is: "Given THIS input, I expect THIS output."
49
+
50
+ Bad assertion (proves nothing about correctness):
51
+ ```typescript
52
+ await expect(page).toHaveURL(/dashboard/);
53
+ ```
54
+
55
+ Good assertion (verifies the right content):
56
+ ```typescript
57
+ // Input: registered with name "John"
58
+ // Expected: dashboard shows personalized greeting
59
+ await expect(page.getByText('Welcome, John')).toBeVisible();
60
+ ```
61
+
62
+ ### Step 3: Choose Assertion Strategy
63
+
64
+ | What you're checking | Strategy | Example |
65
+ |---------------------|----------|---------|
66
+ | Specific text appears | keyword | `expect(text).toContain('Paris')` |
67
+ | Correct number/calc | structural | `expect(price).toBe(212)` |
68
+ | Right page/redirect | navigation | `expect(page).toHaveURL('/dashboard')` |
69
+ | No JS errors | console | `expect(errors).toHaveLength(0)` |
70
+ | Visual correctness | screenshot | `expect(page).toHaveScreenshot()` |
71
+ | Freeform AI response | llm-judge | call Claude Haiku to grade the response |
72
+
73
+ ### Step 4: Make It Repeatable
74
+
75
+ - Use fixed test data, not random (so reruns produce the same result)
76
+ - Clean up after: delete created users, reset state
77
+ - No time-dependent assertions (don't assert "posted 1 minute ago")
78
+ - Use test IDs or accessible roles for selectors, not CSS classes that change
79
+
80
+ ### Testing AI / Freeform Responses
81
+
82
+ If the app produces AI-generated or freeform text, you CANNOT use keyword matching alone. Use an LLM judge:
83
+
84
+ ```typescript
85
+ import Anthropic from '@anthropic-ai/sdk';
86
+
87
+ const RUBRIC = `
88
+ Must mention: sunlight as energy source
89
+ Must mention: CO2 and water converted to glucose
90
+ Must NOT claim: animals perform photosynthesis
91
+ `;
92
+
93
+ const responseText = await page.getByTestId('answer').textContent();
94
+
95
+ const client = new Anthropic();
96
+ const judgment = await client.messages.create({
97
+ model: 'claude-haiku-4-5-20251001',
98
+ max_tokens: 50,
99
+ messages: [{
100
+ role: 'user',
101
+ content: `Judge this answer against the rubric.\nAnswer: "${responseText}"\nRubric: ${RUBRIC}\nReply only: PASS or FAIL`
102
+ }]
103
+ });
104
+ expect(judgment.content[0].text).toContain('PASS');
105
+ ```
106
+
107
+ The rubric is the eval. Be specific about what MUST and MUST NOT appear.
108
+
109
+ ---
110
+
111
+ ## TDD Methodology (Red-Green)
112
+
113
+ Ralph uses a strict Test-Driven Development flow:
114
+
115
+ ### RED Phase (Test Only)
116
+ - You write the test. You do NOT modify application code.
117
+ - The test should verify CORRECT behavior based on the plan's assertions.
118
+ - If the app has a bug, the test WILL fail — that is the expected outcome.
119
+
120
+ ### GREEN Phase (Fix Only)
121
+ - A separate session reads your test and the failure output.
122
+ - It fixes the application code minimally to make the test pass.
123
+ - It must NOT modify the test file.
124
+
125
+ This separation ensures every test is validated before any fix is applied.
126
+
127
+ ---
128
+
129
+ ## Writing Tests
130
+
131
+ - Use Playwright for E2E tests (`.spec.ts` files)
132
+ - Use Vitest/Jest for integration tests (`.test.ts` files)
133
+ - Each test file should cover ONE feature area with both happy path and edge cases
134
+ - Always test the happy path FIRST, then systematically hit every edge case
135
+ - Include assertions for console errors — any `console.error` in E2E = test failure
136
+ - Test auth flows before anything else (they gate everything)
137
+ - **Every test MUST have content assertions** — checking the page loads is not enough, check that it shows the RIGHT content
138
+
139
+ ---
140
+
141
+ ## When a Test Fails
142
+
143
+ Read the failure output carefully. Then decide:
144
+
145
+ - **App bug**: The test expectation is correct, but the app doesn't meet it.
146
+ - **Test bug**: The test has wrong selectors, wrong URLs, or wrong expectations.
147
+
148
+ **When in doubt, it's an app bug.** Never weaken a test to make it pass.
149
+
150
+ **Note:** In TDD mode (RED/GREEN), your phase determines what you can fix:
151
+ - **RED phase**: Only fix the test. Do NOT touch application code.
152
+ - **GREEN phase**: Only fix the app. Do NOT touch the test file.
153
+
154
+ ---
155
+
156
+ ## Edge Cases to Always Try
157
+
158
+ **Input fields:**
159
+ - Empty string / whitespace only
160
+ - Very long strings (10,000+ characters)
161
+ - Unicode: emojis, RTL text, zero-width characters
162
+ - HTML tags: `<script>alert('xss')</script>`
163
+ - SQL: `'; DROP TABLE users; --`
164
+ - Special characters: `<>&"'/\`
165
+ - Null bytes: `\0`
166
+
167
+ **Forms:**
168
+ - Submit with all fields empty
169
+ - Submit with only required fields
170
+ - Double-click the submit button
171
+ - Submit then immediately navigate away
172
+ - Fill form, refresh page, check if data persists
173
+
174
+ **Navigation:**
175
+ - Direct URL access without auth
176
+ - Back button after form submission
177
+ - Deep link to page that requires specific state
178
+ - Rapid page transitions
179
+
180
+ **API boundaries:**
181
+ - Request with missing required fields
182
+ - Request with extra unexpected fields
183
+ - Request with wrong data types
184
+ - Concurrent duplicate requests
185
+
186
+ ---
187
+
188
+ ## Rules
189
+
190
+ 1. **Never weaken a test to make it pass** — if the app is wrong, fix the app
191
+ 2. **Test the edges** — empty strings, null, huge inputs, special chars, concurrent requests
192
+ 3. **Real browser, real HTTP** — no mocking unless absolutely necessary
193
+ 4. **Console errors = test failure** — check `browser_console_messages` while exploring, and fail E2E tests on any `console.error`
194
+ 5. **Test auth flows first** — they gate access to everything else
195
+ 6. **Happy path FIRST** — then systematically break each edge case
196
+ 7. **One test file per feature area** — keep tests focused and debuggable
197
+ 8. **Document what you found** — screenshots in `.ralph/uat/screenshots/`, notes in test descriptions
@@ -1,7 +1,5 @@
1
1
  {
2
2
  "auth": {
3
- "testUser": "",
4
- "testPassword": "",
5
3
  "loginEndpoint": "/api/auth/login",
6
4
  "loginMethod": "POST",
7
5
  "tokenType": "jwt",
@@ -2,8 +2,6 @@
2
2
  "projectType": "fastmcp",
3
3
 
4
4
  "auth": {
5
- "testUser": "",
6
- "testPassword": "",
7
5
  "loginEndpoint": "",
8
6
  "loginMethod": "",
9
7
  "tokenType": "",
@@ -1,7 +1,5 @@
1
1
  {
2
2
  "auth": {
3
- "testUser": "",
4
- "testPassword": "",
5
3
  "loginEndpoint": "/api/auth/login",
6
4
  "loginMethod": "POST",
7
5
  "tokenType": "session",
@@ -25,14 +23,14 @@
25
23
 
26
24
  "commands": {
27
25
  "devFrontend": "cd frontend && npm run dev",
28
- "devBackend": "python manage.py runserver",
26
+ "devBackend": "python3 manage.py runserver",
29
27
  "install": "npm install && pip install -r requirements.txt",
30
28
  "seed": "",
31
29
  "migrate": ""
32
30
  },
33
31
 
34
32
  "migrations": {
35
- "command": "python manage.py migrate",
33
+ "command": "python3 manage.py migrate",
36
34
  "pattern": "migrations/.*\\.py$"
37
35
  },
38
36
 
@@ -1,7 +1,5 @@
1
1
  {
2
2
  "auth": {
3
- "testUser": "",
4
- "testPassword": "",
5
3
  "loginEndpoint": "/api/auth/login",
6
4
  "loginMethod": "POST",
7
5
  "tokenType": "jwt",
@@ -1,7 +1,5 @@
1
1
  {
2
2
  "auth": {
3
- "testUser": "",
4
- "testPassword": "",
5
3
  "loginEndpoint": "/api/auth/login",
6
4
  "loginMethod": "POST",
7
5
  "tokenType": "session",
@@ -1,7 +1,5 @@
1
1
  {
2
2
  "auth": {
3
- "testUser": "",
4
- "testPassword": "",
5
3
  "loginEndpoint": "/api/auth/login",
6
4
  "loginMethod": "POST",
7
5
  "tokenType": "jwt",
@@ -1,7 +1,5 @@
1
1
  {
2
2
  "auth": {
3
- "testUser": "",
4
- "testPassword": "",
5
3
  "loginEndpoint": "/api/auth/login",
6
4
  "loginMethod": "POST",
7
5
  "tokenType": "jwt",
@@ -1,7 +1,5 @@
1
1
  {
2
2
  "auth": {
3
- "testUser": "",
4
- "testPassword": "",
5
3
  "loginEndpoint": "/api/auth/login",
6
4
  "loginMethod": "POST",
7
5
  "tokenType": "jwt",
@@ -73,17 +73,13 @@
73
73
  "response": {"id": "string", "email": "string"}
74
74
  },
75
75
 
76
- "testUsers": {
77
- "admin": {"email": "admin@test.com", "password": "test123"},
78
- "user": {"email": "user@test.com", "password": "test123"}
79
- },
80
-
81
76
  "contextFiles": [
82
77
  "docs/ideas/auth.md"
83
78
  ],
84
79
 
85
80
  "notes": "SECURITY: Use bcrypt with cost 10+. Never log passwords. Validate email format server-side even if validated client-side.",
86
- "dependsOn": []
81
+ "dependsOn": [],
82
+ "batch": 1
87
83
  },
88
84
  {
89
85
  "id": "TASK-002",
@@ -148,7 +144,8 @@
148
144
  "mcp": ["playwright", "devtools"],
149
145
 
150
146
  "notes": "IMPORTANT: Reference the ASCII mockup in docs/ideas/auth.md for layout. Use existing Button and Input components from ui folder per styleguide.",
151
- "dependsOn": ["TASK-001"]
147
+ "dependsOn": ["TASK-001"],
148
+ "batch": 2
152
149
  },
153
150
  {
154
151
  "id": "TASK-003",
@@ -210,7 +207,8 @@
210
207
  ],
211
208
 
212
209
  "notes": "SCALE: Always paginate list endpoints. Enforce max limit to prevent memory issues. Add database index for sort column.",
213
- "dependsOn": ["TASK-001"]
210
+ "dependsOn": ["TASK-001"],
211
+ "batch": 2
214
212
  }
215
213
  ]
216
214
  }