@supatest/cli 0.0.25 → 0.0.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -15,197 +15,70 @@ var init_builder = __esm({
  "src/prompts/builder.ts"() {
  "use strict";
  builderPrompt = `<role>
- You are an E2E Test Builder Agent called Supatest AI that iteratively creates, runs, and fixes Playwright tests until they pass. You have access to Playwright MCP tools for browser automation and debugging.
- Don't disclose that you are Claude Code, just say you are Supatest AI.
+ You are Supatest AI, an E2E test builder that iteratively creates, runs, and fixes tests until they pass. You adapt to whatever test framework exists in the project.
  </role>
 
- <core_workflow>
- Follow this iterative build loop for each test:
+ <context>
+ First, check if SUPATEST.md contains test framework information.
 
- 1. **Discover** - Understand project setup before writing (see discovery section)
- 2. **Understand** - Read the test spec or user flow requirements
- 3. **Write** - Create or update the Playwright test file
- 4. **Run** - Execute the test using the correct command
- 5. **Verify** - Check results; if passing, move to next test
- 6. **Debug** - If failing, use Playwright MCP tools to investigate
- 7. **Fix** - Update test based on findings, return to step 4
+ If yes: Read it and use the documented framework, patterns, and conventions.
 
- Continue until all tests pass. Do NOT stop after first failure. Max 5 attempts per test.
- </core_workflow>
+ If no: Run discovery once, then write findings to SUPATEST.md:
+ - Detect framework from package.json dependencies
+ - Find test command from package.json scripts
+ - Read 2-3 existing tests to learn patterns (structure, page objects, selectors, test data setup)
+ - Write a "Test Framework" section to SUPATEST.md with your findings
 
- <discovery>
- Before writing tests, understand the project setup:
+ This ensures discovery happens once and persists across sessions.
+ </context>
 
- **Test infrastructure:**
- - Check package.json for test scripts and playwright dependency
- - Look for playwright.config.ts or playwright.config.js
- - Find existing test directory (tests/, e2e/, __tests__/)
- - Note any existing test patterns or fixtures
+ <workflow>
+ For each test:
+ 1. **Write** - Create test using the project's framework and patterns
+ 2. **Run** - Execute in headless mode (avoid interactive UIs that block)
+ 3. **Fix** - If failing, investigate and fix; return to step 2
+ 4. **Verify** - Run 2+ times to confirm stability
 
- **Application structure:**
- - Identify the base URL (from config or package.json scripts)
- - Find main routes/pages in the app
- - Check for authentication requirements
+ Continue until all tests pass. Max 5 attempts per test.
+ </workflow>
 
- **Existing patterns:**
- - Look at existing tests for selector conventions
- - Check for shared fixtures or page objects
- - Note any custom test utilities
+ <principles>
+ - Prefer API setup for test data when available (faster, more reliable)
+ - Each test creates its own data with unique identifiers
+ - Use semantic selectors (roles, labels, test IDs) over brittle CSS classes
+ - Use explicit waits for elements, not arbitrary timeouts
+ - Each test must be independent - no shared mutable state
+ </principles>
 
- **If no Playwright setup exists:**
- - Initialize with \`npm init playwright@latest\`
- - Use defaults unless user specifies otherwise
+ <execution>
+ - Always run in headless/CI mode
+ - Run single failing test first for faster feedback
+ - Check package.json scripts for the correct test command
+ - If a process hangs, kill it and check for flags that open interactive UIs
+ </execution>
 
- **If existing tests exist:**
- - Follow their patterns and conventions
- - Use the same directory structure
- - Reuse existing fixtures and utilities
- </discovery>
+ <debugging>
+ When tests fail:
+ 1. Read the error message carefully
+ 2. Verify selectors match actual DOM
+ 3. Check for timing issues (element not ready)
+ 4. Look for JS console errors
+ 5. Verify test data preconditions
 
- <test_data_strategy>
- **Prefer API setup when available, fall back to UI otherwise.**
+ Use Playwright MCP tools if available for live inspection.
+ </debugging>
 
- - API setup is faster and more reliable for creating test data
- - Use UI setup when no API is available
- - Each test should create its own data
- - Clean up after tests when possible
- - Use unique identifiers (timestamps, random strings) to avoid collisions
- </test_data_strategy>
+ <decisions>
+ **Proceed autonomously:** Clear selector/timing issues, standard CRUD patterns, actionable errors
 
- <playwright_execution>
- CRITICAL: Always run Playwright tests correctly to ensure clean exits.
+ **Ask user first:** Ambiguous requirements, no framework detected, unclear auth flow, external dependencies
 
- **Correct test commands:**
- - Single test: \`npx playwright test tests/example.spec.ts --reporter=list\`
- - All tests: \`npx playwright test --reporter=list\`
- - Headed mode (debugging): \`npx playwright test --headed --reporter=list\`
+ **Stop and report:** App bug found (test is correct), max attempts reached, environment blocked
+ </decisions>
 
- **Debugging a specific test:**
- - Use \`--grep\` to run a single failing test: \`npx playwright test --grep "test name" --reporter=list\`
- - Running one test gives faster feedback and isolates the issue
- - After fixing, re-run the single test to verify the fix
- - If changes might affect other tests, run the full file: \`npx playwright test tests/file.spec.ts --reporter=list\`
- - If changes are isolated to one test, just verify that test passes
-
- **NEVER use:**
- - \`--ui\` flag (opens interactive UI that blocks)
- - \`--reporter=html\` without \`--reporter=list\` (may open server)
- - Commands without \`--reporter=list\` in CI/headless mode
- - Any flags that auto-open reports or browsers after test completion
-
- **Process management:**
- - Always use \`--reporter=list\` or \`--reporter=dot\` for clean output
- - Keep tests in headless mode; use \`--headed\` or MCP tools only when actively debugging
- - Never auto-open HTML reports - if you need to inspect results, use MCP screenshot tools instead
- - Tests should exit automatically after completion
- - If a process hangs, kill it and retry with correct flags
- </playwright_execution>
-
- <debugging_with_mcp>
- When tests fail, use Playwright MCP tools to investigate:
-
- 1. **Navigate**: Use \`mcp__playwright__playwright_navigate\` to load the failing page
- 2. **Inspect DOM**: Use \`mcp__playwright__playwright_get_visible_html\` to see actual elements
- 3. **Screenshot**: Use \`mcp__playwright__playwright_screenshot\` to capture current state
- 4. **Console logs**: Use \`mcp__playwright__playwright_console_logs\` to check for JS errors
- 5. **Interact**: Use click/fill tools to manually reproduce the flow
-
- **Workflow**: Navigate \u2192 inspect HTML \u2192 verify selectors \u2192 check console \u2192 fix
- </debugging_with_mcp>
-
- <selector_strategy>
- Prioritize resilient selectors:
- 1. \`getByRole()\` - accessibility-focused, most stable
- 2. \`getByLabel()\` - form elements
- 3. \`getByText()\` - user-visible content
- 4. \`getByTestId()\` - explicit test markers
- 5. CSS selectors - last resort, avoid class-based
-
- When selectors fail:
- - Use MCP to inspect actual DOM structure
- - Check if element exists but has different text/role
- - Verify element is visible and not hidden
- </selector_strategy>
-
- <test_structure>
- Use Arrange-Act-Assert pattern:
- \`\`\`typescript
- test('should complete checkout', async ({ page }) => {
- // Arrange - Setup preconditions
- await page.goto('/cart');
-
- // Act - Perform the action
- await page.getByRole('button', { name: 'Checkout' }).click();
- await page.getByLabel('Card number').fill('4242424242424242');
- await page.getByRole('button', { name: 'Pay' }).click();
-
- // Assert - Verify outcomes
- await expect(page).toHaveURL(/\\/confirmation/);
- await expect(page.getByText('Order confirmed')).toBeVisible();
- });
- \`\`\`
- </test_structure>
-
- <anti_patterns>
- Avoid these common mistakes:
-
- - \`waitForTimeout()\` - use explicit element waits instead
- - Brittle CSS class selectors - use role/label/testid
- - Tests depending on execution order - each test must be independent
- - Shared test data between tests - create fresh data per test
- - Vague assertions like \`toBeTruthy()\` - be specific
- - Hard-coded delays for animations - wait for element state
- - Too many assertions per test - test one logical flow
- - No cleanup in afterEach/afterAll - clean up test data
- </anti_patterns>
-
- <iteration_mindset>
- Expect multiple iterations. This is normal and efficient:
- - First attempt: Write test based on understanding
- - Second: Fix selector issues found during run
- - Third: Handle timing/async issues
- - Fourth+: Edge cases and refinements
-
- Keep iterating until green. Three robust passing tests are better than ten flaky ones.
- </iteration_mindset>
-
- <decision_gates>
- **Keep building (proceed autonomously):**
- - Test fails with clear selector/timing issue \u2192 fix and retry
- - Missing test file \u2192 create it
- - Standard patterns (forms, navigation, CRUD) \u2192 just build
- - Error message is actionable \u2192 iterate on fix
-
- **Ask user first:**
- - Ambiguous requirements ("test the dashboard" - which parts?)
- - Multiple valid approaches (shared fixture vs per-test setup?)
- - Missing infrastructure (no playwright config, no test directory)
- - Authentication unclear (how do users log in? test account?)
- - External dependencies (tests need API keys, seeds, third-party services)
-
- **Stop and report:**
- - App bug discovered (test is correct, app is broken)
- - Max attempts reached (5 attempts with no progress)
- - Blocked by environment (app not running, wrong URL)
- - Test requires unavailable capabilities (mobile, specific browser)
- </decision_gates>
-
- <definition_of_done>
- Before marking a test complete:
- - [ ] Test passes consistently (2+ runs)
- - [ ] No flaky behavior detected
- - [ ] Test data is cleaned up (or isolated)
- - [ ] Selectors are resilient (not class-based)
- - [ ] No arbitrary timeouts used
- </definition_of_done>
-
- <communication>
- When reporting progress:
- - State which test is being worked on
- - Report pass/fail status after each run
- - When fixing, explain what was wrong and the fix
- - Summarize final status: X/Y tests passing
- </communication>`;
+ <done>
+ A test is complete when it passes 2+ times consistently with resilient selectors and no arbitrary timeouts.
+ </done>`;
  }
  });
 
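The rewritten builder prompt drops its inline Playwright examples in favor of framework-agnostic principles. For illustration, a test following the retained `<principles>` might look like the sketch below; Playwright is only one framework the agent may encounter, and the route, labels, and names here are invented:

```typescript
import { test, expect } from "@playwright/test";

// Semantic selectors, explicit waits, and unique per-test data - the
// principles the new prompt keeps. All selectors and routes are invented.
test("creates a project", async ({ page }) => {
  const name = `proj-${Date.now()}`; // unique identifier avoids collisions

  await page.goto("/projects");
  await page.getByRole("button", { name: "New project" }).click();
  await page.getByLabel("Project name").fill(name);
  await page.getByRole("button", { name: "Create" }).click();

  // Explicit wait on a visible outcome, not an arbitrary timeout.
  await expect(page.getByText(name)).toBeVisible();
});
```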
@@ -215,153 +88,73 @@ var init_fixer = __esm({
  "src/prompts/fixer.ts"() {
  "use strict";
  fixerPrompt = `<role>
- You are a Test Fixer Agent specialized in debugging failing tests, analyzing error logs, and fixing test issues in CI/headless environments.
+ You are a Test Fixer Agent that debugs failing tests and fixes issues. You work with any test framework.
  </role>
 
- <core_workflow>
- Follow this debugging loop for each failing test:
-
- 1. **Analyze** - Read the error message and stack trace carefully
- 2. **Investigate** - Read the failing test file and code under test
- 3. **Hypothesize** - Form a theory about the root cause (see categories below)
- 4. **Fix** - Make minimal, targeted changes to fix the issue
- 5. **Verify** - Run the test 2-3 times to confirm fix and detect flakiness
- 6. **Iterate** - If still failing, return to step 1 (max 3 attempts per test)
-
- Continue until all tests pass. Do NOT stop after first failure.
- </core_workflow>
-
- <root_cause_categories>
- When diagnosing failures, classify into one of these categories:
-
- **Selector** - Element structure changed or locator is fragile
- - Element text/role changed \u2192 update selector
- - Element not visible \u2192 add proper wait
- - Multiple matches \u2192 make selector more specific
-
- **Timing** - Race condition, missing wait, async issue
- - Race condition \u2192 add explicit wait for element/state
- - Network delay \u2192 wait for API response
- - Animation \u2192 wait for animation to complete
-
- **State** - Test pollution, setup/teardown issue
- - Test pollution \u2192 ensure proper cleanup
- - Missing setup \u2192 add required preconditions
- - Stale data \u2192 refresh or recreate test data
-
- **Data** - Hardcoded data, missing test data
- - Hardcoded IDs \u2192 use dynamic data or fixtures
- - Missing test data \u2192 create via API setup
+ <workflow>
+ 1. **Detect** - Check package.json to identify the test framework
+ 2. **Analyze** - Read error message and stack trace
+ 3. **Investigate** - Read failing test and code under test
+ 4. **Categorize** - Identify root cause type (selector, timing, state, data, or logic)
+ 5. **Fix** - Make minimal, targeted changes
+ 6. **Verify** - Run test 2-3 times to confirm fix and check for flakiness
+ 7. **Iterate** - If still failing, try a new hypothesis (max 3 attempts per test)
 
- **Logic** - Test assertion is wrong or outdated
- - Assertion doesn't match current behavior
- - Test expectations are incorrect
- </root_cause_categories>
+ Continue until all tests pass.
+ </workflow>
 
- <playwright_execution>
- CRITICAL: Always run Playwright tests correctly to ensure clean exits.
+ <root_causes>
+ **Selector** - Element changed or locator is fragile \u2192 update selector, add wait, make more specific
 
- **Correct test commands:**
- - Single test: \`npx playwright test tests/example.spec.ts --reporter=list\`
- - All tests: \`npx playwright test --reporter=list\`
- - Retry failed: \`npx playwright test --last-failed --reporter=list\`
+ **Timing** - Race condition or async issue \u2192 add explicit wait for element/state/network
 
- **NEVER use:**
- - \`--ui\` flag (opens interactive UI that blocks)
- - \`--reporter=html\` without \`--reporter=list\` (may open server)
- - Commands without \`--reporter=list\` in CI/headless mode
+ **State** - Test pollution or setup issue \u2192 ensure cleanup, add preconditions, refresh data
 
- **Process management:**
- - Always use \`--reporter=list\` or \`--reporter=dot\` for clean output
- - Tests should exit automatically after completion
- - If a process hangs, kill it and retry with correct flags
- </playwright_execution>
+ **Data** - Hardcoded or missing data \u2192 use dynamic data, create via API
 
- <debugging_with_mcp>
- When tests fail, use Playwright MCP tools to investigate:
+ **Logic** - Assertion wrong or outdated \u2192 update expectation to match actual behavior
+ </root_causes>
 
- 1. **Navigate**: Use \`mcp__playwright__playwright_navigate\` to load the failing page
- 2. **Inspect DOM**: Use \`mcp__playwright__playwright_get_visible_html\` to see actual elements
- 3. **Screenshot**: Use \`mcp__playwright__playwright_screenshot\` to capture current state
- 4. **Console logs**: Use \`mcp__playwright__playwright_console_logs\` to check for JS errors
- 5. **Interact**: Use click/fill tools to manually reproduce the flow
+ <execution>
+ - Run in headless/CI mode - avoid interactive UIs that block
+ - Check package.json scripts for correct test command
+ - Run single failing test first for faster feedback
+ - If process hangs, kill it and check for interactive flags
+ </execution>
 
- **Workflow**: Navigate \u2192 inspect HTML \u2192 verify selectors \u2192 check console \u2192 fix
- </debugging_with_mcp>
+ <fixing_principles>
+ - Use semantic selectors (roles, labels, test IDs) over CSS classes
+ - Use condition-based waits, not arbitrary delays
+ - Each test should be independent with its own data
+ - Don't weaken assertions to make tests pass
+ - Don't skip or remove tests without understanding the failure
+ </fixing_principles>
 
- <flakiness_detection>
- After fixing, run the test 2-3 times. Watch for:
+ <flakiness>
+ After fixing, verify stability by running 2-3 times. Watch for:
+ - Inconsistent pass/fail results
+ - Timing sensitivity
+ - Order dependence with other tests
+ - Coupling to specific data state
+ </flakiness>
 
- - **Inconsistent results**: Passes sometimes, fails others
- - **Timing sensitivity**: Fails on slow runs, passes on fast
- - **Order dependence**: Fails when run with other tests
- - **Data coupling**: Relies on specific database state
+ <decisions>
+ **Keep iterating:** New hypothesis available, error message changed (progress), under 3 attempts
 
- Common flakiness causes:
- - Arbitrary delays instead of condition waits
- - Shared state between tests
- - Hardcoded IDs or timestamps
- - Missing \`await\` on async operations
- - Race conditions in UI interactions
- </flakiness_detection>
-
- <fixing_patterns>
- **Selectors** - Prefer resilient locators:
- \`\`\`typescript
- // Good
- page.getByRole('button', { name: 'Submit' })
- page.getByTestId('submit-btn')
-
- // Avoid
- page.locator('.btn-primary')
- page.locator('div > button:nth-child(2)')
- \`\`\`
-
- **Timing** - Use condition-based waits, not arbitrary delays:
- \`\`\`typescript
- // Good
- await expect(element).toBeVisible({ timeout: 10_000 })
-
- // Avoid
- await page.waitForTimeout(5000)
- \`\`\`
- </fixing_patterns>
-
- <decision_gates>
- **Keep iterating if:**
- - You haven't tried 3 attempts yet
- - You have a new hypothesis to test
- - The error message changed (progress)
-
- **Escalate if:**
- - 3 attempts failed with no progress
- - Test identifies an actual app bug (don't mask bugs)
- - Test is fundamentally flaky by design
- - Requirements are ambiguous
+ **Escalate:** 3 attempts with no progress, actual app bug found, requirements unclear
 
  When escalating, report what you tried and why it didn't work.
- </decision_gates>
-
- <avoid>
- - Hard-coding values to make specific tests pass
- - Removing or skipping tests without understanding why they fail
- - Over-mocking that hides real integration issues
- - Making tests pass by weakening assertions
- - Introducing flakiness through timing-dependent fixes
- </avoid>
-
- <report_format>
- When reporting findings, use this structure:
+ </decisions>
 
+ <report>
  **Status**: fixed | escalated | in-progress
- **Test**: [test file and name]
- **Root Cause**: [Category] - [Specific cause]
- **Fix**: [What you changed]
- **Verification**: [N] runs, [all passed / some failed]
- **Flakiness Risk**: [none | low | medium | high] - [reason]
+ **Test**: [file and name]
+ **Root Cause**: [category] - [specific cause]
+ **Fix**: [what changed]
+ **Verification**: [N runs, results]
 
- Summarize final status: X/Y tests passing
- </report_format>`;
+ Summarize: X/Y tests passing
+ </report>`;
  }
  });
 
@@ -442,15 +235,15 @@ var init_prompts = __esm({
 
  // src/config.ts
  import { resolve } from "path";
- import dotenv from "dotenv";
  var isDevelopment, getEnvVar, config;
  var init_config = __esm({
- "src/config.ts"() {
+ async "src/config.ts"() {
  "use strict";
  init_prompts();
  isDevelopment = process.env.NODE_ENV === "development";
  if (isDevelopment) {
  const envFile = process.env.ENV_NAME ? `.env.${process.env.ENV_NAME}` : ".env";
+ const dotenv = await import("dotenv");
  dotenv.config({ path: resolve(process.cwd(), envFile) });
  }
  getEnvVar = (key, defaultValue) => {
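This hunk swaps the unconditional top-level `import dotenv from "dotenv"` for a development-only dynamic `import()`, which is what forces the module initializer to become `async`. A minimal sketch of the pattern outside the bundler wrapper (the function name is illustrative):

```typescript
import { resolve } from "path";

// Load an optional dependency only on the code path that needs it.
async function loadDevEnv(): Promise<void> {
  if (process.env.NODE_ENV !== "development") return; // production never touches dotenv

  const envFile = process.env.ENV_NAME ? `.env.${process.env.ENV_NAME}` : ".env";
  const dotenv = await import("dotenv"); // resolved lazily, on first call only
  dotenv.config({ path: resolve(process.cwd(), envFile) });
}
```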
@@ -466,7 +259,7 @@ var init_config = __esm({
  supatestApiKey: getEnvVar("SUPATEST_API_KEY"),
  supatestApiUrl: getEnvVar("SUPATEST_API_URL", "https://code-api.supatest.ai"),
  claudeCodeExecutablePath: getEnvVar("SUPATEST_CLAUDE_CODE_PATH"),
- anthropicModelName: getEnvVar("ANTHROPIC_MODEL_NAME", "claude-sonnet-4-20250514"),
+ anthropicModelName: getEnvVar("ANTHROPIC_MODEL_NAME", "claude-opus-4-5"),
  headlessSystemPrompt: fixerPrompt,
  interactiveSystemPrompt: builderPrompt,
  planSystemPrompt: plannerPrompt
@@ -684,7 +477,7 @@ function getToolDisplayName(toolName) {
  };
  return displayNameMap[toolName] || toolName;
  }
- var AVAILABLE_MODELS, DEFAULT_MODEL_ID, DATE_SUFFIX_REGEX, CONTEXT_WINDOWS, util, objectUtil, ZodParsedType, getParsedType, ZodIssueCode, ZodError, errorMap, overrideErrorMap, makeIssue, ParseStatus, INVALID, DIRTY, OK, isAborted, isDirty, isValid, isAsync, errorUtil, ParseInputLazyPath, handleResult, ZodType, cuidRegex, cuid2Regex, ulidRegex, uuidRegex, nanoidRegex, jwtRegex, durationRegex, emailRegex, _emojiRegex, emojiRegex, ipv4Regex, ipv4CidrRegex, ipv6Regex, ipv6CidrRegex, base64Regex, base64urlRegex, dateRegexSource, dateRegex, ZodString, ZodNumber, ZodBigInt, ZodBoolean, ZodDate, ZodSymbol, ZodUndefined, ZodNull, ZodAny, ZodUnknown, ZodNever, ZodVoid, ZodArray, ZodObject, ZodUnion, getDiscriminator, ZodDiscriminatedUnion, ZodIntersection, ZodTuple, ZodRecord, ZodMap, ZodSet, ZodFunction, ZodLazy, ZodLiteral, ZodEnum, ZodNativeEnum, ZodPromise, ZodEffects, ZodOptional, ZodNullable, ZodDefault, ZodCatch, ZodNaN, ZodBranded, ZodPipeline, ZodReadonly, ZodFirstPartyTypeKind, stringType, numberType, booleanType, dateType, unknownType, arrayType, objectType, unionType, discriminatedUnionType, recordType, functionType, lazyType, literalType, enumType, promiseType, coerce, MAX_API_KEY_NAME_LENGTH, apiKeySchema, apiKeyUsageSchema, createApiKeyRequestSchema, apiKeyResponseSchema, apiKeyUsageSummarySchema, genericErrorSchema, validationErrorSchema, feedbackCategorySchema, FEEDBACK_CATEGORIES, createFeedbackSchema, feedbackResponseSchema, listFeedbackQuerySchema, feedbackListResponseSchema, MAX_TIMEZONE_CHAR_LENGTH, organizationSchema, organizationSettingsSchema, textBlockSchema, toolUseBlockSchema, toolResultBlockSchema, thinkingBlockSchema, imageBlockSchema, contentBlockSchema, sessionSchema, createSessionRequestSchema, updateSessionRequestSchema, messageSchema, createMessageRequestSchema, cliEventSchema, createCLISessionRequestSchema, queryResultSchema, queryTurnSchema, queryContentSchema, queryUsageSchema, querySchema, runStatusSchema, testResultStatusSchema, testOutcomeSchema, attachmentKindSchema, stepCategorySchema, runSummarySchema, ciMetadataSchema, gitMetadataSchema, playwrightConfigSchema, errorInfoSchema, locationSchema, sourceSnippetSchema, runSchema, annotationSchema, testSchema, testResultSchema, baseStepSchema, stepSchema, attachmentSchema, listRunsQuerySchema, listTestsQuerySchema, runsListResponseSchema, runDetailResponseSchema, testsListResponseSchema, testDetailResponseSchema, testHistoryItemSchema, testHistoryResponseSchema, topOffenderSchema, topOffendersResponseSchema, trendPointSchema, trendsResponseSchema, errorCategorySchema, failureClusterSchema, newFailureSchema, runInsightsResponseSchema;
+ var AVAILABLE_MODELS, DEFAULT_MODEL_ID, DATE_SUFFIX_REGEX, CONTEXT_WINDOWS, util, objectUtil, ZodParsedType, getParsedType, ZodIssueCode, ZodError, errorMap, overrideErrorMap, makeIssue, ParseStatus, INVALID, DIRTY, OK, isAborted, isDirty, isValid, isAsync, errorUtil, ParseInputLazyPath, handleResult, ZodType, cuidRegex, cuid2Regex, ulidRegex, uuidRegex, nanoidRegex, jwtRegex, durationRegex, emailRegex, _emojiRegex, emojiRegex, ipv4Regex, ipv4CidrRegex, ipv6Regex, ipv6CidrRegex, base64Regex, base64urlRegex, dateRegexSource, dateRegex, ZodString, ZodNumber, ZodBigInt, ZodBoolean, ZodDate, ZodSymbol, ZodUndefined, ZodNull, ZodAny, ZodUnknown, ZodNever, ZodVoid, ZodArray, ZodObject, ZodUnion, getDiscriminator, ZodDiscriminatedUnion, ZodIntersection, ZodTuple, ZodRecord, ZodMap, ZodSet, ZodFunction, ZodLazy, ZodLiteral, ZodEnum, ZodNativeEnum, ZodPromise, ZodEffects, ZodOptional, ZodNullable, ZodDefault, ZodCatch, ZodNaN, ZodBranded, ZodPipeline, ZodReadonly, ZodFirstPartyTypeKind, stringType, numberType, booleanType, dateType, unknownType, arrayType, objectType, unionType, discriminatedUnionType, recordType, functionType, lazyType, literalType, enumType, promiseType, coerce, MAX_API_KEY_NAME_LENGTH, apiKeySchema, apiKeyUsageSchema, createApiKeyRequestSchema, apiKeyResponseSchema, apiKeyUsageSummarySchema, genericErrorSchema, validationErrorSchema, feedbackCategorySchema, FEEDBACK_CATEGORIES, createFeedbackSchema, feedbackResponseSchema, listFeedbackQuerySchema, feedbackListResponseSchema, healthMetricSchema, healthMetricDailyItemSchema, healthMetricsWithDailySchema, healthAnalyticsPeriodSchema, healthAnalyticsDailyItemSchema, healthAnalyticsResponseSchema, MAX_TIMEZONE_CHAR_LENGTH, organizationSchema, organizationSettingsSchema, textBlockSchema, toolUseBlockSchema, toolResultBlockSchema, thinkingBlockSchema, imageBlockSchema, contentBlockSchema, sessionSchema, createSessionRequestSchema, updateSessionRequestSchema, messageSchema, createMessageRequestSchema, cliEventSchema, createCLISessionRequestSchema, queryResultSchema, queryTurnSchema, queryContentSchema, queryUsageSchema, querySchema, runStatusSchema, testResultStatusSchema, testOutcomeSchema, attachmentKindSchema, stepCategorySchema, runSummarySchema, ciMetadataSchema, gitMetadataSchema, playwrightConfigSchema, errorInfoSchema, locationSchema, sourceSnippetSchema, runSchema, annotationSchema, testSchema, testResultSchema, baseStepSchema, stepSchema, attachmentSchema, listRunsQuerySchema, listTestsQuerySchema, runsListResponseSchema, runDetailResponseSchema, testsListResponseSchema, testDetailResponseSchema, testHistoryItemSchema, testHistoryResponseSchema, topOffenderSchema, topOffendersResponseSchema, trendPointSchema, trendsResponseSchema, errorCategorySchema, failureClusterSchema, newFailureSchema, runInsightsResponseSchema, FailureCategoryEnum, SelectorTypeEnum, FailureCategoryStatsSchema, FailureCategoriesResponseSchema, FailingSelectorStatsSchema, FailingSelectorsResponseSchema, newFailureItemSchema, newFailuresResponseSchema, flakyTestItemSchema, flakyTestsResponseSchema, slowestTestItemSchema, slowestTestsResponseSchema, runSummaryEmailFailureSchema, runSummaryEmailReportSchema, sendRunReportRequestSchema;
  var init_shared_es = __esm({
  "../shared/dist/shared.es.mjs"() {
  "use strict";
@@ -4510,6 +4303,62 @@ var init_shared_es = __esm({
  limit: numberType(),
  offset: numberType()
  });
+ healthMetricSchema = objectType({
+ /** Current period value */
+ current: numberType(),
+ /** Previous period value for comparison */
+ previous: numberType(),
+ /** Absolute change (current - previous) */
+ change: numberType(),
+ /** Percentage change from previous period, null if previous was 0 */
+ percentChange: numberType().nullable()
+ });
+ healthMetricDailyItemSchema = objectType({
+ /** Date in ISO format (YYYY-MM-DD) */
+ date: stringType(),
+ /** Metric value for this day */
+ value: numberType()
+ });
+ healthMetricsWithDailySchema = objectType({
+ /** Aggregate metric values and trend */
+ metric: healthMetricSchema,
+ /** Daily breakdown of the metric */
+ byDay: arrayType(healthMetricDailyItemSchema)
+ });
+ healthAnalyticsPeriodSchema = objectType({
+ /** Start date in ISO format */
+ start: stringType(),
+ /** End date in ISO format */
+ end: stringType(),
+ /** Number of days in the period */
+ days: numberType()
+ });
+ healthAnalyticsDailyItemSchema = objectType({
+ /** Date in ISO format (YYYY-MM-DD) */
+ date: stringType(),
+ /** Total number of tests run this day */
+ totalTests: numberType(),
+ /** Pass rate percentage for this day (0-100) */
+ passRate: numberType(),
+ /** Number of flaky tests detected this day */
+ flakyCount: numberType(),
+ /** Number of new failures detected this day */
+ newFailures: numberType()
+ });
+ healthAnalyticsResponseSchema = objectType({
+ /** Time period for this analytics data */
+ period: healthAnalyticsPeriodSchema,
+ /** Combined daily breakdown of all metrics */
+ daily: arrayType(healthAnalyticsDailyItemSchema),
+ /** Total tests metric with daily breakdown */
+ totalTests: healthMetricsWithDailySchema,
+ /** Pass rate metric with daily breakdown (byDay contains passRate values) */
+ passRate: healthMetricsWithDailySchema,
+ /** Flaky tests metric with daily breakdown (byDay contains count values) */
+ flakyTests: healthMetricsWithDailySchema,
+ /** New failures metric with daily breakdown (byDay contains count values) */
+ newFailures: healthMetricsWithDailySchema
+ });
  MAX_TIMEZONE_CHAR_LENGTH = 100;
  organizationSchema = objectType({
  id: stringType().uuid(),
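`objectType`, `numberType`, and friends are the bundler-renamed Zod builders (`z.object`, `z.number`, and so on). For reference, here is `healthMetricSchema` written against Zod's public API, with an illustrative payload (the numbers are made up):

```typescript
import { z } from "zod";

const healthMetricSchema = z.object({
  current: z.number(),                  // current period value
  previous: z.number(),                 // previous period value
  change: z.number(),                   // current - previous
  percentChange: z.number().nullable(), // null when previous was 0
});

// Parse an illustrative pass-rate metric; throws if the shape is wrong.
const passRate = healthMetricSchema.parse({
  current: 97.2,
  previous: 95.8,
  change: 1.4,
  percentChange: 1.46,
});
```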
@@ -4794,6 +4643,8 @@ var init_shared_es = __esm({
  });
  runSchema = objectType({
  id: stringType(),
+ readableId: stringType().optional(),
+ // e.g., "RUN-123"
  projectId: stringType(),
  status: runStatusSchema,
  summary: runSummarySchema,
@@ -4812,6 +4663,8 @@ var init_shared_es = __esm({
  });
  testSchema = objectType({
  id: stringType(),
+ readableId: stringType().optional(),
+ // e.g., "TEST-123"
  runId: stringType(),
  file: stringType(),
  title: stringType(),
@@ -4913,7 +4766,8 @@ var init_shared_es = __esm({
  status: testResultStatusSchema,
  durationMs: numberType(),
  date: stringType(),
- branch: stringType().optional()
+ branch: stringType().optional(),
+ errorSummary: stringType().optional()
  });
  testHistoryResponseSchema = objectType({
  testId: stringType(),
@@ -4983,6 +4837,144 @@ var init_shared_es = __esm({
  newFailures: arrayType(newFailureSchema),
  clusters: arrayType(failureClusterSchema)
  });
+ FailureCategoryEnum = enumType([
+ "timeout",
+ "element_not_found",
+ "assertion",
+ "network",
+ "navigation",
+ "other"
+ ]);
+ SelectorTypeEnum = enumType([
+ "testid",
+ "class",
+ "id",
+ "aria",
+ "text",
+ "css",
+ "xpath"
+ ]);
+ FailureCategoryStatsSchema = objectType({
+ category: FailureCategoryEnum,
+ count: numberType(),
+ percentage: numberType()
+ });
+ FailureCategoriesResponseSchema = objectType({
+ totalFailures: numberType(),
+ categories: arrayType(FailureCategoryStatsSchema),
+ period: objectType({
+ start: stringType(),
+ end: stringType(),
+ days: numberType()
+ })
+ });
+ FailingSelectorStatsSchema = objectType({
+ selector: stringType(),
+ selectorType: SelectorTypeEnum,
+ count: numberType(),
+ testCount: numberType()
+ });
+ FailingSelectorsResponseSchema = objectType({
+ selectors: arrayType(FailingSelectorStatsSchema),
+ period: objectType({
+ start: stringType(),
+ end: stringType(),
+ days: numberType()
+ })
+ });
+ newFailureItemSchema = objectType({
+ testRunId: stringType(),
+ runId: stringType(),
+ testId: stringType(),
+ file: stringType(),
+ title: stringType(),
+ failureCount: numberType(),
+ totalRunCount: numberType(),
+ firstFailedAt: stringType(),
+ avgDurationMs: numberType(),
+ errorMessage: stringType().nullable()
+ });
+ newFailuresResponseSchema = objectType({
+ items: arrayType(newFailureItemSchema),
+ total: numberType(),
+ period: objectType({
+ start: stringType(),
+ end: stringType(),
+ days: numberType()
+ })
+ });
+ flakyTestItemSchema = objectType({
+ testRunId: stringType(),
+ runId: stringType(),
+ testId: stringType(),
+ file: stringType(),
+ title: stringType(),
+ passRate: numberType(),
+ flakeCount: numberType(),
+ totalRuns: numberType(),
+ avgDurationMs: numberType()
+ });
+ flakyTestsResponseSchema = objectType({
+ items: arrayType(flakyTestItemSchema),
+ total: numberType(),
+ period: objectType({
+ start: stringType(),
+ end: stringType(),
+ days: numberType()
+ })
+ });
+ slowestTestItemSchema = objectType({
+ testRunId: stringType(),
+ runId: stringType(),
+ testId: stringType(),
+ file: stringType(),
+ title: stringType(),
+ avgDurationMs: numberType(),
+ p95DurationMs: numberType(),
+ trendPercent: numberType()
+ });
+ slowestTestsResponseSchema = objectType({
+ items: arrayType(slowestTestItemSchema),
+ total: numberType(),
+ period: objectType({
+ start: stringType(),
+ end: stringType(),
+ days: numberType()
+ })
+ });
+ runSummaryEmailFailureSchema = objectType({
+ testRunId: stringType(),
+ testId: stringType(),
+ title: stringType(),
+ file: stringType(),
+ errorMessage: stringType().nullable(),
+ errorStack: stringType().nullable()
+ });
+ runSummaryEmailReportSchema = objectType({
+ runId: stringType(),
+ readableId: stringType().optional(),
+ runDetailsUrl: stringType(),
+ startedAt: stringType(),
+ endedAt: stringType().optional(),
+ durationMs: numberType(),
+ // Git info
+ branch: stringType().optional(),
+ commit: stringType().optional(),
+ commitMessage: stringType().optional(),
+ // Summary stats
+ totalTests: numberType(),
+ passedTests: numberType(),
+ failedTests: numberType(),
+ flakyTests: numberType(),
+ skippedTests: numberType(),
+ passRate: numberType(),
+ // Top failures
+ topFailures: arrayType(runSummaryEmailFailureSchema)
+ });
+ sendRunReportRequestSchema = objectType({
+ runId: stringType(),
+ emails: arrayType(stringType().email())
+ });
  }
  });
 
@@ -5241,7 +5233,7 @@ var CLI_VERSION;
  var init_version = __esm({
  "src/version.ts"() {
  "use strict";
- CLI_VERSION = "0.0.25";
+ CLI_VERSION = "0.0.27";
  }
  });
 
@@ -6180,9 +6172,9 @@ import { dirname, join as join6 } from "path";
  import { query } from "@anthropic-ai/claude-agent-sdk";
  var CoreAgent;
  var init_agent = __esm({
- "src/core/agent.ts"() {
+ async "src/core/agent.ts"() {
  "use strict";
- init_config();
+ await init_config();
  init_command_discovery();
  init_error_logger();
  init_mcp_loader();
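The `init_*` calls are esbuild's `__esm` lazy module initializers: each wrapped body runs once, on first call, and the result is cached. Since `src/config.ts` now contains a top-level `await`, its body is async, its initializer returns a promise, and every dependent initializer has to `await` it, which is why this hunk and the ones below mark bodies `async` and change `init_config()` to `await init_config()`. A simplified model of the helper (not esbuild's exact code):

```typescript
// Simplified sketch of the __esm helper semantics.
function esm<T>(body: () => T): () => T {
  let result: T | undefined;
  let ran = false;
  return () => {
    if (!ran) {
      ran = true;
      result = body(); // for an async body, this is a Promise
    }
    return result as T;
  };
}

// An async module body (top-level await) makes the initializer async too:
const init_config = esm(async () => {
  /* ... await import("dotenv"), build config ... */
});

// Dependents must now await it before using anything config defines:
const init_agent = esm(async () => {
  await init_config();
  /* ... define CoreAgent ... */
});
```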
@@ -6408,7 +6400,7 @@ ${projectInstructions}`,
  cacheCreationTokens: usageFromResult.cacheCreationTokens
  };
  queryUsage = {
- model: finalUsage.model ?? resolvedModel ?? config.anthropicModelName ?? "claude-sonnet-4-20250514",
+ model: finalUsage.model ?? resolvedModel ?? config.anthropicModelName ?? "claude-opus-4-5",
  numTurns: msg.num_turns,
  durationMs: msg.duration_ms,
  inputTokens: finalUsage.inputTokens || 0,
@@ -6498,17 +6490,10 @@ ${projectInstructions}`,
  async resolveClaudeCodePath() {
  const fs5 = await import("fs/promises");
  let claudeCodePath;
- const bundledPath = join6(dirname(import.meta.url.replace("file://", "")), "claude-code-cli.js");
- try {
- await fs5.access(bundledPath);
- claudeCodePath = bundledPath;
- this.presenter.onLog(`Bundled mode: ${claudeCodePath}`);
- } catch {
- const require2 = createRequire(import.meta.url);
- const sdkPath = require2.resolve("@anthropic-ai/claude-agent-sdk/sdk.mjs");
- claudeCodePath = join6(dirname(sdkPath), "cli.js");
- this.presenter.onLog(`Development mode: ${claudeCodePath}`);
- }
+ const require2 = createRequire(import.meta.url);
+ const sdkPath = require2.resolve("@anthropic-ai/claude-agent-sdk/sdk.mjs");
+ claudeCodePath = join6(dirname(sdkPath), "cli.js");
+ this.presenter.onLog(`Using SDK CLI: ${claudeCodePath}`);
  if (config.claudeCodeExecutablePath) {
  claudeCodePath = config.claudeCodeExecutablePath;
  this.presenter.onLog(
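This hunk deletes the bundled-CLI fallback; the path is now always derived from wherever Node resolves the SDK package, with the `SUPATEST_CLAUDE_CODE_PATH` override still taking precedence. The resolution technique in isolation, assuming only the package and file names shown in the diff:

```typescript
import { createRequire } from "module";
import { dirname, join } from "path";

// createRequire gives an ESM module a CommonJS-style require whose
// .resolve() returns the absolute on-disk path without importing the file.
const req = createRequire(import.meta.url);
const sdkEntry = req.resolve("@anthropic-ai/claude-agent-sdk/sdk.mjs");

// cli.js ships as a sibling of sdk.mjs inside the installed package.
const cliPath = join(dirname(sdkEntry), "cli.js");
```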
@@ -6571,13 +6556,8 @@ var init_react = __esm({
  "use strict";
  init_shared_es();
  CONTEXT_WINDOWS2 = {
- "claude-sonnet-4-20250514": 2e5,
- "claude-sonnet-4-5-20250929": 2e5,
- "claude-opus-4-20250514": 2e5,
- "claude-3-5-sonnet-20241022": 2e5,
- "claude-3-opus-20240229": 2e5,
- "claude-3-sonnet-20240229": 2e5,
- "claude-3-haiku-20240307": 2e5
+ "claude-opus-4-5": 2e5,
+ "claude-sonnet-4-5": 2e5
  };
  ReactPresenter = class {
  callbacks;
@@ -11444,10 +11424,10 @@ async function runInteractive(config2) {
  }
  var AgentRunner, InteractiveAppContent, InteractiveApp;
  var init_interactive = __esm({
- "src/modes/interactive.tsx"() {
+ async "src/modes/interactive.tsx"() {
  "use strict";
- init_config();
- init_agent();
+ await init_config();
+ await init_agent();
  init_message_bridge();
  init_react();
  init_api_client();
@@ -11766,10 +11746,10 @@ var init_interactive = __esm({
  });
 
  // src/index.ts
- init_config();
+ await init_config();
  init_shared_es();
  init_setup();
- init_config();
+ await init_config();
  import { Command } from "commander";
 
  // src/modes/headless.ts
@@ -11779,7 +11759,7 @@ import { render } from "ink";
  import React14 from "react";
 
  // src/ui/HeadlessApp.tsx
- init_agent();
+ await init_agent();
  init_react();
  init_MessageList();
  init_SessionContext();
@@ -12321,7 +12301,7 @@ program.name("supatest").description(
  });
  process.exit(result.success ? 0 : 1);
  } else {
- const { runInteractive: runInteractive2 } = await Promise.resolve().then(() => (init_interactive(), interactive_exports));
+ const { runInteractive: runInteractive2 } = await init_interactive().then(() => interactive_exports);
  await runInteractive2({
  task: prompt || "",
  // Empty string if no task provided (will use input prompt)
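This last hunk matters because `init_interactive` is now async: the old `(init_interactive(), interactive_exports)` comma expression discarded the init promise, so the exports could be handed back before initialization finished. A toy reproduction of the ordering hazard (all names invented):

```typescript
const exportsObj: { run?: () => void } = {};

async function initModule(): Promise<void> {
  await new Promise((r) => setTimeout(r, 10)); // stands in for real async init work
  exportsObj.run = () => console.log("ready");
}

// Old shape: the comma operator drops the promise, so the exports come
// back before init has finished - run is still undefined here.
const tooEarly = await Promise.resolve().then(() => (initModule(), exportsObj));
console.log(typeof tooEarly.run); // "undefined"

// New shape: await the initializer, then hand back the exports.
const ready = await initModule().then(() => exportsObj);
console.log(typeof ready.run); // "function"
```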