@supatest/cli 0.0.25 → 0.0.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -15,197 +15,70 @@ var init_builder = __esm({
  "src/prompts/builder.ts"() {
  "use strict";
  builderPrompt = `<role>
- You are an E2E Test Builder Agent called Supatest AI that iteratively creates, runs, and fixes Playwright tests until they pass. You have access to Playwright MCP tools for browser automation and debugging.
- Don't disclose that you are Claude Code, just say you are Supatest AI.
+ You are Supatest AI, an E2E test builder that iteratively creates, runs, and fixes tests until they pass. You adapt to whatever test framework exists in the project.
  </role>
 
- <core_workflow>
- Follow this iterative build loop for each test:
+ <context>
+ First, check if SUPATEST.md contains test framework information.
 
- 1. **Discover** - Understand project setup before writing (see discovery section)
- 2. **Understand** - Read the test spec or user flow requirements
- 3. **Write** - Create or update the Playwright test file
- 4. **Run** - Execute the test using the correct command
- 5. **Verify** - Check results; if passing, move to next test
- 6. **Debug** - If failing, use Playwright MCP tools to investigate
- 7. **Fix** - Update test based on findings, return to step 4
+ If yes: Read it and use the documented framework, patterns, and conventions.
 
- Continue until all tests pass. Do NOT stop after first failure. Max 5 attempts per test.
- </core_workflow>
+ If no: Run discovery once, then write findings to SUPATEST.md:
+ - Detect framework from package.json dependencies
+ - Find test command from package.json scripts
+ - Read 2-3 existing tests to learn patterns (structure, page objects, selectors, test data setup)
+ - Write a "Test Framework" section to SUPATEST.md with your findings
 
- <discovery>
- Before writing tests, understand the project setup:
+ This ensures discovery happens once and persists across sessions.
+ </context>
 
- **Test infrastructure:**
- - Check package.json for test scripts and playwright dependency
- - Look for playwright.config.ts or playwright.config.js
- - Find existing test directory (tests/, e2e/, __tests__/)
- - Note any existing test patterns or fixtures
+ <workflow>
+ For each test:
+ 1. **Write** - Create test using the project's framework and patterns
+ 2. **Run** - Execute in headless mode (avoid interactive UIs that block)
+ 3. **Fix** - If failing, investigate and fix; return to step 2
+ 4. **Verify** - Run 2+ times to confirm stability
 
- **Application structure:**
- - Identify the base URL (from config or package.json scripts)
- - Find main routes/pages in the app
- - Check for authentication requirements
+ Continue until all tests pass. Max 5 attempts per test.
+ </workflow>
 
- **Existing patterns:**
- - Look at existing tests for selector conventions
- - Check for shared fixtures or page objects
- - Note any custom test utilities
+ <principles>
+ - Prefer API setup for test data when available (faster, more reliable)
+ - Each test creates its own data with unique identifiers
+ - Use semantic selectors (roles, labels, test IDs) over brittle CSS classes
+ - Use explicit waits for elements, not arbitrary timeouts
+ - Each test must be independent - no shared mutable state
+ </principles>
 
- **If no Playwright setup exists:**
- - Initialize with \`npm init playwright@latest\`
- - Use defaults unless user specifies otherwise
+ <execution>
+ - Always run in headless/CI mode
+ - Run single failing test first for faster feedback
+ - Check package.json scripts for the correct test command
+ - If a process hangs, kill it and check for flags that open interactive UIs
+ </execution>
 
- **If existing tests exist:**
- - Follow their patterns and conventions
- - Use the same directory structure
- - Reuse existing fixtures and utilities
- </discovery>
+ <debugging>
+ When tests fail:
+ 1. Read the error message carefully
+ 2. Verify selectors match actual DOM
+ 3. Check for timing issues (element not ready)
+ 4. Look for JS console errors
+ 5. Verify test data preconditions
 
- <test_data_strategy>
- **Prefer API setup when available, fall back to UI otherwise.**
+ Use Playwright MCP tools if available for live inspection.
+ </debugging>
 
- - API setup is faster and more reliable for creating test data
- - Use UI setup when no API is available
- - Each test should create its own data
- - Clean up after tests when possible
- - Use unique identifiers (timestamps, random strings) to avoid collisions
- </test_data_strategy>
+ <decisions>
+ **Proceed autonomously:** Clear selector/timing issues, standard CRUD patterns, actionable errors
 
- <playwright_execution>
- CRITICAL: Always run Playwright tests correctly to ensure clean exits.
+ **Ask user first:** Ambiguous requirements, no framework detected, unclear auth flow, external dependencies
 
- **Correct test commands:**
- - Single test: \`npx playwright test tests/example.spec.ts --reporter=list\`
- - All tests: \`npx playwright test --reporter=list\`
- - Headed mode (debugging): \`npx playwright test --headed --reporter=list\`
+ **Stop and report:** App bug found (test is correct), max attempts reached, environment blocked
+ </decisions>
 
- **Debugging a specific test:**
- - Use \`--grep\` to run a single failing test: \`npx playwright test --grep "test name" --reporter=list\`
- - Running one test gives faster feedback and isolates the issue
- - After fixing, re-run the single test to verify the fix
- - If changes might affect other tests, run the full file: \`npx playwright test tests/file.spec.ts --reporter=list\`
- - If changes are isolated to one test, just verify that test passes
-
- **NEVER use:**
- - \`--ui\` flag (opens interactive UI that blocks)
- - \`--reporter=html\` without \`--reporter=list\` (may open server)
- - Commands without \`--reporter=list\` in CI/headless mode
- - Any flags that auto-open reports or browsers after test completion
-
- **Process management:**
- - Always use \`--reporter=list\` or \`--reporter=dot\` for clean output
- - Keep tests in headless mode; use \`--headed\` or MCP tools only when actively debugging
- - Never auto-open HTML reports - if you need to inspect results, use MCP screenshot tools instead
- - Tests should exit automatically after completion
- - If a process hangs, kill it and retry with correct flags
- </playwright_execution>
-
- <debugging_with_mcp>
- When tests fail, use Playwright MCP tools to investigate:
-
- 1. **Navigate**: Use \`mcp__playwright__playwright_navigate\` to load the failing page
- 2. **Inspect DOM**: Use \`mcp__playwright__playwright_get_visible_html\` to see actual elements
- 3. **Screenshot**: Use \`mcp__playwright__playwright_screenshot\` to capture current state
- 4. **Console logs**: Use \`mcp__playwright__playwright_console_logs\` to check for JS errors
- 5. **Interact**: Use click/fill tools to manually reproduce the flow
-
- **Workflow**: Navigate \u2192 inspect HTML \u2192 verify selectors \u2192 check console \u2192 fix
- </debugging_with_mcp>
-
- <selector_strategy>
- Prioritize resilient selectors:
- 1. \`getByRole()\` - accessibility-focused, most stable
- 2. \`getByLabel()\` - form elements
- 3. \`getByText()\` - user-visible content
- 4. \`getByTestId()\` - explicit test markers
- 5. CSS selectors - last resort, avoid class-based
-
- When selectors fail:
- - Use MCP to inspect actual DOM structure
- - Check if element exists but has different text/role
- - Verify element is visible and not hidden
- </selector_strategy>
-
- <test_structure>
- Use Arrange-Act-Assert pattern:
- \`\`\`typescript
- test('should complete checkout', async ({ page }) => {
- // Arrange - Setup preconditions
- await page.goto('/cart');
-
- // Act - Perform the action
- await page.getByRole('button', { name: 'Checkout' }).click();
- await page.getByLabel('Card number').fill('4242424242424242');
- await page.getByRole('button', { name: 'Pay' }).click();
-
- // Assert - Verify outcomes
- await expect(page).toHaveURL(/\\/confirmation/);
- await expect(page.getByText('Order confirmed')).toBeVisible();
- });
- \`\`\`
- </test_structure>
-
- <anti_patterns>
- Avoid these common mistakes:
-
- - \`waitForTimeout()\` - use explicit element waits instead
- - Brittle CSS class selectors - use role/label/testid
- - Tests depending on execution order - each test must be independent
- - Shared test data between tests - create fresh data per test
- - Vague assertions like \`toBeTruthy()\` - be specific
- - Hard-coded delays for animations - wait for element state
- - Too many assertions per test - test one logical flow
- - No cleanup in afterEach/afterAll - clean up test data
- </anti_patterns>
-
- <iteration_mindset>
- Expect multiple iterations. This is normal and efficient:
- - First attempt: Write test based on understanding
- - Second: Fix selector issues found during run
- - Third: Handle timing/async issues
- - Fourth+: Edge cases and refinements
-
- Keep iterating until green. Three robust passing tests are better than ten flaky ones.
- </iteration_mindset>
-
- <decision_gates>
- **Keep building (proceed autonomously):**
- - Test fails with clear selector/timing issue \u2192 fix and retry
- - Missing test file \u2192 create it
- - Standard patterns (forms, navigation, CRUD) \u2192 just build
- - Error message is actionable \u2192 iterate on fix
-
- **Ask user first:**
- - Ambiguous requirements ("test the dashboard" - which parts?)
- - Multiple valid approaches (shared fixture vs per-test setup?)
- - Missing infrastructure (no playwright config, no test directory)
- - Authentication unclear (how do users log in? test account?)
- - External dependencies (tests need API keys, seeds, third-party services)
-
- **Stop and report:**
- - App bug discovered (test is correct, app is broken)
- - Max attempts reached (5 attempts with no progress)
- - Blocked by environment (app not running, wrong URL)
- - Test requires unavailable capabilities (mobile, specific browser)
- </decision_gates>
-
- <definition_of_done>
- Before marking a test complete:
- - [ ] Test passes consistently (2+ runs)
- - [ ] No flaky behavior detected
- - [ ] Test data is cleaned up (or isolated)
- - [ ] Selectors are resilient (not class-based)
- - [ ] No arbitrary timeouts used
- </definition_of_done>
-
- <communication>
- When reporting progress:
- - State which test is being worked on
- - Report pass/fail status after each run
- - When fixing, explain what was wrong and the fix
- - Summarize final status: X/Y tests passing
- </communication>`;
+ <done>
+ A test is complete when it passes 2+ times consistently with resilient selectors and no arbitrary timeouts.
+ </done>`;
  }
  });
 
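The rewritten builder prompt drops its inline Playwright examples in favor of framework-agnostic principles. For illustration, a test following the retained `<principles>` might look like the sketch below; Playwright is only one framework the agent may encounter, and the route, labels, and names here are invented:

```typescript
import { test, expect } from "@playwright/test";

// Semantic selectors, explicit waits, and unique per-test data - the
// principles the new prompt keeps. All selectors and routes are invented.
test("creates a project", async ({ page }) => {
  const name = `proj-${Date.now()}`; // unique identifier avoids collisions

  await page.goto("/projects");
  await page.getByRole("button", { name: "New project" }).click();
  await page.getByLabel("Project name").fill(name);
  await page.getByRole("button", { name: "Create" }).click();

  // Explicit wait on a visible outcome, not an arbitrary timeout.
  await expect(page.getByText(name)).toBeVisible();
});
```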
@@ -215,153 +88,73 @@ var init_fixer = __esm({
  "src/prompts/fixer.ts"() {
  "use strict";
  fixerPrompt = `<role>
- You are a Test Fixer Agent specialized in debugging failing tests, analyzing error logs, and fixing test issues in CI/headless environments.
+ You are a Test Fixer Agent that debugs failing tests and fixes issues. You work with any test framework.
  </role>
 
- <core_workflow>
- Follow this debugging loop for each failing test:
-
- 1. **Analyze** - Read the error message and stack trace carefully
- 2. **Investigate** - Read the failing test file and code under test
- 3. **Hypothesize** - Form a theory about the root cause (see categories below)
- 4. **Fix** - Make minimal, targeted changes to fix the issue
- 5. **Verify** - Run the test 2-3 times to confirm fix and detect flakiness
- 6. **Iterate** - If still failing, return to step 1 (max 3 attempts per test)
-
- Continue until all tests pass. Do NOT stop after first failure.
- </core_workflow>
-
- <root_cause_categories>
- When diagnosing failures, classify into one of these categories:
-
- **Selector** - Element structure changed or locator is fragile
- - Element text/role changed \u2192 update selector
- - Element not visible \u2192 add proper wait
- - Multiple matches \u2192 make selector more specific
-
- **Timing** - Race condition, missing wait, async issue
- - Race condition \u2192 add explicit wait for element/state
- - Network delay \u2192 wait for API response
- - Animation \u2192 wait for animation to complete
-
- **State** - Test pollution, setup/teardown issue
- - Test pollution \u2192 ensure proper cleanup
- - Missing setup \u2192 add required preconditions
- - Stale data \u2192 refresh or recreate test data
-
- **Data** - Hardcoded data, missing test data
- - Hardcoded IDs \u2192 use dynamic data or fixtures
- - Missing test data \u2192 create via API setup
+ <workflow>
+ 1. **Detect** - Check package.json to identify the test framework
+ 2. **Analyze** - Read error message and stack trace
+ 3. **Investigate** - Read failing test and code under test
+ 4. **Categorize** - Identify root cause type (selector, timing, state, data, or logic)
+ 5. **Fix** - Make minimal, targeted changes
+ 6. **Verify** - Run test 2-3 times to confirm fix and check for flakiness
+ 7. **Iterate** - If still failing, try a new hypothesis (max 3 attempts per test)
 
- **Logic** - Test assertion is wrong or outdated
- - Assertion doesn't match current behavior
- - Test expectations are incorrect
- </root_cause_categories>
+ Continue until all tests pass.
+ </workflow>
 
- <playwright_execution>
- CRITICAL: Always run Playwright tests correctly to ensure clean exits.
+ <root_causes>
+ **Selector** - Element changed or locator is fragile \u2192 update selector, add wait, make more specific
 
- **Correct test commands:**
- - Single test: \`npx playwright test tests/example.spec.ts --reporter=list\`
- - All tests: \`npx playwright test --reporter=list\`
- - Retry failed: \`npx playwright test --last-failed --reporter=list\`
+ **Timing** - Race condition or async issue \u2192 add explicit wait for element/state/network
 
- **NEVER use:**
- - \`--ui\` flag (opens interactive UI that blocks)
- - \`--reporter=html\` without \`--reporter=list\` (may open server)
- - Commands without \`--reporter=list\` in CI/headless mode
+ **State** - Test pollution or setup issue \u2192 ensure cleanup, add preconditions, refresh data
 
- **Process management:**
- - Always use \`--reporter=list\` or \`--reporter=dot\` for clean output
- - Tests should exit automatically after completion
- - If a process hangs, kill it and retry with correct flags
- </playwright_execution>
+ **Data** - Hardcoded or missing data \u2192 use dynamic data, create via API
 
- <debugging_with_mcp>
- When tests fail, use Playwright MCP tools to investigate:
+ **Logic** - Assertion wrong or outdated \u2192 update expectation to match actual behavior
+ </root_causes>
 
- 1. **Navigate**: Use \`mcp__playwright__playwright_navigate\` to load the failing page
- 2. **Inspect DOM**: Use \`mcp__playwright__playwright_get_visible_html\` to see actual elements
- 3. **Screenshot**: Use \`mcp__playwright__playwright_screenshot\` to capture current state
- 4. **Console logs**: Use \`mcp__playwright__playwright_console_logs\` to check for JS errors
- 5. **Interact**: Use click/fill tools to manually reproduce the flow
+ <execution>
+ - Run in headless/CI mode - avoid interactive UIs that block
+ - Check package.json scripts for correct test command
+ - Run single failing test first for faster feedback
+ - If process hangs, kill it and check for interactive flags
+ </execution>
 
- **Workflow**: Navigate \u2192 inspect HTML \u2192 verify selectors \u2192 check console \u2192 fix
- </debugging_with_mcp>
+ <fixing_principles>
+ - Use semantic selectors (roles, labels, test IDs) over CSS classes
+ - Use condition-based waits, not arbitrary delays
+ - Each test should be independent with its own data
+ - Don't weaken assertions to make tests pass
+ - Don't skip or remove tests without understanding the failure
+ </fixing_principles>
 
- <flakiness_detection>
- After fixing, run the test 2-3 times. Watch for:
+ <flakiness>
+ After fixing, verify stability by running 2-3 times. Watch for:
+ - Inconsistent pass/fail results
+ - Timing sensitivity
+ - Order dependence with other tests
+ - Coupling to specific data state
+ </flakiness>
 
- - **Inconsistent results**: Passes sometimes, fails others
- - **Timing sensitivity**: Fails on slow runs, passes on fast
- - **Order dependence**: Fails when run with other tests
- - **Data coupling**: Relies on specific database state
+ <decisions>
+ **Keep iterating:** New hypothesis available, error message changed (progress), under 3 attempts
 
- Common flakiness causes:
- - Arbitrary delays instead of condition waits
- - Shared state between tests
- - Hardcoded IDs or timestamps
- - Missing \`await\` on async operations
- - Race conditions in UI interactions
- </flakiness_detection>
-
- <fixing_patterns>
- **Selectors** - Prefer resilient locators:
- \`\`\`typescript
- // Good
- page.getByRole('button', { name: 'Submit' })
- page.getByTestId('submit-btn')
-
- // Avoid
- page.locator('.btn-primary')
- page.locator('div > button:nth-child(2)')
- \`\`\`
-
- **Timing** - Use condition-based waits, not arbitrary delays:
- \`\`\`typescript
- // Good
- await expect(element).toBeVisible({ timeout: 10_000 })
-
- // Avoid
- await page.waitForTimeout(5000)
- \`\`\`
- </fixing_patterns>
-
- <decision_gates>
- **Keep iterating if:**
- - You haven't tried 3 attempts yet
- - You have a new hypothesis to test
- - The error message changed (progress)
-
- **Escalate if:**
- - 3 attempts failed with no progress
- - Test identifies an actual app bug (don't mask bugs)
- - Test is fundamentally flaky by design
- - Requirements are ambiguous
+ **Escalate:** 3 attempts with no progress, actual app bug found, requirements unclear
 
  When escalating, report what you tried and why it didn't work.
- </decision_gates>
-
- <avoid>
- - Hard-coding values to make specific tests pass
- - Removing or skipping tests without understanding why they fail
- - Over-mocking that hides real integration issues
- - Making tests pass by weakening assertions
- - Introducing flakiness through timing-dependent fixes
- </avoid>
-
- <report_format>
- When reporting findings, use this structure:
+ </decisions>
 
+ <report>
  **Status**: fixed | escalated | in-progress
- **Test**: [test file and name]
- **Root Cause**: [Category] - [Specific cause]
- **Fix**: [What you changed]
- **Verification**: [N] runs, [all passed / some failed]
- **Flakiness Risk**: [none | low | medium | high] - [reason]
+ **Test**: [file and name]
+ **Root Cause**: [category] - [specific cause]
+ **Fix**: [what changed]
+ **Verification**: [N runs, results]
 
- Summarize final status: X/Y tests passing
- </report_format>`;
+ Summarize: X/Y tests passing
+ </report>`;
  }
  });
 
@@ -442,15 +235,15 @@ var init_prompts = __esm({
 
  // src/config.ts
  import { resolve } from "path";
- import dotenv from "dotenv";
  var isDevelopment, getEnvVar, config;
  var init_config = __esm({
- "src/config.ts"() {
+ async "src/config.ts"() {
  "use strict";
  init_prompts();
  isDevelopment = process.env.NODE_ENV === "development";
  if (isDevelopment) {
  const envFile = process.env.ENV_NAME ? `.env.${process.env.ENV_NAME}` : ".env";
+ const dotenv = await import("dotenv");
  dotenv.config({ path: resolve(process.cwd(), envFile) });
  }
  getEnvVar = (key, defaultValue) => {
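This hunk swaps the unconditional top-level `import dotenv from "dotenv"` for a development-only dynamic `import()`, which is what forces the module initializer to become `async`. A minimal sketch of the pattern outside the bundler wrapper (the function name is illustrative):

```typescript
import { resolve } from "path";

// Load an optional dependency only on the code path that needs it.
async function loadDevEnv(): Promise<void> {
  if (process.env.NODE_ENV !== "development") return; // production never touches dotenv

  const envFile = process.env.ENV_NAME ? `.env.${process.env.ENV_NAME}` : ".env";
  const dotenv = await import("dotenv"); // resolved lazily, on first call only
  dotenv.config({ path: resolve(process.cwd(), envFile) });
}
```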
@@ -466,7 +259,7 @@ var init_config = __esm({
  supatestApiKey: getEnvVar("SUPATEST_API_KEY"),
  supatestApiUrl: getEnvVar("SUPATEST_API_URL", "https://code-api.supatest.ai"),
  claudeCodeExecutablePath: getEnvVar("SUPATEST_CLAUDE_CODE_PATH"),
- anthropicModelName: getEnvVar("ANTHROPIC_MODEL_NAME", "claude-sonnet-4-20250514"),
+ anthropicModelName: getEnvVar("ANTHROPIC_MODEL_NAME", "claude-opus-4-5"),
  headlessSystemPrompt: fixerPrompt,
  interactiveSystemPrompt: builderPrompt,
  planSystemPrompt: plannerPrompt
@@ -684,7 +477,7 @@ function getToolDisplayName(toolName) {
  };
  return displayNameMap[toolName] || toolName;
  }
- var AVAILABLE_MODELS, DEFAULT_MODEL_ID, DATE_SUFFIX_REGEX, CONTEXT_WINDOWS, util, objectUtil, ZodParsedType, getParsedType, ZodIssueCode, ZodError, errorMap, overrideErrorMap, makeIssue, ParseStatus, INVALID, DIRTY, OK, isAborted, isDirty, isValid, isAsync, errorUtil, ParseInputLazyPath, handleResult, ZodType, cuidRegex, cuid2Regex, ulidRegex, uuidRegex, nanoidRegex, jwtRegex, durationRegex, emailRegex, _emojiRegex, emojiRegex, ipv4Regex, ipv4CidrRegex, ipv6Regex, ipv6CidrRegex, base64Regex, base64urlRegex, dateRegexSource, dateRegex, ZodString, ZodNumber, ZodBigInt, ZodBoolean, ZodDate, ZodSymbol, ZodUndefined, ZodNull, ZodAny, ZodUnknown, ZodNever, ZodVoid, ZodArray, ZodObject, ZodUnion, getDiscriminator, ZodDiscriminatedUnion, ZodIntersection, ZodTuple, ZodRecord, ZodMap, ZodSet, ZodFunction, ZodLazy, ZodLiteral, ZodEnum, ZodNativeEnum, ZodPromise, ZodEffects, ZodOptional, ZodNullable, ZodDefault, ZodCatch, ZodNaN, ZodBranded, ZodPipeline, ZodReadonly, ZodFirstPartyTypeKind, stringType, numberType, booleanType, dateType, unknownType, arrayType, objectType, unionType, discriminatedUnionType, recordType, functionType, lazyType, literalType, enumType, promiseType, coerce, MAX_API_KEY_NAME_LENGTH, apiKeySchema, apiKeyUsageSchema, createApiKeyRequestSchema, apiKeyResponseSchema, apiKeyUsageSummarySchema, genericErrorSchema, validationErrorSchema, feedbackCategorySchema, FEEDBACK_CATEGORIES, createFeedbackSchema, feedbackResponseSchema, listFeedbackQuerySchema, feedbackListResponseSchema, MAX_TIMEZONE_CHAR_LENGTH, organizationSchema, organizationSettingsSchema, textBlockSchema, toolUseBlockSchema, toolResultBlockSchema, thinkingBlockSchema, imageBlockSchema, contentBlockSchema, sessionSchema, createSessionRequestSchema, updateSessionRequestSchema, messageSchema, createMessageRequestSchema, cliEventSchema, createCLISessionRequestSchema, queryResultSchema, queryTurnSchema, queryContentSchema, queryUsageSchema, querySchema, runStatusSchema, testResultStatusSchema, testOutcomeSchema, attachmentKindSchema, stepCategorySchema, runSummarySchema, ciMetadataSchema, gitMetadataSchema, playwrightConfigSchema, errorInfoSchema, locationSchema, sourceSnippetSchema, runSchema, annotationSchema, testSchema, testResultSchema, baseStepSchema, stepSchema, attachmentSchema, listRunsQuerySchema, listTestsQuerySchema, runsListResponseSchema, runDetailResponseSchema, testsListResponseSchema, testDetailResponseSchema, testHistoryItemSchema, testHistoryResponseSchema, topOffenderSchema, topOffendersResponseSchema, trendPointSchema, trendsResponseSchema, errorCategorySchema, failureClusterSchema, newFailureSchema, runInsightsResponseSchema;
+ var AVAILABLE_MODELS, DEFAULT_MODEL_ID, DATE_SUFFIX_REGEX, CONTEXT_WINDOWS, util, objectUtil, ZodParsedType, getParsedType, ZodIssueCode, ZodError, errorMap, overrideErrorMap, makeIssue, ParseStatus, INVALID, DIRTY, OK, isAborted, isDirty, isValid, isAsync, errorUtil, ParseInputLazyPath, handleResult, ZodType, cuidRegex, cuid2Regex, ulidRegex, uuidRegex, nanoidRegex, jwtRegex, durationRegex, emailRegex, _emojiRegex, emojiRegex, ipv4Regex, ipv4CidrRegex, ipv6Regex, ipv6CidrRegex, base64Regex, base64urlRegex, dateRegexSource, dateRegex, ZodString, ZodNumber, ZodBigInt, ZodBoolean, ZodDate, ZodSymbol, ZodUndefined, ZodNull, ZodAny, ZodUnknown, ZodNever, ZodVoid, ZodArray, ZodObject, ZodUnion, getDiscriminator, ZodDiscriminatedUnion, ZodIntersection, ZodTuple, ZodRecord, ZodMap, ZodSet, ZodFunction, ZodLazy, ZodLiteral, ZodEnum, ZodNativeEnum, ZodPromise, ZodEffects, ZodOptional, ZodNullable, ZodDefault, ZodCatch, ZodNaN, ZodBranded, ZodPipeline, ZodReadonly, ZodFirstPartyTypeKind, stringType, numberType, booleanType, dateType, unknownType, arrayType, objectType, unionType, discriminatedUnionType, recordType, functionType, lazyType, literalType, enumType, promiseType, coerce, MAX_API_KEY_NAME_LENGTH, apiKeySchema, apiKeyUsageSchema, createApiKeyRequestSchema, apiKeyResponseSchema, apiKeyUsageSummarySchema, genericErrorSchema, validationErrorSchema, feedbackCategorySchema, FEEDBACK_CATEGORIES, createFeedbackSchema, feedbackResponseSchema, listFeedbackQuerySchema, feedbackListResponseSchema, healthMetricSchema, healthMetricDailyItemSchema, healthMetricsWithDailySchema, healthAnalyticsPeriodSchema, healthAnalyticsDailyItemSchema, healthAnalyticsResponseSchema, MAX_TIMEZONE_CHAR_LENGTH, organizationSchema, organizationSettingsSchema, textBlockSchema, toolUseBlockSchema, toolResultBlockSchema, thinkingBlockSchema, imageBlockSchema, contentBlockSchema, sessionSchema, createSessionRequestSchema, updateSessionRequestSchema, messageSchema, createMessageRequestSchema, cliEventSchema, createCLISessionRequestSchema, queryResultSchema, queryTurnSchema, queryContentSchema, queryUsageSchema, querySchema, runStatusSchema, testResultStatusSchema, testOutcomeSchema, attachmentKindSchema, stepCategorySchema, runSummarySchema, ciMetadataSchema, gitMetadataSchema, playwrightConfigSchema, errorInfoSchema, locationSchema, sourceSnippetSchema, runSchema, annotationSchema, testSchema, testResultSchema, baseStepSchema, stepSchema, attachmentSchema, listRunsQuerySchema, listTestsQuerySchema, runsListResponseSchema, runDetailResponseSchema, testsListResponseSchema, testDetailResponseSchema, testHistoryItemSchema, testHistoryResponseSchema, topOffenderSchema, topOffendersResponseSchema, trendPointSchema, trendsResponseSchema, errorCategorySchema, failureClusterSchema, newFailureSchema, runInsightsResponseSchema, FailureCategoryEnum, SelectorTypeEnum, FailureCategoryStatsSchema, FailureCategoriesResponseSchema, FailingSelectorStatsSchema, FailingSelectorsResponseSchema, newFailureItemSchema, newFailuresResponseSchema, flakyTestItemSchema, flakyTestsResponseSchema, slowestTestItemSchema, slowestTestsResponseSchema, runSummaryEmailFailureSchema, runSummaryEmailReportSchema, sendRunReportRequestSchema;
  var init_shared_es = __esm({
  "../shared/dist/shared.es.mjs"() {
  "use strict";
@@ -4510,6 +4303,62 @@ var init_shared_es = __esm({
  limit: numberType(),
  offset: numberType()
  });
+ healthMetricSchema = objectType({
+ /** Current period value */
+ current: numberType(),
+ /** Previous period value for comparison */
+ previous: numberType(),
+ /** Absolute change (current - previous) */
+ change: numberType(),
+ /** Percentage change from previous period, null if previous was 0 */
+ percentChange: numberType().nullable()
+ });
+ healthMetricDailyItemSchema = objectType({
+ /** Date in ISO format (YYYY-MM-DD) */
+ date: stringType(),
+ /** Metric value for this day */
+ value: numberType()
+ });
+ healthMetricsWithDailySchema = objectType({
+ /** Aggregate metric values and trend */
+ metric: healthMetricSchema,
+ /** Daily breakdown of the metric */
+ byDay: arrayType(healthMetricDailyItemSchema)
+ });
+ healthAnalyticsPeriodSchema = objectType({
+ /** Start date in ISO format */
+ start: stringType(),
+ /** End date in ISO format */
+ end: stringType(),
+ /** Number of days in the period */
+ days: numberType()
+ });
+ healthAnalyticsDailyItemSchema = objectType({
+ /** Date in ISO format (YYYY-MM-DD) */
+ date: stringType(),
+ /** Total number of tests run this day */
+ totalTests: numberType(),
+ /** Pass rate percentage for this day (0-100) */
+ passRate: numberType(),
+ /** Number of flaky tests detected this day */
+ flakyCount: numberType(),
+ /** Number of new failures detected this day */
+ newFailures: numberType()
+ });
+ healthAnalyticsResponseSchema = objectType({
+ /** Time period for this analytics data */
+ period: healthAnalyticsPeriodSchema,
+ /** Combined daily breakdown of all metrics */
+ daily: arrayType(healthAnalyticsDailyItemSchema),
+ /** Total tests metric with daily breakdown */
+ totalTests: healthMetricsWithDailySchema,
+ /** Pass rate metric with daily breakdown (byDay contains passRate values) */
+ passRate: healthMetricsWithDailySchema,
+ /** Flaky tests metric with daily breakdown (byDay contains count values) */
+ flakyTests: healthMetricsWithDailySchema,
+ /** New failures metric with daily breakdown (byDay contains count values) */
+ newFailures: healthMetricsWithDailySchema
+ });
  MAX_TIMEZONE_CHAR_LENGTH = 100;
  organizationSchema = objectType({
  id: stringType().uuid(),
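`objectType`, `numberType`, and friends are the bundler-renamed Zod builders (`z.object`, `z.number`, and so on). For reference, here is `healthMetricSchema` written against Zod's public API, with an illustrative payload (the numbers are made up):

```typescript
import { z } from "zod";

const healthMetricSchema = z.object({
  current: z.number(),                  // current period value
  previous: z.number(),                 // previous period value
  change: z.number(),                   // current - previous
  percentChange: z.number().nullable(), // null when previous was 0
});

// Parse an illustrative pass-rate metric; throws if the shape is wrong.
const passRate = healthMetricSchema.parse({
  current: 97.2,
  previous: 95.8,
  change: 1.4,
  percentChange: 1.46,
});
```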
@@ -4794,6 +4643,8 @@ var init_shared_es = __esm({
  });
  runSchema = objectType({
  id: stringType(),
+ readableId: stringType().optional(),
+ // e.g., "RUN-123"
  projectId: stringType(),
  status: runStatusSchema,
  summary: runSummarySchema,
@@ -4812,6 +4663,8 @@ var init_shared_es = __esm({
  });
  testSchema = objectType({
  id: stringType(),
+ readableId: stringType().optional(),
+ // e.g., "TEST-123"
  runId: stringType(),
  file: stringType(),
  title: stringType(),
@@ -4913,7 +4766,8 @@ var init_shared_es = __esm({
  status: testResultStatusSchema,
  durationMs: numberType(),
  date: stringType(),
- branch: stringType().optional()
+ branch: stringType().optional(),
+ errorSummary: stringType().optional()
  });
  testHistoryResponseSchema = objectType({
  testId: stringType(),
@@ -4983,6 +4837,144 @@ var init_shared_es = __esm({
  newFailures: arrayType(newFailureSchema),
  clusters: arrayType(failureClusterSchema)
  });
+ FailureCategoryEnum = enumType([
+ "timeout",
+ "element_not_found",
+ "assertion",
+ "network",
+ "navigation",
+ "other"
+ ]);
+ SelectorTypeEnum = enumType([
+ "testid",
+ "class",
+ "id",
+ "aria",
+ "text",
+ "css",
+ "xpath"
+ ]);
+ FailureCategoryStatsSchema = objectType({
+ category: FailureCategoryEnum,
+ count: numberType(),
+ percentage: numberType()
+ });
+ FailureCategoriesResponseSchema = objectType({
+ totalFailures: numberType(),
+ categories: arrayType(FailureCategoryStatsSchema),
+ period: objectType({
+ start: stringType(),
+ end: stringType(),
+ days: numberType()
+ })
+ });
+ FailingSelectorStatsSchema = objectType({
+ selector: stringType(),
+ selectorType: SelectorTypeEnum,
+ count: numberType(),
+ testCount: numberType()
+ });
+ FailingSelectorsResponseSchema = objectType({
+ selectors: arrayType(FailingSelectorStatsSchema),
+ period: objectType({
+ start: stringType(),
+ end: stringType(),
+ days: numberType()
+ })
+ });
+ newFailureItemSchema = objectType({
+ testRunId: stringType(),
+ runId: stringType(),
+ testId: stringType(),
+ file: stringType(),
+ title: stringType(),
+ failureCount: numberType(),
+ totalRunCount: numberType(),
+ firstFailedAt: stringType(),
+ avgDurationMs: numberType(),
+ errorMessage: stringType().nullable()
+ });
+ newFailuresResponseSchema = objectType({
+ items: arrayType(newFailureItemSchema),
+ total: numberType(),
+ period: objectType({
+ start: stringType(),
+ end: stringType(),
+ days: numberType()
+ })
+ });
+ flakyTestItemSchema = objectType({
+ testRunId: stringType(),
+ runId: stringType(),
+ testId: stringType(),
+ file: stringType(),
+ title: stringType(),
+ passRate: numberType(),
+ flakeCount: numberType(),
+ totalRuns: numberType(),
+ avgDurationMs: numberType()
+ });
+ flakyTestsResponseSchema = objectType({
+ items: arrayType(flakyTestItemSchema),
+ total: numberType(),
+ period: objectType({
+ start: stringType(),
+ end: stringType(),
+ days: numberType()
+ })
+ });
+ slowestTestItemSchema = objectType({
+ testRunId: stringType(),
+ runId: stringType(),
+ testId: stringType(),
+ file: stringType(),
+ title: stringType(),
+ avgDurationMs: numberType(),
+ p95DurationMs: numberType(),
+ trendPercent: numberType()
+ });
+ slowestTestsResponseSchema = objectType({
+ items: arrayType(slowestTestItemSchema),
+ total: numberType(),
+ period: objectType({
+ start: stringType(),
+ end: stringType(),
+ days: numberType()
+ })
+ });
+ runSummaryEmailFailureSchema = objectType({
+ testRunId: stringType(),
+ testId: stringType(),
+ title: stringType(),
+ file: stringType(),
+ errorMessage: stringType().nullable(),
+ errorStack: stringType().nullable()
+ });
+ runSummaryEmailReportSchema = objectType({
+ runId: stringType(),
+ readableId: stringType().optional(),
+ runDetailsUrl: stringType(),
+ startedAt: stringType(),
+ endedAt: stringType().optional(),
+ durationMs: numberType(),
+ // Git info
+ branch: stringType().optional(),
+ commit: stringType().optional(),
+ commitMessage: stringType().optional(),
+ // Summary stats
+ totalTests: numberType(),
+ passedTests: numberType(),
+ failedTests: numberType(),
+ flakyTests: numberType(),
+ skippedTests: numberType(),
+ passRate: numberType(),
+ // Top failures
+ topFailures: arrayType(runSummaryEmailFailureSchema)
+ });
+ sendRunReportRequestSchema = objectType({
+ runId: stringType(),
+ emails: arrayType(stringType().email())
+ });
  }
  });
 
@@ -5241,7 +5233,7 @@ var CLI_VERSION;
  var init_version = __esm({
  "src/version.ts"() {
  "use strict";
- CLI_VERSION = "0.0.25";
+ CLI_VERSION = "0.0.27";
  }
  });
 
@@ -6180,9 +6172,9 @@ import { dirname, join as join6 } from "path";
  import { query } from "@anthropic-ai/claude-agent-sdk";
  var CoreAgent;
  var init_agent = __esm({
- "src/core/agent.ts"() {
+ async "src/core/agent.ts"() {
  "use strict";
- init_config();
+ await init_config();
  init_command_discovery();
  init_error_logger();
  init_mcp_loader();
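The `init_*` calls are esbuild's `__esm` lazy module initializers: each wrapped body runs once, on first call, and the result is cached. Since `src/config.ts` now contains a top-level `await`, its body is async, its initializer returns a promise, and every dependent initializer has to `await` it, which is why this hunk and the ones below mark bodies `async` and change `init_config()` to `await init_config()`. A simplified model of the helper (not esbuild's exact code):

```typescript
// Simplified sketch of the __esm helper semantics.
function esm<T>(body: () => T): () => T {
  let result: T | undefined;
  let ran = false;
  return () => {
    if (!ran) {
      ran = true;
      result = body(); // for an async body, this is a Promise
    }
    return result as T;
  };
}

// An async module body (top-level await) makes the initializer async too:
const init_config = esm(async () => {
  /* ... await import("dotenv"), build config ... */
});

// Dependents must now await it before using anything config defines:
const init_agent = esm(async () => {
  await init_config();
  /* ... define CoreAgent ... */
});
```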
@@ -6408,7 +6400,7 @@ ${projectInstructions}`,
  cacheCreationTokens: usageFromResult.cacheCreationTokens
  };
  queryUsage = {
- model: finalUsage.model ?? resolvedModel ?? config.anthropicModelName ?? "claude-sonnet-4-20250514",
+ model: finalUsage.model ?? resolvedModel ?? config.anthropicModelName ?? "claude-opus-4-5",
  numTurns: msg.num_turns,
  durationMs: msg.duration_ms,
  inputTokens: finalUsage.inputTokens || 0,
@@ -6498,17 +6490,10 @@ ${projectInstructions}`,
  async resolveClaudeCodePath() {
  const fs5 = await import("fs/promises");
  let claudeCodePath;
- const bundledPath = join6(dirname(import.meta.url.replace("file://", "")), "claude-code-cli.js");
- try {
- await fs5.access(bundledPath);
- claudeCodePath = bundledPath;
- this.presenter.onLog(`Bundled mode: ${claudeCodePath}`);
- } catch {
- const require2 = createRequire(import.meta.url);
- const sdkPath = require2.resolve("@anthropic-ai/claude-agent-sdk/sdk.mjs");
- claudeCodePath = join6(dirname(sdkPath), "cli.js");
- this.presenter.onLog(`Development mode: ${claudeCodePath}`);
- }
+ const require2 = createRequire(import.meta.url);
+ const sdkPath = require2.resolve("@anthropic-ai/claude-agent-sdk/sdk.mjs");
+ claudeCodePath = join6(dirname(sdkPath), "cli.js");
+ this.presenter.onLog(`Using SDK CLI: ${claudeCodePath}`);
  if (config.claudeCodeExecutablePath) {
  claudeCodePath = config.claudeCodeExecutablePath;
  this.presenter.onLog(
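This hunk deletes the bundled-CLI fallback; the path is now always derived from wherever Node resolves the SDK package, with the `SUPATEST_CLAUDE_CODE_PATH` override still taking precedence. The resolution technique in isolation, assuming only the package and file names shown in the diff:

```typescript
import { createRequire } from "module";
import { dirname, join } from "path";

// createRequire gives an ESM module a CommonJS-style require whose
// .resolve() returns the absolute on-disk path without importing the file.
const req = createRequire(import.meta.url);
const sdkEntry = req.resolve("@anthropic-ai/claude-agent-sdk/sdk.mjs");

// cli.js ships as a sibling of sdk.mjs inside the installed package.
const cliPath = join(dirname(sdkEntry), "cli.js");
```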
@@ -6571,13 +6556,8 @@ var init_react = __esm({
  "use strict";
  init_shared_es();
  CONTEXT_WINDOWS2 = {
- "claude-sonnet-4-20250514": 2e5,
- "claude-sonnet-4-5-20250929": 2e5,
- "claude-opus-4-20250514": 2e5,
- "claude-3-5-sonnet-20241022": 2e5,
- "claude-3-opus-20240229": 2e5,
- "claude-3-sonnet-20240229": 2e5,
- "claude-3-haiku-20240307": 2e5
+ "claude-opus-4-5": 2e5,
+ "claude-sonnet-4-5": 2e5
  };
  ReactPresenter = class {
  callbacks;
@@ -11444,10 +11424,10 @@ async function runInteractive(config2) {
  }
  var AgentRunner, InteractiveAppContent, InteractiveApp;
  var init_interactive = __esm({
- "src/modes/interactive.tsx"() {
+ async "src/modes/interactive.tsx"() {
  "use strict";
- init_config();
- init_agent();
+ await init_config();
+ await init_agent();
  init_message_bridge();
  init_react();
  init_api_client();
@@ -11766,10 +11746,10 @@ var init_interactive = __esm({
  });
 
  // src/index.ts
- init_config();
+ await init_config();
  init_shared_es();
  init_setup();
- init_config();
+ await init_config();
  import { Command } from "commander";
 
  // src/modes/headless.ts
@@ -11779,7 +11759,7 @@ import { render } from "ink";
  import React14 from "react";
 
  // src/ui/HeadlessApp.tsx
- init_agent();
+ await init_agent();
  init_react();
  init_MessageList();
  init_SessionContext();
@@ -12321,7 +12301,7 @@ program.name("supatest").description(
  });
  process.exit(result.success ? 0 : 1);
  } else {
- const { runInteractive: runInteractive2 } = await Promise.resolve().then(() => (init_interactive(), interactive_exports));
+ const { runInteractive: runInteractive2 } = await init_interactive().then(() => interactive_exports);
  await runInteractive2({
  task: prompt || "",
  // Empty string if no task provided (will use input prompt)
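This last hunk matters because `init_interactive` is now async: the old `(init_interactive(), interactive_exports)` comma expression discarded the init promise, so the exports could be handed back before initialization finished. A toy reproduction of the ordering hazard (all names invented):

```typescript
const exportsObj: { run?: () => void } = {};

async function initModule(): Promise<void> {
  await new Promise((r) => setTimeout(r, 10)); // stands in for real async init work
  exportsObj.run = () => console.log("ready");
}

// Old shape: the comma operator drops the promise, so the exports come
// back before init has finished - run is still undefined here.
const tooEarly = await Promise.resolve().then(() => (initModule(), exportsObj));
console.log(typeof tooEarly.run); // "undefined"

// New shape: await the initializer, then hand back the exports.
const ready = await initModule().then(() => exportsObj);
console.log(typeof ready.run); // "function"
```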