npm - @bbearai/ai-executor - Versions diffs - 0.2.0 → 0.2.1 - Mend

@bbearai/ai-executor 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

package/dist/chunk-WT22IQMS.mjs +175 -0
package/dist/chunk-WT22IQMS.mjs.map +1 -0
package/dist/cli.js +622 -129
package/dist/cli.js.map +1 -1
package/dist/index.d.mts +533 -8
package/dist/index.d.ts +533 -8
package/dist/index.js +1613 -131
package/dist/index.js.map +1 -1
package/dist/index.mjs +1411 -130
package/dist/index.mjs.map +1 -1
package/dist/report-generator-EVZEB33O.mjs +7 -0
package/dist/report-generator-EVZEB33O.mjs.map +1 -0
package/package.json +5 -1

package/dist/index.d.ts CHANGED

@@ -1,5 +1,5 @@
-import { Stagehand, Page } from '@browserbasehq/stagehand';
 import Anthropic from '@anthropic-ai/sdk';
+import { Stagehand, Page } from '@browserbasehq/stagehand';
 /**
  * @bbearai/ai-executor - Type definitions
@@ -50,11 +50,33 @@ interface FormLoginAuth {
     /** Password to enter */
     password: string;
 }
-type AuthConfig = CookieAuth | LocalStorageAuth | FormLoginAuth;
+interface SupabaseNativeAuth {
+    type: 'supabase-native';
+    /** Supabase project URL (e.g. https://xyz.supabase.co) */
+    supabaseUrl: string;
+    /** Supabase anon/public key (required for GoTrue REST API) */
+    anonKey: string;
+    /** Email for Supabase auth */
+    email: string;
+    /** Password for Supabase auth */
+    password: string;
+}
+type AuthConfig = CookieAuth | LocalStorageAuth | FormLoginAuth | SupabaseNativeAuth;
+type StepActionType = 'click' | 'fill' | 'select' | 'navigate' | 'scroll' | 'wait' | 'assert';
 interface TestStep {
     stepNumber: number;
     action: string;
     expectedResult: string;
+    /** Deterministic action type — when set with selector, bypasses AI */
+    actionType?: StepActionType;
+    /** CSS selector or data-testid for deterministic execution */
+    selector?: string;
+    /** Value for fill/select/navigate actions */
+    value?: string;
+    /** Explicit wait after action (ms) */
+    waitMs?: number;
+    /** Hint for the vision evaluator on what to look for */
+    evaluationHint?: string;
 }
 interface TestCaseInput {
     id: string;
@@ -99,6 +121,13 @@ interface NetworkError {
     /** Timestamp relative to step start (ms) */
     timestamp: number;
 }
+/** A single retry attempt record */
+interface RetryAttempt {
+    attempt: number;
+    error: string;
+    confidence: number;
+    timestamp: number;
+}
 interface StepResult {
     stepNumber: number;
     action: string;
@@ -121,6 +150,20 @@ interface StepResult {
     consoleLogs: ConsoleEntry[];
     /** Failed/errored network requests during this step */
     networkErrors: NetworkError[];
+    /** Number of retry attempts (0 = succeeded on first try) */
+    retryCount: number;
+    /** History of failed retry attempts before the final result */
+    retryHistory: RetryAttempt[];
+    /** Whether this step was skipped in resilient mode after failing */
+    skipped: boolean;
+    /** Reason the step was skipped */
+    skipReason?: string;
+}
+interface RetryConfig {
+    /** Maximum retry attempts per step (default: 2) */
+    maxRetries?: number;
+    /** Delay between retries in ms (default: 2000) */
+    retryDelayMs?: number;
 }
 interface TestRunConfig {
     /** Base URL of the application under test */
@@ -141,13 +184,17 @@ interface TestRunConfig {
         anonKey: string;
         projectId: string;
     };
+    /** Retry configuration for transient failures */
+    retry?: RetryConfig;
+    /** Enable skip-and-recover mode: failed steps are skipped and page state is recovered (default: true) */
+    resilientMode?: boolean;
     /** Callback for real-time progress updates */
     onStepComplete?: (result: StepResult, index: number, total: number) => void;
     /** Callback for status changes */
     onStatusChange?: (status: TestRunStatus) => void;
 }
 type TestRunStatus = 'initializing' | 'navigating' | 'authenticating' | 'executing' | 'evaluating' | 'completed' | 'error';
-type OverallResult = 'passed' | 'failed' | 'error' | 'partial';
+type OverallResult = 'passed' | 'failed' | 'error' | 'partial' | 'passed_with_skips';
 interface TestRunResult {
     testCaseId: string;
     testCaseTitle: string;
@@ -166,15 +213,102 @@ interface TestRunResult {
     /** Browserbase session ID (if applicable) */
     browserSessionId?: string;
 }
+type FindingCategory = 'console_error' | 'broken_interaction' | 'visual_anomaly' | 'input_handling';
+type FindingSeverity = 'critical' | 'high' | 'medium' | 'low';
+interface ExplorationConfig {
+    targetUrl: string;
+    featureDescription: string;
+    actionBudget: number;
+    auth?: AuthConfig;
+    browserConfig: BrowserConfig;
+    anthropicApiKey: string;
+    model?: string;
+    onActionComplete?: (action: ExplorationAction, index: number) => void;
+}
+interface ExplorationAction {
+    actionNumber: number;
+    action: string;
+    category: FindingCategory | 'normal';
+    severity?: FindingSeverity;
+    confidence: number;
+    description: string;
+    screenshotBefore: Buffer;
+    screenshotAfter: Buffer;
+    networkRequests: CapturedRequest[];
+    consoleLogs: ConsoleEntry[];
+    domContext?: DomContext;
+    durationMs: number;
+}
+interface CapturedRequest {
+    method: string;
+    url: string;
+    status: number;
+    responseBody?: string;
+    requestBody?: string;
+    timestamp: string;
+}
+interface DomContext {
+    selector: string;
+    elementText: string;
+    nearbyText: string;
+}
+interface ActionableFinding {
+    title: string;
+    category: FindingCategory;
+    severity: FindingSeverity;
+    confidence: number;
+    networkRequests: CapturedRequest[];
+    consoleErrors: ConsoleEntry[];
+    domContext?: DomContext;
+    url: string;
+    route: string;
+    reproSteps: string[];
+    screenshotUrl: string;
+    actionPerformed: string;
+    expectedBehavior: string;
+    actualBehavior: string;
+}
+interface ExplorationReport {
+    projectName: string;
+    featureDescription: string;
+    targetUrl: string;
+    exploredAt: string;
+    duration: string;
+    actionsUsed: number;
+    actionBudget: number;
+    findings: ActionableFinding[];
+    tested: {
+        description: string;
+        route: string;
+        status: 'passed';
+    }[];
+    notTested: {
+        description: string;
+        reason: string;
+    }[];
+    summary: string;
+    suggestedPrompt: string;
+}
+interface ExplorationResult {
+    overallResult: 'clean' | 'findings' | 'error';
+    actions: ExplorationAction[];
+    report: ExplorationReport;
+    totalDurationMs: number;
+    tokenUsage: {
+        inputTokens: number;
+        outputTokens: number;
+    };
+    browserSessionId?: string;
+}
 /**
  * Test Runner
  *
- * Orchestrates the full test execution lifecycle using Stagehand:
+ * Orchestrates the full test execution lifecycle:
  * 1. Launch Stagehand browser session
  * 2. Navigate to target URL
- * 3. Inject authentication
- * 4. For each step: act() → screenshot → extract() → record
+ * 3. Inject authentication (supports supabase-native, cookie, localStorage, form-login)
+ * 4. For each step: act() → screenshot → vision evaluate → record
  * 5. Generate summary
  * 6. Return structured results
  */
@@ -184,6 +318,40 @@ interface TestRunResult {
  */
 declare function runTest(config: TestRunConfig): Promise<TestRunResult>;
+/**
+ * Exploratory Testing Runner
+ *
+ * Implements the observe->act->evaluate loop for autonomous
+ * feature exploration. The AI navigates a feature area,
+ * tries edge cases, and reports findings.
+ */
+declare function runExploration(config: ExplorationConfig): Promise<ExplorationResult>;
+/**
+ * Exploration Report Generator
+ *
+ * Transforms raw exploration actions into a structured report
+ * optimized for Claude Code consumption. The suggestedPrompt
+ * is designed to be pasted directly into Claude Code to fix issues.
+ */
+interface ReportInput {
+    projectName: string;
+    featureDescription: string;
+    targetUrl: string;
+    actions: ExplorationAction[];
+    model: string;
+}
+interface ReportOutput {
+    report: ExplorationReport;
+    tokenUsage: {
+        inputTokens: number;
+        outputTokens: number;
+    };
+}
+declare function generateExplorationReport(anthropic: Anthropic, input: ReportInput): Promise<ReportOutput>;
 /**
  * Browser Provider
  *
@@ -204,17 +372,309 @@ interface StagehandSession {
  * and manages Browserbase or local browser sessions.
  */
 declare function createStagehandSession(config: BrowserConfig, anthropicApiKey: string): Promise<StagehandSession>;
+/**
+ * Suppress the BugBear widget in the browser session.
+ *
+ * Uses Playwright's addInitScript() on the browser context to set a suppression
+ * flag before any page script runs. This prevents the widget from rendering and
+ * interfering with test execution (clicking the widget instead of app UI, popups
+ * covering elements, etc.).
+ *
+ * The flag is checked by BugBearProvider and BugBearPanel in @bbearai/react.
+ */
+declare function suppressBugBearWidget(stagehand: Stagehand): Promise<void>;
 /**
  * Inject authentication into the browser session.
  * Uses Stagehand's Page API and CDP for cookie injection.
  */
 declare function injectAuth(page: Page, auth: AuthConfig, stagehand?: Stagehand): Promise<void>;
+interface NetworkCapture {
+    start: () => void;
+    stop: () => void;
+    getRequests: () => CapturedRequest[];
+    getErrors: () => NetworkError[];
+}
+/**
+ * Supabase Native Auth
+ *
+ * Authenticates against Supabase GoTrue API directly, bypassing fragile
+ * form-based login. Injects the session into localStorage so the app
+ * picks it up on page load — no DOM interaction required.
+ */
+/** Supabase GoTrue session shape (subset we need) */
+interface GoTrueSession {
+    access_token: string;
+    refresh_token: string;
+    expires_in: number;
+    expires_at: number;
+    token_type: string;
+    user: {
+        id: string;
+        email: string;
+        role: string;
+        aud: string;
+    };
+}
+/**
+ * Authenticate via Supabase GoTrue REST API and return the session.
+ */
+declare function authenticateSupabase(auth: SupabaseNativeAuth): Promise<GoTrueSession>;
+/**
+ * Inject a Supabase session into the browser's localStorage.
+ *
+ * The app's Supabase client reads from `sb-<ref>-auth-token` on load.
+ * We inject the token into localStorage so the app authenticates on
+ * the next page load — no DOM interaction needed.
+ */
+declare function injectSupabaseAuth(page: Page, auth: SupabaseNativeAuth, session: GoTrueSession): Promise<void>;
+/**
+ * Verify the session is valid by calling the Supabase user endpoint.
+ */
+declare function verifySupabaseSession(auth: SupabaseNativeAuth, accessToken: string): Promise<boolean>;
+/**
+ * Full Supabase auth flow: authenticate → inject → verify.
+ */
+declare function performSupabaseAuth(page: Page, auth: SupabaseNativeAuth): Promise<void>;
+/**
+ * Vision-Based Step Evaluator
+ *
+ * Replaces Stagehand's extract() with direct Claude Messages API calls
+ * using before/after screenshots. This gives Claude visual context
+ * instead of just DOM text, catching visual regressions, layout shifts,
+ * and rendering issues that DOM-only evaluation misses.
+ */
+interface StepEvaluationInput {
+    anthropic: Anthropic;
+    screenshotBefore: Buffer;
+    screenshotAfter: Buffer;
+    action: string;
+    expectedResult: string;
+    /** Optional hint to guide what the evaluator should look for */
+    evaluationHint?: string;
+    model?: string;
+}
+interface StepEvaluation {
+    passed: boolean;
+    confidence: number;
+    actualResult: string;
+}
+/**
+ * Evaluate a test step by comparing before/after screenshots using Claude Vision.
+ *
+ * Sends both screenshots as image content blocks along with a structured
+ * evaluation prompt. Returns a typed assessment with pass/fail, confidence,
+ * and a description of what actually happened.
+ */
+declare function evaluateStep(input: StepEvaluationInput): Promise<StepEvaluation>;
+/**
+ * Action Executor
+ *
+ * Executes test step actions using a tiered approach:
+ * 1. If step has selector + actionType → Playwright direct (deterministic, fast)
+ * 2. If step has only natural language → Stagehand AI fallback
+ * 3. If Playwright action fails → Fall back to Stagehand with the natural language
+ *
+ * This eliminates AI flakiness for steps that have been enriched with
+ * selectors while preserving AI flexibility for natural-language-only steps.
+ */
+interface ActionResult {
+    /** Whether the action was executed via Playwright (true) or Stagehand AI (false) */
+    deterministic: boolean;
+    /** Error message if action failed */
+    error?: string;
+}
+/**
+ * Execute a test step action, preferring deterministic Playwright
+ * when the step has a selector, falling back to Stagehand AI otherwise.
+ */
+declare function executeAction(page: Page, stagehand: Stagehand, step: TestStep): Promise<ActionResult>;
+/**
+ * Selector Discovery
+ *
+ * After Stagehand successfully executes a natural-language step,
+ * attempts to discover which element was interacted with. Records
+ * the best available selector so the test case can be enriched
+ * for deterministic execution next time.
+ *
+ * Discovery data is stored in ai_step_results.actions_taken (JSONB).
+ */
+interface DiscoveredSelector {
+    /** The selector that was discovered */
+    selector: string;
+    /** How the selector was derived */
+    strategy: 'data-testid' | 'role' | 'aria-label' | 'id' | 'css-path';
+    /** Suggested actionType based on the element */
+    suggestedActionType?: StepActionType;
+    /** Element tag name */
+    tagName: string;
+    /** Visible text content (truncated) */
+    textContent?: string;
+}
+/**
+ * Attempt to discover the selector for the last-interacted element.
+ *
+ * Uses page.evaluate() to find the currently focused or last-clicked
+ * element and extract the best available selector for it.
+ *
+ * Returns null if no element can be identified.
+ */
+declare function discoverSelector(page: Page): Promise<DiscoveredSelector | null>;
+/**
+ * Install a click tracker on the page.
+ *
+ * Records the last-clicked element in `document.__bbLastClicked`
+ * so discoverSelector() can find it after Stagehand's act().
+ * Should be called once after page navigation.
+ */
+declare function installClickTracker(page: Page): Promise<void>;
+/**
+ * Report Auto-Triager
+ *
+ * Uses Claude to analyze incoming bug reports and auto-assign:
+ * - Severity (critical/high/medium/low)
+ * - Category (ui_ux/functional/crash/security/other)
+ * - Duplicate detection against recent reports
+ * - Root cause analysis
+ *
+ * Results are stored in reports.ai_analysis (JSONB).
+ */
+interface TriageReportInput {
+    title?: string | null;
+    description: string;
+    app_context?: Record<string, unknown> | null;
+    enhanced_context?: Record<string, unknown> | null;
+    device_info?: Record<string, unknown> | null;
+    navigation_history?: unknown[] | null;
+    screenshot_urls?: string[] | null;
+    error_fingerprint?: string | null;
+    report_source?: string | null;
+}
+interface RecentReportSummary {
+    id: string;
+    title?: string | null;
+    description: string;
+    error_fingerprint?: string | null;
+    severity?: string | null;
+    category?: string | null;
+    status: string;
+}
+interface TriageInput {
+    anthropic: Anthropic;
+    report: TriageReportInput;
+    recentReports: RecentReportSummary[];
+    model?: string;
+}
+type TriageSeverity = 'critical' | 'high' | 'medium' | 'low';
+type TriageCategory = 'ui_ux' | 'functional' | 'crash' | 'security' | 'other';
+interface TriageResult {
+    suggested_severity: TriageSeverity;
+    severity_confidence: number;
+    suggested_category: TriageCategory;
+    category_confidence: number;
+    root_cause_analysis: string;
+    duplicate_of: string | null;
+    duplicate_confidence: number;
+    triage_notes: string;
+}
+/**
+ * Analyze a report using Claude and return triage suggestions.
+ */
+declare function triageReport(input: TriageInput): Promise<TriageResult>;
+/**
+ * Failure Analyzer
+ *
+ * When an AI test step fails, analyzes the failure to classify it as:
+ * - real_bug: Actual application defect (API error, broken feature, crash)
+ * - test_maintenance: Test is stale (selector changed, page restructured)
+ * - flaky: Timing issue, intermittent network failure, race condition
+ * - unknown: Can't determine with sufficient confidence
+ *
+ * For test_maintenance failures, suggests corrected selectors/actions
+ * that can be auto-applied to heal the test case.
+ */
+type FailureClassification = 'real_bug' | 'test_maintenance' | 'ai_limitation' | 'flaky' | 'unknown';
+/** Run-level classification (aggregated from step-level classifications) */
+type RunFailureClassification = 'bug' | 'test_issue' | 'ai_limitation' | 'flaky' | 'unknown';
+interface FailureAnalysis {
+    classification: FailureClassification;
+    confidence: number;
+    reasoning: string;
+    suggested_fix?: {
+        stepNumber: number;
+        original_action: string;
+        corrected_action?: string;
+        corrected_selector?: string;
+        corrected_actionType?: string;
+        corrected_value?: string;
+    };
+}
+interface FailureAnalysisInput {
+    anthropic: Anthropic;
+    step: {
+        stepNumber: number;
+        action: string;
+        expectedResult: string;
+        selector?: string;
+        actionType?: string;
+        value?: string;
+    };
+    result: {
+        actualResult: string;
+        error?: string;
+        screenshotBefore: Buffer;
+        screenshotAfter: Buffer;
+    };
+    discoveredSelector?: {
+        selector: string;
+        strategy: string;
+        tagName?: string;
+        textContent?: string;
+    };
+    consoleLogs?: Array<{
+        level: string;
+        text: string;
+    }>;
+    networkErrors?: Array<{
+        method: string;
+        url: string;
+        status: number;
+        statusText: string;
+    }>;
+    model?: string;
+}
+/**
+ * Analyze a failed test step to classify the failure and suggest fixes.
+ */
+declare function analyzeFailure(input: FailureAnalysisInput): Promise<FailureAnalysis>;
+/**
+ * Roll up step-level failure classifications into a single run-level classification.
+ *
+ * Priority:
+ *   1. ANY step = real_bug → 'bug'
+ *   2. ALL steps = ai_limitation → 'ai_limitation'
+ *   3. ALL steps = test_maintenance → 'test_issue'
+ *   4. ALL steps = flaky → 'flaky'
+ *   5. Otherwise → most common classification (mapped to run-level)
+ */
+declare function rollupFailureClassification(stepClassifications: FailureClassification[]): RunFailureClassification;
 /**
  * Result Evaluator
  *
  * Generates AI summaries of test run results.
- * Step-level evaluation is now handled by Stagehand's extract() in the runner.
+ * Step-level evaluation is handled by vision-evaluator.ts (Claude Vision).
  */
 /**
@@ -228,6 +688,71 @@ declare function generateRunSummary(anthropic: Anthropic, testTitle: string, ste
     passed: boolean;
     confidence: number;
     error?: string;
+    skipped?: boolean;
 }>, model: string): Promise<string>;
-export { type AuthConfig, type BrowserConfig, type BrowserProvider, type ConsoleEntry, type CookieAuth, type FormLoginAuth, type LocalStorageAuth, type NetworkError, type OverallResult, type StagehandSession, type StepAction, type StepResult, type TestCaseInput, type TestRunConfig, type TestRunResult, type TestRunStatus, type TestStep, createStagehandSession, generateRunSummary, injectAuth, runTest };
+/**
+ * Simple counting semaphore for controlling concurrent browser sessions.
+ *
+ * Usage:
+ *   const sem = new Semaphore(3);
+ *   await sem.acquire();
+ *   try { ... } finally { sem.release(); }
+ */
+declare class Semaphore {
+    private readonly max;
+    private current;
+    private queue;
+    constructor(max: number);
+    acquire(): Promise<void>;
+    release(): void;
+    /** Number of slots currently in use */
+    get active(): number;
+    /** Number of waiters in the queue */
+    get waiting(): number;
+}
+/**
+ * AI Test Execution Cost Estimation
+ *
+ * Provides pre-run cost estimates and post-run cost calculations
+ * based on per-model token pricing and calibrated usage profiles.
+ */
+interface CostEstimate {
+    /** Cost in cents (USD) */
+    cents: number;
+    /** Formatted string (e.g., "$0.12") */
+    formatted: string;
+    /** Token breakdown */
+    tokens: {
+        inputTokens: number;
+        outputTokens: number;
+    };
+    /** Model used for estimate */
+    model: string;
+}
+/**
+ * Calculate actual cost from known token counts.
+ */
+declare function estimateCost(inputTokens: number, outputTokens: number, model?: string): CostEstimate;
+/**
+ * Pre-run cost estimate based on step count.
+ * Each step involves: act() + extract(). Plus one summary at the end.
+ */
+declare function estimateTestCost(stepCount: number, model?: string): CostEstimate;
+/**
+ * Estimate cost for a batch of test cases.
+ */
+declare function estimateBatchCost(testCases: Array<{
+    stepCount: number;
+}>, model?: string): CostEstimate;
+/**
+ * Get calibrated token estimates for a test with N steps.
+ * More accurate than the old hardcoded 3000/500 per step.
+ */
+declare function getTokenEstimate(stepCount: number): {
+    inputTokens: number;
+    outputTokens: number;
+};
+export { type ActionResult, type ActionableFinding, type AuthConfig, type BrowserConfig, type BrowserProvider, type CapturedRequest, type ConsoleEntry, type CookieAuth, type CostEstimate, type DiscoveredSelector, type DomContext, type ExplorationAction, type ExplorationConfig, type ExplorationReport, type ExplorationResult, type FailureAnalysis, type FailureAnalysisInput, type FailureClassification, type FindingCategory, type FindingSeverity, type FormLoginAuth, type LocalStorageAuth, type NetworkCapture, type NetworkError, type OverallResult, type RecentReportSummary, type RetryAttempt, type RetryConfig, type RunFailureClassification, Semaphore, type StagehandSession, type StepAction, type StepActionType, type StepEvaluation, type StepEvaluationInput, type StepResult, type SupabaseNativeAuth, type TestCaseInput, type TestRunConfig, type TestRunResult, type TestRunStatus, type TestStep, type TriageCategory, type TriageInput, type TriageReportInput, type TriageResult, type TriageSeverity, analyzeFailure, authenticateSupabase, createStagehandSession, discoverSelector, estimateBatchCost, estimateCost, estimateTestCost, evaluateStep, executeAction, generateExplorationReport, generateRunSummary, getTokenEstimate, injectAuth, injectSupabaseAuth, installClickTracker, performSupabaseAuth, rollupFailureClassification, runExploration, runTest, suppressBugBearWidget, triageReport, verifySupabaseSession };