greenrun-cli 0.1.6 → 0.2.0

@@ -12,14 +12,32 @@ export declare class ApiClient {
         name: string;
         base_url?: string;
         description?: string;
-        concurrency?: number;
+        auth_mode?: string;
+        login_url?: string;
+        register_url?: string;
+        login_instructions?: string;
+        register_instructions?: string;
+        credentials?: {
+            name: string;
+            email: string;
+            password: string;
+        }[];
     }): Promise<unknown>;
     getProject(id: string): Promise<unknown>;
     updateProject(id: string, data: {
         name?: string;
         base_url?: string;
         description?: string;
-        concurrency?: number;
+        auth_mode?: string;
+        login_url?: string;
+        register_url?: string;
+        login_instructions?: string;
+        register_instructions?: string;
+        credentials?: {
+            name: string;
+            email: string;
+            password: string;
+        }[];
     }): Promise<unknown>;
     deleteProject(id: string): Promise<unknown>;
     listPages(projectId: string): Promise<unknown>;
@@ -39,6 +57,7 @@ export declare class ApiClient {
         page_ids?: string[];
         status?: string;
         tags?: string[];
+        credential_name?: string;
     }): Promise<unknown>;
     getTest(id: string): Promise<unknown>;
     updateTest(id: string, data: {
@@ -47,6 +66,9 @@ export declare class ApiClient {
         page_ids?: string[];
         status?: string;
         tags?: string[];
+        credential_name?: string | null;
+        script?: string | null;
+        script_generated_at?: string | null;
     }): Promise<unknown>;
     deleteTest(id: string): Promise<unknown>;
     sweep(projectId: string, params: {
@@ -60,4 +82,28 @@ export declare class ApiClient {
     }): Promise<unknown>;
     getRun(runId: string): Promise<unknown>;
     listRuns(testId: string): Promise<unknown>;
+    prepareTestBatch(projectId: string, filter?: string, testIds?: string[]): Promise<{
+        project: {
+            id: any;
+            name: any;
+            base_url: any;
+            auth_mode: any;
+            login_url: any;
+            register_url: any;
+            login_instructions: any;
+            register_instructions: any;
+            credentials: any;
+        };
+        tests: {
+            test_id: any;
+            test_name: any;
+            run_id: any;
+            instructions: any;
+            credential_name: any;
+            pages: any;
+            tags: any;
+            script: any;
+            script_generated_at: any;
+        }[];
+    }>;
 }
@@ -102,4 +102,70 @@ export class ApiClient {
     async listRuns(testId) {
         return this.request('GET', `/tests/${testId}/runs`);
     }
+    // Batch operations
+    async prepareTestBatch(projectId, filter, testIds) {
+        const [projectResult, testsResult] = await Promise.all([
+            this.getProject(projectId),
+            this.listTests(projectId),
+        ]);
+        const project = projectResult.project;
+        let tests = (testsResult.tests || []).filter((t) => t.status === 'active');
+        if (testIds && testIds.length > 0) {
+            const idSet = new Set(testIds);
+            tests = tests.filter((t) => idSet.has(t.id));
+        }
+        else if (filter) {
+            if (filter.startsWith('tag:')) {
+                const tag = filter.slice(4).toLowerCase();
+                tests = tests.filter((t) => (t.tags || []).some((tg) => (tg.name || tg).toLowerCase() === tag));
+            }
+            else if (filter.startsWith('/')) {
+                tests = tests.filter((t) => (t.pages || []).some((p) => (p.url || '').includes(filter)));
+            }
+            else {
+                const term = filter.toLowerCase();
+                tests = tests.filter((t) => (t.name || '').toLowerCase().includes(term));
+            }
+        }
+        if (tests.length === 0) {
+            return {
+                project: {
+                    id: project.id, name: project.name, base_url: project.base_url,
+                    auth_mode: project.auth_mode ?? 'none',
+                    login_url: project.login_url ?? null,
+                    register_url: project.register_url ?? null,
+                    login_instructions: project.login_instructions ?? null,
+                    register_instructions: project.register_instructions ?? null,
+                    credentials: project.credentials ?? null,
+                },
+                tests: [],
+            };
+        }
+        // Fetch full test details in parallel
+        const fullTests = await Promise.all(tests.map((t) => this.getTest(t.id)));
+        // Start runs in parallel
+        const runs = await Promise.all(tests.map((t) => this.startRun(t.id)));
+        return {
+            project: {
+                id: project.id, name: project.name, base_url: project.base_url,
+                auth_mode: project.auth_mode ?? 'none',
+                login_url: project.login_url ?? null,
+                register_url: project.register_url ?? null,
+                login_instructions: project.login_instructions ?? null,
+                register_instructions: project.register_instructions ?? null,
+                credentials: project.credentials ?? null,
+            },
+            tests: fullTests.map((ft, i) => ({
+                test_id: ft.test.id,
+                test_name: ft.test.name,
+                run_id: runs[i].run.id,
+                instructions: ft.test.instructions,
+                credential_name: ft.test.credential_name ?? null,
+                pages: ft.test.pages || [],
+                tags: ft.test.tags || [],
+                script: ft.test.script ?? null,
+                script_generated_at: ft.test.script_generated_at ?? null,
+            })),
+        };
+    }
 }
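Worth noting for consumers: `prepareTestBatch` is not read-only. It calls `startRun` for every matched test as a side effect, so calling it commits you to reporting those runs. A minimal sketch of the three selection modes, assuming an already-constructed `ApiClient` (its constructor is outside this diff, so it is elided):

```ts
// Sketch only: `api` stands in for a constructed ApiClient instance.
declare const api: {
  prepareTestBatch(projectId: string, filter?: string, testIds?: string[]): Promise<{
    tests: { test_id: string; test_name: string; run_id: string }[];
  }>;
};

const byTag = await api.prepareTestBatch('project-uuid', 'tag:smoke'); // tag filter
const byUrl = await api.prepareTestBatch('project-uuid', '/checkout'); // page-URL filter
const byName = await api.prepareTestBatch('project-uuid', 'login');    // name substring

// Explicit IDs take precedence over any filter:
const picked = await api.prepareTestBatch('project-uuid', undefined, ['test-uuid-1']);

// Every matched test comes back with a run already started:
for (const t of picked.tests) console.log(t.test_name, t.run_id);
```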
@@ -44,7 +44,7 @@ function checkPrerequisites() {
     catch {
         // not installed
     }
-    return { claude, chromeHint: true };
+    return { claude };
 }
 async function validateToken(token) {
     try {
@@ -74,6 +74,15 @@ function configureMcpLocal(token) {
         console.error(`  claude mcp add greenrun --transport stdio -e GREENRUN_API_TOKEN=${token} -- npx -y greenrun-cli@latest\n`);
     }
 }
+function configurePlaywrightMcp() {
+    try {
+        execSync('claude mcp add playwright -- npx @playwright/mcp@latest --browser chrome --user-data-dir ~/.greenrun/browser-profile', { stdio: 'inherit' });
+    }
+    catch {
+        console.error('\nFailed to add Playwright MCP. You can add it manually:\n');
+        console.error('  claude mcp add playwright -- npx @playwright/mcp@latest --browser chrome --user-data-dir ~/.greenrun/browser-profile\n');
+    }
+}
 function configureMcpProject(token) {
     const mcpConfig = {
         mcpServers: {
@@ -127,7 +136,9 @@ function installClaudeMd() {
     if (existsSync(claudeMdPath)) {
         const existing = readFileSync(claudeMdPath, 'utf-8');
         if (existing.includes('## Greenrun')) {
-            console.log('  CLAUDE.md already contains Greenrun section, skipping');
+            const updated = existing.replace(/## Greenrun[\s\S]*$/, snippet.trimEnd());
+            writeFileSync(claudeMdPath, updated.endsWith('\n') ? updated : updated + '\n');
+            console.log('  Replaced Greenrun section in CLAUDE.md');
             return;
         }
         appendFileSync(claudeMdPath, '\n' + snippet);
@@ -155,6 +166,7 @@ function installSettings() {
         'mcp__greenrun__list_projects',
         'mcp__greenrun__get_project',
         'mcp__greenrun__create_project',
+        'mcp__greenrun__update_project',
         'mcp__greenrun__list_pages',
         'mcp__greenrun__create_page',
         'mcp__greenrun__list_tests',
@@ -166,19 +178,34 @@ function installSettings() {
         'mcp__greenrun__get_run',
         'mcp__greenrun__list_runs',
         'mcp__greenrun__sweep',
+        'mcp__greenrun__prepare_test_batch',
     ];
     const browserTools = [
-        'mcp__claude-in-chrome__tabs_context_mcp',
-        'mcp__claude-in-chrome__tabs_create_mcp',
-        'mcp__claude-in-chrome__navigate',
-        'mcp__claude-in-chrome__computer',
-        'mcp__claude-in-chrome__read_page',
-        'mcp__claude-in-chrome__find',
-        'mcp__claude-in-chrome__form_input',
-        'mcp__claude-in-chrome__javascript_tool',
-        'mcp__claude-in-chrome__get_page_text',
-        'mcp__claude-in-chrome__read_console_messages',
-        'mcp__claude-in-chrome__read_network_requests',
+        'mcp__playwright__browser_navigate',
+        'mcp__playwright__browser_snapshot',
+        'mcp__playwright__browser_click',
+        'mcp__playwright__browser_type',
+        'mcp__playwright__browser_handle_dialog',
+        'mcp__playwright__browser_tab_list',
+        'mcp__playwright__browser_tab_new',
+        'mcp__playwright__browser_tab_select',
+        'mcp__playwright__browser_tab_close',
+        'mcp__playwright__browser_select_option',
+        'mcp__playwright__browser_hover',
+        'mcp__playwright__browser_drag',
+        'mcp__playwright__browser_press_key',
+        'mcp__playwright__browser_screenshot',
+        'mcp__playwright__browser_wait',
+        'mcp__playwright__browser_file_upload',
+        'mcp__playwright__browser_pdf_save',
+        'mcp__playwright__browser_close',
+        'mcp__playwright__browser_console_messages',
+        'mcp__playwright__browser_resize',
+        'mcp__playwright__browser_run_code',
+        'mcp__playwright__browser_evaluate',
+        'mcp__playwright__browser_fill_form',
+        'mcp__playwright__browser_tabs',
+        'mcp__playwright__browser_network_requests',
     ];
     const requiredTools = [...greenrunTools, ...browserTools];
     existing.permissions = existing.permissions || {};
@@ -228,8 +255,7 @@ export async function runInit(args) {
             process.exit(1);
         }
     }
-    console.log('  [i] Claude in Chrome extension required for browser test execution');
-    console.log('      Get it at: https://chromewebstore.google.com/detail/claude-in-chrome\n');
+    console.log('  [i] Playwright MCP will be configured for browser test execution\n');
     let token = opts.token;
     let scope = opts.scope;
     if (interactive) {
@@ -283,14 +309,15 @@ export async function runInit(args) {
        scope = scope || 'local';
    }
    // Configure MCP
-    console.log('Configuring MCP server...');
+    console.log('Configuring MCP servers...');
    if (scope === 'project') {
        configureMcpProject(token);
    }
    else {
        configureMcpLocal(token);
    }
-    console.log('  MCP server configured.\n');
+    configurePlaywrightMcp();
+    console.log('  MCP servers configured.\n');
    // Install extras
    if (opts.claudeMd) {
        installClaudeMd();
@@ -302,7 +329,7 @@ export async function runInit(args) {
    console.log(`
Done! Restart Claude Code to connect.

-Make sure Chrome is open with the Claude in Chrome extension active
-before running /greenrun - Claude needs browser access to execute tests.
+Playwright will launch a Chrome browser automatically when running tests.
+Run /greenrun to execute your test suite.
`);
}
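One edge of the new `installClaudeMd` logic worth flagging: the replacement regex is anchored to end-of-file, so it assumes the Greenrun section is the last thing in CLAUDE.md. A quick illustration (file contents hypothetical):

```ts
// The regex from installClaudeMd above, applied to a file where another
// section follows the Greenrun one.
const existing = '# Project\n\n## Greenrun\nold content\n\n## Notes\nkeep me?\n';
const snippet = '## Greenrun\nnew content\n';

const updated = existing.replace(/## Greenrun[\s\S]*$/, snippet.trimEnd());
console.log(updated);
// '# Project\n\n## Greenrun\nnew content'
// The trailing '## Notes' section is swallowed too, because [\s\S]*$ matches
// everything from the heading to the end of the file.
```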
package/dist/server.js CHANGED
@@ -26,7 +26,16 @@ export async function startServer() {
     name: z.string().describe('Project name'),
     base_url: z.string().optional().describe('Base URL of the site (e.g. https://myapp.com)'),
     description: z.string().optional().describe('Project description'),
-    concurrency: z.number().int().min(1).max(20).optional().describe('Number of tests to run in parallel (default: 5)'),
+    auth_mode: z.enum(['none', 'existing_user', 'new_user']).optional().describe('How to authenticate before tests (default: none)'),
+    login_url: z.string().optional().describe('URL of login page (for existing_user auth mode)'),
+    register_url: z.string().optional().describe('URL of registration page (for new_user auth mode)'),
+    login_instructions: z.string().optional().describe('Steps to log in with existing credentials'),
+    register_instructions: z.string().optional().describe('Steps to register a new user'),
+    credentials: z.array(z.object({
+        name: z.string().describe('Credential set name (e.g. "admin", "viewer")'),
+        email: z.string().describe('Login email'),
+        password: z.string().describe('Login password'),
+    })).optional().describe('Named credential sets for test authentication (max 20)'),
 }, async (args) => {
     const result = await api.createProject(args);
     return { content: [{ type: 'text', text: JSON.stringify(result, null, 2) }] };
@@ -35,6 +44,26 @@ export async function startServer() {
     const result = await api.getProject(args.project_id);
     return { content: [{ type: 'text', text: JSON.stringify(result, null, 2) }] };
 });
+server.tool('update_project', 'Update project settings', {
+    project_id: z.string().describe('Project UUID'),
+    name: z.string().optional().describe('Updated project name'),
+    base_url: z.string().optional().describe('Updated base URL'),
+    description: z.string().optional().describe('Updated description'),
+    auth_mode: z.enum(['none', 'existing_user', 'new_user']).optional().describe('How to authenticate before tests'),
+    login_url: z.string().optional().describe('URL of login page (for existing_user auth mode)'),
+    register_url: z.string().optional().describe('URL of registration page (for new_user auth mode)'),
+    login_instructions: z.string().optional().describe('Steps to log in with existing credentials'),
+    register_instructions: z.string().optional().describe('Steps to register a new user'),
+    credentials: z.array(z.object({
+        name: z.string().describe('Credential set name (e.g. "admin", "viewer")'),
+        email: z.string().describe('Login email'),
+        password: z.string().describe('Login password'),
+    })).optional().describe('Named credential sets for test authentication (max 20)'),
+}, async (args) => {
+    const { project_id, ...data } = args;
+    const result = await api.updateProject(project_id, data);
+    return { content: [{ type: 'text', text: JSON.stringify(result, null, 2) }] };
+});
 // --- Pages ---
 server.tool('list_pages', 'List pages in a project', { project_id: z.string().describe('Project UUID') }, async (args) => {
     const result = await api.listPages(args.project_id);
@@ -64,6 +93,7 @@ export async function startServer() {
     page_ids: z.array(z.string()).optional().describe('UUIDs of pages this test covers'),
     status: z.enum(['draft', 'active', 'archived']).optional().describe('Test status (default: active)'),
     tags: z.array(z.string()).optional().describe('Tag names for organizing tests (e.g. ["smoke", "auth"])'),
+    credential_name: z.string().optional().describe('Name of a credential set from the project to use for authentication'),
 }, async (args) => {
     const result = await api.createTest(args.project_id, {
         name: args.name,
@@ -71,6 +101,7 @@ export async function startServer() {
         page_ids: args.page_ids,
         status: args.status,
         tags: args.tags,
+        credential_name: args.credential_name,
     });
     return { content: [{ type: 'text', text: JSON.stringify(result, null, 2) }] };
 });
@@ -81,6 +112,9 @@ export async function startServer() {
     page_ids: z.array(z.string()).optional().describe('Updated page UUIDs (replaces existing)'),
     status: z.enum(['draft', 'active', 'archived']).optional().describe('Updated status'),
     tags: z.array(z.string()).optional().describe('Updated tag names (replaces existing tags)'),
+    credential_name: z.string().optional().nullable().describe('Name of a credential set from the project to use for authentication'),
+    script: z.string().optional().nullable().describe('Generated Playwright test script'),
+    script_generated_at: z.string().optional().nullable().describe('ISO timestamp when the script was generated'),
 }, async (args) => {
     const { test_id, ...data } = args;
     const result = await api.updateTest(test_id, data);
@@ -98,6 +132,20 @@ export async function startServer() {
     });
     return { content: [{ type: 'text', text: JSON.stringify(result, null, 2) }] };
 });
+// --- Batch ---
+server.tool('prepare_test_batch', 'Prepare a batch of tests for execution: lists tests, filters, fetches full details, and starts runs — all in one call. Returns everything needed to execute tests.', {
+    project_id: z.string().describe('Project UUID'),
+    filter: z.string().optional().describe('Filter: "tag:xxx" for tag, "/path" for page URL, or text for name substring'),
+    test_ids: z.array(z.string()).optional().describe('Specific test UUIDs to run (overrides filter)'),
+}, async (args) => {
+    try {
+        const result = await api.prepareTestBatch(args.project_id, args.filter, args.test_ids);
+        return { content: [{ type: 'text', text: JSON.stringify(result, null, 2) }] };
+    }
+    catch (error) {
+        return { content: [{ type: 'text', text: `Error: ${error.message}` }], isError: true };
+    }
+});
 // --- Test Runs ---
 server.tool('start_run', 'Start a test run (sets status to running)', { test_id: z.string().describe('Test UUID') }, async (args) => {
     const result = await api.startRun(args.test_id);
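For orientation, a hypothetical `update_project` invocation exercising the new auth schema (all values illustrative); the handler above strips `project_id` and forwards the rest to `api.updateProject`:

```ts
// Hypothetical tool arguments; the shape mirrors the zod schema registered above.
const updateProjectArgs = {
  project_id: 'project-uuid',
  auth_mode: 'existing_user' as const, // 'none' | 'existing_user' | 'new_user'
  login_url: 'https://myapp.com/login',
  login_instructions: 'Fill in email and password, then click "Sign in".',
  credentials: [
    { name: 'admin', email: 'admin@example.com', password: 'example-password' },
    { name: 'viewer', email: 'viewer@example.com', password: 'example-password' },
  ],
};
```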
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
     "name": "greenrun-cli",
-    "version": "0.1.6",
+    "version": "0.2.0",
     "description": "CLI and MCP server for Greenrun - browser test management for Claude Code",
     "type": "module",
     "main": "dist/server.js",
@@ -2,34 +2,44 @@

 ### Prerequisites

-- **Claude in Chrome extension** must be installed and active in your browser for test execution
+- **Playwright MCP** must be configured for browser test execution (`claude mcp add playwright -- npx @playwright/mcp@latest --browser chrome --user-data-dir ~/.greenrun/browser-profile`)
 - MCP server must be connected (check with `/mcp` in Claude Code)

 ### Available MCP Tools

 The Greenrun MCP server provides these tools:

-- **list_projects** / **get_project** / **create_project** - Manage projects
+- **list_projects** / **get_project** / **create_project** - Manage projects (includes auth configuration)
 - **list_pages** / **create_page** - Manage page URLs within a project
 - **list_tests** / **get_test** / **create_test** / **update_test** - Manage test cases
 - **start_run** / **complete_run** / **get_run** / **list_runs** - Execute and track test runs
 - **sweep** - Impact analysis: find tests affected by changed pages
+- **prepare_test_batch** - Batch prepare tests for execution (lists, filters, fetches details, starts runs in one call)

 ### Running Tests

-To run tests for this project:
+Use the `/greenrun` slash command to run all tests automatically, or `/greenrun tag:smoke` to filter.

-1. Use `list_projects` to find the project, then `list_tests` to get all tests
-2. For each test, call `get_test` to retrieve the full instructions
-3. Call `start_run` to begin a run (returns a run ID)
-4. Execute the test instructions using browser automation (Claude in Chrome)
-5. Call `complete_run` with the run ID, status (passed/failed/error), and a result summary
+To run tests manually:

-Or use the `/greenrun` slash command to run all tests automatically.
+1. Use `list_projects` to find the project
+2. Call `prepare_test_batch` with the project ID (and optional filter) to get test details and run IDs
+3. Execute each test's instructions using Playwright browser automation tools (`browser_navigate`, `browser_snapshot`, `browser_click`, `browser_type`)
+4. Call `complete_run` with the run ID, status (passed/failed/error), and a result summary
+
+### Auth Configuration
+
+Projects can be configured with authentication settings so tests auto-login before execution:
+
+- **`auth_mode: "none"`** - No authentication (default)
+- **`auth_mode: "existing_user"`** - Log in with existing credentials via `login_url` and `login_instructions`
+- **`auth_mode: "new_user"`** - Register a new account each run via `register_url` and `register_instructions`
+
+Projects can also store named **credentials** (name/email/password sets). Each test can reference a credential by `credential_name` to use specific login details during execution.

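To make the credential wiring concrete, a hypothetical pair of tool-argument payloads (names and values made up): the project stores the credential sets, and each test opts into one by name.

```ts
// Project-level: store named credential sets (via create_project or update_project).
const projectAuth = {
  auth_mode: 'existing_user',
  login_url: 'https://myapp.com/login',
  credentials: [{ name: 'admin', email: 'admin@example.com', password: 'example-password' }],
};

// Test-level: reference a credential set by name via create_test.
const adminTest = {
  project_id: 'project-uuid',
  name: 'Admin can open settings',
  instructions: '1. Go to /settings\n2. Verify the "Team" section is visible',
  credential_name: 'admin', // must match a name in the project's credentials array
};
```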
 ### Creating Tests

-1. Navigate to the page you want to test in Chrome
+1. Navigate to the page you want to test using Playwright
 2. Write clear, step-by-step test instructions describing what to do and what to verify
 3. Use `create_page` to register the page URL if not already registered
 4. Use `create_test` with the instructions and page IDs
@@ -36,4 +36,7 @@ Present the affected tests:

 ### 6. Offer to run

-Ask the user if they want to run the affected tests. If yes, read `.claude/commands/procedures.md` for the agent prompt template and execution procedures. Follow those procedures to pre-fetch test details, launch agents in batches, collect results, and summarize.
+Ask the user if they want to run the affected tests. If yes:
+
+1. Call `prepare_test_batch` with the project ID and `test_ids` set to the affected test IDs from the sweep results.
+2. Read `.claude/commands/procedures.md` and follow the Execute and Summarize procedures using the batch result.
@@ -2,27 +2,25 @@ Run Greenrun browser tests for this project in parallel.

 ## Instructions

-You are executing browser tests managed by Greenrun. Tests run in parallel using background agents, each with its own browser tab. Follow these steps precisely:
+You are executing browser tests managed by Greenrun. Follow these steps precisely:

 ### 1. Find the project

 Call `list_projects` to get all projects. Match the current project by name or base URL. If no match is found, tell the user and stop.

-Note the project's `concurrency` value (default: 5). This controls how many tests run simultaneously.
+### 2. Prepare test batch

-### 2. Get tests
+Call `prepare_test_batch` with the project ID.

-Call `list_tests` with the project ID. Each test has associated pages and tags which are organizational metadata for filtering.
+If the user specified an argument ("$ARGUMENTS"), pass it as the `filter` parameter:
+- `tag:smoke` → filters by tag
+- `/checkout` → filters by page URL
+- `login` → filters by test name

-If the user specified an argument ("$ARGUMENTS"), use it to filter tests:
-- If it starts with `/` (e.g. `/checkout`), filter to tests linked to a page matching that URL
-- If it starts with `tag:` (e.g. `tag:smoke`), filter to tests with that tag
-- Otherwise, treat it as a test name filter
+If no argument is given, omit the filter to run all active tests.

-If no argument is given, run all active tests.
-
-If there are no matching active tests, tell the user and stop.
+If the result has zero tests, tell the user and stop.

 ### 3. Execute tests

-Read `.claude/commands/procedures.md` for the agent prompt template and execution procedures. Follow those procedures to pre-fetch test details, launch agents in batches, collect results, and summarize.
+Read `.claude/commands/procedures.md` and follow the Execute and Summarize procedures using the batch result.
@@ -1,64 +1,335 @@
-Shared procedures for executing Greenrun browser tests in parallel. Referenced by `/greenrun` and `/greenrun-sweep`.
+Shared procedures for executing Greenrun browser tests. Referenced by `/greenrun` and `/greenrun-sweep`.

-## Pre-fetch
+## Authenticate

-Before launching agents, call `get_test` for ALL tests **in parallel** to get full instructions. Then call `start_run` for ALL tests **in parallel** to get run IDs.
+Before executing tests, handle authentication based on the project's `auth_mode` from the batch result.

-## Launch agents
+- **`none`** (or missing): Skip authentication entirely.
+- **`existing_user`**: Navigate to the project's `login_url` and follow the `login_instructions` step by step. Use `browser_snapshot` afterwards to verify the page shows an authenticated state (no login form visible).
+- **`new_user`**: Navigate to the project's `register_url` and follow the `register_instructions` step by step. Use `browser_snapshot` afterwards to verify registration succeeded and the user is authenticated.

-Split tests into batches of size `concurrency` (from project settings, default: 5).
+### Credentials

-For each batch, launch all tests simultaneously using the **Task tool** with `run_in_background: true`:
+The project may include a `credentials` array of named credential sets, each with `name`, `email`, and `password`. Each test may have a `credential_name` field referencing one of these sets.
+
+When authenticating for a test with `credential_name`:
+- Find the matching credential in `project.credentials` by name
+- Use that credential's email and password to fill the login form at `login_url`
+- If no `credential_name` is set on a test, use the first credential in the array (or fall back to `login_instructions`)
+
+When authenticating once for a batch (Step 1 below), use the credential that appears most frequently across the batch's tests. If tests use different credentials, re-authenticate between tests as needed.
+
+If auth fails (login form still visible after following instructions), report all tests as error with "Authentication failed" and stop.
+
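The selection rules in the added Credentials section reduce to a small lookup; a sketch (types illustrative, not part of the package):

```ts
type Credential = { name: string; email: string; password: string };
type BatchTest = { credential_name: string | null };

// Per-test rule: named credential if set, otherwise the first in the array.
function pickCredential(creds: Credential[], test: BatchTest): Credential | undefined {
  if (test.credential_name) {
    return creds.find((c) => c.name === test.credential_name);
  }
  return creds[0];
}

// Batch rule: log in once with the most frequently referenced credential.
function pickBatchCredential(creds: Credential[], tests: BatchTest[]): Credential | undefined {
  const counts = new Map<string, number>();
  for (const t of tests) {
    const name = t.credential_name ?? creds[0]?.name;
    if (name) counts.set(name, (counts.get(name) ?? 0) + 1);
  }
  const top = [...counts.entries()].sort((a, b) => b[1] - a[1])[0];
  return top ? creds.find((c) => c.name === top[0]) : undefined;
}
```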
+## Execute
+
+You have a batch result from `prepare_test_batch` containing `project` (with `credentials` array) and `tests[]` (each with `test_id`, `test_name`, `run_id`, `instructions`, `credential_name`, `pages`, `tags`, `script`, `script_generated_at`).
+
+If `tests` is empty, tell the user no matching active tests were found and stop.
+
+### Step 1: Authenticate on the main page
+
+Run the Authenticate procedure above once, using the standard Playwright tools (`browser_navigate`, `browser_snapshot`, `browser_click`, `browser_type`).
+
+### Step 2: Classify tests
+
+Split the batch into two groups:
+
+- **scripted**: tests where `script` is non-null (cached Playwright scripts ready to run)
+- **unscripted**: tests where `script` is null (need script generation)
+
+If all tests are scripted, skip to Step 4.
+
+### Step 3: Score and generate scripts (easy-first)
+
+For each **unscripted** test, assign a difficulty score based on the instructions:
+
+- **easy** (1): Single-page tests with simple actions — navigate, check text/headings, verify static content, click a link and check the URL. Typically 1-4 steps, no form submissions, no multi-step flows.
+- **medium** (2): Tests involving form input, button clicks that trigger state changes, checking error/success messages, or verifying a redirect after an action. Typically 3-8 steps.
+- **hard** (3): Multi-page flows, tests requiring specific sequences of actions (e.g. add to cart then checkout), tests with complex assertions (table data, dynamic content), or tests involving file uploads, modals, or dialogs.
+
+Sort unscripted tests by difficulty ascending (easy first). This ensures simple tests get scripts generated quickly so native execution can start sooner.
+
+#### Walk-through script generation
+
+For each unscripted test (in difficulty order), do a **scouting pass** — actually follow the test instructions in the browser to observe all UI states:
+
+1. Navigate to the test's starting page via `browser_navigate`
+2. Take a `browser_snapshot` to see initial elements
+3. Follow the test instructions step by step using Playwright MCP tools (`browser_click`, `browser_type`, `browser_snapshot` after each action)
+4. Snapshot after each state change to capture: validation errors, success banners, modal dialogs, redirected pages, dynamically loaded content
+5. Collect all observed elements and selectors as context
+
+Then generate a `.spec.ts` script using the observed elements:
+
+```ts
+import { test, expect } from '@playwright/test';
+test('{test_name}', async ({ page }) => {
+  // If the test has a credential_name, include login steps using the matching
+  // credential from project.credentials (email + password) at the login_url
+  await page.goto('{start_url}');
+  // Steps generated from scouting pass observations
+  // Use getByRole, getByText, getByLabel, getByPlaceholder for selectors
+});
+```
+
+Save via `update_test(test_id, { script: <generated_script>, script_generated_at: <ISO_now> })`.
+
+**Pipeline optimisation**: After finishing all **easy** tests, if there are medium/hard tests remaining, proceed to Step 4 immediately with whatever scripts are ready (scripted + newly generated easy tests). Continue generating medium/hard scripts in parallel by launching a background Task agent for the remaining generation work. When those scripts are ready, they'll be saved to the API for the next run.
+
+To launch the background generation agent:

 ```
-For each test in the current batch, call the Task tool with:
+Task tool with:
+- subagent_type: "general-purpose"
+- run_in_background: true
+- max_turns: 50
+- model: "sonnet"
+- prompt: (include project details, remaining unscripted tests with instructions, and the scouting+generation procedure above)
+```
+
+The background agent should: for each remaining test, do the scouting pass, generate the script, and call `update_test` to save it. It does NOT need to call `complete_run` — that happens in the native execution step.
+
+### Step 4: Export auth state
+
+If `auth_mode` is not `none`, export the browser session so native Playwright inherits it:
+
+```js
+async (page) => {
+  const state = await page.context().storageState();
+  require('fs').writeFileSync('/tmp/greenrun-auth-state.json', JSON.stringify(state));
+  return 'Auth state exported';
+}
+```
+
+Call this via `browser_run_code`. If `auth_mode` is `none`, skip this step.
+
+### Step 5: Write files and run natively
+
+Gather all tests that have scripts (previously scripted + newly generated from Step 3).
+
+1. **Write test files**: For each scripted test, write the script to `/tmp/greenrun-tests/{test_id}.spec.ts`
+
+2. **Write config**: Write `/tmp/greenrun-tests/playwright.config.ts`:
+
+```ts
+import { defineConfig } from '@playwright/test';
+export default defineConfig({
+  testDir: '.',
+  timeout: 30000,
+  workers: 20,
+  reporter: [['json', { outputFile: 'results.json' }]],
+  use: {
+    baseURL: '{base_url}',
+    storageState: '/tmp/greenrun-auth-state.json', // omit 'use.storageState' entirely if auth_mode is 'none'
+  },
+});
+```
+
+Replace `{base_url}` with the project's base_url.
+
+3. **Execute**: Run via Bash:
+```
+npx playwright test --config /tmp/greenrun-tests/playwright.config.ts
+```
+
+4. **Parse results**: Read `/tmp/greenrun-tests/results.json`. Map each result back to a run ID via the filename: `{test_id}.spec.ts` → test_id → find the matching run_id from the batch.
+
+5. **Report results**: Call `complete_run(run_id, status, result_summary)` for each test. Map Playwright statuses: `passed` → `passed`, `failed`/`timedOut` → `failed`, other → `error`.
+
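Steps 4 and 5 of the new procedure amount to a filename-keyed join between Playwright's JSON report and the batch. A sketch, under the assumptions that each spec file holds one test and that the reporter's suites/specs/tests/results nesting applies; `completeRun` stands in for the MCP `complete_run` call:

```ts
import { readFileSync } from 'node:fs';

type BatchTest = { test_id: string; run_id: string };

// Sketch only: report shape and completeRun signature are assumptions.
async function reportNativeResults(
  batch: { tests: BatchTest[] },
  completeRun: (runId: string, status: string, summary: string) => Promise<unknown>,
) {
  const report = JSON.parse(readFileSync('/tmp/greenrun-tests/results.json', 'utf-8'));
  for (const suite of report.suites ?? []) {
    // Filenames are {test_id}.spec.ts, so the file name recovers the test ID.
    const testId = String(suite.file).replace(/\.spec\.ts$/, '');
    const entry = batch.tests.find((t) => t.test_id === testId);
    if (!entry) continue;
    const status = suite.specs?.[0]?.tests?.[0]?.results?.[0]?.status ?? 'unknown';
    const mapped =
      status === 'passed' ? 'passed'
      : status === 'failed' || status === 'timedOut' ? 'failed'
      : 'error';
    await completeRun(entry.run_id, mapped, `Playwright status: ${status}`);
  }
}
```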
+### Step 6: Handle unscripted tests without scripts
+
+Any tests that still don't have scripts (e.g. because the background agent hasn't finished, or script generation failed) need to be executed via AI agents using the legacy approach. Follow Step 8 for these tests.
+
+### Step 7: Circuit breaker
+
+After parsing all native results, walk through them in completion order. Track consecutive failures:
+
+- If **3 or more consecutive failures** occur:
+  - Mark all remaining un-reported tests as error: "Circuit breaker: N consecutive failures detected"
+  - Skip AI fallback for remaining tests
+- The counter resets on any pass
+
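The circuit-breaker rule is easy to get wrong by one; a compact sketch of the intended walk (illustrative helper, not in the package):

```ts
// Walk ordered results; everything after the tripping failure is skipped.
function applyCircuitBreaker(
  results: { runId: string; passed: boolean }[],
  threshold = 3,
): { reportable: { runId: string; passed: boolean }[]; tripped: boolean } {
  let consecutive = 0;
  for (let i = 0; i < results.length; i++) {
    consecutive = results[i].passed ? 0 : consecutive + 1; // counter resets on any pass
    if (consecutive >= threshold) {
      return { reportable: results.slice(0, i + 1), tripped: true };
    }
  }
  return { reportable: results, tripped: false };
}
```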
+### Step 8: AI-agent fallback for native failures
+
+For tests that **failed** in native execution (and the circuit breaker has not tripped):
+
+1. Start new runs via `start_run(test_id)` (the original runs were already completed in Step 5)
+2. Launch background Task agents using the tab-isolation pattern:
+
+Create tabs and launch agents in batches of 20:
+
+#### Create tab
+```js
+async (page) => {
+  const newPage = await page.context().newPage();
+  await newPage.goto(START_URL);
+  return { index: page.context().pages().length - 1, url: newPage.url() };
+}
+```
+
+#### Launch agent
+```
+Task tool with:
 - subagent_type: "general-purpose"
 - run_in_background: true
 - max_turns: 25
-- model: "haiku"
-- prompt: (see agent prompt below)
+- model: "sonnet"
+- prompt: (agent prompt below, including the native failure message for diagnosis)
 ```

-### Agent prompt
+#### Agent prompt

 ```
-Execute a Greenrun browser test. Run ID: {run_id}
+Greenrun browser test (AI fallback). Run ID: {run_id}
+Tab index: {INDEX}
+
+**{test_name}**

-**Test: {test_name}**
+{paste the full test instructions here}

-## Instructions
-{paste the full test instructions from get_test here}
+**Native execution failed with:** {failure_message}

-## Setup
-1. Call `tabs_context_mcp`, then `tabs_create_mcp` to create YOUR tab. Use ONLY this tabId — other tabs belong to parallel tests.
-2. Navigate to the first URL. Run `javascript_tool`: `window.location.pathname`. If it returns `/login`, call `complete_run` with status "error", result "Not authenticated", then `window.close()` and stop.
+Determine if this is a stale script (UI changed) or an actual bug. If the script is stale, the test may still pass when executed manually.

-## Execution rules
-- Verify assertions with `screenshot` after actions that change the page. Do NOT use `read_page` for verification.
-- Use `find` to locate elements, then `ref` parameter on `computer` tool or `form_input` to interact.
-- Navigate with absolute URLs via `navigate` — don't click nav links.
-- Before destructive buttons: `window.alert = () => {}; window.confirm = () => true; window.prompt = () => null;`
-- On failure or timeout, retry ONCE then move on. Max 35 tool calls total.
+## CRITICAL: Tab isolation

-## Finish
-Call `complete_run` with run_id "{run_id}", status ("passed"/"failed"/"error"), and a brief result summary.
-Then run `javascript_tool`: `window.close()`.
+You are assigned to tab index {INDEX}. You MUST use ONLY `browser_run_code` for ALL browser interactions. Do NOT use `browser_snapshot`, `browser_click`, `browser_type`, `browser_navigate`, or any other Playwright MCP tools. The only non-browser tool you may call is `complete_run`.

-Return: {test_name} | {status} | {result_summary}
+Every `browser_run_code` call must scope to your tab:
+```js
+async (page) => {
+  const p = page.context().pages()[INDEX];
+  // ... your action here ...
+}
 ```

-## Collect results
+## Auth
+No authentication needed — the main page already authenticated and cookies are shared to your tab.
+
+## Interaction patterns

-After launching all agents in a batch, wait for them all to complete (use `TaskOutput`) before launching the next batch.
+**Navigate:**
+```js
+async (page) => {
+  const p = page.context().pages()[INDEX];
+  await p.goto('https://example.com/path');
+  return p.url();
+}
+```
+
+**Read page state (replaces browser_snapshot):**
+```js
+async (page) => {
+  const p = page.context().pages()[INDEX];
+  const url = p.url();
+  const title = await p.title();
+  const text = await p.locator('body').innerText();
+  const headings = await p.getByRole('heading').allTextContents();
+  const buttons = await p.getByRole('button').allTextContents();
+  const links = await p.getByRole('link').allTextContents();
+  const textboxes = await p.getByRole('textbox').evaluateAll(els =>
+    els.map(e => ({ name: e.getAttribute('name') || e.getAttribute('aria-label') || e.placeholder, value: e.value }))
+  );
+  return { url, title, headings, buttons, links, textboxes, text: text.substring(0, 2000) };
+}
+```
+
+**Click an element:**
+```js
+async (page) => {
+  const p = page.context().pages()[INDEX];
+  await p.getByRole('button', { name: 'Submit' }).click();
+  return p.url();
+}
+```
+
+**Fill a form field:**
+```js
+async (page) => {
+  const p = page.context().pages()[INDEX];
+  await p.getByRole('textbox', { name: 'Email' }).fill('test@example.com');
+  return 'filled';
+}
+```
+
+**Handle a dialog:**
+```js
+async (page) => {
+  const p = page.context().pages()[INDEX];
+  p.once('dialog', d => d.accept());
+  await p.getByRole('button', { name: 'Delete' }).click();
+  return p.url();
+}
+```
+
+**Check for specific text (verification):**
+```js
+async (page) => {
+  const p = page.context().pages()[INDEX];
+  const visible = await p.getByText('Success').isVisible();
+  return { found: visible };
+}
+```
+
+## Rules
+- ONLY use `browser_run_code` — no other browser tools
+- Always scope to `page.context().pages()[INDEX]`
+- Use Playwright locators: `getByRole`, `getByText`, `getByLabel`, `getByPlaceholder`, `locator`
+- Read page state to find elements before interacting
+- Navigate with absolute URLs via `p.goto(url)` — never click nav links
+
+## FORBIDDEN — never use these:
+- `browser_snapshot`, `browser_click`, `browser_type`, `browser_navigate` — these operate on the MAIN page and will interfere with other tests
+- `browser_wait` — NEVER call this
+- `browser_screenshot` — NEVER use
+
+## Error recovery
+- On ANY failure: retry the failing step ONCE, then skip to Finish.
+
+## Finish (MANDATORY — always reach this step)
+1. If the test passes on manual execution, call `update_test(test_id, { script: null, script_generated_at: null })` to invalidate the stale cached script.
+2. `complete_run(run_id, status, brief_summary)` — ALWAYS call this, even on error.
+3. Return: {test_name} | {status} | {summary}
+```
+
+#### Wait and clean up
+
+Wait for all agents to complete via `TaskOutput`. Then close extra tabs (newest first):
+
+```js
+async (page) => {
+  const pages = page.context().pages();
+  for (let i = pages.length - 1; i >= 1; i--) {
+    await pages[i].close();
+  }
+  return { remainingPages: page.context().pages().length };
+}
+```
+
+Check for orphaned runs (agents that crashed without calling `complete_run`). For any orphaned run IDs, call `complete_run(run_id, "error", "Agent crashed or timed out")`.
+
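Orphan detection here is just a set difference between the runs started in Step 8 and the runs agents actually completed; a sketch:

```ts
// Illustrative helper: keep the run IDs started in Step 8, collect the IDs
// that agents reported via complete_run, and diff them.
function findOrphanedRuns(startedRunIds: string[], completedRunIds: string[]): string[] {
  const completed = new Set(completedRunIds);
  return startedRunIds.filter((id) => !completed.has(id));
}

// Each orphan then gets: complete_run(runId, 'error', 'Agent crashed or timed out')
```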
+### Step 9: Wait for background generation
+
+If a background generation agent was launched in Step 3, check if it has completed via `TaskOutput` with `block: false`. If still running, note this in the summary. The generated scripts will be available on the next run.

 ## Summarize

-After all batches complete, present a summary table:
+Present a summary table with a Mode column showing how each test was executed:
+
+| Test | Pages | Tags | Mode | Status | Result |
+|------|-------|------|------|--------|--------|
+| Test name | /login, /dashboard | smoke, auth | native/agent/skipped | passed/failed/error | Brief summary |
+
+Mode values:
+- **native** — executed via `npx playwright test`
+- **agent** — executed via AI agent (fallback or no script available)
+- **skipped** — circuit breaker tripped, not executed
+
+Total: "X passed, Y failed, Z errors out of N tests"

-| Test | Pages | Tags | Status | Result |
-|------|-------|------|--------|--------|
-| Test name | /login, /dashboard | smoke, auth | passed/failed/error | Brief summary |
+If the circuit breaker tripped, note: "Circuit breaker tripped after N consecutive failures. M tests skipped."

-Include the total count: "X passed, Y failed, Z errors out of N tests"
+If background script generation is still running, note: "Script generation in progress for N tests. Scripts will be cached for the next run."

 If any tests failed, highlight what went wrong and suggest next steps.