npm - @skyramp/mcp - Versions diffs - 0.1.8 → 0.2.0-rc.2 - Mend

@skyramp/mcp 0.1.8 → 0.2.0-rc.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (122) hide show

package/build/prompts/test-recommendation/test-recommendation-prompt.test.js CHANGED Viewed

@@ -3,6 +3,7 @@ jest.mock("@skyramp/skyramp", () => ({
 }));
 import { TestType } from "../../types/TestTypes.js";
 import { buildRecommendationPrompt, buildExternalCoverageSet, externalDedupKey } from "./test-recommendation-prompt.js";
+import { buildExecutionPlan } from "./diffExecutionPlan.js";
 import { PATH_PARAM_UUID_GUIDANCE, MAX_TESTS_TO_GENERATE, buildTestQualityCriteria, buildArchitectPreamble, buildContextFetchingGuidance, buildReasoningProtocol, buildFewShotExamples, buildVerificationChecklist, } from "./recommendationSections.js";
 import { AnalysisScope } from "../../types/RepositoryAnalysis.js";
 // ---------------------------------------------------------------------------
@@ -757,6 +758,156 @@ describe("buildRecommendationPrompt — GENERATE slot allocation", () => {
         expect(prompt).not.toContain("ui-test-for-changed-components");
         expect(prompt).not.toContain("ui_mixed_pr_trace.zip");
     });
+    it("promotes attack-surface security boundaries into generated slots", () => {
+        const attackSurface = minimalScenario({
+            scenarioName: "flows-bulk-delete-post-auth-boundary",
+            description: "Attack-surface auth boundary: /api/flows/bulk_delete is a destructive sibling of changed DELETE /api/flows/{id}; verify it rejects missing authentication",
+            category: "security_boundary",
+            priority: "high",
+            testType: TestType.CONTRACT,
+            steps: [{
+                    order: 1,
+                    method: "POST",
+                    path: "/api/flows/bulk_delete",
+                    description: "POST /api/flows/bulk_delete without auth",
+                    interactionType: "error",
+                    expectedStatusCode: 401,
+                }],
+        });
+        const directDelete = minimalScenario({
+            scenarioName: "flows-delete-auth-boundary",
+            description: "Auth boundary: DELETE /api/flows/{id} rejects missing authentication",
+            category: "security_boundary",
+            priority: "high",
+            testType: TestType.CONTRACT,
+            steps: [{
+                    order: 1,
+                    method: "DELETE",
+                    path: "/api/flows/{id}",
+                    description: "DELETE /api/flows/{id} without auth",
+                    interactionType: "error",
+                    expectedStatusCode: 401,
+                }],
+        });
+        const validation = minimalScenario({
+            scenarioName: "flows-delete-invalid-id",
+            description: "Validate malformed IDs",
+            category: "data_validation",
+            priority: "medium",
+            testType: TestType.CONTRACT,
+            steps: [{
+                    order: 1,
+                    method: "DELETE",
+                    path: "/api/flows/not-a-uuid",
+                    description: "DELETE /api/flows/not-a-uuid",
+                    interactionType: "error",
+                    expectedStatusCode: 422,
+                }],
+        });
+        const analysis = minimalAnalysis({
+            businessContext: {
+                mainPurpose: "Test",
+                userFlows: [],
+                dataFlows: [],
+                integrationPatterns: [],
+                draftedScenarios: [directDelete, validation, attackSurface],
+            },
+            branchDiffContext: {
+                baseBranch: "main",
+                currentBranch: "feature/admin-key",
+                changedFiles: ["src/prefect/server/api/flows.py"],
+                newEndpoints: [],
+                modifiedEndpoints: [{
+                        path: "/api/flows/{id}",
+                        methods: [{ method: "DELETE", sourceFile: "flows.py", changeType: "modified" }],
+                    }],
+                affectedServices: [],
+            },
+        });
+        const prompt = buildRecommendationPrompt(analysis, AnalysisScope.CurrentBranchDiff, 5, undefined, undefined, undefined, undefined, 2);
+        const attackIdx = prompt.indexOf("POST /api/flows/bulk_delete → 401");
+        const directIdx = prompt.indexOf("DELETE /api/flows/{id} → 401");
+        expect(attackIdx).toBeGreaterThanOrEqual(0);
+        expect(directIdx).toBeGreaterThanOrEqual(0);
+        expect(attackIdx).toBeLessThan(directIdx);
+        expect(prompt).toContain("#1 — GENERATE** | contract | security_boundary");
+        expect(prompt).toContain("preserve attack-surface `security_boundary` items");
+    });
+    it("does not external-dedup protected bug-catching or attack-surface scenarios", () => {
+        const attackSurface = minimalScenario({
+            scenarioName: "flows-bulk-delete-post-auth-boundary",
+            description: "Verify destructive sibling rejects missing authentication",
+            category: "security_boundary",
+            isAttackSurfaceSecurityBoundary: true,
+            testType: TestType.CONTRACT,
+            steps: [{ order: 1, method: "POST", path: "/api/flows/bulk_delete", description: "POST /api/flows/bulk_delete without auth", interactionType: "error", expectedStatusCode: 401 }],
+        });
+        const bugCaught = minimalScenario({
+            scenarioName: "orders-discount-bug-caught",
+            description: "Bug-catching test: discount should subtract from total",
+            category: "bug_caught",
+            testType: TestType.CONTRACT,
+            steps: [{ order: 1, method: "POST", path: "/api/orders", description: "POST /api/orders exposes discount bug", interactionType: "success", expectedStatusCode: 201 }],
+        });
+        const ordinary = minimalScenario({
+            scenarioName: "customers-create-contract",
+            description: "Create customer contract",
+            category: "crud",
+            testType: TestType.CONTRACT,
+            steps: [{ order: 1, method: "POST", path: "/api/customers", description: "POST /api/customers", interactionType: "success", expectedStatusCode: 201 }],
+        });
+        const prompt = buildExecutionPlan([attackSurface, bugCaught, ordinary].map((scenario) => ({
+            scenario,
+            priority: scenario.category === "crud" ? "LOW" : "CRITICAL",
+            novelty: "modified",
+        })), 3, 3, "http://localhost:3000", "Authorization", ", authScheme: \"Bearer\"", "", "seed", 3, false, false, false, new Set(["POST::flows::contract", "POST::orders::contract", "POST::customers::contract"]));
+        expect(prompt).toContain("POST /api/flows/bulk_delete → 401");
+        expect(prompt).toContain("POST /api/orders → 201");
+        expect(prompt).not.toContain("POST /api/customers → 201");
+        expect(prompt).toContain("except for `bug_caught` and attack-surface `security_boundary` items");
+    });
+    it("keeps symmetric attack-surface siblings together before ordinary auth boundaries", () => {
+        const directWidgets = minimalScenario({
+            scenarioName: "widgets-delete-auth-boundary",
+            description: "Auth boundary: DELETE /api/widgets/{id} rejects missing authentication",
+            category: "security_boundary",
+            testType: TestType.CONTRACT,
+            steps: [{ order: 1, method: "DELETE", path: "/api/widgets/{id}", description: "DELETE /api/widgets/{id} without auth", interactionType: "error", expectedStatusCode: 401 }],
+        });
+        const directGadgets = minimalScenario({
+            scenarioName: "gadgets-delete-auth-boundary",
+            description: "Auth boundary: DELETE /api/gadgets/{id} rejects missing authentication",
+            category: "security_boundary",
+            testType: TestType.CONTRACT,
+            steps: [{ order: 1, method: "DELETE", path: "/api/gadgets/{id}", description: "DELETE /api/gadgets/{id} without auth", interactionType: "error", expectedStatusCode: 401 }],
+        });
+        const widgetsBulk = minimalScenario({
+            scenarioName: "widgets-bulk-delete-auth-boundary",
+            description: "Attack-surface auth boundary: /api/widgets/bulk_delete is a destructive sibling of changed DELETE /api/widgets/{id}; verify it rejects missing authentication",
+            category: "security_boundary",
+            testType: TestType.CONTRACT,
+            steps: [{ order: 1, method: "POST", path: "/api/widgets/bulk_delete", description: "POST /api/widgets/bulk_delete without auth", interactionType: "error", expectedStatusCode: 401 }],
+        });
+        const gadgetsBulk = minimalScenario({
+            scenarioName: "gadgets-bulk-delete-auth-boundary",
+            description: "Attack-surface auth boundary: /api/gadgets/bulk_delete is a destructive sibling of changed DELETE /api/gadgets/{id}; verify it rejects missing authentication",
+            category: "security_boundary",
+            testType: TestType.CONTRACT,
+            steps: [{ order: 1, method: "POST", path: "/api/gadgets/bulk_delete", description: "POST /api/gadgets/bulk_delete without auth", interactionType: "error", expectedStatusCode: 401 }],
+        });
+        const prompt = buildExecutionPlan([directWidgets, directGadgets, widgetsBulk, gadgetsBulk].map((scenario) => ({
+            scenario,
+            priority: "CRITICAL",
+            novelty: "modified",
+        })), 3, 4, "http://localhost:3000", "Authorization", ", authScheme: \"Bearer\"", "", "seed", 4, false);
+        const generatedBlock = prompt.slice(0, prompt.indexOf("#4 [ADDITIONAL]"));
+        const additionalBlock = prompt.slice(prompt.indexOf("#4 [ADDITIONAL]"));
+        expect(generatedBlock).toContain("POST /api/widgets/bulk_delete → 401");
+        expect(generatedBlock).toContain("POST /api/gadgets/bulk_delete → 401");
+        expect(additionalBlock).not.toContain("POST /api/widgets/bulk_delete → 401");
+        expect(additionalBlock).not.toContain("POST /api/gadgets/bulk_delete → 401");
+        expect(additionalBlock).toContain("DELETE /api/gadgets/{id} → 401");
+    });
 });
 // ---------------------------------------------------------------------------
 // Tests — buildTestQualityCriteria contract-test guidance (regression guard)
@@ -1175,6 +1326,12 @@ describe("buildVerificationChecklist — self-check at end of prompt", () => {
         const checklist = buildVerificationChecklist(10, 3);
         expect(checklist).toContain("bugCatchingTarget");
     });
+    it("includes issue coverage check with promotion cap", () => {
+        const checklist = buildVerificationChecklist(10, 3);
+        expect(checklist).toContain("Issue coverage");
+        expect(checklist).toContain("highest-severity flaw");
+        expect(checklist).toContain("At most one promotion per run");
+    });
     it("includes distinct code path check", () => {
         const checklist = buildVerificationChecklist(10, 3);
         expect(checklist).toContain("distinct code path");
@@ -1243,7 +1400,7 @@ describe("buildRecommendationPrompt — reduced over-prompting", () => {
         });
         const prompt = buildRecommendationPrompt(analysis, AnalysisScope.CurrentBranchDiff, 10);
         expect(prompt).not.toContain("(MANDATORY before executing anything)");
-        expect(prompt).toContain("(before executing anything)");
+        expect(prompt).toContain("(before anything else)");
     });
     it("uses XML tags in context fetching guidance", () => {
         const guidance = buildContextFetchingGuidance("session-1");
@@ -1512,3 +1669,161 @@ describe("externalDedupKey", () => {
         expect(externalDedupKey(scenario)).toBe("POST::orders::contract");
     });
 });
+describe("UI grounding guidance (Phase C D-1.a)", () => {
+    const editOrderScenario = minimalScenario({
+        scenarioName: 'edit-order',
+        description: 'edit order',
+        category: 'new_endpoint',
+        priority: 'medium',
+        steps: [
+            { order: 1, method: 'PATCH', path: '/api/v1/orders/{id}', description: 'edit', interactionType: 'success', expectedStatusCode: 200 },
+        ],
+        chainingKeys: [],
+    });
+    const uiGroundingBusinessContext = {
+        mainPurpose: "Test API", userFlows: [], dataFlows: [], integrationPatterns: [],
+        draftedScenarios: [editOrderScenario],
+    };
+    const uiOnlyAnalysis = () => minimalAnalysis({
+        branchDiffContext: {
+            currentBranch: "feature", baseBranch: "main",
+            changedFiles: ['frontend/src/components/EditOrder.tsx'],
+            newEndpoints: [],
+            modifiedEndpoints: [],
+            affectedServices: [],
+        },
+        businessContext: uiGroundingBusinessContext,
+    });
+    const mixedAnalysis = () => minimalAnalysis({
+        branchDiffContext: {
+            currentBranch: "feature", baseBranch: "main",
+            changedFiles: ['frontend/src/components/EditOrder.tsx', 'backend/orders.py'],
+            newEndpoints: [{
+                    path: '/api/v1/orders/{id}',
+                    methods: [{ method: 'PATCH', sourceFile: 'backend/orders.py', interactionCount: 1 }],
+                }],
+            modifiedEndpoints: [],
+            affectedServices: [],
+        },
+        businessContext: uiGroundingBusinessContext,
+    });
+    const backendOnlyAnalysis = () => minimalAnalysis({
+        branchDiffContext: {
+            currentBranch: "feature", baseBranch: "main",
+            changedFiles: ['backend/orders.py'],
+            newEndpoints: [{
+                    path: '/api/v1/orders',
+                    methods: [{ method: 'POST', sourceFile: 'backend/orders.py', interactionCount: 1 }],
+                }],
+            modifiedEndpoints: [],
+            affectedServices: [],
+        },
+        businessContext: uiGroundingBusinessContext,
+    });
+    it("emits UI grounding block on UI-only PRs", () => {
+        const out = buildRecommendationPrompt(uiOnlyAnalysis(), AnalysisScope.CurrentBranchDiff, 10);
+        expect(out).toContain("UI recommendation grounding");
+        expect(out).toContain("role=<role>");
+        expect(out).toContain("accessibleName=");
+    });
+    it("emits UI grounding block on mixed PRs", () => {
+        const out = buildRecommendationPrompt(mixedAnalysis(), AnalysisScope.CurrentBranchDiff, 10);
+        expect(out).toContain("UI recommendation grounding");
+        expect(out).toContain("contextText=");
+    });
+    it("does not emit UI grounding block on backend-only PRs", () => {
+        const out = buildRecommendationPrompt(backendOnlyAnalysis(), AnalysisScope.CurrentBranchDiff, 10);
+        expect(out).not.toContain("UI recommendation grounding");
+    });
+    it("carves out non-UI entries from the grounding format", () => {
+        const out = buildRecommendationPrompt(mixedAnalysis(), AnalysisScope.CurrentBranchDiff, 10);
+        expect(out).toMatch(/Contract, integration, e2e.*existing conventions/);
+    });
+    it("includes contextText example for repeating elements", () => {
+        const out = buildRecommendationPrompt(uiOnlyAnalysis(), AnalysisScope.CurrentBranchDiff, 10);
+        expect(out).toContain('contextText=["customer@example.com"');
+    });
+    // Phase C D-1.a/7b: directive instruction strengthening + Validates coverage
+    // + marked fallback. Asserts the post-7b language is present.
+    it("uses MUST directive for tuple presence in reasoning field", () => {
+        const out = buildRecommendationPrompt(uiOnlyAnalysis(), AnalysisScope.CurrentBranchDiff, 10);
+        expect(out).toContain("MUST contain at least three of");
+        expect(out).toContain("not valid output for UI test types");
+    });
+    it("instructs the agent to ground the Validates line for UI entries", () => {
+        const out = buildRecommendationPrompt(uiOnlyAnalysis(), AnalysisScope.CurrentBranchDiff, 10);
+        expect(out).toContain("Validates line — applies to");
+        expect(out).toContain("observable behavior the test verifies");
+        expect(out).toContain("structural facts");
+    });
+    it("requires a [no-blueprint-data] marker on source-grounded fallback entries", () => {
+        const out = buildRecommendationPrompt(uiOnlyAnalysis(), AnalysisScope.CurrentBranchDiff, 10);
+        expect(out).toContain("[no-blueprint-data]");
+        expect(out).toContain("issuesFound");
+        expect(out).toContain("Do NOT silently produce ungrounded reasoning");
+    });
+});
+// ---------------------------------------------------------------------------
+// Tests — UI recommendation authoring rules
+// ---------------------------------------------------------------------------
+//
+// The recommendation prompt always emits a "UI Recommendation Authoring Rules"
+// section that anchors UI recommendation reasoning. Earlier iterations of this code
+// accepted a `capturedBlueprints` parameter and rendered the captures inline,
+// but that was redundant: the agent has the captures in its own tool-result
+// history. The prompt now ships the rules; the agent supplies the vocabulary.
+describe("buildRecommendationPrompt UI authoring rules", () => {
+    function minimalDiffAnalysis() {
+        return minimalAnalysis({
+            branchDiffContext: {
+                baseBranch: "main",
+                currentBranch: "feature/test",
+                changedFiles: ["src/components/OrderForm.tsx"],
+                newEndpoints: [],
+                modifiedEndpoints: [],
+                affectedServices: [],
+            },
+        });
+    }
+    it("emits UI authoring rules wrapped in an XML tag, unconditionally", () => {
+        const analysis = minimalDiffAnalysis();
+        const prompt = buildRecommendationPrompt(analysis);
+        expect(prompt).toContain("<ui_recommendation_authoring_rules>");
+        expect(prompt).toContain("</ui_recommendation_authoring_rules>");
+        expect(prompt).toMatch(/do NOT mention "blueprint"/i);
+        expect(prompt).toMatch(/do not invent element names/i);
+    });
+    it("does not render a 'Captured Blueprints' data section (param removed)", () => {
+        // Regression on the previous design: capturedBlueprints used to be
+        // threaded through the call and rendered as a "## Captured Blueprints"
+        // data section. That path is gone — the agent's own browser_blueprint
+        // tool-result history is the source of truth for element vocabulary.
+        const analysis = minimalDiffAnalysis();
+        const prompt = buildRecommendationPrompt(analysis);
+        expect(prompt).not.toContain("## Captured Blueprints");
+    });
+    it("instructs the LLM to ground UI recommendations in elements observed via earlier browser_blueprint calls", () => {
+        const analysis = minimalDiffAnalysis();
+        const prompt = buildRecommendationPrompt(analysis);
+        expect(prompt).toMatch(/ground the [`]?reasoning[`]?\s*field in elements you have actually observed/i);
+        expect(prompt).toMatch(/inform.*how.*describe.*not.*which/i);
+    });
+    it("instructs the LLM not to leak internal MCP terminology into reasoning", () => {
+        const analysis = minimalDiffAnalysis();
+        const prompt = buildRecommendationPrompt(analysis);
+        expect(prompt).toMatch(/do NOT mention "blueprint"/i);
+        expect(prompt).toMatch(/leak builder internals/i);
+    });
+    it("does not abbreviate 'recommendation' to 'rec' in the rules section", () => {
+        // Per PR review: shortform 'rec' invites the LLM to hallucinate the term.
+        // Spell it out to keep the language consistent with the rest of the prompt.
+        const analysis = minimalDiffAnalysis();
+        const prompt = buildRecommendationPrompt(analysis);
+        const rulesStart = prompt.indexOf("<ui_recommendation_authoring_rules>");
+        const rulesEnd = prompt.indexOf("</ui_recommendation_authoring_rules>");
+        const rulesBlock = prompt.slice(rulesStart, rulesEnd);
+        // "rec" also occurs inside "recommendation(s)", so the word boundaries
+        // restrict the match to the standalone abbreviations "rec" / "recs".
+        expect(rulesBlock).not.toMatch(/\b(rec|recs)\b/);
+    });
+});

package/build/prompts/testbot/testbot-prompts.js CHANGED Viewed

@@ -19,7 +19,7 @@ const CONTRACT_MODE_GUIDANCE = CONSUMER_MODE_ENABLED
   Both modes (\`providerMode: true, consumerMode: true\`): For diff that contains BOTH provider signals (such as new/modified endpoint handlers, route changes this service owns) AND consumer signals (outbound HTTP client calls to another service, no new endpoint handlers).`
     : `  Always add \`providerMode: true\` — the tool generates provider-side contract tests only.`;
 export function getTestbotPrompt(prTitle, prDescription, summaryOutputFile, repositoryPath, baseBranch, maxRecommendations = MAX_RECOMMENDATIONS, maxGenerate = MAX_TESTS_TO_GENERATE, _maxCritical = MAX_CRITICAL_TESTS, // Reserved — accepted for API compat but not yet wired into prompt
-prNumber, userPrompt, services, stateOutputFile, uiCredentials) {
+prNumber, userPrompt, services, stateOutputFile, uiCredentials, testsRepoDir) {
     maxGenerate = Math.min(Math.max(maxGenerate, 0), maxRecommendations);
     // For follow-up requests: emit the @skyramp-testbot header + guardrails + retrieve-recommendations step.
     // For first-run prompts: emit the full Task 1 analysis + maintenance section.
@@ -42,24 +42,35 @@ Verify the prompt inside <USER_PROMPT> is related to adding or removing tests fr
 - If the prompt matches one or more tests in the Additional Recommendations → proceed to Task 1 (Skip Analysis).
 ### Task 1: Retrieve Previous Recommendations
-Call \`skyramp_analyze_changes\` with \`repositoryPath\`: "${repositoryPath}", \`scope\`: "branch_diff"${baseBranch ? `, \`baseBranch\`: "${baseBranch}"` : ""}${prNumber ? `, \`prNumber\`: ${prNumber}` : ""}${stateOutputFile ? `, \`stateOutputFile\`: "${stateOutputFile}"` : ""}.
+Call \`skyramp_analyze_changes\` with \`repositoryPath\`: "${repositoryPath}", \`scope\`: "branch_diff"${baseBranch ? `, \`baseBranch\`: "${baseBranch}"` : ""}${prNumber ? `, \`prNumber\`: ${prNumber}` : ""}${stateOutputFile ? `, \`stateOutputFile\`: "${stateOutputFile}"` : ""}${testsRepoDir ? `, \`testsRepoDir\`: "${testsRepoDir}"` : ""}.
 This will fetch the previous TestBot report from the PR comments and return deduplicated recommendations.
 Use those recommendations as your baseline. Only add or remove tests that the user requested AND that appear in the Additional Recommendations. Then proceed straight to Task 2: Generate New Tests.
 `
         : `
 **Incremental mode:** Task 1 handles maintenance of existing tests. Task 2 handles new test generation from the GENERATE list. The two tasks are independent — maintenance completions never reduce the generate budget. Only generate tests for NEW endpoints not already covered by existing bot tests.
+<!-- TODO(SKYR-3636 follow-up): migrate Task 1 + Task 2 step bodies to PromptPlan
+     (src/prompts/test-recommendation/promptPlan.ts) so step numbers don't have
+     to be hand-maintained when steps are added or reordered. -->
 ## Task 1: Analyze & Maintain
-1. Call \`skyramp_analyze_changes\` with \`repositoryPath\`: "${repositoryPath}", \`scope\`: "branch_diff", \`topN\`: ${maxRecommendations}, \`maxGenerate\`: ${maxGenerate}${baseBranch ? `, \`baseBranch\`: "${baseBranch}"` : ""}${prNumber ? `, \`prNumber\`: ${prNumber}` : ""}${stateOutputFile ? `, \`stateOutputFile\`: "${stateOutputFile}"` : ""} — discovers existing Skyramp tests, scans endpoints changed in the diff, loads workspace config, and returns ${maxRecommendations} ranked ADD recommendations (${maxGenerate} to generate, ${maxRecommendations - maxGenerate} as additional).${prNumber ? " Uses PR comment history to avoid re-recommending already-generated tests." : ""}
+1. **Pre-flight UI enumeration.** Call \`skyramp_ui_analyze_changes\` with \`repositoryPath\`: "${repositoryPath}"${uiCredentials ? `, \`uiCredentials\`: <use the value from <ui-credentials> in your context>` : ""}. The response returns \`uiContext\` (\`changedFrontendFiles\`, \`candidateUiPages\`) and capture instructions.
+   **If the response says "No UI changes detected"** → skip ahead to step 2.
+   **Otherwise:** for each candidate URL in the response${uiCredentials ? " (after logging in via the credentials provided)" : ""}, \`browser_navigate\` to the URL, then \`browser_blueprint\` to capture. The captures stay in your tool-result history — they're the element vocabulary you'll use when writing UI rec \`reasoning\` fields in step 2. You do NOT need to thread them back into a tool call.
+   If a candidate URL 404s or redirects, navigate from the workspace baseUrl and explore. If \`browser_blueprint\` fails on every candidate, proceed to step 2 and log an \`issuesFound\` info entry — UI recommendations will fall back to source-grounded prose.
+2. Call \`skyramp_analyze_changes\` with \`repositoryPath\`: "${repositoryPath}", \`scope\`: "branch_diff", \`topN\`: ${maxRecommendations}, \`maxGenerate\`: ${maxGenerate}${baseBranch ? `, \`baseBranch\`: "${baseBranch}"` : ""}${prNumber ? `, \`prNumber\`: ${prNumber}` : ""}${stateOutputFile ? `, \`stateOutputFile\`: "${stateOutputFile}"` : ""}${testsRepoDir ? `, \`testsRepoDir\`: "${testsRepoDir}"` : ""} — discovers existing Skyramp tests, scans endpoints changed in the diff, loads workspace config, and returns ${maxRecommendations} ranked ADD recommendations${prNumber ? " (using PR comment history to avoid re-recommending already-generated tests)" : ""} along with the UI recommendation authoring rules. Use the blueprints already in your context (from step 1) to ground UI rec reasoning.
    **If \`skyramp_analyze_changes\` returns an error:** retry once only if the error is transient (timeout, network blip, temporary unavailability) — do NOT retry for permanent errors (invalid repository path, missing required parameter, authentication failure). If it fails again, call \`skyramp_submit_report\` with a minimal valid payload: leave all test arrays empty and add the error to \`issuesFound\`. Refer to the \`skyramp_submit_report\` schema for required fields. Do NOT attempt Task 2 without a valid stateFile.
    **If all changed files are non-application** (CI/CD, docs, lock files, config) → skip to Task 3 (Submit Report) with empty arrays and a single \`issuesFound\` entry explaining why (same format as the zero-test path below).
-2. **Maintain existing tests** using the rules in \`<drift_analysis_rules>\` below. For each existing test reported by \`skyramp_analyze_changes\`, score it and choose the action exactly as directed by the Action Decision Matrix in \`<drift_analysis_rules>\`. Only read test files that require action per that matrix — do NOT read files that will be IGNORED. **Do NOT read source files (routers, models, CRUD, components) — all the information you need is in the \`skyramp_analyze_changes\` output and the diff.** When reading multiple test files, **read them all in a single parallel batch** — do NOT read them one at a time. Apply actions directly. Results go in \`testMaintenance\`.
+3. **Maintain existing tests** using the rules in \`<drift_analysis_rules>\` below. For each existing test reported by \`skyramp_analyze_changes\`, score it and choose the action exactly as directed by the Action Decision Matrix in \`<drift_analysis_rules>\`. Only read test files that require action per that matrix — do NOT read files that will be IGNORED. **Do NOT read source files (routers, models, CRUD, components) — all the information you need is in the \`skyramp_analyze_changes\` output and the diff.** When reading multiple test files, **read them all in a single parallel batch** — do NOT read them one at a time. Apply actions directly. Results go in \`testMaintenance\`.
 ${buildDriftAnalysisPrompt({ existingTests: [], scannedEndpoints: [], repositoryPath })}
-3. **Code review:** From the \`skyramp_analyze_changes\` output and the existing test files you read for maintenance, note any logic bugs. Do NOT read additional source files just for code review — use what is already available from the analysis and test file reads. Common patterns to flag:
+4. **Code review:** From the \`skyramp_analyze_changes\` output and the existing test files you read for maintenance, note any logic bugs. Do NOT read additional source files just for code review — use what is already available from the analysis and test file reads. Common patterns to flag:
    - Computed fields not recalculated after mutation (e.g. \`total_amount\` unchanged after items are added/removed)
    - Incomplete CRUD: create without cleanup, update that adds new records without removing old ones
    - Missing input validation on new endpoints
@@ -67,6 +78,8 @@ ${buildDriftAnalysisPrompt({ existingTests: [], scannedEndpoints: [], repository
    - Incorrect arithmetic in business logic (discount calculations, price aggregation)
    Log each finding in \`issuesFound\` with a \`severity\` (critical/high/medium/low). These bugs should inform your test design in Task 2.
+5. **Apply the UI Recommendation Authoring Rules.** \`skyramp_analyze_changes\` returns an authoring-rules section that defines how UI recommendation \`reasoning\` fields should be written (natural prose, no internal-identifier syntax, ground in elements observed via earlier \`browser_blueprint\` calls, fall back to source-grounded prose when no captures are available). Apply those rules when authoring UI rec reasoning. Non-UI recommendations (contract / integration / e2e / batch-scenario) are unaffected by these rules and use their pre-existing formats — do not reformat them.
 ---`;
     const serviceContext = services?.length ? buildServiceContext(services) : '';
     // The <ui-credentials> tags are framing for the agent's prompt context —
@@ -80,10 +93,14 @@ ${buildDriftAnalysisPrompt({ existingTests: [], scannedEndpoints: [], repository
     const uiCredentialsBlock = trimmedCredentials
         ? `<ui-credentials>\n${trimmedCredentials}\n</ui-credentials>`
         : '';
+    const testsRepoDirBlock = testsRepoDir ? `<TESTS REPO DIR>${testsRepoDir}</TESTS REPO DIR>\n` : '';
+    const testDirInstruction = testsRepoDir
+        ? `the \`<output_dir>\` from the \`<services>\` block, rooted under the test repository at \`${testsRepoDir}\` (i.e. \`${testsRepoDir}/<output_dir>\`). Write ALL test output files to paths under \`${testsRepoDir}\`, not under \`${repositoryPath}\`. Do NOT write any test files to the app repository.`
+        : `${SERVICE_REFS.testDirRef}. Do NOT create a new \`tests/\` directory at the repo root — use that path. If no \`testDirectory\` is configured, default to the language-conventional location (e.g. \`src/test/java/...\` for Java, \`tests/\` for Python).`;
     return `<TITLE>${prTitle}</TITLE>
 <DESCRIPTION>${prDescription}</DESCRIPTION>
 <REPOSITORY PATH>${repositoryPath}</REPOSITORY PATH>
-${serviceContext ? serviceContext + '\n' : ''}${uiCredentialsBlock ? uiCredentialsBlock + '\n' : ''}Use the Skyramp MCP server tools for all tasks below.
+${testsRepoDirBlock}${serviceContext ? serviceContext + '\n' : ''}${uiCredentialsBlock ? uiCredentialsBlock + '\n' : ''}Use the Skyramp MCP server tools for all tasks below.
 ${task1Section}
@@ -143,7 +160,7 @@ ${userPrompt ? "Generate only the tests that the user requested from the Additio
 **How to generate each type (for ADD):**
 - **Integration**: call \`skyramp_batch_scenario_test_generation\` with ALL steps in a single call (pass the \`steps\` array with method, path, requestBody, statusCode for each step). Then call \`skyramp_integration_test_generation\` with the returned scenario file.
   **Use the pre-built scenario JSON from the Execution Plan** — pass the steps array directly. Do NOT read source code models to construct request bodies if the plan already provides them.
-  Scenario JSON and test files go in ${SERVICE_REFS.testDirRef}. Do NOT create a new \`tests/\` directory at the repo root — use that path. If no \`testDirectory\` is configured, default to the language-conventional location (e.g. \`src/test/java/...\` for Java, \`tests/\` for Python).
+  Scenario JSON and test files go in ${testDirInstruction}
   **Pipeline for speed**: Call ALL \`skyramp_batch_scenario_test_generation\` calls in one batch. When they return, call ALL \`skyramp_integration_test_generation\` calls in the next batch. Do NOT serialize per-scenario (batch→integration→batch→integration) — batch ALL scenarios first, then generate ALL integration tests.
 - **Contract**: call \`skyramp_contract_test_generation\` with \`endpointURL\`, \`method\`, and \`requestData\` for POST/PUT/PATCH.
   Pass \`apiSchema\` if an OpenAPI spec exists.
@@ -153,13 +170,20 @@ ${CONTRACT_MODE_GUIDANCE}
   If a relevant trace exists (covers the UI changes in this PR), use it directly with \`skyramp_ui_test_generation\` and \`modularizeCode: false\`.
   If NO relevant trace exists, **you MUST write out your full trace plan as text BEFORE calling \`browser_navigate\`**. Do not touch the browser until the plan is written.
-  **Browser authentication (check BEFORE navigating)**: If \`<ui-credentials>\` appears in your context above, the app requires login. Parse the credentials — each line is \`username:password\`. Type the values verbatim (they are not encoded or escaped). Before navigating to ANY feature URL:
+  **Browser authentication (check BEFORE navigating)**: If \`<ui-credentials>\` appears in your context above, the app requires login. Parse the credentials — one per line, two supported formats:
+  - New format: \`username=<value>;password=<value>\` or \`username=<value>;password=<value>;role=<value>\` — fields are \`;\`-delimited key=value pairs. The \`=\` and \`;\` characters are reserved delimiters and must not appear in the values themselves.
+  - Legacy format: \`username:password\` — the first \`:\` splits username from password.
+  **Credential selection**: Use the first credential by default. When the scenario requires a specific role, find the credential whose \`role\` field matches (e.g. \`role=admin\`). If no credential matches the required role, use the first credential and add a note to \`issuesFound\` that no matching role was found.
+  Type all values verbatim. Before navigating to ANY feature URL:
   1. \`browser_navigate\` to the login URL (e.g. \`{baseUrl}/login\`, \`/user/login\`, \`/signin\` — infer from the app's base URL and framework)
   2. \`browser_snapshot\` to find the username/email and password fields
   3. \`browser_type\` the username into the email/username field
   4. \`browser_type\` the password into the password field
-  5. \`browser_click\` the submit button, then \`browser_wait_for\` redirect away from the login page
-  6. Now navigate directly to the feature URL and begin recording
+  5. If a role selector is present and a \`role\` was specified in the credential, select it before submitting
+  6. \`browser_click\` the submit button, then \`browser_wait_for\` redirect away from the login page
+  7. Now navigate directly to the feature URL and begin recording
   The login steps ARE part of the trace — the generated test will authenticate automatically.
   Use this exact format:
@@ -198,6 +222,38 @@ ${CONTRACT_MODE_GUIDANCE}
     - **List integrity after form save**: assert the list item count is unchanged unless the action explicitly added or removed items — catches duplication bugs
     - Do NOT assert page headings, static labels, boilerplate text, intermediate states, or values already guaranteed by the action
     - Do NOT assert the same value with multiple selectors
+  **Capture-act-capture (applies only when recording a UI trace):**
+  **Skip this entire section if \`uiContext\` was absent or \`changedFrontendFiles\` was empty in step 1's response** (backend-only PR). The capture-act-capture pattern is for UI trace recording only — there's no UI trace to record on a backend-only PR. Continue to the non-UI test-type instructions below.
+  **Reminder — the UI test priority rule above still applies.** If the diff contains frontend/UI changes, you still MUST attempt to generate at least one UI test. Capture-act-capture is **how** you record that test, not **whether** you record one — do not substitute UI recommendations for actually recording a trace. UI rec reasoning was already grounded in the upstream blueprints from Task 1 step 1; Task 2's capture-act-capture is for the trace's own assertions, not for retroactively rewriting recommendation reasoning.
+  This pattern produces delta-derived assertions from blueprint diffs. Diff-derived assertions catch state changes more reliably than author-inference — the diff tells you what actually changed on the page so the assertion is grounded in observable state, not in guessing what "success" looks like.
+  Capture-act-capture applies **only** to a UI trace in progress (inside the \`browser_navigate\` + \`browser_*\` interaction block). It does **not** apply to contract, integration, e2e, or batch-scenario test generation — those run on their pre-existing patterns without any capture-act-capture involvement. On mixed PRs with backend + UI work, generate the non-UI tests normally; the only thing that changes is how the UI trace itself is recorded.
+  \`browser_snapshot\` remains the source of ephemeral refs that interaction tools require. \`browser_blueprint\` provides durable semantic identity. Use both when recording: blueprint to decide the target and what "done" looks like; snapshot to get the ref needed to dispatch the click or type.
+  An **action** for this pattern is one user-intent-level operation whose completion changes the app's observable state — a click, a form submit (button or Enter), a navigation, a complete text fill of one field, or a meaningful keyboard shortcut (\`Ctrl+V\` paste, \`Ctrl+A\` select-all followed by a mutation, \`Escape\` to dismiss a modal). **Not** intermediate input mechanics: the individual typed characters \`browser_type\` emits, \`Tab\` between fields, arrow-key highlight in a listbox, focus-only changes. The browser-authentication flow (login) is boilerplate — no capture-act-capture there; login stays on the existing \`browser_snapshot\` + \`browser_type\` + \`browser_click\` pattern.
+  The pattern for each action:
+    1. **Before** the action: \`browser_blueprint\`. Identify the semantic target by \`role\`, \`accessibleName\`, and \`stableId\`/\`testId\`.
+    2. If the target's \`widgetType\` is \`"custom"\` or \`"unknown"\`: \`browser_widget_contract_lookup\` with the element's \`fingerprint\` and \`ref\`. On \`"found"\`, execute the contract steps. On \`"needs_inference"\`, fall through to snapshot-driven trial clicks (\`browser_wait_for\` between retries). Inference-and-cache is out of scope for this slice, so don't attempt to synthesize and cache contracts.
+    3. Execute the action via \`browser_click\` / \`browser_type\` / \`browser_navigate\`. The \`ref\` comes from \`browser_snapshot\` as today.
+    4. **After** the action: \`browser_blueprint\` again. The response shape depends on whether the action navigated:
+       - **Same URL (modal/tab/in-place mutation):** \`{ isFullCapture: false, pageHash, previousPageHash, delta, possibleAssertions }\`. The \`delta\` field contains \`elementsAdded\`, \`elementsRemoved\`, \`textChanges\`, \`repeatingCountChanges\`. The \`possibleAssertions\` field is a mechanical translation of those entries into Playwright \`expect(...)\` candidates — see step 5. You do **not** need to call \`browser_blueprint_diff\` here — that tool is only for cross-URL comparisons. An empty delta (all arrays empty) is itself a meaningful signal: the action did not change observable DOM (e.g. a silent failure the test should catch).
+       - **Navigated to a new URL** (e.g. router transition, link click, programmatic \`browser_navigate\`): \`{ isFullCapture: true, pageHash, blueprint }\` — a fresh full capture of the new page. No \`possibleAssertions\` here (no delta to translate). If you need a structured cross-URL diff, call \`browser_blueprint_diff(beforeBlueprint, afterBlueprint)\` explicitly; otherwise search the new blueprint for the elements your assertion will target.
+    5. **The AFTER response includes a \`possibleAssertions[]\` array — these are mechanical translations of delta entries into Playwright \`expect(...)\` candidates, available if any of them happen to match the assertion you'd write anyway.** Each entry has \`{ code, rationale, tier }\` where \`code\` is ready-to-use, \`rationale\` explains the source delta entry, and \`tier\` is HIGH/MEDIUM/LOW. **Read them, but do not feel obligated to use them.** They are heavily biased toward visibility checks (\`toBeVisible\` / \`not.toBeVisible\`), which are often shallow assertions — a passing visibility check does not mean the feature works. The right assertion target depends on what the test is *for*: if you're testing a state-changing action (form submit, button click that mutates data), prefer assertions on the post-action state (computed values, count changes, server-derived fields). Use a \`possibleAssertions\` candidate when its \`code\` already expresses what you would have written; ignore the array entirely when none of the candidates match the test's actual purpose. Adding visibility assertions just because they're available reduces test value; one well-targeted assertion beats five visibility checks of incidental DOM elements (modal scaffolding, navigation chrome). The pre-existing rule still applies: **at least one \`browser_assert\` per page navigated, verifying a business outcome — not just that an element is visible.**
+  **The Blueprint Citation Invariant applies during recording too.** Every assertion you emit cites element names — those names must come from blueprint captures, not invention. For N user-intent-level actions, the reference target is N+1 \`browser_blueprint\` calls (the first returns full, the rest return deltas). Traces that follow the pattern produce assertions grounded in observable state changes; traces that skip captures fall back to author-inferred assertions and risk citing names that don't exist in the rendered DOM.
+  The rest of the UI workflow stays the same: trace plan, browser auth, navigation, export (\`skyramp_export_zip\`), generation (\`skyramp_ui_test_generation\`), \`skyramp_enhance_assertions\` post-call. Capture-act-capture adds blueprint captures alongside the existing steps; it doesn't replace anything.
 - **E2E**: Only if BOTH a backend trace \`.json\` AND a Playwright \`.zip\` already exist in the repo. Without both, move to \`additionalRecommendations\`.
 - Skip smoke tests entirely.
@@ -267,7 +323,7 @@ Call \`skyramp_submit_report\` with \`summaryOutputFile\`: "${summaryOutputFile}
 - **additionalRecommendations**: AT MOST ${maxRecommendations - maxGenerate} items.
   - For \`testType: "contract"\` entries: **\`primaryEndpoint\` is required** (e.g. \`"GET /api/v1/users/{user_id}"\`). The tool will reject the submission without it — do not omit it or you will be forced to resubmit.
   - For \`testType: "integration"\` or \`"e2e"\` entries: omit \`primaryEndpoint\` — use \`description\` to list the endpoints involved instead.
-- **testMaintenance**: Never leave this as an empty array. If no tests required updates, add exactly one entry explaining why: \`{description: "No existing Skyramp tests found in the repository — 0 tests to maintain."}\` or \`"{N} existing tests checked; all scored IGNORE — no endpoint changes requiring test updates."\` A blank section prevents a full report score.
+- **testMaintenance**: Use \`[]\` **only** if no existing Skyramp tests were found in the repository. If existing tests were found (any score), include one entry per test. For UPDATE/REGENERATE/DELETE tests that were modified and executed, populate all fields from real before/after execution results. For IGNORE-scored tests (not modified or executed), derive \`beforeStatus\` from the \`skyramp_analyze_test_health\` health score (typically \`"Pass"\` if drift score is 0 and no health issues were flagged), set \`afterStatus\` to \`"Skipped"\`, and use \`afterDetails\` to explain why (e.g. "IGNORE: drift score 0 — endpoint not modified in this PR"). Do **not** add entries for tests that were not returned by the health analysis.
 ---
@@ -353,15 +409,19 @@ export function registerTestbotPrompt(server) {
             uiCredentials: z
                 .string()
                 .optional()
-                .describe("Browser login credentials for UI test recording (format: 'username:password', one per line). Injected into the prompt as a <ui-credentials> block so the agent logs in before recording traces."),
+                .describe("Browser login credentials for UI test recording. One credential per line. Supported formats: 'username=<val>;password=<val>' or 'username=<val>;password=<val>;role=<val>' (role optional), or legacy 'username:password'. Note: = and ; are reserved delimiters in the new format and must not appear in values. Injected into the prompt as a <ui-credentials> block so the agent logs in before recording traces."),
             workspaceValidationFailed: z
                 .boolean()
                 .default(false)
                 .describe("Set to true when the testbot detected that .skyramp/workspace.yml exists but failed schema validation. Instructs the agent to regenerate the workspace file before proceeding."),
+            testsRepoDir: z
+                .string()
+                .optional()
+                .describe("Absolute path to a cloned test repository. When set, the agent writes generated test files there instead of the app repository (cross-repo test delivery)."),
         },
     }, async (args) => {
         const services = await readWorkspaceServices(args.repositoryPath);
-        let prompt = getTestbotPrompt(args.prTitle, args.prDescription, args.summaryOutputFile, args.repositoryPath, args.baseBranch, args.maxRecommendations, args.maxGenerate, args.maxCritical, args.prNumber, args.userPrompt, services.length ? services : undefined, args.stateOutputFile, args.uiCredentials);
+        let prompt = getTestbotPrompt(args.prTitle, args.prDescription, args.summaryOutputFile, args.repositoryPath, args.baseBranch, args.maxRecommendations, args.maxGenerate, args.maxCritical, args.prNumber, args.userPrompt, services.length ? services : undefined, args.stateOutputFile, args.uiCredentials, args.testsRepoDir);
         if (args.workspaceValidationFailed) {
             prompt = buildWorkspaceRecoveryPrefix(args.repositoryPath) + prompt;
         }