npm - @skyramp/mcp - Versions diffs - 0.2.0-rc.1 → 0.2.0-rc.2 - Mend

@skyramp/mcp 0.2.0-rc.1 → 0.2.0-rc.2

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (78)

package/build/index.js +4 -2
package/build/prompts/code-reuse.js +106 -7
package/build/prompts/pom-aware-code-reuse.js +106 -7
package/build/prompts/startTraceCollectionPrompts.js +37 -15
package/build/prompts/test-maintenance/drift-analysis-prompt.js +26 -31
package/build/prompts/test-maintenance/drift-analysis-prompt.test.js +40 -1
package/build/prompts/test-maintenance/driftAnalysisSections.js +90 -86
package/build/prompts/test-recommendation/analysisOutputPrompt.js +286 -163
package/build/prompts/test-recommendation/analysisOutputPrompt.test.js +154 -45
package/build/prompts/test-recommendation/diffExecutionPlan.js +215 -117
package/build/prompts/test-recommendation/promptPlan.js +290 -0
package/build/prompts/test-recommendation/promptPlan.test.js +336 -0
package/build/prompts/test-recommendation/recommendationSections.js +3 -1
package/build/prompts/test-recommendation/recommendationShared.js +23 -1
package/build/prompts/test-recommendation/scopeAssessment.js +65 -14
package/build/prompts/test-recommendation/scopeAssessment.test.js +93 -2
package/build/prompts/test-recommendation/test-recommendation-prompt.js +36 -12
package/build/prompts/test-recommendation/test-recommendation-prompt.test.js +222 -1
package/build/prompts/testbot/testbot-prompts.js +18 -62
package/build/prompts/testbot/testbot-prompts.test.js +65 -31
package/build/services/ScenarioGenerationService.js +11 -1
package/build/services/TestExecutionService.js +73 -15
package/build/services/TestExecutionService.test.js +105 -0
package/build/services/TestGenerationService.js +11 -1
package/build/tools/executeSkyrampTestTool.js +1 -10
package/build/tools/test-management/actionsTool.js +152 -63
package/build/tools/test-management/analyzeChangesTool.js +171 -63
package/build/tools/test-management/analyzeChangesTool.test.js +103 -16
package/build/tools/test-management/analyzeTestHealthTool.js +30 -81
package/build/tools/test-management/index.js +1 -0
package/build/tools/test-management/uiAnalyzeChangesTool.js +149 -0
package/build/tools/test-management/uiAnalyzeChangesTool.test.js +100 -0
package/build/tools/trace/resolveSaveStoragePath.js +16 -0
package/build/tools/trace/resolveSaveStoragePath.test.js +17 -0
package/build/tools/trace/resolveSessionPaths.js +39 -0
package/build/tools/trace/resolveSessionPaths.test.js +103 -0
package/build/tools/trace/sessionState.js +14 -0
package/build/tools/trace/sessionState.test.js +17 -0
package/build/tools/trace/startTraceCollectionTool.js +84 -14
package/build/tools/trace/stopTraceCollectionTool.js +9 -2
package/build/types/TestAnalysis.js +50 -0
package/build/types/TestRecommendation.js +6 -58
package/build/types/TestTypes.js +1 -1
package/build/utils/AnalysisStateManager.js +22 -11
package/build/utils/branchDiff.js +11 -2
package/build/utils/docker.test.js +1 -1
package/build/utils/gitStaging.js +52 -3
package/build/utils/gitStaging.test.js +19 -1
package/build/utils/repoScanner.js +18 -10
package/build/utils/repoScanner.test.js +92 -0
package/build/utils/routeParsers.js +168 -25
package/build/utils/routeParsers.test.js +180 -1
package/build/utils/scenarioDrafting.js +220 -17
package/build/utils/scenarioDrafting.test.js +182 -9
package/build/utils/sourceRouteExtractor.js +806 -0
package/build/utils/sourceRouteExtractor.test.js +565 -0
package/build/utils/uiPageEnumerator.js +319 -0
package/build/utils/uiPageEnumerator.test.js +422 -0
package/build/utils/utils.js +27 -0
package/build/utils/versions.js +1 -1
package/build/utils/workspaceAuth.js +33 -4
package/node_modules/playwright/lib/dom-analyzer/blueprint.js +54 -5
package/node_modules/playwright/lib/dom-analyzer/blueprintDiff.js +4 -0
package/node_modules/playwright/lib/dom-analyzer/blueprintDiff.test.js +6 -0
package/node_modules/playwright/lib/dom-analyzer/possibleAssertions.js +150 -0
package/node_modules/playwright/lib/dom-analyzer/possibleAssertions.test.js +470 -0
package/node_modules/playwright/lib/mcp/browser/tab.js +1 -1
package/node_modules/playwright/lib/mcp/browser/tools/pageBlueprint.js +21 -4
package/node_modules/playwright/lib/mcp/browser/tools/pageBlueprint.test.js +3 -0
package/node_modules/playwright/package.json +1 -1
package/node_modules/playwright/skyramp-playwright-1.58.2-skyramp.8.9.4.tgz +0 -0
package/node_modules/playwright/skyramp-playwright-1.58.2-skyramp.8.9.5.tgz +0 -0
package/node_modules/playwright/skyramp-playwright-1.58.2-skyramp.8.9.6.tgz +0 -0
package/package.json +3 -3
package/build/services/TestHealthService.js +0 -694
package/build/services/TestHealthService.test.js +0 -241
package/build/types/TestDriftAnalysis.js +0 -1
package/build/types/TestHealth.js +0 -4

package/build/prompts/testbot/testbot-prompts.js CHANGED Viewed

@@ -6,7 +6,6 @@ import { buildDriftAnalysisPrompt } from "../test-maintenance/drift-analysis-pro
 import { getTraceRecordingPromptText } from "../../playwright/traceRecordingPrompt.js";
 import { isContractConsumerModeEnabled } from "../../utils/featureFlags.js";
 import { resolveServiceDetailsRef } from "../../utils/utils.js";
-import { UI_FILE_GIT_PATHSPEC } from "../../utils/routeParsers.js";
 import { readWorkspaceConfigRaw } from "../../utils/workspaceAuth.js";
 // Cached at module-load — flags are process-wide and cannot change per call.
 const CONSUMER_MODE_ENABLED = isContractConsumerModeEnabled();
@@ -22,14 +21,6 @@ const CONTRACT_MODE_GUIDANCE = CONSUMER_MODE_ENABLED
 export function getTestbotPrompt(prTitle, prDescription, summaryOutputFile, repositoryPath, baseBranch, maxRecommendations = MAX_RECOMMENDATIONS, maxGenerate = MAX_TESTS_TO_GENERATE, _maxCritical = MAX_CRITICAL_TESTS, // Reserved — accepted for API compat but not yet wired into prompt
 prNumber, userPrompt, services, stateOutputFile, uiCredentials, testsRepoDir) {
     maxGenerate = Math.min(Math.max(maxGenerate, 0), maxRecommendations);
-    // Task 1 UI Path — candidate-page strategy section. Two strategies plus
-    // a guaranteed root-fallback. Cost is ~5s when strategies 1 & 2 succeed
-    // (no crawl), more when the root-fallback fires.
-    const uiPathStrategies = `**Lazy mode** (default). Two candidate-page strategies, then a guaranteed root-fallback:
-   1. **Framework route grep** — identify route files under \`app/\`, \`pages/\`, \`routes/\` whose path segments match the changed component's file location.
-   2. **Import-graph walk** — from the changed component's file, walk up import chains to find route entrypoints that import it.
-   3. **Root fallback (always)** — if strategies 1 and 2 produce no candidate pages (common for SPAs without filesystem routing), navigate to the app's root URL (\`/\`) and treat that as the single candidate page. Apply explore-and-discover from there to surface gated UI.`;
     // For follow-up requests: emit the @skyramp-testbot header + guardrails + retrieve-recommendations step.
     // For first-run prompts: emit the full Task 1 analysis + maintenance section.
     const task1Section = userPrompt
@@ -58,53 +49,28 @@ Use those recommendations as your baseline. Only add or remove tests that the us
         : `
 **Incremental mode:** Task 1 handles maintenance of existing tests. Task 2 handles new test generation from the GENERATE list. The two tasks are independent — maintenance completions never reduce the generate budget. Only generate tests for NEW endpoints not already covered by existing bot tests.
-## Task 0: UI Pre-Scan (runs before \`skyramp_analyze_changes\` when UI files are in the diff)
-When UI recommendations will ground in blueprint elements, the agent needs blueprint data in its context before writing any \`reasoning\` fields. This task captures it early so it's available when the recommendation catalog arrives in Task 1.
-**The app is already running** — the eval / Testbot runtime started it before this prompt was issued, and it is reachable at the \`baseUrl\` field of \`.skyramp/workspace.yml\`. Do NOT run \`docker compose up\`, \`yarn start\`, \`npm run dev\`, the \`serverStartCommand\` from workspace.yml, or any other server-bootstrapping command — those will at best be no-ops (the container is already healthy) and at worst hang on a wait loop and consume your time budget. Navigate directly via \`browser_navigate\`. If \`browser_navigate\` fails with a real connection refused, log it in \`issuesFound\` and proceed source-grounded; do not attempt to start the server yourself.
-Check for UI files via: \`Bash git diff ${baseBranch ? baseBranch : 'HEAD~1'} --name-only --diff-filter=AM -- ${UI_FILE_GIT_PATHSPEC}\` from \`${repositoryPath}\`. (Uses git's pathspec filter directly so no grep/rg invocation is needed.) If the command returns nothing, skip this task entirely (backend-only PR) and proceed to Task 1.
-If UI files are found, for each changed UI file enumerate candidate pages using the strategy ladder below, then **take the union** of strategies 1 and 2 (don't stop at the first that yields results). Strategy 3 is the root-URL fallback used only when strategies 1 and 2 both return empty.
-${uiPathStrategies}
-Capture \`browser_blueprint\` on each candidate page from the union. The app is running post-PR; no pre-PR baseline is available.
-**Return shape for \`browser_blueprint\`:** the first call at any URL returns \`{ isFullCapture: true, pageHash, blueprint }\` with the full structural payload. A subsequent call at the *same* URL automatically returns \`{ isFullCapture: false, pageHash, previousPageHash, delta }\` — the delta is computed against your prior capture at that URL. Both shapes are valid and load-bearing; key off \`isFullCapture\` to know which one you got.
-**After the initial capture, verify the changed feature is actually visible in the blueprint.** Search the captured blueprint for any of: the changed component's name as a \`logicalName\` / \`accessibleName\`, its \`testId\` (look for \`data-testid\` patterns derived from the component name), or distinctive class names from the diff. If none appear, the changed feature is likely behind a UI gate — a modal trigger, a dropdown, a tab, an accordion, or a conditional render. In that case:
-  1. Identify the most likely trigger from the route blueprint (a button whose accessibleName matches the feature — "Edit", "Add", "Open", or the component name itself).
-  2. \`browser_click\` the trigger.
-  3. Re-capture \`browser_blueprint\` — the new blueprint should now contain the changed feature's elements.
-  4. If still not visible after one click, log an \`issuesFound\` entry of \`info\` severity describing what you tried and proceed with whatever blueprint data you have. Do NOT iterate more than once per candidate page.
-This is a deliberate, scoped exploration — one click max per candidate page. It exists because route-level blueprints often miss modal/dialog/conditional content, and a recommendation grounded in the empty home page of a route is no better than a source-grounded recommendation.
+<!-- TODO(SKYR-3636 follow-up): migrate Task 1 + Task 2 step bodies to PromptPlan
+     (src/prompts/test-recommendation/promptPlan.ts) so step numbers don't have
+     to be hand-maintained when steps are added or reordered. -->
+## Task 1: Analyze & Maintain
-**Thresholds for how many to capture:**
-- **≤5 candidates:** capture all.
-- **6-15 candidates:** capture all, but note the count in \`issuesFound\` as \`info\` severity so high-fanout cases surface in post-hoc analysis.
-- **>15 candidates:** prioritize by diff proximity and capture the top 15. Ranking: (a) pages whose source imports name the changed component directly, not via re-export chains; (b) route entrypoints over nested layouts; (c) pages in the diff's own route segment if the PR also changes routes.
+1. **Pre-flight UI enumeration.** Call \`skyramp_ui_analyze_changes\` with \`repositoryPath\`: "${repositoryPath}"${uiCredentials ? `, \`uiCredentials\`: <use the value from <ui-credentials> in your context>` : ""}. The response returns \`uiContext\` (\`changedFrontendFiles\`, \`candidateUiPages\`) and capture instructions.
-Token-cost note: blueprint capture is a few hundred ms per page, so 15 pages is ~3-5 seconds of wall-clock. The count cap at 15 prevents pathological cases (shared design-system components imported by 100+ routes) from dominating TestBot runtime. **Missing a candidate is worse than over-capturing within the budget.**
+   **If the response says "No UI changes detected"** → skip ahead to step 2.
-Keep the captured blueprints in your working context through the rest of Task 1 and Task 2. They are the source of truth for UI-test \`reasoning\` fields in Task 1's recommendation catalog (the catalog itself will tell you what format to use).
+   **Otherwise:** for each candidate URL in the response${uiCredentials ? " (after logging in via the credentials provided)" : ""}, \`browser_navigate\` to the URL, then \`browser_blueprint\` to capture. The captures stay in your tool-result history — they're the element vocabulary you'll use when writing UI rec \`reasoning\` fields in step 2. You do NOT need to thread them back into a tool call.
-**Failure fallback:** if the diff check fails, the app is unreachable, or \`browser_blueprint\` fails on every candidate page, skip Task 0 and proceed source-grounded. Log one \`issuesFound\` entry describing the failure mode. Non-UI work is unaffected.
+   If a candidate URL 404s or redirects, navigate from the workspace baseUrl and explore. If \`browser_blueprint\` fails on every candidate, proceed to step 2 and log an \`issuesFound\` info entry — UI recommendations will fall back to source-grounded prose.
-## Task 1: Analyze & Maintain
-1. Call \`skyramp_analyze_changes\` with \`repositoryPath\`: "${repositoryPath}", \`scope\`: "branch_diff", \`topN\`: ${maxRecommendations}, \`maxGenerate\`: ${maxGenerate}${baseBranch ? `, \`baseBranch\`: "${baseBranch}"` : ""}${prNumber ? `, \`prNumber\`: ${prNumber}` : ""}${stateOutputFile ? `, \`stateOutputFile\`: "${stateOutputFile}"` : ""}${testsRepoDir ? `, \`testsRepoDir\`: "${testsRepoDir}"` : ""} — discovers existing Skyramp tests, scans endpoints changed in the diff, loads workspace config, and returns ${maxRecommendations} ranked ADD recommendations (${maxGenerate} to generate, ${maxRecommendations - maxGenerate} as additional).${prNumber ? " Uses PR comment history to avoid re-recommending already-generated tests." : ""}
+2. Call \`skyramp_analyze_changes\` with \`repositoryPath\`: "${repositoryPath}", \`scope\`: "branch_diff", \`topN\`: ${maxRecommendations}, \`maxGenerate\`: ${maxGenerate}${baseBranch ? `, \`baseBranch\`: "${baseBranch}"` : ""}${prNumber ? `, \`prNumber\`: ${prNumber}` : ""}${stateOutputFile ? `, \`stateOutputFile\`: "${stateOutputFile}"` : ""}${testsRepoDir ? `, \`testsRepoDir\`: "${testsRepoDir}"` : ""} — discovers existing Skyramp tests, scans endpoints changed in the diff, loads workspace config, and returns ${maxRecommendations} ranked ADD recommendations${prNumber ? " (using PR comment history to avoid re-recommending already-generated tests)" : ""} along with the UI recommendation authoring rules. Use the blueprints already in your context (from step 1) to ground UI rec reasoning.
    **If \`skyramp_analyze_changes\` returns an error:** retry once only if the error is transient (timeout, network blip, temporary unavailability) — do NOT retry for permanent errors (invalid repository path, missing required parameter, authentication failure). If it fails again, call \`skyramp_submit_report\` with a minimal valid payload: leave all test arrays empty and add the error to \`issuesFound\`. Refer to the \`skyramp_submit_report\` schema for required fields. Do NOT attempt Task 2 without a valid stateFile.
    **If all changed files are non-application** (CI/CD, docs, lock files, config) → skip to Task 3 (Submit Report) with empty arrays and a single \`issuesFound\` entry explaining why (same format as the zero-test path below).
-2. **Maintain existing tests** using the rules in \`<drift_analysis_rules>\` below. For each existing test reported by \`skyramp_analyze_changes\`, score it and choose the action exactly as directed by the Action Decision Matrix in \`<drift_analysis_rules>\`. Only read test files that require action per that matrix — do NOT read files that will be IGNORED. **Do NOT read source files (routers, models, CRUD, components) — all the information you need is in the \`skyramp_analyze_changes\` output and the diff.** When reading multiple test files, **read them all in a single parallel batch** — do NOT read them one at a time. Apply actions directly. Results go in \`testMaintenance\`.
+3. **Maintain existing tests** using the rules in \`<drift_analysis_rules>\` below. For each existing test reported by \`skyramp_analyze_changes\`, score it and choose the action exactly as directed by the Action Decision Matrix in \`<drift_analysis_rules>\`. Only read test files that require action per that matrix — do NOT read files that will be IGNORED. **Do NOT read source files (routers, models, CRUD, components) — all the information you need is in the \`skyramp_analyze_changes\` output and the diff.** When reading multiple test files, **read them all in a single parallel batch** — do NOT read them one at a time. Apply actions directly. Results go in \`testMaintenance\`.
 ${buildDriftAnalysisPrompt({ existingTests: [], scannedEndpoints: [], repositoryPath })}
-3. **Code review:** From the \`skyramp_analyze_changes\` output and the existing test files you read for maintenance, note any logic bugs. Do NOT read additional source files just for code review — use what is already available from the analysis and test file reads. Common patterns to flag:
+4. **Code review:** From the \`skyramp_analyze_changes\` output and the existing test files you read for maintenance, note any logic bugs. Do NOT read additional source files just for code review — use what is already available from the analysis and test file reads. Common patterns to flag:
    - Computed fields not recalculated after mutation (e.g. \`total_amount\` unchanged after items are added/removed)
    - Incomplete CRUD: create without cleanup, update that adds new records without removing old ones
    - Missing input validation on new endpoints
@@ -112,19 +78,7 @@ ${buildDriftAnalysisPrompt({ existingTests: [], scannedEndpoints: [], repository
    - Incorrect arithmetic in business logic (discount calculations, price aggregation)
    Log each finding in \`issuesFound\` with a \`severity\` (critical/high/medium/low). These bugs should inform your test design in Task 2.
-4. **UI Recommendation Grounding** — when UI files are in the diff, Task 0 captured blueprint data for candidate pages. Task 1 step 1's \`skyramp_analyze_changes\` output contains a "UI recommendation grounding" section inside its prompt text — that section defines the exact format for \`reasoning\` fields on \`testType: "ui"\` entries (role/accessibleName/testId/stableId/logicalName tuple, plus contextText for repeating-element rows). Use the captured blueprints from Task 0 to fill in those \`reasoning\` fields exactly as that section specifies.
-   **Blueprint Citation Invariant.** Every UI element you cite by \`role\`, \`accessibleName\`, \`testId\`, \`stableId\`, or \`logicalName\` — in a recommendation's \`reasoning\` field, in a generated test's assertion, or in an \`issuesFound\` entry — must come from a \`browser_blueprint\` call you actually made. The cited string must appear verbatim in a captured blueprint. Seeing related, parent, or sibling elements is NOT the same as seeing the element you want to cite.
-   When a citation isn't yet backed by a blueprint, do ONE of:
-   - **Capture once more.** \`browser_click\` the likely trigger (modal opener, tab, accordion, drawer), then \`browser_blueprint\`. Same-URL re-captures auto-diff against your prior call — the response's \`delta.elementsAdded\` is where the new element should appear.
-   - **Drop the citation.** Rewrite without the unverified tuple — source-grounded prose is fine. Add an \`issuesFound\` entry of \`info\` severity: \`"Blueprint capture missed <element name>; recommendation grounded in source diff only"\`.
-   Do not fabricate tuple values from the PR description, source diff, or component name. A fabricated tuple looks like grounding but isn't — and downstream test generation will emit assertions against names that don't exist in the rendered DOM.
-   **Non-UI entries (contract / integration / e2e / batch-scenario) are unaffected.** Their \`reasoning\` fields use the pre-existing formats — endpoint paths, request/response schemas, fixture chains. Do not reformat them.
-   **Failure fallback:** if Task 0 ran but the captured inventory is empty for a candidate page (e.g. pre-scan picked the wrong page), or if Task 0 logged a failure, UI entries fall back to source-grounded reasoning. Legitimate outcome.
+5. **Apply the UI Recommendation Authoring Rules.** \`skyramp_analyze_changes\` returns an authoring-rules section that defines how UI recommendation \`reasoning\` fields should be written (natural prose, no internal-identifier syntax, ground in elements observed via earlier \`browser_blueprint\` calls, fall back to source-grounded prose when no captures are available). Apply those rules when authoring UI rec reasoning. Non-UI recommendations (contract / integration / e2e / batch-scenario) are unaffected by these rules and use their pre-existing formats — do not reformat them.
 ---`;
     const serviceContext = services?.length ? buildServiceContext(services) : '';
@@ -271,9 +225,9 @@ ${CONTRACT_MODE_GUIDANCE}
   **Capture-act-capture (applies only when recording a UI trace):**
-  **Skip this entire section if Task 0's UI pre-scan found no UI files** (backend-only PR). The capture-act-capture pattern is for UI trace recording only — there's no UI trace to record on a backend-only PR. Continue to the non-UI test-type instructions below.
+  **Skip this entire section if \`uiContext\` was absent or \`changedFrontendFiles\` was empty in step 1's response** (backend-only PR). The capture-act-capture pattern is for UI trace recording only — there's no UI trace to record on a backend-only PR. Continue to the non-UI test-type instructions below.
-  **Reminder — the UI test priority rule above still applies.** If the diff contains frontend/UI changes, you still MUST attempt to generate at least one UI test. Capture-act-capture is **how** you record that test, not **whether** you record one — do not substitute UI recommendations for actually recording a trace. Task 1 step 4 produced the grounded recommendations; Task 2 implements one or more of them.
+  **Reminder — the UI test priority rule above still applies.** If the diff contains frontend/UI changes, you still MUST attempt to generate at least one UI test. Capture-act-capture is **how** you record that test, not **whether** you record one — do not substitute UI recommendations for actually recording a trace. UI rec reasoning was already grounded in the upstream blueprints from Task 1 step 1; Task 2's capture-act-capture is for the trace's own assertions, not for retroactively rewriting recommendation reasoning.
   This pattern produces delta-derived assertions from blueprint diffs. Diff-derived assertions catch state changes more reliably than author-inference — the diff tells you what actually changed on the page so the assertion is grounded in observable state, not in guessing what "success" looks like.
@@ -291,9 +245,11 @@ ${CONTRACT_MODE_GUIDANCE}
     3. Execute the action via \`browser_click\` / \`browser_type\` / \`browser_navigate\`. The \`ref\` comes from \`browser_snapshot\` as today.
-    4. **After** the action: \`browser_blueprint\` again. **The response IS the diff** — because you already captured at this URL in step 1, the second call returns \`{ isFullCapture: false, pageHash, previousPageHash, delta }\`. The \`delta\` field already contains \`elementsAdded\`, \`elementsRemoved\`, \`textChanges\`, \`repeatingCountChanges\`, \`urlChange\`. You do **not** need to call \`browser_blueprint_diff\` for same-URL captures — that tool is only for cross-URL comparisons. An empty delta (all arrays empty) is itself a meaningful signal: the action did not change observable DOM (e.g. a silent failure the test should catch).
+    4. **After** the action: \`browser_blueprint\` again. The response shape depends on whether the action navigated:
+       - **Same URL (modal/tab/in-place mutation):** \`{ isFullCapture: false, pageHash, previousPageHash, delta, possibleAssertions }\`. The \`delta\` field contains \`elementsAdded\`, \`elementsRemoved\`, \`textChanges\`, \`repeatingCountChanges\`. The \`possibleAssertions\` field is a mechanical translation of those entries into Playwright \`expect(...)\` candidates — see step 5. You do **not** need to call \`browser_blueprint_diff\` here — that tool is only for cross-URL comparisons. An empty delta (all arrays empty) is itself a meaningful signal: the action did not change observable DOM (e.g. a silent failure the test should catch).
+       - **Navigated to a new URL** (e.g. router transition, link click, programmatic \`browser_navigate\`): \`{ isFullCapture: true, pageHash, blueprint }\` — a fresh full capture of the new page. No \`possibleAssertions\` here (no delta to translate). If you need a structured cross-URL diff, call \`browser_blueprint_diff(beforeBlueprint, afterBlueprint)\` explicitly; otherwise search the new blueprint for the elements your assertion will target.
-    5. For each delta entry worth verifying, \`browser_assert\` whose target and expectation come from the delta. Example: delta reports \`repeatingCountChanges\` (\`view_details_for_order_btn\`: 12 → 13) → \`browser_assert\` on \`toHaveCount(13)\` against the repeating element's \`accessibleNameTemplate\`. The existing "at least one \`browser_assert\` per page navigated" rule still applies; the delta will naturally surface ≥1 assertable signal per action.
+    5. **The AFTER response includes a \`possibleAssertions[]\` array — these are mechanical translations of delta entries into Playwright \`expect(...)\` candidates, available if any of them happen to match the assertion you'd write anyway.** Each entry has \`{ code, rationale, tier }\` where \`code\` is ready-to-use, \`rationale\` explains the source delta entry, and \`tier\` is HIGH/MEDIUM/LOW. **Read them, but do not feel obligated to use them.** They are heavily biased toward visibility checks (\`toBeVisible\` / \`not.toBeVisible\`), which are often shallow assertions — a passing visibility check does not mean the feature works. The right assertion target depends on what the test is *for*: if you're testing a state-changing action (form submit, button click that mutates data), prefer assertions on the post-action state (computed values, count changes, server-derived fields). Use a \`possibleAssertions\` candidate when its \`code\` already expresses what you would have written; ignore the array entirely when none of the candidates match the test's actual purpose. Adding visibility assertions just because they're available reduces test value; one well-targeted assertion beats five visibility checks of incidental DOM elements (modal scaffolding, navigation chrome). The pre-existing rule still applies: **at least one \`browser_assert\` per page navigated, verifying a business outcome — not just that an element is visible.**
   **The Blueprint Citation Invariant applies during recording too.** Every assertion you emit cites element names — those names must come from blueprint captures, not invention. For N user-intent-level actions, the reference target is N+1 \`browser_blueprint\` calls (the first returns full, the rest return deltas). Traces that follow the pattern produce assertions grounded in observable state changes; traces that skip captures fall back to author-inferred assertions and risk citing names that don't exist in the rendered DOM.

package/build/prompts/testbot/testbot-prompts.test.js CHANGED Viewed

@@ -203,6 +203,8 @@ describe("uiCredentials in getTestbotPrompt", () => {
     });
 });
 describe("drift analysis inline embedding", () => {
+    beforeAll(() => { process.env.SKYRAMP_FEATURE_TESTBOT = "1"; });
+    afterAll(() => { delete process.env.SKYRAMP_FEATURE_TESTBOT; });
     function basePrompt() {
         return getTestbotPrompt(baseArgs.prTitle, baseArgs.prDescription, baseArgs.summaryOutputFile, baseArgs.repositoryPath);
     }
@@ -226,44 +228,58 @@ describe("drift analysis inline embedding", () => {
         expect(rulesPos).toBeGreaterThan(task1Pos);
         expect(rulesPos).toBeLessThan(task2Pos);
     });
-    it("Task 1 step 2 prose references drift_analysis_rules tag", () => {
+    it("Task 1 step 3 prose references drift_analysis_rules tag", () => {
         const prompt = basePrompt();
         expect(prompt).toContain("rules in `<drift_analysis_rules>`");
     });
 });
-describe("Task 0 UI pre-scan (Phase C D-1.a)", () => {
-    it("emits the Task 0 UI pre-scan section in full-analysis mode", () => {
+describe("UI grounding via Task 2 capture-act-capture", () => {
+    it("surfaces uiContext as guidance, not a contract", () => {
         const prompt = getTestbotPrompt(baseArgs.prTitle, baseArgs.prDescription, baseArgs.summaryOutputFile, baseArgs.repositoryPath);
-        expect(prompt).toContain("## Task 0: UI Pre-Scan");
-        expect(prompt).toContain("-- '*.tsx' '*.jsx' '*.vue' '*.svelte' '*.html' '*.xml'");
-        expect(prompt).toContain("browser_blueprint");
-    });
-    it("places Task 0 before Task 1 (skyramp_analyze_changes)", () => {
+        // uiContext fields are explained inline so the agent knows what to do with
+        // them. Step 1 provides candidate URLs but gives fallback instructions
+        // ("navigate from the workspace baseUrl and explore") for 404s/redirects,
+        // treating candidates as guidance not a rigid contract.
+        expect(prompt).toContain("uiContext");
+        expect(prompt).toContain("candidateUiPages");
+        expect(prompt).toContain("changedFrontendFiles");
+        expect(prompt).toMatch(/navigate from the workspace baseUrl and explore/i);
+    });
+    it("step 5 enforces Blueprint Citation Invariant in natural prose", () => {
+        const prompt = getTestbotPrompt(baseArgs.prTitle, baseArgs.prDescription, baseArgs.summaryOutputFile, baseArgs.repositoryPath);
+        // Step 5 is the citation-invariant guardrail, not a "fill in tuples"
+        // post-processing step (slice 4 cleanup: recs are grounded upstream).
+        expect(prompt).toContain("Blueprint Citation Invariant");
+        // Reasoning must be natural prose, NOT internal-identifier syntax.
+        expect(prompt).toMatch(/natural prose/i);
+        expect(prompt).toMatch(/internal-identifier syntax/i);
+    });
+    it("Task 2 no longer instructs the agent to fill in tuples post-hoc", () => {
         const prompt = getTestbotPrompt(baseArgs.prTitle, baseArgs.prDescription, baseArgs.summaryOutputFile, baseArgs.repositoryPath);
-        const task0Idx = prompt.indexOf("## Task 0: UI Pre-Scan");
-        const task1Idx = prompt.indexOf("## Task 1: Analyze & Maintain");
-        expect(task0Idx).toBeGreaterThanOrEqual(0);
-        expect(task1Idx).toBeGreaterThan(task0Idx);
-    });
-    it("does not emit Task 0 in follow-up mode (userPrompt set)", () => {
-        // Call signature (14 positional args): prTitle, prDescription, summaryOutputFile,
-        // repositoryPath, baseBranch?, maxRecommendations?, maxGenerate?, _maxCritical?,
-        // prNumber?, userPrompt?, services?, stateOutputFile?, uiCredentials?, testsRepoDir?
-        const prompt = getTestbotPrompt(baseArgs.prTitle, baseArgs.prDescription, baseArgs.summaryOutputFile, baseArgs.repositoryPath, undefined, undefined, undefined, undefined, undefined, "add more tests");
-        expect(prompt).not.toContain("## Task 0: UI Pre-Scan");
-    });
-    it("Task 1 step 4 references Task 0; does not re-specify format", () => {
+        // After slice 4 cleanup: Task 2 captures are for trace recording's own
+        // assertions, not for retroactively rewriting recommendation reasoning.
+        // The phrase "fill in tuples" must NOT appear anywhere in the prompt.
+        expect(prompt).not.toMatch(/fill in tuples/i);
+        expect(prompt).not.toMatch(/return to step 5 and fill/i);
+    });
+    it("Task 2 step 5 mentions possibleAssertions as available, NOT as required (slice 5.5 softening)", () => {
         const prompt = getTestbotPrompt(baseArgs.prTitle, baseArgs.prDescription, baseArgs.summaryOutputFile, baseArgs.repositoryPath);
-        expect(prompt).toContain("UI Recommendation Grounding");
-        expect(prompt).toMatch(/UI recommendation grounding.*section/);
-        expect(prompt).toContain("Use the captured blueprints from Task 0");
-        // Negative half of test name: step 4 must NOT re-specify the tuple
-        // format (that lives in step 1's grounding section; duplicating would
-        // reintroduce the contradictory-instruction problem Phase C D-1.a fixes).
-        const step4Start = prompt.indexOf("4. **UI Recommendation Grounding**");
-        const step4End = prompt.indexOf("Legitimate outcome.", step4Start);
-        const step4Slice = prompt.slice(step4Start, step4End);
-        expect(step4Slice).not.toMatch(/role=<role>,\s*accessibleName=/);
+        // Slice 5: AFTER-action browser_blueprint response includes
+        // possibleAssertions[] — mechanically translated candidates.
+        expect(prompt).toContain("possibleAssertions");
+        // Slice 5.5: prompt explicitly tells the agent NOT to feel obligated.
+        // Two P09 runs with the prior "emit at least one" directive showed the
+        // agent over-using shallow visibility assertions at the expense of
+        // integration-test depth. The softened version says: read them, use
+        // when they happen to match what you'd write anyway, ignore otherwise.
+        expect(prompt).toMatch(/do not feel obligated/i);
+        expect(prompt).toMatch(/biased toward visibility/i);
+        // The candidate format is still documented.
+        expect(prompt).toMatch(/\bcode\b.*\brationale\b.*\btier\b/i);
+        // The pre-existing "at least one browser_assert per page navigated"
+        // rule should be preserved (it's about meaningful business-outcome
+        // assertions, not about possibleAssertions).
+        expect(prompt).toMatch(/at least one .browser_assert. per page navigated/i);
     });
 });
 describe("buildWorkspaceRecoveryPrefix", () => {
@@ -327,3 +343,21 @@ describe("testsRepoDir in getTestbotPrompt", () => {
         expect(prompt).not.toContain("testsRepoDir");
     });
 });
+describe("testbot prompt blueprint-grounded recommendations (slice 4)", () => { // Ordering and wording checks on the string returned by getTestbotPrompt.
+    it("instructs the agent to call skyramp_ui_analyze_changes before skyramp_analyze_changes", () => {
+        const prompt = getTestbotPrompt(baseArgs.prTitle, baseArgs.prDescription, baseArgs.summaryOutputFile, baseArgs.repositoryPath);
+        const uiacIdx = prompt.indexOf("skyramp_ui_analyze_changes"); // first mention of the UI-specific tool
+        const acIdx = prompt.indexOf("skyramp_analyze_changes"); // first mention of the general tool; its name is not a substring of the UI tool's name, so the two searches cannot alias
+        expect(uiacIdx).toBeGreaterThan(-1); // the UI tool must be mentioned at all
+        expect(acIdx).toBeGreaterThan(uiacIdx); // general tool must also be present, and first appear after the UI tool's first mention
+    });
+    it("Task 1 step 1 instructs the agent to capture blueprints (without threading them through a param)", () => {
+        const prompt = getTestbotPrompt(baseArgs.prTitle, baseArgs.prDescription, baseArgs.summaryOutputFile, baseArgs.repositoryPath);
+        // Captures stay in tool-result history; analyze_changes returns the
+        // authoring rules and the agent supplies the captured vocabulary.
+        expect(prompt).toMatch(/browser_blueprint`?\s*to capture/i); // optional closing backtick tolerates the tool name being wrapped in markdown code formatting
+        expect(prompt).toMatch(/tool-result history/i);
+        // Make sure we removed the old capturedBlueprints threading directive.
+        expect(prompt).not.toMatch(/capturedBlueprints/); // case-sensitive: only the exact identifier is forbidden
+    });
+});

package/build/services/ScenarioGenerationService.js CHANGED Viewed

@@ -2,7 +2,8 @@ import { AUTH_PLACEHOLDER_TOKEN } from "../types/TestTypes.js";
 import { isAuthorizationHeaderName } from "../utils/workspaceAuth.js";
 import { inferExpectedStatus } from "../utils/httpDefaults.js";
 import { logger } from "../utils/logger.js";
-import { stageGeneratedPaths } from "../utils/gitStaging.js";
+import { stageGeneratedPaths, resolveOutputDir } from "../utils/gitStaging.js";
+import { getTestsRepoDir } from "../utils/AnalysisStateManager.js";
 import fs from "fs";
 import path from "path";
 // Keys that trigger built-in prototype setters when used as bracket-notation
@@ -27,6 +28,15 @@ export class ScenarioGenerationService {
                     isError: true,
                 };
             }
+            // In cross-repo mode, redirect outputDir to the test repo clone.
+            const resolved = resolveOutputDir(params.outputDir, getTestsRepoDir());
+            if (resolved !== params.outputDir) {
+                logger.info("Cross-repo: redirecting scenario outputDir to test repo", {
+                    original: params.outputDir,
+                    redirected: resolved,
+                });
+                params.outputDir = resolved;
+            }
             const scenarioName = params.scenarioName.replace(/ /g, "-").toLowerCase();
             const fileName = `scenario_${scenarioName}.json`;
             const filePath = path.join(params.outputDir, fileName);

package/build/services/TestExecutionService.js CHANGED Viewed

@@ -156,14 +156,25 @@ function filterComments(lines) {
  * Detect session file paths referenced in test files
  * Looks for storageState patterns in TypeScript/JavaScript/Python/Java/C# test files
  * Excludes matches found in comments
+ *
+ * Also handles the codegen pattern `path.join(__dirname, '<filename>')` (TS/JS) —
+ * the filename is resolved relative to the test file's directory on the host so
+ * the existing absolute-path mount branch makes it visible at the same path
+ * inside the container (Playwright's TS loader resolves __dirname to the host
+ * workspace path at runtime).
  */
-function detectSessionFiles(testFilePath) {
+export function detectSessionFiles(testFilePath) {
     try {
         const content = fs.readFileSync(testFilePath, "utf-8");
         const lines = content.split("\n");
         const sessionFiles = [];
         // Pattern for TypeScript/JavaScript: storageState: '/path/to/file' or storageState: "/path/to/file"
         const tsJsPattern = /storageState:\s*['"]([^'"]+)['"]/g;
+        // Pattern for TypeScript/JavaScript with path.join(__dirname, 'filename') — covers
+        // both the inline form (`storageState: path.join(__dirname, '...')`) and the
+        // variable-assignment form (`const X = path.join(__dirname, '...')` then
+        // `storageState: X`) the skyramp codegen emits.
+        const tsJsPathJoinPattern = /path\.join\s*\(\s*__dirname\s*,\s*['"]([^'"]+)['"]\s*\)/g;
         // Pattern for Python: storage_state='/path/to/file' or storage_state="/path/to/file"
         const pythonPattern = /storage_state\s*=\s*['"]([^'"]+)['"]/g;
         // Pattern for Java: setStorageState(Paths.get("path")) or setStorageState("path")
@@ -173,6 +184,7 @@ function detectSessionFiles(testFilePath) {
         const csharpPattern = /StorageState(?:Path)?\s*=\s*['"]([^'"]+)['"]/g;
         // Filter out comments
         const codeLines = filterComments(lines);
+        const testFileDir = path.dirname(testFilePath);
         // Process each non-comment line
         for (const line of codeLines) {
             // Try all patterns on this line
@@ -181,6 +193,12 @@ function detectSessionFiles(testFilePath) {
             while ((match = tsJsPattern.exec(line)) !== null) {
                 sessionFiles.push(match[1]);
             }
+            tsJsPathJoinPattern.lastIndex = 0;
+            while ((match = tsJsPathJoinPattern.exec(line)) !== null) {
+                // Resolve relative to the test file's host directory so the absolute-
+                // path branch below mounts it at the same path inside the container.
+                sessionFiles.push(path.resolve(testFileDir, match[1]));
+            }
             pythonPattern.lastIndex = 0;
             while ((match = pythonPattern.exec(line)) !== null) {
                 sessionFiles.push(match[1]);
@@ -357,39 +375,79 @@ export class TestExecutionService {
                 },
             ],
         };
-        // Mount workspace files, skipping EXCLUDED_MOUNT_ITEMS completely
+        // Mount workspace files, skipping EXCLUDED_MOUNT_ITEMS completely.
+        //
+        // Each workspace entry is bind-mounted at BOTH the canonical /home/user
+        // path AND its host-absolute path. The dual mount lets the test resolve
+        // any absolute reference the codegen happens to embed (storageState,
+        // fixture paths, snapshots) — including the host workspace path that
+        // Playwright's TypeScript loader sometimes produces from `__dirname` —
+        // without needing source-code detection. EXCLUDED_MOUNT_ITEMS
+        // (node_modules) stays excluded at both targets; MOUNT_NULL_ITEMS
+        // shadows (package.json → empty JSON, etc.) and PLAYWRIGHT_CONFIG_FILES
+        // shadows (minimal config) are applied at both targets too so the
+        // protections survive regardless of which path the test resolves to.
         const workspaceFiles = fs.readdirSync(workspacePath);
         const filesToMount = workspaceFiles.filter((file) => !EXCLUDED_MOUNT_ITEMS.includes(file) && !MOUNT_NULL_ITEMS.includes(file));
-        hostConfig.Mounts?.push(...filesToMount.map((file) => ({
-            Type: "bind",
-            Target: path.join(containerMountPath, file),
-            Source: path.join(workspacePath, file),
-        })));
+        // Single Set tracks every mount target we've added so far. Used to dedupe
+        // both the workspace-mirror push (when workspacePath happens to equal
+        // containerMountPath) and the session-file push below.
+        const mountedPaths = new Set();
+        const pushMount = (mount) => { // Appends `mount` to hostConfig.Mounts unless a mount with the same Target was already added; the first mount for a Target wins.
+            if (mountedPaths.has(mount.Target))
+                return; // Target already recorded — drop the duplicate silently.
+            mountedPaths.add(mount.Target); // Remember the Target before pushing so later calls are deduped.
+            hostConfig.Mounts.push(mount);
+        };
+        const mirrorAtHostPath = workspacePath !== containerMountPath;
+        for (const file of filesToMount) {
+            const source = path.join(workspacePath, file);
+            pushMount({
+                Type: "bind",
+                Target: path.join(containerMountPath, file),
+                Source: source,
+            });
+            if (mirrorAtHostPath) {
+                pushMount({ Type: "bind", Target: source, Source: source });
+            }
+        }
         // Mount MOUNT_NULL_ITEMS (found recursively) to /dev/null (or empty JSON for .json files)
         const nullPaths = findExcludedPaths(workspacePath, MOUNT_NULL_ITEMS);
         for (const absolutePath of nullPaths) {
-            const target = path.join(containerMountPath, path.relative(workspacePath, absolutePath));
+            const rel = path.relative(workspacePath, absolutePath);
             const source = absolutePath.endsWith(".json") ? EMPTY_JSON_PATH : "/dev/null";
-            hostConfig.Mounts?.push({
+            pushMount({
                 Type: "bind",
                 Source: source,
-                Target: target,
+                Target: path.join(containerMountPath, rel),
             });
+            if (mirrorAtHostPath) {
+                pushMount({ Type: "bind", Source: source, Target: absolutePath });
+            }
         }
         // Mount Playwright config files with minimal config (shadows repo configs that may
         // import dotenv or other dependencies not available in the executor container)
         const playwrightConfigPaths = findExcludedPaths(workspacePath, PLAYWRIGHT_CONFIG_FILES);
         for (const absolutePath of playwrightConfigPaths) {
-            const target = path.join(containerMountPath, path.relative(workspacePath, absolutePath));
-            hostConfig.Mounts?.push({
+            const rel = path.relative(workspacePath, absolutePath);
+            pushMount({
                 Type: "bind",
                 Source: MINIMAL_PLAYWRIGHT_CONFIG_PATH,
-                Target: target,
+                Target: path.join(containerMountPath, rel),
             });
+            if (mirrorAtHostPath) {
+                pushMount({
+                    Type: "bind",
+                    Source: MINIMAL_PLAYWRIGHT_CONFIG_PATH,
+                    Target: absolutePath,
+                });
+            }
         }
-        // Detect and mount session files
+        // Detect and mount session files referenced outside the workspace
+        // (anything inside the workspace is already covered by the dual mount
+        // above; the session-file loop is the safety net for tests that point
+        // at a session in some other directory).
         const sessionFiles = detectSessionFiles(options.testFile);
-        const mountedPaths = new Set(); // Track mounted file paths to prevent duplicates
         for (const sessionFile of sessionFiles) {
             let sessionFileSource;
             let sessionFileTarget;

package/build/services/TestExecutionService.test.js CHANGED Viewed

@@ -142,6 +142,49 @@ describe("buildContainerEnv", () => {
         expect(env).toContain("API_KEY=my-key");
     });
 });
+describe("detectSessionFiles", () => { // Unit tests for the exported detectSessionFiles(testFilePath); file contents are supplied through the fs.readFileSync mock.
+    // Import after mocks are set up so the fs mock applies
+    let detectSessionFiles;
+    let mockReadFileSync;
+    beforeAll(async () => {
+        const mod = await import("./TestExecutionService.js"); // dynamic import so the module is loaded only when this hook runs
+        detectSessionFiles = mod.detectSessionFiles;
+        // eslint-disable-next-line @typescript-eslint/no-require-imports
+        mockReadFileSync = require("fs").readFileSync; // presumably the jest-mocked readFileSync (mock setup is outside this view) — it must expose mockReturnValueOnce
+    });
+    it("detects string-literal storageState (TS/JS)", () => {
+        mockReadFileSync.mockReturnValueOnce(`test.use({ storageState: '/abs/path/session.json' });`); // fake file content for the next read only
+        expect(detectSessionFiles("/ws/test.spec.ts")).toEqual([ // quoted literal is returned verbatim, not resolved against the test file's directory
+            "/abs/path/session.json",
+        ]);
+    });
+    it("detects skyramp codegen path.join(__dirname, '<file>') pattern and resolves to host-absolute path", () => {
+        // Reproduces SKYR-3321 generated test shape — must resolve to the host
+        // absolute path so the executor's absolute-path mount branch makes the
+        // file visible at that same path inside the container.
+        mockReadFileSync.mockReturnValueOnce(`
+import path from 'path';
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+const SESSION_STORAGE = path.join(__dirname, 'skyramp_session_storage.json');
+test.use({ storageState: SESSION_STORAGE });
+`);
+        expect(detectSessionFiles("/Users/pedro/projects/cisco-xdr-tests/xdr_dashboard.spec.ts")).toEqual([ // exactly one entry: the path.join filename resolved against the spec's directory; the unquoted `storageState: SESSION_STORAGE` adds nothing
+            "/Users/pedro/projects/cisco-xdr-tests/skyramp_session_storage.json",
+        ]);
+    });
+    it("detects inline storageState: path.join(__dirname, '<file>')", () => {
+        mockReadFileSync.mockReturnValueOnce(`test.use({ storageState: path.join(__dirname, 'session.json') });`); // inline form: path.join appears directly as the storageState value
+        expect(detectSessionFiles("/ws/spec.ts")).toEqual(["/ws/session.json"]); // resolved relative to /ws, the directory of the test file argument
+    });
+    it("ignores storageState references inside comments", () => {
+        mockReadFileSync.mockReturnValueOnce(`
+// storageState: '/should/not/match.json'
+// path.join(__dirname, 'also-not.json')
+test('x', () => {});
+`);
+        expect(detectSessionFiles("/ws/spec.ts")).toEqual([]); // both patterns occur only on commented-out lines, so nothing is reported
+    });
+});
 describe("TestExecutionService.executeTest - Docker env forwarding", () => {
     // Import after mocks are set up
     let TestExecutionService;
@@ -210,4 +253,66 @@ describe("TestExecutionService.executeTest - Docker env forwarding", () => {
             e.startsWith("SKYRAMP_TEST_SERVICE_URL_"));
         expect(envWithBaseUrl).toHaveLength(0);
     });
+    // Approach B: every workspace mount is mirrored at the host-absolute path so
+    // tests that embed absolute references (storageState, fixtures, snapshots)
+    // resolve correctly inside the executor regardless of which path-shape the
+    // codegen happens to emit.
+    it("mirrors each workspace file mount at both /home/user/<f> and the host-absolute path", async () => {
+        // eslint-disable-next-line @typescript-eslint/no-require-imports
+        const fs = require("fs");
+        fs.readdirSync.mockImplementation((_path, options) => { // stub every directory listing: Dirent-like objects when withFileTypes is requested, bare names otherwise
+            if (options?.withFileTypes) {
+                return [
+                    { name: "xdr_dashboard.spec.ts", isFile: () => true, isDirectory: () => false },
+                    { name: "skyramp_session_storage.json", isFile: () => true, isDirectory: () => false },
+                ];
+            }
+            return ["xdr_dashboard.spec.ts", "skyramp_session_storage.json"];
+        });
+        const mockContainer = { remove: jest.fn().mockResolvedValue(undefined) };
+        mockRun.mockResolvedValue([{ StatusCode: 0 }, mockContainer]); // simulated run result: status object with StatusCode 0 plus the fake container
+        const service = new TestExecutionService();
+        await service.executeTest({
+            testFile: "/Users/pedro/projects/cisco-xdr-tests/xdr_dashboard.spec.ts",
+            workspacePath: "/Users/pedro/projects/cisco-xdr-tests", // differs from /home/user, so the host-absolute mirror is expected
+            language: "typescript",
+            testType: "ui",
+        });
+        const dockerOptions = mockRun.mock.calls[0][3]; // fourth argument of the first recorded mockRun call — the object carrying HostConfig
+        const targets = dockerOptions.HostConfig.Mounts.map((m) => m.Target); // only the in-container Target paths are asserted on
+        // Canonical /home/user mount
+        expect(targets).toContain("/home/user/xdr_dashboard.spec.ts");
+        expect(targets).toContain("/home/user/skyramp_session_storage.json");
+        // Host-absolute mirror — the fix for absolute paths leaking out of `__dirname`
+        expect(targets).toContain("/Users/pedro/projects/cisco-xdr-tests/xdr_dashboard.spec.ts");
+        expect(targets).toContain("/Users/pedro/projects/cisco-xdr-tests/skyramp_session_storage.json");
+    });
+    it("does not double-mount when workspacePath equals /home/user", async () => {
+        // eslint-disable-next-line @typescript-eslint/no-require-imports
+        const fs = require("fs");
+        fs.readdirSync.mockImplementation((_path, options) => { // single-file workspace: Dirent-like object when withFileTypes is requested, bare name otherwise
+            if (options?.withFileTypes) {
+                return [{ name: "test_file.py", isFile: () => true, isDirectory: () => false }];
+            }
+            return ["test_file.py"];
+        });
+        const mockContainer = { remove: jest.fn().mockResolvedValue(undefined) };
+        mockRun.mockResolvedValue([{ StatusCode: 0 }, mockContainer]);
+        const service = new TestExecutionService();
+        await service.executeTest({
+            testFile: "/home/user/test_file.py",
+            workspacePath: "/home/user", // same as the canonical container path, so canonical and mirror targets would coincide
+            language: "python",
+            testType: "smoke",
+        });
+        const dockerOptions = mockRun.mock.calls[0][3]; // NOTE(review): reads the FIRST recorded call — relies on mockRun being cleared between tests; confirm a beforeEach resets it
+        const targetCounts = {}; // Target path -> number of mounts using it
+        for (const m of dockerOptions.HostConfig.Mounts) {
+            targetCounts[m.Target] = (targetCounts[m.Target] ?? 0) + 1;
+        }
+        // No mount target should appear twice (no host-absolute mirror when workspace == /home/user)
+        for (const [t, n] of Object.entries(targetCounts)) {
+            expect({ target: t, count: n }).toEqual({ target: t, count: 1 }); // target is included in both objects so a failure message names the duplicated path
+        }
+    });
 });

package/build/services/TestGenerationService.js CHANGED Viewed

@@ -8,7 +8,8 @@ import { getEntryPoint } from "../utils/telemetry.js";
 import { getLanguageSteps } from "../utils/language-helper.js";
 import { logger } from "../utils/logger.js";
 import { normalizeLanguageParams } from "../utils/normalizeParams.js";
-import { stageGeneratedPaths } from "../utils/gitStaging.js";
+import { stageGeneratedPaths, resolveOutputDir } from "../utils/gitStaging.js";
+import { getTestsRepoDir } from "../utils/AnalysisStateManager.js";
 export class TestGenerationService {
     client;
     constructor() {
@@ -18,6 +19,15 @@ export class TestGenerationService {
         try {
             // Normalize language/framework to handle LLM case variations
             normalizeLanguageParams(params);
+            // In cross-repo mode, redirect outputDir to the test repo clone.
+            const resolved = resolveOutputDir(params.outputDir, getTestsRepoDir());
+            if (resolved !== params.outputDir) {
+                logger.info("Cross-repo: redirecting outputDir to test repo", {
+                    original: params.outputDir,
+                    redirected: resolved,
+                });
+                params.outputDir = resolved;
+            }
             // Log prompt parameter using reusable utility
             logger.info("Generating test", {
                 prompt: params.prompt,

package/build/tools/executeSkyrampTestTool.js CHANGED Viewed

@@ -147,16 +147,7 @@ For detailed documentation visit: https://www.skyramp.dev/docs/quickstart`,
                     if (stateData && stateData.existingTests) {
                         const testIndex = stateData.existingTests.findIndex((t) => t.testFile === params.testFile);
                         if (testIndex >= 0) {
-                            stateData.existingTests[testIndex].execution = {
-                                passed: result.passed,
-                                duration: result.duration || 0,
-                                errors: result.errors || [],
-                                warnings: result.warnings || [],
-                                crashed: result.crashed || false,
-                                stdout: result.output || "",
-                                stderr: result.errors?.join("\n") || "",
-                                executionTimestamp: new Date().toISOString(),
-                            };
+                            stateData.existingTests[testIndex].execution = result;
                             await stateManager.writeData(stateData);
                             logger.info(`Updated stateFile with execution results for ${params.testFile}`);
                         }