@skyramp/mcp 0.2.3 → 0.2.5-rc.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. package/build/playwright/registerPlaywrightTools.js +21 -25
  2. package/build/playwright/traceRecordingPrompt.js +2 -2
  3. package/build/prompts/test-maintenance/actionsInstructions.js +60 -0
  4. package/build/prompts/test-maintenance/drift-analysis-prompt.js +18 -101
  5. package/build/prompts/test-maintenance/driftAnalysisSections.js +210 -171
  6. package/build/prompts/test-recommendation/analysisOutputPrompt.js +1 -1
  7. package/build/prompts/test-recommendation/diffExecutionPlan.js +4 -3
  8. package/build/prompts/test-recommendation/recommendationSections.js +6 -6
  9. package/build/prompts/test-recommendation/scopeAssessment.js +3 -1
  10. package/build/prompts/test-recommendation/scopeAssessment.test.js +13 -0
  11. package/build/prompts/test-recommendation/test-recommendation-prompt.js +2 -2
  12. package/build/prompts/test-recommendation/test-recommendation-prompt.test.js +3 -3
  13. package/build/prompts/testbot/testbot-prompts.js +21 -17
  14. package/build/prompts/testbot/testbot-prompts.test.js +21 -17
  15. package/build/services/TestDiscoveryService.js +11 -43
  16. package/build/tools/submitReportTool.js +9 -12
  17. package/build/tools/submitReportTool.test.js +4 -5
  18. package/build/tools/test-management/actionsTool.js +160 -240
  19. package/build/tools/test-management/analyzeChangesTool.js +43 -18
  20. package/build/tools/test-management/analyzeTestHealthTool.js +17 -29
  21. package/build/utils/docker.test.js +1 -1
  22. package/build/utils/versions.js +1 -1
  23. package/node_modules/playwright/lib/mcp/skyramp/common/visualSnapshot.js +95 -0
  24. package/node_modules/playwright/lib/mcp/skyramp/loadTraceTool.js +2 -0
  25. package/node_modules/playwright/lib/mcp/skyramp/traceRecordingBackend.js +150 -2
  26. package/node_modules/playwright/lib/mcp/skyramp/visualSnapshotTool.js +63 -0
  27. package/node_modules/playwright/lib/mcp/test/skyRampExport.js +36 -0
  28. package/package.json +2 -2
  29. package/build/prompts/test-maintenance/drift-analysis-prompt.test.js +0 -116
@@ -21,7 +21,7 @@ function formatTestLocations(locs) {
21
21
  "|--------------|---------------------------------------------------------|\n" +
22
22
  rows + "\n\n" +
23
23
  "**Deduplication rule (apply this table before generating anything):**\n" +
24
- "- `[external]` tests: if a resource is covered by an `[external]` test, do NOT create a new test for the same HTTP method + resource + test type (e.g. an external integration test covering `POST /orders` blocks any new `POST` integration test on the `orders` resource). Do NOT attempt to UPDATE, REGENERATE, or DELETE external tests — they are user-maintained.\n" +
24
+ "- `[external]` tests: if a resource is covered by an `[external]` test, do NOT create a new parallel test for the same HTTP method + resource + test type. These tests still break when the API changes — Task 1 maintenance applies to them the same as Skyramp tests (in-place UPDATE only; do not regenerate or delete).\n" +
25
25
  "- `[skyramp]` contract test: if the HTTP method + path already appears in a `[skyramp]` `covers:` entry of type `contract` → UPDATE that file, do NOT create a new one.\n" +
26
26
  "- `[skyramp]` integration test: if the primary (last mutating) step's method + path already appears in a `[skyramp]` `covers:` entry of type `integration` → UPDATE, do NOT create a new one.\n" +
27
27
  "- UI/E2E test: always create a new file — traces are distinct recordings.\n" +
@@ -477,7 +477,7 @@ ${interactionSection}
477
477
 
478
478
  **Two categories of test files (identified by tag):**
479
479
  - \`[skyramp]\` — generated by Skyramp tools. You may UPDATE these when the covered endpoint changes.
480
- - \`[external]\` — user-written tests (pytest, jest, junit, etc.) maintained outside Skyramp. Treat as read-only: use them to determine existing coverage but NEVER update, regenerate, or delete them.
480
+ - \`[external]\` — user-written tests (pytest, jest, junit, etc.). Do not generate a new parallel test file for an endpoint already covered by an external test. These tests still break when the API changes — Task 1 maintenance applies to them the same as to Skyramp tests (in-place UPDATE only; do not regenerate or delete).
481
481
 
482
482
  - Frameworks: ${analysis.existingTests.frameworks.join(", ") || "none"}
483
483
  ${formatTestLocations(analysis.existingTests.testLocations)}
@@ -993,7 +993,7 @@ describe("buildRecommendationPrompt — Mandatory Reasoning Protocol", () => {
993
993
  expect(protocol).toContain("requestBody");
994
994
  expect(protocol).toContain("endpointURL");
995
995
  expect(protocol).toContain("authHeader");
996
- expect(protocol).toContain("FK path params");
996
+ expect(protocol).toContain("Foreign-key path params");
997
997
  });
998
998
  it("reasoning protocol instructs to read source file when value cannot be sourced", () => {
999
999
  const protocol = buildReasoningProtocol();
@@ -1515,8 +1515,8 @@ describe("buildRecommendationPrompt — testFingerprint", () => {
1515
1515
  });
1516
1516
  const prompt = buildRecommendationPrompt(analysis);
1517
1517
  expect(prompt).toContain("[external]");
1518
- expect(prompt).toContain("do NOT create a new test");
1519
- expect(prompt).toContain("Do NOT attempt to UPDATE, REGENERATE, or DELETE external tests");
1518
+ expect(prompt).toContain("do NOT create a new parallel test");
1519
+ expect(prompt).toContain("Task 1 maintenance applies to them the same as Skyramp tests");
1520
1520
  });
1521
1521
  });
1522
1522
  // ---------------------------------------------------------------------------
@@ -2,7 +2,6 @@ import { z } from "zod";
2
2
  import { logger } from "../../utils/logger.js";
3
3
  import { AnalyticsService } from "../../services/AnalyticsService.js";
4
4
  import { MAX_TESTS_TO_GENERATE, MAX_RECOMMENDATIONS, MAX_CRITICAL_TESTS, PATH_PARAM_UUID_GUIDANCE, AUTH_CONFLICT_ERROR_MSG, } from "../test-recommendation/recommendationSections.js";
5
- import { buildDriftAnalysisPrompt } from "../test-maintenance/drift-analysis-prompt.js";
6
5
  import { getTraceRecordingPromptText } from "../../playwright/traceRecordingPrompt.js";
7
6
  import { isContractConsumerModeEnabled } from "../../utils/featureFlags.js";
8
7
  import { resolveServiceDetailsRef } from "../../utils/utils.js";
@@ -21,6 +20,9 @@ const CONTRACT_MODE_GUIDANCE = CONSUMER_MODE_ENABLED
21
20
  export function getTestbotPrompt(prTitle, prDescription, summaryOutputFile, repositoryPath, baseBranch, maxRecommendations = MAX_RECOMMENDATIONS, maxGenerate = MAX_TESTS_TO_GENERATE, _maxCritical = MAX_CRITICAL_TESTS, // Reserved — accepted for API compat but not yet wired into prompt
22
21
  prNumber, userPrompt, services, stateOutputFile, uiCredentials, testsRepoDir) {
23
22
  maxGenerate = Math.min(Math.max(maxGenerate, 0), maxRecommendations);
23
+ // TODO(SKYR-3636 follow-up): migrate Task 1 + Task 2 step bodies to PromptPlan
24
+ // (src/prompts/test-recommendation/promptPlan.ts) so step numbers don't have
25
+ // to be hand-maintained when steps are added or reordered.
24
26
  // For follow-up requests: emit the @skyramp-testbot header + guardrails + retrieve-recommendations step.
25
27
  // For first-run prompts: emit the full Task 1 analysis + maintenance section.
26
28
  const task1Section = userPrompt
@@ -49,26 +51,27 @@ Use those recommendations as your baseline. Only add or remove tests that the us
49
51
  : `
50
52
  **Incremental mode:** Task 1 handles maintenance of existing tests. Task 2 handles new test generation from the GENERATE list. The two tasks are independent — maintenance completions never reduce the generate budget. Only generate tests for NEW endpoints not already covered by existing bot tests.
51
53
 
52
- <!-- TODO(SKYR-3636 follow-up): migrate Task 1 + Task 2 step bodies to PromptPlan
53
- (src/prompts/test-recommendation/promptPlan.ts) so step numbers don't have
54
- to be hand-maintained when steps are added or reordered. -->
55
54
  ## Task 1: Analyze & Maintain
56
55
 
57
56
  1. **Pre-flight UI enumeration.** Call \`skyramp_ui_analyze_changes\` with \`repositoryPath\`: "${repositoryPath}"${uiCredentials ? `, \`uiCredentials\`: <use the value from <ui-credentials> in your context>` : ""}. The response returns \`uiContext\` (\`changedFrontendFiles\`, \`candidateUiPages\`) and capture instructions.
58
57
 
59
- **If the response says "No UI changes detected"** → skip ahead to step 2.
58
+ **If the response says "No UI changes detected"** → skip ahead to \`skyramp_analyze_changes\`.
60
59
 
61
- **Otherwise:** for each candidate URL in the response${uiCredentials ? " (after logging in via the credentials provided)" : ""}, \`browser_navigate\` to the URL, then \`browser_blueprint\` to capture. The captures stay in your tool-result history — they're the element vocabulary you'll use when writing UI rec \`reasoning\` fields in step 2. You do NOT need to thread them back into a tool call.
60
+ **Otherwise:** for each candidate URL in the response${uiCredentials ? " (after logging in via the credentials provided)" : ""}, \`browser_navigate\` to the URL, then \`browser_blueprint\` to capture. The captures stay in your tool-result history — they're the element vocabulary you'll use when writing UI recommendation \`reasoning\` fields in \`skyramp_analyze_changes\`. You do NOT need to thread them back into a tool call.
62
61
 
63
- If a candidate URL 404s or redirects, navigate from the workspace baseUrl and explore. If \`browser_blueprint\` fails on every candidate, proceed to step 2 and log an \`issuesFound\` info entry — UI recommendations will fall back to source-grounded prose.
62
+ If a candidate URL 404s or redirects, navigate from the workspace baseUrl and explore. If \`browser_blueprint\` fails on every candidate, proceed to \`skyramp_analyze_changes\` and log an \`issuesFound\` info entry — UI recommendations will fall back to source-grounded prose.
64
63
 
65
- 2. Call \`skyramp_analyze_changes\` with \`repositoryPath\`: "${repositoryPath}", \`scope\`: "branch_diff", \`topN\`: ${maxRecommendations}, \`maxGenerate\`: ${maxGenerate}${baseBranch ? `, \`baseBranch\`: "${baseBranch}"` : ""}${prNumber ? `, \`prNumber\`: ${prNumber}` : ""}${stateOutputFile ? `, \`stateOutputFile\`: "${stateOutputFile}"` : ""}${testsRepoDir ? `, \`testsRepoDir\`: "${testsRepoDir}"` : ""} — discovers existing Skyramp tests, scans endpoints changed in the diff, loads workspace config, and returns ${maxRecommendations} ranked ADD recommendations${prNumber ? " (using PR comment history to avoid re-recommending already-generated tests)" : ""} along with the UI recommendation authoring rules. Use the blueprints already in your context (from step 1) to ground UI rec reasoning.
64
+ 2. Call \`skyramp_analyze_changes\` with \`repositoryPath\`: "${repositoryPath}", \`scope\`: "branch_diff", \`topN\`: ${maxRecommendations}, \`maxGenerate\`: ${maxGenerate}${baseBranch ? `, \`baseBranch\`: "${baseBranch}"` : ""}${prNumber ? `, \`prNumber\`: ${prNumber}` : ""}${stateOutputFile ? `, \`stateOutputFile\`: "${stateOutputFile}"` : ""}${testsRepoDir ? `, \`testsRepoDir\`: "${testsRepoDir}"` : ""}. Use the blueprints already in your context (from \`skyramp_ui_analyze_changes\`) to ground UI recommendation reasoning.
66
65
  **If \`skyramp_analyze_changes\` returns an error:** retry once only if the error is transient (timeout, network blip, temporary unavailability) — do NOT retry for permanent errors (invalid repository path, missing required parameter, authentication failure). If it fails again, call \`skyramp_submit_report\` with a minimal valid payload: leave all test arrays empty and add the error to \`issuesFound\`. Refer to the \`skyramp_submit_report\` schema for required fields. Do NOT attempt Task 2 without a valid stateFile.
67
- **If all changed files are non-application** (CI/CD, docs, lock files, config) skip to Task 3 (Submit Report) with empty arrays and a single \`issuesFound\` entry explaining why (same format as the zero-test path below).
66
+ **If all changed files are non-application** (CI/CD, docs, lock files, config, or cosmetic frontend changes such as \`.css\`/\`.scss\` reformats with no observable rendering or interaction change) skip to Task 3 (Submit Report) and follow the zero-test abstention path there.
68
67
 
69
- 3. **Maintain existing tests** using the rules in \`<drift_analysis_rules>\` below. For each existing test reported by \`skyramp_analyze_changes\`, score it and choose the action exactly as directed by the Action Decision Matrix in \`<drift_analysis_rules>\`. Only read test files that require action per that matrix — do NOT read files that will be IGNORED. **Do NOT read source files (routers, models, CRUD, components) — all the information you need is in the \`skyramp_analyze_changes\` output and the diff.** When reading multiple test files, **read them all in a single parallel batch** — do NOT read them one at a time. Apply actions directly. Results go in \`testMaintenance\`.
68
+ 3. **Maintain existing tests:**
70
69
 
71
- ${buildDriftAnalysisPrompt({ existingTests: [], scannedEndpoints: [], repositoryPath })}
70
+ a. Call \`skyramp_analyze_test_health\` with \`stateFile\` (from \`skyramp_analyze_changes\` output). **Do NOT read application source files** (routes, models, controllers) — all change information you need is in the \`skyramp_analyze_changes\` output and the diff.
71
+
72
+ b. Write \`updateInstructions\` for each UPDATE or REGENERATE test before calling \`skyramp_actions\` — articulating the change first prevents file content from overriding diff-based reasoning.
73
+
74
+ c. Call \`skyramp_actions\` with \`stateFile\` (from \`skyramp_analyze_changes\` output) and apply the edits it returns.
72
75
 
73
76
  4. **Code review:** From the \`skyramp_analyze_changes\` output and the existing test files you read for maintenance, note any logic bugs. Do NOT read additional source files just for code review — use what is already available from the analysis and test file reads. Common patterns to flag:
74
77
  - Computed fields not recalculated after mutation (e.g. \`total_amount\` unchanged after items are added/removed)
@@ -78,7 +81,7 @@ ${buildDriftAnalysisPrompt({ existingTests: [], scannedEndpoints: [], repository
78
81
  - Incorrect arithmetic in business logic (discount calculations, price aggregation)
79
82
  Log each finding in \`issuesFound\` with a \`severity\` (critical/high/medium/low). These bugs should inform your test design in Task 2.
80
83
 
81
- 5. **Blueprint Citation Invariant** (UI test recommendations only). When step 2 returned recommendations grounded in the captured blueprints from step 1, every named UI element your recommendation \`reasoning\` mentions — heading text, button label, link text, role descriptions — must correspond to an element actually present in one of those captured blueprints.
84
+ 5. **Blueprint Citation Invariant** (UI test recommendations only). When \`skyramp_analyze_changes\` returned recommendations grounded in the captured blueprints from \`skyramp_ui_analyze_changes\`, every named UI element your recommendation \`reasoning\` mentions — heading text, button label, link text, role descriptions — must correspond to an element actually present in one of those captured blueprints.
82
85
 
83
86
  Write the \`reasoning\` field in **natural prose** that names the elements as a human would describe them ("the Notifications heading", "the disabled Mark all as read button"). Do NOT use internal-identifier syntax like \`role=button, logicalName=...\` — that jargon leaks builder internals into a user-facing report.
84
87
 
@@ -86,7 +89,7 @@ ${buildDriftAnalysisPrompt({ existingTests: [], scannedEndpoints: [], repository
86
89
 
87
90
  **Non-UI entries (contract / integration / e2e / batch-scenario) are unaffected.** Their \`reasoning\` fields use the pre-existing formats — endpoint paths, request/response schemas, fixture chains. Do not reformat them.
88
91
 
89
- **No upstream captures available?** If step 1 produced no candidate URLs or \`browser_blueprint\` failed on every candidate, all UI recommendations fall back to source-grounded prose drawn from the diff alone. Log the failure mode once in \`issuesFound\`. Non-UI work is unaffected.
92
+ **No upstream captures available?** If \`skyramp_ui_analyze_changes\` produced no candidate URLs or \`browser_blueprint\` failed on every candidate, all UI recommendations fall back to source-grounded prose drawn from the diff alone. Log the failure mode once in \`issuesFound\`. Non-UI work is unaffected.
90
93
 
91
94
  ---`;
92
95
  const serviceContext = services?.length ? buildServiceContext(services) : '';
@@ -119,7 +122,7 @@ ${userPrompt ? "Generate only the tests that the user requested from the Additio
119
122
  - **MANDATORY — use the pre-ranked GENERATE list as-is**: The Execution Plan's GENERATE section governs ADD actions. You MUST generate exactly those scenarios in the exact order listed. Do NOT substitute, rename, or replace a GENERATE item. If parameter grounding uncovers a distinct bug-catching scenario not already in the GENERATE or ADDITIONAL list, generate it after all planned GENERATE items are complete and report it in \`newTestsCreated\` — this is an additional test driven by source-code analysis and does not count against the GENERATE budget.
120
123
  - Scenario JSON files are always new files — always generate them for new methods. Every generated scenario JSON must have a corresponding new integration test generated from it via \`skyramp_integration_test_generation\`.
121
124
  - Covered-resource handling (aligns with Execution Plan Step 0): When a GENERATE item targets a resource that already has an existing test file covering the same endpoint:
122
- - If the existing test source is \`[external]\`, skip the resource entirely — the external test already provides coverage. Do NOT UPDATE, REGENERATE, or DELETE external tests.
125
+ - If the existing test source is \`[external]\`, skip the resource entirely — the external test already provides coverage. Do NOT generate a new parallel test file for it.
123
126
  - If the existing test is tagged \`[skyramp]\`, apply type-specific rules:
124
127
  - Contract tests: UPDATE the existing Skyramp test file (add the new method's test cases). A new test case is a new test even if the file already exists — report in \`newTestsCreated\` and count toward the budget.
125
128
  - Integration/scenario tests: Always generate as a new file via the scenario pipeline (\`skyramp_batch_scenario_test_generation\` → \`skyramp_integration_test_generation\`), even if an existing integration test covers the same resource. A new multi-step scenario (e.g. create → PATCH → verify recalculation) is a distinct test file. Report in \`newTestsCreated\` and count toward the budget.
@@ -128,6 +131,7 @@ ${userPrompt ? "Generate only the tests that the user requested from the Additio
128
131
  - Example: If enrichment reveals that sending \`discount_value\` without \`discount_type\` silently orphans the value (a concrete bug), complete all planned GENERATE items first, then generate this discovered scenario as an extra test and report it in \`newTestsCreated\`.
129
132
  - Total generated: Follow the "Budget: N generate" line in the Execution Plan. Process every GENERATE-tagged item in order. Backfill from ADDITIONAL candidates (highest-ranked first) until \`newTestsCreated\` reaches ${maxGenerate} or all candidates are exhausted.
130
133
  - **UI test priority**: If the PR scope assessment shows any UI/E2E budget OR \`uiContext.changedFrontendFiles\` is non-empty (the deterministic server signal — populated for all supported frontend file types including \`.tsx\`/\`.jsx\`/\`.vue\`/\`.svelte\`/\`.dart\`), you MUST attempt to generate at least one UI test. Use \`browser_navigate\` to the app's base URL — if the app responds, record a trace and generate the test.
134
+ **Flutter web apps:** Skyramp's Playwright tools automatically enable Flutter's accessibility semantics tree on every \`browser_navigate\` call — you do NOT need to manually click \`flt-semantics-placeholder\` or add any activation step to the trace. Do NOT log an \`issuesFound\` entry about Flutter canvas rendering or accessibility activation — this is handled transparently.
131
135
  **Skip only if one of these conditions is met:**
132
136
  - **(a) App is unreachable** — \`browser_navigate\` fails or connection is refused.
133
137
  - **(b) Unintegrated non-route component** — the changed file is a leaf component (not a framework route/entrypoint) that has no integration point in the running app. To confirm:
@@ -135,6 +139,7 @@ ${userPrompt ? "Generate only the tests that the user requested from the Additio
135
139
  2. If no production file imports, re-exports, or renders it, the component has no DOM node in the running app → unintegrated.
136
140
  3. **Exception**: if the same PR also adds a route/page file (e.g. under Next.js \`pages/\` or \`app/\`) that imports the component, the route IS the integration point — test through it.
137
141
  **Never** apply the unintegrated heuristic to framework route/entrypoint files themselves — those are always reachable by convention.
142
+ - **(c) Cosmetic-only frontend change** — the frontend files in the diff are purely cosmetic with no observable rendering or interaction change (e.g. a \`.css\`/\`.scss\` reformat: property reordering, comment/whitespace edits, \`0px\`→\`0\`). \`uiContext.changedFrontendFiles\` being non-empty does NOT override this — that signal only means a frontend file changed, not that behavior changed. Take the zero-test abstention path in Task 3; do NOT record a trace just to satisfy the mandate.
138
143
  **Never** generate tests for unrelated pages as a substitute for an unintegrated component.
139
144
  This rule takes priority over generating additional backend-only tests.
140
145
  - **Always generate a test for critical bugs, even if it will fail.** When a GENERATE-tagged item targets a page or endpoint with a known bug, do NOT skip it because you expect the test to fail — a failing test that documents a bug is more valuable than a text-only description. This applies within the existing GENERATE budget; do not add extra tests beyond the plan.
@@ -233,9 +238,9 @@ ${CONTRACT_MODE_GUIDANCE}
233
238
 
234
239
  **Capture-act-capture (applies only when recording a UI trace):**
235
240
 
236
- **Skip this entire section if \`uiContext\` was absent or \`changedFrontendFiles\` was empty in step 1's response** (backend-only PR). The capture-act-capture pattern is for UI trace recording only — there's no UI trace to record on a backend-only PR. Continue to the non-UI test-type instructions below.
241
+ **Skip this entire section if \`uiContext\` was absent or \`changedFrontendFiles\` was empty in the \`skyramp_ui_analyze_changes\` response** (backend-only PR). The capture-act-capture pattern is for UI trace recording only — there's no UI trace to record on a backend-only PR. Continue to the non-UI test-type instructions below.
237
242
 
238
- **Reminder — the UI test priority rule above still applies.** If the diff contains frontend/UI changes, you still MUST attempt to generate at least one UI test. Capture-act-capture is **how** you record that test, not **whether** you record one — do not substitute UI recommendations for actually recording a trace. UI rec reasoning was already grounded in the upstream blueprints from Task 1 step 1; Task 2's capture-act-capture is for the trace's own assertions, not for retroactively rewriting recommendation reasoning.
243
+ **Reminder — the UI test priority rule above still applies.** If the diff contains frontend/UI changes, you still MUST attempt to generate at least one UI test. Capture-act-capture is **how** you record that test, not **whether** you record one — do not substitute UI recommendations for actually recording a trace. UI recommendation reasoning was already grounded in the upstream blueprints from \`skyramp_ui_analyze_changes\`; Task 2's capture-act-capture is for the trace's own assertions, not for retroactively rewriting recommendation reasoning.
239
244
 
240
245
  This pattern produces delta-derived assertions from blueprint diffs. Diff-derived assertions catch state changes more reliably than author-inference — the diff tells you what actually changed on the page so the assertion is grounded in observable state, not in guessing what "success" looks like.
241
246
 
@@ -331,7 +336,6 @@ Call \`skyramp_submit_report\` with \`summaryOutputFile\`: "${summaryOutputFile}
331
336
  - **additionalRecommendations**: AT MOST ${maxRecommendations - maxGenerate} items.
332
337
  - For \`testType: "contract"\` entries: **\`primaryEndpoint\` is required** (e.g. \`"GET /api/v1/users/{user_id}"\`). The tool will reject the submission without it — do not omit it or you will be forced to resubmit.
333
338
  - For \`testType: "integration"\` or \`"e2e"\` entries: omit \`primaryEndpoint\` — use \`description\` to list the endpoints involved instead.
334
- - **testMaintenance**: Use \`[]\` **only** if no existing Skyramp tests were found in the repository. If existing tests were found (any score), include one entry per test. Set \`action\` to the exact drift action you chose from the Action Decision Matrix (\`UPDATE\`, \`REGENERATE\`, \`DELETE\`, \`VERIFY\`, or \`IGNORE\`). For UPDATE/REGENERATE/DELETE tests that were modified and executed, populate all fields from real before/after execution results. For VERIFY/IGNORE tests (not modified), derive \`beforeStatus\` from the \`skyramp_analyze_test_health\` health score (typically \`"Pass"\` if drift score is 0 and no health issues were flagged), set \`afterStatus\` to \`"Skipped"\`, and use \`afterDetails\` to explain why (e.g. "IGNORE: drift score 0 — endpoint not modified in this PR"). Do **not** add entries for tests that were not returned by the health analysis.
335
339
 
336
340
  ---
337
341
 
@@ -202,35 +202,39 @@ describe("uiCredentials in getTestbotPrompt", () => {
202
202
  .toThrow("</ui-credentials>");
203
203
  });
204
204
  });
205
- describe("drift analysis inline embedding", () => {
206
- beforeAll(() => { process.env.SKYRAMP_FEATURE_TESTBOT = "1"; });
207
- afterAll(() => { delete process.env.SKYRAMP_FEATURE_TESTBOT; });
205
+ describe("drift analysis runtime tool call (step 3)", () => {
206
+ // The build-time embed of buildDriftAnalysisPrompt was replaced with a
207
+ // runtime instruction: LLM calls skyramp_analyze_test_health then skyramp_actions.
208
208
  function basePrompt() {
209
209
  return getTestbotPrompt(baseArgs.prTitle, baseArgs.prDescription, baseArgs.summaryOutputFile, baseArgs.repositoryPath);
210
210
  }
211
- it("wraps inline drift rules in XML tags", () => {
211
+ it("step 3 instructs the LLM to call skyramp_analyze_test_health", () => {
212
212
  const prompt = basePrompt();
213
- expect(prompt).toContain("<drift_analysis_rules>");
214
- expect(prompt).toContain("</drift_analysis_rules>");
213
+ expect(prompt).toContain("skyramp_analyze_test_health");
215
214
  });
216
- it("does not include a persona statement inside the inline XML block", () => {
215
+ it("step 3 instructs the LLM to call skyramp_actions", () => {
217
216
  const prompt = basePrompt();
218
- const start = prompt.indexOf("<drift_analysis_rules>");
219
- const end = prompt.indexOf("</drift_analysis_rules>");
220
- const block = prompt.slice(start, end);
221
- expect(block).not.toContain("You are acting as a Skyramp Integration Architect");
217
+ expect(prompt).toContain("skyramp_actions");
222
218
  });
223
- it("drift_analysis_rules block appears inside Task 1, before Task 2", () => {
219
+ it("step 3 appears inside Task 1, before Task 2", () => {
224
220
  const prompt = basePrompt();
225
221
  const task1Pos = prompt.indexOf("## Task 1");
226
- const rulesPos = prompt.indexOf("<drift_analysis_rules>");
222
+ const healthPos = prompt.indexOf("skyramp_analyze_test_health");
227
223
  const task2Pos = prompt.indexOf("## Task 2");
228
- expect(rulesPos).toBeGreaterThan(task1Pos);
229
- expect(rulesPos).toBeLessThan(task2Pos);
224
+ expect(healthPos).toBeGreaterThan(task1Pos);
225
+ expect(healthPos).toBeLessThan(task2Pos);
230
226
  });
231
- it("Task 1 step 3 prose references drift_analysis_rules tag", () => {
227
+ it("does not contain the build-time embedded drift_analysis_rules content (Action Decision Tree)", () => {
228
+ // The rules are now fetched at runtime via skyramp_analyze_test_health —
229
+ // the <drift_analysis_rules> tag may appear as a reference in prose,
230
+ // but the actual rule content (Action Decision Tree) must not be baked in.
232
231
  const prompt = basePrompt();
233
- expect(prompt).toContain("rules in `<drift_analysis_rules>`");
232
+ expect(prompt).not.toContain("Action Decision Tree\n\nFor each existing test");
233
+ expect(prompt).not.toContain("Update Execution Rules\n\nWhen applying UPDATE actions");
234
+ });
235
+ it("does not contain a persona statement (no nested identity from old embed)", () => {
236
+ const prompt = basePrompt();
237
+ expect(prompt).not.toContain("You are acting as a Skyramp Integration Architect");
234
238
  });
235
239
  });
236
240
  describe("UI grounding via Task 2 capture-act-capture", () => {
@@ -1,6 +1,5 @@
1
1
  import * as fs from "fs";
2
2
  import * as path from "path";
3
- import { simpleGit } from "simple-git";
4
3
  import { logger } from "../utils/logger.js";
5
4
  import { TestSource } from "../types/TestAnalysis.js";
6
5
  import fg from "fast-glob";
@@ -39,7 +38,7 @@ export class TestDiscoveryService {
39
38
  /^test_.*\.(py|js|ts|rb|go|php)$/, // test_*.py, test_*.rb, test_*.go
40
39
  /.*_test\.(py|ts|js|go|rs)$/, // *_test.py, *_test.go, *_test.rs
41
40
  /.*\.test\.(ts|js|tsx|jsx)$/, // *.test.ts, *.test.js, *.test.tsx
42
- /.*\.spec\.(ts|js|tsx|jsx|rb)$/, // *.spec.ts, *.spec.js, *.spec.rb
41
+ /.*\..*spec\.(ts|js|tsx|jsx|rb)$/, // *.spec.ts, *.e2e-spec.ts, *.unit-spec.ts
43
42
  /.*Test\.(java|kt|kts|cs|scala|swift|m)$/, // *Test.java, *Test.kt, *Test.m (ObjC)
44
43
  /.*Tests\.(cs|swift|m)$/, // *Tests.cs, *Tests.swift, *Tests.m (ObjC)
45
44
  /.*_spec\.rb$/, // *_spec.rb (RSpec)
@@ -54,11 +53,8 @@ export class TestDiscoveryService {
54
53
  /[\\/]__tests__[\\/]/,
55
54
  /[\\/]spec[\\/]/,
56
55
  ];
57
- // Cache git client and repo status per repository
58
- gitClientCache = new Map();
59
- isGitRepoCache = new Map();
60
56
  /**
61
- * Discover all tests in a repository — both Skyramp-generated and external (user-written).
57
+ * Discover all tests under testDir — both Skyramp-generated and external (user-written).
62
58
  * Uses fast-glob for cross-platform file scanning, then classifies discovered files
63
59
  * as Skyramp-generated tests, external tests, or not-a-test during processing.
64
60
  *
@@ -68,19 +64,15 @@ export class TestDiscoveryService {
68
64
  * rather than flooding context with irrelevant files.
69
65
  * - `undefined` (full-repo mode, no diff): cap at MAX_EXTERNAL_FULL_REPO.
70
66
  */
71
- async discoverTests(repositoryPath, options = {}) {
72
- logger.info(`Starting test discovery in: ${repositoryPath}`);
73
- if (!fs.existsSync(repositoryPath)) {
74
- throw new Error(`Repository path does not exist: ${repositoryPath}`);
75
- }
76
- const stats = fs.statSync(repositoryPath);
77
- if (!stats.isDirectory()) {
78
- throw new Error(`Path is not a directory: ${repositoryPath}`);
79
- }
80
- // Initialize git client cache for this repository
81
- await this.initializeGitClient(repositoryPath);
82
- // File classification: skyramp vs external vs not-a-test (carries content forward)
83
- const classified = this.classifyTestFiles(repositoryPath);
67
+ async discoverTests(testDir, options = {}) {
68
+ logger.info(`Starting test discovery in: ${testDir}`);
69
+ const stats = fs.statSync(testDir, { throwIfNoEntry: false });
70
+ if (!stats)
71
+ throw new Error(`Test directory does not exist: ${testDir}`);
72
+ if (!stats.isDirectory())
73
+ throw new Error(`Path is not a directory: ${testDir}`);
74
+ // File classification: skyramp vs external vs not-a-test (carries content forward).
75
+ const classified = this.classifyTestFiles(testDir);
84
76
  logger.info(`Found ${classified.skyramp.length} Skyramp test files, ${classified.external.length} external test files`);
85
77
  // Process Skyramp tests (content already cached from classification)
86
78
  const skyrampTests = await this.processFilesInBatches(classified.skyramp, false, classified.contentCache);
@@ -139,9 +131,6 @@ export class TestDiscoveryService {
139
131
  }));
140
132
  const externalTests = [...relevantExternalTests, ...otherExternalTests];
141
133
  logger.info(`Discovered ${skyrampTests.length} Skyramp tests, ${externalTests.length} external tests`);
142
- // Clean up caches to free memory
143
- this.gitClientCache.clear();
144
- this.isGitRepoCache.clear();
145
134
  return {
146
135
  tests: [...skyrampTests, ...externalTests],
147
136
  // Expose the relevant file paths so callers can build read instructions for the LLM.
@@ -186,27 +175,6 @@ export class TestDiscoveryService {
186
175
  }
187
176
  return { relevant, other };
188
177
  }
189
- /**
190
- * Initialize git client and check if repository is a git repo
191
- */
192
- async initializeGitClient(repositoryPath) {
193
- try {
194
- const git = simpleGit(repositoryPath);
195
- this.gitClientCache.set(repositoryPath, git);
196
- const isRepo = await git.checkIsRepo();
197
- this.isGitRepoCache.set(repositoryPath, isRepo);
198
- if (isRepo) {
199
- logger.debug(`Git repository detected at: ${repositoryPath}`);
200
- }
201
- else {
202
- logger.debug(`Not a git repository: ${repositoryPath}`);
203
- }
204
- }
205
- catch (error) {
206
- logger.debug(`Could not initialize git client: ${error.message}`);
207
- this.isGitRepoCache.set(repositoryPath, false);
208
- }
209
- }
210
178
  /**
211
179
  * Process test files in parallel batches with concurrency control
212
180
  * @param isExternal When true, uses external test metadata extraction
@@ -82,13 +82,15 @@ const testMaintenanceSchema = z.object({
82
82
  testType: z.nativeEnum(TestType).describe("Type of test."),
83
83
  endpoint: z.string().describe("HTTP verb and path, e.g. 'GET /api/v1/products'"),
84
84
  fileName: z.string().describe("Test file that was maintained, e.g. 'products_smoke_test.py'"),
85
- action: z.nativeEnum(DriftAction).optional().describe("The drift action taken for this test, exactly as decided by the Action Decision Matrix: UPDATE, REGENERATE, or DELETE modify the test; VERIFY or IGNORE leave it unchanged (no-op)."),
85
+ action: z.nativeEnum(DriftAction).describe("The drift action assigned to this test during maintenance triage."),
86
86
  description: z.string().describe("What was changed and why"),
87
87
  beforeStatus: z.enum(["Pass", "Fail", "Error"]).describe("Test result BEFORE modification"),
88
88
  beforeDetails: z.string().describe("Execution output/timing before modification, or 'baseline from CI workflow <name>' if a parallel workflow provided the baseline"),
89
89
  afterStatus: z.enum(["Pass", "Fail", "Error", "Skipped"]).describe("Test result AFTER modification"),
90
90
  afterDetails: z.string().describe("Execution output/timing after modification"),
91
- });
91
+ })
92
+ .refine(m => ![DriftAction.Verify, DriftAction.Ignore].includes(m.action) || m.afterStatus === "Skipped", { message: "VERIFY and IGNORE entries must have afterStatus: 'Skipped' — these actions do not modify the test file" })
93
+ .refine(m => ![DriftAction.Update, DriftAction.Regenerate, DriftAction.Delete].includes(m.action) || m.afterStatus !== "Skipped", { message: "UPDATE, REGENERATE, and DELETE entries must have a real afterStatus (not Skipped) — these actions modify the test file and must be executed" });
92
94
  /**
93
95
  * Derive per-run analytics counts from a submitted report. These power the
94
96
  * alpha-launch dashboards (tests generated/maintained, suite growth, bugs vs
@@ -103,21 +105,15 @@ const testMaintenanceSchema = z.object({
103
105
  * Fail/Error before maintenance and Pass afterward.
104
106
  *
105
107
  * testsMaintained counts only entries that actually changed a test file
106
- * (action UPDATE/REGENERATE/DELETE). VERIFY/IGNORE entries are reported for
107
- * transparency but are no-ops, so they are excluded. When `action` is absent
108
- * (older reports), we fall back to the status heuristic: an IGNORE no-op sets
109
- * afterStatus to "Skipped", so anything else is treated as a real change.
108
+ * (action UPDATE/REGENERATE/DELETE). VERIFY/IGNORE are no-ops and excluded.
110
109
  */
111
110
  function isMaintenanceChange(m) {
112
- if (m.action) {
113
- return MAINTENANCE_CHANGE_ACTIONS.has(m.action);
114
- }
115
- return m.afterStatus !== "Skipped";
111
+ return MAINTENANCE_CHANGE_ACTIONS.has(m.action);
116
112
  }
117
113
  function computeReportMetrics(params) {
118
114
  const recommendations = params.additionalRecommendations ?? [];
119
115
  const countBy = (items, pred) => items.filter(pred).length;
120
- const changedMaintenance = params.testMaintenance.filter(isMaintenanceChange);
116
+ const changedMaintenance = (params.testMaintenance ?? []).filter(isMaintenanceChange);
121
117
  const maintenanceRecovered = countBy(changedMaintenance, (m) => m.beforeStatus !== "Pass" && m.afterStatus === "Pass");
122
118
  return {
123
119
  testsGenerated: String(params.newTestsCreated.length),
@@ -155,7 +151,8 @@ export function registerSubmitReportTool(server) {
155
151
  .describe("Recommended tests that were not generated (lower priority). Only include recommendations that add distinct coverage beyond generated tests — do not pad with variants testing the same endpoint and flow."),
156
152
  testMaintenance: z
157
153
  .array(testMaintenanceSchema)
158
- .describe("List of existing test modifications with before/after execution results. Use empty array [] if none."),
154
+ .optional()
155
+ .describe("One entry per test assessed in the drift analysis step. Omit this field when no existing tests were found."),
159
156
  testResults: z
160
157
  .array(testResultSchema)
161
158
  .describe("List of ALL test execution results. One entry per test executed."),
@@ -1,6 +1,7 @@
1
1
  // @ts-ignore
2
2
  import { registerSubmitReportTool, additionalRecommendationSchema } from "./submitReportTool.js";
3
3
  import { TestType } from "../types/TestTypes.js";
4
+ import { DriftAction } from "../types/TestAnalysis.js";
4
5
  import { AnalyticsService } from "../services/AnalyticsService.js";
5
6
  import * as fs from "fs/promises";
6
7
  import * as path from "path";
@@ -379,17 +380,15 @@ describe("registerSubmitReportTool", () => {
379
380
  // Both changes went Fail→Pass
380
381
  expect(params.maintenanceRecovered).toBe("2");
381
382
  });
382
- it("falls back to afterStatus heuristic when action is absent", async () => {
383
+ it("counts UPDATE/REGENERATE/DELETE as maintenance changes, not VERIFY/IGNORE", async () => {
383
384
  const tmpDir = await fs.mkdtemp(path.join(os.tmpdir(), "submit-report-test-"));
384
385
  tmpDirs.push(tmpDir);
385
386
  const outputFile = path.join(tmpDir, "report.json");
386
387
  await handler({
387
388
  ...sampleReportParams(outputFile),
388
389
  testMaintenance: [
389
- // No action field — afterStatus !== "Skipped" → counts as a change
390
- { testType: TestType.CONTRACT, endpoint: "GET /api/v1/products", fileName: "products_contract_test.py", description: "Patched", beforeStatus: "Fail", beforeDetails: "401", afterStatus: "Pass", afterDetails: "passed" },
391
- // No action field — afterStatus === "Skipped" → treated as no-op
392
- { testType: TestType.SMOKE, endpoint: "GET /api/v1/reviews", fileName: "reviews_smoke_test.py", description: "No action required", beforeStatus: "Pass", beforeDetails: "drift 0", afterStatus: "Skipped", afterDetails: "not in PR" },
390
+ { testType: TestType.CONTRACT, endpoint: "GET /api/v1/products", fileName: "products_contract_test.py", action: DriftAction.Update, description: "Patched", beforeStatus: "Fail", beforeDetails: "401", afterStatus: "Pass", afterDetails: "passed" },
391
+ { testType: TestType.SMOKE, endpoint: "GET /api/v1/reviews", fileName: "reviews_smoke_test.py", action: DriftAction.Ignore, description: "No action required", beforeStatus: "Pass", beforeDetails: "drift 0", afterStatus: "Skipped", afterDetails: "not in PR" },
393
392
  ],
394
393
  });
395
394
  const params = lastAnalyticsParams();