npm - @skyramp/mcp - Versions diffs - 0.1.8 → 0.2.0-rc.2 - Mend

@skyramp/mcp 0.1.8 → 0.2.0-rc.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (122) hide show

package/build/prompts/test-recommendation/recommendationShared.js CHANGED Viewed

@@ -27,6 +27,16 @@ export function externalDedupKey(scenario) {
     const resource = extractResourceFromPath(primaryStep?.path ?? "");
     return `${method}::${resource}::${testType}`;
 }
+export function isAttackSurfaceSecurityBoundary(scenario) { // true when a security_boundary scenario is an attack-surface one
+    return scenario.category === "security_boundary" && // only security_boundary scenarios can qualify
+        (scenario.isAttackSurfaceSecurityBoundary === true || // explicit boolean flag on the scenario, or …
+            scenario.description.startsWith("Attack-surface auth boundary:")); // … the description prefix. NOTE(review): assumes description is a string whenever the flag is not set — confirm
+}
+export function isOrdinaryDirectAuthBoundary(scenario) { // true for a security_boundary scenario that is a plain (non-attack-surface) auth boundary
+    return scenario.category === "security_boundary" &&
+        !isAttackSurfaceSecurityBoundary(scenario) && // attack-surface scenarios are excluded before the prefix check
+        scenario.description.startsWith("Auth boundary:"); // identified purely by this case-sensitive description prefix
+}
 /**
  * Build a set of coverage keys from external (non-Skyramp) tests.
  * Parses `testLocations` entries tagged with `[external]` to extract the
@@ -62,7 +72,19 @@ export function buildExternalCoverageSet(testLocations) {
     }
     if (externalWithoutCoverage > 0) {
         logger.info(`${externalWithoutCoverage} external test file(s) have no extractable endpoint coverage — ` +
-            `programmatic dedup skipped for these; Step 0 semantic check is the fallback.`);
+            `programmatic dedup skipped for these; prompt-level semantic coverage check is the fallback.`);
     }
     return coverage;
 }
+// Shared TestBot task and step labels used by prompt modules that cannot import
+// testbot-prompts.ts directly without creating a circular dependency.
+export const TASK_UI_PRESCAN = "0"; // task ids are strings so they interpolate directly into prompt text
+export const TASK_ANALYZE_MAINTAIN = "1";
+export const TASK_GENERATE = "2";
+export const TASK_SUBMIT = "3";
+export const TESTBOT_TASK1_STEP_ANALYZE = "1"; // step ids; the TASK1 prefix suggests they belong to TASK_ANALYZE_MAINTAIN — confirm against testbot-prompts.ts
+export const TESTBOT_TASK1_STEP_MAINTAIN = "2";
+export const TESTBOT_TASK1_STEP_CODE_REVIEW = "3";
+export const TESTBOT_TASK1_STEP_UI_GROUNDING = "4";
+export const taskRef = (taskId) => `Task ${taskId}`; // e.g. taskRef(TASK_GENERATE) → "Task 2"
+export const taskStepRef = (taskId, stepId) => `Task ${taskId} Step ${stepId}`; // e.g. taskStepRef("1", "3") → "Task 1 Step 3"

package/build/prompts/test-recommendation/scopeAssessment.js CHANGED Viewed

@@ -25,25 +25,76 @@ export function isFrontendFile(filePath) {
         return true;
     return AMBIGUOUS_FRONTEND_PATTERN.test(filePath) && FRONTEND_DIR_PATTERN.test(filePath);
 }
+/**
+ * Returns true if the file path looks like a test file rather than UI source.
+ *
+ * Catches:
+ *   - Skyramp-generated test suffixes with any extension (`*_test.<ext>`, `*_smoke.<ext>`,
+ *     and likewise `_contract`, `_fuzz`, `_integration`, `_load`, `_e2e`, `_ui` — e.g. `orders_test.py`)
+ *   - Skyramp scenario files (`scenario_*.json`)
+ *   - Conventional Vitest/Jest/Playwright spec naming
+ *     (`*.spec.{ts,tsx,js,jsx}`, `*.test.{ts,tsx,js,jsx}`)
+ *   - Files inside `__tests__/` directories (Jest convention)
+ *
+ * Used by callers that want to filter test files OUT of frontend-source
+ * processing — `isFrontendFile` returns true for `.spec.ts` under a
+ * frontend directory because the rule is "tier-3 ambiguous + frontend dir,"
+ * but those tests aren't UI source we'd want to ground recommendations in.
+ */
+export function isTestFile(filePath) {
+    return (/(?:_test|_smoke|_contract|_fuzz|_integration|_load|_e2e|_ui)\.[^/]+$/.test(filePath) ||
+        /scenario_[^/]+\.json$/.test(filePath) ||
+        /\.(spec|test)\.(tsx?|jsx?)$/.test(filePath) ||
+        /(?:^|\/)__tests__\//.test(filePath));
+}
 // ── LLM scope assessment ──────────────────────────────────────────────────────
 /**
- * Builds the PR scope assessment section embedded as the first step in the
- * execution plan prompt.
+ * Builds the PR scope assessment section.
+ *
+ * When `precomputedUIPct` is provided (0 = backend-only, 100 = UI-only) the server
+ * has already determined the split unambiguously — skip Steps A–D and emit one Budget Plan line.
  *
- * This replaces fixed formula-computed topN and uiFraction values.  The LLM has
- * richer context than a file-count formula: it understands semantic complexity
- * (one auth change > ten CSS tweaks), can identify UI tests that are warranted
- * even on mostly-backend PRs (frontend logic bugs, form validation errors), and
- * can down-scale when the diff is trivial regardless of file count.
+ * For mixed PRs (`precomputedUIPct` is undefined, `hasFrontendChanges` is true) skip
+ * Steps A–C but keep Step D so the LLM can apply judgment to determine the UI%.
  *
- * The LLM is asked to state a concrete Budget Plan before proceeding, which the
- * rest of the prompt references to enforce count discipline.
+ * Falls back to the full four-step assessment when no precomputed data is available.
  */
-export function buildScopeAssessmentSection(maxTotal = MAX_RECOMMENDATIONS, maxGenerate = MAX_TESTS_TO_GENERATE, isUIOnly = false) {
-    // Clamp minTotal to maxTotal so the range is never inverted (e.g. when maxGenerateOverride === topN).
-    const minTotal = Math.min(maxGenerate + 1, maxTotal);
-    const minAdditional = minTotal - maxGenerate; // 1 normally; 0 when maxTotal === maxGenerate
-    const baselineFormula = `${maxGenerate} (generate) + ${minAdditional} (min additional) = ${minTotal}`;
+export function buildScopeAssessmentSection(maxTotal = MAX_RECOMMENDATIONS, maxGenerate = MAX_TESTS_TO_GENERATE, isUIOnly = false,
+/** Server-determined UI/E2E percentage. `undefined` = mixed PR or unknown. */
+precomputedUIPct,
+/** Whether the diff contains frontend files (true ↔ mixed PR when precomputedUIPct is undefined). */
+hasFrontendChanges) {
+    const effectiveGenerate = Math.min(maxGenerate, maxTotal);
+    const additional = Math.max(0, maxTotal - effectiveGenerate);
+    // Unambiguous backend-only or UI-only: emit a single Budget Plan line — no LLM counting needed.
+    if (precomputedUIPct !== undefined) {
+        return `### PR Scope Assessment
+Budget Plan: ${maxTotal} total (${effectiveGenerate} generate + ${additional} additional), ${precomputedUIPct}% UI/E2E
+Use these exact numbers throughout the rest of the prompt.`;
+    }
+    // Mixed PR: server can pre-compute the total but not the UI/E2E split — keep Step D.
+    if (hasFrontendChanges) {
+        return `### PR Scope Assessment — determine UI% before planning recommendations
+Budget Plan (total already determined): **${maxTotal} total (${effectiveGenerate} generate + ${additional} additional)**
+**Step D — Determine UI vs backend split for the budget above:**
+- Non-UI slots are backend tests; start from file-count ratio for UI%, then apply judgment:
+  - Pure CSS/style changes inflate the frontend file count without adding test value → reduce UI%
+  - Frontend logic bugs (state management, calculation errors, form validation) in the diff → increase UI% even if few frontend files
+  - Frontend component calls a changed backend API → an E2E test covers both sides → count toward UI%
+  - Frontend files only in \`__tests__/\` or \`.stories.\` → exclude from the ratio
+**Append your UI% now** — update the Budget Plan to:
+\`Budget Plan: ${maxTotal} total (${effectiveGenerate} generate + ${additional} additional), <ui_pct>% UI/E2E\`
+Use these exact numbers throughout the rest of the prompt.`;
+    }
+    // Fallback (no diff context — full_repo else-branch or test): full four-step assessment.
+    const minTotal = Math.min(effectiveGenerate + 1, maxTotal);
+    const minAdditional = Math.max(0, minTotal - effectiveGenerate);
+    const baselineFormula = `${effectiveGenerate} (generate) + ${minAdditional} (min additional) = ${minTotal}`;
     const stepD = isUIOnly
         ? `**Step D — UI/E2E confirmation (frontend-only PR):**
 This is a frontend-only PR — set **100% UI/E2E** in your Budget Plan.

package/build/prompts/test-recommendation/scopeAssessment.test.js CHANGED Viewed

@@ -1,7 +1,7 @@
 jest.mock("@skyramp/skyramp", () => ({
     WorkspaceConfigManager: { create: jest.fn() },
 }));
-import { isFrontendFile, buildScopeAssessmentSection } from "./scopeAssessment.js";
+import { isFrontendFile, isTestFile, buildScopeAssessmentSection } from "./scopeAssessment.js";
 // ---------------------------------------------------------------------------
 // isFrontendFile
 // ---------------------------------------------------------------------------
@@ -58,6 +58,40 @@ describe("isFrontendFile", () => {
     });
 });
 // ---------------------------------------------------------------------------
+// isTestFile
+// ---------------------------------------------------------------------------
+describe("isTestFile", () => {
+    it("matches Skyramp-generated test suffix patterns", () => {
+        expect(isTestFile("backend/tests/orders_integration_test.py")).toBe(true);
+        expect(isTestFile("frontend/tests/cart_ui_test.ts")).toBe(true);
+        expect(isTestFile("backend/tests/users_contract_test.py")).toBe(true);
+        expect(isTestFile("backend/tests/load_test.py")).toBe(true);
+    });
+    it("matches Skyramp scenario JSON files", () => {
+        expect(isTestFile("tests/scenario_orders.json")).toBe(true);
+        expect(isTestFile("scenario_checkout.json")).toBe(true);
+    });
+    it("matches conventional Vitest/Jest/Playwright spec naming", () => {
+        expect(isTestFile("frontend/tests/cart_labels_baseline.spec.ts")).toBe(true);
+        expect(isTestFile("src/components/Button.test.tsx")).toBe(true);
+        expect(isTestFile("e2e/login.spec.js")).toBe(true);
+        expect(isTestFile("packages/foo/Bar.test.jsx")).toBe(true);
+    });
+    it("matches files under __tests__/ directories", () => {
+        expect(isTestFile("src/__tests__/utils.ts")).toBe(true);
+        expect(isTestFile("packages/foo/__tests__/Bar.tsx")).toBe(true);
+    });
+    it("returns false for ordinary source files", () => {
+        expect(isTestFile("frontend/src/pages/Cart.tsx")).toBe(false);
+        expect(isTestFile("frontend/src/components/Button.tsx")).toBe(false);
+        expect(isTestFile("backend/src/handlers/orders.py")).toBe(false);
+    });
+    it("returns false for non-spec .ts files in tests/ directories that aren't Skyramp-generated", () => {
+        // A plain tests/ directory alone doesn't trigger isTestFile — a matching filename pattern or a __tests__/ directory is required.
+        expect(isTestFile("frontend/tests/helpers/setup.ts")).toBe(false);
+    });
+});
+// ---------------------------------------------------------------------------
 // buildScopeAssessmentSection
 // ---------------------------------------------------------------------------
 describe("buildScopeAssessmentSection", () => {
@@ -80,9 +114,12 @@ describe("buildScopeAssessmentSection", () => {
     it("clamps minTotal to maxTotal when maxTotal < maxGenerate", () => {
         // Defensive: maxGenerate clamped to topN upstream, but guard applies here too
         const section = buildScopeAssessmentSection(3, 5);
-        // minTotal = min(5+1, 3) = 3; range "3–3"
+        // effectiveGenerate = min(5, 3) = 3; minTotal = min(3+1, 3) = 3; range "3–3"
         expect(section).toContain("3–3");
         expect(section).not.toMatch(/\b[6-9]–3\b/);
+        // Must not show original maxGenerate (5) in the formula
+        expect(section).toContain("3 (generate)");
+        expect(section).not.toContain("5 (generate)");
     });
     it("embeds UI/E2E confirmation step when isUIOnly=true", () => {
         const section = buildScopeAssessmentSection(10, 3, true);
@@ -101,4 +138,58 @@ describe("buildScopeAssessmentSection", () => {
         expect(section.length).toBeGreaterThan(0);
         expect(section).toContain("Budget Plan");
     });
+    // ---------------------------------------------------------------------------
+    // New branches added in PR 453 (4a) — precomputedUIPct / hasFrontendChanges
+    // ---------------------------------------------------------------------------
+    it("emits a single Budget Plan line for backend-only PR (precomputedUIPct=0)", () => {
+        // isUIOnlyPR=false, hasFrontendChanges=false → precomputedUIPct=0
+        const section = buildScopeAssessmentSection(10, 3, false, 0, false);
+        expect(section).toContain("Budget Plan: 10 total (3 generate + 7 additional), 0% UI/E2E");
+        expect(section).toContain("Use these exact numbers throughout the rest of the prompt.");
+        // Must NOT include the step-based assessment
+        expect(section).not.toContain("Step A");
+        expect(section).not.toContain("Step B");
+        expect(section).not.toContain("Step C");
+        expect(section).not.toContain("Step D");
+    });
+    it("emits a single Budget Plan line for UI-only PR (precomputedUIPct=100)", () => {
+        const section = buildScopeAssessmentSection(10, 3, true, 100, false);
+        expect(section).toContain("Budget Plan: 10 total (3 generate + 7 additional), 100% UI/E2E");
+        expect(section).not.toContain("Step A");
+        expect(section).not.toContain("Step D");
+    });
+    it("emits Step D only for mixed PR (hasFrontendChanges=true, precomputedUIPct=undefined)", () => {
+        const section = buildScopeAssessmentSection(10, 3, false, undefined, true);
+        // Total is pre-determined — shows in the Budget Plan header
+        expect(section).toContain("Budget Plan (total already determined)");
+        expect(section).toContain("10 total (3 generate + 7 additional)");
+        // Step D is kept for UI/E2E split judgment
+        expect(section).toContain("Step D");
+        // The "Append your UI%" instruction must appear (comment #3222061560)
+        expect(section).toContain("Append your UI%");
+        // Steps A–C are skipped
+        expect(section).not.toContain("Step A");
+        expect(section).not.toContain("Step B");
+        expect(section).not.toContain("Step C");
+    });
+    it("clamps additional to 0 when maxGenerate equals maxTotal (precomputed path)", () => {
+        // maxGenerate=5, maxTotal=5 → additional must be 0, not negative
+        const section = buildScopeAssessmentSection(5, 5, false, 0, false);
+        expect(section).toContain("5 generate + 0 additional");
+        expect(section).not.toMatch(/\d+ generate \+ -\d+ additional/);
+    });
+    it("clamps additional to 0 when maxGenerate exceeds maxTotal (defensive, precomputed path)", () => {
+        // Should never happen in normal usage but guard must hold
+        const section = buildScopeAssessmentSection(3, 5, false, 0, false);
+        expect(section).toContain("3 generate + 0 additional");
+        expect(section).not.toMatch(/\+ -\d+ additional/);
+        // Must not show unclamped maxGenerate
+        expect(section).not.toContain("5 generate");
+    });
+    it("clamps generate in mixed PR path when maxGenerate exceeds maxTotal", () => {
+        const section = buildScopeAssessmentSection(3, 5, false, undefined, true);
+        expect(section).toContain("3 generate + 0 additional");
+        expect(section).not.toContain("5 generate");
+        expect(section).not.toMatch(/\+ -\d+ additional/);
+    });
 });

package/build/prompts/test-recommendation/test-recommendation-prompt.js CHANGED Viewed

@@ -5,9 +5,10 @@ import { logger } from "../../utils/logger.js";
 import { buildArchitectPreamble, buildContextFetchingGuidance, buildReasoningProtocol, buildToolWorkflows, buildFewShotExamples, buildVerificationChecklist, getAuthSnippets, MAX_TESTS_TO_GENERATE, MAX_RECOMMENDATIONS, } from "./recommendationSections.js";
 import { CATEGORY_PRIORITY } from "../../types/TestRecommendation.js";
 import { buildScopeAssessmentSection, isFrontendFile } from "./scopeAssessment.js";
-import { buildExecutionPlan } from "./diffExecutionPlan.js";
+import { buildExecutionPlan, EXEC_STEP_CODE_REVIEW } from "./diffExecutionPlan.js";
 import { buildFullRepoRecommendations } from "./fullRepoCatalog.js";
-import { buildExternalCoverageSet, externalDedupKey, } from "./recommendationShared.js";
+import { ANALYSIS_STEP_EXTRACT } from "./analysisOutputPrompt.js";
+import { TASK_GENERATE, buildExternalCoverageSet, externalDedupKey, isAttackSurfaceSecurityBoundary, taskRef, } from "./recommendationShared.js";
 // Re-export for backward compatibility (tests and external callers import these from this module)
 export { buildExternalCoverageSet, externalDedupKey };
 function formatTestLocations(locs) {
@@ -47,7 +48,9 @@ function classifyNovelty(scenario, diffContext) {
     return "existing";
 }
 function prioritiseCandidate(scenario, diffContext) {
-    const priority = CATEGORY_PRIORITY[scenario.category] ?? "LOW";
+    const priority = isAttackSurfaceSecurityBoundary(scenario) // attack-surface auth boundaries bypass the category table
+        ? "CRITICAL"
+        : CATEGORY_PRIORITY[scenario.category] ?? "LOW"; // categories missing from the table default to "LOW"
     const novelty = classifyNovelty(scenario, diffContext);
     return { priority, novelty };
 }
@@ -89,6 +92,25 @@ ${isUIOnlyPR ? `\n**UI-only PR** — no backend changes. UI and E2E tests are mo
                 : ``}
 Output should be concise and immediately actionable.`
         : `You are in **Repo mode**. Comprehensive test strategy across all endpoints.`;
+    // ── UI rec authoring rules ──
+    // Anchors every UI recommendation regardless of whether the agent ran
+    // skyramp_ui_analyze_changes / browser_blueprint before this call. When the
+    // agent has prior captures in its own tool-result history, those serve as
+    // grounding; when it doesn't, recommendations fall back to source-grounded
+    // prose. Either way, inventing elements / leaking jargon / duplicating UI
+    // tests are wrong, so the rules fire unconditionally.
+    const uiRecRulesSection = `
+<ui_recommendation_authoring_rules>
+**Blueprints inform *how* you describe UI tests, not *which* tests to recommend.** The recommendation catalog derives from the same change-signals listed elsewhere in this prompt (new endpoints, schema/field changes, security boundaries, business-logic modifications, frontend route or component additions, layout additions, etc.) — refer to those signals for the source-of-truth list, not this section.
+**Do not generate near-duplicate UI tests of the same surface;** one well-targeted UI test per surface is enough.
+For UI recommendations you *do* emit, ground the \`reasoning\` field in elements you have actually observed via \`browser_blueprint\` calls earlier in this session. If a recommendation's target element is not in any blueprint you have observed, either rephrase the recommendation around an element that IS observed, or describe the test target in higher-level terms. Do not invent element names from the PR description, source diff, or component name. If you have not captured any blueprints yet (e.g. backend-only PR, or pre-flight skipped), UI recommendations fall back to source-grounded prose drawn from the diff alone — that is a legitimate outcome, not a reason to invent.
+Write UI recommendation \`reasoning\` fields in **natural prose** that names elements as a human would describe them (e.g. "the Notifications heading", "the disabled Mark all as read button"). **Do NOT mention "blueprint", "captured blueprint", "DOM analyzer", or any other internal MCP terminology in the reasoning text.** The reader of the report is a developer reviewing test recommendations on a PR; they don't know what a blueprint is and shouldn't have to. Phrases like "the captured blueprint shows X" or "visible from the captured blueprint" leak builder internals — instead just describe what the test verifies in plain product terms ("the disabled mark-all-read button in the empty state"). Likewise do NOT use internal-identifier syntax like \`role=button, accessibleName='X', logicalName=...\`.
+</ui_recommendation_authoring_rules>
+`;
     // ── Endpoint listing ──
     const allEndpoints = analysis.apiEndpoints.endpoints;
     // In PR mode, identify which endpoints were changed so we can partition the listing.
@@ -124,7 +146,7 @@ Output should be concise and immediately actionable.`
                 changedLines.push(`  ${m.method} ${ep.path} [removed]`);
             }
         }
-        endpointLines = `**Likely changed in this PR (from static file→endpoint mapping — verify against diff in Step 2):**\n${changedLines.join("\n") || "  none"}\n\n**Other endpoints (reference only):**\n${otherLines.join("\n") || "  none"}`;
+        endpointLines = `**Likely changed in this PR (from static file→endpoint mapping — verify against diff in Step ${ANALYSIS_STEP_EXTRACT}):**\n${changedLines.join("\n") || "  none"}\n\n**Other endpoints (reference only):**\n${otherLines.join("\n") || "  none"}`;
     }
     else {
         endpointLines = allEndpoints
@@ -311,10 +333,11 @@ ${detailBlocks}
         }
         mainSection = buildFullRepoRecommendations(scored, topN, analysis.apiEndpoints.baseUrl, authHeaderValue, authSchemeSnippet, authTypeValue, isFrontendProject, isFrontendOnlyProject, externalCoverageFullRepo);
     }
-    else if (isDiffScope && (isUIOnlyPR || scored.length > 0)) {
-        // Build external coverage set for programmatic dedup — prevents recommending
-        // tests that duplicate existing non-Skyramp tests at the METHOD::resource::testType
-        // level, so different methods on the same resource (e.g. GET vs PUT) remain distinct.
+    else if (isDiffScope) {
+        // Always use the full execution plan in diff scope — even when scored.length === 0
+        // (no pre-ranked scenarios). The execution plan includes the dynamic Code Review step,
+        // bug-catching insertion, and the dynamic bug-coverage gate
+        // that are critical for catching seeded bugs in new endpoints.
         const externalCoverage = buildExternalCoverageSet(testLocations);
         if (externalCoverage.size > 0) {
             logger.info(`External test coverage keys: ${[...externalCoverage].join(", ")}`);
@@ -322,6 +345,7 @@ ${detailBlocks}
         mainSection = buildExecutionPlan(scored, maxGen, topN, analysis.apiEndpoints.baseUrl, authHeaderValue, authSchemeSnippet, authTypeValue, seed, endpointCount, isUIOnlyPR, hasFrontendChanges, hasTraces, externalCoverage, analysis.existingTests.relevantExternalTestPaths ?? []);
     }
     else {
+        // Full-repo scope with no scored items — rare fallback
         mainSection = `
 ## Draft Your Execution Plan
@@ -407,7 +431,7 @@ Only add NEW recommendations for code paths introduced in the latest commit.
         prHistorySection = `
 ## PR History (PR #${prContext.prNumber})
 Tests from prior bot runs are still in the working tree — the maintenance pipeline
-(Task 2) keeps them up to date. Use the history below to **avoid duplicating** existing
+(${taskRef(TASK_GENERATE)}) keeps them up to date. Use the history below to **avoid duplicating** existing
 coverage and to fill gaps:
 - **Do NOT re-recommend** tests listed under "Previously Generated Tests" — they already
   exist and are maintained automatically.
@@ -419,7 +443,7 @@ coverage and to fill gaps:
 - **Stability**: When the code diff between commits is small, the recommendation set
   should be mostly stable. Do not churn recommendations without cause.
 - If prior execution results show failures, note the issue but do not re-recommend
-  the test — Task 2 handles fixes for existing tests.
+  the test — ${taskRef(TASK_GENERATE)} handles fixes for existing tests.
 ${historyBody}`;
     }
     // ── Compose all sections ──
@@ -436,7 +460,7 @@ ${modePreamble}
 Scope: ${scopeNote}
 ${sourcePriority}
+${uiRecRulesSection}
 <repository_context>
 ## Repository Context
@@ -485,7 +509,7 @@ ${isDiffScope
 ${mainSection}
-${isDiffScope ? buildVerificationChecklist(topN, maxGen) : ""}
+${isDiffScope ? buildVerificationChecklist(topN, maxGen, EXEC_STEP_CODE_REVIEW) : ""}
 </instructions>
 `;
 }