@skyramp/mcp 0.0.64-rc.4 → 0.0.64-rc.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -44,7 +44,16 @@ function computeTiebreakerSeed(endpoints, diffFiles) {
44
44
  return crypto.createHash("sha256").update(canonical).digest("hex").slice(0, 8);
45
45
  }
46
46
  // ── Execution Plan (replaces pre-ranked + scenarios + heuristic sections) ──
47
- function buildExecutionPlan(scored, maxGen, topN, baseUrl, authHeaderValue, authSchemeSnippet, authTypeValue, seed, endpointCount, isUIOnlyPR, hasFrontendChanges = false, hasTraces = false) {
47
+ function buildExecutionPlan(scored, maxGen, topN, baseUrl, authHeaderValue, authSchemeSnippet, authTypeValue, seed, endpointCount, isUIOnlyPR, hasFrontendChanges = false, hasTraces = false, isDiffScope = false, isFrontendProject = false, isFrontendOnlyProject = false) {
48
+ // Full-repo mode only — percentage-based UI/E2E slot targets (15% each, floor 1).
49
+ // Capped so E2E+UI together never exceed topN.
50
+ // Referenced in supplementNote below, but the ternary conditions that use them
51
+ // (`isFrontendProject && !isDiffScope`) are always false in PR/diff mode.
52
+ const rawE2E = isFrontendProject ? Math.max(1, Math.round(topN * 0.15)) : 0;
53
+ const rawUI = isFrontendProject ? Math.max(1, Math.round(topN * 0.15)) : 0;
54
+ const slotsFloor = Math.floor(topN / 2);
55
+ const minE2ESlots = Math.min(rawE2E, slotsFloor);
56
+ const minUISlots = Math.min(rawUI, Math.max(0, topN - minE2ESlots));
48
57
  const generateItems = scored.slice(0, Math.min(maxGen, scored.length));
49
58
  const additionalItems = scored.slice(maxGen, topN);
50
59
  const authRef = authHeaderValue
@@ -109,9 +118,10 @@ function buildExecutionPlan(scored, maxGen, topN, baseUrl, authHeaderValue, auth
109
118
  prereqNote);
110
119
  }
111
120
  }).join("\n\n");
112
- // For mixed PRs, always reserve slots for UI and E2E recommendations regardless of whether
113
- // traces already exist — the user can record them later or the bot can record during the run.
114
- const needsE2ESlot = hasFrontendChanges && !isUIOnlyPR;
121
+ // Reserve slots for UI/E2E ADDITIONAL recommendations on mixed PRs.
122
+ // E2E requires traces to generate — only reserve the slot when traces are available.
123
+ // UI can be recommended without traces (agent can record inline).
124
+ const needsE2ESlot = hasFrontendChanges && !isUIOnlyPR && hasTraces;
115
125
  const needsUISlot = hasFrontendChanges && !isUIOnlyPR;
116
126
  const frontendSlots = (needsE2ESlot ? 1 : 0) + (needsUISlot ? 1 : 0);
117
127
  const backendAdditionalItems = frontendSlots > 0
@@ -143,8 +153,159 @@ function buildExecutionPlan(scored, maxGen, topN, baseUrl, authHeaderValue, auth
143
153
  })() : "";
144
154
  const supplementCount = topN - generateItems.length - backendAdditionalItems.length - frontendSlots;
145
155
  const supplementNote = supplementCount > 0
146
- ? `\n**REQUIRED — You MUST add ${supplementCount} more to reach the total of ${topN}.** Draft them from endpoint interactions and source code patterns not yet covered. Use the same 5-dimension rubric and quality gate to assign priority (HIGH/MEDIUM/LOW), testType, and category.${hasFrontendChanges && !isUIOnlyPR ? " Since this PR has frontend changes, at least 1 of these should be a UI or E2E test targeting the changed components." : ""} Do NOT produce fewer than ${topN} total.`
156
+ ? `\n**REQUIRED — You MUST add ${supplementCount} more to reach the total of ${topN}.** Draft them in this priority order — exhaust each tier before moving to the next:\n\n**Tier 1:** Edge-case and error-path tests for endpoints already in the ${isDiffScope ? "GENERATE set" : "list"} — boundary values for numeric fields (e.g. 0%, 100%, >100% discount), invalid/non-existent IDs (→ 404), empty arrays where a minimum is required, missing required fields (→ 422), auth boundary (call without Authorization header → 403/401).\n\n**Tier 2:** Auth-boundary contract tests for any endpoint not yet covered.\n\n**Tier 3:** Cross-resource integration tests — ONLY when one resource's POST body contains the other's \`_id\` field. NEVER pair resources where neither POST body has the other's ID.\n\n**Tier 4:** CRUD lifecycle tests for any resource not yet covered.\n\nUse the same 5-dimension rubric to assign priority (HIGH/MEDIUM/LOW), testType, and category. For each supplement item, apply the same source-code enrichment from Step 1 — use real field names from the route handler, not generic placeholders.${isFrontendOnlyProject && !isDiffScope ? ` Since this is a frontend repo, the supplement MUST include at least ${minE2ESlots} E2E test${minE2ESlots > 1 ? "s" : ""} (\`skyramp_e2e_test_generation\`) and at least ${minUISlots} UI test${minUISlots > 1 ? "s" : ""} (\`skyramp_ui_test_generation\`). Do NOT add integration or contract tests.` : isFrontendProject && !isDiffScope ? ` Since this is a full-stack repo, the supplement MUST include at least ${minE2ESlots} E2E test${minE2ESlots > 1 ? "s" : ""} (\`skyramp_e2e_test_generation\` — full browser-to-backend flow) and at least ${minUISlots} UI test${minUISlots > 1 ? "s" : ""} (\`skyramp_ui_test_generation\` — component-level interaction flows). Add these before exhausting backend tiers.` : hasFrontendChanges && !isUIOnlyPR ? 
" Since this PR has frontend changes, at least 1 of these should be a UI or E2E test targeting the changed components." : ""} Do NOT produce fewer than ${topN} total.`
147
157
  : "";
158
+ // ── Full-repo mode: recommendations only, no execution ──────────────────
159
+ if (!isDiffScope) {
160
+ const toTitle = (name) => name.replace(/-/g, " ").replace(/\b\w/g, c => c.toUpperCase());
161
+ // Coverage ranking (highest to lowest breadth):
162
+ // E2E first: full browser-to-backend flow — exercises both frontend and backend.
163
+ // UI second: frontend components call backend APIs — also exercises backend.
164
+ // Integration third: backend API chains validated directly.
165
+ // Contract last: single-endpoint boundary only.
166
+ const TYPE_ORDER = ["e2e", "ui", "integration", "contract"];
167
+ const TYPE_LABEL = {
168
+ e2e: "E2E", ui: "UI", integration: "Integration", contract: "Contract",
169
+ };
170
+ // All scored items up to topN, already sorted by priority/novelty
171
+ const allItems = scored.slice(0, topN);
172
+ // Group by test type while preserving priority ordering within each group
173
+ const byType = new Map();
174
+ for (const t of TYPE_ORDER)
175
+ byType.set(t, []);
176
+ for (const item of allItems) {
177
+ const t = item.scenario.testType ?? (item.scenario.steps.length === 1 ? "contract" : "integration");
178
+ if (!byType.has(t))
179
+ byType.set(t, []);
180
+ byType.get(t).push(item);
181
+ }
182
+ const renderItem = (item, rank) => {
183
+ const s = item.scenario;
184
+ const testType = s.testType ?? (s.steps.length === 1 ? "contract" : "integration");
185
+ const title = toTitle(s.scenarioName);
186
+ if (testType === "contract") {
187
+ const step = s.steps[0];
188
+ const endpointURL = `${baseUrl}${step.path}`;
189
+ const isBodyMethod = ["POST", "PUT", "PATCH"].includes(step.method);
190
+ const dataParam = isBodyMethod
191
+ ? `, requestData: <${step.method} ${step.path} required fields from source code>`
192
+ : "";
193
+ return [
194
+ `**${rank}. ${title}**`,
195
+ ` ${s.description}`,
196
+ ` ${step.method} ${step.path} → ${step.expectedStatusCode}`,
197
+ ` Tool: \`skyramp_contract_test_generation({ endpointURL: "${endpointURL}", method: "${step.method}"${authRef}${dataParam} })\``,
198
+ ` From source: fill in requestData field names and the specific production boundary this validates`,
199
+ ].join("\n");
200
+ }
201
+ else {
202
+ const stepLines = s.steps.map(st => {
203
+ const isBody = ["POST", "PUT", "PATCH"].includes(st.method);
204
+ const bodyHint = isBody ? ` — body: <${st.method} ${st.path} required fields from source>` : "";
205
+ return ` ${st.order}. ${st.method} ${st.path} → ${st.expectedStatusCode}: ${st.description}${bodyHint}`;
206
+ }).join("\n");
207
+ const toolCalls = s.steps.map(st => {
208
+ const isBody = ["POST", "PUT", "PATCH"].includes(st.method);
209
+ const dataParam = isBody
210
+ ? `, requestBody: <${st.method} ${st.path} required fields from source>`
211
+ : "";
212
+ return ` skyramp_scenario_test_generation({ scenarioName: "${s.scenarioName}", destination: "${s.scenarioName}", baseURL: "${baseUrl}", method: "${st.method}", path: "${st.path}", statusCode: ${st.expectedStatusCode}${scenarioAuthRef}${dataParam} })`;
213
+ }).join("\n");
214
+ // E2E and UI use trace-based generation, not the scenario pipeline.
215
+ // Only emit per-step skyramp_scenario_test_generation calls for integration type.
216
+ const isTraceBased = testType === "e2e" || testType === "ui";
217
+ const finalTool = testType === "e2e"
218
+ ? `skyramp_e2e_test_generation({ playwrightZip: "<trace zip path>", traceFile: "<backend trace path>"${authHeaderOnlyRef} })`
219
+ : testType === "ui"
220
+ ? `skyramp_ui_test_generation({ playwrightZip: "<trace zip path>"${authHeaderOnlyRef} })`
221
+ : `skyramp_integration_test_generation({ scenarioFile: "scenario_${s.scenarioName}.json"${authHeaderOnlyRef} })`;
222
+ const toolCallsBlock = isTraceBased
223
+ ? ` ${finalTool}`
224
+ : `${toolCalls}\n ${finalTool}`;
225
+ return [
226
+ `**${rank}. ${title}**`,
227
+ ` ${s.description}`,
228
+ ` Steps:`,
229
+ stepLines,
230
+ ` Tool calls:`,
231
+ toolCallsBlock,
232
+ ` From source: fill in requestBody field values and assert all computed response fields`,
233
+ ].join("\n");
234
+ }
235
+ };
236
+ const sections = TYPE_ORDER
237
+ .filter(t => (byType.get(t) ?? []).length > 0)
238
+ .map(t => {
239
+ const items = byType.get(t);
240
+ const label = TYPE_LABEL[t];
241
+ let globalRank = 0;
242
+ for (const prev of TYPE_ORDER) {
243
+ if (prev === t)
244
+ break;
245
+ globalRank += (byType.get(prev) ?? []).length;
246
+ }
247
+ const entries = items.map((item, i) => renderItem(item, globalRank + i + 1)).join("\n\n");
248
+ return `### ${label} (${items.length})\n\n${entries}`;
249
+ })
250
+ .join("\n\n");
251
+ const repoSupplementNote = supplementNote; // already built above with isDiffScope=false tier ordering
252
+ return `## Test Recommendations (${topN} total)
253
+
254
+ > **Repo mode — no tests are executed.** Use the tool calls below to generate any recommendation on demand.
255
+ > Highest-value tests appear first within each type. Use the "From source" hint in each item to fill in field names and assertions before calling the tool.
256
+
257
+ **Step 1 — Source-Code Enrichment (MANDATORY before presenting anything)**
258
+ For each endpoint listed in the Repository Context above, read the route handler source code. Look for:
259
+ - **All required request body fields** (names and types) for POST/PUT/PATCH — use in step descriptions and tool call params
260
+ - **Computed/derived response fields** (e.g. \`total_amount\`, \`discount_percent\`) and their formulas — assert these
261
+ - **Auth middleware** (HTTPBearer, \`Depends(get_current_user)\`, \`@UseGuards\`, \`jwt.verify\`) — set \`authHeader\`/\`authScheme\`; FastAPI HTTPBearer → **403** (not 401)
262
+ - **Storage backend** — if Redis or schema-less, discard unique-constraint and cascade-delete scenarios
263
+ - **Delete behavior** — hard-delete → 204; soft-delete/cancel → 200
264
+
265
+ Do NOT present generic placeholders. Replace every \`<... from source>\` hint with actual field names and realistic values.
266
+
267
+ ${buildTestPatternGuidelines()}
268
+
269
+ ${buildTestExamples()}
270
+
271
+ **Cascade vs referential integrity:** Keep only the scenario that matches the actual FK delete policy in source (CASCADE → cascade-delete; RESTRICT/no cascade → delete-blocked). Remove the other silently — do NOT create a "Removed Recommendations" or "Not Applicable" section.
272
+
273
+ **Unique constraints:** If the storage backend is Redis, in-memory, or schema-less with no explicit \`UNIQUE\` index, discard unique-constraint scenarios entirely and replace them with a different high-value test. Do NOT list removed scenarios.
274
+
275
+ **NEVER create a "Removed Recommendations", "Not Applicable", or similar section.** If a scenario is inapplicable, silently replace it with an equivalent-priority scenario from the supplement tiers. The output must contain ONLY the ${topN} recommendations.
276
+
277
+ ${buildTestQualityCriteria()}
278
+
279
+ **5-dimension rubric — use to assign priority for supplement items:**
280
+ | Dimension | What to assess |
281
+ | Production Safety | Guards a critical boundary (auth, unique constraint, cascade delete, data integrity, breaking migration)? → HIGH |
282
+ | Bug-Finding Potential | Targets a known failure mode (race condition, data consistency, state transition, cascade effect)? → HIGH |
283
+ | User Journey Relevance | Reflects how real users interact (from traces, business flows, critical paths)? → HIGH or MEDIUM |
284
+ | Coverage Gap | Addresses an area with zero existing test coverage? → bump up one tier |
285
+ | Code Insight | Derived from actual implementation (spotted middleware pattern, N+1 risk, unique constraint)? → bump up one tier |
286
+
287
+ **Per-recommendation format (apply to ALL items):**
288
+ - Title and one-sentence description of what it validates (business rule, not just "tests the endpoint")
289
+ - Steps with concrete field names and realistic values derived from source code
290
+ - Ready-to-use tool call — replace all \`<...>\` placeholders with real values before presenting
291
+ - "From source" note — the specific production risk or business rule this prevents
292
+
293
+ **MANDATORY: Every pre-ranked item listed above MUST appear in your output — do not drop or skip any.**
294
+
295
+ ${sections}
296
+ ${repoSupplementNote}
297
+
298
+ **Test type mix — MANDATORY:**
299
+ ${isFrontendOnlyProject
300
+ ? `This is a frontend repo. Focus on E2E and UI tests only — E2E covers the full browser-to-backend flow (highest coverage), UI exercises frontend components that call backend APIs. Include at least ${minE2ESlots} E2E test${minE2ESlots > 1 ? "s" : ""} (\`skyramp_e2e_test_generation\`) and at least ${minUISlots} UI test${minUISlots > 1 ? "s" : ""} (\`skyramp_ui_test_generation\`). Do NOT add integration or contract tests.`
301
+ : isFrontendProject
302
+ ? `This is a full-stack repo. Coverage ranking: E2E (full browser-to-backend flow) > UI (frontend exercises backend APIs) > Integration (backend chains) > Contract (single endpoint). Include at least ${minE2ESlots} E2E test${minE2ESlots > 1 ? "s" : ""} (\`skyramp_e2e_test_generation\`) and at least ${minUISlots} UI test${minUISlots > 1 ? "s" : ""} (\`skyramp_ui_test_generation\`), in addition to backend integration and contract tests.`
303
+ : `Focus on integration and contract tests for all API endpoints.`}
304
+ **No smoke tests. No fuzz tests.**
305
+
306
+ **You MUST present EXACTLY ${topN} recommendations. Do NOT execute any tests. Do NOT produce fewer than ${topN}.**`;
307
+ }
308
+ // ── PR / branch-diff mode: execution plan ────────────────────────────────
148
309
  return `## Execution Plan
149
310
  Seed: ${seed} | Endpoints: ${endpointCount} | Budget: ${generateItems.length} generate + ${Math.max(topN - generateItems.length, 0)} additional = ${topN} total
150
311
 
@@ -370,7 +531,7 @@ ${detailBlocks}
370
531
  const errorA = a.scenario.steps.some(s => s.interactionType === "error" || s.interactionType === "edge-case") ? 1 : 0;
371
532
  const errorB = b.scenario.steps.some(s => s.interactionType === "error" || s.interactionType === "edge-case") ? 1 : 0;
372
533
  if (errorB !== errorA)
373
- return errorB - errorA;
534
+ return errorA - errorB;
374
535
  // Use locale-independent comparison to avoid runtime-locale non-determinism
375
536
  const nameA = a.scenario.scenarioName;
376
537
  const nameB = b.scenario.scenarioName;
@@ -427,7 +588,10 @@ Do not churn recommendations without cause.
427
588
  `;
428
589
  }
429
590
  else if (scored.length > 0) {
430
- mainSection = buildExecutionPlan(scored, maxGen, topN, analysis.apiEndpoints.baseUrl, authHeaderValue, authSchemeSnippet, authTypeValue, seed, endpointCount, isUIOnlyPR, hasFrontendChanges, hasTraces);
591
+ const projectType = analysis.projectClassification.projectType;
592
+ const isFrontendProject = projectType === "full-stack" || projectType === "frontend";
593
+ const isFrontendOnlyProject = projectType === "frontend";
594
+ mainSection = buildExecutionPlan(scored, maxGen, topN, analysis.apiEndpoints.baseUrl, authHeaderValue, authSchemeSnippet, authTypeValue, seed, endpointCount, isUIOnlyPR, hasFrontendChanges, hasTraces, isDiffScope, isFrontendProject, isFrontendOnlyProject);
431
595
  }
432
596
  else {
433
597
  mainSection = `
@@ -501,13 +665,12 @@ and adjust the test approach if needed.
501
665
  historyBody += `
502
666
  ### Previously Recommended (not generated)
503
667
  ${recLines}
504
- **Stability rule**: If a previously recommended test still applies to the current code
505
- (the endpoint exists, the business logic hasn't changed), carry it forward in your
506
- additionalRecommendations match by scenarioName (for multi-step scenarios) or by
507
- endpoint (for single-endpoint tests). Re-derive category and priority from the test
508
- content. Do NOT drop a previous recommendation unless the underlying code was removed
509
- or the test is now covered by a generated test.
510
- Only add NEW recommendations for code paths introduced in the latest commit.
668
+ **Stability rule**: Carry forward previously recommended tests unchanged in
669
+ additionalRecommendations if they still apply match by scenarioName (for multi-step
670
+ scenarios) or by endpoint (for single-endpoint tests). Re-derive category and priority
671
+ from the test content. Drop only if the underlying endpoint was removed, business logic
672
+ changed, or the test is now covered by a generated test.
673
+ GENERATE items are always executed regardless of prior recommendations — do not suppress them.
511
674
  `;
512
675
  }
513
676
  prHistorySection = `
@@ -515,8 +678,11 @@ Only add NEW recommendations for code paths introduced in the latest commit.
515
678
  Tests from prior bot runs are still in the working tree — the maintenance pipeline
516
679
  (Task 2) keeps them up to date. Use the history below to **avoid duplicating** existing
517
680
  coverage and to fill gaps:
518
- - **Do NOT re-recommend** tests listed under "Previously Generated Tests" they already
519
- exist and are maintained automatically.
681
+ - **GENERATE section is unaffected by prior history** always execute ALL pre-ranked
682
+ GENERATE items regardless of what was generated in prior runs. The execution pipeline
683
+ handles deduplication at the file level.
684
+ - Tests listed under "Previously Generated Tests" are maintained automatically by Task 2 —
685
+ do NOT include them in additionalRecommendations.
520
686
  - **Carry forward** previously recommended-but-not-generated tests unchanged in
521
687
  additionalRecommendations if they still apply. Promote the highest-priority ones
522
688
  into generation slots if capacity allows.
@@ -202,14 +202,15 @@ describe("buildRecommendationPrompt — PR History section", () => {
202
202
  expect(prompt).toContain("Promote the highest-priority ones");
203
203
  expect(prompt).toContain("into generation slots if capacity allows");
204
204
  });
205
- it("includes do-not-re-recommend instruction for implemented tests", () => {
205
+ it("instructs that GENERATE is unaffected by prior history for implemented tests (Gap 4)", () => {
206
206
  const ctx = makePRContext({
207
207
  previousRecommendations: [
208
208
  { testType: "contract", endpoint: "GET /api/items", status: "implemented", commentId: "1" },
209
209
  ],
210
210
  });
211
211
  const prompt = buildRecommendationPrompt(minimalAnalysis(), "current_branch_diff", 10, ctx);
212
- expect(prompt).toContain("Do NOT re-recommend");
212
+ expect(prompt).toContain("GENERATE section is unaffected by prior history");
213
+ expect(prompt).not.toContain("Do NOT re-recommend");
213
214
  expect(prompt).toContain("Previously Generated Tests");
214
215
  });
215
216
  it("de-duplicates multi-step scenario entries to one line per scenario", () => {
@@ -281,29 +282,31 @@ function minimalScenario(overrides = {}) {
281
282
  };
282
283
  }
283
284
  describe("buildRecommendationPrompt — Stability and supplement section", () => {
284
- it("includes Recommendation Stability section in output when scenarios exist", () => {
285
+ // Recommendation Stability is a PR-mode (branch_diff) concept — carry-forward across bot runs.
286
+ // Full-repo mode is presentation-only; there is no previous-run state to carry forward.
287
+ it("includes Recommendation Stability section in output when scenarios exist (PR mode)", () => {
285
288
  const analysis = minimalAnalysis({
286
289
  businessContext: { mainPurpose: "Test API", userFlows: [], dataFlows: [], integrationPatterns: [], draftedScenarios: [minimalScenario()] },
287
290
  });
288
- const prompt = buildRecommendationPrompt(analysis, "full_repo", 10);
291
+ const prompt = buildRecommendationPrompt(analysis, "current_branch_diff", 10);
289
292
  expect(prompt).toContain("## Recommendation Stability");
290
293
  });
291
- it("stability section uses scenarioName/endpoint matching strategy", () => {
294
+ it("stability section uses scenarioName/endpoint matching strategy (PR mode)", () => {
292
295
  const analysis = minimalAnalysis({
293
296
  businessContext: { mainPurpose: "Test API", userFlows: [], dataFlows: [], integrationPatterns: [], draftedScenarios: [minimalScenario()] },
294
297
  });
295
- const prompt = buildRecommendationPrompt(analysis, "full_repo", 10);
298
+ const prompt = buildRecommendationPrompt(analysis, "current_branch_diff", 10);
296
299
  const stabilityStart = prompt.indexOf("## Recommendation Stability");
297
300
  const stabilityBlock = prompt.slice(stabilityStart, stabilityStart + 500);
298
301
  expect(stabilityBlock).toContain("scenarioName");
299
302
  expect(stabilityBlock).toContain("endpoint");
300
303
  expect(stabilityBlock).toContain("Re-derive category and priority");
301
304
  });
302
- it("stability section specifies when to drop a recommendation", () => {
305
+ it("stability section specifies when to drop a recommendation (PR mode)", () => {
303
306
  const analysis = minimalAnalysis({
304
307
  businessContext: { mainPurpose: "Test API", userFlows: [], dataFlows: [], integrationPatterns: [], draftedScenarios: [minimalScenario()] },
305
308
  });
306
- const prompt = buildRecommendationPrompt(analysis, "full_repo", 10);
309
+ const prompt = buildRecommendationPrompt(analysis, "current_branch_diff", 10);
307
310
  expect(prompt).toContain("target endpoint was removed");
308
311
  expect(prompt).toContain("business logic changed");
309
312
  expect(prompt).toContain("covered by a generated test");
@@ -321,12 +324,12 @@ describe("buildRecommendationPrompt — Stability and supplement section", () =>
321
324
  it("MAX_TESTS_TO_GENERATE is 3", () => {
322
325
  expect(MAX_TESTS_TO_GENERATE).toBe(3);
323
326
  });
324
- it("uses MAX_CRITICAL_TESTS in category-aware selection rules", () => {
327
+ it("uses MAX_CRITICAL_TESTS in category-aware selection rules (PR mode)", () => {
325
328
  const analysis = minimalAnalysis({
326
329
  businessContext: { mainPurpose: "Test API", userFlows: [], dataFlows: [], integrationPatterns: [], draftedScenarios: [minimalScenario()] },
327
330
  });
328
- const prompt = buildRecommendationPrompt(analysis, "full_repo", 10);
329
- // The critical-category minimum line references MAX_CRITICAL_TESTS (= 3)
331
+ // MAX_CRITICAL_TESTS applies to PR mode (GENERATE items) — full_repo mode only presents, does not execute
332
+ const prompt = buildRecommendationPrompt(analysis, "current_branch_diff", 10);
330
333
  expect(prompt).toContain("GENERATE items MUST be from HIGH-priority categories");
331
334
  });
332
335
  });
@@ -369,3 +372,569 @@ describe("PATH_PARAM_UUID_GUIDANCE — no hardcoded UUID anchor", () => {
369
372
  expect(prompt).not.toMatch(UUID_V4_REGEX);
370
373
  });
371
374
  });
375
+ // ---------------------------------------------------------------------------
376
+ // Regression tests — PR #110 quality baseline
377
+ //
378
+ // Guard against regressions in recommendation quality. These tests assert that
379
+ // the key signals that made PR #110's recommendations excellent are present in
380
+ // both full_repo and branch_diff (PR) modes.
381
+ // Baseline: https://github.com/letsramp/demoshop-fullstack/pull/110
382
+ // ---------------------------------------------------------------------------
383
+ function mockDiffScenario(overrides = {}) {
384
+ return {
385
+ scenarioName: "orders-update-with-discount",
386
+ description: "PUT /api/v1/orders/{order_id} with discount_percent — verifies total_amount formula",
387
+ category: "business_rule",
388
+ priority: "high",
389
+ steps: [
390
+ { order: 1, method: "POST", path: "/api/v1/products", expectedStatusCode: 201, description: "Create product", interactionType: "success" },
391
+ { order: 2, method: "POST", path: "/api/v1/orders", expectedStatusCode: 201, description: "Create order", interactionType: "success", chainsFrom: { sourceField: "id", sourceStep: 1, sourceLocation: "body", targetParam: "product_id", targetLocation: "body" } },
392
+ { order: 3, method: "PUT", path: "/api/v1/orders/{order_id}", expectedStatusCode: 200, description: "Apply discount", interactionType: "success", chainsFrom: { sourceField: "order_id", sourceStep: 2, sourceLocation: "body", targetParam: "order_id", targetLocation: "path" } },
393
+ ],
394
+ chainingKeys: ["id", "order_id"],
395
+ requiresAuth: true,
396
+ estimatedComplexity: "moderate",
397
+ testType: "integration",
398
+ ...overrides,
399
+ };
400
+ }
401
+ function analysisWithScenario(scope) {
402
+ const base = minimalAnalysis({
403
+ businessContext: {
404
+ mainPurpose: "E-commerce demo",
405
+ userFlows: [],
406
+ dataFlows: [],
407
+ integrationPatterns: [],
408
+ draftedScenarios: [mockDiffScenario()],
409
+ },
410
+ });
411
+ if (scope === "current_branch_diff") {
412
+ return {
413
+ ...base,
414
+ branchDiffContext: {
415
+ currentBranch: "shiny/edit-order",
416
+ baseBranch: "main",
417
+ changedFiles: ["backend/app/routers/orders.py"],
418
+ newEndpoints: [{
419
+ path: "/api/v1/orders/{order_id}",
420
+ methods: [{ method: "PUT", sourceFile: "orders.py", interactionCount: 3 }],
421
+ }],
422
+ modifiedEndpoints: [],
423
+ affectedServices: ["orders"],
424
+ },
425
+ };
426
+ }
427
+ return base;
428
+ }
429
+ describe("PR #110 quality baseline — full_repo mode", () => {
430
+ let prompt;
431
+ beforeAll(() => { prompt = buildRecommendationPrompt(analysisWithScenario("full_repo"), "full_repo", 20); });
432
+ it("source enrichment targets each endpoint's route handler, not 'changed files'", () => {
433
+ expect(prompt).toContain("Source-Code Enrichment");
434
+ expect(prompt).toContain("route handler");
435
+ expect(prompt).not.toContain("Read the source code for ALL changed files");
436
+ });
437
+ it("includes test pattern guidelines for quality anchoring", () => {
438
+ expect(prompt).toContain("Test Pattern Guidelines");
439
+ });
440
+ it("includes concrete impressive/deprioritise examples", () => {
441
+ expect(prompt).toContain("Impressive (these catch prod bugs)");
442
+ expect(prompt).toContain("Deprioritise");
443
+ });
444
+ it("supplement ordering puts edge cases before cross-resource (Tier 1 before Tier 3)", () => {
445
+ const tier1Idx = prompt.indexOf("Tier 1");
446
+ const tier3Idx = prompt.indexOf("Tier 3");
447
+ expect(tier1Idx).toBeGreaterThan(-1);
448
+ expect(tier3Idx).toBeGreaterThan(-1);
449
+ expect(tier1Idx).toBeLessThan(tier3Idx);
450
+ });
451
+ it("supplement Tier 1 calls out boundary values and invalid IDs explicitly", () => {
452
+ expect(prompt).toMatch(/Tier 1.*boundary values/s);
453
+ expect(prompt).toMatch(/Tier 1.*invalid.*non-existent IDs/s);
454
+ });
455
+ it("includes 5-dimension quality rubric", () => {
456
+ expect(prompt).toContain("Production Safety");
457
+ expect(prompt).toContain("Bug-Finding Potential");
458
+ expect(prompt).toContain("Coverage Gap");
459
+ });
460
+ it("includes per-recommendation format instruction", () => {
461
+ // Full-repo mode hides category/priority from user output — check for format label and key fields
462
+ expect(prompt).toContain("Per-recommendation format");
463
+ expect(prompt).toContain("tool call");
464
+ expect(prompt).toContain("From source");
465
+ });
466
+ it("includes unique-constraint storage gating for Redis", () => {
467
+ expect(prompt).toContain("Unique constraints");
468
+ expect(prompt).toContain("Redis");
469
+ });
470
+ });
471
+ // ---------------------------------------------------------------------------
472
+ // Tests — full_repo output format and execution guardrails
473
+ //
474
+ // Guard that full_repo mode:
475
+ // - never emits execution/GENERATE language
476
+ // - groups items by test type with section headers
477
+ // - hides category/priority labels from user-facing rendered items
478
+ // - emits "Do NOT execute any tests"
479
+ // - renders pre-ranked item names
480
+ // - includes cascade guidance
481
+ // - scopes Tier 1 supplement to "list" (not "GENERATE set")
482
+ // ---------------------------------------------------------------------------
483
+ function fullRepoAnalysisWithScenarios(overrides = {}, scenarios = []) {
484
+ return minimalAnalysis({
485
+ businessContext: {
486
+ mainPurpose: "E-commerce API",
487
+ userFlows: [],
488
+ dataFlows: [],
489
+ integrationPatterns: [],
490
+ draftedScenarios: scenarios.length > 0 ? scenarios : [mockDiffScenario()],
491
+ },
492
+ ...overrides,
493
+ });
494
+ }
495
+ function makeContractScenario() {
496
+ return {
497
+ scenarioName: "create-product-contract",
498
+ description: "POST /api/v1/products auth boundary",
499
+ category: "security_boundary",
500
+ priority: "high",
501
+ steps: [{ order: 1, method: "POST", path: "/api/v1/products", expectedStatusCode: 201, description: "Create product", interactionType: "success" }],
502
+ chainingKeys: [],
503
+ requiresAuth: true,
504
+ estimatedComplexity: "simple",
505
+ testType: "contract",
506
+ };
507
+ }
508
+ describe("full_repo mode — output format and execution guardrails", () => {
509
+ let prompt;
510
+ beforeAll(() => {
511
+ prompt = buildRecommendationPrompt(fullRepoAnalysisWithScenarios({}, [mockDiffScenario(), makeContractScenario()]), "full_repo", 10);
512
+ });
513
+ it("does NOT contain GENERATE execution language", () => {
514
+ expect(prompt).not.toContain("### GENERATE");
515
+ expect(prompt).not.toContain("execute these in order");
516
+ expect(prompt).not.toContain("one retry on failure then skip");
517
+ });
518
+ it("does NOT contain the PR-mode ADDITIONAL section header", () => {
519
+ // The '### ADDITIONAL (list in additionalRecommendations...)' header is a PR-mode structural
520
+ // concept; it must not appear in the full_repo grouped output.
521
+ expect(prompt).not.toContain("### ADDITIONAL (list in additionalRecommendations");
522
+ });
523
+ it("contains explicit 'Do NOT execute any tests' instruction", () => {
524
+ expect(prompt).toContain("Do NOT execute any tests");
525
+ });
526
+ it("contains 'Repo mode' header or preamble", () => {
527
+ expect(prompt).toContain("Repo mode");
528
+ });
529
+ it("groups items by test type — Integration section header present", () => {
530
+ expect(prompt).toMatch(/### (Integration|Contract)/);
531
+ });
532
+ it("E2E section appears before Integration section (E2E ranked highest coverage)", () => {
533
+ const e2eIdx = prompt.indexOf("### E2E");
534
+ const integrationIdx = prompt.indexOf("### Integration");
535
+ // If E2E section exists, it must appear before Integration
536
+ if (e2eIdx !== -1 && integrationIdx !== -1) {
537
+ expect(e2eIdx).toBeLessThan(integrationIdx);
538
+ }
539
+ // At minimum, E2E appears before Contract
540
+ const contractIdx = prompt.indexOf("### Contract");
541
+ if (e2eIdx !== -1 && contractIdx !== -1) {
542
+ expect(e2eIdx).toBeLessThan(contractIdx);
543
+ }
544
+ });
545
+ it("UI section appears before Integration and Contract sections", () => {
546
+ const uiIdx = prompt.indexOf("### UI");
547
+ const integrationIdx = prompt.indexOf("### Integration");
548
+ const contractIdx = prompt.indexOf("### Contract");
549
+ if (uiIdx !== -1 && integrationIdx !== -1) {
550
+ expect(uiIdx).toBeLessThan(integrationIdx);
551
+ }
552
+ if (uiIdx !== -1 && contractIdx !== -1) {
553
+ expect(uiIdx).toBeLessThan(contractIdx);
554
+ }
555
+ });
556
+ it("prompt forbids the LLM from creating a 'Removed Recommendations' section", () => {
557
+ // The prompt must contain the 'NEVER create' instruction so the LLM doesn't add such a section
558
+ expect(prompt).toContain("NEVER create a");
559
+ // The prompt must NOT have an actual section heading titled 'Removed Recommendations'
560
+ // (it may contain the phrase inside the NEVER instruction itself, which is expected)
561
+ expect(prompt).not.toMatch(/^##+ Removed Recommendations/m);
562
+ expect(prompt).not.toMatch(/^##+ Not Applicable/m);
563
+ });
564
+ it("rendered item does NOT contain 'priority=' label visible to user", () => {
565
+ // priority= is a PR-mode label; must not appear in rendered sections
566
+ expect(prompt).not.toMatch(/priority=(HIGH|MEDIUM|LOW|CRITICAL)/);
567
+ });
568
+ it("rendered item does NOT contain pipe-delimited category label", () => {
569
+ // | category | pattern used in PR-mode GENERATE blocks
570
+ expect(prompt).not.toMatch(/\| (security_boundary|business_rule|data_integrity|crud|workflow) \|/);
571
+ });
572
+ it("renders the pre-ranked scenario name in the output", () => {
573
+ expect(prompt).toContain("orders-update-with-discount");
574
+ });
575
+ it("includes cascade vs referential integrity guidance", () => {
576
+ expect(prompt).toContain("Cascade vs referential integrity");
577
+ });
578
+ it("supplement Tier 1 is scoped to 'list' (not 'GENERATE set') in full_repo", () => {
579
+ // In full_repo there is no GENERATE set — supplement references the pre-ranked list
580
+ expect(prompt).toMatch(/Tier 1.*list/s);
581
+ expect(prompt).not.toMatch(/Tier 1.*GENERATE set/s);
582
+ });
583
+ it("supplement note references 5-dimension rubric for priority assignment", () => {
584
+ expect(prompt).toContain("5-dimension rubric");
585
+ });
586
+ it("cascade guidance instructs silent removal — no 'Removed Recommendations' section", () => {
587
+ // The cascade guidance must say to remove silently, not to list removed items
588
+ expect(prompt).toContain("silently");
589
+ expect(prompt).toContain("Do NOT list removed scenarios");
590
+ });
591
+ });
592
+ // ---------------------------------------------------------------------------
593
+ // Tests — full_repo mode: full-stack vs backend-only test mix
594
+ // ---------------------------------------------------------------------------
595
+ describe("full_repo mode — full-stack repo test mix", () => {
596
+ function fullStackAnalysis() {
597
+ return fullRepoAnalysisWithScenarios({
598
+ projectClassification: {
599
+ projectType: "full-stack",
600
+ primaryLanguage: "TypeScript",
601
+ primaryFramework: "Next.js",
602
+ deploymentPattern: "full-stack",
603
+ },
604
+ });
605
+ }
606
+ function backendOnlyAnalysis() {
607
+ return fullRepoAnalysisWithScenarios({
608
+ projectClassification: {
609
+ projectType: "rest-api",
610
+ primaryLanguage: "Python",
611
+ primaryFramework: "FastAPI",
612
+ deploymentPattern: "traditional",
613
+ },
614
+ });
615
+ }
616
+ // topN=10 → 15% × 10 = 1.5 → round → 2 for both E2E and UI
617
+ it("full-stack repo mandates percentage-based UI slots (topN=10 → ≥2)", () => {
618
+ const prompt = buildRecommendationPrompt(fullStackAnalysis(), "full_repo", 10);
619
+ expect(prompt).toContain("skyramp_ui_test_generation");
620
+ expect(prompt).toMatch(/at least 2 UI test/);
621
+ });
622
+ it("full-stack repo mandates percentage-based E2E slots (topN=10 → ≥2)", () => {
623
+ const prompt = buildRecommendationPrompt(fullStackAnalysis(), "full_repo", 10);
624
+ expect(prompt).toContain("skyramp_e2e_test_generation");
625
+ expect(prompt).toMatch(/at least 2 E2E test/);
626
+ });
627
+ // topN=20 → 15% × 20 = 3 for both E2E and UI (scales up vs fixed ≥1/≥2)
628
+ it("full-stack repo scales to ≥3 E2E and ≥3 UI at topN=20", () => {
629
+ const prompt = buildRecommendationPrompt(fullStackAnalysis(), "full_repo", 20);
630
+ expect(prompt).toMatch(/at least 3 E2E test/);
631
+ expect(prompt).toMatch(/at least 3 UI test/);
632
+ });
633
+ // topN=5 → 15% × 5 = 0.75 → round → 1, floor at 1
634
+ it("full-stack repo floors at ≥1 E2E and ≥1 UI for small topN=5", () => {
635
+ const prompt = buildRecommendationPrompt(fullStackAnalysis(), "full_repo", 5);
636
+ expect(prompt).toMatch(/at least 1 E2E test/);
637
+ expect(prompt).toMatch(/at least 1 UI test/);
638
+ });
639
+ it("full-stack repo explicitly excludes smoke and fuzz tests", () => {
640
+ const prompt = buildRecommendationPrompt(fullStackAnalysis(), "full_repo", 10);
641
+ expect(prompt).toContain("No smoke tests");
642
+ expect(prompt).toContain("No fuzz tests");
643
+ });
644
+ it("backend-only (rest-api) repo does NOT mandate UI/E2E tests", () => {
645
+ const prompt = buildRecommendationPrompt(backendOnlyAnalysis(), "full_repo", 10);
646
+ // Tool names appear in generic buildToolWorkflows docs — check for the mandate text instead
647
+ expect(prompt).not.toMatch(/at least \d+ (UI|E2E) test/);
648
+ expect(prompt).not.toContain("supplement MUST include");
649
+ expect(prompt).not.toContain("full-stack repo");
650
+ });
651
+ it("backend-only repo focuses on integration and contract tests", () => {
652
+ const prompt = buildRecommendationPrompt(backendOnlyAnalysis(), "full_repo", 10);
653
+ expect(prompt).toContain("integration and contract tests");
654
+ });
655
+ it("backend-only repo still excludes smoke and fuzz tests", () => {
656
+ const prompt = buildRecommendationPrompt(backendOnlyAnalysis(), "full_repo", 10);
657
+ expect(prompt).toContain("No smoke tests");
658
+ expect(prompt).toContain("No fuzz tests");
659
+ });
660
+ it("'frontend' project type focuses on UI/E2E only — NOT backend tests", () => {
661
+ const frontendAnalysis = fullRepoAnalysisWithScenarios({
662
+ projectClassification: {
663
+ projectType: "frontend",
664
+ primaryLanguage: "TypeScript",
665
+ primaryFramework: "React",
666
+ deploymentPattern: "traditional",
667
+ },
668
+ });
669
+ const prompt = buildRecommendationPrompt(frontendAnalysis, "full_repo", 10);
670
+ // topN=10 → 15% × 10 = 1.5 → round → 2 for both
671
+ expect(prompt).toMatch(/at least 2 UI test/);
672
+ expect(prompt).toMatch(/at least 2 E2E test/);
673
+ // Should NOT say "in addition to backend integration and contract tests"
674
+ expect(prompt).not.toContain("in addition to backend integration and contract tests");
675
+ // Should explicitly say no integration/contract
676
+ expect(prompt).toContain("Do NOT add integration or contract tests");
677
+ });
678
+ it("'frontend' project type says 'frontend repo' not 'full-stack repo'", () => {
679
+ const frontendAnalysis = fullRepoAnalysisWithScenarios({
680
+ projectClassification: {
681
+ projectType: "frontend",
682
+ primaryLanguage: "TypeScript",
683
+ primaryFramework: "React",
684
+ deploymentPattern: "traditional",
685
+ },
686
+ });
687
+ const prompt = buildRecommendationPrompt(frontendAnalysis, "full_repo", 10);
688
+ expect(prompt).toContain("frontend repo");
689
+ expect(prompt).not.toContain("full-stack repo");
690
+ });
691
+ it("'full-stack' project type includes BOTH backend and frontend tests", () => {
692
+ const prompt = buildRecommendationPrompt(fullStackAnalysis(), "full_repo", 10);
693
+ expect(prompt).toContain("full-stack repo");
694
+ expect(prompt).toContain("in addition to backend integration and contract tests");
695
+ });
696
+ it("full-stack repo explains E2E > UI > Integration > Contract coverage ranking", () => {
697
+ const prompt = buildRecommendationPrompt(fullStackAnalysis(), "full_repo", 10);
698
+ expect(prompt).toContain("Coverage ranking");
699
+ expect(prompt).toContain("E2E");
700
+ expect(prompt).toContain("UI");
701
+ });
702
+ // Critical: scenarioDrafting.ts NEVER generates UI or E2E testType —
703
+ // they only come from the LLM supplement. The supplement note MUST
704
+ // explicitly tell the LLM to add UI/E2E for full-stack repos, otherwise
705
+ // the LLM fills the supplement with backend-only tiers (edge cases, CRUD)
706
+ // and never produces UI/E2E recommendations (PR #110 regression risk).
707
+ it("full-stack supplement note explicitly mandates UI and E2E with percentage-based counts (PR #110 regression guard)", () => {
708
+ // topN=20, 15% → 3 E2E + 3 UI mandated in the supplement note
709
+ const analysis = fullRepoAnalysisWithScenarios({
710
+ projectClassification: {
711
+ projectType: "full-stack",
712
+ primaryLanguage: "TypeScript",
713
+ primaryFramework: "Next.js",
714
+ deploymentPattern: "full-stack",
715
+ },
716
+ });
717
+ const prompt = buildRecommendationPrompt(analysis, "full_repo", 20);
718
+ // Tool names must appear in supplement (not just test-mix footer)
719
+ const requiredIdx = prompt.indexOf("REQUIRED — You MUST add");
720
+ const e2eIdx = prompt.indexOf("skyramp_e2e_test_generation");
721
+ expect(requiredIdx).toBeGreaterThan(-1);
722
+ expect(e2eIdx).toBeGreaterThan(-1);
723
+ expect(e2eIdx).toBeGreaterThan(requiredIdx); // inside supplement note
724
+ // Percentage-based count: topN=20 → 3
725
+ expect(prompt).toMatch(/at least 3 E2E test/);
726
+ expect(prompt).toMatch(/at least 3 UI test/);
727
+ });
728
+ it("backend-only repo supplement note does NOT add UI/E2E mandate", () => {
729
+ const analysis = fullRepoAnalysisWithScenarios({
730
+ projectClassification: {
731
+ projectType: "rest-api",
732
+ primaryLanguage: "Python",
733
+ primaryFramework: "FastAPI",
734
+ deploymentPattern: "traditional",
735
+ },
736
+ });
737
+ const prompt = buildRecommendationPrompt(analysis, "full_repo", 20);
738
+ const requiredIdx = prompt.indexOf("REQUIRED — You MUST add");
739
+ if (requiredIdx === -1)
740
+ return; // no supplement needed
741
+ const supplementBlock = prompt.slice(requiredIdx, requiredIdx + 800);
742
+ // Backend-only repos should NOT mandate UI/E2E in the supplement tiers
743
+ expect(supplementBlock).not.toContain("full-stack repo, the supplement MUST include");
744
+ });
745
+ });
746
+ // ---------------------------------------------------------------------------
747
+ // Tests — full_repo mode: PR mode must NOT be affected by these changes
748
+ // ---------------------------------------------------------------------------
749
+ describe("full_repo mode — PR mode unchanged by full_repo changes", () => {
750
+ let prPrompt;
751
+ beforeAll(() => {
752
+ prPrompt = buildRecommendationPrompt(analysisWithScenario("current_branch_diff"), "current_branch_diff", 10);
753
+ });
754
+ it("PR mode still contains GENERATE execution language", () => {
755
+ expect(prPrompt).toContain("### GENERATE");
756
+ });
757
+ it("PR mode still shows priority= labels on GENERATE items", () => {
758
+ expect(prPrompt).toMatch(/priority=(HIGH|MEDIUM|LOW|CRITICAL)/);
759
+ });
760
+ it("PR mode does not show 'Do NOT execute any tests'", () => {
761
+ expect(prPrompt).not.toContain("Do NOT execute any tests");
762
+ });
763
+ it("PR mode does not show 'Repo mode' preamble", () => {
764
+ expect(prPrompt).not.toContain("Repo mode — no tests are executed");
765
+ });
766
+ });
767
+ describe("PR #110 quality baseline — branch_diff (PR) mode", () => {
768
+ let prompt;
769
+ beforeAll(() => { prompt = buildRecommendationPrompt(analysisWithScenario("current_branch_diff"), "current_branch_diff", 20); });
770
+ it("source enrichment references changed files (not 'each endpoint')", () => {
771
+ expect(prompt).toContain("Source-Code Enrichment");
772
+ expect(prompt).toContain("changed files");
773
+ expect(prompt).not.toContain("For each endpoint listed in the Repository Context above, read the route handler");
774
+ });
775
+ it("supplement Tier 1 scoped to GENERATE set", () => {
776
+ expect(prompt).toMatch(/Tier 1.*GENERATE set/s);
777
+ });
778
+ it("supplement ordering puts edge cases before cross-resource", () => {
779
+ const tier1Idx = prompt.indexOf("Tier 1");
780
+ const tier3Idx = prompt.indexOf("Tier 3");
781
+ expect(tier1Idx).toBeGreaterThan(-1);
782
+ expect(tier3Idx).toBeGreaterThan(-1);
783
+ expect(tier1Idx).toBeLessThan(tier3Idx);
784
+ });
785
+ it("includes cascade vs referential integrity guidance", () => {
786
+ expect(prompt).toContain("Cascade vs referential integrity");
787
+ });
788
+ it("includes per-recommendation format requirements", () => {
789
+ expect(prompt).toContain("Per-recommendation format");
790
+ });
791
+ it("GENERATE block present for the business_rule scenario", () => {
792
+ expect(prompt).toContain("GENERATE");
793
+ expect(prompt).toContain("orders-update-with-discount");
794
+ });
795
+ });
796
+ // ---------------------------------------------------------------------------
797
+ // Regression tests — v3 gap fixes
798
+ // ---------------------------------------------------------------------------
799
+ describe("Gap 1 — happy-path ranking: success scenarios ranked before error/edge-case scenarios", () => {
800
+ function makeScenarioByInteraction(name, interactionType) {
801
+ return mockDiffScenario({
802
+ scenarioName: name,
803
+ steps: [
804
+ { order: 1, method: "POST", path: "/api/items", expectedStatusCode: interactionType === "success" ? 201 : 404, description: "step", interactionType },
805
+ { order: 2, method: "GET", path: "/api/items/{id}", expectedStatusCode: interactionType === "success" ? 200 : 404, description: "verify", interactionType },
806
+ { order: 3, method: "DELETE", path: "/api/items/{id}", expectedStatusCode: interactionType === "success" ? 204 : 404, description: "cleanup", interactionType },
807
+ ],
808
+ });
809
+ }
810
+ it("happy-path scenario ranked before error-path scenario in GENERATE block", () => {
811
+ const analysis = {
812
+ ...analysisWithScenario("current_branch_diff"),
813
+ businessContext: {
814
+ mainPurpose: "Test",
815
+ userFlows: [], dataFlows: [], integrationPatterns: [],
816
+ draftedScenarios: [
817
+ makeScenarioByInteraction("error-path-scenario", "error"),
818
+ makeScenarioByInteraction("happy-path-scenario", "success"),
819
+ ],
820
+ },
821
+ };
822
+ const prompt = buildRecommendationPrompt(analysis, "current_branch_diff", 5);
823
+ const happyIdx = prompt.indexOf("happy-path-scenario");
824
+ const errorIdx = prompt.indexOf("error-path-scenario");
825
+ expect(happyIdx).toBeGreaterThan(-1);
826
+ expect(errorIdx).toBeGreaterThan(-1);
827
+ // Happy path should appear first (lower index = earlier in the output)
828
+ expect(happyIdx).toBeLessThan(errorIdx);
829
+ });
830
+ });
831
+ describe("Gap 2 — E2E ADDITIONAL slot gated on hasTraces", () => {
832
+ function makeMixedPRAnalysis(hasTraceFiles) {
833
+ // Needs draftedScenarios so scored.length > 0 and buildExecutionPlan is reached
834
+ const base = analysisWithScenario("current_branch_diff");
835
+ return {
836
+ ...base,
837
+ artifacts: {
838
+ openApiSpecs: [],
839
+ playwrightRecordings: [],
840
+ traceFiles: hasTraceFiles ? [{ path: "/repo/tests/trace.json", format: "skyramp" }] : [],
841
+ notFound: [],
842
+ },
843
+ branchDiffContext: {
844
+ currentBranch: "test",
845
+ baseBranch: "main",
846
+ // A .tsx file under frontend/components/ triggers hasFrontendChanges; newEndpoints makes it a mixed PR (not UI-only)
847
+ changedFiles: ["frontend/components/App.tsx", "backend/routers/orders.py"],
848
+ newEndpoints: [{ path: "/api/v1/orders/{order_id}", methods: [{ method: "PUT", sourceFile: "orders.py", interactionCount: 3 }] }],
849
+ modifiedEndpoints: [],
850
+ affectedServices: ["orders"],
851
+ },
852
+ };
853
+ }
854
+ it("E2E [ADDITIONAL] slot present when hasTraces=true and frontend+API changes exist", () => {
855
+ const prompt = buildRecommendationPrompt(makeMixedPRAnalysis(true), "current_branch_diff", 10);
856
+ expect(prompt).toMatch(/\[ADDITIONAL\].*E2E/s);
857
+ });
858
+ it("E2E [ADDITIONAL] slot absent when hasTraces=false and frontend+API changes exist", () => {
859
+ const prompt = buildRecommendationPrompt(makeMixedPRAnalysis(false), "current_branch_diff", 10);
860
+ // UI slot should still be present, E2E slot should not
861
+ expect(prompt).toMatch(/\[ADDITIONAL\].*UI/s);
862
+ // [ADDITIONAL] E2E label must not appear (tool docs contain "E2E" but not as [ADDITIONAL] label)
863
+ expect(prompt).not.toContain("[ADDITIONAL] | E2E |");
864
+ });
865
+ });
866
+ describe("Gap 4 — PR history does NOT suppress GENERATE items on 2nd+ run", () => {
867
+ it("prompt contains GENERATE-unaffected instruction when prior history exists", () => {
868
+ const ctx = makePRContext({
869
+ previousRecommendations: [
870
+ { testType: "integration", endpoint: "POST /api/v1/orders", scenarioName: "orders-update-with-discount", status: "implemented", commentId: "1" },
871
+ ],
872
+ });
873
+ const prompt = buildRecommendationPrompt(analysisWithScenario("current_branch_diff"), "current_branch_diff", 5, ctx);
874
+ expect(prompt).toContain("GENERATE section is unaffected by prior history");
875
+ });
876
+ it("prompt does NOT contain old suppression text 'Do NOT re-recommend'", () => {
877
+ const ctx = makePRContext({
878
+ previousRecommendations: [
879
+ { testType: "integration", endpoint: "POST /api/v1/orders", status: "implemented", commentId: "1" },
880
+ ],
881
+ });
882
+ const prompt = buildRecommendationPrompt(analysisWithScenario("current_branch_diff"), "current_branch_diff", 5, ctx);
883
+ expect(prompt).not.toContain("Do NOT re-recommend");
884
+ });
885
+ });
886
+ describe("renderItem — correct tool for E2E and UI testTypes in full_repo mode", () => {
887
+ function makeTypedScenario(testType) {
888
+ return mockDiffScenario({
889
+ scenarioName: `${testType}-scenario`,
890
+ testType,
891
+ steps: [
892
+ { order: 1, method: "GET", path: "/api/items", expectedStatusCode: 200, description: "list items", interactionType: "success" },
893
+ { order: 2, method: "POST", path: "/api/items", expectedStatusCode: 201, description: "create item", interactionType: "success" },
894
+ ],
895
+ });
896
+ }
897
+ it("integration scenario uses skyramp_integration_test_generation in full_repo", () => {
898
+ const analysis = minimalAnalysis({
899
+ businessContext: { mainPurpose: "Test", userFlows: [], dataFlows: [], integrationPatterns: [], draftedScenarios: [makeTypedScenario("integration")] },
900
+ });
901
+ const prompt = buildRecommendationPrompt(analysis, "full_repo", 5);
902
+ expect(prompt).toContain("skyramp_integration_test_generation");
903
+ });
904
+ it("e2e scenario uses skyramp_e2e_test_generation and omits scenario step calls in full_repo", () => {
905
+ const analysis = minimalAnalysis({
906
+ businessContext: { mainPurpose: "Test", userFlows: [], dataFlows: [], integrationPatterns: [], draftedScenarios: [makeTypedScenario("e2e")] },
907
+ });
908
+ const prompt = buildRecommendationPrompt(analysis, "full_repo", 5);
909
+ // Extract recommendation content only (before Tool Workflows docs which list all tools)
910
+ const toolWorkflowsIdx = prompt.indexOf("## How to Generate Tests");
911
+ const mainContent = toolWorkflowsIdx > 0 ? prompt.slice(0, toolWorkflowsIdx) : prompt;
912
+ expect(mainContent).toContain("skyramp_e2e_test_generation");
913
+ expect(mainContent).not.toContain("skyramp_integration_test_generation");
914
+ // E2E does not use per-step scenario pipeline
915
+ expect(mainContent).not.toContain("skyramp_scenario_test_generation");
916
+ });
917
+ it("ui scenario uses skyramp_ui_test_generation and omits scenario step calls in full_repo", () => {
918
+ const analysis = minimalAnalysis({
919
+ businessContext: { mainPurpose: "Test", userFlows: [], dataFlows: [], integrationPatterns: [], draftedScenarios: [makeTypedScenario("ui")] },
920
+ });
921
+ const prompt = buildRecommendationPrompt(analysis, "full_repo", 5);
922
+ // Extract recommendation content only (before Tool Workflows docs which list all tools)
923
+ const toolWorkflowsIdx = prompt.indexOf("## How to Generate Tests");
924
+ const mainContent = toolWorkflowsIdx > 0 ? prompt.slice(0, toolWorkflowsIdx) : prompt;
925
+ expect(mainContent).toContain("skyramp_ui_test_generation");
926
+ expect(mainContent).not.toContain("skyramp_integration_test_generation");
927
+ // UI does not use per-step scenario pipeline
928
+ expect(mainContent).not.toContain("skyramp_scenario_test_generation");
929
+ });
930
+ it("integration scenario still emits per-step skyramp_scenario_test_generation calls in full_repo", () => {
931
+ const analysis = minimalAnalysis({
932
+ businessContext: { mainPurpose: "Test", userFlows: [], dataFlows: [], integrationPatterns: [], draftedScenarios: [makeTypedScenario("integration")] },
933
+ });
934
+ const prompt = buildRecommendationPrompt(analysis, "full_repo", 5);
935
+ const toolWorkflowsIdx = prompt.indexOf("## How to Generate Tests");
936
+ const mainContent = toolWorkflowsIdx > 0 ? prompt.slice(0, toolWorkflowsIdx) : prompt;
937
+ expect(mainContent).toContain("skyramp_scenario_test_generation");
938
+ expect(mainContent).toContain("skyramp_integration_test_generation");
939
+ });
940
+ });
@@ -120,6 +120,7 @@ Generate a net-new test. Use a unique descriptive filename to avoid overwriting
120
120
  **How to generate each type (for ADD and REGENERATE):**
121
121
  - **Integration**: call \`skyramp_scenario_test_generation\` per step (sequentially), then \`skyramp_integration_test_generation\` with the scenario file.
122
122
  Scenario JSON goes in the same \`outputDir\` (e.g. \`tests/scenario_<name>.json\`), not \`.skyramp/\`.
123
+ **Required fields (MANDATORY before generating any scenario step):** For every POST/PUT/PATCH step — including prerequisite/setup steps (e.g. create a product before creating an order) — read the route handler source code or OpenAPI schema to identify ALL required request body fields. Include every required field with a realistic value. Do NOT omit fields just because they are not the focus of the test.
123
124
  - **Contract**: call \`skyramp_contract_test_generation\` with \`endpointURL\`, \`method\`, and \`requestData\` for POST/PUT/PATCH.
124
125
  Pass \`apiSchema\` if an OpenAPI spec exists.
125
126
  For internal/microservice APIs: add \`providerMode: true\` to verify implementation matches the contract.
@@ -159,11 +160,10 @@ await page.waitForTimeout(1500);
159
160
  \`\`\`
160
161
  Then re-run the test. This is a common issue with SSR/SPA frameworks where the DOM is rendered but not yet interactive.
161
162
 
162
- **After generation, fix chaining and enhance assertions only:**
163
- - Path params like \`id = 'id'\` → \`skyramp.get_response_value(prev_response, "id")\`
164
- - Hardcoded IDs in request bodies dynamic values from prior response
165
- - **Integration tests and contract provider tests**: after the test generation, you MUST enhance response body assertions as instructed in the tool output.
166
- - Change ONLY chaining values and enhance assertions. Preserve everything else exactly as generated.
163
+ **After generation, you MUST do exactly two things — nothing more, nothing less:**
164
+ 1. **Fix chaining**: replace hardcoded IDs with dynamic response values — path params like \`id = 'id'\` → \`skyramp.get_response_value(prev_response, "id")\`, and hardcoded IDs in request bodies → dynamic values from prior responses.
165
+ 2. **Enhance assertions**: for integration tests and contract provider tests, follow the assertion enhancement instructions returned in the tool output. Add response body assertions for every request. This step is MANDATORY do NOT skip it even if chaining is already correct.
166
+ Do not make any other changes to the generated test file.
167
167
 
168
168
  After all actions, execute ONLY the test files you created (ADD), regenerated (REGENERATE),
169
169
  or edited (UPDATE). Do NOT execute VERIFY'd tests — they are unaffected by the diff and do not
@@ -191,7 +191,7 @@ Call \`skyramp_submit_report\` with \`summaryOutputFile\`: "${summaryOutputFile}
191
191
  VERIFY: note that the test was verified as unaffected by the diff — no file changes made.
192
192
  Do NOT include files that were newly created in this run (those go in \`newTestsCreated\`).
193
193
 
194
- **additionalRecommendations** — items you could not act on (quota exceeded, missing traces, etc.):
194
+ **additionalRecommendations** — remaining recommendations from the ranked list (MUST contain EXACTLY ${maxRecommendations - maxGenerate} items):
195
195
  \`testId\` (human-readable kebab-case, e.g. \`integration-products-orders-workflow\`), \`testType\`, \`category\`, \`scenarioName\`, \`priority\` (high/medium/low — used for sorting, not displayed), \`description\`, \`steps\`, \`reasoning\`
196
196
  Keep each \`description\` to one sentence. Omit \`requestBody\` and \`responseBody\` from steps.
197
197
  Include at most 3 steps per recommendation.
@@ -156,6 +156,8 @@ export async function parseTraceFile(filePath) {
156
156
  return { entries, userFlows, format };
157
157
  }
158
158
  const SKIP_DIRS = new Set(["node_modules", ".git", "dist", "build", ".next", ".nuxt", "coverage", "__pycache__", ".venv", "venv"]);
159
+ /** Known test-artifact directories where testbot-generated traces are written. */
160
+ const TRACE_SCAN_DIRS = [".skyramp", "tests", "test", "e2e", "playwright"];
159
161
  /**
160
162
  * Recursively scan a directory for files matching a predicate, up to maxDepth levels.
161
163
  */
@@ -180,6 +182,22 @@ function scanDir(dir, predicate, maxDepth, results) {
180
182
  }
181
183
  }
182
184
  }
185
+ /**
186
+ * Scan only the repository root and known test-artifact directories for files matching the predicate.
187
+ * Files directly in the root are checked with maxDepth 0; each existing TRACE_SCAN_DIRS subdir is
188
+ * scanned recursively with maxDepth 5. This prevents picking up committed demo assets (e.g. frontend/public/traces/).
189
+ */
190
+ function scanTraceArtifactDirs(repositoryPath, predicate, results) {
191
+ // Root-level files only (depth 0)
192
+ scanDir(repositoryPath, predicate, 0, results);
193
+ // Named test-artifact subdirectories (recursive, maxDepth 5)
194
+ for (const dir of TRACE_SCAN_DIRS) {
195
+ const full = path.join(repositoryPath, dir);
196
+ if (fs.existsSync(full)) {
197
+ scanDir(full, predicate, 5, results);
198
+ }
199
+ }
200
+ }
183
201
  /**
184
202
  * Discover trace JSON files in a repository path.
185
203
  */
@@ -191,12 +209,12 @@ export function discoverTraceFiles(repositoryPath) {
191
209
  if (fs.existsSync(full))
192
210
  found.push(full);
193
211
  }
194
- // Recursive scan: any *trace*.json|har, but exclude scenario files and test output files
212
+ // Scan scoped to the repo root (non-recursive) and test-artifact dirs (recursive): any *trace*.json|har, excluding scenario/test output files
195
213
  const isTraceJson = (name) => /\.(json|har)$/i.test(name) &&
196
214
  /trace/i.test(name) &&
197
215
  !/^scenario_/i.test(name) &&
198
216
  !/_test\.(json|har)$/i.test(name);
199
- scanDir(repositoryPath, isTraceJson, 5, found);
217
+ scanTraceArtifactDirs(repositoryPath, isTraceJson, found);
200
218
  // Deduplicate and sort for deterministic ordering
201
219
  return [...new Set(found)].sort();
202
220
  }
@@ -209,6 +227,6 @@ export function discoverPlaywrightZips(repositoryPath) {
209
227
  const isPlaywrightZip = (name) => /\.zip$/i.test(name) && (/playwright/i.test(name) ||
210
228
  /_trace\.zip$/i.test(name) ||
211
229
  name.toLowerCase() === "trace.zip");
212
- scanDir(repositoryPath, isPlaywrightZip, 5, found);
230
+ scanTraceArtifactDirs(repositoryPath, isPlaywrightZip, found);
213
231
  return [...new Set(found)].sort();
214
232
  }
@@ -0,0 +1,140 @@
1
+ /**
2
+ * Unit tests for trace-parser.ts — specifically the scanTraceArtifactDirs scoping
3
+ * introduced to prevent demo/fixture files (e.g. frontend/public/traces/) from being
4
+ * misidentified as testbot-generated traces.
5
+ */
6
+ import * as fs from "fs";
7
+ import * as os from "os";
8
+ import * as path from "path";
9
+ import { discoverTraceFiles, discoverPlaywrightZips } from "./trace-parser.js";
10
+ // ---------------------------------------------------------------------------
11
+ // Helpers
12
+ // ---------------------------------------------------------------------------
13
+ function mkdirp(dir) {
14
+ fs.mkdirSync(dir, { recursive: true });
15
+ }
16
+ function touch(file) {
17
+ mkdirp(path.dirname(file));
18
+ fs.writeFileSync(file, "");
19
+ }
20
+ function withTempRepo(fn) {
21
+ const dir = fs.mkdtempSync(path.join(os.tmpdir(), "trace-parser-test-"));
22
+ try {
23
+ fn(dir);
24
+ }
25
+ finally {
26
+ fs.rmSync(dir, { recursive: true, force: true });
27
+ }
28
+ }
29
+ // ---------------------------------------------------------------------------
30
+ // discoverPlaywrightZips — scoping tests
31
+ // ---------------------------------------------------------------------------
32
+ describe("discoverPlaywrightZips — scanTraceArtifactDirs scoping", () => {
33
+ it("does NOT discover playwright zip in frontend/public/traces/ (demo fixture dir)", () => {
34
+ withTempRepo(repo => {
35
+ touch(path.join(repo, "frontend", "public", "traces", "ui_test_playwright.zip"));
36
+ expect(discoverPlaywrightZips(repo)).toEqual([]);
37
+ });
38
+ });
39
+ it("discovers playwright zip in tests/ (test-artifact dir)", () => {
40
+ withTempRepo(repo => {
41
+ const zip = path.join(repo, "tests", "ui_test_playwright.zip");
42
+ touch(zip);
43
+ expect(discoverPlaywrightZips(repo)).toContain(zip);
44
+ });
45
+ });
46
+ it("discovers playwright zip in .skyramp/ (test-artifact dir)", () => {
47
+ withTempRepo(repo => {
48
+ const zip = path.join(repo, ".skyramp", "recording_playwright.zip");
49
+ touch(zip);
50
+ expect(discoverPlaywrightZips(repo)).toContain(zip);
51
+ });
52
+ });
53
+ it("discovers playwright zip in e2e/ (test-artifact dir)", () => {
54
+ withTempRepo(repo => {
55
+ const zip = path.join(repo, "e2e", "flow_playwright.zip");
56
+ touch(zip);
57
+ expect(discoverPlaywrightZips(repo)).toContain(zip);
58
+ });
59
+ });
60
+ it("discovers playwright zip in playwright/ (test-artifact dir)", () => {
61
+ withTempRepo(repo => {
62
+ const zip = path.join(repo, "playwright", "trace.zip");
63
+ touch(zip);
64
+ expect(discoverPlaywrightZips(repo)).toContain(zip);
65
+ });
66
+ });
67
+ it("does NOT discover zip in src/ (not a test-artifact dir)", () => {
68
+ withTempRepo(repo => {
69
+ touch(path.join(repo, "src", "recordings", "ui_playwright.zip"));
70
+ expect(discoverPlaywrightZips(repo)).toEqual([]);
71
+ });
72
+ });
73
+ it("does NOT discover zip in deeply nested non-test dir", () => {
74
+ withTempRepo(repo => {
75
+ touch(path.join(repo, "frontend", "src", "assets", "demo_playwright.zip"));
76
+ expect(discoverPlaywrightZips(repo)).toEqual([]);
77
+ });
78
+ });
79
+ });
80
+ // ---------------------------------------------------------------------------
81
+ // discoverTraceFiles — scoping tests
82
+ // ---------------------------------------------------------------------------
83
+ describe("discoverTraceFiles — scanTraceArtifactDirs scoping", () => {
84
+ it("does NOT discover trace.json nested under frontend/public/traces/", () => {
85
+ withTempRepo(repo => {
86
+ touch(path.join(repo, "frontend", "public", "traces", "backend_trace.json"));
87
+ const found = discoverTraceFiles(repo);
88
+ // fixed-name root candidates don't match "backend_trace.json", and scan won't reach frontend/
89
+ expect(found.some(f => f.includes("frontend"))).toBe(false);
90
+ });
91
+ });
92
+ it("discovers trace.json in tests/ dir", () => {
93
+ withTempRepo(repo => {
94
+ const f = path.join(repo, "tests", "backend_trace.json");
95
+ touch(f);
96
+ expect(discoverTraceFiles(repo)).toContain(f);
97
+ });
98
+ });
99
+ it("discovers trace.json in .skyramp/ dir", () => {
100
+ withTempRepo(repo => {
101
+ const f = path.join(repo, ".skyramp", "skyramp_trace.json");
102
+ touch(f);
103
+ expect(discoverTraceFiles(repo)).toContain(f);
104
+ });
105
+ });
106
+ it("discovers root-level trace.json", () => {
107
+ withTempRepo(repo => {
108
+ const f = path.join(repo, "trace.json");
109
+ touch(f);
110
+ expect(discoverTraceFiles(repo)).toContain(f);
111
+ });
112
+ });
113
+ it("discovers root-level skyramp_traces.json via fixed-name check", () => {
114
+ withTempRepo(repo => {
115
+ const f = path.join(repo, "skyramp_traces.json");
116
+ touch(f);
117
+ expect(discoverTraceFiles(repo)).toContain(f);
118
+ });
119
+ });
120
+ it("does NOT discover scenario_ json files (excluded by predicate)", () => {
121
+ withTempRepo(repo => {
122
+ touch(path.join(repo, "tests", "scenario_orders_trace.json"));
123
+ expect(discoverTraceFiles(repo)).toEqual([]);
124
+ });
125
+ });
126
+ it("does NOT discover _test.json files (excluded by predicate)", () => {
127
+ withTempRepo(repo => {
128
+ touch(path.join(repo, "tests", "orders_trace_test.json"));
129
+ expect(discoverTraceFiles(repo)).toEqual([]);
130
+ });
131
+ });
132
+ it("results are deduplicated when fixed-name and scan both find the same root file", () => {
133
+ withTempRepo(repo => {
134
+ const f = path.join(repo, "trace.json");
135
+ touch(f);
136
+ const found = discoverTraceFiles(repo);
137
+ expect(found.filter(x => x === f)).toHaveLength(1);
138
+ });
139
+ });
140
+ });
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@skyramp/mcp",
3
- "version": "0.0.64-rc.4",
3
+ "version": "0.0.64-rc.6",
4
4
  "main": "build/index.js",
5
5
  "type": "module",
6
6
  "bin": {