npm - @skyramp/mcp - Versions diffs - 0.0.64-rc.9 → 0.0.64 - Mend

@skyramp/mcp 0.0.64-rc.9 → 0.0.64

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27)

package/build/index.js +2 -0
package/build/prompts/test-maintenance/drift-analysis-prompt.js +26 -7
package/build/prompts/test-maintenance/driftAnalysisSections.js +96 -34
package/build/prompts/test-maintenance/enhanceAssertionSection.js +99 -0
package/build/prompts/test-recommendation/recommendationSections.js +24 -9
package/build/prompts/test-recommendation/test-recommendation-prompt.js +96 -27
package/build/prompts/test-recommendation/test-recommendation-prompt.test.js +239 -2
package/build/prompts/testbot/testbot-prompts.js +182 -125
package/build/services/TestDiscoveryService.js +23 -0
package/build/services/TestExecutionService.js +1 -1
package/build/services/TestGenerationService.js +83 -12
package/build/services/TestGenerationService.test.js +111 -2
package/build/tool-phase-coverage.test.js +8 -2
package/build/tool-phases.js +11 -13
package/build/tools/generate-tests/generateBatchScenarioRestTool.js +203 -0
package/build/tools/generate-tests/generateContractRestTool.js +3 -73
package/build/tools/generate-tests/generateIntegrationRestTool.js +11 -61
package/build/tools/submitReportTool.js +11 -3
package/build/tools/submitReportTool.test.js +1 -1
package/build/tools/test-management/analyzeChangesTool.js +14 -4
package/build/types/RepositoryAnalysis.js +1 -0
package/build/utils/scenarioDrafting.js +121 -11
package/build/utils/scenarioDrafting.test.js +266 -3
package/node_modules/playwright/ThirdPartyNotices.txt +679 -3093
package/node_modules/playwright/lib/mcp/skyramp/traceRecordingBackend.js +117 -11
package/package.json +2 -2
package/build/tools/test-recommendation/recommendTestsTool.js +0 -274

package/build/index.js CHANGED

@@ -23,6 +23,7 @@ import { registerRecommendTestsPrompt } from "./prompts/test-recommendation/regi
 import { registerModularizationTool } from "./tools/code-refactor/modularizationTool.js";
 import { registerCodeReuseTool } from "./tools/code-refactor/codeReuseTool.js";
 import { registerScenarioTestTool } from "./tools/generate-tests/generateScenarioRestTool.js";
+import { registerBatchScenarioTestTool } from "./tools/generate-tests/generateBatchScenarioRestTool.js";
 import { registerMockTool } from "./tools/generate-tests/generateMockRestTool.js";
 import { registerAnalyzeChangesTool, registerAnalyzeTestHealthTool, registerExecuteTestsTool, registerActionsTool, registerStateCleanupTool, } from "./tools/test-management/index.js";
 import { registerTestbotPrompt, registerTestbotResource, } from "./prompts/testbot/testbot-prompts.js";
@@ -218,6 +219,7 @@ const infrastructureTools = [
 ];
 if (process.env.SKYRAMP_FEATURE_TESTBOT === "1") {
     infrastructureTools.push(registerSubmitReportTool);
+    registerBatchScenarioTestTool(server);
     logger.info("TestBot tools enabled via SKYRAMP_FEATURE_TESTBOT");
 }
 infrastructureTools.forEach((registerTool) => registerTool(server));

package/build/prompts/test-maintenance/drift-analysis-prompt.js CHANGED

@@ -1,14 +1,15 @@
-import { buildDriftScoringGuide, buildActionDecisionMatrix, buildBreakingChangePatterns, buildTestAssessmentGuidelines, buildAddRecommendationGuidelines, buildDriftOutputChecklist, } from "./driftAnalysisSections.js";
+import { buildDriftScoringGuide, buildActionDecisionMatrix, buildBreakingChangePatterns, buildTestAssessmentGuidelines, buildAddRecommendationGuidelines, buildDriftOutputChecklist, buildUpdateExecutionRules, } from "./driftAnalysisSections.js";
 export function buildDriftAnalysisPrompt(params) {
     const { existingTests, parsedDiff, scannedEndpoints, repositoryPath, stateFile } = params;
+    const inlineMode = !stateFile;
     // Detect new endpoints count from parsedDiff
     let newEndpointCount = 0;
     let diffSection = "";
     if (parsedDiff) {
         const lines = parsedDiff.split("\n");
-        const epMatches = parsedDiff.match(/(?:^|\n)\*\*(GET|POST|PUT|PATCH|DELETE)\s+[^\*]+\*\*/gm);
-        if (epMatches)
-            newEndpointCount = epMatches.length;
+        const newEndpointMatch = parsedDiff.match(/\*\*New Endpoints\*\*\s+\((\d+)\)/);
+        if (newEndpointMatch)
+            newEndpointCount = parseInt(newEndpointMatch[1], 10);
         diffSection = `## Branch Diff
 \`\`\`
 ${lines.slice(0, 200).join("\n")}
@@ -32,7 +33,11 @@ No existing Skyramp tests found in repository.
 ${scannedEndpoints.map((ep) => `- ${Array.isArray(ep.methods) ? ep.methods.join("|") : ep.method} ${ep.path}`).join("\n")}
 `
         : "";
-    return `# Test Health Analysis
+    // In inline mode (testbot), skip the context header — existing tests and diff
+    // are provided by skyramp_analyze_changes at runtime, not at prompt-build time.
+    const contextSection = inlineMode
+        ? ""
+        : `# Test Health Analysis
 **Repository**: \`${repositoryPath}\`
 **Existing tests**: ${existingTests.length}
@@ -40,7 +45,19 @@ ${scannedEndpoints.map((ep) => `- ${Array.isArray(ep.methods) ? ep.methods.join(
 ${diffSection}
 ${testListSection}
-${scannedSection}
+${scannedSection}`;
+    if (inlineMode) {
+        // Testbot inline mode: all maintenance logic lives here so the testbot
+        // prompt only orchestrates steps without duplicating rules.
+        return `${buildActionDecisionMatrix()}
+${buildUpdateExecutionRules()}
+${buildDriftOutputChecklist(existingTests.length, newEndpointCount, inlineMode)}
+**Be brief.** Score each test, decide the action, and apply edits immediately. Do NOT write detailed analysis for IGNORE'd tests.`;
+    }
+    return `${contextSection}
 ${buildDriftScoringGuide()}
 ${buildActionDecisionMatrix()}
@@ -49,9 +66,11 @@ ${buildBreakingChangePatterns()}
 ${buildTestAssessmentGuidelines()}
+${buildUpdateExecutionRules()}
 ${buildAddRecommendationGuidelines()}
-${buildDriftOutputChecklist(existingTests.length, newEndpointCount)}
+${buildDriftOutputChecklist(existingTests.length, newEndpointCount, inlineMode)}
 After completing the assessment above, call \`skyramp_actions\` with \`stateFile: "${stateFile}"\`

package/build/prompts/test-maintenance/driftAnalysisSections.js CHANGED

@@ -2,6 +2,7 @@
  * Modular section builders for the Drift Analysis prompt,
  * mirroring the recommendationSections.ts pattern.
  */
+import { ENHANCE_ASSERTIONS_FOR_INTEGRATION_AND_CONTRACTPROVIDER } from "./enhanceAssertionSection.js";
 export function buildDriftScoringGuide() {
     return `## Drift Score Guide (0–100)
@@ -11,6 +12,9 @@ export function buildDriftScoringGuide() {
 | 21–40 | VERIFY | Minor changes detected — review but likely fine |
 | 41–70 | UPDATE | Breaking changes detected — test needs edits |
 | 71–100 | REGENERATE | Major structural changes — regenerate from scratch |
+| 80–100 | DELETE | ALL endpoints the test covers were removed — test is obsolete |
+DELETE and REGENERATE overlap in the 80–100 range. The distinction is cause, not score: DELETE when the endpoints no longer exist, REGENERATE when they still exist but changed drastically.
 Assign each existing test a score based on how much the codebase has changed relative to what the test expects.`;
 }
@@ -24,7 +28,8 @@ For each test, choose one of:
 | **IGNORE** | Drift score 0–20; no breaking changes AND no additive field gaps detected |
 | **VERIFY** | Drift score 21–40; minor changes, manual review recommended |
 | **UPDATE** | Drift score 25–70; breaking changes OR additive fields added to a covered endpoint (new response field the test doesn't assert) |
-| **REGENERATE** | Drift score 71–100; endpoint removed, major restructuring, or test is fundamentally broken |
+| **REGENERATE** | Drift score 71–100; major restructuring or test is fundamentally broken |
+| **DELETE** | Drift score 80–100; ALL endpoints the test covers were removed from the codebase |
 | **ADD** | New endpoint detected in diff that has no corresponding test yet |
 Rules:
@@ -68,76 +73,130 @@ These do NOT break existing assertions but leave the new field untested. Always
 - New key added inside an existing dict/object returned by the endpoint`;
 }
 export function buildTestAssessmentGuidelines() {
-    return `## Per-Test Assessment (4 Steps)
+    return `## Per-Test Assessment (4 Checks)
-For each existing test file, follow these steps:
+For each existing test file, run these checks:
-### Step 1: Check endpoint existence
+### Check A: Endpoint existence
 Does the endpoint the test targets still exist in the codebase?
-- If the endpoint path/method is no longer present → score 80+, action: REGENERATE
+- If ALL endpoints the test covers were removed → score 80+, action: DELETE (the entire test file is obsolete)
+- If SOME methods were removed but others remain → score 50–70, action: UPDATE (remove the test functions for deleted methods, keep the rest)
 - If the endpoint was renamed → score 50–70, action: UPDATE (path substitution)
-### Step 2: Check request/response shape (breaking changes)
+### Check B: Request/response shape (breaking changes)
 Has the request body or response structure changed in a way that breaks the test?
 - Compare test's expected fields against current schema/model definitions
 - Type changes (string→int, int→string) → score 60+, action: UPDATE or REGENERATE
 - New required fields the test doesn't send → score 50+, action: UPDATE
 - Response fields the test asserts on have been removed → score 50+, action: UPDATE
-### Step 2b: Check additive response field changes (coverage gaps)
+### Check B2: Additive response field changes (coverage gaps)
 **Even if existing assertions still pass**, does the diff add a new field to the response of an endpoint this test already covers?
 - Look at the diff for lines like \`+ "newField":\` or \`+ newField =\` inside a view/serializer this test hits
 - If YES → score 30, action: UPDATE — add an assertion for the new field (e.g. \`assert "newField" in response_body\` or \`assert response_body["newField"] >= 0\`)
 - This applies even when the test only checks status codes — the test should be extended to cover the new field
 - **Do NOT score IGNORE if a new response field was added to a covered endpoint**
-### Step 3: Check auth changes
+### Check C: Auth changes
 Has the authentication mechanism for this endpoint changed?
 - Auth added where none existed → score 40+, action: UPDATE
 - Auth method changed (bearer→cookie) → score 50+, action: UPDATE
 - Auth removed → score 30+, action: VERIFY or UPDATE
-### Step 4: Assign score and action
-Based on the above, assign a final drift score 0–100 and choose the action (IGNORE / VERIFY / UPDATE / REGENERATE).
+### Check D: Assign score and action
+Based on the above, assign a final drift score 0–100 and choose the action (IGNORE / VERIFY / UPDATE / REGENERATE / DELETE).
 Provide a 1-2 sentence rationale.
-- If Step 2b flagged an additive field → score must be ≥ 30 and action must be UPDATE, even if Steps 2/3 found no breaking changes.`;
+- If Check B2 flagged an additive field → score must be ≥ 30 and action must be UPDATE, even if Checks B/C found no breaking changes.`;
 }
 export function buildAddRecommendationGuidelines() {
-    return `## ADD Recommendations for New Endpoints
+    return `## ADD — New Tests for New Endpoints
+**Only ADD when:**
+- The diff introduces a brand-new route that has **no existing test coverage at all**, OR
+- The diff introduces a new auth path, error branch, or fundamentally separate scenario that no existing test covers.
-For each new endpoint detected in the diff (not yet covered by any existing test):
+**Never ADD when:**
+- The resource already has existing tests and the diff only adds a new HTTP method — UPDATE those files instead.
+- The endpoint existed before this diff but lacks tests — that is a pre-existing coverage gap; log it in \`additionalRecommendations\`, do NOT add a test now.
-### Test type priority by HTTP method
+**Test type priority by HTTP method:**
 | Method | Recommended test types |
 |--------|----------------------|
 | POST / PUT / PATCH | integration, contract |
 | GET | contract, smoke |
 | DELETE | integration, smoke |
-### ADD recommendation format
-For each new endpoint, include:
-1. The endpoint path and method
-2. The recommended test types (from the table above)
-3. The Skyramp tool to call (e.g., \`skyramp_contract_test_generation\`, \`skyramp_integration_test_generation\`)
-4. The \`endpointURL\` to use (combine base URL + path)
-5. The language/framework to use (from workspace config or project metadata)`;
+Use a unique descriptive filename for every new test file. Do NOT create a new contract or integration test file for a resource that already has existing tests — use UPDATE instead.`;
 }
-export function buildDriftOutputChecklist(existingTestCount, newEndpointCount) {
-    return `## Output Checklist
+export function buildUpdateExecutionRules() {
+    return `## Update Execution Rules
+When applying UPDATE actions to existing test files, follow these rules in addition to the drift-detected changes:
+### Test file ordering (CRITICAL)
+Place mutation test functions (PATCH, PUT, POST) **before** any DELETE test function targeting the same resource. DELETE removes the resource — any mutation call after it will 404. When inserting a new mutation test, place it above the DELETE function and above the DELETE call in the \`if __name__ == "__main__"\` block (or equivalent runner entrypoint).
+### Happy path first (CRITICAL)
+When adding a new HTTP method (PUT, PATCH, POST) to an existing test file, always add the happy path (2xx success) assertion first. Do NOT add only error-path tests (404, 422) for the new method — error cases may follow, but the 2xx case is mandatory.
+### All test files for a resource (CRITICAL)
+When a diff adds a new HTTP method to a resource, UPDATE covers **all** existing test files for that resource — contract, integration, and UI. Scan the actual test directory on disk to find all files covering the same resource path; do not rely solely on what the analyze tool reports.
+### PATCH/PUT with child collections (MANDATORY)
+When updating a contract or integration test for a PATCH or PUT endpoint whose request/response includes a child collection array (e.g. \`items\`, \`products\`, \`line_items\`):
+1. The request body MUST include the child array with at least one item containing the FK field (e.g. \`product_id\`) and a \`quantity\` field.
+2. Assert each item's FK field and \`quantity\` match the sent values.
+3. Assert the top-level computed total (e.g. \`total_amount\`) equals the expected math from the items.
+A test that only sends/asserts metadata (discount, status, notes) without asserting the items array is INCOMPLETE and will produce false passes even when the items/total logic is broken.
+### REGENERATE
+Call the appropriate generation tool to replace the existing test from scratch. Use the same filename so it overwrites the old file.
+### DELETE
+Remove the test file when ALL endpoints it covers were removed from the codebase. If only SOME methods were removed, use UPDATE instead — remove the test functions for deleted methods and keep the rest.
+### Test data isolation (MANDATORY)
+Never use hardcoded resource IDs (e.g. \`order_id=1\`) in any test step, including GET or DELETE steps. Always create required resources via prior POST steps and chain IDs dynamically. Use timestamp-based unique names for created resources (e.g. \`"Product-\${int(time.time())}"\`) to prevent collisions across test runs.
-Complete ALL of the following before calling skyramp_actions:
+### Enhance assertions after UPDATE (MANDATORY)
+Apply to **new test functions you are adding** and **existing functions that cover endpoints changed in the diff** only. Do NOT touch existing functions for endpoints unrelated to the diff.
-### Existing tests (${existingTestCount} total)
-For EACH existing test, output:
+${ENHANCE_ASSERTIONS_FOR_INTEGRATION_AND_CONTRACTPROVIDER}`;
+}
+export function buildDriftOutputChecklist(existingTestCount, newEndpointCount, inlineMode = false) {
+    const finalStep = inlineMode
+        ? `### Final step
+Apply all maintenance actions (UPDATE / REGENERATE / DELETE) directly by editing the test files. New test generation (ADD) is handled separately in the next step.`
+        : `### Final step
+After completing all assessments above, call \`skyramp_actions\` with the stateFile to execute the recommended changes.`;
+    // In inline mode, existing test counts are unknown at prompt-build time —
+    // they come from skyramp_analyze_changes at runtime. Skip the count headers.
+    const existingTestSection = inlineMode
+        ? `### Existing tests
+For each existing test reported by \`skyramp_analyze_changes\`:
+- **IGNORE/VERIFY tests**: list on a single line: \`<testFile> — IGNORE\` or \`<testFile> — VERIFY (score <N>)\`. Do NOT write detailed rationale.
+- **UPDATE/REGENERATE/DELETE tests**: output the full block:
 \`\`\`
 Test: <testFile>
 Drift Score: <0-100>
-Action: <IGNORE | VERIFY | UPDATE | REGENERATE>
+Action: <UPDATE | REGENERATE | DELETE>
 Rationale: <1-2 sentence explanation>
 \`\`\`
-${newEndpointCount > 0
-        ? `### New endpoints (${newEndpointCount} detected)
+Focus your analysis on tests that need action — do not spend time analyzing unchanged tests.`
+        : `### Existing tests (${existingTestCount} total)
+For each existing test:
+- **IGNORE/VERIFY tests**: list on a single line: \`<testFile> — IGNORE\` or \`<testFile> — VERIFY (score <N>)\`. Do NOT write detailed rationale.
+- **UPDATE/REGENERATE/DELETE tests**: output the full block:
+\`\`\`
+Test: <testFile>
+Drift Score: <0-100>
+Action: <UPDATE | REGENERATE | DELETE>
+Rationale: <1-2 sentence explanation>
+\`\`\``;
+    const newEndpointSection = inlineMode
+        ? ""
+        : newEndpointCount > 0
+            ? `### New endpoints (${newEndpointCount} detected)
 For EACH new endpoint, output:
 \`\`\`
 Endpoint: <METHOD> <path>
@@ -145,9 +204,12 @@ Action: ADD
 Test types: <contract | integration | smoke | ...>
 Rationale: <1 sentence>
 \`\`\``
-        : `### New endpoints
-No new endpoints detected in this diff.`}
+            : `### New endpoints
+No new endpoints detected in this diff.`;
+    const sections = [existingTestSection, newEndpointSection, finalStep].filter(s => s.length > 0);
+    return `## Output Checklist
-### Final step
-After completing all assessments above, call \`skyramp_actions\` with the stateFile to execute the recommended changes.`;
+Complete ALL of the following:
+${sections.join("\n\n")}`;
 }

package/build/prompts/test-maintenance/enhanceAssertionSection.js ADDED

@@ -0,0 +1,99 @@
+export const ENHANCE_ASSERTIONS_FOR_INTEGRATION_AND_CONTRACTPROVIDER = `
+**Enhance assertions** — apply every rule below to every success-path test function that returns a response body (GET, POST, PATCH, PUT with 2xx) in every integration or contract-provider test file.
+Error-path functions (4xx/5xx) and no-body responses (e.g. DELETE 204) need only a status code assertion — do not add echo-back or computed field checks to those.
+---
+**IMPORTANT — How to access response body fields (use the SDK helpers, NOT dict/attribute access on the response variable):**
+- **Python**: \`skyramp.get_response_value(<response_var>, "<json_path>")\`
+  - e.g. \`skyramp.get_response_value(products_POST_response, "id")\`
+  - e.g. \`skyramp.get_response_value(orders_POST_response, "items.0.product_id")\`
+- **TypeScript (Playwright)**: \`getResponseValue(<response_var>, "<json_path>")\` (already imported from \`@skyramp/skyramp\`)
+  - e.g. \`getResponseValue(productsPostResponse, "id")\`
+- **JavaScript (Playwright)**: \`getResponseValue(<response_var>, "<json_path>")\` (already imported from \`@skyramp/skyramp\`)
+  - e.g. \`getResponseValue(productsPostResponse, "id")\`
+- **Java**: \`getValue(<response_var>, "<json_path>")\` (already imported)
+  - e.g. \`getValue(productsPostResponse, "id")\`
+---
+**What to assert after each request:**
+1. **Non-null / non-empty fields** — Assert that key identifying fields are present and non-empty:
+   - IDs, names, emails, and other primary fields must not be null/None/empty.
+   - Python: \`assert skyramp.get_response_value(products_POST_response, "id") is not None\`
+   - TypeScript: \`expect(getResponseValue(productsPostResponse, "id"), 'id').not.toBeNull();\`
+   - JavaScript: \`assert.notStrictEqual(getResponseValue(productsPostResponse, "id"), null, 'id should not be null');\`
+   - Java: \`assertNotNull(getValue(productsPostResponse, "id"));\`
+2. **Echo-back values (exact sent value)** — For fields returned unchanged from the request body (e.g. \`customer_email\`, \`status\`, \`discount_type\`, \`discount_value\`), assert the exact sent value. Using \`is not None\` is only allowed when the value is genuinely unknown (e.g. server-generated timestamps or IDs). This rule does NOT apply to computed fields (e.g. \`total_amount\`, \`discount_amount\`) — those are covered below.
+   - Python: \`assert skyramp.get_response_value(products_POST_response, "name") == "Skyramp Tester"\`
+   - TypeScript: \`expect(getResponseValue(productsPostResponse, "name"), 'name').toBe("Skyramp Tester");\`
+   - JavaScript: \`assert.strictEqual(getResponseValue(productsPostResponse, "name"), "Skyramp Tester", 'name should match request');\`
+   - Java: \`assertEquals("Skyramp Tester", getValue(productsPostResponse, "name"));\`
+3. **Chained values**:
+   - *Integration tests*: chaining is fundamental — POST creates a resource, GET/PATCH assert that the chained ID echoes back (e.g. GET response \`id\` == POST response \`id\`).
+     - Python: \`assert skyramp.get_response_value(product_GET_response, "id") == skyramp.get_response_value(products_POST_response, "id")\`
+     - TypeScript: \`expect(getResponseValue(productGetResponse, "id"), 'id').toBe(getResponseValue(productsPostResponse, "id"));\`
+   - *Contract-provider tests*: do NOT reference \`beforeAll\` provisioning data — the setup response that creates the resource is \`beforeAll\`, so chaining from it into test-function assertions is prohibited. Assert \`is not None\` for server-generated IDs and use inline request body literals for everything else.
+4. **Value ranges** — For numeric fields where a realistic range is inferable from the field name or domain:
+   - Python: \`assert skyramp.get_response_value(products_POST_response, "price") >= 0\`
+   - TypeScript: \`expect(getResponseValue(productsPostResponse, "price")).toBeGreaterThanOrEqual(0);\`
+   - JavaScript: \`assert.ok(getResponseValue(productsPostResponse, "price") >= 0, 'price should be non-negative');\`
+5. **Specific known values** — For enum/status fields where only one outcome is valid for this flow:
+   - Python: \`assert skyramp.get_response_value(orders_POST_response, "status") == "pending"\`
+   - TypeScript: \`expect(getResponseValue(ordersPostResponse, "status"), 'status').toBe("pending");\`
+   - JavaScript: \`assert.strictEqual(getResponseValue(ordersPostResponse, "status"), "pending", 'status should be pending');\`
+   - Java: \`assertEquals("pending", getValue(ordersPostResponse, "status"));\`
+6. **Array/collection completeness** — Only assert indices that exist in the recorded response body — never infer array length from the request or scenario name. Use the \`expected_response_body\` as the source of truth for which indices to assert. For contract-provider tests, use inline request body values — do NOT reference \`beforeAll\` provisioning data.
+   - For each item at index N, assert \`product_id\`, \`quantity\`, and \`unit_price\` (integration: chain \`unit_price\` from prior product response; contract: use inline request body value).
+   - Assert that no additional item exists beyond the expected count by checking that the next index returns null/None.
+   - Python:
+     \`assert skyramp.get_response_value(patch_response, "items.0.product_id") == skyramp.get_response_value(product_POST_response, "product_id")\`
+     \`assert skyramp.get_response_value(patch_response, "items.0.quantity") == <quantity_sent_in_patch_body>\`
+     \`assert skyramp.get_response_value(patch_response, "items.1.product_id") is None  # only 1 item was sent\`
+   - TypeScript:
+     \`expect(getResponseValue(patchResponse, "items.0.product_id")).toBe(getResponseValue(productPostResponse, "product_id"));\`
+     \`expect(getResponseValue(patchResponse, "items.0.quantity")).toBe(<quantity_sent_in_patch_body>);\`
+     \`expect(getResponseValue(patchResponse, "items.1.product_id")).toBeNull();\`
+   - JavaScript:
+     \`assert.strictEqual(getResponseValue(patchResponse, "items.0.product_id"), getResponseValue(productPostResponse, "product_id"), 'product_id should match');\`
+     \`assert.strictEqual(getResponseValue(patchResponse, "items.0.quantity"), <quantity_sent_in_patch_body>, 'quantity should match');\`
+     \`assert.strictEqual(getResponseValue(patchResponse, "items.1.product_id"), null, 'no second item expected');\`
+   - Java:
+     \`assertEquals(getValue(productPostResponse, "product_id"), getValue(patchResponse, "items.0.product_id"));\`
+     \`assertEquals(<quantity_sent_in_patch_body>, getValue(patchResponse, "items.0.quantity"));\`
+     \`assertNull(getValue(patchResponse, "items.1.product_id"));\`
+7. **Computed / derived numeric fields**:
+   - *Integration tests*: MUST derive the value dynamically from prior responses — NEVER hardcode a computed numeric value. Hardcoding \`== 2399.97\` is a violation.
+     - Total amount: \`assert skyramp.get_response_value(patch_response, "total_amount") == skyramp.get_response_value(product_POST_response, "price") * <quantity_sent_in_patch_body>\`
+     - Discount (percentage): \`assert skyramp.get_response_value(patch_response, "discount_amount") == skyramp.get_response_value(patch_response, "total_amount") * (skyramp.get_response_value(patch_response, "discount_value") / 100)\`
+   - *Contract-provider tests*: use the exact pre-computed value from \`expected_response_body\` directly (e.g. \`assert get_response_value(response, "total_amount") == 19.98\`). All inputs must come from the inline request body or the response — do NOT reference \`beforeAll\` provisioning data.
+     - Discount (percentage): \`assert skyramp.get_response_value(patch_response, "discount_amount") == skyramp.get_response_value(patch_response, "total_amount") * (skyramp.get_response_value(patch_response, "discount_value") / 100)\`
+8. **Read steps** — re-assert chained and computed fields — do not reduce to null-checks only.
+9. **Parity** — every assertion derivable from the request body or response (non-null, echo-back, value ranges, computed) must appear in both the contract test and the integration test independently.
+---
+**Scope rules:**
+- *Integration tests*: apply to every \`send_request\` / \`sendRequest\` call that returns a body.
+- *Contract-provider tests*: only modify test functions — do NOT touch \`beforeAll\`, \`afterAll\`, or any setup/teardown helper.
+- Only add assertions clearly supported by the request body, prior response values, field names, or codebase evidence. Do not invent constraints.
+- Add new assertions immediately after the existing status-code assertion — do not move or remove anything.
+**What NOT to do — any of these is a violation:**
+- Do NOT access response fields via dict syntax (\`response["field"]\`) or attribute access (\`response.field\`) — always use the SDK helper (\`get_response_value\` / \`getResponseValue\` / \`getValue\`).
+- Do NOT remove or modify existing assertions.
+- Do NOT add assertions for fields where no constraint is clearly inferable.
+- Do NOT restructure, reformat, or reorder any existing code.
+- Do NOT add comments or docstrings.
+- Do NOT change function signatures, imports, or variable names.
+`;

package/build/prompts/test-recommendation/recommendationSections.js CHANGED

@@ -44,6 +44,8 @@ export function buildTestPatternGuidelines() {
 - **Cascade deletes**: If deleting a parent removes children, verify cascade AND orphan prevention (delete product → orders referencing it get error or cascade)
 - **Race conditions**: If concurrent writes are possible (inventory deduction, counter increment), test concurrent requests
 - **Computed fields**: If response contains derived values (total, average, count), verify computation with known inputs (e.g., total_cost = compute_seconds * rate + memory_mb * rate + external_cost)
+- **Mutation with collection modification**: If PUT/PATCH endpoints accept arrays of child items (e.g., order line items, cart products, invoice entries), test adding/removing items and verify that derived totals (e.g., total_amount, subtotal, item_count) are recalculated correctly. This is the most common source of user-reported bugs — always prioritize it for GENERATE over simple field-update tests.
+    **CRITICAL**: The PATCH/PUT request body MUST include the child collection array field(s) defined for that endpoint (e.g., "items" with FK references like "product_id" and a quantity field) chained from prior POST responses. A PATCH that only sends metadata fields (e.g., discount_type, status, notes) without modifying the child collection is NOT a valid mutation-recalc test — it will pass even when the item/total logic is broken. Before writing assertions, inspect the source code or OpenAPI spec to identify (1) the actual child collection field name and its FK/quantity/price sub-fields, and (2) how derived totals are calculated (including any discounts, taxes, or fees). Then assert: the child FK fields match chained IDs, quantities match sent values, and totals match the computation from the source code
 - **Webhook/event side effects**: If endpoints trigger async operations, test that side effects occur (e.g., POST /orders triggers notification)
 - **Cross-user isolation**: If resources are owned by users, test that user B cannot access/modify user A's resources (GET /users/{other_id}/data → 403 Forbidden)
 - **Range/boundary invariants**: If business rules cap values (max retries, min balance, discount ≤ subtotal), test the boundary (e.g., set retries to max+1 → expect rejection)
@@ -56,6 +58,16 @@ export function buildTestQualityCriteria() {
 that step B depends on (e.g., create product → create order referencing that product's ID →
 verify order contains correct product). Single-resource CRUD alone is not an integration test.
 Use realistic request bodies from source code schemas and verify response data, not just status codes.
+When a PUT/PATCH updates a resource with child collections (e.g., order items), the request body
+MUST include the child array with FK references chained from prior steps — and assertions MUST
+verify the actual child items in the response (product_id, quantity, unit_price), not just
+top-level metadata like discount or status.
+**Contract tests** (single-step) are the right choice for: error-handling scenarios on a single
+endpoint (e.g., PATCH/GET/DELETE a nonexistent resource → 404, POST with invalid payload → 422),
+validation boundary checks, and any test that exercises one endpoint's API contract in isolation.
+Do NOT add setup steps just to avoid hardcoding an ID — use a realistic hardcoded nonexistent ID
+(e.g., 99999 or a random UUID) and keep it a single-step contract test.
 **E2E tests** should follow realistic user journeys end-to-end: browse products → search →
 add to cart → checkout. Verify that frontend actions trigger the correct API calls and
@@ -73,9 +85,8 @@ Choose based on what adds the most value for this PR's changes.
 **Contract test mode — signal-based selection:**
 - **Consumer contract** (\`consumerMode: true\`): Look for outbound HTTP client code (fetch, axios, httpx, requests, http.Client), service client classes, or calls to external base URLs. If an endpoint's implementation makes downstream calls, that downstream boundary is a consumer contract test candidate.
-- **Provider contract** (\`providerMode: true\`): Look for new or modified endpoint handlers, route changes, response shape modifications, or the presence of an OpenAPI spec. If the diff adds/changes an endpoint this service owns, that is a provider contract test candidate.
-- **Both modes**: When the service is simultaneously an API owner (upstream) AND a client of another service (downstream).
-- **Default (neither)**: Only when role is unclear or no spec is available.
+- **Provider contract** (\`providerMode: true\`): Look for new or modified endpoint handlers, route changes, or response shape modifications. If the diff adds/changes an endpoint this service owns, that is a provider contract test candidate.
+- **Both modes** (\`providerMode: true, consumerMode: true\`) — produces the same output as omitting both flags (generates provider and consumer contract tests). Use when the diff contains BOTH provider signals (new/modified endpoint handlers) AND consumer signals (outbound HTTP client calls to another service).
 **Scenario fidelity:** Every workflow scenario should reflect the actual resource
 relationships in the code. If the pre-drafted scenarios don't match the real data model,
@@ -169,9 +180,10 @@ To skip auth for unauthenticated endpoints, pass \`authHeader: ""\`.`;
 ${authHeaderLine}
 ${authGuidance}
-**For multi-endpoint workflows (integration tests) — Scenario → Integration pipeline:**
-1. Call \`skyramp_scenario_test_generation\` once per step: \`scenarioName\`, \`destination\`,
-   \`baseURL\`, \`method\`, \`path\`, \`requestBody\` OR \`queryParams\`, \`responseBody\`, \`${authCallParams}\`.
+**For multi-endpoint workflows (integration tests) — Batch Scenario → Integration pipeline:**
+1. Call \`skyramp_batch_scenario_test_generation\` with ALL steps in a single call: \`scenarioName\`, \`destination\`,
+   \`baseURL\`, \`${authCallParams}\`, and a \`steps\` array where each element has \`method\`, \`path\`, \`requestBody\` OR \`queryParams\`, \`responseBody\`, \`statusCode\`.
+   (Fallback: if the batch tool is unavailable, call \`skyramp_scenario_test_generation\` once per step.)
    \`statusCode\` is optional — defaults: POST→201, DELETE→204, GET/PUT/PATCH→200. Only override for non-standard codes.
    **OpenAPI spec is NOT required.** \`apiSchema\` is OPTIONAL — omit it if no spec exists.
    **CRITICAL — Query params vs request body:**
@@ -183,6 +195,10 @@ ${authGuidance}
    returned by the controller — e.g., \`id\`, \`ownerId\`, \`createdAt\`, included relations like \`collection\`, \`tags\`).
    Wrap in \`{"response": ...}\` if the API uses an envelope pattern. If omitted, a synthetic response is generated.
    Inspect the source code to determine the correct request AND response body shapes — avoid sending \`{}\`.
+   **CRITICAL for PATCH/PUT mutation-recalc scenarios:** The request body MUST include the child
+   collection array (e.g. \`"items": [{"product_id": <chained from prior POST>, "quantity": 2}]\`).
+   Never send a PATCH that only modifies metadata (discount, status) without also including the
+   items/products collection — such a test will not catch collection-level or total-recalculation bugs.
    Use unique names with timestamp suffix to avoid conflicts on re-runs.
    For GET/PUT/DELETE with path IDs, use a placeholder — chaining resolves the real ID.
 2. Produces a \`scenario_<name>.json\` in the same \`outputDir\` as the test files (not \`.skyramp/\`).
@@ -200,12 +216,11 @@ ${PATH_PARAM_UUID_GUIDANCE}
 **Contract test mode selection — set based on this service's role at the boundary:**
 - \`providerMode: true\` — this service IS the API; validates the implementation matches the spec.
-  Use for new or modified endpoints this codebase owns, especially when an OpenAPI spec is present.
+  Use for new or modified endpoints this codebase owns.
 - \`consumerMode: true\` — this service CALLS another API; validates outbound requests conform to the downstream contract.
   Use when the endpoint's implementation makes HTTP calls to external services (look for fetch/axios/httpx/http.Client/service clients).
   A request-aware mock stands in for the real downstream service — no live dependency needed.
-- Both — use when the service boundary is both a provider (owns an API) and a consumer (calls a downstream API).
-- Neither (default) — use only when the role is ambiguous or no spec is available.
+- **Both modes** (\`providerMode: true, consumerMode: true\`) — same output as omitting both flags. Generates both consumer and provider contract tests. Use when the diff contains BOTH provider signals (new/modified endpoint handlers) AND consumer signals (outbound HTTP client calls to another service).
 **For UI tests:**
 1. \`browser_navigate\` to the target URL (from workspace \`api.baseUrl\`)