@skyramp/mcp 0.0.64-rc.9 → 0.0.64

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. package/build/index.js +2 -0
  2. package/build/prompts/test-maintenance/drift-analysis-prompt.js +26 -7
  3. package/build/prompts/test-maintenance/driftAnalysisSections.js +96 -34
  4. package/build/prompts/test-maintenance/enhanceAssertionSection.js +99 -0
  5. package/build/prompts/test-recommendation/recommendationSections.js +24 -9
  6. package/build/prompts/test-recommendation/test-recommendation-prompt.js +96 -27
  7. package/build/prompts/test-recommendation/test-recommendation-prompt.test.js +239 -2
  8. package/build/prompts/testbot/testbot-prompts.js +182 -125
  9. package/build/services/TestDiscoveryService.js +23 -0
  10. package/build/services/TestExecutionService.js +1 -1
  11. package/build/services/TestGenerationService.js +83 -12
  12. package/build/services/TestGenerationService.test.js +111 -2
  13. package/build/tool-phase-coverage.test.js +8 -2
  14. package/build/tool-phases.js +11 -13
  15. package/build/tools/generate-tests/generateBatchScenarioRestTool.js +203 -0
  16. package/build/tools/generate-tests/generateContractRestTool.js +3 -73
  17. package/build/tools/generate-tests/generateIntegrationRestTool.js +11 -61
  18. package/build/tools/submitReportTool.js +11 -3
  19. package/build/tools/submitReportTool.test.js +1 -1
  20. package/build/tools/test-management/analyzeChangesTool.js +14 -4
  21. package/build/types/RepositoryAnalysis.js +1 -0
  22. package/build/utils/scenarioDrafting.js +121 -11
  23. package/build/utils/scenarioDrafting.test.js +266 -3
  24. package/node_modules/playwright/ThirdPartyNotices.txt +679 -3093
  25. package/node_modules/playwright/lib/mcp/skyramp/traceRecordingBackend.js +117 -11
  26. package/package.json +2 -2
  27. package/build/tools/test-recommendation/recommendTestsTool.js +0 -274
package/build/index.js CHANGED
@@ -23,6 +23,7 @@ import { registerRecommendTestsPrompt } from "./prompts/test-recommendation/regi
23
23
  import { registerModularizationTool } from "./tools/code-refactor/modularizationTool.js";
24
24
  import { registerCodeReuseTool } from "./tools/code-refactor/codeReuseTool.js";
25
25
  import { registerScenarioTestTool } from "./tools/generate-tests/generateScenarioRestTool.js";
26
+ import { registerBatchScenarioTestTool } from "./tools/generate-tests/generateBatchScenarioRestTool.js";
26
27
  import { registerMockTool } from "./tools/generate-tests/generateMockRestTool.js";
27
28
  import { registerAnalyzeChangesTool, registerAnalyzeTestHealthTool, registerExecuteTestsTool, registerActionsTool, registerStateCleanupTool, } from "./tools/test-management/index.js";
28
29
  import { registerTestbotPrompt, registerTestbotResource, } from "./prompts/testbot/testbot-prompts.js";
@@ -218,6 +219,7 @@ const infrastructureTools = [
218
219
  ];
219
220
  if (process.env.SKYRAMP_FEATURE_TESTBOT === "1") {
220
221
  infrastructureTools.push(registerSubmitReportTool);
222
+ registerBatchScenarioTestTool(server);
221
223
  logger.info("TestBot tools enabled via SKYRAMP_FEATURE_TESTBOT");
222
224
  }
223
225
  infrastructureTools.forEach((registerTool) => registerTool(server));
package/build/prompts/test-maintenance/drift-analysis-prompt.js CHANGED
@@ -1,14 +1,15 @@
1
- import { buildDriftScoringGuide, buildActionDecisionMatrix, buildBreakingChangePatterns, buildTestAssessmentGuidelines, buildAddRecommendationGuidelines, buildDriftOutputChecklist, } from "./driftAnalysisSections.js";
1
+ import { buildDriftScoringGuide, buildActionDecisionMatrix, buildBreakingChangePatterns, buildTestAssessmentGuidelines, buildAddRecommendationGuidelines, buildDriftOutputChecklist, buildUpdateExecutionRules, } from "./driftAnalysisSections.js";
2
2
  export function buildDriftAnalysisPrompt(params) {
3
3
  const { existingTests, parsedDiff, scannedEndpoints, repositoryPath, stateFile } = params;
4
+ const inlineMode = !stateFile;
4
5
  // Detect new endpoints count from parsedDiff
5
6
  let newEndpointCount = 0;
6
7
  let diffSection = "";
7
8
  if (parsedDiff) {
8
9
  const lines = parsedDiff.split("\n");
9
- const epMatches = parsedDiff.match(/(?:^|\n)\*\*(GET|POST|PUT|PATCH|DELETE)\s+[^\*]+\*\*/gm);
10
- if (epMatches)
11
- newEndpointCount = epMatches.length;
10
+ const newEndpointMatch = parsedDiff.match(/\*\*New Endpoints\*\*\s+\((\d+)\)/);
11
+ if (newEndpointMatch)
12
+ newEndpointCount = parseInt(newEndpointMatch[1], 10);
12
13
  diffSection = `## Branch Diff
13
14
  \`\`\`
14
15
  ${lines.slice(0, 200).join("\n")}
@@ -32,7 +33,11 @@ No existing Skyramp tests found in repository.
32
33
  ${scannedEndpoints.map((ep) => `- ${Array.isArray(ep.methods) ? ep.methods.join("|") : ep.method} ${ep.path}`).join("\n")}
33
34
  `
34
35
  : "";
35
- return `# Test Health Analysis
36
+ // In inline mode (testbot), skip the context header — existing tests and diff
37
+ // are provided by skyramp_analyze_changes at runtime, not at prompt-build time.
38
+ const contextSection = inlineMode
39
+ ? ""
40
+ : `# Test Health Analysis
36
41
 
37
42
  **Repository**: \`${repositoryPath}\`
38
43
  **Existing tests**: ${existingTests.length}
@@ -40,7 +45,19 @@ ${scannedEndpoints.map((ep) => `- ${Array.isArray(ep.methods) ? ep.methods.join(
40
45
 
41
46
  ${diffSection}
42
47
  ${testListSection}
43
- ${scannedSection}
48
+ ${scannedSection}`;
49
+ if (inlineMode) {
50
+ // Testbot inline mode: all maintenance logic lives here so the testbot
51
+ // prompt only orchestrates steps without duplicating rules.
52
+ return `${buildActionDecisionMatrix()}
53
+
54
+ ${buildUpdateExecutionRules()}
55
+
56
+ ${buildDriftOutputChecklist(existingTests.length, newEndpointCount, inlineMode)}
57
+
58
+ **Be brief.** Score each test, decide the action, and apply edits immediately. Do NOT write detailed analysis for IGNORE'd tests.`;
59
+ }
60
+ return `${contextSection}
44
61
  ${buildDriftScoringGuide()}
45
62
 
46
63
  ${buildActionDecisionMatrix()}
@@ -49,9 +66,11 @@ ${buildBreakingChangePatterns()}
49
66
 
50
67
  ${buildTestAssessmentGuidelines()}
51
68
 
69
+ ${buildUpdateExecutionRules()}
70
+
52
71
  ${buildAddRecommendationGuidelines()}
53
72
 
54
- ${buildDriftOutputChecklist(existingTests.length, newEndpointCount)}
73
+ ${buildDriftOutputChecklist(existingTests.length, newEndpointCount, inlineMode)}
55
74
 
56
75
  After completing the assessment above, call \`skyramp_actions\` with \`stateFile: "${stateFile}"\`
57
76
 
package/build/prompts/test-maintenance/driftAnalysisSections.js CHANGED
@@ -2,6 +2,7 @@
2
2
  * Modular section builders for the Drift Analysis prompt,
3
3
  * mirroring the recommendationSections.ts pattern.
4
4
  */
5
+ import { ENHANCE_ASSERTIONS_FOR_INTEGRATION_AND_CONTRACTPROVIDER } from "./enhanceAssertionSection.js";
5
6
  export function buildDriftScoringGuide() {
6
7
  return `## Drift Score Guide (0–100)
7
8
 
@@ -11,6 +12,9 @@ export function buildDriftScoringGuide() {
11
12
  | 21–40 | VERIFY | Minor changes detected — review but likely fine |
12
13
  | 41–70 | UPDATE | Breaking changes detected — test needs edits |
13
14
  | 71–100 | REGENERATE | Major structural changes — regenerate from scratch |
15
+ | 80–100 | DELETE | ALL endpoints the test covers were removed — test is obsolete |
16
+
17
+ DELETE and REGENERATE overlap in the 80–100 range. The distinction is cause, not score: DELETE when the endpoints no longer exist, REGENERATE when they still exist but changed drastically.
14
18
 
15
19
  Assign each existing test a score based on how much the codebase has changed relative to what the test expects.`;
16
20
  }
@@ -24,7 +28,8 @@ For each test, choose one of:
24
28
  | **IGNORE** | Drift score 0–20; no breaking changes AND no additive field gaps detected |
25
29
  | **VERIFY** | Drift score 21–40; minor changes, manual review recommended |
26
30
  | **UPDATE** | Drift score 25–70; breaking changes OR additive fields added to a covered endpoint (new response field the test doesn't assert) |
27
- | **REGENERATE** | Drift score 71–100; endpoint removed, major restructuring, or test is fundamentally broken |
31
+ | **REGENERATE** | Drift score 71–100; major restructuring or test is fundamentally broken |
32
+ | **DELETE** | Drift score 80–100; ALL endpoints the test covers were removed from the codebase |
28
33
  | **ADD** | New endpoint detected in diff that has no corresponding test yet |
29
34
 
30
35
  Rules:
@@ -68,76 +73,130 @@ These do NOT break existing assertions but leave the new field untested. Always
68
73
  - New key added inside an existing dict/object returned by the endpoint`;
69
74
  }
70
75
  export function buildTestAssessmentGuidelines() {
71
- return `## Per-Test Assessment (4 Steps)
76
+ return `## Per-Test Assessment (4 Checks)
72
77
 
73
- For each existing test file, follow these steps:
78
+ For each existing test file, run these checks:
74
79
 
75
- ### Step 1: Check endpoint existence
80
+ ### Check A: Endpoint existence
76
81
  Does the endpoint the test targets still exist in the codebase?
77
- - If the endpoint path/method is no longer present → score 80+, action: REGENERATE
82
+ - If ALL endpoints the test covers were removed → score 80+, action: DELETE (the entire test file is obsolete)
83
+ - If SOME methods were removed but others remain → score 50–70, action: UPDATE (remove the test functions for deleted methods, keep the rest)
78
84
  - If the endpoint was renamed → score 50–70, action: UPDATE (path substitution)
79
85
 
80
- ### Step 2: Check request/response shape (breaking changes)
86
+ ### Check B: Request/response shape (breaking changes)
81
87
  Has the request body or response structure changed in a way that breaks the test?
82
88
  - Compare test's expected fields against current schema/model definitions
83
89
  - Type changes (string→int, int→string) → score 60+, action: UPDATE or REGENERATE
84
90
  - New required fields the test doesn't send → score 50+, action: UPDATE
85
91
  - Response fields the test asserts on have been removed → score 50+, action: UPDATE
86
92
 
87
- ### Step 2b: Check additive response field changes (coverage gaps)
93
+ ### Check B2: Additive response field changes (coverage gaps)
88
94
  **Even if existing assertions still pass**, does the diff add a new field to the response of an endpoint this test already covers?
89
95
  - Look at the diff for lines like \`+ "newField":\` or \`+ newField =\` inside a view/serializer this test hits
90
96
  - If YES → score 30, action: UPDATE — add an assertion for the new field (e.g. \`assert "newField" in response_body\` or \`assert response_body["newField"] >= 0\`)
91
97
  - This applies even when the test only checks status codes — the test should be extended to cover the new field
92
98
  - **Do NOT score IGNORE if a new response field was added to a covered endpoint**
93
99
 
94
- ### Step 3: Check auth changes
100
+ ### Check C: Auth changes
95
101
  Has the authentication mechanism for this endpoint changed?
96
102
  - Auth added where none existed → score 40+, action: UPDATE
97
103
  - Auth method changed (bearer→cookie) → score 50+, action: UPDATE
98
104
  - Auth removed → score 30+, action: VERIFY or UPDATE
99
105
 
100
- ### Step 4: Assign score and action
101
- Based on the above, assign a final drift score 0–100 and choose the action (IGNORE / VERIFY / UPDATE / REGENERATE).
106
+ ### Check D: Assign score and action
107
+ Based on the above, assign a final drift score 0–100 and choose the action (IGNORE / VERIFY / UPDATE / REGENERATE / DELETE).
102
108
  Provide a 1-2 sentence rationale.
103
- - If Step 2b flagged an additive field → score must be ≥ 30 and action must be UPDATE, even if Steps 2/3 found no breaking changes.`;
109
+ - If Check B2 flagged an additive field → score must be ≥ 30 and action must be UPDATE, even if Checks B/C found no breaking changes.`;
104
110
  }
105
111
  export function buildAddRecommendationGuidelines() {
106
- return `## ADD Recommendations for New Endpoints
112
+ return `## ADD New Tests for New Endpoints
113
+
114
+ **Only ADD when:**
115
+ - The diff introduces a brand-new route that has **no existing test coverage at all**, OR
116
+ - The diff introduces a new auth path, error branch, or fundamentally separate scenario that no existing test covers.
107
117
 
108
- For each new endpoint detected in the diff (not yet covered by any existing test):
118
+ **Never ADD when:**
119
+ - The resource already has existing tests and the diff only adds a new HTTP method — UPDATE those files instead.
120
+ - The endpoint existed before this diff but lacks tests — that is a pre-existing coverage gap; log it in \`additionalRecommendations\`, do NOT add a test now.
109
121
 
110
- ### Test type priority by HTTP method
122
+ **Test type priority by HTTP method:**
111
123
  | Method | Recommended test types |
112
124
  |--------|----------------------|
113
125
  | POST / PUT / PATCH | integration, contract |
114
126
  | GET | contract, smoke |
115
127
  | DELETE | integration, smoke |
116
128
 
117
- ### ADD recommendation format
118
- For each new endpoint, include:
119
- 1. The endpoint path and method
120
- 2. The recommended test types (from the table above)
121
- 3. The Skyramp tool to call (e.g., \`skyramp_contract_test_generation\`, \`skyramp_integration_test_generation\`)
122
- 4. The \`endpointURL\` to use (combine base URL + path)
123
- 5. The language/framework to use (from workspace config or project metadata)`;
129
+ Use a unique descriptive filename for every new test file. Do NOT create a new contract or integration test file for a resource that already has existing tests — use UPDATE instead.`;
124
130
  }
125
- export function buildDriftOutputChecklist(existingTestCount, newEndpointCount) {
126
- return `## Output Checklist
131
+ export function buildUpdateExecutionRules() {
132
+ return `## Update Execution Rules
133
+
134
+ When applying UPDATE actions to existing test files, follow these rules in addition to the drift-detected changes:
135
+
136
+ ### Test file ordering (CRITICAL)
137
+ Place mutation test functions (PATCH, PUT, POST) **before** any DELETE test function targeting the same resource. DELETE removes the resource — any mutation call after it will 404. When inserting a new mutation test, place it above the DELETE function and above the DELETE call in the \`if __name__ == "__main__"\` block (or equivalent runner entrypoint).
138
+
139
+ ### Happy path first (CRITICAL)
140
+ When adding a new HTTP method (PUT, PATCH, POST) to an existing test file, always add the happy path (2xx success) assertion first. Do NOT add only error-path tests (404, 422) for the new method — error cases may follow, but the 2xx case is mandatory.
141
+
142
+ ### All test files for a resource (CRITICAL)
143
+ When a diff adds a new HTTP method to a resource, UPDATE covers **all** existing test files for that resource — contract, integration, and UI. Scan the actual test directory on disk to find all files covering the same resource path; do not rely solely on what the analyze tool reports.
144
+
145
+ ### PATCH/PUT with child collections (MANDATORY)
146
+ When updating a contract or integration test for a PATCH or PUT endpoint whose request/response includes a child collection array (e.g. \`items\`, \`products\`, \`line_items\`):
147
+ 1. The request body MUST include the child array with at least one item containing the FK field (e.g. \`product_id\`) and a \`quantity\` field.
148
+ 2. Assert each item's FK field and \`quantity\` match the sent values.
149
+ 3. Assert the top-level computed total (e.g. \`total_amount\`) equals the expected math from the items.
150
+ A test that only sends/asserts metadata (discount, status, notes) without asserting the items array is INCOMPLETE and will produce false passes even when the items/total logic is broken.
151
+
152
+ ### REGENERATE
153
+ Call the appropriate generation tool to replace the existing test from scratch. Use the same filename so it overwrites the old file.
154
+
155
+ ### DELETE
156
+ Remove the test file when ALL endpoints it covers were removed from the codebase. If only SOME methods were removed, use UPDATE instead — remove the test functions for deleted methods and keep the rest.
157
+
158
+ ### Test data isolation (MANDATORY)
159
+ Never use hardcoded resource IDs (e.g. \`order_id=1\`) in any test step, including GET or DELETE steps. Always create required resources via prior POST steps and chain IDs dynamically. Use timestamp-based unique names for created resources (e.g. \`"Product-\${int(time.time())}"\`) to prevent collisions across test runs.
127
160
 
128
- Complete ALL of the following before calling skyramp_actions:
161
+ ### Enhance assertions after UPDATE (MANDATORY)
162
+ Apply to **new test functions you are adding** and **existing functions that cover endpoints changed in the diff** only. Do NOT touch existing functions for endpoints unrelated to the diff.
129
163
 
130
- ### Existing tests (${existingTestCount} total)
131
- For EACH existing test, output:
164
+ ${ENHANCE_ASSERTIONS_FOR_INTEGRATION_AND_CONTRACTPROVIDER}`;
165
+ }
166
+ export function buildDriftOutputChecklist(existingTestCount, newEndpointCount, inlineMode = false) {
167
+ const finalStep = inlineMode
168
+ ? `### Final step
169
+ Apply all maintenance actions (UPDATE / REGENERATE / DELETE) directly by editing the test files. New test generation (ADD) is handled separately in the next step.`
170
+ : `### Final step
171
+ After completing all assessments above, call \`skyramp_actions\` with the stateFile to execute the recommended changes.`;
172
+ // In inline mode, existing test counts are unknown at prompt-build time —
173
+ // they come from skyramp_analyze_changes at runtime. Skip the count headers.
174
+ const existingTestSection = inlineMode
175
+ ? `### Existing tests
176
+ For each existing test reported by \`skyramp_analyze_changes\`:
177
+ - **IGNORE/VERIFY tests**: list on a single line: \`<testFile> — IGNORE\` or \`<testFile> — VERIFY (score <N>)\`. Do NOT write detailed rationale.
178
+ - **UPDATE/REGENERATE/DELETE tests**: output the full block:
132
179
  \`\`\`
133
180
  Test: <testFile>
134
181
  Drift Score: <0-100>
135
- Action: <IGNORE | VERIFY | UPDATE | REGENERATE>
182
+ Action: <UPDATE | REGENERATE | DELETE>
136
183
  Rationale: <1-2 sentence explanation>
137
184
  \`\`\`
138
-
139
- ${newEndpointCount > 0
140
- ? `### New endpoints (${newEndpointCount} detected)
185
+ Focus your analysis on tests that need action — do not spend time analyzing unchanged tests.`
186
+ : `### Existing tests (${existingTestCount} total)
187
+ For each existing test:
188
+ - **IGNORE/VERIFY tests**: list on a single line: \`<testFile> — IGNORE\` or \`<testFile> — VERIFY (score <N>)\`. Do NOT write detailed rationale.
189
+ - **UPDATE/REGENERATE/DELETE tests**: output the full block:
190
+ \`\`\`
191
+ Test: <testFile>
192
+ Drift Score: <0-100>
193
+ Action: <UPDATE | REGENERATE | DELETE>
194
+ Rationale: <1-2 sentence explanation>
195
+ \`\`\``;
196
+ const newEndpointSection = inlineMode
197
+ ? ""
198
+ : newEndpointCount > 0
199
+ ? `### New endpoints (${newEndpointCount} detected)
141
200
  For EACH new endpoint, output:
142
201
  \`\`\`
143
202
  Endpoint: <METHOD> <path>
@@ -145,9 +204,12 @@ Action: ADD
145
204
  Test types: <contract | integration | smoke | ...>
146
205
  Rationale: <1 sentence>
147
206
  \`\`\``
148
- : `### New endpoints
149
- No new endpoints detected in this diff.`}
207
+ : `### New endpoints
208
+ No new endpoints detected in this diff.`;
209
+ const sections = [existingTestSection, newEndpointSection, finalStep].filter(s => s.length > 0);
210
+ return `## Output Checklist
150
211
 
151
- ### Final step
152
- After completing all assessments above, call \`skyramp_actions\` with the stateFile to execute the recommended changes.`;
212
+ Complete ALL of the following:
213
+
214
+ ${sections.join("\n\n")}`;
153
215
  }
package/build/prompts/test-maintenance/enhanceAssertionSection.js ADDED
@@ -0,0 +1,99 @@
1
+ export const ENHANCE_ASSERTIONS_FOR_INTEGRATION_AND_CONTRACTPROVIDER = `
2
+ **Enhance assertions** — apply every rule below to every success-path test function that returns a response body (GET, POST, PATCH, PUT with 2xx) in every integration or contract-provider test file.
3
+ Error-path functions (4xx/5xx) and no-body responses (e.g. DELETE 204) need only a status code assertion — do not add echo-back or computed field checks to those.
4
+
5
+ ---
6
+
7
+ **IMPORTANT — How to access response body fields (use the SDK helpers, NOT dict/attribute access on the response variable):**
8
+
9
+ - **Python**: \`skyramp.get_response_value(<response_var>, "<json_path>")\`
10
+ - e.g. \`skyramp.get_response_value(products_POST_response, "id")\`
11
+ - e.g. \`skyramp.get_response_value(orders_POST_response, "items.0.product_id")\`
12
+ - **TypeScript (Playwright)**: \`getResponseValue(<response_var>, "<json_path>")\` (already imported from \`@skyramp/skyramp\`)
13
+ - e.g. \`getResponseValue(productsPostResponse, "id")\`
14
+ - **JavaScript (Playwright)**: \`getResponseValue(<response_var>, "<json_path>")\` (already imported from \`@skyramp/skyramp\`)
15
+ - e.g. \`getResponseValue(productsPostResponse, "id")\`
16
+ - **Java**: \`getValue(<response_var>, "<json_path>")\` (already imported)
17
+ - e.g. \`getValue(productsPostResponse, "id")\`
18
+
19
+ ---
20
+
21
+ **What to assert after each request:**
22
+
23
+ 1. **Non-null / non-empty fields** — Assert that key identifying fields are present and non-empty:
24
+ - IDs, names, emails, and other primary fields must not be null/None/empty.
25
+ - Python: \`assert skyramp.get_response_value(products_POST_response, "id") is not None\`
26
+ - TypeScript: \`expect(getResponseValue(productsPostResponse, "id"), 'id').not.toBeNull();\`
27
+ - JavaScript: \`assert.notStrictEqual(getResponseValue(productsPostResponse, "id"), null, 'id should not be null');\`
28
+ - Java: \`assertNotNull(getValue(productsPostResponse, "id"));\`
29
+
30
+ 2. **Echo-back values (exact sent value)** — For fields returned unchanged from the request body (e.g. \`customer_email\`, \`status\`, \`discount_type\`, \`discount_value\`), assert the exact sent value. Using \`is not None\` is only allowed when the value is genuinely unknown (e.g. server-generated timestamps or IDs). This rule does NOT apply to computed fields (e.g. \`total_amount\`, \`discount_amount\`) — those are covered below.
31
+ - Python: \`assert skyramp.get_response_value(products_POST_response, "name") == "Skyramp Tester"\`
32
+ - TypeScript: \`expect(getResponseValue(productsPostResponse, "name"), 'name').toBe("Skyramp Tester");\`
33
+ - JavaScript: \`assert.strictEqual(getResponseValue(productsPostResponse, "name"), "Skyramp Tester", 'name should match request');\`
34
+ - Java: \`assertEquals("Skyramp Tester", getValue(productsPostResponse, "name"));\`
35
+
36
+ 3. **Chained values**:
37
+ - *Integration tests*: chaining is fundamental — POST creates a resource, GET/PATCH assert that the chained ID echoes back (e.g. GET response \`id\` == POST response \`id\`).
38
+ - Python: \`assert skyramp.get_response_value(product_GET_response, "id") == skyramp.get_response_value(products_POST_response, "id")\`
39
+ - TypeScript: \`expect(getResponseValue(productGetResponse, "id"), 'id').toBe(getResponseValue(productsPostResponse, "id"));\`
40
+ - *Contract-provider tests*: do NOT reference \`beforeAll\` provisioning data — the setup response that creates the resource is \`beforeAll\`, so chaining from it into test-function assertions is prohibited. Assert \`is not None\` for server-generated IDs and use inline request body literals for everything else.
41
+
42
+ 4. **Value ranges** — For numeric fields where a realistic range is inferable from the field name or domain:
43
+ - Python: \`assert skyramp.get_response_value(products_POST_response, "price") >= 0\`
44
+ - TypeScript: \`expect(getResponseValue(productsPostResponse, "price")).toBeGreaterThanOrEqual(0);\`
45
+ - JavaScript: \`assert.ok(getResponseValue(productsPostResponse, "price") >= 0, 'price should be non-negative');\`
46
+
47
+ 5. **Specific known values** — For enum/status fields where only one outcome is valid for this flow:
48
+ - Python: \`assert skyramp.get_response_value(orders_POST_response, "status") == "pending"\`
49
+ - TypeScript: \`expect(getResponseValue(ordersPostResponse, "status"), 'status').toBe("pending");\`
50
+ - JavaScript: \`assert.strictEqual(getResponseValue(ordersPostResponse, "status"), "pending", 'status should be pending');\`
51
+ - Java: \`assertEquals("pending", getValue(ordersPostResponse, "status"));\`
52
+
53
+ 6. **Array/collection completeness** — Only assert indices that exist in the recorded response body — never infer array length from the request or scenario name. Use the \`expected_response_body\` as the source of truth for which indices to assert. For contract-provider tests, use inline request body values — do NOT reference \`beforeAll\` provisioning data.
54
+ - For each item at index N, assert \`product_id\`, \`quantity\`, and \`unit_price\` (integration: chain \`unit_price\` from prior product response; contract: use inline request body value).
55
+ - Assert that no additional item exists beyond the expected count by checking that the next index returns null/None.
56
+ - Python:
57
+ \`assert skyramp.get_response_value(patch_response, "items.0.product_id") == skyramp.get_response_value(product_POST_response, "product_id")\`
58
+ \`assert skyramp.get_response_value(patch_response, "items.0.quantity") == <quantity_sent_in_patch_body>\`
59
+ \`assert skyramp.get_response_value(patch_response, "items.1.product_id") is None # only 1 item was sent\`
60
+ - TypeScript:
61
+ \`expect(getResponseValue(patchResponse, "items.0.product_id")).toBe(getResponseValue(productPostResponse, "product_id"));\`
62
+ \`expect(getResponseValue(patchResponse, "items.0.quantity")).toBe(<quantity_sent_in_patch_body>);\`
63
+ \`expect(getResponseValue(patchResponse, "items.1.product_id")).toBeNull();\`
64
+ - JavaScript:
65
+ \`assert.strictEqual(getResponseValue(patchResponse, "items.0.product_id"), getResponseValue(productPostResponse, "product_id"), 'product_id should match');\`
66
+ \`assert.strictEqual(getResponseValue(patchResponse, "items.0.quantity"), <quantity_sent_in_patch_body>, 'quantity should match');\`
67
+ \`assert.strictEqual(getResponseValue(patchResponse, "items.1.product_id"), null, 'no second item expected');\`
68
+ - Java:
69
+ \`assertEquals(getValue(productPostResponse, "product_id"), getValue(patchResponse, "items.0.product_id"));\`
70
+ \`assertEquals(<quantity_sent_in_patch_body>, getValue(patchResponse, "items.0.quantity"));\`
71
+ \`assertNull(getValue(patchResponse, "items.1.product_id"));\`
72
+
73
+ 7. **Computed / derived numeric fields**:
74
+ - *Integration tests*: MUST derive the value dynamically from prior responses — NEVER hardcode a computed numeric value. Hardcoding \`== 2399.97\` is a violation.
75
+ - Total amount: \`assert skyramp.get_response_value(patch_response, "total_amount") == skyramp.get_response_value(product_POST_response, "price") * <quantity_sent_in_patch_body>\`
76
+ - Discount (percentage): \`assert skyramp.get_response_value(patch_response, "discount_amount") == skyramp.get_response_value(patch_response, "total_amount") * (skyramp.get_response_value(patch_response, "discount_value") / 100)\`
77
+ - *Contract-provider tests*: use the exact pre-computed value from \`expected_response_body\` directly (e.g. \`assert get_response_value(response, "total_amount") == 19.98\`). All inputs must come from the inline request body or the response — do NOT reference \`beforeAll\` provisioning data.
78
+ - Discount (percentage): \`assert skyramp.get_response_value(patch_response, "discount_amount") == skyramp.get_response_value(patch_response, "total_amount") * (skyramp.get_response_value(patch_response, "discount_value") / 100)\`
79
+
80
+ 8. **Read steps** — re-assert chained and computed fields — do not reduce to null-checks only.
81
+
82
+ 9. **Parity** — every assertion derivable from the request body or response (non-null, echo-back, value ranges, computed) must appear in both the contract test and the integration test independently.
83
+
84
+ ---
85
+
86
+ **Scope rules:**
87
+ - *Integration tests*: apply to every \`send_request\` / \`sendRequest\` call that returns a body.
88
+ - *Contract-provider tests*: only modify test functions — do NOT touch \`beforeAll\`, \`afterAll\`, or any setup/teardown helper.
89
+ - Only add assertions clearly supported by the request body, prior response values, field names, or codebase evidence. Do not invent constraints.
90
+ - Add new assertions immediately after the existing status-code assertion — do not move or remove anything.
91
+
92
+ **What NOT to do — any of these is a violation:**
93
+ - Do NOT access response fields via dict syntax (\`response["field"]\`) or attribute access (\`response.field\`) — always use the SDK helper (\`get_response_value\` / \`getResponseValue\` / \`getValue\`).
94
+ - Do NOT remove or modify existing assertions.
95
+ - Do NOT add assertions for fields where no constraint is clearly inferable.
96
+ - Do NOT restructure, reformat, or reorder any existing code.
97
+ - Do NOT add comments or docstrings.
98
+ - Do NOT change function signatures, imports, or variable names.
99
+ `;
package/build/prompts/test-recommendation/recommendationSections.js CHANGED
@@ -44,6 +44,8 @@ export function buildTestPatternGuidelines() {
44
44
  - **Cascade deletes**: If deleting a parent removes children, verify cascade AND orphan prevention (delete product → orders referencing it get error or cascade)
45
45
  - **Race conditions**: If concurrent writes are possible (inventory deduction, counter increment), test concurrent requests
46
46
  - **Computed fields**: If response contains derived values (total, average, count), verify computation with known inputs (e.g., total_cost = compute_seconds * rate + memory_mb * rate + external_cost)
47
+ - **Mutation with collection modification**: If PUT/PATCH endpoints accept arrays of child items (e.g., order line items, cart products, invoice entries), test adding/removing items and verify that derived totals (e.g., total_amount, subtotal, item_count) are recalculated correctly. This is the most common source of user-reported bugs — always prioritize it for GENERATE over simple field-update tests.
48
+ **CRITICAL**: The PATCH/PUT request body MUST include the child collection array field(s) defined for that endpoint (e.g., "items" with FK references like "product_id" and a quantity field) chained from prior POST responses. A PATCH that only sends metadata fields (e.g., discount_type, status, notes) without modifying the child collection is NOT a valid mutation-recalc test — it will pass even when the item/total logic is broken. Before writing assertions, inspect the source code or OpenAPI spec to identify (1) the actual child collection field name and its FK/quantity/price sub-fields, and (2) how derived totals are calculated (including any discounts, taxes, or fees). Then assert: the child FK fields match chained IDs, quantities match sent values, and totals match the computation from the source code
47
49
  - **Webhook/event side effects**: If endpoints trigger async operations, test that side effects occur (e.g., POST /orders triggers notification)
48
50
  - **Cross-user isolation**: If resources are owned by users, test that user B cannot access/modify user A's resources (GET /users/{other_id}/data → 403 Forbidden)
49
51
  - **Range/boundary invariants**: If business rules cap values (max retries, min balance, discount ≤ subtotal), test the boundary (e.g., set retries to max+1 → expect rejection)
@@ -56,6 +58,16 @@ export function buildTestQualityCriteria() {
56
58
  that step B depends on (e.g., create product → create order referencing that product's ID →
57
59
  verify order contains correct product). Single-resource CRUD alone is not an integration test.
58
60
  Use realistic request bodies from source code schemas and verify response data, not just status codes.
61
+ When a PUT/PATCH updates a resource with child collections (e.g., order items), the request body
62
+ MUST include the child array with FK references chained from prior steps — and assertions MUST
63
+ verify the actual child items in the response (product_id, quantity, unit_price), not just
64
+ top-level metadata like discount or status.
65
+
66
+ **Contract tests** (single-step) are the right choice for: error-handling scenarios on a single
67
+ endpoint (e.g., PATCH/GET/DELETE a nonexistent resource → 404, POST with invalid payload → 422),
68
+ validation boundary checks, and any test that exercises one endpoint's API contract in isolation.
69
+ Do NOT add setup steps just to avoid hardcoding an ID — use a realistic hardcoded nonexistent ID
70
+ (e.g., 99999 or a random UUID) and keep it a single-step contract test.
59
71
 
60
72
  **E2E tests** should follow realistic user journeys end-to-end: browse products → search →
61
73
  add to cart → checkout. Verify that frontend actions trigger the correct API calls and
@@ -73,9 +85,8 @@ Choose based on what adds the most value for this PR's changes.
73
85
 
74
86
  **Contract test mode — signal-based selection:**
75
87
  - **Consumer contract** (\`consumerMode: true\`): Look for outbound HTTP client code (fetch, axios, httpx, requests, http.Client), service client classes, or calls to external base URLs. If an endpoint's implementation makes downstream calls, that downstream boundary is a consumer contract test candidate.
76
- - **Provider contract** (\`providerMode: true\`): Look for new or modified endpoint handlers, route changes, response shape modifications, or the presence of an OpenAPI spec. If the diff adds/changes an endpoint this service owns, that is a provider contract test candidate.
77
- - **Both modes**: When the service is simultaneously an API owner (upstream) AND a client of another service (downstream).
78
- - **Default (neither)**: Only when role is unclear or no spec is available.
88
+ - **Provider contract** (\`providerMode: true\`): Look for new or modified endpoint handlers, route changes, or response shape modifications. If the diff adds/changes an endpoint this service owns, that is a provider contract test candidate.
89
+ - **Both modes** (\`providerMode: true, consumerMode: true\`) — produces the same output as omitting both flags (generates provider and consumer contract tests). Use when the diff contains BOTH provider signals (new/modified endpoint handlers) AND consumer signals (outbound HTTP client calls to another service).
79
90
 
80
91
  **Scenario fidelity:** Every workflow scenario should reflect the actual resource
81
92
  relationships in the code. If the pre-drafted scenarios don't match the real data model,
@@ -169,9 +180,10 @@ To skip auth for unauthenticated endpoints, pass \`authHeader: ""\`.`;
169
180
  ${authHeaderLine}
170
181
  ${authGuidance}
171
182
 
172
- **For multi-endpoint workflows (integration tests) — Scenario → Integration pipeline:**
173
- 1. Call \`skyramp_scenario_test_generation\` once per step: \`scenarioName\`, \`destination\`,
174
- \`baseURL\`, \`method\`, \`path\`, \`requestBody\` OR \`queryParams\`, \`responseBody\`, \`${authCallParams}\`.
183
+ **For multi-endpoint workflows (integration tests) — Batch Scenario → Integration pipeline:**
184
+ 1. Call \`skyramp_batch_scenario_test_generation\` with ALL steps in a single call: \`scenarioName\`, \`destination\`,
185
+ \`baseURL\`, \`${authCallParams}\`, and a \`steps\` array where each element has \`method\`, \`path\`, \`requestBody\` OR \`queryParams\`, \`responseBody\`, \`statusCode\`.
186
+ (Fallback: if the batch tool is unavailable, call \`skyramp_scenario_test_generation\` once per step.)
175
187
  \`statusCode\` is optional — defaults: POST→201, DELETE→204, GET/PUT/PATCH→200. Only override for non-standard codes.
176
188
  **OpenAPI spec is NOT required.** \`apiSchema\` is OPTIONAL — omit it if no spec exists.
177
189
  **CRITICAL — Query params vs request body:**
@@ -183,6 +195,10 @@ ${authGuidance}
183
195
  returned by the controller — e.g., \`id\`, \`ownerId\`, \`createdAt\`, included relations like \`collection\`, \`tags\`).
184
196
  Wrap in \`{"response": ...}\` if the API uses an envelope pattern. If omitted, a synthetic response is generated.
185
197
  Inspect the source code to determine the correct request AND response body shapes — avoid sending \`{}\`.
198
+ **CRITICAL for PATCH/PUT mutation-recalc scenarios:** The request body MUST include the child
199
+ collection array (e.g., \`"items": [{"product_id": <chained from prior POST>, "quantity": 2}]\`).
200
+ Never send a PATCH that only modifies metadata (discount, status) without also including the
201
+ items/products collection — such a test will not catch collection-level or total-recalculation bugs.
186
202
  Use unique names with timestamp suffix to avoid conflicts on re-runs.
187
203
  For GET/PUT/DELETE with path IDs, use a placeholder — chaining resolves the real ID.
188
204
  2. Produces a \`scenario_<name>.json\` in the same \`outputDir\` as the test files (not \`.skyramp/\`).
@@ -200,12 +216,11 @@ ${PATH_PARAM_UUID_GUIDANCE}
200
216
 
201
217
  **Contract test mode selection — set based on this service's role at the boundary:**
202
218
  - \`providerMode: true\` — this service IS the API; validates the implementation matches the spec.
203
- Use for new or modified endpoints this codebase owns, especially when an OpenAPI spec is present.
219
+ Use for new or modified endpoints this codebase owns.
204
220
  - \`consumerMode: true\` — this service CALLS another API; validates outbound requests conform to the downstream contract.
205
221
  Use when the endpoint's implementation makes HTTP calls to external services (look for fetch/axios/httpx/http.Client/service clients).
206
222
  A request-aware mock stands in for the real downstream service — no live dependency needed.
207
- - Both — use when the service boundary is both a provider (owns an API) and a consumer (calls a downstream API).
208
- - Neither (default) — use only when the role is ambiguous or no spec is available.
223
+ - **Both modes** (\`providerMode: true, consumerMode: true\`) — produces the same output as omitting both flags: generates both consumer and provider contract tests. Use when the diff contains BOTH provider signals (new/modified endpoint handlers) AND consumer signals (outbound HTTP client calls to another service).
209
224
 
210
225
  **For UI tests:**
211
226
  1. \`browser_navigate\` to the target URL (from workspace \`api.baseUrl\`)