@skyramp/mcp 0.2.3 → 0.2.5-rc.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. package/build/playwright/registerPlaywrightTools.js +21 -25
  2. package/build/playwright/traceRecordingPrompt.js +2 -2
  3. package/build/prompts/test-maintenance/actionsInstructions.js +60 -0
  4. package/build/prompts/test-maintenance/drift-analysis-prompt.js +18 -101
  5. package/build/prompts/test-maintenance/driftAnalysisSections.js +210 -171
  6. package/build/prompts/test-recommendation/analysisOutputPrompt.js +1 -1
  7. package/build/prompts/test-recommendation/diffExecutionPlan.js +4 -3
  8. package/build/prompts/test-recommendation/recommendationSections.js +6 -6
  9. package/build/prompts/test-recommendation/scopeAssessment.js +3 -1
  10. package/build/prompts/test-recommendation/scopeAssessment.test.js +13 -0
  11. package/build/prompts/test-recommendation/test-recommendation-prompt.js +2 -2
  12. package/build/prompts/test-recommendation/test-recommendation-prompt.test.js +3 -3
  13. package/build/prompts/testbot/testbot-prompts.js +21 -17
  14. package/build/prompts/testbot/testbot-prompts.test.js +21 -17
  15. package/build/services/TestDiscoveryService.js +11 -43
  16. package/build/tools/submitReportTool.js +9 -12
  17. package/build/tools/submitReportTool.test.js +4 -5
  18. package/build/tools/test-management/actionsTool.js +160 -240
  19. package/build/tools/test-management/analyzeChangesTool.js +43 -18
  20. package/build/tools/test-management/analyzeTestHealthTool.js +17 -29
  21. package/build/utils/docker.test.js +1 -1
  22. package/build/utils/versions.js +1 -1
  23. package/node_modules/playwright/lib/mcp/skyramp/common/visualSnapshot.js +95 -0
  24. package/node_modules/playwright/lib/mcp/skyramp/loadTraceTool.js +2 -0
  25. package/node_modules/playwright/lib/mcp/skyramp/traceRecordingBackend.js +150 -2
  26. package/node_modules/playwright/lib/mcp/skyramp/visualSnapshotTool.js +63 -0
  27. package/node_modules/playwright/lib/mcp/test/skyRampExport.js +36 -0
  28. package/package.json +2 -2
  29. package/build/prompts/test-maintenance/drift-analysis-prompt.test.js +0 -116
@@ -2,37 +2,46 @@
2
2
  * Modular section builders for the Drift Analysis prompt,
3
3
  * mirroring the recommendationSections.ts pattern.
4
4
  */
5
- export function buildActionDecisionMatrix() {
5
+ import { AUTH_MIDDLEWARE_PATTERNS_STR } from "../../utils/workspaceAuth.js";
6
+ // TODO: Replace the open-ended diff-line categories below with a two-tier structure:
7
+ // Tier 1 — mechanical patterns (specific token + file-path criteria, zero judgment needed).
8
+ // Tier 2 — residual open-ended scan for anything observable not covered by tier 1.
9
+ // This reduces LLM variability in action assignment for the same diff.
10
+ export function buildActionDecisionTree() {
6
11
  return `<decision_rules>
7
- ## Action Decision Tree
12
+ **Before the numbered checks, apply the scope gate:** can this test's service or interface boundary actually reach the changed code? If the answer is clearly no — the test targets a definitively different service, a read-only replica, a completely separate microservice, or a different protocol — assign IGNORE and skip the remaining checks for this test. When reachability is uncertain (shared output layer, inherited base models, conditional field exposure), use VERIFY instead of IGNORE.
8
13
 
9
- For each existing test, work through these checks in order — the first match wins:
14
+ **Before working through any individual check, do a single pass over the entire diff** to record all diff lines that directly change observable API behavior (route paths, response fields, status codes, auth gates, validation logic). For each matched diff line write one line: \`{pattern type} — "{diff line}" — affects {endpoint/test}\`. Build this detection list first — it is your working artifact for all tests. An action with no entry in this list is unsupported.
10
15
 
11
- 1. **All endpoints the test covers were removed** → **DELETE**
12
- 2. **Some endpoints removed, some renamed** → **UPDATE**
13
- 3. **New response field added to a covered endpoint** → **UPDATE** — the test needs a new assertion even if existing assertions still pass
14
- 4. **Shape change breaks assertions (field-level: ≤2 fields changed, renamed, or type-swapped)** → **UPDATE**
15
- **Shape change restructures the root response (flat→nested, new wrapper object, root key renamed, ≥50% of test assertions broken)** → **REGENERATE**
16
- 5. **Auth added or auth method changed** → **UPDATE**
17
- **Auth removed** → **VERIFY**
18
- 6. **No breaking changes detected** → **IGNORE** or **VERIFY** for minor drift
16
+ Diff lines that change observable API behavior — look for these in the pre-scan:
17
+ - Route removed or renamed: \`- @app.route\`, \`- router.get\`, \`- @GetMapping\`, or paired \`-\`/\`+\` on a route decorator
18
+ - Field type/removal: \`- field: int\`/\`+ field: string\`, \`- "responseField":\`
19
+ - Status/enum change: \`- return 200\`/\`+ return 201\`, \`- status: "active"\`/\`+ status: "enabled"\`
20
+ - Root wrapper change: \`- return Response({...})\`/\`+ return Response({"data":{...},"meta":{...}})\`
21
+ - New field in output layer (DTO, response schema, view, output formatter, \`res.json\`): \`+ "newField":\` or \`+ newField =\` inside the response-building code
22
+ - New field in model/migration only (no output layer change): \`+ newField = Column(...)\`
23
+ - Auth added/removed/changed: \`+ @require_auth\`, \`- @require_auth\`, token-type change
24
+ - Scope narrowed: \`+ requireRole\`, \`+ raise PermissionError\`, \`+ if not is_owner\`, \`+ [x for x in xs if x.owner == caller_id]\`
25
+ - Behavioral: \`+ raise ValidationError\`/\`+ HTTPException(409)\` on new \`if\`, \`+ VALID_TRANSITIONS\`, sync→async (\`- return 200\`/\`+ return 202\`), formula change (\`- total = a - b\`/\`+ total = a + tax - b\`)
26
+
27
+ Then, for each test where the changed code *is* reachable, work through the individual checks below using your pre-built detection list. Collect **all** matching signals, then assign the single highest-severity action across all matches. Severity order (highest first): **DELETE > REGENERATE > UPDATE > VERIFY > IGNORE**. **Before assigning UPDATE, REGENERATE, or DELETE, quote the specific diff line(s) that triggered it in the rationale. If you cannot point to a diff line this test's endpoint can observe, the action is IGNORE or VERIFY, not UPDATE.**
19
28
 
20
29
  Rules:
21
- - DELETE when all covered endpoints no longer exist; REGENERATE when they still exist but changed drastically.
22
- - REGENERATE means: the top-level response shape changed (flat→nested, new wrapper object added, root key renamed), OR ≥50% of the test's assertions reference fields that were removed or restructured. In all other cases, prefer UPDATE.
23
- - Prefer UPDATE over REGENERATE when changes are field-level (≤2 fields added, removed, renamed, or type-swapped).
24
- - Prefer IGNORE over VERIFY when all changed files are unrelated to the test's endpoint.
30
+ - Collect all signals; assign the highest-severity action across them. Include all matched signals in the rationale and all matching \`updateInstructions\` — a diff that renames a path AND adds a field requires both a URL patch and a new assertion.
31
+ - DELETE when all covered endpoints no longer exist; REGENERATE when they still exist but the root response wrapper changed and essentially every assertion is now invalid. In all other cases, prefer UPDATE.
32
+ - REGENERATE when the root response wrapper changed (flat→nested, new envelope object, root key renamed) and essentially every assertion is invalid — if you can describe the fix as patching N specific paths, it is UPDATE regardless of how many paths there are.
33
+ - Prefer IGNORE over VERIFY when all changed files are unrelated to the test's endpoint. Exception: if the diff touches a shared output layer, base model, or response formatter that the test's endpoint uses, prefer VERIFY even if no route file changed.
25
34
  - ADD actions belong in the next step — complete this assessment with IGNORE / VERIFY / UPDATE / REGENERATE / DELETE only.
26
35
 
27
36
  <examples>
28
37
  <example>
29
- Diff adds one field to a response object and renames a URL path segment:
38
+ Diff adds one field to the response output and renames a URL path segment:
30
39
  \`\`\`
31
40
  - @app.route("/users/<id>/orders")
32
41
  + @app.route("/users/<id>/purchases")
33
- + "total_items": len(order.items)
42
+ + "total_items": len(order.items) # inside the response-building code this test hits
34
43
  \`\`\`
35
- → **UPDATE**: path rename + one new field — both are field-level changes. Patch the URL and add an assertion for \`total_items\`.
44
+ → **UPDATE**: output layer signal confirmed in diff (total_items added to the response builder) + path rename. Patch the URL and add an assertion for \`total_items\`.
36
45
  </example>
37
46
  <example>
38
47
  Diff wraps the entire response in a new envelope object:
@@ -40,184 +49,214 @@ Diff wraps the entire response in a new envelope object:
40
49
  - return Response({"id": ..., "status": ..., "items": [...]})
41
50
  + return Response({"data": {"id": ..., "status": ..., "items": [...]}, "meta": {"page": 1}})
42
51
  \`\`\`
43
- → **REGENERATE**: root shape changed from a flat object to \`{data, meta}\`. Every existing assertion (e.g. \`response["id"]\`, \`response["status"]\`) is broken — rewrite the test from scratch.
52
+ → **REGENERATE**: root wrapper changed — every existing assertion (e.g. \`response["id"]\`) is broken. Rewrite from scratch.
53
+ </example>
54
+ <example>
55
+ Diff adds a field to a model/migration only — project has an explicit output layer (DTO, response schema, field allowlist) controlling what gets exposed:
56
+ \`\`\`
57
+ + sort_order = Column(Integer, nullable=True) # in models.py
58
+ \`\`\`
59
+ No output layer file changed.
60
+ → **VERIFY**: cannot confirm from the diff alone whether sort_order is included in the response — the output layer may exclude it.
61
+ </example>
62
+ <example>
63
+ Diff adds a field to a model/migration only — project has no explicit output layer (model fields are passed through directly to the response):
64
+ \`\`\`
65
+ + sort_order: {type: 'integer', nullable: true} # in db/schema.js
66
+ \`\`\`
67
+ No output layer file changed. No field allowlist or exclusion exists for this resource.
68
+ → **UPDATE**: model fields are auto-exposed in responses. Augment the test to assert the new field is present and round-trips correctly.
69
+ </example>
70
+ <example>
71
+ Diff changes a status code in the handler:
72
+ \`\`\`
73
+ - res.status(200).json(...)
74
+ + res.status(201).json(...)
75
+ \`\`\`
76
+ → **UPDATE**: the test asserts \`toBe(200)\` which now fails. Patch the status assertion.
77
+ </example>
78
+ <example>
79
+ Diff adds a role gate to a route the test covers:
80
+ \`\`\`
81
+ + if (user.role !== "owner") {
82
+ + return res.status(403).json({ error: "forbidden_role" });
83
+ + }
84
+ \`\`\`
85
+ → **UPDATE**: the test's existing token now gets 403. Send a token with sufficient role and add a 403 negative assertion for the restricted role. (Authorization scope narrowed — not an auth-mechanism change; the Auth/AuthZ and Behavioral Contract checks cover this.)
86
+ </example>
87
+ <example>
88
+ Diff adds a state-transition guard:
89
+ \`\`\`
90
+ + const VALID_TRANSITIONS = { draft: ["review"], review: ["published"] };
91
+ + if (!VALID_TRANSITIONS[currentStatus]?.includes(newStatus)) {
92
+ + throw new HTTPException(409, "invalid_transition");
93
+ + }
94
+ \`\`\`
95
+ → **UPDATE**: an integration test that previously posted \`draft→published\` directly now gets 409. Chain through the valid states (draft→review→published) and add a 409 assertion for the direct skip.
44
96
  </example>
45
97
  </examples>
46
98
  </decision_rules>`;
47
99
  }
100
+ // Retained for backwards compatibility — no longer rendered in the prompt.
101
+ // Diff signals are now inlined into each individual check function.
102
+ /** @deprecated use the individual check functions; this function is no longer part of the prompt */
48
103
  export function buildBreakingChangePatterns() {
49
- return `## Breaking Change Patterns to Detect
50
-
51
- Scan the diff lines for these high-signal patterns:
52
-
53
- ### Endpoint-level breaking changes
54
- - \`- @app.route("/old-path")\` / \`+ @app.route("/new-path")\` — renamed endpoint
55
- - \`- router.get("/old")\` / \`+ router.get("/new")\` — renamed route
56
- - \`- @GetMapping("/old")\` / \`+ @GetMapping("/new")\` — Spring rename
57
- - Lines removing a route decorator entirely (endpoint removed)
58
-
59
- ### Request/response shape changes
60
- - Field type changes: \`- field: int\` → \`+ field: string\`
61
- - Required field added: \`+ required: [..., "newField"]\`
62
- - Response field removed: \`- "responseField":\`
63
- - Enum value changes: \`- status: "active"\` → \`+ status: "enabled"\`
64
-
65
- ### Auth changes
66
- - \`+ @require_auth\`, \`+ @login_required\`, \`+ middleware(authMiddleware)\`
67
- - \`- @require_auth\` (auth removed)
68
- - Token type changed: Bearer → Cookie
69
-
70
- ### Status code changes
71
- - \`- return 200\` → \`+ return 201\`
72
- - \`- status_code=200\` → \`+ status_code=204\`
73
- - \`- res.status(201)\` → \`+ res.status(200)\`
74
-
75
- ### Additive response field changes (non-breaking but coverage gap)
76
- These do NOT break existing assertions but leave the new field untested. Always flag as UPDATE for covered endpoints.
77
- - \`+ "newField": queryset.filter(...).count()\` added inside a \`Response({...})\` or \`res.json({...})\`
78
- - \`+ newField = serializers.XXXField()\` added to a serializer used by a tested endpoint
79
- - \`+ "newField":\` added to a response body dict returned by the endpoint
80
- - New key added inside an existing dict/object returned by the endpoint`;
104
+ return "";
81
105
  }
82
- export function buildTestAssessmentGuidelines() {
83
- return `## Per-Test Assessment (4 Checks)
84
-
85
- For each existing test file, run these checks:
86
-
87
- ### Check A: Endpoint existence
88
- Does the endpoint the test targets still exist in the codebase?
89
- - If ALL endpoints the test covers were removed → action: DELETE (the entire test file is obsolete)
90
- - If SOME methods were removed but others remain → action: UPDATE (remove the test functions for deleted methods, keep the rest)
91
- - If the endpoint was renamed → action: UPDATE (path substitution)
92
-
93
- ### Check B: Request/response shape (breaking changes)
94
- Has the request body or response structure changed in a way that breaks the test?
95
- - Compare test's expected fields against current schema/model definitions
96
- - Type changes (string→int, int→string) on individual fields → action: UPDATE
97
- - Type change restructures the root object or makes the entire request body invalid → action: REGENERATE
98
- - New required fields the test doesn't send → action: UPDATE
99
- - Response fields the test asserts on have been removed → action: UPDATE
100
- - ≥50% of the test's assertions reference fields that were removed or restructured → action: REGENERATE
101
-
102
- **UPDATE vs REGENERATE:** choose UPDATE when changes are field-level (≤2 fields added, removed, renamed, or type-swapped). Choose REGENERATE only when the root response shape changed (flat→nested, new wrapper object, root key renamed) or ≥50% of assertions are broken.
103
-
104
- ### Check B2: Additive response field changes (coverage gaps)
105
- **Even if existing assertions still pass**, does the diff add a new field to the response of an endpoint this test already covers?
106
- - Look at the diff for lines like \`+ "newField":\` or \`+ newField =\` inside a view/serializer this test hits
107
- - If YES → action: UPDATE
108
- - This applies even when the test only checks status codes — the test should be extended to cover the new field
109
- - A new response field on a covered endpoint always triggers UPDATE — even when existing assertions still pass.
110
-
111
- ### Check C: Auth changes
112
- Has the authentication mechanism for this endpoint changed?
113
- - Auth added where none existed → action: UPDATE
114
- - Auth method changed (bearer→cookie) → action: UPDATE
115
- - Auth removed → action: VERIFY
116
-
117
- ### Check D: Assign action
118
- Based on the above, choose the action (IGNORE / VERIFY / UPDATE / REGENERATE / DELETE) and provide a 1-2 sentence rationale.
119
- - If Check B2 flagged an additive field → action must be UPDATE, even if Checks B/C found no breaking changes.`;
106
+ export function buildCheckEndpointExistence() {
107
+ return `Does the endpoint the test targets still exist in the codebase?
108
+
109
+ Diff signals to look for:
110
+ - Route removed: \`- @app.route("/path")\`, \`- router.get("/path")\`, \`- @GetMapping("/path")\`
111
+ - Route renamed: paired \`-\` and \`+\` on a route decorator with a different path
112
+ - Redirect added for old path: \`res.setHeader("Location", ...)\` + \`res.status(301)\` or \`res.status(308)\` in middleware or a handler for the old path
113
+
114
+ Actions:
115
+ - ALL endpoints the test covers were removed → DELETE (the entire test file is obsolete)
116
+ - SOME methods removed but others remain → UPDATE (remove test functions for deleted methods, keep the rest)
117
+ - Endpoint renamed with **no redirect** (route decorator changed, no \`setHeader("Location")\` signal in the diff) → UPDATE (path substitution to the new path; supply \`renamedEndpoints\`)
118
+ - Endpoint renamed with **redirect kept** (\`setHeader("Location")\` + 301/308 signal present for the old path) → UPDATE: assert the redirect status AND \`Location\` header on the old path. Do NOT just substitute the URL — the old-path test must verify the redirect contract, not the canonical response.`;
120
119
  }
121
- export function buildAddRecommendationGuidelines() {
122
- return `## ADD New Tests for New Endpoints
123
-
124
- **ADD applies only when:**
125
- - The diff introduces a brand-new route that has **no existing test coverage at all**, OR
126
- - The diff introduces a new auth path, error branch, or fundamentally separate scenario that no existing test covers.
120
+ export function buildCheckResponseShape() {
121
+ return `Has the request body or response structure changed in a way that breaks the test?
127
122
 
128
- **Use UPDATE instead of ADD when:**
129
- - The resource already has existing tests and the diff only adds a new HTTP method — add the new method's test cases to the existing file.
130
- - The endpoint existed before this diff but lacks tests — log it in \`additionalRecommendations\` and skip it; pre-existing coverage gaps are out of scope for ADD.
131
-
132
- **Test type priority by HTTP method:**
133
- | Method | Recommended test types |
134
- |--------|----------------------|
135
- | POST / PUT / PATCH | integration, contract |
136
- | GET | contract, smoke |
137
- | DELETE | integration, smoke |
138
-
139
- Use a unique descriptive filename for every new test file. For a resource with existing tests, update the existing file — always prefer UPDATE over creating a new file.`;
123
+ Diff signals to look for:
124
+ - Field type change: \`- field: int\` / \`+ field: string\`
125
+ - Required request body field added: \`+ required: [..., "newField"]\`
126
+ - Required query param added with no default
127
+ - Response field removed: \`- "responseField":\`
128
+ - Enum value changed: \`- status: "active"\` / \`+ status: "enabled"\`
129
+ - Status code changed: \`- return 200\` / \`+ return 201\`
130
+ - Root wrapper added: \`- return Response({...})\` / \`+ return Response({"data": {...}, "meta": {...}})\`
131
+
132
+ Actions:
133
+ - Type changes, new required fields, removed asserted fields, status/enum changes → UPDATE
134
+ - Root response wrapper changed and essentially every assertion is now invalid → REGENERATE
135
+
136
+ **UPDATE vs REGENERATE — the deciding question is whether the root response wrapper changed:**
137
+ - **REGENERATE** only when a new top-level envelope object wraps the entire payload or the root key is renamed so that essentially every existing assertion must change.
138
+ - **UPDATE** for everything else. If you can describe the fix as "patch these N assertion paths", it is UPDATE regardless of how many paths there are.
139
+ - When every assertion in the file is invalid, it is REGENERATE. When you can still patch individual paths, it is UPDATE.`;
140
140
  }
141
- export function buildUpdateExecutionRules() {
142
- return `<execution_rules>
143
- ## Update Execution Rules
141
+ export function buildCheckAuthAndAuthorization() {
142
+ return `Has the authentication or authorization for this endpoint changed?
144
143
 
145
- When applying UPDATE actions to existing test files, follow these rules in addition to the drift-detected changes:
144
+ **Authentication mechanism**
146
145
 
147
- ### Test file ordering
148
- Place mutation test functions (PATCH, PUT, POST) **before** any DELETE test function targeting the same resource. DELETE removes the resource — any mutation call after it will 404. When inserting a new mutation test, place it above the DELETE function and above the DELETE call in the \`if __name__ == "__main__"\` block (or equivalent runner entrypoint).
146
+ Diff signals to look for: ${AUTH_MIDDLEWARE_PATTERNS_STR}. Also: \`@requiresRole\`/\`@Protected\`, \`validateToken\`/\`checkPermission\`/\`verifyHMAC\`, imports from auth/security packages, \`- @require_auth\` (removal), token type change (Bearer → Cookie).
149
147
 
150
- ### Happy path first
151
- When adding a new HTTP method (PUT, PATCH, POST) to an existing test file, always include a 2xx success assertion first. Error-path tests (404, 422) may follow, but the happy path case is required.
148
+ Actions:
149
+ - Auth added where none existed → UPDATE (test would 401/403 on every request without the new credential)
150
+ - Auth method changed (e.g. bearer→cookie) → UPDATE (test sends the wrong credential type)
151
+ - Auth removed and test asserts a 401/403 response that will no longer fire → UPDATE
152
+ - Auth removed and test does not assert on auth responses → VERIFY (endpoint may now be intentionally public)
152
153
 
153
- ### All test files for a resource
154
- When a diff adds a new HTTP method to a resource, UPDATE covers **all** existing test files for that resource — contract, integration, and UI. Apply UPDATE to every file the analyze tool reported for that resource path; do not stop after updating the first one.
154
+ **Authorization scope** (same credential, narrower access)
155
155
 
156
- ### PATCH/PUT with child collections
157
- Child collection arrays (e.g. \`items\`, \`products\`, \`line_items\`) drive computed totals — a test that omits them cannot catch the most common mutation bugs. When the request/response includes a child collection:
158
- 1. Include the child array with at least one item containing the FK field (e.g. \`product_id\`) and a \`quantity\` field.
159
- 2. Assert each item's FK field and \`quantity\` match the sent values.
160
- 3. Assert the top-level computed total (e.g. \`total_amount\`) equals the expected math from the items.
156
+ Diff signals to look for: \`+ requireRole\`, \`+ requireCreateX\`/\`requireDeleteX\`, \`+ assert_*_scope\`, \`+ ALLOWED_ROLES.includes\`/\`ASSIGNABLE_ROLES\`, \`+ if not is_owner\`, \`+ raise PermissionError\`/\`HTTPException(403)\`, \`+ [x for x in xs if x.owner == caller_id]\`, new role-carrying request header (e.g. \`X-Workspace-Role\`).
161
157
 
162
- ### REGENERATE
163
- Call the appropriate generation tool to replace the existing test from scratch. Use the same filename so it overwrites the old file.
158
+ Actions:
159
+ - Role or ownership gate added → UPDATE (test's existing token may now get 403; send a sufficient-role token and add a 403 negative assertion)
160
+ - Caller-identity filtering added → UPDATE (test's token now returns a subset; adjust expectations or use an admin-scope token)
161
+
162
+ Do NOT assign IGNORE just because the auth *mechanism* is unchanged — scope narrowing breaks a token-valid test.`;
163
+ }
164
+ export function buildCheckBehavioralContract() {
165
+ return `Has the endpoint's BEHAVIOR changed while its response shape stayed the same? A test can break even when no field was added or removed.
166
+
167
+ Diff signals and actions:
168
+ - **Validation tightened**: \`+ raise ValidationError\`/\`+ throw new ValidationError\` gated on field value, \`+ Field(pattern=...)\`/\`ge=\`/\`le=\`/\`max_length=\` on an existing field → UPDATE (fix the payload to satisfy the new constraint; add the 4xx negative case)
169
+ - **New conditional rejection / state guard**: \`+ raise HTTPException(status_code=409)\`/\`+ res.status(409)\` inside a new \`if\`, \`+ VALID_TRANSITIONS\`, \`+ allowed_states = ...\` → UPDATE (chain through valid states; assert the rejection status for the now-illegal path)
170
+ - **Sync → async**: \`- return 200 result\` / \`+ return 202 {job_id}\` → UPDATE (assert \`202\` and the job/id field; remove old result-field assertions from the immediate response)
171
+ - **Computed-field formula changed**: \`- total = a - b\` / \`+ total = a + tax - b\` on an existing asserted field → UPDATE; describe the new formula in \`updateInstructions\` and provide the recomputed expected value where inputs are known from the diff
172
+ - **Behavior gated on a request header**: old shape returned only when a version header is sent; new shape is now the default → UPDATE (migrate assertions to the new default shape, or pin the old shape by sending the version header)
173
+ - **Redirect added to existing path** (e.g. \`res.setHeader("Location", ...)\` + \`res.status(301)\` or \`res.status(308)\`): the endpoint now redirects instead of serving a response → UPDATE: assert the redirect status code AND the \`Location\` header value. Do not just update the URL in the test to the new canonical path — that loses the redirect contract.
174
+
175
+ **Reachability for behavioral changes:** The service/interface scope gate still applies — if the test targets a completely different service or protocol, IGNORE. However, do NOT use the absence of a route or output layer file in the diff as grounds for IGNORE. Behavioral changes (new error branches, new validation, status-code changes) are observable from any test calling the same endpoint, even when the logic lives in an internal handler, middleware, or utility file rather than the route file itself.`;
176
+ }
177
+ export function buildCheckAssignAction() {
178
+ return `Based on the above checks, choose the action (IGNORE / VERIFY / UPDATE / REGENERATE / DELETE) and provide a 1-2 sentence rationale.
179
+
180
+ **Every action requires a specific rationale — including IGNORE:**
181
+ - UPDATE / REGENERATE / DELETE: quote the specific diff line that triggered it.
182
+ - VERIFY: name the uncertain element (e.g. "model-only change, cannot confirm field is exposed without checking the output layer").
183
+ - IGNORE: name the specific reason the changed code cannot reach this test's endpoint (e.g. "diff only touches \`auth/session-service.js\` — this test targets \`/api/v1/orders\` which has no session dependency"). Generic "unrelated endpoint" or "service boundary" without a diff reference is not sufficient.
184
+
185
+ - If the Additive Fields check flagged a new field with output layer signal confirmed in the diff → action is UPDATE. If the Additive Fields check returned VERIFY (model-only signal, no output layer change) → action remains VERIFY.
186
+ - **Scope gate:** If the changed code is clearly not reachable through the service or base URL this test targets, assign IGNORE.
187
+ - **When uncertain, use VERIFY not IGNORE:** If the diff touches a model or migration and this test's endpoint reads from that model, assign VERIFY — you cannot confirm from the diff alone whether the field is exposed.
188
+ - **Pre-commit verification — confirm all three before finalizing UPDATE/REGENERATE/DELETE:**
189
+ 1. You can quote a specific diff line this test's endpoint observes that triggered the action.
190
+ 2. The changed code is reachable through this test's service and base URL.
191
+ 3. For REGENERATE: every assertion in the file is invalid, not just some — if you can patch N paths, it is UPDATE.
192
+ If any check fails, downgrade to VERIFY or IGNORE.
193
+ - **For user-written (external) tests** marked \`[external]\` in the test list:
194
+ - UPDATE is permitted — targeted edits only (fix renamed URL, add assertion for new field, add a new test case for a new scenario). Match the style of the existing file: use the same test framework, assertion helpers, and request pattern already present.
195
+ - REGENERATE and DELETE are **not permitted** — assign those actions in your recommendations but \`skyramp_actions\` will surface them as report-only findings for the developer to act on. Do NOT attempt to rewrite or delete a user-authored test file.`;
196
+ }
197
+ export function buildCheckAdditiveFields() {
198
+ return `Even if existing assertions still pass, new response fields on a covered endpoint need a new assertion. A field being optional or nullable does not make it IGNORE — it still needs to be covered.
164
199
 
165
- ### DELETE
166
- Remove the test file when ALL endpoints it covers were removed from the codebase. If only SOME methods were removed, use UPDATE instead — remove the test functions for deleted methods and keep the rest.
200
+ Diff signals to look for:
201
+ - \`+ "newField": ...\` or \`+ newField =\` inside response-building code (DTO, response schema, view, output formatter, \`res.json\`, \`return {...}\`) → output layer signal confirmed
202
+ - \`+ newField = Column(...)\` or \`+ newField:\` in a model/migration only, with no output layer change → model-only signal
167
203
 
168
- ### Test data isolation
169
- Never use hardcoded resource IDs (e.g. \`order_id=1\`) in any test step, including GET or DELETE steps. Always create required resources via prior POST steps and chain IDs dynamically. Use timestamp-based unique names for created resources (e.g. \`"Product-\${int(time.time())}"\`) to prevent collisions across test runs.
204
+ Actions:
205
+ - Output layer signal confirmed → UPDATE (assert the new field in the existing test: its value on create, its round-trip on read, and its default when omitted)
206
+ - Model-only signal, **project has an explicit output layer** (a separate DTO, response schema, field allowlist, or \`fields =\` definition controls what gets exposed) → VERIFY (cannot confirm from diff alone whether the field is included in the response)
207
+ - Model-only signal, **project has no explicit output layer** (model fields are passed through directly to the response) → UPDATE (field is auto-exposed in responses; augment the test)
170
208
 
171
- ### Enhance assertions after UPDATE
172
- Call \`skyramp_enhance_assertions\` with \`testFile\` set to the absolute path of the test file you just updated, \`enhanceType: "maintenance"\`, and the matching \`testType\` based on the file you are editing:
173
- - **Integration test file** (multi-step chained requests): call with \`testType: "integration"\`
174
- - **Contract-provider test file** (single endpoint with \`beforeAll\`/\`afterAll\` setup, provider mode): call with \`testType: "contract"\`. Skip for consumer-mode contract tests.
175
- - **UI test file** (imports \`@playwright/test\`, uses \`page.\` calls): call with \`testType: "ui"\`
209
+ To determine which applies: look at whether an explicit output layer (DTO, serializer, \`response_model\`, field allowlist) is visible in the diff context — either as a changed file or as an import in a changed file. If yes and it was not changed in this diff → VERIFY (cannot confirm the new field is included without reading it). If no explicit output layer is visible in the diff context → UPDATE (model fields are passed through directly).
176
210
 
177
- Then apply every instruction returned by the tool to the test file.
178
- </execution_rules>`;
211
+ **Do not assign IGNORE because the field is optional or nullable** — "won't break existing assertions" is not the bar. The bar is "does the test now cover this endpoint's full contract?" A new field, even optional, means the contract changed and the test is incomplete.`;
179
212
  }
180
- export function buildDriftOutputChecklist(existingTestCount, newEndpointCount, inlineMode = false, stateFile) {
181
- const finalStep = inlineMode
182
- ? `### Final step
183
- Apply all maintenance actions (UPDATE / REGENERATE / DELETE) directly by editing the test files. Apply IGNORE, VERIFY, UPDATE, REGENERATE, or DELETE only — ADD is handled in the next task.`
184
- : `### Final step
185
- After completing all assessments above, call \`skyramp_actions\` with \`stateFile: "${stateFile ?? "<stateFile>"}"\` and a \`recommendations\` entry for every test assessed. For each entry include: \`testFile\` (absolute path as reported by the analysis tools), \`action\`, \`rationale\`, \`updateInstructions\` (free-form summary of what this test must change — new fields to assert, constraint details, auth changes, new request params, or any other drift specifics; \`skyramp_actions\` passes this directly to the downstream LLM editing the file), and \`renamedEndpoints\` (for path-rename updates).
186
-
187
- Call \`skyramp_actions\` as the sole final action — skip all other file writes.`;
188
- const existingTestHeader = inlineMode
189
- ? "### Existing tests (reported by skyramp_analyze_changes)"
190
- : `### Existing tests (${existingTestCount} total)`;
191
- const existingTestSection = `${existingTestHeader}
192
- For each existing test:
193
- - **IGNORE/VERIFY tests**: one line each: \`{testFile} — IGNORE\` or \`{testFile} — VERIFY\`. Rationale omitted for brevity.
194
- - **UPDATE/REGENERATE/DELETE tests**: output the full block:
213
+ export function buildDriftOutputChecklist(stateFile, existingTests) {
214
+ const finalStep = `Call \`skyramp_actions\` with \`stateFile: "${stateFile ?? "<stateFile>"}"\` and a \`recommendations\` entry for every test assessed.`;
215
+ const hasTests = (existingTests?.length ?? 0) > 0;
216
+ const testList = hasTests
217
+ ? existingTests.map((t) => {
218
+ const label = (t.source ?? "skyramp") === "external" ? " [external]" : "";
219
+ return `- ${t.testFile}${label}`;
220
+ }).join("\n")
221
+ : "- (none)";
222
+ const noTestsNote = !hasTests
223
+ ? `\nNo existing tests were found. Call \`skyramp_actions\` with \`recommendations: []\` and explain in your report why no maintenance was needed (e.g. "no existing tests cover the changed endpoints" or "PR adds a new endpoint with no prior coverage").\n`
224
+ : "";
225
+ const existingTestSection = `**Existing tests (${existingTests?.length ?? 0} total) — assess ALL of the following:**
226
+ ${testList}
227
+ ${noTestsNote}
228
+ **UPDATE scope notes:**
229
+ - UPDATE is an in-place edit — do not recommend creating a new file for an already-tested endpoint.
230
+ - If a new HTTP method is added to a resource, emit a separate UPDATE entry for every existing test file that covers that resource (contract, integration, UI).
231
+
232
+ For each test above, output one entry:
233
+ - **IGNORE**: \`{testFile} — IGNORE: {reason the diff cannot reach this test's endpoint}\`
234
+ - **VERIFY**: \`{testFile} — VERIFY: {uncertain element}\` — include in \`recommendations[]\` so it appears in the report for developer review
235
+ - **UPDATE**:
236
+ \`\`\`
237
+ Test: {testFile}
238
+ Action: UPDATE
239
+ Rationale: UPDATE because {quoted diff signal}; affects {assertion/path}
240
+ UpdateInstructions: {what must change in the existing file — specific fields, paths, status codes}
241
+ \`\`\`
242
+ - **REGENERATE**:
195
243
  \`\`\`
196
244
  Test: {testFile}
197
- Action: {UPDATE | REGENERATE | DELETE}
198
- Rationale: {1-2 sentence explanation}
199
- \`\`\`
200
- Focus your analysis on tests that need action — keep reasoning for unchanged tests to a single line.`;
201
- const newEndpointSection = inlineMode
202
- ? ""
203
- : newEndpointCount > 0
204
- ? `### New endpoints (${newEndpointCount} detected)
205
- For EACH new endpoint, output:
206
- \`\`\`
207
- Endpoint: {METHOD} {path}
208
- Action: ADD
209
- Test types: {contract | integration | smoke | ...}
210
- Rationale: {1 sentence}
211
- \`\`\``
212
- : `### New endpoints
213
- No new endpoints detected in this diff.`;
214
- const sections = [existingTestSection, newEndpointSection, finalStep].filter(s => s.length > 0);
245
+ Action: REGENERATE
246
+ Rationale: REGENERATE because {quoted diff signal}; every assertion is now invalid
247
+ \`\`\`
248
+ - **DELETE**:
249
+ \`\`\`
250
+ Test: {testFile}
251
+ Action: DELETE
252
+ Rationale: DELETE because {quoted diff signal}; all covered endpoints removed
253
+ \`\`\`
254
+ Include ALL actions — IGNORE, VERIFY, UPDATE, REGENERATE, DELETE — in the \`recommendations[]\` passed to \`skyramp_actions\`. For \`[external]\` tests: \`skyramp_actions\` will apply UPDATE edits and surface REGENERATE/DELETE as report-only findings.
255
+ Be concise — one line per IGNORE/VERIFY entry.`;
256
+ const sections = [existingTestSection, finalStep];
215
257
  return `<output_format>
216
- ## Output Checklist
217
-
218
258
  Complete ALL of the following:
219
259
 
220
260
  ${sections.join("\n\n")}
221
- Be brief. Decide the action for each test and apply edits immediately. Limit reasoning for IGNORE'd tests to a single line.
222
261
  </output_format>`;
223
262
  }
@@ -285,7 +285,7 @@ The ranked test recommendation catalog is pre-built and shown below (after the s
285
285
  2. Output the completed catalog **exactly as formatted — grouped by test type (### E2E / ### UI / ### Integration / ### Contract)**. Do NOT restructure, reorder, rename sections, or generate a new format.
286
286
  3. Do NOT call any Skyramp generation tools. The catalog shows ready-to-use tool calls that can be executed on demand.
287
287
 
288
- **If** Steps 1–2 revealed additional scenarios the catalog does not cover (e.g. a computed formula or FK relationship that was missed), you may optionally call \`skyramp_recommend_tests\` with \`stateFile: "${p.stateFile ?? p.sessionId}"\` and \`enrichedScenarios\` to regenerate a more complete catalog — but only after presenting the current one.`;
288
+ **If** Steps 1–2 revealed additional scenarios the catalog does not cover (e.g. a computed formula or foreign-key relationship that was missed), you may optionally call \`skyramp_recommend_tests\` with \`stateFile: "${p.stateFile ?? p.sessionId}"\` and \`enrichedScenarios\` to regenerate a more complete catalog — but only after presenting the current one.`;
289
289
  const hasJavaFiles = p.candidateRouteFiles?.some((f) => /\.(java|kt)$/.test(f)) ?? false;
290
290
  const routeFilesSection = p.candidateRouteFiles && p.candidateRouteFiles.length > 0
291
291
  ? `\nRoute/controller files found by static scan (read these to discover endpoints — the regex-based catalog below may be incomplete for your framework):\n${p.candidateRouteFiles.map((f) => `- ${f}`).join("\n")}\n`
@@ -29,6 +29,7 @@ function _execCoverageBody(ctx) {
29
29
  5. **Unrelated endpoint coverage (last resort)**: Tests for endpoints with no connection to the PR diff, only when ALL options above have been exhausted.
30
30
  **Avoid backfilling with a test for a completely unrelated resource (e.g. \`POST /reviews\` when the PR only changes \`/orders\`) if any PR-endpoint edge-case integration test is feasible.**
31
31
  - **Contract tests (\`[skyramp]\`)**: If an existing \`[skyramp]\` contract test already covers that resource path → UPDATE the existing test file instead of creating a new one. A new test case is a new test even if the file already exists — count it toward \`newTestsCreated\`.
32
+ - **\`[removed]\` endpoints**: If a GENERATE item targets an endpoint marked \`[removed]\` — the route was deleted in this PR, not renamed — generate a single contract test that asserts \`404 Not Found\`. Do not generate success-path (2xx) tests for removed endpoints. The purpose is a regression guard that catches the endpoint being accidentally re-added.
32
33
  - **Integration/scenario tests**: Always generate as a new file via the scenario pipeline, even if an existing integration test covers the same resource. A new multi-step scenario is a distinct test. Count it toward \`newTestsCreated\`.
33
34
  - **UI tests**: Always generate as a new file. Count toward \`newTestsCreated\`.`;
34
35
  }
@@ -72,7 +73,7 @@ If these conditions are not met, add it to ADDITIONAL only — do NOT displace a
72
73
  When a qualifying candidate is inserted: place it HIGH before MEDIUM before LOW; within the same priority, source-code-derived candidates go BEFORE structural ones. Re-number ranks after insertion. The top ${ctx.maxGen} ranked items become GENERATE candidates.
73
74
 
74
75
  **Source-code validation gates:**
75
- - **Cascade vs referential integrity**: If both a cascade-delete and a delete-blocked scenario appear for the same resource pair, keep only the one matching the source FK delete policy (ON DELETE CASCADE / cascade=True / onDelete: 'CASCADE' → keep cascade-delete; RESTRICT/PROTECT/no annotation → keep delete-blocked). Remove the inapplicable variant.
76
+ - **Cascade vs referential integrity**: If both a cascade-delete and a delete-blocked scenario appear for the same resource pair, keep only the one matching the source foreign-key delete policy (ON DELETE CASCADE / cascade=True / onDelete: 'CASCADE' → keep cascade-delete; RESTRICT/PROTECT/no annotation → keep delete-blocked). Remove the inapplicable variant.
76
77
  - **Unique constraints**: Unique-constraint scenarios (duplicate POST → 409) are pre-drafted for all resources. Confirm enforcement before keeping: SQL UNIQUE index, Mongoose unique: true, Prisma @unique, or explicit duplicate-check code. If the backend is Redis, schema-less, or has no explicit constraint in the changed files, move to ADDITIONAL with a note — do NOT generate.`;
77
78
  }
78
79
  function _execDiversityBody(_ctx) {
@@ -94,7 +95,7 @@ For each pair of GENERATE items, ask: same HTTP method + path + step sequence +
94
95
  Same step sequence with only payload differences (e.g. 10% vs 5% discount both returning 200) = same code path = duplicate. Different scenario names do not make duplicate tests distinct.`;
95
96
  }
96
97
  function _execExecuteBody(ctx) {
97
- return `Replace any scenario that pairs unrelated resources with one reflecting actual FK relationships in the codebase.
98
+ return `Replace any scenario that pairs unrelated resources with one reflecting actual foreign-key relationships in the codebase.
98
99
  Use the field names and values from the \`<source_evidence>\` blocks you quoted in Step ${EXEC_STEP_ENRICH} to fill all tool call parameters. Prefer reusing Step ${EXEC_STEP_ENRICH} evidence when it already resolves a placeholder, but if a placeholder cannot be replaced with concrete values from files already read, you may read the specific schema, model, or handler file needed to resolve it. Assert response field values, not just status codes.
99
100
 
100
101
  ${buildTestQualityCriteria()}
@@ -299,7 +300,7 @@ export function buildExecutionPlan(scored, maxGen, topN, baseUrl, authHeaderValu
299
300
  ? `authHeader: "${authHeaderValue}"${authSchemeSnippet}`
300
301
  : "authHeader: <resolve from workspace or OpenAPI securitySchemes>; authScheme: <if Authorization>";
301
302
  const prereqNote = s.category === "new_endpoint"
302
- ? `\n**Prerequisite discovery**: Check for FK fields (product_id, user_id, order_id) in the endpoint's request body. If found, prepend a step to create that prerequisite resource first, then chain its primary key field into the dependent step using template variable syntax. Check the actual field name from the response body (\`id\`, \`uuid\`, \`_id\`, etc.), response header (\`Location\`), or cookie — do not assume \`id\`.`
303
+ ? `\n**Prerequisite discovery**: Check for foreign-key fields (product_id, user_id, order_id) in the endpoint's request body. If found, prepend a step to create that prerequisite resource first, then chain its primary key field into the dependent step using template variable syntax. Check the actual field name from the response body (\`id\`, \`uuid\`, \`_id\`, etc.), response header (\`Location\`), or cookie — do not assume \`id\`.`
303
304
  : "";
304
305
  const bugLine = s.bugCatchingTarget
305
306
  ? `**Bug to catch**: ${s.bugCatchingTarget}\n`
@@ -50,7 +50,7 @@ Before each GENERATE tool call, confirm WHERE each key value comes from:
50
50
  - **requestBody / responseBody fields** → source code schema (Zod, Pydantic, DTO), enriched scenario, or OpenAPI spec. **The generation tool rejects empty \`{}\` request bodies for POST/PUT/PATCH** — read the source schema first if the fields are unknown.
51
51
  - **endpointURL** → workspace \`baseUrl\` + endpoint path (both required — never path alone)
52
52
  - **authHeader / authScheme** → workspace config or OpenAPI \`securitySchemes\`
53
- - **FK path params** → chained from a prior step's response (check the actual field name — it may be \`id\`, \`uuid\`, \`_id\`, or a resource-specific \`*_id\` field). The chaining source can be a response body (POST or GET), a response header (e.g. \`Location\`), or a cookie — not hardcoded
53
+ - **Foreign-key path params** → chained from a prior step's response — never invented or hardcoded. Common field names: \`id\`, \`uuid\`, \`_id\`, \`*_id\`; use whatever identifier field the server returns for this resource. The chaining source can be a response body (POST or GET), a response header (e.g. \`Location\`), or a cookie.
54
54
  - **Names / string values** → realistic; append timestamp suffix to avoid re-run conflicts
55
55
 
56
56
  ## Ranking Rule
@@ -112,11 +112,11 @@ export function buildTestPatternGuidelines() {
112
112
  - **Middleware chains**: If auth/rate-limit/logging middleware exists, test the chain (e.g., rate limit hit → auth still checked → correct error returned)
113
113
  - **N+1 query risk**: If list endpoints join related data (e.g., orders with products), test with large datasets
114
114
  - **State machines**: If resources have status transitions (draft→published→archived), test invalid transitions (e.g., archived→draft should fail)
115
- - **Cascade deletes**: Only recommend after reading source code to confirm which resource holds the FK. The resource with the FK is the child; the one it points to is the parent. Example: if orders.product_id references products, then products is the parent — deleting a product tests whether orders are protected or cascade-deleted. Getting this backwards (treating the child as the parent) produces a nonsensical test.
115
+ - **Cascade deletes**: Only recommend after reading source code to confirm which resource holds the foreign key. The resource with the foreign key is the child; the one it points to is the parent. Example: if orders.product_id references products, then products is the parent — deleting a product tests whether orders are protected or cascade-deleted. Getting this backwards (treating the child as the parent) produces a nonsensical test.
116
116
  - **Race conditions**: If concurrent writes are possible (inventory deduction, counter increment), test concurrent requests
117
117
  - **Computed fields**: If response contains derived values (total, average, count), verify computation with known inputs (e.g., total_cost = compute_seconds * rate + memory_mb * rate + external_cost)
118
118
  - **Mutation with collection modification**: If PUT/PATCH endpoints accept arrays of child items (e.g., order line items, cart products, invoice entries), test adding/removing items and verify that derived totals (e.g., total_amount, subtotal, item_count) are recalculated correctly. This is the most common source of user-reported bugs — always prioritize it for GENERATE over simple field-update tests.
119
- The PATCH/PUT request body should include the child collection array field(s) defined for that endpoint (e.g., "items" with FK references like "product_id" and a quantity field) chained from prior POST responses. A PATCH that only sends metadata fields (e.g., discount_type, status, notes) without modifying the child collection is NOT a valid mutation-recalc test — it will pass even when the item/total logic is broken. Before writing assertions, inspect the source code or OpenAPI spec to identify (1) the actual child collection field name and its FK/quantity/price sub-fields, and (2) how derived totals are calculated (including any discounts, taxes, or fees). Then assert: the child FK fields match chained IDs, quantities match sent values, and totals match the computation from the source code
119
+ The PATCH/PUT request body should include the child collection array field(s) defined for that endpoint (e.g., "items" with foreign-key references like "product_id" and a quantity field) chained from prior POST responses. A PATCH that only sends metadata fields (e.g., discount_type, status, notes) without modifying the child collection is NOT a valid mutation-recalc test — it will pass even when the item/total logic is broken. Before writing assertions, inspect the source code or OpenAPI spec to identify (1) the actual child collection field name and its foreign-key/quantity/price sub-fields, and (2) how derived totals are calculated (including any discounts, taxes, or fees). Then assert: the child foreign-key fields match chained IDs, quantities match sent values, and totals match the computation from the source code
120
120
  - **Webhook/event side effects**: If endpoints trigger async operations, test that side effects occur (e.g., POST /orders triggers notification)
121
121
  - **Cross-user isolation**: If resources are owned by users, test that user B cannot access/modify user A's resources (GET /users/{other_id}/data → 403 Forbidden)
122
122
  - **Range/boundary invariants**: If business rules cap values (max retries, min balance, discount ≤ subtotal), test the boundary (e.g., set retries to max+1 → expect rejection)
@@ -130,7 +130,7 @@ that step B depends on (e.g., create product → create order referencing that p
130
130
  verify order contains correct product). Single-resource CRUD alone is not an integration test.
131
131
  Use actual field names and values from the source code schema or OpenAPI schema (not \`{}\` or invented field names); verify response data, not just status codes.
132
132
  When a PUT/PATCH updates a resource with child collections (e.g., order items), the request body
133
- MUST include the child array with FK references chained from prior steps — and assertions MUST
133
+ MUST include the child array with foreign-key references chained from prior steps — and assertions MUST
134
134
  verify the actual child items in the response (product_id, quantity, unit_price), not just
135
135
  top-level metadata like discount or status.
136
136
 
@@ -184,7 +184,7 @@ Before finalizing your output, verify:
184
184
  6. **Real request shapes**: requestBody for POST/PUT/PATCH uses actual field names from source (not \`{}\`). GET search/filter uses \`queryParams\`, not \`requestBody\`.
185
185
  7. **scenarioFile**: \`skyramp_integration_test_generation\` uses the exact \`filePath\` returned by \`skyramp_batch_scenario_test_generation\` — not a guessed or hardcoded filename.
186
186
  8. **bugCatchingTarget**: Every GENERATE integration test that targets a business rule, formula, or constraint has a non-empty \`bugCatchingTarget\`.
187
- 9. **FK chaining**: In multi-step integration tests, path params sourced from a prior step's response (e.g. \`order_id\` from step 1) use \`chainsFrom\` — not hardcoded IDs.
187
+ 9. **Foreign-key chaining**: In multi-step integration tests, path params sourced from a prior step's response (e.g. \`order_id\` from step 1) use \`chainsFrom\` — not hardcoded IDs.
188
188
  10. **Concrete scenario names**: No GENERATE item uses a placeholder name ending in a numeric suffix (e.g. \`ui-test-for-changed-component-1\`, \`ui-test-from-trace-2\`). Derive the name from the actual changed component or flow: if the diff touches \`LinkCard.tsx\`, the scenario name should be \`link-card-pin-toggle\` or \`link-card-edit-description\`, not \`ui-test-for-changed-component-1\`. The changed file list is available above — use it.
189
189
  11. **Issue coverage**: If \`<bug_found>\` blocks exist from Step ${codeReviewStepLabel} (Code Review), verify that the highest-severity flaw (HIGH or CRITICAL) has at least one GENERATE item directly targeting it (its pass/fail outcome depends on whether that bug exists). At most one promotion per run. If the promoted flaw lacks a dedicated GENERATE item, promote or create one before proceeding. Additional HIGH/CRITICAL flaws beyond the first should appear in ADDITIONAL at highest priority.
190
190
  12. **Code Review completeness**: Did you produce a \`<function_review>\` block for EVERY changed function/handler in Step ${codeReviewStepLabel}? If any function is missing a review, you skipped the correctness analysis for it. Go back and complete it before finalizing.
@@ -197,7 +197,7 @@ export function buildFewShotExamples() {
197
197
  **Parameter grounding**:
198
198
  - baseURL: "http://localhost:8000" (workspace api.baseUrl)
199
199
  - steps[0].requestBody fields "name", "price": ProductCreate schema fields (src/models/product.py)
200
- - steps[1].requestBody "product_id": FK to products — chained from step 0 response id
200
+ - steps[1].requestBody "product_id": foreign key to products — chained from step 0 response id
201
201
  - steps[1].requestBody "quantity": OrderCreate schema field (src/models/order.py)
202
202
  - responseBody "total_amount": 89.97 = 29.99 × 3 — from order total formula (src/services/order_service.py: total = sum(item.price * item.quantity))
203
203
  - authHeader/authScheme: workspace config (Authorization / Bearer)
@@ -180,9 +180,11 @@ Use these exact numbers throughout the rest of the prompt.`;
180
180
 
181
181
  Budget Plan (total already determined): **${maxTotal} total (${effectiveGenerate} generate + ${additional} additional)**
182
182
 
183
+ **Cosmetic-only override:** If, after code review, the entire diff is cosmetic with no observable rendering or interaction change (e.g. a \`.css\`/\`.scss\` reformat — property reordering, comment/whitespace edits, \`0px\`→\`0\`), the total above does NOT apply — set your Budget Plan to **0 total**, abstain (recommend and generate zero tests), and skip Step D below. A frontend file appearing in the diff is not, by itself, a behavior change. (Style changes that alter visibility, layout, or state are NOT cosmetic — keep the budget for those.)
184
+
183
185
  **Step D — Determine UI vs backend split for the budget above:**
184
186
  - Non-UI slots are backend tests; start from file-count ratio for UI%, then apply judgment:
185
- - Pure CSS/style changes inflate the frontend file count without adding test value → reduce UI%
187
+ - Cosmetic CSS/style changes alongside real changes inflate the frontend file count without adding test value → reduce UI% (a whole-diff-cosmetic PR already abstained via the override above, so this only applies to the mixed-PR case)
186
188
  - Frontend logic bugs (state management, calculation errors, form validation) in the diff → increase UI% even if few frontend files
187
189
  - Frontend component calls a changed backend API → an E2E test covers both sides → count toward UI%
188
190
  - Frontend files only in \`__tests__/\` or \`.stories.\` → exclude from the ratio
@@ -299,6 +299,19 @@ describe("buildScopeAssessmentSection", () => {
299
299
  expect(section).not.toContain("Step B");
300
300
  expect(section).not.toContain("Step C");
301
301
  });
302
+ it("offers a cosmetic-only abstention override in the mixed-PR branch", () => {
303
+ // Regression: a CSS-only diff classifies as frontend (hasFrontendChanges=true),
304
+ // which previously handed the agent a fixed total with no path to zero tests,
305
+ // tanking d1 coverage to 0 (eval 14-cart-css-cleanup).
306
+ const section = buildScopeAssessmentSection(10, 3, false, undefined, true);
307
+ expect(section).toContain("Cosmetic-only override");
308
+ expect(section).toContain("0 total");
309
+ // The override is self-contained: it must NOT depend on a "Task 3" that
310
+ // doesn't exist in standalone (non-testbot) render contexts, and it must
311
+ // tell the agent to skip the now-irrelevant UI% step.
312
+ expect(section).not.toContain("Task 3");
313
+ expect(section).toContain("skip Step D");
314
+ });
302
315
  it("clamps additional to 0 when maxGenerate equals maxTotal (precomputed path)", () => {
303
316
  // maxGenerate=5, maxTotal=5 → additional must be 0, not negative
304
317
  const section = buildScopeAssessmentSection(5, 5, false, 0, false);