@skyramp/mcp 0.0.63 → 0.0.64-rc.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- export const MAX_TESTS_TO_GENERATE = 4;
1
+ export const MAX_TESTS_TO_GENERATE = 3;
2
2
  export const MAX_RECOMMENDATIONS = 10;
3
3
  export const MAX_CRITICAL_TESTS = 3;
4
4
  function serializeAuthCallParams(params) {
@@ -11,11 +11,16 @@ function serializeAuthCallParams(params) {
11
11
  }
12
12
  return parts.join(", ");
13
13
  }
14
- export function getAuthSnippets(authHeaderValue) {
14
+ export function getAuthSnippets(authHeaderValue, authType) {
15
15
  if (!authHeaderValue) {
16
16
  return { authSchemeSnippet: "", authTokenSnippet: "" };
17
17
  }
18
18
  if (/^authorization$/i.test(authHeaderValue)) {
19
+ if (authType && authType !== "none") {
20
+ // Known auth type from workspace config — embed directly so the LLM doesn't need to guess.
21
+ const scheme = authType.charAt(0).toUpperCase() + authType.slice(1);
22
+ return { authSchemeSnippet: `, authScheme: "${scheme}"`, authTokenSnippet: "" };
23
+ }
19
24
  return { authSchemeSnippet: ', authScheme: "<scheme e.g. Bearer, Basic, Token or empty>"', authTokenSnippet: "" };
20
25
  }
21
26
  return { authSchemeSnippet: "", authTokenSnippet: "" };
@@ -23,169 +28,84 @@ export function getAuthSnippets(authHeaderValue) {
23
28
  export const PATH_PARAM_UUID_GUIDANCE = `**Path parameters:** keep the placeholder in \`endpointURL\` (e.g. \`/coupons/{coupon_id}\`). ` +
24
29
  `Pass the value via \`pathParams\` (e.g. \`coupon_id=<random-uuid-v4>\`). ` +
25
30
  `Use example values from the OpenAPI schema if available; otherwise generate a fresh random UUID v4 — not all-zeros or repeated-digit patterns.`;
26
- export function buildPrioritizationDimensions() {
27
- return `## Prioritization Dimensions (evaluate each candidate test)
28
-
29
- For each candidate test, assess these dimensions using your judgment:
30
-
31
- | Dimension | What to assess |
32
- |-----------|---------------|
33
- | **Production Safety** | Does it guard a critical boundary (auth, unique constraint, cascade delete, data integrity, breaking migration)? Safety tests get priority even if single-endpoint. |
34
- | **Bug-Finding Potential** | Does it target known failure modes (race conditions, data consistency, state transitions, cascade effects)? |
35
- | **User Journey Relevance** | Does it reflect how real users interact with the system (from traces, business flows, or critical paths)? |
36
- | **Coverage Gap** | Does it address an area with zero existing test coverage? Or does it duplicate what\'s already tested? |
37
- | **Code Insight** | Is it derived from actual implementation analysis (e.g., spotted a middleware pattern, found an N+1 risk, found a unique constraint) rather than just API shape? |
38
-
39
- Candidates scoring well across MULTIPLE dimensions should be recommended first.
40
-
41
- **Quality Gate:** For each candidate, ask TWO questions:
42
- 1. "Would this test prevent a production incident?" (safety value)
43
- 2. "Does this test exercise a real workflow or catch a real bug?" (coverage value)
44
-
45
- Candidates scoring YES on question 1 are ALWAYS high priority regardless of sophistication.
46
- A unique constraint test or permission boundary test is not "impressive" but prevents data
47
- corruption and unauthorized access — these are critical.
48
-
49
- ## Test Categories (assign one to each candidate)
50
- | Category | What it covers | Priority boost |
51
- |----------|---------------|----------------|
52
- | **security_boundary** | Auth checks, permission validation, cross-user isolation, idempotency | HIGH — always include if candidates exist |
53
- | **business_rule** | Unique constraints, range validation, state machines, invariants | HIGH — always include if candidates exist |
54
- | **data_integrity** | Cascade deletes, orphan prevention, referential integrity | HIGH — always include if candidates exist |
55
- | **breaking_change** | Route renames, auth migration, response shape changes | HIGH — always include if diff contains breaking changes |
56
- | **workflow** | Cross-resource integration, user journeys, multi-step flows | HIGH — standard ranking |`;
57
- }
58
- export function buildTestExamples() {
59
- return `## Test Examples (calibrate your judgment)
60
-
61
- **Impressive tests (recommend these):**
62
- 1. "Register user → login → create order → verify order appears in user\'s order list"
63
- Cross-resource workflow with auth chaining and data verification across users + orders.
64
- Category: workflow.
65
- 2. "Create product with inventory=10 → place order for qty=10 → verify inventory=0 →
66
- place another order → verify 409 out-of-stock error"
67
- Cross-resource state machine + business rule validation (products + orders + inventory).
68
- Category: business_rule.
69
- 3. "POST /users with duplicate email → verify 409 Conflict → verify original user unchanged"
70
- Error handling with side-effect verification — not just status code check.
71
- Category: business_rule.
72
-
73
- **Critical safety tests (ALWAYS recommend if patterns exist in the codebase):**
74
- 4. "Create user A\'s resource → authenticate as user B → GET/PUT/DELETE user A\'s resource → verify 403 Forbidden"
75
- Cross-user isolation — prevents unauthorized access to another user\'s data.
76
- Category: security_boundary. Example: GET /users/{other_id}/links should return 403, not 200.
77
- 5. "Create parent resource → create child referencing parent → DELETE parent → GET child → verify child is deleted (cascade) or parent delete is blocked (referential integrity)"
78
- Cascade delete / orphan prevention — prevents dangling references in the database.
79
- Category: data_integrity. Example: DELETE /collections/{id} → verify /links referencing it are cleaned up.
80
- 6. "Apply discount of $50 to order with subtotal $30 → verify total is $0 (not negative) or request is rejected"
81
- Edge case arithmetic / business invariant — prevents negative totals, overflows, or invalid state.
82
- Category: business_rule. Example: discount > subtotal, quantity > stock, retry count > max allowed.
83
- 7. "Call old endpoint path after route rename → verify 404 or redirect; call new path → verify 200"
84
- Breaking change migration — ensures clients on the old path get a clear signal.
85
- Category: breaking_change. Example: /automated_emails renamed to /auto_mails, Bearer prefix changed to Token.
86
-
87
- **Non-impressive tests (deprioritize or skip):**
88
- 1. "GET /products → 200" — trivial health check, no assertions beyond status code.
89
- 2. "POST /products → GET /products/{id} → PUT /products/{id} → DELETE /products/{id}"
90
- Single-resource CRUD — baseline, not impressive by itself.
91
- 3. "POST /products with missing name → 422" — obvious validation, already covered by contract tests.`;
92
- }
93
31
  export function buildTestPatternGuidelines() {
94
- return `## Test Pattern Guidelines (reference, not rigid rules)
32
+ return `### Test Pattern Guidelines
95
33
 
96
- ### Tier 1 — Base Patterns
34
+ #### Tier 1 — Base Patterns
97
35
  - CRUD lifecycle per resource group (Create → Read → Update → Delete)
98
36
  - Auth flow (Register → Login → Access protected → Token refresh → Logout)
99
37
  - Pagination & filtering (boundary values, empty results, large page sizes)
100
- - Error responses (400, 401, 403, 404, 409, 422 — each with specific trigger)
101
-
102
- ### Tier 2 — Code-Informed Patterns (higher value look for these in the codebase)
103
- - **Middleware chains**: If auth/rate-limit/logging middleware exists, test the chain
104
- (e.g., rate limit hit → auth still checked → correct error returned)
105
- - **N+1 query risk**: If list endpoints join related data (e.g., orders with products),
106
- test with large datasets under load
107
- - **State machines**: If resources have status transitions (draft→published→archived),
108
- test invalid transitions (e.g., archived→draft should fail)
109
- - **Cascade deletes**: If deleting a parent removes children, verify cascade AND verify
110
- orphan prevention (delete product → orders referencing it get error or cascade)
111
- - **Race conditions**: If concurrent writes are possible (inventory deduction, counter
112
- increment), test concurrent requests under load
113
- - **Computed fields**: If response contains derived values (total, average, count),
114
- verify computation with known inputs
115
- - **Webhook/event side effects**: If endpoints trigger async operations, test that side
116
- effects occur (e.g., POST /orders triggers email notification)
117
- - **Cross-user isolation**: If resources are owned by users, test that user B cannot
118
- access/modify user A\'s resources (GET /users/{other_id}/data → 403 Forbidden)
119
- - **Range/boundary invariants**: If business rules cap values (max retries, min balance,
120
- discount ≤ subtotal), test the boundary (e.g., set retries to max+1 → expect rejection)
121
- - **Breaking change migration**: If the diff renames a route, changes auth headers, or
122
- removes a required field, test both the old path (should 404 or redirect) and new path
123
- (should succeed). Example: /automated_emails → /auto_mails, Bearer → Token prefix`;
38
+ - Error responses (400, 401, 403, 404, 409, 422 — each with a specific trigger)
39
+
40
+ #### Tier 2 — Code-Informed Patterns (higher value, look for these in source code)
41
+ - **Middleware chains**: If auth/rate-limit/logging middleware exists, test the chain (e.g., rate limit hit → auth still checked → correct error returned)
42
+ - **N+1 query risk**: If list endpoints join related data (e.g., orders with products), test with large datasets
43
+ - **State machines**: If resources have status transitions (draft→published→archived), test invalid transitions (e.g., archived→draft should fail)
44
+ - **Cascade deletes**: If deleting a parent removes children, verify cascade AND orphan prevention (delete product → orders referencing it get error or cascade)
45
+ - **Race conditions**: If concurrent writes are possible (inventory deduction, counter increment), test concurrent requests
46
+ - **Computed fields**: If response contains derived values (total, average, count), verify computation with known inputs (e.g., total_cost = compute_seconds * rate + memory_mb * rate + external_cost)
47
+ - **Webhook/event side effects**: If endpoints trigger async operations, test that side effects occur (e.g., POST /orders triggers notification)
48
+ - **Cross-user isolation**: If resources are owned by users, test that user B cannot access/modify user A's resources (GET /users/{other_id}/data → 403 Forbidden)
49
+ - **Range/boundary invariants**: If business rules cap values (max retries, min balance, discount ≤ subtotal), test the boundary (e.g., set retries to max+1 → expect rejection)
50
+ - **Breaking change migration**: If the diff renames a route, changes auth headers, or removes a required field, test both old path (should 404) and new path (should succeed)`;
124
51
  }
125
52
  export function buildTestQualityCriteria() {
126
- return `## What Makes a Good Test
53
+ return `### What Makes a Good Test
127
54
 
128
55
  **Integration tests** should demonstrate cross-resource data flow — step A creates data
129
- that step B depends on (e.g., create product \u2192 create order referencing that product's ID \u2192
56
+ that step B depends on (e.g., create product → create order referencing that product's ID →
130
57
  verify order contains correct product). Single-resource CRUD alone is not an integration test.
131
- Use realistic request bodies from source code schemas and verify response data, not just
132
- status codes.
58
+ Use realistic request bodies from source code schemas and verify response data, not just status codes.
133
59
 
134
- **E2E tests** should follow realistic user journeys end-to-end: browse products \u2192 search \u2192
135
- add to cart \u2192 checkout. Verify that frontend actions trigger the correct API calls and
60
+ **E2E tests** should follow realistic user journeys end-to-end: browse products → search →
61
+ add to cart → checkout. Verify that frontend actions trigger the correct API calls and
136
62
  that the UI reflects backend state.
137
63
 
138
- **UI tests** should exercise component behavior and interaction flows: fill form \u2192 validate
139
- inputs \u2192 submit \u2192 see confirmation. Include visual state changes (loading, error, empty)
64
+ **UI tests** should exercise component behavior and interaction flows: fill form → validate
65
+ inputs → submit → see confirmation. Include visual state changes (loading, error, empty)
140
66
  and accessibility checks.`;
141
67
  }
142
68
  export function buildGenerationRules(isUIOnlyPR) {
143
- return `## Generation Guidelines
69
+ return `### Generation Guidelines
70
+
71
+ **Available test types:** integration, contract, E2E, UI. **No smoke or fuzz tests.**
72
+ Choose based on what adds the most value for this PR's changes.
73
+
74
+ **Contract test mode — signal-based selection:**
75
+ - **Consumer contract** (\`consumerMode: true\`): Look for outbound HTTP client code (fetch, axios, httpx, requests, http.Client), service client classes, or calls to external base URLs. If an endpoint's implementation makes downstream calls, that downstream boundary is a consumer contract test candidate.
76
+ - **Provider contract** (\`providerMode: true\`): Look for new or modified endpoint handlers, route changes, response shape modifications, or the presence of an OpenAPI spec. If the diff adds/changes an endpoint this service owns, that is a provider contract test candidate.
77
+ - **Both modes**: When the service is simultaneously an API owner (upstream) AND a client of another service (downstream).
78
+ - **Default (neither)**: Only when role is unclear or no spec is available.
144
79
 
145
80
  **Scenario fidelity:** Every workflow scenario should reflect the actual resource
146
81
  relationships in the code. If the pre-drafted scenarios don't match the real data model,
147
82
  replace them with accurate ones.
148
-
149
- **Available test types:**
150
- - **Integration** — multi-endpoint workflows that chain data across resources
151
- - **Contract** — validates API contracts between services. Two distinct modes — choose based on role:
152
- - **Provider contract test** (\`providerMode: true\`): Recommend when this codebase IS the API provider/owner.
153
- Use when: new endpoints are added, existing responses are modified, an OpenAPI spec exists to validate against,
154
- or you need to verify the implementation still honors its contracts after a code change.
155
- The test calls the real provider and asserts the response conforms to the spec.
156
- - **Consumer contract test** (\`consumerMode: true\`): Recommend when this codebase CALLS another service's API.
157
- Use when: the service makes outbound HTTP calls to downstream APIs, you need to verify outbound requests
158
- conform to the downstream contract, or you want to catch consumer-side drift before it reaches the provider.
159
- The test uses a request-aware mock as the provider — no live downstream service needed.
160
- - **Both modes** (\`providerMode: true, consumerMode: true\`): Recommend for service boundaries where this
161
- codebase is simultaneously an API owner (upstream) AND a client of another service (downstream).
162
- - **Default (neither set)**: generates a standard contract test equivalent to both modes. Use only when
163
- the role is unclear or as a fallback when no spec is available.
164
-
165
- **Signal for consumer contract test:** Look for outbound HTTP client code (fetch, axios, httpx, requests, http.Client),
166
- service client classes, or calls to external base URLs in the codebase. If an endpoint's implementation
167
- makes downstream calls, that downstream boundary is a consumer contract test candidate.
168
-
169
- **Signal for provider contract test:** Look for new or modified endpoint handlers, route changes, response
170
- shape modifications, or the presence of an OpenAPI spec. If the diff adds/changes an endpoint this service owns,
171
- that is a provider contract test candidate.
172
-
173
- **Do NOT recommend Fuzz tests.** Fuzz testing is available as a manual tool but must not appear in automated recommendations.
174
- - **E2E** — user journeys spanning frontend to backend (needs Playwright traces)
175
- - **UI** — frontend component interaction flows (needs Playwright traces)
176
83
  ${isUIOnlyPR ? `
177
- This is a **UI-only PR** — no backend changes. UI and E2E tests are most relevant.
178
- Without Playwright traces, recommend them with trace recording instructions
179
- (\`skyramp_start_trace_collection\` with \`playwright: true\`).
84
+ **UI-only PR** — no backend changes. UI and E2E tests are most relevant.
85
+ Without Playwright traces, list as additionalRecommendations with trace recording instructions.
180
86
  ` : `
181
87
  When no Playwright trace exists, still recommend E2E/UI tests with instructions for
182
88
  recording a trace using \`skyramp_start_trace_collection\` with \`playwright: true\`.
183
89
  `}
184
90
  **No duplicate coverage.** If an existing test already covers an endpoint + test type,
185
- recommend a different test that adds new coverage.
186
-
187
- Choose the test types and distribution that maximize coverage for this specific PR.
188
- No smoke, fuzz tests.`;
91
+ recommend a different test that adds new coverage.`;
92
+ }
93
+ export function buildTestExamples() {
94
+ return `### Examples — what "good" looks like
95
+
96
+ **Impressive (these catch prod bugs):**
97
+ 1. Cross-resource workflow: Register → login → create order → verify order appears in user's order list (category: workflow)
98
+ 2. State machine + business rule: Create product with inventory=10 → place order qty=10 → verify inventory=0 → place another order → verify 409 out-of-stock (category: business_rule)
99
+ 3. Computed field verification: POST /flow-costs with known compute_seconds/memory_mb/external_cost_usd → verify total_cost_usd = (compute_seconds × 0.00012) + (memory_mb × 0.000002 × compute_seconds) + external_cost_usd (category: business_rule)
100
+ 4. Cross-user isolation: Create user A's resource → authenticate as user B → GET/PUT/DELETE user A's resource → verify 403 (category: security_boundary)
101
+ 5. Cascade delete: Create parent → create child referencing parent → DELETE parent → GET child → verify 404 or 409 depending on FK policy (category: data_integrity)
102
+ 6. Unique constraint with side-effect: POST /users with duplicate email → verify 409 → verify original user unchanged (category: business_rule)
103
+ 7. Budget threshold: Create budget with alert_threshold=80% → record costs pushing spend to 85% → verify budget_warning=true on next cost record (category: business_rule)
104
+
105
+ **Deprioritise (low value):**
106
+ - GET /products → 200 (trivial health check, no assertions beyond status)
107
+ - Single-resource CRUD with no cross-resource or state verification
108
+ - POST with missing field → 422 (obvious validation, covered by contract tests)`;
189
109
  }
190
110
  export function buildToolWorkflows(authHeaderValue, authTypeValue = "") {
191
111
  const isAuthorizationHeader = /^authorization$/i.test(authHeaderValue);
@@ -201,7 +121,10 @@ If you find auth requirements, pass the appropriate \`authHeader\` (e.g., "Autho
201
121
  authParams = { authHeader: "" };
202
122
  }
203
123
  else if (isAuthorizationHeader) {
204
- authParams = { authHeader: "Authorization", authScheme: "<scheme e.g. Bearer, Token or empty>" };
124
+ const resolvedScheme = (authTypeValue && authTypeValue !== "none")
125
+ ? authTypeValue.charAt(0).toUpperCase() + authTypeValue.slice(1)
126
+ : "<scheme e.g. Bearer, Token or empty>";
127
+ authParams = { authHeader: "Authorization", authScheme: resolvedScheme };
205
128
  if (authTypeValue) {
206
129
  authGuidance = `**Auth Scheme:** The workspace \`api.authType\` is \`"${authTypeValue}"\`.
207
130
  **Where to find the scheme** (check in order):
@@ -284,65 +207,3 @@ ${PATH_PARAM_UUID_GUIDANCE}
284
207
  **For E2E tests:**
285
208
  Same trace flow, pass both trace file and playwright zip to \`skyramp_e2e_test_generation\`.`;
286
209
  }
287
- export function buildCoverageChecklist(openApiSpec, isUIOnlyPR, topN, maxGenerate = MAX_TESTS_TO_GENERATE, maxCritical = MAX_CRITICAL_TESTS) {
288
- const specNote = openApiSpec
289
- ? `\n**OpenAPI Spec available**: \`${openApiSpec.path}\`
290
- Use it actively:
291
- - **Contract tests**: pass \`apiSchema: "${openApiSpec.path}"\` — the CLI validates response schemas against the spec.
292
- - **Integration tests**: pass \`apiSchema\` to \`skyramp_scenario_test_generation\` — it extracts destination and request/response shapes.
293
- - **Single-endpoint tests**: pass both \`endpointURL\` AND \`apiSchema\` for schema-aware generation.
294
- \n`
295
- : "";
296
- return `## Coverage Checklist
297
- ${specNote}
298
- ${isUIOnlyPR ? `**UI-only PR** — no backend changes.
299
- Without Playwright traces, the testbot skips generation entirely — all recommendations
300
- become additionalRecommendations in the report.
301
- ` : `**Available test types:** integration, contract, E2E, UI. No smoke, fuzz tests.
302
- Choose based on what adds the most value for this PR's changes.
303
- `}
304
- ## For Each Recommendation Include:
305
- 1. Test type 2. Category (security_boundary/business_rule/data_integrity/breaking_change/workflow)
306
- 3. Priority (high/medium/low) 4. Target endpoint/scenario
307
- 5. What it validates (business logic, not just "tests the endpoint")
308
- 6. Skyramp tool call details — exact tool + key params for zero-editing execution
309
- 7. For integration/E2E: reference draftedScenario by scenarioName
310
- 8. Reasoning — WHY this test matters: the specific production risk it prevents,
311
- the business rule it enforces, or the security boundary it validates.
312
- Not "tests the endpoint" but "prevents duplicate user registration that would
313
- corrupt the user table" or "validates cascade delete does not orphan order records"
314
-
315
- ## When Artifacts Are Missing
316
- Recommend the test anyway — never mark it "blocked":
317
- - **No OpenAPI spec** \u2192 use \`endpointURL\` and \`requestBody\` from source code
318
- - **No Playwright recording** \u2192 provide trace recording instructions
319
- - **No backend trace** \u2192 use the scenario generation pipeline
320
-
321
- ## Select the Top ${topN} (category-aware)
322
- Consider all possible tests (endpoints \u00d7 interaction types + scenarios), then:
323
-
324
- 1. **Categorize** each candidate as: security_boundary, business_rule, breaking_change, data_integrity, or workflow
325
- 2. **Select critical tests first**: From each critical category present (security_boundary, business_rule, data_integrity, breaking_change), select the highest-ranked candidate — up to ${Math.min(maxCritical, Math.min(maxGenerate, topN))} of the ${Math.min(maxGenerate, topN)} generated tests MUST be from critical categories
326
- 3. **Fill remaining generated slots**: Fill the remaining ${Math.max(Math.min(maxGenerate, topN) - Math.min(maxCritical, Math.min(maxGenerate, topN)), 0)} slot(s) with the highest-ranked workflow/general candidates
327
- 4. **Total generated**: EXACTLY ${Math.min(maxGenerate, topN)} tests (critical categories filled first, then general). Do NOT generate fewer.
328
- 5. **Additional recommendations**: The remaining ${Math.max(topN - Math.min(maxGenerate, topN), 0)} recommendations (#${Math.min(maxGenerate, topN) + 1}\u2013${topN}) go to additionalRecommendations \u2014 these are recommended but not generated
329
-
330
- Include \`totalConsidered\` count in your output.
331
- Critical-category tests MUST be in the generated set even if a workflow test scores higher on sophistication.
332
-
333
- ## Recommendation Stability
334
- When PR history is provided:
335
- - **Carry forward** previous additionalRecommendations that still apply — match by scenarioName (for multi-step scenarios) or by endpoint (for single-endpoint tests). Re-derive category and priority from test content.
336
- - **Only add** new recommendations for code paths introduced since the last run.
337
- - **Only drop** a previous recommendation if its target endpoint was removed, its business logic changed, or it is now covered by a generated test.
338
-
339
- - Each integration scenario's step sequence should be logically valid — preconditions
340
- met by prior steps.
341
-
342
- Each recommendation should include enough detail for direct tool invocation.
343
- Reference draftedScenarios by name and interactions by description.
344
- Use "high"/"medium"/"low" for priority — no numeric scores.
345
- You MUST produce EXACTLY ${topN} total recommendations: ${Math.min(maxGenerate, topN)} to generate + ${Math.max(topN - Math.min(maxGenerate, topN), 0)} as additionalRecommendations. Do NOT produce fewer.
346
-
347
- Generate recommendations now.`;
348
- }