npm - @skyramp/mcp - Versions diffs - 0.0.63 → 0.0.64-rc.1 - Mend

@skyramp/mcp 0.0.63 → 0.0.64-rc.1

This diff shows the contents of publicly available package versions that have been released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (13)

package/build/prompts/test-recommendation/recommendationSections.js +63 -202
package/build/prompts/test-recommendation/test-recommendation-prompt.js +294 -84
package/build/prompts/test-recommendation/test-recommendation-prompt.test.js +58 -15
package/build/prompts/testbot/testbot-prompts.js +15 -12
package/build/services/TestExecutionService.js +1 -1
package/build/services/TestGenerationService.js +5 -1
package/build/services/TestGenerationService.test.js +11 -11
package/build/tools/generate-tests/generateContractRestTool.js +84 -10
package/build/tools/test-management/analyzeChangesTool.js +1 -1
package/build/types/RepositoryAnalysis.js +2 -0
package/build/utils/scenarioDrafting.js +418 -93
package/build/utils/scenarioDrafting.test.js +190 -2
package/package.json +2 -2

package/build/prompts/test-recommendation/recommendationSections.js CHANGED Viewed

@@ -1,4 +1,4 @@
-export const MAX_TESTS_TO_GENERATE = 4;
+export const MAX_TESTS_TO_GENERATE = 3;
 export const MAX_RECOMMENDATIONS = 10;
 export const MAX_CRITICAL_TESTS = 3;
 function serializeAuthCallParams(params) {
@@ -11,11 +11,16 @@ function serializeAuthCallParams(params) {
     }
     return parts.join(", ");
 }
-export function getAuthSnippets(authHeaderValue) {
+export function getAuthSnippets(authHeaderValue, authType) {
     if (!authHeaderValue) {
         return { authSchemeSnippet: "", authTokenSnippet: "" };
     }
     if (/^authorization$/i.test(authHeaderValue)) {
+        if (authType && authType !== "none") {
+            // Known auth type from workspace config — embed directly so the LLM doesn't need to guess.
+            const scheme = authType.charAt(0).toUpperCase() + authType.slice(1);
+            return { authSchemeSnippet: `, authScheme: "${scheme}"`, authTokenSnippet: "" };
+        }
         return { authSchemeSnippet: ', authScheme: "<scheme e.g. Bearer, Basic, Token or empty>"', authTokenSnippet: "" };
     }
     return { authSchemeSnippet: "", authTokenSnippet: "" };
@@ -23,169 +28,84 @@ export function getAuthSnippets(authHeaderValue) {
 export const PATH_PARAM_UUID_GUIDANCE = `**Path parameters:** keep the placeholder in \`endpointURL\` (e.g. \`/coupons/{coupon_id}\`). ` +
     `Pass the value via \`pathParams\` (e.g. \`coupon_id=<random-uuid-v4>\`). ` +
     `Use example values from the OpenAPI schema if available; otherwise generate a fresh random UUID v4 — not all-zeros or repeated-digit patterns.`;
-export function buildPrioritizationDimensions() {
-    return `## Prioritization Dimensions (evaluate each candidate test)
-For each candidate test, assess these dimensions using your judgment:
-| Dimension | What to assess |
-|-----------|---------------|
-| **Production Safety** | Does it guard a critical boundary (auth, unique constraint, cascade delete, data integrity, breaking migration)? Safety tests get priority even if single-endpoint. |
-| **Bug-Finding Potential** | Does it target known failure modes (race conditions, data consistency, state transitions, cascade effects)? |
-| **User Journey Relevance** | Does it reflect how real users interact with the system (from traces, business flows, or critical paths)? |
-| **Coverage Gap** | Does it address an area with zero existing test coverage? Or does it duplicate what\'s already tested? |
-| **Code Insight** | Is it derived from actual implementation analysis (e.g., spotted a middleware pattern, found an N+1 risk, found a unique constraint) rather than just API shape? |
-Candidates scoring well across MULTIPLE dimensions should be recommended first.
-**Quality Gate:** For each candidate, ask TWO questions:
-1. "Would this test prevent a production incident?" (safety value)
-2. "Does this test exercise a real workflow or catch a real bug?" (coverage value)
-Candidates scoring YES on question 1 are ALWAYS high priority regardless of sophistication.
-A unique constraint test or permission boundary test is not "impressive" but prevents data
-corruption and unauthorized access — these are critical.
-## Test Categories (assign one to each candidate)
-| Category | What it covers | Priority boost |
-|----------|---------------|----------------|
-| **security_boundary** | Auth checks, permission validation, cross-user isolation, idempotency | HIGH — always include if candidates exist |
-| **business_rule** | Unique constraints, range validation, state machines, invariants | HIGH — always include if candidates exist |
-| **data_integrity** | Cascade deletes, orphan prevention, referential integrity | HIGH — always include if candidates exist |
-| **breaking_change** | Route renames, auth migration, response shape changes | HIGH — always include if diff contains breaking changes |
-| **workflow** | Cross-resource integration, user journeys, multi-step flows | HIGH — standard ranking |`;
-}
-export function buildTestExamples() {
-    return `## Test Examples (calibrate your judgment)
-**Impressive tests (recommend these):**
-1. "Register user → login → create order → verify order appears in user\'s order list"
-   Cross-resource workflow with auth chaining and data verification across users + orders.
-   Category: workflow.
-2. "Create product with inventory=10 → place order for qty=10 → verify inventory=0 →
-   place another order → verify 409 out-of-stock error"
-   Cross-resource state machine + business rule validation (products + orders + inventory).
-   Category: business_rule.
-3. "POST /users with duplicate email → verify 409 Conflict → verify original user unchanged"
-   Error handling with side-effect verification — not just status code check.
-   Category: business_rule.
-**Critical safety tests (ALWAYS recommend if patterns exist in the codebase):**
-4. "Create user A\'s resource → authenticate as user B → GET/PUT/DELETE user A\'s resource → verify 403 Forbidden"
-   Cross-user isolation — prevents unauthorized access to another user\'s data.
-   Category: security_boundary. Example: GET /users/{other_id}/links should return 403, not 200.
-5. "Create parent resource → create child referencing parent → DELETE parent → GET child → verify child is deleted (cascade) or parent delete is blocked (referential integrity)"
-   Cascade delete / orphan prevention — prevents dangling references in the database.
-   Category: data_integrity. Example: DELETE /collections/{id} → verify /links referencing it are cleaned up.
-6. "Apply discount of $50 to order with subtotal $30 → verify total is $0 (not negative) or request is rejected"
-   Edge case arithmetic / business invariant — prevents negative totals, overflows, or invalid state.
-   Category: business_rule. Example: discount > subtotal, quantity > stock, retry count > max allowed.
-7. "Call old endpoint path after route rename → verify 404 or redirect; call new path → verify 200"
-   Breaking change migration — ensures clients on the old path get a clear signal.
-   Category: breaking_change. Example: /automated_emails renamed to /auto_mails, Bearer prefix changed to Token.
-**Non-impressive tests (deprioritize or skip):**
-1. "GET /products → 200" — trivial health check, no assertions beyond status code.
-2. "POST /products → GET /products/{id} → PUT /products/{id} → DELETE /products/{id}"
-   Single-resource CRUD — baseline, not impressive by itself.
-3. "POST /products with missing name → 422" — obvious validation, already covered by contract tests.`;
-}
 export function buildTestPatternGuidelines() {
-    return `## Test Pattern Guidelines (reference, not rigid rules)
+    return `### Test Pattern Guidelines
-### Tier 1 — Base Patterns
+#### Tier 1 — Base Patterns
 - CRUD lifecycle per resource group (Create → Read → Update → Delete)
 - Auth flow (Register → Login → Access protected → Token refresh → Logout)
 - Pagination & filtering (boundary values, empty results, large page sizes)
-- Error responses (400, 401, 403, 404, 409, 422 — each with specific trigger)
-### Tier 2 — Code-Informed Patterns (higher value — look for these in the codebase)
-- **Middleware chains**: If auth/rate-limit/logging middleware exists, test the chain
-  (e.g., rate limit hit → auth still checked → correct error returned)
-- **N+1 query risk**: If list endpoints join related data (e.g., orders with products),
-  test with large datasets under load
-- **State machines**: If resources have status transitions (draft→published→archived),
-  test invalid transitions (e.g., archived→draft should fail)
-- **Cascade deletes**: If deleting a parent removes children, verify cascade AND verify
-  orphan prevention (delete product → orders referencing it get error or cascade)
-- **Race conditions**: If concurrent writes are possible (inventory deduction, counter
-  increment), test concurrent requests under load
-- **Computed fields**: If response contains derived values (total, average, count),
-  verify computation with known inputs
-- **Webhook/event side effects**: If endpoints trigger async operations, test that side
-  effects occur (e.g., POST /orders triggers email notification)
-- **Cross-user isolation**: If resources are owned by users, test that user B cannot
-  access/modify user A\'s resources (GET /users/{other_id}/data → 403 Forbidden)
-- **Range/boundary invariants**: If business rules cap values (max retries, min balance,
-  discount ≤ subtotal), test the boundary (e.g., set retries to max+1 → expect rejection)
-- **Breaking change migration**: If the diff renames a route, changes auth headers, or
-  removes a required field, test both the old path (should 404 or redirect) and new path
-  (should succeed). Example: /automated_emails → /auto_mails, Bearer → Token prefix`;
+- Error responses (400, 401, 403, 404, 409, 422 — each with a specific trigger)
+#### Tier 2 — Code-Informed Patterns (higher value, look for these in source code)
+- **Middleware chains**: If auth/rate-limit/logging middleware exists, test the chain (e.g., rate limit hit → auth still checked → correct error returned)
+- **N+1 query risk**: If list endpoints join related data (e.g., orders with products), test with large datasets
+- **State machines**: If resources have status transitions (draft→published→archived), test invalid transitions (e.g., archived→draft should fail)
+- **Cascade deletes**: If deleting a parent removes children, verify cascade AND orphan prevention (delete product → orders referencing it get error or cascade)
+- **Race conditions**: If concurrent writes are possible (inventory deduction, counter increment), test concurrent requests
+- **Computed fields**: If response contains derived values (total, average, count), verify computation with known inputs (e.g., total_cost = compute_seconds * rate + memory_mb * rate + external_cost)
+- **Webhook/event side effects**: If endpoints trigger async operations, test that side effects occur (e.g., POST /orders triggers notification)
+- **Cross-user isolation**: If resources are owned by users, test that user B cannot access/modify user A's resources (GET /users/{other_id}/data → 403 Forbidden)
+- **Range/boundary invariants**: If business rules cap values (max retries, min balance, discount ≤ subtotal), test the boundary (e.g., set retries to max+1 → expect rejection)
+- **Breaking change migration**: If the diff renames a route, changes auth headers, or removes a required field, test both old path (should 404) and new path (should succeed)`;
 }
 export function buildTestQualityCriteria() {
-    return `## What Makes a Good Test
+    return `### What Makes a Good Test
 **Integration tests** should demonstrate cross-resource data flow — step A creates data
-that step B depends on (e.g., create product \u2192 create order referencing that product's ID \u2192
+that step B depends on (e.g., create product → create order referencing that product's ID →
 verify order contains correct product). Single-resource CRUD alone is not an integration test.
-Use realistic request bodies from source code schemas and verify response data, not just
-status codes.
+Use realistic request bodies from source code schemas and verify response data, not just status codes.
-**E2E tests** should follow realistic user journeys end-to-end: browse products \u2192 search \u2192
-add to cart \u2192 checkout. Verify that frontend actions trigger the correct API calls and
+**E2E tests** should follow realistic user journeys end-to-end: browse products → search →
+add to cart → checkout. Verify that frontend actions trigger the correct API calls and
 that the UI reflects backend state.
-**UI tests** should exercise component behavior and interaction flows: fill form \u2192 validate
-inputs \u2192 submit \u2192 see confirmation. Include visual state changes (loading, error, empty)
+**UI tests** should exercise component behavior and interaction flows: fill form → validate
+inputs → submit → see confirmation. Include visual state changes (loading, error, empty)
 and accessibility checks.`;
 }
 export function buildGenerationRules(isUIOnlyPR) {
-    return `## Generation Guidelines
+    return `### Generation Guidelines
+**Available test types:** integration, contract, E2E, UI. **No smoke or fuzz tests.**
+Choose based on what adds the most value for this PR's changes.
+**Contract test mode — signal-based selection:**
+- **Consumer contract** (\`consumerMode: true\`): Look for outbound HTTP client code (fetch, axios, httpx, requests, http.Client), service client classes, or calls to external base URLs. If an endpoint's implementation makes downstream calls, that downstream boundary is a consumer contract test candidate.
+- **Provider contract** (\`providerMode: true\`): Look for new or modified endpoint handlers, route changes, response shape modifications, or the presence of an OpenAPI spec. If the diff adds/changes an endpoint this service owns, that is a provider contract test candidate.
+- **Both modes**: When the service is simultaneously an API owner (upstream) AND a client of another service (downstream).
+- **Default (neither)**: Only when role is unclear or no spec is available.
 **Scenario fidelity:** Every workflow scenario should reflect the actual resource
 relationships in the code. If the pre-drafted scenarios don't match the real data model,
 replace them with accurate ones.
-**Available test types:**
-- **Integration** — multi-endpoint workflows that chain data across resources
-- **Contract** — validates API contracts between services. Two distinct modes — choose based on role:
-  - **Provider contract test** (\`providerMode: true\`): Recommend when this codebase IS the API provider/owner.
-    Use when: new endpoints are added, existing responses are modified, an OpenAPI spec exists to validate against,
-    or you need to verify the implementation still honors its contracts after a code change.
-    The test calls the real provider and asserts the response conforms to the spec.
-  - **Consumer contract test** (\`consumerMode: true\`): Recommend when this codebase CALLS another service's API.
-    Use when: the service makes outbound HTTP calls to downstream APIs, you need to verify outbound requests
-    conform to the downstream contract, or you want to catch consumer-side drift before it reaches the provider.
-    The test uses a request-aware mock as the provider — no live downstream service needed.
-  - **Both modes** (\`providerMode: true, consumerMode: true\`): Recommend for service boundaries where this
-    codebase is simultaneously an API owner (upstream) AND a client of another service (downstream).
-  - **Default (neither set)**: generates a standard contract test equivalent to both modes. Use only when
-    the role is unclear or as a fallback when no spec is available.
-**Signal for consumer contract test:** Look for outbound HTTP client code (fetch, axios, httpx, requests, http.Client),
-service client classes, or calls to external base URLs in the codebase. If an endpoint's implementation
-makes downstream calls, that downstream boundary is a consumer contract test candidate.
-**Signal for provider contract test:** Look for new or modified endpoint handlers, route changes, response
-shape modifications, or the presence of an OpenAPI spec. If the diff adds/changes an endpoint this service owns,
-that is a provider contract test candidate.
-**Do NOT recommend Fuzz tests.** Fuzz testing is available as a manual tool but must not appear in automated recommendations.
-- **E2E** — user journeys spanning frontend to backend (needs Playwright traces)
-- **UI** — frontend component interaction flows (needs Playwright traces)
 ${isUIOnlyPR ? `
-This is a **UI-only PR** — no backend changes. UI and E2E tests are most relevant.
-Without Playwright traces, recommend them with trace recording instructions
-(\`skyramp_start_trace_collection\` with \`playwright: true\`).
+**UI-only PR** — no backend changes. UI and E2E tests are most relevant.
+Without Playwright traces, list as additionalRecommendations with trace recording instructions.
 ` : `
 When no Playwright trace exists, still recommend E2E/UI tests with instructions for
 recording a trace using \`skyramp_start_trace_collection\` with \`playwright: true\`.
 `}
 **No duplicate coverage.** If an existing test already covers an endpoint + test type,
-recommend a different test that adds new coverage.
-Choose the test types and distribution that maximize coverage for this specific PR.
-No smoke, fuzz tests.`;
+recommend a different test that adds new coverage.`;
+}
+export function buildTestExamples() {
+    return `### Examples — what "good" looks like
+**Impressive (these catch prod bugs):**
+1. Cross-resource workflow: Register → login → create order → verify order appears in user's order list (category: workflow)
+2. State machine + business rule: Create product with inventory=10 → place order qty=10 → verify inventory=0 → place another order → verify 409 out-of-stock (category: business_rule)
+3. Computed field verification: POST /flow-costs with known compute_seconds/memory_mb/external_cost_usd → verify total_cost_usd = (compute_seconds × 0.00012) + (memory_mb × 0.000002 × compute_seconds) + external_cost_usd (category: business_rule)
+4. Cross-user isolation: Create user A's resource → authenticate as user B → GET/PUT/DELETE user A's resource → verify 403 (category: security_boundary)
+5. Cascade delete: Create parent → create child referencing parent → DELETE parent → GET child → verify 404 or 409 depending on FK policy (category: data_integrity)
+6. Unique constraint with side-effect: POST /users with duplicate email → verify 409 → verify original user unchanged (category: business_rule)
+7. Budget threshold: Create budget with alert_threshold=80% → record costs pushing spend to 85% → verify budget_warning=true on next cost record (category: business_rule)
+**Deprioritise (low value):**
+- GET /products → 200 (trivial health check, no assertions beyond status)
+- Single-resource CRUD with no cross-resource or state verification
+- POST with missing field → 422 (obvious validation, covered by contract tests)`;
 }
 export function buildToolWorkflows(authHeaderValue, authTypeValue = "") {
     const isAuthorizationHeader = /^authorization$/i.test(authHeaderValue);
@@ -201,7 +121,10 @@ If you find auth requirements, pass the appropriate \`authHeader\` (e.g., "Autho
         authParams = { authHeader: "" };
     }
     else if (isAuthorizationHeader) {
-        authParams = { authHeader: "Authorization", authScheme: "<scheme e.g. Bearer, Token or empty>" };
+        const resolvedScheme = (authTypeValue && authTypeValue !== "none")
+            ? authTypeValue.charAt(0).toUpperCase() + authTypeValue.slice(1)
+            : "<scheme e.g. Bearer, Token or empty>";
+        authParams = { authHeader: "Authorization", authScheme: resolvedScheme };
         if (authTypeValue) {
             authGuidance = `**Auth Scheme:** The workspace \`api.authType\` is \`"${authTypeValue}"\`.
 **Where to find the scheme** (check in order):
@@ -284,65 +207,3 @@ ${PATH_PARAM_UUID_GUIDANCE}
 **For E2E tests:**
 Same trace flow, pass both trace file and playwright zip to \`skyramp_e2e_test_generation\`.`;
 }
-export function buildCoverageChecklist(openApiSpec, isUIOnlyPR, topN, maxGenerate = MAX_TESTS_TO_GENERATE, maxCritical = MAX_CRITICAL_TESTS) {
-    const specNote = openApiSpec
-        ? `\n**OpenAPI Spec available**: \`${openApiSpec.path}\`
-Use it actively:
-- **Contract tests**: pass \`apiSchema: "${openApiSpec.path}"\` — the CLI validates response schemas against the spec.
-- **Integration tests**: pass \`apiSchema\` to \`skyramp_scenario_test_generation\` — it extracts destination and request/response shapes.
-- **Single-endpoint tests**: pass both \`endpointURL\` AND \`apiSchema\` for schema-aware generation.
-\n`
-        : "";
-    return `## Coverage Checklist
-${specNote}
-${isUIOnlyPR ? `**UI-only PR** — no backend changes.
-Without Playwright traces, the testbot skips generation entirely — all recommendations
-become additionalRecommendations in the report.
-` : `**Available test types:** integration, contract, E2E, UI. No smoke, fuzz tests.
-Choose based on what adds the most value for this PR's changes.
-`}
-## For Each Recommendation Include:
-1. Test type  2. Category (security_boundary/business_rule/data_integrity/breaking_change/workflow)
-3. Priority (high/medium/low)  4. Target endpoint/scenario
-5. What it validates (business logic, not just "tests the endpoint")
-6. Skyramp tool call details — exact tool + key params for zero-editing execution
-7. For integration/E2E: reference draftedScenario by scenarioName
-8. Reasoning — WHY this test matters: the specific production risk it prevents,
-   the business rule it enforces, or the security boundary it validates.
-   Not "tests the endpoint" but "prevents duplicate user registration that would
-   corrupt the user table" or "validates cascade delete does not orphan order records"
-## When Artifacts Are Missing
-Recommend the test anyway — never mark it "blocked":
-- **No OpenAPI spec** \u2192 use \`endpointURL\` and \`requestBody\` from source code
-- **No Playwright recording** \u2192 provide trace recording instructions
-- **No backend trace** \u2192 use the scenario generation pipeline
-## Select the Top ${topN} (category-aware)
-Consider all possible tests (endpoints \u00d7 interaction types + scenarios), then:
-1. **Categorize** each candidate as: security_boundary, business_rule, breaking_change, data_integrity, or workflow
-2. **Select critical tests first**: From each critical category present (security_boundary, business_rule, data_integrity, breaking_change), select the highest-ranked candidate — up to ${Math.min(maxCritical, Math.min(maxGenerate, topN))} of the ${Math.min(maxGenerate, topN)} generated tests MUST be from critical categories
-3. **Fill remaining generated slots**: Fill the remaining ${Math.max(Math.min(maxGenerate, topN) - Math.min(maxCritical, Math.min(maxGenerate, topN)), 0)} slot(s) with the highest-ranked workflow/general candidates
-4. **Total generated**: EXACTLY ${Math.min(maxGenerate, topN)} tests (critical categories filled first, then general). Do NOT generate fewer.
-5. **Additional recommendations**: The remaining ${Math.max(topN - Math.min(maxGenerate, topN), 0)} recommendations (#${Math.min(maxGenerate, topN) + 1}\u2013${topN}) go to additionalRecommendations \u2014 these are recommended but not generated
-Include \`totalConsidered\` count in your output.
-Critical-category tests MUST be in the generated set even if a workflow test scores higher on sophistication.
-## Recommendation Stability
-When PR history is provided:
-- **Carry forward** previous additionalRecommendations that still apply — match by scenarioName (for multi-step scenarios) or by endpoint (for single-endpoint tests). Re-derive category and priority from test content.
-- **Only add** new recommendations for code paths introduced since the last run.
-- **Only drop** a previous recommendation if its target endpoint was removed, its business logic changed, or it is now covered by a generated test.
-- Each integration scenario's step sequence should be logically valid — preconditions
-  met by prior steps.
-Each recommendation should include enough detail for direct tool invocation.
-Reference draftedScenarios by name and interactions by description.
-Use "high"/"medium"/"low" for priority — no numeric scores.
-You MUST produce EXACTLY ${topN} total recommendations: ${Math.min(maxGenerate, topN)} to generate + ${Math.max(topN - Math.min(maxGenerate, topN), 0)} as additionalRecommendations. Do NOT produce fewer.
-Generate recommendations now.`;
-}