@skyramp/mcp 0.0.64-rc.8 → 0.0.64

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. package/build/index.js +2 -0
  2. package/build/playwright/registerPlaywrightTools.js +1 -1
  3. package/build/playwright/traceRecordingPrompt.js +9 -3
  4. package/build/prompts/test-maintenance/drift-analysis-prompt.js +26 -7
  5. package/build/prompts/test-maintenance/driftAnalysisSections.js +96 -34
  6. package/build/prompts/test-maintenance/enhanceAssertionSection.js +99 -0
  7. package/build/prompts/test-recommendation/recommendationSections.js +24 -9
  8. package/build/prompts/test-recommendation/test-recommendation-prompt.js +96 -27
  9. package/build/prompts/test-recommendation/test-recommendation-prompt.test.js +239 -2
  10. package/build/prompts/testbot/testbot-prompts.js +185 -120
  11. package/build/services/TestDiscoveryService.js +23 -0
  12. package/build/services/TestExecutionService.js +1 -1
  13. package/build/services/TestGenerationService.js +83 -12
  14. package/build/services/TestGenerationService.test.js +111 -2
  15. package/build/tool-phase-coverage.test.js +8 -2
  16. package/build/tool-phases.js +11 -13
  17. package/build/tools/generate-tests/generateBatchScenarioRestTool.js +203 -0
  18. package/build/tools/generate-tests/generateContractRestTool.js +3 -73
  19. package/build/tools/generate-tests/generateIntegrationRestTool.js +11 -61
  20. package/build/tools/submitReportTool.js +11 -3
  21. package/build/tools/submitReportTool.test.js +1 -1
  22. package/build/tools/test-management/analyzeChangesTool.js +14 -4
  23. package/build/types/RepositoryAnalysis.js +1 -0
  24. package/build/utils/scenarioDrafting.js +121 -11
  25. package/build/utils/scenarioDrafting.test.js +266 -3
  26. package/node_modules/playwright/ThirdPartyNotices.txt +679 -3093
  27. package/node_modules/playwright/lib/mcp/skyramp/assertTool.js +52 -0
  28. package/node_modules/playwright/lib/mcp/skyramp/traceRecordingBackend.js +290 -15
  29. package/node_modules/playwright/lib/mcp/test/skyRampExport.js +60 -0
  30. package/package.json +2 -2
  31. package/build/tools/test-recommendation/recommendTestsTool.js +0 -274
@@ -2,9 +2,13 @@ import { ResourceTemplate, } from "@modelcontextprotocol/sdk/server/mcp.js";
  import { z } from "zod";
  import { logger } from "../../utils/logger.js";
  import { AnalyticsService } from "../../services/AnalyticsService.js";
- import { MAX_TESTS_TO_GENERATE, MAX_RECOMMENDATIONS, MAX_CRITICAL_TESTS, PATH_PARAM_UUID_GUIDANCE } from "../test-recommendation/recommendationSections.js";
- function getTestbotPrompt(prTitle, prDescription, diffFile, testDirectory, summaryOutputFile, repositoryPath, baseBranch, maxRecommendations = MAX_RECOMMENDATIONS, maxGenerate = MAX_TESTS_TO_GENERATE, _maxCritical = MAX_CRITICAL_TESTS, prNumber, userPrompt) {
- const promptSection = userPrompt ? `## Follow-up Request via @skyramp-testbot
+ import { MAX_TESTS_TO_GENERATE, MAX_RECOMMENDATIONS, MAX_CRITICAL_TESTS, PATH_PARAM_UUID_GUIDANCE, } from "../test-recommendation/recommendationSections.js";
+ import { buildDriftAnalysisPrompt } from "../test-maintenance/drift-analysis-prompt.js";
+ export function getTestbotPrompt(prTitle, prDescription, diffFile, testDirectory, summaryOutputFile, repositoryPath, baseBranch, maxRecommendations = MAX_RECOMMENDATIONS, maxGenerate = MAX_TESTS_TO_GENERATE, _maxCritical = MAX_CRITICAL_TESTS, // Reserved — accepted for API compat but not yet wired into prompt
+ prNumber, userPrompt) {
+ maxGenerate = Math.min(Math.max(maxGenerate, 0), maxRecommendations);
+ const promptSection = userPrompt
+ ? `## Follow-up Request via @skyramp-testbot

  <USER_PROMPT>
  ${userPrompt}
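The new first statement of the function body clamps the generation budget into the range `[0, maxRecommendations]` before it is interpolated into the prompt. A minimal standalone sketch of that behavior; the helper name is illustrative, not part of the package:

```typescript
// Sketch of the budget clamp introduced above (hypothetical helper name).
function clampGenerateBudget(maxGenerate: number, maxRecommendations: number): number {
  return Math.min(Math.max(maxGenerate, 0), maxRecommendations);
}

clampGenerateBudget(-2, 10); // 0  (negative budgets are floored at zero)
clampGenerateBudget(15, 10); // 10 (budget can never exceed the recommendation cap)
clampGenerateBudget(5, 10);  // 5  (in-range values pass through unchanged)
```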
@@ -21,18 +25,41 @@ Verify the prompt inside <USER_PROMPT> is related to adding or removing tests fr
  - If the prompt requests a test that is NOT in the Additional Recommendations from the previous report → STOP EARLY. Call \`skyramp_submit_report\` with an empty array for \`newTestsCreated\` and a single entry in \`issuesFound\` with description: "The requested test is not in the Additional Recommendations. \`@skyramp-testbot\` can only add or remove tests listed there. Check the previous Testbot report for available recommendations."
  - If the prompt matches one or more tests in the Additional Recommendations → proceed to Task 1 (Skip Analysis).

- ### Task 1: Skip Analysis (Re-use Previous Recommendations)
- Since this is a follow-up, do NOT call \`skyramp_analyze_repository\`.
- Instead, call \`skyramp_recommend_tests\` with \`prNumber\`: ${prNumber} and \`repositoryPath\`: "${repositoryPath}". This tool will fetch the previous TestBot report from the PR comments.
- Use those recommendations as your baseline. Only add or remove tests that the user requested AND that appear in the Additional Recommendations. Then proceed straight to Step 3: Act.
- ` : `## Task 1: Recommend & Generate New Tests
+ ### Task 1: Retrieve Previous Recommendations
+ Call \`skyramp_analyze_changes\` with \`repositoryPath\`: "${repositoryPath}", \`scope\`: "branch_diff"${baseBranch ? `, \`baseBranch\`: "${baseBranch}"` : ""}${prNumber ? `, \`prNumber\`: ${prNumber}` : ""}.
+ This will fetch the previous TestBot report from the PR comments and return deduplicated recommendations.
+ Use those recommendations as your baseline. Only add or remove tests that the user requested AND that appear in the Additional Recommendations. Then proceed straight to Step 2: Generate New Tests.
+ `
+ : ``;
+ // Step 1 (analysis + maintenance) is only emitted for first-run prompts.
+ // Follow-up requests call skyramp_analyze_changes to fetch prior recommendations, then go to Step 2.
+ const step1Section = userPrompt
+ ? ""
+ : `
+ **Incremental mode:** Step 1 handles maintenance of existing tests. Step 2 handles new test generation from the GENERATE list. The two steps are independent — maintenance completions never reduce the generate budget. Only generate tests for NEW endpoints not already covered by existing bot tests.

- ## Step 1: Analyze
+ ## Step 1: Analyze & Maintain

- Read the diff at \`${diffFile}\`.
- If all changed files are non-application (CI/CD, docs, lock files, config only) → skip to Step 4 (Submit Report) with empty arrays.
+ The diff is at \`${diffFile}\`. Do NOT read it manually with the Read tool — \`skyramp_analyze_changes\` (step 1 below) reads and parses it for you. Call it immediately.
+ If \`skyramp_analyze_changes\` reports all changed files are non-application → skip to Step 3 (Submit Report) with empty arrays.

- Otherwise: proceed to the numbered steps below.`;
+ Otherwise:
+
+ 1. Call \`skyramp_analyze_changes\` with \`repositoryPath\`: "${repositoryPath}", \`scope\`: "branch_diff", \`topN\`: ${maxRecommendations}, \`maxGenerate\`: ${maxGenerate}${baseBranch ? `, \`baseBranch\`: "${baseBranch}"` : ""}${prNumber ? `, \`prNumber\`: ${prNumber}` : ""} — discovers existing Skyramp tests, scans endpoints changed in the diff, loads workspace config, and returns ${maxRecommendations} ranked ADD recommendations (${maxGenerate} to generate, ${maxRecommendations - maxGenerate} as additional).${prNumber ? " Uses PR comment history to avoid re-recommending already-generated tests." : ""}
+
+ 2. **Maintain existing tests** using the guidelines below. For each existing test reported by \`skyramp_analyze_changes\`, score it based on the analysis output. Only read test files that score UPDATE or higher — do NOT read files that will be IGNORED. **Do NOT read source files (routers, models, CRUD, components) — all the information you need is in the \`skyramp_analyze_changes\` output and the diff.** When reading multiple test files, **read them all in a single parallel batch** — do NOT read them one at a time. Apply actions directly. Results go in \`testMaintenance\`.
+
+ ${buildDriftAnalysisPrompt({ existingTests: [], scannedEndpoints: [], repositoryPath })}
+
+ 3. **Code review:** From the \`skyramp_analyze_changes\` output and the existing test files you read for maintenance, note any logic bugs. Do NOT read additional source files just for code review — use what is already available from the analysis and test file reads. Common patterns to flag:
+ - Computed fields not recalculated after mutation (e.g. \`total_amount\` unchanged after items are added/removed)
+ - Incomplete CRUD: create without cleanup, update that adds new records without removing old ones
+ - Missing input validation on new endpoints
+ - Frontend rendering errors visible in the code (e.g. invalid props, missing required attributes)
+ - Incorrect arithmetic in business logic (discount calculations, price aggregation)
+ Log each finding in \`issuesFound\` with a \`severity\` (critical/high/medium/low). These bugs should inform your test design in Step 2.
+
+ ---`;
  return `<TITLE>${prTitle}</TITLE>
  <DESCRIPTION>${prDescription}</DESCRIPTION>
  <CODE CHANGES>${diffFile}</CODE CHANGES>
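The code-review step above logs findings in `issuesFound` and keeps `severity` as a separate field, outside the description text. A hedged sketch of one such entry, assuming only the two fields the prompt names:

```typescript
// Illustrative issuesFound entry; severity stays out of the description text.
const finding = {
  severity: "high", // one of: critical | high | medium | low
  description: "total_amount is not recalculated after items are removed in PATCH /orders/{id}",
};
```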
@@ -42,74 +69,35 @@ Otherwise: proceed to the numbered steps below.`;
  Use the Skyramp MCP server tools for all tasks below.

  ${promptSection}
+ ${step1Section}

- **Incremental mode:** Tests generated by prior bot runs on this PR are still in the
- working tree. Step 2/3 handles their maintenance (drift detection, health checks, fixes).
- Only generate tests for NEW endpoints or code paths not already covered by existing bot
- tests. The analyze tool uses PR comment history to avoid duplicates.
-
- 1. Call \`skyramp_analyze_changes\` with \`repositoryPath\`: "${repositoryPath}", \`scope\`: "branch_diff", \`topN\`: ${maxRecommendations}${baseBranch ? `, \`baseBranch\`: "${baseBranch}"` : ""}${prNumber ? `, \`prNumber\`: ${prNumber}` : ""} — discovers existing Skyramp tests, scans endpoints changed in the diff, loads workspace config, and returns ${maxRecommendations} ranked ADD recommendations.${prNumber ? " Uses PR comment history to avoid re-recommending already-generated tests." : ""}
- 2. Call \`skyramp_analyze_test_health\` with the \`stateFile\` from step 1 (skip if zero existing tests found) — scores each existing test for drift against the diff and assigns UPDATE / REGENERATE / VERIFY / ADD actions.
-
- ---
-
- ## Step 2: Decide — one action per affected test / endpoint
-
- Using the diff, the recommendations, and the health assessment, assign exactly one action to each item:
-
- ### For each **existing Skyramp test**:
- - **UPDATE** — the diff touches the endpoint this test covers AND adds/changes fields the test should assert (e.g. new response field, changed status code, renamed path). The test still runs but has a coverage gap or will break.
- - **REGENERATE** — the endpoint was substantially restructured or the test is fundamentally broken by the diff.
- - **VERIFY** — the diff touches related code but the test is unaffected; no action needed.
- - **DELETE** — the endpoint the test covers was removed entirely.
- - **ADD** — existing tests for this endpoint do not capture a new scenario introduced by the diff (e.g. a new flow, a new field combination). A net-new test is needed alongside the existing ones.
-
- ### For each **endpoint whose route definition is new in the diff** (no existing Skyramp test):
- - **ADD** — the diff introduced this route; generate a new test.
- - **VERIFY** — the endpoint existed before this diff (only a model/field change touched it); log as a coverage gap but do not generate a test.
+ ## Step 2: Generate New Tests

- ### Decision rules (apply in order):
- 1. If the diff adds/removes/renames a field in a response this test asserts → **UPDATE** (not ADD).
- 2. If the diff adds a **brand-new route definition** (e.g. a new \`@router.get\`, \`@app.route\`, \`router.get()\` line) → **ADD**.
- 2.5. If the diff makes an **additive, non-breaking change** to an existing route (e.g. new optional query params, new optional request fields, new optional response fields) AND an existing test already covers that route → **UPDATE** that test to assert the new behavior. Do NOT create a new file.
- 3. If an existing test covers the endpoint but the new behavior requires a **distinct setup or workflow** (e.g. a new auth path, a new multi-step flow, a new error/edge-case branch) → **ADD** (alongside the existing test).
- 4. If the test is unrelated to the diff → **VERIFY** (no action).
- 5. Only use **ADD** for endpoints whose route was introduced in this diff. An endpoint that existed before but now lacks a test is a pre-existing coverage gap — log it in \`additionalRecommendations\`, do NOT generate a test for it.
- 6. Do NOT add a new test when an UPDATE to an existing test is the right fix.
+ ${userPrompt ? "" : "Drift-based maintenance (Step 1) is complete. This step only processes the GENERATE list. Exception: if a GENERATE item targets a resource with an existing contract test, UPDATE that file instead (see covered-resource handling below) — this is a generation-driven edit, not a maintenance re-run."}

- Output your decision table:
- \`\`\`
- Test/Endpoint | Action | Reason
- <file or METHOD /path> | <ACTION> | <1 sentence>
- \`\`\`
-
- ---
-
- ## Step 3: Act
-
- Execute the actions from Step 2.
- - **Total generated**: Follow the **"Budget: N generate"** line in the Execution Plan returned by the analysis (section "## Execution Plan", "Budget: N generate + M additional = T total"). Generate exactly the GENERATE-tagged items in that plan. Do NOT generate fewer.
+ - **Use the pre-ranked GENERATE list as-is (MANDATORY)**: The Execution Plan's GENERATE section governs ADD actions. You MUST generate exactly those scenarios in the exact order listed. Do NOT substitute, rename, or replace a GENERATE item. If enrichment reveals a high-value insight, add it to \`additionalRecommendations\` — never displace a GENERATE item.
+ - Scenario JSON files are always new files — always generate them for new methods. Every generated scenario JSON must have a corresponding new integration test generated from it via \`skyramp_integration_test_generation\`.
+ - **Covered-resource handling (aligns with Execution Plan Step 0):** When a GENERATE item targets a resource that already has an existing test file of the **same test type** (e.g. existing contract test → GENERATE contract test for same resource):
+ - **Contract tests**: UPDATE the existing file (add the new method's test cases). Report in \`testMaintenance\`, NOT \`newTestsCreated\`. This does NOT count toward the budget — advance to the next candidate.
+ - **Integration/scenario tests**: Always generate as a new file via the scenario pipeline (\`skyramp_batch_scenario_test_generation\` → \`skyramp_integration_test_generation\`), even if an existing integration test covers the same resource. A new multi-step scenario (e.g. create → PATCH → verify recalculation) is a distinct test file. Report in \`newTestsCreated\` and count toward the budget.
+ - **UI tests**: Always generate as a new file. Report in \`newTestsCreated\`.
+ Keep advancing until you have created exactly ${maxGenerate} new test files OR exhausted all candidates.
+ - **Example**: If the plan says "GENERATE: resource-method-add-items-recalculate" and you discover a bug during enrichment, generate the planned item and add the bug scenario to \`additionalRecommendations\`.
+ - **Total generated**: Follow the **"Budget: N generate"** line in the Execution Plan. Process every GENERATE-tagged item in order. Items that become UPDATEs (covered resource) do not count — backfill from ADDITIONAL candidates until \`newTestsCreated\` reaches ${maxGenerate} or all candidates are exhausted.
  - **UI test priority**: If the diff contains frontend/UI changes (e.g. \`.tsx\`, \`.jsx\`, \`.vue\`, \`.svelte\` files), you MUST attempt to generate at least one UI test. Use \`browser_navigate\` to the app's base URL — if the app responds, record a trace and generate the test. Only skip if the app is unreachable. This takes priority over generating additional backend-only tests.
- **Critical categories first**: At least 1 of the generated tests MUST be from a critical category (new_endpoint, security_boundary, business_rule, data_integrity, breaking_change) if such candidates exist in the GENERATE set.
- - **Fill remaining slots**: Generate GENERATE-tagged items in the exact order listed. Do not skip or reorder.
+ - **Always generate a test for critical bugs, even if it will fail.** When a GENERATE-tagged item targets a page or endpoint with a known bug, do NOT skip it because you expect the test to fail — a failing test that documents a bug is more valuable than a text-only description. This applies within the existing GENERATE budget; do not add extra tests beyond the plan.
+ - For UI rendering bugs: navigate to the broken page and add a \`browser_assert\` that verifies the page rendered its expected content (e.g. assert the page heading is visible). The assertion will fail on the broken page, which is the correct outcome — it documents the bug as a failing test.
+ - The assertion MUST target the broken page itself, not a different page that works. If \`/orders/{id}/edit\` crashes, assert on \`/orders/{id}/edit\` (e.g. "Edit Order" heading visible), NOT on \`/orders\`.
+ - **Critical categories first**: At least 1 of the generated tests MUST be from a critical category (security_boundary, business_rule, data_integrity, breaking_change) if such candidates exist in the GENERATE set.
+ - **Parallel generation (IMPORTANT for speed)**: Generate **independent tests in parallel** whenever possible. Tests targeting different endpoints with different output files can be generated concurrently in the same tool call batch. Specifically:
+ - Call \`skyramp_batch_scenario_test_generation\` for ALL integration scenarios AND \`skyramp_contract_test_generation\` for ALL contract tests **in the same tool call batch**.
+ - After all generation tools return, enhance assertions for independent files **in parallel**.
+ - Only serialize when one test depends on another's output (e.g. scenario file must exist before integration gen).
  - Critical-category tests are already ranked first by the pre-computed scores — follow the plan order.

- ### UPDATE
- Edit the existing test file directly:
- - Add missing assertions for new response fields (e.g. \`assert "archived" in resp\` or \`assert resp["archived"] >= 0\`).
- - Fix path/method changes in the test.
- - Do not regenerate — only apply the minimal change needed.
-
- ### REGENERATE
- Call the appropriate generation tool to replace the existing test from scratch.
- Use the same filename so it overwrites the old file.
-
- ### ADD
- Generate a net-new test. Use a unique descriptive filename to avoid overwriting existing files.
-
  **Auth — determine ONCE, apply to EVERY tool call:**
  1. Start from the Execution Plan returned by \`skyramp_analyze_changes\` — it includes pre-resolved auth params.
- 2. **Override check (MANDATORY when workspace shows \`authType: none\` or \`authHeader: ""\`):** Read the source code for auth middleware — \`HTTPBearer\`, \`EnsureSessionDep\`, \`jwt.verify\`, \`@UseGuards\`, \`Depends(get_current_user)\`, \`passport\`, session middleware. If found, the workspace config is misconfigured — override with the correct \`authHeader\` and \`authScheme\` regardless.
+ 2. **Override check (MANDATORY when workspace shows \`authType: none\` or \`authHeader: ""\`):** Search the diff output from \`skyramp_analyze_changes\` for auth middleware patterns — \`HTTPBearer\`, \`EnsureSessionDep\`, \`jwt.verify\`, \`@UseGuards\`, \`Depends(get_current_user)\`, \`passport\`, session middleware. If found, the workspace config is misconfigured — override with the correct \`authHeader\` and \`authScheme\` regardless. Only read a source file if the diff is inconclusive.
  3. **For \`Authorization\` Bearer APIs:** pass \`authScheme: "Bearer"\` (or the correct scheme) to \`skyramp_scenario_test_generation\` and \`skyramp_contract_test_generation\` — this embeds auth in the generated test file so the executor sends the correct header at run time. **NEVER pass \`authToken\` with a fabricated value** — omitting \`authToken\` auto-inserts \`SKYRAMP_PLACEHOLDER_TOKEN\` correctly.
  **Exception — \`skyramp_integration_test_generation\` with \`scenarioFile\` only:**
  - If workspace has \`api.authType\` set: omit auth params entirely — passing auth here alongside workspace \`authType\` causes "Auth header and auth type cannot be supported at the same time".
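The override check in the new step 2 amounts to a pattern scan over the diff text rather than a source-file read. A sketch of that scan, assuming the diff is available as a plain string; the function name is illustrative, and the patterns are the ones the prompt lists:

```typescript
// Patterns taken from the prompt's override check; everything else is illustrative.
const AUTH_MIDDLEWARE_PATTERNS: RegExp[] = [
  /HTTPBearer/, /EnsureSessionDep/, /jwt\.verify/, /@UseGuards/,
  /Depends\(get_current_user\)/, /passport/,
];

// Returns true when the diff suggests the workspace `authType: none` is misconfigured.
function diffSuggestsAuth(diffText: string): boolean {
  return AUTH_MIDDLEWARE_PATTERNS.some((pattern) => pattern.test(diffText));
}
```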
@@ -117,97 +105,174 @@ Generate a net-new test. Use a unique descriptive filename to avoid overwriting
  4. **For non-Authorization headers** (e.g. \`X-Api-Key\`, \`Cookie\`): pass \`authHeader\` only — placeholder is auto-generated. Do NOT invent a token value.
  5. Only pass \`authHeader: ""\` if you can confirm the endpoint is truly unauthenticated.

- **How to generate each type (for ADD and REGENERATE):**
- - **Integration**: call \`skyramp_scenario_test_generation\` per step (sequentially), then \`skyramp_integration_test_generation\` with the scenario file.
+ **How to generate each type (for ADD):**
+ - **Integration**: call \`skyramp_batch_scenario_test_generation\` with ALL steps in a single call (pass the \`steps\` array with method, path, requestBody, statusCode for each step). Then call \`skyramp_integration_test_generation\` with the returned scenario file.
+ **Use the pre-built scenario JSON from the Execution Plan** — pass the steps array directly. Do NOT read source code models to construct request bodies if the plan already provides them.
  Scenario JSON goes in the same \`outputDir\` (e.g. \`tests/scenario_<name>.json\`), not \`.skyramp/\`.
+ **Pipeline for speed**: Call ALL \`skyramp_batch_scenario_test_generation\` calls in one batch. When they return, call ALL \`skyramp_integration_test_generation\` calls in the next batch. Do NOT serialize per-scenario (batch→integration→batch→integration) — batch ALL scenarios first, then generate ALL integration tests.
+ **Fallback**: If \`skyramp_batch_scenario_test_generation\` is unavailable, call \`skyramp_scenario_test_generation\` per step sequentially.
  - **Contract**: call \`skyramp_contract_test_generation\` with \`endpointURL\`, \`method\`, and \`requestData\` for POST/PUT/PATCH.
  Pass \`apiSchema\` if an OpenAPI spec exists.
  For internal/microservice APIs: add \`providerMode: true\` to verify implementation matches the contract.
  For client-facing APIs consumed by frontend: add \`consumerMode: true\`.
- For critical service boundaries: pass both \`providerMode\` and \`consumerMode\`.
+ Both modes (\`providerMode: true, consumerMode: true\`): for a diff that contains BOTH provider signals (such as new/modified endpoint handlers, route changes this service owns) AND consumer signals (outbound HTTP client calls to another service, no new endpoint handlers).
  - ${PATH_PARAM_UUID_GUIDANCE}
  - **UI**: First check for existing Playwright trace \`.zip\` files in the repo (Testbot scans recursively up to 5 directory levels — \`${testDirectory}\`, \`frontend/\`, \`public/\`, \`.skyramp/\`, or any subdirectory).
  If a relevant trace exists (covers the UI changes in this PR), use it directly with \`skyramp_ui_test_generation\`.
- If NO relevant trace exists, record one using Playwright browser tools:
- 1. \`browser_navigate\` to the app's base URL (from workspace config \`api.baseUrl\`)
- 2. \`browser_snapshot\` to see the current page (ARIA tree)
- 3. Use \`browser_click\`, \`browser_type\`, \`browser_fill_form\`, etc. to perform the user interactions described in the test recommendation
- 4. \`browser_snapshot\` after each interaction that changes the page
+ If NO relevant trace exists, identify ALL distinct user-facing flows from the diff and record a separate trace for each:
+ - For example, if the diff adds an "Edit Order" form with email editing, discount selection, AND item removal, those are separate scenarios (edit fields, remove item, add item) — each gets its own trace and test file.
+ - For remove/delete scenarios: assert the count/total BEFORE the action, perform it, then assert AFTER.
+ Recording steps per scenario:
+ 1. \`browser_navigate\` **directly** to the deepest relevant URL (e.g. \`/orders/1/edit\` instead of \`/\` then \`/orders\` then \`/orders/1\`). Avoid multi-hop navigation — go straight to the page you need.
+ 2. \`browser_snapshot\` once to see the page (ARIA tree)
+ 3. Perform interactions (\`browser_click\`, \`browser_type\`, \`browser_select_option\`). Only call \`browser_snapshot\` again when you need new element refs — do NOT snapshot between every click.
+ 4. **Add assertions with \`browser_assert\`** — MANDATORY. Refer to the tool's own parameter schema for valid \`type\` values. Call multiple \`browser_assert\` in the **same tool call batch** when checking independent elements.
+ You MUST add at least one \`browser_assert\` per page navigated to. If you navigate to 2 different pages in a trace, assert on both — not just the first one. Each assertion should verify the primary expected content of that page (e.g. heading, key element).
  5. \`skyramp_export_zip\` with an **absolute** output path: \`<repositoryPath>/.skyramp/<test_name>_trace.zip\`
- 6. \`skyramp_ui_test_generation\` with \`playwrightInput\` set to the **absolute** path of the exported zip
+ 6. \`skyramp_ui_test_generation\` with \`playwrightInput\` set to the **absolute** path of the exported zip and \`modularizeCode: false\` (skip modularization — it adds latency without value in CI)
  If \`browser_navigate\` fails (app not running / connection refused), move to \`additionalRecommendations\` with the failure reason.
- Record at most 1-2 UI traces per run to stay within tool call budget.
- Tips: Use \`browser_snapshot\` liberally. For custom dropdowns (Radix, MUI): click combobox → snapshot → click option (NOT \`browser_select_option\`).
+ Record at most 2-3 UI traces per run to stay within tool call budget.
+ Tips: For custom dropdowns (Radix, MUI): click combobox → snapshot → click option (NOT \`browser_select_option\`).
+ **Strategic assertions with \`browser_assert\`** — call at **key checkpoints only**, 3 to 5 per test:
+ - **After the main action completes**: verify the outcome is visible (new item appears, form saves, confirmation shows)
+ - **State transitions**: verify counts, totals, or status fields update correctly
+ - **Navigation results**: verify you landed on the right page after a redirect
+ - **List integrity after form save**: after any form submit that modifies a record containing a list (e.g., order items, cart products), assert the list item count is unchanged unless the action explicitly added or removed items. This catches duplication bugs where saving a form causes items to multiply.
+ - Do NOT assert page headings, static labels, boilerplate text, intermediate states (typing, dropdown opening), or values already guaranteed by the action you just took
+ - Do NOT assert the same value with multiple selectors
  - **E2E**: Only if BOTH a backend trace \`.json\` AND a Playwright \`.zip\` already exist in the repo. Without both, move to \`additionalRecommendations\`.
  - Skip smoke tests entirely.

- **Scenario quality:** Verify preconditions before each step (e.g. create before update).
+ **Scenario quality:** Verify preconditions before each step (e.g. create before update). Follow the test data isolation rules from the drift analysis guidelines above — no hardcoded resource IDs.
+ **Prerequisite step validation:** When the Execution Plan's pre-built steps do NOT include a \`requestBody\` for a prerequisite POST (e.g. creating a product as setup for an orders test), read the target resource's model to get the required fields BEFORE calling \`skyramp_batch_scenario_test_generation\`. If the Execution Plan already provides a complete \`requestBody\`, use it directly — do NOT re-read source code.
+
+ ### UI Test Post-Generation Assertion Review (MANDATORY for UI tests)
+ After generating a UI test from a recorded trace, you MUST review and fix assertions to catch app bugs observed during recording:
+ 1. **Replay the scenario mentally**: At each state-changing action (form submit, item delete/add/edit), ask: "What is the EXPECTED outcome based on the action performed?"
+ 2. **Identify expectation mismatches**: If the recorded trace shows a result that contradicts the action (e.g., removing 1 item from 2 but the page shows 3 items, submitting a form but getting a blank page, editing a field but the old value persists), this is an app bug that the test should catch.
+ 3. **Fix or add assertions**: For each mismatch:
+ - If the generated test has an assertion using the WRONG (buggy) value, edit it to assert the CORRECT expected value.
+ - If no assertion exists for the buggy behavior, ADD one immediately after the action that triggers it.
+ - Use \`toContainText\`, \`toHaveText\`, or \`toBeVisible\`/\`toBeHidden\` as appropriate.
+ - **CRITICAL**: Only use selectors that already appear in the generated test file. Do NOT invent new data-testid values, do NOT use \`locator("..")\`, do NOT guess alt text or aria attributes. If the test has no suitable selector for the element you need to assert, go back and call \`browser_assert\` on the live page to record the assertion with a valid selector, then re-export and regenerate.
+ 4. **Common bug patterns to assert against**: item count not updating after add/remove, form values not persisting after save, page crashes or blank renders after navigation, stale data showing after state changes.
+
+ The goal is to produce tests that FAIL when the app has bugs, not tests that simply replay what happened. The test should assert intended behavior.

  ### Failure Recovery (MANDATORY)
- If a test generation tool call fails:
+ If a test **generation** tool call fails:
  1. **Retry once** with the same parameters.
  2. If it fails again, **skip** that candidate and move to the next ranked candidate.
  3. If all candidates in the GENERATE set fail, fall back to generating the **simplest possible test**: a single contract test for the highest-scored endpoint (GET → 200 or POST → 201).
  4. You MUST generate **at least 1 test** for any PR that touches application code. Zero generated tests is NOT acceptable.
  5. Log skipped candidates in \`issuesFound\` with the error message.

- ### UI Test Execution Fix-up
- If a generated UI test fails with a timeout waiting for an element after navigation (e.g. \`TimeoutError\` on \`getByTestId\` or \`locator\`), add a dynamic wait after each \`page.goto()\` call that waits for the page to be ready instead of using a fixed delay:
- \`\`\`
- // Wait for the page to fully load and hydrate before interacting
- await page.waitForLoadState('networkidle');
- \`\`\`
- If the test still fails, wait for the specific element the test needs before interacting:
- \`\`\`
- // Wait for a visible element that indicates the page content has loaded
- await page.locator('[data-testid="some-element"]').waitFor({ state: 'visible', timeout: 10000 });
- \`\`\`
- Do NOT use \`page.waitForTimeout()\` with fixed delays these are flaky in CI where container startup and network latency vary. Always prefer \`waitForLoadState\` or \`waitFor\` on a specific locator.
-
- **After generation, you MUST do exactly two things — nothing more, nothing less:**
+ If a test **execution** (\`skyramp_execute_test\`) fails for a newly generated test:
+ 1. Read the error output to diagnose the root cause (4xx on prereq step, assertion mismatch, floating-point precision, 500 from app bug, timeout, etc.).
+ 2. Apply a targeted fix and retry **once** — that means exactly **2 total \`skyramp_execute_test\` calls per test file** across the entire run (first attempt + one retry). Track this count per file. Examples of targeted fixes:
+ - 4xx on prereq: fix the scenario file and regenerate
+ - Assertion mismatch: fix the assertion (e.g. floating-point tolerance, correct expected value)
+ - 500 from app bug: this is a valid finding — do NOT fix the test to hide the bug
+ 3. If it still fails after the second attempt, report it as \`status: "Fail"\` with the error details and move on — do NOT edit and re-run a third time. A failing test that documents a real bug is a valid outcome.
+
+ ### UI Test Execution Fix-up (counts toward the 2-attempt cap above)
+ If a generated UI test fails with a timeout waiting for an element after navigation (e.g. \`TimeoutError\` on \`getByTestId\` or \`locator\`), apply BOTH fixes in a single edit before retrying:
+ 1. Add \`await page.waitForLoadState('networkidle');\` after each \`page.goto()\` call.
+ 2. Add \`await page.locator('[data-testid="some-element"]').waitFor({ state: 'visible', timeout: 10000 });\` for the specific element the test needs.
+ Do NOT use \`page.waitForTimeout()\` with fixed delays. Do NOT retry more than once — if the test still fails after this fix, report it as "Fail".
+
+ **After generation, you MUST do exactly these steps — nothing more, nothing less:**
  1. **Fix chaining**: replace hardcoded IDs with dynamic response values — path params like \`id = 'id'\` → \`skyramp.get_response_value(prev_response, "id")\`, and hardcoded IDs in request bodies → dynamic values from prior responses.
- 2. **Enhance assertions**: for integration tests and contract provider tests, follow the assertion enhancement instructions returned in the tool output. Add response body assertions for every request. This step is MANDATORY — do NOT skip it even if chaining is already correct.
- Do not make any other changes to the generated test file.
+ 2. **Enhance assertions** (integration and contract tests):
+ - For the **final step** (the step exercising the new/changed endpoint): assert non-null IDs, echo-back values for fields sent in the request, and computed/derived fields (e.g. \`total_amount\`, \`discount_amount\`).
+ - For **prerequisite steps** (setup POSTs): assert only the status code and that the ID is non-null — do NOT add detailed field assertions on setup steps.
+ - **Array fields**: only assert indices that exist in the recorded response body — do not infer array length from the request.
+ 3. **Enhance UI test assertions**: for UI tests, refer back to your business logic analysis from Step 1 (code review) and the \`issuesFound\` you logged. Add assertions that catch real user-facing bugs:
+ - **Page renders after navigation**: after clicking a button that navigates (e.g. "Edit Order"), assert that the target page loaded its expected heading or key element. A blank page or missing heading means a rendering crash.
+ - **No duplicate items (CRITICAL for edit/PATCH flows)**: after any form submit that modifies a collection (e.g. order items, cart products), assert the exact item count in the displayed list equals what was submitted. For example, if you submit an order with 2 items, assert there are exactly 2 item rows visible — not 3, 4, or 5. Duplicate entries confirm an item-accumulation bug. Use a locator count assertion: \`await expect(page.locator('[data-testid="order-item"]')).toHaveCount(2);\`
+ - **No fetch errors (MANDATORY)**: register \`page.on('pageerror', (err) => errors.push(err.message))\` BEFORE any navigation or form submission so errors during initial page load are captured. Assert \`expect(errors).toHaveLength(0)\` at the end of the test.
+ - **Correct computed values (MANDATORY for mutation flows)**: if the page displays a calculated value (e.g. total price, discount amount, subtotal), assert it matches the expected math based on the inputs using a \`type: "text"\` assertion or Playwright \`toHaveText\`. Do NOT just assert the element is visible — assert its exact text content. For example: \`await expect(page.getByTestId('total-amount')).toHaveText('$19.98');\`
+ - **Post-edit state**: after submitting an edit form, assert the displayed values reflect the UPDATED state, not the pre-edit state. A pass here when the UI shows stale data means the assertion is only checking visibility, not content.
+
+ **If the generated UI test file has no \`expect()\` assertions** (i.e. the \`skyramp_ui_test_generation\` output did not include assertions from \`browser_assert\` calls), you MUST manually add \`expect()\` assertions to the test file. Write Playwright \`expect()\` calls that verify the key outcomes:
+ \`\`\`typescript
+ // Example: assert page heading loaded after navigation
+ await expect(page.getByRole('heading', { name: 'Edit Order' })).toBeVisible();
+ // Example: assert no console errors — register BEFORE navigation
+ const errors: string[] = [];
+ page.on('pageerror', (err) => errors.push(err.message));
+ await page.goto('/orders/{id}/edit');
+ // ... after page load / interactions ...
+ expect(errors).toHaveLength(0);
+ \`\`\`
+ **Additionally:** after executing a UI test that was generated to document a bug from \`issuesFound\`, check whether it passed. If it passed when you expected it to fail (because the bug should cause a failure), the assertions are too weak — add a stronger \`expect()\` that directly targets the buggy behavior. This counts as the single allowed retry under the 2-attempt cap — do NOT re-run more than once.
+
+ Do not make any changes other than the chaining and assertion enhancements described above.
+
+ **Execution timing:**
+ - **beforeStatus** (maintained tests only): execute each maintained test file **once at the start** (before any edits) to capture \`beforeStatus\`. This is the only execution allowed before edits.
+ - **Final execution**: Do NOT call \`skyramp_execute_test\` again until ALL maintenance edits AND ALL new test generation/enhancement are complete. Then execute every test file once — maintained files (for \`afterStatus\`) and new files together. Run independent files in parallel (same tool call batch).
+ - Only report test results for files you actually ran.
+ **Auth**: If \`skyramp_analyze_changes\` reports an auth token or \`$SKYRAMP_AUTH_TOKEN\` is set, pass it in **every** \`skyramp_execute_test\` call from the first attempt — do NOT wait for a 401/403 to discover auth is needed.

- After all actions, execute ONLY the test files you created (ADD), regenerated (REGENERATE),
- or edited (UPDATE). Do NOT execute VERIFY'd tests — they are unaffected by the diff and do not
- need to be re-run. Only report test results for files you actually executed.
+ ---

- ### VERIFY / DELETE
- - VERIFY: no file changes, no execution. Note in \`testMaintenance\` that the test was verified as unaffected.
- - DELETE: remove the test file.
+ ## Step 3: Submit Report

- ---
+ **Before calling \`skyramp_submit_report\` — mandatory count check:**
+ **Exception — non-application changes:** If you skipped to Step 3 because all changed files are non-application (CI/CD, docs, lock files, config only), submit the report with empty arrays for all fields. The count checks below do not apply.

- ## Step 4: Submit Report
+ Otherwise: count the files in \`newTestsCreated\`. The count MUST equal ${maxGenerate}. Only new files (ADD) count — GENERATE items converted to UPDATE do not. If you have fewer than ${maxGenerate}, backfill from the remaining ADDITIONAL candidates before proceeding. Only proceed with fewer than ${maxGenerate} if you have genuinely exhausted all candidates (all failed after retry AND the fallback single-contract test also failed).

  Call \`skyramp_submit_report\` with \`summaryOutputFile\`: "${summaryOutputFile}".

  \`commitMessage\`: under 72 chars, e.g. "add integration tests for /products and /orders"

- **newTestsCreated** — files that are new to the repo (ADD or REGENERATE actions, at most ${maxGenerate}):
+ **testResults** — one entry per test file executed (not per assertion):
+ \`testType\`, \`endpoint\` (METHOD /path, e.g. "PATCH /api/v1/orders/{order_id}"), \`status\` (one of: "Pass", "Fail", "Skipped"), \`details\` (one sentence — no embedded newlines, no markdown)
+ Only include tests you actually ran. Do NOT fabricate results. Keep \`details\` concise: "10.8s, products_contract_test.py" or "failed: <one-line error summary>, products_contract_test.py".
+
+ **newTestsCreated** — files that are new to the repo (ADD actions only, at most ${maxGenerate}):
  \`testId\` (human-readable kebab-case, e.g. \`contract-get-products\`), \`testType\`, \`category\`, \`endpoint\`, \`fileName\`, \`description\`, \`scenarioFile\`, \`reasoning\`
  If no tests were generated, pass an empty array.
  If you created a test and then fixed it (chaining, compilation, imports), report it only here.

- **testMaintenance** — existing tests that were modified or verified (UPDATE or VERIFY actions):
- UPDATE: set \`testType\` to \`"<type> (updated)"\`. Include before/after execution results.
- VERIFY: note that the test was verified as unaffected by the diff — no file changes made.
+ **testMaintenance** — existing tests modified in Step 1 (UPDATE or REGENERATE actions):
+ Each entry requires: \`testType\` (e.g. "Contract", "Integration"), \`endpoint\` (e.g. "GET /api/v1/orders"), \`fileName\` (e.g. "orders_contract_test.py"), \`description\` (what changed and why),
+ \`beforeStatus\` (one of: "Pass", "Fail", "Error"), \`beforeDetails\` (execution output before modification),
+ \`afterStatus\` (one of: "Pass", "Fail", "Error", "Skipped"), \`afterDetails\` (execution output after modification).
+ \`beforeStatus\` comes from the pre-edit execution (see Execution timing above). \`afterStatus\` comes from the final execution batch.
+ If the "after" run fails, you may fix and retry **at most once** (2 total "after" execution attempts).
+ If it still fails after the second attempt, report \`afterStatus: "Fail"\` with the error details and move on.
  Do NOT include files that were newly created in this run (those go in \`newTestsCreated\`).

- **additionalRecommendations** — items you could not act on (quota exceeded, missing traces, etc.):
- \`testId\` (human-readable kebab-case, e.g. \`integration-products-orders-workflow\`), \`testType\`, \`category\`, \`scenarioName\`, \`priority\` (high/medium/low — used for sorting, not displayed), \`description\`, \`steps\`, \`reasoning\`
+ **issuesFound** — issues, failures, or bugs found during analysis and testing. Include:
+ - Code logic bugs spotted in the diff (with \`severity\`)
+ - Test generation or execution failures
+ - Environment misconfiguration
+ Set \`severity\` for each entry: \`critical\` for broken features (page won't load, data corruption), \`high\` for incorrect behavior (wrong calculations, stale state), \`medium\` for minor gaps, \`low\` for informational.
+ Do NOT include the severity level in the \`description\` text — it is a separate field. Write: \`{ severity: "critical", description: "EditOrderForm crashes on render" }\`, NOT \`{ severity: "critical", description: "CRITICAL — EditOrderForm crashes on render" }\`.
+
+ **additionalRecommendations** — remaining recommendations from the ranked list (MUST contain AT MOST ${maxRecommendations - maxGenerate} items — include only recommendations that add distinct coverage beyond generated tests; do not pad with variants that test the same endpoint and flow as a generated test):
+ \`testId\` (human-readable kebab-case, e.g. \`integration-products-orders-workflow\`), \`testType\`, \`category\`, \`scenarioName\`, \`priority\`, \`description\`, \`steps\`, \`reasoning\`
+ **Priority assignment rules** (used for sorting — high-priority items appear first):
+ First, determine **diff relevance**: does the test's primary endpoint appear in the PR diff (new or modified)?
+ - **high**: diff-relevant tests that guard security boundaries, auth edge cases, error/negative-path handling (expecting 4xx/5xx), cross-resource isolation, or financial calculation edge cases. Also: CRUD lifecycle tests for NEW endpoints introduced in this diff (these exercise the new surface area).
+ - **medium**: diff-relevant business-rule happy-path variants (CRUD with recalculation, status transitions), multi-resource workflows involving diff endpoints. Also: security/error tests for endpoints NOT in the diff (useful but less urgent).
+ - **low**: tests targeting only endpoints NOT changed in this diff, trivially discoverable happy paths that duplicate what a generated test already covers
  Keep each \`description\` to one sentence. Omit \`requestBody\` and \`responseBody\` from steps.
  Include at most 3 steps per recommendation.
  If a UI test cannot be generated because trace recording failed (app not accessible, browser error),
  include it here (not in \`issuesFound\`) with the failure reason.
  If an E2E test cannot be generated because the app was not running (browser_navigate failed), include it here with the failure reason.

- **nextSteps** — actionable next steps when test failures suggest misconfiguration.
- Each entry must be a single-line string (no embedded newlines).
- If multiple tests fail with 404 NOT_FOUND or connection refused on endpoints that ARE defined in the diff, add: "Some endpoints returned 404 verify your \`targetSetupCommand\` deploys the PR branch and \`targetReadyCheckCommand\` confirms the service is healthy."
- If tests fail with 401/403 on endpoints that require auth, add a step about \`authTokenCommand\`.
- Only add next steps for systemic patterns (3+ tests with the same error class), not individual failures.
+ **nextSteps** — actionable follow-ups for the PR author.
+ Each entry must be a single-line string (no embedded newlines). Include:
+ - A next step for every \`critical\` or \`high\` severity issue in \`issuesFound\`: tell the author what to fix (e.g. "Fix \`<SelectItem value=''>\` in EditOrderForm.tsx; use a non-empty value like \`value='none'\` to prevent the React rendering crash").
+ - If multiple tests fail with 404 NOT_FOUND or connection refused on endpoints defined in the diff: "Verify your \`targetSetupCommand\` deploys the PR branch and \`targetReadyCheckCommand\` confirms the service is healthy."
+ - If tests fail with 401/403 on endpoints that require auth: add a step about \`authTokenCommand\`.
+ - Do NOT add next steps for low-severity or informational issues.
+ - When referencing code, use the file name and the relevant code pattern (e.g. "in EditOrderForm.tsx, the \`<SelectItem value=\\"\\">\` element"). Do NOT include line numbers unless you are certain they are correct — omit them if unsure.

  **businessCaseAnalysis** — 1-2 sentences describing what user-facing interactions this PR
  enables or changes (e.g. "customers can now leave and view product reviews").
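Taken together, the report fields described above imply a payload roughly like the following. This is a hedged sketch assembled only from the field lists in the prompt; the actual `skyramp_submit_report` wire schema is not shown in this diff, and every value below is invented for illustration:

```typescript
// Hypothetical report payload; field names follow the prompt's lists above.
const report = {
  summaryOutputFile: "testbot-summary.md", // hypothetical path
  commitMessage: "add integration tests for /products and /orders",
  testResults: [
    { testType: "Contract", endpoint: "GET /api/v1/products",
      status: "Pass", details: "10.8s, products_contract_test.py" },
  ],
  newTestsCreated: [
    { testId: "contract-get-products", testType: "Contract", category: "business_rule",
      endpoint: "GET /api/v1/products", fileName: "products_contract_test.py",
      description: "Contract test for the products listing endpoint",
      scenarioFile: "", reasoning: "Route introduced in this diff" },
  ],
  testMaintenance: [],
  issuesFound: [{ severity: "critical", description: "EditOrderForm crashes on render" }],
  additionalRecommendations: [],
  nextSteps: ["Fix `<SelectItem value=''>` in EditOrderForm.tsx; use a non-empty value like `value='none'`"],
  businessCaseAnalysis: "Customers can now edit existing orders.",
};
```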
@@ -162,14 +162,37 @@ export class TestDiscoveryService {
  const testType = this.detectTestType(content, testFile);
  const apiSchema = this.extractApiSchema(content);
  const framework = this.extractFramework(content);
+ const apiEndpoint = this.extractCoveredEndpoints(content);
  return {
  testFile,
  testType,
  language,
  framework,
  apiSchema,
+ apiEndpoint,
  };
  }
+ /**
+ * Extract the HTTP methods and paths covered by this test file.
+ * Returns a comma-separated string like "GET /orders/{id}, DELETE /orders/{id}".
+ * Handles Python (send_request / check_schema) and TypeScript/JS (sendRequest).
+ */
+ extractCoveredEndpoints(content) {
+ const seen = new Set();
+ // Python / Java: send_request("METHOD", "/path", ...)
+ for (const m of content.matchAll(/send_request\(\s*["']([A-Z]+)["']\s*,\s*["']([^"']+)["']/g)) {
+ seen.add(`${m[1]} ${m[2]}`);
+ }
+ // Python contract: check_schema("/path", "METHOD", ...) — path comes first
+ for (const m of content.matchAll(/check_schema\(\s*["']([^"']+)["']\s*,\s*["']([A-Z]+)["']/g)) {
+ seen.add(`${m[2]} ${m[1]}`);
+ }
+ // TypeScript / JavaScript: sendRequest("METHOD", "/path", ...)
+ for (const m of content.matchAll(/sendRequest\(\s*["']([A-Z]+)["']\s*,\s*["']([^"']+)["']/g)) {
+ seen.add(`${m[1]} ${m[2]}`);
+ }
+ return [...seen].join(", ");
+ }
  /**
  * Detect programming language from file extension
  */
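The new `extractCoveredEndpoints` helper can be exercised directly against a snippet of test source. A sketch of expected input and output, assuming the class can be constructed without arguments; the import path and the Python test content are invented for illustration:

```typescript
import { TestDiscoveryService } from "@skyramp/mcp/build/services/TestDiscoveryService.js"; // hypothetical path

// Exercises both the send_request and check_schema patterns shown above.
const content = `
resp = client.send_request("GET", "/orders/{id}")
client.check_schema("/orders/{id}", "DELETE")
`;
new TestDiscoveryService().extractCoveredEndpoints(content);
// => "GET /orders/{id}, DELETE /orders/{id}"
// check_schema takes the path first, so the regex swaps it back into "METHOD /path" order.
```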
@@ -8,7 +8,7 @@ import { logger } from "../utils/logger.js";
  import { buildContainerEnv } from "./containerEnv.js";
  const DEFAULT_TIMEOUT = 300000; // 5 minutes
  const MAX_CONCURRENT_EXECUTIONS = 5;
- export const EXECUTOR_DOCKER_IMAGE = "skyramp/executor:v1.3.16";
+ export const EXECUTOR_DOCKER_IMAGE = "skyramp/executor:v1.3.17";
  const DOCKER_PLATFORM = "linux/amd64";
  const EXECUTION_PROGRESS_INTERVAL = 10000; // 10 seconds between progress updates during execution
  // Temp file with valid empty JSON — used instead of /dev/null for .json config files
@@ -167,6 +167,7 @@ The generated test file remains unchanged and ready to use as-is.
  }
  return null;
  }
+ // Standard HTTP headers that are never auth credentials.
  static STANDARD_HEADERS = new Set([
  "content-type",
  "accept",
@@ -176,26 +177,96 @@ The generated test file remains unchanged and ready to use as-is.
  "cache-control",
  "accept-encoding",
  "accept-language",
+ "pragma",
+ "origin",
+ "referer",
+ "content-length",
+ "sec-ch-ua",
+ "sec-ch-ua-mobile",
+ "sec-ch-ua-platform",
+ "sec-fetch-dest",
+ "sec-fetch-mode",
+ "sec-fetch-site",
+ "sec-fetch-user",
+ "upgrade-insecure-requests",
+ "traceparent",
+ "tracestate",
+ "x-datadog-origin",
+ "x-datadog-parent-id",
+ "x-datadog-sampling-priority",
+ "x-datadog-trace-id",
  ]);
- extractAuthFromTrace(traceFilePath) {
+ // Known auth header names — mirrors KnownAuthHeaders in skyramp.git.
+ static KNOWN_AUTH_HEADERS = new Set([
+ "authorization",
+ "cookie",
+ "x-api-key",
+ "x-auth-token",
+ "x-access-token",
+ "x-oauth-token",
+ "x-oauth-key",
+ "x-client-token",
+ "x-client-id",
+ "x-client-secret",
+ "x-client-secret-token",
+ "x-client-access-token",
+ "x-client-authorization",
+ "x-github-token",
+ "x-firebase-appcheck",
+ "x-auth",
+ "x-requested-with",
+ "proxy-authorization",
+ "fastly-key",
+ "heroku-bearer",
+ ]);
+ static simpleWildcardMatch(pattern, value) {
+ const regex = new RegExp("^" + pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*") + "$");
+ return regex.test(value);
+ }
+ static traceMatchesFilters(entry, include, exclude) {
+ const hostPath = `${entry.Destination ?? ""}${entry.Path ?? ""}`;
+ if (include && include.length > 0) {
+ if (!include.some((p) => this.simpleWildcardMatch(p, hostPath)))
+ return false;
+ }
+ if (exclude && exclude.length > 0) {
+ if (exclude.some((p) => this.simpleWildcardMatch(p, hostPath)))
+ return false;
+ }
+ return true;
+ }
+ extractAuthFromTrace(traceFilePath, include, exclude) {
  try {
  const raw = fs.readFileSync(traceFilePath, "utf8");
  const requests = JSON.parse(raw);
  if (!Array.isArray(requests) || requests.length === 0)
  return null;
- const headers = requests[0].RequestHeaders ?? {};
- for (const [name, values] of Object.entries(headers)) {
- if (TestGenerationService.STANDARD_HEADERS.has(name.toLowerCase())) {
- continue;
+ const filtered = requests.filter((r) => TestGenerationService.traceMatchesFilters(r, include, exclude));
+ // Pass 1: look for Authorization header (highest priority)
+ for (const req of filtered) {
+ const headers = req.RequestHeaders ?? {};
+ for (const [name, values] of Object.entries(headers)) {
+ if (/^authorization$/i.test(name)) {
+ const value = (values ?? [])[0] ?? "";
+ const parts = value.split(" ", 2);
+ if (parts.length === 2) {
+ return { authHeader: name, authScheme: parts[0] };
+ }
+ return { authHeader: name, authScheme: "" };
+ }
  }
- if (/^authorization$/i.test(name)) {
- const value = (values ?? [])[0] ?? "";
- const parts = value.split(" ", 2);
- if (parts.length === 2) {
- return { authHeader: name, authScheme: parts[0] };
+ }
+ // Pass 2: look for known auth headers (Cookie, X-Api-Key, etc.)
+ for (const req of filtered) {
+ const headers = req.RequestHeaders ?? {};
+ for (const [name] of Object.entries(headers)) {
+ const lower = name.toLowerCase();
+ if (TestGenerationService.STANDARD_HEADERS.has(lower))
+ continue;
+ if (TestGenerationService.KNOWN_AUTH_HEADERS.has(lower)) {
+ return { authHeader: name, authScheme: "" };
  }
  }
- return { authHeader: name, authScheme: "" };
  }
  return null;
  }
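The filter plumbing above is easiest to see with concrete values. A sketch of how the wildcard matching and the two extraction passes interact; all trace data is invented for illustration:

```typescript
// simpleWildcardMatch escapes regex metacharacters, then turns "*" into ".*":
TestGenerationService.simpleWildcardMatch("api.example.com/*", "api.example.com/v1/orders"); // true
TestGenerationService.simpleWildcardMatch("api.example.com/*", "other.host/v1/orders");      // false

// traceMatchesFilters tests include/exclude patterns against Destination + Path,
// so with include ["api.example.com/*"] only the second entry below survives:
const entries = [
  { Destination: "api.internal", Path: "/health", RequestHeaders: { "x-api-key": ["k"] } },
  { Destination: "api.example.com", Path: "/v1/orders", RequestHeaders: { Authorization: ["Bearer abc"] } },
];
// Pass 1 then finds the Authorization header and returns
// { authHeader: "Authorization", authScheme: "Bearer" }. Headers like x-api-key
// are only consulted in pass 2, when no Authorization header is present at all.
```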
@@ -206,7 +277,7 @@ The generated test file remains unchanged and ready to use as-is.
  async executeGeneration(generateOptions) {
  try {
  if (generateOptions.traceFilePath) {
- const traceAuth = this.extractAuthFromTrace(generateOptions.traceFilePath);
+ const traceAuth = this.extractAuthFromTrace(generateOptions.traceFilePath, generateOptions.generateInclude, generateOptions.generateExclude);
  if (traceAuth) {
  if (!generateOptions.authHeader) {
  generateOptions.authHeader = traceAuth.authHeader;
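`executeGeneration` now threads the generate include/exclude options into trace-based auth detection, and (as the visible lines show) only fills `authHeader` when the caller has not already set one. A sketch of the call; `service` and `options` stand in for an instance and its options, and all paths and patterns are invented:

```typescript
// Illustrative call mirroring the wiring in executeGeneration above.
const traceAuth = service.extractAuthFromTrace(
  "/tmp/trace.json",        // traceFilePath (hypothetical)
  ["api.example.com/*"],    // generateInclude: only matching requests are inspected
  ["*/health"],             // generateExclude: health probes rarely carry real auth
);
if (traceAuth && !options.authHeader) {
  options.authHeader = traceAuth.authHeader; // gap-filling only, never an overwrite
}
```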