@skyramp/mcp 0.1.8 → 0.2.0-rc.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/playwright/registerPlaywrightTools.js +12 -0
- package/build/playwright/traceRecordingPrompt.js +15 -0
- package/build/prompts/test-recommendation/diffExecutionPlan.js +31 -0
- package/build/prompts/test-recommendation/recommendationSections.js +1 -2
- package/build/prompts/test-recommendation/test-recommendation-prompt.test.js +94 -0
- package/build/prompts/testbot/testbot-prompts.js +115 -11
- package/build/prompts/testbot/testbot-prompts.test.js +79 -0
- package/build/resources/testbotResource.js +1 -1
- package/build/services/ScenarioGenerationService.integration.test.js +158 -0
- package/build/services/ScenarioGenerationService.js +36 -3
- package/build/services/ScenarioGenerationService.test.js +158 -22
- package/build/tools/generate-tests/generateBatchScenarioRestTool.js +16 -4
- package/build/tools/generate-tests/generateIntegrationRestTool.js +2 -0
- package/build/tools/generate-tests/generateUIRestTool.js +2 -0
- package/build/tools/test-management/analyzeChangesTool.js +7 -1
- package/build/utils/routeParsers.js +12 -0
- package/node_modules/playwright/ThirdPartyNotices.txt +6 -6
- package/node_modules/playwright/lib/dom-analyzer/analyze.js +111 -0
- package/node_modules/playwright/lib/dom-analyzer/blueprint.js +1161 -0
- package/node_modules/playwright/lib/dom-analyzer/blueprint.test.js +396 -0
- package/node_modules/playwright/lib/dom-analyzer/blueprintCache.js +57 -0
- package/node_modules/playwright/lib/dom-analyzer/blueprintCache.test.js +57 -0
- package/node_modules/playwright/lib/dom-analyzer/blueprintDiff.js +250 -0
- package/node_modules/playwright/lib/dom-analyzer/blueprintDiff.test.js +298 -0
- package/node_modules/playwright/lib/dom-analyzer/crawler.js +384 -0
- package/node_modules/playwright/lib/dom-analyzer/curatedWidgets.js +73 -0
- package/node_modules/playwright/lib/dom-analyzer/dynamicId.js +43 -0
- package/node_modules/playwright/lib/dom-analyzer/dynamicId.test.js +85 -0
- package/node_modules/playwright/lib/dom-analyzer/fingerprint.js +90 -0
- package/node_modules/playwright/lib/dom-analyzer/fingerprint.test.js +231 -0
- package/node_modules/playwright/lib/dom-analyzer/fingerprintAblation.fixtures.js +145 -0
- package/node_modules/playwright/lib/dom-analyzer/fingerprintAblation.test.js +41 -0
- package/node_modules/playwright/lib/dom-analyzer/graph.js +36 -0
- package/node_modules/playwright/lib/dom-analyzer/liveFingerprints.js +43 -0
- package/node_modules/playwright/lib/dom-analyzer/logicalNameResolver.js +72 -0
- package/node_modules/playwright/lib/dom-analyzer/logicalNameResolver.test.js +182 -0
- package/node_modules/playwright/lib/dom-analyzer/sectionGrouper.js +169 -0
- package/node_modules/playwright/lib/dom-analyzer/sectionGrouper.test.js +269 -0
- package/node_modules/playwright/lib/dom-analyzer/serialization.js +75 -0
- package/node_modules/playwright/lib/dom-analyzer/slug.js +30 -0
- package/node_modules/playwright/lib/dom-analyzer/slug.test.js +84 -0
- package/node_modules/playwright/lib/dom-analyzer/widgetContract.js +127 -0
- package/node_modules/playwright/lib/dom-analyzer/widgetContract.test.js +212 -0
- package/node_modules/playwright/lib/mcp/browser/browserContextFactory.js +3 -1
- package/node_modules/playwright/lib/mcp/browser/config.js +1 -1
- package/node_modules/playwright/lib/mcp/browser/context.js +17 -1
- package/node_modules/playwright/lib/mcp/browser/tab.js +38 -0
- package/node_modules/playwright/lib/mcp/browser/tools/domAnalyzer.js +261 -0
- package/node_modules/playwright/lib/mcp/browser/tools/keyboard.js +3 -3
- package/node_modules/playwright/lib/mcp/browser/tools/pageBlueprint.js +129 -0
- package/node_modules/playwright/lib/mcp/browser/tools/pageBlueprint.test.js +137 -0
- package/node_modules/playwright/lib/mcp/browser/tools/sitemap.js +226 -0
- package/node_modules/playwright/lib/mcp/browser/tools/snapshot.js +2 -2
- package/node_modules/playwright/lib/mcp/browser/tools/widgetContract.js +168 -0
- package/node_modules/playwright/lib/mcp/browser/tools.js +6 -0
- package/node_modules/playwright/lib/mcp/skyramp/traceRecordingBackend.js +52 -12
- package/node_modules/playwright/lib/mcp/test/skyRampExport.js +64 -13
- package/node_modules/playwright/package.json +1 -1
- package/node_modules/playwright/skyramp-playwright-1.58.2-skyramp.8.9.3.tgz +0 -0
- package/package.json +2 -2
|
@@ -25,6 +25,12 @@ export async function registerPlaywrightTools(server, options) {
|
|
|
25
25
|
// Only expose essential browser tools to reduce LLM confusion.
|
|
26
26
|
// The full set (23 tools) overwhelms the LLM and causes it to use
|
|
27
27
|
// browser_evaluate/browser_run_code instead of the simpler tools.
|
|
28
|
+
//
|
|
29
|
+
// The DOM Analyzer group (browser_blueprint*, browser_sitemap_*,
|
|
30
|
+
// browser_widget_contract_lookup) is referenced by skyramp_testbot's
|
|
31
|
+
// Task 1 UI Recommendation Grounding and Task 2 capture-act-capture
|
|
32
|
+
// discipline. Omitting them here would silently make the prompt's
|
|
33
|
+
// instructions impossible to follow (the tools they name would not be callable).
|
|
28
34
|
const ESSENTIAL_TOOLS = new Set([
|
|
29
35
|
'browser_navigate',
|
|
30
36
|
'browser_snapshot',
|
|
@@ -40,6 +46,12 @@ export async function registerPlaywrightTools(server, options) {
|
|
|
40
46
|
'browser_take_screenshot',
|
|
41
47
|
'browser_assert',
|
|
42
48
|
'skyramp_export_zip',
|
|
49
|
+
// DOM Analyzer tools (Phase C)
|
|
50
|
+
'browser_blueprint',
|
|
51
|
+
'browser_blueprint_diff',
|
|
52
|
+
'browser_sitemap_build',
|
|
53
|
+
'browser_sitemap_query',
|
|
54
|
+
'browser_widget_contract_lookup',
|
|
43
55
|
]);
|
|
44
56
|
const filteredTools = tools.filter((t) => ESSENTIAL_TOOLS.has(t.name));
|
|
45
57
|
logger.info(`Filtering to ${filteredTools.length} essential tools (from ${tools.length} total)`);
|
|
@@ -42,6 +42,21 @@ Then execute in strict order:
|
|
|
42
42
|
- **No Docker required**: the \`browser_*\` tools run a local browser session managed by the MCP server. Docker is ONLY used by \`skyramp_start_trace_collection\` (manual recording mode). Never suggest or check for Docker when using AI-driven recording.
|
|
43
43
|
${modularizeNote}
|
|
44
44
|
|
|
45
|
+
### Tool discipline — three concepts, don't conflate
|
|
46
|
+
|
|
47
|
+
These concepts drive the DOM Analyzer capture tools (\`browser_sitemap_build\`, \`browser_sitemap_query\`, \`browser_blueprint\`) and constrain how their outputs flow into generated test code:
|
|
48
|
+
|
|
49
|
+
- **\`browser_snapshot\` — ephemeral refs**: the \`ref=e29\` values change on every call. Use them ONLY for dispatching the next 1–2 interactions (\`browser_click\`, \`browser_type\`, etc.). **Never put refs in generated test code** — they will not exist at playback time.
|
|
50
|
+
- **\`browser_blueprint\` — stable logical names**: logical names like \`add_order_btn\` and their derived \`getByRole\` locators survive page reloads and playback. Use them in generated test code. Call \`browser_blueprint\` only when the DOM has changed since the last known-good blueprint (modal opens, form submits, filter changes, mutable actions); for pure navigation to an already-crawled URL, reuse the Sitemap cache via \`browser_sitemap_query\`.
|
|
51
|
+
- **\`browser_sitemap_build\` — cached once per session**: crawls the whole app into a Sitemap. Call it once at the start of a session; subsequent calls within ~30 minutes reuse the cache unless \`refresh: true\` is passed. Read already-crawled pages via \`browser_sitemap_query\` (modes: \`page\`, \`edges\`, \`mapJson\`, \`outline\`) — do not re-call \`browser_sitemap_build\` to "check."
|
|
52
|
+
|
|
53
|
+
### Tool discipline — delta tools (consumers of captures)
|
|
54
|
+
|
|
55
|
+
These consume the outputs of the capture tools above; they don't capture state themselves.
|
|
56
|
+
|
|
57
|
+
- **\`browser_blueprint_diff\` — structured before/after delta**: takes two \`browser_blueprint\` results and returns \`elementsAdded\`, \`elementsRemoved\`, \`textChanges\`, \`repeatingCountChanges\`, \`urlChanged\`. Use it to derive assertions from observable state changes rather than guessing at what "success" looks like — capture before an action, perform the action, capture after, diff. An empty diff is itself a meaningful signal (e.g. a silent failure worth catching).
|
|
58
|
+
- **\`browser_widget_contract_lookup\` — custom widget interaction recipes**: when a blueprint element has \`widgetType: "custom"\` or \`"unknown"\`, look up its interaction recipe by \`fingerprint\` + \`ref\`. On \`"found"\`, the returned contract specifies the interaction steps (e.g. for a Radix Select: click trigger → wait for portal → click option). On \`"needs_inference"\`, fall through to snapshot-driven trial clicks. Use this so generated tests interact with custom widgets through stable contracts, not brittle positional clicks.
|
|
59
|
+
|
|
45
60
|
### Assertions
|
|
46
61
|
Call \`browser_assert\` when assertions are needed. Always provide the \`expected\` value.
|
|
47
62
|
- \`type: "text"\` — verify an element contains expected text
|
|
@@ -170,6 +170,36 @@ export function buildExecutionPlan(scored, maxGen, topN, baseUrl, authHeaderValu
|
|
|
170
170
|
: `Scenario: ${s.scenarioName} (${s.steps.map(st => `${st.method} ${st.path}`).join(" → ")})`;
|
|
171
171
|
return `#${rank} [ADDITIONAL] | ${testType} | ${s.category} | ${item.novelty}\n ${target}\n Validates: ${s.description}`;
|
|
172
172
|
}).join("\n\n");
|
|
173
|
+
// Phase C D-1.a: UI grounding guidance — fires whenever the PR has
|
|
174
|
+
// frontend changes (UI-only OR mixed). Tells the agent what to put in the
|
|
175
|
+
// `reasoning` field for UI test entries. This disambiguates the "Fill in
|
|
176
|
+
// placeholders from source code, then display verbatim" header that
|
|
177
|
+
// analyzeChangesTool wraps around this prompt: the catalog's STRUCTURE is
|
|
178
|
+
// frozen, but the `reasoning` CONTENT for UI entries should be blueprint-
|
|
179
|
+
// grounded using concrete elements the agent captured via browser_blueprint.
|
|
180
|
+
const uiGroundingGuidance = hasFrontendChanges ? `
|
|
181
|
+
**UI recommendation grounding — applies to \`testType: "ui"\` entries.** The \`reasoning\` field for \`testType: "ui"\` entries MUST contain at least three of {\`role\`, \`accessibleName\`, \`testId\`, \`stableId\`, \`logicalName\`} cited in key=value form. Entries that omit the tuple are not valid output for UI test types. The agent should call \`browser_blueprint\` on affected pages (if it hasn't already) and use the captured data.
|
|
182
|
+
|
|
183
|
+
**Field stability note for consumers:** \`testId\` and \`stableId\` are stable identifiers (from \`data-testid\` and unique \`id\` attributes respectively) — code-generation consumers can key off them. \`role\` and \`accessibleName\` are derived from ARIA and survive DOM reshuffles. \`logicalName\` is a **display handle, not a stable identifier** — it's derived from role + accessibleName + section context and drifts when the accessibleName text changes. Cite \`logicalName\` for readability in the \`reasoning\` field, but downstream consumers (test generators, scoping tools) should NOT key off it. Prefer \`testId\` > \`stableId\` > \`role + accessibleName\` > \`fingerprint\` for identity.
|
|
184
|
+
|
|
185
|
+
**Format for singular elements:**
|
|
186
|
+
> role=<role>, accessibleName="<Accessible Name>", testId=<test-id-or-null>, stableId=<id-or-null>, logicalName=<logical_name>
|
|
187
|
+
|
|
188
|
+
**Format for repeating elements (table rows, list items):** include \`contextText\` values from the row when the recommendation targets a specific row:
|
|
189
|
+
> role=<role>, accessibleName="<template>", testId=<null>, stableId=<null>, logicalName=<name>, contextText=["customer@example.com", "$129.99", "Pending"]
|
|
190
|
+
|
|
191
|
+
**Example (Edit Order form's Save button on /orders/{id}):**
|
|
192
|
+
> role=button, accessibleName="Save changes", testId=null, stableId=null, logicalName=save_changes_btn — verifies boundary-value clamping on discount_percent (0..100)
|
|
193
|
+
|
|
194
|
+
**Example (new-order-row recommendation on /orders):**
|
|
195
|
+
> role=button, accessibleName="View details for order 13", testId=null, stableId=null, logicalName=view_details_for_order_btn, contextText=["customer@example.com", "$129.99", "Pending"] — verifies new order row renders with correct customer + total + status
|
|
196
|
+
|
|
197
|
+
**Validates line — applies to \`testType: "ui"\` entries.** The \`Validates:\` line for UI entries should describe an observable behavior the test verifies — what changes on the page after the action, or what state the user can see. Ground this description in the captured blueprint when possible. Reference structural facts (an element appears, a count changes, a status text updates, a URL transitions) rather than implementation language (component names, props, internal state). The line should be readable to someone who has not seen the source diff.
|
|
198
|
+
|
|
199
|
+
**Scope clarification:** this grounding format applies **only** to \`testType: "ui"\` entries. Contract, integration, e2e, batch-scenario \`reasoning\` and \`Validates:\` fields use their existing conventions (endpoint paths, schemas, fixture chains) — do NOT reformat those. The "Fill in placeholders, then display verbatim" rule above refers to the CATALOG STRUCTURE (sections, ordering, test types); UI entries' \`reasoning\` and \`Validates:\` CONTENT follows this grounding format.
|
|
200
|
+
|
|
201
|
+
**If blueprint data isn't available** — agent skipped pre-scan, app unreachable, \`BlueprintInvariantError\`, or no candidate page covers the changed component — UI entries may fall back to source-grounded reasoning. Each such entry MUST be flagged with a leading \`[no-blueprint-data]\` marker in the \`reasoning\` field, and the failure mode must be logged in \`issuesFound\` with \`info\` severity naming the entry. Do NOT silently produce ungrounded reasoning without the marker.
|
|
202
|
+
` : "";
|
|
173
203
|
// UI/E2E guidance — the LLM adds as many as its Budget Plan calls for.
|
|
174
204
|
// Note: if a UI test already occupies a GENERATE slot (uiPlaceholderBlock), that slot
|
|
175
205
|
// satisfies the UI generate count — do not add it again in ADDITIONAL.
|
|
@@ -278,6 +308,7 @@ ${isUIOnlyPR
|
|
|
278
308
|
### ADDITIONAL (list in additionalRecommendations in this order after Step 1 insertion)
|
|
279
309
|
|
|
280
310
|
${additionalLines || " (none pre-ranked)"}
|
|
311
|
+
${uiGroundingGuidance}
|
|
281
312
|
${uiGuidance}
|
|
282
313
|
${supplementNote}
|
|
283
314
|
|
|
@@ -317,8 +317,7 @@ ${authGuidance}
|
|
|
317
317
|
**OpenAPI spec is NOT required.** \`apiSchema\` is OPTIONAL — omit it if no spec exists.
|
|
318
318
|
**CRITICAL — Query params vs request body:**
|
|
319
319
|
- For **POST/PUT/PATCH**: use \`requestBody\` with realistic field values from source code schemas.
|
|
320
|
-
- For **GET/DELETE with search/filter/pagination**: use \`queryParams
|
|
321
|
-
Do not put query parameters in \`requestBody\` for GET requests — GET request bodies are non-standard and may be ignored or rejected.
|
|
320
|
+
- For **GET/DELETE with search/filter/pagination**: use \`queryParams\`. Do not put query parameters in \`requestBody\` for GET requests — GET request bodies are non-standard and may be ignored or rejected.
|
|
322
321
|
- For **GET by ID**: no \`requestBody\` or \`queryParams\` needed — the ID is in the path.
|
|
323
322
|
\`responseBody\` should match the actual API response shape from source code (including all fields
|
|
324
323
|
returned by the controller — e.g., \`id\`, \`ownerId\`, \`createdAt\`, included relations like \`collection\`, \`tags\`).
|
|
@@ -1512,3 +1512,97 @@ describe("externalDedupKey", () => {
|
|
|
1512
1512
|
expect(externalDedupKey(scenario)).toBe("POST::orders::contract");
|
|
1513
1513
|
});
|
|
1514
1514
|
});
|
|
1515
|
+
describe("UI grounding guidance (Phase C D-1.a)", () => {
|
|
1516
|
+
const editOrderScenario = minimalScenario({
|
|
1517
|
+
scenarioName: 'edit-order',
|
|
1518
|
+
description: 'edit order',
|
|
1519
|
+
category: 'new_endpoint',
|
|
1520
|
+
priority: 'medium',
|
|
1521
|
+
steps: [
|
|
1522
|
+
{ order: 1, method: 'PATCH', path: '/api/v1/orders/{id}', description: 'edit', interactionType: 'success', expectedStatusCode: 200 },
|
|
1523
|
+
],
|
|
1524
|
+
chainingKeys: [],
|
|
1525
|
+
});
|
|
1526
|
+
const uiGroundingBusinessContext = {
|
|
1527
|
+
mainPurpose: "Test API", userFlows: [], dataFlows: [], integrationPatterns: [],
|
|
1528
|
+
draftedScenarios: [editOrderScenario],
|
|
1529
|
+
};
|
|
1530
|
+
const uiOnlyAnalysis = () => minimalAnalysis({
|
|
1531
|
+
branchDiffContext: {
|
|
1532
|
+
currentBranch: "feature", baseBranch: "main",
|
|
1533
|
+
changedFiles: ['frontend/src/components/EditOrder.tsx'],
|
|
1534
|
+
newEndpoints: [],
|
|
1535
|
+
modifiedEndpoints: [],
|
|
1536
|
+
affectedServices: [],
|
|
1537
|
+
},
|
|
1538
|
+
businessContext: uiGroundingBusinessContext,
|
|
1539
|
+
});
|
|
1540
|
+
const mixedAnalysis = () => minimalAnalysis({
|
|
1541
|
+
branchDiffContext: {
|
|
1542
|
+
currentBranch: "feature", baseBranch: "main",
|
|
1543
|
+
changedFiles: ['frontend/src/components/EditOrder.tsx', 'backend/orders.py'],
|
|
1544
|
+
newEndpoints: [{
|
|
1545
|
+
path: '/api/v1/orders/{id}',
|
|
1546
|
+
methods: [{ method: 'PATCH', sourceFile: 'backend/orders.py', interactionCount: 1 }],
|
|
1547
|
+
}],
|
|
1548
|
+
modifiedEndpoints: [],
|
|
1549
|
+
affectedServices: [],
|
|
1550
|
+
},
|
|
1551
|
+
businessContext: uiGroundingBusinessContext,
|
|
1552
|
+
});
|
|
1553
|
+
const backendOnlyAnalysis = () => minimalAnalysis({
|
|
1554
|
+
branchDiffContext: {
|
|
1555
|
+
currentBranch: "feature", baseBranch: "main",
|
|
1556
|
+
changedFiles: ['backend/orders.py'],
|
|
1557
|
+
newEndpoints: [{
|
|
1558
|
+
path: '/api/v1/orders',
|
|
1559
|
+
methods: [{ method: 'POST', sourceFile: 'backend/orders.py', interactionCount: 1 }],
|
|
1560
|
+
}],
|
|
1561
|
+
modifiedEndpoints: [],
|
|
1562
|
+
affectedServices: [],
|
|
1563
|
+
},
|
|
1564
|
+
businessContext: uiGroundingBusinessContext,
|
|
1565
|
+
});
|
|
1566
|
+
it("emits UI grounding block on UI-only PRs", () => {
|
|
1567
|
+
const out = buildRecommendationPrompt(uiOnlyAnalysis(), AnalysisScope.CurrentBranchDiff, 10);
|
|
1568
|
+
expect(out).toContain("UI recommendation grounding");
|
|
1569
|
+
expect(out).toContain("role=<role>");
|
|
1570
|
+
expect(out).toContain("accessibleName=");
|
|
1571
|
+
});
|
|
1572
|
+
it("emits UI grounding block on mixed PRs", () => {
|
|
1573
|
+
const out = buildRecommendationPrompt(mixedAnalysis(), AnalysisScope.CurrentBranchDiff, 10);
|
|
1574
|
+
expect(out).toContain("UI recommendation grounding");
|
|
1575
|
+
expect(out).toContain("contextText=");
|
|
1576
|
+
});
|
|
1577
|
+
it("does not emit UI grounding block on backend-only PRs", () => {
|
|
1578
|
+
const out = buildRecommendationPrompt(backendOnlyAnalysis(), AnalysisScope.CurrentBranchDiff, 10);
|
|
1579
|
+
expect(out).not.toContain("UI recommendation grounding");
|
|
1580
|
+
});
|
|
1581
|
+
it("carves out non-UI entries from the grounding format", () => {
|
|
1582
|
+
const out = buildRecommendationPrompt(mixedAnalysis(), AnalysisScope.CurrentBranchDiff, 10);
|
|
1583
|
+
expect(out).toMatch(/Contract, integration, e2e.*existing conventions/);
|
|
1584
|
+
});
|
|
1585
|
+
it("includes contextText example for repeating elements", () => {
|
|
1586
|
+
const out = buildRecommendationPrompt(uiOnlyAnalysis(), AnalysisScope.CurrentBranchDiff, 10);
|
|
1587
|
+
expect(out).toContain('contextText=["customer@example.com"');
|
|
1588
|
+
});
|
|
1589
|
+
// Phase C D-1.a/7b: directive instruction strengthening + Validates coverage
|
|
1590
|
+
// + marked fallback. Asserts the post-7b language is present.
|
|
1591
|
+
it("uses MUST directive for tuple presence in reasoning field", () => {
|
|
1592
|
+
const out = buildRecommendationPrompt(uiOnlyAnalysis(), AnalysisScope.CurrentBranchDiff, 10);
|
|
1593
|
+
expect(out).toContain("MUST contain at least three of");
|
|
1594
|
+
expect(out).toContain("not valid output for UI test types");
|
|
1595
|
+
});
|
|
1596
|
+
it("instructs the agent to ground the Validates line for UI entries", () => {
|
|
1597
|
+
const out = buildRecommendationPrompt(uiOnlyAnalysis(), AnalysisScope.CurrentBranchDiff, 10);
|
|
1598
|
+
expect(out).toContain("Validates line — applies to");
|
|
1599
|
+
expect(out).toContain("observable behavior the test verifies");
|
|
1600
|
+
expect(out).toContain("structural facts");
|
|
1601
|
+
});
|
|
1602
|
+
it("requires a [no-blueprint-data] marker on source-grounded fallback entries", () => {
|
|
1603
|
+
const out = buildRecommendationPrompt(uiOnlyAnalysis(), AnalysisScope.CurrentBranchDiff, 10);
|
|
1604
|
+
expect(out).toContain("[no-blueprint-data]");
|
|
1605
|
+
expect(out).toContain("issuesFound");
|
|
1606
|
+
expect(out).toContain("Do NOT silently produce ungrounded reasoning");
|
|
1607
|
+
});
|
|
1608
|
+
});
|
|
@@ -6,6 +6,7 @@ import { buildDriftAnalysisPrompt } from "../test-maintenance/drift-analysis-pro
|
|
|
6
6
|
import { getTraceRecordingPromptText } from "../../playwright/traceRecordingPrompt.js";
|
|
7
7
|
import { isContractConsumerModeEnabled } from "../../utils/featureFlags.js";
|
|
8
8
|
import { resolveServiceDetailsRef } from "../../utils/utils.js";
|
|
9
|
+
import { UI_FILE_GIT_PATHSPEC } from "../../utils/routeParsers.js";
|
|
9
10
|
import { readWorkspaceConfigRaw } from "../../utils/workspaceAuth.js";
|
|
10
11
|
// Cached at module-load — flags are process-wide and cannot change per call.
|
|
11
12
|
const CONSUMER_MODE_ENABLED = isContractConsumerModeEnabled();
|
|
@@ -19,8 +20,16 @@ const CONTRACT_MODE_GUIDANCE = CONSUMER_MODE_ENABLED
|
|
|
19
20
|
Both modes (\`providerMode: true, consumerMode: true\`): For diff that contains BOTH provider signals (such as new/modified endpoint handlers, route changes this service owns) AND consumer signals (outbound HTTP client calls to another service, no new endpoint handlers).`
|
|
20
21
|
: ` Always add \`providerMode: true\` — the tool generates provider-side contract tests only.`;
|
|
21
22
|
export function getTestbotPrompt(prTitle, prDescription, summaryOutputFile, repositoryPath, baseBranch, maxRecommendations = MAX_RECOMMENDATIONS, maxGenerate = MAX_TESTS_TO_GENERATE, _maxCritical = MAX_CRITICAL_TESTS, // Reserved — accepted for API compat but not yet wired into prompt
|
|
22
|
-
prNumber, userPrompt, services, stateOutputFile, uiCredentials) {
|
|
23
|
+
prNumber, userPrompt, services, stateOutputFile, uiCredentials, testsRepoDir) {
|
|
23
24
|
maxGenerate = Math.min(Math.max(maxGenerate, 0), maxRecommendations);
|
|
25
|
+
// Task 0 UI Pre-Scan — candidate-page strategy section. Two strategies plus
|
|
26
|
+
// a guaranteed root-fallback. Cost is ~5s when strategies 1 & 2 succeed
|
|
27
|
+
// (no crawl), more when the root-fallback fires.
|
|
28
|
+
const uiPathStrategies = `**Lazy mode** (default). Two candidate-page strategies, then a guaranteed root-fallback:
|
|
29
|
+
|
|
30
|
+
1. **Framework route grep** — identify route files under \`app/\`, \`pages/\`, \`routes/\` whose path segments match the changed component's file location.
|
|
31
|
+
2. **Import-graph walk** — from the changed component's file, walk up import chains to find route entrypoints that import it.
|
|
32
|
+
3. **Root fallback (always)** — if strategies 1 and 2 produce no candidate pages (common for SPAs without filesystem routing), navigate to the app's root URL (\`/\`) and treat that as the single candidate page. Apply explore-and-discover from there to surface gated UI.`;
|
|
24
33
|
// For follow-up requests: emit the @skyramp-testbot header + guardrails + retrieve-recommendations step.
|
|
25
34
|
// For first-run prompts: emit the full Task 1 analysis + maintenance section.
|
|
26
35
|
const task1Section = userPrompt
|
|
@@ -42,16 +51,52 @@ Verify the prompt inside <USER_PROMPT> is related to adding or removing tests fr
|
|
|
42
51
|
- If the prompt matches one or more tests in the Additional Recommendations → proceed to Task 1 (Skip Analysis).
|
|
43
52
|
|
|
44
53
|
### Task 1: Retrieve Previous Recommendations
|
|
45
|
-
Call \`skyramp_analyze_changes\` with \`repositoryPath\`: "${repositoryPath}", \`scope\`: "branch_diff"${baseBranch ? `, \`baseBranch\`: "${baseBranch}"` : ""}${prNumber ? `, \`prNumber\`: ${prNumber}` : ""}${stateOutputFile ? `, \`stateOutputFile\`: "${stateOutputFile}"` : ""}.
|
|
54
|
+
Call \`skyramp_analyze_changes\` with \`repositoryPath\`: "${repositoryPath}", \`scope\`: "branch_diff"${baseBranch ? `, \`baseBranch\`: "${baseBranch}"` : ""}${prNumber ? `, \`prNumber\`: ${prNumber}` : ""}${stateOutputFile ? `, \`stateOutputFile\`: "${stateOutputFile}"` : ""}${testsRepoDir ? `, \`testsRepoDir\`: "${testsRepoDir}"` : ""}.
|
|
46
55
|
This will fetch the previous TestBot report from the PR comments and return deduplicated recommendations.
|
|
47
56
|
Use those recommendations as your baseline. Only add or remove tests that the user requested AND that appear in the Additional Recommendations. Then proceed straight to Task 2: Generate New Tests.
|
|
48
57
|
`
|
|
49
58
|
: `
|
|
50
59
|
**Incremental mode:** Task 1 handles maintenance of existing tests. Task 2 handles new test generation from the GENERATE list. The two tasks are independent — maintenance completions never reduce the generate budget. Only generate tests for NEW endpoints not already covered by existing bot tests.
|
|
51
60
|
|
|
61
|
+
## Task 0: UI Pre-Scan (runs before \`skyramp_analyze_changes\` when UI files are in the diff)
|
|
62
|
+
|
|
63
|
+
When UI recommendations will ground in blueprint elements, the agent needs blueprint data in its context before writing any \`reasoning\` fields. This task captures it early so it's available when the recommendation catalog arrives in Task 1.
|
|
64
|
+
|
|
65
|
+
**The app is already running** — the eval / Testbot runtime started it before this prompt was issued, and it is reachable at the \`baseUrl\` field of \`.skyramp/workspace.yml\`. Do NOT run \`docker compose up\`, \`yarn start\`, \`npm run dev\`, the \`serverStartCommand\` from workspace.yml, or any other server-bootstrapping command — those will at best be no-ops (the container is already healthy) and at worst hang on a wait loop and consume your time budget. Navigate directly via \`browser_navigate\`. If \`browser_navigate\` fails with a real connection refused, log it in \`issuesFound\` and proceed source-grounded; do not attempt to start the server yourself.
|
|
66
|
+
|
|
67
|
+
Check for UI files via: \`Bash git diff ${baseBranch ? baseBranch : 'HEAD~1'} --name-only --diff-filter=AM -- ${UI_FILE_GIT_PATHSPEC}\` from \`${repositoryPath}\`. (Uses git's pathspec filter directly so no grep/rg invocation is needed.) If the command returns nothing, skip this task entirely (backend-only PR) and proceed to Task 1.
|
|
68
|
+
|
|
69
|
+
If UI files are found, for each changed UI file enumerate candidate pages using the strategy ladder below, then **take the union** of strategies 1 and 2 (don't stop at the first that yields results). Strategy 3 is the root-URL fallback used only when strategies 1 and 2 both return empty.
|
|
70
|
+
|
|
71
|
+
${uiPathStrategies}
|
|
72
|
+
|
|
73
|
+
Capture \`browser_blueprint\` on each candidate page from the union. The app is running post-PR; no pre-PR baseline is available.
|
|
74
|
+
|
|
75
|
+
**Return shape for \`browser_blueprint\`:** the first call at any URL returns \`{ isFullCapture: true, pageHash, blueprint }\` with the full structural payload. A subsequent call at the *same* URL automatically returns \`{ isFullCapture: false, pageHash, previousPageHash, delta }\` — the delta is computed against your prior capture at that URL. Both shapes are valid and load-bearing; key off \`isFullCapture\` to know which one you got.
|
|
76
|
+
|
|
77
|
+
**After the initial capture, verify the changed feature is actually visible in the blueprint.** Search the captured blueprint for any of: the changed component's name as a \`logicalName\` / \`accessibleName\`, its \`testId\` (look for \`data-testid\` patterns derived from the component name), or distinctive class names from the diff. If none appear, the changed feature is likely behind a UI gate — a modal trigger, a dropdown, a tab, an accordion, or a conditional render. In that case:
|
|
78
|
+
|
|
79
|
+
1. Identify the most likely trigger from the route blueprint (a button whose accessibleName matches the feature — "Edit", "Add", "Open", or the component name itself).
|
|
80
|
+
2. \`browser_click\` the trigger.
|
|
81
|
+
3. Re-capture \`browser_blueprint\` — the new blueprint should now contain the changed feature's elements.
|
|
82
|
+
4. If still not visible after one click, log an \`issuesFound\` entry of \`info\` severity describing what you tried and proceed with whatever blueprint data you have. Do NOT iterate more than once per candidate page.
|
|
83
|
+
|
|
84
|
+
This is a deliberate, scoped exploration — one click max per candidate page. It exists because route-level blueprints often miss modal/dialog/conditional content, and a recommendation grounded in the empty home page of a route is no better than a source-grounded recommendation.
|
|
85
|
+
|
|
86
|
+
**Thresholds for how many to capture:**
|
|
87
|
+
- **≤5 candidates:** capture all.
|
|
88
|
+
- **6-15 candidates:** capture all, but note the count in \`issuesFound\` as \`info\` severity so high-fanout cases surface in post-hoc analysis.
|
|
89
|
+
- **>15 candidates:** prioritize by diff proximity and capture the top 15. Ranking: (a) pages whose source imports name the changed component directly, not via re-export chains; (b) route entrypoints over nested layouts; (c) pages in the diff's own route segment if the PR also changes routes.
|
|
90
|
+
|
|
91
|
+
Token-cost note: blueprint capture is a few hundred ms per page, so 15 pages is ~3-5 seconds of wall-clock. The count cap at 15 prevents pathological cases (shared design-system components imported by 100+ routes) from dominating TestBot runtime. **Missing a candidate is worse than over-capturing within the budget.**
|
|
92
|
+
|
|
93
|
+
Keep the captured blueprints in your working context through the rest of Task 1 and Task 2. They are the source of truth for UI-test \`reasoning\` fields in Task 1's recommendation catalog (the catalog itself will tell you what format to use).
|
|
94
|
+
|
|
95
|
+
**Failure fallback:** if the diff check fails, the app is unreachable, or \`browser_blueprint\` fails on every candidate page, skip Task 0 and proceed source-grounded. Log one \`issuesFound\` entry describing the failure mode. Non-UI work is unaffected.
|
|
96
|
+
|
|
52
97
|
## Task 1: Analyze & Maintain
|
|
53
98
|
|
|
54
|
-
1. Call \`skyramp_analyze_changes\` with \`repositoryPath\`: "${repositoryPath}", \`scope\`: "branch_diff", \`topN\`: ${maxRecommendations}, \`maxGenerate\`: ${maxGenerate}${baseBranch ? `, \`baseBranch\`: "${baseBranch}"` : ""}${prNumber ? `, \`prNumber\`: ${prNumber}` : ""}${stateOutputFile ? `, \`stateOutputFile\`: "${stateOutputFile}"` : ""} — discovers existing Skyramp tests, scans endpoints changed in the diff, loads workspace config, and returns ${maxRecommendations} ranked ADD recommendations (${maxGenerate} to generate, ${maxRecommendations - maxGenerate} as additional).${prNumber ? " Uses PR comment history to avoid re-recommending already-generated tests." : ""}
|
|
99
|
+
1. Call \`skyramp_analyze_changes\` with \`repositoryPath\`: "${repositoryPath}", \`scope\`: "branch_diff", \`topN\`: ${maxRecommendations}, \`maxGenerate\`: ${maxGenerate}${baseBranch ? `, \`baseBranch\`: "${baseBranch}"` : ""}${prNumber ? `, \`prNumber\`: ${prNumber}` : ""}${stateOutputFile ? `, \`stateOutputFile\`: "${stateOutputFile}"` : ""}${testsRepoDir ? `, \`testsRepoDir\`: "${testsRepoDir}"` : ""} — discovers existing Skyramp tests, scans endpoints changed in the diff, loads workspace config, and returns ${maxRecommendations} ranked ADD recommendations (${maxGenerate} to generate, ${maxRecommendations - maxGenerate} as additional).${prNumber ? " Uses PR comment history to avoid re-recommending already-generated tests." : ""}
|
|
55
100
|
**If \`skyramp_analyze_changes\` returns an error:** retry once only if the error is transient (timeout, network blip, temporary unavailability) — do NOT retry for permanent errors (invalid repository path, missing required parameter, authentication failure). If it fails again, call \`skyramp_submit_report\` with a minimal valid payload: leave all test arrays empty and add the error to \`issuesFound\`. Refer to the \`skyramp_submit_report\` schema for required fields. Do NOT attempt Task 2 without a valid stateFile.
|
|
56
101
|
**If all changed files are non-application** (CI/CD, docs, lock files, config) → skip to Task 3 (Submit Report) with empty arrays and a single \`issuesFound\` entry explaining why (same format as the zero-test path below).
|
|
57
102
|
|
|
@@ -67,6 +112,20 @@ ${buildDriftAnalysisPrompt({ existingTests: [], scannedEndpoints: [], repository
|
|
|
67
112
|
- Incorrect arithmetic in business logic (discount calculations, price aggregation)
|
|
68
113
|
Log each finding in \`issuesFound\` with a \`severity\` (critical/high/medium/low). These bugs should inform your test design in Task 2.
|
|
69
114
|
|
|
115
|
+
4. **UI Recommendation Grounding** — when UI files are in the diff, Task 0 captured blueprint data for candidate pages. Task 1 step 1's \`skyramp_analyze_changes\` output contains a "UI recommendation grounding" section inside its prompt text — that section defines the exact format for \`reasoning\` fields on \`testType: "ui"\` entries (role/accessibleName/testId/stableId/logicalName tuple, plus contextText for repeating-element rows). Use the captured blueprints from Task 0 to fill in those \`reasoning\` fields exactly as that section specifies.
|
|
116
|
+
|
|
117
|
+
**Blueprint Citation Invariant.** Every UI element you cite by \`role\`, \`accessibleName\`, \`testId\`, \`stableId\`, or \`logicalName\` — in a recommendation's \`reasoning\` field, in a generated test's assertion, or in an \`issuesFound\` entry — must come from a \`browser_blueprint\` call you actually made. The cited string must appear verbatim in a captured blueprint. Seeing related, parent, or sibling elements is NOT the same as seeing the element you want to cite.
|
|
118
|
+
|
|
119
|
+
When a citation isn't yet backed by a blueprint, do ONE of:
|
|
120
|
+
- **Capture once more.** \`browser_click\` the likely trigger (modal opener, tab, accordion, drawer), then \`browser_blueprint\`. Same-URL re-captures auto-diff against your prior call — the response's \`delta.elementsAdded\` is where the new element should appear.
|
|
121
|
+
- **Drop the citation.** Rewrite without the unverified tuple — source-grounded prose is fine. Add an \`issuesFound\` entry at the lowest severity the report schema accepts (\`info\` if available, otherwise \`low\`): \`"Blueprint capture missed <element name>; recommendation grounded in source diff only"\`.
|
|
122
|
+
|
|
123
|
+
Do not fabricate tuple values from the PR description, source diff, or component name. A fabricated tuple looks like grounding but isn't — and downstream test generation will emit assertions against names that don't exist in the rendered DOM.
|
|
124
|
+
|
|
125
|
+
**Non-UI entries (contract / integration / e2e / batch-scenario) are unaffected.** Their \`reasoning\` fields use the pre-existing formats — endpoint paths, request/response schemas, fixture chains. Do not reformat them.
|
|
126
|
+
|
|
127
|
+
**Failure fallback:** if Task 0 ran but the captured inventory is empty for a candidate page (e.g. pre-scan picked the wrong page), or if Task 0 logged a failure, UI entries fall back to source-grounded reasoning. Legitimate outcome.
|
|
128
|
+
|
|
70
129
|
---`;
|
|
71
130
|
const serviceContext = services?.length ? buildServiceContext(services) : '';
|
|
72
131
|
// The <ui-credentials> tags are framing for the agent's prompt context —
|
|
@@ -80,10 +139,14 @@ ${buildDriftAnalysisPrompt({ existingTests: [], scannedEndpoints: [], repository
|
|
|
80
139
|
const uiCredentialsBlock = trimmedCredentials
|
|
81
140
|
? `<ui-credentials>\n${trimmedCredentials}\n</ui-credentials>`
|
|
82
141
|
: '';
|
|
142
|
+
const testsRepoDirBlock = testsRepoDir ? `<TESTS REPO DIR>${testsRepoDir}</TESTS REPO DIR>\n` : '';
|
|
143
|
+
const testDirInstruction = testsRepoDir
|
|
144
|
+
? `the \`<output_dir>\` from the \`<services>\` block, rooted under the test repository at \`${testsRepoDir}\` (i.e. \`${testsRepoDir}/<output_dir>\`). Write ALL test output files to paths under \`${testsRepoDir}\`, not under \`${repositoryPath}\`. Do NOT write any test files to the app repository.`
|
|
145
|
+
: `${SERVICE_REFS.testDirRef}. Do NOT create a new \`tests/\` directory at the repo root — use that path. If no \`testDirectory\` is configured, default to the language-conventional location (e.g. \`src/test/java/...\` for Java, \`tests/\` for Python).`;
|
|
83
146
|
return `<TITLE>${prTitle}</TITLE>
|
|
84
147
|
<DESCRIPTION>${prDescription}</DESCRIPTION>
|
|
85
148
|
<REPOSITORY PATH>${repositoryPath}</REPOSITORY PATH>
|
|
86
|
-
${serviceContext ? serviceContext + '\n' : ''}${uiCredentialsBlock ? uiCredentialsBlock + '\n' : ''}Use the Skyramp MCP server tools for all tasks below.
|
|
149
|
+
${testsRepoDirBlock}${serviceContext ? serviceContext + '\n' : ''}${uiCredentialsBlock ? uiCredentialsBlock + '\n' : ''}Use the Skyramp MCP server tools for all tasks below.
|
|
87
150
|
|
|
88
151
|
${task1Section}
|
|
89
152
|
|
|
@@ -143,7 +206,7 @@ ${userPrompt ? "Generate only the tests that the user requested from the Additio
|
|
|
143
206
|
**How to generate each type (for ADD):**
|
|
144
207
|
- **Integration**: call \`skyramp_batch_scenario_test_generation\` with ALL steps in a single call (pass the \`steps\` array with method, path, requestBody, statusCode for each step). Then call \`skyramp_integration_test_generation\` with the returned scenario file.
|
|
145
208
|
**Use the pre-built scenario JSON from the Execution Plan** — pass the steps array directly. Do NOT read source code models to construct request bodies if the plan already provides them.
|
|
146
|
-
Scenario JSON and test files go in ${
|
|
209
|
+
Scenario JSON and test files go in ${testDirInstruction}
|
|
147
210
|
**Pipeline for speed**: Call ALL \`skyramp_batch_scenario_test_generation\` calls in one batch. When they return, call ALL \`skyramp_integration_test_generation\` calls in the next batch. Do NOT serialize per-scenario (batch→integration→batch→integration) — batch ALL scenarios first, then generate ALL integration tests.
|
|
148
211
|
- **Contract**: call \`skyramp_contract_test_generation\` with \`endpointURL\`, \`method\`, and \`requestData\` for POST/PUT/PATCH.
|
|
149
212
|
Pass \`apiSchema\` if an OpenAPI spec exists.
|
|
@@ -153,13 +216,20 @@ ${CONTRACT_MODE_GUIDANCE}
|
|
|
153
216
|
If a relevant trace exists (covers the UI changes in this PR), use it directly with \`skyramp_ui_test_generation\` and \`modularizeCode: false\`.
|
|
154
217
|
If NO relevant trace exists, **you MUST write out your full trace plan as text BEFORE calling \`browser_navigate\`**. Do not touch the browser until the plan is written.
|
|
155
218
|
|
|
156
|
-
**Browser authentication (check BEFORE navigating)**: If \`<ui-credentials>\` appears in your context above, the app requires login. Parse the credentials —
|
|
219
|
+
**Browser authentication (check BEFORE navigating)**: If \`<ui-credentials>\` appears in your context above, the app requires login. Parse the credentials — one per line, two supported formats:
|
|
220
|
+
- New format: \`username=<value>;password=<value>\` or \`username=<value>;password=<value>;role=<value>\` — fields are \`;\`-delimited key=value pairs. The \`=\` and \`;\` characters are reserved delimiters and must not appear in the values themselves.
|
|
221
|
+
- Legacy format: \`username:password\` — the first \`:\` splits username from password.
|
|
222
|
+
|
|
223
|
+
**Credential selection**: Use the first credential by default. When the scenario requires a specific role, find the credential whose \`role\` field matches (e.g. \`role=admin\`). If no credential matches the required role, use the first credential and add a note to \`issuesFound\` that no matching role was found.
|
|
224
|
+
|
|
225
|
+
Type all values verbatim. Before navigating to ANY feature URL:
|
|
157
226
|
1. \`browser_navigate\` to the login URL (e.g. \`{baseUrl}/login\`, \`/user/login\`, \`/signin\` — infer from the app's base URL and framework)
|
|
158
227
|
2. \`browser_snapshot\` to find the username/email and password fields
|
|
159
228
|
3. \`browser_type\` the username into the email/username field
|
|
160
229
|
4. \`browser_type\` the password into the password field
|
|
161
|
-
5.
|
|
162
|
-
6.
|
|
230
|
+
5. If a role selector is present and a \`role\` was specified in the credential, select it before submitting
|
|
231
|
+
6. \`browser_click\` the submit button, then \`browser_wait_for\` redirect away from the login page
|
|
232
|
+
7. Now navigate directly to the feature URL and begin recording
|
|
163
233
|
The login steps ARE part of the trace — the generated test will authenticate automatically.
|
|
164
234
|
|
|
165
235
|
Use this exact format:
|
|
@@ -198,6 +268,36 @@ ${CONTRACT_MODE_GUIDANCE}
|
|
|
198
268
|
- **List integrity after form save**: assert the list item count is unchanged unless the action explicitly added or removed items — catches duplication bugs
|
|
199
269
|
- Do NOT assert page headings, static labels, boilerplate text, intermediate states, or values already guaranteed by the action
|
|
200
270
|
- Do NOT assert the same value with multiple selectors
|
|
271
|
+
|
|
272
|
+
**Capture-act-capture (applies only when recording a UI trace):**
|
|
273
|
+
|
|
274
|
+
**Skip this entire section if Task 0's UI pre-scan found no UI files** (backend-only PR). The capture-act-capture pattern is for UI trace recording only — there's no UI trace to record on a backend-only PR. Continue to the non-UI test-type instructions below.
|
|
275
|
+
|
|
276
|
+
**Reminder — the UI test priority rule above still applies.** If the diff contains frontend/UI changes, you still MUST attempt to generate at least one UI test. Capture-act-capture is **how** you record that test, not **whether** you record one — do not substitute UI recommendations for actually recording a trace. Task 1 step 4 produced the grounded recommendations; Task 2 implements one or more of them.
|
|
277
|
+
|
|
278
|
+
This pattern produces delta-derived assertions from blueprint diffs. Diff-derived assertions catch state changes more reliably than author-inference — the diff tells you what actually changed on the page so the assertion is grounded in observable state, not in guessing what "success" looks like.
|
|
279
|
+
|
|
280
|
+
Capture-act-capture applies **only** to a UI trace in progress (inside the \`browser_navigate\` + \`browser_*\` interaction block). It does **not** apply to contract, integration, e2e, or batch-scenario test generation — those run on their pre-existing patterns without any capture-act-capture involvement. On mixed PRs with backend + UI work, generate the non-UI tests normally; the only thing that changes is how the UI trace itself is recorded.
|
|
281
|
+
|
|
282
|
+
\`browser_snapshot\` remains the source of ephemeral refs that interaction tools require. \`browser_blueprint\` provides durable semantic identity. Use both when recording: blueprint to decide the target and what "done" looks like; snapshot to get the ref needed to dispatch the click or type.
|
|
283
|
+
|
|
284
|
+
An **action** for this pattern is one user-intent-level operation whose completion changes the app's observable state — a click, a form submit (button or Enter), a navigation, a complete text fill of one field, or a meaningful keyboard shortcut (\`Ctrl+V\` paste, \`Ctrl+A\` select-all followed by a mutation, \`Escape\` to dismiss a modal). **Not** intermediate input mechanics: the individual typed characters \`browser_type\` emits, \`Tab\` between fields, arrow-key highlight in a listbox, focus-only changes. The browser-authentication flow (login) is boilerplate — no capture-act-capture there; login stays on the existing \`browser_snapshot\` + \`browser_type\` + \`browser_click\` pattern.
|
|
285
|
+
|
|
286
|
+
The pattern for each action:
|
|
287
|
+
|
|
288
|
+
1. **Before** the action: \`browser_blueprint\`. Identify the semantic target by \`role\`, \`accessibleName\`, and \`stableId\`/\`testId\`.
|
|
289
|
+
|
|
290
|
+
2. If the target's \`widgetType\` is \`"custom"\` or \`"unknown"\`: \`browser_widget_contract_lookup\` with the element's \`fingerprint\` and \`ref\`. On \`"found"\`, execute the contract steps. On \`"needs_inference"\`, fall through to snapshot-driven trial clicks (\`browser_wait_for\` between retries). Inference-and-cache is out of scope for this slice, so don't attempt to synthesize and cache contracts.
|
|
291
|
+
|
|
292
|
+
3. Execute the action via \`browser_click\` / \`browser_type\` / \`browser_navigate\`. The \`ref\` comes from \`browser_snapshot\` as today.
|
|
293
|
+
|
|
294
|
+
4. **After** the action: \`browser_blueprint\` again. **The response IS the diff** — because you already captured at this URL in step 1, the second call returns \`{ isFullCapture: false, pageHash, previousPageHash, delta }\`. The \`delta\` field already contains \`elementsAdded\`, \`elementsRemoved\`, \`textChanges\`, \`repeatingCountChanges\`, \`urlChange\`. You do **not** need to call \`browser_blueprint_diff\` for same-URL captures — that tool is only for cross-URL comparisons. An empty delta (all arrays empty) is itself a meaningful signal: the action did not change observable DOM (e.g. a silent failure the test should catch).
|
|
295
|
+
|
|
296
|
+
5. For each delta entry worth verifying, emit a \`browser_assert\` whose target and expectation come from the delta. Example: delta reports \`repeatingCountChanges\` (\`view_details_for_order_btn\`: 12 → 13) → \`browser_assert\` on \`toHaveCount(13)\` against the repeating element's \`accessibleNameTemplate\`. The existing "at least one \`browser_assert\` per page navigated" rule still applies; the delta will naturally surface ≥1 assertable signal per action.
|
|
297
|
+
|
|
298
|
+
**The Blueprint Citation Invariant applies during recording too.** Every assertion you emit cites element names — those names must come from blueprint captures, not invention. For N user-intent-level actions, the reference target is N+1 \`browser_blueprint\` calls (the first returns full, the rest return deltas). Traces that follow the pattern produce assertions grounded in observable state changes; traces that skip captures fall back to author-inferred assertions and risk citing names that don't exist in the rendered DOM.
|
|
299
|
+
|
|
300
|
+
The rest of the UI workflow stays the same: trace plan, browser auth, navigation, export (\`skyramp_export_zip\`), generation (\`skyramp_ui_test_generation\`), \`skyramp_enhance_assertions\` post-call. Capture-act-capture adds blueprint captures alongside the existing steps; it doesn't replace anything.
|
|
201
301
|
- **E2E**: Only if BOTH a backend trace \`.json\` AND a Playwright \`.zip\` already exist in the repo. Without both, move to \`additionalRecommendations\`.
|
|
202
302
|
- Skip smoke tests entirely.
|
|
203
303
|
|
|
@@ -267,7 +367,7 @@ Call \`skyramp_submit_report\` with \`summaryOutputFile\`: "${summaryOutputFile}
|
|
|
267
367
|
- **additionalRecommendations**: AT MOST ${maxRecommendations - maxGenerate} items.
|
|
268
368
|
- For \`testType: "contract"\` entries: **\`primaryEndpoint\` is required** (e.g. \`"GET /api/v1/users/{user_id}"\`). The tool will reject the submission without it — do not omit it or you will be forced to resubmit.
|
|
269
369
|
- For \`testType: "integration"\` or \`"e2e"\` entries: omit \`primaryEndpoint\` — use \`description\` to list the endpoints involved instead.
|
|
270
|
-
- **testMaintenance**:
|
|
370
|
+
- **testMaintenance**: Use \`[]\` **only** if no existing Skyramp tests were found in the repository. If existing tests were found (any score), include one entry per test. For UPDATE/REGENERATE/DELETE tests that were modified and executed, populate all fields from real before/after execution results. For IGNORE-scored tests (not modified or executed), derive \`beforeStatus\` from the \`skyramp_analyze_test_health\` health score (typically \`"Pass"\` if drift score is 0 and no health issues were flagged), set \`afterStatus\` to \`"Skipped"\`, and use \`afterDetails\` to explain why (e.g. "IGNORE: drift score 0 — endpoint not modified in this PR"). Do **not** add entries for tests that were not returned by the health analysis.
|
|
271
371
|
|
|
272
372
|
---
|
|
273
373
|
|
|
@@ -353,15 +453,19 @@ export function registerTestbotPrompt(server) {
|
|
|
353
453
|
uiCredentials: z
|
|
354
454
|
.string()
|
|
355
455
|
.optional()
|
|
356
|
-
.describe("Browser login credentials for UI test recording
|
|
456
|
+
.describe("Browser login credentials for UI test recording. One credential per line. Supported formats: 'username=<val>;password=<val>' or 'username=<val>;password=<val>;role=<val>' (role optional), or legacy 'username:password'. Note: = and ; are reserved delimiters in the new format and must not appear in values. Injected into the prompt as a <ui-credentials> block so the agent logs in before recording traces."),
|
|
357
457
|
workspaceValidationFailed: z
|
|
358
458
|
.boolean()
|
|
359
459
|
.default(false)
|
|
360
460
|
.describe("Set to true when the testbot detected that .skyramp/workspace.yml exists but failed schema validation. Instructs the agent to regenerate the workspace file before proceeding."),
|
|
461
|
+
testsRepoDir: z
|
|
462
|
+
.string()
|
|
463
|
+
.optional()
|
|
464
|
+
.describe("Absolute path to a cloned test repository. When set, the agent writes generated test files there instead of the app repository (cross-repo test delivery)."),
|
|
361
465
|
},
|
|
362
466
|
}, async (args) => {
|
|
363
467
|
const services = await readWorkspaceServices(args.repositoryPath);
|
|
364
|
-
let prompt = getTestbotPrompt(args.prTitle, args.prDescription, args.summaryOutputFile, args.repositoryPath, args.baseBranch, args.maxRecommendations, args.maxGenerate, args.maxCritical, args.prNumber, args.userPrompt, services.length ? services : undefined, args.stateOutputFile, args.uiCredentials);
|
|
468
|
+
let prompt = getTestbotPrompt(args.prTitle, args.prDescription, args.summaryOutputFile, args.repositoryPath, args.baseBranch, args.maxRecommendations, args.maxGenerate, args.maxCritical, args.prNumber, args.userPrompt, services.length ? services : undefined, args.stateOutputFile, args.uiCredentials, args.testsRepoDir);
|
|
365
469
|
if (args.workspaceValidationFailed) {
|
|
366
470
|
prompt = buildWorkspaceRecoveryPrefix(args.repositoryPath) + prompt;
|
|
367
471
|
}
|
|
@@ -231,6 +231,41 @@ describe("drift analysis inline embedding", () => {
|
|
|
231
231
|
expect(prompt).toContain("rules in `<drift_analysis_rules>`");
|
|
232
232
|
});
|
|
233
233
|
});
|
|
234
|
+
describe("Task 0 UI pre-scan (Phase C D-1.a)", () => {
|
|
235
|
+
it("emits the Task 0 UI pre-scan section in full-analysis mode", () => {
|
|
236
|
+
const prompt = getTestbotPrompt(baseArgs.prTitle, baseArgs.prDescription, baseArgs.summaryOutputFile, baseArgs.repositoryPath);
|
|
237
|
+
expect(prompt).toContain("## Task 0: UI Pre-Scan");
|
|
238
|
+
expect(prompt).toContain("-- '*.tsx' '*.jsx' '*.vue' '*.svelte' '*.html' '*.xml'");
|
|
239
|
+
expect(prompt).toContain("browser_blueprint");
|
|
240
|
+
});
|
|
241
|
+
it("places Task 0 before Task 1 (skyramp_analyze_changes)", () => {
|
|
242
|
+
const prompt = getTestbotPrompt(baseArgs.prTitle, baseArgs.prDescription, baseArgs.summaryOutputFile, baseArgs.repositoryPath);
|
|
243
|
+
const task0Idx = prompt.indexOf("## Task 0: UI Pre-Scan");
|
|
244
|
+
const task1Idx = prompt.indexOf("## Task 1: Analyze & Maintain");
|
|
245
|
+
expect(task0Idx).toBeGreaterThanOrEqual(0);
|
|
246
|
+
expect(task1Idx).toBeGreaterThan(task0Idx);
|
|
247
|
+
});
|
|
248
|
+
it("does not emit Task 0 in follow-up mode (userPrompt set)", () => {
|
|
249
|
+
// Call signature (14 positional args): prTitle, prDescription, summaryOutputFile,
|
|
250
|
+
// repositoryPath, baseBranch?, maxRecommendations?, maxGenerate?, _maxCritical?,
|
|
251
|
+
// prNumber?, userPrompt?, services?, stateOutputFile?, uiCredentials?, testsRepoDir?
|
|
252
|
+
const prompt = getTestbotPrompt(baseArgs.prTitle, baseArgs.prDescription, baseArgs.summaryOutputFile, baseArgs.repositoryPath, undefined, undefined, undefined, undefined, undefined, "add more tests");
|
|
253
|
+
expect(prompt).not.toContain("## Task 0: UI Pre-Scan");
|
|
254
|
+
});
|
|
255
|
+
it("Task 1 step 4 references Task 0; does not re-specify format", () => {
|
|
256
|
+
const prompt = getTestbotPrompt(baseArgs.prTitle, baseArgs.prDescription, baseArgs.summaryOutputFile, baseArgs.repositoryPath);
|
|
257
|
+
expect(prompt).toContain("UI Recommendation Grounding");
|
|
258
|
+
expect(prompt).toMatch(/UI recommendation grounding.*section/);
|
|
259
|
+
expect(prompt).toContain("Use the captured blueprints from Task 0");
|
|
260
|
+
// Negative half of test name: step 4 must NOT re-specify the tuple
|
|
261
|
+
// format (that lives in step 1's grounding section; duplicating would
|
|
262
|
+
// reintroduce the contradictory-instruction problem Phase C D-1.a fixes).
|
|
263
|
+
const step4Start = prompt.indexOf("4. **UI Recommendation Grounding**");
|
|
264
|
+
const step4End = prompt.indexOf("Legitimate outcome.", step4Start);
|
|
265
|
+
const step4Slice = prompt.slice(step4Start, step4End);
|
|
266
|
+
expect(step4Slice).not.toMatch(/role=<role>,\s*accessibleName=/);
|
|
267
|
+
});
|
|
268
|
+
});
|
|
234
269
|
describe("buildWorkspaceRecoveryPrefix", () => {
|
|
235
270
|
const { buildWorkspaceRecoveryPrefix } = require("./testbot-prompts.js");
|
|
236
271
|
it("includes repositoryPath in both init_scan and init_workspace instructions", () => {
|
|
@@ -248,3 +283,47 @@ describe("buildWorkspaceRecoveryPrefix", () => {
|
|
|
248
283
|
expect(prefix).toMatch(/^IMPORTANT:/);
|
|
249
284
|
});
|
|
250
285
|
});
|
|
286
|
+
describe("testsRepoDir in getTestbotPrompt", () => {
|
|
287
|
+
function callWithTestsRepoDir(testsRepoDir) {
|
|
288
|
+
return getTestbotPrompt(baseArgs.prTitle, baseArgs.prDescription, baseArgs.summaryOutputFile, baseArgs.repositoryPath, undefined, // baseBranch
|
|
289
|
+
undefined, // maxRecommendations
|
|
290
|
+
undefined, // maxGenerate
|
|
291
|
+
undefined, // maxCritical
|
|
292
|
+
undefined, // prNumber
|
|
293
|
+
undefined, // userPrompt
|
|
294
|
+
undefined, // services
|
|
295
|
+
undefined, // stateOutputFile
|
|
296
|
+
undefined, // uiCredentials
|
|
297
|
+
testsRepoDir);
|
|
298
|
+
}
|
|
299
|
+
function callFollowUpWithTestsRepoDir(testsRepoDir) {
|
|
300
|
+
return getTestbotPrompt(baseArgs.prTitle, baseArgs.prDescription, baseArgs.summaryOutputFile, baseArgs.repositoryPath, undefined, // baseBranch
|
|
301
|
+
undefined, // maxRecommendations
|
|
302
|
+
undefined, // maxGenerate
|
|
303
|
+
undefined, // maxCritical
|
|
304
|
+
undefined, // prNumber
|
|
305
|
+
"add more tests", // userPrompt — triggers follow-up path
|
|
306
|
+
undefined, // services
|
|
307
|
+
undefined, // stateOutputFile
|
|
308
|
+
undefined, // uiCredentials
|
|
309
|
+
testsRepoDir);
|
|
310
|
+
}
|
|
311
|
+
it("includes testsRepoDir in skyramp_analyze_changes call for first-run prompt", () => {
|
|
312
|
+
const dir = "/home/runner/work/_temp/skyramp/test-repo";
|
|
313
|
+
const prompt = callWithTestsRepoDir(dir);
|
|
314
|
+
expect(prompt).toContain(`\`testsRepoDir\`: "${dir}"`);
|
|
315
|
+
});
|
|
316
|
+
it("includes testsRepoDir in skyramp_analyze_changes call for follow-up prompt", () => {
|
|
317
|
+
const dir = "/home/runner/work/_temp/skyramp/test-repo";
|
|
318
|
+
const prompt = callFollowUpWithTestsRepoDir(dir);
|
|
319
|
+
expect(prompt).toContain(`\`testsRepoDir\`: "${dir}"`);
|
|
320
|
+
});
|
|
321
|
+
it("omits testsRepoDir when not provided", () => {
|
|
322
|
+
const prompt = callWithTestsRepoDir(undefined);
|
|
323
|
+
expect(prompt).not.toContain("testsRepoDir");
|
|
324
|
+
});
|
|
325
|
+
it("omits testsRepoDir from follow-up prompt when not provided", () => {
|
|
326
|
+
const prompt = callFollowUpWithTestsRepoDir(undefined);
|
|
327
|
+
expect(prompt).not.toContain("testsRepoDir");
|
|
328
|
+
});
|
|
329
|
+
});
|
|
@@ -25,7 +25,7 @@ export function registerTestbotResource(server) {
|
|
|
25
25
|
const maxCrit = parseInt(uri.searchParams.get("maxCritical") || "", 10);
|
|
26
26
|
const repositoryPath = param("repositoryPath", ".");
|
|
27
27
|
const services = await readWorkspaceServices(repositoryPath);
|
|
28
|
-
const prompt = getTestbotPrompt(param("prTitle", ""), param("prDescription", ""), param("summaryOutputFile", ""), repositoryPath, uri.searchParams.get("baseBranch") || undefined, isNaN(maxRec) ? MAX_RECOMMENDATIONS : maxRec, isNaN(maxGen) ? MAX_TESTS_TO_GENERATE : maxGen, isNaN(maxCrit) ? MAX_CRITICAL_TESTS : maxCrit, isNaN(prNum) ? undefined : prNum, uri.searchParams.get("userPrompt") || undefined, services.length ? services : undefined, uri.searchParams.get("stateOutputFile") || undefined, uri.searchParams.get("uiCredentials") || undefined);
|
|
28
|
+
const prompt = getTestbotPrompt(param("prTitle", ""), param("prDescription", ""), param("summaryOutputFile", ""), repositoryPath, uri.searchParams.get("baseBranch") || undefined, isNaN(maxRec) ? MAX_RECOMMENDATIONS : maxRec, isNaN(maxGen) ? MAX_TESTS_TO_GENERATE : maxGen, isNaN(maxCrit) ? MAX_CRITICAL_TESTS : maxCrit, isNaN(prNum) ? undefined : prNum, uri.searchParams.get("userPrompt") || undefined, services.length ? services : undefined, uri.searchParams.get("stateOutputFile") || undefined, uri.searchParams.get("uiCredentials") || undefined, uri.searchParams.get("testsRepoDir") || undefined);
|
|
29
29
|
AnalyticsService.pushMCPToolEvent("skyramp_testbot_prompt", undefined, {}).catch(() => { });
|
|
30
30
|
// Return the original URI — clients may use it to re-fetch the resource,
|
|
31
31
|
// and the caller already has these params. Credentials never appear in
|