@trygentic/agentloop 0.19.0-alpha.11 → 0.21.0-alpha.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,854 @@
1
+ {
2
+ "name": "qa-web-tester-continuous-agent-tree",
3
+ "description": "Continuous behavior tree for the QA Web Tester agent. Loops forever, waiting for task assignments from the orchestrator. Validates web application UI by navigating pages with Playwright, testing user flows, checking responsive layouts, monitoring console errors and network requests, and capturing screenshots as evidence.",
4
+ "version": "1.0.0",
5
+ "mode": "reactive",
6
+ "tree": {
7
+ "type": "root",
8
+ "child": {
9
+ "type": "sequence",
10
+ "comment": "Main continuous loop - never exits unless agent is stopped",
11
+ "children": [
12
+ {
13
+ "type": "action",
14
+ "call": "WaitForTask",
15
+ "comment": "Block until orchestrator assigns a task via ContinuousAgentRunner.assignTask()"
16
+ },
17
+ {
18
+ "type": "action",
19
+ "call": "FetchTaskContext",
20
+ "comment": "Load task details, comments, and engineer completion info"
21
+ },
22
+ {
23
+ "type": "action",
24
+ "call": "LoadProjectSpecifications",
25
+ "comment": "Load specification documents from .agentloop/specifications/ so QA can validate against project requirements"
26
+ },
27
+ {
28
+ "type": "selector",
29
+ "comment": "Summarize project specifications if available (non-critical: skip if no specs)",
30
+ "children": [
31
+ {
32
+ "type": "sequence",
33
+ "children": [
34
+ {
35
+ "type": "condition",
36
+ "call": "HasProjectSpecifications",
37
+ "comment": "Only summarize if specifications were loaded"
38
+ },
39
+ {
40
+ "type": "llm-action",
41
+ "name": "SummarizeProjectSpecifications",
42
+ "prompt": "Distill the following project specification documents into a compact structured summary focused on UI/UX requirements. Extract ONLY what is explicitly stated.\n\n## Raw Specifications\n{{projectSpecifications}}\n\n## Output Format\nProduce a structured summary covering ONLY sections that have explicit information:\n\n### Technology Stack\nList every explicitly named frontend technology, framework, UI library. Example: 'Next.js 14 App Router', 'Tailwind CSS', 'shadcn/ui'\n\n### Pages and Routes\nList every page path, route, or URL mentioned. Example: '/login', '/dashboard', '/settings'\n\n### UI Components\nList every UI component, form, or interactive element mentioned.\n\n### User Flows\nDescribe expected user flows (login, checkout, etc.) with steps.\n\n### Visual Requirements\nColors, fonts, breakpoints, responsive behavior, dark mode requirements.\n\n### Acceptance Criteria\nTestable UI/UX success conditions from the specs.\n\nBe exhaustive on details but terse on prose. Use bullet points.",
43
+ "contextKeys": ["projectSpecifications"],
44
+ "outputSchema": {
45
+ "type": "object",
46
+ "properties": {
47
+ "summary": {
48
+ "type": "string",
49
+ "description": "Structured summary of project specifications focused on UI/UX"
50
+ }
51
+ },
52
+ "required": ["summary"]
53
+ },
54
+ "outputKey": "projectSpecSummary",
55
+ "temperature": 0.1,
56
+ "allowedTools": []
57
+ }
58
+ ]
59
+ },
60
+ {
61
+ "type": "action",
62
+ "call": "NoOp",
63
+ "comment": "Continue without summarization if no specs or summarization fails"
64
+ }
65
+ ]
66
+ },
67
+ {
68
+ "type": "selector",
69
+ "comment": "Check for incoming agent messages (non-critical: continue even if unavailable)",
70
+ "children": [
71
+ {
72
+ "type": "action",
73
+ "call": "CheckIncomingMessages",
74
+ "comment": "Poll for messages from other agents (coordination, queries, notifications)"
75
+ },
76
+ {
77
+ "type": "action",
78
+ "call": "NoOp",
79
+ "comment": "Continue without message checking if messaging is unavailable"
80
+ }
81
+ ]
82
+ },
83
+ {
84
+ "type": "selector",
85
+ "comment": "Notify other agents that web QA review is starting (non-critical)",
86
+ "children": [
87
+ {
88
+ "type": "action",
89
+ "call": "SendTaskStartNotification",
90
+ "comment": "Broadcast to other agents that web QA is starting review of this task"
91
+ },
92
+ {
93
+ "type": "action",
94
+ "call": "NoOp",
95
+ "comment": "Continue without notification if messaging is unavailable"
96
+ }
97
+ ]
98
+ },
99
+ {
100
+ "type": "selector",
101
+ "comment": "Main web QA flow with failure handling",
102
+ "children": [
103
+ {
104
+ "type": "sequence",
105
+ "comment": "Main web QA testing sequence",
106
+ "children": [
107
+ {
108
+ "type": "action",
109
+ "call": "ExtractTaskFiles",
110
+ "comment": "Extract task-specific file list from engineer's completion comment"
111
+ },
112
+ {
113
+ "type": "action",
114
+ "call": "GitDiff",
115
+ "comment": "Get git diff scoped to task-specific files when available"
116
+ },
117
+ {
118
+ "type": "selector",
119
+ "comment": "Check if there are changes to test (frontend filtering done at orchestrator level via triggerCondition)",
120
+ "children": [
121
+ {
122
+ "type": "sequence",
123
+ "comment": "No changes detected - pass task",
124
+ "children": [
125
+ {
126
+ "type": "flip",
127
+ "child": {
128
+ "type": "condition",
129
+ "call": "HasCodeChanges"
130
+ }
131
+ },
132
+ {
133
+ "type": "action",
134
+ "call": "AddNoChangesComment"
135
+ },
136
+ {
137
+ "type": "action",
138
+ "call": "ReportTriggerPass"
139
+ }
140
+ ]
141
+ },
142
+ {
143
+ "type": "sequence",
144
+ "comment": "Frontend changes detected - analyze and run web tests",
145
+ "children": [
146
+ {
147
+ "type": "action",
148
+ "call": "DetectProjectType"
149
+ },
150
+ {
151
+ "type": "llm-action",
152
+ "name": "AnalyzeChangesForWebTesting",
153
+ "prompt": "You are a web QA agent analyzing code changes to determine what web pages and user flows need testing.\n\n{{#if projectSpecSummary}}\n## Project Specification Summary\n{{projectSpecSummary}}\n\nValidate the implementation against these specifications. Identify which UI components, pages, and user flows from the specs are affected by the changes.\n{{/if}}\n\nTask: {{taskDescription}}\nGit Diff: {{gitDiff}}\nProject Info: {{projectInfo}}\n\nAnalyze the changes and identify:\n1. Which web pages are affected (route paths)\n2. Which UI components were modified\n3. Which user flows need testing (login, forms, navigation, etc.)\n4. Whether responsive layout testing is needed\n5. Whether API integration testing is needed\n6. The risk level of the changes\n7. Whether this is a component-only task — the engineer built or modified a component that is NOT yet wired into any page route in the current worktree. Check if any page/layout file imports and renders the component. If no page imports it, it's component-only.\n\nIMPORTANT: Focus on frontend/UI changes. If the changes are purely backend with no frontend impact, note that web testing may not be necessary.",
154
+ "contextKeys": [
155
+ "taskDescription",
156
+ "taskTitle",
157
+ "gitDiff",
158
+ "projectInfo",
159
+ "projectSpecifications",
160
+ "projectSpecSummary"
161
+ ],
162
+ "outputSchema": {
163
+ "type": "object",
164
+ "properties": {
165
+ "changesSummary": {
166
+ "type": "string",
167
+ "description": "Brief summary of what was changed"
168
+ },
169
+ "affectedPages": {
170
+ "type": "array",
171
+ "items": { "type": "string" },
172
+ "description": "URL paths of affected pages (e.g., /login, /dashboard)"
173
+ },
174
+ "affectedComponents": {
175
+ "type": "array",
176
+ "items": { "type": "string" },
177
+ "description": "UI components that were modified"
178
+ },
179
+ "userFlowsToTest": {
180
+ "type": "array",
181
+ "items": { "type": "string" },
182
+ "description": "User flows that need E2E testing"
183
+ },
184
+ "needsResponsiveTest": {
185
+ "type": "boolean",
186
+ "description": "Whether layout/responsive testing is needed"
187
+ },
188
+ "needsApiTest": {
189
+ "type": "boolean",
190
+ "description": "Whether API integration testing from the UI is needed"
191
+ },
192
+ "hasFrontendChanges": {
193
+ "type": "boolean",
194
+ "description": "Whether the changes include frontend/UI modifications"
195
+ },
196
+ "riskLevel": {
197
+ "type": "string",
198
+ "enum": ["low", "medium", "high"]
199
+ },
200
+ "isComponentOnlyTask": {
201
+ "type": "boolean",
202
+ "description": "True if the changes are isolated component implementations not yet wired into any page route (e.g., building a new component that isn't imported/rendered by any existing page). False if the component is already integrated into a routable page."
203
+ }
204
+ },
205
+ "required": [
206
+ "changesSummary",
207
+ "hasFrontendChanges",
208
+ "riskLevel",
209
+ "isComponentOnlyTask"
210
+ ]
211
+ },
212
+ "outputKey": "webChangeAnalysis",
213
+ "temperature": 0.3,
214
+ "allowedTools": []
215
+ },
216
+ {
217
+ "type": "selector",
218
+ "comment": "Branch: no frontend changes → approve early, otherwise → run web tests",
219
+ "children": [
220
+ {
221
+ "type": "sequence",
222
+ "comment": "No frontend changes detected by LLM analysis - skip web testing and approve",
223
+ "children": [
224
+ {
225
+ "type": "llm-condition",
226
+ "name": "NoFrontendChangesDetected",
227
+ "prompt": "Based on the web change analysis, did the LLM determine there are NO frontend/UI changes to test?\n\nWeb Change Analysis: {{webChangeAnalysis}}\n\nReturn true if hasFrontendChanges is false in the analysis (meaning the changes are purely backend with no UI impact). Return false if there ARE frontend changes to test.",
228
+ "contextKeys": ["webChangeAnalysis"],
229
+ "confidenceThreshold": 0.7,
230
+ "fallbackValue": false,
231
+ "allowedTools": []
232
+ },
233
+ {
234
+ "type": "action",
235
+ "call": "AddNoFrontendChangesComment",
236
+ "comment": "Document that no frontend changes were found, skipping web tests"
237
+ },
238
+ {
239
+ "type": "action",
240
+ "call": "ReportTriggerPass",
241
+ "comment": "Approve without running web tests"
242
+ }
243
+ ]
244
+ },
245
+ {
246
+ "type": "sequence",
247
+ "comment": "Frontend changes detected - proceed with web testing",
248
+ "children": [
249
+ {
250
+ "type": "action",
251
+ "call": "InstallDependencies",
252
+ "comment": "Install project dependencies before starting dev server"
253
+ },
254
+ {
255
+ "type": "llm-action",
256
+ "name": "PlanWebTests",
257
+ "prompt": "Create a comprehensive test plan for browser-based E2E testing of this web application. Follow the test scenario categories and scenario specification format from your system instructions.\n\n{{#if projectSpecSummary}}\n## Project Specification Summary\n{{projectSpecSummary}}\n{{/if}}\n\n{{#if projectSpecifications}}\n## Full Project Specifications\n{{projectSpecifications}}\n{{/if}}\n\nTask: {{taskDescription}}\nChange Analysis: {{webChangeAnalysis}}\nProject Info: {{projectInfo}}\nGit Diff: {{gitDiff}}\n\n## Requirements\n\n## Component-Only Tasks\nIf the change analysis indicates `isComponentOnlyTask` is true, do NOT plan E2E page navigation scenarios. Instead, plan:\n1. **Build Validation**: Verify the project builds successfully with the new component (`npm run build` or equivalent)\n2. **Component Import Check**: Verify the component can be imported without errors\n3. **Dev Server Health**: Start the dev server and verify it doesn't crash (navigate to any existing page to confirm no regressions)\n\nFor component-only tasks, set scenario priorities to \"medium\" (not \"critical\") since these are build-validation checks, not full E2E tests. The scenarios should NOT attempt to navigate to a page where the component is rendered, since no such page exists yet.\n\nYou MUST create a MINIMUM of 3 test scenarios. Each scenario must test a specific user flow derived from the task description and code changes. Include at least one scenario from each applicable category (happy path, error states, edge cases, responsive design).\n\n## Additional Rules:\n- Focus on testing the CHANGED functionality, not the entire app\n- Include at least one responsive test scenario if layout changes were made\n- Include console error checks for every page load\n- Calculate the dev server port: PORT = 3000 + (taskId % 100)\n- Determine how to start the dev server (check package.json scripts)\n\nPrioritize scenarios by risk and impact.",
258
+ "contextKeys": [
259
+ "taskDescription",
260
+ "webChangeAnalysis",
261
+ "projectInfo",
262
+ "gitDiff",
263
+ "projectSpecifications",
264
+ "projectSpecSummary"
265
+ ],
266
+ "outputSchema": {
267
+ "type": "object",
268
+ "properties": {
269
+ "devServerCommand": {
270
+ "type": "string",
271
+ "description": "Command to start the dev server (e.g., 'npm run dev', 'next dev')"
272
+ },
273
+ "devServerPort": {
274
+ "type": "number",
275
+ "description": "Port to run the dev server on (3000 + taskId % 100)"
276
+ },
277
+ "baseUrl": {
278
+ "type": "string",
279
+ "description": "Base URL for testing (e.g., http://localhost:3028)"
280
+ },
281
+ "scenarios": {
282
+ "type": "array",
283
+ "items": {
284
+ "type": "object",
285
+ "properties": {
286
+ "name": {
287
+ "type": "string",
288
+ "description": "Scenario name"
289
+ },
290
+ "priority": {
291
+ "type": "string",
292
+ "enum": ["critical", "high", "medium", "low"]
293
+ },
294
+ "pages": {
295
+ "type": "array",
296
+ "items": { "type": "string" },
297
+ "description": "URL paths to visit"
298
+ },
299
+ "steps": {
300
+ "type": "array",
301
+ "items": { "type": "string" },
302
+ "description": "Interaction steps to perform"
303
+ },
304
+ "verifications": {
305
+ "type": "array",
306
+ "items": { "type": "string" },
307
+ "description": "What to verify after each step"
308
+ },
309
+ "expectedResults": {
310
+ "type": "array",
311
+ "items": { "type": "string" },
312
+ "description": "Concrete expected outcomes (e.g., 'Form shows success message', 'Error toast appears with message X')"
313
+ },
314
+ "viewports": {
315
+ "type": "array",
316
+ "items": { "type": "string" },
317
+ "description": "Viewport sizes to test (desktop, tablet, mobile)"
318
+ }
319
+ },
320
+ "required": ["name", "priority", "steps", "verifications", "expectedResults"]
321
+ }
322
+ }
323
+ },
324
+ "required": ["devServerCommand", "devServerPort", "baseUrl", "scenarios"]
325
+ },
326
+ "outputKey": "webTestPlan",
327
+ "temperature": 0.3,
328
+ "allowedTools": []
329
+ },
330
+ {
331
+ "type": "retry",
332
+ "name": "RetryOnEnvironmentFailure",
333
+ "comment": "Retry dev server startup + test execution if environment fails (e.g., server crash, port conflict). On first pass, FixWebEnvironment is a no-op. On retry, it kills stale processes and clears state.",
334
+ "attempts": 2,
335
+ "child": {
336
+ "type": "sequence",
337
+ "name": "ExecuteWithEnvRecovery",
338
+ "children": [
339
+ {
340
+ "type": "action",
341
+ "call": "FixWebEnvironment",
342
+ "comment": "No-op on first pass, fixes env on retry (kills port, clears state)"
343
+ },
344
+ {
345
+ "type": "llm-action",
346
+ "name": "StartDevServer",
347
+ "prompt": "Start the dev server for the web application so it can be tested.\n\nTest Plan: {{webTestPlan}}\nProject Info: {{projectInfo}}\nTask ID: {{currentTaskId}}\n\n## Instructions\n\n1. Create the screenshot output directory for organized storage:\n ```bash\n mkdir -p .agentloop/screenshots/task-{{currentTaskId}}/\n ```\n\n2. FIRST, kill any stale process already on the target port to prevent conflicts:\n ```bash\n lsof -ti:{{webTestPlan.devServerPort}} | xargs kill -9 2>/dev/null || true\n ```\n Wait 1 second after killing to ensure the port is freed.\n\n3. Start the dev server in the background using STRICT PORT mode to prevent silent port auto-increment.\n CRITICAL: You MUST use strict port flags so the server FAILS instead of silently switching to another port.\n Redirect stdout+stderr to a log file so you can verify the actual port from the server output.\n ```bash\n cd <project-dir> && PORT={{webTestPlan.devServerPort}} <command> --strict-port > /tmp/devserver-{{currentTaskId}}.log 2>&1 &\n ```\n\n Framework-specific strict port flags (use the appropriate one):\n - Vite: `--strict-port` (makes Vite error out if port is taken instead of auto-incrementing)\n - Next.js: Does not auto-increment by default, but verify with `--port {{webTestPlan.devServerPort}}`\n - Create React App: `PORT={{webTestPlan.devServerPort}}` env var (does not auto-increment)\n - Generic: Always set both the PORT env var AND any CLI port flag\n\n4. Wait for the server to be ready by polling with curl (up to 60 seconds):\n ```bash\n for i in $(seq 1 30); do curl -s -o /dev/null -w '%{http_code}' http://localhost:{{webTestPlan.devServerPort}} | grep -q '200\\|301\\|302\\|304' && break || sleep 2; done\n ```\n\n5. If the server fails to start (e.g., port taken error with --strict-port), check the log:\n ```bash\n cat /tmp/devserver-{{currentTaskId}}.log\n ```\n If it failed because the port was taken, kill the process on that port and retry from step 3.\n\n6. 
If the primary command fails, try common alternatives (ALWAYS with strict port):\n - `npm run dev -- --port {{webTestPlan.devServerPort}} --strict-port`\n - `npx next dev -p {{webTestPlan.devServerPort}}`\n - `PORT={{webTestPlan.devServerPort}} npm start`\n\n7. VERIFY the server is running on the CORRECT port by checking the server's log output:\n ```bash\n cat /tmp/devserver-{{currentTaskId}}.log | grep -i 'port\\|localhost\\|listening\\|ready\\|http://'\n ```\n Confirm the logged URL/port matches {{webTestPlan.devServerPort}}.\n\n8. As a final check, verify with lsof:\n ```bash\n lsof -i :{{webTestPlan.devServerPort}} -P -n | head -5\n ```\n Store the confirmed port in your output as `port`.\n\nIMPORTANT:\n- Do NOT write any test scripts. Your ONLY job is to start the dev server.\n- Do NOT install playwright or any browser testing tools.\n- Do NOT run `npx playwright install` or similar commands.\n- Just start the dev server and confirm it is running.\n- NEVER trust lsof alone to verify the port — always check the server's stdout log first.",
348
+ "contextKeys": [
349
+ "webTestPlan",
350
+ "projectInfo",
351
+ "currentTaskId"
352
+ ],
353
+ "outputSchema": {
354
+ "type": "object",
355
+ "properties": {
356
+ "serverStarted": {
357
+ "type": "boolean",
358
+ "description": "Whether the dev server started successfully"
359
+ },
360
+ "serverUrl": {
361
+ "type": "string",
362
+ "description": "The URL the dev server is running on"
363
+ },
364
+ "serverPid": {
365
+ "type": ["number", "null"],
366
+ "description": "PID of the dev server process"
367
+ },
368
+ "port": {
369
+ "type": "number",
370
+ "description": "The confirmed port number the dev server is listening on (verified via lsof)"
371
+ },
372
+ "startupDetails": {
373
+ "type": "string",
374
+ "description": "Details about how the server was started"
375
+ }
376
+ },
377
+ "required": ["serverStarted", "serverUrl", "startupDetails"]
378
+ },
379
+ "outputKey": "devServerStatus",
380
+ "temperature": 0.1,
381
+ "subagent": "qa-web-tester",
382
+ "maxTurns": 30,
383
+ "allowedTools": [
384
+ "bash",
385
+ "read",
386
+ "glob",
387
+ "grep"
388
+ ]
389
+ },
390
+ {
391
+ "type": "action",
392
+ "call": "InitializeScenarioResults",
393
+ "comment": "Initialize webTestResults with empty scenarioResults array before per-scenario loop"
394
+ },
395
+ {
396
+ "type": "action",
397
+ "call": "CreatePlaywrightScreenshotDir",
398
+ "comment": "Create .agentloop/screenshots/task-{taskId}/ directory before scenarios run to prevent ENOENT errors on screenshot saves"
399
+ },
400
+ {
401
+ "type": "forEach",
402
+ "name": "IterateScenarios",
403
+ "comment": "Per-scenario loop: executes one LLM call per scenario instead of one monolithic call for all scenarios",
404
+ "collection": "webTestPlan.scenarios",
405
+ "itemKey": "currentScenario",
406
+ "indexKey": "currentScenarioIndex",
407
+ "continueOnFailure": true,
408
+ "child": {
409
+ "type": "sequence",
410
+ "children": [
411
+ {
412
+ "type": "llm-action",
413
+ "name": "ExecuteSingleScenario",
414
+ "subagent": "qa-web-tester",
415
+ "maxTurns": 20,
416
+ "temperature": 0.2,
417
+ "contextKeys": [
418
+ "currentScenario",
419
+ "currentScenarioIndex",
420
+ "devServerStatus",
421
+ "currentTaskId",
422
+ "webTestPlan",
423
+ "projectSpecSummary",
424
+ "webChangeAnalysis",
425
+ "gitDiff"
426
+ ],
427
+ "outputKey": "currentScenarioResult",
428
+ "outputSchema": {
429
+ "type": "object",
430
+ "properties": {
431
+ "name": { "type": "string", "description": "Scenario name" },
432
+ "passed": { "type": "boolean", "description": "Whether the scenario passed" },
433
+ "details": { "type": "string", "description": "Detailed results and observations" },
434
+ "screenshotPaths": { "type": "array", "items": { "type": "string" }, "description": "Paths to screenshots taken" },
435
+ "consoleErrors": { "type": "array", "items": { "type": "string" }, "description": "Console errors observed" },
436
+ "networkFailures": { "type": "array", "items": { "type": "string" }, "description": "Network request failures" }
437
+ },
438
+ "required": ["name", "passed", "details"]
439
+ },
440
+ "allowedTools": [
441
+ "mcp__playwright__browser_navigate",
442
+ "mcp__playwright__browser_navigate_back",
443
+ "mcp__playwright__browser_click",
444
+ "mcp__playwright__browser_hover",
445
+ "mcp__playwright__browser_drag",
446
+ "mcp__playwright__browser_type",
447
+ "mcp__playwright__browser_fill_form",
448
+ "mcp__playwright__browser_select_option",
449
+ "mcp__playwright__browser_press_key",
450
+ "mcp__playwright__browser_take_screenshot",
451
+ "mcp__playwright__browser_snapshot",
452
+ "mcp__playwright__browser_console_messages",
453
+ "mcp__playwright__browser_network_requests",
454
+ "mcp__playwright__browser_wait_for",
455
+ "mcp__playwright__browser_resize",
456
+ "mcp__playwright__browser_close",
457
+ "mcp__playwright__browser_handle_dialog",
458
+ "mcp__playwright__browser_evaluate",
459
+ "mcp__playwright__browser_file_upload",
460
+ "mcp__playwright__browser_tabs"
461
+ ],
462
+ "prompt": "Execute ONE test scenario using Playwright MCP browser tools.\n\n## Your Scenario\n{{currentScenario}}\n(Scenario {{currentScenarioIndex}} of the test plan)\n\n## Dev Server\n{{devServerStatus}}\n\n## Task Context\nTask ID: {{currentTaskId}}\nChanges: {{webChangeAnalysis}}\n\n## Instructions\n1. Navigate to the dev server URL\n2. If this is the FIRST scenario (index 0), perform App Identity Verification per your system instructions\n3. Execute the test steps defined in your scenario\n4. Take a browser_take_screenshot at the END of the scenario, saving to `.agentloop/screenshots/task-{{currentTaskId}}/scenario-{{currentScenarioIndex}}-{descriptive-name}.png`\n5. Check browser_console_messages for errors after key interactions\n6. Report your findings in the output\n\n## CRITICAL: Context Budget\nYou have a LIMITED number of tool calls (max 20 turns). Be efficient:\n- Do NOT call browser_snapshot after every action. Only use it when you need to understand the page structure and cannot infer it from previous context.\n- Prefer browser_click/browser_type/browser_fill_form by using element references from the LAST snapshot rather than taking a new one.\n- Combine checks: check console_messages and take screenshot together at the end, not separately after each step.\n- If you run out of turns, report what you observed so far rather than leaving the scenario incomplete.\n\nFollow the Playwright Testing Guidelines in your system instructions for tool usage, resize workflow, and error checking."
463
+ },
464
+ {
465
+ "type": "action",
466
+ "call": "AccumulateScenarioResult",
467
+ "comment": "Append this scenario's result to webTestResults.scenarioResults"
468
+ }
469
+ ]
470
+ }
471
+ },
472
+ {
473
+ "type": "action",
474
+ "call": "AggregateScenarioResults",
475
+ "comment": "Calculate overallPassed and build summary from accumulated scenario results"
476
+ },
477
+ {
478
+ "type": "condition",
479
+ "call": "WebEnvironmentHealthy",
480
+ "comment": "Returns false if env failed (server crash, connection refused) → sequence fails → retry triggers. Returns true if healthy → continue to result evaluation."
481
+ }
482
+ ]
483
+ }
484
+ },
485
+ {
486
+ "type": "selector",
487
+ "comment": "2-way decision: critical failures → deterministic reject, otherwise → LLM evaluation",
488
+ "children": [
489
+ {
490
+ "type": "sequence",
491
+ "comment": "PATH 1: CRITICAL FAILURES — deterministic rejection, LLM cannot override. Catches build errors, HTTP 500, connection refused, all scenarios failed.",
492
+ "children": [
493
+ {
494
+ "type": "condition",
495
+ "call": "WebTestsHaveCriticalFailures",
496
+ "comment": "DETERMINISTIC: Detects build/compile failures, HTTP 500, ECONNREFUSED, all-scenarios-failed. Returns true if app is fundamentally broken."
497
+ },
498
+ {
499
+ "type": "action",
500
+ "call": "DocumentCriticalFailure",
501
+ "comment": "Write deterministic rejection comment and set rejectionDetails on blackboard"
502
+ },
503
+ {
504
+ "type": "action",
505
+ "call": "AddRejectionComment",
506
+ "comment": "Post the rejection comment to the task"
507
+ },
508
+ {
509
+ "type": "action",
510
+ "call": "ReportTriggerFail",
511
+ "comment": "Move task back to todo for engineer to fix build/compilation errors"
512
+ }
513
+ ]
514
+ },
515
+ {
516
+ "type": "sequence",
517
+ "comment": "PATH 2: No critical failures — LLM evaluates test results (can distinguish pre-existing vs task-related)",
518
+ "children": [
519
+ {
520
+ "type": "selector",
521
+ "comment": "Try LLM evaluation, fall back to deterministic analysis if LLM fails",
522
+ "children": [
523
+ {
524
+ "type": "llm-action",
525
+ "name": "EvaluateWebTestResults",
526
+ "prompt": "You are the QA web tester. Evaluate the web test results and decide whether to APPROVE or REJECT the engineer's changes.\n\n{{#if projectSpecSummary}}\n## Project Specification Summary\n{{projectSpecSummary}}\n{{/if}}\n\nTask: {{taskDescription}}\nPlanned Test Scenarios: {{webTestPlan}}\nWeb Test Results: {{webTestResults}}\nGit Diff: {{gitDiff}}\nTask Files: {{taskFiles}}\n\nYou MUST set the 'approved' field to control whether the task passes or fails. This field directly controls the BT flow — there is no separate judgment step.\n\nCRITICAL RULES FOR TEST COVERAGE COMPLETENESS:\n- Compare the number of executed scenarios in webTestResults against the planned scenarios in webTestPlan.\n- If fewer than 75% of planned scenarios were actually executed, set approved=false with rejectionReason explaining incomplete test coverage.\n- Missing responsive/viewport tests when the plan included them is a coverage gap — do NOT approve.\n- A high confidence score does NOT override incomplete coverage. If scenarios are missing, reject.\n\nCRITICAL RULES FOR THE 'approved' FIELD:\n1. Set approved=true ONLY when 'overallPassed' is true in webTestResults AND scenarios actually executed with real results showing success.\n2. Pre-existing issues (bugs in pages the engineer did NOT modify) do NOT count as failures. Set approved=true even if overallPassed is false, as long as ALL failures are pre-existing.\n3. Set approved=false if there are actual task-related UI failures directly caused by the engineer's changes.\n4. Console warnings from third-party libraries are NOT failures. Only critical JS errors in the engineer's code count.\n5. Check the 'classification' field on each scenarioResult: 'pre-existing' and 'environment' failures are NOT the engineer's fault.\n6. If some scenarios passed and some failed, check whether the failures are in pages/components the engineer actually modified (compare with gitDiff and taskFiles).\n7. When in doubt, set approved=false. 
It is SAFER to reject and re-review than to approve broken code.\n\nFAILURE CLASSIFICATION (CRITICAL — you MUST classify every failure):\nFor EACH failed scenario, you MUST classify it as one of:\n1. 'task-related': The failure is caused by code the engineer changed or added (appears in gitDiff/taskFiles).\n2. 'pre-existing': The failure is in a page/component the engineer did NOT modify. It existed before these changes.\n3. 'environment': Dev server issues, missing dependencies, connection refused, etc.\n\nPopulate the 'failures' array with ALL failures classified. Set the count fields:\n- taskRelatedFailures: count of 'task-related' failures\n- preExistingFailures: count of 'pre-existing' failures\n- environmentIssues: count of 'environment' failures\n\nIMPORTANT: If ALL failures are 'pre-existing' (taskRelatedFailures=0, preExistingFailures>0), set approved=true — the engineer's code is not at fault.\n\nCOMPONENT-ONLY TASK RULES:\nIf webChangeAnalysis.isComponentOnlyTask is true:\n- The engineer built an isolated component not yet wired into any page route\n- Route navigation failures are EXPECTED and should be classified as 'environment' (not 'task-related')\n- Build success + dev server starts without crashing = PASS (set approved=true)\n- Only reject if the build itself fails or the dev server crashes due to the new component\n\nFOR THE 'confidence' FIELD:\n- Rate your confidence in your verdict from 0.0 to 1.0.\n- Use < 0.5 if the evidence is ambiguous or incomplete.\n- Use > 0.8 only when the evidence clearly supports your verdict with no ambiguity.\n\nFOR THE 'comment' FIELD:\n- If approved=true: Write a concise approval summary listing pages tested, viewports tested, console error status, and any pre-existing issues noted.\n- If approved=false: Write a detailed rejection with bug reports for each task-related failure including severity, page URL, steps to reproduce, expected vs actual behavior, screenshot paths, and console errors.\n\nFOR THE 
'rejectionReason' FIELD:\n- If approved=true: Set to null.\n- If approved=false: Briefly describe why the task is rejected.\n\nFOR THE 'bugs' FIELD:\n- If approved=true: Set to an empty array.\n- If approved=false: List each task-related bug with title, severity, page, steps to reproduce, expected/actual behavior.",
527
+ "contextKeys": [
528
+ "taskDescription",
529
+ "webTestPlan",
530
+ "webTestResults",
531
+ "webChangeAnalysis",
532
+ "gitDiff",
533
+ "taskFiles",
534
+ "projectSpecifications",
535
+ "projectSpecSummary"
536
+ ],
537
+ "outputSchema": {
538
+ "type": "object",
539
+ "properties": {
540
+ "approved": {
541
+ "type": "boolean",
542
+ "description": "true if engineer's changes pass web QA, false if task-related failures found. This field CONTROLS the BT flow."
543
+ },
544
+ "confidence": {
545
+ "type": "number",
546
+ "description": "Your confidence in the approval/rejection decision (0.0 to 1.0). Use lower values when evidence is ambiguous."
547
+ },
548
+ "comment": {
549
+ "type": "string",
550
+ "description": "Approval summary (if approved) or detailed rejection with bug reports (if rejected)"
551
+ },
552
+ "rejectionReason": {
553
+ "type": ["string", "null"],
554
+ "description": "Brief rejection reason (null if approved)"
555
+ },
556
+ "taskRelatedFailures": {
557
+ "type": ["number", "null"],
558
+ "description": "Number of failures caused by the engineer's changes"
559
+ },
560
+ "preExistingFailures": {
561
+ "type": ["number", "null"],
562
+ "description": "Number of failures that existed before the engineer's changes (in pages/components the engineer did NOT modify)"
563
+ },
564
+ "environmentIssues": {
565
+ "type": ["number", "null"],
566
+ "description": "Number of environment-related issues (dev server problems, missing deps, etc.)"
567
+ },
568
+ "failures": {
569
+ "type": "array",
570
+ "items": {
571
+ "type": "object",
572
+ "properties": {
573
+ "testName": {
574
+ "type": "string",
575
+ "description": "Name of the failed test scenario"
576
+ },
577
+ "error": {
578
+ "type": "string",
579
+ "description": "Error description"
580
+ },
581
+ "classification": {
582
+ "type": "string",
583
+ "enum": ["task-related", "pre-existing", "environment"],
584
+ "description": "Whether this failure is caused by the engineer's changes, pre-existing, or an environment issue"
585
+ },
586
+ "page": {
587
+ "type": ["string", "null"],
588
+ "description": "Page URL where the failure occurred"
589
+ }
590
+ },
591
+ "required": ["testName", "error", "classification"]
592
+ },
593
+ "description": "Classified list of all failures (task-related, pre-existing, and environment)"
594
+ },
595
+ "bugs": {
596
+ "type": "array",
597
+ "items": {
598
+ "type": "object",
599
+ "properties": {
600
+ "title": { "type": "string" },
601
+ "severity": {
602
+ "type": "string",
603
+ "enum": ["critical", "major", "minor"]
604
+ },
605
+ "page": { "type": "string" },
606
+ "stepsToReproduce": {
607
+ "type": "array",
608
+ "items": { "type": "string" }
609
+ },
610
+ "expected": { "type": "string" },
611
+ "actual": { "type": "string" },
612
+ "screenshotPath": { "type": ["string", "null"] },
613
+ "consoleErrors": {
614
+ "type": "array",
615
+ "items": { "type": "string" }
616
+ }
617
+ },
618
+ "required": ["title", "severity", "stepsToReproduce", "expected", "actual"]
619
+ },
620
+ "default": []
621
+ }
622
+ },
623
+ "required": ["approved", "confidence", "comment", "taskRelatedFailures", "preExistingFailures", "environmentIssues", "failures"]
624
+ },
625
+ "outputKey": "webTestVerdict",
626
+ "temperature": 0.3,
627
+ "allowedTools": []
628
+ },
629
+ {
630
+ "type": "action",
631
+ "call": "FallbackWebTestAnalysis",
632
+ "comment": "Fallback: deterministic analysis when EvaluateWebTestResults LLM fails (prompt too long, API error)"
633
+ }
634
+ ]
635
+ },
636
+ {
637
+ "type": "action",
638
+ "call": "ApplyConfidenceThreshold",
639
+ "comment": "Override approval to rejection if LLM confidence is below the 0.8 threshold"
640
+ },
641
+ {
642
+ "type": "selector",
643
+ "comment": "Route based on LLM verdict: approved=true → pass, approved=false → fail",
644
+ "children": [
645
+ {
646
+ "type": "sequence",
647
+ "comment": "APPROVED: LLM determined tests passed — run linter before final approval",
648
+ "children": [
649
+ {
650
+ "type": "condition",
651
+ "call": "WebTestsApproved",
652
+ "comment": "DETERMINISTIC: checks webTestVerdict.approved === true"
653
+ },
654
+ {
655
+ "type": "action",
656
+ "call": "AddApprovalComment"
657
+ },
658
+ {
659
+ "type": "action",
660
+ "call": "RunLinter",
661
+ "comment": "Run linter before final approval - succeeds silently if no linter found"
662
+ },
663
+ {
664
+ "type": "selector",
665
+ "comment": "Check lint results - pass or fail back to engineer",
666
+ "children": [
667
+ {
668
+ "type": "sequence",
669
+ "comment": "Lint passed (or no linter) - approve",
670
+ "children": [
671
+ {
672
+ "type": "condition",
673
+ "call": "LintPassed"
674
+ },
675
+ {
676
+ "type": "action",
677
+ "call": "ReportTriggerPass"
678
+ }
679
+ ]
680
+ },
681
+ {
682
+ "type": "sequence",
683
+ "comment": "Lint failed - send back to engineer with lint errors",
684
+ "children": [
685
+ {
686
+ "type": "condition",
687
+ "call": "LintFailed"
688
+ },
689
+ {
690
+ "type": "action",
691
+ "call": "AddLintFailureComment"
692
+ },
693
+ {
694
+ "type": "action",
695
+ "call": "ReportTriggerFail"
696
+ }
697
+ ]
698
+ }
699
+ ]
700
+ }
701
+ ]
702
+ },
703
+ {
704
+ "type": "sequence",
705
+ "comment": "ALL failures are pre-existing (unrelated to task) and block testing completely - escalate rather than bouncing to engineer",
706
+ "children": [
707
+ {
708
+ "type": "condition",
709
+ "call": "IsPreExistingBugBlocking",
710
+ "comment": "Check if ALL failures are pre-existing with 0 task-related failures"
711
+ },
712
+ {
713
+ "type": "action",
714
+ "call": "EscalatePreExistingBug",
715
+ "comment": "Create bug-fix task, add DAG dependency blocking original task, notify PM and merge-resolver"
716
+ },
717
+ {
718
+ "type": "action",
719
+ "call": "ReportTriggerFail",
720
+ "comment": "Move task to todo - the DAG dependency on the bug-fix task prevents the orchestrator from re-assigning until the fix is done. NOT a rejection of the engineer's work."
721
+ }
722
+ ]
723
+ },
724
+ {
725
+ "type": "sequence",
726
+ "comment": "REJECTED: LLM determined task-related failures",
727
+ "children": [
728
+ {
729
+ "type": "action",
730
+ "call": "AddRejectionComment"
731
+ },
732
+ {
733
+ "type": "action",
734
+ "call": "ReportTriggerFail"
735
+ }
736
+ ]
737
+ }
738
+ ]
739
+ }
740
+ ]
741
+ }
742
+ ]
743
+ }
744
+ ]
745
+ }
746
+ ]
747
+ }
748
+ ]
749
+ }
750
+ ]
751
+ }
752
+ ]
753
+ },
754
+ {
755
+ "type": "sequence",
756
+ "comment": "FAILURE HANDLER: Unexpected error during web QA",
757
+ "children": [
758
+ {
759
+ "type": "action",
760
+ "call": "AddQAFailureComment"
761
+ },
762
+ {
763
+ "type": "action",
764
+ "call": "ReportTriggerFail"
765
+ }
766
+ ]
767
+ }
768
+ ]
769
+ },
770
+ {
771
+ "type": "selector",
772
+ "comment": "Notify other agents that web QA review is complete (non-critical)",
773
+ "children": [
774
+ {
775
+ "type": "action",
776
+ "call": "SendTaskCompleteNotification",
777
+ "comment": "Broadcast to other agents that web QA has finished reviewing this task"
778
+ },
779
+ {
780
+ "type": "action",
781
+ "call": "NoOp",
782
+ "comment": "Continue without notification if messaging is unavailable"
783
+ }
784
+ ]
785
+ },
786
+ {
787
+ "type": "selector",
788
+ "comment": "Stop dev server to prevent port conflicts on next run (non-critical: continue even if kill fails)",
789
+ "children": [
790
+ {
791
+ "type": "action",
792
+ "call": "StopDevServer",
793
+ "comment": "Kill the dev server process and free the port"
794
+ },
795
+ {
796
+ "type": "action",
797
+ "call": "NoOp",
798
+ "comment": "Continue even if StopDevServer fails"
799
+ }
800
+ ]
801
+ },
802
+ {
803
+ "type": "action",
804
+ "call": "ClearTaskContext",
805
+ "comment": "Reset task-specific blackboard keys to prepare for next task assignment"
806
+ },
807
+ {
808
+ "type": "action",
809
+ "call": "Loop",
810
+ "comment": "Return RUNNING to restart the BT from the root - wait for next task"
811
+ }
812
+ ]
813
+ }
814
+ },
815
+ "blackboardDefaults": {
816
+ "webChangeAnalysis": null,
817
+ "webTestPlan": null,
818
+ "webTestResults": null,
819
+ "currentScenario": null,
820
+ "currentScenarioIndex": 0,
821
+ "currentScenarioResult": null,
822
+ "devServerStatus": null,
823
+ "approvalComment": null,
824
+ "rejectionDetails": null,
825
+ "webTestVerdict": null,
826
+ "gitDiff": null,
827
+ "projectInfo": null,
828
+ "projectSpecifications": null,
829
+ "projectSpecSummary": null,
830
+ "engineerTestSetup": null,
831
+ "requestedStatus": null,
832
+ "statusChangeReason": null,
833
+ "taskComments": null,
834
+ "taskDetails": null,
835
+ "taskFiles": null,
836
+ "currentTaskId": null,
837
+ "taskAssignedAt": null,
838
+ "loopCount": 0,
839
+ "qaBounceCount": 0,
840
+ "maxQABounces": 3,
841
+ "isQARejection": false,
842
+ "hasQAFeedback": false,
843
+ "incomingMessages": [],
844
+ "coordinationMessage": null,
845
+ "messageResponse": null,
846
+ "envRetryAttempted": false,
847
+ "lintResults": null,
848
+ "lintCommand": null,
849
+ "lintDetected": null,
850
+ "lintPassed": null,
851
+ "lintErrors": null,
852
+ "lintSource": null
853
+ }
854
+ }