@trygentic/agentloop 0.20.0-alpha.11 → 0.21.0-alpha.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,998 @@
1
+ {
2
+ "name": "qa-electron-tester-continuous-agent-tree",
3
+ "description": "Continuous behavior tree for the QA Electron Tester agent. Loops forever, waiting for task assignments from the orchestrator. Validates Electron desktop apps by checking startup health, launching renderer surfaces for Playwright-driven interaction, testing user flows, checking responsive layouts inside the renderer, monitoring console and network issues, and capturing screenshots as evidence.",
4
+ "version": "1.0.0",
5
+ "mode": "reactive",
6
+ "tree": {
7
+ "type": "root",
8
+ "child": {
9
+ "type": "sequence",
10
+ "comment": "Main continuous loop - never exits unless agent is stopped",
11
+ "children": [
12
+ {
13
+ "type": "action",
14
+ "call": "WaitForTask",
15
+ "comment": "Block until orchestrator assigns a task via ContinuousAgentRunner.assignTask()"
16
+ },
17
+ {
18
+ "type": "action",
19
+ "call": "FetchTaskContext",
20
+ "comment": "Load task details, comments, and engineer completion info"
21
+ },
22
+ {
23
+ "type": "action",
24
+ "call": "LoadProjectSpecifications",
25
+ "comment": "Load specification documents from .agentloop/specifications/ so QA can validate against project requirements"
26
+ },
27
+ {
28
+ "type": "selector",
29
+ "comment": "Summarize project specifications if available (non-critical: skip if no specs)",
30
+ "children": [
31
+ {
32
+ "type": "sequence",
33
+ "children": [
34
+ {
35
+ "type": "condition",
36
+ "call": "HasProjectSpecifications",
37
+ "comment": "Only summarize if specifications were loaded"
38
+ },
39
+ {
40
+ "type": "llm-action",
41
+ "name": "SummarizeProjectSpecifications",
42
+ "prompt": "Distill the following project specification documents into a compact structured summary focused on UI/UX requirements. Extract ONLY what is explicitly stated.\n\n## Raw Specifications\n{{projectSpecifications}}\n\n## Output Format\nProduce a structured summary covering ONLY sections that have explicit information:\n\n### Technology Stack\nList every explicitly named frontend technology, framework, UI library. Example: 'Next.js 14 App Router', 'Tailwind CSS', 'shadcn/ui'\n\n### Pages and Routes\nList every page path, route, or URL mentioned. Example: '/login', '/dashboard', '/settings'\n\n### UI Components\nList every UI component, form, or interactive element mentioned.\n\n### User Flows\nDescribe expected user flows (login, checkout, etc.) with steps.\n\n### Visual Requirements\nColors, fonts, breakpoints, responsive behavior, dark mode requirements.\n\n### Acceptance Criteria\nTestable UI/UX success conditions from the specs.\n\nBe exhaustive on details but terse on prose. Use bullet points.",
43
+ "contextKeys": ["projectSpecifications"],
44
+ "outputSchema": {
45
+ "type": "object",
46
+ "properties": {
47
+ "summary": {
48
+ "type": "string",
49
+ "description": "Structured summary of project specifications focused on UI/UX"
50
+ }
51
+ },
52
+ "required": ["summary"]
53
+ },
54
+ "outputKey": "projectSpecSummary",
55
+ "temperature": 0.1,
56
+ "allowedTools": []
57
+ }
58
+ ]
59
+ },
60
+ {
61
+ "type": "action",
62
+ "call": "NoOp",
63
+ "comment": "Continue without summarization if no specs or summarization fails"
64
+ }
65
+ ]
66
+ },
67
+ {
68
+ "type": "selector",
69
+ "comment": "Check for incoming agent messages (non-critical: continue even if unavailable)",
70
+ "children": [
71
+ {
72
+ "type": "action",
73
+ "call": "CheckIncomingMessages",
74
+ "comment": "Poll for messages from other agents (coordination, queries, notifications)"
75
+ },
76
+ {
77
+ "type": "action",
78
+ "call": "NoOp",
79
+ "comment": "Continue without message checking if messaging is unavailable"
80
+ }
81
+ ]
82
+ },
83
+ {
84
+ "type": "selector",
85
+ "comment": "Notify other agents that Electron QA review is starting (non-critical)",
86
+ "children": [
87
+ {
88
+ "type": "action",
89
+ "call": "SendTaskStartNotification",
90
+ "comment": "Broadcast to other agents that Electron QA is starting review of this task"
91
+ },
92
+ {
93
+ "type": "action",
94
+ "call": "NoOp",
95
+ "comment": "Continue without notification if messaging is unavailable"
96
+ }
97
+ ]
98
+ },
99
+ {
100
+ "type": "selector",
101
+ "comment": "Main Electron QA flow with failure handling",
102
+ "children": [
103
+ {
104
+ "type": "sequence",
105
+ "comment": "Main Electron QA testing sequence",
106
+ "children": [
107
+ {
108
+ "type": "action",
109
+ "call": "ExtractTaskFiles",
110
+ "comment": "Extract task-specific file list from engineer's completion comment"
111
+ },
112
+ {
113
+ "type": "action",
114
+ "call": "GitDiff",
115
+ "comment": "Get git diff scoped to task-specific files when available"
116
+ },
117
+ {
118
+ "type": "selector",
119
+ "comment": "Check if there are changes to test",
120
+ "children": [
121
+ {
122
+ "type": "sequence",
123
+ "comment": "No changes detected - pass task",
124
+ "children": [
125
+ {
126
+ "type": "flip",
127
+ "child": {
128
+ "type": "condition",
129
+ "call": "HasCodeChanges"
130
+ }
131
+ },
132
+ {
133
+ "type": "action",
134
+ "call": "AddNoChangesComment"
135
+ },
136
+ {
137
+ "type": "action",
138
+ "call": "ReportTriggerPass"
139
+ }
140
+ ]
141
+ },
142
+ {
143
+ "type": "sequence",
144
+ "comment": "Relevant changes detected - analyze and run Electron tests",
145
+ "children": [
146
+ {
147
+ "type": "action",
148
+ "call": "DetectProjectType"
149
+ },
150
+ {
151
+ "type": "llm-action",
152
+ "name": "AnalyzeChangesForWebTesting",
153
+ "prompt": "You are an Electron QA agent analyzing code changes to determine what startup paths, renderer views, and user flows need testing.\n\n{{#if projectSpecSummary}}\n## Project Specification Summary\n{{projectSpecSummary}}\n\nValidate the implementation against these specifications. Identify which UI components, windows, routes, and flows from the specs are affected by the changes.\n{{/if}}\n\nTask: {{taskDescription}}\nGit Diff: {{gitDiff}}\nProject Info: {{projectInfo}}\n\nFIRST decide whether this task actually targets an Electron app in the CURRENT worktree.\n\nTreat it as an Electron target ONLY when you have concrete evidence such as:\n- package scripts for electron, electron:dev, electron-vite, electron-forge, electron-builder\n- Electron main/preload files\n- BrowserWindow or app lifecycle code\n- explicit task text saying Electron desktop app/runtime\n\nTreat it as NOT requiring Electron QA when:\n- the task is docs/planning only\n- the task targets a generic web app or desktop web client without an Electron runtime in the worktree\n- there is no concrete Electron entrypoint evidence\n\nAnalyze the changes and identify:\n1. Which renderer routes, views, or windows are affected\n2. Which UI components were modified\n3. Which user flows need testing\n4. Whether startup validation is needed because main, preload, config, or boot code changed\n5. Whether responsive or constrained-window testing is needed\n6. Whether API integration testing is needed\n7. The risk level of the changes\n8. Whether this is a component-only task - the engineer built or modified a component that is NOT yet wired into any routable view in the current worktree. Check if any page, layout, view, or window imports and renders the component. If no reachable UI surface imports it, it is component-only.\n\nCRITICAL: Set hasFrontendChanges=true ONLY if the task requires actual Electron runtime QA in this worktree. 
For docs-only, planning-only, or non-Electron tasks, set hasFrontendChanges=false so the BT skips startup and browser testing.\n\nIMPORTANT: Never infer Electron status from the agent name alone.",
154
+ "contextKeys": [
155
+ "taskDescription",
156
+ "taskTitle",
157
+ "gitDiff",
158
+ "projectInfo",
159
+ "projectSpecifications",
160
+ "projectSpecSummary"
161
+ ],
162
+ "outputSchema": {
163
+ "type": "object",
164
+ "properties": {
165
+ "changesSummary": {
166
+ "type": "string",
167
+ "description": "Brief summary of what was changed"
168
+ },
169
+ "affectedPages": {
170
+ "type": "array",
171
+ "items": {
172
+ "type": "string"
173
+ },
174
+ "description": "URL paths of affected pages (e.g., /login, /dashboard)"
175
+ },
176
+ "affectedComponents": {
177
+ "type": "array",
178
+ "items": {
179
+ "type": "string"
180
+ },
181
+ "description": "UI components that were modified"
182
+ },
183
+ "userFlowsToTest": {
184
+ "type": "array",
185
+ "items": {
186
+ "type": "string"
187
+ },
188
+ "description": "User flows that need E2E testing"
189
+ },
190
+ "needsResponsiveTest": {
191
+ "type": "boolean",
192
+ "description": "Whether layout/responsive testing is needed"
193
+ },
194
+ "needsApiTest": {
195
+ "type": "boolean",
196
+ "description": "Whether API integration testing from the UI is needed"
197
+ },
198
+ "hasFrontendChanges": {
199
+ "type": "boolean",
200
+ "description": "Whether the task actually requires Electron runtime QA in this worktree"
201
+ },
202
+ "riskLevel": {
203
+ "type": "string",
204
+ "enum": ["low", "medium", "high"]
205
+ },
206
+ "isComponentOnlyTask": {
207
+ "type": "boolean",
208
+ "description": "True if the changes are isolated component implementations not yet wired into any page route (e.g., building a new component that isn't imported/rendered by any existing page). False if the component is already integrated into a routable page."
209
+ }
210
+ },
211
+ "required": [
212
+ "changesSummary",
213
+ "hasFrontendChanges",
214
+ "riskLevel",
215
+ "isComponentOnlyTask"
216
+ ]
217
+ },
218
+ "outputKey": "webChangeAnalysis",
219
+ "temperature": 0.3,
220
+ "allowedTools": []
221
+ },
222
+ {
223
+ "type": "selector",
224
+ "comment": "Branch: has Electron-visible changes \u2192 run Electron tests; no Electron-visible changes \u2192 approve early",
225
+ "children": [
226
+ {
227
+ "type": "sequence",
228
+ "comment": "Electron-visible changes detected - proceed with Electron testing",
229
+ "children": [
230
+ {
231
+ "type": "condition",
232
+ "call": "HasFrontendChangesInAnalysis",
233
+ "comment": "Deterministic check: run Electron tests when AnalyzeChangesForWebTesting returned hasFrontendChanges=true (or analysis is missing/invalid and we default to testing)"
234
+ },
235
+ {
236
+ "type": "action",
237
+ "call": "InstallDependencies",
238
+ "comment": "Install project dependencies before starting the Electron app"
239
+ },
240
+ {
241
+ "type": "llm-action",
242
+ "name": "PlanWebTests",
243
+ "prompt": "Create a comprehensive test plan for Electron QA. Follow the scenario categories and scenario specification format from your system instructions.\n\n{{#if projectSpecSummary}}\n## Project Specification Summary\n{{projectSpecSummary}}\n{{/if}}\n\n{{#if projectSpecifications}}\n## Full Project Specifications\n{{projectSpecifications}}\n{{/if}}\n\nTask: {{taskDescription}}\nChange Analysis: {{webChangeAnalysis}}\nProject Info: {{projectInfo}}\nGit Diff: {{gitDiff}}\n\n## Requirements\n\nThe plan must account for Electron startup, renderer navigation, and any visible symptoms of preload or IPC regressions.\n\n## Scenario Count Strategy (CRITICAL)\nClassify task complexity before planning:\n- Low-complexity scaffold/runtime-boundary work (entrypoint wiring, startup bootstrap, preload/main/renderer boundaries, foundation-only changes): plan EXACTLY 1 focused startup scenario. Use 2 only if there are two distinct user-visible surfaces that both changed.\n- Real UI feature work (new/changed user-facing flows, forms, navigation, settings, workflows): plan broader coverage, typically 3-6 scenarios based on risk.\n\n## Component-Only Tasks\nIf the change analysis indicates `isComponentOnlyTask` is true, do NOT plan unreachable end-to-end window navigation. Instead, plan:\n1. Build validation\n2. Component import or bundling check\n3. 
Electron startup health or renderer startup health to confirm the component does not break boot\n\n## Additional Rules:\n- Focus on the CHANGED functionality, not the entire app\n- Include at least one startup or boot validation scenario when Electron entry code changed\n- Include console error checks for every renderer load\n- Use task-based renderer port when a local renderer server exists: PORT = 3000 + (taskId % 100)\n- Determine the canonical Electron launch command from package.json scripts and config\n- Prefer a renderer URL that matches the started Electron app, not an unrelated local server\n\nPrioritize scenarios by risk and impact.",
244
+ "contextKeys": [
245
+ "taskDescription",
246
+ "webChangeAnalysis",
247
+ "projectInfo",
248
+ "gitDiff",
249
+ "projectSpecifications",
250
+ "projectSpecSummary"
251
+ ],
252
+ "outputSchema": {
253
+ "type": "object",
254
+ "properties": {
255
+ "devServerCommand": {
256
+ "type": "string",
257
+ "description": "Primary command to start the Electron app or Electron dev workflow"
258
+ },
259
+ "devServerPort": {
260
+ "type": "number",
261
+ "description": "Renderer port when the Electron app uses a local dev server (3000 + taskId % 100)"
262
+ },
263
+ "baseUrl": {
264
+ "type": "string",
265
+ "description": "Renderer URL for testing (for example http://localhost:3028)"
266
+ },
267
+ "scenarios": {
268
+ "type": "array",
269
+ "items": {
270
+ "type": "object",
271
+ "properties": {
272
+ "name": {
273
+ "type": "string",
274
+ "description": "Scenario name"
275
+ },
276
+ "priority": {
277
+ "type": "string",
278
+ "enum": ["critical", "high", "medium", "low"]
279
+ },
280
+ "pages": {
281
+ "type": "array",
282
+ "items": {
283
+ "type": "string"
284
+ },
285
+ "description": "Renderer routes, views, or URLs to visit"
286
+ },
287
+ "steps": {
288
+ "type": "array",
289
+ "items": {
290
+ "type": "string"
291
+ },
292
+ "description": "Interaction steps to perform"
293
+ },
294
+ "verifications": {
295
+ "type": "array",
296
+ "items": {
297
+ "type": "string"
298
+ },
299
+ "description": "What to verify after each step"
300
+ },
301
+ "expectedResults": {
302
+ "type": "array",
303
+ "items": {
304
+ "type": "string"
305
+ },
306
+ "description": "Concrete expected outcomes (e.g., 'Form shows success message', 'Error toast appears with message X')"
307
+ },
308
+ "viewports": {
309
+ "type": "array",
310
+ "items": {
311
+ "type": "string"
312
+ },
313
+ "description": "Viewport sizes to test (desktop, tablet, narrow-window)"
314
+ }
315
+ },
316
+ "required": [
317
+ "name",
318
+ "priority",
319
+ "steps",
320
+ "verifications",
321
+ "expectedResults"
322
+ ]
323
+ }
324
+ }
325
+ },
326
+ "required": [
327
+ "devServerCommand",
328
+ "devServerPort",
329
+ "baseUrl",
330
+ "scenarios"
331
+ ]
332
+ },
333
+ "outputKey": "webTestPlan",
334
+ "temperature": 0.3,
335
+ "allowedTools": []
336
+ },
337
+ {
338
+ "type": "action",
339
+ "call": "NormalizeWebTestPlan",
340
+ "comment": "Deterministically cap over-generated scenarios for low-complexity scaffold/runtime tasks before execution."
341
+ },
342
+ {
343
+ "type": "retry",
344
+ "name": "RetryOnEnvironmentFailure",
345
+ "comment": "Retry Electron app startup + test execution if environment fails. On first pass, FixWebEnvironment is a no-op. On retry, it kills stale processes and clears state.",
346
+ "attempts": 2,
347
+ "child": {
348
+ "type": "sequence",
349
+ "name": "ExecuteWithEnvRecovery",
350
+ "children": [
351
+ {
352
+ "type": "action",
353
+ "call": "FixWebEnvironment",
354
+ "comment": "No-op on first pass, fixes env on retry (kills port, clears state)"
355
+ },
356
+ {
357
+ "type": "selector",
358
+ "comment": "Try deterministic startup first; fall back to LLM startup only when deterministic verification fails or is inconclusive.",
359
+ "children": [
360
+ {
361
+ "type": "action",
362
+ "call": "StartElectronDeterministically",
363
+ "comment": "Programmatic startup probe returns FAILED when startup cannot be verified, so selector continues to StartDevServer fallback."
364
+ },
365
+ {
366
+ "type": "llm-action",
367
+ "name": "StartDevServer",
368
+ "comment": "Raised maxTurns for fallback startup diagnostics so log/port verification has room before truncation.",
369
+ "prompt": "Start the Electron app so it can be tested.\n\nTest Plan: {{webTestPlan}}\nProject Info: {{projectInfo}}\nTask ID: {{currentTaskId}}\n\n## Instructions\n\n1. Create the screenshot output directory:\n ```bash\n mkdir -p .agentloop/screenshots\n ```\n\n2. If a renderer port is expected, kill any stale process on that port first:\n ```bash\n lsof -ti:{{webTestPlan.devServerPort}} | xargs kill -9 2>/dev/null || true\n ```\n\n3. Start ONLY a canonical Electron workflow backed by evidence from the current project.\n Allowed evidence:\n - package.json scripts explicitly for Electron\n - Electron main/preload entrypoints\n - project docs/comments explicitly naming the launch command\n\n4. NEVER fall back to unrelated generic web-only commands such as `dev:web`, `vite`, `next dev`, or guessed app routes unless you have concrete evidence that they are the renderer half of the Electron workflow you already identified.\n\n5. NEVER invent routes like `/operations` or `/workspace`. Only use a renderer URL and route backed by project evidence.\n\n6. If the Electron workflow exposes an HTTP renderer URL, mark `serverStarted=true` ONLY if you VERIFIED that URL is reachable with curl or equivalent evidence AND the started process is still alive. If the port never listens or the process exits, set `serverStarted=false`.\n\n7. If the Electron app starts successfully but loads an embedded `data:`/`file:` renderer instead of a localhost URL, do NOT treat that as a startup failure. Set `serverStarted=true`, set `serverUrl` to `electron://embedded-renderer`, and explain that Playwright browser navigation is not possible in that transport mode.\n\n8. If no verified Electron workflow exists in the current worktree, set `serverStarted=false` and explain that Electron QA is not applicable instead of inventing a fallback web server.\n\n9. 
Capture enough startup details to prove what launched:\n - primary command used\n - renderer URL, if any\n - primary PID if discoverable\n - notable startup log lines\n\nIMPORTANT:\n- Do NOT write test scripts.\n- Do NOT install Playwright or browser tooling.\n- Do NOT treat a spawned background PID alone as success.\n- A listener on the expected renderer URL is required for success, except for the embedded `data:`/`file:` renderer case in step 7, where verified Electron boot evidence is required instead.",
370
+ "contextKeys": [
371
+ "webTestPlan",
372
+ "projectInfo",
373
+ "currentTaskId"
374
+ ],
375
+ "outputSchema": {
376
+ "type": "object",
377
+ "properties": {
378
+ "serverStarted": {
379
+ "type": "boolean",
380
+ "description": "Whether the Electron app or renderer startup succeeded"
381
+ },
382
+ "serverUrl": {
383
+ "type": "string",
384
+ "description": "The renderer URL used for Playwright testing"
385
+ },
386
+ "serverPid": {
387
+ "type": ["number", "null"],
388
+ "description": "PID of the primary Electron or startup process"
389
+ },
390
+ "port": {
391
+ "type": "number",
392
+ "description": "The confirmed renderer port number when applicable"
393
+ },
394
+ "startupDetails": {
395
+ "type": "string",
396
+ "description": "Details about how the server was started"
397
+ }
398
+ },
399
+ "required": [
400
+ "serverStarted",
401
+ "serverUrl",
402
+ "startupDetails"
403
+ ]
404
+ },
405
+ "outputKey": "devServerStatus",
406
+ "temperature": 0.1,
407
+ "subagent": "qa-electron-tester",
408
+ "maxTurns": 40,
409
+ "allowedTools": ["bash", "read", "glob", "grep"]
410
+ }
411
+ ]
412
+ },
413
+ {
414
+ "type": "condition",
415
+ "call": "WebEnvironmentHealthy",
416
+ "comment": "Startup gate before scenario loop. If startup verification failed, fail fast so retry triggers without executing all scenarios."
417
+ },
418
+ {
419
+ "type": "action",
420
+ "call": "InitializeScenarioResults",
421
+ "comment": "Initialize webTestResults with empty scenarioResults array before per-scenario loop"
422
+ },
423
+ {
424
+ "type": "action",
425
+ "call": "CreatePlaywrightScreenshotDir",
426
+ "comment": "Create .agentloop/screenshots directory before scenarios run to prevent ENOENT errors on screenshot saves"
427
+ },
428
+ {
429
+ "type": "forEach",
430
+ "name": "IterateScenarios",
431
+ "comment": "Per-scenario loop: executes one LLM call per scenario instead of one monolithic call for all scenarios",
432
+ "collection": "webTestPlan.scenarios",
433
+ "itemKey": "currentScenario",
434
+ "indexKey": "currentScenarioIndex",
435
+ "continueOnFailure": true,
436
+ "child": {
437
+ "type": "sequence",
438
+ "children": [
439
+ {
440
+ "type": "llm-action",
441
+ "name": "ExecuteSingleScenario",
442
+ "subagent": "qa-electron-tester",
443
+ "comment": "Raised maxTurns to reduce timeout/truncation risk for complex scenario execution (navigation, resize, console, and screenshot steps).",
444
+ "maxTurns": 28,
445
+ "temperature": 0.2,
446
+ "contextKeys": [
447
+ "currentScenario",
448
+ "currentScenarioIndex",
449
+ "devServerStatus",
450
+ "currentTaskId",
451
+ "playwrightScreenshotDir",
452
+ "webTestPlan",
453
+ "projectSpecSummary",
454
+ "webChangeAnalysis",
455
+ "gitDiff"
456
+ ],
457
+ "outputKey": "currentScenarioResult",
458
+ "outputSchema": {
459
+ "type": "object",
460
+ "properties": {
461
+ "name": {
462
+ "type": "string",
463
+ "description": "Scenario name"
464
+ },
465
+ "passed": {
466
+ "type": "boolean",
467
+ "description": "Whether the scenario passed"
468
+ },
469
+ "details": {
470
+ "type": "string",
471
+ "description": "Detailed results and observations"
472
+ },
473
+ "screenshotPaths": {
474
+ "type": "array",
475
+ "items": {
476
+ "type": "string"
477
+ },
478
+ "description": "Paths to screenshots taken"
479
+ },
480
+ "consoleErrors": {
481
+ "type": "array",
482
+ "items": {
483
+ "type": "string"
484
+ },
485
+ "description": "Console errors observed"
486
+ },
487
+ "networkFailures": {
488
+ "type": "array",
489
+ "items": {
490
+ "type": "string"
491
+ },
492
+ "description": "Network request failures"
493
+ }
494
+ },
495
+ "required": ["name", "passed", "details"]
496
+ },
497
+ "allowedTools": [
498
+ "mcp__playwright__browser_navigate",
499
+ "mcp__playwright__browser_navigate_back",
500
+ "mcp__playwright__browser_click",
501
+ "mcp__playwright__browser_hover",
502
+ "mcp__playwright__browser_drag",
503
+ "mcp__playwright__browser_type",
504
+ "mcp__playwright__browser_fill_form",
505
+ "mcp__playwright__browser_select_option",
506
+ "mcp__playwright__browser_press_key",
507
+ "mcp__playwright__browser_take_screenshot",
508
+ "mcp__playwright__browser_snapshot",
509
+ "mcp__playwright__browser_console_messages",
510
+ "mcp__playwright__browser_network_requests",
511
+ "mcp__playwright__browser_wait_for",
512
+ "mcp__playwright__browser_resize",
513
+ "mcp__playwright__browser_close",
514
+ "mcp__playwright__browser_handle_dialog",
515
+ "mcp__playwright__browser_evaluate",
516
+ "mcp__playwright__browser_file_upload",
517
+ "mcp__playwright__browser_tabs"
518
+ ],
519
+ "prompt": "Execute ONE Electron test scenario using Playwright MCP browser tools.\n\n## Your Scenario\n{{currentScenario}}\n(Scenario {{currentScenarioIndex}} of the test plan)\n\n## Startup Context\n{{devServerStatus}}\n\n## Task Context\nTask ID: {{currentTaskId}}\nChanges: {{webChangeAnalysis}}\n\n## Instructions\n1. FIRST inspect devServerStatus.\n2. If `serverStarted` is false, DO NOT call browser_navigate. Return a failed scenario with environment classification details explaining startup was not verified.\n3. If `serverUrl` is `electron://embedded-renderer` or startup context shows a `data:`/`file:` renderer, DO NOT call browser_navigate. Treat the run as startup-only validation mode: validate Electron boot evidence and renderer bootstrap boundaries without claiming browser coverage that is impossible in this transport.\n4. If startup is verified and the renderer URL is HTTP-reachable, navigate to the renderer URL.\n5. If this is the FIRST scenario (index 0), perform App Identity Verification per your system instructions when navigation occurs.\n6. If the loaded app is not the expected Electron target, stop and report failure instead of continuing.\n7. Execute the test steps defined in your scenario. In embedded-renderer startup-only mode, focus on startup/boot evidence and clearly mark any non-executable browser interactions as transport-limited rather than fabricating failures.\n8. Take a browser_take_screenshot at the END of the scenario when browser navigation actually occurred.\n9. Check browser_console_messages for errors after key interactions when browser navigation occurred.\n10. Report your findings, including visible symptoms of startup, preload, or IPC regression if any.\n11. If browser evidence shows non-Electron Chromium (`navigator.userAgent` lacks `Electron`), treat missing preload bridge globals as transport-limited unless startup logs explicitly show preload load errors. 
Do NOT fail that case as task-related.\n\n## CRITICAL: Context Budget\nYou have a LIMITED number of tool calls (max 28 turns). Be efficient:\n- Do NOT call browser_snapshot after every action. Only use it when you need to understand the page structure and cannot infer it from previous context.\n- Prefer browser_click/browser_type/browser_fill_form by using element references from the LAST snapshot rather than taking a new one.\n- Combine checks: check console_messages and take screenshot together at the end, not separately after each step.\n- If startup is unverified, spend ZERO Playwright calls on repeated navigation retries.\n\nFollow the Playwright testing guidelines in your system instructions for tool usage, resize workflow, and error checking."
520
+ },
521
+ {
522
+ "type": "action",
523
+ "call": "AccumulateScenarioResult",
524
+ "comment": "Append this scenario's result to webTestResults.scenarioResults"
525
+ }
526
+ ]
527
+ }
528
+ },
529
+ {
530
+ "type": "action",
531
+ "call": "AggregateScenarioResults",
532
+ "comment": "Calculate overallPassed and build summary from accumulated scenario results"
533
+ },
534
+ {
535
+ "type": "condition",
536
+ "call": "WebEnvironmentHealthy",
537
+ "comment": "Returns false if env failed (server crash, connection refused) \u2192 sequence fails \u2192 retry triggers. Returns true if healthy \u2192 continue to result evaluation."
538
+ }
539
+ ]
540
+ }
541
+ },
542
+ {
543
+ "type": "selector",
544
+ "comment": "2-way decision: critical failures \u2192 deterministic reject, otherwise \u2192 LLM evaluation",
545
+ "children": [
546
+ {
547
+ "type": "sequence",
548
+ "comment": "PATH 1: CRITICAL FAILURES \u2014 deterministic rejection, LLM cannot override. Catches build errors, HTTP 500, connection refused, all scenarios failed.",
549
+ "children": [
550
+ {
551
+ "type": "condition",
552
+ "call": "WebTestsHaveCriticalFailures",
553
+ "comment": "DETERMINISTIC: Detects build or startup failures, HTTP 500, ECONNREFUSED, and all-scenarios-failed states. Returns true if the app is fundamentally broken."
554
+ },
555
+ {
556
+ "type": "action",
557
+ "call": "DocumentCriticalFailure",
558
+ "comment": "Write deterministic rejection comment and set rejectionDetails on blackboard"
559
+ },
560
+ {
561
+ "type": "action",
562
+ "call": "AddRejectionComment",
563
+ "comment": "Post the rejection comment to the task"
564
+ },
565
+ {
566
+ "type": "action",
567
+ "call": "ReportTriggerFail",
568
+ "comment": "Move task back to todo for engineer to fix build/compilation errors"
569
+ }
570
+ ]
571
+ },
572
+ {
573
+ "type": "sequence",
574
+ "comment": "PATH 2: No critical failures \u2014 LLM evaluates test results (can distinguish pre-existing vs task-related)",
575
+ "children": [
576
+ {
577
+ "type": "selector",
578
+ "comment": "Try LLM evaluation, fall back to deterministic analysis if LLM fails",
579
+ "children": [
580
+ {
581
+ "type": "llm-action",
582
+ "name": "EvaluateWebTestResults",
583
+ "prompt": "You are the QA Electron tester. Evaluate the Electron test results and decide whether to APPROVE or REJECT the engineer's changes.\n\n{{#if projectSpecSummary}}\n## Project Specification Summary\n{{projectSpecSummary}}\n{{/if}}\n\nTask: {{taskDescription}}\nPlanned Test Scenarios: {{webTestPlan}}\nScenario Planning Profile: {{webScenarioPlanningProfile}}\nElectron Test Results: {{webTestResults}}\nGit Diff: {{gitDiff}}\nTask Files: {{taskFiles}}\n\nYou MUST set the 'approved' field to control whether the task passes or fails. This field directly controls the BT flow.\n\nCRITICAL RULES FOR TEST COVERAGE COMPLETENESS:\n- Compare the number of executed scenarios in webTestResults against the planned scenarios in webTestPlan.\n- If fewer than 75% of planned scenarios were actually executed, set approved=false with rejectionReason explaining incomplete coverage.\n- Missing startup validation or viewport coverage that the plan explicitly required is a coverage gap.\n- EXCEPTION: if webScenarioPlanningProfile marks complexity as `low-scaffold`, then a 1-scenario plan (or 2 when explicitly planned) is intentional and should NOT be rejected for missing broader category coverage.\n\nCRITICAL RULES FOR THE 'approved' FIELD:\n1. Set approved=true ONLY when overall evidence supports that the changed Electron behavior works.\n2. Pre-existing issues in unrelated surfaces do NOT count against the engineer.\n3. Set approved=false for task-related startup, renderer UI, preload-visible, IPC-visible, or workflow failures caused by the engineer's changes.\n4. Third-party warnings alone are NOT failures.\n5. Use the failure classification field on each scenarioResult.\n6. When in doubt, set approved=false.\n\nFAILURE CLASSIFICATION:\nFor EACH failed scenario, classify it as:\n1. task-related\n2. pre-existing\n3. 
environment\n\nEnvironment includes startup environment issues, missing dependencies, stale local processes, unreachable renderer URLs unrelated to the engineer's code, and transport limitations where Playwright is attached to non-Electron Chromium.\n\nPLAYWRIGHT TRANSPORT RULES (NON-EMBEDDED TOO):\nIf scenario evidence shows Playwright is running in non-Electron Chromium (for example `navigator.userAgent` lacks `Electron`) while startup is otherwise healthy, do NOT treat missing preload bridge globals as task-related by itself.\n- Classify that failure as environment/transport-limited unless startup logs explicitly show preload load failures or bridge registration failures.\n- Reject only when there is direct evidence of task-related preload breakage (preload crash logs, explicit bridge injection errors, or renderer behavior proving bridge contract regression).\n\nEMBEDDED ELECTRON RENDERER RULES:\nIf startup evidence shows Electron booted successfully into an embedded `data:`/`file:` renderer (`electron://embedded-renderer`) rather than an HTTP URL:\n- Do NOT reject solely because Playwright could not navigate to localhost\n- Treat missing browser navigation as a transport limitation, not automatic task failure\n- For scaffold/runtime-boundary tasks, clean startup plus existing build/test evidence can be sufficient for approval when no task-related startup regressions are observed\n\nCOMPONENT-ONLY TASK RULES:\nIf webChangeAnalysis.isComponentOnlyTask is true:\n- Unreachable route navigation can be environment-only rather than task-related\n- Build success plus clean startup is sufficient for approval when the component is not yet wired into a reachable UI surface\n- Reject only if the build or startup breaks because of the new component\n\nFOR THE COMMENT FIELD:\n- If approved=true: summarize startup path, views tested, viewports tested, console status, and any pre-existing issues.\n- If approved=false: write a detailed rejection with task-related failures, 
including startup context, affected view, repro steps, expected vs actual behavior, screenshot paths, and console evidence.",
584
+ "contextKeys": [
585
+ "taskDescription",
586
+ "webTestPlan",
587
+ "webTestResults",
588
+ "webChangeAnalysis",
589
+ "gitDiff",
590
+ "taskFiles",
591
+ "projectSpecifications",
592
+ "projectSpecSummary",
593
+ "webScenarioPlanningProfile"
594
+ ],
595
+ "outputSchema": {
596
+ "type": "object",
597
+ "properties": {
598
+ "approved": {
599
+ "type": "boolean",
600
+ "description": "true if engineer's changes pass Electron QA, false if task-related failures found. This field CONTROLS the BT flow."
601
+ },
602
+ "confidence": {
603
+ "type": "number",
604
+ "description": "Your confidence in the approval/rejection decision (0.0 to 1.0). Use lower values when evidence is ambiguous."
605
+ },
606
+ "comment": {
607
+ "type": "string",
608
+ "description": "Approval summary (if approved) or detailed rejection with bug reports (if rejected)"
609
+ },
610
+ "rejectionReason": {
611
+ "type": ["string", "null"],
612
+ "description": "Brief rejection reason (null if approved)"
613
+ },
614
+ "taskRelatedFailures": {
615
+ "type": ["number", "null"],
616
+ "description": "Number of failures caused by the engineer's changes"
617
+ },
618
+ "preExistingFailures": {
619
+ "type": ["number", "null"],
620
+ "description": "Number of failures that existed before the engineer's changes (in pages/components the engineer did NOT modify)"
621
+ },
622
+ "environmentIssues": {
623
+ "type": ["number", "null"],
624
+ "description": "Number of environment-related issues (startup problems, missing deps, stale local processes, etc.)"
625
+ },
626
+ "failures": {
627
+ "type": "array",
628
+ "items": {
629
+ "type": "object",
630
+ "properties": {
631
+ "testName": {
632
+ "type": "string",
633
+ "description": "Name of the failed test scenario"
634
+ },
635
+ "error": {
636
+ "type": "string",
637
+ "description": "Error description"
638
+ },
639
+ "classification": {
640
+ "type": "string",
641
+ "enum": [
642
+ "task-related",
643
+ "pre-existing",
644
+ "environment"
645
+ ],
646
+ "description": "Whether this failure is caused by the engineer's changes, pre-existing, or an environment issue"
647
+ },
648
+ "page": {
649
+ "type": ["string", "null"],
650
+ "description": "Page URL where the failure occurred"
651
+ }
652
+ },
653
+ "required": [
654
+ "testName",
655
+ "error",
656
+ "classification"
657
+ ]
658
+ },
659
+ "description": "Classified list of all failures (task-related, pre-existing, and environment)"
660
+ },
661
+ "bugs": {
662
+ "type": "array",
663
+ "items": {
664
+ "type": "object",
665
+ "properties": {
666
+ "title": {
667
+ "type": "string"
668
+ },
669
+ "severity": {
670
+ "type": "string",
671
+ "enum": ["critical", "major", "minor"]
672
+ },
673
+ "page": {
674
+ "type": "string"
675
+ },
676
+ "stepsToReproduce": {
677
+ "type": "array",
678
+ "items": {
679
+ "type": "string"
680
+ }
681
+ },
682
+ "expected": {
683
+ "type": "string"
684
+ },
685
+ "actual": {
686
+ "type": "string"
687
+ },
688
+ "screenshotPath": {
689
+ "type": ["string", "null"]
690
+ },
691
+ "consoleErrors": {
692
+ "type": "array",
693
+ "items": {
694
+ "type": "string"
695
+ }
696
+ }
697
+ },
698
+ "required": [
699
+ "title",
700
+ "severity",
701
+ "stepsToReproduce",
702
+ "expected",
703
+ "actual"
704
+ ]
705
+ },
706
+ "default": []
707
+ }
708
+ },
709
+ "required": [
710
+ "approved",
711
+ "confidence",
712
+ "comment",
713
+ "taskRelatedFailures",
714
+ "preExistingFailures",
715
+ "environmentIssues",
716
+ "failures"
717
+ ]
718
+ },
719
+ "outputKey": "webTestVerdict",
720
+ "temperature": 0.3,
721
+ "allowedTools": []
722
+ },
723
+ {
724
+ "type": "action",
725
+ "call": "FallbackWebTestAnalysis",
726
+ "comment": "Fallback: deterministic analysis when Electron result evaluation LLM fails"
727
+ }
728
+ ]
729
+ },
730
+ {
731
+ "type": "action",
732
+ "call": "ApplyWebScopeGuardrails",
733
+ "comment": "Deterministically ignore out-of-scope navigation/shortcut rejections for strict runtime-scaffold tasks."
734
+ },
735
+ {
736
+ "type": "action",
737
+ "call": "ApplyConfidenceThreshold",
738
+ "comment": "Override approval to rejection if LLM confidence is below the 0.8 threshold"
739
+ },
740
+ {
741
+ "type": "selector",
742
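+ "comment": "Route based on the final webTestVerdict (after scope guardrails and confidence threshold): approved=true \u2192 lint then pass, all failures pre-existing \u2192 escalate, otherwise \u2192 fail",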
743
+ "children": [
744
+ {
745
+ "type": "sequence",
746
+ "comment": "APPROVED: LLM determined tests passed \u2014 run linter before final approval",
747
+ "children": [
748
+ {
749
+ "type": "condition",
750
+ "call": "WebTestsApproved",
751
+ "comment": "DETERMINISTIC: checks webTestVerdict.approved === true"
752
+ },
753
+ {
754
+ "type": "action",
755
+ "call": "AddApprovalComment"
756
+ },
757
+ {
758
+ "type": "action",
759
+ "call": "RunLinter",
760
+ "comment": "Run linter before final approval - succeeds silently if no linter found"
761
+ },
762
+ {
763
+ "type": "selector",
764
+ "comment": "Check lint results - pass or fail back to engineer",
765
+ "children": [
766
+ {
767
+ "type": "sequence",
768
+ "comment": "Lint passed (or no linter) - approve",
769
+ "children": [
770
+ {
771
+ "type": "condition",
772
+ "call": "LintPassed"
773
+ },
774
+ {
775
+ "type": "action",
776
+ "call": "ReportTriggerPass"
777
+ }
778
+ ]
779
+ },
780
+ {
781
+ "type": "sequence",
782
+ "comment": "Lint failed - send back to engineer with lint errors",
783
+ "children": [
784
+ {
785
+ "type": "condition",
786
+ "call": "LintFailed"
787
+ },
788
+ {
789
+ "type": "action",
790
+ "call": "AddLintFailureComment"
791
+ },
792
+ {
793
+ "type": "action",
794
+ "call": "ReportTriggerFail"
795
+ }
796
+ ]
797
+ }
798
+ ]
799
+ }
800
+ ]
801
+ },
802
+ {
803
+ "type": "sequence",
804
+ "comment": "ALL failures are pre-existing (unrelated to task) and block testing completely - escalate rather than bouncing to engineer",
805
+ "children": [
806
+ {
807
+ "type": "condition",
808
+ "call": "IsPreExistingBugBlocking",
809
+ "comment": "Check if ALL failures are pre-existing with 0 task-related failures"
810
+ },
811
+ {
812
+ "type": "action",
813
+ "call": "EscalatePreExistingBug",
814
+ "comment": "Create bug-fix task, add DAG dependency blocking original task, notify PM and merge-resolver"
815
+ },
816
+ {
817
+ "type": "action",
818
+ "call": "ReportTriggerFail",
819
+ "comment": "Move task to todo - DAG dependency on the bug-fix task prevents orchestrator from re-assigning until fix is done. NOT a rejection of engineer's work."
820
+ }
821
+ ]
822
+ },
823
+ {
824
+ "type": "sequence",
825
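+ "comment": "REJECTED: default branch when the verdict was not approved and not escalated as pre-existing (e.g. task-related failures, incomplete coverage, or confidence below threshold)",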
826
+ "children": [
827
+ {
828
+ "type": "action",
829
+ "call": "AddRejectionComment"
830
+ },
831
+ {
832
+ "type": "action",
833
+ "call": "ReportTriggerFail"
834
+ }
835
+ ]
836
+ }
837
+ ]
838
+ }
839
+ ]
840
+ }
841
+ ]
842
+ }
843
+ ]
844
+ },
845
+ {
846
+ "type": "sequence",
847
+ "comment": "No Electron-visible changes detected by analysis - skip Electron testing and approve",
848
+ "children": [
849
+ {
850
+ "type": "flip",
851
+ "child": {
852
+ "type": "condition",
853
+ "call": "HasFrontendChangesInAnalysis"
854
+ },
855
+ "comment": "Skip only when AnalyzeChangesForWebTesting explicitly returned hasFrontendChanges=false"
856
+ },
857
+ {
858
+ "type": "action",
859
+ "call": "AddNoChangesComment",
860
+ "comment": "Document that no Electron-visible changes were found, skipping Electron tests"
861
+ },
862
+ {
863
+ "type": "action",
864
+ "call": "ReportTriggerPass",
865
+ "comment": "Approve without running Electron tests"
866
+ }
867
+ ]
868
+ }
869
+ ]
870
+ }
871
+ ]
872
+ }
873
+ ]
874
+ }
875
+ ]
876
+ },
877
+ {
878
+ "type": "selector",
879
+ "comment": "FAILURE HANDLER: map startup-environment blockers to trigger retry; keep true task failures as fail",
880
+ "children": [
881
+ {
882
+ "type": "sequence",
883
+ "comment": "Startup verification failed after retries -> request environment retry (no engineer bounce)",
884
+ "children": [
885
+ {
886
+ "type": "condition",
887
+ "call": "ElectronStartupVerificationFailed"
888
+ },
889
+ {
890
+ "type": "action",
891
+ "call": "ReportTriggerRetry"
892
+ }
893
+ ]
894
+ },
895
+ {
896
+ "type": "sequence",
897
+ "comment": "All other failures remain task or QA failures",
898
+ "children": [
899
+ {
900
+ "type": "action",
901
+ "call": "AddQAFailureComment"
902
+ },
903
+ {
904
+ "type": "action",
905
+ "call": "ReportTriggerFail"
906
+ }
907
+ ]
908
+ }
909
+ ]
910
+ }
911
+ ]
912
+ },
913
+ {
914
+ "type": "selector",
915
+ "comment": "Notify other agents that Electron QA review is complete (non-critical)",
916
+ "children": [
917
+ {
918
+ "type": "action",
919
+ "call": "SendTaskCompleteNotification",
920
+ "comment": "Broadcast to other agents that Electron QA has finished reviewing this task"
921
+ },
922
+ {
923
+ "type": "action",
924
+ "call": "NoOp",
925
+ "comment": "Continue without notification if messaging is unavailable"
926
+ }
927
+ ]
928
+ },
929
+ {
930
+ "type": "selector",
931
+ "comment": "Stop the started app process to prevent conflicts on next run (non-critical: continue even if kill fails)",
932
+ "children": [
933
+ {
934
+ "type": "action",
935
+ "call": "StopDevServer",
936
+ "comment": "Kill the primary startup process and free any tracked port"
937
+ },
938
+ {
939
+ "type": "action",
940
+ "call": "NoOp",
941
+ "comment": "Continue even if StopDevServer fails"
942
+ }
943
+ ]
944
+ },
945
+ {
946
+ "type": "action",
947
+ "call": "ClearTaskContext",
948
+ "comment": "Reset task-specific blackboard keys to prepare for next task assignment"
949
+ },
950
+ {
951
+ "type": "action",
952
+ "call": "Loop",
953
+ "comment": "Return RUNNING to restart the BT from the root - wait for next task"
954
+ }
955
+ ]
956
+ }
957
+ },
958
+ "blackboardDefaults": {
959
+ "webChangeAnalysis": null,
960
+ "webTestPlan": null,
961
+ "webTestResults": null,
962
+ "currentScenario": null,
963
+ "currentScenarioIndex": 0,
964
+ "currentScenarioResult": null,
965
+ "devServerStatus": null,
966
+ "approvalComment": null,
967
+ "rejectionDetails": null,
968
+ "webTestVerdict": null,
969
+ "gitDiff": null,
970
+ "projectInfo": null,
971
+ "projectSpecifications": null,
972
+ "projectSpecSummary": null,
973
+ "webScenarioPlanningProfile": null,
974
+ "engineerTestSetup": null,
975
+ "requestedStatus": null,
976
+ "statusChangeReason": null,
977
+ "taskComments": null,
978
+ "taskDetails": null,
979
+ "taskFiles": null,
980
+ "currentTaskId": null,
981
+ "taskAssignedAt": null,
982
+ "loopCount": 0,
983
+ "qaBounceCount": 0,
984
+ "maxQABounces": 3,
985
+ "isQARejection": false,
986
+ "hasQAFeedback": false,
987
+ "incomingMessages": [],
988
+ "coordinationMessage": null,
989
+ "messageResponse": null,
990
+ "envRetryAttempted": false,
991
+ "lintResults": null,
992
+ "lintCommand": null,
993
+ "lintDetected": null,
994
+ "lintPassed": null,
995
+ "lintErrors": null,
996
+ "lintSource": null
997
+ }
998
+ }