usertester 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +219 -0
  3. package/dist/browser/agent.d.ts +33 -0
  4. package/dist/browser/agent.js +393 -0
  5. package/dist/browser/agent.js.map +1 -0
  6. package/dist/cli/cleanup.d.ts +5 -0
  7. package/dist/cli/cleanup.js +75 -0
  8. package/dist/cli/cleanup.js.map +1 -0
  9. package/dist/cli/harness.d.ts +10 -0
  10. package/dist/cli/harness.js +108 -0
  11. package/dist/cli/harness.js.map +1 -0
  12. package/dist/cli/index.d.ts +5 -0
  13. package/dist/cli/index.js +31 -0
  14. package/dist/cli/index.js.map +1 -0
  15. package/dist/cli/kill.d.ts +5 -0
  16. package/dist/cli/kill.js +46 -0
  17. package/dist/cli/kill.js.map +1 -0
  18. package/dist/cli/logs.d.ts +5 -0
  19. package/dist/cli/logs.js +64 -0
  20. package/dist/cli/logs.js.map +1 -0
  21. package/dist/cli/profiles.d.ts +5 -0
  22. package/dist/cli/profiles.js +67 -0
  23. package/dist/cli/profiles.js.map +1 -0
  24. package/dist/cli/send.d.ts +5 -0
  25. package/dist/cli/send.js +46 -0
  26. package/dist/cli/send.js.map +1 -0
  27. package/dist/cli/setup.d.ts +6 -0
  28. package/dist/cli/setup.js +168 -0
  29. package/dist/cli/setup.js.map +1 -0
  30. package/dist/cli/spawn.d.ts +5 -0
  31. package/dist/cli/spawn.js +52 -0
  32. package/dist/cli/spawn.js.map +1 -0
  33. package/dist/cli/status.d.ts +5 -0
  34. package/dist/cli/status.js +85 -0
  35. package/dist/cli/status.js.map +1 -0
  36. package/dist/harness/applier.d.ts +38 -0
  37. package/dist/harness/applier.js +152 -0
  38. package/dist/harness/applier.js.map +1 -0
  39. package/dist/harness/index.d.ts +14 -0
  40. package/dist/harness/index.js +110 -0
  41. package/dist/harness/index.js.map +1 -0
  42. package/dist/harness/patterns.d.ts +14 -0
  43. package/dist/harness/patterns.js +96 -0
  44. package/dist/harness/patterns.js.map +1 -0
  45. package/dist/harness/proposer.d.ts +26 -0
  46. package/dist/harness/proposer.js +181 -0
  47. package/dist/harness/proposer.js.map +1 -0
  48. package/dist/harness/traces.d.ts +29 -0
  49. package/dist/harness/traces.js +65 -0
  50. package/dist/harness/traces.js.map +1 -0
  51. package/dist/harness/validator.d.ts +6 -0
  52. package/dist/harness/validator.js +112 -0
  53. package/dist/harness/validator.js.map +1 -0
  54. package/dist/inbox/agentmail.d.ts +11 -0
  55. package/dist/inbox/agentmail.js +36 -0
  56. package/dist/inbox/agentmail.js.map +1 -0
  57. package/dist/llm/provider.d.ts +15 -0
  58. package/dist/llm/provider.js +65 -0
  59. package/dist/llm/provider.js.map +1 -0
  60. package/dist/orchestrator/agent.d.ts +17 -0
  61. package/dist/orchestrator/agent.js +195 -0
  62. package/dist/orchestrator/agent.js.map +1 -0
  63. package/dist/orchestrator/index.d.ts +7 -0
  64. package/dist/orchestrator/index.js +92 -0
  65. package/dist/orchestrator/index.js.map +1 -0
  66. package/dist/orchestrator/retry.d.ts +27 -0
  67. package/dist/orchestrator/retry.js +145 -0
  68. package/dist/orchestrator/retry.js.map +1 -0
  69. package/dist/orchestrator/session.d.ts +13 -0
  70. package/dist/orchestrator/session.js +55 -0
  71. package/dist/orchestrator/session.js.map +1 -0
  72. package/dist/output/events.d.ts +12 -0
  73. package/dist/output/events.js +81 -0
  74. package/dist/output/events.js.map +1 -0
  75. package/dist/profiles/learner.d.ts +4 -0
  76. package/dist/profiles/learner.js +168 -0
  77. package/dist/profiles/learner.js.map +1 -0
  78. package/dist/tools/captcha.d.ts +19 -0
  79. package/dist/tools/captcha.js +76 -0
  80. package/dist/tools/captcha.js.map +1 -0
  81. package/dist/tools/inbox.d.ts +30 -0
  82. package/dist/tools/inbox.js +65 -0
  83. package/dist/tools/inbox.js.map +1 -0
  84. package/dist/types.d.ts +121 -0
  85. package/dist/types.js +30 -0
  86. package/dist/types.js.map +1 -0
  87. package/package.json +60 -0
  88. package/tasks.example.json +5 -0
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 usertester contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,219 @@
1
+ # usertester
2
+
3
+ Spawn N AI agents as simulated users to test your web app flows — signup, onboarding, checkout, email verification — in parallel, with real email inboxes and natural language control.
4
+
5
+ ```
6
+ usertester spawn --url https://myapp.com --n 3 --message "Sign up as a new user"
7
+ ```
8
+
9
+ Each agent gets a unique email inbox, runs a headless browser, and executes your task as a first-time user. You watch a live NDJSON event stream. When an agent finishes, send it a follow-up task — the browser session stays open.
10
+
11
+ ---
12
+
13
+ ## Install
14
+
15
+ ```bash
16
+ npm install -g usertester
17
+ ```
18
+
19
+ Or run without installing:
20
+
21
+ ```bash
22
+ npx usertester setup
23
+ ```
24
+
25
+ You need Node.js 20+ and Google Chrome installed locally.
26
+
27
+ ---
28
+
29
+ ## Quick start
30
+
31
+ **Step 1 — Get two API keys**
32
+
33
+ - Anthropic API key: https://console.anthropic.com/settings/keys
34
+ - AgentMail API key: https://agentmail.to/dashboard
35
+
36
+ **Step 2 — Configure**
37
+
38
+ ```bash
39
+ usertester setup
40
+ ```
41
+
42
+ Prompts for both keys, validates them live, writes `.env`.
43
+
44
+ **Step 3 — Spawn agents**
45
+
46
+ ```bash
47
+ usertester spawn --url https://yourapp.com --n 1 --message "Sign up as a new user"
48
+ ```
49
+
50
+ Output (NDJSON, one event per line):
51
+
52
+ ```jsonl
53
+ {"event":"session_start","sessionId":"abc123","url":"https://yourapp.com","n":1}
54
+ {"event":"spawned","agent":"agent-01","inbox":"abc@agentmail.to"}
55
+ {"event":"state","agent":"agent-01","from":"SIGNING_UP","to":"RUNNING"}
56
+ {"event":"ready","agent":"agent-01","message_completed":"Sign up as a new user","summary":"Filled the registration form and clicked Register. Signup succeeded and was redirected to the dashboard.","screenshot":"/Users/you/.usertester/abc123/agent-01/screenshots/001.png"}
57
+ ```
58
+
59
+ **Step 4 — Send follow-up tasks**
60
+
61
+ While an agent is in `WAITING` state, send it a new task — the browser session stays open:
62
+
63
+ ```bash
64
+ usertester send agent-01 "Go to the pricing page and try to upgrade to the Pro plan"
65
+ ```
66
+
67
+ ---
68
+
69
+ ## Commands
70
+
71
+ ```bash
72
+ usertester setup # First-run API key configuration
73
+ usertester spawn --url URL --n N --message M # Spawn N agents with a shared task
74
+ usertester spawn --url URL --messages-file tasks.json # Per-agent tasks from file
75
+ usertester status # Show all agents + current state
76
+ usertester send <agent-id> <message> # Resume a waiting agent with a new task
77
+ usertester kill <agent-id> # Kill a running or waiting agent
78
+ usertester logs <agent-id> [--follow] # Tail an agent's log
79
+ usertester cleanup # Delete all AgentMail inboxes for current session
80
+ usertester cleanup --all # Clean up all sessions
81
+ usertester profiles list # Show learned profile hints per URL/scenario
82
+ ```
83
+
84
+ ---
85
+
86
+ ## Per-agent task file
87
+
88
+ ```json
89
+ [
90
+ { "message": "Sign up as a new user and complete onboarding" },
91
+ { "message": "Sign up, then try to upgrade to the paid plan" },
92
+ { "message": "Sign up using Google OAuth if available" }
93
+ ]
94
+ ```
95
+
96
+ If the file has fewer entries than `--n`, tasks cycle.
97
+
98
+ ---
99
+
100
+ ## How it works
101
+
102
+ 1. **Inbox provisioning** — each agent gets a unique `@agentmail.to` email address (~135ms)
103
+ 2. **Browser agent** — headless Chrome via Stagehand v3, controlled by `claude-opus-4-6`
104
+ 3. **Multi-step execution** — `agent().execute()` runs an observe→act→check loop until the task completes
105
+ 4. **RLM memory** — session history is queried in chunks rather than fed whole into context. Cost stays near-flat as sessions grow.
106
+ 5. **Profile learning** — after each session, failures are extracted into `facts.json` per URL/scenario. Next run, the agent starts with those hints.
107
+ 6. **NDJSON event stream** — every state transition and result is a JSON line to stdout. Calling agents (Claude Code, etc.) parse this to decide next steps.
108
+
109
+ ---
110
+
111
+ ## Bypassing bot detection (Cloudflare, CAPTCHA)
112
+
113
+ When `USERTESTER_BYPASS_TOKEN` is set, usertester sends an `x-usertester-bypass: <your-token>` header with the agent's browser requests. Configure your app to allow this traffic through.
114
+
115
+ ### Option A: Cloudflare WAF bypass (recommended, free)
116
+
117
+ **Step 1 — Generate a secret bypass token:**
118
+ ```bash
119
+ openssl rand -hex 24 # → e.g. a3f9c2b8d7e14f6a9c2b8d7e14f6a9c2b8d7e14f
120
+ ```
121
+
122
+ **Step 2 — Add it to your `.env`:**
123
+ ```
124
+ USERTESTER_BYPASS_TOKEN=a3f9c2b8d7e14f6a9c2b8d7e14f6a9c2b8d7e14f
125
+ ```
126
+
127
+ **Step 3 — Add a WAF rule in Cloudflare dashboard → Security → WAF → Custom rules:**
128
+ ```
129
+ Field: Request Header
130
+ Header: x-usertester-bypass
131
+ Operator: equals
132
+ Value: a3f9c2b8d7e14f6a9c2b8d7e14f6a9c2b8d7e14f ← your secret
133
+ Action: Skip → All remaining custom rules
134
+ ```
135
+
136
+ The token is never in source code — only in your `.env` and Cloudflare dashboard. Rotate it anytime by generating a new one and updating both places.
137
+
138
+ ### Option B: Supabase Auth — use Cloudflare test keys
139
+
140
+ If your app uses Supabase Auth with Cloudflare Turnstile:
141
+
142
+ 1. Supabase dashboard → **Authentication → Security → CAPTCHA protection**
143
+ 2. Switch site key to: `1x00000000000000000000AA` (Cloudflare's official test key — always passes)
144
+ 3. Switch secret key to: `1x0000000000000000000000000000000AA`
145
+
146
+ Use only in dev/staging — not production.
147
+
148
+ ### Option C: Automatic CAPTCHA solving (no app changes, paid)
149
+
150
+ Add `CAPSOLVER_API_KEY` to `.env` and usertester will automatically solve Cloudflare Turnstile via [CapSolver](https://capsolver.com) (~$1.20/1K solves, ~85-90% success rate).
151
+
152
+ ```bash
153
+ CAPSOLVER_API_KEY=CAP-...
154
+ ```
155
+
156
+ ---
157
+
158
+ ## Calling from a coding agent
159
+
160
+ usertester is designed to be orchestrated by a coding agent (Claude Code, Codex) as well as used directly. Parse the NDJSON stream:
161
+
162
+ ```typescript
163
+ import { spawn } from 'node:child_process'
164
+ import * as readline from 'node:readline'
165
+
166
+ const proc = spawn('usertester', ['spawn', '--url', url, '--n', '3', '--message', task])
167
+ const rl = readline.createInterface({ input: proc.stdout })
168
+
169
+ rl.on('line', (line) => {
170
+ const event = JSON.parse(line)
171
+ if (event.event === 'ready') {
172
+ // event.summary tells you what happened
173
+ // send next task:
174
+ spawn('usertester', ['send', event.agent, 'Next task here'])
175
+ }
176
+ })
177
+ ```
178
+
179
+ ---
180
+
181
+ ## Limits
182
+
183
+ | | Free plan | Paid plan |
184
+ |---|---|---|
185
+ | AgentMail inboxes | 3 simultaneous | Unlimited |
186
+ | Agents per session | 3 | Up to 20 (configurable) |
187
+
188
+ Always run `usertester cleanup` between sessions to free inbox slots on the free plan.
189
+
190
+ ---
191
+
192
+ ## Results
193
+
194
+ After a session, results are saved to `~/.usertester/<session-id>/`:
195
+
196
+ ```
197
+ ~/.usertester/<session-id>/
198
+ ├── state.json # Live session + agent states
199
+ ├── agent-01/
200
+ │ ├── agent.log # Full agent activity log
201
+ │ ├── events.ndjson # Structured event history
202
+ │ └── screenshots/ # Screenshots per task
203
+ └── ...
204
+ ```
205
+
206
+ ---
207
+
208
+ ## Requirements
209
+
210
+ - Node.js 20+
211
+ - Google Chrome (for local browser automation)
212
+ - Anthropic API key
213
+ - AgentMail API key
214
+
215
+ ---
216
+
217
+ ## License
218
+
219
+ MIT
@@ -0,0 +1,33 @@
1
+ import type { SessionMemory, ProfileFacts, UsertesterConfig } from '../types.js';
2
+ import type { RetryAttempt } from '../orchestrator/retry.js';
3
/**
 * Outcome of a follow-up task run through `BrowserAgent.resume()`.
 */
export interface ResumeResult {
    /**
     * Short natural-language recap of the task, produced from the most recent
     * recorded actions (falls back to a fixed/count-based sentence when no
     * actions exist or the summarizing call fails).
     */
    summary: string;
    /**
     * Path of the screenshot taken after the task, located in the agent
     * directory's `screenshots` folder.
     */
    screenshotPath: string;
}
7
/**
 * Stagehand wrapper that runs one simulated user in a browser and keeps a
 * bounded in-memory action history ("RLM memory") between tasks.
 */
export declare class BrowserAgent {
    /** Active Stagehand instance; null until `start()` and after `destroy()`. */
    private stagehand;
    /** Configuration passed to the constructor (API keys, model, bypass token, ...). */
    private config;
    /** Session memory: task, start URL, recorded actions, archive count, recovery tips. */
    private memory;
    /** Directory that receives this agent's log, events and screenshots. */
    private agentDir;
    /** Counter used to number screenshot files (001.png, 002.png, ...). */
    private screenshotIndex;
    /** How many of the latest actions feed the resume context. */
    private rlmRecentActions;
    /** How many of the latest failed actions feed the resume context. */
    private rlmMaxFailedActions;
    /** Failed attempts recorded during the retry loop of the last `start()`. */
    private retryHistory;
    /**
     * @param opts.config Partial configuration; missing keys fall back to defaults/env in the implementation.
     * @param opts.agentDir Directory for this agent's log, events and screenshots.
     * @param opts.rlmRecentActions Size of the recent-actions window (implementation default: 10).
     * @param opts.rlmMaxFailedActions Size of the failed-actions window (implementation default: 5).
     */
    constructor(opts: {
        config: Partial<UsertesterConfig>;
        agentDir: string;
        rlmRecentActions?: number;
        rlmMaxFailedActions?: number;
    });
    /**
     * Launches the browser, navigates to `url` and runs `initialTask`,
     * retrying after classified failures.
     */
    start(url: string, inbox: string, initialTask: string, profileHints?: ProfileFacts): Promise<void>;
    /** Runs a follow-up task in the already-open browser session. Rejects if `start()` has not run. */
    resume(task: string): Promise<ResumeResult>;
    /** Returns a copy of the session memory. */
    exportMemory(): SessionMemory;
    /** Returns a copy of the retry history of the last `start()`. */
    exportRetryHistory(): RetryAttempt[];
    /** Closes the browser if one is open. */
    destroy(): Promise<void>;
    private buildRLMContext;
    private llmBatch;
    private executeTask;
    private recordAction;
    private takeScreenshot;
    private summarizeLastTask;
}
@@ -0,0 +1,393 @@
1
+ /**
2
+ * BrowserAgent: Stagehand v3 wrapper with RLM memory loop
3
+ *
4
+ * Implements the BrowserAgent interface from the design doc:
5
+ * start(url, inbox, initialTask, profileHints?) → void
6
+ * resume(task) → ResumeResult
7
+ * exportMemory() → SessionMemory
8
+ * destroy() → void
9
+ */
10
+ import { Stagehand } from '@browserbasehq/stagehand';
11
+ import path from 'node:path';
12
+ import { appendAgentEvent, appendAgentLog } from '../output/events.js';
13
+ import { cheapCall, cheapBatch } from '../llm/provider.js';
14
+ import { classifyFailure, selectToolsForRecovery, buildRetryInstruction } from '../orchestrator/retry.js';
15
+ const ARCHIVE_THRESHOLD = 50;
16
+ const ARCHIVE_BATCH = 10;
17
/**
 * Stagehand wrapper that runs one simulated user in a browser.
 *
 * Lifecycle: `start()` launches the browser and runs the first task (with a
 * failure-classification retry loop), `resume()` runs follow-up tasks in the
 * same session, `destroy()` closes the browser. Every executed step is
 * recorded in `memory.actions`; the oldest entries are archived to the event
 * log once the list grows past ARCHIVE_THRESHOLD.
 */
export class BrowserAgent {
    /** Active Stagehand instance; null until start() and after destroy(). */
    stagehand = null;
    config;
    memory;
    agentDir;
    /** Counter used to number screenshot files. */
    screenshotIndex = 0;
    rlmRecentActions;
    rlmMaxFailedActions;
    /** Failed attempts recorded by the retry loop of the last start(). */
    retryHistory = [];

    /**
     * @param {object} opts
     * @param {object} opts.config - Partial usertester configuration.
     * @param {string} opts.agentDir - Directory for this agent's log, events and screenshots.
     * @param {number} [opts.rlmRecentActions=10] - Recent-action window used by resume().
     * @param {number} [opts.rlmMaxFailedActions=5] - Failed-action window used by resume().
     */
    constructor(opts) {
        this.config = opts.config;
        this.agentDir = opts.agentDir;
        this.rlmRecentActions = opts.rlmRecentActions ?? 10;
        this.rlmMaxFailedActions = opts.rlmMaxFailedActions ?? 5;
        this.memory = {
            taskDescription: '',
            startUrl: '',
            actions: [],
            archivedActionCount: 0,
            recoveryTips: [],
        };
    }

    /**
     * Launches the browser, navigates to `url` and runs `initialTask`.
     * If the first attempt does not complete, the failure is classified and
     * the task is retried (up to attempt 5) with tools chosen for recovery.
     * A successful retry stores a recovery tip in memory.
     *
     * @param {string} url - Page to open first.
     * @param {string} inbox - Email address the agent uses when asked for one.
     * @param {string} initialTask - Natural-language task.
     * @param {object} [profileHints] - Learned hints; `harnessHints` entries carry
     *   `confidence` and `observation`.
     * @returns {Promise<void>}
     */
    async start(url, inbox, initialTask, profileHints) {
        this.memory.taskDescription = initialTask;
        this.memory.startUrl = url;
        this.stagehand = this.createStagehand();
        await this.stagehand.init();
        const page = this.stagehand.context.pages()[0];
        // Inject customer-specific bypass token if configured.
        // Customers add a WAF rule: (http.request.headers["x-usertester-bypass"] eq "<their-token>") → Skip
        const bypassToken = this.config.bypass_token;
        if (bypassToken) {
            await page.setExtraHTTPHeaders({ 'x-usertester-bypass': bypassToken });
        }
        appendAgentLog(this.agentDir, `Browser started. Navigating to ${url}`);
        appendAgentEvent(this.agentDir, { event: 'browser_started', url });
        await page.goto(url, { waitUntil: 'load' });

        // Tolerate profiles without a harnessHints list.
        const harnessHints = profileHints?.harnessHints ?? [];
        // A high-confidence PROVEN APPROACH hint is looked up once and reused for
        // both the prompt and the tool pre-injection below.
        const provenApproach = harnessHints.find(h => h.confidence >= 0.95 && h.observation.startsWith('PROVEN APPROACH'));
        const systemContext = this.buildSystemContext(inbox, initialTask, harnessHints, provenApproach);
        appendAgentLog(this.agentDir, `Starting task: ${initialTask}`);

        // Pre-inject tools named by the proven approach so attempt 1 does not
        // have to rediscover that it needs them.
        const attempt1Tools = {};
        const { readInboxEmail } = await import('../tools/inbox.js');
        if (provenApproach?.observation.includes('readInboxEmail')) {
            attempt1Tools['readInboxEmail'] = readInboxEmail;
            appendAgentLog(this.agentDir, `Pre-injecting readInboxEmail from profile recovery tip`);
        }

        this.retryHistory = [];
        // Tools passed to the most recent attempt, so a failure record can name them.
        let injectedTools = attempt1Tools;
        let result = await this.executeTask(systemContext, initialTask, injectedTools);
        if (result.completed) {
            return;
        }
        for (let attempt = 2; attempt <= 5; attempt++) {
            const classification = await classifyFailure(result.message, this.config);
            appendAgentLog(this.agentDir, `Retry ${attempt}: classified as ${classification.type} — ${classification.recoveryHint}`);
            this.retryHistory.push({
                attempt: attempt - 1,
                instruction: initialTask,
                toolsInjected: Object.keys(injectedTools),
                result: 'failed',
                failureType: classification.type,
                agentMessage: result.message,
                finalUrl: result.finalUrl,
            });
            if (classification.type === 'COMPLETE' || classification.type === 'ESCALATE') {
                break;
            }
            // RATE_LIMITED: wait the app's specified cooldown then retry
            if (classification.type === 'RATE_LIMITED') {
                const cooldown = /only request this after (\d+)|wait (\d+) second/i.exec(result.message);
                const waitSeconds = cooldown
                    ? Number.parseInt(cooldown[1] ?? cooldown[2], 10)
                    : 90; // default to 90s if we can't parse
                appendAgentLog(this.agentDir, `  Rate limited — waiting ${waitSeconds}s before retry`);
                await new Promise(r => setTimeout(r, waitSeconds * 1000));
            }
            if (classification.type === 'TRANSIENT' && attempt > 3) {
                break;
            }
            const tools = selectToolsForRecovery(classification);
            // ENVIRONMENT_BLOCK: only worth retrying if some tool can help.
            if (classification.type === 'ENVIRONMENT_BLOCK' && Object.keys(tools).length === 0) {
                break;
            }
            const retryInstruction = buildRetryInstruction(initialTask, this.retryHistory, this.memory, url);
            appendAgentLog(this.agentDir, `  injecting tools: ${Object.keys(tools).join(', ') || 'none'}`);
            injectedTools = tools;
            result = await this.executeTask(systemContext, retryInstruction, tools);
            if (result.completed) {
                appendAgentLog(this.agentDir, `✓ Retry ${attempt} succeeded`);
                const tip = {
                    url: this.memory.startUrl,
                    scenario: 'signup',
                    failedApproaches: this.retryHistory
                        .filter(a => a.result === 'failed')
                        .map(a => a.agentMessage.slice(0, 150)),
                    successApproach: result.message.slice(0, 400),
                    toolsUsed: Object.keys(tools),
                    finalUrl: result.finalUrl,
                    confidence: 0.95,
                    ts: Date.now(),
                };
                this.memory.recoveryTips.push(tip);
                appendAgentEvent(this.agentDir, { event: 'recovery_tip_written', tip });
                appendAgentLog(this.agentDir, `Recovery tip stored: ${tip.successApproach.slice(0, 80)}`);
                break;
            }
        }
    }

    /**
     * Runs a follow-up task in the already-open browser session.
     *
     * @param {string} task - Natural-language task.
     * @returns {Promise<{summary: string, screenshotPath: string}>}
     * @throws {Error} If start() has not been called (or destroy() already ran).
     */
    async resume(task) {
        if (!this.stagehand)
            throw new Error('BrowserAgent not started');
        const context = await this.buildRLMContext(task);
        appendAgentLog(this.agentDir, `Resuming with task: ${task}`);
        appendAgentLog(this.agentDir, `RLM context: ${context.slice(0, 200)}...`);
        await this.executeTask(context, task, {});
        const screenshotPath = await this.takeScreenshot();
        const summary = await this.summarizeLastTask(task);
        return { summary, screenshotPath };
    }

    /**
     * Returns a snapshot of the session memory. The `actions` and
     * `recoveryTips` arrays are copied so callers cannot mutate the agent's
     * own lists (entries themselves are shared).
     */
    exportMemory() {
        return {
            ...this.memory,
            actions: [...this.memory.actions],
            recoveryTips: [...this.memory.recoveryTips],
        };
    }

    /** Returns a copy of the retry history of the last start(). */
    exportRetryHistory() {
        return [...this.retryHistory];
    }

    /** Closes the browser if one is open; safe to call repeatedly. */
    async destroy() {
        if (this.stagehand) {
            await this.stagehand.close();
            this.stagehand = null;
        }
    }

    // --- Private: browser + prompt setup ---

    /** Builds the Stagehand instance (Browserbase when both its keys are set, else local headless Chrome). */
    createStagehand() {
        const cuaModelString = this.config.cua_model ?? 'anthropic/claude-opus-4-6';
        // Strip 'openrouter/' prefix — Stagehand uses provider/model directly
        const stagehandModelName = cuaModelString.startsWith('openrouter/')
            ? cuaModelString.slice('openrouter/'.length)
            : cuaModelString;
        const apiKey = this.config.anthropic_api_key
            ?? this.config.openrouter_api_key
            ?? this.config.openai_api_key
            ?? process.env.ANTHROPIC_API_KEY
            ?? process.env.OPENROUTER_API_KEY
            ?? '';
        const sharedOptions = {
            verbose: 0,
            model: { modelName: stagehandModelName, apiKey },
            logger: () => { },
            experimental: true,
            disableAPI: true,
        };
        const useBrowserbase = !!(this.config.browserbase_api_key && this.config.browserbase_project_id);
        if (useBrowserbase) {
            appendAgentLog(this.agentDir, `Using Browserbase (project: ${this.config.browserbase_project_id})`);
            return new Stagehand({
                env: 'BROWSERBASE',
                apiKey: this.config.browserbase_api_key,
                projectId: this.config.browserbase_project_id,
                ...sharedOptions,
            });
        }
        appendAgentLog(this.agentDir, `Using local Chrome (headless)`);
        return new Stagehand({
            env: 'LOCAL',
            localBrowserLaunchOptions: { headless: true },
            ...sharedOptions,
        });
    }

    /**
     * Builds the instruction preamble for the initial task. When a proven
     * approach exists it is the only hint included, so lower-confidence hints
     * cannot contradict it; otherwise hints with confidence > 0.5 are listed.
     */
    buildSystemContext(inbox, initialTask, harnessHints, provenApproach) {
        const hintLines = provenApproach
            ? `- ${provenApproach.observation}`
            : harnessHints
                .filter(h => h.confidence > 0.5)
                .map(h => `- ${h.observation}`)
                .join('\n');
        return [
            `You are testing this web app as a first-time user.`,
            `Your email address is: ${inbox}`,
            `Your task: ${initialTask}`,
            `Navigate the app, complete the task, and note anything confusing, broken, or unclear.`,
            `Do not skip steps. Use the email ${inbox} when asked for an email.`,
            `If verification fails and you need to resend a code, wait for any cooldown timer shown before clicking Resend. Then call readInboxEmail again to get the new code.`,
            hintLines ? `\nKnown context from previous runs:\n${hintLines}` : '',
        ]
            .filter(Boolean)
            .join('\n');
    }

    // --- Private: RLM context builder ---

    /** Summarizes recent and failed actions into a short context for the next task. */
    async buildRLMContext(nextTask) {
        const page = this.stagehand.context.pages()[0];
        const currentUrl = page.url();
        const recentWindow = this.memory.actions.slice(-this.rlmRecentActions);
        const failedWindow = this.memory.actions
            .filter(a => a.result === 'failed')
            .slice(-this.rlmMaxFailedActions);
        const [recentContext, failureContext] = await this.llmBatch([
            {
                data: recentWindow,
                prompt: 'What is the current browser state and what has the agent done most recently?',
            },
            {
                data: failedWindow,
                prompt: 'What has failed before that the agent should avoid repeating?',
            },
        ]);
        return [
            `You are a browser automation agent testing a web app.`,
            `Current URL: ${currentUrl}`,
            `Next task: ${nextTask}`,
            `Total actions taken so far: ${this.memory.actions.length + this.memory.archivedActionCount}`,
            `Recent state: ${recentContext}`,
            failureContext !== '(no data)' ? `Things to avoid: ${failureContext}` : null,
        ]
            .filter(Boolean)
            .join('\n');
    }

    /**
     * Answers each query about a list of actions; a query with no actions (or
     * an empty answer) yields the sentinel '(no data)'.
     */
    async llmBatch(queries) {
        const prompts = queries.map(({ data, prompt }) => {
            if (data.length === 0)
                return null;
            const dataStr = data
                .map(a => `${a.action} → ${a.result}${a.observation ? ` | ${a.observation}` : ''}`)
                .join('\n');
            return `${prompt}\n\nActions:\n${dataStr}\n\nAnswer in 1-2 sentences.`;
        });
        return Promise.all(prompts.map(async (p) => {
            if (p === null)
                return '(no data)';
            const text = await cheapBatch([p], this.config, 150);
            return text[0] || '(no data)';
        }));
    }

    // --- Private: task execution ---

    /**
     * Runs one task through stagehand.agent().execute() and records the steps.
     * If execute() throws, falls back to up to five individual act() calls on
     * whatever observe() returns, and reports the task as not completed.
     *
     * @returns {Promise<{completed: boolean, message: string, finalUrl: string}>}
     * @throws {Error} If the browser has not been started.
     */
    async executeTask(systemContext, task, tools = {}) {
        if (!this.stagehand)
            throw new Error('Stagehand not initialized');
        const page = this.stagehand.context.pages()[0];
        const startUrl = page.url();
        const fullInstruction = systemContext ? `${systemContext}\n\nTask: ${task}` : task;
        try {
            // Tools are passed to stagehand.agent() config, not to execute()
            const agentConfig = {};
            if (Object.keys(tools).length > 0) {
                agentConfig.tools = tools;
            }
            const agent = this.stagehand.agent(agentConfig);
            const result = await agent.execute({ instruction: fullInstruction, maxSteps: 15 });
            await page.waitForLoadState('load').catch(() => { });
            const newUrl = page.url();
            appendAgentLog(this.agentDir, `agent.execute() completed: ${result.completed ? 'done' : 'incomplete'}`);
            appendAgentLog(this.agentDir, `  steps: ${result.actions?.length ?? 0}, tools injected: ${Object.keys(tools).join(', ') || 'none'}`);
            appendAgentLog(this.agentDir, `  message: ${result.message}`);
            appendAgentLog(this.agentDir, `  final url: ${newUrl}`);
            // Record each step as an ActionRecord for RLM memory
            for (const action of (result.actions ?? [])) {
                this.recordAction({
                    ts: Date.now(),
                    action: action.type ?? 'unknown',
                    result: 'success',
                    observation: action.reasoning ?? undefined,
                    url: startUrl,
                });
            }
            // Record overall outcome with agent's message as observation
            this.recordAction({
                ts: Date.now(),
                action: task.slice(0, 100),
                result: result.completed ? 'success' : 'failed',
                observation: result.message ?? (newUrl !== startUrl ? `Navigated to ${newUrl}` : `Stayed on ${startUrl}`),
                url: startUrl,
            });
            return { completed: result.completed, message: result.message ?? '', finalUrl: newUrl };
        }
        catch (err) {
            appendAgentLog(this.agentDir, `agent.execute() failed: ${err}`);
            // Fallback: individual act() calls per observed action
            let allActions = [];
            try {
                allActions = await this.stagehand.observe();
            }
            catch (observeErr) {
                // Best effort: keep going with no fallback actions, but leave a trace.
                appendAgentLog(this.agentDir, `observe() fallback failed: ${observeErr}`);
            }
            if (allActions.length > 0) {
                appendAgentLog(this.agentDir, `Falling back to ${allActions.length} individual act() calls`);
                for (const action of allActions.slice(0, 5)) {
                    try {
                        await this.stagehand.act(action.description);
                        await page.waitForLoadState('load').catch(() => { });
                        this.recordAction({
                            ts: Date.now(),
                            action: action.description,
                            selector: action.selector,
                            result: 'success',
                            url: startUrl,
                        });
                        appendAgentLog(this.agentDir, `  ok ${action.description}`);
                    }
                    catch (err2) {
                        this.recordAction({
                            ts: Date.now(),
                            action: action.description,
                            selector: action.selector,
                            result: 'failed',
                            observation: String(err2),
                            url: startUrl,
                        });
                        appendAgentLog(this.agentDir, `  fail ${action.description}: ${err2}`);
                    }
                }
            }
            else {
                this.recordAction({
                    ts: Date.now(),
                    action: task.slice(0, 100),
                    result: 'failed',
                    observation: String(err),
                    url: startUrl,
                });
            }
            return { completed: false, message: String(err), finalUrl: page.url() };
        }
    }

    /** Appends an action to memory and the event log, archiving the oldest batch past the threshold. */
    recordAction(action) {
        this.memory.actions.push(action);
        appendAgentEvent(this.agentDir, { event: 'action', ...action });
        // Archive oldest actions when exceeding threshold (RLM memory management)
        if (this.memory.actions.length > ARCHIVE_THRESHOLD) {
            const archived = this.memory.actions.splice(0, ARCHIVE_BATCH);
            this.memory.archivedActionCount += archived.length;
            appendAgentEvent(this.agentDir, {
                event: 'actions_archived',
                count: archived.length,
                total_archived: this.memory.archivedActionCount,
                actions: archived,
            });
        }
    }

    /**
     * Saves a numbered PNG of the current page under `<agentDir>/screenshots`.
     * Returns '' when no browser is running; otherwise returns the target path
     * (a capture failure is logged, not thrown).
     */
    async takeScreenshot() {
        if (!this.stagehand)
            return '';
        this.screenshotIndex++;
        const screenshotDir = path.join(this.agentDir, 'screenshots');
        const filename = `${String(this.screenshotIndex).padStart(3, '0')}.png`;
        const screenshotPath = path.join(screenshotDir, filename);
        try {
            const page = this.stagehand.context.pages()[0];
            await page.screenshot({ path: screenshotPath });
            appendAgentLog(this.agentDir, `Screenshot saved: ${filename}`);
        }
        catch (err) {
            appendAgentLog(this.agentDir, `Screenshot failed: ${err}`);
        }
        return screenshotPath;
    }

    /** Summarizes the last (up to 10) recorded actions for `task`; falls back to a success count if the call fails. */
    async summarizeLastTask(task) {
        const recentActions = this.memory.actions.slice(-10);
        if (recentActions.length === 0)
            return 'No actions recorded.';
        const actionsStr = recentActions
            .map(a => `${a.action} → ${a.result}${a.observation ? ` (${a.observation})` : ''}`)
            .join('\n');
        const prompt = `Task: "${task}"\n\nActions taken:\n${actionsStr}\n\nSummarize in 1-2 sentences: what happened, did the task complete, and anything confusing or broken?`;
        try {
            const text = await cheapCall(prompt, this.config, 200);
            return text || 'Task execution complete.';
        }
        catch {
            return `Completed ${recentActions.filter(a => a.result === 'success').length}/${recentActions.length} actions.`;
        }
    }
}
393
+ //# sourceMappingURL=agent.js.map