@demigodmode/pi-web-agent 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. package/README.md +63 -199
  2. package/dist/scripts/live-web-eval.d.ts +1 -0
  3. package/dist/scripts/live-web-eval.js +411 -0
  4. package/dist/src/cache/ttl-cache.d.ts +8 -0
  5. package/dist/src/cache/ttl-cache.js +21 -0
  6. package/dist/src/extension.d.ts +2 -0
  7. package/dist/src/extension.js +155 -0
  8. package/dist/src/extract/readability.d.ts +8 -0
  9. package/dist/src/extract/readability.js +93 -0
  10. package/dist/src/fetch/browser-resolution.d.ts +15 -0
  11. package/dist/src/fetch/browser-resolution.js +55 -0
  12. package/dist/src/fetch/headless-fetch.d.ts +18 -0
  13. package/dist/src/fetch/headless-fetch.js +87 -0
  14. package/dist/src/fetch/http-fetch.d.ts +4 -0
  15. package/dist/src/fetch/http-fetch.js +50 -0
  16. package/dist/src/orchestration/index.d.ts +41 -0
  17. package/dist/src/orchestration/index.js +9 -0
  18. package/dist/src/orchestration/research-orchestrator.d.ts +43 -0
  19. package/dist/src/orchestration/research-orchestrator.js +87 -0
  20. package/dist/src/orchestration/research-types.d.ts +41 -0
  21. package/dist/src/orchestration/research-types.js +1 -0
  22. package/dist/src/orchestration/research-worker.d.ts +16 -0
  23. package/dist/src/orchestration/research-worker.js +131 -0
  24. package/dist/src/search/duckduckgo.d.ts +9 -0
  25. package/dist/src/search/duckduckgo.js +52 -0
  26. package/dist/src/tools/web-explore.d.ts +44 -0
  27. package/dist/src/tools/web-explore.js +50 -0
  28. package/dist/src/tools/web-fetch-headless.d.ts +6 -0
  29. package/dist/src/tools/web-fetch-headless.js +14 -0
  30. package/dist/src/tools/web-fetch.d.ts +6 -0
  31. package/dist/src/tools/web-fetch.js +14 -0
  32. package/dist/src/tools/web-search.d.ts +10 -0
  33. package/dist/src/tools/web-search.js +103 -0
  34. package/dist/src/types.d.ts +48 -0
  35. package/dist/src/types.js +7 -0
  36. package/dist/tests/cache/ttl-cache.test.d.ts +1 -0
  37. package/dist/tests/cache/ttl-cache.test.js +19 -0
  38. package/dist/tests/contracts.test.d.ts +1 -0
  39. package/dist/tests/contracts.test.js +65 -0
  40. package/dist/tests/extension.test.d.ts +1 -0
  41. package/dist/tests/extension.test.js +123 -0
  42. package/dist/tests/extract/readability.test.d.ts +1 -0
  43. package/dist/tests/extract/readability.test.js +79 -0
  44. package/dist/tests/fetch/browser-resolution.test.d.ts +1 -0
  45. package/dist/tests/fetch/browser-resolution.test.js +37 -0
  46. package/dist/tests/fetch/headless-fetch.smoke.test.d.ts +1 -0
  47. package/dist/tests/fetch/headless-fetch.smoke.test.js +17 -0
  48. package/dist/tests/fetch/headless-fetch.test.d.ts +1 -0
  49. package/dist/tests/fetch/headless-fetch.test.js +150 -0
  50. package/dist/tests/fetch/http-fetch.test.d.ts +1 -0
  51. package/dist/tests/fetch/http-fetch.test.js +129 -0
  52. package/dist/tests/orchestration/research-orchestrator.test.d.ts +1 -0
  53. package/dist/tests/orchestration/research-orchestrator.test.js +298 -0
  54. package/dist/tests/orchestration/research-worker.test.d.ts +1 -0
  55. package/dist/tests/orchestration/research-worker.test.js +171 -0
  56. package/dist/tests/orchestration/research-workflow.test.d.ts +1 -0
  57. package/dist/tests/orchestration/research-workflow.test.js +119 -0
  58. package/dist/tests/package-manifest.test.d.ts +1 -0
  59. package/dist/tests/package-manifest.test.js +29 -0
  60. package/dist/tests/release-foundation.test.d.ts +1 -0
  61. package/dist/tests/release-foundation.test.js +16 -0
  62. package/dist/tests/release-script.test.d.ts +1 -0
  63. package/dist/tests/release-script.test.js +72 -0
  64. package/dist/tests/search/duckduckgo.test.d.ts +1 -0
  65. package/dist/tests/search/duckduckgo.test.js +103 -0
  66. package/dist/tests/tools/web-explore.test.d.ts +1 -0
  67. package/dist/tests/tools/web-explore.test.js +163 -0
  68. package/dist/tests/tools/web-fetch-headless.test.d.ts +1 -0
  69. package/dist/tests/tools/web-fetch-headless.test.js +31 -0
  70. package/dist/tests/tools/web-fetch.test.d.ts +1 -0
  71. package/dist/tests/tools/web-fetch.test.js +27 -0
  72. package/dist/tests/tools/web-search.test.d.ts +1 -0
  73. package/dist/tests/tools/web-search.test.js +125 -0
  74. package/dist/vitest.config.d.ts +2 -0
  75. package/dist/vitest.config.js +13 -0
  76. package/package.json +5 -1
package/README.md CHANGED
@@ -1,199 +1,63 @@
1
- # pi-web-agent
2
-
3
- `@demigodmode/pi-web-agent` is a Pi package for reliable web access.
4
-
5
- It is built around a simple rule: searching for a page is not the same thing as reading it. This package keeps those steps separate, prefers plain HTTP by default, and is designed to say "I couldn't read this reliably" instead of making something up.
6
-
7
- ## What it does
8
-
9
- The package is built around three tools:
10
-
11
- - `web_search` finds relevant pages and returns titles, URLs, and snippets
12
- - `web_fetch` fetches a specific page over plain HTTP and tries to extract readable content
13
- - `web_fetch_headless` is the explicit browser-based path for pages that need rendering
14
-
15
- The boundary between those tools is intentional.
16
-
17
- `web_search` is for discovery. It should not imply that a page was fetched.
18
-
19
- `web_fetch` is for reading a page over HTTP. If the result looks weak, incomplete, blocked, or too script-heavy, it should return `needs_headless` instead of bluffing.
20
-
21
- `web_fetch_headless` exists for the cases where a browser really is required. It is opt-in only.
22
-
23
- ## Why this exists
24
-
25
- A lot of web tooling in coding agents gets fuzzy in exactly the wrong places. Search results get treated like page reads. Browser fallback happens behind the scenes. Failures get softened into fake confidence.
26
-
27
- This package is trying to do the opposite.
28
-
29
- The rules are straightforward:
30
-
31
- - no hidden browser launch
32
- - no automatic HTTP-to-headless fallback
33
- - no claiming a page was read when only snippets were available
34
- - explicit structured failure when the result is incomplete or blocked
35
-
36
- ## What makes it different
37
-
38
- The main thing is the contract.
39
-
40
- `web_search` discovers sources.
41
-
42
- `web_fetch` reads over HTTP only.
43
-
44
- `web_fetch_headless` is the explicit browser path.
45
-
46
- That separation is the whole point. It makes failures easier to reason about and avoids the weird behavior where a tool quietly changes execution mode behind your back.
47
-
48
- ## Install
49
-
50
- Install it through Pi:
51
-
52
- ```bash
53
- pi install npm:@demigodmode/pi-web-agent
54
- ```
55
-
56
- Update installed packages later with:
57
-
58
- ```bash
59
- pi update
60
- ```
61
-
62
- If you just want to inspect the package from npm directly, the package name is:
63
-
64
- ```bash
65
- npm view @demigodmode/pi-web-agent
66
- ```
67
-
68
- ## Current status
69
-
70
- This repo is in early MVP shape, but it is no longer just a design doc.
71
-
72
- Right now it has:
73
-
74
- - a TypeScript project scaffold
75
- - shared result and status contracts
76
- - a DuckDuckGo HTML parser for `web_search`
77
- - an HTTP fetch path with readability-based extraction and conservative escalation to `needs_headless`
78
- - a real browser-backed `web_fetch_headless` implementation with local browser resolution
79
- - repo-local Pi extension wiring for development
80
- - a test suite around parser behavior, contracts, extraction, caching, and tool adapters
81
- - optional smoke coverage for local installed browsers
82
-
83
- So the project is real and usable, but still early.
84
-
85
- ## Example behavior
86
-
87
- These are conceptual examples of the contract the package is aiming to expose.
88
-
89
- ### Search
90
-
91
- `web_search("pi coding agent")`
92
-
93
- Returns discovery results like:
94
-
95
- - title
96
- - URL
97
- - snippet
98
-
99
- It does not imply the page was fetched.
100
-
101
- ### HTTP fetch
102
-
103
- `web_fetch("https://example.com/article")`
104
-
105
- If the page is readable over plain HTTP, it should return extracted content.
106
-
107
- If the page looks too script-heavy, too thin, blocked, or otherwise unreliable, it should return `needs_headless` instead of pretending the extraction is good enough.
108
-
109
- ### Explicit headless fetch
110
-
111
- `web_fetch_headless("https://example.com/app")`
112
-
113
- This is the browser-based path for pages that really need rendering.
114
-
115
- This path now launches a local browser explicitly, waits for the rendered page to settle, and then extracts readable content from the rendered HTML.
116
-
117
- ## Local development
118
-
119
- Install dependencies:
120
-
121
- ```bash
122
- npm install
123
- ```
124
-
125
- Run tests with coverage:
126
-
127
- ```bash
128
- npm test
129
- ```
130
-
131
- Run the typecheck used as lint:
132
-
133
- ```bash
134
- npm run lint
135
- ```
136
-
137
- Build the project:
138
-
139
- ```bash
140
- npm run build
141
- ```
142
-
143
- To run the optional real-browser smoke test for headless fetch, set `PI_HEADLESS_SMOKE=1` before running Vitest. It stays skipped by default so local browser install differences do not make the normal test suite flaky.
144
-
145
- Coverage is now part of the normal `npm test` flow. Vitest prints a text summary in the terminal and writes the full HTML report to `coverage/`.
146
-
147
- ### Trying it in Pi locally
148
-
149
- This repo includes a project-local Pi extension entrypoint at `.pi/extensions/pi-web-agent.ts` for development and hot reload.
150
-
151
- For the published npm package, Pi loads the compiled runtime from `dist/extension.js` via the `pi.extensions` entry in `package.json`.
152
-
153
- After starting Pi in this project, use `/reload` if you change the extension code and want Pi to pick up the latest version.
154
-
155
- ## Project layout
156
-
157
- The code is split into small modules on purpose.
158
-
159
- - `src/extension.ts` - package entry surface
160
- - `src/tools/` - thin tool adapters
161
- - `src/search/` - search backend logic
162
- - `src/fetch/` - HTTP and headless fetch logic
163
- - `src/extract/` - readable-content extraction
164
- - `src/cache/` - small cache utilities
165
- - `src/types.ts` - shared contracts
166
- - `tests/` - parser, contract, extraction, fetch, and adapter tests
167
-
168
- ## License
169
-
170
- AGPL-3.0-only. See `LICENSE`.
171
-
172
- ## Release process
173
-
174
- 1. Update `CHANGELOG.md` under `## Unreleased`.
175
- 2. Run `npm run release:dry-run` to preview the next version.
176
- 3. Run `npm run release` to bump version, rewrite the changelog release heading, create a release commit, and create a tag.
177
- 4. Push the branch and tag.
178
- 5. GitHub Actions publishes the tagged release to npm.
179
-
180
- ## Maintainer release notes
181
-
182
- This repo is set up for npm Trusted Publishing from GitHub Actions.
183
-
184
- In npm package settings, add a trusted publisher for:
185
- - package: `@demigodmode/pi-web-agent`
186
- - provider: GitHub Actions
187
- - repository: `demigodmode/pi-web-agent`
188
-
189
- That replaces the old `NPM_TOKEN` secret flow.
190
-
191
- ## Near-term next steps
192
-
193
- The next chunk of work is pretty clear:
194
-
195
- - keep tightening weak-content escalation on tricky HTTP targets
196
- - improve cleanup of noisy rendered-page extraction on busy sites
197
- - expand fixtures and end-to-end coverage
198
- - add alternate search backends behind a first-class provider abstraction
199
-
1
+ # pi-web-agent
2
+
3
+ [![CI](https://github.com/demigodmode/pi-web-agent/actions/workflows/ci.yml/badge.svg)](https://github.com/demigodmode/pi-web-agent/actions/workflows/ci.yml)
4
+ [![npm version](https://img.shields.io/npm/v/@demigodmode/pi-web-agent)](https://www.npmjs.com/package/@demigodmode/pi-web-agent)
5
+ [![Docs](https://img.shields.io/badge/docs-github%20pages-blue)](https://demigodmode.github.io/pi-web-agent/)
6
+
7
+ `@demigodmode/pi-web-agent` is a Pi package for web access.
8
+
9
+ The whole point is keeping the boundaries straight:
10
+
11
+ - `web_search` is for discovery
12
+ - `web_fetch` is for plain HTTP reads
13
+ - `web_fetch_headless` is the explicit browser path
14
+ - `web_explore` is the bounded research path
15
+
16
+ That sounds obvious, but a lot of agent tooling gets fuzzy right there. This package is meant to be stricter about what it actually did and more willing to say when a read was not good enough to trust.
17
+
18
+ ## Install
19
+
20
+ ```bash
21
+ pi install npm:@demigodmode/pi-web-agent
22
+ ```
23
+
24
+ Later on, update installed packages with:
25
+
26
+ ```bash
27
+ pi update
28
+ ```
29
+
30
+ ## Docs
31
+
32
+ Docs site:
33
+
34
+ - https://demigodmode.github.io/pi-web-agent/
35
+
36
+ Work on the docs locally:
37
+
38
+ ```bash
39
+ npm run docs:dev
40
+ ```
41
+
42
+ Build the docs:
43
+
44
+ ```bash
45
+ npm run docs:build
46
+ ```
47
+
48
+ ## Local development
49
+
50
+ ```bash
51
+ npm install
52
+ npm test
53
+ npm run lint
54
+ npm run build
55
+ ```
56
+
57
+ For local Pi work, this repo includes `.pi/extensions/pi-web-agent.ts`.
58
+
59
+ If Pi is already running, use `/reload` after changes.
60
+
61
+ ## License
62
+
63
+ AGPL-3.0-only. See `LICENSE`.
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,411 @@
1
+ import { mkdir, writeFile } from 'node:fs/promises';
2
+ import path from 'node:path';
3
+ import process from 'node:process';
4
+ import { AuthStorage, createAgentSession, ModelRegistry, SessionManager } from '@mariozechner/pi-coding-agent';
5
+ import { createWebSearchTool } from '../src/tools/web-search.js';
6
/**
 * Research prompts that are run, one at a time, through a live agent session.
 *
 * Every entry exposes a stable `id`, a short `title` used in console output
 * and report headings, and the `prompt` text handed to the session. The ids
 * are not contiguous: the list is prompt-1, prompt-2, prompt-4.
 */
const PROMPTS = [
    [
        'prompt-1',
        'Playwright installed browser guidance',
        'Find current docs or discussions about Playwright launching an installed Chrome or Edge executable instead of a bundled browser, then summarize the recommended approach.'
    ],
    [
        'prompt-2',
        'Vitest coverage configuration',
        'Find the current Vitest coverage docs and tell me how to enable coverage with the V8 provider in a TypeScript project.'
    ],
    [
        'prompt-4',
        'DuckDuckGo HTML scraping pitfalls',
        'Find two or three current sources on DuckDuckGo HTML scraping in Node.js and tell me what the common parsing pitfalls are.'
    ]
].map(([id, title, prompt]) => ({ id, title, prompt }));
23
/**
 * Deterministic fixtures for the search tool's failure classification.
 *
 * Each case supplies a stubbed `searchHtml` provider (it returns a promise of
 * canned HTML, or a rejected promise) together with the error code and
 * message the search tool is expected to report for that input.
 */
const SEARCH_FAILURE_CASES = [
    {
        id: 'no-results',
        title: 'NO_RESULTS classification',
        expectedCode: 'NO_RESULTS',
        expectedMessage: 'DuckDuckGo returned no usable results for this query.',
        // A results container that only holds a "no results" notice.
        searchHtml: () => Promise.resolve(`
<html>
<body>
<div class="results">
<div class="no-results">No results found for your search.</div>
</div>
</body>
</html>
`)
    },
    {
        id: 'parse-failed',
        title: 'PARSE_FAILED classification',
        expectedCode: 'PARSE_FAILED',
        expectedMessage: 'DuckDuckGo returned a page, but it did not match the expected results format.',
        // A well-formed page that has no results markup at all.
        searchHtml: () => Promise.resolve(`
<html>
<body>
<main>
<h1>Unexpected page</h1>
<p>Nothing here looks like a search results page.</p>
</main>
</body>
</html>
`)
    },
    {
        id: 'blocked-html',
        title: 'BLOCKED classification from challenge HTML',
        expectedCode: 'BLOCKED',
        expectedMessage: 'DuckDuckGo search appears to be blocked or rate limited.',
        // A human-verification challenge page.
        searchHtml: () => Promise.resolve(`
<html>
<body>
<main>
<h1>Are you a robot?</h1>
<p>Please verify you are human to continue.</p>
</main>
</body>
</html>
`)
    },
    {
        id: 'fetch-failed',
        title: 'FETCH_FAILED classification',
        expectedCode: 'FETCH_FAILED',
        expectedMessage: 'DuckDuckGo search request failed: socket hang up',
        // The provider itself fails; the error message feeds the expected text.
        searchHtml: () => Promise.reject(new Error('socket hang up'))
    }
];
81
/**
 * Returns the current time formatted as an ISO-8601 string
 * (the output of `Date.prototype.toISOString`).
 */
function isoNow() {
    const now = new Date();
    return now.toISOString();
}
84
/**
 * Produces a timestamp suitable for use in a file name.
 *
 * The ISO-8601 form of `date` is used with every `:` and `.` swapped for `-`.
 *
 * @param date Moment to format; defaults to the time of the call.
 * @returns The sanitized timestamp, e.g. `1970-01-01T00-00-00-000Z`.
 */
function safeFileStamp(date = new Date()) {
    const iso = date.toISOString();
    return iso.split(/[:.]/).join('-');
}
87
/**
 * Pulls plain text out of a loosely shaped message or content value.
 *
 * Resolution order:
 *  1. a string is returned unchanged;
 *  2. anything that is not a non-null object yields `''`;
 *  3. an array is flattened by extracting each element and joining the
 *     non-empty pieces with newlines;
 *  4. an object with a string `text` yields that text;
 *  5. an object with a string `content` yields that content;
 *  6. an object with an array `content` yields the string `text` of each
 *     object item, joined with newlines;
 *  7. an object whose `message` is an object with an array `content` is
 *     resolved through that nested message;
 *  8. otherwise `''`.
 *
 * @param value Arbitrary value to inspect.
 * @returns The extracted text, or an empty string when none is found.
 */
function extractText(value) {
    if (typeof value === 'string') {
        return value;
    }
    if (value === null || typeof value !== 'object') {
        return '';
    }
    if (Array.isArray(value)) {
        const pieces = [];
        for (const element of value) {
            const piece = extractText(element);
            if (piece) {
                pieces.push(piece);
            }
        }
        return pieces.join('\n');
    }
    const { text, content, message } = value;
    if (typeof text === 'string') {
        return text;
    }
    if (typeof content === 'string') {
        return content;
    }
    if (Array.isArray(content)) {
        const pieces = [];
        for (const part of content) {
            // Only object items carrying a non-empty string `text` contribute.
            const hasText = !!part && typeof part === 'object' && typeof part.text === 'string';
            if (hasText && part.text) {
                pieces.push(part.text);
            }
        }
        return pieces.join('\n');
    }
    // A wrapper is only unwrapped when the nested message has array content.
    const wrapsMessage = !!message && typeof message === 'object' && Array.isArray(message.content);
    return wrapsMessage ? extractText(message) : '';
}
120
/**
 * Returns the `details` payload of a tool result when it has one.
 *
 * Non-object inputs are passed through unchanged. For objects, `details` is
 * returned unless it is `null`/`undefined`, in which case the result itself
 * is returned.
 *
 * @param result Raw tool result.
 * @returns `result.details` when present, otherwise `result`.
 */
function toolDetails(result) {
    const isObject = !!result && typeof result === 'object';
    return isObject ? (result.details ?? result) : result;
}
126
/**
 * Tells whether a search tool result succeeded but found nothing.
 *
 * @param result Raw tool result (its `details` payload is inspected when present).
 * @returns `true` only when `status` is `'ok'` and `results` is an empty array.
 */
function isEmptySearchResult(result) {
    const details = toolDetails(result);
    const isRecord = details !== null && typeof details === 'object';
    if (!isRecord) {
        return false;
    }
    if (details.status !== 'ok') {
        return false;
    }
    return Array.isArray(details.results) && details.results.length === 0;
}
133
/**
 * Tells whether a fetch tool result reported the `'unsupported'` status.
 *
 * @param result Raw tool result (its `details` payload is inspected when present).
 * @returns `true` when the payload is an object whose `status` is `'unsupported'`.
 */
function isUnsupportedFetchResult(result) {
    const details = toolDetails(result);
    if (!details || typeof details !== 'object') {
        return false;
    }
    return details.status === 'unsupported';
}
137
/**
 * Tells whether a headless fetch result looks like a bot-check interstitial.
 *
 * The content's title (when the content is a plain object) and its extracted
 * text are searched, case-insensitively, for the phrases "just a moment",
 * "security verification" or "verify you are not a bot".
 *
 * @param result Raw tool result (its `details` payload is inspected when present).
 * @returns `true` when one of the challenge phrases is found.
 */
function isBotCheckHeadlessResult(result) {
    const details = toolDetails(result);
    if (details === null || typeof details !== 'object') {
        return false;
    }
    const { content } = details;
    const bodyText = extractText(content);
    let pageTitle = '';
    if (content && typeof content === 'object' && !Array.isArray(content)) {
        pageTitle = String(content.title ?? '');
    }
    const haystack = `${pageTitle}\n${bodyText}`;
    return /just a moment|security verification|verify you are not a bot/i.test(haystack);
}
149
/**
 * Tells whether a tool result is a rejection carrying the
 * `POST_WEB_EXPLORE_GUARD` error code.
 *
 * @param result Raw tool result (its `details` payload is inspected when present).
 * @returns `true` when the payload has an `error` object with that code.
 */
function isPostWebExploreGuardResult(result) {
    const details = toolDetails(result);
    const error = details && typeof details === 'object' ? details.error : undefined;
    return !!error && typeof error === 'object' && error.code === 'POST_WEB_EXPLORE_GUARD';
}
159
/**
 * Summarizes the recorded tool calls of one prompt run into counters.
 *
 * "Low-level" tools are `web_search`, `web_fetch` and `web_fetch_headless`.
 * The two `...AfterExplore` counters look at the calls that follow the first
 * `web_explore` call; when `web_explore` never ran they cover every call.
 * Low-level calls are split by whether their result is a
 * `POST_WEB_EXPLORE_GUARD` rejection (guarded) or not.
 *
 * @param toolCalls Recorded calls, each with `toolName` and (once finished) `result`.
 * @returns The metrics object consumed by the verdict and report code.
 */
function buildMetrics(toolCalls) {
    const LOW_LEVEL_TOOLS = ['web_search', 'web_fetch', 'web_fetch_headless'];
    const WEB_TOOLS = ['web_explore', ...LOW_LEVEL_TOOLS];
    const countWhere = (calls, matches) => calls.reduce((total, call) => (matches(call) ? total + 1 : total), 0);
    const isTool = (toolName) => (call) => call.toolName === toolName;
    const isLowLevel = (call) => LOW_LEVEL_TOOLS.includes(call.toolName);
    const webCalls = toolCalls.filter((call) => WEB_TOOLS.includes(call.toolName));
    const exploreAt = toolCalls.findIndex(isTool('web_explore'));
    // When web_explore never ran, exploreAt is -1 and slice(0) is the whole list.
    const afterExplore = toolCalls.slice(exploreAt + 1);
    return {
        webExploreUsed: exploreAt !== -1,
        webExploreFirstWebTool: webCalls[0]?.toolName === 'web_explore',
        totalToolCalls: toolCalls.length,
        totalWebToolCalls: webCalls.length,
        searchCalls: countWhere(toolCalls, isTool('web_search')),
        fetchCalls: countWhere(toolCalls, isTool('web_fetch')),
        headlessCalls: countWhere(toolCalls, isTool('web_fetch_headless')),
        lowLevelCallsAfterExplore: countWhere(afterExplore, (call) => isLowLevel(call) && !isPostWebExploreGuardResult(call.result)),
        guardedLowLevelCallsAfterExplore: countWhere(afterExplore, (call) => isLowLevel(call) && isPostWebExploreGuardResult(call.result)),
        emptySearches: countWhere(toolCalls, (call) => call.toolName === 'web_search' && isEmptySearchResult(call.result)),
        unsupportedFetches: countWhere(toolCalls, (call) => (call.toolName === 'web_fetch' || call.toolName === 'web_fetch_headless') &&
            isUnsupportedFetchResult(call.result)),
        botCheckHeadlesses: countWhere(toolCalls, (call) => call.toolName === 'web_fetch_headless' && isBotCheckHeadlessResult(call.result))
    };
}
189
/**
 * Grades one prompt run from its metrics and final answer.
 *
 * Verdicts:
 *  - `fail`  — `web_explore` was never used, or the final answer is blank;
 *  - `pass`  — `web_explore` was the first web tool, at most one low-level
 *              call followed it, and there were no empty searches or
 *              bot-check pages;
 *  - `mixed` — everything else.
 *
 * Every `mixed` verdict carries at least one note. Previously a run with
 * exactly two low-level calls after `web_explore` was graded `mixed` with an
 * empty note list, because notes were only added above two calls while the
 * clean limit is one; that gap now gets its own note.
 *
 * @param metrics Output of `buildMetrics`.
 * @param finalAnswer Final assistant text for the run.
 * @returns `{ verdict, notes }` where `notes` explains every deduction.
 */
function evaluateVerdict(metrics, finalAnswer) {
    // At most this many low-level calls after web_explore still counts as clean.
    const cleanLowLevelLimit = 1;
    // Above this many low-level calls the run is flagged as excessive.
    const excessiveLowLevelLimit = 2;
    const notes = [];
    if (!metrics.webExploreUsed) {
        notes.push('web_explore was not used');
        return { verdict: 'fail', notes };
    }
    if (!metrics.webExploreFirstWebTool) {
        notes.push('web_explore was not the first web research tool');
    }
    if (metrics.lowLevelCallsAfterExplore > excessiveLowLevelLimit) {
        notes.push(`too many low-level calls after web_explore (${metrics.lowLevelCallsAfterExplore})`);
    }
    else if (metrics.lowLevelCallsAfterExplore > cleanLowLevelLimit) {
        // Not excessive, but still blocks a clean pass, so say why.
        notes.push(`more than one low-level call after web_explore (${metrics.lowLevelCallsAfterExplore})`);
    }
    if (metrics.emptySearches > 0) {
        notes.push(`empty web_search calls observed (${metrics.emptySearches})`);
    }
    if (metrics.botCheckHeadlesses > 0) {
        notes.push(`headless bot-check pages observed (${metrics.botCheckHeadlesses})`);
    }
    if (!finalAnswer.trim()) {
        notes.push('final answer text was empty');
        return { verdict: 'fail', notes };
    }
    const looksClean = metrics.webExploreFirstWebTool &&
        metrics.lowLevelCallsAfterExplore <= cleanLowLevelLimit &&
        metrics.emptySearches === 0 &&
        metrics.botCheckHeadlesses === 0;
    return { verdict: looksClean ? 'pass' : 'mixed', notes };
}
220
/**
 * Renders the search-failure case results as a Markdown section.
 *
 * @param cases Evaluated cases (title, verdict, expected/actual code and
 *   message, notes).
 * @returns A `## Search failure cases` section; it reads `None.` when the
 *   list is empty, otherwise one `###` subsection per case.
 */
function formatSearchFailureMarkdown(cases) {
    const heading = '## Search failure cases';
    if (cases.length === 0) {
        return `${heading}\n\nNone.\n`;
    }
    const renderCase = (entry) => {
        const noteLines = entry.notes.length > 0
            ? entry.notes.map((note) => `- ${note}`).join('\n')
            : '- none';
        return [
            `### ${entry.title}`,
            '',
            `Verdict: **${entry.verdict}**`,
            '',
            `- expected code: ${entry.expectedCode}`,
            `- actual code: ${entry.actualCode}`,
            `- expected message: ${entry.expectedMessage}`,
            `- actual message: ${entry.actualMessage}`,
            '',
            'Notes:',
            noteLines,
            ''
        ].join('\n');
    };
    const body = cases.map((entry) => renderCase(entry)).join('\n');
    return `${heading}\n\n${body}`;
}
238
/**
 * Renders a whole evaluation run as a Markdown report.
 *
 * The report starts with a header (start/finish time and working directory),
 * followed by one section per prompt separated by `---` rules, and ends with
 * the search-failure section.
 *
 * @param run Run record with `startedAt`, `finishedAt`, `cwd`, `prompts` and
 *   `searchFailureCases`.
 * @returns The Markdown document text.
 */
function formatMarkdown(run) {
    const renderPrompt = (entry) => {
        const { metrics } = entry;
        const toolOrder = entry.toolCalls
            .map((call, position) => ` ${position + 1}. ${call.toolName}`)
            .join('\n');
        const noteLines = entry.notes.length > 0
            ? entry.notes.map((note) => `- ${note}`).join('\n')
            : '- none';
        return [
            `## ${entry.title}`,
            '',
            `Prompt: ${entry.prompt}`,
            '',
            `Verdict: **${entry.verdict}**`,
            '',
            'Metrics:',
            `- web_explore used: ${metrics.webExploreUsed}`,
            `- web_explore first web tool: ${metrics.webExploreFirstWebTool}`,
            `- total tool calls: ${metrics.totalToolCalls}`,
            `- total web tool calls: ${metrics.totalWebToolCalls}`,
            `- web_search calls: ${metrics.searchCalls}`,
            `- web_fetch calls: ${metrics.fetchCalls}`,
            `- web_fetch_headless calls: ${metrics.headlessCalls}`,
            `- low-level calls after web_explore: ${metrics.lowLevelCallsAfterExplore}`,
            `- guarded low-level calls after web_explore: ${metrics.guardedLowLevelCallsAfterExplore}`,
            `- empty searches: ${metrics.emptySearches}`,
            `- unsupported fetches: ${metrics.unsupportedFetches}`,
            `- bot-check headlesses: ${metrics.botCheckHeadlesses}`,
            '',
            'Tool order:',
            toolOrder || ' none',
            '',
            'Notes:',
            noteLines,
            '',
            'Final answer:',
            '',
            entry.finalAnswer.trim() || '(empty)',
            ''
        ].join('\n');
    };
    const promptSections = run.prompts.map((entry) => renderPrompt(entry)).join('\n---\n\n');
    const header = `# live web eval\n\nStarted: ${run.startedAt}\nFinished: ${run.finishedAt}\nCWD: ${run.cwd}\n\n`;
    const failureSection = formatSearchFailureMarkdown(run.searchFailureCases);
    return `${header}${promptSections}\n\n---\n\n${failureSection}`;
}
269
/**
 * Compares the error a search produced with the error a fixture expects.
 *
 * @param expectedCode Error code the fixture expects.
 * @param actualCode Error code the search tool returned.
 * @param expectedMessage Error message the fixture expects.
 * @param actualMessage Error message the search tool returned.
 * @returns `{ verdict, notes }`: `pass` with no notes when both match
 *   exactly, otherwise `fail` with one note per mismatch (code first).
 */
function evaluateSearchFailureCase(expectedCode, actualCode, expectedMessage, actualMessage) {
    const codeNote = actualCode === expectedCode
        ? []
        : [`expected code ${expectedCode} but got ${actualCode}`];
    const messageNote = actualMessage === expectedMessage
        ? []
        : [`expected message "${expectedMessage}" but got "${actualMessage}"`];
    const notes = [...codeNote, ...messageNote];
    return {
        verdict: notes.length > 0 ? 'fail' : 'pass',
        notes
    };
}
282
/**
 * Runs one prompt through a fresh in-memory agent session and grades it.
 *
 * While the prompt runs, session events are recorded:
 *  - `tool_execution_start` appends a call record;
 *  - `tool_execution_end` completes the most recent unfinished record with
 *    the same tool name;
 *  - `message_end` from the assistant updates the final answer whenever the
 *    message has non-blank text.
 * If no answer was captured from events, the last assistant message in
 * `session.messages` is used instead. The subscription and the session are
 * released even when prompting throws.
 *
 * @param promptCase Entry with `id`, `title` and `prompt`.
 * @param cwd Working directory passed to the session.
 * @param authStorage Auth storage passed to the session.
 * @param modelRegistry Model registry passed to the session.
 * @returns The prompt record: timing, final answer, tool calls, metrics,
 *   verdict and notes.
 */
async function runPrompt(promptCase, cwd, authStorage, modelRegistry) {
    const startMs = Date.now();
    const toolCalls = [];
    let finalAnswer = '';
    const { session } = await createAgentSession({
        cwd,
        authStorage,
        modelRegistry,
        sessionManager: SessionManager.inMemory()
    });
    const unsubscribe = session.subscribe((event) => {
        switch (event.type) {
            case 'tool_execution_start': {
                toolCalls.push({
                    toolName: event.toolName,
                    args: event.args,
                    startedAt: isoNow()
                });
                break;
            }
            case 'tool_execution_end': {
                // Walk backwards so the newest open call of this tool is closed.
                for (let i = toolCalls.length - 1; i >= 0; i -= 1) {
                    const call = toolCalls[i];
                    if (call.toolName === event.toolName && !call.endedAt) {
                        call.endedAt = isoNow();
                        call.isError = !!event.isError;
                        call.result = event.result;
                        break;
                    }
                }
                break;
            }
            case 'message_end': {
                if (event.message?.role !== 'assistant') {
                    break;
                }
                const text = extractText(event.message);
                if (text.trim()) {
                    finalAnswer = text;
                }
                break;
            }
            default:
                break;
        }
    });
    try {
        await session.prompt(promptCase.prompt);
        if (!finalAnswer.trim()) {
            // Fall back to the newest assistant message kept by the session.
            const history = [...session.messages];
            let lastAssistant;
            for (let i = history.length - 1; i >= 0; i -= 1) {
                if (history[i]?.role === 'assistant') {
                    lastAssistant = history[i];
                    break;
                }
            }
            finalAnswer = lastAssistant ? extractText(lastAssistant) : '';
        }
    }
    finally {
        unsubscribe();
        session.dispose();
    }
    const endMs = Date.now();
    const metrics = buildMetrics(toolCalls);
    const { verdict, notes } = evaluateVerdict(metrics, finalAnswer);
    return {
        id: promptCase.id,
        title: promptCase.title,
        prompt: promptCase.prompt,
        startedAt: new Date(startMs).toISOString(),
        finishedAt: new Date(endMs).toISOString(),
        durationMs: endMs - startMs,
        finalAnswer,
        toolCalls,
        metrics,
        verdict,
        notes
    };
}
344
/**
 * Runs one search-failure fixture and grades the reported error.
 *
 * A search tool is created with the fixture's stubbed `searchHtml` provider
 * and called with a fixed query. A missing error code is reported as
 * `'NO_ERROR'` and a missing message as `'No error message returned.'`.
 *
 * @param testCase Fixture with `id`, `title`, `expectedCode`,
 *   `expectedMessage` and `searchHtml`.
 * @returns The case record: timing, expected/actual code and message,
 *   verdict and notes.
 */
async function runSearchFailureCase(testCase) {
    const startMs = Date.now();
    const search = createWebSearchTool({ searchHtml: testCase.searchHtml });
    const outcome = await search({ query: 'deterministic test query' });
    const endMs = Date.now();
    const reported = outcome.error;
    const actualCode = reported?.code ?? 'NO_ERROR';
    const actualMessage = reported?.message ?? 'No error message returned.';
    const { verdict, notes } = evaluateSearchFailureCase(testCase.expectedCode, actualCode, testCase.expectedMessage, actualMessage);
    return {
        id: testCase.id,
        title: testCase.title,
        startedAt: new Date(startMs).toISOString(),
        finishedAt: new Date(endMs).toISOString(),
        durationMs: endMs - startMs,
        expectedCode: testCase.expectedCode,
        actualCode,
        expectedMessage: testCase.expectedMessage,
        actualMessage,
        verdict,
        notes
    };
}
366
/**
 * Script entry point.
 *
 * Runs every prompt and then every search-failure fixture sequentially,
 * writes the combined run as JSON and Markdown under
 * `<cwd>/local_docs/tmp/live-evals/` (both files share one timestamped
 * name), and prints a short per-item summary to the console.
 */
async function main() {
    const cwd = process.cwd();
    const startedAt = isoNow();
    const authStorage = AuthStorage.create();
    const modelRegistry = ModelRegistry.create(authStorage);
    // Prompts run one after another so the console output stays in order.
    const prompts = [];
    for (const promptCase of PROMPTS) {
        console.log(`Running ${promptCase.id}: ${promptCase.title}`);
        const record = await runPrompt(promptCase, cwd, authStorage, modelRegistry);
        prompts.push(record);
    }
    const searchFailureCases = [];
    for (const testCase of SEARCH_FAILURE_CASES) {
        console.log(`Running ${testCase.id}: ${testCase.title}`);
        const record = await runSearchFailureCase(testCase);
        searchFailureCases.push(record);
    }
    const run = { startedAt, finishedAt: isoNow(), cwd, prompts, searchFailureCases };
    const outputDir = path.join(cwd, 'local_docs', 'tmp', 'live-evals');
    await mkdir(outputDir, { recursive: true });
    const stamp = safeFileStamp();
    const jsonPath = path.join(outputDir, `${stamp}.json`);
    const mdPath = path.join(outputDir, `${stamp}.md`);
    await writeFile(jsonPath, `${JSON.stringify(run, null, 2)}\n`, 'utf8');
    await writeFile(mdPath, `${formatMarkdown(run)}\n`, 'utf8');
    console.log(`\nSaved JSON: ${jsonPath}`);
    console.log(`Saved Markdown: ${mdPath}`);
    for (const { id, verdict, metrics } of run.prompts) {
        console.log(`\n${id} -> ${verdict}`);
        console.log(` web_explore first web tool: ${metrics.webExploreFirstWebTool}`);
        console.log(` low-level calls after web_explore: ${metrics.lowLevelCallsAfterExplore}`);
        console.log(` guarded low-level calls after web_explore: ${metrics.guardedLowLevelCallsAfterExplore}`);
        console.log(` empty searches: ${metrics.emptySearches}`);
        console.log(` bot-check headlesses: ${metrics.botCheckHeadlesses}`);
    }
    for (const { id, verdict, expectedCode, actualCode } of run.searchFailureCases) {
        console.log(`\n${id} -> ${verdict}`);
        console.log(` expected code: ${expectedCode}`);
        console.log(` actual code: ${actualCode}`);
    }
}
await main();
@@ -0,0 +1,8 @@
1
/**
 * Derives a string key from an ordered list of primitive parts.
 *
 * NOTE(review): only the signature is visible in this declaration file; how
 * the parts are combined is defined by the implementation — confirm there.
 *
 * @param parts String, number or boolean components of the key.
 * @returns The resulting key string.
 */
export declare function createCacheKey(parts: (string | number | boolean)[]): string;
2
/**
 * Creates a string-keyed cache holding values of type `T`.
 *
 * NOTE(review): expiry behaviour is not visible in this declaration file.
 * Presumably `ttlMs` is an entry lifetime in milliseconds and `now` is an
 * optional clock override — confirm against the implementation.
 *
 * @returns An object with `get`, which yields the stored value or
 *   `undefined`, and `set`, which stores a value under a key.
 */
export declare function createTtlCache<T>(
    { ttlMs, now }: {
        ttlMs: number;
        now?: () => number;
    }
): {
    get(key: string): T | undefined;
    set(key: string, value: T): void;
};