@demigodmode/pi-web-agent 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +63 -199
- package/dist/scripts/live-web-eval.d.ts +1 -0
- package/dist/scripts/live-web-eval.js +411 -0
- package/dist/src/cache/ttl-cache.d.ts +8 -0
- package/dist/src/cache/ttl-cache.js +21 -0
- package/dist/src/extension.d.ts +2 -0
- package/dist/src/extension.js +155 -0
- package/dist/src/extract/readability.d.ts +8 -0
- package/dist/src/extract/readability.js +93 -0
- package/dist/src/fetch/browser-resolution.d.ts +15 -0
- package/dist/src/fetch/browser-resolution.js +55 -0
- package/dist/src/fetch/headless-fetch.d.ts +18 -0
- package/dist/src/fetch/headless-fetch.js +87 -0
- package/dist/src/fetch/http-fetch.d.ts +4 -0
- package/dist/src/fetch/http-fetch.js +50 -0
- package/dist/src/orchestration/index.d.ts +41 -0
- package/dist/src/orchestration/index.js +9 -0
- package/dist/src/orchestration/research-orchestrator.d.ts +43 -0
- package/dist/src/orchestration/research-orchestrator.js +87 -0
- package/dist/src/orchestration/research-types.d.ts +41 -0
- package/dist/src/orchestration/research-types.js +1 -0
- package/dist/src/orchestration/research-worker.d.ts +16 -0
- package/dist/src/orchestration/research-worker.js +131 -0
- package/dist/src/search/duckduckgo.d.ts +9 -0
- package/dist/src/search/duckduckgo.js +52 -0
- package/dist/src/tools/web-explore.d.ts +44 -0
- package/dist/src/tools/web-explore.js +50 -0
- package/dist/src/tools/web-fetch-headless.d.ts +6 -0
- package/dist/src/tools/web-fetch-headless.js +14 -0
- package/dist/src/tools/web-fetch.d.ts +6 -0
- package/dist/src/tools/web-fetch.js +14 -0
- package/dist/src/tools/web-search.d.ts +10 -0
- package/dist/src/tools/web-search.js +103 -0
- package/dist/src/types.d.ts +48 -0
- package/dist/src/types.js +7 -0
- package/dist/tests/cache/ttl-cache.test.d.ts +1 -0
- package/dist/tests/cache/ttl-cache.test.js +19 -0
- package/dist/tests/contracts.test.d.ts +1 -0
- package/dist/tests/contracts.test.js +65 -0
- package/dist/tests/extension.test.d.ts +1 -0
- package/dist/tests/extension.test.js +123 -0
- package/dist/tests/extract/readability.test.d.ts +1 -0
- package/dist/tests/extract/readability.test.js +79 -0
- package/dist/tests/fetch/browser-resolution.test.d.ts +1 -0
- package/dist/tests/fetch/browser-resolution.test.js +37 -0
- package/dist/tests/fetch/headless-fetch.smoke.test.d.ts +1 -0
- package/dist/tests/fetch/headless-fetch.smoke.test.js +17 -0
- package/dist/tests/fetch/headless-fetch.test.d.ts +1 -0
- package/dist/tests/fetch/headless-fetch.test.js +150 -0
- package/dist/tests/fetch/http-fetch.test.d.ts +1 -0
- package/dist/tests/fetch/http-fetch.test.js +129 -0
- package/dist/tests/orchestration/research-orchestrator.test.d.ts +1 -0
- package/dist/tests/orchestration/research-orchestrator.test.js +298 -0
- package/dist/tests/orchestration/research-worker.test.d.ts +1 -0
- package/dist/tests/orchestration/research-worker.test.js +171 -0
- package/dist/tests/orchestration/research-workflow.test.d.ts +1 -0
- package/dist/tests/orchestration/research-workflow.test.js +119 -0
- package/dist/tests/package-manifest.test.d.ts +1 -0
- package/dist/tests/package-manifest.test.js +29 -0
- package/dist/tests/release-foundation.test.d.ts +1 -0
- package/dist/tests/release-foundation.test.js +16 -0
- package/dist/tests/release-script.test.d.ts +1 -0
- package/dist/tests/release-script.test.js +72 -0
- package/dist/tests/search/duckduckgo.test.d.ts +1 -0
- package/dist/tests/search/duckduckgo.test.js +103 -0
- package/dist/tests/tools/web-explore.test.d.ts +1 -0
- package/dist/tests/tools/web-explore.test.js +163 -0
- package/dist/tests/tools/web-fetch-headless.test.d.ts +1 -0
- package/dist/tests/tools/web-fetch-headless.test.js +31 -0
- package/dist/tests/tools/web-fetch.test.d.ts +1 -0
- package/dist/tests/tools/web-fetch.test.js +27 -0
- package/dist/tests/tools/web-search.test.d.ts +1 -0
- package/dist/tests/tools/web-search.test.js +125 -0
- package/dist/vitest.config.d.ts +2 -0
- package/dist/vitest.config.js +13 -0
- package/package.json +5 -1
package/README.md
CHANGED
|
@@ -1,199 +1,63 @@
|
|
|
1
|
-
# pi-web-agent
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
The
|
|
10
|
-
|
|
11
|
-
- `web_search`
|
|
12
|
-
- `web_fetch`
|
|
13
|
-
- `web_fetch_headless` is the explicit browser
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
##
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
```bash
|
|
65
|
-
npm view @demigodmode/pi-web-agent
|
|
66
|
-
```
|
|
67
|
-
|
|
68
|
-
## Current status
|
|
69
|
-
|
|
70
|
-
This repo is in early MVP shape, but it is no longer just a design doc.
|
|
71
|
-
|
|
72
|
-
Right now it has:
|
|
73
|
-
|
|
74
|
-
- a TypeScript project scaffold
|
|
75
|
-
- shared result and status contracts
|
|
76
|
-
- a DuckDuckGo HTML parser for `web_search`
|
|
77
|
-
- an HTTP fetch path with readability-based extraction and conservative escalation to `needs_headless`
|
|
78
|
-
- a real browser-backed `web_fetch_headless` implementation with local browser resolution
|
|
79
|
-
- repo-local Pi extension wiring for development
|
|
80
|
-
- a test suite around parser behavior, contracts, extraction, caching, and tool adapters
|
|
81
|
-
- optional smoke coverage for local installed browsers
|
|
82
|
-
|
|
83
|
-
So the project is real and usable, but still early.
|
|
84
|
-
|
|
85
|
-
## Example behavior
|
|
86
|
-
|
|
87
|
-
These are conceptual examples of the contract the package is aiming to expose.
|
|
88
|
-
|
|
89
|
-
### Search
|
|
90
|
-
|
|
91
|
-
`web_search("pi coding agent")`
|
|
92
|
-
|
|
93
|
-
Returns discovery results like:
|
|
94
|
-
|
|
95
|
-
- title
|
|
96
|
-
- URL
|
|
97
|
-
- snippet
|
|
98
|
-
|
|
99
|
-
It does not imply the page was fetched.
|
|
100
|
-
|
|
101
|
-
### HTTP fetch
|
|
102
|
-
|
|
103
|
-
`web_fetch("https://example.com/article")`
|
|
104
|
-
|
|
105
|
-
If the page is readable over plain HTTP, it should return extracted content.
|
|
106
|
-
|
|
107
|
-
If the page looks too script-heavy, too thin, blocked, or otherwise unreliable, it should return `needs_headless` instead of pretending the extraction is good enough.
|
|
108
|
-
|
|
109
|
-
### Explicit headless fetch
|
|
110
|
-
|
|
111
|
-
`web_fetch_headless("https://example.com/app")`
|
|
112
|
-
|
|
113
|
-
This is the browser-based path for pages that really need rendering.
|
|
114
|
-
|
|
115
|
-
This path now launches a local browser explicitly, waits for the rendered page to settle, and then extracts readable content from the rendered HTML.
|
|
116
|
-
|
|
117
|
-
## Local development
|
|
118
|
-
|
|
119
|
-
Install dependencies:
|
|
120
|
-
|
|
121
|
-
```bash
|
|
122
|
-
npm install
|
|
123
|
-
```
|
|
124
|
-
|
|
125
|
-
Run tests with coverage:
|
|
126
|
-
|
|
127
|
-
```bash
|
|
128
|
-
npm test
|
|
129
|
-
```
|
|
130
|
-
|
|
131
|
-
Run the typecheck used as lint:
|
|
132
|
-
|
|
133
|
-
```bash
|
|
134
|
-
npm run lint
|
|
135
|
-
```
|
|
136
|
-
|
|
137
|
-
Build the project:
|
|
138
|
-
|
|
139
|
-
```bash
|
|
140
|
-
npm run build
|
|
141
|
-
```
|
|
142
|
-
|
|
143
|
-
To run the optional real-browser smoke test for headless fetch, set `PI_HEADLESS_SMOKE=1` before running Vitest. It stays skipped by default so local browser install differences do not make the normal test suite flaky.
|
|
144
|
-
|
|
145
|
-
Coverage is now part of the normal `npm test` flow. Vitest prints a text summary in the terminal and writes the full HTML report to `coverage/`.
|
|
146
|
-
|
|
147
|
-
### Trying it in Pi locally
|
|
148
|
-
|
|
149
|
-
This repo includes a project-local Pi extension entrypoint at `.pi/extensions/pi-web-agent.ts` for development and hot reload.
|
|
150
|
-
|
|
151
|
-
For the published npm package, Pi loads the compiled runtime from `dist/extension.js` via the `pi.extensions` entry in `package.json`.
|
|
152
|
-
|
|
153
|
-
After starting Pi in this project, use `/reload` if you change the extension code and want Pi to pick up the latest version.
|
|
154
|
-
|
|
155
|
-
## Project layout
|
|
156
|
-
|
|
157
|
-
The code is split into small modules on purpose.
|
|
158
|
-
|
|
159
|
-
- `src/extension.ts` - package entry surface
|
|
160
|
-
- `src/tools/` - thin tool adapters
|
|
161
|
-
- `src/search/` - search backend logic
|
|
162
|
-
- `src/fetch/` - HTTP and headless fetch logic
|
|
163
|
-
- `src/extract/` - readable-content extraction
|
|
164
|
-
- `src/cache/` - small cache utilities
|
|
165
|
-
- `src/types.ts` - shared contracts
|
|
166
|
-
- `tests/` - parser, contract, extraction, fetch, and adapter tests
|
|
167
|
-
|
|
168
|
-
## License
|
|
169
|
-
|
|
170
|
-
AGPL-3.0-only. See `LICENSE`.
|
|
171
|
-
|
|
172
|
-
## Release process
|
|
173
|
-
|
|
174
|
-
1. Update `CHANGELOG.md` under `## Unreleased`.
|
|
175
|
-
2. Run `npm run release:dry-run` to preview the next version.
|
|
176
|
-
3. Run `npm run release` to bump version, rewrite the changelog release heading, create a release commit, and create a tag.
|
|
177
|
-
4. Push the branch and tag.
|
|
178
|
-
5. GitHub Actions publishes the tagged release to npm.
|
|
179
|
-
|
|
180
|
-
## Maintainer release notes
|
|
181
|
-
|
|
182
|
-
This repo is set up for npm Trusted Publishing from GitHub Actions.
|
|
183
|
-
|
|
184
|
-
In npm package settings, add a trusted publisher for:
|
|
185
|
-
- package: `@demigodmode/pi-web-agent`
|
|
186
|
-
- provider: GitHub Actions
|
|
187
|
-
- repository: `demigodmode/pi-web-agent`
|
|
188
|
-
|
|
189
|
-
That replaces the old `NPM_TOKEN` secret flow.
|
|
190
|
-
|
|
191
|
-
## Near-term next steps
|
|
192
|
-
|
|
193
|
-
The next chunk of work is pretty clear:
|
|
194
|
-
|
|
195
|
-
- keep tightening weak-content escalation on tricky HTTP targets
|
|
196
|
-
- improve cleanup of noisy rendered-page extraction on busy sites
|
|
197
|
-
- expand fixtures and end-to-end coverage
|
|
198
|
-
- add alternate search backends behind a first-class provider abstraction
|
|
199
|
-
|
|
1
|
+
# pi-web-agent
|
|
2
|
+
|
|
3
|
+
[CI status](https://github.com/demigodmode/pi-web-agent/actions/workflows/ci.yml)
|
|
4
|
+
[npm package](https://www.npmjs.com/package/@demigodmode/pi-web-agent)
|
|
5
|
+
[Documentation](https://demigodmode.github.io/pi-web-agent/)
|
|
6
|
+
|
|
7
|
+
`@demigodmode/pi-web-agent` is a Pi package for web access.
|
|
8
|
+
|
|
9
|
+
The whole point is keeping the boundaries straight:
|
|
10
|
+
|
|
11
|
+
- `web_search` is for discovery
|
|
12
|
+
- `web_fetch` is for plain HTTP reads
|
|
13
|
+
- `web_fetch_headless` is the explicit browser path
|
|
14
|
+
- `web_explore` is the bounded research path
|
|
15
|
+
|
|
16
|
+
That sounds obvious, but a lot of agent tooling gets fuzzy right there. This package is meant to be stricter about what it actually did and more willing to say when a read was not good enough to trust.
|
|
17
|
+
|
|
18
|
+
## Install
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pi install npm:@demigodmode/pi-web-agent
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
Later on, update installed packages with:
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
pi update
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Docs
|
|
31
|
+
|
|
32
|
+
Docs site:
|
|
33
|
+
|
|
34
|
+
- https://demigodmode.github.io/pi-web-agent/
|
|
35
|
+
|
|
36
|
+
Work on the docs locally:
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
npm run docs:dev
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
Build the docs:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
npm run docs:build
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Local development
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
npm install
|
|
52
|
+
npm test
|
|
53
|
+
npm run lint
|
|
54
|
+
npm run build
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
For local Pi work, this repo includes `.pi/extensions/pi-web-agent.ts`.
|
|
58
|
+
|
|
59
|
+
If Pi is already running, use `/reload` after changes.
|
|
60
|
+
|
|
61
|
+
## License
|
|
62
|
+
|
|
63
|
+
AGPL-3.0-only. See `LICENSE`.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,411 @@
|
|
|
1
|
+
import { mkdir, writeFile } from 'node:fs/promises';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import process from 'node:process';
|
|
4
|
+
import { AuthStorage, createAgentSession, ModelRegistry, SessionManager } from '@mariozechner/pi-coding-agent';
|
|
5
|
+
import { createWebSearchTool } from '../src/tools/web-search.js';
|
|
6
|
+
/**
 * Research prompts that the live evaluation sends to an agent session, in order.
 * Each row is `[id, title, prompt]` and is expanded into an object with those keys.
 */
const PROMPTS = [
    [
        'prompt-1',
        'Playwright installed browser guidance',
        'Find current docs or discussions about Playwright launching an installed Chrome or Edge executable instead of a bundled browser, then summarize the recommended approach.'
    ],
    [
        'prompt-2',
        'Vitest coverage configuration',
        'Find the current Vitest coverage docs and tell me how to enable coverage with the V8 provider in a TypeScript project.'
    ],
    [
        'prompt-4',
        'DuckDuckGo HTML scraping pitfalls',
        'Find two or three current sources on DuckDuckGo HTML scraping in Node.js and tell me what the common parsing pitfalls are.'
    ]
].map(([id, title, prompt]) => ({ id, title, prompt }));
|
|
23
|
+
/**
 * Deterministic failure scenarios for the search tool. Each case supplies a stubbed
 * `searchHtml` function together with the error code and message the tool is expected
 * to report for it.
 */
const SEARCH_FAILURE_CASES = (() => {
    // Builds a stub that resolves to a fixed HTML document: one markup line per argument,
    // with a leading and trailing newline around the document.
    const cannedHtml = (...lines) => {
        const html = ['', ...lines, ''].join('\n');
        return async () => html;
    };
    return [
        {
            id: 'no-results',
            title: 'NO_RESULTS classification',
            expectedCode: 'NO_RESULTS',
            expectedMessage: 'DuckDuckGo returned no usable results for this query.',
            searchHtml: cannedHtml(
                '<html>',
                '<body>',
                '<div class="results">',
                '<div class="no-results">No results found for your search.</div>',
                '</div>',
                '</body>',
                '</html>'
            )
        },
        {
            id: 'parse-failed',
            title: 'PARSE_FAILED classification',
            expectedCode: 'PARSE_FAILED',
            expectedMessage: 'DuckDuckGo returned a page, but it did not match the expected results format.',
            searchHtml: cannedHtml(
                '<html>',
                '<body>',
                '<main>',
                '<h1>Unexpected page</h1>',
                '<p>Nothing here looks like a search results page.</p>',
                '</main>',
                '</body>',
                '</html>'
            )
        },
        {
            id: 'blocked-html',
            title: 'BLOCKED classification from challenge HTML',
            expectedCode: 'BLOCKED',
            expectedMessage: 'DuckDuckGo search appears to be blocked or rate limited.',
            searchHtml: cannedHtml(
                '<html>',
                '<body>',
                '<main>',
                '<h1>Are you a robot?</h1>',
                '<p>Please verify you are human to continue.</p>',
                '</main>',
                '</body>',
                '</html>'
            )
        },
        {
            id: 'fetch-failed',
            title: 'FETCH_FAILED classification',
            expectedCode: 'FETCH_FAILED',
            expectedMessage: 'DuckDuckGo search request failed: socket hang up',
            // Simulates a transport failure: the stub rejects instead of returning HTML.
            searchHtml: async () => {
                throw new Error('socket hang up');
            }
        }
    ];
})();
|
|
81
|
+
/**
 * Returns the current instant formatted as an ISO 8601 string.
 */
function isoNow() {
    const now = new Date();
    return now.toISOString();
}
|
|
84
|
+
/**
 * Formats a date as an ISO 8601 string with every `:` and `.` replaced by `-`,
 * so the result can be used inside a file name.
 *
 * @param date Date to format; defaults to the current time.
 * @returns The sanitized timestamp, e.g. `1970-01-01T00-00-00-000Z`.
 */
function safeFileStamp(date = new Date()) {
    const iso = date.toISOString();
    return iso.split(/[:.]/).join('-');
}
|
|
87
|
+
function extractText(value) {
|
|
88
|
+
if (typeof value === 'string')
|
|
89
|
+
return value;
|
|
90
|
+
if (!value || typeof value !== 'object')
|
|
91
|
+
return '';
|
|
92
|
+
if (Array.isArray(value)) {
|
|
93
|
+
return value.map(extractText).filter(Boolean).join('\n');
|
|
94
|
+
}
|
|
95
|
+
const record = value;
|
|
96
|
+
if (typeof record.text === 'string')
|
|
97
|
+
return record.text;
|
|
98
|
+
if (typeof record.content === 'string')
|
|
99
|
+
return record.content;
|
|
100
|
+
if (Array.isArray(record.content)) {
|
|
101
|
+
return record.content
|
|
102
|
+
.map((item) => {
|
|
103
|
+
if (!item || typeof item !== 'object')
|
|
104
|
+
return '';
|
|
105
|
+
const contentItem = item;
|
|
106
|
+
return typeof contentItem.text === 'string' ? contentItem.text : '';
|
|
107
|
+
})
|
|
108
|
+
.filter(Boolean)
|
|
109
|
+
.join('\n');
|
|
110
|
+
}
|
|
111
|
+
const nestedMessage = record.message;
|
|
112
|
+
if (nestedMessage && typeof nestedMessage === 'object') {
|
|
113
|
+
const nestedRecord = nestedMessage;
|
|
114
|
+
if (Array.isArray(nestedRecord.content)) {
|
|
115
|
+
return extractText(nestedMessage);
|
|
116
|
+
}
|
|
117
|
+
}
|
|
118
|
+
return '';
|
|
119
|
+
}
|
|
120
|
+
/**
 * Unwraps a tool result: when the result is an object with a non-nullish `details`
 * property, that property is returned; otherwise the result itself is returned.
 *
 * @param result Raw tool result of any shape.
 * @returns `result.details` when present, else `result` unchanged.
 */
function toolDetails(result) {
    const isObject = Boolean(result) && typeof result === 'object';
    return isObject ? (result.details ?? result) : result;
}
|
|
126
|
+
/**
 * Tells whether a search tool result succeeded (`status === 'ok'`) but carried an
 * empty `results` array.
 *
 * @param result Raw tool result; unwrapped through `toolDetails` first.
 */
function isEmptySearchResult(result) {
    const payload = toolDetails(result);
    if (payload === null || typeof payload !== 'object') {
        return false;
    }
    if (payload.status !== 'ok') {
        return false;
    }
    return Array.isArray(payload.results) && payload.results.length === 0;
}
|
|
133
|
+
/**
 * Tells whether a fetch tool result reports `status === 'unsupported'`.
 *
 * @param result Raw tool result; unwrapped through `toolDetails` first.
 */
function isUnsupportedFetchResult(result) {
    const payload = toolDetails(result);
    if (payload === null || typeof payload !== 'object') {
        return false;
    }
    return payload.status === 'unsupported';
}
|
|
137
|
+
/**
 * Tells whether a headless fetch result looks like a bot-check interstitial.
 *
 * The result's `content` is reduced to text via `extractText`, prefixed with the
 * content's `title` (when `content` is a non-array object), and matched
 * case-insensitively against a few known challenge phrases.
 *
 * @param result Raw tool result; unwrapped through `toolDetails` first.
 */
function isBotCheckHeadlessResult(result) {
    const payload = toolDetails(result);
    if (payload === null || typeof payload !== 'object') {
        return false;
    }
    const page = payload.content;
    const body = extractText(page);
    let title = '';
    if (page && typeof page === 'object' && !Array.isArray(page)) {
        title = String(page.title ?? '');
    }
    const challengePattern = /just a moment|security verification|verify you are not a bot/i;
    return challengePattern.test(`${title}\n${body}`);
}
|
|
149
|
+
/**
 * Tells whether a tool result carries an `error` object whose `code` is
 * `'POST_WEB_EXPLORE_GUARD'`.
 *
 * @param result Raw tool result; unwrapped through `toolDetails` first.
 */
function isPostWebExploreGuardResult(result) {
    const payload = toolDetails(result);
    const failure = payload !== null && typeof payload === 'object' ? payload.error : undefined;
    return Boolean(failure) && typeof failure === 'object' && failure.code === 'POST_WEB_EXPLORE_GUARD';
}
|
|
159
|
+
/**
 * Summarizes the recorded tool calls of one prompt run into counters.
 *
 * "Low-level" tools are `web_search`, `web_fetch` and `web_fetch_headless`; together
 * with `web_explore` they form the set of web tools. Calls made after the first
 * `web_explore` are split into those whose result is a post-explore guard error and
 * those that are not. When `web_explore` was never called, every call counts as
 * "after" (the slice then starts at index 0).
 *
 * @param toolCalls Recorded calls in invocation order, each with `toolName` and `result`.
 * @returns The metrics object consumed by `evaluateVerdict` and the report formatters.
 */
function buildMetrics(toolCalls) {
    const lowLevelNames = ['web_search', 'web_fetch', 'web_fetch_headless'];
    const webNames = ['web_explore', ...lowLevelNames];
    const named = (...names) => (call) => names.includes(call.toolName);
    const tally = (calls, matches) => calls.reduce((count, call) => count + (matches(call) ? 1 : 0), 0);
    const webCalls = toolCalls.filter(named(...webNames));
    const exploreIndex = toolCalls.findIndex(named('web_explore'));
    // findIndex yields -1 when web_explore is absent, so this slice covers the whole list.
    const lowLevelAfterExplore = toolCalls.slice(exploreIndex + 1).filter(named(...lowLevelNames));
    const guardedAfterExplore = tally(lowLevelAfterExplore, (call) => isPostWebExploreGuardResult(call.result));
    return {
        webExploreUsed: exploreIndex !== -1,
        webExploreFirstWebTool: webCalls.length > 0 && webCalls[0].toolName === 'web_explore',
        totalToolCalls: toolCalls.length,
        totalWebToolCalls: webCalls.length,
        searchCalls: tally(toolCalls, named('web_search')),
        fetchCalls: tally(toolCalls, named('web_fetch')),
        headlessCalls: tally(toolCalls, named('web_fetch_headless')),
        lowLevelCallsAfterExplore: lowLevelAfterExplore.length - guardedAfterExplore,
        guardedLowLevelCallsAfterExplore: guardedAfterExplore,
        emptySearches: tally(toolCalls, (call) => call.toolName === 'web_search' && isEmptySearchResult(call.result)),
        unsupportedFetches: tally(toolCalls, (call) => named('web_fetch', 'web_fetch_headless')(call) && isUnsupportedFetchResult(call.result)),
        botCheckHeadlesses: tally(toolCalls, (call) => call.toolName === 'web_fetch_headless' && isBotCheckHeadlessResult(call.result))
    };
}
|
|
189
|
+
/**
 * Grades one prompt run from its metrics and final answer.
 *
 * - `fail`: `web_explore` was never used, or the final answer is blank.
 * - `pass`: `web_explore` was the first web tool, at most one unguarded low-level call
 *   followed it, and there were no empty searches or bot-check pages.
 * - `mixed`: everything else.
 *
 * @param metrics Counters produced by `buildMetrics`.
 * @param finalAnswer Final assistant text for the prompt.
 * @returns `{ verdict, notes }`, where `notes` lists the observations that were recorded.
 */
function evaluateVerdict(metrics, finalAnswer) {
    if (!metrics.webExploreUsed) {
        return { verdict: 'fail', notes: ['web_explore was not used'] };
    }
    const { webExploreFirstWebTool, lowLevelCallsAfterExplore, emptySearches, botCheckHeadlesses } = metrics;
    const notes = [];
    if (!webExploreFirstWebTool) {
        notes.push('web_explore was not the first web research tool');
    }
    // A note is only recorded above two calls, although a clean run allows at most one.
    if (lowLevelCallsAfterExplore > 2) {
        notes.push(`too many low-level calls after web_explore (${lowLevelCallsAfterExplore})`);
    }
    if (emptySearches > 0) {
        notes.push(`empty web_search calls observed (${emptySearches})`);
    }
    if (botCheckHeadlesses > 0) {
        notes.push(`headless bot-check pages observed (${botCheckHeadlesses})`);
    }
    if (finalAnswer.trim() === '') {
        notes.push('final answer text was empty');
        return { verdict: 'fail', notes };
    }
    const hasBlemish = !webExploreFirstWebTool ||
        !(lowLevelCallsAfterExplore <= 1) ||
        emptySearches !== 0 ||
        botCheckHeadlesses !== 0;
    return { verdict: hasBlemish ? 'mixed' : 'pass', notes };
}
|
|
220
|
+
/**
 * Renders the search failure case results as a Markdown section.
 *
 * @param cases Results produced by `runSearchFailureCase`.
 * @returns A `## Search failure cases` section with one `###` subsection per case,
 *          or a "None." placeholder when the list is empty.
 */
function formatSearchFailureMarkdown(cases) {
    const heading = '## Search failure cases\n\n';
    if (cases.length === 0) {
        return `${heading}None.\n`;
    }
    const rendered = [];
    for (const testCase of cases) {
        const bullets = testCase.notes.length > 0
            ? testCase.notes.map((note) => `- ${note}`).join('\n')
            : '- none';
        const lines = [
            `### ${testCase.title}`,
            '',
            `Verdict: **${testCase.verdict}**`,
            '',
            `- expected code: ${testCase.expectedCode}`,
            `- actual code: ${testCase.actualCode}`,
            `- expected message: ${testCase.expectedMessage}`,
            `- actual message: ${testCase.actualMessage}`,
            '',
            'Notes:',
            bullets,
            ''
        ];
        rendered.push(lines.join('\n'));
    }
    return `${heading}${rendered.join('\n')}`;
}
|
|
238
|
+
/**
 * Renders a whole evaluation run as a Markdown report: a header with timing and working
 * directory, one section per prompt (verdict, metrics, tool order, notes, final answer)
 * separated by horizontal rules, followed by the search failure section.
 *
 * @param run Object with `startedAt`, `finishedAt`, `cwd`, `prompts` and `searchFailureCases`.
 * @returns The report text.
 */
function formatMarkdown(run) {
    const bulletList = (notes) => (notes.length > 0 ? notes.map((note) => `- ${note}`).join('\n') : '- none');
    const renderPrompt = (prompt) => {
        const { metrics } = prompt;
        const toolOrder = prompt.toolCalls
            .map((call, index) => ` ${index + 1}. ${call.toolName}`)
            .join('\n');
        return [
            `## ${prompt.title}`,
            '',
            `Prompt: ${prompt.prompt}`,
            '',
            `Verdict: **${prompt.verdict}**`,
            '',
            'Metrics:',
            `- web_explore used: ${metrics.webExploreUsed}`,
            `- web_explore first web tool: ${metrics.webExploreFirstWebTool}`,
            `- total tool calls: ${metrics.totalToolCalls}`,
            `- total web tool calls: ${metrics.totalWebToolCalls}`,
            `- web_search calls: ${metrics.searchCalls}`,
            `- web_fetch calls: ${metrics.fetchCalls}`,
            `- web_fetch_headless calls: ${metrics.headlessCalls}`,
            `- low-level calls after web_explore: ${metrics.lowLevelCallsAfterExplore}`,
            `- guarded low-level calls after web_explore: ${metrics.guardedLowLevelCallsAfterExplore}`,
            `- empty searches: ${metrics.emptySearches}`,
            `- unsupported fetches: ${metrics.unsupportedFetches}`,
            `- bot-check headlesses: ${metrics.botCheckHeadlesses}`,
            '',
            'Tool order:',
            toolOrder || ' none',
            '',
            'Notes:',
            bulletList(prompt.notes),
            '',
            'Final answer:',
            '',
            prompt.finalAnswer.trim() || '(empty)',
            ''
        ].join('\n');
    };
    const header = `# live web eval\n\nStarted: ${run.startedAt}\nFinished: ${run.finishedAt}\nCWD: ${run.cwd}\n\n`;
    const promptSections = run.prompts.map(renderPrompt).join('\n---\n\n');
    const failureSection = formatSearchFailureMarkdown(run.searchFailureCases);
    return `${header}${promptSections}\n\n---\n\n${failureSection}`;
}
|
|
269
|
+
/**
 * Compares the error code and message a search failure case produced with the expected ones.
 *
 * @returns `{ verdict, notes }`: `verdict` is `'pass'` when both match exactly, otherwise
 *          `'fail'`, with one note per mismatch (code first, then message).
 */
function evaluateSearchFailureCase(expectedCode, actualCode, expectedMessage, actualMessage) {
    const checks = [
        actualCode !== expectedCode && `expected code ${expectedCode} but got ${actualCode}`,
        actualMessage !== expectedMessage && `expected message "${expectedMessage}" but got "${actualMessage}"`
    ];
    const notes = checks.filter((entry) => entry !== false);
    const verdict = notes.length === 0 ? 'pass' : 'fail';
    return { verdict, notes };
}
|
|
282
|
+
/**
 * Runs one research prompt in a fresh in-memory agent session and records what happened.
 *
 * While the prompt runs, session events are observed to build the ordered list of tool
 * calls (name, args, start/end time, error flag, result) and to capture the text of the
 * last non-empty assistant message. If no assistant text was seen through events, the
 * last assistant entry in `session.messages` is used instead.
 *
 * Pairing of start/end events: an end event is matched to its start by `toolCallId`
 * when the event carries one (NOTE(review): assumes the agent emits `toolCallId` on both
 * tool execution events — confirm against the agent's event types). Without an id it
 * falls back to the most recent unfinished call with the same tool name, which is only
 * reliable when calls to the same tool do not overlap.
 *
 * @param promptCase Entry with `id`, `title` and `prompt`.
 * @param cwd Working directory handed to the session.
 * @param authStorage Auth storage handed to the session.
 * @param modelRegistry Model registry handed to the session.
 * @returns The prompt's record: timing, final answer, tool calls, metrics, verdict and notes.
 */
async function runPrompt(promptCase, cwd, authStorage, modelRegistry) {
    const startedAt = Date.now();
    const toolCalls = [];
    // Unfinished calls keyed by toolCallId; kept out of `toolCalls` so the saved JSON shape is unchanged.
    const openCallsById = new Map();
    let finalAnswer = '';
    const { session } = await createAgentSession({
        cwd,
        authStorage,
        modelRegistry,
        sessionManager: SessionManager.inMemory()
    });
    const unsubscribe = session.subscribe((event) => {
        if (event.type === 'tool_execution_start') {
            const record = {
                toolName: event.toolName,
                args: event.args,
                startedAt: isoNow()
            };
            toolCalls.push(record);
            if (event.toolCallId != null) {
                openCallsById.set(event.toolCallId, record);
            }
        }
        if (event.type === 'tool_execution_end') {
            let active = event.toolCallId != null ? openCallsById.get(event.toolCallId) : undefined;
            if (active) {
                openCallsById.delete(event.toolCallId);
            }
            else {
                // No usable id: fall back to the newest unfinished call of the same tool.
                active = [...toolCalls].reverse().find((call) => call.toolName === event.toolName && !call.endedAt);
            }
            if (active) {
                active.endedAt = isoNow();
                active.isError = !!event.isError;
                active.result = event.result;
            }
        }
        if (event.type === 'message_end' && event.message?.role === 'assistant') {
            const text = extractText(event.message);
            // Keep the latest assistant message that actually contains text.
            if (text.trim()) {
                finalAnswer = text;
            }
        }
    });
    try {
        await session.prompt(promptCase.prompt);
        if (!finalAnswer.trim()) {
            const reversedMessages = [...session.messages].reverse();
            const lastAssistant = reversedMessages.find((message) => message?.role === 'assistant');
            finalAnswer = lastAssistant ? extractText(lastAssistant) : '';
        }
    }
    finally {
        // Dispose the session even if unsubscribing throws.
        try {
            unsubscribe();
        }
        finally {
            session.dispose();
        }
    }
    const finishedAt = Date.now();
    const metrics = buildMetrics(toolCalls);
    const evaluation = evaluateVerdict(metrics, finalAnswer);
    return {
        id: promptCase.id,
        title: promptCase.title,
        prompt: promptCase.prompt,
        startedAt: new Date(startedAt).toISOString(),
        finishedAt: new Date(finishedAt).toISOString(),
        durationMs: finishedAt - startedAt,
        finalAnswer,
        toolCalls,
        metrics,
        verdict: evaluation.verdict,
        notes: evaluation.notes
    };
}
|
|
344
|
+
/**
 * Runs one deterministic search failure case: builds a search tool around the case's
 * stubbed `searchHtml`, issues a fixed query, and compares the reported error code and
 * message with the expected ones.
 *
 * When the result has no `error`, the placeholders `'NO_ERROR'` and
 * `'No error message returned.'` are compared instead.
 *
 * @param testCase Entry from `SEARCH_FAILURE_CASES`.
 * @returns The case's record: timing, expected/actual code and message, verdict and notes.
 */
async function runSearchFailureCase(testCase) {
    const { id, title, expectedCode, expectedMessage, searchHtml } = testCase;
    const began = Date.now();
    const runSearch = createWebSearchTool({ searchHtml });
    const outcome = await runSearch({ query: 'deterministic test query' });
    const ended = Date.now();
    const failure = outcome.error;
    const actualCode = failure?.code ?? 'NO_ERROR';
    const actualMessage = failure?.message ?? 'No error message returned.';
    const { verdict, notes } = evaluateSearchFailureCase(expectedCode, actualCode, expectedMessage, actualMessage);
    return {
        id,
        title,
        startedAt: new Date(began).toISOString(),
        finishedAt: new Date(ended).toISOString(),
        durationMs: ended - began,
        expectedCode,
        actualCode,
        expectedMessage,
        actualMessage,
        verdict,
        notes
    };
}
|
|
366
|
+
/**
 * Entry point of the live evaluation script.
 *
 * Runs every prompt and then every search failure case, strictly one after another,
 * writes the combined run as JSON and Markdown under `local_docs/tmp/live-evals` in the
 * current working directory, and prints a short summary per prompt and per case.
 */
async function main() {
    const cwd = process.cwd();
    const startedAt = isoNow();
    const authStorage = AuthStorage.create();
    const modelRegistry = ModelRegistry.create(authStorage);
    // Sequential on purpose: each item is announced, then awaited before the next starts.
    const runInOrder = async (items, runOne) => {
        const outcomes = [];
        for (const item of items) {
            console.log(`Running ${item.id}: ${item.title}`);
            outcomes.push(await runOne(item));
        }
        return outcomes;
    };
    const prompts = await runInOrder(PROMPTS, (promptCase) => runPrompt(promptCase, cwd, authStorage, modelRegistry));
    const searchFailureCases = await runInOrder(SEARCH_FAILURE_CASES, (testCase) => runSearchFailureCase(testCase));
    const run = {
        startedAt,
        finishedAt: isoNow(),
        cwd,
        prompts,
        searchFailureCases
    };
    const outputDir = path.join(cwd, 'local_docs', 'tmp', 'live-evals');
    await mkdir(outputDir, { recursive: true });
    // Both report files share one timestamp-based base name.
    const stamp = safeFileStamp();
    const jsonPath = path.join(outputDir, `${stamp}.json`);
    const mdPath = path.join(outputDir, `${stamp}.md`);
    await writeFile(jsonPath, `${JSON.stringify(run, null, 2)}\n`, 'utf8');
    await writeFile(mdPath, `${formatMarkdown(run)}\n`, 'utf8');
    console.log(`\nSaved JSON: ${jsonPath}`);
    console.log(`Saved Markdown: ${mdPath}`);
    const printLines = (lines) => {
        for (const line of lines) {
            console.log(line);
        }
    };
    for (const prompt of run.prompts) {
        const { metrics } = prompt;
        printLines([
            `\n${prompt.id} -> ${prompt.verdict}`,
            ` web_explore first web tool: ${metrics.webExploreFirstWebTool}`,
            ` low-level calls after web_explore: ${metrics.lowLevelCallsAfterExplore}`,
            ` guarded low-level calls after web_explore: ${metrics.guardedLowLevelCallsAfterExplore}`,
            ` empty searches: ${metrics.emptySearches}`,
            ` bot-check headlesses: ${metrics.botCheckHeadlesses}`
        ]);
    }
    for (const testCase of run.searchFailureCases) {
        printLines([
            `\n${testCase.id} -> ${testCase.verdict}`,
            ` expected code: ${testCase.expectedCode}`,
            ` actual code: ${testCase.actualCode}`
        ]);
    }
}
await main();
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
/**
 * Produces a string key from an ordered list of primitive parts.
 * NOTE(review): the joining/encoding scheme is defined by the implementation, which is
 * not part of this declaration.
 */
export declare function createCacheKey(parts: (string | number | boolean)[]): string;
/**
 * Creates a string-keyed cache exposing `get` and `set`.
 *
 * @param ttlMs Presumably the lifetime of an entry in milliseconds — confirm against the implementation.
 * @param now Optional function returning a numeric timestamp; presumably lets callers supply their own clock.
 * @returns An object whose `get` yields the stored value or `undefined`, and whose `set` stores a value.
 */
export declare function createTtlCache<T>({ ttlMs, now }: { ttlMs: number; now?: () => number }): {
    get(key: string): T | undefined;
    set(key: string, value: T): void;
};
|