@demigodmode/pi-web-agent 0.2.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +661 -661
- package/README.md +61 -5
- package/dist/commands/web-agent-config.d.ts +23 -0
- package/dist/commands/web-agent-config.js +249 -0
- package/dist/extension.js +30 -66
- package/dist/orchestration/answer-synthesizer.d.ts +8 -0
- package/dist/orchestration/answer-synthesizer.js +17 -0
- package/dist/orchestration/candidate-selector.d.ts +6 -0
- package/dist/orchestration/candidate-selector.js +24 -0
- package/dist/orchestration/evidence-ranker.d.ts +4 -0
- package/dist/orchestration/evidence-ranker.js +36 -0
- package/dist/orchestration/index.d.ts +6 -21
- package/dist/orchestration/query-planner.d.ts +7 -0
- package/dist/orchestration/query-planner.js +37 -0
- package/dist/orchestration/research-orchestrator.d.ts +7 -22
- package/dist/orchestration/research-orchestrator.js +185 -73
- package/dist/orchestration/research-types.d.ts +6 -0
- package/dist/orchestration/research-worker.js +8 -1
- package/dist/orchestration/stop-decider.d.ts +19 -0
- package/dist/orchestration/stop-decider.js +14 -0
- package/dist/presentation/config-store.d.ts +23 -0
- package/dist/presentation/config-store.js +64 -0
- package/dist/presentation/config.d.ts +7 -0
- package/dist/presentation/config.js +44 -0
- package/dist/presentation/explore-presentation.d.ts +3 -0
- package/dist/presentation/explore-presentation.js +56 -0
- package/dist/presentation/fetch-presentation.d.ts +5 -0
- package/dist/presentation/fetch-presentation.js +40 -0
- package/dist/presentation/search-presentation.d.ts +3 -0
- package/dist/presentation/search-presentation.js +30 -0
- package/dist/presentation/select-view.d.ts +2 -0
- package/dist/presentation/select-view.js +12 -0
- package/dist/presentation/types.d.ts +50 -0
- package/dist/presentation/types.js +1 -0
- package/dist/search/duckduckgo.d.ts +6 -1
- package/dist/search/duckduckgo.js +11 -1
- package/dist/tools/web-explore.d.ts +16 -16
- package/dist/tools/web-explore.js +21 -29
- package/dist/tools/web-fetch-headless.js +11 -2
- package/dist/tools/web-fetch.js +11 -2
- package/dist/tools/web-search.js +99 -12
- package/dist/types.d.ts +22 -0
- package/package.json +75 -75
- package/dist/scripts/live-web-eval.d.ts +0 -1
- package/dist/scripts/live-web-eval.js +0 -411
- package/dist/src/cache/ttl-cache.d.ts +0 -8
- package/dist/src/cache/ttl-cache.js +0 -21
- package/dist/src/extension.d.ts +0 -2
- package/dist/src/extension.js +0 -155
- package/dist/src/extract/readability.d.ts +0 -8
- package/dist/src/extract/readability.js +0 -93
- package/dist/src/fetch/browser-resolution.d.ts +0 -15
- package/dist/src/fetch/browser-resolution.js +0 -55
- package/dist/src/fetch/headless-fetch.d.ts +0 -18
- package/dist/src/fetch/headless-fetch.js +0 -87
- package/dist/src/fetch/http-fetch.d.ts +0 -4
- package/dist/src/fetch/http-fetch.js +0 -50
- package/dist/src/orchestration/index.d.ts +0 -41
- package/dist/src/orchestration/index.js +0 -9
- package/dist/src/orchestration/research-orchestrator.d.ts +0 -43
- package/dist/src/orchestration/research-orchestrator.js +0 -87
- package/dist/src/orchestration/research-types.d.ts +0 -41
- package/dist/src/orchestration/research-types.js +0 -1
- package/dist/src/orchestration/research-worker.d.ts +0 -16
- package/dist/src/orchestration/research-worker.js +0 -131
- package/dist/src/search/duckduckgo.d.ts +0 -9
- package/dist/src/search/duckduckgo.js +0 -52
- package/dist/src/tools/web-explore.d.ts +0 -44
- package/dist/src/tools/web-explore.js +0 -50
- package/dist/src/tools/web-fetch-headless.d.ts +0 -6
- package/dist/src/tools/web-fetch-headless.js +0 -14
- package/dist/src/tools/web-fetch.d.ts +0 -6
- package/dist/src/tools/web-fetch.js +0 -14
- package/dist/src/tools/web-search.d.ts +0 -10
- package/dist/src/tools/web-search.js +0 -103
- package/dist/src/types.d.ts +0 -48
- package/dist/src/types.js +0 -7
- package/dist/tests/cache/ttl-cache.test.d.ts +0 -1
- package/dist/tests/cache/ttl-cache.test.js +0 -19
- package/dist/tests/contracts.test.d.ts +0 -1
- package/dist/tests/contracts.test.js +0 -65
- package/dist/tests/extension.test.d.ts +0 -1
- package/dist/tests/extension.test.js +0 -123
- package/dist/tests/extract/readability.test.d.ts +0 -1
- package/dist/tests/extract/readability.test.js +0 -79
- package/dist/tests/fetch/browser-resolution.test.d.ts +0 -1
- package/dist/tests/fetch/browser-resolution.test.js +0 -37
- package/dist/tests/fetch/headless-fetch.smoke.test.d.ts +0 -1
- package/dist/tests/fetch/headless-fetch.smoke.test.js +0 -17
- package/dist/tests/fetch/headless-fetch.test.d.ts +0 -1
- package/dist/tests/fetch/headless-fetch.test.js +0 -150
- package/dist/tests/fetch/http-fetch.test.d.ts +0 -1
- package/dist/tests/fetch/http-fetch.test.js +0 -129
- package/dist/tests/orchestration/research-orchestrator.test.d.ts +0 -1
- package/dist/tests/orchestration/research-orchestrator.test.js +0 -298
- package/dist/tests/orchestration/research-worker.test.d.ts +0 -1
- package/dist/tests/orchestration/research-worker.test.js +0 -171
- package/dist/tests/orchestration/research-workflow.test.d.ts +0 -1
- package/dist/tests/orchestration/research-workflow.test.js +0 -119
- package/dist/tests/package-manifest.test.d.ts +0 -1
- package/dist/tests/package-manifest.test.js +0 -29
- package/dist/tests/release-foundation.test.d.ts +0 -1
- package/dist/tests/release-foundation.test.js +0 -16
- package/dist/tests/release-script.test.d.ts +0 -1
- package/dist/tests/release-script.test.js +0 -72
- package/dist/tests/search/duckduckgo.test.d.ts +0 -1
- package/dist/tests/search/duckduckgo.test.js +0 -103
- package/dist/tests/tools/web-explore.test.d.ts +0 -1
- package/dist/tests/tools/web-explore.test.js +0 -163
- package/dist/tests/tools/web-fetch-headless.test.d.ts +0 -1
- package/dist/tests/tools/web-fetch-headless.test.js +0 -31
- package/dist/tests/tools/web-fetch.test.d.ts +0 -1
- package/dist/tests/tools/web-fetch.test.js +0 -27
- package/dist/tests/tools/web-search.test.d.ts +0 -1
- package/dist/tests/tools/web-search.test.js +0 -125
- package/dist/vitest.config.d.ts +0 -2
- package/dist/vitest.config.js +0 -13
|
@@ -1,411 +0,0 @@
|
|
|
1
|
-
import { mkdir, writeFile } from 'node:fs/promises';
|
|
2
|
-
import path from 'node:path';
|
|
3
|
-
import process from 'node:process';
|
|
4
|
-
import { AuthStorage, createAgentSession, ModelRegistry, SessionManager } from '@mariozechner/pi-coding-agent';
|
|
5
|
-
import { createWebSearchTool } from '../src/tools/web-search.js';
|
|
6
|
-
/**
 * Live research prompts. Each entry is sent to a fresh agent session by
 * `runPrompt`; `id` and `title` are only used for logging and reporting.
 * Note that the ids are not contiguous (there is no `prompt-3`).
 */
const PROMPTS = [
    {
        id: 'prompt-1',
        title: 'Playwright installed browser guidance',
        prompt: 'Find current docs or discussions about Playwright launching an installed Chrome or Edge executable instead of a bundled browser, then summarize the recommended approach.'
    },
    {
        id: 'prompt-2',
        title: 'Vitest coverage configuration',
        prompt: 'Find the current Vitest coverage docs and tell me how to enable coverage with the V8 provider in a TypeScript project.'
    },
    {
        id: 'prompt-4',
        title: 'DuckDuckGo HTML scraping pitfalls',
        prompt: 'Find two or three current sources on DuckDuckGo HTML scraping in Node.js and tell me what the common parsing pitfalls are.'
    }
];
|
|
23
|
-
/**
 * Deterministic failure fixtures for the search tool.
 *
 * Each case supplies a `searchHtml` stub (canned HTML, or a rejection) together
 * with the error `code` and `message` the search tool is expected to report
 * for it. `runSearchFailureCase` injects the stub and compares the outcome.
 */
const SEARCH_FAILURE_CASES = [
    // A results container that explicitly says nothing was found.
    {
        id: 'no-results',
        title: 'NO_RESULTS classification',
        expectedCode: 'NO_RESULTS',
        expectedMessage: 'DuckDuckGo returned no usable results for this query.',
        searchHtml: async () => `
<html>
<body>
<div class="results">
<div class="no-results">No results found for your search.</div>
</div>
</body>
</html>
`
    },
    // A page with no results markup at all.
    {
        id: 'parse-failed',
        title: 'PARSE_FAILED classification',
        expectedCode: 'PARSE_FAILED',
        expectedMessage: 'DuckDuckGo returned a page, but it did not match the expected results format.',
        searchHtml: async () => `
<html>
<body>
<main>
<h1>Unexpected page</h1>
<p>Nothing here looks like a search results page.</p>
</main>
</body>
</html>
`
    },
    // A human-verification challenge page.
    {
        id: 'blocked-html',
        title: 'BLOCKED classification from challenge HTML',
        expectedCode: 'BLOCKED',
        expectedMessage: 'DuckDuckGo search appears to be blocked or rate limited.',
        searchHtml: async () => `
<html>
<body>
<main>
<h1>Are you a robot?</h1>
<p>Please verify you are human to continue.</p>
</main>
</body>
</html>
`
    },
    // The transport itself fails: the stub rejects instead of returning HTML.
    {
        id: 'fetch-failed',
        title: 'FETCH_FAILED classification',
        expectedCode: 'FETCH_FAILED',
        expectedMessage: 'DuckDuckGo search request failed: socket hang up',
        searchHtml: async () => {
            throw new Error('socket hang up');
        }
    }
];
|
|
81
|
-
/**
 * Current wall-clock time as produced by `Date.prototype.toISOString`.
 *
 * @returns {string} ISO-8601 timestamp for "now".
 */
function isoNow() {
    const now = new Date();
    return now.toISOString();
}
|
|
84
|
-
/**
 * Builds a timestamp that can be used as a file name: the ISO string of
 * `date` with every `:` and `.` swapped for `-`.
 *
 * @param {Date} [date] Instant to format; defaults to the current time.
 * @returns {string} e.g. `2024-01-02T03-04-05-678Z`.
 */
function safeFileStamp(date = new Date()) {
    const iso = date.toISOString();
    // Splitting on either character and re-joining replaces every occurrence.
    return iso.split(/[:.]/).join('-');
}
|
|
87
|
-
/**
 * Best-effort plain-text extraction from a loosely shaped value.
 *
 * Resolution order:
 *  1. a string is returned unchanged;
 *  2. any other non-object (or null/undefined) yields `''`;
 *  3. an array is flattened by extracting each element and joining the
 *     non-empty pieces with a newline;
 *  4. an object yields its string `text`, else its string `content`, else the
 *     joined text of its `content` array (string items, or items carrying a
 *     string `text`);
 *  5. otherwise a nested `message` object is unwrapped when it carries any of
 *     the shapes from step 4.
 *
 * @param {unknown} value Message, content part, array of parts, or raw string.
 * @returns {string} Extracted text, or `''` when nothing textual is found.
 */
function extractText(value) {
    if (typeof value === 'string')
        return value;
    if (!value || typeof value !== 'object')
        return '';
    if (Array.isArray(value)) {
        return value.map(extractText).filter(Boolean).join('\n');
    }
    const record = value;
    if (typeof record.text === 'string')
        return record.text;
    if (typeof record.content === 'string')
        return record.content;
    if (Array.isArray(record.content)) {
        return record.content
            .map((item) => {
                // Plain-string parts are accepted here just like in top-level arrays.
                if (typeof item === 'string')
                    return item;
                if (!item || typeof item !== 'object')
                    return '';
                return typeof item.text === 'string' ? item.text : '';
            })
            .filter(Boolean)
            .join('\n');
    }
    const nestedMessage = record.message;
    if (nestedMessage && typeof nestedMessage === 'object' && !Array.isArray(nestedMessage)) {
        const { text, content } = nestedMessage;
        // Only recurse when the wrapper resolves through `text`/`content` on the
        // next call; those branches return before `message` is consulted again,
        // so recursion depth is bounded to one level.
        if (typeof text === 'string' || typeof content === 'string' || Array.isArray(content)) {
            return extractText(nestedMessage);
        }
    }
    return '';
}
|
|
120
|
-
/**
 * Unwraps a tool result to its `details` payload.
 *
 * @param {unknown} result Tool result as captured from the session.
 * @returns {unknown} `result.details` when `result` is an object whose
 *   `details` is neither null nor undefined; otherwise `result` itself.
 */
function toolDetails(result) {
    const isObject = Boolean(result) && typeof result === 'object';
    return isObject ? (result.details ?? result) : result;
}
|
|
126
|
-
/**
 * True when a search tool result reports `status: 'ok'` together with an
 * empty `results` array.
 *
 * @param {unknown} result Raw tool result (unwrapped via `toolDetails`).
 * @returns {boolean}
 */
function isEmptySearchResult(result) {
    const details = toolDetails(result);
    if (details && typeof details === 'object') {
        const { status, results } = details;
        return status === 'ok' && Array.isArray(results) && results.length === 0;
    }
    return false;
}
|
|
133
|
-
/**
 * True when a fetch tool result carries `status: 'unsupported'`.
 *
 * @param {unknown} result Raw tool result (unwrapped via `toolDetails`).
 * @returns {boolean}
 */
function isUnsupportedFetchResult(result) {
    const details = toolDetails(result);
    if (!details || typeof details !== 'object') {
        return false;
    }
    return details.status === 'unsupported';
}
|
|
137
|
-
/**
 * Detects a headless fetch that landed on a bot-check / challenge page by
 * matching the page title and extracted text against a few known phrases
 * (case-insensitive).
 *
 * @param {unknown} result Raw tool result (unwrapped via `toolDetails`).
 * @returns {boolean}
 */
function isBotCheckHeadlessResult(result) {
    const details = toolDetails(result);
    if (!details || typeof details !== 'object') {
        return false;
    }
    const page = details.content;
    const bodyText = extractText(page);
    // A title is only read from a plain (non-array) content object.
    const hasTitleHolder = page && typeof page === 'object' && !Array.isArray(page);
    const pageTitle = hasTitleHolder ? String(page.title ?? '') : '';
    const haystack = `${pageTitle}\n${bodyText}`;
    return /just a moment|security verification|verify you are not a bot/i.test(haystack);
}
|
|
149
|
-
/**
 * True when a tool result carries an `error` object whose `code` is
 * `POST_WEB_EXPLORE_GUARD`.
 *
 * @param {unknown} result Raw tool result (unwrapped via `toolDetails`).
 * @returns {boolean}
 */
function isPostWebExploreGuardResult(result) {
    const details = toolDetails(result);
    const failure = details && typeof details === 'object' ? details.error : undefined;
    return Boolean(failure) && typeof failure === 'object' && failure.code === 'POST_WEB_EXPLORE_GUARD';
}
|
|
159
|
-
/**
 * Summarises the recorded tool calls of one prompt run.
 *
 * "Low-level" tools are `web_search`, `web_fetch` and `web_fetch_headless`.
 * The two `*AfterExplore` counters look at low-level calls that come after the
 * first `web_explore` call, split by whether the call's result was the
 * post-explore guard error. When `web_explore` never ran, they cover every
 * recorded call.
 *
 * @param {Array<{toolName: string, result?: unknown}>} toolCalls Calls in the
 *   order they were started.
 * @returns {object} Counter/flag object consumed by `evaluateVerdict` and the
 *   report formatters.
 */
function buildMetrics(toolCalls) {
    const webTools = new Set(['web_explore', 'web_search', 'web_fetch', 'web_fetch_headless']);
    const lowLevelTools = new Set(['web_search', 'web_fetch', 'web_fetch_headless']);
    const count = (predicate) => toolCalls.filter(predicate).length;
    const isTool = (toolName) => (call) => call.toolName === toolName;
    const webToolCalls = toolCalls.filter((call) => webTools.has(call.toolName));
    const firstWebExploreIndex = toolCalls.findIndex(isTool('web_explore'));
    // With index -1 this is slice(0), i.e. the full list, so the
    // "web_explore never ran" case needs no separate branch.
    const lowLevelAfterExplore = toolCalls
        .slice(firstWebExploreIndex + 1)
        .filter((call) => lowLevelTools.has(call.toolName));
    const guardedAfterExplore = lowLevelAfterExplore
        .filter((call) => isPostWebExploreGuardResult(call.result)).length;
    return {
        webExploreUsed: firstWebExploreIndex !== -1,
        webExploreFirstWebTool: webToolCalls[0]?.toolName === 'web_explore',
        totalToolCalls: toolCalls.length,
        totalWebToolCalls: webToolCalls.length,
        searchCalls: count(isTool('web_search')),
        fetchCalls: count(isTool('web_fetch')),
        headlessCalls: count(isTool('web_fetch_headless')),
        lowLevelCallsAfterExplore: lowLevelAfterExplore.length - guardedAfterExplore,
        guardedLowLevelCallsAfterExplore: guardedAfterExplore,
        emptySearches: count((call) => call.toolName === 'web_search' && isEmptySearchResult(call.result)),
        unsupportedFetches: count((call) => (call.toolName === 'web_fetch' || call.toolName === 'web_fetch_headless') &&
            isUnsupportedFetchResult(call.result)),
        botCheckHeadlesses: count((call) => call.toolName === 'web_fetch_headless' && isBotCheckHeadlessResult(call.result))
    };
}
|
|
189
|
-
/**
 * Grades one prompt run.
 *
 * - `fail`: `web_explore` was never used, or the final answer is blank.
 * - `pass`: `web_explore` was the first web tool, at most one unguarded
 *   low-level call followed it, and there were no empty searches or
 *   bot-check pages.
 * - `mixed`: everything else.
 *
 * Notes are advisory; the low-level-call note only appears above two calls,
 * so a run with exactly two is `mixed` without a note for it.
 *
 * @param {object} metrics Output of `buildMetrics`.
 * @param {string} finalAnswer Final assistant text for the prompt.
 * @returns {{verdict: 'pass' | 'mixed' | 'fail', notes: string[]}}
 */
function evaluateVerdict(metrics, finalAnswer) {
    if (!metrics.webExploreUsed) {
        return { verdict: 'fail', notes: ['web_explore was not used'] };
    }
    const { webExploreFirstWebTool, lowLevelCallsAfterExplore, emptySearches, botCheckHeadlesses } = metrics;
    const notes = [];
    if (!webExploreFirstWebTool) {
        notes.push('web_explore was not the first web research tool');
    }
    if (lowLevelCallsAfterExplore > 2) {
        notes.push(`too many low-level calls after web_explore (${lowLevelCallsAfterExplore})`);
    }
    if (emptySearches > 0) {
        notes.push(`empty web_search calls observed (${emptySearches})`);
    }
    if (botCheckHeadlesses > 0) {
        notes.push(`headless bot-check pages observed (${botCheckHeadlesses})`);
    }
    if (!finalAnswer.trim()) {
        notes.push('final answer text was empty');
        return { verdict: 'fail', notes };
    }
    const looksClean = webExploreFirstWebTool &&
        lowLevelCallsAfterExplore <= 1 &&
        emptySearches === 0 &&
        botCheckHeadlesses === 0;
    return { verdict: looksClean ? 'pass' : 'mixed', notes };
}
|
|
220
|
-
/**
 * Renders the "Search failure cases" section of the Markdown report.
 *
 * @param {Array<object>} cases Results from `runSearchFailureCase`.
 * @returns {string} Markdown beginning with a level-2 heading; `None.` when
 *   there are no cases.
 */
function formatSearchFailureMarkdown(cases) {
    const heading = '## Search failure cases';
    if (cases.length === 0) {
        return `${heading}\n\nNone.\n`;
    }
    const renderCase = (testCase) => {
        const noteLines = testCase.notes.length > 0
            ? testCase.notes.map((note) => `- ${note}`).join('\n')
            : '- none';
        // The trailing '' gives every rendered case a final newline.
        return [
            `### ${testCase.title}`,
            '',
            `Verdict: **${testCase.verdict}**`,
            '',
            `- expected code: ${testCase.expectedCode}`,
            `- actual code: ${testCase.actualCode}`,
            `- expected message: ${testCase.expectedMessage}`,
            `- actual message: ${testCase.actualMessage}`,
            '',
            'Notes:',
            noteLines,
            ''
        ].join('\n');
    };
    const body = cases.map(renderCase).join('\n');
    return `${heading}\n\n${body}`;
}
|
|
238
|
-
/**
 * Renders the full Markdown report for an eval run: a header with timing and
 * working directory, one section per prompt (separated by `---` rules), and
 * finally the search-failure section.
 *
 * @param {object} run Aggregated run object built in `main`.
 * @returns {string} Markdown document (without a trailing newline of its own
 *   beyond what the last section emits).
 */
function formatMarkdown(run) {
    // Report label -> key on the metrics object, in display order.
    const metricRows = [
        ['web_explore used', 'webExploreUsed'],
        ['web_explore first web tool', 'webExploreFirstWebTool'],
        ['total tool calls', 'totalToolCalls'],
        ['total web tool calls', 'totalWebToolCalls'],
        ['web_search calls', 'searchCalls'],
        ['web_fetch calls', 'fetchCalls'],
        ['web_fetch_headless calls', 'headlessCalls'],
        ['low-level calls after web_explore', 'lowLevelCallsAfterExplore'],
        ['guarded low-level calls after web_explore', 'guardedLowLevelCallsAfterExplore'],
        ['empty searches', 'emptySearches'],
        ['unsupported fetches', 'unsupportedFetches'],
        ['bot-check headlesses', 'botCheckHeadlesses']
    ];
    const renderPrompt = (prompt) => {
        const toolOrder = prompt.toolCalls
            .map((call, index) => ` ${index + 1}. ${call.toolName}`)
            .join('\n');
        const noteLines = prompt.notes.length > 0
            ? prompt.notes.map((note) => `- ${note}`).join('\n')
            : '- none';
        return [
            `## ${prompt.title}`,
            '',
            `Prompt: ${prompt.prompt}`,
            '',
            `Verdict: **${prompt.verdict}**`,
            '',
            'Metrics:',
            ...metricRows.map(([label, key]) => `- ${label}: ${prompt.metrics[key]}`),
            '',
            'Tool order:',
            toolOrder || ' none',
            '',
            'Notes:',
            noteLines,
            '',
            'Final answer:',
            '',
            prompt.finalAnswer.trim() || '(empty)',
            ''
        ].join('\n');
    };
    const promptSections = run.prompts.map(renderPrompt).join('\n---\n\n');
    return [
        '# live web eval',
        '',
        `Started: ${run.startedAt}`,
        `Finished: ${run.finishedAt}`,
        `CWD: ${run.cwd}`,
        '',
        promptSections,
        '',
        '---',
        '',
        formatSearchFailureMarkdown(run.searchFailureCases)
    ].join('\n');
}
|
|
269
|
-
/**
 * Compares the observed error code/message of a search failure fixture with
 * the expected ones (strict equality on both).
 *
 * @param {string} expectedCode
 * @param {string} actualCode
 * @param {string} expectedMessage
 * @param {string} actualMessage
 * @returns {{verdict: 'pass' | 'fail', notes: string[]}} `pass` only when both
 *   match; each mismatch contributes one note.
 */
function evaluateSearchFailureCase(expectedCode, actualCode, expectedMessage, actualMessage) {
    const codeMatches = actualCode === expectedCode;
    const messageMatches = actualMessage === expectedMessage;
    const notes = [
        ...(codeMatches ? [] : [`expected code ${expectedCode} but got ${actualCode}`]),
        ...(messageMatches ? [] : [`expected message "${expectedMessage}" but got "${actualMessage}"`])
    ];
    const verdict = codeMatches && messageMatches ? 'pass' : 'fail';
    return { verdict, notes };
}
|
|
282
|
-
/**
 * Runs one prompt in a fresh in-memory agent session and records what
 * happened.
 *
 * Tool calls are captured from `tool_execution_start` / `tool_execution_end`
 * events; an end event is attributed to the most recently started call of the
 * same tool that has not ended yet. The final answer is the last non-blank
 * assistant text seen on `message_end`, falling back to the last assistant
 * entry in `session.messages` if no such event produced text.
 *
 * @param {{id: string, title: string, prompt: string}} promptCase
 * @param {string} cwd Working directory handed to the session.
 * @param {object} authStorage Passed through to `createAgentSession`.
 * @param {object} modelRegistry Passed through to `createAgentSession`.
 * @returns {Promise<object>} Prompt record with timing, tool calls, metrics,
 *   verdict and notes.
 */
async function runPrompt(promptCase, cwd, authStorage, modelRegistry) {
    const startedAt = Date.now();
    const toolCalls = [];
    let finalAnswer = '';
    const { session } = await createAgentSession({
        cwd,
        authStorage,
        modelRegistry,
        sessionManager: SessionManager.inMemory()
    });
    // Newest call for `toolName` that has no end timestamp yet.
    const findOpenCall = (toolName) => {
        for (let i = toolCalls.length - 1; i >= 0; i -= 1) {
            const candidate = toolCalls[i];
            if (candidate.toolName === toolName && !candidate.endedAt) {
                return candidate;
            }
        }
        return undefined;
    };
    const unsubscribe = session.subscribe((event) => {
        switch (event.type) {
            case 'tool_execution_start':
                toolCalls.push({
                    toolName: event.toolName,
                    args: event.args,
                    startedAt: isoNow()
                });
                break;
            case 'tool_execution_end': {
                const open = findOpenCall(event.toolName);
                if (open) {
                    open.endedAt = isoNow();
                    open.isError = !!event.isError;
                    open.result = event.result;
                }
                break;
            }
            case 'message_end': {
                if (event.message?.role !== 'assistant') {
                    break;
                }
                const text = extractText(event.message);
                // Blank assistant messages never overwrite an earlier answer.
                if (text.trim()) {
                    finalAnswer = text;
                }
                break;
            }
            default:
                break;
        }
    });
    try {
        await session.prompt(promptCase.prompt);
        if (!finalAnswer.trim()) {
            const history = [...session.messages];
            let lastAssistant;
            for (let i = history.length - 1; i >= 0; i -= 1) {
                if (history[i]?.role === 'assistant') {
                    lastAssistant = history[i];
                    break;
                }
            }
            finalAnswer = lastAssistant ? extractText(lastAssistant) : '';
        }
    }
    finally {
        // Always detach the listener and release the session, even on failure.
        unsubscribe();
        session.dispose();
    }
    const finishedAt = Date.now();
    const metrics = buildMetrics(toolCalls);
    const { verdict, notes } = evaluateVerdict(metrics, finalAnswer);
    return {
        id: promptCase.id,
        title: promptCase.title,
        prompt: promptCase.prompt,
        startedAt: new Date(startedAt).toISOString(),
        finishedAt: new Date(finishedAt).toISOString(),
        durationMs: finishedAt - startedAt,
        finalAnswer,
        toolCalls,
        metrics,
        verdict,
        notes
    };
}
|
|
344
|
-
/**
 * Executes one deterministic failure fixture: builds a search tool around the
 * fixture's `searchHtml` stub, runs a fixed query, and compares the reported
 * error with the expectation.
 *
 * When the tool returns no `error`, the placeholders `NO_ERROR` /
 * `No error message returned.` are recorded as the actual values.
 *
 * @param {object} testCase Entry of `SEARCH_FAILURE_CASES`.
 * @returns {Promise<object>} Case record with timing, expected/actual values,
 *   verdict and notes.
 */
async function runSearchFailureCase(testCase) {
    const { id, title, expectedCode, expectedMessage, searchHtml } = testCase;
    const startedAt = Date.now();
    const search = createWebSearchTool({ searchHtml });
    const result = await search({ query: 'deterministic test query' });
    const finishedAt = Date.now();
    const failure = result.error;
    const actualCode = failure?.code ?? 'NO_ERROR';
    const actualMessage = failure?.message ?? 'No error message returned.';
    const { verdict, notes } = evaluateSearchFailureCase(expectedCode, actualCode, expectedMessage, actualMessage);
    return {
        id,
        title,
        startedAt: new Date(startedAt).toISOString(),
        finishedAt: new Date(finishedAt).toISOString(),
        durationMs: finishedAt - startedAt,
        expectedCode,
        actualCode,
        expectedMessage,
        actualMessage,
        verdict,
        notes
    };
}
|
|
366
|
-
/**
 * Script entry point: runs every live prompt and every search-failure
 * fixture one after another, writes the results as JSON and Markdown under
 * `<cwd>/local_docs/tmp/live-evals/<timestamp>.{json,md}`, and prints a short
 * summary to stdout.
 */
async function main() {
    const cwd = process.cwd();
    const startedAt = isoNow();
    const authStorage = AuthStorage.create();
    const modelRegistry = ModelRegistry.create(authStorage);
    // Prompts run strictly sequentially; each awaits the previous one.
    const prompts = [];
    for (const promptCase of PROMPTS) {
        console.log(`Running ${promptCase.id}: ${promptCase.title}`);
        const outcome = await runPrompt(promptCase, cwd, authStorage, modelRegistry);
        prompts.push(outcome);
    }
    const searchFailureCases = [];
    for (const testCase of SEARCH_FAILURE_CASES) {
        console.log(`Running ${testCase.id}: ${testCase.title}`);
        const outcome = await runSearchFailureCase(testCase);
        searchFailureCases.push(outcome);
    }
    const run = { startedAt, finishedAt: isoNow(), cwd, prompts, searchFailureCases };
    const outputDir = path.join(cwd, 'local_docs', 'tmp', 'live-evals');
    await mkdir(outputDir, { recursive: true });
    // One stamp shared by both files so the pair sorts together.
    const stamp = safeFileStamp();
    const [jsonPath, mdPath] = ['json', 'md'].map((ext) => path.join(outputDir, `${stamp}.${ext}`));
    await writeFile(jsonPath, JSON.stringify(run, null, 2) + '\n', 'utf8');
    await writeFile(mdPath, formatMarkdown(run) + '\n', 'utf8');
    console.log(`\nSaved JSON: ${jsonPath}`);
    console.log(`Saved Markdown: ${mdPath}`);
    run.prompts.forEach(({ id, verdict, metrics }) => {
        console.log(`\n${id} -> ${verdict}`);
        console.log(` web_explore first web tool: ${metrics.webExploreFirstWebTool}`);
        console.log(` low-level calls after web_explore: ${metrics.lowLevelCallsAfterExplore}`);
        console.log(` guarded low-level calls after web_explore: ${metrics.guardedLowLevelCallsAfterExplore}`);
        console.log(` empty searches: ${metrics.emptySearches}`);
        console.log(` bot-check headlesses: ${metrics.botCheckHeadlesses}`);
    });
    run.searchFailureCases.forEach(({ id, verdict, expectedCode, actualCode }) => {
        console.log(`\n${id} -> ${verdict}`);
        console.log(` expected code: ${expectedCode}`);
        console.log(` actual code: ${actualCode}`);
    });
}
await main();
|
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
/**
 * Builds a cache key from an ordered list of primitive parts.
 *
 * @param parts Values that together identify a cache entry.
 * @returns A string key for use with a cache created by `createTtlCache`.
 */
export declare function createCacheKey(parts: (string | number | boolean)[]): string;
/**
 * Creates a keyed cache whose entries carry a time-to-live.
 *
 * @param options.ttlMs Lifetime of each entry in milliseconds.
 * @param options.now Optional clock returning the current time in
 *   milliseconds; useful for injecting a fake clock in tests.
 * @returns An object with `get` (value, or `undefined` when the key is
 *   absent) and `set` (store a value under the key).
 */
export declare function createTtlCache<T>({ ttlMs, now }: {
    ttlMs: number;
    now?: () => number;
}): {
    get(key: string): T | undefined;
    set(key: string, value: T): void;
};
|
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
/**
 * Serialises the given parts with `JSON.stringify` to form a cache key.
 *
 * @param {Array<string | number | boolean>} parts Ordered key components.
 * @returns {string} JSON text of `parts`.
 */
export function createCacheKey(parts) {
    const key = JSON.stringify(parts);
    return key;
}
|
|
4
|
-
/**
 * Creates an in-memory cache whose entries expire `ttlMs` milliseconds after
 * they are stored.
 *
 * Expiry is checked on read: an entry whose deadline is at or before the
 * current time is removed and reported as missing. Nothing is removed on
 * write.
 *
 * @param {{ttlMs: number, now?: () => number}} options `now` supplies the
 *   current time in milliseconds and defaults to `Date.now`.
 * @returns {{get(key: string): unknown, set(key: string, value: unknown): void}}
 */
export function createTtlCache({ ttlMs, now = () => Date.now() }) {
    const store = new Map();
    const get = (key) => {
        const hit = store.get(key);
        if (!hit) {
            return undefined;
        }
        const expired = hit.expiresAt <= now();
        if (expired) {
            store.delete(key);
        }
        return expired ? undefined : hit.value;
    };
    const set = (key, value) => {
        const expiresAt = now() + ttlMs;
        store.set(key, { value, expiresAt });
    };
    return { get, set };
}
|
package/dist/src/extension.d.ts
DELETED
package/dist/src/extension.js
DELETED
|
@@ -1,155 +0,0 @@
|
|
|
1
|
-
import { Type } from '@sinclair/typebox';
|
|
2
|
-
import { createWebExploreTool } from './tools/web-explore.js';
|
|
3
|
-
import { createWebFetchTool } from './tools/web-fetch.js';
|
|
4
|
-
import { createWebFetchHeadlessTool } from './tools/web-fetch-headless.js';
|
|
5
|
-
import { createWebSearchTool } from './tools/web-search.js';
|
|
6
|
-
/**
 * Extension entry point: registers the four web tools (`web_search`,
 * `web_fetch`, `web_fetch_headless`, `web_explore`) on the host and appends
 * tool-selection guidance to the system prompt.
 *
 * Guard behaviour: once `web_explore` has returned `status: 'ok'`, every
 * later call to a low-level tool is answered with a `POST_WEB_EXPLORE_GUARD`
 * error instead of being executed. The flag is cleared on each
 * `before_agent_start` event.
 *
 * @param {object} pi Host API; this function uses `pi.on` and
 *   `pi.registerTool`.
 */
export default function extension(pi) {
    const webSearch = createWebSearchTool();
    const webFetch = createWebFetchTool();
    const webFetchHeadless = createWebFetchHeadlessTool();
    const webExplore = createWebExploreTool();
    let webExploreUsedInCurrentFlow = false;
    const postWebExploreGuardError = {
        code: 'POST_WEB_EXPLORE_GUARD',
        message: 'web_explore already ran for this research task. Only use low-level web tools if there is a specific unresolved gap.'
    };
    const prettyJson = (value) => JSON.stringify(value, null, 2);
    // Wraps a tool result in the response envelope returned from `execute`.
    const toToolResponse = (result, text) => ({
        content: [{ type: 'text', text }],
        details: result,
        isError: result.status === 'error'
    });
    const jsonToolResponse = (result) => toToolResponse(result, prettyJson(result));
    // Guard reply for web_search.
    const guardSearchResponse = () => jsonToolResponse({
        status: 'error',
        results: [],
        metadata: {
            backend: 'duckduckgo',
            cacheHit: false
        },
        error: postWebExploreGuardError
    });
    // Guard reply for the URL-based tools; `method` is 'http' or 'headless'.
    const guardUrlResponse = (url, method) => jsonToolResponse({
        status: 'error',
        url,
        metadata: {
            method,
            cacheHit: false
        },
        error: postWebExploreGuardError
    });
    pi.on('before_agent_start', async (event) => {
        webExploreUsedInCurrentFlow = false;
        const guidance = [
            'For web research questions that require finding and comparing multiple sources, prefer web_explore.',
            'Use web_search, web_fetch, and web_fetch_headless for direct/manual operations like explicit search calls, specific URL reads, or debugging.',
            'After using web_explore, only call low-level web tools if there is a specific unresolved gap.',
            'Do not keep searching or fetching just for extra confirmation.'
        ].join(' ');
        return {
            systemPrompt: `${event.systemPrompt}\n\n${guidance}`
        };
    });
    pi.registerTool({
        name: 'web_search',
        label: 'Web Search',
        description: 'Direct search tool for manual discovery of links and snippets. Use for explicit search requests or when the user wants raw search results. Prefer web_explore for broader research questions.',
        parameters: Type.Object({
            query: Type.String({ description: 'Search query.' })
        }),
        async execute(_toolCallId, params) {
            if (webExploreUsedInCurrentFlow) {
                return guardSearchResponse();
            }
            return jsonToolResponse(await webSearch({ query: params.query }));
        }
    });
    pi.registerTool({
        name: 'web_fetch',
        label: 'Web Fetch',
        description: 'Direct HTTP page fetch for a specific URL. Use when the user wants one page read directly. Prefer web_explore for broader research across multiple sources.',
        parameters: Type.Object({
            url: Type.String({ description: 'HTTP or HTTPS URL to fetch.' })
        }),
        async execute(_toolCallId, params) {
            if (webExploreUsedInCurrentFlow) {
                return guardUrlResponse(params.url, 'http');
            }
            return jsonToolResponse(await webFetch({ url: params.url }));
        }
    });
    pi.registerTool({
        name: 'web_fetch_headless',
        label: 'Web Fetch Headless',
        description: 'Direct headless page fetch for a specific URL when browser rendering is explicitly needed. Prefer web_explore for research tasks; it decides headless escalation internally.',
        parameters: Type.Object({
            url: Type.String({ description: 'HTTP or HTTPS URL to fetch in headless mode.' })
        }),
        async execute(_toolCallId, params) {
            if (webExploreUsedInCurrentFlow) {
                return guardUrlResponse(params.url, 'headless');
            }
            return jsonToolResponse(await webFetchHeadless({ url: params.url }));
        }
    });
    pi.registerTool({
        name: 'web_explore',
        label: 'Web Explore',
        description: 'Research a web question using bounded search/fetch passes, source ranking, and targeted headless escalation. Prefer this for multi-source web research, current docs/discussion lookups, and recommendation summaries. Use this instead of chaining low-level web tools for the same research task.',
        parameters: Type.Object({
            query: Type.String({ description: 'Web research question to explore.' })
        }),
        async execute(_toolCallId, params) {
            const result = await webExplore({ query: params.query });
            const succeeded = result.status === 'ok';
            // Only a successful exploration arms the guard on the low-level tools.
            if (succeeded) {
                webExploreUsedInCurrentFlow = true;
            }
            // Success returns the prepared text; anything else returns the raw result as JSON.
            return toToolResponse(result, succeeded ? result.text : prettyJson(result));
        }
    });
}
|
|
@@ -1,8 +0,0 @@
|
|
|
1
|
-
import type { ExtractedContent } from '../types.js';
|
|
2
|
-
/**
 * Which extraction path produced a result.
 * NOTE(review): presumably 'readability' is the primary extractor and
 * 'fallback' a simpler backup path — confirm against the implementation.
 */
export type ReadableExtractionMode = 'readability' | 'fallback';
/** Extracted content together with the mode that produced it. */
export type SafeReadableExtraction = {
    mode: ReadableExtractionMode;
    content: ExtractedContent;
};
/**
 * Extracts readable content from an HTML document.
 *
 * @param html Raw HTML source.
 * @param maxLength Optional length limit (units and default are not visible
 *   from this declaration — confirm in the implementation).
 */
export declare function extractReadableContent(html: string, maxLength?: number): ExtractedContent;
/**
 * Variant of `extractReadableContent` that also reports the extraction mode.
 * NOTE(review): the name suggests it falls back instead of throwing — confirm
 * against the implementation.
 *
 * @param html Raw HTML source.
 * @param maxLength Optional length limit, as for `extractReadableContent`.
 */
export declare function extractReadableContentSafely(html: string, maxLength?: number): SafeReadableExtraction;
|