@demigodmode/pi-web-agent 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +63 -199
- package/dist/scripts/live-web-eval.d.ts +1 -0
- package/dist/scripts/live-web-eval.js +411 -0
- package/dist/src/cache/ttl-cache.d.ts +8 -0
- package/dist/src/cache/ttl-cache.js +21 -0
- package/dist/src/extension.d.ts +2 -0
- package/dist/src/extension.js +155 -0
- package/dist/src/extract/readability.d.ts +8 -0
- package/dist/src/extract/readability.js +93 -0
- package/dist/src/fetch/browser-resolution.d.ts +15 -0
- package/dist/src/fetch/browser-resolution.js +55 -0
- package/dist/src/fetch/headless-fetch.d.ts +18 -0
- package/dist/src/fetch/headless-fetch.js +87 -0
- package/dist/src/fetch/http-fetch.d.ts +4 -0
- package/dist/src/fetch/http-fetch.js +50 -0
- package/dist/src/orchestration/index.d.ts +41 -0
- package/dist/src/orchestration/index.js +9 -0
- package/dist/src/orchestration/research-orchestrator.d.ts +43 -0
- package/dist/src/orchestration/research-orchestrator.js +87 -0
- package/dist/src/orchestration/research-types.d.ts +41 -0
- package/dist/src/orchestration/research-types.js +1 -0
- package/dist/src/orchestration/research-worker.d.ts +16 -0
- package/dist/src/orchestration/research-worker.js +131 -0
- package/dist/src/search/duckduckgo.d.ts +9 -0
- package/dist/src/search/duckduckgo.js +52 -0
- package/dist/src/tools/web-explore.d.ts +44 -0
- package/dist/src/tools/web-explore.js +50 -0
- package/dist/src/tools/web-fetch-headless.d.ts +6 -0
- package/dist/src/tools/web-fetch-headless.js +14 -0
- package/dist/src/tools/web-fetch.d.ts +6 -0
- package/dist/src/tools/web-fetch.js +14 -0
- package/dist/src/tools/web-search.d.ts +10 -0
- package/dist/src/tools/web-search.js +103 -0
- package/dist/src/types.d.ts +48 -0
- package/dist/src/types.js +7 -0
- package/dist/tests/cache/ttl-cache.test.d.ts +1 -0
- package/dist/tests/cache/ttl-cache.test.js +19 -0
- package/dist/tests/contracts.test.d.ts +1 -0
- package/dist/tests/contracts.test.js +65 -0
- package/dist/tests/extension.test.d.ts +1 -0
- package/dist/tests/extension.test.js +123 -0
- package/dist/tests/extract/readability.test.d.ts +1 -0
- package/dist/tests/extract/readability.test.js +79 -0
- package/dist/tests/fetch/browser-resolution.test.d.ts +1 -0
- package/dist/tests/fetch/browser-resolution.test.js +37 -0
- package/dist/tests/fetch/headless-fetch.smoke.test.d.ts +1 -0
- package/dist/tests/fetch/headless-fetch.smoke.test.js +17 -0
- package/dist/tests/fetch/headless-fetch.test.d.ts +1 -0
- package/dist/tests/fetch/headless-fetch.test.js +150 -0
- package/dist/tests/fetch/http-fetch.test.d.ts +1 -0
- package/dist/tests/fetch/http-fetch.test.js +129 -0
- package/dist/tests/orchestration/research-orchestrator.test.d.ts +1 -0
- package/dist/tests/orchestration/research-orchestrator.test.js +298 -0
- package/dist/tests/orchestration/research-worker.test.d.ts +1 -0
- package/dist/tests/orchestration/research-worker.test.js +171 -0
- package/dist/tests/orchestration/research-workflow.test.d.ts +1 -0
- package/dist/tests/orchestration/research-workflow.test.js +119 -0
- package/dist/tests/package-manifest.test.d.ts +1 -0
- package/dist/tests/package-manifest.test.js +29 -0
- package/dist/tests/release-foundation.test.d.ts +1 -0
- package/dist/tests/release-foundation.test.js +16 -0
- package/dist/tests/release-script.test.d.ts +1 -0
- package/dist/tests/release-script.test.js +72 -0
- package/dist/tests/search/duckduckgo.test.d.ts +1 -0
- package/dist/tests/search/duckduckgo.test.js +103 -0
- package/dist/tests/tools/web-explore.test.d.ts +1 -0
- package/dist/tests/tools/web-explore.test.js +163 -0
- package/dist/tests/tools/web-fetch-headless.test.d.ts +1 -0
- package/dist/tests/tools/web-fetch-headless.test.js +31 -0
- package/dist/tests/tools/web-fetch.test.d.ts +1 -0
- package/dist/tests/tools/web-fetch.test.js +27 -0
- package/dist/tests/tools/web-search.test.d.ts +1 -0
- package/dist/tests/tools/web-search.test.js +125 -0
- package/dist/vitest.config.d.ts +2 -0
- package/dist/vitest.config.js +13 -0
- package/package.json +5 -1
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
 * Builds a cache key by serializing `parts` with `JSON.stringify`.
 *
 * @param parts Value to serialize into the key.
 * @returns The JSON text of `parts`.
 */
export function createCacheKey(parts) {
    const serialized = JSON.stringify(parts);
    return serialized;
}
|
|
4
|
+
/**
 * Creates an in-memory cache whose entries expire `ttlMs` milliseconds after
 * they are stored.
 *
 * @param ttlMs Lifetime added to `now()` when an entry is stored.
 * @param now Clock function returning the current time; defaults to `Date.now`.
 * @returns An object exposing `get(key)` and `set(key, value)`.
 */
export function createTtlCache({ ttlMs, now = () => Date.now() }) {
    const store = new Map();

    return {
        // Returns the stored value, or undefined when missing or expired.
        // An expired entry is removed as a side effect of the lookup.
        get(key) {
            const record = store.get(key);
            if (!record) {
                return undefined;
            }
            const expired = record.expiresAt <= now();
            if (expired) {
                store.delete(key);
            }
            return expired ? undefined : record.value;
        },
        // Stores (or overwrites) the value with a fresh expiry timestamp.
        set(key, value) {
            const expiresAt = now() + ttlMs;
            store.set(key, { value, expiresAt });
        }
    };
}
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
import { Type } from '@sinclair/typebox';
|
|
2
|
+
import { createWebExploreTool } from './tools/web-explore.js';
|
|
3
|
+
import { createWebFetchTool } from './tools/web-fetch.js';
|
|
4
|
+
import { createWebFetchHeadlessTool } from './tools/web-fetch-headless.js';
|
|
5
|
+
import { createWebSearchTool } from './tools/web-search.js';
|
|
6
|
+
/**
 * Extension entry point: registers the four web tools on `pi` and appends
 * tool-selection guidance to the system prompt before each agent start.
 *
 * Once `web_explore` returns status 'ok', calls to `web_search`, `web_fetch`
 * and `web_fetch_headless` return a POST_WEB_EXPLORE_GUARD error until the
 * next `before_agent_start` event resets the flag.
 *
 * @param pi Host object; this function calls `pi.on` and `pi.registerTool`.
 */
export default function extension(pi) {
    const runSearch = createWebSearchTool();
    const runFetch = createWebFetchTool();
    const runHeadlessFetch = createWebFetchHeadlessTool();
    const runExplore = createWebExploreTool();

    // True after a successful web_explore in the current flow.
    let exploreAlreadyRan = false;

    const guardError = {
        code: 'POST_WEB_EXPLORE_GUARD',
        message: 'web_explore already ran for this research task. Only use low-level web tools if there is a specific unresolved gap.'
    };

    // Wraps a tool payload as pretty-printed JSON text plus structured details.
    const asJsonToolResult = (payload) => ({
        content: [{ type: 'text', text: JSON.stringify(payload, null, 2) }],
        details: payload,
        isError: payload.status === 'error'
    });

    // Builds the guard error result; `fields` sit between status and error so
    // the serialized key order stays status, <fields>, error.
    const blockedResult = (fields) => asJsonToolResult({
        status: 'error',
        ...fields,
        error: guardError
    });

    pi.on('before_agent_start', async (event) => {
        exploreAlreadyRan = false;
        const systemPrompt = [
            `${event.systemPrompt}\n\n`,
            'For web research questions that require finding and comparing multiple sources, prefer web_explore. ',
            'Use web_search, web_fetch, and web_fetch_headless for direct/manual operations like explicit search calls, specific URL reads, or debugging. ',
            'After using web_explore, only call low-level web tools if there is a specific unresolved gap. ',
            'Do not keep searching or fetching just for extra confirmation.'
        ].join('');
        return { systemPrompt };
    });

    pi.registerTool({
        name: 'web_search',
        label: 'Web Search',
        description: 'Direct search tool for manual discovery of links and snippets. Use for explicit search requests or when the user wants raw search results. Prefer web_explore for broader research questions.',
        parameters: Type.Object({
            query: Type.String({ description: 'Search query.' })
        }),
        async execute(_toolCallId, params) {
            if (exploreAlreadyRan) {
                return blockedResult({
                    results: [],
                    metadata: { backend: 'duckduckgo', cacheHit: false }
                });
            }
            return asJsonToolResult(await runSearch({ query: params.query }));
        }
    });

    pi.registerTool({
        name: 'web_fetch',
        label: 'Web Fetch',
        description: 'Direct HTTP page fetch for a specific URL. Use when the user wants one page read directly. Prefer web_explore for broader research across multiple sources.',
        parameters: Type.Object({
            url: Type.String({ description: 'HTTP or HTTPS URL to fetch.' })
        }),
        async execute(_toolCallId, params) {
            if (exploreAlreadyRan) {
                return blockedResult({
                    url: params.url,
                    metadata: { method: 'http', cacheHit: false }
                });
            }
            return asJsonToolResult(await runFetch({ url: params.url }));
        }
    });

    pi.registerTool({
        name: 'web_fetch_headless',
        label: 'Web Fetch Headless',
        description: 'Direct headless page fetch for a specific URL when browser rendering is explicitly needed. Prefer web_explore for research tasks; it decides headless escalation internally.',
        parameters: Type.Object({
            url: Type.String({ description: 'HTTP or HTTPS URL to fetch in headless mode.' })
        }),
        async execute(_toolCallId, params) {
            if (exploreAlreadyRan) {
                return blockedResult({
                    url: params.url,
                    metadata: { method: 'headless', cacheHit: false }
                });
            }
            return asJsonToolResult(await runHeadlessFetch({ url: params.url }));
        }
    });

    pi.registerTool({
        name: 'web_explore',
        label: 'Web Explore',
        description: 'Research a web question using bounded search/fetch passes, source ranking, and targeted headless escalation. Prefer this for multi-source web research, current docs/discussion lookups, and recommendation summaries. Use this instead of chaining low-level web tools for the same research task.',
        parameters: Type.Object({
            query: Type.String({ description: 'Web research question to explore.' })
        }),
        async execute(_toolCallId, params) {
            const outcome = await runExplore({ query: params.query });
            const succeeded = outcome.status === 'ok';
            if (succeeded) {
                // Arms the guard for the low-level tools.
                exploreAlreadyRan = true;
            }
            // Successful runs surface the prepared text; anything else is dumped as JSON.
            const text = succeeded ? outcome.text : JSON.stringify(outcome, null, 2);
            return {
                content: [{ type: 'text', text }],
                details: outcome,
                isError: outcome.status === 'error'
            };
        }
    });
}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import type { ExtractedContent } from '../types.js';
|
|
2
|
+
/** Which extraction path produced the content: the Readability parse or the fallback text extraction. */
export type ReadableExtractionMode = 'readability' | 'fallback';
|
|
3
|
+
/** Result of `extractReadableContentSafely`: the extracted content plus the path that produced it. */
export type SafeReadableExtraction = {
    /** Extraction path that produced `content`. */
    mode: ReadableExtractionMode;
    /** The extracted page content. */
    content: ExtractedContent;
};
|
|
7
|
+
/**
 * Extracts readable content from an HTML string.
 *
 * @param html HTML markup to parse.
 * @param maxLength Optional cap on the length of the returned text.
 */
export declare function extractReadableContent(html: string, maxLength?: number): ExtractedContent;
|
|
8
|
+
/**
 * Extracts readable content from an HTML string and reports which extraction
 * mode ('readability' or 'fallback') produced the result.
 *
 * @param html HTML markup to parse.
 * @param maxLength Optional cap on the length of the returned text.
 */
export declare function extractReadableContentSafely(html: string, maxLength?: number): SafeReadableExtraction;
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
import { Readability } from '@mozilla/readability';
|
|
2
|
+
import { JSDOM, VirtualConsole } from 'jsdom';
|
|
3
|
+
/**
 * Parses `html` with JSDOM and runs Readability over the resulting document.
 *
 * @param html HTML markup to parse.
 * @param maxLength Maximum number of characters kept in `text` (default 4000).
 * @returns `{ title, byline, text }`; text falls back to the body text when
 *   Readability yields no article.
 * @throws The first jsdom error whose message mentions an unparseable CSS
 *   stylesheet, so callers can switch to another extraction strategy.
 */
export function extractReadableContent(html, maxLength = 4000) {
    // Holds at most one error: the first stylesheet parse failure reported by jsdom.
    const cssFailures = [];
    const virtualConsole = new VirtualConsole();
    virtualConsole.on('jsdomError', (error) => {
        if (cssFailures.length === 0 && error.message.includes('Could not parse CSS stylesheet')) {
            cssFailures.push(error);
        }
    });

    const { document } = new JSDOM(html, {
        url: 'https://example.com',
        virtualConsole
    }).window;

    if (cssFailures.length > 0) {
        throw cssFailures[0];
    }

    const article = new Readability(document).parse();
    const untrimmed = article?.textContent ?? document.body.textContent ?? '';
    const text = untrimmed.trim().slice(0, maxLength);
    const documentTitle = document.title || undefined;

    return {
        title: article?.title ?? documentTitle,
        byline: article?.byline || undefined,
        text
    };
}
|
|
28
|
+
// Named character references handled by decodeHtmlEntities (lower-case keys).
// &nbsp; maps to a regular space, not U+00A0.
const NAMED_ENTITY_CHARS = {
    nbsp: ' ',
    amp: '&',
    lt: '<',
    gt: '>',
    quot: '"',
    apos: "'"
};

/**
 * Decodes a small set of HTML character references in `text`:
 * `&nbsp;`, `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;` (case-insensitive),
 * plus decimal (`&#39;`) and hexadecimal (`&#x2F;`) numeric references.
 *
 * Decoding happens in a single pass, so the output of one replacement is never
 * decoded again (`&amp;lt;` becomes `&lt;`, not `<`). Numeric references are
 * decoded by code point, so characters above U+FFFF are preserved; references
 * beyond U+10FFFF are left untouched.
 *
 * @param text Text that may contain character references.
 * @returns The text with the supported references replaced.
 */
function decodeHtmlEntities(text) {
    return text.replace(/&(?:#(\d+)|#x([\da-f]+)|(nbsp|amp|lt|gt|quot|apos));/gi, (match, decimal, hex, name) => {
        if (name !== undefined) {
            return NAMED_ENTITY_CHARS[name.toLowerCase()];
        }
        const codePoint = decimal !== undefined ? Number(decimal) : parseInt(hex, 16);
        // String.fromCodePoint throws a RangeError above U+10FFFF; keep such input verbatim.
        return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
    });
}
|
|
41
|
+
/**
 * Pulls the contents of the first `<title>` element out of raw HTML.
 *
 * Nested tags are replaced by spaces, whitespace is collapsed, and character
 * references are decoded.
 *
 * @param html Raw HTML markup.
 * @returns The cleaned title, or undefined when there is no title element or
 *   the cleaned title is empty.
 */
function extractTitle(html) {
    const titleMatch = /<title[^>]*>([\s\S]*?)<\/title>/i.exec(html);
    if (titleMatch === null) {
        return undefined;
    }
    const withoutTags = titleMatch[1].replace(/<[^>]+>/g, ' ');
    const collapsed = withoutTags.replace(/\s+/g, ' ').trim();
    const decoded = decodeHtmlEntities(collapsed);
    return decoded || undefined;
}
|
|
47
|
+
/**
 * Replaces every `<tagName ...>...</tagName>` element (tags and contents,
 * matched case-insensitively and non-greedily) with a single space.
 *
 * @param html Markup to clean.
 * @param tagName Element name to remove; interpolated into a regular expression.
 * @returns The markup with the matching elements removed.
 */
function stripTagContent(html, tagName) {
    const elementPattern = new RegExp('<' + tagName + '\\b[^>]*>[\\s\\S]*?<\\/' + tagName + '>', 'gi');
    return html.replace(elementPattern, ' ');
}
|
|
50
|
+
/**
 * Selects the part of the markup most likely to hold the page content.
 *
 * Tries, in order, the inner HTML of the first `<main>`, `<article>` and
 * `<body>` element; if none is present the input is returned unchanged.
 *
 * @param html Raw HTML markup.
 * @returns The inner markup of the first matching container, or `html`.
 */
function extractPreferredSection(html) {
    for (const container of ['main', 'article', 'body']) {
        const pattern = new RegExp('<' + container + '\\b[^>]*>([\\s\\S]*?)<\\/' + container + '>', 'i');
        const found = pattern.exec(html);
        if (found) {
            return found[1];
        }
    }
    return html;
}
|
|
62
|
+
/**
 * Regex-based text extraction used when the DOM-based extraction fails.
 *
 * Takes the preferred content section, drops script/style/noscript/svg/template
 * elements, removes the remaining tags, decodes character references, collapses
 * whitespace and truncates to `maxLength`.
 *
 * Tags are stripped BEFORE references are decoded. Decoding first would turn
 * escaped text such as `a &lt; b and c &gt; d` into `a < b and c > d`, and the
 * tag-stripping pattern would then delete `< b and c >` as if it were markup.
 *
 * @param html Raw HTML markup.
 * @param maxLength Maximum number of characters kept in `text`.
 * @returns `{ title, text }`; `title` is undefined when no title is found.
 */
function extractFallbackText(html, maxLength) {
    const title = extractTitle(html);

    let section = extractPreferredSection(html);
    for (const tagName of ['script', 'style', 'noscript', 'svg', 'template']) {
        section = stripTagContent(section, tagName);
    }

    const withoutTags = section.replace(/<[^>]+>/g, ' ');
    const text = decodeHtmlEntities(withoutTags)
        .replace(/\s+/g, ' ')
        .trim()
        .slice(0, maxLength);

    return { title, text };
}
|
|
80
|
+
/**
 * Non-throwing wrapper around `extractReadableContent`.
 *
 * @param html Raw HTML markup.
 * @param maxLength Maximum number of characters kept in the text (default 4000).
 * @returns `{ mode: 'readability', content }` when the primary extraction
 *   succeeds; `{ mode: 'fallback', content }` from `extractFallbackText` when it throws.
 */
export function extractReadableContentSafely(html, maxLength = 4000) {
    let content;
    try {
        content = extractReadableContent(html, maxLength);
    }
    catch {
        // Any failure of the primary path is answered with the regex-based fallback.
        return {
            mode: 'fallback',
            content: extractFallbackText(html, maxLength)
        };
    }
    return { mode: 'readability', content };
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
 * Outcome of looking for a local browser executable, discriminated by `ok`.
 */
export type BrowserResolutionResult =
    | {
        ok: true;
        /** Path of the executable that was found. */
        executablePath: string;
        /** Where the path came from: an explicitly configured path, or a Chrome/Edge candidate. */
        browser: 'configured' | 'chrome' | 'edge';
    }
    | {
        ok: false;
        error: {
            code: 'BROWSER_NOT_FOUND' | 'CONFIGURED_BROWSER_NOT_FOUND';
            message: string;
        };
    };
|
|
12
|
+
/**
 * Resolves the browser executable to use.
 *
 * @param configuredPath Optional explicit executable path.
 * @param fileExists Optional async existence check for a path.
 */
export declare function resolveBrowserExecutable(
    { configuredPath, fileExists }: {
        configuredPath?: string;
        fileExists?: (path: string) => Promise<boolean>;
    }
): Promise<BrowserResolutionResult>;
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
const WINDOWS_CANDIDATES = {
|
|
2
|
+
chrome: [
|
|
3
|
+
'C:/Program Files/Google/Chrome/Application/chrome.exe',
|
|
4
|
+
'C:/Program Files (x86)/Google/Chrome/Application/chrome.exe'
|
|
5
|
+
],
|
|
6
|
+
edge: [
|
|
7
|
+
'C:/Program Files/Microsoft/Edge/Application/msedge.exe',
|
|
8
|
+
'C:/Program Files (x86)/Microsoft/Edge/Application/msedge.exe'
|
|
9
|
+
]
|
|
10
|
+
};
|
|
11
|
+
export async function resolveBrowserExecutable({ configuredPath, fileExists = defaultFileExists }) {
|
|
12
|
+
if (configuredPath) {
|
|
13
|
+
if (await fileExists(configuredPath)) {
|
|
14
|
+
return {
|
|
15
|
+
ok: true,
|
|
16
|
+
executablePath: configuredPath,
|
|
17
|
+
browser: 'configured'
|
|
18
|
+
};
|
|
19
|
+
}
|
|
20
|
+
return {
|
|
21
|
+
ok: false,
|
|
22
|
+
error: {
|
|
23
|
+
code: 'CONFIGURED_BROWSER_NOT_FOUND',
|
|
24
|
+
message: `Configured browser path was not found: ${configuredPath}`
|
|
25
|
+
}
|
|
26
|
+
};
|
|
27
|
+
}
|
|
28
|
+
for (const path of WINDOWS_CANDIDATES.chrome) {
|
|
29
|
+
if (await fileExists(path)) {
|
|
30
|
+
return { ok: true, executablePath: path, browser: 'chrome' };
|
|
31
|
+
}
|
|
32
|
+
}
|
|
33
|
+
for (const path of WINDOWS_CANDIDATES.edge) {
|
|
34
|
+
if (await fileExists(path)) {
|
|
35
|
+
return { ok: true, executablePath: path, browser: 'edge' };
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
return {
|
|
39
|
+
ok: false,
|
|
40
|
+
error: {
|
|
41
|
+
code: 'BROWSER_NOT_FOUND',
|
|
42
|
+
message: 'No compatible local browser was found for headless fetch.'
|
|
43
|
+
}
|
|
44
|
+
};
|
|
45
|
+
}
|
|
46
|
+
async function defaultFileExists(path) {
|
|
47
|
+
try {
|
|
48
|
+
const { access } = await import('node:fs/promises');
|
|
49
|
+
await access(path);
|
|
50
|
+
return true;
|
|
51
|
+
}
|
|
52
|
+
catch {
|
|
53
|
+
return false;
|
|
54
|
+
}
|
|
55
|
+
}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import { type BrowserResolutionResult } from './browser-resolution.js';
|
|
2
|
+
import type { WebFetchHeadlessResponse } from '../types.js';
|
|
3
|
+
/**
 * Fetches `url` through a headless browser.
 *
 * @param url Page to load.
 * @param configuredPath Optional explicit browser executable path.
 * @param resolveBrowser Optional override for locating the browser executable.
 * @param launchBrowser Optional override for launching the browser; the result
 *   must provide `newContext()` and `close()`.
 * @param now Optional clock function returning a number.
 */
export declare function headlessFetch(
    url: string,
    { configuredPath, resolveBrowser, launchBrowser, now }?: {
        configuredPath?: string;
        resolveBrowser?: (options?: {
            configuredPath?: string;
        }) => Promise<BrowserResolutionResult>;
        launchBrowser?: (options: {
            executablePath: string;
        }) => Promise<{
            newContext: () => Promise<{
                newPage: () => Promise<any>;
                close: () => Promise<void>;
            }>;
            close: () => Promise<void>;
        }>;
        now?: () => number;
    }
): Promise<WebFetchHeadlessResponse>;
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
import { chromium } from 'playwright-core';
|
|
2
|
+
import { extractReadableContentSafely } from '../extract/readability.js';
|
|
3
|
+
import { resolveBrowserExecutable } from './browser-resolution.js';
|
|
4
|
+
/**
 * Tidies text extracted from a rendered page: collapses consecutive repeats of
 * "Show more" and of "Privacy Terms" into one occurrence, then normalizes all
 * whitespace runs to single spaces and trims the ends.
 *
 * @param text Extracted page text.
 * @returns The cleaned text.
 */
function cleanupRenderedText(text) {
    return text
        .replace(/(Show more)(\s+\1){1,}/gi, '$1')
        .replace(/(Privacy Terms)(\s+\1){1,}/gi, '$1')
        .replace(/\s+/g, ' ')
        .trim();
}
|
|
10
|
+
/**
 * Loads `url` in a headless browser and extracts readable content from the
 * rendered HTML.
 *
 * Result statuses:
 * - 'error' with the resolver's error when no browser executable is resolved;
 * - 'blocked' (HEADLESS_EXTRACTION_WEAK) when the cleaned text is empty or
 *   shorter than 40 characters;
 * - 'ok' with the cleaned content otherwise;
 * - 'error' (HEADLESS_NAVIGATION_FAILED) when anything inside the browser
 *   session throws.
 *
 * The page, context and browser are always closed, ignoring close failures.
 *
 * @param url Page to load.
 * @param configuredPath Optional explicit browser executable path.
 * @param resolveBrowser Locates the executable; defaults to `resolveBrowserExecutable`.
 * @param launchBrowser Launches the browser; defaults to headless `chromium.launch`.
 * @param now Clock used to measure navigation time; defaults to `Date.now`.
 */
export async function headlessFetch(url, { configuredPath, resolveBrowser = (options) => resolveBrowserExecutable({ configuredPath: options?.configuredPath }), launchBrowser = ({ executablePath }) => chromium.launch({ executablePath, headless: true }), now = () => Date.now() } = {}) {
    const resolution = await resolveBrowser({ configuredPath });
    if (!resolution.ok) {
        return {
            status: 'error',
            url,
            metadata: { method: 'headless', cacheHit: false },
            error: resolution.error
        };
    }

    const { executablePath, browser: browserKind } = resolution;
    let browser;
    let context;
    let page;

    try {
        browser = await launchBrowser({ executablePath });
        context = await browser.newContext();
        page = await context.newPage();

        const startedAt = now();
        await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 20000 });
        await page.waitForLoadState('load', { timeout: 10000 });
        // Network idle is best effort: a timeout here is not treated as a failure.
        await page.waitForLoadState('networkidle', { timeout: 5000 }).catch(() => undefined);
        const html = await page.content();
        const navigationMs = now() - startedAt;

        const { content: extracted } = extractReadableContentSafely(html);
        const cleanedContent = {
            ...extracted,
            text: cleanupRenderedText(extracted.text)
        };
        const timedMetadata = {
            method: 'headless',
            cacheHit: false,
            browser: browserKind,
            navigationMs
        };

        const tooWeak = !cleanedContent.text || cleanedContent.text.length < 40;
        if (tooWeak) {
            return {
                status: 'blocked',
                url,
                metadata: timedMetadata,
                error: {
                    code: 'HEADLESS_EXTRACTION_WEAK',
                    message: 'Rendered page did not produce enough readable content.'
                }
            };
        }

        return {
            status: 'ok',
            url,
            content: cleanedContent,
            metadata: {
                ...timedMetadata,
                truncated: cleanedContent.text.length >= 4000
            }
        };
    }
    catch (error) {
        const message = error instanceof Error ? error.message : 'Unknown headless navigation failure.';
        return {
            status: 'error',
            url,
            metadata: {
                method: 'headless',
                cacheHit: false,
                browser: browserKind
            },
            error: { code: 'HEADLESS_NAVIGATION_FAILED', message }
        };
    }
    finally {
        // Innermost handle first; handles that were never created are skipped.
        for (const handle of [page, context, browser]) {
            await handle?.close?.().catch(() => undefined);
        }
    }
}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import { extractReadableContentSafely } from '../extract/readability.js';
|
|
2
|
+
/**
 * Heuristic for client-rendered shells: true when the markup contains a
 * `<script` tag and an element with `id="app"` or `id="root"` (all compared
 * case-insensitively).
 *
 * @param html Raw HTML markup.
 */
function looksLikeScriptShell(html) {
    const haystack = html.toLowerCase();
    if (!haystack.includes('<script')) {
        return false;
    }
    return ['id="app"', 'id="root"'].some((marker) => haystack.includes(marker));
}
|
|
6
|
+
/**
 * Decides whether text extracted over plain HTTP is too thin to trust.
 *
 * The content is weak when the whitespace-normalized text is non-empty but
 * under 120 characters AND either the text makes up less than 2% of the
 * normalized HTML or the HTML carries an "enable JavaScript"-style notice.
 *
 * @param options Object with the `text` and `html` strings to inspect.
 * @returns true when the extraction looks unreliable.
 */
function isWeakHttpContent(options) {
    const squeeze = (value) => value.replace(/\s+/g, ' ').trim();
    const textLength = squeeze(options.text).length;
    const htmlLength = squeeze(options.html).length;

    const isShortButPresent = textLength > 0 && textLength < 120;
    if (!isShortButPresent) {
        return false;
    }
    if (htmlLength > 0 && textLength / htmlLength < 0.02) {
        return true;
    }
    return /enable javascript|javascript required|please turn on javascript/i.test(options.html);
}
|
|
16
|
+
/**
 * Creates the plain-HTTP page fetcher.
 *
 * The returned `httpFetch(url)` resolves to:
 * - status 'unsupported' when the content-type header does not include 'text/html';
 * - status 'needs_headless' (WEAK_EXTRACTION) when the page looks like a script
 *   shell, yields under 40 characters of text, or is judged weak by
 *   `isWeakHttpContent`;
 * - status 'ok' with the extracted content otherwise.
 *
 * @param fetchImpl Fetch implementation to use; defaults to the global `fetch`.
 */
export function createHttpFetcher({ fetchImpl = fetch } = {}) {
    async function httpFetch(url) {
        const response = await fetchImpl(url);
        const contentType = response.headers.get('content-type') ?? '';
        const metadata = { method: 'http', cacheHit: false, contentType };

        if (!contentType.includes('text/html')) {
            return { status: 'unsupported', url: response.url, metadata };
        }

        const html = await response.text();
        const { content } = extractReadableContentSafely(html);

        const unreliable = looksLikeScriptShell(html) ||
            content.text.length < 40 ||
            isWeakHttpContent({ html, title: content.title, text: content.text });
        if (unreliable) {
            return {
                status: 'needs_headless',
                url: response.url,
                metadata,
                error: {
                    code: 'WEAK_EXTRACTION',
                    message: 'HTTP extraction was not reliable enough.'
                }
            };
        }

        return {
            status: 'ok',
            url: response.url,
            content,
            metadata: { ...metadata, truncated: content.text.length >= 4000 }
        };
    }
    return httpFetch;
}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import type { WebFetchHeadlessResponse, WebFetchResponse, WebSearchResponse } from '../types.js';
|
|
2
|
+
/**
 * Creates a research workflow from optional search, page-fetch and
 * headless-fetch functions.
 *
 * The returned `run({ query })` resolves to one of three shapes, distinguished
 * by `decision.action`: 'answer', 'escalate-headless' or 'research-again'.
 * Every shape also carries the `evidence` list and the `workerPass` result.
 */
export declare function createResearchWorkflow(
    { search, fetchPage, headlessFetch }?: {
        search?: (input: {
            query: string;
        }) => Promise<WebSearchResponse>;
        fetchPage?: (input: {
            url: string;
        }) => Promise<WebFetchResponse>;
        headlessFetch?: (input: {
            url: string;
        }) => Promise<WebFetchHeadlessResponse>;
    }
): {
    run({ query }: {
        query: string;
    }): Promise<
        | {
            decision: {
                action: 'answer';
                rationale: string;
                approvedEvidence: Array<import('./research-types.js').ResearchEvidence>;
            };
            evidence: Array<import('./research-types.js').ResearchEvidence>;
            workerPass: import('./research-types.js').ResearchWorkerResult;
        }
        | {
            decision: {
                action: 'escalate-headless';
                rationale: string;
                url: string;
                approvedEvidence: Array<import('./research-types.js').ResearchEvidence>;
            };
            evidence: Array<import('./research-types.js').ResearchEvidence>;
            workerPass: import('./research-types.js').ResearchWorkerResult;
        }
        | {
            decision: {
                action: 'research-again';
                rationale: string;
                followupQuery: string;
            };
            evidence: Array<import('./research-types.js').ResearchEvidence>;
            workerPass: import('./research-types.js').ResearchWorkerResult;
        }
    >;
};
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import { createWebFetchHeadlessTool } from '../tools/web-fetch-headless.js';
|
|
2
|
+
import { createWebFetchTool } from '../tools/web-fetch.js';
|
|
3
|
+
import { createWebSearchTool } from '../tools/web-search.js';
|
|
4
|
+
import { createResearchOrchestrator } from './research-orchestrator.js';
|
|
5
|
+
import { createResearchWorker } from './research-worker.js';
|
|
6
|
+
/**
 * Wires a research worker (search + page fetch) into a research orchestrator
 * that can also use headless fetching.
 *
 * @param search Search function; defaults to a new web-search tool.
 * @param fetchPage Page-fetch function; defaults to a new web-fetch tool.
 * @param headlessFetch Headless-fetch function; defaults to a new headless web-fetch tool.
 * @returns The orchestrator created by `createResearchOrchestrator`.
 */
export function createResearchWorkflow({ search = createWebSearchTool(), fetchPage = createWebFetchTool(), headlessFetch = createWebFetchHeadlessTool() } = {}) {
    return createResearchOrchestrator({
        worker: createResearchWorker({ search, fetchPage }),
        headlessFetch
    });
}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import type { WebFetchHeadlessResponse } from '../types.js';
|
|
2
|
+
import type { ResearchEvidence, ResearchWorkerResult } from './research-types.js';
|
|
3
|
+
/**
 * Creates a research orchestrator from a worker and a headless-fetch function.
 *
 * `worker.run` receives the query together with `maxSearchRounds` and
 * `maxFetches` limits. The returned `run({ query })` resolves to one of three
 * shapes, distinguished by `decision.action`: 'answer', 'escalate-headless' or
 * 'research-again'. Every shape also carries the `evidence` list and the
 * `workerPass` result.
 */
export declare function createResearchOrchestrator(
    { worker, headlessFetch }: {
        worker: {
            run: (input: {
                query: string;
                maxSearchRounds: number;
                maxFetches: number;
            }) => Promise<ResearchWorkerResult>;
        };
        headlessFetch: (input: {
            url: string;
        }) => Promise<WebFetchHeadlessResponse>;
    }
): {
    run({ query }: {
        query: string;
    }): Promise<
        | {
            decision: {
                action: 'answer';
                rationale: string;
                approvedEvidence: Array<ResearchEvidence>;
            };
            evidence: Array<ResearchEvidence>;
            workerPass: ResearchWorkerResult;
        }
        | {
            decision: {
                action: 'escalate-headless';
                rationale: string;
                url: string;
                approvedEvidence: Array<ResearchEvidence>;
            };
            evidence: Array<ResearchEvidence>;
            workerPass: ResearchWorkerResult;
        }
        | {
            decision: {
                action: 'research-again';
                rationale: string;
                followupQuery: string;
            };
            evidence: Array<ResearchEvidence>;
            workerPass: ResearchWorkerResult;
        }
    >;
};
|