webpeel 0.7.0 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -500
- package/dist/cli-auth.d.ts +2 -0
- package/dist/cli-auth.d.ts.map +1 -1
- package/dist/cli-auth.js +16 -3
- package/dist/cli-auth.js.map +1 -1
- package/dist/cli.js +475 -77
- package/dist/cli.js.map +1 -1
- package/dist/core/actions.d.ts +19 -10
- package/dist/core/actions.d.ts.map +1 -1
- package/dist/core/actions.js +214 -43
- package/dist/core/actions.js.map +1 -1
- package/dist/core/agent.d.ts +60 -3
- package/dist/core/agent.d.ts.map +1 -1
- package/dist/core/agent.js +375 -86
- package/dist/core/agent.js.map +1 -1
- package/dist/core/answer.d.ts +43 -0
- package/dist/core/answer.d.ts.map +1 -0
- package/dist/core/answer.js +378 -0
- package/dist/core/answer.js.map +1 -0
- package/dist/core/cache.d.ts +14 -0
- package/dist/core/cache.d.ts.map +1 -0
- package/dist/core/cache.js +122 -0
- package/dist/core/cache.js.map +1 -0
- package/dist/core/dns-cache.d.ts +21 -0
- package/dist/core/dns-cache.d.ts.map +1 -0
- package/dist/core/dns-cache.js +184 -0
- package/dist/core/dns-cache.js.map +1 -0
- package/dist/core/documents.d.ts +24 -0
- package/dist/core/documents.d.ts.map +1 -0
- package/dist/core/documents.js +124 -0
- package/dist/core/documents.js.map +1 -0
- package/dist/core/extract-inline.d.ts +39 -0
- package/dist/core/extract-inline.d.ts.map +1 -0
- package/dist/core/extract-inline.js +214 -0
- package/dist/core/extract-inline.js.map +1 -0
- package/dist/core/fetcher.d.ts +33 -7
- package/dist/core/fetcher.d.ts.map +1 -1
- package/dist/core/fetcher.js +608 -41
- package/dist/core/fetcher.js.map +1 -1
- package/dist/core/jobs.d.ts +66 -0
- package/dist/core/jobs.d.ts.map +1 -0
- package/dist/core/jobs.js +513 -0
- package/dist/core/jobs.js.map +1 -0
- package/dist/core/markdown.d.ts.map +1 -1
- package/dist/core/markdown.js +141 -31
- package/dist/core/markdown.js.map +1 -1
- package/dist/core/pdf.d.ts.map +1 -1
- package/dist/core/pdf.js +3 -1
- package/dist/core/pdf.js.map +1 -1
- package/dist/core/screenshot.d.ts +33 -0
- package/dist/core/screenshot.d.ts.map +1 -0
- package/dist/core/screenshot.js +30 -0
- package/dist/core/screenshot.js.map +1 -0
- package/dist/core/search-provider.d.ts +46 -0
- package/dist/core/search-provider.d.ts.map +1 -0
- package/dist/core/search-provider.js +281 -0
- package/dist/core/search-provider.js.map +1 -0
- package/dist/core/strategies.d.ts +7 -10
- package/dist/core/strategies.d.ts.map +1 -1
- package/dist/core/strategies.js +370 -63
- package/dist/core/strategies.js.map +1 -1
- package/dist/index.d.ts +9 -3
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +61 -32
- package/dist/index.js.map +1 -1
- package/dist/mcp/server.js +335 -70
- package/dist/mcp/server.js.map +1 -1
- package/dist/types.d.ts +43 -1
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js.map +1 -1
- package/llms.txt +85 -47
- package/package.json +11 -5
package/dist/core/agent.js
CHANGED
|
@@ -1,9 +1,19 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Autonomous web research agent
|
|
3
3
|
* Searches the web, fetches pages, and extracts structured data based on natural language prompts
|
|
4
|
+
*
|
|
5
|
+
* Supports:
|
|
6
|
+
* - depth: "basic" (1 search, top 3) vs "thorough" (multi-step, up to 3 searches, top 10)
|
|
7
|
+
* - maxSources: control how many sources to include (default 5, max 20)
|
|
8
|
+
* - topic: "general" | "news" | "technical" | "academic" — adjusts queries & prioritization
|
|
9
|
+
* - outputSchema: JSON Schema for structured output with validation
|
|
10
|
+
* - streaming callbacks for SSE support
|
|
4
11
|
*/
|
|
5
12
|
import { load } from 'cheerio';
|
|
6
13
|
import { peel } from '../index.js';
|
|
14
|
+
// ---------------------------------------------------------------------------
|
|
15
|
+
// Helpers
|
|
16
|
+
// ---------------------------------------------------------------------------
|
|
7
17
|
/**
|
|
8
18
|
* Search DuckDuckGo HTML and parse results
|
|
9
19
|
*/
|
|
@@ -23,22 +33,17 @@ async function searchWeb(query, limit = 10) {
|
|
|
23
33
|
$('.result').each((_, el) => {
|
|
24
34
|
const link = $(el).find('.result__a');
|
|
25
35
|
const snippet = $(el).find('.result__snippet');
|
|
26
|
-
const
|
|
36
|
+
const rawUrl = link.attr('href');
|
|
27
37
|
const title = link.text().trim();
|
|
28
38
|
const desc = snippet.text().trim();
|
|
29
|
-
if (
|
|
30
|
-
// DuckDuckGo uses redirect URLs, extract the actual URL
|
|
39
|
+
if (rawUrl && title) {
|
|
31
40
|
try {
|
|
32
|
-
const actualUrl =
|
|
33
|
-
? `https:${
|
|
34
|
-
:
|
|
35
|
-
? decodeURIComponent(
|
|
36
|
-
:
|
|
37
|
-
results.push({
|
|
38
|
-
url: actualUrl,
|
|
39
|
-
title,
|
|
40
|
-
snippet: desc,
|
|
41
|
-
});
|
|
41
|
+
const actualUrl = rawUrl.startsWith('//')
|
|
42
|
+
? `https:${rawUrl}`
|
|
43
|
+
: rawUrl.includes('uddg=')
|
|
44
|
+
? decodeURIComponent(rawUrl.split('uddg=')[1].split('&')[0])
|
|
45
|
+
: rawUrl;
|
|
46
|
+
results.push({ url: actualUrl, title, snippet: desc });
|
|
42
47
|
}
|
|
43
48
|
catch {
|
|
44
49
|
// Skip malformed URLs
|
|
@@ -53,25 +58,73 @@ async function searchWeb(query, limit = 10) {
|
|
|
53
58
|
}
|
|
54
59
|
}
|
|
55
60
|
/**
 * Hostname patterns used by `scoreByTopic`, keyed by topic.
 *
 * Rules are tried in order and the first matching pattern decides the score.
 * Exact-domain patterns are anchored on label boundaries (`(?:^|\.)…$`) so
 * that look-alike hosts such as `notgithub.com` or `arxiv.org.evil.test` are
 * not treated as the real site. A Map is used so that arbitrary topic strings
 * can never resolve to inherited object properties.
 */
const TOPIC_HOST_RULES = new Map([
    ['academic', [
        {
            score: 10,
            pattern: /\.edu$|(?:^|\.)(?:arxiv\.org|ieee\.org|acm\.org|researchgate\.net)$|(?:^|\.)scholar\.google\.|(?:^|\.)pubmed/,
        },
        { score: 5, pattern: /\.gov$/ },
    ]],
    ['technical', [
        {
            score: 10,
            pattern: /(?:^|\.)(?:github\.com|stackoverflow\.com|devdocs\.io)$|(?:^|\.)(?:docs|developer|mdn)\./,
        },
        { score: 3, pattern: /\.dev$|\.io$/ },
    ]],
    ['news', [
        {
            score: 10,
            pattern: /(?:^|\.)(?:reuters\.com|apnews\.com|bbc\.com|cnn\.com|nytimes\.com|theguardian\.com|bloomberg\.com|techcrunch\.com|theverge\.com|arstechnica\.com)$/,
        },
        // Deliberately loose substring heuristic for smaller news/blog hosts.
        { score: 3, pattern: /news|press|blog/ },
    ]],
]);

/**
 * Prioritise a search result by topic relevance (higher = better).
 *
 * Only the hostname of `result.url` is inspected.
 *
 * @param {{ url?: string }} result - Search result; only `url` is read.
 * @param {string} topic - 'academic', 'technical' or 'news'. Any other value scores 0.
 * @returns {number} Score of the first matching rule for the topic, or 0 when
 *   nothing matches, the topic is unknown, or the URL is missing or unparseable.
 */
function scoreByTopic(result, topic) {
    const rules = TOPIC_HOST_RULES.get(topic);
    const rawUrl = result?.url;
    // A result without a usable URL can never be prioritised; do not throw.
    if (!rules || typeof rawUrl !== 'string') {
        return 0;
    }
    let domain;
    try {
        domain = new URL(rawUrl).hostname.toLowerCase();
    }
    catch {
        // Unparseable URL: treat as "no match".
        return 0;
    }
    for (const { score, pattern } of rules) {
        if (pattern.test(domain)) {
            return score;
        }
    }
    return 0;
}
|
|
94
|
+
/**
 * Add topic-specific modifiers to a search query.
 *
 * @param {string} query - Base search query.
 * @param {string} topic - 'news', 'academic' or 'technical'. Any other value
 *   returns the query unchanged.
 * @param {Date} [now] - Reference date used for the year appended to news
 *   queries. Defaults to the current date so the modifier never goes stale.
 * @returns {string} The query with the topic modifier appended.
 */
function enhanceQueryForTopic(query, topic, now = new Date()) {
    switch (topic) {
        case 'news':
            // Use the current year rather than a hard-coded one.
            return `${query} latest news ${now.getFullYear()}`;
        case 'academic':
            return `${query} research paper study`;
        case 'technical':
            return `${query} documentation tutorial`;
        default:
            return query;
    }
}
|
|
109
|
+
/**
|
|
110
|
+
* Call OpenAI-compatible LLM API (non-streaming)
|
|
57
111
|
*/
|
|
58
112
|
async function callLLM(messages, options) {
|
|
59
|
-
const { apiKey, model = 'gpt-4o-mini', baseUrl = 'https://api.openai.com/v1',
|
|
113
|
+
const { apiKey, model = 'gpt-4o-mini', baseUrl = 'https://api.openai.com/v1', jsonMode } = options;
|
|
60
114
|
const { fetch: undiciFetch } = await import('undici');
|
|
61
115
|
const body = {
|
|
62
116
|
model,
|
|
63
117
|
messages,
|
|
64
118
|
temperature: 0,
|
|
65
119
|
};
|
|
66
|
-
|
|
67
|
-
if (schema) {
|
|
120
|
+
if (jsonMode) {
|
|
68
121
|
body.response_format = { type: 'json_object' };
|
|
69
122
|
}
|
|
70
123
|
const response = await undiciFetch(`${baseUrl}/chat/completions`, {
|
|
71
124
|
method: 'POST',
|
|
72
125
|
headers: {
|
|
73
126
|
'Content-Type': 'application/json',
|
|
74
|
-
|
|
127
|
+
Authorization: `Bearer ${apiKey}`,
|
|
75
128
|
},
|
|
76
129
|
body: JSON.stringify(body),
|
|
77
130
|
});
|
|
@@ -79,12 +132,116 @@ async function callLLM(messages, options) {
|
|
|
79
132
|
const errorText = await response.text();
|
|
80
133
|
throw new Error(`LLM API error ${response.status}: ${errorText}`);
|
|
81
134
|
}
|
|
82
|
-
const result = await response.json();
|
|
135
|
+
const result = (await response.json());
|
|
83
136
|
const content = result.choices?.[0]?.message?.content;
|
|
84
137
|
if (!content) {
|
|
85
138
|
throw new Error('LLM returned empty response');
|
|
86
139
|
}
|
|
87
|
-
|
|
140
|
+
const usage = result.usage
|
|
141
|
+
? { input: result.usage.prompt_tokens ?? 0, output: result.usage.completion_tokens ?? 0 }
|
|
142
|
+
: { input: 0, output: 0 };
|
|
143
|
+
return { content, usage };
|
|
144
|
+
}
|
|
145
|
+
/**
 * Call an OpenAI-compatible chat-completions endpoint with streaming enabled.
 *
 * Invokes `onChunk` for each text delta and resolves with the concatenated
 * content once the stream ends. When `onChunk` is not supplied the call is
 * delegated to the non-streaming `callLLM`.
 *
 * @param {Array<object>} messages - Chat messages sent as the `messages` field.
 * @param {object} options - `apiKey`, optional `model` and `baseUrl`, and
 *   `jsonMode` (when truthy, requests `response_format: json_object`).
 * @param {(text: string) => void} [onChunk] - Receives each content delta.
 * @returns {Promise<{content: string, usage: {input: number, output: number}}>}
 *   Full content plus token usage (zeros when the server reports none).
 * @throws {Error} When the HTTP response status is not OK.
 */
async function callLLMStreaming(messages, options, onChunk) {
    if (!onChunk)
        return callLLM(messages, options);
    const { apiKey, model = 'gpt-4o-mini', baseUrl = 'https://api.openai.com/v1', jsonMode } = options;
    const { fetch: undiciFetch } = await import('undici');
    const body = {
        model,
        messages,
        temperature: 0,
        stream: true,
        stream_options: { include_usage: true },
    };
    if (jsonMode) {
        body.response_format = { type: 'json_object' };
    }
    const response = await undiciFetch(`${baseUrl}/chat/completions`, {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json',
            Authorization: `Bearer ${apiKey}`,
        },
        body: JSON.stringify(body),
    });
    if (!response.ok) {
        const errorText = await response.text();
        throw new Error(`LLM API error ${response.status}: ${errorText}`);
    }
    let fullContent = '';
    let usage = { input: 0, output: 0 };
    // Process one line of the SSE stream. Only `data:` fields are relevant;
    // the space after the colon is optional in the SSE format.
    const handleLine = (line) => {
        const trimmed = line.trim();
        if (!trimmed.startsWith('data:'))
            return;
        const data = trimmed.slice(5).trim();
        if (!data || data === '[DONE]')
            return;
        try {
            const parsed = JSON.parse(data);
            const delta = parsed.choices?.[0]?.delta?.content;
            if (delta) {
                fullContent += delta;
                onChunk(delta);
            }
            // Final chunk may include usage
            if (parsed.usage) {
                usage = {
                    input: parsed.usage.prompt_tokens ?? 0,
                    output: parsed.usage.completion_tokens ?? 0,
                };
            }
        }
        catch {
            // Best-effort by design: skip frames that cannot be processed
            // instead of aborting the whole stream.
        }
    };
    const reader = response.body?.getReader?.();
    if (!reader) {
        // Fallback: consume the entire body at once, but still interpret it
        // as SSE so callers receive content rather than raw `data:` frames.
        const text = await response.text();
        for (const line of text.split('\n')) {
            handleLine(line);
        }
        // If nothing could be extracted, return the raw text as before.
        return { content: fullContent || text, usage };
    }
    const decoder = new TextDecoder();
    let buffer = '';
    while (true) {
        const { done, value } = await reader.read();
        if (done)
            break;
        buffer += decoder.decode(value, { stream: true });
        const lines = buffer.split('\n');
        // The last element is an incomplete line; keep it for the next read.
        buffer = lines.pop() ?? '';
        for (const line of lines) {
            handleLine(line);
        }
    }
    // Flush bytes still held by the decoder and process a final line that was
    // not terminated by a newline; otherwise the last frame would be lost.
    buffer += decoder.decode();
    if (buffer) {
        handleLine(buffer);
    }
    return { content: fullContent, usage };
}
|
|
223
|
+
/**
 * Validate JSON data against a JSON Schema (best-effort, no extra deps).
 *
 * Only the top level is checked:
 * - `type: "object"`: data must be a non-null, non-array object, and every
 *   name in `required` must be an own property of it. This applies whether or
 *   not the schema declares `properties`.
 * - `type: "array"`: data must be an array.
 * Any other schema is accepted without checks.
 *
 * @param {*} data - Parsed JSON value to validate.
 * @param {object} schema - JSON Schema; only `type` and `required` are read.
 * @returns {{valid: true} | {valid: false, errors: string}} Validation result.
 */
function validateJsonSchema(data, schema) {
    // Nothing to validate against.
    if (typeof schema !== 'object' || schema === null) {
        return { valid: true };
    }
    if (schema.type === 'object') {
        if (typeof data !== 'object' || data === null || Array.isArray(data)) {
            return { valid: false, errors: 'Expected an object' };
        }
        if (Array.isArray(schema.required)) {
            // Own-property check: inherited names such as "toString" must not
            // count as present.
            const missing = schema.required.filter((k) => !Object.prototype.hasOwnProperty.call(data, k));
            if (missing.length > 0) {
                return { valid: false, errors: `Missing required fields: ${missing.join(', ')}` };
            }
        }
    }
    else if (schema.type === 'array') {
        if (!Array.isArray(data)) {
            return { valid: false, errors: 'Expected an array' };
        }
    }
    return { valid: true };
}
|
|
89
246
|
/**
|
|
90
247
|
* Truncate content to approximately N tokens (rough estimate: 1 token ≈ 4 chars)
|
|
@@ -95,114 +252,199 @@ function truncateContent(content, maxTokens = 3000) {
|
|
|
95
252
|
return content;
|
|
96
253
|
return content.slice(0, maxChars) + '\n\n[Content truncated...]';
|
|
97
254
|
}
|
|
255
|
+
// ---------------------------------------------------------------------------
|
|
256
|
+
// Main agent
|
|
257
|
+
// ---------------------------------------------------------------------------
|
|
98
258
|
/**
|
|
99
259
|
* Run autonomous web research agent
|
|
100
260
|
*/
|
|
101
261
|
export async function runAgent(options) {
|
|
102
|
-
const { prompt, urls: startUrls = [], schema, llmApiKey, llmApiBase = 'https://api.openai.com/v1', llmModel = 'gpt-4o-mini', maxPages =
|
|
103
|
-
if (!llmApiKey)
|
|
262
|
+
const { prompt, urls: startUrls = [], schema: legacySchema, outputSchema, llmApiKey, llmApiBase = 'https://api.openai.com/v1', llmModel = 'gpt-4o-mini', maxPages, maxSources: rawMaxSources, depth = 'basic', topic = 'general', maxCredits, onProgress, onEvent, } = options;
|
|
263
|
+
if (!llmApiKey)
|
|
104
264
|
throw new Error('llmApiKey is required');
|
|
105
|
-
|
|
106
|
-
if (!prompt) {
|
|
265
|
+
if (!prompt)
|
|
107
266
|
throw new Error('prompt is required');
|
|
108
|
-
|
|
109
|
-
const
|
|
267
|
+
// Effective schema = outputSchema || legacy schema
|
|
268
|
+
const effectiveSchema = outputSchema || legacySchema;
|
|
269
|
+
// Determine effective maxSources:
|
|
270
|
+
// new param > legacy maxPages > depth-based default
|
|
271
|
+
const depthDefaults = depth === 'thorough'
|
|
272
|
+
? { maxSources: 10, maxQueries: 3, resultsPerQuery: 10 }
|
|
273
|
+
: { maxSources: 3, maxQueries: 1, resultsPerQuery: 5 };
|
|
274
|
+
const maxSourcesLimit = Math.min(rawMaxSources ?? maxPages ?? depthDefaults.maxSources, 20);
|
|
275
|
+
const maxQueries = depth === 'thorough' ? depthDefaults.maxQueries : depthDefaults.maxQueries;
|
|
110
276
|
const visitedUrls = new Set();
|
|
111
277
|
const sources = [];
|
|
278
|
+
const sourcesDetailed = [];
|
|
112
279
|
let pagesVisited = 0;
|
|
113
280
|
let creditsUsed = 0;
|
|
114
|
-
|
|
281
|
+
let totalUsage = { input: 0, output: 0 };
|
|
115
282
|
const collectedData = [];
|
|
283
|
+
// Emit both legacy progress and new event
|
|
116
284
|
const reportProgress = (status, message, currentUrl) => {
|
|
117
285
|
if (onProgress) {
|
|
118
|
-
onProgress({
|
|
119
|
-
status,
|
|
120
|
-
currentUrl,
|
|
121
|
-
pagesVisited,
|
|
122
|
-
message,
|
|
123
|
-
});
|
|
286
|
+
onProgress({ status, currentUrl, pagesVisited, message });
|
|
124
287
|
}
|
|
125
288
|
};
|
|
289
|
+
const emit = (event) => {
|
|
290
|
+
if (onEvent)
|
|
291
|
+
onEvent(event);
|
|
292
|
+
};
|
|
293
|
+
const accUsage = (u) => {
|
|
294
|
+
totalUsage.input += u.input;
|
|
295
|
+
totalUsage.output += u.output;
|
|
296
|
+
};
|
|
126
297
|
try {
|
|
127
|
-
//
|
|
128
|
-
|
|
298
|
+
// -----------------------------------------------------------------------
|
|
299
|
+
// Step 1: Determine search strategy & collect URLs
|
|
300
|
+
// -----------------------------------------------------------------------
|
|
129
301
|
let urlsToVisit = [...startUrls];
|
|
130
|
-
// If no starting URLs, ask LLM to generate search queries
|
|
131
302
|
if (urlsToVisit.length === 0) {
|
|
303
|
+
reportProgress('searching', 'Planning research strategy...');
|
|
304
|
+
const queryCount = depth === 'thorough' ? '3-5' : '2-3';
|
|
305
|
+
const topicHint = topic !== 'general'
|
|
306
|
+
? `\nFocus queries on ${topic} sources.`
|
|
307
|
+
: '';
|
|
132
308
|
const planningMessages = [
|
|
133
309
|
{
|
|
134
310
|
role: 'system',
|
|
135
|
-
content:
|
|
136
|
-
},
|
|
137
|
-
{
|
|
138
|
-
role: 'user',
|
|
139
|
-
content: `Research request: ${prompt}`,
|
|
311
|
+
content: `You are a web research assistant. Generate ${queryCount} specific search queries to find information for the user's request.${topicHint}\nReturn JSON only: {"queries": ["query1", "query2", ...]}`,
|
|
140
312
|
},
|
|
313
|
+
{ role: 'user', content: `Research request: ${prompt}` },
|
|
141
314
|
];
|
|
142
315
|
const planResponse = await callLLM(planningMessages, {
|
|
143
316
|
apiKey: llmApiKey,
|
|
144
317
|
model: llmModel,
|
|
145
318
|
baseUrl: llmApiBase,
|
|
146
|
-
|
|
319
|
+
jsonMode: true,
|
|
147
320
|
});
|
|
148
321
|
creditsUsed++;
|
|
322
|
+
accUsage(planResponse.usage);
|
|
149
323
|
let queries = [];
|
|
150
324
|
try {
|
|
151
|
-
const parsed = JSON.parse(planResponse);
|
|
325
|
+
const parsed = JSON.parse(planResponse.content);
|
|
152
326
|
queries = parsed.queries || [];
|
|
153
327
|
}
|
|
154
328
|
catch {
|
|
155
|
-
// Fallback: use the prompt as the query
|
|
156
329
|
queries = [prompt];
|
|
157
330
|
}
|
|
158
|
-
//
|
|
159
|
-
|
|
160
|
-
for (const
|
|
161
|
-
const
|
|
331
|
+
// Limit queries to maxQueries
|
|
332
|
+
const effectiveQueries = queries.slice(0, maxQueries);
|
|
333
|
+
for (const rawQuery of effectiveQueries) {
|
|
334
|
+
const query = topic !== 'general' ? enhanceQueryForTopic(rawQuery, topic) : rawQuery;
|
|
335
|
+
reportProgress('searching', `Searching: ${query}`);
|
|
336
|
+
emit({ type: 'step', action: 'searching', query });
|
|
337
|
+
const results = await searchWeb(query, depthDefaults.resultsPerQuery);
|
|
338
|
+
// Sort by topic relevance
|
|
339
|
+
if (topic !== 'general') {
|
|
340
|
+
results.sort((a, b) => scoreByTopic(b, topic) - scoreByTopic(a, topic));
|
|
341
|
+
}
|
|
162
342
|
urlsToVisit.push(...results.map(r => r.url));
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
break;
|
|
343
|
+
if (urlsToVisit.length >= maxSourcesLimit * 2)
|
|
344
|
+
break; // fetch a bit more than needed to account for failures
|
|
166
345
|
}
|
|
167
|
-
// Deduplicate
|
|
168
|
-
|
|
346
|
+
// Deduplicate by hostname+pathname
|
|
347
|
+
const seen = new Set();
|
|
348
|
+
urlsToVisit = urlsToVisit.filter(u => {
|
|
349
|
+
try {
|
|
350
|
+
const key = new URL(u).hostname + new URL(u).pathname;
|
|
351
|
+
if (seen.has(key))
|
|
352
|
+
return false;
|
|
353
|
+
seen.add(key);
|
|
354
|
+
return true;
|
|
355
|
+
}
|
|
356
|
+
catch {
|
|
357
|
+
return false;
|
|
358
|
+
}
|
|
359
|
+
});
|
|
169
360
|
}
|
|
361
|
+
// -----------------------------------------------------------------------
|
|
170
362
|
// Step 2: Visit pages and collect data
|
|
171
|
-
|
|
172
|
-
|
|
363
|
+
// -----------------------------------------------------------------------
|
|
364
|
+
const maxToFetch = Math.min(urlsToVisit.length, maxSourcesLimit);
|
|
365
|
+
for (const url of urlsToVisit.slice(0, maxToFetch + 5)) {
|
|
366
|
+
// Enough data collected?
|
|
367
|
+
if (collectedData.length >= maxSourcesLimit)
|
|
368
|
+
break;
|
|
173
369
|
if (maxCredits && creditsUsed >= maxCredits) {
|
|
174
370
|
reportProgress('done', 'Credit limit reached');
|
|
175
371
|
break;
|
|
176
372
|
}
|
|
177
|
-
// Skip already visited URLs
|
|
178
373
|
if (visitedUrls.has(url))
|
|
179
374
|
continue;
|
|
180
375
|
visitedUrls.add(url);
|
|
181
376
|
reportProgress('visiting', `Fetching: ${url}`, url);
|
|
377
|
+
emit({ type: 'step', action: 'fetching', url });
|
|
182
378
|
try {
|
|
183
|
-
|
|
184
|
-
const result = await peel(url, {
|
|
185
|
-
format: 'markdown',
|
|
186
|
-
timeout: 15000,
|
|
187
|
-
});
|
|
379
|
+
const result = await peel(url, { format: 'markdown', timeout: 15000 });
|
|
188
380
|
pagesVisited++;
|
|
189
|
-
creditsUsed++;
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
collectedData.push({
|
|
193
|
-
url: result.url,
|
|
194
|
-
title: result.title,
|
|
195
|
-
content: truncated,
|
|
196
|
-
});
|
|
381
|
+
creditsUsed++;
|
|
382
|
+
const truncated = truncateContent(result.content, depth === 'thorough' ? 4000 : 3000);
|
|
383
|
+
collectedData.push({ url: result.url, title: result.title, content: truncated });
|
|
197
384
|
sources.push(result.url);
|
|
385
|
+
sourcesDetailed.push({ url: result.url, title: result.title });
|
|
198
386
|
reportProgress('visiting', `Fetched: ${result.title}`, url);
|
|
199
387
|
}
|
|
200
388
|
catch (error) {
|
|
201
389
|
console.error(`Failed to fetch ${url}:`, error.message);
|
|
202
|
-
// Continue with other URLs
|
|
203
390
|
}
|
|
204
391
|
}
|
|
205
|
-
//
|
|
392
|
+
// -----------------------------------------------------------------------
|
|
393
|
+
// Step 2b (thorough only): Cross-reference — ask LLM if more info needed
|
|
394
|
+
// -----------------------------------------------------------------------
|
|
395
|
+
if (depth === 'thorough' && collectedData.length > 0 && collectedData.length < maxSourcesLimit) {
|
|
396
|
+
reportProgress('searching', 'Cross-referencing — checking for gaps...');
|
|
397
|
+
emit({ type: 'step', action: 'analyzing', summary: 'Cross-referencing collected data for gaps...' });
|
|
398
|
+
const gapMessages = [
|
|
399
|
+
{
|
|
400
|
+
role: 'system',
|
|
401
|
+
content: 'You are a web research assistant. Given the user\'s research request and summaries of pages already visited, identify any gaps. If more searches would help, return JSON: {"queries":["q1"]}. If no gaps, return {"queries":[]}.',
|
|
402
|
+
},
|
|
403
|
+
{
|
|
404
|
+
role: 'user',
|
|
405
|
+
content: `Research request: ${prompt}\n\nPages visited:\n${collectedData.map(d => `- ${d.title} (${d.url})`).join('\n')}`,
|
|
406
|
+
},
|
|
407
|
+
];
|
|
408
|
+
try {
|
|
409
|
+
const gapResponse = await callLLM(gapMessages, {
|
|
410
|
+
apiKey: llmApiKey, model: llmModel, baseUrl: llmApiBase, jsonMode: true,
|
|
411
|
+
});
|
|
412
|
+
creditsUsed++;
|
|
413
|
+
accUsage(gapResponse.usage);
|
|
414
|
+
const gapParsed = JSON.parse(gapResponse.content);
|
|
415
|
+
const gapQueries = (gapParsed.queries || []).slice(0, 2);
|
|
416
|
+
for (const q of gapQueries) {
|
|
417
|
+
emit({ type: 'step', action: 'searching', query: q });
|
|
418
|
+
const results = await searchWeb(q, 5);
|
|
419
|
+
for (const r of results) {
|
|
420
|
+
if (collectedData.length >= maxSourcesLimit)
|
|
421
|
+
break;
|
|
422
|
+
if (visitedUrls.has(r.url))
|
|
423
|
+
continue;
|
|
424
|
+
visitedUrls.add(r.url);
|
|
425
|
+
emit({ type: 'step', action: 'fetching', url: r.url });
|
|
426
|
+
try {
|
|
427
|
+
const result = await peel(r.url, { format: 'markdown', timeout: 15000 });
|
|
428
|
+
pagesVisited++;
|
|
429
|
+
creditsUsed++;
|
|
430
|
+
const truncated = truncateContent(result.content, 4000);
|
|
431
|
+
collectedData.push({ url: result.url, title: result.title, content: truncated });
|
|
432
|
+
sources.push(result.url);
|
|
433
|
+
sourcesDetailed.push({ url: result.url, title: result.title });
|
|
434
|
+
}
|
|
435
|
+
catch {
|
|
436
|
+
// skip
|
|
437
|
+
}
|
|
438
|
+
}
|
|
439
|
+
}
|
|
440
|
+
}
|
|
441
|
+
catch {
|
|
442
|
+
// Non-critical — continue with what we have
|
|
443
|
+
}
|
|
444
|
+
}
|
|
445
|
+
// -----------------------------------------------------------------------
|
|
446
|
+
// Step 3: Extract / synthesise final answer
|
|
447
|
+
// -----------------------------------------------------------------------
|
|
206
448
|
if (collectedData.length === 0) {
|
|
207
449
|
return {
|
|
208
450
|
success: false,
|
|
@@ -210,18 +452,30 @@ export async function runAgent(options) {
|
|
|
210
452
|
sources: [],
|
|
211
453
|
pagesVisited,
|
|
212
454
|
creditsUsed,
|
|
455
|
+
tokensUsed: totalUsage,
|
|
213
456
|
};
|
|
214
457
|
}
|
|
215
458
|
reportProgress('extracting', 'Analyzing collected data...');
|
|
216
|
-
|
|
459
|
+
emit({ type: 'step', action: 'analyzing', summary: `Synthesizing answer from ${collectedData.length} sources...` });
|
|
217
460
|
const context = collectedData
|
|
218
461
|
.map(d => `Source: ${d.url}\nTitle: ${d.title}\n\n${d.content}`)
|
|
219
462
|
.join('\n\n---\n\n');
|
|
220
|
-
const truncatedContext = truncateContent(context,
|
|
221
|
-
// Build system prompt
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
463
|
+
const truncatedContext = truncateContent(context, depth === 'thorough' ? 12000 : 8000);
|
|
464
|
+
// Build system prompt based on schema or free-form
|
|
465
|
+
let systemPrompt;
|
|
466
|
+
if (effectiveSchema) {
|
|
467
|
+
systemPrompt =
|
|
468
|
+
'You are a web research assistant. Extract structured data from the provided web content based on the user\'s request. ' +
|
|
469
|
+
`Return a JSON object matching this schema:\n${JSON.stringify(effectiveSchema, null, 2)}\n\nReturn ONLY valid JSON, no explanation.`;
|
|
470
|
+
}
|
|
471
|
+
else {
|
|
472
|
+
systemPrompt =
|
|
473
|
+
'You are a web research assistant. Based on the provided web content, answer the user\'s research question. ' +
|
|
474
|
+
'Provide a comprehensive, well-structured answer. Return a JSON object with:\n' +
|
|
475
|
+
'- "answer": your detailed answer as a string (use markdown formatting)\n' +
|
|
476
|
+
'- "keyFindings": array of key facts/findings\n' +
|
|
477
|
+
'Return ONLY valid JSON, no explanation.';
|
|
478
|
+
}
|
|
225
479
|
const extractMessages = [
|
|
226
480
|
{ role: 'system', content: systemPrompt },
|
|
227
481
|
{
|
|
@@ -229,29 +483,62 @@ export async function runAgent(options) {
|
|
|
229
483
|
content: `Research request: ${prompt}\n\nCollected data from ${collectedData.length} web pages:\n\n${truncatedContext}`,
|
|
230
484
|
},
|
|
231
485
|
];
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
baseUrl: llmApiBase,
|
|
236
|
-
schema: schema || {},
|
|
237
|
-
});
|
|
486
|
+
// Use streaming LLM call when onEvent is present
|
|
487
|
+
const hasStreaming = !!onEvent;
|
|
488
|
+
const extractResponse = await callLLMStreaming(extractMessages, { apiKey: llmApiKey, model: llmModel, baseUrl: llmApiBase, jsonMode: true }, hasStreaming ? (text) => emit({ type: 'chunk', text }) : undefined);
|
|
238
489
|
creditsUsed++;
|
|
490
|
+
accUsage(extractResponse.usage);
|
|
239
491
|
// Parse final result
|
|
240
492
|
let finalData;
|
|
241
493
|
try {
|
|
242
|
-
finalData = JSON.parse(extractResponse);
|
|
494
|
+
finalData = JSON.parse(extractResponse.content);
|
|
243
495
|
}
|
|
244
496
|
catch {
|
|
245
|
-
|
|
246
|
-
finalData = { result: extractResponse };
|
|
497
|
+
finalData = { result: extractResponse.content };
|
|
247
498
|
}
|
|
499
|
+
// Validate against outputSchema if provided
|
|
500
|
+
if (outputSchema) {
|
|
501
|
+
const validation = validateJsonSchema(finalData, outputSchema);
|
|
502
|
+
if (!validation.valid) {
|
|
503
|
+
// Try once more: ask LLM to fix
|
|
504
|
+
try {
|
|
505
|
+
const fixMessages = [
|
|
506
|
+
{
|
|
507
|
+
role: 'system',
|
|
508
|
+
content: `The previous response did not match the required JSON schema. Fix it.\nSchema: ${JSON.stringify(outputSchema)}\nErrors: ${validation.errors}\nReturn ONLY valid JSON.`,
|
|
509
|
+
},
|
|
510
|
+
{ role: 'user', content: extractResponse.content },
|
|
511
|
+
];
|
|
512
|
+
const fixResponse = await callLLM(fixMessages, {
|
|
513
|
+
apiKey: llmApiKey, model: llmModel, baseUrl: llmApiBase, jsonMode: true,
|
|
514
|
+
});
|
|
515
|
+
creditsUsed++;
|
|
516
|
+
accUsage(fixResponse.usage);
|
|
517
|
+
finalData = JSON.parse(fixResponse.content);
|
|
518
|
+
}
|
|
519
|
+
catch {
|
|
520
|
+
// Return what we have with a warning
|
|
521
|
+
finalData._validationWarning = `Output did not match schema: ${validation.errors}`;
|
|
522
|
+
}
|
|
523
|
+
}
|
|
524
|
+
}
|
|
525
|
+
const answerText = typeof finalData?.answer === 'string' ? finalData.answer : undefined;
|
|
248
526
|
reportProgress('done', `Completed: ${pagesVisited} pages visited`);
|
|
527
|
+
emit({
|
|
528
|
+
type: 'done',
|
|
529
|
+
answer: answerText || JSON.stringify(finalData),
|
|
530
|
+
sources: sourcesDetailed,
|
|
531
|
+
tokensUsed: totalUsage,
|
|
532
|
+
});
|
|
249
533
|
return {
|
|
250
534
|
success: true,
|
|
251
535
|
data: finalData,
|
|
536
|
+
answer: answerText,
|
|
252
537
|
sources,
|
|
538
|
+
sourcesDetailed,
|
|
253
539
|
pagesVisited,
|
|
254
540
|
creditsUsed,
|
|
541
|
+
tokensUsed: totalUsage,
|
|
255
542
|
};
|
|
256
543
|
}
|
|
257
544
|
catch (error) {
|
|
@@ -260,8 +547,10 @@ export async function runAgent(options) {
|
|
|
260
547
|
success: false,
|
|
261
548
|
data: { error: error.message || 'Unknown error occurred' },
|
|
262
549
|
sources,
|
|
550
|
+
sourcesDetailed,
|
|
263
551
|
pagesVisited,
|
|
264
552
|
creditsUsed,
|
|
553
|
+
tokensUsed: totalUsage,
|
|
265
554
|
};
|
|
266
555
|
}
|
|
267
556
|
}
|