webpeel 0.7.0 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. package/README.md +140 -500
  2. package/dist/cli-auth.d.ts +2 -0
  3. package/dist/cli-auth.d.ts.map +1 -1
  4. package/dist/cli-auth.js +16 -3
  5. package/dist/cli-auth.js.map +1 -1
  6. package/dist/cli.js +475 -77
  7. package/dist/cli.js.map +1 -1
  8. package/dist/core/actions.d.ts +19 -10
  9. package/dist/core/actions.d.ts.map +1 -1
  10. package/dist/core/actions.js +214 -43
  11. package/dist/core/actions.js.map +1 -1
  12. package/dist/core/agent.d.ts +60 -3
  13. package/dist/core/agent.d.ts.map +1 -1
  14. package/dist/core/agent.js +375 -86
  15. package/dist/core/agent.js.map +1 -1
  16. package/dist/core/answer.d.ts +43 -0
  17. package/dist/core/answer.d.ts.map +1 -0
  18. package/dist/core/answer.js +378 -0
  19. package/dist/core/answer.js.map +1 -0
  20. package/dist/core/cache.d.ts +14 -0
  21. package/dist/core/cache.d.ts.map +1 -0
  22. package/dist/core/cache.js +122 -0
  23. package/dist/core/cache.js.map +1 -0
  24. package/dist/core/dns-cache.d.ts +21 -0
  25. package/dist/core/dns-cache.d.ts.map +1 -0
  26. package/dist/core/dns-cache.js +184 -0
  27. package/dist/core/dns-cache.js.map +1 -0
  28. package/dist/core/documents.d.ts +24 -0
  29. package/dist/core/documents.d.ts.map +1 -0
  30. package/dist/core/documents.js +124 -0
  31. package/dist/core/documents.js.map +1 -0
  32. package/dist/core/extract-inline.d.ts +39 -0
  33. package/dist/core/extract-inline.d.ts.map +1 -0
  34. package/dist/core/extract-inline.js +214 -0
  35. package/dist/core/extract-inline.js.map +1 -0
  36. package/dist/core/fetcher.d.ts +33 -7
  37. package/dist/core/fetcher.d.ts.map +1 -1
  38. package/dist/core/fetcher.js +608 -41
  39. package/dist/core/fetcher.js.map +1 -1
  40. package/dist/core/jobs.d.ts +66 -0
  41. package/dist/core/jobs.d.ts.map +1 -0
  42. package/dist/core/jobs.js +513 -0
  43. package/dist/core/jobs.js.map +1 -0
  44. package/dist/core/markdown.d.ts.map +1 -1
  45. package/dist/core/markdown.js +141 -31
  46. package/dist/core/markdown.js.map +1 -1
  47. package/dist/core/pdf.d.ts.map +1 -1
  48. package/dist/core/pdf.js +3 -1
  49. package/dist/core/pdf.js.map +1 -1
  50. package/dist/core/screenshot.d.ts +33 -0
  51. package/dist/core/screenshot.d.ts.map +1 -0
  52. package/dist/core/screenshot.js +30 -0
  53. package/dist/core/screenshot.js.map +1 -0
  54. package/dist/core/search-provider.d.ts +46 -0
  55. package/dist/core/search-provider.d.ts.map +1 -0
  56. package/dist/core/search-provider.js +281 -0
  57. package/dist/core/search-provider.js.map +1 -0
  58. package/dist/core/strategies.d.ts +7 -10
  59. package/dist/core/strategies.d.ts.map +1 -1
  60. package/dist/core/strategies.js +370 -63
  61. package/dist/core/strategies.js.map +1 -1
  62. package/dist/index.d.ts +9 -3
  63. package/dist/index.d.ts.map +1 -1
  64. package/dist/index.js +61 -32
  65. package/dist/index.js.map +1 -1
  66. package/dist/mcp/server.js +335 -70
  67. package/dist/mcp/server.js.map +1 -1
  68. package/dist/types.d.ts +43 -1
  69. package/dist/types.d.ts.map +1 -1
  70. package/dist/types.js.map +1 -1
  71. package/llms.txt +85 -47
  72. package/package.json +11 -5
@@ -1,9 +1,19 @@
1
1
  /**
2
2
  * Autonomous web research agent
3
3
  * Searches the web, fetches pages, and extracts structured data based on natural language prompts
4
+ *
5
+ * Supports:
6
+ * - depth: "basic" (1 search, top 3) vs "thorough" (multi-step, up to 3 searches, top 10)
7
+ * - maxSources: control how many sources to include (default 5, max 20)
8
+ * - topic: "general" | "news" | "technical" | "academic" — adjusts queries & prioritization
9
+ * - outputSchema: JSON Schema for structured output with validation
10
+ * - streaming callbacks for SSE support
4
11
  */
5
12
  import { load } from 'cheerio';
6
13
  import { peel } from '../index.js';
14
+ // ---------------------------------------------------------------------------
15
+ // Helpers
16
+ // ---------------------------------------------------------------------------
7
17
  /**
8
18
  * Search DuckDuckGo HTML and parse results
9
19
  */
@@ -23,22 +33,17 @@ async function searchWeb(query, limit = 10) {
23
33
  $('.result').each((_, el) => {
24
34
  const link = $(el).find('.result__a');
25
35
  const snippet = $(el).find('.result__snippet');
26
- const url = link.attr('href');
36
+ const rawUrl = link.attr('href');
27
37
  const title = link.text().trim();
28
38
  const desc = snippet.text().trim();
29
- if (url && title) {
30
- // DuckDuckGo uses redirect URLs, extract the actual URL
39
+ if (rawUrl && title) {
31
40
  try {
32
- const actualUrl = url.startsWith('//')
33
- ? `https:${url}`
34
- : url.includes('uddg=')
35
- ? decodeURIComponent(url.split('uddg=')[1].split('&')[0])
36
- : url;
37
- results.push({
38
- url: actualUrl,
39
- title,
40
- snippet: desc,
41
- });
41
+ const actualUrl = rawUrl.startsWith('//')
42
+ ? `https:${rawUrl}`
43
+ : rawUrl.includes('uddg=')
44
+ ? decodeURIComponent(rawUrl.split('uddg=')[1].split('&')[0])
45
+ : rawUrl;
46
+ results.push({ url: actualUrl, title, snippet: desc });
42
47
  }
43
48
  catch {
44
49
  // Skip malformed URLs
@@ -53,25 +58,73 @@ async function searchWeb(query, limit = 10) {
53
58
  }
54
59
  }
55
60
  /**
56
- * Call OpenAI-compatible LLM API
61
+ * Prioritise search results by topic relevance (higher = better)
62
+ */
63
+ function scoreByTopic(result, topic) {
64
+ const url = result.url.toLowerCase();
65
+ const domain = (() => { try {
66
+ return new URL(url).hostname;
67
+ }
68
+ catch {
69
+ return '';
70
+ } })();
71
+ switch (topic) {
72
+ case 'academic':
73
+ if (/\.edu$|arxiv\.org|scholar\.google|pubmed|ieee\.org|acm\.org|researchgate\.net/.test(domain))
74
+ return 10;
75
+ if (/\.gov$/.test(domain))
76
+ return 5;
77
+ return 0;
78
+ case 'technical':
79
+ if (/github\.com|stackoverflow\.com|docs\.|developer\.|devdocs\.io|mdn\./.test(domain))
80
+ return 10;
81
+ if (/\.dev$|\.io$/.test(domain))
82
+ return 3;
83
+ return 0;
84
+ case 'news':
85
+ if (/reuters\.com|apnews\.com|bbc\.com|cnn\.com|nytimes\.com|theguardian\.com|bloomberg\.com|techcrunch\.com|theverge\.com|arstechnica\.com/.test(domain))
86
+ return 10;
87
+ if (/news|press|blog/.test(domain))
88
+ return 3;
89
+ return 0;
90
+ default:
91
+ return 0;
92
+ }
93
+ }
94
+ /**
95
+ * Add topic-specific modifiers to search queries
96
+ */
97
+ function enhanceQueryForTopic(query, topic) {
98
+ switch (topic) {
99
+ case 'news':
100
+ return `${query} latest news 2026`;
101
+ case 'academic':
102
+ return `${query} research paper study`;
103
+ case 'technical':
104
+ return `${query} documentation tutorial`;
105
+ default:
106
+ return query;
107
+ }
108
+ }
109
+ /**
110
+ * Call OpenAI-compatible LLM API (non-streaming)
57
111
  */
58
112
  async function callLLM(messages, options) {
59
- const { apiKey, model = 'gpt-4o-mini', baseUrl = 'https://api.openai.com/v1', schema } = options;
113
+ const { apiKey, model = 'gpt-4o-mini', baseUrl = 'https://api.openai.com/v1', jsonMode } = options;
60
114
  const { fetch: undiciFetch } = await import('undici');
61
115
  const body = {
62
116
  model,
63
117
  messages,
64
118
  temperature: 0,
65
119
  };
66
- // Force JSON mode if schema is provided
67
- if (schema) {
120
+ if (jsonMode) {
68
121
  body.response_format = { type: 'json_object' };
69
122
  }
70
123
  const response = await undiciFetch(`${baseUrl}/chat/completions`, {
71
124
  method: 'POST',
72
125
  headers: {
73
126
  'Content-Type': 'application/json',
74
- 'Authorization': `Bearer ${apiKey}`,
127
+ Authorization: `Bearer ${apiKey}`,
75
128
  },
76
129
  body: JSON.stringify(body),
77
130
  });
@@ -79,12 +132,116 @@ async function callLLM(messages, options) {
79
132
  const errorText = await response.text();
80
133
  throw new Error(`LLM API error ${response.status}: ${errorText}`);
81
134
  }
82
- const result = await response.json();
135
+ const result = (await response.json());
83
136
  const content = result.choices?.[0]?.message?.content;
84
137
  if (!content) {
85
138
  throw new Error('LLM returned empty response');
86
139
  }
87
- return content;
140
+ const usage = result.usage
141
+ ? { input: result.usage.prompt_tokens ?? 0, output: result.usage.completion_tokens ?? 0 }
142
+ : { input: 0, output: 0 };
143
+ return { content, usage };
144
+ }
145
+ /**
146
+ * Call OpenAI-compatible LLM API with streaming.
147
+ * Invokes `onChunk` for each text delta, returns full content when done.
148
+ */
149
+ async function callLLMStreaming(messages, options, onChunk) {
150
+ if (!onChunk)
151
+ return callLLM(messages, options);
152
+ const { apiKey, model = 'gpt-4o-mini', baseUrl = 'https://api.openai.com/v1', jsonMode } = options;
153
+ const { fetch: undiciFetch } = await import('undici');
154
+ const body = {
155
+ model,
156
+ messages,
157
+ temperature: 0,
158
+ stream: true,
159
+ stream_options: { include_usage: true },
160
+ };
161
+ if (jsonMode) {
162
+ body.response_format = { type: 'json_object' };
163
+ }
164
+ const response = await undiciFetch(`${baseUrl}/chat/completions`, {
165
+ method: 'POST',
166
+ headers: {
167
+ 'Content-Type': 'application/json',
168
+ Authorization: `Bearer ${apiKey}`,
169
+ },
170
+ body: JSON.stringify(body),
171
+ });
172
+ if (!response.ok) {
173
+ const errorText = await response.text();
174
+ throw new Error(`LLM API error ${response.status}: ${errorText}`);
175
+ }
176
+ let fullContent = '';
177
+ let usage = { input: 0, output: 0 };
178
+ // Read the SSE stream
179
+ const reader = response.body?.getReader?.();
180
+ if (!reader) {
181
+ // Fallback: consume entire body
182
+ const text = await response.text();
183
+ return { content: text, usage };
184
+ }
185
+ const decoder = new TextDecoder();
186
+ let buffer = '';
187
+ while (true) {
188
+ const { done, value } = await reader.read();
189
+ if (done)
190
+ break;
191
+ buffer += decoder.decode(value, { stream: true });
192
+ const lines = buffer.split('\n');
193
+ buffer = lines.pop() || '';
194
+ for (const line of lines) {
195
+ const trimmed = line.trim();
196
+ if (!trimmed || !trimmed.startsWith('data: '))
197
+ continue;
198
+ const data = trimmed.slice(6);
199
+ if (data === '[DONE]')
200
+ continue;
201
+ try {
202
+ const parsed = JSON.parse(data);
203
+ const delta = parsed.choices?.[0]?.delta?.content;
204
+ if (delta) {
205
+ fullContent += delta;
206
+ onChunk(delta);
207
+ }
208
+ // Final chunk may include usage
209
+ if (parsed.usage) {
210
+ usage = {
211
+ input: parsed.usage.prompt_tokens ?? 0,
212
+ output: parsed.usage.completion_tokens ?? 0,
213
+ };
214
+ }
215
+ }
216
+ catch {
217
+ // Skip unparseable lines
218
+ }
219
+ }
220
+ }
221
+ return { content: fullContent, usage };
222
+ }
223
+ /**
224
+ * Validate JSON data against a JSON Schema (best-effort, no extra deps)
225
+ */
226
+ function validateJsonSchema(data, schema) {
227
+ // Lightweight validation: check required fields and top-level types
228
+ if (schema.type === 'object' && schema.properties) {
229
+ if (typeof data !== 'object' || data === null || Array.isArray(data)) {
230
+ return { valid: false, errors: 'Expected an object' };
231
+ }
232
+ if (schema.required && Array.isArray(schema.required)) {
233
+ const missing = schema.required.filter((k) => !(k in data));
234
+ if (missing.length > 0) {
235
+ return { valid: false, errors: `Missing required fields: ${missing.join(', ')}` };
236
+ }
237
+ }
238
+ }
239
+ else if (schema.type === 'array') {
240
+ if (!Array.isArray(data)) {
241
+ return { valid: false, errors: 'Expected an array' };
242
+ }
243
+ }
244
+ return { valid: true };
88
245
  }
89
246
  /**
90
247
  * Truncate content to approximately N tokens (rough estimate: 1 token ≈ 4 chars)
@@ -95,114 +252,199 @@ function truncateContent(content, maxTokens = 3000) {
95
252
  return content;
96
253
  return content.slice(0, maxChars) + '\n\n[Content truncated...]';
97
254
  }
255
+ // ---------------------------------------------------------------------------
256
+ // Main agent
257
+ // ---------------------------------------------------------------------------
98
258
  /**
99
259
  * Run autonomous web research agent
100
260
  */
101
261
  export async function runAgent(options) {
102
- const { prompt, urls: startUrls = [], schema, llmApiKey, llmApiBase = 'https://api.openai.com/v1', llmModel = 'gpt-4o-mini', maxPages = 10, maxCredits, onProgress, } = options;
103
- if (!llmApiKey) {
262
+ const { prompt, urls: startUrls = [], schema: legacySchema, outputSchema, llmApiKey, llmApiBase = 'https://api.openai.com/v1', llmModel = 'gpt-4o-mini', maxPages, maxSources: rawMaxSources, depth = 'basic', topic = 'general', maxCredits, onProgress, onEvent, } = options;
263
+ if (!llmApiKey)
104
264
  throw new Error('llmApiKey is required');
105
- }
106
- if (!prompt) {
265
+ if (!prompt)
107
266
  throw new Error('prompt is required');
108
- }
109
- const maxIterations = Math.min(maxPages, 10);
267
+ // Effective schema = outputSchema || legacy schema
268
+ const effectiveSchema = outputSchema || legacySchema;
269
+ // Determine effective maxSources:
270
+ // new param > legacy maxPages > depth-based default
271
+ const depthDefaults = depth === 'thorough'
272
+ ? { maxSources: 10, maxQueries: 3, resultsPerQuery: 10 }
273
+ : { maxSources: 3, maxQueries: 1, resultsPerQuery: 5 };
274
+ const maxSourcesLimit = Math.min(rawMaxSources ?? maxPages ?? depthDefaults.maxSources, 20);
275
+ const maxQueries = depth === 'thorough' ? depthDefaults.maxQueries : depthDefaults.maxQueries;
110
276
  const visitedUrls = new Set();
111
277
  const sources = [];
278
+ const sourcesDetailed = [];
112
279
  let pagesVisited = 0;
113
280
  let creditsUsed = 0;
114
- // Collected data from all pages
281
+ let totalUsage = { input: 0, output: 0 };
115
282
  const collectedData = [];
283
+ // Emit both legacy progress and new event
116
284
  const reportProgress = (status, message, currentUrl) => {
117
285
  if (onProgress) {
118
- onProgress({
119
- status,
120
- currentUrl,
121
- pagesVisited,
122
- message,
123
- });
286
+ onProgress({ status, currentUrl, pagesVisited, message });
124
287
  }
125
288
  };
289
+ const emit = (event) => {
290
+ if (onEvent)
291
+ onEvent(event);
292
+ };
293
+ const accUsage = (u) => {
294
+ totalUsage.input += u.input;
295
+ totalUsage.output += u.output;
296
+ };
126
297
  try {
127
- // Step 1: Determine initial search strategy
128
- reportProgress('searching', 'Planning research strategy...');
298
+ // -----------------------------------------------------------------------
299
+ // Step 1: Determine search strategy & collect URLs
300
+ // -----------------------------------------------------------------------
129
301
  let urlsToVisit = [...startUrls];
130
- // If no starting URLs, ask LLM to generate search queries
131
302
  if (urlsToVisit.length === 0) {
303
+ reportProgress('searching', 'Planning research strategy...');
304
+ const queryCount = depth === 'thorough' ? '3-5' : '2-3';
305
+ const topicHint = topic !== 'general'
306
+ ? `\nFocus queries on ${topic} sources.`
307
+ : '';
132
308
  const planningMessages = [
133
309
  {
134
310
  role: 'system',
135
- content: 'You are a web research assistant. Generate 2-3 specific search queries to find information for the user\'s request. Return JSON only: {"queries": ["query1", "query2", "query3"]}',
136
- },
137
- {
138
- role: 'user',
139
- content: `Research request: ${prompt}`,
311
+ content: `You are a web research assistant. Generate ${queryCount} specific search queries to find information for the user's request.${topicHint}\nReturn JSON only: {"queries": ["query1", "query2", ...]}`,
140
312
  },
313
+ { role: 'user', content: `Research request: ${prompt}` },
141
314
  ];
142
315
  const planResponse = await callLLM(planningMessages, {
143
316
  apiKey: llmApiKey,
144
317
  model: llmModel,
145
318
  baseUrl: llmApiBase,
146
- schema: { queries: ['string'] },
319
+ jsonMode: true,
147
320
  });
148
321
  creditsUsed++;
322
+ accUsage(planResponse.usage);
149
323
  let queries = [];
150
324
  try {
151
- const parsed = JSON.parse(planResponse);
325
+ const parsed = JSON.parse(planResponse.content);
152
326
  queries = parsed.queries || [];
153
327
  }
154
328
  catch {
155
- // Fallback: use the prompt as the query
156
329
  queries = [prompt];
157
330
  }
158
- // Search for URLs
159
- reportProgress('searching', `Searching: ${queries.join(', ')}`);
160
- for (const query of queries.slice(0, 2)) { // Limit to 2 queries
161
- const results = await searchWeb(query, 5);
331
+ // Limit queries to maxQueries
332
+ const effectiveQueries = queries.slice(0, maxQueries);
333
+ for (const rawQuery of effectiveQueries) {
334
+ const query = topic !== 'general' ? enhanceQueryForTopic(rawQuery, topic) : rawQuery;
335
+ reportProgress('searching', `Searching: ${query}`);
336
+ emit({ type: 'step', action: 'searching', query });
337
+ const results = await searchWeb(query, depthDefaults.resultsPerQuery);
338
+ // Sort by topic relevance
339
+ if (topic !== 'general') {
340
+ results.sort((a, b) => scoreByTopic(b, topic) - scoreByTopic(a, topic));
341
+ }
162
342
  urlsToVisit.push(...results.map(r => r.url));
163
- // Stop if we have enough URLs
164
- if (urlsToVisit.length >= maxPages)
165
- break;
343
+ if (urlsToVisit.length >= maxSourcesLimit * 2)
344
+ break; // fetch a bit more than needed to account for failures
166
345
  }
167
- // Deduplicate
168
- urlsToVisit = [...new Set(urlsToVisit)];
346
+ // Deduplicate by hostname+pathname
347
+ const seen = new Set();
348
+ urlsToVisit = urlsToVisit.filter(u => {
349
+ try {
350
+ const key = new URL(u).hostname + new URL(u).pathname;
351
+ if (seen.has(key))
352
+ return false;
353
+ seen.add(key);
354
+ return true;
355
+ }
356
+ catch {
357
+ return false;
358
+ }
359
+ });
169
360
  }
361
+ // -----------------------------------------------------------------------
170
362
  // Step 2: Visit pages and collect data
171
- for (const url of urlsToVisit.slice(0, maxIterations)) {
172
- // Check credit limit
363
+ // -----------------------------------------------------------------------
364
+ const maxToFetch = Math.min(urlsToVisit.length, maxSourcesLimit);
365
+ for (const url of urlsToVisit.slice(0, maxToFetch + 5)) {
366
+ // Enough data collected?
367
+ if (collectedData.length >= maxSourcesLimit)
368
+ break;
173
369
  if (maxCredits && creditsUsed >= maxCredits) {
174
370
  reportProgress('done', 'Credit limit reached');
175
371
  break;
176
372
  }
177
- // Skip already visited URLs
178
373
  if (visitedUrls.has(url))
179
374
  continue;
180
375
  visitedUrls.add(url);
181
376
  reportProgress('visiting', `Fetching: ${url}`, url);
377
+ emit({ type: 'step', action: 'fetching', url });
182
378
  try {
183
- // Fetch the page
184
- const result = await peel(url, {
185
- format: 'markdown',
186
- timeout: 15000,
187
- });
379
+ const result = await peel(url, { format: 'markdown', timeout: 15000 });
188
380
  pagesVisited++;
189
- creditsUsed++; // Count each page fetch as 1 credit
190
- // Truncate content to avoid token overflow
191
- const truncated = truncateContent(result.content, 3000);
192
- collectedData.push({
193
- url: result.url,
194
- title: result.title,
195
- content: truncated,
196
- });
381
+ creditsUsed++;
382
+ const truncated = truncateContent(result.content, depth === 'thorough' ? 4000 : 3000);
383
+ collectedData.push({ url: result.url, title: result.title, content: truncated });
197
384
  sources.push(result.url);
385
+ sourcesDetailed.push({ url: result.url, title: result.title });
198
386
  reportProgress('visiting', `Fetched: ${result.title}`, url);
199
387
  }
200
388
  catch (error) {
201
389
  console.error(`Failed to fetch ${url}:`, error.message);
202
- // Continue with other URLs
203
390
  }
204
391
  }
205
- // Step 3: Extract and compile final data
392
+ // -----------------------------------------------------------------------
393
+ // Step 2b (thorough only): Cross-reference — ask LLM if more info needed
394
+ // -----------------------------------------------------------------------
395
+ if (depth === 'thorough' && collectedData.length > 0 && collectedData.length < maxSourcesLimit) {
396
+ reportProgress('searching', 'Cross-referencing — checking for gaps...');
397
+ emit({ type: 'step', action: 'analyzing', summary: 'Cross-referencing collected data for gaps...' });
398
+ const gapMessages = [
399
+ {
400
+ role: 'system',
401
+ content: 'You are a web research assistant. Given the user\'s research request and summaries of pages already visited, identify any gaps. If more searches would help, return JSON: {"queries":["q1"]}. If no gaps, return {"queries":[]}.',
402
+ },
403
+ {
404
+ role: 'user',
405
+ content: `Research request: ${prompt}\n\nPages visited:\n${collectedData.map(d => `- ${d.title} (${d.url})`).join('\n')}`,
406
+ },
407
+ ];
408
+ try {
409
+ const gapResponse = await callLLM(gapMessages, {
410
+ apiKey: llmApiKey, model: llmModel, baseUrl: llmApiBase, jsonMode: true,
411
+ });
412
+ creditsUsed++;
413
+ accUsage(gapResponse.usage);
414
+ const gapParsed = JSON.parse(gapResponse.content);
415
+ const gapQueries = (gapParsed.queries || []).slice(0, 2);
416
+ for (const q of gapQueries) {
417
+ emit({ type: 'step', action: 'searching', query: q });
418
+ const results = await searchWeb(q, 5);
419
+ for (const r of results) {
420
+ if (collectedData.length >= maxSourcesLimit)
421
+ break;
422
+ if (visitedUrls.has(r.url))
423
+ continue;
424
+ visitedUrls.add(r.url);
425
+ emit({ type: 'step', action: 'fetching', url: r.url });
426
+ try {
427
+ const result = await peel(r.url, { format: 'markdown', timeout: 15000 });
428
+ pagesVisited++;
429
+ creditsUsed++;
430
+ const truncated = truncateContent(result.content, 4000);
431
+ collectedData.push({ url: result.url, title: result.title, content: truncated });
432
+ sources.push(result.url);
433
+ sourcesDetailed.push({ url: result.url, title: result.title });
434
+ }
435
+ catch {
436
+ // skip
437
+ }
438
+ }
439
+ }
440
+ }
441
+ catch {
442
+ // Non-critical — continue with what we have
443
+ }
444
+ }
445
+ // -----------------------------------------------------------------------
446
+ // Step 3: Extract / synthesise final answer
447
+ // -----------------------------------------------------------------------
206
448
  if (collectedData.length === 0) {
207
449
  return {
208
450
  success: false,
@@ -210,18 +452,30 @@ export async function runAgent(options) {
210
452
  sources: [],
211
453
  pagesVisited,
212
454
  creditsUsed,
455
+ tokensUsed: totalUsage,
213
456
  };
214
457
  }
215
458
  reportProgress('extracting', 'Analyzing collected data...');
216
- // Build context from all collected pages
459
+ emit({ type: 'step', action: 'analyzing', summary: `Synthesizing answer from ${collectedData.length} sources...` });
217
460
  const context = collectedData
218
461
  .map(d => `Source: ${d.url}\nTitle: ${d.title}\n\n${d.content}`)
219
462
  .join('\n\n---\n\n');
220
- const truncatedContext = truncateContent(context, 8000); // Larger budget for final analysis
221
- // Build system prompt
222
- const systemPrompt = schema
223
- ? `You are a web research assistant. Extract structured data from the provided web content based on the user's request. Return a JSON object matching this schema:\n${JSON.stringify(schema, null, 2)}\n\nReturn ONLY valid JSON, no explanation.`
224
- : `You are a web research assistant. Extract and compile information from the provided web content based on the user's request. Return a JSON object with your findings. Be comprehensive but concise. Return ONLY valid JSON, no explanation.`;
463
+ const truncatedContext = truncateContent(context, depth === 'thorough' ? 12000 : 8000);
464
+ // Build system prompt based on schema or free-form
465
+ let systemPrompt;
466
+ if (effectiveSchema) {
467
+ systemPrompt =
468
+ 'You are a web research assistant. Extract structured data from the provided web content based on the user\'s request. ' +
469
+ `Return a JSON object matching this schema:\n${JSON.stringify(effectiveSchema, null, 2)}\n\nReturn ONLY valid JSON, no explanation.`;
470
+ }
471
+ else {
472
+ systemPrompt =
473
+ 'You are a web research assistant. Based on the provided web content, answer the user\'s research question. ' +
474
+ 'Provide a comprehensive, well-structured answer. Return a JSON object with:\n' +
475
+ '- "answer": your detailed answer as a string (use markdown formatting)\n' +
476
+ '- "keyFindings": array of key facts/findings\n' +
477
+ 'Return ONLY valid JSON, no explanation.';
478
+ }
225
479
  const extractMessages = [
226
480
  { role: 'system', content: systemPrompt },
227
481
  {
@@ -229,29 +483,62 @@ export async function runAgent(options) {
229
483
  content: `Research request: ${prompt}\n\nCollected data from ${collectedData.length} web pages:\n\n${truncatedContext}`,
230
484
  },
231
485
  ];
232
- const extractResponse = await callLLM(extractMessages, {
233
- apiKey: llmApiKey,
234
- model: llmModel,
235
- baseUrl: llmApiBase,
236
- schema: schema || {},
237
- });
486
+ // Use streaming LLM call when onEvent is present
487
+ const hasStreaming = !!onEvent;
488
+ const extractResponse = await callLLMStreaming(extractMessages, { apiKey: llmApiKey, model: llmModel, baseUrl: llmApiBase, jsonMode: true }, hasStreaming ? (text) => emit({ type: 'chunk', text }) : undefined);
238
489
  creditsUsed++;
490
+ accUsage(extractResponse.usage);
239
491
  // Parse final result
240
492
  let finalData;
241
493
  try {
242
- finalData = JSON.parse(extractResponse);
494
+ finalData = JSON.parse(extractResponse.content);
243
495
  }
244
496
  catch {
245
- // If JSON parsing fails, return the raw response wrapped in an object
246
- finalData = { result: extractResponse };
497
+ finalData = { result: extractResponse.content };
247
498
  }
499
+ // Validate against outputSchema if provided
500
+ if (outputSchema) {
501
+ const validation = validateJsonSchema(finalData, outputSchema);
502
+ if (!validation.valid) {
503
+ // Try once more: ask LLM to fix
504
+ try {
505
+ const fixMessages = [
506
+ {
507
+ role: 'system',
508
+ content: `The previous response did not match the required JSON schema. Fix it.\nSchema: ${JSON.stringify(outputSchema)}\nErrors: ${validation.errors}\nReturn ONLY valid JSON.`,
509
+ },
510
+ { role: 'user', content: extractResponse.content },
511
+ ];
512
+ const fixResponse = await callLLM(fixMessages, {
513
+ apiKey: llmApiKey, model: llmModel, baseUrl: llmApiBase, jsonMode: true,
514
+ });
515
+ creditsUsed++;
516
+ accUsage(fixResponse.usage);
517
+ finalData = JSON.parse(fixResponse.content);
518
+ }
519
+ catch {
520
+ // Return what we have with a warning
521
+ finalData._validationWarning = `Output did not match schema: ${validation.errors}`;
522
+ }
523
+ }
524
+ }
525
+ const answerText = typeof finalData?.answer === 'string' ? finalData.answer : undefined;
248
526
  reportProgress('done', `Completed: ${pagesVisited} pages visited`);
527
+ emit({
528
+ type: 'done',
529
+ answer: answerText || JSON.stringify(finalData),
530
+ sources: sourcesDetailed,
531
+ tokensUsed: totalUsage,
532
+ });
249
533
  return {
250
534
  success: true,
251
535
  data: finalData,
536
+ answer: answerText,
252
537
  sources,
538
+ sourcesDetailed,
253
539
  pagesVisited,
254
540
  creditsUsed,
541
+ tokensUsed: totalUsage,
255
542
  };
256
543
  }
257
544
  catch (error) {
@@ -260,8 +547,10 @@ export async function runAgent(options) {
260
547
  success: false,
261
548
  data: { error: error.message || 'Unknown error occurred' },
262
549
  sources,
550
+ sourcesDetailed,
263
551
  pagesVisited,
264
552
  creditsUsed,
553
+ tokensUsed: totalUsage,
265
554
  };
266
555
  }
267
556
  }