@pi-unipi/web-api 0.1.13 → 0.1.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/settings.ts CHANGED
@@ -27,12 +27,42 @@ export interface CacheSettings {
27
27
  ttlMs: number;
28
28
  }
29
29
 
/**
 * Option set that configures the smart-fetch engine.
 *
 * Every field is required here; callers that persist only some of them
 * use `Partial<SmartFetchSettings>`.
 */
export interface SmartFetchSettings {
  /** Browser profile used for the TLS fingerprint. */
  browser: string;
  /** Operating system used for the fingerprint. */
  os: string;
  /** Upper bound on the number of content characters. */
  maxChars: number;
  /** Request timeout, in milliseconds. */
  timeoutMs: number;
  /** Concurrency used for batch fetches. */
  batchConcurrency: number;
  /** Whether image references are stripped from the content. */
  removeImages: boolean;
  /** Whether replies/comments are included; `"extractors"` is accepted in place of a boolean. */
  includeReplies: boolean | "extractors";
}
47
+
30
48
  /** Config storage structure */
31
49
  export interface WebApiConfig {
32
50
  providers: Record<string, ProviderSettings>;
33
51
  cache: CacheSettings;
52
+ smartFetch?: Partial<SmartFetchSettings>;
34
53
  }
35
54
 
/** Baseline smart-fetch values used for every field the saved config does not override. */
const DEFAULT_SMART_FETCH_SETTINGS: SmartFetchSettings = {
  browser: "chrome_145",
  os: "windows",
  maxChars: 50 * 1000,
  timeoutMs: 15 * 1000, // 15 seconds
  batchConcurrency: 8,
  removeImages: false,
  includeReplies: "extractors",
};
65
+
36
66
  /** Default configuration */
37
67
  const DEFAULT_CONFIG: WebApiConfig = {
38
68
  providers: {
@@ -49,6 +79,7 @@ const DEFAULT_CONFIG: WebApiConfig = {
49
79
  enabled: true,
50
80
  ttlMs: 3600000, // 1 hour
51
81
  },
82
+ smartFetch: {},
52
83
  };
53
84
 
54
85
  /**
@@ -94,8 +125,8 @@ export function loadAuth(): WebApiAuth {
94
125
  const content = fs.readFileSync(authPath, "utf-8");
95
126
  return JSON.parse(content);
96
127
  }
97
- } catch (error) {
98
- console.error("[web-api] Failed to load auth:", error);
128
+ } catch {
129
+ // Silently ignore: an auth load failure simply returns an empty object.
99
130
  }
100
131
  return {};
101
132
  }
@@ -133,8 +164,8 @@ export function loadConfig(): WebApiConfig {
133
164
  },
134
165
  };
135
166
  }
136
- } catch (error) {
137
- console.error("[web-api] Failed to load config:", error);
167
+ } catch {
168
+ // Silently ignore: a config load failure falls back to the defaults.
138
169
  }
139
170
  return DEFAULT_CONFIG;
140
171
  }
@@ -261,3 +292,38 @@ export function validateApiKeyFormat(providerId: string, apiKey: string): boolea
261
292
  return apiKey.length >= 8;
262
293
  }
263
294
  }
295
+
/**
 * Resolve the effective smart-fetch settings.
 *
 * Starts from the built-in defaults and layers whatever the loaded config
 * stores under `smartFetch` on top of them.
 *
 * @returns A complete settings object
 */
export function loadSmartFetchSettings(): SmartFetchSettings {
  const saved = loadConfig().smartFetch ?? {};
  const merged: SmartFetchSettings = {
    ...DEFAULT_SMART_FETCH_SETTINGS,
    ...saved,
  };
  return merged;
}
308
+
/**
 * Persist smart-fetch settings.
 *
 * The given fields are merged over the smart-fetch fields already stored in
 * the config; fields not mentioned in `settings` are left as they are.
 *
 * @param settings - Partial settings to save
 */
export function saveSmartFetchSettings(settings: Partial<SmartFetchSettings>): void {
  const current = loadConfig();
  // Build a fresh config object instead of assigning onto `current`:
  // loadConfig() can hand back the shared default config object, and writing
  // to it in place would silently change the in-process defaults.
  const next: WebApiConfig = {
    ...current,
    smartFetch: {
      ...current.smartFetch,
      ...settings,
    },
  };
  saveConfig(next);
}
321
+
/**
 * Reset smart-fetch settings to defaults.
 *
 * Stores an empty override object, so every field resolves to its built-in
 * default the next time the settings are loaded.
 */
export function resetSmartFetchSettings(): void {
  // Copy rather than mutate: loadConfig() can return the shared default
  // config object, which must not be modified in place.
  const next: WebApiConfig = { ...loadConfig(), smartFetch: {} };
  saveConfig(next);
}
package/src/tools.ts CHANGED
@@ -1,7 +1,7 @@
1
1
  /**
2
2
  * @unipi/web-api — Agent tools registration
3
3
  *
4
- * Registers web-search, web-read, and web-llm-summarize tools.
4
+ * Registers web-search, multi-web-content-read, and web-llm-summarize tools.
5
5
  * Implements smart provider selection based on ranking.
6
6
  */
7
7
 
@@ -19,12 +19,20 @@ import {
19
19
  getApiKey,
20
20
  isProviderEnabled,
21
21
  loadConfig,
22
+ loadSmartFetchSettings,
22
23
  } from "./settings.js";
24
+ import { webCache } from "./cache.js";
25
+ import {
26
+ defuddleFetch,
27
+ defuddleFetchMultiple,
28
+ } from "./engine/extract.js";
29
+ import type { FetchOptions, FetchResult, BatchFetchResult } from "./engine/types.js";
30
+ import { formatSingleResult, formatBatchResult, formatErrorResult } from "./engine/format.js";
23
31
 
24
32
  /** Tool names */
25
33
  export const WEB_TOOLS = {
26
34
  SEARCH: "web_search",
27
- READ: "web_read",
35
+ READ: "multi_web_content_read",
28
36
  SUMMARIZE: "web_llm_summarize",
29
37
  } as const;
30
38
 
@@ -113,9 +121,9 @@ async function executeSearch(
113
121
  }
114
122
 
115
123
  /**
116
- * Execute web read.
124
+ * Execute web read via provider.
117
125
  */
118
- async function executeRead(
126
+ async function executeProviderRead(
119
127
  url: string,
120
128
  sourceRank?: number
121
129
  ): Promise<ReadResult> {
@@ -151,6 +159,85 @@ async function executeSummarize(
151
159
  return provider.summarize(url, prompt, config);
152
160
  }
153
161
 
/**
 * Generate cache key for smart-fetch results.
 *
 * The key covers every option that is forwarded to the fetch and can change
 * the returned content, so two requests that differ only in, say,
 * `removeImages` or `os` never share a cache entry. The timeout is
 * deliberately left out because it does not alter the content.
 *
 * @param url - URL being fetched
 * @param options - Fetch options (raw or already resolved against defaults)
 * @returns Unambiguous string key
 */
function generateSmartFetchKey(
  url: string,
  options: Partial<FetchOptions>
): string {
  // JSON-encode an array instead of joining on ":" — URLs and proxy strings
  // contain ":" themselves, which made the joined form ambiguous.
  return JSON.stringify([
    url,
    // `||` for string/number fields: empty and zero are treated as "unset".
    options.browser || "",
    options.os || "",
    options.format || "",
    options.maxChars || "",
    // `??` for the flag-like fields: an explicit `false` is a real choice.
    options.removeImages ?? "",
    options.includeReplies ?? "",
    options.proxy || "",
    // NOTE(review): assumes headers is a plain JSON-serializable object — confirm.
    options.headers ?? "",
  ]);
}
177
+
/**
 * Execute smart-fetch read (single URL).
 *
 * Caller options are resolved against the saved smart-fetch defaults first,
 * and the cache is keyed on the resolved options. Keying on the raw caller
 * options would let a change to the saved defaults keep returning an entry
 * that was fetched under the old defaults.
 *
 * @param url - URL to fetch
 * @param options - Per-call overrides; unset fields fall back to defaults
 * @returns The cached result when present, otherwise a freshly fetched one
 */
async function executeSmartFetchRead(
  url: string,
  options: Partial<FetchOptions> = {}
): Promise<FetchResult> {
  // Resolve effective options
  const defaults = loadSmartFetchSettings();
  const fetchOptions: FetchOptions = {
    browser: options.browser || defaults.browser,
    os: options.os || defaults.os,
    format: options.format || "markdown",
    maxChars: options.maxChars || defaults.maxChars,
    timeoutMs: options.timeoutMs || defaults.timeoutMs,
    removeImages: options.removeImages ?? defaults.removeImages,
    includeReplies: options.includeReplies ?? defaults.includeReplies,
    proxy: options.proxy,
    headers: options.headers,
  };

  // Check cache, keyed on what will actually be sent
  const cacheKey = generateSmartFetchKey(url, fetchOptions);
  const cached = webCache.get(cacheKey, "smart-fetch");
  if (cached) {
    return cached as FetchResult;
  }

  // Execute fetch
  const result = await defuddleFetch(url, fetchOptions);

  // Cache result
  webCache.set(cacheKey, "smart-fetch", result);

  return result;
}
214
+
/**
 * Execute smart-fetch batch read.
 *
 * Fills every option the caller left unset from the saved smart-fetch
 * settings, then hands all URLs to the batch fetcher in one call.
 *
 * @param urls - URLs to fetch
 * @param options - Per-call overrides, including the batch concurrency
 * @returns The batch fetcher's result
 */
async function executeSmartFetchBatch(
  urls: string[],
  options: Partial<FetchOptions> & { batchConcurrency?: number } = {}
): Promise<BatchFetchResult> {
  const saved = loadSmartFetchSettings();
  const {
    browser,
    os,
    format,
    maxChars,
    timeoutMs,
    removeImages,
    includeReplies,
    proxy,
    headers,
    batchConcurrency,
  } = options;

  const resolved: FetchOptions & { batchConcurrency?: number } = {
    browser: browser || saved.browser,
    os: os || saved.os,
    format: format || "markdown",
    maxChars: maxChars || saved.maxChars,
    timeoutMs: timeoutMs || saved.timeoutMs,
    removeImages: removeImages ?? saved.removeImages,
    includeReplies: includeReplies ?? saved.includeReplies,
    proxy,
    headers,
    batchConcurrency: batchConcurrency || saved.batchConcurrency,
  };

  return defuddleFetchMultiple(urls, resolved);
}
240
+
154
241
  /**
155
242
  * Register web tools with pi.
156
243
  */
@@ -221,43 +308,213 @@ export function registerWebTools(pi: ExtensionAPI): void {
221
308
  },
222
309
  });
223
310
 
224
- // --- web_read tool ---
311
+ // --- multi_web_content_read tool ---
225
312
  pi.registerTool({
226
313
  name: WEB_TOOLS.READ,
227
- label: "Web Read",
314
+ label: "Multi Web Content Read",
228
315
  description:
229
- "Read and extract content from a URL. " +
230
- "Extracts main content, strips navigation/ads. Returns markdown.",
231
- promptSnippet: "Read content from a URL.",
316
+ "Read and extract content from URLs using the smart-fetch engine (default) or provider fallbacks. " +
317
+ "Supports single URL or batch URLs. " +
318
+ "Returns clean markdown with metadata (title, author, site, word count).",
319
+ promptSnippet: "Read content from one or more URLs.",
232
320
  promptGuidelines: [
233
- "Use web_read to extract content from a web page.",
234
- "Returns main content as markdown.",
235
- "Lower source = simpler providers (Jina Reader).",
236
- "Higher source = more capable providers (Firecrawl, Perplexity).",
321
+ "Use multi_web_content_read to extract content from web pages.",
322
+ "Pass a single URL string or an array of URLs for batch reading.",
323
+ "Default source (0 or omitted) uses the local smart-fetch engine — free, no API key.",
324
+ "source 1-3 uses provider fallbacks: Jina Reader, Firecrawl, Perplexity.",
325
+ "Batch mode: pass an array of URLs, returns results for each.",
237
326
  ],
238
327
  parameters: Type.Object({
239
- url: Type.String({ description: "URL to read" }),
328
+ url: Type.Union([
329
+ Type.String({ description: "Single URL to read" }),
330
+ Type.Array(Type.String(), { description: "Array of URLs to read in batch" }),
331
+ ], { description: "URL or array of URLs to read" }),
240
332
  source: Type.Optional(
241
333
  Type.Number({
242
334
  description:
243
- "Provider selection (1=Jina Reader, 2=Firecrawl, 3=Perplexity). " +
244
- "Omit for auto-selection.",
245
- minimum: 1,
335
+ "Provider selection (0=smart-fetch engine, 1=Jina Reader, 2=Firecrawl, 3=Perplexity). " +
336
+ "Default is 0 (smart-fetch).",
337
+ minimum: 0,
246
338
  maximum: 3,
247
339
  })
248
340
  ),
341
+ browser: Type.Optional(
342
+ Type.String({
343
+ description: "TLS fingerprint browser profile (e.g., chrome_145). Default: chrome_145.",
344
+ })
345
+ ),
346
+ os: Type.Optional(
347
+ Type.String({
348
+ description: "OS fingerprint (windows, macos, linux). Default: windows.",
349
+ })
350
+ ),
351
+ format: Type.Optional(
352
+ Type.Union([
353
+ Type.Literal("markdown"),
354
+ Type.Literal("html"),
355
+ Type.Literal("text"),
356
+ Type.Literal("json"),
357
+ ], { description: "Output format. Default: markdown." })
358
+ ),
359
+ maxChars: Type.Optional(
360
+ Type.Number({
361
+ description: "Maximum characters in output. Default: 50000.",
362
+ })
363
+ ),
364
+ timeoutMs: Type.Optional(
365
+ Type.Number({
366
+ description: "Request timeout in milliseconds. Default: 15000.",
367
+ })
368
+ ),
369
+ removeImages: Type.Optional(
370
+ Type.Boolean({
371
+ description: "Strip image references from content. Default: false.",
372
+ })
373
+ ),
374
+ includeReplies: Type.Optional(
375
+ Type.Union([
376
+ Type.Boolean(),
377
+ Type.Literal("extractors"),
378
+ ], { description: "Include replies/comments. Default: extractors." })
379
+ ),
380
+ proxy: Type.Optional(
381
+ Type.String({
382
+ description: "Proxy URL for requests.",
383
+ })
384
+ ),
385
+ batchConcurrency: Type.Optional(
386
+ Type.Number({
387
+ description: "Concurrent requests for batch mode. Default: 8.",
388
+ })
389
+ ),
390
+ verbose: Type.Optional(
391
+ Type.Boolean({
392
+ description: "Include metadata header in output. Default: true.",
393
+ })
394
+ ),
249
395
  }),
250
396
  async execute(_toolCallId, params, _signal, _onUpdate, _ctx) {
397
+ const source = params.source ?? 0;
398
+ const verbose = params.verbose ?? true;
399
+
251
400
  try {
252
- const result = await executeRead(params.url, params.source);
401
+ // Single URL
402
+ if (typeof params.url === "string") {
403
+ // Provider fallback
404
+ if (source >= 1) {
405
+ const result = await executeProviderRead(params.url, source);
406
+ return {
407
+ content: [
408
+ {
409
+ type: "text",
410
+ text: `Content from ${result.url}:\n\n${result.content}`,
411
+ },
412
+ ],
413
+ details: {},
414
+ };
415
+ }
253
416
 
254
- return {
255
- content: [
256
- {
257
- type: "text",
258
- text: `Content from ${result.url}:\n\n${result.content}`,
417
+ // Smart-fetch engine
418
+ const result = await executeSmartFetchRead(params.url, {
419
+ browser: params.browser,
420
+ os: params.os,
421
+ format: params.format as FetchOptions["format"],
422
+ maxChars: params.maxChars,
423
+ timeoutMs: params.timeoutMs,
424
+ removeImages: params.removeImages,
425
+ includeReplies: params.includeReplies as FetchOptions["includeReplies"],
426
+ proxy: params.proxy,
427
+ });
428
+
429
+ return {
430
+ content: [
431
+ {
432
+ type: "text",
433
+ text: formatSingleResult(result, verbose),
434
+ },
435
+ ],
436
+ details: {
437
+ url: result.url,
438
+ finalUrl: result.finalUrl,
439
+ title: result.title,
440
+ wordCount: result.wordCount,
259
441
  },
260
- ],
442
+ };
443
+ }
444
+
445
+ // Batch URLs
446
+ if (Array.isArray(params.url)) {
447
+ if (params.url.length === 0) {
448
+ return {
449
+ content: [{ type: "text", text: "No URLs provided." }],
450
+ details: {},
451
+ };
452
+ }
453
+
454
+ // Provider fallback for batch (fetch each individually)
455
+ if (source >= 1) {
456
+ const results = await Promise.all(
457
+ params.url.map(async (url) => {
458
+ try {
459
+ const result = await executeProviderRead(url, source);
460
+ return { url, status: "done", content: result.content };
461
+ } catch (error) {
462
+ return {
463
+ url,
464
+ status: "error",
465
+ error: error instanceof Error ? error.message : String(error),
466
+ };
467
+ }
468
+ })
469
+ );
470
+
471
+ const text = results
472
+ .map((r, i) => {
473
+ if (r.status === "done") {
474
+ return `[${i + 1}] ${r.url}\n${r.content}`;
475
+ }
476
+ return `[${i + 1}] ${r.url}\nError: ${r.error}`;
477
+ })
478
+ .join("\n\n---\n\n");
479
+
480
+ return {
481
+ content: [{ type: "text", text }],
482
+ details: { total: results.length },
483
+ };
484
+ }
485
+
486
+ // Smart-fetch batch
487
+ const result = await executeSmartFetchBatch(params.url, {
488
+ browser: params.browser,
489
+ os: params.os,
490
+ format: params.format as FetchOptions["format"],
491
+ maxChars: params.maxChars,
492
+ timeoutMs: params.timeoutMs,
493
+ removeImages: params.removeImages,
494
+ includeReplies: params.includeReplies as FetchOptions["includeReplies"],
495
+ proxy: params.proxy,
496
+ batchConcurrency: params.batchConcurrency,
497
+ });
498
+
499
+ return {
500
+ content: [
501
+ {
502
+ type: "text",
503
+ text: formatBatchResult(result),
504
+ },
505
+ ],
506
+ details: {
507
+ total: result.total,
508
+ succeeded: result.succeeded,
509
+ failed: result.failed,
510
+ },
511
+ };
512
+ }
513
+
514
+ // Should never reach here
515
+ return {
516
+ content: [{ type: "text", text: "Invalid url parameter." }],
517
+ isError: true,
261
518
  details: {},
262
519
  };
263
520
  } catch (error) {
@@ -0,0 +1,168 @@
1
+ /**
2
+ * @unipi/web-api — TUI Progress Renderer
3
+ *
4
+ * Renders batch fetch progress for TUI display.
5
+ */
6
+
7
+ import type { FetchProgress, FetchProgressStatus } from "../engine/types.js";
8
+
/** Animation frames of the spinner, in display order (one character each). */
const SPINNER_FRAMES: string[] = [..."⠋⠙⠹⠸⠼⠴⠦⠧⠇⠏"];
11
+
/**
 * Glyph shown for each fetch status.
 *
 * All in-flight statuses share the first spinner frame as their static glyph.
 */
const STATUS_GLYPHS: Record<FetchProgressStatus, string> = (() => {
  const inFlight = SPINNER_FRAMES[0];
  return {
    queued: "○",
    connecting: inFlight,
    waiting: inFlight,
    loading: inFlight,
    processing: inFlight,
    done: "✓",
    error: "✗",
  };
})();
22
+
/**
 * Get a spinner frame for the given index.
 * Cycles through spinner frames for animation.
 *
 * The index is floored and wrapped into range, so negative or fractional
 * values still select a frame instead of indexing outside the frame list.
 * A non-finite index selects the first frame.
 *
 * @param index - Animation frame index
 * @returns Spinner character
 */
export function getSpinnerFrame(index: number): string {
  const frameCount = SPINNER_FRAMES.length;
  const step = Number.isFinite(index) ? Math.floor(index) : 0;
  // Double modulo keeps the slot non-negative for negative steps.
  const slot = ((step % frameCount) + frameCount) % frameCount;
  return SPINNER_FRAMES[slot];
}
33
+
/**
 * Render a progress bar.
 *
 * The percentage is clamped to 0-100 (NaN counts as 0) and the width is
 * floored to a non-negative whole number, so out-of-range inputs produce a
 * fully empty or fully filled bar instead of a `RangeError` from `repeat()`.
 *
 * @param percent - Progress percentage (0-100)
 * @param width - Bar width in characters
 * @returns Progress bar string, exactly `width` cells long
 */
export function renderProgressBar(percent: number, width: number = 10): string {
  const cells = Number.isFinite(width) ? Math.max(0, Math.floor(width)) : 0;
  const clamped = Number.isNaN(percent) ? 0 : Math.min(100, Math.max(0, percent));
  const filled = Math.round((clamped / 100) * cells);
  return "█".repeat(filled) + "░".repeat(cells - filled);
}
46
+
/**
 * Truncate a URL for display.
 *
 * URLs that already fit are returned unchanged. Longer ones are reduced to
 * host + path + query (scheme and fragment are dropped) and cut with an
 * ellipsis; when the host alone would not fit, or the input does not parse
 * as a URL, the raw string is cut instead.
 *
 * @param url - URL to truncate
 * @param maxLength - Maximum length; a non-positive budget yields ""
 * @returns Truncated URL, never longer than `maxLength`
 */
function truncateUrl(url: string, maxLength: number): string {
  // Without this guard a non-positive budget makes slice() count from the
  // end of the string and returns almost the entire URL.
  if (!(maxLength > 0)) {
    return "";
  }

  if (url.length <= maxLength) {
    return url;
  }

  const hardCut = (): string => url.slice(0, maxLength - 1) + "…";

  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    return hardCut();
  }

  // Try to keep the domain
  const domain = parsed.host;
  const path = parsed.pathname + parsed.search;

  if (domain.length + 3 >= maxLength) {
    return hardCut();
  }

  const remaining = maxLength - domain.length - 3;
  if (path.length <= remaining) {
    return domain + path;
  }

  return domain + path.slice(0, remaining - 1) + "…";
}
79
+
/**
 * Render a single progress item line.
 *
 * The line is laid out as: glyph, URL column, status column, progress bar.
 * In-flight statuses show the animated spinner frame for `spinnerIndex`;
 * other statuses show their static glyph.
 *
 * @param progress - Progress object
 * @param width - Available width
 * @param spinnerIndex - Animation frame index
 * @returns Formatted line
 */
export function renderProgressLine(
  progress: FetchProgress,
  width: number = 80,
  spinnerIndex: number = 0
): string {
  // Status glyph
  const animated = ["connecting", "waiting", "loading", "processing"].includes(
    progress.status
  );
  const glyph = animated ? getSpinnerFrame(spinnerIndex) : STATUS_GLYPHS[progress.status];

  // URL column: at most 40 characters, and never below 1. For width <= 30
  // the unclamped budget is zero or negative, which made the "truncated"
  // URL come out at almost full length.
  const urlMax = Math.max(1, Math.min(40, width - 30));
  const url = truncateUrl(progress.url, urlMax);

  // Progress bar
  const bar = renderProgressBar(progress.percent, 8);

  // Status text: the phase when one is set, otherwise the status itself
  const statusText = progress.phase || progress.status;

  // Format line
  return `${glyph} ${url.padEnd(urlMax)} ${statusText.padEnd(12)} [${bar}]`;
}
112
+
/**
 * Render batch progress header.
 *
 * Reports how many items have finished (succeeded or failed) out of the
 * total, followed by the success and error counts and the concurrency.
 *
 * @param progress - All progress items
 * @param concurrency - Current concurrency
 * @returns Header line
 */
export function renderBatchProgressHeader(
  progress: FetchProgress[],
  concurrency: number
): string {
  const total = progress.length;
  const ok = progress.filter((p) => p.status === "done").length;
  const err = progress.filter((p) => p.status === "error").length;
  // "done" and "error" are separate statuses, so the finished count is their
  // sum. Subtracting errors from the success count under-reported successes
  // and could go negative.
  const finished = ok + err;

  return `batch_web_content_read ${finished}/${total} done · ok ${ok} · err ${err} · concurrency ${concurrency}`;
}
133
+
/**
 * Render full batch progress display.
 *
 * Output is the header line, a blank line, one line per item for the first
 * ten items, and a trailing summary line when further items were left out.
 *
 * @param progress - All progress items
 * @param concurrency - Current concurrency
 * @param width - Available width
 * @param spinnerIndex - Animation frame index
 * @returns Formatted string
 */
export function renderBatchProgress(
  progress: FetchProgress[],
  concurrency: number = 8,
  width: number = 80,
  spinnerIndex: number = 0
): string {
  // Show up to 10 items
  const maxItems = 10;

  const rows = progress
    .slice(0, maxItems)
    .map((item) => renderProgressLine(item, width, spinnerIndex));

  const output = [renderBatchProgressHeader(progress, concurrency), "", ...rows];

  if (progress.length > maxItems) {
    output.push(` ... and ${progress.length - maxItems} more`);
  }

  return output.join("\n");
}