@pi-unipi/web-api 0.1.14 → 0.1.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,306 @@
1
+ /**
2
+ * @unipi/web-api — Format & Error Builders
3
+ *
4
+ * Output formatting, content truncation, and error text builders.
5
+ */
6
+
7
+ import type {
8
+ FetchResult,
9
+ FetchError,
10
+ BatchFetchResult,
11
+ } from "./types.js";
12
+ import { DEFAULT_MAX_CHARS } from "./constants.js";
13
+
/** Truncation marker appended to truncated content */
const TRUNCATION_MARKER = "\n\n... [truncated]";

/**
 * Truncate content so the result never exceeds a maximum character count.
 *
 * Content that already fits (or is empty) is returned untouched. Otherwise the
 * content is cut, preferably at a space close to the limit, and the truncation
 * marker is appended; marker included, the result stays within `maxChars`.
 *
 * When `maxChars` leaves no room for any content next to the marker, the
 * content is hard-cut to `maxChars` characters and no marker is added, so the
 * limit is still honoured.
 *
 * @param content - Content to truncate
 * @param maxChars - Maximum characters of the returned string
 * @returns The original content, or a shortened version of it
 */
export function truncateContent(
  content: string,
  maxChars: number = DEFAULT_MAX_CHARS
): string {
  if (!content || content.length <= maxChars) {
    return content;
  }

  // Characters left for real content once the marker is accounted for.
  const targetLength = maxChars - TRUNCATION_MARKER.length;
  if (targetLength <= 0) {
    // A negative end index would make slice() count from the END of the
    // string and return far more than maxChars, so cut hard instead.
    return sliceKeepingSurrogatePairs(content, Math.max(0, maxChars));
  }

  let truncated = sliceKeepingSurrogatePairs(content, targetLength);

  // Back up to the last space if it is close to the limit, to avoid
  // cutting in the middle of a word.
  const lastSpace = truncated.lastIndexOf(" ");
  if (lastSpace > targetLength * 0.8) {
    truncated = truncated.slice(0, lastSpace);
  }

  return truncated.trim() + TRUNCATION_MARKER;
}

/**
 * Take the first `end` UTF-16 code units of `text`, dropping a trailing high
 * surrogate so an astral character (e.g. an emoji) is never cut in half.
 */
function sliceKeepingSurrogatePairs(text: string, end: number): string {
  const cut = text.slice(0, end);
  const last = cut.charCodeAt(cut.length - 1);
  return last >= 0xd800 && last <= 0xdbff ? cut.slice(0, -1) : cut;
}
45
+
/**
 * Format a FetchResult into the requested output format.
 *
 * - `"json"`: the whole result is serialized (2-space indent). The limit is
 *   applied to the `content` field BEFORE serializing, so the returned string
 *   is always parseable JSON; the serialized text as a whole may therefore be
 *   longer than `maxChars`.
 * - `"text"`: markdown formatting is stripped from the content, then truncated.
 * - `"html"`: currently returns the content as-is (no markdown→HTML
 *   conversion is implemented), truncated.
 * - `"markdown"` (default): the content, truncated.
 *
 * @param result - Fetch result
 * @param format - Output format
 * @param maxChars - Maximum characters (optional; truncateContent's default applies when omitted)
 * @returns Formatted content string
 */
export function formatContent(
  result: FetchResult,
  format: "markdown" | "html" | "text" | "json" = "markdown",
  maxChars?: number
): string {
  switch (format) {
    case "json":
      // Truncating the serialized text would cut the JSON mid-structure and
      // glue a marker onto it; shorten the content field instead.
      return JSON.stringify(
        { ...result, content: truncateContent(result.content, maxChars) },
        null,
        2
      );

    case "text":
      // Strip markdown formatting for plain text
      return truncateContent(stripMarkdown(result.content), maxChars);

    case "html":
    // For now, html returns the content as-is; a full implementation would
    // convert markdown to HTML.
    case "markdown":
    default:
      return truncateContent(result.content, maxChars);
  }
}
85
+
/**
 * Strip markdown formatting for plain text output.
 *
 * The order of the passes is deliberate:
 * 1. Fenced code blocks are dropped entirely.
 * 2. Images are removed BEFORE links: an image is a link with a leading "!",
 *    so running the link rule first would leave "!alt" behind.
 * 3. Line-level constructs (headers, horizontal rules, blockquotes, list
 *    markers) are removed before emphasis, so a "*" bullet or a "***" rule is
 *    not mistaken for an emphasis delimiter.
 * 4. Emphasis is stripped only outside inline code spans; the text inside
 *    backticks is kept verbatim (minus the backticks).
 *
 * @param markdown - Markdown content
 * @returns Plain text
 */
function stripMarkdown(markdown: string): string {
  let text = markdown;

  // Remove fenced code blocks
  text = text.replace(/```[\s\S]*?```/g, "");

  // Remove images, then reduce links to their text
  text = text.replace(/!\[([^\]]*)\]\([^)]+\)/g, "");
  text = text.replace(/\[([^\]]*)\]\([^)]+\)/g, "$1");

  // Remove headers
  text = text.replace(/^#{1,6}\s+/gm, "");

  // Remove horizontal rules
  text = text.replace(/^[-*_]{3,}$/gm, "");

  // Remove blockquotes
  text = text.replace(/^>\s+/gm, "");

  // Remove list markers. Only spaces/tabs are allowed around the marker:
  // "\s" would also match newlines and swallow the blank line before a list.
  text = text.replace(/^[ \t]*[-*+][ \t]+/gm, "");
  text = text.replace(/^[ \t]*\d+\.[ \t]+/gm, "");

  // Split on inline code spans; with a capturing group, odd indices are the
  // code spans themselves. Unwrap those, strip emphasis everywhere else.
  text = text
    .split(/(`[^`]+`)/)
    .map((segment, index) =>
      index % 2 === 1 ? segment.slice(1, -1) : stripEmphasis(segment)
    )
    .join("");

  // Clean up extra whitespace
  text = text.replace(/\n{3,}/g, "\n\n");
  return text.trim();
}

/**
 * Remove bold/italic delimiters from a text segment that contains no code.
 *
 * Delimiters must hug non-space text ("2 * 3 * 4" is left alone), and
 * underscore delimiters must not touch a word character on the outside, so
 * identifiers such as snake_case_name keep their underscores.
 */
function stripEmphasis(segment: string): string {
  return segment
    .replace(/\*\*\*(\S|\S.*?\S)\*\*\*/g, "$1")
    .replace(/\*\*(\S|\S.*?\S)\*\*/g, "$1")
    .replace(/\*(\S|\S.*?\S)\*/g, "$1")
    .replace(/(^|\W)___(\S|\S.*?\S)___(?!\w)/g, "$1$2")
    .replace(/(^|\W)__(\S|\S.*?\S)__(?!\w)/g, "$1$2")
    .replace(/(^|\W)_(\S|\S.*?\S)_(?!\w)/g, "$1$2");
}
132
+
/**
 * Build a human-readable, multi-line error message from a FetchError.
 *
 * Lines, in order (optional ones only when the field is present):
 * message, "(code during phase)", URL (with "→ finalUrl" when it differs),
 * HTTP status, content type, expected size, download percentage, and a
 * closing sentence stating whether the error may be retried.
 *
 * @param error - Fetch error
 * @returns Human-readable error string, lines joined with "\n"
 */
export function buildErrorText(error: FetchError): string {
  const parts: string[] = [];

  // Main error message
  parts.push(error.error);

  // Code and phase context
  parts.push(`(${error.code} during ${error.phase})`);

  // URL context
  if (error.url) {
    if (error.finalUrl && error.finalUrl !== error.url) {
      parts.push(`URL: ${error.url} → ${error.finalUrl}`);
    } else {
      parts.push(`URL: ${error.url}`);
    }
  }

  // HTTP status
  if (error.statusCode) {
    parts.push(`Status: ${error.statusCode}${error.statusText ? ` ${error.statusText}` : ""}`);
  }

  // Network details
  if (error.mimeType) {
    parts.push(`Content-Type: ${error.mimeType}`);
  }
  if (error.contentLength !== undefined) {
    parts.push(`Size: ${formatByteSize(error.contentLength)}`);
  }
  // The percentage needs a non-zero expected size to divide by.
  if (error.downloadedBytes !== undefined && error.contentLength) {
    const percent = Math.round((error.downloadedBytes / error.contentLength) * 100);
    parts.push(`Downloaded: ${percent}%`);
  }

  // Retry hint
  if (error.retryable) {
    parts.push("This error may be retried.");
  } else {
    parts.push("This error is not retryable.");
  }

  return parts.join("\n");
}

/**
 * Render a byte count for display: whole kilobytes from 1 KB upwards, plain
 * bytes below that (rounding small payloads to KB would print "0 KB").
 */
function formatByteSize(bytes: number): string {
  return bytes < 1024 ? `${bytes} B` : `${Math.round(bytes / 1024)} KB`;
}
184
+
/**
 * Format a single FetchResult for display.
 *
 * In verbose mode the content is preceded by a metadata header: a "# title"
 * line ("Untitled" when the title is empty), the requested URL, the final URL
 * when one is set and differs from the requested one, any non-empty
 * author/published/site/language fields, the word count, and a "---" rule.
 *
 * @param result - Fetch result
 * @param verbose - Include metadata header
 * @returns Formatted string, lines joined with "\n"
 */
export function formatSingleResult(
  result: FetchResult,
  verbose: boolean = true
): string {
  const lines: string[] = [];

  if (verbose) {
    // Metadata header
    lines.push(`# ${result.title || "Untitled"}`);
    lines.push("");
    lines.push(`URL: ${result.url}`);
    // Same guard as the error formatters: an empty final URL is "not set",
    // not "redirected to nowhere".
    if (result.finalUrl && result.finalUrl !== result.url) {
      lines.push(`Final URL: ${result.finalUrl}`);
    }
    if (result.author) {
      lines.push(`Author: ${result.author}`);
    }
    if (result.published) {
      lines.push(`Published: ${result.published}`);
    }
    if (result.site) {
      lines.push(`Site: ${result.site}`);
    }
    if (result.language) {
      lines.push(`Language: ${result.language}`);
    }
    lines.push(`Word count: ${result.wordCount}`);
    lines.push("");
    lines.push("---");
    lines.push("");
  }

  // Content
  lines.push(result.content);

  return lines.join("\n");
}
229
+
/**
 * Format a BatchFetchResult for display.
 *
 * Output is a summary header (total / succeeded / failed) followed by one
 * "## [i/total] ✓|✗" section per item, in the order of `result.items`.
 * Successful items show title ("Untitled" when empty, matching
 * formatSingleResult), URL, word count and a preview of at most 500
 * characters of content; failed items show the URL (or "unknown") and the
 * error message.
 *
 * @param result - Batch fetch result
 * @returns Formatted string, lines joined with "\n"
 */
export function formatBatchResult(result: BatchFetchResult): string {
  const PREVIEW_CHARS = 500;
  const lines: string[] = [];

  // Summary header
  lines.push(`# Batch Read Results`);
  lines.push("");
  lines.push(
    `Total: ${result.total} · Succeeded: ${result.succeeded} · Failed: ${result.failed}`
  );
  lines.push("");

  // Per-item results
  for (let i = 0; i < result.items.length; i++) {
    const item = result.items[i];
    lines.push(`## [${i + 1}/${result.total}] ${item.status === "done" ? "✓" : "✗"}`);

    if (item.status === "done") {
      const { title, url, wordCount, content } = item.result;
      lines.push(`Title: ${title || "Untitled"}`);
      lines.push(`URL: ${url}`);
      lines.push(`Words: ${wordCount}`);

      // Content preview
      const clipped = content.length > PREVIEW_CHARS;
      let preview = content.slice(0, PREVIEW_CHARS);
      if (clipped) {
        // Do not end the preview on half of a surrogate pair.
        const last = preview.charCodeAt(preview.length - 1);
        if (last >= 0xd800 && last <= 0xdbff) {
          preview = preview.slice(0, -1);
        }
      }
      lines.push("");
      lines.push(preview + (clipped ? "..." : ""));
    } else {
      lines.push(`URL: ${item.error.url || "unknown"}`);
      lines.push(`Error: ${item.error.error}`);
    }

    lines.push("");
  }

  return lines.join("\n");
}
270
+
/**
 * Render a FetchError as a small markdown report.
 *
 * Always present: a "# Fetch Error" heading, the message in bold, and the
 * code and phase in backticks. Added when available: the requested URL (plus
 * the final URL when it differs), the HTTP status, and a closing note when
 * the error is marked retryable.
 *
 * @param error - Fetch error
 * @returns Formatted error string
 */
export function formatErrorResult(error: FetchError): string {
  const { code, phase, url, finalUrl, statusCode, statusText, retryable } = error;

  const out: string[] = [
    `# Fetch Error`,
    "",
    `**${error.error}**`,
    "",
    `Code: \`${code}\``,
    `Phase: \`${phase}\``,
  ];

  if (url) {
    out.push("", `URL: ${url}`);
    if (finalUrl && finalUrl !== url) {
      out.push(`Final URL: ${finalUrl}`);
    }
  }

  if (statusCode) {
    const reason = statusText ? ` ${statusText}` : "";
    out.push("", `HTTP Status: ${statusCode}${reason}`);
  }

  if (retryable) {
    out.push("", `*This error may be retried.*`);
  }

  return out.join("\n");
}
@@ -0,0 +1,102 @@
1
+ /**
2
+ * @unipi/web-api — Browser Profile Resolution
3
+ *
4
+ * Resolves browser TLS fingerprint profiles for wreq-js.
5
+ */
6
+
7
+ import { DEFAULT_BROWSER, DEFAULT_OS } from "./constants.js";
8
+
/**
 * Browser profile identifiers known to this package, used for TLS
 * fingerprinting. Grouped by browser family; within a family the entries are
 * listed in ascending version order (prefix resolution picks the last match).
 */
export const BROWSER_PROFILES = [
  // Chrome
  "chrome_100", "chrome_101", "chrome_104", "chrome_107", "chrome_110",
  "chrome_116", "chrome_119", "chrome_120", "chrome_123", "chrome_124",
  "chrome_131", "chrome_133", "chrome_145",
  // Firefox
  "firefox_120", "firefox_133",
  // Safari
  "safari_15_6_1", "safari_16_0", "safari_17_0",
  // Edge
  "edge_101",
] as const;
35
+
/** Operating-system identifiers accepted as OS fingerprint options. */
export const OS_PROFILES = ["windows", "macos", "linux", "android", "ios"] as const;
44
+
/** Union of the string literals listed in BROWSER_PROFILES. */
export type BrowserProfile = typeof BROWSER_PROFILES[number];
47
+
/** Union of the string literals listed in OS_PROFILES. */
export type OSProfile = typeof OS_PROFILES[number];
50
+
/**
 * Turn a caller-supplied browser string into a profile name.
 *
 * Resolution order:
 * 1. nothing supplied → DEFAULT_BROWSER;
 * 2. exact (case-sensitive) entry of BROWSER_PROFILES → returned unchanged;
 * 3. case-insensitive prefix of one or more entries (e.g. "chrome") → the
 *    last such entry in BROWSER_PROFILES, i.e. the newest listed version;
 * 4. anything else → returned unchanged, since wreq-js may support profiles
 *    that are not listed here.
 *
 * @param browser - Browser profile string or undefined
 * @returns Resolved browser profile
 */
export function resolveBrowserProfile(browser?: string): string {
  if (!browser) {
    return DEFAULT_BROWSER;
  }

  const known: readonly string[] = BROWSER_PROFILES;
  if (known.includes(browser)) {
    return browser;
  }

  // Walk the whole list and remember the last entry the input is a prefix of.
  const needle = browser.toLowerCase();
  let newest: string | undefined;
  for (const candidate of known) {
    if (candidate.toLowerCase().startsWith(needle)) {
      newest = candidate;
    }
  }

  return newest !== undefined ? newest : browser;
}
83
+
/**
 * Resolve an OS fingerprint string.
 *
 * - Omitted or empty → DEFAULT_OS.
 * - A value that matches an entry of OS_PROFILES, ignoring letter case and
 *   surrounding whitespace (e.g. "Windows", " macos ") → the canonical
 *   lower-case entry. This mirrors the case-insensitive matching done by
 *   resolveBrowserProfile.
 * - Any other value → passed through unchanged, so OS strings not listed
 *   here can still be forwarded.
 *
 * @param os - OS string or undefined
 * @returns Canonical known OS string, the default, or the input as given
 */
export function resolveOSProfile(os?: string): string {
  if (!os) {
    return DEFAULT_OS;
  }

  const normalized = os.trim().toLowerCase();
  if ((OS_PROFILES as readonly string[]).includes(normalized)) {
    return normalized;
  }

  // Pass through unknown values
  return os;
}
@@ -0,0 +1,169 @@
1
+ /**
2
+ * @unipi/web-api — Smart-Fetch Engine Types
3
+ *
4
+ * Type definitions for the local smart-fetch engine:
5
+ * wreq-js (TLS fingerprinting) + defuddle (content extraction) + linkedom (server-side DOM).
6
+ */
7
+
8
+ // ─── Error Types ───────────────────────────────────────────────
9
+
/** Machine-readable category assigned to a fetch failure. */
export type FetchErrorCode =
  | "invalid_url" | "unsupported_protocol"
  | "http_error" | "unexpected_response"
  | "timeout" | "network_error"
  | "processing_error" | "download_error"
  | "no_content" | "too_many_redirects";
22
+
/** Stage of the fetch pipeline in which a failure happened. */
export type FetchErrorPhase =
  | "validation" | "connecting" | "waiting"
  | "loading" | "processing" | "unknown";
31
+
/**
 * Failure description carrying enough structured context for an agent to
 * decide whether retrying makes sense.
 */
export interface FetchError {
  /** Message intended for humans */
  error: string;
  /** Error category */
  code: FetchErrorCode;
  /** Pipeline stage where the failure occurred */
  phase: FetchErrorPhase;
  /** True when the agent may retry the request */
  retryable: boolean;
  /** Timeout that was configured, in milliseconds */
  timeoutMs?: number;
  /** URL as originally requested */
  url?: string;
  /** URL reached after following redirects */
  finalUrl?: string;
  /** HTTP status code, when applicable */
  statusCode?: number;
  /** HTTP status text, when applicable */
  statusText?: string;
  /** Content type of the response */
  mimeType?: string;
  /** Expected size of the response, in bytes */
  contentLength?: number;
  /** Number of bytes downloaded before the failure */
  downloadedBytes?: number;
}
59
+
60
+ // ─── Result Types ──────────────────────────────────────────────
61
+
/** Outcome of a successful fetch: the extracted content plus its metadata. */
export interface FetchResult {
  /** URL as originally requested */
  url: string;
  /** URL reached after following redirects */
  finalUrl: string;
  /** Title of the page */
  title: string;
  /** Author of the article, when it could be extracted */
  author: string;
  /** Publication date (ISO 8601 when available) */
  published: string;
  /** Name of the site */
  site: string;
  /** Language of the content (BCP 47) */
  language: string;
  /** Number of words in the extracted content */
  wordCount: number;
  /** The extracted, formatted content */
  content: string;
  /** Format of the output */
  format: "markdown" | "html" | "text" | "json";
  /** MIME type of the response */
  mimeType: string;
}
87
+
88
+ // ─── Options ───────────────────────────────────────────────────
89
+
/** Settings accepted by the smart-fetch engine; every field is optional. */
export interface FetchOptions {
  /** Profile used for TLS fingerprinting, e.g. "chrome_145" */
  browser?: string;
  /** Operating system to fingerprint as, e.g. "windows" */
  os?: string;
  /** Format of the output */
  format?: "markdown" | "html" | "text" | "json";
  /** Upper bound on the characters of output content */
  maxChars?: number;
  /** Timeout for the request, in milliseconds */
  timeoutMs?: number;
  /** When set, image references are stripped from the content */
  removeImages?: boolean;
  /** Whether to include replies/comments: true, false, or "extractors" */
  includeReplies?: boolean | "extractors";
  /** URL of a proxy */
  proxy?: string;
  /** Extra HTTP headers */
  headers?: Record<string, string>;
}
111
+
112
+ // ─── Batch Types ───────────────────────────────────────────────
113
+
/**
 * Outcome for one URL of a batch fetch, discriminated by `status`:
 * "done" carries the result, "error" carries the failure.
 */
export type BatchFetchItemResult =
  | {
      status: "done";
      result: FetchResult;
    }
  | {
      status: "error";
      error: FetchError;
    };
118
+
/** Aggregate outcome of a batch fetch. */
export interface BatchFetchResult {
  /** Number of URLs requested */
  total: number;
  /** Number fetched successfully */
  succeeded: number;
  /** Number that failed */
  failed: number;
  /** One entry per URL, in input order */
  items: BatchFetchItemResult[];
}
130
+
131
+ // ─── Progress Types ────────────────────────────────────────────
132
+
/** Pipeline status reported for a single URL. */
export type FetchProgressStatus =
  | "queued" | "connecting" | "waiting" | "loading"
  | "processing" | "done" | "error";
142
+
/** Snapshot of how far the fetch of one URL has progressed. */
export interface FetchProgress {
  /** The URL this update is about */
  url: string;
  /** Status within the pipeline */
  status: FetchProgressStatus;
  /** Percentage completed (0-100) */
  percent: number;
  /** Bytes received so far */
  bytesLoaded: number;
  /** Bytes expected in total */
  bytesTotal: number;
  /** Label of the current phase */
  phase: string;
  /** Failure details, when status is "error" */
  error?: FetchError;
}
160
+
161
+ // ─── Execution Hooks ───────────────────────────────────────────
162
+
/** Optional callbacks for observing the progress of fetch execution. */
export interface FetchExecutionHooks {
  /** Receives progress updates for a single URL fetch */
  onProgress?: (progress: FetchProgress) => void;
  /** Receives the full progress snapshot for batch fetches */
  onUpdate?: (progress: FetchProgress[]) => void;
}
package/src/index.ts CHANGED
@@ -2,7 +2,7 @@
2
2
  * @unipi/web-api — Extension entry
3
3
  *
4
4
  * Web search, read, and summarize tools with provider-based backend selection.
5
- * Provides agent tools: web-search, web-read, web-llm-summarize
5
+ * Provides agent tools: web-search, multi-web-content-read, web-llm-summarize
6
6
  */
7
7
 
8
8
  import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
@@ -15,7 +15,8 @@ import {
15
15
  import { registerWebTools, WEB_TOOLS } from "./tools.js";
16
16
  import { registerWebCommands, WEB_COMMANDS } from "./commands.js";
17
17
  import { webCache } from "./cache.js";
18
- import { loadConfig } from "./settings.js";
18
+ import { loadConfig, loadSmartFetchSettings } from "./settings.js";
19
+ import { checkDependencies } from "./engine/dependencies.js";
19
20
  import "./providers/duckduckgo.js";
20
21
  import "./providers/jina-search.js";
21
22
  import "./providers/jina-reader.js";
@@ -79,6 +80,7 @@ export default function (pi: ExtensionAPI) {
79
80
  showByDefault: true,
80
81
  stats: [
81
82
  { id: "providers", label: "Enabled Providers", show: true },
83
+ { id: "smartFetch", label: "Smart-Fetch", show: true },
82
84
  { id: "cacheEntries", label: "Cache Entries", show: true },
83
85
  { id: "cacheSize", label: "Cache Size", show: true },
84
86
  { id: "expired", label: "Expired Entries", show: true },
@@ -91,8 +93,13 @@ export default function (pi: ExtensionAPI) {
91
93
  (p) => p.enabled
92
94
  ).length;
93
95
 
96
+ // Check smart-fetch engine availability
97
+ const deps = await checkDependencies();
98
+ const smartFetchStatus = deps.available ? "✓ Ready" : `Missing: ${deps.missing.join(", ")}`;
99
+
94
100
  return {
95
101
  providers: { value: String(enabledCount) },
102
+ smartFetch: { value: smartFetchStatus },
96
103
  cacheEntries: { value: String(stats.totalEntries) },
97
104
  cacheSize: { value: `${(stats.totalSizeBytes / 1024).toFixed(1)} KB` },
98
105
  expired: { value: String(stats.expiredEntries) },
@@ -7,7 +7,15 @@
7
7
  /** Supported capabilities for web providers */
8
8
  export type WebCapability = "search" | "read" | "summarize";
9
9
 
10
- /** Ranking structure for provider selection */
10
+ /**
11
+ * Ranking for capability selection.
12
+ * Lower number = simpler/cheaper provider (preferred for auto-selection).
13
+ * 0 means provider doesn't support that capability.
14
+ *
15
+ * Note: For "read" capability, rank 0 is reserved for the smart-fetch engine.
16
+ * It is not a registered provider, but the default when source=0 or omitted.
17
+ * Registered read providers use ranks 1+ (Jina Reader=1, Firecrawl=2, Perplexity=3).
18
+ */
11
19
  export interface ProviderRanking {
12
20
  search: number;
13
21
  read: number;