@pi-lab/webfetch 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +15 -0
- package/dist/index.mjs +262 -16
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -16,3 +16,18 @@ pi install npm:@pi-lab/webfetch
|
|
|
16
16
|
- **Redirect handling** — same-domain redirects are followed automatically (up to 10 hops); cross-domain redirects are surfaced to the model so it can decide whether to follow.
|
|
17
17
|
- **Binary downloads** — non-text responses (PDFs, images, etc.) are saved to `~/.pi/agent/pi-lab/tmp/webfetch/` and the file path is returned.
|
|
18
18
|
- **LRU cache** — processed Markdown is cached in memory so paginating the same URL doesn't re-fetch.
|
|
19
|
+
- **Built-in fetch optimizations** — enabled by default. Site-specific rules can rewrite or parse difficult pages before generic extraction. Reddit links are rewritten to `old.reddit.com`; X/Twitter posts are extracted from the page's `window.__INITIAL_STATE__` script and formatted as clean Markdown.
|
|
20
|
+
|
|
21
|
+
## Configuration
|
|
22
|
+
|
|
23
|
+
Disable the built-in optimization framework in pi settings:
|
|
24
|
+
|
|
25
|
+
```json
|
|
26
|
+
{
|
|
27
|
+
"webfetch": {
|
|
28
|
+
"optimizations": false
|
|
29
|
+
}
|
|
30
|
+
}
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
User settings live at `~/.pi/agent/settings.json`; project settings live at `<cwd>/.pi/settings.json` and override user settings.
|
package/dist/index.mjs
CHANGED
|
@@ -1,20 +1,66 @@
|
|
|
1
|
+
import { homedir } from "node:os";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
import { readFileSync } from "node:fs";
|
|
1
4
|
import { Type } from "@sinclair/typebox";
|
|
2
5
|
import { keyHint } from "@earendil-works/pi-coding-agent";
|
|
3
6
|
import { Text } from "@earendil-works/pi-tui";
|
|
4
7
|
import { LRUCache } from "lru-cache";
|
|
5
8
|
import { mkdir, writeFile } from "node:fs/promises";
|
|
6
|
-
import { join } from "node:path";
|
|
7
9
|
import { Readability } from "@mozilla/readability";
|
|
8
10
|
import { parseHTML } from "linkedom";
|
|
9
11
|
import TurndownService from "turndown";
|
|
10
|
-
|
|
12
|
+
//#region ../utils/src/paths.ts
|
|
13
|
+
/**
 * Resolve the global pi-lab directory beneath a home directory.
 *
 * @param {string} [home] - Home directory to resolve against; defaults to `homedir()`.
 * @returns {string} The path `<home>/.pi/agent/pi-lab`.
 */
function getPiLabGlobalDir(home = homedir()) {
  const segments = [".pi", "agent", "pi-lab"];
  return join(home, ...segments);
}
|
|
16
|
+
/**
 * Resolve the pi-lab temp directory, optionally scoped to a named subdirectory.
 *
 * @param {string} [name] - Subdirectory name; when falsy the temp root itself is returned.
 * @param {string} [home] - Home directory to resolve against; defaults to `homedir()`.
 * @returns {string} `<pi-lab dir>/tmp` or `<pi-lab dir>/tmp/<name>`.
 */
function getPiLabGlobalTmpDir(name, home = homedir()) {
  const tmpRoot = join(getPiLabGlobalDir(home), "tmp");
  if (!name) return tmpRoot;
  return join(tmpRoot, name);
}
|
|
20
|
+
//#endregion
|
|
21
|
+
//#region ../utils/src/settings.ts
|
|
22
|
+
/**
 * Read a JSON settings file and return its top-level object.
 *
 * - A missing file (`ENOENT`) yields `{}`.
 * - A leading UTF-8 byte-order mark is stripped before parsing, because
 *   `readFileSync(..., "utf8")` keeps it and `JSON.parse` rejects it.
 * - A JSON root that is not a plain object (`null`, array, scalar) yields `{}`,
 *   so callers that merge the result never receive a non-object.
 *
 * @param {string} filePath - Path of the JSON file to read.
 * @returns {object} The parsed object, or `{}` as described above.
 * @throws {SyntaxError} When the file content is not valid JSON; the message names the file.
 * @throws {Error} Any file-system error other than `ENOENT` is rethrown unchanged.
 */
function readJsonFile(filePath) {
  let raw;
  try {
    raw = readFileSync(filePath, "utf8");
  } catch (error) {
    if (error?.code === "ENOENT") return {};
    throw error;
  }

  let parsed;
  try {
    parsed = JSON.parse(raw.replace(/^\uFEFF/, ""));
  } catch (error) {
    // Keep the SyntaxError type, but say which file is broken.
    throw new SyntaxError(`Invalid JSON in ${filePath}: ${error.message}`, { cause: error });
  }

  const isRecord = typeof parsed === "object" && parsed !== null && !Array.isArray(parsed);
  return isRecord ? parsed : {};
}
|
|
30
|
+
/**
 * Read the project-level pi settings file.
 *
 * @param {string} [cwd] - Project root; defaults to `process.cwd()`.
 * @returns {object} Whatever `readJsonFile` returns for `<cwd>/.pi/settings.json`.
 */
function readPiProjectSettings(cwd = process.cwd()) {
  const settingsPath = join(cwd, ".pi", "settings.json");
  return readJsonFile(settingsPath);
}
|
|
33
|
+
/**
 * Read the user-level pi settings file.
 *
 * @param {string} [home] - Home directory; defaults to `homedir()`.
 * @returns {object} Whatever `readJsonFile` returns for `<home>/.pi/agent/settings.json`.
 */
function readPiUserSettings(home = homedir()) {
  const settingsPath = join(home, ".pi", "agent", "settings.json");
  return readJsonFile(settingsPath);
}
|
|
36
|
+
/**
 * Combine user and project settings; project values take precedence.
 *
 * @param {object} [userSettings] - Base settings.
 * @param {object} [projectSettings] - Overriding settings.
 * @returns {object} The result of `deepMerge(userSettings, projectSettings)`.
 */
function mergePiSettings(userSettings = {}, projectSettings = {}) {
  const merged = deepMerge(userSettings, projectSettings);
  return merged;
}
|
|
39
|
+
/**
 * Load user and project settings from disk and merge them.
 *
 * @param {{ home?: string, cwd?: string }} [options] - Optional overrides for the
 *   home directory and project root; omitted values fall back to each reader's default.
 * @returns {object} Merged settings, with project settings overriding user settings.
 */
function readMergedPiSettings(options = {}) {
  const { home, cwd } = options;
  const userSettings = readPiUserSettings(home);
  const projectSettings = readPiProjectSettings(cwd);
  return mergePiSettings(userSettings, projectSettings);
}
|
|
42
|
+
/**
 * Recursively merge `override` into a shallow copy of `base`.
 *
 * Plain objects present on both sides are merged key by key; any other value
 * in `override` (including arrays) replaces the base value. `undefined`
 * override values are ignored. Neither input is mutated.
 *
 * An own `"__proto__"` key in `override` is skipped: `JSON.parse` creates it
 * as an ordinary own property, but assigning it with `result[key] = value`
 * would invoke the prototype setter and replace the merged object's prototype.
 *
 * @param {object} base - Values to start from.
 * @param {object} override - Values that take precedence; a non-object is treated as empty.
 * @returns {object} A new merged object.
 */
function deepMerge(base, override) {
  const result = { ...base };
  if (!isPlainObject(override)) return result;
  for (const [key, value] of Object.entries(override)) {
    if (value === undefined || key === "__proto__") continue;
    // Consider only own properties, so inherited members never count as an "existing" value to merge into.
    const existing = Object.prototype.hasOwnProperty.call(result, key) ? result[key] : undefined;
    result[key] = isPlainObject(existing) && isPlainObject(value) ? deepMerge(existing, value) : value;
  }
  return result;
}

/**
 * Report whether a value is a non-null, non-array object.
 *
 * @param {unknown} value - Value to test.
 * @returns {boolean} `true` for objects other than `null` and arrays.
 */
function isPlainObject(value) {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}
|
|
55
|
+
//#endregion
|
|
11
56
|
//#region src/config.ts
|
|
12
57
|
/**
 * Default webfetch configuration.
 */
const DEFAULT_CONFIG = {
  // NOTE(review): presumably the per-page size limit for returned content — confirm the unit against the tool code.
  maxPageLength: 20000,
  cache: {
    // 50 MiB.
    maxSizeBytes: 50 * 1024 * 1024,
    // 15 minutes, in milliseconds.
    ttlMs: 15 * 60 * 1000
  },
  // Built-in site-specific fetch optimizations are on unless disabled.
  optimizations: true
};
|
|
19
65
|
function mergeConfig(partial) {
|
|
20
66
|
if (!partial) return DEFAULT_CONFIG;
|
|
@@ -23,9 +69,16 @@ function mergeConfig(partial) {
|
|
|
23
69
|
cache: {
|
|
24
70
|
...DEFAULT_CONFIG.cache,
|
|
25
71
|
...partial.cache
|
|
26
|
-
}
|
|
72
|
+
},
|
|
73
|
+
optimizations: partial.optimizations ?? DEFAULT_CONFIG.optimizations
|
|
27
74
|
};
|
|
28
75
|
}
|
|
76
|
+
/**
 * Build the webfetch configuration from merged pi settings.
 *
 * Only the boolean `webfetch.optimizations` setting is read; anything else
 * (a missing or non-object `webfetch` section, a non-boolean flag) falls back
 * to the defaults applied by `mergeConfig`.
 *
 * @param {object} [settings] - Merged pi settings.
 * @returns {object} The configuration produced by `mergeConfig`.
 */
function loadWebFetchConfig(settings = {}) {
  const section = settings.webfetch;
  const isRecord = section !== null && typeof section === "object" && !Array.isArray(section);
  if (!isRecord) return mergeConfig();

  const flag = section.optimizations;
  const optimizations = typeof flag === "boolean" ? flag : undefined;
  return mergeConfig({ optimizations });
}
|
|
29
82
|
//#endregion
|
|
30
83
|
//#region src/cache.ts
|
|
31
84
|
var WebFetchCache = class {
|
|
@@ -234,20 +287,206 @@ function processPlainText(text) {
|
|
|
234
287
|
return text;
|
|
235
288
|
}
|
|
236
289
|
//#endregion
|
|
237
|
-
//#region ../utils/src/paths.ts
|
|
238
|
-
function getPiLabGlobalDir(home = homedir()) {
|
|
239
|
-
return join(home, ".pi", "agent", "pi-lab");
|
|
240
|
-
}
|
|
241
|
-
function getPiLabGlobalTmpDir(name, home = homedir()) {
|
|
242
|
-
const tmpDir = join(getPiLabGlobalDir(home), "tmp");
|
|
243
|
-
return name ? join(tmpDir, name) : tmpDir;
|
|
244
|
-
}
|
|
245
|
-
//#endregion
|
|
246
290
|
//#region src/paths.ts
|
|
247
291
|
/**
 * Resolve the `webfetch` subdirectory of the pi-lab temp directory.
 *
 * @param {string} [home] - Home directory; defaults to `homedir()`.
 * @returns {string} The result of `getPiLabGlobalTmpDir("webfetch", home)`.
 */
function getBinaryTempDir(home = homedir()) {
  const toolName = "webfetch";
  return getPiLabGlobalTmpDir(toolName, home);
}
|
|
250
294
|
//#endregion
|
|
295
|
+
//#region src/optimizers/reddit.ts
|
|
296
|
+
/**
 * Report whether a hostname is exactly `reddit.com` or `www.reddit.com`.
 *
 * The comparison is case-sensitive and matches no other subdomain.
 *
 * @param {string} hostname - Hostname to test.
 * @returns {boolean} `true` for the two hosts above.
 */
function isRedditHost(hostname) {
  switch (hostname) {
    case "reddit.com":
    case "www.reddit.com":
      return true;
    default:
      return false;
  }
}
|
|
299
|
+
/**
 * Optimizer that redirects Reddit links to `old.reddit.com`.
 */
const redditOptimizer = {
  id: "reddit",

  /**
   * @param {string} url - Candidate URL.
   * @returns {boolean} `true` when the URL parses and its lower-cased host is a Reddit host.
   */
  match(url) {
    try {
      const { hostname } = new URL(url);
      return isRedditHost(hostname.toLowerCase());
    } catch {
      // Unparseable URLs never match.
      return false;
    }
  },

  /**
   * @param {string} url - URL to rewrite; must be parseable.
   * @returns {string} The same URL with its hostname replaced by `old.reddit.com`.
   */
  rewriteUrl(url) {
    const target = new URL(url);
    target.hostname = "old.reddit.com";
    return target.href;
  }
};
|
|
314
|
+
//#endregion
|
|
315
|
+
//#region src/optimizers/x.ts
|
|
316
|
+
/**
 * Report whether a hostname is one of the X/Twitter hosts handled here.
 *
 * Matches exactly `x.com`, `www.x.com`, `twitter.com` and `www.twitter.com`
 * (case-sensitive).
 *
 * @param {string} hostname - Hostname to test.
 * @returns {boolean} `true` for the four hosts above.
 */
function isXHost(hostname) {
  const knownHosts = ["x.com", "www.x.com", "twitter.com", "www.twitter.com"];
  return knownHosts.includes(hostname);
}
|
|
319
|
+
/**
 * Report whether a value is a non-null object that is not an array.
 *
 * @param {unknown} value - Value to test.
 * @returns {boolean} `true` for objects other than `null` and arrays.
 */
function isObject(value) {
  if (value === null || Array.isArray(value)) return false;
  return typeof value === "object";
}
|
|
322
|
+
/**
 * Normalize an entity collection to a plain lookup object.
 *
 * Returns `value.entities` when that is an object, `value` itself when it is
 * an object without such a property, and `{}` for anything else.
 *
 * @param {unknown} value - Candidate collection.
 * @returns {object} The lookup object described above.
 */
function entitiesMap(value) {
  if (!isObject(value)) return {};
  const { entities: inner } = value;
  if (isObject(inner)) return inner;
  return value;
}
|
|
327
|
+
/**
 * Return `value` when it is a string with non-whitespace content.
 *
 * The string is returned untrimmed; trimming is used only for the emptiness check.
 *
 * @param {unknown} value - Candidate value.
 * @returns {string | undefined} The original string, or `undefined`.
 */
function stringValue(value) {
  if (typeof value !== "string") return undefined;
  return value.trim().length > 0 ? value : undefined;
}
|
|
330
|
+
/**
 * Return `value` when it is a finite number.
 *
 * @param {unknown} value - Candidate value.
 * @returns {number | undefined} The number, or `undefined` for non-numbers, `NaN` and infinities.
 */
function numberValue(value) {
  // Number.isFinite never coerces, so non-number inputs are rejected here too.
  if (!Number.isFinite(value)) return undefined;
  return value;
}
|
|
333
|
+
/**
 * Extract the numeric status id from a URL path.
 *
 * Looks for `/status/<digits>` or `/statuses/<digits>` in the pathname.
 *
 * @param {string} url - URL to inspect.
 * @returns {string | undefined} The digit string, or `undefined` when the URL
 *   cannot be parsed or contains no status segment.
 */
function extractStatusId(url) {
  let pathname;
  try {
    ({ pathname } = new URL(url));
  } catch {
    return undefined;
  }
  const found = /\/status(?:es)?\/(\d+)/.exec(pathname);
  return found?.[1];
}
|
|
340
|
+
/**
 * Extract the object literal assigned to `window.__INITIAL_STATE__` from script source.
 *
 * The assignment is located with a pattern that tolerates whitespace around
 * `=`, and the scan begins at the first `{` after it. Braces are balanced
 * while skipping over double-quoted strings (honouring backslash escapes), so
 * braces inside string values do not end the object early.
 *
 * @param {string} script - Inline script source text.
 * @returns {string | undefined} The balanced `{...}` substring, or `undefined`
 *   when the assignment is absent or the braces never balance.
 */
function extractInitialStateJson(script) {
  const assignment = /window\.__INITIAL_STATE__\s*=/.exec(script);
  if (!assignment) return undefined;

  // Start after the matched text rather than a hard-coded marker length.
  const jsonStart = script.indexOf("{", assignment.index + assignment[0].length);
  if (jsonStart === -1) return undefined;

  let depth = 0;
  let inString = false;
  let escaped = false;
  for (let i = jsonStart; i < script.length; i++) {
    const char = script[i];
    if (inString) {
      if (escaped) escaped = false;
      else if (char === "\\") escaped = true;
      else if (char === "\"") inString = false;
      continue;
    }
    if (char === "\"") {
      inString = true;
    } else if (char === "{") {
      depth++;
    } else if (char === "}") {
      depth--;
      if (depth === 0) return script.slice(jsonStart, i + 1);
    }
  }
  // Ran out of input before the outermost brace closed.
  return undefined;
}
|
|
364
|
+
/**
 * Find and parse the `window.__INITIAL_STATE__` object embedded in a page.
 *
 * Inline scripts (those without a `src` attribute) are scanned in document
 * order. The first script that yields a candidate JSON string decides the
 * result: later scripts are not tried, even if that candidate fails to parse.
 *
 * @param {string} html - Page HTML.
 * @returns {object | undefined} The parsed state when it is an object; otherwise `undefined`.
 */
function parseInitialState(html) {
  const { document } = parseHTML(html);
  const inlineScripts = document.querySelectorAll("script:not([src])");
  for (const node of inlineScripts) {
    const source = node.textContent ?? "";
    const json = extractInitialStateJson(source);
    if (json) {
      let state;
      try {
        state = JSON.parse(json);
      } catch {
        return undefined;
      }
      return isObject(state) ? state : undefined;
    }
  }
  return undefined;
}
|
|
377
|
+
/**
 * Decode the HTML character references that appear in tweet text.
 *
 * Handles hexadecimal and decimal numeric references plus the five named
 * entities `quot`, `apos`, `lt`, `gt` and `amp`. Decoding is repeated up to
 * three times, stopping early once a pass changes nothing, so text that was
 * escaped more than once is fully unwrapped.
 *
 * Numeric references outside the Unicode range are left as written instead of
 * raising the `RangeError` that `String.fromCodePoint` would throw.
 *
 * @param {string} text - Text possibly containing character references.
 * @returns {string} The decoded text.
 */
function decodeHtmlEntities(text) {
  const MAX_PASSES = 3;
  const MAX_CODE_POINT = 0x10ffff;
  const NAMED = { quot: "\"", apos: "'", lt: "<", gt: ">", amp: "&" };

  const decodeNumeric = (match, digits, radix) => {
    const codePoint = Number.parseInt(digits, radix);
    const isValid = Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= MAX_CODE_POINT;
    return isValid ? String.fromCodePoint(codePoint) : match;
  };

  let result = text;
  for (let pass = 0; pass < MAX_PASSES; pass++) {
    const decoded = result
      .replace(/&#x([0-9a-f]+);/gi, (match, hex) => decodeNumeric(match, hex, 16))
      .replace(/&#(\d+);/g, (match, dec) => decodeNumeric(match, dec, 10))
      // One scan for all named entities: an ampersand produced here is not
      // re-read until the next pass, the same as decoding `amp` last.
      .replace(/&(quot|apos|lt|gt|amp);/g, (_, name) => NAMED[name]);
    if (decoded === result) break;
    result = decoded;
  }
  return result;
}
|
|
386
|
+
/**
 * Collect the distinct media URLs attached to a tweet.
 *
 * Uses `tweet.extended_entities.media` when that is an array, otherwise
 * `tweet.entities.media`. For each media object the HTTPS URL is preferred
 * over the plain one; duplicates are dropped and first-seen order is kept.
 *
 * @param {object} tweet - Tweet record.
 * @returns {string[]} Unique media URLs, possibly empty.
 */
function mediaUrls(tweet) {
  const containers = [tweet.extended_entities, tweet.entities];
  const source = containers.find((candidate) => isObject(candidate) && Array.isArray(candidate.media));
  const media = source ? source.media : [];

  const seen = new Set();
  for (const item of media) {
    if (!isObject(item)) continue;
    const url = stringValue(item.media_url_https) ?? stringValue(item.media_url);
    if (url) seen.add(url);
  }
  return Array.from(seen);
}
|
|
398
|
+
/**
 * Format a tweet's engagement counters as a single line.
 *
 * Only counters that are finite numbers are included, in the fixed order
 * Replies, Retweets, Quotes, Likes, joined with " · ".
 *
 * @param {object} tweet - Tweet record.
 * @returns {string | undefined} The formatted line, or `undefined` when no counter is available.
 */
function formatStats(tweet) {
  const counters = {
    Replies: tweet.reply_count,
    Retweets: tweet.retweet_count,
    Quotes: tweet.quote_count,
    Likes: tweet.favorite_count
  };
  const parts = [];
  for (const [label, raw] of Object.entries(counters)) {
    const count = numberValue(raw);
    if (count !== undefined) parts.push(`${label}: ${count}`);
  }
  return parts.length > 0 ? parts.join(" · ") : undefined;
}
|
|
408
|
+
/**
 * Pick the tweet that a URL refers to.
 *
 * Prefers the entry keyed by the status id found in the URL; when there is no
 * such entry, falls back to the first object value in `tweets`.
 *
 * @param {object} tweets - Tweet records keyed by id.
 * @param {string} url - URL that was fetched.
 * @returns {object | undefined} The chosen tweet, or `undefined` when none exists.
 */
function selectTweet(tweets, url) {
  const statusId = extractStatusId(url);
  const exact = statusId ? tweets[statusId] : undefined;
  if (isObject(exact)) return exact;
  return Object.values(tweets).find((candidate) => isObject(candidate));
}
|
|
413
|
+
/**
 * Render a tweet as Markdown.
 *
 * Output layout: a `# Tweet by …` heading, an optional `Posted:` line, an
 * optional stats line, the tweet body, and an optional `Media:` list.
 *
 * @param {object} tweet - Tweet record; `full_text` is preferred over `text`.
 * @param {object} users - User records keyed by the id stored in `tweet.user`.
 * @returns {string | undefined} The Markdown, or `undefined` when the tweet has
 *   no text or the text is empty after cleanup.
 */
function renderTweetMarkdown(tweet, users) {
  const rawText = stringValue(tweet.full_text) ?? stringValue(tweet.text);
  if (!rawText) return undefined;

  const authorId = stringValue(tweet.user);
  const author = authorId ? users[authorId] : undefined;
  const profile = isObject(author) ? author : undefined;
  const displayName = stringValue(profile?.name) ?? "Unknown author";
  const handle = stringValue(profile?.screen_name);
  const postedAt = stringValue(tweet.created_at);
  const stats = formatStats(tweet);
  const media = mediaUrls(tweet);

  // Drop a trailing t.co link from the end of the decoded text.
  const body = decodeHtmlEntities(rawText).replace(/\s+https:\/\/t\.co\/\S+\s*$/g, "").trim();
  if (!body) return undefined;

  const heading = handle ? `# Tweet by ${displayName} (@${handle})` : `# Tweet by ${displayName}`;
  const lines = [heading];
  if (postedAt) lines.push("", `Posted: ${postedAt}`);
  if (stats) lines.push(stats);
  lines.push("", body);
  if (media.length > 0) {
    lines.push("", "Media:");
    for (const url of media) lines.push(`- ${url}`);
  }
  return lines.join("\n");
}
|
|
434
|
+
/**
 * Turn an X/Twitter page into Markdown using its embedded initial state.
 *
 * @param {{ url: string, html: string }} input - Fetched URL and its HTML.
 * @returns {{ markdown: string, scripts: Array, method: string } | undefined}
 *   The processed result with `method: "optimized"` and an empty script list,
 *   or `undefined` when no tweet can be found or rendered.
 */
function optimizeXHtml({ url, html }) {
  const entities = parseInitialState(html)?.entities;
  const tweets = entitiesMap(entities?.tweets);
  const users = entitiesMap(entities?.users);

  const tweet = selectTweet(tweets, url);
  if (!tweet) return undefined;

  const markdown = renderTweetMarkdown(tweet, users);
  if (!markdown) return undefined;

  return { markdown, scripts: [], method: "optimized" };
}
|
|
448
|
+
//#endregion
|
|
449
|
+
//#region src/optimizers/index.ts
|
|
450
|
+
/**
 * Optimizer that extracts X/Twitter posts from the page's embedded state.
 */
const xOptimizer = {
  id: "x",

  /**
   * @param {string} url - Candidate URL.
   * @returns {boolean} `true` when the URL parses and its lower-cased host is an X/Twitter host.
   */
  match(url) {
    try {
      const { hostname } = new URL(url);
      return isXHost(hostname.toLowerCase());
    } catch {
      return false;
    }
  },

  /**
   * @param {{ url: string, html: string }} input - Fetched URL and its HTML.
   * @returns {Promise<object | undefined>} The result of `optimizeXHtml`.
   */
  async processHtml(input) {
    return optimizeXHtml(input);
  }
};

/**
 * Site-specific optimizers, tried in order; the first whose `match` accepts a URL is used.
 */
const BUILT_IN_OPTIMIZERS = [redditOptimizer, xOptimizer];
|
|
463
|
+
/**
 * Look up the built-in optimizer responsible for a URL.
 *
 * @param {string} url - URL to match.
 * @param {{ optimizations: boolean }} config - Webfetch configuration.
 * @returns {object | undefined} The first matching optimizer, or `undefined`
 *   when optimizations are disabled or nothing matches.
 */
function findOptimizer(url, config) {
  if (config.optimizations) {
    for (const optimizer of BUILT_IN_OPTIMIZERS) {
      if (optimizer.match(url)) return optimizer;
    }
  }
  return undefined;
}
|
|
467
|
+
/**
 * Apply any URL rewrite that a matching optimizer defines.
 *
 * @param {string} url - URL about to be fetched.
 * @param {object} config - Webfetch configuration.
 * @returns {{ url: string, optimizerId?: string }} The original URL when no
 *   rewrite applies; otherwise the normalized rewritten URL together with the
 *   id of the optimizer that produced it.
 */
function applyFetchOptimizations(url, config) {
  const optimizer = findOptimizer(url, config);
  const canRewrite = Boolean(optimizer?.rewriteUrl);
  if (!canRewrite) return { url };

  const rewritten = optimizer.rewriteUrl(url);
  const changed = Boolean(rewritten) && rewritten !== url;
  if (!changed) return { url };

  return { url: normalizeUrl(rewritten), optimizerId: optimizer.id };
}
|
|
477
|
+
/**
 * Process HTML with a site-specific optimizer when one applies, else generically.
 *
 * @param {object} input
 * @param {string} input.url - URL the HTML was fetched from.
 * @param {string} input.html - Raw HTML.
 * @param {object} input.config - Webfetch configuration.
 * @param {Function} input.defaultProcess - Generic processor; called when no
 *   optimizer matches or the optimizer returns a falsy result.
 * @returns {Promise<object>} The optimizer's result, or the value of `defaultProcess()`.
 */
async function processHtmlWithOptimizations({ url, html, config, defaultProcess }) {
  const optimizer = findOptimizer(url, config);
  if (!optimizer?.processHtml) return defaultProcess();

  const optimized = await optimizer.processHtml({ url, html, defaultProcess });
  return optimized ? optimized : defaultProcess();
}
|
|
489
|
+
//#endregion
|
|
251
490
|
//#region src/tool.ts
|
|
252
491
|
function formatScriptIndex(scripts) {
|
|
253
492
|
if (scripts.length === 0) return "";
|
|
@@ -328,6 +567,7 @@ function registerWebFetchTool(pi, config) {
|
|
|
328
567
|
} catch {
|
|
329
568
|
throw new Error(`Invalid URL: ${url}`);
|
|
330
569
|
}
|
|
570
|
+
normalizedUrl = applyFetchOptimizations(normalizedUrl, config).url;
|
|
331
571
|
const tempDir = getBinaryTempDir();
|
|
332
572
|
let entry = cache.get(normalizedUrl);
|
|
333
573
|
if (!entry) {
|
|
@@ -377,7 +617,12 @@ function registerWebFetchTool(pi, config) {
|
|
|
377
617
|
details: {}
|
|
378
618
|
});
|
|
379
619
|
if (result.contentType === "text/html") {
|
|
380
|
-
const processed = await
|
|
620
|
+
const processed = await processHtmlWithOptimizations({
|
|
621
|
+
url: normalizedUrl,
|
|
622
|
+
html: result.content,
|
|
623
|
+
config,
|
|
624
|
+
defaultProcess: () => processHtml(result.content, normalizedUrl)
|
|
625
|
+
});
|
|
381
626
|
entry = {
|
|
382
627
|
markdown: processed.markdown,
|
|
383
628
|
scripts: processed.scripts
|
|
@@ -481,10 +726,11 @@ function registerWebFetchTool(pi, config) {
|
|
|
481
726
|
* - Mozilla Readability for HTML → Markdown extraction
|
|
482
727
|
* - Inline script index — use `script=N` to read a specific inline script
|
|
483
728
|
* - LRU cache (50 MB, 15 min TTL) keyed on normalized URL
|
|
729
|
+
* - Built-in fetch optimizations (enabled by default), including Reddit URL rewrite
|
|
484
730
|
* - Pagination via offset/max_length parameters
|
|
485
731
|
*/
|
|
486
732
|
/**
 * Extension entry point: registers the webfetch tool using the merged pi settings.
 *
 * @param {object} pi - Host API object, passed through to `registerWebFetchTool`.
 */
function src_default(pi) {
  const settings = readMergedPiSettings();
  const config = loadWebFetchConfig(settings);
  registerWebFetchTool(pi, config);
}
|
|
489
735
|
//#endregion
|
|
490
|
-
export { DEFAULT_CONFIG, src_default as default, mergeConfig, registerWebFetchTool };
|
|
736
|
+
export { DEFAULT_CONFIG, src_default as default, loadWebFetchConfig, mergeConfig, registerWebFetchTool };
|