@pi-lab/webfetch 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/README.md +15 -0
  2. package/dist/index.mjs +262 -16
  3. package/package.json +1 -1
package/README.md CHANGED
@@ -16,3 +16,18 @@ pi install npm:@pi-lab/webfetch
16
16
  - **Redirect handling** — same-domain redirects are followed automatically (up to 10 hops); cross-domain redirects are surfaced to the model so it can decide whether to follow.
17
17
  - **Binary downloads** — non-text responses (PDFs, images, etc.) are saved to `~/.pi/agent/pi-lab/tmp/webfetch/` and the file path is returned.
18
18
  - **LRU cache** — processed Markdown is cached in memory so paginating the same URL doesn't re-fetch.
19
+ - **Built-in fetch optimizations** — enabled by default. Site-specific rules can rewrite or parse difficult pages before generic extraction. Reddit links are rewritten to `old.reddit.com`; X/Twitter posts are extracted from the page's `window.__INITIAL_STATE__` script and formatted as clean Markdown.
20
+
21
+ ## Configuration
22
+
23
+ Disable the built-in optimization framework in pi settings:
24
+
25
+ ```json
26
+ {
27
+ "webfetch": {
28
+ "optimizations": false
29
+ }
30
+ }
31
+ ```
32
+
33
+ User settings live at `~/.pi/agent/settings.json`; project settings live at `<cwd>/.pi/settings.json` and override user settings.
package/dist/index.mjs CHANGED
@@ -1,20 +1,66 @@
1
+ import { homedir } from "node:os";
2
+ import { join } from "node:path";
3
+ import { readFileSync } from "node:fs";
1
4
  import { Type } from "@sinclair/typebox";
2
5
  import { keyHint } from "@earendil-works/pi-coding-agent";
3
6
  import { Text } from "@earendil-works/pi-tui";
4
7
  import { LRUCache } from "lru-cache";
5
8
  import { mkdir, writeFile } from "node:fs/promises";
6
- import { join } from "node:path";
7
9
  import { Readability } from "@mozilla/readability";
8
10
  import { parseHTML } from "linkedom";
9
11
  import TurndownService from "turndown";
10
- import { homedir } from "node:os";
12
//#region ../utils/src/paths.ts
/**
 * Root directory for pi-lab global state: `~/.pi/agent/pi-lab`.
 * @param {string} [home] - home directory override (defaults to os.homedir()).
 * @returns {string} absolute path to the pi-lab global directory
 */
function getPiLabGlobalDir(home = homedir()) {
  return join(home, ".pi", "agent", "pi-lab");
}
/**
 * Temp directory under the pi-lab global dir, optionally namespaced.
 * @param {string} [name] - subdirectory name (e.g. "webfetch"); omitted → the tmp root.
 * @param {string} [home] - home directory override.
 * @returns {string} absolute path to the tmp directory
 */
function getPiLabGlobalTmpDir(name, home = homedir()) {
  const tmpRoot = join(getPiLabGlobalDir(home), "tmp");
  if (name) return join(tmpRoot, name);
  return tmpRoot;
}
//#endregion
21
//#region ../utils/src/settings.ts
/**
 * Reads and parses a JSON file; a missing file yields {}.
 * Parse errors and non-ENOENT I/O errors propagate so a broken settings
 * file is surfaced instead of silently ignored.
 * @param {string} filePath
 * @returns {object} parsed JSON, or {} when the file does not exist
 */
function readJsonFile(filePath) {
  try {
    return JSON.parse(readFileSync(filePath, "utf8"));
  } catch (error) {
    if (error.code === "ENOENT") return {};
    throw error;
  }
}
/** Project-level settings: `<cwd>/.pi/settings.json`. */
function readPiProjectSettings(cwd = process.cwd()) {
  return readJsonFile(join(cwd, ".pi", "settings.json"));
}
/** User-level settings: `~/.pi/agent/settings.json`. */
function readPiUserSettings(home = homedir()) {
  return readJsonFile(join(home, ".pi", "agent", "settings.json"));
}
/** Merges user and project settings; project values win on conflict. */
function mergePiSettings(userSettings = {}, projectSettings = {}) {
  return deepMerge(userSettings, projectSettings);
}
/** Reads user + project settings and merges them (project overrides user). */
function readMergedPiSettings(options = {}) {
  return mergePiSettings(readPiUserSettings(options.home), readPiProjectSettings(options.cwd));
}
/**
 * Recursively merges `override` into a shallow copy of `base`.
 * Plain objects are merged key-by-key; any other value (arrays included)
 * replaces the base value wholesale. `undefined` overrides are skipped.
 * @returns {object} a new object; inputs are not mutated
 */
function deepMerge(base, override) {
  const result = { ...base };
  for (const [key, value] of Object.entries(override)) {
    if (value === void 0) continue;
    // Prototype-pollution guard: JSON.parse creates an OWN "__proto__"
    // property, and assigning it back via result[key] would hit the
    // Object.prototype.__proto__ setter and mutate the prototype chain.
    if (key === "__proto__" || key === "constructor" || key === "prototype") continue;
    const existing = result[key];
    if (isPlainObject(existing) && isPlainObject(value)) result[key] = deepMerge(existing, value);
    else result[key] = value;
  }
  return result;
}
/** True for non-null objects that are not arrays. */
function isPlainObject(value) {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}
//#endregion
11
56
//#region src/config.ts
/** Baseline WebFetch configuration used when no overrides are supplied. */
const DEFAULT_CONFIG = {
  // Maximum characters of processed Markdown returned per page.
  maxPageLength: 20000,
  cache: {
    // 50 MB in-memory budget for processed pages.
    maxSizeBytes: 50 * 1024 * 1024,
    // Entries expire after 15 minutes (900 seconds).
    ttlMs: 900 * 1000,
  },
  // Site-specific fetch optimizations (Reddit rewrite, X extraction) on by default.
  optimizations: true,
};
19
65
  function mergeConfig(partial) {
20
66
  if (!partial) return DEFAULT_CONFIG;
@@ -23,9 +69,16 @@ function mergeConfig(partial) {
23
69
  cache: {
24
70
  ...DEFAULT_CONFIG.cache,
25
71
  ...partial.cache
26
- }
72
+ },
73
+ optimizations: partial.optimizations ?? DEFAULT_CONFIG.optimizations
27
74
  };
28
75
  }
76
/**
 * Builds the effective WebFetch config from merged pi settings.
 * Only `settings.webfetch.optimizations` (when it is a boolean) is
 * honored; any other shape falls through to the defaults.
 * @param {object} [settings] - merged pi settings object
 * @returns {object} complete WebFetch configuration
 */
function loadWebFetchConfig(settings = {}) {
  const section = settings.webfetch;
  const isRecord = typeof section === "object" && section !== null && !Array.isArray(section);
  if (!isRecord) return mergeConfig();
  const flag = section.optimizations;
  return mergeConfig({ optimizations: typeof flag === "boolean" ? flag : void 0 });
}
29
82
  //#endregion
30
83
  //#region src/cache.ts
31
84
  var WebFetchCache = class {
@@ -234,20 +287,206 @@ function processPlainText(text) {
234
287
  return text;
235
288
  }
236
289
  //#endregion
237
- //#region ../utils/src/paths.ts
238
- function getPiLabGlobalDir(home = homedir()) {
239
- return join(home, ".pi", "agent", "pi-lab");
240
- }
241
- function getPiLabGlobalTmpDir(name, home = homedir()) {
242
- const tmpDir = join(getPiLabGlobalDir(home), "tmp");
243
- return name ? join(tmpDir, name) : tmpDir;
244
- }
245
- //#endregion
246
290
//#region src/paths.ts
/** Directory where binary downloads land: `~/.pi/agent/pi-lab/tmp/webfetch`. */
function getBinaryTempDir(home = homedir()) {
  return getPiLabGlobalTmpDir("webfetch", home);
}
//#endregion
295
//#region src/optimizers/reddit.ts
/** True for the canonical Reddit hosts that should be rewritten. */
function isRedditHost(hostname) {
  return hostname === "reddit.com" || hostname === "www.reddit.com";
}
/**
 * Optimizer that rewrites reddit.com URLs to old.reddit.com, whose
 * server-rendered HTML extracts much better than the script-heavy UI.
 */
const redditOptimizer = {
  id: "reddit",
  match(url) {
    try {
      const { hostname } = new URL(url);
      return isRedditHost(hostname.toLowerCase());
    } catch {
      // Unparseable URL: not ours to optimize.
      return false;
    }
  },
  rewriteUrl(url) {
    const rewritten = new URL(url);
    rewritten.hostname = "old.reddit.com";
    return rewritten.toString();
  },
};
//#endregion
315
//#region src/optimizers/x.ts
/** True for x.com / twitter.com hosts, with or without the www prefix. */
function isXHost(hostname) {
  const hosts = ["x.com", "www.x.com", "twitter.com", "www.twitter.com"];
  return hosts.includes(hostname);
}
/** True for non-null objects that are not arrays. */
function isObject(value) {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}
/**
 * Normalizes an entity container: some payloads nest the id→entity map
 * under an `entities` key, others are the map itself. Non-objects yield {}.
 */
function entitiesMap(value) {
  if (!isObject(value)) return {};
  const nested = value.entities;
  if (isObject(nested)) return nested;
  return value;
}
/** Returns the string when it is non-blank, otherwise undefined. */
function stringValue(value) {
  if (typeof value !== "string") return void 0;
  return value.trim() ? value : void 0;
}
/** Returns the number when it is finite, otherwise undefined. */
function numberValue(value) {
  if (typeof value !== "number") return void 0;
  return Number.isFinite(value) ? value : void 0;
}
/** Extracts the numeric id from a `/status/<id>` or `/statuses/<id>` path. */
function extractStatusId(url) {
  try {
    const match = /\/status(?:es)?\/(\d+)/.exec(new URL(url).pathname);
    return match?.[1];
  } catch {
    return void 0;
  }
}
340
const INITIAL_STATE_MARKER = "window.__INITIAL_STATE__=";
/**
 * Locates the `window.__INITIAL_STATE__=` assignment inside an inline
 * script and returns the balanced top-level JSON object literal that
 * follows it. Braces inside JSON strings (including escaped quotes) are
 * ignored by the scanner.
 * @param {string} script - inline script text
 * @returns {string | undefined} raw JSON text, or undefined when the
 *   marker is missing or the object is unterminated
 */
function extractInitialStateJson(script) {
  const start = script.indexOf(INITIAL_STATE_MARKER);
  if (start === -1) return void 0;
  // Was a magic `start + 25`; derive the offset from the marker so it
  // cannot drift if the marker text ever changes.
  const jsonStart = script.indexOf("{", start + INITIAL_STATE_MARKER.length);
  if (jsonStart === -1) return void 0;
  let depth = 0;
  let inString = false;
  let escaped = false;
  for (let i = jsonStart; i < script.length; i++) {
    const char = script[i];
    if (inString) {
      if (escaped) escaped = false;
      else if (char === "\\") escaped = true;
      else if (char === "\"") inString = false;
      continue;
    }
    if (char === "\"") inString = true;
    else if (char === "{") depth++;
    else if (char === "}") {
      depth--;
      if (depth === 0) return script.slice(jsonStart, i + 1);
    }
  }
  return void 0;
}
364
/**
 * Scans a page's inline <script> tags for the serialized
 * `window.__INITIAL_STATE__` object and returns it parsed.
 * @param {string} html - raw page HTML
 * @returns {object | undefined} the state object, or undefined when it
 *   is missing, unparseable, or not an object
 */
function parseInitialState(html) {
  const { document } = parseHTML(html);
  const inlineScripts = document.querySelectorAll("script:not([src])");
  for (const script of inlineScripts) {
    const json = extractInitialStateJson(script.textContent ?? "");
    if (!json) continue;
    try {
      const state = JSON.parse(json);
      return isObject(state) ? state : void 0;
    } catch {
      // The marker was found but the JSON is malformed — extraction has
      // failed; do not keep scanning other scripts.
      return void 0;
    }
  }
  return void 0;
}
377
/**
 * Decodes numeric and common named HTML entities in tweet text.
 * Tweets are frequently double-encoded ("&amp;lt;"), so up to three
 * decode passes run, stopping early once the text is stable.
 * @param {string} text
 * @returns {string} decoded text
 */
function decodeHtmlEntities(text) {
  const decodeNumeric = (match, digits, radix) => {
    const codePoint = Number.parseInt(digits, radix);
    // Leave out-of-range references untouched instead of letting
    // String.fromCodePoint throw a RangeError on hostile input
    // (e.g. "&#x110000;" — the Unicode range ends at U+10FFFF).
    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
  };
  let result = text;
  for (let pass = 0; pass < 3; pass++) {
    const decoded = result
      .replace(/&#x([0-9a-f]+);/gi, (match, hex) => decodeNumeric(match, hex, 16))
      .replace(/&#(\d+);/g, (match, dec) => decodeNumeric(match, dec, 10))
      .replace(/&quot;/g, "\"")
      .replace(/&#39;/g, "'")
      .replace(/&lt;/g, "<")
      .replace(/&gt;/g, ">")
      .replace(/&amp;/g, "&");
    if (decoded === result) break;
    result = decoded;
  }
  return result;
}
386
/**
 * Collects unique media URLs from a tweet, preferring extended_entities
 * (which carries every photo/video) over entities, and the https URL
 * variant over the plain one.
 * @param {object} tweet
 * @returns {string[]} de-duplicated media URLs, in payload order
 */
function mediaUrls(tweet) {
  const extended = isObject(tweet.extended_entities) ? tweet.extended_entities : void 0;
  const entities = isObject(tweet.entities) ? tweet.entities : void 0;
  let media = [];
  if (Array.isArray(extended?.media)) media = extended.media;
  else if (Array.isArray(entities?.media)) media = entities.media;
  const urls = new Set();
  for (const item of media) {
    if (!isObject(item)) continue;
    const url = stringValue(item.media_url_https) ?? stringValue(item.media_url);
    if (url) urls.add(url);
  }
  return [...urls];
}
398
/**
 * Formats engagement counters as "Replies: n · Retweets: n · …",
 * omitting counters absent from the payload.
 * @param {object} tweet
 * @returns {string | undefined} formatted line, or undefined when no
 *   counter is present
 */
function formatStats(tweet) {
  const counters = [
    ["Replies", numberValue(tweet.reply_count)],
    ["Retweets", numberValue(tweet.retweet_count)],
    ["Quotes", numberValue(tweet.quote_count)],
    ["Likes", numberValue(tweet.favorite_count)],
  ];
  const present = counters.filter(([, count]) => count !== void 0);
  if (present.length === 0) return void 0;
  return present.map(([label, count]) => `${label}: ${count}`).join(" · ");
}
408
/**
 * Picks the tweet whose id matches the URL's status id, falling back to
 * the first object-valued entry when the id is absent or unknown.
 * @param {object} tweets - id→tweet map
 * @param {string} url
 * @returns {object | undefined}
 */
function selectTweet(tweets, url) {
  const statusId = extractStatusId(url);
  if (statusId) {
    const exact = tweets[statusId];
    if (isObject(exact)) return exact;
  }
  return Object.values(tweets).find(isObject);
}
413
/**
 * Renders a tweet entity as Markdown: author heading, posted date and
 * stats, entity-decoded body with trailing t.co shortlinks stripped, and
 * a media URL list.
 * @param {object} tweet - tweet entity from __INITIAL_STATE__
 * @param {object} users - id→user map from __INITIAL_STATE__
 * @returns {string | undefined} Markdown, or undefined when the tweet
 *   has no usable text
 */
function renderTweetMarkdown(tweet, users) {
  const text = stringValue(tweet.full_text) ?? stringValue(tweet.text);
  if (!text) return void 0;
  const userId = stringValue(tweet.user);
  const user = userId ? users[userId] : void 0;
  const userObj = isObject(user) ? user : void 0;
  const name = stringValue(userObj?.name) ?? "Unknown author";
  const screenName = stringValue(userObj?.screen_name);
  const createdAt = stringValue(tweet.created_at);
  const stats = formatStats(tweet);
  const media = mediaUrls(tweet);
  // Strip the ENTIRE trailing run of t.co shortlinks (media + quote
  // links). The previous single-link pattern anchored at $ could only
  // remove the last one, leaving earlier trailing links in the body.
  const body = decodeHtmlEntities(text).replace(/(?:\s+https:\/\/t\.co\/\S+)+\s*$/, "").trim();
  if (!body) return void 0;
  const lines = [];
  lines.push(screenName ? `# Tweet by ${name} (@${screenName})` : `# Tweet by ${name}`);
  if (createdAt) lines.push("", `Posted: ${createdAt}`);
  if (stats) lines.push(stats);
  lines.push("", body);
  if (media.length > 0) lines.push("", "Media:", ...media.map((url) => `- ${url}`));
  return lines.join("\n");
}
434
/**
 * Extracts a single tweet from an X/Twitter page via its serialized
 * __INITIAL_STATE__ blob. A return of undefined means "decline" — the
 * caller falls back to generic Readability extraction.
 * @param {{ url: string, html: string }} input
 * @returns {{ markdown: string, scripts: [], method: "optimized" } | undefined}
 */
function optimizeXHtml({ url, html }) {
  const state = parseInitialState(html);
  const tweets = entitiesMap(state?.entities?.tweets);
  const users = entitiesMap(state?.entities?.users);
  const tweet = selectTweet(tweets, url);
  if (!tweet) return void 0;
  const markdown = renderTweetMarkdown(tweet, users);
  if (!markdown) return void 0;
  return {
    markdown,
    scripts: [],
    method: "optimized",
  };
}
//#endregion
448
+ //#endregion
449
//#region src/optimizers/index.ts
/** Registry of site-specific optimizers, checked in declaration order. */
const BUILT_IN_OPTIMIZERS = [
  redditOptimizer,
  {
    id: "x",
    match(url) {
      try {
        return isXHost(new URL(url).hostname.toLowerCase());
      } catch {
        return false;
      }
    },
    async processHtml(input) {
      return optimizeXHtml(input);
    },
  },
];
/**
 * Returns the first optimizer matching `url`, or undefined when
 * optimizations are disabled in config or nothing matches.
 */
function findOptimizer(url, config) {
  if (!config.optimizations) return void 0;
  return BUILT_IN_OPTIMIZERS.find((optimizer) => optimizer.match(url));
}
467
/**
 * Applies any URL-rewrite optimization for `url`.
 * @param {string} url - already-normalized URL
 * @param {object} config - WebFetch configuration
 * @returns {{ url: string, optimizerId?: string }} the possibly
 *   rewritten (and re-normalized) URL; `optimizerId` is set only when a
 *   rewrite actually changed the URL
 */
function applyFetchOptimizations(url, config) {
  const optimizer = findOptimizer(url, config);
  const rewritten = optimizer?.rewriteUrl?.(url);
  if (!rewritten || rewritten === url) return { url };
  return {
    url: normalizeUrl(rewritten),
    optimizerId: optimizer.id,
  };
}
477
/**
 * Runs the matching optimizer's HTML processor when one exists; falls
 * back to `defaultProcess()` when no optimizer matches or the optimizer
 * declines by returning a falsy result.
 * @param {{ url: string, html: string, config: object, defaultProcess: Function }} input
 */
async function processHtmlWithOptimizations({ url, html, config, defaultProcess }) {
  const optimizer = findOptimizer(url, config);
  if (!optimizer?.processHtml) return defaultProcess();
  const optimized = await optimizer.processHtml({ url, html, defaultProcess });
  if (optimized) return optimized;
  return defaultProcess();
}
//#endregion
489
+ //#endregion
251
490
  //#region src/tool.ts
252
491
  function formatScriptIndex(scripts) {
253
492
  if (scripts.length === 0) return "";
@@ -328,6 +567,7 @@ function registerWebFetchTool(pi, config) {
328
567
  } catch {
329
568
  throw new Error(`Invalid URL: ${url}`);
330
569
  }
570
+ normalizedUrl = applyFetchOptimizations(normalizedUrl, config).url;
331
571
  const tempDir = getBinaryTempDir();
332
572
  let entry = cache.get(normalizedUrl);
333
573
  if (!entry) {
@@ -377,7 +617,12 @@ function registerWebFetchTool(pi, config) {
377
617
  details: {}
378
618
  });
379
619
  if (result.contentType === "text/html") {
380
- const processed = await processHtml(result.content, normalizedUrl);
620
+ const processed = await processHtmlWithOptimizations({
621
+ url: normalizedUrl,
622
+ html: result.content,
623
+ config,
624
+ defaultProcess: () => processHtml(result.content, normalizedUrl)
625
+ });
381
626
  entry = {
382
627
  markdown: processed.markdown,
383
628
  scripts: processed.scripts
@@ -481,10 +726,11 @@ function registerWebFetchTool(pi, config) {
481
726
  * - Mozilla Readability for HTML → Markdown extraction
482
727
  * - Inline script index — use `script=N` to read a specific inline script
483
728
  * - LRU cache (50 MB, 15 min TTL) keyed on normalized URL
729
+ * - Built-in fetch optimizations (enabled by default), including Reddit URL rewrite
484
730
  * - Pagination via offset/max_length parameters
485
731
  */
486
732
/** Extension entry point: registers the webfetch tool using the config derived from merged pi settings. */
function src_default(pi) {
  const config = loadWebFetchConfig(readMergedPiSettings());
  registerWebFetchTool(pi, config);
}
  }
489
735
  //#endregion
490
- export { DEFAULT_CONFIG, src_default as default, mergeConfig, registerWebFetchTool };
736
+ export { DEFAULT_CONFIG, src_default as default, loadWebFetchConfig, mergeConfig, registerWebFetchTool };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pi-lab/webfetch",
3
- "version": "1.0.0",
3
+ "version": "1.0.1",
4
4
  "description": "WebFetch tool extension for pi coding agent",
5
5
  "keywords": [
6
6
  "pi-package"