pi-read-page 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,265 @@
1
import { createHash, randomUUID } from "node:crypto";
import { mkdir, readFile, rename, rm, writeFile } from "node:fs/promises";
import { homedir } from "node:os";
import path from "node:path";
import {
  DEFAULT_MAX_BYTES,
  truncateHead,
} from "@earendil-works/pi-coding-agent";
import type { NormalizedUrl, UrlNormalization } from "../security/url-policy";
import type { ConfidenceReport, PageMetadata } from "../types";
11
+
12
+ export const CACHE_DIR = path.join(
13
+ homedir(),
14
+ ".pi",
15
+ "agent",
16
+ "caches",
17
+ "read-page",
18
+ );
19
+ export const DEFAULT_TTL_DAYS = 30;
20
+ export const USER_ACTION_TTL_DAYS = 1;
21
+
22
+ export type ReadPageCacheSource = "browser";
23
+ export type ReadPageCacheStatus =
24
+ | "hit"
25
+ | "miss"
26
+ | "refresh"
27
+ | "refresh-failed-fresh"
28
+ | "stale-fallback";
29
+
30
+ export type CacheMeta = {
31
+ version: number;
32
+ input_url: string;
33
+ url: string;
34
+ final_url: string;
35
+ cache_key: string;
36
+ url_sha256: string;
37
+ normalization: UrlNormalization;
38
+ source: ReadPageCacheSource;
39
+ extractor: "defuddle";
40
+ extraction: string;
41
+ parse_mode: string;
42
+ browser_profile?: "persistent" | "temporary";
43
+ user_action: boolean;
44
+ confidence: ConfidenceReport;
45
+ metadata: PageMetadata;
46
+ fetched_at: string;
47
+ expires_at: string;
48
+ ttl_days: number;
49
+ content_sha256: string;
50
+ chars: number;
51
+ lines: number;
52
+ };
53
+
54
+ export type CachedDocument = {
55
+ markdown: string;
56
+ meta: CacheMeta;
57
+ fresh: boolean;
58
+ };
59
+
60
+ export type Pagination = {
61
+ selected: string;
62
+ totalLines: number;
63
+ shownStart: number;
64
+ shownEnd: number;
65
+ nextOffset?: number;
66
+ truncated: boolean;
67
+ shownBytes: number;
68
+ totalBytes: number;
69
+ };
70
+
71
/**
 * Computes the SHA-256 digest of a string.
 *
 * @param input - Text to hash.
 * @returns The digest as a lowercase hexadecimal string.
 */
export function sha256(input: string): string {
  const hasher = createHash("sha256");
  hasher.update(input);
  return hasher.digest("hex");
}
74
+
75
/**
 * Turns arbitrary text into a short slug made of `a-z`, `0-9`, `.`, `_` and `-`.
 *
 * Runs of any other character become a single dash, edge dashes are removed,
 * and the result is capped at 80 characters (a dash left dangling by the cap is
 * dropped as well).
 *
 * @param input - Text to slugify.
 * @param fallback - Value returned when nothing usable remains.
 * @returns The slug, or `fallback` when the slug would be empty.
 */
function slugify(input: string, fallback: string): string {
  const lowered = input.toLowerCase();
  const dashed = lowered.replace(/[^a-z0-9._-]+/g, "-");
  const collapsed = dashed.replace(/-+/g, "-");
  const trimmed = collapsed.replace(/^-|-$/g, "");
  // Cutting at 80 characters can expose a dash at the end; strip it again.
  const capped = trimmed.slice(0, 80).replace(/-$/g, "");
  return capped === "" ? fallback : capped;
}
85
+
86
/**
 * Builds the cache directory name for a URL.
 *
 * The key has the shape `<host-slug>--<path-slug>--<hash>`, where the hash is
 * the first 12 hex characters of the SHA-256 of the full URL string.
 *
 * @param url - Absolute URL; `new URL` throws if it cannot be parsed.
 * @returns The cache key.
 */
function cacheKey(url: string): string {
  const { hostname, pathname, search } = new URL(url);
  const digestPrefix = sha256(url).slice(0, 12);
  const hostSlug = slugify(hostname, "unknown-host");
  const location = `${pathname}${search}`;
  // A bare "/" would slugify to nothing, so it is spelled out as "root".
  const pathSlug = slugify(location === "/" ? "root" : location, "root");
  return [hostSlug, pathSlug, digestPrefix].join("--");
}
94
+
95
/**
 * Resolves the on-disk locations used to cache a URL.
 *
 * @param url - Absolute URL identifying the cache entry.
 * @returns The entry directory under `CACHE_DIR`, the paths of `content.md`
 *   and `meta.json` inside it, the cache key, and the SHA-256 of the URL.
 */
export function cachePaths(url: string): {
  dirPath: string;
  mdPath: string;
  metaPath: string;
  key: string;
  urlSha256: string;
} {
  const key = cacheKey(url);
  const dirPath = path.join(CACHE_DIR, key);
  const mdPath = path.join(dirPath, "content.md");
  const metaPath = path.join(dirPath, "meta.json");
  const urlSha256 = sha256(url);
  return { dirPath, mdPath, metaPath, key, urlSha256 };
}
112
+
113
/**
 * Counts the lines in a markdown string, treating both `\n` and `\r\n` as
 * line breaks. An empty string counts as one line.
 *
 * @param markdown - Text to measure.
 * @returns Number of lines (line breaks plus one).
 */
export function countLines(markdown: string): number {
  const lineBreaks = markdown.match(/\r?\n/g);
  return (lineBreaks?.length ?? 0) + 1;
}
116
+
117
/**
 * Selects a window of lines from `markdown` and caps it by byte size.
 *
 * @param markdown - Full document text; lines are split on `\n` or `\r\n`.
 * @param offset - 1-based number of the first line to show. Values below 1
 *   (and NaN) are treated as 1; fractional values are truncated.
 * @param limit - Maximum number of lines to show. Values below 1 (and NaN) are
 *   treated as 1 so that every page makes forward progress.
 * @param maxBytes - Byte budget passed to `truncateHead`.
 * @returns The selected text plus bookkeeping: total line count, the 1-based
 *   range actually shown, the offset of the next page (if any), and the
 *   truncation flag and byte counts reported by `truncateHead`.
 */
export function paginate(
  markdown: string,
  offset: number,
  limit: number,
  maxBytes: number = DEFAULT_MAX_BYTES,
): Pagination {
  const lines = markdown.split(/\r?\n/);
  const totalLines = lines.length;
  // Normalise the offset to an integer index so the reported range always
  // matches the lines that are really sliced out below.
  const startIndex = Number.isNaN(offset)
    ? 0
    : Math.max(0, Math.trunc(offset) - 1);
  if (startIndex >= totalLines) {
    return {
      selected: "",
      totalLines,
      shownStart: totalLines + 1,
      shownEnd: totalLines,
      truncated: false,
      shownBytes: 0,
      totalBytes: 0,
    };
  }
  // A window of zero (or fewer) lines would produce nextOffset === offset and
  // trap a caller that keeps following nextOffset, so at least one line is used.
  const lineBudget = Number.isNaN(limit) ? 1 : Math.max(1, Math.trunc(limit));
  const endIndex = Math.min(totalLines, startIndex + lineBudget);
  const windowText = lines.slice(startIndex, endIndex).join("\n");
  // Truncate the selected window by bytes here so nextOffset reflects the lines
  // actually emitted. Computing nextOffset from the full line window would
  // permanently skip any tail lines dropped by byte truncation.
  const truncation = truncateHead(windowText, { maxBytes });
  // Math.max(1, ...) guarantees forward progress even when a single oversized
  // line yields zero output lines, avoiding a nextOffset === offset loop.
  const shownLineCount = truncation.truncated
    ? Math.max(1, truncation.outputLines)
    : endIndex - startIndex;
  const shownEnd = startIndex + shownLineCount;
  return {
    selected: truncation.content,
    totalLines,
    // Derived from the clamped index: an offset of 0 or less still shows line 1.
    shownStart: startIndex + 1,
    shownEnd,
    nextOffset: shownEnd < totalLines ? shownEnd + 1 : undefined,
    truncated: truncation.truncated,
    shownBytes: truncation.outputBytes,
    totalBytes: truncation.totalBytes,
  };
}
160
+
161
/**
 * Loads the cached document for a URL, if a usable one exists.
 *
 * Any failure while reading or validating the entry results in `undefined`.
 * Failures other than a missing file (`ENOENT`) are additionally reported with
 * `console.warn`.
 *
 * @param url - Normalized URL identifying the cache entry.
 * @returns The cached document, or `undefined` when none could be loaded.
 */
export async function loadCached(
  url: string,
): Promise<CachedDocument | undefined> {
  const paths = cachePaths(url);
  let cached: CachedDocument | undefined;
  try {
    cached = await loadCachedFromPaths(url, paths);
  } catch (error) {
    const missing = isNotFoundError(error);
    if (!missing) {
      console.warn(
        `[read-page] Ignoring corrupt cache for ${url}: ${errorMessage(error)}`,
      );
    }
  }
  return cached;
}
176
+
177
/**
 * Reads and validates a cache entry from explicit file paths.
 *
 * @param url - URL the entry is expected to belong to.
 * @param paths - Locations of the markdown content and the JSON metadata.
 * @returns The markdown, its metadata, and whether `expires_at` is still in
 *   the future.
 * @throws If either file cannot be read, the metadata is not valid JSON, the
 *   stored URL differs from `url`, the content checksum does not match, or
 *   `expires_at` is not a parseable date.
 */
export async function loadCachedFromPaths(
  url: string,
  paths: { mdPath: string; metaPath: string },
): Promise<CachedDocument> {
  const markdownRead = readFile(paths.mdPath, "utf8");
  const metaRead = readFile(paths.metaPath, "utf8");
  const [markdown, metaRaw] = await Promise.all([markdownRead, metaRead]);

  const meta = JSON.parse(metaRaw) as CacheMeta;
  if (meta.url !== url) {
    throw new Error("Cache URL mismatch");
  }
  const actualChecksum = sha256(markdown);
  if (actualChecksum !== meta.content_sha256) {
    throw new Error("Cache checksum mismatch");
  }
  const expiresAtMs = Date.parse(meta.expires_at);
  if (!Number.isFinite(expiresAtMs)) {
    throw new Error("Invalid cache expires_at");
  }

  return { markdown, meta, fresh: Date.now() < expiresAtMs };
}
194
+
195
/**
 * Reports whether a thrown value is an object whose `code` is `"ENOENT"`.
 *
 * @param error - Value caught from a `try` block.
 * @returns `true` only for non-null objects with `code === "ENOENT"`.
 */
function isNotFoundError(error: unknown): boolean {
  if (error === null || typeof error !== "object") return false;
  if (!("code" in error)) return false;
  return (error as { code?: unknown }).code === "ENOENT";
}
203
+
204
/**
 * Extracts a printable message from a caught value.
 *
 * @param error - Value caught from a `try` block.
 * @returns `error.message` for `Error` instances, otherwise `String(error)`.
 */
function errorMessage(error: unknown): string {
  if (error instanceof Error) {
    return error.message;
  }
  return String(error);
}
207
+
208
/**
 * Writes `content` to `filePath` by first writing a temporary sibling file and
 * then renaming it over the destination.
 *
 * The temporary name includes a random UUID so that two writes to the same
 * destination issued by this process within the same millisecond do not share
 * a temp file. If the write or the rename fails, the temporary file is removed
 * (best effort) before the original error is rethrown.
 *
 * @param filePath - Destination path.
 * @param content - UTF-8 text to write.
 * @throws Whatever `writeFile` or `rename` rejected with.
 */
async function writeFileAtomic(
  filePath: string,
  content: string,
): Promise<void> {
  const tmpPath = `${filePath}.${process.pid}.${Date.now()}.${randomUUID()}.tmp`;
  try {
    await writeFile(tmpPath, content, "utf8");
    await rename(tmpPath, filePath);
  } catch (error) {
    // Do not leave stray *.tmp files behind; a cleanup failure must not mask
    // the error that actually caused the write to fail.
    await rm(tmpPath, { force: true }).catch(() => undefined);
    throw error;
  }
}
216
+
217
/**
 * Stores an extracted page in the cache and returns the metadata written.
 *
 * The entry directory is created if needed, the markdown is written to the
 * entry's content file, and the metadata is then written as pretty-printed
 * JSON followed by a newline. The entry expires `USER_ACTION_TTL_DAYS` days
 * after now when `userAction` is set, otherwise `DEFAULT_TTL_DAYS` days.
 *
 * @param params - Normalized URL, final URL, markdown, and extraction details
 *   to record alongside the content.
 * @returns The metadata object that was serialized to disk.
 */
export async function saveCached(params: {
  normalized: NormalizedUrl;
  finalUrl: string;
  markdown: string;
  extractor: "defuddle";
  extraction: string;
  parseMode: string;
  userAction: boolean;
  confidence: ConfidenceReport;
  metadata: PageMetadata;
  browserProfile?: "persistent" | "temporary";
}): Promise<CacheMeta> {
  const { normalized, markdown, userAction } = params;
  const location = cachePaths(normalized.url);
  await mkdir(location.dirPath, { recursive: true });

  const fetchedAtMs = Date.now();
  const ttlDays = userAction ? USER_ACTION_TTL_DAYS : DEFAULT_TTL_DAYS;
  const expiresAtMs = fetchedAtMs + ttlDays * 24 * 60 * 60 * 1000;

  // Property order is kept stable because it determines the layout of the
  // serialized JSON.
  const meta: CacheMeta = {
    version: 1,
    input_url: normalized.inputUrl,
    url: normalized.url,
    final_url: params.finalUrl,
    cache_key: location.key,
    url_sha256: location.urlSha256,
    normalization: normalized.normalization,
    source: "browser",
    extractor: params.extractor,
    extraction: params.extraction,
    parse_mode: params.parseMode,
    browser_profile: params.browserProfile,
    user_action: userAction,
    confidence: params.confidence,
    metadata: params.metadata,
    fetched_at: new Date(fetchedAtMs).toISOString(),
    expires_at: new Date(expiresAtMs).toISOString(),
    ttl_days: ttlDays,
    content_sha256: sha256(markdown),
    chars: markdown.length,
    lines: countLines(markdown),
  };

  await writeFileAtomic(location.mdPath, markdown);
  const serialized = JSON.stringify(meta, null, 2);
  await writeFileAtomic(location.metaPath, `${serialized}\n`);
  return meta;
}
@@ -0,0 +1,345 @@
1
+ import { lookup } from "node:dns/promises";
2
+ import { isIP } from "node:net";
3
+
4
+ export type UrlNormalization = {
5
+ strip_fragment: boolean;
6
+ strip_query: boolean;
7
+ strip_trailing_slash: boolean;
8
+ };
9
+
10
+ export type NormalizedUrl = {
11
+ inputUrl: string;
12
+ url: string;
13
+ normalization: UrlNormalization;
14
+ };
15
+
16
+ const dnsPolicyChecks = new Map<string, Promise<void>>();
17
+ const MAX_FETCH_REDIRECTS = 20;
18
+
19
/**
 * Parses an http(s) URL, applies the host policy, and returns a canonical form.
 *
 * The fragment is always removed, the query is removed unless
 * `options.preserveQuery` is set, and trailing slashes are removed from any
 * path other than `/`.
 *
 * @param input - URL as supplied by the caller; surrounding whitespace is ignored.
 * @param options - `preserveQuery` keeps the query string when `true`.
 * @returns The trimmed input, the normalized URL, and flags describing which
 *   normalizations were applied.
 * @throws If the URL is invalid, not http(s), or refused by the host policy.
 */
export function normalizeHttpUrl(
  input: string,
  options: { preserveQuery: boolean },
): NormalizedUrl {
  const target = parseHttpUrl(input);
  const inputUrl = input.trim();

  target.hostname = target.hostname.toLowerCase();
  enforceHostPolicy(target.hostname);
  target.hash = "";

  const stripQuery = !options.preserveQuery;
  if (stripQuery) {
    target.search = "";
  }

  const { pathname } = target;
  const stripTrailingSlash = pathname.endsWith("/") && pathname !== "/";
  if (stripTrailingSlash) {
    target.pathname = pathname.replace(/\/+$/g, "");
  }

  const normalization: UrlNormalization = {
    strip_fragment: true,
    strip_query: stripQuery,
    strip_trailing_slash: stripTrailingSlash,
  };
  return { inputUrl, url: target.toString(), normalization };
}
48
+
49
/**
 * Verifies that a URL may be fetched under the private-network policy.
 *
 * The hostname is checked by name and literal IP first. Unless private
 * networks are allowed or the host is an IP literal, the hostname is then
 * resolved and every resolved address is checked. Concurrent checks for the
 * same host share one in-flight lookup.
 *
 * @param url - URL about to be requested.
 * @throws If the URL is invalid, not http(s), or refused by the host or DNS policy.
 */
export async function assertHttpUrlAllowed(url: string): Promise<void> {
  const { hostname } = parseHttpUrl(url);
  enforceHostPolicy(hostname);

  if (allowsPrivateNetwork()) return;
  const host = normalizeHost(hostname);
  // IP literals were already vetted by enforceHostPolicy; nothing to resolve.
  if (isIP(host) !== 0) return;

  const inFlight = dnsPolicyChecks.get(host);
  if (inFlight !== undefined) {
    await inFlight;
    return;
  }

  // The entry is removed once the lookup settles, but only if it is still the
  // one registered by this call.
  const pending: Promise<void> = enforceDnsPolicy(host).finally(() => {
    if (dnsPolicyChecks.get(host) === pending) {
      dnsPolicyChecks.delete(host);
    }
  });
  dnsPolicyChecks.set(host, pending);
  await pending;
}
70
+
71
/**
 * Reports whether a string parses as a URL with the `http:` or `https:` scheme.
 *
 * @param url - Candidate URL.
 * @returns `false` for unparseable input or any other scheme.
 */
export function isHttpLikeUrl(url: string): boolean {
  let protocol: string;
  try {
    protocol = new URL(url).protocol;
  } catch {
    return false;
  }
  return protocol === "http:" || protocol === "https:";
}
79
+
80
/**
 * Wraps a `fetch` implementation so that every requested URL, including each
 * redirect hop, is checked with `assertHttpUrlAllowed` before it is contacted.
 *
 * For redirect modes other than `"follow"` the URL is checked once and the
 * call is passed through unchanged. In `"follow"` mode the request is issued
 * with `redirect: "manual"` and redirects are followed here, at most
 * `MAX_FETCH_REDIRECTS` times.
 *
 * Because redirects are followed manually, two things the platform would
 * otherwise take care of are done explicitly:
 * - credential headers (`Authorization`, `Cookie`, `Proxy-Authorization`) set
 *   through `init.headers` are dropped when a redirect leaves the current origin;
 * - the body of each intermediate redirect response is cancelled so that it
 *   does not stay open.
 *
 * @param delegate - Underlying fetch implementation; defaults to the global one.
 * @returns A fetch-compatible function enforcing the URL policy.
 * @throws "Fetch redirect limit exceeded" when the redirect limit is passed,
 *   plus anything thrown by the policy check or the delegate.
 */
export function createPolicyFetch(
  delegate: typeof globalThis.fetch = globalThis.fetch,
): typeof globalThis.fetch {
  return async (input, init) => {
    const redirectMode = fetchRedirectMode(input, init);
    if (redirectMode !== "follow") {
      await assertHttpUrlAllowed(fetchInputUrl(input));
      return delegate(input, init);
    }

    let nextInput: RequestInfo | URL = input;
    let nextInit: RequestInit = { ...init, redirect: "manual" };

    for (let redirects = 0; redirects <= MAX_FETCH_REDIRECTS; redirects += 1) {
      const currentUrl = fetchInputUrl(nextInput);
      await assertHttpUrlAllowed(currentUrl);
      const response = await delegate(nextInput, nextInit);
      if (!isRedirectResponse(response)) return response;

      const location = response.headers.get("location");
      // A redirect status without a Location header is handed back as-is.
      if (!location) return response;

      // The redirect response itself is not returned from here on; release it.
      try {
        await response.body?.cancel();
      } catch {
        // Best effort only: failing to cancel must not abort the redirect.
      }

      if (redirects === MAX_FETCH_REDIRECTS) {
        throw new Error("Fetch redirect limit exceeded");
      }

      const target = new URL(location, currentUrl);
      let followInit = nextRedirectInit(nextInput, nextInit, response.status);
      if (
        followInit.headers !== undefined &&
        target.origin !== new URL(currentUrl).origin
      ) {
        // Credentials meant for the original origin must not follow the
        // request to a different one.
        const headers = new Headers(followInit.headers);
        for (const name of ["authorization", "cookie", "proxy-authorization"]) {
          headers.delete(name);
        }
        followInit = { ...followInit, headers };
      }

      nextInput = target.href;
      nextInit = followInit;
    }

    // Not reachable (the loop returns or throws); keeps the return type total.
    throw new Error("Fetch redirect limit exceeded");
  };
}
113
+
114
/**
 * Extracts the URL string from any accepted `fetch` input.
 *
 * @param input - A URL string, a `URL`, or an object exposing a string `url`
 *   property (such as a `Request`).
 * @returns The URL as a string.
 * @throws If no string URL can be found on the input.
 */
function fetchInputUrl(input: RequestInfo | URL): string {
  if (input instanceof URL) return input.href;
  if (typeof input === "string") return input;

  const candidate =
    typeof input === "object" && input !== null && "url" in input
      ? input.url
      : undefined;
  if (typeof candidate === "string") return candidate;

  throw new Error("Unable to determine fetch request URL");
}
123
+
124
/**
 * Determines the redirect mode that applies to a `fetch` call.
 *
 * A truthy `init.redirect` wins; otherwise a valid `redirect` property on an
 * object input is used; otherwise the mode is `"follow"`.
 *
 * @param input - The fetch input.
 * @param init - The fetch options, if any.
 * @returns The effective redirect mode.
 */
function fetchRedirectMode(
  input: RequestInfo | URL,
  init: RequestInit | undefined,
): RequestRedirect {
  const explicit = init?.redirect;
  if (explicit) return explicit;

  if (typeof input !== "object" || input === null || !("redirect" in input)) {
    return "follow";
  }

  const inherited = input.redirect;
  switch (inherited) {
    case "error":
    case "follow":
    case "manual":
      return inherited;
    default:
      return "follow";
  }
}
141
+
142
/**
 * Determines the HTTP method of a `fetch` call, upper-cased.
 *
 * @param input - The fetch input; a string `method` property on an object
 *   input is used when `init.method` is not set.
 * @param init - The fetch options.
 * @returns `init.method`, else the input's method, else `"GET"`.
 */
function fetchMethod(input: RequestInfo | URL, init: RequestInit): string {
  if (init.method) return init.method.toUpperCase();
  if (typeof input === "object" && input !== null && "method" in input) {
    const method = input.method;
    if (typeof method === "string") return method.toUpperCase();
  }
  return "GET";
}

/**
 * Computes the request options for the hop that follows a redirect.
 *
 * Following the Fetch Standard's redirect rules, the request is rewritten to a
 * body-less GET when
 * - the status is 301 or 302 and the method is POST, or
 * - the status is 303 and the method is neither GET nor HEAD.
 * When rewriting, headers that describe the dropped body are removed as well.
 * In every other case `init` is returned unchanged, so a HEAD request stays
 * HEAD across a 303.
 *
 * @param input - Input of the request that received the redirect.
 * @param init - Options of the request that received the redirect.
 * @param status - Redirect status code of the response.
 * @returns Options to use for the next request.
 */
function nextRedirectInit(
  input: RequestInfo | URL,
  init: RequestInit,
  status: number,
): RequestInit {
  const method = fetchMethod(input, init);
  const rewriteToGet =
    ((status === 301 || status === 302) && method === "POST") ||
    (status === 303 && method !== "GET" && method !== "HEAD");
  if (!rewriteToGet) return init;

  const rewritten: RequestInit = { ...init, method: "GET", body: undefined };
  if (init.headers !== undefined) {
    // The body is gone, so headers describing it would now be wrong.
    const headers = new Headers(init.headers);
    for (const name of [
      "content-encoding",
      "content-language",
      "content-length",
      "content-location",
      "content-type",
    ]) {
      headers.delete(name);
    }
    rewritten.headers = headers;
  }
  return rewritten;
}
165
+
166
/**
 * Reports whether a response carries one of the redirect status codes
 * 301, 302, 303, 307 or 308.
 *
 * @param response - Response to inspect.
 * @returns `true` for the listed redirect statuses.
 */
function isRedirectResponse(response: Response): boolean {
  switch (response.status) {
    case 301:
    case 302:
    case 303:
    case 307:
    case 308:
      return true;
    default:
      return false;
  }
}
169
+
170
/**
 * Parses a string as an http or https URL.
 *
 * @param input - URL text; surrounding whitespace is ignored.
 * @returns The parsed `URL`.
 * @throws If the text is not a valid URL or uses a scheme other than
 *   `http:` / `https:`.
 */
function parseHttpUrl(input: string): URL {
  let candidate: URL;
  try {
    candidate = new URL(input.trim());
  } catch {
    throw new Error(`Invalid URL: ${input}`);
  }

  const isHttp =
    candidate.protocol === "http:" || candidate.protocol === "https:";
  if (!isHttp) {
    throw new Error(
      `Only http:// and https:// URLs are supported. Refusing URL: ${input}`,
    );
  }
  return candidate;
}
187
+
188
/**
 * Resolves a hostname and rejects it if any resolved address is private/local.
 *
 * @param host - Hostname to resolve.
 * @throws If the lookup fails, or if at least one resolved address is
 *   classified as private by `isPrivateIp` (the first such address is named
 *   in the message).
 */
async function enforceDnsPolicy(host: string): Promise<void> {
  let records: Array<{ address: string }>;
  try {
    records = await lookup(host, { all: true, verbatim: true });
  } catch (error) {
    throw new Error(
      `Failed to resolve hostname ${host}: ${errorMessage(error)}`,
    );
  }

  const offending = records.find((record) => isPrivateIp(record.address));
  if (offending !== undefined) {
    throw new Error(
      `Refusing hostname ${host} because DNS resolved to private/local address ${offending.address}. Set READ_PAGE_ALLOW_PRIVATE_NETWORK=1 to allow it explicitly.`,
    );
  }
}
206
+
207
/**
 * Rejects hostnames that name, or literally are, a private/local destination.
 *
 * Does nothing when private networks are explicitly allowed. Otherwise
 * `localhost`, names ending in `.localhost`, `.local` or `.internal`, and
 * private/local IP literals are refused. Trailing root dots are ignored for
 * the name checks, so `localhost.` is treated exactly like `localhost`.
 *
 * @param hostname - Hostname as taken from a parsed URL.
 * @throws If the hostname is refused by the policy.
 */
function enforceHostPolicy(hostname: string): void {
  if (allowsPrivateNetwork()) return;

  const host = normalizeHost(hostname);
  // A fully-qualified name may end in a root dot ("localhost.", "db.internal.");
  // compare without it so the dotted spelling cannot slip past the checks.
  const name = host.replace(/\.+$/, "");
  if (
    name === "localhost" ||
    name.endsWith(".localhost") ||
    name.endsWith(".local") ||
    name.endsWith(".internal")
  ) {
    throw new Error(
      `Refusing private/local hostname: ${hostname}. Set READ_PAGE_ALLOW_PRIVATE_NETWORK=1 to allow it explicitly.`,
    );
  }

  if (isIP(host) && isPrivateIp(host)) {
    throw new Error(
      `Refusing private/local IP address: ${hostname}. Set READ_PAGE_ALLOW_PRIVATE_NETWORK=1 to allow it explicitly.`,
    );
  }
}
228
+
229
/**
 * Classifies an IP address string as private/local or not.
 *
 * @param ip - Candidate IPv4 or IPv6 address.
 * @returns The verdict of the matching family check; `true` when the string
 *   is not recognised as an IP address at all.
 */
function isPrivateIp(ip: string): boolean {
  switch (isIP(ip)) {
    case 4:
      return isPrivateIPv4(ip);
    case 6:
      return isPrivateIPv6(ip);
    default:
      // Anything that is not a recognisable address is refused.
      return true;
  }
}
235
+
236
/**
 * Reports whether a dotted-quad IPv4 address lies in a non-public range.
 *
 * Covered ranges: 0/8, 10/8, 100.64/10, 127/8, 169.254/16, 172.16/12,
 * 192.0.0/24, 192.0.2/24, 192.168/16, 198.18/15, 198.51.100/24,
 * 203.0.113/24, and everything from 224.0.0.0 upwards.
 *
 * Only 192.0.0.0/24 and 192.0.2.0/24 are matched inside 192.0.0.0/16; the
 * remainder of that /16 is ordinary public address space and must stay
 * reachable.
 *
 * @param ip - Address in dotted-decimal form.
 * @returns `true` for the ranges above and for malformed input.
 */
function isPrivateIPv4(ip: string): boolean {
  const parts = ip.split(".").map((part) => Number.parseInt(part, 10));
  if (
    parts.length !== 4 ||
    parts.some((part) => !Number.isInteger(part) || part < 0 || part > 255)
  ) {
    // Malformed input is treated as private so that it gets refused.
    return true;
  }

  const [a = 0, b = 0, c = 0] = parts;
  return (
    a === 0 ||
    a === 10 ||
    a === 127 ||
    (a === 100 && b >= 64 && b <= 127) ||
    (a === 169 && b === 254) ||
    (a === 172 && b >= 16 && b <= 31) ||
    (a === 192 && b === 168) ||
    (a === 192 && b === 0 && (c === 0 || c === 2)) ||
    (a === 198 && (b === 18 || b === 19)) ||
    (a === 198 && b === 51 && c === 100) ||
    (a === 203 && b === 0 && c === 113) ||
    a >= 224
  );
}
259
+
260
/**
 * Reports whether an IPv6 address lies in a non-public range.
 *
 * Brackets and any `%zone` suffix are ignored. IPv4-mapped addresses are
 * judged by their embedded IPv4 address. Otherwise the address is private when
 * its first 16-bit word is zero or falls in fe80::/10, fc00::/7 or ff00::/8.
 *
 * @param ip - IPv6 address text.
 * @returns `true` for the ranges above and for unparseable input.
 */
function isPrivateIPv6(ip: string): boolean {
  const address = normalizeHost(ip).split("%")[0] || "";

  const embeddedIpv4 = ipv4FromMappedIPv6(address);
  if (embeddedIpv4) {
    return isPrivateIPv4(embeddedIpv4);
  }

  const words = parseIPv6Words(address);
  if (!words) return true;

  if (address === "::1" || address === "::") return true;

  const lead = words[0] ?? 0;
  if (lead === 0) return true;
  if ((lead & 0xffc0) === 0xfe80) return true;
  if ((lead & 0xfe00) === 0xfc00) return true;
  return (lead & 0xff00) === 0xff00;
}
277
+
278
/**
 * Extracts the IPv4 address from an IPv4-mapped IPv6 address (`::ffff:a.b.c.d`).
 *
 * @param ip - IPv6 address text.
 * @returns The embedded address in dotted-decimal form, or `undefined` when
 *   the input cannot be parsed or is not IPv4-mapped (first five words zero,
 *   sixth word `0xffff`).
 */
function ipv4FromMappedIPv6(ip: string): string | undefined {
  const words = parseIPv6Words(ip);
  if (!words) return undefined;

  const prefixIsZero = words.slice(0, 5).every((word) => word === 0);
  if (!prefixIsZero || words[5] !== 0xffff) {
    return undefined;
  }

  // The last two 16-bit words hold the four IPv4 octets.
  const octets: number[] = [];
  for (const word of [words[6] ?? 0, words[7] ?? 0]) {
    octets.push(word >> 8, word & 0xff);
  }
  return octets.join(".");
}
289
+
290
/**
 * Parses IPv6 address text into its eight 16-bit words.
 *
 * A trailing dotted-decimal IPv4 part is accepted and converted into two
 * words, and a single `::` is expanded into the missing zero words.
 *
 * @param ip - IPv6 address text without brackets or zone suffix.
 * @returns Eight words, or `undefined` when the text is not a valid address.
 */
function parseIPv6Words(ip: string): number[] | undefined {
  let text = ip.toLowerCase();

  if (text.indexOf(".") !== -1) {
    // Rewrite an embedded IPv4 tail ("…:1.2.3.4") as two hexadecimal words.
    const tailStart = text.lastIndexOf(":") + 1;
    if (tailStart === 0) return undefined;
    const dotted = text.slice(tailStart);
    if (isIP(dotted) !== 4) return undefined;
    const [a, b, c, d] = dotted
      .split(".")
      .map((octet) => Number.parseInt(octet, 10));
    if (
      a === undefined ||
      b === undefined ||
      c === undefined ||
      d === undefined
    ) {
      return undefined;
    }
    const high = ((a << 8) | b).toString(16);
    const low = ((c << 8) | d).toString(16);
    text = `${text.slice(0, tailStart)}${high}:${low}`;
  }

  const halves = text.split("::");
  if (halves.length > 2) return undefined;

  const [head = "", tail = ""] = halves;
  const left = parseIPv6Side(head);
  const right = parseIPv6Side(tail);
  if (!left || !right) return undefined;

  // Without "::" the address must spell out all eight words.
  if (halves.length === 1) {
    return left.length === 8 ? left : undefined;
  }

  // "::" stands for at least one zero word.
  const gap = 8 - left.length - right.length;
  if (gap < 1) return undefined;
  return left.concat(new Array<number>(gap).fill(0), right);
}
325
+
326
/**
 * Parses one side of an IPv6 address (the text before or after `::`) into
 * 16-bit words.
 *
 * @param input - Colon-separated groups of 1–4 lowercase hex digits; may be empty.
 * @returns The parsed words (empty for empty input), or `undefined` when any
 *   group is not 1–4 lowercase hex digits.
 */
function parseIPv6Side(input: string): number[] | undefined {
  if (!input) return [];

  const words: number[] = [];
  for (const group of input.split(":")) {
    if (!/^[0-9a-f]{1,4}$/.test(group)) return undefined;
    words.push(Number.parseInt(group, 16));
  }
  return words;
}
334
+
335
/**
 * Canonicalises a hostname for comparison: removes a leading `[` and a
 * trailing `]` (as found around IPv6 literals) and lower-cases the rest.
 *
 * @param hostname - Hostname as taken from a URL.
 * @returns The bracket-free, lower-cased hostname.
 */
function normalizeHost(hostname: string): string {
  const unbracketed = hostname.replace(/^\[|\]$/g, "");
  return unbracketed.toLowerCase();
}
338
+
339
/**
 * Reports whether access to private/local networks has been explicitly
 * enabled through the environment.
 *
 * @returns `true` only when `READ_PAGE_ALLOW_PRIVATE_NETWORK` is exactly `"1"`.
 */
function allowsPrivateNetwork(): boolean {
  const flag = process.env.READ_PAGE_ALLOW_PRIVATE_NETWORK;
  return flag === "1";
}
342
+
343
/**
 * Extracts a printable message from a caught value.
 *
 * @param error - Value caught from a `try` block.
 * @returns `error.message` for `Error` instances, otherwise `String(error)`.
 */
function errorMessage(error: unknown): string {
  if (error instanceof Error) {
    return error.message;
  }
  return String(error);
}