pi-read-page 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Shuqian
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,233 @@
1
+ # pi-read-page
2
+
3
+ Let [pi](https://github.com/earendil-works/pi-coding-agent) read webpages through your local browser and return Markdown.
4
+
5
+ ## What it provides
6
+
7
+ - One read-only Agent tool: `read-page`.
8
+ - Local Chrome/Chromium rendering.
9
+ - Manual handoff for login/captcha/blocked states.
10
+ - Markdown output with pagination and cache.
11
+ - Defensive defaults for untrusted webpages and private-network access.
12
+
13
+ ## Requirements
14
+
15
+ - pi.
16
+ - A local Chrome/Chromium browser.
17
+ - Bun only if you are developing or running tests locally.
18
+
19
+ `pi-read-page` uses `playwright-core`; it does not download a browser. By default it launches the `chrome` channel. Set `READ_PAGE_CHROME_PATH` or `READ_PAGE_BROWSER_CHANNEL` if needed.
20
+
21
+ ## Installation
22
+
23
+ Install from npm:
24
+
25
+ ```bash
26
+ pi install npm:pi-read-page
27
+ ```
28
+
29
+ Try it for one pi run without installing:
30
+
31
+ ```bash
32
+ pi -e npm:pi-read-page
33
+ ```
34
+
35
+ Install from GitHub if you want the latest repository version:
36
+
37
+ ```bash
38
+ pi install https://github.com/Sukitly/pi-read-page
39
+ ```
40
+
41
+ Use a local checkout:
42
+
43
+ ```bash
44
+ git clone https://github.com/Sukitly/pi-read-page.git
45
+ cd pi-read-page
46
+ bun install
47
+ pi -e .
48
+ ```
49
+
50
+ ## Usage
51
+
52
+ Ask pi to read a URL:
53
+
54
+ ```text
55
+ Read https://example.com
56
+ ```
57
+
58
+ The extension registers one Agent-facing tool:
59
+
60
+ ```text
61
+ read-page(url, offset?, limit?, refresh?, preserveQuery?)
62
+ ```
63
+
64
+ Parameters:
65
+
66
+ | Parameter | Default | Description |
67
+ | --- | --- | --- |
68
+ | `url` | required | HTTP or HTTPS URL to read. |
69
+ | `offset` | `1` | 1-based line offset for pagination. |
70
+ | `limit` | `300` | Number of lines to return. Maximum `1000`. |
71
+ | `refresh` | `false` | Force browser re-extraction and overwrite cache. |
72
+ | `preserveQuery` | `false` | Preserve URL query parameters. By default query params are stripped for canonical cache keys. |
73
+
74
+ Use the returned `Next offset` to continue reading long pages.
75
+
76
+ ## How extraction works
77
+
78
+ ```text
79
+ URL normalization and private-network policy
80
+ -> headed Playwright browser
81
+ -> DOMContentLoaded + network idle wait
82
+ -> final URL private-network policy
83
+ -> read-only lazy-load scroll
84
+ -> open shadow-root flattening
85
+ -> URL absolutization
86
+ -> Defuddle HTML/Markdown extraction
87
+ -> confidence and handoff detection
88
+ -> local cache write
89
+ -> paginated Markdown output
90
+ ```
91
+
92
+ If the page appears to require a real user action, pi shows a confirmation prompt and leaves the headed browser open. Complete the login/captcha/manual navigation in that browser, then confirm in pi. The same browser page is settled and extracted again. After the tool call completes, the page and browser context are closed.
93
+
94
+ ## Cache
95
+
96
+ Successful browser extractions are cached under:
97
+
98
+ ```text
99
+ ~/.pi/agent/caches/read-page
100
+ ```
101
+
102
+ Cache behavior:
103
+
104
+ - Normal TTL: 30 days.
105
+ - User-action TTL: 1 day.
106
+ - Cache files: `content.md` and `meta.json`.
107
+ - Writes are atomic.
108
+ - Cached Markdown is sha256-verified on load.
109
+ - If refresh/extraction fails and a cache entry exists, the tool returns cached content with an explicit `refresh-failed-fresh` or `stale-fallback` status.
110
+
111
+ ## Security model
112
+
113
+ `read-page` treats webpages as untrusted external content.
114
+
115
+ - The output includes a security notice and document boundary.
116
+ - The Agent is instructed not to follow instructions from the page unless the user explicitly asks.
117
+ - Private/local hosts and IPs are blocked by default.
118
+ - Browser automation is read-only: it may navigate, wait, scroll, extract DOM, and cache content.
119
+ - The extension does not expose browser mutation/control tools to the Agent.
120
+ - User handoff is only used for actionable captcha, blocked/interstitial, or explicit login-wall states.
121
+
122
+ To intentionally allow private/local network URLs:
123
+
124
+ ```bash
125
+ READ_PAGE_ALLOW_PRIVATE_NETWORK=1 pi
126
+ ```
127
+
128
+ ## Configuration
129
+
130
+ Optional environment variables:
131
+
132
+ | Variable | Default | Description |
133
+ | --- | --- | --- |
134
+ | `READ_PAGE_CHROME_PATH` | unset | Explicit Chrome/Chromium executable path. |
135
+ | `READ_PAGE_BROWSER_CHANNEL` | `chrome` | Playwright browser channel. |
136
+ | `READ_PAGE_PROFILE_DIR` | `~/.pi/agent/read-page/browser-profile` | Persistent browser profile directory. |
137
+ | `READ_PAGE_DISABLE_TEMP_PROFILE_FALLBACK` | unset | Set to `1` to fail instead of using a temporary profile when the persistent profile is locked. |
138
+ | `READ_PAGE_ALLOW_PRIVATE_NETWORK` | unset | Set to `1` to allow private/local network access. |
139
+ | `READ_PAGE_PARSE_TIMEOUT_MS` | `8000` | Defuddle parse timeout before sync fallback. |
140
+ | `READ_PAGE_DEFUDDLE_ASYNC` | unset | Set to `1` to allow Defuddle third-party async extraction. |
141
+ | `READ_PAGE_DEFUDDLE_DEBUG` | unset | Set to `1` to include Defuddle debug information. |
142
+
143
+ ## Development
144
+
145
+ Install dependencies:
146
+
147
+ ```bash
148
+ bun install
149
+ ```
150
+
151
+ Run deterministic checks:
152
+
153
+ ```bash
154
+ bun run lint
155
+ bun run test
156
+ ```
157
+
158
+ Run the browser integration test:
159
+
160
+ ```bash
161
+ bun run integration -- https://example.com
162
+ ```
163
+
164
+ The integration test opens a real browser, extracts the page, prints extraction metadata, and closes the browser context.
165
+
166
+ ## Publishing
167
+
168
+ Pi package catalog entries are discovered from public npm packages with the `pi-package` keyword.
169
+
170
+ Before publishing:
171
+
172
+ ```bash
173
+ bun run lint
174
+ bun run test
175
+ npm pack --dry-run
176
+ ```
177
+
178
+ Publish:
179
+
180
+ ```bash
181
+ npm login
182
+ npm publish --access public
183
+ ```
184
+
185
+ After publishing, install with:
186
+
187
+ ```bash
188
+ pi install npm:pi-read-page
189
+ ```
190
+
191
+ ## Project layout
192
+
193
+ ```text
194
+ extensions/pi-read-page.ts extension entrypoint
195
+ src/tools/read-page.ts tool orchestration, output formatting, TUI rendering
196
+ src/browser/ browser lifecycle, extraction, handoff, confidence
197
+ src/cache/cache.ts cache, pagination, checksums
198
+ src/security/url-policy.ts URL normalization and private-network policy
199
+ test/ deterministic unit tests
200
+ scripts/integration-read-page.ts browser integration runner
201
+ ```
202
+
203
+ ## Troubleshooting
204
+
205
+ ### Chrome is not found
206
+
207
+ Install Google Chrome/Chromium, or set:
208
+
209
+ ```bash
210
+ READ_PAGE_CHROME_PATH=/path/to/chrome pi
211
+ ```
212
+
213
+ ### Login state is missing
214
+
215
+ By default the extension uses a persistent profile at:
216
+
217
+ ```text
218
+ ~/.pi/agent/read-page/browser-profile
219
+ ```
220
+
221
+ If that profile is already locked by another browser process, `read-page` falls back to a temporary profile. The tool output will include a warning when this happens.
222
+
223
+ ### Query parameters were removed
224
+
225
+ Set `preserveQuery: true` when query parameters are required for the page content, such as search results, filters, or app/detail pages.
226
+
227
+ ### Localhost or private IP is blocked
228
+
229
+ This is intentional. Use `READ_PAGE_ALLOW_PRIVATE_NETWORK=1` only when you explicitly want to read local/private services.
230
+
231
+ ## License
232
+
233
+ MIT. See [LICENSE](LICENSE).
@@ -0,0 +1,11 @@
1
+ import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
2
+ import { closeBrowser } from "../src/browser/browser-manager";
3
+ import { registerReadPageTool } from "../src/tools/read-page";
4
+
5
/**
 * Extension entrypoint for pi.
 *
 * Registers the `read-page` tool and subscribes to `session_shutdown` so the
 * browser managed by this package is closed when the session ends.
 *
 * @param pi - Extension API handle supplied by the pi coding agent.
 */
export default function readPageExtension(pi: ExtensionAPI) {
  registerReadPageTool(pi);

  // Close the managed browser when the session shuts down.
  const shutDownBrowser = async (): Promise<void> => {
    await closeBrowser();
  };
  pi.on("session_shutdown", shutDownBrowser);
}
package/package.json ADDED
@@ -0,0 +1,65 @@
1
+ {
2
+ "name": "pi-read-page",
3
+ "version": "0.1.0",
4
+ "description": "Read webpages through a local browser and return Markdown for Pi coding agent.",
5
+ "type": "module",
6
+ "license": "MIT",
7
+ "author": "Sukitly",
8
+ "repository": {
9
+ "type": "git",
10
+ "url": "git+https://github.com/Sukitly/pi-read-page.git"
11
+ },
12
+ "homepage": "https://github.com/Sukitly/pi-read-page#readme",
13
+ "bugs": {
14
+ "url": "https://github.com/Sukitly/pi-read-page/issues"
15
+ },
16
+ "keywords": [
17
+ "pi-package",
18
+ "pi-extension",
19
+ "pi",
20
+ "read-page",
21
+ "browser",
22
+ "markdown"
23
+ ],
24
+ "files": [
25
+ "extensions",
26
+ "src",
27
+ "README.md",
28
+ "LICENSE",
29
+ "package.json"
30
+ ],
31
+ "publishConfig": {
32
+ "access": "public"
33
+ },
34
+ "pi": {
35
+ "extensions": [
36
+ "./extensions/pi-read-page.ts"
37
+ ]
38
+ },
39
+ "scripts": {
40
+ "typecheck": "tsc --noEmit",
41
+ "test": "vitest run",
42
+ "integration": "tsx scripts/integration-read-page.ts",
43
+ "lint": "bun run typecheck && bunx --bun @biomejs/biome check --error-on-warnings --write ."
44
+ },
45
+ "dependencies": {
46
+ "defuddle": "^0.18.1",
47
+ "linkedom": "^0.18.12",
48
+ "playwright-core": "^1.60.0"
49
+ },
50
+ "peerDependencies": {
51
+ "@earendil-works/pi-coding-agent": "*",
52
+ "@earendil-works/pi-tui": "*",
53
+ "typebox": "*"
54
+ },
55
+ "devDependencies": {
56
+ "@biomejs/biome": "2.5.0",
57
+ "@earendil-works/pi-coding-agent": "^0.79.2",
58
+ "@earendil-works/pi-tui": "^0.79.2",
59
+ "@types/node": "^25.9.3",
60
+ "tsx": "^4.22.4",
61
+ "typebox": "1.1.38",
62
+ "typescript": "^6.0.3",
63
+ "vitest": "^4.1.8"
64
+ }
65
+ }
@@ -0,0 +1,329 @@
1
+ import { mkdir, mkdtemp, rm } from "node:fs/promises";
2
+ import { homedir, tmpdir } from "node:os";
3
+ import { join, resolve } from "node:path";
4
+ import { type BrowserContext, chromium, type Page } from "playwright-core";
5
+ import { assertHttpUrlAllowed, isHttpLikeUrl } from "../security/url-policy";
6
+
7
/** A launched browser context together with the profile directory it was launched with. */
type ManagedBrowserContext = {
  /** Persistent Playwright context that pages are opened in. */
  context: BrowserContext;
  /** User-data directory passed to the launcher for this context. */
  profileDir: string;
  /** Present only when a throwaway profile is in use; removed when the context is closed. */
  temporaryProfileDir?: string;
};

/** The part of Playwright's `chromium` launcher this module calls; replaceable in tests. */
type BrowserAutomation = Pick<typeof chromium, "launchPersistentContext">;

// Launcher used by launchPersistent(); defaults to the real Playwright chromium.
let browserAutomation: BrowserAutomation = chromium;
// The context that finished starting, if any.
let managedContext: ManagedBrowserContext | undefined;
// Startup promise shared by callers that arrive while the browser is launching.
let managedContextPromise: Promise<ManagedBrowserContext> | undefined;
// Incremented by closeBrowser() so an in-flight startup can tell it was superseded.
let contextGeneration = 0;
19
+
20
/**
 * Expands a leading `~` to the current user's home directory.
 *
 * Only the bare `~` and the `~/...` forms are expanded. Any other input,
 * including `~user/...`, is returned unchanged.
 *
 * @param path - Path that may start with `~`.
 * @returns The expanded path, or `path` itself when no expansion applies.
 */
function expandHome(path: string): string {
  const tildePrefix = "~/";
  if (path.startsWith(tildePrefix)) {
    return resolve(homedir(), path.slice(tildePrefix.length));
  }
  return path === "~" ? homedir() : path;
}
25
+
26
/**
 * Builds the default persistent browser profile location:
 * `<home>/.pi/agent/read-page/browser-profile`.
 *
 * @returns Absolute path of the default profile directory.
 */
function defaultProfileDir(): string {
  const segments = [".pi", "agent", "read-page", "browser-profile"];
  return resolve(homedir(), ...segments);
}
29
+
30
/**
 * Returns the managed browser context, starting the browser on first use.
 *
 * Callers that arrive while a launch is in progress wait on the same startup
 * promise. If `closeBrowser()` runs during startup (detected through
 * `contextGeneration`), the call fails rather than returning that context.
 *
 * NOTE(review): the startup promise is shared, and an aborting caller passes
 * `closeManagedContext` as its abort cleanup. Confirm that a concurrent,
 * non-aborted caller cannot be handed a context that this cleanup closes.
 *
 * @param signal - Optional abort signal for the waiting caller.
 * @returns The ready Playwright browser context.
 * @throws An `AbortError` when `signal` is aborted, or the launch error.
 */
async function getContext(signal?: AbortSignal): Promise<BrowserContext> {
  throwIfAborted(signal, "read-page aborted before opening browser");

  const ready = managedContext;
  if (ready) return ready.context;

  // Snapshot the generation so a closeBrowser() during startup is noticed.
  const startedInGeneration = contextGeneration;
  const startup = managedContextPromise ?? createManagedContext();
  managedContextPromise = startup;

  let created: ManagedBrowserContext;
  try {
    created = await abortable(
      startup,
      signal,
      "read-page aborted while starting browser",
      closeManagedContext,
    );
    if (startedInGeneration !== contextGeneration) {
      throw new Error("read-page browser context closed during startup");
    }
  } catch (error) {
    // Forget the failed/abandoned startup, unless a newer one replaced it.
    if (managedContextPromise === startup) managedContextPromise = undefined;
    throw error;
  }

  managedContext = created;
  return created.context;
}
57
+
58
/**
 * Launches the browser against the persistent profile directory, falling back
 * to a fresh temporary profile when that profile is reported as in use.
 *
 * The profile directory comes from `READ_PAGE_PROFILE_DIR` (with `~` expanded)
 * or the default location, and is created if missing. The fallback is skipped
 * when `READ_PAGE_DISABLE_TEMP_PROFILE_FALLBACK` is `"1"` or the launch failed
 * for any other reason; in both cases the original launch error is rethrown.
 *
 * @returns The launched context and the profile directory backing it.
 */
async function createManagedContext(): Promise<ManagedBrowserContext> {
  const configuredDir =
    process.env.READ_PAGE_PROFILE_DIR || defaultProfileDir();
  const profileDir = expandHome(configuredDir);
  await mkdir(profileDir, { recursive: true });

  let launchFailure: unknown;
  try {
    const context = await launchPersistent(profileDir);
    return { context, profileDir };
  } catch (error) {
    launchFailure = error;
  }

  const profileBusy = isProfileInUseError(launchFailure);
  if (
    !profileBusy ||
    process.env.READ_PAGE_DISABLE_TEMP_PROFILE_FALLBACK === "1"
  ) {
    throw launchFailure;
  }

  const temporaryProfileDir = await mkdtemp(
    join(tmpdir(), "read-page-profile-"),
  );
  try {
    const context = await launchPersistent(temporaryProfileDir);
    return { context, profileDir: temporaryProfileDir, temporaryProfileDir };
  } catch (tempError) {
    // Do not leave the throwaway directory behind when the fallback fails too.
    await removeTemporaryProfile(temporaryProfileDir);
    throw tempError;
  }
}
92
+
93
/**
 * Launches a headed persistent browser context on `profileDir` and installs
 * the request-level network policy on it.
 *
 * The channel comes from `READ_PAGE_BROWSER_CHANNEL` (default `chrome`) and
 * the executable from `READ_PAGE_CHROME_PATH` when set. If installing the
 * policy fails, the freshly launched context is closed before rethrowing.
 *
 * @param profileDir - User-data directory to launch with.
 * @returns The launched context with the network policy installed.
 */
async function launchPersistent(profileDir: string): Promise<BrowserContext> {
  const { READ_PAGE_BROWSER_CHANNEL, READ_PAGE_CHROME_PATH } = process.env;
  const launched = await browserAutomation.launchPersistentContext(profileDir, {
    headless: false,
    channel: READ_PAGE_BROWSER_CHANNEL || "chrome",
    executablePath: READ_PAGE_CHROME_PATH || undefined,
    viewport: null,
    args: ["--disable-blink-features=AutomationControlled"],
  });

  try {
    await installNetworkPolicy(launched);
  } catch (error) {
    await launched.close().catch(() => undefined);
    throw error;
  }
  return launched;
}
113
+
114
/**
 * Routes every request made by `browserContext` through the URL policy.
 *
 * Requests whose URL is not HTTP-like are passed through untouched. HTTP-like
 * requests continue only when `assertHttpUrlAllowed` accepts the URL; rejected
 * URLs are aborted with the `blockedbyclient` error code.
 *
 * The policy decision is made separately from route handling, so a failure of
 * `route.continue()` itself is not mistaken for a policy rejection and does
 * not trigger a second `route.abort()` on the same route.
 *
 * @param browserContext - Context to install the route handler on.
 */
async function installNetworkPolicy(
  browserContext: BrowserContext,
): Promise<void> {
  await browserContext.route("**/*", async (route) => {
    const url = route.request().url();
    if (!isHttpLikeUrl(url)) {
      await route.continue();
      return;
    }

    // Only the policy check decides whether the request is blocked.
    let allowed: boolean;
    try {
      await assertHttpUrlAllowed(url);
      allowed = true;
    } catch {
      allowed = false;
    }

    if (allowed) {
      await route.continue();
    } else {
      await route.abort("blockedbyclient");
    }
  });
}
132
+
133
/**
 * Tells whether a launch failure looks like "the profile is locked by another
 * browser process", based on the error text (case-insensitive).
 *
 * @param error - Whatever the launch attempt threw.
 * @returns `true` when the message matches one of the known lock phrases.
 */
function isProfileInUseError(error: unknown): boolean {
  const lockedProfilePattern =
    /existing browser session|profile is already in use|user data directory is already in use/i;
  const text = error instanceof Error ? error.message : String(error);
  return lockedProfilePattern.test(text);
}
139
+
140
/**
 * Test hook: swaps the launcher used to start the browser.
 *
 * @param automation - Replacement launcher, or `undefined` to restore the
 *   real Playwright `chromium` launcher.
 */
export function setBrowserAutomationForTest(
  automation: BrowserAutomation | undefined,
): void {
  if (automation == null) {
    browserAutomation = chromium;
    return;
  }
  browserAutomation = automation;
}
145
+
146
/**
 * Reports which profile the currently managed browser context is using.
 *
 * @returns `profileDir` of the active context (`undefined` when none is
 *   active) and whether that context runs on a temporary profile.
 */
export function getBrowserRuntimeInfo() {
  const active = managedContext;
  const usingTemporaryProfile = active?.temporaryProfileDir !== undefined;
  return {
    profileDir: active?.profileDir,
    usingTemporaryProfile,
  };
}
152
+
153
/**
 * Closes the managed browser context, including one that is still starting.
 *
 * The module state is reset first and the generation counter is bumped, so a
 * `getContext()` call waiting on the old startup sees that it was superseded.
 * A context produced by a pending startup is closed once it resolves, unless
 * it is the one already closed here.
 */
export async function closeBrowser(): Promise<void> {
  contextGeneration += 1;

  const active = managedContext;
  const pendingStartup = managedContextPromise;
  managedContext = undefined;
  managedContextPromise = undefined;

  if (active) await closeManagedContext(active);
  if (!pendingStartup) return;

  // A failed startup has nothing to close.
  const lateArrival = await pendingStartup.catch(() => undefined);
  if (!lateArrival) return;
  if (lateArrival.context === active?.context) return;
  await closeManagedContext(lateArrival);
}
168
+
169
/**
 * Opens `url` in a new page of the managed browser and waits for it to settle.
 *
 * The URL policy is checked before navigation, again on the URL reached after
 * navigation, and once more after settling. Any failure or abort after the
 * page exists closes that page before the error propagates.
 *
 * @param url - Address to open.
 * @param signal - Optional abort signal covering every step.
 * @returns The settled page; the caller owns it.
 * @throws An `AbortError` on abort, or the policy/navigation error.
 */
export async function openPage(
  url: string,
  signal?: AbortSignal,
): Promise<Page> {
  throwIfAborted(signal, "read-page aborted before opening browser");

  await abortable(
    assertHttpUrlAllowed(url),
    signal,
    "read-page aborted while validating URL",
  );
  const browserContext = await getContext(signal);

  const discardPage = async (createdPage: Page): Promise<void> => {
    await createdPage.close().catch(() => undefined);
  };
  const page = await abortable(
    browserContext.newPage(),
    signal,
    "read-page aborted while opening page",
    discardPage,
  );

  try {
    await abortable(
      page.goto(url, { waitUntil: "domcontentloaded", timeout: 45_000 }),
      signal,
      "read-page aborted while navigating page",
    );
    await abortable(
      assertHttpUrlAllowed(page.url()),
      signal,
      "read-page aborted while validating final URL",
    );
    await settlePage(page, signal);
    await abortable(
      assertHttpUrlAllowed(page.url()),
      signal,
      "read-page aborted while validating settled URL",
    );
    return page;
  } catch (error) {
    // The page is only handed to the caller on full success.
    await discardPage(page);
    throw error;
  }
}
214
+
215
/**
 * Gives a page time to finish loading and nudges lazy-loaded content.
 *
 * Steps: wait up to 8 s for network idle, pause 750 ms, scroll through the
 * document in viewport-sized steps and return to the top, then pause 300 ms.
 * The network-idle wait and the scroll are best effort: their failures are
 * ignored, but an abort always propagates.
 *
 * @param page - Page to settle.
 * @param signal - Optional abort signal.
 * @throws An `AbortError` when `signal` is aborted.
 */
export async function settlePage(
  page: Page,
  signal?: AbortSignal,
): Promise<void> {
  const waitingMessage = "read-page aborted while waiting for page";
  throwIfAborted(signal, waitingMessage);

  // Best-effort steps swallow ordinary failures but never an abort.
  const rethrowIfAborted = (error: unknown): void => {
    if (isAbortError(error)) throw error;
  };

  await abortable(
    page.waitForLoadState("networkidle", { timeout: 8_000 }),
    signal,
    waitingMessage,
  ).catch(rethrowIfAborted);
  await abortable(page.waitForTimeout(750), signal, waitingMessage);

  // Scrolling only: the page is never clicked, typed into, or submitted.
  await abortable(
    page.evaluate(async () => {
      const pause = (ms: number) =>
        new Promise((done) => setTimeout(done, ms));
      const pageHeight = Math.max(
        document.body.scrollHeight,
        document.documentElement.scrollHeight,
      );
      const stride = Math.max(600, Math.floor(window.innerHeight * 0.8));
      let top = 0;
      while (top < pageHeight) {
        window.scrollTo(0, top);
        await pause(80);
        top += stride;
      }
      window.scrollTo(0, 0);
    }),
    signal,
    "read-page aborted while preparing page",
  ).catch(rethrowIfAborted);

  await abortable(page.waitForTimeout(300), signal, waitingMessage);
}
262
+
263
/**
 * Closes a managed context and deletes its temporary profile, if it has one.
 * Errors from closing the context are ignored.
 *
 * @param browserContext - The managed context to dispose of.
 */
async function closeManagedContext(
  browserContext: ManagedBrowserContext,
): Promise<void> {
  const { context, temporaryProfileDir } = browserContext;
  await context.close().catch(() => undefined);
  if (!temporaryProfileDir) return;
  await removeTemporaryProfile(temporaryProfileDir);
}
271
+
272
/**
 * Recursively deletes a temporary profile directory.
 * Best effort: a failed removal is ignored and never rejects.
 *
 * @param profileDir - Directory to delete.
 */
async function removeTemporaryProfile(profileDir: string): Promise<void> {
  try {
    await rm(profileDir, { recursive: true, force: true });
  } catch {
    // Leftover temp directories are harmless; cleanup must not fail the caller.
  }
}
275
+
276
/**
 * Throws an `AbortError` carrying `message` when `signal` is already aborted.
 * Does nothing when no signal is given or it has not been aborted.
 *
 * @param signal - Abort signal to inspect, if any.
 * @param message - Message for the thrown error.
 */
function throwIfAborted(
  signal: AbortSignal | undefined,
  message: string,
): void {
  if (signal == null || !signal.aborted) return;
  throw abortError(message);
}
282
+
283
/**
 * Awaits `promise`, but rejects with an `AbortError` as soon as `signal`
 * aborts.
 *
 * The underlying operation is not cancelled. When an abort won the race (or
 * the signal was already aborted) and `promise` later fulfils, `cleanup` is
 * invoked with the value so the abandoned result can be disposed of; errors
 * from `cleanup` are ignored. A late rejection of `promise` is silenced.
 *
 * @param promise - Operation to wait for.
 * @param signal - Abort signal; when omitted, `promise` is returned as is.
 * @param message - Message for the `AbortError`.
 * @param cleanup - Optional disposer for a value that arrives after an abort.
 * @returns The value `promise` fulfils with.
 */
async function abortable<T>(
  promise: Promise<T>,
  signal: AbortSignal | undefined,
  message: string,
  cleanup?: (value: T) => Promise<void> | void,
): Promise<T> {
  if (!signal) return promise;

  let wasAborted = signal.aborted;
  const settled = promise.then((value) => {
    if (wasAborted && cleanup) {
      void Promise.resolve(cleanup(value)).catch(() => undefined);
    }
    return value;
  });
  // Keep a rejection that loses the race from surfacing as unhandled.
  void settled.catch(() => undefined);

  if (wasAborted) throw abortError(message);

  let rejectRace: (reason: Error) => void = () => undefined;
  const abortion = new Promise<never>((_resolve, reject) => {
    rejectRace = reject;
  });
  const onAbort = (): void => {
    wasAborted = true;
    rejectRace(abortError(message));
  };
  signal.addEventListener("abort", onAbort, { once: true });

  try {
    return await Promise.race([settled, abortion]);
  } finally {
    signal.removeEventListener("abort", onAbort);
  }
}
320
+
321
/**
 * Creates an `Error` whose `name` is `"AbortError"`.
 *
 * @param message - Error message.
 * @returns The tagged error.
 */
function abortError(message: string): Error {
  return Object.assign(new Error(message), { name: "AbortError" });
}
326
+
327
/**
 * Tells whether `error` is an `Error` instance named `"AbortError"`.
 *
 * @param error - Any thrown value.
 * @returns `true` only for `Error` instances with that name.
 */
function isAbortError(error: unknown): boolean {
  if (!(error instanceof Error)) return false;
  return error.name === "AbortError";
}