pi-read-page 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +233 -0
- package/extensions/pi-read-page.ts +11 -0
- package/package.json +65 -0
- package/src/browser/browser-manager.ts +329 -0
- package/src/browser/confidence.ts +167 -0
- package/src/browser/dom-preparer.ts +150 -0
- package/src/browser/extractor.ts +222 -0
- package/src/browser/user-action.ts +43 -0
- package/src/cache/cache.ts +265 -0
- package/src/security/url-policy.ts +345 -0
- package/src/tools/read-page.ts +636 -0
- package/src/types.ts +54 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Shuqian
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
# pi-read-page
|
|
2
|
+
|
|
3
|
+
Let [pi](https://github.com/earendil-works/pi-coding-agent) read webpages through your local browser and return Markdown.
|
|
4
|
+
|
|
5
|
+
## What it provides
|
|
6
|
+
|
|
7
|
+
- One read-only Agent tool: `read-page`.
|
|
8
|
+
- Local Chrome/Chromium rendering.
|
|
9
|
+
- Manual handoff for login/captcha/blocked states.
|
|
10
|
+
- Markdown output with pagination and cache.
|
|
11
|
+
- Defensive defaults for untrusted webpages and private-network access.
|
|
12
|
+
|
|
13
|
+
## Requirements
|
|
14
|
+
|
|
15
|
+
- pi.
|
|
16
|
+
- A local Chrome/Chromium browser.
|
|
17
|
+
- Bun only if you are developing or running tests locally.
|
|
18
|
+
|
|
19
|
+
`pi-read-page` uses `playwright-core`; it does not download a browser. By default it launches the `chrome` channel. Set `READ_PAGE_CHROME_PATH` or `READ_PAGE_BROWSER_CHANNEL` if needed.
|
|
20
|
+
|
|
21
|
+
## Installation
|
|
22
|
+
|
|
23
|
+
Install from npm:
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
pi install npm:pi-read-page
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
Try it for one pi run without installing:
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pi -e npm:pi-read-page
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
Install from GitHub if you want the latest repository version:
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pi install https://github.com/Sukitly/pi-read-page
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
Use a local checkout:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
git clone https://github.com/Sukitly/pi-read-page.git
|
|
45
|
+
cd pi-read-page
|
|
46
|
+
bun install
|
|
47
|
+
pi -e .
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Usage
|
|
51
|
+
|
|
52
|
+
Ask pi to read a URL:
|
|
53
|
+
|
|
54
|
+
```text
|
|
55
|
+
Read https://example.com
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
The extension registers one Agent-facing tool:
|
|
59
|
+
|
|
60
|
+
```text
|
|
61
|
+
read-page(url, offset?, limit?, refresh?, preserveQuery?)
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Parameters:
|
|
65
|
+
|
|
66
|
+
| Parameter | Default | Description |
|
|
67
|
+
| --- | --- | --- |
|
|
68
|
+
| `url` | required | HTTP or HTTPS URL to read. |
|
|
69
|
+
| `offset` | `1` | 1-based line offset for pagination. |
|
|
70
|
+
| `limit` | `300` | Number of lines to return. Maximum `1000`. |
|
|
71
|
+
| `refresh` | `false` | Force browser re-extraction and overwrite cache. |
|
|
72
|
+
| `preserveQuery` | `false` | Preserve URL query parameters. By default query params are stripped for canonical cache keys. |
|
|
73
|
+
|
|
74
|
+
Use the returned `Next offset` to continue reading long pages.
|
|
75
|
+
|
|
76
|
+
## How extraction works
|
|
77
|
+
|
|
78
|
+
```text
|
|
79
|
+
URL normalization and private-network policy
|
|
80
|
+
-> headed Playwright browser
|
|
81
|
+
-> DOMContentLoaded + network idle wait
|
|
82
|
+
-> final URL private-network policy
|
|
83
|
+
-> read-only lazy-load scroll
|
|
84
|
+
-> open shadow-root flattening
|
|
85
|
+
-> URL absolutization
|
|
86
|
+
-> Defuddle HTML/Markdown extraction
|
|
87
|
+
-> confidence and handoff detection
|
|
88
|
+
-> local cache write
|
|
89
|
+
-> paginated Markdown output
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
If the page appears to require a real user action, pi shows a confirmation prompt and leaves the headed browser open. Complete the login/captcha/manual navigation in that browser, then confirm in pi. The same browser page is settled and extracted again. After the tool call completes, the page and browser context are closed.
|
|
93
|
+
|
|
94
|
+
## Cache
|
|
95
|
+
|
|
96
|
+
Successful browser extractions are cached under:
|
|
97
|
+
|
|
98
|
+
```text
|
|
99
|
+
~/.pi/agent/caches/read-page
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
Cache behavior:
|
|
103
|
+
|
|
104
|
+
- Normal TTL: 30 days.
|
|
105
|
+
- User-action TTL: 1 day.
|
|
106
|
+
- Cache files: `content.md` and `meta.json`.
|
|
107
|
+
- Writes are atomic.
|
|
108
|
+
- Cached Markdown is sha256-verified on load.
|
|
109
|
+
- If refresh/extraction fails and a cache entry exists, the tool returns cached content with an explicit `refresh-failed-fresh` or `stale-fallback` status.
|
|
110
|
+
|
|
111
|
+
## Security model
|
|
112
|
+
|
|
113
|
+
`read-page` treats webpages as untrusted external content.
|
|
114
|
+
|
|
115
|
+
- The output includes a security notice and document boundary.
|
|
116
|
+
- The Agent is instructed not to follow instructions from the page unless the user explicitly asks.
|
|
117
|
+
- Private/local hosts and IPs are blocked by default.
|
|
118
|
+
- Browser automation is read-only: it may navigate, wait, scroll, extract DOM, and cache content.
|
|
119
|
+
- The extension does not expose browser mutation/control tools to the Agent.
|
|
120
|
+
- User handoff is only used for actionable captcha, blocked/interstitial, or explicit login-wall states.
|
|
121
|
+
|
|
122
|
+
To intentionally allow private/local network URLs:
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
READ_PAGE_ALLOW_PRIVATE_NETWORK=1 pi
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
## Configuration
|
|
129
|
+
|
|
130
|
+
Optional environment variables:
|
|
131
|
+
|
|
132
|
+
| Variable | Default | Description |
|
|
133
|
+
| --- | --- | --- |
|
|
134
|
+
| `READ_PAGE_CHROME_PATH` | unset | Explicit Chrome/Chromium executable path. |
|
|
135
|
+
| `READ_PAGE_BROWSER_CHANNEL` | `chrome` | Playwright browser channel. |
|
|
136
|
+
| `READ_PAGE_PROFILE_DIR` | `~/.pi/agent/read-page/browser-profile` | Persistent browser profile directory. |
|
|
137
|
+
| `READ_PAGE_DISABLE_TEMP_PROFILE_FALLBACK` | unset | Set to `1` to fail instead of using a temporary profile when the persistent profile is locked. |
|
|
138
|
+
| `READ_PAGE_ALLOW_PRIVATE_NETWORK` | unset | Set to `1` to allow private/local network access. |
|
|
139
|
+
| `READ_PAGE_PARSE_TIMEOUT_MS` | `8000` | Defuddle parse timeout before sync fallback. |
|
|
140
|
+
| `READ_PAGE_DEFUDDLE_ASYNC` | unset | Set to `1` to allow Defuddle third-party async extraction. |
|
|
141
|
+
| `READ_PAGE_DEFUDDLE_DEBUG` | unset | Set to `1` to include Defuddle debug information. |
|
|
142
|
+
|
|
143
|
+
## Development
|
|
144
|
+
|
|
145
|
+
Install dependencies:
|
|
146
|
+
|
|
147
|
+
```bash
|
|
148
|
+
bun install
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
Run deterministic checks:
|
|
152
|
+
|
|
153
|
+
```bash
|
|
154
|
+
bun run lint
|
|
155
|
+
bun run test
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
Run the browser integration test:
|
|
159
|
+
|
|
160
|
+
```bash
|
|
161
|
+
bun run integration -- https://example.com
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
The integration test opens a real browser, extracts the page, prints extraction metadata, and closes the browser context.
|
|
165
|
+
|
|
166
|
+
## Publishing
|
|
167
|
+
|
|
168
|
+
Pi package catalog entries are discovered from public npm packages with the `pi-package` keyword.
|
|
169
|
+
|
|
170
|
+
Before publishing:
|
|
171
|
+
|
|
172
|
+
```bash
|
|
173
|
+
bun run lint
|
|
174
|
+
bun run test
|
|
175
|
+
npm pack --dry-run
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
Publish:
|
|
179
|
+
|
|
180
|
+
```bash
|
|
181
|
+
npm login
|
|
182
|
+
npm publish --access public
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
After publishing, install with:
|
|
186
|
+
|
|
187
|
+
```bash
|
|
188
|
+
pi install npm:pi-read-page
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
## Project layout
|
|
192
|
+
|
|
193
|
+
```text
|
|
194
|
+
extensions/pi-read-page.ts extension entrypoint
|
|
195
|
+
src/tools/read-page.ts tool orchestration, output formatting, TUI rendering
|
|
196
|
+
src/browser/ browser lifecycle, extraction, handoff, confidence
|
|
197
|
+
src/cache/cache.ts cache, pagination, checksums
|
|
198
|
+
src/security/url-policy.ts URL normalization and private-network policy
|
|
199
|
+
test/ deterministic unit tests
|
|
200
|
+
scripts/integration-read-page.ts browser integration runner
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
## Troubleshooting
|
|
204
|
+
|
|
205
|
+
### Chrome is not found
|
|
206
|
+
|
|
207
|
+
Install Google Chrome/Chromium, or set:
|
|
208
|
+
|
|
209
|
+
```bash
|
|
210
|
+
READ_PAGE_CHROME_PATH=/path/to/chrome pi
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
### Login state is missing
|
|
214
|
+
|
|
215
|
+
By default the extension uses a persistent profile at:
|
|
216
|
+
|
|
217
|
+
```text
|
|
218
|
+
~/.pi/agent/read-page/browser-profile
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
If that profile is already locked by another browser process, `read-page` falls back to a temporary profile. The tool output will include a warning when this happens.
|
|
222
|
+
|
|
223
|
+
### Query parameters were removed
|
|
224
|
+
|
|
225
|
+
Set `preserveQuery: true` when query parameters are required for the page content, such as search results, filters, or app/detail pages.
|
|
226
|
+
|
|
227
|
+
### Localhost or private IP is blocked
|
|
228
|
+
|
|
229
|
+
This is intentional. Use `READ_PAGE_ALLOW_PRIVATE_NETWORK=1` only when you explicitly want to read local/private services.
|
|
230
|
+
|
|
231
|
+
## License
|
|
232
|
+
|
|
233
|
+
MIT. See [LICENSE](LICENSE).
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
|
|
2
|
+
import { closeBrowser } from "../src/browser/browser-manager";
|
|
3
|
+
import { registerReadPageTool } from "../src/tools/read-page";
|
|
4
|
+
|
|
5
|
+
/**
 * Extension entry point.
 *
 * Registers the `read-page` tool with pi and closes the shared browser when
 * the session shuts down.
 *
 * @param pi - Extension API handle supplied by the host.
 */
export default function readPageExtension(pi: ExtensionAPI) {
  registerReadPageTool(pi);

  const shutDownBrowser = async (): Promise<void> => {
    await closeBrowser();
  };
  pi.on("session_shutdown", shutDownBrowser);
}
|
package/package.json
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "pi-read-page",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Read webpages through a local browser and return Markdown for Pi coding agent.",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"license": "MIT",
|
|
7
|
+
"author": "Sukitly",
|
|
8
|
+
"repository": {
|
|
9
|
+
"type": "git",
|
|
10
|
+
"url": "git+https://github.com/Sukitly/pi-read-page.git"
|
|
11
|
+
},
|
|
12
|
+
"homepage": "https://github.com/Sukitly/pi-read-page#readme",
|
|
13
|
+
"bugs": {
|
|
14
|
+
"url": "https://github.com/Sukitly/pi-read-page/issues"
|
|
15
|
+
},
|
|
16
|
+
"keywords": [
|
|
17
|
+
"pi-package",
|
|
18
|
+
"pi-extension",
|
|
19
|
+
"pi",
|
|
20
|
+
"read-page",
|
|
21
|
+
"browser",
|
|
22
|
+
"markdown"
|
|
23
|
+
],
|
|
24
|
+
"files": [
|
|
25
|
+
"extensions",
|
|
26
|
+
"src",
|
|
27
|
+
"README.md",
|
|
28
|
+
"LICENSE",
|
|
29
|
+
"package.json"
|
|
30
|
+
],
|
|
31
|
+
"publishConfig": {
|
|
32
|
+
"access": "public"
|
|
33
|
+
},
|
|
34
|
+
"pi": {
|
|
35
|
+
"extensions": [
|
|
36
|
+
"./extensions/pi-read-page.ts"
|
|
37
|
+
]
|
|
38
|
+
},
|
|
39
|
+
"scripts": {
|
|
40
|
+
"typecheck": "tsc --noEmit",
|
|
41
|
+
"test": "vitest run",
|
|
42
|
+
"integration": "tsx scripts/integration-read-page.ts",
|
|
43
|
+
"lint": "bun run typecheck && bunx --bun @biomejs/biome check --error-on-warnings --write ."
|
|
44
|
+
},
|
|
45
|
+
"dependencies": {
|
|
46
|
+
"defuddle": "^0.18.1",
|
|
47
|
+
"linkedom": "^0.18.12",
|
|
48
|
+
"playwright-core": "^1.60.0"
|
|
49
|
+
},
|
|
50
|
+
"peerDependencies": {
|
|
51
|
+
"@earendil-works/pi-coding-agent": "*",
|
|
52
|
+
"@earendil-works/pi-tui": "*",
|
|
53
|
+
"typebox": "*"
|
|
54
|
+
},
|
|
55
|
+
"devDependencies": {
|
|
56
|
+
"@biomejs/biome": "2.5.0",
|
|
57
|
+
"@earendil-works/pi-coding-agent": "^0.79.2",
|
|
58
|
+
"@earendil-works/pi-tui": "^0.79.2",
|
|
59
|
+
"@types/node": "^25.9.3",
|
|
60
|
+
"tsx": "^4.22.4",
|
|
61
|
+
"typebox": "1.1.38",
|
|
62
|
+
"typescript": "^6.0.3",
|
|
63
|
+
"vitest": "^4.1.8"
|
|
64
|
+
}
|
|
65
|
+
}
|
|
@@ -0,0 +1,329 @@
|
|
|
1
|
+
import { mkdir, mkdtemp, rm } from "node:fs/promises";
|
|
2
|
+
import { homedir, tmpdir } from "node:os";
|
|
3
|
+
import { join, resolve } from "node:path";
|
|
4
|
+
import { type BrowserContext, chromium, type Page } from "playwright-core";
|
|
5
|
+
import { assertHttpUrlAllowed, isHttpLikeUrl } from "../security/url-policy";
|
|
6
|
+
|
|
7
|
+
/** A launched persistent browser context together with the profile backing it. */
interface ManagedBrowserContext {
  context: BrowserContext;
  /** Directory the context was launched with. */
  profileDir: string;
  /** Set only when the context runs on a throwaway profile; same path as `profileDir`. */
  temporaryProfileDir?: string;
}

/** The part of Playwright's `chromium` launcher this module relies on. */
type BrowserAutomation = Pick<typeof chromium, "launchPersistentContext">;

// Launcher in use; replaceable through setBrowserAutomationForTest().
let browserAutomation: BrowserAutomation = chromium;
// Context adopted by getContext(); cleared by closeBrowser().
let managedContext: ManagedBrowserContext | undefined;
// Launch shared by callers of getContext() while it is pending or not yet cleared.
let managedContextPromise: Promise<ManagedBrowserContext> | undefined;
// Incremented by closeBrowser() so a launch finishing afterwards is not adopted.
let contextGeneration = 0;
|
|
19
|
+
|
|
20
|
+
/**
 * Expands a leading `~` to the current user's home directory.
 *
 * - `"~"` yields the home directory itself.
 * - `"~/rest"` yields `<home>/rest` as an absolute path.
 * - On Windows, `"~\rest"` is expanded the same way, because the backslash is
 *   the native separator there. Elsewhere a backslash is an ordinary filename
 *   character, so such paths are returned untouched.
 * - Every other input (including `~user/...` and the empty string) is
 *   returned unchanged.
 *
 * @param path - Path that may start with `~`.
 * @returns The expanded path, or `path` itself when no expansion applies.
 */
function expandHome(path: string): string {
  if (path === "~") return homedir();

  const hasHomePrefix =
    path.startsWith("~/") ||
    (process.platform === "win32" && path.startsWith("~\\"));

  return hasHomePrefix ? resolve(homedir(), path.slice(2)) : path;
}
|
|
25
|
+
|
|
26
|
+
/**
 * Location of the persistent browser profile used when
 * `READ_PAGE_PROFILE_DIR` is not set:
 * `<home>/.pi/agent/read-page/browser-profile`.
 */
function defaultProfileDir(): string {
  const segments = [".pi", "agent", "read-page", "browser-profile"];
  return resolve(homedir(), ...segments);
}
|
|
29
|
+
|
|
30
|
+
/**
 * Returns the shared browser context, launching it on first use.
 *
 * Concurrent callers share one in-flight launch (`managedContextPromise`).
 * The launch result is adopted as `managedContext` only if `closeBrowser()`
 * did not run in the meantime (tracked through `contextGeneration`).
 *
 * NOTE(review): when this caller aborts, `abortable` hands the eventually
 * launched context to `closeManagedContext`. If another caller is awaiting the
 * same launch it may adopt a context that is being closed — confirm whether
 * concurrent `read-page` calls are possible.
 *
 * @param signal - Optional abort signal for the calling operation.
 * @returns The live browser context.
 * @throws An `AbortError` when `signal` aborts, the launch error when startup
 *   fails, or an `Error` when the browser was closed during startup.
 */
async function getContext(signal?: AbortSignal): Promise<BrowserContext> {
  throwIfAborted(signal, "read-page aborted before opening browser");
  if (managedContext) return managedContext.context;

  const generationAtStart = contextGeneration;
  managedContextPromise ??= createManagedContext();
  const pendingLaunch = managedContextPromise;

  let launched: ManagedBrowserContext;
  try {
    launched = await abortable(
      pendingLaunch,
      signal,
      "read-page aborted while starting browser",
      closeManagedContext,
    );

    // closeBrowser() ran while we were waiting; it owns closing this launch.
    if (generationAtStart !== contextGeneration) {
      throw new Error("read-page browser context closed during startup");
    }
  } catch (error) {
    // Forget the failed/abandoned launch so the next call starts a new one.
    if (managedContextPromise === pendingLaunch) {
      managedContextPromise = undefined;
    }
    throw error;
  }

  managedContext = launched;
  return launched.context;
}
|
|
57
|
+
|
|
58
|
+
/**
 * Launches the browser on the configured persistent profile.
 *
 * The profile directory comes from `READ_PAGE_PROFILE_DIR` (with `~`
 * expansion) or the default location, and is created if missing. When the
 * launch fails because the profile is in use, a temporary profile is used
 * instead, unless `READ_PAGE_DISABLE_TEMP_PROFILE_FALLBACK` is `"1"`.
 *
 * @returns The launched context and the profile directory it runs on.
 * @throws The original launch error when no fallback applies, or the
 *   fallback launch error (after removing the temporary directory).
 */
async function createManagedContext(): Promise<ManagedBrowserContext> {
  const configuredDir =
    process.env.READ_PAGE_PROFILE_DIR || defaultProfileDir();
  const profileDir = expandHome(configuredDir);
  await mkdir(profileDir, { recursive: true });

  try {
    const context = await launchPersistent(profileDir);
    return { context, profileDir };
  } catch (error) {
    const fallbackDisabled =
      process.env.READ_PAGE_DISABLE_TEMP_PROFILE_FALLBACK === "1";
    if (fallbackDisabled || !isProfileInUseError(error)) throw error;
  }

  // Persistent profile is locked by another browser: retry on a throwaway one.
  const temporaryProfileDir = await mkdtemp(
    join(tmpdir(), "read-page-profile-"),
  );
  try {
    const context = await launchPersistent(temporaryProfileDir);
    return { context, profileDir: temporaryProfileDir, temporaryProfileDir };
  } catch (tempError) {
    await removeTemporaryProfile(temporaryProfileDir);
    throw tempError;
  }
}
|
|
92
|
+
|
|
93
|
+
/**
 * Launches a headed persistent context on `profileDir` and installs the
 * network policy on it.
 *
 * The channel defaults to `"chrome"` and can be overridden with
 * `READ_PAGE_BROWSER_CHANNEL`; `READ_PAGE_CHROME_PATH` supplies an explicit
 * executable. If installing the policy fails, the context is closed before
 * the error is rethrown.
 *
 * @param profileDir - User data directory for the browser.
 * @returns The launched context with request routing installed.
 */
async function launchPersistent(profileDir: string): Promise<BrowserContext> {
  const { READ_PAGE_BROWSER_CHANNEL, READ_PAGE_CHROME_PATH } = process.env;

  const launched = await browserAutomation.launchPersistentContext(
    profileDir,
    {
      headless: false,
      channel: READ_PAGE_BROWSER_CHANNEL || "chrome",
      executablePath: READ_PAGE_CHROME_PATH || undefined,
      viewport: null,
      args: ["--disable-blink-features=AutomationControlled"],
    },
  );

  try {
    await installNetworkPolicy(launched);
  } catch (error) {
    // Do not leave a browser running without the policy in place.
    await launched.close().catch(() => undefined);
    throw error;
  }
  return launched;
}
|
|
113
|
+
|
|
114
|
+
/**
 * Routes every request of the context through the URL policy.
 *
 * Requests whose URL is not HTTP-like are let through unchanged. HTTP-like
 * requests are continued only when `assertHttpUrlAllowed` accepts the URL and
 * are aborted with `blockedbyclient` otherwise.
 *
 * The policy check is kept separate from `route.continue()`: a failure to
 * continue (for example because the page is closing) is not a policy
 * rejection, and a route can be handled only once, so it must not be followed
 * by `route.abort()`.
 *
 * NOTE(review): Playwright documents that a route handler is only invoked for
 * the first URL of a redirect chain — confirm whether redirected subresource
 * requests need additional checks.
 *
 * @param browserContext - Context to install the routing handler on.
 */
async function installNetworkPolicy(
  browserContext: BrowserContext,
): Promise<void> {
  await browserContext.route("**/*", async (route) => {
    const url = route.request().url();

    if (isHttpLikeUrl(url)) {
      try {
        await assertHttpUrlAllowed(url);
      } catch {
        // Policy rejected the URL: block the request.
        await route.abort("blockedbyclient");
        return;
      }
    }

    await route.continue();
  });
}
|
|
132
|
+
|
|
133
|
+
/**
 * Tells whether a launch failure indicates that the browser profile is
 * already in use.
 *
 * The decision is based on the error message (or the stringified value for
 * non-`Error` inputs), matched case-insensitively against known phrases.
 *
 * @param error - Value thrown by the launch attempt.
 * @returns `true` when the message matches one of the in-use phrases.
 */
function isProfileInUseError(error: unknown): boolean {
  const text = error instanceof Error ? error.message : String(error);
  const inUsePatterns = [
    /existing browser session/i,
    /profile is already in use/i,
    /user data directory is already in use/i,
  ];
  return inUsePatterns.some((pattern) => pattern.test(text));
}
|
|
139
|
+
|
|
140
|
+
/**
 * Test hook: swaps the launcher used by this module.
 *
 * @param automation - Replacement launcher, or `undefined` to restore
 *   Playwright's `chromium`.
 */
export function setBrowserAutomationForTest(
  automation: BrowserAutomation | undefined,
): void {
  if (automation == null) {
    browserAutomation = chromium;
    return;
  }
  browserAutomation = automation;
}
|
|
145
|
+
|
|
146
|
+
/**
 * Reports which profile the currently adopted browser context runs on.
 *
 * @returns `profileDir` of the adopted context (`undefined` when there is
 *   none) and whether that context uses a temporary profile.
 */
export function getBrowserRuntimeInfo() {
  const active = managedContext;
  const usingTemporaryProfile = active?.temporaryProfileDir !== undefined;
  return {
    profileDir: active?.profileDir,
    usingTemporaryProfile,
  };
}
|
|
152
|
+
|
|
153
|
+
/**
 * Closes the shared browser and resets module state.
 *
 * The generation counter is bumped first so a launch still in flight is not
 * adopted by `getContext()`. The adopted context (if any) is closed, then a
 * pending launch is awaited and its result closed too unless it is the very
 * context that was already closed. Launch failures are ignored here.
 */
export async function closeBrowser(): Promise<void> {
  contextGeneration += 1;

  const active = managedContext;
  const pendingLaunch = managedContextPromise;
  managedContext = undefined;
  managedContextPromise = undefined;

  if (active) await closeManagedContext(active);

  if (pendingLaunch) {
    const launched = await pendingLaunch.catch(() => undefined);
    if (launched !== undefined && launched.context !== active?.context) {
      await closeManagedContext(launched);
    }
  }
}
|
|
168
|
+
|
|
169
|
+
/**
 * Opens `url` in a new page of the shared browser context and waits for it
 * to settle.
 *
 * The URL policy is checked three times: on the requested URL, on the URL
 * reached after navigation, and on the URL after settling. Every step is
 * abortable through `signal`. If anything fails after the page was created,
 * the page is closed before the error is rethrown.
 *
 * @param url - URL to navigate to.
 * @param signal - Optional abort signal.
 * @returns The settled page; the caller is responsible for closing it.
 * @throws An `AbortError` on abort, a policy error from
 *   `assertHttpUrlAllowed`, or a navigation error from Playwright.
 */
export async function openPage(
  url: string,
  signal?: AbortSignal,
): Promise<Page> {
  throwIfAborted(signal, "read-page aborted before opening browser");

  const requestedUrlCheck = assertHttpUrlAllowed(url);
  await abortable(
    requestedUrlCheck,
    signal,
    "read-page aborted while validating URL",
  );

  const browserContext = await getContext(signal);
  const page = await abortable(
    browserContext.newPage(),
    signal,
    "read-page aborted while opening page",
    // Page that finished opening after the abort: close it, ignore failures.
    async (orphanedPage) => {
      await orphanedPage.close().catch(() => undefined);
    },
  );

  try {
    const navigation = page.goto(url, {
      waitUntil: "domcontentloaded",
      timeout: 45_000,
    });
    await abortable(
      navigation,
      signal,
      "read-page aborted while navigating page",
    );

    // Navigation may have redirected; re-check where we actually landed.
    await abortable(
      assertHttpUrlAllowed(page.url()),
      signal,
      "read-page aborted while validating final URL",
    );

    await settlePage(page, signal);

    // The page may have navigated again while settling.
    await abortable(
      assertHttpUrlAllowed(page.url()),
      signal,
      "read-page aborted while validating settled URL",
    );
  } catch (error) {
    await page.close().catch(() => undefined);
    throw error;
  }

  return page;
}
|
|
214
|
+
|
|
215
|
+
/**
 * Waits for a page to become quiet and triggers lazy-loaded content.
 *
 * Steps: wait up to 8 s for network idle, pause 750 ms, scroll through the
 * page in viewport-sized steps and back to the top, then pause 300 ms.
 * Failures of the network-idle wait and of the scroll script are ignored;
 * aborts always propagate.
 *
 * @param page - Page to settle.
 * @param signal - Optional abort signal.
 * @throws An `AbortError` when `signal` aborts.
 */
export async function settlePage(
  page: Page,
  signal?: AbortSignal,
): Promise<void> {
  throwIfAborted(signal, "read-page aborted while waiting for page");

  // Best-effort steps swallow ordinary failures but must let aborts through.
  const rethrowOnlyAbort = (error: unknown): void => {
    if (isAbortError(error)) throw error;
  };

  await abortable(
    page.waitForLoadState("networkidle", { timeout: 8_000 }),
    signal,
    "read-page aborted while waiting for page",
  ).catch(rethrowOnlyAbort);

  await abortable(
    page.waitForTimeout(750),
    signal,
    "read-page aborted while waiting for page",
  );

  // Read-only lazy-load trigger. No clicks, no typing, no submission.
  const scrollThrough = page.evaluate(async () => {
    const pause = (ms: number) =>
      new Promise((done) => setTimeout(done, ms));
    const pageHeight = Math.max(
      document.body.scrollHeight,
      document.documentElement.scrollHeight,
    );
    const stride = Math.max(600, Math.floor(window.innerHeight * 0.8));
    let y = 0;
    while (y < pageHeight) {
      window.scrollTo(0, y);
      await pause(80);
      y += stride;
    }
    window.scrollTo(0, 0);
  });
  await abortable(
    scrollThrough,
    signal,
    "read-page aborted while preparing page",
  ).catch(rethrowOnlyAbort);

  await abortable(
    page.waitForTimeout(300),
    signal,
    "read-page aborted while waiting for page",
  );
}
|
|
262
|
+
|
|
263
|
+
/**
 * Closes a managed context and removes its temporary profile, if it has one.
 * Errors from closing the context are ignored.
 *
 * @param browserContext - Managed context to dispose of.
 */
async function closeManagedContext(
  browserContext: ManagedBrowserContext,
): Promise<void> {
  const { context, temporaryProfileDir } = browserContext;

  await context.close().catch(() => undefined);

  if (!temporaryProfileDir) return;
  await removeTemporaryProfile(temporaryProfileDir);
}
|
|
271
|
+
|
|
272
|
+
/**
 * Recursively deletes a temporary profile directory.
 * Best effort: a missing directory or a failed removal never throws.
 *
 * @param profileDir - Directory to remove.
 */
async function removeTemporaryProfile(profileDir: string): Promise<void> {
  try {
    await rm(profileDir, { recursive: true, force: true });
  } catch {
    // Cleanup is best-effort; leftovers live in the OS temp directory.
  }
}
|
|
275
|
+
|
|
276
|
+
/**
 * Throws an `AbortError` carrying `message` when `signal` is already aborted.
 * Does nothing when there is no signal or it has not fired.
 *
 * @param signal - Signal to inspect, if any.
 * @param message - Message for the thrown error.
 */
function throwIfAborted(
  signal: AbortSignal | undefined,
  message: string,
): void {
  if (signal === undefined || !signal.aborted) return;
  throw abortError(message);
}
|
|
282
|
+
|
|
283
|
+
/**
 * Races `promise` against `signal`.
 *
 * Without a signal the promise is returned as is. Otherwise the result is the
 * promise's outcome, unless the signal aborts first (or is already aborted),
 * in which case an `AbortError` with `message` is thrown.
 *
 * The underlying work is not cancelled. If it still resolves after an abort,
 * `cleanup` is invoked with the resolved value so the caller can dispose of
 * it; failures of `cleanup` are ignored.
 *
 * @param promise - Work to wait for.
 * @param signal - Optional abort signal.
 * @param message - Message for the `AbortError`.
 * @param cleanup - Optional disposer for a value that arrives after an abort.
 * @returns The resolved value of `promise`.
 */
async function abortable<T>(
  promise: Promise<T>,
  signal: AbortSignal | undefined,
  message: string,
  cleanup?: (value: T) => Promise<void> | void,
): Promise<T> {
  if (signal === undefined) return promise;

  let aborted = signal.aborted;

  // Observe the result even after we stop waiting, so late values get cleaned up.
  const observed = promise.then((value) => {
    if (aborted && cleanup) {
      void Promise.resolve(cleanup(value)).catch(() => undefined);
    }
    return value;
  });
  // Prevent an unhandled rejection when nobody awaits `observed` anymore.
  void observed.catch(() => undefined);

  if (aborted) throw abortError(message);

  // The executor runs synchronously, so `rejectOnAbort` is set before use.
  let rejectOnAbort!: (reason: Error) => void;
  const abortSignalled = new Promise<never>((_resolve, reject) => {
    rejectOnAbort = reject;
  });
  const onAbort = (): void => {
    aborted = true;
    rejectOnAbort(abortError(message));
  };
  signal.addEventListener("abort", onAbort, { once: true });

  try {
    return await Promise.race([observed, abortSignalled]);
  } finally {
    signal.removeEventListener("abort", onAbort);
  }
}
|
|
320
|
+
|
|
321
|
+
/**
 * Builds an `Error` whose `name` is `"AbortError"`.
 *
 * @param message - Error message.
 * @returns The tagged error.
 */
function abortError(message: string): Error {
  return Object.assign(new Error(message), { name: "AbortError" });
}
|
|
326
|
+
|
|
327
|
+
/**
 * Tells whether a thrown value is an `Error` named `"AbortError"`.
 *
 * @param error - Value caught from a failed operation.
 * @returns `true` only for `Error` instances with that name.
 */
function isAbortError(error: unknown): boolean {
  if (!(error instanceof Error)) return false;
  return error.name === "AbortError";
}
|