@ogulcancelik/pi-web-browse 1.0.4 → 1.0.6
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- package/README.md +15 -11
- package/lib/browser-bin.js +11 -8
- package/lib/cdp.js +8 -1
- package/lib/daemon-client.js +21 -7
- package/lib/daemon.js +384 -85
- package/lib/fingerprint.js +352 -0
- package/lib/search.js +144 -47
- package/package.json +2 -2
- package/web-browse.js +138 -58
package/README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# pi-web-browse
|
|
2
2
|
|
|
3
|
-
Web search and content extraction skill for [pi](https://github.com/
|
|
3
|
+
Web search and content extraction skill for [pi](https://github.com/earendil-works/pi). Search the web and fetch pages via a real headless browser (CDP).
|
|
4
4
|
|
|
5
5
|
**Works on Linux, macOS, and Windows.**
|
|
6
6
|
|
|
@@ -10,7 +10,7 @@ Web search and content extraction skill for [pi](https://github.com/badlogic/pi-
|
|
|
10
10
|
- 🌐 **Page Fetching** - Extract readable content from any URL
|
|
11
11
|
- 🤖 **Bot Protection Bypass** - Handles JS challenges, Cloudflare, etc.
|
|
12
12
|
- 🚀 **Persistent Daemon** - Warm browser session for fast subsequent requests
|
|
13
|
-
- 🖥️ **Cross-Platform** - Auto-detects
|
|
13
|
+
- 🖥️ **Cross-Platform** - Auto-detects Chrome, Brave, Edge, Chromium
|
|
14
14
|
|
|
15
15
|
## Install
|
|
16
16
|
|
|
@@ -48,25 +48,29 @@ Environment variables (all optional):
|
|
|
48
48
|
|
|
49
49
|
| Variable | Description | Default |
|
|
50
50
|
|----------|-------------|---------|
|
|
51
|
-
| `WEB_BROWSE_BROWSER_BIN` | Browser binary path | Auto-detected |
|
|
52
|
-
| `WEB_BROWSE_USER_AGENT` | User-Agent string |
|
|
51
|
+
| `WEB_BROWSE_BROWSER_BIN` | Browser binary path | Auto-detected, prefers Chrome |
|
|
52
|
+
| `WEB_BROWSE_USER_AGENT` | Override User-Agent string | Auto-derived from detected browser + OS |
|
|
53
53
|
| `WEB_BROWSE_DAEMON_PORT` | Daemon HTTP port | 9377 |
|
|
54
54
|
| `WEB_BROWSE_CDP_PORT` | Chrome DevTools port | 9225 |
|
|
55
55
|
| `WEB_BROWSE_DEBUG_DUMP` | Save debug files on failure | off |
|
|
56
56
|
|
|
57
|
+
By default, the hidden profile is browser-specific (for example `~/.config/web-browse-cdp-profile-chrome` or `~/.config/web-browse-cdp-profile-brave`), so that after an update the skill does not try to reuse one hidden profile across different Chromium-family browsers.
|
|
58
|
+
|
|
57
59
|
## Browser Detection
|
|
58
60
|
|
|
59
|
-
The skill auto-detects browsers in common locations
|
|
61
|
+
The skill auto-detects browsers in common locations and now tries Chrome first, because Google Search is less likely to challenge headless Chrome than headless Brave on some setups.
|
|
62
|
+
|
|
63
|
+
- **Linux:** google-chrome, google-chrome-stable, brave, brave-browser, chromium (from PATH)
|
|
64
|
+
- **macOS:** Google Chrome, Google Chrome Canary, Brave Browser, Edge, Chromium (in /Applications)
|
|
65
|
+
- **Windows:** Chrome, Brave, Edge, Chromium (Program Files, LocalAppData)
|
|
60
66
|
|
|
61
|
-
|
|
62
|
-
- **macOS:** Brave Browser, Google Chrome, Chromium, Edge (in /Applications)
|
|
63
|
-
- **Windows:** Brave, Chrome, Edge, Chromium (Program Files, LocalAppData)
|
|
67
|
+
If you want Brave anyway, set `WEB_BROWSE_BROWSER_BIN` or pass `--browser-bin`.
|
|
64
68
|
|
|
65
69
|
## How It Works
|
|
66
70
|
|
|
67
|
-
1. **Search** - Uses Google via headless browser
|
|
68
|
-
2. **Fetch** - Opens
|
|
69
|
-
3. **Daemon** - Keeps a warm browser session for speed
|
|
71
|
+
1. **Search** - Uses Google via a persistent headless browser session. If Google blocks the request, it fails fast and falls back to DuckDuckGo.
|
|
72
|
+
2. **Fetch** - Opens URLs in the same hidden browser session, waits for JS, and extracts readable content.
|
|
73
|
+
3. **Daemon** - Keeps a warm, browser-specific hidden profile/session alive for speed and bot-protection resilience without touching your normal browser profile.
|
|
70
74
|
|
|
71
75
|
## License
|
|
72
76
|
|
package/lib/browser-bin.js
CHANGED
|
@@ -7,34 +7,37 @@ const IS_MACOS = PLATFORM === "darwin";
|
|
|
7
7
|
const IS_WINDOWS = PLATFORM === "win32";
|
|
8
8
|
|
|
9
9
|
// macOS .app bundle paths (checked as absolute paths)
|
|
10
|
+
// Prefer Chrome first: on some setups Google Search is less likely to challenge headless Chrome than headless Brave.
|
|
10
11
|
const MACOS_BROWSER_PATHS = [
|
|
11
|
-
"/Applications/Brave Browser.app/Contents/MacOS/Brave Browser",
|
|
12
12
|
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
|
|
13
13
|
"/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary",
|
|
14
|
-
"/Applications/Chromium.app/Contents/MacOS/Chromium",
|
|
14
|
+
"/Applications/Brave Browser.app/Contents/MacOS/Brave Browser",
|
|
15
15
|
"/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge",
|
|
16
|
+
"/Applications/Chromium.app/Contents/MacOS/Chromium",
|
|
16
17
|
];
|
|
17
18
|
|
|
18
19
|
// Linux binary names (searched on PATH)
|
|
20
|
+
// Prefer Chrome first: on some setups Google Search is less likely to challenge headless Chrome than headless Brave.
|
|
19
21
|
const LINUX_BROWSER_NAMES = [
|
|
20
|
-
"brave",
|
|
21
|
-
"brave-browser",
|
|
22
22
|
"google-chrome",
|
|
23
23
|
"google-chrome-stable",
|
|
24
|
+
"brave",
|
|
25
|
+
"brave-browser",
|
|
24
26
|
"chromium",
|
|
25
27
|
"chromium-browser",
|
|
26
28
|
];
|
|
27
29
|
|
|
28
30
|
// Windows browser paths (common install locations)
|
|
31
|
+
// Prefer Chrome first: on some setups Google Search is less likely to challenge headless Chrome than headless Brave.
|
|
29
32
|
const WINDOWS_BROWSER_PATHS = [
|
|
30
|
-
// Brave
|
|
31
|
-
join(process.env.LOCALAPPDATA || "", "BraveSoftware", "Brave-Browser", "Application", "brave.exe"),
|
|
32
|
-
join(process.env.PROGRAMFILES || "", "BraveSoftware", "Brave-Browser", "Application", "brave.exe"),
|
|
33
|
-
join(process.env["PROGRAMFILES(X86)"] || "", "BraveSoftware", "Brave-Browser", "Application", "brave.exe"),
|
|
34
33
|
// Chrome
|
|
35
34
|
join(process.env.LOCALAPPDATA || "", "Google", "Chrome", "Application", "chrome.exe"),
|
|
36
35
|
join(process.env.PROGRAMFILES || "", "Google", "Chrome", "Application", "chrome.exe"),
|
|
37
36
|
join(process.env["PROGRAMFILES(X86)"] || "", "Google", "Chrome", "Application", "chrome.exe"),
|
|
37
|
+
// Brave
|
|
38
|
+
join(process.env.LOCALAPPDATA || "", "BraveSoftware", "Brave-Browser", "Application", "brave.exe"),
|
|
39
|
+
join(process.env.PROGRAMFILES || "", "BraveSoftware", "Brave-Browser", "Application", "brave.exe"),
|
|
40
|
+
join(process.env["PROGRAMFILES(X86)"] || "", "BraveSoftware", "Brave-Browser", "Application", "brave.exe"),
|
|
38
41
|
// Edge (comes with Windows 10/11)
|
|
39
42
|
join(process.env.PROGRAMFILES || "", "Microsoft", "Edge", "Application", "msedge.exe"),
|
|
40
43
|
join(process.env["PROGRAMFILES(X86)"] || "", "Microsoft", "Edge", "Application", "msedge.exe"),
|
package/lib/cdp.js
CHANGED
|
@@ -85,7 +85,13 @@ async function chooseAvailablePort(preferredPort) {
|
|
|
85
85
|
return await getEphemeralPort();
|
|
86
86
|
}
|
|
87
87
|
|
|
88
|
-
export async function startBrowserForCdp(
|
|
88
|
+
export async function startBrowserForCdp(
|
|
89
|
+
preferredPort,
|
|
90
|
+
profileDir,
|
|
91
|
+
browserBin = null,
|
|
92
|
+
spawnedProcessGroupPids = null,
|
|
93
|
+
extraArgs = [],
|
|
94
|
+
) {
|
|
89
95
|
const bin = resolveBrowserBin(browserBin);
|
|
90
96
|
const port = await chooseAvailablePort(preferredPort);
|
|
91
97
|
|
|
@@ -117,6 +123,7 @@ export async function startBrowserForCdp(preferredPort, profileDir, browserBin =
|
|
|
117
123
|
"--disable-backgrounding-occluded-windows",
|
|
118
124
|
"--disable-renderer-backgrounding",
|
|
119
125
|
|
|
126
|
+
...extraArgs,
|
|
120
127
|
`--remote-debugging-port=${port}`,
|
|
121
128
|
"--remote-debugging-address=127.0.0.1",
|
|
122
129
|
`--user-data-dir=${profileDir}`,
|
package/lib/daemon-client.js
CHANGED
|
@@ -11,9 +11,8 @@ export async function checkDaemonHealth({ daemonUrl, timeoutMs = 600 } = {}) {
|
|
|
11
11
|
const response = await fetch(`${daemonUrl}/health`, { signal: controller.signal });
|
|
12
12
|
clearTimeout(timeout);
|
|
13
13
|
|
|
14
|
-
if (!response.ok) return null;
|
|
15
14
|
const payload = await response.json().catch(() => null);
|
|
16
|
-
return payload && payload.status === "
|
|
15
|
+
return payload && typeof payload === "object" && typeof payload.status === "string" ? payload : null;
|
|
17
16
|
} catch {
|
|
18
17
|
return null;
|
|
19
18
|
}
|
|
@@ -67,8 +66,8 @@ export async function startDaemonInBackground({
|
|
|
67
66
|
|
|
68
67
|
child.unref();
|
|
69
68
|
|
|
70
|
-
// Wait up to ~
|
|
71
|
-
for (let i = 0; i <
|
|
69
|
+
// Wait up to ~12s for daemon to start responding. Browser startup can be slow.
|
|
70
|
+
for (let i = 0; i < 48; i += 1) {
|
|
72
71
|
await new Promise((r) => setTimeout(r, 250));
|
|
73
72
|
const health = await checkDaemonHealth({ daemonUrl, timeoutMs: 800 });
|
|
74
73
|
if (health) return health;
|
|
@@ -77,15 +76,30 @@ export async function startDaemonInBackground({
|
|
|
77
76
|
throw new Error(`daemon failed to start on ${daemonUrl}`);
|
|
78
77
|
}
|
|
79
78
|
|
|
79
|
+
function daemonMatchesExpectedConfig(health, { expectedProfileDir = null, expectedBrowserKind = null } = {}) {
|
|
80
|
+
if (!health) return false;
|
|
81
|
+
if (expectedProfileDir && health.profileDir && health.profileDir !== expectedProfileDir) return false;
|
|
82
|
+
if (expectedBrowserKind && health.browserKind && health.browserKind !== expectedBrowserKind) return false;
|
|
83
|
+
return true;
|
|
84
|
+
}
|
|
85
|
+
|
|
80
86
|
export async function ensureDaemonRunning({
|
|
81
87
|
scriptPath,
|
|
82
88
|
daemonUrl,
|
|
83
89
|
daemonPidFile,
|
|
84
90
|
forwardedArgs = [],
|
|
85
91
|
env = process.env,
|
|
92
|
+
expectedProfileDir = null,
|
|
93
|
+
expectedBrowserKind = null,
|
|
86
94
|
} = {}) {
|
|
87
95
|
const health = await checkDaemonHealth({ daemonUrl });
|
|
88
|
-
if (health
|
|
96
|
+
if (health && daemonMatchesExpectedConfig(health, { expectedProfileDir, expectedBrowserKind })) {
|
|
97
|
+
return health;
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
if (health) {
|
|
101
|
+
await stopDaemon({ daemonUrl, daemonPidFile }).catch(() => {});
|
|
102
|
+
}
|
|
89
103
|
|
|
90
104
|
// If a stale PID file exists, ignore it; health check is the source of truth.
|
|
91
105
|
return await startDaemonInBackground({ scriptPath, daemonUrl, daemonPidFile, forwardedArgs, env });
|
|
@@ -115,7 +129,7 @@ export async function stopDaemon({ daemonUrl, daemonPidFile } = {}) {
|
|
|
115
129
|
return { status: "stopping", pid };
|
|
116
130
|
}
|
|
117
131
|
|
|
118
|
-
export async function sendDaemonCommand({ daemonUrl, command, payload } = {}) {
|
|
132
|
+
export async function sendDaemonCommand({ daemonUrl, command, payload, timeoutMs = 180000 } = {}) {
|
|
119
133
|
if (!daemonUrl) throw new Error("sendDaemonCommand requires daemonUrl");
|
|
120
134
|
if (!command) throw new Error("sendDaemonCommand requires command");
|
|
121
135
|
|
|
@@ -123,7 +137,7 @@ export async function sendDaemonCommand({ daemonUrl, command, payload } = {}) {
|
|
|
123
137
|
method: "POST",
|
|
124
138
|
headers: { "Content-Type": "application/json" },
|
|
125
139
|
body: JSON.stringify({ command, payload }),
|
|
126
|
-
signal: AbortSignal.timeout(
|
|
140
|
+
signal: AbortSignal.timeout(timeoutMs),
|
|
127
141
|
});
|
|
128
142
|
|
|
129
143
|
const json = await response.json().catch(() => null);
|