@ogulcancelik/pi-web-browse 1.0.3 → 1.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -10
- package/lib/browser-bin.js +11 -8
- package/lib/cdp.js +9 -12
- package/lib/daemon-client.js +21 -7
- package/lib/daemon.js +384 -85
- package/lib/fingerprint.js +352 -0
- package/lib/search.js +144 -47
- package/package.json +5 -4
- package/web-browse.js +138 -58
package/README.md
CHANGED
|
@@ -10,7 +10,7 @@ Web search and content extraction skill for [pi](https://github.com/badlogic/pi-
|
|
|
10
10
|
- 🌐 **Page Fetching** - Extract readable content from any URL
|
|
11
11
|
- 🤖 **Bot Protection Bypass** - Handles JS challenges, Cloudflare, etc.
|
|
12
12
|
- 🚀 **Persistent Daemon** - Warm browser session for fast subsequent requests
|
|
13
|
-
- 🖥️ **Cross-Platform** - Auto-detects
|
|
13
|
+
- 🖥️ **Cross-Platform** - Auto-detects Chrome, Brave, Edge, Chromium
|
|
14
14
|
|
|
15
15
|
## Install
|
|
16
16
|
|
|
@@ -48,25 +48,29 @@ Environment variables (all optional):
|
|
|
48
48
|
|
|
49
49
|
| Variable | Description | Default |
|
|
50
50
|
|----------|-------------|---------|
|
|
51
|
-
| `WEB_BROWSE_BROWSER_BIN` | Browser binary path | Auto-detected |
|
|
52
|
-
| `WEB_BROWSE_USER_AGENT` | User-Agent string |
|
|
51
|
+
| `WEB_BROWSE_BROWSER_BIN` | Browser binary path | Auto-detected, prefers Chrome |
|
|
52
|
+
| `WEB_BROWSE_USER_AGENT` | Override User-Agent string | Auto-derived from detected browser + OS |
|
|
53
53
|
| `WEB_BROWSE_DAEMON_PORT` | Daemon HTTP port | 9377 |
|
|
54
54
|
| `WEB_BROWSE_CDP_PORT` | Chrome DevTools port | 9225 |
|
|
55
55
|
| `WEB_BROWSE_DEBUG_DUMP` | Save debug files on failure | off |
|
|
56
56
|
|
|
57
|
+
By default, the hidden profile is browser-specific (for example `~/.config/web-browse-cdp-profile-chrome` or `~/.config/web-browse-cdp-profile-brave`) so updates do not try to reuse the same hidden profile across different Chromium-family browsers.
|
|
58
|
+
|
|
57
59
|
## Browser Detection
|
|
58
60
|
|
|
59
|
-
The skill auto-detects browsers in common locations
|
|
61
|
+
The skill auto-detects browsers in common locations and now prefers Chrome first, because Google Search is less likely to challenge headless Chrome than headless Brave on some setups.
|
|
62
|
+
|
|
63
|
+
- **Linux:** google-chrome, google-chrome-stable, brave, brave-browser, chromium (from PATH)
|
|
64
|
+
- **macOS:** Google Chrome, Google Chrome Canary, Brave Browser, Edge, Chromium (in /Applications)
|
|
65
|
+
- **Windows:** Chrome, Brave, Edge, Chromium (Program Files, LocalAppData)
|
|
60
66
|
|
|
61
|
-
|
|
62
|
-
- **macOS:** Brave Browser, Google Chrome, Chromium, Edge (in /Applications)
|
|
63
|
-
- **Windows:** Brave, Chrome, Edge, Chromium (Program Files, LocalAppData)
|
|
67
|
+
If you want Brave anyway, set `WEB_BROWSE_BROWSER_BIN` or pass `--browser-bin`.
|
|
64
68
|
|
|
65
69
|
## How It Works
|
|
66
70
|
|
|
67
|
-
1. **Search** - Uses Google via headless browser
|
|
68
|
-
2. **Fetch** - Opens
|
|
69
|
-
3. **Daemon** - Keeps a warm browser session for speed
|
|
71
|
+
1. **Search** - Uses Google via a persistent headless browser session. If Google blocks the request, it fails fast and falls back to DuckDuckGo.
|
|
72
|
+
2. **Fetch** - Opens URLs in the same hidden browser session, waits for JS, and extracts readable content.
|
|
73
|
+
3. **Daemon** - Keeps a warm, browser-specific hidden profile/session alive for speed and bot-protection resilience without touching your normal browser profile.
|
|
70
74
|
|
|
71
75
|
## License
|
|
72
76
|
|
package/lib/browser-bin.js
CHANGED
|
@@ -7,34 +7,37 @@ const IS_MACOS = PLATFORM === "darwin";
|
|
|
7
7
|
const IS_WINDOWS = PLATFORM === "win32";
|
|
8
8
|
|
|
9
9
|
// macOS .app bundle paths (checked as absolute paths)
|
|
10
|
+
// Prefer Chrome first: on some setups Google Search is less likely to challenge headless Chrome than headless Brave.
|
|
10
11
|
const MACOS_BROWSER_PATHS = [
|
|
11
|
-
"/Applications/Brave Browser.app/Contents/MacOS/Brave Browser",
|
|
12
12
|
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
|
|
13
13
|
"/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary",
|
|
14
|
-
"/Applications/
|
|
14
|
+
"/Applications/Brave Browser.app/Contents/MacOS/Brave Browser",
|
|
15
15
|
"/Applications/Microsoft Edge.app/Contents/MacOS/Microsoft Edge",
|
|
16
|
+
"/Applications/Chromium.app/Contents/MacOS/Chromium",
|
|
16
17
|
];
|
|
17
18
|
|
|
18
19
|
// Linux binary names (searched on PATH)
|
|
20
|
+
// Prefer Chrome first: on some setups Google Search is less likely to challenge headless Chrome than headless Brave.
|
|
19
21
|
const LINUX_BROWSER_NAMES = [
|
|
20
|
-
"brave",
|
|
21
|
-
"brave-browser",
|
|
22
22
|
"google-chrome",
|
|
23
23
|
"google-chrome-stable",
|
|
24
|
+
"brave",
|
|
25
|
+
"brave-browser",
|
|
24
26
|
"chromium",
|
|
25
27
|
"chromium-browser",
|
|
26
28
|
];
|
|
27
29
|
|
|
28
30
|
// Windows browser paths (common install locations)
|
|
31
|
+
// Prefer Chrome first: on some setups Google Search is less likely to challenge headless Chrome than headless Brave.
|
|
29
32
|
const WINDOWS_BROWSER_PATHS = [
|
|
30
|
-
// Brave
|
|
31
|
-
join(process.env.LOCALAPPDATA || "", "BraveSoftware", "Brave-Browser", "Application", "brave.exe"),
|
|
32
|
-
join(process.env.PROGRAMFILES || "", "BraveSoftware", "Brave-Browser", "Application", "brave.exe"),
|
|
33
|
-
join(process.env["PROGRAMFILES(X86)"] || "", "BraveSoftware", "Brave-Browser", "Application", "brave.exe"),
|
|
34
33
|
// Chrome
|
|
35
34
|
join(process.env.LOCALAPPDATA || "", "Google", "Chrome", "Application", "chrome.exe"),
|
|
36
35
|
join(process.env.PROGRAMFILES || "", "Google", "Chrome", "Application", "chrome.exe"),
|
|
37
36
|
join(process.env["PROGRAMFILES(X86)"] || "", "Google", "Chrome", "Application", "chrome.exe"),
|
|
37
|
+
// Brave
|
|
38
|
+
join(process.env.LOCALAPPDATA || "", "BraveSoftware", "Brave-Browser", "Application", "brave.exe"),
|
|
39
|
+
join(process.env.PROGRAMFILES || "", "BraveSoftware", "Brave-Browser", "Application", "brave.exe"),
|
|
40
|
+
join(process.env["PROGRAMFILES(X86)"] || "", "BraveSoftware", "Brave-Browser", "Application", "brave.exe"),
|
|
38
41
|
// Edge (comes with Windows 10/11)
|
|
39
42
|
join(process.env.PROGRAMFILES || "", "Microsoft", "Edge", "Application", "msedge.exe"),
|
|
40
43
|
join(process.env["PROGRAMFILES(X86)"] || "", "Microsoft", "Edge", "Application", "msedge.exe"),
|
package/lib/cdp.js
CHANGED
|
@@ -85,27 +85,23 @@ async function chooseAvailablePort(preferredPort) {
|
|
|
85
85
|
return await getEphemeralPort();
|
|
86
86
|
}
|
|
87
87
|
|
|
88
|
-
export async function startBrowserForCdp(
|
|
88
|
+
export async function startBrowserForCdp(
|
|
89
|
+
preferredPort,
|
|
90
|
+
profileDir,
|
|
91
|
+
browserBin = null,
|
|
92
|
+
spawnedProcessGroupPids = null,
|
|
93
|
+
extraArgs = [],
|
|
94
|
+
) {
|
|
89
95
|
const bin = resolveBrowserBin(browserBin);
|
|
90
96
|
const port = await chooseAvailablePort(preferredPort);
|
|
91
97
|
|
|
92
98
|
// OS-specific headless flags
|
|
93
99
|
let headlessArgs;
|
|
94
100
|
if (IS_MACOS || IS_WINDOWS) {
|
|
95
|
-
// macOS and Windows: use standard headless mode
|
|
96
|
-
// --headless=new injects "HeadlessChrome" into the UA string which is
|
|
97
|
-
// trivially detected by Google (results in /sorry/ CAPTCHA). Override
|
|
98
|
-
// the UA to look like a normal browser.
|
|
99
|
-
// On Linux this isn't needed: --ozone-platform=headless runs a full
|
|
100
|
-
// browser with a normal UA (no "Headless" marker).
|
|
101
|
-
const uaPlatform = IS_MACOS
|
|
102
|
-
? "(Macintosh; Intel Mac OS X 10_15_7)"
|
|
103
|
-
: "(Windows NT 10.0; Win64; x64)";
|
|
104
|
-
const HEADLESS_UA = `Mozilla/5.0 ${uaPlatform} AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36`;
|
|
101
|
+
// macOS and Windows: use standard headless mode
|
|
105
102
|
headlessArgs = [
|
|
106
103
|
"--headless=new",
|
|
107
104
|
"--window-size=1280,720",
|
|
108
|
-
`--user-agent=${HEADLESS_UA}`,
|
|
109
105
|
];
|
|
110
106
|
} else {
|
|
111
107
|
// Linux: use ozone headless platform (Wayland/X11 independent)
|
|
@@ -127,6 +123,7 @@ export async function startBrowserForCdp(preferredPort, profileDir, browserBin =
|
|
|
127
123
|
"--disable-backgrounding-occluded-windows",
|
|
128
124
|
"--disable-renderer-backgrounding",
|
|
129
125
|
|
|
126
|
+
...extraArgs,
|
|
130
127
|
`--remote-debugging-port=${port}`,
|
|
131
128
|
"--remote-debugging-address=127.0.0.1",
|
|
132
129
|
`--user-data-dir=${profileDir}`,
|
package/lib/daemon-client.js
CHANGED
|
@@ -11,9 +11,8 @@ export async function checkDaemonHealth({ daemonUrl, timeoutMs = 600 } = {}) {
|
|
|
11
11
|
const response = await fetch(`${daemonUrl}/health`, { signal: controller.signal });
|
|
12
12
|
clearTimeout(timeout);
|
|
13
13
|
|
|
14
|
-
if (!response.ok) return null;
|
|
15
14
|
const payload = await response.json().catch(() => null);
|
|
16
|
-
return payload && payload.status === "
|
|
15
|
+
return payload && typeof payload === "object" && typeof payload.status === "string" ? payload : null;
|
|
17
16
|
} catch {
|
|
18
17
|
return null;
|
|
19
18
|
}
|
|
@@ -67,8 +66,8 @@ export async function startDaemonInBackground({
|
|
|
67
66
|
|
|
68
67
|
child.unref();
|
|
69
68
|
|
|
70
|
-
// Wait up to ~
|
|
71
|
-
for (let i = 0; i <
|
|
69
|
+
// Wait up to ~12s for daemon to start responding. Browser startup can be slow.
|
|
70
|
+
for (let i = 0; i < 48; i += 1) {
|
|
72
71
|
await new Promise((r) => setTimeout(r, 250));
|
|
73
72
|
const health = await checkDaemonHealth({ daemonUrl, timeoutMs: 800 });
|
|
74
73
|
if (health) return health;
|
|
@@ -77,15 +76,30 @@ export async function startDaemonInBackground({
|
|
|
77
76
|
throw new Error(`daemon failed to start on ${daemonUrl}`);
|
|
78
77
|
}
|
|
79
78
|
|
|
79
|
+
function daemonMatchesExpectedConfig(health, { expectedProfileDir = null, expectedBrowserKind = null } = {}) {
|
|
80
|
+
if (!health) return false;
|
|
81
|
+
if (expectedProfileDir && health.profileDir && health.profileDir !== expectedProfileDir) return false;
|
|
82
|
+
if (expectedBrowserKind && health.browserKind && health.browserKind !== expectedBrowserKind) return false;
|
|
83
|
+
return true;
|
|
84
|
+
}
|
|
85
|
+
|
|
80
86
|
export async function ensureDaemonRunning({
|
|
81
87
|
scriptPath,
|
|
82
88
|
daemonUrl,
|
|
83
89
|
daemonPidFile,
|
|
84
90
|
forwardedArgs = [],
|
|
85
91
|
env = process.env,
|
|
92
|
+
expectedProfileDir = null,
|
|
93
|
+
expectedBrowserKind = null,
|
|
86
94
|
} = {}) {
|
|
87
95
|
const health = await checkDaemonHealth({ daemonUrl });
|
|
88
|
-
if (health
|
|
96
|
+
if (health && daemonMatchesExpectedConfig(health, { expectedProfileDir, expectedBrowserKind })) {
|
|
97
|
+
return health;
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
if (health) {
|
|
101
|
+
await stopDaemon({ daemonUrl, daemonPidFile }).catch(() => {});
|
|
102
|
+
}
|
|
89
103
|
|
|
90
104
|
// If a stale PID file exists, ignore it; health check is the source of truth.
|
|
91
105
|
return await startDaemonInBackground({ scriptPath, daemonUrl, daemonPidFile, forwardedArgs, env });
|
|
@@ -115,7 +129,7 @@ export async function stopDaemon({ daemonUrl, daemonPidFile } = {}) {
|
|
|
115
129
|
return { status: "stopping", pid };
|
|
116
130
|
}
|
|
117
131
|
|
|
118
|
-
export async function sendDaemonCommand({ daemonUrl, command, payload } = {}) {
|
|
132
|
+
export async function sendDaemonCommand({ daemonUrl, command, payload, timeoutMs = 180000 } = {}) {
|
|
119
133
|
if (!daemonUrl) throw new Error("sendDaemonCommand requires daemonUrl");
|
|
120
134
|
if (!command) throw new Error("sendDaemonCommand requires command");
|
|
121
135
|
|
|
@@ -123,7 +137,7 @@ export async function sendDaemonCommand({ daemonUrl, command, payload } = {}) {
|
|
|
123
137
|
method: "POST",
|
|
124
138
|
headers: { "Content-Type": "application/json" },
|
|
125
139
|
body: JSON.stringify({ command, payload }),
|
|
126
|
-
signal: AbortSignal.timeout(
|
|
140
|
+
signal: AbortSignal.timeout(timeoutMs),
|
|
127
141
|
});
|
|
128
142
|
|
|
129
143
|
const json = await response.json().catch(() => null);
|