@apmantza/greedysearch-pi 1.9.2 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,590 +1,653 @@
1
- // src/search/chrome.mjs — Chrome launch, probe, port file management, and CDP wrapper
2
- //
3
- // Extracted from search.mjs to reduce file complexity.
4
- //
5
- // cdp() is re-exported from extractors/common.mjs to avoid duplication.
6
- //
7
- // Idle timeout: mode-specific — headless Chrome is auto-killed after
8
- // GREEDY_SEARCH_IDLE_TIMEOUT_MINUTES (default 5). Visible Chrome (explicitly
9
- // launched for captcha/cookie setup) uses GREEDY_SEARCH_VISIBLE_IDLE_TIMEOUT_MINUTES
10
- // (default 60) because restarting it wastes the user's investment in solving captchas.
11
- // Set either to 0 to disable idle cleanup for that mode.
12
-
13
- import { spawn, execSync } from "node:child_process";
14
- import {
15
- existsSync,
16
- readFileSync,
17
- renameSync,
18
- unlinkSync,
19
- writeFileSync,
20
- } from "node:fs";
21
- import http from "node:http";
22
- import { platform, tmpdir } from "node:os";
23
- import { join } from "node:path";
24
- import {
25
- cdp as _cdp,
26
- injectHeadlessStealth,
27
- } from "../../extractors/common.mjs";
28
- import { resolveSystemCmd } from "../utils/system-cmds.mjs";
29
- import {
30
- ACTIVE_PORT_FILE,
31
- CHROME_MODE_FILE,
32
- GREEDY_PORT,
33
- PAGES_CACHE,
34
- } from "./constants.mjs";
35
- import {
36
- readMetadata,
37
- touchActivity as touchActivityBL,
38
- acquireLaunchLock,
39
- cleanupStaleSessions,
40
- registerClient,
41
- } from "./browser-lifecycle.mjs";
42
-
43
- const __dir =
44
- import.meta.dirname ||
45
- new URL(".", import.meta.url).pathname.replace(/^\/([A-Z]:)/, "$1");
46
-
47
- // ─── Mode-specific idle timeouts ─────────────────────────────────────
48
- // Headless: cheap to restart, aggressive cleanup after short idle.
49
- // Visible: user invested time in captcha/cookies — long grace period.
50
-
51
- const _tmp = tmpdir().replaceAll("\\", "/");
52
- const PID_FILE = `${_tmp}/greedysearch-chrome.pid`;
53
- const ACTIVITY_FILE = `${_tmp}/greedysearch-chrome-last-activity`;
54
-
55
- /** Headless idle timeout (default 5 min). Set to 0 to disable. */
56
- const HEADLESS_IDLE_TIMEOUT_MINUTES =
57
- Number.parseInt(process.env.GREEDY_SEARCH_IDLE_TIMEOUT_MINUTES || "5", 10) ||
58
- 5;
59
-
60
- /** Visible idle timeout (default 60 min). Much longer — captcha/cookie investment. */
61
- const VISIBLE_IDLE_TIMEOUT_MINUTES =
62
- Number.parseInt(
63
- process.env.GREEDY_SEARCH_VISIBLE_IDLE_TIMEOUT_MINUTES || "60",
64
- 10,
65
- ) || 60;
66
-
67
- /** Check if the running Chrome was launched in headless mode */
68
- export function isChromeHeadless() {
69
- try {
70
- if (!existsSync(CHROME_MODE_FILE)) return true; // default: headless
71
- return readFileSync(CHROME_MODE_FILE, "utf8").trim() === "headless";
72
- } catch {
73
- return true;
74
- }
75
- }
76
-
77
- /** Record that Chrome was just used / is active right now */
78
- export function touchActivity() {
79
- try {
80
- writeFileSync(ACTIVITY_FILE, String(Date.now()), "utf8");
81
- } catch {}
82
- // Also update structured metadata if it exists
83
- try {
84
- const md = readMetadata();
85
- if (md) touchActivityBL(md);
86
- } catch {}
87
- }
88
-
89
- /**
90
- * Find the PID of the process listening on GREEDY_PORT via OS tools.
91
- * Returns null if netstat/lsof/ss is unavailable or finds no listener; killProcessOnPort() then falls back to the PID file.
92
- */
93
- function getPortPid() {
94
- try {
95
- if (platform() === "win32") {
96
- const out = execSync(`${resolveSystemCmd("netstat")} -ano -p TCP 2>nul`, {
97
- encoding: "utf8",
98
- });
99
- const re = new RegExp(
100
- String.raw`TCP\s+\S+:${GREEDY_PORT}\s+\S+:0\s+LISTENING\s+(\d+)`,
101
- "i",
102
- );
103
- const m = out.match(re);
104
- return m ? Number.parseInt(m[1], 10) : null;
105
- }
106
- const out = execSync(
107
- `${resolveSystemCmd("lsof")} -i :${GREEDY_PORT} -t 2>/dev/null || ${resolveSystemCmd("ss")} -tlnp 2>/dev/null | ${resolveSystemCmd("grep")} :${GREEDY_PORT} | ${resolveSystemCmd("grep")} -oP 'pid=\\K\\d+'`,
108
- { encoding: "utf8" },
109
- ).trim();
110
- return out ? Number.parseInt(out.split("\n")[0], 10) : null;
111
- } catch {
112
- return null;
113
- }
114
- }
115
-
116
- /**
117
- * Send Browser.close via CDP WebSocket so Chrome flushes its cookie DB to disk
118
- * before we force-kill it. Gives the process up to `graceMs` to exit on its own.
119
- * Falls back to force-kill if Chrome is still running after the grace period.
120
- * Returns true if the process is gone after the call.
121
- */
122
- async function gracefulCloseChrome(graceMs = 1500) {
123
- try {
124
- const version = await new Promise((resolve, reject) => {
125
- const req = http.get(
126
- `http://localhost:${GREEDY_PORT}/json/version`,
127
- (res) => {
128
- let body = "";
129
- res.on("data", (d) => (body += d));
130
- res.on("end", () => {
131
- try {
132
- resolve(JSON.parse(body));
133
- } catch {
134
- reject(new Error("bad JSON"));
135
- }
136
- });
137
- },
138
- );
139
- req.on("error", reject);
140
- req.setTimeout(1000, () => {
141
- req.destroy();
142
- reject(new Error("timeout"));
143
- });
144
- });
145
-
146
- const ws = new globalThis.WebSocket(version.webSocketDebuggerUrl);
147
- await new Promise((resolve) => {
148
- ws.onopen = () => {
149
- ws.send(JSON.stringify({ id: 1, method: "Browser.close" }));
150
- // Give Chrome a moment to receive the command before we close the socket
151
- setTimeout(() => {
152
- ws.close();
153
- resolve();
154
- }, 200);
155
- };
156
- ws.onerror = () => resolve();
157
- setTimeout(resolve, 1000);
158
- });
159
- } catch {
160
- // Chrome not reachable — skip to force-kill
161
- }
162
-
163
- // Wait for Chrome to exit gracefully (flushes SQLite cookie DB)
164
- const deadline = Date.now() + graceMs;
165
- while (Date.now() < deadline) {
166
- const pid = getPortPid();
167
- if (!pid) return true; // already gone
168
- await new Promise((r) => setTimeout(r, 150));
169
- }
170
-
171
- // Still running — force-kill
172
- return killProcessOnPort();
173
- }
174
-
175
- /**
176
- * Force-kill whatever process is listening on GREEDY_PORT.
177
- * Uses OS tools to find the PID (not the PID file — handles ghost processes).
178
- * Never touches the user's main Chrome (which runs on different ports).
179
- */
180
- function killProcessOnPort() {
181
- try {
182
- let pid = getPortPid();
183
- if (!pid && existsSync(PID_FILE)) {
184
- pid = Number.parseInt(readFileSync(PID_FILE, "utf8").trim(), 10) || null;
185
- }
186
- if (!pid) return false;
187
-
188
- if (platform() === "win32") {
189
- execSync(`${resolveSystemCmd("taskkill")} /F /PID ${pid}`, {
190
- stdio: "ignore",
191
- });
192
- } else {
193
- process.kill(pid, "SIGKILL");
194
- }
195
- return true;
196
- } catch {
197
- return false;
198
- }
199
- }
200
-
201
- /**
202
- * Kill the Chrome on GREEDY_PORT (headless or visible).
203
- * Uses port-based detection (handles stale PID files / ghost processes).
204
- */
205
- export async function killChrome() {
206
- const ready = await probeGreedyChrome(500);
207
- if (!ready) {
208
- // Chrome not running — just clean up tracking files
209
- try {
210
- unlinkSync(PID_FILE);
211
- } catch {}
212
- try {
213
- unlinkSync(ACTIVITY_FILE);
214
- } catch {}
215
- try {
216
- unlinkSync(CHROME_MODE_FILE);
217
- } catch {}
218
- return false;
219
- }
220
-
221
- // Graceful close: sends Browser.close so Chrome flushes its cookie DB,
222
- // then force-kills if it doesn't exit within the grace period.
223
- const killed = await gracefulCloseChrome(1500);
224
-
225
- // Clean up tracking files regardless of kill success
226
- try {
227
- unlinkSync(PID_FILE);
228
- } catch {}
229
- try {
230
- unlinkSync(ACTIVITY_FILE);
231
- } catch {}
232
- try {
233
- unlinkSync(CHROME_MODE_FILE);
234
- } catch {}
235
-
236
- if (killed) {
237
- process.stderr.write(
238
- `[greedysearch] Killed Chrome on port ${GREEDY_PORT}.\n`,
239
- );
240
- }
241
- return killed;
242
- }
243
-
244
- // Backward-compat alias
245
- export const killHeadlessChrome = killChrome;
246
-
247
- /**
248
- * Check if Chrome has been idle too long and kill if so.
249
- * Uses mode-specific timeouts: headless → 5 min, visible → 60 min (defaults).
250
- * Visible Chrome has a much longer grace period because the user explicitly
251
- * launched it and invested time in captcha/cookie setup.
252
- * Returns true if Chrome was killed (caller should re-launch).
253
- */
254
- export async function checkAndKillIdle() {
255
- const headless = isChromeHeadless();
256
- const timeoutMinutes = headless
257
- ? HEADLESS_IDLE_TIMEOUT_MINUTES
258
- : VISIBLE_IDLE_TIMEOUT_MINUTES;
259
-
260
- // Disable idle cleanup for this mode
261
- if (timeoutMinutes <= 0) return false;
262
-
263
- if (!existsSync(ACTIVITY_FILE)) {
264
- touchActivity();
265
- return false;
266
- }
267
-
268
- try {
269
- const lastActivity = Number.parseInt(
270
- readFileSync(ACTIVITY_FILE, "utf8").trim(),
271
- 10,
272
- );
273
- if (!lastActivity) return false;
274
-
275
- const idleMs = Date.now() - lastActivity;
276
- const idleMinutes = idleMs / 60000;
277
-
278
- if (idleMinutes >= timeoutMinutes) {
279
- return killChrome();
280
- }
281
- } catch {}
282
-
283
- return false;
284
- }
285
-
286
- /** Re-export cdp() from the canonical location in extractors/common.mjs */
287
- export const cdp = _cdp;
288
-
289
- export async function getAnyTab() {
290
- const list = await cdp(["list"]);
291
- const first = list.split("\n")[0];
292
- if (!first) throw new Error("No Chrome tabs found");
293
- return first.slice(0, 8);
294
- }
295
-
296
- export async function openNewTab(url = "about:blank") {
297
- const anchor = await getAnyTab();
298
- const hostname = new URL(url).hostname;
299
- const needsStealth =
300
- hostname === "copilot.microsoft.com" ||
301
- hostname === "www.perplexity.ai" ||
302
- hostname === "perplexity.ai" ||
303
- hostname.endsWith(".perplexity.ai");
304
-
305
- if (needsStealth) {
306
- // Bing Copilot / Perplexity: create blank tab, inject stealth, return.
307
- // The extractor handles its own navigation, and Page.addScriptToEvaluateOnNewDocument
308
- // runs the stealth script before any page scripts.
309
- //
310
- // For Bing: stealth is awaited (Cloudflare blocks headless without it).
311
- // For Perplexity: stealth is fire-and-forget (Perplexity's anti-bot detects
312
- // the aggressive canvas/console patches, so we don't block on the CDP response).
313
- const raw = await cdp([
314
- "evalraw",
315
- anchor,
316
- "Target.createTarget",
317
- JSON.stringify({ url: "about:blank" }),
318
- ]);
319
- const { targetId } = JSON.parse(raw);
320
- const tid = targetId.slice(0, 8);
321
- await cdp(["list"]).catch(() => null);
322
-
323
- if (hostname === "copilot.microsoft.com") {
324
- await injectHeadlessStealth(tid);
325
- } else {
326
- // Perplexity: fire-and-forget (Perplexity's anti-bot detects awaited patches)
327
- injectHeadlessStealth(tid).catch(() => {});
328
- }
329
-
330
- await cdp(["list"]).catch(() => null);
331
- return targetId;
332
- }
333
-
334
- // Google / other engines: pre-seed with URL directly. Target.createTarget
335
- // navigation is less detectable than CDP Page.navigate.
336
- const raw = await cdp([
337
- "evalraw",
338
- anchor,
339
- "Target.createTarget",
340
- JSON.stringify({ url }),
341
- ]);
342
- const { targetId } = JSON.parse(raw);
343
- await cdp(["list"]).catch(() => null);
344
- return targetId;
345
- }
346
-
347
- export async function activateTab(targetId) {
348
- try {
349
- const anchor = await getAnyTab();
350
- await cdp([
351
- "evalraw",
352
- anchor,
353
- "Target.activateTarget",
354
- JSON.stringify({ targetId }),
355
- ]);
356
- } catch {
357
- // best-effort
358
- }
359
- }
360
-
361
- export async function closeTab(targetId) {
362
- try {
363
- const anchor = await getAnyTab();
364
- await cdp([
365
- "evalraw",
366
- anchor,
367
- "Target.closeTarget",
368
- JSON.stringify({ targetId }),
369
- ]);
370
- } catch {
371
- /* best-effort */
372
- }
373
- }
374
-
375
- export async function closeTabs(targetIds = []) {
376
- await Promise.all(
377
- targetIds.filter(Boolean).map((tid) => closeTab(tid).catch(() => {})),
378
- );
379
- if (targetIds.length > 0) {
380
- await cdp(["list"]).catch(() => null);
381
- }
382
- }
383
-
384
- export function getFullTabFromCache(engine, engineDomains) {
385
- try {
386
- if (!existsSync(PAGES_CACHE)) return null;
387
- const pages = JSON.parse(readFileSync(PAGES_CACHE, "utf8"));
388
- const found = pages.find((p) => p.url.includes(engineDomains[engine]));
389
- return found ? found.targetId : null;
390
- } catch {
391
- return null;
392
- }
393
- }
394
-
395
- export function probeGreedyChrome(timeoutMs = 3000) {
396
- return new Promise((resolve) => {
397
- const req = http.get(
398
- `http://localhost:${GREEDY_PORT}/json/version`,
399
- (res) => {
400
- res.resume();
401
- resolve(res.statusCode === 200);
402
- },
403
- );
404
- req.on("error", () => resolve(false));
405
- req.setTimeout(timeoutMs, () => {
406
- req.destroy();
407
- resolve(false);
408
- });
409
- });
410
- }
411
-
412
- export async function refreshPortFile() {
413
- const LOCK_FILE = `${ACTIVE_PORT_FILE}.lock`;
414
- const TEMP_FILE = `${ACTIVE_PORT_FILE}.tmp`;
415
- const LOCK_STALE_MS = 5000;
416
- const LOCK_WAIT_MS = 1000;
417
-
418
- // File-based lock with exclusive create + stale lock recovery
419
- const lockAcquired = await new Promise((resolve) => {
420
- const start = Date.now();
421
- const tryLock = () => {
422
- try {
423
- const payload = JSON.stringify({ pid: process.pid, ts: Date.now() });
424
- writeFileSync(LOCK_FILE, payload, { encoding: "utf8", flag: "wx" });
425
- resolve(true);
426
- } catch (e) {
427
- if (e?.code !== "EEXIST") {
428
- if (Date.now() - start < LOCK_WAIT_MS) {
429
- setTimeout(tryLock, 50);
430
- } else {
431
- resolve(false);
432
- }
433
- return;
434
- }
435
-
436
- try {
437
- const lockRaw = readFileSync(LOCK_FILE, "utf8").trim();
438
- const parsed = lockRaw.startsWith("{")
439
- ? JSON.parse(lockRaw)
440
- : { ts: Number(lockRaw) };
441
- const lockTime = Number(parsed?.ts) || 0;
442
-
443
- if (lockTime > 0 && Date.now() - lockTime > LOCK_STALE_MS) {
444
- try {
445
- unlinkSync(LOCK_FILE);
446
- } catch {}
447
- }
448
-
449
- if (Date.now() - start < LOCK_WAIT_MS) {
450
- setTimeout(tryLock, 50);
451
- } else {
452
- resolve(false);
453
- }
454
- } catch {
455
- if (Date.now() - start < LOCK_WAIT_MS) {
456
- setTimeout(tryLock, 50);
457
- } else {
458
- resolve(false);
459
- }
460
- }
461
- }
462
- };
463
- tryLock();
464
- });
465
-
466
- try {
467
- const body = await new Promise((res, rej) => {
468
- const req = http.get(
469
- `http://localhost:${GREEDY_PORT}/json/version`,
470
- (r) => {
471
- let b = "";
472
- r.on("data", (d) => (b += d));
473
- r.on("end", () => res(b));
474
- },
475
- );
476
- req.on("error", rej);
477
- req.setTimeout(3000, () => {
478
- req.destroy();
479
- rej(new Error("timeout"));
480
- });
481
- });
482
- const { webSocketDebuggerUrl } = JSON.parse(body);
483
- const wsPath = new URL(webSocketDebuggerUrl).pathname;
484
-
485
- // Atomic write: write to temp file, then rename
486
- if (lockAcquired) {
487
- writeFileSync(TEMP_FILE, `${GREEDY_PORT}\n${wsPath}`, "utf8");
488
- try {
489
- unlinkSync(ACTIVE_PORT_FILE);
490
- } catch {}
491
- renameSync(TEMP_FILE, ACTIVE_PORT_FILE);
492
- }
493
- } catch {
494
- /* best-effort — launch.mjs already wrote the file on first start */
495
- } finally {
496
- if (lockAcquired) {
497
- try {
498
- unlinkSync(LOCK_FILE);
499
- } catch {}
500
- }
501
- }
502
- }
503
-
504
- export async function ensureChrome() {
505
- // ── Stale session cleanup (once per process) + mode-specific idle check ──
506
- cleanupStaleSessions();
507
- const wasKilled = await checkAndKillIdle();
508
-
509
- const ready = wasKilled ? false : await probeGreedyChrome();
510
- // If Chrome is running but in wrong mode, kill it so we relaunch in the correct mode.
511
- let forceRelaunch = false;
512
- if (ready) {
513
- const headless = isChromeHeadless();
514
- const wantsVisible = process.env.GREEDY_SEARCH_VISIBLE === "1";
515
-
516
- if (!wantsVisible && !headless) {
517
- // Headless requested (default) but visible Chrome is running — switch back
518
- process.stderr.write(
519
- "[greedysearch] Visible Chrome detected — switching to headless mode...\n",
520
- );
521
- await killHeadlessChrome();
522
- await new Promise((r) => setTimeout(r, 1000));
523
- forceRelaunch = true;
524
- } else if (wantsVisible && headless) {
525
- // Visible requested but headless Chrome is running — switch
526
- process.stderr.write(
527
- "[greedysearch] Headless Chrome detected — switching to visible mode...\n",
528
- );
529
- await killHeadlessChrome();
530
- await new Promise((r) => setTimeout(r, 1000));
531
- forceRelaunch = true;
532
- }
533
- }
534
-
535
- const readyAfterModeCheck = forceRelaunch ? false : await probeGreedyChrome();
536
- if (readyAfterModeCheck) {
537
- // Chrome already running in correct mode — refresh port file, touch activity, register client
538
- await refreshPortFile();
539
- try {
540
- const md = readMetadata();
541
- if (md) {
542
- touchActivityBL(md);
543
- registerClient(md);
544
- }
545
- } catch {}
546
- return;
547
- }
548
-
549
- // ── Cross-process launch lock: prevent race between concurrent ensureChrome calls ──
550
- const lock = acquireLaunchLock();
551
- if (!lock.acquired) {
552
- // Another process is launching Chrome — wait and re-probe
553
- await new Promise((r) => setTimeout(r, 3000));
554
- const reReady = await probeGreedyChrome(5000);
555
- if (reReady) {
556
- await refreshPortFile();
557
- return;
558
- }
559
- // Still not ready — launch ourselves (the other launcher may have crashed)
560
- }
561
-
562
- try {
563
- // Double-check after acquiring lock (other process may have finished)
564
- const reCheck = await probeGreedyChrome(1000);
565
- if (reCheck) {
566
- await refreshPortFile();
567
- return;
568
- }
569
-
570
- process.stderr.write(
571
- `GreedySearch Chrome not running on port ${GREEDY_PORT} — auto-launching...\n`,
572
- );
573
- const launchArgs = [join(__dir, "..", "..", "bin", "launch.mjs")];
574
- // Headless is the default unless GREEDY_SEARCH_VISIBLE=1
575
- if (process.env.GREEDY_SEARCH_VISIBLE !== "1")
576
- launchArgs.push("--headless");
577
- await new Promise((resolve, reject) => {
578
- // Use process.execPath instead of bare "node" so we are not relying on PATH
579
- // (SonarCloud S4036).
580
- const proc = spawn(process.execPath, launchArgs, {
581
- stdio: ["ignore", process.stderr, process.stderr],
582
- });
583
- proc.on("close", (code) =>
584
- code === 0 ? resolve() : reject(new Error("launch.mjs failed")),
585
- );
586
- });
587
- } finally {
588
- lock.release();
589
- }
590
- }
1
+ // src/search/chrome.mjs — Chrome launch, probe, port file management, and CDP wrapper
2
+ //
3
+ // Extracted from search.mjs to reduce file complexity.
4
+ //
5
+ // cdp() is re-exported from extractors/common.mjs to avoid duplication.
6
+ //
7
+ // Idle timeout: mode-specific — headless Chrome is auto-killed after
8
+ // GREEDY_SEARCH_IDLE_TIMEOUT_MINUTES (default 5). Visible Chrome (explicitly
9
+ // launched for captcha/cookie setup) uses GREEDY_SEARCH_VISIBLE_IDLE_TIMEOUT_MINUTES
10
+ // (default 60) because restarting it wastes the user's investment in solving captchas.
11
+ // Set either to 0 to disable idle cleanup for that mode.
12
+
13
+ import { spawn, execFileSync, execSync } from "node:child_process";
14
+ import {
15
+ existsSync,
16
+ readFileSync,
17
+ renameSync,
18
+ unlinkSync,
19
+ writeFileSync,
20
+ } from "node:fs";
21
+ import http from "node:http";
22
+ import { platform, tmpdir } from "node:os";
23
+ import { join } from "node:path";
24
+ import {
25
+ cdp as _cdp,
26
+ injectHeadlessStealth,
27
+ } from "../../extractors/common.mjs";
28
+ import { resolveSystemCmd } from "../utils/system-cmds.mjs";
29
+ import {
30
+ ACTIVE_PORT_FILE,
31
+ CHROME_MODE_FILE,
32
+ GREEDY_PORT,
33
+ PAGES_CACHE,
34
+ } from "./constants.mjs";
35
+ import {
36
+ readMetadata,
37
+ touchActivity as touchActivityBL,
38
+ acquireLaunchLock,
39
+ cleanupStaleSessions,
40
+ registerClient,
41
+ } from "./browser-lifecycle.mjs";
42
+
43
+ const __dir =
44
+ import.meta.dirname ||
45
+ new URL(".", import.meta.url).pathname.replace(/^\/([A-Z]:)/, "$1");
46
+
47
+ // ─── Mode-specific idle timeouts ─────────────────────────────────────
48
+ // Headless: cheap to restart, aggressive cleanup after short idle.
49
+ // Visible: user invested time in captcha/cookies — long grace period.
50
+
51
+ const _tmp = tmpdir().replaceAll("\\", "/");
52
+ const PID_FILE = `${_tmp}/greedysearch-chrome.pid`;
53
+ const ACTIVITY_FILE = `${_tmp}/greedysearch-chrome-last-activity`;
54
+
55
+ /** Headless idle timeout (default 5 min). Set to 0 to disable. */
56
+ const HEADLESS_IDLE_TIMEOUT_MINUTES =
57
+ Number.parseInt(process.env.GREEDY_SEARCH_IDLE_TIMEOUT_MINUTES || "5", 10) ||
58
+ 5;
59
+
60
+ /** Visible idle timeout (default 60 min). Much longer — captcha/cookie investment. */
61
+ const VISIBLE_IDLE_TIMEOUT_MINUTES =
62
+ Number.parseInt(
63
+ process.env.GREEDY_SEARCH_VISIBLE_IDLE_TIMEOUT_MINUTES || "60",
64
+ 10,
65
+ ) || 60;
66
+
67
+ export function detectHeadlessFromChromeCommandLine(
68
+ cmdLine,
69
+ debugPort = GREEDY_PORT,
70
+ ) {
71
+ const normalized = String(cmdLine || "").toLowerCase();
72
+ if (
73
+ !normalized.includes(`--remote-debugging-port=${debugPort}`) ||
74
+ normalized.includes("--type=")
75
+ ) {
76
+ return null;
77
+ }
78
+ return normalized.includes("--headless");
79
+ }
80
+
81
+ /** Check if the running Chrome was launched in headless mode */
82
+ export function isChromeHeadless() {
83
+ // Prefer the live Chrome command line over the mode marker. The marker can be
84
+ // stale after cross-process relaunches; using it as authoritative made Gemini
85
+ // synthesis kill a visible Chrome immediately after opening its tab.
86
+ try {
87
+ const portPid = getPortPid();
88
+ const cmdLine = portPid ? getProcessCommandLine(portPid) : null;
89
+ const headless = detectHeadlessFromChromeCommandLine(cmdLine);
90
+ if (headless !== null) {
91
+ try {
92
+ writeFileSync(
93
+ CHROME_MODE_FILE,
94
+ headless ? "headless" : "visible",
95
+ "utf8",
96
+ );
97
+ } catch {}
98
+ return headless;
99
+ }
100
+ } catch {}
101
+
102
+ try {
103
+ if (!existsSync(CHROME_MODE_FILE)) return true; // default: headless
104
+ return readFileSync(CHROME_MODE_FILE, "utf8").trim() === "headless";
105
+ } catch {
106
+ return true;
107
+ }
108
+ }
109
+
110
+ /** Record that Chrome was just used / is active right now */
111
+ export function touchActivity() {
112
+ try {
113
+ writeFileSync(ACTIVITY_FILE, String(Date.now()), "utf8");
114
+ } catch {}
115
+ // Also update structured metadata if it exists
116
+ try {
117
+ const md = readMetadata();
118
+ if (md) touchActivityBL(md);
119
+ } catch {}
120
+ }
121
+
122
+ function getProcessCommandLine(pid) {
123
+ try {
124
+ if (platform() === "win32") {
125
+ const output = execFileSync(
126
+ resolveSystemCmd("powershell"),
127
+ [
128
+ "-NoProfile",
129
+ "-NonInteractive",
130
+ "-Command",
131
+ `(Get-CimInstance Win32_Process -Filter "ProcessId = ${pid}").CommandLine`,
132
+ ],
133
+ { encoding: "utf8", windowsHide: true, timeout: 5000 },
134
+ );
135
+ return output.trim() || null;
136
+ }
137
+ const output = execFileSync(
138
+ resolveSystemCmd("ps"),
139
+ ["-p", String(pid), "-o", "command="],
140
+ { encoding: "utf8", timeout: 5000 },
141
+ );
142
+ return output.trim() || null;
143
+ } catch {
144
+ return null;
145
+ }
146
+ }
147
+
148
+ /**
149
+ * Find the PID of the process listening on GREEDY_PORT via OS tools.
150
+ * Returns null if netstat/lsof/ss is unavailable or finds no listener; killProcessOnPort() then falls back to the PID file.
151
+ */
152
+ function getPortPid() {
153
+ try {
154
+ if (platform() === "win32") {
155
+ const out = execSync(`${resolveSystemCmd("netstat")} -ano -p TCP 2>nul`, {
156
+ encoding: "utf8",
157
+ });
158
+ const re = new RegExp(
159
+ String.raw`TCP\s+\S+:${GREEDY_PORT}\s+\S+:0\s+LISTENING\s+(\d+)`,
160
+ "i",
161
+ );
162
+ const m = out.match(re);
163
+ return m ? Number.parseInt(m[1], 10) : null;
164
+ }
165
+ const out = execSync(
166
+ `${resolveSystemCmd("lsof")} -i :${GREEDY_PORT} -t 2>/dev/null || ${resolveSystemCmd("ss")} -tlnp 2>/dev/null | ${resolveSystemCmd("grep")} :${GREEDY_PORT} | ${resolveSystemCmd("grep")} -oP 'pid=\\K\\d+'`,
167
+ { encoding: "utf8" },
168
+ ).trim();
169
+ return out ? Number.parseInt(out.split("\n")[0], 10) : null;
170
+ } catch {
171
+ return null;
172
+ }
173
+ }
174
+
175
+ /**
176
+ * Send Browser.close via CDP WebSocket so Chrome flushes its cookie DB to disk
177
+ * before we force-kill it. Gives the process up to `graceMs` to exit on its own.
178
+ * Falls back to force-kill if Chrome is still running after the grace period.
179
+ * Returns true if the process is gone after the call.
180
+ */
181
+ async function gracefulCloseChrome(graceMs = 1500) {
182
+ try {
183
+ const version = await new Promise((resolve, reject) => {
184
+ const req = http.get(
185
+ `http://localhost:${GREEDY_PORT}/json/version`,
186
+ (res) => {
187
+ let body = "";
188
+ res.on("data", (d) => (body += d));
189
+ res.on("end", () => {
190
+ try {
191
+ resolve(JSON.parse(body));
192
+ } catch {
193
+ reject(new Error("bad JSON"));
194
+ }
195
+ });
196
+ },
197
+ );
198
+ req.on("error", reject);
199
+ req.setTimeout(1000, () => {
200
+ req.destroy();
201
+ reject(new Error("timeout"));
202
+ });
203
+ });
204
+
205
+ const ws = new globalThis.WebSocket(version.webSocketDebuggerUrl);
206
+ await new Promise((resolve) => {
207
+ ws.onopen = () => {
208
+ ws.send(JSON.stringify({ id: 1, method: "Browser.close" }));
209
+ // Give Chrome a moment to receive the command before we close the socket
210
+ setTimeout(() => {
211
+ ws.close();
212
+ resolve();
213
+ }, 200);
214
+ };
215
+ ws.onerror = () => resolve();
216
+ setTimeout(resolve, 1000);
217
+ });
218
+ } catch {
219
+ // Chrome not reachable — skip to force-kill
220
+ }
221
+
222
+ // Wait for Chrome to exit gracefully (flushes SQLite cookie DB)
223
+ const deadline = Date.now() + graceMs;
224
+ while (Date.now() < deadline) {
225
+ const pid = getPortPid();
226
+ if (!pid) return true; // already gone
227
+ await new Promise((r) => setTimeout(r, 150));
228
+ }
229
+
230
+ // Still running — force-kill
231
+ return killProcessOnPort();
232
+ }
233
+
234
+ /**
235
+ * Force-kill whatever process is listening on GREEDY_PORT.
236
+ * Uses OS tools to find the PID (not the PID file — handles ghost processes).
237
+ * Never touches the user's main Chrome (which runs on different ports).
238
+ */
239
+ function killProcessOnPort() {
240
+ try {
241
+ let pid = getPortPid();
242
+ if (!pid && existsSync(PID_FILE)) {
243
+ pid = Number.parseInt(readFileSync(PID_FILE, "utf8").trim(), 10) || null;
244
+ }
245
+ if (!pid) return false;
246
+
247
+ if (platform() === "win32") {
248
+ execSync(`${resolveSystemCmd("taskkill")} /F /PID ${pid}`, {
249
+ stdio: "ignore",
250
+ });
251
+ } else {
252
+ process.kill(pid, "SIGKILL");
253
+ }
254
+ return true;
255
+ } catch {
256
+ return false;
257
+ }
258
+ }
259
+
260
+ /**
261
+ * Kill the Chrome on GREEDY_PORT (headless or visible).
262
+ * Uses port-based detection (handles stale PID files / ghost processes).
263
+ */
264
+ export async function killChrome() {
265
+ const ready = await probeGreedyChrome(500);
266
+ if (!ready) {
267
+ // Chrome not running — just clean up tracking files
268
+ try {
269
+ unlinkSync(PID_FILE);
270
+ } catch {}
271
+ try {
272
+ unlinkSync(ACTIVITY_FILE);
273
+ } catch {}
274
+ try {
275
+ unlinkSync(CHROME_MODE_FILE);
276
+ } catch {}
277
+ return false;
278
+ }
279
+
280
+ // Graceful close: sends Browser.close so Chrome flushes its cookie DB,
281
+ // then force-kills if it doesn't exit within the grace period.
282
+ const killed = await gracefulCloseChrome(1500);
283
+
284
+ // Clean up tracking files regardless of kill success
285
+ try {
286
+ unlinkSync(PID_FILE);
287
+ } catch {}
288
+ try {
289
+ unlinkSync(ACTIVITY_FILE);
290
+ } catch {}
291
+ try {
292
+ unlinkSync(CHROME_MODE_FILE);
293
+ } catch {}
294
+
295
+ if (killed) {
296
+ process.stderr.write(
297
+ `[greedysearch] Killed Chrome on port ${GREEDY_PORT}.\n`,
298
+ );
299
+ }
300
+ return killed;
301
+ }
302
+
303
+ // Backward-compat alias
304
+ export const killHeadlessChrome = killChrome;
305
+
306
+ /**
307
+ * Check if Chrome has been idle too long and kill if so.
308
+ * Uses mode-specific timeouts: headless → 5 min, visible → 60 min (defaults).
309
+ * Visible Chrome has a much longer grace period because the user explicitly
310
+ * launched it and invested time in captcha/cookie setup.
311
+ * Returns true if Chrome was killed (caller should re-launch).
312
+ */
313
+ export async function checkAndKillIdle() {
314
+ const headless = isChromeHeadless();
315
+ const timeoutMinutes = headless
316
+ ? HEADLESS_IDLE_TIMEOUT_MINUTES
317
+ : VISIBLE_IDLE_TIMEOUT_MINUTES;
318
+
319
+ // Disable idle cleanup for this mode
320
+ if (timeoutMinutes <= 0) return false;
321
+
322
+ if (!existsSync(ACTIVITY_FILE)) {
323
+ touchActivity();
324
+ return false;
325
+ }
326
+
327
+ try {
328
+ const lastActivity = Number.parseInt(
329
+ readFileSync(ACTIVITY_FILE, "utf8").trim(),
330
+ 10,
331
+ );
332
+ if (!lastActivity) return false;
333
+
334
+ const idleMs = Date.now() - lastActivity;
335
+ const idleMinutes = idleMs / 60000;
336
+
337
+ if (idleMinutes >= timeoutMinutes) {
338
+ return killChrome();
339
+ }
340
+ } catch {}
341
+
342
+ return false;
343
+ }
344
+
345
+ /** Re-export cdp() from the canonical location in extractors/common.mjs */
346
+ export const cdp = _cdp;
347
+
348
+ export async function getAnyTab() {
349
+ const list = await cdp(["list"]);
350
+ const first = list.split("\n")[0];
351
+ if (!first) throw new Error("No Chrome tabs found");
352
+ return first.slice(0, 8);
353
+ }
354
+
355
/**
 * Open a new Chrome tab via `Target.createTarget` and return its target id.
 *
 * Two strategies are used, chosen by the hostname of `url`:
 *
 * - Copilot (copilot.microsoft.com) and Perplexity (perplexity.ai and its
 *   subdomains): the tab is created on `about:blank` and the headless stealth
 *   script is injected; the extractor is expected to navigate on its own.
 *   For Copilot the injection is awaited; for Perplexity it is started but
 *   deliberately not awaited (fire-and-forget), with failures ignored.
 * - Every other engine: the tab is created directly on `url`, since
 *   `Target.createTarget` navigation is considered less detectable than a
 *   later CDP `Page.navigate`.
 *
 * `cdp(["list"])` is invoked after each step with errors ignored.
 *
 * @param {string} [url="about:blank"] Absolute URL the tab is intended for.
 * @returns {Promise<string>} The full `targetId` of the created tab.
 * @throws {TypeError} If `url` cannot be parsed by `new URL()`.
 */
export async function openNewTab(url = "about:blank") {
  const anchor = await getAnyTab();
  const { hostname: host } = new URL(url);

  const isCopilot = host === "copilot.microsoft.com";
  const isPerplexity =
    host === "www.perplexity.ai" ||
    host === "perplexity.ai" ||
    host.endsWith(".perplexity.ai");
  const needsStealth = isCopilot || isPerplexity;

  // Stealth engines start on a blank page; everyone else is seeded with the URL.
  const seedUrl = needsStealth ? "about:blank" : url;
  const created = await cdp([
    "evalraw",
    anchor,
    "Target.createTarget",
    JSON.stringify({ url: seedUrl }),
  ]);
  const { targetId } = JSON.parse(created);

  if (!needsStealth) {
    await cdp(["list"]).catch(() => null);
    return targetId;
  }

  // Short (8-char) id form is what the stealth injector is addressed with.
  const shortId = targetId.slice(0, 8);
  await cdp(["list"]).catch(() => null);

  if (isCopilot) {
    // Copilot: wait until the stealth script is registered before returning.
    await injectHeadlessStealth(shortId);
  } else {
    // Perplexity: fire-and-forget — do not block on the CDP response.
    injectHeadlessStealth(shortId).catch(() => {});
  }

  await cdp(["list"]).catch(() => null);
  return targetId;
}
405
+
406
/**
 * Bring the given tab to the foreground via `Target.activateTarget`.
 *
 * Best-effort: any failure (no anchor tab, CDP error) is silently ignored.
 *
 * @param {string} targetId Target id of the tab to activate.
 * @returns {Promise<void>}
 */
export async function activateTab(targetId) {
  try {
    const anchorTab = await getAnyTab();
    const params = JSON.stringify({ targetId });
    await cdp(["evalraw", anchorTab, "Target.activateTarget", params]);
  } catch {
    // Activation is best-effort; failures are intentionally ignored.
  }
}
419
+
420
/**
 * Close the given tab via `Target.closeTarget`.
 *
 * Best-effort: any failure (no anchor tab, CDP error) is silently ignored,
 * so the returned promise never rejects.
 *
 * @param {string} targetId Target id of the tab to close.
 * @returns {Promise<void>}
 */
export async function closeTab(targetId) {
  try {
    const anchorTab = await getAnyTab();
    const params = JSON.stringify({ targetId });
    await cdp(["evalraw", anchorTab, "Target.closeTarget", params]);
  } catch {
    // Closing is best-effort; failures are intentionally ignored.
  }
}
433
+
434
/**
 * Close several tabs concurrently, skipping falsy ids.
 *
 * After the close attempts settle, `cdp(["list"])` is invoked once (errors
 * ignored) whenever the input array was non-empty.
 *
 * @param {Array<string|null|undefined>} [targetIds=[]] Target ids to close.
 * @returns {Promise<void>}
 */
export async function closeTabs(targetIds = []) {
  const usableIds = targetIds.filter((id) => Boolean(id));
  const closures = usableIds.map((id) => closeTab(id).catch(() => {}));
  await Promise.all(closures);

  if (!(targetIds.length > 0)) {
    return;
  }
  await cdp(["list"]).catch(() => null);
}
442
+
443
/**
 * Look up the full target id of a cached tab belonging to an engine.
 *
 * Reads the JSON array stored at PAGES_CACHE and returns the `targetId` of the
 * first entry whose `url` contains the engine's domain.
 *
 * @param {string} engine Key into `engineDomains`.
 * @param {Record<string, string>} engineDomains Map of engine name to a domain substring.
 * @returns {string|null} The matching entry's `targetId`, or `null` when the
 *   engine has no domain, the cache is missing/unreadable/not an array, or no
 *   entry matches.
 */
export function getFullTabFromCache(engine, engineDomains) {
  // Without this guard, includes(undefined) would search for the literal
  // text "undefined", and includes("") would match every entry.
  const domain = engineDomains?.[engine];
  if (typeof domain !== "string" || domain === "") return null;

  try {
    if (!existsSync(PAGES_CACHE)) return null;
    const pages = JSON.parse(readFileSync(PAGES_CACHE, "utf8"));
    if (!Array.isArray(pages)) return null;

    // Skip malformed entries instead of letting one of them abort the lookup.
    const found = pages.find(
      (page) => typeof page?.url === "string" && page.url.includes(domain),
    );
    return found?.targetId ?? null;
  } catch {
    // Unreadable or invalid cache is treated the same as "no cached tab".
    return null;
  }
}
453
+
454
/**
 * Check whether Chrome's DevTools HTTP endpoint answers on GREEDY_PORT.
 *
 * Issues a GET to `/json/version` on localhost. The promise never rejects:
 * request errors and timeouts both resolve to `false`.
 *
 * @param {number} [timeoutMs=3000] Socket timeout for the probe request.
 * @returns {Promise<boolean>} `true` only when the endpoint responds with HTTP 200.
 */
export function probeGreedyChrome(timeoutMs = 3000) {
  return new Promise((resolve) => {
    const versionUrl = `http://localhost:${GREEDY_PORT}/json/version`;

    const onResponse = (response) => {
      // Drain the body so the socket is released; only the status matters.
      response.resume();
      resolve(response.statusCode === 200);
    };

    const request = http.get(versionUrl, onResponse);
    request.on("error", () => {
      resolve(false);
    });
    request.setTimeout(timeoutMs, () => {
      request.destroy();
      resolve(false);
    });
  });
}
470
+
471
/**
 * Rewrite ACTIVE_PORT_FILE with the current DevTools port and WebSocket path.
 *
 * Steps:
 *  1. Take a file lock (`<ACTIVE_PORT_FILE>.lock`, exclusive create). A lock
 *     whose recorded timestamp is older than 5 s is removed as stale. Waiting
 *     is capped at 1 s, polling every 50 ms.
 *  2. Fetch `/json/version` from Chrome on GREEDY_PORT and extract the path of
 *     `webSocketDebuggerUrl`.
 *  3. Write `"<port>\n<wsPath>"` to a temp file and rename it over
 *     ACTIVE_PORT_FILE so readers never observe a missing or partial file.
 *
 * The whole operation is best-effort: if the lock cannot be taken nothing is
 * fetched or written, and any failure while fetching/writing is swallowed.
 *
 * @returns {Promise<void>}
 */
export async function refreshPortFile() {
  const LOCK_FILE = `${ACTIVE_PORT_FILE}.lock`;
  const TEMP_FILE = `${ACTIVE_PORT_FILE}.tmp`;
  const LOCK_STALE_MS = 5000;
  const LOCK_WAIT_MS = 1000;
  const LOCK_RETRY_MS = 50;

  const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  // Delete the lock file if its recorded timestamp is older than LOCK_STALE_MS.
  // A lock with no usable timestamp (e.g. one that is still being written by
  // its owner) is left alone.
  const removeLockIfStale = () => {
    try {
      const lockRaw = readFileSync(LOCK_FILE, "utf8").trim();
      const parsed = lockRaw.startsWith("{")
        ? JSON.parse(lockRaw)
        : { ts: Number(lockRaw) };
      const lockTime = Number(parsed?.ts) || 0;
      if (lockTime > 0 && Date.now() - lockTime > LOCK_STALE_MS) {
        unlinkSync(LOCK_FILE);
      }
    } catch {
      // Lock vanished or is unreadable — the caller simply retries.
    }
  };

  // Try to create the lock file exclusively until LOCK_WAIT_MS has elapsed.
  const acquireLock = async () => {
    const start = Date.now();
    for (;;) {
      try {
        const payload = JSON.stringify({ pid: process.pid, ts: Date.now() });
        writeFileSync(LOCK_FILE, payload, { encoding: "utf8", flag: "wx" });
        return true;
      } catch (e) {
        if (e?.code === "EEXIST") removeLockIfStale();
      }
      if (Date.now() - start >= LOCK_WAIT_MS) return false;
      await sleep(LOCK_RETRY_MS);
    }
  };

  // Fetch the raw /json/version body from the DevTools HTTP endpoint.
  const fetchVersionBody = () =>
    new Promise((resolve, reject) => {
      const req = http.get(
        `http://localhost:${GREEDY_PORT}/json/version`,
        (res) => {
          if (res.statusCode !== 200) {
            res.resume();
            reject(new Error(`unexpected status ${res.statusCode}`));
            return;
          }
          let body = "";
          // Decode as a stream so multi-byte characters split across chunks survive.
          res.setEncoding("utf8");
          res.on("data", (chunk) => {
            body += chunk;
          });
          res.on("end", () => resolve(body));
          res.on("error", reject);
        },
      );
      req.on("error", reject);
      req.setTimeout(3000, () => {
        req.destroy();
        reject(new Error("timeout"));
      });
    });

  // Nothing is ever written without the lock, so skip the HTTP round-trip too.
  if (!(await acquireLock())) return;

  try {
    const { webSocketDebuggerUrl } = JSON.parse(await fetchVersionBody());
    const wsPath = new URL(webSocketDebuggerUrl).pathname;

    // Atomic write: write to temp file, then rename over the destination.
    writeFileSync(TEMP_FILE, `${GREEDY_PORT}\n${wsPath}`, "utf8");
    try {
      renameSync(TEMP_FILE, ACTIVE_PORT_FILE);
    } catch {
      // Fallback for environments where replacing an existing file fails:
      // remove the destination first, then retry the rename once.
      try {
        unlinkSync(ACTIVE_PORT_FILE);
      } catch {}
      renameSync(TEMP_FILE, ACTIVE_PORT_FILE);
    }
  } catch {
    /* best-effort — launch.mjs already wrote the file on first start */
    try {
      unlinkSync(TEMP_FILE); // do not leave a half-finished temp file behind
    } catch {}
  } finally {
    try {
      unlinkSync(LOCK_FILE);
    } catch {}
  }
}
562
+
563
/**
 * Make sure a GreedySearch Chrome instance is running in the requested mode.
 *
 * Flow:
 *  1. Clean up stale sessions and run the idle check (which may kill Chrome).
 *  2. Probe the DevTools endpoint (one retry after 500 ms).
 *  3. If Chrome is up but in the wrong mode (headless vs. visible, selected by
 *     GREEDY_SEARCH_VISIBLE=1), kill it and force a relaunch.
 *  4. If Chrome is up in the right mode: refresh the port file, touch activity,
 *     register this client, and return.
 *  5. Otherwise take the cross-process launch lock and spawn bin/launch.mjs
 *     (with `--headless` unless GREEDY_SEARCH_VISIBLE=1), re-probing first in
 *     case another process finished launching in the meantime.
 *
 * @returns {Promise<void>}
 * @throws {Error} "launch.mjs failed" when the launcher cannot be spawned or
 *   exits with a non-zero code.
 */
export async function ensureChrome() {
  const sleep = (ms) => new Promise((r) => setTimeout(r, ms));

  // ── Stale session cleanup (once per process) + mode-specific idle check ──
  cleanupStaleSessions();
  const wasKilled = await checkAndKillIdle();

  let ready = wasKilled ? false : await probeGreedyChrome();
  if (!ready && !wasKilled) {
    await sleep(500);
    ready = await probeGreedyChrome();
  }

  // If Chrome is running but in wrong mode, kill it so we relaunch in the correct mode.
  let forceRelaunch = false;
  if (ready) {
    const headless = isChromeHeadless();
    const wantsVisible = process.env.GREEDY_SEARCH_VISIBLE === "1";
    // Mismatch: visible wanted while headless runs, or headless wanted while visible runs.
    const wrongMode = wantsVisible ? Boolean(headless) : !headless;
    if (wrongMode) {
      process.stderr.write(
        wantsVisible
          ? "[greedysearch] Headless Chrome detected — switching to visible mode...\n"
          : "[greedysearch] Visible Chrome detected — switching to headless mode...\n",
      );
      // The same kill helper is used for both directions of the switch.
      await killHeadlessChrome();
      await sleep(1000);
      forceRelaunch = true;
    }
  }

  const readyAfterModeCheck = forceRelaunch ? false : await probeGreedyChrome();
  if (readyAfterModeCheck) {
    // Chrome already running in correct mode — refresh port file, touch activity, register client
    await refreshPortFile();
    try {
      const md = readMetadata();
      if (md) {
        touchActivityBL(md);
        registerClient(md);
      }
    } catch {
      // Metadata bookkeeping is best-effort and must not block the search.
    }
    return;
  }

  // ── Cross-process launch lock: prevent race between concurrent ensureChrome calls ──
  const lock = acquireLaunchLock();
  if (!lock.acquired) {
    // Another process is launching Chrome — wait and re-probe
    await sleep(3000);
    const reReady = await probeGreedyChrome(5000);
    if (reReady) {
      await refreshPortFile();
      return;
    }
    // Still not ready — launch ourselves (the other launcher may have crashed)
  }

  try {
    // Double-check after acquiring lock (other process may have finished)
    const reCheck = await probeGreedyChrome(1000);
    if (reCheck) {
      await refreshPortFile();
      return;
    }

    process.stderr.write(
      `GreedySearch Chrome not running on port ${GREEDY_PORT} — auto-launching...\n`,
    );
    const launchArgs = [join(__dir, "..", "..", "bin", "launch.mjs")];
    // Headless is the default unless GREEDY_SEARCH_VISIBLE=1
    if (process.env.GREEDY_SEARCH_VISIBLE !== "1") {
      launchArgs.push("--headless");
    }
    await new Promise((resolve, reject) => {
      // Use process.execPath instead of bare "node" so we are not relying on PATH
      // (SonarCloud S4036).
      const proc = spawn(process.execPath, launchArgs, {
        stdio: ["ignore", process.stderr, process.stderr],
      });
      // Without an 'error' listener a failed spawn is an unhandled 'error'
      // event: it would crash the process instead of rejecting this promise.
      proc.on("error", (err) =>
        reject(new Error("launch.mjs failed", { cause: err })),
      );
      proc.on("close", (code) =>
        code === 0 ? resolve() : reject(new Error("launch.mjs failed")),
      );
    });
  } finally {
    // NOTE(review): release() also runs when lock.acquired was false (fallback
    // launch path); presumably it is a no-op in that case — confirm in
    // browser-lifecycle.mjs.
    lock.release();
  }
}