unbrowse 2.0.3 → 2.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "unbrowse",
3
- "version": "2.0.3",
3
+ "version": "2.0.5",
4
4
  "description": "Reverse-engineer any website into reusable API skills. npm CLI + local engine.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -112,10 +112,10 @@ function pickFirefoxProfile(profilesRoot: string, profile?: string): string | nu
112
112
  return existsSync(candidate) ? candidate : null;
113
113
  }
114
114
 
115
- function getFirefoxCookiesPath(profile?: string): string | null {
116
- const profilesRoot = getFirefoxProfilesRoot();
117
- if (!profilesRoot || !existsSync(profilesRoot)) return null;
118
- return pickFirefoxProfile(profilesRoot, profile);
115
+ function getFirefoxCookiesPath(profile?: string, profilesRoot?: string): string | null {
116
+ const root = profilesRoot ?? getFirefoxProfilesRoot();
117
+ if (!root || !existsSync(root)) return null;
118
+ return pickFirefoxProfile(root, profile);
119
119
  }
120
120
 
121
121
  // ---------------------------------------------------------------------------
@@ -335,13 +335,14 @@ export function extractFromChromium(
335
335
 
336
336
  export function extractFromFirefox(
337
337
  domain: string,
338
- opts?: { profile?: string },
338
+ opts?: { profile?: string; profilesRoot?: string },
339
339
  ): ExtractionResult {
340
340
  const warnings: string[] = [];
341
- const dbPath = getFirefoxCookiesPath(opts?.profile);
341
+ const dbPath = getFirefoxCookiesPath(opts?.profile, opts?.profilesRoot);
342
+ const browserLabel = opts?.profilesRoot ? "Zen" : "Firefox";
342
343
 
343
344
  if (!dbPath) {
344
- warnings.push("Firefox cookies DB not found");
345
+ warnings.push(`${browserLabel} cookies DB not found`);
345
346
  return { cookies: [], source: null, warnings };
346
347
  }
347
348
 
@@ -373,14 +374,14 @@ export function extractFromFirefox(
373
374
  return results;
374
375
  });
375
376
 
376
- const source = opts?.profile ? `Firefox profile "${opts.profile}"` : "Firefox default profile";
377
+ const source = opts?.profile ? `${browserLabel} profile "${opts.profile}"` : `${browserLabel} default profile`;
377
378
  if (cookies.length === 0) {
378
379
  warnings.push(`No cookies for ${domain} found in ${source}`);
379
380
  }
380
381
  log("auth", `extracted ${cookies.length} cookies for ${domain} from ${source}`);
381
382
  return { cookies, source: cookies.length > 0 ? source : null, warnings };
382
383
  } catch (err) {
383
- warnings.push(`Firefox extraction failed: ${err instanceof Error ? err.message : err}`);
384
+ warnings.push(`${browserLabel} extraction failed: ${err instanceof Error ? err.message : err}`);
384
385
  return { cookies: [], source: null, warnings };
385
386
  }
386
387
  }
@@ -416,8 +417,62 @@ export function extractBrowserCookies(
416
417
  return chromium;
417
418
  }
418
419
 
419
- // Fall back to Chrome
420
+ // Try Chrome first
420
421
  const chrome = extractFromChrome(domain, { profile: opts?.chromeProfile });
421
- chrome.warnings.push(...ff.warnings);
422
- return chrome;
422
+ if (chrome.cookies.length > 0) {
423
+ chrome.warnings.push(...ff.warnings);
424
+ return chrome;
425
+ }
426
+
427
+ // Auto-discover other Chromium-family browsers
428
+ const home = homedir();
429
+ const chromiumBrowsers: Array<{ name: string; userDataDir: string; safeStorageService: string }> =
430
+ platform() === "darwin"
431
+ ? [
432
+ { name: "Arc", userDataDir: join(home, "Library", "Application Support", "Arc", "User Data"), safeStorageService: "Arc Safe Storage" },
433
+ { name: "Dia", userDataDir: join(home, "Library", "Application Support", "Dia", "User Data"), safeStorageService: "Dia Safe Storage" },
434
+ { name: "Brave", userDataDir: join(home, "Library", "Application Support", "BraveSoftware", "Brave-Browser"), safeStorageService: "Brave Safe Storage" },
435
+ { name: "Edge", userDataDir: join(home, "Library", "Application Support", "Microsoft Edge"), safeStorageService: "Microsoft Edge Safe Storage" },
436
+ { name: "Vivaldi", userDataDir: join(home, "Library", "Application Support", "Vivaldi"), safeStorageService: "Vivaldi Safe Storage" },
437
+ { name: "Chromium", userDataDir: join(home, "Library", "Application Support", "Chromium"), safeStorageService: "Chromium Safe Storage" },
438
+ ]
439
+ : platform() === "linux"
440
+ ? [
441
+ { name: "Brave", userDataDir: join(home, ".config", "BraveSoftware", "Brave-Browser"), safeStorageService: "Brave Safe Storage" },
442
+ { name: "Edge", userDataDir: join(home, ".config", "microsoft-edge"), safeStorageService: "Microsoft Edge Safe Storage" },
443
+ { name: "Vivaldi", userDataDir: join(home, ".config", "vivaldi"), safeStorageService: "Vivaldi Safe Storage" },
444
+ { name: "Chromium", userDataDir: join(home, ".config", "chromium"), safeStorageService: "Chromium Safe Storage" },
445
+ ]
446
+ : [];
447
+
448
+ const allWarnings = [...ff.warnings, ...chrome.warnings];
449
+ for (const browser of chromiumBrowsers) {
450
+ if (!existsSync(browser.userDataDir)) continue;
451
+ const result = extractFromChromium(domain, {
452
+ userDataDir: browser.userDataDir,
453
+ browserName: browser.name,
454
+ safeStorageService: browser.safeStorageService,
455
+ });
456
+ if (result.cookies.length > 0) {
457
+ result.warnings.push(...allWarnings);
458
+ return result;
459
+ }
460
+ allWarnings.push(...result.warnings);
461
+ }
462
+
463
+ // Also try Firefox-based alternatives (Zen)
464
+ const zenPaths = platform() === "darwin"
465
+ ? [join(home, "Library", "Application Support", "zen")]
466
+ : [join(home, ".zen")];
467
+ for (const zenRoot of zenPaths) {
468
+ if (!existsSync(zenRoot)) continue;
469
+ const zenResult = extractFromFirefox(domain, { profilesRoot: zenRoot });
470
+ if (zenResult.cookies.length > 0) {
471
+ zenResult.warnings.push(...allWarnings);
472
+ return zenResult;
473
+ }
474
+ allWarnings.push(...zenResult.warnings);
475
+ }
476
+
477
+ return { cookies: [], source: null, warnings: allWarnings };
423
478
  }
@@ -38,95 +38,43 @@ export async function interactiveLogin(
38
38
  domain?: string,
39
39
  ): Promise<LoginResult> {
40
40
  const targetDomain = domain ?? new URL(url).hostname;
41
- const profileDir = getProfilePath(targetDomain);
42
41
 
43
42
  log("auth", `interactiveLogin — url: ${url}, domain: ${targetDomain}`);
44
43
 
45
- try {
46
- fs.mkdirSync(profileDir, { recursive: true });
47
-
48
- // Start Kuri and get a tab
49
- await kuri.start();
50
- const tabId = await kuri.getDefaultTab();
51
- await kuri.networkEnable(tabId);
52
-
53
- // Navigate to login URL
54
- await kuri.navigate(tabId, url);
55
-
56
- const startTime = Date.now();
57
-
58
- // Snapshot initial cookies
59
- const initialCookies = await kuri.getCookies(tabId);
60
- const initialCookieCount = initialCookies.filter((c) => isDomainMatch(c.domain, targetDomain)).length;
61
- log("auth", `initial cookies for ${targetDomain}: ${initialCookieCount}`);
62
-
63
- // Wait for user to complete login — detect via cookie changes + URL change
64
- let loggedIn = false;
65
- let lastLoggedUrl = "";
66
- while (Date.now() - startTime < LOGIN_TIMEOUT_MS) {
67
- await new Promise((r) => setTimeout(r, POLL_INTERVAL_MS));
68
- const elapsed = Date.now() - startTime;
69
-
70
- try {
71
- const currentUrl = await kuri.getCurrentUrl(tabId);
72
- const currentDomain = new URL(currentUrl).hostname.toLowerCase();
73
- const targetNorm = targetDomain.toLowerCase();
74
-
75
- if (currentUrl !== lastLoggedUrl) {
76
- log("auth", `navigated to: ${currentUrl}`);
77
- lastLoggedUrl = currentUrl;
78
- }
79
-
80
- if (elapsed < MIN_WAIT_MS) continue;
81
-
82
- const isOnTarget = currentDomain === targetNorm || currentDomain.endsWith("." + targetNorm);
83
- if (isOnTarget) {
84
- const isStillLogin = /\/(login|signin|sign-in|sso|auth|oauth|uas\/login|checkpoint)/.test(new URL(currentUrl).pathname);
44
+ // Open URL in the user's default browser (visible, not headless)
45
+ const { exec } = await import("node:child_process");
46
+ const openCmd = process.platform === "darwin" ? "open" : "xdg-open";
47
+ exec(`${openCmd} ${JSON.stringify(url)}`);
48
+ log("auth", `opened ${url} in default browser via ${openCmd}`);
85
49
 
86
- const currentCookies = await kuri.getCookies(tabId);
87
- const currentCookieCount = currentCookies.filter((c) => isDomainMatch(c.domain, targetDomain)).length;
88
- const gotNewCookies = currentCookieCount > initialCookieCount;
50
+ // Poll extractBrowserAuth until cookies appear or timeout
51
+ const startTime = Date.now();
52
+ while (Date.now() - startTime < LOGIN_TIMEOUT_MS) {
53
+ await new Promise((r) => setTimeout(r, POLL_INTERVAL_MS));
89
54
 
90
- if (!isStillLogin && gotNewCookies) {
91
- loggedIn = true;
92
- log("auth", `login complete ${currentUrl} (cookies: ${initialCookieCount} → ${currentCookieCount})`);
93
- break;
94
- }
95
-
96
- if (!isStillLogin && currentCookieCount > 0) {
97
- loggedIn = true;
98
- log("auth", `already logged in — ${currentUrl} (${currentCookieCount} cookies present)`);
99
- break;
100
- }
101
- }
102
- } catch { /* page navigating */ }
103
- }
104
-
105
- if (!loggedIn) {
106
- log("auth", `login wait ended after ${Math.round((Date.now() - startTime) / 1000)}s — capturing cookies anyway`);
55
+ try {
56
+ const result = await extractBrowserAuth(targetDomain);
57
+ if (result.success && result.cookies_stored > 0) {
58
+ log("auth", `login detected — ${result.cookies_stored} cookies captured for ${targetDomain}`);
59
+ return result;
60
+ }
61
+ } catch (err) {
62
+ log("auth", `poll error: ${err instanceof Error ? err.message : err}`);
107
63
  }
108
64
 
109
- // Extract and store cookies
110
- const cookies = await kuri.getCookies(tabId);
111
- const domainCookies = cookies.filter((c) => isDomainMatch(c.domain, targetDomain));
112
-
113
- if (domainCookies.length === 0) {
114
- return { success: false, domain: targetDomain, cookies_stored: 0, error: "No cookies captured for domain" };
65
+ // Log progress every 10s
66
+ const elapsed = Date.now() - startTime;
67
+ if (elapsed % 10_000 < POLL_INTERVAL_MS) {
68
+ log("auth", `waiting for login... ${Math.round(elapsed / 1000)}s elapsed`);
115
69
  }
116
-
117
- const storableCookies = domainCookies.map((c) => ({
118
- name: c.name, value: c.value, domain: c.domain, path: c.path,
119
- secure: c.secure, httpOnly: c.httpOnly, sameSite: c.sameSite, expires: c.expires,
120
- }));
121
-
122
- const vaultKey = `auth:${getRegistrableDomain(targetDomain)}`;
123
- await storeCredential(vaultKey, JSON.stringify({ cookies: storableCookies }));
124
- log("auth", `stored ${storableCookies.length} cookies under ${vaultKey}`);
125
-
126
- return { success: true, domain: targetDomain, cookies_stored: storableCookies.length };
127
- } finally {
128
- // Cleanup handled by Kuri's tab management
129
70
  }
71
+
72
+ return {
73
+ success: false,
74
+ domain: targetDomain,
75
+ cookies_stored: 0,
76
+ error: `Login timed out after ${LOGIN_TIMEOUT_MS / 1000}s — no cookies detected in browser`,
77
+ };
130
78
  }
131
79
 
132
80
  /**
@@ -6,6 +6,40 @@ import { log } from "../logger.js";
6
6
  // BUG-GC-012: Use a real Chrome UA — HeadlessChrome is actively blocked by Google and others.
7
7
  const CHROME_UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";
8
8
 
9
+ // Stealth script — hides headless Chrome indicators from bot detection.
10
+ // Ported from kuri's cdp/js/stealth.js (commit 4dbbd89).
11
+ const STEALTH_SCRIPT = `
12
+ Object.defineProperty(navigator, 'webdriver', { get: () => false, configurable: true });
13
+ Object.defineProperty(navigator, 'plugins', {
14
+ get: () => {
15
+ const p = [
16
+ { name: 'Chrome PDF Plugin', filename: 'internal-pdf-viewer', description: 'Portable Document Format' },
17
+ { name: 'Chrome PDF Viewer', filename: 'mhjfbmdgcfjbbpaeojofohoefgiehjai', description: '' },
18
+ { name: 'Native Client', filename: 'internal-nacl-plugin', description: '' },
19
+ ];
20
+ p.length = 3;
21
+ return p;
22
+ },
23
+ configurable: true,
24
+ });
25
+ Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'], configurable: true });
26
+ if (!window.chrome) window.chrome = {};
27
+ if (!window.chrome.runtime) window.chrome.runtime = { connect: () => {}, sendMessage: () => {}, id: undefined };
28
+ const origQuery = window.navigator.permissions?.query;
29
+ if (origQuery) {
30
+ window.navigator.permissions.query = (p) =>
31
+ p.name === 'notifications' ? Promise.resolve({ state: Notification.permission }) : origQuery(p);
32
+ }
33
+ try {
34
+ const d = Object.getOwnPropertyDescriptor(HTMLIFrameElement.prototype, 'contentWindow');
35
+ if (d) Object.defineProperty(HTMLIFrameElement.prototype, 'contentWindow', { get: function() { return d.get.call(this); } });
36
+ } catch {}
37
+ Object.defineProperty(navigator, 'userAgent', {
38
+ get: () => '${CHROME_UA}',
39
+ configurable: true,
40
+ });
41
+ `;
42
+
9
43
  // Tab semaphore: max 3 concurrent capture tabs
10
44
  const MAX_CONCURRENT_TABS = 3;
11
45
  let activeTabs = 0;
@@ -654,6 +688,11 @@ export async function captureSession(
654
688
  await injectCookies(tabId, cookies);
655
689
  }
656
690
 
691
+ // Inject stealth patches — hide headless Chrome indicators from bot detection
692
+ try {
693
+ await kuri.evaluate(tabId, STEALTH_SCRIPT);
694
+ } catch { /* best-effort */ }
695
+
657
696
  // Start HAR recording
658
697
  await kuri.harStart(tabId);
659
698
 
@@ -662,16 +701,23 @@ export async function captureSession(
662
701
  try { pageDomain = getRegistrableDomain(new URL(url).hostname); } catch { /* bad url */ }
663
702
 
664
703
  // Inject fetch/XHR interceptor BEFORE navigation to capture all response bodies
665
- // Navigate directly to target URL — skip origin pre-navigation to save 1-2s on heavy SPAs.
666
- // The interceptor is re-injected after navigation anyway (page context resets on navigate).
667
- await kuri.evaluate(tabId, INTERCEPTOR_SCRIPT).catch(() => {});
704
+ // Navigate to origin first so cookies are applied in the correct domain context
705
+ // before the full page load — required for sites like LinkedIn that check auth on first load.
706
+ try {
707
+ const origin = new URL(url).origin;
708
+ await kuri.navigate(tabId, origin);
709
+ await new Promise((r) => setTimeout(r, 500));
710
+ await kuri.evaluate(tabId, STEALTH_SCRIPT);
711
+ await kuri.evaluate(tabId, INTERCEPTOR_SCRIPT);
712
+ } catch { /* best-effort */ }
668
713
 
669
714
  // Navigate to target URL
670
715
  await kuri.navigate(tabId, url);
671
716
 
672
- // Re-inject interceptor after navigation (page context resets on navigate)
717
+ // Re-inject stealth + interceptor after navigation (page context resets on navigate)
673
718
  try {
674
719
  await new Promise((r) => setTimeout(r, 300));
720
+ await kuri.evaluate(tabId, STEALTH_SCRIPT);
675
721
  await kuri.evaluate(tabId, INTERCEPTOR_SCRIPT);
676
722
  } catch { /* page may not be ready */ }
677
723
 
@@ -1792,13 +1792,15 @@ export async function executeEndpoint(
1792
1792
 
1793
1793
  // CSRF token auto-detection (bird pattern): many sites require CSRF tokens
1794
1794
  // as both a cookie AND a header. Detect common patterns and replay them.
1795
- if (!headers["x-csrf-token"] && !headers["x-xsrf-token"]) {
1795
+ if (!headers["x-csrf-token"] && !headers["x-xsrf-token"] && !headers["csrf-token"]) {
1796
1796
  const csrfCookie = cookies.find((c) =>
1797
- /^(ct0|csrf_token|_csrf|csrftoken|XSRF-TOKEN|_xsrf)$/i.test(c.name)
1797
+ /^(ct0|csrf_token|_csrf|csrftoken|XSRF-TOKEN|_xsrf|JSESSIONID)$/i.test(c.name)
1798
1798
  );
1799
1799
  if (csrfCookie) {
1800
1800
  const v = csrfCookie.value.startsWith('"') && csrfCookie.value.endsWith('"') ? csrfCookie.value.slice(1, -1) : csrfCookie.value;
1801
- headers["x-csrf-token"] = v;
1801
+ // LinkedIn uses "csrf-token" header derived from JSESSIONID
1802
+ const headerName = csrfCookie.name === "JSESSIONID" ? "csrf-token" : "x-csrf-token";
1803
+ headers[headerName] = v;
1802
1804
  }
1803
1805
  }
1804
1806
  }
@@ -381,8 +381,30 @@ function inferCsrfPlan(req: RawRequest, parsedBody?: unknown): CsrfPlan | undefi
381
381
  Object.entries(req.request_headers).map(([key, value]) => [key.toLowerCase(), value]),
382
382
  );
383
383
  const cookies = parseCookieHeader(headers["cookie"]);
384
- const csrfCookieNames = Object.keys(cookies).filter((name) => /^(ct0|csrf_token|_csrf|csrftoken|xsrf-token|_xsrf)$/i.test(name));
385
- const headerName = ["x-csrf-token", "x-xsrf-token", "x-csrftoken"].find((name) => typeof headers[name] === "string" && headers[name].length > 0);
384
+ const csrfCookieNames = Object.keys(cookies).filter((name) => /^(ct0|csrf_token|_csrf|csrftoken|xsrf-token|_xsrf|JSESSIONID)$/i.test(name));
385
+ const headerName = ["x-csrf-token", "x-xsrf-token", "x-csrftoken", "csrf-token"].find((name) => typeof headers[name] === "string" && headers[name].length > 0);
386
+
387
+ // Also detect CSRF by value matching: if any cookie value appears as a header value,
388
+ // that's a CSRF token pattern regardless of naming convention
389
+ if (!headerName && csrfCookieNames.length === 0) {
390
+ for (const [cookieName, cookieValue] of Object.entries(cookies)) {
391
+ if (!cookieValue || cookieValue.length < 8) continue;
392
+ const unquoted = cookieValue.startsWith('"') && cookieValue.endsWith('"') ? cookieValue.slice(1, -1) : cookieValue;
393
+ for (const [hName, hValue] of Object.entries(headers)) {
394
+ if (hName === "cookie" || hName === "host" || hName === "content-length") continue;
395
+ const hUnquoted = hValue.startsWith('"') && hValue.endsWith('"') ? hValue.slice(1, -1) : hValue;
396
+ if (unquoted === hUnquoted && unquoted.length >= 8) {
397
+ return {
398
+ source: "cookie",
399
+ param_name: hName,
400
+ refresh_on_401: true,
401
+ extractor_sequence: [cookieName],
402
+ };
403
+ }
404
+ }
405
+ }
406
+ }
407
+
386
408
  if (headerName && csrfCookieNames.length > 0) {
387
409
  return {
388
410
  source: "cookie",
Binary file
Binary file
Binary file
Binary file