unbrowse 2.0.2 → 2.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "unbrowse",
3
- "version": "2.0.2",
3
+ "version": "2.0.4",
4
4
  "description": "Reverse-engineer any website into reusable API skills. npm CLI + local engine.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -112,10 +112,10 @@ function pickFirefoxProfile(profilesRoot: string, profile?: string): string | nu
112
112
  return existsSync(candidate) ? candidate : null;
113
113
  }
114
114
 
115
- function getFirefoxCookiesPath(profile?: string): string | null {
116
- const profilesRoot = getFirefoxProfilesRoot();
117
- if (!profilesRoot || !existsSync(profilesRoot)) return null;
118
- return pickFirefoxProfile(profilesRoot, profile);
115
+ function getFirefoxCookiesPath(profile?: string, profilesRoot?: string): string | null {
116
+ const root = profilesRoot ?? getFirefoxProfilesRoot();
117
+ if (!root || !existsSync(root)) return null;
118
+ return pickFirefoxProfile(root, profile);
119
119
  }
120
120
 
121
121
  // ---------------------------------------------------------------------------
@@ -136,9 +136,10 @@ function getChromiumDecryptionKey(opts?: ChromiumCookieSourceOptions): Buffer |
136
136
  if (platform() !== "darwin") return null; // TODO: Linux/Windows support
137
137
 
138
138
  try {
139
- const keyOutput = execSync(
140
- `security find-generic-password -s "${service.replace(/"/g, '\\"')}" -w 2>/dev/null || echo ""`,
141
- { encoding: "utf8" },
139
+ const keyOutput = execFileSync(
140
+ "security",
141
+ ["find-generic-password", "-s", service, "-w"],
142
+ { encoding: "utf8", stdio: ["pipe", "pipe", "pipe"] },
142
143
  ).trim();
143
144
  if (!keyOutput) return null;
144
145
 
@@ -241,9 +242,13 @@ function buildDomainWhereClause(domain: string, column: string): string {
241
242
  `www.${reg}`,
242
243
  `.www.${reg}`,
243
244
  ]);
244
- const escaped = [...variants].map((d) => `'${d.replace(/'/g, "''")}'`);
245
- // Also match any subdomain via LIKE (e.g. .api.example.com, .sg.example.com)
246
- const likePattern = `'%.${reg.replace(/'/g, "''")}'`;
245
+ // Use parameterized-safe quoting: reject any domain containing single quotes
246
+ for (const d of variants) {
247
+ if (d.includes("'")) throw new Error(`Invalid domain for cookie query: ${d}`);
248
+ }
249
+ const escaped = [...variants].map((d) => `'${d}'`);
250
+ const likeReg = reg.includes("'") ? reg : reg;
251
+ const likePattern = `'%.${likeReg}'`;
247
252
  return `(${column} IN (${escaped.join(", ")}) OR ${column} LIKE ${likePattern})`;
248
253
  }
249
254
 
@@ -330,13 +335,14 @@ export function extractFromChromium(
330
335
 
331
336
  export function extractFromFirefox(
332
337
  domain: string,
333
- opts?: { profile?: string },
338
+ opts?: { profile?: string; profilesRoot?: string },
334
339
  ): ExtractionResult {
335
340
  const warnings: string[] = [];
336
- const dbPath = getFirefoxCookiesPath(opts?.profile);
341
+ const dbPath = getFirefoxCookiesPath(opts?.profile, opts?.profilesRoot);
342
+ const browserLabel = opts?.profilesRoot ? "Zen" : "Firefox";
337
343
 
338
344
  if (!dbPath) {
339
- warnings.push("Firefox cookies DB not found");
345
+ warnings.push(`${browserLabel} cookies DB not found`);
340
346
  return { cookies: [], source: null, warnings };
341
347
  }
342
348
 
@@ -368,14 +374,14 @@ export function extractFromFirefox(
368
374
  return results;
369
375
  });
370
376
 
371
- const source = opts?.profile ? `Firefox profile "${opts.profile}"` : "Firefox default profile";
377
+ const source = opts?.profile ? `${browserLabel} profile "${opts.profile}"` : `${browserLabel} default profile`;
372
378
  if (cookies.length === 0) {
373
379
  warnings.push(`No cookies for ${domain} found in ${source}`);
374
380
  }
375
381
  log("auth", `extracted ${cookies.length} cookies for ${domain} from ${source}`);
376
382
  return { cookies, source: cookies.length > 0 ? source : null, warnings };
377
383
  } catch (err) {
378
- warnings.push(`Firefox extraction failed: ${err instanceof Error ? err.message : err}`);
384
+ warnings.push(`${browserLabel} extraction failed: ${err instanceof Error ? err.message : err}`);
379
385
  return { cookies: [], source: null, warnings };
380
386
  }
381
387
  }
@@ -411,8 +417,62 @@ export function extractBrowserCookies(
411
417
  return chromium;
412
418
  }
413
419
 
414
- // Fall back to Chrome
420
+ // Try Chrome first
415
421
  const chrome = extractFromChrome(domain, { profile: opts?.chromeProfile });
416
- chrome.warnings.push(...ff.warnings);
417
- return chrome;
422
+ if (chrome.cookies.length > 0) {
423
+ chrome.warnings.push(...ff.warnings);
424
+ return chrome;
425
+ }
426
+
427
+ // Auto-discover other Chromium-family browsers
428
+ const home = homedir();
429
+ const chromiumBrowsers: Array<{ name: string; userDataDir: string; safeStorageService: string }> =
430
+ platform() === "darwin"
431
+ ? [
432
+ { name: "Arc", userDataDir: join(home, "Library", "Application Support", "Arc", "User Data"), safeStorageService: "Arc Safe Storage" },
433
+ { name: "Dia", userDataDir: join(home, "Library", "Application Support", "Dia", "User Data"), safeStorageService: "Dia Safe Storage" },
434
+ { name: "Brave", userDataDir: join(home, "Library", "Application Support", "BraveSoftware", "Brave-Browser"), safeStorageService: "Brave Safe Storage" },
435
+ { name: "Edge", userDataDir: join(home, "Library", "Application Support", "Microsoft Edge"), safeStorageService: "Microsoft Edge Safe Storage" },
436
+ { name: "Vivaldi", userDataDir: join(home, "Library", "Application Support", "Vivaldi"), safeStorageService: "Vivaldi Safe Storage" },
437
+ { name: "Chromium", userDataDir: join(home, "Library", "Application Support", "Chromium"), safeStorageService: "Chromium Safe Storage" },
438
+ ]
439
+ : platform() === "linux"
440
+ ? [
441
+ { name: "Brave", userDataDir: join(home, ".config", "BraveSoftware", "Brave-Browser"), safeStorageService: "Brave Safe Storage" },
442
+ { name: "Edge", userDataDir: join(home, ".config", "microsoft-edge"), safeStorageService: "Microsoft Edge Safe Storage" },
443
+ { name: "Vivaldi", userDataDir: join(home, ".config", "vivaldi"), safeStorageService: "Vivaldi Safe Storage" },
444
+ { name: "Chromium", userDataDir: join(home, ".config", "chromium"), safeStorageService: "Chromium Safe Storage" },
445
+ ]
446
+ : [];
447
+
448
+ const allWarnings = [...ff.warnings, ...chrome.warnings];
449
+ for (const browser of chromiumBrowsers) {
450
+ if (!existsSync(browser.userDataDir)) continue;
451
+ const result = extractFromChromium(domain, {
452
+ userDataDir: browser.userDataDir,
453
+ browserName: browser.name,
454
+ safeStorageService: browser.safeStorageService,
455
+ });
456
+ if (result.cookies.length > 0) {
457
+ result.warnings.push(...allWarnings);
458
+ return result;
459
+ }
460
+ allWarnings.push(...result.warnings);
461
+ }
462
+
463
+ // Also try Firefox-based alternatives (Zen)
464
+ const zenPaths = platform() === "darwin"
465
+ ? [join(home, "Library", "Application Support", "zen")]
466
+ : [join(home, ".zen")];
467
+ for (const zenRoot of zenPaths) {
468
+ if (!existsSync(zenRoot)) continue;
469
+ const zenResult = extractFromFirefox(domain, { profilesRoot: zenRoot });
470
+ if (zenResult.cookies.length > 0) {
471
+ zenResult.warnings.push(...allWarnings);
472
+ return zenResult;
473
+ }
474
+ allWarnings.push(...zenResult.warnings);
475
+ }
476
+
477
+ return { cookies: [], source: null, warnings: allWarnings };
418
478
  }
@@ -38,95 +38,43 @@ export async function interactiveLogin(
38
38
  domain?: string,
39
39
  ): Promise<LoginResult> {
40
40
  const targetDomain = domain ?? new URL(url).hostname;
41
- const profileDir = getProfilePath(targetDomain);
42
41
 
43
42
  log("auth", `interactiveLogin — url: ${url}, domain: ${targetDomain}`);
44
43
 
45
- try {
46
- fs.mkdirSync(profileDir, { recursive: true });
47
-
48
- // Start Kuri and get a tab
49
- await kuri.start();
50
- const tabId = await kuri.getDefaultTab();
51
- await kuri.networkEnable(tabId);
52
-
53
- // Navigate to login URL
54
- await kuri.navigate(tabId, url);
55
-
56
- const startTime = Date.now();
57
-
58
- // Snapshot initial cookies
59
- const initialCookies = await kuri.getCookies(tabId);
60
- const initialCookieCount = initialCookies.filter((c) => isDomainMatch(c.domain, targetDomain)).length;
61
- log("auth", `initial cookies for ${targetDomain}: ${initialCookieCount}`);
62
-
63
- // Wait for user to complete login — detect via cookie changes + URL change
64
- let loggedIn = false;
65
- let lastLoggedUrl = "";
66
- while (Date.now() - startTime < LOGIN_TIMEOUT_MS) {
67
- await new Promise((r) => setTimeout(r, POLL_INTERVAL_MS));
68
- const elapsed = Date.now() - startTime;
69
-
70
- try {
71
- const currentUrl = await kuri.getCurrentUrl(tabId);
72
- const currentDomain = new URL(currentUrl).hostname.toLowerCase();
73
- const targetNorm = targetDomain.toLowerCase();
74
-
75
- if (currentUrl !== lastLoggedUrl) {
76
- log("auth", `navigated to: ${currentUrl}`);
77
- lastLoggedUrl = currentUrl;
78
- }
79
-
80
- if (elapsed < MIN_WAIT_MS) continue;
81
-
82
- const isOnTarget = currentDomain === targetNorm || currentDomain.endsWith("." + targetNorm);
83
- if (isOnTarget) {
84
- const isStillLogin = /\/(login|signin|sign-in|sso|auth|oauth|uas\/login|checkpoint)/.test(new URL(currentUrl).pathname);
44
+ // Open URL in the user's default browser (visible, not headless)
45
+ const { exec } = await import("node:child_process");
46
+ const openCmd = process.platform === "darwin" ? "open" : "xdg-open";
47
+ exec(`${openCmd} ${JSON.stringify(url)}`);
48
+ log("auth", `opened ${url} in default browser via ${openCmd}`);
85
49
 
86
- const currentCookies = await kuri.getCookies(tabId);
87
- const currentCookieCount = currentCookies.filter((c) => isDomainMatch(c.domain, targetDomain)).length;
88
- const gotNewCookies = currentCookieCount > initialCookieCount;
50
+ // Poll extractBrowserAuth until cookies appear or timeout
51
+ const startTime = Date.now();
52
+ while (Date.now() - startTime < LOGIN_TIMEOUT_MS) {
53
+ await new Promise((r) => setTimeout(r, POLL_INTERVAL_MS));
89
54
 
90
- if (!isStillLogin && gotNewCookies) {
91
- loggedIn = true;
92
- log("auth", `login complete ${currentUrl} (cookies: ${initialCookieCount} → ${currentCookieCount})`);
93
- break;
94
- }
95
-
96
- if (!isStillLogin && currentCookieCount > 0) {
97
- loggedIn = true;
98
- log("auth", `already logged in — ${currentUrl} (${currentCookieCount} cookies present)`);
99
- break;
100
- }
101
- }
102
- } catch { /* page navigating */ }
103
- }
104
-
105
- if (!loggedIn) {
106
- log("auth", `login wait ended after ${Math.round((Date.now() - startTime) / 1000)}s — capturing cookies anyway`);
55
+ try {
56
+ const result = await extractBrowserAuth(targetDomain);
57
+ if (result.success && result.cookies_stored > 0) {
58
+ log("auth", `login detected — ${result.cookies_stored} cookies captured for ${targetDomain}`);
59
+ return result;
60
+ }
61
+ } catch (err) {
62
+ log("auth", `poll error: ${err instanceof Error ? err.message : err}`);
107
63
  }
108
64
 
109
- // Extract and store cookies
110
- const cookies = await kuri.getCookies(tabId);
111
- const domainCookies = cookies.filter((c) => isDomainMatch(c.domain, targetDomain));
112
-
113
- if (domainCookies.length === 0) {
114
- return { success: false, domain: targetDomain, cookies_stored: 0, error: "No cookies captured for domain" };
65
+ // Log progress every 10s
66
+ const elapsed = Date.now() - startTime;
67
+ if (elapsed % 10_000 < POLL_INTERVAL_MS) {
68
+ log("auth", `waiting for login... ${Math.round(elapsed / 1000)}s elapsed`);
115
69
  }
116
-
117
- const storableCookies = domainCookies.map((c) => ({
118
- name: c.name, value: c.value, domain: c.domain, path: c.path,
119
- secure: c.secure, httpOnly: c.httpOnly, sameSite: c.sameSite, expires: c.expires,
120
- }));
121
-
122
- const vaultKey = `auth:${getRegistrableDomain(targetDomain)}`;
123
- await storeCredential(vaultKey, JSON.stringify({ cookies: storableCookies }));
124
- log("auth", `stored ${storableCookies.length} cookies under ${vaultKey}`);
125
-
126
- return { success: true, domain: targetDomain, cookies_stored: storableCookies.length };
127
- } finally {
128
- // Cleanup handled by Kuri's tab management
129
70
  }
71
+
72
+ return {
73
+ success: false,
74
+ domain: targetDomain,
75
+ cookies_stored: 0,
76
+ error: `Login timed out after ${LOGIN_TIMEOUT_MS / 1000}s — no cookies detected in browser`,
77
+ };
130
78
  }
131
79
 
132
80
  /**
@@ -596,20 +596,19 @@ export function buildPageArtifactCapture(
596
596
  const extracted = extractFromDOM(html, intent);
597
597
  if (!extracted.data || extracted.confidence <= 0.2) return {};
598
598
  const quality = validateExtractionQuality(extracted.data, extracted.confidence, intent);
599
- if (!quality.valid) {
600
- return { quality_note: quality.quality_note ?? "low_quality_dom_extraction" };
601
- }
602
599
  const semanticAssessment = assessIntentResult(extracted.data, intent);
603
600
  if (semanticAssessment.verdict === "fail") {
604
601
  return { quality_note: semanticAssessment.reason };
605
602
  }
603
+ // Quality gate: low confidence still returns data to the caller (better than
604
+ // no_endpoints), but marks it so the caller can decide whether to publish.
606
605
  const response_schema = inferSchema([extracted.data]);
607
606
  const endpoint: EndpointDescriptor = {
608
607
  endpoint_id: nanoid(),
609
608
  method: "GET",
610
609
  url_template: templatizeQueryParams(url),
611
610
  idempotency: "safe" as const,
612
- verification_status: "verified" as const,
611
+ verification_status: quality.valid ? "verified" as const : "unverified" as const,
613
612
  reliability_score: extracted.confidence,
614
613
  description: `Captured page artifact for ${intent}`,
615
614
  response_schema,
@@ -637,8 +636,10 @@ export function buildPageArtifactCapture(
637
636
  method: extracted.extraction_method,
638
637
  confidence: extracted.confidence,
639
638
  source: "dom-fallback",
639
+ ...(quality.quality_note ? { quality_note: quality.quality_note } : {}),
640
640
  },
641
641
  },
642
+ ...(!quality.valid ? { quality_note: quality.quality_note } : {}),
642
643
  };
643
644
  }
644
645
 
@@ -1163,9 +1164,27 @@ async function executeBrowserCapture(
1163
1164
  cleanEndpoints.push(canonicalDocumentEndpoint);
1164
1165
  }
1165
1166
 
1166
- const pageArtifact = captured.html
1167
+ let pageArtifact = captured.html
1167
1168
  ? buildPageArtifactCapture(url, intent, captured.html, authBackedCapture)
1168
1169
  : {};
1170
+
1171
+ // SSR fallback: if Kuri's headless Chrome was bot-detected and served stripped
1172
+ // HTML, the DOM extraction above will fail or return low quality. Try a plain
1173
+ // HTTP fetch — many sites serve full SSR HTML to normal requests.
1174
+ if (!pageArtifact.endpoint) {
1175
+ const kuriHtmlLen = captured.html?.length ?? 0;
1176
+ const ssrFallback = await tryHttpFetch(url, {}, []).catch(() => null);
1177
+ if (ssrFallback && ssrFallback.html.length > kuriHtmlLen * 1.2) {
1178
+ console.log(`[ssr-fallback] Kuri HTML=${kuriHtmlLen}, fetch HTML=${ssrFallback.html.length} — retrying DOM extraction`);
1179
+ const ssrArtifact = buildPageArtifactCapture(ssrFallback.final_url || url, intent, ssrFallback.html, authBackedCapture);
1180
+ if (ssrArtifact.endpoint) {
1181
+ console.log(`[ssr-fallback] success — extracted structured data via plain HTTP fetch`);
1182
+ pageArtifact = ssrArtifact;
1183
+ } else {
1184
+ console.log(`[ssr-fallback] fetch got larger HTML but extraction still failed${ssrArtifact.quality_note ? `: ${ssrArtifact.quality_note}` : ""}`);
1185
+ }
1186
+ }
1187
+ }
1169
1188
  const domArtifactEndpoint = pageArtifact.endpoint;
1170
1189
  const domArtifactResult = pageArtifact.result;
1171
1190
  const inferredOnlyCapture = cleanEndpoints.length > 0 && cleanEndpoints.every((endpoint) => isBundleInferredEndpoint(endpoint));
@@ -1249,7 +1268,8 @@ async function executeBrowserCapture(
1249
1268
  };
1250
1269
  }
1251
1270
 
1252
- if (pageArtifact.quality_note) {
1271
+ if (pageArtifact.quality_note && !pageArtifact.endpoint) {
1272
+ // Quality gate rejected AND no endpoint — nothing useful extracted
1253
1273
  const trace: ExecutionTrace = stampTrace({
1254
1274
  trace_id: traceId,
1255
1275
  skill_id: skill.skill_id,
@@ -1772,13 +1792,15 @@ export async function executeEndpoint(
1772
1792
 
1773
1793
  // CSRF token auto-detection (bird pattern): many sites require CSRF tokens
1774
1794
  // as both a cookie AND a header. Detect common patterns and replay them.
1775
- if (!headers["x-csrf-token"] && !headers["x-xsrf-token"]) {
1795
+ if (!headers["x-csrf-token"] && !headers["x-xsrf-token"] && !headers["csrf-token"]) {
1776
1796
  const csrfCookie = cookies.find((c) =>
1777
- /^(ct0|csrf_token|_csrf|csrftoken|XSRF-TOKEN|_xsrf)$/i.test(c.name)
1797
+ /^(ct0|csrf_token|_csrf|csrftoken|XSRF-TOKEN|_xsrf|JSESSIONID)$/i.test(c.name)
1778
1798
  );
1779
1799
  if (csrfCookie) {
1780
1800
  const v = csrfCookie.value.startsWith('"') && csrfCookie.value.endsWith('"') ? csrfCookie.value.slice(1, -1) : csrfCookie.value;
1781
- headers["x-csrf-token"] = v;
1801
+ // LinkedIn uses "csrf-token" header derived from JSESSIONID
1802
+ const headerName = csrfCookie.name === "JSESSIONID" ? "csrf-token" : "x-csrf-token";
1803
+ headers[headerName] = v;
1782
1804
  }
1783
1805
  }
1784
1806
  }
@@ -381,8 +381,30 @@ function inferCsrfPlan(req: RawRequest, parsedBody?: unknown): CsrfPlan | undefi
381
381
  Object.entries(req.request_headers).map(([key, value]) => [key.toLowerCase(), value]),
382
382
  );
383
383
  const cookies = parseCookieHeader(headers["cookie"]);
384
- const csrfCookieNames = Object.keys(cookies).filter((name) => /^(ct0|csrf_token|_csrf|csrftoken|xsrf-token|_xsrf)$/i.test(name));
385
- const headerName = ["x-csrf-token", "x-xsrf-token", "x-csrftoken"].find((name) => typeof headers[name] === "string" && headers[name].length > 0);
384
+ const csrfCookieNames = Object.keys(cookies).filter((name) => /^(ct0|csrf_token|_csrf|csrftoken|xsrf-token|_xsrf|JSESSIONID)$/i.test(name));
385
+ const headerName = ["x-csrf-token", "x-xsrf-token", "x-csrftoken", "csrf-token"].find((name) => typeof headers[name] === "string" && headers[name].length > 0);
386
+
387
+ // Also detect CSRF by value matching: if any cookie value appears as a header value,
388
+ // that's a CSRF token pattern regardless of naming convention
389
+ if (!headerName && csrfCookieNames.length === 0) {
390
+ for (const [cookieName, cookieValue] of Object.entries(cookies)) {
391
+ if (!cookieValue || cookieValue.length < 8) continue;
392
+ const unquoted = cookieValue.startsWith('"') && cookieValue.endsWith('"') ? cookieValue.slice(1, -1) : cookieValue;
393
+ for (const [hName, hValue] of Object.entries(headers)) {
394
+ if (hName === "cookie" || hName === "host" || hName === "content-length") continue;
395
+ const hUnquoted = hValue.startsWith('"') && hValue.endsWith('"') ? hValue.slice(1, -1) : hValue;
396
+ if (unquoted === hUnquoted && unquoted.length >= 8) {
397
+ return {
398
+ source: "cookie",
399
+ param_name: hName,
400
+ refresh_on_401: true,
401
+ extractor_sequence: [cookieName],
402
+ };
403
+ }
404
+ }
405
+ }
406
+ }
407
+
386
408
  if (headerName && csrfCookieNames.length > 0) {
387
409
  return {
388
410
  source: "cookie",
Binary file