unbrowse 2.0.1 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "unbrowse",
3
- "version": "2.0.1",
3
+ "version": "2.0.2",
4
4
  "description": "Reverse-engineer any website into reusable API skills. npm CLI + local engine.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -137,7 +137,9 @@ export function isBlockedAppShell(html?: string): boolean {
137
137
  /switch to a supported browser/i.test(html) ||
138
138
  /Something went wrong, but don.?t fret/i.test(html) ||
139
139
  /class=["']errorContainer["']/i.test(html) ||
140
- /#placeholder,\s*#react-root\s*\{\s*display:\s*none/i.test(html)
140
+ /#placeholder,\s*#react-root\s*\{\s*display:\s*none/i.test(html) ||
141
+ /Attention Required!\s*\|\s*Cloudflare/i.test(html) ||
142
+ /cf-error-details|cf\.errors\.css/i.test(html)
141
143
  );
142
144
  }
143
145
 
@@ -436,7 +438,15 @@ async function waitForContentReady(
436
438
  responseBodies?: Map<string, string>,
437
439
  ): Promise<void> {
438
440
  // Phase 1: Initial settle — let the page start rendering
439
- await new Promise((r) => setTimeout(r, 2000));
441
+ await new Promise((r) => setTimeout(r, 1000));
442
+
443
+ // Early exit: if interceptor already captured API responses, page is loaded enough
444
+ if (responseBodies && responseBodies.size > 0) {
445
+ log("capture", `early exit: ${responseBodies.size} API responses already captured during navigation`);
446
+ // Brief extra settle to catch any trailing responses
447
+ await new Promise((r) => setTimeout(r, 500));
448
+ return;
449
+ }
440
450
 
441
451
  // Phase 2: Cloudflare challenge detection and wait
442
452
  try {
@@ -453,7 +463,21 @@ async function waitForContentReady(
453
463
  }
454
464
 
455
465
  // Phase 3: Wait for document ready state (replaces networkidle)
456
- await waitForReadyState(tabId, 8000);
466
+ await waitForReadyState(tabId, 5000);
467
+
468
+ // Early exit: check again after readyState — SPAs often fire API calls during hydration
469
+ if (responseBodies) {
470
+ const intercepted = await collectInterceptedRequests(tabId);
471
+ for (const entry of intercepted) {
472
+ if (entry.response_body && !entry.is_js) {
473
+ responseBodies.set(entry.url, entry.response_body);
474
+ }
475
+ }
476
+ if (responseBodies.size > 0) {
477
+ log("capture", `early exit after readyState: ${responseBodies.size} API responses captured`);
478
+ return;
479
+ }
480
+ }
457
481
 
458
482
  // Phase 4: Intent-aware API wait — poll intercepted requests for matching API URLs
459
483
  if (captureUrl && responseBodies) {
@@ -464,8 +488,8 @@ async function waitForContentReady(
464
488
  if (wantedHints.length > 0) {
465
489
  log("capture", `intent-aware wait: looking for API matching one of [${wantedHints.join(", ")}] (from ${captureUrl})`);
466
490
  const intentStart = Date.now();
467
- const INTENT_MAX_WAIT = 15000;
468
- const INTENT_POLL_INTERVAL = 1500;
491
+ const INTENT_MAX_WAIT = 8000;
492
+ const INTENT_POLL_INTERVAL = 1000;
469
493
  while (Date.now() - intentStart < INTENT_MAX_WAIT) {
470
494
  await new Promise((r) => setTimeout(r, INTENT_POLL_INTERVAL));
471
495
  // Check newly intercepted requests
@@ -505,7 +529,7 @@ async function waitForContentReady(
505
529
  await new Promise((r) => setTimeout(r, 1200));
506
530
  await kuri.evaluate(tabId, "window.scrollTo(0, 0)");
507
531
  if (responseBodies.size === before) {
508
- await new Promise((r) => setTimeout(r, 2000));
532
+ await new Promise((r) => setTimeout(r, 1500));
509
533
  }
510
534
  } catch {
511
535
  // non-fatal
@@ -638,14 +662,9 @@ export async function captureSession(
638
662
  try { pageDomain = getRegistrableDomain(new URL(url).hostname); } catch { /* bad url */ }
639
663
 
640
664
  // Inject fetch/XHR interceptor BEFORE navigation to capture all response bodies
641
- // Navigate to origin first so the interceptor runs in the correct context
642
- try {
643
- const origin = new URL(url).origin;
644
- await kuri.navigate(tabId, origin);
645
- await new Promise((r) => setTimeout(r, 500));
646
- } catch { /* best-effort */ }
647
-
648
- await kuri.evaluate(tabId, INTERCEPTOR_SCRIPT);
665
+ // Navigate directly to target URL — skip origin pre-navigation to save 1-2s on heavy SPAs.
666
+ // The interceptor is re-injected after navigation anyway (page context resets on navigate).
667
+ await kuri.evaluate(tabId, INTERCEPTOR_SCRIPT).catch(() => {});
649
668
 
650
669
  // Navigate to target URL
651
670
  await kuri.navigate(tabId, url);
@@ -707,10 +726,14 @@ export async function captureSession(
707
726
  log("capture", `response body captured: ${bodyUrl.substring(0, 150)}`);
708
727
  }
709
728
 
729
+
710
730
  let final_url = url;
711
731
  let html: string | undefined;
712
732
  try {
713
- final_url = await kuri.getCurrentUrl(tabId);
733
+ const rawUrl = await kuri.getCurrentUrl(tabId);
734
+ final_url = typeof rawUrl === "string" ? rawUrl : String(rawUrl ?? url);
735
+ // Validate it's actually a URL, fall back to original if not
736
+ try { new URL(final_url); } catch { final_url = url; }
714
737
  html = await kuri.getPageHtml(tabId);
715
738
  } catch {}
716
739
 
@@ -779,6 +802,14 @@ export async function captureSession(
779
802
  responseBodyCount < 10 &&
780
803
  !hasUsefulCapturedResponses(responseBodies.keys(), url, intent)
781
804
  ) {
805
+ // On ephemeral retry, if still blocked by Cloudflare WAF, throw auth_required
806
+ // so the caller can surface a login prompt instead of retrying forever
807
+ if (options?.forceEphemeral && html && /Cloudflare|cf\.errors\.css|cf-error-details/i.test(html)) {
808
+ throw Object.assign(new Error("cloudflare_waf_block"), {
809
+ code: "auth_required",
810
+ login_url: url,
811
+ });
812
+ }
782
813
  retryFreshTab = true;
783
814
  log("capture", `rendered blocked app shell for ${url}; retrying with fresh tab`);
784
815
  } else {
@@ -807,7 +838,7 @@ export async function captureSession(
807
838
  await resetTab(tabId);
808
839
  releaseTabSlot(tabId);
809
840
  }
810
- if (retryFreshTab) {
841
+ if (retryFreshTab && !options?.forceEphemeral) {
811
842
  return captureSession(url, authHeaders, cookies, intent, { forceEphemeral: true });
812
843
  }
813
844
  if (captureError) throw captureError;
@@ -939,7 +939,7 @@ async function executeBrowserCapture(
939
939
  skill.endpoints.find((endpoint) => typeof endpoint.trigger_url === "string" && endpoint.trigger_url)?.trigger_url ||
940
940
  skill.endpoints.find((endpoint) => !/\{[^}]+\}/.test(endpoint.url_template))?.url_template ||
941
941
  "";
942
- const url = String(params.url ?? fallbackUrl);
942
+ const url = typeof params.url === "string" ? params.url : String(params.url ?? fallbackUrl);
943
943
  const intent = String(params.intent ?? skill.intent_signature);
944
944
  if (!url) throw new Error("browser-capture skill requires params.url");
945
945
 
@@ -981,7 +981,33 @@ async function executeBrowserCapture(
981
981
  usedStoredAuth,
982
982
  );
983
983
  if (documentSeed) return documentSeed;
984
- const captured = await captureSession(url, authHeaders, cookies, intent);
984
+ let captured;
985
+ try {
986
+ captured = await captureSession(url, authHeaders, cookies, intent);
987
+ } catch (captureErr: unknown) {
988
+ const err = captureErr as Error & { code?: string; login_url?: string };
989
+ if (err.code === "auth_required") {
990
+ const trace: ExecutionTrace = stampTrace({
991
+ trace_id: traceId,
992
+ skill_id: skill.skill_id,
993
+ endpoint_id: "browser-capture",
994
+ started_at: startedAt,
995
+ completed_at: new Date().toISOString(),
996
+ success: false,
997
+ error: "auth_required",
998
+ });
999
+ return {
1000
+ trace,
1001
+ result: {
1002
+ error: "auth_required",
1003
+ provider: "cloudflare",
1004
+ login_url: err.login_url ?? url,
1005
+ message: `Site is blocked by Cloudflare WAF. Run: unbrowse login --url "${url}" to authenticate interactively.`,
1006
+ },
1007
+ };
1008
+ }
1009
+ throw captureErr;
1010
+ }
985
1011
 
986
1012
  const finalDomain = (() => {
987
1013
  try { return new URL(captured.final_url).hostname; } catch { return targetDomain; }
@@ -990,7 +1016,7 @@ async function executeBrowserCapture(
990
1016
  const LOGIN_PATHS = /\/(login|signin|sign-in|sso|auth|uas\/login|checkpoint|oauth)/i;
991
1017
 
992
1018
  const redirectedToAuth = finalDomain !== targetDomain && AUTH_PROVIDERS.test(finalDomain);
993
- const redirectedToLogin = captured.final_url !== url && LOGIN_PATHS.test(new URL(captured.final_url).pathname);
1019
+ const redirectedToLogin = captured.final_url !== url && (() => { try { return LOGIN_PATHS.test(new URL(String(captured.final_url)).pathname); } catch { return false; } })();
994
1020
 
995
1021
  if (redirectedToAuth || redirectedToLogin) {
996
1022
  const trace: ExecutionTrace = stampTrace({
@@ -1278,18 +1278,38 @@ export function extractFromDOMWithHint(
1278
1278
  * the best match for the given intent.
1279
1279
  */
1280
1280
  export function extractFromDOM(html: string, intent: string): ExtractionResult {
1281
+ // Cap HTML size to prevent cheerio from hanging on massive pages
1282
+ const MAX_HTML_SIZE = 300_000;
1283
+ let workingHtml = html;
1284
+ if (workingHtml.length > MAX_HTML_SIZE) {
1285
+ // Strip attribute bloat first (class/style/data-* attributes inflate HTML 2-3x)
1286
+ workingHtml = workingHtml
1287
+ .replace(/\s+class="[^"]*"/g, "")
1288
+ .replace(/\s+style="[^"]*"/g, "")
1289
+ .replace(/\s+data-[a-z][-a-z]*="[^"]*"/g, "");
1290
+ // If still too large, truncate keeping body content
1291
+ if (workingHtml.length > MAX_HTML_SIZE) {
1292
+ const bodyStart = workingHtml.indexOf("<body");
1293
+ if (bodyStart > 0) {
1294
+ workingHtml = workingHtml.substring(0, Math.max(MAX_HTML_SIZE, bodyStart + MAX_HTML_SIZE));
1295
+ } else {
1296
+ workingHtml = workingHtml.substring(0, MAX_HTML_SIZE);
1297
+ }
1298
+ }
1299
+ }
1300
+
1281
1301
  // Extract SPA-embedded data from raw HTML BEFORE cleanDOM strips scripts
1282
- const spaStructures = extractSPAData(html);
1283
- const flashStructures = extractFlashNoticeSpecial(html, intent);
1284
- const cleaned = cleanDOM(html);
1285
- const githubStructures = extractGitHubSpecial(html, intent);
1286
- const linkedInStructures = extractLinkedInSpecial(html, intent);
1287
- const packageSearchStructures = extractPackageSearchSpecial(html, intent);
1288
- const xProfileStructures = extractXProfileSpecial(html, intent);
1289
- const postStructures = extractPostSpecial(html, intent);
1290
- const trendStructures = extractTrendSpecial(html, intent);
1291
- const definitionStructures = extractDefinitionSpecial(html, intent);
1292
- const courseStructures = extractCourseSearchSpecial(html, intent);
1302
+ const spaStructures = extractSPAData(workingHtml);
1303
+ const flashStructures = extractFlashNoticeSpecial(workingHtml, intent);
1304
+ const cleaned = cleanDOM(workingHtml);
1305
+ const githubStructures = extractGitHubSpecial(workingHtml, intent);
1306
+ const linkedInStructures = extractLinkedInSpecial(workingHtml, intent);
1307
+ const packageSearchStructures = extractPackageSearchSpecial(workingHtml, intent);
1308
+ const xProfileStructures = extractXProfileSpecial(workingHtml, intent);
1309
+ const postStructures = extractPostSpecial(workingHtml, intent);
1310
+ const trendStructures = extractTrendSpecial(workingHtml, intent);
1311
+ const definitionStructures = extractDefinitionSpecial(workingHtml, intent);
1312
+ const courseStructures = extractCourseSearchSpecial(workingHtml, intent);
1293
1313
  const structures = [...flashStructures, ...githubStructures, ...linkedInStructures, ...packageSearchStructures, ...xProfileStructures, ...postStructures, ...trendStructures, ...definitionStructures, ...courseStructures, ...spaStructures, ...parseStructured(cleaned)]
1294
1314
  .map((structure) => normalizeStructureForIntent(structure, intent));
1295
1315
 
@@ -1306,7 +1326,17 @@ export function extractFromDOM(html: string, intent: string): ExtractionResult {
1306
1326
 
1307
1327
  scored.sort((a, b) => b.score - a.score);
1308
1328
 
1309
- const bestPassing = scored.find((candidate) => assessIntentResult(candidate.structure.data, intent).verdict === "pass");
1329
+ const passing = scored.filter((candidate) => assessIntentResult(candidate.structure.data, intent).verdict === "pass");
1330
+ const bestPassing = (() => {
1331
+ if (passing.length === 0) return undefined;
1332
+ const bestPassingOverall = passing[0];
1333
+ const bestPassingSpa = passing.find((candidate) => candidate.structure.type.startsWith("spa-"));
1334
+ // Prefer cleaner SPA payloads when they're effectively tied with DOM-derived candidates.
1335
+ if (bestPassingSpa && bestPassingOverall && bestPassingSpa.score >= bestPassingOverall.score - 2) {
1336
+ return bestPassingSpa;
1337
+ }
1338
+ return bestPassingOverall;
1339
+ })();
1310
1340
  if (bestPassing) {
1311
1341
  return {
1312
1342
  data: bestPassing.structure.data,
@@ -1325,7 +1355,17 @@ export function extractFromDOM(html: string, intent: string): ExtractionResult {
1325
1355
  selector: best.structure.selector,
1326
1356
  };
1327
1357
  }
1328
- const hasClearWinner = scored.length === 1 || best.score > scored[1].score * 1.5;
1358
+
1359
+ if (scored.length === 1) {
1360
+ return {
1361
+ data: best.structure.data,
1362
+ extraction_method: best.structure.type,
1363
+ confidence: computeConfidence(best.structure, best.score),
1364
+ selector: best.structure.selector,
1365
+ };
1366
+ }
1367
+
1368
+ const hasClearWinner = best.score > scored[1].score * 1.5;
1329
1369
 
1330
1370
  if (hasClearWinner && best.score > 0) {
1331
1371
  return {
@@ -339,12 +339,16 @@ export async function getDefaultTab(): Promise<string> {
339
339
  throw new Error("No tabs available and failed to create one");
340
340
  }
341
341
 
342
+ /** Trigger Kuri's /discover to sync Chrome tabs into Kuri's registry. */
342
343
  /** Trigger Kuri's /discover to sync Chrome tabs into Kuri's registry. */
343
344
  async function ensureTabsDiscovered(): Promise<void> {
344
345
  try {
345
- await kuriGet("/discover");
346
+ // Pass CDP URL as query param so /discover works even if Kuri was started without CDP_URL env
347
+ const params: Record<string, string> = {};
348
+ if (kuriCdpPort) params.cdp_url = `ws://127.0.0.1:${kuriCdpPort}`;
349
+ await kuriGet("/discover", params);
346
350
  } catch {
347
- // /discover may fail if CDP_URL not set — that's handled by start()
351
+ // /discover may fail if no Chrome running — that's OK
348
352
  }
349
353
  }
350
354
 
@@ -353,12 +357,35 @@ export async function navigate(tabId: string, url: string): Promise<void> {
353
357
  await kuriGet("/navigate", { tab_id: tabId, url });
354
358
  }
355
359
 
360
+ /** Evaluate JavaScript in tab context. */
361
+ /** Evaluate JavaScript in tab context. */
362
+ /** Evaluate JavaScript in tab context. */
356
363
  /** Evaluate JavaScript in tab context. */
357
364
  export async function evaluate(tabId: string, expression: string): Promise<unknown> {
358
- const raw = (await kuriGet("/evaluate", { tab_id: tabId, expression })) as {
365
+ let raw: {
359
366
  id?: number;
360
367
  result?: { result?: { type?: string; value?: unknown; description?: string }; exceptionDetails?: unknown };
361
368
  };
369
+ if (expression.length > 2000) {
370
+ // Use POST with raw text body for large expressions to avoid URL length limits
371
+ const url = kuriUrl("/evaluate", { tab_id: tabId });
372
+ const controller = new AbortController();
373
+ const timeout = setTimeout(() => controller.abort(), KURI_REQUEST_TIMEOUT_MS);
374
+ try {
375
+ const res = await fetch(url, {
376
+ method: "POST",
377
+ headers: { "Content-Type": "text/plain" },
378
+ body: expression,
379
+ signal: controller.signal,
380
+ });
381
+ const text = await res.text();
382
+ try { raw = JSON.parse(text); } catch { raw = text as never; }
383
+ } finally {
384
+ clearTimeout(timeout);
385
+ }
386
+ } else {
387
+ raw = (await kuriGet("/evaluate", { tab_id: tabId, expression })) as typeof raw;
388
+ }
362
389
  // CDP Runtime.evaluate response: { id, result: { result: { type, value } } }
363
390
  const inner = raw?.result?.result;
364
391
  if (!inner) return raw;
@@ -483,7 +510,10 @@ export async function hasCloudflareChallenge(tabId: string): Promise<boolean> {
483
510
  var html = document.documentElement.innerHTML;
484
511
  return html.indexOf('challenge-platform') !== -1 ||
485
512
  html.indexOf('cf_chl_opt') !== -1 ||
513
+ html.indexOf('cf-error-details') !== -1 ||
514
+ html.indexOf('cf.errors.css') !== -1 ||
486
515
  document.title === 'Just a moment...' ||
516
+ /Attention Required.*Cloudflare/.test(document.title) ||
487
517
  !!document.querySelector('#challenge-running, #challenge-form, .cf-browser-verification');
488
518
  })()`);
489
519
  return result === true;
Binary file
Binary file
Binary file
Binary file