unbrowse 2.0.1 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/runtime-src/capture/index.ts +47 -16
- package/runtime-src/execution/index.ts +29 -3
- package/runtime-src/extraction/index.ts +53 -13
- package/runtime-src/kuri/client.ts +33 -3
- package/runtime-src/kuri/vendor/kuri/darwin-arm64/kuri +0 -0
- package/vendor/kuri/darwin-arm64/kuri +0 -0
- package/vendor/kuri/darwin-x64/kuri +0 -0
- package/vendor/kuri/linux-arm64/kuri +0 -0
- package/vendor/kuri/linux-x64/kuri +0 -0
package/package.json
CHANGED
|
@@ -137,7 +137,9 @@ export function isBlockedAppShell(html?: string): boolean {
|
|
|
137
137
|
/switch to a supported browser/i.test(html) ||
|
|
138
138
|
/Something went wrong, but don.?t fret/i.test(html) ||
|
|
139
139
|
/class=["']errorContainer["']/i.test(html) ||
|
|
140
|
-
/#placeholder,\s*#react-root\s*\{\s*display:\s*none/i.test(html)
|
|
140
|
+
/#placeholder,\s*#react-root\s*\{\s*display:\s*none/i.test(html) ||
|
|
141
|
+
/Attention Required!\s*\|\s*Cloudflare/i.test(html) ||
|
|
142
|
+
/cf-error-details|cf\.errors\.css/i.test(html)
|
|
141
143
|
);
|
|
142
144
|
}
|
|
143
145
|
|
|
@@ -436,7 +438,15 @@ async function waitForContentReady(
|
|
|
436
438
|
responseBodies?: Map<string, string>,
|
|
437
439
|
): Promise<void> {
|
|
438
440
|
// Phase 1: Initial settle — let the page start rendering
|
|
439
|
-
await new Promise((r) => setTimeout(r,
|
|
441
|
+
await new Promise((r) => setTimeout(r, 1000));
|
|
442
|
+
|
|
443
|
+
// Early exit: if interceptor already captured API responses, page is loaded enough
|
|
444
|
+
if (responseBodies && responseBodies.size > 0) {
|
|
445
|
+
log("capture", `early exit: ${responseBodies.size} API responses already captured during navigation`);
|
|
446
|
+
// Brief extra settle to catch any trailing responses
|
|
447
|
+
await new Promise((r) => setTimeout(r, 500));
|
|
448
|
+
return;
|
|
449
|
+
}
|
|
440
450
|
|
|
441
451
|
// Phase 2: Cloudflare challenge detection and wait
|
|
442
452
|
try {
|
|
@@ -453,7 +463,21 @@ async function waitForContentReady(
|
|
|
453
463
|
}
|
|
454
464
|
|
|
455
465
|
// Phase 3: Wait for document ready state (replaces networkidle)
|
|
456
|
-
await waitForReadyState(tabId,
|
|
466
|
+
await waitForReadyState(tabId, 5000);
|
|
467
|
+
|
|
468
|
+
// Early exit: check again after readyState — SPAs often fire API calls during hydration
|
|
469
|
+
if (responseBodies) {
|
|
470
|
+
const intercepted = await collectInterceptedRequests(tabId);
|
|
471
|
+
for (const entry of intercepted) {
|
|
472
|
+
if (entry.response_body && !entry.is_js) {
|
|
473
|
+
responseBodies.set(entry.url, entry.response_body);
|
|
474
|
+
}
|
|
475
|
+
}
|
|
476
|
+
if (responseBodies.size > 0) {
|
|
477
|
+
log("capture", `early exit after readyState: ${responseBodies.size} API responses captured`);
|
|
478
|
+
return;
|
|
479
|
+
}
|
|
480
|
+
}
|
|
457
481
|
|
|
458
482
|
// Phase 4: Intent-aware API wait — poll intercepted requests for matching API URLs
|
|
459
483
|
if (captureUrl && responseBodies) {
|
|
@@ -464,8 +488,8 @@ async function waitForContentReady(
|
|
|
464
488
|
if (wantedHints.length > 0) {
|
|
465
489
|
log("capture", `intent-aware wait: looking for API matching one of [${wantedHints.join(", ")}] (from ${captureUrl})`);
|
|
466
490
|
const intentStart = Date.now();
|
|
467
|
-
const INTENT_MAX_WAIT =
|
|
468
|
-
const INTENT_POLL_INTERVAL =
|
|
491
|
+
const INTENT_MAX_WAIT = 8000;
|
|
492
|
+
const INTENT_POLL_INTERVAL = 1000;
|
|
469
493
|
while (Date.now() - intentStart < INTENT_MAX_WAIT) {
|
|
470
494
|
await new Promise((r) => setTimeout(r, INTENT_POLL_INTERVAL));
|
|
471
495
|
// Check newly intercepted requests
|
|
@@ -505,7 +529,7 @@ async function waitForContentReady(
|
|
|
505
529
|
await new Promise((r) => setTimeout(r, 1200));
|
|
506
530
|
await kuri.evaluate(tabId, "window.scrollTo(0, 0)");
|
|
507
531
|
if (responseBodies.size === before) {
|
|
508
|
-
await new Promise((r) => setTimeout(r,
|
|
532
|
+
await new Promise((r) => setTimeout(r, 1500));
|
|
509
533
|
}
|
|
510
534
|
} catch {
|
|
511
535
|
// non-fatal
|
|
@@ -638,14 +662,9 @@ export async function captureSession(
|
|
|
638
662
|
try { pageDomain = getRegistrableDomain(new URL(url).hostname); } catch { /* bad url */ }
|
|
639
663
|
|
|
640
664
|
// Inject fetch/XHR interceptor BEFORE navigation to capture all response bodies
|
|
641
|
-
// Navigate to
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
await kuri.navigate(tabId, origin);
|
|
645
|
-
await new Promise((r) => setTimeout(r, 500));
|
|
646
|
-
} catch { /* best-effort */ }
|
|
647
|
-
|
|
648
|
-
await kuri.evaluate(tabId, INTERCEPTOR_SCRIPT);
|
|
665
|
+
// Navigate directly to target URL — skip origin pre-navigation to save 1-2s on heavy SPAs.
|
|
666
|
+
// The interceptor is re-injected after navigation anyway (page context resets on navigate).
|
|
667
|
+
await kuri.evaluate(tabId, INTERCEPTOR_SCRIPT).catch(() => {});
|
|
649
668
|
|
|
650
669
|
// Navigate to target URL
|
|
651
670
|
await kuri.navigate(tabId, url);
|
|
@@ -707,10 +726,14 @@ export async function captureSession(
|
|
|
707
726
|
log("capture", `response body captured: ${bodyUrl.substring(0, 150)}`);
|
|
708
727
|
}
|
|
709
728
|
|
|
729
|
+
|
|
710
730
|
let final_url = url;
|
|
711
731
|
let html: string | undefined;
|
|
712
732
|
try {
|
|
713
|
-
|
|
733
|
+
const rawUrl = await kuri.getCurrentUrl(tabId);
|
|
734
|
+
final_url = typeof rawUrl === "string" ? rawUrl : String(rawUrl ?? url);
|
|
735
|
+
// Validate it's actually a URL, fall back to original if not
|
|
736
|
+
try { new URL(final_url); } catch { final_url = url; }
|
|
714
737
|
html = await kuri.getPageHtml(tabId);
|
|
715
738
|
} catch {}
|
|
716
739
|
|
|
@@ -779,6 +802,14 @@ export async function captureSession(
|
|
|
779
802
|
responseBodyCount < 10 &&
|
|
780
803
|
!hasUsefulCapturedResponses(responseBodies.keys(), url, intent)
|
|
781
804
|
) {
|
|
805
|
+
// On ephemeral retry, if still blocked by Cloudflare WAF, throw auth_required
|
|
806
|
+
// so the caller can surface a login prompt instead of retrying forever
|
|
807
|
+
if (options?.forceEphemeral && html && /Cloudflare|cf\.errors\.css|cf-error-details/i.test(html)) {
|
|
808
|
+
throw Object.assign(new Error("cloudflare_waf_block"), {
|
|
809
|
+
code: "auth_required",
|
|
810
|
+
login_url: url,
|
|
811
|
+
});
|
|
812
|
+
}
|
|
782
813
|
retryFreshTab = true;
|
|
783
814
|
log("capture", `rendered blocked app shell for ${url}; retrying with fresh tab`);
|
|
784
815
|
} else {
|
|
@@ -807,7 +838,7 @@ export async function captureSession(
|
|
|
807
838
|
await resetTab(tabId);
|
|
808
839
|
releaseTabSlot(tabId);
|
|
809
840
|
}
|
|
810
|
-
if (retryFreshTab) {
|
|
841
|
+
if (retryFreshTab && !options?.forceEphemeral) {
|
|
811
842
|
return captureSession(url, authHeaders, cookies, intent, { forceEphemeral: true });
|
|
812
843
|
}
|
|
813
844
|
if (captureError) throw captureError;
|
|
@@ -939,7 +939,7 @@ async function executeBrowserCapture(
|
|
|
939
939
|
skill.endpoints.find((endpoint) => typeof endpoint.trigger_url === "string" && endpoint.trigger_url)?.trigger_url ||
|
|
940
940
|
skill.endpoints.find((endpoint) => !/\{[^}]+\}/.test(endpoint.url_template))?.url_template ||
|
|
941
941
|
"";
|
|
942
|
-
const url = String(params.url ?? fallbackUrl);
|
|
942
|
+
const url = typeof params.url === "string" ? params.url : String(params.url ?? fallbackUrl);
|
|
943
943
|
const intent = String(params.intent ?? skill.intent_signature);
|
|
944
944
|
if (!url) throw new Error("browser-capture skill requires params.url");
|
|
945
945
|
|
|
@@ -981,7 +981,33 @@ async function executeBrowserCapture(
|
|
|
981
981
|
usedStoredAuth,
|
|
982
982
|
);
|
|
983
983
|
if (documentSeed) return documentSeed;
|
|
984
|
-
|
|
984
|
+
let captured;
|
|
985
|
+
try {
|
|
986
|
+
captured = await captureSession(url, authHeaders, cookies, intent);
|
|
987
|
+
} catch (captureErr: unknown) {
|
|
988
|
+
const err = captureErr as Error & { code?: string; login_url?: string };
|
|
989
|
+
if (err.code === "auth_required") {
|
|
990
|
+
const trace: ExecutionTrace = stampTrace({
|
|
991
|
+
trace_id: traceId,
|
|
992
|
+
skill_id: skill.skill_id,
|
|
993
|
+
endpoint_id: "browser-capture",
|
|
994
|
+
started_at: startedAt,
|
|
995
|
+
completed_at: new Date().toISOString(),
|
|
996
|
+
success: false,
|
|
997
|
+
error: "auth_required",
|
|
998
|
+
});
|
|
999
|
+
return {
|
|
1000
|
+
trace,
|
|
1001
|
+
result: {
|
|
1002
|
+
error: "auth_required",
|
|
1003
|
+
provider: "cloudflare",
|
|
1004
|
+
login_url: err.login_url ?? url,
|
|
1005
|
+
message: `Site is blocked by Cloudflare WAF. Run: unbrowse login --url "${url}" to authenticate interactively.`,
|
|
1006
|
+
},
|
|
1007
|
+
};
|
|
1008
|
+
}
|
|
1009
|
+
throw captureErr;
|
|
1010
|
+
}
|
|
985
1011
|
|
|
986
1012
|
const finalDomain = (() => {
|
|
987
1013
|
try { return new URL(captured.final_url).hostname; } catch { return targetDomain; }
|
|
@@ -990,7 +1016,7 @@ async function executeBrowserCapture(
|
|
|
990
1016
|
const LOGIN_PATHS = /\/(login|signin|sign-in|sso|auth|uas\/login|checkpoint|oauth)/i;
|
|
991
1017
|
|
|
992
1018
|
const redirectedToAuth = finalDomain !== targetDomain && AUTH_PROVIDERS.test(finalDomain);
|
|
993
|
-
const redirectedToLogin = captured.final_url !== url && LOGIN_PATHS.test(new URL(captured.final_url).pathname);
|
|
1019
|
+
const redirectedToLogin = captured.final_url !== url && (() => { try { return LOGIN_PATHS.test(new URL(String(captured.final_url)).pathname); } catch { return false; } })();
|
|
994
1020
|
|
|
995
1021
|
if (redirectedToAuth || redirectedToLogin) {
|
|
996
1022
|
const trace: ExecutionTrace = stampTrace({
|
|
@@ -1278,18 +1278,38 @@ export function extractFromDOMWithHint(
|
|
|
1278
1278
|
* the best match for the given intent.
|
|
1279
1279
|
*/
|
|
1280
1280
|
export function extractFromDOM(html: string, intent: string): ExtractionResult {
|
|
1281
|
+
// Cap HTML size to prevent cheerio from hanging on massive pages
|
|
1282
|
+
const MAX_HTML_SIZE = 300_000;
|
|
1283
|
+
let workingHtml = html;
|
|
1284
|
+
if (workingHtml.length > MAX_HTML_SIZE) {
|
|
1285
|
+
// Strip attribute bloat first (class/style/data-* attributes inflate HTML 2-3x)
|
|
1286
|
+
workingHtml = workingHtml
|
|
1287
|
+
.replace(/\s+class="[^"]*"/g, "")
|
|
1288
|
+
.replace(/\s+style="[^"]*"/g, "")
|
|
1289
|
+
.replace(/\s+data-[a-z][-a-z]*="[^"]*"/g, "");
|
|
1290
|
+
// If still too large, truncate keeping body content
|
|
1291
|
+
if (workingHtml.length > MAX_HTML_SIZE) {
|
|
1292
|
+
const bodyStart = workingHtml.indexOf("<body");
|
|
1293
|
+
if (bodyStart > 0) {
|
|
1294
|
+
workingHtml = workingHtml.substring(0, Math.max(MAX_HTML_SIZE, bodyStart + MAX_HTML_SIZE));
|
|
1295
|
+
} else {
|
|
1296
|
+
workingHtml = workingHtml.substring(0, MAX_HTML_SIZE);
|
|
1297
|
+
}
|
|
1298
|
+
}
|
|
1299
|
+
}
|
|
1300
|
+
|
|
1281
1301
|
// Extract SPA-embedded data from raw HTML BEFORE cleanDOM strips scripts
|
|
1282
|
-
const spaStructures = extractSPAData(
|
|
1283
|
-
const flashStructures = extractFlashNoticeSpecial(
|
|
1284
|
-
const cleaned = cleanDOM(
|
|
1285
|
-
const githubStructures = extractGitHubSpecial(
|
|
1286
|
-
const linkedInStructures = extractLinkedInSpecial(
|
|
1287
|
-
const packageSearchStructures = extractPackageSearchSpecial(
|
|
1288
|
-
const xProfileStructures = extractXProfileSpecial(
|
|
1289
|
-
const postStructures = extractPostSpecial(
|
|
1290
|
-
const trendStructures = extractTrendSpecial(
|
|
1291
|
-
const definitionStructures = extractDefinitionSpecial(
|
|
1292
|
-
const courseStructures = extractCourseSearchSpecial(
|
|
1302
|
+
const spaStructures = extractSPAData(workingHtml);
|
|
1303
|
+
const flashStructures = extractFlashNoticeSpecial(workingHtml, intent);
|
|
1304
|
+
const cleaned = cleanDOM(workingHtml);
|
|
1305
|
+
const githubStructures = extractGitHubSpecial(workingHtml, intent);
|
|
1306
|
+
const linkedInStructures = extractLinkedInSpecial(workingHtml, intent);
|
|
1307
|
+
const packageSearchStructures = extractPackageSearchSpecial(workingHtml, intent);
|
|
1308
|
+
const xProfileStructures = extractXProfileSpecial(workingHtml, intent);
|
|
1309
|
+
const postStructures = extractPostSpecial(workingHtml, intent);
|
|
1310
|
+
const trendStructures = extractTrendSpecial(workingHtml, intent);
|
|
1311
|
+
const definitionStructures = extractDefinitionSpecial(workingHtml, intent);
|
|
1312
|
+
const courseStructures = extractCourseSearchSpecial(workingHtml, intent);
|
|
1293
1313
|
const structures = [...flashStructures, ...githubStructures, ...linkedInStructures, ...packageSearchStructures, ...xProfileStructures, ...postStructures, ...trendStructures, ...definitionStructures, ...courseStructures, ...spaStructures, ...parseStructured(cleaned)]
|
|
1294
1314
|
.map((structure) => normalizeStructureForIntent(structure, intent));
|
|
1295
1315
|
|
|
@@ -1306,7 +1326,17 @@ export function extractFromDOM(html: string, intent: string): ExtractionResult {
|
|
|
1306
1326
|
|
|
1307
1327
|
scored.sort((a, b) => b.score - a.score);
|
|
1308
1328
|
|
|
1309
|
-
const
|
|
1329
|
+
const passing = scored.filter((candidate) => assessIntentResult(candidate.structure.data, intent).verdict === "pass");
|
|
1330
|
+
const bestPassing = (() => {
|
|
1331
|
+
if (passing.length === 0) return undefined;
|
|
1332
|
+
const bestPassingOverall = passing[0];
|
|
1333
|
+
const bestPassingSpa = passing.find((candidate) => candidate.structure.type.startsWith("spa-"));
|
|
1334
|
+
// Prefer cleaner SPA payloads when they're effectively tied with DOM-derived candidates.
|
|
1335
|
+
if (bestPassingSpa && bestPassingOverall && bestPassingSpa.score >= bestPassingOverall.score - 2) {
|
|
1336
|
+
return bestPassingSpa;
|
|
1337
|
+
}
|
|
1338
|
+
return bestPassingOverall;
|
|
1339
|
+
})();
|
|
1310
1340
|
if (bestPassing) {
|
|
1311
1341
|
return {
|
|
1312
1342
|
data: bestPassing.structure.data,
|
|
@@ -1325,7 +1355,17 @@ export function extractFromDOM(html: string, intent: string): ExtractionResult {
|
|
|
1325
1355
|
selector: best.structure.selector,
|
|
1326
1356
|
};
|
|
1327
1357
|
}
|
|
1328
|
-
|
|
1358
|
+
|
|
1359
|
+
if (scored.length === 1) {
|
|
1360
|
+
return {
|
|
1361
|
+
data: best.structure.data,
|
|
1362
|
+
extraction_method: best.structure.type,
|
|
1363
|
+
confidence: computeConfidence(best.structure, best.score),
|
|
1364
|
+
selector: best.structure.selector,
|
|
1365
|
+
};
|
|
1366
|
+
}
|
|
1367
|
+
|
|
1368
|
+
const hasClearWinner = best.score > scored[1].score * 1.5;
|
|
1329
1369
|
|
|
1330
1370
|
if (hasClearWinner && best.score > 0) {
|
|
1331
1371
|
return {
|
|
@@ -339,12 +339,16 @@ export async function getDefaultTab(): Promise<string> {
|
|
|
339
339
|
throw new Error("No tabs available and failed to create one");
|
|
340
340
|
}
|
|
341
341
|
|
|
342
|
+
/** Trigger Kuri's /discover to sync Chrome tabs into Kuri's registry. */
|
|
342
343
|
/** Trigger Kuri's /discover to sync Chrome tabs into Kuri's registry. */
|
|
343
344
|
async function ensureTabsDiscovered(): Promise<void> {
|
|
344
345
|
try {
|
|
345
|
-
|
|
346
|
+
// Pass CDP URL as query param so /discover works even if Kuri was started without CDP_URL env
|
|
347
|
+
const params: Record<string, string> = {};
|
|
348
|
+
if (kuriCdpPort) params.cdp_url = `ws://127.0.0.1:${kuriCdpPort}`;
|
|
349
|
+
await kuriGet("/discover", params);
|
|
346
350
|
} catch {
|
|
347
|
-
// /discover may fail if
|
|
351
|
+
// /discover may fail if no Chrome running — that's OK
|
|
348
352
|
}
|
|
349
353
|
}
|
|
350
354
|
|
|
@@ -353,12 +357,35 @@ export async function navigate(tabId: string, url: string): Promise<void> {
|
|
|
353
357
|
await kuriGet("/navigate", { tab_id: tabId, url });
|
|
354
358
|
}
|
|
355
359
|
|
|
360
|
+
/** Evaluate JavaScript in tab context. */
|
|
361
|
+
/** Evaluate JavaScript in tab context. */
|
|
362
|
+
/** Evaluate JavaScript in tab context. */
|
|
356
363
|
/** Evaluate JavaScript in tab context. */
|
|
357
364
|
export async function evaluate(tabId: string, expression: string): Promise<unknown> {
|
|
358
|
-
|
|
365
|
+
let raw: {
|
|
359
366
|
id?: number;
|
|
360
367
|
result?: { result?: { type?: string; value?: unknown; description?: string }; exceptionDetails?: unknown };
|
|
361
368
|
};
|
|
369
|
+
if (expression.length > 2000) {
|
|
370
|
+
// Use POST with raw text body for large expressions to avoid URL length limits
|
|
371
|
+
const url = kuriUrl("/evaluate", { tab_id: tabId });
|
|
372
|
+
const controller = new AbortController();
|
|
373
|
+
const timeout = setTimeout(() => controller.abort(), KURI_REQUEST_TIMEOUT_MS);
|
|
374
|
+
try {
|
|
375
|
+
const res = await fetch(url, {
|
|
376
|
+
method: "POST",
|
|
377
|
+
headers: { "Content-Type": "text/plain" },
|
|
378
|
+
body: expression,
|
|
379
|
+
signal: controller.signal,
|
|
380
|
+
});
|
|
381
|
+
const text = await res.text();
|
|
382
|
+
try { raw = JSON.parse(text); } catch { raw = text as never; }
|
|
383
|
+
} finally {
|
|
384
|
+
clearTimeout(timeout);
|
|
385
|
+
}
|
|
386
|
+
} else {
|
|
387
|
+
raw = (await kuriGet("/evaluate", { tab_id: tabId, expression })) as typeof raw;
|
|
388
|
+
}
|
|
362
389
|
// CDP Runtime.evaluate response: { id, result: { result: { type, value } } }
|
|
363
390
|
const inner = raw?.result?.result;
|
|
364
391
|
if (!inner) return raw;
|
|
@@ -483,7 +510,10 @@ export async function hasCloudflareChallenge(tabId: string): Promise<boolean> {
|
|
|
483
510
|
var html = document.documentElement.innerHTML;
|
|
484
511
|
return html.indexOf('challenge-platform') !== -1 ||
|
|
485
512
|
html.indexOf('cf_chl_opt') !== -1 ||
|
|
513
|
+
html.indexOf('cf-error-details') !== -1 ||
|
|
514
|
+
html.indexOf('cf.errors.css') !== -1 ||
|
|
486
515
|
document.title === 'Just a moment...' ||
|
|
516
|
+
/Attention Required.*Cloudflare/.test(document.title) ||
|
|
487
517
|
!!document.querySelector('#challenge-running, #challenge-form, .cf-browser-verification');
|
|
488
518
|
})()`);
|
|
489
519
|
return result === true;
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|