unbrowse 2.1.1 → 2.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +132 -24
- package/package.json +1 -1
- package/runtime-src/capture/index.ts +2 -2
- package/runtime-src/execution/index.ts +65 -2
- package/runtime-src/orchestrator/index.ts +105 -27
package/dist/index.js
CHANGED
|
@@ -2251,7 +2251,7 @@ function shouldRetryEphemeralProfileError(error) {
|
|
|
2251
2251
|
}
|
|
2252
2252
|
function shouldRestartKuriForError(error) {
|
|
2253
2253
|
const message = error instanceof Error ? error.message : String(error ?? "");
|
|
2254
|
-
return /CDP command failed|target closed|session closed|No target with given id/i.test(message);
|
|
2254
|
+
return /CDP command failed|target closed|session closed|No target with given id|No tabs available and failed to create one/i.test(message);
|
|
2255
2255
|
}
|
|
2256
2256
|
function extractRouteHint(url) {
|
|
2257
2257
|
try {
|
|
@@ -11204,11 +11204,26 @@ function shouldFallbackToBrowserReplay(data, endpoint, intent, contextUrl) {
|
|
|
11204
11204
|
return false;
|
|
11205
11205
|
if (endpoint.dom_extraction && typeof data === "string" && isHtml(data))
|
|
11206
11206
|
return false;
|
|
11207
|
-
if (typeof data === "string")
|
|
11207
|
+
if (typeof data === "string") {
|
|
11208
|
+
if (isHtml(data) && looksLikeSearchAuthOrHomepageBounceHtml(data))
|
|
11209
|
+
return false;
|
|
11208
11210
|
return isHtml(data) || isSpaShell(data);
|
|
11211
|
+
}
|
|
11209
11212
|
const assessment = assessIntentResult(data, intent);
|
|
11210
11213
|
return assessment.verdict === "fail";
|
|
11211
11214
|
}
|
|
11215
|
+
function looksLikeSearchAuthOrHomepageBounceHtml(html, finalUrl) {
|
|
11216
|
+
if (!isHtml(html))
|
|
11217
|
+
return false;
|
|
11218
|
+
const lower = html.toLowerCase();
|
|
11219
|
+
const titleMatch = lower.match(/<title[^>]*>([^<]+)</i);
|
|
11220
|
+
const title = titleMatch?.[1]?.trim() ?? "";
|
|
11221
|
+
const final = finalUrl?.toLowerCase() ?? "";
|
|
11222
|
+
const combined = `${title} ${lower}`;
|
|
11223
|
+
const hasLawnetBounceMarkers = /about lawnet legal research/.test(combined) || /what is lawnet/.test(combined) || /forgot password/.test(combined) || /lawnet legal research, a service of/.test(combined) || /\/lawnet\/web\/lawnet\/about-lawnet\b/.test(combined) || /\/lawnet\/web\/lawnet\/home\b/.test(final);
|
|
11224
|
+
const hasGenericAuthMarkers = /\b(login|log in|sign in|forgot password)\b/.test(combined) && /\b(search|legal research|lawnet)\b/.test(combined);
|
|
11225
|
+
return hasLawnetBounceMarkers || hasGenericAuthMarkers;
|
|
11226
|
+
}
|
|
11212
11227
|
function buildSampleRequestFromUrl(url) {
|
|
11213
11228
|
try {
|
|
11214
11229
|
return Object.fromEntries(sanitizeNavigationQueryParams(new URL(url)).searchParams.entries());
|
|
@@ -12340,8 +12355,10 @@ async function tryHttpFetch(url, authHeaders, cookies) {
|
|
|
12340
12355
|
}
|
|
12341
12356
|
async function executeDomExtractionEndpoint(endpoint, url, intent, authHeaders, cookies) {
|
|
12342
12357
|
const extractionIntent = deriveDomExecutionIntent(endpoint, intent);
|
|
12358
|
+
const isCapturedPageArtifact = /captured page artifact/i.test(endpoint.description ?? "");
|
|
12343
12359
|
const ssrResult = await tryHttpFetch(url, authHeaders, cookies);
|
|
12344
12360
|
if (ssrResult) {
|
|
12361
|
+
const looksLikeBounce = looksLikeSearchAuthOrHomepageBounceHtml(ssrResult.html, ssrResult.final_url);
|
|
12345
12362
|
const ssrExtracted = extractFromDOMWithHint(ssrResult.html, extractionIntent, endpoint.dom_extraction);
|
|
12346
12363
|
if (ssrExtracted.data) {
|
|
12347
12364
|
const ssrQuality = validateExtractionQuality(ssrExtracted.data, ssrExtracted.confidence, extractionIntent);
|
|
@@ -12373,6 +12390,41 @@ async function executeDomExtractionEndpoint(endpoint, url, intent, authHeaders,
|
|
|
12373
12390
|
};
|
|
12374
12391
|
}
|
|
12375
12392
|
}
|
|
12393
|
+
if (isCapturedPageArtifact) {
|
|
12394
|
+
return {
|
|
12395
|
+
data: {
|
|
12396
|
+
error: "low_quality_dom_extraction",
|
|
12397
|
+
message: `Structured DOM extraction was rejected: ${looksLikeBounce ? "search_auth_or_homepage_bounce" : "captured_page_artifact_miss"}`
|
|
12398
|
+
},
|
|
12399
|
+
status: 422,
|
|
12400
|
+
trace_id: nanoid5(),
|
|
12401
|
+
network_events: [toTraceNetworkEvent({
|
|
12402
|
+
url: ssrResult.final_url,
|
|
12403
|
+
method: "GET",
|
|
12404
|
+
requestHeaders: authHeaders,
|
|
12405
|
+
responseStatus: 200,
|
|
12406
|
+
responseHeaders: { "content-type": "text/html" },
|
|
12407
|
+
responseBody: ssrResult.html
|
|
12408
|
+
})]
|
|
12409
|
+
};
|
|
12410
|
+
}
|
|
12411
|
+
} else if (isCapturedPageArtifact && looksLikeBounce) {
|
|
12412
|
+
return {
|
|
12413
|
+
data: {
|
|
12414
|
+
error: "low_quality_dom_extraction",
|
|
12415
|
+
message: "Structured DOM extraction was rejected: search_auth_or_homepage_bounce"
|
|
12416
|
+
},
|
|
12417
|
+
status: 422,
|
|
12418
|
+
trace_id: nanoid5(),
|
|
12419
|
+
network_events: [toTraceNetworkEvent({
|
|
12420
|
+
url: ssrResult.final_url,
|
|
12421
|
+
method: "GET",
|
|
12422
|
+
requestHeaders: authHeaders,
|
|
12423
|
+
responseStatus: 200,
|
|
12424
|
+
responseHeaders: { "content-type": "text/html" },
|
|
12425
|
+
responseBody: ssrResult.html
|
|
12426
|
+
})]
|
|
12427
|
+
};
|
|
12376
12428
|
}
|
|
12377
12429
|
console.log(`[ssr-fast] miss, falling back to browser`);
|
|
12378
12430
|
} else {
|
|
@@ -14125,6 +14177,24 @@ function promoteResultSnapshot(cacheKey, skill, endpointId, result, trace, respo
|
|
|
14125
14177
|
expires: Date.now() + ROUTE_CACHE_TTL
|
|
14126
14178
|
});
|
|
14127
14179
|
}
|
|
14180
|
+
function invalidateResolveCacheEntries(cacheKeys, domainKeys = []) {
|
|
14181
|
+
let routeCacheDirty = false;
|
|
14182
|
+
let domainCacheDirty = false;
|
|
14183
|
+
for (const cacheKey of new Set(cacheKeys.filter(Boolean))) {
|
|
14184
|
+
routeResultCache.delete(cacheKey);
|
|
14185
|
+
capturedDomainCache.delete(cacheKey);
|
|
14186
|
+
if (skillRouteCache.delete(cacheKey))
|
|
14187
|
+
routeCacheDirty = true;
|
|
14188
|
+
}
|
|
14189
|
+
for (const domainKey of new Set(domainKeys.filter(Boolean))) {
|
|
14190
|
+
if (domainSkillCache.delete(domainKey))
|
|
14191
|
+
domainCacheDirty = true;
|
|
14192
|
+
}
|
|
14193
|
+
if (routeCacheDirty)
|
|
14194
|
+
persistRouteCache();
|
|
14195
|
+
if (domainCacheDirty)
|
|
14196
|
+
persistDomainCache();
|
|
14197
|
+
}
|
|
14128
14198
|
async function getSkillWithTimeout(skillId, scope, timeoutMs = MARKETPLACE_GET_SKILL_TIMEOUT_MS) {
|
|
14129
14199
|
return Promise.race([
|
|
14130
14200
|
getSkill2(skillId, scope),
|
|
@@ -14176,6 +14246,10 @@ function isCachedSkillRelevantForIntent(skill, intent, contextUrl) {
|
|
|
14176
14246
|
const resolvedSkill = withContextReplayEndpoint(skill, intent, contextUrl);
|
|
14177
14247
|
const ranked = rankEndpoints(resolvedSkill.endpoints, intent, resolvedSkill.domain, contextUrl);
|
|
14178
14248
|
const top = ranked[0];
|
|
14249
|
+
const isSearchIntent = /\b(search|find|lookup|browse|discover)\b/i.test(intent);
|
|
14250
|
+
if (top && isSearchIntent && contextUrl && /captured page artifact/i.test(top.endpoint.description ?? "") && top.endpoint.response_schema?.type !== "array" && top.endpoint.url_template === contextUrl && !skillHasBetterStructuredSearchEndpoint(resolvedSkill, top.endpoint.endpoint_id, intent, contextUrl)) {
|
|
14251
|
+
return false;
|
|
14252
|
+
}
|
|
14179
14253
|
if (top && isEducationCatalogIntent(intent) && isRootContextUrl(contextUrl) && /captured page artifact/i.test(top.endpoint.description ?? "") && top.endpoint.url_template === contextUrl) {
|
|
14180
14254
|
return false;
|
|
14181
14255
|
}
|
|
@@ -14324,6 +14398,9 @@ async function withDomainCaptureLock(domain, fn) {
|
|
|
14324
14398
|
captureDomainLocks.delete(domain);
|
|
14325
14399
|
}
|
|
14326
14400
|
}
|
|
14401
|
+
function shouldFallbackToLiveCaptureAfterAutoexecFailure(autoexecFailedAll, contextUrl) {
|
|
14402
|
+
return autoexecFailedAll && !!contextUrl;
|
|
14403
|
+
}
|
|
14327
14404
|
function computeCompositeScore(embeddingScore, skill) {
|
|
14328
14405
|
const reliabilities = skill.endpoints.map((e) => e.reliability_score);
|
|
14329
14406
|
const avgReliability = reliabilities.length > 0 ? reliabilities.reduce((a, b) => a + b, 0) / reliabilities.length : 0.5;
|
|
@@ -14904,13 +14981,20 @@ async function resolveAndExecute(intent, params = {}, context, projection, optio
|
|
|
14904
14981
|
const autoResult = await tryAutoExecute(skill, source);
|
|
14905
14982
|
if (autoResult) {
|
|
14906
14983
|
promoteLearnedSkill(clientScope, cacheKey, skill, autoResult.trace.endpoint_id ?? "", context?.url);
|
|
14907
|
-
return autoResult;
|
|
14984
|
+
return { orchestratorResult: autoResult, autoexecFailedAll: false };
|
|
14908
14985
|
}
|
|
14986
|
+
return {
|
|
14987
|
+
orchestratorResult: buildDeferral(skill, source, extraFields),
|
|
14988
|
+
autoexecFailedAll: true
|
|
14989
|
+
};
|
|
14909
14990
|
} catch (err) {
|
|
14910
14991
|
console.warn(`[auto-exec] failed, falling back to deferral: ${err.message}`);
|
|
14911
14992
|
}
|
|
14912
14993
|
}
|
|
14913
|
-
return
|
|
14994
|
+
return {
|
|
14995
|
+
orchestratorResult: buildDeferral(skill, source, extraFields),
|
|
14996
|
+
autoexecFailedAll: false
|
|
14997
|
+
};
|
|
14914
14998
|
}
|
|
14915
14999
|
function buildDeferral(skill, source, extraFields) {
|
|
14916
15000
|
const resolvedSkill = withContextReplayEndpoint(skill, intent, context?.url);
|
|
@@ -15238,6 +15322,7 @@ async function resolveAndExecute(intent, params = {}, context, projection, optio
|
|
|
15238
15322
|
return null;
|
|
15239
15323
|
}
|
|
15240
15324
|
const requestedDomain = context?.domain ?? (context?.url ? new URL(context.url).hostname : null);
|
|
15325
|
+
const requestedDomainCacheKey = getDomainReuseKey(context?.url ?? requestedDomain);
|
|
15241
15326
|
const resolveCacheKey = buildResolveCacheKey(requestedDomain, intent, context?.url);
|
|
15242
15327
|
const cacheKey = scopedCacheKey(clientScope, resolveCacheKey);
|
|
15243
15328
|
if (!forceCapture && !agentChoseEndpoint) {
|
|
@@ -15246,10 +15331,15 @@ async function resolveAndExecute(intent, params = {}, context, projection, optio
|
|
|
15246
15331
|
if (cachedResult.expires <= Date.now() || !isAcceptableIntentResult(cachedResult.result, intent) || !isCachedSkillRelevantForIntent(cachedResult.skill, intent, context?.url)) {
|
|
15247
15332
|
routeResultCache.delete(cacheKey);
|
|
15248
15333
|
} else {
|
|
15249
|
-
timing.cache_hit = true;
|
|
15250
15334
|
const deferred2 = await buildDeferralWithAutoExec(cachedResult.skill, "marketplace");
|
|
15251
|
-
deferred2.
|
|
15252
|
-
|
|
15335
|
+
if (shouldFallbackToLiveCaptureAfterAutoexecFailure(deferred2.autoexecFailedAll, context?.url)) {
|
|
15336
|
+
console.log("[route-result-cache] stale cached skill; retrying via live capture");
|
|
15337
|
+
invalidateResolveCacheEntries([cacheKey], requestedDomainCacheKey ? [requestedDomainCacheKey] : []);
|
|
15338
|
+
} else {
|
|
15339
|
+
timing.cache_hit = true;
|
|
15340
|
+
deferred2.orchestratorResult.timing.cache_hit = true;
|
|
15341
|
+
return deferred2.orchestratorResult;
|
|
15342
|
+
}
|
|
15253
15343
|
}
|
|
15254
15344
|
}
|
|
15255
15345
|
}
|
|
@@ -15282,10 +15372,15 @@ async function resolveAndExecute(intent, params = {}, context, projection, optio
|
|
|
15282
15372
|
if (bestCached.scopedKey !== cacheKey) {
|
|
15283
15373
|
promoteLearnedSkill(clientScope, resolveCacheKey, bestCached.skill, bestCached.entry.endpointId, context?.url);
|
|
15284
15374
|
}
|
|
15285
|
-
timing.cache_hit = true;
|
|
15286
15375
|
const deferred2 = await buildDeferralWithAutoExec(bestCached.skill, "marketplace");
|
|
15287
|
-
deferred2.
|
|
15288
|
-
|
|
15376
|
+
if (shouldFallbackToLiveCaptureAfterAutoexecFailure(deferred2.autoexecFailedAll, context?.url)) {
|
|
15377
|
+
console.log("[route-cache] stale cached skill; retrying via live capture");
|
|
15378
|
+
invalidateResolveCacheEntries([cacheKey, bestCached.scopedKey], requestedDomainCacheKey ? [requestedDomainCacheKey] : []);
|
|
15379
|
+
} else {
|
|
15380
|
+
timing.cache_hit = true;
|
|
15381
|
+
deferred2.orchestratorResult.timing.cache_hit = true;
|
|
15382
|
+
return deferred2.orchestratorResult;
|
|
15383
|
+
}
|
|
15289
15384
|
}
|
|
15290
15385
|
}
|
|
15291
15386
|
if (!forceCapture && !agentChoseEndpoint && requestedDomain) {
|
|
@@ -15294,11 +15389,16 @@ async function resolveAndExecute(intent, params = {}, context, projection, optio
|
|
|
15294
15389
|
if (domainCached && Date.now() - domainCached.ts < 7 * 24 * 60 * 60000) {
|
|
15295
15390
|
const skill = readSkillSnapshot(domainCached.localSkillPath) ?? await getSkill2(domainCached.skillId, clientScope);
|
|
15296
15391
|
if (skill && isCachedSkillRelevantForIntent(skill, intent, context?.url)) {
|
|
15297
|
-
timing.cache_hit = true;
|
|
15298
15392
|
console.log(`[domain-cache] hit for ${domainKey} → skill ${skill.skill_id.slice(0, 15)}`);
|
|
15299
15393
|
const result2 = await buildDeferralWithAutoExec(skill, "marketplace");
|
|
15300
|
-
result2.
|
|
15301
|
-
|
|
15394
|
+
if (shouldFallbackToLiveCaptureAfterAutoexecFailure(result2.autoexecFailedAll, context?.url)) {
|
|
15395
|
+
console.log(`[domain-cache] stale skill for ${domainKey}; retrying via live capture`);
|
|
15396
|
+
invalidateResolveCacheEntries([cacheKey], [domainKey]);
|
|
15397
|
+
} else {
|
|
15398
|
+
timing.cache_hit = true;
|
|
15399
|
+
result2.orchestratorResult.timing.cache_hit = true;
|
|
15400
|
+
return result2.orchestratorResult;
|
|
15401
|
+
}
|
|
15302
15402
|
} else if (skill) {
|
|
15303
15403
|
const ranked = rankEndpoints(skill.endpoints, intent, skill.domain, context?.url);
|
|
15304
15404
|
const top = ranked[0];
|
|
@@ -15459,7 +15559,11 @@ async function resolveAndExecute(intent, params = {}, context, projection, optio
|
|
|
15459
15559
|
if (best.endpointId) {
|
|
15460
15560
|
console.log(`[search] endpoint-level hit hint: ${best.endpointId} score=${best.candidate.score.toFixed(3)}`);
|
|
15461
15561
|
}
|
|
15462
|
-
|
|
15562
|
+
const deferred2 = await buildDeferralWithAutoExec(best.skill, "marketplace");
|
|
15563
|
+
if (!shouldFallbackToLiveCaptureAfterAutoexecFailure(deferred2.autoexecFailedAll, context?.url)) {
|
|
15564
|
+
return deferred2.orchestratorResult;
|
|
15565
|
+
}
|
|
15566
|
+
console.log("[marketplace] stale top skill; retrying via live capture");
|
|
15463
15567
|
}
|
|
15464
15568
|
}
|
|
15465
15569
|
}
|
|
@@ -15472,8 +15576,6 @@ async function resolveAndExecute(intent, params = {}, context, projection, optio
|
|
|
15472
15576
|
if (!isCachedSkillRelevantForIntent(domainHit.skill, intent, context?.url)) {
|
|
15473
15577
|
capturedDomainCache.delete(cacheKey);
|
|
15474
15578
|
} else {
|
|
15475
|
-
timing.cache_hit = true;
|
|
15476
|
-
let staleCachedEndpoint = false;
|
|
15477
15579
|
if (agentChoseEndpoint) {
|
|
15478
15580
|
const execOut = await executeSkill(domainHit.skill, { ...params, endpoint_id: params.endpoint_id ?? domainHit.endpointId }, projection, { ...options, intent, contextUrl: context?.url });
|
|
15479
15581
|
if (execOut.trace.success && isAcceptableIntentResult(execOut.result, intent)) {
|
|
@@ -15488,11 +15590,17 @@ async function resolveAndExecute(intent, params = {}, context, projection, optio
|
|
|
15488
15590
|
extraction_hints: execOut.extraction_hints
|
|
15489
15591
|
};
|
|
15490
15592
|
}
|
|
15491
|
-
|
|
15593
|
+
invalidateResolveCacheEntries([cacheKey], requestedDomainCacheKey ? [requestedDomainCacheKey] : []);
|
|
15492
15594
|
}
|
|
15493
15595
|
const deferred2 = await buildDeferralWithAutoExec(domainHit.skill, "marketplace");
|
|
15494
|
-
deferred2.
|
|
15495
|
-
|
|
15596
|
+
if (shouldFallbackToLiveCaptureAfterAutoexecFailure(deferred2.autoexecFailedAll, context?.url)) {
|
|
15597
|
+
console.log("[captured-domain-cache] stale skill; retrying via live capture");
|
|
15598
|
+
invalidateResolveCacheEntries([cacheKey], requestedDomainCacheKey ? [requestedDomainCacheKey] : []);
|
|
15599
|
+
} else {
|
|
15600
|
+
timing.cache_hit = true;
|
|
15601
|
+
deferred2.orchestratorResult.timing.cache_hit = true;
|
|
15602
|
+
return deferred2.orchestratorResult;
|
|
15603
|
+
}
|
|
15496
15604
|
}
|
|
15497
15605
|
}
|
|
15498
15606
|
const bypassLiveCaptureQueue = shouldBypassLiveCaptureQueue(context?.url);
|
|
@@ -15522,9 +15630,9 @@ async function resolveAndExecute(intent, params = {}, context, projection, optio
|
|
|
15522
15630
|
auth_recommended: true,
|
|
15523
15631
|
auth_hint: captureResult2.auth_hint
|
|
15524
15632
|
} : undefined);
|
|
15525
|
-
queuePassivePublishIfExecuted(learned_skill, deferred2, parityBaseline2);
|
|
15526
|
-
deferred2.timing.cache_hit = true;
|
|
15527
|
-
return deferred2;
|
|
15633
|
+
queuePassivePublishIfExecuted(learned_skill, deferred2.orchestratorResult, parityBaseline2);
|
|
15634
|
+
deferred2.orchestratorResult.timing.cache_hit = true;
|
|
15635
|
+
return deferred2.orchestratorResult;
|
|
15528
15636
|
}
|
|
15529
15637
|
return {
|
|
15530
15638
|
result,
|
|
@@ -15725,8 +15833,8 @@ async function resolveAndExecute(intent, params = {}, context, projection, optio
|
|
|
15725
15833
|
auth_recommended: true,
|
|
15726
15834
|
auth_hint: captureResult.auth_hint
|
|
15727
15835
|
} : undefined);
|
|
15728
|
-
queuePassivePublishIfExecuted(learned_skill, deferred, parityBaseline);
|
|
15729
|
-
return deferred;
|
|
15836
|
+
queuePassivePublishIfExecuted(learned_skill, deferred.orchestratorResult, parityBaseline);
|
|
15837
|
+
return deferred.orchestratorResult;
|
|
15730
15838
|
}
|
|
15731
15839
|
async function getOrCreateBrowserCaptureSkill() {
|
|
15732
15840
|
const existing = await getSkill2(BROWSER_CAPTURE_SKILL_ID);
|
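package/package.json
CHANGED
(The package.json hunk, listed as +1 -1 in the file summary at the top, is not present in this extract.)
package/runtime-src/capture/index.ts
CHANGED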
|
@@ -315,9 +315,9 @@ function shouldRetryEphemeralProfileError(error: unknown): boolean {
|
|
|
315
315
|
return /persistentcontext|target page, context or browser has been closed|browser has been closed|page has been closed/i.test(message);
|
|
316
316
|
}
|
|
317
317
|
|
|
318
|
-
function shouldRestartKuriForError(error: unknown): boolean {
|
|
318
|
+
export function shouldRestartKuriForError(error: unknown): boolean {
|
|
319
319
|
const message = error instanceof Error ? error.message : String(error ?? "");
|
|
320
|
-
return /CDP command failed|target closed|session closed|No target with given id/i.test(message);
|
|
320
|
+
return /CDP command failed|target closed|session closed|No target with given id|No tabs available and failed to create one/i.test(message);
|
|
321
321
|
}
|
|
322
322
|
|
|
323
323
|
/**
|
|
package/runtime-src/execution/index.ts
CHANGED
@@ -746,7 +746,7 @@ export function buildStructuredReplayHeaders(
|
|
|
746
746
|
return headers;
|
|
747
747
|
}
|
|
748
748
|
|
|
749
|
-
function shouldFallbackToBrowserReplay(
|
|
749
|
+
export function shouldFallbackToBrowserReplay(
|
|
750
750
|
data: unknown,
|
|
751
751
|
endpoint: EndpointDescriptor,
|
|
752
752
|
intent?: string,
|
|
@@ -755,11 +755,37 @@ function shouldFallbackToBrowserReplay(
|
|
|
755
755
|
const replayUrl = resolveExecutionUrlTemplate(endpoint, contextUrl);
|
|
756
756
|
if (!isDocumentLikeUrl(replayUrl)) return false;
|
|
757
757
|
if (endpoint.dom_extraction && typeof data === "string" && isHtml(data)) return false;
|
|
758
|
-
if (typeof data === "string")
|
|
758
|
+
if (typeof data === "string") {
|
|
759
|
+
if (isHtml(data) && looksLikeSearchAuthOrHomepageBounceHtml(data)) return false;
|
|
760
|
+
return isHtml(data) || isSpaShell(data);
|
|
761
|
+
}
|
|
759
762
|
const assessment = assessIntentResult(data, intent);
|
|
760
763
|
return assessment.verdict === "fail";
|
|
761
764
|
}
|
|
762
765
|
|
|
766
|
+
export function looksLikeSearchAuthOrHomepageBounceHtml(
|
|
767
|
+
html: string,
|
|
768
|
+
finalUrl?: string,
|
|
769
|
+
): boolean {
|
|
770
|
+
if (!isHtml(html)) return false;
|
|
771
|
+
const lower = html.toLowerCase();
|
|
772
|
+
const titleMatch = lower.match(/<title[^>]*>([^<]+)</i);
|
|
773
|
+
const title = titleMatch?.[1]?.trim() ?? "";
|
|
774
|
+
const final = finalUrl?.toLowerCase() ?? "";
|
|
775
|
+
const combined = `${title} ${lower}`;
|
|
776
|
+
const hasLawnetBounceMarkers =
|
|
777
|
+
/about lawnet legal research/.test(combined) ||
|
|
778
|
+
/what is lawnet/.test(combined) ||
|
|
779
|
+
/forgot password/.test(combined) ||
|
|
780
|
+
/lawnet legal research, a service of/.test(combined) ||
|
|
781
|
+
/\/lawnet\/web\/lawnet\/about-lawnet\b/.test(combined) ||
|
|
782
|
+
/\/lawnet\/web\/lawnet\/home\b/.test(final);
|
|
783
|
+
const hasGenericAuthMarkers =
|
|
784
|
+
/\b(login|log in|sign in|forgot password)\b/.test(combined) &&
|
|
785
|
+
/\b(search|legal research|lawnet)\b/.test(combined);
|
|
786
|
+
return hasLawnetBounceMarkers || hasGenericAuthMarkers;
|
|
787
|
+
}
|
|
788
|
+
|
|
763
789
|
function buildSampleRequestFromUrl(url: string): Record<string, unknown> {
|
|
764
790
|
try {
|
|
765
791
|
return Object.fromEntries(sanitizeNavigationQueryParams(new URL(url)).searchParams.entries());
|
|
@@ -2127,10 +2153,12 @@ async function executeDomExtractionEndpoint(
|
|
|
2127
2153
|
cookies: Array<{ name: string; value: string; domain: string }>,
|
|
2128
2154
|
): Promise<{ data: unknown; status: number; trace_id: string; network_events?: TraceNetworkEvent[] }> {
|
|
2129
2155
|
const extractionIntent = deriveDomExecutionIntent(endpoint, intent);
|
|
2156
|
+
const isCapturedPageArtifact = /captured page artifact/i.test(endpoint.description ?? "");
|
|
2130
2157
|
|
|
2131
2158
|
// SSR fast-path: try plain HTTP fetch before browser
|
|
2132
2159
|
const ssrResult = await tryHttpFetch(url, authHeaders, cookies);
|
|
2133
2160
|
if (ssrResult) {
|
|
2161
|
+
const looksLikeBounce = looksLikeSearchAuthOrHomepageBounceHtml(ssrResult.html, ssrResult.final_url);
|
|
2134
2162
|
const ssrExtracted = extractFromDOMWithHint(ssrResult.html, extractionIntent, endpoint.dom_extraction);
|
|
2135
2163
|
if (ssrExtracted.data) {
|
|
2136
2164
|
const ssrQuality = validateExtractionQuality(ssrExtracted.data, ssrExtracted.confidence, extractionIntent);
|
|
@@ -2162,6 +2190,41 @@ async function executeDomExtractionEndpoint(
|
|
|
2162
2190
|
};
|
|
2163
2191
|
}
|
|
2164
2192
|
}
|
|
2193
|
+
if (isCapturedPageArtifact) {
|
|
2194
|
+
return {
|
|
2195
|
+
data: {
|
|
2196
|
+
error: "low_quality_dom_extraction",
|
|
2197
|
+
message: `Structured DOM extraction was rejected: ${looksLikeBounce ? "search_auth_or_homepage_bounce" : "captured_page_artifact_miss"}`,
|
|
2198
|
+
},
|
|
2199
|
+
status: 422,
|
|
2200
|
+
trace_id: nanoid(),
|
|
2201
|
+
network_events: [toTraceNetworkEvent({
|
|
2202
|
+
url: ssrResult.final_url,
|
|
2203
|
+
method: "GET",
|
|
2204
|
+
requestHeaders: authHeaders,
|
|
2205
|
+
responseStatus: 200,
|
|
2206
|
+
responseHeaders: { "content-type": "text/html" },
|
|
2207
|
+
responseBody: ssrResult.html,
|
|
2208
|
+
})],
|
|
2209
|
+
};
|
|
2210
|
+
}
|
|
2211
|
+
} else if (isCapturedPageArtifact && looksLikeBounce) {
|
|
2212
|
+
return {
|
|
2213
|
+
data: {
|
|
2214
|
+
error: "low_quality_dom_extraction",
|
|
2215
|
+
message: "Structured DOM extraction was rejected: search_auth_or_homepage_bounce",
|
|
2216
|
+
},
|
|
2217
|
+
status: 422,
|
|
2218
|
+
trace_id: nanoid(),
|
|
2219
|
+
network_events: [toTraceNetworkEvent({
|
|
2220
|
+
url: ssrResult.final_url,
|
|
2221
|
+
method: "GET",
|
|
2222
|
+
requestHeaders: authHeaders,
|
|
2223
|
+
responseStatus: 200,
|
|
2224
|
+
responseHeaders: { "content-type": "text/html" },
|
|
2225
|
+
responseBody: ssrResult.html,
|
|
2226
|
+
})],
|
|
2227
|
+
};
|
|
2165
2228
|
}
|
|
2166
2229
|
console.log(`[ssr-fast] miss, falling back to browser`);
|
|
2167
2230
|
} else {
|
|
package/runtime-src/orchestrator/index.ts
CHANGED
@@ -329,6 +329,21 @@ function promoteResultSnapshot(
|
|
|
329
329
|
});
|
|
330
330
|
}
|
|
331
331
|
|
|
332
|
+
function invalidateResolveCacheEntries(cacheKeys: string[], domainKeys: string[] = []): void {
|
|
333
|
+
let routeCacheDirty = false;
|
|
334
|
+
let domainCacheDirty = false;
|
|
335
|
+
for (const cacheKey of new Set(cacheKeys.filter(Boolean))) {
|
|
336
|
+
routeResultCache.delete(cacheKey);
|
|
337
|
+
capturedDomainCache.delete(cacheKey);
|
|
338
|
+
if (skillRouteCache.delete(cacheKey)) routeCacheDirty = true;
|
|
339
|
+
}
|
|
340
|
+
for (const domainKey of new Set(domainKeys.filter(Boolean))) {
|
|
341
|
+
if (domainSkillCache.delete(domainKey)) domainCacheDirty = true;
|
|
342
|
+
}
|
|
343
|
+
if (routeCacheDirty) persistRouteCache();
|
|
344
|
+
if (domainCacheDirty) persistDomainCache();
|
|
345
|
+
}
|
|
346
|
+
|
|
332
347
|
async function getSkillWithTimeout(
|
|
333
348
|
skillId: string,
|
|
334
349
|
scope: string,
|
|
@@ -408,6 +423,23 @@ export function isCachedSkillRelevantForIntent(
|
|
|
408
423
|
contextUrl,
|
|
409
424
|
);
|
|
410
425
|
const top = ranked[0];
|
|
426
|
+
const isSearchIntent = /\b(search|find|lookup|browse|discover)\b/i.test(intent);
|
|
427
|
+
if (
|
|
428
|
+
top &&
|
|
429
|
+
isSearchIntent &&
|
|
430
|
+
contextUrl &&
|
|
431
|
+
/captured page artifact/i.test(top.endpoint.description ?? "") &&
|
|
432
|
+
top.endpoint.response_schema?.type !== "array" &&
|
|
433
|
+
top.endpoint.url_template === contextUrl &&
|
|
434
|
+
!skillHasBetterStructuredSearchEndpoint(
|
|
435
|
+
resolvedSkill,
|
|
436
|
+
top.endpoint.endpoint_id,
|
|
437
|
+
intent,
|
|
438
|
+
contextUrl,
|
|
439
|
+
)
|
|
440
|
+
) {
|
|
441
|
+
return false;
|
|
442
|
+
}
|
|
411
443
|
if (
|
|
412
444
|
top &&
|
|
413
445
|
isEducationCatalogIntent(intent) &&
|
|
@@ -626,6 +658,18 @@ export interface OrchestratorResult {
|
|
|
626
658
|
extraction_hints?: import("../transform/schema-hints.js").ExtractionHint;
|
|
627
659
|
}
|
|
628
660
|
|
|
661
|
+
type AutoExecDecision = {
|
|
662
|
+
orchestratorResult: OrchestratorResult;
|
|
663
|
+
autoexecFailedAll: boolean;
|
|
664
|
+
};
|
|
665
|
+
|
|
666
|
+
export function shouldFallbackToLiveCaptureAfterAutoexecFailure(
|
|
667
|
+
autoexecFailedAll: boolean,
|
|
668
|
+
contextUrl?: string,
|
|
669
|
+
): boolean {
|
|
670
|
+
return autoexecFailedAll && !!contextUrl;
|
|
671
|
+
}
|
|
672
|
+
|
|
629
673
|
function computeCompositeScore(embeddingScore: number, skill: SkillManifest): number {
|
|
630
674
|
// Average reliability across endpoints
|
|
631
675
|
const reliabilities = skill.endpoints.map((e) => e.reliability_score);
|
|
@@ -1391,7 +1435,7 @@ export async function resolveAndExecute(
|
|
|
1391
1435
|
skill: SkillManifest,
|
|
1392
1436
|
source: "marketplace" | "live-capture",
|
|
1393
1437
|
extraFields?: Record<string, unknown>,
|
|
1394
|
-
): Promise<
|
|
1438
|
+
): Promise<AutoExecDecision> {
|
|
1395
1439
|
// Only attempt auto-exec if we have an intent to infer params from
|
|
1396
1440
|
if (intent && intent.trim().length > 0) {
|
|
1397
1441
|
try {
|
|
@@ -1399,13 +1443,20 @@ export async function resolveAndExecute(
|
|
|
1399
1443
|
if (autoResult) {
|
|
1400
1444
|
// Promote to marketplace cache so subsequent requests skip live-capture
|
|
1401
1445
|
promoteLearnedSkill(clientScope, cacheKey, skill, autoResult.trace.endpoint_id ?? "", context?.url);
|
|
1402
|
-
return autoResult;
|
|
1446
|
+
return { orchestratorResult: autoResult, autoexecFailedAll: false };
|
|
1403
1447
|
}
|
|
1448
|
+
return {
|
|
1449
|
+
orchestratorResult: buildDeferral(skill, source, extraFields),
|
|
1450
|
+
autoexecFailedAll: true,
|
|
1451
|
+
};
|
|
1404
1452
|
} catch (err) {
|
|
1405
1453
|
console.warn(`[auto-exec] failed, falling back to deferral: ${(err as Error).message}`);
|
|
1406
1454
|
}
|
|
1407
1455
|
}
|
|
1408
|
-
return
|
|
1456
|
+
return {
|
|
1457
|
+
orchestratorResult: buildDeferral(skill, source, extraFields),
|
|
1458
|
+
autoexecFailedAll: false,
|
|
1459
|
+
};
|
|
1409
1460
|
}
|
|
1410
1461
|
|
|
1411
1462
|
/** Build a deferral response — returns the skill + ranked endpoints for the agent to choose. */
|
|
@@ -1882,6 +1933,7 @@ export async function resolveAndExecute(
|
|
|
1882
1933
|
}
|
|
1883
1934
|
|
|
1884
1935
|
const requestedDomain = context?.domain ?? (context?.url ? new URL(context.url).hostname : null);
|
|
1936
|
+
const requestedDomainCacheKey = getDomainReuseKey(context?.url ?? requestedDomain);
|
|
1885
1937
|
const resolveCacheKey = buildResolveCacheKey(requestedDomain, intent, context?.url);
|
|
1886
1938
|
const cacheKey = scopedCacheKey(clientScope, resolveCacheKey);
|
|
1887
1939
|
|
|
@@ -1895,10 +1947,15 @@ export async function resolveAndExecute(
|
|
|
1895
1947
|
) {
|
|
1896
1948
|
routeResultCache.delete(cacheKey);
|
|
1897
1949
|
} else {
|
|
1898
|
-
timing.cache_hit = true;
|
|
1899
1950
|
const deferred = await buildDeferralWithAutoExec(cachedResult.skill, "marketplace");
|
|
1900
|
-
deferred.
|
|
1901
|
-
|
|
1951
|
+
if (shouldFallbackToLiveCaptureAfterAutoexecFailure(deferred.autoexecFailedAll, context?.url)) {
|
|
1952
|
+
console.log("[route-result-cache] stale cached skill; retrying via live capture");
|
|
1953
|
+
invalidateResolveCacheEntries([cacheKey], requestedDomainCacheKey ? [requestedDomainCacheKey] : []);
|
|
1954
|
+
} else {
|
|
1955
|
+
timing.cache_hit = true;
|
|
1956
|
+
deferred.orchestratorResult.timing.cache_hit = true;
|
|
1957
|
+
return deferred.orchestratorResult;
|
|
1958
|
+
}
|
|
1902
1959
|
}
|
|
1903
1960
|
}
|
|
1904
1961
|
}
|
|
@@ -1940,10 +1997,18 @@ export async function resolveAndExecute(
|
|
|
1940
1997
|
context?.url,
|
|
1941
1998
|
);
|
|
1942
1999
|
}
|
|
1943
|
-
timing.cache_hit = true;
|
|
1944
2000
|
const deferred = await buildDeferralWithAutoExec(bestCached.skill, "marketplace");
|
|
1945
|
-
deferred.
|
|
1946
|
-
|
|
2001
|
+
if (shouldFallbackToLiveCaptureAfterAutoexecFailure(deferred.autoexecFailedAll, context?.url)) {
|
|
2002
|
+
console.log("[route-cache] stale cached skill; retrying via live capture");
|
|
2003
|
+
invalidateResolveCacheEntries(
|
|
2004
|
+
[cacheKey, bestCached.scopedKey],
|
|
2005
|
+
requestedDomainCacheKey ? [requestedDomainCacheKey] : [],
|
|
2006
|
+
);
|
|
2007
|
+
} else {
|
|
2008
|
+
timing.cache_hit = true;
|
|
2009
|
+
deferred.orchestratorResult.timing.cache_hit = true;
|
|
2010
|
+
return deferred.orchestratorResult;
|
|
2011
|
+
}
|
|
1947
2012
|
}
|
|
1948
2013
|
}
|
|
1949
2014
|
|
|
@@ -1954,11 +2019,16 @@ export async function resolveAndExecute(
|
|
|
1954
2019
|
if (domainCached && Date.now() - domainCached.ts < 7 * 24 * 60 * 60_000) {
|
|
1955
2020
|
const skill = readSkillSnapshot(domainCached.localSkillPath) ?? await getSkill(domainCached.skillId, clientScope);
|
|
1956
2021
|
if (skill && isCachedSkillRelevantForIntent(skill, intent, context?.url)) {
|
|
1957
|
-
timing.cache_hit = true;
|
|
1958
2022
|
console.log(`[domain-cache] hit for ${domainKey} → skill ${skill.skill_id.slice(0, 15)}`);
|
|
1959
2023
|
const result = await buildDeferralWithAutoExec(skill, "marketplace");
|
|
1960
|
-
result.
|
|
1961
|
-
|
|
2024
|
+
if (shouldFallbackToLiveCaptureAfterAutoexecFailure(result.autoexecFailedAll, context?.url)) {
|
|
2025
|
+
console.log(`[domain-cache] stale skill for ${domainKey}; retrying via live capture`);
|
|
2026
|
+
invalidateResolveCacheEntries([cacheKey], [domainKey]);
|
|
2027
|
+
} else {
|
|
2028
|
+
timing.cache_hit = true;
|
|
2029
|
+
result.orchestratorResult.timing.cache_hit = true;
|
|
2030
|
+
return result.orchestratorResult;
|
|
2031
|
+
}
|
|
1962
2032
|
} else if (skill) {
|
|
1963
2033
|
const ranked = rankEndpoints(skill.endpoints, intent, skill.domain, context?.url);
|
|
1964
2034
|
const top = ranked[0];
|
|
@@ -2202,7 +2272,11 @@ export async function resolveAndExecute(
|
|
|
2202
2272
|
`[search] endpoint-level hit hint: ${best.endpointId} score=${best.candidate.score.toFixed(3)}`,
|
|
2203
2273
|
);
|
|
2204
2274
|
}
|
|
2205
|
-
|
|
2275
|
+
const deferred = await buildDeferralWithAutoExec(best.skill, "marketplace");
|
|
2276
|
+
if (!shouldFallbackToLiveCaptureAfterAutoexecFailure(deferred.autoexecFailedAll, context?.url)) {
|
|
2277
|
+
return deferred.orchestratorResult;
|
|
2278
|
+
}
|
|
2279
|
+
console.log("[marketplace] stale top skill; retrying via live capture");
|
|
2206
2280
|
}
|
|
2207
2281
|
}
|
|
2208
2282
|
} // end !forceCapture
|
|
@@ -2222,8 +2296,6 @@ export async function resolveAndExecute(
|
|
|
2222
2296
|
if (!isCachedSkillRelevantForIntent(domainHit.skill, intent, context?.url)) {
|
|
2223
2297
|
capturedDomainCache.delete(cacheKey);
|
|
2224
2298
|
} else {
|
|
2225
|
-
timing.cache_hit = true;
|
|
2226
|
-
let staleCachedEndpoint = false;
|
|
2227
2299
|
if (agentChoseEndpoint) {
|
|
2228
2300
|
const execOut = await executeSkill(
|
|
2229
2301
|
domainHit.skill,
|
|
@@ -2254,14 +2326,20 @@ export async function resolveAndExecute(
|
|
|
2254
2326
|
execOut.trace,
|
|
2255
2327
|
),
|
|
2256
2328
|
response_schema: execOut.response_schema,
|
|
2257
|
-
|
|
2258
|
-
|
|
2329
|
+
extraction_hints: execOut.extraction_hints,
|
|
2330
|
+
};
|
|
2259
2331
|
}
|
|
2260
|
-
|
|
2332
|
+
invalidateResolveCacheEntries([cacheKey], requestedDomainCacheKey ? [requestedDomainCacheKey] : []);
|
|
2261
2333
|
}
|
|
2262
2334
|
const deferred = await buildDeferralWithAutoExec(domainHit.skill, "marketplace");
|
|
2263
|
-
deferred.
|
|
2264
|
-
|
|
2335
|
+
if (shouldFallbackToLiveCaptureAfterAutoexecFailure(deferred.autoexecFailedAll, context?.url)) {
|
|
2336
|
+
console.log("[captured-domain-cache] stale skill; retrying via live capture");
|
|
2337
|
+
invalidateResolveCacheEntries([cacheKey], requestedDomainCacheKey ? [requestedDomainCacheKey] : []);
|
|
2338
|
+
} else {
|
|
2339
|
+
timing.cache_hit = true;
|
|
2340
|
+
deferred.orchestratorResult.timing.cache_hit = true;
|
|
2341
|
+
return deferred.orchestratorResult;
|
|
2342
|
+
}
|
|
2265
2343
|
}
|
|
2266
2344
|
}
|
|
2267
2345
|
|
|
@@ -2299,13 +2377,13 @@ export async function resolveAndExecute(
|
|
|
2299
2377
|
authRecommended
|
|
2300
2378
|
? {
|
|
2301
2379
|
auth_recommended: true,
|
|
2302
|
-
|
|
2303
|
-
|
|
2380
|
+
auth_hint: captureResult!.auth_hint,
|
|
2381
|
+
}
|
|
2304
2382
|
: undefined,
|
|
2305
2383
|
);
|
|
2306
|
-
queuePassivePublishIfExecuted(learned_skill, deferred, parityBaseline);
|
|
2307
|
-
deferred.timing.cache_hit = true;
|
|
2308
|
-
return deferred;
|
|
2384
|
+
queuePassivePublishIfExecuted(learned_skill, deferred.orchestratorResult, parityBaseline);
|
|
2385
|
+
deferred.orchestratorResult.timing.cache_hit = true;
|
|
2386
|
+
return deferred.orchestratorResult;
|
|
2309
2387
|
}
|
|
2310
2388
|
return {
|
|
2311
2389
|
result,
|
|
@@ -2596,8 +2674,8 @@ export async function resolveAndExecute(
|
|
|
2596
2674
|
}
|
|
2597
2675
|
: undefined,
|
|
2598
2676
|
);
|
|
2599
|
-
queuePassivePublishIfExecuted(learned_skill, deferred, parityBaseline);
|
|
2600
|
-
return deferred;
|
|
2677
|
+
queuePassivePublishIfExecuted(learned_skill, deferred.orchestratorResult, parityBaseline);
|
|
2678
|
+
return deferred.orchestratorResult;
|
|
2601
2679
|
}
|
|
2602
2680
|
|
|
2603
2681
|
async function getOrCreateBrowserCaptureSkill(): Promise<SkillManifest> {
|