unbrowse 3.8.0-preview.2 → 3.8.0-preview.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +14 -6
- package/dist/mcp.js +8 -7
- package/dist/server.js +441 -52
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -31,7 +31,7 @@ var __promiseAll = (args) => Promise.all(args);
|
|
|
31
31
|
var __require = /* @__PURE__ */ createRequire(import.meta.url);
|
|
32
32
|
|
|
33
33
|
// ../../src/build-info.generated.ts
|
|
34
|
-
var BUILD_RELEASE_VERSION = "3.8.0-preview.
|
|
34
|
+
var BUILD_RELEASE_VERSION = "3.8.0-preview.4", BUILD_GIT_SHA = "c391cacadcd1", BUILD_CODE_HASH = "5d9ebf619c61", BUILD_RELEASE_MANIFEST_BASE64 = "eyJzY2hlbWFfdmVyc2lvbiI6MSwicmVsZWFzZV92ZXJzaW9uIjoiMy44LjAtcHJldmlldy40IiwiZ2l0X3NoYSI6ImMzOTFjYWNhZGNkMSIsImNvZGVfaGFzaCI6IjVkOWViZjYxOWM2MSIsInRyYWNlX3ZlcnNpb24iOiI1ZDllYmY2MTljNjFAYzM5MWNhY2FkY2QxIiwiaXNzdWVkX2F0IjoiMjAyNi0wNC0yNVQwMDo0NDo0Mi45NTdaIn0", BUILD_RELEASE_MANIFEST_SIGNATURE = "9_GilAHQk3IHxPXLqGomsY7NFZuS1dc8dKCpLYvF8Gs", BUILD_DEFAULT_BACKEND_URL = "https://beta-api.unbrowse.ai";
|
|
35
35
|
|
|
36
36
|
// ../../src/version.ts
|
|
37
37
|
import { createHash } from "crypto";
|
|
@@ -425,8 +425,9 @@ function resolveSiblingEntrypoint(metaUrl, basename) {
|
|
|
425
425
|
return path.join(path.dirname(file), `${basename}${path.extname(file) || ".js"}`);
|
|
426
426
|
}
|
|
427
427
|
function runtimeArgsForEntrypoint(metaUrl, entrypoint) {
|
|
428
|
-
if (path.extname(entrypoint) !== ".ts")
|
|
429
|
-
return [entrypoint];
|
|
428
|
+
if (path.extname(entrypoint) !== ".ts") {
|
|
429
|
+
return [pathToFileURL(entrypoint).href];
|
|
430
|
+
}
|
|
430
431
|
if (process.versions.bun)
|
|
431
432
|
return [entrypoint];
|
|
432
433
|
try {
|
|
@@ -3007,6 +3008,7 @@ var init_execution = __esm(async () => {
|
|
|
3007
3008
|
init_bundle_scanner();
|
|
3008
3009
|
init_token_resolver();
|
|
3009
3010
|
init_marketplace();
|
|
3011
|
+
init_publish_admission();
|
|
3010
3012
|
init_transform();
|
|
3011
3013
|
init_drift();
|
|
3012
3014
|
init_client();
|
|
@@ -5447,8 +5449,9 @@ function isBundledVirtualEntrypoint(entrypoint) {
|
|
|
5447
5449
|
return entrypoint.startsWith("/$bunfs/");
|
|
5448
5450
|
}
|
|
5449
5451
|
function runtimeArgsForEntrypoint2(metaUrl, entrypoint) {
|
|
5450
|
-
if (path3.extname(entrypoint) !== ".ts")
|
|
5451
|
-
return [entrypoint];
|
|
5452
|
+
if (path3.extname(entrypoint) !== ".ts") {
|
|
5453
|
+
return [pathToFileURL2(entrypoint).href];
|
|
5454
|
+
}
|
|
5452
5455
|
if (process.versions.bun)
|
|
5453
5456
|
return [entrypoint];
|
|
5454
5457
|
try {
|
|
@@ -5461,9 +5464,14 @@ function runtimeArgsForEntrypoint2(metaUrl, entrypoint) {
|
|
|
5461
5464
|
return ["--import", "tsx", entrypoint];
|
|
5462
5465
|
}
|
|
5463
5466
|
function isMainModule(metaUrl) {
|
|
5464
|
-
|
|
5467
|
+
let entry = process.argv[1];
|
|
5465
5468
|
if (!entry)
|
|
5466
5469
|
return false;
|
|
5470
|
+
if (entry.startsWith("file://")) {
|
|
5471
|
+
try {
|
|
5472
|
+
entry = fileURLToPath3(entry);
|
|
5473
|
+
} catch {}
|
|
5474
|
+
}
|
|
5467
5475
|
const modulePath = fileURLToPath3(metaUrl);
|
|
5468
5476
|
try {
|
|
5469
5477
|
return realpathSync2(entry) === realpathSync2(modulePath);
|
package/dist/mcp.js
CHANGED
|
@@ -156,8 +156,9 @@ function resolveSiblingEntrypoint(metaUrl, basename) {
|
|
|
156
156
|
return path.join(path.dirname(file), `${basename}${path.extname(file) || ".js"}`);
|
|
157
157
|
}
|
|
158
158
|
function runtimeArgsForEntrypoint(metaUrl, entrypoint) {
|
|
159
|
-
if (path.extname(entrypoint) !== ".ts")
|
|
160
|
-
return [entrypoint];
|
|
159
|
+
if (path.extname(entrypoint) !== ".ts") {
|
|
160
|
+
return [pathToFileURL(entrypoint).href];
|
|
161
|
+
}
|
|
161
162
|
if (process.versions.bun)
|
|
162
163
|
return [entrypoint];
|
|
163
164
|
try {
|
|
@@ -225,11 +226,11 @@ import { dirname, join, parse } from "path";
|
|
|
225
226
|
import { fileURLToPath as fileURLToPath2 } from "url";
|
|
226
227
|
|
|
227
228
|
// ../../src/build-info.generated.ts
|
|
228
|
-
var BUILD_RELEASE_VERSION = "3.8.0-preview.
|
|
229
|
-
var BUILD_GIT_SHA = "
|
|
229
|
+
var BUILD_RELEASE_VERSION = "3.8.0-preview.4";
|
|
230
|
+
var BUILD_GIT_SHA = "c391cacadcd1";
|
|
230
231
|
var BUILD_CODE_HASH = "5d9ebf619c61";
|
|
231
|
-
var BUILD_RELEASE_MANIFEST_BASE64 = "
|
|
232
|
-
var BUILD_RELEASE_MANIFEST_SIGNATURE = "
|
|
232
|
+
var BUILD_RELEASE_MANIFEST_BASE64 = "eyJzY2hlbWFfdmVyc2lvbiI6MSwicmVsZWFzZV92ZXJzaW9uIjoiMy44LjAtcHJldmlldy40IiwiZ2l0X3NoYSI6ImMzOTFjYWNhZGNkMSIsImNvZGVfaGFzaCI6IjVkOWViZjYxOWM2MSIsInRyYWNlX3ZlcnNpb24iOiI1ZDllYmY2MTljNjFAYzM5MWNhY2FkY2QxIiwiaXNzdWVkX2F0IjoiMjAyNi0wNC0yNVQwMDo0NDo0Mi45NTdaIn0";
|
|
233
|
+
var BUILD_RELEASE_MANIFEST_SIGNATURE = "9_GilAHQk3IHxPXLqGomsY7NFZuS1dc8dKCpLYvF8Gs";
|
|
233
234
|
var BUILD_DEFAULT_BACKEND_URL = "https://beta-api.unbrowse.ai";
|
|
234
235
|
|
|
235
236
|
// ../../src/version.ts
|
|
@@ -2498,7 +2499,7 @@ var tools = [
|
|
|
2498
2499
|
{
|
|
2499
2500
|
name: "unbrowse_annotate",
|
|
2500
2501
|
description: "Contribute constraints or best practices for an endpoint. Call this after executing an endpoint to share what you learned (required params, gotchas, tips) with other agents.",
|
|
2501
|
-
|
|
2502
|
+
inputSchema: {
|
|
2502
2503
|
type: "object",
|
|
2503
2504
|
properties: {
|
|
2504
2505
|
skill: { type: "string", description: "Skill ID" },
|
package/dist/server.js
CHANGED
|
@@ -4234,6 +4234,17 @@ function getIntentEntityRules(kind) {
|
|
|
4234
4234
|
}
|
|
4235
4235
|
}
|
|
4236
4236
|
function isSemanticallyAdmissibleResponse(req, sampleResponse, sampleRequest, context) {
|
|
4237
|
+
if (/\/graphql(\/|$|\?)/i.test(req.url)) {
|
|
4238
|
+
const noiseOpRe = /globalnav|sidenav|navdash|topnav|nav_|notification|notif_|preloadstate|tracking|telemetry|pingback|heartbeat|presence|rightrail|gno_|viewertracking|metadata$|\bmetadata\b/i;
|
|
4239
|
+
const opName = extractGraphQLOperationName(req.url, req.request_body) ?? "";
|
|
4240
|
+
const opMatchesNoise = !!opName && noiseOpRe.test(opName);
|
|
4241
|
+
const urlMatchesNoise = noiseOpRe.test(req.url);
|
|
4242
|
+
const bodyMatchesNoise = typeof req.request_body === "string" && noiseOpRe.test(req.request_body.slice(0, 500));
|
|
4243
|
+
if (opMatchesNoise || urlMatchesNoise || bodyMatchesNoise) {
|
|
4244
|
+
return { ok: false, reason: "graphql_noise_operation" };
|
|
4245
|
+
}
|
|
4246
|
+
return { ok: true, reason: "semantic_graphql_bypass" };
|
|
4247
|
+
}
|
|
4237
4248
|
const kind = inferIntentEntityKind(context?.intent);
|
|
4238
4249
|
const action2 = inferIntentActionKind(context?.intent);
|
|
4239
4250
|
if (!kind) {
|
|
@@ -4338,7 +4349,7 @@ function scoreRequest(req) {
|
|
|
4338
4349
|
}
|
|
4339
4350
|
return score;
|
|
4340
4351
|
}
|
|
4341
|
-
function extractEndpoints(requests, wsMessages, context) {
|
|
4352
|
+
function extractEndpoints(requests, wsMessages, context, traceSink) {
|
|
4342
4353
|
const seen = new Set;
|
|
4343
4354
|
const endpoints = [];
|
|
4344
4355
|
const traceRows = [];
|
|
@@ -4362,6 +4373,16 @@ function extractEndpoints(requests, wsMessages, context) {
|
|
|
4362
4373
|
continue;
|
|
4363
4374
|
}
|
|
4364
4375
|
if (!hasAdmissibleParsedBody(req.response_body)) {
|
|
4376
|
+
const body = req.response_body;
|
|
4377
|
+
if (typeof body === "string" && body.length > 20) {
|
|
4378
|
+
const trimmed = body.trimStart().slice(0, 200).toLowerCase();
|
|
4379
|
+
const looksCss = /^[.#@a-z][a-z0-9_\-]*\s*\{|^body\s*\{|^@media|^@import|^@font-face/.test(trimmed);
|
|
4380
|
+
const looksJs = /^function\s|^var\s|^let\s|^const\s|^import\s|^\(function|^\/\*|^\!function/.test(trimmed) && !trimmed.includes('{"');
|
|
4381
|
+
if (looksCss || looksJs) {
|
|
4382
|
+
traceRows.push({ url: req.url, method: req.method, score, kept: false, reason: "body_not_json_or_html" });
|
|
4383
|
+
continue;
|
|
4384
|
+
}
|
|
4385
|
+
}
|
|
4365
4386
|
const urlPath = (() => {
|
|
4366
4387
|
try {
|
|
4367
4388
|
return new URL(req.url).pathname;
|
|
@@ -4369,7 +4390,7 @@ function extractEndpoints(requests, wsMessages, context) {
|
|
|
4369
4390
|
return "";
|
|
4370
4391
|
}
|
|
4371
4392
|
})();
|
|
4372
|
-
const isApiUrl = /\/(api|graphql|youtubei|__ssr_data__)\b/i.test(urlPath) || /\.(json)(\?|$)/.test(req.url);
|
|
4393
|
+
const isApiUrl = /\/(api|graphql|youtubei|__ssr_data__|_next\/data|xhr|ajax|rest)\b/i.test(urlPath) || /\/v\d+\//i.test(urlPath) || /\/async[-_]/i.test(urlPath) || /[-_](state|data|feed|timeline|search|list|results)(\?|$|\/)/i.test(urlPath) || /\.(json)(\?|$)/.test(req.url);
|
|
4373
4394
|
let graphqlOpName;
|
|
4374
4395
|
if (/graphql/i.test(req.url)) {
|
|
4375
4396
|
graphqlOpName = extractGraphQLOperationName(req.url, req.request_body);
|
|
@@ -4393,8 +4414,21 @@ function extractEndpoints(requests, wsMessages, context) {
|
|
|
4393
4414
|
const reqHost = new URL(req.url).hostname;
|
|
4394
4415
|
const reqDomain = getRegistrableDomain(reqHost);
|
|
4395
4416
|
if (!affinityDomains.has(reqDomain)) {
|
|
4396
|
-
|
|
4397
|
-
|
|
4417
|
+
const reqBrand = reqDomain.split(".")[0] ?? "";
|
|
4418
|
+
let siblingOk = false;
|
|
4419
|
+
if (reqBrand.length >= 4) {
|
|
4420
|
+
for (const a of affinityDomains) {
|
|
4421
|
+
const aBrand = a.split(".")[0] ?? "";
|
|
4422
|
+
if (aBrand.length >= 4 && (reqBrand.startsWith(aBrand) || aBrand.startsWith(reqBrand))) {
|
|
4423
|
+
siblingOk = true;
|
|
4424
|
+
break;
|
|
4425
|
+
}
|
|
4426
|
+
}
|
|
4427
|
+
}
|
|
4428
|
+
if (!siblingOk) {
|
|
4429
|
+
traceRows.push({ url: req.url, method: req.method, score, kept: false, reason: "domain_mismatch" });
|
|
4430
|
+
continue;
|
|
4431
|
+
}
|
|
4398
4432
|
}
|
|
4399
4433
|
} catch {
|
|
4400
4434
|
traceRows.push({ url: req.url, method: req.method, score, kept: false, reason: "bad_url" });
|
|
@@ -4599,6 +4633,8 @@ function extractEndpoints(requests, wsMessages, context) {
|
|
|
4599
4633
|
resource_kind: endpoint.semantic?.resource_kind
|
|
4600
4634
|
}))
|
|
4601
4635
|
});
|
|
4636
|
+
if (traceSink)
|
|
4637
|
+
traceSink.rows = traceRows;
|
|
4602
4638
|
return endpoints;
|
|
4603
4639
|
}
|
|
4604
4640
|
function isApiLike(req) {
|
|
@@ -7120,7 +7156,7 @@ var init_capture = __esm(async () => {
|
|
|
7120
7156
|
});
|
|
7121
7157
|
|
|
7122
7158
|
// ../../src/build-info.generated.ts
|
|
7123
|
-
var BUILD_RELEASE_VERSION = "3.8.0-preview.
|
|
7159
|
+
var BUILD_RELEASE_VERSION = "3.8.0-preview.4", BUILD_GIT_SHA = "c391cacadcd1", BUILD_CODE_HASH = "5d9ebf619c61", BUILD_RELEASE_MANIFEST_BASE64 = "eyJzY2hlbWFfdmVyc2lvbiI6MSwicmVsZWFzZV92ZXJzaW9uIjoiMy44LjAtcHJldmlldy40IiwiZ2l0X3NoYSI6ImMzOTFjYWNhZGNkMSIsImNvZGVfaGFzaCI6IjVkOWViZjYxOWM2MSIsInRyYWNlX3ZlcnNpb24iOiI1ZDllYmY2MTljNjFAYzM5MWNhY2FkY2QxIiwiaXNzdWVkX2F0IjoiMjAyNi0wNC0yNVQwMDo0NDo0Mi45NTdaIn0", BUILD_RELEASE_MANIFEST_SIGNATURE = "9_GilAHQk3IHxPXLqGomsY7NFZuS1dc8dKCpLYvF8Gs", BUILD_DEFAULT_BACKEND_URL = "https://beta-api.unbrowse.ai";
|
|
7124
7160
|
|
|
7125
7161
|
// ../../src/version.ts
|
|
7126
7162
|
import { createHash as createHash2 } from "crypto";
|
|
@@ -8793,6 +8829,7 @@ function freshReasonCounts() {
|
|
|
8793
8829
|
noise: 0,
|
|
8794
8830
|
fragile_graphql: 0,
|
|
8795
8831
|
no_durable_signal: 0,
|
|
8832
|
+
dom_fallback_only: 0,
|
|
8796
8833
|
family_dedup: 0,
|
|
8797
8834
|
over_limit: 0
|
|
8798
8835
|
};
|
|
@@ -8812,6 +8849,9 @@ function looksOpaqueIdentifier(value) {
|
|
|
8812
8849
|
return hasAlpha && hasDigit;
|
|
8813
8850
|
}
|
|
8814
8851
|
function isCanonicalDocumentReplay(endpoint) {
|
|
8852
|
+
const extractionMethod = endpoint.dom_extraction?.extraction_method ?? "";
|
|
8853
|
+
if (extractionMethod.startsWith("spa-"))
|
|
8854
|
+
return false;
|
|
8815
8855
|
if (/captured page artifact/i.test(endpoint.description ?? ""))
|
|
8816
8856
|
return true;
|
|
8817
8857
|
if (!endpoint.trigger_url)
|
|
@@ -8963,6 +9003,27 @@ function selectMarketplacePublishEndpoints(skill, options = {}) {
|
|
|
8963
9003
|
score: scoreEndpoint(endpoint)
|
|
8964
9004
|
});
|
|
8965
9005
|
}
|
|
9006
|
+
const hasRealEndpoint = candidates.some((c) => !isCanonicalDocumentReplay(c.endpoint));
|
|
9007
|
+
if (!hasRealEndpoint) {
|
|
9008
|
+
reasons.dom_fallback_only += candidates.length;
|
|
9009
|
+
return {
|
|
9010
|
+
endpoints: [],
|
|
9011
|
+
stats: {
|
|
9012
|
+
total: skill.endpoints?.length ?? 0,
|
|
9013
|
+
kept: 0,
|
|
9014
|
+
by_reason: reasons
|
|
9015
|
+
}
|
|
9016
|
+
};
|
|
9017
|
+
}
|
|
9018
|
+
const filteredCandidates = candidates.filter((c) => {
|
|
9019
|
+
if (isCanonicalDocumentReplay(c.endpoint)) {
|
|
9020
|
+
reasons.dom_fallback_only += 1;
|
|
9021
|
+
return false;
|
|
9022
|
+
}
|
|
9023
|
+
return true;
|
|
9024
|
+
});
|
|
9025
|
+
candidates.length = 0;
|
|
9026
|
+
candidates.push(...filteredCandidates);
|
|
8966
9027
|
candidates.sort((a, b) => {
|
|
8967
9028
|
if (b.score !== a.score)
|
|
8968
9029
|
return b.score - a.score;
|
|
@@ -12408,6 +12469,161 @@ function extractFlashNoticeSpecial(html, intent) {
|
|
|
12408
12469
|
selector: buildReplaySelector(flash)
|
|
12409
12470
|
}];
|
|
12410
12471
|
}
|
|
12472
|
+
function sliceBalancedObject(src, startIdx) {
|
|
12473
|
+
const first = src.indexOf("{", startIdx);
|
|
12474
|
+
if (first < 0)
|
|
12475
|
+
return null;
|
|
12476
|
+
let depth = 0;
|
|
12477
|
+
let inString = false;
|
|
12478
|
+
let stringChar = "";
|
|
12479
|
+
let escaped = false;
|
|
12480
|
+
for (let i = first;i < src.length; i++) {
|
|
12481
|
+
const c = src[i];
|
|
12482
|
+
if (escaped) {
|
|
12483
|
+
escaped = false;
|
|
12484
|
+
continue;
|
|
12485
|
+
}
|
|
12486
|
+
if (inString) {
|
|
12487
|
+
if (c === "\\") {
|
|
12488
|
+
escaped = true;
|
|
12489
|
+
continue;
|
|
12490
|
+
}
|
|
12491
|
+
if (c === stringChar) {
|
|
12492
|
+
inString = false;
|
|
12493
|
+
}
|
|
12494
|
+
continue;
|
|
12495
|
+
}
|
|
12496
|
+
if (c === '"' || c === "'" || c === "`") {
|
|
12497
|
+
inString = true;
|
|
12498
|
+
stringChar = c;
|
|
12499
|
+
continue;
|
|
12500
|
+
}
|
|
12501
|
+
if (c === "{")
|
|
12502
|
+
depth += 1;
|
|
12503
|
+
else if (c === "}") {
|
|
12504
|
+
depth -= 1;
|
|
12505
|
+
if (depth === 0)
|
|
12506
|
+
return src.substring(first, i + 1);
|
|
12507
|
+
}
|
|
12508
|
+
}
|
|
12509
|
+
return null;
|
|
12510
|
+
}
|
|
12511
|
+
function sliceBalancedAny(src, startIdx) {
|
|
12512
|
+
const open = src[startIdx];
|
|
12513
|
+
if (open !== "{" && open !== "[")
|
|
12514
|
+
return null;
|
|
12515
|
+
const closeCh = open === "{" ? "}" : "]";
|
|
12516
|
+
let depth = 0;
|
|
12517
|
+
let inString = false;
|
|
12518
|
+
let stringChar = "";
|
|
12519
|
+
let escaped = false;
|
|
12520
|
+
for (let i = startIdx;i < src.length; i++) {
|
|
12521
|
+
const c = src[i];
|
|
12522
|
+
if (escaped) {
|
|
12523
|
+
escaped = false;
|
|
12524
|
+
continue;
|
|
12525
|
+
}
|
|
12526
|
+
if (inString) {
|
|
12527
|
+
if (c === "\\") {
|
|
12528
|
+
escaped = true;
|
|
12529
|
+
continue;
|
|
12530
|
+
}
|
|
12531
|
+
if (c === stringChar) {
|
|
12532
|
+
inString = false;
|
|
12533
|
+
}
|
|
12534
|
+
continue;
|
|
12535
|
+
}
|
|
12536
|
+
if (c === '"' || c === "'" || c === "`") {
|
|
12537
|
+
inString = true;
|
|
12538
|
+
stringChar = c;
|
|
12539
|
+
continue;
|
|
12540
|
+
}
|
|
12541
|
+
if (c === open)
|
|
12542
|
+
depth += 1;
|
|
12543
|
+
else if (c === closeCh) {
|
|
12544
|
+
depth -= 1;
|
|
12545
|
+
if (depth === 0)
|
|
12546
|
+
return src.substring(startIdx, i + 1);
|
|
12547
|
+
}
|
|
12548
|
+
}
|
|
12549
|
+
return null;
|
|
12550
|
+
}
|
|
12551
|
+
function findWindowAssignmentPayload(html, varName) {
|
|
12552
|
+
const assignRe = new RegExp(String.raw`window\.${varName}\s*=\s*(\{)`, "i");
|
|
12553
|
+
const m = assignRe.exec(html);
|
|
12554
|
+
if (!m)
|
|
12555
|
+
return null;
|
|
12556
|
+
const startIdx = m.index + m[0].length - 1;
|
|
12557
|
+
return sliceBalancedObject(html, startIdx);
|
|
12558
|
+
}
|
|
12559
|
+
function unwrapInfiniteQuery(data) {
|
|
12560
|
+
if (!data || typeof data !== "object")
|
|
12561
|
+
return [];
|
|
12562
|
+
const d = data;
|
|
12563
|
+
const pages = d.pages;
|
|
12564
|
+
if (!Array.isArray(pages) || pages.length === 0)
|
|
12565
|
+
return [];
|
|
12566
|
+
if (!("pageParams" in d))
|
|
12567
|
+
return [];
|
|
12568
|
+
const merged = [];
|
|
12569
|
+
for (const page of pages) {
|
|
12570
|
+
if (Array.isArray(page)) {
|
|
12571
|
+
merged.push(...page);
|
|
12572
|
+
} else if (page && typeof page === "object") {
|
|
12573
|
+
const p = page;
|
|
12574
|
+
const listKeys = ["data", "items", "results", "articles", "posts", "nodes", "records"];
|
|
12575
|
+
let found = false;
|
|
12576
|
+
for (const k of listKeys) {
|
|
12577
|
+
const v = p[k];
|
|
12578
|
+
if (Array.isArray(v)) {
|
|
12579
|
+
merged.push(...v);
|
|
12580
|
+
found = true;
|
|
12581
|
+
break;
|
|
12582
|
+
}
|
|
12583
|
+
}
|
|
12584
|
+
if (!found) {
|
|
12585
|
+
for (const v of Object.values(p)) {
|
|
12586
|
+
if (Array.isArray(v) && v.length > 0) {
|
|
12587
|
+
merged.push(...v);
|
|
12588
|
+
found = true;
|
|
12589
|
+
break;
|
|
12590
|
+
}
|
|
12591
|
+
}
|
|
12592
|
+
}
|
|
12593
|
+
if (!found)
|
|
12594
|
+
merged.push(page);
|
|
12595
|
+
}
|
|
12596
|
+
}
|
|
12597
|
+
return merged;
|
|
12598
|
+
}
|
|
12599
|
+
function unwrapDehydratedState(pageProps) {
|
|
12600
|
+
if (!pageProps || typeof pageProps !== "object")
|
|
12601
|
+
return [];
|
|
12602
|
+
const dh = pageProps.dehydratedState;
|
|
12603
|
+
if (!dh || typeof dh !== "object")
|
|
12604
|
+
return [];
|
|
12605
|
+
const queries = dh.queries;
|
|
12606
|
+
if (!Array.isArray(queries))
|
|
12607
|
+
return [];
|
|
12608
|
+
const extracted = [];
|
|
12609
|
+
for (const q of queries) {
|
|
12610
|
+
if (!q || typeof q !== "object")
|
|
12611
|
+
continue;
|
|
12612
|
+
const state = q.state;
|
|
12613
|
+
if (!state || typeof state !== "object")
|
|
12614
|
+
continue;
|
|
12615
|
+
const data = state.data;
|
|
12616
|
+
if (data == null)
|
|
12617
|
+
continue;
|
|
12618
|
+
const infinitePages = unwrapInfiniteQuery(data);
|
|
12619
|
+
if (infinitePages.length > 0) {
|
|
12620
|
+
extracted.push(infinitePages);
|
|
12621
|
+
} else {
|
|
12622
|
+
extracted.push(data);
|
|
12623
|
+
}
|
|
12624
|
+
}
|
|
12625
|
+
return extracted;
|
|
12626
|
+
}
|
|
12411
12627
|
function extractSPAData(html) {
|
|
12412
12628
|
const results = [];
|
|
12413
12629
|
const nextDataMatch = html.match(/<script\s+id="__NEXT_DATA__"[^>]*>([\s\S]*?)<\/script>/i);
|
|
@@ -12416,18 +12632,31 @@ function extractSPAData(html) {
|
|
|
12416
12632
|
const parsed = JSON.parse(nextDataMatch[1]);
|
|
12417
12633
|
const pageProps = parsed?.props?.pageProps;
|
|
12418
12634
|
if (pageProps && typeof pageProps === "object" && Object.keys(pageProps).length > 0) {
|
|
12419
|
-
|
|
12420
|
-
|
|
12421
|
-
|
|
12422
|
-
|
|
12423
|
-
|
|
12635
|
+
const dehydrated = unwrapDehydratedState(pageProps);
|
|
12636
|
+
for (const qdata of dehydrated) {
|
|
12637
|
+
if (qdata && typeof qdata === "object") {
|
|
12638
|
+
results.push({
|
|
12639
|
+
type: "spa-nextjs",
|
|
12640
|
+
data: qdata,
|
|
12641
|
+
element_count: countDataElements(qdata)
|
|
12642
|
+
});
|
|
12643
|
+
}
|
|
12644
|
+
}
|
|
12645
|
+
const rawPageProps = dehydrated.length > 0 ? Object.fromEntries(Object.entries(pageProps).filter(([key]) => key !== "dehydratedState")) : pageProps;
|
|
12646
|
+
if (rawPageProps && Object.keys(rawPageProps).length > 0) {
|
|
12647
|
+
results.push({
|
|
12648
|
+
type: "spa-nextjs",
|
|
12649
|
+
data: rawPageProps,
|
|
12650
|
+
element_count: countDataElements(rawPageProps)
|
|
12651
|
+
});
|
|
12652
|
+
}
|
|
12424
12653
|
}
|
|
12425
12654
|
} catch {}
|
|
12426
12655
|
}
|
|
12427
|
-
const
|
|
12428
|
-
if (
|
|
12656
|
+
const nuxtPayload = findWindowAssignmentPayload(html, "__NUXT__");
|
|
12657
|
+
if (nuxtPayload) {
|
|
12429
12658
|
try {
|
|
12430
|
-
const parsed = JSON.parse(
|
|
12659
|
+
const parsed = JSON.parse(nuxtPayload);
|
|
12431
12660
|
const data = parsed?.data?.[0] ?? parsed?.state ?? parsed;
|
|
12432
12661
|
if (data && typeof data === "object" && Object.keys(data).length > 0) {
|
|
12433
12662
|
results.push({
|
|
@@ -12438,10 +12667,10 @@ function extractSPAData(html) {
|
|
|
12438
12667
|
}
|
|
12439
12668
|
} catch {}
|
|
12440
12669
|
}
|
|
12441
|
-
const
|
|
12442
|
-
if (
|
|
12670
|
+
const initialStatePayload = findWindowAssignmentPayload(html, "__INITIAL_STATE__");
|
|
12671
|
+
if (initialStatePayload) {
|
|
12443
12672
|
try {
|
|
12444
|
-
const parsed = JSON.parse(
|
|
12673
|
+
const parsed = JSON.parse(initialStatePayload);
|
|
12445
12674
|
if (parsed && typeof parsed === "object" && Object.keys(parsed).length > 0) {
|
|
12446
12675
|
results.push({
|
|
12447
12676
|
type: "spa-initial-state",
|
|
@@ -12451,10 +12680,10 @@ function extractSPAData(html) {
|
|
|
12451
12680
|
}
|
|
12452
12681
|
} catch {}
|
|
12453
12682
|
}
|
|
12454
|
-
const
|
|
12455
|
-
if (
|
|
12683
|
+
const preloadedPayload = findWindowAssignmentPayload(html, "__PRELOADED_STATE__");
|
|
12684
|
+
if (preloadedPayload) {
|
|
12456
12685
|
try {
|
|
12457
|
-
const parsed = JSON.parse(
|
|
12686
|
+
const parsed = JSON.parse(preloadedPayload);
|
|
12458
12687
|
if (parsed && typeof parsed === "object" && Object.keys(parsed).length > 0) {
|
|
12459
12688
|
results.push({
|
|
12460
12689
|
type: "spa-preloaded-state",
|
|
@@ -12464,6 +12693,59 @@ function extractSPAData(html) {
|
|
|
12464
12693
|
}
|
|
12465
12694
|
} catch {}
|
|
12466
12695
|
}
|
|
12696
|
+
const apolloPayload = findWindowAssignmentPayload(html, "__APOLLO_STATE__");
|
|
12697
|
+
if (apolloPayload) {
|
|
12698
|
+
try {
|
|
12699
|
+
const parsed = JSON.parse(apolloPayload);
|
|
12700
|
+
if (parsed && typeof parsed === "object" && Object.keys(parsed).length > 0) {
|
|
12701
|
+
results.push({
|
|
12702
|
+
type: "spa-initial-state",
|
|
12703
|
+
data: parsed,
|
|
12704
|
+
element_count: countDataElements(parsed)
|
|
12705
|
+
});
|
|
12706
|
+
}
|
|
12707
|
+
} catch {}
|
|
12708
|
+
}
|
|
12709
|
+
const nextFPayloads = [];
|
|
12710
|
+
const nextFRe = /self\.__next_f\.push\(\s*\[\s*\d+\s*,\s*("(?:[^"\\]|\\.)*")/g;
|
|
12711
|
+
let nextFMatch;
|
|
12712
|
+
while (nextFMatch = nextFRe.exec(html)) {
|
|
12713
|
+
try {
|
|
12714
|
+
const decoded = JSON.parse(nextFMatch[1]);
|
|
12715
|
+
if (typeof decoded === "string" && decoded.length > 0) {
|
|
12716
|
+
nextFPayloads.push(decoded);
|
|
12717
|
+
}
|
|
12718
|
+
} catch {}
|
|
12719
|
+
}
|
|
12720
|
+
if (nextFPayloads.length > 0) {
|
|
12721
|
+
const combined = nextFPayloads.join("");
|
|
12722
|
+
const fragments = [];
|
|
12723
|
+
for (let i = 0;i < combined.length; i++) {
|
|
12724
|
+
const c = combined[i];
|
|
12725
|
+
if (c !== "{" && c !== "[")
|
|
12726
|
+
continue;
|
|
12727
|
+
const body = sliceBalancedAny(combined, i);
|
|
12728
|
+
if (!body)
|
|
12729
|
+
continue;
|
|
12730
|
+
try {
|
|
12731
|
+
const parsed = JSON.parse(body);
|
|
12732
|
+
if (parsed && typeof parsed === "object") {
|
|
12733
|
+
fragments.push(parsed);
|
|
12734
|
+
}
|
|
12735
|
+
} catch {}
|
|
12736
|
+
i += body.length - 1;
|
|
12737
|
+
}
|
|
12738
|
+
const scored = fragments.filter((f) => f && typeof f === "object").map((f) => ({ data: f, count: countDataElements(f) })).sort((a, b) => b.count - a.count).slice(0, 3);
|
|
12739
|
+
for (const entry of scored) {
|
|
12740
|
+
if (entry.count >= 2) {
|
|
12741
|
+
results.push({
|
|
12742
|
+
type: "spa-initial-state",
|
|
12743
|
+
data: entry.data,
|
|
12744
|
+
element_count: entry.count
|
|
12745
|
+
});
|
|
12746
|
+
}
|
|
12747
|
+
}
|
|
12748
|
+
}
|
|
12467
12749
|
return results;
|
|
12468
12750
|
}
|
|
12469
12751
|
function countDataElements(obj, depth = 0) {
|
|
@@ -13421,6 +13703,7 @@ function extractFromDOMWithHint(html, intent, hint) {
|
|
|
13421
13703
|
return extractFromDOM(html, intent);
|
|
13422
13704
|
}
|
|
13423
13705
|
function extractFromDOM(html, intent) {
|
|
13706
|
+
const spaStructures = extractSPAData(html);
|
|
13424
13707
|
const MAX_HTML_SIZE = 300000;
|
|
13425
13708
|
let workingHtml = html;
|
|
13426
13709
|
if (workingHtml.length > MAX_HTML_SIZE) {
|
|
@@ -13434,7 +13717,6 @@ function extractFromDOM(html, intent) {
|
|
|
13434
13717
|
}
|
|
13435
13718
|
}
|
|
13436
13719
|
}
|
|
13437
|
-
const spaStructures = extractSPAData(workingHtml);
|
|
13438
13720
|
const flashStructures = extractFlashNoticeSpecial(workingHtml, intent);
|
|
13439
13721
|
const cleaned = cleanDOM(workingHtml);
|
|
13440
13722
|
const githubStructures = extractGitHubSpecial(workingHtml, intent);
|
|
@@ -15515,6 +15797,7 @@ __export(exports_execution, {
|
|
|
15515
15797
|
isBundleInferredEndpoint: () => isBundleInferredEndpoint,
|
|
15516
15798
|
executeSkill: () => executeSkill,
|
|
15517
15799
|
executeEndpoint: () => executeEndpoint,
|
|
15800
|
+
detectBrowserBlockSignals: () => detectBrowserBlockSignals,
|
|
15518
15801
|
deriveStructuredDataReplayUrl: () => deriveStructuredDataReplayUrl,
|
|
15519
15802
|
deriveStructuredDataReplayTemplate: () => deriveStructuredDataReplayTemplate,
|
|
15520
15803
|
deriveStructuredDataReplayCandidatesFromInputs: () => deriveStructuredDataReplayCandidatesFromInputs,
|
|
@@ -16072,8 +16355,10 @@ function buildPageArtifactCapture(url, intent, html, authRequired = false) {
|
|
|
16072
16355
|
}
|
|
16073
16356
|
const searchForms = detectSearchForms(html);
|
|
16074
16357
|
const validSearchForm = searchForms.find((spec) => isStructuredSearchForm(spec));
|
|
16358
|
+
const isSpaSource = extracted.extraction_method.startsWith("spa-");
|
|
16075
16359
|
const response_schema = inferSchema([extracted.data]);
|
|
16076
16360
|
const computedTemplate = templatizeQueryParams(url);
|
|
16361
|
+
const description = validSearchForm ? `Captured search form artifact for ${intent}` : isSpaSource ? `SSR embedded data (${extracted.extraction_method}) for ${intent}` : `Captured page artifact for ${intent}`;
|
|
16077
16362
|
const endpoint = {
|
|
16078
16363
|
endpoint_id: stableEndpointId2("GET", computedTemplate),
|
|
16079
16364
|
method: "GET",
|
|
@@ -16081,7 +16366,7 @@ function buildPageArtifactCapture(url, intent, html, authRequired = false) {
|
|
|
16081
16366
|
idempotency: "safe",
|
|
16082
16367
|
verification_status: "verified",
|
|
16083
16368
|
reliability_score: extracted.confidence,
|
|
16084
|
-
description
|
|
16369
|
+
description,
|
|
16085
16370
|
response_schema,
|
|
16086
16371
|
dom_extraction: {
|
|
16087
16372
|
extraction_method: extracted.extraction_method,
|
|
@@ -16233,7 +16518,8 @@ async function trySeedStructuredDocumentSkill(skill, url, intent, params, target
|
|
|
16233
16518
|
};
|
|
16234
16519
|
let learned = localDraft;
|
|
16235
16520
|
const validation = await validateManifest({ ...localDraft, skill_id: "__validate__" });
|
|
16236
|
-
|
|
16521
|
+
const admission = selectMarketplacePublishEndpoints(localDraft);
|
|
16522
|
+
if (validation.valid && admission.endpoints.length > 0) {
|
|
16237
16523
|
try {
|
|
16238
16524
|
const { operation_graph: _graph, ...publishDraft } = localDraft;
|
|
16239
16525
|
const published = await publishSkill3(publishDraft);
|
|
@@ -16245,6 +16531,8 @@ async function trySeedStructuredDocumentSkill(skill, url, intent, params, target
|
|
|
16245
16531
|
} catch {
|
|
16246
16532
|
learned = localDraft;
|
|
16247
16533
|
}
|
|
16534
|
+
} else if (admission.endpoints.length === 0) {
|
|
16535
|
+
console.warn(`[publish] direct publish skipped for ${localDraft.skill_id}: ${admission.stats.by_reason.dom_fallback_only > 0 ? "dom_fallback_only" : "no admitted endpoints"}`);
|
|
16248
16536
|
}
|
|
16249
16537
|
try {
|
|
16250
16538
|
cachePublishedSkill(learned);
|
|
@@ -16392,7 +16680,8 @@ async function trySeedPublicDocumentFetchSkill(skill, url, intent, targetDomain,
|
|
|
16392
16680
|
};
|
|
16393
16681
|
let learned = localDraft;
|
|
16394
16682
|
const validation = await validateManifest({ ...localDraft, skill_id: "__validate__" });
|
|
16395
|
-
|
|
16683
|
+
const admission = selectMarketplacePublishEndpoints(localDraft);
|
|
16684
|
+
if (validation.valid && admission.endpoints.length > 0) {
|
|
16396
16685
|
try {
|
|
16397
16686
|
const { operation_graph: _graph, ...publishDraft } = localDraft;
|
|
16398
16687
|
const published = await publishSkill3(publishDraft);
|
|
@@ -16404,6 +16693,8 @@ async function trySeedPublicDocumentFetchSkill(skill, url, intent, targetDomain,
|
|
|
16404
16693
|
} catch {
|
|
16405
16694
|
learned = localDraft;
|
|
16406
16695
|
}
|
|
16696
|
+
} else if (admission.endpoints.length === 0) {
|
|
16697
|
+
console.warn(`[publish] direct publish skipped for ${localDraft.skill_id}: ${admission.stats.by_reason.dom_fallback_only > 0 ? "dom_fallback_only" : "no admitted endpoints"}`);
|
|
16407
16698
|
}
|
|
16408
16699
|
try {
|
|
16409
16700
|
cachePublishedSkill(learned);
|
|
@@ -16627,7 +16918,62 @@ async function executeBrowserCapture(skill, params, options) {
|
|
|
16627
16918
|
};
|
|
16628
16919
|
}
|
|
16629
16920
|
}
|
|
16630
|
-
const
|
|
16921
|
+
const extractionTrace = {};
|
|
16922
|
+
const endpoints = extractEndpoints(captured.requests, captured.ws_messages, { pageUrl: url, finalUrl: captured.final_url, intent }, extractionTrace);
|
|
16923
|
+
const computeCapturedMeta = () => {
|
|
16924
|
+
const html = captured.html ?? "";
|
|
16925
|
+
const titleMatch = html.toLowerCase().match(/<title[^>]*>([^<]{0,200})<\/title>/);
|
|
16926
|
+
const title = titleMatch ? titleMatch[1].trim() : "";
|
|
16927
|
+
const stripped = html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "").replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "");
|
|
16928
|
+
const text = stripped.replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim();
|
|
16929
|
+
let intentVerdict = "skip";
|
|
16930
|
+
let intentReason = "no_semantic_assessment";
|
|
16931
|
+
if (text && intent) {
|
|
16932
|
+
try {
|
|
16933
|
+
const assessment = assessIntentResult(text, intent);
|
|
16934
|
+
intentVerdict = assessment.verdict;
|
|
16935
|
+
intentReason = assessment.reason;
|
|
16936
|
+
} catch {}
|
|
16937
|
+
}
|
|
16938
|
+
const rows = extractionTrace.rows ?? [];
|
|
16939
|
+
const rejectionCounts = {};
|
|
16940
|
+
const samplesByReason = {};
|
|
16941
|
+
const PER_REASON_SAMPLE_CAP = 5;
|
|
16942
|
+
for (const row of rows) {
|
|
16943
|
+
if (row.kept === true)
|
|
16944
|
+
continue;
|
|
16945
|
+
const reason = String(row.reason ?? "unknown");
|
|
16946
|
+
rejectionCounts[reason] = (rejectionCounts[reason] ?? 0) + 1;
|
|
16947
|
+
if (typeof row.url === "string") {
|
|
16948
|
+
const bucket = samplesByReason[reason] ?? (samplesByReason[reason] = []);
|
|
16949
|
+
if (bucket.length < PER_REASON_SAMPLE_CAP)
|
|
16950
|
+
bucket.push(row.url);
|
|
16951
|
+
}
|
|
16952
|
+
}
|
|
16953
|
+
const rejectedSamples = [];
|
|
16954
|
+
for (const [reason, urls] of Object.entries(samplesByReason)) {
|
|
16955
|
+
for (const u of urls)
|
|
16956
|
+
rejectedSamples.push({ url: u, reason });
|
|
16957
|
+
}
|
|
16958
|
+
const apiCallCount = captured.requests?.length ?? 0;
|
|
16959
|
+
const blockSignals = detectBrowserBlockSignals({
|
|
16960
|
+
requestUrls: (captured.requests ?? []).map((r) => r.url ?? ""),
|
|
16961
|
+
title,
|
|
16962
|
+
htmlLength: html.length,
|
|
16963
|
+
rejectionCounts
|
|
16964
|
+
});
|
|
16965
|
+
return {
|
|
16966
|
+
html_bytes: html.length,
|
|
16967
|
+
title,
|
|
16968
|
+
text_bytes: text.length,
|
|
16969
|
+
observed_api_calls: apiCallCount,
|
|
16970
|
+
intent_verdict: intentVerdict,
|
|
16971
|
+
intent_reason: intentReason,
|
|
16972
|
+
filter_rejections: rejectionCounts,
|
|
16973
|
+
rejected_samples: rejectedSamples,
|
|
16974
|
+
browser_block_signals: blockSignals
|
|
16975
|
+
};
|
|
16976
|
+
};
|
|
16631
16977
|
if (captured.html) {
|
|
16632
16978
|
const detectedForms = detectSearchForms(captured.html);
|
|
16633
16979
|
if (detectedForms.length > 0) {
|
|
@@ -16796,8 +17142,11 @@ async function executeBrowserCapture(skill, params, options) {
|
|
|
16796
17142
|
let learned2 = domDraft;
|
|
16797
17143
|
try {
|
|
16798
17144
|
const validation = await validateManifest({ ...domDraft, skill_id: "__validate__" });
|
|
16799
|
-
|
|
17145
|
+
const admission = selectMarketplacePublishEndpoints(domDraft);
|
|
17146
|
+
if (validation.valid && admission.endpoints.length > 0) {
|
|
16800
17147
|
learned2 = await publishSkill3(domDraft);
|
|
17148
|
+
} else if (admission.endpoints.length === 0) {
|
|
17149
|
+
console.warn(`[publish] dom-artifact publish skipped for ${domDraft.skill_id}: dom_fallback_only (kept local-only)`);
|
|
16801
17150
|
}
|
|
16802
17151
|
} catch {}
|
|
16803
17152
|
if (learned2) {
|
|
@@ -16835,34 +17184,12 @@ async function executeBrowserCapture(skill, params, options) {
|
|
|
16835
17184
|
trace: trace3,
|
|
16836
17185
|
result: {
|
|
16837
17186
|
error: "low_quality_dom_extraction",
|
|
16838
|
-
message: `Structured DOM extraction was rejected for ${url}: ${pageArtifact.quality_note}
|
|
17187
|
+
message: `Structured DOM extraction was rejected for ${url}: ${pageArtifact.quality_note}`,
|
|
17188
|
+
captured_meta: computeCapturedMeta()
|
|
16839
17189
|
}
|
|
16840
17190
|
};
|
|
16841
17191
|
}
|
|
16842
|
-
const capturedMeta = (
|
|
16843
|
-
const html = captured.html ?? "";
|
|
16844
|
-
const titleMatch = html.toLowerCase().match(/<title[^>]*>([^<]{0,200})<\/title>/);
|
|
16845
|
-
const title = titleMatch ? titleMatch[1].trim() : "";
|
|
16846
|
-
const stripped = html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "").replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "");
|
|
16847
|
-
const text = stripped.replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim();
|
|
16848
|
-
let intentVerdict = "skip";
|
|
16849
|
-
let intentReason = "no_semantic_assessment";
|
|
16850
|
-
if (text && intent) {
|
|
16851
|
-
try {
|
|
16852
|
-
const assessment = assessIntentResult(text, intent);
|
|
16853
|
-
intentVerdict = assessment.verdict;
|
|
16854
|
-
intentReason = assessment.reason;
|
|
16855
|
-
} catch {}
|
|
16856
|
-
}
|
|
16857
|
-
return {
|
|
16858
|
-
html_bytes: html.length,
|
|
16859
|
-
title,
|
|
16860
|
-
text_bytes: text.length,
|
|
16861
|
-
observed_api_calls: captured.requests?.length ?? 0,
|
|
16862
|
-
intent_verdict: intentVerdict,
|
|
16863
|
-
intent_reason: intentReason
|
|
16864
|
-
};
|
|
16865
|
-
})();
|
|
17192
|
+
const capturedMeta = computeCapturedMeta();
|
|
16866
17193
|
const trace2 = stampTrace({
|
|
16867
17194
|
trace_id: traceId,
|
|
16868
17195
|
skill_id: skill.skill_id,
|
|
@@ -18077,6 +18404,51 @@ function semanticIntentAdjustment(endpoint, intent) {
|
|
|
18077
18404
|
}
|
|
18078
18405
|
return delta;
|
|
18079
18406
|
}
|
|
18407
|
+
/**
 * Collect heuristic signals suggesting that a browser capture hit a block,
 * challenge or error page rather than real content.
 *
 * Missing or null fields are tolerated: `requestUrls` is treated as an empty
 * list, `title` as an empty string and `rejectionCounts` as an empty object,
 * so a partially populated input yields signals instead of a TypeError.
 *
 * @param {object} input
 * @param {string[]} [input.requestUrls] URLs of the requests seen during capture.
 * @param {string} [input.title] Page title text.
 * @param {number} [input.htmlLength] Length of the captured HTML.
 * @param {Object<string, number>} [input.rejectionCounts] Per-reason counts of
 *   rejected requests; only `not_api_like` and `score_non_positive` are read.
 * @returns {string[]} Signals in a fixed order: `"challenge_title"`, then one
 *   `"vendor:<name>"` per vendor in first-seen order, then the capture-shape
 *   signals (`"sparse_capture_mostly_noise"`, `"empty_capture"`,
 *   `"no_html_many_apis"`, `"low_capture"`).
 */
function detectBrowserBlockSignals(input) {
  const { requestUrls, title, htmlLength, rejectionCounts } = input ?? {};
  const urls = requestUrls ?? [];
  const counts = rejectionCounts ?? {};
  const signals = [];

  // The title is lowercased before matching, exactly as before; the `i` flag is
  // kept so the pattern text stays identical to the shipped one.
  const titleLower = String(title ?? "").toLowerCase();
  const challengeTitle = /just a moment|attention required|access denied|pardon our interruption|captcha|verifying you are human|human verification|are you a robot|bot check|cloudflare|press and hold|request could not be satisfied|403 forbidden|\b404\b|\b502\b|\b503\b|\b504\b|bad gateway|service unavailable|gateway timeout|site blocked|unusual traffic|security check|not[ _.]?found|page (does )?not exist|page doesn't exist|this page can't be|server error/i;
  if (challengeTitle.test(titleLower)) {
    signals.push("challenge_title");
  }

  // Vendor rules are built once per call instead of once per URL. Rule order
  // matters: together with URL order it fixes the order of the vendor signals.
  const vendorRules = [
    // NOTE(review): `KP_UIDz=` and the `<uuid>/<uuid>/ips.js` path look like
    // Kasada markers rather than PerimeterX ones — confirm before relabelling.
    {
      vendor: "perimeterx",
      patterns: [
        /perimeterx|px-cloud|px-cdn|pxhd\.net/i,
        /KP_UIDz=/,
        /\/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\/(ips\.js|tl|xhr|init)/i
      ]
    },
    { vendor: "datadome", patterns: [/datadome|js\.datadome|dd\.datadome|_dd\.s|ddjskey/i] },
    // NOTE(review): `akamaihd` is filed under Imperva here; presumably it
    // belongs with the Akamai rule below — verify against consumers first.
    { vendor: "imperva_incapsula", patterns: [/akamaihd|ak-challenge|_Incapsula|incapsula|reese84/i] },
    { vendor: "akamai_bot_manager", patterns: [/akam\.net|bot-defender|\/_bm\/|sensor[-_]data|bm\.nuid|_abck/i] },
    { vendor: "cloudflare", patterns: [/cf-challenge|__cf_chl_|turnstile|challenges\.cloudflare/i] },
    { vendor: "captcha_vendor", patterns: [/hcaptcha|recaptcha|arkoselabs|funcaptcha/i] },
    { vendor: "shape_security", patterns: [/shape\.security|f5\.com\/shape|ShapeSecurity/i] },
    { vendor: "kasada", patterns: [/kasada|client\.kasada|ips\.kasada/i] }
  ];

  // A Set de-duplicates vendors while remembering first-seen order.
  const vendorHits = new Set();
  for (const u of urls) {
    for (const { vendor, patterns } of vendorRules) {
      if (patterns.some((pattern) => pattern.test(u))) {
        vendorHits.add(vendor);
      }
    }
  }
  for (const v of vendorHits) {
    signals.push(`vendor:${v}`);
  }

  const apiCallCount = urls.length;
  const noisyRejections = (counts.not_api_like ?? 0) + (counts.score_non_positive ?? 0);

  // Small capture (1-20 requests) where at least ~60% of requests, and never
  // fewer than one, were rejected as noise.
  if (apiCallCount > 0 && apiCallCount <= 20 && noisyRejections >= Math.max(1, Math.floor(apiCallCount * 0.6))) {
    signals.push("sparse_capture_mostly_noise");
  }

  // The three short-HTML (< 500) cases are mutually exclusive on request count.
  if (htmlLength < 500 && apiCallCount === 0) {
    signals.push("empty_capture");
  }
  if (htmlLength < 500 && apiCallCount >= 30) {
    signals.push("no_html_many_apis");
  }
  if (htmlLength < 500 && apiCallCount > 0 && apiCallCount < 30) {
    signals.push("low_capture");
  }
  return signals;
}
|
|
18080
18452
|
function rankEndpoints(endpoints, intent, skillDomain, contextUrl) {
|
|
18081
18453
|
const NOISE_HOSTS = /(id5-sync\.com|btloader\.com|presage\.io|onetrust\.com|adsrvr\.org|googlesyndication\.com|adtrafficquality\.google|amazon-adsystem\.com|crazyegg\.com|challenges\.cloudflare\.com|google-analytics\.com|doubleclick\.net|gstatic\.com|accounts\.google\.com|login\.microsoftonline\.com|auth0\.com|cognito-idp\.|protechts\.net|demdex\.net|datadoghq\.com|fullstory\.com|launchdarkly\.com|intercom\.io|sentry\.io|segment\.io|amplitude\.com|mixpanel\.com|hotjar\.com|clarity\.ms|googletagmanager\.com|walletconnect\.com|cloudflareinsights\.com|fonts\.googleapis\.com|recaptcha|waa-pa\.|signaler-pa\.|ogads-pa\.|reddit\.com\/pixels?|pixel-config\.|dns-finder\.com|cookieconsentpub|firebase\.googleapis\.com|firebaseinstallations\.googleapis\.com|identitytoolkit\.googleapis\.com|securetoken\.googleapis\.com|apis\.google\.com|connect\.facebook\.net|bat\.bing\.com|static\.cloudflareinsights\.com|cdn\.mxpnl\.com|js\.hs-analytics\.net|snap\.licdn\.com|clc\.stackoverflow\.com|px\.ads|t\.co\/i|analytics\.|telemetry\.|stats\.)/i;
|
|
18082
18454
|
const NOISE_PATHS = /\/(track|pixel|telemetry|beacon|csp-report|litms|demdex|analytics|protechts|collect|tr\/|gen_204|generate_204|log$|logging|heartbeat|metrics|consent|sodar|tag$|event$|events$|impression|pageview|click|__|adx\/|\/cm\/ttc|\/pfb$|_stm$|videoads\/|prerolls|phantom\/)/i;
|
|
@@ -18209,7 +18581,7 @@ function rankEndpoints(endpoints, intent, skillDomain, contextUrl) {
|
|
|
18209
18581
|
score += bm25Score(queryTokens, docs[i], avgDl, docCount, docFreqs) * 20;
|
|
18210
18582
|
}
|
|
18211
18583
|
if (descriptionMeta.source === "agent" && descriptionMeta.display && rawTokens.length > 0) {
|
|
18212
|
-
const descTokens = new Set(descriptionMeta.display.toLowerCase().replace(/[^a-z0-9]+/g, " ").split(/\s+/).filter((w) => w.length > 1 && !STOPWORDS.has(w)).map((w) => stem(w)));
|
|
18584
|
+
const descTokens = new Set(descriptionMeta.display.replace(/([a-z0-9])([A-Z])/g, "$1 $2").replace(/([A-Z]+)([A-Z][a-z])/g, "$1 $2").toLowerCase().replace(/[^a-z0-9]+/g, " ").split(/\s+/).filter((w) => w.length > 1 && !STOPWORDS.has(w)).map((w) => stem(w)));
|
|
18213
18585
|
const rawStems = new Set(rawTokens.map((t) => stem(t)));
|
|
18214
18586
|
let matches = 0;
|
|
18215
18587
|
for (const t of rawStems) {
|
|
@@ -18479,6 +18851,7 @@ var init_execution = __esm(async () => {
|
|
|
18479
18851
|
init_bundle_scanner();
|
|
18480
18852
|
init_token_resolver();
|
|
18481
18853
|
init_marketplace();
|
|
18854
|
+
init_publish_admission();
|
|
18482
18855
|
init_transform();
|
|
18483
18856
|
init_drift();
|
|
18484
18857
|
init_client2();
|
|
@@ -20131,7 +20504,19 @@ function isCachedSkillRelevantForIntent(skill, intent, contextUrl) {
|
|
|
20131
20504
|
if (collectExplicitSearchContextBindingKeys(contextUrl).size > 0)
|
|
20132
20505
|
return false;
|
|
20133
20506
|
}
|
|
20134
|
-
|
|
20507
|
+
if ((top?.score ?? Number.NEGATIVE_INFINITY) >= 0)
|
|
20508
|
+
return true;
|
|
20509
|
+
if (top && top.score >= -5 && contextUrl) {
|
|
20510
|
+
try {
|
|
20511
|
+
const epHost = new URL(top.endpoint.url_template).hostname;
|
|
20512
|
+
const ctxHost = new URL(contextUrl).hostname;
|
|
20513
|
+
const epReg = getRegistrableDomain(epHost);
|
|
20514
|
+
const ctxReg = getRegistrableDomain(ctxHost);
|
|
20515
|
+
if (epReg && ctxReg && epReg === ctxReg)
|
|
20516
|
+
return true;
|
|
20517
|
+
} catch {}
|
|
20518
|
+
}
|
|
20519
|
+
return false;
|
|
20135
20520
|
}
|
|
20136
20521
|
function assessLocalExecutionResult(endpoint, result, intent, trace) {
|
|
20137
20522
|
const semanticAssessment = assessIntentResult(result, intent);
|
|
@@ -22617,6 +23002,8 @@ async function resolveAndExecute(intent, params = {}, context, projection, optio
|
|
|
22617
23002
|
error: `No relevant endpoint discovered for "${queryIntent}"`
|
|
22618
23003
|
};
|
|
22619
23004
|
console.warn(`[capture] dropping learned skill with no relevant endpoints for "${queryIntent}"`);
|
|
23005
|
+
const totalEndpoints = resolvedSkill.endpoints.length;
|
|
23006
|
+
const captureDiagnostic = totalEndpoints === 0 ? "no_endpoints_extracted" : ranked.length === 0 ? "all_endpoints_filtered_by_noise_rules" : "endpoints_scored_below_relevance_threshold";
|
|
22620
23007
|
return {
|
|
22621
23008
|
result: {
|
|
22622
23009
|
error: `No relevant endpoint discovered for "${queryIntent}"`,
|
|
@@ -22626,6 +23013,8 @@ async function resolveAndExecute(intent, params = {}, context, projection, optio
|
|
|
22626
23013
|
description: candidate.endpoint.description,
|
|
22627
23014
|
url: candidate.endpoint.url_template
|
|
22628
23015
|
})),
|
|
23016
|
+
capture_diagnostic: captureDiagnostic,
|
|
23017
|
+
total_endpoints_captured: totalEndpoints,
|
|
22629
23018
|
...authRecommended ? {
|
|
22630
23019
|
auth_recommended: true,
|
|
22631
23020
|
auth_hint: captureResult?.auth_hint
|
package/package.json
CHANGED