unbrowse 3.8.0-preview.2 → 3.8.0-preview.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -31,7 +31,7 @@ var __promiseAll = (args) => Promise.all(args);
31
31
  var __require = /* @__PURE__ */ createRequire(import.meta.url);
32
32
 
33
33
  // ../../src/build-info.generated.ts
34
- var BUILD_RELEASE_VERSION = "3.8.0-preview.2", BUILD_GIT_SHA = "efefee1f85df", BUILD_CODE_HASH = "5d9ebf619c61", BUILD_RELEASE_MANIFEST_BASE64 = "eyJzY2hlbWFfdmVyc2lvbiI6MSwicmVsZWFzZV92ZXJzaW9uIjoiMy44LjAtcHJldmlldy4yIiwiZ2l0X3NoYSI6ImVmZWZlZTFmODVkZiIsImNvZGVfaGFzaCI6IjVkOWViZjYxOWM2MSIsInRyYWNlX3ZlcnNpb24iOiI1ZDllYmY2MTljNjFAZWZlZmVlMWY4NWRmIiwiaXNzdWVkX2F0IjoiMjAyNi0wNC0xMFQxNzowOTo0OS44NzNaIn0", BUILD_RELEASE_MANIFEST_SIGNATURE = "nVliy0ydfucv7W56hI5CiuHzYJ92ve4oJqgzV1NFASg", BUILD_DEFAULT_BACKEND_URL = "https://beta-api.unbrowse.ai";
34
+ var BUILD_RELEASE_VERSION = "3.8.0-preview.3", BUILD_GIT_SHA = "dea0516c7186", BUILD_CODE_HASH = "5d9ebf619c61", BUILD_RELEASE_MANIFEST_BASE64 = "eyJzY2hlbWFfdmVyc2lvbiI6MSwicmVsZWFzZV92ZXJzaW9uIjoiMy44LjAtcHJldmlldy4zIiwiZ2l0X3NoYSI6ImRlYTA1MTZjNzE4NiIsImNvZGVfaGFzaCI6IjVkOWViZjYxOWM2MSIsInRyYWNlX3ZlcnNpb24iOiI1ZDllYmY2MTljNjFAZGVhMDUxNmM3MTg2IiwiaXNzdWVkX2F0IjoiMjAyNi0wNC0xMVQwMjo1OTo0NC4zMTVaIn0", BUILD_RELEASE_MANIFEST_SIGNATURE = "4DMqMBhc7BFMz0Q9wxIWg0Rib1Dv1aVs9waZQIpXK4Q", BUILD_DEFAULT_BACKEND_URL = "https://beta-api.unbrowse.ai";
35
35
 
36
36
  // ../../src/version.ts
37
37
  import { createHash } from "crypto";
@@ -3007,6 +3007,7 @@ var init_execution = __esm(async () => {
3007
3007
  init_bundle_scanner();
3008
3008
  init_token_resolver();
3009
3009
  init_marketplace();
3010
+ init_publish_admission();
3010
3011
  init_transform();
3011
3012
  init_drift();
3012
3013
  init_client();
package/dist/mcp.js CHANGED
@@ -225,11 +225,11 @@ import { dirname, join, parse } from "path";
225
225
  import { fileURLToPath as fileURLToPath2 } from "url";
226
226
 
227
227
  // ../../src/build-info.generated.ts
228
- var BUILD_RELEASE_VERSION = "3.8.0-preview.2";
229
- var BUILD_GIT_SHA = "efefee1f85df";
228
+ var BUILD_RELEASE_VERSION = "3.8.0-preview.3";
229
+ var BUILD_GIT_SHA = "dea0516c7186";
230
230
  var BUILD_CODE_HASH = "5d9ebf619c61";
231
- var BUILD_RELEASE_MANIFEST_BASE64 = "eyJzY2hlbWFfdmVyc2lvbiI6MSwicmVsZWFzZV92ZXJzaW9uIjoiMy44LjAtcHJldmlldy4yIiwiZ2l0X3NoYSI6ImVmZWZlZTFmODVkZiIsImNvZGVfaGFzaCI6IjVkOWViZjYxOWM2MSIsInRyYWNlX3ZlcnNpb24iOiI1ZDllYmY2MTljNjFAZWZlZmVlMWY4NWRmIiwiaXNzdWVkX2F0IjoiMjAyNi0wNC0xMFQxNzowOTo0OS44NzNaIn0";
232
- var BUILD_RELEASE_MANIFEST_SIGNATURE = "nVliy0ydfucv7W56hI5CiuHzYJ92ve4oJqgzV1NFASg";
231
+ var BUILD_RELEASE_MANIFEST_BASE64 = "eyJzY2hlbWFfdmVyc2lvbiI6MSwicmVsZWFzZV92ZXJzaW9uIjoiMy44LjAtcHJldmlldy4zIiwiZ2l0X3NoYSI6ImRlYTA1MTZjNzE4NiIsImNvZGVfaGFzaCI6IjVkOWViZjYxOWM2MSIsInRyYWNlX3ZlcnNpb24iOiI1ZDllYmY2MTljNjFAZGVhMDUxNmM3MTg2IiwiaXNzdWVkX2F0IjoiMjAyNi0wNC0xMVQwMjo1OTo0NC4zMTVaIn0";
232
+ var BUILD_RELEASE_MANIFEST_SIGNATURE = "4DMqMBhc7BFMz0Q9wxIWg0Rib1Dv1aVs9waZQIpXK4Q";
233
233
  var BUILD_DEFAULT_BACKEND_URL = "https://beta-api.unbrowse.ai";
234
234
 
235
235
  // ../../src/version.ts
package/dist/server.js CHANGED
@@ -4234,6 +4234,17 @@ function getIntentEntityRules(kind) {
4234
4234
  }
4235
4235
  }
4236
4236
  function isSemanticallyAdmissibleResponse(req, sampleResponse, sampleRequest, context) {
4237
+ if (/\/graphql(\/|$|\?)/i.test(req.url)) {
4238
+ const noiseOpRe = /globalnav|sidenav|navdash|topnav|nav_|notification|notif_|preloadstate|tracking|telemetry|pingback|heartbeat|presence|rightrail|gno_|viewertracking|metadata$|\bmetadata\b/i;
4239
+ const opName = extractGraphQLOperationName(req.url, req.request_body) ?? "";
4240
+ const opMatchesNoise = !!opName && noiseOpRe.test(opName);
4241
+ const urlMatchesNoise = noiseOpRe.test(req.url);
4242
+ const bodyMatchesNoise = typeof req.request_body === "string" && noiseOpRe.test(req.request_body.slice(0, 500));
4243
+ if (opMatchesNoise || urlMatchesNoise || bodyMatchesNoise) {
4244
+ return { ok: false, reason: "graphql_noise_operation" };
4245
+ }
4246
+ return { ok: true, reason: "semantic_graphql_bypass" };
4247
+ }
4237
4248
  const kind = inferIntentEntityKind(context?.intent);
4238
4249
  const action2 = inferIntentActionKind(context?.intent);
4239
4250
  if (!kind) {
@@ -4338,7 +4349,7 @@ function scoreRequest(req) {
4338
4349
  }
4339
4350
  return score;
4340
4351
  }
4341
- function extractEndpoints(requests, wsMessages, context) {
4352
+ function extractEndpoints(requests, wsMessages, context, traceSink) {
4342
4353
  const seen = new Set;
4343
4354
  const endpoints = [];
4344
4355
  const traceRows = [];
@@ -4362,6 +4373,16 @@ function extractEndpoints(requests, wsMessages, context) {
4362
4373
  continue;
4363
4374
  }
4364
4375
  if (!hasAdmissibleParsedBody(req.response_body)) {
4376
+ const body = req.response_body;
4377
+ if (typeof body === "string" && body.length > 20) {
4378
+ const trimmed = body.trimStart().slice(0, 200).toLowerCase();
4379
+ const looksCss = /^[.#@a-z][a-z0-9_\-]*\s*\{|^body\s*\{|^@media|^@import|^@font-face/.test(trimmed);
4380
+ const looksJs = /^function\s|^var\s|^let\s|^const\s|^import\s|^\(function|^\/\*|^\!function/.test(trimmed) && !trimmed.includes('{"');
4381
+ if (looksCss || looksJs) {
4382
+ traceRows.push({ url: req.url, method: req.method, score, kept: false, reason: "body_not_json_or_html" });
4383
+ continue;
4384
+ }
4385
+ }
4365
4386
  const urlPath = (() => {
4366
4387
  try {
4367
4388
  return new URL(req.url).pathname;
@@ -4369,7 +4390,7 @@ function extractEndpoints(requests, wsMessages, context) {
4369
4390
  return "";
4370
4391
  }
4371
4392
  })();
4372
- const isApiUrl = /\/(api|graphql|youtubei|__ssr_data__)\b/i.test(urlPath) || /\.(json)(\?|$)/.test(req.url);
4393
+ const isApiUrl = /\/(api|graphql|youtubei|__ssr_data__|_next\/data|xhr|ajax|rest)\b/i.test(urlPath) || /\/v\d+\//i.test(urlPath) || /\/async[-_]/i.test(urlPath) || /[-_](state|data|feed|timeline|search|list|results)(\?|$|\/)/i.test(urlPath) || /\.(json)(\?|$)/.test(req.url);
4373
4394
  let graphqlOpName;
4374
4395
  if (/graphql/i.test(req.url)) {
4375
4396
  graphqlOpName = extractGraphQLOperationName(req.url, req.request_body);
@@ -4393,8 +4414,21 @@ function extractEndpoints(requests, wsMessages, context) {
4393
4414
  const reqHost = new URL(req.url).hostname;
4394
4415
  const reqDomain = getRegistrableDomain(reqHost);
4395
4416
  if (!affinityDomains.has(reqDomain)) {
4396
- traceRows.push({ url: req.url, method: req.method, score, kept: false, reason: "domain_mismatch" });
4397
- continue;
4417
+ const reqBrand = reqDomain.split(".")[0] ?? "";
4418
+ let siblingOk = false;
4419
+ if (reqBrand.length >= 4) {
4420
+ for (const a of affinityDomains) {
4421
+ const aBrand = a.split(".")[0] ?? "";
4422
+ if (aBrand.length >= 4 && (reqBrand.startsWith(aBrand) || aBrand.startsWith(reqBrand))) {
4423
+ siblingOk = true;
4424
+ break;
4425
+ }
4426
+ }
4427
+ }
4428
+ if (!siblingOk) {
4429
+ traceRows.push({ url: req.url, method: req.method, score, kept: false, reason: "domain_mismatch" });
4430
+ continue;
4431
+ }
4398
4432
  }
4399
4433
  } catch {
4400
4434
  traceRows.push({ url: req.url, method: req.method, score, kept: false, reason: "bad_url" });
@@ -4599,6 +4633,8 @@ function extractEndpoints(requests, wsMessages, context) {
4599
4633
  resource_kind: endpoint.semantic?.resource_kind
4600
4634
  }))
4601
4635
  });
4636
+ if (traceSink)
4637
+ traceSink.rows = traceRows;
4602
4638
  return endpoints;
4603
4639
  }
4604
4640
  function isApiLike(req) {
@@ -7120,7 +7156,7 @@ var init_capture = __esm(async () => {
7120
7156
  });
7121
7157
 
7122
7158
  // ../../src/build-info.generated.ts
7123
- var BUILD_RELEASE_VERSION = "3.8.0-preview.2", BUILD_GIT_SHA = "efefee1f85df", BUILD_CODE_HASH = "5d9ebf619c61", BUILD_RELEASE_MANIFEST_BASE64 = "eyJzY2hlbWFfdmVyc2lvbiI6MSwicmVsZWFzZV92ZXJzaW9uIjoiMy44LjAtcHJldmlldy4yIiwiZ2l0X3NoYSI6ImVmZWZlZTFmODVkZiIsImNvZGVfaGFzaCI6IjVkOWViZjYxOWM2MSIsInRyYWNlX3ZlcnNpb24iOiI1ZDllYmY2MTljNjFAZWZlZmVlMWY4NWRmIiwiaXNzdWVkX2F0IjoiMjAyNi0wNC0xMFQxNzowOTo0OS44NzNaIn0", BUILD_RELEASE_MANIFEST_SIGNATURE = "nVliy0ydfucv7W56hI5CiuHzYJ92ve4oJqgzV1NFASg", BUILD_DEFAULT_BACKEND_URL = "https://beta-api.unbrowse.ai";
7159
+ var BUILD_RELEASE_VERSION = "3.8.0-preview.3", BUILD_GIT_SHA = "dea0516c7186", BUILD_CODE_HASH = "5d9ebf619c61", BUILD_RELEASE_MANIFEST_BASE64 = "eyJzY2hlbWFfdmVyc2lvbiI6MSwicmVsZWFzZV92ZXJzaW9uIjoiMy44LjAtcHJldmlldy4zIiwiZ2l0X3NoYSI6ImRlYTA1MTZjNzE4NiIsImNvZGVfaGFzaCI6IjVkOWViZjYxOWM2MSIsInRyYWNlX3ZlcnNpb24iOiI1ZDllYmY2MTljNjFAZGVhMDUxNmM3MTg2IiwiaXNzdWVkX2F0IjoiMjAyNi0wNC0xMVQwMjo1OTo0NC4zMTVaIn0", BUILD_RELEASE_MANIFEST_SIGNATURE = "4DMqMBhc7BFMz0Q9wxIWg0Rib1Dv1aVs9waZQIpXK4Q", BUILD_DEFAULT_BACKEND_URL = "https://beta-api.unbrowse.ai";
7124
7160
 
7125
7161
  // ../../src/version.ts
7126
7162
  import { createHash as createHash2 } from "crypto";
@@ -8793,6 +8829,7 @@ function freshReasonCounts() {
8793
8829
  noise: 0,
8794
8830
  fragile_graphql: 0,
8795
8831
  no_durable_signal: 0,
8832
+ dom_fallback_only: 0,
8796
8833
  family_dedup: 0,
8797
8834
  over_limit: 0
8798
8835
  };
@@ -8812,6 +8849,9 @@ function looksOpaqueIdentifier(value) {
8812
8849
  return hasAlpha && hasDigit;
8813
8850
  }
8814
8851
  function isCanonicalDocumentReplay(endpoint) {
8852
+ const extractionMethod = endpoint.dom_extraction?.extraction_method ?? "";
8853
+ if (extractionMethod.startsWith("spa-"))
8854
+ return false;
8815
8855
  if (/captured page artifact/i.test(endpoint.description ?? ""))
8816
8856
  return true;
8817
8857
  if (!endpoint.trigger_url)
@@ -8963,6 +9003,27 @@ function selectMarketplacePublishEndpoints(skill, options = {}) {
8963
9003
  score: scoreEndpoint(endpoint)
8964
9004
  });
8965
9005
  }
9006
+ const hasRealEndpoint = candidates.some((c) => !isCanonicalDocumentReplay(c.endpoint));
9007
+ if (!hasRealEndpoint) {
9008
+ reasons.dom_fallback_only += candidates.length;
9009
+ return {
9010
+ endpoints: [],
9011
+ stats: {
9012
+ total: skill.endpoints?.length ?? 0,
9013
+ kept: 0,
9014
+ by_reason: reasons
9015
+ }
9016
+ };
9017
+ }
9018
+ const filteredCandidates = candidates.filter((c) => {
9019
+ if (isCanonicalDocumentReplay(c.endpoint)) {
9020
+ reasons.dom_fallback_only += 1;
9021
+ return false;
9022
+ }
9023
+ return true;
9024
+ });
9025
+ candidates.length = 0;
9026
+ candidates.push(...filteredCandidates);
8966
9027
  candidates.sort((a, b) => {
8967
9028
  if (b.score !== a.score)
8968
9029
  return b.score - a.score;
@@ -12408,6 +12469,161 @@ function extractFlashNoticeSpecial(html, intent) {
12408
12469
  selector: buildReplaySelector(flash)
12409
12470
  }];
12410
12471
  }
12472
/**
 * Extracts the first brace-balanced `{ ... }` substring of `src`, searching
 * for the opening brace at or after `startIdx`.
 *
 * Braces that appear inside quoted runs (double quote, single quote or
 * backtick) are ignored, and a backslash inside a quoted run shields the
 * character that follows it.
 *
 * @param {string} src - Text to scan.
 * @param {number} startIdx - Index from which the opening `{` is searched.
 * @returns {string|null} The balanced substring (both braces included), or
 *   `null` when no `{` exists or the braces never balance.
 */
function sliceBalancedObject(src, startIdx) {
  const begin = src.indexOf("{", startIdx);
  if (begin < 0) {
    return null;
  }
  let openBraces = 0;
  let activeQuote = null; // quote character of the run being skipped, if any
  let skipNext = false; // true right after a backslash inside a quoted run
  for (let pos = begin; pos < src.length; pos += 1) {
    const ch = src[pos];
    if (skipNext) {
      skipNext = false;
      continue;
    }
    if (activeQuote !== null) {
      if (ch === "\\") {
        skipNext = true;
      } else if (ch === activeQuote) {
        activeQuote = null;
      }
      continue;
    }
    switch (ch) {
      case '"':
      case "'":
      case "`":
        activeQuote = ch;
        break;
      case "{":
        openBraces += 1;
        break;
      case "}":
        openBraces -= 1;
        if (openBraces === 0) {
          return src.slice(begin, pos + 1);
        }
        break;
      default:
        break;
    }
  }
  return null;
}
12511
/**
 * Extracts a balanced `{ ... }` or `[ ... ]` substring that begins exactly at
 * `startIdx`.
 *
 * Only the bracket kind found at `startIdx` is counted for nesting; the other
 * kind is passed over. Brackets inside quoted runs (double quote, single
 * quote or backtick) are ignored, and a backslash inside a quoted run shields
 * the character that follows it.
 *
 * @param {string} src - Text to scan.
 * @param {number} startIdx - Index that must hold `{` or `[`.
 * @returns {string|null} The balanced substring, or `null` when `startIdx`
 *   does not point at an opening bracket or the bracket is never closed.
 */
function sliceBalancedAny(src, startIdx) {
  const opener = src[startIdx];
  let closer;
  if (opener === "{") {
    closer = "}";
  } else if (opener === "[") {
    closer = "]";
  } else {
    return null;
  }
  let nesting = 0;
  let activeQuote = null; // quote character of the run being skipped, if any
  let skipNext = false; // true right after a backslash inside a quoted run
  let cursor = startIdx;
  while (cursor < src.length) {
    const ch = src[cursor];
    cursor += 1; // cursor now points one past `ch`
    if (skipNext) {
      skipNext = false;
      continue;
    }
    if (activeQuote !== null) {
      if (ch === "\\") {
        skipNext = true;
      } else if (ch === activeQuote) {
        activeQuote = null;
      }
      continue;
    }
    if (ch === '"' || ch === "'" || ch === "`") {
      activeQuote = ch;
      continue;
    }
    if (ch === opener) {
      nesting += 1;
      continue;
    }
    if (ch === closer) {
      nesting -= 1;
      if (nesting === 0) {
        return src.slice(startIdx, cursor);
      }
    }
  }
  return null;
}
12551
/**
 * Locates `window.<varName> = {` in `html` (case-insensitively) and returns
 * the brace-balanced object text that starts at that `{`.
 *
 * @param {string} html - Document text to search.
 * @param {string} varName - Property name assigned on `window`; it is matched
 *   literally, so regex metacharacters in the name carry no special meaning.
 * @returns {string|null} The balanced object text, or `null` when no such
 *   assignment is found or its braces never balance.
 */
function findWindowAssignmentPayload(html, varName) {
  // Escape the name before building the pattern: unescaped, "a.b" would match
  // "aXb", "$state" could never match, and "x(" would throw a SyntaxError.
  const literalName = String(varName).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const assignRe = new RegExp(String.raw`window\.${literalName}\s*=\s*(\{)`, "i");
  const m = assignRe.exec(html);
  if (!m)
    return null;
  // The match ends on the opening brace, so step back one character to land on it.
  const startIdx = m.index + m[0].length - 1;
  return sliceBalancedObject(html, startIdx);
}
12559
/**
 * Flattens an object shaped like `{ pages: [...], pageParams: ... }` into one
 * list of items.
 *
 * For each page:
 *  - an array page contributes its elements;
 *  - an object page contributes the first array found under a well-known list
 *    key (even when that array is empty), otherwise the first non-empty array
 *    among its values, otherwise the page object itself;
 *  - any other page value is dropped.
 *
 * @param {*} data - Candidate query data.
 * @returns {Array} Merged items; empty when `data` is not an object, has no
 *   non-empty `pages` array, or lacks a `pageParams` property.
 */
function unwrapInfiniteQuery(data) {
  if (!data || typeof data !== "object")
    return [];
  const pages = data.pages;
  if (!Array.isArray(pages) || pages.length === 0)
    return [];
  if (!("pageParams" in data))
    return [];
  const listKeys = ["data", "items", "results", "articles", "posts", "nodes", "records"];
  const merged = [];
  // Append element-wise: `merged.push(...items)` passes every element as a call
  // argument and throws RangeError once a page is large enough.
  const appendAll = (items) => {
    for (const item of items)
      merged.push(item);
  };
  for (const page of pages) {
    if (Array.isArray(page)) {
      appendAll(page);
      continue;
    }
    if (!page || typeof page !== "object")
      continue;
    // Prefer a conventional list key; an empty array there still counts as found.
    let list = listKeys.map((key) => page[key]).find((value) => Array.isArray(value));
    if (list === undefined) {
      // Otherwise take the first non-empty array among the page's own values.
      list = Object.values(page).find((value) => Array.isArray(value) && value.length > 0);
    }
    if (list === undefined) {
      merged.push(page);
    } else {
      appendAll(list);
    }
  }
  return merged;
}
12599
/**
 * Collects the data payload of every entry in `pageProps.dehydratedState.queries`.
 *
 * Each query whose `state.data` is neither `null` nor `undefined` yields one
 * entry: the merged list from `unwrapInfiniteQuery` when that list is
 * non-empty, otherwise the raw data value.
 *
 * @param {*} pageProps - Candidate page props object.
 * @returns {Array} One entry per usable query; empty when `pageProps`, its
 *   `dehydratedState`, or the `queries` array is missing or malformed.
 */
function unwrapDehydratedState(pageProps) {
  const isObjectLike = (value) => Boolean(value) && typeof value === "object";
  if (!isObjectLike(pageProps)) {
    return [];
  }
  const hydration = pageProps.dehydratedState;
  if (!isObjectLike(hydration)) {
    return [];
  }
  const queryList = hydration.queries;
  if (!Array.isArray(queryList)) {
    return [];
  }
  const payloads = [];
  for (const query of queryList) {
    if (!isObjectLike(query)) {
      continue;
    }
    const queryState = query.state;
    if (!isObjectLike(queryState)) {
      continue;
    }
    const payload = queryState.data;
    if (payload === null || payload === undefined) {
      continue;
    }
    const mergedPages = unwrapInfiniteQuery(payload);
    payloads.push(mergedPages.length > 0 ? mergedPages : payload);
  }
  return payloads;
}
12411
12627
  function extractSPAData(html) {
12412
12628
  const results = [];
12413
12629
  const nextDataMatch = html.match(/<script\s+id="__NEXT_DATA__"[^>]*>([\s\S]*?)<\/script>/i);
@@ -12416,18 +12632,31 @@ function extractSPAData(html) {
12416
12632
  const parsed = JSON.parse(nextDataMatch[1]);
12417
12633
  const pageProps = parsed?.props?.pageProps;
12418
12634
  if (pageProps && typeof pageProps === "object" && Object.keys(pageProps).length > 0) {
12419
- results.push({
12420
- type: "spa-nextjs",
12421
- data: pageProps,
12422
- element_count: countDataElements(pageProps)
12423
- });
12635
+ const dehydrated = unwrapDehydratedState(pageProps);
12636
+ for (const qdata of dehydrated) {
12637
+ if (qdata && typeof qdata === "object") {
12638
+ results.push({
12639
+ type: "spa-nextjs",
12640
+ data: qdata,
12641
+ element_count: countDataElements(qdata)
12642
+ });
12643
+ }
12644
+ }
12645
+ const rawPageProps = dehydrated.length > 0 ? Object.fromEntries(Object.entries(pageProps).filter(([key]) => key !== "dehydratedState")) : pageProps;
12646
+ if (rawPageProps && Object.keys(rawPageProps).length > 0) {
12647
+ results.push({
12648
+ type: "spa-nextjs",
12649
+ data: rawPageProps,
12650
+ element_count: countDataElements(rawPageProps)
12651
+ });
12652
+ }
12424
12653
  }
12425
12654
  } catch {}
12426
12655
  }
12427
- const nuxtMatch = html.match(/window\.__NUXT__\s*=\s*(\{[\s\S]*?\});?\s*(?:<\/script>|$)/i);
12428
- if (nuxtMatch) {
12656
+ const nuxtPayload = findWindowAssignmentPayload(html, "__NUXT__");
12657
+ if (nuxtPayload) {
12429
12658
  try {
12430
- const parsed = JSON.parse(nuxtMatch[1]);
12659
+ const parsed = JSON.parse(nuxtPayload);
12431
12660
  const data = parsed?.data?.[0] ?? parsed?.state ?? parsed;
12432
12661
  if (data && typeof data === "object" && Object.keys(data).length > 0) {
12433
12662
  results.push({
@@ -12438,10 +12667,10 @@ function extractSPAData(html) {
12438
12667
  }
12439
12668
  } catch {}
12440
12669
  }
12441
- const initialStateMatch = html.match(/window\.__INITIAL_STATE__\s*=\s*(\{[\s\S]*?\});?\s*(?:<\/script>|$)/i);
12442
- if (initialStateMatch) {
12670
+ const initialStatePayload = findWindowAssignmentPayload(html, "__INITIAL_STATE__");
12671
+ if (initialStatePayload) {
12443
12672
  try {
12444
- const parsed = JSON.parse(initialStateMatch[1]);
12673
+ const parsed = JSON.parse(initialStatePayload);
12445
12674
  if (parsed && typeof parsed === "object" && Object.keys(parsed).length > 0) {
12446
12675
  results.push({
12447
12676
  type: "spa-initial-state",
@@ -12451,10 +12680,10 @@ function extractSPAData(html) {
12451
12680
  }
12452
12681
  } catch {}
12453
12682
  }
12454
- const preloadedMatch = html.match(/window\.__PRELOADED_STATE__\s*=\s*(\{[\s\S]*?\});?\s*(?:<\/script>|$)/i);
12455
- if (preloadedMatch) {
12683
+ const preloadedPayload = findWindowAssignmentPayload(html, "__PRELOADED_STATE__");
12684
+ if (preloadedPayload) {
12456
12685
  try {
12457
- const parsed = JSON.parse(preloadedMatch[1]);
12686
+ const parsed = JSON.parse(preloadedPayload);
12458
12687
  if (parsed && typeof parsed === "object" && Object.keys(parsed).length > 0) {
12459
12688
  results.push({
12460
12689
  type: "spa-preloaded-state",
@@ -12464,6 +12693,59 @@ function extractSPAData(html) {
12464
12693
  }
12465
12694
  } catch {}
12466
12695
  }
12696
+ const apolloPayload = findWindowAssignmentPayload(html, "__APOLLO_STATE__");
12697
+ if (apolloPayload) {
12698
+ try {
12699
+ const parsed = JSON.parse(apolloPayload);
12700
+ if (parsed && typeof parsed === "object" && Object.keys(parsed).length > 0) {
12701
+ results.push({
12702
+ type: "spa-initial-state",
12703
+ data: parsed,
12704
+ element_count: countDataElements(parsed)
12705
+ });
12706
+ }
12707
+ } catch {}
12708
+ }
12709
+ const nextFPayloads = [];
12710
+ const nextFRe = /self\.__next_f\.push\(\s*\[\s*\d+\s*,\s*("(?:[^"\\]|\\.)*")/g;
12711
+ let nextFMatch;
12712
+ while (nextFMatch = nextFRe.exec(html)) {
12713
+ try {
12714
+ const decoded = JSON.parse(nextFMatch[1]);
12715
+ if (typeof decoded === "string" && decoded.length > 0) {
12716
+ nextFPayloads.push(decoded);
12717
+ }
12718
+ } catch {}
12719
+ }
12720
+ if (nextFPayloads.length > 0) {
12721
+ const combined = nextFPayloads.join("");
12722
+ const fragments = [];
12723
+ for (let i = 0;i < combined.length; i++) {
12724
+ const c = combined[i];
12725
+ if (c !== "{" && c !== "[")
12726
+ continue;
12727
+ const body = sliceBalancedAny(combined, i);
12728
+ if (!body)
12729
+ continue;
12730
+ try {
12731
+ const parsed = JSON.parse(body);
12732
+ if (parsed && typeof parsed === "object") {
12733
+ fragments.push(parsed);
12734
+ }
12735
+ } catch {}
12736
+ i += body.length - 1;
12737
+ }
12738
+ const scored = fragments.filter((f) => f && typeof f === "object").map((f) => ({ data: f, count: countDataElements(f) })).sort((a, b) => b.count - a.count).slice(0, 3);
12739
+ for (const entry of scored) {
12740
+ if (entry.count >= 2) {
12741
+ results.push({
12742
+ type: "spa-initial-state",
12743
+ data: entry.data,
12744
+ element_count: entry.count
12745
+ });
12746
+ }
12747
+ }
12748
+ }
12467
12749
  return results;
12468
12750
  }
12469
12751
  function countDataElements(obj, depth = 0) {
@@ -13421,6 +13703,7 @@ function extractFromDOMWithHint(html, intent, hint) {
13421
13703
  return extractFromDOM(html, intent);
13422
13704
  }
13423
13705
  function extractFromDOM(html, intent) {
13706
+ const spaStructures = extractSPAData(html);
13424
13707
  const MAX_HTML_SIZE = 300000;
13425
13708
  let workingHtml = html;
13426
13709
  if (workingHtml.length > MAX_HTML_SIZE) {
@@ -13434,7 +13717,6 @@ function extractFromDOM(html, intent) {
13434
13717
  }
13435
13718
  }
13436
13719
  }
13437
- const spaStructures = extractSPAData(workingHtml);
13438
13720
  const flashStructures = extractFlashNoticeSpecial(workingHtml, intent);
13439
13721
  const cleaned = cleanDOM(workingHtml);
13440
13722
  const githubStructures = extractGitHubSpecial(workingHtml, intent);
@@ -15515,6 +15797,7 @@ __export(exports_execution, {
15515
15797
  isBundleInferredEndpoint: () => isBundleInferredEndpoint,
15516
15798
  executeSkill: () => executeSkill,
15517
15799
  executeEndpoint: () => executeEndpoint,
15800
+ detectBrowserBlockSignals: () => detectBrowserBlockSignals,
15518
15801
  deriveStructuredDataReplayUrl: () => deriveStructuredDataReplayUrl,
15519
15802
  deriveStructuredDataReplayTemplate: () => deriveStructuredDataReplayTemplate,
15520
15803
  deriveStructuredDataReplayCandidatesFromInputs: () => deriveStructuredDataReplayCandidatesFromInputs,
@@ -16072,8 +16355,10 @@ function buildPageArtifactCapture(url, intent, html, authRequired = false) {
16072
16355
  }
16073
16356
  const searchForms = detectSearchForms(html);
16074
16357
  const validSearchForm = searchForms.find((spec) => isStructuredSearchForm(spec));
16358
+ const isSpaSource = extracted.extraction_method.startsWith("spa-");
16075
16359
  const response_schema = inferSchema([extracted.data]);
16076
16360
  const computedTemplate = templatizeQueryParams(url);
16361
+ const description = validSearchForm ? `Captured search form artifact for ${intent}` : isSpaSource ? `SSR embedded data (${extracted.extraction_method}) for ${intent}` : `Captured page artifact for ${intent}`;
16077
16362
  const endpoint = {
16078
16363
  endpoint_id: stableEndpointId2("GET", computedTemplate),
16079
16364
  method: "GET",
@@ -16081,7 +16366,7 @@ function buildPageArtifactCapture(url, intent, html, authRequired = false) {
16081
16366
  idempotency: "safe",
16082
16367
  verification_status: "verified",
16083
16368
  reliability_score: extracted.confidence,
16084
- description: validSearchForm ? `Captured search form artifact for ${intent}` : `Captured page artifact for ${intent}`,
16369
+ description,
16085
16370
  response_schema,
16086
16371
  dom_extraction: {
16087
16372
  extraction_method: extracted.extraction_method,
@@ -16233,7 +16518,8 @@ async function trySeedStructuredDocumentSkill(skill, url, intent, params, target
16233
16518
  };
16234
16519
  let learned = localDraft;
16235
16520
  const validation = await validateManifest({ ...localDraft, skill_id: "__validate__" });
16236
- if (validation.valid) {
16521
+ const admission = selectMarketplacePublishEndpoints(localDraft);
16522
+ if (validation.valid && admission.endpoints.length > 0) {
16237
16523
  try {
16238
16524
  const { operation_graph: _graph, ...publishDraft } = localDraft;
16239
16525
  const published = await publishSkill3(publishDraft);
@@ -16245,6 +16531,8 @@ async function trySeedStructuredDocumentSkill(skill, url, intent, params, target
16245
16531
  } catch {
16246
16532
  learned = localDraft;
16247
16533
  }
16534
+ } else if (admission.endpoints.length === 0) {
16535
+ console.warn(`[publish] direct publish skipped for ${localDraft.skill_id}: ${admission.stats.by_reason.dom_fallback_only > 0 ? "dom_fallback_only" : "no admitted endpoints"}`);
16248
16536
  }
16249
16537
  try {
16250
16538
  cachePublishedSkill(learned);
@@ -16392,7 +16680,8 @@ async function trySeedPublicDocumentFetchSkill(skill, url, intent, targetDomain,
16392
16680
  };
16393
16681
  let learned = localDraft;
16394
16682
  const validation = await validateManifest({ ...localDraft, skill_id: "__validate__" });
16395
- if (validation.valid) {
16683
+ const admission = selectMarketplacePublishEndpoints(localDraft);
16684
+ if (validation.valid && admission.endpoints.length > 0) {
16396
16685
  try {
16397
16686
  const { operation_graph: _graph, ...publishDraft } = localDraft;
16398
16687
  const published = await publishSkill3(publishDraft);
@@ -16404,6 +16693,8 @@ async function trySeedPublicDocumentFetchSkill(skill, url, intent, targetDomain,
16404
16693
  } catch {
16405
16694
  learned = localDraft;
16406
16695
  }
16696
+ } else if (admission.endpoints.length === 0) {
16697
+ console.warn(`[publish] direct publish skipped for ${localDraft.skill_id}: ${admission.stats.by_reason.dom_fallback_only > 0 ? "dom_fallback_only" : "no admitted endpoints"}`);
16407
16698
  }
16408
16699
  try {
16409
16700
  cachePublishedSkill(learned);
@@ -16627,7 +16918,62 @@ async function executeBrowserCapture(skill, params, options) {
16627
16918
  };
16628
16919
  }
16629
16920
  }
16630
- const endpoints = extractEndpoints(captured.requests, captured.ws_messages, { pageUrl: url, finalUrl: captured.final_url, intent });
16921
+ const extractionTrace = {};
16922
+ const endpoints = extractEndpoints(captured.requests, captured.ws_messages, { pageUrl: url, finalUrl: captured.final_url, intent }, extractionTrace);
16923
/**
 * Builds a diagnostic summary of the captured page: HTML and visible-text
 * sizes, the page title, the number of observed requests, the intent
 * assessment, per-reason rejection counts with sample URLs taken from the
 * extraction trace, and the result of `detectBrowserBlockSignals`.
 *
 * Reads `captured`, `intent` and `extractionTrace` from the enclosing scope.
 */
const computeCapturedMeta = () => {
  const html = captured.html ?? "";
  // Match case-insensitively on the raw HTML rather than on a lowercased copy,
  // so the reported title keeps its original casing.
  const titleMatch = html.match(/<title[^>]*>([^<]{0,200})<\/title>/i);
  const title = titleMatch ? titleMatch[1].trim() : "";
  // Approximate the visible text: drop script/style bodies, then all tags.
  const stripped = html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "").replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "");
  const text = stripped.replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim();
  let intentVerdict = "skip";
  let intentReason = "no_semantic_assessment";
  if (text && intent) {
    try {
      const assessment = assessIntentResult(text, intent);
      intentVerdict = assessment.verdict;
      intentReason = assessment.reason;
    } catch {
      // Best effort: a failing assessment leaves the "skip" defaults in place.
    }
  }
  const rows = extractionTrace.rows ?? [];
  const rejectionCounts = {};
  const samplesByReason = {};
  const PER_REASON_SAMPLE_CAP = 5;
  for (const row of rows) {
    if (row.kept === true)
      continue;
    const reason = String(row.reason ?? "unknown");
    rejectionCounts[reason] = (rejectionCounts[reason] ?? 0) + 1;
    if (typeof row.url === "string") {
      // Keep only the first few URLs per reason so the summary stays small.
      const bucket = samplesByReason[reason] ?? (samplesByReason[reason] = []);
      if (bucket.length < PER_REASON_SAMPLE_CAP)
        bucket.push(row.url);
    }
  }
  const rejectedSamples = [];
  for (const [reason, urls] of Object.entries(samplesByReason)) {
    for (const u of urls)
      rejectedSamples.push({ url: u, reason });
  }
  const apiCallCount = captured.requests?.length ?? 0;
  const blockSignals = detectBrowserBlockSignals({
    requestUrls: (captured.requests ?? []).map((r) => r.url ?? ""),
    title,
    htmlLength: html.length,
    rejectionCounts
  });
  return {
    html_bytes: html.length,
    title,
    text_bytes: text.length,
    observed_api_calls: apiCallCount,
    intent_verdict: intentVerdict,
    intent_reason: intentReason,
    filter_rejections: rejectionCounts,
    rejected_samples: rejectedSamples,
    browser_block_signals: blockSignals
  };
};
16631
16977
  if (captured.html) {
16632
16978
  const detectedForms = detectSearchForms(captured.html);
16633
16979
  if (detectedForms.length > 0) {
@@ -16796,8 +17142,11 @@ async function executeBrowserCapture(skill, params, options) {
16796
17142
  let learned2 = domDraft;
16797
17143
  try {
16798
17144
  const validation = await validateManifest({ ...domDraft, skill_id: "__validate__" });
16799
- if (validation.valid) {
17145
+ const admission = selectMarketplacePublishEndpoints(domDraft);
17146
+ if (validation.valid && admission.endpoints.length > 0) {
16800
17147
  learned2 = await publishSkill3(domDraft);
17148
+ } else if (admission.endpoints.length === 0) {
17149
+ console.warn(`[publish] dom-artifact publish skipped for ${domDraft.skill_id}: dom_fallback_only (kept local-only)`);
16801
17150
  }
16802
17151
  } catch {}
16803
17152
  if (learned2) {
@@ -16835,34 +17184,12 @@ async function executeBrowserCapture(skill, params, options) {
16835
17184
  trace: trace3,
16836
17185
  result: {
16837
17186
  error: "low_quality_dom_extraction",
16838
- message: `Structured DOM extraction was rejected for ${url}: ${pageArtifact.quality_note}`
17187
+ message: `Structured DOM extraction was rejected for ${url}: ${pageArtifact.quality_note}`,
17188
+ captured_meta: computeCapturedMeta()
16839
17189
  }
16840
17190
  };
16841
17191
  }
16842
- const capturedMeta = (() => {
16843
- const html = captured.html ?? "";
16844
- const titleMatch = html.toLowerCase().match(/<title[^>]*>([^<]{0,200})<\/title>/);
16845
- const title = titleMatch ? titleMatch[1].trim() : "";
16846
- const stripped = html.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "").replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "");
16847
- const text = stripped.replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim();
16848
- let intentVerdict = "skip";
16849
- let intentReason = "no_semantic_assessment";
16850
- if (text && intent) {
16851
- try {
16852
- const assessment = assessIntentResult(text, intent);
16853
- intentVerdict = assessment.verdict;
16854
- intentReason = assessment.reason;
16855
- } catch {}
16856
- }
16857
- return {
16858
- html_bytes: html.length,
16859
- title,
16860
- text_bytes: text.length,
16861
- observed_api_calls: captured.requests?.length ?? 0,
16862
- intent_verdict: intentVerdict,
16863
- intent_reason: intentReason
16864
- };
16865
- })();
17192
+ const capturedMeta = computeCapturedMeta();
16866
17193
  const trace2 = stampTrace({
16867
17194
  trace_id: traceId,
16868
17195
  skill_id: skill.skill_id,
@@ -18077,6 +18404,51 @@ function semanticIntentAdjustment(endpoint, intent) {
18077
18404
  }
18078
18405
  return delta;
18079
18406
  }
18407
/**
 * Derive heuristic "this capture was probably blocked / degraded" signals
 * from a browser capture.
 *
 * Missing or null fields are tolerated so that a partially populated capture
 * yields fewer signals instead of throwing.
 *
 * @param {object} input
 * @param {Iterable<string>} [input.requestUrls] URLs observed during the capture.
 * @param {string} [input.title] Page title; matched case-insensitively.
 * @param {number} [input.htmlLength] Length of the captured HTML.
 * @param {object} [input.rejectionCounts] Per-reason rejection counters; only
 *   `not_api_like` and `score_non_positive` are read.
 * @returns {string[]} Signals in a stable order: `"challenge_title"`, then
 *   `"vendor:<name>"` entries in first-seen order, then the capture-shape
 *   signals (`"sparse_capture_mostly_noise"`, `"empty_capture"`,
 *   `"no_html_many_apis"`, `"low_capture"`).
 */
function detectBrowserBlockSignals(input) {
  const { requestUrls, title, htmlLength, rejectionCounts } = input ?? {};
  const urls = requestUrls ?? [];
  const counts = rejectionCounts ?? {};
  const signals = [];

  // Titles typical of challenge pages, WAF blocks and HTTP error pages.
  const titleLower = String(title ?? "").toLowerCase();
  if (/just a moment|attention required|access denied|pardon our interruption|captcha|verifying you are human|human verification|are you a robot|bot check|cloudflare|press and hold|request could not be satisfied|403 forbidden|\b404\b|\b502\b|\b503\b|\b504\b|bad gateway|service unavailable|gateway timeout|site blocked|unusual traffic|security check|not[ _.]?found|page (does )?not exist|page doesn't exist|this page can't be|server error/i.test(titleLower)) {
    signals.push("challenge_title");
  }

  // Ordered vendor table: a URL may match several vendors, and the order here
  // (together with URL order) fixes the order of the emitted vendor signals.
  // NOTE(review): `KP_UIDz=` and the uuid/uuid/ips.js path look like Kasada
  // markers, and `akamaihd` looks like an Akamai hostname — labels are kept
  // unchanged here because consumers may key on them; confirm before renaming.
  const vendorPatterns = [
    ["perimeterx", [
      /perimeterx|px-cloud|px-cdn|pxhd\.net/i,
      /KP_UIDz=/,
      /\/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}\/(ips\.js|tl|xhr|init)/i
    ]],
    ["datadome", [/datadome|js\.datadome|dd\.datadome|_dd\.s|ddjskey/i]],
    ["imperva_incapsula", [/akamaihd|ak-challenge|_Incapsula|incapsula|reese84/i]],
    ["akamai_bot_manager", [/akam\.net|bot-defender|\/_bm\/|sensor[-_]data|bm\.nuid|_abck/i]],
    ["cloudflare", [/cf-challenge|__cf_chl_|turnstile|challenges\.cloudflare/i]],
    ["captcha_vendor", [/hcaptcha|recaptcha|arkoselabs|funcaptcha/i]],
    ["shape_security", [/shape\.security|f5\.com\/shape|ShapeSecurity/i]],
    ["kasada", [/kasada|client\.kasada|ips\.kasada/i]]
  ];
  const vendorHits = new Set();
  for (const u of urls) {
    for (const [vendor, patterns] of vendorPatterns) {
      if (patterns.some((re) => re.test(u))) {
        vendorHits.add(vendor);
      }
    }
  }
  for (const v of vendorHits) {
    signals.push(`vendor:${v}`);
  }

  // Capture-shape heuristics based on request volume and HTML size.
  const apiCallCount = urls.length;
  const noisyRejections = (counts.not_api_like ?? 0) + (counts.score_non_positive ?? 0);
  if (apiCallCount > 0 && apiCallCount <= 20 && noisyRejections >= Math.max(1, Math.floor(apiCallCount * 0.6))) {
    signals.push("sparse_capture_mostly_noise");
  }
  // An undefined htmlLength compares false below, so an unknown HTML size
  // never produces an "empty"/"low" signal.
  if (htmlLength < 500 && apiCallCount === 0) {
    signals.push("empty_capture");
  }
  if (htmlLength < 500 && apiCallCount >= 30) {
    signals.push("no_html_many_apis");
  }
  if (htmlLength < 500 && apiCallCount > 0 && apiCallCount < 30) {
    signals.push("low_capture");
  }
  return signals;
}
18080
18452
  function rankEndpoints(endpoints, intent, skillDomain, contextUrl) {
18081
18453
  const NOISE_HOSTS = /(id5-sync\.com|btloader\.com|presage\.io|onetrust\.com|adsrvr\.org|googlesyndication\.com|adtrafficquality\.google|amazon-adsystem\.com|crazyegg\.com|challenges\.cloudflare\.com|google-analytics\.com|doubleclick\.net|gstatic\.com|accounts\.google\.com|login\.microsoftonline\.com|auth0\.com|cognito-idp\.|protechts\.net|demdex\.net|datadoghq\.com|fullstory\.com|launchdarkly\.com|intercom\.io|sentry\.io|segment\.io|amplitude\.com|mixpanel\.com|hotjar\.com|clarity\.ms|googletagmanager\.com|walletconnect\.com|cloudflareinsights\.com|fonts\.googleapis\.com|recaptcha|waa-pa\.|signaler-pa\.|ogads-pa\.|reddit\.com\/pixels?|pixel-config\.|dns-finder\.com|cookieconsentpub|firebase\.googleapis\.com|firebaseinstallations\.googleapis\.com|identitytoolkit\.googleapis\.com|securetoken\.googleapis\.com|apis\.google\.com|connect\.facebook\.net|bat\.bing\.com|static\.cloudflareinsights\.com|cdn\.mxpnl\.com|js\.hs-analytics\.net|snap\.licdn\.com|clc\.stackoverflow\.com|px\.ads|t\.co\/i|analytics\.|telemetry\.|stats\.)/i;
18082
18454
  const NOISE_PATHS = /\/(track|pixel|telemetry|beacon|csp-report|litms|demdex|analytics|protechts|collect|tr\/|gen_204|generate_204|log$|logging|heartbeat|metrics|consent|sodar|tag$|event$|events$|impression|pageview|click|__|adx\/|\/cm\/ttc|\/pfb$|_stm$|videoads\/|prerolls|phantom\/)/i;
@@ -18209,7 +18581,7 @@ function rankEndpoints(endpoints, intent, skillDomain, contextUrl) {
18209
18581
  score += bm25Score(queryTokens, docs[i], avgDl, docCount, docFreqs) * 20;
18210
18582
  }
18211
18583
  if (descriptionMeta.source === "agent" && descriptionMeta.display && rawTokens.length > 0) {
18212
- const descTokens = new Set(descriptionMeta.display.toLowerCase().replace(/[^a-z0-9]+/g, " ").split(/\s+/).filter((w) => w.length > 1 && !STOPWORDS.has(w)).map((w) => stem(w)));
18584
+ const descTokens = new Set(descriptionMeta.display.replace(/([a-z0-9])([A-Z])/g, "$1 $2").replace(/([A-Z]+)([A-Z][a-z])/g, "$1 $2").toLowerCase().replace(/[^a-z0-9]+/g, " ").split(/\s+/).filter((w) => w.length > 1 && !STOPWORDS.has(w)).map((w) => stem(w)));
18213
18585
  const rawStems = new Set(rawTokens.map((t) => stem(t)));
18214
18586
  let matches = 0;
18215
18587
  for (const t of rawStems) {
@@ -18479,6 +18851,7 @@ var init_execution = __esm(async () => {
18479
18851
  init_bundle_scanner();
18480
18852
  init_token_resolver();
18481
18853
  init_marketplace();
18854
+ init_publish_admission();
18482
18855
  init_transform();
18483
18856
  init_drift();
18484
18857
  init_client2();
@@ -20131,7 +20504,19 @@ function isCachedSkillRelevantForIntent(skill, intent, contextUrl) {
20131
20504
  if (collectExplicitSearchContextBindingKeys(contextUrl).size > 0)
20132
20505
  return false;
20133
20506
  }
20134
- return (top?.score ?? Number.NEGATIVE_INFINITY) >= 0;
20507
+ if ((top?.score ?? Number.NEGATIVE_INFINITY) >= 0)
20508
+ return true;
20509
+ if (top && top.score >= -5 && contextUrl) {
20510
+ try {
20511
+ const epHost = new URL(top.endpoint.url_template).hostname;
20512
+ const ctxHost = new URL(contextUrl).hostname;
20513
+ const epReg = getRegistrableDomain(epHost);
20514
+ const ctxReg = getRegistrableDomain(ctxHost);
20515
+ if (epReg && ctxReg && epReg === ctxReg)
20516
+ return true;
20517
+ } catch {}
20518
+ }
20519
+ return false;
20135
20520
  }
20136
20521
  function assessLocalExecutionResult(endpoint, result, intent, trace) {
20137
20522
  const semanticAssessment = assessIntentResult(result, intent);
@@ -22617,6 +23002,8 @@ async function resolveAndExecute(intent, params = {}, context, projection, optio
22617
23002
  error: `No relevant endpoint discovered for "${queryIntent}"`
22618
23003
  };
22619
23004
  console.warn(`[capture] dropping learned skill with no relevant endpoints for "${queryIntent}"`);
23005
+ const totalEndpoints = resolvedSkill.endpoints.length;
23006
+ const captureDiagnostic = totalEndpoints === 0 ? "no_endpoints_extracted" : ranked.length === 0 ? "all_endpoints_filtered_by_noise_rules" : "endpoints_scored_below_relevance_threshold";
22620
23007
  return {
22621
23008
  result: {
22622
23009
  error: `No relevant endpoint discovered for "${queryIntent}"`,
@@ -22626,6 +23013,8 @@ async function resolveAndExecute(intent, params = {}, context, projection, optio
22626
23013
  description: candidate.endpoint.description,
22627
23014
  url: candidate.endpoint.url_template
22628
23015
  })),
23016
+ capture_diagnostic: captureDiagnostic,
23017
+ total_endpoints_captured: totalEndpoints,
22629
23018
  ...authRecommended ? {
22630
23019
  auth_recommended: true,
22631
23020
  auth_hint: captureResult?.auth_hint
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "unbrowse",
3
- "version": "3.8.0-preview.2",
3
+ "version": "3.8.0-preview.3",
4
4
  "description": "Reverse-engineer any website into reusable API skills. Zero-dep single binary with embedded browser engine.",
5
5
  "type": "module",
6
6
  "bin": {