unbrowse 6.2.5 → 6.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -31,7 +31,7 @@ var __promiseAll = (args) => Promise.all(args);
31
31
  var __require = /* @__PURE__ */ createRequire(import.meta.url);
32
32
 
33
33
  // ../../src/build-info.generated.ts
34
- var BUILD_RELEASE_VERSION = "6.2.5", BUILD_GIT_SHA = "842fe8f374d6", BUILD_CODE_HASH = "5d9ebf619c61", BUILD_RELEASE_MANIFEST_BASE64 = "eyJzY2hlbWFfdmVyc2lvbiI6MSwicmVsZWFzZV92ZXJzaW9uIjoiNi4yLjUiLCJnaXRfc2hhIjoiODQyZmU4ZjM3NGQ2IiwiY29kZV9oYXNoIjoiNWQ5ZWJmNjE5YzYxIiwidHJhY2VfdmVyc2lvbiI6IjVkOWViZjYxOWM2MUA4NDJmZThmMzc0ZDYiLCJpc3N1ZWRfYXQiOiIyMDI2LTA1LTAxVDAzOjE5OjI1LjE4OVoifQ", BUILD_RELEASE_MANIFEST_SIGNATURE = "qYak9fhJCHowq94BznDE7aDxbTn53oRrfywJe503s8M", BUILD_DEFAULT_BACKEND_URL = "https://beta-api.unbrowse.ai";
34
+ var BUILD_RELEASE_VERSION = "6.3.0", BUILD_GIT_SHA = "ebf3580e8a7b", BUILD_CODE_HASH = "5d9ebf619c61", BUILD_RELEASE_MANIFEST_BASE64 = "eyJzY2hlbWFfdmVyc2lvbiI6MSwicmVsZWFzZV92ZXJzaW9uIjoiNi4zLjAiLCJnaXRfc2hhIjoiZWJmMzU4MGU4YTdiIiwiY29kZV9oYXNoIjoiNWQ5ZWJmNjE5YzYxIiwidHJhY2VfdmVyc2lvbiI6IjVkOWViZjYxOWM2MUBlYmYzNTgwZThhN2IiLCJpc3N1ZWRfYXQiOiIyMDI2LTA1LTAxVDA1OjU0OjI4LjgyNloifQ", BUILD_RELEASE_MANIFEST_SIGNATURE = "eROrMsDX6qJfzkUSaa6C9my4wf18yIDN6v0qG57deps", BUILD_DEFAULT_BACKEND_URL = "https://beta-api.unbrowse.ai";
35
35
 
36
36
  // ../../src/version.ts
37
37
  import { createHash } from "crypto";
@@ -1559,6 +1559,11 @@ var RETRYABLE_STATUSES;
1559
1559
  var init_retry = __esm(() => {
1560
1560
  RETRYABLE_STATUSES = new Set([500, 502, 503, 504, 429]);
1561
1561
  });
1562
+
1563
+ // ../../src/execution/probe.ts
1564
+ var init_probe = __esm(() => {
1565
+ init_logger();
1566
+ });
1562
1567
  // ../../src/extraction/index.ts
1563
1568
  import * as cheerio from "cheerio";
1564
1569
  var STRIP_TAGS, CHROME_TAGS;
@@ -1733,6 +1738,7 @@ var init_execution = __esm(async () => {
1733
1738
  init_client();
1734
1739
  init_client();
1735
1740
  init_retry();
1741
+ init_probe();
1736
1742
  init_domain();
1737
1743
  init_extraction();
1738
1744
  init_graph();
@@ -4566,6 +4572,8 @@ function slimTrace(obj) {
4566
4572
  out.provider = obj.provider;
4567
4573
  if ("result" in obj)
4568
4574
  out.result = obj.result;
4575
+ if (Array.isArray(obj.decision_trace))
4576
+ out.decision_trace = obj.decision_trace;
4569
4577
  if (obj.available_endpoints)
4570
4578
  out.available_endpoints = obj.available_endpoints;
4571
4579
  if (obj.impact)
package/dist/mcp.js CHANGED
@@ -226,11 +226,11 @@ import { dirname, join, parse } from "path";
226
226
  import { fileURLToPath as fileURLToPath2 } from "url";
227
227
 
228
228
  // ../../src/build-info.generated.ts
229
- var BUILD_RELEASE_VERSION = "6.2.5";
230
- var BUILD_GIT_SHA = "842fe8f374d6";
229
+ var BUILD_RELEASE_VERSION = "6.3.0";
230
+ var BUILD_GIT_SHA = "ebf3580e8a7b";
231
231
  var BUILD_CODE_HASH = "5d9ebf619c61";
232
- var BUILD_RELEASE_MANIFEST_BASE64 = "eyJzY2hlbWFfdmVyc2lvbiI6MSwicmVsZWFzZV92ZXJzaW9uIjoiNi4yLjUiLCJnaXRfc2hhIjoiODQyZmU4ZjM3NGQ2IiwiY29kZV9oYXNoIjoiNWQ5ZWJmNjE5YzYxIiwidHJhY2VfdmVyc2lvbiI6IjVkOWViZjYxOWM2MUA4NDJmZThmMzc0ZDYiLCJpc3N1ZWRfYXQiOiIyMDI2LTA1LTAxVDAzOjE5OjI1LjE4OVoifQ";
233
- var BUILD_RELEASE_MANIFEST_SIGNATURE = "qYak9fhJCHowq94BznDE7aDxbTn53oRrfywJe503s8M";
232
+ var BUILD_RELEASE_MANIFEST_BASE64 = "eyJzY2hlbWFfdmVyc2lvbiI6MSwicmVsZWFzZV92ZXJzaW9uIjoiNi4zLjAiLCJnaXRfc2hhIjoiZWJmMzU4MGU4YTdiIiwiY29kZV9oYXNoIjoiNWQ5ZWJmNjE5YzYxIiwidHJhY2VfdmVyc2lvbiI6IjVkOWViZjYxOWM2MUBlYmYzNTgwZThhN2IiLCJpc3N1ZWRfYXQiOiIyMDI2LTA1LTAxVDA1OjU0OjI4LjgyNloifQ";
233
+ var BUILD_RELEASE_MANIFEST_SIGNATURE = "eROrMsDX6qJfzkUSaa6C9my4wf18yIDN6v0qG57deps";
234
234
  var BUILD_DEFAULT_BACKEND_URL = "https://beta-api.unbrowse.ai";
235
235
 
236
236
  // ../../src/version.ts
package/dist/server.js CHANGED
@@ -3911,13 +3911,6 @@ function extractRscDataEndpoints(body) {
3911
3911
  }
3912
3912
 
3913
3913
  // ../../src/reverse-engineer/index.ts
3914
- var exports_reverse_engineer = {};
3915
- __export(exports_reverse_engineer, {
3916
- minePathTemplates: () => minePathTemplates,
3917
- extractGraphQLOperationName: () => extractGraphQLOperationName,
3918
- extractEndpoints: () => extractEndpoints,
3919
- extractAuthHeaders: () => extractAuthHeaders
3920
- });
3921
3914
  import { nanoid as nanoid2 } from "nanoid";
3922
3915
  import { createHash } from "node:crypto";
3923
3916
  function stableEndpointId(method, urlTemplate) {
@@ -4639,7 +4632,8 @@ function extractEndpoints(requests, wsMessages, context, traceSink) {
4639
4632
  reliability_score: 0.5,
4640
4633
  response_schema,
4641
4634
  trigger_url: context?.pageUrl,
4642
- ...pathBindingCandidates.length > 0 ? { _path_binding_candidates: pathBindingCandidates } : {}
4635
+ ...pathBindingCandidates.length > 0 ? { _path_binding_candidates: pathBindingCandidates } : {},
4636
+ ...buildProvenRecipe(req, computedUrlTemplate) ? { proven_recipe: buildProvenRecipe(req, computedUrlTemplate) } : {}
4643
4637
  };
4644
4638
  endpoint = resolveEndpointPathBindings(endpoint);
4645
4639
  endpoint.semantic = inferEndpointSemantic(endpoint, {
@@ -4860,6 +4854,58 @@ function sanitizeHeaders(headers) {
4860
4854
  return !isSensitiveHeader(k);
4861
4855
  }));
4862
4856
  }
4857
/**
 * Select the captured request headers that may be stored in a replay recipe.
 *
 * Headers are dropped when their lower-cased name is credential-bearing
 * (cookie, authorization, proxy-authorization, CSRF tokens), is set by the
 * HTTP client itself (host, content-length, connection, ...), or starts with
 * one of the browser-only / conditional-request prefixes below. Surviving
 * headers keep their original name casing and value.
 *
 * @param {Record<string, unknown> | null | undefined} headers Captured request headers.
 * @returns {Record<string, unknown>} A new object; the input is never mutated.
 */
function pickReplayHeaders(headers) {
  // Prefix families: fetch metadata, client hints, and conditional requests.
  const droppedPrefixes = ["sec-fetch-", "sec-ch-ua", "if-"];
  const droppedNames = new Set([
    "cookie",
    "authorization",
    // Credential for an intermediary proxy; must not be persisted either.
    "proxy-authorization",
    "content-length",
    "host",
    "origin",
    "referer",
    "user-agent",
    "accept-encoding",
    "cache-control",
    "pragma",
    "connection",
    "te",
    "upgrade-insecure-requests",
    "x-csrf-token",
    "x-xsrf-token",
  ]);
  const out = {};
  for (const [name, value] of Object.entries(headers ?? {})) {
    const lower = name.toLowerCase();
    if (droppedNames.has(lower)) {
      continue;
    }
    if (droppedPrefixes.some((prefix) => lower.startsWith(prefix))) {
      continue;
    }
    out[name] = value;
  }
  return out;
}
4874
/**
 * Build a "proven recipe" from a captured request whose response was a
 * non-empty 2xx: the method, URL template, replayable headers, optional
 * request body, and a signal describing the response that was observed.
 *
 * @param {object} req Captured request. Fields read here: `method`,
 *   `request_headers`, `request_body`, `response_status`, `response_headers`,
 *   `response_body`, `timestamp`.
 * @param {string} urlTemplate URL template stored as `url_template`.
 * @returns {object | undefined} The recipe, or `undefined` when the response
 *   status is not a number in [200, 300) or the response body is empty.
 */
function buildProvenRecipe(req, urlTemplate) {
  const status = req.response_status;
  // Written as a positive range check so a missing/NaN status is rejected too.
  if (!(status >= 200 && status < 300)) {
    return;
  }
  if (!req.response_body) {
    return;
  }

  // Header names are case-insensitive; captures may carry "Content-Type".
  const contentTypeEntry = Object.entries(req.response_headers ?? {}).find(
    ([name]) => name.toLowerCase() === "content-type",
  );
  const ct = String(contentTypeEntry?.[1] ?? "").toLowerCase();
  const bodyLen = Buffer.byteLength(req.response_body);

  let json_top_keys;
  if (ct.includes("application/json") || ct.includes("+json")) {
    try {
      const parsed = JSON.parse(stripJsonPrefix(req.response_body));
      if (parsed && typeof parsed === "object" && !Array.isArray(parsed)) {
        // Only the first 8 top-level keys of a JSON object are recorded.
        json_top_keys = Object.keys(parsed).slice(0, 8);
      }
    } catch {
      // Best effort: an unparseable body simply yields no json_top_keys.
    }
  }

  let body;
  if (req.method !== "GET" && req.method !== "HEAD" && req.request_body) {
    body = tryParseBody(req.request_body) ?? req.request_body;
  }

  return {
    method: req.method,
    url_template: urlTemplate,
    headers: pickReplayHeaders(req.request_headers),
    ...(body !== undefined ? { body } : {}),
    response_signal: {
      status,
      ...(ct ? { content_type: ct } : {}),
      // Accept replays between half and double the observed body size.
      byte_length_min: Math.floor(bodyLen * 0.5),
      byte_length_max: Math.ceil(bodyLen * 2),
      ...(json_top_keys ? { json_top_keys } : {}),
    },
    captured_at: req.timestamp || new Date().toISOString(),
  };
}
4863
4909
  function extractAuthHeaders(requests) {
4864
4910
  const authHeaders = {};
4865
4911
  for (const req of requests) {
@@ -7285,7 +7331,7 @@ var init_capture = __esm(async () => {
7285
7331
  });
7286
7332
 
7287
7333
  // ../../src/build-info.generated.ts
7288
- var BUILD_RELEASE_VERSION = "6.2.5", BUILD_GIT_SHA = "842fe8f374d6", BUILD_CODE_HASH = "5d9ebf619c61", BUILD_RELEASE_MANIFEST_BASE64 = "eyJzY2hlbWFfdmVyc2lvbiI6MSwicmVsZWFzZV92ZXJzaW9uIjoiNi4yLjUiLCJnaXRfc2hhIjoiODQyZmU4ZjM3NGQ2IiwiY29kZV9oYXNoIjoiNWQ5ZWJmNjE5YzYxIiwidHJhY2VfdmVyc2lvbiI6IjVkOWViZjYxOWM2MUA4NDJmZThmMzc0ZDYiLCJpc3N1ZWRfYXQiOiIyMDI2LTA1LTAxVDAzOjE5OjI1LjE4OVoifQ", BUILD_RELEASE_MANIFEST_SIGNATURE = "qYak9fhJCHowq94BznDE7aDxbTn53oRrfywJe503s8M", BUILD_DEFAULT_BACKEND_URL = "https://beta-api.unbrowse.ai";
7334
+ var BUILD_RELEASE_VERSION = "6.3.0", BUILD_GIT_SHA = "ebf3580e8a7b", BUILD_CODE_HASH = "5d9ebf619c61", BUILD_RELEASE_MANIFEST_BASE64 = "eyJzY2hlbWFfdmVyc2lvbiI6MSwicmVsZWFzZV92ZXJzaW9uIjoiNi4zLjAiLCJnaXRfc2hhIjoiZWJmMzU4MGU4YTdiIiwiY29kZV9oYXNoIjoiNWQ5ZWJmNjE5YzYxIiwidHJhY2VfdmVyc2lvbiI6IjVkOWViZjYxOWM2MUBlYmYzNTgwZThhN2IiLCJpc3N1ZWRfYXQiOiIyMDI2LTA1LTAxVDA1OjU0OjI4LjgyNloifQ", BUILD_RELEASE_MANIFEST_SIGNATURE = "eROrMsDX6qJfzkUSaa6C9my4wf18yIDN6v0qG57deps", BUILD_DEFAULT_BACKEND_URL = "https://beta-api.unbrowse.ai";
7289
7335
 
7290
7336
  // ../../src/version.ts
7291
7337
  import { createHash as createHash2 } from "crypto";
@@ -10951,6 +10997,151 @@ var init_retry = __esm(() => {
10951
10997
  RETRYABLE_STATUSES = new Set([500, 502, 503, 504, 429]);
10952
10998
  });
10953
10999
 
11000
+ // ../../src/execution/probe.ts
11001
/**
 * Cheaply probe a URL to learn its status, content type and size.
 *
 * A HEAD request is tried first. If it throws, or answers 405/501, the probe
 * falls back to a GET with `Range: bytes=0-0`. This function does not throw
 * for network failures: a failed fallback is reported as `status: 0` with an
 * `error` string.
 *
 * @param {string} url URL to probe.
 * @param {object} [opts]
 * @param {Record<string, string>} [opts.headers] Extra headers; they override
 *   the default user-agent / accept / accept-language.
 * @param {{name: string, value: string}[]} [opts.cookies] Cookies sent as a
 *   single `cookie` header; one pair of surrounding double quotes is removed
 *   from each value.
 * @param {number} [opts.timeout_ms] Timeout applied to each request
 *   (defaults: 1500 ms for HEAD, 1000 ms for the ranged GET).
 * @returns {Promise<{status: number, content_type?: string, byte_length?: number,
 *   ms: number, method_used: "HEAD" | "GET-1byte", error?: string}>}
 *   `ms` is the wall-clock time since the probe started, including a failed
 *   HEAD attempt. A 206 from the ranged GET is reported as 200.
 */
async function probeUrl(url, opts = {}) {
  const headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    accept: "*/*",
    "accept-language": "en-US,en;q=0.9",
    ...(opts.headers ?? {}),
  };
  if (opts.cookies && opts.cookies.length > 0) {
    headers["cookie"] = opts.cookies
      .map((c) => {
        const v = c.value.startsWith('"') && c.value.endsWith('"') ? c.value.slice(1, -1) : c.value;
        return `${c.name}=${v}`;
      })
      .join("; ");
  }
  const headTimeout = opts.timeout_ms ?? 1500;
  const rangedTimeout = opts.timeout_ms ?? 1000;
  const describeError = (err) => (err instanceof Error ? err.message : String(err));
  const startedAt = Date.now();

  try {
    const res = await fetchWithTimeout(url, { method: "HEAD", headers, redirect: "follow" }, headTimeout);
    if (res.status !== 405 && res.status !== 501) {
      const ct = (res.headers.get("content-type") || "").toLowerCase();
      const lenHeader = res.headers.get("content-length");
      const byte_length = lenHeader && Number.isFinite(Number(lenHeader)) ? Number(lenHeader) : undefined;
      return {
        status: res.status,
        content_type: ct || undefined,
        byte_length,
        ms: Date.now() - startedAt,
        method_used: "HEAD",
      };
    }
    log("probe", `HEAD ${url} returned ${res.status}; trying ranged GET`);
  } catch (err) {
    log("probe", `HEAD ${url} failed: ${describeError(err)}; trying ranged GET`);
  }

  try {
    const res = await fetchWithTimeout(
      url,
      {
        method: "GET",
        headers: { ...headers, range: "bytes=0-0" },
        redirect: "follow",
      },
      rangedTimeout,
    );
    const ct = (res.headers.get("content-type") || "").toLowerCase();
    const range = res.headers.get("content-range");
    const lenHeader = res.headers.get("content-length");

    let byte_length;
    if (range) {
      // "bytes 0-0/1234" -> 1234; an unknown total ("/*") leaves it undefined.
      const total = /\/\s*(\d+)\s*$/.exec(range);
      if (total) {
        byte_length = Number(total[1]);
      }
    }
    if (byte_length === undefined && lenHeader && Number.isFinite(Number(lenHeader))) {
      const n = Number(lenHeader);
      // A length of 0 or 1 is just the requested slice, not the resource size.
      if (n > 1) {
        byte_length = n;
      }
    }

    try {
      if (res.status === 206) {
        // Range honoured: the body is the requested slice, so draining is cheap.
        await res.arrayBuffer();
      } else {
        // Range ignored: the timeout no longer covers the body, so do not
        // download a full (possibly huge) response just to discard it.
        await res.body?.cancel();
      }
    } catch {
      // Best effort: failing to release the body must not fail the probe.
    }

    return {
      status: res.status === 206 ? 200 : res.status,
      content_type: ct || undefined,
      byte_length,
      // Total elapsed time; the HEAD attempt is already included in startedAt.
      ms: Date.now() - startedAt,
      method_used: "GET-1byte",
    };
  } catch (err) {
    return {
      status: 0,
      ms: Date.now() - startedAt,
      error: describeError(err) || "network_error",
      method_used: "GET-1byte",
    };
  }
}
11076
/**
 * Call `fetch` with an abort signal that fires after `timeoutMs`.
 *
 * The timer is cleared as soon as `fetch` settles, so the timeout covers the
 * wait for the response object only; reading the body afterwards is not
 * bounded by it. Any `signal` supplied in `init` is replaced.
 *
 * @param {string} url Request URL.
 * @param {object} init Options forwarded to `fetch`.
 * @param {number} timeoutMs Milliseconds before the request is aborted.
 * @returns {Promise<Response>} Whatever `fetch` resolves to.
 */
async function fetchWithTimeout(url, init, timeoutMs) {
  const aborter = new AbortController();
  const abortTimer = setTimeout(() => {
    aborter.abort();
  }, timeoutMs);
  try {
    const requestInit = Object.assign({}, init, { signal: aborter.signal });
    const response = await fetch(url, requestInit);
    return response;
  } finally {
    clearTimeout(abortTimer);
  }
}
11085
/**
 * Turn a probe result into an execution strategy.
 *
 * Rules, checked in this order:
 *   1. status >= 400                      -> "return-error"
 *   2. status === 0 (probe failed)        -> "browser"
 *   3. content type matches JSON_LIKE     -> "server"
 *   4. HTML of at least SMALL_HTML_BYTES  -> "server"
 *   5. smaller / unknown-size HTML        -> "trigger-intercept" when a
 *      trigger URL exists, otherwise "browser"
 *   6. any other content type             -> "browser" when the intent wants
 *      DOM, otherwise "server"
 *
 * @param {{probe: object, has_trigger_url: boolean, intent_wants_dom: boolean}} input
 *   `probe` is read for `status`, `content_type`, `byte_length` and `error`.
 * @returns {{strategy: string, reason: string}} Chosen strategy plus a
 *   human-readable explanation.
 */
function decideFromProbe(input) {
  const { probe, has_trigger_url, intent_wants_dom } = input;
  const { status, content_type = "", byte_length } = probe;
  const choose = (strategy, reason) => ({ strategy, reason });

  if (status >= 400) {
    return choose("return-error", `probe status ${status}; returning to caller`);
  }
  if (status === 0) {
    return choose("browser", `probe network error: ${probe.error ?? "unknown"}`);
  }

  if (JSON_LIKE.test(content_type)) {
    return choose("server", `probe ${status} + ${content_type} — direct fetchable`);
  }

  if (HTML_LIKE.test(content_type)) {
    // An unknown size counts as 0 bytes, i.e. as a small document.
    const knownSize = byte_length ?? 0;
    if (knownSize >= SMALL_HTML_BYTES) {
      return choose("server", `probe ${status} + html ${byte_length}B — server-rendered, fetch + extract`);
    }
    const sizeLabel = byte_length ?? "?";
    return has_trigger_url
      ? choose("trigger-intercept", `probe ${status} + html ${sizeLabel}B — likely SPA shell, trigger-intercept`)
      : choose("browser", `probe ${status} + html ${sizeLabel}B SPA shell, no trigger_url — browser`);
  }

  const typeLabel = content_type || "unknown";
  return intent_wants_dom
    ? choose("browser", `intent wants DOM, content-type ${typeLabel} — browser`)
    : choose("server", `probe ${status}, content-type ${typeLabel} — try server`);
}
11138
// HTML responses of at least this many bytes are treated as large by
// decideFromProbe; smaller or unknown-size HTML is treated as small.
var SMALL_HTML_BYTES = 5000;
// Content-type patterns; both stay undefined until init_probe() has run.
var JSON_LIKE;
var HTML_LIKE;
// Lazy module initializer for probe.ts: initializes the logger module and
// fills in the content-type patterns.
var init_probe = __esm(() => {
  init_logger();
  // Despite the name this also matches CSV and XML content types.
  JSON_LIKE = /(application\/json|application\/[\w.+-]+\+json|text\/csv|application\/xml|text\/xml)/i;
  HTML_LIKE = /text\/html|application\/xhtml\+xml/i;
});
11144
+
10954
11145
  // ../../src/intent-match.ts
10955
11146
  function isRecord(value) {
10956
11147
  return !!value && typeof value === "object" && !Array.isArray(value);
@@ -16115,15 +16306,6 @@ function normalizeReplayHeaders(...bags) {
16115
16306
  }
16116
16307
  return normalized;
16117
16308
  }
16118
- function shouldFallbackToBrowserReplay(data, endpoint, intent, contextUrl) {
16119
- const replayUrl = resolveExecutionUrlTemplate(endpoint, contextUrl);
16120
- if (!isDocumentLikeUrl(replayUrl))
16121
- return false;
16122
- if (typeof data === "string")
16123
- return isHtml(data) || isSpaShell(data);
16124
- const assessment = assessIntentResult(data, intent);
16125
- return assessment.verdict === "fail";
16126
- }
16127
16309
  function buildSampleRequestFromUrl(url) {
16128
16310
  try {
16129
16311
  return Object.fromEntries(sanitizeNavigationQueryParams(new URL(url)).searchParams.entries());
@@ -17089,124 +17271,6 @@ async function executeBrowserCapture(skill, params, options) {
17089
17271
  learned_skill: learned
17090
17272
  };
17091
17273
  }
17092
- async function tryHttpFetch(url, authHeaders, cookies) {
17093
- try {
17094
- const headers = {
17095
- "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
17096
- Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
17097
- "Accept-Language": "en-US,en;q=0.9",
17098
- "Cache-Control": "no-cache",
17099
- ...authHeaders
17100
- };
17101
- if (cookies && cookies.length > 0) {
17102
- headers["Cookie"] = cookies.map((c) => `${c.name}=${c.value}`).join("; ");
17103
- }
17104
- const controller = new AbortController;
17105
- const timeout = setTimeout(() => controller.abort(), 1e4);
17106
- const res = await fetch(url, {
17107
- headers,
17108
- signal: controller.signal,
17109
- redirect: "follow"
17110
- });
17111
- clearTimeout(timeout);
17112
- if (res.status !== 200)
17113
- return null;
17114
- const ct = res.headers.get("content-type") ?? "";
17115
- if (!ct.includes("text/html") && !ct.includes("application/xhtml"))
17116
- return null;
17117
- const html = await res.text();
17118
- if (!html || html.length < 1024)
17119
- return null;
17120
- return { html, final_url: res.url || url };
17121
- } catch {
17122
- return null;
17123
- }
17124
- }
17125
- function flattenExtracted(data) {
17126
- if (!Array.isArray(data))
17127
- return data;
17128
- const first = data[0];
17129
- if (first && typeof first === "object" && "type" in first && "data" in first && "relevance_score" in first) {
17130
- return data.reduce((best, cur) => (cur.relevance_score ?? 0) > (best.relevance_score ?? 0) ? cur : best).data;
17131
- }
17132
- return data;
17133
- }
17134
- async function executeDomExtractionEndpoint(endpoint, url, intent, authHeaders, cookies) {
17135
- const ssrResult = await tryHttpFetch(url, authHeaders, cookies);
17136
- if (ssrResult) {
17137
- const ssrExtracted = extractFromDOMWithHint(ssrResult.html, intent, endpoint.dom_extraction);
17138
- if (ssrExtracted.data) {
17139
- const ssrQuality = validateExtractionQuality(ssrExtracted.data, ssrExtracted.confidence, intent);
17140
- if (ssrQuality.valid) {
17141
- const ssrSemantic = assessIntentResult(ssrExtracted.data, intent);
17142
- if (ssrSemantic.verdict !== "fail") {
17143
- console.log(`[ssr-fast] hit — extracted via HTTP fetch`);
17144
- return {
17145
- data: flattenExtracted(ssrExtracted.data),
17146
- status: 200,
17147
- trace_id: nanoid6()
17148
- };
17149
- }
17150
- }
17151
- }
17152
- console.log(`[ssr-fast] miss, falling back to browser`);
17153
- } else {
17154
- console.log(`[ssr-fast] miss, falling back to browser`);
17155
- }
17156
- const captured = await captureSession(url, authHeaders, cookies, intent);
17157
- if (captured.requests.length > 0) {
17158
- const { extractEndpoints: extractEps } = await Promise.resolve().then(() => (init_reverse_engineer(), exports_reverse_engineer));
17159
- const apiEndpoints = extractEps(captured.requests, undefined, { pageUrl: url, finalUrl: captured.final_url });
17160
- const jsonEndpoints = apiEndpoints.filter((ep) => ep.response_schema && !ep.dom_extraction);
17161
- if (jsonEndpoints.length > 0) {
17162
- const best = jsonEndpoints[0];
17163
- const matchingReq = captured.requests.find((r) => r.url.includes(best.url_template.split("?")[0].split("{")[0]) && r.response_body && r.response_status >= 200 && r.response_status < 400);
17164
- if (matchingReq?.response_body) {
17165
- try {
17166
- const data = JSON.parse(matchingReq.response_body);
17167
- console.log(`[dom-exec] found API response from browser capture: ${matchingReq.url.substring(0, 80)}`);
17168
- return { data, status: matchingReq.response_status, trace_id: nanoid6() };
17169
- } catch {}
17170
- }
17171
- }
17172
- }
17173
- const html = captured.html ?? "";
17174
- const extracted = extractFromDOMWithHint(html, intent, endpoint.dom_extraction);
17175
- if (extracted.data) {
17176
- const quality = validateExtractionQuality(extracted.data, extracted.confidence, intent);
17177
- if (!quality.valid) {
17178
- return {
17179
- data: {
17180
- error: "low_quality_dom_extraction",
17181
- message: `Structured DOM extraction was rejected: ${quality.quality_note ?? "low quality extraction"}`
17182
- },
17183
- status: 422,
17184
- trace_id: nanoid6()
17185
- };
17186
- }
17187
- const semanticAssessment = assessIntentResult(extracted.data, intent);
17188
- if (semanticAssessment.verdict === "fail") {
17189
- return {
17190
- data: {
17191
- error: "low_quality_dom_extraction",
17192
- message: `Structured DOM extraction was rejected: ${semanticAssessment.reason}`
17193
- },
17194
- status: 422,
17195
- trace_id: nanoid6()
17196
- };
17197
- }
17198
- return {
17199
- data: flattenExtracted(extracted.data),
17200
- status: 200,
17201
- trace_id: nanoid6()
17202
- };
17203
- }
17204
- return {
17205
- data: html,
17206
- status: 200,
17207
- trace_id: nanoid6()
17208
- };
17209
- }
17210
17274
  async function executeEndpoint(skill, endpoint, params = {}, projection, options) {
17211
17275
  endpoint = annotateEndpointPolicy(endpoint);
17212
17276
  if (endpoint.policy?.requires_live_session) {
@@ -17786,175 +17850,109 @@ async function executeEndpoint(skill, endpoint, params = {}, projection, options
17786
17850
  const hasAuth = cookies.length > 0 || Object.keys(authHeaders).length > 0;
17787
17851
  const preferredWorkflowStrategy = workflowRecipe?.steps[0]?.strategy ? translateWorkflowStrategy(workflowRecipe.steps[0].strategy) : undefined;
17788
17852
  let workflowChosenStrategy = workflowRecipe?.steps[0]?.strategy;
17789
- if (endpoint.dom_extraction && isSafe) {
17790
- if (hasStructuredReplay) {
17791
- result = await serverFetch(workflowBindings?.extraHeaders, workflowBindings?.bodyOverride);
17792
- if (shouldFallbackToBrowserReplay(result.data, endpoint, options?.intent ?? skill.intent_signature, options?.contextUrl)) {
17793
- result = await executeDomExtractionEndpoint(endpoint, url, options?.intent ?? skill.intent_signature, authHeaders, cookies);
17794
- }
17795
- } else {
17796
- result = await executeDomExtractionEndpoint(endpoint, url, options?.intent ?? skill.intent_signature, authHeaders, cookies);
17797
- }
17798
- } else if (hasAuth) {
17799
- let strategy;
17800
- const endpointStrategy = preferredWorkflowStrategy ?? endpoint.exec_strategy;
17801
- if (hasStructuredReplay) {
17802
- result = await serverFetch(workflowBindings?.extraHeaders, workflowBindings?.bodyOverride);
17803
- if (result.status >= 200 && result.status < 400 && !shouldFallbackToBrowserReplay(result.data, endpoint, options?.intent ?? skill.intent_signature, options?.contextUrl)) {
17804
- strategy = "server";
17805
- workflowChosenStrategy = "server";
17806
- } else if (endpoint.trigger_url && isSafe) {
17807
- let triggerUrl = endpoint.trigger_url;
17808
- if (Object.keys(mergedParams).length > 0) {
17809
- try {
17810
- const tu = new URL(endpoint.trigger_url);
17811
- for (const [k, v] of Object.entries(mergedParams)) {
17812
- if (v != null && !reservedMetaParams.has(k)) {
17813
- tu.searchParams.set(k, String(v));
17814
- }
17815
- }
17816
- triggerUrl = tu.toString();
17817
- } catch {}
17818
- }
17819
- result = await triggerAndIntercept(triggerUrl, endpoint.url_template, cookies, authHeaders);
17820
- const isSelfFetchableFirst = endpoint.method === "GET" && /\.(json)(\?|$)|\/api\//i.test(url);
17821
- if (result.status === 0 && isSelfFetchableFirst) {
17822
- log("exec", `trigger-intercept timed out; trying serverFetch for self-fetchable ${url}`);
17823
- result = await serverFetch(workflowBindings?.extraHeaders, workflowBindings?.bodyOverride);
17824
- if (result.status >= 200 && result.status < 400) {
17825
- strategy = "server";
17826
- workflowChosenStrategy = "server";
17827
- } else {
17828
- strategy = "trigger-intercept";
17829
- workflowChosenStrategy = "trigger-intercept";
17830
- }
17831
- } else {
17832
- strategy = "trigger-intercept";
17833
- workflowChosenStrategy = "trigger-intercept";
17834
- }
17835
- } else {
17836
- result = await withRetry(browserCall, (r) => isRetryableStatus(r.status));
17837
- strategy = "browser";
17838
- workflowChosenStrategy = workflowRecipe?.steps[0]?.strategy === "browser-action" ? "browser-action" : "browser-fetch";
17839
- }
17840
- } else if (endpointStrategy === "server") {
17841
- result = await serverFetch(workflowBindings?.extraHeaders, workflowBindings?.bodyOverride);
17842
- if (shouldFallbackToBrowserReplay(result.data, endpoint, options?.intent ?? skill.intent_signature, options?.contextUrl)) {
17843
- result = await withRetry(browserCall, (r) => isRetryableStatus(r.status));
17844
- strategy = "browser";
17845
- workflowChosenStrategy = workflowRecipe?.steps[0]?.strategy === "browser-action" ? "browser-action" : "browser-fetch";
17846
- } else {
17847
- strategy = "server";
17848
- workflowChosenStrategy = "server";
17849
- }
17850
- } else if (endpointStrategy === "trigger-intercept" && endpoint.trigger_url && isSafe) {
17851
- let triggerUrl = endpoint.trigger_url;
17852
- if (Object.keys(mergedParams).length > 0) {
17853
- try {
17854
- const tu = new URL(endpoint.trigger_url);
17855
- for (const [k, v] of Object.entries(mergedParams)) {
17856
- if (v != null && !reservedMetaParams.has(k))
17857
- tu.searchParams.set(k, String(v));
17858
- }
17859
- triggerUrl = tu.toString();
17860
- } catch {}
17861
- }
17862
- log("exec", `using learned strategy trigger-intercept via ${triggerUrl}`);
17863
- result = await triggerAndIntercept(triggerUrl, endpoint.url_template, cookies, authHeaders);
17864
- const isSelfFetchable = endpoint.method === "GET" && /\.(json)(\?|$)|\/api\//i.test(url);
17865
- if (result.status === 0 && isSelfFetchable) {
17866
- log("exec", `trigger-intercept timed out; trying serverFetch for self-fetchable ${url}`);
17853
+ const decisionTrace = [];
17854
+ let recipeMatched = false;
17855
+ if (endpoint.proven_recipe && shouldReplayRecipe(endpoint.proven_recipe, url)) {
17856
+ const recipeStart = Date.now();
17857
+ const recipeResult = await replayRecipe(endpoint.proven_recipe, url, cookies, authHeaders, mergedParams);
17858
+ const matchVerdict = matchResponseSignal(recipeResult, endpoint.proven_recipe.response_signal);
17859
+ decisionTrace.push({
17860
+ step: "recipe_replay",
17861
+ method: endpoint.proven_recipe.method,
17862
+ status: recipeResult.status,
17863
+ match: matchVerdict.match,
17864
+ ...matchVerdict.match ? {} : { reason: matchVerdict.reason ?? "unknown" },
17865
+ ms: Date.now() - recipeStart
17866
+ });
17867
+ if (matchVerdict.match) {
17868
+ result = recipeResult;
17869
+ recipeMatched = true;
17870
+ workflowChosenStrategy = workflowChosenStrategy ?? "recipe-replay";
17871
+ }
17872
+ }
17873
+ if (!recipeMatched) {
17874
+ const probeCookies = cookies.map((c) => ({ name: c.name, value: c.value }));
17875
+ const probe = await probeUrl(url, {
17876
+ cookies: probeCookies,
17877
+ headers: { ...authHeaders }
17878
+ });
17879
+ decisionTrace.push({
17880
+ step: "probe",
17881
+ method: probe.method_used,
17882
+ status: probe.status,
17883
+ content_type: probe.content_type,
17884
+ byte_length: probe.byte_length,
17885
+ ms: probe.ms,
17886
+ ...probe.error ? { error: probe.error } : {}
17887
+ });
17888
+ const decision = decideFromProbe({
17889
+ probe,
17890
+ has_trigger_url: !!endpoint.trigger_url,
17891
+ intent_wants_dom: !!endpoint.dom_extraction
17892
+ });
17893
+ decisionTrace.push({
17894
+ step: "decision",
17895
+ strategy: decision.strategy,
17896
+ reason: decision.reason
17897
+ });
17898
+ switch (decision.strategy) {
17899
+ case "server": {
17867
17900
  result = await serverFetch(workflowBindings?.extraHeaders, workflowBindings?.bodyOverride);
17868
- if (result.status >= 200 && result.status < 400) {
17869
- strategy = "server";
17870
- workflowChosenStrategy = "server";
17871
- }
17872
- } else {
17873
- strategy = "trigger-intercept";
17874
- workflowChosenStrategy = "trigger-intercept";
17901
+ decisionTrace.push({ step: "server_fetch", status: result.status });
17902
+ workflowChosenStrategy = workflowChosenStrategy ?? "server";
17903
+ break;
17875
17904
  }
17876
- } else if (endpointStrategy === "browser") {
17877
- if (shouldIgnoreLearnedBrowserStrategy(endpoint, url)) {
17878
- result = await serverFetch(workflowBindings?.extraHeaders, workflowBindings?.bodyOverride);
17879
- if (result.status >= 200 && result.status < 400 && !shouldFallbackToBrowserReplay(result.data, endpoint, options?.intent ?? skill.intent_signature, options?.contextUrl)) {
17880
- strategy = "server";
17881
- workflowChosenStrategy = "server";
17882
- } else {
17883
- log("exec", `server replay rejected stale learned browser strategy for ${endpoint.endpoint_id}; falling back to browser`);
17905
+ case "trigger-intercept": {
17906
+ if (!endpoint.trigger_url || !isSafe) {
17884
17907
  result = await withRetry(browserCall, (r) => isRetryableStatus(r.status));
17885
- strategy = "browser";
17886
- workflowChosenStrategy = workflowRecipe?.steps[0]?.strategy === "browser-action" ? "browser-action" : "browser-fetch";
17887
- }
17888
- } else {
17889
- log("exec", `using learned strategy browser`);
17890
- result = await withRetry(browserCall, (r) => isRetryableStatus(r.status));
17891
- strategy = "browser";
17892
- workflowChosenStrategy = workflowRecipe?.steps[0]?.strategy === "browser-action" ? "browser-action" : "browser-fetch";
17893
- }
17894
- } else {
17895
- try {
17896
- result = await serverFetch(workflowBindings?.extraHeaders, workflowBindings?.bodyOverride);
17897
- if (result.status >= 200 && result.status < 400) {
17898
- const isApiEndpoint = /\/(api|graphql)\b/i.test(endpoint.url_template) || /\.(json)(\?|$)/.test(endpoint.url_template);
17899
- if (!isApiEndpoint && shouldFallbackToBrowserReplay(result.data, endpoint, options?.intent ?? skill.intent_signature, options?.contextUrl)) {
17900
- result = await withRetry(browserCall, (r) => isRetryableStatus(r.status));
17901
- strategy = "browser";
17902
- workflowChosenStrategy = workflowRecipe?.steps[0]?.strategy === "browser-action" ? "browser-action" : "browser-fetch";
17903
- } else {
17904
- strategy = "server";
17905
- workflowChosenStrategy = "server";
17906
- }
17908
+ decisionTrace.push({ step: "browser_fallback", reason: "no trigger_url or unsafe method", status: result.status });
17909
+ workflowChosenStrategy = workflowChosenStrategy ?? "browser-fetch";
17907
17910
  } else {
17908
- log("exec", `server fetch returned ${result.status}, falling back`);
17909
- if (endpoint.trigger_url && isSafe) {
17910
- let triggerUrl = endpoint.trigger_url;
17911
- if (Object.keys(mergedParams).length > 0) {
17912
- try {
17913
- const tu = new URL(endpoint.trigger_url);
17914
- for (const [k, v] of Object.entries(mergedParams)) {
17915
- if (v != null && !reservedMetaParams.has(k))
17916
- tu.searchParams.set(k, String(v));
17911
+ let triggerUrl = endpoint.trigger_url;
17912
+ if (Object.keys(mergedParams).length > 0) {
17913
+ try {
17914
+ const tu = new URL(endpoint.trigger_url);
17915
+ for (const [k, v] of Object.entries(mergedParams)) {
17916
+ if (v != null && !reservedMetaParams.has(k)) {
17917
+ tu.searchParams.set(k, String(v));
17917
17918
  }
17918
- triggerUrl = tu.toString();
17919
- } catch {}
17920
- }
17921
- result = await triggerAndIntercept(triggerUrl, endpoint.url_template, cookies, authHeaders);
17922
- strategy = "trigger-intercept";
17923
- workflowChosenStrategy = "trigger-intercept";
17924
- } else {
17925
- result = await withRetry(browserCall, (r) => isRetryableStatus(r.status));
17926
- strategy = "browser";
17927
- workflowChosenStrategy = workflowRecipe?.steps[0]?.strategy === "browser-action" ? "browser-action" : "browser-fetch";
17919
+ }
17920
+ triggerUrl = tu.toString();
17921
+ } catch {}
17928
17922
  }
17923
+ result = await triggerAndIntercept(triggerUrl, endpoint.url_template, cookies, authHeaders);
17924
+ decisionTrace.push({ step: "trigger_intercept", trigger_url: triggerUrl, status: result.status });
17925
+ workflowChosenStrategy = "trigger-intercept";
17929
17926
  }
17930
- } catch {
17927
+ break;
17928
+ }
17929
+ case "browser": {
17931
17930
  result = await withRetry(browserCall, (r) => isRetryableStatus(r.status));
17932
- strategy = "browser";
17933
- workflowChosenStrategy = workflowRecipe?.steps[0]?.strategy === "browser-action" ? "browser-action" : "browser-fetch";
17931
+ decisionTrace.push({ step: "browser", status: result.status });
17932
+ workflowChosenStrategy = workflowChosenStrategy ?? (workflowRecipe?.steps[0]?.strategy === "browser-action" ? "browser-action" : "browser-fetch");
17933
+ break;
17934
17934
  }
17935
- }
17936
- if (strategy && result.status >= 200 && result.status < 400 && strategy !== endpoint.exec_strategy) {
17937
- log("exec", `learned exec_strategy=${strategy} for endpoint ${endpoint.endpoint_id}`);
17938
- endpoint.exec_strategy = strategy;
17939
- try {
17940
- cachePublishedSkill(skill, options?.client_scope);
17941
- } catch (e) {
17942
- log("exec", `failed to cache strategy: ${e}`);
17935
+ case "return-error": {
17936
+ result = {
17937
+ status: probe.status,
17938
+ data: {
17939
+ error: `http_${probe.status}`,
17940
+ message: `Probe returned status ${probe.status}; returned to caller without escalating.`,
17941
+ probe_method: probe.method_used,
17942
+ ...probe.content_type ? { content_type: probe.content_type } : {}
17943
+ },
17944
+ trace_id: nanoid6()
17945
+ };
17946
+ decisionTrace.push({ step: "return_error", status: probe.status });
17947
+ workflowChosenStrategy = workflowChosenStrategy ?? "server";
17948
+ break;
17943
17949
  }
17944
- }
17945
- } else if (isSafe) {
17946
- try {
17947
- result = await withRetry(() => serverFetch(workflowBindings?.extraHeaders, workflowBindings?.bodyOverride), (r) => isRetryableStatus(r.status));
17948
- if (typeof result.data === "string" && isHtml(result.data)) {
17949
- if (isSpaShell(result.data)) {
17950
- result = await withRetry(browserCall, (r) => isRetryableStatus(r.status));
17951
- }
17950
+ default: {
17951
+ result = await withRetry(browserCall, (r) => isRetryableStatus(r.status));
17952
+ decisionTrace.push({ step: "browser_default", status: result.status });
17953
+ workflowChosenStrategy = workflowChosenStrategy ?? "browser-fetch";
17952
17954
  }
17953
- } catch {
17954
- result = await withRetry(browserCall, (r) => isRetryableStatus(r.status));
17955
17955
  }
17956
- } else {
17957
- result = await serverFetch(workflowBindings?.extraHeaders, workflowBindings?.bodyOverride);
17958
17956
  }
17959
17957
  if (workflowRecipe && workflowArtifact && needsWorkflowTokenRefresh(result.status)) {
17960
17958
  const refreshed = await refreshAuthFromBrowser(epDomain);
@@ -17983,6 +17981,7 @@ async function executeEndpoint(skill, endpoint, params = {}, projection, options
17983
17981
  success: status >= 200 && status < 300,
17984
17982
  status_code: status
17985
17983
  });
17984
+ trace.decision_trace = decisionTrace;
17986
17985
  if (!trace.success) {
17987
17986
  trace.error = status === 0 ? `HTTP 0 — network failure or browser fetch was blocked (DNS, TLS, CORS, anti-bot, or kuri tab error). Try \`unbrowse go\` to open a live session, then re-run.` : status === 404 ? `HTTP 404 — endpoint may be stale. Re-run via POST /v1/intent/resolve to get fresh endpoints.` : `HTTP ${status}`;
17988
17987
  const isEmptyData = data == null || typeof data === "object" && !Array.isArray(data) && Object.keys(data).length === 0;
@@ -18163,7 +18162,8 @@ async function executeEndpoint(skill, endpoint, params = {}, projection, options
18163
18162
  }
18164
18163
  return {
18165
18164
  trace,
18166
- result: resultData
18165
+ result: resultData,
18166
+ decision_trace: decisionTrace
18167
18167
  };
18168
18168
  }
18169
18169
  function templatizeQueryParams(url) {
@@ -18205,6 +18205,67 @@ function interpolate(template, params) {
18205
18205
  function interpolateObj(obj, params) {
18206
18206
  return JSON.parse(JSON.stringify(obj).replace(/"(\{(\w+)\})"/g, (_, _full, k) => params[k] != null ? JSON.stringify(params[k]) : `"{${k}}"`));
18207
18207
  }
18208
/**
 * Decides whether a recipe can be replayed against the given URL.
 *
 * Replay is allowed only when the URL has no leftover `{name}` placeholder
 * (letters, digits and underscores, matched case-insensitively).
 *
 * @param {object} _recipe - Unused; kept for the call signature.
 * @param {string} substitutedUrl - URL after parameter substitution.
 * @returns {boolean} `true` when no placeholder remains in the URL.
 */
function shouldReplayRecipe(_recipe, substitutedUrl) {
  const leftoverPlaceholder = /\{[a-z0-9_]+\}/i;
  if (leftoverPlaceholder.test(substitutedUrl)) {
    return false;
  }
  return true;
}
18211
/**
 * Replays a recorded request recipe with a direct `fetch` call.
 *
 * Headers are built from `recipe.headers` overlaid with `authHeaders`. When
 * cookies are supplied they replace any cookie header already present. An
 * object body is run through `interpolateObj` with `params` and serialized as
 * JSON; a string body is sent as-is. No body is sent for GET or HEAD.
 *
 * @param {object} recipe - Recorded request; `method`, `headers` and `body` are read.
 * @param {string} url - Request URL.
 * @param {Array<{name: string, value: string}>} cookies - Cookies to send.
 * @param {object} authHeaders - Headers that override `recipe.headers`.
 * @param {object} params - Values substituted into an object body.
 * @returns {Promise<{status: number, data: *, trace_id: string}>} The response
 *   status with the body parsed as JSON when possible (raw text otherwise).
 *   A failed fetch resolves to `status: 0` with `data.error` set; it does not reject.
 */
async function replayRecipe(recipe, url, cookies, authHeaders, params) {
  const headers = { ...recipe.headers, ...authHeaders };
  if (cookies.length > 0) {
    // Header names are case-insensitive: a recorded "Cookie" key would
    // otherwise be merged with the fresh "cookie" value by fetch.
    for (const headerName of Object.keys(headers)) {
      if (headerName.toLowerCase() === "cookie") {
        delete headers[headerName];
      }
    }
    headers["cookie"] = cookies.map((c) => {
      // Strip one pair of surrounding double quotes; the length check keeps a
      // lone '"' from collapsing to an empty value.
      const isQuoted = c.value.length >= 2 && c.value.startsWith('"') && c.value.endsWith('"');
      const v = isQuoted ? c.value.slice(1, -1) : c.value;
      return `${c.name}=${v}`;
    }).join("; ");
  }
  // fetch rejects GET/HEAD requests that carry a body and defaults a missing
  // method to GET, so compare case-insensitively and treat "no method" as GET.
  const methodName = String(recipe.method ?? "GET").toUpperCase();
  const allowsBody = methodName !== "GET" && methodName !== "HEAD";
  let body;
  if (recipe.body !== undefined && allowsBody) {
    if (typeof recipe.body === "string") {
      body = recipe.body;
    } else if (recipe.body && typeof recipe.body === "object") {
      body = JSON.stringify(interpolateObj(recipe.body, params));
    }
  }
  try {
    const res = await fetch(url, {
      method: recipe.method,
      headers,
      body,
      redirect: "follow"
    });
    const text = await res.text();
    let data = text;
    try {
      data = JSON.parse(text);
    } catch {
      // Not JSON: hand the raw text back to the caller.
    }
    return { status: res.status, data, trace_id: nanoid6() };
  } catch (err) {
    return {
      status: 0,
      // The rejection value is not guaranteed to be an Error object.
      data: { error: err?.message || "network_error" },
      trace_id: nanoid6()
    };
  }
}
18249
/**
 * Compares a replayed response against a previously recorded signal.
 *
 * Checks run in order and the first failure wins: status code, minimum body
 * size, maximum body size, then expected top-level JSON keys. Body size is
 * the UTF-8 byte length of the string body, or of the JSON serialization for
 * any other value. The key check applies only when the body is a non-array
 * object.
 *
 * @param {{status: number, data: *}} result - Response to evaluate.
 * @param {object} signal - Expected shape; reads `status`, `byte_length_min`,
 *   `byte_length_max` and `json_top_keys`.
 * @returns {{match: boolean, reason?: string}} `{ match: true }` or a
 *   mismatch with a short reason.
 */
function matchResponseSignal(result, signal) {
  const mismatch = (reason) => ({ match: false, reason });

  if (result.status !== signal.status) {
    return mismatch(`status_changed: ${signal.status} → ${result.status}`);
  }

  const payload = result.data;
  const serialized = typeof payload === "string" ? payload : JSON.stringify(payload ?? null);
  const bodyLen = Buffer.byteLength(serialized);

  const minBytes = signal.byte_length_min;
  if (minBytes !== undefined && bodyLen < minBytes) {
    return mismatch(`body_shrunk: ${bodyLen}B < min ${minBytes}B`);
  }

  const maxBytes = signal.byte_length_max;
  if (maxBytes !== undefined && bodyLen > maxBytes) {
    return mismatch(`body_grew: ${bodyLen}B > max ${maxBytes}B`);
  }

  const isPlainObject = Boolean(payload) && typeof payload === "object" && !Array.isArray(payload);
  if (signal.json_top_keys && isPlainObject) {
    const present = new Set(Object.keys(payload));
    const missing = [];
    for (const key of signal.json_top_keys) {
      if (!present.has(key)) {
        missing.push(key);
      }
    }
    if (missing.length > 0) {
      // Only the first three missing keys are reported.
      return mismatch(`missing_top_keys: ${missing.slice(0, 3).join(",")}`);
    }
  }

  return { match: true };
}
18208
18269
  function stem(word) {
18209
18270
  if (word.endsWith("ies") && word.length > 4)
18210
18271
  return word.slice(0, -3) + "y";
@@ -19122,6 +19183,7 @@ var init_execution = __esm(async () => {
19122
19183
  init_client2();
19123
19184
  init_client2();
19124
19185
  init_retry();
19186
+ init_probe();
19125
19187
  init_domain();
19126
19188
  init_extraction();
19127
19189
  init_graph();
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "unbrowse",
3
- "version": "6.2.5",
3
+ "version": "6.3.0",
4
4
  "description": "Reverse-engineer any website into reusable API skills. Zero-dep single binary with embedded browser engine.",
5
5
  "type": "module",
6
6
  "bin": {