unbrowse 2.0.21 → 2.0.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "unbrowse",
3
- "version": "2.0.21",
3
+ "version": "2.0.23",
4
4
  "description": "Reverse-engineer any website into reusable API skills. npm CLI + local engine.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -38,6 +38,25 @@ export interface LaunchedProfileContext {
38
38
  tempDir: string;
39
39
  }
40
40
 
41
/**
 * Wait until `child` has actually terminated, or until `timeoutMs` elapses.
 *
 * Termination is detected through `exitCode` / `signalCode`. `child.killed` is
 * deliberately NOT consulted: it only records that `kill()` delivered a signal,
 * not that the process is gone. Callers send SIGTERM right before waiting, so
 * checking `killed` would skip the wait entirely.
 *
 * @param child     Process to wait for.
 * @param timeoutMs Upper bound on the wait in milliseconds (default 2000).
 * @returns A promise that resolves on exit or on timeout; it never rejects.
 */
async function waitForChildExit(child: ChildProcess, timeoutMs = 2_000): Promise<void> {
  // Already reaped: either a normal exit code or a terminating signal is recorded.
  if (child.exitCode !== null || child.signalCode !== null) return;
  await new Promise<void>((resolve) => {
    const onExit = (): void => {
      clearTimeout(timer);
      resolve();
    };
    const timer = setTimeout(() => {
      // Give up waiting; drop the listener so it does not linger on the child.
      child.removeListener("exit", onExit);
      resolve();
    }, timeoutMs);
    child.once("exit", onExit);
  });
}
51
+
52
/**
 * Best-effort recursive removal of a temporary directory.
 *
 * Any error raised by the removal is swallowed on purpose so that a failed
 * cleanup never fails the surrounding capture.
 *
 * @param dir Path of the directory to delete.
 */
function removeTempDirQuietly(dir: string): void {
  const removalOptions = { recursive: true, force: true };
  try {
    rmSync(dir, removalOptions);
  } catch {
    // best-effort cleanup; do not fail captures on temp profile removal
  }
}
59
+
41
60
  function resolveChromiumBinary(browserName: string): string | null {
42
61
  const macos = new Map<string, string>([
43
62
  ["Chrome", "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"],
@@ -296,13 +315,14 @@ export async function launchChromiumProfileContext(meta: BrowserAuthSourceMeta):
296
315
  };
297
316
  } catch (error) {
298
317
  try { child.kill("SIGTERM"); } catch {}
299
- rmSync(tempDir, { recursive: true, force: true });
318
+ removeTempDirQuietly(tempDir);
300
319
  throw error;
301
320
  }
302
321
  }
303
322
 
304
- export function cleanupProfileContext(ctx: LaunchedProfileContext | null | undefined): void {
323
+ export async function cleanupProfileContext(ctx: LaunchedProfileContext | null | undefined): Promise<void> {
305
324
  if (!ctx) return;
306
325
  try { ctx.child.kill("SIGTERM"); } catch {}
307
- rmSync(ctx.tempDir, { recursive: true, force: true });
326
+ await waitForChildExit(ctx.child);
327
+ removeTempDirQuietly(ctx.tempDir);
308
328
  }
@@ -268,6 +268,20 @@ export function blockedAppShellErrorCode(
268
268
  return hasAuth ? "blocked_app_shell" : "auth_required";
269
269
  }
270
270
 
271
/**
 * Decide whether a capture can return early because the page HTML already
 * embeds the payload the caller is asking for.
 *
 * Currently this recognises one case: a LinkedIn feed page whose HTML contains
 * the `voyagerFeedDashMainFeed` marker, requested with a feed-like intent.
 *
 * The host and path are checked on the parsed URL rather than the raw string,
 * so a query string or fragment after `/feed` does not defeat the match and a
 * `linkedin.com/feed/` substring inside another site's query string does not
 * trigger it. Input that is not an absolute http(s) URL falls back to the
 * raw-string check with the query/fragment stripped.
 *
 * @param url    URL being captured.
 * @param intent Free-text intent of the capture; matched case-insensitively.
 * @param html   Page HTML snapshot; when missing or empty the answer is `false`.
 * @returns `true` when the embedded payload is sufficient to skip the longer capture.
 */
export function shouldShortCircuitEmbeddedPayloadCapture(url: string, intent: string | undefined, html?: string): boolean {
  if (!html) return false;

  const feedPath = /\/feed(?:\/|$)/i;

  let parsed: URL | null = null;
  try {
    const candidate = new URL(url);
    // Only trust the parse for web URLs; "host:port/path" would parse as a bogus scheme.
    if (candidate.protocol === "http:" || candidate.protocol === "https:") parsed = candidate;
  } catch {
    parsed = null;
  }

  let onLinkedInFeed: boolean;
  if (parsed) {
    const host = parsed.hostname;
    const isLinkedInHost = host === "linkedin.com" || host.endsWith(".linkedin.com");
    onLinkedInFeed = isLinkedInHost && feedPath.test(parsed.pathname);
  } else {
    // Scheme-less or otherwise unparseable input: keep the lenient string match,
    // but ignore anything after the query/fragment delimiter.
    const bare = url.split(/[?#]/, 1)[0] ?? "";
    onLinkedInFeed = /linkedin\.com/i.test(bare) && feedPath.test(bare);
  }
  if (!onLinkedInFeed) return false;

  const lowerIntent = intent?.toLowerCase() ?? "";
  if (!/\b(feed|timeline|stream|post|posts|update|updates|home)\b/.test(lowerIntent)) return false;

  return /voyagerFeedDashMainFeed/.test(html);
}
284
+
271
285
  function shouldRetryEphemeralProfileError(error: unknown): boolean {
272
286
  const message = error instanceof Error ? error.message : String(error ?? "");
273
287
  return /persistentcontext|target page, context or browser has been closed|browser has been closed|page has been closed/i.test(message);
@@ -930,16 +944,23 @@ export async function captureSession(
930
944
  }
931
945
  await kuri.stop();
932
946
  kuri.useExternalChrome(browserCdpBaseUrl(profileCtx.cdpUrl), { child: profileCtx.child, tempDir: profileCtx.tempDir });
947
+ let nestedResult: CaptureResult | null = null;
933
948
  try {
934
- return await captureSession(url, undefined, undefined, intent, {
949
+ nestedResult = await captureSession(url, undefined, undefined, intent, {
935
950
  ...options,
936
951
  forceEphemeral: true,
937
952
  usedProfileContext: true,
938
953
  preferExistingTab,
939
954
  authStrategy: "header-replay",
940
955
  });
956
+ return nestedResult;
941
957
  } finally {
942
- await kuri.stop();
958
+ try {
959
+ await kuri.stop();
960
+ } catch (stopErr) {
961
+ log("capture", `profile-context cleanup failed for ${url}: ${stopErr instanceof Error ? stopErr.message : String(stopErr)}`);
962
+ if (!nestedResult) throw stopErr;
963
+ }
943
964
  }
944
965
  } catch (attachErr) {
945
966
  log("capture", `forced profile context failed for ${url}: ${attachErr instanceof Error ? attachErr.message : String(attachErr)}`);
@@ -1081,6 +1102,40 @@ export async function captureSession(
1081
1102
  await kuri.evaluate(tabId, INTERCEPTOR_SCRIPT);
1082
1103
  } catch { /* page may not be ready */ }
1083
1104
 
1105
+ // For pages that embed the task payload directly in the HTML, return before
1106
+ // the longer network/intercept wait. This avoids losing useful captures to
1107
+ // later browser-engine instability on auth-gated SPAs like LinkedIn feed.
1108
+ try {
1109
+ await sleep(1_500, signal);
1110
+ throwIfAborted(signal);
1111
+ const earlyHtml = await kuri.getPageHtml(tabId);
1112
+ if (shouldShortCircuitEmbeddedPayloadCapture(url, intent, earlyHtml)) {
1113
+ let final_url = url;
1114
+ try {
1115
+ const rawUrl = await kuri.getCurrentUrl(tabId);
1116
+ final_url = typeof rawUrl === "string" ? rawUrl : String(rawUrl ?? url);
1117
+ try { new URL(final_url); } catch { final_url = url; }
1118
+ } catch {
1119
+ final_url = url;
1120
+ }
1121
+ lastHtml = earlyHtml;
1122
+ const rawCookies = await extractCookiesFromPage(tabId, url);
1123
+ const sessionCookies = filterFirstPartySessionCookies(rawCookies, url, final_url);
1124
+ log("capture", `short-circuiting embedded payload capture for ${url}`);
1125
+ return {
1126
+ requests: [],
1127
+ har_lineage_id: nanoid(),
1128
+ domain,
1129
+ cookies: sessionCookies,
1130
+ final_url,
1131
+ html: earlyHtml,
1132
+ js_bundles: new Map(),
1133
+ };
1134
+ }
1135
+ } catch {
1136
+ // fall through to the longer capture path
1137
+ }
1138
+
1084
1139
  // Build response bodies map from intercepted requests
1085
1140
  const responseBodies = new Map<string, string>();
1086
1141
  const jsBundleBodies = new Map<string, string>();
@@ -27,7 +27,7 @@ import { buildSkillOperationGraph, inferEndpointSemantic, resolveEndpointSemanti
27
27
  import { augmentEndpointsWithAgent } from "../graph/agent-augment.js";
28
28
  import { log } from "../logger.js";
29
29
  import { TRACE_VERSION } from "../version.js";
30
- import { buildQueryBindingMap, extractTemplateQueryBindings, mergeContextTemplateParams } from "../template-params.js";
30
+ import { buildQueryBindingMap, buildTemplatedQuery, extractTemplateQueryBindings, extractTemplateVariables, mergeContextTemplateParams, parseStructuredQueryTuple } from "../template-params.js";
31
31
  import { assessIntentResult, projectIntentData } from "../intent-match.js";
32
32
  import * as cheerio from "cheerio";
33
33
 
@@ -782,59 +782,93 @@ function buildLinkedInEmbeddedFeedCapture(
782
782
  return {};
783
783
  }
784
784
 
785
- try {
786
- const $ = cheerio.load(html);
787
- let metadata: {
788
- request?: string;
789
- method?: string;
790
- headers?: Record<string, string>;
791
- body?: string;
792
- } | null = null;
793
-
794
- $("code").each((_, el) => {
795
- if (metadata) return;
796
- const text = $(el).text().trim();
797
- if (!/voyagerFeedDashMainFeed/.test(text)) return;
798
- if (!/"request":"\/voyager\/api\/graphql/.test(text)) return;
785
+ const $ = cheerio.load(html);
786
+ let metadata: {
787
+ request?: string;
788
+ method?: string;
789
+ headers?: Record<string, string>;
790
+ body?: string;
791
+ } | null = null;
792
+
793
+ $("code").each((_, el) => {
794
+ if (metadata) return;
795
+ const text = $(el).text().trim();
796
+ if (!/voyagerFeedDashMainFeed/.test(text)) return;
797
+ if (!/"request":"\/voyager\/api\/graphql/.test(text)) return;
798
+ try {
799
799
  metadata = JSON.parse(text);
800
- });
801
- if (!metadata?.body) return {};
802
-
803
- let payloadText = "";
804
- $("code").each((_, el) => {
805
- if (payloadText) return;
806
- const id = $(el).attr("id");
807
- if (id !== metadata?.body) return;
808
- payloadText = $(el).text().trim();
809
- });
810
- if (!payloadText) return {};
811
-
812
- const payload = JSON.parse(payloadText);
813
- const semanticAssessment = assessIntentResult(payload, intent);
814
- if (semanticAssessment.verdict === "fail") {
815
- return { quality_note: semanticAssessment.reason };
800
+ } catch {
801
+ metadata = null;
816
802
  }
803
+ });
804
+ if (!metadata?.body) return {};
805
+
806
+ let payloadText = "";
807
+ $("code").each((_, el) => {
808
+ if (payloadText) return;
809
+ const id = $(el).attr("id");
810
+ if (id !== metadata?.body) return;
811
+ payloadText = $(el).text().trim();
812
+ });
813
+ if (!payloadText) return {};
817
814
 
818
- const requestUrl = metadata.request?.startsWith("http")
819
- ? metadata.request
820
- : `https://www.linkedin.com${metadata.request?.startsWith("/") ? "" : "/"}${metadata.request ?? ""}`;
821
- if (!requestUrl || requestUrl === "https://www.linkedin.com/") return {};
815
+ let payload: unknown;
816
+ try {
817
+ payload = JSON.parse(payloadText);
818
+ } catch {
819
+ return {};
820
+ }
822
821
 
823
- const endpoint: EndpointDescriptor = {
824
- endpoint_id: nanoid(),
825
- method: (metadata.method ?? "GET").toUpperCase() as EndpointDescriptor["method"],
826
- url_template: requestUrl,
827
- exec_strategy: "trigger-intercept",
828
- idempotency: "safe",
829
- verification_status: "verified",
830
- reliability_score: 0.95,
831
- description: `Embedded LinkedIn feed payload for ${intent}`,
832
- response_schema: inferSchema([payload]),
833
- trigger_url: url,
834
- ...(metadata.headers && Object.keys(metadata.headers).length > 0
835
- ? { headers_template: metadata.headers }
836
- : {}),
837
- };
822
+ const semanticAssessment = assessIntentResult(payload, intent);
823
+ if (semanticAssessment.verdict === "fail") {
824
+ return { quality_note: semanticAssessment.reason };
825
+ }
826
+
827
+ const requestUrl = metadata.request?.startsWith("http")
828
+ ? metadata.request
829
+ : `https://www.linkedin.com${metadata.request?.startsWith("/") ? "" : "/"}${metadata.request ?? ""}`;
830
+ if (!requestUrl || requestUrl === "https://www.linkedin.com/") return {};
831
+
832
+ const queryDefaults = (() => {
833
+ try {
834
+ return Object.fromEntries(new URL(requestUrl).searchParams.entries());
835
+ } catch {
836
+ return {} as Record<string, string>;
837
+ }
838
+ })();
839
+ let urlTemplate = requestUrl;
840
+ try {
841
+ const parsed = new URL(requestUrl);
842
+ const templatedQuery = buildTemplatedQuery(queryDefaults);
843
+ const query = Object.entries(templatedQuery)
844
+ .map(([key, value]) => `${encodeURIComponent(key)}=${value}`)
845
+ .join("&");
846
+ urlTemplate = query ? `${parsed.origin}${parsed.pathname}?${query}` : `${parsed.origin}${parsed.pathname}`;
847
+ } catch {
848
+ urlTemplate = requestUrl;
849
+ }
850
+
851
+ const endpoint: EndpointDescriptor = {
852
+ endpoint_id: nanoid(),
853
+ method: (metadata.method ?? "GET").toUpperCase() as EndpointDescriptor["method"],
854
+ url_template: urlTemplate,
855
+ exec_strategy: "trigger-intercept",
856
+ idempotency: "safe",
857
+ verification_status: "verified",
858
+ reliability_score: 0.95,
859
+ description: `Embedded LinkedIn feed payload for ${intent}`,
860
+ trigger_url: url,
861
+ ...(Object.keys(queryDefaults).length > 0 ? { query: queryDefaults } : {}),
862
+ ...(metadata.headers && Object.keys(metadata.headers).length > 0
863
+ ? { headers_template: metadata.headers }
864
+ : {}),
865
+ };
866
+ try {
867
+ endpoint.response_schema = inferSchema([payload]);
868
+ } catch {
869
+ // keep embedded endpoint even if schema inference chokes on the payload
870
+ }
871
+ try {
838
872
  endpoint.semantic = {
839
873
  ...inferEndpointSemantic(endpoint, {
840
874
  sampleResponse: payload,
@@ -844,21 +878,21 @@ function buildLinkedInEmbeddedFeedCapture(
844
878
  }),
845
879
  ...(authRequired ? { auth_required: true } : {}),
846
880
  };
847
-
848
- return {
849
- endpoint,
850
- result: {
851
- data: payload,
852
- _extraction: {
853
- method: "linkedin-embedded-feed",
854
- confidence: 0.95,
855
- source: "html-embedded",
856
- },
857
- },
858
- };
859
881
  } catch {
860
- return {};
882
+ endpoint.semantic = authRequired ? { action_kind: "timeline", resource_kind: "post", auth_required: true } : undefined;
861
883
  }
884
+
885
+ return {
886
+ endpoint,
887
+ result: {
888
+ data: payload,
889
+ _extraction: {
890
+ method: "linkedin-embedded-feed",
891
+ confidence: 0.95,
892
+ source: "html-embedded",
893
+ },
894
+ },
895
+ };
862
896
  }
863
897
 
864
898
  export function buildPageArtifactCapture(
@@ -2380,6 +2414,7 @@ export async function executeEndpoint(
2380
2414
  }
2381
2415
  }
2382
2416
  }
2417
+ applyStructuredQueryDefaults(mergedParams, endpoint.url_template, endpoint.query);
2383
2418
 
2384
2419
  // Merge captured query params into URL — user params override endpoint defaults
2385
2420
  let urlTemplate = resolveExecutionUrlTemplate(endpoint, options?.contextUrl);
@@ -2388,12 +2423,23 @@ export async function executeEndpoint(
2388
2423
  const u = new URL(urlTemplate);
2389
2424
  const queryBindings = extractTemplateQueryBindings(endpoint.url_template);
2390
2425
  for (const [k, v] of Object.entries(endpoint.query)) {
2426
+ const currentTemplateValue = u.searchParams.get(k) ?? "";
2427
+ const structuredOverride = typeof v === "string"
2428
+ ? mergeStructuredQueryValue(currentTemplateValue, v, mergedParams)
2429
+ : null;
2430
+ const hasStructuredPlaceholders = parseStructuredQueryTuple(currentTemplateValue)?.some((entry) =>
2431
+ extractTemplateVariables(entry.value).length > 0
2432
+ ) ?? false;
2391
2433
  const bindingKey = queryBindings[k];
2392
2434
  // User params override captured query defaults
2393
2435
  if (bindingKey && mergedParams[bindingKey] != null) {
2394
2436
  u.searchParams.set(k, String(mergedParams[bindingKey]));
2395
2437
  } else if (mergedParams[k] != null) {
2396
2438
  u.searchParams.set(k, String(mergedParams[k]));
2439
+ } else if (structuredOverride) {
2440
+ u.searchParams.set(k, structuredOverride);
2441
+ } else if (hasStructuredPlaceholders) {
2442
+ continue;
2397
2443
  } else if (v != null) {
2398
2444
  u.searchParams.set(k, String(v));
2399
2445
  }
@@ -2416,6 +2462,13 @@ export async function executeEndpoint(
2416
2462
  ...Object.keys(endpoint.path_params ?? {}),
2417
2463
  ...Object.keys(endpoint.query ?? {}),
2418
2464
  ]);
2465
+ for (const value of Object.values(endpoint.query ?? {})) {
2466
+ if (typeof value !== "string") continue;
2467
+ for (const entry of parseStructuredQueryTuple(value) ?? []) {
2468
+ consumedKeys.add(entry.key);
2469
+ for (const placeholder of extractTemplateVariables(entry.value)) consumedKeys.add(placeholder);
2470
+ }
2471
+ }
2419
2472
  for (const [rawKey, bindingKey] of Object.entries(extractTemplateQueryBindings(endpoint.url_template))) {
2420
2473
  consumedKeys.add(rawKey);
2421
2474
  consumedKeys.add(bindingKey);
@@ -2891,6 +2944,68 @@ function interpolate(template: string, params: Record<string, unknown>): string
2891
2944
  return `${interpolatedBase}?${interpolatedQuery}`;
2892
2945
  }
2893
2946
 
2947
/**
 * Seed `mergedParams` with captured values for structured-tuple placeholders.
 *
 * For every string entry in `queryDefaults`, the same-named query parameter of
 * `urlTemplate` is parsed as a structured tuple. Wherever a template tuple entry
 * is exactly `{name}` and `mergedParams[name]` is not yet set, the value
 * captured under the same tuple key (if non-empty) is copied into
 * `mergedParams[name]`.
 *
 * @param mergedParams  Parameter bag, mutated in place.
 * @param urlTemplate   Endpoint URL template; if it cannot be parsed, nothing is changed.
 * @param queryDefaults Captured query values keyed by query-parameter name.
 */
function applyStructuredQueryDefaults(
  mergedParams: Record<string, unknown>,
  urlTemplate: string,
  queryDefaults?: Record<string, unknown>,
): void {
  const nothingCaptured = !queryDefaults || Object.keys(queryDefaults).length === 0;
  if (nothingCaptured) return;

  try {
    const { searchParams } = new URL(urlTemplate);
    for (const [paramName, captured] of Object.entries(queryDefaults)) {
      if (typeof captured !== "string") continue;

      const templated = searchParams.get(paramName);
      if (!templated) continue;

      const templateEntries = parseStructuredQueryTuple(templated);
      const capturedEntries = parseStructuredQueryTuple(captured);
      // Both sides must be non-empty tuples for there to be anything to line up.
      if (!templateEntries?.length || !capturedEntries?.length) continue;

      const capturedByKey = new Map(capturedEntries.map((item) => [item.key, item.value]));
      for (const slot of templateEntries) {
        const placeholderName = slot.value.match(/^\{([^}]+)\}$/)?.[1];
        if (!placeholderName) continue;
        // A value already supplied for the placeholder always wins.
        if (mergedParams[placeholderName] != null) continue;

        const capturedValue = capturedByKey.get(slot.key);
        if (capturedValue != null && capturedValue !== "") {
          mergedParams[placeholderName] = capturedValue;
        }
      }
    }
  } catch {
    // ignore malformed template URL
  }
}
2974
+
2975
/**
 * Rebuild a structured-tuple query value with overrides and captured fallbacks applied.
 *
 * The tuple parsed from `currentValue` is used when parsing succeeds; otherwise
 * the tuple parsed from `fallbackValue` is used. For each entry, in order of
 * precedence:
 *   1. `mergedParams[<placeholder>]` when the entry value is exactly `{placeholder}`,
 *   2. `mergedParams[<entry key>]`,
 *   3. for placeholder entries only, the non-empty value captured under the
 *      same key in `fallbackValue`,
 *   4. otherwise the entry is kept unchanged.
 *
 * @param currentValue  Query value as it appears in the URL template.
 * @param fallbackValue Captured query value used as a source of defaults.
 * @param mergedParams  Parameters that may override tuple entries.
 * @returns The rewritten `(key:value,...)` string, or `null` when there is no
 *          usable tuple or no entry was changed.
 */
function mergeStructuredQueryValue(
  currentValue: string,
  fallbackValue: string | undefined,
  mergedParams: Record<string, unknown>,
): string | null {
  const fromTemplate = parseStructuredQueryTuple(currentValue);
  const fromFallback = fallbackValue ? parseStructuredQueryTuple(fallbackValue) : null;

  const entries = fromTemplate ?? fromFallback;
  if (!entries || entries.length === 0) return null;

  const capturedByKey = new Map((fromFallback ?? []).map((item) => [item.key, item.value]));

  const parts: string[] = [];
  let touched = false;
  for (const item of entries) {
    const placeholderName = item.value.match(/^\{([^}]+)\}$/)?.[1];
    const keyOverride = mergedParams[item.key];
    const namedOverride = placeholderName ? mergedParams[placeholderName] : undefined;
    const override = namedOverride ?? keyOverride;

    if (override != null) {
      touched = true;
      parts.push(`${item.key}:${String(override)}`);
      continue;
    }

    if (placeholderName) {
      const captured = capturedByKey.get(item.key);
      if (captured != null && captured !== "") {
        touched = true;
        parts.push(`${item.key}:${captured}`);
        continue;
      }
    }

    parts.push(`${item.key}:${item.value}`);
  }

  if (!touched) return null;
  return `(${parts.join(",")})`;
}
3008
+
2894
3009
  function interpolateObj(
2895
3010
  obj: Record<string, unknown>,
2896
3011
  params: Record<string, unknown>
@@ -65,6 +65,18 @@ let externalChromeOverride: {
65
65
  previousAttach?: string;
66
66
  } | null = null;
67
67
 
68
/**
 * Wait until `child` has actually terminated, or until `timeoutMs` elapses.
 *
 * Termination is detected through `exitCode` / `signalCode`. `child.killed` is
 * deliberately NOT consulted: it only records that `kill()` delivered a signal,
 * not that the process is gone, so a process that was just signalled would
 * otherwise never be waited for.
 *
 * @param child     Process to wait for; `null`/`undefined` resolves immediately.
 * @param timeoutMs Upper bound on the wait in milliseconds (default 2000).
 * @returns A promise that resolves on exit or on timeout; it never rejects.
 */
async function waitForChildExit(child: ChildProcess | null | undefined, timeoutMs = 2_000): Promise<void> {
  if (!child) return;
  // Already reaped: either a normal exit code or a terminating signal is recorded.
  if (child.exitCode !== null || child.signalCode !== null) return;
  await new Promise<void>((resolve) => {
    const onExit = (): void => {
      clearTimeout(timer);
      resolve();
    };
    const timer = setTimeout(() => {
      // Give up waiting; drop the listener so it does not linger on the child.
      child.removeListener("exit", onExit);
      resolve();
    }, timeoutMs);
    child.once("exit", onExit);
  });
}
79
+
68
80
  function kuriBinaryName(): string {
69
81
  return process.platform === "win32" ? "kuri.exe" : "kuri";
70
82
  }
@@ -426,7 +438,12 @@ export async function stop(): Promise<void> {
426
438
  // ignore
427
439
  }
428
440
  if (externalChromeOverride.tempDir) {
429
- rmSync(externalChromeOverride.tempDir, { recursive: true, force: true });
441
+ await waitForChildExit(externalChromeOverride.child);
442
+ try {
443
+ rmSync(externalChromeOverride.tempDir, { recursive: true, force: true });
444
+ } catch {
445
+ // best-effort cleanup; don't fail the caller on temp profile removal
446
+ }
430
447
  }
431
448
  if (externalChromeOverride.previousCdpUrl == null) delete process.env.CDP_URL;
432
449
  else process.env.CDP_URL = externalChromeOverride.previousCdpUrl;