unbrowse 2.0.21 → 2.0.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -325,6 +325,7 @@ import { existsSync as existsSync2, mkdirSync as mkdirSync2, realpathSync } from
325
325
  import os from "node:os";
326
326
  import path from "node:path";
327
327
  import { createRequire } from "node:module";
328
+ import { execFileSync } from "node:child_process";
328
329
  import { fileURLToPath } from "node:url";
329
330
  function getModuleDir(metaUrl) {
330
331
  return path.dirname(fileURLToPath(metaUrl));
@@ -349,19 +350,32 @@ function resolveSiblingEntrypoint(metaUrl, basename) {
349
350
  const file = fileURLToPath(metaUrl);
350
351
  return path.join(path.dirname(file), `${basename}${path.extname(file) || ".js"}`);
351
352
  }
352
- function runtimeArgsForEntrypoint(metaUrl, entrypoint) {
353
/**
 * Looks up an executable by name with the platform's PATH lookup tool
 * (`where` on Windows, `which` everywhere else).
 *
 * @param name Executable name to look up.
 * @returns The first non-blank, trimmed line printed by the lookup tool, or
 *          `null` when the tool fails or prints nothing usable.
 */
function resolveBinaryOnPath(name) {
  try {
    const lookupTool = process.platform === "win32" ? "where" : "which";
    // stderr is discarded so a failed lookup stays silent.
    const stdout = execFileSync(lookupTool, [name], { encoding: "utf8", stdio: ["ignore", "pipe", "ignore"] });
    for (const rawLine of stdout.split(/\r?\n/)) {
      const candidate = rawLine.trim();
      if (candidate)
        return candidate;
    }
    return null;
  } catch {
    // Lookup tool missing or exited non-zero: treat as "not found".
    return null;
  }
}
363
/**
 * Picks the executable and argument list used to launch `entrypoint`.
 *
 * Resolution order:
 *  1. Non-`.ts` entrypoints, or any entrypoint when already running under
 *     Bun, run on the current runtime (`process.execPath`) with no extra args.
 *  2. Otherwise a Bun binary is used if `BUN_BIN` is set or `bun` is found on
 *     PATH.
 *  3. Otherwise the current runtime is used with `--import <tsx loader>`,
 *     where the loader file is resolved relative to `metaUrl`.
 *  4. As a last resort the bare `tsx` specifier is passed to `--import`.
 *
 * @param metaUrl    Module URL used as the base for resolving `tsx`.
 * @param entrypoint Path of the script to launch.
 * @returns `{ command, args }` describing the invocation.
 */
function runtimeInvocationForEntrypoint(metaUrl, entrypoint) {
  const viaCurrentRuntime = (...leadingArgs) => ({ command: process.execPath, args: [...leadingArgs, entrypoint] });
  const isTypeScript = path.extname(entrypoint) === ".ts";
  if (!isTypeScript || process.versions.bun)
    return viaCurrentRuntime();
  const bunBinary = process.env.BUN_BIN || resolveBinaryOnPath("bun");
  if (bunBinary)
    return { command: bunBinary, args: [entrypoint] };
  try {
    const tsxPkg = createRequire(metaUrl).resolve("tsx/package.json");
    const tsxLoader = path.join(path.dirname(tsxPkg), "dist", "loader.mjs");
    if (existsSync2(tsxLoader))
      return viaCurrentRuntime("--import", tsxLoader);
  } catch {}
  return viaCurrentRuntime("--import", "tsx");
}
366
380
  function getUnbrowseHome() {
367
381
  return path.join(os.homedir(), ".unbrowse");
@@ -560,7 +574,7 @@ async function maybeAutoUpdate(metaUrl, overrides = {}) {
560
574
  import { closeSync, openSync, readFileSync as readFileSync4, statSync, unlinkSync, writeFileSync as writeFileSync2 } from "node:fs";
561
575
  import path3 from "node:path";
562
576
  import { spawn } from "node:child_process";
563
- import { execFileSync } from "node:child_process";
577
+ import { execFileSync as execFileSync2 } from "node:child_process";
564
578
 
565
579
  // ../../src/version.ts
566
580
  import { createHash } from "crypto";
@@ -671,7 +685,7 @@ function findListeningPid(baseUrl) {
671
685
  try {
672
686
  const url = new URL(baseUrl);
673
687
  const port = url.port || (url.protocol === "https:" ? "443" : "80");
674
- const output = execFileSync("lsof", ["-nP", `-iTCP:${port}`, "-sTCP:LISTEN", "-t"], {
688
+ const output = execFileSync2("lsof", ["-nP", `-iTCP:${port}`, "-sTCP:LISTEN", "-t"], {
675
689
  encoding: "utf8",
676
690
  stdio: ["ignore", "pipe", "ignore"]
677
691
  }).trim();
@@ -683,7 +697,7 @@ function findListeningPid(baseUrl) {
683
697
  }
684
698
  function readProcessCommand(pid) {
685
699
  try {
686
- return execFileSync("ps", ["-o", "command=", "-p", String(pid)], {
700
+ return execFileSync2("ps", ["-o", "command=", "-p", String(pid)], {
687
701
  encoding: "utf8",
688
702
  stdio: ["ignore", "pipe", "ignore"]
689
703
  }).trim();
@@ -791,7 +805,8 @@ async function ensureLocalServer(baseUrl, noAutoStart, metaUrl) {
791
805
  const logFile = getServerAutostartLogFile();
792
806
  ensureDir(path3.dirname(logFile));
793
807
  const logFd = openSync(logFile, "a");
794
- const child = spawn(process.execPath, runtimeArgsForEntrypoint(metaUrl, entrypoint), {
808
+ const runtime = runtimeInvocationForEntrypoint(metaUrl, entrypoint);
809
+ const child = spawn(runtime.command, runtime.args, {
795
810
  cwd: packageRoot,
796
811
  detached: true,
797
812
  stdio: ["ignore", logFd, logFd],
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "unbrowse",
3
- "version": "2.0.21",
3
+ "version": "2.0.22",
4
4
  "description": "Reverse-engineer any website into reusable API skills. npm CLI + local engine.",
5
5
  "type": "module",
6
6
  "bin": {
@@ -38,6 +38,25 @@ export interface LaunchedProfileContext {
38
38
  tempDir: string;
39
39
  }
40
40
 
41
/**
 * Resolves once `child` has exited or `timeoutMs` has elapsed, whichever
 * happens first. Never rejects.
 *
 * Termination is detected through `exitCode` / `signalCode`. `child.killed`
 * is deliberately NOT consulted: it only records that `kill()` delivered a
 * signal, not that the process has gone away, so checking it would skip the
 * wait in exactly the "just sent SIGTERM" case this helper exists for.
 *
 * @param child     Child process to wait on.
 * @param timeoutMs Upper bound on the wait, in milliseconds.
 */
async function waitForChildExit(child: ChildProcess, timeoutMs = 2_000): Promise<void> {
  if (child.exitCode !== null || child.signalCode !== null) return;
  await new Promise<void>((resolve) => {
    const onExit = (): void => {
      clearTimeout(timer);
      resolve();
    };
    const timer = setTimeout(() => {
      // Give up waiting; detach the listener so it does not linger on the child.
      child.off("exit", onExit);
      resolve();
    }, timeoutMs);
    child.once("exit", onExit);
  });
}
51
+
52
/**
 * Recursively deletes `dir`, ignoring every failure.
 *
 * @param dir Directory to remove; a missing path is not an error (`force`).
 */
function removeTempDirQuietly(dir: string): void {
  const removalOptions = { recursive: true, force: true };
  try {
    rmSync(dir, removalOptions);
  } catch {
    // Best-effort only: a temp profile that cannot be removed must not fail a capture.
  }
}
59
+
41
60
  function resolveChromiumBinary(browserName: string): string | null {
42
61
  const macos = new Map<string, string>([
43
62
  ["Chrome", "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"],
@@ -296,13 +315,14 @@ export async function launchChromiumProfileContext(meta: BrowserAuthSourceMeta):
296
315
  };
297
316
  } catch (error) {
298
317
  try { child.kill("SIGTERM"); } catch {}
299
- rmSync(tempDir, { recursive: true, force: true });
318
+ removeTempDirQuietly(tempDir);
300
319
  throw error;
301
320
  }
302
321
  }
303
322
 
304
/**
 * Tears down a launched profile context.
 *
 * Sends SIGTERM to the context's child process (errors from `kill` are
 * ignored), awaits `waitForChildExit` on that child, and then removes the
 * context's temp directory via `removeTempDirQuietly`.
 *
 * @param ctx Context to clean up; `null` / `undefined` is a no-op.
 */
export async function cleanupProfileContext(ctx: LaunchedProfileContext | null | undefined): Promise<void> {
  if (ctx == null) return;
  const { child, tempDir } = ctx;
  try {
    child.kill("SIGTERM");
  } catch {
    // Ignored: cleanup continues even if the signal cannot be sent.
  }
  await waitForChildExit(child);
  removeTempDirQuietly(tempDir);
}
@@ -268,6 +268,20 @@ export function blockedAppShellErrorCode(
268
268
  return hasAuth ? "blocked_app_shell" : "auth_required";
269
269
  }
270
270
 
271
/**
 * Decides whether a capture can return early because the page HTML already
 * embeds the payload the caller is after.
 *
 * Currently this only fires for the LinkedIn feed, when all of these hold:
 *  - `html` is non-empty and contains `voyagerFeedDashMainFeed`;
 *  - the URL's host is `linkedin.com` or one of its subdomains;
 *  - the URL's path contains a `/feed` segment;
 *  - `intent` mentions a feed-like word (feed, timeline, posts, updates, ...).
 *
 * Host and path are checked on the parsed URL rather than on the raw string,
 * so look-alike hosts (`notlinkedin.com`) or `linkedin.com` appearing in a
 * query string on another host do not match, and `/feed?query` does.
 *
 * @param url    Page URL; a scheme-less value is parsed as `https://<url>`.
 * @param intent Free-text description of what the caller wants, if any.
 * @param html   Page HTML captured so far.
 * @returns `true` when the early-return path should be taken.
 */
export function shouldShortCircuitEmbeddedPayloadCapture(url: string, intent: string | undefined, html?: string): boolean {
  if (!html) return false;

  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    try {
      // Tolerate scheme-less inputs such as "linkedin.com/feed/".
      parsed = new URL(`https://${url}`);
    } catch {
      return false;
    }
  }

  const host = parsed.hostname.toLowerCase();
  const isLinkedInHost = host === "linkedin.com" || host.endsWith(".linkedin.com");
  if (!isLinkedInHost) return false;

  // Match on the path only, so a query string or fragment directly after "/feed" is fine.
  if (!/\/feed(?:\/|$)/i.test(parsed.pathname)) return false;

  const lowerIntent = intent?.toLowerCase() ?? "";
  if (!/\b(feed|timeline|stream|post|posts|update|updates|home)\b/.test(lowerIntent)) return false;

  return /voyagerFeedDashMainFeed/.test(html);
}
284
+
271
285
  function shouldRetryEphemeralProfileError(error: unknown): boolean {
272
286
  const message = error instanceof Error ? error.message : String(error ?? "");
273
287
  return /persistentcontext|target page, context or browser has been closed|browser has been closed|page has been closed/i.test(message);
@@ -930,16 +944,23 @@ export async function captureSession(
930
944
  }
931
945
  await kuri.stop();
932
946
  kuri.useExternalChrome(browserCdpBaseUrl(profileCtx.cdpUrl), { child: profileCtx.child, tempDir: profileCtx.tempDir });
947
+ let nestedResult: CaptureResult | null = null;
933
948
  try {
934
- return await captureSession(url, undefined, undefined, intent, {
949
+ nestedResult = await captureSession(url, undefined, undefined, intent, {
935
950
  ...options,
936
951
  forceEphemeral: true,
937
952
  usedProfileContext: true,
938
953
  preferExistingTab,
939
954
  authStrategy: "header-replay",
940
955
  });
956
+ return nestedResult;
941
957
  } finally {
942
- await kuri.stop();
958
+ try {
959
+ await kuri.stop();
960
+ } catch (stopErr) {
961
+ log("capture", `profile-context cleanup failed for ${url}: ${stopErr instanceof Error ? stopErr.message : String(stopErr)}`);
962
+ if (!nestedResult) throw stopErr;
963
+ }
943
964
  }
944
965
  } catch (attachErr) {
945
966
  log("capture", `forced profile context failed for ${url}: ${attachErr instanceof Error ? attachErr.message : String(attachErr)}`);
@@ -1081,6 +1102,40 @@ export async function captureSession(
1081
1102
  await kuri.evaluate(tabId, INTERCEPTOR_SCRIPT);
1082
1103
  } catch { /* page may not be ready */ }
1083
1104
 
1105
+ // For pages that embed the task payload directly in the HTML, return before
1106
+ // the longer network/intercept wait. This avoids losing useful captures to
1107
+ // later browser-engine instability on auth-gated SPAs like LinkedIn feed.
1108
+ try {
1109
+ await sleep(1_500, signal);
1110
+ throwIfAborted(signal);
1111
+ const earlyHtml = await kuri.getPageHtml(tabId);
1112
+ if (shouldShortCircuitEmbeddedPayloadCapture(url, intent, earlyHtml)) {
1113
+ let final_url = url;
1114
+ try {
1115
+ const rawUrl = await kuri.getCurrentUrl(tabId);
1116
+ final_url = typeof rawUrl === "string" ? rawUrl : String(rawUrl ?? url);
1117
+ try { new URL(final_url); } catch { final_url = url; }
1118
+ } catch {
1119
+ final_url = url;
1120
+ }
1121
+ lastHtml = earlyHtml;
1122
+ const rawCookies = await extractCookiesFromPage(tabId, url);
1123
+ const sessionCookies = filterFirstPartySessionCookies(rawCookies, url, final_url);
1124
+ log("capture", `short-circuiting embedded payload capture for ${url}`);
1125
+ return {
1126
+ requests: [],
1127
+ har_lineage_id: nanoid(),
1128
+ domain,
1129
+ cookies: sessionCookies,
1130
+ final_url,
1131
+ html: earlyHtml,
1132
+ js_bundles: new Map(),
1133
+ };
1134
+ }
1135
+ } catch {
1136
+ // fall through to the longer capture path
1137
+ }
1138
+
1084
1139
  // Build response bodies map from intercepted requests
1085
1140
  const responseBodies = new Map<string, string>();
1086
1141
  const jsBundleBodies = new Map<string, string>();
@@ -65,6 +65,18 @@ let externalChromeOverride: {
65
65
  previousAttach?: string;
66
66
  } | null = null;
67
67
 
68
/**
 * Resolves once `child` has exited or `timeoutMs` has elapsed, whichever
 * happens first. Never rejects. A missing child resolves immediately.
 *
 * Termination is detected through `exitCode` / `signalCode`. `child.killed`
 * is deliberately NOT consulted: it only records that `kill()` delivered a
 * signal, not that the process has gone away, so checking it would skip the
 * wait right after a signal has been sent.
 *
 * @param child     Child process to wait on, if any.
 * @param timeoutMs Upper bound on the wait, in milliseconds.
 */
async function waitForChildExit(child: ChildProcess | null | undefined, timeoutMs = 2_000): Promise<void> {
  if (!child) return;
  if (child.exitCode !== null || child.signalCode !== null) return;
  await new Promise<void>((resolve) => {
    const onExit = (): void => {
      clearTimeout(timer);
      resolve();
    };
    const timer = setTimeout(() => {
      // Give up waiting; detach the listener so it does not linger on the child.
      child.off("exit", onExit);
      resolve();
    }, timeoutMs);
    child.once("exit", onExit);
  });
}
79
+
68
80
  function kuriBinaryName(): string {
69
81
  return process.platform === "win32" ? "kuri.exe" : "kuri";
70
82
  }
@@ -426,7 +438,12 @@ export async function stop(): Promise<void> {
426
438
  // ignore
427
439
  }
428
440
  if (externalChromeOverride.tempDir) {
429
- rmSync(externalChromeOverride.tempDir, { recursive: true, force: true });
441
+ await waitForChildExit(externalChromeOverride.child);
442
+ try {
443
+ rmSync(externalChromeOverride.tempDir, { recursive: true, force: true });
444
+ } catch {
445
+ // best-effort cleanup; don't fail the caller on temp profile removal
446
+ }
430
447
  }
431
448
  if (externalChromeOverride.previousCdpUrl == null) delete process.env.CDP_URL;
432
449
  else process.env.CDP_URL = externalChromeOverride.previousCdpUrl;
@@ -2,7 +2,7 @@ import { closeSync, openSync, readFileSync, statSync, unlinkSync, writeFileSync
2
2
  import path from "node:path";
3
3
  import { spawn } from "node:child_process";
4
4
  import { execFileSync } from "node:child_process";
5
- import { ensureDir, getPackageRoot, getServerAutostartLogFile, getServerPidFile, resolveSiblingEntrypoint, runtimeArgsForEntrypoint } from "./paths.js";
5
+ import { ensureDir, getPackageRoot, getServerAutostartLogFile, getServerPidFile, resolveSiblingEntrypoint, runtimeInvocationForEntrypoint } from "./paths.js";
6
6
  import { CODE_HASH } from "../version.js";
7
7
 
8
8
  type PidState = {
@@ -222,7 +222,8 @@ export async function ensureLocalServer(baseUrl: string, noAutoStart: boolean, m
222
222
  const logFile = getServerAutostartLogFile();
223
223
  ensureDir(path.dirname(logFile));
224
224
  const logFd = openSync(logFile, "a");
225
- const child = spawn(process.execPath, runtimeArgsForEntrypoint(metaUrl, entrypoint), {
225
+ const runtime = runtimeInvocationForEntrypoint(metaUrl, entrypoint);
226
+ const child = spawn(runtime.command, runtime.args, {
226
227
  cwd: packageRoot,
227
228
  detached: true,
228
229
  stdio: ["ignore", logFd, logFd],
@@ -2,6 +2,7 @@ import { existsSync, mkdirSync, realpathSync } from "node:fs";
2
2
  import os from "node:os";
3
3
  import path from "node:path";
4
4
  import { createRequire } from "node:module";
5
+ import { execFileSync } from "node:child_process";
5
6
  import { fileURLToPath } from "node:url";
6
7
 
7
8
  export function getModuleDir(metaUrl: string): string {
@@ -29,20 +30,38 @@ export function resolveSiblingEntrypoint(metaUrl: string, basename: string): str
29
30
  return path.join(path.dirname(file), `${basename}${path.extname(file) || ".js"}`);
30
31
  }
31
32
 
32
- export function runtimeArgsForEntrypoint(metaUrl: string, entrypoint: string): string[] {
33
- if (path.extname(entrypoint) !== ".ts") return [entrypoint];
34
- if (process.versions.bun) return [entrypoint];
33
/**
 * Looks up an executable by name with the platform's PATH lookup tool
 * (`where` on Windows, `which` everywhere else).
 *
 * @param name Executable name to look up.
 * @returns The first non-blank, trimmed line printed by the lookup tool, or
 *          `null` when the tool fails or prints nothing usable.
 */
function resolveBinaryOnPath(name: string): string | null {
  try {
    const lookupTool = process.platform === "win32" ? "where" : "which";
    // stderr is discarded so a failed lookup stays silent.
    const stdout = execFileSync(lookupTool, [name], { encoding: "utf8", stdio: ["ignore", "pipe", "ignore"] });
    for (const rawLine of stdout.split(/\r?\n/)) {
      const candidate = rawLine.trim();
      if (candidate) return candidate;
    }
    return null;
  } catch {
    // Lookup tool missing or exited non-zero: treat as "not found".
    return null;
  }
}
43
+
44
/**
 * Picks the executable and argument list used to launch `entrypoint`.
 *
 * Resolution order:
 *  1. Non-`.ts` entrypoints, or any entrypoint when already running under
 *     Bun, run on the current runtime (`process.execPath`) with no extra args.
 *  2. Otherwise a Bun binary is used if `BUN_BIN` is set or `bun` is found on
 *     PATH.
 *  3. Otherwise the current runtime is used with `--import <tsx loader>`,
 *     where the loader file is resolved relative to `metaUrl`.
 *  4. As a last resort the bare `tsx` specifier is passed to `--import`.
 *
 * @param metaUrl    Module URL used as the base for resolving `tsx`.
 * @param entrypoint Path of the script to launch.
 * @returns The command to spawn and the arguments to pass to it.
 */
export function runtimeInvocationForEntrypoint(metaUrl: string, entrypoint: string): { command: string; args: string[] } {
  const viaCurrentRuntime = (...leadingArgs: string[]): { command: string; args: string[] } => ({
    command: process.execPath,
    args: [...leadingArgs, entrypoint],
  });

  const isTypeScript = path.extname(entrypoint) === ".ts";
  if (!isTypeScript || process.versions.bun) return viaCurrentRuntime();

  const bunBinary = process.env.BUN_BIN || resolveBinaryOnPath("bun");
  if (bunBinary) return { command: bunBinary, args: [entrypoint] };

  try {
    const tsxPkg = createRequire(metaUrl).resolve("tsx/package.json");
    const tsxLoader = path.join(path.dirname(tsxPkg), "dist", "loader.mjs");
    if (existsSync(tsxLoader)) return viaCurrentRuntime("--import", tsxLoader);
  } catch {
    // tsx could not be resolved from here; use the bare specifier below.
  }

  return viaCurrentRuntime("--import", "tsx");
}
62
+
63
/**
 * Returns only the argument list chosen by `runtimeInvocationForEntrypoint`,
 * discarding the command.
 *
 * NOTE(review): the discarded command is not always `process.execPath` (it can
 * be a Bun binary), so callers that pair these args with a fixed executable
 * may not get a working invocation — confirm against remaining callers.
 *
 * @param metaUrl    Module URL forwarded to `runtimeInvocationForEntrypoint`.
 * @param entrypoint Path of the script to launch.
 * @returns The `args` part of the resolved invocation.
 */
export function runtimeArgsForEntrypoint(metaUrl: string, entrypoint: string): string[] {
  const { args } = runtimeInvocationForEntrypoint(metaUrl, entrypoint);
  return args;
}
47
66
 
48
67
  export function isMainModule(metaUrl: string): boolean {