mcp-scraper 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. package/README.md +5 -0
  2. package/dist/bin/api-server.cjs +15553 -7587
  3. package/dist/bin/api-server.cjs.map +1 -1
  4. package/dist/bin/api-server.js +3 -3
  5. package/dist/bin/mcp-stdio-server.cjs +312 -119
  6. package/dist/bin/mcp-stdio-server.cjs.map +1 -1
  7. package/dist/bin/mcp-stdio-server.js +1 -1
  8. package/dist/bin/paa-harvest.cjs +1537 -165
  9. package/dist/bin/paa-harvest.cjs.map +1 -1
  10. package/dist/bin/paa-harvest.js +1 -1
  11. package/dist/{chunk-LXZDJJXR.js → chunk-D4CJBZBY.js} +426 -29
  12. package/dist/chunk-D4CJBZBY.js.map +1 -0
  13. package/dist/chunk-HERFK7W6.js +2781 -0
  14. package/dist/chunk-HERFK7W6.js.map +1 -0
  15. package/dist/chunk-JQKZWEON.js +1000 -0
  16. package/dist/chunk-JQKZWEON.js.map +1 -0
  17. package/dist/chunk-Y74EXABN.js +295 -0
  18. package/dist/chunk-Y74EXABN.js.map +1 -0
  19. package/dist/{db-IOYMX64U.js → db-YWCNHBLH.js} +36 -4
  20. package/dist/index.cjs +1660 -237
  21. package/dist/index.cjs.map +1 -1
  22. package/dist/index.d.cts +169 -2
  23. package/dist/index.d.ts +169 -2
  24. package/dist/index.js +120 -69
  25. package/dist/index.js.map +1 -1
  26. package/dist/server-W5NWH5KF.js +11625 -0
  27. package/dist/server-W5NWH5KF.js.map +1 -0
  28. package/dist/{worker-3ECJHPRE.js → worker-D4D2YQTA.js} +44 -9
  29. package/dist/worker-D4D2YQTA.js.map +1 -0
  30. package/package.json +17 -5
  31. package/dist/chunk-4API3ZCT.js +0 -1387
  32. package/dist/chunk-4API3ZCT.js.map +0 -1
  33. package/dist/chunk-LXZDJJXR.js.map +0 -1
  34. package/dist/chunk-ZBP4RHNW.js +0 -805
  35. package/dist/chunk-ZBP4RHNW.js.map +0 -1
  36. package/dist/server-63DR2HE5.js +0 -6062
  37. package/dist/server-63DR2HE5.js.map +0 -1
  38. package/dist/worker-3ECJHPRE.js.map +0 -1
  39. package/dist/{db-IOYMX64U.js.map → db-YWCNHBLH.js.map} +0 -0
package/dist/index.cjs CHANGED
@@ -43,6 +43,10 @@ var HarvestOptionsSchema = import_zod.z.object({
43
43
  location: import_zod.z.string().optional(),
44
44
  gl: import_zod.z.string().length(2).default("us"),
45
45
  hl: import_zod.z.string().length(2).default("en"),
46
+ device: import_zod.z.enum(["desktop", "mobile"]).default("desktop"),
47
+ proxyMode: import_zod.z.enum(["location", "configured", "none"]).default("location"),
48
+ proxyZip: import_zod.z.string().regex(/^\d{5}$/).optional(),
49
+ debug: import_zod.z.boolean().default(false),
46
50
  depth: import_zod.z.number().int().min(1).max(30).default(3),
47
51
  maxQuestions: import_zod.z.number().int().min(1).max(1e3).default(100),
48
52
  headless: import_zod.z.boolean().default(false),
@@ -50,6 +54,7 @@ var HarvestOptionsSchema = import_zod.z.object({
50
54
  proxy: import_zod.z.string().url().optional(),
51
55
  kernelApiKey: import_zod.z.string().optional(),
52
56
  kernelProxyId: import_zod.z.string().optional(),
57
+ kernelProxyResolution: import_zod.z.unknown().optional(),
53
58
  outputDir: import_zod.z.string().default("./paa-output"),
54
59
  format: import_zod.z.enum(["json", "csv", "both"]).default("both"),
55
60
  serpOnly: import_zod.z.boolean().default(false),
@@ -73,6 +78,45 @@ var RawPAAItemSchema = import_zod.z.object({
73
78
  sourceSite: import_zod.z.string().optional(),
74
79
  sourceCite: import_zod.z.string().optional()
75
80
  });
81
// Zod schema for a raw "overview" record: eleven fields, each a string or null.
// Note that rating and reviewCount are validated as strings here, not numbers.
+ var RawMapsOverviewSchema = import_zod.z.object({
82
+ name: import_zod.z.string().nullable(),
83
+ rating: import_zod.z.string().nullable(),
84
+ reviewCount: import_zod.z.string().nullable(),
85
+ category: import_zod.z.string().nullable(),
86
+ address: import_zod.z.string().nullable(),
87
+ hoursSummary: import_zod.z.string().nullable(),
88
+ phone: import_zod.z.string().nullable(),
89
+ phoneDisplay: import_zod.z.string().nullable(),
90
+ website: import_zod.z.string().nullable(),
91
+ plusCode: import_zod.z.string().nullable(),
92
+ bookingUrl: import_zod.z.string().nullable()
93
+ });
94
// Zod schema for one opening-hours row: both fields are required strings.
+ var RawMapsHoursRowSchema = import_zod.z.object({
95
+ day: import_zod.z.string(),
96
+ hours: import_zod.z.string()
97
+ });
98
// Zod schema for review statistics: a histogram of { stars: number, count: string }
// entries and a list of { label, count } topics. Counts are strings in both arrays.
+ var RawMapsReviewStatsSchema = import_zod.z.object({
99
+ reviewHistogram: import_zod.z.array(import_zod.z.object({
100
+ stars: import_zod.z.number(),
101
+ count: import_zod.z.string()
102
+ })),
103
+ reviewTopics: import_zod.z.array(import_zod.z.object({
104
+ label: import_zod.z.string(),
105
+ count: import_zod.z.string()
106
+ }))
107
+ });
108
// Zod schema for a single review card: reviewId is required, every other field
// is a string or null (stars is a string here, unlike the histogram's numeric stars).
+ var RawMapsReviewCardSchema = import_zod.z.object({
109
+ reviewId: import_zod.z.string(),
110
+ author: import_zod.z.string().nullable(),
111
+ stars: import_zod.z.string().nullable(),
112
+ date: import_zod.z.string().nullable(),
113
+ text: import_zod.z.string().nullable(),
114
+ ownerResponse: import_zod.z.string().nullable()
115
+ });
116
// Zod schema for one "about" attribute: a required section name and attribute text.
+ var RawMapsAboutAttributeSchema = import_zod.z.object({
117
+ section: import_zod.z.string(),
118
+ attribute: import_zod.z.string()
119
+ });
76
120
 
77
121
  // src/driver/BrowserDriver.ts
78
122
  var import_playwright_extra = require("playwright-extra");
@@ -88,7 +132,7 @@ var PAASelectors = {
88
132
  itemDataQ: "data-q",
89
133
  itemDataInitQ: "data-initq",
90
134
  itemQuestionEl: ".JlqpRe",
91
- answerContainer: ".bCOlv",
135
+ answerContainer: ".bCOlv, .hgKElc, .wDYxhc, .LGOjhe, .fo7IQd, .fmW3u",
92
136
  sourceTitle: "h3",
93
137
  sourceSite: ".VuuXrf",
94
138
  sourceCite: "cite",
@@ -128,9 +172,16 @@ var WhatPeopleSayingSelectors = {
128
172
  authorNote: ".nDgy9d"
129
173
  };
130
174
  var AIOverviewSelectors = {
131
- root: '[data-hveid="CBMQAA"]',
175
+ root: "[data-lhcontainer][data-streaming-container][eid]",
176
+ legacyRoot: '[data-hveid="CBMQAA"]',
132
177
  wrapper: ".Fgyi2e",
133
- citations: '.Fgyi2e [data-hveid] a[jsname="pxBnId"]'
178
+ controller: '[jscontroller="AkrxPe"]',
179
+ contentSubtree: '[data-subtree="mfc"]',
180
+ header: ".heWuVc",
181
+ heading: ".Fzsovc.cwYVJe.RJPOee",
182
+ showMoreButton: '[aria-label="Show more AI Overview"]',
183
+ sourcesPanel: ".OZ9ddf.WAUd4",
184
+ disclaimer: ".DuQANe.MSJHRb"
134
185
  };
135
186
  var AIModeSelectors = {
136
187
  root: '[data-hveid="CAUQAA"]',
@@ -158,6 +209,9 @@ var LocalPackSelectors = {
158
209
 
159
210
  // src/errors.ts
160
211
  var RECAPTCHA_INSTRUCTIONS = "Google returned a CAPTCHA. Run with --headless=false to re-warm the browser profile, then retry.";
212
/**
 * Removes vendor branding ("Kernel" / "Kernel.sh") from a user-facing message.
 *
 * Rewrites, case-insensitively:
 *   - "kernel[.sh] sessions"          -> "sessions"
 *   - "kernel[.sh] session"           -> "this session"
 *   - "this kernel[.sh] session"      -> "this session" (no doubled "this")
 *   - remaining "kernel.sh" / "kernel" -> "the service"
 * Runs of spaces are then collapsed and the result is trimmed.
 *
 * @param {string} message - Text that may mention the vendor; null/undefined is treated as "".
 * @returns {string} The message with vendor names replaced.
 */
function sanitizeVendorName(message) {
  return String(message ?? "")
    // Plural first: a pattern like /sessions?/ would also swallow the singular
    // form and make the "this session" rewrite below unreachable.
    .replace(/kernel(?:\.sh)?\s+sessions\b/gi, "sessions")
    // Singular. Reuse a preceding "this" (keeping its case) instead of adding a second one.
    .replace(
      /(\bthis\s+)?kernel(?:\.sh)?\s+session\b/gi,
      (_match, leadingThis) => (leadingThis ? `${leadingThis}session` : "this session")
    )
    .replace(/kernel\.sh/gi, "the service")
    .replace(/\bkernel\b/gi, "the service")
    .replace(/ +/g, " ")
    .trim();
}
161
215
  var CaptchaError = class extends Error {
162
216
  constructor(instructions) {
163
217
  super(`CAPTCHA detected. ${instructions}`);
@@ -174,10 +228,55 @@ var ExtractionError = class extends Error {
174
228
  cause;
175
229
  name = "ExtractionError";
176
230
  };
231
/**
 * Error raised when a request is aborted before the harvest finishes.
 * Instances carry `name === "RequestAbortedError"`; the message defaults to
 * "Request aborted before harvest completed".
 */
var RequestAbortedError = class extends Error {
  constructor(message = "Request aborted before harvest completed") {
    super(message);
    // Own, enumerable property — same shape a class field would produce.
    this.name = "RequestAbortedError";
  }
};
177
237
 
178
238
  // src/driver/BrowserDriver.ts
179
239
  import_playwright_extra.chromium.use((0, import_puppeteer_extra_plugin_stealth.default)());
180
240
  var DESKTOP_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36";
241
// User-agent string used for mobile contexts when the caller supplies no explicit userAgent.
+ var MOBILE_USER_AGENT = "Mozilla/5.0 (iPhone; CPU iPhone OS 17_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Mobile/15E148 Safari/604.1";
242
// Fallback remote-browser timeout (seconds) when KERNEL_BROWSER_TIMEOUT_SECONDS is unset or invalid.
+ var DEFAULT_KERNEL_BROWSER_TIMEOUT_SECONDS = 180;
243
// Upper bound (ms) on waiting for the remote browser to close during cleanup.
+ var KERNEL_BROWSER_CLOSE_TIMEOUT_MS = 3e3;
244
// Upper bound (ms) on waiting for the remote session delete call during cleanup.
+ var KERNEL_SESSION_DELETE_TIMEOUT_MS = 5e3;
245
/**
 * Reads a strictly positive integer from an environment variable.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value returned when the variable is unset, empty,
 *   not an integer, or not greater than zero.
 * @returns {number} The parsed integer, or `fallback`.
 */
function positiveIntFromEnv(name, fallback) {
  const text = process.env[name];
  if (text) {
    const candidate = Number(text);
    if (Number.isInteger(candidate) && candidate > 0) {
      return candidate;
    }
  }
  return fallback;
}
251
/**
 * Returns the last six characters of a proxy id so it can be logged without
 * exposing the full value.
 *
 * @param {string|null|undefined} proxyId
 * @returns {string|null} The trailing characters, or null for a missing/empty id.
 */
function proxyIdSuffix(proxyId) {
  if (!proxyId) {
    return null;
  }
  return proxyId.slice(-6);
}
254
/**
 * Converts any thrown value into a printable message.
 *
 * @param {unknown} err - The caught value.
 * @returns {string} `err.message` for Error instances, otherwise `String(err)`.
 */
function errorText(err) {
  if (err instanceof Error) {
    return err.message;
  }
  return String(err);
}
257
/**
 * Builds the browser-context options derived from a launch config.
 *
 * `viewport`, `locale` and `userAgent` are always present; an explicit
 * `config.userAgent` wins, otherwise the mobile or desktop default is chosen
 * from `config.isMobile`. `deviceScaleFactor` is copied only when truthy;
 * `isMobile` / `hasTouch` are copied whenever they are not undefined.
 *
 * @param {object} config - Launch configuration.
 * @returns {object} Options object for creating a browser context.
 */
function rankCheckContextOptions(config) {
  const options = {
    viewport: config.viewport,
    locale: config.locale,
    userAgent: config.userAgent ?? (config.isMobile ? MOBILE_USER_AGENT : DESKTOP_USER_AGENT)
  };
  if (config.deviceScaleFactor) {
    options.deviceScaleFactor = config.deviceScaleFactor;
  }
  if (config.isMobile !== undefined) {
    options.isMobile = config.isMobile;
  }
  if (config.hasTouch !== undefined) {
    options.hasTouch = config.hasTouch;
  }
  return options;
}
267
/**
 * Awaits `promise`, but rejects if it has not settled within `timeoutMs`.
 *
 * The underlying operation is not cancelled; only the wait is abandoned.
 * The timer is always cleared once either side settles.
 *
 * @param {Promise<*>} promise - Operation to wait for.
 * @param {number} timeoutMs - Maximum wait in milliseconds.
 * @param {string} label - Prefix for the timeout error message.
 * @returns {Promise<*>} The settled value of `promise`.
 * @throws {Error} `"<label> timed out after <timeoutMs>ms"` on timeout, or
 *   whatever `promise` itself rejects with.
 */
async function withTimeout(promise, timeoutMs, label) {
  let timer;
  const deadline = new Promise((_resolve, reject) => {
    timer = setTimeout(() => {
      reject(new Error(`${label} timed out after ${timeoutMs}ms`));
    }, timeoutMs);
  });
  try {
    return await Promise.race([promise, deadline]);
  } finally {
    if (timer) clearTimeout(timer);
  }
}
181
280
  function buildYouTubeChannelVideosUrl(channelInput) {
182
281
  const raw = channelInput.trim();
183
282
  if (!raw) throw new Error("channelHandle is required");
@@ -211,30 +310,101 @@ var BrowserDriver = class {
211
310
  page = null;
212
311
  kernelClient = null;
213
312
  kernelSessionId = null;
313
+ debugEnabled = false;
314
+ debugSnapshot = {
315
+ kernel: null,
316
+ context: null,
317
+ networkLocation: null,
318
+ serpNavigation: null
319
+ };
214
320
  async launch(config) {
321
+ this.debugEnabled = config.debug === true;
322
+ const proxyMode = config.proxyMode ?? (config.kernelProxyId ? "configured" : "none");
323
+ const device = config.isMobile ? "mobile" : "desktop";
324
+ this.debugSnapshot = {
325
+ kernel: null,
326
+ context: {
327
+ viewport: config.viewport,
328
+ locale: config.locale,
329
+ device,
330
+ userAgent: config.userAgent ?? (config.isMobile ? MOBILE_USER_AGENT : DESKTOP_USER_AGENT),
331
+ deviceScaleFactor: config.deviceScaleFactor ?? null,
332
+ isMobile: config.isMobile === true,
333
+ hasTouch: config.hasTouch === true
334
+ },
335
+ networkLocation: null,
336
+ serpNavigation: null
337
+ };
215
338
  if (config.kernelApiKey) {
216
339
  this.kernelClient = new import_sdk.default({ apiKey: config.kernelApiKey });
340
+ const timeoutSeconds = positiveIntFromEnv("KERNEL_BROWSER_TIMEOUT_SECONDS", DEFAULT_KERNEL_BROWSER_TIMEOUT_SECONDS);
217
341
  const kernelBrowser = await this.kernelClient.browsers.create({
218
342
  stealth: true,
219
- timeout_seconds: 600,
343
+ timeout_seconds: timeoutSeconds,
220
344
  ...config.kernelProxyId ? { proxy_id: config.kernelProxyId } : {}
221
345
  });
222
346
  this.kernelSessionId = kernelBrowser.session_id;
347
+ let defaultProxyDisabled = null;
348
+ let defaultProxyDisableError = null;
349
+ if (proxyMode === "none") {
350
+ try {
351
+ await withTimeout(
352
+ this.kernelClient.browsers.update(this.kernelSessionId, { disable_default_proxy: true }),
353
+ 5e3,
354
+ `Kernel session ${this.kernelSessionId} disable default proxy`
355
+ );
356
+ defaultProxyDisabled = true;
357
+ } catch (err) {
358
+ defaultProxyDisabled = false;
359
+ defaultProxyDisableError = errorText(err);
360
+ }
361
+ }
362
+ const kernelDebug = {
363
+ sessionId: this.kernelSessionId,
364
+ proxyMode,
365
+ requestedProxyIdPresent: Boolean(config.kernelProxyId),
366
+ requestedProxyIdSuffix: proxyIdSuffix(config.kernelProxyId),
367
+ createdProxyIdPresent: typeof kernelBrowser.proxy_id === "string" ? Boolean(kernelBrowser.proxy_id) : null,
368
+ createdProxyIdSuffix: proxyIdSuffix(kernelBrowser.proxy_id),
369
+ retrievedProxyIdPresent: null,
370
+ retrievedProxyIdSuffix: null,
371
+ retrievedProxyIdMatchesRequested: null,
372
+ defaultProxyDisabled,
373
+ defaultProxyDisableError,
374
+ proxyResolution: config.kernelProxyResolution ?? null,
375
+ timeoutSeconds,
376
+ stealth: typeof kernelBrowser.stealth === "boolean" ? kernelBrowser.stealth : null,
377
+ profilePresent: null,
378
+ poolPresent: null,
379
+ retrieveError: null
380
+ };
381
+ this.debugSnapshot.kernel = kernelDebug;
382
+ console.info(JSON.stringify({
383
+ event: "kernel_browser_created",
384
+ kernel_session_id: this.kernelSessionId,
385
+ timeout_seconds: timeoutSeconds,
386
+ proxy_mode: proxyMode,
387
+ proxy_id_present: Boolean(config.kernelProxyId),
388
+ proxy_resolution_source: config.kernelProxyResolution?.source
389
+ }));
390
+ if (this.debugEnabled) {
391
+ await this.populateKernelRetrieveDebug(kernelDebug, config.kernelProxyId);
392
+ }
223
393
  this.browser = await import_playwright.chromium.connectOverCDP(kernelBrowser.cdp_ws_url);
224
- this.context = this.browser.contexts()[0] ?? await this.browser.newContext();
394
+ this.context = await this.browser.newContext(rankCheckContextOptions(config));
225
395
  await this.installEsbuildHelperShims(this.context);
226
- this.page = this.context.pages()[0] ?? await this.context.newPage();
396
+ this.page = await this.context.newPage();
397
+ await this.page.setViewportSize(config.viewport);
398
+ if (this.debugEnabled) {
399
+ this.debugSnapshot.networkLocation = await this.captureBrowserNetworkLocation();
400
+ }
227
401
  return;
228
402
  }
229
403
  const launchOpts = {
230
404
  headless: config.headless,
231
405
  proxy: config.proxy ? { server: config.proxy } : void 0
232
406
  };
233
- const ctxOpts = {
234
- viewport: config.viewport,
235
- locale: config.locale,
236
- userAgent: DESKTOP_USER_AGENT
237
- };
407
+ const ctxOpts = rankCheckContextOptions(config);
238
408
  if (config.profileDir) {
239
409
  this.context = await import_playwright_extra.chromium.launchPersistentContext(config.profileDir, {
240
410
  ...launchOpts,
@@ -248,6 +418,107 @@ var BrowserDriver = class {
248
418
  await this.installEsbuildHelperShims(this.context);
249
419
  this.page = await this.context.newPage();
250
420
  }
421
+ if (this.debugEnabled) {
422
+ this.debugSnapshot.networkLocation = await this.captureBrowserNetworkLocation();
423
+ }
424
+ }
425
+ async populateKernelRetrieveDebug(kernelDebug, requestedProxyId) {
426
+ if (!this.kernelClient || !this.kernelSessionId) return;
427
+ try {
428
+ const retrieved = await withTimeout(
429
+ this.kernelClient.browsers.retrieve(this.kernelSessionId),
430
+ 5e3,
431
+ `Kernel session ${this.kernelSessionId} retrieve`
432
+ );
433
+ kernelDebug.retrievedProxyIdPresent = typeof retrieved.proxy_id === "string" ? Boolean(retrieved.proxy_id) : false;
434
+ kernelDebug.retrievedProxyIdSuffix = proxyIdSuffix(retrieved.proxy_id);
435
+ kernelDebug.retrievedProxyIdMatchesRequested = requestedProxyId ? retrieved.proxy_id === requestedProxyId : !retrieved.proxy_id;
436
+ kernelDebug.timeoutSeconds = typeof retrieved.timeout_seconds === "number" ? retrieved.timeout_seconds : kernelDebug.timeoutSeconds;
437
+ kernelDebug.stealth = typeof retrieved.stealth === "boolean" ? retrieved.stealth : kernelDebug.stealth;
438
+ kernelDebug.profilePresent = Boolean(retrieved.profile);
439
+ kernelDebug.poolPresent = Boolean(retrieved.pool);
440
+ } catch (err) {
441
+ kernelDebug.retrieveError = errorText(err);
442
+ }
443
+ }
444
+ async captureBrowserNetworkLocation() {
445
+ const fallback = (message, source = "ipapi.co") => ({
446
+ source,
447
+ ip: null,
448
+ city: null,
449
+ region: null,
450
+ country: null,
451
+ org: null,
452
+ timezone: null,
453
+ error: message
454
+ });
455
+ if (!this.context) return fallback("browser context is not available");
456
+ let debugPage = null;
457
+ try {
458
+ debugPage = await this.context.newPage();
459
+ const ipwho = await this.loadJsonInDebugPage(debugPage, "https://ipwho.is/");
460
+ if (ipwho) {
461
+ const connection = typeof ipwho.connection === "object" && ipwho.connection !== null ? ipwho.connection : {};
462
+ return {
463
+ source: "ipwho.is",
464
+ ip: typeof ipwho.ip === "string" ? ipwho.ip : null,
465
+ city: typeof ipwho.city === "string" ? ipwho.city : null,
466
+ region: typeof ipwho.region === "string" ? ipwho.region : null,
467
+ country: typeof ipwho.country === "string" ? ipwho.country : null,
468
+ org: typeof connection.org === "string" ? connection.org : null,
469
+ timezone: typeof ipwho.timezone === "object" && ipwho.timezone !== null && typeof ipwho.timezone.id === "string" ? ipwho.timezone.id : null,
470
+ error: null
471
+ };
472
+ }
473
+ const ipify = await this.loadJsonInDebugPage(debugPage, "https://api64.ipify.org?format=json");
474
+ if (ipify) {
475
+ return {
476
+ source: "api64.ipify.org",
477
+ ip: typeof ipify.ip === "string" ? ipify.ip : null,
478
+ city: null,
479
+ region: null,
480
+ country: null,
481
+ org: null,
482
+ timezone: null,
483
+ error: null
484
+ };
485
+ }
486
+ await withTimeout(
487
+ debugPage.goto("https://ipapi.co/json/", { waitUntil: "domcontentloaded", timeout: 7e3 }),
488
+ 8e3,
489
+ "browser network location navigation"
490
+ );
491
+ const body = await debugPage.locator("body").innerText({ timeout: 2e3 });
492
+ const data = JSON.parse(body);
493
+ return {
494
+ source: "ipapi.co",
495
+ ip: typeof data.ip === "string" ? data.ip : null,
496
+ city: typeof data.city === "string" ? data.city : null,
497
+ region: typeof data.region === "string" ? data.region : null,
498
+ country: typeof data.country_name === "string" ? data.country_name : typeof data.country === "string" ? data.country : null,
499
+ org: typeof data.org === "string" ? data.org : null,
500
+ timezone: typeof data.timezone === "string" ? data.timezone : null,
501
+ error: null
502
+ };
503
+ } catch (err) {
504
+ return fallback(errorText(err));
505
+ } finally {
506
+ await debugPage?.close().catch(() => {
507
+ });
508
+ }
509
+ }
510
+ async loadJsonInDebugPage(debugPage, url) {
511
+ try {
512
+ await withTimeout(
513
+ debugPage.goto(url, { waitUntil: "domcontentloaded", timeout: 7e3 }),
514
+ 8e3,
515
+ `browser network location navigation ${url}`
516
+ );
517
+ const body = await debugPage.locator("body").innerText({ timeout: 2e3 });
518
+ return JSON.parse(body);
519
+ } catch {
520
+ return null;
521
+ }
251
522
  }
252
523
  async installEsbuildHelperShims(context) {
253
524
  await context.addInitScript(() => {
@@ -259,42 +530,79 @@ var BrowserDriver = class {
259
530
  };
260
531
  });
261
532
  }
262
- async navigateToSERP(query, uule, gl, hl) {
263
- const params = new URLSearchParams({ q: query, gl, hl });
533
+ async navigateToSERP(query, uule, gl, hl, options) {
534
+ const params = new URLSearchParams({ q: query, gl, hl, pws: "0" });
535
+ if (options?.num) params.set("num", String(options.num));
264
536
  if (uule) params.set("uule", uule);
265
537
  const url = "https://www.google.com/search?" + params.toString();
538
+ const navDebug = options?.debug ? {
539
+ requestedUrl: url,
540
+ finalUrl: null,
541
+ title: null,
542
+ bodySnippet: null,
543
+ hasPaa: null,
544
+ captchaDetected: null,
545
+ googleSorryUrl: null,
546
+ redirected: null
547
+ } : null;
548
+ if (navDebug) this.debugSnapshot.serpNavigation = navDebug;
266
549
  try {
267
550
  await this.page.goto(url, { waitUntil: "domcontentloaded", timeout: 45e3 });
268
551
  } catch (err) {
552
+ await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: null, captchaDetected: null });
269
553
  const diag = await this.captureDiagnostics(url);
270
554
  throw new ExtractionError(`page.goto failed: ${err.message} | ${diag}`);
271
555
  }
272
556
  const captchaCount = await this.page.locator(PAASelectors.captchaMarker).count();
273
557
  if (captchaCount > 0) {
274
- if (this.kernelClient) {
275
- try {
276
- await this.page.waitForSelector(PAASelectors.container, { timeout: 45e3 });
277
- return { hasPaa: true };
278
- } catch {
279
- throw new CaptchaError(this.captchaMessage());
280
- }
281
- }
558
+ await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: false, captchaDetected: true });
282
559
  throw new CaptchaError(this.captchaMessage());
283
560
  }
284
561
  const fastFound = await this.page.waitForSelector(PAASelectors.item, { timeout: 4e3 }).catch(() => null);
285
- if (fastFound) return { hasPaa: true };
562
+ if (fastFound) {
563
+ await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: true, captchaDetected: false });
564
+ return { hasPaa: true };
565
+ }
286
566
  const captchaAfter = await this.page.locator(PAASelectors.captchaMarker).count();
287
- if (captchaAfter > 0) throw new CaptchaError(this.captchaMessage());
567
+ if (captchaAfter > 0) {
568
+ await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: false, captchaDetected: true });
569
+ throw new CaptchaError(this.captchaMessage());
570
+ }
288
571
  for (let i = 1; i <= 6; i++) {
289
572
  await this.page.evaluate((f) => {
290
573
  window.scrollTo(0, document.body.scrollHeight * f);
291
574
  }, i / 6);
292
575
  await this.page.waitForTimeout(600);
293
576
  const count = await this.page.locator(PAASelectors.item).count();
294
- if (count > 0) return { hasPaa: true };
577
+ if (count > 0) {
578
+ await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: true, captchaDetected: false });
579
+ return { hasPaa: true };
580
+ }
295
581
  }
582
+ await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: false, captchaDetected: false });
296
583
  return { hasPaa: false };
297
584
  }
585
+ async updateSerpNavigationDebug(navDebug, requestedUrl, state) {
586
+ if (!navDebug || !this.page) return;
587
+ try {
588
+ const finalUrl = this.page.url();
589
+ const title = await this.page.title().catch(() => "");
590
+ const bodySnippet = await this.page.evaluate(() => {
591
+ const text = (document.body?.innerText ?? "").replace(/\s+/g, " ").trim();
592
+ return text.slice(0, 500);
593
+ }).catch(() => "");
594
+ const textCaptcha = /recaptcha|unusual traffic|are you a robot/i.test(bodySnippet);
595
+ navDebug.finalUrl = finalUrl;
596
+ navDebug.title = title;
597
+ navDebug.bodySnippet = bodySnippet;
598
+ navDebug.hasPaa = state.hasPaa;
599
+ navDebug.captchaDetected = state.captchaDetected ?? textCaptcha;
600
+ navDebug.googleSorryUrl = /google\.[^/]+\/sorry\//i.test(finalUrl);
601
+ navDebug.redirected = finalUrl !== requestedUrl;
602
+ } catch (err) {
603
+ navDebug.bodySnippet = `debug capture failed: ${errorText(err)}`;
604
+ }
605
+ }
298
606
  async captureDiagnostics(intendedUrl) {
299
607
  try {
300
608
  const finalUrl = this.page.url();
@@ -316,7 +624,7 @@ var BrowserDriver = class {
316
624
  }
317
625
  }
318
626
  captchaMessage() {
319
- return this.kernelClient ? "Google returned a CAPTCHA on this Kernel.sh session \u2014 retrying with a fresh session." : RECAPTCHA_INSTRUCTIONS;
627
+ return this.kernelClient ? "Google returned a CAPTCHA on this session \u2014 retrying with a fresh session." : RECAPTCHA_INSTRUCTIONS;
320
628
  }
321
629
  async navigateTo(url) {
322
630
  try {
@@ -341,6 +649,12 @@ var BrowserDriver = class {
341
649
  getPage() {
342
650
  return this.page;
343
651
  }
652
// Returns the remote session id stored on this driver; the field starts as null
// and is set when launch() creates a remote browser.
+ getKernelSessionId() {
653
+ return this.kernelSessionId;
654
+ }
655
// Returns the driver's debug snapshot object ({ kernel, context, networkLocation,
// serpNavigation }). This is the live object, not a copy, so later updates are visible.
+ getDebugSnapshot() {
656
+ return this.debugSnapshot;
657
+ }
344
658
  async close() {
345
659
  if (this.browser) {
346
660
  const b = this.browser;
@@ -351,21 +665,84 @@ var BrowserDriver = class {
351
665
  this.page = null;
352
666
  this.kernelSessionId = null;
353
667
  this.kernelClient = null;
354
- try {
355
- await b.close();
356
- } finally {
357
- if (client && sessionId) {
358
- await client.browsers.deleteByID(sessionId).catch(
359
- (err) => console.warn("Kernel session cleanup failed:", err)
360
- );
668
+ if (client && sessionId) {
669
+ console.info(JSON.stringify({
670
+ event: "kernel_browser_delete_started",
671
+ kernel_session_id: sessionId
672
+ }));
673
+ const deleteSession = withTimeout(
674
+ client.browsers.deleteByID(sessionId),
675
+ KERNEL_SESSION_DELETE_TIMEOUT_MS,
676
+ `Kernel session ${sessionId} delete`
677
+ );
678
+ const closeBrowser = withTimeout(
679
+ b.close(),
680
+ KERNEL_BROWSER_CLOSE_TIMEOUT_MS,
681
+ `Kernel browser ${sessionId} close`
682
+ );
683
+ const [deleteResult, closeResult] = await Promise.allSettled([deleteSession, closeBrowser]);
684
+ const result = {
685
+ kernelSessionId: sessionId,
686
+ kernelDeleteStarted: true,
687
+ kernelDeleteSucceeded: deleteResult.status === "fulfilled",
688
+ kernelDeleteError: deleteResult.status === "rejected" ? deleteResult.reason instanceof Error ? deleteResult.reason.message : String(deleteResult.reason) : null,
689
+ browserCloseSucceeded: closeResult.status === "fulfilled",
690
+ browserCloseError: closeResult.status === "rejected" ? closeResult.reason instanceof Error ? closeResult.reason.message : String(closeResult.reason) : null
691
+ };
692
+ if (deleteResult.status === "rejected") {
693
+ console.warn(JSON.stringify({
694
+ event: "kernel_browser_delete_failed",
695
+ kernel_session_id: sessionId,
696
+ message: result.kernelDeleteError
697
+ }));
698
+ console.warn(`Kernel session cleanup failed for ${sessionId}:`, deleteResult.reason);
699
+ } else {
700
+ console.info(JSON.stringify({
701
+ event: "kernel_browser_delete_succeeded",
702
+ kernel_session_id: sessionId
703
+ }));
361
704
  }
705
+ if (closeResult.status === "rejected") {
706
+ console.warn(JSON.stringify({
707
+ event: "kernel_browser_close_failed",
708
+ kernel_session_id: sessionId,
709
+ message: result.browserCloseError
710
+ }));
711
+ console.warn(`Kernel browser close failed for ${sessionId}:`, closeResult.reason);
712
+ }
713
+ return result;
362
714
  }
715
+ await b.close();
716
+ return {
717
+ kernelSessionId: null,
718
+ kernelDeleteStarted: false,
719
+ kernelDeleteSucceeded: null,
720
+ kernelDeleteError: null,
721
+ browserCloseSucceeded: true,
722
+ browserCloseError: null
723
+ };
363
724
  } else if (this.context) {
364
725
  const ctx = this.context;
365
726
  this.context = null;
366
727
  this.page = null;
367
728
  await ctx.close();
729
+ return {
730
+ kernelSessionId: null,
731
+ kernelDeleteStarted: false,
732
+ kernelDeleteSucceeded: null,
733
+ kernelDeleteError: null,
734
+ browserCloseSucceeded: true,
735
+ browserCloseError: null
736
+ };
368
737
  }
738
+ return {
739
+ kernelSessionId: null,
740
+ kernelDeleteStarted: false,
741
+ kernelDeleteSucceeded: null,
742
+ kernelDeleteError: null,
743
+ browserCloseSucceeded: null,
744
+ browserCloseError: null
745
+ };
369
746
  }
370
747
  };
371
748
 
@@ -436,13 +813,157 @@ var LOCATIONS = {
436
813
  };
437
814
 
438
815
  // src/uule.ts
816
/**
 * Encodes a non-negative integer as a base-128 varint: 7 payload bits per
 * byte, least-significant group first, high bit set on every byte but the last.
 *
 * Uses 32-bit bitwise operators, so the input is treated as a 32-bit value.
 *
 * @param {number} value - Integer to encode.
 * @returns {number[]} The encoded bytes.
 */
function encodeVarint(value) {
  const encoded = [];
  let rest = value;
  for (;;) {
    const low7 = rest & 127;
    rest >>>= 7;
    if (rest > 0) {
      encoded.push(low7 | 128);
    } else {
      encoded.push(low7);
      return encoded;
    }
  }
}
439
827
  function encodeUule(name) {
440
- const encoded = Buffer.from(String.fromCharCode(name.length) + name).toString("base64");
441
- return `w+CAIQICI${encoded}`;
828
+ const locationBytes = Buffer.from(name, "utf8");
829
+ const payload = Buffer.concat([
830
+ Buffer.from([8, 2, 16, 32, 34]),
831
+ Buffer.from(encodeVarint(locationBytes.length)),
832
+ locationBytes
833
+ ]);
834
+ return `w+${payload.toString("base64")}`;
442
835
  }
443
836
  function normalizeLocation(input) {
444
- const key = input.toLowerCase().trim();
445
- return LOCATIONS[key] ?? input;
837
+ const raw = input.toLowerCase().trim();
838
+ if (LOCATIONS[raw]) return LOCATIONS[raw];
839
+ const beforeComma = raw.split(",")[0].trim();
840
+ if (beforeComma !== raw && LOCATIONS[beforeComma]) return LOCATIONS[beforeComma];
841
+ const withoutState = raw.replace(/\s+[a-z]{2}$/, "").trim();
842
+ if (withoutState !== raw && LOCATIONS[withoutState]) return LOCATIONS[withoutState];
843
+ return input;
844
+ }
845
+
846
+ // src/serp-location-debug.ts
847
// Lower-cased US state name (plus District of Columbia) -> two-letter postal code.
var STATE_TO_CODE = {
  alabama: "AL",
  alaska: "AK",
  arizona: "AZ",
  arkansas: "AR",
  california: "CA",
  colorado: "CO",
  connecticut: "CT",
  delaware: "DE",
  florida: "FL",
  georgia: "GA",
  hawaii: "HI",
  idaho: "ID",
  illinois: "IL",
  indiana: "IN",
  iowa: "IA",
  kansas: "KS",
  kentucky: "KY",
  louisiana: "LA",
  maine: "ME",
  maryland: "MD",
  massachusetts: "MA",
  michigan: "MI",
  minnesota: "MN",
  mississippi: "MS",
  missouri: "MO",
  montana: "MT",
  nebraska: "NE",
  nevada: "NV",
  "new hampshire": "NH",
  "new jersey": "NJ",
  "new mexico": "NM",
  "new york": "NY",
  "north carolina": "NC",
  "north dakota": "ND",
  ohio: "OH",
  oklahoma: "OK",
  oregon: "OR",
  pennsylvania: "PA",
  "rhode island": "RI",
  "south carolina": "SC",
  "south dakota": "SD",
  tennessee: "TN",
  texas: "TX",
  utah: "UT",
  vermont: "VT",
  virginia: "VA",
  washington: "WA",
  "west virginia": "WV",
  wisconsin: "WI",
  wyoming: "WY",
  "district of columbia": "DC"
};

// Regex alternation of every state name (spaces loosened to \s+) and every postal code.
var STATE_PATTERN = [
  ...Object.keys(STATE_TO_CODE).map((s) => s.replace(/\s+/g, "\\s+")),
  ...Object.values(STATE_TO_CODE)
].join("|");

// "<1-5 words>[,] <state>", case-insensitive so lower-cased URL slugs also match.
// Heuristic only: the captured "city" may carry leading words (see normalizeCity).
var CITY_STATE_RE = new RegExp(`\\b([A-Z][A-Za-z]+(?:[\\s.-][A-Z][A-Za-z]+){0,4}),?\\s+(${STATE_PATTERN})\\b`, "gi");

/**
 * Normalizes a state name or two-letter code to an upper-case postal code.
 *
 * @param {string|null|undefined} input - State name or code.
 * @returns {string|null} Upper-case code, or null when the input is empty or unknown.
 */
function normalizeRegionCode(input) {
  if (!input) return null;
  const trimmed = input.trim();
  if (/^[A-Z]{2}$/i.test(trimmed)) return trimmed.toUpperCase();
  const key = trimmed.toLowerCase();
  // Own-property check so inherited keys such as "constructor" are not treated as states.
  return Object.hasOwn(STATE_TO_CODE, key) ? STATE_TO_CODE[key] : null;
}

/**
 * Cleans a captured city phrase: collapses whitespace, drops everything up to
 * a trailing "in" / "near" / "around" / "serving", then title-cases the rest.
 *
 * @param {string} input - Raw city phrase.
 * @returns {string} Title-cased city name (possibly empty).
 */
function normalizeCity(input) {
  const cleaned = input.replace(/\s+/g, " ").trim().replace(/^.*\b(?:in|near|around|serving)\s+/i, "");
  return cleaned.toLowerCase().replace(/\b[a-z]/g, (char) => char.toUpperCase());
}

/**
 * Splits a canonical "City, Region[, ...]" string into its expected parts.
 *
 * @param {string|null|undefined} canonicalLocation
 * @returns {{city: string, regionCode: (string|null), canonicalLocation: string}|null}
 *   Null when no location was given.
 */
function parseExpected(canonicalLocation) {
  if (!canonicalLocation) return null;
  const [city = "", region = ""] = canonicalLocation.split(",").map((part) => part.trim());
  return {
    city: normalizeCity(city),
    regionCode: normalizeRegionCode(region),
    canonicalLocation
  };
}

/**
 * Adds or increments a city/region candidate in the tally map.
 * Keeps at most three distinct example snippets per candidate.
 *
 * @param {Map<string, object>} candidates - Tally keyed by "city|REGION".
 * @param {string} city - Raw captured city phrase.
 * @param {string} region - Raw captured state name or code.
 * @param {string} example - Text excerpt the match came from.
 */
function addCandidate(candidates, city, region, example) {
  const normalizedCity = normalizeCity(city);
  const regionCode = normalizeRegionCode(region);
  if (!normalizedCity || !regionCode) return;
  const key = `${normalizedCity.toLowerCase()}|${regionCode}`;
  const existing = candidates.get(key);
  if (existing) {
    existing.count++;
    if (existing.examples.length < 3 && !existing.examples.includes(example)) existing.examples.push(example);
    return;
  }
  candidates.set(key, { city: normalizedCity, regionCode, count: 1, examples: [example] });
}

/**
 * Scans free text (titles, snippets, URLs) for "City, State" mentions and
 * tallies each one into `candidates`.
 *
 * @param {Map<string, object>} candidates - Tally mutated in place.
 * @param {string} text - Text to scan; may mix prose and URL-encoded fragments.
 */
function scanText(candidates, text) {
  // Decode only well-formed percent-escape runs. Calling decodeURIComponent on the
  // whole string throws URIError for ordinary prose such as "100% today".
  const decoded = String(text ?? "").replace(/(?:%[0-9a-f]{2})+/gi, (run) => {
    try {
      return decodeURIComponent(run);
    } catch {
      return run; // e.g. invalid UTF-8 byte sequence — leave it untouched
    }
  });
  const normalized = decoded.replace(/[+/|_-]+/g, " ");
  for (const match of normalized.matchAll(CITY_STATE_RE)) {
    addCandidate(candidates, match[1] ?? "", match[2] ?? "", normalized.slice(0, 180));
  }
}

/**
 * Infers which city/state a results page appears to be localized to and
 * compares that with the requested location.
 *
 * @param {string|null|undefined} canonicalLocation - Requested "City, Region" string.
 * @param {Array<object>} organicResults - Items with `title`, `snippet`, `cite`, `url`.
 * @param {Array<object>} localPack - Items with `name`, `metadata` (string[]),
 *   `websiteUrl`, `directionsUrl`.
 * @returns {{status: string, expected: (object|null), candidates: Array<object>}}
 *   `status` is "not_requested", "unknown", "matched" or "mismatch"; `candidates`
 *   holds up to eight entries ordered by count, then city name.
 */
function inferSerpLocationEvidence(canonicalLocation, organicResults, localPack) {
  const expected = parseExpected(canonicalLocation);
  const candidates = /* @__PURE__ */ new Map();
  for (const result of organicResults) {
    scanText(candidates, [result.title, result.snippet ?? "", result.cite ?? "", result.url].join(" "));
  }
  for (const business of localPack) {
    scanText(candidates, [business.name, ...(business.metadata ?? []), business.websiteUrl ?? "", business.directionsUrl ?? ""].join(" "));
  }
  const rankedCandidates = Array.from(candidates.values()).sort((a, b) => b.count - a.count || a.city.localeCompare(b.city)).slice(0, 8);
  if (!expected) {
    return { status: "not_requested", expected: null, candidates: rankedCandidates };
  }
  if (rankedCandidates.length === 0) {
    return { status: "unknown", expected, candidates: [] };
  }
  const matched = rankedCandidates.some(
    (candidate) => candidate.city.toLowerCase() === expected.city.toLowerCase() && (expected.regionCode == null || candidate.regionCode === expected.regionCode)
  );
  return {
    status: matched ? "matched" : "mismatch",
    expected,
    candidates: rankedCandidates
  };
}
447
968
 
448
969
  // src/lib/paa-answer-cleanup.ts
@@ -537,7 +1058,220 @@ function cleanPAAAnswerText(answer, question, sourceTitle) {
537
1058
  return text;
538
1059
  }
539
1060
 
1061
+ // src/extractor/ai-surfaces.ts
1062
/**
 * Extracts Google's AI Overview ("aio") and AI Mode ("aim") content from the
 * current document.
 *
 * This is browser-side code: it reads `window.google`, `window.location` and
 * `document`. PAAExtractor.extractAISurfaces hands the function itself to
 * `page.evaluate`, so every helper lives inside the function body and nothing
 * outside of it is referenced.
 *
 * @param {object} [config] Selector set `{ aio, aim, expandWaitMs }`. When it is
 *   null/undefined the built-in defaults below are used as a whole (there is no
 *   per-key merge).
 * @returns {Promise<object>} `{ surface, aiOverview, aiMode }` where
 *   `aiOverview` is `{ detected, text, citations, expanded, fullyExpanded, sections }`
 *   and `aiMode` is `{ detected, text, citations }`.
 */
async function extractAISurfacesFromDocument(config) {
  const selectors = config ?? {
    aio: {
      root: "[data-lhcontainer][data-streaming-container][eid]",
      legacyRoot: '[data-hveid="CBMQAA"]',
      wrapper: ".Fgyi2e",
      controller: '[jscontroller="AkrxPe"]',
      contentSubtree: '[data-subtree="mfc"]',
      heading: ".Fzsovc.cwYVJe.RJPOee",
      header: ".heWuVc",
      showMoreButton: '[aria-label="Show more AI Overview"]',
      sourcesPanel: ".OZ9ddf.WAUd4",
      disclaimer: ".DuQANe.MSJHRb"
    },
    aim: {
      root: '[data-hveid="CAUQAA"]',
      wrapper: ".Fgyi2e"
    },
    expandWaitMs: 1500
  };
  const sn = window.google?.sn ?? "unknown";
  const surface = sn === "aim" ? "aim" : sn === "web" ? "web" : "unknown";

  /** Trimmed visible text of an element ("" for a missing element). */
  function textOf(el) {
    if (!el) return "";
    return (el.innerText ?? el.textContent ?? "").trim();
  }

  /** True when the element carries an "AI Overview" label; checks go from strict to loose. */
  function hasAIOverviewLabel(el) {
    const heading = el.querySelector(selectors.aio.heading);
    if (textOf(heading) === "AI Overview") return true;
    const header = el.querySelector(selectors.aio.header);
    if (textOf(header).split(/\n|\s{2,}/).some((part) => part.trim() === "AI Overview")) return true;
    return textOf(el).includes("AI Overview");
  }

  /**
   * Locates the AI Overview container. Order of preference: a labelled primary
   * root, any primary root, the legacy root, and finally a walk upwards from an
   * "AI Overview" heading.
   */
  function findAIORoot() {
    const primaryRoots = Array.from(document.querySelectorAll(selectors.aio.root));
    const labeledPrimary = primaryRoots.find(hasAIOverviewLabel);
    if (labeledPrimary) return labeledPrimary;
    if (primaryRoots.length > 0) return primaryRoots[0];
    if (selectors.aio.legacyRoot) {
      const legacy = document.querySelector(selectors.aio.legacyRoot);
      if (legacy) return legacy;
    }
    const headings = document.querySelectorAll(`${selectors.aio.heading}, h1, h2, h3, [role="heading"]`);
    for (const h of headings) {
      if (textOf(h) !== "AI Overview") continue;
      // Climb at most eight ancestors looking for something that looks like the container.
      let el = h.parentElement;
      for (let i = 0; i < 8 && el; i++) {
        if (el.matches(selectors.aio.root) || el.querySelector(selectors.aio.controller) || el.querySelector(selectors.aio.contentSubtree)) {
          return el;
        }
        el = el.parentElement;
      }
      return h.parentElement;
    }
    return null;
  }

  /**
   * Returns the readable text of `target` with chrome (links, buttons, media,
   * source panels, disclaimers, feedback prompts) removed, or null when nothing
   * usable is left or the text looks like an error state.
   */
  function cleanText(target) {
    if (!target) return null;
    const clone = target.cloneNode(true);
    clone.querySelectorAll([
      "script",
      "style",
      "noscript",
      "img",
      "picture",
      "video",
      selectors.aio.header,
      selectors.aio.showMoreButton,
      selectors.aio.sourcesPanel,
      selectors.aio.disclaimer,
      '[data-subtree="dfa"]',
      "[data-src-id]",
      '[role="dialog"]',
      ".HWMcu",
      ".bTFeG",
      ".CyMdWb",
      ".MFrAxb",
      ".F0OfWd.hfWAgb",
      ".x2qcTc.fZavHb",
      ".SvjEff",
      ".sR2MY",
      ".lKuDef",
      ".GSPQcc",
      "a[href]",
      "button",
      '[role="button"]'
    ].join(",")).forEach((el) => el.remove());

    // The clone is attached off-screen (at roughly the original width) before
    // its text is read, so that innerText reflects a rendered layout.
    const holder = document.createElement("div");
    holder.style.position = "fixed";
    holder.style.left = "-10000px";
    holder.style.top = "0";
    holder.style.width = `${Math.max(320, Math.round(target.getBoundingClientRect?.().width || 960))}px`;
    holder.style.opacity = "0";
    holder.style.pointerEvents = "none";
    let rendered;
    try {
      holder.append(clone);
      document.body.append(holder);
      rendered = clone.innerText || clone.textContent || "";
    } finally {
      // Always detach the scratch node, even if reading the text throws,
      // so the scraped page is never left with a stray element.
      holder.remove();
    }

    const lines = rendered
      .replace(/\r/g, "")
      .replace(/[ \t]+\n/g, "\n")
      .replace(/\n[ \t]+/g, "\n")
      .replace(/\n{3,}/g, "\n\n")
      .replace(/[ \t]{2,}/g, " ")
      .trim()
      .split("\n")
      .map((line) => line.replace(/\u00a0/g, " ").trim())
      .filter(Boolean);

    const filteredLines = [];
    for (let i = 0; i < lines.length; i++) {
      const line = lines[i];
      const next = lines[i + 1] ?? "";
      if (line === "AI Overview") continue;
      if (line === "Show more") continue;
      if (/^AI can make mistakes/i.test(line)) continue;
      if (/^Thank you\b/i.test(line)) continue;
      if (/^Your feedback helps Google improve/i.test(line)) continue;
      if (/^\+?\d+$/.test(line)) continue; // bare counters such as "+3" or "12"
      if (/^\+\d+$/.test(next) && line.length <= 80) {
        // A short line followed by a "+N" counter: drop both lines.
        i++;
        continue;
      }
      filteredLines.push(line);
    }
    const raw = filteredLines.join("\n").replace(/\n{3,}/g, "\n\n").trim();
    if (!raw || /not available|try again|can't generate/i.test(raw)) return null;
    return raw;
  }

  /**
   * Resolves a raw href to an absolute external http(s) URL. Google redirect
   * links are unwrapped via their `q`/`url` parameter; anything that still
   * points at a google.* host, or is not http(s), yields null.
   */
  function normalizeHref(rawHref) {
    if (!rawHref || rawHref.startsWith("javascript:")) return null;
    let href = rawHref;
    try {
      const absolute = new URL(rawHref, window.location.href);
      const q = absolute.searchParams.get("q") ?? absolute.searchParams.get("url");
      if (/(\.|^)google\./i.test(absolute.hostname) && q?.startsWith("http")) {
        href = q;
      } else {
        href = absolute.href;
      }
    } catch {
      return null;
    }
    if (!/^https?:\/\//i.test(href)) return null;
    try {
      const url = new URL(href);
      const isGoogleInternal = /(\.|^)google\./i.test(url.hostname);
      if (isGoogleInternal) return null;
      return url.href;
    } catch {
      return null;
    }
  }

  /** Collects de-duplicated `{ text, href }` citations from every link under `root`. */
  function extractCitations(root) {
    if (!root) return [];
    const seen = /* @__PURE__ */ new Set();
    const citations = [];
    for (const a of Array.from(root.querySelectorAll("a[href]"))) {
      const href = normalizeHref(a.getAttribute("href") ?? "");
      if (!href || seen.has(href)) continue;
      seen.add(href);
      let fallbackHost = "";
      try {
        fallbackHost = new URL(href).hostname.replace(/^www\./, "");
      } catch {
        // href was already validated by normalizeHref; keep the empty fallback.
      }
      citations.push({
        text: textOf(a) || fallbackHost || href,
        href
      });
    }
    return citations;
  }

  /** Clicks the "show more" button when it is collapsed; resolves to true if it clicked. */
  async function maybeExpand(root) {
    const button = root.querySelector(selectors.aio.showMoreButton);
    if (!button || button.getAttribute("aria-expanded") !== "false") return false;
    button.click();
    const waitMs = selectors.expandWaitMs ?? 1500;
    if (waitMs > 0) await new Promise((resolve) => setTimeout(resolve, waitMs));
    return true;
  }

  const aioRoot = findAIORoot();
  let aioText = null;
  let aioCitations = [];
  let aioExpanded = false;
  let aioFullyExpanded = false;
  let aioSections = [];
  if (aioRoot) {
    aioExpanded = await maybeExpand(aioRoot);
    const controller = aioRoot.querySelector(selectors.aio.controller);
    const contentSubtree = aioRoot.querySelector(selectors.aio.contentSubtree);
    const showMore = aioRoot.querySelector(selectors.aio.showMoreButton);
    aioFullyExpanded = controller?.getAttribute("data-trnct") === "false" || showMore?.getAttribute("aria-expanded") === "true" || !showMore;
    aioText = cleanText(contentSubtree ?? controller ?? aioRoot);
    // Sections are the numbered lines ("1. ...") of the cleaned text.
    aioSections = (aioText ?? "").split("\n").map((line) => line.trim()).filter((line) => /^\d+\.\s+.+/.test(line));
    aioCitations = extractCitations(aioRoot);
  }

  const aimRoot = document.querySelector(selectors.aim.root);
  const aimDetected = surface === "aim" && !!aimRoot;
  const aimContainer = aimRoot?.closest(selectors.aim.wrapper) ?? aimRoot;
  const aimText = cleanText(aimContainer);
  const aimCitations = aimDetected ? extractCitations(aimContainer) : [];

  return {
    surface,
    aiOverview: {
      detected: !!aioRoot && aioText !== null,
      text: aioText,
      citations: aioCitations,
      expanded: aioExpanded,
      fullyExpanded: aioFullyExpanded,
      sections: aioSections
    },
    aiMode: {
      detected: aimDetected && aimText !== null,
      text: aimText,
      citations: aimCitations
    }
  };
}
1271
+
540
1272
  // src/extractor/PAAExtractor.ts
1273
// User-agent string sent for desktop runs: Chrome 124 on macOS.
+ var DESKTOP_USER_AGENT2 = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36";
1274
// User-agent string sent when options.device === "mobile": Safari on iPhone (iOS 17.5).
+ var MOBILE_USER_AGENT2 = "Mozilla/5.0 (iPhone; CPU iPhone OS 17_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Mobile/15E148 Safari/604.1";
541
1275
  var PAAExtractor = class {
542
1276
  constructor(driver, reporter) {
543
1277
  this.driver = driver;
@@ -548,6 +1282,17 @@ var PAAExtractor = class {
548
1282
  normalizeQuestion(q) {
549
1283
  return q.toLowerCase().replace(/[^\w\s]/g, "").replace(/\s+/g, " ").trim();
550
1284
  }
1285
+ throwIfAborted(signal) {
1286
+ if (!signal?.aborted) return;
1287
+ if (signal.reason instanceof DOMException && signal.reason.name === "TimeoutError") throw signal.reason;
1288
+ throw new RequestAbortedError();
1289
+ }
1290
+ async throwIfCaptcha(page, context) {
1291
+ const captchaCount = await page.locator(PAASelectors.captchaMarker).count().catch(() => 0);
1292
+ if (captchaCount > 0) {
1293
+ throw new CaptchaError(`${context} returned a CAPTCHA \u2014 retrying with a fresh session.`);
1294
+ }
1295
+ }
551
1296
  async extractVisibleItems(page) {
552
1297
  const sels = PAASelectors;
553
1298
  const raw = await page.evaluate((selectors) => {
@@ -610,10 +1355,10 @@ var PAAExtractor = class {
610
1355
  extracted_at: (/* @__PURE__ */ new Date()).toISOString()
611
1356
  };
612
1357
  }
613
- async runBFS(page, options) {
1358
+ async runBFS(page, options, signal) {
614
1359
  const seenKeys = /* @__PURE__ */ new Set();
615
1360
  const seenQs = /* @__PURE__ */ new Set();
616
- const depthMap = /* @__PURE__ */ new Map();
1361
+ const orderedQs = [];
617
1362
  const results = [];
618
1363
  const readAllQs = () => page.evaluate(
619
1364
  ({ sel, dataQ, dataInitQ, questionEl }) => Array.from(document.querySelectorAll(sel)).map(
@@ -621,42 +1366,43 @@ var PAAExtractor = class {
621
1366
  ).filter(Boolean),
622
1367
  { sel: PAASelectors.item, dataQ: PAASelectors.itemDataQ, dataInitQ: PAASelectors.itemDataInitQ, questionEl: PAASelectors.itemQuestionEl }
623
1368
  );
624
- const dupRates = [];
625
- const orderedQs = [];
626
- for (let round = 0; round < options.depth; round++) {
627
- this.reporter.onDepth(round + 1);
628
- if (seenQs.size >= options.maxQuestions) break;
1369
+ let round = 0;
1370
+ while (seenQs.size < options.maxQuestions) {
1371
+ this.throwIfAborted(signal);
1372
+ await this.throwIfCaptcha(page, "Google PAA expansion");
629
1373
  const beforeQs = await readAllQs();
630
1374
  if (beforeQs.length >= options.maxQuestions) break;
631
- const unexpandedItems = await page.$$(
632
- `${PAASelectors.item}:not(.${PAASelectors.expandedClass})`
633
- );
634
- if (unexpandedItems.length === 0) break;
635
- for (const item of unexpandedItems) {
1375
+ const unexpandedSel = `${PAASelectors.item}:not(.${PAASelectors.expandedClass}) ${PAASelectors.clickTarget}`;
1376
+ const unexpandedCount = await page.locator(unexpandedSel).count();
1377
+ if (unexpandedCount === 0) break;
1378
+ this.reporter.onDepth(++round);
1379
+ for (let ci = 0; ci < unexpandedCount; ci++) {
1380
+ this.throwIfAborted(signal);
636
1381
  try {
637
- await item.scrollIntoViewIfNeeded();
638
- await item.click({ force: true });
1382
+ const btn = page.locator(unexpandedSel).first();
1383
+ await btn.scrollIntoViewIfNeeded();
1384
+ await btn.hover({ force: true });
1385
+ await page.waitForTimeout(100);
1386
+ await btn.click({ force: true });
639
1387
  await page.waitForTimeout(500);
640
1388
  } catch {
641
1389
  }
642
1390
  }
643
- await page.waitForTimeout(1500);
1391
+ await page.waitForFunction(
1392
+ ({ sel, min }) => document.querySelectorAll(sel).length > min,
1393
+ { sel: PAASelectors.item, min: beforeQs.length },
1394
+ { timeout: 5e3 }
1395
+ ).catch(() => {
1396
+ });
1397
+ await this.throwIfCaptcha(page, "Google PAA expansion");
644
1398
  const afterQs = await readAllQs();
645
- const newQs = afterQs.slice(beforeQs.length);
646
- const newDups = newQs.filter((q) => seenQs.has(q)).length;
647
- const dupRate = newQs.length > 0 ? newDups / newQs.length : 0;
648
- dupRates.push(dupRate);
649
- if (dupRates.length > 2) dupRates.shift();
650
- const rollingDupRate = dupRates.reduce((a, b) => a + b, 0) / dupRates.length;
1399
+ if (afterQs.length === beforeQs.length) break;
651
1400
  for (const q of afterQs) {
652
1401
  if (!seenQs.has(q)) {
653
1402
  seenQs.add(q);
654
1403
  orderedQs.push(q);
655
1404
  }
656
- if (!depthMap.has(q)) depthMap.set(q, round + 1);
657
1405
  }
658
- if (afterQs.length === beforeQs.length) break;
659
- if (rollingDupRate >= 0.6) break;
660
1406
  }
661
1407
  const itemMap = new Map((await this.extractVisibleItems(page)).map((i) => [i.question, i]));
662
1408
  for (const q of orderedQs) {
@@ -664,13 +1410,12 @@ var PAAExtractor = class {
664
1410
  const key = this.normalizeQuestion(q);
665
1411
  if (seenKeys.has(key)) continue;
666
1412
  seenKeys.add(key);
667
- const d = depthMap.get(q) ?? 1;
668
1413
  const item = itemMap.get(q);
669
1414
  if (item) {
670
- results.push(this.toFlatRow(item, d, null, options.query));
671
- this.reporter.onQuestion({ question: item.question, answer: item.answer ?? null, sourceTitle: item.sourceTitle ?? null, sourceSite: item.sourceSite ?? null, sourceCite: item.sourceCite ?? null, depth: d, parentQuestion: null, children: [] });
1415
+ results.push(this.toFlatRow(item, 1, null, options.query));
1416
+ this.reporter.onQuestion({ question: item.question, answer: item.answer ?? null, sourceTitle: item.sourceTitle ?? null, sourceSite: item.sourceSite ?? null, sourceCite: item.sourceCite ?? null, depth: 1, parentQuestion: null, children: [] });
672
1417
  } else {
673
- results.push(this.toFlatRow({ question: q, answer: void 0, sourceTitle: void 0, sourceSite: void 0, sourceCite: void 0 }, d, null, options.query));
1418
+ results.push(this.toFlatRow({ question: q, answer: void 0, sourceTitle: void 0, sourceSite: void 0, sourceCite: void 0 }, 1, null, options.query));
674
1419
  }
675
1420
  }
676
1421
  return results;
@@ -728,6 +1473,7 @@ var PAAExtractor = class {
728
1473
  } catch {
729
1474
  return [];
730
1475
  }
1476
+ await this.throwIfCaptcha(page, "Google short video search");
731
1477
  const svSels = {
732
1478
  item: ShortVideoSelectors.item,
733
1479
  platforms: [...ShortVideoSelectors.platforms]
@@ -1009,69 +1755,11 @@ var PAAExtractor = class {
1009
1755
  return { ...entityIds, entities: records, cids: [...cidSet] };
1010
1756
  }
1011
1757
  async extractAISurfaces(page) {
1012
- const aioSels = AIOverviewSelectors;
1013
- const aimSels = AIModeSelectors;
1014
- return page.evaluate(({ aio, aim }) => {
1015
- const sn = window.google?.sn ?? "unknown";
1016
- const surface = sn === "aim" ? "aim" : sn === "web" ? "web" : "unknown";
1017
- function findAIORoot() {
1018
- const primary = document.querySelector(aio.root);
1019
- if (primary) return primary;
1020
- const headings = document.querySelectorAll('h1, h2, h3, [role="heading"]');
1021
- for (const h of headings) {
1022
- if (h.textContent?.trim() === "AI Overview") {
1023
- let el = h.parentElement;
1024
- for (let i = 0; i < 6 && el; i++) {
1025
- if (el.querySelectorAll("a").length > 1) return el;
1026
- el = el.parentElement;
1027
- }
1028
- return h.parentElement;
1029
- }
1030
- }
1031
- return null;
1032
- }
1033
- const aioRoot = findAIORoot();
1034
- const aioContainer = aioRoot ? aioRoot.closest(aio.wrapper) ?? aioRoot : null;
1035
- let aioText = null;
1036
- if (aioContainer) {
1037
- const clone = aioContainer.cloneNode(true);
1038
- clone.querySelectorAll("script,style,noscript").forEach((el) => el.remove());
1039
- clone.querySelectorAll('h1,h2,h3,h4,[role="heading"]').forEach((el) => el.remove());
1040
- clone.querySelectorAll('button,[role="button"]').forEach((el) => el.remove());
1041
- clone.querySelectorAll("a").forEach((el) => el.remove());
1042
- const candidate = clone.textContent?.replace(/\s+/g, " ").trim() || null;
1043
- const isErrorState = !candidate || /not available|try again|can't generate/i.test(candidate);
1044
- aioText = isErrorState ? null : candidate;
1045
- }
1046
- const aioDetected = !!aioRoot && aioText !== null;
1047
- const aioCitations = Array.from(aioContainer?.querySelectorAll("a[href]") ?? []).filter((a) => a.href && !a.href.startsWith("javascript")).map((a) => ({
1048
- text: a.textContent?.trim() ?? "",
1049
- href: a.href
1050
- })).filter((c) => c.text && c.href);
1051
- const aimRoot = document.querySelector(aim.root);
1052
- const aimDetected = surface === "aim" && !!aimRoot;
1053
- const aimContainer = aimRoot?.closest(aim.wrapper) ?? null;
1054
- let aimText = null;
1055
- if (aimContainer) {
1056
- const clone = aimContainer.cloneNode(true);
1057
- clone.querySelectorAll("script,style,noscript").forEach((el) => el.remove());
1058
- clone.querySelectorAll('h1,h2,h3,h4,[role="heading"]').forEach((el) => el.remove());
1059
- clone.querySelectorAll('button,[role="button"]').forEach((el) => el.remove());
1060
- clone.querySelectorAll("a").forEach((el) => el.remove());
1061
- const candidate = clone.textContent?.replace(/\s+/g, " ").trim() || null;
1062
- const isErrorState = !candidate || /not available|try again|can't generate/i.test(candidate);
1063
- aimText = isErrorState ? null : candidate;
1064
- }
1065
- const aimCitations = aimDetected ? Array.from(aimContainer?.querySelectorAll("a[href]") ?? []).filter((a) => a.href && !a.href.startsWith("javascript")).map((a) => ({
1066
- text: a.textContent?.trim() ?? "",
1067
- href: a.href
1068
- })).filter((c) => c.text && c.href) : [];
1069
- return {
1070
- surface,
1071
- aiOverview: { detected: aioDetected, text: aioText, citations: aioCitations },
1072
- aiMode: { detected: aimDetected, text: aimText, citations: aimCitations }
1073
- };
1074
- }, { aio: aioSels, aim: aimSels });
1758
+ return page.evaluate(extractAISurfacesFromDocument, {
1759
+ aio: AIOverviewSelectors,
1760
+ aim: AIModeSelectors,
1761
+ expandWaitMs: 1500
1762
+ });
1075
1763
  }
1076
1764
  buildTree(flat, _seed) {
1077
1765
  const roots = [];
@@ -1098,23 +1786,70 @@ var PAAExtractor = class {
1098
1786
  }
1099
1787
  return roots;
1100
1788
  }
1101
- async extract(options) {
1789
// Returns the debug snapshot reported by the underlying driver, unchanged.
+ getBrowserDebugSnapshot() {
1790
+ return this.driver.getDebugSnapshot();
1791
+ }
1792
+ buildHarvestDebugSnapshot(options, canonicalLocation, uule, locationEvidence) {
1793
+ if (!options.debug) return void 0;
1794
+ return {
1795
+ enabled: true,
1796
+ request: {
1797
+ query: options.query,
1798
+ locationInput: options.location ?? null,
1799
+ canonicalLocation,
1800
+ uule,
1801
+ gl: options.gl,
1802
+ hl: options.hl,
1803
+ device: options.device,
1804
+ proxyMode: options.proxyMode,
1805
+ proxyZip: options.proxyZip ?? null,
1806
+ serpOnly: options.serpOnly,
1807
+ pages: options.pages ?? 1
1808
+ },
1809
+ browser: this.getBrowserDebugSnapshot(),
1810
+ ...locationEvidence ? { locationEvidence } : {}
1811
+ };
1812
+ }
1813
+ async extract(options, signal) {
1102
1814
  const startMs = Date.now();
1815
+ const isMobile = options.device === "mobile";
1103
1816
  const config = {
1104
1817
  headless: options.headless,
1105
1818
  profileDir: options.profileDir,
1106
1819
  proxy: options.proxy,
1107
1820
  kernelApiKey: options.kernelApiKey,
1108
1821
  kernelProxyId: options.kernelProxyId,
1109
- viewport: { width: 1280, height: 800 },
1110
- locale: `${options.hl}-${options.gl.toUpperCase()}`
1822
+ kernelProxyResolution: options.kernelProxyResolution,
1823
+ proxyMode: options.proxyMode,
1824
+ viewport: isMobile ? { width: 390, height: 844 } : { width: 1280, height: 800 },
1825
+ locale: `${options.hl}-${options.gl.toUpperCase()}`,
1826
+ userAgent: isMobile ? MOBILE_USER_AGENT2 : DESKTOP_USER_AGENT2,
1827
+ deviceScaleFactor: isMobile ? 3 : 1,
1828
+ isMobile,
1829
+ hasTouch: isMobile,
1830
+ debug: options.debug
1111
1831
  };
1112
1832
  let errorCount = 0;
1833
+ const diagnosticWarnings = [];
1113
1834
  try {
1835
+ this.throwIfAborted(signal);
1114
1836
  await this.driver.launch(config);
1115
- const uule = options.location ? encodeUule(normalizeLocation(options.location)) : null;
1116
- const { hasPaa } = await this.driver.navigateToSERP(options.query, uule, options.gl, options.hl);
1837
+ this.throwIfAborted(signal);
1838
+ const canonicalLocation = options.location ? normalizeLocation(options.location) : null;
1839
+ const uule = canonicalLocation ? encodeUule(canonicalLocation) : null;
1840
+ const { hasPaa } = await this.driver.navigateToSERP(
1841
+ options.query,
1842
+ uule,
1843
+ options.gl,
1844
+ options.hl,
1845
+ {
1846
+ ...options.serpOnly ? { num: 100 } : {},
1847
+ debug: options.debug
1848
+ }
1849
+ );
1850
+ this.throwIfAborted(signal);
1117
1851
  const page = this.driver.getPage();
1852
+ await this.throwIfCaptcha(page, "Google SERP");
1118
1853
  if (options.serpOnly) {
1119
1854
  const [organicResults2, localPack2, rawEntityIds2] = await Promise.all([
1120
1855
  this.extractOrganicResults(page),
@@ -1122,13 +1857,19 @@ var PAAExtractor = class {
1122
1857
  this.extractEntityIds(page)
1123
1858
  ]);
1124
1859
  const entityIds2 = this.mergeLocalPackIntoEntities(rawEntityIds2, localPack2);
1860
+ const aiSurfaces2 = await this.extractAISurfaces(page);
1861
+ let locationEvidence2 = options.debug ? inferSerpLocationEvidence(canonicalLocation, organicResults2, localPack2) : void 0;
1125
1862
  let allOrganic2 = organicResults2;
1126
1863
  if ((options.pages ?? 1) >= 2) {
1127
- const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, start: "10" });
1864
+ const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, pws: "0", start: "10" });
1128
1865
  if (uule) p2params.set("uule", uule);
1129
1866
  await this.driver.navigateTo("https://www.google.com/search?" + p2params.toString());
1867
+ await this.throwIfCaptcha(page, "Google SERP page 2");
1130
1868
  const p2organic = await this.extractOrganicResults(page);
1131
1869
  allOrganic2 = [...organicResults2, ...p2organic.map((r) => ({ ...r, position: r.position + 10 }))];
1870
+ if (options.debug) {
1871
+ locationEvidence2 = inferSerpLocationEvidence(canonicalLocation, allOrganic2, localPack2);
1872
+ }
1132
1873
  }
1133
1874
  const stats2 = {
1134
1875
  seed: options.query,
@@ -1142,10 +1883,15 @@ var PAAExtractor = class {
1142
1883
  seed: options.query,
1143
1884
  location: options.location ?? null,
1144
1885
  extractedAt: (/* @__PURE__ */ new Date()).toISOString(),
1886
+ diagnostics: {
1887
+ completionStatus: "serp_only",
1888
+ problem: null,
1889
+ ...options.debug ? { debug: this.buildHarvestDebugSnapshot(options, canonicalLocation, uule, locationEvidence2) } : {}
1890
+ },
1145
1891
  totalQuestions: 0,
1146
- surface: "web",
1147
- aiOverview: { detected: false, text: null, citations: [] },
1148
- aiMode: { detected: false, text: null, citations: [] },
1892
+ surface: aiSurfaces2.surface,
1893
+ aiOverview: aiSurfaces2.aiOverview,
1894
+ aiMode: aiSurfaces2.aiMode,
1149
1895
  whatPeopleSaying: [],
1150
1896
  tree: [],
1151
1897
  flat: [],
@@ -1166,16 +1912,22 @@ var PAAExtractor = class {
1166
1912
  this.extractLocalPack(page)
1167
1913
  ]);
1168
1914
  const entityIds = this.mergeLocalPackIntoEntities(rawEntityIds, localPack);
1915
+ const initialLocationEvidence = options.debug ? inferSerpLocationEvidence(canonicalLocation, organicResults, localPack) : void 0;
1169
1916
  this.reporter.onVideos(videos);
1170
1917
  this.reporter.onForums(forums);
1171
1918
  if (!hasPaa) {
1172
1919
  let noPaaOrganic = organicResults;
1920
+ let locationEvidence2 = initialLocationEvidence;
1173
1921
  if ((options.pages ?? 1) >= 2) {
1174
- const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, start: "10" });
1922
+ const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, pws: "0", start: "10" });
1175
1923
  if (uule) p2params.set("uule", uule);
1176
1924
  await this.driver.navigateTo("https://www.google.com/search?" + p2params.toString());
1925
+ await this.throwIfCaptcha(page, "Google SERP page 2");
1177
1926
  const p2organic = await this.extractOrganicResults(page);
1178
1927
  noPaaOrganic = [...organicResults, ...p2organic.map((r) => ({ ...r, position: r.position + 10 }))];
1928
+ if (options.debug) {
1929
+ locationEvidence2 = inferSerpLocationEvidence(canonicalLocation, noPaaOrganic, localPack);
1930
+ }
1179
1931
  }
1180
1932
  const aiSurfaces2 = await this.extractAISurfaces(page);
1181
1933
  const stats2 = {
@@ -1190,6 +1942,11 @@ var PAAExtractor = class {
1190
1942
  seed: options.query,
1191
1943
  location: options.location ?? null,
1192
1944
  extractedAt: (/* @__PURE__ */ new Date()).toISOString(),
1945
+ diagnostics: {
1946
+ completionStatus: "no_paa",
1947
+ problem: null,
1948
+ ...options.debug ? { debug: this.buildHarvestDebugSnapshot(options, canonicalLocation, uule, locationEvidence2) } : {}
1949
+ },
1193
1950
  totalQuestions: 0,
1194
1951
  surface: aiSurfaces2.surface,
1195
1952
  aiOverview: aiSurfaces2.aiOverview,
@@ -1205,19 +1962,37 @@ var PAAExtractor = class {
1205
1962
  stats: stats2
1206
1963
  };
1207
1964
  }
1208
- const flat = await this.runBFS(page, options);
1965
+ const flat = await this.runBFS(page, options, signal);
1966
+ this.throwIfAborted(signal);
1209
1967
  const aiSurfaces = await this.extractAISurfaces(page);
1210
- const shortVidsParams = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, udm: ShortVideoSelectors.udm });
1968
+ const shortVidsParams = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, pws: "0", udm: ShortVideoSelectors.udm });
1211
1969
  if (uule) shortVidsParams.set("uule", uule);
1212
- const shortVideos = await this.extractShortVideos(page, "https://www.google.com/search?" + shortVidsParams.toString());
1970
+ let shortVideos = [];
1971
+ try {
1972
+ shortVideos = await this.extractShortVideos(page, "https://www.google.com/search?" + shortVidsParams.toString());
1973
+ } catch (err) {
1974
+ if (!(err instanceof CaptchaError)) throw err;
1975
+ errorCount++;
1976
+ diagnosticWarnings.push({
1977
+ code: "short_videos_captcha_skipped",
1978
+ surface: "short_videos",
1979
+ message: err.message,
1980
+ retryable: true
1981
+ });
1982
+ }
1213
1983
  this.reporter.onVideos(shortVideos);
1214
1984
  let allOrganic = organicResults;
1985
+ let locationEvidence = initialLocationEvidence;
1215
1986
  if ((options.pages ?? 1) >= 2) {
1216
- const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, start: "10" });
1987
+ const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, pws: "0", start: "10" });
1217
1988
  if (uule) p2params.set("uule", uule);
1218
1989
  await this.driver.navigateTo("https://www.google.com/search?" + p2params.toString());
1990
+ await this.throwIfCaptcha(page, "Google SERP page 2");
1219
1991
  const p2organic = await this.extractOrganicResults(page);
1220
1992
  allOrganic = [...organicResults, ...p2organic.map((r) => ({ ...r, position: r.position + 10 }))];
1993
+ if (options.debug) {
1994
+ locationEvidence = inferSerpLocationEvidence(canonicalLocation, allOrganic, localPack);
1995
+ }
1221
1996
  }
1222
1997
  const allVideos = [...videos, ...shortVideos];
1223
1998
  const tree = this.buildTree(flat, options.query);
@@ -1233,6 +2008,12 @@ var PAAExtractor = class {
1233
2008
  seed: options.query,
1234
2009
  location: options.location ?? null,
1235
2010
  extractedAt: (/* @__PURE__ */ new Date()).toISOString(),
2011
+ diagnostics: {
2012
+ completionStatus: "paa_found",
2013
+ problem: null,
2014
+ ...diagnosticWarnings.length > 0 ? { warnings: diagnosticWarnings } : {},
2015
+ ...options.debug ? { debug: this.buildHarvestDebugSnapshot(options, canonicalLocation, uule, locationEvidence) } : {}
2016
+ },
1236
2017
  totalQuestions: flat.length,
1237
2018
  surface: aiSurfaces.surface,
1238
2019
  aiOverview: aiSurfaces.aiOverview,
@@ -1251,8 +2032,6 @@ var PAAExtractor = class {
1251
2032
  errorCount++;
1252
2033
  this.reporter.onError(err instanceof Error ? err : new Error(String(err)));
1253
2034
  throw err;
1254
- } finally {
1255
- await this.driver.close();
1256
2035
  }
1257
2036
  }
1258
2037
  };
@@ -1366,61 +2145,654 @@ var ProgressReporter = class {
1366
2145
  }
1367
2146
  };
1368
2147
 
2148
+ // src/kernel-proxy-resolver.ts
2149
+ var import_sdk2 = __toESM(require("@onkernel/sdk"), 1);
2150
+ var US_STATE_CODES = {
2151
+ alabama: "AL",
2152
+ alaska: "AK",
2153
+ arizona: "AZ",
2154
+ arkansas: "AR",
2155
+ california: "CA",
2156
+ colorado: "CO",
2157
+ connecticut: "CT",
2158
+ delaware: "DE",
2159
+ florida: "FL",
2160
+ georgia: "GA",
2161
+ hawaii: "HI",
2162
+ idaho: "ID",
2163
+ illinois: "IL",
2164
+ indiana: "IN",
2165
+ iowa: "IA",
2166
+ kansas: "KS",
2167
+ kentucky: "KY",
2168
+ louisiana: "LA",
2169
+ maine: "ME",
2170
+ maryland: "MD",
2171
+ massachusetts: "MA",
2172
+ michigan: "MI",
2173
+ minnesota: "MN",
2174
+ mississippi: "MS",
2175
+ missouri: "MO",
2176
+ montana: "MT",
2177
+ nebraska: "NE",
2178
+ nevada: "NV",
2179
+ "new hampshire": "NH",
2180
+ "new jersey": "NJ",
2181
+ "new mexico": "NM",
2182
+ "new york": "NY",
2183
+ "north carolina": "NC",
2184
+ "north dakota": "ND",
2185
+ ohio: "OH",
2186
+ oklahoma: "OK",
2187
+ oregon: "OR",
2188
+ pennsylvania: "PA",
2189
+ "rhode island": "RI",
2190
+ "south carolina": "SC",
2191
+ "south dakota": "SD",
2192
+ tennessee: "TN",
2193
+ texas: "TX",
2194
+ utah: "UT",
2195
+ vermont: "VT",
2196
+ virginia: "VA",
2197
+ washington: "WA",
2198
+ "west virginia": "WV",
2199
+ wisconsin: "WI",
2200
+ wyoming: "WY"
2201
+ };
2202
+ var US_CITY_CENTER_ZIPS = {
2203
+ "atlanta|GA": "30303",
2204
+ "austin|TX": "78701",
2205
+ "baltimore|MD": "21201",
2206
+ "boston|MA": "02108",
2207
+ "boulder|CO": "80302",
2208
+ "charlotte|NC": "28202",
2209
+ "chicago|IL": "60601",
2210
+ "colorado_springs|CO": "80903",
2211
+ "columbus|OH": "43215",
2212
+ "dallas|TX": "75201",
2213
+ "denver|CO": "80202",
2214
+ "detroit|MI": "48226",
2215
+ "fort_collins|CO": "80524",
2216
+ "fort_worth|TX": "76102",
2217
+ "houston|TX": "77002",
2218
+ "indianapolis|IN": "46204",
2219
+ "jacksonville|FL": "32202",
2220
+ "las_vegas|NV": "89101",
2221
+ "los_angeles|CA": "90012",
2222
+ "louisville|KY": "40202",
2223
+ "loveland|CO": "80537",
2224
+ "memphis|TN": "38103",
2225
+ "miami|FL": "33131",
2226
+ "minneapolis|MN": "55401",
2227
+ "nashville|TN": "37203",
2228
+ "new_york|NY": "10001",
2229
+ "orlando|FL": "32801",
2230
+ "philadelphia|PA": "19103",
2231
+ "phoenix|AZ": "85004",
2232
+ "portland|OR": "97205",
2233
+ "raleigh|NC": "27601",
2234
+ "richmond|VA": "23219",
2235
+ "sacramento|CA": "95814",
2236
+ "salt_lake_city|UT": "84101",
2237
+ "san_antonio|TX": "78205",
2238
+ "san_diego|CA": "92101",
2239
+ "san_francisco|CA": "94103",
2240
+ "san_jose|CA": "95113",
2241
+ "seattle|WA": "98101"
2242
+ };
2243
// Returns the last six characters of `proxyId` (a short, loggable suffix),
// or null when `proxyId` is falsy.
+ function proxyIdSuffix2(proxyId) {
2244
+ return proxyId ? proxyId.slice(-6) : null;
2245
+ }
2246
// Builds the value returned by resolveKernelProxyId: the proxy id to use
// (`kernelProxyId`) plus a `resolution` record describing how it was chosen.
// The record carries only a presence flag and a six-character suffix of the
// id (via proxyIdSuffix2), alongside the source label, proxy mode, target and
// error passed in by the caller.
+ function resolution(source, proxyMode, proxyId, target, error) {
2247
+ return {
2248
+ kernelProxyId: proxyId,
2249
+ resolution: {
2250
+ source,
2251
+ proxyMode,
2252
+ proxyIdPresent: Boolean(proxyId),
2253
+ proxyIdSuffix: proxyIdSuffix2(proxyId),
2254
+ target,
2255
+ error
2256
+ }
2257
+ };
2258
+ }
2259
// Normalizes a state name for lookup: trims, lower-cases, and collapses every
// run of whitespace to a single space.
+ function normalizeStateName(value) {
2260
+ return value.trim().toLowerCase().replace(/\s+/g, " ");
2261
+ }
2262
// Normalizes a country name for comparison: trims, lower-cases, removes all
// periods (so "U.S.A." becomes "usa"), and collapses whitespace runs.
+ function normalizeCountryName(value) {
2263
+ return value.trim().toLowerCase().replace(/\./g, "").replace(/\s+/g, " ");
2264
+ }
2265
// Reports whether `country` names the United States. A missing/empty country
// is treated as the United States. Otherwise the normalized value must equal
// one of: "united states", "united states of america", "usa", "us".
+ function isUnitedStates(country) {
2266
+ if (!country) return true;
2267
+ const normalized = normalizeCountryName(country);
2268
+ return normalized === "united states" || normalized === "united states of america" || normalized === "usa" || normalized === "us";
2269
+ }
2270
/**
 * Resolve a region string to a two-letter US state (or territory) code.
 *
 * Accepts either a two-letter abbreviation in any case or a full state name
 * in any case / spacing. Returns null when the region is not recognized.
 *
 * Fixes over the previous version:
 *  - A two-letter token is only accepted when it is a known US postal code.
 *    Previously any two letters passed, so "US", "UK" or "ON" (Ontario) were
 *    treated as US states and produced bogus proxy targets.
 *  - The name lookup only consults own properties of US_STATE_CODES, so
 *    inputs such as "constructor" no longer return an inherited function.
 *
 * @param {string} region - state name or two-letter abbreviation
 * @returns {string|null} upper-case two-letter code, or null
 */
function stateCodeFor(region) {
  const trimmed = region.trim();
  if (/^[A-Za-z]{2}$/.test(trimmed)) {
    const code = trimmed.toUpperCase();
    // US_STATE_CODES holds the 50 states; DC and the territories are kept
    // acceptable because the old "any two letters" rule accepted them too.
    const knownCodes = new Set([...Object.values(US_STATE_CODES), "DC", "PR", "GU", "VI", "AS", "MP"]);
    return knownCodes.has(code) ? code : null;
  }
  const key = normalizeStateName(trimmed);
  return Object.prototype.hasOwnProperty.call(US_STATE_CODES, key) ? US_STATE_CODES[key] : null;
}
2275
// Derives candidate city identifiers from a display name. The name is
// NFKD-normalized, stripped of every non-ASCII character, and lower-cased;
// it is then split into alphanumeric words. Two spellings are produced —
// words joined with "_" and words joined with nothing — with empty strings
// and duplicates removed (a single-word city yields one candidate).
+ function kernelCityIdentifierCandidates(city) {
2276
+ const ascii = city.normalize("NFKD").replace(/[^\x00-\x7F]/g, "").toLowerCase();
2277
+ const words = ascii.split(/[^a-z0-9]+/).filter(Boolean);
2278
+ const underscored = words.join("_");
2279
+ const compact = words.join("");
2280
+ return Array.from(new Set([underscored, compact].filter(Boolean)));
2281
+ }
2282
// Builds the proxy name for a country/state, with an optional city suffix:
// "mcp-serp-residential-<country>-<state>[-<city>]". Country and state are
// lower-cased; `city` is appended as given.
+ function proxyName(country, state, city) {
2283
+ return city ? `mcp-serp-residential-${country.toLowerCase()}-${state.toLowerCase()}-${city}` : `mcp-serp-residential-${country.toLowerCase()}-${state.toLowerCase()}`;
2284
+ }
2285
// Builds the proxy name used for a ZIP-level proxy.
+ function zipProxyName(zip) {
2286
+ return `mcp-serp-residential-us-zip-${zip}`;
2287
+ }
2288
// Turns a free-form location plus a `gl` country code into a proxy target
// descriptor, or null when no target can be derived.
//
// Returns null when `location` is empty or `gl` is not "us" (case-insensitive).
// The location is canonicalized by normalizeLocation (defined elsewhere) and
// split on commas; a trailing part recognized by isUnitedStates is dropped
// when more than one part is present.
//   - One remaining part: treated as a state; yields a level "state" target.
//   - Otherwise: the first part is the city and the second the region; any
//     further parts are ignored. Yields a level "city" target whose `city`
//     is the first candidate from kernelCityIdentifierCandidates.
// Null is returned whenever the state or city cannot be resolved.
+ function parseKernelLocationProxyTarget(location, gl) {
2289
+ if (!location || gl.toLowerCase() !== "us") return null;
2290
+ const canonicalLocation = normalizeLocation(location);
2291
+ let parts = canonicalLocation.split(",").map((part) => part.trim()).filter(Boolean);
2292
+ if (parts.length > 1 && isUnitedStates(parts[parts.length - 1])) {
2293
+ parts = parts.slice(0, -1);
2294
+ }
2295
+ if (parts.length === 1) {
2296
+ const stateOnly = stateCodeFor(parts[0]);
2297
+ if (!stateOnly) return null;
2298
+ return {
2299
+ canonicalLocation,
2300
+ level: "state",
2301
+ country: "US",
2302
+ state: stateOnly,
2303
+ city: "",
2304
+ cityCandidates: [],
2305
+ proxyName: proxyName("US", stateOnly),
2306
+ config: {
2307
+ country: "US",
2308
+ state: stateOnly
2309
+ }
2310
+ };
2311
+ }
2312
+ const [city = "", region = ""] = parts;
2313
+ if (!city || !region) return null;
2314
+ const state = stateCodeFor(region);
2315
+ if (!state) return null;
2316
+ const cityCandidates = kernelCityIdentifierCandidates(city);
2317
+ const primaryCity = cityCandidates[0];
2318
+ if (!primaryCity) return null;
2319
+ return {
2320
+ canonicalLocation,
2321
+ level: "city",
2322
+ country: "US",
2323
+ state,
2324
+ city: primaryCity,
2325
+ cityCandidates,
2326
+ proxyName: proxyName("US", state, primaryCity),
2327
+ config: {
2328
+ country: "US",
2329
+ state,
2330
+ city: primaryCity
2331
+ }
2332
+ };
2333
+ }
2334
// Builds the "<city>|<state>" key used to index US_CITY_CENTER_ZIPS.
+ function cityZipKey(target) {
2335
+ return `${target.city}|${target.state}`;
2336
+ }
2337
// Chooses the ZIP for a target: an explicit ZIP wins when it is exactly five
// digits; otherwise the built-in US_CITY_CENTER_ZIPS table is consulted.
// Returns null when neither source yields a ZIP (an explicit value that is
// not five digits is ignored rather than rejected).
+ function knownZipFor(target, explicitZip) {
2338
+ if (explicitZip && /^\d{5}$/.test(explicitZip)) return explicitZip;
2339
+ return US_CITY_CENTER_ZIPS[cityZipKey(target)] ?? null;
2340
+ }
2341
// Returns a copy of `target` re-targeted at ZIP level: level becomes "zip",
// `zip` is attached, the proxy name comes from zipProxyName, and the config
// is replaced by { country, state, zip }. The input object is not mutated.
+ function zipTarget(target, zip) {
2342
+ return {
2343
+ ...target,
2344
+ level: "zip",
2345
+ zip,
2346
+ proxyName: zipProxyName(zip),
2347
+ config: {
2348
+ country: target.country,
2349
+ state: target.state,
2350
+ zip
2351
+ }
2352
+ };
2353
+ }
2354
// Tests whether an existing proxy's `config` corresponds to `target`.
//   - ZIP-level target: country (case-insensitive) and zip must match.
//   - Otherwise: country and state must match (case-insensitive). When `city`
//     is given the config's city must equal it exactly; when `city` is
//     omitted the config must have no city at all.
// A missing config or missing fields simply fail the comparison.
+ function configMatches(config, target, city) {
2355
+ if (target.level === "zip") {
2356
+ return config?.country?.toUpperCase() === target.country && config?.zip === target.zip;
2357
+ }
2358
+ return config?.country?.toUpperCase() === target.country && config?.state?.toUpperCase() === target.state && (city ? config?.city === city : !config?.city);
2359
+ }
2360
// Finds the first residential proxy with an id whose status is not
// "unavailable" and that matches `target` either by exact proxy name or by
// config (configMatches; the city is only compared for city-level targets).
// Returns null when there is none.
+ function findExistingTargetProxy(proxies, target) {
2361
+ return proxies.find((proxy) => proxy.type === "residential" && proxy.status !== "unavailable" && Boolean(proxy.id) && (proxy.name === target.proxyName || configMatches(proxy.config, target, target.level === "city" ? target.city : void 0))) ?? null;
2362
+ }
2363
// Looks for a reusable city-level proxy, trying each entry of
// `target.cityCandidates` in order. A proxy qualifies when it is residential,
// has an id, is not "unavailable", and matches the candidate by name or by
// config. Returns the first match, or null (always null when the candidate
// list is empty).
+ function findExistingProxy(proxies, target) {
2364
+ for (const city of target.cityCandidates) {
2365
+ const name = proxyName(target.country, target.state, city);
2366
+ const found = proxies.find((proxy) => proxy.type === "residential" && proxy.status !== "unavailable" && Boolean(proxy.id) && (proxy.name === name || configMatches(proxy.config, target, city)));
2367
+ if (found) return found;
2368
+ }
2369
+ return null;
2370
+ }
2371
// Returns a copy of `target` widened to state level: level becomes "state",
// the proxy name drops the city, and the config is reduced to
// { country, state }. Other fields of `target` (including `city` and
// `cityCandidates`) are carried over unchanged by the spread.
+ function stateTarget(target) {
2372
+ return {
2373
+ ...target,
2374
+ level: "state",
2375
+ proxyName: proxyName(target.country, target.state),
2376
+ config: {
2377
+ country: target.country,
2378
+ state: target.state
2379
+ }
2380
+ };
2381
+ }
2382
// Finds a reusable state-level proxy: residential, has an id, not
// "unavailable", and matching either the state proxy name or the target's
// config. configMatches is called without a city, so for a non-ZIP target a
// config that carries a city does not match. Returns null when none is found.
+ function findExistingStateProxy(proxies, target) {
2383
+ const name = proxyName(target.country, target.state);
2384
+ return proxies.find((proxy) => proxy.type === "residential" && proxy.status !== "unavailable" && Boolean(proxy.id) && (proxy.name === name || configMatches(proxy.config, target))) ?? null;
2385
+ }
2386
// Chooses the target used on retry attempts. Currently every retry widens to
// the state-level target; `attemptIndex` is accepted but not used.
+ function escalatedTargetLevel(target, attemptIndex) {
2387
+ return stateTarget(target);
2388
+ }
2389
// Converts any thrown value to text: the message of an Error, otherwise
// String(value).
+ function errorText2(err) {
2390
+ return err instanceof Error ? err.message : String(err);
2391
+ }
2392
/**
 * Decide which Kernel proxy id a harvest attempt should use.
 *
 * Resolution order:
 *  1. proxyMode "none"       -> no proxy ("disabled").
 *  2. proxyMode "configured" -> the configured proxy id, untouched.
 *  3. Otherwise (location mode): derive a US target from location/gl. When no
 *     target can be derived, or no API key is available, fall back to the
 *     configured proxy id.
 *  4. Retry attempts (attemptIndex >= 1) go straight to the state-level
 *     target: reuse an existing state proxy if there is one, else create it.
 *  5. First attempt: ZIP proxy (reuse, then create) -> city proxy (reuse,
 *     then create per candidate spelling) -> state proxy (reuse, then create).
 *  6. If everything fails, fall back to the configured proxy id and report
 *     the collected errors.
 *
 * Changes from the previous version:
 *  - The retry path now looks for an existing state proxy before creating
 *    one, matching the first-attempt path. Previously each retry created
 *    another identically named proxy.
 *  - The `error` field is consistently null (never "") when nothing failed.
 *
 * This function does not throw for Kernel API failures; they are reported in
 * the returned `resolution.error` string.
 *
 * @param {object} options - proxyMode, configuredKernelProxyId, kernelApiKey,
 *   location, gl, proxyZip, attemptIndex
 * @returns {Promise<object>} the value built by `resolution(...)`
 */
async function resolveKernelProxyId(options) {
  const { proxyMode, configuredKernelProxyId } = options;
  if (proxyMode === "none") {
    return resolution("disabled", proxyMode, void 0, null, null);
  }
  if (proxyMode === "configured") {
    return resolution("configured_fallback", proxyMode, configuredKernelProxyId, null, null);
  }
  const target = parseKernelLocationProxyTarget(options.location, options.gl);
  if (!target || !options.kernelApiKey) {
    return resolution("configured_fallback", proxyMode, configuredKernelProxyId, target, target ? null : "location could not be normalized to a US city/state proxy target");
  }
  const kernel = new import_sdk2.default({ apiKey: options.kernelApiKey });
  const failures = [];
  const failureSummary = () => failures.join(" | ") || null;
  // Creates one residential proxy; returns its id, or null after recording
  // why it failed (labelled so the summary says which level went wrong).
  const tryCreate = async (label, name, config) => {
    try {
      const created = await kernel.proxies.create({ type: "residential", name, config });
      if (created.id) return created.id;
      failures.push(`${label}: Kernel did not return a proxy id`);
    } catch (err) {
      failures.push(`${label}: ${errorText2(err)}`);
    }
    return null;
  };
  try {
    const attemptIndex = options.attemptIndex ?? 0;
    if (attemptIndex >= 1) {
      const escalatedTarget = escalatedTargetLevel(target, attemptIndex);
      // Listing is best-effort here: a list failure must not prevent the
      // create attempt that this path has always made.
      let knownProxies = [];
      try {
        knownProxies = await kernel.proxies.list();
      } catch (err) {
        failures.push(`list: ${errorText2(err)}`);
      }
      const reusable = findExistingStateProxy(knownProxies, escalatedTarget);
      if (reusable?.id) {
        return resolution("location_reused", proxyMode, reusable.id, escalatedTarget, failureSummary());
      }
      const escalatedId = await tryCreate(escalatedTarget.state, escalatedTarget.proxyName, escalatedTarget.config);
      if (escalatedId) {
        return resolution("location_created", proxyMode, escalatedId, escalatedTarget, failureSummary());
      }
      return resolution("configured_fallback", proxyMode, configuredKernelProxyId, escalatedTarget, failureSummary());
    }
    const proxies = await kernel.proxies.list();
    const zip = knownZipFor(target, options.proxyZip);
    if (zip) {
      const targetZip = zipTarget(target, zip);
      const existingZip = findExistingTargetProxy(proxies, targetZip);
      if (existingZip?.id) {
        return resolution("location_reused", proxyMode, existingZip.id, targetZip, null);
      }
      // The create request deliberately sends only country + zip (no state),
      // exactly as before.
      const zipId = await tryCreate(zip, targetZip.proxyName, { country: targetZip.country, zip });
      if (zipId) {
        return resolution("location_created", proxyMode, zipId, targetZip, null);
      }
    }
    const existing = findExistingProxy(proxies, target);
    if (existing?.id) {
      return resolution("location_reused", proxyMode, existing.id, target, failureSummary());
    }
    for (const city of target.cityCandidates) {
      const cityProxyName = proxyName(target.country, target.state, city);
      const cityId = await tryCreate(city, cityProxyName, { country: target.country, state: target.state, city });
      if (cityId) {
        return resolution("location_created", proxyMode, cityId, {
          ...target,
          level: "city",
          city,
          proxyName: cityProxyName,
          config: { country: target.country, state: target.state, city }
        }, null);
      }
    }
    const fallbackTarget = stateTarget(target);
    const existingState = findExistingStateProxy(proxies, fallbackTarget);
    if (existingState?.id) {
      return resolution("location_reused", proxyMode, existingState.id, fallbackTarget, failureSummary());
    }
    const stateId = await tryCreate(fallbackTarget.state, fallbackTarget.proxyName, fallbackTarget.config);
    if (stateId) {
      return resolution("location_created", proxyMode, stateId, fallbackTarget, failureSummary());
    }
    return resolution("configured_fallback", proxyMode, configuredKernelProxyId, target, failureSummary());
  } catch (err) {
    // e.g. the initial list() call failed: keep the harvest going on the
    // configured proxy and surface the reason.
    return resolution("configured_fallback", proxyMode, configuredKernelProxyId, target, errorText2(err));
  }
}
2506
+
1369
2507
  // src/harvest.ts
1370
2508
  var MAX_ATTEMPTS = 3;
1371
- async function extractOnce(options) {
2509
// Maps an aborted signal to the error to surface: the signal's own reason
// when it is a DOMException named "TimeoutError", otherwise a fresh
// RequestAbortedError (defined elsewhere in this file).
+ function abortReason(signal) {
2510
+ if (signal.reason instanceof DOMException && signal.reason.name === "TimeoutError") return signal.reason;
2511
+ return new RequestAbortedError();
2512
+ }
2513
// Extracts `signal` from the raw harvest options. Returns it only when the
// options are an object and the value is a real AbortSignal; otherwise
// returns undefined.
+ function getAbortSignal(rawOptions) {
2514
+ if (!rawOptions || typeof rawOptions !== "object") return void 0;
2515
+ const signal = rawOptions.signal;
2516
+ if (signal instanceof AbortSignal) return signal;
2517
+ return void 0;
2518
+ }
2519
// Extracts the optional `onAttemptEvent` callback from the raw harvest
// options. Returns undefined unless the options are an object and the value
// is a function.
+ function getAttemptLogSink(rawOptions) {
2520
+ if (!rawOptions || typeof rawOptions !== "object") return void 0;
2521
+ const sink = rawOptions.onAttemptEvent;
2522
+ return typeof sink === "function" ? sink : void 0;
2523
+ }
2524
// Delivers one attempt event to the caller-supplied sink, if any, and awaits
// it. A sink that throws or rejects never propagates: the failure is logged
// as a JSON "harvest_attempt_log_failed" warning and swallowed.
+ async function emitAttemptEvent(sink, event) {
2525
+ if (!sink) return;
2526
+ try {
2527
+ await sink(event);
2528
+ } catch (err) {
2529
+ console.warn(JSON.stringify({
2530
+ event: "harvest_attempt_log_failed",
2531
+ attempt_number: event.attemptNumber,
2532
+ message: err instanceof Error ? err.message : String(err)
2533
+ }));
2534
+ }
2535
+ }
2536
// Maps a thrown value to an outcome label for attempt events:
//   CaptchaError                                   -> "captcha"
//   RequestAbortedError                            -> "request_aborted"
//   DOMException named TimeoutError or AbortError  -> "timeout"
//   message matching the timeout/deadline pattern  -> "timeout"
//   anything else                                  -> "error"
// Checks run in that order, so the first match wins.
+ function classifyAttemptError(err) {
2537
+ if (err instanceof CaptchaError) return "captcha";
2538
+ if (err instanceof RequestAbortedError) return "request_aborted";
2539
+ if (err instanceof DOMException && (err.name === "TimeoutError" || err.name === "AbortError")) return "timeout";
2540
+ const message = err instanceof Error ? err.message : String(err);
2541
+ return /timeout|timed out|Timeout \d+ms exceeded|deadline/i.test(message) ? "timeout" : "error";
2542
+ }
2543
// Outcome label for a successful attempt: the result's own
// diagnostics.completionStatus when present; otherwise "paa_found" if any
// questions were collected, else "no_paa". `diagnostics` may be absent.
+ function classifyAttemptResult(result) {
2544
+ return result.diagnostics?.completionStatus ?? (result.totalQuestions > 0 ? "paa_found" : "no_paa");
2545
+ }
2546
// Converts any thrown value to text: the message of an Error, otherwise
// String(value).
+ function errorMessage(err) {
2547
+ return err instanceof Error ? err.message : String(err);
2548
+ }
2549
/**
 * Run a single extraction attempt with its own browser driver and always
 * close that driver afterwards.
 *
 * Never rejects for extraction failures: the outcome is returned as
 * `{ result, error, cleanup, debug }`, where exactly one of `result` /
 * `error` is set for a normal run, `cleanup` is whatever `driver.close()`
 * returned, and `debug` is the result's debug snapshot or (when
 * `options.debug` is set) a minimal snapshot built here.
 *
 * When `signal` aborts mid-run, the attempt settles immediately with the
 * abort reason; the in-flight extraction is left to fail on its own once the
 * driver is closed.
 *
 * Fix over the previous version: building `debug` happened unguarded inside
 * `finally`, before `driver.close()`. A result without `diagnostics` (which
 * other code treats as optional) or a throwing `getDebugSnapshot()` raised
 * there and skipped the close, leaking the browser session. The snapshot is
 * now read defensively and the close runs in its own `finally`.
 *
 * @param {object} options - validated harvest options for this attempt
 * @param {AbortSignal} [signal] - optional cancellation signal
 * @returns {Promise<{result: object|null, error: unknown, cleanup: unknown, debug: object|null}>}
 */
async function extractOnce(options, signal) {
  const driver = new BrowserDriver();
  const reporter = new ProgressReporter();
  const extractor = new PAAExtractor(driver, reporter);
  if (signal?.aborted) {
    // Aborted before any work started: still close the (unused) driver so
    // the caller gets a cleanup report.
    return {
      result: null,
      error: abortReason(signal),
      cleanup: await driver.close(),
      debug: null
    };
  }
  let onAbort;
  const abortPromise = signal ? new Promise((_, reject) => {
    onAbort = () => reject(abortReason(signal));
    signal.addEventListener("abort", onAbort, { once: true });
  }) : null;
  let result = null;
  let error = null;
  let cleanup;
  let debug = null;
  try {
    const extraction = extractor.extract(options, signal);
    // If the abort wins the race, the extraction promise is abandoned; give
    // it a no-op handler so its later rejection is not reported as unhandled.
    if (abortPromise) extraction.catch(() => {
    });
    result = await (abortPromise ? Promise.race([extraction, abortPromise]) : extraction);
  } catch (err) {
    error = err;
  } finally {
    if (signal && onAbort) signal.removeEventListener("abort", onAbort);
    try {
      debug = result?.diagnostics?.debug ?? (options.debug ? {
        enabled: true,
        request: {
          query: options.query,
          locationInput: options.location ?? null,
          canonicalLocation: null,
          uule: null,
          gl: options.gl,
          hl: options.hl,
          device: options.device,
          proxyMode: options.proxyMode,
          proxyZip: options.proxyZip ?? null,
          serpOnly: options.serpOnly,
          pages: options.pages ?? 1
        },
        browser: driver.getDebugSnapshot()
      } : null);
    } catch {
      // Debug data is best-effort; losing it must not block cleanup.
      debug = null;
    } finally {
      cleanup = await driver.close();
    }
  }
  return error ? { result: null, error, cleanup, debug } : { result, error: null, cleanup, debug };
}
1381
2600
  async function harvest(rawOptions) {
1382
2601
  const raw = typeof rawOptions === "object" && rawOptions !== null ? rawOptions : {};
1383
- const merged = {
1384
- kernelApiKey: process.env.KERNEL_API_KEY?.trim(),
1385
- kernelProxyId: process.env.KERNEL_PROXY_ID?.trim(),
1386
- ...raw
2602
+ const signal = getAbortSignal(rawOptions);
2603
+ const onAttemptEvent = getAttemptLogSink(rawOptions);
2604
+ const requestedProxyMode = raw.proxyMode;
2605
+ const proxyMode = requestedProxyMode === "none" ? "none" : requestedProxyMode === "configured" ? "configured" : "location";
2606
+ const kernelApiKey = typeof raw.kernelApiKey === "string" ? raw.kernelApiKey.trim() : process.env.KERNEL_API_KEY?.trim();
2607
+ const configuredKernelProxyId = typeof raw.kernelProxyId === "string" ? raw.kernelProxyId.trim() : process.env.KERNEL_PROXY_ID?.trim();
2608
+ const proxyOpts = {
2609
+ kernelApiKey,
2610
+ proxyMode,
2611
+ configuredKernelProxyId,
2612
+ location: typeof raw.location === "string" ? raw.location : void 0,
2613
+ proxyZip: typeof raw.proxyZip === "string" ? raw.proxyZip : void 0,
2614
+ gl: typeof raw.gl === "string" ? raw.gl : "us"
1387
2615
  };
1388
- const options = HarvestOptionsSchema.parse(merged);
1389
2616
  const serializer = new OutputSerializer();
1390
2617
  for (let i = 0; i < MAX_ATTEMPTS; i++) {
2618
+ const attemptNumber = i + 1;
2619
+ const startedAtMs = Date.now();
1391
2620
  try {
1392
- const result = await extractOnce(options);
1393
- if (options.format === "json" || options.format === "both") {
1394
- await serializer.writeJSON(result, options.outputDir);
2621
+ if (signal?.aborted) throw abortReason(signal);
2622
+ const resolution2 = await resolveKernelProxyId({ ...proxyOpts, attemptIndex: i });
2623
+ const mergedAttempt = {
2624
+ ...raw,
2625
+ kernelApiKey,
2626
+ kernelProxyId: resolution2.kernelProxyId,
2627
+ kernelProxyResolution: resolution2.resolution,
2628
+ proxyMode
2629
+ };
2630
+ if (proxyMode === "none") mergedAttempt.kernelProxyId = void 0;
2631
+ const attemptOptions = HarvestOptionsSchema.parse(mergedAttempt);
2632
+ await emitAttemptEvent(onAttemptEvent, {
2633
+ type: "started",
2634
+ attemptNumber,
2635
+ maxAttempts: MAX_ATTEMPTS,
2636
+ query: attemptOptions.query,
2637
+ location: attemptOptions.location ?? null,
2638
+ maxQuestions: attemptOptions.maxQuestions,
2639
+ startedAt: new Date(startedAtMs).toISOString()
2640
+ });
2641
+ console.info(JSON.stringify({
2642
+ event: "harvest_attempt_started",
2643
+ attempt_number: attemptNumber,
2644
+ max_attempts: MAX_ATTEMPTS,
2645
+ query: attemptOptions.query,
2646
+ location: attemptOptions.location ?? null,
2647
+ max_questions: attemptOptions.maxQuestions
2648
+ }));
2649
+ const attempt = await extractOnce(attemptOptions, signal);
2650
+ if (attempt.error) {
2651
+ const err = attempt.error;
2652
+ if (err instanceof CaptchaError) {
2653
+ const willRetry = i < MAX_ATTEMPTS - 1;
2654
+ console.warn(JSON.stringify({
2655
+ event: "harvest_attempt_captcha",
2656
+ attempt_number: attemptNumber,
2657
+ max_attempts: MAX_ATTEMPTS,
2658
+ message: err.message,
2659
+ will_retry: willRetry
2660
+ }));
2661
+ await emitAttemptEvent(onAttemptEvent, {
2662
+ type: "finished",
2663
+ attemptNumber,
2664
+ maxAttempts: MAX_ATTEMPTS,
2665
+ outcome: "captcha",
2666
+ kernelSessionId: attempt.cleanup.kernelSessionId,
2667
+ questionCount: 0,
2668
+ durationMs: Date.now() - startedAtMs,
2669
+ error: err.message,
2670
+ willRetry,
2671
+ cleanup: attempt.cleanup,
2672
+ debug: attempt.debug,
2673
+ completedAt: (/* @__PURE__ */ new Date()).toISOString()
2674
+ });
2675
+ if (willRetry) continue;
2676
+ break;
2677
+ }
2678
+ await emitAttemptEvent(onAttemptEvent, {
2679
+ type: "finished",
2680
+ attemptNumber,
2681
+ maxAttempts: MAX_ATTEMPTS,
2682
+ outcome: classifyAttemptError(err),
2683
+ kernelSessionId: attempt.cleanup.kernelSessionId,
2684
+ questionCount: 0,
2685
+ durationMs: Date.now() - startedAtMs,
2686
+ error: errorMessage(err),
2687
+ willRetry: false,
2688
+ cleanup: attempt.cleanup,
2689
+ debug: attempt.debug,
2690
+ completedAt: (/* @__PURE__ */ new Date()).toISOString()
2691
+ });
2692
+ throw err;
1395
2693
  }
1396
- if (options.format === "csv" || options.format === "both") {
2694
+ const result = attempt.result;
2695
+ if (!result) throw new Error("Harvest attempt completed without a result");
2696
+ await emitAttemptEvent(onAttemptEvent, {
2697
+ type: "finished",
2698
+ attemptNumber,
2699
+ maxAttempts: MAX_ATTEMPTS,
2700
+ outcome: classifyAttemptResult(result),
2701
+ kernelSessionId: attempt.cleanup.kernelSessionId,
2702
+ questionCount: result.totalQuestions,
2703
+ durationMs: Date.now() - startedAtMs,
2704
+ error: null,
2705
+ willRetry: false,
2706
+ cleanup: attempt.cleanup,
2707
+ debug: attempt.debug,
2708
+ completedAt: (/* @__PURE__ */ new Date()).toISOString()
2709
+ });
2710
+ if (attemptOptions.format === "json" || attemptOptions.format === "both") {
2711
+ await serializer.writeJSON(result, attemptOptions.outputDir);
2712
+ }
2713
+ if (attemptOptions.format === "csv" || attemptOptions.format === "both") {
1397
2714
  await Promise.all([
1398
- serializer.writeCSV(result.flat, options.outputDir),
1399
- result.videos.length > 0 ? serializer.writeVideoCSV(result.videos, result.seed, options.outputDir) : Promise.resolve(""),
1400
- result.forums.length > 0 ? serializer.writeForumCSV(result.forums, result.seed, options.outputDir) : Promise.resolve(""),
1401
- result.aiOverview.detected ? serializer.writeAIOverviewCSV(result.aiOverview.citations, result.aiOverview.text, result.seed, options.outputDir) : Promise.resolve(""),
1402
- result.aiMode.detected ? serializer.writeAIModeCSV(result.aiMode.citations, result.aiMode.text, result.seed, options.outputDir) : Promise.resolve(""),
1403
- result.whatPeopleSaying.length > 0 ? serializer.writeWhatPeopleSayingCSV(result.whatPeopleSaying, result.seed, options.outputDir) : Promise.resolve("")
2715
+ serializer.writeCSV(result.flat, attemptOptions.outputDir),
2716
+ result.videos.length > 0 ? serializer.writeVideoCSV(result.videos, result.seed, attemptOptions.outputDir) : Promise.resolve(""),
2717
+ result.forums.length > 0 ? serializer.writeForumCSV(result.forums, result.seed, attemptOptions.outputDir) : Promise.resolve(""),
2718
+ result.aiOverview.detected ? serializer.writeAIOverviewCSV(result.aiOverview.citations, result.aiOverview.text, result.seed, attemptOptions.outputDir) : Promise.resolve(""),
2719
+ result.aiMode.detected ? serializer.writeAIModeCSV(result.aiMode.citations, result.aiMode.text, result.seed, attemptOptions.outputDir) : Promise.resolve(""),
2720
+ result.whatPeopleSaying.length > 0 ? serializer.writeWhatPeopleSayingCSV(result.whatPeopleSaying, result.seed, attemptOptions.outputDir) : Promise.resolve("")
1404
2721
  ]);
1405
2722
  }
1406
2723
  return result;
1407
2724
  } catch (err) {
1408
- if (err instanceof CaptchaError && i < MAX_ATTEMPTS - 1) {
1409
- continue;
2725
+ if (err instanceof CaptchaError) {
2726
+ const willRetry = i < MAX_ATTEMPTS - 1;
2727
+ console.warn(JSON.stringify({
2728
+ event: "harvest_attempt_captcha",
2729
+ attempt_number: attemptNumber,
2730
+ max_attempts: MAX_ATTEMPTS,
2731
+ message: err.message,
2732
+ will_retry: willRetry
2733
+ }));
2734
+ await emitAttemptEvent(onAttemptEvent, {
2735
+ type: "finished",
2736
+ attemptNumber,
2737
+ maxAttempts: MAX_ATTEMPTS,
2738
+ outcome: "captcha",
2739
+ kernelSessionId: null,
2740
+ questionCount: 0,
2741
+ durationMs: Date.now() - startedAtMs,
2742
+ error: err.message,
2743
+ willRetry,
2744
+ cleanup: {
2745
+ kernelSessionId: null,
2746
+ kernelDeleteStarted: false,
2747
+ kernelDeleteSucceeded: null,
2748
+ kernelDeleteError: null,
2749
+ browserCloseSucceeded: null,
2750
+ browserCloseError: null
2751
+ },
2752
+ debug: null,
2753
+ completedAt: (/* @__PURE__ */ new Date()).toISOString()
2754
+ });
2755
+ if (willRetry) continue;
2756
+ break;
1410
2757
  }
2758
+ await emitAttemptEvent(onAttemptEvent, {
2759
+ type: "finished",
2760
+ attemptNumber,
2761
+ maxAttempts: MAX_ATTEMPTS,
2762
+ outcome: classifyAttemptError(err),
2763
+ kernelSessionId: null,
2764
+ questionCount: 0,
2765
+ durationMs: Date.now() - startedAtMs,
2766
+ error: errorMessage(err),
2767
+ willRetry: false,
2768
+ cleanup: {
2769
+ kernelSessionId: null,
2770
+ kernelDeleteStarted: false,
2771
+ kernelDeleteSucceeded: null,
2772
+ kernelDeleteError: null,
2773
+ browserCloseSucceeded: null,
2774
+ browserCloseError: null
2775
+ },
2776
+ debug: null,
2777
+ completedAt: (/* @__PURE__ */ new Date()).toISOString()
2778
+ });
1411
2779
  throw err;
1412
2780
  }
1413
2781
  }
1414
- const sessionDesc = options.kernelApiKey ? `${MAX_ATTEMPTS} fresh Kernel.sh sessions` : `${MAX_ATTEMPTS} attempts`;
1415
- throw new CaptchaError(`CAPTCHA on all ${sessionDesc}. Try again in a few minutes.`);
2782
+ console.warn(JSON.stringify({
2783
+ event: "harvest_captcha_exhausted",
2784
+ max_attempts: MAX_ATTEMPTS,
2785
+ session_kind: kernelApiKey ? "kernel" : "local"
2786
+ }));
2787
+ throw new CaptchaError(sanitizeVendorName(`CAPTCHA on all ${MAX_ATTEMPTS} fresh sessions. Try again in a few minutes.`));
1416
2788
  }
1417
2789
 
1418
2790
  // src/video/VideoGenerator.ts
1419
2791
  var import_node_child_process2 = require("child_process");
1420
- var import_node_fs3 = require("fs");
1421
- var import_node_os = require("os");
1422
- var import_node_path3 = require("path");
1423
- var import_client2 = require("@fal-ai/client");
2792
+ var import_node_fs4 = require("fs");
2793
+ var import_node_os2 = require("os");
2794
+ var import_node_path4 = require("path");
2795
+ var import_client3 = require("@fal-ai/client");
1424
2796
 
1425
2797
  // src/video/promptBuilder.ts
1426
2798
  var DEEPINFRA_URL = "https://api.deepinfra.com/v1/openai/chat/completions";
@@ -1490,72 +2862,78 @@ async function buildClipPrompts(question, answer) {
1490
2862
  }
1491
2863
  throw new Error("No LLM key \u2014 set DEEPINFRA_API_KEY or OPENROUTER_API_KEY");
1492
2864
  }
2865
// Pulls the four prompt fields (clip1, clip2, voiceover, audioMood) out of an
// episode brief. Throws an Error if any of them is missing or otherwise
// falsy; other fields on the brief are not copied.
+ function extractEpisodePrompts(brief) {
2866
+ if (!brief.clip1 || !brief.clip2 || !brief.voiceover || !brief.audioMood) {
2867
+ throw new Error("Episode brief is missing prompt fields \u2014 run blog-to-video skill to regenerate");
2868
+ }
2869
+ return { clip1: brief.clip1, clip2: brief.clip2, voiceover: brief.voiceover, audioMood: brief.audioMood };
2870
+ }
1493
2871
 
1494
2872
  // src/video/AudioGenerator.ts
1495
- var TTS_MODEL = "fal-ai/inworld-tts";
2873
+ var import_node_fs2 = require("fs");
2874
+ var import_node_path2 = require("path");
2875
+ var import_node_os = require("os");
2876
+ var import_client = require("@fal-ai/client");
1496
2877
  var MMAUDIO_MODEL = "fal-ai/mmaudio-v2";
1497
- var QUEUE_BASE = "https://queue.fal.run";
1498
- async function rawQueueRun(model, input, apiKey) {
1499
- const headers = { "Authorization": `Key ${apiKey}`, "Content-Type": "application/json" };
1500
- const submitRes = await fetch(`${QUEUE_BASE}/${model}`, {
1501
- method: "POST",
1502
- headers,
1503
- body: JSON.stringify(input)
1504
- });
1505
- if (!submitRes.ok) throw new Error(`${model} submit failed (${submitRes.status}): ${await submitRes.text()}`);
1506
- const { request_id } = await submitRes.json();
1507
- console.log(`[fal] submitted ${model} \u2192 ${request_id}`);
1508
- while (true) {
1509
- await new Promise((r) => setTimeout(r, 5e3));
1510
- const statusRes = await fetch(`${QUEUE_BASE}/${model}/requests/${request_id}/status`, { headers });
1511
- if (!statusRes.ok) continue;
1512
- const { status } = await statusRes.json();
1513
- console.log(`[fal] ${request_id} \u2192 ${status}`);
1514
- if (status === "FAILED") throw new Error(`${model} request ${request_id} failed`);
1515
- if (status !== "COMPLETED") continue;
1516
- const resultRes = await fetch(`${QUEUE_BASE}/${model}/requests/${request_id}`, { headers });
1517
- if (!resultRes.ok) throw new Error(`Result fetch failed (${resultRes.status})`);
1518
- return await resultRes.json();
1519
- }
1520
- }
1521
- function getKey() {
1522
- const key = process.env["FAL_KEY"];
1523
- if (!key) throw new Error("FAL_KEY required");
1524
- return key;
1525
- }
1526
- async function generateVoiceover(text, voice = "Serena (en)") {
2878
+ var ELEVENLABS_MODEL = "fal-ai/elevenlabs/tts";
2879
+ var GEMINI_TTS_MODEL = "fal-ai/google/gemini-2.5-flash-preview-tts";
2880
// Fetches `url` and returns the response body as a Buffer. Throws an Error
// carrying the HTTP status when the response is not ok.
+ async function downloadAudio(url) {
2881
+ const res = await fetch(url);
2882
+ if (!res.ok) throw new Error(`Failed to download TTS audio (${res.status})`);
2883
+ return Buffer.from(await res.arrayBuffer());
2884
+ }
2885
// Generates a voiceover for `text` and returns the path of the audio file
// written under a fresh "tts-<timestamp>" directory in the OS temp dir.
//
// New version (the "+" lines): first tries ELEVENLABS_MODEL through fal,
// using the ELEVENLABS_VOICE_ID env var or a built-in default voice id. If
// that throws for any reason, a warning is logged and GEMINI_TTS_MODEL is
// tried with the GEMINI_TTS_VOICE env var or "Kore". A failure of the second
// provider propagates to the caller. Both paths write to the same
// "voiceover.mp3" file name.
//
// The "-" lines are the removed previous implementation, which returned a
// URL rather than a local path.
// NOTE(review): callers that passed the old return value on as a URL need to
// accept a local path now — confirm against the call sites.
+ async function generateVoiceover(text) {
1527
2886
  console.log("[AudioGenerator] Generating voiceover...");
1528
- const out = await rawQueueRun(TTS_MODEL, { text, voice, sample_rate_hertz: 48e3 }, getKey());
1529
- return out.audio.url;
2887
+ const outDir = (0, import_node_path2.join)((0, import_node_os.tmpdir)(), `tts-${Date.now()}`);
2888
+ (0, import_node_fs2.mkdirSync)(outDir, { recursive: true });
2889
+ const outPath = (0, import_node_path2.join)(outDir, "voiceover.mp3");
2890
+ try {
2891
+ const voiceId = process.env["ELEVENLABS_VOICE_ID"] ?? "pNInz6obpgDQGcFmaJgB";
2892
+ const result2 = await import_client.fal.run(ELEVENLABS_MODEL, {
2893
+ input: { text, voice_id: voiceId, model_id: "eleven_v3" }
2894
+ });
2895
+ (0, import_node_fs2.writeFileSync)(outPath, await downloadAudio(result2.audio.url));
2896
+ console.log("[AudioGenerator] TTS: ElevenLabs via fal");
2897
+ return outPath;
2898
+ } catch (err) {
2899
+ console.warn("[AudioGenerator] ElevenLabs via fal failed, trying Gemini:", err.message);
2900
+ }
2901
+ const voice = process.env["GEMINI_TTS_VOICE"] ?? "Kore";
2902
+ const result = await import_client.fal.run(GEMINI_TTS_MODEL, { input: { text, voice } });
2903
+ (0, import_node_fs2.writeFileSync)(outPath, await downloadAudio(result.audio.url));
2904
+ console.log("[AudioGenerator] TTS: Gemini via fal");
2905
+ return outPath;
1530
2906
  }
1531
2907
  // Adds generated background audio to a video by running MMAUDIO_MODEL and
  // returns the URL of the resulting video (`result.video.url`).
  //
  // `mood` is sent as the prompt and `durationSeconds` as the duration; the
  // fixed negative prompt lists speech/voice/vocals terms.
  //
  // New version (the "+" lines) calls `fal.run` with the parameters wrapped in
  // `input`; the "-" lines are the removed `rawQueueRun` call.
  async function addBackgroundAudio(videoUrl, mood, durationSeconds) {
1532
2908
  console.log("[AudioGenerator] Adding background audio via MMAudio V2...");
1533
- const out = await rawQueueRun(MMAUDIO_MODEL, {
1534
- video_url: videoUrl,
1535
- prompt: mood,
1536
- negative_prompt: "speech, voice, talking, dialogue, narration, vocals, singing, human voice, conversation, words, lyrics, announcer, commentary",
1537
- duration: durationSeconds,
1538
- cfg_strength: 4.5
1539
- }, getKey());
1540
- return out.video.url;
2909
+ const result = await import_client.fal.run(MMAUDIO_MODEL, {
2910
+ input: {
2911
+ video_url: videoUrl,
2912
+ prompt: mood,
2913
+ negative_prompt: "speech, voice, talking, dialogue, narration, vocals, singing, human voice, conversation, words, lyrics, announcer, commentary",
2914
+ duration: durationSeconds,
2915
+ cfg_strength: 4.5
2916
+ }
2917
+ });
2918
+ return result.video.url;
1541
2919
  }
1542
2920
 
1543
2921
  // src/video/VideoMixer.ts
1544
2922
  var import_node_child_process = require("child_process");
1545
- var import_node_fs2 = require("fs");
1546
- var import_node_path2 = require("path");
1547
- var import_client = require("@fal-ai/client");
2923
+ var import_node_fs3 = require("fs");
2924
+ var import_node_path3 = require("path");
2925
+ var import_client2 = require("@fal-ai/client");
1548
2926
  async function download(url, destPath) {
1549
2927
  const res = await fetch(url);
1550
2928
  if (!res.ok) throw new Error(`Download failed (${res.status}): ${url}`);
1551
- (0, import_node_fs2.writeFileSync)(destPath, Buffer.from(await res.arrayBuffer()));
2929
+ (0, import_node_fs3.writeFileSync)(destPath, Buffer.from(await res.arrayBuffer()));
1552
2930
  }
1553
2931
  async function concatenateClips(clip1Url, clip2Url, outDir) {
1554
- (0, import_node_fs2.mkdirSync)(outDir, { recursive: true });
2932
+ (0, import_node_fs3.mkdirSync)(outDir, { recursive: true });
1555
2933
  const ts = Date.now();
1556
- const p1 = (0, import_node_path2.join)(outDir, `clip1-${ts}.mp4`);
1557
- const p2 = (0, import_node_path2.join)(outDir, `clip2-${ts}.mp4`);
1558
- const out = (0, import_node_path2.join)(outDir, `combined-${ts}.mp4`);
2934
+ const p1 = (0, import_node_path3.join)(outDir, `clip1-${ts}.mp4`);
2935
+ const p2 = (0, import_node_path3.join)(outDir, `clip2-${ts}.mp4`);
2936
+ const out = (0, import_node_path3.join)(outDir, `combined-${ts}.mp4`);
1559
2937
  console.log("[VideoMixer] Downloading clips...");
1560
2938
  await Promise.all([download(clip1Url, p1), download(clip2Url, p2)]);
1561
2939
  console.log("[VideoMixer] Concatenating...");
@@ -1567,14 +2945,14 @@ async function concatenateClips(clip1Url, clip2Url, outDir) {
1567
2945
  async function uploadToFal(localPath) {
1568
2946
  const { readFileSync: readFileSync2 } = await import("fs");
1569
2947
  const blob = new Blob([readFileSync2(localPath)], { type: "video/mp4" });
1570
- const url = await import_client.fal.storage.upload(blob);
2948
+ const url = await import_client2.fal.storage.upload(blob);
1571
2949
  console.log("[VideoMixer] Uploaded to fal:", url);
1572
2950
  return url;
1573
2951
  }
1574
2952
  async function overlayVoiceover(videoPath, voiceoverUrl, outDir) {
1575
2953
  const ts = Date.now();
1576
- const wav = (0, import_node_path2.join)(outDir, `voiceover-${ts}.wav`);
1577
- const out = (0, import_node_path2.join)(outDir, `final-${ts}.mp4`);
2954
+ const wav = (0, import_node_path3.join)(outDir, `voiceover-${ts}.wav`);
2955
+ const out = (0, import_node_path3.join)(outDir, `final-${ts}.mp4`);
1578
2956
  console.log("[VideoMixer] Downloading voiceover...");
1579
2957
  await download(voiceoverUrl, wav);
1580
2958
  console.log("[VideoMixer] Mixing voiceover over background audio...");
@@ -1599,30 +2977,30 @@ function buildInput(prompt, opts, seed, imageUrl) {
1599
2977
  };
1600
2978
  }
1601
2979
  async function generate(model, input) {
1602
- const { request_id } = await import_client2.fal.queue.submit(model, { input });
2980
+ const { request_id } = await import_client3.fal.queue.submit(model, { input });
1603
2981
  console.log(`[fal] submitted ${model} \u2192 ${request_id}`);
1604
2982
  while (true) {
1605
2983
  await new Promise((r) => setTimeout(r, 5e3));
1606
- const s = await import_client2.fal.queue.status(model, { requestId: request_id, logs: false });
2984
+ const s = await import_client3.fal.queue.status(model, { requestId: request_id, logs: false });
1607
2985
  console.log(`[fal] ${request_id} \u2192 ${s.status}`);
1608
2986
  if (s.status === "FAILED") throw new Error(`Request ${request_id} failed`);
1609
2987
  if (s.status !== "COMPLETED") continue;
1610
- const result = await import_client2.fal.queue.result(model, { requestId: request_id });
2988
+ const result = await import_client3.fal.queue.result(model, { requestId: request_id });
1611
2989
  return result.data;
1612
2990
  }
1613
2991
  }
1614
2992
  async function extractLastFrame(videoUrl, outDir) {
1615
2993
  const ts = Date.now();
1616
- const mp4Path = (0, import_node_path3.join)(outDir, `clip1-raw-${ts}.mp4`);
1617
- const jpgPath = (0, import_node_path3.join)(outDir, `last-frame-${ts}.jpg`);
2994
+ const mp4Path = (0, import_node_path4.join)(outDir, `clip1-raw-${ts}.mp4`);
2995
+ const jpgPath = (0, import_node_path4.join)(outDir, `last-frame-${ts}.jpg`);
1618
2996
  const res = await fetch(videoUrl);
1619
2997
  if (!res.ok) throw new Error(`Failed to download clip 1 (${res.status})`);
1620
- (0, import_node_fs3.writeFileSync)(mp4Path, Buffer.from(await res.arrayBuffer()));
2998
+ (0, import_node_fs4.writeFileSync)(mp4Path, Buffer.from(await res.arrayBuffer()));
1621
2999
  try {
1622
3000
  (0, import_node_child_process2.execSync)(`ffmpeg -sseof -0.1 -i "${mp4Path}" -vframes 1 -y "${jpgPath}" -loglevel error`);
1623
3001
  } finally {
1624
3002
  try {
1625
- (0, import_node_fs3.unlinkSync)(mp4Path);
3003
+ (0, import_node_fs4.unlinkSync)(mp4Path);
1626
3004
  } catch {
1627
3005
  }
1628
3006
  }
@@ -1632,11 +3010,11 @@ var VideoGenerator = class {
1632
3010
  constructor(apiKey) {
1633
3011
  const key = apiKey ?? process.env["FAL_KEY"];
1634
3012
  if (!key) throw new Error("FAL_KEY is required");
1635
- import_client2.fal.config({ credentials: key });
3013
+ import_client3.fal.config({ credentials: key });
1636
3014
  }
1637
3015
  async generateClipPair(question, answer, opts = {}) {
1638
- const outDir = opts.outputDir ?? (0, import_node_path3.join)((0, import_node_os.tmpdir)(), `paa-video-${Date.now()}`);
1639
- (0, import_node_fs3.mkdirSync)(outDir, { recursive: true });
3016
+ const outDir = opts.outputDir ?? (0, import_node_path4.join)((0, import_node_os2.tmpdir)(), `paa-video-${Date.now()}`);
3017
+ (0, import_node_fs4.mkdirSync)(outDir, { recursive: true });
1640
3018
  console.log("\n[1/7] Generating prompts via QWEN 3.6...");
1641
3019
  const prompts = await buildClipPrompts(question, answer);
1642
3020
  console.log(" Voiceover:", prompts.voiceover);
@@ -1645,10 +3023,55 @@ var VideoGenerator = class {
1645
3023
  const result1 = await generate(T2V, buildInput(prompts.clip1, opts, opts.seed));
1646
3024
  console.log("\n[3/7] Extracting last frame \u2192 clip 2 start...");
1647
3025
  const jpgPath = await extractLastFrame(result1.video.url, outDir);
1648
- const imageBlob = new Blob([(0, import_node_fs3.readFileSync)(jpgPath)], { type: "image/jpeg" });
1649
- const frameUrl = await import_client2.fal.storage.upload(imageBlob);
3026
+ const imageBlob = new Blob([(0, import_node_fs4.readFileSync)(jpgPath)], { type: "image/jpeg" });
3027
+ const frameUrl = await import_client3.fal.storage.upload(imageBlob);
3028
+ try {
3029
+ (0, import_node_fs4.unlinkSync)(jpgPath);
3030
+ } catch {
3031
+ }
3032
+ console.log("\n[4/7] Generating clip 2 (image-to-video from last frame)...");
3033
+ const seed2 = opts.seed !== void 0 ? opts.seed + 1 : void 0;
3034
+ const result2 = await generate(I2V, buildInput(prompts.clip2, opts, seed2, frameUrl));
3035
+ console.log("\n[5/7] Concatenating clips + generating voiceover (parallel)...");
3036
+ const [combinedPath, voiceoverUrl] = await Promise.all([
3037
+ concatenateClips(result1.video.url, result2.video.url, outDir),
3038
+ generateVoiceover(prompts.voiceover)
3039
+ ]);
3040
+ console.log("\n[6/7] Adding background audio via MMAudio V2...");
3041
+ const falVideoUrl = await uploadToFal(combinedPath);
3042
+ const totalDuration = (opts.clipDurationSeconds ?? 8) * 2;
3043
+ const videoWithAudioUrl = await addBackgroundAudio(falVideoUrl, prompts.audioMood, totalDuration);
3044
+ console.log("\n[7/7] Overlaying voiceover on final video...");
3045
+ const videoWithAudioPath = (0, import_node_path4.join)(outDir, `with-bg-audio-${Date.now()}.mp4`);
3046
+ const bgRes = await fetch(videoWithAudioUrl);
3047
+ (0, import_node_fs4.writeFileSync)(videoWithAudioPath, Buffer.from(await bgRes.arrayBuffer()));
3048
+ const finalVideoPath = await overlayVoiceover(videoWithAudioPath, voiceoverUrl, outDir);
3049
+ return {
3050
+ clip1Url: result1.video.url,
3051
+ clip2Url: result2.video.url,
3052
+ finalVideoPath,
3053
+ seed: result1.seed,
3054
+ promptClip1: prompts.clip1,
3055
+ promptClip2: prompts.clip2,
3056
+ voiceover: prompts.voiceover,
3057
+ audioMood: prompts.audioMood
3058
+ };
3059
+ }
3060
+ async generateEpisode(brief, opts = {}) {
3061
+ const outDir = opts.outputDir ?? (0, import_node_path4.join)((0, import_node_os2.tmpdir)(), `episode-${brief.episodeNumber}-${Date.now()}`);
3062
+ (0, import_node_fs4.mkdirSync)(outDir, { recursive: true });
3063
+ const prompts = extractEpisodePrompts(brief);
3064
+ console.log(`
3065
+ [Episode ${brief.episodeNumber}/${brief.episodeCount}] ${brief.sectionTitle}`);
3066
+ console.log(" Voiceover:", prompts.voiceover);
3067
+ console.log("\n[2/7] Generating clip 1 (text-to-video)...");
3068
+ const result1 = await generate(T2V, buildInput(prompts.clip1, opts, opts.seed));
3069
+ console.log("\n[3/7] Extracting last frame \u2192 clip 2 start...");
3070
+ const jpgPath = await extractLastFrame(result1.video.url, outDir);
3071
+ const imageBlob = new Blob([(0, import_node_fs4.readFileSync)(jpgPath)], { type: "image/jpeg" });
3072
+ const frameUrl = await import_client3.fal.storage.upload(imageBlob);
1650
3073
  try {
1651
- (0, import_node_fs3.unlinkSync)(jpgPath);
3074
+ (0, import_node_fs4.unlinkSync)(jpgPath);
1652
3075
  } catch {
1653
3076
  }
1654
3077
  console.log("\n[4/7] Generating clip 2 (image-to-video from last frame)...");
@@ -1657,16 +3080,16 @@ var VideoGenerator = class {
1657
3080
  console.log("\n[5/7] Concatenating clips + generating voiceover (parallel)...");
1658
3081
  const [combinedPath, voiceoverUrl] = await Promise.all([
1659
3082
  concatenateClips(result1.video.url, result2.video.url, outDir),
1660
- generateVoiceover(prompts.voiceover, opts.ttsVoice)
3083
+ generateVoiceover(prompts.voiceover)
1661
3084
  ]);
1662
3085
  console.log("\n[6/7] Adding background audio via MMAudio V2...");
1663
3086
  const falVideoUrl = await uploadToFal(combinedPath);
1664
3087
  const totalDuration = (opts.clipDurationSeconds ?? 8) * 2;
1665
3088
  const videoWithAudioUrl = await addBackgroundAudio(falVideoUrl, prompts.audioMood, totalDuration);
1666
3089
  console.log("\n[7/7] Overlaying voiceover on final video...");
1667
- const videoWithAudioPath = (0, import_node_path3.join)(outDir, `with-bg-audio-${Date.now()}.mp4`);
3090
+ const videoWithAudioPath = (0, import_node_path4.join)(outDir, `with-bg-audio-${Date.now()}.mp4`);
1668
3091
  const bgRes = await fetch(videoWithAudioUrl);
1669
- (0, import_node_fs3.writeFileSync)(videoWithAudioPath, Buffer.from(await bgRes.arrayBuffer()));
3092
+ (0, import_node_fs4.writeFileSync)(videoWithAudioPath, Buffer.from(await bgRes.arrayBuffer()));
1670
3093
  const finalVideoPath = await overlayVoiceover(videoWithAudioPath, voiceoverUrl, outDir);
1671
3094
  return {
1672
3095
  clip1Url: result1.video.url,