mcp-scraper 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. package/README.md +5 -0
  2. package/dist/bin/api-server.cjs +15553 -7587
  3. package/dist/bin/api-server.cjs.map +1 -1
  4. package/dist/bin/api-server.js +3 -3
  5. package/dist/bin/mcp-stdio-server.cjs +312 -119
  6. package/dist/bin/mcp-stdio-server.cjs.map +1 -1
  7. package/dist/bin/mcp-stdio-server.js +1 -1
  8. package/dist/bin/paa-harvest.cjs +1537 -165
  9. package/dist/bin/paa-harvest.cjs.map +1 -1
  10. package/dist/bin/paa-harvest.js +1 -1
  11. package/dist/{chunk-LXZDJJXR.js → chunk-D4CJBZBY.js} +426 -29
  12. package/dist/chunk-D4CJBZBY.js.map +1 -0
  13. package/dist/chunk-HERFK7W6.js +2781 -0
  14. package/dist/chunk-HERFK7W6.js.map +1 -0
  15. package/dist/chunk-JQKZWEON.js +1000 -0
  16. package/dist/chunk-JQKZWEON.js.map +1 -0
  17. package/dist/chunk-Y74EXABN.js +295 -0
  18. package/dist/chunk-Y74EXABN.js.map +1 -0
  19. package/dist/{db-IOYMX64U.js → db-YWCNHBLH.js} +36 -4
  20. package/dist/index.cjs +1660 -237
  21. package/dist/index.cjs.map +1 -1
  22. package/dist/index.d.cts +169 -2
  23. package/dist/index.d.ts +169 -2
  24. package/dist/index.js +120 -69
  25. package/dist/index.js.map +1 -1
  26. package/dist/server-W5NWH5KF.js +11625 -0
  27. package/dist/server-W5NWH5KF.js.map +1 -0
  28. package/dist/{worker-3ECJHPRE.js → worker-D4D2YQTA.js} +44 -9
  29. package/dist/worker-D4D2YQTA.js.map +1 -0
  30. package/package.json +17 -5
  31. package/dist/chunk-4API3ZCT.js +0 -1387
  32. package/dist/chunk-4API3ZCT.js.map +0 -1
  33. package/dist/chunk-LXZDJJXR.js.map +0 -1
  34. package/dist/chunk-ZBP4RHNW.js +0 -805
  35. package/dist/chunk-ZBP4RHNW.js.map +0 -1
  36. package/dist/server-63DR2HE5.js +0 -6062
  37. package/dist/server-63DR2HE5.js.map +0 -1
  38. package/dist/worker-3ECJHPRE.js.map +0 -1
  39. package/dist/{db-IOYMX64U.js.map → db-YWCNHBLH.js.map} +0 -0
@@ -33,6 +33,10 @@ var HarvestOptionsSchema = import_zod.z.object({
33
33
  location: import_zod.z.string().optional(),
34
34
  gl: import_zod.z.string().length(2).default("us"),
35
35
  hl: import_zod.z.string().length(2).default("en"),
36
+ device: import_zod.z.enum(["desktop", "mobile"]).default("desktop"),
37
+ proxyMode: import_zod.z.enum(["location", "configured", "none"]).default("location"),
38
+ proxyZip: import_zod.z.string().regex(/^\d{5}$/).optional(),
39
+ debug: import_zod.z.boolean().default(false),
36
40
  depth: import_zod.z.number().int().min(1).max(30).default(3),
37
41
  maxQuestions: import_zod.z.number().int().min(1).max(1e3).default(100),
38
42
  headless: import_zod.z.boolean().default(false),
@@ -40,6 +44,7 @@ var HarvestOptionsSchema = import_zod.z.object({
40
44
  proxy: import_zod.z.string().url().optional(),
41
45
  kernelApiKey: import_zod.z.string().optional(),
42
46
  kernelProxyId: import_zod.z.string().optional(),
47
+ kernelProxyResolution: import_zod.z.unknown().optional(),
43
48
  outputDir: import_zod.z.string().default("./paa-output"),
44
49
  format: import_zod.z.enum(["json", "csv", "both"]).default("both"),
45
50
  serpOnly: import_zod.z.boolean().default(false),
@@ -63,6 +68,45 @@ var RawPAAItemSchema = import_zod.z.object({
63
68
  sourceSite: import_zod.z.string().optional(),
64
69
  sourceCite: import_zod.z.string().optional()
65
70
  });
71
// Raw (unnormalized) Google Maps payloads. Every scalar is kept as the string
// that was scraped; `nullable()` marks fields that may be absent on the page.

// Headline facts of a Maps place panel.
var RawMapsOverviewSchema = import_zod.z.object({
  name: import_zod.z.string().nullable(),
  rating: import_zod.z.string().nullable(),
  reviewCount: import_zod.z.string().nullable(),
  category: import_zod.z.string().nullable(),
  address: import_zod.z.string().nullable(),
  hoursSummary: import_zod.z.string().nullable(),
  phone: import_zod.z.string().nullable(),
  phoneDisplay: import_zod.z.string().nullable(),
  website: import_zod.z.string().nullable(),
  plusCode: import_zod.z.string().nullable(),
  bookingUrl: import_zod.z.string().nullable()
});

// One row of the opening-hours table.
var RawMapsHoursRowSchema = import_zod.z.object({
  day: import_zod.z.string(),
  hours: import_zod.z.string()
});

// Aggregate review data: per-star histogram plus labelled review topics.
var RawMapsReviewStatsSchema = import_zod.z.object({
  reviewHistogram: import_zod.z.array(
    import_zod.z.object({ stars: import_zod.z.number(), count: import_zod.z.string() })
  ),
  reviewTopics: import_zod.z.array(
    import_zod.z.object({ label: import_zod.z.string(), count: import_zod.z.string() })
  )
});

// A single review card; only the id is required.
var RawMapsReviewCardSchema = import_zod.z.object({
  reviewId: import_zod.z.string(),
  author: import_zod.z.string().nullable(),
  stars: import_zod.z.string().nullable(),
  date: import_zod.z.string().nullable(),
  text: import_zod.z.string().nullable(),
  ownerResponse: import_zod.z.string().nullable()
});

// One attribute listed under a section of the "About" tab.
var RawMapsAboutAttributeSchema = import_zod.z.object({
  section: import_zod.z.string(),
  attribute: import_zod.z.string()
});
66
110
 
67
111
  // src/driver/BrowserDriver.ts
68
112
  var import_playwright_extra = require("playwright-extra");
@@ -78,7 +122,7 @@ var PAASelectors = {
78
122
  itemDataQ: "data-q",
79
123
  itemDataInitQ: "data-initq",
80
124
  itemQuestionEl: ".JlqpRe",
81
- answerContainer: ".bCOlv",
125
+ answerContainer: ".bCOlv, .hgKElc, .wDYxhc, .LGOjhe, .fo7IQd, .fmW3u",
82
126
  sourceTitle: "h3",
83
127
  sourceSite: ".VuuXrf",
84
128
  sourceCite: "cite",
@@ -118,9 +162,16 @@ var WhatPeopleSayingSelectors = {
118
162
  authorNote: ".nDgy9d"
119
163
  };
120
164
  var AIOverviewSelectors = {
121
- root: '[data-hveid="CBMQAA"]',
165
+ root: "[data-lhcontainer][data-streaming-container][eid]",
166
+ legacyRoot: '[data-hveid="CBMQAA"]',
122
167
  wrapper: ".Fgyi2e",
123
- citations: '.Fgyi2e [data-hveid] a[jsname="pxBnId"]'
168
+ controller: '[jscontroller="AkrxPe"]',
169
+ contentSubtree: '[data-subtree="mfc"]',
170
+ header: ".heWuVc",
171
+ heading: ".Fzsovc.cwYVJe.RJPOee",
172
+ showMoreButton: '[aria-label="Show more AI Overview"]',
173
+ sourcesPanel: ".OZ9ddf.WAUd4",
174
+ disclaimer: ".DuQANe.MSJHRb"
124
175
  };
125
176
  var AIModeSelectors = {
126
177
  root: '[data-hveid="CAUQAA"]',
@@ -148,6 +199,9 @@ var LocalPackSelectors = {
148
199
 
149
200
  // src/errors.ts
150
201
  var RECAPTCHA_INSTRUCTIONS = "Google returned a CAPTCHA. Run with --headless=false to re-warm the browser profile, then retry.";
202
/**
 * Removes the browser vendor's name ("Kernel" / "Kernel.sh") from a message.
 *
 * Replacement order matters:
 *   1. plural  "kernel[.sh] sessions" -> "sessions"
 *   2. singular "kernel[.sh] session" -> "this session"
 *   3. any remaining "kernel[.sh]"    -> "the service"
 * Runs of spaces are then collapsed and the result is trimmed.
 *
 * The plural patterns must NOT use `sessions?`: that also matches the singular
 * and would make step 2 unreachable (turning "Kernel session" into "sessions").
 * A leading "this " is consumed by the singular patterns so that
 * "this Kernel.sh session" does not become "this this session".
 *
 * @param {string} message - Text that may mention the vendor.
 * @returns {string} The message with vendor references replaced.
 */
function sanitizeVendorName(message) {
  return message
    .replace(/kernel\.sh\s+sessions/gi, "sessions")
    .replace(/(?:\bthis\s+)?kernel\.sh\s+session/gi, "this session")
    .replace(/kernel\.sh/gi, "the service")
    .replace(/kernel\s+sessions/gi, "sessions")
    .replace(/(?:\bthis\s+)?kernel\s+session/gi, "this session")
    .replace(/\bkernel\b/gi, "the service")
    .replace(/ +/g, " ")
    .trim();
}
151
205
  var CaptchaError = class extends Error {
152
206
  constructor(instructions) {
153
207
  super(`CAPTCHA detected. ${instructions}`);
@@ -164,10 +218,55 @@ var ExtractionError = class extends Error {
164
218
  cause;
165
219
  name = "ExtractionError";
166
220
  };
221
/**
 * Error type for a request that is aborted before the harvest finishes.
 * Instances carry `name === "RequestAbortedError"` as an own property.
 */
var RequestAbortedError = class extends Error {
  /**
   * @param {string} [message] - Defaults to a generic "aborted" description.
   */
  constructor(message = "Request aborted before harvest completed") {
    super(message);
    this.name = "RequestAbortedError";
  }
};
167
227
 
168
228
  // src/driver/BrowserDriver.ts
169
229
  import_playwright_extra.chromium.use((0, import_puppeteer_extra_plugin_stealth.default)());
170
230
  var DESKTOP_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36";
231
+ var MOBILE_USER_AGENT = "Mozilla/5.0 (iPhone; CPU iPhone OS 17_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Mobile/15E148 Safari/604.1";
232
+ var DEFAULT_KERNEL_BROWSER_TIMEOUT_SECONDS = 180;
233
+ var KERNEL_BROWSER_CLOSE_TIMEOUT_MS = 3e3;
234
+ var KERNEL_SESSION_DELETE_TIMEOUT_MS = 5e3;
235
/**
 * Reads an environment variable as a strictly positive integer.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value returned when the variable is unset, empty,
 *   not an integer, or not greater than zero.
 * @returns {number} The parsed integer, or `fallback`.
 */
function positiveIntFromEnv(name, fallback) {
  const value = process.env[name];
  if (value) {
    const candidate = Number(value);
    if (Number.isInteger(candidate) && candidate > 0) {
      return candidate;
    }
  }
  return fallback;
}
241
/**
 * Returns the last six characters of a proxy id, so the id can be referenced
 * in debug output without exposing it in full.
 *
 * @param {string | null | undefined} proxyId
 * @returns {string | null} The suffix, or null when no id was given.
 */
function proxyIdSuffix(proxyId) {
  if (!proxyId) {
    return null;
  }
  return proxyId.slice(-6);
}
244
/**
 * Converts a caught value to display text.
 *
 * @param {unknown} err - Anything that was thrown.
 * @returns {string} `err.message` for Error instances, otherwise `String(err)`.
 */
function errorText(err) {
  if (err instanceof Error) {
    return err.message;
  }
  return String(err);
}
247
/**
 * Builds the browser-context options object from a launch config.
 *
 * `viewport`, `locale` and `userAgent` are always present. When the config has
 * no `userAgent`, the mobile or desktop default is chosen from `isMobile`.
 * `deviceScaleFactor` is copied only when truthy; `isMobile` and `hasTouch`
 * are copied only when they are not `undefined`.
 *
 * @param {object} config - Launch configuration.
 * @returns {object} Options passed to `newContext` / `launchPersistentContext`.
 */
function rankCheckContextOptions(config) {
  const options = {
    viewport: config.viewport,
    locale: config.locale,
    userAgent: config.userAgent ?? (config.isMobile ? MOBILE_USER_AGENT : DESKTOP_USER_AGENT)
  };
  if (config.deviceScaleFactor) {
    options.deviceScaleFactor = config.deviceScaleFactor;
  }
  if (config.isMobile !== undefined) {
    options.isMobile = config.isMobile;
  }
  if (config.hasTouch !== undefined) {
    options.hasTouch = config.hasTouch;
  }
  return options;
}
257
/**
 * Awaits `promise`, but rejects if it has not settled within `timeoutMs`.
 *
 * The underlying operation is not cancelled on timeout; only the wait is
 * abandoned. The timer is always cleared once the race settles.
 *
 * @param {Promise<T>} promise - Operation to wait for.
 * @param {number} timeoutMs - Maximum wait in milliseconds.
 * @param {string} label - Prefix for the timeout error message.
 * @returns {Promise<T>} The settled value of `promise`.
 * @throws {Error} `"<label> timed out after <timeoutMs>ms"` on timeout, or
 *   whatever `promise` rejects with.
 * @template T
 */
async function withTimeout(promise, timeoutMs, label) {
  let timer;
  const deadline = new Promise((_resolve, reject) => {
    timer = setTimeout(() => {
      reject(new Error(`${label} timed out after ${timeoutMs}ms`));
    }, timeoutMs);
  });
  try {
    return await Promise.race([promise, deadline]);
  } finally {
    if (timer) clearTimeout(timer);
  }
}
171
270
  function buildYouTubeChannelVideosUrl(channelInput) {
172
271
  const raw = channelInput.trim();
173
272
  if (!raw) throw new Error("channelHandle is required");
@@ -201,30 +300,101 @@ var BrowserDriver = class {
201
300
  page = null;
202
301
  kernelClient = null;
203
302
  kernelSessionId = null;
303
+ debugEnabled = false;
304
+ debugSnapshot = {
305
+ kernel: null,
306
+ context: null,
307
+ networkLocation: null,
308
+ serpNavigation: null
309
+ };
204
310
  async launch(config) {
311
+ this.debugEnabled = config.debug === true;
312
+ const proxyMode = config.proxyMode ?? (config.kernelProxyId ? "configured" : "none");
313
+ const device = config.isMobile ? "mobile" : "desktop";
314
+ this.debugSnapshot = {
315
+ kernel: null,
316
+ context: {
317
+ viewport: config.viewport,
318
+ locale: config.locale,
319
+ device,
320
+ userAgent: config.userAgent ?? (config.isMobile ? MOBILE_USER_AGENT : DESKTOP_USER_AGENT),
321
+ deviceScaleFactor: config.deviceScaleFactor ?? null,
322
+ isMobile: config.isMobile === true,
323
+ hasTouch: config.hasTouch === true
324
+ },
325
+ networkLocation: null,
326
+ serpNavigation: null
327
+ };
205
328
  if (config.kernelApiKey) {
206
329
  this.kernelClient = new import_sdk.default({ apiKey: config.kernelApiKey });
330
+ const timeoutSeconds = positiveIntFromEnv("KERNEL_BROWSER_TIMEOUT_SECONDS", DEFAULT_KERNEL_BROWSER_TIMEOUT_SECONDS);
207
331
  const kernelBrowser = await this.kernelClient.browsers.create({
208
332
  stealth: true,
209
- timeout_seconds: 600,
333
+ timeout_seconds: timeoutSeconds,
210
334
  ...config.kernelProxyId ? { proxy_id: config.kernelProxyId } : {}
211
335
  });
212
336
  this.kernelSessionId = kernelBrowser.session_id;
337
+ let defaultProxyDisabled = null;
338
+ let defaultProxyDisableError = null;
339
+ if (proxyMode === "none") {
340
+ try {
341
+ await withTimeout(
342
+ this.kernelClient.browsers.update(this.kernelSessionId, { disable_default_proxy: true }),
343
+ 5e3,
344
+ `Kernel session ${this.kernelSessionId} disable default proxy`
345
+ );
346
+ defaultProxyDisabled = true;
347
+ } catch (err) {
348
+ defaultProxyDisabled = false;
349
+ defaultProxyDisableError = errorText(err);
350
+ }
351
+ }
352
+ const kernelDebug = {
353
+ sessionId: this.kernelSessionId,
354
+ proxyMode,
355
+ requestedProxyIdPresent: Boolean(config.kernelProxyId),
356
+ requestedProxyIdSuffix: proxyIdSuffix(config.kernelProxyId),
357
+ createdProxyIdPresent: typeof kernelBrowser.proxy_id === "string" ? Boolean(kernelBrowser.proxy_id) : null,
358
+ createdProxyIdSuffix: proxyIdSuffix(kernelBrowser.proxy_id),
359
+ retrievedProxyIdPresent: null,
360
+ retrievedProxyIdSuffix: null,
361
+ retrievedProxyIdMatchesRequested: null,
362
+ defaultProxyDisabled,
363
+ defaultProxyDisableError,
364
+ proxyResolution: config.kernelProxyResolution ?? null,
365
+ timeoutSeconds,
366
+ stealth: typeof kernelBrowser.stealth === "boolean" ? kernelBrowser.stealth : null,
367
+ profilePresent: null,
368
+ poolPresent: null,
369
+ retrieveError: null
370
+ };
371
+ this.debugSnapshot.kernel = kernelDebug;
372
+ console.info(JSON.stringify({
373
+ event: "kernel_browser_created",
374
+ kernel_session_id: this.kernelSessionId,
375
+ timeout_seconds: timeoutSeconds,
376
+ proxy_mode: proxyMode,
377
+ proxy_id_present: Boolean(config.kernelProxyId),
378
+ proxy_resolution_source: config.kernelProxyResolution?.source
379
+ }));
380
+ if (this.debugEnabled) {
381
+ await this.populateKernelRetrieveDebug(kernelDebug, config.kernelProxyId);
382
+ }
213
383
  this.browser = await import_playwright.chromium.connectOverCDP(kernelBrowser.cdp_ws_url);
214
- this.context = this.browser.contexts()[0] ?? await this.browser.newContext();
384
+ this.context = await this.browser.newContext(rankCheckContextOptions(config));
215
385
  await this.installEsbuildHelperShims(this.context);
216
- this.page = this.context.pages()[0] ?? await this.context.newPage();
386
+ this.page = await this.context.newPage();
387
+ await this.page.setViewportSize(config.viewport);
388
+ if (this.debugEnabled) {
389
+ this.debugSnapshot.networkLocation = await this.captureBrowserNetworkLocation();
390
+ }
217
391
  return;
218
392
  }
219
393
  const launchOpts = {
220
394
  headless: config.headless,
221
395
  proxy: config.proxy ? { server: config.proxy } : void 0
222
396
  };
223
- const ctxOpts = {
224
- viewport: config.viewport,
225
- locale: config.locale,
226
- userAgent: DESKTOP_USER_AGENT
227
- };
397
+ const ctxOpts = rankCheckContextOptions(config);
228
398
  if (config.profileDir) {
229
399
  this.context = await import_playwright_extra.chromium.launchPersistentContext(config.profileDir, {
230
400
  ...launchOpts,
@@ -238,6 +408,107 @@ var BrowserDriver = class {
238
408
  await this.installEsbuildHelperShims(this.context);
239
409
  this.page = await this.context.newPage();
240
410
  }
411
+ if (this.debugEnabled) {
412
+ this.debugSnapshot.networkLocation = await this.captureBrowserNetworkLocation();
413
+ }
414
+ }
415
+ async populateKernelRetrieveDebug(kernelDebug, requestedProxyId) {
416
+ if (!this.kernelClient || !this.kernelSessionId) return;
417
+ try {
418
+ const retrieved = await withTimeout(
419
+ this.kernelClient.browsers.retrieve(this.kernelSessionId),
420
+ 5e3,
421
+ `Kernel session ${this.kernelSessionId} retrieve`
422
+ );
423
+ kernelDebug.retrievedProxyIdPresent = typeof retrieved.proxy_id === "string" ? Boolean(retrieved.proxy_id) : false;
424
+ kernelDebug.retrievedProxyIdSuffix = proxyIdSuffix(retrieved.proxy_id);
425
+ kernelDebug.retrievedProxyIdMatchesRequested = requestedProxyId ? retrieved.proxy_id === requestedProxyId : !retrieved.proxy_id;
426
+ kernelDebug.timeoutSeconds = typeof retrieved.timeout_seconds === "number" ? retrieved.timeout_seconds : kernelDebug.timeoutSeconds;
427
+ kernelDebug.stealth = typeof retrieved.stealth === "boolean" ? retrieved.stealth : kernelDebug.stealth;
428
+ kernelDebug.profilePresent = Boolean(retrieved.profile);
429
+ kernelDebug.poolPresent = Boolean(retrieved.pool);
430
+ } catch (err) {
431
+ kernelDebug.retrieveError = errorText(err);
432
+ }
433
+ }
434
+ async captureBrowserNetworkLocation() {
435
+ const fallback = (message, source = "ipapi.co") => ({
436
+ source,
437
+ ip: null,
438
+ city: null,
439
+ region: null,
440
+ country: null,
441
+ org: null,
442
+ timezone: null,
443
+ error: message
444
+ });
445
+ if (!this.context) return fallback("browser context is not available");
446
+ let debugPage = null;
447
+ try {
448
+ debugPage = await this.context.newPage();
449
+ const ipwho = await this.loadJsonInDebugPage(debugPage, "https://ipwho.is/");
450
+ if (ipwho) {
451
+ const connection = typeof ipwho.connection === "object" && ipwho.connection !== null ? ipwho.connection : {};
452
+ return {
453
+ source: "ipwho.is",
454
+ ip: typeof ipwho.ip === "string" ? ipwho.ip : null,
455
+ city: typeof ipwho.city === "string" ? ipwho.city : null,
456
+ region: typeof ipwho.region === "string" ? ipwho.region : null,
457
+ country: typeof ipwho.country === "string" ? ipwho.country : null,
458
+ org: typeof connection.org === "string" ? connection.org : null,
459
+ timezone: typeof ipwho.timezone === "object" && ipwho.timezone !== null && typeof ipwho.timezone.id === "string" ? ipwho.timezone.id : null,
460
+ error: null
461
+ };
462
+ }
463
+ const ipify = await this.loadJsonInDebugPage(debugPage, "https://api64.ipify.org?format=json");
464
+ if (ipify) {
465
+ return {
466
+ source: "api64.ipify.org",
467
+ ip: typeof ipify.ip === "string" ? ipify.ip : null,
468
+ city: null,
469
+ region: null,
470
+ country: null,
471
+ org: null,
472
+ timezone: null,
473
+ error: null
474
+ };
475
+ }
476
+ await withTimeout(
477
+ debugPage.goto("https://ipapi.co/json/", { waitUntil: "domcontentloaded", timeout: 7e3 }),
478
+ 8e3,
479
+ "browser network location navigation"
480
+ );
481
+ const body = await debugPage.locator("body").innerText({ timeout: 2e3 });
482
+ const data = JSON.parse(body);
483
+ return {
484
+ source: "ipapi.co",
485
+ ip: typeof data.ip === "string" ? data.ip : null,
486
+ city: typeof data.city === "string" ? data.city : null,
487
+ region: typeof data.region === "string" ? data.region : null,
488
+ country: typeof data.country_name === "string" ? data.country_name : typeof data.country === "string" ? data.country : null,
489
+ org: typeof data.org === "string" ? data.org : null,
490
+ timezone: typeof data.timezone === "string" ? data.timezone : null,
491
+ error: null
492
+ };
493
+ } catch (err) {
494
+ return fallback(errorText(err));
495
+ } finally {
496
+ await debugPage?.close().catch(() => {
497
+ });
498
+ }
499
+ }
500
+ async loadJsonInDebugPage(debugPage, url) {
501
+ try {
502
+ await withTimeout(
503
+ debugPage.goto(url, { waitUntil: "domcontentloaded", timeout: 7e3 }),
504
+ 8e3,
505
+ `browser network location navigation ${url}`
506
+ );
507
+ const body = await debugPage.locator("body").innerText({ timeout: 2e3 });
508
+ return JSON.parse(body);
509
+ } catch {
510
+ return null;
511
+ }
241
512
  }
242
513
  async installEsbuildHelperShims(context) {
243
514
  await context.addInitScript(() => {
@@ -249,42 +520,79 @@ var BrowserDriver = class {
249
520
  };
250
521
  });
251
522
  }
252
- async navigateToSERP(query, uule, gl, hl) {
253
- const params = new URLSearchParams({ q: query, gl, hl });
523
+ async navigateToSERP(query, uule, gl, hl, options) {
524
+ const params = new URLSearchParams({ q: query, gl, hl, pws: "0" });
525
+ if (options?.num) params.set("num", String(options.num));
254
526
  if (uule) params.set("uule", uule);
255
527
  const url = "https://www.google.com/search?" + params.toString();
528
+ const navDebug = options?.debug ? {
529
+ requestedUrl: url,
530
+ finalUrl: null,
531
+ title: null,
532
+ bodySnippet: null,
533
+ hasPaa: null,
534
+ captchaDetected: null,
535
+ googleSorryUrl: null,
536
+ redirected: null
537
+ } : null;
538
+ if (navDebug) this.debugSnapshot.serpNavigation = navDebug;
256
539
  try {
257
540
  await this.page.goto(url, { waitUntil: "domcontentloaded", timeout: 45e3 });
258
541
  } catch (err) {
542
+ await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: null, captchaDetected: null });
259
543
  const diag = await this.captureDiagnostics(url);
260
544
  throw new ExtractionError(`page.goto failed: ${err.message} | ${diag}`);
261
545
  }
262
546
  const captchaCount = await this.page.locator(PAASelectors.captchaMarker).count();
263
547
  if (captchaCount > 0) {
264
- if (this.kernelClient) {
265
- try {
266
- await this.page.waitForSelector(PAASelectors.container, { timeout: 45e3 });
267
- return { hasPaa: true };
268
- } catch {
269
- throw new CaptchaError(this.captchaMessage());
270
- }
271
- }
548
+ await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: false, captchaDetected: true });
272
549
  throw new CaptchaError(this.captchaMessage());
273
550
  }
274
551
  const fastFound = await this.page.waitForSelector(PAASelectors.item, { timeout: 4e3 }).catch(() => null);
275
- if (fastFound) return { hasPaa: true };
552
+ if (fastFound) {
553
+ await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: true, captchaDetected: false });
554
+ return { hasPaa: true };
555
+ }
276
556
  const captchaAfter = await this.page.locator(PAASelectors.captchaMarker).count();
277
- if (captchaAfter > 0) throw new CaptchaError(this.captchaMessage());
557
+ if (captchaAfter > 0) {
558
+ await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: false, captchaDetected: true });
559
+ throw new CaptchaError(this.captchaMessage());
560
+ }
278
561
  for (let i = 1; i <= 6; i++) {
279
562
  await this.page.evaluate((f) => {
280
563
  window.scrollTo(0, document.body.scrollHeight * f);
281
564
  }, i / 6);
282
565
  await this.page.waitForTimeout(600);
283
566
  const count = await this.page.locator(PAASelectors.item).count();
284
- if (count > 0) return { hasPaa: true };
567
+ if (count > 0) {
568
+ await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: true, captchaDetected: false });
569
+ return { hasPaa: true };
570
+ }
285
571
  }
572
+ await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: false, captchaDetected: false });
286
573
  return { hasPaa: false };
287
574
  }
575
+ async updateSerpNavigationDebug(navDebug, requestedUrl, state) {
576
+ if (!navDebug || !this.page) return;
577
+ try {
578
+ const finalUrl = this.page.url();
579
+ const title = await this.page.title().catch(() => "");
580
+ const bodySnippet = await this.page.evaluate(() => {
581
+ const text = (document.body?.innerText ?? "").replace(/\s+/g, " ").trim();
582
+ return text.slice(0, 500);
583
+ }).catch(() => "");
584
+ const textCaptcha = /recaptcha|unusual traffic|are you a robot/i.test(bodySnippet);
585
+ navDebug.finalUrl = finalUrl;
586
+ navDebug.title = title;
587
+ navDebug.bodySnippet = bodySnippet;
588
+ navDebug.hasPaa = state.hasPaa;
589
+ navDebug.captchaDetected = state.captchaDetected ?? textCaptcha;
590
+ navDebug.googleSorryUrl = /google\.[^/]+\/sorry\//i.test(finalUrl);
591
+ navDebug.redirected = finalUrl !== requestedUrl;
592
+ } catch (err) {
593
+ navDebug.bodySnippet = `debug capture failed: ${errorText(err)}`;
594
+ }
595
+ }
288
596
  async captureDiagnostics(intendedUrl) {
289
597
  try {
290
598
  const finalUrl = this.page.url();
@@ -306,7 +614,7 @@ var BrowserDriver = class {
306
614
  }
307
615
  }
308
616
  captchaMessage() {
309
- return this.kernelClient ? "Google returned a CAPTCHA on this Kernel.sh session \u2014 retrying with a fresh session." : RECAPTCHA_INSTRUCTIONS;
617
+ return this.kernelClient ? "Google returned a CAPTCHA on this session \u2014 retrying with a fresh session." : RECAPTCHA_INSTRUCTIONS;
310
618
  }
311
619
  async navigateTo(url) {
312
620
  try {
@@ -331,6 +639,12 @@ var BrowserDriver = class {
331
639
  getPage() {
332
640
  return this.page;
333
641
  }
642
+ getKernelSessionId() {
643
+ return this.kernelSessionId;
644
+ }
645
+ getDebugSnapshot() {
646
+ return this.debugSnapshot;
647
+ }
334
648
  async close() {
335
649
  if (this.browser) {
336
650
  const b = this.browser;
@@ -341,21 +655,84 @@ var BrowserDriver = class {
341
655
  this.page = null;
342
656
  this.kernelSessionId = null;
343
657
  this.kernelClient = null;
344
- try {
345
- await b.close();
346
- } finally {
347
- if (client && sessionId) {
348
- await client.browsers.deleteByID(sessionId).catch(
349
- (err) => console.warn("Kernel session cleanup failed:", err)
350
- );
658
+ if (client && sessionId) {
659
+ console.info(JSON.stringify({
660
+ event: "kernel_browser_delete_started",
661
+ kernel_session_id: sessionId
662
+ }));
663
+ const deleteSession = withTimeout(
664
+ client.browsers.deleteByID(sessionId),
665
+ KERNEL_SESSION_DELETE_TIMEOUT_MS,
666
+ `Kernel session ${sessionId} delete`
667
+ );
668
+ const closeBrowser = withTimeout(
669
+ b.close(),
670
+ KERNEL_BROWSER_CLOSE_TIMEOUT_MS,
671
+ `Kernel browser ${sessionId} close`
672
+ );
673
+ const [deleteResult, closeResult] = await Promise.allSettled([deleteSession, closeBrowser]);
674
+ const result = {
675
+ kernelSessionId: sessionId,
676
+ kernelDeleteStarted: true,
677
+ kernelDeleteSucceeded: deleteResult.status === "fulfilled",
678
+ kernelDeleteError: deleteResult.status === "rejected" ? deleteResult.reason instanceof Error ? deleteResult.reason.message : String(deleteResult.reason) : null,
679
+ browserCloseSucceeded: closeResult.status === "fulfilled",
680
+ browserCloseError: closeResult.status === "rejected" ? closeResult.reason instanceof Error ? closeResult.reason.message : String(closeResult.reason) : null
681
+ };
682
+ if (deleteResult.status === "rejected") {
683
+ console.warn(JSON.stringify({
684
+ event: "kernel_browser_delete_failed",
685
+ kernel_session_id: sessionId,
686
+ message: result.kernelDeleteError
687
+ }));
688
+ console.warn(`Kernel session cleanup failed for ${sessionId}:`, deleteResult.reason);
689
+ } else {
690
+ console.info(JSON.stringify({
691
+ event: "kernel_browser_delete_succeeded",
692
+ kernel_session_id: sessionId
693
+ }));
351
694
  }
695
+ if (closeResult.status === "rejected") {
696
+ console.warn(JSON.stringify({
697
+ event: "kernel_browser_close_failed",
698
+ kernel_session_id: sessionId,
699
+ message: result.browserCloseError
700
+ }));
701
+ console.warn(`Kernel browser close failed for ${sessionId}:`, closeResult.reason);
702
+ }
703
+ return result;
352
704
  }
705
+ await b.close();
706
+ return {
707
+ kernelSessionId: null,
708
+ kernelDeleteStarted: false,
709
+ kernelDeleteSucceeded: null,
710
+ kernelDeleteError: null,
711
+ browserCloseSucceeded: true,
712
+ browserCloseError: null
713
+ };
353
714
  } else if (this.context) {
354
715
  const ctx = this.context;
355
716
  this.context = null;
356
717
  this.page = null;
357
718
  await ctx.close();
719
+ return {
720
+ kernelSessionId: null,
721
+ kernelDeleteStarted: false,
722
+ kernelDeleteSucceeded: null,
723
+ kernelDeleteError: null,
724
+ browserCloseSucceeded: true,
725
+ browserCloseError: null
726
+ };
358
727
  }
728
+ return {
729
+ kernelSessionId: null,
730
+ kernelDeleteStarted: false,
731
+ kernelDeleteSucceeded: null,
732
+ kernelDeleteError: null,
733
+ browserCloseSucceeded: null,
734
+ browserCloseError: null
735
+ };
359
736
  }
360
737
  };
361
738
 
@@ -426,13 +803,157 @@ var LOCATIONS = {
426
803
  };
427
804
 
428
805
  // src/uule.ts
806
/**
 * Encodes a non-negative integer as a base-128 varint: seven payload bits per
 * byte, least-significant group first, high bit set on every byte but the last.
 * Uses 32-bit bitwise operators.
 *
 * @param {number} value - Integer to encode.
 * @returns {number[]} Encoded bytes.
 */
function encodeVarint(value) {
  const out = [];
  let rest = value;
  for (;;) {
    const low = rest & 0x7f;
    rest >>>= 7;
    if (rest === 0) {
      out.push(low);
      return out;
    }
    out.push(low | 0x80);
  }
}
429
817
  function encodeUule(name) {
430
- const encoded = Buffer.from(String.fromCharCode(name.length) + name).toString("base64");
431
- return `w+CAIQICI${encoded}`;
818
+ const locationBytes = Buffer.from(name, "utf8");
819
+ const payload = Buffer.concat([
820
+ Buffer.from([8, 2, 16, 32, 34]),
821
+ Buffer.from(encodeVarint(locationBytes.length)),
822
+ locationBytes
823
+ ]);
824
+ return `w+${payload.toString("base64")}`;
432
825
  }
433
826
  function normalizeLocation(input) {
434
- const key = input.toLowerCase().trim();
435
- return LOCATIONS[key] ?? input;
827
+ const raw = input.toLowerCase().trim();
828
+ if (LOCATIONS[raw]) return LOCATIONS[raw];
829
+ const beforeComma = raw.split(",")[0].trim();
830
+ if (beforeComma !== raw && LOCATIONS[beforeComma]) return LOCATIONS[beforeComma];
831
+ const withoutState = raw.replace(/\s+[a-z]{2}$/, "").trim();
832
+ if (withoutState !== raw && LOCATIONS[withoutState]) return LOCATIONS[withoutState];
833
+ return input;
834
+ }
835
+
836
+ // src/serp-location-debug.ts
837
// Lower-cased US state names (plus DC) mapped to two-letter postal codes.
var STATE_TO_CODE = {
  alabama: "AL",
  alaska: "AK",
  arizona: "AZ",
  arkansas: "AR",
  california: "CA",
  colorado: "CO",
  connecticut: "CT",
  delaware: "DE",
  florida: "FL",
  georgia: "GA",
  hawaii: "HI",
  idaho: "ID",
  illinois: "IL",
  indiana: "IN",
  iowa: "IA",
  kansas: "KS",
  kentucky: "KY",
  louisiana: "LA",
  maine: "ME",
  maryland: "MD",
  massachusetts: "MA",
  michigan: "MI",
  minnesota: "MN",
  mississippi: "MS",
  missouri: "MO",
  montana: "MT",
  nebraska: "NE",
  nevada: "NV",
  "new hampshire": "NH",
  "new jersey": "NJ",
  "new mexico": "NM",
  "new york": "NY",
  "north carolina": "NC",
  "north dakota": "ND",
  ohio: "OH",
  oklahoma: "OK",
  oregon: "OR",
  pennsylvania: "PA",
  "rhode island": "RI",
  "south carolina": "SC",
  "south dakota": "SD",
  tennessee: "TN",
  texas: "TX",
  utah: "UT",
  vermont: "VT",
  virginia: "VA",
  washington: "WA",
  "west virginia": "WV",
  wisconsin: "WI",
  wyoming: "WY",
  "district of columbia": "DC"
};
// Regex alternation of every state name (inner whitespace relaxed to \s+)
// followed by every postal code.
var STATE_PATTERN = [
  ...Object.keys(STATE_TO_CODE).map((s) => s.replace(/\s+/g, "\\s+")),
  ...Object.values(STATE_TO_CODE)
].join("|");
// "<City words>[,] <State>": group 1 is one to five words, group 2 a state name
// or code. The `i` flag makes the match case-insensitive, so group 1 may carry
// leading filler words; normalizeCity() strips the common ones.
var CITY_STATE_RE = new RegExp(`\\b([A-Z][A-Za-z]+(?:[\\s.-][A-Z][A-Za-z]+){0,4}),?\\s+(${STATE_PATTERN})\\b`, "gi");

/**
 * Normalizes a region to a two-letter code.
 * @param {string | null | undefined} input - State name or two-letter code.
 * @returns {string | null} Upper-cased code; null for empty or unknown names.
 *   Any two-letter input is accepted as-is (not checked against the table).
 */
function normalizeRegionCode(input) {
  if (!input) return null;
  const trimmed = input.trim();
  if (/^[A-Z]{2}$/i.test(trimmed)) return trimmed.toUpperCase();
  return STATE_TO_CODE[trimmed.toLowerCase()] ?? null;
}

/**
 * Cleans a captured city phrase: collapses whitespace, drops everything up to
 * and including the last "in|near|around|serving", then title-cases the rest.
 * @param {string} input
 * @returns {string}
 */
function normalizeCity(input) {
  const cleaned = input.replace(/\s+/g, " ").trim().replace(/^.*\b(?:in|near|around|serving)\s+/i, "");
  return cleaned.toLowerCase().replace(/\b[a-z]/g, (char) => char.toUpperCase());
}

/**
 * Splits a canonical "City, Region[, ...]" string into the expected location.
 * @param {string | null | undefined} canonicalLocation
 * @returns {{city: string, regionCode: string | null, canonicalLocation: string} | null}
 */
function parseExpected(canonicalLocation) {
  if (!canonicalLocation) return null;
  const [city = "", region = ""] = canonicalLocation.split(",").map((part) => part.trim());
  return {
    city: normalizeCity(city),
    regionCode: normalizeRegionCode(region),
    canonicalLocation
  };
}

/**
 * Records one city/state sighting, keyed by "city|REGION". Keeps a count and
 * up to three distinct example snippets per key. Sightings whose city or
 * region does not normalize are ignored.
 * @param {Map<string, object>} candidates - Accumulator, mutated in place.
 * @param {string} city
 * @param {string} region
 * @param {string} example - Text excerpt where the sighting occurred.
 */
function addCandidate(candidates, city, region, example) {
  const normalizedCity = normalizeCity(city);
  const regionCode = normalizeRegionCode(region);
  if (!normalizedCity || !regionCode) return;
  const key = `${normalizedCity.toLowerCase()}|${regionCode}`;
  const existing = candidates.get(key);
  if (existing) {
    existing.count++;
    if (existing.examples.length < 3 && !existing.examples.includes(example)) existing.examples.push(example);
    return;
  }
  candidates.set(key, { city: normalizedCity, regionCode, count: 1, examples: [example] });
}

/**
 * Scans free text (titles, snippets, URLs) for "City, State" mentions and adds
 * each one to `candidates`.
 * @param {Map<string, object>} candidates - Accumulator, mutated in place.
 * @param {string} text
 */
function scanText(candidates, text) {
  let decoded;
  try {
    decoded = decodeURIComponent(text);
  } catch {
    // SERP text routinely contains a literal "%" (e.g. "50% off"), which is not
    // a valid escape sequence and makes decodeURIComponent throw URIError.
    // Scan the undecoded text rather than aborting the whole inference.
    decoded = text;
  }
  // Treat common URL/path separators as spaces so slugs can match too.
  const normalized = decoded.replace(/[+/|_-]+/g, " ");
  for (const match of normalized.matchAll(CITY_STATE_RE)) {
    addCandidate(candidates, match[1] ?? "", match[2] ?? "", normalized.slice(0, 180));
  }
}

/**
 * Infers which location a SERP appears to be localized to, and compares it
 * with the requested one.
 *
 * @param {string | null | undefined} canonicalLocation - Requested "City, Region".
 * @param {Array<{title: string, snippet?: ?string, cite?: ?string, url: string}>} organicResults
 * @param {Array<{name: string, metadata: string[], websiteUrl?: ?string, directionsUrl?: ?string}>} localPack
 * @returns {{status: "not_requested" | "unknown" | "matched" | "mismatch",
 *   expected: object | null, candidates: object[]}} Up to eight candidates,
 *   most frequent first (ties broken by city name). Status is "not_requested"
 *   without a requested location and "unknown" when nothing was detected.
 */
function inferSerpLocationEvidence(canonicalLocation, organicResults, localPack) {
  const expected = parseExpected(canonicalLocation);
  const candidates = /* @__PURE__ */ new Map();
  for (const result of organicResults) {
    scanText(candidates, [result.title, result.snippet ?? "", result.cite ?? "", result.url].join(" "));
  }
  for (const business of localPack) {
    scanText(candidates, [business.name, ...business.metadata, business.websiteUrl ?? "", business.directionsUrl ?? ""].join(" "));
  }
  const rankedCandidates = Array.from(candidates.values()).sort((a, b) => b.count - a.count || a.city.localeCompare(b.city)).slice(0, 8);
  if (!expected) {
    return { status: "not_requested", expected: null, candidates: rankedCandidates };
  }
  if (rankedCandidates.length === 0) {
    return { status: "unknown", expected, candidates: [] };
  }
  // A candidate matches on city name; the region is compared only when the
  // expected location carries one.
  const matched = rankedCandidates.some(
    (candidate) => candidate.city.toLowerCase() === expected.city.toLowerCase() && (expected.regionCode == null || candidate.regionCode === expected.regionCode)
  );
  return {
    status: matched ? "matched" : "mismatch",
    expected,
    candidates: rankedCandidates
  };
}
437
958
 
438
959
  // src/lib/paa-answer-cleanup.ts
@@ -527,7 +1048,220 @@ function cleanPAAAnswerText(answer, question, sourceTitle) {
527
1048
  return text;
528
1049
  }
529
1050
 
1051
+ // src/extractor/ai-surfaces.ts
1052
/**
 * Reads Google's AI Overview and AI Mode surfaces out of the current document.
 *
 * This function is handed to `page.evaluate` and therefore executes in the
 * browser page: it relies on the `window`/`document` globals and must stay
 * self-contained (every helper is nested inside it).
 *
 * @param {object} [config] - Selector configuration. When null/undefined the
 *   built-in defaults below are used.
 * @param {object} config.aio - AI Overview selectors (`root`, `legacyRoot`,
 *   `wrapper`, `controller`, `contentSubtree`, `heading`, `header`,
 *   `showMoreButton`, `sourcesPanel`, `disclaimer`).
 * @param {object} config.aim - AI Mode selectors (`root`, `wrapper`).
 * @param {number} [config.expandWaitMs=1500] - Delay after clicking "Show more".
 * @returns {Promise<{surface: string, aiOverview: object, aiMode: object}>}
 *   `aiOverview` carries `detected`, `text`, `citations`, `expanded`,
 *   `fullyExpanded`, `sections`; `aiMode` carries `detected`, `text`, `citations`.
 */
async function extractAISurfacesFromDocument(config) {
  const selectors = config ?? {
    aio: {
      root: "[data-lhcontainer][data-streaming-container][eid]",
      legacyRoot: '[data-hveid="CBMQAA"]',
      wrapper: ".Fgyi2e",
      controller: '[jscontroller="AkrxPe"]',
      contentSubtree: '[data-subtree="mfc"]',
      heading: ".Fzsovc.cwYVJe.RJPOee",
      header: ".heWuVc",
      showMoreButton: '[aria-label="Show more AI Overview"]',
      sourcesPanel: ".OZ9ddf.WAUd4",
      disclaimer: ".DuQANe.MSJHRb"
    },
    aim: {
      root: '[data-hveid="CAUQAA"]',
      wrapper: ".Fgyi2e"
    },
    expandWaitMs: 1500
  };
  const sn = window.google?.sn ?? "unknown";
  const surface = sn === "aim" ? "aim" : sn === "web" ? "web" : "unknown";

  /** Trimmed rendered text of an element ("" for a missing element). */
  function textOf(el) {
    if (!el) return "";
    return (el.innerText ?? el.textContent ?? "").trim();
  }

  /** True when the element carries an "AI Overview" label (heading, header part, or anywhere in its text). */
  function hasAIOverviewLabel(el) {
    const heading = el.querySelector(selectors.aio.heading);
    if (textOf(heading) === "AI Overview") return true;
    const header = el.querySelector(selectors.aio.header);
    if (textOf(header).split(/\n|\s{2,}/).some((part) => part.trim() === "AI Overview")) return true;
    return textOf(el).includes("AI Overview");
  }

  /**
   * Locates the AI Overview root: a labelled primary root, else the first
   * primary root, else the legacy root, else an ancestor (up to 8 levels) of
   * an "AI Overview" heading.
   */
  function findAIORoot() {
    const primaryRoots = Array.from(document.querySelectorAll(selectors.aio.root));
    const labeledPrimary = primaryRoots.find(hasAIOverviewLabel);
    if (labeledPrimary) return labeledPrimary;
    if (primaryRoots.length > 0) return primaryRoots[0];
    if (selectors.aio.legacyRoot) {
      const legacy = document.querySelector(selectors.aio.legacyRoot);
      if (legacy) return legacy;
    }
    const headings = document.querySelectorAll(`${selectors.aio.heading}, h1, h2, h3, [role="heading"]`);
    for (const h of headings) {
      if (textOf(h) !== "AI Overview") continue;
      let el = h.parentElement;
      for (let i = 0; i < 8 && el; i++) {
        if (el.matches(selectors.aio.root) || el.querySelector(selectors.aio.controller) || el.querySelector(selectors.aio.contentSubtree)) {
          return el;
        }
        el = el.parentElement;
      }
      return h.parentElement;
    }
    return null;
  }

  /**
   * Returns the readable answer text of `target`, or null when there is no
   * usable text (missing element, empty result, or an error-state message).
   */
  function cleanText(target) {
    if (!target) return null;
    const clone = target.cloneNode(true);
    // Drop unset selectors first: a custom config may omit optional entries,
    // and an empty segment ("a,,b") makes querySelectorAll throw SyntaxError.
    const removable = [
      "script",
      "style",
      "noscript",
      "img",
      "picture",
      "video",
      selectors.aio.header,
      selectors.aio.showMoreButton,
      selectors.aio.sourcesPanel,
      selectors.aio.disclaimer,
      '[data-subtree="dfa"]',
      "[data-src-id]",
      '[role="dialog"]',
      ".HWMcu",
      ".bTFeG",
      ".CyMdWb",
      ".MFrAxb",
      ".F0OfWd.hfWAgb",
      ".x2qcTc.fZavHb",
      ".SvjEff",
      ".sR2MY",
      ".lKuDef",
      ".GSPQcc",
      "a[href]",
      "button",
      '[role="button"]'
    ].filter(Boolean).join(",");
    clone.querySelectorAll(removable).forEach((el) => el.remove());

    // The clone is attached off-screen so that `innerText` reflects layout
    // (line breaks) instead of raw markup whitespace.
    const holder = document.createElement("div");
    holder.style.position = "fixed";
    holder.style.left = "-10000px";
    holder.style.top = "0";
    holder.style.width = `${Math.max(320, Math.round(target.getBoundingClientRect?.().width || 960))}px`;
    holder.style.opacity = "0";
    holder.style.pointerEvents = "none";
    holder.append(clone);
    let rendered;
    try {
      document.body.append(holder);
      rendered = clone.innerText || clone.textContent || "";
    } finally {
      // Always detach the scratch node, even if reading the text failed.
      holder.remove();
    }

    const lines = rendered
      .replace(/\r/g, "")
      .replace(/[ \t]+\n/g, "\n")
      .replace(/\n[ \t]+/g, "\n")
      .replace(/\n{3,}/g, "\n\n")
      .replace(/[ \t]{2,}/g, " ")
      .trim()
      .split("\n")
      .map((line) => line.replace(/\u00a0/g, " ").trim())
      .filter(Boolean);
    const filteredLines = [];
    for (let i = 0; i < lines.length; i++) {
      const line = lines[i];
      const next = lines[i + 1] ?? "";
      if (line === "AI Overview") continue;
      if (line === "Show more") continue;
      if (/^AI can make mistakes/i.test(line)) continue;
      if (/^Thank you\b/i.test(line)) continue;
      if (/^Your feedback helps Google improve/i.test(line)) continue;
      if (/^\+?\d+$/.test(line)) continue;
      // A short line followed by a "+N" counter line is dropped together with
      // that counter (presumably a source chip — TODO confirm against live markup).
      if (/^\+\d+$/.test(next) && line.length <= 80) {
        i++;
        continue;
      }
      filteredLines.push(line);
    }
    const raw = filteredLines.join("\n").replace(/\n{3,}/g, "\n\n").trim();
    if (!raw || /not available|try again|can't generate/i.test(raw)) return null;
    return raw;
  }

  /**
   * Resolves `rawHref` to an absolute external http(s) URL. Google redirect
   * links (`q`/`url` query parameter) are unwrapped; Google-internal,
   * non-http(s) and unparsable links yield null.
   */
  function normalizeHref(rawHref) {
    if (!rawHref || rawHref.startsWith("javascript:")) return null;
    let href = rawHref;
    try {
      const absolute = new URL(rawHref, window.location.href);
      const q = absolute.searchParams.get("q") ?? absolute.searchParams.get("url");
      if (/(\.|^)google\./i.test(absolute.hostname) && q?.startsWith("http")) {
        href = q;
      } else {
        href = absolute.href;
      }
    } catch {
      return null;
    }
    if (!/^https?:\/\//i.test(href)) return null;
    try {
      const url = new URL(href);
      const isGoogleInternal = /(\.|^)google\./i.test(url.hostname);
      if (isGoogleInternal) return null;
      return url.href;
    } catch {
      return null;
    }
  }

  /** Collects de-duplicated `{ text, href }` citations from the links under `root`. */
  function extractCitations(root) {
    if (!root) return [];
    const seen = /* @__PURE__ */ new Set();
    const citations = [];
    for (const a of Array.from(root.querySelectorAll("a[href]"))) {
      const href = normalizeHref(a.getAttribute("href") ?? "");
      if (!href || seen.has(href)) continue;
      seen.add(href);
      let fallbackHost = "";
      try {
        fallbackHost = new URL(href).hostname.replace(/^www\./, "");
      } catch {
        // href was already validated by normalizeHref; keep the empty fallback.
      }
      citations.push({
        text: textOf(a) || fallbackHost || href,
        href
      });
    }
    return citations;
  }

  /** Clicks a collapsed "Show more" button and waits `expandWaitMs`; returns whether it clicked. */
  async function maybeExpand(root) {
    const button = root.querySelector(selectors.aio.showMoreButton);
    if (!button || button.getAttribute("aria-expanded") !== "false") return false;
    button.click();
    const waitMs = selectors.expandWaitMs ?? 1500;
    if (waitMs > 0) await new Promise((resolve) => setTimeout(resolve, waitMs));
    return true;
  }

  const aioRoot = findAIORoot();
  let aioText = null;
  let aioCitations = [];
  let aioExpanded = false;
  let aioFullyExpanded = false;
  let aioSections = [];
  if (aioRoot) {
    aioExpanded = await maybeExpand(aioRoot);
    const controller = aioRoot.querySelector(selectors.aio.controller);
    const contentSubtree = aioRoot.querySelector(selectors.aio.contentSubtree);
    const showMore = aioRoot.querySelector(selectors.aio.showMoreButton);
    aioFullyExpanded = controller?.getAttribute("data-trnct") === "false" || showMore?.getAttribute("aria-expanded") === "true" || !showMore;
    aioText = cleanText(contentSubtree ?? controller ?? aioRoot);
    // Sections are the numbered lines ("1. ...") of the cleaned text.
    aioSections = (aioText ?? "").split("\n").map((line) => line.trim()).filter((line) => /^\d+\.\s+.+/.test(line));
    aioCitations = extractCitations(aioRoot);
  }

  const aimRoot = document.querySelector(selectors.aim.root);
  const aimDetected = surface === "aim" && !!aimRoot;
  const aimContainer = aimRoot?.closest(selectors.aim.wrapper) ?? aimRoot;
  const aimText = cleanText(aimContainer);
  const aimCitations = aimDetected ? extractCitations(aimContainer) : [];
  return {
    surface,
    aiOverview: {
      detected: !!aioRoot && aioText !== null,
      text: aioText,
      citations: aioCitations,
      expanded: aioExpanded,
      fullyExpanded: aioFullyExpanded,
      sections: aioSections
    },
    aiMode: {
      detected: aimDetected && aimText !== null,
      text: aimText,
      citations: aimCitations
    }
  };
}
1261
+
530
1262
  // src/extractor/PAAExtractor.ts
1263
+ var DESKTOP_USER_AGENT2 = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36";
1264
+ var MOBILE_USER_AGENT2 = "Mozilla/5.0 (iPhone; CPU iPhone OS 17_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Mobile/15E148 Safari/604.1";
531
1265
  var PAAExtractor = class {
532
1266
  constructor(driver, reporter) {
533
1267
  this.driver = driver;
@@ -538,6 +1272,17 @@ var PAAExtractor = class {
538
1272
  normalizeQuestion(q) {
539
1273
  return q.toLowerCase().replace(/[^\w\s]/g, "").replace(/\s+/g, " ").trim();
540
1274
  }
1275
+ throwIfAborted(signal) {
1276
+ if (!signal?.aborted) return;
1277
+ if (signal.reason instanceof DOMException && signal.reason.name === "TimeoutError") throw signal.reason;
1278
+ throw new RequestAbortedError();
1279
+ }
1280
+ async throwIfCaptcha(page, context) {
1281
+ const captchaCount = await page.locator(PAASelectors.captchaMarker).count().catch(() => 0);
1282
+ if (captchaCount > 0) {
1283
+ throw new CaptchaError(`${context} returned a CAPTCHA \u2014 retrying with a fresh session.`);
1284
+ }
1285
+ }
541
1286
  async extractVisibleItems(page) {
542
1287
  const sels = PAASelectors;
543
1288
  const raw = await page.evaluate((selectors) => {
@@ -600,10 +1345,10 @@ var PAAExtractor = class {
600
1345
  extracted_at: (/* @__PURE__ */ new Date()).toISOString()
601
1346
  };
602
1347
  }
603
- async runBFS(page, options) {
1348
+ async runBFS(page, options, signal) {
604
1349
  const seenKeys = /* @__PURE__ */ new Set();
605
1350
  const seenQs = /* @__PURE__ */ new Set();
606
- const depthMap = /* @__PURE__ */ new Map();
1351
+ const orderedQs = [];
607
1352
  const results = [];
608
1353
  const readAllQs = () => page.evaluate(
609
1354
  ({ sel, dataQ, dataInitQ, questionEl }) => Array.from(document.querySelectorAll(sel)).map(
@@ -611,42 +1356,43 @@ var PAAExtractor = class {
611
1356
  ).filter(Boolean),
612
1357
  { sel: PAASelectors.item, dataQ: PAASelectors.itemDataQ, dataInitQ: PAASelectors.itemDataInitQ, questionEl: PAASelectors.itemQuestionEl }
613
1358
  );
614
- const dupRates = [];
615
- const orderedQs = [];
616
- for (let round = 0; round < options.depth; round++) {
617
- this.reporter.onDepth(round + 1);
618
- if (seenQs.size >= options.maxQuestions) break;
1359
+ let round = 0;
1360
+ while (seenQs.size < options.maxQuestions) {
1361
+ this.throwIfAborted(signal);
1362
+ await this.throwIfCaptcha(page, "Google PAA expansion");
619
1363
  const beforeQs = await readAllQs();
620
1364
  if (beforeQs.length >= options.maxQuestions) break;
621
- const unexpandedItems = await page.$$(
622
- `${PAASelectors.item}:not(.${PAASelectors.expandedClass})`
623
- );
624
- if (unexpandedItems.length === 0) break;
625
- for (const item of unexpandedItems) {
1365
+ const unexpandedSel = `${PAASelectors.item}:not(.${PAASelectors.expandedClass}) ${PAASelectors.clickTarget}`;
1366
+ const unexpandedCount = await page.locator(unexpandedSel).count();
1367
+ if (unexpandedCount === 0) break;
1368
+ this.reporter.onDepth(++round);
1369
+ for (let ci = 0; ci < unexpandedCount; ci++) {
1370
+ this.throwIfAborted(signal);
626
1371
  try {
627
- await item.scrollIntoViewIfNeeded();
628
- await item.click({ force: true });
1372
+ const btn = page.locator(unexpandedSel).first();
1373
+ await btn.scrollIntoViewIfNeeded();
1374
+ await btn.hover({ force: true });
1375
+ await page.waitForTimeout(100);
1376
+ await btn.click({ force: true });
629
1377
  await page.waitForTimeout(500);
630
1378
  } catch {
631
1379
  }
632
1380
  }
633
- await page.waitForTimeout(1500);
1381
+ await page.waitForFunction(
1382
+ ({ sel, min }) => document.querySelectorAll(sel).length > min,
1383
+ { sel: PAASelectors.item, min: beforeQs.length },
1384
+ { timeout: 5e3 }
1385
+ ).catch(() => {
1386
+ });
1387
+ await this.throwIfCaptcha(page, "Google PAA expansion");
634
1388
  const afterQs = await readAllQs();
635
- const newQs = afterQs.slice(beforeQs.length);
636
- const newDups = newQs.filter((q) => seenQs.has(q)).length;
637
- const dupRate = newQs.length > 0 ? newDups / newQs.length : 0;
638
- dupRates.push(dupRate);
639
- if (dupRates.length > 2) dupRates.shift();
640
- const rollingDupRate = dupRates.reduce((a, b) => a + b, 0) / dupRates.length;
1389
+ if (afterQs.length === beforeQs.length) break;
641
1390
  for (const q of afterQs) {
642
1391
  if (!seenQs.has(q)) {
643
1392
  seenQs.add(q);
644
1393
  orderedQs.push(q);
645
1394
  }
646
- if (!depthMap.has(q)) depthMap.set(q, round + 1);
647
1395
  }
648
- if (afterQs.length === beforeQs.length) break;
649
- if (rollingDupRate >= 0.6) break;
650
1396
  }
651
1397
  const itemMap = new Map((await this.extractVisibleItems(page)).map((i) => [i.question, i]));
652
1398
  for (const q of orderedQs) {
@@ -654,13 +1400,12 @@ var PAAExtractor = class {
654
1400
  const key = this.normalizeQuestion(q);
655
1401
  if (seenKeys.has(key)) continue;
656
1402
  seenKeys.add(key);
657
- const d = depthMap.get(q) ?? 1;
658
1403
  const item = itemMap.get(q);
659
1404
  if (item) {
660
- results.push(this.toFlatRow(item, d, null, options.query));
661
- this.reporter.onQuestion({ question: item.question, answer: item.answer ?? null, sourceTitle: item.sourceTitle ?? null, sourceSite: item.sourceSite ?? null, sourceCite: item.sourceCite ?? null, depth: d, parentQuestion: null, children: [] });
1405
+ results.push(this.toFlatRow(item, 1, null, options.query));
1406
+ this.reporter.onQuestion({ question: item.question, answer: item.answer ?? null, sourceTitle: item.sourceTitle ?? null, sourceSite: item.sourceSite ?? null, sourceCite: item.sourceCite ?? null, depth: 1, parentQuestion: null, children: [] });
662
1407
  } else {
663
- results.push(this.toFlatRow({ question: q, answer: void 0, sourceTitle: void 0, sourceSite: void 0, sourceCite: void 0 }, d, null, options.query));
1408
+ results.push(this.toFlatRow({ question: q, answer: void 0, sourceTitle: void 0, sourceSite: void 0, sourceCite: void 0 }, 1, null, options.query));
664
1409
  }
665
1410
  }
666
1411
  return results;
@@ -718,6 +1463,7 @@ var PAAExtractor = class {
718
1463
  } catch {
719
1464
  return [];
720
1465
  }
1466
+ await this.throwIfCaptcha(page, "Google short video search");
721
1467
  const svSels = {
722
1468
  item: ShortVideoSelectors.item,
723
1469
  platforms: [...ShortVideoSelectors.platforms]
@@ -999,69 +1745,11 @@ var PAAExtractor = class {
999
1745
  return { ...entityIds, entities: records, cids: [...cidSet] };
1000
1746
  }
1001
1747
  async extractAISurfaces(page) {
1002
- const aioSels = AIOverviewSelectors;
1003
- const aimSels = AIModeSelectors;
1004
- return page.evaluate(({ aio, aim }) => {
1005
- const sn = window.google?.sn ?? "unknown";
1006
- const surface = sn === "aim" ? "aim" : sn === "web" ? "web" : "unknown";
1007
- function findAIORoot() {
1008
- const primary = document.querySelector(aio.root);
1009
- if (primary) return primary;
1010
- const headings = document.querySelectorAll('h1, h2, h3, [role="heading"]');
1011
- for (const h of headings) {
1012
- if (h.textContent?.trim() === "AI Overview") {
1013
- let el = h.parentElement;
1014
- for (let i = 0; i < 6 && el; i++) {
1015
- if (el.querySelectorAll("a").length > 1) return el;
1016
- el = el.parentElement;
1017
- }
1018
- return h.parentElement;
1019
- }
1020
- }
1021
- return null;
1022
- }
1023
- const aioRoot = findAIORoot();
1024
- const aioContainer = aioRoot ? aioRoot.closest(aio.wrapper) ?? aioRoot : null;
1025
- let aioText = null;
1026
- if (aioContainer) {
1027
- const clone = aioContainer.cloneNode(true);
1028
- clone.querySelectorAll("script,style,noscript").forEach((el) => el.remove());
1029
- clone.querySelectorAll('h1,h2,h3,h4,[role="heading"]').forEach((el) => el.remove());
1030
- clone.querySelectorAll('button,[role="button"]').forEach((el) => el.remove());
1031
- clone.querySelectorAll("a").forEach((el) => el.remove());
1032
- const candidate = clone.textContent?.replace(/\s+/g, " ").trim() || null;
1033
- const isErrorState = !candidate || /not available|try again|can't generate/i.test(candidate);
1034
- aioText = isErrorState ? null : candidate;
1035
- }
1036
- const aioDetected = !!aioRoot && aioText !== null;
1037
- const aioCitations = Array.from(aioContainer?.querySelectorAll("a[href]") ?? []).filter((a) => a.href && !a.href.startsWith("javascript")).map((a) => ({
1038
- text: a.textContent?.trim() ?? "",
1039
- href: a.href
1040
- })).filter((c) => c.text && c.href);
1041
- const aimRoot = document.querySelector(aim.root);
1042
- const aimDetected = surface === "aim" && !!aimRoot;
1043
- const aimContainer = aimRoot?.closest(aim.wrapper) ?? null;
1044
- let aimText = null;
1045
- if (aimContainer) {
1046
- const clone = aimContainer.cloneNode(true);
1047
- clone.querySelectorAll("script,style,noscript").forEach((el) => el.remove());
1048
- clone.querySelectorAll('h1,h2,h3,h4,[role="heading"]').forEach((el) => el.remove());
1049
- clone.querySelectorAll('button,[role="button"]').forEach((el) => el.remove());
1050
- clone.querySelectorAll("a").forEach((el) => el.remove());
1051
- const candidate = clone.textContent?.replace(/\s+/g, " ").trim() || null;
1052
- const isErrorState = !candidate || /not available|try again|can't generate/i.test(candidate);
1053
- aimText = isErrorState ? null : candidate;
1054
- }
1055
- const aimCitations = aimDetected ? Array.from(aimContainer?.querySelectorAll("a[href]") ?? []).filter((a) => a.href && !a.href.startsWith("javascript")).map((a) => ({
1056
- text: a.textContent?.trim() ?? "",
1057
- href: a.href
1058
- })).filter((c) => c.text && c.href) : [];
1059
- return {
1060
- surface,
1061
- aiOverview: { detected: aioDetected, text: aioText, citations: aioCitations },
1062
- aiMode: { detected: aimDetected, text: aimText, citations: aimCitations }
1063
- };
1064
- }, { aio: aioSels, aim: aimSels });
1748
+ return page.evaluate(extractAISurfacesFromDocument, {
1749
+ aio: AIOverviewSelectors,
1750
+ aim: AIModeSelectors,
1751
+ expandWaitMs: 1500
1752
+ });
1065
1753
  }
1066
1754
  buildTree(flat, _seed) {
1067
1755
  const roots = [];
@@ -1088,23 +1776,70 @@ var PAAExtractor = class {
1088
1776
  }
1089
1777
  return roots;
1090
1778
  }
1091
- async extract(options) {
1779
+ getBrowserDebugSnapshot() {
1780
+ return this.driver.getDebugSnapshot();
1781
+ }
1782
+ buildHarvestDebugSnapshot(options, canonicalLocation, uule, locationEvidence) {
1783
+ if (!options.debug) return void 0;
1784
+ return {
1785
+ enabled: true,
1786
+ request: {
1787
+ query: options.query,
1788
+ locationInput: options.location ?? null,
1789
+ canonicalLocation,
1790
+ uule,
1791
+ gl: options.gl,
1792
+ hl: options.hl,
1793
+ device: options.device,
1794
+ proxyMode: options.proxyMode,
1795
+ proxyZip: options.proxyZip ?? null,
1796
+ serpOnly: options.serpOnly,
1797
+ pages: options.pages ?? 1
1798
+ },
1799
+ browser: this.getBrowserDebugSnapshot(),
1800
+ ...locationEvidence ? { locationEvidence } : {}
1801
+ };
1802
+ }
1803
+ async extract(options, signal) {
1092
1804
  const startMs = Date.now();
1805
+ const isMobile = options.device === "mobile";
1093
1806
  const config = {
1094
1807
  headless: options.headless,
1095
1808
  profileDir: options.profileDir,
1096
1809
  proxy: options.proxy,
1097
1810
  kernelApiKey: options.kernelApiKey,
1098
1811
  kernelProxyId: options.kernelProxyId,
1099
- viewport: { width: 1280, height: 800 },
1100
- locale: `${options.hl}-${options.gl.toUpperCase()}`
1812
+ kernelProxyResolution: options.kernelProxyResolution,
1813
+ proxyMode: options.proxyMode,
1814
+ viewport: isMobile ? { width: 390, height: 844 } : { width: 1280, height: 800 },
1815
+ locale: `${options.hl}-${options.gl.toUpperCase()}`,
1816
+ userAgent: isMobile ? MOBILE_USER_AGENT2 : DESKTOP_USER_AGENT2,
1817
+ deviceScaleFactor: isMobile ? 3 : 1,
1818
+ isMobile,
1819
+ hasTouch: isMobile,
1820
+ debug: options.debug
1101
1821
  };
1102
1822
  let errorCount = 0;
1823
+ const diagnosticWarnings = [];
1103
1824
  try {
1825
+ this.throwIfAborted(signal);
1104
1826
  await this.driver.launch(config);
1105
- const uule = options.location ? encodeUule(normalizeLocation(options.location)) : null;
1106
- const { hasPaa } = await this.driver.navigateToSERP(options.query, uule, options.gl, options.hl);
1827
+ this.throwIfAborted(signal);
1828
+ const canonicalLocation = options.location ? normalizeLocation(options.location) : null;
1829
+ const uule = canonicalLocation ? encodeUule(canonicalLocation) : null;
1830
+ const { hasPaa } = await this.driver.navigateToSERP(
1831
+ options.query,
1832
+ uule,
1833
+ options.gl,
1834
+ options.hl,
1835
+ {
1836
+ ...options.serpOnly ? { num: 100 } : {},
1837
+ debug: options.debug
1838
+ }
1839
+ );
1840
+ this.throwIfAborted(signal);
1107
1841
  const page = this.driver.getPage();
1842
+ await this.throwIfCaptcha(page, "Google SERP");
1108
1843
  if (options.serpOnly) {
1109
1844
  const [organicResults2, localPack2, rawEntityIds2] = await Promise.all([
1110
1845
  this.extractOrganicResults(page),
@@ -1112,13 +1847,19 @@ var PAAExtractor = class {
1112
1847
  this.extractEntityIds(page)
1113
1848
  ]);
1114
1849
  const entityIds2 = this.mergeLocalPackIntoEntities(rawEntityIds2, localPack2);
1850
+ const aiSurfaces2 = await this.extractAISurfaces(page);
1851
+ let locationEvidence2 = options.debug ? inferSerpLocationEvidence(canonicalLocation, organicResults2, localPack2) : void 0;
1115
1852
  let allOrganic2 = organicResults2;
1116
1853
  if ((options.pages ?? 1) >= 2) {
1117
- const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, start: "10" });
1854
+ const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, pws: "0", start: "10" });
1118
1855
  if (uule) p2params.set("uule", uule);
1119
1856
  await this.driver.navigateTo("https://www.google.com/search?" + p2params.toString());
1857
+ await this.throwIfCaptcha(page, "Google SERP page 2");
1120
1858
  const p2organic = await this.extractOrganicResults(page);
1121
1859
  allOrganic2 = [...organicResults2, ...p2organic.map((r) => ({ ...r, position: r.position + 10 }))];
1860
+ if (options.debug) {
1861
+ locationEvidence2 = inferSerpLocationEvidence(canonicalLocation, allOrganic2, localPack2);
1862
+ }
1122
1863
  }
1123
1864
  const stats2 = {
1124
1865
  seed: options.query,
@@ -1132,10 +1873,15 @@ var PAAExtractor = class {
1132
1873
  seed: options.query,
1133
1874
  location: options.location ?? null,
1134
1875
  extractedAt: (/* @__PURE__ */ new Date()).toISOString(),
1876
+ diagnostics: {
1877
+ completionStatus: "serp_only",
1878
+ problem: null,
1879
+ ...options.debug ? { debug: this.buildHarvestDebugSnapshot(options, canonicalLocation, uule, locationEvidence2) } : {}
1880
+ },
1135
1881
  totalQuestions: 0,
1136
- surface: "web",
1137
- aiOverview: { detected: false, text: null, citations: [] },
1138
- aiMode: { detected: false, text: null, citations: [] },
1882
+ surface: aiSurfaces2.surface,
1883
+ aiOverview: aiSurfaces2.aiOverview,
1884
+ aiMode: aiSurfaces2.aiMode,
1139
1885
  whatPeopleSaying: [],
1140
1886
  tree: [],
1141
1887
  flat: [],
@@ -1156,16 +1902,22 @@ var PAAExtractor = class {
1156
1902
  this.extractLocalPack(page)
1157
1903
  ]);
1158
1904
  const entityIds = this.mergeLocalPackIntoEntities(rawEntityIds, localPack);
1905
+ const initialLocationEvidence = options.debug ? inferSerpLocationEvidence(canonicalLocation, organicResults, localPack) : void 0;
1159
1906
  this.reporter.onVideos(videos);
1160
1907
  this.reporter.onForums(forums);
1161
1908
  if (!hasPaa) {
1162
1909
  let noPaaOrganic = organicResults;
1910
+ let locationEvidence2 = initialLocationEvidence;
1163
1911
  if ((options.pages ?? 1) >= 2) {
1164
- const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, start: "10" });
1912
+ const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, pws: "0", start: "10" });
1165
1913
  if (uule) p2params.set("uule", uule);
1166
1914
  await this.driver.navigateTo("https://www.google.com/search?" + p2params.toString());
1915
+ await this.throwIfCaptcha(page, "Google SERP page 2");
1167
1916
  const p2organic = await this.extractOrganicResults(page);
1168
1917
  noPaaOrganic = [...organicResults, ...p2organic.map((r) => ({ ...r, position: r.position + 10 }))];
1918
+ if (options.debug) {
1919
+ locationEvidence2 = inferSerpLocationEvidence(canonicalLocation, noPaaOrganic, localPack);
1920
+ }
1169
1921
  }
1170
1922
  const aiSurfaces2 = await this.extractAISurfaces(page);
1171
1923
  const stats2 = {
@@ -1180,6 +1932,11 @@ var PAAExtractor = class {
1180
1932
  seed: options.query,
1181
1933
  location: options.location ?? null,
1182
1934
  extractedAt: (/* @__PURE__ */ new Date()).toISOString(),
1935
+ diagnostics: {
1936
+ completionStatus: "no_paa",
1937
+ problem: null,
1938
+ ...options.debug ? { debug: this.buildHarvestDebugSnapshot(options, canonicalLocation, uule, locationEvidence2) } : {}
1939
+ },
1183
1940
  totalQuestions: 0,
1184
1941
  surface: aiSurfaces2.surface,
1185
1942
  aiOverview: aiSurfaces2.aiOverview,
@@ -1195,19 +1952,37 @@ var PAAExtractor = class {
1195
1952
  stats: stats2
1196
1953
  };
1197
1954
  }
1198
- const flat = await this.runBFS(page, options);
1955
+ const flat = await this.runBFS(page, options, signal);
1956
+ this.throwIfAborted(signal);
1199
1957
  const aiSurfaces = await this.extractAISurfaces(page);
1200
- const shortVidsParams = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, udm: ShortVideoSelectors.udm });
1958
+ const shortVidsParams = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, pws: "0", udm: ShortVideoSelectors.udm });
1201
1959
  if (uule) shortVidsParams.set("uule", uule);
1202
- const shortVideos = await this.extractShortVideos(page, "https://www.google.com/search?" + shortVidsParams.toString());
1960
+ let shortVideos = [];
1961
+ try {
1962
+ shortVideos = await this.extractShortVideos(page, "https://www.google.com/search?" + shortVidsParams.toString());
1963
+ } catch (err) {
1964
+ if (!(err instanceof CaptchaError)) throw err;
1965
+ errorCount++;
1966
+ diagnosticWarnings.push({
1967
+ code: "short_videos_captcha_skipped",
1968
+ surface: "short_videos",
1969
+ message: err.message,
1970
+ retryable: true
1971
+ });
1972
+ }
1203
1973
  this.reporter.onVideos(shortVideos);
1204
1974
  let allOrganic = organicResults;
1975
+ let locationEvidence = initialLocationEvidence;
1205
1976
  if ((options.pages ?? 1) >= 2) {
1206
- const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, start: "10" });
1977
+ const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, pws: "0", start: "10" });
1207
1978
  if (uule) p2params.set("uule", uule);
1208
1979
  await this.driver.navigateTo("https://www.google.com/search?" + p2params.toString());
1980
+ await this.throwIfCaptcha(page, "Google SERP page 2");
1209
1981
  const p2organic = await this.extractOrganicResults(page);
1210
1982
  allOrganic = [...organicResults, ...p2organic.map((r) => ({ ...r, position: r.position + 10 }))];
1983
+ if (options.debug) {
1984
+ locationEvidence = inferSerpLocationEvidence(canonicalLocation, allOrganic, localPack);
1985
+ }
1211
1986
  }
1212
1987
  const allVideos = [...videos, ...shortVideos];
1213
1988
  const tree = this.buildTree(flat, options.query);
@@ -1223,6 +1998,12 @@ var PAAExtractor = class {
1223
1998
  seed: options.query,
1224
1999
  location: options.location ?? null,
1225
2000
  extractedAt: (/* @__PURE__ */ new Date()).toISOString(),
2001
+ diagnostics: {
2002
+ completionStatus: "paa_found",
2003
+ problem: null,
2004
+ ...diagnosticWarnings.length > 0 ? { warnings: diagnosticWarnings } : {},
2005
+ ...options.debug ? { debug: this.buildHarvestDebugSnapshot(options, canonicalLocation, uule, locationEvidence) } : {}
2006
+ },
1226
2007
  totalQuestions: flat.length,
1227
2008
  surface: aiSurfaces.surface,
1228
2009
  aiOverview: aiSurfaces.aiOverview,
@@ -1241,8 +2022,6 @@ var PAAExtractor = class {
1241
2022
  errorCount++;
1242
2023
  this.reporter.onError(err instanceof Error ? err : new Error(String(err)));
1243
2024
  throw err;
1244
- } finally {
1245
- await this.driver.close();
1246
2025
  }
1247
2026
  }
1248
2027
  };
@@ -1356,53 +2135,646 @@ var ProgressReporter = class {
1356
2135
  }
1357
2136
  };
1358
2137
 
2138
+ // src/kernel-proxy-resolver.ts
2139
+ var import_sdk2 = __toESM(require("@onkernel/sdk"), 1);
// Lowercase full US state name -> two-letter postal code (the 50 states).
// Keys are expected in the form produced by normalizeStateName().
var US_STATE_CODES = {
  alabama: "AL", alaska: "AK", arizona: "AZ", arkansas: "AR", california: "CA",
  colorado: "CO", connecticut: "CT", delaware: "DE", florida: "FL", georgia: "GA",
  hawaii: "HI", idaho: "ID", illinois: "IL", indiana: "IN", iowa: "IA",
  kansas: "KS", kentucky: "KY", louisiana: "LA", maine: "ME", maryland: "MD",
  massachusetts: "MA", michigan: "MI", minnesota: "MN", mississippi: "MS", missouri: "MO",
  montana: "MT", nebraska: "NE", nevada: "NV",
  "new hampshire": "NH", "new jersey": "NJ", "new mexico": "NM", "new york": "NY",
  "north carolina": "NC", "north dakota": "ND",
  ohio: "OH", oklahoma: "OK", oregon: "OR", pennsylvania: "PA",
  "rhode island": "RI", "south carolina": "SC", "south dakota": "SD",
  tennessee: "TN", texas: "TX", utah: "UT", vermont: "VT", virginia: "VA",
  washington: "WA", "west virginia": "WV", wisconsin: "WI", wyoming: "WY"
};
// ZIP code lookup keyed by "<city identifier>|<state code>", the key format
// built by cityZipKey(). City identifiers are lowercase with underscores.
// NOTE(review): presumably one central/downtown ZIP per city — confirm.
var US_CITY_CENTER_ZIPS = {
  "atlanta|GA": "30303", "austin|TX": "78701", "baltimore|MD": "21201",
  "boston|MA": "02108", "boulder|CO": "80302", "charlotte|NC": "28202",
  "chicago|IL": "60601", "colorado_springs|CO": "80903", "columbus|OH": "43215",
  "dallas|TX": "75201", "denver|CO": "80202", "detroit|MI": "48226",
  "fort_collins|CO": "80524", "fort_worth|TX": "76102", "houston|TX": "77002",
  "indianapolis|IN": "46204", "jacksonville|FL": "32202", "las_vegas|NV": "89101",
  "los_angeles|CA": "90012", "louisville|KY": "40202", "loveland|CO": "80537",
  "memphis|TN": "38103", "miami|FL": "33131", "minneapolis|MN": "55401",
  "nashville|TN": "37203", "new_york|NY": "10001", "orlando|FL": "32801",
  "philadelphia|PA": "19103", "phoenix|AZ": "85004", "portland|OR": "97205",
  "raleigh|NC": "27601", "richmond|VA": "23219", "sacramento|CA": "95814",
  "salt_lake_city|UT": "84101", "san_antonio|TX": "78205", "san_diego|CA": "92101",
  "san_francisco|CA": "94103", "san_jose|CA": "95113", "seattle|WA": "98101"
};
/**
 * Returns the last six characters of a proxy id, or null when the id is
 * missing or empty.
 */
function proxyIdSuffix2(proxyId) {
  if (!proxyId) return null;
  return proxyId.slice(-6);
}
/**
 * Builds the value returned by resolveKernelProxyId: the proxy id to use plus
 * a metadata record describing how it was chosen.
 *
 * @param source    label for how the id was obtained (e.g. "disabled")
 * @param proxyMode proxy mode the caller requested
 * @param proxyId   chosen proxy id, or undefined when there is none
 * @param target    location target considered, or null
 * @param error     accumulated error text, or null
 */
function resolution(source, proxyMode, proxyId, target, error) {
  const hasProxyId = Boolean(proxyId);
  const details = {
    source,
    proxyMode,
    proxyIdPresent: hasProxyId,
    // Only the trailing six characters of the id are recorded.
    proxyIdSuffix: hasProxyId ? proxyId.slice(-6) : null,
    target,
    error
  };
  return { kernelProxyId: proxyId, resolution: details };
}
/** Trims, lowercases, and collapses each whitespace run to a single space. */
function normalizeStateName(value) {
  const lowered = value.trim().toLowerCase();
  return lowered.replace(/\s+/g, " ");
}
/**
 * Trims and lowercases a country name, removes every "." (so "U.S.A."
 * becomes "usa"), and collapses whitespace runs to a single space.
 */
function normalizeCountryName(value) {
  const lowered = value.trim().toLowerCase();
  const withoutDots = lowered.replace(/\./g, "");
  return withoutDots.replace(/\s+/g, " ");
}
/**
 * Reports whether a country string names the United States. A missing or
 * empty country is treated as the United States.
 */
function isUnitedStates(country) {
  if (!country) return true;
  switch (normalizeCountryName(country)) {
    case "united states":
    case "united states of america":
    case "usa":
    case "us":
      return true;
    default:
      return false;
  }
}
/**
 * Resolves a region string to a two-letter state code.
 *
 * A token of exactly two ASCII letters is returned uppercased as-is; anything
 * else is looked up by normalized full name in US_STATE_CODES. Returns null
 * when the name is unknown.
 *
 * NOTE(review): two-letter input is not checked against the known state
 * codes, so e.g. "zz" yields "ZZ".
 */
function stateCodeFor(region) {
  const candidate = region.trim();
  const looksLikeCode = /^[A-Za-z]{2}$/.test(candidate);
  if (looksLikeCode) {
    return candidate.toUpperCase();
  }
  const byName = US_STATE_CODES[normalizeStateName(candidate)];
  return byName ?? null;
}
/**
 * Derives candidate city identifiers from a display name.
 *
 * The name is NFKD-normalized, stripped of non-ASCII characters, lowercased,
 * and split on runs of non-alphanumerics. Two forms are produced from the
 * words: joined with "_" and joined with nothing. Empty and duplicate forms
 * are dropped, with the underscored form first.
 */
function kernelCityIdentifierCandidates(city) {
  const asciiLower = city.normalize("NFKD").replace(/[^\x00-\x7F]/g, "").toLowerCase();
  const tokens = asciiLower.split(/[^a-z0-9]+/).filter((token) => token.length > 0);
  const candidates = [];
  for (const form of [tokens.join("_"), tokens.join("")]) {
    if (form && !candidates.includes(form)) {
      candidates.push(form);
    }
  }
  return candidates;
}
/**
 * Builds the proxy name for a country/state, with the city identifier
 * appended when one is given. Country and state are lowercased; the city is
 * used as passed.
 */
function proxyName(country, state, city) {
  const segments = ["mcp-serp-residential", country.toLowerCase(), state.toLowerCase()];
  if (city) {
    segments.push(city);
  }
  return segments.join("-");
}
/** Builds the proxy name used for a ZIP-level proxy. */
function zipProxyName(zip) {
  const prefix = "mcp-serp-residential-us-zip-";
  return prefix.concat(zip);
}
/**
 * Turns a free-form location into a proxy target description.
 *
 * Returns null when no location is given, when `gl` is not "us"
 * (case-insensitive), or when the location cannot be reduced to either a
 * single state or a "city, state" pair. A trailing United States segment is
 * dropped before the remaining comma-separated parts are interpreted.
 *
 * @param location free-form location text, passed through normalizeLocation()
 * @param gl       country code of the search; only "us" is handled
 * @returns a target with level "state" or "city", or null
 */
function parseKernelLocationProxyTarget(location, gl) {
  if (!location) return null;
  if (gl.toLowerCase() !== "us") return null;

  const canonicalLocation = normalizeLocation(location);
  const segments = [];
  for (const piece of canonicalLocation.split(",")) {
    const cleaned = piece.trim();
    if (cleaned) segments.push(cleaned);
  }
  // Drop a trailing country segment, but never reduce the list to nothing.
  if (segments.length > 1 && isUnitedStates(segments[segments.length - 1])) {
    segments.pop();
  }

  const country = "US";

  if (segments.length === 1) {
    const stateOnly = stateCodeFor(segments[0]);
    if (!stateOnly) return null;
    return {
      canonicalLocation,
      level: "state",
      country,
      state: stateOnly,
      city: "",
      cityCandidates: [],
      proxyName: proxyName(country, stateOnly),
      config: { country, state: stateOnly }
    };
  }

  // Only the first two segments are interpreted: city, then region.
  const city = segments.length > 0 ? segments[0] : "";
  const region = segments.length > 1 ? segments[1] : "";
  if (!city || !region) return null;

  const state = stateCodeFor(region);
  if (!state) return null;

  const cityCandidates = kernelCityIdentifierCandidates(city);
  const primaryCity = cityCandidates[0];
  if (!primaryCity) return null;

  return {
    canonicalLocation,
    level: "city",
    country,
    state,
    city: primaryCity,
    cityCandidates,
    proxyName: proxyName(country, state, primaryCity),
    config: { country, state, city: primaryCity }
  };
}
/** Builds the "<city>|<state>" key used to index US_CITY_CENTER_ZIPS. */
function cityZipKey(target) {
  const { city, state } = target;
  return `${city}|${state}`;
}
/**
 * Picks the ZIP code to use for a target.
 *
 * An explicit ZIP wins when it is exactly five digits. Otherwise the target's
 * city/state is looked up in US_CITY_CENTER_ZIPS. Returns null when neither
 * yields a ZIP. An explicit value that is not five digits is ignored.
 */
function knownZipFor(target, explicitZip) {
  const explicitIsUsable = Boolean(explicitZip) && /^\d{5}$/.test(explicitZip);
  if (explicitIsUsable) {
    return explicitZip;
  }
  const mapped = US_CITY_CENTER_ZIPS[cityZipKey(target)];
  return mapped ?? null;
}
/**
 * Derives a ZIP-level target from an existing target. All fields are copied,
 * then level, zip, proxyName and config are replaced.
 */
function zipTarget(target, zip) {
  const config = {
    country: target.country,
    state: target.state,
    zip
  };
  return {
    ...target,
    level: "zip",
    zip,
    proxyName: zipProxyName(zip),
    config
  };
}
/**
 * Reports whether a proxy's config matches a target.
 *
 * Country is compared case-insensitively in every case. For a ZIP-level
 * target the ZIP must also be equal. Otherwise the state must match
 * (case-insensitively) and the city rule applies: when `city` is given the
 * config's city must equal it, and when `city` is omitted the config must
 * have no city.
 */
function configMatches(config, target, city) {
  const sameCountry = config?.country?.toUpperCase() === target.country;
  if (target.level === "zip") {
    return sameCountry && config?.zip === target.zip;
  }
  if (!sameCountry) return false;
  if (config?.state?.toUpperCase() !== target.state) return false;
  if (city) {
    return config?.city === city;
  }
  return !config?.city;
}
/**
 * Finds the first usable proxy for a target, or null.
 *
 * A proxy is usable when it is residential, not marked "unavailable", and has
 * an id. It matches when its name equals the target's proxyName or its config
 * matches the target; the city is only compared for city-level targets.
 */
function findExistingTargetProxy(proxies, target) {
  const cityFilter = target.level === "city" ? target.city : void 0;
  const match = proxies.find((proxy) => {
    if (proxy.type !== "residential") return false;
    if (proxy.status === "unavailable") return false;
    if (!proxy.id) return false;
    return proxy.name === target.proxyName || configMatches(proxy.config, target, cityFilter);
  });
  return match ?? null;
}
/**
 * Finds a usable city-level proxy for a target, trying each of the target's
 * city candidates in order. For each candidate a proxy matches by name or by
 * config. Returns the first match, or null when no candidate matches.
 */
function findExistingProxy(proxies, target) {
  const isUsable = (proxy) => proxy.type === "residential" && proxy.status !== "unavailable" && Boolean(proxy.id);
  for (const city of target.cityCandidates) {
    const expectedName = proxyName(target.country, target.state, city);
    const hit = proxies.find((proxy) => isUsable(proxy) && (proxy.name === expectedName || configMatches(proxy.config, target, city)));
    if (hit) {
      return hit;
    }
  }
  return null;
}
/**
 * Derives a state-level target from an existing target. All fields are
 * copied, then level, proxyName and config are replaced with the state-wide
 * equivalents.
 */
function stateTarget(target) {
  const { country, state } = target;
  return {
    ...target,
    level: "state",
    proxyName: proxyName(country, state),
    config: { country, state }
  };
}
/**
 * Finds the first usable state-level proxy for a target, or null. A proxy
 * matches by its state-level name or by a config that matches the target
 * with no city filter.
 */
function findExistingStateProxy(proxies, target) {
  const expectedName = proxyName(target.country, target.state);
  const match = proxies.find((proxy) => {
    const usable = proxy.type === "residential" && proxy.status !== "unavailable" && Boolean(proxy.id);
    if (!usable) return false;
    return proxy.name === expectedName || configMatches(proxy.config, target);
  });
  return match ?? null;
}
/**
 * Returns the target to use on a retry attempt. The target is always widened
 * to state level; `attemptIndex` is accepted but not consulted.
 */
function escalatedTargetLevel(target, attemptIndex) {
  const widened = stateTarget(target);
  return widened;
}
/** Returns an Error's message, or the string form of any other thrown value. */
function errorText2(err) {
  if (err instanceof Error) {
    return err.message;
  }
  return String(err);
}
/**
 * Tries to create one residential proxy. Returns the new proxy id, or null
 * after appending a "<label>: <reason>" entry to `errors` when the call
 * throws or returns no id.
 */
async function createKernelProxyOrRecord(kernel, name, config, label, errors) {
  try {
    const created = await kernel.proxies.create({ type: "residential", name, config });
    if (created.id) return created.id;
    errors.push(`${label}: Kernel did not return a proxy id`);
  } catch (err) {
    errors.push(`${label}: ${errorText2(err)}`);
  }
  return null;
}

/**
 * Chooses the proxy id to use for one harvest attempt.
 *
 * - proxyMode "none": no proxy (source "disabled").
 * - proxyMode "configured": the configured proxy id is returned unchanged.
 * - otherwise: a location-specific proxy is looked up or created. On the
 *   first attempt the order is ZIP, then city, then state, reusing an
 *   existing proxy before creating one at the city and ZIP levels. On later
 *   attempts (attemptIndex >= 1) a state-level proxy is created directly.
 *
 * Every failure falls back to the configured proxy id (source
 * "configured_fallback"); this function does not reject for API errors.
 * The `error` field of the result is either a non-empty string of collected
 * errors joined by " | ", or null — never an empty string.
 *
 * @param options.proxyMode               "none" | "configured" | anything else for location mode
 * @param options.configuredKernelProxyId fallback proxy id, may be undefined
 * @param options.kernelApiKey            API key; without it the fallback is used
 * @param options.location                free-form location text
 * @param options.gl                      search country code
 * @param options.proxyZip                optional explicit five-digit ZIP
 * @param options.attemptIndex            zero-based attempt index (default 0)
 * @returns the object built by resolution()
 */
async function resolveKernelProxyId(options) {
  const { proxyMode, configuredKernelProxyId } = options;
  if (proxyMode === "none") {
    return resolution("disabled", proxyMode, void 0, null, null);
  }
  const fallback = (fallbackTarget, error) => resolution("configured_fallback", proxyMode, configuredKernelProxyId, fallbackTarget, error);
  if (proxyMode === "configured") {
    return fallback(null, null);
  }
  const target = parseKernelLocationProxyTarget(options.location, options.gl);
  if (!target || !options.kernelApiKey) {
    return fallback(target, target ? null : "location could not be normalized to a US city/state proxy target");
  }
  const kernel = new import_sdk2.default({ apiKey: options.kernelApiKey });
  // Collapses "no errors collected" to null so every path reports it the same way.
  const errorsOrNull = (errors) => errors.join(" | ") || null;
  try {
    const attemptIndex = options.attemptIndex ?? 0;
    if (attemptIndex >= 1) {
      // Retry path: skip lookup and go straight to a state-level proxy.
      const escalatedTarget = escalatedTargetLevel(target, attemptIndex);
      const escalationErrors = [];
      const escalatedId = await createKernelProxyOrRecord(kernel, escalatedTarget.proxyName, escalatedTarget.config, escalatedTarget.state, escalationErrors);
      if (escalatedId) {
        return resolution("location_created", proxyMode, escalatedId, escalatedTarget, null);
      }
      return fallback(escalatedTarget, errorsOrNull(escalationErrors));
    }

    const proxies = await kernel.proxies.list();
    const zip = knownZipFor(target, options.proxyZip);
    const createErrors = [];

    if (zip) {
      const targetZip = zipTarget(target, zip);
      const existingZip = findExistingTargetProxy(proxies, targetZip);
      if (existingZip?.id) {
        return resolution("location_reused", proxyMode, existingZip.id, targetZip, null);
      }
      // The create request carries country + zip only (no state), as before.
      const zipId = await createKernelProxyOrRecord(kernel, targetZip.proxyName, { country: targetZip.country, zip }, zip, createErrors);
      if (zipId) {
        return resolution("location_created", proxyMode, zipId, targetZip, null);
      }
    }

    const existing = findExistingProxy(proxies, target);
    if (existing?.id) {
      return resolution("location_reused", proxyMode, existing.id, target, errorsOrNull(createErrors));
    }

    for (const city of target.cityCandidates) {
      const cityName = proxyName(target.country, target.state, city);
      const cityConfig = { country: target.country, state: target.state, city };
      const cityId = await createKernelProxyOrRecord(kernel, cityName, cityConfig, city, createErrors);
      if (cityId) {
        // Report the candidate that actually worked, not just the primary one.
        const createdTarget = { ...target, level: "city", city, proxyName: cityName, config: cityConfig };
        return resolution("location_created", proxyMode, cityId, createdTarget, null);
      }
    }

    const fallbackTarget = stateTarget(target);
    const existingState = findExistingStateProxy(proxies, fallbackTarget);
    if (existingState?.id) {
      return resolution("location_reused", proxyMode, existingState.id, fallbackTarget, errorsOrNull(createErrors));
    }
    const stateId = await createKernelProxyOrRecord(kernel, fallbackTarget.proxyName, fallbackTarget.config, fallbackTarget.state, createErrors);
    if (stateId) {
      return resolution("location_created", proxyMode, stateId, fallbackTarget, errorsOrNull(createErrors));
    }
    return fallback(target, errorsOrNull(createErrors));
  } catch (err) {
    // e.g. proxies.list() failed; degrade to the configured proxy.
    return fallback(target, errorText2(err));
  }
}
2496
+
1359
2497
  // src/harvest.ts
// Upper bound on the number of extraction attempts harvest() makes.
var MAX_ATTEMPTS = 3;
1361
- async function extractOnce(options) {
/**
 * Picks the error to surface for an aborted signal: the signal's own reason
 * when it is a DOMException named "TimeoutError", otherwise a new
 * RequestAbortedError.
 */
function abortReason(signal) {
  const { reason } = signal;
  const isTimeout = reason instanceof DOMException && reason.name === "TimeoutError";
  return isTimeout ? reason : new RequestAbortedError();
}
/**
 * Reads `signal` from the raw options and returns it only when it is a real
 * AbortSignal; otherwise returns undefined.
 */
function getAbortSignal(rawOptions) {
  const isObject = Boolean(rawOptions) && typeof rawOptions === "object";
  if (!isObject) return void 0;
  const candidate = rawOptions.signal;
  return candidate instanceof AbortSignal ? candidate : void 0;
}
/**
 * Reads `onAttemptEvent` from the raw options and returns it only when it is
 * a function; otherwise returns undefined.
 */
function getAttemptLogSink(rawOptions) {
  const isObject = Boolean(rawOptions) && typeof rawOptions === "object";
  if (!isObject) return void 0;
  const candidate = rawOptions.onAttemptEvent;
  if (typeof candidate !== "function") return void 0;
  return candidate;
}
/**
 * Delivers an attempt event to the optional sink. Does nothing when there is
 * no sink. A sink that throws or rejects never propagates: the failure is
 * written to console.warn as a JSON line.
 */
async function emitAttemptEvent(sink, event) {
  if (!sink) {
    return;
  }
  try {
    await sink(event);
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    const logLine = {
      event: "harvest_attempt_log_failed",
      attempt_number: event.attemptNumber,
      message
    };
    console.warn(JSON.stringify(logLine));
  }
}
/**
 * Maps a failed attempt's error to an outcome label: "captcha",
 * "request_aborted", "timeout", or "error".
 *
 * Typed errors are recognized first. Anything else is classified as a timeout
 * when its message matches a timeout-like phrase, case-insensitively.
 */
function classifyAttemptError(err) {
  if (err instanceof CaptchaError) {
    return "captcha";
  }
  if (err instanceof RequestAbortedError) {
    return "request_aborted";
  }
  if (err instanceof DOMException) {
    if (err.name === "TimeoutError" || err.name === "AbortError") {
      return "timeout";
    }
  }
  const text = err instanceof Error ? err.message : String(err);
  const looksLikeTimeout = /timeout|timed out|Timeout \d+ms exceeded|deadline/i.test(text);
  return looksLikeTimeout ? "timeout" : "error";
}
/**
 * Returns the outcome label for a successful attempt: the result's own
 * diagnostics.completionStatus when present, otherwise "paa_found" or
 * "no_paa" depending on whether any questions were collected.
 */
function classifyAttemptResult(result) {
  const reported = result.diagnostics?.completionStatus;
  if (reported !== null && reported !== undefined) {
    return reported;
  }
  return result.totalQuestions > 0 ? "paa_found" : "no_paa";
}
/** Returns an Error's message, or the string form of any other thrown value. */
function errorMessage(err) {
  if (err instanceof Error) {
    return err.message;
  }
  return String(err);
}
/**
 * Runs a single extraction attempt with its own browser driver and always
 * closes that driver before returning.
 *
 * Failures of the extraction itself are reported in the returned `error`
 * field, not thrown. When a signal is supplied, an abort wins the race
 * against the extraction and the abort reason becomes the error.
 *
 * @param options parsed harvest options for this attempt
 * @param signal  optional AbortSignal
 * @returns { result, error, cleanup, debug } where exactly one of
 *          result/error is non-null, `cleanup` is whatever driver.close()
 *          resolved to, and `debug` is the result's debug snapshot, a
 *          fallback snapshot when options.debug is set, or null
 */
async function extractOnce(options, signal) {
  const driver = new BrowserDriver();
  const reporter = new ProgressReporter();
  const extractor = new PAAExtractor(driver, reporter);
  if (signal?.aborted) {
    // Already cancelled: skip extraction but still close the driver so the
    // caller gets a cleanup record.
    return {
      result: null,
      error: abortReason(signal),
      cleanup: await driver.close(),
      debug: null
    };
  }
  let onAbort;
  const abortPromise = signal ? new Promise((_, reject) => {
    onAbort = () => reject(abortReason(signal));
    signal.addEventListener("abort", onAbort, { once: true });
  }) : null;
  let result = null;
  let error = null;
  let cleanup;
  let debug = null;
  try {
    const extraction = extractor.extract(options, signal);
    // If the abort wins the race, the extraction may still reject later;
    // attach a no-op handler so that rejection is not reported as unhandled.
    if (abortPromise) extraction.catch(() => {
    });
    result = await (abortPromise ? Promise.race([extraction, abortPromise]) : extraction);
  } catch (err) {
    error = err;
  } finally {
    if (signal && onAbort) signal.removeEventListener("abort", onAbort);
    try {
      // `diagnostics` may be absent on a result, so guard both hops. The
      // snapshot must be taken before close() while the driver is still live.
      debug = result?.diagnostics?.debug ?? (options.debug ? {
        enabled: true,
        request: {
          query: options.query,
          locationInput: options.location ?? null,
          canonicalLocation: null,
          uule: null,
          gl: options.gl,
          hl: options.hl,
          device: options.device,
          proxyMode: options.proxyMode,
          proxyZip: options.proxyZip ?? null,
          serpOnly: options.serpOnly,
          pages: options.pages ?? 1
        },
        browser: driver.getDebugSnapshot()
      } : null);
    } finally {
      // Close even if building the debug snapshot threw, so the browser
      // session is never leaked.
      cleanup = await driver.close();
    }
  }
  return error ? { result: null, error, cleanup, debug } : { result, error: null, cleanup, debug };
}
/**
 * Cleanup record reported for an attempt that failed without one from
 * extractOnce (for example during option parsing or proxy resolution).
 */
function emptyAttemptCleanup() {
  return {
    kernelSessionId: null,
    kernelDeleteStarted: false,
    kernelDeleteSucceeded: null,
    kernelDeleteError: null,
    browserCloseSucceeded: null,
    browserCloseError: null
  };
}

/**
 * Runs a harvest, retrying on CAPTCHA up to MAX_ATTEMPTS times.
 *
 * Each attempt resolves a proxy, parses the options, runs extractOnce, and on
 * success writes the requested output files and returns the result. Only
 * CAPTCHA failures are retried; any other failure is rethrown immediately.
 *
 * When `rawOptions.onAttemptEvent` is a function it receives a "started"
 * event once options are parsed and at most ONE "finished" event per attempt.
 * If an error is thrown after the "finished" event was already delivered
 * (e.g. an output write fails), the error propagates without a second event.
 *
 * @param rawOptions harvest options; may also carry `signal` (AbortSignal)
 *                   and `onAttemptEvent` (function)
 * @returns the extraction result of the first successful attempt
 * @throws CaptchaError when every attempt hit a CAPTCHA
 * @throws the abort reason, a schema validation error, or the extraction /
 *         serialization error for any non-CAPTCHA failure
 */
async function harvest(rawOptions) {
  const raw = typeof rawOptions === "object" && rawOptions !== null ? rawOptions : {};
  const signal = getAbortSignal(rawOptions);
  const onAttemptEvent = getAttemptLogSink(rawOptions);
  const requestedProxyMode = raw.proxyMode;
  // Anything other than the two explicit modes means location-based proxies.
  const proxyMode = requestedProxyMode === "none" ? "none" : requestedProxyMode === "configured" ? "configured" : "location";
  const kernelApiKey = typeof raw.kernelApiKey === "string" ? raw.kernelApiKey.trim() : process.env.KERNEL_API_KEY?.trim();
  const configuredKernelProxyId = typeof raw.kernelProxyId === "string" ? raw.kernelProxyId.trim() : process.env.KERNEL_PROXY_ID?.trim();
  const proxyOpts = {
    kernelApiKey,
    proxyMode,
    configuredKernelProxyId,
    location: typeof raw.location === "string" ? raw.location : void 0,
    proxyZip: typeof raw.proxyZip === "string" ? raw.proxyZip : void 0,
    gl: typeof raw.gl === "string" ? raw.gl : "us"
  };
  const serializer = new OutputSerializer();
  for (let i = 0; i < MAX_ATTEMPTS; i++) {
    const attemptNumber = i + 1;
    const startedAtMs = Date.now();
    const canRetry = i < MAX_ATTEMPTS - 1;
    // Guards against reporting the same attempt as finished twice.
    let finishedEmitted = false;
    const emitFinished = async ({ outcome, questionCount, error, willRetry, cleanup, debug }) => {
      finishedEmitted = true;
      await emitAttemptEvent(onAttemptEvent, {
        type: "finished",
        attemptNumber,
        maxAttempts: MAX_ATTEMPTS,
        outcome,
        kernelSessionId: cleanup.kernelSessionId,
        questionCount,
        durationMs: Date.now() - startedAtMs,
        error,
        willRetry,
        cleanup,
        debug,
        completedAt: (/* @__PURE__ */ new Date()).toISOString()
      });
    };
    const reportCaptcha = async (err, cleanup, debug) => {
      console.warn(JSON.stringify({
        event: "harvest_attempt_captcha",
        attempt_number: attemptNumber,
        max_attempts: MAX_ATTEMPTS,
        message: err.message,
        will_retry: canRetry
      }));
      await emitFinished({
        outcome: "captcha",
        questionCount: 0,
        error: err.message,
        willRetry: canRetry,
        cleanup,
        debug
      });
    };
    try {
      if (signal?.aborted) throw abortReason(signal);
      const resolution2 = await resolveKernelProxyId({ ...proxyOpts, attemptIndex: i });
      const mergedAttempt = {
        ...raw,
        kernelApiKey,
        kernelProxyId: resolution2.kernelProxyId,
        kernelProxyResolution: resolution2.resolution,
        proxyMode
      };
      if (proxyMode === "none") mergedAttempt.kernelProxyId = void 0;
      const attemptOptions = HarvestOptionsSchema.parse(mergedAttempt);
      await emitAttemptEvent(onAttemptEvent, {
        type: "started",
        attemptNumber,
        maxAttempts: MAX_ATTEMPTS,
        query: attemptOptions.query,
        location: attemptOptions.location ?? null,
        maxQuestions: attemptOptions.maxQuestions,
        startedAt: new Date(startedAtMs).toISOString()
      });
      console.info(JSON.stringify({
        event: "harvest_attempt_started",
        attempt_number: attemptNumber,
        max_attempts: MAX_ATTEMPTS,
        query: attemptOptions.query,
        location: attemptOptions.location ?? null,
        max_questions: attemptOptions.maxQuestions
      }));
      const attempt = await extractOnce(attemptOptions, signal);
      if (attempt.error) {
        const err = attempt.error;
        if (err instanceof CaptchaError) {
          await reportCaptcha(err, attempt.cleanup, attempt.debug);
          if (canRetry) continue;
          break;
        }
        await emitFinished({
          outcome: classifyAttemptError(err),
          questionCount: 0,
          error: errorMessage(err),
          willRetry: false,
          cleanup: attempt.cleanup,
          debug: attempt.debug
        });
        throw err;
      }
      const result = attempt.result;
      if (!result) throw new Error("Harvest attempt completed without a result");
      await emitFinished({
        outcome: classifyAttemptResult(result),
        questionCount: result.totalQuestions,
        error: null,
        willRetry: false,
        cleanup: attempt.cleanup,
        debug: attempt.debug
      });
      const { format, outputDir } = attemptOptions;
      if (format === "json" || format === "both") {
        await serializer.writeJSON(result, outputDir);
      }
      if (format === "csv" || format === "both") {
        await Promise.all([
          serializer.writeCSV(result.flat, outputDir),
          result.videos.length > 0 ? serializer.writeVideoCSV(result.videos, result.seed, outputDir) : Promise.resolve(""),
          result.forums.length > 0 ? serializer.writeForumCSV(result.forums, result.seed, outputDir) : Promise.resolve(""),
          result.aiOverview.detected ? serializer.writeAIOverviewCSV(result.aiOverview.citations, result.aiOverview.text, result.seed, outputDir) : Promise.resolve(""),
          result.aiMode.detected ? serializer.writeAIModeCSV(result.aiMode.citations, result.aiMode.text, result.seed, outputDir) : Promise.resolve(""),
          result.whatPeopleSaying.length > 0 ? serializer.writeWhatPeopleSayingCSV(result.whatPeopleSaying, result.seed, outputDir) : Promise.resolve("")
        ]);
      }
      return result;
    } catch (err) {
      const isCaptcha = err instanceof CaptchaError;
      // Errors rethrown from above have already been reported with the real
      // cleanup/debug data; only report failures nothing has reported yet.
      if (!finishedEmitted) {
        if (isCaptcha) {
          await reportCaptcha(err, emptyAttemptCleanup(), null);
        } else {
          await emitFinished({
            outcome: classifyAttemptError(err),
            questionCount: 0,
            error: errorMessage(err),
            willRetry: false,
            cleanup: emptyAttemptCleanup(),
            debug: null
          });
        }
      }
      if (isCaptcha) {
        if (canRetry) continue;
        break;
      }
      throw err;
    }
  }
  // Reached only when every attempt ended in a CAPTCHA.
  console.warn(JSON.stringify({
    event: "harvest_captcha_exhausted",
    max_attempts: MAX_ATTEMPTS,
    session_kind: kernelApiKey ? "kernel" : "local"
  }));
  throw new CaptchaError(sanitizeVendorName(`CAPTCHA on all ${MAX_ATTEMPTS} fresh sessions. Try again in a few minutes.`));
}
1407
2779
 
1408
2780
  // src/cli.ts