mcp-scraper 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. package/README.md +5 -0
  2. package/dist/bin/api-server.cjs +15553 -7587
  3. package/dist/bin/api-server.cjs.map +1 -1
  4. package/dist/bin/api-server.js +3 -3
  5. package/dist/bin/mcp-stdio-server.cjs +312 -119
  6. package/dist/bin/mcp-stdio-server.cjs.map +1 -1
  7. package/dist/bin/mcp-stdio-server.js +1 -1
  8. package/dist/bin/paa-harvest.cjs +1537 -165
  9. package/dist/bin/paa-harvest.cjs.map +1 -1
  10. package/dist/bin/paa-harvest.js +1 -1
  11. package/dist/{chunk-LXZDJJXR.js → chunk-D4CJBZBY.js} +426 -29
  12. package/dist/chunk-D4CJBZBY.js.map +1 -0
  13. package/dist/chunk-HERFK7W6.js +2781 -0
  14. package/dist/chunk-HERFK7W6.js.map +1 -0
  15. package/dist/chunk-JQKZWEON.js +1000 -0
  16. package/dist/chunk-JQKZWEON.js.map +1 -0
  17. package/dist/chunk-Y74EXABN.js +295 -0
  18. package/dist/chunk-Y74EXABN.js.map +1 -0
  19. package/dist/{db-IOYMX64U.js → db-YWCNHBLH.js} +36 -4
  20. package/dist/index.cjs +1660 -237
  21. package/dist/index.cjs.map +1 -1
  22. package/dist/index.d.cts +169 -2
  23. package/dist/index.d.ts +169 -2
  24. package/dist/index.js +120 -69
  25. package/dist/index.js.map +1 -1
  26. package/dist/server-W5NWH5KF.js +11625 -0
  27. package/dist/server-W5NWH5KF.js.map +1 -0
  28. package/dist/{worker-3ECJHPRE.js → worker-D4D2YQTA.js} +44 -9
  29. package/dist/worker-D4D2YQTA.js.map +1 -0
  30. package/package.json +17 -5
  31. package/dist/chunk-4API3ZCT.js +0 -1387
  32. package/dist/chunk-4API3ZCT.js.map +0 -1
  33. package/dist/chunk-LXZDJJXR.js.map +0 -1
  34. package/dist/chunk-ZBP4RHNW.js +0 -805
  35. package/dist/chunk-ZBP4RHNW.js.map +0 -1
  36. package/dist/server-63DR2HE5.js +0 -6062
  37. package/dist/server-63DR2HE5.js.map +0 -1
  38. package/dist/worker-3ECJHPRE.js.map +0 -1
  39. /package/dist/{db-IOYMX64U.js.map → db-YWCNHBLH.js.map} +0 -0
@@ -0,0 +1,2781 @@
1
+ // src/schemas.ts
2
+ import { z } from "zod";
3
/**
 * Options for a harvest run. Every field except `query` is either optional
 * or has a default, so `{ query }` alone parses successfully.
 */
var HarvestOptionsSchema = z.object({
  query: z.string().min(1), // required, non-empty
  location: z.string().optional(),
  gl: z.string().length(2).default("us"), // exactly two characters
  hl: z.string().length(2).default("en"), // exactly two characters
  device: z.enum(["desktop", "mobile"]).default("desktop"),
  proxyMode: z.enum(["location", "configured", "none"]).default("location"),
  proxyZip: z
    .string()
    .regex(/^\d{5}$/) // exactly five digits
    .optional(),
  debug: z.boolean().default(false),
  depth: z.number().int().min(1).max(30).default(3),
  maxQuestions: z.number().int().min(1).max(1e3).default(100),
  headless: z.boolean().default(false),
  profileDir: z.string().optional(),
  proxy: z.string().url().optional(),
  kernelApiKey: z.string().optional(),
  kernelProxyId: z.string().optional(),
  kernelProxyResolution: z.unknown().optional(), // opaque: not validated here
  outputDir: z.string().default("./paa-output"),
  format: z.enum(["json", "csv", "both"]).default("both"),
  serpOnly: z.boolean().default(false),
  pages: z.number().int().min(1).max(2).default(1)
});

/**
 * Options for a Maps place lookup. `businessName` and `location` are both
 * required and non-empty; note that `headless` defaults to true here.
 */
var MapsPlaceOptionsSchema = z.object({
  businessName: z.string().min(1),
  location: z.string().min(1),
  gl: z.string().length(2).default("us"),
  hl: z.string().length(2).default("en"),
  includeReviews: z.boolean().default(false),
  maxReviews: z.number().int().min(1).max(500).default(50),
  kernelApiKey: z.string().optional(),
  kernelProxyId: z.string().optional(),
  headless: z.boolean().default(true)
});

/** One raw question item: only `question` is mandatory. */
var RawPAAItemSchema = z.object({
  question: z.string().min(1),
  answer: z.string().optional(),
  sourceTitle: z.string().optional(),
  sourceSite: z.string().optional(),
  sourceCite: z.string().optional()
});

/**
 * Raw Maps overview record. Every field is a nullable string (the key must be
 * present, the value may be null), so the shape is generated from the ordered
 * key list instead of being spelled out field by field.
 */
var RawMapsOverviewSchema = z.object(
  Object.fromEntries(
    [
      "name",
      "rating",
      "reviewCount",
      "category",
      "address",
      "hoursSummary",
      "phone",
      "phoneDisplay",
      "website",
      "plusCode",
      "bookingUrl"
    ].map((field) => [field, z.string().nullable()])
  )
);

/** One row of an opening-hours table: both cells are plain strings. */
var RawMapsHoursRowSchema = z.object({ day: z.string(), hours: z.string() });

/** Raw review statistics; the counts are kept as strings, not numbers. */
var RawMapsReviewStatsSchema = z.object({
  reviewHistogram: z.array(z.object({ stars: z.number(), count: z.string() })),
  reviewTopics: z.array(z.object({ label: z.string(), count: z.string() }))
});

/** One raw review card: `reviewId` is required, the rest may be null. */
var RawMapsReviewCardSchema = z.object({
  reviewId: z.string(),
  author: z.string().nullable(),
  stars: z.string().nullable(),
  date: z.string().nullable(),
  text: z.string().nullable(),
  ownerResponse: z.string().nullable()
});

/** One "section / attribute" pair. */
var RawMapsAboutAttributeSchema = z.object({
  section: z.string(),
  attribute: z.string()
});
82
+
83
+ // src/driver/BrowserDriver.ts
84
+ import { chromium } from "playwright-extra";
85
+ import StealthPlugin from "puppeteer-extra-plugin-stealth";
86
+ import { chromium as playwrightChromium } from "playwright";
87
+ import Kernel from "@onkernel/sdk";
88
+
89
+ // src/selectors.ts
90
// Selector tables. Most class names below look machine-generated, so they are
// kept as opaque strings. Selector lists are assembled with join(", ") to keep
// each alternative on its own line; the resulting strings are unchanged.

/** Selectors, attribute names and class names for question/answer pairs. */
var PAASelectors = {
  container: ".eJH8qe.adDDi",
  dataInitq: "[data-initq]",
  item: ".related-question-pair",
  itemDataQ: "data-q", // attribute name, not a selector
  itemDataInitQ: "data-initq", // attribute name, not a selector
  itemQuestionEl: ".JlqpRe",
  answerContainer: [
    ".bCOlv",
    ".hgKElc",
    ".wDYxhc",
    ".LGOjhe",
    ".fo7IQd",
    ".fmW3u"
  ].join(", "),
  sourceTitle: "h3",
  sourceSite: ".VuuXrf",
  sourceCite: "cite",
  clickTarget: ".JlqpRe",
  expandedClass: "aoRk1c", // bare class name (no leading dot)
  captchaMarker: [
    "#captcha-form",
    "#recaptcha",
    'form[action*="/sorry/"]',
    ".g-recaptcha",
    "[data-sitekey]"
  ].join(", ")
};

/** Selectors for the video block. */
var VideoSelectors = {
  container: 'div[jscontroller="HWk0Gf"]',
  sectionHeading: '.mgAbYb[role="heading"]',
  item: "a.rIRoqf"
};

/** Settings for short-video results: a `udm` value, item selector, duration format and platform names. */
var ShortVideoSelectors = {
  udm: "39",
  item: "a.rIRoqf",
  durationPattern: /^\d+:\d+$/, // digits, colon, digits (e.g. "1:23")
  platforms: ["YouTube", "TikTok", "Instagram", "Facebook", "X"]
};

/** Selectors for forum results. */
var ForumSelectors = {
  section: ".ULSxyf",
  item: "a.KYg7td.INpicf",
  title: ".hyYc0c",
  source: ".K4ETW"
};

/** Selectors and heading text for the "What people are saying" section. */
var WhatPeopleSayingSelectors = {
  sectionTag: "g-section-with-header",
  sectionHeadingText: "What people are saying",
  card: '.dRzkFf[role="listitem"]',
  cardLink: 'a.WlydOe[jsname="YKoRaf"]',
  titleH1: "h1.WQWxe",
  titleDiv: ".eAaXgc",
  popularCommentLabel: ".qgdis",
  source: ".sTl1Td",
  platformBadge: [".appd0", ".KrMNbf"].join(", "),
  ytChannel: ".sjVJQd",
  ytDate: ".PLq9Je",
  authorNote: ".nDgy9d"
};

/** Selectors for the AI Overview block, including a legacy root. */
var AIOverviewSelectors = {
  root: "[data-lhcontainer][data-streaming-container][eid]",
  legacyRoot: '[data-hveid="CBMQAA"]',
  wrapper: ".Fgyi2e",
  controller: '[jscontroller="AkrxPe"]',
  contentSubtree: '[data-subtree="mfc"]',
  header: ".heWuVc",
  heading: ".Fzsovc.cwYVJe.RJPOee",
  showMoreButton: '[aria-label="Show more AI Overview"]',
  sourcesPanel: ".OZ9ddf.WAUd4",
  disclaimer: ".DuQANe.MSJHRb"
};

/** Selectors for AI Mode content and its citation links. */
var AIModeSelectors = {
  root: '[data-hveid="CAUQAA"]',
  wrapper: ".Fgyi2e",
  citations: '.Fgyi2e [data-hveid] a[jsname="pxBnId"]'
};

/** Selectors for organic results and their rating snippets. */
var OrganicSelectors = {
  result: ".wHYlTd.tF2Cxc",
  title: "h3.LC20lb",
  siteName: ".VuuXrf",
  cite: "cite.tjvcx",
  snippet: ".VwiC3b",
  redditCite: "cite.qLRx3b",
  ratingWrap: ".Y0A0hc",
  ratingValue: ".yi40Hd",
  reviewCount: ".RDApEe"
};

/** Heading text and selectors for the local business pack. */
var LocalPackSelectors = {
  headingText: "Businesses",
  card: ".w7Dbne",
  name: ".OSrXXb",
  ratingValue: ".yi40Hd",
  reviewCount: ".RDApEe"
};

/** Selectors for a Maps place page: hours table, tabs and review cards. */
var MapsSelectors = {
  ratingAndCount: "div.F7nice",
  hoursTable: "table.eK4R0e",
  reviewScrollPane: 'div.m6QErb[tabindex="-1"]',
  reviewScrollPaneFallback: '[role="main"] div[tabindex="-1"]',
  reviewCardAuthor: ["div.d4r55", "span.d4r55", "span.RPZfBb"].join(", "),
  reviewCardDate: "span.rsqaWe",
  reviewCardText: "span.wiI7pd",
  reviewCardOwnerBlock: "div.CDe7pd",
  hoursTableAlt: 'table[aria-label*="Hour"]',
  reviewCard: "[data-review-id]",
  reviewStars: '[role="img"][aria-label*="star"]',
  reviewTab: 'button[role="tab"][aria-label*="Review"]',
  aboutTab: 'button[role="tab"][aria-label*="About"]',
  expandReview: [
    '[data-review-id] button[aria-label*="See more"]',
    "[data-review-id] button.w8nwRe"
  ].join(", ")
};
187
+
188
+ // src/errors.ts
189
var RECAPTCHA_INSTRUCTIONS = "Google returned a CAPTCHA. Run with --headless=false to re-warm the browser profile, then retry.";

/**
 * Removes the browser vendor's name from a message.
 *
 * Replacements, applied in this order (all case-insensitive):
 *   "kernel.sh sessions" / "kernel sessions" -> "sessions"
 *   "kernel.sh session"  / "kernel session"  -> "this session"
 *   "kernel.sh" / standalone "kernel"        -> "the service"
 * Runs of spaces are then collapsed and the result is trimmed.
 *
 * @param {string} message - Text to sanitize; null/undefined is treated as "".
 * @returns {string} The sanitized text.
 */
function sanitizeVendorName(message) {
  // The plural patterns must require the trailing "s". With an optional "s"
  // they also consume the singular form, which leaves the singular
  // "this session" replacements unreachable and produces text such as
  // "sessions expired" for "Kernel session expired".
  return String(message ?? "")
    .replace(/kernel\.sh\s+sessions/gi, "sessions")
    .replace(/kernel\.sh\s+session/gi, "this session")
    .replace(/kernel\.sh/gi, "the service")
    .replace(/kernel\s+sessions/gi, "sessions")
    .replace(/kernel\s+session/gi, "this session")
    .replace(/\bkernel\b/gi, "the service")
    .replace(/ +/g, " ")
    .trim();
}
193
/**
 * Error raised when a CAPTCHA is detected. The caller-supplied instructions
 * are appended to the message and also kept on `instructions`.
 */
var CaptchaError = class extends Error {
  /**
   * @param {string} instructions - Guidance text appended to the message.
   */
  constructor(instructions) {
    super(`CAPTCHA detected. ${instructions}`);
    // Assigned in this order so own-property order stays instructions, name.
    this.instructions = instructions;
    this.name = "CaptchaError";
  }
};
201
/**
 * Error raised when navigation or extraction fails. The optional underlying
 * error is stored on `cause` (undefined when none is given).
 */
var ExtractionError = class extends Error {
  /**
   * @param {string} message - Human-readable failure description.
   * @param {unknown} [cause] - The underlying error, if any.
   */
  constructor(message, cause) {
    super(message);
    // `cause` is always created as an own property, even when undefined.
    this.cause = cause;
    this.name = "ExtractionError";
  }
};
209
/**
 * Error signalling that a request was aborted. The message defaults to
 * "Request aborted before harvest completed" when none is supplied.
 */
var RequestAbortedError = class extends Error {
  /**
   * @param {string} [message] - Optional override for the default message.
   */
  constructor(message = "Request aborted before harvest completed") {
    super(message);
    this.name = "RequestAbortedError";
  }
};
215
+
216
+ // src/driver/BrowserDriver.ts
217
// Register the stealth plugin on the playwright-extra chromium launcher once,
// at module load.
chromium.use(StealthPlugin());

// Default user-agent strings, used when the launch config supplies none.
var DESKTOP_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36";
var MOBILE_USER_AGENT = "Mozilla/5.0 (iPhone; CPU iPhone OS 17_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Mobile/15E148 Safari/604.1";

// Kernel session timeout in seconds, used when the
// KERNEL_BROWSER_TIMEOUT_SECONDS environment variable gives no valid value.
var DEFAULT_KERNEL_BROWSER_TIMEOUT_SECONDS = 180;

// Upper bounds, in milliseconds, for the two teardown steps in close().
var KERNEL_BROWSER_CLOSE_TIMEOUT_MS = 3000;
var KERNEL_SESSION_DELETE_TIMEOUT_MS = 5000;
223
/**
 * Reads an environment variable as a strictly positive integer.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Returned when the variable is unset, empty, or
 *   does not convert to an integer greater than zero.
 * @returns {number} The parsed value or `fallback`.
 */
function positiveIntFromEnv(name, fallback) {
  const value = process.env[name];
  if (value) {
    const numeric = Number(value);
    if (Number.isInteger(numeric) && numeric > 0) {
      return numeric;
    }
  }
  return fallback;
}
229
/**
 * Returns the last six characters of a proxy id, so the id can be mentioned
 * without exposing it in full.
 *
 * @param {string | null | undefined} proxyId
 * @returns {string | null} The suffix, or null when `proxyId` is falsy.
 */
function proxyIdSuffix(proxyId) {
  if (!proxyId) {
    return null;
  }
  return proxyId.slice(-6);
}
232
/**
 * Turns any thrown value into text.
 *
 * @param {unknown} err - The caught value.
 * @returns {string} `err.message` for Error instances, `String(err)` otherwise.
 */
function errorText(err) {
  if (err instanceof Error) {
    return err.message;
  }
  return String(err);
}
235
/**
 * Builds the options object passed to browser-context creation from a launch
 * config.
 *
 * `viewport`, `locale` and `userAgent` are always present. When the config has
 * no `userAgent`, the mobile or desktop default is chosen by `isMobile`.
 * `deviceScaleFactor` is copied only when truthy; `isMobile` and `hasTouch`
 * only when not undefined.
 *
 * @param {object} config - Launch configuration.
 * @returns {object} Context options.
 */
function rankCheckContextOptions(config) {
  const { viewport, locale, deviceScaleFactor, isMobile, hasTouch } = config;
  const options = {
    viewport,
    locale,
    userAgent: config.userAgent ?? (isMobile ? MOBILE_USER_AGENT : DESKTOP_USER_AGENT)
  };
  if (deviceScaleFactor) {
    options.deviceScaleFactor = deviceScaleFactor;
  }
  if (isMobile !== undefined) {
    options.isMobile = isMobile;
  }
  if (hasTouch !== undefined) {
    options.hasTouch = hasTouch;
  }
  return options;
}
245
/**
 * Awaits `promise`, rejecting if it has not settled within `timeoutMs`.
 *
 * The underlying operation is not cancelled on timeout; only the wait is
 * abandoned. The timer is always cleared once the race settles.
 *
 * @param {Promise<T>} promise - Operation to wait for.
 * @param {number} timeoutMs - Deadline in milliseconds.
 * @param {string} label - Prefix for the timeout error message.
 * @returns {Promise<T>} The settled value of `promise`.
 * @throws {Error} "<label> timed out after <timeoutMs>ms" on timeout, or the
 *   rejection of `promise` itself.
 * @template T
 */
async function withTimeout(promise, timeoutMs, label) {
  let timer;
  const deadline = new Promise((_resolve, reject) => {
    timer = setTimeout(() => {
      reject(new Error(`${label} timed out after ${timeoutMs}ms`));
    }, timeoutMs);
  });
  try {
    return await Promise.race([promise, deadline]);
  } finally {
    if (timer) clearTimeout(timer);
  }
}
258
/**
 * Normalizes a YouTube channel reference into the URL of its "videos" tab.
 *
 * Accepted inputs:
 *   - a youtube.com URL (scheme optional; `www.` / `m.` hosts allowed) whose
 *     path starts with `@handle`, `channel/<id>`, `c/<name>` or `user/<name>`
 *   - a bare channel id matching `UC` plus 20 or more word/hyphen characters
 *   - a bare handle, with or without the leading `@`
 * Leading/trailing slashes and a trailing `/videos` are ignored on bare input.
 *
 * @param {string} channelInput - Handle, channel id, or channel URL.
 * @returns {string} `https://www.youtube.com/.../videos`
 * @throws {Error} When the input is missing/blank, is not on youtube.com, or
 *   is not a recognizable handle, id, or channel URL.
 */
function buildYouTubeChannelVideosUrl(channelInput) {
  // Non-string input gets the same validation error as a blank string,
  // instead of a TypeError from calling .trim() on it.
  const raw = typeof channelInput === "string" ? channelInput.trim() : "";
  if (!raw) throw new Error("channelHandle is required");

  const hasScheme = /^https?:\/\//i.test(raw);
  const urlLike = hasScheme || /^(www\.|m\.)?youtube\.com\//i.test(raw);
  if (urlLike) {
    let parsed;
    try {
      parsed = new URL(hasScheme ? raw : `https://${raw}`);
    } catch (err) {
      // new URL() throws a bare TypeError ("Invalid URL") on malformed input
      // such as "https://"; report it as a validation failure like every
      // other rejected input.
      throw new Error(
        "channel URL must be a YouTube handle, /channel/UC..., /c/..., or /user/... URL",
        { cause: err }
      );
    }
    const host = parsed.hostname.replace(/^www\./, "").replace(/^m\./, "").toLowerCase();
    if (host !== "youtube.com") throw new Error("channel URL must be on youtube.com");

    // Only the first two path segments identify the channel; anything after
    // them (e.g. "/about", "/videos") is dropped.
    const [first = "", second = ""] = parsed.pathname.split("/").filter(Boolean);
    if (first.startsWith("@")) return `https://www.youtube.com/${first}/videos`;
    if (first === "channel" && second) return `https://www.youtube.com/channel/${second}/videos`;
    if ((first === "c" || first === "user") && second) return `https://www.youtube.com/${first}/${second}/videos`;
    throw new Error("channel URL must be a YouTube handle, /channel/UC..., /c/..., or /user/... URL");
  }

  const stripped = raw.replace(/^\/+/, "").replace(/\/+$/, "");
  const withoutVideos = stripped.replace(/\/videos$/i, "");
  if (/^UC[\w-]{20,}$/.test(withoutVideos)) {
    return `https://www.youtube.com/channel/${withoutVideos}/videos`;
  }
  const handle = withoutVideos.startsWith("@") ? withoutVideos : `@${withoutVideos}`;
  if (!/^@[\w.-]+$/.test(handle)) {
    throw new Error("channelHandle must be an @handle, UC channel ID, or YouTube channel URL");
  }
  return `https://www.youtube.com/${handle}/videos`;
}
285
/**
 * Owns one browser / context / page used for scraping.
 *
 * `launch(config)` either creates a remote Kernel browser session and attaches
 * to it over CDP (when `config.kernelApiKey` is set) or launches Chromium
 * locally. `close()` releases whatever `launch()` acquired and reports what
 * happened.
 */
var BrowserDriver = class {
  browser = null;
  context = null;
  page = null;
  kernelClient = null;
  kernelSessionId = null;
  debugEnabled = false;
  debugSnapshot = {
    kernel: null,
    context: null,
    networkLocation: null,
    serpNavigation: null
  };

  /**
   * Starts the browser and opens the working page.
   *
   * @param {object} config - Launch configuration (viewport, locale, device
   *   flags, proxy and Kernel settings, `debug`, `headless`, `profileDir`).
   * @returns {Promise<void>}
   */
  async launch(config) {
    this.debugEnabled = config.debug === true;
    const proxyMode = config.proxyMode ?? (config.kernelProxyId ? "configured" : "none");
    const device = config.isMobile ? "mobile" : "desktop";
    this.debugSnapshot = {
      kernel: null,
      context: {
        viewport: config.viewport,
        locale: config.locale,
        device,
        userAgent: config.userAgent ?? (config.isMobile ? MOBILE_USER_AGENT : DESKTOP_USER_AGENT),
        deviceScaleFactor: config.deviceScaleFactor ?? null,
        isMobile: config.isMobile === true,
        hasTouch: config.hasTouch === true
      },
      networkLocation: null,
      serpNavigation: null
    };

    if (config.kernelApiKey) {
      // Remote path: create a Kernel session, then attach over CDP.
      this.kernelClient = new Kernel({ apiKey: config.kernelApiKey });
      const timeoutSeconds = positiveIntFromEnv("KERNEL_BROWSER_TIMEOUT_SECONDS", DEFAULT_KERNEL_BROWSER_TIMEOUT_SECONDS);
      const kernelBrowser = await this.kernelClient.browsers.create({
        stealth: true,
        timeout_seconds: timeoutSeconds,
        ...config.kernelProxyId ? { proxy_id: config.kernelProxyId } : {}
      });
      // Recorded immediately so close() can delete the session even if a
      // later step in this method throws.
      this.kernelSessionId = kernelBrowser.session_id;

      let defaultProxyDisabled = null;
      let defaultProxyDisableError = null;
      if (proxyMode === "none") {
        // Best effort: a failure is recorded in the debug snapshot, not thrown.
        try {
          await withTimeout(
            this.kernelClient.browsers.update(this.kernelSessionId, { disable_default_proxy: true }),
            5e3,
            `Kernel session ${this.kernelSessionId} disable default proxy`
          );
          defaultProxyDisabled = true;
        } catch (err) {
          defaultProxyDisabled = false;
          defaultProxyDisableError = errorText(err);
        }
      }

      const kernelDebug = {
        sessionId: this.kernelSessionId,
        proxyMode,
        requestedProxyIdPresent: Boolean(config.kernelProxyId),
        requestedProxyIdSuffix: proxyIdSuffix(config.kernelProxyId),
        createdProxyIdPresent: typeof kernelBrowser.proxy_id === "string" ? Boolean(kernelBrowser.proxy_id) : null,
        createdProxyIdSuffix: proxyIdSuffix(kernelBrowser.proxy_id),
        retrievedProxyIdPresent: null,
        retrievedProxyIdSuffix: null,
        retrievedProxyIdMatchesRequested: null,
        defaultProxyDisabled,
        defaultProxyDisableError,
        proxyResolution: config.kernelProxyResolution ?? null,
        timeoutSeconds,
        stealth: typeof kernelBrowser.stealth === "boolean" ? kernelBrowser.stealth : null,
        profilePresent: null,
        poolPresent: null,
        retrieveError: null
      };
      this.debugSnapshot.kernel = kernelDebug;
      console.info(JSON.stringify({
        event: "kernel_browser_created",
        kernel_session_id: this.kernelSessionId,
        timeout_seconds: timeoutSeconds,
        proxy_mode: proxyMode,
        proxy_id_present: Boolean(config.kernelProxyId),
        proxy_resolution_source: config.kernelProxyResolution?.source
      }));
      if (this.debugEnabled) {
        await this.populateKernelRetrieveDebug(kernelDebug, config.kernelProxyId);
      }

      this.browser = await playwrightChromium.connectOverCDP(kernelBrowser.cdp_ws_url);
      this.context = await this.browser.newContext(rankCheckContextOptions(config));
      await this.installEsbuildHelperShims(this.context);
      this.page = await this.context.newPage();
      await this.page.setViewportSize(config.viewport);
      if (this.debugEnabled) {
        this.debugSnapshot.networkLocation = await this.captureBrowserNetworkLocation();
      }
      return;
    }

    // Local path: launch Chromium through playwright-extra.
    const launchOpts = {
      headless: config.headless,
      proxy: config.proxy ? { server: config.proxy } : void 0
    };
    const ctxOpts = rankCheckContextOptions(config);
    if (config.profileDir) {
      // A persistent context has no separate browser handle, so
      // `this.browser` stays null on this path.
      this.context = await chromium.launchPersistentContext(config.profileDir, {
        ...launchOpts,
        ...ctxOpts
      });
      await this.installEsbuildHelperShims(this.context);
      this.page = await this.context.newPage();
    } else {
      this.browser = await chromium.launch(launchOpts);
      this.context = await this.browser.newContext(ctxOpts);
      await this.installEsbuildHelperShims(this.context);
      this.page = await this.context.newPage();
    }
    if (this.debugEnabled) {
      this.debugSnapshot.networkLocation = await this.captureBrowserNetworkLocation();
    }
  }

  /**
   * Debug only: re-reads the Kernel session and records what the service
   * reports (proxy id, timeout, stealth, profile/pool presence) into
   * `kernelDebug`. Failures are stored in `kernelDebug.retrieveError`.
   */
  async populateKernelRetrieveDebug(kernelDebug, requestedProxyId) {
    if (!this.kernelClient || !this.kernelSessionId) return;
    try {
      const retrieved = await withTimeout(
        this.kernelClient.browsers.retrieve(this.kernelSessionId),
        5e3,
        `Kernel session ${this.kernelSessionId} retrieve`
      );
      kernelDebug.retrievedProxyIdPresent = typeof retrieved.proxy_id === "string" ? Boolean(retrieved.proxy_id) : false;
      kernelDebug.retrievedProxyIdSuffix = proxyIdSuffix(retrieved.proxy_id);
      kernelDebug.retrievedProxyIdMatchesRequested = requestedProxyId ? retrieved.proxy_id === requestedProxyId : !retrieved.proxy_id;
      kernelDebug.timeoutSeconds = typeof retrieved.timeout_seconds === "number" ? retrieved.timeout_seconds : kernelDebug.timeoutSeconds;
      kernelDebug.stealth = typeof retrieved.stealth === "boolean" ? retrieved.stealth : kernelDebug.stealth;
      kernelDebug.profilePresent = Boolean(retrieved.profile);
      kernelDebug.poolPresent = Boolean(retrieved.pool);
    } catch (err) {
      kernelDebug.retrieveError = errorText(err);
    }
  }

  /**
   * Debug only: opens a scratch page and asks public IP services where the
   * browser's traffic appears to come from. Tries ipwho.is, then
   * api64.ipify.org, then ipapi.co. Never throws; failures come back in the
   * `error` field of the returned record.
   */
  async captureBrowserNetworkLocation() {
    const fallback = (message, source = "ipapi.co") => ({
      source,
      ip: null,
      city: null,
      region: null,
      country: null,
      org: null,
      timezone: null,
      error: message
    });
    if (!this.context) return fallback("browser context is not available");
    let debugPage = null;
    try {
      debugPage = await this.context.newPage();
      const ipwho = await this.loadJsonInDebugPage(debugPage, "https://ipwho.is/");
      if (ipwho) {
        const connection = typeof ipwho.connection === "object" && ipwho.connection !== null ? ipwho.connection : {};
        return {
          source: "ipwho.is",
          ip: typeof ipwho.ip === "string" ? ipwho.ip : null,
          city: typeof ipwho.city === "string" ? ipwho.city : null,
          region: typeof ipwho.region === "string" ? ipwho.region : null,
          country: typeof ipwho.country === "string" ? ipwho.country : null,
          org: typeof connection.org === "string" ? connection.org : null,
          timezone: typeof ipwho.timezone === "object" && ipwho.timezone !== null && typeof ipwho.timezone.id === "string" ? ipwho.timezone.id : null,
          error: null
        };
      }
      const ipify = await this.loadJsonInDebugPage(debugPage, "https://api64.ipify.org?format=json");
      if (ipify) {
        return {
          source: "api64.ipify.org",
          ip: typeof ipify.ip === "string" ? ipify.ip : null,
          city: null,
          region: null,
          country: null,
          org: null,
          timezone: null,
          error: null
        };
      }
      // Last resort: unlike the two probes above, a failure here falls through
      // to the catch block and is reported via `error`.
      await withTimeout(
        debugPage.goto("https://ipapi.co/json/", { waitUntil: "domcontentloaded", timeout: 7e3 }),
        8e3,
        "browser network location navigation"
      );
      const body = await debugPage.locator("body").innerText({ timeout: 2e3 });
      const data = JSON.parse(body);
      return {
        source: "ipapi.co",
        ip: typeof data.ip === "string" ? data.ip : null,
        city: typeof data.city === "string" ? data.city : null,
        region: typeof data.region === "string" ? data.region : null,
        country: typeof data.country_name === "string" ? data.country_name : typeof data.country === "string" ? data.country : null,
        org: typeof data.org === "string" ? data.org : null,
        timezone: typeof data.timezone === "string" ? data.timezone : null,
        error: null
      };
    } catch (err) {
      return fallback(errorText(err));
    } finally {
      // Closing the scratch page is best effort.
      await debugPage?.close().catch(() => {
      });
    }
  }

  /**
   * Navigates `debugPage` to `url` and parses the page body as JSON.
   * @returns {Promise<object | null>} Parsed JSON, or null on any failure.
   */
  async loadJsonInDebugPage(debugPage, url) {
    try {
      await withTimeout(
        debugPage.goto(url, { waitUntil: "domcontentloaded", timeout: 7e3 }),
        8e3,
        `browser network location navigation ${url}`
      );
      const body = await debugPage.locator("body").innerText({ timeout: 2e3 });
      return JSON.parse(body);
    } catch {
      return null;
    }
  }

  /**
   * Installs an init script that defines `__name` and `__publicField` on the
   * page global when they are missing. (Presumably these are esbuild helper
   * names that can leak into functions serialized for in-page evaluation —
   * inferred from the method name.)
   */
  async installEsbuildHelperShims(context) {
    await context.addInitScript(() => {
      const g = globalThis;
      if (typeof g.__name !== "function") g.__name = (fn) => fn;
      if (typeof g.__publicField !== "function") g.__publicField = (obj, key, value) => {
        obj[key] = value;
        return value;
      };
    });
  }

  /**
   * Loads a Google results page and reports whether question items appear.
   *
   * @param {string} query - Search query.
   * @param {string | null | undefined} uule - Optional `uule` parameter value.
   * @param {string} gl - Value for the `gl` parameter.
   * @param {string} hl - Value for the `hl` parameter.
   * @param {{ num?: number, debug?: boolean }} [options]
   * @returns {Promise<{ hasPaa: boolean }>}
   * @throws {CaptchaError} When a CAPTCHA marker is present on the page.
   * @throws {ExtractionError} When navigation itself fails.
   */
  async navigateToSERP(query, uule, gl, hl, options) {
    const params = new URLSearchParams({ q: query, gl, hl, pws: "0" });
    if (options?.num) params.set("num", String(options.num));
    if (uule) params.set("uule", uule);
    const url = "https://www.google.com/search?" + params.toString();
    const navDebug = options?.debug ? {
      requestedUrl: url,
      finalUrl: null,
      title: null,
      bodySnippet: null,
      hasPaa: null,
      captchaDetected: null,
      googleSorryUrl: null,
      redirected: null
    } : null;
    if (navDebug) this.debugSnapshot.serpNavigation = navDebug;
    try {
      await this.page.goto(url, { waitUntil: "domcontentloaded", timeout: 45e3 });
    } catch (err) {
      await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: null, captchaDetected: null });
      const diag = await this.captureDiagnostics(url);
      // errorText() copes with non-Error throwables; the original error is
      // kept as the cause.
      throw new ExtractionError(`page.goto failed: ${errorText(err)} | ${diag}`, err);
    }
    const captchaCount = await this.page.locator(PAASelectors.captchaMarker).count();
    if (captchaCount > 0) {
      await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: false, captchaDetected: true });
      throw new CaptchaError(this.captchaMessage());
    }
    // Fast path: the items are often already present shortly after load.
    const fastFound = await this.page.waitForSelector(PAASelectors.item, { timeout: 4e3 }).catch(() => null);
    if (fastFound) {
      await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: true, captchaDetected: false });
      return { hasPaa: true };
    }
    // Check again for a CAPTCHA after the wait.
    const captchaAfter = await this.page.locator(PAASelectors.captchaMarker).count();
    if (captchaAfter > 0) {
      await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: false, captchaDetected: true });
      throw new CaptchaError(this.captchaMessage());
    }
    // Slow path: scroll down in six steps, checking for items after each.
    for (let i = 1; i <= 6; i++) {
      await this.page.evaluate((f) => {
        window.scrollTo(0, document.body.scrollHeight * f);
      }, i / 6);
      await this.page.waitForTimeout(600);
      const count = await this.page.locator(PAASelectors.item).count();
      if (count > 0) {
        await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: true, captchaDetected: false });
        return { hasPaa: true };
      }
    }
    await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: false, captchaDetected: false });
    return { hasPaa: false };
  }

  /**
   * Fills `navDebug` with the page's current URL, title and a body snippet.
   * Does nothing when debugging is off (`navDebug` is null) or no page exists.
   */
  async updateSerpNavigationDebug(navDebug, requestedUrl, state) {
    if (!navDebug || !this.page) return;
    try {
      const finalUrl = this.page.url();
      const title = await this.page.title().catch(() => "");
      const bodySnippet = await this.page.evaluate(() => {
        const text = (document.body?.innerText ?? "").replace(/\s+/g, " ").trim();
        return text.slice(0, 500);
      }).catch(() => "");
      const textCaptcha = /recaptcha|unusual traffic|are you a robot/i.test(bodySnippet);
      navDebug.finalUrl = finalUrl;
      navDebug.title = title;
      navDebug.bodySnippet = bodySnippet;
      navDebug.hasPaa = state.hasPaa;
      // An explicit caller verdict wins; otherwise fall back to the text check.
      navDebug.captchaDetected = state.captchaDetected ?? textCaptcha;
      navDebug.googleSorryUrl = /google\.[^/]+\/sorry\//i.test(finalUrl);
      navDebug.redirected = finalUrl !== requestedUrl;
    } catch (err) {
      navDebug.bodySnippet = `debug capture failed: ${errorText(err)}`;
    }
  }

  /**
   * Builds a one-line description of the page state for error messages:
   * intended vs. final URL, title, flags (CONSENT_WALL, BOT_CHALLENGE,
   * REDIRECTED) and a body snippet. Never throws.
   */
  async captureDiagnostics(intendedUrl) {
    try {
      const finalUrl = this.page.url();
      const title = await this.page.title().catch(() => "");
      const bodySnippet = await this.page.evaluate(() => {
        const t = (document.body?.innerText ?? "").replace(/\s+/g, " ").trim();
        return t.slice(0, 400);
      }).catch(() => "");
      const consent = /consent\.google\./.test(finalUrl) || /before you continue/i.test(bodySnippet);
      const recaptcha = /recaptcha|unusual traffic|are you a robot/i.test(bodySnippet);
      const flags = [
        consent ? "CONSENT_WALL" : "",
        recaptcha ? "BOT_CHALLENGE" : "",
        finalUrl !== intendedUrl ? "REDIRECTED" : ""
      ].filter(Boolean).join(",");
      return `intended=${intendedUrl} | final=${finalUrl} | title="${title}" | flags=[${flags}] | body="${bodySnippet}"`;
    } catch (e) {
      return `diagnostics-failed: ${errorText(e)}`;
    }
  }

  /** Message for CaptchaError; the wording depends on whether a Kernel client is in use. */
  captchaMessage() {
    return this.kernelClient ? "Google returned a CAPTCHA on this session \u2014 retrying with a fresh session." : RECAPTCHA_INSTRUCTIONS;
  }

  /**
   * Navigates the working page to `url`.
   * @throws {ExtractionError} When navigation fails; the message includes diagnostics.
   */
  async navigateTo(url) {
    try {
      await this.page.goto(url, { waitUntil: "domcontentloaded", timeout: 45e3 });
    } catch (err) {
      const diag = await this.captureDiagnostics(url);
      throw new ExtractionError(`page.goto failed: ${errorText(err)} | ${diag}`, err);
    }
  }

  /**
   * Navigates to the videos tab of a YouTube channel and waits for network idle.
   * @param {string} channelHandle - Handle, channel id, or channel URL.
   * @throws {ExtractionError} When navigation fails.
   */
  async navigateToChannel(channelHandle) {
    const url = buildYouTubeChannelVideosUrl(channelHandle);
    try {
      await this.page.goto(url, { waitUntil: "networkidle", timeout: 3e4 });
    } catch (err) {
      const diag = await this.captureDiagnostics(url);
      throw new ExtractionError(`navigateToChannel failed: ${errorText(err)} | ${diag}`, err);
    }
  }

  /** Runs `fn(arg)` inside the working page and returns its result. */
  async evaluate(fn, arg) {
    return this.page.evaluate(fn, arg);
  }

  /** @returns The working page, or null before launch / after close. */
  getPage() {
    return this.page;
  }

  /** @returns {string | null} The Kernel session id, if a session is active. */
  getKernelSessionId() {
    return this.kernelSessionId;
  }

  /** @returns {object} The debug snapshot collected so far. */
  getDebugSnapshot() {
    return this.debugSnapshot;
  }

  /**
   * Releases everything `launch()` acquired and reports the outcome.
   *
   * When a Kernel session exists it is deleted even if no browser was ever
   * attached (for example when `launch()` failed after creating the session
   * but before the CDP connection succeeded); otherwise the remote session
   * would stay alive until its own timeout. Kernel teardown never throws:
   * failures are logged and returned in the result. On the local paths a
   * failing close() still propagates.
   *
   * @returns {Promise<{
   *   kernelSessionId: string | null,
   *   kernelDeleteStarted: boolean,
   *   kernelDeleteSucceeded: boolean | null,
   *   kernelDeleteError: string | null,
   *   browserCloseSucceeded: boolean | null,
   *   browserCloseError: string | null
   * }>} `null` fields mean the step did not apply.
   */
  async close() {
    const browser = this.browser;
    const context = this.context;
    const sessionId = this.kernelSessionId;
    const client = this.kernelClient;

    if (client && sessionId) {
      // Whatever local handle exists (normally the CDP browser) is closed
      // alongside the session delete; there may be none at all.
      const closeTarget = browser ?? context;
      this.browser = null;
      this.context = null;
      this.page = null;
      this.kernelSessionId = null;
      this.kernelClient = null;
      console.info(JSON.stringify({
        event: "kernel_browser_delete_started",
        kernel_session_id: sessionId
      }));
      const deleteSession = withTimeout(
        client.browsers.deleteByID(sessionId),
        KERNEL_SESSION_DELETE_TIMEOUT_MS,
        `Kernel session ${sessionId} delete`
      );
      const closeBrowser = closeTarget ? withTimeout(
        closeTarget.close(),
        KERNEL_BROWSER_CLOSE_TIMEOUT_MS,
        `Kernel browser ${sessionId} close`
      ) : Promise.resolve();
      // Both steps run in parallel and neither may abort the other.
      const [deleteResult, closeResult] = await Promise.allSettled([deleteSession, closeBrowser]);
      const result = {
        kernelSessionId: sessionId,
        kernelDeleteStarted: true,
        kernelDeleteSucceeded: deleteResult.status === "fulfilled",
        kernelDeleteError: deleteResult.status === "rejected" ? errorText(deleteResult.reason) : null,
        browserCloseSucceeded: closeTarget ? closeResult.status === "fulfilled" : null,
        browserCloseError: closeResult.status === "rejected" ? errorText(closeResult.reason) : null
      };
      if (deleteResult.status === "rejected") {
        console.warn(JSON.stringify({
          event: "kernel_browser_delete_failed",
          kernel_session_id: sessionId,
          message: result.kernelDeleteError
        }));
        console.warn(`Kernel session cleanup failed for ${sessionId}:`, deleteResult.reason);
      } else {
        console.info(JSON.stringify({
          event: "kernel_browser_delete_succeeded",
          kernel_session_id: sessionId
        }));
      }
      if (closeResult.status === "rejected") {
        console.warn(JSON.stringify({
          event: "kernel_browser_close_failed",
          kernel_session_id: sessionId,
          message: result.browserCloseError
        }));
        console.warn(`Kernel browser close failed for ${sessionId}:`, closeResult.reason);
      }
      return result;
    }

    if (browser) {
      // Locally launched browser (no Kernel session to delete).
      this.browser = null;
      this.context = null;
      this.page = null;
      this.kernelSessionId = null;
      this.kernelClient = null;
      await browser.close();
      return {
        kernelSessionId: null,
        kernelDeleteStarted: false,
        kernelDeleteSucceeded: null,
        kernelDeleteError: null,
        browserCloseSucceeded: true,
        browserCloseError: null
      };
    }

    if (context) {
      // Persistent-context launch: the context is the only handle to close.
      this.context = null;
      this.page = null;
      await context.close();
      return {
        kernelSessionId: null,
        kernelDeleteStarted: false,
        kernelDeleteSucceeded: null,
        kernelDeleteError: null,
        browserCloseSucceeded: true,
        browserCloseError: null
      };
    }

    // Nothing was launched (or close() already ran).
    return {
      kernelSessionId: null,
      kernelDeleteStarted: false,
      kernelDeleteSucceeded: null,
      kernelDeleteError: null,
      browserCloseSucceeded: null,
      browserCloseError: null
    };
  }
};
726
+
727
+ // src/locations.ts
728
// Lookup table from a lower-case city name or nickname to a canonical
// "City,Region,Country" string. Several aliases may point at the same
// canonical value (e.g. "nyc" and "new york city").
// Keys must be lower-case: normalizeLocation lower-cases its input before
// looking it up here.
var LOCATIONS = {
  "austin": "Austin,Texas,United States",
  "new york": "New York,New York,United States",
  "new york city": "New York,New York,United States",
  "nyc": "New York,New York,United States",
  "los angeles": "Los Angeles,California,United States",
  "la": "Los Angeles,California,United States",
  "chicago": "Chicago,Illinois,United States",
  "houston": "Houston,Texas,United States",
  "phoenix": "Phoenix,Arizona,United States",
  "philadelphia": "Philadelphia,Pennsylvania,United States",
  "philly": "Philadelphia,Pennsylvania,United States",
  "san antonio": "San Antonio,Texas,United States",
  "dallas": "Dallas,Texas,United States",
  "miami": "Miami,Florida,United States",
  "seattle": "Seattle,Washington,United States",
  "denver": "Denver,Colorado,United States",
  "loveland": "Loveland,Colorado,United States",
  "loveland co": "Loveland,Colorado,United States",
  "fort collins": "Fort Collins,Colorado,United States",
  "boulder": "Boulder,Colorado,United States",
  "colorado springs": "Colorado Springs,Colorado,United States",
  "boston": "Boston,Massachusetts,United States",
  "atlanta": "Atlanta,Georgia,United States",
  "san francisco": "San Francisco,California,United States",
  "sf": "San Francisco,California,United States",
  "portland": "Portland,Oregon,United States",
  "las vegas": "Las Vegas,Nevada,United States",
  "minneapolis": "Minneapolis,Minnesota,United States",
  "detroit": "Detroit,Michigan,United States",
  "nashville": "Nashville,Tennessee,United States",
  "charlotte": "Charlotte,North Carolina,United States",
  "orlando": "Orlando,Florida,United States",
  "san diego": "San Diego,California,United States",
  "baltimore": "Baltimore,Maryland,United States",
  "sacramento": "Sacramento,California,United States",
  "columbus": "Columbus,Ohio,United States",
  "indianapolis": "Indianapolis,Indiana,United States",
  "san jose": "San Jose,California,United States",
  "fort worth": "Fort Worth,Texas,United States",
  "jacksonville": "Jacksonville,Florida,United States",
  "memphis": "Memphis,Tennessee,United States",
  "louisville": "Louisville,Kentucky,United States",
  "raleigh": "Raleigh,North Carolina,United States",
  "richmond": "Richmond,Virginia,United States",
  "salt lake city": "Salt Lake City,Utah,United States",
  "toronto": "Toronto,Ontario,Canada",
  "vancouver": "Vancouver,British Columbia,Canada",
  "montreal": "Montreal,Quebec,Canada",
  "calgary": "Calgary,Alberta,Canada",
  "ottawa": "Ottawa,Ontario,Canada",
  "london": "London,England,United Kingdom",
  "manchester": "Manchester,England,United Kingdom",
  "birmingham": "Birmingham,England,United Kingdom",
  "edinburgh": "Edinburgh,Scotland,United Kingdom",
  "glasgow": "Glasgow,Scotland,United Kingdom",
  "leeds": "Leeds,England,United Kingdom",
  "sydney": "Sydney,New South Wales,Australia",
  "melbourne": "Melbourne,Victoria,Australia",
  "brisbane": "Brisbane,Queensland,Australia",
  "perth": "Perth,Western Australia,Australia",
  "adelaide": "Adelaide,South Australia,Australia",
  "dublin": "Dublin,Leinster,Ireland"
};
792
+
793
+ // src/uule.ts
794
/**
 * Encode an integer as a base-128 varint: seven payload bits per byte, least
 * significant group first, with the high bit set on every byte except the last.
 *
 * The value goes through 32-bit bitwise operators, so only the low 32 bits
 * are encoded.
 *
 * @param {number} value - Integer to encode.
 * @returns {number[]} The encoded bytes (always at least one).
 */
function encodeVarint(value) {
  const out = [];
  let rest = value;
  for (;;) {
    const low7 = rest & 0x7f;
    rest >>>= 7;
    if (rest === 0) {
      // Final group: continuation bit stays clear.
      out.push(low7);
      return out;
    }
    out.push(low7 | 0x80);
  }
}
805
/**
 * Build a `w+`-prefixed, base64-encoded token for a location name.
 *
 * The encoded payload is a fixed 5-byte header, the varint-encoded UTF-8 byte
 * length of the name, and then the UTF-8 bytes of the name.
 *
 * @param {string} name - Location name to encode.
 * @returns {string} `"w+"` followed by the base64 payload.
 */
function encodeUule(name) {
  const nameBytes = Buffer.from(name, "utf8");
  const header = Buffer.from([8, 2, 16, 32, 34]);
  const lengthPrefix = Buffer.from(encodeVarint(nameBytes.length));
  const encoded = Buffer.concat([header, lengthPrefix, nameBytes]).toString("base64");
  return `w+${encoded}`;
}
814
/**
 * Resolve a free-form location string to its canonical entry in LOCATIONS.
 *
 * Lookups are attempted in this order against the lower-cased, trimmed input:
 *   1. the whole string ("new york city");
 *   2. the part before the first comma ("austin, tx" -> "austin");
 *   3. the string with a trailing two-letter token removed ("austin tx" -> "austin").
 * If nothing matches, the original input is returned unchanged.
 *
 * Only own properties of LOCATIONS are consulted, so inputs such as
 * "constructor" or "__proto__" cannot resolve to members inherited from
 * Object.prototype.
 *
 * @param {string} input - User-supplied location text.
 * @returns {string} The canonical location, or `input` when unknown.
 */
function normalizeLocation(input) {
  // Own-property lookup; a missing (or empty) entry yields undefined.
  const lookup = (key) => (Object.hasOwn(LOCATIONS, key) && LOCATIONS[key]) || undefined;

  const raw = input.toLowerCase().trim();
  const exact = lookup(raw);
  if (exact) return exact;

  const beforeComma = raw.split(",")[0].trim();
  if (beforeComma !== raw) {
    const byCity = lookup(beforeComma);
    if (byCity) return byCity;
  }

  const withoutState = raw.replace(/\s+[a-z]{2}$/, "").trim();
  if (withoutState !== raw) {
    const byStripped = lookup(withoutState);
    if (byStripped) return byStripped;
  }

  return input;
}
823
+
824
+ // src/serp-location-debug.ts
825
// Lower-case US state name (plus "district of columbia") -> two-letter code.
var STATE_TO_CODE = {
  alabama: "AL",
  alaska: "AK",
  arizona: "AZ",
  arkansas: "AR",
  california: "CA",
  colorado: "CO",
  connecticut: "CT",
  delaware: "DE",
  florida: "FL",
  georgia: "GA",
  hawaii: "HI",
  idaho: "ID",
  illinois: "IL",
  indiana: "IN",
  iowa: "IA",
  kansas: "KS",
  kentucky: "KY",
  louisiana: "LA",
  maine: "ME",
  maryland: "MD",
  massachusetts: "MA",
  michigan: "MI",
  minnesota: "MN",
  mississippi: "MS",
  missouri: "MO",
  montana: "MT",
  nebraska: "NE",
  nevada: "NV",
  "new hampshire": "NH",
  "new jersey": "NJ",
  "new mexico": "NM",
  "new york": "NY",
  "north carolina": "NC",
  "north dakota": "ND",
  ohio: "OH",
  oklahoma: "OK",
  oregon: "OR",
  pennsylvania: "PA",
  "rhode island": "RI",
  "south carolina": "SC",
  "south dakota": "SD",
  tennessee: "TN",
  texas: "TX",
  utah: "UT",
  vermont: "VT",
  virginia: "VA",
  washington: "WA",
  "west virginia": "WV",
  wisconsin: "WI",
  wyoming: "WY",
  "district of columbia": "DC"
};
// Regex alternation of every state name (internal spaces relaxed to \s+)
// followed by every two-letter code.
var STATE_PATTERN = [
  ...Object.keys(STATE_TO_CODE).map((s) => s.replace(/\s+/g, "\\s+")),
  ...Object.values(STATE_TO_CODE)
].join("|");
// Matches "<City>[,] <State>": group 1 is one to five words joined by a space,
// dot or hyphen; group 2 is a state name or code.
// NOTE(review): the "i" flag makes the whole pattern case-insensitive, so the
// [A-Z] classes also match lower-case letters and two-letter codes such as
// "in", "or" or "me" match as ordinary words.
// The "g" flag is needed because scanText iterates it with matchAll.
var CITY_STATE_RE = new RegExp(`\\b([A-Z][A-Za-z]+(?:[\\s.-][A-Z][A-Za-z]+){0,4}),?\\s+(${STATE_PATTERN})\\b`, "gi");
883
/**
 * Convert a region name or abbreviation to an upper-case two-letter code.
 *
 * - Any two-letter alphabetic input is returned upper-cased as-is (it is not
 *   checked against the list of known codes).
 * - Otherwise the lower-cased input is looked up in STATE_TO_CODE using own
 *   properties only, so inherited names such as "constructor" yield null
 *   rather than a member of Object.prototype.
 *
 * @param {string|null|undefined} input - Region text, e.g. "tx" or "Texas".
 * @returns {string|null} The two-letter code, or null when unknown or empty.
 */
function normalizeRegionCode(input) {
  if (!input) return null;
  const trimmed = input.trim();
  if (/^[A-Z]{2}$/i.test(trimmed)) return trimmed.toUpperCase();
  const key = trimmed.toLowerCase();
  if (!Object.hasOwn(STATE_TO_CODE, key)) return null;
  return STATE_TO_CODE[key] ?? null;
}
889
/**
 * Normalise a city fragment: collapse whitespace, drop everything up to and
 * including the last lead-in word ("in", "near", "around", "serving"), then
 * title-case the remainder.
 *
 * @param {string} input - Raw city text, possibly with a lead-in phrase.
 * @returns {string} Title-cased city name (may be empty).
 */
function normalizeCity(input) {
  const collapsed = input.replace(/\s+/g, " ").trim();
  const withoutLeadIn = collapsed.replace(/^.*\b(?:in|near|around|serving)\s+/i, "");
  const lowered = withoutLeadIn.toLowerCase();
  return lowered.replace(/\b[a-z]/g, (letter) => letter.toUpperCase());
}
893
/**
 * Split a canonical "City,Region,..." string into the expected city and
 * region code used for comparison against SERP evidence.
 *
 * @param {string|null|undefined} canonicalLocation - Canonical location text.
 * @returns {{city: string, regionCode: (string|null), canonicalLocation: string}|null}
 *   Parsed expectation, or null when no location was supplied.
 */
function parseExpected(canonicalLocation) {
  if (!canonicalLocation) return null;
  const parts = canonicalLocation.split(",").map((piece) => piece.trim());
  const cityPart = parts[0] ?? "";
  const regionPart = parts[1] ?? "";
  return {
    city: normalizeCity(cityPart),
    regionCode: normalizeRegionCode(regionPart),
    canonicalLocation
  };
}
902
/**
 * Record one city/region sighting in the candidates map.
 *
 * Sightings are keyed by "<lower-case city>|<region code>". A repeat sighting
 * increments the count and keeps up to three distinct example snippets.
 * Sightings whose city or region cannot be normalised are ignored.
 *
 * @param {Map<string, {city: string, regionCode: string, count: number, examples: string[]}>} candidates
 *   Accumulator, mutated in place.
 * @param {string} city - Raw city text.
 * @param {string} region - Raw region text.
 * @param {string} example - Snippet in which the sighting occurred.
 * @returns {void}
 */
function addCandidate(candidates, city, region, example) {
  const cityName = normalizeCity(city);
  const code = normalizeRegionCode(region);
  if (!(cityName && code)) return;

  const mapKey = `${cityName.toLowerCase()}|${code}`;
  const entry = candidates.get(mapKey);
  if (!entry) {
    candidates.set(mapKey, { city: cityName, regionCode: code, count: 1, examples: [example] });
    return;
  }

  entry.count += 1;
  const hasRoom = entry.examples.length < 3;
  if (hasRoom && !entry.examples.includes(example)) {
    entry.examples.push(example);
  }
}
915
/**
 * Scan a blob of SERP text (titles, snippets, URLs) for "City, State"
 * mentions and record each one in `candidates`.
 *
 * Percent-escapes are decoded first so URL-encoded place names are visible.
 * Decoding is done one escape run at a time: the input mixes URLs with
 * free text, and a bare "%" (as in "50% off") would make a whole-string
 * decodeURIComponent call throw URIError. Runs that are not valid UTF-8
 * escapes are left untouched.
 *
 * URL and slug separators (+ / | _ -) are then turned into spaces before
 * matching against CITY_STATE_RE.
 *
 * @param {Map<string, object>} candidates - Accumulator passed to addCandidate.
 * @param {string} text - Text to scan.
 * @returns {void}
 */
function scanText(candidates, text) {
  const decoded = text.replace(/(?:%[0-9A-Fa-f]{2})+/g, (run) => {
    try {
      return decodeURIComponent(run);
    } catch {
      // Malformed UTF-8 sequence: keep the raw text instead of failing the scan.
      return run;
    }
  });
  const normalized = decoded.replace(/[+/|_-]+/g, " ");
  for (const match of normalized.matchAll(CITY_STATE_RE)) {
    addCandidate(candidates, match[1] ?? "", match[2] ?? "", normalized.slice(0, 180));
  }
}
921
/**
 * Compare the location a search was requested for with the city/state
 * mentions found in its results.
 *
 * Every organic result (title, snippet, cite, url) and every local-pack
 * business (name, metadata, website and directions URLs) is scanned. The
 * top eight candidates, ordered by count and then by city name, are returned
 * with a status:
 *   - "not_requested": no canonical location was supplied;
 *   - "unknown": a location was supplied but no candidates were found;
 *   - "matched" / "mismatch": whether any candidate has the expected city
 *     (and the expected region code, when one is known).
 *
 * @param {string|null|undefined} canonicalLocation - Requested location.
 * @param {Array<object>} organicResults - Organic results to scan.
 * @param {Array<object>} localPack - Local-pack businesses to scan.
 * @returns {{status: string, expected: (object|null), candidates: object[]}}
 */
function inferSerpLocationEvidence(canonicalLocation, organicResults, localPack) {
  const expected = parseExpected(canonicalLocation);
  const found = /* @__PURE__ */ new Map();

  for (const hit of organicResults) {
    const pieces = [hit.title, hit.snippet ?? "", hit.cite ?? "", hit.url];
    scanText(found, pieces.join(" "));
  }
  for (const place of localPack) {
    const pieces = [place.name, ...place.metadata, place.websiteUrl ?? "", place.directionsUrl ?? ""];
    scanText(found, pieces.join(" "));
  }

  const byCountThenCity = (left, right) => right.count - left.count || left.city.localeCompare(right.city);
  const ranked = Array.from(found.values()).sort(byCountThenCity).slice(0, 8);

  if (!expected) {
    return { status: "not_requested", expected: null, candidates: ranked };
  }
  if (ranked.length === 0) {
    return { status: "unknown", expected, candidates: [] };
  }

  const wantedCity = expected.city.toLowerCase();
  const isMatch = (candidate) => {
    if (candidate.city.toLowerCase() !== wantedCity) return false;
    return expected.regionCode == null || candidate.regionCode === expected.regionCode;
  };
  const status = ranked.some(isMatch) ? "matched" : "mismatch";
  return { status, expected, candidates: ranked };
}
946
+
947
+ // src/lib/paa-answer-cleanup.ts
948
// Upper bound, in characters, applied by trimToSentenceLimit.
var MAX_ANSWER_LENGTH = 1200;
// Phrases removed from an answer wherever they occur (cleanPAAAnswerText
// replaces each match with a space). All are global so every occurrence goes.
var BOILERPLATE_PATTERNS = [
  /An AI Overview is not available for this search/gi,
  /Can't generate an AI overview right now\.?\s*Try again later\.?/gi,
  /\bAI Overview\b/gi,
  /\bView all\b/gi
];
// Markers at which an answer is truncated: cutAtFirstMarker keeps only the
// text before the earliest match of any of these. None are global, so exec()
// always searches from the start of the string.
var CUT_MARKERS = [
  /\bRelated Links\b/i,
  /\bAsk anything in\s*AI Mode\b/i,
  /\bAI can make mistakes\b/i,
  /\bThis is for informational purposes only\b/i,
  /\bShow more\b/i,
  /\b\d+\s+sites\b/i,
  // One or two digits, an m/s/h unit, then a capitalised word (case-sensitive).
  /\b\d{1,2}\s*[msh]\s*[A-Z][A-Za-z]/,
  /\b(?:YouTube|Reddit|Facebook|Instagram|TikTok)·/
];
965
/**
 * Repair spacing in text whose words have been glued together, then collapse
 * all whitespace runs to single spaces and trim.
 *
 * A space is inserted after sentence punctuation or a colon/semicolon that is
 * directly followed by a capital, between a lower-case letter or digit and a
 * following capitalised word, and between a lower-case letter and a digit.
 * Non-breaking spaces become ordinary spaces.
 *
 * @param {string} text - Raw text.
 * @returns {string} Text with normalised spacing.
 */
function normalizeWhitespace(text) {
  // Applied in order; later rules see the output of earlier ones.
  const rewrites = [
    [/\u00a0/g, " "],
    [/([.!?])([A-Z])/g, "$1 $2"],
    [/([:;])([A-Z])/g, "$1 $2"],
    [/([a-z])([A-Z][a-z])/g, "$1 $2"],
    [/(\d)([A-Z][a-z])/g, "$1 $2"],
    [/([a-z])(\d)/g, "$1 $2"],
    [/\s+/g, " "]
  ];
  let out = text;
  for (const [pattern, replacement] of rewrites) {
    out = out.replace(pattern, replacement);
  }
  return out.trim();
}
968
/**
 * Truncate text at the earliest position where any CUT_MARKERS pattern
 * matches. Text without a marker is returned unchanged.
 *
 * @param {string} text - Text to truncate.
 * @returns {string} The text before the earliest marker.
 */
function cutAtFirstMarker(text) {
  const positions = [];
  for (const marker of CUT_MARKERS) {
    const found = marker.exec(text);
    // Keep the pattern reusable even if it carries state.
    marker.lastIndex = 0;
    if (found) positions.push(found.index);
  }
  if (positions.length === 0) return text;
  return text.slice(0, Math.min(...positions));
}
977
/**
 * Drop the tail of an answer starting where the source page title appears
 * (case-insensitively).
 *
 * Titles shorter than eight characters are ignored, and a cut is made only
 * when the title starts after index 40.
 *
 * @param {string} text - Answer text.
 * @param {string|null|undefined} sourceTitle - Title of the cited source.
 * @returns {string} The possibly truncated text.
 */
function cutAtSourceTitle(text, sourceTitle) {
  const trimmed = sourceTitle?.trim();
  const usable = Boolean(trimmed) && trimmed.length >= 8;
  if (!usable) return text;

  const at = text.toLowerCase().indexOf(trimmed.toLowerCase());
  if (at > 40) return text.slice(0, at);
  return text;
}
983
/**
 * Given the text that precedes a URL, work out where the real answer ends
 * and trailing source attribution begins.
 *
 * Checks, in order:
 *   1. a bullet or middle dot followed by a "Mon D, YYYY" date located after
 *      index 40 — cut at the bullet;
 *   2. within the last 260 characters, the first sentence break followed by
 *      text that starts with a title-like word (Best, Top, What, How, ...)
 *      and contains a title separator or keyword in its first 160 characters
 *      — cut just after the punctuation;
 *   3. the last sentence break in that window — cut just after it;
 *   4. otherwise keep everything.
 *
 * @param {string} beforeUrl - Text preceding the URL.
 * @returns {number} Index at which to truncate `beforeUrl`.
 */
function findAttributionCut(beforeUrl) {
  const dated = beforeUrl.match(/[•·]\s*(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{1,2},\s+\d{4}/i);
  if (dated !== null && dated.index > 40) return dated.index;

  const windowStart = Math.max(0, beforeUrl.length - 260);
  const windowText = beforeUrl.slice(windowStart);
  const breaks = Array.from(windowText.matchAll(/[.!?]\s*(?=[A-Z][A-Za-z0-9"'$])/g));

  const titleBreak = breaks.find((hit) => {
    const rest = windowText.slice(hit.index + 1).trim();
    if (rest.length <= 20) return false;
    if (!/^(?:Best|Top|What|How|Why|When|Where|Which|Can|Should|Is|Are|Do|Does)\b/i.test(rest)) return false;
    return /(?:\s[-|]\s|Heating|Cooling|Company|Services|Blog|Guide|Review)/i.test(rest.slice(0, 160));
  });
  if (titleBreak) return windowStart + titleBreak.index + 1;

  if (breaks.length > 0) {
    return windowStart + breaks[breaks.length - 1].index + 1;
  }
  return beforeUrl.length;
}
1001
/**
 * Remove a trailing "title + URL" attribution from an answer.
 *
 * The text is cut at the first http(s) URL, and the part before it is then
 * trimmed to the position chosen by findAttributionCut. Text with no URL, or
 * with the URL at the very start, is returned unchanged.
 *
 * @param {string} text - Answer text.
 * @returns {string} Text without the attribution tail.
 */
function cutAtUrlAttribution(text) {
  const urlStart = text.search(/https?:\/\/\S+/i);
  // -1: no URL; 0: URL is the first thing in the text. Both leave text as-is.
  if (urlStart <= 0) return text;
  const lead = text.slice(0, urlStart);
  return lead.slice(0, findAttributionCut(lead));
}
1007
/**
 * Cap text at MAX_ANSWER_LENGTH characters, preferring to end on a sentence.
 *
 * When the capped slice contains a ".", "!" or "?" beyond index 240, the text
 * is cut just after the last such mark. Otherwise the hard slice is used.
 * The result is trimmed.
 *
 * @param {string} text - Text to cap.
 * @returns {string} Text no longer than MAX_ANSWER_LENGTH.
 */
function trimToSentenceLimit(text) {
  if (text.length <= MAX_ANSWER_LENGTH) return text;

  const head = text.slice(0, MAX_ANSWER_LENGTH);
  let lastMark = -1;
  for (const mark of [".", "!", "?"]) {
    lastMark = Math.max(lastMark, head.lastIndexOf(mark));
  }
  const kept = lastMark > 240 ? head.slice(0, lastMark + 1) : head;
  return kept.trim();
}
1013
/**
 * Clean the raw text scraped from a "People also ask" answer panel.
 *
 * Steps, in order: normalise spacing; strip a leading copy of the question;
 * reject Google's generic error message; remove boilerplate phrases and
 * "Name+N" / "domain.tld+N" source chips; truncate at UI markers, at the
 * source title and at a trailing URL attribution; drop a trailing date;
 * cap the length.
 *
 * @param {string|null|undefined} answer - Raw answer text.
 * @param {string|null|undefined} question - The question, used to strip an echoed prefix.
 * @param {string|null|undefined} sourceTitle - Title of the cited source page.
 * @returns {string|undefined} Cleaned answer, or undefined when nothing usable remains.
 */
function cleanPAAAnswerText(answer, question, sourceTitle) {
  if (!answer) return void 0;

  const errorAtStart = /^An error has occurred\.?\s*Please try again later\.?/i;
  const errorOnly = /^An error has occurred\.?\s*Please try again later\.?$/i;

  let cleaned = normalizeWhitespace(answer);

  // Some panels repeat the question at the start of the answer text.
  const questionText = question ? normalizeWhitespace(question) : "";
  if (questionText && cleaned.toLowerCase().startsWith(questionText.toLowerCase())) {
    cleaned = cleaned.slice(questionText.length).trim();
  }
  if (errorAtStart.test(cleaned)) {
    return void 0;
  }

  cleaned = BOILERPLATE_PATTERNS.reduce((acc, pattern) => acc.replace(pattern, " "), cleaned);

  // Source chips such as "Acme Heating+3" or "example.com+2".
  cleaned = cleaned.replace(/\b[A-Z][A-Za-z&'\u2019 -]{2,60}\+\d+\b/g, " ");
  cleaned = cleaned.replace(/\b(?:[a-z0-9-]+\.)+[a-z]{2,}\+\d+\b/gi, " ");

  cleaned = normalizeWhitespace(cleaned);
  cleaned = cutAtFirstMarker(cleaned);
  cleaned = cutAtSourceTitle(cleaned, sourceTitle);
  cleaned = cutAtUrlAttribution(cleaned);
  cleaned = normalizeWhitespace(cleaned);
  cleaned = cleaned.replace(/\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{1,2},\s+\d{4}$/i, "").trim();
  cleaned = trimToSentenceLimit(cleaned);

  if (!cleaned || errorOnly.test(cleaned)) return void 0;
  return cleaned;
}
1038
+
1039
+ // src/extractor/ai-surfaces.ts
1040
/**
 * Extract Google's "AI Overview" (AIO) and "AI Mode" (AIM) content from the
 * current document.
 *
 * The function reads `window` and `document` directly and defines all of its
 * helpers inline, so it must run in a browser page.
 * NOTE(review): presumably it is passed to a page-evaluate call and so has to
 * stay self-contained — confirm against the caller before extracting helpers.
 *
 * @param {object} [config] - Selector overrides with the same shape as the
 *   default object below (`aio`, `aim`, `expandWaitMs`). When omitted (null or
 *   undefined), the built-in defaults are used as a whole.
 * @returns {Promise<{surface: string, aiOverview: object, aiMode: object}>}
 *   `surface` is "aim", "web" or "unknown"; `aiOverview` holds
 *   detected/text/citations/expanded/fullyExpanded/sections; `aiMode` holds
 *   detected/text/citations.
 */
async function extractAISurfacesFromDocument(config) {
  const selectors = config ?? {
    aio: {
      root: "[data-lhcontainer][data-streaming-container][eid]",
      legacyRoot: '[data-hveid="CBMQAA"]',
      wrapper: ".Fgyi2e",
      controller: '[jscontroller="AkrxPe"]',
      contentSubtree: '[data-subtree="mfc"]',
      heading: ".Fzsovc.cwYVJe.RJPOee",
      header: ".heWuVc",
      showMoreButton: '[aria-label="Show more AI Overview"]',
      sourcesPanel: ".OZ9ddf.WAUd4",
      disclaimer: ".DuQANe.MSJHRb"
    },
    aim: {
      root: '[data-hveid="CAUQAA"]',
      wrapper: ".Fgyi2e"
    },
    expandWaitMs: 1500
  };
  // window.google.sn is read as the surface identifier; anything other than
  // "aim" or "web" is reported as "unknown".
  const sn = window.google?.sn ?? "unknown";
  const surface = sn === "aim" ? "aim" : sn === "web" ? "web" : "unknown";
  // Trimmed visible text of an element ("" for a missing element).
  function textOf(el) {
    if (!el) return "";
    return (el.innerText ?? el.textContent ?? "").trim();
  }
  // True when the element carries an "AI Overview" label: exact match on the
  // heading, exact match on one part of the header, or anywhere in its text.
  function hasAIOverviewLabel(el) {
    const heading = el.querySelector(selectors.aio.heading);
    if (textOf(heading) === "AI Overview") return true;
    const header = el.querySelector(selectors.aio.header);
    if (textOf(header).split(/\n|\s{2,}/).some((part) => part.trim() === "AI Overview")) return true;
    return textOf(el).includes("AI Overview");
  }
  // Locate the AI Overview container. Preference order: a labelled primary
  // root, the first primary root, the legacy root, then a walk up (at most
  // eight ancestors) from any heading whose text is exactly "AI Overview".
  function findAIORoot() {
    const primaryRoots = Array.from(document.querySelectorAll(selectors.aio.root));
    const labeledPrimary = primaryRoots.find(hasAIOverviewLabel);
    if (labeledPrimary) return labeledPrimary;
    if (primaryRoots.length > 0) return primaryRoots[0];
    if (selectors.aio.legacyRoot) {
      const legacy = document.querySelector(selectors.aio.legacyRoot);
      if (legacy) return legacy;
    }
    const headings = document.querySelectorAll(`${selectors.aio.heading}, h1, h2, h3, [role="heading"]`);
    for (const h of headings) {
      if (textOf(h) !== "AI Overview") continue;
      let el = h.parentElement;
      for (let i = 0; i < 8 && el; i++) {
        if (el.matches(selectors.aio.root) || el.querySelector(selectors.aio.controller) || el.querySelector(selectors.aio.contentSubtree)) {
          return el;
        }
        el = el.parentElement;
      }
      // No recognisable container among the ancestors: fall back to the
      // heading's direct parent.
      return h.parentElement;
    }
    return null;
  }
  // Return the readable text of `target` with UI chrome removed, or null when
  // there is no target, no text, or the text looks like an unavailability
  // notice.
  function cleanText(target) {
    if (!target) return null;
    // Work on a clone so the live page is not modified by the removals.
    const clone = target.cloneNode(true);
    clone.querySelectorAll([
      "script",
      "style",
      "noscript",
      "img",
      "picture",
      "video",
      selectors.aio.header,
      selectors.aio.showMoreButton,
      selectors.aio.sourcesPanel,
      selectors.aio.disclaimer,
      '[data-subtree="dfa"]',
      "[data-src-id]",
      '[role="dialog"]',
      ".HWMcu",
      ".bTFeG",
      ".CyMdWb",
      ".MFrAxb",
      ".F0OfWd.hfWAgb",
      ".x2qcTc.fZavHb",
      ".SvjEff",
      ".sR2MY",
      ".lKuDef",
      ".GSPQcc",
      "a[href]",
      "button",
      '[role="button"]'
    ].join(",")).forEach((el) => el.remove());
    // Temporarily attach the clone off-screen, at roughly the original width,
    // before reading innerText; the holder is removed again straight after.
    const holder = document.createElement("div");
    holder.style.position = "fixed";
    holder.style.left = "-10000px";
    holder.style.top = "0";
    holder.style.width = `${Math.max(320, Math.round(target.getBoundingClientRect?.().width || 960))}px`;
    holder.style.opacity = "0";
    holder.style.pointerEvents = "none";
    holder.append(clone);
    document.body.append(holder);
    const rendered = clone.innerText || clone.textContent || "";
    holder.remove();
    // Normalise line endings and spacing, then split into non-empty lines.
    const lines = rendered.replace(/\r/g, "").replace(/[ \t]+\n/g, "\n").replace(/\n[ \t]+/g, "\n").replace(/\n{3,}/g, "\n\n").replace(/[ \t]{2,}/g, " ").trim().split("\n").map((line) => line.replace(/\u00a0/g, " ").trim()).filter(Boolean);
    const filteredLines = [];
    for (let i = 0; i < lines.length; i++) {
      const line = lines[i];
      const next = lines[i + 1] ?? "";
      if (line === "AI Overview") continue;
      if (line === "Show more") continue;
      if (/^AI can make mistakes/i.test(line)) continue;
      if (/^Thank you\b/i.test(line)) continue;
      if (/^Your feedback helps Google improve/i.test(line)) continue;
      // Bare counters such as "3" or "+3".
      if (/^\+?\d+$/.test(line)) continue;
      // A short line followed by "+N" is dropped together with the counter.
      if (/^\+\d+$/.test(next) && line.length <= 80) {
        i++;
        continue;
      }
      filteredLines.push(line);
    }
    const raw = filteredLines.join("\n").replace(/\n{3,}/g, "\n\n").trim();
    // NOTE(review): this rejects the whole text if any of these phrases occurs
    // anywhere in it, including inside otherwise valid content.
    if (!raw || /not available|try again|can't generate/i.test(raw)) return null;
    return raw;
  }
  // Resolve an href to an absolute external http(s) URL. Google redirect
  // links are unwrapped via their "q"/"url" parameter; links that still point
  // at a google.* host, non-http(s) links and unparsable links yield null.
  function normalizeHref(rawHref) {
    if (!rawHref || rawHref.startsWith("javascript:")) return null;
    let href = rawHref;
    try {
      const absolute = new URL(rawHref, window.location.href);
      const q = absolute.searchParams.get("q") ?? absolute.searchParams.get("url");
      if (/(\.|^)google\./i.test(absolute.hostname) && q?.startsWith("http")) {
        href = q;
      } else {
        href = absolute.href;
      }
    } catch {
      return null;
    }
    if (!/^https?:\/\//i.test(href)) return null;
    try {
      const url = new URL(href);
      const isGoogleInternal = /(\.|^)google\./i.test(url.hostname);
      if (isGoogleInternal) return null;
      return url.href;
    } catch {
      return null;
    }
  }
  // Collect unique external links under `root` as { text, href }, in document
  // order. The label falls back to the host name (without "www.") and then to
  // the URL itself.
  function extractCitations(root) {
    if (!root) return [];
    const seen = /* @__PURE__ */ new Set();
    const citations = [];
    for (const a of Array.from(root.querySelectorAll("a[href]"))) {
      const href = normalizeHref(a.getAttribute("href") ?? "");
      if (!href || seen.has(href)) continue;
      seen.add(href);
      let fallbackHost = "";
      try {
        fallbackHost = new URL(href).hostname.replace(/^www\./, "");
      } catch {
      }
      citations.push({
        text: textOf(a) || fallbackHost || href,
        href
      });
    }
    return citations;
  }
  // Click the "show more" button if it exists and reports
  // aria-expanded="false", then wait expandWaitMs. Resolves to true when a
  // click was made.
  async function maybeExpand(root) {
    const button = root.querySelector(selectors.aio.showMoreButton);
    if (!button || button.getAttribute("aria-expanded") !== "false") return false;
    button.click();
    const waitMs = selectors.expandWaitMs ?? 1500;
    if (waitMs > 0) await new Promise((resolve) => setTimeout(resolve, waitMs));
    return true;
  }
  const aioRoot = findAIORoot();
  let aioText = null;
  let aioCitations = [];
  let aioExpanded = false;
  let aioFullyExpanded = false;
  let aioSections = [];
  if (aioRoot) {
    // Expand first; the elements below are queried after the click and wait.
    aioExpanded = await maybeExpand(aioRoot);
    const controller = aioRoot.querySelector(selectors.aio.controller);
    const contentSubtree = aioRoot.querySelector(selectors.aio.contentSubtree);
    const showMore = aioRoot.querySelector(selectors.aio.showMoreButton);
    // Treated as fully expanded when the controller reports data-trnct="false",
    // the button reports aria-expanded="true", or there is no button at all.
    aioFullyExpanded = controller?.getAttribute("data-trnct") === "false" || showMore?.getAttribute("aria-expanded") === "true" || !showMore;
    aioText = cleanText(contentSubtree ?? controller ?? aioRoot);
    // Sections are the lines that look like numbered headings ("1. Title").
    aioSections = (aioText ?? "").split("\n").map((line) => line.trim()).filter((line) => /^\d+\.\s+.+/.test(line));
    aioCitations = extractCitations(aioRoot);
  }
  const aimRoot = document.querySelector(selectors.aim.root);
  const aimDetected = surface === "aim" && !!aimRoot;
  const aimContainer = aimRoot?.closest(selectors.aim.wrapper) ?? aimRoot;
  // NOTE(review): aimText is computed whenever the AIM root exists, even when
  // the surface is not "aim", so aiMode.text can be non-null while
  // aiMode.detected is false.
  const aimText = cleanText(aimContainer);
  const aimCitations = aimDetected ? extractCitations(aimContainer) : [];
  return {
    surface,
    aiOverview: {
      detected: !!aioRoot && aioText !== null,
      text: aioText,
      citations: aioCitations,
      expanded: aioExpanded,
      fullyExpanded: aioFullyExpanded,
      sections: aioSections
    },
    aiMode: {
      detected: aimDetected && aimText !== null,
      text: aimText,
      citations: aimCitations
    }
  };
}
1249
+
1250
+ // src/extractor/PAAExtractor.ts
1251
// User-agent string for a desktop browser (Chrome 124 on macOS).
var DESKTOP_USER_AGENT2 = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36";
// User-agent string for a mobile browser (Safari on iOS 17.5).
var MOBILE_USER_AGENT2 = "Mozilla/5.0 (iPhone; CPU iPhone OS 17_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Mobile/15E148 Safari/604.1";
1253
+ var PAAExtractor = class {
1254
/**
 * @param {object} driver - Stored on the instance; not used by the methods
 *   visible in this part of the file.
 * @param {object} reporter - Progress sink; runBFS calls `onDepth` and
 *   `onQuestion` on it.
 */
constructor(driver, reporter) {
  this.driver = driver;
  this.reporter = reporter;
}
// Field declarations for the properties assigned in the constructor.
driver;
reporter;
1260
+ normalizeQuestion(q) {
1261
+ return q.toLowerCase().replace(/[^\w\s]/g, "").replace(/\s+/g, " ").trim();
1262
+ }
1263
+ throwIfAborted(signal) {
1264
+ if (!signal?.aborted) return;
1265
+ if (signal.reason instanceof DOMException && signal.reason.name === "TimeoutError") throw signal.reason;
1266
+ throw new RequestAbortedError();
1267
+ }
1268
+ async throwIfCaptcha(page, context) {
1269
+ const captchaCount = await page.locator(PAASelectors.captchaMarker).count().catch(() => 0);
1270
+ if (captchaCount > 0) {
1271
+ throw new CaptchaError(`${context} returned a CAPTCHA \u2014 retrying with a fresh session.`);
1272
+ }
1273
+ }
1274
/**
 * Read every PAA item currently in the DOM and return the ones that pass
 * schema validation.
 *
 * Raw fields are collected inside the page, then each answer is cleaned with
 * cleanPAAAnswerText and the item is validated with RawPAAItemSchema. Items
 * that fail validation are logged with console.warn and dropped.
 *
 * @param {object} page - Page exposing `evaluate(fn, arg)`.
 * @returns {Promise<object[]>} Validated items (the schema's parsed output).
 */
async extractVisibleItems(page) {
  const sels = PAASelectors;
  // This callback runs in the page: it can only use `selectors` (passed in
  // as the argument) and DOM globals.
  const raw = await page.evaluate((selectors) => {
    // Concatenate the text nodes under `el`, skipping <style> and <script>
    // content, with whitespace collapsed.
    function cleanText(el) {
      if (!el) return "";
      const parts = [];
      for (const n of el.childNodes) {
        if (n.nodeType === Node.TEXT_NODE) {
          const text = n.textContent?.trim();
          if (text) parts.push(text);
        } else if (n.tagName === "STYLE" || n.tagName === "SCRIPT") {
          continue;
        } else {
          const text = cleanText(n);
          if (text) parts.push(text);
        }
      }
      return parts.join(" ").replace(/\s+/g, " ").trim();
    }
    return Array.from(document.querySelectorAll(selectors.item)).map((pair) => ({
      // The question comes from one of two attributes; "" when neither is set.
      question: pair.getAttribute(selectors.itemDataQ) || pair.getAttribute(selectors.itemDataInitQ) || "",
      answer: cleanText(pair.querySelector(selectors.answerContainer)) || void 0,
      sourceTitle: pair.querySelector(selectors.sourceTitle)?.innerText?.trim() || void 0,
      sourceSite: pair.querySelector(selectors.sourceSite)?.innerText?.trim() || void 0,
      sourceCite: pair.querySelector(selectors.sourceCite)?.innerText?.trim() || void 0
    }));
  }, sels);
  // flatMap lets invalid items vanish by returning an empty array for them.
  return raw.flatMap((item) => {
    const cleaned = {
      ...item,
      answer: cleanPAAAnswerText(item.answer, item.question, item.sourceTitle)
    };
    const result = RawPAAItemSchema.safeParse(cleaned);
    if (!result.success) {
      console.warn("[PAAExtractor] item parse failed:", item.question, result.error.issues[0]?.message);
      return [];
    }
    return [result.data];
  });
}
1314
+ async clickItem(page, questionText) {
1315
+ try {
1316
+ const pairLocator = page.locator(
1317
+ `${PAASelectors.item}[data-q="${questionText}"], ${PAASelectors.item}[data-initq="${questionText}"]`
1318
+ ).first();
1319
+ await pairLocator.click();
1320
+ } catch {
1321
+ }
1322
+ }
1323
+ toFlatRow(item, depth, parentQuestion, seed) {
1324
+ return {
1325
+ seed_query: seed,
1326
+ question: item.question,
1327
+ answer: item.answer ?? "",
1328
+ source_title: item.sourceTitle ?? "",
1329
+ source_site: item.sourceSite ?? "",
1330
+ source_cite: item.sourceCite ?? "",
1331
+ depth,
1332
+ parent_question: parentQuestion ?? "",
1333
+ extracted_at: (/* @__PURE__ */ new Date()).toISOString()
1334
+ };
1335
+ }
1336
+ async runBFS(page, options, signal) {
1337
+ const seenKeys = /* @__PURE__ */ new Set();
1338
+ const seenQs = /* @__PURE__ */ new Set();
1339
+ const orderedQs = [];
1340
+ const results = [];
1341
+ const readAllQs = () => page.evaluate(
1342
+ ({ sel, dataQ, dataInitQ, questionEl }) => Array.from(document.querySelectorAll(sel)).map(
1343
+ (el) => el.getAttribute(dataQ) || el.getAttribute(dataInitQ) || el.querySelector(questionEl)?.innerText?.trim() || ""
1344
+ ).filter(Boolean),
1345
+ { sel: PAASelectors.item, dataQ: PAASelectors.itemDataQ, dataInitQ: PAASelectors.itemDataInitQ, questionEl: PAASelectors.itemQuestionEl }
1346
+ );
1347
+ let round = 0;
1348
+ while (seenQs.size < options.maxQuestions) {
1349
+ this.throwIfAborted(signal);
1350
+ await this.throwIfCaptcha(page, "Google PAA expansion");
1351
+ const beforeQs = await readAllQs();
1352
+ if (beforeQs.length >= options.maxQuestions) break;
1353
+ const unexpandedSel = `${PAASelectors.item}:not(.${PAASelectors.expandedClass}) ${PAASelectors.clickTarget}`;
1354
+ const unexpandedCount = await page.locator(unexpandedSel).count();
1355
+ if (unexpandedCount === 0) break;
1356
+ this.reporter.onDepth(++round);
1357
+ for (let ci = 0; ci < unexpandedCount; ci++) {
1358
+ this.throwIfAborted(signal);
1359
+ try {
1360
+ const btn = page.locator(unexpandedSel).first();
1361
+ await btn.scrollIntoViewIfNeeded();
1362
+ await btn.hover({ force: true });
1363
+ await page.waitForTimeout(100);
1364
+ await btn.click({ force: true });
1365
+ await page.waitForTimeout(500);
1366
+ } catch {
1367
+ }
1368
+ }
1369
+ await page.waitForFunction(
1370
+ ({ sel, min }) => document.querySelectorAll(sel).length > min,
1371
+ { sel: PAASelectors.item, min: beforeQs.length },
1372
+ { timeout: 5e3 }
1373
+ ).catch(() => {
1374
+ });
1375
+ await this.throwIfCaptcha(page, "Google PAA expansion");
1376
+ const afterQs = await readAllQs();
1377
+ if (afterQs.length === beforeQs.length) break;
1378
+ for (const q of afterQs) {
1379
+ if (!seenQs.has(q)) {
1380
+ seenQs.add(q);
1381
+ orderedQs.push(q);
1382
+ }
1383
+ }
1384
+ }
1385
+ const itemMap = new Map((await this.extractVisibleItems(page)).map((i) => [i.question, i]));
1386
+ for (const q of orderedQs) {
1387
+ if (results.length >= options.maxQuestions) break;
1388
+ const key = this.normalizeQuestion(q);
1389
+ if (seenKeys.has(key)) continue;
1390
+ seenKeys.add(key);
1391
+ const item = itemMap.get(q);
1392
+ if (item) {
1393
+ results.push(this.toFlatRow(item, 1, null, options.query));
1394
+ this.reporter.onQuestion({ question: item.question, answer: item.answer ?? null, sourceTitle: item.sourceTitle ?? null, sourceSite: item.sourceSite ?? null, sourceCite: item.sourceCite ?? null, depth: 1, parentQuestion: null, children: [] });
1395
+ } else {
1396
+ results.push(this.toFlatRow({ question: q, answer: void 0, sourceTitle: void 0, sourceSite: void 0, sourceCite: void 0 }, 1, null, options.query));
1397
+ }
1398
+ }
1399
+ return results;
1400
+ }
1401
/**
 * Extract YouTube entries from the video sections of the current page.
 *
 * Only links whose href contains "youtube" or "youtu.be" and whose text
 * contains the word "YouTube" are kept. The text before "YouTube" is taken as
 * the title and the text after it (up to the next middle dot or newline) as
 * the channel. `duration` is always returned as an empty string.
 *
 * @param {object} page - Page exposing `evaluate(fn, arg)`.
 * @returns {Promise<Array<{type: string, title: string, channel: string, platform: string, duration: string, url: string}>>}
 */
async extractVideos(page) {
  const vsels = VideoSelectors;
  // This callback runs in the page and can only use `sels` and DOM globals.
  return page.evaluate((sels) => {
    const results = [];
    const containers = Array.from(document.querySelectorAll(sels.container));
    for (const container of containers) {
      const headingEl = container.querySelector(sels.sectionHeading);
      const headingText = headingEl?.textContent?.trim() ?? "";
      // A section heading containing "short" marks its entries as short videos.
      const type = headingText.toLowerCase().includes("short") ? "short_video" : "video";
      const items = Array.from(container.querySelectorAll(sels.item));
      for (const a of items) {
        const href = a.href;
        if (!href || !href.includes("youtube") && !href.includes("youtu.be")) continue;
        const raw = a.textContent?.trim() ?? "";
        const ytIdx = raw.indexOf("YouTube");
        if (ytIdx === -1) continue;
        const title = raw.slice(0, ytIdx).trim();
        // 7 === "YouTube".length; then strip leading separators.
        const remainder = raw.slice(ytIdx + 7).replace(/^[·\s·]+/, "");
        const channelMatch = remainder.match(/^([^·\n]+)/);
        const channel = channelMatch ? channelMatch[1].trim() : "";
        if (title) results.push({ type, title, channel, platform: "YouTube", duration: "", url: href });
      }
    }
    return results;
  }, vsels);
}
1427
/**
 * Extract forum/discussion links from the current page.
 *
 * The first section whose text contains "Discussions" is used. Each link in
 * it that has an href and a non-empty title becomes one result.
 *
 * @param {object} page - Page exposing `evaluate(fn, arg)`.
 * @returns {Promise<Array<{title: string, source: string, url: string}>>}
 *   Empty when no such section exists.
 */
async extractForums(page) {
  const fsels = ForumSelectors;
  // This callback runs in the page and can only use `sels` and DOM globals.
  return page.evaluate((sels) => {
    const results = [];
    const sections = Array.from(document.querySelectorAll(sels.section));
    const forumSection = sections.find((s) => s.textContent?.includes("Discussions"));
    if (!forumSection) return results;
    const items = Array.from(forumSection.querySelectorAll(sels.item));
    for (const a of items) {
      const href = a.href;
      if (!href) continue;
      const titleEl = a.querySelector(sels.title);
      const sourceEl = a.querySelector(sels.source);
      const title = titleEl?.textContent?.trim() ?? "";
      const source = sourceEl?.textContent?.trim() ?? "";
      if (title) results.push({ title, source, url: href });
    }
    return results;
  }, fsels);
}
1447
/**
 * Navigate to a short-video search URL and extract the listed videos.
 *
 * Navigation failures yield an empty list. A CAPTCHA on the loaded page
 * throws. Links are grouped by href because, as the parsing below assumes,
 * one video can appear as several anchors (e.g. one holding the duration and
 * one holding the title).
 *
 * @param {object} page - Page to navigate and evaluate in.
 * @param {string} shortUrl - URL of the short-video results page.
 * @returns {Promise<Array<{type: string, title: string, channel: string, platform: string, duration: string, url: string}>>}
 *   Every entry has `type: "short_video"`.
 * @throws {CaptchaError} When the loaded page shows a CAPTCHA marker.
 */
async extractShortVideos(page, shortUrl) {
  try {
    await page.goto(shortUrl, { waitUntil: "domcontentloaded" });
    // Fixed pause after navigation before reading the DOM.
    await page.waitForTimeout(1500);
  } catch {
    return [];
  }
  await this.throwIfCaptcha(page, "Google short video search");
  // Pass a plain object with a copied array into the page callback.
  const svSels = {
    item: ShortVideoSelectors.item,
    platforms: [...ShortVideoSelectors.platforms]
  };
  // This callback runs in the page and can only use `sels` and DOM globals.
  const raw = await page.evaluate((sels) => {
    const seen = /* @__PURE__ */ new Set();
    const results = [];
    const items = Array.from(document.querySelectorAll(sels.item));
    const videoHosts = ["youtube.com", "youtu.be", "tiktok.com", "instagram.com", "facebook.com", "fb.watch"];
    // href -> text of every anchor pointing at it, in document order.
    const byHref = /* @__PURE__ */ new Map();
    for (const a of items) {
      const href = a.href;
      if (!href) continue;
      if (!videoHosts.some((h) => href.includes(h))) continue;
      const text = a.textContent?.trim() ?? "";
      if (!byHref.has(href)) byHref.set(href, []);
      byHref.get(href).push(text);
    }
    for (const [href, texts] of byHref.entries()) {
      if (seen.has(href)) continue;
      seen.add(href);
      // A text of the form "m:ss" is the duration; the first other text longer
      // than five characters is treated as the title line.
      const duration = texts.find((t) => /^\d+:\d+$/.test(t)) ?? "";
      const titleText = texts.find((t) => !/^\d+:\d+$/.test(t) && t.length > 5) ?? "";
      if (!titleText) continue;
      let title = titleText;
      let platform = "";
      let channel = "";
      for (const p of sels.platforms) {
        // Find the LAST occurrence of the platform name, so a title that
        // mentions the platform itself is not split too early.
        let lastIdx = -1;
        let search = 0;
        while (true) {
          const found = titleText.indexOf(p, search);
          if (found === -1) break;
          lastIdx = found;
          search = found + 1;
        }
        if (lastIdx === -1) continue;
        const after = titleText.slice(lastIdx + p.length);
        // Only accept it as the source tag when followed by whitespace, a
        // middle dot, or nothing.
        const isSourceTag = /^[\s·]/.test(after) || after.trim() === "";
        if (!isSourceTag) continue;
        title = titleText.slice(0, lastIdx).trim();
        platform = p;
        const stripped = after.replace(/^[\s·]+/, "");
        // The channel is whatever precedes the next middle dot (U+00B7).
        const dotIdx = stripped.indexOf("\xB7");
        channel = (dotIdx === -1 ? stripped : stripped.slice(0, dotIdx)).trim();
        break;
      }
      if (title) results.push({ title, channel, platform, duration, url: href });
    }
    return results;
  }, svSels);
  return raw.map((r) => ({ type: "short_video", ...r }));
}
1508
+ async extractWhatPeopleSaying(page) {
1509
+ const sels = WhatPeopleSayingSelectors;
1510
+ return page.evaluate((s) => {
1511
+ const section = Array.from(document.querySelectorAll(s.sectionTag)).find((el) => el.textContent?.includes(s.sectionHeadingText)) ?? document.querySelector(".yG4QQe.TBC9ub.NbhJ1c");
1512
+ if (!section) return [];
1513
+ return Array.from(section.querySelectorAll(s.card)).map((card) => {
1514
+ const link = card.querySelector(s.cardLink);
1515
+ const url = link?.href ?? "";
1516
+ const titleH1 = card.querySelector(s.titleH1)?.textContent?.trim();
1517
+ const titleDiv = card.querySelector(s.titleDiv)?.textContent?.trim();
1518
+ const title = titleH1 ?? titleDiv ?? "";
1519
+ const sourceText = card.querySelector(s.source)?.textContent?.trim() ?? "";
1520
+ const platformEl = card.querySelector(s.platformBadge);
1521
+ const platformText = platformEl?.textContent?.trim() ?? "";
1522
+ const ytChannel = card.querySelector(s.ytChannel)?.textContent?.trim() ?? "";
1523
+ const ytDate = card.querySelector(s.ytDate)?.textContent?.trim() ?? "";
1524
+ const authorNote = card.querySelector(s.authorNote)?.textContent?.trim() ?? null;
1525
+ const commentLabelEl = card.querySelector(s.popularCommentLabel);
1526
+ let popularComment = null;
1527
+ if (commentLabelEl) {
1528
+ let next = commentLabelEl.nextSibling;
1529
+ while (next) {
1530
+ const t = next.textContent?.trim();
1531
+ if (t) {
1532
+ popularComment = t;
1533
+ break;
1534
+ }
1535
+ next = next.nextSibling;
1536
+ }
1537
+ }
1538
+ const allSpans = Array.from(card.querySelectorAll("span"));
1539
+ const duration = allSpans.find((s2) => /^\d+:\d+$/.test(s2.textContent?.trim() ?? ""))?.textContent?.trim() ?? null;
1540
+ const engagementParts = allSpans.map((s2) => s2.textContent?.trim() ?? "").filter(
1541
+ (t) => /\d/.test(t) && (t.includes("comment") || t.includes("reaction") || t.includes("view") || t.includes("like") || t.includes("share"))
1542
+ );
1543
+ const engagement = engagementParts[0] ?? "";
1544
+ const dateCandidates = allSpans.map((s2) => s2.textContent?.trim() ?? "").filter((t) => /\d+ (day|week|month|year|hour)s? ago|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec/.test(t));
1545
+ const date = ytDate || (dateCandidates[0] ?? "");
1546
+ const platform = platformText || (ytChannel ? "YouTube" : "");
1547
+ const source = ytChannel || sourceText;
1548
+ let type = "unknown";
1549
+ const pl = platform.toLowerCase();
1550
+ const src = source.toLowerCase();
1551
+ const srcRaw = sourceText.toLowerCase();
1552
+ if (pl.includes("reddit") || src.startsWith("r/")) type = "reddit";
1553
+ else if (pl.includes("facebook") || srcRaw.includes("facebook")) type = "facebook";
1554
+ else if (pl.includes("instagram") || srcRaw.includes("instagram")) type = "instagram";
1555
+ else if (pl.includes("tiktok") || srcRaw.includes("tiktok")) type = "tiktok";
1556
+ else if (pl.includes("youtube") || !!ytChannel) type = "youtube";
1557
+ else type = "news";
1558
+ return { type, title, url, source, platform, popularComment, engagement, date, duration, authorNote };
1559
+ });
1560
+ }, sels);
1561
+ }
1562
+ async extractOrganicResults(page) {
1563
+ const sels = OrganicSelectors;
1564
+ return page.evaluate((s) => {
1565
+ const out = [];
1566
+ let pos = 0;
1567
+ document.querySelectorAll(s.result).forEach((card) => {
1568
+ const titleEl = card.querySelector(s.title);
1569
+ if (!titleEl) return;
1570
+ const title = titleEl.textContent?.trim() ?? "";
1571
+ const linkEl = titleEl.closest("a");
1572
+ const url = linkEl?.href ?? "";
1573
+ if (!title || !url) return;
1574
+ pos++;
1575
+ const cite = card.querySelector(s.cite)?.textContent?.trim() ?? null;
1576
+ const snippet = card.querySelector(s.snippet)?.textContent?.trim() ?? null;
1577
+ const isRedditStyle = !!card.querySelector(s.redditCite);
1578
+ const ratingEl = card.querySelector(s.ratingWrap);
1579
+ const inlineRating = ratingEl ? { value: ratingEl.querySelector(s.ratingValue)?.textContent?.trim() ?? "", count: ratingEl.querySelector(s.reviewCount)?.textContent?.trim() ?? "" } : null;
1580
+ let domain = "";
1581
+ try {
1582
+ domain = new URL(url).hostname.replace(/^www\./, "");
1583
+ } catch {
1584
+ domain = card.querySelector(s.siteName)?.textContent?.trim() ?? "";
1585
+ }
1586
+ out.push({ position: pos, title, url, domain, cite, snippet, isRedditStyle, inlineRating });
1587
+ });
1588
+ return out;
1589
+ }, sels);
1590
+ }
1591
+ async extractLocalPack(page) {
1592
+ const sels = LocalPackSelectors;
1593
+ return page.evaluate((s) => {
1594
+ const out = [];
1595
+ let container = null;
1596
+ document.querySelectorAll('[role="heading"]').forEach((h) => {
1597
+ if (!container && h.textContent?.includes(s.headingText)) container = h.closest("[data-hveid]");
1598
+ });
1599
+ if (!container) return out;
1600
+ container.querySelectorAll(s.card).forEach((card, i) => {
1601
+ const name = card.querySelector(s.name)?.textContent?.trim() ?? "";
1602
+ if (!name) return;
1603
+ const rating = card.querySelector(s.ratingValue)?.textContent?.trim() ?? null;
1604
+ const reviewRaw = card.querySelector(s.reviewCount)?.textContent?.trim() ?? null;
1605
+ const reviewCount = reviewRaw ? reviewRaw.replace(/[()]/g, "").trim() : null;
1606
+ let cid = card.querySelector("a[data-cid]")?.getAttribute("data-cid") ?? null;
1607
+ if (!cid) {
1608
+ for (const link of Array.from(card.querySelectorAll("a[href]"))) {
1609
+ const m1 = link.href.match(/[?&]cid=(\d+)/);
1610
+ if (m1) {
1611
+ cid = m1[1];
1612
+ break;
1613
+ }
1614
+ const m2 = link.href.match(/!1s0x[0-9a-f]+:0x([0-9a-f]+)/i);
1615
+ if (m2) {
1616
+ try {
1617
+ cid = BigInt("0x" + m2[1]).toString();
1618
+ } catch {
1619
+ }
1620
+ if (cid) break;
1621
+ }
1622
+ }
1623
+ }
1624
+ const metadata = [];
1625
+ card.querySelectorAll("div, span").forEach((el) => {
1626
+ const text = Array.from(el.childNodes).filter((n) => n.nodeType === 3).map((n) => n.textContent?.trim() ?? "").filter((t) => t.length > 1 && t.length < 120).join(" ");
1627
+ if (text && !metadata.includes(text)) metadata.push(text);
1628
+ });
1629
+ const links = Array.from(card.querySelectorAll("a[href]"));
1630
+ const directionsUrl = links.find((a) => a.href.includes("google.com/maps"))?.href ?? null;
1631
+ const websiteUrl = links.find((a) => !a.href.includes("google.com") && a.href.startsWith("http"))?.href ?? null;
1632
+ out.push({ position: i + 1, name, cid, rating, reviewCount, metadata, websiteUrl, directionsUrl });
1633
+ });
1634
+ return out;
1635
+ }, sels);
1636
+ }
1637
+ async extractEntityIds(page) {
1638
+ return page.evaluate(() => {
1639
+ const kgIds = /* @__PURE__ */ new Set();
1640
+ const cids = /* @__PURE__ */ new Set();
1641
+ const gcids = /* @__PURE__ */ new Set();
1642
+ const recordMap = /* @__PURE__ */ new Map();
1643
+ function nameFromWrapper(el) {
1644
+ const sel = [".OSrXXb", ".dbg0pd", ".tzt0oe", '[role="heading"]', "h3"];
1645
+ for (const s of sel) {
1646
+ const found = el.querySelector(s);
1647
+ if (found?.textContent?.trim()) return found.textContent.trim();
1648
+ }
1649
+ return "";
1650
+ }
1651
+ document.querySelectorAll('[id^="pv-/g/"]').forEach((wrapper) => {
1652
+ const raw = wrapper.getAttribute("id");
1653
+ if (!raw) return;
1654
+ const kgId = raw.replace("pv-", "");
1655
+ kgIds.add(kgId);
1656
+ const name = nameFromWrapper(wrapper);
1657
+ const cidEl = wrapper.querySelector("a[data-cid]");
1658
+ const cid = cidEl?.getAttribute("data-cid") ?? null;
1659
+ if (cid) cids.add(cid);
1660
+ if (name) recordMap.set(kgId, { name, kgId, cid, gcid: null });
1661
+ });
1662
+ document.querySelectorAll("[data-mid]").forEach((el) => {
1663
+ const mid = el.getAttribute("data-mid");
1664
+ if (!mid?.startsWith("/g/")) return;
1665
+ kgIds.add(mid);
1666
+ if (!recordMap.has(mid)) {
1667
+ const name = nameFromWrapper(el);
1668
+ if (name) recordMap.set(mid, { name, kgId: mid, cid: null, gcid: null });
1669
+ }
1670
+ });
1671
+ document.querySelectorAll(".w7Dbne").forEach((card) => {
1672
+ const cidEl = card.querySelector("a[data-cid]");
1673
+ const cid = cidEl?.getAttribute("data-cid") ?? null;
1674
+ if (!cid) return;
1675
+ cids.add(cid);
1676
+ const name = card.querySelector(".OSrXXb")?.textContent?.trim() ?? "";
1677
+ if (!name) return;
1678
+ const kgIdEl = card.querySelector('[id^="pv-/g/"]');
1679
+ const kgId = kgIdEl ? kgIdEl.getAttribute("id").replace("pv-", "") : null;
1680
+ const key = kgId ?? `cid:${cid}`;
1681
+ if (recordMap.has(key)) {
1682
+ const existing = recordMap.get(key);
1683
+ if (!existing.cid) recordMap.set(key, { ...existing, cid });
1684
+ } else {
1685
+ recordMap.set(key, { name, kgId, cid, gcid: null });
1686
+ }
1687
+ });
1688
+ document.querySelectorAll("a[data-cid]").forEach((el) => {
1689
+ const cid = el.getAttribute("data-cid");
1690
+ if (!cid) return;
1691
+ cids.add(cid);
1692
+ const alreadyNamed = [...recordMap.values()].some((r) => r.cid === cid);
1693
+ if (!alreadyNamed) {
1694
+ let node = el.parentElement;
1695
+ let name = "";
1696
+ for (let i = 0; i < 8 && node; i++) {
1697
+ const h = node.querySelector('.OSrXXb, .dbg0pd, [role="heading"], h3');
1698
+ if (h?.textContent?.trim()) {
1699
+ name = h.textContent.trim();
1700
+ break;
1701
+ }
1702
+ node = node.parentElement;
1703
+ }
1704
+ if (name) recordMap.set(`cid:${cid}`, { name, kgId: null, cid, gcid: null });
1705
+ }
1706
+ });
1707
+ const scriptContent = Array.from(document.querySelectorAll("script:not([src])")).map((s) => s.textContent ?? "").filter((t) => t.length > 1e4).join("\n");
1708
+ for (const m of scriptContent.matchAll(/\/g\/[a-zA-Z0-9_-]{5,20}/g)) kgIds.add(m[0]);
1709
+ for (const m of scriptContent.matchAll(/gcid:[a-zA-Z0-9_]+/g)) gcids.add(m[0]);
1710
+ for (const m of scriptContent.matchAll(/0x[0-9a-f]+:0x([0-9a-f]+)/gi)) {
1711
+ try {
1712
+ cids.add(BigInt("0x" + m[1]).toString());
1713
+ } catch {
1714
+ }
1715
+ }
1716
+ return { entities: [...recordMap.values()], kgIds: [...kgIds], cids: [...cids], gcids: [...gcids] };
1717
+ });
1718
+ }
1719
+ mergeLocalPackIntoEntities(entityIds, localPack) {
1720
+ const cidSet = new Set(entityIds.cids);
1721
+ const records = entityIds.entities.map((r) => ({ ...r }));
1722
+ for (const biz of localPack) {
1723
+ if (!biz.cid) continue;
1724
+ cidSet.add(biz.cid);
1725
+ const nameNorm = biz.name.toLowerCase().trim();
1726
+ const byName = records.find((r) => r.name.toLowerCase().trim() === nameNorm);
1727
+ if (byName) {
1728
+ if (!byName.cid) byName.cid = biz.cid;
1729
+ } else if (!records.find((r) => r.cid === biz.cid)) {
1730
+ records.push({ name: biz.name, kgId: null, cid: biz.cid, gcid: null });
1731
+ }
1732
+ }
1733
+ return { ...entityIds, entities: records, cids: [...cidSet] };
1734
+ }
1735
+ async extractAISurfaces(page) {
1736
+ return page.evaluate(extractAISurfacesFromDocument, {
1737
+ aio: AIOverviewSelectors,
1738
+ aim: AIModeSelectors,
1739
+ expandWaitMs: 1500
1740
+ });
1741
+ }
1742
+ buildTree(flat, _seed) {
1743
+ const roots = [];
1744
+ const nodeMap = /* @__PURE__ */ new Map();
1745
+ for (const row of flat) {
1746
+ const node = {
1747
+ question: row.question,
1748
+ answer: row.answer || null,
1749
+ sourceTitle: row.source_title || null,
1750
+ sourceSite: row.source_site || null,
1751
+ sourceCite: row.source_cite || null,
1752
+ depth: row.depth,
1753
+ parentQuestion: row.parent_question || null,
1754
+ children: []
1755
+ };
1756
+ nodeMap.set(row.question, node);
1757
+ }
1758
+ for (const node of nodeMap.values()) {
1759
+ if (node.parentQuestion && nodeMap.has(node.parentQuestion)) {
1760
+ nodeMap.get(node.parentQuestion).children.push(node);
1761
+ } else {
1762
+ roots.push(node);
1763
+ }
1764
+ }
1765
+ return roots;
1766
+ }
1767
+ getBrowserDebugSnapshot() {
1768
+ return this.driver.getDebugSnapshot();
1769
+ }
1770
+ buildHarvestDebugSnapshot(options, canonicalLocation, uule, locationEvidence) {
1771
+ if (!options.debug) return void 0;
1772
+ return {
1773
+ enabled: true,
1774
+ request: {
1775
+ query: options.query,
1776
+ locationInput: options.location ?? null,
1777
+ canonicalLocation,
1778
+ uule,
1779
+ gl: options.gl,
1780
+ hl: options.hl,
1781
+ device: options.device,
1782
+ proxyMode: options.proxyMode,
1783
+ proxyZip: options.proxyZip ?? null,
1784
+ serpOnly: options.serpOnly,
1785
+ pages: options.pages ?? 1
1786
+ },
1787
+ browser: this.getBrowserDebugSnapshot(),
1788
+ ...locationEvidence ? { locationEvidence } : {}
1789
+ };
1790
+ }
1791
+ async extract(options, signal) {
1792
+ const startMs = Date.now();
1793
+ const isMobile = options.device === "mobile";
1794
+ const config = {
1795
+ headless: options.headless,
1796
+ profileDir: options.profileDir,
1797
+ proxy: options.proxy,
1798
+ kernelApiKey: options.kernelApiKey,
1799
+ kernelProxyId: options.kernelProxyId,
1800
+ kernelProxyResolution: options.kernelProxyResolution,
1801
+ proxyMode: options.proxyMode,
1802
+ viewport: isMobile ? { width: 390, height: 844 } : { width: 1280, height: 800 },
1803
+ locale: `${options.hl}-${options.gl.toUpperCase()}`,
1804
+ userAgent: isMobile ? MOBILE_USER_AGENT2 : DESKTOP_USER_AGENT2,
1805
+ deviceScaleFactor: isMobile ? 3 : 1,
1806
+ isMobile,
1807
+ hasTouch: isMobile,
1808
+ debug: options.debug
1809
+ };
1810
+ let errorCount = 0;
1811
+ const diagnosticWarnings = [];
1812
+ try {
1813
+ this.throwIfAborted(signal);
1814
+ await this.driver.launch(config);
1815
+ this.throwIfAborted(signal);
1816
+ const canonicalLocation = options.location ? normalizeLocation(options.location) : null;
1817
+ const uule = canonicalLocation ? encodeUule(canonicalLocation) : null;
1818
+ const { hasPaa } = await this.driver.navigateToSERP(
1819
+ options.query,
1820
+ uule,
1821
+ options.gl,
1822
+ options.hl,
1823
+ {
1824
+ ...options.serpOnly ? { num: 100 } : {},
1825
+ debug: options.debug
1826
+ }
1827
+ );
1828
+ this.throwIfAborted(signal);
1829
+ const page = this.driver.getPage();
1830
+ await this.throwIfCaptcha(page, "Google SERP");
1831
+ if (options.serpOnly) {
1832
+ const [organicResults2, localPack2, rawEntityIds2] = await Promise.all([
1833
+ this.extractOrganicResults(page),
1834
+ this.extractLocalPack(page),
1835
+ this.extractEntityIds(page)
1836
+ ]);
1837
+ const entityIds2 = this.mergeLocalPackIntoEntities(rawEntityIds2, localPack2);
1838
+ const aiSurfaces2 = await this.extractAISurfaces(page);
1839
+ let locationEvidence2 = options.debug ? inferSerpLocationEvidence(canonicalLocation, organicResults2, localPack2) : void 0;
1840
+ let allOrganic2 = organicResults2;
1841
+ if ((options.pages ?? 1) >= 2) {
1842
+ const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, pws: "0", start: "10" });
1843
+ if (uule) p2params.set("uule", uule);
1844
+ await this.driver.navigateTo("https://www.google.com/search?" + p2params.toString());
1845
+ await this.throwIfCaptcha(page, "Google SERP page 2");
1846
+ const p2organic = await this.extractOrganicResults(page);
1847
+ allOrganic2 = [...organicResults2, ...p2organic.map((r) => ({ ...r, position: r.position + 10 }))];
1848
+ if (options.debug) {
1849
+ locationEvidence2 = inferSerpLocationEvidence(canonicalLocation, allOrganic2, localPack2);
1850
+ }
1851
+ }
1852
+ const stats2 = {
1853
+ seed: options.query,
1854
+ totalQuestions: 0,
1855
+ maxDepthReached: 0,
1856
+ durationMs: Date.now() - startMs,
1857
+ errorCount
1858
+ };
1859
+ this.reporter.onComplete(stats2);
1860
+ return {
1861
+ seed: options.query,
1862
+ location: options.location ?? null,
1863
+ extractedAt: (/* @__PURE__ */ new Date()).toISOString(),
1864
+ diagnostics: {
1865
+ completionStatus: "serp_only",
1866
+ problem: null,
1867
+ ...options.debug ? { debug: this.buildHarvestDebugSnapshot(options, canonicalLocation, uule, locationEvidence2) } : {}
1868
+ },
1869
+ totalQuestions: 0,
1870
+ surface: aiSurfaces2.surface,
1871
+ aiOverview: aiSurfaces2.aiOverview,
1872
+ aiMode: aiSurfaces2.aiMode,
1873
+ whatPeopleSaying: [],
1874
+ tree: [],
1875
+ flat: [],
1876
+ videos: [],
1877
+ forums: [],
1878
+ organicResults: allOrganic2,
1879
+ localPack: localPack2,
1880
+ entityIds: entityIds2,
1881
+ stats: stats2
1882
+ };
1883
+ }
1884
+ const [videos, forums, whatPeopleSaying, rawEntityIds, organicResults, localPack] = await Promise.all([
1885
+ this.extractVideos(page),
1886
+ this.extractForums(page),
1887
+ this.extractWhatPeopleSaying(page),
1888
+ this.extractEntityIds(page),
1889
+ this.extractOrganicResults(page),
1890
+ this.extractLocalPack(page)
1891
+ ]);
1892
+ const entityIds = this.mergeLocalPackIntoEntities(rawEntityIds, localPack);
1893
+ const initialLocationEvidence = options.debug ? inferSerpLocationEvidence(canonicalLocation, organicResults, localPack) : void 0;
1894
+ this.reporter.onVideos(videos);
1895
+ this.reporter.onForums(forums);
1896
+ if (!hasPaa) {
1897
+ let noPaaOrganic = organicResults;
1898
+ let locationEvidence2 = initialLocationEvidence;
1899
+ if ((options.pages ?? 1) >= 2) {
1900
+ const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, pws: "0", start: "10" });
1901
+ if (uule) p2params.set("uule", uule);
1902
+ await this.driver.navigateTo("https://www.google.com/search?" + p2params.toString());
1903
+ await this.throwIfCaptcha(page, "Google SERP page 2");
1904
+ const p2organic = await this.extractOrganicResults(page);
1905
+ noPaaOrganic = [...organicResults, ...p2organic.map((r) => ({ ...r, position: r.position + 10 }))];
1906
+ if (options.debug) {
1907
+ locationEvidence2 = inferSerpLocationEvidence(canonicalLocation, noPaaOrganic, localPack);
1908
+ }
1909
+ }
1910
+ const aiSurfaces2 = await this.extractAISurfaces(page);
1911
+ const stats2 = {
1912
+ seed: options.query,
1913
+ totalQuestions: 0,
1914
+ maxDepthReached: 0,
1915
+ durationMs: Date.now() - startMs,
1916
+ errorCount
1917
+ };
1918
+ this.reporter.onComplete(stats2);
1919
+ return {
1920
+ seed: options.query,
1921
+ location: options.location ?? null,
1922
+ extractedAt: (/* @__PURE__ */ new Date()).toISOString(),
1923
+ diagnostics: {
1924
+ completionStatus: "no_paa",
1925
+ problem: null,
1926
+ ...options.debug ? { debug: this.buildHarvestDebugSnapshot(options, canonicalLocation, uule, locationEvidence2) } : {}
1927
+ },
1928
+ totalQuestions: 0,
1929
+ surface: aiSurfaces2.surface,
1930
+ aiOverview: aiSurfaces2.aiOverview,
1931
+ aiMode: aiSurfaces2.aiMode,
1932
+ whatPeopleSaying,
1933
+ tree: [],
1934
+ flat: [],
1935
+ videos,
1936
+ forums,
1937
+ organicResults: noPaaOrganic,
1938
+ localPack,
1939
+ entityIds,
1940
+ stats: stats2
1941
+ };
1942
+ }
1943
+ const flat = await this.runBFS(page, options, signal);
1944
+ this.throwIfAborted(signal);
1945
+ const aiSurfaces = await this.extractAISurfaces(page);
1946
+ const shortVidsParams = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, pws: "0", udm: ShortVideoSelectors.udm });
1947
+ if (uule) shortVidsParams.set("uule", uule);
1948
+ let shortVideos = [];
1949
+ try {
1950
+ shortVideos = await this.extractShortVideos(page, "https://www.google.com/search?" + shortVidsParams.toString());
1951
+ } catch (err) {
1952
+ if (!(err instanceof CaptchaError)) throw err;
1953
+ errorCount++;
1954
+ diagnosticWarnings.push({
1955
+ code: "short_videos_captcha_skipped",
1956
+ surface: "short_videos",
1957
+ message: err.message,
1958
+ retryable: true
1959
+ });
1960
+ }
1961
+ this.reporter.onVideos(shortVideos);
1962
+ let allOrganic = organicResults;
1963
+ let locationEvidence = initialLocationEvidence;
1964
+ if ((options.pages ?? 1) >= 2) {
1965
+ const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, pws: "0", start: "10" });
1966
+ if (uule) p2params.set("uule", uule);
1967
+ await this.driver.navigateTo("https://www.google.com/search?" + p2params.toString());
1968
+ await this.throwIfCaptcha(page, "Google SERP page 2");
1969
+ const p2organic = await this.extractOrganicResults(page);
1970
+ allOrganic = [...organicResults, ...p2organic.map((r) => ({ ...r, position: r.position + 10 }))];
1971
+ if (options.debug) {
1972
+ locationEvidence = inferSerpLocationEvidence(canonicalLocation, allOrganic, localPack);
1973
+ }
1974
+ }
1975
+ const allVideos = [...videos, ...shortVideos];
1976
+ const tree = this.buildTree(flat, options.query);
1977
+ const stats = {
1978
+ seed: options.query,
1979
+ totalQuestions: flat.length,
1980
+ maxDepthReached: flat.reduce((m, r) => Math.max(m, r.depth), 0),
1981
+ durationMs: Date.now() - startMs,
1982
+ errorCount
1983
+ };
1984
+ this.reporter.onComplete(stats);
1985
+ return {
1986
+ seed: options.query,
1987
+ location: options.location ?? null,
1988
+ extractedAt: (/* @__PURE__ */ new Date()).toISOString(),
1989
+ diagnostics: {
1990
+ completionStatus: "paa_found",
1991
+ problem: null,
1992
+ ...diagnosticWarnings.length > 0 ? { warnings: diagnosticWarnings } : {},
1993
+ ...options.debug ? { debug: this.buildHarvestDebugSnapshot(options, canonicalLocation, uule, locationEvidence) } : {}
1994
+ },
1995
+ totalQuestions: flat.length,
1996
+ surface: aiSurfaces.surface,
1997
+ aiOverview: aiSurfaces.aiOverview,
1998
+ aiMode: aiSurfaces.aiMode,
1999
+ whatPeopleSaying,
2000
+ tree,
2001
+ flat,
2002
+ videos: allVideos,
2003
+ forums,
2004
+ organicResults: allOrganic,
2005
+ localPack,
2006
+ entityIds,
2007
+ stats
2008
+ };
2009
+ } catch (err) {
2010
+ errorCount++;
2011
+ this.reporter.onError(err instanceof Error ? err : new Error(String(err)));
2012
+ throw err;
2013
+ }
2014
+ }
2015
+ };
2016
+
2017
+ // src/output/OutputSerializer.ts
2018
+ import { promises as fs } from "fs";
2019
+ import path from "path";
2020
+ import Papa from "papaparse";
2021
/**
 * Writes harvest results to timestamped JSON/CSV files.
 *
 * Every writer creates `outputDir` if needed, names the file
 * `<slug>[-<label>]-<epoch ms>.<ext>` and resolves to the full path written.
 */
var OutputSerializer = class {
  /**
   * Filename-safe slug: lower-cased, runs of non-word characters collapsed to
   * "-", truncated to 40 characters.
   */
  static slugFor(seed) {
    return seed.toLowerCase().replace(/\W+/g, "-").slice(0, 40);
  }
  /**
   * Shared write path for all writers.
   *
   * @param outputDir Target directory (created recursively).
   * @param seed Text the filename slug is derived from.
   * @param label Optional filename infix such as "videos"; "" for none.
   * @param extension File extension without the dot.
   * @param render Function producing the file contents; invoked after the
   *   directory has been created.
   * @returns Promise of the full path of the written file.
   */
  static async writeTimestamped(outputDir, seed, label, extension, render) {
    await fs.mkdir(outputDir, { recursive: true });
    const slug = OutputSerializer.slugFor(seed);
    const contents = render();
    const stem = label ? `${slug}-${label}` : slug;
    const fullPath = path.join(outputDir, `${stem}-${Date.now()}.${extension}`);
    await fs.writeFile(fullPath, contents, "utf8");
    return fullPath;
  }
  /**
   * One CSV row per citation, with the response text carried on the first row.
   * When there are no citations but there is response text, a single row with
   * empty citation columns is emitted so the text is not lost.
   */
  static citationRows(citations, text, seed) {
    if (citations.length === 0) {
      return text ? [{ seed_query: seed, response_text: text, citation_text: "", citation_href: "" }] : [];
    }
    return citations.map((c, i) => ({
      seed_query: seed,
      response_text: i === 0 ? text ?? "" : "",
      citation_text: c.text,
      citation_href: c.href
    }));
  }
  /** Writes the whole result as pretty-printed JSON, named after `result.seed`. */
  async writeJSON(result, outputDir) {
    return OutputSerializer.writeTimestamped(outputDir, result.seed, "", "json", () => JSON.stringify(result, null, 2));
  }
  /** Writes flat rows as CSV, named after the first row's `seed_query` ("paa" when absent). */
  async writeCSV(rows, outputDir) {
    const seedRaw = rows[0]?.seed_query ?? "paa";
    return OutputSerializer.writeTimestamped(outputDir, seedRaw, "", "csv", () => Papa.unparse(rows, { header: true }));
  }
  /** Writes video records as `<slug>-videos-<ts>.csv`. */
  async writeVideoCSV(videos, seed, outputDir) {
    return OutputSerializer.writeTimestamped(outputDir, seed, "videos", "csv", () => Papa.unparse(videos, { header: true }));
  }
  /** Writes forum records as `<slug>-forums-<ts>.csv`. */
  async writeForumCSV(forums, seed, outputDir) {
    return OutputSerializer.writeTimestamped(outputDir, seed, "forums", "csv", () => Papa.unparse(forums, { header: true }));
  }
  /** Writes AI Overview text and citations as `<slug>-ai-overview-<ts>.csv`. */
  async writeAIOverviewCSV(citations, text, seed, outputDir) {
    return OutputSerializer.writeTimestamped(
      outputDir,
      seed,
      "ai-overview",
      "csv",
      () => Papa.unparse(OutputSerializer.citationRows(citations, text, seed), { header: true })
    );
  }
  /** Writes AI Mode text and citations as `<slug>-ai-mode-<ts>.csv`. */
  async writeAIModeCSV(citations, text, seed, outputDir) {
    return OutputSerializer.writeTimestamped(
      outputDir,
      seed,
      "ai-mode",
      "csv",
      () => Papa.unparse(OutputSerializer.citationRows(citations, text, seed), { header: true })
    );
  }
  /** Writes "what people are saying" cards as `<slug>-what-people-saying-<ts>.csv`. */
  async writeWhatPeopleSayingCSV(cards, seed, outputDir) {
    return OutputSerializer.writeTimestamped(
      outputDir,
      seed,
      "what-people-saying",
      "csv",
      () => Papa.unparse(cards.map((c) => ({ seed_query: seed, ...c })), { header: true })
    );
  }
};
2099
+
2100
+ // src/output/ProgressReporter.ts
2101
/**
 * Emits harvest progress as newline-delimited JSON: events go to stdout,
 * errors go to stderr.
 */
var ProgressReporter = class {
  /** Emits a "question" event carrying the node's depth and question text. */
  onQuestion(node) {
    const payload = { event: "question", depth: node.depth, question: node.question };
    process.stdout.write(`${JSON.stringify(payload)}\n`);
  }
  /** Emits a "depth" event. */
  onDepth(depth) {
    const payload = { event: "depth", depth };
    process.stdout.write(`${JSON.stringify(payload)}\n`);
  }
  /** Emits one "video" event per entry. */
  onVideos(videos) {
    for (const video of videos) {
      const payload = {
        event: "video",
        type: video.type,
        platform: video.platform,
        duration: video.duration,
        title: video.title,
        channel: video.channel,
        url: video.url
      };
      process.stdout.write(`${JSON.stringify(payload)}\n`);
    }
  }
  /** Emits one "forum" event per entry. */
  onForums(forums) {
    for (const forum of forums) {
      const payload = { event: "forum", title: forum.title, source: forum.source, url: forum.url };
      process.stdout.write(`${JSON.stringify(payload)}\n`);
    }
  }
  /** Emits a "complete" event with all stats fields spread into it. */
  onComplete(stats) {
    const payload = { event: "complete", ...stats };
    process.stdout.write(`${JSON.stringify(payload)}\n`);
  }
  /** Emits an "error" event (constructor name and message) on stderr. */
  onError(err) {
    const payload = { event: "error", type: err.constructor.name, message: err.message };
    process.stderr.write(`${JSON.stringify(payload)}\n`);
  }
};
2125
+
2126
+ // src/kernel-proxy-resolver.ts
2127
+ import Kernel2 from "@onkernel/sdk";
2128
// Lower-case US state name -> two-letter postal abbreviation (the 50 states).
var US_STATE_CODES = {
  "alabama": "AL", "alaska": "AK", "arizona": "AZ", "arkansas": "AR",
  "california": "CA", "colorado": "CO", "connecticut": "CT",
  "delaware": "DE",
  "florida": "FL",
  "georgia": "GA",
  "hawaii": "HI",
  "idaho": "ID", "illinois": "IL", "indiana": "IN", "iowa": "IA",
  "kansas": "KS", "kentucky": "KY",
  "louisiana": "LA",
  "maine": "ME", "maryland": "MD", "massachusetts": "MA", "michigan": "MI",
  "minnesota": "MN", "mississippi": "MS", "missouri": "MO", "montana": "MT",
  "nebraska": "NE", "nevada": "NV", "new hampshire": "NH", "new jersey": "NJ",
  "new mexico": "NM", "new york": "NY", "north carolina": "NC", "north dakota": "ND",
  "ohio": "OH", "oklahoma": "OK", "oregon": "OR",
  "pennsylvania": "PA",
  "rhode island": "RI",
  "south carolina": "SC", "south dakota": "SD",
  "tennessee": "TN", "texas": "TX",
  "utah": "UT",
  "vermont": "VT", "virginia": "VA",
  "washington": "WA", "west virginia": "WV", "wisconsin": "WI", "wyoming": "WY"
};
2180
// "<city identifier>|<state code>" -> five-digit ZIP used for that city.
// Keys have the same shape that `cityZipKey` produces.
var US_CITY_CENTER_ZIPS = {
  "atlanta|GA": "30303", "austin|TX": "78701",
  "baltimore|MD": "21201", "boston|MA": "02108", "boulder|CO": "80302",
  "charlotte|NC": "28202", "chicago|IL": "60601", "colorado_springs|CO": "80903", "columbus|OH": "43215",
  "dallas|TX": "75201", "denver|CO": "80202", "detroit|MI": "48226",
  "fort_collins|CO": "80524", "fort_worth|TX": "76102",
  "houston|TX": "77002",
  "indianapolis|IN": "46204",
  "jacksonville|FL": "32202",
  "las_vegas|NV": "89101", "los_angeles|CA": "90012", "louisville|KY": "40202", "loveland|CO": "80537",
  "memphis|TN": "38103", "miami|FL": "33131", "minneapolis|MN": "55401",
  "nashville|TN": "37203", "new_york|NY": "10001",
  "orlando|FL": "32801",
  "philadelphia|PA": "19103", "phoenix|AZ": "85004", "portland|OR": "97205",
  "raleigh|NC": "27601", "richmond|VA": "23219",
  "sacramento|CA": "95814", "salt_lake_city|UT": "84101", "san_antonio|TX": "78205",
  "san_diego|CA": "92101", "san_francisco|CA": "94103", "san_jose|CA": "95113", "seattle|WA": "98101"
};
2221
/**
 * Returns the last six characters of a proxy id, or null when the id is falsy.
 *
 * @param proxyId Proxy id string (may be null/undefined/empty).
 * @returns The id's trailing six characters (the whole id if shorter), or null.
 */
function proxyIdSuffix2(proxyId) {
  if (!proxyId) return null;
  return proxyId.slice(-6);
}
2224
/**
 * Packages a proxy id together with a description of how it was resolved.
 *
 * @param source Label for where the proxy id came from.
 * @param proxyMode Proxy mode in effect.
 * @param proxyId Resolved proxy id, possibly falsy.
 * @param target Location target the resolution was attempted for.
 * @param error Error detail, if any.
 * @returns `{ kernelProxyId, resolution }`, where `resolution` exposes only
 *   whether an id is present and its six-character suffix.
 */
function resolution(source, proxyMode, proxyId, target, error) {
  const details = {
    source,
    proxyMode,
    proxyIdPresent: Boolean(proxyId),
    proxyIdSuffix: proxyIdSuffix2(proxyId),
    target,
    error
  };
  return { kernelProxyId: proxyId, resolution: details };
}
2237
/**
 * Canonicalizes a state name for lookup: trimmed, lower-cased, with internal
 * whitespace runs collapsed to single spaces.
 *
 * @param value Raw state name.
 * @returns The normalized name.
 */
function normalizeStateName(value) {
  const lowered = value.trim().toLowerCase();
  return lowered.replace(/\s+/g, " ");
}
2240
/**
 * Canonicalizes a country name for comparison: trimmed, lower-cased, periods
 * removed, whitespace runs collapsed to single spaces.
 *
 * @param value Raw country name.
 * @returns The normalized name.
 */
function normalizeCountryName(value) {
  const lowered = value.trim().toLowerCase();
  const withoutDots = lowered.replace(/\./g, "");
  return withoutDots.replace(/\s+/g, " ");
}
2243
/**
 * Tells whether a country label refers to the United States.
 *
 * @param country Country label; a falsy value is treated as the United States.
 * @returns true for a missing label or one that normalizes to a known US alias.
 */
function isUnitedStates(country) {
  if (!country) return true;
  const aliases = ["united states", "united states of america", "usa", "us"];
  return aliases.includes(normalizeCountryName(country));
}
2248
/**
 * Resolves a region label to a two-letter state code.
 *
 * Any two-letter alphabetic input is accepted as-is (upper-cased) without
 * checking that it is a real state; longer inputs are looked up by name.
 *
 * @param region State name or abbreviation.
 * @returns The upper-case code, or null when the name is not in `US_STATE_CODES`.
 */
function stateCodeFor(region) {
  const candidate = region.trim();
  if (/^[A-Za-z]{2}$/.test(candidate)) {
    return candidate.toUpperCase();
  }
  const byName = US_STATE_CODES[normalizeStateName(candidate)];
  return byName ?? null;
}
2253
/**
 * Derives candidate city identifiers from a display name.
 *
 * The name is decomposed (NFKD), stripped of non-ASCII characters,
 * lower-cased and split on anything that is not a letter or digit.
 *
 * @param city City display name.
 * @returns Up to two distinct, non-empty identifiers: the words joined with
 *   "_" and the words concatenated, in that order.
 */
function kernelCityIdentifierCandidates(city) {
  const asciiLower = city.normalize("NFKD").replace(/[^\x00-\x7F]/g, "").toLowerCase();
  const tokens = asciiLower.split(/[^a-z0-9]+/).filter((token) => token.length > 0);

  const candidates = [];
  for (const candidate of [tokens.join("_"), tokens.join("")]) {
    if (candidate && !candidates.includes(candidate)) candidates.push(candidate);
  }
  return candidates;
}
2260
/**
 * Builds the conventional name of a state- or city-level residential proxy.
 *
 * @param country Country code (lower-cased in the name).
 * @param state State code (lower-cased in the name).
 * @param city Optional city identifier, appended verbatim when truthy.
 * @returns `mcp-serp-residential-<country>-<state>[-<city>]`.
 */
function proxyName(country, state, city) {
  const base = `mcp-serp-residential-${country.toLowerCase()}-${state.toLowerCase()}`;
  if (!city) return base;
  return `${base}-${city}`;
}
2263
/**
 * Builds the conventional name of a ZIP-level residential proxy.
 *
 * @param zip ZIP code.
 * @returns `mcp-serp-residential-us-zip-<zip>`.
 */
function zipProxyName(zip) {
  const prefix = "mcp-serp-residential-us-zip-";
  return `${prefix}${zip}`;
}
2266
/**
 * Parses a free-form location into a US proxy target.
 *
 * The location is normalized, split on commas, and a trailing United States
 * component is dropped. A single remaining part is read as a state; otherwise
 * the first two parts are read as "city, state".
 *
 * @param location Location text; falsy yields null.
 * @param gl Country code of the search; anything other than "us"
 *   (case-insensitive) yields null.
 * @returns A state-level or city-level target
 *   `{ canonicalLocation, level, country, state, city, cityCandidates, proxyName, config }`,
 *   or null when the state or city cannot be resolved.
 */
function parseKernelLocationProxyTarget(location, gl) {
  if (!location || gl.toLowerCase() !== "us") return null;

  const canonicalLocation = normalizeLocation(location);
  const segments = canonicalLocation.split(",").map((segment) => segment.trim()).filter(Boolean);
  const endsWithCountry = segments.length > 1 && isUnitedStates(segments[segments.length - 1]);
  const parts = endsWithCountry ? segments.slice(0, -1) : segments;

  if (parts.length === 1) {
    const stateOnly = stateCodeFor(parts[0]);
    if (!stateOnly) return null;
    return {
      canonicalLocation,
      level: "state",
      country: "US",
      state: stateOnly,
      city: "",
      cityCandidates: [],
      proxyName: proxyName("US", stateOnly),
      config: { country: "US", state: stateOnly }
    };
  }

  const [cityName = "", regionName = ""] = parts;
  if (!cityName || !regionName) return null;

  const state = stateCodeFor(regionName);
  if (!state) return null;

  const cityCandidates = kernelCityIdentifierCandidates(cityName);
  const [primaryCity] = cityCandidates;
  if (!primaryCity) return null;

  return {
    canonicalLocation,
    level: "city",
    country: "US",
    state,
    city: primaryCity,
    cityCandidates,
    proxyName: proxyName("US", state, primaryCity),
    config: { country: "US", state, city: primaryCity }
  };
}
2312
/**
 * Builds the `city|state` key for a target.
 *
 * @param {{city: string, state: string}} target - Proxy target.
 * @returns {string} Key of the form `<city>|<state>`.
 */
function cityZipKey(target) {
  const { city, state } = target;
  return `${city}|${state}`;
}
2315
/**
 * Picks the ZIP code to use for a target.
 *
 * A caller-supplied ZIP wins when it is exactly five digits; otherwise the
 * value stored in `US_CITY_CENTER_ZIPS` under the target's city/state key is
 * used.
 *
 * @param {object} target - Proxy target passed to `cityZipKey`.
 * @param {string} [explicitZip] - ZIP requested by the caller.
 * @returns {string|null} The ZIP to use, or `null` when none is known.
 */
function knownZipFor(target, explicitZip) {
  const fiveDigits = /^\d{5}$/;
  if (explicitZip && fiveDigits.test(explicitZip)) {
    return explicitZip;
  }
  const tableZip = US_CITY_CENTER_ZIPS[cityZipKey(target)];
  return tableZip ?? null;
}
2319
/**
 * Derives a ZIP-level target from an existing target.
 *
 * The input is not modified; the copy gets `level: "zip"`, the ZIP itself,
 * the ZIP proxy name, and a config of country, state and ZIP.
 *
 * @param {object} target - Source target (provides `country` and `state`).
 * @param {string} zip - ZIP code to target.
 * @returns {object} New ZIP-level target.
 */
function zipTarget(target, zip) {
  const name = zipProxyName(zip);
  const config = {
    country: target.country,
    state: target.state,
    zip
  };
  return { ...target, level: "zip", zip, proxyName: name, config };
}
2332
/**
 * Tells whether a proxy's config describes the given target.
 *
 * ZIP-level targets match on country and ZIP. Other targets match on country
 * and state (both compared uppercased), plus the city: when `city` is given
 * the config's city must equal it, otherwise the config must have no city.
 *
 * @param {object|null|undefined} config - Proxy config to inspect.
 * @param {object} target - Target with `level`, `country`, `state`, `zip`.
 * @param {string} [city] - City identifier the config must carry.
 * @returns {boolean} Whether the config matches.
 */
function configMatches(config, target, city) {
  const sameCountry = config?.country?.toUpperCase() === target.country;

  if (target.level === "zip") {
    return sameCountry && config?.zip === target.zip;
  }

  if (!sameCountry) return false;
  if (config?.state?.toUpperCase() !== target.state) return false;
  if (city) {
    return config?.city === city;
  }
  return !config?.city;
}
2338
/**
 * Finds the first usable residential proxy that serves exactly `target`.
 *
 * A proxy is usable when it is residential, not "unavailable", and has an id.
 * It serves the target when its name equals `target.proxyName` or its config
 * matches (the target's city is only required for city-level targets).
 *
 * @param {object[]} proxies - Proxies to search.
 * @param {object} target - Target with `proxyName`, `level` and `city`.
 * @returns {object|null} The matching proxy, or `null`.
 */
function findExistingTargetProxy(proxies, target) {
  const servesTarget = (proxy) => {
    if (proxy.type !== "residential") return false;
    if (proxy.status === "unavailable") return false;
    if (!proxy.id) return false;
    if (proxy.name === target.proxyName) return true;
    const requiredCity = target.level === "city" ? target.city : void 0;
    return configMatches(proxy.config, target, requiredCity);
  };
  const match = proxies.find(servesTarget);
  return match ?? null;
}
2341
/**
 * Looks for a usable residential proxy for any of the target's city spellings.
 *
 * Candidates are tried in order. For each one the first proxy that is
 * residential, not "unavailable", has an id, and matches either the
 * candidate's proxy name or its config is returned.
 *
 * @param {object[]} proxies - Proxies to search.
 * @param {object} target - Target with `country`, `state`, `cityCandidates`.
 * @returns {object|null} The matching proxy, or `null`.
 */
function findExistingProxy(proxies, target) {
  const isUsable = (proxy) =>
    proxy.type === "residential" && proxy.status !== "unavailable" && Boolean(proxy.id);

  for (const candidate of target.cityCandidates) {
    const expectedName = proxyName(target.country, target.state, candidate);
    const hit = proxies.find(
      (proxy) =>
        isUsable(proxy) &&
        (proxy.name === expectedName || configMatches(proxy.config, target, candidate))
    );
    if (hit) {
      return hit;
    }
  }
  return null;
}
2349
/**
 * Derives a state-level target from an existing target.
 *
 * The input is not modified; the copy gets `level: "state"`, the state proxy
 * name, and a config holding only country and state.
 *
 * @param {object} target - Source target (provides `country` and `state`).
 * @returns {object} New state-level target.
 */
function stateTarget(target) {
  const { country, state } = target;
  const name = proxyName(country, state);
  return {
    ...target,
    level: "state",
    proxyName: name,
    config: { country, state }
  };
}
2360
/**
 * Finds the first usable residential proxy serving the target's state.
 *
 * A proxy qualifies when it is residential, not "unavailable", has an id, and
 * either carries the state proxy name or has a config matching the target
 * without a city.
 *
 * @param {object[]} proxies - Proxies to search.
 * @param {object} target - Target with `country` and `state`.
 * @returns {object|null} The matching proxy, or `null`.
 */
function findExistingStateProxy(proxies, target) {
  const expectedName = proxyName(target.country, target.state);
  const servesState = (proxy) => {
    if (proxy.type !== "residential") return false;
    if (proxy.status === "unavailable") return false;
    if (!proxy.id) return false;
    return proxy.name === expectedName || configMatches(proxy.config, target);
  };
  const match = proxies.find(servesState);
  return match ?? null;
}
2364
/**
 * Returns the target to use on a retry attempt.
 *
 * @param {object} target - Target resolved for the request.
 * @param {number} attemptIndex - Retry index; accepted but not consulted here,
 *   every escalation resolves to the state-level target.
 * @returns {object} State-level version of `target`.
 */
function escalatedTargetLevel(target, attemptIndex) {
  const escalated = stateTarget(target);
  return escalated;
}
2367
/**
 * Extracts a printable message from a thrown value.
 *
 * @param {unknown} err - Anything that was thrown.
 * @returns {string} `err.message` for `Error` instances, `String(err)` otherwise.
 */
function errorText2(err) {
  if (err instanceof Error) {
    return err.message;
  }
  return String(err);
}
2370
/**
 * Resolves which Kernel proxy id a harvest attempt should use.
 *
 * Resolution order:
 *  1. `proxyMode === "none"`: disabled, no proxy id.
 *  2. `proxyMode === "configured"`: the configured proxy id.
 *  3. No parsable US target or no API key: the configured proxy id.
 *  4. `attemptIndex >= 1`: create a state-level proxy for the escalated
 *     target, falling back to the configured id when that fails.
 *  5. First attempt: reuse or create a ZIP proxy (when a ZIP is known), then
 *     reuse or create a city proxy for each city candidate, then reuse or
 *     create a state proxy, and finally fall back to the configured id.
 *
 * Failures to create a proxy are collected and passed on as a " | "-joined
 * message. When a reused or created proxy is reported and nothing failed
 * before it, the error is `null` rather than an empty string.
 *
 * @param {object} options
 * @param {string} options.proxyMode - "none", "configured", or location mode.
 * @param {string} [options.configuredKernelProxyId] - Fallback proxy id.
 * @param {string} [options.kernelApiKey] - API key for the Kernel client.
 * @param {string} [options.location] - Requested location text.
 * @param {string} options.gl - Country code of the search.
 * @param {string} [options.proxyZip] - Explicit ZIP to target.
 * @param {number} [options.attemptIndex] - Zero-based attempt index (default 0).
 * @returns {Promise<*>} Whatever `resolution(...)` builds for the outcome.
 */
async function resolveKernelProxyId(options) {
  if (options.proxyMode === "none") {
    return resolution("disabled", options.proxyMode, void 0, null, null);
  }
  if (options.proxyMode === "configured") {
    return resolution("configured_fallback", options.proxyMode, options.configuredKernelProxyId, null, null);
  }
  const target = parseKernelLocationProxyTarget(options.location, options.gl);
  if (!target || !options.kernelApiKey) {
    return resolution("configured_fallback", options.proxyMode, options.configuredKernelProxyId, target, target ? null : "location could not be normalized to a US city/state proxy target");
  }
  const kernel = new Kernel2({ apiKey: options.kernelApiKey });

  // An empty error list means "no error", reported as null instead of "".
  const joinedOrNull = (errors) => errors.join(" | ") || null;

  // Creates one residential proxy. Returns its id, or null after recording
  // the reason (labelled with `label`) in `errors`.
  const tryCreate = async (name, config, label, errors) => {
    try {
      const created = await kernel.proxies.create({ type: "residential", name, config });
      if (created.id) return created.id;
      errors.push(`${label}: Kernel did not return a proxy id`);
    } catch (err) {
      errors.push(`${label}: ${errorText2(err)}`);
    }
    return null;
  };

  try {
    const attemptIndex = options.attemptIndex ?? 0;
    if (attemptIndex >= 1) {
      // Retry attempts go straight to a state-level proxy.
      const escalatedTarget = escalatedTargetLevel(target, attemptIndex);
      const escalationErrors = [];
      const escalatedId = await tryCreate(escalatedTarget.proxyName, escalatedTarget.config, escalatedTarget.state, escalationErrors);
      if (escalatedId) {
        return resolution("location_created", options.proxyMode, escalatedId, escalatedTarget, null);
      }
      return resolution("configured_fallback", options.proxyMode, options.configuredKernelProxyId, escalatedTarget, escalationErrors.join(" | "));
    }

    const proxies = await kernel.proxies.list();
    const zip = knownZipFor(target, options.proxyZip);
    const createErrors = [];

    if (zip) {
      const targetZip = zipTarget(target, zip);
      const existingZip = findExistingTargetProxy(proxies, targetZip);
      if (existingZip?.id) {
        return resolution("location_reused", options.proxyMode, existingZip.id, targetZip, null);
      }
      // The ZIP proxy is created from country + ZIP only (no state).
      const zipId = await tryCreate(targetZip.proxyName, { country: targetZip.country, zip }, zip, createErrors);
      if (zipId) {
        return resolution("location_created", options.proxyMode, zipId, targetZip, null);
      }
    }

    const existing = findExistingProxy(proxies, target);
    if (existing?.id) {
      return resolution("location_reused", options.proxyMode, existing.id, target, joinedOrNull(createErrors));
    }

    for (const city of target.cityCandidates) {
      const cityName = proxyName(target.country, target.state, city);
      const cityConfig = { country: target.country, state: target.state, city };
      const cityId = await tryCreate(cityName, cityConfig, city, createErrors);
      if (cityId) {
        return resolution("location_created", options.proxyMode, cityId, {
          ...target,
          level: "city",
          city,
          proxyName: proxyName(target.country, target.state, city),
          config: {
            country: target.country,
            state: target.state,
            city
          }
        }, null);
      }
    }

    const fallbackTarget = stateTarget(target);
    const existingState = findExistingStateProxy(proxies, fallbackTarget);
    if (existingState?.id) {
      return resolution("location_reused", options.proxyMode, existingState.id, fallbackTarget, joinedOrNull(createErrors));
    }
    const stateId = await tryCreate(fallbackTarget.proxyName, fallbackTarget.config, fallbackTarget.state, createErrors);
    if (stateId) {
      // createErrors still holds only failures from the earlier ZIP/city steps.
      return resolution("location_created", options.proxyMode, stateId, fallbackTarget, joinedOrNull(createErrors));
    }
    return resolution("configured_fallback", options.proxyMode, options.configuredKernelProxyId, target, createErrors.join(" | "));
  } catch (err) {
    // Any unexpected failure (e.g. listing proxies) degrades to the configured proxy.
    return resolution("configured_fallback", options.proxyMode, options.configuredKernelProxyId, target, errorText2(err));
  }
}
2484
+
2485
// src/harvest.ts

/** Upper bound on the number of attempts `harvest` makes before giving up on CAPTCHAs. */
var MAX_ATTEMPTS = 3;
2487
/**
 * Chooses the error to surface for an aborted signal.
 *
 * @param {AbortSignal} signal - The signal that was (or will be) aborted.
 * @returns {Error} The signal's own reason when it is a `TimeoutError`
 *   `DOMException`; otherwise a new `RequestAbortedError`.
 */
function abortReason(signal) {
  const { reason } = signal;
  const timedOut = reason instanceof DOMException && reason.name === "TimeoutError";
  return timedOut ? reason : new RequestAbortedError();
}
2491
/**
 * Reads an `AbortSignal` from the raw harvest options, if one was supplied.
 *
 * @param {unknown} rawOptions - Options as passed by the caller.
 * @returns {AbortSignal|undefined} `rawOptions.signal` when it is a real
 *   `AbortSignal`; `undefined` in every other case.
 */
function getAbortSignal(rawOptions) {
  const isObject = Boolean(rawOptions) && typeof rawOptions === "object";
  if (!isObject) {
    return void 0;
  }
  const { signal } = rawOptions;
  return signal instanceof AbortSignal ? signal : void 0;
}
2497
/**
 * Reads the attempt-event callback from the raw harvest options.
 *
 * @param {unknown} rawOptions - Options as passed by the caller.
 * @returns {Function|undefined} `rawOptions.onAttemptEvent` when it is a
 *   function; `undefined` otherwise.
 */
function getAttemptLogSink(rawOptions) {
  const isObject = Boolean(rawOptions) && typeof rawOptions === "object";
  if (!isObject) {
    return void 0;
  }
  const { onAttemptEvent } = rawOptions;
  if (typeof onAttemptEvent !== "function") {
    return void 0;
  }
  return onAttemptEvent;
}
2502
/**
 * Delivers an attempt event to the caller's sink without letting a failing
 * sink break the harvest.
 *
 * @param {Function|undefined} sink - Callback receiving the event; may be async.
 * @param {object} event - Event payload (its `attemptNumber` is logged on failure).
 * @returns {Promise<void>} Resolves once the sink has settled. A sink that
 *   throws or rejects is reported through `console.warn` and then ignored.
 */
async function emitAttemptEvent(sink, event) {
  if (sink) {
    try {
      await sink(event);
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      const warning = {
        event: "harvest_attempt_log_failed",
        attempt_number: event.attemptNumber,
        message
      };
      console.warn(JSON.stringify(warning));
    }
  }
}
2514
/**
 * Maps a thrown value to the outcome label recorded for an attempt.
 *
 * @param {unknown} err - The value thrown by the attempt.
 * @returns {"captcha"|"request_aborted"|"timeout"|"error"} "captcha" and
 *   "request_aborted" for the matching error classes; "timeout" for
 *   `TimeoutError`/`AbortError` DOMExceptions or messages that mention a
 *   timeout or deadline; "error" for everything else.
 */
function classifyAttemptError(err) {
  if (err instanceof CaptchaError) return "captcha";
  if (err instanceof RequestAbortedError) return "request_aborted";

  const isDomTimeout = err instanceof DOMException && ["TimeoutError", "AbortError"].includes(err.name);
  if (isDomTimeout) return "timeout";

  const text = err instanceof Error ? err.message : String(err);
  const timeoutPattern = /timeout|timed out|Timeout \d+ms exceeded|deadline/i;
  if (timeoutPattern.test(text)) {
    return "timeout";
  }
  return "error";
}
2521
/**
 * Determines the outcome label for a completed attempt.
 *
 * @param {object} result - Harvest result; `diagnostics` may be absent.
 * @returns {string} `diagnostics.completionStatus` when it is neither `null`
 *   nor `undefined`; otherwise "paa_found" if any questions were collected,
 *   else "no_paa".
 */
function classifyAttemptResult(result) {
  const reportedStatus = result.diagnostics?.completionStatus;
  if (reportedStatus !== null && reportedStatus !== void 0) {
    return reportedStatus;
  }
  if (result.totalQuestions > 0) {
    return "paa_found";
  }
  return "no_paa";
}
2524
/**
 * Produces the message stored for a failed attempt.
 *
 * @param {unknown} err - Anything that was thrown.
 * @returns {string} `err.message` for `Error` instances, `String(err)` otherwise.
 */
function errorMessage(err) {
  if (err instanceof Error) {
    return err.message;
  }
  return String(err);
}
2527
/**
 * Runs one extraction on a fresh `BrowserDriver` and always closes the driver.
 *
 * When `signal` is already aborted nothing is extracted. When it aborts while
 * the extraction is running, the abort wins the race and is returned as the
 * error (the extraction's own later rejection is deliberately ignored).
 *
 * Debug data comes from `result.diagnostics.debug` when present. Otherwise,
 * if `options.debug` is set, a fallback record is built from the request
 * options and the driver's debug snapshot. A result without `diagnostics` is
 * tolerated, and the driver is closed even if building the debug record fails.
 *
 * @param {object} options - Parsed harvest options for this attempt.
 * @param {AbortSignal} [signal] - Optional cancellation signal.
 * @returns {Promise<{result: object|null, error: unknown, cleanup: *, debug: object|null}>}
 *   Exactly one of `result`/`error` is non-null; `cleanup` is whatever
 *   `driver.close()` resolved to.
 */
async function extractOnce(options, signal) {
  const driver = new BrowserDriver();
  const reporter = new ProgressReporter();
  const extractor = new PAAExtractor(driver, reporter);
  if (signal?.aborted) {
    return {
      result: null,
      error: abortReason(signal),
      cleanup: await driver.close(),
      debug: null
    };
  }
  let onAbort;
  const abortPromise = signal ? new Promise((_, reject) => {
    onAbort = () => reject(abortReason(signal));
    signal.addEventListener("abort", onAbort, { once: true });
  }) : null;
  let result = null;
  let error = null;
  let cleanup;
  let debug = null;
  try {
    const extraction = extractor.extract(options, signal);
    // If the abort wins the race, the extraction may still reject later;
    // attach a no-op handler so that rejection is not reported as unhandled.
    if (abortPromise) extraction.catch(() => {
    });
    result = await (abortPromise ? Promise.race([extraction, abortPromise]) : extraction);
  } catch (err) {
    error = err;
  } finally {
    if (signal && onAbort) signal.removeEventListener("abort", onAbort);
    try {
      debug = result?.diagnostics?.debug ?? (options.debug ? {
        enabled: true,
        request: {
          query: options.query,
          locationInput: options.location ?? null,
          canonicalLocation: null,
          uule: null,
          gl: options.gl,
          hl: options.hl,
          device: options.device,
          proxyMode: options.proxyMode,
          proxyZip: options.proxyZip ?? null,
          serpOnly: options.serpOnly,
          pages: options.pages ?? 1
        },
        browser: driver.getDebugSnapshot()
      } : null);
    } finally {
      // Closing must not depend on the debug record being built successfully.
      cleanup = await driver.close();
    }
  }
  return error ? { result: null, error, cleanup, debug } : { result, error: null, cleanup, debug };
}
2578
/**
 * Runs a harvest, retrying on CAPTCHA up to `MAX_ATTEMPTS` times.
 *
 * For every attempt a proxy is resolved (passing the attempt index so later
 * attempts can escalate), the merged options are parsed with
 * `HarvestOptionsSchema`, a "started" event is emitted, and one extraction is
 * run. Outcomes:
 *  - CAPTCHA: a "finished" event is emitted and the next attempt starts; after
 *    the last attempt a `CaptchaError` is thrown.
 *  - Any other extraction error: exactly one "finished" event is emitted,
 *    carrying the attempt's real cleanup/debug data, and the error is rethrown.
 *  - Success: a "finished" event is emitted, the result is written in the
 *    requested format(s), and the result is returned.
 * Errors raised outside the extraction itself (proxy resolution, option
 * parsing, writing output) are reported with an empty cleanup record and
 * rethrown.
 *
 * Read from `rawOptions` here: `proxyMode`, `kernelApiKey` (default: env
 * `KERNEL_API_KEY`), `kernelProxyId` (default: env `KERNEL_PROXY_ID`),
 * `location`, `proxyZip`, `gl` (default "us"), plus `signal` and
 * `onAttemptEvent` via their helpers. Everything else is passed through to
 * the schema.
 *
 * @param {object} rawOptions - Caller-supplied harvest options.
 * @returns {Promise<object>} The harvest result of the successful attempt.
 * @throws {CaptchaError} When every attempt ended in a CAPTCHA.
 * @throws {*} Any non-CAPTCHA error from an attempt, unchanged.
 */
async function harvest(rawOptions) {
  const raw = typeof rawOptions === "object" && rawOptions !== null ? rawOptions : {};
  const signal = getAbortSignal(rawOptions);
  const onAttemptEvent = getAttemptLogSink(rawOptions);
  const requestedProxyMode = raw.proxyMode;
  const proxyMode = requestedProxyMode === "none" ? "none" : requestedProxyMode === "configured" ? "configured" : "location";
  const kernelApiKey = typeof raw.kernelApiKey === "string" ? raw.kernelApiKey.trim() : process.env.KERNEL_API_KEY?.trim();
  const configuredKernelProxyId = typeof raw.kernelProxyId === "string" ? raw.kernelProxyId.trim() : process.env.KERNEL_PROXY_ID?.trim();
  const proxyOpts = {
    kernelApiKey,
    proxyMode,
    configuredKernelProxyId,
    location: typeof raw.location === "string" ? raw.location : void 0,
    proxyZip: typeof raw.proxyZip === "string" ? raw.proxyZip : void 0,
    gl: typeof raw.gl === "string" ? raw.gl : "us"
  };
  const serializer = new OutputSerializer();

  // Cleanup record used when an attempt failed before any session existed.
  const noSessionCleanup = () => ({
    kernelSessionId: null,
    kernelDeleteStarted: false,
    kernelDeleteSucceeded: null,
    kernelDeleteError: null,
    browserCloseSucceeded: null,
    browserCloseError: null
  });

  for (let i = 0; i < MAX_ATTEMPTS; i++) {
    const attemptNumber = i + 1;
    const startedAtMs = Date.now();
    // Set once an extraction failure has been reported, so the catch block
    // below does not emit a second "finished" event for the same attempt.
    let extractionFailureReported = false;

    const reportFinished = ({ outcome, cleanup, debug, questionCount, error, willRetry }) => emitAttemptEvent(onAttemptEvent, {
      type: "finished",
      attemptNumber,
      maxAttempts: MAX_ATTEMPTS,
      outcome,
      kernelSessionId: cleanup.kernelSessionId,
      questionCount,
      durationMs: Date.now() - startedAtMs,
      error,
      willRetry,
      cleanup,
      debug,
      completedAt: (/* @__PURE__ */ new Date()).toISOString()
    });

    // Logs and reports a CAPTCHA; resolves to whether another attempt follows.
    const reportCaptcha = async (err, cleanup, debug) => {
      const willRetry = i < MAX_ATTEMPTS - 1;
      console.warn(JSON.stringify({
        event: "harvest_attempt_captcha",
        attempt_number: attemptNumber,
        max_attempts: MAX_ATTEMPTS,
        message: err.message,
        will_retry: willRetry
      }));
      await reportFinished({
        outcome: "captcha",
        cleanup,
        debug,
        questionCount: 0,
        error: err.message,
        willRetry
      });
      return willRetry;
    };

    try {
      if (signal?.aborted) throw abortReason(signal);
      const resolution2 = await resolveKernelProxyId({ ...proxyOpts, attemptIndex: i });
      const mergedAttempt = {
        ...raw,
        kernelApiKey,
        kernelProxyId: resolution2.kernelProxyId,
        kernelProxyResolution: resolution2.resolution,
        proxyMode
      };
      if (proxyMode === "none") mergedAttempt.kernelProxyId = void 0;
      const attemptOptions = HarvestOptionsSchema.parse(mergedAttempt);
      await emitAttemptEvent(onAttemptEvent, {
        type: "started",
        attemptNumber,
        maxAttempts: MAX_ATTEMPTS,
        query: attemptOptions.query,
        location: attemptOptions.location ?? null,
        maxQuestions: attemptOptions.maxQuestions,
        startedAt: new Date(startedAtMs).toISOString()
      });
      console.info(JSON.stringify({
        event: "harvest_attempt_started",
        attempt_number: attemptNumber,
        max_attempts: MAX_ATTEMPTS,
        query: attemptOptions.query,
        location: attemptOptions.location ?? null,
        max_questions: attemptOptions.maxQuestions
      }));
      const attempt = await extractOnce(attemptOptions, signal);
      if (attempt.error) {
        const err = attempt.error;
        if (err instanceof CaptchaError) {
          if (await reportCaptcha(err, attempt.cleanup, attempt.debug)) continue;
          break;
        }
        await reportFinished({
          outcome: classifyAttemptError(err),
          cleanup: attempt.cleanup,
          debug: attempt.debug,
          questionCount: 0,
          error: errorMessage(err),
          willRetry: false
        });
        extractionFailureReported = true;
        throw err;
      }
      const result = attempt.result;
      if (!result) throw new Error("Harvest attempt completed without a result");
      await reportFinished({
        outcome: classifyAttemptResult(result),
        cleanup: attempt.cleanup,
        debug: attempt.debug,
        questionCount: result.totalQuestions,
        error: null,
        willRetry: false
      });
      if (attemptOptions.format === "json" || attemptOptions.format === "both") {
        await serializer.writeJSON(result, attemptOptions.outputDir);
      }
      if (attemptOptions.format === "csv" || attemptOptions.format === "both") {
        await Promise.all([
          serializer.writeCSV(result.flat, attemptOptions.outputDir),
          result.videos.length > 0 ? serializer.writeVideoCSV(result.videos, result.seed, attemptOptions.outputDir) : Promise.resolve(""),
          result.forums.length > 0 ? serializer.writeForumCSV(result.forums, result.seed, attemptOptions.outputDir) : Promise.resolve(""),
          result.aiOverview.detected ? serializer.writeAIOverviewCSV(result.aiOverview.citations, result.aiOverview.text, result.seed, attemptOptions.outputDir) : Promise.resolve(""),
          result.aiMode.detected ? serializer.writeAIModeCSV(result.aiMode.citations, result.aiMode.text, result.seed, attemptOptions.outputDir) : Promise.resolve(""),
          result.whatPeopleSaying.length > 0 ? serializer.writeWhatPeopleSayingCSV(result.whatPeopleSaying, result.seed, attemptOptions.outputDir) : Promise.resolve("")
        ]);
      }
      return result;
    } catch (err) {
      // The extraction failure was already reported with its real cleanup
      // data; just propagate it.
      if (extractionFailureReported) throw err;
      if (err instanceof CaptchaError) {
        if (await reportCaptcha(err, noSessionCleanup(), null)) continue;
        break;
      }
      await reportFinished({
        outcome: classifyAttemptError(err),
        cleanup: noSessionCleanup(),
        debug: null,
        questionCount: 0,
        error: errorMessage(err),
        willRetry: false
      });
      throw err;
    }
  }
  console.warn(JSON.stringify({
    event: "harvest_captcha_exhausted",
    max_attempts: MAX_ATTEMPTS,
    session_kind: kernelApiKey ? "kernel" : "local"
  }));
  throw new CaptchaError(sanitizeVendorName(`CAPTCHA on all ${MAX_ATTEMPTS} fresh sessions. Try again in a few minutes.`));
}
2767
+
2768
+ export {
2769
+ MapsPlaceOptionsSchema,
2770
+ RawMapsOverviewSchema,
2771
+ RawMapsHoursRowSchema,
2772
+ RawMapsReviewStatsSchema,
2773
+ RawMapsAboutAttributeSchema,
2774
+ MapsSelectors,
2775
+ CaptchaError,
2776
+ RequestAbortedError,
2777
+ buildYouTubeChannelVideosUrl,
2778
+ BrowserDriver,
2779
+ harvest
2780
+ };
2781
+ //# sourceMappingURL=chunk-HERFK7W6.js.map