mcp-scraper 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. package/README.md +56 -0
  2. package/dist/bin/api-server.cjs +9256 -0
  3. package/dist/bin/api-server.cjs.map +1 -0
  4. package/dist/bin/api-server.d.cts +1 -0
  5. package/dist/bin/api-server.d.ts +1 -0
  6. package/dist/bin/api-server.js +38 -0
  7. package/dist/bin/api-server.js.map +1 -0
  8. package/dist/bin/mcp-stdio-server.cjs +840 -0
  9. package/dist/bin/mcp-stdio-server.cjs.map +1 -0
  10. package/dist/bin/mcp-stdio-server.d.cts +1 -0
  11. package/dist/bin/mcp-stdio-server.d.ts +1 -0
  12. package/dist/bin/mcp-stdio-server.js +41 -0
  13. package/dist/bin/mcp-stdio-server.js.map +1 -0
  14. package/dist/bin/paa-harvest.cjs +1438 -0
  15. package/dist/bin/paa-harvest.cjs.map +1 -0
  16. package/dist/bin/paa-harvest.d.cts +1 -0
  17. package/dist/bin/paa-harvest.d.ts +1 -0
  18. package/dist/bin/paa-harvest.js +37 -0
  19. package/dist/bin/paa-harvest.js.map +1 -0
  20. package/dist/chunk-4API3ZCT.js +1387 -0
  21. package/dist/chunk-4API3ZCT.js.map +1 -0
  22. package/dist/chunk-LXZDJJXR.js +476 -0
  23. package/dist/chunk-LXZDJJXR.js.map +1 -0
  24. package/dist/chunk-ZBP4RHNW.js +805 -0
  25. package/dist/chunk-ZBP4RHNW.js.map +1 -0
  26. package/dist/db-IOYMX64U.js +87 -0
  27. package/dist/db-IOYMX64U.js.map +1 -0
  28. package/dist/index.cjs +1689 -0
  29. package/dist/index.cjs.map +1 -0
  30. package/dist/index.d.cts +210 -0
  31. package/dist/index.d.ts +210 -0
  32. package/dist/index.js +275 -0
  33. package/dist/index.js.map +1 -0
  34. package/dist/server-63DR2HE5.js +6062 -0
  35. package/dist/server-63DR2HE5.js.map +1 -0
  36. package/dist/worker-3ECJHPRE.js +88 -0
  37. package/dist/worker-3ECJHPRE.js.map +1 -0
  38. package/package.json +76 -0
@@ -0,0 +1,1438 @@
1
+ #!/usr/bin/env node
2
+ "use strict";
3
// Bundler interop helpers: short aliases for the Object reflection primitives.
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;

/**
 * Mirror every own property of `from` onto `to` as a live getter.
 * Keys already owned by `to`, and the single key named by `except`, are skipped.
 * Non-object, non-function sources are ignored. Returns `to`.
 */
var __copyProps = (to, from, except, desc) => {
  const copyable = (from && typeof from === "object") || typeof from === "function";
  if (!copyable) {
    return to;
  }
  for (const key of __getOwnPropNames(from)) {
    if (key === except || __hasOwnProp.call(to, key)) {
      continue;
    }
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, {
      get: () => from[key],
      // A missing descriptor is treated as enumerable.
      enumerable: !desc || desc.enumerable
    });
  }
  return to;
};

/**
 * Wrap a required module in an ESM-shaped namespace object.
 *
 * If the importer is in node compatibility mode, or the module was not
 * produced by a Babel-compatible ESM->CJS transform ("__esModule" is unset),
 * "default" is pointed at the CommonJS "module.exports" value.
 */
var __toESM = (mod, isNodeMode, target) => {
  target = mod != null ? __create(__getProtoOf(mod)) : {};
  const needsDefault = isNodeMode || !mod || !mod.__esModule;
  if (needsDefault) {
    __defProp(target, "default", { value: mod, enumerable: true });
  }
  return __copyProps(target, mod);
};
25
+
26
+ // src/cli.ts
27
+ var import_commander = require("commander");
28
+
29
+ // src/schemas.ts
30
+ var import_zod = require("zod");
31
// Options for a People-Also-Ask harvest run. Defaults below are applied by zod on parse.
var HarvestOptionsSchema = ((z) => z.object({
  query: z.string().min(1),
  location: z.string().optional(),
  gl: z.string().length(2).default("us"),
  hl: z.string().length(2).default("en"),
  depth: z.number().int().min(1).max(30).default(3),
  maxQuestions: z.number().int().min(1).max(1000).default(100),
  headless: z.boolean().default(false),
  profileDir: z.string().optional(),
  proxy: z.string().url().optional(),
  kernelApiKey: z.string().optional(),
  kernelProxyId: z.string().optional(),
  outputDir: z.string().default("./paa-output"),
  format: z.enum(["json", "csv", "both"]).default("both"),
  serpOnly: z.boolean().default(false),
  pages: z.number().int().min(1).max(2).default(1)
}))(import_zod.z);

// Options for a Maps place lookup. Note that `headless` defaults to true here.
var MapsPlaceOptionsSchema = ((z) => z.object({
  businessName: z.string().min(1),
  location: z.string().min(1),
  gl: z.string().length(2).default("us"),
  hl: z.string().length(2).default("en"),
  includeReviews: z.boolean().default(false),
  maxReviews: z.number().int().min(1).max(500).default(50),
  kernelApiKey: z.string().optional(),
  kernelProxyId: z.string().optional(),
  headless: z.boolean().default(true)
}))(import_zod.z);

// One scraped question/answer pair; only `question` is mandatory.
var RawPAAItemSchema = ((z) => z.object({
  question: z.string().min(1),
  answer: z.string().optional(),
  sourceTitle: z.string().optional(),
  sourceSite: z.string().optional(),
  sourceCite: z.string().optional()
}))(import_zod.z);
66
+
67
+ // src/driver/BrowserDriver.ts
68
+ var import_playwright_extra = require("playwright-extra");
69
+ var import_puppeteer_extra_plugin_stealth = __toESM(require("puppeteer-extra-plugin-stealth"), 1);
70
+ var import_playwright = require("playwright");
71
+ var import_sdk = __toESM(require("@onkernel/sdk"), 1);
72
+
73
+ // src/selectors.ts
74
// CSS selectors, attribute names and marker text used when reading result pages.

// "People also ask" accordion.
var PAASelectors = {
  container: ".eJH8qe.adDDi",
  dataInitq: "[data-initq]",
  item: ".related-question-pair",
  itemDataQ: "data-q",
  itemDataInitQ: "data-initq",
  itemQuestionEl: ".JlqpRe",
  answerContainer: ".bCOlv",
  sourceTitle: "h3",
  sourceSite: ".VuuXrf",
  sourceCite: "cite",
  clickTarget: ".JlqpRe",
  expandedClass: "aoRk1c",
  captchaMarker: '#captcha-form, #recaptcha, form[action*="/sorry/"], .g-recaptcha, [data-sitekey]'
};

// Video carousel on the main results page.
var VideoSelectors = {
  container: 'div[jscontroller="HWk0Gf"]',
  sectionHeading: '.mgAbYb[role="heading"]',
  item: "a.rIRoqf"
};

// Short-video results; items use the same anchor selector as the video carousel.
var ShortVideoSelectors = {
  udm: "39",
  item: VideoSelectors.item,
  durationPattern: /^\d+:\d+$/,
  platforms: ["YouTube", "TikTok", "Instagram", "Facebook", "X"]
};

// Forum / discussion links.
var ForumSelectors = {
  section: ".ULSxyf",
  item: "a.KYg7td.INpicf",
  title: ".hyYc0c",
  source: ".K4ETW"
};

// "What people are saying" cards.
var WhatPeopleSayingSelectors = {
  sectionTag: "g-section-with-header",
  sectionHeadingText: "What people are saying",
  card: '.dRzkFf[role="listitem"]',
  cardLink: 'a.WlydOe[jsname="YKoRaf"]',
  titleH1: "h1.WQWxe",
  titleDiv: ".eAaXgc",
  popularCommentLabel: ".qgdis",
  source: ".sTl1Td",
  platformBadge: ".appd0, .KrMNbf",
  ytChannel: ".sjVJQd",
  ytDate: ".PLq9Je",
  authorNote: ".nDgy9d"
};

// AI Overview block.
var AIOverviewSelectors = {
  root: '[data-hveid="CBMQAA"]',
  wrapper: ".Fgyi2e",
  citations: '.Fgyi2e [data-hveid] a[jsname="pxBnId"]'
};

// AI Mode block: same wrapper/citation selectors as the overview, different root.
var AIModeSelectors = {
  ...AIOverviewSelectors,
  root: '[data-hveid="CAUQAA"]'
};

// Organic results.
var OrganicSelectors = {
  result: ".wHYlTd.tF2Cxc",
  title: "h3.LC20lb",
  siteName: ".VuuXrf",
  cite: "cite.tjvcx",
  snippet: ".VwiC3b",
  redditCite: "cite.qLRx3b",
  ratingWrap: ".Y0A0hc",
  ratingValue: ".yi40Hd",
  reviewCount: ".RDApEe"
};

// Local pack cards; rating selectors match the organic ones.
var LocalPackSelectors = {
  headingText: "Businesses",
  card: ".w7Dbne",
  name: ".OSrXXb",
  ratingValue: OrganicSelectors.ratingValue,
  reviewCount: OrganicSelectors.reviewCount
};
148
+
149
+ // src/errors.ts
150
// Default remediation hint attached to CAPTCHA failures.
var RECAPTCHA_INSTRUCTIONS = "Google returned a CAPTCHA. Run with --headless=false to re-warm the browser profile, then retry.";

/**
 * Raised when a CAPTCHA marker is found on the page.
 * `instructions` carries the remediation text that is also appended to the message.
 */
var CaptchaError = class extends Error {
  constructor(instructions) {
    super(`CAPTCHA detected. ${instructions}`);
    this.instructions = instructions;
    this.name = "CaptchaError";
  }
};

/**
 * Raised when navigation or extraction fails.
 * `cause` holds the optional underlying error and is always present as an own property.
 */
var ExtractionError = class extends Error {
  constructor(message, cause) {
    super(message);
    this.cause = cause;
    this.name = "ExtractionError";
  }
};
167
+
168
+ // src/driver/BrowserDriver.ts
169
+ import_playwright_extra.chromium.use((0, import_puppeteer_extra_plugin_stealth.default)());
170
+ var DESKTOP_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36";
171
/**
 * Turn a channel reference into the URL of that channel's "/videos" tab.
 *
 * Accepted inputs:
 *   - a youtube.com URL (scheme optional; "www." / "m." hosts allowed) whose path is
 *     "/@handle", "/channel/<id>", "/c/<name>" or "/user/<name>";
 *   - a bare channel ID ("UC" followed by at least 20 word/dash characters);
 *   - a bare handle, with or without the leading "@".
 * Leading/trailing slashes and a trailing "/videos" on bare inputs are ignored.
 *
 * @param {string} channelInput - handle, channel ID or channel URL.
 * @returns {string} canonical https://www.youtube.com/.../videos URL.
 * @throws {Error} when the input is missing, not a string, not on youtube.com,
 *   malformed, or not one of the supported shapes.
 */
function buildYouTubeChannelVideosUrl(channelInput) {
  // Missing or non-string input is reported the same way as an empty string,
  // rather than surfacing as a TypeError from `.trim()`.
  const raw = typeof channelInput === "string" ? channelInput.trim() : "";
  if (!raw) throw new Error("channelHandle is required");

  const hasScheme = /^https?:\/\//i.test(raw);
  const urlLike = hasScheme || /^(www\.|m\.)?youtube\.com\//i.test(raw);
  if (urlLike) {
    let parsed;
    try {
      parsed = new URL(hasScheme ? raw : `https://${raw}`);
    } catch (err) {
      // e.g. "https://" with no host: report it as a validation failure.
      throw new Error("channel URL is malformed", { cause: err });
    }
    const host = parsed.hostname.replace(/^www\./, "").replace(/^m\./, "").toLowerCase();
    if (host !== "youtube.com") throw new Error("channel URL must be on youtube.com");

    const [first = "", second = ""] = parsed.pathname.split("/").filter(Boolean);
    if (first.startsWith("@")) return `https://www.youtube.com/${first}/videos`;
    if (first === "channel" && second) return `https://www.youtube.com/channel/${second}/videos`;
    if ((first === "c" || first === "user") && second) return `https://www.youtube.com/${first}/${second}/videos`;
    throw new Error("channel URL must be a YouTube handle, /channel/UC..., /c/..., or /user/... URL");
  }

  const stripped = raw.replace(/^\/+/, "").replace(/\/+$/, "");
  const withoutVideos = stripped.replace(/\/videos$/i, "");
  if (/^UC[\w-]{20,}$/.test(withoutVideos)) {
    return `https://www.youtube.com/channel/${withoutVideos}/videos`;
  }
  const handle = withoutVideos.startsWith("@") ? withoutVideos : `@${withoutVideos}`;
  if (!/^@[\w.-]+$/.test(handle)) {
    throw new Error("channelHandle must be an @handle, UC channel ID, or YouTube channel URL");
  }
  return `https://www.youtube.com/${handle}/videos`;
}
198
/**
 * Owns a single browser / context / page and the navigation helpers built on it.
 *
 * Two launch modes exist:
 *   - remote: when `config.kernelApiKey` is set, a Kernel browser session is created
 *     and attached to over CDP;
 *   - local: otherwise Chromium is launched through playwright-extra, either as a
 *     persistent context (`config.profileDir`) or as a fresh browser + context.
 */
var BrowserDriver = class {
  browser = null;
  context = null;
  page = null;
  kernelClient = null;
  kernelSessionId = null;

  /**
   * Start (or attach to) a browser and leave `this.page` ready for navigation.
   * Reads `kernelApiKey`, `kernelProxyId`, `headless`, `proxy`, `viewport`,
   * `locale` and `profileDir` from `config`.
   */
  async launch(config) {
    if (config.kernelApiKey) {
      this.kernelClient = new import_sdk.default({ apiKey: config.kernelApiKey });
      const kernelBrowser = await this.kernelClient.browsers.create({
        stealth: true,
        timeout_seconds: 600,
        ...config.kernelProxyId ? { proxy_id: config.kernelProxyId } : {}
      });
      this.kernelSessionId = kernelBrowser.session_id;
      try {
        this.browser = await import_playwright.chromium.connectOverCDP(kernelBrowser.cdp_ws_url);
        this.context = this.browser.contexts()[0] ?? await this.browser.newContext();
        await this.installEsbuildHelperShims(this.context);
        this.page = this.context.pages()[0] ?? await this.context.newPage();
      } catch (err) {
        // The remote session already exists at this point; release it so a failed
        // attach does not leave it running. Cleanup is best-effort: the attach
        // error is the one the caller needs to see.
        await this.close().catch(() => {
        });
        throw err;
      }
      return;
    }
    const launchOpts = {
      headless: config.headless,
      proxy: config.proxy ? { server: config.proxy } : void 0
    };
    const ctxOpts = {
      viewport: config.viewport,
      locale: config.locale,
      userAgent: DESKTOP_USER_AGENT
    };
    if (config.profileDir) {
      // Persistent contexts take launch and context options in one bag.
      this.context = await import_playwright_extra.chromium.launchPersistentContext(config.profileDir, {
        ...launchOpts,
        ...ctxOpts
      });
      await this.installEsbuildHelperShims(this.context);
      this.page = await this.context.newPage();
    } else {
      this.browser = await import_playwright_extra.chromium.launch(launchOpts);
      this.context = await this.browser.newContext(ctxOpts);
      await this.installEsbuildHelperShims(this.context);
      this.page = await this.context.newPage();
    }
  }

  /**
   * Register an init script that defines `__name` and `__publicField` on the page's
   * global object when they are missing, so callbacks that reference those helper
   * names can run inside the page.
   */
  async installEsbuildHelperShims(context) {
    await context.addInitScript(() => {
      const g = globalThis;
      if (typeof g.__name !== "function") g.__name = (fn) => fn;
      if (typeof g.__publicField !== "function") g.__publicField = (obj, key, value) => {
        obj[key] = value;
        return value;
      };
    });
  }

  /**
   * Open the Google results page for `query` and report whether a
   * "People also ask" block is present.
   *
   * @returns {Promise<{hasPaa: boolean}>}
   * @throws {ExtractionError} when navigation fails.
   * @throws {CaptchaError} when a CAPTCHA marker is on the page (on a Kernel
   *   session only after waiting up to 45s for the PAA container to appear).
   */
  async navigateToSERP(query, uule, gl, hl) {
    const params = new URLSearchParams({ q: query, gl, hl });
    if (uule) params.set("uule", uule);
    const url = "https://www.google.com/search?" + params.toString();
    await this.navigateTo(url);

    const captchaCount = await this.page.locator(PAASelectors.captchaMarker).count();
    if (captchaCount > 0) {
      if (this.kernelClient) {
        try {
          await this.page.waitForSelector(PAASelectors.container, { timeout: 45e3 });
          return { hasPaa: true };
        } catch {
          throw new CaptchaError(this.captchaMessage());
        }
      }
      throw new CaptchaError(this.captchaMessage());
    }

    // Fast path: the block is often already in the initial DOM.
    const fastFound = await this.page.waitForSelector(PAASelectors.item, { timeout: 4e3 }).catch(() => null);
    if (fastFound) return { hasPaa: true };

    const captchaAfter = await this.page.locator(PAASelectors.captchaMarker).count();
    if (captchaAfter > 0) throw new CaptchaError(this.captchaMessage());

    // Slow path: scroll down in six steps, re-checking after each one.
    for (let i = 1; i <= 6; i++) {
      await this.page.evaluate((f) => {
        window.scrollTo(0, document.body.scrollHeight * f);
      }, i / 6);
      await this.page.waitForTimeout(600);
      const count = await this.page.locator(PAASelectors.item).count();
      if (count > 0) return { hasPaa: true };
    }
    return { hasPaa: false };
  }

  /**
   * Build a one-line description of where the page actually ended up (final URL,
   * title, up to 400 characters of body text, and consent / bot-challenge /
   * redirect flags). Never throws; failures are reported in the returned string.
   */
  async captureDiagnostics(intendedUrl) {
    try {
      const finalUrl = this.page.url();
      const title = await this.page.title().catch(() => "");
      const bodySnippet = await this.page.evaluate(() => {
        const t = (document.body?.innerText ?? "").replace(/\s+/g, " ").trim();
        return t.slice(0, 400);
      }).catch(() => "");
      const consent = /consent\.google\./.test(finalUrl) || /before you continue/i.test(bodySnippet);
      const recaptcha = /recaptcha|unusual traffic|are you a robot/i.test(bodySnippet);
      const flags = [
        consent ? "CONSENT_WALL" : "",
        recaptcha ? "BOT_CHALLENGE" : "",
        finalUrl !== intendedUrl ? "REDIRECTED" : ""
      ].filter(Boolean).join(",");
      return `intended=${intendedUrl} | final=${finalUrl} | title="${title}" | flags=[${flags}] | body="${bodySnippet}"`;
    } catch (e) {
      return `diagnostics-failed: ${e.message}`;
    }
  }

  /** Message attached to CaptchaError; wording depends on whether a Kernel session is in use. */
  captchaMessage() {
    return this.kernelClient ? "Google returned a CAPTCHA on this Kernel.sh session \u2014 retrying with a fresh session." : RECAPTCHA_INSTRUCTIONS;
  }

  /**
   * Navigate to `url`, waiting for DOMContentLoaded (45s timeout).
   * @throws {ExtractionError} with diagnostics in the message and the original error as `cause`.
   */
  async navigateTo(url) {
    try {
      await this.page.goto(url, { waitUntil: "domcontentloaded", timeout: 45e3 });
    } catch (err) {
      const diag = await this.captureDiagnostics(url);
      throw new ExtractionError(`page.goto failed: ${err.message} | ${diag}`, err);
    }
  }

  /**
   * Navigate to a channel's videos tab, waiting for network idle (30s timeout).
   * @throws {Error} from URL building when `channelHandle` is not recognised.
   * @throws {ExtractionError} when navigation fails.
   */
  async navigateToChannel(channelHandle) {
    const url = buildYouTubeChannelVideosUrl(channelHandle);
    try {
      await this.page.goto(url, { waitUntil: "networkidle", timeout: 3e4 });
    } catch (err) {
      const diag = await this.captureDiagnostics(url);
      throw new ExtractionError(`navigateToChannel failed: ${err.message} | ${diag}`, err);
    }
  }

  /** Run `fn(arg)` inside the current page. */
  async evaluate(fn, arg) {
    return this.page.evaluate(fn, arg);
  }

  /** The current page, or null before launch / after close. */
  getPage() {
    return this.page;
  }

  /**
   * Release everything this driver holds. Safe to call repeatedly and after a
   * failed launch.
   *
   * The Kernel session is deleted whenever one was created, even if no browser
   * was ever attached. Deletion failures are logged, not thrown.
   */
  async close() {
    const browser = this.browser;
    const context = this.context;
    const client = this.kernelClient;
    const sessionId = this.kernelSessionId;
    // Clear state first so a concurrent or repeated close() finds nothing to do.
    this.browser = null;
    this.context = null;
    this.page = null;
    this.kernelSessionId = null;
    this.kernelClient = null;
    try {
      if (browser) {
        await browser.close();
      } else if (context) {
        // Persistent-context launches have no separate browser handle.
        await context.close();
      }
    } finally {
      if (client && sessionId) {
        await client.browsers.deleteByID(sessionId).catch(
          (err) => console.warn("Kernel session cleanup failed:", err)
        );
      }
    }
  }
};
361
+
362
+ // src/locations.ts
363
/**
 * Lookup from a lowercase place name (or nickname) to the canonical
 * "City,Region,Country" string.
 *
 * Each row is [canonical, ...extraAliases]. The primary key is the lowercased
 * city part of the canonical name; aliases are registered immediately after it.
 */
var LOCATIONS = (() => {
  const rows = [
    // United States
    ["Austin,Texas,United States"],
    ["New York,New York,United States", "new york city", "nyc"],
    ["Los Angeles,California,United States", "la"],
    ["Chicago,Illinois,United States"],
    ["Houston,Texas,United States"],
    ["Phoenix,Arizona,United States"],
    ["Philadelphia,Pennsylvania,United States", "philly"],
    ["San Antonio,Texas,United States"],
    ["Dallas,Texas,United States"],
    ["Miami,Florida,United States"],
    ["Seattle,Washington,United States"],
    ["Denver,Colorado,United States"],
    ["Loveland,Colorado,United States", "loveland co"],
    ["Fort Collins,Colorado,United States"],
    ["Boulder,Colorado,United States"],
    ["Colorado Springs,Colorado,United States"],
    ["Boston,Massachusetts,United States"],
    ["Atlanta,Georgia,United States"],
    ["San Francisco,California,United States", "sf"],
    ["Portland,Oregon,United States"],
    ["Las Vegas,Nevada,United States"],
    ["Minneapolis,Minnesota,United States"],
    ["Detroit,Michigan,United States"],
    ["Nashville,Tennessee,United States"],
    ["Charlotte,North Carolina,United States"],
    ["Orlando,Florida,United States"],
    ["San Diego,California,United States"],
    ["Baltimore,Maryland,United States"],
    ["Sacramento,California,United States"],
    ["Columbus,Ohio,United States"],
    ["Indianapolis,Indiana,United States"],
    ["San Jose,California,United States"],
    ["Fort Worth,Texas,United States"],
    ["Jacksonville,Florida,United States"],
    ["Memphis,Tennessee,United States"],
    ["Louisville,Kentucky,United States"],
    ["Raleigh,North Carolina,United States"],
    ["Richmond,Virginia,United States"],
    ["Salt Lake City,Utah,United States"],
    // Canada
    ["Toronto,Ontario,Canada"],
    ["Vancouver,British Columbia,Canada"],
    ["Montreal,Quebec,Canada"],
    ["Calgary,Alberta,Canada"],
    ["Ottawa,Ontario,Canada"],
    // United Kingdom
    ["London,England,United Kingdom"],
    ["Manchester,England,United Kingdom"],
    ["Birmingham,England,United Kingdom"],
    ["Edinburgh,Scotland,United Kingdom"],
    ["Glasgow,Scotland,United Kingdom"],
    ["Leeds,England,United Kingdom"],
    // Australia
    ["Sydney,New South Wales,Australia"],
    ["Melbourne,Victoria,Australia"],
    ["Brisbane,Queensland,Australia"],
    ["Perth,Western Australia,Australia"],
    ["Adelaide,South Australia,Australia"],
    // Ireland
    ["Dublin,Leinster,Ireland"]
  ];
  const table = {};
  for (const [canonical, ...aliases] of rows) {
    const city = canonical.slice(0, canonical.indexOf(",")).toLowerCase();
    for (const key of [city, ...aliases]) {
      table[key] = canonical;
    }
  }
  return table;
})();
427
+
428
+ // src/uule.ts
429
/**
 * Encode a canonical place name ("City,Region,Country") as a Google `uule` value.
 *
 * Layout: the fixed prefix "w+CAIQICI", then ONE character that encodes the name's
 * byte length (looked up in the 64-character key table below), then the base64 of
 * the name itself without "=" padding.
 *
 * The length marker must be emitted as its own character. Base64-encoding
 * `chr(length) + name` as a single buffer shifts every following 6-bit group and
 * yields a different, unrecognised string. The length is measured in UTF-8 bytes,
 * so non-ASCII names are counted correctly.
 *
 * @param {string} name - canonical location name.
 * @returns {string} value for the `uule` query parameter.
 */
function encodeUule(name) {
  const LENGTH_KEY = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
  const bytes = Buffer.from(name, "utf8");
  const lengthChar = LENGTH_KEY[bytes.length % LENGTH_KEY.length];
  const payload = bytes.toString("base64").replace(/=+$/, "");
  return `w+CAIQICI${lengthChar}${payload}`;
}
433
/**
 * Resolve a user-supplied place name to its canonical form.
 *
 * The input is lowercased and trimmed, then looked up in LOCATIONS. Unknown
 * names are returned exactly as given (untrimmed, original case).
 *
 * Only own entries of the table count as hits: inputs such as "constructor" or
 * "__proto__" would otherwise resolve to inherited Object.prototype members and
 * return a non-string.
 *
 * @param {string} input - free-form location text.
 * @returns {string} canonical name, or `input` when there is no match.
 */
function normalizeLocation(input) {
  const key = input.toLowerCase().trim();
  return Object.prototype.hasOwnProperty.call(LOCATIONS, key) ? LOCATIONS[key] : input;
}
437
+
438
+ // src/lib/paa-answer-cleanup.ts
439
// Upper bound, in characters, for a cleaned answer.
var MAX_ANSWER_LENGTH = 1_200;

// Phrases removed wherever they occur (all global + case-insensitive).
var BOILERPLATE_PATTERNS = [
  /An AI Overview is not available for this search/gi,
  /Can't generate an AI overview right now\.?\s*Try again later\.?/gi,
  /\bAI Overview\b/gi,
  /\bView all\b/gi
];

// Markers after which the remaining text is discarded; the earliest match wins.
var CUT_MARKERS = [
  // Footer / disclaimer phrases.
  /\bRelated Links\b/i,
  /\bAsk anything in\s*AI Mode\b/i,
  /\bAI can make mistakes\b/i,
  /\bThis is for informational purposes only\b/i,
  /\bShow more\b/i,
  // "<n> sites" counters.
  /\b\d+\s+sites\b/i,
  // A 1-2 digit number, then m/s/h, then a capitalised word.
  /\b\d{1,2}\s*[msh]\s*[A-Z][A-Za-z]/,
  // Platform name immediately followed by a middle dot.
  /\b(?:YouTube|Reddit|Facebook|Instagram|TikTok)·/
];
456
/**
 * Tidy scraped text: turn non-breaking spaces into spaces, re-insert spaces that
 * were lost between glued-together words, collapse whitespace runs and trim.
 * The rewrites run strictly in the order listed.
 */
function normalizeWhitespace(text) {
  const rewrites = [
    [/\u00a0/g, " "],
    [/([.!?])([A-Z])/g, "$1 $2"],
    [/([:;])([A-Z])/g, "$1 $2"],
    [/([a-z])([A-Z][a-z])/g, "$1 $2"],
    [/(\d)([A-Z][a-z])/g, "$1 $2"],
    [/([a-z])(\d)/g, "$1 $2"],
    [/\s+/g, " "]
  ];
  let result = text;
  for (const [pattern, replacement] of rewrites) {
    result = result.replace(pattern, replacement);
  }
  return result.trim();
}
459
/**
 * Truncate `text` at the earliest position where any CUT_MARKERS pattern matches.
 * Returns `text` unchanged when nothing matches.
 */
function cutAtFirstMarker(text) {
  const positions = [];
  for (const marker of CUT_MARKERS) {
    const hit = marker.exec(text);
    marker.lastIndex = 0;
    if (hit) positions.push(hit.index);
  }
  if (positions.length === 0) return text;
  return text.slice(0, Math.min(...positions));
}
468
/**
 * Drop everything from the (case-insensitive) first occurrence of the source
 * title onwards. Titles shorter than 8 characters are ignored, and the cut is
 * only made when the title starts after position 40.
 */
function cutAtSourceTitle(text, sourceTitle) {
  const trimmedTitle = sourceTitle?.trim();
  const usable = Boolean(trimmedTitle) && trimmedTitle.length >= 8;
  if (!usable) {
    return text;
  }
  const position = text.toLowerCase().indexOf(trimmedTitle.toLowerCase());
  if (position <= 40) {
    return text;
  }
  return text.slice(0, position);
}
474
/**
 * Given the text that precedes a URL, return the index at which trailing
 * source-attribution text appears to begin. Checks, in order:
 *   1. a bullet/middle-dot followed by a "Mon D, YYYY" date, if it starts after index 40;
 *   2. within the last 260 characters, the first sentence break whose remainder
 *      is longer than 20 characters, opens with a title-like word and contains a
 *      title hint in its first 160 characters;
 *   3. the last sentence break in that window;
 *   4. otherwise the full length (no cut).
 */
function findAttributionCut(beforeUrl) {
  const datePosition = beforeUrl.search(/[•·]\s*(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{1,2},\s+\d{4}/i);
  if (datePosition > 40) {
    return datePosition;
  }

  const windowStart = Math.max(0, beforeUrl.length - 260);
  const windowText = beforeUrl.slice(windowStart);
  const breakOffsets = Array.from(
    windowText.matchAll(/[.!?]\s*(?=[A-Z][A-Za-z0-9"'$])/g),
    (hit) => hit.index
  );

  const titleBreak = breakOffsets.find((offset) => {
    const rest = windowText.slice(offset + 1).trim();
    if (rest.length <= 20) return false;
    if (!/^(?:Best|Top|What|How|Why|When|Where|Which|Can|Should|Is|Are|Do|Does)\b/i.test(rest)) return false;
    return /(?:\s[-|]\s|Heating|Cooling|Company|Services|Blog|Guide|Review)/i.test(rest.slice(0, 160));
  });
  if (titleBreak !== undefined) {
    return windowStart + titleBreak + 1;
  }

  if (breakOffsets.length > 0) {
    return windowStart + breakOffsets[breakOffsets.length - 1] + 1;
  }
  return beforeUrl.length;
}
492
/**
 * When the text contains an http(s) URL that is not at position 0, keep only
 * the part before it, further trimmed to where findAttributionCut says the
 * attribution begins. Text with no URL, or with a URL at the very start, is
 * returned unchanged.
 */
function cutAtUrlAttribution(text) {
  const urlStart = text.search(/https?:\/\/\S+/i);
  // -1: no URL. 0: URL is the first thing in the text; left alone.
  if (urlStart <= 0) {
    return text;
  }
  const prefix = text.slice(0, urlStart);
  return prefix.slice(0, findAttributionCut(prefix));
}
498
/**
 * Cap `text` at MAX_ANSWER_LENGTH characters. When the capped text contains a
 * sentence terminator (".", "!" or "?") beyond index 240, it is cut just after
 * the last one; otherwise the hard cut is kept. The result is trimmed.
 */
function trimToSentenceLimit(text) {
  if (text.length <= MAX_ANSWER_LENGTH) {
    return text;
  }
  const head = text.slice(0, MAX_ANSWER_LENGTH);
  let lastStop = -1;
  for (const mark of [".", "!", "?"]) {
    lastStop = Math.max(lastStop, head.lastIndexOf(mark));
  }
  const kept = lastStop > 240 ? head.slice(0, lastStop + 1) : head;
  return kept.trim();
}
504
/**
 * Clean a scraped answer for output.
 *
 * Steps, in order: normalise whitespace; strip a leading echo of the question;
 * bail out on an error banner; remove boilerplate phrases and "Name+N" /
 * "domain.tld+N" source chips; cut at the first marker, at the source title and
 * before a URL attribution; drop a trailing date; cap the length.
 *
 * @returns {string|undefined} the cleaned text, or undefined when the input is
 *   empty, is an error banner, or nothing is left after cleaning.
 */
function cleanPAAAnswerText(answer, question, sourceTitle) {
  if (!answer) return void 0;

  let text = normalizeWhitespace(answer);

  const echoedQuestion = question ? normalizeWhitespace(question) : "";
  const startsWithQuestion = echoedQuestion !== "" && text.toLowerCase().startsWith(echoedQuestion.toLowerCase());
  if (startsWithQuestion) {
    text = text.slice(echoedQuestion.length).trim();
  }

  if (/^An error has occurred\.?\s*Please try again later\.?/i.test(text)) {
    return void 0;
  }

  text = BOILERPLATE_PATTERNS.reduce((current, pattern) => current.replace(pattern, " "), text);
  text = text
    .replace(/\b[A-Z][A-Za-z&'\u2019 -]{2,60}\+\d+\b/g, " ")
    .replace(/\b(?:[a-z0-9-]+\.)+[a-z]{2,}\+\d+\b/gi, " ");

  const stages = [
    normalizeWhitespace,
    cutAtFirstMarker,
    (value) => cutAtSourceTitle(value, sourceTitle),
    cutAtUrlAttribution,
    normalizeWhitespace
  ];
  for (const stage of stages) {
    text = stage(text);
  }

  text = text.replace(/\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{1,2},\s+\d{4}$/i, "").trim();
  text = trimToSentenceLimit(text);

  if (!text) return void 0;
  if (/^An error has occurred\.?\s*Please try again later\.?$/i.test(text)) return void 0;
  return text;
}
529
+
530
+ // src/extractor/PAAExtractor.ts
531
+ var PAAExtractor = class {
532
+ constructor(driver, reporter) {
533
+ this.driver = driver;
534
+ this.reporter = reporter;
535
+ }
536
+ driver;
537
+ reporter;
538
+ normalizeQuestion(q) {
539
+ return q.toLowerCase().replace(/[^\w\s]/g, "").replace(/\s+/g, " ").trim();
540
+ }
541
+ async extractVisibleItems(page) {
542
+ const sels = PAASelectors;
543
+ const raw = await page.evaluate((selectors) => {
544
+ function cleanText(el) {
545
+ if (!el) return "";
546
+ const parts = [];
547
+ for (const n of el.childNodes) {
548
+ if (n.nodeType === Node.TEXT_NODE) {
549
+ const text = n.textContent?.trim();
550
+ if (text) parts.push(text);
551
+ } else if (n.tagName === "STYLE" || n.tagName === "SCRIPT") {
552
+ continue;
553
+ } else {
554
+ const text = cleanText(n);
555
+ if (text) parts.push(text);
556
+ }
557
+ }
558
+ return parts.join(" ").replace(/\s+/g, " ").trim();
559
+ }
560
+ return Array.from(document.querySelectorAll(selectors.item)).map((pair) => ({
561
+ question: pair.getAttribute(selectors.itemDataQ) || pair.getAttribute(selectors.itemDataInitQ) || "",
562
+ answer: cleanText(pair.querySelector(selectors.answerContainer)) || void 0,
563
+ sourceTitle: pair.querySelector(selectors.sourceTitle)?.innerText?.trim() || void 0,
564
+ sourceSite: pair.querySelector(selectors.sourceSite)?.innerText?.trim() || void 0,
565
+ sourceCite: pair.querySelector(selectors.sourceCite)?.innerText?.trim() || void 0
566
+ }));
567
+ }, sels);
568
+ return raw.flatMap((item) => {
569
+ const cleaned = {
570
+ ...item,
571
+ answer: cleanPAAAnswerText(item.answer, item.question, item.sourceTitle)
572
+ };
573
+ const result = RawPAAItemSchema.safeParse(cleaned);
574
+ if (!result.success) {
575
+ console.warn("[PAAExtractor] item parse failed:", item.question, result.error.issues[0]?.message);
576
+ return [];
577
+ }
578
+ return [result.data];
579
+ });
580
+ }
581
+ async clickItem(page, questionText) {
582
+ try {
583
+ const pairLocator = page.locator(
584
+ `${PAASelectors.item}[data-q="${questionText}"], ${PAASelectors.item}[data-initq="${questionText}"]`
585
+ ).first();
586
+ await pairLocator.click();
587
+ } catch {
588
+ }
589
+ }
590
+ toFlatRow(item, depth, parentQuestion, seed) {
591
+ return {
592
+ seed_query: seed,
593
+ question: item.question,
594
+ answer: item.answer ?? "",
595
+ source_title: item.sourceTitle ?? "",
596
+ source_site: item.sourceSite ?? "",
597
+ source_cite: item.sourceCite ?? "",
598
+ depth,
599
+ parent_question: parentQuestion ?? "",
600
+ extracted_at: (/* @__PURE__ */ new Date()).toISOString()
601
+ };
602
+ }
603
+ async runBFS(page, options) {
604
+ const seenKeys = /* @__PURE__ */ new Set();
605
+ const seenQs = /* @__PURE__ */ new Set();
606
+ const depthMap = /* @__PURE__ */ new Map();
607
+ const results = [];
608
+ const readAllQs = () => page.evaluate(
609
+ ({ sel, dataQ, dataInitQ, questionEl }) => Array.from(document.querySelectorAll(sel)).map(
610
+ (el) => el.getAttribute(dataQ) || el.getAttribute(dataInitQ) || el.querySelector(questionEl)?.innerText?.trim() || ""
611
+ ).filter(Boolean),
612
+ { sel: PAASelectors.item, dataQ: PAASelectors.itemDataQ, dataInitQ: PAASelectors.itemDataInitQ, questionEl: PAASelectors.itemQuestionEl }
613
+ );
614
+ const dupRates = [];
615
+ const orderedQs = [];
616
+ for (let round = 0; round < options.depth; round++) {
617
+ this.reporter.onDepth(round + 1);
618
+ if (seenQs.size >= options.maxQuestions) break;
619
+ const beforeQs = await readAllQs();
620
+ if (beforeQs.length >= options.maxQuestions) break;
621
+ const unexpandedItems = await page.$$(
622
+ `${PAASelectors.item}:not(.${PAASelectors.expandedClass})`
623
+ );
624
+ if (unexpandedItems.length === 0) break;
625
+ for (const item of unexpandedItems) {
626
+ try {
627
+ await item.scrollIntoViewIfNeeded();
628
+ await item.click({ force: true });
629
+ await page.waitForTimeout(500);
630
+ } catch {
631
+ }
632
+ }
633
+ await page.waitForTimeout(1500);
634
+ const afterQs = await readAllQs();
635
+ const newQs = afterQs.slice(beforeQs.length);
636
+ const newDups = newQs.filter((q) => seenQs.has(q)).length;
637
+ const dupRate = newQs.length > 0 ? newDups / newQs.length : 0;
638
+ dupRates.push(dupRate);
639
+ if (dupRates.length > 2) dupRates.shift();
640
+ const rollingDupRate = dupRates.reduce((a, b) => a + b, 0) / dupRates.length;
641
+ for (const q of afterQs) {
642
+ if (!seenQs.has(q)) {
643
+ seenQs.add(q);
644
+ orderedQs.push(q);
645
+ }
646
+ if (!depthMap.has(q)) depthMap.set(q, round + 1);
647
+ }
648
+ if (afterQs.length === beforeQs.length) break;
649
+ if (rollingDupRate >= 0.6) break;
650
+ }
651
+ const itemMap = new Map((await this.extractVisibleItems(page)).map((i) => [i.question, i]));
652
+ for (const q of orderedQs) {
653
+ if (results.length >= options.maxQuestions) break;
654
+ const key = this.normalizeQuestion(q);
655
+ if (seenKeys.has(key)) continue;
656
+ seenKeys.add(key);
657
+ const d = depthMap.get(q) ?? 1;
658
+ const item = itemMap.get(q);
659
+ if (item) {
660
+ results.push(this.toFlatRow(item, d, null, options.query));
661
+ this.reporter.onQuestion({ question: item.question, answer: item.answer ?? null, sourceTitle: item.sourceTitle ?? null, sourceSite: item.sourceSite ?? null, sourceCite: item.sourceCite ?? null, depth: d, parentQuestion: null, children: [] });
662
+ } else {
663
+ results.push(this.toFlatRow({ question: q, answer: void 0, sourceTitle: void 0, sourceSite: void 0, sourceCite: void 0 }, d, null, options.query));
664
+ }
665
+ }
666
+ return results;
667
+ }
668
+ async extractVideos(page) {
669
+ const vsels = VideoSelectors;
670
+ return page.evaluate((sels) => {
671
+ const results = [];
672
+ const containers = Array.from(document.querySelectorAll(sels.container));
673
+ for (const container of containers) {
674
+ const headingEl = container.querySelector(sels.sectionHeading);
675
+ const headingText = headingEl?.textContent?.trim() ?? "";
676
+ const type = headingText.toLowerCase().includes("short") ? "short_video" : "video";
677
+ const items = Array.from(container.querySelectorAll(sels.item));
678
+ for (const a of items) {
679
+ const href = a.href;
680
+ if (!href || !href.includes("youtube") && !href.includes("youtu.be")) continue;
681
+ const raw = a.textContent?.trim() ?? "";
682
+ const ytIdx = raw.indexOf("YouTube");
683
+ if (ytIdx === -1) continue;
684
+ const title = raw.slice(0, ytIdx).trim();
685
+ const remainder = raw.slice(ytIdx + 7).replace(/^[·\s·]+/, "");
686
+ const channelMatch = remainder.match(/^([^·\n]+)/);
687
+ const channel = channelMatch ? channelMatch[1].trim() : "";
688
+ if (title) results.push({ type, title, channel, platform: "YouTube", duration: "", url: href });
689
+ }
690
+ }
691
+ return results;
692
+ }, vsels);
693
+ }
694
+ async extractForums(page) {
695
+ const fsels = ForumSelectors;
696
+ return page.evaluate((sels) => {
697
+ const results = [];
698
+ const sections = Array.from(document.querySelectorAll(sels.section));
699
+ const forumSection = sections.find((s) => s.textContent?.includes("Discussions"));
700
+ if (!forumSection) return results;
701
+ const items = Array.from(forumSection.querySelectorAll(sels.item));
702
+ for (const a of items) {
703
+ const href = a.href;
704
+ if (!href) continue;
705
+ const titleEl = a.querySelector(sels.title);
706
+ const sourceEl = a.querySelector(sels.source);
707
+ const title = titleEl?.textContent?.trim() ?? "";
708
+ const source = sourceEl?.textContent?.trim() ?? "";
709
+ if (title) results.push({ title, source, url: href });
710
+ }
711
+ return results;
712
+ }, fsels);
713
+ }
714
+ async extractShortVideos(page, shortUrl) {
715
+ try {
716
+ await page.goto(shortUrl, { waitUntil: "domcontentloaded" });
717
+ await page.waitForTimeout(1500);
718
+ } catch {
719
+ return [];
720
+ }
721
+ const svSels = {
722
+ item: ShortVideoSelectors.item,
723
+ platforms: [...ShortVideoSelectors.platforms]
724
+ };
725
+ const raw = await page.evaluate((sels) => {
726
+ const seen = /* @__PURE__ */ new Set();
727
+ const results = [];
728
+ const items = Array.from(document.querySelectorAll(sels.item));
729
+ const videoHosts = ["youtube.com", "youtu.be", "tiktok.com", "instagram.com", "facebook.com", "fb.watch"];
730
+ const byHref = /* @__PURE__ */ new Map();
731
+ for (const a of items) {
732
+ const href = a.href;
733
+ if (!href) continue;
734
+ if (!videoHosts.some((h) => href.includes(h))) continue;
735
+ const text = a.textContent?.trim() ?? "";
736
+ if (!byHref.has(href)) byHref.set(href, []);
737
+ byHref.get(href).push(text);
738
+ }
739
+ for (const [href, texts] of byHref.entries()) {
740
+ if (seen.has(href)) continue;
741
+ seen.add(href);
742
+ const duration = texts.find((t) => /^\d+:\d+$/.test(t)) ?? "";
743
+ const titleText = texts.find((t) => !/^\d+:\d+$/.test(t) && t.length > 5) ?? "";
744
+ if (!titleText) continue;
745
+ let title = titleText;
746
+ let platform = "";
747
+ let channel = "";
748
+ for (const p of sels.platforms) {
749
+ let lastIdx = -1;
750
+ let search = 0;
751
+ while (true) {
752
+ const found = titleText.indexOf(p, search);
753
+ if (found === -1) break;
754
+ lastIdx = found;
755
+ search = found + 1;
756
+ }
757
+ if (lastIdx === -1) continue;
758
+ const after = titleText.slice(lastIdx + p.length);
759
+ const isSourceTag = /^[\s·]/.test(after) || after.trim() === "";
760
+ if (!isSourceTag) continue;
761
+ title = titleText.slice(0, lastIdx).trim();
762
+ platform = p;
763
+ const stripped = after.replace(/^[\s·]+/, "");
764
+ const dotIdx = stripped.indexOf("\xB7");
765
+ channel = (dotIdx === -1 ? stripped : stripped.slice(0, dotIdx)).trim();
766
+ break;
767
+ }
768
+ if (title) results.push({ title, channel, platform, duration, url: href });
769
+ }
770
+ return results;
771
+ }, svSels);
772
+ return raw.map((r) => ({ type: "short_video", ...r }));
773
+ }
774
+ async extractWhatPeopleSaying(page) {
775
+ const sels = WhatPeopleSayingSelectors;
776
+ return page.evaluate((s) => {
777
+ const section = Array.from(document.querySelectorAll(s.sectionTag)).find((el) => el.textContent?.includes(s.sectionHeadingText)) ?? document.querySelector(".yG4QQe.TBC9ub.NbhJ1c");
778
+ if (!section) return [];
779
+ return Array.from(section.querySelectorAll(s.card)).map((card) => {
780
+ const link = card.querySelector(s.cardLink);
781
+ const url = link?.href ?? "";
782
+ const titleH1 = card.querySelector(s.titleH1)?.textContent?.trim();
783
+ const titleDiv = card.querySelector(s.titleDiv)?.textContent?.trim();
784
+ const title = titleH1 ?? titleDiv ?? "";
785
+ const sourceText = card.querySelector(s.source)?.textContent?.trim() ?? "";
786
+ const platformEl = card.querySelector(s.platformBadge);
787
+ const platformText = platformEl?.textContent?.trim() ?? "";
788
+ const ytChannel = card.querySelector(s.ytChannel)?.textContent?.trim() ?? "";
789
+ const ytDate = card.querySelector(s.ytDate)?.textContent?.trim() ?? "";
790
+ const authorNote = card.querySelector(s.authorNote)?.textContent?.trim() ?? null;
791
+ const commentLabelEl = card.querySelector(s.popularCommentLabel);
792
+ let popularComment = null;
793
+ if (commentLabelEl) {
794
+ let next = commentLabelEl.nextSibling;
795
+ while (next) {
796
+ const t = next.textContent?.trim();
797
+ if (t) {
798
+ popularComment = t;
799
+ break;
800
+ }
801
+ next = next.nextSibling;
802
+ }
803
+ }
804
+ const allSpans = Array.from(card.querySelectorAll("span"));
805
+ const duration = allSpans.find((s2) => /^\d+:\d+$/.test(s2.textContent?.trim() ?? ""))?.textContent?.trim() ?? null;
806
+ const engagementParts = allSpans.map((s2) => s2.textContent?.trim() ?? "").filter(
807
+ (t) => /\d/.test(t) && (t.includes("comment") || t.includes("reaction") || t.includes("view") || t.includes("like") || t.includes("share"))
808
+ );
809
+ const engagement = engagementParts[0] ?? "";
810
+ const dateCandidates = allSpans.map((s2) => s2.textContent?.trim() ?? "").filter((t) => /\d+ (day|week|month|year|hour)s? ago|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec/.test(t));
811
+ const date = ytDate || (dateCandidates[0] ?? "");
812
+ const platform = platformText || (ytChannel ? "YouTube" : "");
813
+ const source = ytChannel || sourceText;
814
+ let type = "unknown";
815
+ const pl = platform.toLowerCase();
816
+ const src = source.toLowerCase();
817
+ const srcRaw = sourceText.toLowerCase();
818
+ if (pl.includes("reddit") || src.startsWith("r/")) type = "reddit";
819
+ else if (pl.includes("facebook") || srcRaw.includes("facebook")) type = "facebook";
820
+ else if (pl.includes("instagram") || srcRaw.includes("instagram")) type = "instagram";
821
+ else if (pl.includes("tiktok") || srcRaw.includes("tiktok")) type = "tiktok";
822
+ else if (pl.includes("youtube") || !!ytChannel) type = "youtube";
823
+ else type = "news";
824
+ return { type, title, url, source, platform, popularComment, engagement, date, duration, authorNote };
825
+ });
826
+ }, sels);
827
+ }
828
+ async extractOrganicResults(page) {
829
+ const sels = OrganicSelectors;
830
+ return page.evaluate((s) => {
831
+ const out = [];
832
+ let pos = 0;
833
+ document.querySelectorAll(s.result).forEach((card) => {
834
+ const titleEl = card.querySelector(s.title);
835
+ if (!titleEl) return;
836
+ const title = titleEl.textContent?.trim() ?? "";
837
+ const linkEl = titleEl.closest("a");
838
+ const url = linkEl?.href ?? "";
839
+ if (!title || !url) return;
840
+ pos++;
841
+ const cite = card.querySelector(s.cite)?.textContent?.trim() ?? null;
842
+ const snippet = card.querySelector(s.snippet)?.textContent?.trim() ?? null;
843
+ const isRedditStyle = !!card.querySelector(s.redditCite);
844
+ const ratingEl = card.querySelector(s.ratingWrap);
845
+ const inlineRating = ratingEl ? { value: ratingEl.querySelector(s.ratingValue)?.textContent?.trim() ?? "", count: ratingEl.querySelector(s.reviewCount)?.textContent?.trim() ?? "" } : null;
846
+ let domain = "";
847
+ try {
848
+ domain = new URL(url).hostname.replace(/^www\./, "");
849
+ } catch {
850
+ domain = card.querySelector(s.siteName)?.textContent?.trim() ?? "";
851
+ }
852
+ out.push({ position: pos, title, url, domain, cite, snippet, isRedditStyle, inlineRating });
853
+ });
854
+ return out;
855
+ }, sels);
856
+ }
857
+ async extractLocalPack(page) {
858
+ const sels = LocalPackSelectors;
859
+ return page.evaluate((s) => {
860
+ const out = [];
861
+ let container = null;
862
+ document.querySelectorAll('[role="heading"]').forEach((h) => {
863
+ if (!container && h.textContent?.includes(s.headingText)) container = h.closest("[data-hveid]");
864
+ });
865
+ if (!container) return out;
866
+ container.querySelectorAll(s.card).forEach((card, i) => {
867
+ const name = card.querySelector(s.name)?.textContent?.trim() ?? "";
868
+ if (!name) return;
869
+ const rating = card.querySelector(s.ratingValue)?.textContent?.trim() ?? null;
870
+ const reviewRaw = card.querySelector(s.reviewCount)?.textContent?.trim() ?? null;
871
+ const reviewCount = reviewRaw ? reviewRaw.replace(/[()]/g, "").trim() : null;
872
+ let cid = card.querySelector("a[data-cid]")?.getAttribute("data-cid") ?? null;
873
+ if (!cid) {
874
+ for (const link of Array.from(card.querySelectorAll("a[href]"))) {
875
+ const m1 = link.href.match(/[?&]cid=(\d+)/);
876
+ if (m1) {
877
+ cid = m1[1];
878
+ break;
879
+ }
880
+ const m2 = link.href.match(/!1s0x[0-9a-f]+:0x([0-9a-f]+)/i);
881
+ if (m2) {
882
+ try {
883
+ cid = BigInt("0x" + m2[1]).toString();
884
+ } catch {
885
+ }
886
+ if (cid) break;
887
+ }
888
+ }
889
+ }
890
+ const metadata = [];
891
+ card.querySelectorAll("div, span").forEach((el) => {
892
+ const text = Array.from(el.childNodes).filter((n) => n.nodeType === 3).map((n) => n.textContent?.trim() ?? "").filter((t) => t.length > 1 && t.length < 120).join(" ");
893
+ if (text && !metadata.includes(text)) metadata.push(text);
894
+ });
895
+ const links = Array.from(card.querySelectorAll("a[href]"));
896
+ const directionsUrl = links.find((a) => a.href.includes("google.com/maps"))?.href ?? null;
897
+ const websiteUrl = links.find((a) => !a.href.includes("google.com") && a.href.startsWith("http"))?.href ?? null;
898
+ out.push({ position: i + 1, name, cid, rating, reviewCount, metadata, websiteUrl, directionsUrl });
899
+ });
900
+ return out;
901
+ }, sels);
902
+ }
903
+ async extractEntityIds(page) {
904
+ return page.evaluate(() => {
905
+ const kgIds = /* @__PURE__ */ new Set();
906
+ const cids = /* @__PURE__ */ new Set();
907
+ const gcids = /* @__PURE__ */ new Set();
908
+ const recordMap = /* @__PURE__ */ new Map();
909
+ function nameFromWrapper(el) {
910
+ const sel = [".OSrXXb", ".dbg0pd", ".tzt0oe", '[role="heading"]', "h3"];
911
+ for (const s of sel) {
912
+ const found = el.querySelector(s);
913
+ if (found?.textContent?.trim()) return found.textContent.trim();
914
+ }
915
+ return "";
916
+ }
917
+ document.querySelectorAll('[id^="pv-/g/"]').forEach((wrapper) => {
918
+ const raw = wrapper.getAttribute("id");
919
+ if (!raw) return;
920
+ const kgId = raw.replace("pv-", "");
921
+ kgIds.add(kgId);
922
+ const name = nameFromWrapper(wrapper);
923
+ const cidEl = wrapper.querySelector("a[data-cid]");
924
+ const cid = cidEl?.getAttribute("data-cid") ?? null;
925
+ if (cid) cids.add(cid);
926
+ if (name) recordMap.set(kgId, { name, kgId, cid, gcid: null });
927
+ });
928
+ document.querySelectorAll("[data-mid]").forEach((el) => {
929
+ const mid = el.getAttribute("data-mid");
930
+ if (!mid?.startsWith("/g/")) return;
931
+ kgIds.add(mid);
932
+ if (!recordMap.has(mid)) {
933
+ const name = nameFromWrapper(el);
934
+ if (name) recordMap.set(mid, { name, kgId: mid, cid: null, gcid: null });
935
+ }
936
+ });
937
+ document.querySelectorAll(".w7Dbne").forEach((card) => {
938
+ const cidEl = card.querySelector("a[data-cid]");
939
+ const cid = cidEl?.getAttribute("data-cid") ?? null;
940
+ if (!cid) return;
941
+ cids.add(cid);
942
+ const name = card.querySelector(".OSrXXb")?.textContent?.trim() ?? "";
943
+ if (!name) return;
944
+ const kgIdEl = card.querySelector('[id^="pv-/g/"]');
945
+ const kgId = kgIdEl ? kgIdEl.getAttribute("id").replace("pv-", "") : null;
946
+ const key = kgId ?? `cid:${cid}`;
947
+ if (recordMap.has(key)) {
948
+ const existing = recordMap.get(key);
949
+ if (!existing.cid) recordMap.set(key, { ...existing, cid });
950
+ } else {
951
+ recordMap.set(key, { name, kgId, cid, gcid: null });
952
+ }
953
+ });
954
+ document.querySelectorAll("a[data-cid]").forEach((el) => {
955
+ const cid = el.getAttribute("data-cid");
956
+ if (!cid) return;
957
+ cids.add(cid);
958
+ const alreadyNamed = [...recordMap.values()].some((r) => r.cid === cid);
959
+ if (!alreadyNamed) {
960
+ let node = el.parentElement;
961
+ let name = "";
962
+ for (let i = 0; i < 8 && node; i++) {
963
+ const h = node.querySelector('.OSrXXb, .dbg0pd, [role="heading"], h3');
964
+ if (h?.textContent?.trim()) {
965
+ name = h.textContent.trim();
966
+ break;
967
+ }
968
+ node = node.parentElement;
969
+ }
970
+ if (name) recordMap.set(`cid:${cid}`, { name, kgId: null, cid, gcid: null });
971
+ }
972
+ });
973
+ const scriptContent = Array.from(document.querySelectorAll("script:not([src])")).map((s) => s.textContent ?? "").filter((t) => t.length > 1e4).join("\n");
974
+ for (const m of scriptContent.matchAll(/\/g\/[a-zA-Z0-9_-]{5,20}/g)) kgIds.add(m[0]);
975
+ for (const m of scriptContent.matchAll(/gcid:[a-zA-Z0-9_]+/g)) gcids.add(m[0]);
976
+ for (const m of scriptContent.matchAll(/0x[0-9a-f]+:0x([0-9a-f]+)/gi)) {
977
+ try {
978
+ cids.add(BigInt("0x" + m[1]).toString());
979
+ } catch {
980
+ }
981
+ }
982
+ return { entities: [...recordMap.values()], kgIds: [...kgIds], cids: [...cids], gcids: [...gcids] };
983
+ });
984
+ }
985
+ mergeLocalPackIntoEntities(entityIds, localPack) {
986
+ const cidSet = new Set(entityIds.cids);
987
+ const records = entityIds.entities.map((r) => ({ ...r }));
988
+ for (const biz of localPack) {
989
+ if (!biz.cid) continue;
990
+ cidSet.add(biz.cid);
991
+ const nameNorm = biz.name.toLowerCase().trim();
992
+ const byName = records.find((r) => r.name.toLowerCase().trim() === nameNorm);
993
+ if (byName) {
994
+ if (!byName.cid) byName.cid = biz.cid;
995
+ } else if (!records.find((r) => r.cid === biz.cid)) {
996
+ records.push({ name: biz.name, kgId: null, cid: biz.cid, gcid: null });
997
+ }
998
+ }
999
+ return { ...entityIds, entities: records, cids: [...cidSet] };
1000
+ }
1001
+ async extractAISurfaces(page) {
1002
+ const aioSels = AIOverviewSelectors;
1003
+ const aimSels = AIModeSelectors;
1004
+ return page.evaluate(({ aio, aim }) => {
1005
+ const sn = window.google?.sn ?? "unknown";
1006
+ const surface = sn === "aim" ? "aim" : sn === "web" ? "web" : "unknown";
1007
+ function findAIORoot() {
1008
+ const primary = document.querySelector(aio.root);
1009
+ if (primary) return primary;
1010
+ const headings = document.querySelectorAll('h1, h2, h3, [role="heading"]');
1011
+ for (const h of headings) {
1012
+ if (h.textContent?.trim() === "AI Overview") {
1013
+ let el = h.parentElement;
1014
+ for (let i = 0; i < 6 && el; i++) {
1015
+ if (el.querySelectorAll("a").length > 1) return el;
1016
+ el = el.parentElement;
1017
+ }
1018
+ return h.parentElement;
1019
+ }
1020
+ }
1021
+ return null;
1022
+ }
1023
+ const aioRoot = findAIORoot();
1024
+ const aioContainer = aioRoot ? aioRoot.closest(aio.wrapper) ?? aioRoot : null;
1025
+ let aioText = null;
1026
+ if (aioContainer) {
1027
+ const clone = aioContainer.cloneNode(true);
1028
+ clone.querySelectorAll("script,style,noscript").forEach((el) => el.remove());
1029
+ clone.querySelectorAll('h1,h2,h3,h4,[role="heading"]').forEach((el) => el.remove());
1030
+ clone.querySelectorAll('button,[role="button"]').forEach((el) => el.remove());
1031
+ clone.querySelectorAll("a").forEach((el) => el.remove());
1032
+ const candidate = clone.textContent?.replace(/\s+/g, " ").trim() || null;
1033
+ const isErrorState = !candidate || /not available|try again|can't generate/i.test(candidate);
1034
+ aioText = isErrorState ? null : candidate;
1035
+ }
1036
+ const aioDetected = !!aioRoot && aioText !== null;
1037
+ const aioCitations = Array.from(aioContainer?.querySelectorAll("a[href]") ?? []).filter((a) => a.href && !a.href.startsWith("javascript")).map((a) => ({
1038
+ text: a.textContent?.trim() ?? "",
1039
+ href: a.href
1040
+ })).filter((c) => c.text && c.href);
1041
+ const aimRoot = document.querySelector(aim.root);
1042
+ const aimDetected = surface === "aim" && !!aimRoot;
1043
+ const aimContainer = aimRoot?.closest(aim.wrapper) ?? null;
1044
+ let aimText = null;
1045
+ if (aimContainer) {
1046
+ const clone = aimContainer.cloneNode(true);
1047
+ clone.querySelectorAll("script,style,noscript").forEach((el) => el.remove());
1048
+ clone.querySelectorAll('h1,h2,h3,h4,[role="heading"]').forEach((el) => el.remove());
1049
+ clone.querySelectorAll('button,[role="button"]').forEach((el) => el.remove());
1050
+ clone.querySelectorAll("a").forEach((el) => el.remove());
1051
+ const candidate = clone.textContent?.replace(/\s+/g, " ").trim() || null;
1052
+ const isErrorState = !candidate || /not available|try again|can't generate/i.test(candidate);
1053
+ aimText = isErrorState ? null : candidate;
1054
+ }
1055
+ const aimCitations = aimDetected ? Array.from(aimContainer?.querySelectorAll("a[href]") ?? []).filter((a) => a.href && !a.href.startsWith("javascript")).map((a) => ({
1056
+ text: a.textContent?.trim() ?? "",
1057
+ href: a.href
1058
+ })).filter((c) => c.text && c.href) : [];
1059
+ return {
1060
+ surface,
1061
+ aiOverview: { detected: aioDetected, text: aioText, citations: aioCitations },
1062
+ aiMode: { detected: aimDetected, text: aimText, citations: aimCitations }
1063
+ };
1064
+ }, { aio: aioSels, aim: aimSels });
1065
+ }
1066
+ buildTree(flat, _seed) {
1067
+ const roots = [];
1068
+ const nodeMap = /* @__PURE__ */ new Map();
1069
+ for (const row of flat) {
1070
+ const node = {
1071
+ question: row.question,
1072
+ answer: row.answer || null,
1073
+ sourceTitle: row.source_title || null,
1074
+ sourceSite: row.source_site || null,
1075
+ sourceCite: row.source_cite || null,
1076
+ depth: row.depth,
1077
+ parentQuestion: row.parent_question || null,
1078
+ children: []
1079
+ };
1080
+ nodeMap.set(row.question, node);
1081
+ }
1082
+ for (const node of nodeMap.values()) {
1083
+ if (node.parentQuestion && nodeMap.has(node.parentQuestion)) {
1084
+ nodeMap.get(node.parentQuestion).children.push(node);
1085
+ } else {
1086
+ roots.push(node);
1087
+ }
1088
+ }
1089
+ return roots;
1090
+ }
1091
+ async extract(options) {
1092
+ const startMs = Date.now();
1093
+ const config = {
1094
+ headless: options.headless,
1095
+ profileDir: options.profileDir,
1096
+ proxy: options.proxy,
1097
+ kernelApiKey: options.kernelApiKey,
1098
+ kernelProxyId: options.kernelProxyId,
1099
+ viewport: { width: 1280, height: 800 },
1100
+ locale: `${options.hl}-${options.gl.toUpperCase()}`
1101
+ };
1102
+ let errorCount = 0;
1103
+ try {
1104
+ await this.driver.launch(config);
1105
+ const uule = options.location ? encodeUule(normalizeLocation(options.location)) : null;
1106
+ const { hasPaa } = await this.driver.navigateToSERP(options.query, uule, options.gl, options.hl);
1107
+ const page = this.driver.getPage();
1108
+ if (options.serpOnly) {
1109
+ const [organicResults2, localPack2, rawEntityIds2] = await Promise.all([
1110
+ this.extractOrganicResults(page),
1111
+ this.extractLocalPack(page),
1112
+ this.extractEntityIds(page)
1113
+ ]);
1114
+ const entityIds2 = this.mergeLocalPackIntoEntities(rawEntityIds2, localPack2);
1115
+ let allOrganic2 = organicResults2;
1116
+ if ((options.pages ?? 1) >= 2) {
1117
+ const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, start: "10" });
1118
+ if (uule) p2params.set("uule", uule);
1119
+ await this.driver.navigateTo("https://www.google.com/search?" + p2params.toString());
1120
+ const p2organic = await this.extractOrganicResults(page);
1121
+ allOrganic2 = [...organicResults2, ...p2organic.map((r) => ({ ...r, position: r.position + 10 }))];
1122
+ }
1123
+ const stats2 = {
1124
+ seed: options.query,
1125
+ totalQuestions: 0,
1126
+ maxDepthReached: 0,
1127
+ durationMs: Date.now() - startMs,
1128
+ errorCount
1129
+ };
1130
+ this.reporter.onComplete(stats2);
1131
+ return {
1132
+ seed: options.query,
1133
+ location: options.location ?? null,
1134
+ extractedAt: (/* @__PURE__ */ new Date()).toISOString(),
1135
+ totalQuestions: 0,
1136
+ surface: "web",
1137
+ aiOverview: { detected: false, text: null, citations: [] },
1138
+ aiMode: { detected: false, text: null, citations: [] },
1139
+ whatPeopleSaying: [],
1140
+ tree: [],
1141
+ flat: [],
1142
+ videos: [],
1143
+ forums: [],
1144
+ organicResults: allOrganic2,
1145
+ localPack: localPack2,
1146
+ entityIds: entityIds2,
1147
+ stats: stats2
1148
+ };
1149
+ }
1150
+ const [videos, forums, whatPeopleSaying, rawEntityIds, organicResults, localPack] = await Promise.all([
1151
+ this.extractVideos(page),
1152
+ this.extractForums(page),
1153
+ this.extractWhatPeopleSaying(page),
1154
+ this.extractEntityIds(page),
1155
+ this.extractOrganicResults(page),
1156
+ this.extractLocalPack(page)
1157
+ ]);
1158
+ const entityIds = this.mergeLocalPackIntoEntities(rawEntityIds, localPack);
1159
+ this.reporter.onVideos(videos);
1160
+ this.reporter.onForums(forums);
1161
+ if (!hasPaa) {
1162
+ let noPaaOrganic = organicResults;
1163
+ if ((options.pages ?? 1) >= 2) {
1164
+ const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, start: "10" });
1165
+ if (uule) p2params.set("uule", uule);
1166
+ await this.driver.navigateTo("https://www.google.com/search?" + p2params.toString());
1167
+ const p2organic = await this.extractOrganicResults(page);
1168
+ noPaaOrganic = [...organicResults, ...p2organic.map((r) => ({ ...r, position: r.position + 10 }))];
1169
+ }
1170
+ const aiSurfaces2 = await this.extractAISurfaces(page);
1171
+ const stats2 = {
1172
+ seed: options.query,
1173
+ totalQuestions: 0,
1174
+ maxDepthReached: 0,
1175
+ durationMs: Date.now() - startMs,
1176
+ errorCount
1177
+ };
1178
+ this.reporter.onComplete(stats2);
1179
+ return {
1180
+ seed: options.query,
1181
+ location: options.location ?? null,
1182
+ extractedAt: (/* @__PURE__ */ new Date()).toISOString(),
1183
+ totalQuestions: 0,
1184
+ surface: aiSurfaces2.surface,
1185
+ aiOverview: aiSurfaces2.aiOverview,
1186
+ aiMode: aiSurfaces2.aiMode,
1187
+ whatPeopleSaying,
1188
+ tree: [],
1189
+ flat: [],
1190
+ videos,
1191
+ forums,
1192
+ organicResults: noPaaOrganic,
1193
+ localPack,
1194
+ entityIds,
1195
+ stats: stats2
1196
+ };
1197
+ }
1198
+ const flat = await this.runBFS(page, options);
1199
+ const aiSurfaces = await this.extractAISurfaces(page);
1200
+ const shortVidsParams = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, udm: ShortVideoSelectors.udm });
1201
+ if (uule) shortVidsParams.set("uule", uule);
1202
+ const shortVideos = await this.extractShortVideos(page, "https://www.google.com/search?" + shortVidsParams.toString());
1203
+ this.reporter.onVideos(shortVideos);
1204
+ let allOrganic = organicResults;
1205
+ if ((options.pages ?? 1) >= 2) {
1206
+ const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, start: "10" });
1207
+ if (uule) p2params.set("uule", uule);
1208
+ await this.driver.navigateTo("https://www.google.com/search?" + p2params.toString());
1209
+ const p2organic = await this.extractOrganicResults(page);
1210
+ allOrganic = [...organicResults, ...p2organic.map((r) => ({ ...r, position: r.position + 10 }))];
1211
+ }
1212
+ const allVideos = [...videos, ...shortVideos];
1213
+ const tree = this.buildTree(flat, options.query);
1214
+ const stats = {
1215
+ seed: options.query,
1216
+ totalQuestions: flat.length,
1217
+ maxDepthReached: flat.reduce((m, r) => Math.max(m, r.depth), 0),
1218
+ durationMs: Date.now() - startMs,
1219
+ errorCount
1220
+ };
1221
+ this.reporter.onComplete(stats);
1222
+ return {
1223
+ seed: options.query,
1224
+ location: options.location ?? null,
1225
+ extractedAt: (/* @__PURE__ */ new Date()).toISOString(),
1226
+ totalQuestions: flat.length,
1227
+ surface: aiSurfaces.surface,
1228
+ aiOverview: aiSurfaces.aiOverview,
1229
+ aiMode: aiSurfaces.aiMode,
1230
+ whatPeopleSaying,
1231
+ tree,
1232
+ flat,
1233
+ videos: allVideos,
1234
+ forums,
1235
+ organicResults: allOrganic,
1236
+ localPack,
1237
+ entityIds,
1238
+ stats
1239
+ };
1240
+ } catch (err) {
1241
+ errorCount++;
1242
+ this.reporter.onError(err instanceof Error ? err : new Error(String(err)));
1243
+ throw err;
1244
+ } finally {
1245
+ await this.driver.close();
1246
+ }
1247
+ }
1248
+ };
1249
+
1250
+ // src/output/OutputSerializer.ts
1251
+ var import_node_fs = require("fs");
1252
+ var import_node_path = __toESM(require("path"), 1);
1253
+ var import_papaparse = __toESM(require("papaparse"), 1);
1254
/**
 * Writes harvest results to disk as JSON and CSV files.
 *
 * Every writer creates `outputDir` if needed, names the file
 * `<slug><suffix>-<timestamp>.<ext>` and resolves to the full path written.
 * The slug is the lower-cased seed with runs of non-word characters replaced
 * by "-", truncated to 40 characters.
 */
var OutputSerializer = class {
  /**
   * Shared implementation behind the public writers (internal helper).
   *
   * @param outputDir Target directory; created recursively.
   * @param slugSource Text the filename slug is derived from.
   * @param suffix Literal inserted between slug and timestamp (e.g. "-videos", or "").
   * @param extension File extension without the dot.
   * @param contents File contents, written as UTF-8.
   * @returns Promise resolving to the full path of the written file.
   */
  async writeOutputFile(outputDir, slugSource, suffix, extension, contents) {
    await import_node_fs.promises.mkdir(outputDir, { recursive: true });
    const slug = slugSource.toLowerCase().replace(/\W+/g, "-").slice(0, 40);
    const filename = `${slug}${suffix}-${Date.now()}.${extension}`;
    const fullPath = import_node_path.default.join(outputDir, filename);
    await import_node_fs.promises.writeFile(fullPath, contents, "utf8");
    return fullPath;
  }
  /** Serialises rows to CSV text with a header line (internal helper). */
  toCsv(rows) {
    return import_papaparse.default.unparse(rows, { header: true });
  }
  /**
   * Builds the rows shared by the AI Overview and AI Mode CSVs (internal helper).
   *
   * The response text is carried on the first row only. When there are no
   * citations but there is text, a single row with empty citation columns is
   * emitted so the text is not lost.
   */
  aiCitationRows(citations, text, seed) {
    if (citations.length === 0) {
      return text ? [{ seed_query: seed, response_text: text, citation_text: "", citation_href: "" }] : [];
    }
    return citations.map((c, i) => ({
      seed_query: seed,
      response_text: i === 0 ? text ?? "" : "",
      citation_text: c.text,
      citation_href: c.href
    }));
  }
  /** Writes the whole result object as pretty-printed JSON. */
  async writeJSON(result, outputDir) {
    return this.writeOutputFile(outputDir, result.seed, "", "json", JSON.stringify(result, null, 2));
  }
  /** Writes the flat question rows; the slug comes from the first row's `seed_query`, or "paa". */
  async writeCSV(rows, outputDir) {
    const seedRaw = rows[0]?.seed_query ?? "paa";
    return this.writeOutputFile(outputDir, seedRaw, "", "csv", this.toCsv(rows));
  }
  /** Writes video records to `<slug>-videos-<timestamp>.csv`. */
  async writeVideoCSV(videos, seed, outputDir) {
    return this.writeOutputFile(outputDir, seed, "-videos", "csv", this.toCsv(videos));
  }
  /** Writes forum records to `<slug>-forums-<timestamp>.csv`. */
  async writeForumCSV(forums, seed, outputDir) {
    return this.writeOutputFile(outputDir, seed, "-forums", "csv", this.toCsv(forums));
  }
  /** Writes AI Overview text and citations to `<slug>-ai-overview-<timestamp>.csv`. */
  async writeAIOverviewCSV(citations, text, seed, outputDir) {
    const rows = this.aiCitationRows(citations, text, seed);
    return this.writeOutputFile(outputDir, seed, "-ai-overview", "csv", this.toCsv(rows));
  }
  /** Writes AI Mode text and citations to `<slug>-ai-mode-<timestamp>.csv`. */
  async writeAIModeCSV(citations, text, seed, outputDir) {
    const rows = this.aiCitationRows(citations, text, seed);
    return this.writeOutputFile(outputDir, seed, "-ai-mode", "csv", this.toCsv(rows));
  }
  /** Writes "what people are saying" cards, each prefixed with `seed_query`. */
  async writeWhatPeopleSayingCSV(cards, seed, outputDir) {
    const rows = cards.map((c) => ({ seed_query: seed, ...c }));
    return this.writeOutputFile(outputDir, seed, "-what-people-saying", "csv", this.toCsv(rows));
  }
};
1332
+
1333
+ // src/output/ProgressReporter.ts
1334
/**
 * Emits progress as newline-delimited JSON: one object per line on stdout,
 * and errors on stderr.
 */
var ProgressReporter = class {
  /** Reports a harvested question (only its depth and text are emitted). */
  onQuestion(node) {
    const payload = { event: "question", depth: node.depth, question: node.question };
    process.stdout.write(`${JSON.stringify(payload)}\n`);
  }
  /** Reports the depth level being reported by the caller. */
  onDepth(depth) {
    const payload = { event: "depth", depth };
    process.stdout.write(`${JSON.stringify(payload)}\n`);
  }
  /** Emits one "video" event per entry. */
  onVideos(videos) {
    for (const video of videos) {
      const payload = {
        event: "video",
        type: video.type,
        platform: video.platform,
        duration: video.duration,
        title: video.title,
        channel: video.channel,
        url: video.url
      };
      process.stdout.write(`${JSON.stringify(payload)}\n`);
    }
  }
  /** Emits one "forum" event per entry. */
  onForums(forums) {
    for (const forum of forums) {
      const payload = { event: "forum", title: forum.title, source: forum.source, url: forum.url };
      process.stdout.write(`${JSON.stringify(payload)}\n`);
    }
  }
  /** Emits the final "complete" event with all stats fields spread in. */
  onComplete(stats) {
    const payload = { event: "complete", ...stats };
    process.stdout.write(`${JSON.stringify(payload)}\n`);
  }
  /** Emits an "error" event on stderr with the error's constructor name and message. */
  onError(err) {
    const payload = { event: "error", type: err.constructor.name, message: err.message };
    process.stderr.write(`${JSON.stringify(payload)}\n`);
  }
};
1358
+
1359
+ // src/harvest.ts
1360
// Upper bound on the number of extraction attempts harvest() makes.
var MAX_ATTEMPTS = 3;
1361
/**
 * Performs a single extraction attempt with a fresh driver, reporter and
 * extractor, closing the driver afterwards whether or not extraction succeeded.
 *
 * @param options Parsed harvest options, passed through to the extractor.
 * @returns Promise resolving to the extractor's result.
 */
async function extractOnce(options) {
  const browser = new BrowserDriver();
  const progress = new ProgressReporter();
  const paaExtractor = new PAAExtractor(browser, progress);
  let outcome;
  try {
    outcome = await paaExtractor.extract(options);
  } finally {
    await browser.close();
  }
  return outcome;
}
1371
/**
 * Validates the options, runs the extraction (retrying on CAPTCHA) and writes
 * the requested output files.
 *
 * `KERNEL_API_KEY` / `KERNEL_PROXY_ID` from the environment act as defaults
 * that keys present in `rawOptions` override. Only `CaptchaError` triggers a
 * retry; any other error is rethrown immediately. When every attempt hits a
 * CAPTCHA, a summary `CaptchaError` is thrown. Previously the last attempt
 * rethrew the raw error, which left that summary unreachable.
 *
 * @param rawOptions Unvalidated options object (non-objects are treated as `{}`).
 * @returns Promise resolving to the harvest result.
 * @throws CaptchaError when all attempts are blocked by a CAPTCHA.
 */
async function harvest(rawOptions) {
  const raw = typeof rawOptions === "object" && rawOptions !== null ? rawOptions : {};
  const merged = {
    kernelApiKey: process.env.KERNEL_API_KEY?.trim(),
    kernelProxyId: process.env.KERNEL_PROXY_ID?.trim(),
    ...raw
  };
  const options = HarvestOptionsSchema.parse(merged);
  const serializer = new OutputSerializer();
  for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
    try {
      const result = await extractOnce(options);
      if (options.format === "json" || options.format === "both") {
        await serializer.writeJSON(result, options.outputDir);
      }
      if (options.format === "csv" || options.format === "both") {
        // The question CSV is always written; the others only when there is data.
        await Promise.all([
          serializer.writeCSV(result.flat, options.outputDir),
          result.videos.length > 0 ? serializer.writeVideoCSV(result.videos, result.seed, options.outputDir) : Promise.resolve(""),
          result.forums.length > 0 ? serializer.writeForumCSV(result.forums, result.seed, options.outputDir) : Promise.resolve(""),
          result.aiOverview.detected ? serializer.writeAIOverviewCSV(result.aiOverview.citations, result.aiOverview.text, result.seed, options.outputDir) : Promise.resolve(""),
          result.aiMode.detected ? serializer.writeAIModeCSV(result.aiMode.citations, result.aiMode.text, result.seed, options.outputDir) : Promise.resolve(""),
          result.whatPeopleSaying.length > 0 ? serializer.writeWhatPeopleSayingCSV(result.whatPeopleSaying, result.seed, options.outputDir) : Promise.resolve("")
        ]);
      }
      return result;
    } catch (err) {
      // Retry only on CAPTCHA; after the final attempt the loop ends and the
      // summary error below is thrown.
      if (!(err instanceof CaptchaError)) {
        throw err;
      }
    }
  }
  const sessionDesc = options.kernelApiKey ? `${MAX_ATTEMPTS} fresh Kernel.sh sessions` : `${MAX_ATTEMPTS} attempts`;
  throw new CaptchaError(`CAPTCHA on all ${sessionDesc}. Try again in a few minutes.`);
}
1407
+
1408
+ // src/cli.ts
1409
// Command-line definition for `paa-harvest`. The action forwards the parsed
// flags to harvest() and prints a one-line JSON summary; on failure it prints
// the error message and exits with status 1.
var program = new import_commander.Command();
program
  .name("paa-harvest")
  .description("Recursively extract Google People Also Ask questions")
  .requiredOption("-q, --query <query>", "Seed query")
  .option("-l, --location <location>", 'Location name (e.g. "austin" or "Austin,Texas,United States")')
  .option("--gl <gl>", "Google country code", "us")
  .option("--hl <hl>", "Google language code", "en")
  .option("-d, --depth <depth>", "BFS depth (1-30)", "3")
  .option("-m, --max-questions <n>", "Max questions to harvest", "100")
  .option("-o, --output <dir>", "Output directory", "./paa-output")
  .option("-f, --format <format>", "Output format: json, csv, or both", "both")
  .option("--headless", "Run browser in headless mode", false)
  .option("--profile <dir>", "Persistent browser profile directory")
  .option("--proxy <url>", "Proxy server URL")
  .option("--kernel-api-key <key>", "Kernel.sh API key (or set KERNEL_API_KEY env var)")
  .action(async (opts) => {
    try {
      const result = await harvest({
        query: opts.query,
        location: opts.location,
        gl: opts.gl,
        hl: opts.hl,
        // Option values arrive as strings; harvest() validates the numbers.
        depth: Number.parseInt(opts.depth, 10),
        maxQuestions: Number.parseInt(opts.maxQuestions, 10),
        outputDir: opts.output,
        format: opts.format,
        headless: opts.headless,
        profileDir: opts.profile,
        proxy: opts.proxy,
        kernelApiKey: opts.kernelApiKey ?? process.env.KERNEL_API_KEY
      });
      // Report the directory the files were written to (this previously
      // printed the seed query under the `outputDir` key).
      console.log(JSON.stringify({ totalQuestions: result.totalQuestions, outputDir: opts.output }));
    } catch (err) {
      console.error(err instanceof Error ? err.message : String(err));
      process.exit(1);
    }
  });
1432
/**
 * Entry point for the command-line interface: parses the process arguments
 * with the `program` command defined above and waits for its async action.
 */
async function runCli() {
  await program.parseAsync();
}
1435
+
1436
+ // bin/paa-harvest.ts
1437
// Start the CLI. Any rejection that escapes the command's own error handling
// is reported the same way the action reports failures, instead of being
// left as an unhandled promise rejection.
runCli().catch((err) => {
  console.error(err instanceof Error ? err.message : String(err));
  process.exit(1);
});
1438
+ //# sourceMappingURL=paa-harvest.cjs.map