mcp-scraper 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -0
- package/dist/bin/api-server.cjs +15730 -7780
- package/dist/bin/api-server.cjs.map +1 -1
- package/dist/bin/api-server.js +3 -3
- package/dist/bin/mcp-stdio-server.cjs +300 -110
- package/dist/bin/mcp-stdio-server.cjs.map +1 -1
- package/dist/bin/mcp-stdio-server.js +1 -1
- package/dist/bin/paa-harvest.cjs +1537 -165
- package/dist/bin/paa-harvest.cjs.map +1 -1
- package/dist/bin/paa-harvest.js +1 -1
- package/dist/{chunk-ZBP4RHNW.js → chunk-4743MZHT.js} +298 -106
- package/dist/chunk-4743MZHT.js.map +1 -0
- package/dist/{chunk-LXZDJJXR.js → chunk-D4CJBZBY.js} +426 -29
- package/dist/chunk-D4CJBZBY.js.map +1 -0
- package/dist/chunk-HERFK7W6.js +2781 -0
- package/dist/chunk-HERFK7W6.js.map +1 -0
- package/dist/chunk-Y74EXABN.js +295 -0
- package/dist/chunk-Y74EXABN.js.map +1 -0
- package/dist/{db-IOYMX64U.js → db-YWCNHBLH.js} +36 -4
- package/dist/index.cjs +1660 -237
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +169 -2
- package/dist/index.d.ts +169 -2
- package/dist/index.js +120 -69
- package/dist/index.js.map +1 -1
- package/dist/server-N7Q6H4OR.js +11612 -0
- package/dist/server-N7Q6H4OR.js.map +1 -0
- package/dist/{worker-3ECJHPRE.js → worker-D4D2YQTA.js} +44 -9
- package/dist/worker-D4D2YQTA.js.map +1 -0
- package/package.json +17 -5
- package/dist/chunk-4API3ZCT.js +0 -1387
- package/dist/chunk-4API3ZCT.js.map +0 -1
- package/dist/chunk-LXZDJJXR.js.map +0 -1
- package/dist/chunk-ZBP4RHNW.js.map +0 -1
- package/dist/server-63DR2HE5.js +0 -6062
- package/dist/server-63DR2HE5.js.map +0 -1
- package/dist/worker-3ECJHPRE.js.map +0 -1
- /package/dist/{db-IOYMX64U.js.map → db-YWCNHBLH.js.map} +0 -0
package/dist/bin/paa-harvest.cjs
CHANGED
|
@@ -33,6 +33,10 @@ var HarvestOptionsSchema = import_zod.z.object({
|
|
|
33
33
|
location: import_zod.z.string().optional(),
|
|
34
34
|
gl: import_zod.z.string().length(2).default("us"),
|
|
35
35
|
hl: import_zod.z.string().length(2).default("en"),
|
|
36
|
+
device: import_zod.z.enum(["desktop", "mobile"]).default("desktop"),
|
|
37
|
+
proxyMode: import_zod.z.enum(["location", "configured", "none"]).default("location"),
|
|
38
|
+
proxyZip: import_zod.z.string().regex(/^\d{5}$/).optional(),
|
|
39
|
+
debug: import_zod.z.boolean().default(false),
|
|
36
40
|
depth: import_zod.z.number().int().min(1).max(30).default(3),
|
|
37
41
|
maxQuestions: import_zod.z.number().int().min(1).max(1e3).default(100),
|
|
38
42
|
headless: import_zod.z.boolean().default(false),
|
|
@@ -40,6 +44,7 @@ var HarvestOptionsSchema = import_zod.z.object({
|
|
|
40
44
|
proxy: import_zod.z.string().url().optional(),
|
|
41
45
|
kernelApiKey: import_zod.z.string().optional(),
|
|
42
46
|
kernelProxyId: import_zod.z.string().optional(),
|
|
47
|
+
kernelProxyResolution: import_zod.z.unknown().optional(),
|
|
43
48
|
outputDir: import_zod.z.string().default("./paa-output"),
|
|
44
49
|
format: import_zod.z.enum(["json", "csv", "both"]).default("both"),
|
|
45
50
|
serpOnly: import_zod.z.boolean().default(false),
|
|
@@ -63,6 +68,45 @@ var RawPAAItemSchema = import_zod.z.object({
|
|
|
63
68
|
sourceSite: import_zod.z.string().optional(),
|
|
64
69
|
sourceCite: import_zod.z.string().optional()
|
|
65
70
|
});
|
|
71
|
+
var RawMapsOverviewSchema = import_zod.z.object({
|
|
72
|
+
name: import_zod.z.string().nullable(),
|
|
73
|
+
rating: import_zod.z.string().nullable(),
|
|
74
|
+
reviewCount: import_zod.z.string().nullable(),
|
|
75
|
+
category: import_zod.z.string().nullable(),
|
|
76
|
+
address: import_zod.z.string().nullable(),
|
|
77
|
+
hoursSummary: import_zod.z.string().nullable(),
|
|
78
|
+
phone: import_zod.z.string().nullable(),
|
|
79
|
+
phoneDisplay: import_zod.z.string().nullable(),
|
|
80
|
+
website: import_zod.z.string().nullable(),
|
|
81
|
+
plusCode: import_zod.z.string().nullable(),
|
|
82
|
+
bookingUrl: import_zod.z.string().nullable()
|
|
83
|
+
});
|
|
84
|
+
var RawMapsHoursRowSchema = import_zod.z.object({
|
|
85
|
+
day: import_zod.z.string(),
|
|
86
|
+
hours: import_zod.z.string()
|
|
87
|
+
});
|
|
88
|
+
var RawMapsReviewStatsSchema = import_zod.z.object({
|
|
89
|
+
reviewHistogram: import_zod.z.array(import_zod.z.object({
|
|
90
|
+
stars: import_zod.z.number(),
|
|
91
|
+
count: import_zod.z.string()
|
|
92
|
+
})),
|
|
93
|
+
reviewTopics: import_zod.z.array(import_zod.z.object({
|
|
94
|
+
label: import_zod.z.string(),
|
|
95
|
+
count: import_zod.z.string()
|
|
96
|
+
}))
|
|
97
|
+
});
|
|
98
|
+
var RawMapsReviewCardSchema = import_zod.z.object({
|
|
99
|
+
reviewId: import_zod.z.string(),
|
|
100
|
+
author: import_zod.z.string().nullable(),
|
|
101
|
+
stars: import_zod.z.string().nullable(),
|
|
102
|
+
date: import_zod.z.string().nullable(),
|
|
103
|
+
text: import_zod.z.string().nullable(),
|
|
104
|
+
ownerResponse: import_zod.z.string().nullable()
|
|
105
|
+
});
|
|
106
|
+
var RawMapsAboutAttributeSchema = import_zod.z.object({
|
|
107
|
+
section: import_zod.z.string(),
|
|
108
|
+
attribute: import_zod.z.string()
|
|
109
|
+
});
|
|
66
110
|
|
|
67
111
|
// src/driver/BrowserDriver.ts
|
|
68
112
|
var import_playwright_extra = require("playwright-extra");
|
|
@@ -78,7 +122,7 @@ var PAASelectors = {
|
|
|
78
122
|
itemDataQ: "data-q",
|
|
79
123
|
itemDataInitQ: "data-initq",
|
|
80
124
|
itemQuestionEl: ".JlqpRe",
|
|
81
|
-
answerContainer: ".bCOlv",
|
|
125
|
+
answerContainer: ".bCOlv, .hgKElc, .wDYxhc, .LGOjhe, .fo7IQd, .fmW3u",
|
|
82
126
|
sourceTitle: "h3",
|
|
83
127
|
sourceSite: ".VuuXrf",
|
|
84
128
|
sourceCite: "cite",
|
|
@@ -118,9 +162,16 @@ var WhatPeopleSayingSelectors = {
|
|
|
118
162
|
authorNote: ".nDgy9d"
|
|
119
163
|
};
|
|
120
164
|
var AIOverviewSelectors = {
|
|
121
|
-
root:
|
|
165
|
+
root: "[data-lhcontainer][data-streaming-container][eid]",
|
|
166
|
+
legacyRoot: '[data-hveid="CBMQAA"]',
|
|
122
167
|
wrapper: ".Fgyi2e",
|
|
123
|
-
|
|
168
|
+
controller: '[jscontroller="AkrxPe"]',
|
|
169
|
+
contentSubtree: '[data-subtree="mfc"]',
|
|
170
|
+
header: ".heWuVc",
|
|
171
|
+
heading: ".Fzsovc.cwYVJe.RJPOee",
|
|
172
|
+
showMoreButton: '[aria-label="Show more AI Overview"]',
|
|
173
|
+
sourcesPanel: ".OZ9ddf.WAUd4",
|
|
174
|
+
disclaimer: ".DuQANe.MSJHRb"
|
|
124
175
|
};
|
|
125
176
|
var AIModeSelectors = {
|
|
126
177
|
root: '[data-hveid="CAUQAA"]',
|
|
@@ -148,6 +199,9 @@ var LocalPackSelectors = {
|
|
|
148
199
|
|
|
149
200
|
// src/errors.ts
|
|
150
201
|
var RECAPTCHA_INSTRUCTIONS = "Google returned a CAPTCHA. Run with --headless=false to re-warm the browser profile, then retry.";
|
|
202
|
+
function sanitizeVendorName(message) {
|
|
203
|
+
return message.replace(/kernel\.sh\s+sessions?/gi, "sessions").replace(/kernel\.sh\s+session/gi, "this session").replace(/kernel\.sh/gi, "the service").replace(/kernel\s+sessions?/gi, "sessions").replace(/kernel\s+session/gi, "this session").replace(/\bkernel\b/gi, "the service").replace(/ +/g, " ").trim();
|
|
204
|
+
}
|
|
151
205
|
var CaptchaError = class extends Error {
|
|
152
206
|
constructor(instructions) {
|
|
153
207
|
super(`CAPTCHA detected. ${instructions}`);
|
|
@@ -164,10 +218,55 @@ var ExtractionError = class extends Error {
|
|
|
164
218
|
cause;
|
|
165
219
|
name = "ExtractionError";
|
|
166
220
|
};
|
|
221
|
+
var RequestAbortedError = class extends Error {
|
|
222
|
+
name = "RequestAbortedError";
|
|
223
|
+
constructor(message = "Request aborted before harvest completed") {
|
|
224
|
+
super(message);
|
|
225
|
+
}
|
|
226
|
+
};
|
|
167
227
|
|
|
168
228
|
// src/driver/BrowserDriver.ts
|
|
169
229
|
import_playwright_extra.chromium.use((0, import_puppeteer_extra_plugin_stealth.default)());
|
|
170
230
|
var DESKTOP_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36";
|
|
231
|
+
var MOBILE_USER_AGENT = "Mozilla/5.0 (iPhone; CPU iPhone OS 17_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Mobile/15E148 Safari/604.1";
|
|
232
|
+
var DEFAULT_KERNEL_BROWSER_TIMEOUT_SECONDS = 180;
|
|
233
|
+
var KERNEL_BROWSER_CLOSE_TIMEOUT_MS = 3e3;
|
|
234
|
+
var KERNEL_SESSION_DELETE_TIMEOUT_MS = 5e3;
|
|
235
|
+
function positiveIntFromEnv(name, fallback) {
|
|
236
|
+
const raw = process.env[name];
|
|
237
|
+
if (!raw) return fallback;
|
|
238
|
+
const parsed = Number(raw);
|
|
239
|
+
return Number.isInteger(parsed) && parsed > 0 ? parsed : fallback;
|
|
240
|
+
}
|
|
241
|
+
function proxyIdSuffix(proxyId) {
|
|
242
|
+
return proxyId ? proxyId.slice(-6) : null;
|
|
243
|
+
}
|
|
244
|
+
function errorText(err) {
|
|
245
|
+
return err instanceof Error ? err.message : String(err);
|
|
246
|
+
}
|
|
247
|
+
function rankCheckContextOptions(config) {
|
|
248
|
+
return {
|
|
249
|
+
viewport: config.viewport,
|
|
250
|
+
locale: config.locale,
|
|
251
|
+
userAgent: config.userAgent ?? (config.isMobile ? MOBILE_USER_AGENT : DESKTOP_USER_AGENT),
|
|
252
|
+
...config.deviceScaleFactor ? { deviceScaleFactor: config.deviceScaleFactor } : {},
|
|
253
|
+
...config.isMobile !== void 0 ? { isMobile: config.isMobile } : {},
|
|
254
|
+
...config.hasTouch !== void 0 ? { hasTouch: config.hasTouch } : {}
|
|
255
|
+
};
|
|
256
|
+
}
|
|
257
|
+
async function withTimeout(promise, timeoutMs, label) {
|
|
258
|
+
let timeout;
|
|
259
|
+
try {
|
|
260
|
+
return await Promise.race([
|
|
261
|
+
promise,
|
|
262
|
+
new Promise((_, reject) => {
|
|
263
|
+
timeout = setTimeout(() => reject(new Error(`${label} timed out after ${timeoutMs}ms`)), timeoutMs);
|
|
264
|
+
})
|
|
265
|
+
]);
|
|
266
|
+
} finally {
|
|
267
|
+
if (timeout) clearTimeout(timeout);
|
|
268
|
+
}
|
|
269
|
+
}
|
|
171
270
|
function buildYouTubeChannelVideosUrl(channelInput) {
|
|
172
271
|
const raw = channelInput.trim();
|
|
173
272
|
if (!raw) throw new Error("channelHandle is required");
|
|
@@ -201,30 +300,101 @@ var BrowserDriver = class {
|
|
|
201
300
|
page = null;
|
|
202
301
|
kernelClient = null;
|
|
203
302
|
kernelSessionId = null;
|
|
303
|
+
debugEnabled = false;
|
|
304
|
+
debugSnapshot = {
|
|
305
|
+
kernel: null,
|
|
306
|
+
context: null,
|
|
307
|
+
networkLocation: null,
|
|
308
|
+
serpNavigation: null
|
|
309
|
+
};
|
|
204
310
|
async launch(config) {
|
|
311
|
+
this.debugEnabled = config.debug === true;
|
|
312
|
+
const proxyMode = config.proxyMode ?? (config.kernelProxyId ? "configured" : "none");
|
|
313
|
+
const device = config.isMobile ? "mobile" : "desktop";
|
|
314
|
+
this.debugSnapshot = {
|
|
315
|
+
kernel: null,
|
|
316
|
+
context: {
|
|
317
|
+
viewport: config.viewport,
|
|
318
|
+
locale: config.locale,
|
|
319
|
+
device,
|
|
320
|
+
userAgent: config.userAgent ?? (config.isMobile ? MOBILE_USER_AGENT : DESKTOP_USER_AGENT),
|
|
321
|
+
deviceScaleFactor: config.deviceScaleFactor ?? null,
|
|
322
|
+
isMobile: config.isMobile === true,
|
|
323
|
+
hasTouch: config.hasTouch === true
|
|
324
|
+
},
|
|
325
|
+
networkLocation: null,
|
|
326
|
+
serpNavigation: null
|
|
327
|
+
};
|
|
205
328
|
if (config.kernelApiKey) {
|
|
206
329
|
this.kernelClient = new import_sdk.default({ apiKey: config.kernelApiKey });
|
|
330
|
+
const timeoutSeconds = positiveIntFromEnv("KERNEL_BROWSER_TIMEOUT_SECONDS", DEFAULT_KERNEL_BROWSER_TIMEOUT_SECONDS);
|
|
207
331
|
const kernelBrowser = await this.kernelClient.browsers.create({
|
|
208
332
|
stealth: true,
|
|
209
|
-
timeout_seconds:
|
|
333
|
+
timeout_seconds: timeoutSeconds,
|
|
210
334
|
...config.kernelProxyId ? { proxy_id: config.kernelProxyId } : {}
|
|
211
335
|
});
|
|
212
336
|
this.kernelSessionId = kernelBrowser.session_id;
|
|
337
|
+
let defaultProxyDisabled = null;
|
|
338
|
+
let defaultProxyDisableError = null;
|
|
339
|
+
if (proxyMode === "none") {
|
|
340
|
+
try {
|
|
341
|
+
await withTimeout(
|
|
342
|
+
this.kernelClient.browsers.update(this.kernelSessionId, { disable_default_proxy: true }),
|
|
343
|
+
5e3,
|
|
344
|
+
`Kernel session ${this.kernelSessionId} disable default proxy`
|
|
345
|
+
);
|
|
346
|
+
defaultProxyDisabled = true;
|
|
347
|
+
} catch (err) {
|
|
348
|
+
defaultProxyDisabled = false;
|
|
349
|
+
defaultProxyDisableError = errorText(err);
|
|
350
|
+
}
|
|
351
|
+
}
|
|
352
|
+
const kernelDebug = {
|
|
353
|
+
sessionId: this.kernelSessionId,
|
|
354
|
+
proxyMode,
|
|
355
|
+
requestedProxyIdPresent: Boolean(config.kernelProxyId),
|
|
356
|
+
requestedProxyIdSuffix: proxyIdSuffix(config.kernelProxyId),
|
|
357
|
+
createdProxyIdPresent: typeof kernelBrowser.proxy_id === "string" ? Boolean(kernelBrowser.proxy_id) : null,
|
|
358
|
+
createdProxyIdSuffix: proxyIdSuffix(kernelBrowser.proxy_id),
|
|
359
|
+
retrievedProxyIdPresent: null,
|
|
360
|
+
retrievedProxyIdSuffix: null,
|
|
361
|
+
retrievedProxyIdMatchesRequested: null,
|
|
362
|
+
defaultProxyDisabled,
|
|
363
|
+
defaultProxyDisableError,
|
|
364
|
+
proxyResolution: config.kernelProxyResolution ?? null,
|
|
365
|
+
timeoutSeconds,
|
|
366
|
+
stealth: typeof kernelBrowser.stealth === "boolean" ? kernelBrowser.stealth : null,
|
|
367
|
+
profilePresent: null,
|
|
368
|
+
poolPresent: null,
|
|
369
|
+
retrieveError: null
|
|
370
|
+
};
|
|
371
|
+
this.debugSnapshot.kernel = kernelDebug;
|
|
372
|
+
console.info(JSON.stringify({
|
|
373
|
+
event: "kernel_browser_created",
|
|
374
|
+
kernel_session_id: this.kernelSessionId,
|
|
375
|
+
timeout_seconds: timeoutSeconds,
|
|
376
|
+
proxy_mode: proxyMode,
|
|
377
|
+
proxy_id_present: Boolean(config.kernelProxyId),
|
|
378
|
+
proxy_resolution_source: config.kernelProxyResolution?.source
|
|
379
|
+
}));
|
|
380
|
+
if (this.debugEnabled) {
|
|
381
|
+
await this.populateKernelRetrieveDebug(kernelDebug, config.kernelProxyId);
|
|
382
|
+
}
|
|
213
383
|
this.browser = await import_playwright.chromium.connectOverCDP(kernelBrowser.cdp_ws_url);
|
|
214
|
-
this.context =
|
|
384
|
+
this.context = await this.browser.newContext(rankCheckContextOptions(config));
|
|
215
385
|
await this.installEsbuildHelperShims(this.context);
|
|
216
|
-
this.page =
|
|
386
|
+
this.page = await this.context.newPage();
|
|
387
|
+
await this.page.setViewportSize(config.viewport);
|
|
388
|
+
if (this.debugEnabled) {
|
|
389
|
+
this.debugSnapshot.networkLocation = await this.captureBrowserNetworkLocation();
|
|
390
|
+
}
|
|
217
391
|
return;
|
|
218
392
|
}
|
|
219
393
|
const launchOpts = {
|
|
220
394
|
headless: config.headless,
|
|
221
395
|
proxy: config.proxy ? { server: config.proxy } : void 0
|
|
222
396
|
};
|
|
223
|
-
const ctxOpts =
|
|
224
|
-
viewport: config.viewport,
|
|
225
|
-
locale: config.locale,
|
|
226
|
-
userAgent: DESKTOP_USER_AGENT
|
|
227
|
-
};
|
|
397
|
+
const ctxOpts = rankCheckContextOptions(config);
|
|
228
398
|
if (config.profileDir) {
|
|
229
399
|
this.context = await import_playwright_extra.chromium.launchPersistentContext(config.profileDir, {
|
|
230
400
|
...launchOpts,
|
|
@@ -238,6 +408,107 @@ var BrowserDriver = class {
|
|
|
238
408
|
await this.installEsbuildHelperShims(this.context);
|
|
239
409
|
this.page = await this.context.newPage();
|
|
240
410
|
}
|
|
411
|
+
if (this.debugEnabled) {
|
|
412
|
+
this.debugSnapshot.networkLocation = await this.captureBrowserNetworkLocation();
|
|
413
|
+
}
|
|
414
|
+
}
|
|
415
|
+
async populateKernelRetrieveDebug(kernelDebug, requestedProxyId) {
|
|
416
|
+
if (!this.kernelClient || !this.kernelSessionId) return;
|
|
417
|
+
try {
|
|
418
|
+
const retrieved = await withTimeout(
|
|
419
|
+
this.kernelClient.browsers.retrieve(this.kernelSessionId),
|
|
420
|
+
5e3,
|
|
421
|
+
`Kernel session ${this.kernelSessionId} retrieve`
|
|
422
|
+
);
|
|
423
|
+
kernelDebug.retrievedProxyIdPresent = typeof retrieved.proxy_id === "string" ? Boolean(retrieved.proxy_id) : false;
|
|
424
|
+
kernelDebug.retrievedProxyIdSuffix = proxyIdSuffix(retrieved.proxy_id);
|
|
425
|
+
kernelDebug.retrievedProxyIdMatchesRequested = requestedProxyId ? retrieved.proxy_id === requestedProxyId : !retrieved.proxy_id;
|
|
426
|
+
kernelDebug.timeoutSeconds = typeof retrieved.timeout_seconds === "number" ? retrieved.timeout_seconds : kernelDebug.timeoutSeconds;
|
|
427
|
+
kernelDebug.stealth = typeof retrieved.stealth === "boolean" ? retrieved.stealth : kernelDebug.stealth;
|
|
428
|
+
kernelDebug.profilePresent = Boolean(retrieved.profile);
|
|
429
|
+
kernelDebug.poolPresent = Boolean(retrieved.pool);
|
|
430
|
+
} catch (err) {
|
|
431
|
+
kernelDebug.retrieveError = errorText(err);
|
|
432
|
+
}
|
|
433
|
+
}
|
|
434
|
+
async captureBrowserNetworkLocation() {
|
|
435
|
+
const fallback = (message, source = "ipapi.co") => ({
|
|
436
|
+
source,
|
|
437
|
+
ip: null,
|
|
438
|
+
city: null,
|
|
439
|
+
region: null,
|
|
440
|
+
country: null,
|
|
441
|
+
org: null,
|
|
442
|
+
timezone: null,
|
|
443
|
+
error: message
|
|
444
|
+
});
|
|
445
|
+
if (!this.context) return fallback("browser context is not available");
|
|
446
|
+
let debugPage = null;
|
|
447
|
+
try {
|
|
448
|
+
debugPage = await this.context.newPage();
|
|
449
|
+
const ipwho = await this.loadJsonInDebugPage(debugPage, "https://ipwho.is/");
|
|
450
|
+
if (ipwho) {
|
|
451
|
+
const connection = typeof ipwho.connection === "object" && ipwho.connection !== null ? ipwho.connection : {};
|
|
452
|
+
return {
|
|
453
|
+
source: "ipwho.is",
|
|
454
|
+
ip: typeof ipwho.ip === "string" ? ipwho.ip : null,
|
|
455
|
+
city: typeof ipwho.city === "string" ? ipwho.city : null,
|
|
456
|
+
region: typeof ipwho.region === "string" ? ipwho.region : null,
|
|
457
|
+
country: typeof ipwho.country === "string" ? ipwho.country : null,
|
|
458
|
+
org: typeof connection.org === "string" ? connection.org : null,
|
|
459
|
+
timezone: typeof ipwho.timezone === "object" && ipwho.timezone !== null && typeof ipwho.timezone.id === "string" ? ipwho.timezone.id : null,
|
|
460
|
+
error: null
|
|
461
|
+
};
|
|
462
|
+
}
|
|
463
|
+
const ipify = await this.loadJsonInDebugPage(debugPage, "https://api64.ipify.org?format=json");
|
|
464
|
+
if (ipify) {
|
|
465
|
+
return {
|
|
466
|
+
source: "api64.ipify.org",
|
|
467
|
+
ip: typeof ipify.ip === "string" ? ipify.ip : null,
|
|
468
|
+
city: null,
|
|
469
|
+
region: null,
|
|
470
|
+
country: null,
|
|
471
|
+
org: null,
|
|
472
|
+
timezone: null,
|
|
473
|
+
error: null
|
|
474
|
+
};
|
|
475
|
+
}
|
|
476
|
+
await withTimeout(
|
|
477
|
+
debugPage.goto("https://ipapi.co/json/", { waitUntil: "domcontentloaded", timeout: 7e3 }),
|
|
478
|
+
8e3,
|
|
479
|
+
"browser network location navigation"
|
|
480
|
+
);
|
|
481
|
+
const body = await debugPage.locator("body").innerText({ timeout: 2e3 });
|
|
482
|
+
const data = JSON.parse(body);
|
|
483
|
+
return {
|
|
484
|
+
source: "ipapi.co",
|
|
485
|
+
ip: typeof data.ip === "string" ? data.ip : null,
|
|
486
|
+
city: typeof data.city === "string" ? data.city : null,
|
|
487
|
+
region: typeof data.region === "string" ? data.region : null,
|
|
488
|
+
country: typeof data.country_name === "string" ? data.country_name : typeof data.country === "string" ? data.country : null,
|
|
489
|
+
org: typeof data.org === "string" ? data.org : null,
|
|
490
|
+
timezone: typeof data.timezone === "string" ? data.timezone : null,
|
|
491
|
+
error: null
|
|
492
|
+
};
|
|
493
|
+
} catch (err) {
|
|
494
|
+
return fallback(errorText(err));
|
|
495
|
+
} finally {
|
|
496
|
+
await debugPage?.close().catch(() => {
|
|
497
|
+
});
|
|
498
|
+
}
|
|
499
|
+
}
|
|
500
|
+
async loadJsonInDebugPage(debugPage, url) {
|
|
501
|
+
try {
|
|
502
|
+
await withTimeout(
|
|
503
|
+
debugPage.goto(url, { waitUntil: "domcontentloaded", timeout: 7e3 }),
|
|
504
|
+
8e3,
|
|
505
|
+
`browser network location navigation ${url}`
|
|
506
|
+
);
|
|
507
|
+
const body = await debugPage.locator("body").innerText({ timeout: 2e3 });
|
|
508
|
+
return JSON.parse(body);
|
|
509
|
+
} catch {
|
|
510
|
+
return null;
|
|
511
|
+
}
|
|
241
512
|
}
|
|
242
513
|
async installEsbuildHelperShims(context) {
|
|
243
514
|
await context.addInitScript(() => {
|
|
@@ -249,42 +520,79 @@ var BrowserDriver = class {
|
|
|
249
520
|
};
|
|
250
521
|
});
|
|
251
522
|
}
|
|
252
|
-
async navigateToSERP(query, uule, gl, hl) {
|
|
253
|
-
const params = new URLSearchParams({ q: query, gl, hl });
|
|
523
|
+
async navigateToSERP(query, uule, gl, hl, options) {
|
|
524
|
+
const params = new URLSearchParams({ q: query, gl, hl, pws: "0" });
|
|
525
|
+
if (options?.num) params.set("num", String(options.num));
|
|
254
526
|
if (uule) params.set("uule", uule);
|
|
255
527
|
const url = "https://www.google.com/search?" + params.toString();
|
|
528
|
+
const navDebug = options?.debug ? {
|
|
529
|
+
requestedUrl: url,
|
|
530
|
+
finalUrl: null,
|
|
531
|
+
title: null,
|
|
532
|
+
bodySnippet: null,
|
|
533
|
+
hasPaa: null,
|
|
534
|
+
captchaDetected: null,
|
|
535
|
+
googleSorryUrl: null,
|
|
536
|
+
redirected: null
|
|
537
|
+
} : null;
|
|
538
|
+
if (navDebug) this.debugSnapshot.serpNavigation = navDebug;
|
|
256
539
|
try {
|
|
257
540
|
await this.page.goto(url, { waitUntil: "domcontentloaded", timeout: 45e3 });
|
|
258
541
|
} catch (err) {
|
|
542
|
+
await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: null, captchaDetected: null });
|
|
259
543
|
const diag = await this.captureDiagnostics(url);
|
|
260
544
|
throw new ExtractionError(`page.goto failed: ${err.message} | ${diag}`);
|
|
261
545
|
}
|
|
262
546
|
const captchaCount = await this.page.locator(PAASelectors.captchaMarker).count();
|
|
263
547
|
if (captchaCount > 0) {
|
|
264
|
-
|
|
265
|
-
try {
|
|
266
|
-
await this.page.waitForSelector(PAASelectors.container, { timeout: 45e3 });
|
|
267
|
-
return { hasPaa: true };
|
|
268
|
-
} catch {
|
|
269
|
-
throw new CaptchaError(this.captchaMessage());
|
|
270
|
-
}
|
|
271
|
-
}
|
|
548
|
+
await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: false, captchaDetected: true });
|
|
272
549
|
throw new CaptchaError(this.captchaMessage());
|
|
273
550
|
}
|
|
274
551
|
const fastFound = await this.page.waitForSelector(PAASelectors.item, { timeout: 4e3 }).catch(() => null);
|
|
275
|
-
if (fastFound)
|
|
552
|
+
if (fastFound) {
|
|
553
|
+
await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: true, captchaDetected: false });
|
|
554
|
+
return { hasPaa: true };
|
|
555
|
+
}
|
|
276
556
|
const captchaAfter = await this.page.locator(PAASelectors.captchaMarker).count();
|
|
277
|
-
if (captchaAfter > 0)
|
|
557
|
+
if (captchaAfter > 0) {
|
|
558
|
+
await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: false, captchaDetected: true });
|
|
559
|
+
throw new CaptchaError(this.captchaMessage());
|
|
560
|
+
}
|
|
278
561
|
for (let i = 1; i <= 6; i++) {
|
|
279
562
|
await this.page.evaluate((f) => {
|
|
280
563
|
window.scrollTo(0, document.body.scrollHeight * f);
|
|
281
564
|
}, i / 6);
|
|
282
565
|
await this.page.waitForTimeout(600);
|
|
283
566
|
const count = await this.page.locator(PAASelectors.item).count();
|
|
284
|
-
if (count > 0)
|
|
567
|
+
if (count > 0) {
|
|
568
|
+
await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: true, captchaDetected: false });
|
|
569
|
+
return { hasPaa: true };
|
|
570
|
+
}
|
|
285
571
|
}
|
|
572
|
+
await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: false, captchaDetected: false });
|
|
286
573
|
return { hasPaa: false };
|
|
287
574
|
}
|
|
575
|
+
async updateSerpNavigationDebug(navDebug, requestedUrl, state) {
|
|
576
|
+
if (!navDebug || !this.page) return;
|
|
577
|
+
try {
|
|
578
|
+
const finalUrl = this.page.url();
|
|
579
|
+
const title = await this.page.title().catch(() => "");
|
|
580
|
+
const bodySnippet = await this.page.evaluate(() => {
|
|
581
|
+
const text = (document.body?.innerText ?? "").replace(/\s+/g, " ").trim();
|
|
582
|
+
return text.slice(0, 500);
|
|
583
|
+
}).catch(() => "");
|
|
584
|
+
const textCaptcha = /recaptcha|unusual traffic|are you a robot/i.test(bodySnippet);
|
|
585
|
+
navDebug.finalUrl = finalUrl;
|
|
586
|
+
navDebug.title = title;
|
|
587
|
+
navDebug.bodySnippet = bodySnippet;
|
|
588
|
+
navDebug.hasPaa = state.hasPaa;
|
|
589
|
+
navDebug.captchaDetected = state.captchaDetected ?? textCaptcha;
|
|
590
|
+
navDebug.googleSorryUrl = /google\.[^/]+\/sorry\//i.test(finalUrl);
|
|
591
|
+
navDebug.redirected = finalUrl !== requestedUrl;
|
|
592
|
+
} catch (err) {
|
|
593
|
+
navDebug.bodySnippet = `debug capture failed: ${errorText(err)}`;
|
|
594
|
+
}
|
|
595
|
+
}
|
|
288
596
|
async captureDiagnostics(intendedUrl) {
|
|
289
597
|
try {
|
|
290
598
|
const finalUrl = this.page.url();
|
|
@@ -306,7 +614,7 @@ var BrowserDriver = class {
|
|
|
306
614
|
}
|
|
307
615
|
}
|
|
308
616
|
captchaMessage() {
|
|
309
|
-
return this.kernelClient ? "Google returned a CAPTCHA on this
|
|
617
|
+
return this.kernelClient ? "Google returned a CAPTCHA on this session \u2014 retrying with a fresh session." : RECAPTCHA_INSTRUCTIONS;
|
|
310
618
|
}
|
|
311
619
|
async navigateTo(url) {
|
|
312
620
|
try {
|
|
@@ -331,6 +639,12 @@ var BrowserDriver = class {
|
|
|
331
639
|
getPage() {
|
|
332
640
|
return this.page;
|
|
333
641
|
}
|
|
642
|
+
getKernelSessionId() {
|
|
643
|
+
return this.kernelSessionId;
|
|
644
|
+
}
|
|
645
|
+
getDebugSnapshot() {
|
|
646
|
+
return this.debugSnapshot;
|
|
647
|
+
}
|
|
334
648
|
async close() {
|
|
335
649
|
if (this.browser) {
|
|
336
650
|
const b = this.browser;
|
|
@@ -341,21 +655,84 @@ var BrowserDriver = class {
|
|
|
341
655
|
this.page = null;
|
|
342
656
|
this.kernelSessionId = null;
|
|
343
657
|
this.kernelClient = null;
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
)
|
|
658
|
+
if (client && sessionId) {
|
|
659
|
+
console.info(JSON.stringify({
|
|
660
|
+
event: "kernel_browser_delete_started",
|
|
661
|
+
kernel_session_id: sessionId
|
|
662
|
+
}));
|
|
663
|
+
const deleteSession = withTimeout(
|
|
664
|
+
client.browsers.deleteByID(sessionId),
|
|
665
|
+
KERNEL_SESSION_DELETE_TIMEOUT_MS,
|
|
666
|
+
`Kernel session ${sessionId} delete`
|
|
667
|
+
);
|
|
668
|
+
const closeBrowser = withTimeout(
|
|
669
|
+
b.close(),
|
|
670
|
+
KERNEL_BROWSER_CLOSE_TIMEOUT_MS,
|
|
671
|
+
`Kernel browser ${sessionId} close`
|
|
672
|
+
);
|
|
673
|
+
const [deleteResult, closeResult] = await Promise.allSettled([deleteSession, closeBrowser]);
|
|
674
|
+
const result = {
|
|
675
|
+
kernelSessionId: sessionId,
|
|
676
|
+
kernelDeleteStarted: true,
|
|
677
|
+
kernelDeleteSucceeded: deleteResult.status === "fulfilled",
|
|
678
|
+
kernelDeleteError: deleteResult.status === "rejected" ? deleteResult.reason instanceof Error ? deleteResult.reason.message : String(deleteResult.reason) : null,
|
|
679
|
+
browserCloseSucceeded: closeResult.status === "fulfilled",
|
|
680
|
+
browserCloseError: closeResult.status === "rejected" ? closeResult.reason instanceof Error ? closeResult.reason.message : String(closeResult.reason) : null
|
|
681
|
+
};
|
|
682
|
+
if (deleteResult.status === "rejected") {
|
|
683
|
+
console.warn(JSON.stringify({
|
|
684
|
+
event: "kernel_browser_delete_failed",
|
|
685
|
+
kernel_session_id: sessionId,
|
|
686
|
+
message: result.kernelDeleteError
|
|
687
|
+
}));
|
|
688
|
+
console.warn(`Kernel session cleanup failed for ${sessionId}:`, deleteResult.reason);
|
|
689
|
+
} else {
|
|
690
|
+
console.info(JSON.stringify({
|
|
691
|
+
event: "kernel_browser_delete_succeeded",
|
|
692
|
+
kernel_session_id: sessionId
|
|
693
|
+
}));
|
|
351
694
|
}
|
|
695
|
+
if (closeResult.status === "rejected") {
|
|
696
|
+
console.warn(JSON.stringify({
|
|
697
|
+
event: "kernel_browser_close_failed",
|
|
698
|
+
kernel_session_id: sessionId,
|
|
699
|
+
message: result.browserCloseError
|
|
700
|
+
}));
|
|
701
|
+
console.warn(`Kernel browser close failed for ${sessionId}:`, closeResult.reason);
|
|
702
|
+
}
|
|
703
|
+
return result;
|
|
352
704
|
}
|
|
705
|
+
await b.close();
|
|
706
|
+
return {
|
|
707
|
+
kernelSessionId: null,
|
|
708
|
+
kernelDeleteStarted: false,
|
|
709
|
+
kernelDeleteSucceeded: null,
|
|
710
|
+
kernelDeleteError: null,
|
|
711
|
+
browserCloseSucceeded: true,
|
|
712
|
+
browserCloseError: null
|
|
713
|
+
};
|
|
353
714
|
} else if (this.context) {
|
|
354
715
|
const ctx = this.context;
|
|
355
716
|
this.context = null;
|
|
356
717
|
this.page = null;
|
|
357
718
|
await ctx.close();
|
|
719
|
+
return {
|
|
720
|
+
kernelSessionId: null,
|
|
721
|
+
kernelDeleteStarted: false,
|
|
722
|
+
kernelDeleteSucceeded: null,
|
|
723
|
+
kernelDeleteError: null,
|
|
724
|
+
browserCloseSucceeded: true,
|
|
725
|
+
browserCloseError: null
|
|
726
|
+
};
|
|
358
727
|
}
|
|
728
|
+
return {
|
|
729
|
+
kernelSessionId: null,
|
|
730
|
+
kernelDeleteStarted: false,
|
|
731
|
+
kernelDeleteSucceeded: null,
|
|
732
|
+
kernelDeleteError: null,
|
|
733
|
+
browserCloseSucceeded: null,
|
|
734
|
+
browserCloseError: null
|
|
735
|
+
};
|
|
359
736
|
}
|
|
360
737
|
};
|
|
361
738
|
|
|
@@ -426,13 +803,157 @@ var LOCATIONS = {
|
|
|
426
803
|
};
|
|
427
804
|
|
|
428
805
|
// src/uule.ts
|
|
806
|
+
function encodeVarint(value) {
|
|
807
|
+
const bytes = [];
|
|
808
|
+
let remaining = value;
|
|
809
|
+
do {
|
|
810
|
+
let byte = remaining & 127;
|
|
811
|
+
remaining >>>= 7;
|
|
812
|
+
if (remaining > 0) byte |= 128;
|
|
813
|
+
bytes.push(byte);
|
|
814
|
+
} while (remaining > 0);
|
|
815
|
+
return bytes;
|
|
816
|
+
}
|
|
429
817
|
function encodeUule(name) {
|
|
430
|
-
const
|
|
431
|
-
|
|
818
|
+
const locationBytes = Buffer.from(name, "utf8");
|
|
819
|
+
const payload = Buffer.concat([
|
|
820
|
+
Buffer.from([8, 2, 16, 32, 34]),
|
|
821
|
+
Buffer.from(encodeVarint(locationBytes.length)),
|
|
822
|
+
locationBytes
|
|
823
|
+
]);
|
|
824
|
+
return `w+${payload.toString("base64")}`;
|
|
432
825
|
}
|
|
433
826
|
function normalizeLocation(input) {
|
|
434
|
-
const
|
|
435
|
-
|
|
827
|
+
const raw = input.toLowerCase().trim();
|
|
828
|
+
if (LOCATIONS[raw]) return LOCATIONS[raw];
|
|
829
|
+
const beforeComma = raw.split(",")[0].trim();
|
|
830
|
+
if (beforeComma !== raw && LOCATIONS[beforeComma]) return LOCATIONS[beforeComma];
|
|
831
|
+
const withoutState = raw.replace(/\s+[a-z]{2}$/, "").trim();
|
|
832
|
+
if (withoutState !== raw && LOCATIONS[withoutState]) return LOCATIONS[withoutState];
|
|
833
|
+
return input;
|
|
834
|
+
}
|
|
835
|
+
|
|
836
|
+
// src/serp-location-debug.ts
|
|
837
|
+
// Lower-cased US state names (plus the District of Columbia) mapped to their
// two-letter postal codes. Insertion order matters: STATE_PATTERN below lists
// its alternatives in exactly this order.
var STATE_TO_CODE = Object.fromEntries([
  ["alabama", "AL"],
  ["alaska", "AK"],
  ["arizona", "AZ"],
  ["arkansas", "AR"],
  ["california", "CA"],
  ["colorado", "CO"],
  ["connecticut", "CT"],
  ["delaware", "DE"],
  ["florida", "FL"],
  ["georgia", "GA"],
  ["hawaii", "HI"],
  ["idaho", "ID"],
  ["illinois", "IL"],
  ["indiana", "IN"],
  ["iowa", "IA"],
  ["kansas", "KS"],
  ["kentucky", "KY"],
  ["louisiana", "LA"],
  ["maine", "ME"],
  ["maryland", "MD"],
  ["massachusetts", "MA"],
  ["michigan", "MI"],
  ["minnesota", "MN"],
  ["mississippi", "MS"],
  ["missouri", "MO"],
  ["montana", "MT"],
  ["nebraska", "NE"],
  ["nevada", "NV"],
  ["new hampshire", "NH"],
  ["new jersey", "NJ"],
  ["new mexico", "NM"],
  ["new york", "NY"],
  ["north carolina", "NC"],
  ["north dakota", "ND"],
  ["ohio", "OH"],
  ["oklahoma", "OK"],
  ["oregon", "OR"],
  ["pennsylvania", "PA"],
  ["rhode island", "RI"],
  ["south carolina", "SC"],
  ["south dakota", "SD"],
  ["tennessee", "TN"],
  ["texas", "TX"],
  ["utah", "UT"],
  ["vermont", "VT"],
  ["virginia", "VA"],
  ["washington", "WA"],
  ["west virginia", "WV"],
  ["wisconsin", "WI"],
  ["wyoming", "WY"],
  ["district of columbia", "DC"]
]);

// Regex alternation body: every state name first (whitespace inside a name is
// relaxed to `\s+`), followed by every postal code.
var STATE_PATTERN = Object.keys(STATE_TO_CODE)
  .map((name) => name.split(/\s+/).join("\\s+"))
  .concat(Object.values(STATE_TO_CODE))
  .join("|");

// Matches "<City words>[,] <state name or code>". Group 1 is one to five
// letter-only words separated by whitespace, "." or "-"; group 2 is the state.
// The "i" flag makes both groups case-insensitive; "g" allows matchAll().
var CITY_STATE_RE = new RegExp(
  `\\b([A-Z][A-Za-z]+(?:[\\s.-][A-Z][A-Za-z]+){0,4}),?\\s+(${STATE_PATTERN})\\b`,
  "gi"
);
|
|
895
|
+
/**
 * Converts a state name or two-letter code into an upper-case two-letter code.
 *
 * @param {string | null | undefined} input - State name ("texas") or code ("tx").
 * @returns {string | null} The upper-cased code, or null when the input is
 *   empty or is not a known state name. Any two-letter alphabetic input is
 *   accepted as a code without being checked against the state table.
 */
function normalizeRegionCode(input) {
  if (!input) return null;
  const trimmed = input.trim();
  if (/^[A-Z]{2}$/i.test(trimmed)) return trimmed.toUpperCase();
  const key = trimmed.toLowerCase();
  // Own-property check: a plain bracket lookup would also resolve inherited
  // keys such as "constructor" or "__proto__" and return a non-string value.
  return Object.prototype.hasOwnProperty.call(STATE_TO_CODE, key) ? STATE_TO_CODE[key] : null;
}
|
|
901
|
+
/**
 * Produces a title-cased city name from free text.
 *
 * Whitespace runs are collapsed, everything up to and including the last
 * "in" / "near" / "around" / "serving" lead-in word is dropped, and the first
 * letter of every word is upper-cased (the rest lower-cased).
 *
 * @param {string} input - Raw city text.
 * @returns {string} The normalized city name (may be empty).
 */
function normalizeCity(input) {
  const collapsed = input.replace(/\s+/g, " ").trim();
  const withoutLeadIn = collapsed.replace(/^.*\b(?:in|near|around|serving)\s+/i, "");
  const lowered = withoutLeadIn.toLowerCase();
  return lowered.replace(/\b[a-z]/g, (letter) => letter.toUpperCase());
}
|
|
905
|
+
/**
 * Splits a comma-separated canonical location into the expected city/region.
 *
 * Only the first two comma-separated parts are used: the first as the city,
 * the second as the region. Missing parts are treated as empty strings.
 *
 * @param {string | null | undefined} canonicalLocation
 * @returns {{ city: string, regionCode: string | null, canonicalLocation: string } | null}
 *   null when no location was supplied.
 */
function parseExpected(canonicalLocation) {
  if (!canonicalLocation) return null;
  const parts = canonicalLocation.split(",").map((part) => part.trim());
  const cityPart = parts[0] ?? "";
  const regionPart = parts[1] ?? "";
  const city = normalizeCity(cityPart);
  const regionCode = normalizeRegionCode(regionPart);
  return { city, regionCode, canonicalLocation };
}
|
|
914
|
+
/**
 * Records one city/state sighting in the candidate map.
 *
 * Entries are keyed by "<lower-cased city>|<region code>". A repeated sighting
 * bumps the entry's count and stores the example text if fewer than three
 * distinct examples are held so far. Sightings whose city or region cannot be
 * normalized are ignored.
 *
 * @param {Map<string, { city: string, regionCode: string, count: number, examples: string[] }>} candidates
 * @param {string} city - Raw city text.
 * @param {string} region - Raw state name or code.
 * @param {string} example - Text snippet the sighting came from.
 * @returns {void}
 */
function addCandidate(candidates, city, region, example) {
  const cityName = normalizeCity(city);
  const stateCode = normalizeRegionCode(region);
  if (!(cityName && stateCode)) return;

  const mapKey = `${cityName.toLowerCase()}|${stateCode}`;
  const entry = candidates.get(mapKey);
  if (!entry) {
    candidates.set(mapKey, { city: cityName, regionCode: stateCode, count: 1, examples: [example] });
    return;
  }

  entry.count++;
  const hasRoom = entry.examples.length < 3;
  if (hasRoom && !entry.examples.includes(example)) {
    entry.examples.push(example);
  }
}
|
|
927
|
+
/**
 * Scans a blob of text (titles, snippets, URLs) for "City, State" mentions
 * and records each one through addCandidate.
 *
 * The text is percent-decoded first, then "+", "/", "|", "_" and "-" runs are
 * turned into spaces so URL slugs such as "austin-tx" become matchable.
 *
 * @param {Map<string, object>} candidates - Candidate map to update in place.
 * @param {string} text - Text to scan.
 * @returns {void}
 */
function scanText(candidates, text) {
  // decodeURIComponent throws URIError on any malformed escape (for example a
  // literal "100%" in a title). In that case decode each whitespace-delimited
  // token on its own and keep the tokens that cannot be decoded verbatim, so
  // one stray "%" does not abort the scan or disable decoding of real URLs.
  const decodeToken = (token) => {
    try {
      return decodeURIComponent(token);
    } catch (err) {
      if (!(err instanceof URIError)) throw err;
      return token;
    }
  };
  let decoded;
  try {
    decoded = decodeURIComponent(text);
  } catch (err) {
    if (!(err instanceof URIError)) throw err;
    decoded = String(text).replace(/\S+/g, decodeToken);
  }

  const normalized = decoded.replace(/[+/|_-]+/g, " ");
  // Every match from this text shares the same (truncated) example string.
  const example = normalized.slice(0, 180);
  for (const match of normalized.matchAll(CITY_STATE_RE)) {
    addCandidate(candidates, match[1] ?? "", match[2] ?? "", example);
  }
}
|
|
933
|
+
/**
 * Infers which city/state a SERP appears to be localized to and compares that
 * with the requested location.
 *
 * City/state mentions are collected from every organic result (title,
 * snippet, cite, url) and every local-pack business (name, metadata, website
 * and directions URLs), ranked by mention count (ties broken by city name),
 * and the top eight are returned as evidence.
 *
 * @param {string | null | undefined} canonicalLocation - Requested location, if any.
 * @param {Array<{ title?: string, snippet?: string, cite?: string, url?: string }>} organicResults
 * @param {Array<{ name?: string, metadata?: string[], websiteUrl?: string, directionsUrl?: string }>} localPack
 * @returns {{ status: "not_requested" | "unknown" | "matched" | "mismatch", expected: object | null, candidates: object[] }}
 *   "not_requested" when no location was requested, "unknown" when the SERP
 *   yielded no candidates, otherwise "matched" / "mismatch".
 */
function inferSerpLocationEvidence(canonicalLocation, organicResults, localPack) {
  const expected = parseExpected(canonicalLocation);
  const candidates = new Map();
  for (const result of organicResults ?? []) {
    scanText(candidates, [result.title, result.snippet ?? "", result.cite ?? "", result.url].join(" "));
  }
  for (const business of localPack ?? []) {
    // metadata is spread, so a missing list must not throw.
    const metadata = business.metadata ?? [];
    scanText(candidates, [business.name, ...metadata, business.websiteUrl ?? "", business.directionsUrl ?? ""].join(" "));
  }

  const allCandidates = Array.from(candidates.values()).sort(
    (a, b) => b.count - a.count || a.city.localeCompare(b.city)
  );
  // Only the top eight are reported back to keep the payload small.
  const rankedCandidates = allCandidates.slice(0, 8);

  if (!expected) {
    return { status: "not_requested", expected: null, candidates: rankedCandidates };
  }
  if (rankedCandidates.length === 0) {
    return { status: "unknown", expected, candidates: [] };
  }

  // Match against every candidate, not just the reported top eight; otherwise
  // the verdict would depend on how equal-count candidates happen to sort.
  const expectedCity = expected.city.toLowerCase();
  const matched = allCandidates.some(
    (candidate) => candidate.city.toLowerCase() === expectedCity && (expected.regionCode == null || candidate.regionCode === expected.regionCode)
  );
  return {
    status: matched ? "matched" : "mismatch",
    expected,
    candidates: rankedCandidates
  };
}
|
|
437
958
|
|
|
438
959
|
// src/lib/paa-answer-cleanup.ts
|
|
@@ -527,7 +1048,220 @@ function cleanPAAAnswerText(answer, question, sourceTitle) {
|
|
|
527
1048
|
return text;
|
|
528
1049
|
}
|
|
529
1050
|
|
|
1051
|
+
// src/extractor/ai-surfaces.ts
|
|
1052
|
+
/**
 * Reads Google's "AI Overview" and "AI Mode" surfaces out of the current
 * document and returns their cleaned text and outbound citations.
 *
 * The function only touches `window` / `document` and its own nested helpers.
 * It is handed to `page.evaluate` by PAAExtractor.extractAISurfaces, so it
 * must stay self-contained (presumably it is serialized into the page).
 *
 * @param {{ aio: object, aim: object, expandWaitMs?: number } | null | undefined} config
 *   CSS selectors for both surfaces plus the delay to wait after clicking
 *   "Show more". When omitted, the built-in defaults below are used.
 * @returns {Promise<{
 *   surface: "aim" | "web" | "unknown",
 *   aiOverview: { detected: boolean, text: string | null, citations: Array<{ text: string, href: string }>, expanded: boolean, fullyExpanded: boolean, sections: string[] },
 *   aiMode: { detected: boolean, text: string | null, citations: Array<{ text: string, href: string }> }
 * }>}
 */
async function extractAISurfacesFromDocument(config) {
  const selectors = config ?? {
    aio: {
      root: "[data-lhcontainer][data-streaming-container][eid]",
      legacyRoot: '[data-hveid="CBMQAA"]',
      wrapper: ".Fgyi2e",
      controller: '[jscontroller="AkrxPe"]',
      contentSubtree: '[data-subtree="mfc"]',
      heading: ".Fzsovc.cwYVJe.RJPOee",
      header: ".heWuVc",
      showMoreButton: '[aria-label="Show more AI Overview"]',
      sourcesPanel: ".OZ9ddf.WAUd4",
      disclaimer: ".DuQANe.MSJHRb"
    },
    aim: {
      root: '[data-hveid="CAUQAA"]',
      wrapper: ".Fgyi2e"
    },
    expandWaitMs: 1500
  };
  const sn = window.google?.sn ?? "unknown";
  const surface = sn === "aim" ? "aim" : sn === "web" ? "web" : "unknown";

  // Trimmed visible text of an element ("" for a missing element).
  function textOf(el) {
    if (!el) return "";
    return (el.innerText ?? el.textContent ?? "").trim();
  }

  // True when the element carries an "AI Overview" label in its heading, in
  // its header, or anywhere in its text.
  function hasAIOverviewLabel(el) {
    const heading = el.querySelector(selectors.aio.heading);
    if (textOf(heading) === "AI Overview") return true;
    const header = el.querySelector(selectors.aio.header);
    if (textOf(header).split(/\n|\s{2,}/).some((part) => part.trim() === "AI Overview")) return true;
    return textOf(el).includes("AI Overview");
  }

  // Locates the AI Overview container: a labeled primary root, else the first
  // primary root, else the legacy root, else an ancestor of an "AI Overview"
  // heading.
  function findAIORoot() {
    const primaryRoots = Array.from(document.querySelectorAll(selectors.aio.root));
    const labeledPrimary = primaryRoots.find(hasAIOverviewLabel);
    if (labeledPrimary) return labeledPrimary;
    if (primaryRoots.length > 0) return primaryRoots[0];
    if (selectors.aio.legacyRoot) {
      const legacy = document.querySelector(selectors.aio.legacyRoot);
      if (legacy) return legacy;
    }
    const headings = document.querySelectorAll(`${selectors.aio.heading}, h1, h2, h3, [role="heading"]`);
    for (const h of headings) {
      if (textOf(h) !== "AI Overview") continue;
      // Walk up at most eight ancestors looking for a recognizable container.
      let el = h.parentElement;
      for (let i = 0; i < 8 && el; i++) {
        if (el.matches(selectors.aio.root) || el.querySelector(selectors.aio.controller) || el.querySelector(selectors.aio.contentSubtree)) {
          return el;
        }
        el = el.parentElement;
      }
      return h.parentElement;
    }
    return null;
  }

  // Returns the answer text of `target` with chrome (links, buttons, media,
  // headers, disclaimers, feedback widgets) stripped, or null when nothing
  // usable remains or the surface shows an error message.
  function cleanText(target) {
    if (!target) return null;
    const clone = target.cloneNode(true);
    const removable = [
      "script",
      "style",
      "noscript",
      "img",
      "picture",
      "video",
      selectors.aio.header,
      selectors.aio.showMoreButton,
      selectors.aio.sourcesPanel,
      selectors.aio.disclaimer,
      '[data-subtree="dfa"]',
      "[data-src-id]",
      '[role="dialog"]',
      ".HWMcu",
      ".bTFeG",
      ".CyMdWb",
      ".MFrAxb",
      ".F0OfWd.hfWAgb",
      ".x2qcTc.fZavHb",
      ".SvjEff",
      ".sR2MY",
      ".lKuDef",
      ".GSPQcc",
      "a[href]",
      "button",
      '[role="button"]'
      // Array.join renders a missing selector as an empty list item, which
      // makes querySelectorAll throw; drop unset selectors first.
    ].filter(Boolean).join(",");
    clone.querySelectorAll(removable).forEach((el) => el.remove());

    // Attach the clone off-screen so innerText reflects rendered line breaks.
    const holder = document.createElement("div");
    holder.style.position = "fixed";
    holder.style.left = "-10000px";
    holder.style.top = "0";
    holder.style.width = `${Math.max(320, Math.round(target.getBoundingClientRect?.().width || 960))}px`;
    holder.style.opacity = "0";
    holder.style.pointerEvents = "none";
    holder.append(clone);
    document.body.append(holder);
    let rendered = "";
    try {
      rendered = clone.innerText || clone.textContent || "";
    } finally {
      // Never leave the scratch node behind in the page.
      holder.remove();
    }

    const lines = rendered.replace(/\r/g, "").replace(/[ \t]+\n/g, "\n").replace(/\n[ \t]+/g, "\n").replace(/\n{3,}/g, "\n\n").replace(/[ \t]{2,}/g, " ").trim().split("\n").map((line) => line.replace(/\u00a0/g, " ").trim()).filter(Boolean);
    const filteredLines = [];
    for (let i = 0; i < lines.length; i++) {
      const line = lines[i];
      const next = lines[i + 1] ?? "";
      if (line === "AI Overview") continue;
      if (line === "Show more") continue;
      if (/^AI can make mistakes/i.test(line)) continue;
      if (/^Thank you\b/i.test(line)) continue;
      if (/^Your feedback helps Google improve/i.test(line)) continue;
      // Bare counters such as "3" or "+3".
      if (/^\+?\d+$/.test(line)) continue;
      // A short line followed by a "+N" counter: skip both lines.
      if (/^\+\d+$/.test(next) && line.length <= 80) {
        i++;
        continue;
      }
      filteredLines.push(line);
    }
    const raw = filteredLines.join("\n").replace(/\n{3,}/g, "\n\n").trim();
    if (!raw || /not available|try again|can't generate/i.test(raw)) return null;
    return raw;
  }

  // Resolves an anchor href to an absolute external http(s) URL. Google
  // redirect links are unwrapped via their `q` / `url` parameter; links that
  // still point at a Google host, or cannot be parsed, yield null.
  function normalizeHref(rawHref) {
    if (!rawHref || rawHref.startsWith("javascript:")) return null;
    let href = rawHref;
    try {
      const absolute = new URL(rawHref, window.location.href);
      const q = absolute.searchParams.get("q") ?? absolute.searchParams.get("url");
      if (/(\.|^)google\./i.test(absolute.hostname) && q?.startsWith("http")) {
        href = q;
      } else {
        href = absolute.href;
      }
    } catch {
      return null;
    }
    if (!/^https?:\/\//i.test(href)) return null;
    try {
      const url = new URL(href);
      const isGoogleInternal = /(\.|^)google\./i.test(url.hostname);
      if (isGoogleInternal) return null;
      return url.href;
    } catch {
      return null;
    }
  }

  // Collects de-duplicated external links under `root`, labelled with the
  // anchor text, falling back to the host name and then the URL itself.
  function extractCitations(root) {
    if (!root) return [];
    const seen = new Set();
    const citations = [];
    for (const a of Array.from(root.querySelectorAll("a[href]"))) {
      const href = normalizeHref(a.getAttribute("href") ?? "");
      if (!href || seen.has(href)) continue;
      seen.add(href);
      let fallbackHost = "";
      try {
        fallbackHost = new URL(href).hostname.replace(/^www\./, "");
      } catch {
        // href was already parsed by normalizeHref; keep the empty fallback.
      }
      citations.push({
        text: textOf(a) || fallbackHost || href,
        href
      });
    }
    return citations;
  }

  // Clicks "Show more" when the overview is collapsed and waits for it to
  // expand. Returns whether a click was performed.
  async function maybeExpand(root) {
    const button = root.querySelector(selectors.aio.showMoreButton);
    if (!button || button.getAttribute("aria-expanded") !== "false") return false;
    button.click();
    const waitMs = selectors.expandWaitMs ?? 1500;
    if (waitMs > 0) await new Promise((resolve) => setTimeout(resolve, waitMs));
    return true;
  }

  const aioRoot = findAIORoot();
  let aioText = null;
  let aioCitations = [];
  let aioExpanded = false;
  let aioFullyExpanded = false;
  let aioSections = [];
  if (aioRoot) {
    aioExpanded = await maybeExpand(aioRoot);
    const controller = aioRoot.querySelector(selectors.aio.controller);
    const contentSubtree = aioRoot.querySelector(selectors.aio.contentSubtree);
    const showMore = aioRoot.querySelector(selectors.aio.showMoreButton);
    aioFullyExpanded = controller?.getAttribute("data-trnct") === "false" || showMore?.getAttribute("aria-expanded") === "true" || !showMore;
    aioText = cleanText(contentSubtree ?? controller ?? aioRoot);
    // Numbered lines ("1. ...") are reported separately as section headings.
    aioSections = (aioText ?? "").split("\n").map((line) => line.trim()).filter((line) => /^\d+\.\s+.+/.test(line));
    aioCitations = extractCitations(aioRoot);
  }

  const aimRoot = document.querySelector(selectors.aim.root);
  const aimDetected = surface === "aim" && !!aimRoot;
  const aimContainer = aimRoot?.closest(selectors.aim.wrapper) ?? aimRoot;
  const aimText = cleanText(aimContainer);
  const aimCitations = aimDetected ? extractCitations(aimContainer) : [];

  return {
    surface,
    aiOverview: {
      detected: !!aioRoot && aioText !== null,
      text: aioText,
      citations: aioCitations,
      expanded: aioExpanded,
      fullyExpanded: aioFullyExpanded,
      sections: aioSections
    },
    aiMode: {
      detected: aimDetected && aimText !== null,
      text: aimText,
      citations: aimCitations
    }
  };
}
|
|
1261
|
+
|
|
530
1262
|
// src/extractor/PAAExtractor.ts
|
|
1263
|
+
// User-Agent header sent for desktop harvests.
var DESKTOP_USER_AGENT2 = [
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
  "AppleWebKit/537.36 (KHTML, like Gecko)",
  "Chrome/124.0.0.0",
  "Safari/537.36"
].join(" ");

// User-Agent header sent for mobile harvests.
var MOBILE_USER_AGENT2 = [
  "Mozilla/5.0 (iPhone; CPU iPhone OS 17_5 like Mac OS X)",
  "AppleWebKit/605.1.15 (KHTML, like Gecko)",
  "Version/17.5",
  "Mobile/15E148",
  "Safari/604.1"
].join(" ");
|
|
531
1265
|
var PAAExtractor = class {
|
|
532
1266
|
constructor(driver, reporter) {
|
|
533
1267
|
this.driver = driver;
|
|
@@ -538,6 +1272,17 @@ var PAAExtractor = class {
|
|
|
538
1272
|
normalizeQuestion(q) {
|
|
539
1273
|
return q.toLowerCase().replace(/[^\w\s]/g, "").replace(/\s+/g, " ").trim();
|
|
540
1274
|
}
|
|
1275
|
+
throwIfAborted(signal) {
|
|
1276
|
+
if (!signal?.aborted) return;
|
|
1277
|
+
if (signal.reason instanceof DOMException && signal.reason.name === "TimeoutError") throw signal.reason;
|
|
1278
|
+
throw new RequestAbortedError();
|
|
1279
|
+
}
|
|
1280
|
+
async throwIfCaptcha(page, context) {
|
|
1281
|
+
const captchaCount = await page.locator(PAASelectors.captchaMarker).count().catch(() => 0);
|
|
1282
|
+
if (captchaCount > 0) {
|
|
1283
|
+
throw new CaptchaError(`${context} returned a CAPTCHA \u2014 retrying with a fresh session.`);
|
|
1284
|
+
}
|
|
1285
|
+
}
|
|
541
1286
|
async extractVisibleItems(page) {
|
|
542
1287
|
const sels = PAASelectors;
|
|
543
1288
|
const raw = await page.evaluate((selectors) => {
|
|
@@ -600,10 +1345,10 @@ var PAAExtractor = class {
|
|
|
600
1345
|
extracted_at: (/* @__PURE__ */ new Date()).toISOString()
|
|
601
1346
|
};
|
|
602
1347
|
}
|
|
603
|
-
async runBFS(page, options) {
|
|
1348
|
+
async runBFS(page, options, signal) {
|
|
604
1349
|
const seenKeys = /* @__PURE__ */ new Set();
|
|
605
1350
|
const seenQs = /* @__PURE__ */ new Set();
|
|
606
|
-
const
|
|
1351
|
+
const orderedQs = [];
|
|
607
1352
|
const results = [];
|
|
608
1353
|
const readAllQs = () => page.evaluate(
|
|
609
1354
|
({ sel, dataQ, dataInitQ, questionEl }) => Array.from(document.querySelectorAll(sel)).map(
|
|
@@ -611,42 +1356,43 @@ var PAAExtractor = class {
|
|
|
611
1356
|
).filter(Boolean),
|
|
612
1357
|
{ sel: PAASelectors.item, dataQ: PAASelectors.itemDataQ, dataInitQ: PAASelectors.itemDataInitQ, questionEl: PAASelectors.itemQuestionEl }
|
|
613
1358
|
);
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
this.
|
|
618
|
-
if (seenQs.size >= options.maxQuestions) break;
|
|
1359
|
+
let round = 0;
|
|
1360
|
+
while (seenQs.size < options.maxQuestions) {
|
|
1361
|
+
this.throwIfAborted(signal);
|
|
1362
|
+
await this.throwIfCaptcha(page, "Google PAA expansion");
|
|
619
1363
|
const beforeQs = await readAllQs();
|
|
620
1364
|
if (beforeQs.length >= options.maxQuestions) break;
|
|
621
|
-
const
|
|
622
|
-
|
|
623
|
-
);
|
|
624
|
-
|
|
625
|
-
for (
|
|
1365
|
+
const unexpandedSel = `${PAASelectors.item}:not(.${PAASelectors.expandedClass}) ${PAASelectors.clickTarget}`;
|
|
1366
|
+
const unexpandedCount = await page.locator(unexpandedSel).count();
|
|
1367
|
+
if (unexpandedCount === 0) break;
|
|
1368
|
+
this.reporter.onDepth(++round);
|
|
1369
|
+
for (let ci = 0; ci < unexpandedCount; ci++) {
|
|
1370
|
+
this.throwIfAborted(signal);
|
|
626
1371
|
try {
|
|
627
|
-
|
|
628
|
-
await
|
|
1372
|
+
const btn = page.locator(unexpandedSel).first();
|
|
1373
|
+
await btn.scrollIntoViewIfNeeded();
|
|
1374
|
+
await btn.hover({ force: true });
|
|
1375
|
+
await page.waitForTimeout(100);
|
|
1376
|
+
await btn.click({ force: true });
|
|
629
1377
|
await page.waitForTimeout(500);
|
|
630
1378
|
} catch {
|
|
631
1379
|
}
|
|
632
1380
|
}
|
|
633
|
-
await page.
|
|
1381
|
+
await page.waitForFunction(
|
|
1382
|
+
({ sel, min }) => document.querySelectorAll(sel).length > min,
|
|
1383
|
+
{ sel: PAASelectors.item, min: beforeQs.length },
|
|
1384
|
+
{ timeout: 5e3 }
|
|
1385
|
+
).catch(() => {
|
|
1386
|
+
});
|
|
1387
|
+
await this.throwIfCaptcha(page, "Google PAA expansion");
|
|
634
1388
|
const afterQs = await readAllQs();
|
|
635
|
-
|
|
636
|
-
const newDups = newQs.filter((q) => seenQs.has(q)).length;
|
|
637
|
-
const dupRate = newQs.length > 0 ? newDups / newQs.length : 0;
|
|
638
|
-
dupRates.push(dupRate);
|
|
639
|
-
if (dupRates.length > 2) dupRates.shift();
|
|
640
|
-
const rollingDupRate = dupRates.reduce((a, b) => a + b, 0) / dupRates.length;
|
|
1389
|
+
if (afterQs.length === beforeQs.length) break;
|
|
641
1390
|
for (const q of afterQs) {
|
|
642
1391
|
if (!seenQs.has(q)) {
|
|
643
1392
|
seenQs.add(q);
|
|
644
1393
|
orderedQs.push(q);
|
|
645
1394
|
}
|
|
646
|
-
if (!depthMap.has(q)) depthMap.set(q, round + 1);
|
|
647
1395
|
}
|
|
648
|
-
if (afterQs.length === beforeQs.length) break;
|
|
649
|
-
if (rollingDupRate >= 0.6) break;
|
|
650
1396
|
}
|
|
651
1397
|
const itemMap = new Map((await this.extractVisibleItems(page)).map((i) => [i.question, i]));
|
|
652
1398
|
for (const q of orderedQs) {
|
|
@@ -654,13 +1400,12 @@ var PAAExtractor = class {
|
|
|
654
1400
|
const key = this.normalizeQuestion(q);
|
|
655
1401
|
if (seenKeys.has(key)) continue;
|
|
656
1402
|
seenKeys.add(key);
|
|
657
|
-
const d = depthMap.get(q) ?? 1;
|
|
658
1403
|
const item = itemMap.get(q);
|
|
659
1404
|
if (item) {
|
|
660
|
-
results.push(this.toFlatRow(item,
|
|
661
|
-
this.reporter.onQuestion({ question: item.question, answer: item.answer ?? null, sourceTitle: item.sourceTitle ?? null, sourceSite: item.sourceSite ?? null, sourceCite: item.sourceCite ?? null, depth:
|
|
1405
|
+
results.push(this.toFlatRow(item, 1, null, options.query));
|
|
1406
|
+
this.reporter.onQuestion({ question: item.question, answer: item.answer ?? null, sourceTitle: item.sourceTitle ?? null, sourceSite: item.sourceSite ?? null, sourceCite: item.sourceCite ?? null, depth: 1, parentQuestion: null, children: [] });
|
|
662
1407
|
} else {
|
|
663
|
-
results.push(this.toFlatRow({ question: q, answer: void 0, sourceTitle: void 0, sourceSite: void 0, sourceCite: void 0 },
|
|
1408
|
+
results.push(this.toFlatRow({ question: q, answer: void 0, sourceTitle: void 0, sourceSite: void 0, sourceCite: void 0 }, 1, null, options.query));
|
|
664
1409
|
}
|
|
665
1410
|
}
|
|
666
1411
|
return results;
|
|
@@ -718,6 +1463,7 @@ var PAAExtractor = class {
|
|
|
718
1463
|
} catch {
|
|
719
1464
|
return [];
|
|
720
1465
|
}
|
|
1466
|
+
await this.throwIfCaptcha(page, "Google short video search");
|
|
721
1467
|
const svSels = {
|
|
722
1468
|
item: ShortVideoSelectors.item,
|
|
723
1469
|
platforms: [...ShortVideoSelectors.platforms]
|
|
@@ -999,69 +1745,11 @@ var PAAExtractor = class {
|
|
|
999
1745
|
return { ...entityIds, entities: records, cids: [...cidSet] };
|
|
1000
1746
|
}
|
|
1001
1747
|
async extractAISurfaces(page) {
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
function findAIORoot() {
|
|
1008
|
-
const primary = document.querySelector(aio.root);
|
|
1009
|
-
if (primary) return primary;
|
|
1010
|
-
const headings = document.querySelectorAll('h1, h2, h3, [role="heading"]');
|
|
1011
|
-
for (const h of headings) {
|
|
1012
|
-
if (h.textContent?.trim() === "AI Overview") {
|
|
1013
|
-
let el = h.parentElement;
|
|
1014
|
-
for (let i = 0; i < 6 && el; i++) {
|
|
1015
|
-
if (el.querySelectorAll("a").length > 1) return el;
|
|
1016
|
-
el = el.parentElement;
|
|
1017
|
-
}
|
|
1018
|
-
return h.parentElement;
|
|
1019
|
-
}
|
|
1020
|
-
}
|
|
1021
|
-
return null;
|
|
1022
|
-
}
|
|
1023
|
-
const aioRoot = findAIORoot();
|
|
1024
|
-
const aioContainer = aioRoot ? aioRoot.closest(aio.wrapper) ?? aioRoot : null;
|
|
1025
|
-
let aioText = null;
|
|
1026
|
-
if (aioContainer) {
|
|
1027
|
-
const clone = aioContainer.cloneNode(true);
|
|
1028
|
-
clone.querySelectorAll("script,style,noscript").forEach((el) => el.remove());
|
|
1029
|
-
clone.querySelectorAll('h1,h2,h3,h4,[role="heading"]').forEach((el) => el.remove());
|
|
1030
|
-
clone.querySelectorAll('button,[role="button"]').forEach((el) => el.remove());
|
|
1031
|
-
clone.querySelectorAll("a").forEach((el) => el.remove());
|
|
1032
|
-
const candidate = clone.textContent?.replace(/\s+/g, " ").trim() || null;
|
|
1033
|
-
const isErrorState = !candidate || /not available|try again|can't generate/i.test(candidate);
|
|
1034
|
-
aioText = isErrorState ? null : candidate;
|
|
1035
|
-
}
|
|
1036
|
-
const aioDetected = !!aioRoot && aioText !== null;
|
|
1037
|
-
const aioCitations = Array.from(aioContainer?.querySelectorAll("a[href]") ?? []).filter((a) => a.href && !a.href.startsWith("javascript")).map((a) => ({
|
|
1038
|
-
text: a.textContent?.trim() ?? "",
|
|
1039
|
-
href: a.href
|
|
1040
|
-
})).filter((c) => c.text && c.href);
|
|
1041
|
-
const aimRoot = document.querySelector(aim.root);
|
|
1042
|
-
const aimDetected = surface === "aim" && !!aimRoot;
|
|
1043
|
-
const aimContainer = aimRoot?.closest(aim.wrapper) ?? null;
|
|
1044
|
-
let aimText = null;
|
|
1045
|
-
if (aimContainer) {
|
|
1046
|
-
const clone = aimContainer.cloneNode(true);
|
|
1047
|
-
clone.querySelectorAll("script,style,noscript").forEach((el) => el.remove());
|
|
1048
|
-
clone.querySelectorAll('h1,h2,h3,h4,[role="heading"]').forEach((el) => el.remove());
|
|
1049
|
-
clone.querySelectorAll('button,[role="button"]').forEach((el) => el.remove());
|
|
1050
|
-
clone.querySelectorAll("a").forEach((el) => el.remove());
|
|
1051
|
-
const candidate = clone.textContent?.replace(/\s+/g, " ").trim() || null;
|
|
1052
|
-
const isErrorState = !candidate || /not available|try again|can't generate/i.test(candidate);
|
|
1053
|
-
aimText = isErrorState ? null : candidate;
|
|
1054
|
-
}
|
|
1055
|
-
const aimCitations = aimDetected ? Array.from(aimContainer?.querySelectorAll("a[href]") ?? []).filter((a) => a.href && !a.href.startsWith("javascript")).map((a) => ({
|
|
1056
|
-
text: a.textContent?.trim() ?? "",
|
|
1057
|
-
href: a.href
|
|
1058
|
-
})).filter((c) => c.text && c.href) : [];
|
|
1059
|
-
return {
|
|
1060
|
-
surface,
|
|
1061
|
-
aiOverview: { detected: aioDetected, text: aioText, citations: aioCitations },
|
|
1062
|
-
aiMode: { detected: aimDetected, text: aimText, citations: aimCitations }
|
|
1063
|
-
};
|
|
1064
|
-
}, { aio: aioSels, aim: aimSels });
|
|
1748
|
+
return page.evaluate(extractAISurfacesFromDocument, {
|
|
1749
|
+
aio: AIOverviewSelectors,
|
|
1750
|
+
aim: AIModeSelectors,
|
|
1751
|
+
expandWaitMs: 1500
|
|
1752
|
+
});
|
|
1065
1753
|
}
|
|
1066
1754
|
buildTree(flat, _seed) {
|
|
1067
1755
|
const roots = [];
|
|
@@ -1088,23 +1776,70 @@ var PAAExtractor = class {
|
|
|
1088
1776
|
}
|
|
1089
1777
|
return roots;
|
|
1090
1778
|
}
|
|
1091
|
-
|
|
1779
|
+
getBrowserDebugSnapshot() {
|
|
1780
|
+
return this.driver.getDebugSnapshot();
|
|
1781
|
+
}
|
|
1782
|
+
buildHarvestDebugSnapshot(options, canonicalLocation, uule, locationEvidence) {
|
|
1783
|
+
if (!options.debug) return void 0;
|
|
1784
|
+
return {
|
|
1785
|
+
enabled: true,
|
|
1786
|
+
request: {
|
|
1787
|
+
query: options.query,
|
|
1788
|
+
locationInput: options.location ?? null,
|
|
1789
|
+
canonicalLocation,
|
|
1790
|
+
uule,
|
|
1791
|
+
gl: options.gl,
|
|
1792
|
+
hl: options.hl,
|
|
1793
|
+
device: options.device,
|
|
1794
|
+
proxyMode: options.proxyMode,
|
|
1795
|
+
proxyZip: options.proxyZip ?? null,
|
|
1796
|
+
serpOnly: options.serpOnly,
|
|
1797
|
+
pages: options.pages ?? 1
|
|
1798
|
+
},
|
|
1799
|
+
browser: this.getBrowserDebugSnapshot(),
|
|
1800
|
+
...locationEvidence ? { locationEvidence } : {}
|
|
1801
|
+
};
|
|
1802
|
+
}
|
|
1803
|
+
async extract(options, signal) {
|
|
1092
1804
|
const startMs = Date.now();
|
|
1805
|
+
const isMobile = options.device === "mobile";
|
|
1093
1806
|
const config = {
|
|
1094
1807
|
headless: options.headless,
|
|
1095
1808
|
profileDir: options.profileDir,
|
|
1096
1809
|
proxy: options.proxy,
|
|
1097
1810
|
kernelApiKey: options.kernelApiKey,
|
|
1098
1811
|
kernelProxyId: options.kernelProxyId,
|
|
1099
|
-
|
|
1100
|
-
|
|
1812
|
+
kernelProxyResolution: options.kernelProxyResolution,
|
|
1813
|
+
proxyMode: options.proxyMode,
|
|
1814
|
+
viewport: isMobile ? { width: 390, height: 844 } : { width: 1280, height: 800 },
|
|
1815
|
+
locale: `${options.hl}-${options.gl.toUpperCase()}`,
|
|
1816
|
+
userAgent: isMobile ? MOBILE_USER_AGENT2 : DESKTOP_USER_AGENT2,
|
|
1817
|
+
deviceScaleFactor: isMobile ? 3 : 1,
|
|
1818
|
+
isMobile,
|
|
1819
|
+
hasTouch: isMobile,
|
|
1820
|
+
debug: options.debug
|
|
1101
1821
|
};
|
|
1102
1822
|
let errorCount = 0;
|
|
1823
|
+
const diagnosticWarnings = [];
|
|
1103
1824
|
try {
|
|
1825
|
+
this.throwIfAborted(signal);
|
|
1104
1826
|
await this.driver.launch(config);
|
|
1105
|
-
|
|
1106
|
-
const
|
|
1827
|
+
this.throwIfAborted(signal);
|
|
1828
|
+
const canonicalLocation = options.location ? normalizeLocation(options.location) : null;
|
|
1829
|
+
const uule = canonicalLocation ? encodeUule(canonicalLocation) : null;
|
|
1830
|
+
const { hasPaa } = await this.driver.navigateToSERP(
|
|
1831
|
+
options.query,
|
|
1832
|
+
uule,
|
|
1833
|
+
options.gl,
|
|
1834
|
+
options.hl,
|
|
1835
|
+
{
|
|
1836
|
+
...options.serpOnly ? { num: 100 } : {},
|
|
1837
|
+
debug: options.debug
|
|
1838
|
+
}
|
|
1839
|
+
);
|
|
1840
|
+
this.throwIfAborted(signal);
|
|
1107
1841
|
const page = this.driver.getPage();
|
|
1842
|
+
await this.throwIfCaptcha(page, "Google SERP");
|
|
1108
1843
|
if (options.serpOnly) {
|
|
1109
1844
|
const [organicResults2, localPack2, rawEntityIds2] = await Promise.all([
|
|
1110
1845
|
this.extractOrganicResults(page),
|
|
@@ -1112,13 +1847,19 @@ var PAAExtractor = class {
|
|
|
1112
1847
|
this.extractEntityIds(page)
|
|
1113
1848
|
]);
|
|
1114
1849
|
const entityIds2 = this.mergeLocalPackIntoEntities(rawEntityIds2, localPack2);
|
|
1850
|
+
const aiSurfaces2 = await this.extractAISurfaces(page);
|
|
1851
|
+
let locationEvidence2 = options.debug ? inferSerpLocationEvidence(canonicalLocation, organicResults2, localPack2) : void 0;
|
|
1115
1852
|
let allOrganic2 = organicResults2;
|
|
1116
1853
|
if ((options.pages ?? 1) >= 2) {
|
|
1117
|
-
const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, start: "10" });
|
|
1854
|
+
const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, pws: "0", start: "10" });
|
|
1118
1855
|
if (uule) p2params.set("uule", uule);
|
|
1119
1856
|
await this.driver.navigateTo("https://www.google.com/search?" + p2params.toString());
|
|
1857
|
+
await this.throwIfCaptcha(page, "Google SERP page 2");
|
|
1120
1858
|
const p2organic = await this.extractOrganicResults(page);
|
|
1121
1859
|
allOrganic2 = [...organicResults2, ...p2organic.map((r) => ({ ...r, position: r.position + 10 }))];
|
|
1860
|
+
if (options.debug) {
|
|
1861
|
+
locationEvidence2 = inferSerpLocationEvidence(canonicalLocation, allOrganic2, localPack2);
|
|
1862
|
+
}
|
|
1122
1863
|
}
|
|
1123
1864
|
const stats2 = {
|
|
1124
1865
|
seed: options.query,
|
|
@@ -1132,10 +1873,15 @@ var PAAExtractor = class {
|
|
|
1132
1873
|
seed: options.query,
|
|
1133
1874
|
location: options.location ?? null,
|
|
1134
1875
|
extractedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
1876
|
+
diagnostics: {
|
|
1877
|
+
completionStatus: "serp_only",
|
|
1878
|
+
problem: null,
|
|
1879
|
+
...options.debug ? { debug: this.buildHarvestDebugSnapshot(options, canonicalLocation, uule, locationEvidence2) } : {}
|
|
1880
|
+
},
|
|
1135
1881
|
totalQuestions: 0,
|
|
1136
|
-
surface:
|
|
1137
|
-
aiOverview:
|
|
1138
|
-
aiMode:
|
|
1882
|
+
surface: aiSurfaces2.surface,
|
|
1883
|
+
aiOverview: aiSurfaces2.aiOverview,
|
|
1884
|
+
aiMode: aiSurfaces2.aiMode,
|
|
1139
1885
|
whatPeopleSaying: [],
|
|
1140
1886
|
tree: [],
|
|
1141
1887
|
flat: [],
|
|
@@ -1156,16 +1902,22 @@ var PAAExtractor = class {
|
|
|
1156
1902
|
this.extractLocalPack(page)
|
|
1157
1903
|
]);
|
|
1158
1904
|
const entityIds = this.mergeLocalPackIntoEntities(rawEntityIds, localPack);
|
|
1905
|
+
const initialLocationEvidence = options.debug ? inferSerpLocationEvidence(canonicalLocation, organicResults, localPack) : void 0;
|
|
1159
1906
|
this.reporter.onVideos(videos);
|
|
1160
1907
|
this.reporter.onForums(forums);
|
|
1161
1908
|
if (!hasPaa) {
|
|
1162
1909
|
let noPaaOrganic = organicResults;
|
|
1910
|
+
let locationEvidence2 = initialLocationEvidence;
|
|
1163
1911
|
if ((options.pages ?? 1) >= 2) {
|
|
1164
|
-
const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, start: "10" });
|
|
1912
|
+
const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, pws: "0", start: "10" });
|
|
1165
1913
|
if (uule) p2params.set("uule", uule);
|
|
1166
1914
|
await this.driver.navigateTo("https://www.google.com/search?" + p2params.toString());
|
|
1915
|
+
await this.throwIfCaptcha(page, "Google SERP page 2");
|
|
1167
1916
|
const p2organic = await this.extractOrganicResults(page);
|
|
1168
1917
|
noPaaOrganic = [...organicResults, ...p2organic.map((r) => ({ ...r, position: r.position + 10 }))];
|
|
1918
|
+
if (options.debug) {
|
|
1919
|
+
locationEvidence2 = inferSerpLocationEvidence(canonicalLocation, noPaaOrganic, localPack);
|
|
1920
|
+
}
|
|
1169
1921
|
}
|
|
1170
1922
|
const aiSurfaces2 = await this.extractAISurfaces(page);
|
|
1171
1923
|
const stats2 = {
|
|
@@ -1180,6 +1932,11 @@ var PAAExtractor = class {
|
|
|
1180
1932
|
seed: options.query,
|
|
1181
1933
|
location: options.location ?? null,
|
|
1182
1934
|
extractedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
1935
|
+
diagnostics: {
|
|
1936
|
+
completionStatus: "no_paa",
|
|
1937
|
+
problem: null,
|
|
1938
|
+
...options.debug ? { debug: this.buildHarvestDebugSnapshot(options, canonicalLocation, uule, locationEvidence2) } : {}
|
|
1939
|
+
},
|
|
1183
1940
|
totalQuestions: 0,
|
|
1184
1941
|
surface: aiSurfaces2.surface,
|
|
1185
1942
|
aiOverview: aiSurfaces2.aiOverview,
|
|
@@ -1195,19 +1952,37 @@ var PAAExtractor = class {
|
|
|
1195
1952
|
stats: stats2
|
|
1196
1953
|
};
|
|
1197
1954
|
}
|
|
1198
|
-
const flat = await this.runBFS(page, options);
|
|
1955
|
+
const flat = await this.runBFS(page, options, signal);
|
|
1956
|
+
this.throwIfAborted(signal);
|
|
1199
1957
|
const aiSurfaces = await this.extractAISurfaces(page);
|
|
1200
|
-
const shortVidsParams = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, udm: ShortVideoSelectors.udm });
|
|
1958
|
+
const shortVidsParams = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, pws: "0", udm: ShortVideoSelectors.udm });
|
|
1201
1959
|
if (uule) shortVidsParams.set("uule", uule);
|
|
1202
|
-
|
|
1960
|
+
let shortVideos = [];
|
|
1961
|
+
try {
|
|
1962
|
+
shortVideos = await this.extractShortVideos(page, "https://www.google.com/search?" + shortVidsParams.toString());
|
|
1963
|
+
} catch (err) {
|
|
1964
|
+
if (!(err instanceof CaptchaError)) throw err;
|
|
1965
|
+
errorCount++;
|
|
1966
|
+
diagnosticWarnings.push({
|
|
1967
|
+
code: "short_videos_captcha_skipped",
|
|
1968
|
+
surface: "short_videos",
|
|
1969
|
+
message: err.message,
|
|
1970
|
+
retryable: true
|
|
1971
|
+
});
|
|
1972
|
+
}
|
|
1203
1973
|
this.reporter.onVideos(shortVideos);
|
|
1204
1974
|
let allOrganic = organicResults;
|
|
1975
|
+
let locationEvidence = initialLocationEvidence;
|
|
1205
1976
|
if ((options.pages ?? 1) >= 2) {
|
|
1206
|
-
const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, start: "10" });
|
|
1977
|
+
const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, pws: "0", start: "10" });
|
|
1207
1978
|
if (uule) p2params.set("uule", uule);
|
|
1208
1979
|
await this.driver.navigateTo("https://www.google.com/search?" + p2params.toString());
|
|
1980
|
+
await this.throwIfCaptcha(page, "Google SERP page 2");
|
|
1209
1981
|
const p2organic = await this.extractOrganicResults(page);
|
|
1210
1982
|
allOrganic = [...organicResults, ...p2organic.map((r) => ({ ...r, position: r.position + 10 }))];
|
|
1983
|
+
if (options.debug) {
|
|
1984
|
+
locationEvidence = inferSerpLocationEvidence(canonicalLocation, allOrganic, localPack);
|
|
1985
|
+
}
|
|
1211
1986
|
}
|
|
1212
1987
|
const allVideos = [...videos, ...shortVideos];
|
|
1213
1988
|
const tree = this.buildTree(flat, options.query);
|
|
@@ -1223,6 +1998,12 @@ var PAAExtractor = class {
|
|
|
1223
1998
|
seed: options.query,
|
|
1224
1999
|
location: options.location ?? null,
|
|
1225
2000
|
extractedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2001
|
+
diagnostics: {
|
|
2002
|
+
completionStatus: "paa_found",
|
|
2003
|
+
problem: null,
|
|
2004
|
+
...diagnosticWarnings.length > 0 ? { warnings: diagnosticWarnings } : {},
|
|
2005
|
+
...options.debug ? { debug: this.buildHarvestDebugSnapshot(options, canonicalLocation, uule, locationEvidence) } : {}
|
|
2006
|
+
},
|
|
1226
2007
|
totalQuestions: flat.length,
|
|
1227
2008
|
surface: aiSurfaces.surface,
|
|
1228
2009
|
aiOverview: aiSurfaces.aiOverview,
|
|
@@ -1241,8 +2022,6 @@ var PAAExtractor = class {
|
|
|
1241
2022
|
errorCount++;
|
|
1242
2023
|
this.reporter.onError(err instanceof Error ? err : new Error(String(err)));
|
|
1243
2024
|
throw err;
|
|
1244
|
-
} finally {
|
|
1245
|
-
await this.driver.close();
|
|
1246
2025
|
}
|
|
1247
2026
|
}
|
|
1248
2027
|
};
|
|
@@ -1356,53 +2135,646 @@ var ProgressReporter = class {
|
|
|
1356
2135
|
}
|
|
1357
2136
|
};
|
|
1358
2137
|
|
|
2138
|
+
// src/kernel-proxy-resolver.ts
|
|
2139
|
+
var import_sdk2 = __toESM(require("@onkernel/sdk"), 1);
|
|
2140
|
+
/**
 * Lookup from lower-cased US state name to its two-letter postal code.
 * Keys are expected in the form produced by normalizeStateName
 * (trimmed, lower-case, single spaces). Covers the 50 states only.
 */
var US_STATE_CODES = {
  alabama: "AL", alaska: "AK", arizona: "AZ", arkansas: "AR", california: "CA",
  colorado: "CO", connecticut: "CT", delaware: "DE", florida: "FL", georgia: "GA",
  hawaii: "HI", idaho: "ID", illinois: "IL", indiana: "IN", iowa: "IA",
  kansas: "KS", kentucky: "KY", louisiana: "LA", maine: "ME", maryland: "MD",
  massachusetts: "MA", michigan: "MI", minnesota: "MN", mississippi: "MS", missouri: "MO",
  montana: "MT", nebraska: "NE", nevada: "NV", "new hampshire": "NH", "new jersey": "NJ",
  "new mexico": "NM", "new york": "NY", "north carolina": "NC", "north dakota": "ND", ohio: "OH",
  oklahoma: "OK", oregon: "OR", pennsylvania: "PA", "rhode island": "RI", "south carolina": "SC",
  "south dakota": "SD", tennessee: "TN", texas: "TX", utah: "UT", vermont: "VT",
  virginia: "VA", washington: "WA", "west virginia": "WV", wisconsin: "WI", wyoming: "WY"
};
|
|
2192
|
+
/**
 * ZIP code to use for selected US cities, keyed by "<city identifier>|<STATE>"
 * (the same shape cityZipKey builds: underscore-joined lower-case city name,
 * a pipe, then the two-letter state code). Values are kept as strings so that
 * leading zeros (e.g. Boston) survive.
 */
var US_CITY_CENTER_ZIPS = {
  "atlanta|GA": "30303", "austin|TX": "78701", "baltimore|MD": "21201",
  "boston|MA": "02108", "boulder|CO": "80302", "charlotte|NC": "28202",
  "chicago|IL": "60601", "colorado_springs|CO": "80903", "columbus|OH": "43215",
  "dallas|TX": "75201", "denver|CO": "80202", "detroit|MI": "48226",
  "fort_collins|CO": "80524", "fort_worth|TX": "76102", "houston|TX": "77002",
  "indianapolis|IN": "46204", "jacksonville|FL": "32202", "las_vegas|NV": "89101",
  "los_angeles|CA": "90012", "louisville|KY": "40202", "loveland|CO": "80537",
  "memphis|TN": "38103", "miami|FL": "33131", "minneapolis|MN": "55401",
  "nashville|TN": "37203", "new_york|NY": "10001", "orlando|FL": "32801",
  "philadelphia|PA": "19103", "phoenix|AZ": "85004", "portland|OR": "97205",
  "raleigh|NC": "27601", "richmond|VA": "23219", "sacramento|CA": "95814",
  "salt_lake_city|UT": "84101", "san_antonio|TX": "78205", "san_diego|CA": "92101",
  "san_francisco|CA": "94103", "san_jose|CA": "95113", "seattle|WA": "98101"
};
|
|
2233
|
+
/**
 * Returns the trailing six characters of a proxy id so it can be reported
 * without exposing the whole identifier.
 *
 * @param {string | undefined | null} proxyId
 * @returns {string | null} The last (up to) six characters, or null when the id is empty/missing.
 */
function proxyIdSuffix2(proxyId) {
  if (!proxyId) return null;
  return proxyId.slice(-6);
}
|
|
2236
|
+
/**
 * Builds the value returned by resolveKernelProxyId: the proxy id to use plus
 * a descriptive record of how it was chosen.
 *
 * @param {string} source    How the id was obtained (e.g. "disabled", "location_created").
 * @param {string} proxyMode The proxy mode that was in effect.
 * @param {string | undefined} proxyId The selected proxy id, if any.
 * @param {object | null} target       The location target the id was resolved for.
 * @param {string | null} error        Accumulated error text, if any.
 * @returns {{ kernelProxyId: string | undefined, resolution: object }}
 */
function resolution(source, proxyMode, proxyId, target, error) {
  const details = {
    source,
    proxyMode,
    proxyIdPresent: Boolean(proxyId),
    // Only a short suffix is recorded, never the full id.
    proxyIdSuffix: proxyIdSuffix2(proxyId),
    target,
    error
  };
  return { kernelProxyId: proxyId, resolution: details };
}
|
|
2249
|
+
/**
 * Canonicalises a state name for table lookup: trims it, lower-cases it and
 * collapses every run of whitespace into a single space.
 *
 * @param {string} value
 * @returns {string}
 */
function normalizeStateName(value) {
  const lowered = value.trim().toLowerCase();
  return lowered.replace(/\s+/g, " ");
}
|
|
2252
|
+
/**
 * Canonicalises a country name for comparison: trims, lower-cases, drops all
 * periods (so "U.S.A." becomes "usa") and collapses whitespace runs.
 *
 * @param {string} value
 * @returns {string}
 */
function normalizeCountryName(value) {
  const lowered = value.trim().toLowerCase();
  const withoutDots = lowered.replace(/\./g, "");
  return withoutDots.replace(/\s+/g, " ");
}
|
|
2255
|
+
/**
 * Reports whether a country label refers to the United States.
 * A missing/empty label is treated as the United States.
 *
 * @param {string | undefined | null} country
 * @returns {boolean}
 */
function isUnitedStates(country) {
  if (!country) return true;
  const recognised = ["united states", "united states of america", "usa", "us"];
  return recognised.includes(normalizeCountryName(country));
}
|
|
2260
|
+
/**
 * Converts a region string into a two-letter state code.
 *
 * - Any two-letter alphabetic string is accepted as-is and upper-cased; it is
 *   NOT checked against US_STATE_CODES, so codes outside that table pass through.
 * - Otherwise the normalised name is looked up in US_STATE_CODES.
 *
 * @param {string} region State name or two-letter code.
 * @returns {string | null} The upper-case code, or null when the name is unknown.
 */
function stateCodeFor(region) {
  const trimmed = region.trim();
  if (/^[A-Za-z]{2}$/.test(trimmed)) return trimmed.toUpperCase();
  const key = normalizeStateName(trimmed);
  // Own-property check: a plain bracket lookup would also resolve inherited
  // Object.prototype members (e.g. "constructor", "valueOf") and return a
  // function instead of a state code.
  if (!Object.prototype.hasOwnProperty.call(US_STATE_CODES, key)) return null;
  return US_STATE_CODES[key] ?? null;
}
|
|
2265
|
+
/**
 * Derives the identifier spellings to try for a city name.
 *
 * The name is NFKD-normalised, stripped of non-ASCII characters (which removes
 * the combining marks left by decomposition), lower-cased and split on any run
 * of characters other than a-z / 0-9. Two spellings are produced, in order:
 * words joined with "_" and words joined with nothing. Empty and duplicate
 * spellings are dropped, so a one-word city yields a single candidate.
 *
 * @param {string} city
 * @returns {string[]}
 */
function kernelCityIdentifierCandidates(city) {
  const asciiOnly = city.normalize("NFKD").replace(/[^\x00-\x7F]/g, "").toLowerCase();
  const tokens = asciiOnly.split(/[^a-z0-9]+/).filter((token) => token.length > 0);
  const candidates = [];
  for (const spelling of [tokens.join("_"), tokens.join("")]) {
    if (spelling && !candidates.includes(spelling)) candidates.push(spelling);
  }
  return candidates;
}
|
|
2272
|
+
/**
 * Builds the name used for a state- or city-level residential proxy.
 * Country and state are lower-cased; the city identifier is appended verbatim
 * when one is supplied.
 *
 * @param {string} country
 * @param {string} state
 * @param {string} [city]
 * @returns {string}
 */
function proxyName(country, state, city) {
  const stateLevelName = `mcp-serp-residential-${country.toLowerCase()}-${state.toLowerCase()}`;
  if (!city) return stateLevelName;
  return `${stateLevelName}-${city}`;
}
|
|
2275
|
+
/**
 * Builds the name used for a ZIP-level US residential proxy.
 *
 * @param {string} zip
 * @returns {string}
 */
function zipProxyName(zip) {
  const prefix = "mcp-serp-residential-us-zip-";
  return `${prefix}${zip}`;
}
|
|
2278
|
+
/**
 * Turns a free-form location into a US proxy target description.
 *
 * The location is canonicalised with normalizeLocation and split on commas.
 * A trailing United-States segment is dropped when more than one segment is
 * present. One remaining segment is read as a state; otherwise the first two
 * segments are read as "city, state" (any further segments are ignored).
 *
 * @param {string | undefined} location Free-form location text.
 * @param {string} gl Country code of the request; only "us" (any case) is handled.
 * @returns {object | null} A target with `level` "state" or "city", or null when
 *   the location is missing, gl is not "us", or no state/city can be derived.
 */
function parseKernelLocationProxyTarget(location, gl) {
  if (!location) return null;
  if (gl.toLowerCase() !== "us") return null;

  const canonicalLocation = normalizeLocation(location);
  const segments = canonicalLocation.split(",").map((segment) => segment.trim()).filter(Boolean);
  const endsWithCountry = segments.length > 1 && isUnitedStates(segments[segments.length - 1]);
  const parts = endsWithCountry ? segments.slice(0, -1) : segments;

  // Single segment: state-level target with no city information.
  if (parts.length === 1) {
    const stateOnly = stateCodeFor(parts[0]);
    if (!stateOnly) return null;
    return {
      canonicalLocation,
      level: "state",
      country: "US",
      state: stateOnly,
      city: "",
      cityCandidates: [],
      proxyName: proxyName("US", stateOnly),
      config: { country: "US", state: stateOnly }
    };
  }

  const city = parts[0] ?? "";
  const region = parts[1] ?? "";
  if (!city || !region) return null;

  const state = stateCodeFor(region);
  if (!state) return null;

  const cityCandidates = kernelCityIdentifierCandidates(city);
  const [primaryCity] = cityCandidates;
  if (!primaryCity) return null;

  return {
    canonicalLocation,
    level: "city",
    country: "US",
    state,
    city: primaryCity,
    cityCandidates,
    proxyName: proxyName("US", state, primaryCity),
    config: { country: "US", state, city: primaryCity }
  };
}
|
|
2324
|
+
/**
 * Builds the "<city>|<state>" key used to look a target up in US_CITY_CENTER_ZIPS.
 *
 * @param {{ city: string, state: string }} target
 * @returns {string}
 */
function cityZipKey(target) {
  const city = target.city;
  const state = target.state;
  return `${city}|${state}`;
}
|
|
2327
|
+
/**
 * Picks the ZIP code to use for a target.
 * A caller-supplied ZIP wins when it is exactly five digits; otherwise the
 * built-in city table is consulted.
 *
 * @param {{ city: string, state: string }} target
 * @param {string | undefined} explicitZip
 * @returns {string | null} The ZIP, or null when neither source provides one.
 */
function knownZipFor(target, explicitZip) {
  const explicitIsUsable = Boolean(explicitZip) && /^\d{5}$/.test(explicitZip);
  if (explicitIsUsable) return explicitZip;
  const tableZip = US_CITY_CENTER_ZIPS[cityZipKey(target)];
  return tableZip ?? null;
}
|
|
2331
|
+
/**
 * Derives a ZIP-level target from an existing target.
 * All fields of the source target are carried over; `level`, `zip`,
 * `proxyName` and `config` are replaced.
 *
 * @param {object} target Source target (must provide `country` and `state`).
 * @param {string} zip
 * @returns {object}
 */
function zipTarget(target, zip) {
  const config = { country: target.country, state: target.state, zip };
  const overrides = { level: "zip", zip, proxyName: zipProxyName(zip), config };
  return { ...target, ...overrides };
}
|
|
2344
|
+
/**
 * Tests whether an existing proxy's config corresponds to a target.
 *
 * - ZIP targets: country (case-insensitive) and zip must match.
 * - Other targets: country and state (both case-insensitive) must match, and
 *   the config's city must equal `city` when one is given, or be absent/empty
 *   when none is given.
 *
 * @param {object | undefined | null} config Proxy configuration to inspect.
 * @param {object} target Target being resolved.
 * @param {string} [city] City identifier to require.
 * @returns {boolean}
 */
function configMatches(config, target, city) {
  const countryMatches = config?.country?.toUpperCase() === target.country;
  if (target.level === "zip") {
    return countryMatches && config?.zip === target.zip;
  }
  if (!countryMatches) return false;
  if (config?.state?.toUpperCase() !== target.state) return false;
  if (city) return config?.city === city;
  return !config?.city;
}
|
|
2350
|
+
/**
 * Finds the first usable residential proxy that belongs to a target, matched
 * either by the target's proxy name or by its config.
 * A proxy is usable when its type is "residential", its status is not
 * "unavailable", and it has an id.
 *
 * @param {object[]} proxies Proxies to search.
 * @param {object} target Target providing `proxyName`, `level` and (for city targets) `city`.
 * @returns {object | null}
 */
function findExistingTargetProxy(proxies, target) {
  // Only city-level targets constrain the config's city.
  const requiredCity = target.level === "city" ? target.city : void 0;
  const match = proxies.find((proxy) => {
    if (proxy.type !== "residential") return false;
    if (proxy.status === "unavailable") return false;
    if (!proxy.id) return false;
    return proxy.name === target.proxyName || configMatches(proxy.config, target, requiredCity);
  });
  return match ?? null;
}
|
|
2353
|
+
/**
 * Finds a usable residential proxy for any of the target's city spellings.
 * Candidates are tried in order; for each one the first proxy whose name or
 * config matches is returned. A proxy is usable when its type is
 * "residential", its status is not "unavailable", and it has an id.
 *
 * @param {object[]} proxies Proxies to search.
 * @param {object} target Target providing `country`, `state` and `cityCandidates`.
 * @returns {object | null} The matching proxy, or null (always null when there are no candidates).
 */
function findExistingProxy(proxies, target) {
  const isUsable = (proxy) =>
    proxy.type === "residential" && proxy.status !== "unavailable" && Boolean(proxy.id);
  for (const candidate of target.cityCandidates) {
    const expectedName = proxyName(target.country, target.state, candidate);
    const match = proxies.find(
      (proxy) => isUsable(proxy) && (proxy.name === expectedName || configMatches(proxy.config, target, candidate))
    );
    if (match) return match;
  }
  return null;
}
|
|
2361
|
+
/**
 * Derives a state-level target from an existing target.
 * All fields of the source target are carried over (including any city
 * fields); `level`, `proxyName` and `config` are replaced.
 *
 * @param {object} target Source target (must provide `country` and `state`).
 * @returns {object}
 */
function stateTarget(target) {
  const { country, state } = target;
  const overrides = {
    level: "state",
    proxyName: proxyName(country, state),
    config: { country, state }
  };
  return { ...target, ...overrides };
}
|
|
2372
|
+
/**
 * Finds the first usable residential proxy for a state, matched by the
 * state-level proxy name or by a config with matching country/state and no city.
 * A proxy is usable when its type is "residential", its status is not
 * "unavailable", and it has an id.
 *
 * @param {object[]} proxies Proxies to search.
 * @param {object} target Target providing `country` and `state`.
 * @returns {object | null}
 */
function findExistingStateProxy(proxies, target) {
  const expectedName = proxyName(target.country, target.state);
  const match = proxies.find((proxy) => {
    const usable = proxy.type === "residential" && proxy.status !== "unavailable" && Boolean(proxy.id);
    if (!usable) return false;
    return proxy.name === expectedName || configMatches(proxy.config, target);
  });
  return match ?? null;
}
|
|
2376
|
+
/**
 * Returns the target to use on a retry attempt. Currently this is always the
 * state-level version of the target.
 *
 * @param {object} target Original target.
 * @param {number} attemptIndex Zero-based attempt index; accepted but not used at present.
 * @returns {object}
 */
function escalatedTargetLevel(target, attemptIndex) {
  const widened = stateTarget(target);
  return widened;
}
|
|
2379
|
+
/**
 * Extracts displayable text from a thrown value: the message of an Error,
 * or the string form of anything else.
 *
 * @param {unknown} err
 * @returns {string}
 */
function errorText2(err) {
  if (err instanceof Error) return err.message;
  return String(err);
}
|
|
2382
|
+
/**
 * Decides which Kernel proxy id a harvest attempt should use.
 *
 * Behaviour by `options.proxyMode`:
 * - "none": no proxy (source "disabled").
 * - "configured": the caller-configured proxy id (source "configured_fallback").
 * - anything else: derive a US location target and try, in order,
 *   ZIP proxy (reuse, then create) -> city proxy (reuse, then create per
 *   candidate spelling) -> state proxy (reuse, then create). If everything
 *   fails, or no target / API key is available, the configured id is returned
 *   together with the collected error text.
 *
 * On retries (`options.attemptIndex >= 1`) only a state-level proxy is created.
 * NOTE(review): the retry path never lists existing proxies, so it asks Kernel
 * to create the same-named state proxy on every retry — confirm that is intended.
 *
 * This function does not reject for Kernel API failures: they are caught and
 * reported through the `error` field of the result.
 *
 * @param {object} options
 * @param {string} options.proxyMode
 * @param {string} [options.kernelApiKey]
 * @param {string} [options.configuredKernelProxyId]
 * @param {string} [options.location]
 * @param {string} [options.proxyZip]
 * @param {string} options.gl
 * @param {number} [options.attemptIndex]
 * @returns {Promise<{ kernelProxyId: string | undefined, resolution: object }>}
 */
async function resolveKernelProxyId(options) {
  const { proxyMode, configuredKernelProxyId } = options;
  const configuredFallback = (target, error) =>
    resolution("configured_fallback", proxyMode, configuredKernelProxyId, target, error);

  if (proxyMode === "none") {
    return resolution("disabled", proxyMode, void 0, null, null);
  }
  if (proxyMode === "configured") {
    return configuredFallback(null, null);
  }

  const target = parseKernelLocationProxyTarget(options.location, options.gl);
  if (!target || !options.kernelApiKey) {
    return configuredFallback(target, target ? null : "location could not be normalized to a US city/state proxy target");
  }

  const kernel = new import_sdk2.default({ apiKey: options.kernelApiKey });
  const createErrors = [];
  // null (not "") when nothing failed, so every success path reports errors the same way.
  const errorSummary = () => createErrors.join(" | ") || null;
  // Creates one residential proxy; on failure records "<label>: <reason>" and yields null.
  const tryCreate = async (label, name, config) => {
    try {
      const created = await kernel.proxies.create({ type: "residential", name, config });
      if (created.id) return created.id;
      createErrors.push(`${label}: Kernel did not return a proxy id`);
    } catch (err) {
      createErrors.push(`${label}: ${errorText2(err)}`);
    }
    return null;
  };

  try {
    const attemptIndex = options.attemptIndex ?? 0;
    if (attemptIndex >= 1) {
      const escalatedTarget = escalatedTargetLevel(target, attemptIndex);
      const escalatedId = await tryCreate(escalatedTarget.state, escalatedTarget.proxyName, escalatedTarget.config);
      if (escalatedId) {
        return resolution("location_created", proxyMode, escalatedId, escalatedTarget, null);
      }
      return configuredFallback(escalatedTarget, errorSummary());
    }

    const proxies = await kernel.proxies.list();

    // 1. ZIP-level proxy, when a ZIP is known for this target.
    const zip = knownZipFor(target, options.proxyZip);
    if (zip) {
      const targetZip = zipTarget(target, zip);
      const existingZip = findExistingTargetProxy(proxies, targetZip);
      if (existingZip?.id) {
        return resolution("location_reused", proxyMode, existingZip.id, targetZip, null);
      }
      // The create request carries country + zip only (no state).
      const zipProxyId = await tryCreate(zip, targetZip.proxyName, { country: targetZip.country, zip });
      if (zipProxyId) {
        return resolution("location_created", proxyMode, zipProxyId, targetZip, null);
      }
    }

    // 2. City-level proxy: reuse first, then create for each candidate spelling.
    const existing = findExistingProxy(proxies, target);
    if (existing?.id) {
      return resolution("location_reused", proxyMode, existing.id, target, errorSummary());
    }
    for (const city of target.cityCandidates) {
      const name = proxyName(target.country, target.state, city);
      const cityProxyId = await tryCreate(city, name, { country: target.country, state: target.state, city });
      if (cityProxyId) {
        const cityTarget = {
          ...target,
          level: "city",
          city,
          proxyName: name,
          config: { country: target.country, state: target.state, city }
        };
        return resolution("location_created", proxyMode, cityProxyId, cityTarget, null);
      }
    }

    // 3. State-level proxy: reuse first, then create.
    const fallbackTarget = stateTarget(target);
    const existingState = findExistingStateProxy(proxies, fallbackTarget);
    if (existingState?.id) {
      return resolution("location_reused", proxyMode, existingState.id, fallbackTarget, errorSummary());
    }
    const stateProxyId = await tryCreate(fallbackTarget.state, fallbackTarget.proxyName, fallbackTarget.config);
    if (stateProxyId) {
      return resolution("location_created", proxyMode, stateProxyId, fallbackTarget, errorSummary());
    }

    return configuredFallback(target, errorSummary());
  } catch (err) {
    // Anything unexpected (e.g. listing proxies failed) degrades to the configured proxy.
    return configuredFallback(target, errorText2(err));
  }
}
|
|
2496
|
+
|
|
1359
2497
|
// src/harvest.ts
|
|
1360
2498
|
// Upper bound on the number of attempts the harvest() retry loop makes before giving up.
var MAX_ATTEMPTS = 3;
|
|
1361
|
-
|
|
2499
|
+
/**
 * Chooses the error to surface for an aborted signal.
 * A DOMException named "TimeoutError" set as the signal's reason is passed
 * through unchanged; any other reason is replaced by a new RequestAbortedError.
 *
 * @param {AbortSignal} signal
 * @returns {DOMException | RequestAbortedError}
 */
function abortReason(signal) {
  const { reason } = signal;
  const isTimeout = reason instanceof DOMException && reason.name === "TimeoutError";
  return isTimeout ? reason : new RequestAbortedError();
}
|
|
2503
|
+
/**
 * Reads the optional `signal` property from raw harvest options.
 *
 * @param {unknown} rawOptions
 * @returns {AbortSignal | undefined} The signal when rawOptions is an object
 *   whose `signal` is a real AbortSignal; otherwise undefined.
 */
function getAbortSignal(rawOptions) {
  const isObject = Boolean(rawOptions) && typeof rawOptions === "object";
  if (!isObject) return void 0;
  const candidate = rawOptions.signal;
  return candidate instanceof AbortSignal ? candidate : void 0;
}
|
|
2509
|
+
/**
 * Reads the optional `onAttemptEvent` callback from raw harvest options.
 *
 * @param {unknown} rawOptions
 * @returns {Function | undefined} The callback when rawOptions is an object
 *   whose `onAttemptEvent` is a function; otherwise undefined.
 */
function getAttemptLogSink(rawOptions) {
  const isObject = Boolean(rawOptions) && typeof rawOptions === "object";
  if (!isObject) return void 0;
  const candidate = rawOptions.onAttemptEvent;
  if (typeof candidate !== "function") return void 0;
  return candidate;
}
|
|
2514
|
+
/**
 * Delivers an attempt event to the caller's sink, if there is one.
 * Sink failures (sync throw or rejected promise) are never propagated; they
 * are written to console.warn as a JSON "harvest_attempt_log_failed" record so
 * that logging problems cannot break a harvest.
 *
 * @param {Function | undefined} sink Callback receiving the event; may be async.
 * @param {{ attemptNumber: number }} event
 * @returns {Promise<void>}
 */
async function emitAttemptEvent(sink, event) {
  if (!sink) return;
  try {
    await sink(event);
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    const record = {
      event: "harvest_attempt_log_failed",
      attempt_number: event.attemptNumber,
      message
    };
    console.warn(JSON.stringify(record));
  }
}
|
|
2526
|
+
/**
 * Maps a failed attempt's error to an outcome label.
 *
 * Checked in order: CaptchaError -> "captcha"; RequestAbortedError ->
 * "request_aborted"; DOMException named "TimeoutError" or "AbortError" ->
 * "timeout"; any message mentioning a timeout/deadline -> "timeout";
 * everything else -> "error".
 *
 * @param {unknown} err
 * @returns {"captcha" | "request_aborted" | "timeout" | "error"}
 */
function classifyAttemptError(err) {
  if (err instanceof CaptchaError) return "captcha";
  if (err instanceof RequestAbortedError) return "request_aborted";
  if (err instanceof DOMException) {
    const abortLikeNames = ["TimeoutError", "AbortError"];
    if (abortLikeNames.includes(err.name)) return "timeout";
  }
  const text = err instanceof Error ? err.message : String(err);
  const timeoutPattern = /timeout|timed out|Timeout \d+ms exceeded|deadline/i;
  if (timeoutPattern.test(text)) return "timeout";
  return "error";
}
|
|
2533
|
+
/**
 * Maps a successful attempt's result to an outcome label.
 * Uses the result's own `diagnostics.completionStatus` when present; otherwise
 * infers "paa_found" / "no_paa" from whether any questions were collected.
 *
 * @param {{ diagnostics?: { completionStatus?: string }, totalQuestions: number }} result
 * @returns {string}
 */
function classifyAttemptResult(result) {
  const reported = result.diagnostics?.completionStatus;
  if (reported !== null && reported !== undefined) return reported;
  return result.totalQuestions > 0 ? "paa_found" : "no_paa";
}
|
|
2536
|
+
/**
 * Extracts displayable text from a thrown value: the message of an Error,
 * or the string form of anything else.
 *
 * @param {unknown} err
 * @returns {string}
 */
function errorMessage(err) {
  if (err instanceof Error) return err.message;
  return String(err);
}
|
|
2539
|
+
/**
 * Runs a single extraction attempt with a fresh BrowserDriver and always
 * closes that driver afterwards.
 *
 * Extraction errors are not thrown; they are returned in the `error` field so
 * the caller can log the cleanup report together with the failure.
 *
 * @param {object} options Parsed harvest options for this attempt.
 * @param {AbortSignal} [signal] When it aborts, the attempt settles with the
 *   abort reason (see abortReason) without waiting for extraction to finish.
 * @returns {Promise<{ result: object | null, error: unknown, cleanup: unknown, debug: object | null }>}
 *   `cleanup` is whatever `driver.close()` resolved to; `debug` is the result's
 *   debug snapshot, or a minimal snapshot built here when `options.debug` is set.
 */
async function extractOnce(options, signal) {
  const driver = new BrowserDriver();
  const reporter = new ProgressReporter();
  const extractor = new PAAExtractor(driver, reporter);

  // Already aborted: skip extraction, but still close the driver for a cleanup report.
  if (signal?.aborted) {
    return {
      result: null,
      error: abortReason(signal),
      cleanup: await driver.close(),
      debug: null
    };
  }

  let onAbort;
  const abortPromise = signal ? new Promise((_, reject) => {
    onAbort = () => reject(abortReason(signal));
    signal.addEventListener("abort", onAbort, { once: true });
  }) : null;

  let result = null;
  let error = null;
  let cleanup;
  let debug = null;
  try {
    const extraction = extractor.extract(options, signal);
    // If the abort wins the race, the extraction promise is abandoned; attach a
    // no-op handler so its eventual rejection is not reported as unhandled.
    if (abortPromise) extraction.catch(() => {
    });
    result = await (abortPromise ? Promise.race([extraction, abortPromise]) : extraction);
  } catch (err) {
    error = err;
  } finally {
    if (signal && onAbort) signal.removeEventListener("abort", onAbort);
    try {
      // `diagnostics` is read defensively: a result without it must not throw
      // here, because that would skip closing the driver below.
      debug = result?.diagnostics?.debug ?? (options.debug ? {
        enabled: true,
        request: {
          query: options.query,
          locationInput: options.location ?? null,
          canonicalLocation: null,
          uule: null,
          gl: options.gl,
          hl: options.hl,
          device: options.device,
          proxyMode: options.proxyMode,
          proxyZip: options.proxyZip ?? null,
          serpOnly: options.serpOnly,
          pages: options.pages ?? 1
        },
        browser: driver.getDebugSnapshot()
      } : null);
    } finally {
      // Always release the browser session, even if building the snapshot failed.
      cleanup = await driver.close();
    }
  }
  return error ? { result: null, error, cleanup, debug } : { result, error: null, cleanup, debug };
}
|
|
1371
2590
|
/**
 * Runs a harvest, retrying on CAPTCHA with a fresh session up to MAX_ATTEMPTS times.
 *
 * For every attempt a proxy is resolved, options are validated with
 * HarvestOptionsSchema, the extraction is run via extractOnce, and "started" /
 * "finished" events are sent to the optional `onAttemptEvent` sink. On success
 * the result is written in the requested format(s) and returned.
 *
 * Each attempt produces at most one "finished" event for a given failure: an
 * extraction error that was already reported with its real cleanup record is
 * not reported a second time when it is rethrown.
 *
 * @param {object} rawOptions Unvalidated harvest options. May additionally carry
 *   `signal` (AbortSignal) and `onAttemptEvent` (function).
 * @returns {Promise<object>} The harvest result of the first successful attempt.
 * @throws {CaptchaError} When every attempt ended in a CAPTCHA.
 * @throws Any non-CAPTCHA error from an attempt (including validation,
 *   abort/timeout and output-writing errors), rethrown unchanged.
 */
async function harvest(rawOptions) {
  const raw = typeof rawOptions === "object" && rawOptions !== null ? rawOptions : {};
  const signal = getAbortSignal(rawOptions);
  const onAttemptEvent = getAttemptLogSink(rawOptions);
  const requestedProxyMode = raw.proxyMode;
  // Anything other than "none" / "configured" selects location-based proxy resolution.
  const proxyMode = requestedProxyMode === "none" ? "none" : requestedProxyMode === "configured" ? "configured" : "location";
  const kernelApiKey = typeof raw.kernelApiKey === "string" ? raw.kernelApiKey.trim() : process.env.KERNEL_API_KEY?.trim();
  const configuredKernelProxyId = typeof raw.kernelProxyId === "string" ? raw.kernelProxyId.trim() : process.env.KERNEL_PROXY_ID?.trim();
  const proxyOpts = {
    kernelApiKey,
    proxyMode,
    configuredKernelProxyId,
    location: typeof raw.location === "string" ? raw.location : void 0,
    proxyZip: typeof raw.proxyZip === "string" ? raw.proxyZip : void 0,
    gl: typeof raw.gl === "string" ? raw.gl : "us"
  };
  const serializer = new OutputSerializer();

  // Cleanup record used when an attempt failed before any browser session existed.
  const noSessionCleanup = () => ({
    kernelSessionId: null,
    kernelDeleteStarted: false,
    kernelDeleteSucceeded: null,
    kernelDeleteError: null,
    browserCloseSucceeded: null,
    browserCloseError: null
  });

  for (let i = 0; i < MAX_ATTEMPTS; i++) {
    const attemptNumber = i + 1;
    const startedAtMs = Date.now();
    const canRetry = i < MAX_ATTEMPTS - 1;
    // Set just before rethrowing an error whose "finished" event was already sent.
    let alreadyReported = false;

    const emitFinished = ({ outcome, questionCount, error, willRetry, cleanup, debug }) => emitAttemptEvent(onAttemptEvent, {
      type: "finished",
      attemptNumber,
      maxAttempts: MAX_ATTEMPTS,
      outcome,
      kernelSessionId: cleanup.kernelSessionId,
      questionCount,
      durationMs: Date.now() - startedAtMs,
      error,
      willRetry,
      cleanup,
      debug,
      completedAt: new Date().toISOString()
    });
    const warnCaptcha = (message) => console.warn(JSON.stringify({
      event: "harvest_attempt_captcha",
      attempt_number: attemptNumber,
      max_attempts: MAX_ATTEMPTS,
      message,
      will_retry: canRetry
    }));

    try {
      if (signal?.aborted) throw abortReason(signal);
      const resolution2 = await resolveKernelProxyId({ ...proxyOpts, attemptIndex: i });
      const mergedAttempt = {
        ...raw,
        kernelApiKey,
        kernelProxyId: resolution2.kernelProxyId,
        kernelProxyResolution: resolution2.resolution,
        proxyMode
      };
      if (proxyMode === "none") mergedAttempt.kernelProxyId = void 0;
      const attemptOptions = HarvestOptionsSchema.parse(mergedAttempt);
      await emitAttemptEvent(onAttemptEvent, {
        type: "started",
        attemptNumber,
        maxAttempts: MAX_ATTEMPTS,
        query: attemptOptions.query,
        location: attemptOptions.location ?? null,
        maxQuestions: attemptOptions.maxQuestions,
        startedAt: new Date(startedAtMs).toISOString()
      });
      console.info(JSON.stringify({
        event: "harvest_attempt_started",
        attempt_number: attemptNumber,
        max_attempts: MAX_ATTEMPTS,
        query: attemptOptions.query,
        location: attemptOptions.location ?? null,
        max_questions: attemptOptions.maxQuestions
      }));

      const attempt = await extractOnce(attemptOptions, signal);
      if (attempt.error) {
        const err = attempt.error;
        if (err instanceof CaptchaError) {
          warnCaptcha(err.message);
          await emitFinished({
            outcome: "captcha",
            questionCount: 0,
            error: err.message,
            willRetry: canRetry,
            cleanup: attempt.cleanup,
            debug: attempt.debug
          });
          if (canRetry) continue;
          break;
        }
        await emitFinished({
          outcome: classifyAttemptError(err),
          questionCount: 0,
          error: errorMessage(err),
          willRetry: false,
          cleanup: attempt.cleanup,
          debug: attempt.debug
        });
        // The catch below must not emit a second "finished" event for this error.
        alreadyReported = true;
        throw err;
      }

      const result = attempt.result;
      if (!result) throw new Error("Harvest attempt completed without a result");
      await emitFinished({
        outcome: classifyAttemptResult(result),
        questionCount: result.totalQuestions,
        error: null,
        willRetry: false,
        cleanup: attempt.cleanup,
        debug: attempt.debug
      });

      const { format, outputDir } = attemptOptions;
      if (format === "json" || format === "both") {
        await serializer.writeJSON(result, outputDir);
      }
      if (format === "csv" || format === "both") {
        await Promise.all([
          serializer.writeCSV(result.flat, outputDir),
          result.videos.length > 0 ? serializer.writeVideoCSV(result.videos, result.seed, outputDir) : Promise.resolve(""),
          result.forums.length > 0 ? serializer.writeForumCSV(result.forums, result.seed, outputDir) : Promise.resolve(""),
          result.aiOverview.detected ? serializer.writeAIOverviewCSV(result.aiOverview.citations, result.aiOverview.text, result.seed, outputDir) : Promise.resolve(""),
          result.aiMode.detected ? serializer.writeAIModeCSV(result.aiMode.citations, result.aiMode.text, result.seed, outputDir) : Promise.resolve(""),
          result.whatPeopleSaying.length > 0 ? serializer.writeWhatPeopleSayingCSV(result.whatPeopleSaying, result.seed, outputDir) : Promise.resolve("")
        ]);
      }
      return result;
    } catch (err) {
      // Reached for failures outside extractOnce (abort check, proxy resolution,
      // option validation, output writing) and for errors rethrown above.
      if (err instanceof CaptchaError) {
        warnCaptcha(err.message);
        await emitFinished({
          outcome: "captcha",
          questionCount: 0,
          error: err.message,
          willRetry: canRetry,
          cleanup: noSessionCleanup(),
          debug: null
        });
        if (canRetry) continue;
        break;
      }
      if (!alreadyReported) {
        await emitFinished({
          outcome: classifyAttemptError(err),
          questionCount: 0,
          error: errorMessage(err),
          willRetry: false,
          cleanup: noSessionCleanup(),
          debug: null
        });
      }
      throw err;
    }
  }

  console.warn(JSON.stringify({
    event: "harvest_captcha_exhausted",
    max_attempts: MAX_ATTEMPTS,
    session_kind: kernelApiKey ? "kernel" : "local"
  }));
  throw new CaptchaError(sanitizeVendorName(`CAPTCHA on all ${MAX_ATTEMPTS} fresh sessions. Try again in a few minutes.`));
}
|
|
1407
2779
|
|
|
1408
2780
|
// src/cli.ts
|