mcp-scraper 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -0
- package/dist/bin/api-server.cjs +15730 -7780
- package/dist/bin/api-server.cjs.map +1 -1
- package/dist/bin/api-server.js +3 -3
- package/dist/bin/mcp-stdio-server.cjs +300 -110
- package/dist/bin/mcp-stdio-server.cjs.map +1 -1
- package/dist/bin/mcp-stdio-server.js +1 -1
- package/dist/bin/paa-harvest.cjs +1537 -165
- package/dist/bin/paa-harvest.cjs.map +1 -1
- package/dist/bin/paa-harvest.js +1 -1
- package/dist/{chunk-ZBP4RHNW.js → chunk-4743MZHT.js} +298 -106
- package/dist/chunk-4743MZHT.js.map +1 -0
- package/dist/{chunk-LXZDJJXR.js → chunk-D4CJBZBY.js} +426 -29
- package/dist/chunk-D4CJBZBY.js.map +1 -0
- package/dist/chunk-HERFK7W6.js +2781 -0
- package/dist/chunk-HERFK7W6.js.map +1 -0
- package/dist/chunk-Y74EXABN.js +295 -0
- package/dist/chunk-Y74EXABN.js.map +1 -0
- package/dist/{db-IOYMX64U.js → db-YWCNHBLH.js} +36 -4
- package/dist/index.cjs +1660 -237
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +169 -2
- package/dist/index.d.ts +169 -2
- package/dist/index.js +120 -69
- package/dist/index.js.map +1 -1
- package/dist/server-N7Q6H4OR.js +11612 -0
- package/dist/server-N7Q6H4OR.js.map +1 -0
- package/dist/{worker-3ECJHPRE.js → worker-D4D2YQTA.js} +44 -9
- package/dist/worker-D4D2YQTA.js.map +1 -0
- package/package.json +17 -5
- package/dist/chunk-4API3ZCT.js +0 -1387
- package/dist/chunk-4API3ZCT.js.map +0 -1
- package/dist/chunk-LXZDJJXR.js.map +0 -1
- package/dist/chunk-ZBP4RHNW.js.map +0 -1
- package/dist/server-63DR2HE5.js +0 -6062
- package/dist/server-63DR2HE5.js.map +0 -1
- package/dist/worker-3ECJHPRE.js.map +0 -1
- /package/dist/{db-IOYMX64U.js.map → db-YWCNHBLH.js.map} +0 -0
package/dist/index.cjs
CHANGED
|
@@ -43,6 +43,10 @@ var HarvestOptionsSchema = import_zod.z.object({
|
|
|
43
43
|
location: import_zod.z.string().optional(),
|
|
44
44
|
gl: import_zod.z.string().length(2).default("us"),
|
|
45
45
|
hl: import_zod.z.string().length(2).default("en"),
|
|
46
|
+
device: import_zod.z.enum(["desktop", "mobile"]).default("desktop"),
|
|
47
|
+
proxyMode: import_zod.z.enum(["location", "configured", "none"]).default("location"),
|
|
48
|
+
proxyZip: import_zod.z.string().regex(/^\d{5}$/).optional(),
|
|
49
|
+
debug: import_zod.z.boolean().default(false),
|
|
46
50
|
depth: import_zod.z.number().int().min(1).max(30).default(3),
|
|
47
51
|
maxQuestions: import_zod.z.number().int().min(1).max(1e3).default(100),
|
|
48
52
|
headless: import_zod.z.boolean().default(false),
|
|
@@ -50,6 +54,7 @@ var HarvestOptionsSchema = import_zod.z.object({
|
|
|
50
54
|
proxy: import_zod.z.string().url().optional(),
|
|
51
55
|
kernelApiKey: import_zod.z.string().optional(),
|
|
52
56
|
kernelProxyId: import_zod.z.string().optional(),
|
|
57
|
+
kernelProxyResolution: import_zod.z.unknown().optional(),
|
|
53
58
|
outputDir: import_zod.z.string().default("./paa-output"),
|
|
54
59
|
format: import_zod.z.enum(["json", "csv", "both"]).default("both"),
|
|
55
60
|
serpOnly: import_zod.z.boolean().default(false),
|
|
@@ -73,6 +78,45 @@ var RawPAAItemSchema = import_zod.z.object({
|
|
|
73
78
|
sourceSite: import_zod.z.string().optional(),
|
|
74
79
|
sourceCite: import_zod.z.string().optional()
|
|
75
80
|
});
|
|
81
|
+
var RawMapsOverviewSchema = import_zod.z.object({
|
|
82
|
+
name: import_zod.z.string().nullable(),
|
|
83
|
+
rating: import_zod.z.string().nullable(),
|
|
84
|
+
reviewCount: import_zod.z.string().nullable(),
|
|
85
|
+
category: import_zod.z.string().nullable(),
|
|
86
|
+
address: import_zod.z.string().nullable(),
|
|
87
|
+
hoursSummary: import_zod.z.string().nullable(),
|
|
88
|
+
phone: import_zod.z.string().nullable(),
|
|
89
|
+
phoneDisplay: import_zod.z.string().nullable(),
|
|
90
|
+
website: import_zod.z.string().nullable(),
|
|
91
|
+
plusCode: import_zod.z.string().nullable(),
|
|
92
|
+
bookingUrl: import_zod.z.string().nullable()
|
|
93
|
+
});
|
|
94
|
+
var RawMapsHoursRowSchema = import_zod.z.object({
|
|
95
|
+
day: import_zod.z.string(),
|
|
96
|
+
hours: import_zod.z.string()
|
|
97
|
+
});
|
|
98
|
+
var RawMapsReviewStatsSchema = import_zod.z.object({
|
|
99
|
+
reviewHistogram: import_zod.z.array(import_zod.z.object({
|
|
100
|
+
stars: import_zod.z.number(),
|
|
101
|
+
count: import_zod.z.string()
|
|
102
|
+
})),
|
|
103
|
+
reviewTopics: import_zod.z.array(import_zod.z.object({
|
|
104
|
+
label: import_zod.z.string(),
|
|
105
|
+
count: import_zod.z.string()
|
|
106
|
+
}))
|
|
107
|
+
});
|
|
108
|
+
var RawMapsReviewCardSchema = import_zod.z.object({
|
|
109
|
+
reviewId: import_zod.z.string(),
|
|
110
|
+
author: import_zod.z.string().nullable(),
|
|
111
|
+
stars: import_zod.z.string().nullable(),
|
|
112
|
+
date: import_zod.z.string().nullable(),
|
|
113
|
+
text: import_zod.z.string().nullable(),
|
|
114
|
+
ownerResponse: import_zod.z.string().nullable()
|
|
115
|
+
});
|
|
116
|
+
var RawMapsAboutAttributeSchema = import_zod.z.object({
|
|
117
|
+
section: import_zod.z.string(),
|
|
118
|
+
attribute: import_zod.z.string()
|
|
119
|
+
});
|
|
76
120
|
|
|
77
121
|
// src/driver/BrowserDriver.ts
|
|
78
122
|
var import_playwright_extra = require("playwright-extra");
|
|
@@ -88,7 +132,7 @@ var PAASelectors = {
|
|
|
88
132
|
itemDataQ: "data-q",
|
|
89
133
|
itemDataInitQ: "data-initq",
|
|
90
134
|
itemQuestionEl: ".JlqpRe",
|
|
91
|
-
answerContainer: ".bCOlv",
|
|
135
|
+
answerContainer: ".bCOlv, .hgKElc, .wDYxhc, .LGOjhe, .fo7IQd, .fmW3u",
|
|
92
136
|
sourceTitle: "h3",
|
|
93
137
|
sourceSite: ".VuuXrf",
|
|
94
138
|
sourceCite: "cite",
|
|
@@ -128,9 +172,16 @@ var WhatPeopleSayingSelectors = {
|
|
|
128
172
|
authorNote: ".nDgy9d"
|
|
129
173
|
};
|
|
130
174
|
var AIOverviewSelectors = {
|
|
131
|
-
root:
|
|
175
|
+
root: "[data-lhcontainer][data-streaming-container][eid]",
|
|
176
|
+
legacyRoot: '[data-hveid="CBMQAA"]',
|
|
132
177
|
wrapper: ".Fgyi2e",
|
|
133
|
-
|
|
178
|
+
controller: '[jscontroller="AkrxPe"]',
|
|
179
|
+
contentSubtree: '[data-subtree="mfc"]',
|
|
180
|
+
header: ".heWuVc",
|
|
181
|
+
heading: ".Fzsovc.cwYVJe.RJPOee",
|
|
182
|
+
showMoreButton: '[aria-label="Show more AI Overview"]',
|
|
183
|
+
sourcesPanel: ".OZ9ddf.WAUd4",
|
|
184
|
+
disclaimer: ".DuQANe.MSJHRb"
|
|
134
185
|
};
|
|
135
186
|
var AIModeSelectors = {
|
|
136
187
|
root: '[data-hveid="CAUQAA"]',
|
|
@@ -158,6 +209,9 @@ var LocalPackSelectors = {
|
|
|
158
209
|
|
|
159
210
|
// src/errors.ts
|
|
160
211
|
var RECAPTCHA_INSTRUCTIONS = "Google returned a CAPTCHA. Run with --headless=false to re-warm the browser profile, then retry.";
|
|
212
|
+
/**
 * Strip the browser vendor's name out of a message before it is surfaced.
 *
 * Replacement rules, applied in order:
 *   - "kernel.sh sessions" / "kernel sessions" -> "sessions"
 *   - "kernel.sh session"  / "kernel session"  -> "this session"
 *   - any remaining "kernel.sh" / "kernel"     -> "the service"
 * Runs of spaces are then collapsed and the result is trimmed.
 *
 * The plural and singular rules are anchored with `\b` so that the plural rule
 * no longer swallows the singular form (previously `sessions?` matched
 * "session" first, making the "this session" rules unreachable).
 *
 * @param {string} message - Raw message that may mention the vendor.
 * @returns {string} The message with vendor references replaced.
 */
function sanitizeVendorName(message) {
  return message
    .replace(/kernel\.sh\s+sessions\b/gi, "sessions")
    .replace(/kernel\.sh\s+session\b/gi, "this session")
    .replace(/kernel\.sh/gi, "the service")
    .replace(/\bkernel\s+sessions\b/gi, "sessions")
    .replace(/\bkernel\s+session\b/gi, "this session")
    .replace(/\bkernel\b/gi, "the service")
    .replace(/ +/g, " ")
    .trim();
}
|
|
161
215
|
var CaptchaError = class extends Error {
|
|
162
216
|
constructor(instructions) {
|
|
163
217
|
super(`CAPTCHA detected. ${instructions}`);
|
|
@@ -174,10 +228,55 @@ var ExtractionError = class extends Error {
|
|
|
174
228
|
cause;
|
|
175
229
|
name = "ExtractionError";
|
|
176
230
|
};
|
|
231
|
+
/**
 * Error type for a request that was aborted before the harvest finished.
 * Instances carry `name === "RequestAbortedError"` so callers can tell them
 * apart from other errors.
 */
var RequestAbortedError = class extends Error {
  /**
   * @param {string} [message] - Optional override for the default text.
   */
  constructor(message = "Request aborted before harvest completed") {
    super(message);
    this.name = "RequestAbortedError";
  }
};
|
|
177
237
|
|
|
178
238
|
// src/driver/BrowserDriver.ts
|
|
179
239
|
import_playwright_extra.chromium.use((0, import_puppeteer_extra_plugin_stealth.default)());
|
|
180
240
|
var DESKTOP_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36";
|
|
241
|
+
var MOBILE_USER_AGENT = "Mozilla/5.0 (iPhone; CPU iPhone OS 17_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Mobile/15E148 Safari/604.1";
|
|
242
|
+
var DEFAULT_KERNEL_BROWSER_TIMEOUT_SECONDS = 180;
|
|
243
|
+
var KERNEL_BROWSER_CLOSE_TIMEOUT_MS = 3e3;
|
|
244
|
+
var KERNEL_SESSION_DELETE_TIMEOUT_MS = 5e3;
|
|
245
|
+
/**
 * Read a strictly positive integer from an environment variable.
 *
 * @param {string} name - Environment variable to read.
 * @param {number} fallback - Value returned when the variable is unset,
 *   empty, not an integer, or not greater than zero.
 * @returns {number} The parsed integer, or `fallback`.
 */
function positiveIntFromEnv(name, fallback) {
  const text = process.env[name];
  if (!text) {
    return fallback;
  }
  const candidate = Number(text);
  if (!Number.isInteger(candidate) || candidate <= 0) {
    return fallback;
  }
  return candidate;
}
|
|
251
|
+
/**
 * Return only the trailing characters of a proxy id so that debug output does
 * not expose the whole identifier.
 *
 * @param {string|null|undefined} proxyId - Proxy identifier, if any.
 * @returns {string|null} The last six characters (or the whole id when it is
 *   shorter), or `null` when no id was given.
 */
function proxyIdSuffix(proxyId) {
  if (!proxyId) {
    return null;
  }
  return proxyId.slice(-6);
}
|
|
254
|
+
/**
 * Turn any thrown value into display text.
 *
 * @param {unknown} err - The caught value.
 * @returns {string} `err.message` for Error instances, otherwise `String(err)`.
 */
function errorText(err) {
  if (err instanceof Error) {
    return err.message;
  }
  return String(err);
}
|
|
257
|
+
/**
 * Build the browser-context options object from a launch config.
 *
 * `viewport`, `locale` and `userAgent` are always present. The user agent
 * falls back to the mobile or desktop constant depending on `config.isMobile`.
 * `deviceScaleFactor` is copied only when truthy; `isMobile` and `hasTouch`
 * are copied only when they are not `undefined`.
 *
 * @param {object} config - Launch configuration.
 * @returns {object} Options for creating the browser context.
 */
function rankCheckContextOptions(config) {
  const defaultAgent = () => (config.isMobile ? MOBILE_USER_AGENT : DESKTOP_USER_AGENT);
  const options = {
    viewport: config.viewport,
    locale: config.locale,
    userAgent: config.userAgent ?? defaultAgent()
  };
  if (config.deviceScaleFactor) {
    options.deviceScaleFactor = config.deviceScaleFactor;
  }
  if (config.isMobile !== undefined) {
    options.isMobile = config.isMobile;
  }
  if (config.hasTouch !== undefined) {
    options.hasTouch = config.hasTouch;
  }
  return options;
}
|
|
267
|
+
/**
 * Await `promise`, but reject if it has not settled within `timeoutMs`.
 *
 * The underlying operation is not cancelled on timeout; only the wait is
 * abandoned. The timer is always cleared once the race settles.
 *
 * @param {Promise<T>} promise - Operation to wait for.
 * @param {number} timeoutMs - Maximum wait in milliseconds.
 * @param {string} label - Prefix used in the timeout error message.
 * @returns {Promise<T>} The settled value of `promise`.
 * @throws {Error} `"<label> timed out after <timeoutMs>ms"` on timeout, or
 *   whatever `promise` rejects with.
 * @template T
 */
async function withTimeout(promise, timeoutMs, label) {
  let timer;
  const deadline = new Promise((_resolve, reject) => {
    timer = setTimeout(() => {
      reject(new Error(`${label} timed out after ${timeoutMs}ms`));
    }, timeoutMs);
  });
  try {
    return await Promise.race([promise, deadline]);
  } finally {
    if (timer) clearTimeout(timer);
  }
}
|
|
181
280
|
function buildYouTubeChannelVideosUrl(channelInput) {
|
|
182
281
|
const raw = channelInput.trim();
|
|
183
282
|
if (!raw) throw new Error("channelHandle is required");
|
|
@@ -211,30 +310,101 @@ var BrowserDriver = class {
|
|
|
211
310
|
page = null;
|
|
212
311
|
kernelClient = null;
|
|
213
312
|
kernelSessionId = null;
|
|
313
|
+
debugEnabled = false;
|
|
314
|
+
debugSnapshot = {
|
|
315
|
+
kernel: null,
|
|
316
|
+
context: null,
|
|
317
|
+
networkLocation: null,
|
|
318
|
+
serpNavigation: null
|
|
319
|
+
};
|
|
214
320
|
async launch(config) {
|
|
321
|
+
this.debugEnabled = config.debug === true;
|
|
322
|
+
const proxyMode = config.proxyMode ?? (config.kernelProxyId ? "configured" : "none");
|
|
323
|
+
const device = config.isMobile ? "mobile" : "desktop";
|
|
324
|
+
this.debugSnapshot = {
|
|
325
|
+
kernel: null,
|
|
326
|
+
context: {
|
|
327
|
+
viewport: config.viewport,
|
|
328
|
+
locale: config.locale,
|
|
329
|
+
device,
|
|
330
|
+
userAgent: config.userAgent ?? (config.isMobile ? MOBILE_USER_AGENT : DESKTOP_USER_AGENT),
|
|
331
|
+
deviceScaleFactor: config.deviceScaleFactor ?? null,
|
|
332
|
+
isMobile: config.isMobile === true,
|
|
333
|
+
hasTouch: config.hasTouch === true
|
|
334
|
+
},
|
|
335
|
+
networkLocation: null,
|
|
336
|
+
serpNavigation: null
|
|
337
|
+
};
|
|
215
338
|
if (config.kernelApiKey) {
|
|
216
339
|
this.kernelClient = new import_sdk.default({ apiKey: config.kernelApiKey });
|
|
340
|
+
const timeoutSeconds = positiveIntFromEnv("KERNEL_BROWSER_TIMEOUT_SECONDS", DEFAULT_KERNEL_BROWSER_TIMEOUT_SECONDS);
|
|
217
341
|
const kernelBrowser = await this.kernelClient.browsers.create({
|
|
218
342
|
stealth: true,
|
|
219
|
-
timeout_seconds:
|
|
343
|
+
timeout_seconds: timeoutSeconds,
|
|
220
344
|
...config.kernelProxyId ? { proxy_id: config.kernelProxyId } : {}
|
|
221
345
|
});
|
|
222
346
|
this.kernelSessionId = kernelBrowser.session_id;
|
|
347
|
+
let defaultProxyDisabled = null;
|
|
348
|
+
let defaultProxyDisableError = null;
|
|
349
|
+
if (proxyMode === "none") {
|
|
350
|
+
try {
|
|
351
|
+
await withTimeout(
|
|
352
|
+
this.kernelClient.browsers.update(this.kernelSessionId, { disable_default_proxy: true }),
|
|
353
|
+
5e3,
|
|
354
|
+
`Kernel session ${this.kernelSessionId} disable default proxy`
|
|
355
|
+
);
|
|
356
|
+
defaultProxyDisabled = true;
|
|
357
|
+
} catch (err) {
|
|
358
|
+
defaultProxyDisabled = false;
|
|
359
|
+
defaultProxyDisableError = errorText(err);
|
|
360
|
+
}
|
|
361
|
+
}
|
|
362
|
+
const kernelDebug = {
|
|
363
|
+
sessionId: this.kernelSessionId,
|
|
364
|
+
proxyMode,
|
|
365
|
+
requestedProxyIdPresent: Boolean(config.kernelProxyId),
|
|
366
|
+
requestedProxyIdSuffix: proxyIdSuffix(config.kernelProxyId),
|
|
367
|
+
createdProxyIdPresent: typeof kernelBrowser.proxy_id === "string" ? Boolean(kernelBrowser.proxy_id) : null,
|
|
368
|
+
createdProxyIdSuffix: proxyIdSuffix(kernelBrowser.proxy_id),
|
|
369
|
+
retrievedProxyIdPresent: null,
|
|
370
|
+
retrievedProxyIdSuffix: null,
|
|
371
|
+
retrievedProxyIdMatchesRequested: null,
|
|
372
|
+
defaultProxyDisabled,
|
|
373
|
+
defaultProxyDisableError,
|
|
374
|
+
proxyResolution: config.kernelProxyResolution ?? null,
|
|
375
|
+
timeoutSeconds,
|
|
376
|
+
stealth: typeof kernelBrowser.stealth === "boolean" ? kernelBrowser.stealth : null,
|
|
377
|
+
profilePresent: null,
|
|
378
|
+
poolPresent: null,
|
|
379
|
+
retrieveError: null
|
|
380
|
+
};
|
|
381
|
+
this.debugSnapshot.kernel = kernelDebug;
|
|
382
|
+
console.info(JSON.stringify({
|
|
383
|
+
event: "kernel_browser_created",
|
|
384
|
+
kernel_session_id: this.kernelSessionId,
|
|
385
|
+
timeout_seconds: timeoutSeconds,
|
|
386
|
+
proxy_mode: proxyMode,
|
|
387
|
+
proxy_id_present: Boolean(config.kernelProxyId),
|
|
388
|
+
proxy_resolution_source: config.kernelProxyResolution?.source
|
|
389
|
+
}));
|
|
390
|
+
if (this.debugEnabled) {
|
|
391
|
+
await this.populateKernelRetrieveDebug(kernelDebug, config.kernelProxyId);
|
|
392
|
+
}
|
|
223
393
|
this.browser = await import_playwright.chromium.connectOverCDP(kernelBrowser.cdp_ws_url);
|
|
224
|
-
this.context =
|
|
394
|
+
this.context = await this.browser.newContext(rankCheckContextOptions(config));
|
|
225
395
|
await this.installEsbuildHelperShims(this.context);
|
|
226
|
-
this.page =
|
|
396
|
+
this.page = await this.context.newPage();
|
|
397
|
+
await this.page.setViewportSize(config.viewport);
|
|
398
|
+
if (this.debugEnabled) {
|
|
399
|
+
this.debugSnapshot.networkLocation = await this.captureBrowserNetworkLocation();
|
|
400
|
+
}
|
|
227
401
|
return;
|
|
228
402
|
}
|
|
229
403
|
const launchOpts = {
|
|
230
404
|
headless: config.headless,
|
|
231
405
|
proxy: config.proxy ? { server: config.proxy } : void 0
|
|
232
406
|
};
|
|
233
|
-
const ctxOpts =
|
|
234
|
-
viewport: config.viewport,
|
|
235
|
-
locale: config.locale,
|
|
236
|
-
userAgent: DESKTOP_USER_AGENT
|
|
237
|
-
};
|
|
407
|
+
const ctxOpts = rankCheckContextOptions(config);
|
|
238
408
|
if (config.profileDir) {
|
|
239
409
|
this.context = await import_playwright_extra.chromium.launchPersistentContext(config.profileDir, {
|
|
240
410
|
...launchOpts,
|
|
@@ -248,6 +418,107 @@ var BrowserDriver = class {
|
|
|
248
418
|
await this.installEsbuildHelperShims(this.context);
|
|
249
419
|
this.page = await this.context.newPage();
|
|
250
420
|
}
|
|
421
|
+
if (this.debugEnabled) {
|
|
422
|
+
this.debugSnapshot.networkLocation = await this.captureBrowserNetworkLocation();
|
|
423
|
+
}
|
|
424
|
+
}
|
|
425
|
+
async populateKernelRetrieveDebug(kernelDebug, requestedProxyId) {
|
|
426
|
+
if (!this.kernelClient || !this.kernelSessionId) return;
|
|
427
|
+
try {
|
|
428
|
+
const retrieved = await withTimeout(
|
|
429
|
+
this.kernelClient.browsers.retrieve(this.kernelSessionId),
|
|
430
|
+
5e3,
|
|
431
|
+
`Kernel session ${this.kernelSessionId} retrieve`
|
|
432
|
+
);
|
|
433
|
+
kernelDebug.retrievedProxyIdPresent = typeof retrieved.proxy_id === "string" ? Boolean(retrieved.proxy_id) : false;
|
|
434
|
+
kernelDebug.retrievedProxyIdSuffix = proxyIdSuffix(retrieved.proxy_id);
|
|
435
|
+
kernelDebug.retrievedProxyIdMatchesRequested = requestedProxyId ? retrieved.proxy_id === requestedProxyId : !retrieved.proxy_id;
|
|
436
|
+
kernelDebug.timeoutSeconds = typeof retrieved.timeout_seconds === "number" ? retrieved.timeout_seconds : kernelDebug.timeoutSeconds;
|
|
437
|
+
kernelDebug.stealth = typeof retrieved.stealth === "boolean" ? retrieved.stealth : kernelDebug.stealth;
|
|
438
|
+
kernelDebug.profilePresent = Boolean(retrieved.profile);
|
|
439
|
+
kernelDebug.poolPresent = Boolean(retrieved.pool);
|
|
440
|
+
} catch (err) {
|
|
441
|
+
kernelDebug.retrieveError = errorText(err);
|
|
442
|
+
}
|
|
443
|
+
}
|
|
444
|
+
async captureBrowserNetworkLocation() {
|
|
445
|
+
const fallback = (message, source = "ipapi.co") => ({
|
|
446
|
+
source,
|
|
447
|
+
ip: null,
|
|
448
|
+
city: null,
|
|
449
|
+
region: null,
|
|
450
|
+
country: null,
|
|
451
|
+
org: null,
|
|
452
|
+
timezone: null,
|
|
453
|
+
error: message
|
|
454
|
+
});
|
|
455
|
+
if (!this.context) return fallback("browser context is not available");
|
|
456
|
+
let debugPage = null;
|
|
457
|
+
try {
|
|
458
|
+
debugPage = await this.context.newPage();
|
|
459
|
+
const ipwho = await this.loadJsonInDebugPage(debugPage, "https://ipwho.is/");
|
|
460
|
+
if (ipwho) {
|
|
461
|
+
const connection = typeof ipwho.connection === "object" && ipwho.connection !== null ? ipwho.connection : {};
|
|
462
|
+
return {
|
|
463
|
+
source: "ipwho.is",
|
|
464
|
+
ip: typeof ipwho.ip === "string" ? ipwho.ip : null,
|
|
465
|
+
city: typeof ipwho.city === "string" ? ipwho.city : null,
|
|
466
|
+
region: typeof ipwho.region === "string" ? ipwho.region : null,
|
|
467
|
+
country: typeof ipwho.country === "string" ? ipwho.country : null,
|
|
468
|
+
org: typeof connection.org === "string" ? connection.org : null,
|
|
469
|
+
timezone: typeof ipwho.timezone === "object" && ipwho.timezone !== null && typeof ipwho.timezone.id === "string" ? ipwho.timezone.id : null,
|
|
470
|
+
error: null
|
|
471
|
+
};
|
|
472
|
+
}
|
|
473
|
+
const ipify = await this.loadJsonInDebugPage(debugPage, "https://api64.ipify.org?format=json");
|
|
474
|
+
if (ipify) {
|
|
475
|
+
return {
|
|
476
|
+
source: "api64.ipify.org",
|
|
477
|
+
ip: typeof ipify.ip === "string" ? ipify.ip : null,
|
|
478
|
+
city: null,
|
|
479
|
+
region: null,
|
|
480
|
+
country: null,
|
|
481
|
+
org: null,
|
|
482
|
+
timezone: null,
|
|
483
|
+
error: null
|
|
484
|
+
};
|
|
485
|
+
}
|
|
486
|
+
await withTimeout(
|
|
487
|
+
debugPage.goto("https://ipapi.co/json/", { waitUntil: "domcontentloaded", timeout: 7e3 }),
|
|
488
|
+
8e3,
|
|
489
|
+
"browser network location navigation"
|
|
490
|
+
);
|
|
491
|
+
const body = await debugPage.locator("body").innerText({ timeout: 2e3 });
|
|
492
|
+
const data = JSON.parse(body);
|
|
493
|
+
return {
|
|
494
|
+
source: "ipapi.co",
|
|
495
|
+
ip: typeof data.ip === "string" ? data.ip : null,
|
|
496
|
+
city: typeof data.city === "string" ? data.city : null,
|
|
497
|
+
region: typeof data.region === "string" ? data.region : null,
|
|
498
|
+
country: typeof data.country_name === "string" ? data.country_name : typeof data.country === "string" ? data.country : null,
|
|
499
|
+
org: typeof data.org === "string" ? data.org : null,
|
|
500
|
+
timezone: typeof data.timezone === "string" ? data.timezone : null,
|
|
501
|
+
error: null
|
|
502
|
+
};
|
|
503
|
+
} catch (err) {
|
|
504
|
+
return fallback(errorText(err));
|
|
505
|
+
} finally {
|
|
506
|
+
await debugPage?.close().catch(() => {
|
|
507
|
+
});
|
|
508
|
+
}
|
|
509
|
+
}
|
|
510
|
+
async loadJsonInDebugPage(debugPage, url) {
|
|
511
|
+
try {
|
|
512
|
+
await withTimeout(
|
|
513
|
+
debugPage.goto(url, { waitUntil: "domcontentloaded", timeout: 7e3 }),
|
|
514
|
+
8e3,
|
|
515
|
+
`browser network location navigation ${url}`
|
|
516
|
+
);
|
|
517
|
+
const body = await debugPage.locator("body").innerText({ timeout: 2e3 });
|
|
518
|
+
return JSON.parse(body);
|
|
519
|
+
} catch {
|
|
520
|
+
return null;
|
|
521
|
+
}
|
|
251
522
|
}
|
|
252
523
|
async installEsbuildHelperShims(context) {
|
|
253
524
|
await context.addInitScript(() => {
|
|
@@ -259,42 +530,79 @@ var BrowserDriver = class {
|
|
|
259
530
|
};
|
|
260
531
|
});
|
|
261
532
|
}
|
|
262
|
-
async navigateToSERP(query, uule, gl, hl) {
|
|
263
|
-
const params = new URLSearchParams({ q: query, gl, hl });
|
|
533
|
+
async navigateToSERP(query, uule, gl, hl, options) {
|
|
534
|
+
const params = new URLSearchParams({ q: query, gl, hl, pws: "0" });
|
|
535
|
+
if (options?.num) params.set("num", String(options.num));
|
|
264
536
|
if (uule) params.set("uule", uule);
|
|
265
537
|
const url = "https://www.google.com/search?" + params.toString();
|
|
538
|
+
const navDebug = options?.debug ? {
|
|
539
|
+
requestedUrl: url,
|
|
540
|
+
finalUrl: null,
|
|
541
|
+
title: null,
|
|
542
|
+
bodySnippet: null,
|
|
543
|
+
hasPaa: null,
|
|
544
|
+
captchaDetected: null,
|
|
545
|
+
googleSorryUrl: null,
|
|
546
|
+
redirected: null
|
|
547
|
+
} : null;
|
|
548
|
+
if (navDebug) this.debugSnapshot.serpNavigation = navDebug;
|
|
266
549
|
try {
|
|
267
550
|
await this.page.goto(url, { waitUntil: "domcontentloaded", timeout: 45e3 });
|
|
268
551
|
} catch (err) {
|
|
552
|
+
await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: null, captchaDetected: null });
|
|
269
553
|
const diag = await this.captureDiagnostics(url);
|
|
270
554
|
throw new ExtractionError(`page.goto failed: ${err.message} | ${diag}`);
|
|
271
555
|
}
|
|
272
556
|
const captchaCount = await this.page.locator(PAASelectors.captchaMarker).count();
|
|
273
557
|
if (captchaCount > 0) {
|
|
274
|
-
|
|
275
|
-
try {
|
|
276
|
-
await this.page.waitForSelector(PAASelectors.container, { timeout: 45e3 });
|
|
277
|
-
return { hasPaa: true };
|
|
278
|
-
} catch {
|
|
279
|
-
throw new CaptchaError(this.captchaMessage());
|
|
280
|
-
}
|
|
281
|
-
}
|
|
558
|
+
await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: false, captchaDetected: true });
|
|
282
559
|
throw new CaptchaError(this.captchaMessage());
|
|
283
560
|
}
|
|
284
561
|
const fastFound = await this.page.waitForSelector(PAASelectors.item, { timeout: 4e3 }).catch(() => null);
|
|
285
|
-
if (fastFound)
|
|
562
|
+
if (fastFound) {
|
|
563
|
+
await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: true, captchaDetected: false });
|
|
564
|
+
return { hasPaa: true };
|
|
565
|
+
}
|
|
286
566
|
const captchaAfter = await this.page.locator(PAASelectors.captchaMarker).count();
|
|
287
|
-
if (captchaAfter > 0)
|
|
567
|
+
if (captchaAfter > 0) {
|
|
568
|
+
await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: false, captchaDetected: true });
|
|
569
|
+
throw new CaptchaError(this.captchaMessage());
|
|
570
|
+
}
|
|
288
571
|
for (let i = 1; i <= 6; i++) {
|
|
289
572
|
await this.page.evaluate((f) => {
|
|
290
573
|
window.scrollTo(0, document.body.scrollHeight * f);
|
|
291
574
|
}, i / 6);
|
|
292
575
|
await this.page.waitForTimeout(600);
|
|
293
576
|
const count = await this.page.locator(PAASelectors.item).count();
|
|
294
|
-
if (count > 0)
|
|
577
|
+
if (count > 0) {
|
|
578
|
+
await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: true, captchaDetected: false });
|
|
579
|
+
return { hasPaa: true };
|
|
580
|
+
}
|
|
295
581
|
}
|
|
582
|
+
await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: false, captchaDetected: false });
|
|
296
583
|
return { hasPaa: false };
|
|
297
584
|
}
|
|
585
|
+
async updateSerpNavigationDebug(navDebug, requestedUrl, state) {
|
|
586
|
+
if (!navDebug || !this.page) return;
|
|
587
|
+
try {
|
|
588
|
+
const finalUrl = this.page.url();
|
|
589
|
+
const title = await this.page.title().catch(() => "");
|
|
590
|
+
const bodySnippet = await this.page.evaluate(() => {
|
|
591
|
+
const text = (document.body?.innerText ?? "").replace(/\s+/g, " ").trim();
|
|
592
|
+
return text.slice(0, 500);
|
|
593
|
+
}).catch(() => "");
|
|
594
|
+
const textCaptcha = /recaptcha|unusual traffic|are you a robot/i.test(bodySnippet);
|
|
595
|
+
navDebug.finalUrl = finalUrl;
|
|
596
|
+
navDebug.title = title;
|
|
597
|
+
navDebug.bodySnippet = bodySnippet;
|
|
598
|
+
navDebug.hasPaa = state.hasPaa;
|
|
599
|
+
navDebug.captchaDetected = state.captchaDetected ?? textCaptcha;
|
|
600
|
+
navDebug.googleSorryUrl = /google\.[^/]+\/sorry\//i.test(finalUrl);
|
|
601
|
+
navDebug.redirected = finalUrl !== requestedUrl;
|
|
602
|
+
} catch (err) {
|
|
603
|
+
navDebug.bodySnippet = `debug capture failed: ${errorText(err)}`;
|
|
604
|
+
}
|
|
605
|
+
}
|
|
298
606
|
async captureDiagnostics(intendedUrl) {
|
|
299
607
|
try {
|
|
300
608
|
const finalUrl = this.page.url();
|
|
@@ -316,7 +624,7 @@ var BrowserDriver = class {
|
|
|
316
624
|
}
|
|
317
625
|
}
|
|
318
626
|
captchaMessage() {
|
|
319
|
-
return this.kernelClient ? "Google returned a CAPTCHA on this
|
|
627
|
+
return this.kernelClient ? "Google returned a CAPTCHA on this session \u2014 retrying with a fresh session." : RECAPTCHA_INSTRUCTIONS;
|
|
320
628
|
}
|
|
321
629
|
async navigateTo(url) {
|
|
322
630
|
try {
|
|
@@ -341,6 +649,12 @@ var BrowserDriver = class {
|
|
|
341
649
|
getPage() {
|
|
342
650
|
return this.page;
|
|
343
651
|
}
|
|
652
|
+
/**
 * Accessor for the remote browser session id held by this driver.
 *
 * @returns {string|null} The current `kernelSessionId` field; it is `null`
 *   until `launch` stores a session id and is reset to `null` by `close`.
 */
getKernelSessionId() {
|
|
653
|
+
return this.kernelSessionId;
|
|
654
|
+
}
|
|
655
|
+
/**
 * Accessor for the debug snapshot collected by this driver.
 *
 * @returns {object} The live `debugSnapshot` object (not a copy), holding the
 *   `kernel`, `context`, `networkLocation` and `serpNavigation` entries.
 */
getDebugSnapshot() {
|
|
656
|
+
return this.debugSnapshot;
|
|
657
|
+
}
|
|
344
658
|
async close() {
|
|
345
659
|
if (this.browser) {
|
|
346
660
|
const b = this.browser;
|
|
@@ -351,21 +665,84 @@ var BrowserDriver = class {
|
|
|
351
665
|
this.page = null;
|
|
352
666
|
this.kernelSessionId = null;
|
|
353
667
|
this.kernelClient = null;
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
)
|
|
668
|
+
if (client && sessionId) {
|
|
669
|
+
console.info(JSON.stringify({
|
|
670
|
+
event: "kernel_browser_delete_started",
|
|
671
|
+
kernel_session_id: sessionId
|
|
672
|
+
}));
|
|
673
|
+
const deleteSession = withTimeout(
|
|
674
|
+
client.browsers.deleteByID(sessionId),
|
|
675
|
+
KERNEL_SESSION_DELETE_TIMEOUT_MS,
|
|
676
|
+
`Kernel session ${sessionId} delete`
|
|
677
|
+
);
|
|
678
|
+
const closeBrowser = withTimeout(
|
|
679
|
+
b.close(),
|
|
680
|
+
KERNEL_BROWSER_CLOSE_TIMEOUT_MS,
|
|
681
|
+
`Kernel browser ${sessionId} close`
|
|
682
|
+
);
|
|
683
|
+
const [deleteResult, closeResult] = await Promise.allSettled([deleteSession, closeBrowser]);
|
|
684
|
+
const result = {
|
|
685
|
+
kernelSessionId: sessionId,
|
|
686
|
+
kernelDeleteStarted: true,
|
|
687
|
+
kernelDeleteSucceeded: deleteResult.status === "fulfilled",
|
|
688
|
+
kernelDeleteError: deleteResult.status === "rejected" ? deleteResult.reason instanceof Error ? deleteResult.reason.message : String(deleteResult.reason) : null,
|
|
689
|
+
browserCloseSucceeded: closeResult.status === "fulfilled",
|
|
690
|
+
browserCloseError: closeResult.status === "rejected" ? closeResult.reason instanceof Error ? closeResult.reason.message : String(closeResult.reason) : null
|
|
691
|
+
};
|
|
692
|
+
if (deleteResult.status === "rejected") {
|
|
693
|
+
console.warn(JSON.stringify({
|
|
694
|
+
event: "kernel_browser_delete_failed",
|
|
695
|
+
kernel_session_id: sessionId,
|
|
696
|
+
message: result.kernelDeleteError
|
|
697
|
+
}));
|
|
698
|
+
console.warn(`Kernel session cleanup failed for ${sessionId}:`, deleteResult.reason);
|
|
699
|
+
} else {
|
|
700
|
+
console.info(JSON.stringify({
|
|
701
|
+
event: "kernel_browser_delete_succeeded",
|
|
702
|
+
kernel_session_id: sessionId
|
|
703
|
+
}));
|
|
361
704
|
}
|
|
705
|
+
if (closeResult.status === "rejected") {
|
|
706
|
+
console.warn(JSON.stringify({
|
|
707
|
+
event: "kernel_browser_close_failed",
|
|
708
|
+
kernel_session_id: sessionId,
|
|
709
|
+
message: result.browserCloseError
|
|
710
|
+
}));
|
|
711
|
+
console.warn(`Kernel browser close failed for ${sessionId}:`, closeResult.reason);
|
|
712
|
+
}
|
|
713
|
+
return result;
|
|
362
714
|
}
|
|
715
|
+
await b.close();
|
|
716
|
+
return {
|
|
717
|
+
kernelSessionId: null,
|
|
718
|
+
kernelDeleteStarted: false,
|
|
719
|
+
kernelDeleteSucceeded: null,
|
|
720
|
+
kernelDeleteError: null,
|
|
721
|
+
browserCloseSucceeded: true,
|
|
722
|
+
browserCloseError: null
|
|
723
|
+
};
|
|
363
724
|
} else if (this.context) {
|
|
364
725
|
const ctx = this.context;
|
|
365
726
|
this.context = null;
|
|
366
727
|
this.page = null;
|
|
367
728
|
await ctx.close();
|
|
729
|
+
return {
|
|
730
|
+
kernelSessionId: null,
|
|
731
|
+
kernelDeleteStarted: false,
|
|
732
|
+
kernelDeleteSucceeded: null,
|
|
733
|
+
kernelDeleteError: null,
|
|
734
|
+
browserCloseSucceeded: true,
|
|
735
|
+
browserCloseError: null
|
|
736
|
+
};
|
|
368
737
|
}
|
|
738
|
+
return {
|
|
739
|
+
kernelSessionId: null,
|
|
740
|
+
kernelDeleteStarted: false,
|
|
741
|
+
kernelDeleteSucceeded: null,
|
|
742
|
+
kernelDeleteError: null,
|
|
743
|
+
browserCloseSucceeded: null,
|
|
744
|
+
browserCloseError: null
|
|
745
|
+
};
|
|
369
746
|
}
|
|
370
747
|
};
|
|
371
748
|
|
|
@@ -436,13 +813,157 @@ var LOCATIONS = {
|
|
|
436
813
|
};
|
|
437
814
|
|
|
438
815
|
// src/uule.ts
|
|
816
|
+
/**
 * Encode a non-negative integer as a base-128 varint (least-significant
 * group first, high bit set on every byte except the last).
 *
 * Uses arithmetic rather than bitwise operators: `&` and `>>>` coerce their
 * operands to 32 bits, which silently mis-encoded values of 2^32 and above.
 *
 * @param {number} value - Non-negative safe integer to encode.
 * @returns {number[]} The encoded bytes, each in the range 0-255.
 * @throws {RangeError} If `value` is negative or not a safe integer.
 */
function encodeVarint(value) {
  if (!Number.isSafeInteger(value) || value < 0) {
    throw new RangeError(`encodeVarint expects a non-negative safe integer, got ${value}`);
  }
  const bytes = [];
  let remaining = value;
  do {
    let byte = remaining % 128;
    remaining = Math.floor(remaining / 128);
    // Continuation bit: more 7-bit groups follow.
    if (remaining > 0) byte |= 128;
    bytes.push(byte);
  } while (remaining > 0);
  return bytes;
}
|
|
439
827
|
function encodeUule(name) {
|
|
440
|
-
const
|
|
441
|
-
|
|
828
|
+
const locationBytes = Buffer.from(name, "utf8");
|
|
829
|
+
const payload = Buffer.concat([
|
|
830
|
+
Buffer.from([8, 2, 16, 32, 34]),
|
|
831
|
+
Buffer.from(encodeVarint(locationBytes.length)),
|
|
832
|
+
locationBytes
|
|
833
|
+
]);
|
|
834
|
+
return `w+${payload.toString("base64")}`;
|
|
442
835
|
}
|
|
443
836
|
function normalizeLocation(input) {
|
|
444
|
-
const
|
|
445
|
-
|
|
837
|
+
const raw = input.toLowerCase().trim();
|
|
838
|
+
if (LOCATIONS[raw]) return LOCATIONS[raw];
|
|
839
|
+
const beforeComma = raw.split(",")[0].trim();
|
|
840
|
+
if (beforeComma !== raw && LOCATIONS[beforeComma]) return LOCATIONS[beforeComma];
|
|
841
|
+
const withoutState = raw.replace(/\s+[a-z]{2}$/, "").trim();
|
|
842
|
+
if (withoutState !== raw && LOCATIONS[withoutState]) return LOCATIONS[withoutState];
|
|
843
|
+
return input;
|
|
844
|
+
}
|
|
845
|
+
|
|
846
|
+
// src/serp-location-debug.ts
|
|
847
|
+
// Lowercase US state name (plus the District of Columbia) -> two-letter code.
var STATE_TO_CODE = Object.fromEntries([
  ["alabama", "AL"],
  ["alaska", "AK"],
  ["arizona", "AZ"],
  ["arkansas", "AR"],
  ["california", "CA"],
  ["colorado", "CO"],
  ["connecticut", "CT"],
  ["delaware", "DE"],
  ["florida", "FL"],
  ["georgia", "GA"],
  ["hawaii", "HI"],
  ["idaho", "ID"],
  ["illinois", "IL"],
  ["indiana", "IN"],
  ["iowa", "IA"],
  ["kansas", "KS"],
  ["kentucky", "KY"],
  ["louisiana", "LA"],
  ["maine", "ME"],
  ["maryland", "MD"],
  ["massachusetts", "MA"],
  ["michigan", "MI"],
  ["minnesota", "MN"],
  ["mississippi", "MS"],
  ["missouri", "MO"],
  ["montana", "MT"],
  ["nebraska", "NE"],
  ["nevada", "NV"],
  ["new hampshire", "NH"],
  ["new jersey", "NJ"],
  ["new mexico", "NM"],
  ["new york", "NY"],
  ["north carolina", "NC"],
  ["north dakota", "ND"],
  ["ohio", "OH"],
  ["oklahoma", "OK"],
  ["oregon", "OR"],
  ["pennsylvania", "PA"],
  ["rhode island", "RI"],
  ["south carolina", "SC"],
  ["south dakota", "SD"],
  ["tennessee", "TN"],
  ["texas", "TX"],
  ["utah", "UT"],
  ["vermont", "VT"],
  ["virginia", "VA"],
  ["washington", "WA"],
  ["west virginia", "WV"],
  ["wisconsin", "WI"],
  ["wyoming", "WY"],
  ["district of columbia", "DC"]
]);

// Regex alternation source: every state name (inner whitespace relaxed to
// `\s+`) followed by every two-letter code.
var STATE_PATTERN = Object.keys(STATE_TO_CODE)
  .map((stateName) => stateName.split(/\s+/).join("\\s+"))
  .concat(Object.values(STATE_TO_CODE))
  .join("|");

// Captures "<one to five words>[,] <state name or code>". The `i` flag makes
// both the word pattern and the state alternation case-insensitive.
var CITY_STATE_RE = new RegExp(
  `\\b([A-Z][A-Za-z]+(?:[\\s.-][A-Z][A-Za-z]+){0,4}),?\\s+(${STATE_PATTERN})\\b`,
  "gi"
);
|
|
905
|
+
/**
 * Normalizes a region string to an uppercase two-letter code.
 *
 * @param {string | null | undefined} input - A two-letter code or a state name
 *   listed in `STATE_TO_CODE` (any letter case, surrounding whitespace ignored).
 * @returns {string | null} The uppercase code, or `null` when the input is
 *   empty or is not a known state name.
 */
function normalizeRegionCode(input) {
  if (!input) return null;
  const trimmed = input.trim();
  // Any two-letter token is accepted as-is; it is not validated against the table.
  if (/^[A-Z]{2}$/i.test(trimmed)) return trimmed.toUpperCase();
  const key = trimmed.toLowerCase();
  // Own-property check: a bare `STATE_TO_CODE[key]` also resolves inherited
  // keys such as "constructor" and would return a non-string value.
  return Object.prototype.hasOwnProperty.call(STATE_TO_CODE, key) ? STATE_TO_CODE[key] : null;
}
|
|
911
|
+
/**
 * Produces a display-ready city name from a loosely formatted phrase.
 *
 * Whitespace runs are collapsed, everything up to and including the last
 * "in" / "near" / "around" / "serving" lead-in word is dropped, and the
 * remainder is lowercased with the first letter of each word capitalized.
 *
 * @param {string} input - Raw city phrase.
 * @returns {string} The title-cased city name (may be empty).
 */
function normalizeCity(input) {
  const collapsed = input.replace(/\s+/g, " ").trim();
  const withoutLeadIn = collapsed.replace(/^.*\b(?:in|near|around|serving)\s+/i, "");
  const lowered = withoutLeadIn.toLowerCase();
  return lowered.replace(/\b[a-z]/g, (letter) => letter.toUpperCase());
}
|
|
915
|
+
/**
 * Splits a canonical "City,Region,..." location into the expected city and
 * region code used for comparison.
 *
 * @param {string | null | undefined} canonicalLocation - Comma-separated location.
 * @returns {{ city: string, regionCode: (string | null), canonicalLocation: string } | null}
 *   `null` when no location was supplied.
 */
function parseExpected(canonicalLocation) {
  if (!canonicalLocation) return null;
  const segments = canonicalLocation.split(",").map((segment) => segment.trim());
  // Only the first two segments matter; missing ones fall back to "".
  const cityPart = segments[0] ?? "";
  const regionPart = segments[1] ?? "";
  const city = normalizeCity(cityPart);
  const regionCode = normalizeRegionCode(regionPart);
  return { city, regionCode, canonicalLocation };
}
|
|
924
|
+
/**
 * Records one city/region sighting in the candidate map.
 *
 * Sightings are keyed by lowercase city plus region code. A repeat sighting
 * bumps `count` and keeps up to three distinct example strings.
 *
 * @param {Map<string, object>} candidates - Accumulator, mutated in place.
 * @param {string} city - Raw city text.
 * @param {string} region - Raw region text (state name or code).
 * @param {string} example - Text excerpt the sighting came from.
 * @returns {void}
 */
function addCandidate(candidates, city, region, example) {
  const cityName = normalizeCity(city);
  const stateCode = normalizeRegionCode(region);
  // Both parts are required; otherwise the sighting is ignored.
  if (!(cityName && stateCode)) return;

  const mapKey = `${cityName.toLowerCase()}|${stateCode}`;
  const entry = candidates.get(mapKey);
  if (!entry) {
    candidates.set(mapKey, { city: cityName, regionCode: stateCode, count: 1, examples: [example] });
    return;
  }

  entry.count += 1;
  const hasRoom = entry.examples.length < 3;
  if (hasRoom && !entry.examples.includes(example)) {
    entry.examples.push(example);
  }
}
|
|
937
|
+
/**
 * Scans free text (titles, snippets, URLs) for "City, State" mentions and
 * records each one via `addCandidate`.
 *
 * Percent-escapes are decoded run by run, so a stray `%` in ordinary prose
 * (e.g. "100% satisfaction") is left untouched instead of raising `URIError`
 * the way a whole-string `decodeURIComponent` call does. Runs of `+ / | _ -`
 * are then turned into single spaces so URL slugs read like plain words.
 *
 * @param {Map<string, object>} candidates - Accumulator passed to `addCandidate`.
 * @param {string} text - Text to scan.
 * @returns {void}
 */
function scanText(candidates, text) {
  const decoded = String(text).replace(/(?:%[0-9A-Fa-f]{2})+/g, (escapeRun) => {
    try {
      return decodeURIComponent(escapeRun);
    } catch {
      // Syntactically valid escapes that are not valid UTF-8: keep them verbatim.
      return escapeRun;
    }
  });
  const normalized = decoded.replace(/[+/|_-]+/g, " ");
  for (const match of normalized.matchAll(CITY_STATE_RE)) {
    // The first 180 characters of the scanned text serve as the example.
    addCandidate(candidates, match[1] ?? "", match[2] ?? "", normalized.slice(0, 180));
  }
}
|
|
943
|
+
/**
 * Compares the requested location with the city/state mentions found in the
 * organic results and local pack.
 *
 * @param {string | null | undefined} canonicalLocation - Requested location.
 * @param {Iterable<object>} organicResults - Items with `title`, `snippet`, `cite`, `url`.
 * @param {Iterable<object>} localPack - Items with `name`, `metadata`, `websiteUrl`, `directionsUrl`.
 * @returns {{ status: string, expected: (object | null), candidates: object[] }}
 *   `status` is "not_requested" (no location), "unknown" (no mentions found),
 *   "matched" or "mismatch". `candidates` holds at most eight entries, ordered
 *   by count descending and then by city name.
 */
function inferSerpLocationEvidence(canonicalLocation, organicResults, localPack) {
  const expected = parseExpected(canonicalLocation);
  const candidates = /* @__PURE__ */ new Map();

  for (const result of organicResults) {
    const fields = [result.title, result.snippet ?? "", result.cite ?? "", result.url];
    scanText(candidates, fields.join(" "));
  }
  for (const business of localPack) {
    const fields = [business.name, ...business.metadata, business.websiteUrl ?? "", business.directionsUrl ?? ""];
    scanText(candidates, fields.join(" "));
  }

  const byCountThenCity = (a, b) => b.count - a.count || a.city.localeCompare(b.city);
  const rankedCandidates = [...candidates.values()].sort(byCountThenCity).slice(0, 8);

  if (!expected) {
    return { status: "not_requested", expected: null, candidates: rankedCandidates };
  }
  if (rankedCandidates.length === 0) {
    return { status: "unknown", expected, candidates: [] };
  }

  // A missing expected region code means any region is acceptable.
  const isExpected = (candidate) => {
    if (candidate.city.toLowerCase() !== expected.city.toLowerCase()) return false;
    return expected.regionCode == null || candidate.regionCode === expected.regionCode;
  };
  const status = rankedCandidates.some(isExpected) ? "matched" : "mismatch";
  return { status, expected, candidates: rankedCandidates };
}
|
|
447
968
|
|
|
448
969
|
// src/lib/paa-answer-cleanup.ts
|
|
@@ -537,7 +1058,220 @@ function cleanPAAAnswerText(answer, question, sourceTitle) {
|
|
|
537
1058
|
return text;
|
|
538
1059
|
}
|
|
539
1060
|
|
|
1061
|
+
// src/extractor/ai-surfaces.ts
|
|
1062
|
+
/**
 * Extracts Google "AI Overview" and "AI Mode" content from the current document.
 *
 * This function is handed to `page.evaluate`, so it only touches the `window`
 * and `document` globals and keeps every helper inside its own body.
 *
 * Selectors missing from `config` are skipped. Joining an absent entry into a
 * selector list, or passing an empty selector to `querySelector`, would make
 * the DOM throw a `SyntaxError`.
 *
 * @param {object} [config] - Selector configuration with `aio`, `aim` and an
 *   optional `expandWaitMs`; built-in defaults are used when it is nullish.
 * @returns {Promise<{
 *   surface: string,
 *   aiOverview: { detected: boolean, text: (string | null), citations: object[],
 *                 expanded: boolean, fullyExpanded: boolean, sections: string[] },
 *   aiMode: { detected: boolean, text: (string | null), citations: object[] }
 * }>} The extracted surfaces.
 */
async function extractAISurfacesFromDocument(config) {
  const selectors = config ?? {
    aio: {
      root: "[data-lhcontainer][data-streaming-container][eid]",
      legacyRoot: '[data-hveid="CBMQAA"]',
      wrapper: ".Fgyi2e",
      controller: '[jscontroller="AkrxPe"]',
      contentSubtree: '[data-subtree="mfc"]',
      heading: ".Fzsovc.cwYVJe.RJPOee",
      header: ".heWuVc",
      showMoreButton: '[aria-label="Show more AI Overview"]',
      sourcesPanel: ".OZ9ddf.WAUd4",
      disclaimer: ".DuQANe.MSJHRb"
    },
    aim: {
      root: '[data-hveid="CAUQAA"]',
      wrapper: ".Fgyi2e"
    },
    expandWaitMs: 1500
  };
  const sn = window.google?.sn ?? "unknown";
  const surface = sn === "aim" ? "aim" : sn === "web" ? "web" : "unknown";

  // querySelector that treats a missing/empty selector as "no match".
  function queryOne(scope, selector) {
    return selector ? scope.querySelector(selector) : null;
  }

  // Trimmed text of an element, preferring innerText over textContent.
  function textOf(el) {
    if (!el) return "";
    return (el.innerText ?? el.textContent ?? "").trim();
  }

  // True when the element carries an "AI Overview" label in its heading,
  // its header, or anywhere in its text.
  function hasAIOverviewLabel(el) {
    const heading = queryOne(el, selectors.aio.heading);
    if (textOf(heading) === "AI Overview") return true;
    const header = queryOne(el, selectors.aio.header);
    if (textOf(header).split(/\n|\s{2,}/).some((part) => part.trim() === "AI Overview")) return true;
    return textOf(el).includes("AI Overview");
  }

  // Locates the AI Overview container: labeled primary root, any primary
  // root, the legacy root, then a walk up from an "AI Overview" heading.
  function findAIORoot() {
    const primaryRoots = selectors.aio.root ? Array.from(document.querySelectorAll(selectors.aio.root)) : [];
    const labeledPrimary = primaryRoots.find((el) => hasAIOverviewLabel(el));
    if (labeledPrimary) return labeledPrimary;
    if (primaryRoots.length > 0) return primaryRoots[0];
    const legacy = queryOne(document, selectors.aio.legacyRoot);
    if (legacy) return legacy;
    const headingSelector = [selectors.aio.heading, "h1", "h2", "h3", '[role="heading"]'].filter(Boolean).join(", ");
    const headings = document.querySelectorAll(headingSelector);
    for (const h of headings) {
      if (textOf(h) !== "AI Overview") continue;
      let el = h.parentElement;
      // Climb at most eight ancestors looking for a recognizable container.
      for (let i = 0; i < 8 && el; i++) {
        const isRoot = Boolean(selectors.aio.root) && el.matches(selectors.aio.root);
        if (isRoot || queryOne(el, selectors.aio.controller) || queryOne(el, selectors.aio.contentSubtree)) {
          return el;
        }
        el = el.parentElement;
      }
      return h.parentElement;
    }
    return null;
  }

  // Returns the readable answer text of `target`, or null when nothing usable
  // remains or the text looks like an error state.
  function cleanText(target) {
    if (!target) return null;
    const clone = target.cloneNode(true);
    const removable = [
      "script",
      "style",
      "noscript",
      "img",
      "picture",
      "video",
      selectors.aio.header,
      selectors.aio.showMoreButton,
      selectors.aio.sourcesPanel,
      selectors.aio.disclaimer,
      '[data-subtree="dfa"]',
      "[data-src-id]",
      '[role="dialog"]',
      ".HWMcu",
      ".bTFeG",
      ".CyMdWb",
      ".MFrAxb",
      ".F0OfWd.hfWAgb",
      ".x2qcTc.fZavHb",
      ".SvjEff",
      ".sR2MY",
      ".lKuDef",
      ".GSPQcc",
      "a[href]",
      "button",
      '[role="button"]'
    ].filter(Boolean); // drop selectors the config does not provide
    clone.querySelectorAll(removable.join(",")).forEach((el) => el.remove());

    // Attach the clone off-screen before reading innerText.
    const holder = document.createElement("div");
    holder.style.position = "fixed";
    holder.style.left = "-10000px";
    holder.style.top = "0";
    holder.style.width = `${Math.max(320, Math.round(target.getBoundingClientRect?.().width || 960))}px`;
    holder.style.opacity = "0";
    holder.style.pointerEvents = "none";
    holder.append(clone);
    document.body.append(holder);
    let rendered;
    try {
      rendered = clone.innerText || clone.textContent || "";
    } finally {
      // Never leave the measuring node behind in the page.
      holder.remove();
    }

    const lines = rendered
      .replace(/\r/g, "")
      .replace(/[ \t]+\n/g, "\n")
      .replace(/\n[ \t]+/g, "\n")
      .replace(/\n{3,}/g, "\n\n")
      .replace(/[ \t]{2,}/g, " ")
      .trim()
      .split("\n")
      .map((line) => line.replace(/\u00a0/g, " ").trim())
      .filter(Boolean);

    const filteredLines = [];
    for (let i = 0; i < lines.length; i++) {
      const line = lines[i];
      const next = lines[i + 1] ?? "";
      if (line === "AI Overview") continue;
      if (line === "Show more") continue;
      if (/^AI can make mistakes/i.test(line)) continue;
      if (/^Thank you\b/i.test(line)) continue;
      if (/^Your feedback helps Google improve/i.test(line)) continue;
      if (/^\+?\d+$/.test(line)) continue;
      // A short line directly followed by a "+N" line is dropped together with it.
      if (/^\+\d+$/.test(next) && line.length <= 80) {
        i++;
        continue;
      }
      filteredLines.push(line);
    }
    const raw = filteredLines.join("\n").replace(/\n{3,}/g, "\n\n").trim();
    if (!raw || /not available|try again|can't generate/i.test(raw)) return null;
    return raw;
  }

  // Resolves an href to an absolute http(s) URL, unwrapping Google redirect
  // links (`?q=` / `?url=`) and rejecting Google-internal targets.
  function normalizeHref(rawHref) {
    if (!rawHref || rawHref.startsWith("javascript:")) return null;
    let href = rawHref;
    try {
      const absolute = new URL(rawHref, window.location.href);
      const q = absolute.searchParams.get("q") ?? absolute.searchParams.get("url");
      if (/(\.|^)google\./i.test(absolute.hostname) && q?.startsWith("http")) {
        href = q;
      } else {
        href = absolute.href;
      }
    } catch {
      return null;
    }
    if (!/^https?:\/\//i.test(href)) return null;
    try {
      const url = new URL(href);
      const isGoogleInternal = /(\.|^)google\./i.test(url.hostname);
      if (isGoogleInternal) return null;
      return url.href;
    } catch {
      return null;
    }
  }

  // Collects de-duplicated external links under `root` as { text, href }.
  function extractCitations(root) {
    if (!root) return [];
    const seen = /* @__PURE__ */ new Set();
    const citations = [];
    for (const a of Array.from(root.querySelectorAll("a[href]"))) {
      const href = normalizeHref(a.getAttribute("href") ?? "");
      if (!href || seen.has(href)) continue;
      seen.add(href);
      let fallbackHost = "";
      try {
        fallbackHost = new URL(href).hostname.replace(/^www\./, "");
      } catch {
        // href was already validated by normalizeHref; keep the empty fallback.
      }
      citations.push({
        text: textOf(a) || fallbackHost || href,
        href
      });
    }
    return citations;
  }

  // Clicks the "show more" button when it reports aria-expanded="false".
  // Returns true when a click was issued.
  async function maybeExpand(root) {
    const button = queryOne(root, selectors.aio.showMoreButton);
    if (!button || button.getAttribute("aria-expanded") !== "false") return false;
    button.click();
    const waitMs = selectors.expandWaitMs ?? 1500;
    if (waitMs > 0) await new Promise((resolve) => setTimeout(resolve, waitMs));
    return true;
  }

  const aioRoot = findAIORoot();
  let aioText = null;
  let aioCitations = [];
  let aioExpanded = false;
  let aioFullyExpanded = false;
  let aioSections = [];
  if (aioRoot) {
    aioExpanded = await maybeExpand(aioRoot);
    const controller = queryOne(aioRoot, selectors.aio.controller);
    const contentSubtree = queryOne(aioRoot, selectors.aio.contentSubtree);
    const showMore = queryOne(aioRoot, selectors.aio.showMoreButton);
    aioFullyExpanded = controller?.getAttribute("data-trnct") === "false" || showMore?.getAttribute("aria-expanded") === "true" || !showMore;
    aioText = cleanText(contentSubtree ?? controller ?? aioRoot);
    // Lines shaped like "1. Something" are reported as sections.
    aioSections = (aioText ?? "").split("\n").map((line) => line.trim()).filter((line) => /^\d+\.\s+.+/.test(line));
    aioCitations = extractCitations(aioRoot);
  }

  const aimRoot = queryOne(document, selectors.aim.root);
  const aimDetected = surface === "aim" && !!aimRoot;
  const aimWrapper = selectors.aim.wrapper ? aimRoot?.closest(selectors.aim.wrapper) : null;
  const aimContainer = aimWrapper ?? aimRoot;
  const aimText = cleanText(aimContainer);
  const aimCitations = aimDetected ? extractCitations(aimContainer) : [];

  return {
    surface,
    aiOverview: {
      detected: !!aioRoot && aioText !== null,
      text: aioText,
      citations: aioCitations,
      expanded: aioExpanded,
      fullyExpanded: aioFullyExpanded,
      sections: aioSections
    },
    aiMode: {
      detected: aimDetected && aimText !== null,
      text: aimText,
      citations: aimCitations
    }
  };
}
|
|
1271
|
+
|
|
540
1272
|
// src/extractor/PAAExtractor.ts
|
|
1273
|
+
// Desktop user-agent string (Chrome 124 on macOS); `extract` passes it as `userAgent` when the requested device is not "mobile".
var DESKTOP_USER_AGENT2 = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36";
|
|
1274
|
+
// Mobile user-agent string (Safari on iPhone, iOS 17.5); `extract` passes it as `userAgent` when the requested device is "mobile".
var MOBILE_USER_AGENT2 = "Mozilla/5.0 (iPhone; CPU iPhone OS 17_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Mobile/15E148 Safari/604.1";
|
|
541
1275
|
var PAAExtractor = class {
|
|
542
1276
|
constructor(driver, reporter) {
|
|
543
1277
|
this.driver = driver;
|
|
@@ -548,6 +1282,17 @@ var PAAExtractor = class {
|
|
|
548
1282
|
normalizeQuestion(q) {
|
|
549
1283
|
return q.toLowerCase().replace(/[^\w\s]/g, "").replace(/\s+/g, " ").trim();
|
|
550
1284
|
}
|
|
1285
|
+
throwIfAborted(signal) {
|
|
1286
|
+
if (!signal?.aborted) return;
|
|
1287
|
+
if (signal.reason instanceof DOMException && signal.reason.name === "TimeoutError") throw signal.reason;
|
|
1288
|
+
throw new RequestAbortedError();
|
|
1289
|
+
}
|
|
1290
|
+
async throwIfCaptcha(page, context) {
|
|
1291
|
+
const captchaCount = await page.locator(PAASelectors.captchaMarker).count().catch(() => 0);
|
|
1292
|
+
if (captchaCount > 0) {
|
|
1293
|
+
throw new CaptchaError(`${context} returned a CAPTCHA \u2014 retrying with a fresh session.`);
|
|
1294
|
+
}
|
|
1295
|
+
}
|
|
551
1296
|
async extractVisibleItems(page) {
|
|
552
1297
|
const sels = PAASelectors;
|
|
553
1298
|
const raw = await page.evaluate((selectors) => {
|
|
@@ -610,10 +1355,10 @@ var PAAExtractor = class {
|
|
|
610
1355
|
extracted_at: (/* @__PURE__ */ new Date()).toISOString()
|
|
611
1356
|
};
|
|
612
1357
|
}
|
|
613
|
-
async runBFS(page, options) {
|
|
1358
|
+
async runBFS(page, options, signal) {
|
|
614
1359
|
const seenKeys = /* @__PURE__ */ new Set();
|
|
615
1360
|
const seenQs = /* @__PURE__ */ new Set();
|
|
616
|
-
const
|
|
1361
|
+
const orderedQs = [];
|
|
617
1362
|
const results = [];
|
|
618
1363
|
const readAllQs = () => page.evaluate(
|
|
619
1364
|
({ sel, dataQ, dataInitQ, questionEl }) => Array.from(document.querySelectorAll(sel)).map(
|
|
@@ -621,42 +1366,43 @@ var PAAExtractor = class {
|
|
|
621
1366
|
).filter(Boolean),
|
|
622
1367
|
{ sel: PAASelectors.item, dataQ: PAASelectors.itemDataQ, dataInitQ: PAASelectors.itemDataInitQ, questionEl: PAASelectors.itemQuestionEl }
|
|
623
1368
|
);
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
this.
|
|
628
|
-
if (seenQs.size >= options.maxQuestions) break;
|
|
1369
|
+
let round = 0;
|
|
1370
|
+
while (seenQs.size < options.maxQuestions) {
|
|
1371
|
+
this.throwIfAborted(signal);
|
|
1372
|
+
await this.throwIfCaptcha(page, "Google PAA expansion");
|
|
629
1373
|
const beforeQs = await readAllQs();
|
|
630
1374
|
if (beforeQs.length >= options.maxQuestions) break;
|
|
631
|
-
const
|
|
632
|
-
|
|
633
|
-
);
|
|
634
|
-
|
|
635
|
-
for (
|
|
1375
|
+
const unexpandedSel = `${PAASelectors.item}:not(.${PAASelectors.expandedClass}) ${PAASelectors.clickTarget}`;
|
|
1376
|
+
const unexpandedCount = await page.locator(unexpandedSel).count();
|
|
1377
|
+
if (unexpandedCount === 0) break;
|
|
1378
|
+
this.reporter.onDepth(++round);
|
|
1379
|
+
for (let ci = 0; ci < unexpandedCount; ci++) {
|
|
1380
|
+
this.throwIfAborted(signal);
|
|
636
1381
|
try {
|
|
637
|
-
|
|
638
|
-
await
|
|
1382
|
+
const btn = page.locator(unexpandedSel).first();
|
|
1383
|
+
await btn.scrollIntoViewIfNeeded();
|
|
1384
|
+
await btn.hover({ force: true });
|
|
1385
|
+
await page.waitForTimeout(100);
|
|
1386
|
+
await btn.click({ force: true });
|
|
639
1387
|
await page.waitForTimeout(500);
|
|
640
1388
|
} catch {
|
|
641
1389
|
}
|
|
642
1390
|
}
|
|
643
|
-
await page.
|
|
1391
|
+
await page.waitForFunction(
|
|
1392
|
+
({ sel, min }) => document.querySelectorAll(sel).length > min,
|
|
1393
|
+
{ sel: PAASelectors.item, min: beforeQs.length },
|
|
1394
|
+
{ timeout: 5e3 }
|
|
1395
|
+
).catch(() => {
|
|
1396
|
+
});
|
|
1397
|
+
await this.throwIfCaptcha(page, "Google PAA expansion");
|
|
644
1398
|
const afterQs = await readAllQs();
|
|
645
|
-
|
|
646
|
-
const newDups = newQs.filter((q) => seenQs.has(q)).length;
|
|
647
|
-
const dupRate = newQs.length > 0 ? newDups / newQs.length : 0;
|
|
648
|
-
dupRates.push(dupRate);
|
|
649
|
-
if (dupRates.length > 2) dupRates.shift();
|
|
650
|
-
const rollingDupRate = dupRates.reduce((a, b) => a + b, 0) / dupRates.length;
|
|
1399
|
+
if (afterQs.length === beforeQs.length) break;
|
|
651
1400
|
for (const q of afterQs) {
|
|
652
1401
|
if (!seenQs.has(q)) {
|
|
653
1402
|
seenQs.add(q);
|
|
654
1403
|
orderedQs.push(q);
|
|
655
1404
|
}
|
|
656
|
-
if (!depthMap.has(q)) depthMap.set(q, round + 1);
|
|
657
1405
|
}
|
|
658
|
-
if (afterQs.length === beforeQs.length) break;
|
|
659
|
-
if (rollingDupRate >= 0.6) break;
|
|
660
1406
|
}
|
|
661
1407
|
const itemMap = new Map((await this.extractVisibleItems(page)).map((i) => [i.question, i]));
|
|
662
1408
|
for (const q of orderedQs) {
|
|
@@ -664,13 +1410,12 @@ var PAAExtractor = class {
|
|
|
664
1410
|
const key = this.normalizeQuestion(q);
|
|
665
1411
|
if (seenKeys.has(key)) continue;
|
|
666
1412
|
seenKeys.add(key);
|
|
667
|
-
const d = depthMap.get(q) ?? 1;
|
|
668
1413
|
const item = itemMap.get(q);
|
|
669
1414
|
if (item) {
|
|
670
|
-
results.push(this.toFlatRow(item,
|
|
671
|
-
this.reporter.onQuestion({ question: item.question, answer: item.answer ?? null, sourceTitle: item.sourceTitle ?? null, sourceSite: item.sourceSite ?? null, sourceCite: item.sourceCite ?? null, depth:
|
|
1415
|
+
results.push(this.toFlatRow(item, 1, null, options.query));
|
|
1416
|
+
this.reporter.onQuestion({ question: item.question, answer: item.answer ?? null, sourceTitle: item.sourceTitle ?? null, sourceSite: item.sourceSite ?? null, sourceCite: item.sourceCite ?? null, depth: 1, parentQuestion: null, children: [] });
|
|
672
1417
|
} else {
|
|
673
|
-
results.push(this.toFlatRow({ question: q, answer: void 0, sourceTitle: void 0, sourceSite: void 0, sourceCite: void 0 },
|
|
1418
|
+
results.push(this.toFlatRow({ question: q, answer: void 0, sourceTitle: void 0, sourceSite: void 0, sourceCite: void 0 }, 1, null, options.query));
|
|
674
1419
|
}
|
|
675
1420
|
}
|
|
676
1421
|
return results;
|
|
@@ -728,6 +1473,7 @@ var PAAExtractor = class {
|
|
|
728
1473
|
} catch {
|
|
729
1474
|
return [];
|
|
730
1475
|
}
|
|
1476
|
+
await this.throwIfCaptcha(page, "Google short video search");
|
|
731
1477
|
const svSels = {
|
|
732
1478
|
item: ShortVideoSelectors.item,
|
|
733
1479
|
platforms: [...ShortVideoSelectors.platforms]
|
|
@@ -1009,69 +1755,11 @@ var PAAExtractor = class {
|
|
|
1009
1755
|
return { ...entityIds, entities: records, cids: [...cidSet] };
|
|
1010
1756
|
}
|
|
1011
1757
|
async extractAISurfaces(page) {
|
|
1012
|
-
|
|
1013
|
-
|
|
1014
|
-
|
|
1015
|
-
|
|
1016
|
-
|
|
1017
|
-
function findAIORoot() {
|
|
1018
|
-
const primary = document.querySelector(aio.root);
|
|
1019
|
-
if (primary) return primary;
|
|
1020
|
-
const headings = document.querySelectorAll('h1, h2, h3, [role="heading"]');
|
|
1021
|
-
for (const h of headings) {
|
|
1022
|
-
if (h.textContent?.trim() === "AI Overview") {
|
|
1023
|
-
let el = h.parentElement;
|
|
1024
|
-
for (let i = 0; i < 6 && el; i++) {
|
|
1025
|
-
if (el.querySelectorAll("a").length > 1) return el;
|
|
1026
|
-
el = el.parentElement;
|
|
1027
|
-
}
|
|
1028
|
-
return h.parentElement;
|
|
1029
|
-
}
|
|
1030
|
-
}
|
|
1031
|
-
return null;
|
|
1032
|
-
}
|
|
1033
|
-
const aioRoot = findAIORoot();
|
|
1034
|
-
const aioContainer = aioRoot ? aioRoot.closest(aio.wrapper) ?? aioRoot : null;
|
|
1035
|
-
let aioText = null;
|
|
1036
|
-
if (aioContainer) {
|
|
1037
|
-
const clone = aioContainer.cloneNode(true);
|
|
1038
|
-
clone.querySelectorAll("script,style,noscript").forEach((el) => el.remove());
|
|
1039
|
-
clone.querySelectorAll('h1,h2,h3,h4,[role="heading"]').forEach((el) => el.remove());
|
|
1040
|
-
clone.querySelectorAll('button,[role="button"]').forEach((el) => el.remove());
|
|
1041
|
-
clone.querySelectorAll("a").forEach((el) => el.remove());
|
|
1042
|
-
const candidate = clone.textContent?.replace(/\s+/g, " ").trim() || null;
|
|
1043
|
-
const isErrorState = !candidate || /not available|try again|can't generate/i.test(candidate);
|
|
1044
|
-
aioText = isErrorState ? null : candidate;
|
|
1045
|
-
}
|
|
1046
|
-
const aioDetected = !!aioRoot && aioText !== null;
|
|
1047
|
-
const aioCitations = Array.from(aioContainer?.querySelectorAll("a[href]") ?? []).filter((a) => a.href && !a.href.startsWith("javascript")).map((a) => ({
|
|
1048
|
-
text: a.textContent?.trim() ?? "",
|
|
1049
|
-
href: a.href
|
|
1050
|
-
})).filter((c) => c.text && c.href);
|
|
1051
|
-
const aimRoot = document.querySelector(aim.root);
|
|
1052
|
-
const aimDetected = surface === "aim" && !!aimRoot;
|
|
1053
|
-
const aimContainer = aimRoot?.closest(aim.wrapper) ?? null;
|
|
1054
|
-
let aimText = null;
|
|
1055
|
-
if (aimContainer) {
|
|
1056
|
-
const clone = aimContainer.cloneNode(true);
|
|
1057
|
-
clone.querySelectorAll("script,style,noscript").forEach((el) => el.remove());
|
|
1058
|
-
clone.querySelectorAll('h1,h2,h3,h4,[role="heading"]').forEach((el) => el.remove());
|
|
1059
|
-
clone.querySelectorAll('button,[role="button"]').forEach((el) => el.remove());
|
|
1060
|
-
clone.querySelectorAll("a").forEach((el) => el.remove());
|
|
1061
|
-
const candidate = clone.textContent?.replace(/\s+/g, " ").trim() || null;
|
|
1062
|
-
const isErrorState = !candidate || /not available|try again|can't generate/i.test(candidate);
|
|
1063
|
-
aimText = isErrorState ? null : candidate;
|
|
1064
|
-
}
|
|
1065
|
-
const aimCitations = aimDetected ? Array.from(aimContainer?.querySelectorAll("a[href]") ?? []).filter((a) => a.href && !a.href.startsWith("javascript")).map((a) => ({
|
|
1066
|
-
text: a.textContent?.trim() ?? "",
|
|
1067
|
-
href: a.href
|
|
1068
|
-
})).filter((c) => c.text && c.href) : [];
|
|
1069
|
-
return {
|
|
1070
|
-
surface,
|
|
1071
|
-
aiOverview: { detected: aioDetected, text: aioText, citations: aioCitations },
|
|
1072
|
-
aiMode: { detected: aimDetected, text: aimText, citations: aimCitations }
|
|
1073
|
-
};
|
|
1074
|
-
}, { aio: aioSels, aim: aimSels });
|
|
1758
|
+
return page.evaluate(extractAISurfacesFromDocument, {
|
|
1759
|
+
aio: AIOverviewSelectors,
|
|
1760
|
+
aim: AIModeSelectors,
|
|
1761
|
+
expandWaitMs: 1500
|
|
1762
|
+
});
|
|
1075
1763
|
}
|
|
1076
1764
|
buildTree(flat, _seed) {
|
|
1077
1765
|
const roots = [];
|
|
@@ -1098,23 +1786,70 @@ var PAAExtractor = class {
|
|
|
1098
1786
|
}
|
|
1099
1787
|
return roots;
|
|
1100
1788
|
}
|
|
1101
|
-
|
|
1789
|
+
getBrowserDebugSnapshot() {
|
|
1790
|
+
return this.driver.getDebugSnapshot();
|
|
1791
|
+
}
|
|
1792
|
+
buildHarvestDebugSnapshot(options, canonicalLocation, uule, locationEvidence) {
|
|
1793
|
+
if (!options.debug) return void 0;
|
|
1794
|
+
return {
|
|
1795
|
+
enabled: true,
|
|
1796
|
+
request: {
|
|
1797
|
+
query: options.query,
|
|
1798
|
+
locationInput: options.location ?? null,
|
|
1799
|
+
canonicalLocation,
|
|
1800
|
+
uule,
|
|
1801
|
+
gl: options.gl,
|
|
1802
|
+
hl: options.hl,
|
|
1803
|
+
device: options.device,
|
|
1804
|
+
proxyMode: options.proxyMode,
|
|
1805
|
+
proxyZip: options.proxyZip ?? null,
|
|
1806
|
+
serpOnly: options.serpOnly,
|
|
1807
|
+
pages: options.pages ?? 1
|
|
1808
|
+
},
|
|
1809
|
+
browser: this.getBrowserDebugSnapshot(),
|
|
1810
|
+
...locationEvidence ? { locationEvidence } : {}
|
|
1811
|
+
};
|
|
1812
|
+
}
|
|
1813
|
+
async extract(options, signal) {
|
|
1102
1814
|
const startMs = Date.now();
|
|
1815
|
+
const isMobile = options.device === "mobile";
|
|
1103
1816
|
const config = {
|
|
1104
1817
|
headless: options.headless,
|
|
1105
1818
|
profileDir: options.profileDir,
|
|
1106
1819
|
proxy: options.proxy,
|
|
1107
1820
|
kernelApiKey: options.kernelApiKey,
|
|
1108
1821
|
kernelProxyId: options.kernelProxyId,
|
|
1109
|
-
|
|
1110
|
-
|
|
1822
|
+
kernelProxyResolution: options.kernelProxyResolution,
|
|
1823
|
+
proxyMode: options.proxyMode,
|
|
1824
|
+
viewport: isMobile ? { width: 390, height: 844 } : { width: 1280, height: 800 },
|
|
1825
|
+
locale: `${options.hl}-${options.gl.toUpperCase()}`,
|
|
1826
|
+
userAgent: isMobile ? MOBILE_USER_AGENT2 : DESKTOP_USER_AGENT2,
|
|
1827
|
+
deviceScaleFactor: isMobile ? 3 : 1,
|
|
1828
|
+
isMobile,
|
|
1829
|
+
hasTouch: isMobile,
|
|
1830
|
+
debug: options.debug
|
|
1111
1831
|
};
|
|
1112
1832
|
let errorCount = 0;
|
|
1833
|
+
const diagnosticWarnings = [];
|
|
1113
1834
|
try {
|
|
1835
|
+
this.throwIfAborted(signal);
|
|
1114
1836
|
await this.driver.launch(config);
|
|
1115
|
-
|
|
1116
|
-
const
|
|
1837
|
+
this.throwIfAborted(signal);
|
|
1838
|
+
const canonicalLocation = options.location ? normalizeLocation(options.location) : null;
|
|
1839
|
+
const uule = canonicalLocation ? encodeUule(canonicalLocation) : null;
|
|
1840
|
+
const { hasPaa } = await this.driver.navigateToSERP(
|
|
1841
|
+
options.query,
|
|
1842
|
+
uule,
|
|
1843
|
+
options.gl,
|
|
1844
|
+
options.hl,
|
|
1845
|
+
{
|
|
1846
|
+
...options.serpOnly ? { num: 100 } : {},
|
|
1847
|
+
debug: options.debug
|
|
1848
|
+
}
|
|
1849
|
+
);
|
|
1850
|
+
this.throwIfAborted(signal);
|
|
1117
1851
|
const page = this.driver.getPage();
|
|
1852
|
+
await this.throwIfCaptcha(page, "Google SERP");
|
|
1118
1853
|
if (options.serpOnly) {
|
|
1119
1854
|
const [organicResults2, localPack2, rawEntityIds2] = await Promise.all([
|
|
1120
1855
|
this.extractOrganicResults(page),
|
|
@@ -1122,13 +1857,19 @@ var PAAExtractor = class {
|
|
|
1122
1857
|
this.extractEntityIds(page)
|
|
1123
1858
|
]);
|
|
1124
1859
|
const entityIds2 = this.mergeLocalPackIntoEntities(rawEntityIds2, localPack2);
|
|
1860
|
+
const aiSurfaces2 = await this.extractAISurfaces(page);
|
|
1861
|
+
let locationEvidence2 = options.debug ? inferSerpLocationEvidence(canonicalLocation, organicResults2, localPack2) : void 0;
|
|
1125
1862
|
let allOrganic2 = organicResults2;
|
|
1126
1863
|
if ((options.pages ?? 1) >= 2) {
|
|
1127
|
-
const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, start: "10" });
|
|
1864
|
+
const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, pws: "0", start: "10" });
|
|
1128
1865
|
if (uule) p2params.set("uule", uule);
|
|
1129
1866
|
await this.driver.navigateTo("https://www.google.com/search?" + p2params.toString());
|
|
1867
|
+
await this.throwIfCaptcha(page, "Google SERP page 2");
|
|
1130
1868
|
const p2organic = await this.extractOrganicResults(page);
|
|
1131
1869
|
allOrganic2 = [...organicResults2, ...p2organic.map((r) => ({ ...r, position: r.position + 10 }))];
|
|
1870
|
+
if (options.debug) {
|
|
1871
|
+
locationEvidence2 = inferSerpLocationEvidence(canonicalLocation, allOrganic2, localPack2);
|
|
1872
|
+
}
|
|
1132
1873
|
}
|
|
1133
1874
|
const stats2 = {
|
|
1134
1875
|
seed: options.query,
|
|
@@ -1142,10 +1883,15 @@ var PAAExtractor = class {
|
|
|
1142
1883
|
seed: options.query,
|
|
1143
1884
|
location: options.location ?? null,
|
|
1144
1885
|
extractedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
1886
|
+
diagnostics: {
|
|
1887
|
+
completionStatus: "serp_only",
|
|
1888
|
+
problem: null,
|
|
1889
|
+
...options.debug ? { debug: this.buildHarvestDebugSnapshot(options, canonicalLocation, uule, locationEvidence2) } : {}
|
|
1890
|
+
},
|
|
1145
1891
|
totalQuestions: 0,
|
|
1146
|
-
surface:
|
|
1147
|
-
aiOverview:
|
|
1148
|
-
aiMode:
|
|
1892
|
+
surface: aiSurfaces2.surface,
|
|
1893
|
+
aiOverview: aiSurfaces2.aiOverview,
|
|
1894
|
+
aiMode: aiSurfaces2.aiMode,
|
|
1149
1895
|
whatPeopleSaying: [],
|
|
1150
1896
|
tree: [],
|
|
1151
1897
|
flat: [],
|
|
@@ -1166,16 +1912,22 @@ var PAAExtractor = class {
|
|
|
1166
1912
|
this.extractLocalPack(page)
|
|
1167
1913
|
]);
|
|
1168
1914
|
const entityIds = this.mergeLocalPackIntoEntities(rawEntityIds, localPack);
|
|
1915
|
+
const initialLocationEvidence = options.debug ? inferSerpLocationEvidence(canonicalLocation, organicResults, localPack) : void 0;
|
|
1169
1916
|
this.reporter.onVideos(videos);
|
|
1170
1917
|
this.reporter.onForums(forums);
|
|
1171
1918
|
if (!hasPaa) {
|
|
1172
1919
|
let noPaaOrganic = organicResults;
|
|
1920
|
+
let locationEvidence2 = initialLocationEvidence;
|
|
1173
1921
|
if ((options.pages ?? 1) >= 2) {
|
|
1174
|
-
const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, start: "10" });
|
|
1922
|
+
const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, pws: "0", start: "10" });
|
|
1175
1923
|
if (uule) p2params.set("uule", uule);
|
|
1176
1924
|
await this.driver.navigateTo("https://www.google.com/search?" + p2params.toString());
|
|
1925
|
+
await this.throwIfCaptcha(page, "Google SERP page 2");
|
|
1177
1926
|
const p2organic = await this.extractOrganicResults(page);
|
|
1178
1927
|
noPaaOrganic = [...organicResults, ...p2organic.map((r) => ({ ...r, position: r.position + 10 }))];
|
|
1928
|
+
if (options.debug) {
|
|
1929
|
+
locationEvidence2 = inferSerpLocationEvidence(canonicalLocation, noPaaOrganic, localPack);
|
|
1930
|
+
}
|
|
1179
1931
|
}
|
|
1180
1932
|
const aiSurfaces2 = await this.extractAISurfaces(page);
|
|
1181
1933
|
const stats2 = {
|
|
@@ -1190,6 +1942,11 @@ var PAAExtractor = class {
|
|
|
1190
1942
|
seed: options.query,
|
|
1191
1943
|
location: options.location ?? null,
|
|
1192
1944
|
extractedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
1945
|
+
diagnostics: {
|
|
1946
|
+
completionStatus: "no_paa",
|
|
1947
|
+
problem: null,
|
|
1948
|
+
...options.debug ? { debug: this.buildHarvestDebugSnapshot(options, canonicalLocation, uule, locationEvidence2) } : {}
|
|
1949
|
+
},
|
|
1193
1950
|
totalQuestions: 0,
|
|
1194
1951
|
surface: aiSurfaces2.surface,
|
|
1195
1952
|
aiOverview: aiSurfaces2.aiOverview,
|
|
@@ -1205,19 +1962,37 @@ var PAAExtractor = class {
|
|
|
1205
1962
|
stats: stats2
|
|
1206
1963
|
};
|
|
1207
1964
|
}
|
|
1208
|
-
const flat = await this.runBFS(page, options);
|
|
1965
|
+
const flat = await this.runBFS(page, options, signal);
|
|
1966
|
+
this.throwIfAborted(signal);
|
|
1209
1967
|
const aiSurfaces = await this.extractAISurfaces(page);
|
|
1210
|
-
const shortVidsParams = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, udm: ShortVideoSelectors.udm });
|
|
1968
|
+
const shortVidsParams = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, pws: "0", udm: ShortVideoSelectors.udm });
|
|
1211
1969
|
if (uule) shortVidsParams.set("uule", uule);
|
|
1212
|
-
|
|
1970
|
+
let shortVideos = [];
|
|
1971
|
+
try {
|
|
1972
|
+
shortVideos = await this.extractShortVideos(page, "https://www.google.com/search?" + shortVidsParams.toString());
|
|
1973
|
+
} catch (err) {
|
|
1974
|
+
if (!(err instanceof CaptchaError)) throw err;
|
|
1975
|
+
errorCount++;
|
|
1976
|
+
diagnosticWarnings.push({
|
|
1977
|
+
code: "short_videos_captcha_skipped",
|
|
1978
|
+
surface: "short_videos",
|
|
1979
|
+
message: err.message,
|
|
1980
|
+
retryable: true
|
|
1981
|
+
});
|
|
1982
|
+
}
|
|
1213
1983
|
this.reporter.onVideos(shortVideos);
|
|
1214
1984
|
let allOrganic = organicResults;
|
|
1985
|
+
let locationEvidence = initialLocationEvidence;
|
|
1215
1986
|
if ((options.pages ?? 1) >= 2) {
|
|
1216
|
-
const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, start: "10" });
|
|
1987
|
+
const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, pws: "0", start: "10" });
|
|
1217
1988
|
if (uule) p2params.set("uule", uule);
|
|
1218
1989
|
await this.driver.navigateTo("https://www.google.com/search?" + p2params.toString());
|
|
1990
|
+
await this.throwIfCaptcha(page, "Google SERP page 2");
|
|
1219
1991
|
const p2organic = await this.extractOrganicResults(page);
|
|
1220
1992
|
allOrganic = [...organicResults, ...p2organic.map((r) => ({ ...r, position: r.position + 10 }))];
|
|
1993
|
+
if (options.debug) {
|
|
1994
|
+
locationEvidence = inferSerpLocationEvidence(canonicalLocation, allOrganic, localPack);
|
|
1995
|
+
}
|
|
1221
1996
|
}
|
|
1222
1997
|
const allVideos = [...videos, ...shortVideos];
|
|
1223
1998
|
const tree = this.buildTree(flat, options.query);
|
|
@@ -1233,6 +2008,12 @@ var PAAExtractor = class {
|
|
|
1233
2008
|
seed: options.query,
|
|
1234
2009
|
location: options.location ?? null,
|
|
1235
2010
|
extractedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2011
|
+
diagnostics: {
|
|
2012
|
+
completionStatus: "paa_found",
|
|
2013
|
+
problem: null,
|
|
2014
|
+
...diagnosticWarnings.length > 0 ? { warnings: diagnosticWarnings } : {},
|
|
2015
|
+
...options.debug ? { debug: this.buildHarvestDebugSnapshot(options, canonicalLocation, uule, locationEvidence) } : {}
|
|
2016
|
+
},
|
|
1236
2017
|
totalQuestions: flat.length,
|
|
1237
2018
|
surface: aiSurfaces.surface,
|
|
1238
2019
|
aiOverview: aiSurfaces.aiOverview,
|
|
@@ -1251,8 +2032,6 @@ var PAAExtractor = class {
|
|
|
1251
2032
|
errorCount++;
|
|
1252
2033
|
this.reporter.onError(err instanceof Error ? err : new Error(String(err)));
|
|
1253
2034
|
throw err;
|
|
1254
|
-
} finally {
|
|
1255
|
-
await this.driver.close();
|
|
1256
2035
|
}
|
|
1257
2036
|
}
|
|
1258
2037
|
};
|
|
@@ -1366,61 +2145,654 @@ var ProgressReporter = class {
|
|
|
1366
2145
|
}
|
|
1367
2146
|
};
|
|
1368
2147
|
|
|
2148
|
+
// src/kernel-proxy-resolver.ts
|
|
2149
|
+
var import_sdk2 = __toESM(require("@onkernel/sdk"), 1);
|
|
2150
|
+
// Lower-cased US state names mapped to their two-letter abbreviations.
// stateCodeFor looks names up here after normalizeStateName has lower-cased
// them and collapsed whitespace, so multi-word keys use single spaces.
var US_STATE_CODES = Object.fromEntries([
  ["alabama", "AL"], ["alaska", "AK"], ["arizona", "AZ"], ["arkansas", "AR"],
  ["california", "CA"], ["colorado", "CO"], ["connecticut", "CT"], ["delaware", "DE"],
  ["florida", "FL"], ["georgia", "GA"], ["hawaii", "HI"], ["idaho", "ID"],
  ["illinois", "IL"], ["indiana", "IN"], ["iowa", "IA"], ["kansas", "KS"],
  ["kentucky", "KY"], ["louisiana", "LA"], ["maine", "ME"], ["maryland", "MD"],
  ["massachusetts", "MA"], ["michigan", "MI"], ["minnesota", "MN"], ["mississippi", "MS"],
  ["missouri", "MO"], ["montana", "MT"], ["nebraska", "NE"], ["nevada", "NV"],
  ["new hampshire", "NH"], ["new jersey", "NJ"], ["new mexico", "NM"], ["new york", "NY"],
  ["north carolina", "NC"], ["north dakota", "ND"], ["ohio", "OH"], ["oklahoma", "OK"],
  ["oregon", "OR"], ["pennsylvania", "PA"], ["rhode island", "RI"], ["south carolina", "SC"],
  ["south dakota", "SD"], ["tennessee", "TN"], ["texas", "TX"], ["utah", "UT"],
  ["vermont", "VT"], ["virginia", "VA"], ["washington", "WA"], ["west virginia", "WV"],
  ["wisconsin", "WI"], ["wyoming", "WY"]
]);
|
|
2202
|
+
// ZIP codes for a fixed set of US cities, keyed as "<city identifier>|<STATE>".
// knownZipFor builds the key through cityZipKey, so the city part must match
// the underscore form produced by kernelCityIdentifierCandidates.
var US_CITY_CENTER_ZIPS = Object.fromEntries([
  ["atlanta|GA", "30303"], ["austin|TX", "78701"], ["baltimore|MD", "21201"],
  ["boston|MA", "02108"], ["boulder|CO", "80302"], ["charlotte|NC", "28202"],
  ["chicago|IL", "60601"], ["colorado_springs|CO", "80903"], ["columbus|OH", "43215"],
  ["dallas|TX", "75201"], ["denver|CO", "80202"], ["detroit|MI", "48226"],
  ["fort_collins|CO", "80524"], ["fort_worth|TX", "76102"], ["houston|TX", "77002"],
  ["indianapolis|IN", "46204"], ["jacksonville|FL", "32202"], ["las_vegas|NV", "89101"],
  ["los_angeles|CA", "90012"], ["louisville|KY", "40202"], ["loveland|CO", "80537"],
  ["memphis|TN", "38103"], ["miami|FL", "33131"], ["minneapolis|MN", "55401"],
  ["nashville|TN", "37203"], ["new_york|NY", "10001"], ["orlando|FL", "32801"],
  ["philadelphia|PA", "19103"], ["phoenix|AZ", "85004"], ["portland|OR", "97205"],
  ["raleigh|NC", "27601"], ["richmond|VA", "23219"], ["sacramento|CA", "95814"],
  ["salt_lake_city|UT", "84101"], ["san_antonio|TX", "78205"], ["san_diego|CA", "92101"],
  ["san_francisco|CA", "94103"], ["san_jose|CA", "95113"], ["seattle|WA", "98101"]
]);
|
|
2243
|
+
/**
 * Returns the trailing six characters of a proxy id for diagnostics.
 *
 * @param {string|undefined|null} proxyId - Proxy id, possibly absent.
 * @returns {string|null} The last six characters (the whole id when shorter),
 *   or null when the id is empty or missing.
 */
function proxyIdSuffix2(proxyId) {
  if (!proxyId) {
    return null;
  }
  return proxyId.slice(-6);
}
|
|
2246
|
+
/**
 * Bundles the proxy id to use with a record of how it was chosen.
 *
 * @param {string} source - How the id was obtained (e.g. "disabled",
 *   "configured_fallback", "location_reused", "location_created").
 * @param {string} proxyMode - The proxy mode that was requested.
 * @param {string|undefined} proxyId - The selected proxy id, if any.
 * @param {object|null} target - The location target that was attempted, if any.
 * @param {string|null} error - Accumulated error text, or null.
 * @returns {{kernelProxyId: (string|undefined), resolution: object}}
 */
function resolution(source, proxyMode, proxyId, target, error) {
  // Only presence and a short suffix of the id are recorded in the details.
  const details = {
    source,
    proxyMode,
    proxyIdPresent: Boolean(proxyId),
    proxyIdSuffix: proxyIdSuffix2(proxyId),
    target,
    error
  };
  return { kernelProxyId: proxyId, resolution: details };
}
|
|
2259
|
+
/**
 * Canonicalizes a state name for lookup in US_STATE_CODES.
 *
 * @param {string} value - Raw state name.
 * @returns {string} The name trimmed, lower-cased, with each whitespace run
 *   collapsed to a single space.
 */
function normalizeStateName(value) {
  const lowered = value.trim().toLowerCase();
  return lowered.replace(/\s+/g, " ");
}
|
|
2262
|
+
/**
 * Canonicalizes a country name for comparison.
 *
 * @param {string} value - Raw country name.
 * @returns {string} The name trimmed, lower-cased, with periods removed and
 *   each whitespace run collapsed to a single space.
 */
function normalizeCountryName(value) {
  const lowered = value.trim().toLowerCase();
  const withoutDots = lowered.replace(/\./g, "");
  return withoutDots.replace(/\s+/g, " ");
}
|
|
2265
|
+
/**
 * Tells whether a country label refers to the United States.
 *
 * @param {string|undefined|null} country - Country label; a missing or empty
 *   value is treated as the United States.
 * @returns {boolean} True for an empty value or one of the accepted spellings.
 */
function isUnitedStates(country) {
  if (!country) {
    return true;
  }
  const acceptedSpellings = ["united states", "united states of america", "usa", "us"];
  return acceptedSpellings.includes(normalizeCountryName(country));
}
|
|
2270
|
+
/**
 * Resolves a region string to a two-letter US state code.
 *
 * A two-letter input is accepted only when it is a code listed in
 * US_STATE_CODES or one of the extra district/territory codes below.
 * Previously any two letters passed through, so inputs such as "US", "FR" or
 * "zz" were treated as states and led to pointless proxy-creation attempts.
 * Longer inputs are looked up by normalized state name.
 *
 * @param {string} region - State name or two-letter abbreviation.
 * @returns {string|null} Upper-case state code, or null when unrecognized.
 */
function stateCodeFor(region) {
  const trimmed = region.trim();
  if (/^[A-Za-z]{2}$/.test(trimmed)) {
    const code = trimmed.toUpperCase();
    // Codes that are valid two-letter abbreviations but have no entry in the
    // state-name table (DC and the US territories); kept so that inputs like
    // "Washington, DC" continue to resolve.
    const nonStateCodes = ["DC", "AS", "GU", "MP", "PR", "VI"];
    const recognized = Object.values(US_STATE_CODES).includes(code) || nonStateCodes.includes(code);
    return recognized ? code : null;
  }
  const key = normalizeStateName(trimmed);
  // Own-property check so names like "constructor" do not resolve to
  // inherited Object.prototype members.
  return Object.prototype.hasOwnProperty.call(US_STATE_CODES, key) ? US_STATE_CODES[key] : null;
}
|
|
2275
|
+
/**
 * Builds the candidate city identifiers to try for a city name.
 *
 * The name is NFKD-normalized, stripped of non-ASCII characters, lower-cased
 * and split on anything that is not a letter or digit. Two spellings are
 * offered: words joined by underscores, then words joined with nothing.
 *
 * @param {string} city - Human-readable city name.
 * @returns {string[]} Distinct, non-empty identifiers, underscore form first.
 */
function kernelCityIdentifierCandidates(city) {
  const tokens = city
    .normalize("NFKD")
    .replace(/[^\x00-\x7F]/g, "")
    .toLowerCase()
    .split(/[^a-z0-9]+/)
    .filter((token) => token.length > 0);
  const candidates = [];
  for (const spelling of [tokens.join("_"), tokens.join("")]) {
    // Single-word names give the same spelling twice; keep it once.
    if (spelling !== "" && !candidates.includes(spelling)) {
      candidates.push(spelling);
    }
  }
  return candidates;
}
|
|
2282
|
+
/**
 * Builds the name used for a state-level or city-level residential proxy.
 *
 * @param {string} country - Country code; lower-cased in the name.
 * @param {string} state - State code; lower-cased in the name.
 * @param {string} [city] - City identifier, appended verbatim when truthy.
 * @returns {string} "mcp-serp-residential-<country>-<state>[-<city>]".
 */
function proxyName(country, state, city) {
  const base = `mcp-serp-residential-${country.toLowerCase()}-${state.toLowerCase()}`;
  if (!city) {
    return base;
  }
  return `${base}-${city}`;
}
|
|
2285
|
+
/**
 * Builds the name used for a ZIP-level residential proxy.
 *
 * @param {string} zip - ZIP code.
 * @returns {string} "mcp-serp-residential-us-zip-<zip>".
 */
function zipProxyName(zip) {
  const prefix = "mcp-serp-residential-us-zip-";
  return `${prefix}${zip}`;
}
|
|
2288
|
+
/**
 * Turns a free-form location into a US proxy target description.
 *
 * The location is canonicalized with normalizeLocation and split on commas.
 * A trailing United States component is dropped. One remaining component is
 * read as a state; otherwise the first two are read as city and region.
 *
 * @param {string|undefined} location - Requested location, if any.
 * @param {string} gl - Country code of the search; only "us" (any case) is handled.
 * @returns {object|null} A target with canonicalLocation, level ("state" or
 *   "city"), country, state, city, cityCandidates, proxyName and config, or
 *   null when the location is missing, non-US or cannot be resolved to a state.
 */
function parseKernelLocationProxyTarget(location, gl) {
  if (!location) {
    return null;
  }
  if (gl.toLowerCase() !== "us") {
    return null;
  }
  const canonicalLocation = normalizeLocation(location);
  const segments = canonicalLocation
    .split(",")
    .map((segment) => segment.trim())
    .filter((segment) => segment.length > 0);
  // Drop a trailing country component, but never reduce the list to nothing.
  const endsWithCountry = segments.length > 1 && isUnitedStates(segments[segments.length - 1]);
  const parts = endsWithCountry ? segments.slice(0, -1) : segments;

  if (parts.length === 1) {
    const stateOnly = stateCodeFor(parts[0]);
    if (!stateOnly) {
      return null;
    }
    return {
      canonicalLocation,
      level: "state",
      country: "US",
      state: stateOnly,
      city: "",
      cityCandidates: [],
      proxyName: proxyName("US", stateOnly),
      config: { country: "US", state: stateOnly }
    };
  }

  // Components beyond the first two are ignored.
  const city = parts[0] === void 0 ? "" : parts[0];
  const region = parts[1] === void 0 ? "" : parts[1];
  if (!city || !region) {
    return null;
  }
  const state = stateCodeFor(region);
  if (!state) {
    return null;
  }
  const cityCandidates = kernelCityIdentifierCandidates(city);
  const primaryCity = cityCandidates[0];
  if (!primaryCity) {
    return null;
  }
  return {
    canonicalLocation,
    level: "city",
    country: "US",
    state,
    city: primaryCity,
    cityCandidates,
    proxyName: proxyName("US", state, primaryCity),
    config: { country: "US", state, city: primaryCity }
  };
}
|
|
2334
|
+
/**
 * Builds the US_CITY_CENTER_ZIPS lookup key for a target.
 *
 * @param {{city: string, state: string}} target - Proxy target.
 * @returns {string} "<city>|<state>".
 */
function cityZipKey(target) {
  const { city, state } = target;
  return `${city}|${state}`;
}
|
|
2337
|
+
/**
 * Picks the ZIP code to use for a target.
 *
 * @param {{city: string, state: string}} target - Proxy target.
 * @param {string} [explicitZip] - Caller-supplied ZIP; used only when it is
 *   exactly five digits.
 * @returns {string|null} The explicit ZIP when valid, else the table entry
 *   for the target's city/state, else null.
 */
function knownZipFor(target, explicitZip) {
  const explicitIsUsable = Boolean(explicitZip) && /^\d{5}$/.test(explicitZip);
  if (explicitIsUsable) {
    return explicitZip;
  }
  const tableEntry = US_CITY_CENTER_ZIPS[cityZipKey(target)];
  return tableEntry ?? null;
}
|
|
2341
|
+
/**
 * Derives a ZIP-level target from an existing target.
 *
 * @param {object} target - Base target; all of its properties are carried over.
 * @param {string} zip - ZIP code to target.
 * @returns {object} Copy of the target with level "zip", the zip, a ZIP proxy
 *   name and a config of country, state and zip.
 */
function zipTarget(target, zip) {
  const config = { country: target.country, state: target.state, zip };
  return { ...target, level: "zip", zip, proxyName: zipProxyName(zip), config };
}
|
|
2354
|
+
/**
 * Checks whether an existing proxy's config corresponds to a target.
 *
 * Country must always match (case-insensitively on the config side). ZIP
 * targets additionally compare only the zip. Other targets compare the state
 * and then the city: with a `city` argument the config city must equal it,
 * without one the config must have no city at all.
 *
 * @param {object|undefined|null} config - Config of an existing proxy.
 * @param {object} target - Target with level, country, state and (for ZIP
 *   targets) zip.
 * @param {string} [city] - City identifier to require.
 * @returns {boolean} True when the config matches.
 */
function configMatches(config, target, city) {
  if (config?.country?.toUpperCase() !== target.country) {
    return false;
  }
  if (target.level === "zip") {
    return config?.zip === target.zip;
  }
  if (config?.state?.toUpperCase() !== target.state) {
    return false;
  }
  if (city) {
    return config?.city === city;
  }
  return !config?.city;
}
|
|
2360
|
+
/**
 * Finds an already-provisioned proxy that serves exactly the given target.
 *
 * A proxy qualifies when it is residential, not "unavailable", has an id, and
 * either carries the target's proxy name or has a matching config. The city
 * is part of the config comparison only for city-level targets.
 *
 * @param {object[]} proxies - Proxies to search.
 * @param {object} target - Target with proxyName, level and location fields.
 * @returns {object|null} The first qualifying proxy, or null.
 */
function findExistingTargetProxy(proxies, target) {
  const requiredCity = target.level === "city" ? target.city : void 0;
  const qualifies = (proxy) => {
    if (proxy.type !== "residential") return false;
    if (proxy.status === "unavailable") return false;
    if (!proxy.id) return false;
    if (proxy.name === target.proxyName) return true;
    return configMatches(proxy.config, target, requiredCity);
  };
  const match = proxies.find(qualifies);
  return match ?? null;
}
|
|
2363
|
+
/**
 * Finds an already-provisioned city-level proxy for a target.
 *
 * City candidates are tried in order. For each, the first proxy that is
 * residential, not "unavailable", has an id, and matches by name or by config
 * for that city is returned.
 *
 * @param {object[]} proxies - Proxies to search.
 * @param {object} target - Target with country, state and cityCandidates.
 * @returns {object|null} The first matching proxy, or null when no candidate
 *   matches (always null for an empty candidate list).
 */
function findExistingProxy(proxies, target) {
  for (const city of target.cityCandidates) {
    const expectedName = proxyName(target.country, target.state, city);
    const hit = proxies.find((proxy) => {
      if (proxy.type !== "residential") return false;
      if (proxy.status === "unavailable") return false;
      if (!proxy.id) return false;
      return proxy.name === expectedName || configMatches(proxy.config, target, city);
    });
    if (hit) {
      return hit;
    }
  }
  return null;
}
|
|
2371
|
+
/**
 * Derives a state-level target from an existing target.
 *
 * @param {object} target - Base target; all of its properties are carried
 *   over, including any city fields.
 * @returns {object} Copy of the target with level "state", a state proxy name
 *   and a config of country and state only.
 */
function stateTarget(target) {
  const { country, state } = target;
  return {
    ...target,
    level: "state",
    proxyName: proxyName(country, state),
    config: { country, state }
  };
}
|
|
2382
|
+
/**
 * Finds an already-provisioned state-level proxy for a target.
 *
 * A proxy qualifies when it is residential, not "unavailable", has an id, and
 * either carries the state proxy name or has a config matching the target
 * with no city requirement.
 *
 * @param {object[]} proxies - Proxies to search.
 * @param {object} target - Target with country and state.
 * @returns {object|null} The first qualifying proxy, or null.
 */
function findExistingStateProxy(proxies, target) {
  const expectedName = proxyName(target.country, target.state);
  const match = proxies.find((proxy) => {
    if (proxy.type !== "residential") return false;
    if (proxy.status === "unavailable") return false;
    if (!proxy.id) return false;
    return proxy.name === expectedName || configMatches(proxy.config, target);
  });
  return match ?? null;
}
|
|
2386
|
+
/**
 * Chooses the target to use when a harvest is being retried.
 *
 * @param {object} target - The originally resolved target.
 * @param {number} attemptIndex - Zero-based attempt index; currently unused,
 *   every retry maps to the state-level target.
 * @returns {object} The state-level form of the target.
 */
function escalatedTargetLevel(target, attemptIndex) {
  const stateLevel = stateTarget(target);
  return stateLevel;
}
|
|
2389
|
+
/**
 * Extracts displayable text from a thrown value.
 *
 * @param {unknown} err - Anything that was thrown.
 * @returns {string} The message of an Error, otherwise the value as a string.
 */
function errorText2(err) {
  if (err instanceof Error) {
    return err.message;
  }
  return String(err);
}
|
|
2392
|
+
/**
 * Chooses the Kernel proxy id to use for one harvest attempt.
 *
 * Order of decisions:
 *  1. proxyMode "none": no proxy ("disabled").
 *  2. proxyMode "configured": the configured proxy id.
 *  3. No parsable US target, or no API key: the configured proxy id.
 *  4. Retry (attemptIndex >= 1): create a proxy for the escalated target.
 *  5. First attempt: reuse or create a ZIP proxy, then a city proxy for each
 *     city candidate, then a state proxy.
 * Whenever every step fails, or anything throws unexpectedly, the configured
 * proxy id is returned together with the collected error text.
 *
 * @param {object} options - proxyMode, configuredKernelProxyId, kernelApiKey,
 *   location, gl, proxyZip and attemptIndex.
 * @returns {Promise<object>} The value produced by `resolution(...)`.
 */
async function resolveKernelProxyId(options) {
  const { proxyMode, configuredKernelProxyId } = options;
  if (proxyMode === "none") {
    return resolution("disabled", proxyMode, void 0, null, null);
  }
  // Shorthand for "give up on location targeting, use the configured id".
  const useConfigured = (target, error) => resolution("configured_fallback", proxyMode, configuredKernelProxyId, target, error);
  if (proxyMode === "configured") {
    return useConfigured(null, null);
  }
  const target = parseKernelLocationProxyTarget(options.location, options.gl);
  if (!target) {
    return useConfigured(target, "location could not be normalized to a US city/state proxy target");
  }
  if (!options.kernelApiKey) {
    return useConfigured(target, null);
  }
  const kernel = new import_sdk2.default({ apiKey: options.kernelApiKey });

  // Tries to create one proxy. On success returns whatever `onCreated` builds
  // from the new id; on failure records "<label>: <reason>" in `errors` and
  // returns null. Request and result are built inside the try so that any
  // failure while building them is recorded the same way.
  const createProxy = async (label, buildRequest, onCreated, errors) => {
    try {
      const created = await kernel.proxies.create(buildRequest());
      if (created.id) {
        return onCreated(created.id);
      }
      errors.push(`${label}: Kernel did not return a proxy id`);
    } catch (err) {
      errors.push(`${label}: ${errorText2(err)}`);
    }
    return null;
  };

  try {
    const attemptIndex = options.attemptIndex ?? 0;
    if (attemptIndex >= 1) {
      // NOTE(review): unlike the first attempt, retries never look for an
      // existing proxy and always create one; confirm this is intentional.
      const escalatedTarget = escalatedTargetLevel(target, attemptIndex);
      const escalationErrors = [];
      const escalated = await createProxy(
        escalatedTarget.state,
        () => ({ type: "residential", name: escalatedTarget.proxyName, config: escalatedTarget.config }),
        (id) => resolution("location_created", proxyMode, id, escalatedTarget, null),
        escalationErrors
      );
      if (escalated) {
        return escalated;
      }
      return useConfigured(escalatedTarget, escalationErrors.join(" | "));
    }

    const proxies = await kernel.proxies.list();
    const zip = knownZipFor(target, options.proxyZip);
    const createErrors = [];

    if (zip) {
      const targetZip = zipTarget(target, zip);
      const existingZip = findExistingTargetProxy(proxies, targetZip);
      if (existingZip?.id) {
        return resolution("location_reused", proxyMode, existingZip.id, targetZip, null);
      }
      const createdZip = await createProxy(
        zip,
        // The create request carries country and zip only (no state).
        () => ({ type: "residential", name: targetZip.proxyName, config: { country: targetZip.country, zip } }),
        (id) => resolution("location_created", proxyMode, id, targetZip, null),
        createErrors
      );
      if (createdZip) {
        return createdZip;
      }
    }

    const existing = findExistingProxy(proxies, target);
    if (existing?.id) {
      return resolution("location_reused", proxyMode, existing.id, target, createErrors.join(" | ") || null);
    }

    for (const city of target.cityCandidates) {
      const createdCity = await createProxy(
        city,
        () => ({
          type: "residential",
          name: proxyName(target.country, target.state, city),
          config: { country: target.country, state: target.state, city }
        }),
        (id) => resolution("location_created", proxyMode, id, {
          ...target,
          level: "city",
          city,
          proxyName: proxyName(target.country, target.state, city),
          config: { country: target.country, state: target.state, city }
        }, null),
        createErrors
      );
      if (createdCity) {
        return createdCity;
      }
    }

    const fallbackTarget = stateTarget(target);
    const existingState = findExistingStateProxy(proxies, fallbackTarget);
    if (existingState?.id) {
      return resolution("location_reused", proxyMode, existingState.id, fallbackTarget, createErrors.join(" | "));
    }
    const createdState = await createProxy(
      fallbackTarget.state,
      () => ({ type: "residential", name: fallbackTarget.proxyName, config: fallbackTarget.config }),
      (id) => resolution("location_created", proxyMode, id, fallbackTarget, createErrors.join(" | ")),
      createErrors
    );
    if (createdState) {
      return createdState;
    }
    return useConfigured(target, createErrors.join(" | "));
  } catch (err) {
    return useConfigured(target, errorText2(err));
  }
}
|
|
2506
|
+
|
|
1369
2507
|
// src/harvest.ts
|
|
1370
2508
|
var MAX_ATTEMPTS = 3;
|
|
1371
|
-
|
|
2509
|
+
/**
 * Maps an aborted signal to the error that should be surfaced.
 *
 * @param {AbortSignal} signal - The signal whose `reason` is inspected.
 * @returns {Error} The signal's own reason when it is a DOMException named
 *   "TimeoutError"; otherwise a new RequestAbortedError.
 */
function abortReason(signal) {
  const { reason } = signal;
  const isTimeout = reason instanceof DOMException && reason.name === "TimeoutError";
  if (isTimeout) {
    return reason;
  }
  return new RequestAbortedError();
}
|
|
2513
|
+
/**
 * Reads an AbortSignal from untyped harvest options.
 *
 * @param {unknown} rawOptions - Options as supplied by the caller.
 * @returns {AbortSignal|undefined} `rawOptions.signal` when rawOptions is an
 *   object and the value is an AbortSignal instance; otherwise undefined.
 */
function getAbortSignal(rawOptions) {
  const isObject = Boolean(rawOptions) && typeof rawOptions === "object";
  if (!isObject) {
    return void 0;
  }
  const candidate = rawOptions.signal;
  return candidate instanceof AbortSignal ? candidate : void 0;
}
|
|
2519
|
+
/**
 * Reads the attempt-event callback from untyped harvest options.
 *
 * @param {unknown} rawOptions - Options as supplied by the caller.
 * @returns {Function|undefined} `rawOptions.onAttemptEvent` when rawOptions is
 *   an object and the value is a function; otherwise undefined.
 */
function getAttemptLogSink(rawOptions) {
  const isObject = Boolean(rawOptions) && typeof rawOptions === "object";
  if (!isObject) {
    return void 0;
  }
  const candidate = rawOptions.onAttemptEvent;
  if (typeof candidate !== "function") {
    return void 0;
  }
  return candidate;
}
|
|
2524
|
+
/**
 * Delivers an attempt event to the caller's sink without letting a failing
 * sink break the harvest.
 *
 * @param {Function|undefined} sink - Callback to notify; nothing happens when absent.
 * @param {object} event - Event payload; `attemptNumber` is used in the
 *   warning written when the sink throws or rejects.
 * @returns {Promise<void>}
 */
async function emitAttemptEvent(sink, event) {
  if (!sink) {
    return;
  }
  try {
    await sink(event);
  } catch (err) {
    // Event logging is best-effort: report the failure and carry on.
    const failure = {
      event: "harvest_attempt_log_failed",
      attempt_number: event.attemptNumber,
      message: err instanceof Error ? err.message : String(err)
    };
    console.warn(JSON.stringify(failure));
  }
}
|
|
2536
|
+
/**
 * Classifies a failed attempt for the "finished" event's `outcome` field.
 *
 * @param {unknown} err - The error the attempt ended with.
 * @returns {"captcha"|"request_aborted"|"timeout"|"error"} "captcha" for
 *   CaptchaError, "request_aborted" for RequestAbortedError, "timeout" for a
 *   DOMException named TimeoutError or AbortError or for any message that
 *   mentions a timeout/deadline, and "error" for everything else.
 */
function classifyAttemptError(err) {
  if (err instanceof CaptchaError) {
    return "captcha";
  }
  if (err instanceof RequestAbortedError) {
    return "request_aborted";
  }
  const isAbortLikeDomException = err instanceof DOMException && (err.name === "TimeoutError" || err.name === "AbortError");
  if (isAbortLikeDomException) {
    return "timeout";
  }
  // Fall back to sniffing the message for timeout wording.
  const text = err instanceof Error ? err.message : String(err);
  if (/timeout|timed out|Timeout \d+ms exceeded|deadline/i.test(text)) {
    return "timeout";
  }
  return "error";
}
|
|
2543
|
+
/**
 * Determines the outcome label of a successful attempt.
 *
 * @param {object} result - Harvest result; `diagnostics` may be absent.
 * @returns {string} The completion status reported in the diagnostics when
 *   there is one; otherwise "paa_found" if any questions were collected and
 *   "no_paa" if not.
 */
function classifyAttemptResult(result) {
  const reported = result.diagnostics?.completionStatus;
  if (reported !== null && reported !== void 0) {
    return reported;
  }
  return result.totalQuestions > 0 ? "paa_found" : "no_paa";
}
|
|
2546
|
+
/**
 * Extracts displayable text from a thrown value.
 *
 * @param {unknown} err - Anything that was thrown.
 * @returns {string} The message of an Error, otherwise the value as a string.
 */
function errorMessage(err) {
  if (!(err instanceof Error)) {
    return String(err);
  }
  return err.message;
}
|
|
2549
|
+
/**
 * Runs a single extraction attempt in a fresh browser session and always
 * closes that session afterwards.
 *
 * Errors from the extraction are captured and returned, never thrown, so the
 * caller also receives the cleanup report from `driver.close()`.
 *
 * Fix: the debug snapshot used to be computed with `result?.diagnostics.debug`
 * directly inside `finally`. A result without `diagnostics` (which
 * classifyAttemptResult treats as possible) or a throwing
 * `driver.getDebugSnapshot()` raised there and skipped `driver.close()`,
 * leaking the session. `diagnostics` is now read optionally and the close
 * call sits in its own `finally`.
 *
 * @param {object} options - Parsed harvest options for this attempt.
 * @param {AbortSignal} [signal] - Aborts the attempt when triggered.
 * @returns {Promise<{result: (object|null), error: unknown, cleanup: *, debug: (object|null)}>}
 *   Exactly one of `result` / `error` is set; `cleanup` is what
 *   `driver.close()` resolved to.
 */
async function extractOnce(options, signal) {
  const driver = new BrowserDriver();
  const reporter = new ProgressReporter();
  const extractor = new PAAExtractor(driver, reporter);
  if (signal?.aborted) {
    // Already cancelled: do not start extracting, but still close the driver.
    return {
      result: null,
      error: abortReason(signal),
      cleanup: await driver.close(),
      debug: null
    };
  }
  let onAbort;
  // Rejects as soon as the signal fires so the race below can settle early.
  const abortPromise = signal ? new Promise((_, reject) => {
    onAbort = () => reject(abortReason(signal));
    signal.addEventListener("abort", onAbort, { once: true });
  }) : null;
  let result = null;
  let error = null;
  let cleanup;
  let debug = null;
  try {
    const extraction = extractor.extract(options, signal);
    // If the abort wins the race, the extraction may reject later with
    // nobody awaiting it; this handler keeps that from being unhandled.
    if (abortPromise) extraction.catch(() => {
    });
    result = await (abortPromise ? Promise.race([extraction, abortPromise]) : extraction);
  } catch (err) {
    error = err;
  } finally {
    if (signal && onAbort) signal.removeEventListener("abort", onAbort);
    try {
      // Prefer the snapshot embedded in the result; otherwise, in debug mode,
      // synthesize a minimal one from the request options.
      debug = result?.diagnostics?.debug ?? (options.debug ? {
        enabled: true,
        request: {
          query: options.query,
          locationInput: options.location ?? null,
          canonicalLocation: null,
          uule: null,
          gl: options.gl,
          hl: options.hl,
          device: options.device,
          proxyMode: options.proxyMode,
          proxyZip: options.proxyZip ?? null,
          serpOnly: options.serpOnly,
          pages: options.pages ?? 1
        },
        browser: driver.getDebugSnapshot()
      } : null);
    } finally {
      // Must run even if building the snapshot failed.
      cleanup = await driver.close();
    }
  }
  return error ? { result: null, error, cleanup, debug } : { result, error: null, cleanup, debug };
}
|
|
1381
2600
|
/**
 * Runs a harvest, retrying on CAPTCHA up to MAX_ATTEMPTS times.
 *
 * For every attempt a proxy is resolved, the options are validated with
 * HarvestOptionsSchema, a "started" event is emitted, and one extraction is
 * run. Each outcome is reported to the optional `onAttemptEvent` sink as a
 * "finished" event. On success the result is written in the requested
 * format(s) and returned.
 *
 * Fix: a non-CAPTCHA attempt error used to be reported twice. The error was
 * rethrown from inside the `try` after its "finished" event, so the outer
 * `catch` emitted a second "finished" event for the same attempt with a null
 * session id and empty cleanup data. The `catch` now skips the event when the
 * failure has already been reported. The repeated event-building code is also
 * factored into local helpers; event field order and values are unchanged.
 *
 * @param {object} rawOptions - Untyped harvest options. May also carry
 *   `signal` (AbortSignal) and `onAttemptEvent` (function).
 * @returns {Promise<object>} The harvest result of the first successful attempt.
 * @throws {CaptchaError} When every attempt ended in a CAPTCHA.
 * @throws Any other error from an attempt, rethrown unchanged.
 */
async function harvest(rawOptions) {
  const raw = typeof rawOptions === "object" && rawOptions !== null ? rawOptions : {};
  const signal = getAbortSignal(rawOptions);
  const onAttemptEvent = getAttemptLogSink(rawOptions);
  const requestedProxyMode = raw.proxyMode;
  // Anything other than "none"/"configured" means location-based proxies.
  const proxyMode = requestedProxyMode === "none" ? "none" : requestedProxyMode === "configured" ? "configured" : "location";
  const kernelApiKey = typeof raw.kernelApiKey === "string" ? raw.kernelApiKey.trim() : process.env.KERNEL_API_KEY?.trim();
  const configuredKernelProxyId = typeof raw.kernelProxyId === "string" ? raw.kernelProxyId.trim() : process.env.KERNEL_PROXY_ID?.trim();
  const proxyOpts = {
    kernelApiKey,
    proxyMode,
    configuredKernelProxyId,
    location: typeof raw.location === "string" ? raw.location : void 0,
    proxyZip: typeof raw.proxyZip === "string" ? raw.proxyZip : void 0,
    gl: typeof raw.gl === "string" ? raw.gl : "us"
  };
  const serializer = new OutputSerializer();

  // Cleanup report used when an attempt failed before any session existed.
  const noSessionCleanup = () => ({
    kernelSessionId: null,
    kernelDeleteStarted: false,
    kernelDeleteSucceeded: null,
    kernelDeleteError: null,
    browserCloseSucceeded: null,
    browserCloseError: null
  });

  // Emits one "finished" event; the session id always comes from `cleanup`.
  const emitFinished = (attemptNumber, startedAtMs, details) => emitAttemptEvent(onAttemptEvent, {
    type: "finished",
    attemptNumber,
    maxAttempts: MAX_ATTEMPTS,
    outcome: details.outcome,
    kernelSessionId: details.cleanup.kernelSessionId,
    questionCount: details.questionCount,
    durationMs: Date.now() - startedAtMs,
    error: details.error,
    willRetry: details.willRetry,
    cleanup: details.cleanup,
    debug: details.debug,
    completedAt: (/* @__PURE__ */ new Date()).toISOString()
  });

  // Logs a CAPTCHA outcome and emits its "finished" event.
  const reportCaptcha = async (err, attemptNumber, startedAtMs, willRetry, cleanup, debug) => {
    console.warn(JSON.stringify({
      event: "harvest_attempt_captcha",
      attempt_number: attemptNumber,
      max_attempts: MAX_ATTEMPTS,
      message: err.message,
      will_retry: willRetry
    }));
    await emitFinished(attemptNumber, startedAtMs, {
      outcome: "captcha",
      questionCount: 0,
      error: err.message,
      willRetry,
      cleanup,
      debug
    });
  };

  for (let i = 0; i < MAX_ATTEMPTS; i++) {
    const attemptNumber = i + 1;
    const startedAtMs = Date.now();
    const willRetryOnCaptcha = i < MAX_ATTEMPTS - 1;
    // Set once the attempt's failure has produced its "finished" event, so
    // the catch block below does not report the same failure again.
    let failureReported = false;
    try {
      if (signal?.aborted) throw abortReason(signal);
      const resolution2 = await resolveKernelProxyId({ ...proxyOpts, attemptIndex: i });
      const mergedAttempt = {
        ...raw,
        kernelApiKey,
        kernelProxyId: resolution2.kernelProxyId,
        kernelProxyResolution: resolution2.resolution,
        proxyMode
      };
      if (proxyMode === "none") mergedAttempt.kernelProxyId = void 0;
      const attemptOptions = HarvestOptionsSchema.parse(mergedAttempt);
      await emitAttemptEvent(onAttemptEvent, {
        type: "started",
        attemptNumber,
        maxAttempts: MAX_ATTEMPTS,
        query: attemptOptions.query,
        location: attemptOptions.location ?? null,
        maxQuestions: attemptOptions.maxQuestions,
        startedAt: new Date(startedAtMs).toISOString()
      });
      console.info(JSON.stringify({
        event: "harvest_attempt_started",
        attempt_number: attemptNumber,
        max_attempts: MAX_ATTEMPTS,
        query: attemptOptions.query,
        location: attemptOptions.location ?? null,
        max_questions: attemptOptions.maxQuestions
      }));
      const attempt = await extractOnce(attemptOptions, signal);
      if (attempt.error) {
        const err = attempt.error;
        if (err instanceof CaptchaError) {
          await reportCaptcha(err, attemptNumber, startedAtMs, willRetryOnCaptcha, attempt.cleanup, attempt.debug);
          if (willRetryOnCaptcha) continue;
          break;
        }
        await emitFinished(attemptNumber, startedAtMs, {
          outcome: classifyAttemptError(err),
          questionCount: 0,
          error: errorMessage(err),
          willRetry: false,
          cleanup: attempt.cleanup,
          debug: attempt.debug
        });
        failureReported = true;
        throw err;
      }
      const result = attempt.result;
      if (!result) throw new Error("Harvest attempt completed without a result");
      await emitFinished(attemptNumber, startedAtMs, {
        outcome: classifyAttemptResult(result),
        questionCount: result.totalQuestions,
        error: null,
        willRetry: false,
        cleanup: attempt.cleanup,
        debug: attempt.debug
      });
      if (attemptOptions.format === "json" || attemptOptions.format === "both") {
        await serializer.writeJSON(result, attemptOptions.outputDir);
      }
      if (attemptOptions.format === "csv" || attemptOptions.format === "both") {
        // Optional surfaces are written only when they have content.
        await Promise.all([
          serializer.writeCSV(result.flat, attemptOptions.outputDir),
          result.videos.length > 0 ? serializer.writeVideoCSV(result.videos, result.seed, attemptOptions.outputDir) : Promise.resolve(""),
          result.forums.length > 0 ? serializer.writeForumCSV(result.forums, result.seed, attemptOptions.outputDir) : Promise.resolve(""),
          result.aiOverview.detected ? serializer.writeAIOverviewCSV(result.aiOverview.citations, result.aiOverview.text, result.seed, attemptOptions.outputDir) : Promise.resolve(""),
          result.aiMode.detected ? serializer.writeAIModeCSV(result.aiMode.citations, result.aiMode.text, result.seed, attemptOptions.outputDir) : Promise.resolve(""),
          result.whatPeopleSaying.length > 0 ? serializer.writeWhatPeopleSayingCSV(result.whatPeopleSaying, result.seed, attemptOptions.outputDir) : Promise.resolve("")
        ]);
      }
      return result;
    } catch (err) {
      // Reached for failures outside extractOnce (abort, proxy resolution,
      // option validation, output writing) and for rethrown attempt errors.
      if (err instanceof CaptchaError) {
        await reportCaptcha(err, attemptNumber, startedAtMs, willRetryOnCaptcha, noSessionCleanup(), null);
        if (willRetryOnCaptcha) continue;
        break;
      }
      if (!failureReported) {
        await emitFinished(attemptNumber, startedAtMs, {
          outcome: classifyAttemptError(err),
          questionCount: 0,
          error: errorMessage(err),
          willRetry: false,
          cleanup: noSessionCleanup(),
          debug: null
        });
      }
      throw err;
    }
  }
  // Only reachable when the final attempt also ended in a CAPTCHA.
  console.warn(JSON.stringify({
    event: "harvest_captcha_exhausted",
    max_attempts: MAX_ATTEMPTS,
    session_kind: kernelApiKey ? "kernel" : "local"
  }));
  throw new CaptchaError(sanitizeVendorName(`CAPTCHA on all ${MAX_ATTEMPTS} fresh sessions. Try again in a few minutes.`));
}
|
|
1417
2789
|
|
|
1418
2790
|
// src/video/VideoGenerator.ts
|
|
1419
2791
|
var import_node_child_process2 = require("child_process");
|
|
1420
|
-
var
|
|
1421
|
-
var
|
|
1422
|
-
var
|
|
1423
|
-
var
|
|
2792
|
+
var import_node_fs4 = require("fs");
|
|
2793
|
+
var import_node_os2 = require("os");
|
|
2794
|
+
var import_node_path4 = require("path");
|
|
2795
|
+
var import_client3 = require("@fal-ai/client");
|
|
1424
2796
|
|
|
1425
2797
|
// src/video/promptBuilder.ts
|
|
1426
2798
|
var DEEPINFRA_URL = "https://api.deepinfra.com/v1/openai/chat/completions";
|
|
@@ -1490,72 +2862,78 @@ async function buildClipPrompts(question, answer) {
|
|
|
1490
2862
|
}
|
|
1491
2863
|
throw new Error("No LLM key \u2014 set DEEPINFRA_API_KEY or OPENROUTER_API_KEY");
|
|
1492
2864
|
}
|
|
2865
|
+
function extractEpisodePrompts(brief) {
|
|
2866
|
+
if (!brief.clip1 || !brief.clip2 || !brief.voiceover || !brief.audioMood) {
|
|
2867
|
+
throw new Error("Episode brief is missing prompt fields \u2014 run blog-to-video skill to regenerate");
|
|
2868
|
+
}
|
|
2869
|
+
return { clip1: brief.clip1, clip2: brief.clip2, voiceover: brief.voiceover, audioMood: brief.audioMood };
|
|
2870
|
+
}
|
|
1493
2871
|
|
|
1494
2872
|
// src/video/AudioGenerator.ts
|
|
1495
|
-
var
|
|
2873
|
+
var import_node_fs2 = require("fs");
|
|
2874
|
+
var import_node_path2 = require("path");
|
|
2875
|
+
var import_node_os = require("os");
|
|
2876
|
+
var import_client = require("@fal-ai/client");
|
|
1496
2877
|
var MMAUDIO_MODEL = "fal-ai/mmaudio-v2";
|
|
1497
|
-
var
|
|
1498
|
-
|
|
1499
|
-
|
|
1500
|
-
const
|
|
1501
|
-
|
|
1502
|
-
|
|
1503
|
-
|
|
1504
|
-
|
|
1505
|
-
if (!submitRes.ok) throw new Error(`${model} submit failed (${submitRes.status}): ${await submitRes.text()}`);
|
|
1506
|
-
const { request_id } = await submitRes.json();
|
|
1507
|
-
console.log(`[fal] submitted ${model} \u2192 ${request_id}`);
|
|
1508
|
-
while (true) {
|
|
1509
|
-
await new Promise((r) => setTimeout(r, 5e3));
|
|
1510
|
-
const statusRes = await fetch(`${QUEUE_BASE}/${model}/requests/${request_id}/status`, { headers });
|
|
1511
|
-
if (!statusRes.ok) continue;
|
|
1512
|
-
const { status } = await statusRes.json();
|
|
1513
|
-
console.log(`[fal] ${request_id} \u2192 ${status}`);
|
|
1514
|
-
if (status === "FAILED") throw new Error(`${model} request ${request_id} failed`);
|
|
1515
|
-
if (status !== "COMPLETED") continue;
|
|
1516
|
-
const resultRes = await fetch(`${QUEUE_BASE}/${model}/requests/${request_id}`, { headers });
|
|
1517
|
-
if (!resultRes.ok) throw new Error(`Result fetch failed (${resultRes.status})`);
|
|
1518
|
-
return await resultRes.json();
|
|
1519
|
-
}
|
|
1520
|
-
}
|
|
1521
|
-
function getKey() {
|
|
1522
|
-
const key = process.env["FAL_KEY"];
|
|
1523
|
-
if (!key) throw new Error("FAL_KEY required");
|
|
1524
|
-
return key;
|
|
1525
|
-
}
|
|
1526
|
-
async function generateVoiceover(text, voice = "Serena (en)") {
|
|
2878
|
+
var ELEVENLABS_MODEL = "fal-ai/elevenlabs/tts";
|
|
2879
|
+
var GEMINI_TTS_MODEL = "fal-ai/google/gemini-2.5-flash-preview-tts";
|
|
2880
|
+
async function downloadAudio(url) {
|
|
2881
|
+
const res = await fetch(url);
|
|
2882
|
+
if (!res.ok) throw new Error(`Failed to download TTS audio (${res.status})`);
|
|
2883
|
+
return Buffer.from(await res.arrayBuffer());
|
|
2884
|
+
}
|
|
2885
|
+
async function generateVoiceover(text) {
|
|
1527
2886
|
console.log("[AudioGenerator] Generating voiceover...");
|
|
1528
|
-
const
|
|
1529
|
-
|
|
2887
|
+
const outDir = (0, import_node_path2.join)((0, import_node_os.tmpdir)(), `tts-${Date.now()}`);
|
|
2888
|
+
(0, import_node_fs2.mkdirSync)(outDir, { recursive: true });
|
|
2889
|
+
const outPath = (0, import_node_path2.join)(outDir, "voiceover.mp3");
|
|
2890
|
+
try {
|
|
2891
|
+
const voiceId = process.env["ELEVENLABS_VOICE_ID"] ?? "pNInz6obpgDQGcFmaJgB";
|
|
2892
|
+
const result2 = await import_client.fal.run(ELEVENLABS_MODEL, {
|
|
2893
|
+
input: { text, voice_id: voiceId, model_id: "eleven_v3" }
|
|
2894
|
+
});
|
|
2895
|
+
(0, import_node_fs2.writeFileSync)(outPath, await downloadAudio(result2.audio.url));
|
|
2896
|
+
console.log("[AudioGenerator] TTS: ElevenLabs via fal");
|
|
2897
|
+
return outPath;
|
|
2898
|
+
} catch (err) {
|
|
2899
|
+
console.warn("[AudioGenerator] ElevenLabs via fal failed, trying Gemini:", err.message);
|
|
2900
|
+
}
|
|
2901
|
+
const voice = process.env["GEMINI_TTS_VOICE"] ?? "Kore";
|
|
2902
|
+
const result = await import_client.fal.run(GEMINI_TTS_MODEL, { input: { text, voice } });
|
|
2903
|
+
(0, import_node_fs2.writeFileSync)(outPath, await downloadAudio(result.audio.url));
|
|
2904
|
+
console.log("[AudioGenerator] TTS: Gemini via fal");
|
|
2905
|
+
return outPath;
|
|
1530
2906
|
}
|
|
1531
2907
|
async function addBackgroundAudio(videoUrl, mood, durationSeconds) {
|
|
1532
2908
|
console.log("[AudioGenerator] Adding background audio via MMAudio V2...");
|
|
1533
|
-
const
|
|
1534
|
-
|
|
1535
|
-
|
|
1536
|
-
|
|
1537
|
-
|
|
1538
|
-
|
|
1539
|
-
|
|
1540
|
-
|
|
2909
|
+
const result = await import_client.fal.run(MMAUDIO_MODEL, {
|
|
2910
|
+
input: {
|
|
2911
|
+
video_url: videoUrl,
|
|
2912
|
+
prompt: mood,
|
|
2913
|
+
negative_prompt: "speech, voice, talking, dialogue, narration, vocals, singing, human voice, conversation, words, lyrics, announcer, commentary",
|
|
2914
|
+
duration: durationSeconds,
|
|
2915
|
+
cfg_strength: 4.5
|
|
2916
|
+
}
|
|
2917
|
+
});
|
|
2918
|
+
return result.video.url;
|
|
1541
2919
|
}
|
|
1542
2920
|
|
|
1543
2921
|
// src/video/VideoMixer.ts
|
|
1544
2922
|
var import_node_child_process = require("child_process");
|
|
1545
|
-
var
|
|
1546
|
-
var
|
|
1547
|
-
var
|
|
2923
|
+
var import_node_fs3 = require("fs");
|
|
2924
|
+
var import_node_path3 = require("path");
|
|
2925
|
+
var import_client2 = require("@fal-ai/client");
|
|
1548
2926
|
async function download(url, destPath) {
|
|
1549
2927
|
const res = await fetch(url);
|
|
1550
2928
|
if (!res.ok) throw new Error(`Download failed (${res.status}): ${url}`);
|
|
1551
|
-
(0,
|
|
2929
|
+
(0, import_node_fs3.writeFileSync)(destPath, Buffer.from(await res.arrayBuffer()));
|
|
1552
2930
|
}
|
|
1553
2931
|
async function concatenateClips(clip1Url, clip2Url, outDir) {
|
|
1554
|
-
(0,
|
|
2932
|
+
(0, import_node_fs3.mkdirSync)(outDir, { recursive: true });
|
|
1555
2933
|
const ts = Date.now();
|
|
1556
|
-
const p1 = (0,
|
|
1557
|
-
const p2 = (0,
|
|
1558
|
-
const out = (0,
|
|
2934
|
+
const p1 = (0, import_node_path3.join)(outDir, `clip1-${ts}.mp4`);
|
|
2935
|
+
const p2 = (0, import_node_path3.join)(outDir, `clip2-${ts}.mp4`);
|
|
2936
|
+
const out = (0, import_node_path3.join)(outDir, `combined-${ts}.mp4`);
|
|
1559
2937
|
console.log("[VideoMixer] Downloading clips...");
|
|
1560
2938
|
await Promise.all([download(clip1Url, p1), download(clip2Url, p2)]);
|
|
1561
2939
|
console.log("[VideoMixer] Concatenating...");
|
|
@@ -1567,14 +2945,14 @@ async function concatenateClips(clip1Url, clip2Url, outDir) {
|
|
|
1567
2945
|
async function uploadToFal(localPath) {
|
|
1568
2946
|
const { readFileSync: readFileSync2 } = await import("fs");
|
|
1569
2947
|
const blob = new Blob([readFileSync2(localPath)], { type: "video/mp4" });
|
|
1570
|
-
const url = await
|
|
2948
|
+
const url = await import_client2.fal.storage.upload(blob);
|
|
1571
2949
|
console.log("[VideoMixer] Uploaded to fal:", url);
|
|
1572
2950
|
return url;
|
|
1573
2951
|
}
|
|
1574
2952
|
async function overlayVoiceover(videoPath, voiceoverUrl, outDir) {
|
|
1575
2953
|
const ts = Date.now();
|
|
1576
|
-
const wav = (0,
|
|
1577
|
-
const out = (0,
|
|
2954
|
+
const wav = (0, import_node_path3.join)(outDir, `voiceover-${ts}.wav`);
|
|
2955
|
+
const out = (0, import_node_path3.join)(outDir, `final-${ts}.mp4`);
|
|
1578
2956
|
console.log("[VideoMixer] Downloading voiceover...");
|
|
1579
2957
|
await download(voiceoverUrl, wav);
|
|
1580
2958
|
console.log("[VideoMixer] Mixing voiceover over background audio...");
|
|
@@ -1599,30 +2977,30 @@ function buildInput(prompt, opts, seed, imageUrl) {
|
|
|
1599
2977
|
};
|
|
1600
2978
|
}
|
|
1601
2979
|
async function generate(model, input) {
|
|
1602
|
-
const { request_id } = await
|
|
2980
|
+
const { request_id } = await import_client3.fal.queue.submit(model, { input });
|
|
1603
2981
|
console.log(`[fal] submitted ${model} \u2192 ${request_id}`);
|
|
1604
2982
|
while (true) {
|
|
1605
2983
|
await new Promise((r) => setTimeout(r, 5e3));
|
|
1606
|
-
const s = await
|
|
2984
|
+
const s = await import_client3.fal.queue.status(model, { requestId: request_id, logs: false });
|
|
1607
2985
|
console.log(`[fal] ${request_id} \u2192 ${s.status}`);
|
|
1608
2986
|
if (s.status === "FAILED") throw new Error(`Request ${request_id} failed`);
|
|
1609
2987
|
if (s.status !== "COMPLETED") continue;
|
|
1610
|
-
const result = await
|
|
2988
|
+
const result = await import_client3.fal.queue.result(model, { requestId: request_id });
|
|
1611
2989
|
return result.data;
|
|
1612
2990
|
}
|
|
1613
2991
|
}
|
|
1614
2992
|
async function extractLastFrame(videoUrl, outDir) {
|
|
1615
2993
|
const ts = Date.now();
|
|
1616
|
-
const mp4Path = (0,
|
|
1617
|
-
const jpgPath = (0,
|
|
2994
|
+
const mp4Path = (0, import_node_path4.join)(outDir, `clip1-raw-${ts}.mp4`);
|
|
2995
|
+
const jpgPath = (0, import_node_path4.join)(outDir, `last-frame-${ts}.jpg`);
|
|
1618
2996
|
const res = await fetch(videoUrl);
|
|
1619
2997
|
if (!res.ok) throw new Error(`Failed to download clip 1 (${res.status})`);
|
|
1620
|
-
(0,
|
|
2998
|
+
(0, import_node_fs4.writeFileSync)(mp4Path, Buffer.from(await res.arrayBuffer()));
|
|
1621
2999
|
try {
|
|
1622
3000
|
(0, import_node_child_process2.execSync)(`ffmpeg -sseof -0.1 -i "${mp4Path}" -vframes 1 -y "${jpgPath}" -loglevel error`);
|
|
1623
3001
|
} finally {
|
|
1624
3002
|
try {
|
|
1625
|
-
(0,
|
|
3003
|
+
(0, import_node_fs4.unlinkSync)(mp4Path);
|
|
1626
3004
|
} catch {
|
|
1627
3005
|
}
|
|
1628
3006
|
}
|
|
@@ -1632,11 +3010,11 @@ var VideoGenerator = class {
|
|
|
1632
3010
|
constructor(apiKey) {
|
|
1633
3011
|
const key = apiKey ?? process.env["FAL_KEY"];
|
|
1634
3012
|
if (!key) throw new Error("FAL_KEY is required");
|
|
1635
|
-
|
|
3013
|
+
import_client3.fal.config({ credentials: key });
|
|
1636
3014
|
}
|
|
1637
3015
|
async generateClipPair(question, answer, opts = {}) {
|
|
1638
|
-
const outDir = opts.outputDir ?? (0,
|
|
1639
|
-
(0,
|
|
3016
|
+
const outDir = opts.outputDir ?? (0, import_node_path4.join)((0, import_node_os2.tmpdir)(), `paa-video-${Date.now()}`);
|
|
3017
|
+
(0, import_node_fs4.mkdirSync)(outDir, { recursive: true });
|
|
1640
3018
|
console.log("\n[1/7] Generating prompts via QWEN 3.6...");
|
|
1641
3019
|
const prompts = await buildClipPrompts(question, answer);
|
|
1642
3020
|
console.log(" Voiceover:", prompts.voiceover);
|
|
@@ -1645,10 +3023,55 @@ var VideoGenerator = class {
|
|
|
1645
3023
|
const result1 = await generate(T2V, buildInput(prompts.clip1, opts, opts.seed));
|
|
1646
3024
|
console.log("\n[3/7] Extracting last frame \u2192 clip 2 start...");
|
|
1647
3025
|
const jpgPath = await extractLastFrame(result1.video.url, outDir);
|
|
1648
|
-
const imageBlob = new Blob([(0,
|
|
1649
|
-
const frameUrl = await
|
|
3026
|
+
const imageBlob = new Blob([(0, import_node_fs4.readFileSync)(jpgPath)], { type: "image/jpeg" });
|
|
3027
|
+
const frameUrl = await import_client3.fal.storage.upload(imageBlob);
|
|
3028
|
+
try {
|
|
3029
|
+
(0, import_node_fs4.unlinkSync)(jpgPath);
|
|
3030
|
+
} catch {
|
|
3031
|
+
}
|
|
3032
|
+
console.log("\n[4/7] Generating clip 2 (image-to-video from last frame)...");
|
|
3033
|
+
const seed2 = opts.seed !== void 0 ? opts.seed + 1 : void 0;
|
|
3034
|
+
const result2 = await generate(I2V, buildInput(prompts.clip2, opts, seed2, frameUrl));
|
|
3035
|
+
console.log("\n[5/7] Concatenating clips + generating voiceover (parallel)...");
|
|
3036
|
+
const [combinedPath, voiceoverUrl] = await Promise.all([
|
|
3037
|
+
concatenateClips(result1.video.url, result2.video.url, outDir),
|
|
3038
|
+
generateVoiceover(prompts.voiceover)
|
|
3039
|
+
]);
|
|
3040
|
+
console.log("\n[6/7] Adding background audio via MMAudio V2...");
|
|
3041
|
+
const falVideoUrl = await uploadToFal(combinedPath);
|
|
3042
|
+
const totalDuration = (opts.clipDurationSeconds ?? 8) * 2;
|
|
3043
|
+
const videoWithAudioUrl = await addBackgroundAudio(falVideoUrl, prompts.audioMood, totalDuration);
|
|
3044
|
+
console.log("\n[7/7] Overlaying voiceover on final video...");
|
|
3045
|
+
const videoWithAudioPath = (0, import_node_path4.join)(outDir, `with-bg-audio-${Date.now()}.mp4`);
|
|
3046
|
+
const bgRes = await fetch(videoWithAudioUrl);
|
|
3047
|
+
(0, import_node_fs4.writeFileSync)(videoWithAudioPath, Buffer.from(await bgRes.arrayBuffer()));
|
|
3048
|
+
const finalVideoPath = await overlayVoiceover(videoWithAudioPath, voiceoverUrl, outDir);
|
|
3049
|
+
return {
|
|
3050
|
+
clip1Url: result1.video.url,
|
|
3051
|
+
clip2Url: result2.video.url,
|
|
3052
|
+
finalVideoPath,
|
|
3053
|
+
seed: result1.seed,
|
|
3054
|
+
promptClip1: prompts.clip1,
|
|
3055
|
+
promptClip2: prompts.clip2,
|
|
3056
|
+
voiceover: prompts.voiceover,
|
|
3057
|
+
audioMood: prompts.audioMood
|
|
3058
|
+
};
|
|
3059
|
+
}
|
|
3060
|
+
async generateEpisode(brief, opts = {}) {
|
|
3061
|
+
const outDir = opts.outputDir ?? (0, import_node_path4.join)((0, import_node_os2.tmpdir)(), `episode-${brief.episodeNumber}-${Date.now()}`);
|
|
3062
|
+
(0, import_node_fs4.mkdirSync)(outDir, { recursive: true });
|
|
3063
|
+
const prompts = extractEpisodePrompts(brief);
|
|
3064
|
+
console.log(`
|
|
3065
|
+
[Episode ${brief.episodeNumber}/${brief.episodeCount}] ${brief.sectionTitle}`);
|
|
3066
|
+
console.log(" Voiceover:", prompts.voiceover);
|
|
3067
|
+
console.log("\n[2/7] Generating clip 1 (text-to-video)...");
|
|
3068
|
+
const result1 = await generate(T2V, buildInput(prompts.clip1, opts, opts.seed));
|
|
3069
|
+
console.log("\n[3/7] Extracting last frame \u2192 clip 2 start...");
|
|
3070
|
+
const jpgPath = await extractLastFrame(result1.video.url, outDir);
|
|
3071
|
+
const imageBlob = new Blob([(0, import_node_fs4.readFileSync)(jpgPath)], { type: "image/jpeg" });
|
|
3072
|
+
const frameUrl = await import_client3.fal.storage.upload(imageBlob);
|
|
1650
3073
|
try {
|
|
1651
|
-
(0,
|
|
3074
|
+
(0, import_node_fs4.unlinkSync)(jpgPath);
|
|
1652
3075
|
} catch {
|
|
1653
3076
|
}
|
|
1654
3077
|
console.log("\n[4/7] Generating clip 2 (image-to-video from last frame)...");
|
|
@@ -1657,16 +3080,16 @@ var VideoGenerator = class {
|
|
|
1657
3080
|
console.log("\n[5/7] Concatenating clips + generating voiceover (parallel)...");
|
|
1658
3081
|
const [combinedPath, voiceoverUrl] = await Promise.all([
|
|
1659
3082
|
concatenateClips(result1.video.url, result2.video.url, outDir),
|
|
1660
|
-
generateVoiceover(prompts.voiceover
|
|
3083
|
+
generateVoiceover(prompts.voiceover)
|
|
1661
3084
|
]);
|
|
1662
3085
|
console.log("\n[6/7] Adding background audio via MMAudio V2...");
|
|
1663
3086
|
const falVideoUrl = await uploadToFal(combinedPath);
|
|
1664
3087
|
const totalDuration = (opts.clipDurationSeconds ?? 8) * 2;
|
|
1665
3088
|
const videoWithAudioUrl = await addBackgroundAudio(falVideoUrl, prompts.audioMood, totalDuration);
|
|
1666
3089
|
console.log("\n[7/7] Overlaying voiceover on final video...");
|
|
1667
|
-
const videoWithAudioPath = (0,
|
|
3090
|
+
const videoWithAudioPath = (0, import_node_path4.join)(outDir, `with-bg-audio-${Date.now()}.mp4`);
|
|
1668
3091
|
const bgRes = await fetch(videoWithAudioUrl);
|
|
1669
|
-
(0,
|
|
3092
|
+
(0, import_node_fs4.writeFileSync)(videoWithAudioPath, Buffer.from(await bgRes.arrayBuffer()));
|
|
1670
3093
|
const finalVideoPath = await overlayVoiceover(videoWithAudioPath, voiceoverUrl, outDir);
|
|
1671
3094
|
return {
|
|
1672
3095
|
clip1Url: result1.video.url,
|