mcp-scraper 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +56 -0
- package/dist/bin/api-server.cjs +9256 -0
- package/dist/bin/api-server.cjs.map +1 -0
- package/dist/bin/api-server.d.cts +1 -0
- package/dist/bin/api-server.d.ts +1 -0
- package/dist/bin/api-server.js +38 -0
- package/dist/bin/api-server.js.map +1 -0
- package/dist/bin/mcp-stdio-server.cjs +840 -0
- package/dist/bin/mcp-stdio-server.cjs.map +1 -0
- package/dist/bin/mcp-stdio-server.d.cts +1 -0
- package/dist/bin/mcp-stdio-server.d.ts +1 -0
- package/dist/bin/mcp-stdio-server.js +41 -0
- package/dist/bin/mcp-stdio-server.js.map +1 -0
- package/dist/bin/paa-harvest.cjs +1438 -0
- package/dist/bin/paa-harvest.cjs.map +1 -0
- package/dist/bin/paa-harvest.d.cts +1 -0
- package/dist/bin/paa-harvest.d.ts +1 -0
- package/dist/bin/paa-harvest.js +37 -0
- package/dist/bin/paa-harvest.js.map +1 -0
- package/dist/chunk-4API3ZCT.js +1387 -0
- package/dist/chunk-4API3ZCT.js.map +1 -0
- package/dist/chunk-LXZDJJXR.js +476 -0
- package/dist/chunk-LXZDJJXR.js.map +1 -0
- package/dist/chunk-ZBP4RHNW.js +805 -0
- package/dist/chunk-ZBP4RHNW.js.map +1 -0
- package/dist/db-IOYMX64U.js +87 -0
- package/dist/db-IOYMX64U.js.map +1 -0
- package/dist/index.cjs +1689 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +210 -0
- package/dist/index.d.ts +210 -0
- package/dist/index.js +275 -0
- package/dist/index.js.map +1 -0
- package/dist/server-63DR2HE5.js +6062 -0
- package/dist/server-63DR2HE5.js.map +1 -0
- package/dist/worker-3ECJHPRE.js +88 -0
- package/dist/worker-3ECJHPRE.js.map +1 -0
- package/package.json +76 -0
package/dist/index.cjs
ADDED
|
@@ -0,0 +1,1689 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Bundler interop helpers: short aliases for the Object built-ins used below.
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;

// Defines one enumerable getter on `target` for every enumerable key of `all`;
// each value of `all` is used directly as the getter function.
var __export = (target, all) => {
  for (const name in all) {
    __defProp(target, name, { get: all[name], enumerable: true });
  }
};

// Mirrors the own properties of `from` onto `to` as live getters. Keys that
// `to` already owns, and the key named by `except`, are left alone. The
// enumerable flag follows the source descriptor (true when none is found).
var __copyProps = (to, from, except, desc) => {
  const copyable = (from && typeof from === "object") || typeof from === "function";
  if (copyable) {
    for (const key of __getOwnPropNames(from)) {
      if (__hasOwnProp.call(to, key) || key === except) continue;
      const descriptor = __getOwnPropDesc(from, key);
      __defProp(to, key, {
        get: () => from[key],
        enumerable: !descriptor || descriptor.enumerable
      });
    }
  }
  return to;
};

// Wraps a required module in a namespace-like object whose prototype is the
// prototype of `mod`. `default` is set to the module itself when the importer
// is in node mode, when `mod` is falsy, or when `mod.__esModule` is not set.
var __toESM = (mod, isNodeMode, target) => {
  target = mod != null ? __create(__getProtoOf(mod)) : {};
  const needsDefault = isNodeMode || !mod || !mod.__esModule;
  if (needsDefault) {
    __defProp(target, "default", { value: mod, enumerable: true });
  }
  return __copyProps(target, mod);
};

// Produces the CommonJS export object: a fresh object flagged `__esModule`
// onto which every own property of `mod` is mirrored as a getter.
var __toCommonJS = (mod) => {
  const exported = __defProp({}, "__esModule", { value: true });
  return __copyProps(exported, mod);
};
|
|
29
|
+
|
|
30
|
+
// src/index.ts
|
|
31
|
+
var src_exports = {};
|
|
32
|
+
__export(src_exports, {
|
|
33
|
+
VideoGenerator: () => VideoGenerator,
|
|
34
|
+
buildClipPrompts: () => buildClipPrompts,
|
|
35
|
+
harvest: () => harvest
|
|
36
|
+
});
|
|
37
|
+
module.exports = __toCommonJS(src_exports);
|
|
38
|
+
|
|
39
|
+
// src/schemas.ts
|
|
40
|
+
var import_zod = require("zod");
|
|
41
|
+
var HarvestOptionsSchema = import_zod.z.object({
|
|
42
|
+
query: import_zod.z.string().min(1),
|
|
43
|
+
location: import_zod.z.string().optional(),
|
|
44
|
+
gl: import_zod.z.string().length(2).default("us"),
|
|
45
|
+
hl: import_zod.z.string().length(2).default("en"),
|
|
46
|
+
depth: import_zod.z.number().int().min(1).max(30).default(3),
|
|
47
|
+
maxQuestions: import_zod.z.number().int().min(1).max(1e3).default(100),
|
|
48
|
+
headless: import_zod.z.boolean().default(false),
|
|
49
|
+
profileDir: import_zod.z.string().optional(),
|
|
50
|
+
proxy: import_zod.z.string().url().optional(),
|
|
51
|
+
kernelApiKey: import_zod.z.string().optional(),
|
|
52
|
+
kernelProxyId: import_zod.z.string().optional(),
|
|
53
|
+
outputDir: import_zod.z.string().default("./paa-output"),
|
|
54
|
+
format: import_zod.z.enum(["json", "csv", "both"]).default("both"),
|
|
55
|
+
serpOnly: import_zod.z.boolean().default(false),
|
|
56
|
+
pages: import_zod.z.number().int().min(1).max(2).default(1)
|
|
57
|
+
});
|
|
58
|
+
var MapsPlaceOptionsSchema = import_zod.z.object({
|
|
59
|
+
businessName: import_zod.z.string().min(1),
|
|
60
|
+
location: import_zod.z.string().min(1),
|
|
61
|
+
gl: import_zod.z.string().length(2).default("us"),
|
|
62
|
+
hl: import_zod.z.string().length(2).default("en"),
|
|
63
|
+
includeReviews: import_zod.z.boolean().default(false),
|
|
64
|
+
maxReviews: import_zod.z.number().int().min(1).max(500).default(50),
|
|
65
|
+
kernelApiKey: import_zod.z.string().optional(),
|
|
66
|
+
kernelProxyId: import_zod.z.string().optional(),
|
|
67
|
+
headless: import_zod.z.boolean().default(true)
|
|
68
|
+
});
|
|
69
|
+
var RawPAAItemSchema = import_zod.z.object({
|
|
70
|
+
question: import_zod.z.string().min(1),
|
|
71
|
+
answer: import_zod.z.string().optional(),
|
|
72
|
+
sourceTitle: import_zod.z.string().optional(),
|
|
73
|
+
sourceSite: import_zod.z.string().optional(),
|
|
74
|
+
sourceCite: import_zod.z.string().optional()
|
|
75
|
+
});
|
|
76
|
+
|
|
77
|
+
// src/driver/BrowserDriver.ts
|
|
78
|
+
var import_playwright_extra = require("playwright-extra");
|
|
79
|
+
var import_puppeteer_extra_plugin_stealth = __toESM(require("puppeteer-extra-plugin-stealth"), 1);
|
|
80
|
+
var import_playwright = require("playwright");
|
|
81
|
+
var import_sdk = __toESM(require("@onkernel/sdk"), 1);
|
|
82
|
+
|
|
83
|
+
// src/selectors.ts
|
|
84
|
+
var PAASelectors = {
|
|
85
|
+
container: ".eJH8qe.adDDi",
|
|
86
|
+
dataInitq: "[data-initq]",
|
|
87
|
+
item: ".related-question-pair",
|
|
88
|
+
itemDataQ: "data-q",
|
|
89
|
+
itemDataInitQ: "data-initq",
|
|
90
|
+
itemQuestionEl: ".JlqpRe",
|
|
91
|
+
answerContainer: ".bCOlv",
|
|
92
|
+
sourceTitle: "h3",
|
|
93
|
+
sourceSite: ".VuuXrf",
|
|
94
|
+
sourceCite: "cite",
|
|
95
|
+
clickTarget: ".JlqpRe",
|
|
96
|
+
expandedClass: "aoRk1c",
|
|
97
|
+
captchaMarker: '#captcha-form, #recaptcha, form[action*="/sorry/"], .g-recaptcha, [data-sitekey]'
|
|
98
|
+
};
|
|
99
|
+
var VideoSelectors = {
|
|
100
|
+
container: 'div[jscontroller="HWk0Gf"]',
|
|
101
|
+
sectionHeading: '.mgAbYb[role="heading"]',
|
|
102
|
+
item: "a.rIRoqf"
|
|
103
|
+
};
|
|
104
|
+
var ShortVideoSelectors = {
|
|
105
|
+
udm: "39",
|
|
106
|
+
item: "a.rIRoqf",
|
|
107
|
+
durationPattern: /^\d+:\d+$/,
|
|
108
|
+
platforms: ["YouTube", "TikTok", "Instagram", "Facebook", "X"]
|
|
109
|
+
};
|
|
110
|
+
var ForumSelectors = {
|
|
111
|
+
section: ".ULSxyf",
|
|
112
|
+
item: "a.KYg7td.INpicf",
|
|
113
|
+
title: ".hyYc0c",
|
|
114
|
+
source: ".K4ETW"
|
|
115
|
+
};
|
|
116
|
+
var WhatPeopleSayingSelectors = {
|
|
117
|
+
sectionTag: "g-section-with-header",
|
|
118
|
+
sectionHeadingText: "What people are saying",
|
|
119
|
+
card: '.dRzkFf[role="listitem"]',
|
|
120
|
+
cardLink: 'a.WlydOe[jsname="YKoRaf"]',
|
|
121
|
+
titleH1: "h1.WQWxe",
|
|
122
|
+
titleDiv: ".eAaXgc",
|
|
123
|
+
popularCommentLabel: ".qgdis",
|
|
124
|
+
source: ".sTl1Td",
|
|
125
|
+
platformBadge: ".appd0, .KrMNbf",
|
|
126
|
+
ytChannel: ".sjVJQd",
|
|
127
|
+
ytDate: ".PLq9Je",
|
|
128
|
+
authorNote: ".nDgy9d"
|
|
129
|
+
};
|
|
130
|
+
var AIOverviewSelectors = {
|
|
131
|
+
root: '[data-hveid="CBMQAA"]',
|
|
132
|
+
wrapper: ".Fgyi2e",
|
|
133
|
+
citations: '.Fgyi2e [data-hveid] a[jsname="pxBnId"]'
|
|
134
|
+
};
|
|
135
|
+
var AIModeSelectors = {
|
|
136
|
+
root: '[data-hveid="CAUQAA"]',
|
|
137
|
+
wrapper: ".Fgyi2e",
|
|
138
|
+
citations: '.Fgyi2e [data-hveid] a[jsname="pxBnId"]'
|
|
139
|
+
};
|
|
140
|
+
var OrganicSelectors = {
|
|
141
|
+
result: ".wHYlTd.tF2Cxc",
|
|
142
|
+
title: "h3.LC20lb",
|
|
143
|
+
siteName: ".VuuXrf",
|
|
144
|
+
cite: "cite.tjvcx",
|
|
145
|
+
snippet: ".VwiC3b",
|
|
146
|
+
redditCite: "cite.qLRx3b",
|
|
147
|
+
ratingWrap: ".Y0A0hc",
|
|
148
|
+
ratingValue: ".yi40Hd",
|
|
149
|
+
reviewCount: ".RDApEe"
|
|
150
|
+
};
|
|
151
|
+
var LocalPackSelectors = {
|
|
152
|
+
headingText: "Businesses",
|
|
153
|
+
card: ".w7Dbne",
|
|
154
|
+
name: ".OSrXXb",
|
|
155
|
+
ratingValue: ".yi40Hd",
|
|
156
|
+
reviewCount: ".RDApEe"
|
|
157
|
+
};
|
|
158
|
+
|
|
159
|
+
// src/errors.ts
|
|
160
|
+
var RECAPTCHA_INSTRUCTIONS = "Google returned a CAPTCHA. Run with --headless=false to re-warm the browser profile, then retry.";
|
|
161
|
+
/**
 * Error raised when a CAPTCHA page is detected.
 *
 * The message is "CAPTCHA detected. " followed by the supplied instructions,
 * and the raw instructions are also kept on the `instructions` property.
 */
var CaptchaError = class extends Error {
  /**
   * @param {string} instructions - Guidance text appended to the message.
   */
  constructor(instructions) {
    super(`CAPTCHA detected. ${instructions}`);
    // Assigned in this order so the instance's own keys stay
    // ["instructions", "name"].
    this.instructions = instructions;
    this.name = "CaptchaError";
  }
};
|
|
169
|
+
/**
 * Error raised when navigation or page extraction fails.
 *
 * `cause` is always present as an own property (undefined when the caller
 * does not supply one).
 */
var ExtractionError = class extends Error {
  /**
   * @param {string} message - Human-readable failure description.
   * @param {unknown} [cause] - Underlying error or value, stored as-is.
   */
  constructor(message, cause) {
    super(message);
    // Assigned in this order so the instance's own keys stay ["cause", "name"].
    this.cause = cause;
    this.name = "ExtractionError";
  }
};
|
|
177
|
+
|
|
178
|
+
// src/driver/BrowserDriver.ts
|
|
179
|
+
import_playwright_extra.chromium.use((0, import_puppeteer_extra_plugin_stealth.default)());
|
|
180
|
+
var DESKTOP_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36";
|
|
181
|
+
/**
 * Converts a channel reference into the URL of that channel's "videos" tab.
 *
 * Accepted inputs:
 *   - a full http(s) URL, or a scheme-less `youtube.com/...`, `www.youtube.com/...`
 *     or `m.youtube.com/...` URL, whose path starts with `@handle`,
 *     `channel/<id>`, `c/<name>` or `user/<name>`;
 *   - a bare `@handle` or handle name (leading/trailing slashes and a trailing
 *     `/videos` are ignored);
 *   - a bare channel ID of the form `UC` + at least 20 word/dash characters.
 *
 * @param {string} channelInput - Channel handle, channel ID, or channel URL.
 * @returns {string} Absolute `https://www.youtube.com/.../videos` URL.
 * @throws {Error} When the input is blank, is a URL on another host, has an
 *   unsupported path, or is not a valid handle.
 */
function buildYouTubeChannelVideosUrl(channelInput) {
  const input = channelInput.trim();
  if (input === "") throw new Error("channelHandle is required");

  const hasScheme = /^https?:\/\//i.test(input);
  const isUrl = hasScheme || /^(www\.|m\.)?youtube\.com\//i.test(input);

  if (!isUrl) {
    // Handle / channel-ID form: drop surrounding slashes and a trailing "/videos".
    const bare = input
      .replace(/^\/+/, "")
      .replace(/\/+$/, "")
      .replace(/\/videos$/i, "");
    if (/^UC[\w-]{20,}$/.test(bare)) {
      return `https://www.youtube.com/channel/${bare}/videos`;
    }
    const handle = bare.startsWith("@") ? bare : `@${bare}`;
    if (/^@[\w.-]+$/.test(handle)) {
      return `https://www.youtube.com/${handle}/videos`;
    }
    throw new Error("channelHandle must be an @handle, UC channel ID, or YouTube channel URL");
  }

  // URL form: only the host and the first two path segments matter.
  const { hostname, pathname } = new URL(hasScheme ? input : `https://${input}`);
  const host = hostname.replace(/^www\./, "").replace(/^m\./, "").toLowerCase();
  if (host !== "youtube.com") throw new Error("channel URL must be on youtube.com");

  const [first = "", second = ""] = pathname.split("/").filter(Boolean);
  if (first.startsWith("@")) {
    return `https://www.youtube.com/${first}/videos`;
  }
  if (second) {
    if (first === "channel") {
      return `https://www.youtube.com/channel/${second}/videos`;
    }
    if (first === "c" || first === "user") {
      return `https://www.youtube.com/${first}/${second}/videos`;
    }
  }
  throw new Error("channel URL must be a YouTube handle, /channel/UC..., /c/..., or /user/... URL");
}
|
|
208
|
+
/**
 * Owns one browser page and the resources behind it.
 *
 * The page is backed either by a remote Kernel browser session (when
 * `config.kernelApiKey` is given) or by a locally launched Chromium
 * (persistent profile when `config.profileDir` is given, otherwise a fresh
 * browser + context). `close()` releases whatever `launch()` acquired.
 */
var BrowserDriver = class {
  browser = null;
  context = null;
  page = null;
  kernelClient = null;
  kernelSessionId = null;

  /**
   * Acquires a browser, context and page according to `config`.
   *
   * @param {object} config - Reads `kernelApiKey`, `kernelProxyId`, `headless`,
   *   `proxy`, `viewport`, `locale` and `profileDir`.
   * @returns {Promise<void>}
   * @throws Re-throws any error from the Kernel SDK or Playwright. On the
   *   Kernel path, everything acquired so far (including the remote session)
   *   is released before the error propagates.
   */
  async launch(config) {
    if (config.kernelApiKey) {
      this.kernelClient = new import_sdk.default({ apiKey: config.kernelApiKey });
      try {
        const kernelBrowser = await this.kernelClient.browsers.create({
          stealth: true,
          timeout_seconds: 600,
          ...(config.kernelProxyId ? { proxy_id: config.kernelProxyId } : {})
        });
        this.kernelSessionId = kernelBrowser.session_id;
        this.browser = await import_playwright.chromium.connectOverCDP(kernelBrowser.cdp_ws_url);
        // Reuse the session's existing context/page when there is one.
        this.context = this.browser.contexts()[0] ?? await this.browser.newContext();
        await this.installEsbuildHelperShims(this.context);
        this.page = this.context.pages()[0] ?? await this.context.newPage();
      } catch (err) {
        // The remote session may already exist even though no browser handle
        // was obtained; release it so a failed launch does not leak it.
        await this.close().catch(
          (closeErr) => console.warn("Kernel launch cleanup failed:", closeErr)
        );
        throw err;
      }
      return;
    }
    const launchOpts = {
      headless: config.headless,
      proxy: config.proxy ? { server: config.proxy } : void 0
    };
    const ctxOpts = {
      viewport: config.viewport,
      locale: config.locale,
      userAgent: DESKTOP_USER_AGENT
    };
    if (config.profileDir) {
      // Persistent profile: Playwright returns the context directly, so
      // `this.browser` stays null on this path.
      this.context = await import_playwright_extra.chromium.launchPersistentContext(config.profileDir, {
        ...launchOpts,
        ...ctxOpts
      });
      await this.installEsbuildHelperShims(this.context);
      this.page = await this.context.newPage();
    } else {
      this.browser = await import_playwright_extra.chromium.launch(launchOpts);
      this.context = await this.browser.newContext(ctxOpts);
      await this.installEsbuildHelperShims(this.context);
      this.page = await this.context.newPage();
    }
  }

  /**
   * Registers an init script that defines pass-through `__name` and
   * `__publicField` globals in every page of `context` when they are missing.
   * NOTE(review): presumably needed because functions passed to
   * `page.evaluate` can reference these bundler-injected helpers — confirm.
   *
   * @param {object} context - Playwright browser context.
   * @returns {Promise<void>}
   */
  async installEsbuildHelperShims(context) {
    await context.addInitScript(() => {
      const g = globalThis;
      if (typeof g.__name !== "function") g.__name = (fn) => fn;
      if (typeof g.__publicField !== "function") g.__publicField = (obj, key, value) => {
        obj[key] = value;
        return value;
      };
    });
  }

  /**
   * Opens a Google results page for `query` and reports whether
   * "People also ask" items are present.
   *
   * @param {string} query - Search query (`q`).
   * @param {string|undefined} uule - Optional `uule` location parameter.
   * @param {string} gl - Value for the `gl` parameter.
   * @param {string} hl - Value for the `hl` parameter.
   * @returns {Promise<{hasPaa: boolean}>}
   * @throws {ExtractionError} When navigation fails (original error in `cause`).
   * @throws {CaptchaError} When a CAPTCHA marker is found and not cleared.
   */
  async navigateToSERP(query, uule, gl, hl) {
    const params = new URLSearchParams({ q: query, gl, hl });
    if (uule) params.set("uule", uule);
    const url = "https://www.google.com/search?" + params.toString();
    try {
      await this.page.goto(url, { waitUntil: "domcontentloaded", timeout: 45e3 });
    } catch (err) {
      const diag = await this.captureDiagnostics(url);
      throw new ExtractionError(`page.goto failed: ${err.message} | ${diag}`, err);
    }
    const captchaCount = await this.page.locator(PAASelectors.captchaMarker).count();
    if (captchaCount > 0) {
      if (this.kernelClient) {
        // On a Kernel session, give the challenge up to 45 s to clear before
        // giving up.
        try {
          await this.page.waitForSelector(PAASelectors.container, { timeout: 45e3 });
          return { hasPaa: true };
        } catch {
          throw new CaptchaError(this.captchaMessage());
        }
      }
      throw new CaptchaError(this.captchaMessage());
    }
    // Fast path: items already rendered shortly after load.
    const fastFound = await this.page.waitForSelector(PAASelectors.item, { timeout: 4e3 }).catch(() => null);
    if (fastFound) return { hasPaa: true };
    const captchaAfter = await this.page.locator(PAASelectors.captchaMarker).count();
    if (captchaAfter > 0) throw new CaptchaError(this.captchaMessage());
    // Slow path: scroll down the page in six steps, checking after each one.
    for (let i = 1; i <= 6; i++) {
      await this.page.evaluate((f) => {
        window.scrollTo(0, document.body.scrollHeight * f);
      }, i / 6);
      await this.page.waitForTimeout(600);
      const count = await this.page.locator(PAASelectors.item).count();
      if (count > 0) return { hasPaa: true };
    }
    return { hasPaa: false };
  }

  /**
   * Builds a one-line description of the current page state for error
   * messages: intended vs final URL, title, detected flags and the first
   * 400 characters of body text. Never throws; failures are reported in the
   * returned string.
   *
   * @param {string} intendedUrl - URL that navigation was aiming for.
   * @returns {Promise<string>}
   */
  async captureDiagnostics(intendedUrl) {
    try {
      const finalUrl = this.page.url();
      const title = await this.page.title().catch(() => "");
      const bodySnippet = await this.page.evaluate(() => {
        const t = (document.body?.innerText ?? "").replace(/\s+/g, " ").trim();
        return t.slice(0, 400);
      }).catch(() => "");
      const consent = /consent\.google\./.test(finalUrl) || /before you continue/i.test(bodySnippet);
      const recaptcha = /recaptcha|unusual traffic|are you a robot/i.test(bodySnippet);
      const flags = [
        consent ? "CONSENT_WALL" : "",
        recaptcha ? "BOT_CHALLENGE" : "",
        finalUrl !== intendedUrl ? "REDIRECTED" : ""
      ].filter(Boolean).join(",");
      return `intended=${intendedUrl} | final=${finalUrl} | title="${title}" | flags=[${flags}] | body="${bodySnippet}"`;
    } catch (e) {
      return `diagnostics-failed: ${e.message}`;
    }
  }

  /**
   * @returns {string} CAPTCHA guidance text: a Kernel-specific message when a
   *   Kernel client is active, otherwise the generic instructions.
   */
  captchaMessage() {
    return this.kernelClient ? "Google returned a CAPTCHA on this Kernel.sh session \u2014 retrying with a fresh session." : RECAPTCHA_INSTRUCTIONS;
  }

  /**
   * Navigates the page to `url`, waiting for DOMContentLoaded (45 s limit).
   *
   * @param {string} url
   * @returns {Promise<void>}
   * @throws {ExtractionError} On navigation failure (original error in `cause`).
   */
  async navigateTo(url) {
    try {
      await this.page.goto(url, { waitUntil: "domcontentloaded", timeout: 45e3 });
    } catch (err) {
      const diag = await this.captureDiagnostics(url);
      throw new ExtractionError(`page.goto failed: ${err.message} | ${diag}`, err);
    }
  }

  /**
   * Navigates to the videos tab of a YouTube channel, waiting for network
   * idle (30 s limit).
   *
   * @param {string} channelHandle - Handle, channel ID or channel URL.
   * @returns {Promise<void>}
   * @throws {Error} When `channelHandle` cannot be turned into a channel URL.
   * @throws {ExtractionError} On navigation failure (original error in `cause`).
   */
  async navigateToChannel(channelHandle) {
    const url = buildYouTubeChannelVideosUrl(channelHandle);
    try {
      await this.page.goto(url, { waitUntil: "networkidle", timeout: 3e4 });
    } catch (err) {
      const diag = await this.captureDiagnostics(url);
      throw new ExtractionError(`navigateToChannel failed: ${err.message} | ${diag}`, err);
    }
  }

  /**
   * Runs `fn(arg)` inside the page and returns its result.
   */
  async evaluate(fn, arg) {
    return this.page.evaluate(fn, arg);
  }

  /**
   * @returns {object|null} The current page, or null when not launched.
   */
  getPage() {
    return this.page;
  }

  /**
   * Releases everything this driver holds. Safe to call repeatedly and after
   * a partially failed `launch()`.
   *
   * The browser is closed when there is one, otherwise the standalone
   * (persistent) context. A Kernel session is always deleted afterwards, even
   * when no browser connection was ever established or closing it threw;
   * deletion failures are only logged.
   *
   * @returns {Promise<void>}
   */
  async close() {
    const browser = this.browser;
    const context = this.context;
    const client = this.kernelClient;
    const sessionId = this.kernelSessionId;
    // Drop references first so a second close() is a no-op.
    this.browser = null;
    this.context = null;
    this.page = null;
    this.kernelSessionId = null;
    this.kernelClient = null;
    try {
      if (browser) {
        await browser.close();
      } else if (context) {
        await context.close();
      }
    } finally {
      if (client && sessionId) {
        await client.browsers.deleteByID(sessionId).catch(
          (err) => console.warn("Kernel session cleanup failed:", err)
        );
      }
    }
  }
};
|
|
371
|
+
|
|
372
|
+
// src/locations.ts
|
|
373
|
+
var LOCATIONS = {
|
|
374
|
+
"austin": "Austin,Texas,United States",
|
|
375
|
+
"new york": "New York,New York,United States",
|
|
376
|
+
"new york city": "New York,New York,United States",
|
|
377
|
+
"nyc": "New York,New York,United States",
|
|
378
|
+
"los angeles": "Los Angeles,California,United States",
|
|
379
|
+
"la": "Los Angeles,California,United States",
|
|
380
|
+
"chicago": "Chicago,Illinois,United States",
|
|
381
|
+
"houston": "Houston,Texas,United States",
|
|
382
|
+
"phoenix": "Phoenix,Arizona,United States",
|
|
383
|
+
"philadelphia": "Philadelphia,Pennsylvania,United States",
|
|
384
|
+
"philly": "Philadelphia,Pennsylvania,United States",
|
|
385
|
+
"san antonio": "San Antonio,Texas,United States",
|
|
386
|
+
"dallas": "Dallas,Texas,United States",
|
|
387
|
+
"miami": "Miami,Florida,United States",
|
|
388
|
+
"seattle": "Seattle,Washington,United States",
|
|
389
|
+
"denver": "Denver,Colorado,United States",
|
|
390
|
+
"loveland": "Loveland,Colorado,United States",
|
|
391
|
+
"loveland co": "Loveland,Colorado,United States",
|
|
392
|
+
"fort collins": "Fort Collins,Colorado,United States",
|
|
393
|
+
"boulder": "Boulder,Colorado,United States",
|
|
394
|
+
"colorado springs": "Colorado Springs,Colorado,United States",
|
|
395
|
+
"boston": "Boston,Massachusetts,United States",
|
|
396
|
+
"atlanta": "Atlanta,Georgia,United States",
|
|
397
|
+
"san francisco": "San Francisco,California,United States",
|
|
398
|
+
"sf": "San Francisco,California,United States",
|
|
399
|
+
"portland": "Portland,Oregon,United States",
|
|
400
|
+
"las vegas": "Las Vegas,Nevada,United States",
|
|
401
|
+
"minneapolis": "Minneapolis,Minnesota,United States",
|
|
402
|
+
"detroit": "Detroit,Michigan,United States",
|
|
403
|
+
"nashville": "Nashville,Tennessee,United States",
|
|
404
|
+
"charlotte": "Charlotte,North Carolina,United States",
|
|
405
|
+
"orlando": "Orlando,Florida,United States",
|
|
406
|
+
"san diego": "San Diego,California,United States",
|
|
407
|
+
"baltimore": "Baltimore,Maryland,United States",
|
|
408
|
+
"sacramento": "Sacramento,California,United States",
|
|
409
|
+
"columbus": "Columbus,Ohio,United States",
|
|
410
|
+
"indianapolis": "Indianapolis,Indiana,United States",
|
|
411
|
+
"san jose": "San Jose,California,United States",
|
|
412
|
+
"fort worth": "Fort Worth,Texas,United States",
|
|
413
|
+
"jacksonville": "Jacksonville,Florida,United States",
|
|
414
|
+
"memphis": "Memphis,Tennessee,United States",
|
|
415
|
+
"louisville": "Louisville,Kentucky,United States",
|
|
416
|
+
"raleigh": "Raleigh,North Carolina,United States",
|
|
417
|
+
"richmond": "Richmond,Virginia,United States",
|
|
418
|
+
"salt lake city": "Salt Lake City,Utah,United States",
|
|
419
|
+
"toronto": "Toronto,Ontario,Canada",
|
|
420
|
+
"vancouver": "Vancouver,British Columbia,Canada",
|
|
421
|
+
"montreal": "Montreal,Quebec,Canada",
|
|
422
|
+
"calgary": "Calgary,Alberta,Canada",
|
|
423
|
+
"ottawa": "Ottawa,Ontario,Canada",
|
|
424
|
+
"london": "London,England,United Kingdom",
|
|
425
|
+
"manchester": "Manchester,England,United Kingdom",
|
|
426
|
+
"birmingham": "Birmingham,England,United Kingdom",
|
|
427
|
+
"edinburgh": "Edinburgh,Scotland,United Kingdom",
|
|
428
|
+
"glasgow": "Glasgow,Scotland,United Kingdom",
|
|
429
|
+
"leeds": "Leeds,England,United Kingdom",
|
|
430
|
+
"sydney": "Sydney,New South Wales,Australia",
|
|
431
|
+
"melbourne": "Melbourne,Victoria,Australia",
|
|
432
|
+
"brisbane": "Brisbane,Queensland,Australia",
|
|
433
|
+
"perth": "Perth,Western Australia,Australia",
|
|
434
|
+
"adelaide": "Adelaide,South Australia,Australia",
|
|
435
|
+
"dublin": "Dublin,Leinster,Ireland"
|
|
436
|
+
};
|
|
437
|
+
|
|
438
|
+
// src/uule.ts
|
|
439
|
+
/**
 * Encodes a canonical location name as a `uule` parameter value.
 *
 * The value is "w+" followed by the base64 of the byte sequence
 * `08 02 10 20 22 <length> <name>`, where `<name>` is the UTF-8 encoding of
 * `name` and `<length>` is its byte count written as a varint. For names
 * shorter than 64 bytes this is "w+CAIQICI", one length character, then
 * base64(name).
 *
 * The length is serialized together with the fixed header: base64-encoding it
 * separately and appending that to the 7-character "CAIQICI" prefix shifts
 * the bit alignment and yields a value that decodes to the wrong length and
 * name. The count is taken in bytes so non-ASCII names and names of 128 bytes
 * or more stay well-formed.
 *
 * @param {string} name - Canonical location name, e.g. "Austin,Texas,United States".
 * @returns {string} The `uule` value (standard base64, padding retained).
 */
function encodeUule(name) {
  const nameBytes = Buffer.from(name, "utf8");

  // Varint: 7 bits per byte, least-significant group first, high bit set on
  // every byte except the last.
  const lengthVarint = [];
  let remaining = nameBytes.length;
  do {
    const septet = remaining & 0x7f;
    remaining >>>= 7;
    lengthVarint.push(remaining === 0 ? septet : septet | 0x80);
  } while (remaining !== 0);

  const header = Buffer.from([0x08, 0x02, 0x10, 0x20, 0x22, ...lengthVarint]);
  return `w+${Buffer.concat([header, nameBytes]).toString("base64")}`;
}
|
|
443
|
+
/**
 * Resolves a location alias to its canonical name.
 *
 * The lookup key is `input` lower-cased and trimmed. Only own entries of
 * `LOCATIONS` count, so inputs such as "constructor" or "toString" are not
 * answered with members inherited from `Object.prototype`.
 *
 * @param {string} input - Location alias or free-form location name.
 * @returns {string} The canonical name for a known alias, otherwise `input`
 *   unchanged.
 */
function normalizeLocation(input) {
  const key = input.toLowerCase().trim();
  return Object.hasOwn(LOCATIONS, key) ? LOCATIONS[key] : input;
}
|
|
447
|
+
|
|
448
|
+
// src/lib/paa-answer-cleanup.ts
|
|
449
|
+
var MAX_ANSWER_LENGTH = 1200;
|
|
450
|
+
var BOILERPLATE_PATTERNS = [
|
|
451
|
+
/An AI Overview is not available for this search/gi,
|
|
452
|
+
/Can't generate an AI overview right now\.?\s*Try again later\.?/gi,
|
|
453
|
+
/\bAI Overview\b/gi,
|
|
454
|
+
/\bView all\b/gi
|
|
455
|
+
];
|
|
456
|
+
var CUT_MARKERS = [
|
|
457
|
+
/\bRelated Links\b/i,
|
|
458
|
+
/\bAsk anything in\s*AI Mode\b/i,
|
|
459
|
+
/\bAI can make mistakes\b/i,
|
|
460
|
+
/\bThis is for informational purposes only\b/i,
|
|
461
|
+
/\bShow more\b/i,
|
|
462
|
+
/\b\d+\s+sites\b/i,
|
|
463
|
+
/\b\d{1,2}\s*[msh]\s*[A-Z][A-Za-z]/,
|
|
464
|
+
/\b(?:YouTube|Reddit|Facebook|Instagram|TikTok)·/
|
|
465
|
+
];
|
|
466
|
+
/**
 * Normalizes spacing in scraped text.
 *
 * Applies the rules below in order, then trims: non-breaking spaces become
 * plain spaces; a space is inserted where words were fused together (after
 * sentence or clause punctuation followed by a capital, at lower→Upper-lower
 * and digit→Upper-lower boundaries, and between a lowercase letter and a
 * digit); finally every whitespace run collapses to one space.
 *
 * @param {string} text
 * @returns {string}
 */
function normalizeWhitespace(text) {
  const spacingRules = [
    [/\u00a0/g, " "],
    [/([.!?])([A-Z])/g, "$1 $2"],
    [/([:;])([A-Z])/g, "$1 $2"],
    [/([a-z])([A-Z][a-z])/g, "$1 $2"],
    [/(\d)([A-Z][a-z])/g, "$1 $2"],
    [/([a-z])(\d)/g, "$1 $2"],
    [/\s+/g, " "]
  ];
  let spaced = text;
  for (const [pattern, replacement] of spacingRules) {
    spaced = spaced.replace(pattern, replacement);
  }
  return spaced.trim();
}
|
|
469
|
+
/**
 * Truncates `text` at the earliest position where any `CUT_MARKERS` pattern
 * matches. Text without a match is returned unchanged.
 *
 * @param {string} text
 * @returns {string}
 */
function cutAtFirstMarker(text) {
  const earliest = CUT_MARKERS.reduce((best, marker) => {
    const hit = marker.exec(text);
    // Reset in case a marker carries a stateful (g/y) flag.
    marker.lastIndex = 0;
    if (hit && (best === -1 || hit.index < best)) return hit.index;
    return best;
  }, -1);
  return earliest === -1 ? text : text.slice(0, earliest);
}
|
|
478
|
+
/**
 * Truncates `text` where the source page title begins to repeat inside it.
 *
 * The cut only happens when the trimmed title is at least 8 characters long
 * and its first case-insensitive occurrence starts after index 40; otherwise
 * `text` is returned unchanged.
 *
 * @param {string} text
 * @param {string} [sourceTitle]
 * @returns {string}
 */
function cutAtSourceTitle(text, sourceTitle) {
  const needle = sourceTitle?.trim();
  if (!needle || needle.length < 8) {
    return text;
  }
  const position = text.toLowerCase().indexOf(needle.toLowerCase());
  if (position <= 40) {
    // Not found (-1) or too close to the start to be a trailing attribution.
    return text;
  }
  return text.slice(0, position);
}
|
|
484
|
+
/**
 * Chooses the index at which the text preceding a URL should be cut so that
 * a trailing source attribution is dropped.
 *
 * Checked in order:
 *   1. a bullet-prefixed date ("· Jan 5, 2024") starting after index 40;
 *   2. within the last 260 characters, the first sentence break whose
 *      remainder is longer than 20 characters, starts with a title-like word
 *      and has a title hint in its first 160 characters;
 *   3. the last sentence break within those 260 characters;
 *   4. otherwise the full length (no cut).
 *
 * @param {string} beforeUrl - Text that precedes the URL.
 * @returns {number} Index to slice `beforeUrl` at.
 */
function findAttributionCut(beforeUrl) {
  const datePosition = beforeUrl.search(
    /[•·]\s*(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{1,2},\s+\d{4}/i
  );
  if (datePosition > 40) return datePosition;

  const tailStart = Math.max(0, beforeUrl.length - 260);
  const tail = beforeUrl.slice(tailStart);

  // Offsets (within `tail`) just past each sentence-ending punctuation mark
  // that is followed by something resembling the start of a new sentence.
  const breakOffsets = Array.from(
    tail.matchAll(/[.!?]\s*(?=[A-Z][A-Za-z0-9"'$])/g),
    (found) => found.index + 1
  );

  const titleOffset = breakOffsets.find((offset) => {
    const remainder = tail.slice(offset).trim();
    if (remainder.length <= 20) return false;
    if (!/^(?:Best|Top|What|How|Why|When|Where|Which|Can|Should|Is|Are|Do|Does)\b/i.test(remainder)) return false;
    return /(?:\s[-|]\s|Heating|Cooling|Company|Services|Blog|Guide|Review)/i.test(remainder.slice(0, 160));
  });
  if (titleOffset !== undefined) return tailStart + titleOffset;

  if (breakOffsets.length > 0) return tailStart + breakOffsets[breakOffsets.length - 1];
  return beforeUrl.length;
}
|
|
502
|
+
/**
 * Drops a trailing "source title + URL" attribution from `text`.
 *
 * When an http(s) URL occurs after the very start of the text, everything
 * from the URL onward is discarded and the remaining prefix is cut at the
 * index chosen by `findAttributionCut`. Text with no URL, or with the URL at
 * index 0, is returned unchanged.
 *
 * @param {string} text
 * @returns {string}
 */
function cutAtUrlAttribution(text) {
  const urlStart = text.search(/https?:\/\/\S+/i);
  if (urlStart <= 0) {
    // -1: no URL at all; 0: text begins with the URL — leave both untouched.
    return text;
  }
  const beforeUrl = text.slice(0, urlStart);
  const cutIndex = findAttributionCut(beforeUrl);
  return beforeUrl.slice(0, cutIndex);
}
|
|
508
|
+
/**
 * Caps `text` at `MAX_ANSWER_LENGTH` characters, preferring to end on a
 * sentence boundary.
 *
 * Text within the limit is returned as-is. Otherwise the text is cut at the
 * limit and, when the last ".", "!" or "?" inside that window lies beyond
 * index 240, shortened further to end right after it. The result is trimmed.
 *
 * @param {string} text
 * @returns {string}
 */
function trimToSentenceLimit(text) {
  if (text.length <= MAX_ANSWER_LENGTH) {
    return text;
  }
  const capped = text.slice(0, MAX_ANSWER_LENGTH);
  const sentenceEnd = [".", "!", "?"].reduce(
    (furthest, mark) => Math.max(furthest, capped.lastIndexOf(mark)),
    -1
  );
  const kept = sentenceEnd > 240 ? capped.slice(0, sentenceEnd + 1) : capped;
  return kept.trim();
}
|
|
514
|
+
/**
 * Cleans the raw text scraped from a "People also ask" answer panel.
 *
 * Steps, in order: normalize spacing; strip a leading copy of the question;
 * reject the "An error has occurred" placeholder; blank out boilerplate
 * phrases and "Source+N" / "domain.tld+N" citation chips; cut at the first
 * cut marker, at the repeated source title and at a URL attribution; strip a
 * trailing date; cap the length.
 *
 * @param {string} [answer] - Raw answer text.
 * @param {string} [question] - Question the answer belongs to.
 * @param {string} [sourceTitle] - Title of the cited source page.
 * @returns {string|undefined} Cleaned text, or undefined when nothing usable
 *   remains.
 */
function cleanPAAAnswerText(answer, question, sourceTitle) {
  if (!answer) return undefined;

  let cleaned = normalizeWhitespace(answer);

  if (question) {
    const questionText = normalizeWhitespace(question);
    const repeatsQuestion =
      questionText !== "" && cleaned.toLowerCase().startsWith(questionText.toLowerCase());
    if (repeatsQuestion) {
      cleaned = cleaned.slice(questionText.length).trim();
    }
  }

  if (/^An error has occurred\.?\s*Please try again later\.?/i.test(cleaned)) {
    return undefined;
  }

  cleaned = BOILERPLATE_PATTERNS.reduce(
    (current, pattern) => current.replace(pattern, " "),
    cleaned
  );
  cleaned = cleaned
    .replace(/\b[A-Z][A-Za-z&'\u2019 -]{2,60}\+\d+\b/g, " ")
    .replace(/\b(?:[a-z0-9-]+\.)+[a-z]{2,}\+\d+\b/gi, " ");

  // Remaining passes each take the text and return the next version of it.
  const passes = [
    normalizeWhitespace,
    cutAtFirstMarker,
    (value) => cutAtSourceTitle(value, sourceTitle),
    cutAtUrlAttribution,
    normalizeWhitespace,
    (value) =>
      value
        .replace(/\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{1,2},\s+\d{4}$/i, "")
        .trim(),
    trimToSentenceLimit
  ];
  for (const pass of passes) {
    cleaned = pass(cleaned);
  }

  const isPlaceholder = /^An error has occurred\.?\s*Please try again later\.?$/i.test(cleaned);
  return !cleaned || isPlaceholder ? undefined : cleaned;
}
|
|
539
|
+
|
|
540
|
+
// src/extractor/PAAExtractor.ts
|
|
541
|
+
var PAAExtractor = class {
|
|
542
|
+
constructor(driver, reporter) {
|
|
543
|
+
this.driver = driver;
|
|
544
|
+
this.reporter = reporter;
|
|
545
|
+
}
|
|
546
|
+
driver;
|
|
547
|
+
reporter;
|
|
548
|
+
normalizeQuestion(q) {
|
|
549
|
+
return q.toLowerCase().replace(/[^\w\s]/g, "").replace(/\s+/g, " ").trim();
|
|
550
|
+
}
|
|
551
|
+
async extractVisibleItems(page) {
|
|
552
|
+
const sels = PAASelectors;
|
|
553
|
+
const raw = await page.evaluate((selectors) => {
|
|
554
|
+
function cleanText(el) {
|
|
555
|
+
if (!el) return "";
|
|
556
|
+
const parts = [];
|
|
557
|
+
for (const n of el.childNodes) {
|
|
558
|
+
if (n.nodeType === Node.TEXT_NODE) {
|
|
559
|
+
const text = n.textContent?.trim();
|
|
560
|
+
if (text) parts.push(text);
|
|
561
|
+
} else if (n.tagName === "STYLE" || n.tagName === "SCRIPT") {
|
|
562
|
+
continue;
|
|
563
|
+
} else {
|
|
564
|
+
const text = cleanText(n);
|
|
565
|
+
if (text) parts.push(text);
|
|
566
|
+
}
|
|
567
|
+
}
|
|
568
|
+
return parts.join(" ").replace(/\s+/g, " ").trim();
|
|
569
|
+
}
|
|
570
|
+
return Array.from(document.querySelectorAll(selectors.item)).map((pair) => ({
|
|
571
|
+
question: pair.getAttribute(selectors.itemDataQ) || pair.getAttribute(selectors.itemDataInitQ) || "",
|
|
572
|
+
answer: cleanText(pair.querySelector(selectors.answerContainer)) || void 0,
|
|
573
|
+
sourceTitle: pair.querySelector(selectors.sourceTitle)?.innerText?.trim() || void 0,
|
|
574
|
+
sourceSite: pair.querySelector(selectors.sourceSite)?.innerText?.trim() || void 0,
|
|
575
|
+
sourceCite: pair.querySelector(selectors.sourceCite)?.innerText?.trim() || void 0
|
|
576
|
+
}));
|
|
577
|
+
}, sels);
|
|
578
|
+
return raw.flatMap((item) => {
|
|
579
|
+
const cleaned = {
|
|
580
|
+
...item,
|
|
581
|
+
answer: cleanPAAAnswerText(item.answer, item.question, item.sourceTitle)
|
|
582
|
+
};
|
|
583
|
+
const result = RawPAAItemSchema.safeParse(cleaned);
|
|
584
|
+
if (!result.success) {
|
|
585
|
+
console.warn("[PAAExtractor] item parse failed:", item.question, result.error.issues[0]?.message);
|
|
586
|
+
return [];
|
|
587
|
+
}
|
|
588
|
+
return [result.data];
|
|
589
|
+
});
|
|
590
|
+
}
|
|
591
|
+
async clickItem(page, questionText) {
|
|
592
|
+
try {
|
|
593
|
+
const pairLocator = page.locator(
|
|
594
|
+
`${PAASelectors.item}[data-q="${questionText}"], ${PAASelectors.item}[data-initq="${questionText}"]`
|
|
595
|
+
).first();
|
|
596
|
+
await pairLocator.click();
|
|
597
|
+
} catch {
|
|
598
|
+
}
|
|
599
|
+
}
|
|
600
|
+
toFlatRow(item, depth, parentQuestion, seed) {
|
|
601
|
+
return {
|
|
602
|
+
seed_query: seed,
|
|
603
|
+
question: item.question,
|
|
604
|
+
answer: item.answer ?? "",
|
|
605
|
+
source_title: item.sourceTitle ?? "",
|
|
606
|
+
source_site: item.sourceSite ?? "",
|
|
607
|
+
source_cite: item.sourceCite ?? "",
|
|
608
|
+
depth,
|
|
609
|
+
parent_question: parentQuestion ?? "",
|
|
610
|
+
extracted_at: (/* @__PURE__ */ new Date()).toISOString()
|
|
611
|
+
};
|
|
612
|
+
}
|
|
613
|
+
async runBFS(page, options) {
|
|
614
|
+
const seenKeys = /* @__PURE__ */ new Set();
|
|
615
|
+
const seenQs = /* @__PURE__ */ new Set();
|
|
616
|
+
const depthMap = /* @__PURE__ */ new Map();
|
|
617
|
+
const results = [];
|
|
618
|
+
const readAllQs = () => page.evaluate(
|
|
619
|
+
({ sel, dataQ, dataInitQ, questionEl }) => Array.from(document.querySelectorAll(sel)).map(
|
|
620
|
+
(el) => el.getAttribute(dataQ) || el.getAttribute(dataInitQ) || el.querySelector(questionEl)?.innerText?.trim() || ""
|
|
621
|
+
).filter(Boolean),
|
|
622
|
+
{ sel: PAASelectors.item, dataQ: PAASelectors.itemDataQ, dataInitQ: PAASelectors.itemDataInitQ, questionEl: PAASelectors.itemQuestionEl }
|
|
623
|
+
);
|
|
624
|
+
const dupRates = [];
|
|
625
|
+
const orderedQs = [];
|
|
626
|
+
for (let round = 0; round < options.depth; round++) {
|
|
627
|
+
this.reporter.onDepth(round + 1);
|
|
628
|
+
if (seenQs.size >= options.maxQuestions) break;
|
|
629
|
+
const beforeQs = await readAllQs();
|
|
630
|
+
if (beforeQs.length >= options.maxQuestions) break;
|
|
631
|
+
const unexpandedItems = await page.$$(
|
|
632
|
+
`${PAASelectors.item}:not(.${PAASelectors.expandedClass})`
|
|
633
|
+
);
|
|
634
|
+
if (unexpandedItems.length === 0) break;
|
|
635
|
+
for (const item of unexpandedItems) {
|
|
636
|
+
try {
|
|
637
|
+
await item.scrollIntoViewIfNeeded();
|
|
638
|
+
await item.click({ force: true });
|
|
639
|
+
await page.waitForTimeout(500);
|
|
640
|
+
} catch {
|
|
641
|
+
}
|
|
642
|
+
}
|
|
643
|
+
await page.waitForTimeout(1500);
|
|
644
|
+
const afterQs = await readAllQs();
|
|
645
|
+
const newQs = afterQs.slice(beforeQs.length);
|
|
646
|
+
const newDups = newQs.filter((q) => seenQs.has(q)).length;
|
|
647
|
+
const dupRate = newQs.length > 0 ? newDups / newQs.length : 0;
|
|
648
|
+
dupRates.push(dupRate);
|
|
649
|
+
if (dupRates.length > 2) dupRates.shift();
|
|
650
|
+
const rollingDupRate = dupRates.reduce((a, b) => a + b, 0) / dupRates.length;
|
|
651
|
+
for (const q of afterQs) {
|
|
652
|
+
if (!seenQs.has(q)) {
|
|
653
|
+
seenQs.add(q);
|
|
654
|
+
orderedQs.push(q);
|
|
655
|
+
}
|
|
656
|
+
if (!depthMap.has(q)) depthMap.set(q, round + 1);
|
|
657
|
+
}
|
|
658
|
+
if (afterQs.length === beforeQs.length) break;
|
|
659
|
+
if (rollingDupRate >= 0.6) break;
|
|
660
|
+
}
|
|
661
|
+
const itemMap = new Map((await this.extractVisibleItems(page)).map((i) => [i.question, i]));
|
|
662
|
+
for (const q of orderedQs) {
|
|
663
|
+
if (results.length >= options.maxQuestions) break;
|
|
664
|
+
const key = this.normalizeQuestion(q);
|
|
665
|
+
if (seenKeys.has(key)) continue;
|
|
666
|
+
seenKeys.add(key);
|
|
667
|
+
const d = depthMap.get(q) ?? 1;
|
|
668
|
+
const item = itemMap.get(q);
|
|
669
|
+
if (item) {
|
|
670
|
+
results.push(this.toFlatRow(item, d, null, options.query));
|
|
671
|
+
this.reporter.onQuestion({ question: item.question, answer: item.answer ?? null, sourceTitle: item.sourceTitle ?? null, sourceSite: item.sourceSite ?? null, sourceCite: item.sourceCite ?? null, depth: d, parentQuestion: null, children: [] });
|
|
672
|
+
} else {
|
|
673
|
+
results.push(this.toFlatRow({ question: q, answer: void 0, sourceTitle: void 0, sourceSite: void 0, sourceCite: void 0 }, d, null, options.query));
|
|
674
|
+
}
|
|
675
|
+
}
|
|
676
|
+
return results;
|
|
677
|
+
}
|
|
678
|
+
async extractVideos(page) {
|
|
679
|
+
const vsels = VideoSelectors;
|
|
680
|
+
return page.evaluate((sels) => {
|
|
681
|
+
const results = [];
|
|
682
|
+
const containers = Array.from(document.querySelectorAll(sels.container));
|
|
683
|
+
for (const container of containers) {
|
|
684
|
+
const headingEl = container.querySelector(sels.sectionHeading);
|
|
685
|
+
const headingText = headingEl?.textContent?.trim() ?? "";
|
|
686
|
+
const type = headingText.toLowerCase().includes("short") ? "short_video" : "video";
|
|
687
|
+
const items = Array.from(container.querySelectorAll(sels.item));
|
|
688
|
+
for (const a of items) {
|
|
689
|
+
const href = a.href;
|
|
690
|
+
if (!href || !href.includes("youtube") && !href.includes("youtu.be")) continue;
|
|
691
|
+
const raw = a.textContent?.trim() ?? "";
|
|
692
|
+
const ytIdx = raw.indexOf("YouTube");
|
|
693
|
+
if (ytIdx === -1) continue;
|
|
694
|
+
const title = raw.slice(0, ytIdx).trim();
|
|
695
|
+
const remainder = raw.slice(ytIdx + 7).replace(/^[·\s·]+/, "");
|
|
696
|
+
const channelMatch = remainder.match(/^([^·\n]+)/);
|
|
697
|
+
const channel = channelMatch ? channelMatch[1].trim() : "";
|
|
698
|
+
if (title) results.push({ type, title, channel, platform: "YouTube", duration: "", url: href });
|
|
699
|
+
}
|
|
700
|
+
}
|
|
701
|
+
return results;
|
|
702
|
+
}, vsels);
|
|
703
|
+
}
|
|
704
|
+
async extractForums(page) {
|
|
705
|
+
const fsels = ForumSelectors;
|
|
706
|
+
return page.evaluate((sels) => {
|
|
707
|
+
const results = [];
|
|
708
|
+
const sections = Array.from(document.querySelectorAll(sels.section));
|
|
709
|
+
const forumSection = sections.find((s) => s.textContent?.includes("Discussions"));
|
|
710
|
+
if (!forumSection) return results;
|
|
711
|
+
const items = Array.from(forumSection.querySelectorAll(sels.item));
|
|
712
|
+
for (const a of items) {
|
|
713
|
+
const href = a.href;
|
|
714
|
+
if (!href) continue;
|
|
715
|
+
const titleEl = a.querySelector(sels.title);
|
|
716
|
+
const sourceEl = a.querySelector(sels.source);
|
|
717
|
+
const title = titleEl?.textContent?.trim() ?? "";
|
|
718
|
+
const source = sourceEl?.textContent?.trim() ?? "";
|
|
719
|
+
if (title) results.push({ title, source, url: href });
|
|
720
|
+
}
|
|
721
|
+
return results;
|
|
722
|
+
}, fsels);
|
|
723
|
+
}
|
|
724
|
+
async extractShortVideos(page, shortUrl) {
|
|
725
|
+
try {
|
|
726
|
+
await page.goto(shortUrl, { waitUntil: "domcontentloaded" });
|
|
727
|
+
await page.waitForTimeout(1500);
|
|
728
|
+
} catch {
|
|
729
|
+
return [];
|
|
730
|
+
}
|
|
731
|
+
const svSels = {
|
|
732
|
+
item: ShortVideoSelectors.item,
|
|
733
|
+
platforms: [...ShortVideoSelectors.platforms]
|
|
734
|
+
};
|
|
735
|
+
const raw = await page.evaluate((sels) => {
|
|
736
|
+
const seen = /* @__PURE__ */ new Set();
|
|
737
|
+
const results = [];
|
|
738
|
+
const items = Array.from(document.querySelectorAll(sels.item));
|
|
739
|
+
const videoHosts = ["youtube.com", "youtu.be", "tiktok.com", "instagram.com", "facebook.com", "fb.watch"];
|
|
740
|
+
const byHref = /* @__PURE__ */ new Map();
|
|
741
|
+
for (const a of items) {
|
|
742
|
+
const href = a.href;
|
|
743
|
+
if (!href) continue;
|
|
744
|
+
if (!videoHosts.some((h) => href.includes(h))) continue;
|
|
745
|
+
const text = a.textContent?.trim() ?? "";
|
|
746
|
+
if (!byHref.has(href)) byHref.set(href, []);
|
|
747
|
+
byHref.get(href).push(text);
|
|
748
|
+
}
|
|
749
|
+
for (const [href, texts] of byHref.entries()) {
|
|
750
|
+
if (seen.has(href)) continue;
|
|
751
|
+
seen.add(href);
|
|
752
|
+
const duration = texts.find((t) => /^\d+:\d+$/.test(t)) ?? "";
|
|
753
|
+
const titleText = texts.find((t) => !/^\d+:\d+$/.test(t) && t.length > 5) ?? "";
|
|
754
|
+
if (!titleText) continue;
|
|
755
|
+
let title = titleText;
|
|
756
|
+
let platform = "";
|
|
757
|
+
let channel = "";
|
|
758
|
+
for (const p of sels.platforms) {
|
|
759
|
+
let lastIdx = -1;
|
|
760
|
+
let search = 0;
|
|
761
|
+
while (true) {
|
|
762
|
+
const found = titleText.indexOf(p, search);
|
|
763
|
+
if (found === -1) break;
|
|
764
|
+
lastIdx = found;
|
|
765
|
+
search = found + 1;
|
|
766
|
+
}
|
|
767
|
+
if (lastIdx === -1) continue;
|
|
768
|
+
const after = titleText.slice(lastIdx + p.length);
|
|
769
|
+
const isSourceTag = /^[\s·]/.test(after) || after.trim() === "";
|
|
770
|
+
if (!isSourceTag) continue;
|
|
771
|
+
title = titleText.slice(0, lastIdx).trim();
|
|
772
|
+
platform = p;
|
|
773
|
+
const stripped = after.replace(/^[\s·]+/, "");
|
|
774
|
+
const dotIdx = stripped.indexOf("\xB7");
|
|
775
|
+
channel = (dotIdx === -1 ? stripped : stripped.slice(0, dotIdx)).trim();
|
|
776
|
+
break;
|
|
777
|
+
}
|
|
778
|
+
if (title) results.push({ title, channel, platform, duration, url: href });
|
|
779
|
+
}
|
|
780
|
+
return results;
|
|
781
|
+
}, svSels);
|
|
782
|
+
return raw.map((r) => ({ type: "short_video", ...r }));
|
|
783
|
+
}
|
|
784
|
+
async extractWhatPeopleSaying(page) {
|
|
785
|
+
const sels = WhatPeopleSayingSelectors;
|
|
786
|
+
return page.evaluate((s) => {
|
|
787
|
+
const section = Array.from(document.querySelectorAll(s.sectionTag)).find((el) => el.textContent?.includes(s.sectionHeadingText)) ?? document.querySelector(".yG4QQe.TBC9ub.NbhJ1c");
|
|
788
|
+
if (!section) return [];
|
|
789
|
+
return Array.from(section.querySelectorAll(s.card)).map((card) => {
|
|
790
|
+
const link = card.querySelector(s.cardLink);
|
|
791
|
+
const url = link?.href ?? "";
|
|
792
|
+
const titleH1 = card.querySelector(s.titleH1)?.textContent?.trim();
|
|
793
|
+
const titleDiv = card.querySelector(s.titleDiv)?.textContent?.trim();
|
|
794
|
+
const title = titleH1 ?? titleDiv ?? "";
|
|
795
|
+
const sourceText = card.querySelector(s.source)?.textContent?.trim() ?? "";
|
|
796
|
+
const platformEl = card.querySelector(s.platformBadge);
|
|
797
|
+
const platformText = platformEl?.textContent?.trim() ?? "";
|
|
798
|
+
const ytChannel = card.querySelector(s.ytChannel)?.textContent?.trim() ?? "";
|
|
799
|
+
const ytDate = card.querySelector(s.ytDate)?.textContent?.trim() ?? "";
|
|
800
|
+
const authorNote = card.querySelector(s.authorNote)?.textContent?.trim() ?? null;
|
|
801
|
+
const commentLabelEl = card.querySelector(s.popularCommentLabel);
|
|
802
|
+
let popularComment = null;
|
|
803
|
+
if (commentLabelEl) {
|
|
804
|
+
let next = commentLabelEl.nextSibling;
|
|
805
|
+
while (next) {
|
|
806
|
+
const t = next.textContent?.trim();
|
|
807
|
+
if (t) {
|
|
808
|
+
popularComment = t;
|
|
809
|
+
break;
|
|
810
|
+
}
|
|
811
|
+
next = next.nextSibling;
|
|
812
|
+
}
|
|
813
|
+
}
|
|
814
|
+
const allSpans = Array.from(card.querySelectorAll("span"));
|
|
815
|
+
const duration = allSpans.find((s2) => /^\d+:\d+$/.test(s2.textContent?.trim() ?? ""))?.textContent?.trim() ?? null;
|
|
816
|
+
const engagementParts = allSpans.map((s2) => s2.textContent?.trim() ?? "").filter(
|
|
817
|
+
(t) => /\d/.test(t) && (t.includes("comment") || t.includes("reaction") || t.includes("view") || t.includes("like") || t.includes("share"))
|
|
818
|
+
);
|
|
819
|
+
const engagement = engagementParts[0] ?? "";
|
|
820
|
+
const dateCandidates = allSpans.map((s2) => s2.textContent?.trim() ?? "").filter((t) => /\d+ (day|week|month|year|hour)s? ago|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec/.test(t));
|
|
821
|
+
const date = ytDate || (dateCandidates[0] ?? "");
|
|
822
|
+
const platform = platformText || (ytChannel ? "YouTube" : "");
|
|
823
|
+
const source = ytChannel || sourceText;
|
|
824
|
+
let type = "unknown";
|
|
825
|
+
const pl = platform.toLowerCase();
|
|
826
|
+
const src = source.toLowerCase();
|
|
827
|
+
const srcRaw = sourceText.toLowerCase();
|
|
828
|
+
if (pl.includes("reddit") || src.startsWith("r/")) type = "reddit";
|
|
829
|
+
else if (pl.includes("facebook") || srcRaw.includes("facebook")) type = "facebook";
|
|
830
|
+
else if (pl.includes("instagram") || srcRaw.includes("instagram")) type = "instagram";
|
|
831
|
+
else if (pl.includes("tiktok") || srcRaw.includes("tiktok")) type = "tiktok";
|
|
832
|
+
else if (pl.includes("youtube") || !!ytChannel) type = "youtube";
|
|
833
|
+
else type = "news";
|
|
834
|
+
return { type, title, url, source, platform, popularComment, engagement, date, duration, authorNote };
|
|
835
|
+
});
|
|
836
|
+
}, sels);
|
|
837
|
+
}
|
|
838
|
+
async extractOrganicResults(page) {
|
|
839
|
+
const sels = OrganicSelectors;
|
|
840
|
+
return page.evaluate((s) => {
|
|
841
|
+
const out = [];
|
|
842
|
+
let pos = 0;
|
|
843
|
+
document.querySelectorAll(s.result).forEach((card) => {
|
|
844
|
+
const titleEl = card.querySelector(s.title);
|
|
845
|
+
if (!titleEl) return;
|
|
846
|
+
const title = titleEl.textContent?.trim() ?? "";
|
|
847
|
+
const linkEl = titleEl.closest("a");
|
|
848
|
+
const url = linkEl?.href ?? "";
|
|
849
|
+
if (!title || !url) return;
|
|
850
|
+
pos++;
|
|
851
|
+
const cite = card.querySelector(s.cite)?.textContent?.trim() ?? null;
|
|
852
|
+
const snippet = card.querySelector(s.snippet)?.textContent?.trim() ?? null;
|
|
853
|
+
const isRedditStyle = !!card.querySelector(s.redditCite);
|
|
854
|
+
const ratingEl = card.querySelector(s.ratingWrap);
|
|
855
|
+
const inlineRating = ratingEl ? { value: ratingEl.querySelector(s.ratingValue)?.textContent?.trim() ?? "", count: ratingEl.querySelector(s.reviewCount)?.textContent?.trim() ?? "" } : null;
|
|
856
|
+
let domain = "";
|
|
857
|
+
try {
|
|
858
|
+
domain = new URL(url).hostname.replace(/^www\./, "");
|
|
859
|
+
} catch {
|
|
860
|
+
domain = card.querySelector(s.siteName)?.textContent?.trim() ?? "";
|
|
861
|
+
}
|
|
862
|
+
out.push({ position: pos, title, url, domain, cite, snippet, isRedditStyle, inlineRating });
|
|
863
|
+
});
|
|
864
|
+
return out;
|
|
865
|
+
}, sels);
|
|
866
|
+
}
|
|
867
|
+
async extractLocalPack(page) {
|
|
868
|
+
const sels = LocalPackSelectors;
|
|
869
|
+
return page.evaluate((s) => {
|
|
870
|
+
const out = [];
|
|
871
|
+
let container = null;
|
|
872
|
+
document.querySelectorAll('[role="heading"]').forEach((h) => {
|
|
873
|
+
if (!container && h.textContent?.includes(s.headingText)) container = h.closest("[data-hveid]");
|
|
874
|
+
});
|
|
875
|
+
if (!container) return out;
|
|
876
|
+
container.querySelectorAll(s.card).forEach((card, i) => {
|
|
877
|
+
const name = card.querySelector(s.name)?.textContent?.trim() ?? "";
|
|
878
|
+
if (!name) return;
|
|
879
|
+
const rating = card.querySelector(s.ratingValue)?.textContent?.trim() ?? null;
|
|
880
|
+
const reviewRaw = card.querySelector(s.reviewCount)?.textContent?.trim() ?? null;
|
|
881
|
+
const reviewCount = reviewRaw ? reviewRaw.replace(/[()]/g, "").trim() : null;
|
|
882
|
+
let cid = card.querySelector("a[data-cid]")?.getAttribute("data-cid") ?? null;
|
|
883
|
+
if (!cid) {
|
|
884
|
+
for (const link of Array.from(card.querySelectorAll("a[href]"))) {
|
|
885
|
+
const m1 = link.href.match(/[?&]cid=(\d+)/);
|
|
886
|
+
if (m1) {
|
|
887
|
+
cid = m1[1];
|
|
888
|
+
break;
|
|
889
|
+
}
|
|
890
|
+
const m2 = link.href.match(/!1s0x[0-9a-f]+:0x([0-9a-f]+)/i);
|
|
891
|
+
if (m2) {
|
|
892
|
+
try {
|
|
893
|
+
cid = BigInt("0x" + m2[1]).toString();
|
|
894
|
+
} catch {
|
|
895
|
+
}
|
|
896
|
+
if (cid) break;
|
|
897
|
+
}
|
|
898
|
+
}
|
|
899
|
+
}
|
|
900
|
+
const metadata = [];
|
|
901
|
+
card.querySelectorAll("div, span").forEach((el) => {
|
|
902
|
+
const text = Array.from(el.childNodes).filter((n) => n.nodeType === 3).map((n) => n.textContent?.trim() ?? "").filter((t) => t.length > 1 && t.length < 120).join(" ");
|
|
903
|
+
if (text && !metadata.includes(text)) metadata.push(text);
|
|
904
|
+
});
|
|
905
|
+
const links = Array.from(card.querySelectorAll("a[href]"));
|
|
906
|
+
const directionsUrl = links.find((a) => a.href.includes("google.com/maps"))?.href ?? null;
|
|
907
|
+
const websiteUrl = links.find((a) => !a.href.includes("google.com") && a.href.startsWith("http"))?.href ?? null;
|
|
908
|
+
out.push({ position: i + 1, name, cid, rating, reviewCount, metadata, websiteUrl, directionsUrl });
|
|
909
|
+
});
|
|
910
|
+
return out;
|
|
911
|
+
}, sels);
|
|
912
|
+
}
|
|
913
|
+
async extractEntityIds(page) {
|
|
914
|
+
return page.evaluate(() => {
|
|
915
|
+
const kgIds = /* @__PURE__ */ new Set();
|
|
916
|
+
const cids = /* @__PURE__ */ new Set();
|
|
917
|
+
const gcids = /* @__PURE__ */ new Set();
|
|
918
|
+
const recordMap = /* @__PURE__ */ new Map();
|
|
919
|
+
function nameFromWrapper(el) {
|
|
920
|
+
const sel = [".OSrXXb", ".dbg0pd", ".tzt0oe", '[role="heading"]', "h3"];
|
|
921
|
+
for (const s of sel) {
|
|
922
|
+
const found = el.querySelector(s);
|
|
923
|
+
if (found?.textContent?.trim()) return found.textContent.trim();
|
|
924
|
+
}
|
|
925
|
+
return "";
|
|
926
|
+
}
|
|
927
|
+
document.querySelectorAll('[id^="pv-/g/"]').forEach((wrapper) => {
|
|
928
|
+
const raw = wrapper.getAttribute("id");
|
|
929
|
+
if (!raw) return;
|
|
930
|
+
const kgId = raw.replace("pv-", "");
|
|
931
|
+
kgIds.add(kgId);
|
|
932
|
+
const name = nameFromWrapper(wrapper);
|
|
933
|
+
const cidEl = wrapper.querySelector("a[data-cid]");
|
|
934
|
+
const cid = cidEl?.getAttribute("data-cid") ?? null;
|
|
935
|
+
if (cid) cids.add(cid);
|
|
936
|
+
if (name) recordMap.set(kgId, { name, kgId, cid, gcid: null });
|
|
937
|
+
});
|
|
938
|
+
document.querySelectorAll("[data-mid]").forEach((el) => {
|
|
939
|
+
const mid = el.getAttribute("data-mid");
|
|
940
|
+
if (!mid?.startsWith("/g/")) return;
|
|
941
|
+
kgIds.add(mid);
|
|
942
|
+
if (!recordMap.has(mid)) {
|
|
943
|
+
const name = nameFromWrapper(el);
|
|
944
|
+
if (name) recordMap.set(mid, { name, kgId: mid, cid: null, gcid: null });
|
|
945
|
+
}
|
|
946
|
+
});
|
|
947
|
+
document.querySelectorAll(".w7Dbne").forEach((card) => {
|
|
948
|
+
const cidEl = card.querySelector("a[data-cid]");
|
|
949
|
+
const cid = cidEl?.getAttribute("data-cid") ?? null;
|
|
950
|
+
if (!cid) return;
|
|
951
|
+
cids.add(cid);
|
|
952
|
+
const name = card.querySelector(".OSrXXb")?.textContent?.trim() ?? "";
|
|
953
|
+
if (!name) return;
|
|
954
|
+
const kgIdEl = card.querySelector('[id^="pv-/g/"]');
|
|
955
|
+
const kgId = kgIdEl ? kgIdEl.getAttribute("id").replace("pv-", "") : null;
|
|
956
|
+
const key = kgId ?? `cid:${cid}`;
|
|
957
|
+
if (recordMap.has(key)) {
|
|
958
|
+
const existing = recordMap.get(key);
|
|
959
|
+
if (!existing.cid) recordMap.set(key, { ...existing, cid });
|
|
960
|
+
} else {
|
|
961
|
+
recordMap.set(key, { name, kgId, cid, gcid: null });
|
|
962
|
+
}
|
|
963
|
+
});
|
|
964
|
+
document.querySelectorAll("a[data-cid]").forEach((el) => {
|
|
965
|
+
const cid = el.getAttribute("data-cid");
|
|
966
|
+
if (!cid) return;
|
|
967
|
+
cids.add(cid);
|
|
968
|
+
const alreadyNamed = [...recordMap.values()].some((r) => r.cid === cid);
|
|
969
|
+
if (!alreadyNamed) {
|
|
970
|
+
let node = el.parentElement;
|
|
971
|
+
let name = "";
|
|
972
|
+
for (let i = 0; i < 8 && node; i++) {
|
|
973
|
+
const h = node.querySelector('.OSrXXb, .dbg0pd, [role="heading"], h3');
|
|
974
|
+
if (h?.textContent?.trim()) {
|
|
975
|
+
name = h.textContent.trim();
|
|
976
|
+
break;
|
|
977
|
+
}
|
|
978
|
+
node = node.parentElement;
|
|
979
|
+
}
|
|
980
|
+
if (name) recordMap.set(`cid:${cid}`, { name, kgId: null, cid, gcid: null });
|
|
981
|
+
}
|
|
982
|
+
});
|
|
983
|
+
const scriptContent = Array.from(document.querySelectorAll("script:not([src])")).map((s) => s.textContent ?? "").filter((t) => t.length > 1e4).join("\n");
|
|
984
|
+
for (const m of scriptContent.matchAll(/\/g\/[a-zA-Z0-9_-]{5,20}/g)) kgIds.add(m[0]);
|
|
985
|
+
for (const m of scriptContent.matchAll(/gcid:[a-zA-Z0-9_]+/g)) gcids.add(m[0]);
|
|
986
|
+
for (const m of scriptContent.matchAll(/0x[0-9a-f]+:0x([0-9a-f]+)/gi)) {
|
|
987
|
+
try {
|
|
988
|
+
cids.add(BigInt("0x" + m[1]).toString());
|
|
989
|
+
} catch {
|
|
990
|
+
}
|
|
991
|
+
}
|
|
992
|
+
return { entities: [...recordMap.values()], kgIds: [...kgIds], cids: [...cids], gcids: [...gcids] };
|
|
993
|
+
});
|
|
994
|
+
}
|
|
995
|
+
mergeLocalPackIntoEntities(entityIds, localPack) {
|
|
996
|
+
const cidSet = new Set(entityIds.cids);
|
|
997
|
+
const records = entityIds.entities.map((r) => ({ ...r }));
|
|
998
|
+
for (const biz of localPack) {
|
|
999
|
+
if (!biz.cid) continue;
|
|
1000
|
+
cidSet.add(biz.cid);
|
|
1001
|
+
const nameNorm = biz.name.toLowerCase().trim();
|
|
1002
|
+
const byName = records.find((r) => r.name.toLowerCase().trim() === nameNorm);
|
|
1003
|
+
if (byName) {
|
|
1004
|
+
if (!byName.cid) byName.cid = biz.cid;
|
|
1005
|
+
} else if (!records.find((r) => r.cid === biz.cid)) {
|
|
1006
|
+
records.push({ name: biz.name, kgId: null, cid: biz.cid, gcid: null });
|
|
1007
|
+
}
|
|
1008
|
+
}
|
|
1009
|
+
return { ...entityIds, entities: records, cids: [...cidSet] };
|
|
1010
|
+
}
|
|
1011
|
+
async extractAISurfaces(page) {
|
|
1012
|
+
const aioSels = AIOverviewSelectors;
|
|
1013
|
+
const aimSels = AIModeSelectors;
|
|
1014
|
+
return page.evaluate(({ aio, aim }) => {
|
|
1015
|
+
const sn = window.google?.sn ?? "unknown";
|
|
1016
|
+
const surface = sn === "aim" ? "aim" : sn === "web" ? "web" : "unknown";
|
|
1017
|
+
function findAIORoot() {
|
|
1018
|
+
const primary = document.querySelector(aio.root);
|
|
1019
|
+
if (primary) return primary;
|
|
1020
|
+
const headings = document.querySelectorAll('h1, h2, h3, [role="heading"]');
|
|
1021
|
+
for (const h of headings) {
|
|
1022
|
+
if (h.textContent?.trim() === "AI Overview") {
|
|
1023
|
+
let el = h.parentElement;
|
|
1024
|
+
for (let i = 0; i < 6 && el; i++) {
|
|
1025
|
+
if (el.querySelectorAll("a").length > 1) return el;
|
|
1026
|
+
el = el.parentElement;
|
|
1027
|
+
}
|
|
1028
|
+
return h.parentElement;
|
|
1029
|
+
}
|
|
1030
|
+
}
|
|
1031
|
+
return null;
|
|
1032
|
+
}
|
|
1033
|
+
const aioRoot = findAIORoot();
|
|
1034
|
+
const aioContainer = aioRoot ? aioRoot.closest(aio.wrapper) ?? aioRoot : null;
|
|
1035
|
+
let aioText = null;
|
|
1036
|
+
if (aioContainer) {
|
|
1037
|
+
const clone = aioContainer.cloneNode(true);
|
|
1038
|
+
clone.querySelectorAll("script,style,noscript").forEach((el) => el.remove());
|
|
1039
|
+
clone.querySelectorAll('h1,h2,h3,h4,[role="heading"]').forEach((el) => el.remove());
|
|
1040
|
+
clone.querySelectorAll('button,[role="button"]').forEach((el) => el.remove());
|
|
1041
|
+
clone.querySelectorAll("a").forEach((el) => el.remove());
|
|
1042
|
+
const candidate = clone.textContent?.replace(/\s+/g, " ").trim() || null;
|
|
1043
|
+
const isErrorState = !candidate || /not available|try again|can't generate/i.test(candidate);
|
|
1044
|
+
aioText = isErrorState ? null : candidate;
|
|
1045
|
+
}
|
|
1046
|
+
const aioDetected = !!aioRoot && aioText !== null;
|
|
1047
|
+
const aioCitations = Array.from(aioContainer?.querySelectorAll("a[href]") ?? []).filter((a) => a.href && !a.href.startsWith("javascript")).map((a) => ({
|
|
1048
|
+
text: a.textContent?.trim() ?? "",
|
|
1049
|
+
href: a.href
|
|
1050
|
+
})).filter((c) => c.text && c.href);
|
|
1051
|
+
const aimRoot = document.querySelector(aim.root);
|
|
1052
|
+
const aimDetected = surface === "aim" && !!aimRoot;
|
|
1053
|
+
const aimContainer = aimRoot?.closest(aim.wrapper) ?? null;
|
|
1054
|
+
let aimText = null;
|
|
1055
|
+
if (aimContainer) {
|
|
1056
|
+
const clone = aimContainer.cloneNode(true);
|
|
1057
|
+
clone.querySelectorAll("script,style,noscript").forEach((el) => el.remove());
|
|
1058
|
+
clone.querySelectorAll('h1,h2,h3,h4,[role="heading"]').forEach((el) => el.remove());
|
|
1059
|
+
clone.querySelectorAll('button,[role="button"]').forEach((el) => el.remove());
|
|
1060
|
+
clone.querySelectorAll("a").forEach((el) => el.remove());
|
|
1061
|
+
const candidate = clone.textContent?.replace(/\s+/g, " ").trim() || null;
|
|
1062
|
+
const isErrorState = !candidate || /not available|try again|can't generate/i.test(candidate);
|
|
1063
|
+
aimText = isErrorState ? null : candidate;
|
|
1064
|
+
}
|
|
1065
|
+
const aimCitations = aimDetected ? Array.from(aimContainer?.querySelectorAll("a[href]") ?? []).filter((a) => a.href && !a.href.startsWith("javascript")).map((a) => ({
|
|
1066
|
+
text: a.textContent?.trim() ?? "",
|
|
1067
|
+
href: a.href
|
|
1068
|
+
})).filter((c) => c.text && c.href) : [];
|
|
1069
|
+
return {
|
|
1070
|
+
surface,
|
|
1071
|
+
aiOverview: { detected: aioDetected, text: aioText, citations: aioCitations },
|
|
1072
|
+
aiMode: { detected: aimDetected, text: aimText, citations: aimCitations }
|
|
1073
|
+
};
|
|
1074
|
+
}, { aio: aioSels, aim: aimSels });
|
|
1075
|
+
}
|
|
1076
|
+
buildTree(flat, _seed) {
|
|
1077
|
+
const roots = [];
|
|
1078
|
+
const nodeMap = /* @__PURE__ */ new Map();
|
|
1079
|
+
for (const row of flat) {
|
|
1080
|
+
const node = {
|
|
1081
|
+
question: row.question,
|
|
1082
|
+
answer: row.answer || null,
|
|
1083
|
+
sourceTitle: row.source_title || null,
|
|
1084
|
+
sourceSite: row.source_site || null,
|
|
1085
|
+
sourceCite: row.source_cite || null,
|
|
1086
|
+
depth: row.depth,
|
|
1087
|
+
parentQuestion: row.parent_question || null,
|
|
1088
|
+
children: []
|
|
1089
|
+
};
|
|
1090
|
+
nodeMap.set(row.question, node);
|
|
1091
|
+
}
|
|
1092
|
+
for (const node of nodeMap.values()) {
|
|
1093
|
+
if (node.parentQuestion && nodeMap.has(node.parentQuestion)) {
|
|
1094
|
+
nodeMap.get(node.parentQuestion).children.push(node);
|
|
1095
|
+
} else {
|
|
1096
|
+
roots.push(node);
|
|
1097
|
+
}
|
|
1098
|
+
}
|
|
1099
|
+
return roots;
|
|
1100
|
+
}
|
|
1101
|
+
async extract(options) {
|
|
1102
|
+
const startMs = Date.now();
|
|
1103
|
+
const config = {
|
|
1104
|
+
headless: options.headless,
|
|
1105
|
+
profileDir: options.profileDir,
|
|
1106
|
+
proxy: options.proxy,
|
|
1107
|
+
kernelApiKey: options.kernelApiKey,
|
|
1108
|
+
kernelProxyId: options.kernelProxyId,
|
|
1109
|
+
viewport: { width: 1280, height: 800 },
|
|
1110
|
+
locale: `${options.hl}-${options.gl.toUpperCase()}`
|
|
1111
|
+
};
|
|
1112
|
+
let errorCount = 0;
|
|
1113
|
+
try {
|
|
1114
|
+
await this.driver.launch(config);
|
|
1115
|
+
const uule = options.location ? encodeUule(normalizeLocation(options.location)) : null;
|
|
1116
|
+
const { hasPaa } = await this.driver.navigateToSERP(options.query, uule, options.gl, options.hl);
|
|
1117
|
+
const page = this.driver.getPage();
|
|
1118
|
+
if (options.serpOnly) {
|
|
1119
|
+
const [organicResults2, localPack2, rawEntityIds2] = await Promise.all([
|
|
1120
|
+
this.extractOrganicResults(page),
|
|
1121
|
+
this.extractLocalPack(page),
|
|
1122
|
+
this.extractEntityIds(page)
|
|
1123
|
+
]);
|
|
1124
|
+
const entityIds2 = this.mergeLocalPackIntoEntities(rawEntityIds2, localPack2);
|
|
1125
|
+
let allOrganic2 = organicResults2;
|
|
1126
|
+
if ((options.pages ?? 1) >= 2) {
|
|
1127
|
+
const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, start: "10" });
|
|
1128
|
+
if (uule) p2params.set("uule", uule);
|
|
1129
|
+
await this.driver.navigateTo("https://www.google.com/search?" + p2params.toString());
|
|
1130
|
+
const p2organic = await this.extractOrganicResults(page);
|
|
1131
|
+
allOrganic2 = [...organicResults2, ...p2organic.map((r) => ({ ...r, position: r.position + 10 }))];
|
|
1132
|
+
}
|
|
1133
|
+
const stats2 = {
|
|
1134
|
+
seed: options.query,
|
|
1135
|
+
totalQuestions: 0,
|
|
1136
|
+
maxDepthReached: 0,
|
|
1137
|
+
durationMs: Date.now() - startMs,
|
|
1138
|
+
errorCount
|
|
1139
|
+
};
|
|
1140
|
+
this.reporter.onComplete(stats2);
|
|
1141
|
+
return {
|
|
1142
|
+
seed: options.query,
|
|
1143
|
+
location: options.location ?? null,
|
|
1144
|
+
extractedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
1145
|
+
totalQuestions: 0,
|
|
1146
|
+
surface: "web",
|
|
1147
|
+
aiOverview: { detected: false, text: null, citations: [] },
|
|
1148
|
+
aiMode: { detected: false, text: null, citations: [] },
|
|
1149
|
+
whatPeopleSaying: [],
|
|
1150
|
+
tree: [],
|
|
1151
|
+
flat: [],
|
|
1152
|
+
videos: [],
|
|
1153
|
+
forums: [],
|
|
1154
|
+
organicResults: allOrganic2,
|
|
1155
|
+
localPack: localPack2,
|
|
1156
|
+
entityIds: entityIds2,
|
|
1157
|
+
stats: stats2
|
|
1158
|
+
};
|
|
1159
|
+
}
|
|
1160
|
+
const [videos, forums, whatPeopleSaying, rawEntityIds, organicResults, localPack] = await Promise.all([
|
|
1161
|
+
this.extractVideos(page),
|
|
1162
|
+
this.extractForums(page),
|
|
1163
|
+
this.extractWhatPeopleSaying(page),
|
|
1164
|
+
this.extractEntityIds(page),
|
|
1165
|
+
this.extractOrganicResults(page),
|
|
1166
|
+
this.extractLocalPack(page)
|
|
1167
|
+
]);
|
|
1168
|
+
const entityIds = this.mergeLocalPackIntoEntities(rawEntityIds, localPack);
|
|
1169
|
+
this.reporter.onVideos(videos);
|
|
1170
|
+
this.reporter.onForums(forums);
|
|
1171
|
+
if (!hasPaa) {
|
|
1172
|
+
let noPaaOrganic = organicResults;
|
|
1173
|
+
if ((options.pages ?? 1) >= 2) {
|
|
1174
|
+
const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, start: "10" });
|
|
1175
|
+
if (uule) p2params.set("uule", uule);
|
|
1176
|
+
await this.driver.navigateTo("https://www.google.com/search?" + p2params.toString());
|
|
1177
|
+
const p2organic = await this.extractOrganicResults(page);
|
|
1178
|
+
noPaaOrganic = [...organicResults, ...p2organic.map((r) => ({ ...r, position: r.position + 10 }))];
|
|
1179
|
+
}
|
|
1180
|
+
const aiSurfaces2 = await this.extractAISurfaces(page);
|
|
1181
|
+
const stats2 = {
|
|
1182
|
+
seed: options.query,
|
|
1183
|
+
totalQuestions: 0,
|
|
1184
|
+
maxDepthReached: 0,
|
|
1185
|
+
durationMs: Date.now() - startMs,
|
|
1186
|
+
errorCount
|
|
1187
|
+
};
|
|
1188
|
+
this.reporter.onComplete(stats2);
|
|
1189
|
+
return {
|
|
1190
|
+
seed: options.query,
|
|
1191
|
+
location: options.location ?? null,
|
|
1192
|
+
extractedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
1193
|
+
totalQuestions: 0,
|
|
1194
|
+
surface: aiSurfaces2.surface,
|
|
1195
|
+
aiOverview: aiSurfaces2.aiOverview,
|
|
1196
|
+
aiMode: aiSurfaces2.aiMode,
|
|
1197
|
+
whatPeopleSaying,
|
|
1198
|
+
tree: [],
|
|
1199
|
+
flat: [],
|
|
1200
|
+
videos,
|
|
1201
|
+
forums,
|
|
1202
|
+
organicResults: noPaaOrganic,
|
|
1203
|
+
localPack,
|
|
1204
|
+
entityIds,
|
|
1205
|
+
stats: stats2
|
|
1206
|
+
};
|
|
1207
|
+
}
|
|
1208
|
+
const flat = await this.runBFS(page, options);
|
|
1209
|
+
const aiSurfaces = await this.extractAISurfaces(page);
|
|
1210
|
+
const shortVidsParams = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, udm: ShortVideoSelectors.udm });
|
|
1211
|
+
if (uule) shortVidsParams.set("uule", uule);
|
|
1212
|
+
const shortVideos = await this.extractShortVideos(page, "https://www.google.com/search?" + shortVidsParams.toString());
|
|
1213
|
+
this.reporter.onVideos(shortVideos);
|
|
1214
|
+
let allOrganic = organicResults;
|
|
1215
|
+
if ((options.pages ?? 1) >= 2) {
|
|
1216
|
+
const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, start: "10" });
|
|
1217
|
+
if (uule) p2params.set("uule", uule);
|
|
1218
|
+
await this.driver.navigateTo("https://www.google.com/search?" + p2params.toString());
|
|
1219
|
+
const p2organic = await this.extractOrganicResults(page);
|
|
1220
|
+
allOrganic = [...organicResults, ...p2organic.map((r) => ({ ...r, position: r.position + 10 }))];
|
|
1221
|
+
}
|
|
1222
|
+
const allVideos = [...videos, ...shortVideos];
|
|
1223
|
+
const tree = this.buildTree(flat, options.query);
|
|
1224
|
+
const stats = {
|
|
1225
|
+
seed: options.query,
|
|
1226
|
+
totalQuestions: flat.length,
|
|
1227
|
+
maxDepthReached: flat.reduce((m, r) => Math.max(m, r.depth), 0),
|
|
1228
|
+
durationMs: Date.now() - startMs,
|
|
1229
|
+
errorCount
|
|
1230
|
+
};
|
|
1231
|
+
this.reporter.onComplete(stats);
|
|
1232
|
+
return {
|
|
1233
|
+
seed: options.query,
|
|
1234
|
+
location: options.location ?? null,
|
|
1235
|
+
extractedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
1236
|
+
totalQuestions: flat.length,
|
|
1237
|
+
surface: aiSurfaces.surface,
|
|
1238
|
+
aiOverview: aiSurfaces.aiOverview,
|
|
1239
|
+
aiMode: aiSurfaces.aiMode,
|
|
1240
|
+
whatPeopleSaying,
|
|
1241
|
+
tree,
|
|
1242
|
+
flat,
|
|
1243
|
+
videos: allVideos,
|
|
1244
|
+
forums,
|
|
1245
|
+
organicResults: allOrganic,
|
|
1246
|
+
localPack,
|
|
1247
|
+
entityIds,
|
|
1248
|
+
stats
|
|
1249
|
+
};
|
|
1250
|
+
} catch (err) {
|
|
1251
|
+
errorCount++;
|
|
1252
|
+
this.reporter.onError(err instanceof Error ? err : new Error(String(err)));
|
|
1253
|
+
throw err;
|
|
1254
|
+
} finally {
|
|
1255
|
+
await this.driver.close();
|
|
1256
|
+
}
|
|
1257
|
+
}
|
|
1258
|
+
};
|
|
1259
|
+
|
|
1260
|
+
// src/output/OutputSerializer.ts
|
|
1261
|
+
var import_node_fs = require("fs");
|
|
1262
|
+
var import_node_path = __toESM(require("path"), 1);
|
|
1263
|
+
var import_papaparse = __toESM(require("papaparse"), 1);
|
|
1264
|
+
/**
 * Converts a seed query into a filename stem: lower-cased, every run of
 * non-word characters collapsed to a single "-", truncated to 40 characters.
 */
function seedToSlug(seed) {
  return seed.toLowerCase().replace(/\W+/g, "-").slice(0, 40);
}

/**
 * Shared write pipeline used by every OutputSerializer method.
 *
 * Ensures `outputDir` exists, renders the content, and writes it as UTF-8 to
 * `<slug><suffix>-<epoch ms>.<extension>` inside `outputDir`.
 *
 * @param {string} outputDir - Target directory (created recursively).
 * @param {string} seed - Seed query the filename stem is derived from.
 * @param {string} suffix - Literal inserted between slug and timestamp (may be "").
 * @param {string} extension - File extension without the dot.
 * @param {() => string} render - Produces the file content.
 * @returns {Promise<string>} Full path of the written file.
 */
async function writeTimestampedFile(outputDir, seed, suffix, extension, render) {
  await import_node_fs.promises.mkdir(outputDir, { recursive: true });
  const slug = seedToSlug(seed);
  const content = render();
  const filename = `${slug}${suffix}-${Date.now()}.${extension}`;
  const fullPath = import_node_path.default.join(outputDir, filename);
  await import_node_fs.promises.writeFile(fullPath, content, "utf8");
  return fullPath;
}

/**
 * Builds the CSV rows shared by the AI Overview and AI Mode exports: one row
 * per citation, with the response text carried on the first row only.
 *
 * When there are no citations but there IS response text, a single row with
 * empty citation columns is emitted so the text is not silently dropped.
 */
function buildCitationRows(citations, text, seed) {
  if (citations.length === 0) {
    return text ? [{ seed_query: seed, response_text: text, citation_text: "", citation_href: "" }] : [];
  }
  return citations.map((c, i) => ({
    seed_query: seed,
    response_text: i === 0 ? text ?? "" : "",
    citation_text: c.text,
    citation_href: c.href
  }));
}

/** Renders an array of row objects to CSV text with a header line. */
function toCsv(rows) {
  return import_papaparse.default.unparse(rows, { header: true });
}

/**
 * Writes harvest results to disk as JSON or CSV files. Every method creates
 * the output directory if needed and resolves to the path it wrote.
 */
var OutputSerializer = class {
  /** Writes the whole result object as pretty-printed JSON, named after `result.seed`. */
  async writeJSON(result, outputDir) {
    return writeTimestampedFile(outputDir, result.seed, "", "json", () => JSON.stringify(result, null, 2));
  }
  /** Writes the flat question rows; the filename uses the first row's `seed_query`, or "paa" if absent. */
  async writeCSV(rows, outputDir) {
    const seedRaw = rows[0]?.seed_query ?? "paa";
    return writeTimestampedFile(outputDir, seedRaw, "", "csv", () => toCsv(rows));
  }
  /** Writes video rows to `<slug>-videos-<ts>.csv`. */
  async writeVideoCSV(videos, seed, outputDir) {
    return writeTimestampedFile(outputDir, seed, "-videos", "csv", () => toCsv(videos));
  }
  /** Writes forum rows to `<slug>-forums-<ts>.csv`. */
  async writeForumCSV(forums, seed, outputDir) {
    return writeTimestampedFile(outputDir, seed, "-forums", "csv", () => toCsv(forums));
  }
  /** Writes AI Overview citations (plus response text) to `<slug>-ai-overview-<ts>.csv`. */
  async writeAIOverviewCSV(citations, text, seed, outputDir) {
    return writeTimestampedFile(outputDir, seed, "-ai-overview", "csv", () => toCsv(buildCitationRows(citations, text, seed)));
  }
  /** Writes AI Mode citations (plus response text) to `<slug>-ai-mode-<ts>.csv`. */
  async writeAIModeCSV(citations, text, seed, outputDir) {
    return writeTimestampedFile(outputDir, seed, "-ai-mode", "csv", () => toCsv(buildCitationRows(citations, text, seed)));
  }
  /** Writes the cards, each prefixed with a `seed_query` column, to `<slug>-what-people-saying-<ts>.csv`. */
  async writeWhatPeopleSayingCSV(cards, seed, outputDir) {
    return writeTimestampedFile(outputDir, seed, "-what-people-saying", "csv", () => toCsv(cards.map((c) => ({ seed_query: seed, ...c }))));
  }
};
|
|
1342
|
+
|
|
1343
|
+
// src/output/ProgressReporter.ts
|
|
1344
|
+
/**
 * Emits progress as newline-delimited JSON: regular events go to stdout,
 * errors go to stderr. Each call writes one JSON object per line.
 */
var ProgressReporter = class {
  /** Reports one discovered question together with its depth. */
  onQuestion(node) {
    const { depth, question } = node;
    const line = JSON.stringify({ event: "question", depth, question });
    process.stdout.write(`${line}\n`);
  }
  /** Reports the depth level being reported by the caller. */
  onDepth(depth) {
    const line = JSON.stringify({ event: "depth", depth });
    process.stdout.write(`${line}\n`);
  }
  /** Reports every video in `videos`, one line each. */
  onVideos(videos) {
    for (const video of videos) {
      const { type, platform, duration, title, channel, url } = video;
      const line = JSON.stringify({ event: "video", type, platform, duration, title, channel, url });
      process.stdout.write(`${line}\n`);
    }
  }
  /** Reports every forum result in `forums`, one line each. */
  onForums(forums) {
    for (const forum of forums) {
      const { title, source, url } = forum;
      const line = JSON.stringify({ event: "forum", title, source, url });
      process.stdout.write(`${line}\n`);
    }
  }
  /** Reports the final stats; the stats fields are spread into the event object. */
  onComplete(stats) {
    const payload = { event: "complete", ...stats };
    process.stdout.write(`${JSON.stringify(payload)}\n`);
  }
  /** Reports an error (constructor name and message) on stderr. */
  onError(err) {
    const payload = { event: "error", type: err.constructor.name, message: err.message };
    process.stderr.write(`${JSON.stringify(payload)}\n`);
  }
};
|
|
1368
|
+
|
|
1369
|
+
// src/harvest.ts
|
|
1370
|
+
// Upper bound on the number of extraction attempts made by harvest()'s retry loop.
var MAX_ATTEMPTS = 3;
|
|
1371
|
+
/**
 * Runs a single extraction with a brand-new browser driver.
 *
 * A fresh BrowserDriver, ProgressReporter and PAAExtractor are created for
 * each call, and the driver is closed afterwards whether extraction succeeded
 * or threw.
 *
 * NOTE(review): the extractor's own method appears to close its driver in a
 * `finally` as well, so close() may run twice — presumably close() is
 * idempotent; confirm against BrowserDriver.
 *
 * @param {object} options - Options forwarded unchanged to extractor.extract().
 * @returns {Promise<object>} Whatever extractor.extract() resolves to.
 */
async function extractOnce(options) {
  const browser = new BrowserDriver();
  const paaExtractor = new PAAExtractor(browser, new ProgressReporter());
  try {
    const outcome = await paaExtractor.extract(options);
    return outcome;
  } finally {
    await browser.close();
  }
}
|
|
1381
|
+
/**
 * Public entry point: validates options, runs the extraction (retrying on
 * CAPTCHA), writes the requested output files and returns the result.
 *
 * Options are built from the KERNEL_API_KEY / KERNEL_PROXY_ID environment
 * variables (trimmed) overlaid with the caller's object, then validated by
 * HarvestOptionsSchema.parse().
 *
 * Retry policy: only CaptchaError triggers another attempt, up to
 * MAX_ATTEMPTS in total. Any other error is rethrown immediately. When every
 * attempt ends in a CAPTCHA, a summarising CaptchaError is thrown. (The
 * original rethrew the last raw error from inside the loop, which made that
 * summary unreachable.)
 *
 * @param {object} [rawOptions] - Caller options; non-objects are treated as {}.
 * @returns {Promise<object>} The extraction result.
 * @throws {CaptchaError} When all attempts were blocked by a CAPTCHA.
 */
async function harvest(rawOptions) {
  const raw = typeof rawOptions === "object" && rawOptions !== null ? rawOptions : {};
  const merged = {
    kernelApiKey: process.env.KERNEL_API_KEY?.trim(),
    kernelProxyId: process.env.KERNEL_PROXY_ID?.trim(),
    ...raw
  };
  const options = HarvestOptionsSchema.parse(merged);
  const serializer = new OutputSerializer();
  for (let attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
    let result;
    try {
      result = await extractOnce(options);
    } catch (err) {
      // Only a CAPTCHA is worth retrying; everything else is a real failure.
      if (err instanceof CaptchaError) continue;
      throw err;
    }
    const wantsJson = options.format === "json" || options.format === "both";
    const wantsCsv = options.format === "csv" || options.format === "both";
    if (wantsJson) {
      await serializer.writeJSON(result, options.outputDir);
    }
    if (wantsCsv) {
      // The question CSV is always written; the others only when there is data.
      const pending = [serializer.writeCSV(result.flat, options.outputDir)];
      if (result.videos.length > 0) {
        pending.push(serializer.writeVideoCSV(result.videos, result.seed, options.outputDir));
      }
      if (result.forums.length > 0) {
        pending.push(serializer.writeForumCSV(result.forums, result.seed, options.outputDir));
      }
      if (result.aiOverview.detected) {
        pending.push(serializer.writeAIOverviewCSV(result.aiOverview.citations, result.aiOverview.text, result.seed, options.outputDir));
      }
      if (result.aiMode.detected) {
        pending.push(serializer.writeAIModeCSV(result.aiMode.citations, result.aiMode.text, result.seed, options.outputDir));
      }
      if (result.whatPeopleSaying.length > 0) {
        pending.push(serializer.writeWhatPeopleSayingCSV(result.whatPeopleSaying, result.seed, options.outputDir));
      }
      await Promise.all(pending);
    }
    return result;
  }
  const sessionDesc = options.kernelApiKey ? `${MAX_ATTEMPTS} fresh Kernel.sh sessions` : `${MAX_ATTEMPTS} attempts`;
  throw new CaptchaError(`CAPTCHA on all ${sessionDesc}. Try again in a few minutes.`);
}
|
|
1417
|
+
|
|
1418
|
+
// src/video/VideoGenerator.ts
|
|
1419
|
+
var import_node_child_process2 = require("child_process");
|
|
1420
|
+
var import_node_fs3 = require("fs");
|
|
1421
|
+
var import_node_os = require("os");
|
|
1422
|
+
var import_node_path3 = require("path");
|
|
1423
|
+
var import_client2 = require("@fal-ai/client");
|
|
1424
|
+
|
|
1425
|
+
// src/video/promptBuilder.ts
|
|
1426
|
+
// Chat-completions endpoints tried by buildClipPrompts (DeepInfra first, OpenRouter as fallback).
var DEEPINFRA_URL = "https://api.deepinfra.com/v1/openai/chat/completions";
var OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions";
// Model identifier sent in the request body by callLLM.
var QWEN_MODEL = "Qwen/Qwen3.6-35B-A3B";
// System prompt sent with every callLLM request. It is assembled from
// paragraphs separated by one blank line; the bullet list and the trailing
// JSON template are multi-line paragraphs joined with single newlines.
var SYSTEM_PROMPT = [
  `You are a video prompt engineer for Seedance 2.0 text-to-video AI.`,
  `IMPORTANT TECHNICAL CONTEXT: Clip 2 will be generated using image-to-video, meaning it will start from the exact last frame of Clip 1. The two clips will be visually seamless \u2014 same location, same characters, continuous motion. You must write prompts that make this feel like one uninterrupted 16-second video.`,
  `Your job: turn a PAA question and its answer into a complete short-form video with visuals, narration, and a background audio mood.`,
  `Produce four things:`,
  `1. clip1 (~8s): Show a real person experiencing the situation the question describes. End on a specific frozen moment \u2014 a held expression, a paused action \u2014 that clip 2 continues from.`,
  `2. clip2 (~8s): Continue from that exact frozen frame. Deliver the informational payoff from the answer. Show specific facts playing out visually. Describe motion continuing from clip 1's ending frame, not a new scene.`,
  `3. voiceover: A spoken narration that fits in 10 seconds of TTS audio. HARD LIMIT: 22 words maximum. Count every word before finalizing \u2014 if it exceeds 22 words, cut it. Starts with the problem, ends with the answer. No filler, no "in conclusion". Informational and direct.`,
  `4. audioMood: 6-10 words describing INSTRUMENTAL background music only \u2014 no vocals, no voice, no lyrics. Describe instruments and mood (e.g. "warm acoustic guitar, uplifting, professional home service, no vocals"). Will be passed to an AI audio model.`,
  [
    `Rules for visuals:`,
    `- No text, captions, graphics, or overlays`,
    `- Photorealistic, natural lighting, specific details`,
    `- Describe exactly what the camera sees`
  ].join("\n"),
  [
    `Respond with JSON only:`,
    `{"clip1": "...", "clip2": "...", "voiceover": "...", "audioMood": "..."}`
  ].join("\n")
].join("\n\n");
|
|
1452
|
+
/**
 * Sends one chat-completions request and returns the parsed clip prompts.
 *
 * The request carries SYSTEM_PROMPT plus a user message containing the
 * question and the first 500 characters of the answer. The first-"{"-to-
 * last-"}" span of the reply is parsed as JSON and must contain non-empty
 * `clip1`, `clip2`, `voiceover` and `audioMood`.
 *
 * Hardened against malformed replies: a missing `choices` array, non-string
 * content, or a missing `answer` now produce the descriptive errors below
 * instead of an unrelated TypeError, and unparsable JSON is reported with a
 * snippet of the raw reply.
 *
 * @param {string} apiKey - Bearer token for the endpoint.
 * @param {string} baseUrl - Chat-completions URL.
 * @param {string} question - PAA question.
 * @param {string} answer - Answer text (truncated to 500 chars in the request).
 * @returns {Promise<{clip1: string, clip2: string, voiceover: string, audioMood: string}>}
 * @throws {Error} On a non-2xx response, a reply without JSON, invalid JSON,
 *   or a JSON object missing any required field.
 */
async function callLLM(apiKey, baseUrl, question, answer) {
  const answerExcerpt = String(answer ?? "").slice(0, 500);
  const res = await fetch(baseUrl, {
    method: "POST",
    headers: { "Authorization": `Bearer ${apiKey}`, "Content-Type": "application/json" },
    body: JSON.stringify({
      model: QWEN_MODEL,
      temperature: 0.7,
      messages: [
        { role: "system", content: SYSTEM_PROMPT },
        { role: "user", content: `Question: ${question}\n\nAnswer: ${answerExcerpt}` }
      ]
    })
  });
  if (!res.ok) throw new Error(`LLM call failed (${res.status}): ${await res.text()}`);
  const data = await res.json();
  const content = data?.choices?.[0]?.message?.content;
  const raw = typeof content === "string" ? content.trim() : "";
  // Greedy match: tolerate prose or code fences around the JSON object.
  const match = raw.match(/\{[\s\S]*\}/);
  if (!match) throw new Error(`No JSON in QWEN response: ${raw.slice(0, 200)}`);
  let parsed;
  try {
    parsed = JSON.parse(match[0]);
  } catch {
    throw new Error(`Invalid JSON in QWEN response: ${raw.slice(0, 200)}`);
  }
  if (!parsed.clip1 || !parsed.clip2 || !parsed.voiceover || !parsed.audioMood) {
    throw new Error(`QWEN response missing fields: ${raw.slice(0, 200)}`);
  }
  return parsed;
}
|
|
1478
|
+
/**
 * Generates the clip prompts for a question/answer pair, preferring
 * DeepInfra and falling back to OpenRouter.
 *
 * Provider selection is driven by the DEEPINFRA_API_KEY and
 * OPENROUTER_API_KEY environment variables:
 *  - neither set: throws the "No LLM key" error;
 *  - DeepInfra set and succeeds: its result is returned;
 *  - DeepInfra fails and OpenRouter is set: a warning is logged and
 *    OpenRouter is tried;
 *  - DeepInfra fails and OpenRouter is NOT set: the DeepInfra error is
 *    rethrown. (Previously it was swallowed and replaced by the misleading
 *    "No LLM key" message.)
 *
 * @param {string} question - PAA question.
 * @param {string} answer - Answer text.
 * @returns {Promise<object>} The object returned by callLLM.
 * @throws {Error} When no key is configured or every configured provider fails.
 */
async function buildClipPrompts(question, answer) {
  const deepinfraKey = process.env["DEEPINFRA_API_KEY"];
  const openrouterKey = process.env["OPENROUTER_API_KEY"];
  if (!deepinfraKey && !openrouterKey) {
    throw new Error("No LLM key \u2014 set DEEPINFRA_API_KEY or OPENROUTER_API_KEY");
  }
  if (deepinfraKey) {
    try {
      return await callLLM(deepinfraKey, DEEPINFRA_URL, question, answer);
    } catch (err) {
      // Without a fallback provider the real failure is the useful error.
      if (!openrouterKey) throw err;
      const reason = err instanceof Error ? err.message : String(err);
      console.warn("[promptBuilder] DeepInfra failed, trying OpenRouter:", reason);
    }
  }
  return await callLLM(openrouterKey, OPENROUTER_URL, question, answer);
}
|
|
1493
|
+
|
|
1494
|
+
// src/video/AudioGenerator.ts
|
|
1495
|
+
// fal model ids used by generateVoiceover and addBackgroundAudio respectively.
var TTS_MODEL = "fal-ai/inworld-tts";
var MMAUDIO_MODEL = "fal-ai/mmaudio-v2";
// Base URL of the fal queue REST endpoints used by rawQueueRun.
var QUEUE_BASE = "https://queue.fal.run";
/**
 * Submits a job to the fal queue over plain HTTP and polls until it finishes.
 *
 * Flow: POST the input to `${QUEUE_BASE}/${model}`, then poll the request's
 * `/status` endpoint every `pollIntervalMs`; once the status is "COMPLETED"
 * the result document is fetched and returned. A non-2xx status poll is
 * treated as transient and retried.
 *
 * Unlike the original, polling is bounded: after `timeoutMs` without
 * completion the call rejects instead of looping forever (e.g. on a
 * persistent 401/404 from the status endpoint). A submit response without a
 * `request_id` is also rejected up front.
 *
 * @param {string} model - fal model id, appended to QUEUE_BASE.
 * @param {object} input - JSON-serialisable model input.
 * @param {string} apiKey - fal key, sent as `Authorization: Key <apiKey>`.
 * @param {object} [pollOptions]
 * @param {number} [pollOptions.pollIntervalMs=5000] - Delay before each status poll.
 * @param {number} [pollOptions.timeoutMs=900000] - Overall polling budget.
 * @returns {Promise<object>} The parsed JSON result of the request.
 * @throws {Error} On submit failure, missing request id, "FAILED" status,
 *   result fetch failure, or timeout.
 */
async function rawQueueRun(model, input, apiKey, { pollIntervalMs = 5e3, timeoutMs = 15 * 60 * 1e3 } = {}) {
  const headers = { "Authorization": `Key ${apiKey}`, "Content-Type": "application/json" };
  const submitRes = await fetch(`${QUEUE_BASE}/${model}`, {
    method: "POST",
    headers,
    body: JSON.stringify(input)
  });
  if (!submitRes.ok) throw new Error(`${model} submit failed (${submitRes.status}): ${await submitRes.text()}`);
  const { request_id } = await submitRes.json();
  if (!request_id) throw new Error(`${model} submit returned no request_id`);
  console.log(`[fal] submitted ${model} \u2192 ${request_id}`);
  const deadline = Date.now() + timeoutMs;
  let lastSeen = "no status received";
  while (Date.now() < deadline) {
    await new Promise((r) => setTimeout(r, pollIntervalMs));
    const statusRes = await fetch(`${QUEUE_BASE}/${model}/requests/${request_id}/status`, { headers });
    if (!statusRes.ok) {
      // Treated as transient; the deadline bounds how long we keep retrying.
      lastSeen = `HTTP ${statusRes.status}`;
      continue;
    }
    const { status } = await statusRes.json();
    lastSeen = String(status);
    console.log(`[fal] ${request_id} \u2192 ${status}`);
    if (status === "FAILED") throw new Error(`${model} request ${request_id} failed`);
    if (status !== "COMPLETED") continue;
    const resultRes = await fetch(`${QUEUE_BASE}/${model}/requests/${request_id}`, { headers });
    if (!resultRes.ok) throw new Error(`Result fetch failed (${resultRes.status})`);
    return await resultRes.json();
  }
  throw new Error(`${model} request ${request_id} timed out after ${timeoutMs}ms (last status: ${lastSeen})`);
}
|
|
1521
|
+
/**
 * Reads the fal API key from the FAL_KEY environment variable.
 *
 * @returns {string} The key.
 * @throws {Error} "FAL_KEY required" when the variable is unset or empty.
 */
function getKey() {
  const { FAL_KEY: falKey } = process.env;
  if (falKey) {
    return falKey;
  }
  throw new Error("FAL_KEY required");
}
|
|
1526
|
+
/**
 * Synthesises the narration through the TTS model on the fal queue.
 *
 * @param {string} text - Text to speak.
 * @param {string} [voice="Serena (en)"] - Voice name passed to the model.
 * @returns {Promise<string>} URL of the generated audio (`audio.url` of the result).
 * @throws {Error} When FAL_KEY is missing or the queue run fails.
 */
async function generateVoiceover(text, voice = "Serena (en)") {
  console.log("[AudioGenerator] Generating voiceover...");
  const ttsInput = { text, voice, sample_rate_hertz: 48e3 };
  const falKey = getKey();
  const ttsResult = await rawQueueRun(TTS_MODEL, ttsInput, falKey);
  const { url } = ttsResult.audio;
  return url;
}
|
|
1531
|
+
/**
 * Asks the MMAudio model on the fal queue to add a background track to a video.
 *
 * The negative prompt lists speech/vocal terms so the generated track stays
 * free of voices.
 *
 * @param {string} videoUrl - URL of the video to score.
 * @param {string} mood - Prompt describing the desired audio.
 * @param {number} durationSeconds - Value sent as the model's `duration` input.
 * @returns {Promise<string>} URL of the resulting video (`video.url` of the result).
 * @throws {Error} When FAL_KEY is missing or the queue run fails.
 */
async function addBackgroundAudio(videoUrl, mood, durationSeconds) {
  console.log("[AudioGenerator] Adding background audio via MMAudio V2...");
  const audioInput = {
    video_url: videoUrl,
    prompt: mood,
    negative_prompt: "speech, voice, talking, dialogue, narration, vocals, singing, human voice, conversation, words, lyrics, announcer, commentary",
    duration: durationSeconds,
    cfg_strength: 4.5
  };
  const falKey = getKey();
  const scored = await rawQueueRun(MMAUDIO_MODEL, audioInput, falKey);
  const { url } = scored.video;
  return url;
}
|
|
1542
|
+
|
|
1543
|
+
// src/video/VideoMixer.ts
|
|
1544
|
+
var import_node_child_process = require("child_process");
|
|
1545
|
+
var import_node_fs2 = require("fs");
|
|
1546
|
+
var import_node_path2 = require("path");
|
|
1547
|
+
var import_client = require("@fal-ai/client");
|
|
1548
|
+
/**
 * Fetches `url` and writes the response body to `destPath`.
 *
 * The whole body is buffered in memory before being written synchronously.
 *
 * @param {string} url - Resource to download.
 * @param {string} destPath - File to (over)write.
 * @throws {Error} `Download failed (<status>): <url>` on a non-2xx response.
 */
async function download(url, destPath) {
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Download failed (${response.status}): ${url}`);
  }
  const payload = await response.arrayBuffer();
  const bytes = Buffer.from(payload);
  import_node_fs2.writeFileSync(destPath, bytes);
}
|
|
1553
|
+
/**
 * Downloads two clips and joins their video streams into one file with ffmpeg.
 *
 * Files are written to `outDir` (created if needed) as `clip1-<ts>.mp4`,
 * `clip2-<ts>.mp4` and `combined-<ts>.mp4`. The concat filter is used with
 * `a=0`, so the combined file carries video only.
 *
 * ffmpeg is invoked through execFileSync with an argument array rather than a
 * shell command string, so quotes, spaces or shell metacharacters in `outDir`
 * can neither break the command nor be interpreted by a shell.
 *
 * @param {string} clip1Url - URL of the first clip.
 * @param {string} clip2Url - URL of the second clip.
 * @param {string} outDir - Working/output directory.
 * @returns {Promise<string>} Path of the combined video.
 * @throws {Error} When a download fails or ffmpeg exits non-zero.
 */
async function concatenateClips(clip1Url, clip2Url, outDir) {
  import_node_fs2.mkdirSync(outDir, { recursive: true });
  const ts = Date.now();
  const p1 = import_node_path2.join(outDir, `clip1-${ts}.mp4`);
  const p2 = import_node_path2.join(outDir, `clip2-${ts}.mp4`);
  const out = import_node_path2.join(outDir, `combined-${ts}.mp4`);
  console.log("[VideoMixer] Downloading clips...");
  await Promise.all([download(clip1Url, p1), download(clip2Url, p2)]);
  console.log("[VideoMixer] Concatenating...");
  import_node_child_process.execFileSync("ffmpeg", [
    "-i", p1,
    "-i", p2,
    "-filter_complex", "[0:v][1:v]concat=n=2:v=1:a=0[v]",
    "-map", "[v]",
    "-y", out,
    "-loglevel", "error"
  ]);
  return out;
}
|
|
1567
|
+
/**
 * Uploads a local MP4 to fal storage.
 *
 * The file is read fully into memory and wrapped in a `video/mp4` Blob. Uses
 * the `fs` module already required at the top of this section instead of a
 * redundant dynamic `import("fs")` on every call.
 *
 * @param {string} localPath - Path of the file to upload.
 * @returns {Promise<string>} URL returned by fal storage.
 */
async function uploadToFal(localPath) {
  const bytes = import_node_fs2.readFileSync(localPath);
  const blob = new Blob([bytes], { type: "video/mp4" });
  const url = await import_client.fal.storage.upload(blob);
  console.log("[VideoMixer] Uploaded to fal:", url);
  return url;
}
|
|
1574
|
+
/**
 * Downloads the voiceover and mixes it over the video's existing audio.
 *
 * The video's own audio is attenuated to 0.2, the voiceover kept at 1.0, and
 * both are mixed with `amix` (`duration=first`, i.e. the video's audio length
 * wins). The video stream is copied without re-encoding. The filter graph
 * references `[0:a]`, so the input video must already have an audio stream.
 *
 * ffmpeg is invoked through execFileSync with an argument array rather than a
 * shell command string, so special characters in the paths are passed through
 * verbatim and never interpreted by a shell.
 *
 * @param {string} videoPath - Local video that already has background audio.
 * @param {string} voiceoverUrl - URL of the narration audio.
 * @param {string} outDir - Existing directory for `voiceover-<ts>.wav` and `final-<ts>.mp4`.
 * @returns {Promise<string>} Path of the final video.
 * @throws {Error} When the download fails or ffmpeg exits non-zero.
 */
async function overlayVoiceover(videoPath, voiceoverUrl, outDir) {
  const ts = Date.now();
  const wav = import_node_path2.join(outDir, `voiceover-${ts}.wav`);
  const out = import_node_path2.join(outDir, `final-${ts}.mp4`);
  console.log("[VideoMixer] Downloading voiceover...");
  await download(voiceoverUrl, wav);
  console.log("[VideoMixer] Mixing voiceover over background audio...");
  import_node_child_process.execFileSync("ffmpeg", [
    "-i", videoPath,
    "-i", wav,
    "-filter_complex", "[0:a]volume=0.2[bg];[1:a]volume=1.0[vo];[bg][vo]amix=inputs=2:duration=first[a]",
    "-map", "0:v",
    "-map", "[a]",
    "-c:v", "copy",
    "-y", out,
    "-loglevel", "error"
  ]);
  return out;
}
|
|
1586
|
+
|
|
1587
|
+
// src/video/VideoGenerator.ts
|
|
1588
|
+
// fal model id for text-to-video; VideoGenerator uses it to generate clip 1.
var T2V = "bytedance/seedance-2.0/text-to-video";
|
|
1589
|
+
// fal model id for image-to-video; VideoGenerator uses it to generate clip 2 from clip 1's last frame.
var I2V = "bytedance/seedance-2.0/image-to-video";
|
|
1590
|
+
/**
 * Assembles the input payload for a video generation request.
 *
 * Defaults (applied when the option is null or undefined): resolution
 * "720p", duration 8, aspect ratio "16:9". Audio generation is always
 * disabled. `seed` and `image_url` are only included when the corresponding
 * argument is not undefined.
 *
 * @param {string} prompt - Text prompt for the clip.
 * @param {object} opts - May carry `resolution`, `clipDurationSeconds`, `aspectRatio`.
 * @param {number} [seed] - Optional generation seed.
 * @param {string} [imageUrl] - Optional start-frame URL.
 * @returns {object} The request input object.
 */
function buildInput(prompt, opts, seed, imageUrl) {
  const input = {
    prompt,
    resolution: opts.resolution ?? "720p",
    duration: opts.clipDurationSeconds ?? 8,
    aspect_ratio: opts.aspectRatio ?? "16:9",
    generate_audio: false
  };
  if (seed !== undefined) {
    input.seed = seed;
  }
  if (imageUrl !== undefined) {
    input.image_url = imageUrl;
  }
  return input;
}
|
|
1601
|
+
/**
 * Submits a job through the fal client queue and polls until it completes.
 *
 * The status is polled every `pollIntervalMs`; once it is "COMPLETED" the
 * result is fetched and its `data` returned.
 *
 * Unlike the original, polling is bounded: if the job has not completed
 * within `timeoutMs` the call rejects instead of looping indefinitely.
 *
 * @param {string} model - fal model id.
 * @param {object} input - Model input, sent as `{ input }`.
 * @param {object} [pollOptions]
 * @param {number} [pollOptions.pollIntervalMs=5000] - Delay before each status poll.
 * @param {number} [pollOptions.timeoutMs=1800000] - Overall polling budget.
 * @returns {Promise<object>} `data` of the queue result.
 * @throws {Error} When the status is "FAILED", on timeout, or when the fal client throws.
 */
async function generate(model, input, { pollIntervalMs = 5e3, timeoutMs = 30 * 60 * 1e3 } = {}) {
  const { request_id } = await import_client2.fal.queue.submit(model, { input });
  console.log(`[fal] submitted ${model} \u2192 ${request_id}`);
  const deadline = Date.now() + timeoutMs;
  let lastStatus = "no status received";
  while (Date.now() < deadline) {
    await new Promise((r) => setTimeout(r, pollIntervalMs));
    const s = await import_client2.fal.queue.status(model, { requestId: request_id, logs: false });
    lastStatus = String(s.status);
    console.log(`[fal] ${request_id} \u2192 ${s.status}`);
    if (s.status === "FAILED") throw new Error(`Request ${request_id} failed`);
    if (s.status !== "COMPLETED") continue;
    const result = await import_client2.fal.queue.result(model, { requestId: request_id });
    return result.data;
  }
  throw new Error(`Request ${request_id} timed out after ${timeoutMs}ms (last status: ${lastStatus})`);
}
|
|
1614
|
+
/**
 * Downloads a video and saves its final frame as a JPEG.
 *
 * The video is written to a temporary `clip1-raw-<ts>.mp4` in `outDir`,
 * ffmpeg seeks to 0.1 s before the end (`-sseof -0.1`) and writes one frame
 * to `last-frame-<ts>.jpg`. The temporary MP4 is removed afterwards, even
 * when ffmpeg fails.
 *
 * ffmpeg is invoked through execFileSync with an argument array rather than a
 * shell command string, so special characters in `outDir` are passed through
 * verbatim and never interpreted by a shell.
 *
 * @param {string} videoUrl - URL of the clip.
 * @param {string} outDir - Existing directory for the temporary and output files.
 * @returns {Promise<string>} Path of the extracted JPEG.
 * @throws {Error} When the download fails or ffmpeg exits non-zero.
 */
async function extractLastFrame(videoUrl, outDir) {
  const ts = Date.now();
  const mp4Path = import_node_path3.join(outDir, `clip1-raw-${ts}.mp4`);
  const jpgPath = import_node_path3.join(outDir, `last-frame-${ts}.jpg`);
  const res = await fetch(videoUrl);
  if (!res.ok) throw new Error(`Failed to download clip 1 (${res.status})`);
  import_node_fs3.writeFileSync(mp4Path, Buffer.from(await res.arrayBuffer()));
  try {
    import_node_child_process2.execFileSync("ffmpeg", [
      "-sseof", "-0.1",
      "-i", mp4Path,
      "-vframes", "1",
      "-y", jpgPath,
      "-loglevel", "error"
    ]);
  } finally {
    try {
      import_node_fs3.unlinkSync(mp4Path);
    } catch {
      // Best-effort cleanup: a leftover temp file must not mask the real outcome.
    }
  }
  return jpgPath;
}
|
|
1631
|
+
/**
 * Produces a narrated two-clip video for a question/answer pair.
 */
var VideoGenerator = class {
  /**
   * Configures the fal client with the given key, or FAL_KEY from the
   * environment when no key is passed.
   *
   * NOTE(review): the audio steps obtain their key via getKey(), which reads
   * only the FAL_KEY environment variable — a key passed here without
   * FAL_KEY being set would still fail at the voiceover step. Confirm
   * whether the key should be forwarded.
   *
   * @param {string} [apiKey] - fal API key.
   * @throws {Error} "FAL_KEY is required" when neither source provides a key.
   */
  constructor(apiKey) {
    const key = apiKey ?? process.env["FAL_KEY"];
    if (!key) throw new Error("FAL_KEY is required");
    import_client2.fal.config({ credentials: key });
  }
  /**
   * Runs the seven-step pipeline:
   *  1. build prompts, 2. generate clip 1 (text-to-video), 3. extract and
   *  upload clip 1's last frame, 4. generate clip 2 from that frame
   *  (image-to-video, seed + 1 when a seed was given), 5. concatenate the
   *  clips while generating the voiceover in parallel, 6. add background
   *  audio to the uploaded combined video, 7. download that video and overlay
   *  the voiceover.
   *
   * The download in step 7 is now checked for a successful HTTP status;
   * previously an error response body would have been written to disk as the
   * "video" and handed to ffmpeg.
   *
   * @param {string} question - PAA question.
   * @param {string} answer - Answer text.
   * @param {object} [opts] - Reads `outputDir`, `seed`, `ttsVoice`,
   *   `clipDurationSeconds`; the whole object is also passed to buildInput.
   *   `outputDir` defaults to a fresh `paa-video-<ts>` directory under the OS temp dir.
   * @returns {Promise<object>} Clip URLs, final video path, seed reported for
   *   clip 1, and the prompts/voiceover/audio mood used.
   * @throws {Error} When any generation, download or ffmpeg step fails.
   */
  async generateClipPair(question, answer, opts = {}) {
    const outDir = opts.outputDir ?? import_node_path3.join(import_node_os.tmpdir(), `paa-video-${Date.now()}`);
    import_node_fs3.mkdirSync(outDir, { recursive: true });
    console.log("\n[1/7] Generating prompts via QWEN 3.6...");
    const prompts = await buildClipPrompts(question, answer);
    console.log("  Voiceover:", prompts.voiceover);
    console.log("  Audio mood:", prompts.audioMood);
    console.log("\n[2/7] Generating clip 1 (text-to-video)...");
    const result1 = await generate(T2V, buildInput(prompts.clip1, opts, opts.seed));
    console.log("\n[3/7] Extracting last frame \u2192 clip 2 start...");
    const jpgPath = await extractLastFrame(result1.video.url, outDir);
    const imageBlob = new Blob([import_node_fs3.readFileSync(jpgPath)], { type: "image/jpeg" });
    const frameUrl = await import_client2.fal.storage.upload(imageBlob);
    try {
      import_node_fs3.unlinkSync(jpgPath);
    } catch {
      // Best-effort cleanup of the uploaded frame; failure here is harmless.
    }
    console.log("\n[4/7] Generating clip 2 (image-to-video from last frame)...");
    const seed2 = opts.seed !== void 0 ? opts.seed + 1 : void 0;
    const result2 = await generate(I2V, buildInput(prompts.clip2, opts, seed2, frameUrl));
    console.log("\n[5/7] Concatenating clips + generating voiceover (parallel)...");
    const [combinedPath, voiceoverUrl] = await Promise.all([
      concatenateClips(result1.video.url, result2.video.url, outDir),
      generateVoiceover(prompts.voiceover, opts.ttsVoice)
    ]);
    console.log("\n[6/7] Adding background audio via MMAudio V2...");
    const falVideoUrl = await uploadToFal(combinedPath);
    const totalDuration = (opts.clipDurationSeconds ?? 8) * 2;
    const videoWithAudioUrl = await addBackgroundAudio(falVideoUrl, prompts.audioMood, totalDuration);
    console.log("\n[7/7] Overlaying voiceover on final video...");
    const videoWithAudioPath = import_node_path3.join(outDir, `with-bg-audio-${Date.now()}.mp4`);
    const bgRes = await fetch(videoWithAudioUrl);
    if (!bgRes.ok) {
      throw new Error(`Failed to download video with background audio (${bgRes.status}): ${videoWithAudioUrl}`);
    }
    import_node_fs3.writeFileSync(videoWithAudioPath, Buffer.from(await bgRes.arrayBuffer()));
    const finalVideoPath = await overlayVoiceover(videoWithAudioPath, voiceoverUrl, outDir);
    return {
      clip1Url: result1.video.url,
      clip2Url: result2.video.url,
      finalVideoPath,
      seed: result1.seed,
      promptClip1: prompts.clip1,
      promptClip2: prompts.clip2,
      voiceover: prompts.voiceover,
      audioMood: prompts.audioMood
    };
  }
};
|
|
1683
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
1684
|
+
// The `0 &&` guard short-circuits, so this assignment never executes; the statement only spells out the export names.
0 && (module.exports = { VideoGenerator, buildClipPrompts, harvest });
|
|
1689
|
+
//# sourceMappingURL=index.cjs.map
|