mcp-scraper 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -0
- package/dist/bin/api-server.cjs +15730 -7780
- package/dist/bin/api-server.cjs.map +1 -1
- package/dist/bin/api-server.js +3 -3
- package/dist/bin/mcp-stdio-server.cjs +300 -110
- package/dist/bin/mcp-stdio-server.cjs.map +1 -1
- package/dist/bin/mcp-stdio-server.js +1 -1
- package/dist/bin/paa-harvest.cjs +1537 -165
- package/dist/bin/paa-harvest.cjs.map +1 -1
- package/dist/bin/paa-harvest.js +1 -1
- package/dist/{chunk-ZBP4RHNW.js → chunk-4743MZHT.js} +298 -106
- package/dist/chunk-4743MZHT.js.map +1 -0
- package/dist/{chunk-LXZDJJXR.js → chunk-D4CJBZBY.js} +426 -29
- package/dist/chunk-D4CJBZBY.js.map +1 -0
- package/dist/chunk-HERFK7W6.js +2781 -0
- package/dist/chunk-HERFK7W6.js.map +1 -0
- package/dist/chunk-Y74EXABN.js +295 -0
- package/dist/chunk-Y74EXABN.js.map +1 -0
- package/dist/{db-IOYMX64U.js → db-YWCNHBLH.js} +36 -4
- package/dist/index.cjs +1660 -237
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +169 -2
- package/dist/index.d.ts +169 -2
- package/dist/index.js +120 -69
- package/dist/index.js.map +1 -1
- package/dist/server-N7Q6H4OR.js +11612 -0
- package/dist/server-N7Q6H4OR.js.map +1 -0
- package/dist/{worker-3ECJHPRE.js → worker-D4D2YQTA.js} +44 -9
- package/dist/worker-D4D2YQTA.js.map +1 -0
- package/package.json +17 -5
- package/dist/chunk-4API3ZCT.js +0 -1387
- package/dist/chunk-4API3ZCT.js.map +0 -1
- package/dist/chunk-LXZDJJXR.js.map +0 -1
- package/dist/chunk-ZBP4RHNW.js.map +0 -1
- package/dist/server-63DR2HE5.js +0 -6062
- package/dist/server-63DR2HE5.js.map +0 -1
- package/dist/worker-3ECJHPRE.js.map +0 -1
- /package/dist/{db-IOYMX64U.js.map → db-YWCNHBLH.js.map} +0 -0
|
@@ -0,0 +1,2781 @@
|
|
|
1
|
+
// src/schemas.ts
|
|
2
|
+
import { z } from "zod";
|
|
3
|
+
/**
 * Validation schema for the options of a single harvest run.
 *
 * Only `query` is mandatory; every field declared with `.default(...)` is
 * filled in by zod when the caller leaves it out.
 */
var HarvestOptionsSchema = z.object({
  // Search input.
  query: z.string().min(1),
  location: z.string().optional(),
  // Two-character codes (presumably Google's gl/hl country and language parameters — confirm).
  gl: z.string().length(2).default("us"),
  hl: z.string().length(2).default("en"),
  device: z.enum(["desktop", "mobile"]).default("desktop"),

  // Proxy selection; proxyZip must be exactly five digits when given.
  proxyMode: z.enum(["location", "configured", "none"]).default("location"),
  proxyZip: z.string().regex(/^\d{5}$/).optional(),

  debug: z.boolean().default(false),

  // Limits.
  depth: z.number().int().min(1).max(30).default(3),
  maxQuestions: z.number().int().min(1).max(1000).default(100),

  // Browser / session settings.
  headless: z.boolean().default(false),
  profileDir: z.string().optional(),
  proxy: z.string().url().optional(),
  kernelApiKey: z.string().optional(),
  kernelProxyId: z.string().optional(),
  kernelProxyResolution: z.unknown().optional(),

  // Output.
  outputDir: z.string().default("./paa-output"),
  format: z.enum(["json", "csv", "both"]).default("both"),
  serpOnly: z.boolean().default(false),
  pages: z.number().int().min(1).max(2).default(1)
});
|
|
25
|
+
/**
 * Validation schema for the options of a Maps place lookup.
 *
 * `businessName` and `location` are both required and must be non-empty.
 */
var MapsPlaceOptionsSchema = z.object({
  // Which place to look up.
  businessName: z.string().min(1),
  location: z.string().min(1),
  gl: z.string().length(2).default("us"),
  hl: z.string().length(2).default("en"),

  // Review collection is opt-in and capped at 500.
  includeReviews: z.boolean().default(false),
  maxReviews: z.number().int().min(1).max(500).default(50),

  // Browser / session settings.
  kernelApiKey: z.string().optional(),
  kernelProxyId: z.string().optional(),
  headless: z.boolean().default(true)
});
|
|
36
|
+
/**
 * Shape of one raw PAA item: a non-empty question plus optional answer and
 * source details.
 */
var RawPAAItemSchema = z.object({
  question: z.string().min(1),
  answer: z.string().optional(),
  // Optional source attribution fields.
  sourceTitle: z.string().optional(),
  sourceSite: z.string().optional(),
  sourceCite: z.string().optional()
});
|
|
43
|
+
/**
 * Shape of the raw Maps overview record.
 *
 * Every field is required but nullable, and every value (including `rating`
 * and `reviewCount`) is carried as a string.
 */
var RawMapsOverviewSchema = z.object({
  name: z.string().nullable(),
  rating: z.string().nullable(),
  reviewCount: z.string().nullable(),
  category: z.string().nullable(),
  address: z.string().nullable(),
  hoursSummary: z.string().nullable(),
  phone: z.string().nullable(),
  phoneDisplay: z.string().nullable(),
  website: z.string().nullable(),
  plusCode: z.string().nullable(),
  bookingUrl: z.string().nullable()
});
|
|
56
|
+
/** Shape of one raw opening-hours row: a day label and its hours text. */
var RawMapsHoursRowSchema = z.object({
  day: z.string(),
  hours: z.string()
});
|
|
60
|
+
/**
 * Shape of the raw review statistics: a star histogram and a list of review
 * topics. Counts are carried as strings in both lists.
 */
var RawMapsReviewStatsSchema = z.object({
  reviewHistogram: z.array(
    z.object({
      stars: z.number(),
      count: z.string()
    })
  ),
  reviewTopics: z.array(
    z.object({
      label: z.string(),
      count: z.string()
    })
  )
});
|
|
70
|
+
/**
 * Shape of one raw review card. Only `reviewId` is guaranteed to be a string;
 * the remaining fields are required but may be null.
 */
var RawMapsReviewCardSchema = z.object({
  reviewId: z.string(),
  author: z.string().nullable(),
  stars: z.string().nullable(),
  date: z.string().nullable(),
  text: z.string().nullable(),
  ownerResponse: z.string().nullable()
});
|
|
78
|
+
/** Shape of one raw "About" attribute: the section it belongs to and the attribute text. */
var RawMapsAboutAttributeSchema = z.object({
  section: z.string(),
  attribute: z.string()
});
|
|
82
|
+
|
|
83
|
+
// src/driver/BrowserDriver.ts
|
|
84
|
+
import { chromium } from "playwright-extra";
|
|
85
|
+
import StealthPlugin from "puppeteer-extra-plugin-stealth";
|
|
86
|
+
import { chromium as playwrightChromium } from "playwright";
|
|
87
|
+
import Kernel from "@onkernel/sdk";
|
|
88
|
+
|
|
89
|
+
// src/selectors.ts
|
|
90
|
+
/**
 * Selectors, attribute names and class names used to locate PAA items on a
 * results page, plus the selector list used to detect a CAPTCHA page.
 */
var PAASelectors = {
  // Containers and items.
  container: ".eJH8qe.adDDi",
  dataInitq: "[data-initq]",
  item: ".related-question-pair",

  // Attribute names (not selectors) read from an item.
  itemDataQ: "data-q",
  itemDataInitQ: "data-initq",

  // Pieces inside an item.
  itemQuestionEl: ".JlqpRe",
  answerContainer: ".bCOlv, .hgKElc, .wDYxhc, .LGOjhe, .fo7IQd, .fmW3u",
  sourceTitle: "h3",
  sourceSite: ".VuuXrf",
  sourceCite: "cite",

  // Interaction.
  clickTarget: ".JlqpRe",
  expandedClass: "aoRk1c",

  // Any match of this selector list is treated as a CAPTCHA page.
  captchaMarker: '#captcha-form, #recaptcha, form[action*="/sorry/"], .g-recaptcha, [data-sitekey]'
};
|
|
105
|
+
/** Selectors for the video section: its container, its heading and each video link. */
var VideoSelectors = {
  container: 'div[jscontroller="HWk0Gf"]',
  sectionHeading: '.mgAbYb[role="heading"]',
  item: "a.rIRoqf"
};
|
|
110
|
+
/**
 * Settings for short-video results: a `udm` value, the item selector, a
 * pattern matching `m:ss`-style durations, and the recognised platform names.
 */
var ShortVideoSelectors = {
  udm: "39",
  item: "a.rIRoqf",
  // Digits, a colon, digits — nothing else.
  durationPattern: /^\d+:\d+$/,
  platforms: ["YouTube", "TikTok", "Instagram", "Facebook", "X"]
};
|
|
116
|
+
/** Selectors for the forum section: the section, each item link, and its title and source. */
var ForumSelectors = {
  section: ".ULSxyf",
  item: "a.KYg7td.INpicf",
  title: ".hyYc0c",
  source: ".K4ETW"
};
|
|
122
|
+
/**
 * Selectors for the "What people are saying" section. `sectionTag` and
 * `sectionHeadingText` identify the section; the rest address parts of a card.
 */
var WhatPeopleSayingSelectors = {
  // Section identification: a tag name and the heading text to match.
  sectionTag: "g-section-with-header",
  sectionHeadingText: "What people are saying",

  // Card and its link.
  card: '.dRzkFf[role="listitem"]',
  cardLink: 'a.WlydOe[jsname="YKoRaf"]',

  // Card contents.
  titleH1: "h1.WQWxe",
  titleDiv: ".eAaXgc",
  popularCommentLabel: ".qgdis",
  source: ".sTl1Td",
  platformBadge: ".appd0, .KrMNbf",
  ytChannel: ".sjVJQd",
  ytDate: ".PLq9Je",
  authorNote: ".nDgy9d"
};
|
|
136
|
+
/**
 * Selectors for the AI Overview block: current and legacy root elements,
 * structural wrappers, the "show more" button, and the sources/disclaimer areas.
 */
var AIOverviewSelectors = {
  // Root candidates.
  root: "[data-lhcontainer][data-streaming-container][eid]",
  legacyRoot: '[data-hveid="CBMQAA"]',

  // Structure.
  wrapper: ".Fgyi2e",
  controller: '[jscontroller="AkrxPe"]',
  contentSubtree: '[data-subtree="mfc"]',
  header: ".heWuVc",
  heading: ".Fzsovc.cwYVJe.RJPOee",

  // Controls and trailing sections.
  showMoreButton: '[aria-label="Show more AI Overview"]',
  sourcesPanel: ".OZ9ddf.WAUd4",
  disclaimer: ".DuQANe.MSJHRb"
};
|
|
148
|
+
/** Selectors for the AI Mode block: its root, its wrapper and the citation links inside it. */
var AIModeSelectors = {
  root: '[data-hveid="CAUQAA"]',
  wrapper: ".Fgyi2e",
  citations: '.Fgyi2e [data-hveid] a[jsname="pxBnId"]'
};
|
|
153
|
+
/**
 * Selectors for an organic result and the parts read from it (title, site
 * name, cite, snippet, rating details).
 */
var OrganicSelectors = {
  result: ".wHYlTd.tF2Cxc",

  // Text parts of a result.
  title: "h3.LC20lb",
  siteName: ".VuuXrf",
  cite: "cite.tjvcx",
  snippet: ".VwiC3b",
  redditCite: "cite.qLRx3b",

  // Rating details.
  ratingWrap: ".Y0A0hc",
  ratingValue: ".yi40Hd",
  reviewCount: ".RDApEe"
};
|
|
164
|
+
/**
 * Selectors for the local pack. `headingText` is plain text to match, not a
 * selector; the remaining entries address a business card and its parts.
 */
var LocalPackSelectors = {
  headingText: "Businesses",
  card: ".w7Dbne",
  name: ".OSrXXb",
  ratingValue: ".yi40Hd",
  reviewCount: ".RDApEe"
};
|
|
171
|
+
/**
 * Selectors for a Maps place page: rating summary, opening-hours tables,
 * tabs, the review scroll pane and the parts of a review card.
 */
var MapsSelectors = {
  ratingAndCount: "div.F7nice",
  hoursTable: "table.eK4R0e",

  // Review pane, with a looser fallback.
  reviewScrollPane: 'div.m6QErb[tabindex="-1"]',
  reviewScrollPaneFallback: '[role="main"] div[tabindex="-1"]',

  // Parts of a review card.
  reviewCardAuthor: "div.d4r55, span.d4r55, span.RPZfBb",
  reviewCardDate: "span.rsqaWe",
  reviewCardText: "span.wiI7pd",
  reviewCardOwnerBlock: "div.CDe7pd",

  // Alternative hours table, matched by aria-label.
  hoursTableAlt: 'table[aria-label*="Hour"]',

  reviewCard: "[data-review-id]",
  reviewStars: '[role="img"][aria-label*="star"]',

  // Tabs.
  reviewTab: 'button[role="tab"][aria-label*="Review"]',
  aboutTab: 'button[role="tab"][aria-label*="About"]',

  // Buttons inside a review card that expand truncated text.
  expandReview: '[data-review-id] button[aria-label*="See more"], [data-review-id] button.w8nwRe'
};
|
|
187
|
+
|
|
188
|
+
// src/errors.ts
|
|
189
|
+
/** User-facing remediation text shown when a CAPTCHA is hit. */
var RECAPTCHA_INSTRUCTIONS =
  "Google returned a CAPTCHA. Run with --headless=false to re-warm the browser profile, then retry.";
|
|
190
|
+
/**
 * Remove vendor naming ("kernel", "kernel.sh") from a message.
 *
 * Replacements, applied in order (all case-insensitive):
 *   1. "kernel sessions" / "kernel.sh sessions" -> "sessions"
 *   2. "kernel session"  / "kernel.sh session"  -> "this session"
 *   3. any remaining "kernel.sh"                -> "the service"
 *   4. any remaining whole word "kernel"        -> "the service"
 * Runs of spaces are then collapsed and the result is trimmed.
 *
 * The plural form is handled before the singular one. Previously the first
 * pattern was `sessions?`, which also consumed the singular and left the
 * "this session" replacement unreachable.
 *
 * @param {string} message - Text that may mention the vendor.
 * @returns {string} The message with vendor names replaced.
 */
function sanitizeVendorName(message) {
  return message
    // Plural first so the singular pattern below only sees true singulars.
    .replace(/kernel(?:\.sh)?\s+sessions/gi, "sessions")
    .replace(/kernel(?:\.sh)?\s+session/gi, "this session")
    .replace(/kernel\.sh/gi, "the service")
    .replace(/\bkernel\b/gi, "the service")
    .replace(/ +/g, " ")
    .trim();
}
|
|
193
|
+
/**
 * Error raised when a CAPTCHA page is detected.
 *
 * The message is "CAPTCHA detected. " followed by the instructions, which are
 * also kept on the instance as `instructions`.
 */
var CaptchaError = class extends Error {
  instructions;
  name = "CaptchaError";

  /**
   * @param {string} instructions - Remediation text appended to the message.
   */
  constructor(instructions) {
    super(`CAPTCHA detected. ${instructions}`);
    this.instructions = instructions;
  }
};
|
|
201
|
+
/**
 * Error raised when navigation or extraction fails.
 *
 * An optional underlying error can be attached and is exposed as `cause`.
 */
var ExtractionError = class extends Error {
  cause;
  name = "ExtractionError";

  /**
   * @param {string} message - Description of the failure.
   * @param {unknown} [cause] - The underlying error, if any.
   */
  constructor(message, cause) {
    super(message);
    this.cause = cause;
  }
};
|
|
209
|
+
/**
 * Error raised when a request is aborted before the harvest finishes.
 * The message defaults to "Request aborted before harvest completed".
 */
var RequestAbortedError = class extends Error {
  name = "RequestAbortedError";

  /**
   * @param {string} [message] - Optional override for the default message.
   */
  constructor(message = "Request aborted before harvest completed") {
    super(message);
  }
};
|
|
215
|
+
|
|
216
|
+
// src/driver/BrowserDriver.ts
|
|
217
|
+
// Register the stealth plugin on the playwright-extra chromium launcher
// before any browser is started through it.
chromium.use(StealthPlugin());

// Default user-agent strings, used when the launch config supplies none.
var DESKTOP_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36";
var MOBILE_USER_AGENT = "Mozilla/5.0 (iPhone; CPU iPhone OS 17_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Mobile/15E148 Safari/604.1";

// Kernel browser lifetime (seconds) when KERNEL_BROWSER_TIMEOUT_SECONDS is unset or invalid.
var DEFAULT_KERNEL_BROWSER_TIMEOUT_SECONDS = 180;

// Upper bounds (milliseconds) for the two cleanup calls made on close.
var KERNEL_BROWSER_CLOSE_TIMEOUT_MS = 3000;
var KERNEL_SESSION_DELETE_TIMEOUT_MS = 5000;
|
|
223
|
+
/**
 * Read a positive integer from an environment variable.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value returned when the variable is unset, empty,
 *   or does not convert (via `Number`) to an integer greater than zero.
 * @returns {number} The parsed integer, or `fallback`.
 */
function positiveIntFromEnv(name, fallback) {
  const value = process.env[name];
  if (value) {
    const candidate = Number(value);
    if (Number.isInteger(candidate) && candidate > 0) {
      return candidate;
    }
  }
  return fallback;
}
|
|
229
|
+
/**
 * Return the last six characters of a proxy id, for logging without exposing
 * the whole id.
 *
 * The value may come straight from an API response, so anything that is not a
 * non-empty string yields `null` instead of throwing on a missing `.slice`.
 *
 * @param {unknown} proxyId - Proxy id, or any missing/other value.
 * @returns {string | null} Up to six trailing characters, or `null`.
 */
function proxyIdSuffix(proxyId) {
  if (typeof proxyId !== "string" || proxyId.length === 0) {
    return null;
  }
  return proxyId.slice(-6);
}
|
|
232
|
+
/**
 * Turn any thrown value into display text.
 *
 * @param {unknown} err - The caught value.
 * @returns {string} `err.message` for Error instances, otherwise `String(err)`.
 */
function errorText(err) {
  if (err instanceof Error) {
    return err.message;
  }
  return String(err);
}
|
|
235
|
+
/**
 * Build the browser-context options object from a launch config.
 *
 * `viewport`, `locale` and `userAgent` are always present. The user agent
 * falls back to the mobile or desktop default depending on `config.isMobile`.
 * `deviceScaleFactor` is copied only when truthy; `isMobile` and `hasTouch`
 * are copied whenever they are not `undefined`.
 *
 * @param {object} config - Launch config.
 * @returns {object} Options for creating a browser context.
 */
function rankCheckContextOptions(config) {
  const options = {
    viewport: config.viewport,
    locale: config.locale,
    userAgent: config.userAgent ?? (config.isMobile ? MOBILE_USER_AGENT : DESKTOP_USER_AGENT)
  };
  if (config.deviceScaleFactor) {
    options.deviceScaleFactor = config.deviceScaleFactor;
  }
  if (config.isMobile !== undefined) {
    options.isMobile = config.isMobile;
  }
  if (config.hasTouch !== undefined) {
    options.hasTouch = config.hasTouch;
  }
  return options;
}
|
|
245
|
+
/**
 * Await `promise`, but reject if it has not settled within `timeoutMs`.
 *
 * The timer is always cleared once the race settles, whichever side wins.
 * The underlying operation is not cancelled on timeout; only the wait ends.
 *
 * @param {Promise<T> | T} promise - The operation to wait for.
 * @param {number} timeoutMs - Maximum time to wait, in milliseconds.
 * @param {string} label - Prefix for the timeout error message.
 * @returns {Promise<T>} The settled value of `promise`.
 * @throws {Error} `${label} timed out after ${timeoutMs}ms` when the timer wins.
 * @template T
 */
async function withTimeout(promise, timeoutMs, label) {
  let timer;
  const deadline = new Promise((_resolve, reject) => {
    timer = setTimeout(() => {
      reject(new Error(`${label} timed out after ${timeoutMs}ms`));
    }, timeoutMs);
  });
  try {
    return await Promise.race([promise, deadline]);
  } finally {
    if (timer) clearTimeout(timer);
  }
}
|
|
258
|
+
/**
 * Normalise a channel reference into the URL of that channel's "videos" tab.
 *
 * Accepted inputs:
 *   - a YouTube URL (with or without scheme; `www.` / `m.` hosts allowed)
 *     whose path starts with `@handle`, `channel/<id>`, `c/<name>` or
 *     `user/<name>`;
 *   - a bare channel id matching `UC` followed by 20+ word/hyphen characters;
 *   - a bare handle, with or without the leading `@`.
 * Leading/trailing slashes and a trailing `/videos` on bare inputs are ignored.
 *
 * @param {string} channelInput - Handle, channel id, or channel URL.
 * @returns {string} `https://www.youtube.com/.../videos`.
 * @throws {Error} When the input is missing, empty, not on youtube.com, or
 *   not a recognised channel form. A missing or non-string input is reported
 *   as "channelHandle is required" rather than surfacing a TypeError from
 *   calling `.trim()` on it.
 * @throws {TypeError} From `new URL` when a URL-like input cannot be parsed.
 */
function buildYouTubeChannelVideosUrl(channelInput) {
  const raw = typeof channelInput === "string" ? channelInput.trim() : "";
  if (!raw) throw new Error("channelHandle is required");

  const hasScheme = /^https?:\/\//i.test(raw);
  const urlLike = hasScheme || /^(www\.|m\.)?youtube\.com\//i.test(raw);
  if (urlLike) {
    const parsed = new URL(hasScheme ? raw : `https://${raw}`);
    const host = parsed.hostname.replace(/^www\./, "").replace(/^m\./, "").toLowerCase();
    if (host !== "youtube.com") throw new Error("channel URL must be on youtube.com");

    // Only the first two path segments decide the channel form.
    const segments = parsed.pathname.split("/").filter(Boolean);
    const first = segments[0] ?? "";
    const second = segments[1] ?? "";
    if (first.startsWith("@")) return `https://www.youtube.com/${first}/videos`;
    if (first === "channel" && second) return `https://www.youtube.com/channel/${second}/videos`;
    if ((first === "c" || first === "user") && second) return `https://www.youtube.com/${first}/${second}/videos`;
    throw new Error("channel URL must be a YouTube handle, /channel/UC..., /c/..., or /user/... URL");
  }

  const stripped = raw.replace(/^\/+/, "").replace(/\/+$/, "");
  const withoutVideos = stripped.replace(/\/videos$/i, "");
  if (/^UC[\w-]{20,}$/.test(withoutVideos)) {
    return `https://www.youtube.com/channel/${withoutVideos}/videos`;
  }

  const handle = withoutVideos.startsWith("@") ? withoutVideos : `@${withoutVideos}`;
  if (!/^@[\w.-]+$/.test(handle)) {
    throw new Error("channelHandle must be an @handle, UC channel ID, or YouTube channel URL");
  }
  return `https://www.youtube.com/${handle}/videos`;
}
|
|
285
|
+
var BrowserDriver = class {
|
|
286
|
+
browser = null;
|
|
287
|
+
context = null;
|
|
288
|
+
page = null;
|
|
289
|
+
kernelClient = null;
|
|
290
|
+
kernelSessionId = null;
|
|
291
|
+
debugEnabled = false;
|
|
292
|
+
debugSnapshot = {
|
|
293
|
+
kernel: null,
|
|
294
|
+
context: null,
|
|
295
|
+
networkLocation: null,
|
|
296
|
+
serpNavigation: null
|
|
297
|
+
};
|
|
298
|
+
async launch(config) {
|
|
299
|
+
this.debugEnabled = config.debug === true;
|
|
300
|
+
const proxyMode = config.proxyMode ?? (config.kernelProxyId ? "configured" : "none");
|
|
301
|
+
const device = config.isMobile ? "mobile" : "desktop";
|
|
302
|
+
this.debugSnapshot = {
|
|
303
|
+
kernel: null,
|
|
304
|
+
context: {
|
|
305
|
+
viewport: config.viewport,
|
|
306
|
+
locale: config.locale,
|
|
307
|
+
device,
|
|
308
|
+
userAgent: config.userAgent ?? (config.isMobile ? MOBILE_USER_AGENT : DESKTOP_USER_AGENT),
|
|
309
|
+
deviceScaleFactor: config.deviceScaleFactor ?? null,
|
|
310
|
+
isMobile: config.isMobile === true,
|
|
311
|
+
hasTouch: config.hasTouch === true
|
|
312
|
+
},
|
|
313
|
+
networkLocation: null,
|
|
314
|
+
serpNavigation: null
|
|
315
|
+
};
|
|
316
|
+
if (config.kernelApiKey) {
|
|
317
|
+
this.kernelClient = new Kernel({ apiKey: config.kernelApiKey });
|
|
318
|
+
const timeoutSeconds = positiveIntFromEnv("KERNEL_BROWSER_TIMEOUT_SECONDS", DEFAULT_KERNEL_BROWSER_TIMEOUT_SECONDS);
|
|
319
|
+
const kernelBrowser = await this.kernelClient.browsers.create({
|
|
320
|
+
stealth: true,
|
|
321
|
+
timeout_seconds: timeoutSeconds,
|
|
322
|
+
...config.kernelProxyId ? { proxy_id: config.kernelProxyId } : {}
|
|
323
|
+
});
|
|
324
|
+
this.kernelSessionId = kernelBrowser.session_id;
|
|
325
|
+
let defaultProxyDisabled = null;
|
|
326
|
+
let defaultProxyDisableError = null;
|
|
327
|
+
if (proxyMode === "none") {
|
|
328
|
+
try {
|
|
329
|
+
await withTimeout(
|
|
330
|
+
this.kernelClient.browsers.update(this.kernelSessionId, { disable_default_proxy: true }),
|
|
331
|
+
5e3,
|
|
332
|
+
`Kernel session ${this.kernelSessionId} disable default proxy`
|
|
333
|
+
);
|
|
334
|
+
defaultProxyDisabled = true;
|
|
335
|
+
} catch (err) {
|
|
336
|
+
defaultProxyDisabled = false;
|
|
337
|
+
defaultProxyDisableError = errorText(err);
|
|
338
|
+
}
|
|
339
|
+
}
|
|
340
|
+
const kernelDebug = {
|
|
341
|
+
sessionId: this.kernelSessionId,
|
|
342
|
+
proxyMode,
|
|
343
|
+
requestedProxyIdPresent: Boolean(config.kernelProxyId),
|
|
344
|
+
requestedProxyIdSuffix: proxyIdSuffix(config.kernelProxyId),
|
|
345
|
+
createdProxyIdPresent: typeof kernelBrowser.proxy_id === "string" ? Boolean(kernelBrowser.proxy_id) : null,
|
|
346
|
+
createdProxyIdSuffix: proxyIdSuffix(kernelBrowser.proxy_id),
|
|
347
|
+
retrievedProxyIdPresent: null,
|
|
348
|
+
retrievedProxyIdSuffix: null,
|
|
349
|
+
retrievedProxyIdMatchesRequested: null,
|
|
350
|
+
defaultProxyDisabled,
|
|
351
|
+
defaultProxyDisableError,
|
|
352
|
+
proxyResolution: config.kernelProxyResolution ?? null,
|
|
353
|
+
timeoutSeconds,
|
|
354
|
+
stealth: typeof kernelBrowser.stealth === "boolean" ? kernelBrowser.stealth : null,
|
|
355
|
+
profilePresent: null,
|
|
356
|
+
poolPresent: null,
|
|
357
|
+
retrieveError: null
|
|
358
|
+
};
|
|
359
|
+
this.debugSnapshot.kernel = kernelDebug;
|
|
360
|
+
console.info(JSON.stringify({
|
|
361
|
+
event: "kernel_browser_created",
|
|
362
|
+
kernel_session_id: this.kernelSessionId,
|
|
363
|
+
timeout_seconds: timeoutSeconds,
|
|
364
|
+
proxy_mode: proxyMode,
|
|
365
|
+
proxy_id_present: Boolean(config.kernelProxyId),
|
|
366
|
+
proxy_resolution_source: config.kernelProxyResolution?.source
|
|
367
|
+
}));
|
|
368
|
+
if (this.debugEnabled) {
|
|
369
|
+
await this.populateKernelRetrieveDebug(kernelDebug, config.kernelProxyId);
|
|
370
|
+
}
|
|
371
|
+
this.browser = await playwrightChromium.connectOverCDP(kernelBrowser.cdp_ws_url);
|
|
372
|
+
this.context = await this.browser.newContext(rankCheckContextOptions(config));
|
|
373
|
+
await this.installEsbuildHelperShims(this.context);
|
|
374
|
+
this.page = await this.context.newPage();
|
|
375
|
+
await this.page.setViewportSize(config.viewport);
|
|
376
|
+
if (this.debugEnabled) {
|
|
377
|
+
this.debugSnapshot.networkLocation = await this.captureBrowserNetworkLocation();
|
|
378
|
+
}
|
|
379
|
+
return;
|
|
380
|
+
}
|
|
381
|
+
const launchOpts = {
|
|
382
|
+
headless: config.headless,
|
|
383
|
+
proxy: config.proxy ? { server: config.proxy } : void 0
|
|
384
|
+
};
|
|
385
|
+
const ctxOpts = rankCheckContextOptions(config);
|
|
386
|
+
if (config.profileDir) {
|
|
387
|
+
this.context = await chromium.launchPersistentContext(config.profileDir, {
|
|
388
|
+
...launchOpts,
|
|
389
|
+
...ctxOpts
|
|
390
|
+
});
|
|
391
|
+
await this.installEsbuildHelperShims(this.context);
|
|
392
|
+
this.page = await this.context.newPage();
|
|
393
|
+
} else {
|
|
394
|
+
this.browser = await chromium.launch(launchOpts);
|
|
395
|
+
this.context = await this.browser.newContext(ctxOpts);
|
|
396
|
+
await this.installEsbuildHelperShims(this.context);
|
|
397
|
+
this.page = await this.context.newPage();
|
|
398
|
+
}
|
|
399
|
+
if (this.debugEnabled) {
|
|
400
|
+
this.debugSnapshot.networkLocation = await this.captureBrowserNetworkLocation();
|
|
401
|
+
}
|
|
402
|
+
}
|
|
403
|
+
async populateKernelRetrieveDebug(kernelDebug, requestedProxyId) {
|
|
404
|
+
if (!this.kernelClient || !this.kernelSessionId) return;
|
|
405
|
+
try {
|
|
406
|
+
const retrieved = await withTimeout(
|
|
407
|
+
this.kernelClient.browsers.retrieve(this.kernelSessionId),
|
|
408
|
+
5e3,
|
|
409
|
+
`Kernel session ${this.kernelSessionId} retrieve`
|
|
410
|
+
);
|
|
411
|
+
kernelDebug.retrievedProxyIdPresent = typeof retrieved.proxy_id === "string" ? Boolean(retrieved.proxy_id) : false;
|
|
412
|
+
kernelDebug.retrievedProxyIdSuffix = proxyIdSuffix(retrieved.proxy_id);
|
|
413
|
+
kernelDebug.retrievedProxyIdMatchesRequested = requestedProxyId ? retrieved.proxy_id === requestedProxyId : !retrieved.proxy_id;
|
|
414
|
+
kernelDebug.timeoutSeconds = typeof retrieved.timeout_seconds === "number" ? retrieved.timeout_seconds : kernelDebug.timeoutSeconds;
|
|
415
|
+
kernelDebug.stealth = typeof retrieved.stealth === "boolean" ? retrieved.stealth : kernelDebug.stealth;
|
|
416
|
+
kernelDebug.profilePresent = Boolean(retrieved.profile);
|
|
417
|
+
kernelDebug.poolPresent = Boolean(retrieved.pool);
|
|
418
|
+
} catch (err) {
|
|
419
|
+
kernelDebug.retrieveError = errorText(err);
|
|
420
|
+
}
|
|
421
|
+
}
|
|
422
|
+
async captureBrowserNetworkLocation() {
|
|
423
|
+
const fallback = (message, source = "ipapi.co") => ({
|
|
424
|
+
source,
|
|
425
|
+
ip: null,
|
|
426
|
+
city: null,
|
|
427
|
+
region: null,
|
|
428
|
+
country: null,
|
|
429
|
+
org: null,
|
|
430
|
+
timezone: null,
|
|
431
|
+
error: message
|
|
432
|
+
});
|
|
433
|
+
if (!this.context) return fallback("browser context is not available");
|
|
434
|
+
let debugPage = null;
|
|
435
|
+
try {
|
|
436
|
+
debugPage = await this.context.newPage();
|
|
437
|
+
const ipwho = await this.loadJsonInDebugPage(debugPage, "https://ipwho.is/");
|
|
438
|
+
if (ipwho) {
|
|
439
|
+
const connection = typeof ipwho.connection === "object" && ipwho.connection !== null ? ipwho.connection : {};
|
|
440
|
+
return {
|
|
441
|
+
source: "ipwho.is",
|
|
442
|
+
ip: typeof ipwho.ip === "string" ? ipwho.ip : null,
|
|
443
|
+
city: typeof ipwho.city === "string" ? ipwho.city : null,
|
|
444
|
+
region: typeof ipwho.region === "string" ? ipwho.region : null,
|
|
445
|
+
country: typeof ipwho.country === "string" ? ipwho.country : null,
|
|
446
|
+
org: typeof connection.org === "string" ? connection.org : null,
|
|
447
|
+
timezone: typeof ipwho.timezone === "object" && ipwho.timezone !== null && typeof ipwho.timezone.id === "string" ? ipwho.timezone.id : null,
|
|
448
|
+
error: null
|
|
449
|
+
};
|
|
450
|
+
}
|
|
451
|
+
const ipify = await this.loadJsonInDebugPage(debugPage, "https://api64.ipify.org?format=json");
|
|
452
|
+
if (ipify) {
|
|
453
|
+
return {
|
|
454
|
+
source: "api64.ipify.org",
|
|
455
|
+
ip: typeof ipify.ip === "string" ? ipify.ip : null,
|
|
456
|
+
city: null,
|
|
457
|
+
region: null,
|
|
458
|
+
country: null,
|
|
459
|
+
org: null,
|
|
460
|
+
timezone: null,
|
|
461
|
+
error: null
|
|
462
|
+
};
|
|
463
|
+
}
|
|
464
|
+
await withTimeout(
|
|
465
|
+
debugPage.goto("https://ipapi.co/json/", { waitUntil: "domcontentloaded", timeout: 7e3 }),
|
|
466
|
+
8e3,
|
|
467
|
+
"browser network location navigation"
|
|
468
|
+
);
|
|
469
|
+
const body = await debugPage.locator("body").innerText({ timeout: 2e3 });
|
|
470
|
+
const data = JSON.parse(body);
|
|
471
|
+
return {
|
|
472
|
+
source: "ipapi.co",
|
|
473
|
+
ip: typeof data.ip === "string" ? data.ip : null,
|
|
474
|
+
city: typeof data.city === "string" ? data.city : null,
|
|
475
|
+
region: typeof data.region === "string" ? data.region : null,
|
|
476
|
+
country: typeof data.country_name === "string" ? data.country_name : typeof data.country === "string" ? data.country : null,
|
|
477
|
+
org: typeof data.org === "string" ? data.org : null,
|
|
478
|
+
timezone: typeof data.timezone === "string" ? data.timezone : null,
|
|
479
|
+
error: null
|
|
480
|
+
};
|
|
481
|
+
} catch (err) {
|
|
482
|
+
return fallback(errorText(err));
|
|
483
|
+
} finally {
|
|
484
|
+
await debugPage?.close().catch(() => {
|
|
485
|
+
});
|
|
486
|
+
}
|
|
487
|
+
}
|
|
488
|
+
async loadJsonInDebugPage(debugPage, url) {
|
|
489
|
+
try {
|
|
490
|
+
await withTimeout(
|
|
491
|
+
debugPage.goto(url, { waitUntil: "domcontentloaded", timeout: 7e3 }),
|
|
492
|
+
8e3,
|
|
493
|
+
`browser network location navigation ${url}`
|
|
494
|
+
);
|
|
495
|
+
const body = await debugPage.locator("body").innerText({ timeout: 2e3 });
|
|
496
|
+
return JSON.parse(body);
|
|
497
|
+
} catch {
|
|
498
|
+
return null;
|
|
499
|
+
}
|
|
500
|
+
}
|
|
501
|
+
async installEsbuildHelperShims(context) {
|
|
502
|
+
await context.addInitScript(() => {
|
|
503
|
+
const g = globalThis;
|
|
504
|
+
if (typeof g.__name !== "function") g.__name = (fn) => fn;
|
|
505
|
+
if (typeof g.__publicField !== "function") g.__publicField = (obj, key, value) => {
|
|
506
|
+
obj[key] = value;
|
|
507
|
+
return value;
|
|
508
|
+
};
|
|
509
|
+
});
|
|
510
|
+
}
|
|
511
|
+
async navigateToSERP(query, uule, gl, hl, options) {
|
|
512
|
+
const params = new URLSearchParams({ q: query, gl, hl, pws: "0" });
|
|
513
|
+
if (options?.num) params.set("num", String(options.num));
|
|
514
|
+
if (uule) params.set("uule", uule);
|
|
515
|
+
const url = "https://www.google.com/search?" + params.toString();
|
|
516
|
+
const navDebug = options?.debug ? {
|
|
517
|
+
requestedUrl: url,
|
|
518
|
+
finalUrl: null,
|
|
519
|
+
title: null,
|
|
520
|
+
bodySnippet: null,
|
|
521
|
+
hasPaa: null,
|
|
522
|
+
captchaDetected: null,
|
|
523
|
+
googleSorryUrl: null,
|
|
524
|
+
redirected: null
|
|
525
|
+
} : null;
|
|
526
|
+
if (navDebug) this.debugSnapshot.serpNavigation = navDebug;
|
|
527
|
+
try {
|
|
528
|
+
await this.page.goto(url, { waitUntil: "domcontentloaded", timeout: 45e3 });
|
|
529
|
+
} catch (err) {
|
|
530
|
+
await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: null, captchaDetected: null });
|
|
531
|
+
const diag = await this.captureDiagnostics(url);
|
|
532
|
+
throw new ExtractionError(`page.goto failed: ${err.message} | ${diag}`);
|
|
533
|
+
}
|
|
534
|
+
const captchaCount = await this.page.locator(PAASelectors.captchaMarker).count();
|
|
535
|
+
if (captchaCount > 0) {
|
|
536
|
+
await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: false, captchaDetected: true });
|
|
537
|
+
throw new CaptchaError(this.captchaMessage());
|
|
538
|
+
}
|
|
539
|
+
const fastFound = await this.page.waitForSelector(PAASelectors.item, { timeout: 4e3 }).catch(() => null);
|
|
540
|
+
if (fastFound) {
|
|
541
|
+
await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: true, captchaDetected: false });
|
|
542
|
+
return { hasPaa: true };
|
|
543
|
+
}
|
|
544
|
+
const captchaAfter = await this.page.locator(PAASelectors.captchaMarker).count();
|
|
545
|
+
if (captchaAfter > 0) {
|
|
546
|
+
await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: false, captchaDetected: true });
|
|
547
|
+
throw new CaptchaError(this.captchaMessage());
|
|
548
|
+
}
|
|
549
|
+
for (let i = 1; i <= 6; i++) {
|
|
550
|
+
await this.page.evaluate((f) => {
|
|
551
|
+
window.scrollTo(0, document.body.scrollHeight * f);
|
|
552
|
+
}, i / 6);
|
|
553
|
+
await this.page.waitForTimeout(600);
|
|
554
|
+
const count = await this.page.locator(PAASelectors.item).count();
|
|
555
|
+
if (count > 0) {
|
|
556
|
+
await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: true, captchaDetected: false });
|
|
557
|
+
return { hasPaa: true };
|
|
558
|
+
}
|
|
559
|
+
}
|
|
560
|
+
await this.updateSerpNavigationDebug(navDebug, url, { hasPaa: false, captchaDetected: false });
|
|
561
|
+
return { hasPaa: false };
|
|
562
|
+
}
|
|
563
|
+
async updateSerpNavigationDebug(navDebug, requestedUrl, state) {
|
|
564
|
+
if (!navDebug || !this.page) return;
|
|
565
|
+
try {
|
|
566
|
+
const finalUrl = this.page.url();
|
|
567
|
+
const title = await this.page.title().catch(() => "");
|
|
568
|
+
const bodySnippet = await this.page.evaluate(() => {
|
|
569
|
+
const text = (document.body?.innerText ?? "").replace(/\s+/g, " ").trim();
|
|
570
|
+
return text.slice(0, 500);
|
|
571
|
+
}).catch(() => "");
|
|
572
|
+
const textCaptcha = /recaptcha|unusual traffic|are you a robot/i.test(bodySnippet);
|
|
573
|
+
navDebug.finalUrl = finalUrl;
|
|
574
|
+
navDebug.title = title;
|
|
575
|
+
navDebug.bodySnippet = bodySnippet;
|
|
576
|
+
navDebug.hasPaa = state.hasPaa;
|
|
577
|
+
navDebug.captchaDetected = state.captchaDetected ?? textCaptcha;
|
|
578
|
+
navDebug.googleSorryUrl = /google\.[^/]+\/sorry\//i.test(finalUrl);
|
|
579
|
+
navDebug.redirected = finalUrl !== requestedUrl;
|
|
580
|
+
} catch (err) {
|
|
581
|
+
navDebug.bodySnippet = `debug capture failed: ${errorText(err)}`;
|
|
582
|
+
}
|
|
583
|
+
}
|
|
584
|
+
async captureDiagnostics(intendedUrl) {
|
|
585
|
+
try {
|
|
586
|
+
const finalUrl = this.page.url();
|
|
587
|
+
const title = await this.page.title().catch(() => "");
|
|
588
|
+
const bodySnippet = await this.page.evaluate(() => {
|
|
589
|
+
const t = (document.body?.innerText ?? "").replace(/\s+/g, " ").trim();
|
|
590
|
+
return t.slice(0, 400);
|
|
591
|
+
}).catch(() => "");
|
|
592
|
+
const consent = /consent\.google\./.test(finalUrl) || /before you continue/i.test(bodySnippet);
|
|
593
|
+
const recaptcha = /recaptcha|unusual traffic|are you a robot/i.test(bodySnippet);
|
|
594
|
+
const flags = [
|
|
595
|
+
consent ? "CONSENT_WALL" : "",
|
|
596
|
+
recaptcha ? "BOT_CHALLENGE" : "",
|
|
597
|
+
finalUrl !== intendedUrl ? "REDIRECTED" : ""
|
|
598
|
+
].filter(Boolean).join(",");
|
|
599
|
+
return `intended=${intendedUrl} | final=${finalUrl} | title="${title}" | flags=[${flags}] | body="${bodySnippet}"`;
|
|
600
|
+
} catch (e) {
|
|
601
|
+
return `diagnostics-failed: ${e.message}`;
|
|
602
|
+
}
|
|
603
|
+
}
|
|
604
|
+
captchaMessage() {
|
|
605
|
+
return this.kernelClient ? "Google returned a CAPTCHA on this session \u2014 retrying with a fresh session." : RECAPTCHA_INSTRUCTIONS;
|
|
606
|
+
}
|
|
607
|
+
async navigateTo(url) {
|
|
608
|
+
try {
|
|
609
|
+
await this.page.goto(url, { waitUntil: "domcontentloaded", timeout: 45e3 });
|
|
610
|
+
} catch (err) {
|
|
611
|
+
const diag = await this.captureDiagnostics(url);
|
|
612
|
+
throw new ExtractionError(`page.goto failed: ${err.message} | ${diag}`);
|
|
613
|
+
}
|
|
614
|
+
}
|
|
615
|
+
async navigateToChannel(channelHandle) {
|
|
616
|
+
const url = buildYouTubeChannelVideosUrl(channelHandle);
|
|
617
|
+
try {
|
|
618
|
+
await this.page.goto(url, { waitUntil: "networkidle", timeout: 3e4 });
|
|
619
|
+
} catch (err) {
|
|
620
|
+
const diag = await this.captureDiagnostics(url);
|
|
621
|
+
throw new ExtractionError(`navigateToChannel failed: ${err.message} | ${diag}`);
|
|
622
|
+
}
|
|
623
|
+
}
|
|
624
|
+
async evaluate(fn, arg) {
|
|
625
|
+
return this.page.evaluate(fn, arg);
|
|
626
|
+
}
|
|
627
|
+
getPage() {
|
|
628
|
+
return this.page;
|
|
629
|
+
}
|
|
630
|
+
getKernelSessionId() {
|
|
631
|
+
return this.kernelSessionId;
|
|
632
|
+
}
|
|
633
|
+
getDebugSnapshot() {
|
|
634
|
+
return this.debugSnapshot;
|
|
635
|
+
}
|
|
636
|
+
async close() {
|
|
637
|
+
if (this.browser) {
|
|
638
|
+
const b = this.browser;
|
|
639
|
+
const sessionId = this.kernelSessionId;
|
|
640
|
+
const client = this.kernelClient;
|
|
641
|
+
this.browser = null;
|
|
642
|
+
this.context = null;
|
|
643
|
+
this.page = null;
|
|
644
|
+
this.kernelSessionId = null;
|
|
645
|
+
this.kernelClient = null;
|
|
646
|
+
if (client && sessionId) {
|
|
647
|
+
console.info(JSON.stringify({
|
|
648
|
+
event: "kernel_browser_delete_started",
|
|
649
|
+
kernel_session_id: sessionId
|
|
650
|
+
}));
|
|
651
|
+
const deleteSession = withTimeout(
|
|
652
|
+
client.browsers.deleteByID(sessionId),
|
|
653
|
+
KERNEL_SESSION_DELETE_TIMEOUT_MS,
|
|
654
|
+
`Kernel session ${sessionId} delete`
|
|
655
|
+
);
|
|
656
|
+
const closeBrowser = withTimeout(
|
|
657
|
+
b.close(),
|
|
658
|
+
KERNEL_BROWSER_CLOSE_TIMEOUT_MS,
|
|
659
|
+
`Kernel browser ${sessionId} close`
|
|
660
|
+
);
|
|
661
|
+
const [deleteResult, closeResult] = await Promise.allSettled([deleteSession, closeBrowser]);
|
|
662
|
+
const result = {
|
|
663
|
+
kernelSessionId: sessionId,
|
|
664
|
+
kernelDeleteStarted: true,
|
|
665
|
+
kernelDeleteSucceeded: deleteResult.status === "fulfilled",
|
|
666
|
+
kernelDeleteError: deleteResult.status === "rejected" ? deleteResult.reason instanceof Error ? deleteResult.reason.message : String(deleteResult.reason) : null,
|
|
667
|
+
browserCloseSucceeded: closeResult.status === "fulfilled",
|
|
668
|
+
browserCloseError: closeResult.status === "rejected" ? closeResult.reason instanceof Error ? closeResult.reason.message : String(closeResult.reason) : null
|
|
669
|
+
};
|
|
670
|
+
if (deleteResult.status === "rejected") {
|
|
671
|
+
console.warn(JSON.stringify({
|
|
672
|
+
event: "kernel_browser_delete_failed",
|
|
673
|
+
kernel_session_id: sessionId,
|
|
674
|
+
message: result.kernelDeleteError
|
|
675
|
+
}));
|
|
676
|
+
console.warn(`Kernel session cleanup failed for ${sessionId}:`, deleteResult.reason);
|
|
677
|
+
} else {
|
|
678
|
+
console.info(JSON.stringify({
|
|
679
|
+
event: "kernel_browser_delete_succeeded",
|
|
680
|
+
kernel_session_id: sessionId
|
|
681
|
+
}));
|
|
682
|
+
}
|
|
683
|
+
if (closeResult.status === "rejected") {
|
|
684
|
+
console.warn(JSON.stringify({
|
|
685
|
+
event: "kernel_browser_close_failed",
|
|
686
|
+
kernel_session_id: sessionId,
|
|
687
|
+
message: result.browserCloseError
|
|
688
|
+
}));
|
|
689
|
+
console.warn(`Kernel browser close failed for ${sessionId}:`, closeResult.reason);
|
|
690
|
+
}
|
|
691
|
+
return result;
|
|
692
|
+
}
|
|
693
|
+
await b.close();
|
|
694
|
+
return {
|
|
695
|
+
kernelSessionId: null,
|
|
696
|
+
kernelDeleteStarted: false,
|
|
697
|
+
kernelDeleteSucceeded: null,
|
|
698
|
+
kernelDeleteError: null,
|
|
699
|
+
browserCloseSucceeded: true,
|
|
700
|
+
browserCloseError: null
|
|
701
|
+
};
|
|
702
|
+
} else if (this.context) {
|
|
703
|
+
const ctx = this.context;
|
|
704
|
+
this.context = null;
|
|
705
|
+
this.page = null;
|
|
706
|
+
await ctx.close();
|
|
707
|
+
return {
|
|
708
|
+
kernelSessionId: null,
|
|
709
|
+
kernelDeleteStarted: false,
|
|
710
|
+
kernelDeleteSucceeded: null,
|
|
711
|
+
kernelDeleteError: null,
|
|
712
|
+
browserCloseSucceeded: true,
|
|
713
|
+
browserCloseError: null
|
|
714
|
+
};
|
|
715
|
+
}
|
|
716
|
+
return {
|
|
717
|
+
kernelSessionId: null,
|
|
718
|
+
kernelDeleteStarted: false,
|
|
719
|
+
kernelDeleteSucceeded: null,
|
|
720
|
+
kernelDeleteError: null,
|
|
721
|
+
browserCloseSucceeded: null,
|
|
722
|
+
browserCloseError: null
|
|
723
|
+
};
|
|
724
|
+
}
|
|
725
|
+
};
|
|
726
|
+
|
|
727
|
+
// src/locations.ts
|
|
728
|
+
/**
 * Lower-case place names and nicknames mapped to a canonical
 * "City,Region,Country" string. Looked up by normalizeLocation.
 */
var LOCATIONS = {
  // United States
  austin: "Austin,Texas,United States",
  "new york": "New York,New York,United States",
  "new york city": "New York,New York,United States",
  nyc: "New York,New York,United States",
  "los angeles": "Los Angeles,California,United States",
  la: "Los Angeles,California,United States",
  chicago: "Chicago,Illinois,United States",
  houston: "Houston,Texas,United States",
  phoenix: "Phoenix,Arizona,United States",
  philadelphia: "Philadelphia,Pennsylvania,United States",
  philly: "Philadelphia,Pennsylvania,United States",
  "san antonio": "San Antonio,Texas,United States",
  dallas: "Dallas,Texas,United States",
  miami: "Miami,Florida,United States",
  seattle: "Seattle,Washington,United States",
  denver: "Denver,Colorado,United States",
  loveland: "Loveland,Colorado,United States",
  "loveland co": "Loveland,Colorado,United States",
  "fort collins": "Fort Collins,Colorado,United States",
  boulder: "Boulder,Colorado,United States",
  "colorado springs": "Colorado Springs,Colorado,United States",
  boston: "Boston,Massachusetts,United States",
  atlanta: "Atlanta,Georgia,United States",
  "san francisco": "San Francisco,California,United States",
  sf: "San Francisco,California,United States",
  portland: "Portland,Oregon,United States",
  "las vegas": "Las Vegas,Nevada,United States",
  minneapolis: "Minneapolis,Minnesota,United States",
  detroit: "Detroit,Michigan,United States",
  nashville: "Nashville,Tennessee,United States",
  charlotte: "Charlotte,North Carolina,United States",
  orlando: "Orlando,Florida,United States",
  "san diego": "San Diego,California,United States",
  baltimore: "Baltimore,Maryland,United States",
  sacramento: "Sacramento,California,United States",
  columbus: "Columbus,Ohio,United States",
  indianapolis: "Indianapolis,Indiana,United States",
  "san jose": "San Jose,California,United States",
  "fort worth": "Fort Worth,Texas,United States",
  jacksonville: "Jacksonville,Florida,United States",
  memphis: "Memphis,Tennessee,United States",
  louisville: "Louisville,Kentucky,United States",
  raleigh: "Raleigh,North Carolina,United States",
  richmond: "Richmond,Virginia,United States",
  "salt lake city": "Salt Lake City,Utah,United States",
  // Canada
  toronto: "Toronto,Ontario,Canada",
  vancouver: "Vancouver,British Columbia,Canada",
  montreal: "Montreal,Quebec,Canada",
  calgary: "Calgary,Alberta,Canada",
  ottawa: "Ottawa,Ontario,Canada",
  // United Kingdom
  london: "London,England,United Kingdom",
  manchester: "Manchester,England,United Kingdom",
  birmingham: "Birmingham,England,United Kingdom",
  edinburgh: "Edinburgh,Scotland,United Kingdom",
  glasgow: "Glasgow,Scotland,United Kingdom",
  leeds: "Leeds,England,United Kingdom",
  // Australia
  sydney: "Sydney,New South Wales,Australia",
  melbourne: "Melbourne,Victoria,Australia",
  brisbane: "Brisbane,Queensland,Australia",
  perth: "Perth,Western Australia,Australia",
  adelaide: "Adelaide,South Australia,Australia",
  // Ireland
  dublin: "Dublin,Leinster,Ireland"
};
|
|
792
|
+
|
|
793
|
+
// src/uule.ts
|
|
794
|
+
/**
 * Encodes a number as a little-endian base-128 varint: 7 payload bits per
 * byte, with the high bit set on every byte except the last.
 *
 * The value is processed with 32-bit bitwise operators (`&`, `>>>`).
 *
 * @param {number} value - Number to encode.
 * @returns {number[]} The encoded bytes, least-significant group first.
 */
function encodeVarint(value) {
  const encoded = [];
  for (let rest = value, more = true; more; ) {
    const low7 = rest & 127;
    rest >>>= 7;
    more = rest > 0;
    // The continuation bit marks that further groups follow.
    encoded.push(more ? low7 | 128 : low7);
  }
  return encoded;
}
|
|
805
|
+
/**
 * Builds a `w+`-prefixed, base64-encoded location token.
 *
 * The encoded payload is a fixed five-byte header (8, 2, 16, 32, 34), the
 * varint-encoded byte length of `name`, and then the UTF-8 bytes of `name`.
 * NOTE(review): presumably the value for Google's `uule` URL parameter — confirm
 * against the caller.
 *
 * @param {string} name - Location text to embed.
 * @returns {string} `"w+"` followed by the standard base64 of the payload.
 */
function encodeUule(name) {
  const header = Buffer.from([8, 2, 16, 32, 34]);
  const nameBytes = Buffer.from(name, "utf8");
  const lengthPrefix = Buffer.from(encodeVarint(nameBytes.length));
  const encodedPayload = Buffer.concat([header, lengthPrefix, nameBytes]).toString("base64");
  return `w+${encodedPayload}`;
}
|
|
814
|
+
/**
 * Resolves free-form location text to a canonical entry from LOCATIONS.
 *
 * Lookup attempts, in order, against the lower-cased and trimmed input:
 *  1. the whole string;
 *  2. the text before the first comma;
 *  3. the string with a trailing whitespace + two-letter token removed.
 * When nothing matches, the original `input` is returned unchanged.
 *
 * Only own properties of LOCATIONS are consulted, so inputs that happen to
 * equal inherited object keys ("constructor", "__proto__", ...) fall through
 * to the unchanged input instead of returning a non-string prototype member.
 *
 * @param {string} input - User-supplied location text.
 * @returns {string} The canonical location, or `input` when unknown.
 */
function normalizeLocation(input) {
  const lookup = (key) => (Object.hasOwn(LOCATIONS, key) ? LOCATIONS[key] : undefined);

  const raw = input.toLowerCase().trim();
  const exact = lookup(raw);
  if (exact) return exact;

  const beforeComma = raw.split(",")[0].trim();
  if (beforeComma !== raw) {
    const byCity = lookup(beforeComma);
    if (byCity) return byCity;
  }

  const withoutState = raw.replace(/\s+[a-z]{2}$/, "").trim();
  if (withoutState !== raw) {
    const byStripped = lookup(withoutState);
    if (byStripped) return byStripped;
  }

  return input;
}
|
|
823
|
+
|
|
824
|
+
// src/serp-location-debug.ts
|
|
825
|
+
/**
 * Lower-case US state names (plus the District of Columbia) mapped to their
 * two-letter codes.
 */
var STATE_TO_CODE = {
  "alabama": "AL",
  "alaska": "AK",
  "arizona": "AZ",
  "arkansas": "AR",
  "california": "CA",
  "colorado": "CO",
  "connecticut": "CT",
  "delaware": "DE",
  "florida": "FL",
  "georgia": "GA",
  "hawaii": "HI",
  "idaho": "ID",
  "illinois": "IL",
  "indiana": "IN",
  "iowa": "IA",
  "kansas": "KS",
  "kentucky": "KY",
  "louisiana": "LA",
  "maine": "ME",
  "maryland": "MD",
  "massachusetts": "MA",
  "michigan": "MI",
  "minnesota": "MN",
  "mississippi": "MS",
  "missouri": "MO",
  "montana": "MT",
  "nebraska": "NE",
  "nevada": "NV",
  "new hampshire": "NH",
  "new jersey": "NJ",
  "new mexico": "NM",
  "new york": "NY",
  "north carolina": "NC",
  "north dakota": "ND",
  "ohio": "OH",
  "oklahoma": "OK",
  "oregon": "OR",
  "pennsylvania": "PA",
  "rhode island": "RI",
  "south carolina": "SC",
  "south dakota": "SD",
  "tennessee": "TN",
  "texas": "TX",
  "utah": "UT",
  "vermont": "VT",
  "virginia": "VA",
  "washington": "WA",
  "west virginia": "WV",
  "wisconsin": "WI",
  "wyoming": "WY",
  "district of columbia": "DC"
};
|
|
878
|
+
// Regex alternation of every state name (internal whitespace relaxed to \s+)
// followed by every two-letter code, in STATE_TO_CODE order.
var STATE_PATTERN = Object.keys(STATE_TO_CODE)
  .map((stateName) => stateName.replace(/\s+/g, "\\s+"))
  .concat(Object.values(STATE_TO_CODE))
  .join("|");
// Captures (1) one to five capitalised-looking words as the city and (2) a
// state name or code. The "i" flag makes the match case-insensitive, so the
// [A-Z] classes also accept lower-case letters.
var CITY_STATE_RE = new RegExp(
  `\\b([A-Z][A-Za-z]+(?:[\\s.-][A-Z][A-Za-z]+){0,4}),?\\s+(${STATE_PATTERN})\\b`,
  "gi"
);
|
|
883
|
+
/**
 * Converts a region string to an upper-case two-letter code.
 *
 * Any two-letter alphabetic input is upper-cased and returned as-is (it is not
 * checked against the known codes). Longer input is looked up, lower-cased, in
 * STATE_TO_CODE. Only own properties of the table are consulted, so names that
 * collide with inherited object keys (e.g. "constructor") yield `null` rather
 * than a prototype member.
 *
 * @param {string|null|undefined} input - Region name or code.
 * @returns {string|null} The two-letter code, or `null` when empty or unknown.
 */
function normalizeRegionCode(input) {
  if (!input) return null;
  const trimmed = input.trim();
  if (/^[A-Z]{2}$/i.test(trimmed)) return trimmed.toUpperCase();
  const key = trimmed.toLowerCase();
  return Object.hasOwn(STATE_TO_CODE, key) ? STATE_TO_CODE[key] ?? null : null;
}
|
|
889
|
+
/**
 * Produces a title-cased city name from loosely formatted text.
 *
 * Whitespace runs are collapsed, everything up to and including the last
 * "in", "near", "around" or "serving" (followed by whitespace) is dropped,
 * and then the first letter after each word boundary is upper-cased with the
 * rest lower-cased.
 *
 * @param {string} input - Raw city text.
 * @returns {string} The normalised city name (may be empty).
 */
function normalizeCity(input) {
  const collapsed = input.replace(/\s+/g, " ").trim();
  const withoutLeadIn = collapsed.replace(/^.*\b(?:in|near|around|serving)\s+/i, "");
  const lowered = withoutLeadIn.toLowerCase();
  return lowered.replace(/\b[a-z]/g, (initial) => initial.toUpperCase());
}
|
|
893
|
+
/**
 * Splits a canonical "City,Region,..." string into the expected city and
 * region code used for comparison.
 *
 * @param {string|null|undefined} canonicalLocation - Comma-separated location.
 * @returns {{city: string, regionCode: (string|null), canonicalLocation: string}|null}
 *   `null` when no location was supplied.
 */
function parseExpected(canonicalLocation) {
  if (!canonicalLocation) {
    return null;
  }
  const segments = canonicalLocation.split(",").map((segment) => segment.trim());
  const cityPart = segments[0] ?? "";
  const regionPart = segments[1] ?? "";
  const city = normalizeCity(cityPart);
  const regionCode = normalizeRegionCode(regionPart);
  return { city, regionCode, canonicalLocation };
}
|
|
902
|
+
/**
 * Records one city/region sighting in the candidates map.
 *
 * Entries are keyed by "<lower-cased city>|<region code>". A repeat sighting
 * bumps `count` and stores the example text when fewer than three examples
 * are held and it is not already present. Sightings whose city or region
 * cannot be normalised are ignored.
 *
 * @param {Map<string, {city: string, regionCode: string, count: number, examples: string[]}>} candidates
 *   Accumulator, mutated in place.
 * @param {string} city - Raw city text.
 * @param {string} region - Raw region text.
 * @param {string} example - Text snippet in which the pair was seen.
 * @returns {void}
 */
function addCandidate(candidates, city, region, example) {
  const cityName = normalizeCity(city);
  const code = normalizeRegionCode(region);
  if (!(cityName && code)) return;

  const mapKey = `${cityName.toLowerCase()}|${code}`;
  const entry = candidates.get(mapKey);
  if (!entry) {
    candidates.set(mapKey, { city: cityName, regionCode: code, count: 1, examples: [example] });
    return;
  }

  entry.count++;
  const hasRoom = entry.examples.length < 3;
  if (hasRoom && !entry.examples.includes(example)) {
    entry.examples.push(example);
  }
}
|
|
915
|
+
/**
 * Scans a blob of text for "City, State" mentions and records each one.
 *
 * The text is percent-decoded when possible, URL/slug separators
 * (`+ / | _ -`) are turned into spaces, and every CITY_STATE_RE match is
 * passed to addCandidate together with the first 180 characters of the
 * normalised text as the example.
 *
 * Titles and snippets routinely contain a bare "%" (for example "50% off"),
 * which makes `decodeURIComponent` throw `URIError`. In that case the text is
 * scanned undecoded instead of aborting the whole scan.
 *
 * @param {Map<string, object>} candidates - Accumulator passed to addCandidate.
 * @param {string} text - Text to scan (may contain percent-encoded URLs).
 * @returns {void}
 */
function scanText(candidates, text) {
  let decoded;
  try {
    decoded = decodeURIComponent(text);
  } catch (err) {
    if (!(err instanceof URIError)) throw err;
    // Malformed percent sequence: fall back to the raw text.
    decoded = text;
  }
  const normalized = decoded.replace(/[+/|_-]+/g, " ");
  const example = normalized.slice(0, 180);
  for (const match of normalized.matchAll(CITY_STATE_RE)) {
    addCandidate(candidates, match[1] ?? "", match[2] ?? "", example);
  }
}
|
|
921
|
+
/**
 * Looks for city/state mentions in SERP results and compares them with the
 * location that was requested.
 *
 * Organic results contribute their title, snippet, cite and URL; local-pack
 * businesses contribute their name, metadata entries, website URL and
 * directions URL. Candidates are ranked by sighting count (ties broken by
 * city name) and capped at eight.
 *
 * @param {string|null|undefined} canonicalLocation - Requested location, if any.
 * @param {Iterable<{title: string, snippet?: string, cite?: string, url: string}>} organicResults
 * @param {Iterable<{name: string, metadata: string[], websiteUrl?: string, directionsUrl?: string}>} localPack
 * @returns {{status: ("not_requested"|"unknown"|"matched"|"mismatch"), expected: (object|null), candidates: object[]}}
 *   "not_requested" when no location was given, "unknown" when no candidates
 *   were found, otherwise "matched" or "mismatch".
 */
function inferSerpLocationEvidence(canonicalLocation, organicResults, localPack) {
  const expected = parseExpected(canonicalLocation);
  const candidates = /* @__PURE__ */ new Map();

  for (const organic of organicResults) {
    const fields = [organic.title, organic.snippet ?? "", organic.cite ?? "", organic.url];
    scanText(candidates, fields.join(" "));
  }
  for (const place of localPack) {
    const fields = [place.name, ...place.metadata, place.websiteUrl ?? "", place.directionsUrl ?? ""];
    scanText(candidates, fields.join(" "));
  }

  const byCountThenCity = (left, right) => right.count - left.count || left.city.localeCompare(right.city);
  const rankedCandidates = Array.from(candidates.values()).sort(byCountThenCity).slice(0, 8);

  if (!expected) {
    return { status: "not_requested", expected: null, candidates: rankedCandidates };
  }
  if (rankedCandidates.length === 0) {
    return { status: "unknown", expected, candidates: [] };
  }

  const wantedCity = expected.city.toLowerCase();
  const isMatch = (candidate) => {
    if (candidate.city.toLowerCase() !== wantedCity) return false;
    // A missing expected region code matches any region.
    return expected.regionCode == null || candidate.regionCode === expected.regionCode;
  };
  const status = rankedCandidates.some(isMatch) ? "matched" : "mismatch";
  return { status, expected, candidates: rankedCandidates };
}
|
|
946
|
+
|
|
947
|
+
// src/lib/paa-answer-cleanup.ts
|
|
948
|
+
// Upper bound, in characters, applied by trimToSentenceLimit.
var MAX_ANSWER_LENGTH = 1200;

// Phrases blanked out wherever they occur in an answer (global, case-insensitive).
var BOILERPLATE_PATTERNS = [
  /An AI Overview is not available for this search/gi,
  /Can't generate an AI overview right now\.?\s*Try again later\.?/gi,
  /\bAI Overview\b/gi,
  /\bView all\b/gi
];

// Markers at which an answer is truncated; cutAtFirstMarker cuts at the
// earliest match of any of them.
var CUT_MARKERS = [
  /\bRelated Links\b/i,
  /\bAsk anything in\s*AI Mode\b/i,
  /\bAI can make mistakes\b/i,
  /\bThis is for informational purposes only\b/i,
  /\bShow more\b/i,
  /\b\d+\s+sites\b/i,
  /\b\d{1,2}\s*[msh]\s*[A-Z][A-Za-z]/,
  /\b(?:YouTube|Reddit|Facebook|Instagram|TikTok)·/
];
|
|
965
|
+
/**
 * Repairs spacing in text whose words were glued together and collapses
 * whitespace.
 *
 * Steps, applied in order: non-breaking spaces become spaces; a space is
 * inserted after `.`, `!`, `?`, `:` or `;` when an upper-case letter follows
 * directly; between a lower-case letter or digit and a following capitalised
 * word; and between a lower-case letter and a digit. Finally whitespace runs
 * collapse to one space and the result is trimmed.
 *
 * @param {string} text - Text to clean.
 * @returns {string} The normalised text.
 */
function normalizeWhitespace(text) {
  const rewrites = [
    [/\u00a0/g, " "],
    [/([.!?])([A-Z])/g, "$1 $2"],
    [/([:;])([A-Z])/g, "$1 $2"],
    [/([a-z])([A-Z][a-z])/g, "$1 $2"],
    [/(\d)([A-Z][a-z])/g, "$1 $2"],
    [/([a-z])(\d)/g, "$1 $2"],
    [/\s+/g, " "]
  ];
  let result = text;
  for (const [pattern, replacement] of rewrites) {
    result = result.replace(pattern, replacement);
  }
  return result.trim();
}
|
|
968
|
+
/**
 * Truncates text at the earliest position where any CUT_MARKERS pattern matches.
 *
 * @param {string} text - Text to truncate.
 * @returns {string} Text before the earliest marker, or the whole text when
 *   no marker matches.
 */
function cutAtFirstMarker(text) {
  const earliest = CUT_MARKERS.reduce((best, marker) => {
    const hit = marker.exec(text);
    // Reset in case a marker carries a stateful flag.
    marker.lastIndex = 0;
    if (!hit) return best;
    return best === -1 || hit.index < best ? hit.index : best;
  }, -1);
  if (earliest === -1) return text;
  return text.slice(0, earliest);
}
|
|
977
|
+
/**
 * Truncates text where the source page's title starts, when the title appears
 * more than 40 characters into the text.
 *
 * Titles that are missing or shorter than eight characters (after trimming)
 * are ignored. Matching is case-insensitive.
 *
 * @param {string} text - Answer text.
 * @param {string|null|undefined} sourceTitle - Title of the cited source.
 * @returns {string} Possibly truncated text.
 */
function cutAtSourceTitle(text, sourceTitle) {
  const needle = sourceTitle?.trim();
  if (!needle || needle.length < 8) {
    return text;
  }
  const position = text.toLowerCase().indexOf(needle.toLowerCase());
  if (position > 40) {
    return text.slice(0, position);
  }
  return text;
}
|
|
983
|
+
/**
 * Finds the index at which the text preceding a URL stops being answer text
 * and starts being source attribution.
 *
 * Checks, in order:
 *  1. a bullet (• or ·) followed by a "Mon D, YYYY" date more than 40
 *     characters in: cut at the bullet;
 *  2. within the last 260 characters, the first sentence break whose
 *     following text reads like an article title (starts with a
 *     question/listicle word and contains a title-ish token within 160
 *     characters): cut just after the punctuation;
 *  3. otherwise the last sentence break in that window;
 *  4. otherwise the full length (no cut).
 *
 * @param {string} beforeUrl - Text that precedes the first URL.
 * @returns {number} Index to slice `beforeUrl` at.
 */
function findAttributionCut(beforeUrl) {
  const datedBullet = beforeUrl.match(/[•·]\s*(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{1,2},\s+\d{4}/i);
  if (datedBullet?.index && datedBullet.index > 40) {
    return datedBullet.index;
  }

  const windowStart = Math.max(0, beforeUrl.length - 260);
  const windowText = beforeUrl.slice(windowStart);
  const breaks = Array.from(windowText.matchAll(/[.!?]\s*(?=[A-Z][A-Za-z0-9"'$])/g));

  const titleBreak = breaks.find((brk) => {
    const rest = windowText.slice(brk.index + 1).trim();
    if (!(rest.length > 20)) return false;
    if (!/^(?:Best|Top|What|How|Why|When|Where|Which|Can|Should|Is|Are|Do|Does)\b/i.test(rest)) return false;
    return /(?:\s[-|]\s|Heating|Cooling|Company|Services|Blog|Guide|Review)/i.test(rest.slice(0, 160));
  });
  if (titleBreak) {
    return windowStart + titleBreak.index + 1;
  }

  const finalBreak = breaks[breaks.length - 1];
  if (finalBreak?.index !== void 0) {
    return windowStart + finalBreak.index + 1;
  }
  return beforeUrl.length;
}
|
|
1001
|
+
/**
 * Drops the first http(s) URL and any attribution text that leads into it.
 *
 * Text with no URL, or with a URL at position 0, is returned unchanged.
 * Otherwise the text before the URL is cut at the index chosen by
 * findAttributionCut.
 *
 * @param {string} text - Answer text.
 * @returns {string} Possibly truncated text.
 */
function cutAtUrlAttribution(text) {
  const firstUrl = text.match(/https?:\/\/\S+/i);
  const urlStart = firstUrl?.index;
  if (!urlStart) {
    return text;
  }
  const leading = text.slice(0, urlStart);
  const cutIndex = findAttributionCut(leading);
  return leading.slice(0, cutIndex);
}
|
|
1007
|
+
/**
 * Caps text at MAX_ANSWER_LENGTH characters, preferring to end on a sentence.
 *
 * When the text is too long it is cut to the limit; if the last `.`, `!` or
 * `?` within that cut lies beyond index 240 the text ends there (inclusive),
 * otherwise the hard cut is kept. The result is trimmed.
 *
 * @param {string} text - Text to shorten.
 * @returns {string} Text of at most MAX_ANSWER_LENGTH characters.
 */
function trimToSentenceLimit(text) {
  if (text.length <= MAX_ANSWER_LENGTH) {
    return text;
  }
  const head = text.slice(0, MAX_ANSWER_LENGTH);
  const lastTerminator = Math.max(...[".", "!", "?"].map((mark) => head.lastIndexOf(mark)));
  if (lastTerminator > 240) {
    return head.slice(0, lastTerminator + 1).trim();
  }
  return head.trim();
}
|
|
1013
|
+
/**
 * Cleans the raw text scraped from a "People Also Ask" answer panel.
 *
 * Pipeline: normalise whitespace; strip a leading copy of the question;
 * reject Google's generic error text; blank boilerplate phrases and
 * "Name+3" / "domain.com+3" source badges; cut at the first marker, at the
 * source title and at the URL attribution; drop a trailing "Mon D, YYYY"
 * date; and cap the length.
 *
 * @param {string|null|undefined} answer - Raw answer text.
 * @param {string|null|undefined} question - The question the answer belongs to.
 * @param {string|null|undefined} sourceTitle - Title of the cited source page.
 * @returns {string|undefined} The cleaned answer, or `undefined` when nothing
 *   usable remains.
 */
function cleanPAAAnswerText(answer, question, sourceTitle) {
  if (!answer) return void 0;

  let text = normalizeWhitespace(answer);

  const questionText = question ? normalizeWhitespace(question) : "";
  const echoesQuestion = questionText && text.toLowerCase().startsWith(questionText.toLowerCase());
  if (echoesQuestion) {
    text = text.slice(questionText.length).trim();
  }

  if (/^An error has occurred\.?\s*Please try again later\.?/i.test(text)) {
    return void 0;
  }

  text = BOILERPLATE_PATTERNS.reduce((current, pattern) => current.replace(pattern, " "), text);

  // Remove "Some Source+3" and "example.com+3" style source-count badges.
  text = text.replace(/\b[A-Z][A-Za-z&'\u2019 -]{2,60}\+\d+\b/g, " ");
  text = text.replace(/\b(?:[a-z0-9-]+\.)+[a-z]{2,}\+\d+\b/gi, " ");

  text = normalizeWhitespace(text);
  text = cutAtFirstMarker(text);
  text = cutAtSourceTitle(text, sourceTitle);
  text = cutAtUrlAttribution(text);
  text = normalizeWhitespace(text);

  const trailingDate = /\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?\s+\d{1,2},\s+\d{4}$/i;
  text = text.replace(trailingDate, "").trim();
  text = trimToSentenceLimit(text);

  if (!text) return void 0;
  if (/^An error has occurred\.?\s*Please try again later\.?$/i.test(text)) return void 0;
  return text;
}
|
|
1038
|
+
|
|
1039
|
+
// src/extractor/ai-surfaces.ts
|
|
1040
|
+
/**
 * Extracts Google "AI Overview" and "AI Mode" content from the current
 * document. Relies on the browser globals `window` and `document`.
 *
 * NOTE(review): every helper is nested inside this function, presumably so the
 * function can be serialised and run in the page context — keep it
 * self-contained and confirm against the caller.
 *
 * @param {object} [config] - Selector overrides. When null/undefined the
 *   built-in defaults below are used. Shape: `{ aio: {...}, aim: {...},
 *   expandWaitMs }`.
 * @returns {Promise<{surface: ("aim"|"web"|"unknown"),
 *   aiOverview: {detected: boolean, text: (string|null), citations: {text: string, href: string}[],
 *     expanded: boolean, fullyExpanded: boolean, sections: string[]},
 *   aiMode: {detected: boolean, text: (string|null), citations: {text: string, href: string}[]}}>}
 */
async function extractAISurfacesFromDocument(config) {
  const selectors = config ?? {
    aio: {
      root: "[data-lhcontainer][data-streaming-container][eid]",
      legacyRoot: '[data-hveid="CBMQAA"]',
      wrapper: ".Fgyi2e",
      controller: '[jscontroller="AkrxPe"]',
      contentSubtree: '[data-subtree="mfc"]',
      heading: ".Fzsovc.cwYVJe.RJPOee",
      header: ".heWuVc",
      showMoreButton: '[aria-label="Show more AI Overview"]',
      sourcesPanel: ".OZ9ddf.WAUd4",
      disclaimer: ".DuQANe.MSJHRb"
    },
    aim: {
      root: '[data-hveid="CAUQAA"]',
      wrapper: ".Fgyi2e"
    },
    expandWaitMs: 1500
  };
  const sn = window.google?.sn ?? "unknown";
  const surface = sn === "aim" ? "aim" : sn === "web" ? "web" : "unknown";

  /** Trimmed innerText (falling back to textContent) of an element, or "". */
  function textOf(el) {
    if (!el) return "";
    return (el.innerText ?? el.textContent ?? "").trim();
  }

  /** True when the element carries an "AI Overview" label in its heading, header or text. */
  function hasAIOverviewLabel(el) {
    const heading = el.querySelector(selectors.aio.heading);
    if (textOf(heading) === "AI Overview") return true;
    const header = el.querySelector(selectors.aio.header);
    if (textOf(header).split(/\n|\s{2,}/).some((part) => part.trim() === "AI Overview")) return true;
    return textOf(el).includes("AI Overview");
  }

  /**
   * Locates the AI Overview root: a labelled primary root, else the first
   * primary root, else the legacy root, else an ancestor (up to 8 levels) of
   * an "AI Overview" heading.
   */
  function findAIORoot() {
    const primaryRoots = Array.from(document.querySelectorAll(selectors.aio.root));
    const labeledPrimary = primaryRoots.find(hasAIOverviewLabel);
    if (labeledPrimary) return labeledPrimary;
    if (primaryRoots.length > 0) return primaryRoots[0];
    if (selectors.aio.legacyRoot) {
      const legacy = document.querySelector(selectors.aio.legacyRoot);
      if (legacy) return legacy;
    }
    const headings = document.querySelectorAll(`${selectors.aio.heading}, h1, h2, h3, [role="heading"]`);
    for (const h of headings) {
      if (textOf(h) !== "AI Overview") continue;
      let el = h.parentElement;
      for (let i = 0; i < 8 && el; i++) {
        if (el.matches(selectors.aio.root) || el.querySelector(selectors.aio.controller) || el.querySelector(selectors.aio.contentSubtree)) {
          return el;
        }
        el = el.parentElement;
      }
      return h.parentElement;
    }
    return null;
  }

  /**
   * Returns the readable text of `target` with chrome (links, buttons, source
   * panels, media, ...) removed, or null when nothing usable remains.
   */
  function cleanText(target) {
    if (!target) return null;
    const clone = target.cloneNode(true);
    clone.querySelectorAll([
      "script",
      "style",
      "noscript",
      "img",
      "picture",
      "video",
      selectors.aio.header,
      selectors.aio.showMoreButton,
      selectors.aio.sourcesPanel,
      selectors.aio.disclaimer,
      '[data-subtree="dfa"]',
      "[data-src-id]",
      '[role="dialog"]',
      ".HWMcu",
      ".bTFeG",
      ".CyMdWb",
      ".MFrAxb",
      ".F0OfWd.hfWAgb",
      ".x2qcTc.fZavHb",
      ".SvjEff",
      ".sR2MY",
      ".lKuDef",
      ".GSPQcc",
      "a[href]",
      "button",
      '[role="button"]'
    ].join(",")).forEach((el) => el.remove());

    // The clone is attached off-screen so that innerText reflects rendered layout.
    const holder = document.createElement("div");
    holder.style.position = "fixed";
    holder.style.left = "-10000px";
    holder.style.top = "0";
    holder.style.width = `${Math.max(320, Math.round(target.getBoundingClientRect?.().width || 960))}px`;
    holder.style.opacity = "0";
    holder.style.pointerEvents = "none";
    holder.append(clone);
    document.body.append(holder);
    let rendered;
    try {
      rendered = clone.innerText || clone.textContent || "";
    } finally {
      // Always detach the temporary holder, even if reading the text throws,
      // so no stray node is left behind in the live page.
      holder.remove();
    }

    const lines = rendered.replace(/\r/g, "").replace(/[ \t]+\n/g, "\n").replace(/\n[ \t]+/g, "\n").replace(/\n{3,}/g, "\n\n").replace(/[ \t]{2,}/g, " ").trim().split("\n").map((line) => line.replace(/\u00a0/g, " ").trim()).filter(Boolean);
    const filteredLines = [];
    for (let i = 0; i < lines.length; i++) {
      const line = lines[i];
      const next = lines[i + 1] ?? "";
      if (line === "AI Overview") continue;
      if (line === "Show more") continue;
      if (/^AI can make mistakes/i.test(line)) continue;
      if (/^Thank you\b/i.test(line)) continue;
      if (/^Your feedback helps Google improve/i.test(line)) continue;
      if (/^\+?\d+$/.test(line)) continue;
      // A short line followed by a "+N" counter is a source badge: skip both.
      if (/^\+\d+$/.test(next) && line.length <= 80) {
        i++;
        continue;
      }
      filteredLines.push(line);
    }
    const raw = filteredLines.join("\n").replace(/\n{3,}/g, "\n\n").trim();
    if (!raw || /not available|try again|can't generate/i.test(raw)) return null;
    return raw;
  }

  /**
   * Resolves an href to an absolute external http(s) URL. Google redirect
   * links are unwrapped via their `q`/`url` parameter; Google-internal and
   * non-http links yield null.
   */
  function normalizeHref(rawHref) {
    if (!rawHref || rawHref.startsWith("javascript:")) return null;
    let href = rawHref;
    try {
      const absolute = new URL(rawHref, window.location.href);
      const q = absolute.searchParams.get("q") ?? absolute.searchParams.get("url");
      if (/(\.|^)google\./i.test(absolute.hostname) && q?.startsWith("http")) {
        href = q;
      } else {
        href = absolute.href;
      }
    } catch {
      return null;
    }
    if (!/^https?:\/\//i.test(href)) return null;
    try {
      const url = new URL(href);
      const isGoogleInternal = /(\.|^)google\./i.test(url.hostname);
      if (isGoogleInternal) return null;
      return url.href;
    } catch {
      return null;
    }
  }

  /** Collects de-duplicated external links under `root` as `{ text, href }`. */
  function extractCitations(root) {
    if (!root) return [];
    const seen = /* @__PURE__ */ new Set();
    const citations = [];
    for (const a of Array.from(root.querySelectorAll("a[href]"))) {
      const href = normalizeHref(a.getAttribute("href") ?? "");
      if (!href || seen.has(href)) continue;
      seen.add(href);
      let fallbackHost = "";
      try {
        fallbackHost = new URL(href).hostname.replace(/^www\./, "");
      } catch {
        // href was already validated by normalizeHref; keep the empty fallback.
      }
      citations.push({
        text: textOf(a) || fallbackHost || href,
        href
      });
    }
    return citations;
  }

  /** Clicks the collapsed "Show more" button, waits, and reports whether it clicked. */
  async function maybeExpand(root) {
    const button = root.querySelector(selectors.aio.showMoreButton);
    if (!button || button.getAttribute("aria-expanded") !== "false") return false;
    button.click();
    const waitMs = selectors.expandWaitMs ?? 1500;
    if (waitMs > 0) await new Promise((resolve) => setTimeout(resolve, waitMs));
    return true;
  }

  const aioRoot = findAIORoot();
  let aioText = null;
  let aioCitations = [];
  let aioExpanded = false;
  let aioFullyExpanded = false;
  let aioSections = [];
  if (aioRoot) {
    aioExpanded = await maybeExpand(aioRoot);
    const controller = aioRoot.querySelector(selectors.aio.controller);
    const contentSubtree = aioRoot.querySelector(selectors.aio.contentSubtree);
    const showMore = aioRoot.querySelector(selectors.aio.showMoreButton);
    aioFullyExpanded = controller?.getAttribute("data-trnct") === "false" || showMore?.getAttribute("aria-expanded") === "true" || !showMore;
    aioText = cleanText(contentSubtree ?? controller ?? aioRoot);
    // Numbered lines ("1. ...") are reported as section headings.
    aioSections = (aioText ?? "").split("\n").map((line) => line.trim()).filter((line) => /^\d+\.\s+.+/.test(line));
    aioCitations = extractCitations(aioRoot);
  }

  const aimRoot = document.querySelector(selectors.aim.root);
  const aimDetected = surface === "aim" && !!aimRoot;
  const aimContainer = aimRoot?.closest(selectors.aim.wrapper) ?? aimRoot;
  const aimText = cleanText(aimContainer);
  const aimCitations = aimDetected ? extractCitations(aimContainer) : [];

  return {
    surface,
    aiOverview: {
      detected: !!aioRoot && aioText !== null,
      text: aioText,
      citations: aioCitations,
      expanded: aioExpanded,
      fullyExpanded: aioFullyExpanded,
      sections: aioSections
    },
    aiMode: {
      detected: aimDetected && aimText !== null,
      text: aimText,
      citations: aimCitations
    }
  };
}
|
|
1249
|
+
|
|
1250
|
+
// src/extractor/PAAExtractor.ts
|
|
1251
|
+
// User-agent string identifying as desktop Chrome 124 on macOS.
var DESKTOP_USER_AGENT2 =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36";
// User-agent string identifying as mobile Safari on iOS 17.5.
var MOBILE_USER_AGENT2 =
  "Mozilla/5.0 (iPhone; CPU iPhone OS 17_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Mobile/15E148 Safari/604.1";
|
|
1253
|
+
var PAAExtractor = class {
|
|
1254
|
+
constructor(driver, reporter) {
|
|
1255
|
+
this.driver = driver;
|
|
1256
|
+
this.reporter = reporter;
|
|
1257
|
+
}
|
|
1258
|
+
driver;
|
|
1259
|
+
reporter;
|
|
1260
|
+
normalizeQuestion(q) {
|
|
1261
|
+
return q.toLowerCase().replace(/[^\w\s]/g, "").replace(/\s+/g, " ").trim();
|
|
1262
|
+
}
|
|
1263
|
+
throwIfAborted(signal) {
|
|
1264
|
+
if (!signal?.aborted) return;
|
|
1265
|
+
if (signal.reason instanceof DOMException && signal.reason.name === "TimeoutError") throw signal.reason;
|
|
1266
|
+
throw new RequestAbortedError();
|
|
1267
|
+
}
|
|
1268
|
+
async throwIfCaptcha(page, context) {
|
|
1269
|
+
const captchaCount = await page.locator(PAASelectors.captchaMarker).count().catch(() => 0);
|
|
1270
|
+
if (captchaCount > 0) {
|
|
1271
|
+
throw new CaptchaError(`${context} returned a CAPTCHA \u2014 retrying with a fresh session.`);
|
|
1272
|
+
}
|
|
1273
|
+
}
|
|
1274
|
+
async extractVisibleItems(page) {
|
|
1275
|
+
const sels = PAASelectors;
|
|
1276
|
+
const raw = await page.evaluate((selectors) => {
|
|
1277
|
+
function cleanText(el) {
|
|
1278
|
+
if (!el) return "";
|
|
1279
|
+
const parts = [];
|
|
1280
|
+
for (const n of el.childNodes) {
|
|
1281
|
+
if (n.nodeType === Node.TEXT_NODE) {
|
|
1282
|
+
const text = n.textContent?.trim();
|
|
1283
|
+
if (text) parts.push(text);
|
|
1284
|
+
} else if (n.tagName === "STYLE" || n.tagName === "SCRIPT") {
|
|
1285
|
+
continue;
|
|
1286
|
+
} else {
|
|
1287
|
+
const text = cleanText(n);
|
|
1288
|
+
if (text) parts.push(text);
|
|
1289
|
+
}
|
|
1290
|
+
}
|
|
1291
|
+
return parts.join(" ").replace(/\s+/g, " ").trim();
|
|
1292
|
+
}
|
|
1293
|
+
return Array.from(document.querySelectorAll(selectors.item)).map((pair) => ({
|
|
1294
|
+
question: pair.getAttribute(selectors.itemDataQ) || pair.getAttribute(selectors.itemDataInitQ) || "",
|
|
1295
|
+
answer: cleanText(pair.querySelector(selectors.answerContainer)) || void 0,
|
|
1296
|
+
sourceTitle: pair.querySelector(selectors.sourceTitle)?.innerText?.trim() || void 0,
|
|
1297
|
+
sourceSite: pair.querySelector(selectors.sourceSite)?.innerText?.trim() || void 0,
|
|
1298
|
+
sourceCite: pair.querySelector(selectors.sourceCite)?.innerText?.trim() || void 0
|
|
1299
|
+
}));
|
|
1300
|
+
}, sels);
|
|
1301
|
+
return raw.flatMap((item) => {
|
|
1302
|
+
const cleaned = {
|
|
1303
|
+
...item,
|
|
1304
|
+
answer: cleanPAAAnswerText(item.answer, item.question, item.sourceTitle)
|
|
1305
|
+
};
|
|
1306
|
+
const result = RawPAAItemSchema.safeParse(cleaned);
|
|
1307
|
+
if (!result.success) {
|
|
1308
|
+
console.warn("[PAAExtractor] item parse failed:", item.question, result.error.issues[0]?.message);
|
|
1309
|
+
return [];
|
|
1310
|
+
}
|
|
1311
|
+
return [result.data];
|
|
1312
|
+
});
|
|
1313
|
+
}
|
|
1314
|
+
async clickItem(page, questionText) {
|
|
1315
|
+
try {
|
|
1316
|
+
const pairLocator = page.locator(
|
|
1317
|
+
`${PAASelectors.item}[data-q="${questionText}"], ${PAASelectors.item}[data-initq="${questionText}"]`
|
|
1318
|
+
).first();
|
|
1319
|
+
await pairLocator.click();
|
|
1320
|
+
} catch {
|
|
1321
|
+
}
|
|
1322
|
+
}
|
|
1323
|
+
toFlatRow(item, depth, parentQuestion, seed) {
|
|
1324
|
+
return {
|
|
1325
|
+
seed_query: seed,
|
|
1326
|
+
question: item.question,
|
|
1327
|
+
answer: item.answer ?? "",
|
|
1328
|
+
source_title: item.sourceTitle ?? "",
|
|
1329
|
+
source_site: item.sourceSite ?? "",
|
|
1330
|
+
source_cite: item.sourceCite ?? "",
|
|
1331
|
+
depth,
|
|
1332
|
+
parent_question: parentQuestion ?? "",
|
|
1333
|
+
extracted_at: (/* @__PURE__ */ new Date()).toISOString()
|
|
1334
|
+
};
|
|
1335
|
+
}
|
|
1336
|
+
async runBFS(page, options, signal) {
|
|
1337
|
+
const seenKeys = /* @__PURE__ */ new Set();
|
|
1338
|
+
const seenQs = /* @__PURE__ */ new Set();
|
|
1339
|
+
const orderedQs = [];
|
|
1340
|
+
const results = [];
|
|
1341
|
+
const readAllQs = () => page.evaluate(
|
|
1342
|
+
({ sel, dataQ, dataInitQ, questionEl }) => Array.from(document.querySelectorAll(sel)).map(
|
|
1343
|
+
(el) => el.getAttribute(dataQ) || el.getAttribute(dataInitQ) || el.querySelector(questionEl)?.innerText?.trim() || ""
|
|
1344
|
+
).filter(Boolean),
|
|
1345
|
+
{ sel: PAASelectors.item, dataQ: PAASelectors.itemDataQ, dataInitQ: PAASelectors.itemDataInitQ, questionEl: PAASelectors.itemQuestionEl }
|
|
1346
|
+
);
|
|
1347
|
+
let round = 0;
|
|
1348
|
+
while (seenQs.size < options.maxQuestions) {
|
|
1349
|
+
this.throwIfAborted(signal);
|
|
1350
|
+
await this.throwIfCaptcha(page, "Google PAA expansion");
|
|
1351
|
+
const beforeQs = await readAllQs();
|
|
1352
|
+
if (beforeQs.length >= options.maxQuestions) break;
|
|
1353
|
+
const unexpandedSel = `${PAASelectors.item}:not(.${PAASelectors.expandedClass}) ${PAASelectors.clickTarget}`;
|
|
1354
|
+
const unexpandedCount = await page.locator(unexpandedSel).count();
|
|
1355
|
+
if (unexpandedCount === 0) break;
|
|
1356
|
+
this.reporter.onDepth(++round);
|
|
1357
|
+
for (let ci = 0; ci < unexpandedCount; ci++) {
|
|
1358
|
+
this.throwIfAborted(signal);
|
|
1359
|
+
try {
|
|
1360
|
+
const btn = page.locator(unexpandedSel).first();
|
|
1361
|
+
await btn.scrollIntoViewIfNeeded();
|
|
1362
|
+
await btn.hover({ force: true });
|
|
1363
|
+
await page.waitForTimeout(100);
|
|
1364
|
+
await btn.click({ force: true });
|
|
1365
|
+
await page.waitForTimeout(500);
|
|
1366
|
+
} catch {
|
|
1367
|
+
}
|
|
1368
|
+
}
|
|
1369
|
+
await page.waitForFunction(
|
|
1370
|
+
({ sel, min }) => document.querySelectorAll(sel).length > min,
|
|
1371
|
+
{ sel: PAASelectors.item, min: beforeQs.length },
|
|
1372
|
+
{ timeout: 5e3 }
|
|
1373
|
+
).catch(() => {
|
|
1374
|
+
});
|
|
1375
|
+
await this.throwIfCaptcha(page, "Google PAA expansion");
|
|
1376
|
+
const afterQs = await readAllQs();
|
|
1377
|
+
if (afterQs.length === beforeQs.length) break;
|
|
1378
|
+
for (const q of afterQs) {
|
|
1379
|
+
if (!seenQs.has(q)) {
|
|
1380
|
+
seenQs.add(q);
|
|
1381
|
+
orderedQs.push(q);
|
|
1382
|
+
}
|
|
1383
|
+
}
|
|
1384
|
+
}
|
|
1385
|
+
const itemMap = new Map((await this.extractVisibleItems(page)).map((i) => [i.question, i]));
|
|
1386
|
+
for (const q of orderedQs) {
|
|
1387
|
+
if (results.length >= options.maxQuestions) break;
|
|
1388
|
+
const key = this.normalizeQuestion(q);
|
|
1389
|
+
if (seenKeys.has(key)) continue;
|
|
1390
|
+
seenKeys.add(key);
|
|
1391
|
+
const item = itemMap.get(q);
|
|
1392
|
+
if (item) {
|
|
1393
|
+
results.push(this.toFlatRow(item, 1, null, options.query));
|
|
1394
|
+
this.reporter.onQuestion({ question: item.question, answer: item.answer ?? null, sourceTitle: item.sourceTitle ?? null, sourceSite: item.sourceSite ?? null, sourceCite: item.sourceCite ?? null, depth: 1, parentQuestion: null, children: [] });
|
|
1395
|
+
} else {
|
|
1396
|
+
results.push(this.toFlatRow({ question: q, answer: void 0, sourceTitle: void 0, sourceSite: void 0, sourceCite: void 0 }, 1, null, options.query));
|
|
1397
|
+
}
|
|
1398
|
+
}
|
|
1399
|
+
return results;
|
|
1400
|
+
}
|
|
1401
|
+
async extractVideos(page) {
|
|
1402
|
+
const vsels = VideoSelectors;
|
|
1403
|
+
return page.evaluate((sels) => {
|
|
1404
|
+
const results = [];
|
|
1405
|
+
const containers = Array.from(document.querySelectorAll(sels.container));
|
|
1406
|
+
for (const container of containers) {
|
|
1407
|
+
const headingEl = container.querySelector(sels.sectionHeading);
|
|
1408
|
+
const headingText = headingEl?.textContent?.trim() ?? "";
|
|
1409
|
+
const type = headingText.toLowerCase().includes("short") ? "short_video" : "video";
|
|
1410
|
+
const items = Array.from(container.querySelectorAll(sels.item));
|
|
1411
|
+
for (const a of items) {
|
|
1412
|
+
const href = a.href;
|
|
1413
|
+
if (!href || !href.includes("youtube") && !href.includes("youtu.be")) continue;
|
|
1414
|
+
const raw = a.textContent?.trim() ?? "";
|
|
1415
|
+
const ytIdx = raw.indexOf("YouTube");
|
|
1416
|
+
if (ytIdx === -1) continue;
|
|
1417
|
+
const title = raw.slice(0, ytIdx).trim();
|
|
1418
|
+
const remainder = raw.slice(ytIdx + 7).replace(/^[·\s·]+/, "");
|
|
1419
|
+
const channelMatch = remainder.match(/^([^·\n]+)/);
|
|
1420
|
+
const channel = channelMatch ? channelMatch[1].trim() : "";
|
|
1421
|
+
if (title) results.push({ type, title, channel, platform: "YouTube", duration: "", url: href });
|
|
1422
|
+
}
|
|
1423
|
+
}
|
|
1424
|
+
return results;
|
|
1425
|
+
}, vsels);
|
|
1426
|
+
}
|
|
1427
|
+
async extractForums(page) {
|
|
1428
|
+
const fsels = ForumSelectors;
|
|
1429
|
+
return page.evaluate((sels) => {
|
|
1430
|
+
const results = [];
|
|
1431
|
+
const sections = Array.from(document.querySelectorAll(sels.section));
|
|
1432
|
+
const forumSection = sections.find((s) => s.textContent?.includes("Discussions"));
|
|
1433
|
+
if (!forumSection) return results;
|
|
1434
|
+
const items = Array.from(forumSection.querySelectorAll(sels.item));
|
|
1435
|
+
for (const a of items) {
|
|
1436
|
+
const href = a.href;
|
|
1437
|
+
if (!href) continue;
|
|
1438
|
+
const titleEl = a.querySelector(sels.title);
|
|
1439
|
+
const sourceEl = a.querySelector(sels.source);
|
|
1440
|
+
const title = titleEl?.textContent?.trim() ?? "";
|
|
1441
|
+
const source = sourceEl?.textContent?.trim() ?? "";
|
|
1442
|
+
if (title) results.push({ title, source, url: href });
|
|
1443
|
+
}
|
|
1444
|
+
return results;
|
|
1445
|
+
}, fsels);
|
|
1446
|
+
}
|
|
1447
|
+
async extractShortVideos(page, shortUrl) {
|
|
1448
|
+
try {
|
|
1449
|
+
await page.goto(shortUrl, { waitUntil: "domcontentloaded" });
|
|
1450
|
+
await page.waitForTimeout(1500);
|
|
1451
|
+
} catch {
|
|
1452
|
+
return [];
|
|
1453
|
+
}
|
|
1454
|
+
await this.throwIfCaptcha(page, "Google short video search");
|
|
1455
|
+
const svSels = {
|
|
1456
|
+
item: ShortVideoSelectors.item,
|
|
1457
|
+
platforms: [...ShortVideoSelectors.platforms]
|
|
1458
|
+
};
|
|
1459
|
+
const raw = await page.evaluate((sels) => {
|
|
1460
|
+
const seen = /* @__PURE__ */ new Set();
|
|
1461
|
+
const results = [];
|
|
1462
|
+
const items = Array.from(document.querySelectorAll(sels.item));
|
|
1463
|
+
const videoHosts = ["youtube.com", "youtu.be", "tiktok.com", "instagram.com", "facebook.com", "fb.watch"];
|
|
1464
|
+
const byHref = /* @__PURE__ */ new Map();
|
|
1465
|
+
for (const a of items) {
|
|
1466
|
+
const href = a.href;
|
|
1467
|
+
if (!href) continue;
|
|
1468
|
+
if (!videoHosts.some((h) => href.includes(h))) continue;
|
|
1469
|
+
const text = a.textContent?.trim() ?? "";
|
|
1470
|
+
if (!byHref.has(href)) byHref.set(href, []);
|
|
1471
|
+
byHref.get(href).push(text);
|
|
1472
|
+
}
|
|
1473
|
+
for (const [href, texts] of byHref.entries()) {
|
|
1474
|
+
if (seen.has(href)) continue;
|
|
1475
|
+
seen.add(href);
|
|
1476
|
+
const duration = texts.find((t) => /^\d+:\d+$/.test(t)) ?? "";
|
|
1477
|
+
const titleText = texts.find((t) => !/^\d+:\d+$/.test(t) && t.length > 5) ?? "";
|
|
1478
|
+
if (!titleText) continue;
|
|
1479
|
+
let title = titleText;
|
|
1480
|
+
let platform = "";
|
|
1481
|
+
let channel = "";
|
|
1482
|
+
for (const p of sels.platforms) {
|
|
1483
|
+
let lastIdx = -1;
|
|
1484
|
+
let search = 0;
|
|
1485
|
+
while (true) {
|
|
1486
|
+
const found = titleText.indexOf(p, search);
|
|
1487
|
+
if (found === -1) break;
|
|
1488
|
+
lastIdx = found;
|
|
1489
|
+
search = found + 1;
|
|
1490
|
+
}
|
|
1491
|
+
if (lastIdx === -1) continue;
|
|
1492
|
+
const after = titleText.slice(lastIdx + p.length);
|
|
1493
|
+
const isSourceTag = /^[\s·]/.test(after) || after.trim() === "";
|
|
1494
|
+
if (!isSourceTag) continue;
|
|
1495
|
+
title = titleText.slice(0, lastIdx).trim();
|
|
1496
|
+
platform = p;
|
|
1497
|
+
const stripped = after.replace(/^[\s·]+/, "");
|
|
1498
|
+
const dotIdx = stripped.indexOf("\xB7");
|
|
1499
|
+
channel = (dotIdx === -1 ? stripped : stripped.slice(0, dotIdx)).trim();
|
|
1500
|
+
break;
|
|
1501
|
+
}
|
|
1502
|
+
if (title) results.push({ title, channel, platform, duration, url: href });
|
|
1503
|
+
}
|
|
1504
|
+
return results;
|
|
1505
|
+
}, svSels);
|
|
1506
|
+
return raw.map((r) => ({ type: "short_video", ...r }));
|
|
1507
|
+
}
|
|
1508
|
+
async extractWhatPeopleSaying(page) {
|
|
1509
|
+
const sels = WhatPeopleSayingSelectors;
|
|
1510
|
+
return page.evaluate((s) => {
|
|
1511
|
+
const section = Array.from(document.querySelectorAll(s.sectionTag)).find((el) => el.textContent?.includes(s.sectionHeadingText)) ?? document.querySelector(".yG4QQe.TBC9ub.NbhJ1c");
|
|
1512
|
+
if (!section) return [];
|
|
1513
|
+
return Array.from(section.querySelectorAll(s.card)).map((card) => {
|
|
1514
|
+
const link = card.querySelector(s.cardLink);
|
|
1515
|
+
const url = link?.href ?? "";
|
|
1516
|
+
const titleH1 = card.querySelector(s.titleH1)?.textContent?.trim();
|
|
1517
|
+
const titleDiv = card.querySelector(s.titleDiv)?.textContent?.trim();
|
|
1518
|
+
const title = titleH1 ?? titleDiv ?? "";
|
|
1519
|
+
const sourceText = card.querySelector(s.source)?.textContent?.trim() ?? "";
|
|
1520
|
+
const platformEl = card.querySelector(s.platformBadge);
|
|
1521
|
+
const platformText = platformEl?.textContent?.trim() ?? "";
|
|
1522
|
+
const ytChannel = card.querySelector(s.ytChannel)?.textContent?.trim() ?? "";
|
|
1523
|
+
const ytDate = card.querySelector(s.ytDate)?.textContent?.trim() ?? "";
|
|
1524
|
+
const authorNote = card.querySelector(s.authorNote)?.textContent?.trim() ?? null;
|
|
1525
|
+
const commentLabelEl = card.querySelector(s.popularCommentLabel);
|
|
1526
|
+
let popularComment = null;
|
|
1527
|
+
if (commentLabelEl) {
|
|
1528
|
+
let next = commentLabelEl.nextSibling;
|
|
1529
|
+
while (next) {
|
|
1530
|
+
const t = next.textContent?.trim();
|
|
1531
|
+
if (t) {
|
|
1532
|
+
popularComment = t;
|
|
1533
|
+
break;
|
|
1534
|
+
}
|
|
1535
|
+
next = next.nextSibling;
|
|
1536
|
+
}
|
|
1537
|
+
}
|
|
1538
|
+
const allSpans = Array.from(card.querySelectorAll("span"));
|
|
1539
|
+
const duration = allSpans.find((s2) => /^\d+:\d+$/.test(s2.textContent?.trim() ?? ""))?.textContent?.trim() ?? null;
|
|
1540
|
+
const engagementParts = allSpans.map((s2) => s2.textContent?.trim() ?? "").filter(
|
|
1541
|
+
(t) => /\d/.test(t) && (t.includes("comment") || t.includes("reaction") || t.includes("view") || t.includes("like") || t.includes("share"))
|
|
1542
|
+
);
|
|
1543
|
+
const engagement = engagementParts[0] ?? "";
|
|
1544
|
+
const dateCandidates = allSpans.map((s2) => s2.textContent?.trim() ?? "").filter((t) => /\d+ (day|week|month|year|hour)s? ago|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec/.test(t));
|
|
1545
|
+
const date = ytDate || (dateCandidates[0] ?? "");
|
|
1546
|
+
const platform = platformText || (ytChannel ? "YouTube" : "");
|
|
1547
|
+
const source = ytChannel || sourceText;
|
|
1548
|
+
let type = "unknown";
|
|
1549
|
+
const pl = platform.toLowerCase();
|
|
1550
|
+
const src = source.toLowerCase();
|
|
1551
|
+
const srcRaw = sourceText.toLowerCase();
|
|
1552
|
+
if (pl.includes("reddit") || src.startsWith("r/")) type = "reddit";
|
|
1553
|
+
else if (pl.includes("facebook") || srcRaw.includes("facebook")) type = "facebook";
|
|
1554
|
+
else if (pl.includes("instagram") || srcRaw.includes("instagram")) type = "instagram";
|
|
1555
|
+
else if (pl.includes("tiktok") || srcRaw.includes("tiktok")) type = "tiktok";
|
|
1556
|
+
else if (pl.includes("youtube") || !!ytChannel) type = "youtube";
|
|
1557
|
+
else type = "news";
|
|
1558
|
+
return { type, title, url, source, platform, popularComment, engagement, date, duration, authorNote };
|
|
1559
|
+
});
|
|
1560
|
+
}, sels);
|
|
1561
|
+
}
|
|
1562
|
+
async extractOrganicResults(page) {
|
|
1563
|
+
const sels = OrganicSelectors;
|
|
1564
|
+
return page.evaluate((s) => {
|
|
1565
|
+
const out = [];
|
|
1566
|
+
let pos = 0;
|
|
1567
|
+
document.querySelectorAll(s.result).forEach((card) => {
|
|
1568
|
+
const titleEl = card.querySelector(s.title);
|
|
1569
|
+
if (!titleEl) return;
|
|
1570
|
+
const title = titleEl.textContent?.trim() ?? "";
|
|
1571
|
+
const linkEl = titleEl.closest("a");
|
|
1572
|
+
const url = linkEl?.href ?? "";
|
|
1573
|
+
if (!title || !url) return;
|
|
1574
|
+
pos++;
|
|
1575
|
+
const cite = card.querySelector(s.cite)?.textContent?.trim() ?? null;
|
|
1576
|
+
const snippet = card.querySelector(s.snippet)?.textContent?.trim() ?? null;
|
|
1577
|
+
const isRedditStyle = !!card.querySelector(s.redditCite);
|
|
1578
|
+
const ratingEl = card.querySelector(s.ratingWrap);
|
|
1579
|
+
const inlineRating = ratingEl ? { value: ratingEl.querySelector(s.ratingValue)?.textContent?.trim() ?? "", count: ratingEl.querySelector(s.reviewCount)?.textContent?.trim() ?? "" } : null;
|
|
1580
|
+
let domain = "";
|
|
1581
|
+
try {
|
|
1582
|
+
domain = new URL(url).hostname.replace(/^www\./, "");
|
|
1583
|
+
} catch {
|
|
1584
|
+
domain = card.querySelector(s.siteName)?.textContent?.trim() ?? "";
|
|
1585
|
+
}
|
|
1586
|
+
out.push({ position: pos, title, url, domain, cite, snippet, isRedditStyle, inlineRating });
|
|
1587
|
+
});
|
|
1588
|
+
return out;
|
|
1589
|
+
}, sels);
|
|
1590
|
+
}
|
|
1591
|
+
async extractLocalPack(page) {
|
|
1592
|
+
const sels = LocalPackSelectors;
|
|
1593
|
+
return page.evaluate((s) => {
|
|
1594
|
+
const out = [];
|
|
1595
|
+
let container = null;
|
|
1596
|
+
document.querySelectorAll('[role="heading"]').forEach((h) => {
|
|
1597
|
+
if (!container && h.textContent?.includes(s.headingText)) container = h.closest("[data-hveid]");
|
|
1598
|
+
});
|
|
1599
|
+
if (!container) return out;
|
|
1600
|
+
container.querySelectorAll(s.card).forEach((card, i) => {
|
|
1601
|
+
const name = card.querySelector(s.name)?.textContent?.trim() ?? "";
|
|
1602
|
+
if (!name) return;
|
|
1603
|
+
const rating = card.querySelector(s.ratingValue)?.textContent?.trim() ?? null;
|
|
1604
|
+
const reviewRaw = card.querySelector(s.reviewCount)?.textContent?.trim() ?? null;
|
|
1605
|
+
const reviewCount = reviewRaw ? reviewRaw.replace(/[()]/g, "").trim() : null;
|
|
1606
|
+
let cid = card.querySelector("a[data-cid]")?.getAttribute("data-cid") ?? null;
|
|
1607
|
+
if (!cid) {
|
|
1608
|
+
for (const link of Array.from(card.querySelectorAll("a[href]"))) {
|
|
1609
|
+
const m1 = link.href.match(/[?&]cid=(\d+)/);
|
|
1610
|
+
if (m1) {
|
|
1611
|
+
cid = m1[1];
|
|
1612
|
+
break;
|
|
1613
|
+
}
|
|
1614
|
+
const m2 = link.href.match(/!1s0x[0-9a-f]+:0x([0-9a-f]+)/i);
|
|
1615
|
+
if (m2) {
|
|
1616
|
+
try {
|
|
1617
|
+
cid = BigInt("0x" + m2[1]).toString();
|
|
1618
|
+
} catch {
|
|
1619
|
+
}
|
|
1620
|
+
if (cid) break;
|
|
1621
|
+
}
|
|
1622
|
+
}
|
|
1623
|
+
}
|
|
1624
|
+
const metadata = [];
|
|
1625
|
+
card.querySelectorAll("div, span").forEach((el) => {
|
|
1626
|
+
const text = Array.from(el.childNodes).filter((n) => n.nodeType === 3).map((n) => n.textContent?.trim() ?? "").filter((t) => t.length > 1 && t.length < 120).join(" ");
|
|
1627
|
+
if (text && !metadata.includes(text)) metadata.push(text);
|
|
1628
|
+
});
|
|
1629
|
+
const links = Array.from(card.querySelectorAll("a[href]"));
|
|
1630
|
+
const directionsUrl = links.find((a) => a.href.includes("google.com/maps"))?.href ?? null;
|
|
1631
|
+
const websiteUrl = links.find((a) => !a.href.includes("google.com") && a.href.startsWith("http"))?.href ?? null;
|
|
1632
|
+
out.push({ position: i + 1, name, cid, rating, reviewCount, metadata, websiteUrl, directionsUrl });
|
|
1633
|
+
});
|
|
1634
|
+
return out;
|
|
1635
|
+
}, sels);
|
|
1636
|
+
}
|
|
1637
|
+
async extractEntityIds(page) {
|
|
1638
|
+
return page.evaluate(() => {
|
|
1639
|
+
const kgIds = /* @__PURE__ */ new Set();
|
|
1640
|
+
const cids = /* @__PURE__ */ new Set();
|
|
1641
|
+
const gcids = /* @__PURE__ */ new Set();
|
|
1642
|
+
const recordMap = /* @__PURE__ */ new Map();
|
|
1643
|
+
function nameFromWrapper(el) {
|
|
1644
|
+
const sel = [".OSrXXb", ".dbg0pd", ".tzt0oe", '[role="heading"]', "h3"];
|
|
1645
|
+
for (const s of sel) {
|
|
1646
|
+
const found = el.querySelector(s);
|
|
1647
|
+
if (found?.textContent?.trim()) return found.textContent.trim();
|
|
1648
|
+
}
|
|
1649
|
+
return "";
|
|
1650
|
+
}
|
|
1651
|
+
document.querySelectorAll('[id^="pv-/g/"]').forEach((wrapper) => {
|
|
1652
|
+
const raw = wrapper.getAttribute("id");
|
|
1653
|
+
if (!raw) return;
|
|
1654
|
+
const kgId = raw.replace("pv-", "");
|
|
1655
|
+
kgIds.add(kgId);
|
|
1656
|
+
const name = nameFromWrapper(wrapper);
|
|
1657
|
+
const cidEl = wrapper.querySelector("a[data-cid]");
|
|
1658
|
+
const cid = cidEl?.getAttribute("data-cid") ?? null;
|
|
1659
|
+
if (cid) cids.add(cid);
|
|
1660
|
+
if (name) recordMap.set(kgId, { name, kgId, cid, gcid: null });
|
|
1661
|
+
});
|
|
1662
|
+
document.querySelectorAll("[data-mid]").forEach((el) => {
|
|
1663
|
+
const mid = el.getAttribute("data-mid");
|
|
1664
|
+
if (!mid?.startsWith("/g/")) return;
|
|
1665
|
+
kgIds.add(mid);
|
|
1666
|
+
if (!recordMap.has(mid)) {
|
|
1667
|
+
const name = nameFromWrapper(el);
|
|
1668
|
+
if (name) recordMap.set(mid, { name, kgId: mid, cid: null, gcid: null });
|
|
1669
|
+
}
|
|
1670
|
+
});
|
|
1671
|
+
document.querySelectorAll(".w7Dbne").forEach((card) => {
|
|
1672
|
+
const cidEl = card.querySelector("a[data-cid]");
|
|
1673
|
+
const cid = cidEl?.getAttribute("data-cid") ?? null;
|
|
1674
|
+
if (!cid) return;
|
|
1675
|
+
cids.add(cid);
|
|
1676
|
+
const name = card.querySelector(".OSrXXb")?.textContent?.trim() ?? "";
|
|
1677
|
+
if (!name) return;
|
|
1678
|
+
const kgIdEl = card.querySelector('[id^="pv-/g/"]');
|
|
1679
|
+
const kgId = kgIdEl ? kgIdEl.getAttribute("id").replace("pv-", "") : null;
|
|
1680
|
+
const key = kgId ?? `cid:${cid}`;
|
|
1681
|
+
if (recordMap.has(key)) {
|
|
1682
|
+
const existing = recordMap.get(key);
|
|
1683
|
+
if (!existing.cid) recordMap.set(key, { ...existing, cid });
|
|
1684
|
+
} else {
|
|
1685
|
+
recordMap.set(key, { name, kgId, cid, gcid: null });
|
|
1686
|
+
}
|
|
1687
|
+
});
|
|
1688
|
+
document.querySelectorAll("a[data-cid]").forEach((el) => {
|
|
1689
|
+
const cid = el.getAttribute("data-cid");
|
|
1690
|
+
if (!cid) return;
|
|
1691
|
+
cids.add(cid);
|
|
1692
|
+
const alreadyNamed = [...recordMap.values()].some((r) => r.cid === cid);
|
|
1693
|
+
if (!alreadyNamed) {
|
|
1694
|
+
let node = el.parentElement;
|
|
1695
|
+
let name = "";
|
|
1696
|
+
for (let i = 0; i < 8 && node; i++) {
|
|
1697
|
+
const h = node.querySelector('.OSrXXb, .dbg0pd, [role="heading"], h3');
|
|
1698
|
+
if (h?.textContent?.trim()) {
|
|
1699
|
+
name = h.textContent.trim();
|
|
1700
|
+
break;
|
|
1701
|
+
}
|
|
1702
|
+
node = node.parentElement;
|
|
1703
|
+
}
|
|
1704
|
+
if (name) recordMap.set(`cid:${cid}`, { name, kgId: null, cid, gcid: null });
|
|
1705
|
+
}
|
|
1706
|
+
});
|
|
1707
|
+
const scriptContent = Array.from(document.querySelectorAll("script:not([src])")).map((s) => s.textContent ?? "").filter((t) => t.length > 1e4).join("\n");
|
|
1708
|
+
for (const m of scriptContent.matchAll(/\/g\/[a-zA-Z0-9_-]{5,20}/g)) kgIds.add(m[0]);
|
|
1709
|
+
for (const m of scriptContent.matchAll(/gcid:[a-zA-Z0-9_]+/g)) gcids.add(m[0]);
|
|
1710
|
+
for (const m of scriptContent.matchAll(/0x[0-9a-f]+:0x([0-9a-f]+)/gi)) {
|
|
1711
|
+
try {
|
|
1712
|
+
cids.add(BigInt("0x" + m[1]).toString());
|
|
1713
|
+
} catch {
|
|
1714
|
+
}
|
|
1715
|
+
}
|
|
1716
|
+
return { entities: [...recordMap.values()], kgIds: [...kgIds], cids: [...cids], gcids: [...gcids] };
|
|
1717
|
+
});
|
|
1718
|
+
}
|
|
1719
|
+
mergeLocalPackIntoEntities(entityIds, localPack) {
|
|
1720
|
+
const cidSet = new Set(entityIds.cids);
|
|
1721
|
+
const records = entityIds.entities.map((r) => ({ ...r }));
|
|
1722
|
+
for (const biz of localPack) {
|
|
1723
|
+
if (!biz.cid) continue;
|
|
1724
|
+
cidSet.add(biz.cid);
|
|
1725
|
+
const nameNorm = biz.name.toLowerCase().trim();
|
|
1726
|
+
const byName = records.find((r) => r.name.toLowerCase().trim() === nameNorm);
|
|
1727
|
+
if (byName) {
|
|
1728
|
+
if (!byName.cid) byName.cid = biz.cid;
|
|
1729
|
+
} else if (!records.find((r) => r.cid === biz.cid)) {
|
|
1730
|
+
records.push({ name: biz.name, kgId: null, cid: biz.cid, gcid: null });
|
|
1731
|
+
}
|
|
1732
|
+
}
|
|
1733
|
+
return { ...entityIds, entities: records, cids: [...cidSet] };
|
|
1734
|
+
}
|
|
1735
|
+
async extractAISurfaces(page) {
|
|
1736
|
+
return page.evaluate(extractAISurfacesFromDocument, {
|
|
1737
|
+
aio: AIOverviewSelectors,
|
|
1738
|
+
aim: AIModeSelectors,
|
|
1739
|
+
expandWaitMs: 1500
|
|
1740
|
+
});
|
|
1741
|
+
}
|
|
1742
|
+
buildTree(flat, _seed) {
|
|
1743
|
+
const roots = [];
|
|
1744
|
+
const nodeMap = /* @__PURE__ */ new Map();
|
|
1745
|
+
for (const row of flat) {
|
|
1746
|
+
const node = {
|
|
1747
|
+
question: row.question,
|
|
1748
|
+
answer: row.answer || null,
|
|
1749
|
+
sourceTitle: row.source_title || null,
|
|
1750
|
+
sourceSite: row.source_site || null,
|
|
1751
|
+
sourceCite: row.source_cite || null,
|
|
1752
|
+
depth: row.depth,
|
|
1753
|
+
parentQuestion: row.parent_question || null,
|
|
1754
|
+
children: []
|
|
1755
|
+
};
|
|
1756
|
+
nodeMap.set(row.question, node);
|
|
1757
|
+
}
|
|
1758
|
+
for (const node of nodeMap.values()) {
|
|
1759
|
+
if (node.parentQuestion && nodeMap.has(node.parentQuestion)) {
|
|
1760
|
+
nodeMap.get(node.parentQuestion).children.push(node);
|
|
1761
|
+
} else {
|
|
1762
|
+
roots.push(node);
|
|
1763
|
+
}
|
|
1764
|
+
}
|
|
1765
|
+
return roots;
|
|
1766
|
+
}
|
|
1767
|
+
getBrowserDebugSnapshot() {
|
|
1768
|
+
return this.driver.getDebugSnapshot();
|
|
1769
|
+
}
|
|
1770
|
+
buildHarvestDebugSnapshot(options, canonicalLocation, uule, locationEvidence) {
|
|
1771
|
+
if (!options.debug) return void 0;
|
|
1772
|
+
return {
|
|
1773
|
+
enabled: true,
|
|
1774
|
+
request: {
|
|
1775
|
+
query: options.query,
|
|
1776
|
+
locationInput: options.location ?? null,
|
|
1777
|
+
canonicalLocation,
|
|
1778
|
+
uule,
|
|
1779
|
+
gl: options.gl,
|
|
1780
|
+
hl: options.hl,
|
|
1781
|
+
device: options.device,
|
|
1782
|
+
proxyMode: options.proxyMode,
|
|
1783
|
+
proxyZip: options.proxyZip ?? null,
|
|
1784
|
+
serpOnly: options.serpOnly,
|
|
1785
|
+
pages: options.pages ?? 1
|
|
1786
|
+
},
|
|
1787
|
+
browser: this.getBrowserDebugSnapshot(),
|
|
1788
|
+
...locationEvidence ? { locationEvidence } : {}
|
|
1789
|
+
};
|
|
1790
|
+
}
|
|
1791
|
+
async extract(options, signal) {
|
|
1792
|
+
const startMs = Date.now();
|
|
1793
|
+
const isMobile = options.device === "mobile";
|
|
1794
|
+
const config = {
|
|
1795
|
+
headless: options.headless,
|
|
1796
|
+
profileDir: options.profileDir,
|
|
1797
|
+
proxy: options.proxy,
|
|
1798
|
+
kernelApiKey: options.kernelApiKey,
|
|
1799
|
+
kernelProxyId: options.kernelProxyId,
|
|
1800
|
+
kernelProxyResolution: options.kernelProxyResolution,
|
|
1801
|
+
proxyMode: options.proxyMode,
|
|
1802
|
+
viewport: isMobile ? { width: 390, height: 844 } : { width: 1280, height: 800 },
|
|
1803
|
+
locale: `${options.hl}-${options.gl.toUpperCase()}`,
|
|
1804
|
+
userAgent: isMobile ? MOBILE_USER_AGENT2 : DESKTOP_USER_AGENT2,
|
|
1805
|
+
deviceScaleFactor: isMobile ? 3 : 1,
|
|
1806
|
+
isMobile,
|
|
1807
|
+
hasTouch: isMobile,
|
|
1808
|
+
debug: options.debug
|
|
1809
|
+
};
|
|
1810
|
+
let errorCount = 0;
|
|
1811
|
+
const diagnosticWarnings = [];
|
|
1812
|
+
try {
|
|
1813
|
+
this.throwIfAborted(signal);
|
|
1814
|
+
await this.driver.launch(config);
|
|
1815
|
+
this.throwIfAborted(signal);
|
|
1816
|
+
const canonicalLocation = options.location ? normalizeLocation(options.location) : null;
|
|
1817
|
+
const uule = canonicalLocation ? encodeUule(canonicalLocation) : null;
|
|
1818
|
+
const { hasPaa } = await this.driver.navigateToSERP(
|
|
1819
|
+
options.query,
|
|
1820
|
+
uule,
|
|
1821
|
+
options.gl,
|
|
1822
|
+
options.hl,
|
|
1823
|
+
{
|
|
1824
|
+
...options.serpOnly ? { num: 100 } : {},
|
|
1825
|
+
debug: options.debug
|
|
1826
|
+
}
|
|
1827
|
+
);
|
|
1828
|
+
this.throwIfAborted(signal);
|
|
1829
|
+
const page = this.driver.getPage();
|
|
1830
|
+
await this.throwIfCaptcha(page, "Google SERP");
|
|
1831
|
+
if (options.serpOnly) {
|
|
1832
|
+
const [organicResults2, localPack2, rawEntityIds2] = await Promise.all([
|
|
1833
|
+
this.extractOrganicResults(page),
|
|
1834
|
+
this.extractLocalPack(page),
|
|
1835
|
+
this.extractEntityIds(page)
|
|
1836
|
+
]);
|
|
1837
|
+
const entityIds2 = this.mergeLocalPackIntoEntities(rawEntityIds2, localPack2);
|
|
1838
|
+
const aiSurfaces2 = await this.extractAISurfaces(page);
|
|
1839
|
+
let locationEvidence2 = options.debug ? inferSerpLocationEvidence(canonicalLocation, organicResults2, localPack2) : void 0;
|
|
1840
|
+
let allOrganic2 = organicResults2;
|
|
1841
|
+
if ((options.pages ?? 1) >= 2) {
|
|
1842
|
+
const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, pws: "0", start: "10" });
|
|
1843
|
+
if (uule) p2params.set("uule", uule);
|
|
1844
|
+
await this.driver.navigateTo("https://www.google.com/search?" + p2params.toString());
|
|
1845
|
+
await this.throwIfCaptcha(page, "Google SERP page 2");
|
|
1846
|
+
const p2organic = await this.extractOrganicResults(page);
|
|
1847
|
+
allOrganic2 = [...organicResults2, ...p2organic.map((r) => ({ ...r, position: r.position + 10 }))];
|
|
1848
|
+
if (options.debug) {
|
|
1849
|
+
locationEvidence2 = inferSerpLocationEvidence(canonicalLocation, allOrganic2, localPack2);
|
|
1850
|
+
}
|
|
1851
|
+
}
|
|
1852
|
+
const stats2 = {
|
|
1853
|
+
seed: options.query,
|
|
1854
|
+
totalQuestions: 0,
|
|
1855
|
+
maxDepthReached: 0,
|
|
1856
|
+
durationMs: Date.now() - startMs,
|
|
1857
|
+
errorCount
|
|
1858
|
+
};
|
|
1859
|
+
this.reporter.onComplete(stats2);
|
|
1860
|
+
return {
|
|
1861
|
+
seed: options.query,
|
|
1862
|
+
location: options.location ?? null,
|
|
1863
|
+
extractedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
1864
|
+
diagnostics: {
|
|
1865
|
+
completionStatus: "serp_only",
|
|
1866
|
+
problem: null,
|
|
1867
|
+
...options.debug ? { debug: this.buildHarvestDebugSnapshot(options, canonicalLocation, uule, locationEvidence2) } : {}
|
|
1868
|
+
},
|
|
1869
|
+
totalQuestions: 0,
|
|
1870
|
+
surface: aiSurfaces2.surface,
|
|
1871
|
+
aiOverview: aiSurfaces2.aiOverview,
|
|
1872
|
+
aiMode: aiSurfaces2.aiMode,
|
|
1873
|
+
whatPeopleSaying: [],
|
|
1874
|
+
tree: [],
|
|
1875
|
+
flat: [],
|
|
1876
|
+
videos: [],
|
|
1877
|
+
forums: [],
|
|
1878
|
+
organicResults: allOrganic2,
|
|
1879
|
+
localPack: localPack2,
|
|
1880
|
+
entityIds: entityIds2,
|
|
1881
|
+
stats: stats2
|
|
1882
|
+
};
|
|
1883
|
+
}
|
|
1884
|
+
const [videos, forums, whatPeopleSaying, rawEntityIds, organicResults, localPack] = await Promise.all([
|
|
1885
|
+
this.extractVideos(page),
|
|
1886
|
+
this.extractForums(page),
|
|
1887
|
+
this.extractWhatPeopleSaying(page),
|
|
1888
|
+
this.extractEntityIds(page),
|
|
1889
|
+
this.extractOrganicResults(page),
|
|
1890
|
+
this.extractLocalPack(page)
|
|
1891
|
+
]);
|
|
1892
|
+
const entityIds = this.mergeLocalPackIntoEntities(rawEntityIds, localPack);
|
|
1893
|
+
const initialLocationEvidence = options.debug ? inferSerpLocationEvidence(canonicalLocation, organicResults, localPack) : void 0;
|
|
1894
|
+
this.reporter.onVideos(videos);
|
|
1895
|
+
this.reporter.onForums(forums);
|
|
1896
|
+
if (!hasPaa) {
|
|
1897
|
+
let noPaaOrganic = organicResults;
|
|
1898
|
+
let locationEvidence2 = initialLocationEvidence;
|
|
1899
|
+
if ((options.pages ?? 1) >= 2) {
|
|
1900
|
+
const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, pws: "0", start: "10" });
|
|
1901
|
+
if (uule) p2params.set("uule", uule);
|
|
1902
|
+
await this.driver.navigateTo("https://www.google.com/search?" + p2params.toString());
|
|
1903
|
+
await this.throwIfCaptcha(page, "Google SERP page 2");
|
|
1904
|
+
const p2organic = await this.extractOrganicResults(page);
|
|
1905
|
+
noPaaOrganic = [...organicResults, ...p2organic.map((r) => ({ ...r, position: r.position + 10 }))];
|
|
1906
|
+
if (options.debug) {
|
|
1907
|
+
locationEvidence2 = inferSerpLocationEvidence(canonicalLocation, noPaaOrganic, localPack);
|
|
1908
|
+
}
|
|
1909
|
+
}
|
|
1910
|
+
const aiSurfaces2 = await this.extractAISurfaces(page);
|
|
1911
|
+
const stats2 = {
|
|
1912
|
+
seed: options.query,
|
|
1913
|
+
totalQuestions: 0,
|
|
1914
|
+
maxDepthReached: 0,
|
|
1915
|
+
durationMs: Date.now() - startMs,
|
|
1916
|
+
errorCount
|
|
1917
|
+
};
|
|
1918
|
+
this.reporter.onComplete(stats2);
|
|
1919
|
+
return {
|
|
1920
|
+
seed: options.query,
|
|
1921
|
+
location: options.location ?? null,
|
|
1922
|
+
extractedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
1923
|
+
diagnostics: {
|
|
1924
|
+
completionStatus: "no_paa",
|
|
1925
|
+
problem: null,
|
|
1926
|
+
...options.debug ? { debug: this.buildHarvestDebugSnapshot(options, canonicalLocation, uule, locationEvidence2) } : {}
|
|
1927
|
+
},
|
|
1928
|
+
totalQuestions: 0,
|
|
1929
|
+
surface: aiSurfaces2.surface,
|
|
1930
|
+
aiOverview: aiSurfaces2.aiOverview,
|
|
1931
|
+
aiMode: aiSurfaces2.aiMode,
|
|
1932
|
+
whatPeopleSaying,
|
|
1933
|
+
tree: [],
|
|
1934
|
+
flat: [],
|
|
1935
|
+
videos,
|
|
1936
|
+
forums,
|
|
1937
|
+
organicResults: noPaaOrganic,
|
|
1938
|
+
localPack,
|
|
1939
|
+
entityIds,
|
|
1940
|
+
stats: stats2
|
|
1941
|
+
};
|
|
1942
|
+
}
|
|
1943
|
+
const flat = await this.runBFS(page, options, signal);
|
|
1944
|
+
this.throwIfAborted(signal);
|
|
1945
|
+
const aiSurfaces = await this.extractAISurfaces(page);
|
|
1946
|
+
const shortVidsParams = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, pws: "0", udm: ShortVideoSelectors.udm });
|
|
1947
|
+
if (uule) shortVidsParams.set("uule", uule);
|
|
1948
|
+
let shortVideos = [];
|
|
1949
|
+
try {
|
|
1950
|
+
shortVideos = await this.extractShortVideos(page, "https://www.google.com/search?" + shortVidsParams.toString());
|
|
1951
|
+
} catch (err) {
|
|
1952
|
+
if (!(err instanceof CaptchaError)) throw err;
|
|
1953
|
+
errorCount++;
|
|
1954
|
+
diagnosticWarnings.push({
|
|
1955
|
+
code: "short_videos_captcha_skipped",
|
|
1956
|
+
surface: "short_videos",
|
|
1957
|
+
message: err.message,
|
|
1958
|
+
retryable: true
|
|
1959
|
+
});
|
|
1960
|
+
}
|
|
1961
|
+
this.reporter.onVideos(shortVideos);
|
|
1962
|
+
let allOrganic = organicResults;
|
|
1963
|
+
let locationEvidence = initialLocationEvidence;
|
|
1964
|
+
if ((options.pages ?? 1) >= 2) {
|
|
1965
|
+
const p2params = new URLSearchParams({ q: options.query, gl: options.gl, hl: options.hl, pws: "0", start: "10" });
|
|
1966
|
+
if (uule) p2params.set("uule", uule);
|
|
1967
|
+
await this.driver.navigateTo("https://www.google.com/search?" + p2params.toString());
|
|
1968
|
+
await this.throwIfCaptcha(page, "Google SERP page 2");
|
|
1969
|
+
const p2organic = await this.extractOrganicResults(page);
|
|
1970
|
+
allOrganic = [...organicResults, ...p2organic.map((r) => ({ ...r, position: r.position + 10 }))];
|
|
1971
|
+
if (options.debug) {
|
|
1972
|
+
locationEvidence = inferSerpLocationEvidence(canonicalLocation, allOrganic, localPack);
|
|
1973
|
+
}
|
|
1974
|
+
}
|
|
1975
|
+
const allVideos = [...videos, ...shortVideos];
|
|
1976
|
+
const tree = this.buildTree(flat, options.query);
|
|
1977
|
+
const stats = {
|
|
1978
|
+
seed: options.query,
|
|
1979
|
+
totalQuestions: flat.length,
|
|
1980
|
+
maxDepthReached: flat.reduce((m, r) => Math.max(m, r.depth), 0),
|
|
1981
|
+
durationMs: Date.now() - startMs,
|
|
1982
|
+
errorCount
|
|
1983
|
+
};
|
|
1984
|
+
this.reporter.onComplete(stats);
|
|
1985
|
+
return {
|
|
1986
|
+
seed: options.query,
|
|
1987
|
+
location: options.location ?? null,
|
|
1988
|
+
extractedAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
1989
|
+
diagnostics: {
|
|
1990
|
+
completionStatus: "paa_found",
|
|
1991
|
+
problem: null,
|
|
1992
|
+
...diagnosticWarnings.length > 0 ? { warnings: diagnosticWarnings } : {},
|
|
1993
|
+
...options.debug ? { debug: this.buildHarvestDebugSnapshot(options, canonicalLocation, uule, locationEvidence) } : {}
|
|
1994
|
+
},
|
|
1995
|
+
totalQuestions: flat.length,
|
|
1996
|
+
surface: aiSurfaces.surface,
|
|
1997
|
+
aiOverview: aiSurfaces.aiOverview,
|
|
1998
|
+
aiMode: aiSurfaces.aiMode,
|
|
1999
|
+
whatPeopleSaying,
|
|
2000
|
+
tree,
|
|
2001
|
+
flat,
|
|
2002
|
+
videos: allVideos,
|
|
2003
|
+
forums,
|
|
2004
|
+
organicResults: allOrganic,
|
|
2005
|
+
localPack,
|
|
2006
|
+
entityIds,
|
|
2007
|
+
stats
|
|
2008
|
+
};
|
|
2009
|
+
} catch (err) {
|
|
2010
|
+
errorCount++;
|
|
2011
|
+
this.reporter.onError(err instanceof Error ? err : new Error(String(err)));
|
|
2012
|
+
throw err;
|
|
2013
|
+
}
|
|
2014
|
+
}
|
|
2015
|
+
};
|
|
2016
|
+
|
|
2017
|
+
// src/output/OutputSerializer.ts
|
|
2018
|
+
import { promises as fs } from "fs";
|
|
2019
|
+
import path from "path";
|
|
2020
|
+
import Papa from "papaparse";
|
|
2021
|
+
/**
 * Writes harvest results to disk as JSON and CSV files.
 *
 * Every writer creates `outputDir` if needed, derives a filename from the seed
 * query plus a millisecond timestamp, and resolves to the full path written.
 */
var OutputSerializer = class {
  /**
   * Filename prefix for a seed query: lowercased, each run of non-word
   * characters collapsed to "-", truncated to 40 characters.
   * @param {string} seed
   * @returns {string}
   */
  static slugFor(seed) {
    return seed.toLowerCase().replace(/\W+/g, "-").slice(0, 40);
  }
  /**
   * Ensures `outputDir` exists and writes `contents` to `filename` inside it.
   * @returns {Promise<string>} full path of the written file
   */
  static async writeFileIn(outputDir, filename, contents) {
    await fs.mkdir(outputDir, { recursive: true });
    const fullPath = path.join(outputDir, filename);
    await fs.writeFile(fullPath, contents, "utf8");
    return fullPath;
  }
  /**
   * Serializes `rows` with a header line and writes `<slug><suffix>-<timestamp>.csv`.
   * @returns {Promise<string>} full path of the written file
   */
  static writeRows(rows, seed, suffix, outputDir) {
    const csv = Papa.unparse(rows, { header: true });
    const filename = `${OutputSerializer.slugFor(seed)}${suffix}-${Date.now()}.csv`;
    return OutputSerializer.writeFileIn(outputDir, filename, csv);
  }
  /**
   * Builds one CSV row per citation; the response text is attached to the first
   * row only. When there are no citations but there is response text, a single
   * row with empty citation fields is emitted so the text is not lost.
   * @param {Array<{text: string, href: string}>} citations
   * @param {string|null|undefined} text
   * @param {string} seed
   * @returns {Array<object>}
   */
  static citationRows(citations, text, seed) {
    if (citations.length === 0) {
      return text ? [{ seed_query: seed, response_text: text, citation_text: "", citation_href: "" }] : [];
    }
    return citations.map((c, i) => ({
      seed_query: seed,
      response_text: i === 0 ? text ?? "" : "",
      citation_text: c.text,
      citation_href: c.href
    }));
  }
  /** Writes the whole result object as pretty-printed JSON. */
  async writeJSON(result, outputDir) {
    const filename = `${OutputSerializer.slugFor(result.seed)}-${Date.now()}.json`;
    return OutputSerializer.writeFileIn(outputDir, filename, JSON.stringify(result, null, 2));
  }
  /** Writes the flat question rows; the slug comes from the first row's `seed_query` ("paa" when absent). */
  async writeCSV(rows, outputDir) {
    const seedRaw = rows[0]?.seed_query ?? "paa";
    return OutputSerializer.writeRows(rows, seedRaw, "", outputDir);
  }
  /** Writes video rows to `<slug>-videos-<timestamp>.csv`. */
  async writeVideoCSV(videos, seed, outputDir) {
    return OutputSerializer.writeRows(videos, seed, "-videos", outputDir);
  }
  /** Writes forum rows to `<slug>-forums-<timestamp>.csv`. */
  async writeForumCSV(forums, seed, outputDir) {
    return OutputSerializer.writeRows(forums, seed, "-forums", outputDir);
  }
  /** Writes AI overview citations (and response text) to `<slug>-ai-overview-<timestamp>.csv`. */
  async writeAIOverviewCSV(citations, text, seed, outputDir) {
    const rows = OutputSerializer.citationRows(citations, text, seed);
    return OutputSerializer.writeRows(rows, seed, "-ai-overview", outputDir);
  }
  /** Writes AI mode citations (and response text) to `<slug>-ai-mode-<timestamp>.csv`. */
  async writeAIModeCSV(citations, text, seed, outputDir) {
    const rows = OutputSerializer.citationRows(citations, text, seed);
    return OutputSerializer.writeRows(rows, seed, "-ai-mode", outputDir);
  }
  /** Writes "what people are saying" cards, each prefixed with the seed query. */
  async writeWhatPeopleSayingCSV(cards, seed, outputDir) {
    const rows = cards.map((c) => ({ seed_query: seed, ...c }));
    return OutputSerializer.writeRows(rows, seed, "-what-people-saying", outputDir);
  }
};
|
|
2099
|
+
|
|
2100
|
+
// src/output/ProgressReporter.ts
|
|
2101
|
+
/**
 * Reports harvest progress as newline-delimited JSON.
 * Progress events are written to stdout; errors are written to stderr.
 */
var ProgressReporter = class {
  /** Emits a "question" event with the node's depth and question text. */
  onQuestion(node) {
    const payload = { event: "question", depth: node.depth, question: node.question };
    process.stdout.write(`${JSON.stringify(payload)}\n`);
  }
  /** Emits a "depth" event. */
  onDepth(depth) {
    const payload = { event: "depth", depth };
    process.stdout.write(`${JSON.stringify(payload)}\n`);
  }
  /** Emits one "video" event per entry. */
  onVideos(videos) {
    for (const { type, platform, duration, title, channel, url } of videos) {
      const payload = { event: "video", type, platform, duration, title, channel, url };
      process.stdout.write(`${JSON.stringify(payload)}\n`);
    }
  }
  /** Emits one "forum" event per entry. */
  onForums(forums) {
    for (const { title, source, url } of forums) {
      const payload = { event: "forum", title, source, url };
      process.stdout.write(`${JSON.stringify(payload)}\n`);
    }
  }
  /** Emits a "complete" event carrying every field of `stats`. */
  onComplete(stats) {
    const payload = { event: "complete", ...stats };
    process.stdout.write(`${JSON.stringify(payload)}\n`);
  }
  /** Emits an "error" event (constructor name and message) on stderr. */
  onError(err) {
    const payload = { event: "error", type: err.constructor.name, message: err.message };
    process.stderr.write(`${JSON.stringify(payload)}\n`);
  }
};
|
|
2125
|
+
|
|
2126
|
+
// src/kernel-proxy-resolver.ts
|
|
2127
|
+
import Kernel2 from "@onkernel/sdk";
|
|
2128
|
+
// Lowercase US state name -> two-letter postal code (the 50 states).
var US_STATE_CODES = Object.fromEntries([
  ["alabama", "AL"], ["alaska", "AK"], ["arizona", "AZ"], ["arkansas", "AR"], ["california", "CA"],
  ["colorado", "CO"], ["connecticut", "CT"], ["delaware", "DE"], ["florida", "FL"], ["georgia", "GA"],
  ["hawaii", "HI"], ["idaho", "ID"], ["illinois", "IL"], ["indiana", "IN"], ["iowa", "IA"],
  ["kansas", "KS"], ["kentucky", "KY"], ["louisiana", "LA"], ["maine", "ME"], ["maryland", "MD"],
  ["massachusetts", "MA"], ["michigan", "MI"], ["minnesota", "MN"], ["mississippi", "MS"], ["missouri", "MO"],
  ["montana", "MT"], ["nebraska", "NE"], ["nevada", "NV"], ["new hampshire", "NH"], ["new jersey", "NJ"],
  ["new mexico", "NM"], ["new york", "NY"], ["north carolina", "NC"], ["north dakota", "ND"], ["ohio", "OH"],
  ["oklahoma", "OK"], ["oregon", "OR"], ["pennsylvania", "PA"], ["rhode island", "RI"], ["south carolina", "SC"],
  ["south dakota", "SD"], ["tennessee", "TN"], ["texas", "TX"], ["utah", "UT"], ["vermont", "VT"],
  ["virginia", "VA"], ["washington", "WA"], ["west virginia", "WV"], ["wisconsin", "WI"], ["wyoming", "WY"]
]);
|
|
2180
|
+
// "<city identifier>|<state code>" -> ZIP code used when requesting a ZIP-level proxy for that city.
var US_CITY_CENTER_ZIPS = Object.fromEntries([
  ["atlanta|GA", "30303"], ["austin|TX", "78701"], ["baltimore|MD", "21201"],
  ["boston|MA", "02108"], ["boulder|CO", "80302"], ["charlotte|NC", "28202"],
  ["chicago|IL", "60601"], ["colorado_springs|CO", "80903"], ["columbus|OH", "43215"],
  ["dallas|TX", "75201"], ["denver|CO", "80202"], ["detroit|MI", "48226"],
  ["fort_collins|CO", "80524"], ["fort_worth|TX", "76102"], ["houston|TX", "77002"],
  ["indianapolis|IN", "46204"], ["jacksonville|FL", "32202"], ["las_vegas|NV", "89101"],
  ["los_angeles|CA", "90012"], ["louisville|KY", "40202"], ["loveland|CO", "80537"],
  ["memphis|TN", "38103"], ["miami|FL", "33131"], ["minneapolis|MN", "55401"],
  ["nashville|TN", "37203"], ["new_york|NY", "10001"], ["orlando|FL", "32801"],
  ["philadelphia|PA", "19103"], ["phoenix|AZ", "85004"], ["portland|OR", "97205"],
  ["raleigh|NC", "27601"], ["richmond|VA", "23219"], ["sacramento|CA", "95814"],
  ["salt_lake_city|UT", "84101"], ["san_antonio|TX", "78205"], ["san_diego|CA", "92101"],
  ["san_francisco|CA", "94103"], ["san_jose|CA", "95113"], ["seattle|WA", "98101"]
]);
|
|
2221
|
+
/**
 * Returns the last six characters of a proxy id, or null when no id is given.
 * @param {string|undefined|null} proxyId
 * @returns {string|null}
 */
function proxyIdSuffix2(proxyId) {
  if (!proxyId) {
    return null;
  }
  return proxyId.slice(-6);
}
|
|
2224
|
+
/**
 * Packages a proxy-resolution outcome: the proxy id to use plus a diagnostic
 * record describing how it was chosen. Only the id's presence and its suffix
 * are included in the diagnostic record.
 * @param {string} source how the id was obtained
 * @param {string} proxyMode requested proxy mode
 * @param {string|undefined} proxyId resolved proxy id, if any
 * @param {object|null} target location target the resolution was for
 * @param {string|null} error accumulated error text, if any
 */
function resolution(source, proxyMode, proxyId, target, error) {
  const details = {
    source,
    proxyMode,
    proxyIdPresent: Boolean(proxyId),
    proxyIdSuffix: proxyIdSuffix2(proxyId),
    target,
    error
  };
  return { kernelProxyId: proxyId, resolution: details };
}
|
|
2237
|
+
/**
 * Canonical form of a state name for table lookup: trimmed, lowercased,
 * with each run of whitespace collapsed to a single space.
 * @param {string} value
 * @returns {string}
 */
function normalizeStateName(value) {
  const lowered = value.trim().toLowerCase();
  return lowered.replace(/\s+/g, " ");
}
|
|
2240
|
+
/**
 * Canonical form of a country name: trimmed, lowercased, periods removed,
 * with each run of whitespace collapsed to a single space.
 * @param {string} value
 * @returns {string}
 */
function normalizeCountryName(value) {
  const lowered = value.trim().toLowerCase();
  const withoutDots = lowered.replace(/\./g, "");
  return withoutDots.replace(/\s+/g, " ");
}
|
|
2243
|
+
/**
 * Tells whether a country string refers to the United States.
 * A missing or empty country is treated as the United States.
 * @param {string|undefined|null} country
 * @returns {boolean}
 */
function isUnitedStates(country) {
  if (!country) return true;
  switch (normalizeCountryName(country)) {
    case "united states":
    case "united states of america":
    case "usa":
    case "us":
      return true;
    default:
      return false;
  }
}
|
|
2248
|
+
/**
 * Resolves a region string to a two-letter state code.
 * Any two-letter alphabetic input is uppercased and returned as-is, without
 * checking it against the state table; anything else is looked up by
 * normalized state name.
 * @param {string} region
 * @returns {string|null} the code, or null when the name is not in the table
 */
function stateCodeFor(region) {
  const candidate = region.trim();
  const looksLikeCode = /^[A-Za-z]{2}$/.test(candidate);
  if (looksLikeCode) {
    return candidate.toUpperCase();
  }
  const code = US_STATE_CODES[normalizeStateName(candidate)];
  return code ?? null;
}
|
|
2253
|
+
/**
 * Produces candidate city identifiers from a display name.
 * The name is NFKD-normalized, non-ASCII characters are dropped, and the rest
 * is lowercased and split into alphanumeric words. Two spellings are offered:
 * words joined by "_" and words joined with nothing. Empty and duplicate
 * spellings are omitted.
 * @param {string} city
 * @returns {string[]}
 */
function kernelCityIdentifierCandidates(city) {
  const tokens = city
    .normalize("NFKD")
    .replace(/[^\x00-\x7F]/g, "")
    .toLowerCase()
    .split(/[^a-z0-9]+/)
    .filter((token) => token.length > 0);
  const candidates = [];
  for (const separator of ["_", ""]) {
    const identifier = tokens.join(separator);
    if (identifier && !candidates.includes(identifier)) {
      candidates.push(identifier);
    }
  }
  return candidates;
}
|
|
2260
|
+
/**
 * Builds the proxy name for a country/state pair, with an optional city suffix.
 * Country and state are lowercased; the city is appended verbatim.
 * @param {string} country
 * @param {string} state
 * @param {string} [city]
 * @returns {string}
 */
function proxyName(country, state, city) {
  const base = `mcp-serp-residential-${country.toLowerCase()}-${state.toLowerCase()}`;
  if (city) {
    return `${base}-${city}`;
  }
  return base;
}
|
|
2263
|
+
/**
 * Builds the proxy name for a US ZIP code.
 * @param {string} zip
 * @returns {string}
 */
function zipProxyName(zip) {
  const name = `mcp-serp-residential-us-zip-${zip}`;
  return name;
}
|
|
2266
|
+
/**
 * Converts a free-form location into a US proxy target.
 *
 * Returns null unless `gl` is "us" (case-insensitive) and the location
 * normalizes to either a single state ("state" level) or a "city, state"
 * pair ("city" level). A trailing United States component is ignored.
 * @param {string|undefined} location
 * @param {string} gl
 * @returns {object|null} target with `level`, `country`, `state`, `city`,
 *   `cityCandidates`, `proxyName` and `config`, or null when unresolvable
 */
function parseKernelLocationProxyTarget(location, gl) {
  if (!location) return null;
  if (gl.toLowerCase() !== "us") return null;
  const canonicalLocation = normalizeLocation(location);
  const segments = canonicalLocation
    .split(",")
    .map((segment) => segment.trim())
    .filter((segment) => Boolean(segment));
  // Drop a trailing country component such as "United States".
  const hasCountrySuffix = segments.length > 1 && isUnitedStates(segments[segments.length - 1]);
  const parts = hasCountrySuffix ? segments.slice(0, -1) : segments;
  if (parts.length === 1) {
    const stateOnly = stateCodeFor(parts[0]);
    if (!stateOnly) return null;
    return {
      canonicalLocation,
      level: "state",
      country: "US",
      state: stateOnly,
      city: "",
      cityCandidates: [],
      proxyName: proxyName("US", stateOnly),
      config: { country: "US", state: stateOnly }
    };
  }
  const city = parts[0] ?? "";
  const region = parts[1] ?? "";
  if (!city || !region) return null;
  const state = stateCodeFor(region);
  if (!state) return null;
  const cityCandidates = kernelCityIdentifierCandidates(city);
  const primaryCity = cityCandidates[0];
  if (!primaryCity) return null;
  return {
    canonicalLocation,
    level: "city",
    country: "US",
    state,
    city: primaryCity,
    cityCandidates,
    proxyName: proxyName("US", state, primaryCity),
    config: { country: "US", state, city: primaryCity }
  };
}
|
|
2312
|
+
/**
 * Builds the "<city>|<state>" key used to look a target up in the city ZIP table.
 * @param {{city: string, state: string}} target
 * @returns {string}
 */
function cityZipKey(target) {
  const { city, state } = target;
  return `${city}|${state}`;
}
|
|
2315
|
+
/**
 * Picks the ZIP code to use for a target.
 * A caller-supplied five-digit ZIP wins; otherwise the city ZIP table is
 * consulted.
 * @param {object} target
 * @param {string|undefined} explicitZip
 * @returns {string|null} the ZIP, or null when none is known
 */
function knownZipFor(target, explicitZip) {
  const explicitIsValid = explicitZip ? /^\d{5}$/.test(explicitZip) : false;
  if (explicitIsValid) {
    return explicitZip;
  }
  const fromTable = US_CITY_CENTER_ZIPS[cityZipKey(target)];
  return fromTable ?? null;
}
|
|
2319
|
+
/**
 * Derives a ZIP-level target from an existing target: same fields, with
 * `level`, `zip`, `proxyName` and `config` replaced for the given ZIP.
 * @param {object} target
 * @param {string} zip
 * @returns {object}
 */
function zipTarget(target, zip) {
  const zipConfig = {
    country: target.country,
    state: target.state,
    zip
  };
  return {
    ...target,
    level: "zip",
    zip,
    proxyName: zipProxyName(zip),
    config: zipConfig
  };
}
|
|
2332
|
+
/**
 * Checks whether a proxy's config describes the same location as `target`.
 *
 * The country must always match (config side compared case-insensitively).
 * ZIP targets additionally require the same ZIP. Other targets require the
 * same state and either the given `city`, or no city at all when `city` is
 * omitted.
 * @param {object|undefined|null} config proxy config
 * @param {object} target
 * @param {string} [city]
 * @returns {boolean}
 */
function configMatches(config, target, city) {
  const sameCountry = config?.country?.toUpperCase() === target.country;
  if (!sameCountry) return false;
  if (target.level === "zip") {
    return config?.zip === target.zip;
  }
  if (config?.state?.toUpperCase() !== target.state) return false;
  if (city) {
    return config?.city === city;
  }
  return !config?.city;
}
|
|
2338
|
+
/**
 * Finds the first reusable proxy for `target`: residential, not
 * "unavailable", has an id, and either carries the target's proxy name or has
 * a matching config. The city is only part of the config comparison for
 * city-level targets.
 * @param {Array<object>} proxies
 * @param {object} target
 * @returns {object|null}
 */
function findExistingTargetProxy(proxies, target) {
  const cityForMatch = target.level === "city" ? target.city : void 0;
  const match = proxies.find((proxy) => {
    if (proxy.type !== "residential") return false;
    if (proxy.status === "unavailable") return false;
    if (!proxy.id) return false;
    if (proxy.name === target.proxyName) return true;
    return configMatches(proxy.config, target, cityForMatch);
  });
  return match ?? null;
}
|
|
2341
|
+
/**
 * Finds a reusable city-level proxy by trying each city candidate in order.
 * For each candidate, the first proxy that is residential, not "unavailable",
 * has an id, and matches by name or by config is returned.
 * @param {Array<object>} proxies
 * @param {object} target target with `cityCandidates`
 * @returns {object|null} null when no candidate matches any proxy
 */
function findExistingProxy(proxies, target) {
  for (const city of target.cityCandidates) {
    const expectedName = proxyName(target.country, target.state, city);
    for (const proxy of proxies) {
      const usable = proxy.type === "residential" && proxy.status !== "unavailable" && Boolean(proxy.id);
      if (!usable) continue;
      if (proxy.name === expectedName || configMatches(proxy.config, target, city)) {
        return proxy;
      }
    }
  }
  return null;
}
|
|
2349
|
+
/**
 * Derives a state-level target from an existing target: same fields, with
 * `level`, `proxyName` and `config` replaced by their state-only forms.
 * @param {object} target
 * @returns {object}
 */
function stateTarget(target) {
  const stateConfig = {
    country: target.country,
    state: target.state
  };
  const stateProxyName = proxyName(target.country, target.state);
  return {
    ...target,
    level: "state",
    proxyName: stateProxyName,
    config: stateConfig
  };
}
|
|
2360
|
+
/**
 * Finds the first reusable state-level proxy: residential, not "unavailable",
 * has an id, and either carries the state proxy name or has a matching
 * config with no city.
 * @param {Array<object>} proxies
 * @param {object} target
 * @returns {object|null}
 */
function findExistingStateProxy(proxies, target) {
  const expectedName = proxyName(target.country, target.state);
  const match = proxies.find((proxy) => {
    if (proxy.type !== "residential") return false;
    if (proxy.status === "unavailable") return false;
    if (!proxy.id) return false;
    if (proxy.name === expectedName) return true;
    return configMatches(proxy.config, target);
  });
  return match ?? null;
}
|
|
2364
|
+
/**
 * Returns the target to use on a retry attempt.
 * Every retry currently widens to the state level; `attemptIndex` is accepted
 * but not consulted.
 * @param {object} target
 * @param {number} attemptIndex
 * @returns {object}
 */
function escalatedTargetLevel(target, attemptIndex) {
  const widened = stateTarget(target);
  return widened;
}
|
|
2367
|
+
/**
 * Extracts a printable message from a thrown value: the `message` of an
 * Error, or the string form of anything else.
 * @param {unknown} err
 * @returns {string}
 */
function errorText2(err) {
  if (err instanceof Error) {
    return err.message;
  }
  return String(err);
}
|
|
2370
|
+
/**
 * Resolves which Kernel proxy id a harvest attempt should use.
 *
 * - proxyMode "none": no proxy.
 * - proxyMode "configured": the configured proxy id.
 * - otherwise: derive a US location target and reuse or create a residential
 *   proxy, trying ZIP, then city candidates, then state. Retry attempts
 *   (`attemptIndex >= 1`) go straight to creating a state-level proxy.
 *
 * Whenever no location proxy can be obtained, the configured proxy id is
 * returned with the accumulated error text. This function does not reject for
 * Kernel API failures; they are reported through `resolution.error`.
 *
 * @param {object} options
 * @param {string} options.proxyMode
 * @param {string} [options.kernelApiKey]
 * @param {string} [options.configuredKernelProxyId]
 * @param {string} [options.location]
 * @param {string} [options.proxyZip]
 * @param {string} options.gl
 * @param {number} [options.attemptIndex]
 * @returns {Promise<{kernelProxyId: string|undefined, resolution: object}>}
 */
async function resolveKernelProxyId(options) {
  if (options.proxyMode === "none") {
    return resolution("disabled", options.proxyMode, void 0, null, null);
  }
  if (options.proxyMode === "configured") {
    return resolution("configured_fallback", options.proxyMode, options.configuredKernelProxyId, null, null);
  }
  const target = parseKernelLocationProxyTarget(options.location, options.gl);
  if (!target || !options.kernelApiKey) {
    return resolution("configured_fallback", options.proxyMode, options.configuredKernelProxyId, target, target ? null : "location could not be normalized to a US city/state proxy target");
  }
  const kernel = new Kernel2({ apiKey: options.kernelApiKey });
  const createErrors = [];
  // Joined error text, or null when nothing failed, so successful results
  // never report an empty-string error.
  const errorSummary = () => createErrors.join(" | ") || null;
  // Attempts to create one residential proxy. Resolves to its id, or to null
  // after recording the failure under `label`.
  const tryCreate = async (name, config, label) => {
    try {
      const created = await kernel.proxies.create({ type: "residential", name, config });
      if (created.id) return created.id;
      createErrors.push(`${label}: Kernel did not return a proxy id`);
    } catch (err) {
      createErrors.push(`${label}: ${errorText2(err)}`);
    }
    return null;
  };
  try {
    const attemptIndex = options.attemptIndex ?? 0;
    if (attemptIndex >= 1) {
      // NOTE(review): retries always create a new proxy and never look for an
      // existing state-level one; confirm this is intentional.
      const escalatedTarget = escalatedTargetLevel(target, attemptIndex);
      const escalatedId = await tryCreate(escalatedTarget.proxyName, escalatedTarget.config, escalatedTarget.state);
      if (escalatedId) {
        return resolution("location_created", options.proxyMode, escalatedId, escalatedTarget, null);
      }
      return resolution("configured_fallback", options.proxyMode, options.configuredKernelProxyId, escalatedTarget, errorSummary());
    }
    const proxies = await kernel.proxies.list();
    const zip = knownZipFor(target, options.proxyZip);
    if (zip) {
      const targetZip = zipTarget(target, zip);
      const existingZip = findExistingTargetProxy(proxies, targetZip);
      if (existingZip?.id) {
        return resolution("location_reused", options.proxyMode, existingZip.id, targetZip, null);
      }
      // The create request carries only country and zip (no state), as before.
      const zipId = await tryCreate(targetZip.proxyName, { country: targetZip.country, zip }, zip);
      if (zipId) {
        return resolution("location_created", options.proxyMode, zipId, targetZip, null);
      }
    }
    const existing = findExistingProxy(proxies, target);
    if (existing?.id) {
      return resolution("location_reused", options.proxyMode, existing.id, target, errorSummary());
    }
    for (const city of target.cityCandidates) {
      const cityProxyName = proxyName(target.country, target.state, city);
      const cityId = await tryCreate(cityProxyName, { country: target.country, state: target.state, city }, city);
      if (cityId) {
        return resolution("location_created", options.proxyMode, cityId, {
          ...target,
          level: "city",
          city,
          proxyName: cityProxyName,
          config: { country: target.country, state: target.state, city }
        }, null);
      }
    }
    const fallbackTarget = stateTarget(target);
    const existingState = findExistingStateProxy(proxies, fallbackTarget);
    if (existingState?.id) {
      return resolution("location_reused", options.proxyMode, existingState.id, fallbackTarget, errorSummary());
    }
    const stateId = await tryCreate(fallbackTarget.proxyName, fallbackTarget.config, fallbackTarget.state);
    if (stateId) {
      return resolution("location_created", options.proxyMode, stateId, fallbackTarget, errorSummary());
    }
    return resolution("configured_fallback", options.proxyMode, options.configuredKernelProxyId, target, errorSummary());
  } catch (err) {
    // Covers failures outside the individual create attempts, e.g. listing proxies.
    return resolution("configured_fallback", options.proxyMode, options.configuredKernelProxyId, target, errorText2(err));
  }
}
|
|
2484
|
+
|
|
2485
|
+
// src/harvest.ts
|
|
2486
|
+
// Upper bound on the number of attempts the harvest retry loop makes per request.
var MAX_ATTEMPTS = 3;
|
|
2487
|
+
/**
 * Chooses the error to surface for an aborted signal: the signal's own reason
 * when it is a "TimeoutError" DOMException, otherwise a new RequestAbortedError.
 * @param {AbortSignal} signal
 * @returns {Error}
 */
function abortReason(signal) {
  const { reason } = signal;
  const isTimeout = reason instanceof DOMException && reason.name === "TimeoutError";
  if (isTimeout) {
    return reason;
  }
  return new RequestAbortedError();
}
|
|
2491
|
+
/**
 * Reads an AbortSignal from the raw options, if one was supplied.
 * @param {unknown} rawOptions
 * @returns {AbortSignal|undefined} undefined when the options are not an
 *   object or `signal` is not an AbortSignal
 */
function getAbortSignal(rawOptions) {
  const isObject = typeof rawOptions === "object" && rawOptions !== null;
  if (!isObject) return void 0;
  const candidate = rawOptions.signal;
  return candidate instanceof AbortSignal ? candidate : void 0;
}
|
|
2497
|
+
/**
 * Reads the `onAttemptEvent` callback from the raw options, if one was supplied.
 * @param {unknown} rawOptions
 * @returns {Function|undefined} undefined when the options are not an object
 *   or `onAttemptEvent` is not a function
 */
function getAttemptLogSink(rawOptions) {
  const isObject = typeof rawOptions === "object" && rawOptions !== null;
  if (!isObject) return void 0;
  const candidate = rawOptions.onAttemptEvent;
  if (typeof candidate === "function") {
    return candidate;
  }
  return void 0;
}
|
|
2502
|
+
/**
 * Delivers an attempt event to the optional sink.
 * Sink failures, synchronous or asynchronous, are logged as a JSON warning and
 * never propagate to the caller.
 * @param {Function|undefined} sink
 * @param {object} event
 * @returns {Promise<void>}
 */
async function emitAttemptEvent(sink, event) {
  if (sink) {
    try {
      await sink(event);
    } catch (err) {
      const warning = {
        event: "harvest_attempt_log_failed",
        attempt_number: event.attemptNumber,
        message: err instanceof Error ? err.message : String(err)
      };
      console.warn(JSON.stringify(warning));
    }
  }
}
|
|
2514
|
+
/**
 * Maps a failed attempt's error to an outcome label:
 * "captcha", "request_aborted", "timeout" or "error".
 * DOMExceptions named TimeoutError or AbortError count as timeouts, as does
 * any error whose message mentions a timeout or deadline.
 * @param {unknown} err
 * @returns {string}
 */
function classifyAttemptError(err) {
  if (err instanceof CaptchaError) return "captcha";
  if (err instanceof RequestAbortedError) return "request_aborted";
  if (err instanceof DOMException) {
    if (err.name === "TimeoutError" || err.name === "AbortError") return "timeout";
  }
  const text = err instanceof Error ? err.message : String(err);
  const mentionsTimeout = /timeout|timed out|Timeout \d+ms exceeded|deadline/i.test(text);
  return mentionsTimeout ? "timeout" : "error";
}
|
|
2521
|
+
/**
 * Maps a successful attempt's result to an outcome label: the completion
 * status reported in its diagnostics when present, otherwise "paa_found" or
 * "no_paa" depending on whether any questions were collected.
 * @param {object} result
 * @returns {string}
 */
function classifyAttemptResult(result) {
  const reported = result.diagnostics?.completionStatus;
  if (reported !== undefined && reported !== null) {
    return reported;
  }
  return result.totalQuestions > 0 ? "paa_found" : "no_paa";
}
|
|
2524
|
+
/**
 * Extracts a printable message from a thrown value: the `message` of an
 * Error, or the string form of anything else.
 * @param {unknown} err
 * @returns {string}
 */
function errorMessage(err) {
  if (err instanceof Error) {
    return err.message;
  }
  return String(err);
}
|
|
2527
|
+
/**
 * Runs a single extraction attempt with a fresh browser driver and always
 * closes the driver afterwards.
 *
 * The extraction is raced against the abort signal, when one is given, so an
 * abort ends the attempt promptly. Errors are captured and returned rather
 * than thrown.
 *
 * @param {object} options parsed harvest options for this attempt
 * @param {AbortSignal} [signal]
 * @returns {Promise<{result: object|null, error: unknown, cleanup: object, debug: object|null}>}
 *   `result` on success or `error` on failure, plus the value returned by
 *   `driver.close()` and, when debugging is enabled, a debug snapshot.
 */
async function extractOnce(options, signal) {
  const driver = new BrowserDriver();
  const reporter = new ProgressReporter();
  const extractor = new PAAExtractor(driver, reporter);
  if (signal?.aborted) {
    return {
      result: null,
      error: abortReason(signal),
      cleanup: await driver.close(),
      debug: null
    };
  }
  let onAbort;
  const abortPromise = signal ? new Promise((_, reject) => {
    onAbort = () => reject(abortReason(signal));
    signal.addEventListener("abort", onAbort, { once: true });
  }) : null;
  let result = null;
  let error = null;
  let cleanup;
  let debug = null;
  try {
    const extraction = extractor.extract(options, signal);
    // If the abort wins the race, the extraction may still reject later;
    // attach a no-op handler so that rejection is not reported as unhandled.
    if (abortPromise) extraction.catch(() => {
    });
    result = await (abortPromise ? Promise.race([extraction, abortPromise]) : extraction);
  } catch (err) {
    error = err;
  } finally {
    if (signal && onAbort) signal.removeEventListener("abort", onAbort);
    try {
      // `diagnostics` is read defensively so a result without it cannot throw
      // here and skip the driver cleanup below.
      debug = result?.diagnostics?.debug ?? (options.debug ? {
        enabled: true,
        request: {
          query: options.query,
          locationInput: options.location ?? null,
          canonicalLocation: null,
          uule: null,
          gl: options.gl,
          hl: options.hl,
          device: options.device,
          proxyMode: options.proxyMode,
          proxyZip: options.proxyZip ?? null,
          serpOnly: options.serpOnly,
          pages: options.pages ?? 1
        },
        browser: driver.getDebugSnapshot()
      } : null);
    } finally {
      // Always release the browser session, even if building the debug
      // snapshot failed.
      cleanup = await driver.close();
    }
  }
  return error ? { result: null, error, cleanup, debug } : { result, error: null, cleanup, debug };
}
|
|
2578
|
+
async function harvest(rawOptions) {
|
|
2579
|
+
const raw = typeof rawOptions === "object" && rawOptions !== null ? rawOptions : {};
|
|
2580
|
+
const signal = getAbortSignal(rawOptions);
|
|
2581
|
+
const onAttemptEvent = getAttemptLogSink(rawOptions);
|
|
2582
|
+
const requestedProxyMode = raw.proxyMode;
|
|
2583
|
+
const proxyMode = requestedProxyMode === "none" ? "none" : requestedProxyMode === "configured" ? "configured" : "location";
|
|
2584
|
+
const kernelApiKey = typeof raw.kernelApiKey === "string" ? raw.kernelApiKey.trim() : process.env.KERNEL_API_KEY?.trim();
|
|
2585
|
+
const configuredKernelProxyId = typeof raw.kernelProxyId === "string" ? raw.kernelProxyId.trim() : process.env.KERNEL_PROXY_ID?.trim();
|
|
2586
|
+
const proxyOpts = {
|
|
2587
|
+
kernelApiKey,
|
|
2588
|
+
proxyMode,
|
|
2589
|
+
configuredKernelProxyId,
|
|
2590
|
+
location: typeof raw.location === "string" ? raw.location : void 0,
|
|
2591
|
+
proxyZip: typeof raw.proxyZip === "string" ? raw.proxyZip : void 0,
|
|
2592
|
+
gl: typeof raw.gl === "string" ? raw.gl : "us"
|
|
2593
|
+
};
|
|
2594
|
+
const serializer = new OutputSerializer();
|
|
2595
|
+
for (let i = 0; i < MAX_ATTEMPTS; i++) {
|
|
2596
|
+
const attemptNumber = i + 1;
|
|
2597
|
+
const startedAtMs = Date.now();
|
|
2598
|
+
try {
|
|
2599
|
+
if (signal?.aborted) throw abortReason(signal);
|
|
2600
|
+
const resolution2 = await resolveKernelProxyId({ ...proxyOpts, attemptIndex: i });
|
|
2601
|
+
const mergedAttempt = {
|
|
2602
|
+
...raw,
|
|
2603
|
+
kernelApiKey,
|
|
2604
|
+
kernelProxyId: resolution2.kernelProxyId,
|
|
2605
|
+
kernelProxyResolution: resolution2.resolution,
|
|
2606
|
+
proxyMode
|
|
2607
|
+
};
|
|
2608
|
+
if (proxyMode === "none") mergedAttempt.kernelProxyId = void 0;
|
|
2609
|
+
const attemptOptions = HarvestOptionsSchema.parse(mergedAttempt);
|
|
2610
|
+
await emitAttemptEvent(onAttemptEvent, {
|
|
2611
|
+
type: "started",
|
|
2612
|
+
attemptNumber,
|
|
2613
|
+
maxAttempts: MAX_ATTEMPTS,
|
|
2614
|
+
query: attemptOptions.query,
|
|
2615
|
+
location: attemptOptions.location ?? null,
|
|
2616
|
+
maxQuestions: attemptOptions.maxQuestions,
|
|
2617
|
+
startedAt: new Date(startedAtMs).toISOString()
|
|
2618
|
+
});
|
|
2619
|
+
console.info(JSON.stringify({
|
|
2620
|
+
event: "harvest_attempt_started",
|
|
2621
|
+
attempt_number: attemptNumber,
|
|
2622
|
+
max_attempts: MAX_ATTEMPTS,
|
|
2623
|
+
query: attemptOptions.query,
|
|
2624
|
+
location: attemptOptions.location ?? null,
|
|
2625
|
+
max_questions: attemptOptions.maxQuestions
|
|
2626
|
+
}));
|
|
2627
|
+
const attempt = await extractOnce(attemptOptions, signal);
|
|
2628
|
+
if (attempt.error) {
|
|
2629
|
+
const err = attempt.error;
|
|
2630
|
+
if (err instanceof CaptchaError) {
|
|
2631
|
+
const willRetry = i < MAX_ATTEMPTS - 1;
|
|
2632
|
+
console.warn(JSON.stringify({
|
|
2633
|
+
event: "harvest_attempt_captcha",
|
|
2634
|
+
attempt_number: attemptNumber,
|
|
2635
|
+
max_attempts: MAX_ATTEMPTS,
|
|
2636
|
+
message: err.message,
|
|
2637
|
+
will_retry: willRetry
|
|
2638
|
+
}));
|
|
2639
|
+
await emitAttemptEvent(onAttemptEvent, {
|
|
2640
|
+
type: "finished",
|
|
2641
|
+
attemptNumber,
|
|
2642
|
+
maxAttempts: MAX_ATTEMPTS,
|
|
2643
|
+
outcome: "captcha",
|
|
2644
|
+
kernelSessionId: attempt.cleanup.kernelSessionId,
|
|
2645
|
+
questionCount: 0,
|
|
2646
|
+
durationMs: Date.now() - startedAtMs,
|
|
2647
|
+
error: err.message,
|
|
2648
|
+
willRetry,
|
|
2649
|
+
cleanup: attempt.cleanup,
|
|
2650
|
+
debug: attempt.debug,
|
|
2651
|
+
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
2652
|
+
});
|
|
2653
|
+
if (willRetry) continue;
|
|
2654
|
+
break;
|
|
2655
|
+
}
|
|
2656
|
+
await emitAttemptEvent(onAttemptEvent, {
|
|
2657
|
+
type: "finished",
|
|
2658
|
+
attemptNumber,
|
|
2659
|
+
maxAttempts: MAX_ATTEMPTS,
|
|
2660
|
+
outcome: classifyAttemptError(err),
|
|
2661
|
+
kernelSessionId: attempt.cleanup.kernelSessionId,
|
|
2662
|
+
questionCount: 0,
|
|
2663
|
+
durationMs: Date.now() - startedAtMs,
|
|
2664
|
+
error: errorMessage(err),
|
|
2665
|
+
willRetry: false,
|
|
2666
|
+
cleanup: attempt.cleanup,
|
|
2667
|
+
debug: attempt.debug,
|
|
2668
|
+
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
2669
|
+
});
|
|
2670
|
+
throw err;
|
|
2671
|
+
}
|
|
2672
|
+
const result = attempt.result;
|
|
2673
|
+
if (!result) throw new Error("Harvest attempt completed without a result");
|
|
2674
|
+
await emitAttemptEvent(onAttemptEvent, {
|
|
2675
|
+
type: "finished",
|
|
2676
|
+
attemptNumber,
|
|
2677
|
+
maxAttempts: MAX_ATTEMPTS,
|
|
2678
|
+
outcome: classifyAttemptResult(result),
|
|
2679
|
+
kernelSessionId: attempt.cleanup.kernelSessionId,
|
|
2680
|
+
questionCount: result.totalQuestions,
|
|
2681
|
+
durationMs: Date.now() - startedAtMs,
|
|
2682
|
+
error: null,
|
|
2683
|
+
willRetry: false,
|
|
2684
|
+
cleanup: attempt.cleanup,
|
|
2685
|
+
debug: attempt.debug,
|
|
2686
|
+
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
2687
|
+
});
|
|
2688
|
+
if (attemptOptions.format === "json" || attemptOptions.format === "both") {
|
|
2689
|
+
await serializer.writeJSON(result, attemptOptions.outputDir);
|
|
2690
|
+
}
|
|
2691
|
+
if (attemptOptions.format === "csv" || attemptOptions.format === "both") {
|
|
2692
|
+
await Promise.all([
|
|
2693
|
+
serializer.writeCSV(result.flat, attemptOptions.outputDir),
|
|
2694
|
+
result.videos.length > 0 ? serializer.writeVideoCSV(result.videos, result.seed, attemptOptions.outputDir) : Promise.resolve(""),
|
|
2695
|
+
result.forums.length > 0 ? serializer.writeForumCSV(result.forums, result.seed, attemptOptions.outputDir) : Promise.resolve(""),
|
|
2696
|
+
result.aiOverview.detected ? serializer.writeAIOverviewCSV(result.aiOverview.citations, result.aiOverview.text, result.seed, attemptOptions.outputDir) : Promise.resolve(""),
|
|
2697
|
+
result.aiMode.detected ? serializer.writeAIModeCSV(result.aiMode.citations, result.aiMode.text, result.seed, attemptOptions.outputDir) : Promise.resolve(""),
|
|
2698
|
+
result.whatPeopleSaying.length > 0 ? serializer.writeWhatPeopleSayingCSV(result.whatPeopleSaying, result.seed, attemptOptions.outputDir) : Promise.resolve("")
|
|
2699
|
+
]);
|
|
2700
|
+
}
|
|
2701
|
+
return result;
|
|
2702
|
+
} catch (err) {
|
|
2703
|
+
if (err instanceof CaptchaError) {
|
|
2704
|
+
const willRetry = i < MAX_ATTEMPTS - 1;
|
|
2705
|
+
console.warn(JSON.stringify({
|
|
2706
|
+
event: "harvest_attempt_captcha",
|
|
2707
|
+
attempt_number: attemptNumber,
|
|
2708
|
+
max_attempts: MAX_ATTEMPTS,
|
|
2709
|
+
message: err.message,
|
|
2710
|
+
will_retry: willRetry
|
|
2711
|
+
}));
|
|
2712
|
+
await emitAttemptEvent(onAttemptEvent, {
|
|
2713
|
+
type: "finished",
|
|
2714
|
+
attemptNumber,
|
|
2715
|
+
maxAttempts: MAX_ATTEMPTS,
|
|
2716
|
+
outcome: "captcha",
|
|
2717
|
+
kernelSessionId: null,
|
|
2718
|
+
questionCount: 0,
|
|
2719
|
+
durationMs: Date.now() - startedAtMs,
|
|
2720
|
+
error: err.message,
|
|
2721
|
+
willRetry,
|
|
2722
|
+
cleanup: {
|
|
2723
|
+
kernelSessionId: null,
|
|
2724
|
+
kernelDeleteStarted: false,
|
|
2725
|
+
kernelDeleteSucceeded: null,
|
|
2726
|
+
kernelDeleteError: null,
|
|
2727
|
+
browserCloseSucceeded: null,
|
|
2728
|
+
browserCloseError: null
|
|
2729
|
+
},
|
|
2730
|
+
debug: null,
|
|
2731
|
+
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
2732
|
+
});
|
|
2733
|
+
if (willRetry) continue;
|
|
2734
|
+
break;
|
|
2735
|
+
}
|
|
2736
|
+
await emitAttemptEvent(onAttemptEvent, {
|
|
2737
|
+
type: "finished",
|
|
2738
|
+
attemptNumber,
|
|
2739
|
+
maxAttempts: MAX_ATTEMPTS,
|
|
2740
|
+
outcome: classifyAttemptError(err),
|
|
2741
|
+
kernelSessionId: null,
|
|
2742
|
+
questionCount: 0,
|
|
2743
|
+
durationMs: Date.now() - startedAtMs,
|
|
2744
|
+
error: errorMessage(err),
|
|
2745
|
+
willRetry: false,
|
|
2746
|
+
cleanup: {
|
|
2747
|
+
kernelSessionId: null,
|
|
2748
|
+
kernelDeleteStarted: false,
|
|
2749
|
+
kernelDeleteSucceeded: null,
|
|
2750
|
+
kernelDeleteError: null,
|
|
2751
|
+
browserCloseSucceeded: null,
|
|
2752
|
+
browserCloseError: null
|
|
2753
|
+
},
|
|
2754
|
+
debug: null,
|
|
2755
|
+
completedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
2756
|
+
});
|
|
2757
|
+
throw err;
|
|
2758
|
+
}
|
|
2759
|
+
}
|
|
2760
|
+
console.warn(JSON.stringify({
|
|
2761
|
+
event: "harvest_captcha_exhausted",
|
|
2762
|
+
max_attempts: MAX_ATTEMPTS,
|
|
2763
|
+
session_kind: kernelApiKey ? "kernel" : "local"
|
|
2764
|
+
}));
|
|
2765
|
+
throw new CaptchaError(sanitizeVendorName(`CAPTCHA on all ${MAX_ATTEMPTS} fresh sessions. Try again in a few minutes.`));
|
|
2766
|
+
}
|
|
2767
|
+
|
|
2768
|
+
// Named exports of this bundled chunk. An `export { ... }` list can only name
// bindings that already exist in this module, so each identifier below is
// declared or imported earlier in the file (outside this excerpt).
export {
|
|
2769
|
+
MapsPlaceOptionsSchema,
|
|
2770
|
+
RawMapsOverviewSchema,
|
|
2771
|
+
RawMapsHoursRowSchema,
|
|
2772
|
+
RawMapsReviewStatsSchema,
|
|
2773
|
+
RawMapsAboutAttributeSchema,
|
|
2774
|
+
MapsSelectors,
|
|
2775
|
+
CaptchaError,
|
|
2776
|
+
RequestAbortedError,
|
|
2777
|
+
buildYouTubeChannelVideosUrl,
|
|
2778
|
+
BrowserDriver,
|
|
2779
|
+
harvest
|
|
2780
|
+
};
|
|
2781
|
+
//# sourceMappingURL=chunk-HERFK7W6.js.map
|