@humbletoes/google-search 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +7 -0
- package/README.md +339 -0
- package/bin/google-search +3 -0
- package/bin/google-search-mcp +3 -0
- package/bin/google-search-mcp.cmd +2 -0
- package/bin/google-search.cmd +2 -0
- package/dist/browser-config.d.ts +41 -0
- package/dist/browser-config.js +96 -0
- package/dist/browser-config.js.map +1 -0
- package/dist/browser-pool.d.ts +13 -0
- package/dist/browser-pool.js +37 -0
- package/dist/browser-pool.js.map +1 -0
- package/dist/cache.d.ts +48 -0
- package/dist/cache.js +111 -0
- package/dist/cache.js.map +1 -0
- package/dist/errors.d.ts +26 -0
- package/dist/errors.js +48 -0
- package/dist/errors.js.map +1 -0
- package/dist/filters.d.ts +48 -0
- package/dist/filters.js +192 -0
- package/dist/filters.js.map +1 -0
- package/dist/html-cleaner.d.ts +62 -0
- package/dist/html-cleaner.js +236 -0
- package/dist/html-cleaner.js.map +1 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +59 -0
- package/dist/index.js.map +1 -0
- package/dist/logger.d.ts +2 -0
- package/dist/logger.js +41 -0
- package/dist/logger.js.map +1 -0
- package/dist/mcp-server.d.ts +9 -0
- package/dist/mcp-server.js +822 -0
- package/dist/mcp-server.js.map +1 -0
- package/dist/search.d.ts +18 -0
- package/dist/search.js +1080 -0
- package/dist/search.js.map +1 -0
- package/dist/types.d.ts +67 -0
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -0
- package/dist/validation.d.ts +6 -0
- package/dist/validation.js +23 -0
- package/dist/validation.js.map +1 -0
- package/dist/web-fetcher.d.ts +10 -0
- package/dist/web-fetcher.js +179 -0
- package/dist/web-fetcher.js.map +1 -0
- package/package.json +67 -0
- package/scripts/setup.js +53 -0
package/dist/search.js
ADDED
|
@@ -0,0 +1,1080 @@
|
|
|
1
|
+
import { chromium, devices } from "playwright";
|
|
2
|
+
import * as fs from "fs";
|
|
3
|
+
import * as path from "path";
|
|
4
|
+
import * as os from "os";
|
|
5
|
+
import logger from "./logger.js";
|
|
6
|
+
import { SecureBrowserConfig, GOOGLE_DOMAINS, getRandomDelay, } from "./browser-config.js";
|
|
7
|
+
/**
 * Resolve the browser state file path and confine it to an allowed directory.
 *
 * When no path is supplied, the default
 * `<home>/.google-search-browser-state.json` is returned. Otherwise the path is
 * resolved against the current working directory and must end up strictly
 * inside the home directory, the OS temp directory, or the current working
 * directory. Relative inputs get the same containment check as absolute ones,
 * so `../` segments cannot be used to escape the allowed directories.
 *
 * @param userPath - User-provided path, or a falsy value to use the default
 * @returns Absolute, validated path
 * @throws {Error} If the resolved path lies outside every allowed directory
 */
function resolveStateFilePath(userPath) {
    const homeDir = os.homedir();
    // Default to the home directory if no path was provided
    if (!userPath) {
        return path.join(homeDir, ".google-search-browser-state.json");
    }
    const resolvedPath = path.resolve(userPath);
    // Containment is decided on the resolved path only. Whether the caller
    // wrote the path relatively or absolutely must not matter, otherwise
    // "../../somewhere" would bypass the check.
    const isInside = (rootDir) => {
        const relative = path.relative(path.resolve(rootDir), resolvedPath);
        return (relative !== "" && // the directory itself is not a valid file path
            relative !== ".." &&
            !relative.startsWith(`..${path.sep}`) &&
            !path.isAbsolute(relative)); // e.g. a different drive on Windows
    };
    const allowedRoots = [homeDir, os.tmpdir(), process.cwd()];
    if (!allowedRoots.some(isInside)) {
        throw new Error(`Invalid state file path: ${userPath}. Path must be within home directory, tmp directory, or relative to current working directory.`);
    }
    return resolvedPath;
}
|
|
34
|
+
/**
 * Build the default browser fingerprint settings.
 *
 * Nothing is read from the host except the local clock: the device is always
 * "Desktop Chrome", the timezone is always US Eastern, and the colour scheme
 * is "dark" between 19:00 and 06:59 local time and "light" otherwise.
 *
 * @param userLocale Locale to use; any falsy value falls back to "en-US"
 * @returns Object with deviceName, locale, timezoneId, colorScheme,
 *          reducedMotion and forcedColors
 */
function getHostMachineConfig(userLocale) {
    // Node.js has no API for the system colour scheme, so infer it from the
    // time of day: light from 07:00 up to (not including) 19:00, dark otherwise.
    const currentHour = new Date().getHours();
    const isDaytime = currentHour >= 7 && currentHour < 19;
    return {
        deviceName: "Desktop Chrome",
        // English by default so results come back in English
        locale: userLocale ? userLocale : "en-US",
        // Fixed US Eastern timezone to keep date formats English
        timezoneId: "America/New_York",
        colorScheme: isDaytime ? "light" : "dark",
        // Most users enable neither reduced motion nor forced colours
        reducedMotion: "no-preference",
        forcedColors: "none",
    };
}
|
|
61
|
+
/**
 * Common browser setup logic shared between search functions.
 *
 * Looks for a saved browser state file and, next to it, a saved fingerprint
 * file. The fingerprint file is only read when the state file exists; if it
 * cannot be read or parsed, a warning is logged and an empty saved state is
 * used instead.
 *
 * @param config Browser setup configuration; only `config.stateFile` is read here
 * @returns Object containing:
 *   - `storageState`: the state file path if that file exists, else `undefined`
 *   - `savedState`: parsed contents of the fingerprint file, or `{}`
 *   - `fingerprintFile`: path of the fingerprint file derived from the state file
 *   - `googleDomains`: the imported `GOOGLE_DOMAINS` value
 *   - `getRandomDeviceConfig`: function returning `[deviceName, deviceDescriptor]`
 *   - `getRandomDelay`: the imported delay helper
 */
function setupBrowserEnvironment(config) {
    let storageState = undefined;
    let savedState = {};
    // Derive the fingerprint path from the trailing extension only. A plain
    // String.replace(".json", ...) would rewrite the first ".json" anywhere in
    // the path (e.g. inside a directory name) and, for a state file without a
    // ".json" extension, would return the state file path itself.
    const jsonSuffix = ".json";
    const fingerprintFile = config.stateFile.endsWith(jsonSuffix)
        ? `${config.stateFile.slice(0, -jsonSuffix.length)}-fingerprint.json`
        : `${config.stateFile}-fingerprint.json`;
    if (fs.existsSync(config.stateFile)) {
        logger.info({ stateFile: config.stateFile }, "Found browser state file, will use saved browser state to avoid anti-bot detection");
        storageState = config.stateFile;
        // Try to load the saved fingerprint configuration
        if (fs.existsSync(fingerprintFile)) {
            try {
                const fingerprintData = fs.readFileSync(fingerprintFile, "utf8");
                savedState = JSON.parse(fingerprintData);
                logger.info("Loaded saved browser fingerprint configuration");
            }
            catch (e) {
                // Best effort: fall back to the empty saved state
                logger.warn({ error: e }, "Unable to load fingerprint configuration file, will create new fingerprint");
            }
        }
    }
    else {
        logger.info({ stateFile: config.stateFile }, "Browser state file not found, will create new browser session and fingerprint");
    }
    // Return the saved device if it is still a known Playwright device,
    // otherwise pick one of the desktop devices at random.
    const getRandomDeviceConfig = () => {
        const deviceList = [
            "Desktop Chrome",
            "Desktop Edge",
            "Desktop Firefox",
            "Desktop Safari",
        ];
        if (savedState.fingerprint?.deviceName &&
            devices[savedState.fingerprint.deviceName]) {
            // Use saved device configuration
            return [
                savedState.fingerprint.deviceName,
                devices[savedState.fingerprint.deviceName],
            ];
        }
        else {
            // Randomly select a device
            const randomDevice = deviceList[Math.floor(Math.random() * deviceList.length)];
            return [randomDevice, devices[randomDevice]];
        }
    };
    return {
        storageState,
        savedState,
        fingerprintFile,
        googleDomains: GOOGLE_DOMAINS,
        getRandomDeviceConfig,
        getRandomDelay,
    };
}
|
|
121
|
+
/**
|
|
122
|
+
* Execute Google search and return results
|
|
123
|
+
* @param query Search keywords
|
|
124
|
+
* @param options Search options
|
|
125
|
+
* @returns Search results
|
|
126
|
+
*/
|
|
127
|
+
export async function googleSearch(query, options = {}, existingBrowser) {
|
|
128
|
+
// Set default options with secure state file path
|
|
129
|
+
const { limit = 20, timeout = 60000, stateFile: userStateFile, noSaveState = false, locale = "en-US", // Default to English
|
|
130
|
+
} = options;
|
|
131
|
+
// Resolve and validate state file path
|
|
132
|
+
const stateFile = resolveStateFilePath(userStateFile);
|
|
133
|
+
// Ignore the passed headless parameter, always start in headless mode
|
|
134
|
+
const useHeadless = true;
|
|
135
|
+
logger.info({ options: { ...options, stateFile } }, "Initializing browser...");
|
|
136
|
+
// Setup browser environment
|
|
137
|
+
const { storageState, savedState, fingerprintFile, googleDomains, getRandomDeviceConfig, getRandomDelay, } = setupBrowserEnvironment({
|
|
138
|
+
stateFile,
|
|
139
|
+
timeout,
|
|
140
|
+
locale,
|
|
141
|
+
noSaveState,
|
|
142
|
+
});
|
|
143
|
+
// Define a function to perform search, can be reused for headless and headed modes
|
|
144
|
+
async function performSearch(headless) {
|
|
145
|
+
let browser;
|
|
146
|
+
let browserWasProvided = false;
|
|
147
|
+
if (existingBrowser) {
|
|
148
|
+
browser = existingBrowser;
|
|
149
|
+
browserWasProvided = true;
|
|
150
|
+
logger.info("Using existing browser instance");
|
|
151
|
+
}
|
|
152
|
+
else {
|
|
153
|
+
logger.info({ headless }, `Preparing to start browser in ${headless ? "headless" : "headed"} mode...`);
|
|
154
|
+
// Initialize browser with secure arguments from shared config
|
|
155
|
+
browser = await chromium.launch({
|
|
156
|
+
headless,
|
|
157
|
+
timeout: timeout * 2, // Increase browser startup timeout
|
|
158
|
+
args: SecureBrowserConfig.getDefaultSearchArgs(),
|
|
159
|
+
ignoreDefaultArgs: ["--enable-automation"],
|
|
160
|
+
});
|
|
161
|
+
logger.info("Browser started successfully!");
|
|
162
|
+
}
|
|
163
|
+
// Get device configuration - use saved or randomly generated
|
|
164
|
+
const [deviceName, deviceConfig] = getRandomDeviceConfig();
|
|
165
|
+
// Create browser context options
|
|
166
|
+
let contextOptions = {
|
|
167
|
+
...deviceConfig,
|
|
168
|
+
};
|
|
169
|
+
// If there is saved fingerprint configuration, use it; otherwise use the actual settings of the host machine
|
|
170
|
+
if (savedState.fingerprint) {
|
|
171
|
+
contextOptions = {
|
|
172
|
+
...contextOptions,
|
|
173
|
+
locale: savedState.fingerprint.locale,
|
|
174
|
+
timezoneId: savedState.fingerprint.timezoneId,
|
|
175
|
+
colorScheme: savedState.fingerprint.colorScheme,
|
|
176
|
+
reducedMotion: savedState.fingerprint.reducedMotion,
|
|
177
|
+
forcedColors: savedState.fingerprint.forcedColors,
|
|
178
|
+
};
|
|
179
|
+
logger.info("Using saved browser fingerprint configuration");
|
|
180
|
+
}
|
|
181
|
+
else {
|
|
182
|
+
// Get the actual settings of the host machine
|
|
183
|
+
const hostConfig = getHostMachineConfig(locale);
|
|
184
|
+
// If need to use different device type, re-get device configuration
|
|
185
|
+
if (hostConfig.deviceName !== deviceName) {
|
|
186
|
+
logger.info({ deviceType: hostConfig.deviceName }, "Use device type based on host machine settings");
|
|
187
|
+
// Use new device configuration
|
|
188
|
+
contextOptions = { ...devices[hostConfig.deviceName] };
|
|
189
|
+
}
|
|
190
|
+
contextOptions = {
|
|
191
|
+
...contextOptions,
|
|
192
|
+
locale: hostConfig.locale,
|
|
193
|
+
timezoneId: hostConfig.timezoneId,
|
|
194
|
+
colorScheme: hostConfig.colorScheme,
|
|
195
|
+
reducedMotion: hostConfig.reducedMotion,
|
|
196
|
+
forcedColors: hostConfig.forcedColors,
|
|
197
|
+
};
|
|
198
|
+
// Save newly generated fingerprint configuration
|
|
199
|
+
savedState.fingerprint = hostConfig;
|
|
200
|
+
logger.info({
|
|
201
|
+
locale: hostConfig.locale,
|
|
202
|
+
timezone: hostConfig.timezoneId,
|
|
203
|
+
colorScheme: hostConfig.colorScheme,
|
|
204
|
+
deviceType: hostConfig.deviceName,
|
|
205
|
+
}, "New browser fingerprint configuration generated based on host machine");
|
|
206
|
+
}
|
|
207
|
+
// Add general options - ensure desktop configuration is used
|
|
208
|
+
contextOptions = {
|
|
209
|
+
...contextOptions,
|
|
210
|
+
permissions: ["geolocation", "notifications"],
|
|
211
|
+
acceptDownloads: true,
|
|
212
|
+
isMobile: false, // Force desktop mode
|
|
213
|
+
hasTouch: false, // Disable touch functionality
|
|
214
|
+
javaScriptEnabled: true,
|
|
215
|
+
};
|
|
216
|
+
if (storageState) {
|
|
217
|
+
logger.info("Loading saved browser state...");
|
|
218
|
+
}
|
|
219
|
+
const context = await browser.newContext(storageState ? { ...contextOptions, storageState } : contextOptions);
|
|
220
|
+
// Set additional browser properties to avoid detection
|
|
221
|
+
await context.addInitScript(() => {
|
|
222
|
+
// Override navigator properties
|
|
223
|
+
Object.defineProperty(navigator, "webdriver", { get: () => false });
|
|
224
|
+
Object.defineProperty(navigator, "plugins", {
|
|
225
|
+
get: () => [1, 2, 3, 4, 5],
|
|
226
|
+
});
|
|
227
|
+
Object.defineProperty(navigator, "languages", {
|
|
228
|
+
get: () => ["en-US", "en"],
|
|
229
|
+
});
|
|
230
|
+
// Override window properties
|
|
231
|
+
// @ts-expect-error - Ignore error that chrome property does not exist
|
|
232
|
+
window.chrome = {
|
|
233
|
+
runtime: {},
|
|
234
|
+
loadTimes: function () { },
|
|
235
|
+
csi: function () { },
|
|
236
|
+
app: {},
|
|
237
|
+
};
|
|
238
|
+
// Add WebGL fingerprint randomization
|
|
239
|
+
if (typeof WebGLRenderingContext !== "undefined") {
|
|
240
|
+
const getParameter = WebGLRenderingContext.prototype.getParameter;
|
|
241
|
+
WebGLRenderingContext.prototype.getParameter = function (parameter) {
|
|
242
|
+
// Randomize UNMASKED_VENDOR_WEBGL and UNMASKED_RENDERER_WEBGL
|
|
243
|
+
if (parameter === 37445) {
|
|
244
|
+
return "Intel Inc.";
|
|
245
|
+
}
|
|
246
|
+
if (parameter === 37446) {
|
|
247
|
+
return "Intel Iris OpenGL Engine";
|
|
248
|
+
}
|
|
249
|
+
return getParameter.call(this, parameter);
|
|
250
|
+
};
|
|
251
|
+
}
|
|
252
|
+
});
|
|
253
|
+
const page = await context.newPage();
|
|
254
|
+
// Set additional page properties
|
|
255
|
+
await page.addInitScript(() => {
|
|
256
|
+
// Simulate real screen size and color depth
|
|
257
|
+
Object.defineProperty(window.screen, "width", { get: () => 1920 });
|
|
258
|
+
Object.defineProperty(window.screen, "height", { get: () => 1080 });
|
|
259
|
+
Object.defineProperty(window.screen, "colorDepth", { get: () => 24 });
|
|
260
|
+
Object.defineProperty(window.screen, "pixelDepth", { get: () => 24 });
|
|
261
|
+
});
|
|
262
|
+
try {
|
|
263
|
+
// Use saved Google domain or randomly select one
|
|
264
|
+
let selectedDomain;
|
|
265
|
+
if (savedState.googleDomain) {
|
|
266
|
+
selectedDomain = savedState.googleDomain;
|
|
267
|
+
logger.info({ domain: selectedDomain }, "Using saved Google domain");
|
|
268
|
+
}
|
|
269
|
+
else {
|
|
270
|
+
const domains = [...googleDomains];
|
|
271
|
+
selectedDomain = domains[Math.floor(Math.random() * domains.length)];
|
|
272
|
+
// Save selected domain
|
|
273
|
+
savedState.googleDomain = selectedDomain;
|
|
274
|
+
logger.info({ domain: selectedDomain }, "Randomly select Google domain");
|
|
275
|
+
}
|
|
276
|
+
logger.info("Accessing Google search page...");
|
|
277
|
+
// Visit Google search page
|
|
278
|
+
const response = await page.goto(selectedDomain, {
|
|
279
|
+
timeout,
|
|
280
|
+
waitUntil: "load",
|
|
281
|
+
});
|
|
282
|
+
// Check if redirected to human-machine verification page
|
|
283
|
+
const currentUrl = page.url();
|
|
284
|
+
const sorryPatterns = [
|
|
285
|
+
"google.com/sorry/index",
|
|
286
|
+
"google.com/sorry",
|
|
287
|
+
"recaptcha",
|
|
288
|
+
"captcha",
|
|
289
|
+
"unusual traffic",
|
|
290
|
+
];
|
|
291
|
+
const isBlockedPage = sorryPatterns.some((pattern) => currentUrl.includes(pattern) ||
|
|
292
|
+
(response && response.url().toString().includes(pattern)));
|
|
293
|
+
if (isBlockedPage) {
|
|
294
|
+
if (headless) {
|
|
295
|
+
logger.warn("Detected human-machine verification page, restarting browser in headed mode...");
|
|
296
|
+
// Close current page and context
|
|
297
|
+
await page.close();
|
|
298
|
+
await context.close();
|
|
299
|
+
// If it's an externally provided browser, don't close it, but create a new browser instance
|
|
300
|
+
if (browserWasProvided) {
|
|
301
|
+
logger.info("Encountered human-machine verification when using external browser instance, creating new browser instance...");
|
|
302
|
+
// Create a new browser instance, no longer using the externally provided instance
|
|
303
|
+
const newBrowser = await chromium.launch({
|
|
304
|
+
headless: false, // Use headed mode
|
|
305
|
+
timeout: timeout * 2,
|
|
306
|
+
args: SecureBrowserConfig.getDefaultSearchArgs(),
|
|
307
|
+
ignoreDefaultArgs: ["--enable-automation"],
|
|
308
|
+
});
|
|
309
|
+
// Execute search using new browser instance
|
|
310
|
+
try {
|
|
311
|
+
const tempContext = await newBrowser.newContext(contextOptions);
|
|
312
|
+
const tempPage = await tempContext.newPage();
|
|
313
|
+
// Code to handle human-machine verification can be added here
|
|
314
|
+
// ...
|
|
315
|
+
// Close temporary browser after completion
|
|
316
|
+
await newBrowser.close();
|
|
317
|
+
// Re-execute search
|
|
318
|
+
return performSearch(false);
|
|
319
|
+
}
|
|
320
|
+
catch (error) {
|
|
321
|
+
await newBrowser.close();
|
|
322
|
+
throw error;
|
|
323
|
+
}
|
|
324
|
+
}
|
|
325
|
+
else {
|
|
326
|
+
// If not externally provided browser, close directly and re-execute search
|
|
327
|
+
await browser.close();
|
|
328
|
+
return performSearch(false); // Re-execute search in headed mode
|
|
329
|
+
}
|
|
330
|
+
}
|
|
331
|
+
else {
|
|
332
|
+
logger.warn("Detected human-machine verification page, please complete verification in browser...");
|
|
333
|
+
// Wait for user to complete verification and redirect back to search page
|
|
334
|
+
await page.waitForNavigation({
|
|
335
|
+
timeout: timeout * 2,
|
|
336
|
+
url: (url) => {
|
|
337
|
+
const urlStr = url.toString();
|
|
338
|
+
return sorryPatterns.every((pattern) => !urlStr.includes(pattern));
|
|
339
|
+
},
|
|
340
|
+
});
|
|
341
|
+
logger.info("Human-machine verification completed, continuing search...");
|
|
342
|
+
}
|
|
343
|
+
}
|
|
344
|
+
logger.info({ query }, "Entering search keywords");
|
|
345
|
+
// Wait for search box to appear - try multiple possible selectors
|
|
346
|
+
const searchInputSelectors = [
|
|
347
|
+
"textarea[name='q']",
|
|
348
|
+
"input[name='q']",
|
|
349
|
+
"textarea[title='Search']",
|
|
350
|
+
"input[title='Search']",
|
|
351
|
+
"textarea[aria-label='Search']",
|
|
352
|
+
"input[aria-label='Search']",
|
|
353
|
+
"textarea",
|
|
354
|
+
];
|
|
355
|
+
let searchInput = null;
|
|
356
|
+
for (const selector of searchInputSelectors) {
|
|
357
|
+
searchInput = await page.$(selector);
|
|
358
|
+
if (searchInput) {
|
|
359
|
+
logger.info({ selector }, "Found search box");
|
|
360
|
+
break;
|
|
361
|
+
}
|
|
362
|
+
}
|
|
363
|
+
if (!searchInput) {
|
|
364
|
+
logger.error("Unable to find search box");
|
|
365
|
+
throw new Error("Unable to find search box");
|
|
366
|
+
}
|
|
367
|
+
// Click search box directly, reduce delay
|
|
368
|
+
await searchInput.click();
|
|
369
|
+
// Input entire query string directly, instead of typing character by character
|
|
370
|
+
await page.keyboard.type(query, { delay: getRandomDelay(10, 30) });
|
|
371
|
+
// Reduce delay before pressing enter
|
|
372
|
+
await page.waitForTimeout(getRandomDelay(100, 300));
|
|
373
|
+
await page.keyboard.press("Enter");
|
|
374
|
+
logger.info("Waiting for page to finish loading...");
|
|
375
|
+
// Wait for page to finish loading
|
|
376
|
+
await page.waitForLoadState("load", { timeout });
|
|
377
|
+
// Check if URL after search is redirected to human-machine verification page
|
|
378
|
+
const searchUrl = page.url();
|
|
379
|
+
const isBlockedAfterSearch = sorryPatterns.some((pattern) => searchUrl.includes(pattern));
|
|
380
|
+
if (isBlockedAfterSearch) {
|
|
381
|
+
if (headless) {
|
|
382
|
+
logger.warn("Detected human-machine verification page after search, restarting browser in headed mode...");
|
|
383
|
+
// Close current page and context
|
|
384
|
+
await page.close();
|
|
385
|
+
await context.close();
|
|
386
|
+
// If it's an externally provided browser, don't close it, but create a new browser instance
|
|
387
|
+
if (browserWasProvided) {
|
|
388
|
+
logger.info("Encountered human-machine verification after search when using external browser instance, creating new browser instance...");
|
|
389
|
+
// Create a new browser instance, no longer using the externally provided instance
|
|
390
|
+
const newBrowser = await chromium.launch({
|
|
391
|
+
headless: false, // Use headed mode
|
|
392
|
+
timeout: timeout * 2,
|
|
393
|
+
args: SecureBrowserConfig.getDefaultSearchArgs(),
|
|
394
|
+
ignoreDefaultArgs: ["--enable-automation"],
|
|
395
|
+
});
|
|
396
|
+
// Execute search using new browser instance
|
|
397
|
+
try {
|
|
398
|
+
const tempContext = await newBrowser.newContext(contextOptions);
|
|
399
|
+
const tempPage = await tempContext.newPage();
|
|
400
|
+
// Code to handle human-machine verification can be added here
|
|
401
|
+
// ...
|
|
402
|
+
// Close temporary browser after completion
|
|
403
|
+
await newBrowser.close();
|
|
404
|
+
// Re-execute search
|
|
405
|
+
return performSearch(false);
|
|
406
|
+
}
|
|
407
|
+
catch (error) {
|
|
408
|
+
await newBrowser.close();
|
|
409
|
+
throw error;
|
|
410
|
+
}
|
|
411
|
+
}
|
|
412
|
+
else {
|
|
413
|
+
// If not externally provided browser, close directly and re-execute search
|
|
414
|
+
await browser.close();
|
|
415
|
+
return performSearch(false); // Re-execute search in headed mode
|
|
416
|
+
}
|
|
417
|
+
}
|
|
418
|
+
else {
|
|
419
|
+
logger.warn("Detected human-machine verification page after search, please complete verification in browser...");
|
|
420
|
+
// Wait for user to complete verification and redirect back to search page
|
|
421
|
+
await page.waitForNavigation({
|
|
422
|
+
timeout: timeout * 2,
|
|
423
|
+
url: (url) => {
|
|
424
|
+
const urlStr = url.toString();
|
|
425
|
+
return sorryPatterns.every((pattern) => !urlStr.includes(pattern));
|
|
426
|
+
},
|
|
427
|
+
});
|
|
428
|
+
logger.info("Human-machine verification completed, continuing search...");
|
|
429
|
+
// Wait for page to reload
|
|
430
|
+
await page.waitForLoadState("load", { timeout });
|
|
431
|
+
}
|
|
432
|
+
}
|
|
433
|
+
logger.info({ url: page.url() }, "Waiting for search results to load...");
|
|
434
|
+
// Try multiple possible search result selectors
|
|
435
|
+
const searchResultSelectors = [
|
|
436
|
+
"#search",
|
|
437
|
+
"#rso",
|
|
438
|
+
".g",
|
|
439
|
+
"[data-sokoban-container]",
|
|
440
|
+
"div[role='main']",
|
|
441
|
+
];
|
|
442
|
+
let resultsFound = false;
|
|
443
|
+
for (const selector of searchResultSelectors) {
|
|
444
|
+
try {
|
|
445
|
+
await page.waitForSelector(selector, { timeout: timeout / 2 });
|
|
446
|
+
logger.info({ selector }, "Found search results");
|
|
447
|
+
resultsFound = true;
|
|
448
|
+
break;
|
|
449
|
+
}
|
|
450
|
+
catch {
|
|
451
|
+
// Continue trying next selector
|
|
452
|
+
}
|
|
453
|
+
}
|
|
454
|
+
if (!resultsFound) {
|
|
455
|
+
// If search results cannot be found, check if redirected to human-machine verification page
|
|
456
|
+
const currentUrl = page.url();
|
|
457
|
+
const isBlockedDuringResults = sorryPatterns.some((pattern) => currentUrl.includes(pattern));
|
|
458
|
+
if (isBlockedDuringResults) {
|
|
459
|
+
if (headless) {
|
|
460
|
+
logger.warn("Detected human-machine verification page while waiting for search results, restarting browser in headed mode...");
|
|
461
|
+
// Close current page and context
|
|
462
|
+
await page.close();
|
|
463
|
+
await context.close();
|
|
464
|
+
// If it's an externally provided browser, don't close it, but create a new browser instance
|
|
465
|
+
if (browserWasProvided) {
|
|
466
|
+
logger.info("Encountered human-machine verification while waiting for search results when using external browser instance, creating new browser instance...");
|
|
467
|
+
// Create a new browser instance, no longer using the externally provided instance
|
|
468
|
+
const newBrowser = await chromium.launch({
|
|
469
|
+
headless: false, // Use headed mode
|
|
470
|
+
timeout: timeout * 2,
|
|
471
|
+
args: SecureBrowserConfig.getDefaultSearchArgs(),
|
|
472
|
+
ignoreDefaultArgs: ["--enable-automation"],
|
|
473
|
+
});
|
|
474
|
+
// Execute search using new browser instance
|
|
475
|
+
try {
|
|
476
|
+
const tempContext = await newBrowser.newContext(contextOptions);
|
|
477
|
+
const tempPage = await tempContext.newPage();
|
|
478
|
+
// Code to handle human-machine verification can be added here
|
|
479
|
+
// ...
|
|
480
|
+
// Close temporary browser after completion
|
|
481
|
+
await newBrowser.close();
|
|
482
|
+
// Re-execute search
|
|
483
|
+
return performSearch(false);
|
|
484
|
+
}
|
|
485
|
+
catch (error) {
|
|
486
|
+
await newBrowser.close();
|
|
487
|
+
throw error;
|
|
488
|
+
}
|
|
489
|
+
}
|
|
490
|
+
else {
|
|
491
|
+
// If not externally provided browser, close directly and re-execute search
|
|
492
|
+
await browser.close();
|
|
493
|
+
return performSearch(false); // Re-execute search in headed mode
|
|
494
|
+
}
|
|
495
|
+
}
|
|
496
|
+
else {
|
|
497
|
+
logger.warn("Detected human-machine verification page while waiting for search results, please complete verification in browser...");
|
|
498
|
+
// Wait for user to complete verification and redirect back to search page
|
|
499
|
+
await page.waitForNavigation({
|
|
500
|
+
timeout: timeout * 2,
|
|
501
|
+
url: (url) => {
|
|
502
|
+
const urlStr = url.toString();
|
|
503
|
+
return sorryPatterns.every((pattern) => !urlStr.includes(pattern));
|
|
504
|
+
},
|
|
505
|
+
});
|
|
506
|
+
logger.info("Human-machine verification completed, continuing search...");
|
|
507
|
+
// Try waiting for search results again
|
|
508
|
+
for (const selector of searchResultSelectors) {
|
|
509
|
+
try {
|
|
510
|
+
await page.waitForSelector(selector, { timeout: timeout / 2 });
|
|
511
|
+
logger.info({ selector }, "Found search results after verification");
|
|
512
|
+
resultsFound = true;
|
|
513
|
+
break;
|
|
514
|
+
}
|
|
515
|
+
catch {
|
|
516
|
+
// Continue trying next selector
|
|
517
|
+
}
|
|
518
|
+
}
|
|
519
|
+
if (!resultsFound) {
|
|
520
|
+
logger.error("Unable to find search result elements");
|
|
521
|
+
throw new Error("Unable to find search result elements");
|
|
522
|
+
}
|
|
523
|
+
}
|
|
524
|
+
}
|
|
525
|
+
else {
|
|
526
|
+
// If not a human-machine verification issue, throw error
|
|
527
|
+
logger.error("Unable to find search result elements");
|
|
528
|
+
throw new Error("Unable to find search result elements");
|
|
529
|
+
}
|
|
530
|
+
}
|
|
531
|
+
// Reduce waiting time
|
|
532
|
+
await page.waitForTimeout(getRandomDelay(200, 500));
|
|
533
|
+
logger.info("Extracting search results...");
|
|
534
|
+
// Track search start time for performance metrics
|
|
535
|
+
const searchStartTime = Date.now();
|
|
536
|
+
// Extract search results
|
|
537
|
+
const results = await page.evaluate((maxResults) => {
|
|
538
|
+
const results = [];
|
|
539
|
+
const seenUrls = new Set(); // For deduplication
|
|
540
|
+
// Helper function to extract domain from URL
|
|
541
|
+
const extractDomain = (url) => {
|
|
542
|
+
try {
|
|
543
|
+
const urlObj = new URL(url);
|
|
544
|
+
return urlObj.hostname.replace(/^www\./, "");
|
|
545
|
+
}
|
|
546
|
+
catch {
|
|
547
|
+
return "";
|
|
548
|
+
}
|
|
549
|
+
};
|
|
550
|
+
// Helper function to detect content type from URL
|
|
551
|
+
const detectContentType = (url) => {
|
|
552
|
+
const lowerUrl = url.toLowerCase();
|
|
553
|
+
if (lowerUrl.endsWith(".pdf") || lowerUrl.includes(".pdf?"))
|
|
554
|
+
return "pdf";
|
|
555
|
+
if (lowerUrl.endsWith(".doc") ||
|
|
556
|
+
lowerUrl.endsWith(".docx") ||
|
|
557
|
+
lowerUrl.endsWith(".txt"))
|
|
558
|
+
return "doc";
|
|
559
|
+
if (lowerUrl.includes("youtube.com") ||
|
|
560
|
+
lowerUrl.includes("vimeo.com") ||
|
|
561
|
+
lowerUrl.endsWith(".mp4") ||
|
|
562
|
+
lowerUrl.endsWith(".avi") ||
|
|
563
|
+
lowerUrl.includes("/video/"))
|
|
564
|
+
return "video";
|
|
565
|
+
if (lowerUrl.endsWith(".jpg") ||
|
|
566
|
+
lowerUrl.endsWith(".jpeg") ||
|
|
567
|
+
lowerUrl.endsWith(".png") ||
|
|
568
|
+
lowerUrl.endsWith(".gif") ||
|
|
569
|
+
lowerUrl.endsWith(".webp") ||
|
|
570
|
+
lowerUrl.includes("/image/"))
|
|
571
|
+
return "image";
|
|
572
|
+
return "html";
|
|
573
|
+
};
|
|
574
|
+
// Helper function to detect if URL is secure
|
|
575
|
+
const isSecureUrl = (url) => {
|
|
576
|
+
return url.startsWith("https://");
|
|
577
|
+
};
|
|
578
|
+
// Helper function to count words in text
|
|
579
|
+
const countWords = (text) => {
|
|
580
|
+
return text
|
|
581
|
+
.trim()
|
|
582
|
+
.split(/\s+/)
|
|
583
|
+
.filter((word) => word.length > 0).length;
|
|
584
|
+
};
|
|
585
|
+
// Define multiple selector sets, sorted by priority
|
|
586
|
+
const selectorSets = [
|
|
587
|
+
{
|
|
588
|
+
container: "#search div[data-hveid]",
|
|
589
|
+
title: "h3",
|
|
590
|
+
snippet: ".VwiC3b",
|
|
591
|
+
},
|
|
592
|
+
{
|
|
593
|
+
container: "#rso div[data-hveid]",
|
|
594
|
+
title: "h3",
|
|
595
|
+
snippet: '[data-sncf="1"]',
|
|
596
|
+
},
|
|
597
|
+
{
|
|
598
|
+
container: ".g",
|
|
599
|
+
title: "h3",
|
|
600
|
+
snippet: 'div[style*="webkit-line-clamp"]',
|
|
601
|
+
},
|
|
602
|
+
{
|
|
603
|
+
container: "div[jscontroller][data-hveid]",
|
|
604
|
+
title: "h3",
|
|
605
|
+
snippet: 'div[role="text"]',
|
|
606
|
+
},
|
|
607
|
+
];
|
|
608
|
+
// Alternative snippet selectors
|
|
609
|
+
const alternativeSnippetSelectors = [
|
|
610
|
+
".VwiC3b",
|
|
611
|
+
'[data-sncf="1"]',
|
|
612
|
+
'div[style*="webkit-line-clamp"]',
|
|
613
|
+
'div[role="text"]',
|
|
614
|
+
];
|
|
615
|
+
// Try each selector set
|
|
616
|
+
for (const selectors of selectorSets) {
|
|
617
|
+
if (results.length >= maxResults)
|
|
618
|
+
break; // Stop if quantity limit reached
|
|
619
|
+
const containers = document.querySelectorAll(selectors.container);
|
|
620
|
+
for (const container of containers) {
|
|
621
|
+
if (results.length >= maxResults)
|
|
622
|
+
break;
|
|
623
|
+
const titleElement = container.querySelector(selectors.title);
|
|
624
|
+
if (!titleElement)
|
|
625
|
+
continue;
|
|
626
|
+
const title = (titleElement.textContent || "").trim();
|
|
627
|
+
// Find link
|
|
628
|
+
let link = "";
|
|
629
|
+
const linkInTitle = titleElement.querySelector("a");
|
|
630
|
+
if (linkInTitle) {
|
|
631
|
+
link = linkInTitle.href;
|
|
632
|
+
}
|
|
633
|
+
else {
|
|
634
|
+
let current = titleElement;
|
|
635
|
+
while (current && current.tagName !== "A") {
|
|
636
|
+
current = current.parentElement;
|
|
637
|
+
}
|
|
638
|
+
if (current && current instanceof HTMLAnchorElement) {
|
|
639
|
+
link = current.href;
|
|
640
|
+
}
|
|
641
|
+
else {
|
|
642
|
+
const containerLink = container.querySelector("a");
|
|
643
|
+
if (containerLink) {
|
|
644
|
+
link = containerLink.href;
|
|
645
|
+
}
|
|
646
|
+
}
|
|
647
|
+
}
|
|
648
|
+
// Filter invalid or duplicate links
|
|
649
|
+
if (!link || !link.startsWith("http") || seenUrls.has(link))
|
|
650
|
+
continue;
|
|
651
|
+
// Find snippet
|
|
652
|
+
let snippet = "";
|
|
653
|
+
const snippetElement = container.querySelector(selectors.snippet);
|
|
654
|
+
if (snippetElement) {
|
|
655
|
+
snippet = (snippetElement.textContent || "").trim();
|
|
656
|
+
}
|
|
657
|
+
else {
|
|
658
|
+
// Try other snippet selectors
|
|
659
|
+
for (const altSelector of alternativeSnippetSelectors) {
|
|
660
|
+
const element = container.querySelector(altSelector);
|
|
661
|
+
if (element) {
|
|
662
|
+
snippet = (element.textContent || "").trim();
|
|
663
|
+
break;
|
|
664
|
+
}
|
|
665
|
+
}
|
|
666
|
+
// If still no snippet found, try general method
|
|
667
|
+
if (!snippet) {
|
|
668
|
+
const textNodes = Array.from(container.querySelectorAll("div")).filter((el) => !el.querySelector("h3") &&
|
|
669
|
+
(el.textContent || "").trim().length > 20);
|
|
670
|
+
if (textNodes.length > 0) {
|
|
671
|
+
snippet = (textNodes[0].textContent || "").trim();
|
|
672
|
+
}
|
|
673
|
+
}
|
|
674
|
+
}
|
|
675
|
+
// Only add results with title and link
|
|
676
|
+
if (title && link) {
|
|
677
|
+
// Check for rich snippet indicators
|
|
678
|
+
const hasRichSnippet = !!(container.querySelector(".kno-rdesc") || // Knowledge panel
|
|
679
|
+
container.querySelector("[data-attrid]") || // Structured data
|
|
680
|
+
container.querySelector(".commercial-unit-desktop-top") || // Featured snippets
|
|
681
|
+
container.querySelector('[role="heading"]') // FAQ/structured content
|
|
682
|
+
);
|
|
683
|
+
const contentType = detectContentType(link);
|
|
684
|
+
const wordCount = countWords(title + " " + snippet);
|
|
685
|
+
results.push({
|
|
686
|
+
title,
|
|
687
|
+
link,
|
|
688
|
+
snippet,
|
|
689
|
+
domain: extractDomain(link),
|
|
690
|
+
position: results.length + 1,
|
|
691
|
+
snippetLength: snippet.length,
|
|
692
|
+
hasRichSnippet,
|
|
693
|
+
contentType,
|
|
694
|
+
isSecure: isSecureUrl(link),
|
|
695
|
+
hasImages: container.querySelector("img") !== null,
|
|
696
|
+
wordCount,
|
|
697
|
+
language: "en", // Default to English
|
|
698
|
+
});
|
|
699
|
+
seenUrls.add(link); // Record processed URL
|
|
700
|
+
}
|
|
701
|
+
}
|
|
702
|
+
}
|
|
703
|
+
// If main selectors didn't find enough results, try more general method (as supplement)
|
|
704
|
+
if (results.length < maxResults) {
|
|
705
|
+
const anchorElements = Array.from(document.querySelectorAll("a[href^='http']"));
|
|
706
|
+
for (const el of anchorElements) {
|
|
707
|
+
if (results.length >= maxResults)
|
|
708
|
+
break;
|
|
709
|
+
// Check if el is HTMLAnchorElement
|
|
710
|
+
if (!(el instanceof HTMLAnchorElement)) {
|
|
711
|
+
continue;
|
|
712
|
+
}
|
|
713
|
+
const link = el.href;
|
|
714
|
+
// Filter out navigation links, image links, existing links, etc.
|
|
715
|
+
if (!link ||
|
|
716
|
+
seenUrls.has(link) ||
|
|
717
|
+
link.includes("google.com/") ||
|
|
718
|
+
link.includes("accounts.google") ||
|
|
719
|
+
link.includes("support.google")) {
|
|
720
|
+
continue;
|
|
721
|
+
}
|
|
722
|
+
const title = (el.textContent || "").trim();
|
|
723
|
+
if (!title)
|
|
724
|
+
continue; // Skip links without text content
|
|
725
|
+
// Try to get surrounding text as snippet
|
|
726
|
+
let snippet = "";
|
|
727
|
+
let parent = el.parentElement;
|
|
728
|
+
for (let i = 0; i < 3 && parent; i++) {
|
|
729
|
+
const text = (parent.textContent || "").trim();
|
|
730
|
+
// Ensure snippet text differs from title and has certain length
|
|
731
|
+
if (text.length > 20 && text !== title) {
|
|
732
|
+
snippet = text;
|
|
733
|
+
break; // Stop upward search when suitable snippet found
|
|
734
|
+
}
|
|
735
|
+
parent = parent.parentElement;
|
|
736
|
+
}
|
|
737
|
+
const contentType = detectContentType(link);
|
|
738
|
+
const wordCount = countWords(title + " " + snippet);
|
|
739
|
+
results.push({
|
|
740
|
+
title,
|
|
741
|
+
link,
|
|
742
|
+
snippet,
|
|
743
|
+
domain: extractDomain(link),
|
|
744
|
+
position: results.length + 1,
|
|
745
|
+
snippetLength: snippet.length,
|
|
746
|
+
hasRichSnippet: false,
|
|
747
|
+
contentType,
|
|
748
|
+
isSecure: isSecureUrl(link),
|
|
749
|
+
hasImages: false,
|
|
750
|
+
wordCount,
|
|
751
|
+
language: "en",
|
|
752
|
+
});
|
|
753
|
+
seenUrls.add(link);
|
|
754
|
+
}
|
|
755
|
+
}
|
|
756
|
+
return results.slice(0, maxResults); // Ensure not exceeding limit
|
|
757
|
+
}, limit); // Pass limit to evaluate function
|
|
758
|
+
const searchTime = Date.now() - searchStartTime;
|
|
759
|
+
logger.info({ count: results.length, searchTimeMs: searchTime }, "Successfully obtained search results");
|
|
760
|
+
try {
|
|
761
|
+
// Save browser state (unless user specifies not to save)
|
|
762
|
+
if (!noSaveState) {
|
|
763
|
+
logger.info({ stateFile }, "Saving browser state...");
|
|
764
|
+
// Ensure directory exists
|
|
765
|
+
const stateDir = path.dirname(stateFile);
|
|
766
|
+
if (!fs.existsSync(stateDir)) {
|
|
767
|
+
fs.mkdirSync(stateDir, { recursive: true });
|
|
768
|
+
}
|
|
769
|
+
// Save state
|
|
770
|
+
await context.storageState({ path: stateFile });
|
|
771
|
+
logger.info("Browser state saved successfully!");
|
|
772
|
+
// Save fingerprint configuration
|
|
773
|
+
try {
|
|
774
|
+
fs.writeFileSync(fingerprintFile, JSON.stringify(savedState, null, 2), "utf8");
|
|
775
|
+
logger.info({ fingerprintFile }, "Fingerprint configuration saved");
|
|
776
|
+
}
|
|
777
|
+
catch (fingerprintError) {
|
|
778
|
+
logger.error({ error: fingerprintError }, "Error saving fingerprint configuration");
|
|
779
|
+
}
|
|
780
|
+
}
|
|
781
|
+
else {
|
|
782
|
+
logger.info("Not saving browser state according to user settings");
|
|
783
|
+
}
|
|
784
|
+
}
|
|
785
|
+
catch (error) {
|
|
786
|
+
logger.error({ error }, "Error saving browser state");
|
|
787
|
+
}
|
|
788
|
+
// Only close browser if it's not externally provided
|
|
789
|
+
if (!browserWasProvided) {
|
|
790
|
+
logger.info("Closing browser...");
|
|
791
|
+
await browser.close();
|
|
792
|
+
}
|
|
793
|
+
else {
|
|
794
|
+
logger.info("Keeping browser instance open");
|
|
795
|
+
}
|
|
796
|
+
// Return search results with metadata
|
|
797
|
+
const resultSummary = results.length > 0
|
|
798
|
+
? `Found ${results.length} results${results[0].hasRichSnippet ? " (including rich snippets)" : ""}`
|
|
799
|
+
: "No results found";
|
|
800
|
+
return {
|
|
801
|
+
query,
|
|
802
|
+
results,
|
|
803
|
+
totalResults: results.length,
|
|
804
|
+
searchTime,
|
|
805
|
+
timestamp: new Date().toISOString(),
|
|
806
|
+
resultSummary,
|
|
807
|
+
};
|
|
808
|
+
}
|
|
809
|
+
catch (error) {
|
|
810
|
+
logger.error({ error }, "Error occurred during search");
|
|
811
|
+
try {
|
|
812
|
+
// Try to save browser state, even if error occurred
|
|
813
|
+
if (!noSaveState) {
|
|
814
|
+
logger.info({ stateFile }, "Saving browser state...");
|
|
815
|
+
const stateDir = path.dirname(stateFile);
|
|
816
|
+
if (!fs.existsSync(stateDir)) {
|
|
817
|
+
fs.mkdirSync(stateDir, { recursive: true });
|
|
818
|
+
}
|
|
819
|
+
await context.storageState({ path: stateFile });
|
|
820
|
+
// Save fingerprint configuration
|
|
821
|
+
try {
|
|
822
|
+
fs.writeFileSync(fingerprintFile, JSON.stringify(savedState, null, 2), "utf8");
|
|
823
|
+
logger.info({ fingerprintFile }, "Fingerprint configuration saved");
|
|
824
|
+
}
|
|
825
|
+
catch (fingerprintError) {
|
|
826
|
+
logger.error({ error: fingerprintError }, "Error saving fingerprint configuration");
|
|
827
|
+
}
|
|
828
|
+
}
|
|
829
|
+
}
|
|
830
|
+
catch (stateError) {
|
|
831
|
+
logger.error({ error: stateError }, "Error saving browser state");
|
|
832
|
+
}
|
|
833
|
+
// Only close browser if it's not externally provided
|
|
834
|
+
if (!browserWasProvided) {
|
|
835
|
+
logger.info("Closing browser...");
|
|
836
|
+
await browser.close();
|
|
837
|
+
}
|
|
838
|
+
else {
|
|
839
|
+
logger.info("Keeping browser instance open");
|
|
840
|
+
}
|
|
841
|
+
// Re-throw the error instead of returning error results
|
|
842
|
+
throw error;
|
|
843
|
+
}
|
|
844
|
+
// Remove finally block, as resource cleanup is already handled in try and catch blocks
|
|
845
|
+
}
|
|
846
|
+
// First try to execute search in headless mode
|
|
847
|
+
return performSearch(useHeadless);
|
|
848
|
+
}
|
|
849
|
+
/**
 * Get the raw HTML of a Google search result page.
 *
 * Launches a headless Chromium instance, opens a Google domain, types the
 * query into the search box, waits for the result page to load and captures
 * its HTML plus a full-page screenshot. Unless `noSaveState` is set, the
 * browser storage state and fingerprint configuration are persisted while the
 * browser context is still open, so the save can actually succeed.
 *
 * @param query Search keywords
 * @param options Search options. Reads `timeout` (ms, default 60000),
 *   `stateFile`, `noSaveState` (default false) and `locale` (default "en-US").
 *   The browser is always started headless regardless of other options.
 * @param saveToFile Whether to save HTML to file (optional)
 * @param outputPath HTML output file path (optional, defaults to './google-search-html/[query]-[timestamp].html')
 * @returns Response object containing `query`, `html` (with script/style
 *   elements stripped), `url`, `savedPath`, `screenshotPath` and
 *   `originalHtmlLength` (length of the unstripped HTML). Note that `savedPath`
 *   echoes `outputPath` even when `saveToFile` is false.
 * @throws Error when no search box can be located, or any error raised by the
 *   browser / file system while retrieving the page.
 */
export async function getGoogleSearchPageHtml(query, options = {}, saveToFile = false, outputPath) {
    // Defaults for the options this function reads
    const { timeout = 60000, stateFile: userStateFile, noSaveState = false, locale = "en-US", } = options;
    // Resolve and validate state file path
    const stateFile = resolveStateFilePath(userStateFile);
    // Ignore passed headless parameter, always start in headless mode
    const useHeadless = true;
    logger.info({ options: { ...options, stateFile } }, "Initializing browser to get search page HTML...");
    // Setup browser environment
    const { storageState, savedState, fingerprintFile, googleDomains, getRandomDeviceConfig, getRandomDelay, } = setupBrowserEnvironment({
        stateFile,
        timeout,
        locale,
        noSaveState,
    });
    // Dedicated function that drives the browser and returns the HTML payload
    async function performSearchAndGetHtml(headless) {
        // Initialize browser with secure arguments from shared config
        const browser = await chromium.launch({
            headless,
            timeout: timeout * 2, // Increase browser startup timeout
            args: SecureBrowserConfig.getDefaultSearchArgs(),
            ignoreDefaultArgs: ["--enable-automation"],
        });
        // Everything after launch lives inside try/finally so the browser is
        // closed exactly once, whether setup, navigation or saving fails.
        try {
            // Get device configuration - use saved or randomly generated
            const [, deviceConfig] = getRandomDeviceConfig();
            // Reuse the saved fingerprint if there is one; otherwise derive it
            // from the host machine and remember it for the next run.
            let fingerprint = savedState.fingerprint;
            if (fingerprint) {
                logger.info("Using saved browser fingerprint configuration");
            }
            else {
                fingerprint = getHostMachineConfig(locale);
                savedState.fingerprint = fingerprint;
                logger.info({
                    locale: fingerprint.locale,
                    timezone: fingerprint.timezoneId,
                    colorScheme: fingerprint.colorScheme,
                    deviceType: fingerprint.deviceName,
                }, "New browser fingerprint configuration generated based on host machine");
            }
            // Device settings first, then fingerprint, then general options that
            // force a desktop configuration.
            const contextOptions = {
                ...deviceConfig,
                locale: fingerprint.locale,
                timezoneId: fingerprint.timezoneId,
                colorScheme: fingerprint.colorScheme,
                reducedMotion: fingerprint.reducedMotion,
                forcedColors: fingerprint.forcedColors,
                permissions: ["geolocation", "notifications"],
                acceptDownloads: true,
                isMobile: false, // Force desktop mode
                hasTouch: false, // Disable touch functionality
                javaScriptEnabled: true,
            };
            const context = await browser.newContext(storageState ? { ...contextOptions, storageState } : contextOptions);
            // Set additional browser properties to avoid detection
            await context.addInitScript(() => {
                // Override navigator properties
                Object.defineProperty(navigator, "webdriver", { get: () => false });
                Object.defineProperty(navigator, "plugins", {
                    get: () => [1, 2, 3, 4, 5],
                });
                Object.defineProperty(navigator, "languages", {
                    get: () => ["en-US", "en"],
                });
                // Override window properties
                window.chrome = {
                    runtime: {},
                    loadTimes: function () { },
                    csi: function () { },
                    app: {},
                };
            });
            const page = await context.newPage();
            // Set additional page properties
            await page.addInitScript(() => {
                // Simulate real screen size and color depth
                Object.defineProperty(window.screen, "width", { get: () => 1920 });
                Object.defineProperty(window.screen, "height", { get: () => 1080 });
                Object.defineProperty(window.screen, "colorDepth", { get: () => 24 });
                Object.defineProperty(window.screen, "pixelDepth", { get: () => 24 });
            });
            // Use saved Google domain or randomly select one
            let selectedDomain;
            if (savedState.googleDomain) {
                selectedDomain = savedState.googleDomain;
                logger.info({ domain: selectedDomain }, "Using saved Google domain");
            }
            else {
                const domains = [...googleDomains];
                selectedDomain = domains[Math.floor(Math.random() * domains.length)];
                // Remember the selected domain so later runs reuse it
                savedState.googleDomain = selectedDomain;
                logger.info({ domain: selectedDomain }, "Randomly select Google domain");
            }
            logger.info("Accessing Google search page...");
            // Visit Google search page
            await page.goto(selectedDomain, {
                timeout,
                waitUntil: "load",
            });
            logger.info({ query }, "Entering search keywords");
            // Locate the search box - try multiple possible selectors, most specific first
            const searchInputSelectors = [
                "textarea[name='q']",
                "input[name='q']",
                "textarea[title='Search']",
                "input[title='Search']",
                "textarea[aria-label='Search']",
                "input[aria-label='Search']",
                "textarea",
            ];
            let searchInput = null;
            for (const selector of searchInputSelectors) {
                searchInput = await page.$(selector);
                if (searchInput) {
                    logger.info({ selector }, "Found search box");
                    break;
                }
            }
            if (!searchInput) {
                logger.error("Unable to find search box");
                throw new Error("Unable to find search box");
            }
            // Focus the search box, then type the query with a small random per-key delay
            await searchInput.click();
            await page.keyboard.type(query, { delay: getRandomDelay(10, 30) });
            // Short random pause before pressing enter
            await page.waitForTimeout(getRandomDelay(100, 300));
            await page.keyboard.press("Enter");
            logger.info("Waiting for page to finish loading...");
            // Wait for page to finish loading
            await page.waitForLoadState("load", { timeout });
            // Capture the URL now, while the page is still open
            const url = page.url();
            logger.info({ url }, "Search page loaded successfully");
            // Get page HTML content
            const html = await page.content();
            const timestamp = Date.now();
            const sanitizedQuery = query.replace(/[^a-zA-Z0-9]/g, "_").slice(0, 50);
            // Determine output path: caller-supplied, or a default under the working directory
            let finalOutputPath = outputPath;
            if (saveToFile && !finalOutputPath) {
                finalOutputPath = path.join(process.cwd(), "google-search-html", `${sanitizedQuery}-${timestamp}.html`);
            }
            // Save HTML file if requested
            if (saveToFile && finalOutputPath) {
                // Create the target directory for both default and caller-supplied
                // paths; recursive mkdir is a no-op when it already exists.
                fs.mkdirSync(path.dirname(finalOutputPath), { recursive: true });
                fs.writeFileSync(finalOutputPath, html, "utf8");
                logger.info({ path: finalOutputPath }, "HTML saved to file");
            }
            // A full-page screenshot is always taken (there is no opt-out parameter)
            const screenshotDir = path.join(process.cwd(), "google-search-screenshots");
            fs.mkdirSync(screenshotDir, { recursive: true });
            const screenshotPath = path.join(screenshotDir, `${sanitizedQuery}-${timestamp}.png`);
            await page.screenshot({ path: screenshotPath, fullPage: true });
            logger.info({ path: screenshotPath }, "Screenshot saved");
            // Save browser state (unless user specifies not to save). This must
            // happen BEFORE the browser is closed: storageState() needs a live context.
            if (!noSaveState) {
                try {
                    fs.mkdirSync(path.dirname(stateFile), { recursive: true });
                    await context.storageState({ path: stateFile });
                    fs.writeFileSync(fingerprintFile, JSON.stringify(savedState, null, 2), "utf8");
                }
                catch (saveError) {
                    // Best effort: a failed save must not discard the retrieved HTML
                    logger.error({ error: saveError }, "Error saving browser state");
                }
            }
            // Clean up HTML (remove script and style tags for cleaner output)
            const cleanedHtml = html
                .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "")
                .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "");
            return {
                query,
                html: cleanedHtml,
                url,
                savedPath: finalOutputPath,
                screenshotPath,
                originalHtmlLength: html.length,
            };
        }
        catch (error) {
            logger.error({ error }, "Error occurred during HTML retrieval");
            throw error;
        }
        finally {
            // Close the browser on every path; a close failure is logged rather
            // than thrown so it cannot mask the original error or a valid result.
            try {
                await browser.close();
            }
            catch (closeError) {
                logger.error({ error: closeError }, "Error closing browser");
            }
        }
    }
    // Execute in headless mode
    return performSearchAndGetHtml(useHeadless);
}
|
|
1080
|
+
//# sourceMappingURL=search.js.map
|