@humbletoes/google-search 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/LICENSE +7 -0
  2. package/README.md +339 -0
  3. package/bin/google-search +3 -0
  4. package/bin/google-search-mcp +3 -0
  5. package/bin/google-search-mcp.cmd +2 -0
  6. package/bin/google-search.cmd +2 -0
  7. package/dist/browser-config.d.ts +41 -0
  8. package/dist/browser-config.js +96 -0
  9. package/dist/browser-config.js.map +1 -0
  10. package/dist/browser-pool.d.ts +13 -0
  11. package/dist/browser-pool.js +37 -0
  12. package/dist/browser-pool.js.map +1 -0
  13. package/dist/cache.d.ts +48 -0
  14. package/dist/cache.js +111 -0
  15. package/dist/cache.js.map +1 -0
  16. package/dist/errors.d.ts +26 -0
  17. package/dist/errors.js +48 -0
  18. package/dist/errors.js.map +1 -0
  19. package/dist/filters.d.ts +48 -0
  20. package/dist/filters.js +192 -0
  21. package/dist/filters.js.map +1 -0
  22. package/dist/html-cleaner.d.ts +62 -0
  23. package/dist/html-cleaner.js +236 -0
  24. package/dist/html-cleaner.js.map +1 -0
  25. package/dist/index.d.ts +2 -0
  26. package/dist/index.js +59 -0
  27. package/dist/index.js.map +1 -0
  28. package/dist/logger.d.ts +2 -0
  29. package/dist/logger.js +41 -0
  30. package/dist/logger.js.map +1 -0
  31. package/dist/mcp-server.d.ts +9 -0
  32. package/dist/mcp-server.js +822 -0
  33. package/dist/mcp-server.js.map +1 -0
  34. package/dist/search.d.ts +18 -0
  35. package/dist/search.js +1080 -0
  36. package/dist/search.js.map +1 -0
  37. package/dist/types.d.ts +67 -0
  38. package/dist/types.js +2 -0
  39. package/dist/types.js.map +1 -0
  40. package/dist/validation.d.ts +6 -0
  41. package/dist/validation.js +23 -0
  42. package/dist/validation.js.map +1 -0
  43. package/dist/web-fetcher.d.ts +10 -0
  44. package/dist/web-fetcher.js +179 -0
  45. package/dist/web-fetcher.js.map +1 -0
  46. package/package.json +67 -0
  47. package/scripts/setup.js +53 -0
package/dist/search.js ADDED
@@ -0,0 +1,1080 @@
1
+ import { chromium, devices } from "playwright";
2
+ import * as fs from "fs";
3
+ import * as path from "path";
4
+ import * as os from "os";
5
+ import logger from "./logger.js";
6
+ import { SecureBrowserConfig, GOOGLE_DOMAINS, getRandomDelay, } from "./browser-config.js";
7
/**
 * Validate and resolve the browser state file path.
 *
 * When no path is given, the default state file inside the user's home
 * directory is returned. Otherwise the path is resolved to an absolute path
 * (relative input resolves against the current working directory) and must
 * end up strictly inside one of the allowed roots: the home directory, the
 * OS temp directory, or the current working directory.
 *
 * @param userPath - User-provided path, or undefined/empty for the default
 * @returns Resolved, validated absolute path
 * @throws {Error} If the resolved path lies outside every allowed root
 */
function resolveStateFilePath(userPath) {
    const homeDir = os.homedir();
    // Default to home directory if no path provided
    if (!userPath) {
        return path.join(homeDir, ".google-search-browser-state.json");
    }
    const resolvedPath = path.resolve(userPath);
    // A path is inside a root when the relative path from that root neither
    // climbs out ("..", "../x"), nor is empty (the root directory itself),
    // nor is absolute (e.g. a different drive on Windows).
    const isInside = (rootDir) => {
        const relative = path.relative(path.resolve(rootDir), resolvedPath);
        return (relative !== "" &&
            relative !== ".." &&
            !relative.startsWith(`..${path.sep}`) &&
            !path.isAbsolute(relative));
    };
    // The check is applied to the RESOLVED path, so relative input such as
    // "../../etc/x" cannot bypass it by merely being relative.
    const allowedRoots = [homeDir, os.tmpdir(), process.cwd()];
    if (!allowedRoots.some(isInside)) {
        throw new Error(`Invalid state file path: ${userPath}. Path must be within home directory, tmp directory, or relative to current working directory.`);
    }
    return resolvedPath;
}
34
/**
 * Build the fingerprint settings used for a new browser context.
 *
 * Despite the name, most values are fixed rather than probed from the host:
 * the device is always "Desktop Chrome", the timezone is always US Eastern,
 * and the locale falls back to "en-US". Only the color scheme varies, and it
 * is inferred from the local clock because Node.js cannot read the system
 * color scheme directly.
 *
 * @param userLocale User-specified locale (if any)
 * @returns Fingerprint configuration object
 */
function getHostMachineConfig(userLocale) {
    // Treat 07:00-18:59 local time as daytime (light); all other hours are dark.
    const currentHour = new Date().getHours();
    const isDaytime = currentHour >= 7 && currentHour < 19;
    return {
        deviceName: "Desktop Chrome",
        // Any falsy locale (undefined, empty string) falls back to English
        locale: userLocale ? userLocale : "en-US",
        // Fixed US Eastern timezone keeps date formats English
        timezoneId: "America/New_York",
        colorScheme: isDaytime ? "light" : "dark",
        // Defaults that match what most users have configured
        reducedMotion: "no-preference",
        forcedColors: "none",
    };
}
61
/**
 * Common browser setup logic shared between search functions.
 *
 * Looks for a saved browser state file and, next to it, a saved fingerprint
 * configuration file. The fingerprint file is only read when the state file
 * exists. Nothing is written here; the returned paths and state are used by
 * the caller.
 *
 * @param config Browser setup configuration; only `config.stateFile` is read
 * @returns Object containing:
 *   - storageState: the state file path if that file exists, else undefined
 *   - savedState: parsed fingerprint file contents, or {} when unavailable
 *   - fingerprintFile: path of the fingerprint file paired with the state file
 *   - googleDomains: the shared GOOGLE_DOMAINS list
 *   - getRandomDeviceConfig: returns [deviceName, deviceDescriptor]
 *   - getRandomDelay: the shared delay helper
 */
function setupBrowserEnvironment(config) {
    let storageState = undefined;
    let savedState = {};
    // Fingerprint configuration file path. Only a trailing ".json" extension
    // is swapped; otherwise the suffix is appended. This guarantees the
    // fingerprint file never resolves to the state file itself (which would
    // let a later fingerprint write clobber the saved browser state) and that
    // a ".json" inside a directory name is left alone.
    const jsonExtension = ".json";
    const fingerprintSuffix = "-fingerprint.json";
    const fingerprintFile = config.stateFile.endsWith(jsonExtension)
        ? `${config.stateFile.slice(0, -jsonExtension.length)}${fingerprintSuffix}`
        : `${config.stateFile}${fingerprintSuffix}`;
    if (fs.existsSync(config.stateFile)) {
        logger.info({ stateFile: config.stateFile }, "Found browser state file, will use saved browser state to avoid anti-bot detection");
        storageState = config.stateFile;
        // Try to load saved fingerprint configuration
        if (fs.existsSync(fingerprintFile)) {
            try {
                const fingerprintData = fs.readFileSync(fingerprintFile, "utf8");
                const parsedState = JSON.parse(fingerprintData);
                // Valid JSON is not necessarily an object ("null", "42", "[]");
                // callers dereference savedState, so keep the {} default then.
                if (parsedState !== null &&
                    typeof parsedState === "object" &&
                    !Array.isArray(parsedState)) {
                    savedState = parsedState;
                    logger.info("Loaded saved browser fingerprint configuration");
                }
                else {
                    logger.warn({ fingerprintFile }, "Fingerprint configuration file does not contain an object, will create new fingerprint");
                }
            }
            catch (e) {
                // Best effort: an unreadable or corrupt file just means a fresh fingerprint
                logger.warn({ error: e }, "Unable to load fingerprint configuration file, will create new fingerprint");
            }
        }
    }
    else {
        logger.info({ stateFile: config.stateFile }, "Browser state file not found, will create new browser session and fingerprint");
    }
    // Get random device configuration or use saved configuration
    const getRandomDeviceConfig = () => {
        const deviceList = [
            "Desktop Chrome",
            "Desktop Edge",
            "Desktop Firefox",
            "Desktop Safari",
        ];
        const savedDeviceName = savedState.fingerprint?.deviceName;
        if (savedDeviceName && devices[savedDeviceName]) {
            // Reuse the saved device so the fingerprint stays stable across runs
            return [savedDeviceName, devices[savedDeviceName]];
        }
        // Otherwise randomly select a device
        const randomDevice = deviceList[Math.floor(Math.random() * deviceList.length)];
        return [randomDevice, devices[randomDevice]];
    };
    return {
        storageState,
        savedState,
        fingerprintFile,
        googleDomains: GOOGLE_DOMAINS,
        getRandomDeviceConfig,
        getRandomDelay,
    };
}
121
+ /**
122
+ * Execute Google search and return results
123
+ * @param query Search keywords
124
+ * @param options Search options
125
+ * @returns Search results
126
+ */
127
+ export async function googleSearch(query, options = {}, existingBrowser) {
128
+ // Set default options with secure state file path
129
+ const { limit = 20, timeout = 60000, stateFile: userStateFile, noSaveState = false, locale = "en-US", // Default to English
130
+ } = options;
131
+ // Resolve and validate state file path
132
+ const stateFile = resolveStateFilePath(userStateFile);
133
+ // Ignore the passed headless parameter, always start in headless mode
134
+ const useHeadless = true;
135
+ logger.info({ options: { ...options, stateFile } }, "Initializing browser...");
136
+ // Setup browser environment
137
+ const { storageState, savedState, fingerprintFile, googleDomains, getRandomDeviceConfig, getRandomDelay, } = setupBrowserEnvironment({
138
+ stateFile,
139
+ timeout,
140
+ locale,
141
+ noSaveState,
142
+ });
143
+ // Define a function to perform search, can be reused for headless and headed modes
144
+ async function performSearch(headless) {
145
+ let browser;
146
+ let browserWasProvided = false;
147
+ if (existingBrowser) {
148
+ browser = existingBrowser;
149
+ browserWasProvided = true;
150
+ logger.info("Using existing browser instance");
151
+ }
152
+ else {
153
+ logger.info({ headless }, `Preparing to start browser in ${headless ? "headless" : "headed"} mode...`);
154
+ // Initialize browser with secure arguments from shared config
155
+ browser = await chromium.launch({
156
+ headless,
157
+ timeout: timeout * 2, // Increase browser startup timeout
158
+ args: SecureBrowserConfig.getDefaultSearchArgs(),
159
+ ignoreDefaultArgs: ["--enable-automation"],
160
+ });
161
+ logger.info("Browser started successfully!");
162
+ }
163
+ // Get device configuration - use saved or randomly generated
164
+ const [deviceName, deviceConfig] = getRandomDeviceConfig();
165
+ // Create browser context options
166
+ let contextOptions = {
167
+ ...deviceConfig,
168
+ };
169
+ // If there is saved fingerprint configuration, use it; otherwise use the actual settings of the host machine
170
+ if (savedState.fingerprint) {
171
+ contextOptions = {
172
+ ...contextOptions,
173
+ locale: savedState.fingerprint.locale,
174
+ timezoneId: savedState.fingerprint.timezoneId,
175
+ colorScheme: savedState.fingerprint.colorScheme,
176
+ reducedMotion: savedState.fingerprint.reducedMotion,
177
+ forcedColors: savedState.fingerprint.forcedColors,
178
+ };
179
+ logger.info("Using saved browser fingerprint configuration");
180
+ }
181
+ else {
182
+ // Get the actual settings of the host machine
183
+ const hostConfig = getHostMachineConfig(locale);
184
+ // If need to use different device type, re-get device configuration
185
+ if (hostConfig.deviceName !== deviceName) {
186
+ logger.info({ deviceType: hostConfig.deviceName }, "Use device type based on host machine settings");
187
+ // Use new device configuration
188
+ contextOptions = { ...devices[hostConfig.deviceName] };
189
+ }
190
+ contextOptions = {
191
+ ...contextOptions,
192
+ locale: hostConfig.locale,
193
+ timezoneId: hostConfig.timezoneId,
194
+ colorScheme: hostConfig.colorScheme,
195
+ reducedMotion: hostConfig.reducedMotion,
196
+ forcedColors: hostConfig.forcedColors,
197
+ };
198
+ // Save newly generated fingerprint configuration
199
+ savedState.fingerprint = hostConfig;
200
+ logger.info({
201
+ locale: hostConfig.locale,
202
+ timezone: hostConfig.timezoneId,
203
+ colorScheme: hostConfig.colorScheme,
204
+ deviceType: hostConfig.deviceName,
205
+ }, "New browser fingerprint configuration generated based on host machine");
206
+ }
207
+ // Add general options - ensure desktop configuration is used
208
+ contextOptions = {
209
+ ...contextOptions,
210
+ permissions: ["geolocation", "notifications"],
211
+ acceptDownloads: true,
212
+ isMobile: false, // Force desktop mode
213
+ hasTouch: false, // Disable touch functionality
214
+ javaScriptEnabled: true,
215
+ };
216
+ if (storageState) {
217
+ logger.info("Loading saved browser state...");
218
+ }
219
+ const context = await browser.newContext(storageState ? { ...contextOptions, storageState } : contextOptions);
220
+ // Set additional browser properties to avoid detection
221
+ await context.addInitScript(() => {
222
+ // Override navigator properties
223
+ Object.defineProperty(navigator, "webdriver", { get: () => false });
224
+ Object.defineProperty(navigator, "plugins", {
225
+ get: () => [1, 2, 3, 4, 5],
226
+ });
227
+ Object.defineProperty(navigator, "languages", {
228
+ get: () => ["en-US", "en"],
229
+ });
230
+ // Override window properties
231
+ // @ts-expect-error - Ignore error that chrome property does not exist
232
+ window.chrome = {
233
+ runtime: {},
234
+ loadTimes: function () { },
235
+ csi: function () { },
236
+ app: {},
237
+ };
238
+ // Add WebGL fingerprint randomization
239
+ if (typeof WebGLRenderingContext !== "undefined") {
240
+ const getParameter = WebGLRenderingContext.prototype.getParameter;
241
+ WebGLRenderingContext.prototype.getParameter = function (parameter) {
242
+ // Randomize UNMASKED_VENDOR_WEBGL and UNMASKED_RENDERER_WEBGL
243
+ if (parameter === 37445) {
244
+ return "Intel Inc.";
245
+ }
246
+ if (parameter === 37446) {
247
+ return "Intel Iris OpenGL Engine";
248
+ }
249
+ return getParameter.call(this, parameter);
250
+ };
251
+ }
252
+ });
253
+ const page = await context.newPage();
254
+ // Set additional page properties
255
+ await page.addInitScript(() => {
256
+ // Simulate real screen size and color depth
257
+ Object.defineProperty(window.screen, "width", { get: () => 1920 });
258
+ Object.defineProperty(window.screen, "height", { get: () => 1080 });
259
+ Object.defineProperty(window.screen, "colorDepth", { get: () => 24 });
260
+ Object.defineProperty(window.screen, "pixelDepth", { get: () => 24 });
261
+ });
262
+ try {
263
+ // Use saved Google domain or randomly select one
264
+ let selectedDomain;
265
+ if (savedState.googleDomain) {
266
+ selectedDomain = savedState.googleDomain;
267
+ logger.info({ domain: selectedDomain }, "Using saved Google domain");
268
+ }
269
+ else {
270
+ const domains = [...googleDomains];
271
+ selectedDomain = domains[Math.floor(Math.random() * domains.length)];
272
+ // Save selected domain
273
+ savedState.googleDomain = selectedDomain;
274
+ logger.info({ domain: selectedDomain }, "Randomly select Google domain");
275
+ }
276
+ logger.info("Accessing Google search page...");
277
+ // Visit Google search page
278
+ const response = await page.goto(selectedDomain, {
279
+ timeout,
280
+ waitUntil: "load",
281
+ });
282
+ // Check if redirected to human-machine verification page
283
+ const currentUrl = page.url();
284
+ const sorryPatterns = [
285
+ "google.com/sorry/index",
286
+ "google.com/sorry",
287
+ "recaptcha",
288
+ "captcha",
289
+ "unusual traffic",
290
+ ];
291
+ const isBlockedPage = sorryPatterns.some((pattern) => currentUrl.includes(pattern) ||
292
+ (response && response.url().toString().includes(pattern)));
293
+ if (isBlockedPage) {
294
+ if (headless) {
295
+ logger.warn("Detected human-machine verification page, restarting browser in headed mode...");
296
+ // Close current page and context
297
+ await page.close();
298
+ await context.close();
299
+ // If it's an externally provided browser, don't close it, but create a new browser instance
300
+ if (browserWasProvided) {
301
+ logger.info("Encountered human-machine verification when using external browser instance, creating new browser instance...");
302
+ // Create a new browser instance, no longer using the externally provided instance
303
+ const newBrowser = await chromium.launch({
304
+ headless: false, // Use headed mode
305
+ timeout: timeout * 2,
306
+ args: SecureBrowserConfig.getDefaultSearchArgs(),
307
+ ignoreDefaultArgs: ["--enable-automation"],
308
+ });
309
+ // Execute search using new browser instance
310
+ try {
311
+ const tempContext = await newBrowser.newContext(contextOptions);
312
+ const tempPage = await tempContext.newPage();
313
+ // Code to handle human-machine verification can be added here
314
+ // ...
315
+ // Close temporary browser after completion
316
+ await newBrowser.close();
317
+ // Re-execute search
318
+ return performSearch(false);
319
+ }
320
+ catch (error) {
321
+ await newBrowser.close();
322
+ throw error;
323
+ }
324
+ }
325
+ else {
326
+ // If not externally provided browser, close directly and re-execute search
327
+ await browser.close();
328
+ return performSearch(false); // Re-execute search in headed mode
329
+ }
330
+ }
331
+ else {
332
+ logger.warn("Detected human-machine verification page, please complete verification in browser...");
333
+ // Wait for user to complete verification and redirect back to search page
334
+ await page.waitForNavigation({
335
+ timeout: timeout * 2,
336
+ url: (url) => {
337
+ const urlStr = url.toString();
338
+ return sorryPatterns.every((pattern) => !urlStr.includes(pattern));
339
+ },
340
+ });
341
+ logger.info("Human-machine verification completed, continuing search...");
342
+ }
343
+ }
344
+ logger.info({ query }, "Entering search keywords");
345
+ // Wait for search box to appear - try multiple possible selectors
346
+ const searchInputSelectors = [
347
+ "textarea[name='q']",
348
+ "input[name='q']",
349
+ "textarea[title='Search']",
350
+ "input[title='Search']",
351
+ "textarea[aria-label='Search']",
352
+ "input[aria-label='Search']",
353
+ "textarea",
354
+ ];
355
+ let searchInput = null;
356
+ for (const selector of searchInputSelectors) {
357
+ searchInput = await page.$(selector);
358
+ if (searchInput) {
359
+ logger.info({ selector }, "Found search box");
360
+ break;
361
+ }
362
+ }
363
+ if (!searchInput) {
364
+ logger.error("Unable to find search box");
365
+ throw new Error("Unable to find search box");
366
+ }
367
+ // Click search box directly, reduce delay
368
+ await searchInput.click();
369
+ // Input entire query string directly, instead of typing character by character
370
+ await page.keyboard.type(query, { delay: getRandomDelay(10, 30) });
371
+ // Reduce delay before pressing enter
372
+ await page.waitForTimeout(getRandomDelay(100, 300));
373
+ await page.keyboard.press("Enter");
374
+ logger.info("Waiting for page to finish loading...");
375
+ // Wait for page to finish loading
376
+ await page.waitForLoadState("load", { timeout });
377
+ // Check if URL after search is redirected to human-machine verification page
378
+ const searchUrl = page.url();
379
+ const isBlockedAfterSearch = sorryPatterns.some((pattern) => searchUrl.includes(pattern));
380
+ if (isBlockedAfterSearch) {
381
+ if (headless) {
382
+ logger.warn("Detected human-machine verification page after search, restarting browser in headed mode...");
383
+ // Close current page and context
384
+ await page.close();
385
+ await context.close();
386
+ // If it's an externally provided browser, don't close it, but create a new browser instance
387
+ if (browserWasProvided) {
388
+ logger.info("Encountered human-machine verification after search when using external browser instance, creating new browser instance...");
389
+ // Create a new browser instance, no longer using the externally provided instance
390
+ const newBrowser = await chromium.launch({
391
+ headless: false, // Use headed mode
392
+ timeout: timeout * 2,
393
+ args: SecureBrowserConfig.getDefaultSearchArgs(),
394
+ ignoreDefaultArgs: ["--enable-automation"],
395
+ });
396
+ // Execute search using new browser instance
397
+ try {
398
+ const tempContext = await newBrowser.newContext(contextOptions);
399
+ const tempPage = await tempContext.newPage();
400
+ // Code to handle human-machine verification can be added here
401
+ // ...
402
+ // Close temporary browser after completion
403
+ await newBrowser.close();
404
+ // Re-execute search
405
+ return performSearch(false);
406
+ }
407
+ catch (error) {
408
+ await newBrowser.close();
409
+ throw error;
410
+ }
411
+ }
412
+ else {
413
+ // If not externally provided browser, close directly and re-execute search
414
+ await browser.close();
415
+ return performSearch(false); // Re-execute search in headed mode
416
+ }
417
+ }
418
+ else {
419
+ logger.warn("Detected human-machine verification page after search, please complete verification in browser...");
420
+ // Wait for user to complete verification and redirect back to search page
421
+ await page.waitForNavigation({
422
+ timeout: timeout * 2,
423
+ url: (url) => {
424
+ const urlStr = url.toString();
425
+ return sorryPatterns.every((pattern) => !urlStr.includes(pattern));
426
+ },
427
+ });
428
+ logger.info("Human-machine verification completed, continuing search...");
429
+ // Wait for page to reload
430
+ await page.waitForLoadState("load", { timeout });
431
+ }
432
+ }
433
+ logger.info({ url: page.url() }, "Waiting for search results to load...");
434
+ // Try multiple possible search result selectors
435
+ const searchResultSelectors = [
436
+ "#search",
437
+ "#rso",
438
+ ".g",
439
+ "[data-sokoban-container]",
440
+ "div[role='main']",
441
+ ];
442
+ let resultsFound = false;
443
+ for (const selector of searchResultSelectors) {
444
+ try {
445
+ await page.waitForSelector(selector, { timeout: timeout / 2 });
446
+ logger.info({ selector }, "Found search results");
447
+ resultsFound = true;
448
+ break;
449
+ }
450
+ catch {
451
+ // Continue trying next selector
452
+ }
453
+ }
454
+ if (!resultsFound) {
455
+ // If search results cannot be found, check if redirected to human-machine verification page
456
+ const currentUrl = page.url();
457
+ const isBlockedDuringResults = sorryPatterns.some((pattern) => currentUrl.includes(pattern));
458
+ if (isBlockedDuringResults) {
459
+ if (headless) {
460
+ logger.warn("Detected human-machine verification page while waiting for search results, restarting browser in headed mode...");
461
+ // Close current page and context
462
+ await page.close();
463
+ await context.close();
464
+ // If it's an externally provided browser, don't close it, but create a new browser instance
465
+ if (browserWasProvided) {
466
+ logger.info("Encountered human-machine verification while waiting for search results when using external browser instance, creating new browser instance...");
467
+ // Create a new browser instance, no longer using the externally provided instance
468
+ const newBrowser = await chromium.launch({
469
+ headless: false, // Use headed mode
470
+ timeout: timeout * 2,
471
+ args: SecureBrowserConfig.getDefaultSearchArgs(),
472
+ ignoreDefaultArgs: ["--enable-automation"],
473
+ });
474
+ // Execute search using new browser instance
475
+ try {
476
+ const tempContext = await newBrowser.newContext(contextOptions);
477
+ const tempPage = await tempContext.newPage();
478
+ // Code to handle human-machine verification can be added here
479
+ // ...
480
+ // Close temporary browser after completion
481
+ await newBrowser.close();
482
+ // Re-execute search
483
+ return performSearch(false);
484
+ }
485
+ catch (error) {
486
+ await newBrowser.close();
487
+ throw error;
488
+ }
489
+ }
490
+ else {
491
+ // If not externally provided browser, close directly and re-execute search
492
+ await browser.close();
493
+ return performSearch(false); // Re-execute search in headed mode
494
+ }
495
+ }
496
+ else {
497
+ logger.warn("Detected human-machine verification page while waiting for search results, please complete verification in browser...");
498
+ // Wait for user to complete verification and redirect back to search page
499
+ await page.waitForNavigation({
500
+ timeout: timeout * 2,
501
+ url: (url) => {
502
+ const urlStr = url.toString();
503
+ return sorryPatterns.every((pattern) => !urlStr.includes(pattern));
504
+ },
505
+ });
506
+ logger.info("Human-machine verification completed, continuing search...");
507
+ // Try waiting for search results again
508
+ for (const selector of searchResultSelectors) {
509
+ try {
510
+ await page.waitForSelector(selector, { timeout: timeout / 2 });
511
+ logger.info({ selector }, "Found search results after verification");
512
+ resultsFound = true;
513
+ break;
514
+ }
515
+ catch {
516
+ // Continue trying next selector
517
+ }
518
+ }
519
+ if (!resultsFound) {
520
+ logger.error("Unable to find search result elements");
521
+ throw new Error("Unable to find search result elements");
522
+ }
523
+ }
524
+ }
525
+ else {
526
+ // If not a human-machine verification issue, throw error
527
+ logger.error("Unable to find search result elements");
528
+ throw new Error("Unable to find search result elements");
529
+ }
530
+ }
531
+ // Reduce waiting time
532
+ await page.waitForTimeout(getRandomDelay(200, 500));
533
+ logger.info("Extracting search results...");
534
+ // Track search start time for performance metrics
535
+ const searchStartTime = Date.now();
536
+ // Extract search results
537
+ const results = await page.evaluate((maxResults) => {
538
+ const results = [];
539
+ const seenUrls = new Set(); // For deduplication
540
+ // Helper function to extract domain from URL
541
+ const extractDomain = (url) => {
542
+ try {
543
+ const urlObj = new URL(url);
544
+ return urlObj.hostname.replace(/^www\./, "");
545
+ }
546
+ catch {
547
+ return "";
548
+ }
549
+ };
550
+ // Helper function to detect content type from URL
551
+ const detectContentType = (url) => {
552
+ const lowerUrl = url.toLowerCase();
553
+ if (lowerUrl.endsWith(".pdf") || lowerUrl.includes(".pdf?"))
554
+ return "pdf";
555
+ if (lowerUrl.endsWith(".doc") ||
556
+ lowerUrl.endsWith(".docx") ||
557
+ lowerUrl.endsWith(".txt"))
558
+ return "doc";
559
+ if (lowerUrl.includes("youtube.com") ||
560
+ lowerUrl.includes("vimeo.com") ||
561
+ lowerUrl.endsWith(".mp4") ||
562
+ lowerUrl.endsWith(".avi") ||
563
+ lowerUrl.includes("/video/"))
564
+ return "video";
565
+ if (lowerUrl.endsWith(".jpg") ||
566
+ lowerUrl.endsWith(".jpeg") ||
567
+ lowerUrl.endsWith(".png") ||
568
+ lowerUrl.endsWith(".gif") ||
569
+ lowerUrl.endsWith(".webp") ||
570
+ lowerUrl.includes("/image/"))
571
+ return "image";
572
+ return "html";
573
+ };
574
+ // Helper function to detect if URL is secure
575
+ const isSecureUrl = (url) => {
576
+ return url.startsWith("https://");
577
+ };
578
+ // Helper function to count words in text
579
+ const countWords = (text) => {
580
+ return text
581
+ .trim()
582
+ .split(/\s+/)
583
+ .filter((word) => word.length > 0).length;
584
+ };
585
+ // Define multiple selector sets, sorted by priority
586
+ const selectorSets = [
587
+ {
588
+ container: "#search div[data-hveid]",
589
+ title: "h3",
590
+ snippet: ".VwiC3b",
591
+ },
592
+ {
593
+ container: "#rso div[data-hveid]",
594
+ title: "h3",
595
+ snippet: '[data-sncf="1"]',
596
+ },
597
+ {
598
+ container: ".g",
599
+ title: "h3",
600
+ snippet: 'div[style*="webkit-line-clamp"]',
601
+ },
602
+ {
603
+ container: "div[jscontroller][data-hveid]",
604
+ title: "h3",
605
+ snippet: 'div[role="text"]',
606
+ },
607
+ ];
608
+ // Alternative snippet selectors
609
+ const alternativeSnippetSelectors = [
610
+ ".VwiC3b",
611
+ '[data-sncf="1"]',
612
+ 'div[style*="webkit-line-clamp"]',
613
+ 'div[role="text"]',
614
+ ];
615
+ // Try each selector set
616
+ for (const selectors of selectorSets) {
617
+ if (results.length >= maxResults)
618
+ break; // Stop if quantity limit reached
619
+ const containers = document.querySelectorAll(selectors.container);
620
+ for (const container of containers) {
621
+ if (results.length >= maxResults)
622
+ break;
623
+ const titleElement = container.querySelector(selectors.title);
624
+ if (!titleElement)
625
+ continue;
626
+ const title = (titleElement.textContent || "").trim();
627
+ // Find link
628
+ let link = "";
629
+ const linkInTitle = titleElement.querySelector("a");
630
+ if (linkInTitle) {
631
+ link = linkInTitle.href;
632
+ }
633
+ else {
634
+ let current = titleElement;
635
+ while (current && current.tagName !== "A") {
636
+ current = current.parentElement;
637
+ }
638
+ if (current && current instanceof HTMLAnchorElement) {
639
+ link = current.href;
640
+ }
641
+ else {
642
+ const containerLink = container.querySelector("a");
643
+ if (containerLink) {
644
+ link = containerLink.href;
645
+ }
646
+ }
647
+ }
648
+ // Filter invalid or duplicate links
649
+ if (!link || !link.startsWith("http") || seenUrls.has(link))
650
+ continue;
651
+ // Find snippet
652
+ let snippet = "";
653
+ const snippetElement = container.querySelector(selectors.snippet);
654
+ if (snippetElement) {
655
+ snippet = (snippetElement.textContent || "").trim();
656
+ }
657
+ else {
658
+ // Try other snippet selectors
659
+ for (const altSelector of alternativeSnippetSelectors) {
660
+ const element = container.querySelector(altSelector);
661
+ if (element) {
662
+ snippet = (element.textContent || "").trim();
663
+ break;
664
+ }
665
+ }
666
+ // If still no snippet found, try general method
667
+ if (!snippet) {
668
+ const textNodes = Array.from(container.querySelectorAll("div")).filter((el) => !el.querySelector("h3") &&
669
+ (el.textContent || "").trim().length > 20);
670
+ if (textNodes.length > 0) {
671
+ snippet = (textNodes[0].textContent || "").trim();
672
+ }
673
+ }
674
+ }
675
+ // Only add results with title and link
676
+ if (title && link) {
677
+ // Check for rich snippet indicators
678
+ const hasRichSnippet = !!(container.querySelector(".kno-rdesc") || // Knowledge panel
679
+ container.querySelector("[data-attrid]") || // Structured data
680
+ container.querySelector(".commercial-unit-desktop-top") || // Featured snippets
681
+ container.querySelector('[role="heading"]') // FAQ/structured content
682
+ );
683
+ const contentType = detectContentType(link);
684
+ const wordCount = countWords(title + " " + snippet);
685
+ results.push({
686
+ title,
687
+ link,
688
+ snippet,
689
+ domain: extractDomain(link),
690
+ position: results.length + 1,
691
+ snippetLength: snippet.length,
692
+ hasRichSnippet,
693
+ contentType,
694
+ isSecure: isSecureUrl(link),
695
+ hasImages: container.querySelector("img") !== null,
696
+ wordCount,
697
+ language: "en", // Default to English
698
+ });
699
+ seenUrls.add(link); // Record processed URL
700
+ }
701
+ }
702
+ }
703
+ // If main selectors didn't find enough results, try more general method (as supplement)
704
+ if (results.length < maxResults) {
705
+ const anchorElements = Array.from(document.querySelectorAll("a[href^='http']"));
706
+ for (const el of anchorElements) {
707
+ if (results.length >= maxResults)
708
+ break;
709
+ // Check if el is HTMLAnchorElement
710
+ if (!(el instanceof HTMLAnchorElement)) {
711
+ continue;
712
+ }
713
+ const link = el.href;
714
+ // Filter out navigation links, image links, existing links, etc.
715
+ if (!link ||
716
+ seenUrls.has(link) ||
717
+ link.includes("google.com/") ||
718
+ link.includes("accounts.google") ||
719
+ link.includes("support.google")) {
720
+ continue;
721
+ }
722
+ const title = (el.textContent || "").trim();
723
+ if (!title)
724
+ continue; // Skip links without text content
725
+ // Try to get surrounding text as snippet
726
+ let snippet = "";
727
+ let parent = el.parentElement;
728
+ for (let i = 0; i < 3 && parent; i++) {
729
+ const text = (parent.textContent || "").trim();
730
+ // Ensure snippet text differs from title and has certain length
731
+ if (text.length > 20 && text !== title) {
732
+ snippet = text;
733
+ break; // Stop upward search when suitable snippet found
734
+ }
735
+ parent = parent.parentElement;
736
+ }
737
+ const contentType = detectContentType(link);
738
+ const wordCount = countWords(title + " " + snippet);
739
+ results.push({
740
+ title,
741
+ link,
742
+ snippet,
743
+ domain: extractDomain(link),
744
+ position: results.length + 1,
745
+ snippetLength: snippet.length,
746
+ hasRichSnippet: false,
747
+ contentType,
748
+ isSecure: isSecureUrl(link),
749
+ hasImages: false,
750
+ wordCount,
751
+ language: "en",
752
+ });
753
+ seenUrls.add(link);
754
+ }
755
+ }
756
+ return results.slice(0, maxResults); // Ensure not exceeding limit
757
+ }, limit); // Pass limit to evaluate function
758
+ const searchTime = Date.now() - searchStartTime;
759
+ logger.info({ count: results.length, searchTimeMs: searchTime }, "Successfully obtained search results");
760
+ try {
761
+ // Save browser state (unless user specifies not to save)
762
+ if (!noSaveState) {
763
+ logger.info({ stateFile }, "Saving browser state...");
764
+ // Ensure directory exists
765
+ const stateDir = path.dirname(stateFile);
766
+ if (!fs.existsSync(stateDir)) {
767
+ fs.mkdirSync(stateDir, { recursive: true });
768
+ }
769
+ // Save state
770
+ await context.storageState({ path: stateFile });
771
+ logger.info("Browser state saved successfully!");
772
+ // Save fingerprint configuration
773
+ try {
774
+ fs.writeFileSync(fingerprintFile, JSON.stringify(savedState, null, 2), "utf8");
775
+ logger.info({ fingerprintFile }, "Fingerprint configuration saved");
776
+ }
777
+ catch (fingerprintError) {
778
+ logger.error({ error: fingerprintError }, "Error saving fingerprint configuration");
779
+ }
780
+ }
781
+ else {
782
+ logger.info("Not saving browser state according to user settings");
783
+ }
784
+ }
785
+ catch (error) {
786
+ logger.error({ error }, "Error saving browser state");
787
+ }
788
+ // Only close browser if it's not externally provided
789
+ if (!browserWasProvided) {
790
+ logger.info("Closing browser...");
791
+ await browser.close();
792
+ }
793
+ else {
794
+ logger.info("Keeping browser instance open");
795
+ }
796
+ // Return search results with metadata
797
+ const resultSummary = results.length > 0
798
+ ? `Found ${results.length} results${results[0].hasRichSnippet ? " (including rich snippets)" : ""}`
799
+ : "No results found";
800
+ return {
801
+ query,
802
+ results,
803
+ totalResults: results.length,
804
+ searchTime,
805
+ timestamp: new Date().toISOString(),
806
+ resultSummary,
807
+ };
808
+ }
809
+ catch (error) {
810
+ logger.error({ error }, "Error occurred during search");
811
+ try {
812
+ // Try to save browser state, even if error occurred
813
+ if (!noSaveState) {
814
+ logger.info({ stateFile }, "Saving browser state...");
815
+ const stateDir = path.dirname(stateFile);
816
+ if (!fs.existsSync(stateDir)) {
817
+ fs.mkdirSync(stateDir, { recursive: true });
818
+ }
819
+ await context.storageState({ path: stateFile });
820
+ // Save fingerprint configuration
821
+ try {
822
+ fs.writeFileSync(fingerprintFile, JSON.stringify(savedState, null, 2), "utf8");
823
+ logger.info({ fingerprintFile }, "Fingerprint configuration saved");
824
+ }
825
+ catch (fingerprintError) {
826
+ logger.error({ error: fingerprintError }, "Error saving fingerprint configuration");
827
+ }
828
+ }
829
+ }
830
+ catch (stateError) {
831
+ logger.error({ error: stateError }, "Error saving browser state");
832
+ }
833
+ // Only close browser if it's not externally provided
834
+ if (!browserWasProvided) {
835
+ logger.info("Closing browser...");
836
+ await browser.close();
837
+ }
838
+ else {
839
+ logger.info("Keeping browser instance open");
840
+ }
841
+ // Re-throw the error instead of returning error results
842
+ throw error;
843
+ }
844
+ // Remove finally block, as resource cleanup is already handled in try and catch blocks
845
+ }
846
+ // First try to execute search in headless mode
847
+ return performSearch(useHeadless);
848
+ }
849
/**
 * Get the raw HTML of a Google search result page.
 *
 * Launches Chromium, opens a Google domain, types the query into the search
 * box, waits for the result page to load and captures its HTML. A full-page
 * screenshot is always written to `./google-search-screenshots/` under the
 * current working directory, regardless of `saveToFile`.
 *
 * @param query Search keywords
 * @param options Search options. Fields read here: `timeout` (ms, default 60000;
 *   the browser launch uses twice this value), `stateFile`, `noSaveState`
 *   (default false) and `locale` (default "en-US"). Any `headless` option is
 *   ignored: this function always runs headless.
 * @param saveToFile Whether to save the unmodified HTML to a file (optional)
 * @param outputPath HTML output file path (optional, defaults to
 *   './google-search-html/[query]-[timestamp].html')
 * @returns Response object containing `query`, `html` (page HTML with
 *   `<script>` and `<style>` elements removed), `url`, `savedPath`,
 *   `screenshotPath` and `originalHtmlLength` (length of the unmodified HTML)
 * @throws Error when the search box cannot be found; any error raised while
 *   driving the browser is logged and re-thrown
 */
export async function getGoogleSearchPageHtml(query, options = {}, saveToFile = false, outputPath) {
    // Set default options, consistent with googleSearch
    const { timeout = 60000, stateFile: userStateFile, noSaveState = false, locale = "en-US", // Default to English
     } = options;
    // Resolve and validate state file path
    const stateFile = resolveStateFilePath(userStateFile);
    // Ignore passed headless parameter, always start in headless mode
    const useHeadless = true;
    logger.info({ options: { ...options, stateFile } }, "Initializing browser to get search page HTML...");
    // Setup browser environment
    const { storageState, savedState, fingerprintFile, googleDomains, getRandomDeviceConfig, getRandomDelay, } = setupBrowserEnvironment({
        stateFile,
        timeout,
        locale,
        noSaveState,
    });
    // Dedicated function that drives one browser session and returns the HTML
    async function performSearchAndGetHtml(headless) {
        // Initialize browser with secure arguments from shared config
        const browser = await chromium.launch({
            headless,
            timeout: timeout * 2, // Increase browser startup timeout
            args: SecureBrowserConfig.getDefaultSearchArgs(),
            ignoreDefaultArgs: ["--enable-automation"],
        });
        // Everything after launch runs inside try/finally so the browser is
        // closed even when context or page creation fails.
        try {
            // Get device configuration - only the config half of the pair is needed
            const [, deviceConfig] = getRandomDeviceConfig();
            // Create browser context options
            let contextOptions = {
                ...deviceConfig,
            };
            // If there is saved fingerprint configuration, use it; otherwise use the actual settings of the host machine
            if (savedState.fingerprint) {
                contextOptions = {
                    ...contextOptions,
                    locale: savedState.fingerprint.locale,
                    timezoneId: savedState.fingerprint.timezoneId,
                    colorScheme: savedState.fingerprint.colorScheme,
                    reducedMotion: savedState.fingerprint.reducedMotion,
                    forcedColors: savedState.fingerprint.forcedColors,
                };
                logger.info("Using saved browser fingerprint configuration");
            }
            else {
                // Get the actual settings of the host machine
                const hostConfig = getHostMachineConfig(locale);
                contextOptions = {
                    ...contextOptions,
                    locale: hostConfig.locale,
                    timezoneId: hostConfig.timezoneId,
                    colorScheme: hostConfig.colorScheme,
                    reducedMotion: hostConfig.reducedMotion,
                    forcedColors: hostConfig.forcedColors,
                };
                // Remember the newly generated fingerprint so it is written out with the state
                savedState.fingerprint = hostConfig;
                logger.info({
                    locale: hostConfig.locale,
                    timezone: hostConfig.timezoneId,
                    colorScheme: hostConfig.colorScheme,
                    deviceType: hostConfig.deviceName,
                }, "New browser fingerprint configuration generated based on host machine");
            }
            // Add general options - ensure desktop configuration is used
            contextOptions = {
                ...contextOptions,
                permissions: ["geolocation", "notifications"],
                acceptDownloads: true,
                isMobile: false, // Force desktop mode
                hasTouch: false, // Disable touch functionality
                javaScriptEnabled: true,
            };
            const context = await browser.newContext(storageState ? { ...contextOptions, storageState } : contextOptions);
            // Set additional browser properties to avoid detection
            await context.addInitScript(() => {
                // Override navigator properties
                Object.defineProperty(navigator, "webdriver", { get: () => false });
                Object.defineProperty(navigator, "plugins", {
                    get: () => [1, 2, 3, 4, 5],
                });
                Object.defineProperty(navigator, "languages", {
                    get: () => ["en-US", "en"],
                });
                // Override window properties
                // @ts-expect-error - Ignore error that chrome property does not exist
                window.chrome = {
                    runtime: {},
                    loadTimes: function () { },
                    csi: function () { },
                    app: {},
                };
            });
            const page = await context.newPage();
            // Set additional page properties
            await page.addInitScript(() => {
                // Simulate real screen size and color depth
                Object.defineProperty(window.screen, "width", { get: () => 1920 });
                Object.defineProperty(window.screen, "height", { get: () => 1080 });
                Object.defineProperty(window.screen, "colorDepth", { get: () => 24 });
                Object.defineProperty(window.screen, "pixelDepth", { get: () => 24 });
            });
            // Use saved Google domain or randomly select one
            let selectedDomain;
            if (savedState.googleDomain) {
                selectedDomain = savedState.googleDomain;
                logger.info({ domain: selectedDomain }, "Using saved Google domain");
            }
            else {
                const domains = [...googleDomains];
                selectedDomain = domains[Math.floor(Math.random() * domains.length)];
                // Save selected domain
                savedState.googleDomain = selectedDomain;
                logger.info({ domain: selectedDomain }, "Randomly select Google domain");
            }
            logger.info("Accessing Google search page...");
            // Visit Google search page
            await page.goto(selectedDomain, {
                timeout,
                waitUntil: "load",
            });
            logger.info({ query }, "Entering search keywords");
            // Probe for the search box - try multiple possible selectors, most specific first
            const searchInputSelectors = [
                "textarea[name='q']",
                "input[name='q']",
                "textarea[title='Search']",
                "input[title='Search']",
                "textarea[aria-label='Search']",
                "input[aria-label='Search']",
                "textarea",
            ];
            let searchInput = null;
            for (const selector of searchInputSelectors) {
                searchInput = await page.$(selector);
                if (searchInput) {
                    logger.info({ selector }, "Found search box");
                    break;
                }
            }
            if (!searchInput) {
                logger.error("Unable to find search box");
                throw new Error("Unable to find search box");
            }
            // Focus the search box
            await searchInput.click();
            // Type the query with a small random per-keystroke delay
            await page.keyboard.type(query, { delay: getRandomDelay(10, 30) });
            // Short pause before pressing enter
            await page.waitForTimeout(getRandomDelay(100, 300));
            await page.keyboard.press("Enter");
            logger.info("Waiting for page to finish loading...");
            // Wait for page to finish loading
            await page.waitForLoadState("load", { timeout });
            // Capture the URL while the page is still open
            const finalUrl = page.url();
            logger.info({ url: finalUrl }, "Search page loaded successfully");
            // Get page HTML content
            const html = await page.content();
            const timestamp = Date.now();
            // File-name-safe form of the query, capped at 50 characters
            const sanitizedQuery = query.replace(/[^a-zA-Z0-9]/g, "_").slice(0, 50);
            // Determine output path
            let finalOutputPath = outputPath;
            if (saveToFile && !finalOutputPath) {
                // Default output directory
                const outputDir = path.join(process.cwd(), "google-search-html");
                finalOutputPath = path.join(outputDir, `${sanitizedQuery}-${timestamp}.html`);
            }
            // Save HTML file if requested
            if (saveToFile && finalOutputPath) {
                // Also covers a caller-supplied outputPath whose directory does not exist yet
                fs.mkdirSync(path.dirname(finalOutputPath), { recursive: true });
                fs.writeFileSync(finalOutputPath, html, "utf8");
                logger.info({ path: finalOutputPath }, "HTML saved to file");
            }
            // Take screenshot (always, independent of saveToFile)
            const screenshotDir = path.join(process.cwd(), "google-search-screenshots");
            if (!fs.existsSync(screenshotDir)) {
                fs.mkdirSync(screenshotDir, { recursive: true });
            }
            const screenshotPath = path.join(screenshotDir, `${sanitizedQuery}-${timestamp}.png`);
            await page.screenshot({ path: screenshotPath, fullPage: true });
            logger.info({ path: screenshotPath }, "Screenshot saved");
            // Save browser state (unless user specifies not to save). This must
            // happen BEFORE the browser is closed: storageState() needs a live context.
            if (!noSaveState) {
                try {
                    const stateDir = path.dirname(stateFile);
                    if (!fs.existsSync(stateDir)) {
                        fs.mkdirSync(stateDir, { recursive: true });
                    }
                    await context.storageState({ path: stateFile });
                    fs.writeFileSync(fingerprintFile, JSON.stringify(savedState, null, 2), "utf8");
                }
                catch (saveError) {
                    // Best effort: failing to persist state must not discard the captured HTML
                    logger.error({ error: saveError }, "Error saving browser state");
                }
            }
            // Clean up HTML (remove script and style tags for cleaner output).
            // This is cosmetic trimming, not a security sanitizer.
            const cleanedHtml = html
                .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "")
                .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "");
            return {
                query,
                html: cleanedHtml,
                url: finalUrl,
                savedPath: finalOutputPath,
                screenshotPath,
                originalHtmlLength: html.length,
            };
        }
        catch (error) {
            logger.error({ error }, "Error occurred during HTML retrieval");
            throw error;
        }
        finally {
            // Always release the browser; a failure to close is logged so it
            // cannot mask the original error or discard a successful result.
            try {
                await browser.close();
            }
            catch (closeError) {
                logger.error({ error: closeError }, "Error closing browser");
            }
        }
    }
    // Execute in headless mode
    return performSearchAndGetHtml(useHeadless);
}
1080
+ //# sourceMappingURL=search.js.map