llm-search-tools 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126)
  1. package/LICENSE +21 -0
  2. package/README.md +244 -0
  3. package/dist/index.d.ts +18 -0
  4. package/dist/index.js +40 -0
  5. package/dist/index.js.map +1 -0
  6. package/dist/integration.test.d.ts +1 -0
  7. package/dist/integration.test.js +237 -0
  8. package/dist/modules/answerbox.test.d.ts +1 -0
  9. package/dist/modules/answerbox.test.js +105 -0
  10. package/dist/modules/autocomplete.d.ts +11 -0
  11. package/dist/modules/autocomplete.js +159 -0
  12. package/dist/modules/autocomplete.test.d.ts +1 -0
  13. package/dist/modules/autocomplete.test.js +188 -0
  14. package/dist/modules/common.d.ts +26 -0
  15. package/dist/modules/common.js +263 -0
  16. package/dist/modules/common.test.d.ts +1 -0
  17. package/dist/modules/common.test.js +87 -0
  18. package/dist/modules/crawl.d.ts +9 -0
  19. package/dist/modules/crawl.js +117 -0
  20. package/dist/modules/crawl.test.d.ts +1 -0
  21. package/dist/modules/crawl.test.js +48 -0
  22. package/dist/modules/events.d.ts +8 -0
  23. package/dist/modules/events.js +129 -0
  24. package/dist/modules/events.test.d.ts +1 -0
  25. package/dist/modules/events.test.js +104 -0
  26. package/dist/modules/finance.d.ts +10 -0
  27. package/dist/modules/finance.js +20 -0
  28. package/dist/modules/finance.test.d.ts +1 -0
  29. package/dist/modules/finance.test.js +77 -0
  30. package/dist/modules/flights.d.ts +8 -0
  31. package/dist/modules/flights.js +135 -0
  32. package/dist/modules/flights.test.d.ts +1 -0
  33. package/dist/modules/flights.test.js +128 -0
  34. package/dist/modules/hackernews.d.ts +8 -0
  35. package/dist/modules/hackernews.js +87 -0
  36. package/dist/modules/hackernews.js.map +1 -0
  37. package/dist/modules/images.test.d.ts +1 -0
  38. package/dist/modules/images.test.js +145 -0
  39. package/dist/modules/integrations.test.d.ts +1 -0
  40. package/dist/modules/integrations.test.js +93 -0
  41. package/dist/modules/media.d.ts +11 -0
  42. package/dist/modules/media.js +132 -0
  43. package/dist/modules/media.test.d.ts +1 -0
  44. package/dist/modules/media.test.js +186 -0
  45. package/dist/modules/news.d.ts +3 -0
  46. package/dist/modules/news.js +39 -0
  47. package/dist/modules/news.test.d.ts +1 -0
  48. package/dist/modules/news.test.js +88 -0
  49. package/dist/modules/parser.d.ts +19 -0
  50. package/dist/modules/parser.js +361 -0
  51. package/dist/modules/parser.test.d.ts +1 -0
  52. package/dist/modules/parser.test.js +151 -0
  53. package/dist/modules/reddit.d.ts +21 -0
  54. package/dist/modules/reddit.js +107 -0
  55. package/dist/modules/scrape.d.ts +16 -0
  56. package/dist/modules/scrape.js +272 -0
  57. package/dist/modules/scrape.test.d.ts +1 -0
  58. package/dist/modules/scrape.test.js +232 -0
  59. package/dist/modules/scraper.d.ts +12 -0
  60. package/dist/modules/scraper.js +640 -0
  61. package/dist/modules/scrapers/anidb.d.ts +8 -0
  62. package/dist/modules/scrapers/anidb.js +156 -0
  63. package/dist/modules/scrapers/duckduckgo.d.ts +6 -0
  64. package/dist/modules/scrapers/duckduckgo.js +284 -0
  65. package/dist/modules/scrapers/google-news.d.ts +2 -0
  66. package/dist/modules/scrapers/google-news.js +60 -0
  67. package/dist/modules/scrapers/google.d.ts +6 -0
  68. package/dist/modules/scrapers/google.js +211 -0
  69. package/dist/modules/scrapers/searxng.d.ts +2 -0
  70. package/dist/modules/scrapers/searxng.js +93 -0
  71. package/dist/modules/scrapers/thetvdb.d.ts +3 -0
  72. package/dist/modules/scrapers/thetvdb.js +147 -0
  73. package/dist/modules/scrapers/tmdb.d.ts +3 -0
  74. package/dist/modules/scrapers/tmdb.js +172 -0
  75. package/dist/modules/scrapers/yahoo-finance.d.ts +2 -0
  76. package/dist/modules/scrapers/yahoo-finance.js +33 -0
  77. package/dist/modules/search.d.ts +5 -0
  78. package/dist/modules/search.js +45 -0
  79. package/dist/modules/search.js.map +1 -0
  80. package/dist/modules/search.test.d.ts +1 -0
  81. package/dist/modules/search.test.js +219 -0
  82. package/dist/modules/urbandictionary.d.ts +12 -0
  83. package/dist/modules/urbandictionary.js +26 -0
  84. package/dist/modules/webpage.d.ts +4 -0
  85. package/dist/modules/webpage.js +150 -0
  86. package/dist/modules/webpage.js.map +1 -0
  87. package/dist/modules/wikipedia.d.ts +5 -0
  88. package/dist/modules/wikipedia.js +85 -0
  89. package/dist/modules/wikipedia.js.map +1 -0
  90. package/dist/scripts/interactive-search.d.ts +1 -0
  91. package/dist/scripts/interactive-search.js +98 -0
  92. package/dist/test.d.ts +1 -0
  93. package/dist/test.js +179 -0
  94. package/dist/test.js.map +1 -0
  95. package/dist/testBraveSearch.d.ts +1 -0
  96. package/dist/testBraveSearch.js +34 -0
  97. package/dist/testDuckDuckGo.d.ts +1 -0
  98. package/dist/testDuckDuckGo.js +52 -0
  99. package/dist/testEcosia.d.ts +1 -0
  100. package/dist/testEcosia.js +57 -0
  101. package/dist/testSearchModule.d.ts +1 -0
  102. package/dist/testSearchModule.js +95 -0
  103. package/dist/testwebpage.d.ts +1 -0
  104. package/dist/testwebpage.js +81 -0
  105. package/dist/types.d.ts +174 -0
  106. package/dist/types.js +3 -0
  107. package/dist/types.js.map +1 -0
  108. package/dist/utils/createTestDocx.d.ts +1 -0
  109. package/dist/utils/createTestDocx.js +58 -0
  110. package/dist/utils/htmlcleaner.d.ts +20 -0
  111. package/dist/utils/htmlcleaner.js +172 -0
  112. package/docs/README.md +275 -0
  113. package/docs/autocomplete.md +73 -0
  114. package/docs/crawling.md +88 -0
  115. package/docs/events.md +58 -0
  116. package/docs/examples.md +158 -0
  117. package/docs/finance.md +60 -0
  118. package/docs/flights.md +71 -0
  119. package/docs/hackernews.md +121 -0
  120. package/docs/media.md +87 -0
  121. package/docs/news.md +75 -0
  122. package/docs/parser.md +197 -0
  123. package/docs/scraper.md +347 -0
  124. package/docs/search.md +106 -0
  125. package/docs/wikipedia.md +91 -0
  126. package/package.json +97 -0
@@ -0,0 +1,640 @@
"use strict";
// scraper.ts - unified scraper with bot detection, proxy support, and content extraction
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
// Public API. Function declarations are hoisted module-wide, so these
// bindings are valid even though the definitions appear further down.
// The *Legacy names are backward-compatible aliases for older callers.
exports.searchGoogle = searchGoogle;
exports.searchGoogleLegacy = searchGoogle;
exports.searchDuckDuckGo = searchDuckDuckGo;
exports.searchDuckDuckGoLegacy = searchDuckDuckGo;
exports.search = search;
exports.getWebpageContent = getWebpageContent;
exports.getWebpageText = getWebpageText;
exports.isUrlAccessible = isUrlAccessible;
// Third-party and sibling-module dependencies.
const google_sr_1 = require("google-sr");
const duck_duck_scrape_1 = require("duck-duck-scrape");
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
const readability_1 = require("@mozilla/readability");
const jsdom_1 = require("jsdom");
const wikipedia_1 = require("./wikipedia");
const hackernews_1 = require("./hackernews");
// Register the stealth plugin so every browser launched below evades
// common headless-detection checks.
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
// Defaults applied to every search call; callers may override any field.
const defaultOptions = {
    limit: 10,              // max results returned
    safeSearch: true,       // strict safe-search by default
    timeout: 10000,         // per-request timeout in ms
    forcePuppeteer: false,  // skip the cheap fetch path when true
    antiBot: {
        enabled: true,
        maxRetries: 3,
        retryDelay: 2000
    }
};
// Rate limiting parameters (milliseconds).
const MIN_DELAY_BETWEEN_SEARCHES = 5000; // minimum gap between DuckDuckGo searches
const GOOGLE_DELAY = 2000;               // minimum gap between Google searches
const MAX_RETRIES = 3;                   // DuckDuckGo retry attempts
const RETRY_DELAY = 2000;                // pause between retries
// Timestamps of the most recent search per engine (0 = never searched).
let lastDDGSearchTime = 0;
let lastGoogleSearchTime = 0;
// In-memory cache of search results, keyed by query + serialized options.
const searchCache = new Map();
const CACHE_TTL = 60 * 60 * 1000; // cache entries expire after 1 hour
46
+ // Bot detection patterns
47
+ const BOT_PROTECTION_PATTERNS = {
48
+ cloudflare: [
49
+ 'cf-ray',
50
+ '__cf_bm',
51
+ 'cloudflare',
52
+ 'challenge-platform',
53
+ 'Just a moment...',
54
+ 'Checking your browser',
55
+ 'DDoS protection by Cloudflare'
56
+ ],
57
+ perimeterx: [
58
+ '_px',
59
+ 'perimeterx',
60
+ 'px-captcha',
61
+ 'PX',
62
+ 'bot-management'
63
+ ],
64
+ akamai: [
65
+ 'akamai',
66
+ 'ak_bmsc',
67
+ 'akamaighost',
68
+ 'akamaized',
69
+ 'edgekey'
70
+ ],
71
+ datadome: [
72
+ 'datadome',
73
+ '__ddg_',
74
+ 'x-datadome',
75
+ 'ddg-',
76
+ 'bot-detection'
77
+ ],
78
+ generic: [
79
+ 'captcha',
80
+ 'recaptcha',
81
+ 'hcaptcha',
82
+ 'access denied',
83
+ '403 forbidden',
84
+ 'rate limit',
85
+ 'too many requests',
86
+ 'blocked',
87
+ 'security check',
88
+ 'unauthorized'
89
+ ]
90
+ };
91
+ // Helper function to detect bot protection
92
+ function detectBotProtection(headers, body) {
93
+ // Check headers
94
+ for (const [key, value] of headers.entries()) {
95
+ const headerContent = `${key}: ${value}`.toLowerCase();
96
+ for (const patterns of Object.values(BOT_PROTECTION_PATTERNS)) {
97
+ for (const pattern of patterns) {
98
+ if (headerContent.includes(pattern.toLowerCase())) {
99
+ return true;
100
+ }
101
+ }
102
+ }
103
+ }
104
+ // Check body content
105
+ const bodyLower = body.toLowerCase();
106
+ for (const patterns of Object.values(BOT_PROTECTION_PATTERNS)) {
107
+ for (const pattern of patterns) {
108
+ if (bodyLower.includes(pattern.toLowerCase())) {
109
+ return true;
110
+ }
111
+ }
112
+ }
113
+ return false;
114
+ }
// Parse proxy configuration into a normalized { url, type } pair.
// Accepts either a proxy URL string ('http://host:port', 'socks5://…') or a
// structured object { type, host, port, auth? }. Returns null when no proxy
// is configured; throws Error('Invalid proxy URL format') on a bad URL string.
function parseProxyConfig(proxy) {
    if (!proxy)
        return null;
    if (typeof proxy === 'string') {
        let parsed;
        try {
            parsed = new URL(proxy);
        }
        catch {
            throw new Error('Invalid proxy URL format');
        }
        // Keep the caller's exact URL; derive the type from the scheme.
        return { url: proxy, type: parsed.protocol.replace(':', '') };
    }
    // Structured config: assemble a proxy URL, embedding credentials if given.
    const credentials = proxy.auth
        ? `${proxy.auth.username}:${proxy.auth.password}@`
        : '';
    return {
        url: `${proxy.type}://${credentials}${proxy.host}:${proxy.port}`,
        type: proxy.type
    };
}
// Create realistic headers for basic requests.
// Returns a fresh header object imitating a desktop browser, with the
// User-Agent chosen at random from a small pool on every call.
function createRealisticHeaders() {
    const USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0'
    ];
    const randomAgent = USER_AGENTS[Math.floor(Math.random() * USER_AGENTS.length)];
    return {
        'User-Agent': randomAgent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Cache-Control': 'max-age=0'
    };
}
// Puppeteer stealth configuration with enhanced options.
// Launches a headless browser hardened for scraping; `proxy` is an optional
// { url, type } pair as produced by parseProxyConfig.
async function createStealthBrowser(proxy) {
    const args = [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-accelerated-2d-canvas',
        '--no-first-run',
        '--no-zygote',
        '--single-process',
        '--disable-gpu',
        '--disable-web-security',
        '--disable-features=VizDisplayCompositor',
        '--ignore-certificate-errors',
        '--ignore-certificate-errors-spki-list'
    ];
    if (proxy) {
        args.push(`--proxy-server=${proxy.url}`);
    }
    const browser = await puppeteer_extra_1.default.launch({ headless: 'new', args });
    // Additional stealth measures: pre-grant (empty) permission sets for the
    // search engines we target so no permission prompts appear.
    const context = browser.defaultBrowserContext();
    await context.overridePermissions('https://www.google.com', []);
    await context.overridePermissions('https://duckduckgo.com', []);
    return browser;
}
// Helper function to enforce rate limiting.
// Sleeps just long enough that consecutive searches against the same engine
// ("ddg" or "google") are spaced by the configured minimum delay, then stamps
// the engine's last-search time.
async function enforceRateLimit(searchType) {
    const isDdg = searchType === "ddg";
    const requiredGap = isDdg ? MIN_DELAY_BETWEEN_SEARCHES : GOOGLE_DELAY;
    const previous = isDdg ? lastDDGSearchTime : lastGoogleSearchTime;
    const elapsed = Date.now() - previous;
    if (elapsed < requiredGap) {
        await new Promise((resolve) => setTimeout(resolve, requiredGap - elapsed));
    }
    if (isDdg) {
        lastDDGSearchTime = Date.now();
    }
    else {
        lastGoogleSearchTime = Date.now();
    }
}
// Helper function to get cache key.
// Combines the query with the JSON-serialized options so different option
// sets never share a cache entry.
function getCacheKey(query, options) {
    return [query, JSON.stringify(options)].join('-');
}
// Fetch with bot detection.
// Performs a plain fetch with realistic browser headers (optionally through a
// proxy agent) and scans the response for bot-protection signatures.
// Returns { headers, body }. Throws Error('Bot protection detected') on a
// signature hit, and structured { message, code, originalError } objects for
// proxy setup/connection failures.
async function fetchWithDetection(url, options) {
    const proxy = parseProxyConfig(options.proxy);
    const headers = createRealisticHeaders();
    // FIX: the WHATWG fetch in Node ignores a bare `timeout` option, so the
    // configured timeout previously never applied. Enforce it with an
    // AbortController instead.
    const controller = new AbortController();
    const timeoutMs = options.timeout || 10000;
    const fetchOptions = {
        headers,
        signal: controller.signal
    };
    if (proxy) {
        try {
            // NOTE(review): Node's built-in fetch does not honor `agent`; this
            // only takes effect if a node-fetch-compatible implementation is in
            // scope — TODO confirm, or migrate to an undici ProxyAgent dispatcher.
            if (proxy.type === 'socks4' || proxy.type === 'socks5') {
                const { SocksProxyAgent } = await import('socks-proxy-agent');
                fetchOptions.agent = new SocksProxyAgent(proxy.url);
            }
            else {
                const { HttpsProxyAgent } = await import('https-proxy-agent');
                fetchOptions.agent = new HttpsProxyAgent(proxy.url);
            }
        }
        catch (error) {
            const errorMessage = error instanceof Error ? error.message : String(error);
            throw {
                message: `Proxy connection failed: ${errorMessage}`,
                code: 'PROXY_CONNECTION_FAILED',
                originalError: error
            };
        }
    }
    const timer = setTimeout(() => controller.abort(), timeoutMs);
    try {
        const response = await fetch(url, fetchOptions);
        const body = await response.text();
        if (detectBotProtection(response.headers, body)) {
            throw new Error('Bot protection detected');
        }
        return {
            headers: response.headers,
            body
        };
    }
    catch (error) {
        // Classify common proxy failures; everything else propagates as-is.
        const errorMessage = error instanceof Error ? error.message : String(error);
        if (errorMessage.includes('407') || errorMessage.includes('authentication')) {
            throw {
                message: 'Proxy authentication failed',
                code: 'PROXY_AUTH_FAILED',
                originalError: error
            };
        }
        if (errorMessage.includes('ECONNREFUSED') || errorMessage.includes('ENOTFOUND')) {
            throw {
                message: 'Proxy connection refused',
                code: 'PROXY_CONNECTION_REFUSED',
                originalError: error
            };
        }
        throw error;
    }
    finally {
        clearTimeout(timer);
    }
}
// Search using Puppeteer.
// Drives a stealth browser to the engine's results page and scrapes
// title/url/snippet straight from the DOM. Used as a fallback when the plain
// fetch path trips bot protection.
// FIX: newPage()/setup previously ran outside the try/finally, leaking the
// browser whenever page creation or navigation threw; everything after launch
// is now inside the guarded region. The two near-identical extraction
// branches are also collapsed into one parameterized pass.
async function searchWithPuppeteer(query, searchType, options) {
    const proxy = parseProxyConfig(options.proxy);
    const browser = await createStealthBrowser(proxy || undefined);
    try {
        const page = await browser.newPage();
        // Set realistic viewport and headers before navigating.
        await page.setViewport({ width: 1920, height: 1080 });
        await page.setExtraHTTPHeaders(createRealisticHeaders());
        const isGoogle = searchType === "google";
        const searchUrl = isGoogle
            ? `https://www.google.com/search?q=${encodeURIComponent(query)}`
            : `https://duckduckgo.com/?q=${encodeURIComponent(query)}`;
        await page.goto(searchUrl, { waitUntil: 'networkidle2' });
        // Wait for results to render.
        await page.waitForSelector(isGoogle ? 'div.g' : '#links .result', { timeout: 10000 });
        // Extract results. This callback runs in the browser context, so it
        // must be self-contained (no closure over Node-side helpers).
        return await page.evaluate((limit) => {
            const onGoogle = window.location.hostname.includes('google');
            const resultSelector = onGoogle ? 'div.g' : '#links .result';
            const titleSelector = onGoogle ? 'h3' : 'h2';
            const snippetSelector = onGoogle ? '.VwiC3b' : '.result__snippet';
            const source = onGoogle ? 'google' : 'duckduckgo';
            const elements = document.querySelectorAll(resultSelector);
            const items = [];
            for (let i = 0; i < Math.min(elements.length, limit || 10); i++) {
                const el = elements[i];
                const titleEl = el.querySelector(titleSelector);
                const linkEl = el.querySelector('a');
                if (!titleEl || !linkEl)
                    continue;
                items.push({
                    title: titleEl.textContent || '',
                    url: linkEl.href || '',
                    snippet: el.querySelector(snippetSelector)?.textContent || '',
                    source
                });
            }
            return items;
        }, options.limit);
    }
    finally {
        await browser.close();
    }
}
// Search Google for `query`.
// Serves fresh results from the in-memory cache when possible; otherwise
// tries a cheap fetch + google-sr parse, falling back to Puppeteer when bot
// protection is detected (and antiBot is enabled). Any failure is rethrown as
// a structured { message, code, originalError } object.
async function searchGoogle(query, options = {}) {
    try {
        const opts = { ...defaultOptions, ...options };
        const cacheKey = getCacheKey(query, opts);
        const cached = searchCache.get(cacheKey);
        const cacheIsFresh = cached &&
            cached.source === "google" &&
            Date.now() - cached.timestamp < CACHE_TTL;
        if (cacheIsFresh) {
            return cached.results;
        }
        await enforceRateLimit("google");
        // Try basic fetch first unless Puppeteer is forced.
        if (!opts.forcePuppeteer) {
            try {
                const searchUrl = `https://www.google.com/search?q=${encodeURIComponent(query)}`;
                await fetchWithDetection(searchUrl, opts);
                // No bot protection detected: use the google-sr library.
                const rawResults = await (0, google_sr_1.search)({
                    query,
                    parsers: [google_sr_1.OrganicResult],
                    noPartialResults: true,
                    requestConfig: { queryParams: { safe: 'active' } }
                });
                const formattedResults = rawResults.map((r) => ({
                    title: r.title || "",
                    url: r.link || "",
                    snippet: r.description || "",
                    source: "google",
                }));
                searchCache.set(cacheKey, {
                    results: formattedResults,
                    timestamp: Date.now(),
                    source: "google",
                });
                return formattedResults;
            }
            catch (error) {
                const msg = error instanceof Error ? error.message : String(error);
                // Only swallow the bot-protection signal (when fallback is
                // enabled); anything else is a real failure.
                if (msg !== 'Bot protection detected' || !opts.antiBot?.enabled) {
                    throw error;
                }
                console.warn('Bot protection detected, falling back to Puppeteer...');
            }
        }
        // Use Puppeteer as fallback.
        const results = await searchWithPuppeteer(query, "google", opts);
        searchCache.set(cacheKey, {
            results,
            timestamp: Date.now(),
            source: "google",
        });
        return results;
    }
    catch (error) {
        throw {
            message: "google search failed :(",
            code: "GOOGLE_SEARCH_ERROR",
            originalError: error,
        };
    }
}
// Search DuckDuckGo for `query`.
// Serves fresh cached results when available; otherwise retries up to
// MAX_RETRIES times: each attempt tries a cheap fetch + duck-duck-scrape
// parse, falling back to Puppeteer on detected bot protection. Exhausting
// all attempts throws a structured { message, code, originalError } object.
async function searchDuckDuckGo(query, options = {}) {
    const opts = { ...defaultOptions, ...options };
    const cacheKey = getCacheKey(query, opts);
    const cached = searchCache.get(cacheKey);
    if (cached &&
        cached.source === "duckduckgo" &&
        Date.now() - cached.timestamp < CACHE_TTL) {
        return cached.results;
    }
    let lastError;
    for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
        try {
            await enforceRateLimit("ddg");
            // Try basic fetch first unless Puppeteer is forced.
            if (!opts.forcePuppeteer) {
                try {
                    await fetchWithDetection(`https://duckduckgo.com/?q=${encodeURIComponent(query)}`, opts);
                    // No bot protection detected: use the duck-duck-scrape library.
                    const response = await (0, duck_duck_scrape_1.search)(query, {
                        safeSearch: opts.safeSearch
                            ? duck_duck_scrape_1.SafeSearchType.STRICT
                            : duck_duck_scrape_1.SafeSearchType.OFF,
                    });
                    const formattedResults = response.results
                        .slice(0, opts.limit)
                        .map((r) => ({
                        title: r.title,
                        url: r.url,
                        snippet: r.description,
                        source: "duckduckgo",
                    }));
                    searchCache.set(cacheKey, {
                        results: formattedResults,
                        timestamp: Date.now(),
                        source: "duckduckgo",
                    });
                    return formattedResults;
                }
                catch (error) {
                    const msg = error instanceof Error ? error.message : String(error);
                    // Only swallow the bot-protection signal (when fallback is
                    // enabled); anything else bubbles to the retry handler.
                    if (msg !== 'Bot protection detected' || !opts.antiBot?.enabled) {
                        throw error;
                    }
                    console.warn('Bot protection detected, falling back to Puppeteer...');
                }
            }
            // Use Puppeteer as fallback.
            const results = await searchWithPuppeteer(query, "duckduckgo", opts);
            searchCache.set(cacheKey, {
                results,
                timestamp: Date.now(),
                source: "duckduckgo",
            });
            return results;
        }
        catch (error) {
            lastError = error;
            if (attempt < MAX_RETRIES) {
                console.warn(`DuckDuckGo search attempt ${attempt} failed, retrying in ${RETRY_DELAY / 1000} seconds...`);
                await new Promise((resolve) => setTimeout(resolve, RETRY_DELAY));
            }
        }
    }
    throw {
        message: "duckduckgo search failed :/",
        code: "DDG_SEARCH_ERROR",
        originalError: lastError,
    };
}
// Unified search: DuckDuckGo first, Google as fallback.
async function search(query, options = {}) {
    try {
        return await searchDuckDuckGo(query, options);
    }
    catch (err) {
        // DuckDuckGo failed entirely — fall back to Google.
        console.warn("duckduckgo search failed, falling back to google...", err);
        return await searchGoogle(query, options);
    }
}
// ===== WEBPAGE CONTENT EXTRACTION =====
// Clean up text: collapse whitespace runs and insert a blank line after each
// sentence-ending punctuation mark for readability.
// FIX: the original pipeline ran a final /\s+/ → ' ' pass that flattened the
// '\n\n' breaks inserted two steps earlier, leaving the sentence-break and
// newline-collapse steps dead. Normalizing whitespace FIRST preserves the
// intended paragraph breaks.
function cleanText(text) {
    return text
        .replace(/\s+/g, ' ')             // collapse all whitespace (incl. \r\n) to single spaces
        .replace(/([.!?])\s+/g, '$1\n\n') // paragraph break after each sentence
        .replace(/\n{3,}/g, '\n\n')       // never more than one blank line
        .trim();
}
// Check URL type and get the appropriate handler.
// Classifies `url` as 'wikipedia' | 'hackernews' | 'unsupported' | 'general';
// malformed URLs are reported as 'unsupported'.
// FIX: domain checks now match the hostname exactly or as a dot-separated
// suffix instead of via substring — the old hostname.includes('x.com') check
// misclassified unrelated hosts such as 'netflix.com' as unsupported.
function getUrlType(url) {
    try {
        const hostname = new URL(url).hostname;
        // True when hostname IS `domain` or is a subdomain of it.
        const isDomain = (domain) => hostname === domain || hostname.endsWith(`.${domain}`);
        if (isDomain('wikipedia.org')) {
            return 'wikipedia';
        }
        // Only HN item pages have dedicated API handling.
        if (hostname === 'news.ycombinator.com' && url.includes('item?id=')) {
            return 'hackernews';
        }
        // Domains that don't work well with Readability (video/social).
        const unsupported = [
            'youtube.com', 'youtu.be', 'vimeo.com',
            'twitter.com', 'x.com', 'instagram.com',
            'facebook.com', 'linkedin.com'
        ];
        if (unsupported.some(isDomain)) {
            return 'unsupported';
        }
        return 'general';
    }
    catch {
        // new URL() threw — not a parseable absolute URL.
        return 'unsupported';
    }
}
// Get webpage content using Readability, with optional stealth Puppeteer.
// `options` is { usePuppeteer?, proxy? } or, for backward compatibility, a
// bare boolean meaning usePuppeteer. Wikipedia and Hacker News item URLs are
// routed to dedicated APIs; video/social sites return an 'unsupported'
// placeholder; everything else is fetched and parsed with Readability.
// Throws a structured { message, code, originalError } object on failure.
// FIX: the Puppeteer branch now closes the browser in a finally block — it
// previously leaked the browser process whenever goto()/content() threw.
async function getWebpageContent(url, options) {
    // Backward compatibility: a boolean argument means usePuppeteer.
    if (typeof options === 'boolean') {
        options = { usePuppeteer: options };
    }
    else if (!options) {
        options = {};
    }
    try {
        const urlType = getUrlType(url);
        // Special case: Wikipedia articles go through the Wikipedia API.
        if (urlType === 'wikipedia') {
            const title = url.split('/wiki/')[1]?.replace(/_/g, ' ') || url;
            const content = await (0, wikipedia_1.wikiGetContent)(title);
            return {
                title,
                content,
                textContent: cleanText(content),
                length: content.length,
                excerpt: content.slice(0, 200) + '...',
                siteName: 'Wikipedia'
            };
        }
        // Special case: HN item pages go through the HN API.
        if (urlType === 'hackernews') {
            const id = parseInt(url.split('id=')[1]);
            const story = await (0, hackernews_1.getStoryById)(id);
            const content = story.snippet || story.title || 'No content available';
            const cleanedContent = cleanText(content);
            return {
                title: story.title || url,
                content: content,
                textContent: cleanedContent,
                length: cleanedContent.length,
                excerpt: cleanedContent.slice(0, 200) + (cleanedContent.length > 200 ? '...' : ''),
                siteName: 'Hacker News'
            };
        }
        if (urlType === 'unsupported') {
            return {
                title: url,
                content: '',
                textContent: 'This URL type is not supported for content extraction.',
                length: 0,
                excerpt: 'Content not available - URL type not supported'
            };
        }
        // General case: obtain the HTML, then run it through Readability.
        let html;
        if (options.usePuppeteer) {
            // Stealth Puppeteer for bot-protected sites.
            const browser = await createStealthBrowser(parseProxyConfig(options.proxy) || undefined);
            try {
                const page = await browser.newPage();
                await page.setViewport({ width: 1920, height: 1080 });
                await page.setExtraHTTPHeaders(createRealisticHeaders());
                await page.goto(url, { waitUntil: 'networkidle2' });
                html = await page.content();
            }
            finally {
                await browser.close();
            }
        }
        else {
            try {
                const fetchOptions = { headers: createRealisticHeaders() };
                const proxy = parseProxyConfig(options.proxy);
                if (proxy) {
                    if (proxy.type === 'socks4' || proxy.type === 'socks5') {
                        const { SocksProxyAgent } = await import('socks-proxy-agent');
                        fetchOptions.agent = new SocksProxyAgent(proxy.url);
                    }
                    else {
                        const { HttpsProxyAgent } = await import('https-proxy-agent');
                        fetchOptions.agent = new HttpsProxyAgent(proxy.url);
                    }
                }
                const response = await fetch(url, fetchOptions);
                html = await response.text();
            }
            catch (error) {
                // Basic fetch failed — retry the whole call via Puppeteer.
                console.warn('Basic fetch failed, trying with Puppeteer...', error);
                return await getWebpageContent(url, { ...options, usePuppeteer: true });
            }
        }
        const dom = new jsdom_1.JSDOM(html, { url });
        const article = new readability_1.Readability(dom.window.document).parse();
        if (!article) {
            return {
                title: url,
                content: '',
                textContent: 'Failed to extract readable content from this page.',
                length: 0,
                excerpt: 'Content extraction failed'
            };
        }
        const cleanedText = cleanText(article.textContent || '');
        return {
            title: article.title || url,
            content: article.content || '',
            textContent: cleanedText,
            length: cleanedText.length,
            excerpt: article.excerpt || undefined,
            siteName: article.siteName || undefined
        };
    }
    catch (err) {
        throw {
            message: 'failed to get webpage content :/',
            code: 'WEBPAGE_ERROR',
            originalError: err
        };
    }
}
// Get just the text content.
// Convenience wrapper around getWebpageContent() that returns only the
// cleaned plain-text body of the page.
async function getWebpageText(url, options = {}) {
    const { textContent } = await getWebpageContent(url, options);
    return textContent;
}
// Check if URL is accessible.
// Issues a HEAD request and reports whether it succeeded; network errors and
// malformed URLs yield false.
// FIX: some servers reject HEAD outright (405 Method Not Allowed /
// 501 Not Implemented) even though the resource is reachable — retry those
// with GET before reporting the URL as inaccessible.
async function isUrlAccessible(url) {
    try {
        const head = await fetch(url, { method: 'HEAD' });
        if (head.ok) {
            return true;
        }
        if (head.status === 405 || head.status === 501) {
            const get = await fetch(url, { method: 'GET' });
            return get.ok;
        }
        return false;
    }
    catch {
        return false;
    }
}
@@ -0,0 +1,8 @@
1
+ import { MediaResult, MediaSearchOptions } from "../../types";
2
+ /**
3
+ * AniDB Scraper
4
+ * AniDB has strict anti-bot protection ("AntiLeech").
5
+ * We must use Puppeteer with Stealth plugin and respect rate limits.
6
+ */
7
+ export declare function searchAniDB(query: string, options?: MediaSearchOptions): Promise<MediaResult[]>;
8
+ export declare function getAniDBDetails(url: string, options?: MediaSearchOptions): Promise<Partial<MediaResult>>;