@oevortex/ddg_search 1.2.0 → 1.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +47 -26
- package/README.md +3 -3
- package/babel.config.js +11 -11
- package/bin/cli.js +6 -6
- package/package.json +2 -2
- package/src/tools/searchTool.js +40 -40
- package/src/utils/search.js +322 -322
- package/src/utils/search_iask.js +228 -228
- package/src/utils/search_monica.js +238 -238
- package/test.setup.js +72 -119
package/src/utils/search.js
CHANGED
|
@@ -1,323 +1,323 @@
|
|
|
1
|
-
import axios from 'axios';
|
|
2
|
-
import * as cheerio from 'cheerio';
|
|
3
|
-
import https from 'https';
|
|
4
|
-
import { getRandomUserAgent } from './user_agents.js';
|
|
5
|
-
|
|
6
|
-
// Constants
const MAX_CACHE_PAGES = 5; // Maximum number of cached result sets kept in memory
const CACHE_DURATION = 5 * 60 * 1000; // 5 minutes
const REQUEST_TIMEOUT = 10000; // 10 seconds

// Cache results to avoid repeated requests
const resultsCache = new Map();

// Shared HTTPS agent used for every outgoing request in this module
const httpsAgent = new https.Agent({
  rejectUnauthorized: true, // Keep certificate verification enabled
  keepAlive: true,
  timeout: REQUEST_TIMEOUT,
  // Require TLS 1.2 or newer. The legacy `secureProtocol: 'TLSv1_2_method'`
  // setting pinned connections to exactly TLS 1.2 and prevented TLS 1.3 from
  // being negotiated.
  minVersion: 'TLSv1.2'
});
|
|
22
|
-
|
|
23
|
-
/**
 * Build the key under which a query's results are stored in the cache.
 * The key is simply the string form of the query.
 * @param {string} query - The search query
 * @returns {string} The cache key
 */
function getCacheKey(query) {
  const cacheKey = `${query}`;
  return cacheKey;
}
|
|
31
|
-
|
|
32
|
-
/**
 * Evict every cache entry whose age exceeds CACHE_DURATION.
 */
function clearOldCache() {
  // Entries stamped before this instant are older than CACHE_DURATION.
  const cutoff = Date.now() - CACHE_DURATION;
  resultsCache.forEach((entry, cacheKey) => {
    if (entry.timestamp < cutoff) {
      resultsCache.delete(cacheKey);
    }
  });
}
|
|
43
|
-
|
|
44
|
-
/**
 * Extract the direct URL from a DuckDuckGo redirect URL.
 *
 * Protocol-relative (`//...`) and root-relative (`/...`) links are first made
 * absolute against https://duckduckgo.com. Two redirect forms are unwrapped:
 *   - `/l/?uddg=<target>`  -> the `uddg` value
 *   - `/y.js?u3=<target>`  -> the `ld` parameter of the `u3` URL when present,
 *                             otherwise the `u3` value itself
 * Any other parseable URL is returned in its absolute form. If the input
 * cannot be parsed as a URL, the first `http(s)://` substring is returned,
 * or the input itself when there is none.
 *
 * @param {string} duckduckgoUrl - The DuckDuckGo URL to extract from
 * @returns {string} The direct URL ('' when the argument is not a string)
 */
function extractDirectUrl(duckduckgoUrl) {
  // Guard against a missing href so the string methods below cannot throw.
  if (typeof duckduckgoUrl !== 'string') {
    return '';
  }

  // Handle relative URLs from DuckDuckGo
  let absoluteUrl = duckduckgoUrl;
  if (absoluteUrl.startsWith('//')) {
    absoluteUrl = `https:${absoluteUrl}`;
  } else if (absoluteUrl.startsWith('/')) {
    absoluteUrl = `https://duckduckgo.com${absoluteUrl}`;
  }

  let url;
  try {
    url = new URL(absoluteUrl);
  } catch {
    // If URL parsing fails, try to extract URL from a basic string match
    const urlMatch = absoluteUrl.match(/https?:\/\/[^\s<>"]+/);
    return urlMatch ? urlMatch[0] : absoluteUrl;
  }

  if (url.hostname !== 'duckduckgo.com') {
    return absoluteUrl;
  }

  // Extract direct URL from DuckDuckGo redirect.
  // searchParams.get() already percent-decodes the value; decoding it a second
  // time would corrupt targets that contain encoded characters (e.g. %2520)
  // and throw on targets that contain a literal '%'.
  if (url.pathname === '/l/') {
    const uddg = url.searchParams.get('uddg');
    if (uddg) {
      return uddg;
    }
  }

  // Handle ad redirects
  if (url.pathname === '/y.js') {
    const u3 = url.searchParams.get('u3');
    if (u3) {
      try {
        const clickUrl = new URL(u3).searchParams.get('ld');
        return clickUrl || u3;
      } catch {
        // u3 is not a parseable URL: fall back to the redirect link itself.
        return absoluteUrl;
      }
    }
  }

  return absoluteUrl;
}
|
|
96
|
-
|
|
97
|
-
/**
 * Build the Google favicon-service URL (32px icon) for a website's host.
 * @param {string} url - The website URL
 * @returns {string} The favicon URL, or '' when `url` cannot be parsed
 */
function getFaviconUrl(url) {
  let hostname;
  try {
    ({ hostname } = new URL(url));
  } catch {
    // Unparseable input: there is no host to look an icon up for.
    return '';
  }
  return `https://www.google.com/s2/favicons?domain=${hostname}&sz=32`;
}
|
|
110
|
-
|
|
111
|
-
/**
 * Build the Jina AI reader URL for a website by prefixing the normalized
 * page address with https://r.jina.ai/.
 * @param {string} url - The website URL
 * @returns {string} The Jina AI URL, or '' when `url` cannot be parsed
 */
function getJinaAiUrl(url) {
  let target;
  try {
    target = new URL(url);
  } catch {
    return '';
  }
  return `https://r.jina.ai/${target.href}`;
}
|
|
124
|
-
|
|
125
|
-
/**
 * Scrapes search results from DuckDuckGo HTML.
 *
 * Every result has `title`, `url`, `snippet`, `favicon` and `displayUrl`.
 * In 'detailed' mode each result additionally carries `description`: the text
 * of the page as fetched through its Jina AI URL ('' when that fetch fails).
 * Results are cached per (mode, query) for CACHE_DURATION.
 *
 * @param {string} query - The search query
 * @param {number} numResults - Number of results to return (default: 10)
 * @param {string} mode - 'short' or 'detailed' mode (default: 'short')
 * @returns {Promise<Array>} - Array of search results
 * @throws {Error} When an argument is invalid (message starts with "Invalid"),
 *   or when the search request or the content fetch fails or times out
 */
async function searchDuckDuckGo(query, numResults = 10, mode = 'short') {
  try {
    // Input validation
    if (!query || typeof query !== 'string') {
      throw new Error('Invalid query: query must be a non-empty string');
    }

    if (!Number.isInteger(numResults) || numResults < 1 || numResults > 20) {
      throw new Error('Invalid numResults: must be an integer between 1 and 20');
    }

    if (!['short', 'detailed'].includes(mode)) {
      throw new Error('Invalid mode: must be "short" or "detailed"');
    }

    // Clear old cache entries
    clearOldCache();

    // Key the cache on mode as well as query: 'short' and 'detailed' results
    // have different shapes and must not be served in place of each other.
    const cacheKey = getCacheKey(`${mode}:${query}`);
    const cachedResults = resultsCache.get(cacheKey);
    const cacheIsFresh =
      cachedResults && Date.now() - cachedResults.timestamp < CACHE_DURATION;

    // Reuse an entry only if it can satisfy this request: it either holds
    // everything the page offered, or at least as many results as requested.
    if (
      cacheIsFresh &&
      (cachedResults.complete || cachedResults.results.length >= numResults)
    ) {
      console.log(`Cache hit for query: "${query}"`);
      return cachedResults.results.slice(0, numResults);
    }

    // Get a random user agent
    const userAgent = getRandomUserAgent();

    console.log(`Searching DuckDuckGo for: "${query}" (${numResults} results, mode: ${mode})`);

    // Fetch results with timeout
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), REQUEST_TIMEOUT);
    let contentTimeoutId;

    try {
      const response = await axios.get(
        `https://duckduckgo.com/html/?q=${encodeURIComponent(query)}`,
        {
          signal: controller.signal,
          headers: {
            'User-Agent': userAgent
          },
          httpsAgent: httpsAgent,
          timeout: REQUEST_TIMEOUT
        }
      );

      clearTimeout(timeoutId);

      if (response.status !== 200) {
        throw new Error(`HTTP ${response.status}: Failed to fetch search results`);
      }

      // Parse results using cheerio
      const $ = cheerio.load(response.data);

      // Collect every usable result on the page before any content is fetched.
      const pageResults = [];
      $('.result').each((i, result) => {
        const $result = $(result);
        const titleEl = $result.find('.result__title a');

        const title = titleEl.text()?.trim();
        const directLink = extractDirectUrl(titleEl.attr('href') || '');

        // Skip entries without a title or a link.
        if (!title || !directLink) {
          return;
        }

        pageResults.push({
          title,
          url: directLink,
          snippet: $result.find('.result__snippet').text()?.trim() || '',
          favicon: getFaviconUrl(directLink),
          displayUrl: $result.find('.result__url').text()?.trim() || ''
        });
      });

      // Get limited results
      let limitedResults = pageResults.slice(0, numResults);

      if (mode === 'detailed') {
        // Fetch page content only for the results that will be returned,
        // not for every result on the page.
        const jinaFetchPromises = limitedResults.map((entry) => {
          const jinaUrl = getJinaAiUrl(entry.url);
          if (!jinaUrl) {
            // No fetchable address: same fallback as a failed fetch.
            return Promise.resolve({ ...entry, description: '' });
          }

          return axios
            .get(jinaUrl, {
              headers: {
                'User-Agent': getRandomUserAgent()
              },
              httpsAgent: httpsAgent,
              timeout: 8000
            })
            .then((jinaRes) => {
              let jinaContent = '';
              if (jinaRes.status === 200 && typeof jinaRes.data === 'string') {
                const $jina = cheerio.load(jinaRes.data);
                jinaContent = $jina('body').text();
              }
              return { ...entry, description: jinaContent };
            })
            .catch(() => {
              // Return fallback without content
              return { ...entry, description: '' };
            });
        });

        // Wait for all Jina AI fetches to complete with timeout
        limitedResults = await Promise.race([
          Promise.all(jinaFetchPromises),
          new Promise((_, reject) => {
            contentTimeoutId = setTimeout(
              () => reject(new Error('Content fetch timeout')),
              15000
            );
          })
        ]);
      }

      // Cache the results. Short mode keeps the whole page (it costs nothing
      // extra); detailed mode keeps only the entries whose content was fetched.
      const cacheableResults = mode === 'detailed' ? limitedResults : pageResults;
      resultsCache.set(cacheKey, {
        results: cacheableResults,
        complete: cacheableResults.length >= pageResults.length,
        timestamp: Date.now()
      });

      // If cache is too big, remove the oldest entry (Map keeps insertion order)
      if (resultsCache.size > MAX_CACHE_PAGES) {
        const oldestKey = resultsCache.keys().next().value;
        resultsCache.delete(oldestKey);
      }

      console.log(`Found ${limitedResults.length} results for query: "${query}"`);
      return limitedResults;
    } catch (fetchError) {
      // axios reports an aborted request as CanceledError / ERR_CANCELED and
      // its own `timeout` option as ECONNABORTED; 'AbortError' is kept for
      // fetch-style rejections.
      const timedOut =
        fetchError.name === 'AbortError' ||
        fetchError.name === 'CanceledError' ||
        fetchError.code === 'ERR_CANCELED' ||
        fetchError.code === 'ECONNABORTED';

      if (timedOut) {
        throw new Error('Search request timeout: took longer than 10 seconds');
      }

      if (fetchError.code === 'ENOTFOUND') {
        throw new Error('Network error: unable to resolve host');
      }

      if (fetchError.code === 'ECONNREFUSED') {
        throw new Error('Network error: connection refused');
      }

      throw fetchError;
    } finally {
      // Always release both timers so a pending one cannot keep the process
      // alive after the search has settled.
      clearTimeout(timeoutId);
      clearTimeout(contentTimeoutId);
    }
  } catch (error) {
    console.error('Error searching DuckDuckGo:', error.message);

    // Enhanced error reporting
    if (error.message.includes('Invalid')) {
      throw error; // Re-throw validation errors as-is
    }

    // Keep the original error reachable through `cause`.
    throw new Error(`Search failed for "${query}": ${error.message}`, { cause: error });
  }
}
|
|
317
|
-
|
|
318
|
-
export {
|
|
319
|
-
searchDuckDuckGo,
|
|
320
|
-
extractDirectUrl,
|
|
321
|
-
getFaviconUrl,
|
|
322
|
-
getJinaAiUrl
|
|
1
|
+
import axios from 'axios';
|
|
2
|
+
import * as cheerio from 'cheerio';
|
|
3
|
+
import https from 'https';
|
|
4
|
+
import { getRandomUserAgent } from './user_agents.js';
|
|
5
|
+
|
|
6
|
+
// Constants
const MAX_CACHE_PAGES = 5; // Maximum number of cached result sets kept in memory
const CACHE_DURATION = 5 * 60 * 1000; // 5 minutes
const REQUEST_TIMEOUT = 10000; // 10 seconds

// Cache results to avoid repeated requests
const resultsCache = new Map();

// Shared HTTPS agent used for every outgoing request in this module
const httpsAgent = new https.Agent({
  rejectUnauthorized: true, // Keep certificate verification enabled
  keepAlive: true,
  timeout: REQUEST_TIMEOUT,
  // Require TLS 1.2 or newer. The legacy `secureProtocol: 'TLSv1_2_method'`
  // setting pinned connections to exactly TLS 1.2 and prevented TLS 1.3 from
  // being negotiated.
  minVersion: 'TLSv1.2'
});
|
|
22
|
+
|
|
23
|
+
/**
 * Build the key under which a query's results are stored in the cache.
 * The key is simply the string form of the query.
 * @param {string} query - The search query
 * @returns {string} The cache key
 */
function getCacheKey(query) {
  const cacheKey = `${query}`;
  return cacheKey;
}
|
|
31
|
+
|
|
32
|
+
/**
 * Evict every cache entry whose age exceeds CACHE_DURATION.
 */
function clearOldCache() {
  // Entries stamped before this instant are older than CACHE_DURATION.
  const cutoff = Date.now() - CACHE_DURATION;
  resultsCache.forEach((entry, cacheKey) => {
    if (entry.timestamp < cutoff) {
      resultsCache.delete(cacheKey);
    }
  });
}
|
|
43
|
+
|
|
44
|
+
/**
 * Extract the direct URL from a DuckDuckGo redirect URL.
 *
 * Protocol-relative (`//...`) and root-relative (`/...`) links are first made
 * absolute against https://duckduckgo.com. Two redirect forms are unwrapped:
 *   - `/l/?uddg=<target>`  -> the `uddg` value
 *   - `/y.js?u3=<target>`  -> the `ld` parameter of the `u3` URL when present,
 *                             otherwise the `u3` value itself
 * Any other parseable URL is returned in its absolute form. If the input
 * cannot be parsed as a URL, the first `http(s)://` substring is returned,
 * or the input itself when there is none.
 *
 * @param {string} duckduckgoUrl - The DuckDuckGo URL to extract from
 * @returns {string} The direct URL ('' when the argument is not a string)
 */
function extractDirectUrl(duckduckgoUrl) {
  // Guard against a missing href so the string methods below cannot throw.
  if (typeof duckduckgoUrl !== 'string') {
    return '';
  }

  // Handle relative URLs from DuckDuckGo
  let absoluteUrl = duckduckgoUrl;
  if (absoluteUrl.startsWith('//')) {
    absoluteUrl = `https:${absoluteUrl}`;
  } else if (absoluteUrl.startsWith('/')) {
    absoluteUrl = `https://duckduckgo.com${absoluteUrl}`;
  }

  let url;
  try {
    url = new URL(absoluteUrl);
  } catch {
    // If URL parsing fails, try to extract URL from a basic string match
    const urlMatch = absoluteUrl.match(/https?:\/\/[^\s<>"]+/);
    return urlMatch ? urlMatch[0] : absoluteUrl;
  }

  if (url.hostname !== 'duckduckgo.com') {
    return absoluteUrl;
  }

  // Extract direct URL from DuckDuckGo redirect.
  // searchParams.get() already percent-decodes the value; decoding it a second
  // time would corrupt targets that contain encoded characters (e.g. %2520)
  // and throw on targets that contain a literal '%'.
  if (url.pathname === '/l/') {
    const uddg = url.searchParams.get('uddg');
    if (uddg) {
      return uddg;
    }
  }

  // Handle ad redirects
  if (url.pathname === '/y.js') {
    const u3 = url.searchParams.get('u3');
    if (u3) {
      try {
        const clickUrl = new URL(u3).searchParams.get('ld');
        return clickUrl || u3;
      } catch {
        // u3 is not a parseable URL: fall back to the redirect link itself.
        return absoluteUrl;
      }
    }
  }

  return absoluteUrl;
}
|
|
96
|
+
|
|
97
|
+
/**
 * Build the Google favicon-service URL (32px icon) for a website's host.
 * @param {string} url - The website URL
 * @returns {string} The favicon URL, or '' when `url` cannot be parsed
 */
function getFaviconUrl(url) {
  let hostname;
  try {
    ({ hostname } = new URL(url));
  } catch {
    // Unparseable input: there is no host to look an icon up for.
    return '';
  }
  return `https://www.google.com/s2/favicons?domain=${hostname}&sz=32`;
}
|
|
110
|
+
|
|
111
|
+
/**
 * Build the Jina AI reader URL for a website by prefixing the normalized
 * page address with https://r.jina.ai/.
 * @param {string} url - The website URL
 * @returns {string} The Jina AI URL, or '' when `url` cannot be parsed
 */
function getJinaAiUrl(url) {
  let target;
  try {
    target = new URL(url);
  } catch {
    return '';
  }
  return `https://r.jina.ai/${target.href}`;
}
|
|
124
|
+
|
|
125
|
+
/**
 * Scrapes search results from DuckDuckGo HTML.
 *
 * Every result has `title`, `url`, `snippet`, `favicon` and `displayUrl`.
 * In 'detailed' mode each result additionally carries `description`: the text
 * of the page as fetched through its Jina AI URL ('' when that fetch fails).
 * Results are cached per (mode, query) for CACHE_DURATION.
 *
 * @param {string} query - The search query
 * @param {number} numResults - Number of results to return (default: 10)
 * @param {string} mode - 'short' or 'detailed' mode (default: 'short')
 * @returns {Promise<Array>} - Array of search results
 * @throws {Error} When an argument is invalid (message starts with "Invalid"),
 *   or when the search request or the content fetch fails or times out
 */
async function searchDuckDuckGo(query, numResults = 10, mode = 'short') {
  try {
    // Input validation
    if (!query || typeof query !== 'string') {
      throw new Error('Invalid query: query must be a non-empty string');
    }

    if (!Number.isInteger(numResults) || numResults < 1 || numResults > 20) {
      throw new Error('Invalid numResults: must be an integer between 1 and 20');
    }

    if (!['short', 'detailed'].includes(mode)) {
      throw new Error('Invalid mode: must be "short" or "detailed"');
    }

    // Clear old cache entries
    clearOldCache();

    // Key the cache on mode as well as query: 'short' and 'detailed' results
    // have different shapes and must not be served in place of each other.
    const cacheKey = getCacheKey(`${mode}:${query}`);
    const cachedResults = resultsCache.get(cacheKey);
    const cacheIsFresh =
      cachedResults && Date.now() - cachedResults.timestamp < CACHE_DURATION;

    // Reuse an entry only if it can satisfy this request: it either holds
    // everything the page offered, or at least as many results as requested.
    if (
      cacheIsFresh &&
      (cachedResults.complete || cachedResults.results.length >= numResults)
    ) {
      console.log(`Cache hit for query: "${query}"`);
      return cachedResults.results.slice(0, numResults);
    }

    // Get a random user agent
    const userAgent = getRandomUserAgent();

    console.log(`Searching DuckDuckGo for: "${query}" (${numResults} results, mode: ${mode})`);

    // Fetch results with timeout
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), REQUEST_TIMEOUT);
    let contentTimeoutId;

    try {
      const response = await axios.get(
        `https://duckduckgo.com/html/?q=${encodeURIComponent(query)}`,
        {
          signal: controller.signal,
          headers: {
            'User-Agent': userAgent
          },
          httpsAgent: httpsAgent,
          timeout: REQUEST_TIMEOUT
        }
      );

      clearTimeout(timeoutId);

      if (response.status !== 200) {
        throw new Error(`HTTP ${response.status}: Failed to fetch search results`);
      }

      // Parse results using cheerio
      const $ = cheerio.load(response.data);

      // Collect every usable result on the page before any content is fetched.
      const pageResults = [];
      $('.result').each((i, result) => {
        const $result = $(result);
        const titleEl = $result.find('.result__title a');

        const title = titleEl.text()?.trim();
        const directLink = extractDirectUrl(titleEl.attr('href') || '');

        // Skip entries without a title or a link.
        if (!title || !directLink) {
          return;
        }

        pageResults.push({
          title,
          url: directLink,
          snippet: $result.find('.result__snippet').text()?.trim() || '',
          favicon: getFaviconUrl(directLink),
          displayUrl: $result.find('.result__url').text()?.trim() || ''
        });
      });

      // Get limited results
      let limitedResults = pageResults.slice(0, numResults);

      if (mode === 'detailed') {
        // Fetch page content only for the results that will be returned,
        // not for every result on the page.
        const jinaFetchPromises = limitedResults.map((entry) => {
          const jinaUrl = getJinaAiUrl(entry.url);
          if (!jinaUrl) {
            // No fetchable address: same fallback as a failed fetch.
            return Promise.resolve({ ...entry, description: '' });
          }

          return axios
            .get(jinaUrl, {
              headers: {
                'User-Agent': getRandomUserAgent()
              },
              httpsAgent: httpsAgent,
              timeout: 8000
            })
            .then((jinaRes) => {
              let jinaContent = '';
              if (jinaRes.status === 200 && typeof jinaRes.data === 'string') {
                const $jina = cheerio.load(jinaRes.data);
                jinaContent = $jina('body').text();
              }
              return { ...entry, description: jinaContent };
            })
            .catch(() => {
              // Return fallback without content
              return { ...entry, description: '' };
            });
        });

        // Wait for all Jina AI fetches to complete with timeout
        limitedResults = await Promise.race([
          Promise.all(jinaFetchPromises),
          new Promise((_, reject) => {
            contentTimeoutId = setTimeout(
              () => reject(new Error('Content fetch timeout')),
              15000
            );
          })
        ]);
      }

      // Cache the results. Short mode keeps the whole page (it costs nothing
      // extra); detailed mode keeps only the entries whose content was fetched.
      const cacheableResults = mode === 'detailed' ? limitedResults : pageResults;
      resultsCache.set(cacheKey, {
        results: cacheableResults,
        complete: cacheableResults.length >= pageResults.length,
        timestamp: Date.now()
      });

      // If cache is too big, remove the oldest entry (Map keeps insertion order)
      if (resultsCache.size > MAX_CACHE_PAGES) {
        const oldestKey = resultsCache.keys().next().value;
        resultsCache.delete(oldestKey);
      }

      console.log(`Found ${limitedResults.length} results for query: "${query}"`);
      return limitedResults;
    } catch (fetchError) {
      // axios reports an aborted request as CanceledError / ERR_CANCELED and
      // its own `timeout` option as ECONNABORTED; 'AbortError' is kept for
      // fetch-style rejections.
      const timedOut =
        fetchError.name === 'AbortError' ||
        fetchError.name === 'CanceledError' ||
        fetchError.code === 'ERR_CANCELED' ||
        fetchError.code === 'ECONNABORTED';

      if (timedOut) {
        throw new Error('Search request timeout: took longer than 10 seconds');
      }

      if (fetchError.code === 'ENOTFOUND') {
        throw new Error('Network error: unable to resolve host');
      }

      if (fetchError.code === 'ECONNREFUSED') {
        throw new Error('Network error: connection refused');
      }

      throw fetchError;
    } finally {
      // Always release both timers so a pending one cannot keep the process
      // alive after the search has settled.
      clearTimeout(timeoutId);
      clearTimeout(contentTimeoutId);
    }
  } catch (error) {
    console.error('Error searching DuckDuckGo:', error.message);

    // Enhanced error reporting
    if (error.message.includes('Invalid')) {
      throw error; // Re-throw validation errors as-is
    }

    // Keep the original error reachable through `cause`.
    throw new Error(`Search failed for "${query}": ${error.message}`, { cause: error });
  }
}
|
|
317
|
+
|
|
318
|
+
export {
|
|
319
|
+
searchDuckDuckGo,
|
|
320
|
+
extractDirectUrl,
|
|
321
|
+
getFaviconUrl,
|
|
322
|
+
getJinaAiUrl
|
|
323
323
|
};
|