@oevortex/ddg_search 1.1.8 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,268 +1,323 @@
1
- import axios from 'axios';
2
- import * as cheerio from 'cheerio';
3
- import https from 'https';
4
- import { getRandomUserAgent } from './user_agents.js';
5
-
6
// Constants
const MAX_CACHE_PAGES = 5; // upper bound on the number of cached queries

// Cache results to avoid repeated requests
const resultsCache = new Map();
const CACHE_DURATION = 5 * 60 * 1000; // 5 minutes

// Shared HTTPS agent: certificate verification stays on, connections are
// reused (keep-alive) and the socket timeout is 10 seconds.
const httpsAgent = new https.Agent({
  rejectUnauthorized: true, // Keep security enabled
  keepAlive: true,
  timeout: 10000,
  // Require TLS 1.2 or newer. The legacy `secureProtocol: 'TLSv1_2_method'`
  // option pinned every connection to exactly TLS 1.2 and ruled out TLS 1.3.
  minVersion: 'TLSv1.2'
});
21
-
22
/**
 * Build the key under which a query's results are stored in the cache.
 * @param {string} query - Search query text
 * @returns {string} The query rendered as a string
 */
function getCacheKey(query) {
  const key = `${query}`;
  return key;
}
30
-
31
/**
 * Drop every cache entry whose timestamp is more than CACHE_DURATION
 * milliseconds in the past.
 */
function clearOldCache() {
  const currentTime = Date.now();
  resultsCache.forEach((entry, cacheKey) => {
    const age = currentTime - entry.timestamp;
    if (age > CACHE_DURATION) {
      resultsCache.delete(cacheKey);
    }
  });
}
42
-
43
/**
 * Extract the direct URL from a DuckDuckGo redirect URL.
 *
 * Protocol-relative (`//…`) and root-relative (`/…`) inputs are first made
 * absolute. Result redirects (`duckduckgo.com/l/?uddg=…`) resolve to the
 * `uddg` target. Ad redirects (`duckduckgo.com/y.js?u3=…`) resolve to the
 * `ld` parameter of the `u3` URL when present, otherwise to `u3` itself.
 * Anything else is returned in its absolute form.
 *
 * @param {string} duckduckgoUrl - The DuckDuckGo URL to extract from
 * @returns {string} The direct URL; '' when the input is not a string
 */
function extractDirectUrl(duckduckgoUrl) {
  if (typeof duckduckgoUrl !== 'string') {
    return '';
  }

  // Handle relative URLs from DuckDuckGo (work on a copy, not the parameter)
  let candidate = duckduckgoUrl;
  if (candidate.startsWith('//')) {
    candidate = `https:${candidate}`;
  } else if (candidate.startsWith('/')) {
    candidate = `https://duckduckgo.com${candidate}`;
  }

  let parsed;
  try {
    parsed = new URL(candidate);
  } catch {
    // If URL parsing fails, try to extract URL from a basic string match
    const urlMatch = candidate.match(/https?:\/\/[^\s<>"]+/);
    return urlMatch ? urlMatch[0] : candidate;
  }

  if (parsed.hostname !== 'duckduckgo.com') {
    return candidate;
  }

  // searchParams.get() already percent-decodes the value. Decoding a second
  // time would corrupt targets that contain an encoded '%' (e.g. "%2520")
  // and would throw on a lone '%'.
  if (parsed.pathname === '/l/') {
    const uddg = parsed.searchParams.get('uddg');
    if (uddg) {
      return uddg;
    }
  }

  // Handle ad redirects
  if (parsed.pathname === '/y.js') {
    const u3 = parsed.searchParams.get('u3');
    if (u3) {
      try {
        const clickUrl = new URL(u3).searchParams.get('ld');
        return clickUrl || u3;
      } catch {
        // u3 is not a parseable URL: keep the redirect URL itself
        return candidate;
      }
    }
  }

  return candidate;
}
95
-
96
/**
 * Build the Google favicon-service URL (32px icon) for a page's host.
 * @param {string} url - Absolute URL of the website
 * @returns {string} Favicon URL, or '' when `url` cannot be parsed
 */
function getFaviconUrl(url) {
  let hostname;
  try {
    ({ hostname } = new URL(url));
  } catch {
    // Unparseable input yields no favicon
    return '';
  }
  return `https://www.google.com/s2/favicons?domain=${hostname}&sz=32`;
}
109
-
110
-
111
/**
 * Scrapes search results from DuckDuckGo HTML.
 *
 * Results are cached per mode and query for CACHE_DURATION milliseconds. In
 * 'detailed' mode each returned result also carries a `Description` field
 * holding the page text fetched through Jina AI ('' when that fetch fails).
 * Any other mode value behaves like 'short'.
 *
 * @param {string} query - The search query
 * @param {number} numResults - Number of results to return (default: 10)
 * @param {string} mode - 'short' or 'detailed' (default: 'short')
 * @returns {Promise<Array>} - Array of search results
 * @throws {Error} When the search page cannot be fetched or parsed
 */
async function searchDuckDuckGo(query, numResults = 10, mode = 'short') {
  try {
    // Clear old cache entries
    clearOldCache();

    // Short and detailed results have different shapes, so they must not
    // share a cache entry.
    const detailed = mode === 'detailed';
    const cacheKey = `${detailed ? 'detailed' : 'short'}:${getCacheKey(query)}`;
    const cachedResults = resultsCache.get(cacheKey);

    // A cached entry is usable only if it is fresh and can satisfy the
    // requested count (or already holds everything the page offered).
    if (
      cachedResults &&
      Date.now() - cachedResults.timestamp < CACHE_DURATION &&
      (cachedResults.complete || cachedResults.results.length >= numResults)
    ) {
      return cachedResults.results.slice(0, numResults);
    }

    // Fetch results
    const response = await axios.get(
      `https://duckduckgo.com/html/?q=${encodeURIComponent(query)}`,
      {
        headers: {
          'User-Agent': getRandomUserAgent()
        },
        httpsAgent: httpsAgent,
        timeout: 10000 // do not wait indefinitely for the search page
      }
    );

    if (response.status !== 200) {
      throw new Error('Failed to fetch search results');
    }

    // Parse results using cheerio
    const $ = cheerio.load(response.data);
    const parsed = [];
    $('.result').each((i, result) => {
      const $result = $(result);
      const titleEl = $result.find('.result__title a');
      const title = titleEl.text()?.trim();
      const directLink = extractDirectUrl(titleEl.attr('href') || '');

      // Entries without a title or link are skipped
      if (title && directLink) {
        parsed.push({
          title,
          url: directLink,
          snippet: $result.find('.result__snippet').text()?.trim() || '',
          favicon: getFaviconUrl(directLink),
          displayUrl: $result.find('.result__url').text()?.trim() || ''
        });
      }
    });

    // Best effort: a failed content fetch yields an empty Description
    const fetchContent = async (url) => {
      try {
        const jinaRes = await axios.get(getJinaAiUrl(url), {
          headers: {
            'User-Agent': getRandomUserAgent()
          },
          httpsAgent: httpsAgent,
          timeout: 10000
        });
        if (jinaRes.status === 200 && typeof jinaRes.data === 'string') {
          return cheerio.load(jinaRes.data)('body').text();
        }
        return '';
      } catch {
        return '';
      }
    };

    // Only the results that will be returned are enriched, so no page is
    // fetched from Jina AI needlessly.
    const wanted = parsed.slice(0, numResults);
    const limitedResults = detailed
      ? await Promise.all(
          wanted.map(async (entry) => ({
            ...entry,
            Description: await fetchContent(entry.url)
          }))
        )
      : wanted;

    // Cache the widest list available: every parsed hit in short mode, the
    // enriched subset in detailed mode.
    const cacheable = detailed ? limitedResults : parsed;
    resultsCache.set(cacheKey, {
      results: cacheable,
      complete: cacheable.length === parsed.length,
      timestamp: Date.now()
    });

    // If cache is too big, remove oldest entries
    if (resultsCache.size > MAX_CACHE_PAGES) {
      const oldestKey = resultsCache.keys().next().value;
      resultsCache.delete(oldestKey);
    }

    return limitedResults;
  } catch (error) {
    console.error('Error searching DuckDuckGo:', error.message);
    throw error;
  }
}
246
-
247
-
248
- export {
249
- searchDuckDuckGo,
250
- extractDirectUrl,
251
- getFaviconUrl
252
- };
253
-
254
/**
 * Prefix a page URL with the Jina AI reader endpoint (https://r.jina.ai/).
 * @param {string} url - Absolute URL of the page
 * @returns {string} Jina AI URL, or '' when `url` cannot be parsed
 */
function getJinaAiUrl(url) {
  let href;
  try {
    ({ href } = new URL(url));
  } catch {
    // Unparseable input yields no reader URL
    return '';
  }
  return `https://r.jina.ai/${href}`;
}
267
-
268
- export { getJinaAiUrl };
1
+ import axios from 'axios';
2
+ import * as cheerio from 'cheerio';
3
+ import https from 'https';
4
+ import { getRandomUserAgent } from './user_agents.js';
5
+
6
// Constants
const MAX_CACHE_PAGES = 5; // upper bound on the number of cached queries
const CACHE_DURATION = 5 * 60 * 1000; // 5 minutes
const REQUEST_TIMEOUT = 10000; // 10 seconds

// Cache results to avoid repeated requests
const resultsCache = new Map();

// Shared HTTPS agent: certificate verification stays on, connections are
// reused (keep-alive) and the socket timeout is REQUEST_TIMEOUT.
const httpsAgent = new https.Agent({
  rejectUnauthorized: true, // Keep security enabled
  keepAlive: true,
  timeout: REQUEST_TIMEOUT,
  // Require TLS 1.2 or newer. The legacy `secureProtocol: 'TLSv1_2_method'`
  // option pinned every connection to exactly TLS 1.2 and ruled out TLS 1.3.
  minVersion: 'TLSv1.2'
});
22
+
23
/**
 * Build the key under which a query's results are stored in the cache.
 * @param {string} query - Search query text
 * @returns {string} The query rendered as a string
 */
function getCacheKey(query) {
  const key = `${query}`;
  return key;
}
31
+
32
/**
 * Drop every cache entry whose timestamp is more than CACHE_DURATION
 * milliseconds in the past.
 */
function clearOldCache() {
  const currentTime = Date.now();
  resultsCache.forEach((entry, cacheKey) => {
    const age = currentTime - entry.timestamp;
    if (age > CACHE_DURATION) {
      resultsCache.delete(cacheKey);
    }
  });
}
43
+
44
/**
 * Extract the direct URL from a DuckDuckGo redirect URL.
 *
 * Protocol-relative (`//…`) and root-relative (`/…`) inputs are first made
 * absolute. Result redirects (`duckduckgo.com/l/?uddg=…`) resolve to the
 * `uddg` target. Ad redirects (`duckduckgo.com/y.js?u3=…`) resolve to the
 * `ld` parameter of the `u3` URL when present, otherwise to `u3` itself.
 * Anything else is returned in its absolute form.
 *
 * @param {string} duckduckgoUrl - The DuckDuckGo URL to extract from
 * @returns {string} The direct URL; '' when the input is not a string
 */
function extractDirectUrl(duckduckgoUrl) {
  if (typeof duckduckgoUrl !== 'string') {
    return '';
  }

  // Handle relative URLs from DuckDuckGo (work on a copy, not the parameter)
  let candidate = duckduckgoUrl;
  if (candidate.startsWith('//')) {
    candidate = `https:${candidate}`;
  } else if (candidate.startsWith('/')) {
    candidate = `https://duckduckgo.com${candidate}`;
  }

  let parsed;
  try {
    parsed = new URL(candidate);
  } catch {
    // If URL parsing fails, try to extract URL from a basic string match
    const urlMatch = candidate.match(/https?:\/\/[^\s<>"]+/);
    return urlMatch ? urlMatch[0] : candidate;
  }

  if (parsed.hostname !== 'duckduckgo.com') {
    return candidate;
  }

  // searchParams.get() already percent-decodes the value. Decoding a second
  // time would corrupt targets that contain an encoded '%' (e.g. "%2520")
  // and would throw on a lone '%'.
  if (parsed.pathname === '/l/') {
    const uddg = parsed.searchParams.get('uddg');
    if (uddg) {
      return uddg;
    }
  }

  // Handle ad redirects
  if (parsed.pathname === '/y.js') {
    const u3 = parsed.searchParams.get('u3');
    if (u3) {
      try {
        const clickUrl = new URL(u3).searchParams.get('ld');
        return clickUrl || u3;
      } catch {
        // u3 is not a parseable URL: keep the redirect URL itself
        return candidate;
      }
    }
  }

  return candidate;
}
96
+
97
/**
 * Build the Google favicon-service URL (32px icon) for a page's host.
 * @param {string} url - Absolute URL of the website
 * @returns {string} Favicon URL, or '' when `url` cannot be parsed
 */
function getFaviconUrl(url) {
  let hostname;
  try {
    ({ hostname } = new URL(url));
  } catch {
    // Unparseable input yields no favicon
    return '';
  }
  return `https://www.google.com/s2/favicons?domain=${hostname}&sz=32`;
}
110
+
111
/**
 * Prefix a page URL with the Jina AI reader endpoint (https://r.jina.ai/).
 * @param {string} url - Absolute URL of the page
 * @returns {string} Jina AI URL, or '' when `url` cannot be parsed
 */
function getJinaAiUrl(url) {
  let href;
  try {
    ({ href } = new URL(url));
  } catch {
    // Unparseable input yields no reader URL
    return '';
  }
  return `https://r.jina.ai/${href}`;
}
124
+
125
/** Search modes accepted by searchDuckDuckGo. */
const SEARCH_MODES = ['short', 'detailed'];
/** Largest value accepted for numResults. */
const MAX_NUM_RESULTS = 20;
/** Timeout (ms) applied to each individual Jina AI page fetch. */
const JINA_REQUEST_TIMEOUT = 8000;
/** Overall deadline (ms) for all Jina AI page fetches of one search. */
const CONTENT_FETCH_TIMEOUT = 15000;

/**
 * Check the arguments of searchDuckDuckGo.
 * @param {*} query - Candidate search query
 * @param {*} numResults - Candidate result count
 * @param {*} mode - Candidate mode
 * @returns {string|null} The validation error message, or null when valid
 */
function validateSearchArguments(query, numResults, mode) {
  if (typeof query !== 'string' || query.trim() === '') {
    return 'Invalid query: query must be a non-empty string';
  }
  if (!Number.isInteger(numResults) || numResults < 1 || numResults > MAX_NUM_RESULTS) {
    return `Invalid numResults: must be an integer between 1 and ${MAX_NUM_RESULTS}`;
  }
  if (!SEARCH_MODES.includes(mode)) {
    return 'Invalid mode: must be "short" or "detailed"';
  }
  return null;
}

/**
 * Tell whether a request error means the request was aborted or timed out.
 * axios rejects an aborted request with a CanceledError (code ERR_CANCELED)
 * and a timed-out one with code ECONNABORTED; the AbortError name is kept
 * for other HTTP clients.
 * @param {*} error - The rejection reason
 * @returns {boolean} True for abort/timeout errors
 */
function isTimeoutError(error) {
  return (
    error?.name === 'AbortError' ||
    error?.name === 'CanceledError' ||
    error?.code === 'ERR_CANCELED' ||
    error?.code === 'ECONNABORTED'
  );
}

/**
 * Download the DuckDuckGo HTML result page for a query.
 * @param {string} query - The search query
 * @returns {Promise<*>} The response body
 * @throws {Error} On timeout, network failure or a non-200 status
 */
async function fetchSearchPage(query) {
  // Hard deadline for the whole request, on top of the axios timeout
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), REQUEST_TIMEOUT);

  try {
    const response = await axios.get(
      `https://duckduckgo.com/html/?q=${encodeURIComponent(query)}`,
      {
        signal: controller.signal,
        headers: {
          'User-Agent': getRandomUserAgent()
        },
        httpsAgent: httpsAgent,
        timeout: REQUEST_TIMEOUT
      }
    );

    if (response.status !== 200) {
      throw new Error(`HTTP ${response.status}: Failed to fetch search results`);
    }

    return response.data;
  } catch (fetchError) {
    if (isTimeoutError(fetchError)) {
      throw new Error(
        `Search request timeout: took longer than ${REQUEST_TIMEOUT / 1000} seconds`,
        { cause: fetchError }
      );
    }
    if (fetchError.code === 'ENOTFOUND') {
      throw new Error('Network error: unable to resolve host', { cause: fetchError });
    }
    if (fetchError.code === 'ECONNREFUSED') {
      throw new Error('Network error: connection refused', { cause: fetchError });
    }
    throw fetchError;
  } finally {
    clearTimeout(timeoutId);
  }
}

/**
 * Turn the DuckDuckGo HTML result page into plain result objects.
 * Entries without a title or link are skipped.
 * @param {*} html - Body of the result page
 * @returns {Array<Object>} Results with title, url, snippet, favicon, displayUrl
 */
function parseSearchResults(html) {
  const $ = cheerio.load(html);
  const parsed = [];

  $('.result').each((i, result) => {
    const $result = $(result);
    const titleEl = $result.find('.result__title a');
    const title = titleEl.text()?.trim();
    const directLink = extractDirectUrl(titleEl.attr('href') || '');

    if (title && directLink) {
      parsed.push({
        title,
        url: directLink,
        snippet: $result.find('.result__snippet').text()?.trim() || '',
        favicon: getFaviconUrl(directLink),
        displayUrl: $result.find('.result__url').text()?.trim() || ''
      });
    }
  });

  return parsed;
}

/**
 * Fetch the text of a page through Jina AI. Best effort: any failure, or a
 * URL that cannot be turned into a Jina AI URL, yields ''.
 * @param {string} url - The page URL
 * @returns {Promise<string>} The page text, or ''
 */
async function fetchPageText(url) {
  const jinaUrl = getJinaAiUrl(url);
  if (!jinaUrl) {
    return '';
  }

  try {
    const jinaRes = await axios.get(jinaUrl, {
      headers: {
        'User-Agent': getRandomUserAgent()
      },
      httpsAgent: httpsAgent,
      timeout: JINA_REQUEST_TIMEOUT
    });
    if (jinaRes.status === 200 && typeof jinaRes.data === 'string') {
      return cheerio.load(jinaRes.data)('body').text();
    }
    return '';
  } catch {
    // Return fallback without content
    return '';
  }
}

/**
 * Reject with `message` if `work` has not settled within `ms` milliseconds.
 * The timer is always cleared so it cannot keep the process alive.
 * @param {Promise<*>} work - The promise to guard
 * @param {number} ms - Deadline in milliseconds
 * @param {string} message - Message of the timeout error
 * @returns {Promise<*>} Settles like `work`, or rejects on timeout
 */
function withDeadline(work, ms, message) {
  let timer;
  const deadline = new Promise((_, reject) => {
    timer = setTimeout(() => reject(new Error(message)), ms);
  });
  return Promise.race([work, deadline]).finally(() => clearTimeout(timer));
}

/**
 * Scrapes search results from DuckDuckGo HTML.
 *
 * Results are cached per mode and query for CACHE_DURATION milliseconds. In
 * 'detailed' mode each returned result also carries a `description` field
 * holding the page text fetched through Jina AI ('' when that fetch fails).
 *
 * @param {string} query - The search query
 * @param {number} numResults - Number of results to return (default: 10)
 * @param {string} mode - 'short' or 'detailed' mode (default: 'short')
 * @returns {Promise<Array>} - Array of search results
 * @throws {Error} 'Invalid …' for bad arguments; 'Search failed for …' (with
 *   the underlying error as `cause`) for any failure while searching
 */
async function searchDuckDuckGo(query, numResults = 10, mode = 'short') {
  // Input validation happens outside the try block so validation errors are
  // identified by origin rather than by matching on the message text.
  const problem = validateSearchArguments(query, numResults, mode);
  if (problem) {
    console.error('Error searching DuckDuckGo:', problem);
    throw new Error(problem);
  }

  try {
    // Clear old cache entries
    clearOldCache();

    // Short and detailed results have different shapes, so they must not
    // share a cache entry.
    const cacheKey = `${mode}:${getCacheKey(query)}`;
    const cachedResults = resultsCache.get(cacheKey);

    // A cached entry is usable only if it is fresh and can satisfy the
    // requested count (or already holds everything the page offered).
    if (
      cachedResults &&
      Date.now() - cachedResults.timestamp < CACHE_DURATION &&
      (cachedResults.complete || cachedResults.results.length >= numResults)
    ) {
      console.log(`Cache hit for query: "${query}"`);
      return cachedResults.results.slice(0, numResults);
    }

    console.log(`Searching DuckDuckGo for: "${query}" (${numResults} results, mode: ${mode})`);

    const parsed = parseSearchResults(await fetchSearchPage(query));

    // Only the results that will be returned are enriched, so no page is
    // fetched from Jina AI needlessly.
    const wanted = parsed.slice(0, numResults);
    const limitedResults = mode === 'detailed'
      ? await withDeadline(
          Promise.all(
            wanted.map(async (entry) => ({
              ...entry,
              description: await fetchPageText(entry.url)
            }))
          ),
          CONTENT_FETCH_TIMEOUT,
          'Content fetch timeout'
        )
      : wanted;

    // Cache the widest list available: every parsed hit in short mode, the
    // enriched subset in detailed mode.
    const cacheable = mode === 'detailed' ? limitedResults : parsed;
    resultsCache.delete(cacheKey); // re-insert so the entry counts as newest
    resultsCache.set(cacheKey, {
      results: cacheable,
      complete: cacheable.length === parsed.length,
      timestamp: Date.now()
    });

    // If cache is too big, remove oldest entries
    while (resultsCache.size > MAX_CACHE_PAGES) {
      resultsCache.delete(resultsCache.keys().next().value);
    }

    console.log(`Found ${limitedResults.length} results for query: "${query}"`);
    return limitedResults;
  } catch (error) {
    console.error('Error searching DuckDuckGo:', error.message);
    throw new Error(`Search failed for "${query}": ${error.message}`, { cause: error });
  }
}
317
+
318
+ export {
319
+ searchDuckDuckGo,
320
+ extractDirectUrl,
321
+ getFaviconUrl,
322
+ getJinaAiUrl
323
+ };