@oevortex/ddg_search 1.2.0 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,323 +1,323 @@
1
- import axios from 'axios';
2
- import * as cheerio from 'cheerio';
3
- import https from 'https';
4
- import { getRandomUserAgent } from './user_agents.js';
5
-
6
- // Constants
7
- const MAX_CACHE_PAGES = 5;
8
- const CACHE_DURATION = 5 * 60 * 1000; // 5 minutes
9
- const REQUEST_TIMEOUT = 10000; // 10 seconds
10
-
11
- // Cache results to avoid repeated requests
12
- const resultsCache = new Map();
13
-
14
- // HTTPS agent configuration to handle certificate chain issues
15
- const httpsAgent = new https.Agent({
16
- rejectUnauthorized: true, // Keep security enabled
17
- keepAlive: true,
18
- timeout: REQUEST_TIMEOUT,
19
- // Provide fallback for certificate issues while maintaining security
20
- secureProtocol: 'TLSv1_2_method'
21
- });
22
-
23
- /**
24
- * Generate a cache key for a search query
25
- * @param {string} query - The search query
26
- * @returns {string} The cache key
27
- */
28
- function getCacheKey(query) {
29
- return `${query}`;
30
- }
31
-
32
- /**
33
- * Clear old entries from the cache
34
- */
35
- function clearOldCache() {
36
- const now = Date.now();
37
- for (const [key, value] of resultsCache.entries()) {
38
- if (now - value.timestamp > CACHE_DURATION) {
39
- resultsCache.delete(key);
40
- }
41
- }
42
- }
43
-
44
- /**
45
- * Extract the direct URL from a DuckDuckGo redirect URL
46
- * @param {string} duckduckgoUrl - The DuckDuckGo URL to extract from
47
- * @returns {string} The direct URL
48
- */
49
- function extractDirectUrl(duckduckgoUrl) {
50
- try {
51
- // Handle relative URLs from DuckDuckGo
52
- if (duckduckgoUrl.startsWith('//')) {
53
- duckduckgoUrl = 'https:' + duckduckgoUrl;
54
- } else if (duckduckgoUrl.startsWith('/')) {
55
- duckduckgoUrl = 'https://duckduckgo.com' + duckduckgoUrl;
56
- }
57
-
58
- const url = new URL(duckduckgoUrl);
59
-
60
- // Extract direct URL from DuckDuckGo redirect
61
- if (url.hostname === 'duckduckgo.com' && url.pathname === '/l/') {
62
- const uddg = url.searchParams.get('uddg');
63
- if (uddg) {
64
- return decodeURIComponent(uddg);
65
- }
66
- }
67
-
68
- // Handle ad redirects
69
- if (url.hostname === 'duckduckgo.com' && url.pathname === '/y.js') {
70
- const u3 = url.searchParams.get('u3');
71
- if (u3) {
72
- try {
73
- const decodedU3 = decodeURIComponent(u3);
74
- const u3Url = new URL(decodedU3);
75
- const clickUrl = u3Url.searchParams.get('ld');
76
- if (clickUrl) {
77
- return decodeURIComponent(clickUrl);
78
- }
79
- return decodedU3;
80
- } catch {
81
- return duckduckgoUrl;
82
- }
83
- }
84
- }
85
-
86
- return duckduckgoUrl;
87
- } catch {
88
- // If URL parsing fails, try to extract URL from a basic string match
89
- const urlMatch = duckduckgoUrl.match(/https?:\/\/[^\s<>"]+/);
90
- if (urlMatch) {
91
- return urlMatch[0];
92
- }
93
- return duckduckgoUrl;
94
- }
95
- }
96
-
97
- /**
98
- * Get a favicon URL for a given website URL
99
- * @param {string} url - The website URL
100
- * @returns {string} The favicon URL
101
- */
102
- function getFaviconUrl(url) {
103
- try {
104
- const urlObj = new URL(url);
105
- return `https://www.google.com/s2/favicons?domain=${urlObj.hostname}&sz=32`;
106
- } catch {
107
- return ''; // Return empty string if URL is invalid
108
- }
109
- }
110
-
111
- /**
112
- * Generate a Jina AI URL for a given website URL
113
- * @param {string} url - The website URL
114
- * @returns {string} The Jina AI URL
115
- */
116
- function getJinaAiUrl(url) {
117
- try {
118
- const urlObj = new URL(url);
119
- return `https://r.jina.ai/${urlObj.href}`;
120
- } catch {
121
- return '';
122
- }
123
- }
124
-
125
- /**
126
- * Scrapes search results from DuckDuckGo HTML
127
- * @param {string} query - The search query
128
- * @param {number} numResults - Number of results to return (default: 10)
129
- * @param {string} mode - 'short' or 'detailed' mode (default: 'short')
130
- * @returns {Promise<Array>} - Array of search results
131
- */
132
- async function searchDuckDuckGo(query, numResults = 10, mode = 'short') {
133
- try {
134
- // Input validation
135
- if (!query || typeof query !== 'string') {
136
- throw new Error('Invalid query: query must be a non-empty string');
137
- }
138
-
139
- if (!Number.isInteger(numResults) || numResults < 1 || numResults > 20) {
140
- throw new Error('Invalid numResults: must be an integer between 1 and 20');
141
- }
142
-
143
- if (!['short', 'detailed'].includes(mode)) {
144
- throw new Error('Invalid mode: must be "short" or "detailed"');
145
- }
146
-
147
- // Clear old cache entries
148
- clearOldCache();
149
-
150
- // Check cache first
151
- const cacheKey = getCacheKey(query);
152
- const cachedResults = resultsCache.get(cacheKey);
153
-
154
- if (cachedResults && Date.now() - cachedResults.timestamp < CACHE_DURATION) {
155
- console.log(`Cache hit for query: "${query}"`);
156
- return cachedResults.results.slice(0, numResults);
157
- }
158
-
159
- // Get a random user agent
160
- const userAgent = getRandomUserAgent();
161
-
162
- console.log(`Searching DuckDuckGo for: "${query}" (${numResults} results, mode: ${mode})`);
163
-
164
- // Fetch results with timeout
165
- const controller = new AbortController();
166
- const timeoutId = setTimeout(() => controller.abort(), REQUEST_TIMEOUT);
167
-
168
- try {
169
- const response = await axios.get(
170
- `https://duckduckgo.com/html/?q=${encodeURIComponent(query)}`,
171
- {
172
- signal: controller.signal,
173
- headers: {
174
- 'User-Agent': userAgent
175
- },
176
- httpsAgent: httpsAgent,
177
- timeout: REQUEST_TIMEOUT
178
- }
179
- );
180
-
181
- clearTimeout(timeoutId);
182
-
183
- if (response.status !== 200) {
184
- throw new Error(`HTTP ${response.status}: Failed to fetch search results`);
185
- }
186
-
187
- const html = response.data;
188
-
189
- // Parse results using cheerio
190
- const $ = cheerio.load(html);
191
-
192
- const results = [];
193
- const jinaFetchPromises = [];
194
-
195
- $('.result').each((i, result) => {
196
- const $result = $(result);
197
- const titleEl = $result.find('.result__title a');
198
- const linkEl = $result.find('.result__url');
199
- const snippetEl = $result.find('.result__snippet');
200
-
201
- const title = titleEl.text()?.trim();
202
- const rawLink = titleEl.attr('href');
203
- const description = snippetEl.text()?.trim();
204
- const displayUrl = linkEl.text()?.trim();
205
-
206
- const directLink = extractDirectUrl(rawLink || '');
207
- const favicon = getFaviconUrl(directLink);
208
- const jinaUrl = getJinaAiUrl(directLink);
209
-
210
- if (title && directLink) {
211
- if (mode === 'detailed') {
212
- jinaFetchPromises.push(
213
- axios.get(jinaUrl, {
214
- headers: {
215
- 'User-Agent': getRandomUserAgent()
216
- },
217
- httpsAgent: httpsAgent,
218
- timeout: 8000
219
- })
220
- .then(jinaRes => {
221
- let jinaContent = '';
222
- if (jinaRes.status === 200 && typeof jinaRes.data === 'string') {
223
- const $jina = cheerio.load(jinaRes.data);
224
- jinaContent = $jina('body').text();
225
- }
226
- return {
227
- title,
228
- url: directLink,
229
- snippet: description || '',
230
- favicon: favicon,
231
- displayUrl: displayUrl || '',
232
- description: jinaContent
233
- };
234
- })
235
- .catch(() => {
236
- // Return fallback without content
237
- return {
238
- title,
239
- url: directLink,
240
- snippet: description || '',
241
- favicon: favicon,
242
- displayUrl: displayUrl || '',
243
- description: ''
244
- };
245
- })
246
- );
247
- } else {
248
- // short mode: omit description
249
- jinaFetchPromises.push(
250
- Promise.resolve({
251
- title,
252
- url: directLink,
253
- snippet: description || '',
254
- favicon: favicon,
255
- displayUrl: displayUrl || ''
256
- })
257
- );
258
- }
259
- }
260
- });
261
-
262
- // Wait for all Jina AI fetches to complete with timeout
263
- const jinaResults = await Promise.race([
264
- Promise.all(jinaFetchPromises),
265
- new Promise((_, reject) =>
266
- setTimeout(() => reject(new Error('Content fetch timeout')), 15000)
267
- )
268
- ]);
269
-
270
- results.push(...jinaResults);
271
-
272
- // Get limited results
273
- const limitedResults = results.slice(0, numResults);
274
-
275
- // Cache the results
276
- resultsCache.set(cacheKey, {
277
- results: limitedResults,
278
- timestamp: Date.now()
279
- });
280
-
281
- // If cache is too big, remove oldest entries
282
- if (resultsCache.size > MAX_CACHE_PAGES) {
283
- const oldestKey = Array.from(resultsCache.keys())[0];
284
- resultsCache.delete(oldestKey);
285
- }
286
-
287
- console.log(`Found ${limitedResults.length} results for query: "${query}"`);
288
- return limitedResults;
289
- } catch (fetchError) {
290
- clearTimeout(timeoutId);
291
-
292
- if (fetchError.name === 'AbortError') {
293
- throw new Error('Search request timeout: took longer than 10 seconds');
294
- }
295
-
296
- if (fetchError.code === 'ENOTFOUND') {
297
- throw new Error('Network error: unable to resolve host');
298
- }
299
-
300
- if (fetchError.code === 'ECONNREFUSED') {
301
- throw new Error('Network error: connection refused');
302
- }
303
-
304
- throw fetchError;
305
- }
306
- } catch (error) {
307
- console.error('Error searching DuckDuckGo:', error.message);
308
-
309
- // Enhanced error reporting
310
- if (error.message.includes('Invalid')) {
311
- throw error; // Re-throw validation errors as-is
312
- }
313
-
314
- throw new Error(`Search failed for "${query}": ${error.message}`);
315
- }
316
- }
317
-
318
- export {
319
- searchDuckDuckGo,
320
- extractDirectUrl,
321
- getFaviconUrl,
322
- getJinaAiUrl
1
+ import axios from 'axios';
2
+ import * as cheerio from 'cheerio';
3
+ import https from 'https';
4
+ import { getRandomUserAgent } from './user_agents.js';
5
+
6
+ // Constants
7
+ const MAX_CACHE_PAGES = 5;
8
+ const CACHE_DURATION = 5 * 60 * 1000; // 5 minutes
9
+ const REQUEST_TIMEOUT = 10000; // 10 seconds
10
+
11
+ // Cache results to avoid repeated requests
12
+ const resultsCache = new Map();
13
+
14
+ // HTTPS agent configuration to handle certificate chain issues
15
+ const httpsAgent = new https.Agent({
16
+ rejectUnauthorized: true, // Keep security enabled
17
+ keepAlive: true,
18
+ timeout: REQUEST_TIMEOUT,
19
+ // Provide fallback for certificate issues while maintaining security
20
+ secureProtocol: 'TLSv1_2_method'
21
+ });
22
+
23
+ /**
24
+ * Generate a cache key for a search query
25
+ * @param {string} query - The search query
26
+ * @returns {string} The cache key
27
+ */
28
+ function getCacheKey(query) {
29
+ return `${query}`;
30
+ }
31
+
32
+ /**
33
+ * Clear old entries from the cache
34
+ */
35
+ function clearOldCache() {
36
+ const now = Date.now();
37
+ for (const [key, value] of resultsCache.entries()) {
38
+ if (now - value.timestamp > CACHE_DURATION) {
39
+ resultsCache.delete(key);
40
+ }
41
+ }
42
+ }
43
+
44
+ /**
45
+ * Extract the direct URL from a DuckDuckGo redirect URL
46
+ * @param {string} duckduckgoUrl - The DuckDuckGo URL to extract from
47
+ * @returns {string} The direct URL
48
+ */
49
+ function extractDirectUrl(duckduckgoUrl) {
50
+ try {
51
+ // Handle relative URLs from DuckDuckGo
52
+ if (duckduckgoUrl.startsWith('//')) {
53
+ duckduckgoUrl = 'https:' + duckduckgoUrl;
54
+ } else if (duckduckgoUrl.startsWith('/')) {
55
+ duckduckgoUrl = 'https://duckduckgo.com' + duckduckgoUrl;
56
+ }
57
+
58
+ const url = new URL(duckduckgoUrl);
59
+
60
+ // Extract direct URL from DuckDuckGo redirect
61
+ if (url.hostname === 'duckduckgo.com' && url.pathname === '/l/') {
62
+ const uddg = url.searchParams.get('uddg');
63
+ if (uddg) {
64
+ return decodeURIComponent(uddg);
65
+ }
66
+ }
67
+
68
+ // Handle ad redirects
69
+ if (url.hostname === 'duckduckgo.com' && url.pathname === '/y.js') {
70
+ const u3 = url.searchParams.get('u3');
71
+ if (u3) {
72
+ try {
73
+ const decodedU3 = decodeURIComponent(u3);
74
+ const u3Url = new URL(decodedU3);
75
+ const clickUrl = u3Url.searchParams.get('ld');
76
+ if (clickUrl) {
77
+ return decodeURIComponent(clickUrl);
78
+ }
79
+ return decodedU3;
80
+ } catch {
81
+ return duckduckgoUrl;
82
+ }
83
+ }
84
+ }
85
+
86
+ return duckduckgoUrl;
87
+ } catch {
88
+ // If URL parsing fails, try to extract URL from a basic string match
89
+ const urlMatch = duckduckgoUrl.match(/https?:\/\/[^\s<>"]+/);
90
+ if (urlMatch) {
91
+ return urlMatch[0];
92
+ }
93
+ return duckduckgoUrl;
94
+ }
95
+ }
96
+
97
+ /**
98
+ * Get a favicon URL for a given website URL
99
+ * @param {string} url - The website URL
100
+ * @returns {string} The favicon URL
101
+ */
102
+ function getFaviconUrl(url) {
103
+ try {
104
+ const urlObj = new URL(url);
105
+ return `https://www.google.com/s2/favicons?domain=${urlObj.hostname}&sz=32`;
106
+ } catch {
107
+ return ''; // Return empty string if URL is invalid
108
+ }
109
+ }
110
+
111
+ /**
112
+ * Generate a Jina AI URL for a given website URL
113
+ * @param {string} url - The website URL
114
+ * @returns {string} The Jina AI URL
115
+ */
116
+ function getJinaAiUrl(url) {
117
+ try {
118
+ const urlObj = new URL(url);
119
+ return `https://r.jina.ai/${urlObj.href}`;
120
+ } catch {
121
+ return '';
122
+ }
123
+ }
124
+
125
+ /**
126
+ * Scrapes search results from DuckDuckGo HTML
127
+ * @param {string} query - The search query
128
+ * @param {number} numResults - Number of results to return (default: 10)
129
+ * @param {string} mode - 'short' or 'detailed' mode (default: 'short')
130
+ * @returns {Promise<Array>} - Array of search results
131
+ */
132
+ async function searchDuckDuckGo(query, numResults = 10, mode = 'short') {
133
+ try {
134
+ // Input validation
135
+ if (!query || typeof query !== 'string') {
136
+ throw new Error('Invalid query: query must be a non-empty string');
137
+ }
138
+
139
+ if (!Number.isInteger(numResults) || numResults < 1 || numResults > 20) {
140
+ throw new Error('Invalid numResults: must be an integer between 1 and 20');
141
+ }
142
+
143
+ if (!['short', 'detailed'].includes(mode)) {
144
+ throw new Error('Invalid mode: must be "short" or "detailed"');
145
+ }
146
+
147
+ // Clear old cache entries
148
+ clearOldCache();
149
+
150
+ // Check cache first
151
+ const cacheKey = getCacheKey(query);
152
+ const cachedResults = resultsCache.get(cacheKey);
153
+
154
+ if (cachedResults && Date.now() - cachedResults.timestamp < CACHE_DURATION) {
155
+ console.log(`Cache hit for query: "${query}"`);
156
+ return cachedResults.results.slice(0, numResults);
157
+ }
158
+
159
+ // Get a random user agent
160
+ const userAgent = getRandomUserAgent();
161
+
162
+ console.log(`Searching DuckDuckGo for: "${query}" (${numResults} results, mode: ${mode})`);
163
+
164
+ // Fetch results with timeout
165
+ const controller = new AbortController();
166
+ const timeoutId = setTimeout(() => controller.abort(), REQUEST_TIMEOUT);
167
+
168
+ try {
169
+ const response = await axios.get(
170
+ `https://duckduckgo.com/html/?q=${encodeURIComponent(query)}`,
171
+ {
172
+ signal: controller.signal,
173
+ headers: {
174
+ 'User-Agent': userAgent
175
+ },
176
+ httpsAgent: httpsAgent,
177
+ timeout: REQUEST_TIMEOUT
178
+ }
179
+ );
180
+
181
+ clearTimeout(timeoutId);
182
+
183
+ if (response.status !== 200) {
184
+ throw new Error(`HTTP ${response.status}: Failed to fetch search results`);
185
+ }
186
+
187
+ const html = response.data;
188
+
189
+ // Parse results using cheerio
190
+ const $ = cheerio.load(html);
191
+
192
+ const results = [];
193
+ const jinaFetchPromises = [];
194
+
195
+ $('.result').each((i, result) => {
196
+ const $result = $(result);
197
+ const titleEl = $result.find('.result__title a');
198
+ const linkEl = $result.find('.result__url');
199
+ const snippetEl = $result.find('.result__snippet');
200
+
201
+ const title = titleEl.text()?.trim();
202
+ const rawLink = titleEl.attr('href');
203
+ const description = snippetEl.text()?.trim();
204
+ const displayUrl = linkEl.text()?.trim();
205
+
206
+ const directLink = extractDirectUrl(rawLink || '');
207
+ const favicon = getFaviconUrl(directLink);
208
+ const jinaUrl = getJinaAiUrl(directLink);
209
+
210
+ if (title && directLink) {
211
+ if (mode === 'detailed') {
212
+ jinaFetchPromises.push(
213
+ axios.get(jinaUrl, {
214
+ headers: {
215
+ 'User-Agent': getRandomUserAgent()
216
+ },
217
+ httpsAgent: httpsAgent,
218
+ timeout: 8000
219
+ })
220
+ .then(jinaRes => {
221
+ let jinaContent = '';
222
+ if (jinaRes.status === 200 && typeof jinaRes.data === 'string') {
223
+ const $jina = cheerio.load(jinaRes.data);
224
+ jinaContent = $jina('body').text();
225
+ }
226
+ return {
227
+ title,
228
+ url: directLink,
229
+ snippet: description || '',
230
+ favicon: favicon,
231
+ displayUrl: displayUrl || '',
232
+ description: jinaContent
233
+ };
234
+ })
235
+ .catch(() => {
236
+ // Return fallback without content
237
+ return {
238
+ title,
239
+ url: directLink,
240
+ snippet: description || '',
241
+ favicon: favicon,
242
+ displayUrl: displayUrl || '',
243
+ description: ''
244
+ };
245
+ })
246
+ );
247
+ } else {
248
+ // short mode: omit description
249
+ jinaFetchPromises.push(
250
+ Promise.resolve({
251
+ title,
252
+ url: directLink,
253
+ snippet: description || '',
254
+ favicon: favicon,
255
+ displayUrl: displayUrl || ''
256
+ })
257
+ );
258
+ }
259
+ }
260
+ });
261
+
262
+ // Wait for all Jina AI fetches to complete with timeout
263
+ const jinaResults = await Promise.race([
264
+ Promise.all(jinaFetchPromises),
265
+ new Promise((_, reject) =>
266
+ setTimeout(() => reject(new Error('Content fetch timeout')), 15000)
267
+ )
268
+ ]);
269
+
270
+ results.push(...jinaResults);
271
+
272
+ // Get limited results
273
+ const limitedResults = results.slice(0, numResults);
274
+
275
+ // Cache the results
276
+ resultsCache.set(cacheKey, {
277
+ results: limitedResults,
278
+ timestamp: Date.now()
279
+ });
280
+
281
+ // If cache is too big, remove oldest entries
282
+ if (resultsCache.size > MAX_CACHE_PAGES) {
283
+ const oldestKey = Array.from(resultsCache.keys())[0];
284
+ resultsCache.delete(oldestKey);
285
+ }
286
+
287
+ console.log(`Found ${limitedResults.length} results for query: "${query}"`);
288
+ return limitedResults;
289
+ } catch (fetchError) {
290
+ clearTimeout(timeoutId);
291
+
292
+ if (fetchError.name === 'AbortError') {
293
+ throw new Error('Search request timeout: took longer than 10 seconds');
294
+ }
295
+
296
+ if (fetchError.code === 'ENOTFOUND') {
297
+ throw new Error('Network error: unable to resolve host');
298
+ }
299
+
300
+ if (fetchError.code === 'ECONNREFUSED') {
301
+ throw new Error('Network error: connection refused');
302
+ }
303
+
304
+ throw fetchError;
305
+ }
306
+ } catch (error) {
307
+ console.error('Error searching DuckDuckGo:', error.message);
308
+
309
+ // Enhanced error reporting
310
+ if (error.message.includes('Invalid')) {
311
+ throw error; // Re-throw validation errors as-is
312
+ }
313
+
314
+ throw new Error(`Search failed for "${query}": ${error.message}`);
315
+ }
316
+ }
317
+
318
+ export {
319
+ searchDuckDuckGo,
320
+ extractDirectUrl,
321
+ getFaviconUrl,
322
+ getJinaAiUrl
323
323
  };