@oevortex/ddg_search 1.0.2 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +226 -223
- package/bin/cli.js +146 -144
- package/package.json +1 -38
- package/src/index.js +82 -78
- package/src/tools/feloTool.js +85 -0
- package/src/tools/fetchUrlTool.js +118 -118
- package/src/tools/metadataTool.js +58 -58
- package/src/tools/searchTool.js +64 -64
- package/src/utils/search.js +401 -401
- package/src/utils/search_felo.js +204 -0
package/src/utils/search.js
CHANGED
|
@@ -1,401 +1,401 @@
|
|
|
1
|
-
import axios from 'axios';
|
|
2
|
-
import { JSDOM } from 'jsdom';
|
|
3
|
-
|
|
4
|
-
// Constants
|
|
5
|
-
const RESULTS_PER_PAGE = 10;
|
|
6
|
-
const MAX_CACHE_PAGES = 5;
|
|
7
|
-
|
|
8
|
-
// Rotating User Agents
|
|
9
|
-
const USER_AGENTS = [
|
|
10
|
-
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
|
11
|
-
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Edge/120.0.0.0',
|
|
12
|
-
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2.1 Safari/605.1.15',
|
|
13
|
-
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0',
|
|
14
|
-
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
|
|
15
|
-
];
|
|
16
|
-
|
|
17
|
-
// Cache results to avoid repeated requests
|
|
18
|
-
const resultsCache = new Map();
|
|
19
|
-
const CACHE_DURATION = 5 * 60 * 1000; // 5 minutes
|
|
20
|
-
|
|
21
|
-
/**
|
|
22
|
-
* Get a random user agent from the list
|
|
23
|
-
* @returns {string} A random user agent string
|
|
24
|
-
*/
|
|
25
|
-
function getRandomUserAgent() {
|
|
26
|
-
return USER_AGENTS[Math.floor(Math.random() * USER_AGENTS.length)];
|
|
27
|
-
}
|
|
28
|
-
|
|
29
|
-
/**
|
|
30
|
-
* Generate a cache key for a search query and page
|
|
31
|
-
* @param {string} query - The search query
|
|
32
|
-
* @param {number} page - The page number
|
|
33
|
-
* @returns {string} The cache key
|
|
34
|
-
*/
|
|
35
|
-
function getCacheKey(query, page) {
|
|
36
|
-
return `${query}-${page}`;
|
|
37
|
-
}
|
|
38
|
-
|
|
39
|
-
/**
|
|
40
|
-
* Clear old entries from the cache
|
|
41
|
-
*/
|
|
42
|
-
function clearOldCache() {
|
|
43
|
-
const now = Date.now();
|
|
44
|
-
for (const [key, value] of resultsCache.entries()) {
|
|
45
|
-
if (now - value.timestamp > CACHE_DURATION) {
|
|
46
|
-
resultsCache.delete(key);
|
|
47
|
-
}
|
|
48
|
-
}
|
|
49
|
-
}
|
|
50
|
-
|
|
51
|
-
/**
|
|
52
|
-
* Extract the direct URL from a DuckDuckGo redirect URL
|
|
53
|
-
* @param {string} duckduckgoUrl - The DuckDuckGo URL to extract from
|
|
54
|
-
* @returns {string} The direct URL
|
|
55
|
-
*/
|
|
56
|
-
function extractDirectUrl(duckduckgoUrl) {
|
|
57
|
-
try {
|
|
58
|
-
// Handle relative URLs from DuckDuckGo
|
|
59
|
-
if (duckduckgoUrl.startsWith('//')) {
|
|
60
|
-
duckduckgoUrl = 'https:' + duckduckgoUrl;
|
|
61
|
-
} else if (duckduckgoUrl.startsWith('/')) {
|
|
62
|
-
duckduckgoUrl = 'https://duckduckgo.com' + duckduckgoUrl;
|
|
63
|
-
}
|
|
64
|
-
|
|
65
|
-
const url = new URL(duckduckgoUrl);
|
|
66
|
-
|
|
67
|
-
// Extract direct URL from DuckDuckGo redirect
|
|
68
|
-
if (url.hostname === 'duckduckgo.com' && url.pathname === '/l/') {
|
|
69
|
-
const uddg = url.searchParams.get('uddg');
|
|
70
|
-
if (uddg) {
|
|
71
|
-
return decodeURIComponent(uddg);
|
|
72
|
-
}
|
|
73
|
-
}
|
|
74
|
-
|
|
75
|
-
// Handle ad redirects
|
|
76
|
-
if (url.hostname === 'duckduckgo.com' && url.pathname === '/y.js') {
|
|
77
|
-
const u3 = url.searchParams.get('u3');
|
|
78
|
-
if (u3) {
|
|
79
|
-
try {
|
|
80
|
-
const decodedU3 = decodeURIComponent(u3);
|
|
81
|
-
const u3Url = new URL(decodedU3);
|
|
82
|
-
const clickUrl = u3Url.searchParams.get('ld');
|
|
83
|
-
if (clickUrl) {
|
|
84
|
-
return decodeURIComponent(clickUrl);
|
|
85
|
-
}
|
|
86
|
-
return decodedU3;
|
|
87
|
-
} catch {
|
|
88
|
-
return duckduckgoUrl;
|
|
89
|
-
}
|
|
90
|
-
}
|
|
91
|
-
}
|
|
92
|
-
|
|
93
|
-
return duckduckgoUrl;
|
|
94
|
-
} catch {
|
|
95
|
-
// If URL parsing fails, try to extract URL from a basic string match
|
|
96
|
-
const urlMatch = duckduckgoUrl.match(/https?:\/\/[^\s<>"]+/);
|
|
97
|
-
if (urlMatch) {
|
|
98
|
-
return urlMatch[0];
|
|
99
|
-
}
|
|
100
|
-
return duckduckgoUrl;
|
|
101
|
-
}
|
|
102
|
-
}
|
|
103
|
-
|
|
104
|
-
/**
|
|
105
|
-
* Get a favicon URL for a given website URL
|
|
106
|
-
* @param {string} url - The website URL
|
|
107
|
-
* @returns {string} The favicon URL
|
|
108
|
-
*/
|
|
109
|
-
function getFaviconUrl(url) {
|
|
110
|
-
try {
|
|
111
|
-
const urlObj = new URL(url);
|
|
112
|
-
return `https://www.google.com/s2/favicons?domain=${urlObj.hostname}&sz=32`;
|
|
113
|
-
} catch {
|
|
114
|
-
return ''; // Return empty string if URL is invalid
|
|
115
|
-
}
|
|
116
|
-
}
|
|
117
|
-
|
|
118
|
-
/**
|
|
119
|
-
* Scrapes search results from DuckDuckGo HTML
|
|
120
|
-
* @param {string} query - The search query
|
|
121
|
-
* @param {number} page - The page number (default: 1)
|
|
122
|
-
* @param {number} numResults - Number of results to return (default: 10)
|
|
123
|
-
* @returns {Promise<Array>} - Array of search results
|
|
124
|
-
*/
|
|
125
|
-
async function searchDuckDuckGo(query, page = 1, numResults = 10) {
|
|
126
|
-
try {
|
|
127
|
-
// Clear old cache entries
|
|
128
|
-
clearOldCache();
|
|
129
|
-
|
|
130
|
-
// Calculate start index for pagination
|
|
131
|
-
const startIndex = (page - 1) * RESULTS_PER_PAGE;
|
|
132
|
-
|
|
133
|
-
// Check cache first
|
|
134
|
-
const cacheKey = getCacheKey(query, page);
|
|
135
|
-
const cachedResults = resultsCache.get(cacheKey);
|
|
136
|
-
|
|
137
|
-
if (cachedResults && Date.now() - cachedResults.timestamp < CACHE_DURATION) {
|
|
138
|
-
return cachedResults.results.slice(0, numResults);
|
|
139
|
-
}
|
|
140
|
-
|
|
141
|
-
// Get a random user agent
|
|
142
|
-
const userAgent = getRandomUserAgent();
|
|
143
|
-
|
|
144
|
-
// Fetch results
|
|
145
|
-
const response = await axios.get(
|
|
146
|
-
`https://duckduckgo.com/html/?q=${encodeURIComponent(query)}&s=${startIndex}`,
|
|
147
|
-
{
|
|
148
|
-
headers: {
|
|
149
|
-
'User-Agent': userAgent
|
|
150
|
-
}
|
|
151
|
-
}
|
|
152
|
-
);
|
|
153
|
-
|
|
154
|
-
if (response.status !== 200) {
|
|
155
|
-
throw new Error('Failed to fetch search results');
|
|
156
|
-
}
|
|
157
|
-
|
|
158
|
-
const html = response.data;
|
|
159
|
-
|
|
160
|
-
// Parse results using JSDOM
|
|
161
|
-
const dom = new JSDOM(html);
|
|
162
|
-
const document = dom.window.document;
|
|
163
|
-
|
|
164
|
-
const results = [];
|
|
165
|
-
const searchResults = document.querySelectorAll('.result');
|
|
166
|
-
|
|
167
|
-
searchResults.forEach((result) => {
|
|
168
|
-
const titleEl = result.querySelector('.result__title a');
|
|
169
|
-
const linkEl = result.querySelector('.result__url');
|
|
170
|
-
const snippetEl = result.querySelector('.result__snippet');
|
|
171
|
-
|
|
172
|
-
const title = titleEl?.textContent?.trim();
|
|
173
|
-
const rawLink = titleEl?.getAttribute('href');
|
|
174
|
-
const description = snippetEl?.textContent?.trim();
|
|
175
|
-
const displayUrl = linkEl?.textContent?.trim();
|
|
176
|
-
|
|
177
|
-
const directLink = extractDirectUrl(rawLink || '');
|
|
178
|
-
const favicon = getFaviconUrl(directLink);
|
|
179
|
-
|
|
180
|
-
if (title && directLink) {
|
|
181
|
-
results.push({
|
|
182
|
-
title,
|
|
183
|
-
url: directLink,
|
|
184
|
-
snippet: description || '',
|
|
185
|
-
favicon: favicon,
|
|
186
|
-
displayUrl: displayUrl || ''
|
|
187
|
-
});
|
|
188
|
-
}
|
|
189
|
-
});
|
|
190
|
-
|
|
191
|
-
// Get paginated results
|
|
192
|
-
const paginatedResults = results.slice(0, numResults);
|
|
193
|
-
|
|
194
|
-
// Cache the results
|
|
195
|
-
resultsCache.set(cacheKey, {
|
|
196
|
-
results: paginatedResults,
|
|
197
|
-
timestamp: Date.now()
|
|
198
|
-
});
|
|
199
|
-
|
|
200
|
-
// If cache is too big, remove oldest entries
|
|
201
|
-
if (resultsCache.size > MAX_CACHE_PAGES) {
|
|
202
|
-
const oldestKey = Array.from(resultsCache.keys())[0];
|
|
203
|
-
resultsCache.delete(oldestKey);
|
|
204
|
-
}
|
|
205
|
-
|
|
206
|
-
return paginatedResults;
|
|
207
|
-
} catch (error) {
|
|
208
|
-
console.error('Error searching DuckDuckGo:', error.message);
|
|
209
|
-
throw error;
|
|
210
|
-
}
|
|
211
|
-
}
|
|
212
|
-
|
|
213
|
-
/**
|
|
214
|
-
* Fetches the content of a URL and returns it as text
|
|
215
|
-
* @param {string} url - The URL to fetch
|
|
216
|
-
* @param {Object} options - Options for content extraction
|
|
217
|
-
* @param {boolean} options.extractMainContent - Whether to attempt to extract main content (default: true)
|
|
218
|
-
* @param {boolean} options.includeLinks - Whether to include link text (default: true)
|
|
219
|
-
* @param {boolean} options.includeImages - Whether to include image alt text (default: true)
|
|
220
|
-
* @param {string[]} options.excludeTags - Tags to exclude from extraction
|
|
221
|
-
* @returns {Promise<string>} - The content of the URL
|
|
222
|
-
*/
|
|
223
|
-
async function fetchUrlContent(url, options = {}) {
|
|
224
|
-
try {
|
|
225
|
-
// Default options
|
|
226
|
-
const {
|
|
227
|
-
extractMainContent = true,
|
|
228
|
-
includeLinks = true,
|
|
229
|
-
includeImages = true,
|
|
230
|
-
excludeTags = ['script', 'style', 'noscript', 'iframe', 'svg', 'nav', 'footer', 'header', 'aside']
|
|
231
|
-
} = options;
|
|
232
|
-
|
|
233
|
-
// Get a random user agent
|
|
234
|
-
const userAgent = getRandomUserAgent();
|
|
235
|
-
|
|
236
|
-
const response = await axios.get(url, {
|
|
237
|
-
headers: {
|
|
238
|
-
'User-Agent': userAgent
|
|
239
|
-
},
|
|
240
|
-
timeout: 10000 // 10 second timeout
|
|
241
|
-
});
|
|
242
|
-
|
|
243
|
-
if (response.status !== 200) {
|
|
244
|
-
throw new Error(`Failed to fetch URL: ${url}`);
|
|
245
|
-
}
|
|
246
|
-
|
|
247
|
-
// If the content is HTML, extract the text content
|
|
248
|
-
const contentType = response.headers['content-type'] || '';
|
|
249
|
-
if (contentType.includes('text/html')) {
|
|
250
|
-
const dom = new JSDOM(response.data);
|
|
251
|
-
const document = dom.window.document;
|
|
252
|
-
|
|
253
|
-
// Remove unwanted elements
|
|
254
|
-
excludeTags.forEach(tag => {
|
|
255
|
-
const elements = document.querySelectorAll(tag);
|
|
256
|
-
elements.forEach(el => el.remove());
|
|
257
|
-
});
|
|
258
|
-
|
|
259
|
-
// Remove ads and other common unwanted elements
|
|
260
|
-
const unwantedSelectors = [
|
|
261
|
-
'[id*="ad"]', '[class*="ad"]', '[id*="banner"]', '[class*="banner"]',
|
|
262
|
-
'[id*="popup"]', '[class*="popup"]', '[class*="cookie"]',
|
|
263
|
-
'[id*="cookie"]', '[class*="newsletter"]', '[id*="newsletter"]',
|
|
264
|
-
'[class*="social"]', '[id*="social"]', '[class*="share"]', '[id*="share"]'
|
|
265
|
-
];
|
|
266
|
-
|
|
267
|
-
unwantedSelectors.forEach(selector => {
|
|
268
|
-
try {
|
|
269
|
-
const elements = document.querySelectorAll(selector);
|
|
270
|
-
elements.forEach(el => el.remove());
|
|
271
|
-
} catch (e) {
|
|
272
|
-
// Ignore invalid selectors
|
|
273
|
-
}
|
|
274
|
-
});
|
|
275
|
-
|
|
276
|
-
// Handle links and images
|
|
277
|
-
if (!includeLinks) {
|
|
278
|
-
const links = document.querySelectorAll('a');
|
|
279
|
-
links.forEach(link => {
|
|
280
|
-
const span = document.createElement('span');
|
|
281
|
-
span.textContent = link.textContent;
|
|
282
|
-
link.parentNode.replaceChild(span, link);
|
|
283
|
-
});
|
|
284
|
-
}
|
|
285
|
-
|
|
286
|
-
if (!includeImages) {
|
|
287
|
-
const images = document.querySelectorAll('img');
|
|
288
|
-
images.forEach(img => img.remove());
|
|
289
|
-
} else {
|
|
290
|
-
// Replace images with their alt text
|
|
291
|
-
const images = document.querySelectorAll('img');
|
|
292
|
-
images.forEach(img => {
|
|
293
|
-
const alt = img.getAttribute('alt');
|
|
294
|
-
if (alt) {
|
|
295
|
-
const span = document.createElement('span');
|
|
296
|
-
span.textContent = `[Image: ${alt}]`;
|
|
297
|
-
img.parentNode.replaceChild(span, img);
|
|
298
|
-
} else {
|
|
299
|
-
img.remove();
|
|
300
|
-
}
|
|
301
|
-
});
|
|
302
|
-
}
|
|
303
|
-
|
|
304
|
-
// Try to extract main content if requested
|
|
305
|
-
if (extractMainContent) {
|
|
306
|
-
// Common content selectors in order of priority
|
|
307
|
-
const contentSelectors = [
|
|
308
|
-
'article', 'main', '[role="main"]', '.post-content', '.article-content',
|
|
309
|
-
'.content', '#content', '.post', '.article', '.entry-content',
|
|
310
|
-
'.page-content', '.post-body', '.post-text', '.story-body'
|
|
311
|
-
];
|
|
312
|
-
|
|
313
|
-
for (const selector of contentSelectors) {
|
|
314
|
-
const mainContent = document.querySelector(selector);
|
|
315
|
-
if (mainContent) {
|
|
316
|
-
// Clean up the content
|
|
317
|
-
return cleanText(mainContent.textContent);
|
|
318
|
-
}
|
|
319
|
-
}
|
|
320
|
-
}
|
|
321
|
-
|
|
322
|
-
// If no main content found or not requested, use the body
|
|
323
|
-
return cleanText(document.body.textContent);
|
|
324
|
-
}
|
|
325
|
-
|
|
326
|
-
// For non-HTML content, return as is
|
|
327
|
-
return response.data.toString();
|
|
328
|
-
} catch (error) {
|
|
329
|
-
console.error('Error fetching URL content:', error.message);
|
|
330
|
-
throw error;
|
|
331
|
-
}
|
|
332
|
-
}
|
|
333
|
-
|
|
334
|
-
/**
|
|
335
|
-
* Cleans up text by removing excessive whitespace and normalizing line breaks
|
|
336
|
-
* @param {string} text - The text to clean
|
|
337
|
-
* @returns {string} - The cleaned text
|
|
338
|
-
*/
|
|
339
|
-
function cleanText(text) {
|
|
340
|
-
return text
|
|
341
|
-
.replace(/\s+/g, ' ') // Replace multiple whitespace with single space
|
|
342
|
-
.replace(/\n\s*\n/g, '\n\n') // Normalize multiple line breaks
|
|
343
|
-
.replace(/^\s+|\s+$/g, '') // Trim start and end
|
|
344
|
-
.trim();
|
|
345
|
-
}
|
|
346
|
-
|
|
347
|
-
/**
|
|
348
|
-
* Extracts metadata from a URL (title, description, etc.)
|
|
349
|
-
* @param {string} url - The URL to extract metadata from
|
|
350
|
-
* @returns {Promise<Object>} - The metadata
|
|
351
|
-
*/
|
|
352
|
-
async function extractUrlMetadata(url) {
|
|
353
|
-
try {
|
|
354
|
-
// Get a random user agent
|
|
355
|
-
const userAgent = getRandomUserAgent();
|
|
356
|
-
|
|
357
|
-
const response = await axios.get(url, {
|
|
358
|
-
headers: {
|
|
359
|
-
'User-Agent': userAgent
|
|
360
|
-
}
|
|
361
|
-
});
|
|
362
|
-
|
|
363
|
-
if (response.status !== 200) {
|
|
364
|
-
throw new Error(`Failed to fetch URL: ${url}`);
|
|
365
|
-
}
|
|
366
|
-
|
|
367
|
-
const dom = new JSDOM(response.data);
|
|
368
|
-
const document = dom.window.document;
|
|
369
|
-
|
|
370
|
-
// Extract metadata
|
|
371
|
-
const title = document.querySelector('title')?.textContent || '';
|
|
372
|
-
const description = document.querySelector('meta[name="description"]')?.getAttribute('content') ||
|
|
373
|
-
document.querySelector('meta[property="og:description"]')?.getAttribute('content') || '';
|
|
374
|
-
const ogImage = document.querySelector('meta[property="og:image"]')?.getAttribute('content') || '';
|
|
375
|
-
const favicon = document.querySelector('link[rel="icon"]')?.getAttribute('href') ||
|
|
376
|
-
document.querySelector('link[rel="shortcut icon"]')?.getAttribute('href') || '';
|
|
377
|
-
|
|
378
|
-
// Resolve relative URLs
|
|
379
|
-
const resolvedFavicon = favicon ? new URL(favicon, url).href : getFaviconUrl(url);
|
|
380
|
-
const resolvedOgImage = ogImage ? new URL(ogImage, url).href : '';
|
|
381
|
-
|
|
382
|
-
return {
|
|
383
|
-
title,
|
|
384
|
-
description,
|
|
385
|
-
ogImage: resolvedOgImage,
|
|
386
|
-
favicon: resolvedFavicon,
|
|
387
|
-
url
|
|
388
|
-
};
|
|
389
|
-
} catch (error) {
|
|
390
|
-
console.error('Error extracting URL metadata:', error.message);
|
|
391
|
-
throw error;
|
|
392
|
-
}
|
|
393
|
-
}
|
|
394
|
-
|
|
395
|
-
export {
|
|
396
|
-
searchDuckDuckGo,
|
|
397
|
-
fetchUrlContent,
|
|
398
|
-
extractUrlMetadata,
|
|
399
|
-
extractDirectUrl,
|
|
400
|
-
getFaviconUrl
|
|
401
|
-
};
|
|
1
|
+
import axios from 'axios';
|
|
2
|
+
import { JSDOM } from 'jsdom';
|
|
3
|
+
|
|
4
|
+
// Constants
// Step used to turn a 1-based page number into DuckDuckGo's `s` result offset.
const RESULTS_PER_PAGE = 10;
// Upper bound on the number of entries kept in resultsCache.
const MAX_CACHE_PAGES = 5;

// Rotating User Agents: one entry is picked at random for each outgoing request.
// The list is frozen so that no caller can mutate the shared array.
const USER_AGENTS = Object.freeze([
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  // Chromium-based Edge sends the full Chrome token list followed by "Edg/<version>";
  // a bare "Edge/<version>" string matches no real browser.
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2.1 Safari/605.1.15',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0',
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
]);

// Cache results to avoid repeated requests.
// Keys come from getCacheKey(); values are { results, timestamp } objects.
const resultsCache = new Map();
const CACHE_DURATION = 5 * 60 * 1000; // 5 minutes, in milliseconds
|
|
20
|
+
|
|
21
|
+
/**
 * Pick one entry of USER_AGENTS uniformly at random.
 * @returns {string} A user agent string taken from USER_AGENTS
 */
function getRandomUserAgent() {
  // Math.random() is in [0, 1), so the floored product is always a valid index.
  const pick = Math.floor(Math.random() * USER_AGENTS.length);
  return USER_AGENTS[pick];
}
|
|
28
|
+
|
|
29
|
+
/**
 * Build the resultsCache key for one page of one query.
 * @param {string} query - The search query
 * @param {number} page - The page number
 * @returns {string} The query and the page joined by a single "-"
 */
function getCacheKey(query, page) {
  // String.prototype.concat stringifies each argument the same way a template literal does.
  return ''.concat(query, '-', page);
}
|
|
38
|
+
|
|
39
|
+
/**
 * Drop every resultsCache entry whose timestamp is more than
 * CACHE_DURATION milliseconds in the past.
 */
function clearOldCache() {
  const currentTime = Date.now();
  // Deleting the entry currently being visited is safe during Map iteration.
  resultsCache.forEach((entry, cacheKey) => {
    const age = currentTime - entry.timestamp;
    if (age > CACHE_DURATION) {
      resultsCache.delete(cacheKey);
    }
  });
}
|
|
50
|
+
|
|
51
|
+
/**
 * Extract the direct URL from a DuckDuckGo redirect URL
 *
 * Recognised inputs:
 *  - protocol-relative ("//duckduckgo.com/...") and root-relative ("/l/?...")
 *    hrefs, which are first made absolute;
 *  - organic redirects: duckduckgo.com/l/ with a `uddg` parameter;
 *  - ad redirects: duckduckgo.com/y.js with a `u3` parameter; when the `u3`
 *    URL itself carries an `ld` parameter that value is returned, otherwise
 *    the `u3` URL is.
 * Anything else is returned as-is (after the relative-href normalisation).
 * If the input cannot be parsed as a URL, the first http(s) URL found inside
 * the string is returned, or the string itself when there is none.
 *
 * @param {string} duckduckgoUrl - The DuckDuckGo URL to extract from
 * @returns {string} The direct URL; '' when the input is not a string
 */
function extractDirectUrl(duckduckgoUrl) {
  if (typeof duckduckgoUrl !== 'string') {
    return '';
  }

  // Handle relative URLs from DuckDuckGo (work on a local copy, not the parameter)
  let candidate = duckduckgoUrl;
  if (candidate.startsWith('//')) {
    candidate = `https:${candidate}`;
  } else if (candidate.startsWith('/')) {
    candidate = `https://duckduckgo.com${candidate}`;
  }

  let parsed;
  try {
    parsed = new URL(candidate);
  } catch {
    // If URL parsing fails, try to extract URL from a basic string match
    const urlMatch = candidate.match(/https?:\/\/[^\s<>"]+/);
    return urlMatch ? urlMatch[0] : candidate;
  }

  if (parsed.hostname !== 'duckduckgo.com') {
    return candidate;
  }

  // URLSearchParams.get() already returns the percent-decoded value. Decoding
  // it a second time would corrupt targets that contain escapes of their own
  // (e.g. "%26" inside a query) and throw on a literal "%".
  if (parsed.pathname === '/l/') {
    const uddg = parsed.searchParams.get('uddg');
    if (uddg) {
      return uddg;
    }
  }

  // Handle ad redirects
  if (parsed.pathname === '/y.js') {
    const u3 = parsed.searchParams.get('u3');
    if (u3) {
      try {
        const clickUrl = new URL(u3).searchParams.get('ld');
        return clickUrl || u3;
      } catch {
        // `u3` is not a parseable URL: fall back to the redirect itself
        return candidate;
      }
    }
  }

  return candidate;
}
|
|
103
|
+
|
|
104
|
+
/**
 * Get a favicon URL for a given website URL
 *
 * The result points at Google's s2 favicon endpoint with the site's hostname
 * as `domain` and a fixed `sz` of 32.
 *
 * @param {string} url - The website URL
 * @returns {string} The favicon URL, or '' when `url` cannot be parsed or
 *   has no hostname (e.g. `mailto:` or `file:` URLs)
 */
function getFaviconUrl(url) {
  let hostname;
  try {
    ({ hostname } = new URL(url));
  } catch {
    return ''; // Return empty string if URL is invalid
  }

  // Without a hostname the lookup would be "?domain=&sz=32", which is useless.
  if (!hostname) {
    return '';
  }

  const faviconUrl = new URL('https://www.google.com/s2/favicons');
  faviconUrl.searchParams.set('domain', hostname);
  faviconUrl.searchParams.set('sz', '32');
  return faviconUrl.href;
}
|
|
117
|
+
|
|
118
|
+
/**
 * Scrapes search results from DuckDuckGo HTML
 *
 * Results are cached per (query, page) for CACHE_DURATION milliseconds. The
 * cache holds every result parsed from the page, so a later call may ask for
 * more results than the call that populated the entry.
 *
 * @param {string} query - The search query
 * @param {number} page - The page number (default: 1)
 * @param {number} numResults - Number of results to return (default: 10)
 * @returns {Promise<Array>} - Array of { title, url, snippet, favicon, displayUrl }
 * @throws Re-throws any request or parsing error after logging its message
 */
async function searchDuckDuckGo(query, page = 1, numResults = 10) {
  try {
    // Clear old cache entries
    clearOldCache();

    // Check cache first
    const cacheKey = getCacheKey(query, page);
    const cachedResults = resultsCache.get(cacheKey);
    if (cachedResults && Date.now() - cachedResults.timestamp < CACHE_DURATION) {
      return cachedResults.results.slice(0, numResults);
    }

    // Calculate start index for pagination; never send a negative offset
    const startIndex = Math.max(0, (page - 1) * RESULTS_PER_PAGE);

    // Fetch results
    const response = await axios.get(
      `https://duckduckgo.com/html/?q=${encodeURIComponent(query)}&s=${startIndex}`,
      {
        headers: {
          'User-Agent': getRandomUserAgent()
        },
        timeout: 10000 // 10 second timeout, same as fetchUrlContent
      }
    );

    if (response.status !== 200) {
      throw new Error('Failed to fetch search results');
    }

    // Parse results using JSDOM
    const { document } = new JSDOM(response.data).window;
    const results = [];

    for (const result of document.querySelectorAll('.result')) {
      const titleEl = result.querySelector('.result__title a');
      const title = titleEl?.textContent?.trim();
      const directLink = extractDirectUrl(titleEl?.getAttribute('href') || '');

      // Entries without a title or a link are not usable results
      if (!title || !directLink) {
        continue;
      }

      results.push({
        title,
        url: directLink,
        snippet: result.querySelector('.result__snippet')?.textContent?.trim() || '',
        favicon: getFaviconUrl(directLink),
        displayUrl: result.querySelector('.result__url')?.textContent?.trim() || ''
      });
    }

    // Cache the full page rather than the slice, so that a later request for
    // a larger numResults is not answered with a truncated list. Deleting
    // first moves a refreshed key to the newest position in the Map.
    resultsCache.delete(cacheKey);
    resultsCache.set(cacheKey, {
      results,
      timestamp: Date.now()
    });

    // If cache is too big, remove the oldest entry (Maps iterate in insertion order)
    if (resultsCache.size > MAX_CACHE_PAGES) {
      const oldestKey = resultsCache.keys().next().value;
      resultsCache.delete(oldestKey);
    }

    return results.slice(0, numResults);
  } catch (error) {
    console.error('Error searching DuckDuckGo:', error.message);
    throw error;
  }
}
|
|
212
|
+
|
|
213
|
+
/**
 * Tells whether an id/class attribute value contains an advertising token
 * ("ad", "ads", "adsbygoogle", "advert...") as a whole word. Words are split
 * on whitespace, "-", "_" and camelCase boundaries, so "ad-slot", "top_ad"
 * and "adContainer" match while "header", "loaded" and "breadcrumb" do not.
 * @param {string} value - Raw attribute value
 * @returns {boolean} True when one of the words is an advertising token
 */
function hasAdToken(value) {
  return value
    .replace(/([a-z0-9])([A-Z])/g, '$1 $2')
    .toLowerCase()
    .split(/[\s_-]+/)
    .some((token) => /^(?:ad|ads|adsbygoogle|advert.*)$/.test(token));
}

/**
 * Fetches the content of a URL and returns it as text
 *
 * HTML responses are parsed, stripped of unwanted elements and reduced to
 * their text content. Any other response body is returned as text: strings
 * unchanged, parsed JSON values re-serialised.
 *
 * @param {string} url - The URL to fetch
 * @param {Object} options - Options for content extraction
 * @param {boolean} options.extractMainContent - Whether to attempt to extract main content (default: true)
 * @param {boolean} options.includeLinks - Whether to include link text (default: true)
 * @param {boolean} options.includeImages - Whether to include image alt text (default: true)
 * @param {string[]} options.excludeTags - Tags to exclude from extraction
 * @returns {Promise<string>} - The content of the URL
 * @throws Re-throws any request or parsing error after logging its message
 */
async function fetchUrlContent(url, options = {}) {
  try {
    // Default options
    const {
      extractMainContent = true,
      includeLinks = true,
      includeImages = true,
      excludeTags = ['script', 'style', 'noscript', 'iframe', 'svg', 'nav', 'footer', 'header', 'aside']
    } = options;

    const response = await axios.get(url, {
      headers: {
        'User-Agent': getRandomUserAgent()
      },
      timeout: 10000 // 10 second timeout
    });

    if (response.status !== 200) {
      throw new Error(`Failed to fetch URL: ${url}`);
    }

    const contentType = String(response.headers['content-type'] || '').toLowerCase();
    if (!contentType.includes('text/html')) {
      // For non-HTML content, return as is. A body that was parsed into an
      // object (e.g. JSON) is serialised instead of becoming "[object Object]".
      const { data } = response;
      if (typeof data === 'string') {
        return data;
      }
      if (data !== null && typeof data === 'object') {
        return JSON.stringify(data, null, 2);
      }
      return String(data ?? '');
    }

    const { document } = new JSDOM(response.data).window;

    // <html> and <body> are never removed: the text is read from them below.
    const isRoot = (el) => el === document.body || el === document.documentElement;
    const removeAll = (selector) => {
      document.querySelectorAll(selector).forEach((el) => {
        if (!isRoot(el)) {
          el.remove();
        }
      });
    };

    // Remove unwanted elements
    excludeTags.forEach((tag) => removeAll(tag));

    // Remove ads. "ad" is matched as a whole word: a substring match such as
    // [class*="ad"] would also delete "header", "heading", "loaded", ...
    document.querySelectorAll('[id], [class]').forEach((el) => {
      const label = `${el.getAttribute('id') ?? ''} ${el.getAttribute('class') ?? ''}`;
      if (!isRoot(el) && hasAdToken(label)) {
        el.remove();
      }
    });

    // Remove other common unwanted elements
    [
      '[id*="banner"]', '[class*="banner"]',
      '[id*="popup"]', '[class*="popup"]', '[class*="cookie"]',
      '[id*="cookie"]', '[class*="newsletter"]', '[id*="newsletter"]',
      '[class*="social"]', '[id*="social"]', '[class*="share"]', '[id*="share"]'
    ].forEach((selector) => removeAll(selector));

    // Handle links and images
    if (!includeLinks) {
      // NOTE(review): a span carrying the same text leaves textContent
      // unchanged, so this option has no visible effect on the returned
      // string; confirm whether link text was meant to be dropped.
      document.querySelectorAll('a').forEach((link) => {
        const span = document.createElement('span');
        span.textContent = link.textContent;
        link.replaceWith(span);
      });
    }

    // Replace images with their alt text, or drop them when images are
    // excluded or carry no alt text
    document.querySelectorAll('img').forEach((img) => {
      const alt = includeImages ? img.getAttribute('alt') : null;
      if (alt) {
        const span = document.createElement('span');
        span.textContent = `[Image: ${alt}]`;
        img.replaceWith(span);
      } else {
        img.remove();
      }
    });

    // Try to extract main content if requested
    if (extractMainContent) {
      // Common content selectors in order of priority
      const contentSelectors = [
        'article', 'main', '[role="main"]', '.post-content', '.article-content',
        '.content', '#content', '.post', '.article', '.entry-content',
        '.page-content', '.post-body', '.post-text', '.story-body'
      ];

      for (const selector of contentSelectors) {
        const mainContent = document.querySelector(selector);
        if (mainContent) {
          // An empty container must not hide the rest of the page
          const text = cleanText(mainContent.textContent);
          if (text) {
            return text;
          }
        }
      }
    }

    // If no main content found or not requested, use the body
    return cleanText(document.body?.textContent ?? '');
  } catch (error) {
    console.error('Error fetching URL content:', error.message);
    throw error;
  }
}
|
|
333
|
+
|
|
334
|
+
/**
 * Cleans up text by removing excessive whitespace and normalizing line breaks
 *
 * Blank lines (a newline, optional whitespace, another newline) are treated
 * as paragraph separators. Inside a paragraph every run of whitespace,
 * including single newlines, becomes one space. Paragraphs are trimmed, empty
 * ones are dropped, and the rest are joined with exactly one blank line.
 *
 * @param {string} text - The text to clean
 * @returns {string} - The cleaned text ('' for null or undefined input)
 */
function cleanText(text) {
  if (text == null) {
    return '';
  }

  // Split into paragraphs before collapsing whitespace: collapsing first
  // would erase every newline and with it all paragraph structure.
  return String(text)
    .split(/\n\s*\n/)
    .map((paragraph) => paragraph.replace(/\s+/g, ' ').trim())
    .filter((paragraph) => paragraph.length > 0)
    .join('\n\n');
}
|
|
346
|
+
|
|
347
|
+
/**
 * Extracts metadata from a URL (title, description, etc.)
 *
 * The page is fetched and parsed as HTML. Relative favicon and og:image
 * references are resolved against `url`. A reference that cannot be resolved
 * is treated as missing instead of failing the whole extraction.
 *
 * @param {string} url - The URL to extract metadata from
 * @returns {Promise<Object>} - { title, description, ogImage, favicon, url };
 *   fields that are absent from the page are ''; `favicon` falls back to
 *   getFaviconUrl(url)
 * @throws Re-throws any request or parsing error after logging its message
 */
async function extractUrlMetadata(url) {
  try {
    const response = await axios.get(url, {
      headers: {
        'User-Agent': getRandomUserAgent()
      },
      timeout: 10000 // 10 second timeout, same as fetchUrlContent
    });

    if (response.status !== 200) {
      throw new Error(`Failed to fetch URL: ${url}`);
    }

    const { document } = new JSDOM(response.data).window;

    // Attribute of the first element matching `selector`, or '' when absent
    const readAttribute = (selector, name) =>
      document.querySelector(selector)?.getAttribute(name) || '';

    // Absolute form of `reference` relative to the page URL, or '' when the
    // reference is empty or malformed
    const resolveAgainstPage = (reference) => {
      if (!reference) {
        return '';
      }
      try {
        return new URL(reference, url).href;
      } catch {
        return '';
      }
    };

    // Extract metadata
    const title = (document.querySelector('title')?.textContent || '').trim();
    const description =
      readAttribute('meta[name="description"]', 'content') ||
      readAttribute('meta[property="og:description"]', 'content');
    const ogImage = readAttribute('meta[property="og:image"]', 'content');
    const favicon =
      readAttribute('link[rel="icon"]', 'href') ||
      readAttribute('link[rel="shortcut icon"]', 'href');

    return {
      title,
      description,
      ogImage: resolveAgainstPage(ogImage),
      favicon: resolveAgainstPage(favicon) || getFaviconUrl(url),
      url
    };
  } catch (error) {
    console.error('Error extracting URL metadata:', error.message);
    throw error;
  }
}
|
|
394
|
+
|
|
395
|
+
export {
|
|
396
|
+
searchDuckDuckGo,
|
|
397
|
+
fetchUrlContent,
|
|
398
|
+
extractUrlMetadata,
|
|
399
|
+
extractDirectUrl,
|
|
400
|
+
getFaviconUrl
|
|
401
|
+
};
|