@oevortex/ddg_search 1.0.2 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,401 +1,401 @@
1
- import axios from 'axios';
2
- import { JSDOM } from 'jsdom';
3
-
4
- // Constants
5
- const RESULTS_PER_PAGE = 10;
6
- const MAX_CACHE_PAGES = 5;
7
-
8
- // Rotating User Agents
9
- const USER_AGENTS = [
10
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
11
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Edge/120.0.0.0',
12
- 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2.1 Safari/605.1.15',
13
- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0',
14
- 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
15
- ];
16
-
17
- // Cache results to avoid repeated requests
18
- const resultsCache = new Map();
19
- const CACHE_DURATION = 5 * 60 * 1000; // 5 minutes
20
-
21
- /**
22
- * Get a random user agent from the list
23
- * @returns {string} A random user agent string
24
- */
25
- function getRandomUserAgent() {
26
- return USER_AGENTS[Math.floor(Math.random() * USER_AGENTS.length)];
27
- }
28
-
29
- /**
30
- * Generate a cache key for a search query and page
31
- * @param {string} query - The search query
32
- * @param {number} page - The page number
33
- * @returns {string} The cache key
34
- */
35
- function getCacheKey(query, page) {
36
- return `${query}-${page}`;
37
- }
38
-
39
- /**
40
- * Clear old entries from the cache
41
- */
42
- function clearOldCache() {
43
- const now = Date.now();
44
- for (const [key, value] of resultsCache.entries()) {
45
- if (now - value.timestamp > CACHE_DURATION) {
46
- resultsCache.delete(key);
47
- }
48
- }
49
- }
50
-
51
- /**
52
- * Extract the direct URL from a DuckDuckGo redirect URL
53
- * @param {string} duckduckgoUrl - The DuckDuckGo URL to extract from
54
- * @returns {string} The direct URL
55
- */
56
- function extractDirectUrl(duckduckgoUrl) {
57
- try {
58
- // Handle relative URLs from DuckDuckGo
59
- if (duckduckgoUrl.startsWith('//')) {
60
- duckduckgoUrl = 'https:' + duckduckgoUrl;
61
- } else if (duckduckgoUrl.startsWith('/')) {
62
- duckduckgoUrl = 'https://duckduckgo.com' + duckduckgoUrl;
63
- }
64
-
65
- const url = new URL(duckduckgoUrl);
66
-
67
- // Extract direct URL from DuckDuckGo redirect
68
- if (url.hostname === 'duckduckgo.com' && url.pathname === '/l/') {
69
- const uddg = url.searchParams.get('uddg');
70
- if (uddg) {
71
- return decodeURIComponent(uddg);
72
- }
73
- }
74
-
75
- // Handle ad redirects
76
- if (url.hostname === 'duckduckgo.com' && url.pathname === '/y.js') {
77
- const u3 = url.searchParams.get('u3');
78
- if (u3) {
79
- try {
80
- const decodedU3 = decodeURIComponent(u3);
81
- const u3Url = new URL(decodedU3);
82
- const clickUrl = u3Url.searchParams.get('ld');
83
- if (clickUrl) {
84
- return decodeURIComponent(clickUrl);
85
- }
86
- return decodedU3;
87
- } catch {
88
- return duckduckgoUrl;
89
- }
90
- }
91
- }
92
-
93
- return duckduckgoUrl;
94
- } catch {
95
- // If URL parsing fails, try to extract URL from a basic string match
96
- const urlMatch = duckduckgoUrl.match(/https?:\/\/[^\s<>"]+/);
97
- if (urlMatch) {
98
- return urlMatch[0];
99
- }
100
- return duckduckgoUrl;
101
- }
102
- }
103
-
104
- /**
105
- * Get a favicon URL for a given website URL
106
- * @param {string} url - The website URL
107
- * @returns {string} The favicon URL
108
- */
109
- function getFaviconUrl(url) {
110
- try {
111
- const urlObj = new URL(url);
112
- return `https://www.google.com/s2/favicons?domain=${urlObj.hostname}&sz=32`;
113
- } catch {
114
- return ''; // Return empty string if URL is invalid
115
- }
116
- }
117
-
118
- /**
119
- * Scrapes search results from DuckDuckGo HTML
120
- * @param {string} query - The search query
121
- * @param {number} page - The page number (default: 1)
122
- * @param {number} numResults - Number of results to return (default: 10)
123
- * @returns {Promise<Array>} - Array of search results
124
- */
125
- async function searchDuckDuckGo(query, page = 1, numResults = 10) {
126
- try {
127
- // Clear old cache entries
128
- clearOldCache();
129
-
130
- // Calculate start index for pagination
131
- const startIndex = (page - 1) * RESULTS_PER_PAGE;
132
-
133
- // Check cache first
134
- const cacheKey = getCacheKey(query, page);
135
- const cachedResults = resultsCache.get(cacheKey);
136
-
137
- if (cachedResults && Date.now() - cachedResults.timestamp < CACHE_DURATION) {
138
- return cachedResults.results.slice(0, numResults);
139
- }
140
-
141
- // Get a random user agent
142
- const userAgent = getRandomUserAgent();
143
-
144
- // Fetch results
145
- const response = await axios.get(
146
- `https://duckduckgo.com/html/?q=${encodeURIComponent(query)}&s=${startIndex}`,
147
- {
148
- headers: {
149
- 'User-Agent': userAgent
150
- }
151
- }
152
- );
153
-
154
- if (response.status !== 200) {
155
- throw new Error('Failed to fetch search results');
156
- }
157
-
158
- const html = response.data;
159
-
160
- // Parse results using JSDOM
161
- const dom = new JSDOM(html);
162
- const document = dom.window.document;
163
-
164
- const results = [];
165
- const searchResults = document.querySelectorAll('.result');
166
-
167
- searchResults.forEach((result) => {
168
- const titleEl = result.querySelector('.result__title a');
169
- const linkEl = result.querySelector('.result__url');
170
- const snippetEl = result.querySelector('.result__snippet');
171
-
172
- const title = titleEl?.textContent?.trim();
173
- const rawLink = titleEl?.getAttribute('href');
174
- const description = snippetEl?.textContent?.trim();
175
- const displayUrl = linkEl?.textContent?.trim();
176
-
177
- const directLink = extractDirectUrl(rawLink || '');
178
- const favicon = getFaviconUrl(directLink);
179
-
180
- if (title && directLink) {
181
- results.push({
182
- title,
183
- url: directLink,
184
- snippet: description || '',
185
- favicon: favicon,
186
- displayUrl: displayUrl || ''
187
- });
188
- }
189
- });
190
-
191
- // Get paginated results
192
- const paginatedResults = results.slice(0, numResults);
193
-
194
- // Cache the results
195
- resultsCache.set(cacheKey, {
196
- results: paginatedResults,
197
- timestamp: Date.now()
198
- });
199
-
200
- // If cache is too big, remove oldest entries
201
- if (resultsCache.size > MAX_CACHE_PAGES) {
202
- const oldestKey = Array.from(resultsCache.keys())[0];
203
- resultsCache.delete(oldestKey);
204
- }
205
-
206
- return paginatedResults;
207
- } catch (error) {
208
- console.error('Error searching DuckDuckGo:', error.message);
209
- throw error;
210
- }
211
- }
212
-
213
- /**
214
- * Fetches the content of a URL and returns it as text
215
- * @param {string} url - The URL to fetch
216
- * @param {Object} options - Options for content extraction
217
- * @param {boolean} options.extractMainContent - Whether to attempt to extract main content (default: true)
218
- * @param {boolean} options.includeLinks - Whether to include link text (default: true)
219
- * @param {boolean} options.includeImages - Whether to include image alt text (default: true)
220
- * @param {string[]} options.excludeTags - Tags to exclude from extraction
221
- * @returns {Promise<string>} - The content of the URL
222
- */
223
- async function fetchUrlContent(url, options = {}) {
224
- try {
225
- // Default options
226
- const {
227
- extractMainContent = true,
228
- includeLinks = true,
229
- includeImages = true,
230
- excludeTags = ['script', 'style', 'noscript', 'iframe', 'svg', 'nav', 'footer', 'header', 'aside']
231
- } = options;
232
-
233
- // Get a random user agent
234
- const userAgent = getRandomUserAgent();
235
-
236
- const response = await axios.get(url, {
237
- headers: {
238
- 'User-Agent': userAgent
239
- },
240
- timeout: 10000 // 10 second timeout
241
- });
242
-
243
- if (response.status !== 200) {
244
- throw new Error(`Failed to fetch URL: ${url}`);
245
- }
246
-
247
- // If the content is HTML, extract the text content
248
- const contentType = response.headers['content-type'] || '';
249
- if (contentType.includes('text/html')) {
250
- const dom = new JSDOM(response.data);
251
- const document = dom.window.document;
252
-
253
- // Remove unwanted elements
254
- excludeTags.forEach(tag => {
255
- const elements = document.querySelectorAll(tag);
256
- elements.forEach(el => el.remove());
257
- });
258
-
259
- // Remove ads and other common unwanted elements
260
- const unwantedSelectors = [
261
- '[id*="ad"]', '[class*="ad"]', '[id*="banner"]', '[class*="banner"]',
262
- '[id*="popup"]', '[class*="popup"]', '[class*="cookie"]',
263
- '[id*="cookie"]', '[class*="newsletter"]', '[id*="newsletter"]',
264
- '[class*="social"]', '[id*="social"]', '[class*="share"]', '[id*="share"]'
265
- ];
266
-
267
- unwantedSelectors.forEach(selector => {
268
- try {
269
- const elements = document.querySelectorAll(selector);
270
- elements.forEach(el => el.remove());
271
- } catch (e) {
272
- // Ignore invalid selectors
273
- }
274
- });
275
-
276
- // Handle links and images
277
- if (!includeLinks) {
278
- const links = document.querySelectorAll('a');
279
- links.forEach(link => {
280
- const span = document.createElement('span');
281
- span.textContent = link.textContent;
282
- link.parentNode.replaceChild(span, link);
283
- });
284
- }
285
-
286
- if (!includeImages) {
287
- const images = document.querySelectorAll('img');
288
- images.forEach(img => img.remove());
289
- } else {
290
- // Replace images with their alt text
291
- const images = document.querySelectorAll('img');
292
- images.forEach(img => {
293
- const alt = img.getAttribute('alt');
294
- if (alt) {
295
- const span = document.createElement('span');
296
- span.textContent = `[Image: ${alt}]`;
297
- img.parentNode.replaceChild(span, img);
298
- } else {
299
- img.remove();
300
- }
301
- });
302
- }
303
-
304
- // Try to extract main content if requested
305
- if (extractMainContent) {
306
- // Common content selectors in order of priority
307
- const contentSelectors = [
308
- 'article', 'main', '[role="main"]', '.post-content', '.article-content',
309
- '.content', '#content', '.post', '.article', '.entry-content',
310
- '.page-content', '.post-body', '.post-text', '.story-body'
311
- ];
312
-
313
- for (const selector of contentSelectors) {
314
- const mainContent = document.querySelector(selector);
315
- if (mainContent) {
316
- // Clean up the content
317
- return cleanText(mainContent.textContent);
318
- }
319
- }
320
- }
321
-
322
- // If no main content found or not requested, use the body
323
- return cleanText(document.body.textContent);
324
- }
325
-
326
- // For non-HTML content, return as is
327
- return response.data.toString();
328
- } catch (error) {
329
- console.error('Error fetching URL content:', error.message);
330
- throw error;
331
- }
332
- }
333
-
334
- /**
335
- * Cleans up text by removing excessive whitespace and normalizing line breaks
336
- * @param {string} text - The text to clean
337
- * @returns {string} - The cleaned text
338
- */
339
- function cleanText(text) {
340
- return text
341
- .replace(/\s+/g, ' ') // Replace multiple whitespace with single space
342
- .replace(/\n\s*\n/g, '\n\n') // Normalize multiple line breaks
343
- .replace(/^\s+|\s+$/g, '') // Trim start and end
344
- .trim();
345
- }
346
-
347
- /**
348
- * Extracts metadata from a URL (title, description, etc.)
349
- * @param {string} url - The URL to extract metadata from
350
- * @returns {Promise<Object>} - The metadata
351
- */
352
- async function extractUrlMetadata(url) {
353
- try {
354
- // Get a random user agent
355
- const userAgent = getRandomUserAgent();
356
-
357
- const response = await axios.get(url, {
358
- headers: {
359
- 'User-Agent': userAgent
360
- }
361
- });
362
-
363
- if (response.status !== 200) {
364
- throw new Error(`Failed to fetch URL: ${url}`);
365
- }
366
-
367
- const dom = new JSDOM(response.data);
368
- const document = dom.window.document;
369
-
370
- // Extract metadata
371
- const title = document.querySelector('title')?.textContent || '';
372
- const description = document.querySelector('meta[name="description"]')?.getAttribute('content') ||
373
- document.querySelector('meta[property="og:description"]')?.getAttribute('content') || '';
374
- const ogImage = document.querySelector('meta[property="og:image"]')?.getAttribute('content') || '';
375
- const favicon = document.querySelector('link[rel="icon"]')?.getAttribute('href') ||
376
- document.querySelector('link[rel="shortcut icon"]')?.getAttribute('href') || '';
377
-
378
- // Resolve relative URLs
379
- const resolvedFavicon = favicon ? new URL(favicon, url).href : getFaviconUrl(url);
380
- const resolvedOgImage = ogImage ? new URL(ogImage, url).href : '';
381
-
382
- return {
383
- title,
384
- description,
385
- ogImage: resolvedOgImage,
386
- favicon: resolvedFavicon,
387
- url
388
- };
389
- } catch (error) {
390
- console.error('Error extracting URL metadata:', error.message);
391
- throw error;
392
- }
393
- }
394
-
395
- export {
396
- searchDuckDuckGo,
397
- fetchUrlContent,
398
- extractUrlMetadata,
399
- extractDirectUrl,
400
- getFaviconUrl
401
- };
1
+ import axios from 'axios';
2
+ import { JSDOM } from 'jsdom';
3
+
4
// Pagination: DuckDuckGo's HTML endpoint is queried in steps of this many results.
const RESULTS_PER_PAGE = 10;
// Upper bound on the number of (query, page) entries kept in resultsCache.
const MAX_CACHE_PAGES = 5;

// Rotating User Agents.
// Frozen because the list is shared module state that is only ever read.
const USER_AGENTS = Object.freeze([
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  // Chromium-based Edge sends the full Chrome string plus an "Edg/" token;
  // the previous "Edge/120.0.0.0"-only value did not correspond to any real browser.
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2.1 Safari/605.1.15',
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0',
  'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
]);

// Cache results to avoid repeated requests.
// Maps a cache key to an object of the form { results, timestamp }.
const resultsCache = new Map();
const CACHE_DURATION = 5 * 60 * 1000; // 5 minutes, in milliseconds
20
+
21
/**
 * Pick one entry of USER_AGENTS uniformly at random.
 * @returns {string} The selected user agent string
 */
function getRandomUserAgent() {
  const index = Math.floor(Math.random() * USER_AGENTS.length);
  return USER_AGENTS[index];
}
28
+
29
/**
 * Build the cache key under which one page of results for a query is stored.
 * The key is the query and the page number joined by a single hyphen.
 * @param {string} query - The search query
 * @param {number} page - The page number
 * @returns {string} The cache key, e.g. "cats-2"
 */
function getCacheKey(query, page) {
  const parts = [String(query), String(page)];
  return parts.join('-');
}
38
+
39
/**
 * Evict every entry of resultsCache whose timestamp is more than
 * CACHE_DURATION milliseconds in the past.
 */
function clearOldCache() {
  const currentTime = Date.now();
  // Deleting the entry currently being visited is safe during Map#forEach.
  resultsCache.forEach((entry, cacheKey) => {
    const age = currentTime - entry.timestamp;
    if (age > CACHE_DURATION) {
      resultsCache.delete(cacheKey);
    }
  });
}
50
+
51
/**
 * Extract the destination URL from a DuckDuckGo result link.
 *
 * Handles three shapes of input:
 *  - organic redirects: `<duckduckgo host>/l/?uddg=<target>`
 *  - ad redirects: `<duckduckgo host>/y.js?u3=<click url>`; the click URL's
 *    `ld` parameter is preferred when present, otherwise the click URL itself
 *  - anything else, which is returned unchanged apart from making
 *    protocol-relative (`//…`) and root-relative (`/…`) links absolute.
 *
 * `URLSearchParams.get` already returns percent-decoded values, so the values
 * are returned as-is. Decoding them a second time would corrupt targets that
 * legitimately contain escapes such as `%20` or `%25`.
 *
 * @param {string} duckduckgoUrl - The DuckDuckGo URL to extract from
 * @returns {string} The direct URL; an empty string for non-string input
 */
function extractDirectUrl(duckduckgoUrl) {
  if (typeof duckduckgoUrl !== 'string') {
    return '';
  }

  // Handle relative URLs from DuckDuckGo (without mutating the parameter).
  let candidate = duckduckgoUrl;
  if (candidate.startsWith('//')) {
    candidate = `https:${candidate}`;
  } else if (candidate.startsWith('/')) {
    candidate = `https://duckduckgo.com${candidate}`;
  }

  let parsed;
  try {
    parsed = new URL(candidate);
  } catch {
    // Not parseable as a URL: fall back to the first http(s) URL in the string.
    const urlMatch = candidate.match(/https?:\/\/[^\s<>"]+/);
    return urlMatch ? urlMatch[0] : candidate;
  }

  const host = parsed.hostname;
  const isDuckDuckGoHost = host === 'duckduckgo.com' || host.endsWith('.duckduckgo.com');
  if (!isDuckDuckGoHost) {
    return candidate;
  }

  // Organic result redirect.
  if (parsed.pathname === '/l/') {
    const uddg = parsed.searchParams.get('uddg');
    if (uddg) {
      return uddg;
    }
  }

  // Ad redirect.
  if (parsed.pathname === '/y.js') {
    const u3 = parsed.searchParams.get('u3');
    if (u3) {
      try {
        const clickUrl = new URL(u3).searchParams.get('ld');
        return clickUrl || u3;
      } catch {
        // u3 is not itself a valid URL; keep the redirect link.
        return candidate;
      }
    }
  }

  return candidate;
}
103
+
104
/**
 * Build the Google favicon-service URL (32px icon) for the host of a website URL.
 * @param {string} url - The website URL
 * @returns {string} The favicon URL, or an empty string when `url` cannot be parsed
 */
function getFaviconUrl(url) {
  let hostname;
  try {
    ({ hostname } = new URL(url));
  } catch {
    // Unparseable input: signal "no favicon" with an empty string.
    return '';
  }
  return `https://www.google.com/s2/favicons?domain=${hostname}&sz=32`;
}
117
+
118
/**
 * Scrapes search results from DuckDuckGo's HTML endpoint.
 *
 * Results for a (query, page) pair are cached for CACHE_DURATION milliseconds.
 * The cache stores the complete parsed result list for the page; `numResults`
 * is applied only when returning, so a cached entry can serve later calls that
 * ask for more results than the call that populated it.
 *
 * @param {string} query - The search query
 * @param {number} page - The page number (default: 1)
 * @param {number} numResults - Number of results to return (default: 10)
 * @returns {Promise<Array>} - Array of `{ title, url, snippet, favicon, displayUrl }`
 * @throws Rethrows any request or parsing error after logging its message
 */
async function searchDuckDuckGo(query, page = 1, numResults = 10) {
  try {
    // Clear old cache entries
    clearOldCache();

    // Check cache first
    const cacheKey = getCacheKey(query, page);
    const cachedResults = resultsCache.get(cacheKey);
    if (cachedResults && Date.now() - cachedResults.timestamp < CACHE_DURATION) {
      return cachedResults.results.slice(0, numResults);
    }

    // Calculate start index for pagination
    const startIndex = (page - 1) * RESULTS_PER_PAGE;

    // Fetch results
    const response = await axios.get(
      `https://duckduckgo.com/html/?q=${encodeURIComponent(query)}&s=${startIndex}`,
      {
        headers: {
          'User-Agent': getRandomUserAgent()
        },
        timeout: 10000 // 10 second timeout, matching fetchUrlContent
      }
    );

    if (response.status !== 200) {
      throw new Error('Failed to fetch search results');
    }

    // Parse results using JSDOM
    const dom = new JSDOM(response.data);
    const document = dom.window.document;

    const results = [];
    document.querySelectorAll('.result').forEach((result) => {
      const titleEl = result.querySelector('.result__title a');
      const linkEl = result.querySelector('.result__url');
      const snippetEl = result.querySelector('.result__snippet');

      const title = titleEl?.textContent?.trim();
      const rawLink = titleEl?.getAttribute('href');
      const directLink = extractDirectUrl(rawLink || '');

      // Entries without a title or a link are skipped.
      if (!title || !directLink) {
        return;
      }

      results.push({
        title,
        url: directLink,
        snippet: snippetEl?.textContent?.trim() || '',
        favicon: getFaviconUrl(directLink),
        displayUrl: linkEl?.textContent?.trim() || ''
      });
    });

    // Cache the full page. Delete first so a re-inserted key moves to the end
    // of the Map's insertion order and eviction below stays oldest-first.
    resultsCache.delete(cacheKey);
    resultsCache.set(cacheKey, {
      results,
      timestamp: Date.now()
    });

    // If cache is too big, remove the oldest entry (first key in insertion order)
    if (resultsCache.size > MAX_CACHE_PAGES) {
      const oldestKey = resultsCache.keys().next().value;
      resultsCache.delete(oldestKey);
    }

    return results.slice(0, numResults);
  } catch (error) {
    console.error('Error searching DuckDuckGo:', error.message);
    throw error;
  }
}
212
+
213
// Matches id/class values that contain an advertising token as a whole
// hyphen/underscore/space-delimited word ("ad", "ads", "advert…", "adsbygoogle", …).
// A plain substring test for "ad" would also hit "header", "loaded", "thread", etc.
const AD_TOKEN_PATTERN = /(?:^|[\s_-])(?:ads?|advert\w*|adsbygoogle|adsense|adslot|adunit)(?=$|[\s_-])/i;

// Substring selectors for other common non-content elements.
const UNWANTED_SELECTORS = [
  '[id*="banner"]', '[class*="banner"]',
  '[id*="popup"]', '[class*="popup"]', '[class*="cookie"]',
  '[id*="cookie"]', '[class*="newsletter"]', '[id*="newsletter"]',
  '[class*="social"]', '[id*="social"]', '[class*="share"]', '[id*="share"]'
];

/** Removes an element unless it is the document root or <body>, which later steps rely on. */
function removeUnlessStructural(document, el) {
  if (el !== document.documentElement && el !== document.body) {
    el.remove();
  }
}

/** Returns true when the element's id or class contains an advertising token. */
function hasAdToken(el) {
  return AD_TOKEN_PATTERN.test(el.getAttribute('id') || '') ||
    AD_TOKEN_PATTERN.test(el.getAttribute('class') || '');
}

/** Converts a non-HTML axios response body to text. */
function responseBodyToText(data) {
  if (typeof data === 'string') {
    return data;
  }
  if (data === null || data === undefined) {
    return '';
  }
  // axios parses JSON bodies into objects by default; serialize them back
  // instead of producing "[object Object]".
  if (typeof data === 'object') {
    return JSON.stringify(data);
  }
  return String(data);
}

/**
 * Fetches the content of a URL and returns it as text.
 *
 * For HTML responses the markup is parsed, excluded tags and common
 * ad/banner/popup/cookie/newsletter/social/share elements are removed, and the
 * whitespace-normalized text of the main content (or of the body) is returned.
 * Non-HTML responses are returned as text; parsed JSON is re-serialized.
 *
 * @param {string} url - The URL to fetch
 * @param {Object} options - Options for content extraction
 * @param {boolean} options.extractMainContent - Whether to attempt to extract main content (default: true)
 * @param {boolean} options.includeLinks - Whether to keep `<a>` elements (default: true).
 *   When false each link is replaced by a `<span>` holding the same text, so the
 *   returned text is the same either way.
 * @param {boolean} options.includeImages - Whether to include image alt text as `[Image: alt]` (default: true)
 * @param {string[]} options.excludeTags - Tags to exclude from extraction
 * @returns {Promise<string>} - The content of the URL
 * @throws Rethrows any request or parsing error after logging its message
 */
async function fetchUrlContent(url, options = {}) {
  try {
    // Default options
    const {
      extractMainContent = true,
      includeLinks = true,
      includeImages = true,
      excludeTags = ['script', 'style', 'noscript', 'iframe', 'svg', 'nav', 'footer', 'header', 'aside']
    } = options;

    const response = await axios.get(url, {
      headers: {
        'User-Agent': getRandomUserAgent()
      },
      timeout: 10000 // 10 second timeout
    });

    if (response.status !== 200) {
      throw new Error(`Failed to fetch URL: ${url}`);
    }

    const contentType = response.headers['content-type'] || '';
    const isHtml = contentType.includes('text/html') || contentType.includes('application/xhtml+xml');
    if (!isHtml) {
      return responseBodyToText(response.data);
    }

    const dom = new JSDOM(response.data);
    const document = dom.window.document;

    // Remove unwanted elements
    excludeTags.forEach((tag) => {
      document.querySelectorAll(tag).forEach((el) => removeUnlessStructural(document, el));
    });

    // Remove ads (token match) and other common unwanted elements (substring match)
    document.querySelectorAll('[id], [class]').forEach((el) => {
      if (hasAdToken(el)) {
        removeUnlessStructural(document, el);
      }
    });
    UNWANTED_SELECTORS.forEach((selector) => {
      document.querySelectorAll(selector).forEach((el) => removeUnlessStructural(document, el));
    });

    // Handle links and images
    if (!includeLinks) {
      document.querySelectorAll('a').forEach((link) => {
        const span = document.createElement('span');
        span.textContent = link.textContent;
        link.replaceWith(span);
      });
    }

    document.querySelectorAll('img').forEach((img) => {
      const alt = includeImages ? img.getAttribute('alt') : null;
      if (alt) {
        // Replace images with their alt text
        const span = document.createElement('span');
        span.textContent = `[Image: ${alt}]`;
        img.replaceWith(span);
      } else {
        img.remove();
      }
    });

    // Try to extract main content if requested
    if (extractMainContent) {
      // Common content selectors in order of priority
      const contentSelectors = [
        'article', 'main', '[role="main"]', '.post-content', '.article-content',
        '.content', '#content', '.post', '.article', '.entry-content',
        '.page-content', '.post-body', '.post-text', '.story-body'
      ];

      for (const selector of contentSelectors) {
        const mainContent = document.querySelector(selector);
        if (!mainContent) {
          continue;
        }
        // Skip empty containers so a lower-priority selector (or the body) can supply text.
        const mainText = cleanText(mainContent.textContent);
        if (mainText) {
          return mainText;
        }
      }
    }

    // If no main content found or not requested, use the body
    return cleanText(document.body?.textContent ?? '');
  } catch (error) {
    console.error('Error fetching URL content:', error.message);
    throw error;
  }
}
333
+
334
/**
 * Cleans up text by collapsing every run of whitespace (spaces, tabs and line
 * breaks alike) into a single space and trimming both ends.
 *
 * Line breaks are not preserved: the result is always a single line.
 *
 * @param {string} text - The text to clean; `null`/`undefined` yield an empty string
 * @returns {string} - The cleaned text
 */
function cleanText(text) {
  // textContent can be null for some node types; treat missing text as empty.
  if (text === null || text === undefined) {
    return '';
  }
  return String(text).replace(/\s+/g, ' ').trim();
}
346
+
347
/** Resolves `href` against `base`; returns '' when the combination is not a valid URL. */
function resolveUrlOrEmpty(href, base) {
  try {
    return new URL(href, base).href;
  } catch {
    return '';
  }
}

/**
 * Extracts metadata from a URL (title, description, etc.)
 *
 * The page is fetched and parsed as HTML. The description falls back from the
 * `description` meta tag to `og:description`. The favicon is taken from the
 * first `<link>` whose `rel` contains the `icon` token (covers `icon` and
 * `shortcut icon`), and falls back to getFaviconUrl(url) when the page
 * declares none or its href cannot be resolved.
 *
 * @param {string} url - The URL to extract metadata from
 * @returns {Promise<Object>} - `{ title, description, ogImage, favicon, url }`;
 *   `ogImage` and `favicon` are absolute URLs or empty strings
 * @throws Rethrows any request or parsing error after logging its message
 */
async function extractUrlMetadata(url) {
  try {
    const response = await axios.get(url, {
      headers: {
        'User-Agent': getRandomUserAgent()
      },
      timeout: 10000 // 10 second timeout, matching fetchUrlContent
    });

    if (response.status !== 200) {
      throw new Error(`Failed to fetch URL: ${url}`);
    }

    const dom = new JSDOM(response.data);
    const document = dom.window.document;

    // Reads an attribute from the first element matching the selector, or ''.
    const readAttribute = (selector, attribute) =>
      document.querySelector(selector)?.getAttribute(attribute) || '';

    // Extract metadata
    const title = document.querySelector('title')?.textContent || '';
    const description =
      readAttribute('meta[name="description"]', 'content') ||
      readAttribute('meta[property="og:description"]', 'content');
    const ogImage = readAttribute('meta[property="og:image"]', 'content');
    const favicon = readAttribute('link[rel~="icon"]', 'href');

    // Resolve relative URLs; a malformed href must not abort the whole extraction.
    const resolvedFavicon = (favicon && resolveUrlOrEmpty(favicon, url)) || getFaviconUrl(url);
    const resolvedOgImage = ogImage ? resolveUrlOrEmpty(ogImage, url) : '';

    return {
      title,
      description,
      ogImage: resolvedOgImage,
      favicon: resolvedFavicon,
      url
    };
  } catch (error) {
    console.error('Error extracting URL metadata:', error.message);
    throw error;
  }
}
394
+
395
+ export {
396
+ searchDuckDuckGo,
397
+ fetchUrlContent,
398
+ extractUrlMetadata,
399
+ extractDirectUrl,
400
+ getFaviconUrl
401
+ };