llm-search-tools 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126)
  1. package/LICENSE +21 -0
  2. package/README.md +244 -0
  3. package/dist/index.d.ts +18 -0
  4. package/dist/index.js +40 -0
  5. package/dist/index.js.map +1 -0
  6. package/dist/integration.test.d.ts +1 -0
  7. package/dist/integration.test.js +237 -0
  8. package/dist/modules/answerbox.test.d.ts +1 -0
  9. package/dist/modules/answerbox.test.js +105 -0
  10. package/dist/modules/autocomplete.d.ts +11 -0
  11. package/dist/modules/autocomplete.js +159 -0
  12. package/dist/modules/autocomplete.test.d.ts +1 -0
  13. package/dist/modules/autocomplete.test.js +188 -0
  14. package/dist/modules/common.d.ts +26 -0
  15. package/dist/modules/common.js +263 -0
  16. package/dist/modules/common.test.d.ts +1 -0
  17. package/dist/modules/common.test.js +87 -0
  18. package/dist/modules/crawl.d.ts +9 -0
  19. package/dist/modules/crawl.js +117 -0
  20. package/dist/modules/crawl.test.d.ts +1 -0
  21. package/dist/modules/crawl.test.js +48 -0
  22. package/dist/modules/events.d.ts +8 -0
  23. package/dist/modules/events.js +129 -0
  24. package/dist/modules/events.test.d.ts +1 -0
  25. package/dist/modules/events.test.js +104 -0
  26. package/dist/modules/finance.d.ts +10 -0
  27. package/dist/modules/finance.js +20 -0
  28. package/dist/modules/finance.test.d.ts +1 -0
  29. package/dist/modules/finance.test.js +77 -0
  30. package/dist/modules/flights.d.ts +8 -0
  31. package/dist/modules/flights.js +135 -0
  32. package/dist/modules/flights.test.d.ts +1 -0
  33. package/dist/modules/flights.test.js +128 -0
  34. package/dist/modules/hackernews.d.ts +8 -0
  35. package/dist/modules/hackernews.js +87 -0
  36. package/dist/modules/hackernews.js.map +1 -0
  37. package/dist/modules/images.test.d.ts +1 -0
  38. package/dist/modules/images.test.js +145 -0
  39. package/dist/modules/integrations.test.d.ts +1 -0
  40. package/dist/modules/integrations.test.js +93 -0
  41. package/dist/modules/media.d.ts +11 -0
  42. package/dist/modules/media.js +132 -0
  43. package/dist/modules/media.test.d.ts +1 -0
  44. package/dist/modules/media.test.js +186 -0
  45. package/dist/modules/news.d.ts +3 -0
  46. package/dist/modules/news.js +39 -0
  47. package/dist/modules/news.test.d.ts +1 -0
  48. package/dist/modules/news.test.js +88 -0
  49. package/dist/modules/parser.d.ts +19 -0
  50. package/dist/modules/parser.js +361 -0
  51. package/dist/modules/parser.test.d.ts +1 -0
  52. package/dist/modules/parser.test.js +151 -0
  53. package/dist/modules/reddit.d.ts +21 -0
  54. package/dist/modules/reddit.js +107 -0
  55. package/dist/modules/scrape.d.ts +16 -0
  56. package/dist/modules/scrape.js +272 -0
  57. package/dist/modules/scrape.test.d.ts +1 -0
  58. package/dist/modules/scrape.test.js +232 -0
  59. package/dist/modules/scraper.d.ts +12 -0
  60. package/dist/modules/scraper.js +640 -0
  61. package/dist/modules/scrapers/anidb.d.ts +8 -0
  62. package/dist/modules/scrapers/anidb.js +156 -0
  63. package/dist/modules/scrapers/duckduckgo.d.ts +6 -0
  64. package/dist/modules/scrapers/duckduckgo.js +284 -0
  65. package/dist/modules/scrapers/google-news.d.ts +2 -0
  66. package/dist/modules/scrapers/google-news.js +60 -0
  67. package/dist/modules/scrapers/google.d.ts +6 -0
  68. package/dist/modules/scrapers/google.js +211 -0
  69. package/dist/modules/scrapers/searxng.d.ts +2 -0
  70. package/dist/modules/scrapers/searxng.js +93 -0
  71. package/dist/modules/scrapers/thetvdb.d.ts +3 -0
  72. package/dist/modules/scrapers/thetvdb.js +147 -0
  73. package/dist/modules/scrapers/tmdb.d.ts +3 -0
  74. package/dist/modules/scrapers/tmdb.js +172 -0
  75. package/dist/modules/scrapers/yahoo-finance.d.ts +2 -0
  76. package/dist/modules/scrapers/yahoo-finance.js +33 -0
  77. package/dist/modules/search.d.ts +5 -0
  78. package/dist/modules/search.js +45 -0
  79. package/dist/modules/search.js.map +1 -0
  80. package/dist/modules/search.test.d.ts +1 -0
  81. package/dist/modules/search.test.js +219 -0
  82. package/dist/modules/urbandictionary.d.ts +12 -0
  83. package/dist/modules/urbandictionary.js +26 -0
  84. package/dist/modules/webpage.d.ts +4 -0
  85. package/dist/modules/webpage.js +150 -0
  86. package/dist/modules/webpage.js.map +1 -0
  87. package/dist/modules/wikipedia.d.ts +5 -0
  88. package/dist/modules/wikipedia.js +85 -0
  89. package/dist/modules/wikipedia.js.map +1 -0
  90. package/dist/scripts/interactive-search.d.ts +1 -0
  91. package/dist/scripts/interactive-search.js +98 -0
  92. package/dist/test.d.ts +1 -0
  93. package/dist/test.js +179 -0
  94. package/dist/test.js.map +1 -0
  95. package/dist/testBraveSearch.d.ts +1 -0
  96. package/dist/testBraveSearch.js +34 -0
  97. package/dist/testDuckDuckGo.d.ts +1 -0
  98. package/dist/testDuckDuckGo.js +52 -0
  99. package/dist/testEcosia.d.ts +1 -0
  100. package/dist/testEcosia.js +57 -0
  101. package/dist/testSearchModule.d.ts +1 -0
  102. package/dist/testSearchModule.js +95 -0
  103. package/dist/testwebpage.d.ts +1 -0
  104. package/dist/testwebpage.js +81 -0
  105. package/dist/types.d.ts +174 -0
  106. package/dist/types.js +3 -0
  107. package/dist/types.js.map +1 -0
  108. package/dist/utils/createTestDocx.d.ts +1 -0
  109. package/dist/utils/createTestDocx.js +58 -0
  110. package/dist/utils/htmlcleaner.d.ts +20 -0
  111. package/dist/utils/htmlcleaner.js +172 -0
  112. package/docs/README.md +275 -0
  113. package/docs/autocomplete.md +73 -0
  114. package/docs/crawling.md +88 -0
  115. package/docs/events.md +58 -0
  116. package/docs/examples.md +158 -0
  117. package/docs/finance.md +60 -0
  118. package/docs/flights.md +71 -0
  119. package/docs/hackernews.md +121 -0
  120. package/docs/media.md +87 -0
  121. package/docs/news.md +75 -0
  122. package/docs/parser.md +197 -0
  123. package/docs/scraper.md +347 -0
  124. package/docs/search.md +106 -0
  125. package/docs/wikipedia.md +91 -0
  126. package/package.json +97 -0
@@ -0,0 +1,156 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.searchAniDB = searchAniDB;
4
+ exports.getAniDBDetails = getAniDBDetails;
5
+ const common_1 = require("../common");
6
+ const jsdom_1 = require("jsdom");
7
// AniDB requires stricter rate limiting to avoid bans
// They recommend 2s delay between requests
const MIN_DELAY = 2500;
let lastSearchTime = 0;
/**
 * Waits until at least MIN_DELAY ms have passed since the previous AniDB
 * request, then records the current time as the last request time.
 */
async function enforceRateLimit() {
    const elapsed = Date.now() - lastSearchTime;
    if (elapsed < MIN_DELAY) {
        await new Promise((done) => setTimeout(done, MIN_DELAY - elapsed));
    }
    lastSearchTime = Date.now();
}
19
/**
 * AniDB Scraper.
 * AniDB has strict anti-bot protection ("AntiLeech").
 * We must use Puppeteer with Stealth plugin and respect rate limits.
 *
 * @param query   Free-text anime title to search for.
 * @param options Scraper options; `forcePuppeteer` is always overridden to true.
 * @returns Parsed search results; throws a plain error object with a stable
 *          code (`ANIDB_SEARCH_ERROR`) on failure.
 */
async function searchAniDB(query, options = {}) {
    try {
        await enforceRateLimit();
        // AniDB's protection makes plain HTTP scraping unreliable, so
        // Puppeteer is always forced regardless of the caller's preference.
        const effectiveOptions = {
            ...options,
            forcePuppeteer: true,
        };
        // adb.search + do.update=Search performs the actual list search.
        const target = "https://anidb.net/anime/?adb.search=" +
            encodeURIComponent(query) +
            "&do.update=Search&noalias=1";
        return await scrapeAniDBWithPuppeteer(target, effectiveOptions);
    }
    catch (error) {
        throw {
            message: "AniDB search failed",
            code: "ANIDB_SEARCH_ERROR",
            originalError: error,
        };
    }
}
45
/**
 * Loads an AniDB URL in a stealth Puppeteer browser and parses the result
 * list. Also handles AniDB's habit of redirecting a single-hit search
 * straight to the anime's own page.
 */
async function scrapeAniDBWithPuppeteer(url, options) {
    const proxy = (0, common_1.parseProxyConfig)(options.proxy);
    const browser = await (0, common_1.createStealthBrowser)(proxy || undefined);
    try {
        const page = await browser.newPage();
        await page.setViewport({ width: 1920, height: 1080 });
        await page.setExtraHTTPHeaders((0, common_1.createRealisticHeaders)());
        // AniDB can be slow behind its protection; allow a longer timeout.
        await page.goto(url, { waitUntil: "networkidle2", timeout: 30000 });
        let listFound = true;
        try {
            await page.waitForSelector("table.animelist", { timeout: 10000 });
        }
        catch (e) {
            listFound = false;
        }
        if (!listFound) {
            // No result table: AniDB sometimes redirects a single result
            // directly to the anime page instead.
            const currentUrl = page.url();
            if (currentUrl.includes("/anime/") && !currentUrl.includes("adb.search")) {
                const html = await page.content();
                const dom = new jsdom_1.JSDOM(html);
                const single = parseAniDBSinglePage(dom.window.document, currentUrl);
                return single ? [single] : [];
            }
            return [];
        }
        const html = await page.content();
        const dom = new jsdom_1.JSDOM(html);
        return parseAniDBList(dom.window.document);
    }
    finally {
        // Always release the browser, even on navigation/parse failure.
        await browser.close();
    }
}
78
/**
 * Fetches a specific AniDB anime page and parses its details.
 * Best-effort: any failure yields an empty object rather than throwing.
 */
async function getAniDBDetails(url, options = {}) {
    try {
        await enforceRateLimit();
        const proxy = (0, common_1.parseProxyConfig)(options.proxy);
        const browser = await (0, common_1.createStealthBrowser)(proxy || undefined);
        try {
            const page = await browser.newPage();
            await page.setViewport({ width: 1920, height: 1080 });
            await page.setExtraHTTPHeaders((0, common_1.createRealisticHeaders)());
            await page.goto(url, { waitUntil: "networkidle2", timeout: 30000 });
            const dom = new jsdom_1.JSDOM(await page.content());
            const parsed = parseAniDBSinglePage(dom.window.document, url);
            return parsed ?? {};
        }
        finally {
            await browser.close();
        }
    }
    catch (e) {
        // Deliberate swallow: callers treat {} as "no details available".
        return {};
    }
}
101
+ function parseAniDBList(doc) {
102
+ const results = [];
103
+ const rows = doc.querySelectorAll("table.animelist tbody tr");
104
+ rows.forEach((row) => {
105
+ if (results.length >= 10)
106
+ return;
107
+ // AniDB list columns: ID, Icon, Title, Type, Episodes, Rating, etc.
108
+ // The title is usually in the "name" column (class .name or depending on layout)
109
+ const titleLink = row.querySelector("td[data-label='Title'] a, td.name a");
110
+ const imgEl = row.querySelector("img"); // Often thumbnails are small or hidden on list view
111
+ if (titleLink) {
112
+ const title = titleLink.textContent?.trim() || "";
113
+ let href = titleLink.getAttribute("href") || "";
114
+ if (href && !href.startsWith("http")) {
115
+ href = `https://anidb.net${href}`;
116
+ }
117
+ // Try to get other metadata from columns if available
118
+ const typeEl = row.querySelector("td[data-label='Type']");
119
+ const ratingEl = row.querySelector("td[data-label='Rating']");
120
+ const rating = ratingEl?.textContent?.trim();
121
+ const type = typeEl?.textContent?.trim(); // e.g. TV, Movie, OVA
122
+ results.push({
123
+ title,
124
+ url: href,
125
+ rating,
126
+ description: type ? `Type: ${type}` : undefined,
127
+ source: "anidb",
128
+ mediaType: "anime",
129
+ });
130
+ }
131
+ });
132
+ return results;
133
+ }
134
+ function parseAniDBSinglePage(doc, url) {
135
+ // Parse a specific anime page if we get redirected there
136
+ const titleEl = doc.querySelector("h1.anime");
137
+ if (!titleEl)
138
+ return null;
139
+ const title = titleEl.textContent?.replace("Anime:", "").trim() || "";
140
+ const descriptionEl = doc.querySelector("div.desc");
141
+ const ratingEl = doc.querySelector("tr.rating td.value");
142
+ const imgEl = doc.querySelector("div.image img");
143
+ let posterUrl = imgEl?.getAttribute("src") || undefined;
144
+ if (posterUrl && !posterUrl.startsWith("http")) {
145
+ posterUrl = `https://anidb.net${posterUrl}`;
146
+ }
147
+ return {
148
+ title,
149
+ url,
150
+ description: descriptionEl?.textContent?.trim(),
151
+ rating: ratingEl?.textContent?.trim(),
152
+ posterUrl,
153
+ source: "anidb",
154
+ mediaType: "anime",
155
+ };
156
+ }
@@ -0,0 +1,6 @@
1
// Type declarations for the DuckDuckGo scraper module.
import { ScraperOptions, SearchResult } from "../../types";
/**
 * Extracts the "Answer Box" or "Instant Answer" from the DuckDuckGo Search DOM
 */
export declare function extractAnswerBox(doc: Document): string | undefined;
/**
 * Searches DuckDuckGo for `query`. Tries the lightweight HTML endpoint first,
 * falling back to a Puppeteer session; results are cached in-memory for 1h.
 */
export declare function searchDuckDuckGo(query: string, options?: ScraperOptions): Promise<SearchResult[]>;
@@ -0,0 +1,284 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.extractAnswerBox = extractAnswerBox;
4
+ exports.searchDuckDuckGo = searchDuckDuckGo;
5
+ const common_1 = require("../common");
6
+ const jsdom_1 = require("jsdom");
7
+ /**
8
+ * Extracts the "Answer Box" or "Instant Answer" from the DuckDuckGo Search DOM
9
+ */
10
+ function extractAnswerBox(doc) {
11
+ // 1. Abstract / Wikipedia Snippet - .module__text
12
+ const abstract = doc.querySelector(".module__text");
13
+ if (abstract && abstract.textContent) {
14
+ return abstract.textContent.trim();
15
+ }
16
+ // 2. Definition / Answer - .zci__def__text
17
+ const definition = doc.querySelector(".zci__def__text");
18
+ if (definition && definition.textContent) {
19
+ return definition.textContent.trim();
20
+ }
21
+ // 3. Calculator / Unit Converter - .c-base__title
22
+ const calculator = doc.querySelector(".c-base__title");
23
+ if (calculator && calculator.textContent) {
24
+ return calculator.textContent.trim();
25
+ }
26
+ // 4. Generic Fact - .zci__body
27
+ const fact = doc.querySelector(".zci__body");
28
+ if (fact && fact.textContent) {
29
+ return fact.textContent.trim();
30
+ }
31
+ return undefined;
32
+ }
33
// Rate limiting parameters
const MIN_DELAY_BETWEEN_SEARCHES = 2000;
let lastDDGSearchTime = 0;
// Cache for search results
const searchCache = new Map();
const CACHE_TTL = 60 * 60 * 1000; // 1 hour
/**
 * Ensures at least MIN_DELAY_BETWEEN_SEARCHES ms elapse between DuckDuckGo
 * requests, sleeping for the remainder when called too soon.
 */
async function enforceRateLimit() {
    const waited = Date.now() - lastDDGSearchTime;
    if (waited < MIN_DELAY_BETWEEN_SEARCHES) {
        await new Promise((resolve) => setTimeout(resolve, MIN_DELAY_BETWEEN_SEARCHES - waited));
    }
    lastDDGSearchTime = Date.now();
}
48
/**
 * Extract the direct URL from a DuckDuckGo redirect URL.
 *
 * DuckDuckGo result links point at duckduckgo.com/l/?uddg=<encoded target>;
 * ad links use /y.js. Unwraps both, falling back to the input (or a
 * best-effort regex match) when parsing fails.
 */
function extractDirectUrl(duckduckgoUrl) {
    try {
        let urlStr = duckduckgoUrl;
        // Handle protocol-relative and root-relative URLs from DuckDuckGo.
        if (urlStr.startsWith("//")) {
            urlStr = "https:" + urlStr;
        }
        else if (urlStr.startsWith("/")) {
            urlStr = "https://duckduckgo.com" + urlStr;
        }
        const url = new URL(urlStr);
        // Standard redirect: /l/?uddg=<target>.
        if (url.hostname === "duckduckgo.com" && url.pathname === "/l/") {
            const uddg = url.searchParams.get("uddg");
            if (uddg) {
                // searchParams.get() already percent-decodes the value, so no
                // extra decodeURIComponent here — it would corrupt targets that
                // contain literal %XX sequences and can throw on a stray '%'.
                return uddg;
            }
        }
        // Ad redirects: /y.js?u3=<click-tracking URL>.
        if (url.hostname === "duckduckgo.com" && url.pathname === "/y.js") {
            const u3 = url.searchParams.get("u3");
            if (u3) {
                try {
                    // NOTE(review): u3/ld appear to be encoded an extra time by
                    // the ad pipeline; the explicit decodes are kept here —
                    // confirm against live ad links before changing.
                    const decodedU3 = decodeURIComponent(u3);
                    const u3Url = new URL(decodedU3);
                    const clickUrl = u3Url.searchParams.get("ld");
                    if (clickUrl) {
                        return decodeURIComponent(clickUrl);
                    }
                    return decodedU3;
                }
                catch {
                    return urlStr;
                }
            }
        }
        return urlStr;
    }
    catch {
        // If URL parsing fails, salvage the first absolute URL in the string.
        const urlMatch = duckduckgoUrl.match(/https?:\/\/[^\s<>"]+/);
        if (urlMatch) {
            return urlMatch[0];
        }
        return duckduckgoUrl;
    }
}
98
/**
 * Generate a Jina AI reader URL (r.jina.ai) for a given website URL.
 * Returns "" when the input is not a valid absolute URL.
 */
function getJinaAiUrl(url) {
    try {
        const { href } = new URL(url);
        return `https://r.jina.ai/${href}`;
    }
    catch {
        return "";
    }
}
110
/**
 * Search DuckDuckGo using a stealth Puppeteer browser.
 *
 * Two modes, selected by options.category:
 *  - "images": loads the JS image-search UI and scrapes `.tile--img` tiles
 *    (default cap 20 when options.limit is unset);
 *  - otherwise: loads the standard results page and scrapes `#links .result`
 *    entries (default cap 10).
 * The browser is always closed in `finally`, even on navigation failure.
 */
async function searchWithPuppeteer(query, options) {
    const proxy = (0, common_1.parseProxyConfig)(options.proxy);
    const browser = await (0, common_1.createStealthBrowser)(proxy || undefined);
    const page = await browser.newPage();
    try {
        // Set realistic viewport
        await page.setViewport({ width: 1920, height: 1080 });
        await page.setExtraHTTPHeaders((0, common_1.createRealisticHeaders)());
        if (options.category === "images") {
            const searchUrl = `https://duckduckgo.com/?q=${encodeURIComponent(query)}&iax=images&ia=images`;
            await page.goto(searchUrl, { waitUntil: "networkidle2" });
            // Wait for image results - DDG images usually loaded in tiles
            try {
                await page.waitForSelector(".tile--img", { timeout: 10000 });
            }
            catch (e) {
                // continue — scrape whatever tiles (possibly none) have rendered
            }
            // Runs in the browser context; only `limit` crosses the boundary.
            const results = await page.evaluate((limit) => {
                const items = [];
                const elements = document.querySelectorAll(".tile--img");
                for (let i = 0; i < Math.min(elements.length, limit || 20); i++) {
                    const el = elements[i];
                    // Title
                    const titleEl = el.querySelector(".tile__title");
                    const title = titleEl?.textContent || "Image";
                    // Source link
                    const linkEl = el.querySelector("a.tile--img__sub");
                    const url = linkEl?.href || "";
                    // Thumbnail/Image (lazy-loaded images keep the URL in data-src)
                    const imgEl = el.querySelector("img.tile--img__img");
                    const imageUrl = imgEl?.src || imgEl?.getAttribute("data-src") || "";
                    if (url && imageUrl) {
                        items.push({
                            title,
                            url,
                            snippet: title,
                            imageUrl: imageUrl,
                            thumbnailUrl: imageUrl,
                            source: "duckduckgo-images",
                        });
                    }
                }
                return items;
            }, options.limit);
            return results;
        }
        // Standard (non-image) search path.
        const searchUrl = `https://duckduckgo.com/?q=${encodeURIComponent(query)}`;
        await page.goto(searchUrl, { waitUntil: "networkidle2" });
        await page.waitForSelector("#links .result", { timeout: 10000 });
        const results = await page.evaluate((limit) => {
            const items = [];
            const elements = document.querySelectorAll("#links .result");
            for (let i = 0; i < Math.min(elements.length, limit || 10); i++) {
                const el = elements[i];
                const titleEl = el.querySelector("h2");
                const linkEl = el.querySelector("a");
                const snippetEl = el.querySelector(".result__snippet");
                if (titleEl && linkEl) {
                    items.push({
                        title: titleEl.textContent || "",
                        url: linkEl.href || "",
                        snippet: snippetEl?.textContent || "",
                        source: "duckduckgo",
                    });
                }
            }
            return items;
        }, options.limit);
        return results;
    }
    finally {
        await browser.close();
    }
}
186
/**
 * Searches DuckDuckGo for `query`.
 *
 * Strategy: serve from the in-memory cache when fresh; otherwise rate-limit,
 * try the lightweight HTML endpoint first, and fall back to a stealth
 * Puppeteer session (always used for image searches, which require JS).
 *
 * @param query   Search terms.
 * @param options ScraperOptions merged over defaults (limit 10, 10s timeout,
 *                anti-bot retries enabled, HTML-first unless forcePuppeteer).
 * @returns SearchResult[]; throws a plain error object with code
 *          DDG_SEARCH_ERROR when both paths fail.
 */
async function searchDuckDuckGo(query, options = {}) {
    try {
        const mergedOptions = {
            limit: 10,
            safeSearch: true,
            timeout: 10000,
            forcePuppeteer: false,
            antiBot: {
                enabled: true,
                maxRetries: 3,
                retryDelay: 2000,
            },
            ...options,
        };
        // Serve cached results while they are still fresh.
        const cacheKey = (0, common_1.getCacheKey)(query, mergedOptions);
        const cached = searchCache.get(cacheKey);
        if (cached && Date.now() - cached.timestamp < CACHE_TTL) {
            return cached.results;
        }
        await enforceRateLimit();
        // Try HTML scraping first unless Puppeteer is forced or we are searching
        // for images (images require JS/Puppeteer).
        if (!mergedOptions.forcePuppeteer && mergedOptions.category !== "images") {
            try {
                let searchUrl = `https://duckduckgo.com/html/?q=${encodeURIComponent(query)}`;
                if (mergedOptions.category === "news") {
                    searchUrl += "&iar=news&ia=news";
                }
                const { body } = await (0, common_1.fetchWithDetection)(searchUrl, mergedOptions);
                const dom = new jsdom_1.JSDOM(body);
                const doc = dom.window.document;
                const elements = doc.querySelectorAll(".result");
                const results = [];
                // Use a simple loop with index to respect limit
                for (let i = 0; i < elements.length && results.length < (mergedOptions.limit || 10); i++) {
                    const el = elements[i];
                    const titleEl = el.querySelector(".result__title a");
                    const snippetEl = el.querySelector(".result__snippet");
                    if (titleEl) {
                        const rawLink = titleEl.getAttribute("href");
                        const title = titleEl.textContent?.trim() || "";
                        const snippet = snippetEl?.textContent?.trim() || "";
                        if (rawLink && title) {
                            // Unwrap DDG's /l/?uddg= redirect to the real target.
                            const url = extractDirectUrl(rawLink);
                            if (url && url.startsWith("http")) {
                                results.push({
                                    title,
                                    url,
                                    snippet,
                                    source: mergedOptions.category === "news" ? "duckduckgo-news" : "duckduckgo",
                                });
                            }
                        }
                    }
                }
                if (results.length > 0) {
                    searchCache.set(cacheKey, {
                        results,
                        timestamp: Date.now(),
                        source: "duckduckgo",
                    });
                    return results;
                }
                // Zero HTML results may mean bot-blocking OR a genuinely empty
                // result set; the JS version sometimes succeeds where the HTML
                // endpoint returns nothing, so fall through to Puppeteer.
            }
            catch (error) {
                // HTML fetch failed (possibly "Bot protection detected");
                // silently fall back to the Puppeteer path below.
            }
        }
        // Use Puppeteer as fallback
        const results = await searchWithPuppeteer(query, mergedOptions);
        searchCache.set(cacheKey, {
            results,
            timestamp: Date.now(),
            source: "duckduckgo",
        });
        return results;
    }
    catch (error) {
        throw {
            message: "duckduckgo search failed :/",
            code: "DDG_SEARCH_ERROR",
            originalError: error,
        };
    }
}
@@ -0,0 +1,2 @@
1
// Type declarations for the Google News scraper module.
import { ScraperOptions, NewsResult } from "../../types";
/**
 * Searches Google News for `query` via the google-news-scraper package;
 * non-empty result sets are cached in-memory for 30 minutes.
 */
export declare function searchGoogleNews(query: string, options?: ScraperOptions): Promise<NewsResult[]>;
@@ -0,0 +1,60 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.searchGoogleNews = searchGoogleNews;
7
+ const google_news_scraper_1 = __importDefault(require("google-news-scraper"));
8
+ const common_1 = require("../common");
9
// Cache for news results
const newsCache = new Map();
const CACHE_TTL = 30 * 60 * 1000; // 30 minutes for news
/**
 * Searches Google News via the google-news-scraper package (which drives
 * Puppeteer internally). Non-empty result sets are cached per
 * query/options for CACHE_TTL.
 *
 * @param query   Search terms.
 * @param options ScraperOptions merged over defaults (limit 10, 10s timeout).
 * @returns NewsResult-shaped objects; throws a plain error object with code
 *          GOOGLE_NEWS_SEARCH_ERROR on failure.
 */
async function searchGoogleNews(query, options = {}) {
    try {
        const mergedOptions = {
            limit: 10,
            safeSearch: true,
            timeout: 10000,
            ...options,
        };
        const cacheKey = (0, common_1.getCacheKey)(query, mergedOptions);
        const hit = newsCache.get(cacheKey);
        if (hit && Date.now() - hit.timestamp < CACHE_TTL) {
            return hit.results;
        }
        // google-news-scraper uses Puppeteer internally
        const articles = await (0, google_news_scraper_1.default)({
            searchTerm: query,
            prettyURLs: true,
            queryVars: {
                hl: "en-US",
                gl: "US",
                ceid: "US:en",
            },
        });
        // Map scraper articles into the package's news-result shape.
        const results = articles.slice(0, mergedOptions.limit).map((article) => ({
            title: article.title,
            url: article.link,
            snippet: article.subtitle || "",
            source: "google-news",
            sourceName: article.source,
            imageUrl: article.image,
            publishedAt: article.time,
        }));
        // Only cache non-empty result sets.
        if (results.length > 0) {
            newsCache.set(cacheKey, {
                results,
                timestamp: Date.now(),
                source: "google-news",
            });
        }
        return results;
    }
    catch (error) {
        throw {
            message: "google news search failed",
            code: "GOOGLE_NEWS_SEARCH_ERROR",
            originalError: error,
        };
    }
}
@@ -0,0 +1,6 @@
1
// Type declarations for the Google scraper module.
import { ScraperOptions, SearchResult } from "../../types";
/**
 * Extracts the "Answer Box" or "Featured Snippet" from the Google Search DOM
 */
export declare function extractAnswerBox(doc: Document): string | undefined;
// NOTE(review): implementation (google.js) is outside this chunk; contract is
// signature-level only — returns parsed search results for the query.
export declare function searchGoogle(query: string, options?: ScraperOptions): Promise<SearchResult[]>;