llm-search-tools 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126)
  1. package/LICENSE +21 -0
  2. package/README.md +244 -0
  3. package/dist/index.d.ts +18 -0
  4. package/dist/index.js +40 -0
  5. package/dist/index.js.map +1 -0
  6. package/dist/integration.test.d.ts +1 -0
  7. package/dist/integration.test.js +237 -0
  8. package/dist/modules/answerbox.test.d.ts +1 -0
  9. package/dist/modules/answerbox.test.js +105 -0
  10. package/dist/modules/autocomplete.d.ts +11 -0
  11. package/dist/modules/autocomplete.js +159 -0
  12. package/dist/modules/autocomplete.test.d.ts +1 -0
  13. package/dist/modules/autocomplete.test.js +188 -0
  14. package/dist/modules/common.d.ts +26 -0
  15. package/dist/modules/common.js +263 -0
  16. package/dist/modules/common.test.d.ts +1 -0
  17. package/dist/modules/common.test.js +87 -0
  18. package/dist/modules/crawl.d.ts +9 -0
  19. package/dist/modules/crawl.js +117 -0
  20. package/dist/modules/crawl.test.d.ts +1 -0
  21. package/dist/modules/crawl.test.js +48 -0
  22. package/dist/modules/events.d.ts +8 -0
  23. package/dist/modules/events.js +129 -0
  24. package/dist/modules/events.test.d.ts +1 -0
  25. package/dist/modules/events.test.js +104 -0
  26. package/dist/modules/finance.d.ts +10 -0
  27. package/dist/modules/finance.js +20 -0
  28. package/dist/modules/finance.test.d.ts +1 -0
  29. package/dist/modules/finance.test.js +77 -0
  30. package/dist/modules/flights.d.ts +8 -0
  31. package/dist/modules/flights.js +135 -0
  32. package/dist/modules/flights.test.d.ts +1 -0
  33. package/dist/modules/flights.test.js +128 -0
  34. package/dist/modules/hackernews.d.ts +8 -0
  35. package/dist/modules/hackernews.js +87 -0
  36. package/dist/modules/hackernews.js.map +1 -0
  37. package/dist/modules/images.test.d.ts +1 -0
  38. package/dist/modules/images.test.js +145 -0
  39. package/dist/modules/integrations.test.d.ts +1 -0
  40. package/dist/modules/integrations.test.js +93 -0
  41. package/dist/modules/media.d.ts +11 -0
  42. package/dist/modules/media.js +132 -0
  43. package/dist/modules/media.test.d.ts +1 -0
  44. package/dist/modules/media.test.js +186 -0
  45. package/dist/modules/news.d.ts +3 -0
  46. package/dist/modules/news.js +39 -0
  47. package/dist/modules/news.test.d.ts +1 -0
  48. package/dist/modules/news.test.js +88 -0
  49. package/dist/modules/parser.d.ts +19 -0
  50. package/dist/modules/parser.js +361 -0
  51. package/dist/modules/parser.test.d.ts +1 -0
  52. package/dist/modules/parser.test.js +151 -0
  53. package/dist/modules/reddit.d.ts +21 -0
  54. package/dist/modules/reddit.js +107 -0
  55. package/dist/modules/scrape.d.ts +16 -0
  56. package/dist/modules/scrape.js +272 -0
  57. package/dist/modules/scrape.test.d.ts +1 -0
  58. package/dist/modules/scrape.test.js +232 -0
  59. package/dist/modules/scraper.d.ts +12 -0
  60. package/dist/modules/scraper.js +640 -0
  61. package/dist/modules/scrapers/anidb.d.ts +8 -0
  62. package/dist/modules/scrapers/anidb.js +156 -0
  63. package/dist/modules/scrapers/duckduckgo.d.ts +6 -0
  64. package/dist/modules/scrapers/duckduckgo.js +284 -0
  65. package/dist/modules/scrapers/google-news.d.ts +2 -0
  66. package/dist/modules/scrapers/google-news.js +60 -0
  67. package/dist/modules/scrapers/google.d.ts +6 -0
  68. package/dist/modules/scrapers/google.js +211 -0
  69. package/dist/modules/scrapers/searxng.d.ts +2 -0
  70. package/dist/modules/scrapers/searxng.js +93 -0
  71. package/dist/modules/scrapers/thetvdb.d.ts +3 -0
  72. package/dist/modules/scrapers/thetvdb.js +147 -0
  73. package/dist/modules/scrapers/tmdb.d.ts +3 -0
  74. package/dist/modules/scrapers/tmdb.js +172 -0
  75. package/dist/modules/scrapers/yahoo-finance.d.ts +2 -0
  76. package/dist/modules/scrapers/yahoo-finance.js +33 -0
  77. package/dist/modules/search.d.ts +5 -0
  78. package/dist/modules/search.js +45 -0
  79. package/dist/modules/search.js.map +1 -0
  80. package/dist/modules/search.test.d.ts +1 -0
  81. package/dist/modules/search.test.js +219 -0
  82. package/dist/modules/urbandictionary.d.ts +12 -0
  83. package/dist/modules/urbandictionary.js +26 -0
  84. package/dist/modules/webpage.d.ts +4 -0
  85. package/dist/modules/webpage.js +150 -0
  86. package/dist/modules/webpage.js.map +1 -0
  87. package/dist/modules/wikipedia.d.ts +5 -0
  88. package/dist/modules/wikipedia.js +85 -0
  89. package/dist/modules/wikipedia.js.map +1 -0
  90. package/dist/scripts/interactive-search.d.ts +1 -0
  91. package/dist/scripts/interactive-search.js +98 -0
  92. package/dist/test.d.ts +1 -0
  93. package/dist/test.js +179 -0
  94. package/dist/test.js.map +1 -0
  95. package/dist/testBraveSearch.d.ts +1 -0
  96. package/dist/testBraveSearch.js +34 -0
  97. package/dist/testDuckDuckGo.d.ts +1 -0
  98. package/dist/testDuckDuckGo.js +52 -0
  99. package/dist/testEcosia.d.ts +1 -0
  100. package/dist/testEcosia.js +57 -0
  101. package/dist/testSearchModule.d.ts +1 -0
  102. package/dist/testSearchModule.js +95 -0
  103. package/dist/testwebpage.d.ts +1 -0
  104. package/dist/testwebpage.js +81 -0
  105. package/dist/types.d.ts +174 -0
  106. package/dist/types.js +3 -0
  107. package/dist/types.js.map +1 -0
  108. package/dist/utils/createTestDocx.d.ts +1 -0
  109. package/dist/utils/createTestDocx.js +58 -0
  110. package/dist/utils/htmlcleaner.d.ts +20 -0
  111. package/dist/utils/htmlcleaner.js +172 -0
  112. package/docs/README.md +275 -0
  113. package/docs/autocomplete.md +73 -0
  114. package/docs/crawling.md +88 -0
  115. package/docs/events.md +58 -0
  116. package/docs/examples.md +158 -0
  117. package/docs/finance.md +60 -0
  118. package/docs/flights.md +71 -0
  119. package/docs/hackernews.md +121 -0
  120. package/docs/media.md +87 -0
  121. package/docs/news.md +75 -0
  122. package/docs/parser.md +197 -0
  123. package/docs/scraper.md +347 -0
  124. package/docs/search.md +106 -0
  125. package/docs/wikipedia.md +91 -0
  126. package/package.json +97 -0
package/docs/scraper.md ADDED
@@ -0,0 +1,347 @@
+ # Scraper Module Documentation
+
+ The scraper module provides powerful web scraping and content extraction capabilities with automatic bot detection and proxy support. It can intelligently switch between basic HTTP requests and Puppeteer-based browser automation when bot protection is detected.
+
+ ## Features
+
+ - **Automatic Bot Detection**: Detects Cloudflare, PerimeterX, Akamai, DataDome, and other bot protections
+ - **Puppeteer Fallback**: Automatically switches to browser automation when needed
+ - **Stealth Mode**: Uses puppeteer-extra-plugin-stealth to bypass advanced bot protection, including Cloudflare
+ - **Proxy Support**: Full support for HTTP, HTTPS, SOCKS4, and SOCKS5 proxies with authentication
+ - **Rate Limiting**: Built-in rate limiting to avoid IP bans
+ - **Caching**: Intelligent caching to reduce redundant requests
+ - **Content Extraction**: Extracts readable content from webpages using Mozilla Readability
+ - **Special Handlers**: Optimized extraction for Wikipedia and HackerNews
+
+ ## Basic Usage
+
+ ### Search Functions
+ ```typescript
+ import { search, SearchResult } from 'llm-search-tools';
+
+ // Basic search - automatically handles bot detection
+ const results: SearchResult[] = await search('typescript tutorial');
+ console.log(results);
+ ```
+
+ ### Webpage Content Extraction
+ ```typescript
+ import { getWebpageContent, getWebpageText } from 'llm-search-tools';
+
+ // Extract content from any webpage
+ const content = await getWebpageContent('https://example.com/article');
+ console.log(content.title);
+ console.log(content.textContent);
+
+ // Get just the text content
+ const text = await getWebpageText('https://example.com/article');
+ ```
+
+ ### Force Puppeteer Usage
+ ```typescript
+ // Always use Puppeteer (useful for JavaScript-heavy sites)
+ const results = await search('react tutorial', {
+   forcePuppeteer: true,
+   limit: 10
+ });
+ ```
+
+ ## Webpage Content Extraction
+
+ ### Basic Content Extraction
+ ```typescript
+ import { getWebpageContent, WebpageContent } from 'llm-search-tools';
+
+ // Extract content from any webpage
+ const content: WebpageContent = await getWebpageContent('https://example.com/article');
+
+ console.log('Title:', content.title);
+ console.log('Site:', content.siteName);
+ console.log('Content length:', content.length);
+ console.log('Excerpt:', content.excerpt);
+ console.log('Full text:', content.textContent);
+ ```
+
+ ### Force Puppeteer for Protected Sites
+ ```typescript
+ // Use stealth puppeteer for Cloudflare-protected sites
+ const content = await getWebpageContent('https://protected-site.com/article', {
+   usePuppeteer: true
+ });
+ ```
+
+ ### Using Proxies with Content Extraction
+ ```typescript
+ import { getWebpageContent, ProxyConfig } from 'llm-search-tools';
+
+ // Extract content through a proxy
+ const content = await getWebpageContent('https://example.com/article', {
+   proxy: 'http://proxy.example.com:8080',
+   usePuppeteer: true // Often needed for proxies
+ });
+
+ // Or with a proxy configuration object
+ const proxyConfig: ProxyConfig = {
+   type: 'socks5',
+   host: 'proxy.example.com',
+   port: 1080,
+   auth: {
+     username: 'user',
+     password: 'pass'
+   }
+ };
+
+ const proxiedContent = await getWebpageContent('https://example.com/article', {
+   proxy: proxyConfig,
+   usePuppeteer: true
+ });
+ ```
+
+ ### Special Site Handlers
+ The scraper automatically detects and optimizes for certain sites:
+
+ ```typescript
+ // Wikipedia - automatically extracts clean content
+ const wikiContent = await getWebpageContent('https://en.wikipedia.org/wiki/Web_scraping');
+
+ // HackerNews - extracts story content
+ const hnContent = await getWebpageContent('https://news.ycombinator.com/item?id=123456');
+ ```
+
+ ### URL Accessibility Check
+ ```typescript
+ import { isUrlAccessible } from 'llm-search-tools';
+
+ const isAccessible = await isUrlAccessible('https://example.com');
+ if (isAccessible) {
+   const content = await getWebpageContent('https://example.com');
+ }
+ ```
+
+ ## Proxy Configuration
+
+ ### Using Proxy Object
+ ```typescript
+ import { search, ProxyConfig } from 'llm-search-tools';
+
+ const proxyConfig: ProxyConfig = {
+   type: 'http', // or 'https', 'socks4', 'socks5'
+   host: 'proxy.example.com',
+   port: 8080,
+   auth: { // Optional authentication
+     username: 'user',
+     password: 'pass'
+   }
+ };
+
+ const results = await search('nodejs tutorial', {
+   proxy: proxyConfig
+ });
+ ```
+
+ ### Using Proxy URL String
+ ```typescript
+ // Simple proxy without auth
+ const basicResults = await search('python tutorial', {
+   proxy: 'http://proxy.example.com:8080'
+ });
+
+ // Proxy with authentication
+ const authResults = await search('java tutorial', {
+   proxy: 'http://user:pass@proxy.example.com:8080'
+ });
+
+ // SOCKS proxy
+ const socksResults = await search('go tutorial', {
+   proxy: 'socks5://proxy.example.com:1080'
+ });
+ ```
+
+ ## Bot Detection & Fallback
+
+ The scraper automatically detects bot protection and falls back to Puppeteer:
+
+ ```typescript
+ // This will automatically handle bot detection
+ const results = await search('scraping tutorial', {
+   antiBot: {
+     enabled: true,    // Enable bot detection (default: true)
+     maxRetries: 3,    // Max retries on bot detection (default: 3)
+     retryDelay: 2000  // Delay between retries in ms (default: 2000)
+   }
+ });
+ ```
+
+ ### Detected Protections
+
+ - **Cloudflare**: CF-Ray headers, challenge pages, "Just a moment" redirects
+ - **PerimeterX**: _px cookies, PX headers, captcha challenges
+ - **Akamai**: ak_bmsc cookies, akamaized hosts
+ - **DataDome**: __ddg_ cookies, x-datadome headers
+ - **Generic**: CAPTCHAs, 403 errors, rate limiting messages
+
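+ Detection happens inside the library, but the idea is easy to illustrate. A minimal sketch of signal-based detection using the indicators listed above (the `looksBotProtected` helper is hypothetical, not a package export):
+
+ ```typescript
+ // Hypothetical sketch - the real checks live inside the scraper module.
+ function looksBotProtected(status: number, headers: Record<string, string>, body: string): boolean {
+   const cookies = headers['set-cookie'] ?? '';
+   if (headers['cf-ray'] || body.includes('Just a moment')) return true; // Cloudflare
+   if (cookies.includes('_px') || headers['x-px']) return true;          // PerimeterX
+   if (cookies.includes('ak_bmsc')) return true;                         // Akamai
+   if (cookies.includes('__ddg_') || headers['x-datadome']) return true; // DataDome
+   if (status === 403 || /captcha|rate limit/i.test(body)) return true;  // Generic
+   return false;
+ }
+ ```
+
+ When a response trips one of these signals, the scraper retries with Puppeteer, up to `maxRetries` times with `retryDelay` ms between attempts.
+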
+ ## Advanced Options
+
+ ```typescript
+ import { search, ScraperOptions } from 'llm-search-tools';
+
+ const options: ScraperOptions = {
+   limit: 10,             // Number of results (default: 10)
+   safeSearch: true,      // Enable safe search (default: true)
+   timeout: 10000,        // Request timeout in ms (default: 10000)
+   forcePuppeteer: false, // Force Puppeteer usage (default: false)
+   proxy: {               // Proxy configuration
+     type: 'https',
+     host: 'proxy.example.com',
+     port: 8080,
+     auth: {
+       username: 'user',
+       password: 'pass'
+     }
+   },
+   antiBot: {             // Anti-bot configuration
+     enabled: true,
+     maxRetries: 3,
+     retryDelay: 2000
+   }
+ };
+
+ const results = await search('advanced query', options);
+ ```
+
+ ## Search Engine Specific Functions
+
+ ### Google Search
+ ```typescript
+ import { searchGoogle } from 'llm-search-tools';
+
+ // Google-specific search
+ const googleResults = await searchGoogle('machine learning', {
+   limit: 5,
+   proxy: 'http://proxy.example.com:8080'
+ });
+ ```
+
+ ### DuckDuckGo Search
+ ```typescript
+ import { searchDuckDuckGo } from 'llm-search-tools';
+
+ // DuckDuckGo-specific search
+ const ddgResults = await searchDuckDuckGo('data science', {
+   safeSearch: false,
+   forcePuppeteer: true
+ });
+ ```
+
+ ## Error Handling
+
+ ### Proxy Errors
+ ```typescript
+ try {
+   const results = await search('test', { proxy: 'invalid-proxy' });
+ } catch (error: any) {
+   if (error.code === 'PROXY_CONNECTION_FAILED') {
+     console.error('Could not connect to proxy:', error.message);
+   } else if (error.code === 'PROXY_AUTH_FAILED') {
+     console.error('Proxy authentication failed');
+   } else if (error.code === 'PROXY_CONNECTION_REFUSED') {
+     console.error('Proxy server refused connection');
+   }
+ }
+ ```
+
+ ### Search Errors
+ ```typescript
+ try {
+   const results = await search('test');
+ } catch (error: any) {
+   if (error.code === 'GOOGLE_SEARCH_ERROR') {
+     console.error('Google search failed');
+   } else if (error.code === 'DDG_SEARCH_ERROR') {
+     console.error('DuckDuckGo search failed');
+   }
+ }
+ ```
+
+ ## Migration from Search API
+
+ The new scraper module is backward compatible with the old search API:
+
+ ```typescript
+ // Old API (still works)
+ import { SearchOptions } from 'llm-search-tools';
+
+ const oldOptions: SearchOptions = {
+   limit: 10,
+   safeSearch: true,
+   timeout: 5000
+ };
+
+ // New API (recommended)
+ import { ScraperOptions } from 'llm-search-tools';
+
+ const newOptions: ScraperOptions = {
+   limit: 10,
+   safeSearch: true,
+   timeout: 5000,
+   forcePuppeteer: false, // New option
+   proxy: undefined,      // New option
+   antiBot: {             // New option
+     enabled: true,
+     maxRetries: 3,
+     retryDelay: 2000
+   }
+ };
+ ```
+
+ ## Best Practices
+
+ 1. **Use Proxies for High Volume**: Always use proxies when making many requests
+ 2. **Respect Rate Limits**: The built-in rate limiting helps avoid IP bans
+ 3. **Monitor for Bot Detection**: Check console logs for fallback messages
+ 4. **Cache Results**: Enable caching to reduce redundant requests (see the sketch after this list)
+ 5. **Handle Errors Gracefully**: Always wrap searches in try-catch blocks
+
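+ Practices 4 and 5 can be combined in a small application-level wrapper. This is a sketch using our own `cachedSearch` helper (not a package export; the module's built-in caching works independently of it):
+
+ ```typescript
+ import { search, SearchResult } from 'llm-search-tools';
+
+ const cache = new Map<string, SearchResult[]>();
+
+ async function cachedSearch(query: string): Promise<SearchResult[]> {
+   const hit = cache.get(query);
+   if (hit) return hit; // serve repeated queries from memory
+   try {
+     const results = await search(query);
+     cache.set(query, results);
+     return results;
+   } catch (error) {
+     console.error('Search failed:', error);
+     return []; // degrade gracefully instead of crashing
+   }
+ }
+ ```
+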
+ ## Example: Complete Scraper Setup
+
+ ```typescript
+ import { search, ProxyConfig, ScraperOptions } from 'llm-search-tools';
+
+ async function advancedScraping() {
+   const proxyConfig: ProxyConfig = {
+     type: 'socks5',
+     host: 'rotating-proxy.example.com',
+     port: 1080,
+     auth: {
+       username: 'your-username',
+       password: 'your-password'
+     }
+   };
+
+   const options: ScraperOptions = {
+     limit: 20,
+     safeSearch: false,
+     timeout: 15000,
+     proxy: proxyConfig,
+     antiBot: {
+       enabled: true,
+       maxRetries: 5,
+       retryDelay: 3000
+     }
+   };
+
+   try {
+     const results = await search('web scraping techniques', options);
+     console.log(`Found ${results.length} results`);
+
+     // Process results...
+     results.forEach(result => {
+       console.log(`- ${result.title}`);
+       console.log(`  ${result.url}`);
+       console.log(`  ${result.snippet}\n`);
+     });
+   } catch (error) {
+     console.error('Scraping failed:', error);
+   }
+ }
+
+ advancedScraping();
+ ```
package/docs/search.md ADDED
@@ -0,0 +1,106 @@
+ # Search Module 🔍
+
+ The search module provides unified search capabilities using Google, DuckDuckGo, and SearxNG.
+
+ ## Functions
+
+ ### search(query: string, options?: SearchOptions)
+
+ Main search function that tries engines in sequence:
+
+ 1. **DuckDuckGo** (Most lenient)
+ 2. **Google** (Best quality, strict bot detection)
+ 3. **SearxNG** (Fallback to public instances)
+
+ ```typescript
+ import { search } from "llm-search-tools";
+
+ const results = await search("typescript tutorial", {
+   limit: 5,
+   safeSearch: true,
+   timeout: 5000,
+ });
+ ```
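+
+ Conceptually, this fallback behaves like the sketch below (an illustration of the documented order, not the package's actual implementation):
+
+ ```typescript
+ import { searchDuckDuckGo, searchGoogle, searchSearxNG } from "llm-search-tools";
+
+ async function searchWithFallback(query: string) {
+   const errors: unknown[] = [];
+   for (const engine of [searchDuckDuckGo, searchGoogle, searchSearxNG]) {
+     try {
+       return await engine(query); // first engine to succeed wins
+     } catch (err) {
+       errors.push(err); // remember why each engine failed
+     }
+   }
+   throw new Error(`All search engines failed (${errors.length} errors)`);
+ }
+ ```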
+
+ ### searchDuckDuckGo(query: string, options?: SearchOptions)
+
+ Search using DuckDuckGo specifically. Uses HTML scraping with a Puppeteer fallback.
+
+ ```typescript
+ import { searchDuckDuckGo } from "llm-search-tools";
+
+ const results = await searchDuckDuckGo("typescript tutorial");
+ ```
+
+ ### searchGoogle(query: string, options?: SearchOptions)
+
+ Search using Google specifically.
+
+ ```typescript
+ import { searchGoogle } from "llm-search-tools";
+
+ const results = await searchGoogle("typescript tutorial");
+ ```
+
+ ### searchSearxNG(query: string, options?: SearchOptions)
+
+ Search using SearxNG (a meta-search engine). Uses public instances by default, or a custom instance when `searxngInstance` is provided.
+
+ ```typescript
+ import { searchSearxNG } from "llm-search-tools";
+
+ const results = await searchSearxNG("typescript tutorial", {
+   searxngInstance: "https://searx.be",
+ });
+ ```
+
+ ## Options
+
+ ```typescript
+ interface SearchOptions {
+   limit?: number;       // max number of results (default: 10)
+   safeSearch?: boolean; // enable safe search (default: true)
+   timeout?: number;     // request timeout in ms (default: 10000)
+
+   // Advanced Options
+   proxy?: string | ProxyConfig; // Proxy configuration
+   antiBot?: {
+     enabled?: boolean;  // Enable anti-bot detection measures
+     maxRetries?: number;
+     retryDelay?: number;
+   };
+   searxngInstance?: string; // Custom SearxNG instance URL
+ }
+ ```
+
+ ## Result Format
+
+ ```typescript
+ interface SearchResult {
+   title: string;    // result title
+   url: string;      // result url
+   snippet?: string; // result description/snippet
+   source: "google" | "duckduckgo" | "wikipedia" | "hackernews" | "searxng";
+ }
+ ```
+
+ ## Error Handling
+
+ All functions throw a `SearchError` on failure. The main `search()` function aggregates errors if all providers fail.
+
+ ```typescript
+ try {
+   const results = await search("typescript tutorial");
+ } catch (err: any) {
+   if (err.code === "ALL_SEARCH_ENGINES_FAILED") {
+     console.log("All search engines failed:", err.errors);
+   }
+ }
+ ```
+
+ ## Tips
+
+ - For best results, use the main `search()` function, which handles fallbacks automatically.
+ - DuckDuckGo is the default first choice as it is less restrictive.
+ - SearxNG is a great fallback as it aggregates multiple engines.
+ - If you are getting blocked, try enabling `antiBot` or configuring a proxy (see the example below).
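+
+ For that last tip, both options can be combined (the proxy host is a placeholder):
+
+ ```typescript
+ const results = await search("typescript tutorial", {
+   antiBot: { enabled: true, maxRetries: 5, retryDelay: 3000 },
+   proxy: "socks5://proxy.example.com:1080",
+ });
+ ```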
package/docs/wikipedia.md ADDED
@@ -0,0 +1,91 @@
+ # Wikipedia Module 📚
+
+ The Wikipedia module provides functions for searching Wikipedia and retrieving article content.
+
+ ## Functions
+
+ ### wikiSearch(query: string, limit?: number)
+
+ Search Wikipedia articles.
+
+ ```typescript
+ import { wikiSearch } from 'llm-search-tools';
+
+ const results = await wikiSearch('Node.js', 5);
+ ```
+
+ ### wikiGetContent(title: string)
+
+ Get the full content of a Wikipedia article.
+
+ ```typescript
+ import { wikiGetContent } from 'llm-search-tools';
+
+ const content = await wikiGetContent('Node.js');
+ ```
+
+ ### wikiGetSummary(title: string)
+
+ Get a summary of a Wikipedia article.
+
+ ```typescript
+ import { wikiGetSummary } from 'llm-search-tools';
+
+ const summary = await wikiGetSummary('Node.js');
+ ```
+
+ ### setWikiLang(language: string)
+
+ Set the Wikipedia language (default: 'en').
+
+ ```typescript
+ import { setWikiLang } from 'llm-search-tools';
+
+ setWikiLang('es'); // switch to Spanish Wikipedia
+ ```
+
+ ## Result Format
+
+ ```typescript
+ interface WikipediaResult extends SearchResult {
+   extract?: string;   // article extract/summary
+   thumbnail?: string; // URL of article thumbnail image
+ }
+ ```
+
+ ## Error Handling
+
+ All functions throw a `SearchError` on failure:
+
+ ```typescript
+ try {
+   const results = await wikiSearch('nodejs');
+ } catch (err: any) {
+   if (err.code === 'WIKI_SEARCH_ERROR') {
+     console.error('Wikipedia search failed:', err.message);
+   }
+ }
+ ```
+
+ ## Tips
+
+ - Use `wikiGetSummary()` to get a quick overview of a topic
+ - `wikiSearch()` results include thumbnails when available
+ - Switch languages with `setWikiLang()` for international content
+ - Article content from `wikiGetContent()` is in raw format; you may want to parse it (see the sketch after this list)
+
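+ As an example of that last tip, here is a minimal sketch that splits raw article text into sections. It assumes `wikiGetContent()` returns a plain-text string with `== Heading ==` style section markers (verify against your actual output); the `getSections` helper is our own, not a package export:
+
+ ```typescript
+ import { wikiGetContent } from 'llm-search-tools';
+
+ async function getSections(title: string): Promise<Map<string, string>> {
+   const raw = await wikiGetContent(title);
+   const sections = new Map<string, string>();
+   let current = 'Introduction';
+   for (const line of String(raw).split('\n')) {
+     const heading = line.match(/^==+\s*(.+?)\s*==+$/); // e.g. "== History =="
+     if (heading) {
+       current = heading[1];
+     } else {
+       sections.set(current, (sections.get(current) ?? '') + line + '\n');
+     }
+   }
+   return sections;
+ }
+ ```
+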
+ ## Common Languages
+
+ Here are some common language codes for `setWikiLang()`:
+
+ - 'en' - English
+ - 'es' - Spanish
+ - 'fr' - French
+ - 'de' - German
+ - 'it' - Italian
+ - 'pt' - Portuguese
+ - 'ru' - Russian
+ - 'ja' - Japanese
+ - 'zh' - Chinese
+
+ See [Wikipedia language codes](https://en.wikipedia.org/wiki/List_of_Wikipedias) for more options.
package/package.json ADDED
@@ -0,0 +1,97 @@
+ {
+   "name": "llm-search-tools",
+   "version": "1.1.0",
+   "description": "A Node.js module for searching and scraping web content, designed for LLMs but useful for any project where webscraping is needed!",
+   "main": "dist/index.js",
+   "types": "dist/index.d.ts",
+   "scripts": {
+     "build": "tsc",
+     "prepare": "npm run build",
+     "test": "vitest run",
+     "test:watch": "vitest",
+     "lint": "eslint src/**/*.ts",
+     "clean": "rm -rf dist",
+     "prepublishOnly": "npm run test && npm run build"
+   },
+   "keywords": [
+     "search",
+     "scraping",
+     "llm",
+     "ai",
+     "wikipedia",
+     "hackernews",
+     "readability",
+     "google",
+     "duckduckgo",
+     "typescript",
+     "web-scraping",
+     "content-extraction",
+     "llm-scrape",
+     "web-search",
+     "scrape",
+     "web-scrape",
+     "web-crawl",
+     "crawling",
+     "web-crawler"
+   ],
+   "author": {
+     "name": "Minoa",
+     "url": "https://gitlab.com/M1noa"
+   },
+   "license": "MIT",
+   "dependencies": {
+     "@crawlee/cheerio": "^3.15.3",
+     "@crawlee/puppeteer": "^3.15.3",
+     "@mozilla/readability": "^0.6.0",
+     "crawlee": "^3.15.3",
+     "csv-parse": "^6.1.0",
+     "fast-xml-parser": "^5.3.3",
+     "google-news-scraper": "^2.7.0",
+     "google-sr": "^6.0.0",
+     "https-proxy-agent": "^7.0.6",
+     "jsdom": "^27.4.0",
+     "mammoth": "^1.11.0",
+     "pdf-parse": "^2.4.5",
+     "puppeteer": "^24.34.0",
+     "puppeteer-extra": "^3.3.6",
+     "puppeteer-extra-plugin-stealth": "^2.11.2",
+     "socks-proxy-agent": "^8.0.5",
+     "tesseract.js": "^7.0.0",
+     "turndown": "^7.2.2",
+     "wikipedia": "^2.4.2",
+     "yahoo-finance2": "^3.11.2",
+     "zod-to-json-schema": "^3.25.1"
+   },
+   "devDependencies": {
+     "@types/jsdom": "^27.0.0",
+     "@types/node": "^25.0.3",
+     "@types/turndown": "^5.0.6",
+     "@typescript-eslint/eslint-plugin": "^8.51.0",
+     "@typescript-eslint/parser": "^8.51.0",
+     "eslint": "^9.39.2",
+     "globals": "^17.0.0",
+     "ts-node": "^10.9.2",
+     "typescript": "^5.9.3",
+     "vitest": "^3.2.4"
+   },
+   "engines": {
+     "node": ">=16"
+   },
+   "repository": {
+     "type": "git",
+     "url": "git+https://gitlab.com/m1noa/llm-search.git"
+   },
+   "bugs": {
+     "url": "https://gitlab.com/m1noa/llm-search/issues"
+   },
+   "homepage": "https://gitlab.com/m1noa/llm-search#readme",
+   "files": [
+     "dist",
+     "LICENSE",
+     "README.md",
+     "docs"
+   ],
+   "publishConfig": {
+     "access": "public"
+   }
+ }