llm-search-tools 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +244 -0
- package/dist/index.d.ts +18 -0
- package/dist/index.js +40 -0
- package/dist/index.js.map +1 -0
- package/dist/integration.test.d.ts +1 -0
- package/dist/integration.test.js +237 -0
- package/dist/modules/answerbox.test.d.ts +1 -0
- package/dist/modules/answerbox.test.js +105 -0
- package/dist/modules/autocomplete.d.ts +11 -0
- package/dist/modules/autocomplete.js +159 -0
- package/dist/modules/autocomplete.test.d.ts +1 -0
- package/dist/modules/autocomplete.test.js +188 -0
- package/dist/modules/common.d.ts +26 -0
- package/dist/modules/common.js +263 -0
- package/dist/modules/common.test.d.ts +1 -0
- package/dist/modules/common.test.js +87 -0
- package/dist/modules/crawl.d.ts +9 -0
- package/dist/modules/crawl.js +117 -0
- package/dist/modules/crawl.test.d.ts +1 -0
- package/dist/modules/crawl.test.js +48 -0
- package/dist/modules/events.d.ts +8 -0
- package/dist/modules/events.js +129 -0
- package/dist/modules/events.test.d.ts +1 -0
- package/dist/modules/events.test.js +104 -0
- package/dist/modules/finance.d.ts +10 -0
- package/dist/modules/finance.js +20 -0
- package/dist/modules/finance.test.d.ts +1 -0
- package/dist/modules/finance.test.js +77 -0
- package/dist/modules/flights.d.ts +8 -0
- package/dist/modules/flights.js +135 -0
- package/dist/modules/flights.test.d.ts +1 -0
- package/dist/modules/flights.test.js +128 -0
- package/dist/modules/hackernews.d.ts +8 -0
- package/dist/modules/hackernews.js +87 -0
- package/dist/modules/hackernews.js.map +1 -0
- package/dist/modules/images.test.d.ts +1 -0
- package/dist/modules/images.test.js +145 -0
- package/dist/modules/integrations.test.d.ts +1 -0
- package/dist/modules/integrations.test.js +93 -0
- package/dist/modules/media.d.ts +11 -0
- package/dist/modules/media.js +132 -0
- package/dist/modules/media.test.d.ts +1 -0
- package/dist/modules/media.test.js +186 -0
- package/dist/modules/news.d.ts +3 -0
- package/dist/modules/news.js +39 -0
- package/dist/modules/news.test.d.ts +1 -0
- package/dist/modules/news.test.js +88 -0
- package/dist/modules/parser.d.ts +19 -0
- package/dist/modules/parser.js +361 -0
- package/dist/modules/parser.test.d.ts +1 -0
- package/dist/modules/parser.test.js +151 -0
- package/dist/modules/reddit.d.ts +21 -0
- package/dist/modules/reddit.js +107 -0
- package/dist/modules/scrape.d.ts +16 -0
- package/dist/modules/scrape.js +272 -0
- package/dist/modules/scrape.test.d.ts +1 -0
- package/dist/modules/scrape.test.js +232 -0
- package/dist/modules/scraper.d.ts +12 -0
- package/dist/modules/scraper.js +640 -0
- package/dist/modules/scrapers/anidb.d.ts +8 -0
- package/dist/modules/scrapers/anidb.js +156 -0
- package/dist/modules/scrapers/duckduckgo.d.ts +6 -0
- package/dist/modules/scrapers/duckduckgo.js +284 -0
- package/dist/modules/scrapers/google-news.d.ts +2 -0
- package/dist/modules/scrapers/google-news.js +60 -0
- package/dist/modules/scrapers/google.d.ts +6 -0
- package/dist/modules/scrapers/google.js +211 -0
- package/dist/modules/scrapers/searxng.d.ts +2 -0
- package/dist/modules/scrapers/searxng.js +93 -0
- package/dist/modules/scrapers/thetvdb.d.ts +3 -0
- package/dist/modules/scrapers/thetvdb.js +147 -0
- package/dist/modules/scrapers/tmdb.d.ts +3 -0
- package/dist/modules/scrapers/tmdb.js +172 -0
- package/dist/modules/scrapers/yahoo-finance.d.ts +2 -0
- package/dist/modules/scrapers/yahoo-finance.js +33 -0
- package/dist/modules/search.d.ts +5 -0
- package/dist/modules/search.js +45 -0
- package/dist/modules/search.js.map +1 -0
- package/dist/modules/search.test.d.ts +1 -0
- package/dist/modules/search.test.js +219 -0
- package/dist/modules/urbandictionary.d.ts +12 -0
- package/dist/modules/urbandictionary.js +26 -0
- package/dist/modules/webpage.d.ts +4 -0
- package/dist/modules/webpage.js +150 -0
- package/dist/modules/webpage.js.map +1 -0
- package/dist/modules/wikipedia.d.ts +5 -0
- package/dist/modules/wikipedia.js +85 -0
- package/dist/modules/wikipedia.js.map +1 -0
- package/dist/scripts/interactive-search.d.ts +1 -0
- package/dist/scripts/interactive-search.js +98 -0
- package/dist/test.d.ts +1 -0
- package/dist/test.js +179 -0
- package/dist/test.js.map +1 -0
- package/dist/testBraveSearch.d.ts +1 -0
- package/dist/testBraveSearch.js +34 -0
- package/dist/testDuckDuckGo.d.ts +1 -0
- package/dist/testDuckDuckGo.js +52 -0
- package/dist/testEcosia.d.ts +1 -0
- package/dist/testEcosia.js +57 -0
- package/dist/testSearchModule.d.ts +1 -0
- package/dist/testSearchModule.js +95 -0
- package/dist/testwebpage.d.ts +1 -0
- package/dist/testwebpage.js +81 -0
- package/dist/types.d.ts +174 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/createTestDocx.d.ts +1 -0
- package/dist/utils/createTestDocx.js +58 -0
- package/dist/utils/htmlcleaner.d.ts +20 -0
- package/dist/utils/htmlcleaner.js +172 -0
- package/docs/README.md +275 -0
- package/docs/autocomplete.md +73 -0
- package/docs/crawling.md +88 -0
- package/docs/events.md +58 -0
- package/docs/examples.md +158 -0
- package/docs/finance.md +60 -0
- package/docs/flights.md +71 -0
- package/docs/hackernews.md +121 -0
- package/docs/media.md +87 -0
- package/docs/news.md +75 -0
- package/docs/parser.md +197 -0
- package/docs/scraper.md +347 -0
- package/docs/search.md +106 -0
- package/docs/wikipedia.md +91 -0
- package/package.json +97 -0
package/docs/scraper.md
ADDED

@@ -0,0 +1,347 @@

# Scraper Module Documentation

The scraper module provides powerful web scraping and content extraction capabilities with automatic bot detection and proxy support. It can intelligently switch between basic HTTP requests and Puppeteer-based browser automation when bot protection is detected.

## Features

- **Automatic Bot Detection**: Detects Cloudflare, PerimeterX, Akamai, DataDome, and other bot protections
- **Puppeteer Fallback**: Automatically switches to browser automation when needed
- **Stealth Mode**: Uses puppeteer-extra-plugin-stealth to bypass advanced bot protection, including Cloudflare
- **Proxy Support**: Full support for HTTP, HTTPS, SOCKS4, and SOCKS5 proxies with authentication
- **Rate Limiting**: Built-in rate limiting to avoid IP bans
- **Caching**: Intelligent caching to reduce redundant requests
- **Content Extraction**: Extract readable content from webpages using Mozilla Readability
- **Special Handlers**: Optimized extraction for Wikipedia and HackerNews

## Basic Usage

### Search Functions
```typescript
import { search, SearchResult } from 'llm-search-tools';

// Basic search - automatically handles bot detection
const results: SearchResult[] = await search('typescript tutorial');
console.log(results);
```

### Webpage Content Extraction
```typescript
import { getWebpageContent, getWebpageText } from 'llm-search-tools';

// Extract content from any webpage
const content = await getWebpageContent('https://example.com/article');
console.log(content.title);
console.log(content.textContent);

// Get just the text content
const text = await getWebpageText('https://example.com/article');
```

### Force Puppeteer Usage
```typescript
// Always use Puppeteer (useful for JavaScript-heavy sites)
const results = await search('react tutorial', {
  forcePuppeteer: true,
  limit: 10
});
```

## Webpage Content Extraction

### Basic Content Extraction
```typescript
import { getWebpageContent, WebpageContent } from 'llm-search-tools';

// Extract content from any webpage
const content: WebpageContent = await getWebpageContent('https://example.com/article');

console.log('Title:', content.title);
console.log('Site:', content.siteName);
console.log('Content length:', content.length);
console.log('Excerpt:', content.excerpt);
console.log('Full text:', content.textContent);
```

### Force Puppeteer for Protected Sites
```typescript
// Use stealth puppeteer for Cloudflare-protected sites
const content = await getWebpageContent('https://protected-site.com/article', {
  usePuppeteer: true
});
```

### Using Proxies with Content Extraction
```typescript
import { getWebpageContent, ProxyConfig } from 'llm-search-tools';

// Extract content through a proxy
const content = await getWebpageContent('https://example.com/article', {
  proxy: 'http://proxy.example.com:8080',
  usePuppeteer: true // Often needed for proxies
});

// Or with a proxy configuration object
const proxyConfig: ProxyConfig = {
  type: 'socks5',
  host: 'proxy.example.com',
  port: 1080,
  auth: {
    username: 'user',
    password: 'pass'
  }
};

const proxiedContent = await getWebpageContent('https://example.com/article', {
  proxy: proxyConfig,
  usePuppeteer: true
});
```

### Special Site Handlers
The scraper automatically detects and optimizes for certain sites:

```typescript
// Wikipedia - automatically extracts clean content
const wikiContent = await getWebpageContent('https://en.wikipedia.org/wiki/Web_scraping');

// HackerNews - extracts story content
const hnContent = await getWebpageContent('https://news.ycombinator.com/item?id=123456');
```

### URL Accessibility Check
```typescript
import { isUrlAccessible } from 'llm-search-tools';

const isAccessible = await isUrlAccessible('https://example.com');
if (isAccessible) {
  const content = await getWebpageContent('https://example.com');
}
```

## Proxy Configuration

### Using Proxy Object
```typescript
import { search, ProxyConfig } from 'llm-search-tools';

const proxyConfig: ProxyConfig = {
  type: 'http', // or 'https', 'socks4', 'socks5'
  host: 'proxy.example.com',
  port: 8080,
  auth: { // Optional authentication
    username: 'user',
    password: 'pass'
  }
};

const results = await search('nodejs tutorial', {
  proxy: proxyConfig
});
```

### Using a Proxy URL String
```typescript
// Simple proxy without auth
const pyResults = await search('python tutorial', {
  proxy: 'http://proxy.example.com:8080'
});

// Proxy with authentication
const javaResults = await search('java tutorial', {
  proxy: 'http://user:pass@proxy.example.com:8080'
});

// SOCKS proxy
const goResults = await search('go tutorial', {
  proxy: 'socks5://proxy.example.com:1080'
});
```

## Bot Detection & Fallback

The scraper automatically detects bot protection and falls back to Puppeteer:

```typescript
// This will automatically handle bot detection
const results = await search('scraping tutorial', {
  antiBot: {
    enabled: true,   // Enable bot detection (default: true)
    maxRetries: 3,   // Max retries on bot detection (default: 3)
    retryDelay: 2000 // Delay between retries in ms (default: 2000)
  }
});
```

### Detected Protections

- **Cloudflare**: `CF-Ray` headers, challenge pages, "Just a moment" redirects
- **PerimeterX**: `_px` cookies, PX headers, captcha challenges
- **Akamai**: `ak_bmsc` cookies, akamaized hosts
- **DataDome**: `__ddg_` cookies, `x-datadome` headers
- **Generic**: CAPTCHAs, 403 errors, rate limiting messages
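
For illustration, here is a minimal sketch of the kind of signals these detectors look for in a raw response. `looksBotProtected` is a hypothetical helper, not part of the llm-search-tools API; the real detection logic is internal to the module:

```typescript
// Hypothetical sketch only - NOT exported by llm-search-tools.
// Assumes header names have already been lowercased.
function looksBotProtected(
  status: number,
  headers: Record<string, string>,
  body: string
): boolean {
  const cookies = headers['set-cookie'] ?? '';
  if (status === 403 || status === 429) return true;                       // Generic block / rate limit
  if ('cf-ray' in headers && body.includes('Just a moment')) return true;  // Cloudflare challenge
  if (cookies.includes('_px')) return true;                                // PerimeterX
  if (cookies.includes('ak_bmsc')) return true;                            // Akamai
  if (cookies.includes('__ddg_') || 'x-datadome' in headers) return true;  // DataDome
  return /captcha/i.test(body);                                            // Generic CAPTCHA page
}
```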

## Advanced Options

```typescript
import { search, ScraperOptions } from 'llm-search-tools';

const options: ScraperOptions = {
  limit: 10,             // Number of results (default: 10)
  safeSearch: true,      // Enable safe search (default: true)
  timeout: 10000,        // Request timeout in ms (default: 10000)
  forcePuppeteer: false, // Force Puppeteer usage (default: false)
  proxy: {               // Proxy configuration
    type: 'https',
    host: 'proxy.example.com',
    port: 8080,
    auth: {
      username: 'user',
      password: 'pass'
    }
  },
  antiBot: {             // Anti-bot configuration
    enabled: true,
    maxRetries: 3,
    retryDelay: 2000
  }
};

const results = await search('advanced query', options);
```

## Search Engine Specific Functions

### Google Search
```typescript
import { searchGoogle } from 'llm-search-tools';

// Google-specific search
const googleResults = await searchGoogle('machine learning', {
  limit: 5,
  proxy: 'http://proxy.example.com:8080'
});
```

### DuckDuckGo Search
```typescript
import { searchDuckDuckGo } from 'llm-search-tools';

// DuckDuckGo-specific search
const ddgResults = await searchDuckDuckGo('data science', {
  safeSearch: false,
  forcePuppeteer: true
});
```

## Error Handling

### Proxy Errors
```typescript
try {
  const results = await search('test', { proxy: 'invalid-proxy' });
} catch (error) {
  if (error.code === 'PROXY_CONNECTION_FAILED') {
    console.error('Could not connect to proxy:', error.message);
  } else if (error.code === 'PROXY_AUTH_FAILED') {
    console.error('Proxy authentication failed');
  } else if (error.code === 'PROXY_CONNECTION_REFUSED') {
    console.error('Proxy server refused connection');
  }
}
```

### Search Errors
```typescript
try {
  const results = await search('test');
} catch (error) {
  if (error.code === 'GOOGLE_SEARCH_ERROR') {
    console.error('Google search failed');
  } else if (error.code === 'DDG_SEARCH_ERROR') {
    console.error('DuckDuckGo search failed');
  }
}
```

## Migration from Search API

The new scraper module is backward compatible with the old search API:

```typescript
// Old API (still works)
import { SearchOptions } from 'llm-search-tools';

const oldOptions: SearchOptions = {
  limit: 10,
  safeSearch: true,
  timeout: 5000
};

// New API (recommended)
import { ScraperOptions } from 'llm-search-tools';

const newOptions: ScraperOptions = {
  limit: 10,
  safeSearch: true,
  timeout: 5000,
  forcePuppeteer: false, // New option
  proxy: undefined,      // New option
  antiBot: {             // New option
    enabled: true,
    maxRetries: 3,
    retryDelay: 2000
  }
};
```

## Best Practices

1. **Use Proxies for High Volume**: Always use proxies when making many requests
2. **Respect Rate Limits**: The built-in rate limiting helps avoid IP bans
3. **Monitor for Bot Detection**: Check console logs for fallback messages
4. **Cache Results**: Enable caching to reduce redundant requests
5. **Handle Errors Gracefully**: Always wrap searches in try-catch blocks

## Example: Complete Scraper Setup

```typescript
import { search, ProxyConfig, ScraperOptions } from 'llm-search-tools';

async function advancedScraping() {
  const proxyConfig: ProxyConfig = {
    type: 'socks5',
    host: 'rotating-proxy.example.com',
    port: 1080,
    auth: {
      username: 'your-username',
      password: 'your-password'
    }
  };

  const options: ScraperOptions = {
    limit: 20,
    safeSearch: false,
    timeout: 15000,
    proxy: proxyConfig,
    antiBot: {
      enabled: true,
      maxRetries: 5,
      retryDelay: 3000
    }
  };

  try {
    const results = await search('web scraping techniques', options);
    console.log(`Found ${results.length} results`);

    // Process results...
    results.forEach(result => {
      console.log(`- ${result.title}`);
      console.log(`  ${result.url}`);
      console.log(`  ${result.snippet}\n`);
    });
  } catch (error) {
    console.error('Scraping failed:', error);
  }
}

advancedScraping();
```
package/docs/search.md
ADDED

@@ -0,0 +1,106 @@

# Search Module 🔍

The search module provides unified search capabilities using Google, DuckDuckGo, and SearxNG.

## Functions

### search(query: string, options?: SearchOptions)

Main search function that tries engines in sequence:

1. **DuckDuckGo** (most lenient)
2. **Google** (best quality, strict bot detection)
3. **SearxNG** (fallback to public instances)

```typescript
import { search } from "llm-search-tools";

const results = await search("typescript tutorial", {
  limit: 5,
  safeSearch: true,
  timeout: 5000,
});
```
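
Conceptually, the fallback behaves like the sketch below. This is a hedged illustration of the engine sequence, not the module's actual implementation; the real `search()` also aggregates per-engine errors (see Error Handling):

```typescript
import {
  searchDuckDuckGo,
  searchGoogle,
  searchSearxNG,
  SearchOptions,
  SearchResult,
} from "llm-search-tools";

// Illustrative sketch of the fallback order described above.
async function searchWithFallback(
  query: string,
  options?: SearchOptions
): Promise<SearchResult[]> {
  const engines = [searchDuckDuckGo, searchGoogle, searchSearxNG];
  const errors: unknown[] = [];
  for (const engine of engines) {
    try {
      const results = await engine(query, options);
      if (results.length > 0) return results;
    } catch (err) {
      errors.push(err); // remember the failure and try the next engine
    }
  }
  throw new Error(`All search engines failed (${errors.length} errors)`);
}
```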

### searchDuckDuckGo(query: string, options?: SearchOptions)

Search using DuckDuckGo specifically. Uses HTML scraping with a Puppeteer fallback.

```typescript
import { searchDuckDuckGo } from "llm-search-tools";

const results = await searchDuckDuckGo("typescript tutorial");
```

### searchGoogle(query: string, options?: SearchOptions)

Search using Google specifically.

```typescript
import { searchGoogle } from "llm-search-tools";

const results = await searchGoogle("typescript tutorial");
```

### searchSearxNG(query: string, options?: SearchOptions)

Search using SearxNG (a meta-search engine). Uses public instances by default, or a custom instance via the `searxngInstance` option.

```typescript
import { searchSearxNG } from "llm-search-tools";

const results = await searchSearxNG("typescript tutorial", {
  searxngInstance: "https://searx.be",
});
```

## Options

```typescript
interface SearchOptions {
  limit?: number;       // max number of results (default: 10)
  safeSearch?: boolean; // enable safe search (default: true)
  timeout?: number;     // request timeout in ms (default: 10000)

  // Advanced Options
  proxy?: string | ProxyConfig; // Proxy configuration
  antiBot?: {
    enabled?: boolean; // Enable anti-bot detection measures
    maxRetries?: number;
    retryDelay?: number;
  };
  searxngInstance?: string; // Custom SearxNG instance URL
}
```

## Result Format

```typescript
interface SearchResult {
  title: string;    // result title
  url: string;      // result url
  snippet?: string; // result description/snippet
  source: "google" | "duckduckgo" | "wikipedia" | "hackernews" | "searxng";
}
```

## Error Handling

All functions throw a `SearchError` on failure. The main `search()` function aggregates errors if all providers fail.

```typescript
try {
  const results = await search("typescript tutorial");
} catch (err) {
  if (err.code === "ALL_SEARCH_ENGINES_FAILED") {
    console.log("All search engines failed:", err.errors);
  }
}
```

## Tips

- For best results, use the main `search()` function, which handles fallbacks automatically.
- DuckDuckGo is the default first choice as it is less restrictive.
- SearxNG is a great fallback as it aggregates multiple engines.
- If you are getting blocked, try enabling `antiBot` or configuring a proxy.

package/docs/wikipedia.md
ADDED

@@ -0,0 +1,91 @@

# Wikipedia Module 📚

The Wikipedia module provides functions for searching Wikipedia and retrieving article content.

## Functions

### wikiSearch(query: string, limit?: number)

Search Wikipedia articles.

```typescript
import { wikiSearch } from 'llm-search-tools';

const results = await wikiSearch('Node.js', 5);
```

### wikiGetContent(title: string)

Get the full content of a Wikipedia article.

```typescript
import { wikiGetContent } from 'llm-search-tools';

const content = await wikiGetContent('Node.js');
```

### wikiGetSummary(title: string)

Get a summary of a Wikipedia article.

```typescript
import { wikiGetSummary } from 'llm-search-tools';

const summary = await wikiGetSummary('Node.js');
```

### setWikiLang(language: string)

Set the Wikipedia language (default: 'en').

```typescript
import { setWikiLang } from 'llm-search-tools';

setWikiLang('es'); // switch to Spanish Wikipedia
```

## Result Format

```typescript
interface WikipediaResult extends SearchResult {
  extract?: string;   // article extract/summary
  thumbnail?: string; // URL of article thumbnail image
}
```

## Error Handling

All functions throw a `SearchError` on failure:

```typescript
try {
  const results = await wikiSearch('nodejs');
} catch (err) {
  if (err.code === 'WIKI_SEARCH_ERROR') {
    console.error('Wikipedia search failed:', err.message);
  }
}
```

## Tips

- Use `wikiGetSummary()` to get a quick overview of a topic
- `wikiSearch()` results include thumbnails when available
- Switch languages with `setWikiLang()` for international content
- Article content from `wikiGetContent()` is in raw format; you may want to parse it
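
For example, a minimal sketch that combines the functions above: search for a topic, then fetch a summary of the top hit. The composition is illustrative; the individual calls are the documented API.

```typescript
import { wikiSearch, wikiGetSummary } from 'llm-search-tools';

// Find articles about a topic, then pull a quick summary of the best match
const hits = await wikiSearch('Node.js', 3);
if (hits.length > 0) {
  const summary = await wikiGetSummary(hits[0].title);
  console.log(summary);
}
```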

## Common Languages

Here are some common language codes for `setWikiLang()`:

- `en` - English
- `es` - Spanish
- `fr` - French
- `de` - German
- `it` - Italian
- `pt` - Portuguese
- `ru` - Russian
- `ja` - Japanese
- `zh` - Chinese

See [Wikipedia language codes](https://en.wikipedia.org/wiki/List_of_Wikipedias) for more options.
package/package.json
ADDED

@@ -0,0 +1,97 @@

{
  "name": "llm-search-tools",
  "version": "1.1.0",
  "description": "A Node.js module for searching and scraping web content, designed for LLMs but useful for any project where webscraping is needed!",
  "main": "dist/index.js",
  "types": "dist/index.d.ts",
  "scripts": {
    "build": "tsc",
    "prepare": "npm run build",
    "test": "vitest run",
    "test:watch": "vitest",
    "lint": "eslint src/**/*.ts",
    "clean": "rm -rf dist",
    "prepublishOnly": "npm run test && npm run build"
  },
  "keywords": [
    "search",
    "scraping",
    "llm",
    "ai",
    "wikipedia",
    "hackernews",
    "readability",
    "google",
    "duckduckgo",
    "typescript",
    "web-scraping",
    "content-extraction",
    "llm-scrape",
    "web-search",
    "scrape",
    "web-scrape",
    "web-crawl",
    "crawling",
    "web-crawler"
  ],
  "author": {
    "name": "Minoa",
    "url": "https://gitlab.com/M1noa"
  },
  "license": "MIT",
  "dependencies": {
    "@crawlee/cheerio": "^3.15.3",
    "@crawlee/puppeteer": "^3.15.3",
    "@mozilla/readability": "^0.6.0",
    "crawlee": "^3.15.3",
    "csv-parse": "^6.1.0",
    "fast-xml-parser": "^5.3.3",
    "google-news-scraper": "^2.7.0",
    "google-sr": "^6.0.0",
    "https-proxy-agent": "^7.0.6",
    "jsdom": "^27.4.0",
    "mammoth": "^1.11.0",
    "pdf-parse": "^2.4.5",
    "puppeteer": "^24.34.0",
    "puppeteer-extra": "^3.3.6",
    "puppeteer-extra-plugin-stealth": "^2.11.2",
    "socks-proxy-agent": "^8.0.5",
    "tesseract.js": "^7.0.0",
    "turndown": "^7.2.2",
    "wikipedia": "^2.4.2",
    "yahoo-finance2": "^3.11.2",
    "zod-to-json-schema": "^3.25.1"
  },
  "devDependencies": {
    "@types/jsdom": "^27.0.0",
    "@types/node": "^25.0.3",
    "@types/turndown": "^5.0.6",
    "@typescript-eslint/eslint-plugin": "^8.51.0",
    "@typescript-eslint/parser": "^8.51.0",
    "eslint": "^9.39.2",
    "globals": "^17.0.0",
    "ts-node": "^10.9.2",
    "typescript": "^5.9.3",
    "vitest": "^3.2.4"
  },
  "engines": {
    "node": ">=16"
  },
  "repository": {
    "type": "git",
    "url": "git+https://gitlab.com/m1noa/llm-search.git"
  },
  "bugs": {
    "url": "https://gitlab.com/m1noa/llm-search/issues"
  },
  "homepage": "https://gitlab.com/m1noa/llm-search#readme",
  "files": [
    "dist",
    "LICENSE",
    "README.md",
    "docs"
  ],
  "publishConfig": {
    "access": "public"
  }
}