llm-search-tools 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +244 -0
- package/dist/index.d.ts +18 -0
- package/dist/index.js +40 -0
- package/dist/index.js.map +1 -0
- package/dist/integration.test.d.ts +1 -0
- package/dist/integration.test.js +237 -0
- package/dist/modules/answerbox.test.d.ts +1 -0
- package/dist/modules/answerbox.test.js +105 -0
- package/dist/modules/autocomplete.d.ts +11 -0
- package/dist/modules/autocomplete.js +159 -0
- package/dist/modules/autocomplete.test.d.ts +1 -0
- package/dist/modules/autocomplete.test.js +188 -0
- package/dist/modules/common.d.ts +26 -0
- package/dist/modules/common.js +263 -0
- package/dist/modules/common.test.d.ts +1 -0
- package/dist/modules/common.test.js +87 -0
- package/dist/modules/crawl.d.ts +9 -0
- package/dist/modules/crawl.js +117 -0
- package/dist/modules/crawl.test.d.ts +1 -0
- package/dist/modules/crawl.test.js +48 -0
- package/dist/modules/events.d.ts +8 -0
- package/dist/modules/events.js +129 -0
- package/dist/modules/events.test.d.ts +1 -0
- package/dist/modules/events.test.js +104 -0
- package/dist/modules/finance.d.ts +10 -0
- package/dist/modules/finance.js +20 -0
- package/dist/modules/finance.test.d.ts +1 -0
- package/dist/modules/finance.test.js +77 -0
- package/dist/modules/flights.d.ts +8 -0
- package/dist/modules/flights.js +135 -0
- package/dist/modules/flights.test.d.ts +1 -0
- package/dist/modules/flights.test.js +128 -0
- package/dist/modules/hackernews.d.ts +8 -0
- package/dist/modules/hackernews.js +87 -0
- package/dist/modules/hackernews.js.map +1 -0
- package/dist/modules/images.test.d.ts +1 -0
- package/dist/modules/images.test.js +145 -0
- package/dist/modules/integrations.test.d.ts +1 -0
- package/dist/modules/integrations.test.js +93 -0
- package/dist/modules/media.d.ts +11 -0
- package/dist/modules/media.js +132 -0
- package/dist/modules/media.test.d.ts +1 -0
- package/dist/modules/media.test.js +186 -0
- package/dist/modules/news.d.ts +3 -0
- package/dist/modules/news.js +39 -0
- package/dist/modules/news.test.d.ts +1 -0
- package/dist/modules/news.test.js +88 -0
- package/dist/modules/parser.d.ts +19 -0
- package/dist/modules/parser.js +361 -0
- package/dist/modules/parser.test.d.ts +1 -0
- package/dist/modules/parser.test.js +151 -0
- package/dist/modules/reddit.d.ts +21 -0
- package/dist/modules/reddit.js +107 -0
- package/dist/modules/scrape.d.ts +16 -0
- package/dist/modules/scrape.js +272 -0
- package/dist/modules/scrape.test.d.ts +1 -0
- package/dist/modules/scrape.test.js +232 -0
- package/dist/modules/scraper.d.ts +12 -0
- package/dist/modules/scraper.js +640 -0
- package/dist/modules/scrapers/anidb.d.ts +8 -0
- package/dist/modules/scrapers/anidb.js +156 -0
- package/dist/modules/scrapers/duckduckgo.d.ts +6 -0
- package/dist/modules/scrapers/duckduckgo.js +284 -0
- package/dist/modules/scrapers/google-news.d.ts +2 -0
- package/dist/modules/scrapers/google-news.js +60 -0
- package/dist/modules/scrapers/google.d.ts +6 -0
- package/dist/modules/scrapers/google.js +211 -0
- package/dist/modules/scrapers/searxng.d.ts +2 -0
- package/dist/modules/scrapers/searxng.js +93 -0
- package/dist/modules/scrapers/thetvdb.d.ts +3 -0
- package/dist/modules/scrapers/thetvdb.js +147 -0
- package/dist/modules/scrapers/tmdb.d.ts +3 -0
- package/dist/modules/scrapers/tmdb.js +172 -0
- package/dist/modules/scrapers/yahoo-finance.d.ts +2 -0
- package/dist/modules/scrapers/yahoo-finance.js +33 -0
- package/dist/modules/search.d.ts +5 -0
- package/dist/modules/search.js +45 -0
- package/dist/modules/search.js.map +1 -0
- package/dist/modules/search.test.d.ts +1 -0
- package/dist/modules/search.test.js +219 -0
- package/dist/modules/urbandictionary.d.ts +12 -0
- package/dist/modules/urbandictionary.js +26 -0
- package/dist/modules/webpage.d.ts +4 -0
- package/dist/modules/webpage.js +150 -0
- package/dist/modules/webpage.js.map +1 -0
- package/dist/modules/wikipedia.d.ts +5 -0
- package/dist/modules/wikipedia.js +85 -0
- package/dist/modules/wikipedia.js.map +1 -0
- package/dist/scripts/interactive-search.d.ts +1 -0
- package/dist/scripts/interactive-search.js +98 -0
- package/dist/test.d.ts +1 -0
- package/dist/test.js +179 -0
- package/dist/test.js.map +1 -0
- package/dist/testBraveSearch.d.ts +1 -0
- package/dist/testBraveSearch.js +34 -0
- package/dist/testDuckDuckGo.d.ts +1 -0
- package/dist/testDuckDuckGo.js +52 -0
- package/dist/testEcosia.d.ts +1 -0
- package/dist/testEcosia.js +57 -0
- package/dist/testSearchModule.d.ts +1 -0
- package/dist/testSearchModule.js +95 -0
- package/dist/testwebpage.d.ts +1 -0
- package/dist/testwebpage.js +81 -0
- package/dist/types.d.ts +174 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/createTestDocx.d.ts +1 -0
- package/dist/utils/createTestDocx.js +58 -0
- package/dist/utils/htmlcleaner.d.ts +20 -0
- package/dist/utils/htmlcleaner.js +172 -0
- package/docs/README.md +275 -0
- package/docs/autocomplete.md +73 -0
- package/docs/crawling.md +88 -0
- package/docs/events.md +58 -0
- package/docs/examples.md +158 -0
- package/docs/finance.md +60 -0
- package/docs/flights.md +71 -0
- package/docs/hackernews.md +121 -0
- package/docs/media.md +87 -0
- package/docs/news.md +75 -0
- package/docs/parser.md +197 -0
- package/docs/scraper.md +347 -0
- package/docs/search.md +106 -0
- package/docs/wikipedia.md +91 -0
- package/package.json +97 -0
|
@@ -0,0 +1,640 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// scraper.ts - unified scraper with bot detection, proxy support, and content extraction
|
|
3
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
4
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
5
|
+
};
|
|
6
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
7
|
+
exports.searchGoogle = searchGoogle;
|
|
8
|
+
exports.searchGoogleLegacy = searchGoogle;
|
|
9
|
+
exports.searchDuckDuckGo = searchDuckDuckGo;
|
|
10
|
+
exports.searchDuckDuckGoLegacy = searchDuckDuckGo;
|
|
11
|
+
exports.search = search;
|
|
12
|
+
exports.getWebpageContent = getWebpageContent;
|
|
13
|
+
exports.getWebpageText = getWebpageText;
|
|
14
|
+
exports.isUrlAccessible = isUrlAccessible;
|
|
15
|
+
const google_sr_1 = require("google-sr");
|
|
16
|
+
const duck_duck_scrape_1 = require("duck-duck-scrape");
|
|
17
|
+
const puppeteer_extra_1 = __importDefault(require("puppeteer-extra"));
|
|
18
|
+
const puppeteer_extra_plugin_stealth_1 = __importDefault(require("puppeteer-extra-plugin-stealth"));
|
|
19
|
+
const readability_1 = require("@mozilla/readability");
|
|
20
|
+
const jsdom_1 = require("jsdom");
|
|
21
|
+
const wikipedia_1 = require("./wikipedia");
|
|
22
|
+
const hackernews_1 = require("./hackernews");
|
|
23
|
+
// Use stealth plugin
|
|
24
|
+
puppeteer_extra_1.default.use((0, puppeteer_extra_plugin_stealth_1.default)());
|
|
25
|
+
// Baseline search options. Callers' options are shallow-merged over these
// (`{ ...defaultOptions, ...options }`), so a caller that passes its own
// `antiBot` object replaces the nested defaults as a whole.
const defaultOptions = {
    // Upper bound on the number of results returned by a search.
    limit: 10,
    // When true, the DuckDuckGo library is asked for strict safe search.
    safeSearch: true,
    // Request timeout; presumably milliseconds (it is forwarded to fetch options).
    timeout: 10000,
    // When true, skip the plain-HTTP attempt and go straight to the browser.
    forcePuppeteer: false,
    // Controls the fallback to a stealth browser when bot protection is detected.
    antiBot: { enabled: true, maxRetries: 3, retryDelay: 2000 }
};
|
|
36
|
+
// ---- Rate limiting ---------------------------------------------------------
// Minimum gap, in milliseconds, enforced between two searches on one engine.
const MIN_DELAY_BETWEEN_SEARCHES = 5 * 1000; // DuckDuckGo
const GOOGLE_DELAY = 2 * 1000; // Google
// Retry policy for the DuckDuckGo search loop.
const MAX_RETRIES = 3;
const RETRY_DELAY = 2 * 1000; // milliseconds to wait between attempts
// Date.now() timestamp of the most recent search per engine (0 = none yet).
let lastDDGSearchTime = 0;
let lastGoogleSearchTime = 0;
// ---- Result cache ----------------------------------------------------------
// In-memory cache: cache key -> { results, timestamp, source }.
const searchCache = new Map();
const CACHE_TTL = 3600 * 1000; // entries older than one hour are ignored
|
|
46
|
+
// Bot detection patterns, grouped by vendor. Every pattern is matched as a
// case-insensitive substring, so entries must be specific enough not to occur
// in ordinary markup. The former 'PX' entry was removed for that reason: once
// lower-cased it matched the CSS unit in any page containing e.g. "10px".
// PerimeterX is still covered by '_px', 'perimeterx' and 'px-captcha'.
const BOT_PROTECTION_PATTERNS = {
    cloudflare: [
        'cf-ray',
        '__cf_bm',
        'cloudflare',
        'challenge-platform',
        'Just a moment...',
        'Checking your browser',
        'DDoS protection by Cloudflare'
    ],
    perimeterx: [
        '_px',
        'perimeterx',
        'px-captcha',
        'bot-management'
    ],
    akamai: [
        'akamai',
        'ak_bmsc',
        'akamaighost',
        'akamaized',
        'edgekey'
    ],
    datadome: [
        'datadome',
        '__ddg_',
        'x-datadome',
        'ddg-',
        'bot-detection'
    ],
    generic: [
        'captcha',
        'recaptcha',
        'hcaptcha',
        'access denied',
        '403 forbidden',
        'rate limit',
        'too many requests',
        'blocked',
        'security check',
        'unauthorized'
    ]
};
/**
 * Heuristically decides whether a response came from a bot-protection layer.
 *
 * @param headers Response headers; must expose `entries()` yielding [name, value] pairs.
 * @param body    Response body text.
 * @returns true when any pattern in BOT_PROTECTION_PATTERNS occurs
 *          (case-insensitively) in a "name: value" header line or in the body.
 */
function detectBotProtection(headers, body) {
    // Lower-case every pattern once instead of once per comparison.
    const needles = Object.values(BOT_PROTECTION_PATTERNS)
        .flat()
        .map((pattern) => pattern.toLowerCase());
    const matchesAny = (haystack) => needles.some((needle) => haystack.includes(needle));
    // Check headers
    for (const [key, value] of headers.entries()) {
        if (matchesAny(`${key}: ${value}`.toLowerCase())) {
            return true;
        }
    }
    // Check body content
    return matchesAny(body.toLowerCase());
}
|
|
115
|
+
/**
 * Normalizes a proxy setting into `{ url, type }`.
 *
 * @param proxy Either a proxy URL string, or a config object with `type`,
 *              `host`, `port` and optional `auth: { username, password }`.
 * @returns `null` when no proxy is given; otherwise the proxy URL and its
 *          scheme (e.g. 'http', 'socks5').
 * @throws Error('Invalid proxy URL format') when a string cannot be parsed as a URL.
 */
function parseProxyConfig(proxy) {
    if (!proxy)
        return null;
    if (typeof proxy === 'string') {
        // The string is only validated and inspected; it is returned verbatim.
        let parsed;
        try {
            parsed = new URL(proxy);
        }
        catch {
            throw new Error('Invalid proxy URL format');
        }
        return {
            url: proxy,
            type: parsed.protocol.replace(':', '')
        };
    }
    // Build proxy URL from config. Credentials are percent-encoded so that
    // characters such as '@', ':' or '/' cannot corrupt the URL structure.
    const auth = proxy.auth
        ? `${encodeURIComponent(proxy.auth.username)}:${encodeURIComponent(proxy.auth.password)}@`
        : '';
    return {
        url: `${proxy.type}://${auth}${proxy.host}:${proxy.port}`,
        type: proxy.type
    };
}
|
|
140
|
+
/**
 * Builds a browser-like set of request headers for plain HTTP requests.
 * The User-Agent is picked at random from a small fixed pool on every call;
 * all other headers are constant.
 */
function createRealisticHeaders() {
    const agentPool = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0'
    ];
    const pickedIndex = Math.floor(Math.random() * agentPool.length);
    const fixedHeaders = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Cache-Control': 'max-age=0'
    };
    // User-Agent stays first so the header order matches a fresh object literal.
    return { 'User-Agent': agentPool[pickedIndex], ...fixedHeaders };
}
|
|
161
|
+
/**
 * Launches a headless browser through puppeteer-extra (stealth plugin is
 * registered at module load) and clears permission overrides for the two
 * search engine origins.
 *
 * @param proxy Optional `{ url }` as produced by parseProxyConfig; when given,
 *              its URL is passed to the browser via `--proxy-server`.
 * @returns The launched browser. The caller is responsible for closing it.
 * @throws Whatever launch or permission setup throws; in the latter case the
 *         browser is closed first so no process is left behind.
 */
async function createStealthBrowser(proxy) {
    // NOTE(review): '--disable-web-security' and '--ignore-certificate-errors'
    // relax same-origin and TLS checks for every page this browser loads.
    const args = [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-accelerated-2d-canvas',
        '--no-first-run',
        '--no-zygote',
        '--single-process',
        '--disable-gpu',
        '--disable-web-security',
        '--disable-features=VizDisplayCompositor',
        '--ignore-certificate-errors',
        '--ignore-certificate-errors-spki-list'
    ];
    if (proxy) {
        args.push(`--proxy-server=${proxy.url}`);
    }
    const browser = await puppeteer_extra_1.default.launch({ headless: 'new', args });
    try {
        // Additional stealth measures
        const context = browser.defaultBrowserContext();
        await context.overridePermissions('https://www.google.com', []);
        await context.overridePermissions('https://duckduckgo.com', []);
    }
    catch (error) {
        // Do not leak the browser process when post-launch setup fails.
        await browser.close().catch(() => undefined);
        throw error;
    }
    return browser;
}
|
|
189
|
+
/**
 * Waits, if necessary, until the minimum gap since the previous search on the
 * same engine has elapsed, then records the current time as the latest search.
 *
 * @param searchType "ddg" selects the DuckDuckGo limits; any other value
 *                   selects the Google limits.
 */
async function enforceRateLimit(searchType) {
    const isDdg = searchType === "ddg";
    const startedAt = Date.now();
    const minGap = isDdg ? MIN_DELAY_BETWEEN_SEARCHES : GOOGLE_DELAY;
    const elapsed = startedAt - (isDdg ? lastDDGSearchTime : lastGoogleSearchTime);
    if (elapsed < minGap) {
        const remaining = minGap - elapsed;
        await new Promise((resolve) => setTimeout(resolve, remaining));
    }
    // Stamp after the wait so the next caller measures from the actual start.
    const finishedAt = Date.now();
    if (isDdg) {
        lastDDGSearchTime = finishedAt;
    }
    else {
        lastGoogleSearchTime = finishedAt;
    }
}
|
|
205
|
+
/**
 * Derives the cache key for a search: the query, a dash, then the options
 * serialized as JSON (so key order of `options` is significant).
 */
function getCacheKey(query, options) {
    const serializedOptions = JSON.stringify(options);
    return ''.concat(query, '-', serializedOptions);
}
|
|
209
|
+
/**
 * Fetches `url` with browser-like headers and rejects if the response looks
 * like a bot-protection page.
 *
 * @param url     Address to request.
 * @param options Search options; `proxy` and `timeout` are read here.
 * @returns `{ headers, body }` of the response.
 * @throws Error('Bot protection detected') when detectBotProtection matches.
 * @throws `{ message, code, originalError }` objects with code
 *         PROXY_CONNECTION_FAILED, PROXY_AUTH_FAILED or PROXY_CONNECTION_REFUSED
 *         for the corresponding failures; any other error is rethrown as-is.
 */
async function fetchWithDetection(url, options) {
    const proxy = parseProxyConfig(options.proxy);
    const timeoutMs = options.timeout || 10000;
    const fetchOptions = {
        headers: createRealisticHeaders(),
        // `timeout` is kept for node-fetch style implementations; the abort
        // signal enforces the same limit on fetch implementations that do not
        // read a `timeout` option.
        timeout: timeoutMs,
        signal: AbortSignal.timeout(timeoutMs)
    };
    if (proxy) {
        // NOTE(review): `agent` is a node-fetch style option; confirm that the
        // `fetch` in scope honors it, otherwise this request bypasses the proxy.
        try {
            let agent;
            if (proxy.type === 'socks4' || proxy.type === 'socks5') {
                const { SocksProxyAgent } = await import('socks-proxy-agent');
                agent = new SocksProxyAgent(proxy.url);
            }
            else {
                const { HttpsProxyAgent } = await import('https-proxy-agent');
                agent = new HttpsProxyAgent(proxy.url);
            }
            fetchOptions.agent = agent;
        }
        catch (error) {
            const errorMessage = error instanceof Error ? error.message : String(error);
            throw {
                message: `Proxy connection failed: ${errorMessage}`,
                code: 'PROXY_CONNECTION_FAILED',
                originalError: error
            };
        }
    }
    try {
        const response = await fetch(url, fetchOptions);
        const body = await response.text();
        if (detectBotProtection(response.headers, body)) {
            throw new Error('Bot protection detected');
        }
        return {
            headers: response.headers,
            body
        };
    }
    catch (error) {
        // Classify by message text; anything unrecognized (including the
        // bot-protection error thrown above) propagates unchanged.
        const errorMessage = error instanceof Error ? error.message : String(error);
        if (errorMessage.includes('407') || errorMessage.includes('authentication')) {
            throw {
                message: 'Proxy authentication failed',
                code: 'PROXY_AUTH_FAILED',
                originalError: error
            };
        }
        if (errorMessage.includes('ECONNREFUSED') || errorMessage.includes('ENOTFOUND')) {
            throw {
                message: 'Proxy connection refused',
                code: 'PROXY_CONNECTION_REFUSED',
                originalError: error
            };
        }
        throw error;
    }
}
|
|
269
|
+
/**
 * Runs a search in a stealth browser and scrapes the result list from the page.
 *
 * @param query      Search text.
 * @param searchType "google" selects Google; any other value selects DuckDuckGo.
 * @param options    Search options; `proxy`, `timeout` and `limit` are read here.
 * @returns Array of `{ title, url, snippet, source }`.
 * @throws Whatever the browser throws (navigation failure, selector timeout);
 *         the browser is always closed first.
 */
async function searchWithPuppeteer(query, searchType, options) {
    const proxy = parseProxyConfig(options.proxy);
    const browser = await createStealthBrowser(proxy || undefined);
    try {
        // Opened inside the try so a failure here still closes the browser.
        const page = await browser.newPage();
        // Set realistic viewport
        await page.setViewport({ width: 1920, height: 1080 });
        // Set extra headers
        await page.setExtraHTTPHeaders(createRealisticHeaders());
        // Navigate to search engine
        const isGoogle = searchType === "google";
        const searchUrl = isGoogle
            ? `https://www.google.com/search?q=${encodeURIComponent(query)}`
            : `https://duckduckgo.com/?q=${encodeURIComponent(query)}`;
        await page.goto(searchUrl, { waitUntil: 'networkidle2' });
        // Wait for results, honoring the caller's timeout (10 s when unset).
        await page.waitForSelector(isGoogle ? 'div.g' : '#links .result', { timeout: options.timeout || 10000 });
        // Extract results. This callback executes inside the page, so it must
        // be self-contained and cannot reference variables from this module.
        return await page.evaluate((limit) => {
            const onGoogle = window.location.hostname.includes('google');
            const selectors = onGoogle
                ? { result: 'div.g', title: 'h3', snippet: '.VwiC3b', source: 'google' }
                : { result: '#links .result', title: 'h2', snippet: '.result__snippet', source: 'duckduckgo' };
            const elements = document.querySelectorAll(selectors.result);
            // The limit bounds how many result containers are inspected, not
            // how many items are ultimately collected.
            const inspected = Math.min(elements.length, limit || 10);
            const items = [];
            for (let i = 0; i < inspected; i++) {
                const el = elements[i];
                const titleEl = el.querySelector(selectors.title);
                const linkEl = el.querySelector('a');
                const snippetEl = el.querySelector(selectors.snippet);
                if (titleEl && linkEl) {
                    items.push({
                        title: titleEl.textContent || '',
                        url: linkEl.href || '',
                        snippet: snippetEl?.textContent || '',
                        source: selectors.source
                    });
                }
            }
            return items;
        }, options.limit);
    }
    finally {
        await browser.close();
    }
}
|
|
331
|
+
/**
 * Searches Google, preferring the google-sr library and falling back to a
 * stealth browser when bot protection is detected (or when forced).
 *
 * Results are cached in memory per query+options for CACHE_TTL.
 *
 * @param query   Search text.
 * @param options Overrides for defaultOptions (shallow-merged).
 * @returns Array of `{ title, url, snippet, source: "google" }`, at most
 *          `limit` entries on the library path.
 * @throws `{ message, code: "GOOGLE_SEARCH_ERROR", originalError }` on any failure.
 */
async function searchGoogle(query, options = {}) {
    try {
        const opts = { ...defaultOptions, ...options };
        const cacheKey = getCacheKey(query, opts);
        const cached = searchCache.get(cacheKey);
        if (cached &&
            cached.source === "google" &&
            Date.now() - cached.timestamp < CACHE_TTL) {
            return cached.results;
        }
        await enforceRateLimit("google");
        // Stores the results under the cache key and hands them back.
        const remember = (results) => {
            searchCache.set(cacheKey, {
                results,
                timestamp: Date.now(),
                source: "google",
            });
            return results;
        };
        // Try basic fetch first unless Puppeteer is forced
        if (!opts.forcePuppeteer) {
            try {
                // Probe request: only used to see whether Google answers with a
                // bot-protection page; the library performs the real search.
                const searchUrl = `https://www.google.com/search?q=${encodeURIComponent(query)}`;
                await fetchWithDetection(searchUrl, opts);
                // If no bot detection, use library
                const results = await (0, google_sr_1.search)({
                    query,
                    parsers: [google_sr_1.OrganicResult],
                    noPartialResults: true,
                    // Honor the safeSearch option instead of always forcing it on.
                    requestConfig: { queryParams: { safe: opts.safeSearch ? 'active' : 'off' } }
                });
                // Apply the result limit, consistent with the DuckDuckGo path.
                const formattedResults = results
                    .slice(0, opts.limit)
                    .map((r) => ({
                    title: r.title || "",
                    url: r.link || "",
                    snippet: r.description || "",
                    source: "google",
                }));
                return remember(formattedResults);
            }
            catch (error) {
                const errorMessage = error instanceof Error ? error.message : String(error);
                const canFallBack = errorMessage === 'Bot protection detected' && opts.antiBot?.enabled;
                if (!canFallBack) {
                    throw error;
                }
                console.warn('Bot protection detected, falling back to Puppeteer...');
            }
        }
        // Use Puppeteer as fallback
        return remember(await searchWithPuppeteer(query, "google", opts));
    }
    catch (error) {
        throw {
            message: "google search failed :(",
            code: "GOOGLE_SEARCH_ERROR",
            originalError: error,
        };
    }
}
|
|
394
|
+
/**
 * Searches DuckDuckGo with retries, preferring the duck-duck-scrape library
 * and falling back to a stealth browser when bot protection is detected (or
 * when forced).
 *
 * Results are cached in memory per query+options for CACHE_TTL. The retry
 * count and delay come from `options.antiBot.maxRetries` / `retryDelay` when
 * provided, otherwise from MAX_RETRIES / RETRY_DELAY.
 *
 * @param query   Search text.
 * @param options Overrides for defaultOptions (shallow-merged).
 * @returns Array of `{ title, url, snippet, source: "duckduckgo" }`.
 * @throws `{ message, code: "DDG_SEARCH_ERROR", originalError }` once every
 *         attempt has failed; `originalError` is the last failure.
 */
async function searchDuckDuckGo(query, options = {}) {
    const opts = { ...defaultOptions, ...options };
    const cacheKey = getCacheKey(query, opts);
    const cached = searchCache.get(cacheKey);
    if (cached &&
        cached.source === "duckduckgo" &&
        Date.now() - cached.timestamp < CACHE_TTL) {
        return cached.results;
    }
    // A caller-supplied `antiBot` object replaces the default one wholesale,
    // so either field may be missing; fall back to the module constants.
    const maxAttempts = Math.max(1, opts.antiBot?.maxRetries ?? MAX_RETRIES);
    const retryDelayMs = opts.antiBot?.retryDelay ?? RETRY_DELAY;
    // Stores the results under the cache key and hands them back.
    const remember = (results) => {
        searchCache.set(cacheKey, {
            results,
            timestamp: Date.now(),
            source: "duckduckgo",
        });
        return results;
    };
    let lastError;
    for (let attempt = 1; attempt <= maxAttempts; attempt++) {
        try {
            await enforceRateLimit("ddg");
            // Try basic fetch first unless Puppeteer is forced
            if (!opts.forcePuppeteer) {
                try {
                    // Probe request: only used to detect a bot-protection page.
                    const searchUrl = `https://duckduckgo.com/?q=${encodeURIComponent(query)}`;
                    await fetchWithDetection(searchUrl, opts);
                    // If no bot detection, use library
                    const results = await (0, duck_duck_scrape_1.search)(query, {
                        safeSearch: opts.safeSearch
                            ? duck_duck_scrape_1.SafeSearchType.STRICT
                            : duck_duck_scrape_1.SafeSearchType.OFF,
                    });
                    const formattedResults = results.results
                        .slice(0, opts.limit)
                        .map((r) => ({
                        title: r.title,
                        url: r.url,
                        snippet: r.description,
                        source: "duckduckgo",
                    }));
                    return remember(formattedResults);
                }
                catch (error) {
                    const errorMessage = error instanceof Error ? error.message : String(error);
                    const canFallBack = errorMessage === 'Bot protection detected' && opts.antiBot?.enabled;
                    if (!canFallBack) {
                        // Handled by the outer catch, which schedules a retry.
                        throw error;
                    }
                    console.warn('Bot protection detected, falling back to Puppeteer...');
                }
            }
            // Use Puppeteer as fallback
            return remember(await searchWithPuppeteer(query, "duckduckgo", opts));
        }
        catch (error) {
            lastError = error;
            if (attempt < maxAttempts) {
                console.warn(`DuckDuckGo search attempt ${attempt} failed, retrying in ${retryDelayMs / 1000} seconds...`);
                await new Promise((resolve) => setTimeout(resolve, retryDelayMs));
            }
        }
    }
    throw {
        message: "duckduckgo search failed :/",
        code: "DDG_SEARCH_ERROR",
        originalError: lastError,
    };
}
|
|
466
|
+
/**
 * Unified search entry point: DuckDuckGo first, Google as the fallback.
 * A DuckDuckGo failure is logged as a warning; a Google failure propagates.
 */
async function search(query, options = {}) {
    let ddgFailure;
    try {
        return await searchDuckDuckGo(query, options);
    }
    catch (err) {
        ddgFailure = err;
    }
    // fallback to google if ddg fails
    console.warn("duckduckgo search failed, falling back to google...", ddgFailure);
    return await searchGoogle(query, options);
}
|
|
477
|
+
// ===== WEBPAGE CONTENT EXTRACTION =====
|
|
478
|
+
/**
 * Normalizes text by collapsing every run of whitespace (spaces, tabs, line
 * breaks) into a single space and trimming both ends.
 *
 * The output is always a single line: the earlier implementation inserted
 * blank lines after sentence punctuation and then immediately collapsed them
 * again, so those intermediate steps had no effect and have been dropped.
 *
 * @param text Raw text.
 * @returns The whitespace-normalized text.
 */
function cleanText(text) {
    return text.replace(/\s+/g, ' ').trim();
}
|
|
487
|
+
/**
 * Classifies a URL so the right content extractor can be chosen.
 *
 * Hosts are compared as whole domains (exact match or a subdomain of it), not
 * as substrings: 'netflix.com' is not treated as 'x.com', and
 * 'wikipedia.org.example.net' is not treated as Wikipedia.
 *
 * @param url Address to classify.
 * @returns 'wikipedia', 'hackernews' (only for item pages), 'unsupported'
 *          (video/social hosts or an unparsable URL) or 'general'.
 */
function getUrlType(url) {
    try {
        const hostname = new URL(url).hostname;
        // True when the host is `domain` itself or any subdomain of it.
        const isOnDomain = (domain) => hostname === domain || hostname.endsWith(`.${domain}`);
        if (isOnDomain('wikipedia.org')) {
            return 'wikipedia';
        }
        if (hostname === 'news.ycombinator.com' && url.includes('item?id=')) {
            return 'hackernews';
        }
        // list of domains that don't work well with readability
        const unsupported = [
            'youtube.com', 'youtu.be', 'vimeo.com',
            'twitter.com', 'x.com', 'instagram.com',
            'facebook.com', 'linkedin.com'
        ];
        return unsupported.some(isOnDomain) ? 'unsupported' : 'general';
    }
    catch {
        // Anything that cannot be parsed as a URL cannot be fetched either.
        return 'unsupported';
    }
}
|
|
513
|
+
/**
 * Fetches a page and extracts its readable content.
 *
 * Wikipedia and Hacker News item URLs are served by their dedicated modules;
 * hosts classified as unsupported yield a placeholder result; everything else
 * is downloaded (plain fetch, or a stealth browser when `usePuppeteer` is set
 * or the plain fetch fails) and run through Readability.
 *
 * @param url     Page address.
 * @param options Either an options object (`usePuppeteer`, `proxy`) or, for
 *                backward compatibility, a boolean meaning `usePuppeteer`.
 * @returns `{ title, content, textContent, length, excerpt, siteName? }`.
 * @throws `{ message, code: 'WEBPAGE_ERROR', originalError }` on any failure.
 */
async function getWebpageContent(url, options) {
    // Backward compatibility: if options is boolean, treat as usePuppeteer
    if (typeof options === 'boolean') {
        options = { usePuppeteer: options };
    }
    else if (!options) {
        options = {};
    }
    try {
        const urlType = getUrlType(url);
        // handle special cases
        if (urlType === 'wikipedia') {
            // Derive the article title from the path segment after /wiki/:
            // drop any query/fragment, undo percent-encoding, then turn
            // underscores into spaces. Falls back to the URL when absent.
            let title = url;
            const slug = url.split('/wiki/')[1]?.split(/[?#]/)[0];
            if (slug) {
                let decoded = slug;
                try {
                    decoded = decodeURIComponent(slug);
                }
                catch {
                    // Malformed escape sequence: keep the raw slug.
                }
                title = decoded.replace(/_/g, ' ');
            }
            const content = await (0, wikipedia_1.wikiGetContent)(title);
            return {
                title,
                content,
                textContent: cleanText(content),
                length: content.length,
                excerpt: content.slice(0, 200) + '...',
                siteName: 'Wikipedia'
            };
        }
        if (urlType === 'hackernews') {
            const id = parseInt(url.split('id=')[1], 10);
            const story = await (0, hackernews_1.getStoryById)(id);
            const content = story.snippet || story.title || 'No content available';
            const cleanedContent = cleanText(content);
            return {
                title: story.title || url,
                content: content,
                textContent: cleanedContent,
                length: cleanedContent.length,
                excerpt: cleanedContent.slice(0, 200) + (cleanedContent.length > 200 ? '...' : ''),
                siteName: 'Hacker News'
            };
        }
        if (urlType === 'unsupported') {
            return {
                title: url,
                content: '',
                textContent: 'This URL type is not supported for content extraction.',
                length: 0,
                excerpt: 'Content not available - URL type not supported'
            };
        }
        // handle general case with readability
        let html;
        if (options.usePuppeteer) {
            // Use stealth puppeteer for bot-protected sites
            const browser = await createStealthBrowser(parseProxyConfig(options.proxy) || undefined);
            try {
                const page = await browser.newPage();
                await page.setViewport({ width: 1920, height: 1080 });
                await page.setExtraHTTPHeaders(createRealisticHeaders());
                await page.goto(url, { waitUntil: 'networkidle2' });
                html = await page.content();
            }
            finally {
                // Always release the browser, even when navigation fails.
                await browser.close();
            }
        }
        else {
            try {
                const headers = createRealisticHeaders();
                const fetchOptions = { headers };
                const proxy = parseProxyConfig(options.proxy);
                if (proxy) {
                    if (proxy.type === 'socks4' || proxy.type === 'socks5') {
                        const { SocksProxyAgent } = await import('socks-proxy-agent');
                        fetchOptions.agent = new SocksProxyAgent(proxy.url);
                    }
                    else {
                        const { HttpsProxyAgent } = await import('https-proxy-agent');
                        fetchOptions.agent = new HttpsProxyAgent(proxy.url);
                    }
                }
                const response = await fetch(url, fetchOptions);
                html = await response.text();
            }
            catch (error) {
                // If basic fetch fails, try with puppeteer
                console.warn('Basic fetch failed, trying with Puppeteer...', error);
                return await getWebpageContent(url, { ...options, usePuppeteer: true });
            }
        }
        const dom = new jsdom_1.JSDOM(html, { url });
        const reader = new readability_1.Readability(dom.window.document);
        const article = reader.parse();
        if (!article) {
            return {
                title: url,
                content: '',
                textContent: 'Failed to extract readable content from this page.',
                length: 0,
                excerpt: 'Content extraction failed'
            };
        }
        const cleanedText = cleanText(article.textContent || '');
        return {
            title: article.title || url,
            content: article.content || '',
            textContent: cleanedText,
            length: cleanedText.length,
            excerpt: article.excerpt || undefined,
            siteName: article.siteName || undefined
        };
    }
    catch (err) {
        throw {
            message: 'failed to get webpage content :/',
            code: 'WEBPAGE_ERROR',
            originalError: err
        };
    }
}
|
|
626
|
+
/**
 * Convenience wrapper around getWebpageContent that resolves to only the
 * extracted `textContent`.
 */
async function getWebpageText(url, options = {}) {
    const { textContent } = await getWebpageContent(url, options);
    return textContent;
}
|
|
631
|
+
/**
 * Probes a URL with a HEAD request.
 *
 * @returns true when the response status is OK; false for any other status
 *          or when the request itself fails.
 */
async function isUrlAccessible(url) {
    let reachable = false;
    try {
        const { ok } = await fetch(url, { method: 'HEAD' });
        reachable = ok;
    }
    catch {
        // Any failure (invalid URL, network error) counts as not accessible.
    }
    return reachable;
}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import { MediaResult, MediaSearchOptions } from "../../types";
|
|
2
|
+
/**
|
|
3
|
+
* AniDB Scraper
|
|
4
|
+
* AniDB has strict anti-bot protection ("AntiLeech").
|
|
5
|
+
* We must use Puppeteer with Stealth plugin and respect rate limits.
|
|
6
|
+
*/
|
|
7
|
+
/**
 * Searches AniDB for the given query.
 *
 * @param query - Search text.
 * @param options - Optional media search settings.
 * @returns A promise resolving to the matching entries as `MediaResult` objects.
 */
export declare function searchAniDB(query: string, options?: MediaSearchOptions): Promise<MediaResult[]>;
|
|
8
|
+
/**
 * Retrieves details for a single AniDB entry.
 *
 * @param url - Address of the entry; presumably an AniDB page URL (confirm against the implementation).
 * @param options - Optional media search settings.
 * @returns A promise resolving to a partial `MediaResult`; any field may be absent.
 */
export declare function getAniDBDetails(url: string, options?: MediaSearchOptions): Promise<Partial<MediaResult>>;
|