mcp-researchpowerpack-http 3.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +124 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +227 -0
- package/dist/index.js.map +7 -0
- package/dist/mcp-use.json +7 -0
- package/dist/src/clients/github.d.ts +83 -0
- package/dist/src/clients/github.d.ts.map +1 -0
- package/dist/src/clients/github.js +370 -0
- package/dist/src/clients/github.js.map +7 -0
- package/dist/src/clients/reddit.d.ts +60 -0
- package/dist/src/clients/reddit.d.ts.map +1 -0
- package/dist/src/clients/reddit.js +287 -0
- package/dist/src/clients/reddit.js.map +7 -0
- package/dist/src/clients/research.d.ts +67 -0
- package/dist/src/clients/research.d.ts.map +1 -0
- package/dist/src/clients/research.js +282 -0
- package/dist/src/clients/research.js.map +7 -0
- package/dist/src/clients/scraper.d.ts +72 -0
- package/dist/src/clients/scraper.d.ts.map +1 -0
- package/dist/src/clients/scraper.js +327 -0
- package/dist/src/clients/scraper.js.map +7 -0
- package/dist/src/clients/search.d.ts +57 -0
- package/dist/src/clients/search.d.ts.map +1 -0
- package/dist/src/clients/search.js +218 -0
- package/dist/src/clients/search.js.map +7 -0
- package/dist/src/config/index.d.ts +93 -0
- package/dist/src/config/index.d.ts.map +1 -0
- package/dist/src/config/index.js +218 -0
- package/dist/src/config/index.js.map +7 -0
- package/dist/src/schemas/deep-research.d.ts +40 -0
- package/dist/src/schemas/deep-research.d.ts.map +1 -0
- package/dist/src/schemas/deep-research.js +216 -0
- package/dist/src/schemas/deep-research.js.map +7 -0
- package/dist/src/schemas/github-score.d.ts +50 -0
- package/dist/src/schemas/github-score.d.ts.map +1 -0
- package/dist/src/schemas/github-score.js +58 -0
- package/dist/src/schemas/github-score.js.map +7 -0
- package/dist/src/schemas/scrape-links.d.ts +23 -0
- package/dist/src/schemas/scrape-links.d.ts.map +1 -0
- package/dist/src/schemas/scrape-links.js +32 -0
- package/dist/src/schemas/scrape-links.js.map +7 -0
- package/dist/src/schemas/web-search.d.ts +18 -0
- package/dist/src/schemas/web-search.d.ts.map +1 -0
- package/dist/src/schemas/web-search.js +28 -0
- package/dist/src/schemas/web-search.js.map +7 -0
- package/dist/src/scoring/github-quality.d.ts +142 -0
- package/dist/src/scoring/github-quality.d.ts.map +1 -0
- package/dist/src/scoring/github-quality.js +202 -0
- package/dist/src/scoring/github-quality.js.map +7 -0
- package/dist/src/services/file-attachment.d.ts +30 -0
- package/dist/src/services/file-attachment.d.ts.map +1 -0
- package/dist/src/services/file-attachment.js +205 -0
- package/dist/src/services/file-attachment.js.map +7 -0
- package/dist/src/services/llm-processor.d.ts +29 -0
- package/dist/src/services/llm-processor.d.ts.map +1 -0
- package/dist/src/services/llm-processor.js +206 -0
- package/dist/src/services/llm-processor.js.map +7 -0
- package/dist/src/services/markdown-cleaner.d.ts +8 -0
- package/dist/src/services/markdown-cleaner.d.ts.map +1 -0
- package/dist/src/services/markdown-cleaner.js +63 -0
- package/dist/src/services/markdown-cleaner.js.map +7 -0
- package/dist/src/tools/github-score.d.ts +12 -0
- package/dist/src/tools/github-score.d.ts.map +1 -0
- package/dist/src/tools/github-score.js +306 -0
- package/dist/src/tools/github-score.js.map +7 -0
- package/dist/src/tools/mcp-helpers.d.ts +27 -0
- package/dist/src/tools/mcp-helpers.d.ts.map +1 -0
- package/dist/src/tools/mcp-helpers.js +47 -0
- package/dist/src/tools/mcp-helpers.js.map +7 -0
- package/dist/src/tools/reddit.d.ts +54 -0
- package/dist/src/tools/reddit.d.ts.map +1 -0
- package/dist/src/tools/reddit.js +498 -0
- package/dist/src/tools/reddit.js.map +7 -0
- package/dist/src/tools/registry.d.ts +3 -0
- package/dist/src/tools/registry.d.ts.map +1 -0
- package/dist/src/tools/registry.js +17 -0
- package/dist/src/tools/registry.js.map +7 -0
- package/dist/src/tools/research.d.ts +14 -0
- package/dist/src/tools/research.d.ts.map +1 -0
- package/dist/src/tools/research.js +250 -0
- package/dist/src/tools/research.js.map +7 -0
- package/dist/src/tools/scrape.d.ts +14 -0
- package/dist/src/tools/scrape.d.ts.map +1 -0
- package/dist/src/tools/scrape.js +290 -0
- package/dist/src/tools/scrape.js.map +7 -0
- package/dist/src/tools/search.d.ts +10 -0
- package/dist/src/tools/search.d.ts.map +1 -0
- package/dist/src/tools/search.js +197 -0
- package/dist/src/tools/search.js.map +7 -0
- package/dist/src/tools/utils.d.ts +105 -0
- package/dist/src/tools/utils.d.ts.map +1 -0
- package/dist/src/tools/utils.js +96 -0
- package/dist/src/tools/utils.js.map +7 -0
- package/dist/src/utils/concurrency.d.ts +28 -0
- package/dist/src/utils/concurrency.d.ts.map +1 -0
- package/dist/src/utils/concurrency.js +62 -0
- package/dist/src/utils/concurrency.js.map +7 -0
- package/dist/src/utils/errors.d.ts +95 -0
- package/dist/src/utils/errors.d.ts.map +1 -0
- package/dist/src/utils/errors.js +289 -0
- package/dist/src/utils/errors.js.map +7 -0
- package/dist/src/utils/logger.d.ts +33 -0
- package/dist/src/utils/logger.d.ts.map +1 -0
- package/dist/src/utils/logger.js +41 -0
- package/dist/src/utils/logger.js.map +7 -0
- package/dist/src/utils/markdown-formatter.d.ts +5 -0
- package/dist/src/utils/markdown-formatter.d.ts.map +1 -0
- package/dist/src/utils/markdown-formatter.js +15 -0
- package/dist/src/utils/markdown-formatter.js.map +7 -0
- package/dist/src/utils/response.d.ts +83 -0
- package/dist/src/utils/response.d.ts.map +1 -0
- package/dist/src/utils/response.js +109 -0
- package/dist/src/utils/response.js.map +7 -0
- package/dist/src/utils/retry.d.ts +43 -0
- package/dist/src/utils/retry.d.ts.map +1 -0
- package/dist/src/utils/retry.js +37 -0
- package/dist/src/utils/retry.js.map +7 -0
- package/dist/src/utils/url-aggregator.d.ts +92 -0
- package/dist/src/utils/url-aggregator.d.ts.map +1 -0
- package/dist/src/utils/url-aggregator.js +357 -0
- package/dist/src/utils/url-aggregator.js.map +7 -0
- package/dist/src/version.d.ts +28 -0
- package/dist/src/version.d.ts.map +1 -0
- package/dist/src/version.js +32 -0
- package/dist/src/version.js.map +7 -0
- package/package.json +73 -0
|
@@ -0,0 +1,327 @@
|
|
|
1
|
+
import { parseEnv } from "../config/index.js";
|
|
2
|
+
import {
|
|
3
|
+
classifyError,
|
|
4
|
+
fetchWithTimeout,
|
|
5
|
+
sleep,
|
|
6
|
+
ErrorCode
|
|
7
|
+
} from "../utils/errors.js";
|
|
8
|
+
import { calculateBackoff } from "../utils/retry.js";
|
|
9
|
+
import { pMapSettled } from "../utils/concurrency.js";
|
|
10
|
+
import { mcpLog } from "../utils/logger.js";
|
|
11
|
+
const SCRAPE_MODES = ["basic", "javascript", "javascript_geo"];
|
|
12
|
+
const CREDIT_COSTS = { basic: 1, javascript: 5, javascript_geo: 5 };
|
|
13
|
+
const DEFAULT_SCRAPE_CONCURRENCY = 10;
|
|
14
|
+
const SCRAPE_BATCH_SIZE = 30;
|
|
15
|
+
const MAX_RETRIES = 1;
|
|
16
|
+
const FALLBACK_OVERALL_TIMEOUT_MS = 3e4;
|
|
17
|
+
const RETRYABLE_STATUS_CODES = /* @__PURE__ */ new Set([429, 502, 503, 504, 510]);
|
|
18
|
+
const PERMANENT_FAILURE_CODES = /* @__PURE__ */ new Set([400, 401, 403]);
|
|
19
|
+
const MIN_USEFUL_CONTENT_LENGTH = 200;
|
|
20
|
+
const FALLBACK_ATTEMPTS = [
|
|
21
|
+
{ mode: "basic", description: "basic mode" },
|
|
22
|
+
{ mode: "javascript", description: "javascript rendering" },
|
|
23
|
+
{ mode: "javascript", country: "us", description: "javascript + US geo-targeting" }
|
|
24
|
+
];
|
|
25
|
+
/**
 * Web Scraper Client — generic interface for URL scraping via the scrape.do API
 * with automatic fallback modes. Designed to NEVER crash: every public method
 * returns a response object (possibly carrying a structured `error`) instead
 * of throwing, except the constructor when no API key is configured.
 */
class ScraperClient {
  // scrape.do API token (from the constructor argument or SCRAPER_API_KEY env var).
  apiKey;
  baseURL = "https://api.scrape.do";
  /**
   * @param {string} [apiKey] - Optional API token; falls back to env.SCRAPER_API_KEY.
   * @throws {Error} when neither source provides a key.
   */
  constructor(apiKey) {
    const env = parseEnv();
    this.apiKey = apiKey || env.SCRAPER_API_KEY;
    if (!this.apiKey) {
      throw new Error("Web scraping capability is not configured. Please set up the required API credentials.");
    }
  }
  /**
   * Scrape a single URL with retry logic
   * NEVER throws - always returns a ScrapeResponse (possibly with error)
   *
   * @param {{url: string, mode?: 'basic'|'javascript', timeout?: number, country?: string}} request
   *   - `timeout` is in seconds (default 15) and is forwarded to scrape.do in ms.
   * @param {number} [maxRetries] - total attempts (default MAX_RETRIES = 1, i.e. no retry).
   * @returns {Promise<{content: string, statusCode: number, credits: number, headers?: object, error?: object}>}
   */
  async scrape(request, maxRetries = MAX_RETRIES) {
    const { url, mode = "basic", timeout = 15, country } = request;
    const credits = CREDIT_COSTS[mode] ?? 1;
    // Validate URL first — malformed input short-circuits with a 400-style response.
    try {
      new URL(url);
    } catch {
      return {
        content: `Invalid URL: ${url}`,
        statusCode: 400,
        credits: 0,
        error: { code: ErrorCode.INVALID_INPUT, message: `Invalid URL: ${url}`, retryable: false }
      };
    }
    const params = new URLSearchParams({
      url,
      token: this.apiKey,
      timeout: String(timeout * 1e3)
    });
    if (mode === "javascript") {
      params.append("render", "true");
    }
    if (country) {
      params.append("geoCode", country.toUpperCase());
    }
    const apiUrl = `${this.baseURL}?${params.toString()}`;
    let lastError;
    for (let attempt = 0; attempt < maxRetries; attempt++) {
      try {
        // HTTP timeout gets a 5s buffer over the scrape.do-side timeout.
        const timeoutMs = (timeout + 5) * 1e3;
        const response = await fetchWithTimeout(apiUrl, {
          method: "GET",
          headers: { Accept: "text/html,application/json" },
          timeoutMs
        });
        // Safely read response body — a body-read failure is folded into `content`.
        let content;
        try {
          content = await response.text();
        } catch (readError) {
          content = `Failed to read response: ${readError instanceof Error ? readError.message : String(readError)}`;
        }
        // SUCCESS: 2xx — credits were consumed.
        if (response.ok) {
          return {
            content,
            statusCode: response.status,
            credits,
            headers: Object.fromEntries(response.headers.entries())
          };
        }
        // 404 — target not found (permanent, but not an error for our purposes;
        // note credits are still reported as consumed, and no `error` field is set).
        if (response.status === 404) {
          return {
            content: "404 - Page not found",
            statusCode: 404,
            credits
          };
        }
        // Permanent failures (400/401/403) — don't retry; 401 means credits/subscription.
        if (PERMANENT_FAILURE_CODES.has(response.status)) {
          const errorMsg = response.status === 401 ? "No credits remaining or subscription suspended" : `Request failed with status ${response.status}`;
          return {
            content: `Error: ${errorMsg}`,
            statusCode: response.status,
            credits: 0,
            error: {
              code: response.status === 401 ? ErrorCode.AUTH_ERROR : ErrorCode.INVALID_INPUT,
              message: errorMsg,
              retryable: false,
              statusCode: response.status
            }
          };
        }
        // Retryable status codes (429/502/503/504/510) — back off and retry if attempts remain.
        // NOTE(review): on the final attempt this structured lastError is immediately
        // overwritten by classifyError() below — presumably equivalent; confirm in utils/errors.
        if (RETRYABLE_STATUS_CODES.has(response.status)) {
          lastError = {
            code: response.status === 429 ? ErrorCode.RATE_LIMITED : ErrorCode.SERVICE_UNAVAILABLE,
            message: `Server returned ${response.status}`,
            retryable: true,
            statusCode: response.status
          };
          if (attempt < maxRetries - 1) {
            const delayMs = calculateBackoff(attempt);
            mcpLog("warning", `${response.status} on attempt ${attempt + 1}/${maxRetries}. Retrying in ${delayMs}ms`, "scraper");
            await sleep(delayMs);
            continue;
          }
        }
        // Any other non-success status — classify and retry only if deemed retryable.
        lastError = classifyError({ status: response.status, message: content });
        if (attempt < maxRetries - 1 && lastError.retryable) {
          const delayMs = calculateBackoff(attempt);
          mcpLog("warning", `Status ${response.status}. Retrying in ${delayMs}ms`, "scraper");
          await sleep(delayMs);
          continue;
        }
        // Final attempt failed.
        return {
          content: `Error: ${lastError.message}`,
          statusCode: response.status,
          credits: 0,
          error: lastError
        };
      } catch (error) {
        // Network/timeout/unexpected errors land here.
        lastError = classifyError(error);
        // Non-retryable errors — return immediately.
        if (!lastError.retryable) {
          return {
            content: `Error: ${lastError.message}`,
            statusCode: lastError.statusCode || 500,
            credits: 0,
            error: lastError
          };
        }
        // Retryable error — continue if attempts remaining.
        if (attempt < maxRetries - 1) {
          const delayMs = calculateBackoff(attempt);
          mcpLog("warning", `${lastError.code}: ${lastError.message}. Retry ${attempt + 1}/${maxRetries} in ${delayMs}ms`, "scraper");
          await sleep(delayMs);
          continue;
        }
      }
    }
    // All retries exhausted.
    return {
      content: `Error: Failed after ${maxRetries} attempts. ${lastError?.message || "Unknown error"}`,
      statusCode: lastError?.statusCode || 500,
      credits: 0,
      error: lastError || { code: ErrorCode.UNKNOWN_ERROR, message: "All retries exhausted", retryable: false }
    };
  }
  /**
   * Scrape with automatic fallback through different modes
   * NEVER throws - always returns a ScrapeResponse
   *
   * Walks FALLBACK_ATTEMPTS (basic → JS → JS+geo) under a single 30s overall
   * deadline; the first terminal attempt's response is returned as-is.
   *
   * @param {string} url
   * @param {{timeout?: number}} [options]
   */
  async scrapeWithFallback(url, options = {}) {
    const attemptResults = [];
    let lastResult = null;
    const deadline = Date.now() + FALLBACK_OVERALL_TIMEOUT_MS;
    for (const attempt of FALLBACK_ATTEMPTS) {
      // Check overall deadline before starting the next fallback mode.
      if (Date.now() >= deadline) {
        mcpLog("warning", `Overall fallback timeout reached for ${url} after ${attemptResults.length} attempt(s)`, "scraper");
        break;
      }
      const result = await this.tryFallbackAttempt(url, attempt, options);
      if (result.done) {
        // NOTE(review): "done" also covers terminal failures (404, WAF block,
        // non-retryable error), so this "Success" log can fire for error responses.
        if (attemptResults.length > 0) {
          mcpLog("info", `Success with ${attempt.description} after ${attemptResults.length} fallback(s)`, "scraper");
        }
        return result.response;
      }
      lastResult = result.response;
      attemptResults.push(`${attempt.description}: ${result.response.error?.message || result.response.statusCode}`);
      mcpLog("warning", `Failed with ${attempt.description} (${result.response.statusCode}), trying next fallback...`, "scraper");
    }
    // All fallbacks exhausted or deadline reached.
    const errorMessage = `Failed after ${attemptResults.length} fallback attempt(s): ${attemptResults.join("; ")}`;
    return {
      content: `Error: ${errorMessage}`,
      statusCode: lastResult?.statusCode || 500,
      credits: 0,
      error: {
        code: ErrorCode.SERVICE_UNAVAILABLE,
        message: errorMessage,
        retryable: false
      }
    };
  }
  /**
   * Execute a single fallback attempt and determine whether to continue.
   * Returns { done: true } on success/terminal or { done: false } to try the next mode.
   */
  async tryFallbackAttempt(url, attempt, options) {
    const result = await this.scrape({
      url,
      mode: attempt.mode,
      timeout: options.timeout,
      country: attempt.country
    });
    // Success — but verify the content isn't an empty SPA shell before accepting it.
    if (result.statusCode >= 200 && result.statusCode < 300 && !result.error) {
      const strippedLength = result.content.replace(/<[^>]*>/g, "").trim().length;
      if (strippedLength < MIN_USEFUL_CONTENT_LENGTH && attempt.mode === "basic") {
        mcpLog("info", `Basic mode returned only ${strippedLength} chars of text for ${url} \u2014 trying JS rendering`, "scraper");
        return { done: false, response: result };
      }
      return { done: true, response: result };
    }
    // 404 is a valid (terminal) response, not an error.
    if (result.statusCode === 404) {
      return { done: true, response: result };
    }
    // 502 Bad Gateway — almost always a WAF/CDN block, not a transient issue.
    // Switching render mode won't bypass CDN protection, so fail fast.
    if (result.statusCode === 502) {
      mcpLog("warning", `502 Bad Gateway for ${url} \u2014 likely WAF/CDN block, skipping fallback modes`, "scraper");
      return { done: true, response: {
        ...result,
        error: {
          code: ErrorCode.SERVICE_UNAVAILABLE,
          message: "Bad gateway \u2014 site is blocking automated access",
          retryable: false
        }
      } };
    }
    // Non-retryable errors — don't try other modes.
    if (result.error && !result.error.retryable) {
      mcpLog("error", `Non-retryable error with ${attempt.description}: ${result.error.message}`, "scraper");
      return { done: true, response: result };
    }
    // Retryable failure — let the caller advance to the next fallback mode.
    return { done: false, response: result };
  }
  /**
   * Scrape multiple URLs with batching
   * NEVER throws - always returns results array
   *
   * Small lists (<= SCRAPE_BATCH_SIZE) go through processBatch directly;
   * larger lists are delegated to batchScrape.
   */
  async scrapeMultiple(urls, options = {}) {
    if (urls.length === 0) {
      return [];
    }
    if (urls.length <= SCRAPE_BATCH_SIZE) {
      return this.processBatch(urls, options);
    }
    const result = await this.batchScrape(urls, options);
    return result.results;
  }
  /**
   * Batch scrape with progress callback
   * NEVER throws - uses Promise.allSettled internally
   *
   * @param {string[]} urls
   * @param {{timeout?: number}} [options]
   * @param {(batchNum: number, totalBatches: number, processed: number) => void} [onBatchComplete]
   *   invoked after each batch; its exceptions are caught and logged.
   * @returns {Promise<{results: Array, batchesProcessed: number, totalAttempted: number, rateLimitHits: number}>}
   */
  async batchScrape(urls, options = {}, onBatchComplete) {
    const totalBatches = Math.ceil(urls.length / SCRAPE_BATCH_SIZE);
    const allResults = [];
    let rateLimitHits = 0;
    mcpLog("info", `Starting batch processing: ${urls.length} URLs in ${totalBatches} batch(es)`, "scraper");
    for (let batchNum = 0; batchNum < totalBatches; batchNum++) {
      const startIdx = batchNum * SCRAPE_BATCH_SIZE;
      const endIdx = Math.min(startIdx + SCRAPE_BATCH_SIZE, urls.length);
      const batchUrls = urls.slice(startIdx, endIdx);
      mcpLog("info", `Processing batch ${batchNum + 1}/${totalBatches} (${batchUrls.length} URLs)`, "scraper");
      const batchResults = await pMapSettled(
        batchUrls,
        (url) => this.scrapeWithFallback(url, options),
        DEFAULT_SCRAPE_CONCURRENCY
      );
      for (let i = 0; i < batchResults.length; i++) {
        const result = batchResults[i];
        if (!result) continue;
        const url = batchUrls[i] ?? "";
        if (result.status === "fulfilled") {
          const scrapeResult = result.value;
          allResults.push({ ...scrapeResult, url });
          // Track rate limits to adapt the inter-batch delay below.
          if (scrapeResult.error?.code === ErrorCode.RATE_LIMITED) {
            rateLimitHits++;
          }
        } else {
          // Shouldn't happen since scrapeWithFallback never throws,
          // but handle the rejection gracefully just in case.
          const errorMsg = result.reason instanceof Error ? result.reason.message : String(result.reason);
          mcpLog("error", `Unexpected rejection for ${url}: ${errorMsg}`, "scraper");
          allResults.push({
            url,
            content: `Error: Unexpected failure - ${errorMsg}`,
            statusCode: 500,
            credits: 0,
            error: classifyError(result.reason)
          });
        }
      }
      // Safe callback invocation — a throwing callback must not abort the batch loop.
      try {
        onBatchComplete?.(batchNum + 1, totalBatches, allResults.length);
      } catch (callbackError) {
        mcpLog("error", `onBatchComplete callback error: ${callbackError}`, "scraper");
      }
      mcpLog("info", `Completed batch ${batchNum + 1}/${totalBatches} (${allResults.length}/${urls.length} total)`, "scraper");
      // Adaptive delay between batches — back off harder under rate limiting.
      if (batchNum < totalBatches - 1) {
        const batchDelay = rateLimitHits > 0 ? 2e3 : 500;
        await sleep(batchDelay);
      }
    }
    return { results: allResults, batchesProcessed: totalBatches, totalAttempted: urls.length, rateLimitHits };
  }
  /**
   * Process a single batch of URLs
   * NEVER throws
   *
   * Maps each settled result back to its URL; rejections (which shouldn't occur,
   * since scrapeWithFallback never throws) become 500-style error responses.
   */
  async processBatch(urls, options) {
    const results = await pMapSettled(urls, (url) => this.scrapeWithFallback(url, options), DEFAULT_SCRAPE_CONCURRENCY);
    return results.map((result, index) => {
      const url = urls[index] || "";
      if (result.status === "fulfilled") {
        return { ...result.value, url };
      }
      return {
        url,
        content: `Error: ${result.reason instanceof Error ? result.reason.message : String(result.reason)}`,
        statusCode: 500,
        credits: 0,
        error: classifyError(result.reason)
      };
    });
  }
}
|
|
324
|
+
// Sole public export of this module.
export {
  ScraperClient
};
//# sourceMappingURL=scraper.js.map
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
{
|
|
2
|
+
"version": 3,
|
|
3
|
+
"sources": ["../../../src/clients/scraper.ts"],
|
|
4
|
+
"sourcesContent": ["/**\n * Web Scraper Client\n * Generic interface for URL scraping with automatic fallback modes\n * Implements robust error handling that NEVER crashes\n */\n\nimport { parseEnv, SCRAPER } from '../config/index.js';\nimport {\n classifyError,\n fetchWithTimeout,\n sleep,\n ErrorCode,\n type StructuredError,\n} from '../utils/errors.js';\nimport { calculateBackoff } from '../utils/retry.js';\nimport { pMapSettled } from '../utils/concurrency.js';\nimport { mcpLog } from '../utils/logger.js';\n\n// \u2500\u2500 Constants \u2500\u2500\n\nconst SCRAPE_MODES = ['basic', 'javascript', 'javascript_geo'] as const;\ntype ScrapeMode = typeof SCRAPE_MODES[number];\n\nconst CREDIT_COSTS: Record<string, number> = { basic: 1, javascript: 5, javascript_geo: 5 } as const;\nconst DEFAULT_SCRAPE_CONCURRENCY = 10 as const;\nconst SCRAPE_BATCH_SIZE = 30 as const;\nconst MAX_RETRIES = 1 as const;\n/** Overall timeout for all fallback attempts on a single URL */\nconst FALLBACK_OVERALL_TIMEOUT_MS = 30_000 as const;\n\n// \u2500\u2500 Interfaces \u2500\u2500\n\ninterface ScrapeRequest {\n readonly url: string;\n readonly mode?: 'basic' | 'javascript';\n readonly timeout?: number;\n readonly country?: string;\n}\n\ninterface ScrapeResponse {\n readonly content: string;\n readonly statusCode: number;\n readonly credits: number;\n readonly headers?: Record<string, string>;\n readonly error?: StructuredError;\n}\n\ninterface BatchScrapeResult {\n readonly results: ReadonlyArray<ScrapeResponse & { readonly url: string }>;\n readonly batchesProcessed: number;\n readonly totalAttempted: number;\n readonly rateLimitHits: number;\n}\n\n// Status codes that indicate we should retry (no credit consumed)\nconst RETRYABLE_STATUS_CODES = new Set([429, 502, 503, 504, 510]);\n// Status codes that are permanent failures (don't retry)\nconst PERMANENT_FAILURE_CODES = new Set([400, 401, 403]);\n\n/** Minimum stripped-text length to consider a scrape successful (filters out empty SPA 
shells) */\nconst MIN_USEFUL_CONTENT_LENGTH = 200 as const;\n\n/** Fallback attempt descriptor used by scrapeWithFallback */\ninterface FallbackAttempt {\n readonly mode: 'basic' | 'javascript';\n readonly country?: string;\n readonly description: string;\n}\n\nconst FALLBACK_ATTEMPTS: readonly FallbackAttempt[] = [\n { mode: 'basic', description: 'basic mode' },\n { mode: 'javascript', description: 'javascript rendering' },\n { mode: 'javascript', country: 'us', description: 'javascript + US geo-targeting' },\n] as const;\n\nexport class ScraperClient {\n private apiKey: string;\n private baseURL = 'https://api.scrape.do';\n\n constructor(apiKey?: string) {\n const env = parseEnv();\n this.apiKey = apiKey || env.SCRAPER_API_KEY;\n\n if (!this.apiKey) {\n throw new Error('Web scraping capability is not configured. Please set up the required API credentials.');\n }\n }\n\n /**\n * Scrape a single URL with retry logic\n * NEVER throws - always returns a ScrapeResponse (possibly with error)\n */\n async scrape(request: ScrapeRequest, maxRetries = MAX_RETRIES): Promise<ScrapeResponse> {\n const { url, mode = 'basic', timeout = 15, country } = request;\n const credits = CREDIT_COSTS[mode] ?? 
1;\n\n // Validate URL first\n try {\n new URL(url);\n } catch {\n return {\n content: `Invalid URL: ${url}`,\n statusCode: 400,\n credits: 0,\n error: { code: ErrorCode.INVALID_INPUT, message: `Invalid URL: ${url}`, retryable: false },\n };\n }\n\n const params = new URLSearchParams({\n url: url,\n token: this.apiKey,\n timeout: String(timeout * 1000),\n });\n\n if (mode === 'javascript') {\n params.append('render', 'true');\n }\n\n if (country) {\n params.append('geoCode', country.toUpperCase());\n }\n\n const apiUrl = `${this.baseURL}?${params.toString()}`;\n let lastError: StructuredError | undefined;\n\n for (let attempt = 0; attempt < maxRetries; attempt++) {\n try {\n // Use AbortController for timeout\n const timeoutMs = (timeout + 5) * 1000; // Add 5s buffer over scrape timeout\n const response = await fetchWithTimeout(apiUrl, {\n method: 'GET',\n headers: { Accept: 'text/html,application/json' },\n timeoutMs,\n });\n\n // Safely read response body\n let content: string;\n try {\n content = await response.text();\n } catch (readError) {\n content = `Failed to read response: ${readError instanceof Error ? readError.message : String(readError)}`;\n }\n\n // SUCCESS: 2xx - Successful API call\n if (response.ok) {\n return {\n content,\n statusCode: response.status,\n credits,\n headers: Object.fromEntries(response.headers.entries()),\n };\n }\n\n // 404 - Target not found (permanent, but not an error for our purposes)\n if (response.status === 404) {\n return {\n content: '404 - Page not found',\n statusCode: 404,\n credits,\n };\n }\n\n // Permanent failures - don't retry\n if (PERMANENT_FAILURE_CODES.has(response.status)) {\n const errorMsg = response.status === 401\n ? 'No credits remaining or subscription suspended'\n : `Request failed with status ${response.status}`;\n return {\n content: `Error: ${errorMsg}`,\n statusCode: response.status,\n credits: 0,\n error: {\n code: response.status === 401 ? 
ErrorCode.AUTH_ERROR : ErrorCode.INVALID_INPUT,\n message: errorMsg,\n retryable: false,\n statusCode: response.status,\n },\n };\n }\n\n // Retryable status codes\n if (RETRYABLE_STATUS_CODES.has(response.status)) {\n lastError = {\n code: response.status === 429 ? ErrorCode.RATE_LIMITED : ErrorCode.SERVICE_UNAVAILABLE,\n message: `Server returned ${response.status}`,\n retryable: true,\n statusCode: response.status,\n };\n\n if (attempt < maxRetries - 1) {\n const delayMs = calculateBackoff(attempt);\n mcpLog('warning', `${response.status} on attempt ${attempt + 1}/${maxRetries}. Retrying in ${delayMs}ms`, 'scraper');\n await sleep(delayMs);\n continue;\n }\n }\n\n // Other non-success status - treat as retryable\n lastError = classifyError({ status: response.status, message: content });\n if (attempt < maxRetries - 1 && lastError.retryable) {\n const delayMs = calculateBackoff(attempt);\n mcpLog('warning', `Status ${response.status}. Retrying in ${delayMs}ms`, 'scraper');\n await sleep(delayMs);\n continue;\n }\n\n // Final attempt failed\n return {\n content: `Error: ${lastError.message}`,\n statusCode: response.status,\n credits: 0,\n error: lastError,\n };\n\n } catch (error) {\n lastError = classifyError(error);\n\n // Non-retryable errors - return immediately\n if (!lastError.retryable) {\n return {\n content: `Error: ${lastError.message}`,\n statusCode: lastError.statusCode || 500,\n credits: 0,\n error: lastError,\n };\n }\n\n // Retryable error - continue if attempts remaining\n if (attempt < maxRetries - 1) {\n const delayMs = calculateBackoff(attempt);\n mcpLog('warning', `${lastError.code}: ${lastError.message}. Retry ${attempt + 1}/${maxRetries} in ${delayMs}ms`, 'scraper');\n await sleep(delayMs);\n continue;\n }\n }\n }\n\n // All retries exhausted\n return {\n content: `Error: Failed after ${maxRetries} attempts. 
${lastError?.message || 'Unknown error'}`,\n statusCode: lastError?.statusCode || 500,\n credits: 0,\n error: lastError || { code: ErrorCode.UNKNOWN_ERROR, message: 'All retries exhausted', retryable: false },\n };\n }\n\n /**\n * Scrape with automatic fallback through different modes\n * NEVER throws - always returns a ScrapeResponse\n */\n async scrapeWithFallback(url: string, options: { timeout?: number } = {}): Promise<ScrapeResponse> {\n const attemptResults: string[] = [];\n let lastResult: ScrapeResponse | null = null;\n const deadline = Date.now() + FALLBACK_OVERALL_TIMEOUT_MS;\n\n for (const attempt of FALLBACK_ATTEMPTS) {\n // Check overall deadline before starting next fallback\n if (Date.now() >= deadline) {\n mcpLog('warning', `Overall fallback timeout reached for ${url} after ${attemptResults.length} attempt(s)`, 'scraper');\n break;\n }\n\n const result = await this.tryFallbackAttempt(url, attempt, options);\n\n if (result.done) {\n if (attemptResults.length > 0) {\n mcpLog('info', `Success with ${attempt.description} after ${attemptResults.length} fallback(s)`, 'scraper');\n }\n return result.response;\n }\n\n lastResult = result.response;\n attemptResults.push(`${attempt.description}: ${result.response.error?.message || result.response.statusCode}`);\n mcpLog('warning', `Failed with ${attempt.description} (${result.response.statusCode}), trying next fallback...`, 'scraper');\n }\n\n // All fallbacks exhausted or deadline reached\n const errorMessage = `Failed after ${attemptResults.length} fallback attempt(s): ${attemptResults.join('; ')}`;\n return {\n content: `Error: ${errorMessage}`,\n statusCode: lastResult?.statusCode || 500,\n credits: 0,\n error: {\n code: ErrorCode.SERVICE_UNAVAILABLE,\n message: errorMessage,\n retryable: false,\n },\n };\n }\n\n /**\n * Execute a single fallback attempt and determine whether to continue.\n * Returns { done: true } on success/terminal or { done: false } to try the next mode.\n */\n private async 
tryFallbackAttempt(\n url: string,\n attempt: FallbackAttempt,\n options: { timeout?: number },\n ): Promise<{ done: boolean; response: ScrapeResponse }> {\n const result = await this.scrape({\n url,\n mode: attempt.mode,\n timeout: options.timeout,\n country: attempt.country,\n });\n\n // Success \u2014 but verify content isn't an empty SPA shell\n if (result.statusCode >= 200 && result.statusCode < 300 && !result.error) {\n const strippedLength = result.content.replace(/<[^>]*>/g, '').trim().length;\n if (strippedLength < MIN_USEFUL_CONTENT_LENGTH && attempt.mode === 'basic') {\n mcpLog('info', `Basic mode returned only ${strippedLength} chars of text for ${url} \u2014 trying JS rendering`, 'scraper');\n return { done: false, response: result };\n }\n return { done: true, response: result };\n }\n\n // 404 is a valid response, not an error\n if (result.statusCode === 404) {\n return { done: true, response: result };\n }\n\n // 502 Bad Gateway \u2014 almost always a WAF/CDN block, not a transient issue.\n // Switching render mode won't bypass CDN protection, so fail fast.\n if (result.statusCode === 502) {\n mcpLog('warning', `502 Bad Gateway for ${url} \u2014 likely WAF/CDN block, skipping fallback modes`, 'scraper');\n return { done: true, response: {\n ...result,\n error: {\n code: ErrorCode.SERVICE_UNAVAILABLE,\n message: 'Bad gateway \u2014 site is blocking automated access',\n retryable: false,\n },\n }};\n }\n\n // Non-retryable errors - don't try other modes\n if (result.error && !result.error.retryable) {\n mcpLog('error', `Non-retryable error with ${attempt.description}: ${result.error.message}`, 'scraper');\n return { done: true, response: result };\n }\n\n return { done: false, response: result };\n }\n\n /**\n * Scrape multiple URLs with batching\n * NEVER throws - always returns results array\n */\n async scrapeMultiple(urls: string[], options: { timeout?: number } = {}): Promise<Array<ScrapeResponse & { url: string }>> {\n if (urls.length === 0) {\n 
return [];\n }\n\n if (urls.length <= SCRAPE_BATCH_SIZE) {\n return this.processBatch(urls, options);\n }\n\n const result = await this.batchScrape(urls, options);\n return result.results as Array<ScrapeResponse & { url: string }>;\n }\n\n /**\n * Batch scrape with progress callback\n * NEVER throws - uses Promise.allSettled internally\n */\n async batchScrape(\n urls: string[],\n options: { timeout?: number } = {},\n onBatchComplete?: (batchNum: number, totalBatches: number, processed: number) => void\n ): Promise<BatchScrapeResult> {\n const totalBatches = Math.ceil(urls.length / SCRAPE_BATCH_SIZE);\n const allResults: Array<ScrapeResponse & { url: string }> = [];\n let rateLimitHits = 0;\n\n mcpLog('info', `Starting batch processing: ${urls.length} URLs in ${totalBatches} batch(es)`, 'scraper');\n\n for (let batchNum = 0; batchNum < totalBatches; batchNum++) {\n const startIdx = batchNum * SCRAPE_BATCH_SIZE;\n const endIdx = Math.min(startIdx + SCRAPE_BATCH_SIZE, urls.length);\n const batchUrls = urls.slice(startIdx, endIdx);\n\n mcpLog('info', `Processing batch ${batchNum + 1}/${totalBatches} (${batchUrls.length} URLs)`, 'scraper');\n\n const batchResults = await pMapSettled(\n batchUrls,\n url => this.scrapeWithFallback(url, options),\n DEFAULT_SCRAPE_CONCURRENCY\n );\n\n for (let i = 0; i < batchResults.length; i++) {\n const result = batchResults[i];\n if (!result) continue;\n const url = batchUrls[i] ?? '';\n\n if (result.status === 'fulfilled') {\n const scrapeResult = result.value;\n allResults.push({ ...scrapeResult, url });\n\n // Track rate limits\n if (scrapeResult.error?.code === ErrorCode.RATE_LIMITED) {\n rateLimitHits++;\n }\n } else {\n // This shouldn't happen since scrapeWithFallback never throws,\n // but handle it gracefully just in case\n const errorMsg = result.reason instanceof Error ? 
result.reason.message : String(result.reason);\n mcpLog('error', `Unexpected rejection for ${url}: ${errorMsg}`, 'scraper');\n\n allResults.push({\n url,\n content: `Error: Unexpected failure - ${errorMsg}`,\n statusCode: 500,\n credits: 0,\n error: classifyError(result.reason),\n });\n }\n }\n\n // Safe callback invocation\n try {\n onBatchComplete?.(batchNum + 1, totalBatches, allResults.length);\n } catch (callbackError) {\n mcpLog('error', `onBatchComplete callback error: ${callbackError}`, 'scraper');\n }\n\n mcpLog('info', `Completed batch ${batchNum + 1}/${totalBatches} (${allResults.length}/${urls.length} total)`, 'scraper');\n\n // Adaptive delay between batches \u2014 back off harder under rate limiting\n if (batchNum < totalBatches - 1) {\n const batchDelay = rateLimitHits > 0 ? 2000 : 500;\n await sleep(batchDelay);\n }\n }\n\n return { results: allResults, batchesProcessed: totalBatches, totalAttempted: urls.length, rateLimitHits };\n }\n\n /**\n * Process a single batch of URLs\n * NEVER throws\n */\n private async processBatch(urls: string[], options: { timeout?: number }): Promise<Array<ScrapeResponse & { url: string }>> {\n const results = await pMapSettled(urls, url => this.scrapeWithFallback(url, options), DEFAULT_SCRAPE_CONCURRENCY);\n\n return results.map((result, index) => {\n const url = urls[index] || '';\n\n if (result.status === 'fulfilled') {\n return { ...result.value, url };\n }\n\n // Shouldn't happen, but handle gracefully\n return {\n url,\n content: `Error: ${result.reason instanceof Error ? result.reason.message : String(result.reason)}`,\n statusCode: 500,\n credits: 0,\n error: classifyError(result.reason),\n };\n });\n }\n}\n"],
|
|
5
|
+
"mappings": "AAMA,SAAS,gBAAyB;AAClC;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,OAEK;AACP,SAAS,wBAAwB;AACjC,SAAS,mBAAmB;AAC5B,SAAS,cAAc;AAIvB,MAAM,eAAe,CAAC,SAAS,cAAc,gBAAgB;AAG7D,MAAM,eAAuC,EAAE,OAAO,GAAG,YAAY,GAAG,gBAAgB,EAAE;AAC1F,MAAM,6BAA6B;AACnC,MAAM,oBAAoB;AAC1B,MAAM,cAAc;AAEpB,MAAM,8BAA8B;AA2BpC,MAAM,yBAAyB,oBAAI,IAAI,CAAC,KAAK,KAAK,KAAK,KAAK,GAAG,CAAC;AAEhE,MAAM,0BAA0B,oBAAI,IAAI,CAAC,KAAK,KAAK,GAAG,CAAC;AAGvD,MAAM,4BAA4B;AASlC,MAAM,oBAAgD;AAAA,EACpD,EAAE,MAAM,SAAS,aAAa,aAAa;AAAA,EAC3C,EAAE,MAAM,cAAc,aAAa,uBAAuB;AAAA,EAC1D,EAAE,MAAM,cAAc,SAAS,MAAM,aAAa,gCAAgC;AACpF;AAEO,MAAM,cAAc;AAAA,EACjB;AAAA,EACA,UAAU;AAAA,EAElB,YAAY,QAAiB;AAC3B,UAAM,MAAM,SAAS;AACrB,SAAK,SAAS,UAAU,IAAI;AAE5B,QAAI,CAAC,KAAK,QAAQ;AAChB,YAAM,IAAI,MAAM,wFAAwF;AAAA,IAC1G;AAAA,EACF;AAAA;AAAA;AAAA;AAAA;AAAA,EAMA,MAAM,OAAO,SAAwB,aAAa,aAAsC;AACtF,UAAM,EAAE,KAAK,OAAO,SAAS,UAAU,IAAI,QAAQ,IAAI;AACvD,UAAM,UAAU,aAAa,IAAI,KAAK;AAGtC,QAAI;AACF,UAAI,IAAI,GAAG;AAAA,IACb,QAAQ;AACN,aAAO;AAAA,QACL,SAAS,gBAAgB,GAAG;AAAA,QAC5B,YAAY;AAAA,QACZ,SAAS;AAAA,QACT,OAAO,EAAE,MAAM,UAAU,eAAe,SAAS,gBAAgB,GAAG,IAAI,WAAW,MAAM;AAAA,MAC3F;AAAA,IACF;AAEA,UAAM,SAAS,IAAI,gBAAgB;AAAA,MACjC;AAAA,MACA,OAAO,KAAK;AAAA,MACZ,SAAS,OAAO,UAAU,GAAI;AAAA,IAChC,CAAC;AAED,QAAI,SAAS,cAAc;AACzB,aAAO,OAAO,UAAU,MAAM;AAAA,IAChC;AAEA,QAAI,SAAS;AACX,aAAO,OAAO,WAAW,QAAQ,YAAY,CAAC;AAAA,IAChD;AAEA,UAAM,SAAS,GAAG,KAAK,OAAO,IAAI,OAAO,SAAS,CAAC;AACnD,QAAI;AAEJ,aAAS,UAAU,GAAG,UAAU,YAAY,WAAW;AACrD,UAAI;AAEF,cAAM,aAAa,UAAU,KAAK;AAClC,cAAM,WAAW,MAAM,iBAAiB,QAAQ;AAAA,UAC9C,QAAQ;AAAA,UACR,SAAS,EAAE,QAAQ,6BAA6B;AAAA,UAChD;AAAA,QACF,CAAC;AAGD,YAAI;AACJ,YAAI;AACF,oBAAU,MAAM,SAAS,KAAK;AAAA,QAChC,SAAS,WAAW;AAClB,oBAAU,4BAA4B,qBAAqB,QAAQ,UAAU,UAAU,OAAO,SAAS,CAAC;AAAA,QAC1G;AAGA,YAAI,SAAS,IAAI;AACf,iBAAO;AAAA,YACL;AAAA,YACA,YAAY,SAAS;AAAA,YACrB;AAAA,YACA,SAAS,OAAO,YAAY,SAAS,QAAQ,QAAQ,CAAC;AAAA,UACxD;AAAA,QACF;AAGA,YAAI,SAAS,WAAW,KAAK;AAC3B,iBAAO;AAAA,YACL,SAAS;AAAA,YACT,YAAY;AAAA,YACZ;AAAA,UACF;AAAA,QACF;AAGA,YAAI,wBAAwB,IAAI,SAAS,MAAM,GAAG;AAChD,gBAAM,WAAW,SAAS,WAA
W,MACjC,mDACA,8BAA8B,SAAS,MAAM;AACjD,iBAAO;AAAA,YACL,SAAS,UAAU,QAAQ;AAAA,YAC3B,YAAY,SAAS;AAAA,YACrB,SAAS;AAAA,YACT,OAAO;AAAA,cACL,MAAM,SAAS,WAAW,MAAM,UAAU,aAAa,UAAU;AAAA,cACjE,SAAS;AAAA,cACT,WAAW;AAAA,cACX,YAAY,SAAS;AAAA,YACvB;AAAA,UACF;AAAA,QACF;AAGA,YAAI,uBAAuB,IAAI,SAAS,MAAM,GAAG;AAC/C,sBAAY;AAAA,YACV,MAAM,SAAS,WAAW,MAAM,UAAU,eAAe,UAAU;AAAA,YACnE,SAAS,mBAAmB,SAAS,MAAM;AAAA,YAC3C,WAAW;AAAA,YACX,YAAY,SAAS;AAAA,UACvB;AAEA,cAAI,UAAU,aAAa,GAAG;AAC5B,kBAAM,UAAU,iBAAiB,OAAO;AACxC,mBAAO,WAAW,GAAG,SAAS,MAAM,eAAe,UAAU,CAAC,IAAI,UAAU,iBAAiB,OAAO,MAAM,SAAS;AACnH,kBAAM,MAAM,OAAO;AACnB;AAAA,UACF;AAAA,QACF;AAGA,oBAAY,cAAc,EAAE,QAAQ,SAAS,QAAQ,SAAS,QAAQ,CAAC;AACvE,YAAI,UAAU,aAAa,KAAK,UAAU,WAAW;AACnD,gBAAM,UAAU,iBAAiB,OAAO;AACxC,iBAAO,WAAW,UAAU,SAAS,MAAM,iBAAiB,OAAO,MAAM,SAAS;AAClF,gBAAM,MAAM,OAAO;AACnB;AAAA,QACF;AAGA,eAAO;AAAA,UACL,SAAS,UAAU,UAAU,OAAO;AAAA,UACpC,YAAY,SAAS;AAAA,UACrB,SAAS;AAAA,UACT,OAAO;AAAA,QACT;AAAA,MAEF,SAAS,OAAO;AACd,oBAAY,cAAc,KAAK;AAG/B,YAAI,CAAC,UAAU,WAAW;AACxB,iBAAO;AAAA,YACL,SAAS,UAAU,UAAU,OAAO;AAAA,YACpC,YAAY,UAAU,cAAc;AAAA,YACpC,SAAS;AAAA,YACT,OAAO;AAAA,UACT;AAAA,QACF;AAGA,YAAI,UAAU,aAAa,GAAG;AAC5B,gBAAM,UAAU,iBAAiB,OAAO;AACxC,iBAAO,WAAW,GAAG,UAAU,IAAI,KAAK,UAAU,OAAO,WAAW,UAAU,CAAC,IAAI,UAAU,OAAO,OAAO,MAAM,SAAS;AAC1H,gBAAM,MAAM,OAAO;AACnB;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAGA,WAAO;AAAA,MACL,SAAS,uBAAuB,UAAU,cAAc,WAAW,WAAW,eAAe;AAAA,MAC7F,YAAY,WAAW,cAAc;AAAA,MACrC,SAAS;AAAA,MACT,OAAO,aAAa,EAAE,MAAM,UAAU,eAAe,SAAS,yBAAyB,WAAW,MAAM;AAAA,IAC1G;AAAA,EACF;AAAA;AAAA;AAAA;AAAA;AAAA,EAMA,MAAM,mBAAmB,KAAa,UAAgC,CAAC,GAA4B;AACjG,UAAM,iBAA2B,CAAC;AAClC,QAAI,aAAoC;AACxC,UAAM,WAAW,KAAK,IAAI,IAAI;AAE9B,eAAW,WAAW,mBAAmB;AAEvC,UAAI,KAAK,IAAI,KAAK,UAAU;AAC1B,eAAO,WAAW,wCAAwC,GAAG,UAAU,eAAe,MAAM,eAAe,SAAS;AACpH;AAAA,MACF;AAEA,YAAM,SAAS,MAAM,KAAK,mBAAmB,KAAK,SAAS,OAAO;AAElE,UAAI,OAAO,MAAM;AACf,YAAI,eAAe,SAAS,GAAG;AAC7B,iBAAO,QAAQ,gBAAgB,QAAQ,WAAW,UAAU,eAAe,MAAM,gBAAgB,SAAS;AAAA,QAC5G;AACA,eAAO,OAAO;AAAA,MAChB;AAEA,mBAAa,OAAO;AACpB,qBAAe,KAAK,GAAG,QAAQ,WAAW,KAAK,OAAO,SAAS,OAAO
,WAAW,OAAO,SAAS,UAAU,EAAE;AAC7G,aAAO,WAAW,eAAe,QAAQ,WAAW,KAAK,OAAO,SAAS,UAAU,8BAA8B,SAAS;AAAA,IAC5H;AAGA,UAAM,eAAe,gBAAgB,eAAe,MAAM,yBAAyB,eAAe,KAAK,IAAI,CAAC;AAC5G,WAAO;AAAA,MACL,SAAS,UAAU,YAAY;AAAA,MAC/B,YAAY,YAAY,cAAc;AAAA,MACtC,SAAS;AAAA,MACT,OAAO;AAAA,QACL,MAAM,UAAU;AAAA,QAChB,SAAS;AAAA,QACT,WAAW;AAAA,MACb;AAAA,IACF;AAAA,EACF;AAAA;AAAA;AAAA;AAAA;AAAA,EAMA,MAAc,mBACZ,KACA,SACA,SACsD;AACtD,UAAM,SAAS,MAAM,KAAK,OAAO;AAAA,MAC/B;AAAA,MACA,MAAM,QAAQ;AAAA,MACd,SAAS,QAAQ;AAAA,MACjB,SAAS,QAAQ;AAAA,IACnB,CAAC;AAGD,QAAI,OAAO,cAAc,OAAO,OAAO,aAAa,OAAO,CAAC,OAAO,OAAO;AACxE,YAAM,iBAAiB,OAAO,QAAQ,QAAQ,YAAY,EAAE,EAAE,KAAK,EAAE;AACrE,UAAI,iBAAiB,6BAA6B,QAAQ,SAAS,SAAS;AAC1E,eAAO,QAAQ,4BAA4B,cAAc,sBAAsB,GAAG,+BAA0B,SAAS;AACrH,eAAO,EAAE,MAAM,OAAO,UAAU,OAAO;AAAA,MACzC;AACA,aAAO,EAAE,MAAM,MAAM,UAAU,OAAO;AAAA,IACxC;AAGA,QAAI,OAAO,eAAe,KAAK;AAC7B,aAAO,EAAE,MAAM,MAAM,UAAU,OAAO;AAAA,IACxC;AAIA,QAAI,OAAO,eAAe,KAAK;AAC7B,aAAO,WAAW,uBAAuB,GAAG,yDAAoD,SAAS;AACzG,aAAO,EAAE,MAAM,MAAM,UAAU;AAAA,QAC7B,GAAG;AAAA,QACH,OAAO;AAAA,UACL,MAAM,UAAU;AAAA,UAChB,SAAS;AAAA,UACT,WAAW;AAAA,QACb;AAAA,MACF,EAAC;AAAA,IACH;AAGA,QAAI,OAAO,SAAS,CAAC,OAAO,MAAM,WAAW;AAC3C,aAAO,SAAS,4BAA4B,QAAQ,WAAW,KAAK,OAAO,MAAM,OAAO,IAAI,SAAS;AACrG,aAAO,EAAE,MAAM,MAAM,UAAU,OAAO;AAAA,IACxC;AAEA,WAAO,EAAE,MAAM,OAAO,UAAU,OAAO;AAAA,EACzC;AAAA;AAAA;AAAA;AAAA;AAAA,EAMA,MAAM,eAAe,MAAgB,UAAgC,CAAC,GAAqD;AACzH,QAAI,KAAK,WAAW,GAAG;AACrB,aAAO,CAAC;AAAA,IACV;AAEA,QAAI,KAAK,UAAU,mBAAmB;AACpC,aAAO,KAAK,aAAa,MAAM,OAAO;AAAA,IACxC;AAEA,UAAM,SAAS,MAAM,KAAK,YAAY,MAAM,OAAO;AACnD,WAAO,OAAO;AAAA,EAChB;AAAA;AAAA;AAAA;AAAA;AAAA,EAMA,MAAM,YACJ,MACA,UAAgC,CAAC,GACjC,iBAC4B;AAC5B,UAAM,eAAe,KAAK,KAAK,KAAK,SAAS,iBAAiB;AAC9D,UAAM,aAAsD,CAAC;AAC7D,QAAI,gBAAgB;AAEpB,WAAO,QAAQ,8BAA8B,KAAK,MAAM,YAAY,YAAY,cAAc,SAAS;AAEvG,aAAS,WAAW,GAAG,WAAW,cAAc,YAAY;AAC1D,YAAM,WAAW,WAAW;AAC5B,YAAM,SAAS,KAAK,IAAI,WAAW,mBAAmB,KAAK,MAAM;AACjE,YAAM,YAAY,KAAK,MAAM,UAAU,MAAM;AAE7C,aAAO,QAAQ,oBAAoB,WAAW,CAAC,IAAI,YAAY,KAAK,UAAU,MAAM,UAAU,SAAS;AAEvG,YAAM,eAAe,MAAM;AAAA,QACzB;AAAA,QACA,SAAO,
KAAK,mBAAmB,KAAK,OAAO;AAAA,QAC3C;AAAA,MACF;AAEA,eAAS,IAAI,GAAG,IAAI,aAAa,QAAQ,KAAK;AAC5C,cAAM,SAAS,aAAa,CAAC;AAC7B,YAAI,CAAC,OAAQ;AACb,cAAM,MAAM,UAAU,CAAC,KAAK;AAE5B,YAAI,OAAO,WAAW,aAAa;AACjC,gBAAM,eAAe,OAAO;AAC5B,qBAAW,KAAK,EAAE,GAAG,cAAc,IAAI,CAAC;AAGxC,cAAI,aAAa,OAAO,SAAS,UAAU,cAAc;AACvD;AAAA,UACF;AAAA,QACF,OAAO;AAGL,gBAAM,WAAW,OAAO,kBAAkB,QAAQ,OAAO,OAAO,UAAU,OAAO,OAAO,MAAM;AAC9F,iBAAO,SAAS,4BAA4B,GAAG,KAAK,QAAQ,IAAI,SAAS;AAEzE,qBAAW,KAAK;AAAA,YACd;AAAA,YACA,SAAS,+BAA+B,QAAQ;AAAA,YAChD,YAAY;AAAA,YACZ,SAAS;AAAA,YACT,OAAO,cAAc,OAAO,MAAM;AAAA,UACpC,CAAC;AAAA,QACH;AAAA,MACF;AAGA,UAAI;AACF,0BAAkB,WAAW,GAAG,cAAc,WAAW,MAAM;AAAA,MACjE,SAAS,eAAe;AACtB,eAAO,SAAS,mCAAmC,aAAa,IAAI,SAAS;AAAA,MAC/E;AAEA,aAAO,QAAQ,mBAAmB,WAAW,CAAC,IAAI,YAAY,KAAK,WAAW,MAAM,IAAI,KAAK,MAAM,WAAW,SAAS;AAGvH,UAAI,WAAW,eAAe,GAAG;AAC/B,cAAM,aAAa,gBAAgB,IAAI,MAAO;AAC9C,cAAM,MAAM,UAAU;AAAA,MACxB;AAAA,IACF;AAEA,WAAO,EAAE,SAAS,YAAY,kBAAkB,cAAc,gBAAgB,KAAK,QAAQ,cAAc;AAAA,EAC3G;AAAA;AAAA;AAAA;AAAA;AAAA,EAMA,MAAc,aAAa,MAAgB,SAAiF;AAC1H,UAAM,UAAU,MAAM,YAAY,MAAM,SAAO,KAAK,mBAAmB,KAAK,OAAO,GAAG,0BAA0B;AAEhH,WAAO,QAAQ,IAAI,CAAC,QAAQ,UAAU;AACpC,YAAM,MAAM,KAAK,KAAK,KAAK;AAE3B,UAAI,OAAO,WAAW,aAAa;AACjC,eAAO,EAAE,GAAG,OAAO,OAAO,IAAI;AAAA,MAChC;AAGA,aAAO;AAAA,QACL;AAAA,QACA,SAAS,UAAU,OAAO,kBAAkB,QAAQ,OAAO,OAAO,UAAU,OAAO,OAAO,MAAM,CAAC;AAAA,QACjG,YAAY;AAAA,QACZ,SAAS;AAAA,QACT,OAAO,cAAc,OAAO,MAAM;AAAA,MACpC;AAAA,IACF,CAAC;AAAA,EACH;AACF;",
|
|
6
|
+
"names": []
|
|
7
|
+
}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Web Search Client
|
|
3
|
+
* Generic interface for web search via Google (Serper implementation)
|
|
4
|
+
* Implements robust error handling that NEVER crashes
|
|
5
|
+
*/
|
|
6
|
+
import { type StructuredError } from '../utils/errors.js';
|
|
7
|
+
interface SearchResult {
|
|
8
|
+
readonly title: string;
|
|
9
|
+
readonly link: string;
|
|
10
|
+
readonly snippet: string;
|
|
11
|
+
readonly date?: string;
|
|
12
|
+
readonly position: number;
|
|
13
|
+
}
|
|
14
|
+
export interface KeywordSearchResult {
|
|
15
|
+
readonly keyword: string;
|
|
16
|
+
readonly results: SearchResult[];
|
|
17
|
+
readonly totalResults: number;
|
|
18
|
+
readonly related: string[];
|
|
19
|
+
readonly error?: StructuredError;
|
|
20
|
+
}
|
|
21
|
+
interface MultipleSearchResponse {
|
|
22
|
+
readonly searches: KeywordSearchResult[];
|
|
23
|
+
readonly totalKeywords: number;
|
|
24
|
+
readonly executionTime: number;
|
|
25
|
+
readonly error?: StructuredError;
|
|
26
|
+
}
|
|
27
|
+
export interface RedditSearchResult {
|
|
28
|
+
readonly title: string;
|
|
29
|
+
readonly url: string;
|
|
30
|
+
readonly snippet: string;
|
|
31
|
+
readonly date?: string;
|
|
32
|
+
}
|
|
33
|
+
export declare class SearchClient {
|
|
34
|
+
private apiKey;
|
|
35
|
+
constructor(apiKey?: string);
|
|
36
|
+
/**
|
|
37
|
+
* Check if error is retryable
|
|
38
|
+
*/
|
|
39
|
+
private isRetryable;
|
|
40
|
+
/**
|
|
41
|
+
* Search multiple keywords in parallel
|
|
42
|
+
* NEVER throws - always returns a valid response
|
|
43
|
+
*/
|
|
44
|
+
searchMultiple(keywords: string[]): Promise<MultipleSearchResponse>;
|
|
45
|
+
/**
|
|
46
|
+
* Search Reddit via Google (adds site:reddit.com automatically)
|
|
47
|
+
* NEVER throws - returns empty array on failure
|
|
48
|
+
*/
|
|
49
|
+
searchReddit(query: string, dateAfter?: string): Promise<RedditSearchResult[]>;
|
|
50
|
+
/**
|
|
51
|
+
* Search Reddit with multiple queries (bounded concurrency)
|
|
52
|
+
* NEVER throws - searchReddit never throws, pMap preserves order
|
|
53
|
+
*/
|
|
54
|
+
searchRedditMultiple(queries: string[], dateAfter?: string): Promise<Map<string, RedditSearchResult[]>>;
|
|
55
|
+
}
|
|
56
|
+
export {};
|
|
57
|
+
//# sourceMappingURL=search.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"search.d.ts","sourceRoot":"","sources":["../../../src/clients/search.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAGH,OAAO,EAKL,KAAK,eAAe,EACrB,MAAM,oBAAoB,CAAC;AAc5B,UAAU,YAAY;IACpB,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,IAAI,CAAC,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAC;CAC3B;AAED,MAAM,WAAW,mBAAmB;IAClC,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,OAAO,EAAE,YAAY,EAAE,CAAC;IACjC,QAAQ,CAAC,YAAY,EAAE,MAAM,CAAC;IAC9B,QAAQ,CAAC,OAAO,EAAE,MAAM,EAAE,CAAC;IAC3B,QAAQ,CAAC,KAAK,CAAC,EAAE,eAAe,CAAC;CAClC;AAED,UAAU,sBAAsB;IAC9B,QAAQ,CAAC,QAAQ,EAAE,mBAAmB,EAAE,CAAC;IACzC,QAAQ,CAAC,aAAa,EAAE,MAAM,CAAC;IAC/B,QAAQ,CAAC,aAAa,EAAE,MAAM,CAAC;IAC/B,QAAQ,CAAC,KAAK,CAAC,EAAE,eAAe,CAAC;CAClC;AAED,MAAM,WAAW,kBAAkB;IACjC,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,IAAI,CAAC,EAAE,MAAM,CAAC;CACxB;AAwHD,qBAAa,YAAY;IACvB,OAAO,CAAC,MAAM,CAAS;gBAEX,MAAM,CAAC,EAAE,MAAM;IAS3B;;OAEG;IACH,OAAO,CAAC,WAAW;IAUnB;;;OAGG;IACG,cAAc,CAAC,QAAQ,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,sBAAsB,CAAC;IAkCzE;;;OAGG;IACG,YAAY,CAAC,KAAK,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,kBAAkB,EAAE,CAAC;IAuDpF;;;OAGG;IACG,oBAAoB,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,MAAM,EAAE,kBAAkB,EAAE,CAAC,CAAC;CAa9G"}
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
import { parseEnv } from "../config/index.js";
|
|
2
|
+
import {
|
|
3
|
+
classifyError,
|
|
4
|
+
fetchWithTimeout,
|
|
5
|
+
sleep,
|
|
6
|
+
ErrorCode
|
|
7
|
+
} from "../utils/errors.js";
|
|
8
|
+
import { calculateBackoff } from "../utils/retry.js";
|
|
9
|
+
import { pMap } from "../utils/concurrency.js";
|
|
10
|
+
import { mcpLog } from "../utils/logger.js";
|
|
11
|
+
// Serper (Google Search API) endpoint.
const SERPER_API_URL = "https://google.serper.dev/search";

// Operational limits.
const DEFAULT_RESULTS_PER_KEYWORD = 10;
const MAX_SEARCH_CONCURRENCY = 8;
const MAX_RETRIES = 3;

// Retry/backoff policy shared by every search call in this module.
const SEARCH_RETRY_CONFIG = {
  maxRetries: MAX_RETRIES,
  baseDelayMs: 1000,
  maxDelayMs: 10000,
  timeoutMs: 30000
};

// HTTP statuses worth retrying: throttling plus transient upstream failures.
const RETRYABLE_SEARCH_CODES = /* @__PURE__ */ new Set([429, 500, 502, 503, 504]);

// Reddit-specific query/title cleanup patterns:
// strip a user-supplied site: filter, and the " : r/<sub>" / " - Reddit"
// suffixes Google appends to Reddit result titles.
const REDDIT_SITE_REGEX = /site:\s*reddit\.com/i;
const REDDIT_SUBREDDIT_SUFFIX_REGEX = / : r\/\w+$/;
const REDDIT_SUFFIX_REGEX = / - Reddit$/;
|
|
25
|
+
/**
 * Normalize raw Serper API responses into per-keyword result records.
 *
 * @param {Array<object>} responses - Raw JSON bodies from the search API, one per keyword.
 * @param {string[]} keywords - Keywords in the same order as `responses`.
 * @returns {Array<{keyword: string, results: Array, totalResults: number, related: string[]}>}
 *   One record per response. A malformed response yields an empty record
 *   instead of throwing (this module's "never crash" contract).
 */
function parseSearchResponses(responses, keywords) {
  return responses.map((resp, index) => {
    const keyword = keywords[index] || "";
    try {
      const organic = resp.organic || [];
      const results = organic.map((item, idx) => ({
        title: item.title || "No title",
        link: item.link || "#",
        snippet: item.snippet || "",
        date: item.date,
        position: item.position || idx + 1
      }));
      // The provider reports totalResults as a locale-formatted string
      // (e.g. "1,234"). Guard the parse: a non-numeric value would
      // otherwise leak NaN to callers; fall back to the organic count.
      let totalResults = results.length;
      const reported = resp.searchInformation?.totalResults;
      if (reported) {
        const parsed = Number.parseInt(String(reported).replace(/,/g, ""), 10);
        if (!Number.isNaN(parsed)) {
          totalResults = parsed;
        }
      }
      const related = (resp.relatedSearches || []).map((r) => r.query || "");
      return { keyword, results, totalResults, related };
    } catch {
      // Defensive: an unexpected response shape must not crash the batch.
      return { keyword, results: [], totalResults: 0, related: [] };
    }
  });
}
|
|
46
|
+
/**
 * POST a search request with bounded retries and exponential backoff.
 * Never throws: every outcome is reported as `{ data }` on success or
 * `{ data: undefined, error }` on failure.
 *
 * @param {string} apiKey - API key sent as the X-API-KEY header.
 * @param {object|object[]} body - Request payload (single query or batch).
 * @param {(status?: number, error?: unknown) => boolean} isRetryable - Retry predicate.
 * @returns {Promise<{data?: unknown, error?: object}>}
 */
async function executeSearchWithRetry(apiKey, body, isRetryable) {
  const { maxRetries, baseDelayMs, maxDelayMs, timeoutMs } = SEARCH_RETRY_CONFIG;

  // Log the retry reason, then wait out the backoff for this attempt.
  const backoffThenRetry = async (attempt, reason) => {
    const delayMs = calculateBackoff(attempt, baseDelayMs, maxDelayMs);
    mcpLog("warning", `${reason}, retrying in ${delayMs}ms...`, "search");
    await sleep(delayMs);
  };

  let lastError;
  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    try {
      if (attempt > 0) {
        mcpLog("warning", `Retry attempt ${attempt}/${maxRetries}`, "search");
      }
      const response = await fetchWithTimeout(SERPER_API_URL, {
        method: "POST",
        headers: {
          "X-API-KEY": apiKey,
          "Content-Type": "application/json"
        },
        body: JSON.stringify(body),
        timeoutMs
      });

      if (!response.ok) {
        const errorText = await response.text().catch(() => "");
        lastError = classifyError({ status: response.status, message: errorText });
        if (isRetryable(response.status) && attempt < maxRetries) {
          await backoffThenRetry(attempt, `API returned ${response.status}`);
          continue;
        }
        return { data: void 0, error: lastError };
      }

      try {
        return { data: await response.json() };
      } catch {
        // A 2xx with an unparsable body is terminal — retrying won't help.
        return {
          data: void 0,
          error: { code: ErrorCode.PARSE_ERROR, message: "Failed to parse search response", retryable: false }
        };
      }
    } catch (error) {
      // Transport-level failure (timeout, connection reset, ...).
      lastError = classifyError(error);
      if (isRetryable(void 0, error) && attempt < maxRetries) {
        await backoffThenRetry(attempt, `${lastError.code}: ${lastError.message}`);
        continue;
      }
      return { data: void 0, error: lastError };
    }
  }

  // Defensive terminal result; every path above returns or exhausts retries.
  return {
    data: void 0,
    error: lastError || { code: ErrorCode.UNKNOWN_ERROR, message: "Search failed", retryable: false }
  };
}
|
|
98
|
+
/**
 * Client for Google web search via the Serper API.
 * All public search methods follow a "never throw" contract; the constructor
 * is the single exception (it throws when no API key can be resolved).
 */
class SearchClient {
  apiKey;

  /**
   * @param {string} [apiKey] - Explicit key; falls back to SEARCH_API_KEY from the environment.
   * @throws {Error} When no API key is available from either source.
   */
  constructor(apiKey) {
    this.apiKey = apiKey || parseEnv().SEARCH_API_KEY || "";
    if (!this.apiKey) {
      throw new Error("Web search capability is not configured. Please set up the required API credentials.");
    }
  }

  /**
   * Check if error is retryable: a retryable HTTP status, or an error whose
   * message suggests a transient condition (timeout / rate limit / connection).
   */
  isRetryable(status, error) {
    if (status && RETRYABLE_SEARCH_CODES.has(status)) {
      return true;
    }
    if (error == null) {
      return false;
    }
    let message = "";
    if (typeof error === "object" && "message" in error && typeof error.message === "string") {
      message = error.message.toLowerCase();
    }
    return ["timeout", "rate limit", "connection"].some((hint) => message.includes(hint));
  }

  /**
   * Search multiple keywords in parallel (one batched API request).
   * NEVER throws - always returns a valid response.
   */
  async searchMultiple(keywords) {
    const startTime = Date.now();
    if (keywords.length === 0) {
      return {
        searches: [],
        totalKeywords: 0,
        executionTime: 0,
        error: { code: ErrorCode.INVALID_INPUT, message: "No keywords provided", retryable: false }
      };
    }
    const { data, error } = await executeSearchWithRetry(
      this.apiKey,
      keywords.map((keyword) => ({ q: keyword })),
      (status, err) => this.isRetryable(status, err)
    );
    if (error || data === void 0) {
      return {
        searches: [],
        totalKeywords: keywords.length,
        executionTime: Date.now() - startTime,
        error
      };
    }
    const responses = Array.isArray(data) ? data : [data];
    return {
      searches: parseSearchResponses(responses, keywords),
      totalKeywords: keywords.length,
      executionTime: Date.now() - startTime
    };
  }

  /**
   * Search Reddit via Google (adds site:reddit.com automatically).
   * NEVER throws - returns empty array on failure.
   */
  async searchReddit(query, dateAfter) {
    if (!query?.trim()) {
      return [];
    }
    // Drop any caller-supplied site: filter, then pin results to reddit.com.
    let q = query.replace(REDDIT_SITE_REGEX, "").trim() + " site:reddit.com";
    if (dateAfter) {
      q += ` after:${dateAfter}`;
    }
    const { maxRetries, baseDelayMs, maxDelayMs, timeoutMs } = SEARCH_RETRY_CONFIG;
    for (let attempt = 0; attempt <= maxRetries; attempt++) {
      try {
        const res = await fetchWithTimeout(SERPER_API_URL, {
          method: "POST",
          headers: { "X-API-KEY": this.apiKey, "Content-Type": "application/json" },
          body: JSON.stringify({ q, num: DEFAULT_RESULTS_PER_KEYWORD }),
          timeoutMs
        });
        if (res.ok) {
          const data = await res.json();
          // Strip Google's " : r/<sub>" and " - Reddit" title suffixes.
          return (data.organic || []).map((r) => ({
            title: (r.title || "").replace(REDDIT_SUBREDDIT_SUFFIX_REGEX, "").replace(REDDIT_SUFFIX_REGEX, ""),
            url: r.link || "",
            snippet: r.snippet || "",
            date: r.date
          }));
        }
        if (this.isRetryable(res.status) && attempt < maxRetries) {
          const delayMs = calculateBackoff(attempt, baseDelayMs, maxDelayMs);
          mcpLog("warning", `Reddit search ${res.status}, retrying in ${delayMs}ms...`, "search");
          await sleep(delayMs);
          continue;
        }
        mcpLog("error", `Reddit search failed with status ${res.status}`, "search");
        return [];
      } catch (error) {
        const err = classifyError(error);
        if (this.isRetryable(void 0, error) && attempt < maxRetries) {
          const delayMs = calculateBackoff(attempt, baseDelayMs, maxDelayMs);
          mcpLog("warning", `Reddit search ${err.code}, retrying in ${delayMs}ms...`, "search");
          await sleep(delayMs);
          continue;
        }
        mcpLog("error", `Reddit search failed: ${err.message}`, "search");
        return [];
      }
    }
    return [];
  }

  /**
   * Search Reddit with multiple queries (bounded concurrency).
   * NEVER throws - searchReddit never throws, pMap preserves order.
   */
  async searchRedditMultiple(queries, dateAfter) {
    if (queries.length === 0) {
      return /* @__PURE__ */ new Map();
    }
    const perQueryResults = await pMap(
      queries,
      (q) => this.searchReddit(q, dateAfter),
      MAX_SEARCH_CONCURRENCY
    );
    const byQuery = new Map();
    queries.forEach((q, i) => byQuery.set(q, perQueryResults[i] || []));
    return byQuery;
  }
}
|
|
215
|
+
export {
|
|
216
|
+
SearchClient
|
|
217
|
+
};
|
|
218
|
+
//# sourceMappingURL=search.js.map
|