@robot-resources/scraper 0.1.3 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +11 -6
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +11 -6
- package/dist/index.js.map +1 -1
- package/dist/mcp-entry.js +1040 -0
- package/package.json +6 -3
|
@@ -0,0 +1,1040 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
|
|
3
|
+
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
|
|
4
|
+
import { z } from 'zod';
|
|
5
|
+
import { Readability } from '@mozilla/readability';
|
|
6
|
+
import { parseHTML } from 'linkedom';
|
|
7
|
+
import TurndownService from 'turndown';
|
|
8
|
+
import robotsParser from 'robots-parser';
|
|
9
|
+
|
|
10
|
+
// src/fetch.ts
// Plain HTTP fetch layer: URL validation, user-agent rotation, per-attempt
// timeout enforcement, and retry with exponential backoff for transient
// (5xx / network / timeout) failures.
var USER_AGENTS = [
  "Mozilla/5.0 (compatible; ScraperBot/1.0; +https://scraper.robotresources.ai)",
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
];
var DEFAULT_TIMEOUT = 1e4;    // 10 s per attempt
var DEFAULT_MAX_RETRIES = 3;  // retries after the first attempt
var BASE_BACKOFF_MS = 1e3;    // doubled on each retry: 1s, 2s, 4s, ...
// Error type carrying the HTTP status (when known) and a retryability flag
// that the retry loop below consults.
var FetchError = class extends Error {
  constructor(message, statusCode, retryable = false) {
    super(message);
    this.statusCode = statusCode;
    this.retryable = retryable;
    this.name = "FetchError";
  }
};
// Accept only absolute http/https URLs.
function isValidUrl(url) {
  try {
    const parsed = new URL(url);
    return parsed.protocol === "http:" || parsed.protocol === "https:";
  } catch {
    return false;
  }
}
// Pick one of the rotating user agents at random.
function getRandomUserAgent() {
  return USER_AGENTS[Math.floor(Math.random() * USER_AGENTS.length)];
}
// Browser-like request headers; an explicit `userAgent` overrides rotation.
function buildHeaders(userAgent) {
  return {
    "User-Agent": userAgent || getRandomUserAgent(),
    Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate",
    Connection: "keep-alive",
    "Cache-Control": "no-cache"
  };
}
// Flatten a Headers-like object into a plain record with lower-cased keys.
function headersToObject(headers) {
  const result = {};
  headers.forEach((value, key) => {
    result[key.toLowerCase()] = value;
  });
  return result;
}
// Only server-side (5xx) errors are worth retrying.
function isRetryableStatus(status) {
  return status >= 500 && status < 600;
}
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
// Exponential backoff: BASE * 2^attempt.
function getBackoffDelay(attempt) {
  return BASE_BACKOFF_MS * Math.pow(2, attempt);
}
// Single GET guarded by an AbortController timeout; maps aborts to a
// retryable FetchError and always clears the timer.
async function fetchWithTimeout(url, headers, timeout) {
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeout);
  try {
    const response = await fetch(url, {
      method: "GET",
      headers,
      signal: controller.signal,
      redirect: "follow"
    });
    return response;
  } catch (error) {
    if (error instanceof Error && error.name === "AbortError") {
      throw new FetchError("Request timeout", void 0, true);
    }
    throw error;
  } finally {
    clearTimeout(timeoutId);
  }
}
/**
 * Fetch a URL as text with retry/backoff.
 *
 * @param {string} url - absolute http/https URL.
 * @param {{timeout?: number, maxRetries?: number, userAgent?: string}} [options]
 * @returns {Promise<{html: string, url: string, statusCode: number, headers: Object}>}
 *   `url` is the final URL after redirects.
 * @throws {FetchError} on invalid URL, non-2xx response, or exhausted retries.
 */
async function fetchUrl(url, options = {}) {
  const {
    timeout = DEFAULT_TIMEOUT,
    maxRetries = DEFAULT_MAX_RETRIES,
    userAgent
  } = options;
  if (!isValidUrl(url)) {
    throw new FetchError("Invalid URL", void 0, false);
  }
  const headers = buildHeaders(userAgent);
  let lastError = null;
  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    try {
      const response = await fetchWithTimeout(url, headers, timeout);
      if (!response.ok) {
        const statusCode = response.status;
        if (statusCode >= 400 && statusCode < 500) {
          throw new FetchError(`HTTP ${statusCode}`, statusCode, false);
        }
        if (isRetryableStatus(statusCode)) {
          throw new FetchError(`HTTP ${statusCode}`, statusCode, true);
        }
        // Fix: previously a non-ok status outside 4xx/5xx (e.g. an
        // unfollowed 3xx or 304) fell through and was returned as success.
        throw new FetchError(`HTTP ${statusCode}`, statusCode, false);
      }
      const html = await response.text();
      const responseHeaders = headersToObject(response.headers);
      return {
        html,
        url: response.url,
        statusCode: response.status,
        headers: responseHeaders
      };
    } catch (error) {
      lastError = error instanceof Error ? error : new Error(String(error));
      // Non-FetchError failures (network hiccups) are treated as retryable.
      const isRetryable = error instanceof FetchError ? error.retryable : true;
      if (isRetryable && attempt < maxRetries) {
        await sleep(getBackoffDelay(attempt));
        continue;
      }
      break;
    }
  }
  throw lastError || new FetchError("Unknown fetch error");
}
|
|
130
|
+
|
|
131
|
+
// src/fetch-stealth.ts
// Browser-impersonating fetch built on the optional `impit` dependency,
// using the same validation / retry / backoff scheme as the plain fetcher.
var DEFAULT_TIMEOUT2 = 1e4;
var DEFAULT_MAX_RETRIES2 = 2;
var BASE_BACKOFF_MS2 = 1e3;
// Accept only absolute http/https URLs.
function isValidUrl2(url) {
  try {
    const { protocol } = new URL(url);
    return protocol === "http:" || protocol === "https:";
  } catch {
    return false;
  }
}
// 5xx responses are transient and may be retried.
function isRetryableStatus2(status) {
  return status >= 500 && status < 600;
}
function sleep2(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
// Exponential backoff: BASE * 2^attempt.
function getBackoffDelay2(attempt) {
  return BASE_BACKOFF_MS2 * Math.pow(2, attempt);
}
/**
 * Fetch a URL while impersonating a Chrome TLS/header fingerprint.
 * `impit` is imported lazily so it stays an optional dependency.
 *
 * @param {string} url - absolute http/https URL.
 * @param {{timeout?: number, maxRetries?: number}} [options]
 * @returns {Promise<{html: string, url: string, statusCode: number, headers: Object}>}
 * @throws {FetchError} when impit is missing, the URL is invalid, or retries run out.
 */
async function fetchStealth(url, options = {}) {
  const { timeout = DEFAULT_TIMEOUT2, maxRetries = DEFAULT_MAX_RETRIES2 } = options;
  if (!isValidUrl2(url)) {
    throw new FetchError("Invalid URL", void 0, false);
  }
  let Impit;
  try {
    ({ Impit } = await import('impit'));
  } catch {
    throw new FetchError(
      "impit is required for stealth mode. Install: npm install impit (requires Node >= 20)",
      void 0,
      false
    );
  }
  const client = new Impit({ browser: "chrome" });
  let lastError = null;
  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    try {
      const response = await client.fetch(url, {
        signal: AbortSignal.timeout(timeout)
      });
      if (!response.ok) {
        const statusCode = response.status;
        if (statusCode >= 400 && statusCode < 500) {
          throw new FetchError(`HTTP ${statusCode}`, statusCode, false);
        }
        if (isRetryableStatus2(statusCode)) {
          throw new FetchError(`HTTP ${statusCode}`, statusCode, true);
        }
      }
      const html = await response.text();
      const headers = {};
      response.headers.forEach((value, key) => {
        headers[key] = value;
      });
      return {
        html,
        url: response.url ?? url,  // impit may omit the final URL
        statusCode: response.status,
        headers
      };
    } catch (error) {
      lastError = error instanceof Error ? error : new Error(String(error));
      // Non-FetchError failures are always considered retryable.
      const retryable = error instanceof FetchError ? error.retryable : true;
      const canRetry = attempt < maxRetries;
      if (!retryable || !canRetry) break;
      await sleep2(getBackoffDelay2(attempt));
    }
  }
  throw lastError || new FetchError("Unknown stealth fetch error");
}
|
|
211
|
+
|
|
212
|
+
// src/fetch-render.ts
// Full-browser fetch via the optional Playwright dependency; renders the
// page (JS executed) before capturing its HTML.
var DEFAULT_TIMEOUT3 = 3e4;
// Accept only absolute http/https URLs.
function isValidUrl3(url) {
  try {
    const { protocol } = new URL(url);
    return protocol === "http:" || protocol === "https:";
  } catch {
    return false;
  }
}
/**
 * Navigate to a URL in headless Chromium and return the rendered HTML.
 * Playwright is imported lazily so it stays optional.
 *
 * @param {string} url - absolute http/https URL.
 * @param {{timeout?: number}} [options]
 * @returns {Promise<{html: string, url: string, statusCode: number, headers: Object}>}
 * @throws {FetchError} when Playwright is missing, the URL is invalid,
 *   the status is >= 400, or navigation times out.
 */
async function fetchRender(url, options = {}) {
  const { timeout = DEFAULT_TIMEOUT3 } = options;
  if (!isValidUrl3(url)) {
    throw new FetchError("Invalid URL", void 0, false);
  }
  let chromium;
  try {
    ({ chromium } = await import('playwright'));
  } catch {
    throw new FetchError(
      "Playwright is required for render mode. Install: npm install playwright",
      void 0,
      false
    );
  }
  const browser = await chromium.launch({ headless: true });
  try {
    const page = await browser.newPage();
    // Auto-dismiss alert/confirm/prompt dialogs so navigation never hangs.
    page.on("dialog", (dialog) => dialog.dismiss());
    const response = await page.goto(url, {
      waitUntil: "networkidle",
      timeout
    });
    if (!response) {
      throw new FetchError(
        "Navigation returned no response (about:blank or same-URL redirect)",
        void 0,
        false
      );
    }
    const statusCode = response.status();
    if (statusCode >= 400 && statusCode < 500) {
      throw new FetchError(`HTTP ${statusCode}`, statusCode, false);
    }
    if (statusCode >= 500) {
      throw new FetchError(`HTTP ${statusCode}`, statusCode, true);
    }
    const html = await page.content();
    const headers = response.headers();
    return { html, url: page.url(), statusCode, headers };
  } catch (error) {
    if (error instanceof Error && error.name === "TimeoutError") {
      throw new FetchError("Navigation timeout", void 0, true);
    }
    throw error;
  } finally {
    // Always release the browser, even on navigation failure.
    await browser.close();
  }
}
|
|
276
|
+
|
|
277
|
+
// src/fetch-mode.ts
// Mode dispatcher: "fast" (plain fetch), "stealth" (impit), "render"
// (Playwright), or "auto" (fast first, escalating to stealth on
// bot-challenge pages, HTTP 403, or any non-HTTP failure).
var CHALLENGE_MARKERS = [
  "cf-browser-verification",
  "Just a moment",
  "_cf_chl_opt",
  "akamai-challenge",
  "ak-challenge"
];
// True when the body contains a known Cloudflare/Akamai challenge marker.
function isChallengeResponse(fetchResult) {
  const { html } = fetchResult;
  return CHALLENGE_MARKERS.some((marker) => html.includes(marker));
}
var VALID_MODES = ["fast", "stealth", "render", "auto"];
/**
 * Fetch a URL using the requested mode.
 *
 * @param {string} url
 * @param {"fast"|"stealth"|"render"|"auto"} mode
 * @param {Object} options - forwarded to the underlying fetcher.
 * @returns {Promise<Object>} the fetch result of the chosen strategy.
 * @throws {FetchError} for unknown modes or unrecoverable fetch failures.
 */
async function fetchWithMode(url, mode, options) {
  if (!VALID_MODES.includes(mode)) {
    throw new FetchError(
      `Invalid fetch mode: '${mode}'. Valid modes: ${VALID_MODES.join(", ")}`,
      void 0,
      false
    );
  }
  switch (mode) {
    case "stealth":
      return fetchStealth(url, options);
    case "render":
      return fetchRender(url, options);
    case "fast":
      return fetchUrl(url, options);
  }
  // mode === "auto": try the cheap path, then escalate.
  try {
    const result = await fetchUrl(url, options);
    return isChallengeResponse(result) ? fetchStealth(url, options) : result;
  } catch (err) {
    if (!(err instanceof FetchError)) {
      return fetchStealth(url, options);
    }
    // 403 commonly means bot-blocking; worth a stealth retry.
    if (err.statusCode === 403) {
      return fetchStealth(url, options);
    }
    throw err;
  }
}
|
|
316
|
+
// src/extract.ts
// Readability-based article extraction plus metadata fallbacks pulled from
// OpenGraph / schema.org / plain DOM elements.
var ExtractionError = class extends Error {
  constructor(message, code) {
    super(message);
    this.code = code;
    this.name = "ExtractionError";
  }
};
/**
 * Run Mozilla Readability over a fetched page and collect metadata.
 *
 * @param {{html: string}} fetchResult
 * @returns {Promise<{content: string, title?: string, author?: string,
 *   publishedAt?: string, siteName?: string}>}
 * @throws {ExtractionError} with code EMPTY_HTML or NO_CONTENT.
 */
async function extractContent(fetchResult) {
  const { html } = fetchResult;
  if (!html || !html.trim()) {
    throw new ExtractionError("Empty HTML content", "EMPTY_HTML");
  }
  const { document } = parseHTML(html);
  const reader = new Readability(document, { charThreshold: 50 });
  const article = reader.parse();
  // Reject empty or near-empty extractions (< 20 chars of content).
  if (!article || !article.content || article.content.trim().length < 20) {
    throw new ExtractionError(
      "No content could be extracted from the page",
      "NO_CONTENT"
    );
  }
  return {
    content: cleanContent(article.content),
    title: article.title || extractFallbackTitle(document),
    author: article.byline || void 0,
    publishedAt: article.publishedTime || extractPublishedTime(document),
    siteName: article.siteName || extractSiteName(document)
  };
}
// Drop inter-tag whitespace and collapse runs of spaces.
// NOTE(review): this also collapses whitespace inside <pre> blocks — confirm
// that is acceptable for the markdown pipeline downstream.
function cleanContent(content) {
  return content.replace(/>\s+</g, "><").replace(/\s{2,}/g, " ").trim();
}
// Title fallback chain: og:title -> <title> -> first <h1>.
function extractFallbackTitle(document) {
  const ogTitle = document.querySelector('meta[property="og:title"]');
  const ogContent = ogTitle ? ogTitle.getAttribute("content") : null;
  if (ogContent) return ogContent;
  const titleEl = document.querySelector("title");
  if (titleEl?.textContent) return titleEl.textContent.trim();
  const h1 = document.querySelector("h1");
  if (h1?.textContent) return h1.textContent.trim();
  return void 0;
}
// Publish-date fallback chain: og article:published_time ->
// schema.org datePublished -> first <time datetime>.
function extractPublishedTime(document) {
  const ogTime = document.querySelector(
    'meta[property="article:published_time"]'
  );
  const ogContent = ogTime ? ogTime.getAttribute("content") : null;
  if (ogContent) return ogContent;
  const schemaTime = document.querySelector('[itemprop="datePublished"]');
  if (schemaTime) {
    const value = schemaTime.getAttribute("datetime") || schemaTime.getAttribute("content");
    if (value) return value;
  }
  const timeEl = document.querySelector("time[datetime]");
  const datetime = timeEl ? timeEl.getAttribute("datetime") : null;
  if (datetime) return datetime;
  return void 0;
}
// Site-name fallback chain: og:site_name -> application-name meta.
function extractSiteName(document) {
  const ogSiteName = document.querySelector('meta[property="og:site_name"]');
  const ogContent = ogSiteName ? ogSiteName.getAttribute("content") : null;
  if (ogContent) return ogContent;
  const appName = document.querySelector('meta[name="application-name"]');
  const appContent = appName ? appName.getAttribute("content") : null;
  if (appContent) return appContent;
  return void 0;
}
|
|
402
|
+
// src/markdown.ts
// HTML-to-markdown conversion via a lazily-built, shared Turndown instance,
// plus a heuristic token estimator for the resulting text.
function createTurndownService() {
  const turndown = new TurndownService({
    headingStyle: "atx",
    hr: "---",
    bulletListMarker: "-",
    codeBlockStyle: "fenced",
    fence: "```",
    emDelimiter: "*",
    strongDelimiter: "**",
    linkStyle: "inlined"
  });
  turndown.remove(["script", "style", "noscript", "iframe"]);
  // Drop elements with no text at all (except void elements that carry
  // meaning without text).
  turndown.addRule("removeEmpty", {
    filter: (node) => {
      if (node.nodeType !== 1) return false;
      const text = node.textContent || "";
      return text.trim() === "" && !["IMG", "BR", "HR", "INPUT"].includes(node.nodeName);
    },
    replacement: () => ""
  });
  // Render <pre><code class="language-x"> as a fenced block with its language.
  turndown.addRule("fencedCodeBlock", {
    filter: (node, options) => {
      if (options.codeBlockStyle !== "fenced") return false;
      if (node.nodeName !== "PRE") return false;
      return node.firstChild !== null && node.firstChild.nodeName === "CODE";
    },
    replacement: (_content, node, options) => {
      const codeNode = node.firstChild;
      const code = codeNode.textContent || "";
      const className = codeNode.getAttribute("class") || "";
      const langMatch = className.match(/language-(\w+)/);
      const lang = langMatch ? langMatch[1] : "";
      const fence = options.fence || "```";
      return `

${fence}${lang}
${code}
${fence}

`;
    }
  });
  turndown.addRule("strikethrough", {
    filter: ["del", "s"],
    replacement: (content) => `~~${content}~~`
  });
  return turndown;
}
// Shared singleton — Turndown setup is done once per process.
var turndownInstance = null;
function getTurndown() {
  turndownInstance ??= createTurndownService();
  return turndownInstance;
}
// Normalize whitespace: collapse 3+ blank lines, strip trailing
// space/tabs per line, and trim leading/trailing newlines.
function cleanMarkdown(markdown) {
  return markdown
    .replace(/\n{3,}/g, "\n\n")
    .replace(/[ \t]+$/gm, "")
    .replace(/^\n+/, "")
    .replace(/\n+$/, "")
    .trim();
}
/**
 * Convert extracted HTML content to cleaned markdown with a token estimate.
 *
 * @param {{content: string}} extractResult
 * @returns {Promise<{markdown: string, tokenCount: number}>}
 */
async function convertToMarkdown(extractResult) {
  const { content } = extractResult;
  if (!content || !content.trim()) {
    return { markdown: "", tokenCount: 0 };
  }
  const markdown = cleanMarkdown(getTurndown().turndown(content));
  return { markdown, tokenCount: estimateTokens(markdown) };
}
// Heuristic token estimate: code, inline code, and URLs tokenize denser
// than prose, so each category uses its own chars-per-token divisor.
function estimateTokens(text) {
  if (!text) return 0;
  let tokens = 0;
  let remaining = text;
  remaining = remaining.replace(/```[\s\S]*?```/g, (match) => {
    tokens += Math.ceil(match.length / 3.2);
    return " ";
  });
  remaining = remaining.replace(/`[^`]+`/g, (match) => {
    tokens += Math.ceil(match.length / 3.5);
    return " ";
  });
  remaining = remaining.replace(/https?:\/\/\S+/g, (match) => {
    tokens += Math.ceil(match.length / 5);
    return " ";
  });
  const proseLength = remaining.replace(/\s+/g, " ").trim().length;
  if (proseLength > 0) {
    tokens += Math.ceil(proseLength / 4.3);
  }
  return Math.max(1, tokens);
}
|
|
500
|
+
// src/robots.ts
// robots.txt fetching + parsing with a per-origin in-memory cache.
var DEFAULT_TTL_MS = 60 * 60 * 1e3;  // cache entries live one hour
var DEFAULT_TIMEOUT_MS = 5e3;
var BOT_USER_AGENT = "ScraperBot";
var cache = /* @__PURE__ */ new Map();
// robots.txt always lives at the origin root.
function getRobotsUrl(url) {
  const { protocol, host } = new URL(url);
  return `${protocol}//${host}/robots.txt`;
}
// Fetch (or reuse a cached) robots.txt parser for a URL's origin.
// Any fetch failure yields an empty — i.e. fully permissive — parser,
// which is also cached so a down host is not re-polled every request.
async function getRobotsParser(url, timeout = DEFAULT_TIMEOUT_MS) {
  const robotsUrl = getRobotsUrl(url);
  const cached = cache.get(robotsUrl);
  if (cached && cached.expiresAt > Date.now()) {
    return cached.parser;
  }
  let text = "";
  try {
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), timeout);
    const response = await fetch(robotsUrl, {
      signal: controller.signal,
      headers: { "User-Agent": BOT_USER_AGENT }
    });
    clearTimeout(timeoutId);
    if (response.ok) {
      text = await response.text();
    }
  } catch {
    // Unreachable / timed-out robots.txt → treat as empty (allow all).
  }
  const parser = robotsParser(robotsUrl, text);
  cache.set(robotsUrl, {
    parser,
    expiresAt: Date.now() + DEFAULT_TTL_MS
  });
  return parser;
}
// True unless robots.txt explicitly disallows this URL for our UA.
async function isAllowedByRobots(url, timeout) {
  const parser = await getRobotsParser(url, timeout);
  return parser.isAllowed(url, BOT_USER_AGENT) !== false;
}
// Crawl-delay in seconds for our UA, or null when unspecified.
async function getCrawlDelay(url, timeout) {
  const parser = await getRobotsParser(url, timeout);
  const delay = parser.getCrawlDelay(BOT_USER_AGENT);
  return delay === void 0 ? null : delay;
}
|
|
547
|
+
|
|
548
|
+
// src/sitemap.ts
// Lightweight regex-based sitemap / sitemap-index parsing with a one-hour
// in-memory cache and bounded index recursion.
var DEFAULT_TIMEOUT_MS2 = 1e4;
var DEFAULT_TTL_MS2 = 60 * 60 * 1e3;
var MAX_RECURSION_DEPTH = 2;  // an index may point at indexes; stop at 2 levels
var cache2 = /* @__PURE__ */ new Map();
// "protocol//host" for same-origin filtering.
function getOrigin(url) {
  const { protocol, host } = new URL(url);
  return `${protocol}//${host}`;
}
// Distinguish a <sitemapindex> document from a plain <urlset>.
function isSitemapIndex(xml) {
  return /<sitemapindex[\s>]/i.test(xml);
}
// Pull every child-sitemap <loc> out of an index document.
function extractSitemapIndexUrls(xml) {
  const urls = [];
  const sitemapBlockRegex = /<(?:\w+:)?sitemap\b[^>]*>([\s\S]*?)<\/(?:\w+:)?sitemap>/gi;
  let blockMatch;
  while ((blockMatch = sitemapBlockRegex.exec(xml)) !== null) {
    const locMatch = /<(?:\w+:)?loc\b[^>]*>([\s\S]*?)<\/(?:\w+:)?loc>/i.exec(blockMatch[1]);
    const url = locMatch ? locMatch[1].trim() : "";
    if (url) urls.push(url);
  }
  return urls;
}
// Parse <url> entries (loc / lastmod / priority), keeping only same-origin
// locations.
function extractUrlEntries(xml, origin) {
  const entries = [];
  const urlBlockRegex = /<(?:\w+:)?url\b[^>]*>([\s\S]*?)<\/(?:\w+:)?url>/gi;
  let blockMatch;
  while ((blockMatch = urlBlockRegex.exec(xml)) !== null) {
    const block = blockMatch[1];
    const locMatch = /<(?:\w+:)?loc\b[^>]*>([\s\S]*?)<\/(?:\w+:)?loc>/i.exec(block);
    if (!locMatch) continue;
    const loc = locMatch[1].trim();
    if (!loc) continue;
    try {
      if (getOrigin(loc) !== origin) continue;
    } catch {
      continue;  // unparseable <loc> — skip the entry
    }
    const entry = { loc };
    const lastmodMatch = /<(?:\w+:)?lastmod\b[^>]*>([\s\S]*?)<\/(?:\w+:)?lastmod>/i.exec(block);
    const lastmod = lastmodMatch ? lastmodMatch[1].trim() : "";
    if (lastmod) entry.lastmod = lastmod;
    const priorityMatch = /<(?:\w+:)?priority\b[^>]*>([\s\S]*?)<\/(?:\w+:)?priority>/i.exec(block);
    if (priorityMatch) {
      const priority = parseFloat(priorityMatch[1].trim());
      if (!isNaN(priority)) entry.priority = priority;
    }
    entries.push(entry);
  }
  return entries;
}
// Fetch sitemap XML with a timeout; any failure returns null (best-effort).
async function fetchSitemapXml(url, timeout) {
  try {
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), timeout);
    const response = await fetch(url, {
      signal: controller.signal,
      headers: { "User-Agent": "ScraperBot/1.0" }
    });
    clearTimeout(timeoutId);
    return response.ok ? await response.text() : null;
  } catch {
    return null;
  }
}
// Recursive worker: resolves sitemap indexes into their constituent
// sitemaps, depth-limited to avoid unbounded fan-out.
async function parseSitemapInternal(url, origin, timeout, depth) {
  if (depth >= MAX_RECURSION_DEPTH) return [];
  const xml = await fetchSitemapXml(url, timeout);
  if (!xml) return [];
  if (!isSitemapIndex(xml)) {
    return extractUrlEntries(xml, origin);
  }
  const allEntries = [];
  for (const sitemapUrl of extractSitemapIndexUrls(xml)) {
    const entries = await parseSitemapInternal(sitemapUrl, origin, timeout, depth + 1);
    allEntries.push(...entries);
  }
  return allEntries;
}
/**
 * Parse a sitemap URL into same-origin entries, caching results for an hour.
 *
 * @param {string} url - sitemap (or sitemap-index) URL.
 * @param {number} [timeout] - per-request timeout in ms.
 * @returns {Promise<Array<{loc: string, lastmod?: string, priority?: number}>>}
 */
async function parseSitemap(url, timeout = DEFAULT_TIMEOUT_MS2) {
  const cached = cache2.get(url);
  if (cached && cached.expiresAt > Date.now()) {
    return cached.entries;
  }
  let origin;
  try {
    origin = getOrigin(url);
  } catch {
    return [];  // invalid sitemap URL — nothing to parse
  }
  const entries = await parseSitemapInternal(url, origin, timeout, 0);
  cache2.set(url, {
    entries,
    expiresAt: Date.now() + DEFAULT_TTL_MS2
  });
  return entries;
}
|
|
657
|
+
|
|
658
|
+
// src/crawl.ts
// Helpers used by the crawler: URL validation/normalization, same-origin
// link extraction, glob-style include/exclude filtering.
function isValidUrl4(url) {
  try {
    const { protocol } = new URL(url);
    return protocol === "http:" || protocol === "https:";
  } catch {
    return false;
  }
}
// Canonical form for dedup: drop the fragment and any trailing slash on a
// non-root path. Unparseable input is returned untouched.
function normalizeUrl(url) {
  try {
    const parsed = new URL(url);
    parsed.hash = "";
    const { pathname } = parsed;
    parsed.pathname = pathname.length > 1 && pathname.endsWith("/")
      ? pathname.slice(0, -1)
      : pathname;
    return parsed.toString();
  } catch {
    return url;
  }
}
// Non-HTML resources the crawler never follows.
var SKIP_EXTENSIONS = /* @__PURE__ */ new Set([
  ".pdf",
  ".jpg",
  ".jpeg",
  ".png",
  ".gif",
  ".svg",
  ".webp",
  ".ico",
  ".mp4",
  ".mp3",
  ".wav",
  ".avi",
  ".zip",
  ".tar",
  ".gz",
  ".rar",
  ".css",
  ".js",
  ".xml",
  ".json",
  ".woff",
  ".woff2",
  ".ttf",
  ".eot"
]);
// Collect unique, normalized, same-origin <a href> targets from raw HTML,
// skipping mailto/tel/javascript/fragment links and asset extensions.
function extractLinks(html, baseUrl) {
  let origin;
  try {
    origin = new URL(baseUrl).origin;
  } catch {
    return [];
  }
  const links = [];
  const regex = /<a\s+[^>]*href\s*=\s*["']([^"']+)["'][^>]*>/gi;
  let match;
  while ((match = regex.exec(html)) !== null) {
    const href = match[1].trim();
    const skip = href.startsWith("mailto:") || href.startsWith("tel:") ||
      href.startsWith("javascript:") || href.startsWith("#");
    if (skip) continue;
    try {
      const resolved = new URL(href, baseUrl);
      if (resolved.origin !== origin) continue;
      const ext = resolved.pathname.toLowerCase().match(/\.\w+$/)?.[0];
      if (ext && SKIP_EXTENSIONS.has(ext)) continue;
      links.push(normalizeUrl(resolved.toString()));
    } catch {
      // unparseable href — ignore
    }
  }
  return [...new Set(links)];
}
// Translate a glob ("*" = path segment, "**" = anything) into a regex test.
// NOTE(review): the regex is unanchored, so patterns match anywhere in the
// URL — confirm that substring semantics are intended.
function matchGlob(url, pattern) {
  const regex = pattern
    .replace(/[.+^${}()|[\]\\]/g, "\\$&")
    .replace(/\*\*/g, "{{DOUBLESTAR}}")
    .replace(/\*/g, "[^/]*")
    .replace(/\{\{DOUBLESTAR\}\}/g, ".*");
  return new RegExp(regex).test(url);
}
// Exclude patterns win over include patterns; no include list means
// everything (not excluded) passes.
function matchesFilter(url, include, exclude) {
  for (const pattern of exclude ?? []) {
    if (matchGlob(url, pattern)) return false;
  }
  if (include?.length) {
    return include.some((pattern) => matchGlob(url, pattern));
  }
  return true;
}
function sleep3(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
|
|
749
|
+
async function crawl(options) {
|
|
750
|
+
const startTime = Date.now();
|
|
751
|
+
const {
|
|
752
|
+
url: startUrl,
|
|
753
|
+
depth: maxDepth = 2,
|
|
754
|
+
limit = 50,
|
|
755
|
+
mode = "auto",
|
|
756
|
+
include,
|
|
757
|
+
exclude,
|
|
758
|
+
timeout,
|
|
759
|
+
concurrency = 3,
|
|
760
|
+
respectRobots = true
|
|
761
|
+
} = options;
|
|
762
|
+
if (!isValidUrl4(startUrl)) {
|
|
763
|
+
throw new FetchError("Invalid URL", void 0, false);
|
|
764
|
+
}
|
|
765
|
+
if (maxDepth < 0) throw new FetchError("depth must be >= 0", void 0, false);
|
|
766
|
+
if (limit < 1) throw new FetchError("limit must be >= 1", void 0, false);
|
|
767
|
+
if (concurrency < 1) throw new FetchError("concurrency must be >= 1", void 0, false);
|
|
768
|
+
if (timeout !== void 0 && (timeout <= 0 || Number.isNaN(timeout))) {
|
|
769
|
+
throw new FetchError("timeout must be a positive number", void 0, false);
|
|
770
|
+
}
|
|
771
|
+
const pages = [];
|
|
772
|
+
const errors = [];
|
|
773
|
+
const visited = /* @__PURE__ */ new Set();
|
|
774
|
+
let totalDiscovered = 0;
|
|
775
|
+
let totalSkipped = 0;
|
|
776
|
+
const normalizedStart = normalizeUrl(startUrl);
|
|
777
|
+
const origin = new URL(startUrl).origin;
|
|
778
|
+
let crawlDelay = null;
|
|
779
|
+
if (respectRobots) {
|
|
780
|
+
crawlDelay = await getCrawlDelay(startUrl, timeout);
|
|
781
|
+
const allowed = await isAllowedByRobots(startUrl, timeout);
|
|
782
|
+
if (!allowed) {
|
|
783
|
+
return {
|
|
784
|
+
pages: [],
|
|
785
|
+
totalDiscovered: 1,
|
|
786
|
+
totalCrawled: 0,
|
|
787
|
+
totalSkipped: 1,
|
|
788
|
+
errors: [],
|
|
789
|
+
duration: Date.now() - startTime
|
|
790
|
+
};
|
|
791
|
+
}
|
|
792
|
+
}
|
|
793
|
+
const queue = [
|
|
794
|
+
{ url: normalizedStart, depth: 0 }
|
|
795
|
+
];
|
|
796
|
+
if (maxDepth > 0) {
|
|
797
|
+
try {
|
|
798
|
+
const sitemapEntries = await parseSitemap(`${origin}/sitemap.xml`, timeout);
|
|
799
|
+
const seen = /* @__PURE__ */ new Set([normalizedStart]);
|
|
800
|
+
for (const entry of sitemapEntries) {
|
|
801
|
+
const normalized = normalizeUrl(entry.loc);
|
|
802
|
+
if (!seen.has(normalized)) {
|
|
803
|
+
seen.add(normalized);
|
|
804
|
+
queue.push({ url: normalized, depth: 1 });
|
|
805
|
+
}
|
|
806
|
+
}
|
|
807
|
+
} catch (err) {
|
|
808
|
+
console.debug(`[scraper] Sitemap unavailable for ${origin}: ${err instanceof Error ? err.message : String(err)}`);
|
|
809
|
+
}
|
|
810
|
+
}
|
|
811
|
+
totalDiscovered = queue.length;
|
|
812
|
+
while (queue.length > 0 && pages.length < limit) {
|
|
813
|
+
const batchSize = Math.min(concurrency, limit - pages.length, queue.length);
|
|
814
|
+
const batch = queue.splice(0, batchSize);
|
|
815
|
+
const tasks = batch.map(async ({ url, depth }) => {
|
|
816
|
+
const normalized = normalizeUrl(url);
|
|
817
|
+
if (visited.has(normalized)) {
|
|
818
|
+
totalSkipped++;
|
|
819
|
+
return;
|
|
820
|
+
}
|
|
821
|
+
visited.add(normalized);
|
|
822
|
+
if (normalized !== normalizedStart && !matchesFilter(normalized, include, exclude)) {
|
|
823
|
+
totalSkipped++;
|
|
824
|
+
return;
|
|
825
|
+
}
|
|
826
|
+
if (respectRobots) {
|
|
827
|
+
const allowed = await isAllowedByRobots(url, timeout);
|
|
828
|
+
if (!allowed) {
|
|
829
|
+
totalSkipped++;
|
|
830
|
+
return;
|
|
831
|
+
}
|
|
832
|
+
}
|
|
833
|
+
try {
|
|
834
|
+
const fetchResult = await fetchWithMode(url, mode, { timeout });
|
|
835
|
+
const extractResult = await extractContent(fetchResult);
|
|
836
|
+
const convertResult = await convertToMarkdown(extractResult);
|
|
837
|
+
const pageResult = {
|
|
838
|
+
markdown: convertResult.markdown,
|
|
839
|
+
tokenCount: convertResult.tokenCount,
|
|
840
|
+
title: extractResult.title,
|
|
841
|
+
author: extractResult.author,
|
|
842
|
+
siteName: extractResult.siteName,
|
|
843
|
+
publishedAt: extractResult.publishedAt,
|
|
844
|
+
url: fetchResult.url,
|
|
845
|
+
depth
|
|
846
|
+
};
|
|
847
|
+
pages.push(pageResult);
|
|
848
|
+
if (depth < maxDepth) {
|
|
849
|
+
const links = extractLinks(fetchResult.html, fetchResult.url);
|
|
850
|
+
for (const link of links) {
|
|
851
|
+
if (!visited.has(link)) {
|
|
852
|
+
queue.push({ url: link, depth: depth + 1 });
|
|
853
|
+
totalDiscovered++;
|
|
854
|
+
}
|
|
855
|
+
}
|
|
856
|
+
}
|
|
857
|
+
} catch (err) {
|
|
858
|
+
errors.push({
|
|
859
|
+
url,
|
|
860
|
+
error: err instanceof Error ? err.message : String(err),
|
|
861
|
+
depth
|
|
862
|
+
});
|
|
863
|
+
}
|
|
864
|
+
});
|
|
865
|
+
await Promise.allSettled(tasks);
|
|
866
|
+
if (crawlDelay && crawlDelay > 0 && queue.length > 0) {
|
|
867
|
+
await sleep3(crawlDelay * 1e3);
|
|
868
|
+
}
|
|
869
|
+
}
|
|
870
|
+
return {
|
|
871
|
+
pages,
|
|
872
|
+
totalDiscovered,
|
|
873
|
+
totalCrawled: pages.length,
|
|
874
|
+
totalSkipped,
|
|
875
|
+
errors,
|
|
876
|
+
duration: Date.now() - startTime
|
|
877
|
+
};
|
|
878
|
+
}
|
|
879
|
+
|
|
880
|
+
// src/mcp-server.ts
|
|
881
|
+
// src/mcp-server.ts
/**
 * Build the MCP server instance and register the two scraper tools.
 * Tool input schemas are zod shapes; handlers delegate to compressUrl /
 * crawlUrl defined below. Returns the configured (not yet connected) server.
 */
function createServer() {
  // Factory so each tool gets its own schema instance, mirroring the
  // original per-tool construction.
  const fetchModeSchema = () => z.enum(["fast", "stealth", "render", "auto"]).optional().describe("Fetch mode: 'fast' (plain HTTP), 'stealth' (TLS fingerprint), 'render' (headless browser), 'auto' (fast with fallback). Default: 'auto'");
  const mcp = new McpServer({ name: "scraper-mcp", version: "0.2.0" });
  const compressInput = {
    url: z.string().url().describe("URL to compress"),
    mode: fetchModeSchema(),
    timeout: z.number().positive().optional().describe("Fetch timeout in milliseconds (default: 10000)"),
    maxRetries: z.number().int().min(0).max(10).optional().describe("Max retry attempts (default: 3)")
  };
  const crawlInput = {
    url: z.string().url().describe("Starting URL to crawl"),
    maxPages: z.number().int().min(1).max(100).optional().describe("Max pages to crawl (default: 10)"),
    maxDepth: z.number().int().min(0).max(5).optional().describe("Max link depth (default: 2)"),
    mode: fetchModeSchema(),
    include: z.array(z.string()).optional().describe("URL patterns to include (glob)"),
    exclude: z.array(z.string()).optional().describe("URL patterns to exclude (glob)"),
    timeout: z.number().positive().optional().describe("Per-page timeout in milliseconds (default: 10000)")
  };
  mcp.tool(
    "scraper_compress_url",
    "Compress web content from a URL for reduced token usage. Returns markdown with 70-90% fewer tokens than raw HTML.",
    compressInput,
    async (args) => compressUrl(args)
  );
  mcp.tool(
    "scraper_crawl_url",
    "Crawl multiple pages from a starting URL using BFS link discovery. Returns compressed markdown for each page with 70-90% fewer tokens than raw HTML.",
    crawlInput,
    async (args) => crawlUrl(args)
  );
  return mcp;
}
|
|
913
|
+
/**
 * MCP tool handler: fetch one URL, extract readable content, and return it
 * as markdown alongside compression metadata.
 *
 * @param {{url: string, mode?: string, timeout?: number, maxRetries?: number}} args
 * @returns MCP tool result; on failure, an isError result via formatError.
 */
async function compressUrl({
  url,
  mode,
  timeout,
  maxRetries
}) {
  try {
    const fetched = await fetchWithMode(url, mode ?? "auto", { timeout, maxRetries });
    // Token count of the raw HTML, used as the compression baseline.
    const rawTokens = estimateTokens(fetched.html);
    const extracted = await extractContent(fetched);
    const converted = await convertToMarkdown(extracted);
    // Percentage saved; guard against a zero baseline to avoid division by zero.
    let compressionRatio = 0;
    if (rawTokens > 0) {
      compressionRatio = Math.round((1 - converted.tokenCount / rawTokens) * 100);
    }
    return {
      content: [{ type: "text", text: converted.markdown }],
      structuredContent: {
        markdown: converted.markdown,
        tokenCount: converted.tokenCount,
        title: extracted.title ?? null,
        author: extracted.author ?? null,
        siteName: extracted.siteName ?? null,
        // Final URL after any redirects, not necessarily the requested one.
        url: fetched.url,
        compressionRatio
      }
    };
  } catch (error) {
    return formatError(url, error);
  }
}
|
|
941
|
+
/**
 * MCP tool handler: BFS-crawl from a starting URL and return one text part
 * per page (plus a leading summary line) and structured crawl statistics.
 *
 * @param {{url: string, maxPages?: number, maxDepth?: number, mode?: string,
 *          include?: string[], exclude?: string[], timeout?: number}} args
 * @returns MCP tool result; on failure, an isError result via formatError.
 */
async function crawlUrl({
  url,
  maxPages,
  maxDepth,
  mode,
  include,
  exclude,
  timeout
}) {
  try {
    const crawlResult = await crawl({
      url,
      limit: maxPages ?? 10,
      depth: maxDepth ?? 2,
      mode,
      include,
      exclude,
      timeout
    });
    const { host } = new URL(url);
    const errorCount = crawlResult.errors.length;
    // e.g. " (2 errors)" — omitted entirely when the crawl was clean.
    let errorSuffix = "";
    if (errorCount > 0) {
      errorSuffix = ` (${errorCount} error${errorCount > 1 ? "s" : ""})`;
    }
    const content = [
      { type: "text", text: `Crawled ${crawlResult.totalCrawled} pages from ${host}${errorSuffix}` }
    ];
    for (const page of crawlResult.pages) {
      // Prefix each page with its title as a markdown heading when available.
      const heading = page.title ? `## ${page.title}\n\n` : "";
      content.push({ type: "text", text: `${heading}${page.markdown}` });
    }
    return {
      content,
      structuredContent: {
        pages: crawlResult.pages,
        totalCrawled: crawlResult.totalCrawled,
        totalDiscovered: crawlResult.totalDiscovered,
        totalSkipped: crawlResult.totalSkipped,
        errors: crawlResult.errors,
        duration: crawlResult.duration
      }
    };
  } catch (error) {
    return formatError(url, error);
  }
}
|
|
990
|
+
/**
 * Map a caught error to an MCP isError tool result with an actionable,
 * human-readable message. Recognizes FetchError (HTTP/transport failures)
 * and ExtractionError (content extraction failures); anything else gets a
 * generic fallback message.
 *
 * @param {string} url - the URL being processed when the error occurred
 * @param {unknown} error - the thrown value
 */
function formatError(url, error) {
  // Shared shape for every error result.
  const fail = (text) => ({
    content: [{ type: "text", text }],
    isError: true
  });
  if (error instanceof FetchError) {
    const { statusCode } = error;
    if (statusCode === 403) {
      return fail(`Access denied (HTTP 403) for ${url}. The site may block automated access. Try a different URL or check if authentication is required.`);
    }
    if (statusCode === 404) {
      return fail(`Page not found (HTTP 404) at ${url}. Verify the URL is correct and the page exists.`);
    }
    if (statusCode && statusCode >= 500) {
      return fail(`Server error (HTTP ${statusCode}) from ${url}. The site may be experiencing issues. Try again later.`);
    }
    if (error.message.includes("timeout") || error.message.includes("Timeout")) {
      return fail(`Request timed out fetching ${url}. Try increasing the timeout parameter or check if the site is accessible.`);
    }
    if (error.message.includes("Invalid URL")) {
      return fail(`Invalid URL: ${url}. Provide a full URL starting with http:// or https://`);
    }
    return fail(`Failed to fetch ${url}: ${error.message}${error.retryable ? ". Retries exhausted \u2014 try again later." : ""}`);
  }
  if (error instanceof ExtractionError) {
    if (error.code === "EMPTY_HTML") {
      return fail(`The page at ${url} returned empty HTML. It may require JavaScript rendering \u2014 try using mode: 'render' (requires Playwright peer dependency) or try a URL that serves static HTML content.`);
    }
    if (error.code === "NO_CONTENT") {
      return fail(`Could not extract meaningful content from ${url}. The page may be a login wall, contain only images/video, or rely entirely on JavaScript rendering.`);
    }
    return fail(`Content extraction failed for ${url}: ${error.message}`);
  }
  const msg = error instanceof Error ? error.message : String(error);
  return fail(`Unexpected error processing ${url}: ${msg}`);
}
|
|
1036
|
+
|
|
1037
|
+
// src/mcp-entry.ts
|
|
1038
|
+
// Wire the MCP server to a stdio transport and start serving requests.
// Top-level await: a connection failure rejects module evaluation and
// exits the process with a non-zero status.
const server = createServer();
const transport = new StdioServerTransport();
await server.connect(transport);
|