mcp-researchpowerpack 6.0.5 → 6.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/mcp-use.json +2 -2
- package/dist/src/clients/jina.js +165 -0
- package/dist/src/clients/jina.js.map +7 -0
- package/dist/src/clients/scraper.js +21 -0
- package/dist/src/clients/scraper.js.map +2 -2
- package/dist/src/config/index.js +2 -1
- package/dist/src/config/index.js.map +2 -2
- package/dist/src/tools/scrape.js +128 -18
- package/dist/src/tools/scrape.js.map +2 -2
- package/dist/src/utils/errors.js +1 -0
- package/dist/src/utils/errors.js.map +2 -2
- package/dist/src/utils/source-type.js +40 -1
- package/dist/src/utils/source-type.js.map +2 -2
- package/package.json +1 -1
package/dist/src/clients/jina.js
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
import {
|
|
2
|
+
classifyError,
|
|
3
|
+
fetchWithTimeout,
|
|
4
|
+
sleep,
|
|
5
|
+
ErrorCode
|
|
6
|
+
} from "../utils/errors.js";
|
|
7
|
+
import { calculateBackoff } from "../utils/retry.js";
|
|
8
|
+
import { mcpLog } from "../utils/logger.js";
|
|
9
|
+
const JINA_READER_BASE = "https://r.jina.ai/";
|
|
10
|
+
const DEFAULT_TIMEOUT_MS = 6e4;
|
|
11
|
+
const MAX_RETRIES = 2;
|
|
12
|
+
class JinaClient {
|
|
13
|
+
apiKey;
|
|
14
|
+
constructor(apiKey) {
|
|
15
|
+
const fromEnv = process.env.JINA_API_KEY?.trim();
|
|
16
|
+
this.apiKey = apiKey?.trim() || fromEnv || void 0;
|
|
17
|
+
}
|
|
18
|
+
/**
|
|
19
|
+
* Convert a URL to markdown via Jina Reader.
|
|
20
|
+
* NEVER throws — always returns a JinaConvertResponse (possibly with error).
|
|
21
|
+
*/
|
|
22
|
+
async convert(request) {
|
|
23
|
+
const { url, timeoutMs = DEFAULT_TIMEOUT_MS } = request;
|
|
24
|
+
try {
|
|
25
|
+
new URL(url);
|
|
26
|
+
} catch {
|
|
27
|
+
return {
|
|
28
|
+
content: `Invalid URL: ${url}`,
|
|
29
|
+
statusCode: 400,
|
|
30
|
+
credits: 0,
|
|
31
|
+
error: { code: ErrorCode.INVALID_INPUT, message: `Invalid URL: ${url}`, retryable: false }
|
|
32
|
+
};
|
|
33
|
+
}
|
|
34
|
+
const jinaUrl = `${JINA_READER_BASE}${url}`;
|
|
35
|
+
const headers = {
|
|
36
|
+
Accept: "text/markdown"
|
|
37
|
+
};
|
|
38
|
+
if (this.apiKey) {
|
|
39
|
+
headers["Authorization"] = `Bearer ${this.apiKey}`;
|
|
40
|
+
}
|
|
41
|
+
let lastError;
|
|
42
|
+
for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) {
|
|
43
|
+
try {
|
|
44
|
+
const response = await fetchWithTimeout(jinaUrl, {
|
|
45
|
+
method: "GET",
|
|
46
|
+
headers,
|
|
47
|
+
timeoutMs
|
|
48
|
+
});
|
|
49
|
+
let content;
|
|
50
|
+
try {
|
|
51
|
+
content = await response.text();
|
|
52
|
+
} catch (readError) {
|
|
53
|
+
content = `Failed to read Jina response: ${readError instanceof Error ? readError.message : String(readError)}`;
|
|
54
|
+
}
|
|
55
|
+
const usageHeader = response.headers.get("x-usage-tokens");
|
|
56
|
+
const usageTokens = usageHeader ? Number(usageHeader) : void 0;
|
|
57
|
+
if (response.ok) {
|
|
58
|
+
if (!content.trim()) {
|
|
59
|
+
return {
|
|
60
|
+
content: "Jina returned an empty body",
|
|
61
|
+
statusCode: response.status,
|
|
62
|
+
credits: 0,
|
|
63
|
+
usageTokens: Number.isFinite(usageTokens) ? usageTokens : void 0,
|
|
64
|
+
error: {
|
|
65
|
+
code: ErrorCode.UNSUPPORTED_BINARY_CONTENT,
|
|
66
|
+
message: "Jina Reader returned empty content for this URL",
|
|
67
|
+
retryable: false
|
|
68
|
+
}
|
|
69
|
+
};
|
|
70
|
+
}
|
|
71
|
+
return {
|
|
72
|
+
content,
|
|
73
|
+
statusCode: response.status,
|
|
74
|
+
credits: 0,
|
|
75
|
+
usageTokens: Number.isFinite(usageTokens) ? usageTokens : void 0
|
|
76
|
+
};
|
|
77
|
+
}
|
|
78
|
+
if (response.status === 401 || response.status === 403) {
|
|
79
|
+
return {
|
|
80
|
+
content: `Jina auth/quota error (${response.status}): ${content.slice(0, 200)}`,
|
|
81
|
+
statusCode: response.status,
|
|
82
|
+
credits: 0,
|
|
83
|
+
error: {
|
|
84
|
+
code: response.status === 401 ? ErrorCode.AUTH_ERROR : ErrorCode.QUOTA_EXCEEDED,
|
|
85
|
+
message: response.status === 401 ? "Jina Reader auth failed \u2014 check JINA_API_KEY" : "Jina Reader quota exceeded",
|
|
86
|
+
retryable: false,
|
|
87
|
+
statusCode: response.status
|
|
88
|
+
}
|
|
89
|
+
};
|
|
90
|
+
}
|
|
91
|
+
if (response.status === 404) {
|
|
92
|
+
return {
|
|
93
|
+
content: `Jina could not fetch the target URL (404)`,
|
|
94
|
+
statusCode: 404,
|
|
95
|
+
credits: 0,
|
|
96
|
+
error: {
|
|
97
|
+
code: ErrorCode.NOT_FOUND,
|
|
98
|
+
message: "Target URL not reachable by Jina Reader",
|
|
99
|
+
retryable: false,
|
|
100
|
+
statusCode: 404
|
|
101
|
+
}
|
|
102
|
+
};
|
|
103
|
+
}
|
|
104
|
+
if (response.status === 429 || response.status >= 500) {
|
|
105
|
+
lastError = classifyError({ status: response.status, message: content.slice(0, 200) });
|
|
106
|
+
if (attempt < MAX_RETRIES) {
|
|
107
|
+
const delayMs = calculateBackoff(attempt);
|
|
108
|
+
mcpLog(
|
|
109
|
+
"warning",
|
|
110
|
+
`Jina ${response.status} on attempt ${attempt + 1}/${MAX_RETRIES + 1}. Retrying in ${delayMs}ms`,
|
|
111
|
+
"jina"
|
|
112
|
+
);
|
|
113
|
+
await sleep(delayMs);
|
|
114
|
+
continue;
|
|
115
|
+
}
|
|
116
|
+
return {
|
|
117
|
+
content: `Jina Reader error (${response.status}): ${content.slice(0, 200)}`,
|
|
118
|
+
statusCode: response.status,
|
|
119
|
+
credits: 0,
|
|
120
|
+
error: lastError
|
|
121
|
+
};
|
|
122
|
+
}
|
|
123
|
+
return {
|
|
124
|
+
content: `Jina Reader error (${response.status}): ${content.slice(0, 200)}`,
|
|
125
|
+
statusCode: response.status,
|
|
126
|
+
credits: 0,
|
|
127
|
+
error: {
|
|
128
|
+
code: ErrorCode.INVALID_INPUT,
|
|
129
|
+
message: `Jina Reader returned ${response.status}`,
|
|
130
|
+
retryable: false,
|
|
131
|
+
statusCode: response.status
|
|
132
|
+
}
|
|
133
|
+
};
|
|
134
|
+
} catch (error) {
|
|
135
|
+
lastError = classifyError(error);
|
|
136
|
+
if (lastError.retryable && attempt < MAX_RETRIES) {
|
|
137
|
+
const delayMs = calculateBackoff(attempt);
|
|
138
|
+
mcpLog(
|
|
139
|
+
"warning",
|
|
140
|
+
`Jina ${lastError.code}: ${lastError.message}. Retry ${attempt + 1}/${MAX_RETRIES + 1} in ${delayMs}ms`,
|
|
141
|
+
"jina"
|
|
142
|
+
);
|
|
143
|
+
await sleep(delayMs);
|
|
144
|
+
continue;
|
|
145
|
+
}
|
|
146
|
+
return {
|
|
147
|
+
content: `Jina Reader failed: ${lastError.message}`,
|
|
148
|
+
statusCode: lastError.statusCode ?? 500,
|
|
149
|
+
credits: 0,
|
|
150
|
+
error: lastError
|
|
151
|
+
};
|
|
152
|
+
}
|
|
153
|
+
}
|
|
154
|
+
return {
|
|
155
|
+
content: `Jina Reader failed after ${MAX_RETRIES + 1} attempts: ${lastError?.message ?? "Unknown error"}`,
|
|
156
|
+
statusCode: lastError?.statusCode ?? 500,
|
|
157
|
+
credits: 0,
|
|
158
|
+
error: lastError ?? { code: ErrorCode.UNKNOWN_ERROR, message: "All retries exhausted", retryable: false }
|
|
159
|
+
};
|
|
160
|
+
}
|
|
161
|
+
}
|
|
162
|
+
export {
|
|
163
|
+
JinaClient
|
|
164
|
+
};
|
|
165
|
+
//# sourceMappingURL=jina.js.map
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
{
|
|
2
|
+
"version": 3,
|
|
3
|
+
"sources": ["../../../src/clients/jina.ts"],
|
|
4
|
+
"sourcesContent": ["/**\n * Jina Reader Client\n *\n * Converts any URL (including PDFs, DOCX, PPTX, HTML) into clean markdown via\n * the public `https://r.jina.ai/<url>` endpoint. Used by `scrape-links` for\n * document formats that our HTML-assumed pipeline (Scrape.do + Readability +\n * Turndown) cannot decode.\n *\n * NEVER throws \u2014 every failure surfaces as a classified `StructuredError`\n * in the returned response, matching the shape of `ScraperClient.scrape`.\n *\n * Auth: optional `JINA_API_KEY` raises the rate limit from 20 RPM to 200+ RPM.\n * Without a key the endpoint still works; we just retry more aggressively on\n * 429 responses.\n */\n\nimport {\n classifyError,\n fetchWithTimeout,\n sleep,\n ErrorCode,\n type StructuredError,\n} from '../utils/errors.js';\nimport { calculateBackoff } from '../utils/retry.js';\nimport { mcpLog } from '../utils/logger.js';\n\n// \u2500\u2500 Constants \u2500\u2500\n\nconst JINA_READER_BASE = 'https://r.jina.ai/' as const;\nconst DEFAULT_TIMEOUT_MS = 60_000 as const; // Jina can take a while for large PDFs\nconst MAX_RETRIES = 2 as const;\n\n// \u2500\u2500 Interfaces \u2500\u2500\n\nexport interface JinaConvertRequest {\n readonly url: string;\n readonly timeoutMs?: number;\n}\n\nexport interface JinaConvertResponse {\n readonly content: string;\n readonly statusCode: number;\n /** Always 0 \u2014 Jina is a separate service from Scrape.do's credit pool. 
*/\n readonly credits: 0;\n readonly usageTokens?: number;\n readonly error?: StructuredError;\n}\n\n// \u2500\u2500 Client \u2500\u2500\n\nexport class JinaClient {\n private readonly apiKey: string | undefined;\n\n constructor(apiKey?: string) {\n const fromEnv = process.env.JINA_API_KEY?.trim();\n this.apiKey = apiKey?.trim() || fromEnv || undefined;\n }\n\n /**\n * Convert a URL to markdown via Jina Reader.\n * NEVER throws \u2014 always returns a JinaConvertResponse (possibly with error).\n */\n async convert(request: JinaConvertRequest): Promise<JinaConvertResponse> {\n const { url, timeoutMs = DEFAULT_TIMEOUT_MS } = request;\n\n try {\n new URL(url);\n } catch {\n return {\n content: `Invalid URL: ${url}`,\n statusCode: 400,\n credits: 0,\n error: { code: ErrorCode.INVALID_INPUT, message: `Invalid URL: ${url}`, retryable: false },\n };\n }\n\n // Jina Reader parses the full target URL as the path suffix. Query strings\n // and fragments in the target are preserved verbatim; no encoding needed.\n const jinaUrl = `${JINA_READER_BASE}${url}`;\n\n const headers: Record<string, string> = {\n Accept: 'text/markdown',\n };\n if (this.apiKey) {\n headers['Authorization'] = `Bearer ${this.apiKey}`;\n }\n\n let lastError: StructuredError | undefined;\n\n for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) {\n try {\n const response = await fetchWithTimeout(jinaUrl, {\n method: 'GET',\n headers,\n timeoutMs,\n });\n\n let content: string;\n try {\n content = await response.text();\n } catch (readError) {\n content = `Failed to read Jina response: ${readError instanceof Error ? readError.message : String(readError)}`;\n }\n\n const usageHeader = response.headers.get('x-usage-tokens');\n const usageTokens = usageHeader ? Number(usageHeader) : undefined;\n\n if (response.ok) {\n if (!content.trim()) {\n return {\n content: 'Jina returned an empty body',\n statusCode: response.status,\n credits: 0,\n usageTokens: Number.isFinite(usageTokens) ? 
usageTokens : undefined,\n error: {\n code: ErrorCode.UNSUPPORTED_BINARY_CONTENT,\n message: 'Jina Reader returned empty content for this URL',\n retryable: false,\n },\n };\n }\n return {\n content,\n statusCode: response.status,\n credits: 0,\n usageTokens: Number.isFinite(usageTokens) ? usageTokens : undefined,\n };\n }\n\n // 401/403 \u2014 auth or quota problems. Not retryable.\n if (response.status === 401 || response.status === 403) {\n return {\n content: `Jina auth/quota error (${response.status}): ${content.slice(0, 200)}`,\n statusCode: response.status,\n credits: 0,\n error: {\n code: response.status === 401 ? ErrorCode.AUTH_ERROR : ErrorCode.QUOTA_EXCEEDED,\n message: response.status === 401\n ? 'Jina Reader auth failed \u2014 check JINA_API_KEY'\n : 'Jina Reader quota exceeded',\n retryable: false,\n statusCode: response.status,\n },\n };\n }\n\n // 404 \u2014 the target URL itself was not found by Jina.\n if (response.status === 404) {\n return {\n content: `Jina could not fetch the target URL (404)`,\n statusCode: 404,\n credits: 0,\n error: {\n code: ErrorCode.NOT_FOUND,\n message: 'Target URL not reachable by Jina Reader',\n retryable: false,\n statusCode: 404,\n },\n };\n }\n\n // 429 / 5xx \u2014 retryable.\n if (response.status === 429 || response.status >= 500) {\n lastError = classifyError({ status: response.status, message: content.slice(0, 200) });\n if (attempt < MAX_RETRIES) {\n const delayMs = calculateBackoff(attempt);\n mcpLog(\n 'warning',\n `Jina ${response.status} on attempt ${attempt + 1}/${MAX_RETRIES + 1}. 
Retrying in ${delayMs}ms`,\n 'jina',\n );\n await sleep(delayMs);\n continue;\n }\n return {\n content: `Jina Reader error (${response.status}): ${content.slice(0, 200)}`,\n statusCode: response.status,\n credits: 0,\n error: lastError,\n };\n }\n\n // Anything else \u2014 treat as non-retryable client error.\n return {\n content: `Jina Reader error (${response.status}): ${content.slice(0, 200)}`,\n statusCode: response.status,\n credits: 0,\n error: {\n code: ErrorCode.INVALID_INPUT,\n message: `Jina Reader returned ${response.status}`,\n retryable: false,\n statusCode: response.status,\n },\n };\n } catch (error) {\n lastError = classifyError(error);\n if (lastError.retryable && attempt < MAX_RETRIES) {\n const delayMs = calculateBackoff(attempt);\n mcpLog(\n 'warning',\n `Jina ${lastError.code}: ${lastError.message}. Retry ${attempt + 1}/${MAX_RETRIES + 1} in ${delayMs}ms`,\n 'jina',\n );\n await sleep(delayMs);\n continue;\n }\n return {\n content: `Jina Reader failed: ${lastError.message}`,\n statusCode: lastError.statusCode ?? 500,\n credits: 0,\n error: lastError,\n };\n }\n }\n\n return {\n content: `Jina Reader failed after ${MAX_RETRIES + 1} attempts: ${lastError?.message ?? 'Unknown error'}`,\n statusCode: lastError?.statusCode ?? 500,\n credits: 0,\n error: lastError ?? { code: ErrorCode.UNKNOWN_ERROR, message: 'All retries exhausted', retryable: false },\n };\n }\n}\n"],
|
|
5
|
+
"mappings": "AAgBA;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,OAEK;AACP,SAAS,wBAAwB;AACjC,SAAS,cAAc;AAIvB,MAAM,mBAAmB;AACzB,MAAM,qBAAqB;AAC3B,MAAM,cAAc;AAoBb,MAAM,WAAW;AAAA,EACL;AAAA,EAEjB,YAAY,QAAiB;AAC3B,UAAM,UAAU,QAAQ,IAAI,cAAc,KAAK;AAC/C,SAAK,SAAS,QAAQ,KAAK,KAAK,WAAW;AAAA,EAC7C;AAAA;AAAA;AAAA;AAAA;AAAA,EAMA,MAAM,QAAQ,SAA2D;AACvE,UAAM,EAAE,KAAK,YAAY,mBAAmB,IAAI;AAEhD,QAAI;AACF,UAAI,IAAI,GAAG;AAAA,IACb,QAAQ;AACN,aAAO;AAAA,QACL,SAAS,gBAAgB,GAAG;AAAA,QAC5B,YAAY;AAAA,QACZ,SAAS;AAAA,QACT,OAAO,EAAE,MAAM,UAAU,eAAe,SAAS,gBAAgB,GAAG,IAAI,WAAW,MAAM;AAAA,MAC3F;AAAA,IACF;AAIA,UAAM,UAAU,GAAG,gBAAgB,GAAG,GAAG;AAEzC,UAAM,UAAkC;AAAA,MACtC,QAAQ;AAAA,IACV;AACA,QAAI,KAAK,QAAQ;AACf,cAAQ,eAAe,IAAI,UAAU,KAAK,MAAM;AAAA,IAClD;AAEA,QAAI;AAEJ,aAAS,UAAU,GAAG,WAAW,aAAa,WAAW;AACvD,UAAI;AACF,cAAM,WAAW,MAAM,iBAAiB,SAAS;AAAA,UAC/C,QAAQ;AAAA,UACR;AAAA,UACA;AAAA,QACF,CAAC;AAED,YAAI;AACJ,YAAI;AACF,oBAAU,MAAM,SAAS,KAAK;AAAA,QAChC,SAAS,WAAW;AAClB,oBAAU,iCAAiC,qBAAqB,QAAQ,UAAU,UAAU,OAAO,SAAS,CAAC;AAAA,QAC/G;AAEA,cAAM,cAAc,SAAS,QAAQ,IAAI,gBAAgB;AACzD,cAAM,cAAc,cAAc,OAAO,WAAW,IAAI;AAExD,YAAI,SAAS,IAAI;AACf,cAAI,CAAC,QAAQ,KAAK,GAAG;AACnB,mBAAO;AAAA,cACL,SAAS;AAAA,cACT,YAAY,SAAS;AAAA,cACrB,SAAS;AAAA,cACT,aAAa,OAAO,SAAS,WAAW,IAAI,cAAc;AAAA,cAC1D,OAAO;AAAA,gBACL,MAAM,UAAU;AAAA,gBAChB,SAAS;AAAA,gBACT,WAAW;AAAA,cACb;AAAA,YACF;AAAA,UACF;AACA,iBAAO;AAAA,YACL;AAAA,YACA,YAAY,SAAS;AAAA,YACrB,SAAS;AAAA,YACT,aAAa,OAAO,SAAS,WAAW,IAAI,cAAc;AAAA,UAC5D;AAAA,QACF;AAGA,YAAI,SAAS,WAAW,OAAO,SAAS,WAAW,KAAK;AACtD,iBAAO;AAAA,YACL,SAAS,0BAA0B,SAAS,MAAM,MAAM,QAAQ,MAAM,GAAG,GAAG,CAAC;AAAA,YAC7E,YAAY,SAAS;AAAA,YACrB,SAAS;AAAA,YACT,OAAO;AAAA,cACL,MAAM,SAAS,WAAW,MAAM,UAAU,aAAa,UAAU;AAAA,cACjE,SAAS,SAAS,WAAW,MACzB,sDACA;AAAA,cACJ,WAAW;AAAA,cACX,YAAY,SAAS;AAAA,YACvB;AAAA,UACF;AAAA,QACF;AAGA,YAAI,SAAS,WAAW,KAAK;AAC3B,iBAAO;AAAA,YACL,SAAS;AAAA,YACT,YAAY;AAAA,YACZ,SAAS;AAAA,YACT,OAAO;AAAA,cACL,MAAM,UAAU;AAAA,cAChB,SAAS;AAAA,cACT,WAAW;AAAA,cACX,YAAY;AAAA,YACd;AAAA,UACF;AAAA,QACF;AAGA,YAAI,SAAS,WAAW,OAAO,SAAS,UAAU,KAAK;AACrD,sBAAY,cAAc,
EAAE,QAAQ,SAAS,QAAQ,SAAS,QAAQ,MAAM,GAAG,GAAG,EAAE,CAAC;AACrF,cAAI,UAAU,aAAa;AACzB,kBAAM,UAAU,iBAAiB,OAAO;AACxC;AAAA,cACE;AAAA,cACA,QAAQ,SAAS,MAAM,eAAe,UAAU,CAAC,IAAI,cAAc,CAAC,iBAAiB,OAAO;AAAA,cAC5F;AAAA,YACF;AACA,kBAAM,MAAM,OAAO;AACnB;AAAA,UACF;AACA,iBAAO;AAAA,YACL,SAAS,sBAAsB,SAAS,MAAM,MAAM,QAAQ,MAAM,GAAG,GAAG,CAAC;AAAA,YACzE,YAAY,SAAS;AAAA,YACrB,SAAS;AAAA,YACT,OAAO;AAAA,UACT;AAAA,QACF;AAGA,eAAO;AAAA,UACL,SAAS,sBAAsB,SAAS,MAAM,MAAM,QAAQ,MAAM,GAAG,GAAG,CAAC;AAAA,UACzE,YAAY,SAAS;AAAA,UACrB,SAAS;AAAA,UACT,OAAO;AAAA,YACL,MAAM,UAAU;AAAA,YAChB,SAAS,wBAAwB,SAAS,MAAM;AAAA,YAChD,WAAW;AAAA,YACX,YAAY,SAAS;AAAA,UACvB;AAAA,QACF;AAAA,MACF,SAAS,OAAO;AACd,oBAAY,cAAc,KAAK;AAC/B,YAAI,UAAU,aAAa,UAAU,aAAa;AAChD,gBAAM,UAAU,iBAAiB,OAAO;AACxC;AAAA,YACE;AAAA,YACA,QAAQ,UAAU,IAAI,KAAK,UAAU,OAAO,WAAW,UAAU,CAAC,IAAI,cAAc,CAAC,OAAO,OAAO;AAAA,YACnG;AAAA,UACF;AACA,gBAAM,MAAM,OAAO;AACnB;AAAA,QACF;AACA,eAAO;AAAA,UACL,SAAS,uBAAuB,UAAU,OAAO;AAAA,UACjD,YAAY,UAAU,cAAc;AAAA,UACpC,SAAS;AAAA,UACT,OAAO;AAAA,QACT;AAAA,MACF;AAAA,IACF;AAEA,WAAO;AAAA,MACL,SAAS,4BAA4B,cAAc,CAAC,cAAc,WAAW,WAAW,eAAe;AAAA,MACvG,YAAY,WAAW,cAAc;AAAA,MACrC,SAAS;AAAA,MACT,OAAO,aAAa,EAAE,MAAM,UAAU,eAAe,SAAS,yBAAyB,WAAW,MAAM;AAAA,IAC1G;AAAA,EACF;AACF;",
|
|
6
|
+
"names": []
|
|
7
|
+
}
|
|
@@ -8,6 +8,7 @@ import {
|
|
|
8
8
|
import { calculateBackoff } from "../utils/retry.js";
|
|
9
9
|
import { pMapSettled } from "../utils/concurrency.js";
|
|
10
10
|
import { mcpLog } from "../utils/logger.js";
|
|
11
|
+
import { isBinaryDocumentContentType } from "../utils/source-type.js";
|
|
11
12
|
const SCRAPE_MODES = ["basic", "javascript", "javascript_geo"];
|
|
12
13
|
const CREDIT_COSTS = { basic: 1, javascript: 5, javascript_geo: 5 };
|
|
13
14
|
const SCRAPE_BATCH_SIZE = 30;
|
|
@@ -76,6 +77,26 @@ class ScraperClient {
|
|
|
76
77
|
content = `Failed to read response: ${readError instanceof Error ? readError.message : String(readError)}`;
|
|
77
78
|
}
|
|
78
79
|
if (response.ok) {
|
|
80
|
+
const contentType = response.headers.get("content-type");
|
|
81
|
+
if (isBinaryDocumentContentType(contentType)) {
|
|
82
|
+
mcpLog(
|
|
83
|
+
"info",
|
|
84
|
+
`Binary document detected at ${url} (content-type: ${contentType}). Deferring to Jina Reader.`,
|
|
85
|
+
"scraper"
|
|
86
|
+
);
|
|
87
|
+
return {
|
|
88
|
+
content: `Binary document (${contentType ?? "unknown"}); routed to Jina Reader`,
|
|
89
|
+
statusCode: 415,
|
|
90
|
+
credits: 0,
|
|
91
|
+
headers: Object.fromEntries(response.headers.entries()),
|
|
92
|
+
error: {
|
|
93
|
+
code: ErrorCode.UNSUPPORTED_BINARY_CONTENT,
|
|
94
|
+
message: `Scrape.do cannot decode ${contentType ?? "this binary content-type"}`,
|
|
95
|
+
retryable: false,
|
|
96
|
+
statusCode: 415
|
|
97
|
+
}
|
|
98
|
+
};
|
|
99
|
+
}
|
|
79
100
|
return {
|
|
80
101
|
content,
|
|
81
102
|
statusCode: response.status,
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"version": 3,
|
|
3
3
|
"sources": ["../../../src/clients/scraper.ts"],
|
|
4
|
-
"sourcesContent": ["/**\n * Web Scraper Client\n * Generic interface for URL scraping with automatic fallback modes\n * Implements robust error handling that NEVER crashes\n */\n\nimport { parseEnv, SCRAPER, CONCURRENCY } from '../config/index.js';\nimport {\n classifyError,\n fetchWithTimeout,\n sleep,\n ErrorCode,\n type StructuredError,\n} from '../utils/errors.js';\nimport { calculateBackoff } from '../utils/retry.js';\nimport { pMapSettled } from '../utils/concurrency.js';\nimport { mcpLog } from '../utils/logger.js';\n\n// \u2500\u2500 Constants \u2500\u2500\n\nconst SCRAPE_MODES = ['basic', 'javascript', 'javascript_geo'] as const;\ntype ScrapeMode = typeof SCRAPE_MODES[number];\n\nconst CREDIT_COSTS: Record<string, number> = { basic: 1, javascript: 5, javascript_geo: 5 } as const;\nconst SCRAPE_BATCH_SIZE = 30 as const;\nconst MAX_RETRIES = 1 as const;\n/** Overall timeout for all fallback attempts on a single URL */\nconst FALLBACK_OVERALL_TIMEOUT_MS = 30_000 as const;\n\n// \u2500\u2500 Interfaces \u2500\u2500\n\ninterface ScrapeRequest {\n readonly url: string;\n readonly mode?: 'basic' | 'javascript';\n readonly timeout?: number;\n readonly country?: string;\n}\n\ninterface ScrapeResponse {\n readonly content: string;\n readonly statusCode: number;\n readonly credits: number;\n readonly headers?: Record<string, string>;\n readonly error?: StructuredError;\n}\n\ninterface BatchScrapeResult {\n readonly results: ReadonlyArray<ScrapeResponse & { readonly url: string }>;\n readonly batchesProcessed: number;\n readonly totalAttempted: number;\n readonly rateLimitHits: number;\n}\n\n// Status codes that indicate we should retry (no credit consumed)\nconst RETRYABLE_STATUS_CODES = new Set([429, 502, 503, 504, 510]);\n// Status codes that are permanent failures (don't retry)\nconst PERMANENT_FAILURE_CODES = new Set([400, 401, 403]);\n\n/** Minimum stripped-text length to consider a scrape successful (filters out empty SPA shells) */\nconst 
MIN_USEFUL_CONTENT_LENGTH = 200 as const;\n\n/** Fallback attempt descriptor used by scrapeWithFallback */\ninterface FallbackAttempt {\n readonly mode: 'basic' | 'javascript';\n readonly country?: string;\n readonly description: string;\n}\n\nconst FALLBACK_ATTEMPTS: readonly FallbackAttempt[] = [\n { mode: 'basic', description: 'basic mode' },\n { mode: 'javascript', description: 'javascript rendering' },\n { mode: 'javascript', country: 'us', description: 'javascript + US geo-targeting' },\n] as const;\n\nexport class ScraperClient {\n private apiKey: string;\n private baseURL = 'https://api.scrape.do';\n\n constructor(apiKey?: string) {\n const env = parseEnv();\n this.apiKey = apiKey || env.SCRAPER_API_KEY;\n\n if (!this.apiKey) {\n throw new Error('Web scraping capability is not configured. Please set up the required API credentials.');\n }\n }\n\n /**\n * Scrape a single URL with retry logic\n * NEVER throws - always returns a ScrapeResponse (possibly with error)\n */\n async scrape(request: ScrapeRequest, maxRetries = MAX_RETRIES): Promise<ScrapeResponse> {\n const { url, mode = 'basic', timeout = 15, country } = request;\n const credits = CREDIT_COSTS[mode] ?? 
1;\n\n // Validate URL first\n try {\n new URL(url);\n } catch {\n return {\n content: `Invalid URL: ${url}`,\n statusCode: 400,\n credits: 0,\n error: { code: ErrorCode.INVALID_INPUT, message: `Invalid URL: ${url}`, retryable: false },\n };\n }\n\n const params = new URLSearchParams({\n url: url,\n token: this.apiKey,\n timeout: String(timeout * 1000),\n });\n\n if (mode === 'javascript') {\n params.append('render', 'true');\n }\n\n if (country) {\n params.append('geoCode', country.toUpperCase());\n }\n\n const apiUrl = `${this.baseURL}?${params.toString()}`;\n let lastError: StructuredError | undefined;\n\n for (let attempt = 0; attempt < maxRetries; attempt++) {\n try {\n // Use AbortController for timeout\n const timeoutMs = (timeout + 5) * 1000; // Add 5s buffer over scrape timeout\n const response = await fetchWithTimeout(apiUrl, {\n method: 'GET',\n headers: { Accept: 'text/html,application/json' },\n timeoutMs,\n });\n\n // Safely read response body\n let content: string;\n try {\n content = await response.text();\n } catch (readError) {\n content = `Failed to read response: ${readError instanceof Error ? readError.message : String(readError)}`;\n }\n\n // SUCCESS: 2xx - Successful API call\n if (response.ok) {\n return {\n content,\n statusCode: response.status,\n credits,\n headers: Object.fromEntries(response.headers.entries()),\n };\n }\n\n // 404 - Target not found (permanent, but not an error for our purposes)\n if (response.status === 404) {\n return {\n content: '404 - Page not found',\n statusCode: 404,\n credits,\n };\n }\n\n // Permanent failures - don't retry\n if (PERMANENT_FAILURE_CODES.has(response.status)) {\n const errorMsg = response.status === 401\n ? 'No credits remaining or subscription suspended'\n : `Request failed with status ${response.status}`;\n return {\n content: `Error: ${errorMsg}`,\n statusCode: response.status,\n credits: 0,\n error: {\n code: response.status === 401 ? 
ErrorCode.AUTH_ERROR : ErrorCode.INVALID_INPUT,\n message: errorMsg,\n retryable: false,\n statusCode: response.status,\n },\n };\n }\n\n // Retryable status codes\n if (RETRYABLE_STATUS_CODES.has(response.status)) {\n lastError = {\n code: response.status === 429 ? ErrorCode.RATE_LIMITED : ErrorCode.SERVICE_UNAVAILABLE,\n message: `Server returned ${response.status}`,\n retryable: true,\n statusCode: response.status,\n };\n\n if (attempt < maxRetries - 1) {\n const delayMs = calculateBackoff(attempt);\n mcpLog('warning', `${response.status} on attempt ${attempt + 1}/${maxRetries}. Retrying in ${delayMs}ms`, 'scraper');\n await sleep(delayMs);\n continue;\n }\n }\n\n // Other non-success status - treat as retryable\n lastError = classifyError({ status: response.status, message: content });\n if (attempt < maxRetries - 1 && lastError.retryable) {\n const delayMs = calculateBackoff(attempt);\n mcpLog('warning', `Status ${response.status}. Retrying in ${delayMs}ms`, 'scraper');\n await sleep(delayMs);\n continue;\n }\n\n // Final attempt failed\n return {\n content: `Error: ${lastError.message}`,\n statusCode: response.status,\n credits: 0,\n error: lastError,\n };\n\n } catch (error) {\n lastError = classifyError(error);\n\n // Non-retryable errors - return immediately\n if (!lastError.retryable) {\n return {\n content: `Error: ${lastError.message}`,\n statusCode: lastError.statusCode || 500,\n credits: 0,\n error: lastError,\n };\n }\n\n // Retryable error - continue if attempts remaining\n if (attempt < maxRetries - 1) {\n const delayMs = calculateBackoff(attempt);\n mcpLog('warning', `${lastError.code}: ${lastError.message}. Retry ${attempt + 1}/${maxRetries} in ${delayMs}ms`, 'scraper');\n await sleep(delayMs);\n continue;\n }\n }\n }\n\n // All retries exhausted\n return {\n content: `Error: Failed after ${maxRetries} attempts. 
${lastError?.message || 'Unknown error'}`,\n statusCode: lastError?.statusCode || 500,\n credits: 0,\n error: lastError || { code: ErrorCode.UNKNOWN_ERROR, message: 'All retries exhausted', retryable: false },\n };\n }\n\n /**\n * Scrape with automatic fallback through different modes\n * NEVER throws - always returns a ScrapeResponse\n */\n async scrapeWithFallback(url: string, options: { timeout?: number } = {}): Promise<ScrapeResponse> {\n const attemptResults: string[] = [];\n let lastResult: ScrapeResponse | null = null;\n const deadline = Date.now() + FALLBACK_OVERALL_TIMEOUT_MS;\n\n for (const attempt of FALLBACK_ATTEMPTS) {\n // Check overall deadline before starting next fallback\n if (Date.now() >= deadline) {\n mcpLog('warning', `Overall fallback timeout reached for ${url} after ${attemptResults.length} attempt(s)`, 'scraper');\n break;\n }\n\n const result = await this.tryFallbackAttempt(url, attempt, options);\n\n if (result.done) {\n if (attemptResults.length > 0) {\n mcpLog('info', `Success with ${attempt.description} after ${attemptResults.length} fallback(s)`, 'scraper');\n }\n return result.response;\n }\n\n lastResult = result.response;\n attemptResults.push(`${attempt.description}: ${result.response.error?.message || result.response.statusCode}`);\n mcpLog('warning', `Failed with ${attempt.description} (${result.response.statusCode}), trying next fallback...`, 'scraper');\n }\n\n // All fallbacks exhausted or deadline reached\n const errorMessage = `Failed after ${attemptResults.length} fallback attempt(s): ${attemptResults.join('; ')}`;\n return {\n content: `Error: ${errorMessage}`,\n statusCode: lastResult?.statusCode || 500,\n credits: 0,\n error: {\n code: ErrorCode.SERVICE_UNAVAILABLE,\n message: errorMessage,\n retryable: false,\n },\n };\n }\n\n /**\n * Execute a single fallback attempt and determine whether to continue.\n * Returns { done: true } on success/terminal or { done: false } to try the next mode.\n */\n private async 
tryFallbackAttempt(\n url: string,\n attempt: FallbackAttempt,\n options: { timeout?: number },\n ): Promise<{ done: boolean; response: ScrapeResponse }> {\n const result = await this.scrape({\n url,\n mode: attempt.mode,\n timeout: options.timeout,\n country: attempt.country,\n });\n\n // Success \u2014 but verify content isn't an empty SPA shell\n if (result.statusCode >= 200 && result.statusCode < 300 && !result.error) {\n const strippedLength = result.content.replace(/<[^>]*>/g, '').trim().length;\n if (strippedLength < MIN_USEFUL_CONTENT_LENGTH && attempt.mode === 'basic') {\n mcpLog('info', `Basic mode returned only ${strippedLength} chars of text for ${url} \u2014 trying JS rendering`, 'scraper');\n return { done: false, response: result };\n }\n return { done: true, response: result };\n }\n\n // 404 is a valid response, not an error\n if (result.statusCode === 404) {\n return { done: true, response: result };\n }\n\n // 502 Bad Gateway \u2014 almost always a WAF/CDN block, not a transient issue.\n // Switching render mode won't bypass CDN protection, so fail fast.\n if (result.statusCode === 502) {\n mcpLog('warning', `502 Bad Gateway for ${url} \u2014 likely WAF/CDN block, skipping fallback modes`, 'scraper');\n return { done: true, response: {\n ...result,\n error: {\n code: ErrorCode.SERVICE_UNAVAILABLE,\n message: 'Bad gateway \u2014 site is blocking automated access',\n retryable: false,\n },\n }};\n }\n\n // Non-retryable errors - don't try other modes\n if (result.error && !result.error.retryable) {\n mcpLog('error', `Non-retryable error with ${attempt.description}: ${result.error.message}`, 'scraper');\n return { done: true, response: result };\n }\n\n return { done: false, response: result };\n }\n\n /**\n * Scrape multiple URLs with batching\n * NEVER throws - always returns results array\n */\n async scrapeMultiple(urls: string[], options: { timeout?: number } = {}): Promise<Array<ScrapeResponse & { url: string }>> {\n if (urls.length === 0) {\n 
return [];\n }\n\n if (urls.length <= SCRAPE_BATCH_SIZE) {\n return this.processBatch(urls, options);\n }\n\n const result = await this.batchScrape(urls, options);\n return result.results as Array<ScrapeResponse & { url: string }>;\n }\n\n /**\n * Batch scrape with progress callback\n * NEVER throws - uses Promise.allSettled internally\n */\n async batchScrape(\n urls: string[],\n options: { timeout?: number } = {},\n onBatchComplete?: (batchNum: number, totalBatches: number, processed: number) => void\n ): Promise<BatchScrapeResult> {\n const totalBatches = Math.ceil(urls.length / SCRAPE_BATCH_SIZE);\n const allResults: Array<ScrapeResponse & { url: string }> = [];\n let rateLimitHits = 0;\n\n mcpLog('info', `Starting batch processing: ${urls.length} URLs in ${totalBatches} batch(es)`, 'scraper');\n\n for (let batchNum = 0; batchNum < totalBatches; batchNum++) {\n const startIdx = batchNum * SCRAPE_BATCH_SIZE;\n const endIdx = Math.min(startIdx + SCRAPE_BATCH_SIZE, urls.length);\n const batchUrls = urls.slice(startIdx, endIdx);\n\n mcpLog('info', `Processing batch ${batchNum + 1}/${totalBatches} (${batchUrls.length} URLs)`, 'scraper');\n\n const batchResults = await pMapSettled(\n batchUrls,\n url => this.scrapeWithFallback(url, options),\n CONCURRENCY.SCRAPER\n );\n\n for (let i = 0; i < batchResults.length; i++) {\n const result = batchResults[i];\n if (!result) continue;\n const url = batchUrls[i] ?? '';\n\n if (result.status === 'fulfilled') {\n const scrapeResult = result.value;\n allResults.push({ ...scrapeResult, url });\n\n // Track rate limits\n if (scrapeResult.error?.code === ErrorCode.RATE_LIMITED) {\n rateLimitHits++;\n }\n } else {\n // This shouldn't happen since scrapeWithFallback never throws,\n // but handle it gracefully just in case\n const errorMsg = result.reason instanceof Error ? 
result.reason.message : String(result.reason);\n mcpLog('error', `Unexpected rejection for ${url}: ${errorMsg}`, 'scraper');\n\n allResults.push({\n url,\n content: `Error: Unexpected failure - ${errorMsg}`,\n statusCode: 500,\n credits: 0,\n error: classifyError(result.reason),\n });\n }\n }\n\n // Safe callback invocation\n try {\n onBatchComplete?.(batchNum + 1, totalBatches, allResults.length);\n } catch (callbackError) {\n mcpLog('error', `onBatchComplete callback error: ${callbackError}`, 'scraper');\n }\n\n mcpLog('info', `Completed batch ${batchNum + 1}/${totalBatches} (${allResults.length}/${urls.length} total)`, 'scraper');\n\n // Adaptive delay between batches \u2014 back off harder under rate limiting\n if (batchNum < totalBatches - 1) {\n const batchDelay = rateLimitHits > 0 ? 2000 : 500;\n await sleep(batchDelay);\n }\n }\n\n return { results: allResults, batchesProcessed: totalBatches, totalAttempted: urls.length, rateLimitHits };\n }\n\n /**\n * Process a single batch of URLs\n * NEVER throws\n */\n private async processBatch(urls: string[], options: { timeout?: number }): Promise<Array<ScrapeResponse & { url: string }>> {\n const results = await pMapSettled(urls, url => this.scrapeWithFallback(url, options), CONCURRENCY.SCRAPER);\n\n return results.map((result, index) => {\n const url = urls[index] || '';\n\n if (result.status === 'fulfilled') {\n return { ...result.value, url };\n }\n\n // Shouldn't happen, but handle gracefully\n return {\n url,\n content: `Error: ${result.reason instanceof Error ? result.reason.message : String(result.reason)}`,\n statusCode: 500,\n credits: 0,\n error: classifyError(result.reason),\n };\n });\n }\n}\n"],
|
|
5
|
-
"mappings": "AAMA,SAAS,UAAmB,mBAAmB;AAC/C;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,OAEK;AACP,SAAS,wBAAwB;AACjC,SAAS,mBAAmB;AAC5B,SAAS,cAAc;
|
|
4
|
+
"sourcesContent": ["/**\n * Web Scraper Client\n * Generic interface for URL scraping with automatic fallback modes\n * Implements robust error handling that NEVER crashes\n */\n\nimport { parseEnv, SCRAPER, CONCURRENCY } from '../config/index.js';\nimport {\n classifyError,\n fetchWithTimeout,\n sleep,\n ErrorCode,\n type StructuredError,\n} from '../utils/errors.js';\nimport { calculateBackoff } from '../utils/retry.js';\nimport { pMapSettled } from '../utils/concurrency.js';\nimport { mcpLog } from '../utils/logger.js';\nimport { isBinaryDocumentContentType } from '../utils/source-type.js';\n\n// \u2500\u2500 Constants \u2500\u2500\n\nconst SCRAPE_MODES = ['basic', 'javascript', 'javascript_geo'] as const;\ntype ScrapeMode = typeof SCRAPE_MODES[number];\n\nconst CREDIT_COSTS: Record<string, number> = { basic: 1, javascript: 5, javascript_geo: 5 } as const;\nconst SCRAPE_BATCH_SIZE = 30 as const;\nconst MAX_RETRIES = 1 as const;\n/** Overall timeout for all fallback attempts on a single URL */\nconst FALLBACK_OVERALL_TIMEOUT_MS = 30_000 as const;\n\n// \u2500\u2500 Interfaces \u2500\u2500\n\ninterface ScrapeRequest {\n readonly url: string;\n readonly mode?: 'basic' | 'javascript';\n readonly timeout?: number;\n readonly country?: string;\n}\n\ninterface ScrapeResponse {\n readonly content: string;\n readonly statusCode: number;\n readonly credits: number;\n readonly headers?: Record<string, string>;\n readonly error?: StructuredError;\n}\n\ninterface BatchScrapeResult {\n readonly results: ReadonlyArray<ScrapeResponse & { readonly url: string }>;\n readonly batchesProcessed: number;\n readonly totalAttempted: number;\n readonly rateLimitHits: number;\n}\n\n// Status codes that indicate we should retry (no credit consumed)\nconst RETRYABLE_STATUS_CODES = new Set([429, 502, 503, 504, 510]);\n// Status codes that are permanent failures (don't retry)\nconst PERMANENT_FAILURE_CODES = new Set([400, 401, 403]);\n\n/** Minimum stripped-text length to consider a scrape 
successful (filters out empty SPA shells) */\nconst MIN_USEFUL_CONTENT_LENGTH = 200 as const;\n\n/** Fallback attempt descriptor used by scrapeWithFallback */\ninterface FallbackAttempt {\n readonly mode: 'basic' | 'javascript';\n readonly country?: string;\n readonly description: string;\n}\n\nconst FALLBACK_ATTEMPTS: readonly FallbackAttempt[] = [\n { mode: 'basic', description: 'basic mode' },\n { mode: 'javascript', description: 'javascript rendering' },\n { mode: 'javascript', country: 'us', description: 'javascript + US geo-targeting' },\n] as const;\n\nexport class ScraperClient {\n private apiKey: string;\n private baseURL = 'https://api.scrape.do';\n\n constructor(apiKey?: string) {\n const env = parseEnv();\n this.apiKey = apiKey || env.SCRAPER_API_KEY;\n\n if (!this.apiKey) {\n throw new Error('Web scraping capability is not configured. Please set up the required API credentials.');\n }\n }\n\n /**\n * Scrape a single URL with retry logic\n * NEVER throws - always returns a ScrapeResponse (possibly with error)\n */\n async scrape(request: ScrapeRequest, maxRetries = MAX_RETRIES): Promise<ScrapeResponse> {\n const { url, mode = 'basic', timeout = 15, country } = request;\n const credits = CREDIT_COSTS[mode] ?? 
1;\n\n // Validate URL first\n try {\n new URL(url);\n } catch {\n return {\n content: `Invalid URL: ${url}`,\n statusCode: 400,\n credits: 0,\n error: { code: ErrorCode.INVALID_INPUT, message: `Invalid URL: ${url}`, retryable: false },\n };\n }\n\n const params = new URLSearchParams({\n url: url,\n token: this.apiKey,\n timeout: String(timeout * 1000),\n });\n\n if (mode === 'javascript') {\n params.append('render', 'true');\n }\n\n if (country) {\n params.append('geoCode', country.toUpperCase());\n }\n\n const apiUrl = `${this.baseURL}?${params.toString()}`;\n let lastError: StructuredError | undefined;\n\n for (let attempt = 0; attempt < maxRetries; attempt++) {\n try {\n // Use AbortController for timeout\n const timeoutMs = (timeout + 5) * 1000; // Add 5s buffer over scrape timeout\n const response = await fetchWithTimeout(apiUrl, {\n method: 'GET',\n headers: { Accept: 'text/html,application/json' },\n timeoutMs,\n });\n\n // Safely read response body\n let content: string;\n try {\n content = await response.text();\n } catch (readError) {\n content = `Failed to read response: ${readError instanceof Error ? readError.message : String(readError)}`;\n }\n\n // SUCCESS: 2xx - Successful API call\n if (response.ok) {\n // Content-Type gate: if the origin served a binary document format\n // (PDF/DOCX/PPTX/XLSX/octet-stream), discard the body and surface\n // UNSUPPORTED_BINARY_CONTENT so the tool handler can reroute this\n // URL through the Jina Reader path. Reading binary as text produces\n // mojibake that silently passes Readability + Turndown (both of which\n // short-circuit on \"no `<` tag\") and contaminates the LLM.\n const contentType = response.headers.get('content-type');\n if (isBinaryDocumentContentType(contentType)) {\n mcpLog(\n 'info',\n `Binary document detected at ${url} (content-type: ${contentType}). Deferring to Jina Reader.`,\n 'scraper',\n );\n return {\n content: `Binary document (${contentType ?? 
'unknown'}); routed to Jina Reader`,\n statusCode: 415,\n credits: 0,\n headers: Object.fromEntries(response.headers.entries()),\n error: {\n code: ErrorCode.UNSUPPORTED_BINARY_CONTENT,\n message: `Scrape.do cannot decode ${contentType ?? 'this binary content-type'}`,\n retryable: false,\n statusCode: 415,\n },\n };\n }\n\n return {\n content,\n statusCode: response.status,\n credits,\n headers: Object.fromEntries(response.headers.entries()),\n };\n }\n\n // 404 - Target not found (permanent, but not an error for our purposes)\n if (response.status === 404) {\n return {\n content: '404 - Page not found',\n statusCode: 404,\n credits,\n };\n }\n\n // Permanent failures - don't retry\n if (PERMANENT_FAILURE_CODES.has(response.status)) {\n const errorMsg = response.status === 401\n ? 'No credits remaining or subscription suspended'\n : `Request failed with status ${response.status}`;\n return {\n content: `Error: ${errorMsg}`,\n statusCode: response.status,\n credits: 0,\n error: {\n code: response.status === 401 ? ErrorCode.AUTH_ERROR : ErrorCode.INVALID_INPUT,\n message: errorMsg,\n retryable: false,\n statusCode: response.status,\n },\n };\n }\n\n // Retryable status codes\n if (RETRYABLE_STATUS_CODES.has(response.status)) {\n lastError = {\n code: response.status === 429 ? ErrorCode.RATE_LIMITED : ErrorCode.SERVICE_UNAVAILABLE,\n message: `Server returned ${response.status}`,\n retryable: true,\n statusCode: response.status,\n };\n\n if (attempt < maxRetries - 1) {\n const delayMs = calculateBackoff(attempt);\n mcpLog('warning', `${response.status} on attempt ${attempt + 1}/${maxRetries}. Retrying in ${delayMs}ms`, 'scraper');\n await sleep(delayMs);\n continue;\n }\n }\n\n // Other non-success status - treat as retryable\n lastError = classifyError({ status: response.status, message: content });\n if (attempt < maxRetries - 1 && lastError.retryable) {\n const delayMs = calculateBackoff(attempt);\n mcpLog('warning', `Status ${response.status}. 
Retrying in ${delayMs}ms`, 'scraper');\n await sleep(delayMs);\n continue;\n }\n\n // Final attempt failed\n return {\n content: `Error: ${lastError.message}`,\n statusCode: response.status,\n credits: 0,\n error: lastError,\n };\n\n } catch (error) {\n lastError = classifyError(error);\n\n // Non-retryable errors - return immediately\n if (!lastError.retryable) {\n return {\n content: `Error: ${lastError.message}`,\n statusCode: lastError.statusCode || 500,\n credits: 0,\n error: lastError,\n };\n }\n\n // Retryable error - continue if attempts remaining\n if (attempt < maxRetries - 1) {\n const delayMs = calculateBackoff(attempt);\n mcpLog('warning', `${lastError.code}: ${lastError.message}. Retry ${attempt + 1}/${maxRetries} in ${delayMs}ms`, 'scraper');\n await sleep(delayMs);\n continue;\n }\n }\n }\n\n // All retries exhausted\n return {\n content: `Error: Failed after ${maxRetries} attempts. ${lastError?.message || 'Unknown error'}`,\n statusCode: lastError?.statusCode || 500,\n credits: 0,\n error: lastError || { code: ErrorCode.UNKNOWN_ERROR, message: 'All retries exhausted', retryable: false },\n };\n }\n\n /**\n * Scrape with automatic fallback through different modes\n * NEVER throws - always returns a ScrapeResponse\n */\n async scrapeWithFallback(url: string, options: { timeout?: number } = {}): Promise<ScrapeResponse> {\n const attemptResults: string[] = [];\n let lastResult: ScrapeResponse | null = null;\n const deadline = Date.now() + FALLBACK_OVERALL_TIMEOUT_MS;\n\n for (const attempt of FALLBACK_ATTEMPTS) {\n // Check overall deadline before starting next fallback\n if (Date.now() >= deadline) {\n mcpLog('warning', `Overall fallback timeout reached for ${url} after ${attemptResults.length} attempt(s)`, 'scraper');\n break;\n }\n\n const result = await this.tryFallbackAttempt(url, attempt, options);\n\n if (result.done) {\n if (attemptResults.length > 0) {\n mcpLog('info', `Success with ${attempt.description} after ${attemptResults.length} 
fallback(s)`, 'scraper');\n }\n return result.response;\n }\n\n lastResult = result.response;\n attemptResults.push(`${attempt.description}: ${result.response.error?.message || result.response.statusCode}`);\n mcpLog('warning', `Failed with ${attempt.description} (${result.response.statusCode}), trying next fallback...`, 'scraper');\n }\n\n // All fallbacks exhausted or deadline reached\n const errorMessage = `Failed after ${attemptResults.length} fallback attempt(s): ${attemptResults.join('; ')}`;\n return {\n content: `Error: ${errorMessage}`,\n statusCode: lastResult?.statusCode || 500,\n credits: 0,\n error: {\n code: ErrorCode.SERVICE_UNAVAILABLE,\n message: errorMessage,\n retryable: false,\n },\n };\n }\n\n /**\n * Execute a single fallback attempt and determine whether to continue.\n * Returns { done: true } on success/terminal or { done: false } to try the next mode.\n */\n private async tryFallbackAttempt(\n url: string,\n attempt: FallbackAttempt,\n options: { timeout?: number },\n ): Promise<{ done: boolean; response: ScrapeResponse }> {\n const result = await this.scrape({\n url,\n mode: attempt.mode,\n timeout: options.timeout,\n country: attempt.country,\n });\n\n // Success \u2014 but verify content isn't an empty SPA shell\n if (result.statusCode >= 200 && result.statusCode < 300 && !result.error) {\n const strippedLength = result.content.replace(/<[^>]*>/g, '').trim().length;\n if (strippedLength < MIN_USEFUL_CONTENT_LENGTH && attempt.mode === 'basic') {\n mcpLog('info', `Basic mode returned only ${strippedLength} chars of text for ${url} \u2014 trying JS rendering`, 'scraper');\n return { done: false, response: result };\n }\n return { done: true, response: result };\n }\n\n // 404 is a valid response, not an error\n if (result.statusCode === 404) {\n return { done: true, response: result };\n }\n\n // 502 Bad Gateway \u2014 almost always a WAF/CDN block, not a transient issue.\n // Switching render mode won't bypass CDN protection, so fail 
fast.\n if (result.statusCode === 502) {\n mcpLog('warning', `502 Bad Gateway for ${url} \u2014 likely WAF/CDN block, skipping fallback modes`, 'scraper');\n return { done: true, response: {\n ...result,\n error: {\n code: ErrorCode.SERVICE_UNAVAILABLE,\n message: 'Bad gateway \u2014 site is blocking automated access',\n retryable: false,\n },\n }};\n }\n\n // Non-retryable errors - don't try other modes\n if (result.error && !result.error.retryable) {\n mcpLog('error', `Non-retryable error with ${attempt.description}: ${result.error.message}`, 'scraper');\n return { done: true, response: result };\n }\n\n return { done: false, response: result };\n }\n\n /**\n * Scrape multiple URLs with batching\n * NEVER throws - always returns results array\n */\n async scrapeMultiple(urls: string[], options: { timeout?: number } = {}): Promise<Array<ScrapeResponse & { url: string }>> {\n if (urls.length === 0) {\n return [];\n }\n\n if (urls.length <= SCRAPE_BATCH_SIZE) {\n return this.processBatch(urls, options);\n }\n\n const result = await this.batchScrape(urls, options);\n return result.results as Array<ScrapeResponse & { url: string }>;\n }\n\n /**\n * Batch scrape with progress callback\n * NEVER throws - uses Promise.allSettled internally\n */\n async batchScrape(\n urls: string[],\n options: { timeout?: number } = {},\n onBatchComplete?: (batchNum: number, totalBatches: number, processed: number) => void\n ): Promise<BatchScrapeResult> {\n const totalBatches = Math.ceil(urls.length / SCRAPE_BATCH_SIZE);\n const allResults: Array<ScrapeResponse & { url: string }> = [];\n let rateLimitHits = 0;\n\n mcpLog('info', `Starting batch processing: ${urls.length} URLs in ${totalBatches} batch(es)`, 'scraper');\n\n for (let batchNum = 0; batchNum < totalBatches; batchNum++) {\n const startIdx = batchNum * SCRAPE_BATCH_SIZE;\n const endIdx = Math.min(startIdx + SCRAPE_BATCH_SIZE, urls.length);\n const batchUrls = urls.slice(startIdx, endIdx);\n\n mcpLog('info', `Processing batch 
${batchNum + 1}/${totalBatches} (${batchUrls.length} URLs)`, 'scraper');\n\n const batchResults = await pMapSettled(\n batchUrls,\n url => this.scrapeWithFallback(url, options),\n CONCURRENCY.SCRAPER\n );\n\n for (let i = 0; i < batchResults.length; i++) {\n const result = batchResults[i];\n if (!result) continue;\n const url = batchUrls[i] ?? '';\n\n if (result.status === 'fulfilled') {\n const scrapeResult = result.value;\n allResults.push({ ...scrapeResult, url });\n\n // Track rate limits\n if (scrapeResult.error?.code === ErrorCode.RATE_LIMITED) {\n rateLimitHits++;\n }\n } else {\n // This shouldn't happen since scrapeWithFallback never throws,\n // but handle it gracefully just in case\n const errorMsg = result.reason instanceof Error ? result.reason.message : String(result.reason);\n mcpLog('error', `Unexpected rejection for ${url}: ${errorMsg}`, 'scraper');\n\n allResults.push({\n url,\n content: `Error: Unexpected failure - ${errorMsg}`,\n statusCode: 500,\n credits: 0,\n error: classifyError(result.reason),\n });\n }\n }\n\n // Safe callback invocation\n try {\n onBatchComplete?.(batchNum + 1, totalBatches, allResults.length);\n } catch (callbackError) {\n mcpLog('error', `onBatchComplete callback error: ${callbackError}`, 'scraper');\n }\n\n mcpLog('info', `Completed batch ${batchNum + 1}/${totalBatches} (${allResults.length}/${urls.length} total)`, 'scraper');\n\n // Adaptive delay between batches \u2014 back off harder under rate limiting\n if (batchNum < totalBatches - 1) {\n const batchDelay = rateLimitHits > 0 ? 
2000 : 500;\n await sleep(batchDelay);\n }\n }\n\n return { results: allResults, batchesProcessed: totalBatches, totalAttempted: urls.length, rateLimitHits };\n }\n\n /**\n * Process a single batch of URLs\n * NEVER throws\n */\n private async processBatch(urls: string[], options: { timeout?: number }): Promise<Array<ScrapeResponse & { url: string }>> {\n const results = await pMapSettled(urls, url => this.scrapeWithFallback(url, options), CONCURRENCY.SCRAPER);\n\n return results.map((result, index) => {\n const url = urls[index] || '';\n\n if (result.status === 'fulfilled') {\n return { ...result.value, url };\n }\n\n // Shouldn't happen, but handle gracefully\n return {\n url,\n content: `Error: ${result.reason instanceof Error ? result.reason.message : String(result.reason)}`,\n statusCode: 500,\n credits: 0,\n error: classifyError(result.reason),\n };\n });\n }\n}\n"],
|
|
5
|
+
"mappings": "AAMA,SAAS,UAAmB,mBAAmB;AAC/C;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,OAEK;AACP,SAAS,wBAAwB;AACjC,SAAS,mBAAmB;AAC5B,SAAS,cAAc;AACvB,SAAS,mCAAmC;AAI5C,MAAM,eAAe,CAAC,SAAS,cAAc,gBAAgB;AAG7D,MAAM,eAAuC,EAAE,OAAO,GAAG,YAAY,GAAG,gBAAgB,EAAE;AAC1F,MAAM,oBAAoB;AAC1B,MAAM,cAAc;AAEpB,MAAM,8BAA8B;AA2BpC,MAAM,yBAAyB,oBAAI,IAAI,CAAC,KAAK,KAAK,KAAK,KAAK,GAAG,CAAC;AAEhE,MAAM,0BAA0B,oBAAI,IAAI,CAAC,KAAK,KAAK,GAAG,CAAC;AAGvD,MAAM,4BAA4B;AASlC,MAAM,oBAAgD;AAAA,EACpD,EAAE,MAAM,SAAS,aAAa,aAAa;AAAA,EAC3C,EAAE,MAAM,cAAc,aAAa,uBAAuB;AAAA,EAC1D,EAAE,MAAM,cAAc,SAAS,MAAM,aAAa,gCAAgC;AACpF;AAEO,MAAM,cAAc;AAAA,EACjB;AAAA,EACA,UAAU;AAAA,EAElB,YAAY,QAAiB;AAC3B,UAAM,MAAM,SAAS;AACrB,SAAK,SAAS,UAAU,IAAI;AAE5B,QAAI,CAAC,KAAK,QAAQ;AAChB,YAAM,IAAI,MAAM,wFAAwF;AAAA,IAC1G;AAAA,EACF;AAAA;AAAA;AAAA;AAAA;AAAA,EAMA,MAAM,OAAO,SAAwB,aAAa,aAAsC;AACtF,UAAM,EAAE,KAAK,OAAO,SAAS,UAAU,IAAI,QAAQ,IAAI;AACvD,UAAM,UAAU,aAAa,IAAI,KAAK;AAGtC,QAAI;AACF,UAAI,IAAI,GAAG;AAAA,IACb,QAAQ;AACN,aAAO;AAAA,QACL,SAAS,gBAAgB,GAAG;AAAA,QAC5B,YAAY;AAAA,QACZ,SAAS;AAAA,QACT,OAAO,EAAE,MAAM,UAAU,eAAe,SAAS,gBAAgB,GAAG,IAAI,WAAW,MAAM;AAAA,MAC3F;AAAA,IACF;AAEA,UAAM,SAAS,IAAI,gBAAgB;AAAA,MACjC;AAAA,MACA,OAAO,KAAK;AAAA,MACZ,SAAS,OAAO,UAAU,GAAI;AAAA,IAChC,CAAC;AAED,QAAI,SAAS,cAAc;AACzB,aAAO,OAAO,UAAU,MAAM;AAAA,IAChC;AAEA,QAAI,SAAS;AACX,aAAO,OAAO,WAAW,QAAQ,YAAY,CAAC;AAAA,IAChD;AAEA,UAAM,SAAS,GAAG,KAAK,OAAO,IAAI,OAAO,SAAS,CAAC;AACnD,QAAI;AAEJ,aAAS,UAAU,GAAG,UAAU,YAAY,WAAW;AACrD,UAAI;AAEF,cAAM,aAAa,UAAU,KAAK;AAClC,cAAM,WAAW,MAAM,iBAAiB,QAAQ;AAAA,UAC9C,QAAQ;AAAA,UACR,SAAS,EAAE,QAAQ,6BAA6B;AAAA,UAChD;AAAA,QACF,CAAC;AAGD,YAAI;AACJ,YAAI;AACF,oBAAU,MAAM,SAAS,KAAK;AAAA,QAChC,SAAS,WAAW;AAClB,oBAAU,4BAA4B,qBAAqB,QAAQ,UAAU,UAAU,OAAO,SAAS,CAAC;AAAA,QAC1G;AAGA,YAAI,SAAS,IAAI;AAOf,gBAAM,cAAc,SAAS,QAAQ,IAAI,cAAc;AACvD,cAAI,4BAA4B,WAAW,GAAG;AAC5C;AAAA,cACE;AAAA,cACA,+BAA+B,GAAG,mBAAmB,WAAW;AAAA,cAChE;AAAA,YACF;AACA,mBAAO;AAAA,cACL,SAAS,oBAAoB,eAAe,SAAS;AAAA,cACrD,YAAY;AAAA,cACZ,SAAS;AAAA,cACT,SAAS,OAAO,YAAY,SAAS,QAAQ,QAAQ,CAAC;AAAA,cACtD,OAAO;A
AAA,gBACL,MAAM,UAAU;AAAA,gBAChB,SAAS,2BAA2B,eAAe,0BAA0B;AAAA,gBAC7E,WAAW;AAAA,gBACX,YAAY;AAAA,cACd;AAAA,YACF;AAAA,UACF;AAEA,iBAAO;AAAA,YACL;AAAA,YACA,YAAY,SAAS;AAAA,YACrB;AAAA,YACA,SAAS,OAAO,YAAY,SAAS,QAAQ,QAAQ,CAAC;AAAA,UACxD;AAAA,QACF;AAGA,YAAI,SAAS,WAAW,KAAK;AAC3B,iBAAO;AAAA,YACL,SAAS;AAAA,YACT,YAAY;AAAA,YACZ;AAAA,UACF;AAAA,QACF;AAGA,YAAI,wBAAwB,IAAI,SAAS,MAAM,GAAG;AAChD,gBAAM,WAAW,SAAS,WAAW,MACjC,mDACA,8BAA8B,SAAS,MAAM;AACjD,iBAAO;AAAA,YACL,SAAS,UAAU,QAAQ;AAAA,YAC3B,YAAY,SAAS;AAAA,YACrB,SAAS;AAAA,YACT,OAAO;AAAA,cACL,MAAM,SAAS,WAAW,MAAM,UAAU,aAAa,UAAU;AAAA,cACjE,SAAS;AAAA,cACT,WAAW;AAAA,cACX,YAAY,SAAS;AAAA,YACvB;AAAA,UACF;AAAA,QACF;AAGA,YAAI,uBAAuB,IAAI,SAAS,MAAM,GAAG;AAC/C,sBAAY;AAAA,YACV,MAAM,SAAS,WAAW,MAAM,UAAU,eAAe,UAAU;AAAA,YACnE,SAAS,mBAAmB,SAAS,MAAM;AAAA,YAC3C,WAAW;AAAA,YACX,YAAY,SAAS;AAAA,UACvB;AAEA,cAAI,UAAU,aAAa,GAAG;AAC5B,kBAAM,UAAU,iBAAiB,OAAO;AACxC,mBAAO,WAAW,GAAG,SAAS,MAAM,eAAe,UAAU,CAAC,IAAI,UAAU,iBAAiB,OAAO,MAAM,SAAS;AACnH,kBAAM,MAAM,OAAO;AACnB;AAAA,UACF;AAAA,QACF;AAGA,oBAAY,cAAc,EAAE,QAAQ,SAAS,QAAQ,SAAS,QAAQ,CAAC;AACvE,YAAI,UAAU,aAAa,KAAK,UAAU,WAAW;AACnD,gBAAM,UAAU,iBAAiB,OAAO;AACxC,iBAAO,WAAW,UAAU,SAAS,MAAM,iBAAiB,OAAO,MAAM,SAAS;AAClF,gBAAM,MAAM,OAAO;AACnB;AAAA,QACF;AAGA,eAAO;AAAA,UACL,SAAS,UAAU,UAAU,OAAO;AAAA,UACpC,YAAY,SAAS;AAAA,UACrB,SAAS;AAAA,UACT,OAAO;AAAA,QACT;AAAA,MAEF,SAAS,OAAO;AACd,oBAAY,cAAc,KAAK;AAG/B,YAAI,CAAC,UAAU,WAAW;AACxB,iBAAO;AAAA,YACL,SAAS,UAAU,UAAU,OAAO;AAAA,YACpC,YAAY,UAAU,cAAc;AAAA,YACpC,SAAS;AAAA,YACT,OAAO;AAAA,UACT;AAAA,QACF;AAGA,YAAI,UAAU,aAAa,GAAG;AAC5B,gBAAM,UAAU,iBAAiB,OAAO;AACxC,iBAAO,WAAW,GAAG,UAAU,IAAI,KAAK,UAAU,OAAO,WAAW,UAAU,CAAC,IAAI,UAAU,OAAO,OAAO,MAAM,SAAS;AAC1H,gBAAM,MAAM,OAAO;AACnB;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAGA,WAAO;AAAA,MACL,SAAS,uBAAuB,UAAU,cAAc,WAAW,WAAW,eAAe;AAAA,MAC7F,YAAY,WAAW,cAAc;AAAA,MACrC,SAAS;AAAA,MACT,OAAO,aAAa,EAAE,MAAM,UAAU,eAAe,SAAS,yBAAyB,WAAW,MAAM;AAAA,IAC1G;AAAA,EACF;AAAA;AAAA;AAAA;AAAA;AAAA,EAMA,MAAM,mBAAmB,KAAa,UAAgC,CAAC,GAA4B;AACjG,UAAM,iBAA2B,CAAC;AAClC,QAAI,aAAoC;AACxC,UAAM,WAAW,
KAAK,IAAI,IAAI;AAE9B,eAAW,WAAW,mBAAmB;AAEvC,UAAI,KAAK,IAAI,KAAK,UAAU;AAC1B,eAAO,WAAW,wCAAwC,GAAG,UAAU,eAAe,MAAM,eAAe,SAAS;AACpH;AAAA,MACF;AAEA,YAAM,SAAS,MAAM,KAAK,mBAAmB,KAAK,SAAS,OAAO;AAElE,UAAI,OAAO,MAAM;AACf,YAAI,eAAe,SAAS,GAAG;AAC7B,iBAAO,QAAQ,gBAAgB,QAAQ,WAAW,UAAU,eAAe,MAAM,gBAAgB,SAAS;AAAA,QAC5G;AACA,eAAO,OAAO;AAAA,MAChB;AAEA,mBAAa,OAAO;AACpB,qBAAe,KAAK,GAAG,QAAQ,WAAW,KAAK,OAAO,SAAS,OAAO,WAAW,OAAO,SAAS,UAAU,EAAE;AAC7G,aAAO,WAAW,eAAe,QAAQ,WAAW,KAAK,OAAO,SAAS,UAAU,8BAA8B,SAAS;AAAA,IAC5H;AAGA,UAAM,eAAe,gBAAgB,eAAe,MAAM,yBAAyB,eAAe,KAAK,IAAI,CAAC;AAC5G,WAAO;AAAA,MACL,SAAS,UAAU,YAAY;AAAA,MAC/B,YAAY,YAAY,cAAc;AAAA,MACtC,SAAS;AAAA,MACT,OAAO;AAAA,QACL,MAAM,UAAU;AAAA,QAChB,SAAS;AAAA,QACT,WAAW;AAAA,MACb;AAAA,IACF;AAAA,EACF;AAAA;AAAA;AAAA;AAAA;AAAA,EAMA,MAAc,mBACZ,KACA,SACA,SACsD;AACtD,UAAM,SAAS,MAAM,KAAK,OAAO;AAAA,MAC/B;AAAA,MACA,MAAM,QAAQ;AAAA,MACd,SAAS,QAAQ;AAAA,MACjB,SAAS,QAAQ;AAAA,IACnB,CAAC;AAGD,QAAI,OAAO,cAAc,OAAO,OAAO,aAAa,OAAO,CAAC,OAAO,OAAO;AACxE,YAAM,iBAAiB,OAAO,QAAQ,QAAQ,YAAY,EAAE,EAAE,KAAK,EAAE;AACrE,UAAI,iBAAiB,6BAA6B,QAAQ,SAAS,SAAS;AAC1E,eAAO,QAAQ,4BAA4B,cAAc,sBAAsB,GAAG,+BAA0B,SAAS;AACrH,eAAO,EAAE,MAAM,OAAO,UAAU,OAAO;AAAA,MACzC;AACA,aAAO,EAAE,MAAM,MAAM,UAAU,OAAO;AAAA,IACxC;AAGA,QAAI,OAAO,eAAe,KAAK;AAC7B,aAAO,EAAE,MAAM,MAAM,UAAU,OAAO;AAAA,IACxC;AAIA,QAAI,OAAO,eAAe,KAAK;AAC7B,aAAO,WAAW,uBAAuB,GAAG,yDAAoD,SAAS;AACzG,aAAO,EAAE,MAAM,MAAM,UAAU;AAAA,QAC7B,GAAG;AAAA,QACH,OAAO;AAAA,UACL,MAAM,UAAU;AAAA,UAChB,SAAS;AAAA,UACT,WAAW;AAAA,QACb;AAAA,MACF,EAAC;AAAA,IACH;AAGA,QAAI,OAAO,SAAS,CAAC,OAAO,MAAM,WAAW;AAC3C,aAAO,SAAS,4BAA4B,QAAQ,WAAW,KAAK,OAAO,MAAM,OAAO,IAAI,SAAS;AACrG,aAAO,EAAE,MAAM,MAAM,UAAU,OAAO;AAAA,IACxC;AAEA,WAAO,EAAE,MAAM,OAAO,UAAU,OAAO;AAAA,EACzC;AAAA;AAAA;AAAA;AAAA;AAAA,EAMA,MAAM,eAAe,MAAgB,UAAgC,CAAC,GAAqD;AACzH,QAAI,KAAK,WAAW,GAAG;AACrB,aAAO,CAAC;AAAA,IACV;AAEA,QAAI,KAAK,UAAU,mBAAmB;AACpC,aAAO,KAAK,aAAa,MAAM,OAAO;AAAA,IACxC;AAEA,UAAM,SAAS,MAAM,KAAK,YAAY,MAAM,OAAO;AACnD,WAAO,OAAO;AAAA,EAChB;AAAA;AAAA;AAAA;AAAA;AAAA,EAMA,MAAM,YACJ,MACA,UAAgC,CAAC,GACjC,iBA
C4B;AAC5B,UAAM,eAAe,KAAK,KAAK,KAAK,SAAS,iBAAiB;AAC9D,UAAM,aAAsD,CAAC;AAC7D,QAAI,gBAAgB;AAEpB,WAAO,QAAQ,8BAA8B,KAAK,MAAM,YAAY,YAAY,cAAc,SAAS;AAEvG,aAAS,WAAW,GAAG,WAAW,cAAc,YAAY;AAC1D,YAAM,WAAW,WAAW;AAC5B,YAAM,SAAS,KAAK,IAAI,WAAW,mBAAmB,KAAK,MAAM;AACjE,YAAM,YAAY,KAAK,MAAM,UAAU,MAAM;AAE7C,aAAO,QAAQ,oBAAoB,WAAW,CAAC,IAAI,YAAY,KAAK,UAAU,MAAM,UAAU,SAAS;AAEvG,YAAM,eAAe,MAAM;AAAA,QACzB;AAAA,QACA,SAAO,KAAK,mBAAmB,KAAK,OAAO;AAAA,QAC3C,YAAY;AAAA,MACd;AAEA,eAAS,IAAI,GAAG,IAAI,aAAa,QAAQ,KAAK;AAC5C,cAAM,SAAS,aAAa,CAAC;AAC7B,YAAI,CAAC,OAAQ;AACb,cAAM,MAAM,UAAU,CAAC,KAAK;AAE5B,YAAI,OAAO,WAAW,aAAa;AACjC,gBAAM,eAAe,OAAO;AAC5B,qBAAW,KAAK,EAAE,GAAG,cAAc,IAAI,CAAC;AAGxC,cAAI,aAAa,OAAO,SAAS,UAAU,cAAc;AACvD;AAAA,UACF;AAAA,QACF,OAAO;AAGL,gBAAM,WAAW,OAAO,kBAAkB,QAAQ,OAAO,OAAO,UAAU,OAAO,OAAO,MAAM;AAC9F,iBAAO,SAAS,4BAA4B,GAAG,KAAK,QAAQ,IAAI,SAAS;AAEzE,qBAAW,KAAK;AAAA,YACd;AAAA,YACA,SAAS,+BAA+B,QAAQ;AAAA,YAChD,YAAY;AAAA,YACZ,SAAS;AAAA,YACT,OAAO,cAAc,OAAO,MAAM;AAAA,UACpC,CAAC;AAAA,QACH;AAAA,MACF;AAGA,UAAI;AACF,0BAAkB,WAAW,GAAG,cAAc,WAAW,MAAM;AAAA,MACjE,SAAS,eAAe;AACtB,eAAO,SAAS,mCAAmC,aAAa,IAAI,SAAS;AAAA,MAC/E;AAEA,aAAO,QAAQ,mBAAmB,WAAW,CAAC,IAAI,YAAY,KAAK,WAAW,MAAM,IAAI,KAAK,MAAM,WAAW,SAAS;AAGvH,UAAI,WAAW,eAAe,GAAG;AAC/B,cAAM,aAAa,gBAAgB,IAAI,MAAO;AAC9C,cAAM,MAAM,UAAU;AAAA,MACxB;AAAA,IACF;AAEA,WAAO,EAAE,SAAS,YAAY,kBAAkB,cAAc,gBAAgB,KAAK,QAAQ,cAAc;AAAA,EAC3G;AAAA;AAAA;AAAA;AAAA;AAAA,EAMA,MAAc,aAAa,MAAgB,SAAiF;AAC1H,UAAM,UAAU,MAAM,YAAY,MAAM,SAAO,KAAK,mBAAmB,KAAK,OAAO,GAAG,YAAY,OAAO;AAEzG,WAAO,QAAQ,IAAI,CAAC,QAAQ,UAAU;AACpC,YAAM,MAAM,KAAK,KAAK,KAAK;AAE3B,UAAI,OAAO,WAAW,aAAa;AACjC,eAAO,EAAE,GAAG,OAAO,OAAO,IAAI;AAAA,MAChC;AAGA,aAAO;AAAA,QACL;AAAA,QACA,SAAS,UAAU,OAAO,kBAAkB,QAAQ,OAAO,OAAO,UAAU,OAAO,OAAO,MAAM,CAAC;AAAA,QACjG,YAAY;AAAA,QACZ,SAAS;AAAA,QACT,OAAO,cAAc,OAAO,MAAM;AAAA,MACpC;AAAA,IACF,CAAC;AAAA,EACH;AACF;",
|
|
6
6
|
"names": []
|
|
7
7
|
}
|
package/dist/src/config/index.js
CHANGED
|
@@ -27,7 +27,8 @@ function parseEnv() {
|
|
|
27
27
|
SCRAPER_API_KEY: process.env.SCRAPEDO_API_KEY || "",
|
|
28
28
|
SEARCH_API_KEY: process.env.SERPER_API_KEY || void 0,
|
|
29
29
|
REDDIT_CLIENT_ID: process.env.REDDIT_CLIENT_ID || void 0,
|
|
30
|
-
REDDIT_CLIENT_SECRET: process.env.REDDIT_CLIENT_SECRET || void 0
|
|
30
|
+
REDDIT_CLIENT_SECRET: process.env.REDDIT_CLIENT_SECRET || void 0,
|
|
31
|
+
JINA_API_KEY: process.env.JINA_API_KEY || void 0
|
|
31
32
|
};
|
|
32
33
|
return cachedEnv;
|
|
33
34
|
}
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"version": 3,
|
|
3
3
|
"sources": ["../../../src/config/index.ts"],
|
|
4
|
-
"sourcesContent": ["/**\n * Consolidated configuration\n * All environment variables, constants, and LLM config in one place\n */\n\nimport { Logger } from 'mcp-use';\n\nimport { VERSION, PACKAGE_NAME, PACKAGE_DESCRIPTION } from '../version.js';\n\n// ============================================================================\n// Safe Integer Parsing Helper\n// ============================================================================\n\n/**\n * Safely parse an integer from environment variable with bounds checking\n */\nfunction safeParseInt(\n value: string | undefined,\n defaultVal: number,\n min: number,\n max: number\n): number {\n const logger = Logger.get('config');\n\n if (!value) {\n return defaultVal;\n }\n\n const parsed = parseInt(value, 10);\n\n if (isNaN(parsed)) {\n logger.warn(`Invalid number \"${value}\", using default ${defaultVal}`);\n return defaultVal;\n }\n\n if (parsed < min) {\n logger.warn(`Value ${parsed} below minimum ${min}, clamping to ${min}`);\n return min;\n }\n\n if (parsed > max) {\n logger.warn(`Value ${parsed} above maximum ${max}, clamping to ${max}`);\n return max;\n }\n\n return parsed;\n}\n\n\n// ============================================================================\n// Environment Parsing\n// ============================================================================\n\ninterface EnvConfig {\n SCRAPER_API_KEY: string;\n SEARCH_API_KEY: string | undefined;\n REDDIT_CLIENT_ID: string | undefined;\n REDDIT_CLIENT_SECRET: string | undefined;\n}\n\nlet cachedEnv: EnvConfig | null = null;\n\nexport function parseEnv(): EnvConfig {\n if (cachedEnv) return cachedEnv;\n cachedEnv = {\n SCRAPER_API_KEY: process.env.SCRAPEDO_API_KEY || '',\n SEARCH_API_KEY: process.env.SERPER_API_KEY || undefined,\n REDDIT_CLIENT_ID: process.env.REDDIT_CLIENT_ID || undefined,\n REDDIT_CLIENT_SECRET: process.env.REDDIT_CLIENT_SECRET || undefined,\n };\n return cachedEnv;\n}\n\n// 
============================================================================\n// MCP Server Configuration\n// ============================================================================\n\nexport const SERVER = {\n NAME: PACKAGE_NAME,\n VERSION: VERSION,\n DESCRIPTION: PACKAGE_DESCRIPTION,\n} as const;\n\n// ============================================================================\n// Capability Detection (which features are available based on ENV)\n// ============================================================================\n\nexport interface Capabilities {\n reddit: boolean; // REDDIT_CLIENT_ID + REDDIT_CLIENT_SECRET\n search: boolean; // SERPER_API_KEY\n scraping: boolean; // SCRAPEDO_API_KEY\n llmExtraction: boolean; // LLM_API_KEY\n}\n\nexport function getCapabilities(): Capabilities {\n const env = parseEnv();\n return {\n reddit: !!(env.REDDIT_CLIENT_ID && env.REDDIT_CLIENT_SECRET),\n search: !!env.SEARCH_API_KEY,\n scraping: !!env.SCRAPER_API_KEY,\n llmExtraction: !!LLM_EXTRACTION.API_KEY,\n };\n}\n\nexport function getMissingEnvMessage(capability: keyof Capabilities): string {\n const messages: Record<keyof Capabilities, string> = {\n reddit: '\u274C **Reddit tools unavailable.** Set `REDDIT_CLIENT_ID` and `REDDIT_CLIENT_SECRET` to enable `get-reddit-post`.\\n\\n\uD83D\uDC49 Create a Reddit app at: https://www.reddit.com/prefs/apps (select \"script\" type)',\n search: '\u274C **Search unavailable.** Set `SERPER_API_KEY` to enable `web-search` (including `scope: \"reddit\"`).\\n\\n\uD83D\uDC49 Get your free API key at: https://serper.dev (2,500 free queries)',\n scraping: '\u274C **Web scraping unavailable.** Set `SCRAPEDO_API_KEY` to enable `scrape-links`.\\n\\n\uD83D\uDC49 Sign up at: https://scrape.do (1,000 free credits)',\n llmExtraction: '\u26A0\uFE0F **AI extraction disabled.** Set `LLM_API_KEY`, `LLM_BASE_URL`, and `LLM_MODEL` to enable AI-powered content extraction and search classification.\\n\\nScraping will work but without intelligent 
content filtering.',\n };\n return messages[capability];\n}\n\n// ============================================================================\n// Concurrency Limits\n// ============================================================================\n\nexport const CONCURRENCY = {\n SEARCH: safeParseInt(process.env.CONCURRENCY_SEARCH, 50, 1, 200),\n SCRAPER: safeParseInt(process.env.CONCURRENCY_SCRAPER, 50, 1, 200),\n REDDIT: safeParseInt(process.env.CONCURRENCY_REDDIT, 50, 1, 200),\n LLM_EXTRACTION: safeParseInt(process.env.LLM_CONCURRENCY, 50, 1, 200),\n} as const;\n\nexport const SCRAPER = {\n BATCH_SIZE: 30,\n EXTRACTION_PREFIX: 'Extract from document only \u2014 never hallucinate or add external knowledge.',\n EXTRACTION_SUFFIX: 'First line = content, not preamble. No confirmation messages.',\n} as const;\n\n// ============================================================================\n// Reddit Configuration\n// ============================================================================\n\nexport const REDDIT = {\n BATCH_SIZE: 10,\n MAX_WORDS_PER_POST: 50_000,\n MAX_WORDS_TOTAL: 500_000,\n MIN_POSTS: 1,\n MAX_POSTS: 50,\n RETRY_COUNT: 5,\n RETRY_DELAYS: [2000, 4000, 8000, 16000, 32000] as const,\n} as const;\n\n// ============================================================================\n// CTR Weights for URL Ranking (inspired from CTR research)\n// ============================================================================\n\nexport const CTR_WEIGHTS: Record<number, number> = {\n 1: 100.00,\n 2: 60.00,\n 3: 48.89,\n 4: 33.33,\n 5: 28.89,\n 6: 26.44,\n 7: 24.44,\n 8: 17.78,\n 9: 13.33,\n 10: 12.56,\n} as const;\n\n// ============================================================================\n// LLM Configuration\n//\n// Required vars (all must be set together when LLM is enabled):\n// LLM_API_KEY \u2014 API key for the OpenAI-compatible endpoint\n// LLM_BASE_URL \u2014 endpoint base URL (e.g. 
https://server.up.railway.app/v1)\n// LLM_MODEL \u2014 primary model (e.g. gpt-5.4-mini)\n//\n// Optional:\n// LLM_FALLBACK_MODEL \u2014 model to use after primary exhausts all retries (e.g. gpt-5.4)\n// LLM_CONCURRENCY \u2014 parallel LLM calls (default: 50)\n//\n// Reasoning effort is always 'low' \u2014 not configurable.\n// ============================================================================\n\ninterface LlmExtractionConfig {\n readonly MODEL: string;\n readonly FALLBACK_MODEL: string;\n readonly BASE_URL: string;\n readonly API_KEY: string;\n}\n\nlet cachedLlmExtraction: LlmExtractionConfig | null = null;\n\nfunction getLlmExtraction(): LlmExtractionConfig {\n if (cachedLlmExtraction) return cachedLlmExtraction;\n\n const apiKey = process.env.LLM_API_KEY?.trim() || '';\n const baseUrl = process.env.LLM_BASE_URL?.trim();\n const model = process.env.LLM_MODEL?.trim();\n const fallbackModel = process.env.LLM_FALLBACK_MODEL?.trim() || '';\n\n if (apiKey && !baseUrl) {\n throw new Error(\n 'LLM_BASE_URL is required when LLM_API_KEY is set. ' +\n 'Set LLM_BASE_URL to your OpenAI-compatible endpoint.',\n );\n }\n if (apiKey && !model) {\n throw new Error(\n 'LLM_MODEL is required when LLM_API_KEY is set.',\n );\n }\n\n cachedLlmExtraction = {\n API_KEY: apiKey,\n BASE_URL: baseUrl || '',\n MODEL: model || '',\n FALLBACK_MODEL: fallbackModel,\n };\n return cachedLlmExtraction;\n}\n\nexport const LLM_EXTRACTION: LlmExtractionConfig = new Proxy({} as LlmExtractionConfig, {\n get(_target, prop: string) {\n return getLlmExtraction()[prop as keyof LlmExtractionConfig];\n },\n});\n"],
|
|
5
|
-
"mappings": "AAKA,SAAS,cAAc;AAEvB,SAAS,SAAS,cAAc,2BAA2B;AAS3D,SAAS,aACP,OACA,YACA,KACA,KACQ;AACR,QAAM,SAAS,OAAO,IAAI,QAAQ;AAElC,MAAI,CAAC,OAAO;AACV,WAAO;AAAA,EACT;AAEA,QAAM,SAAS,SAAS,OAAO,EAAE;AAEjC,MAAI,MAAM,MAAM,GAAG;AACjB,WAAO,KAAK,mBAAmB,KAAK,oBAAoB,UAAU,EAAE;AACpE,WAAO;AAAA,EACT;AAEA,MAAI,SAAS,KAAK;AAChB,WAAO,KAAK,SAAS,MAAM,kBAAkB,GAAG,iBAAiB,GAAG,EAAE;AACtE,WAAO;AAAA,EACT;AAEA,MAAI,SAAS,KAAK;AAChB,WAAO,KAAK,SAAS,MAAM,kBAAkB,GAAG,iBAAiB,GAAG,EAAE;AACtE,WAAO;AAAA,EACT;AAEA,SAAO;AACT;
|
|
4
|
+
"sourcesContent": ["/**\n * Consolidated configuration\n * All environment variables, constants, and LLM config in one place\n */\n\nimport { Logger } from 'mcp-use';\n\nimport { VERSION, PACKAGE_NAME, PACKAGE_DESCRIPTION } from '../version.js';\n\n// ============================================================================\n// Safe Integer Parsing Helper\n// ============================================================================\n\n/**\n * Safely parse an integer from environment variable with bounds checking\n */\nfunction safeParseInt(\n value: string | undefined,\n defaultVal: number,\n min: number,\n max: number\n): number {\n const logger = Logger.get('config');\n\n if (!value) {\n return defaultVal;\n }\n\n const parsed = parseInt(value, 10);\n\n if (isNaN(parsed)) {\n logger.warn(`Invalid number \"${value}\", using default ${defaultVal}`);\n return defaultVal;\n }\n\n if (parsed < min) {\n logger.warn(`Value ${parsed} below minimum ${min}, clamping to ${min}`);\n return min;\n }\n\n if (parsed > max) {\n logger.warn(`Value ${parsed} above maximum ${max}, clamping to ${max}`);\n return max;\n }\n\n return parsed;\n}\n\n\n// ============================================================================\n// Environment Parsing\n// ============================================================================\n\ninterface EnvConfig {\n SCRAPER_API_KEY: string;\n SEARCH_API_KEY: string | undefined;\n REDDIT_CLIENT_ID: string | undefined;\n REDDIT_CLIENT_SECRET: string | undefined;\n JINA_API_KEY: string | undefined;\n}\n\nlet cachedEnv: EnvConfig | null = null;\n\nexport function parseEnv(): EnvConfig {\n if (cachedEnv) return cachedEnv;\n cachedEnv = {\n SCRAPER_API_KEY: process.env.SCRAPEDO_API_KEY || '',\n SEARCH_API_KEY: process.env.SERPER_API_KEY || undefined,\n REDDIT_CLIENT_ID: process.env.REDDIT_CLIENT_ID || undefined,\n REDDIT_CLIENT_SECRET: process.env.REDDIT_CLIENT_SECRET || undefined,\n JINA_API_KEY: process.env.JINA_API_KEY || undefined,\n };\n 
return cachedEnv;\n}\n\n// ============================================================================\n// MCP Server Configuration\n// ============================================================================\n\nexport const SERVER = {\n NAME: PACKAGE_NAME,\n VERSION: VERSION,\n DESCRIPTION: PACKAGE_DESCRIPTION,\n} as const;\n\n// ============================================================================\n// Capability Detection (which features are available based on ENV)\n// ============================================================================\n\nexport interface Capabilities {\n reddit: boolean; // REDDIT_CLIENT_ID + REDDIT_CLIENT_SECRET\n search: boolean; // SERPER_API_KEY\n scraping: boolean; // SCRAPEDO_API_KEY\n llmExtraction: boolean; // LLM_API_KEY\n}\n\nexport function getCapabilities(): Capabilities {\n const env = parseEnv();\n return {\n reddit: !!(env.REDDIT_CLIENT_ID && env.REDDIT_CLIENT_SECRET),\n search: !!env.SEARCH_API_KEY,\n scraping: !!env.SCRAPER_API_KEY,\n llmExtraction: !!LLM_EXTRACTION.API_KEY,\n };\n}\n\nexport function getMissingEnvMessage(capability: keyof Capabilities): string {\n const messages: Record<keyof Capabilities, string> = {\n reddit: '\u274C **Reddit tools unavailable.** Set `REDDIT_CLIENT_ID` and `REDDIT_CLIENT_SECRET` to enable `get-reddit-post`.\\n\\n\uD83D\uDC49 Create a Reddit app at: https://www.reddit.com/prefs/apps (select \"script\" type)',\n search: '\u274C **Search unavailable.** Set `SERPER_API_KEY` to enable `web-search` (including `scope: \"reddit\"`).\\n\\n\uD83D\uDC49 Get your free API key at: https://serper.dev (2,500 free queries)',\n scraping: '\u274C **Web scraping unavailable.** Set `SCRAPEDO_API_KEY` to enable `scrape-links`.\\n\\n\uD83D\uDC49 Sign up at: https://scrape.do (1,000 free credits)',\n llmExtraction: '\u26A0\uFE0F **AI extraction disabled.** Set `LLM_API_KEY`, `LLM_BASE_URL`, and `LLM_MODEL` to enable AI-powered content extraction and search classification.\\n\\nScraping will 
work but without intelligent content filtering.',\n };\n return messages[capability];\n}\n\n// ============================================================================\n// Concurrency Limits\n// ============================================================================\n\nexport const CONCURRENCY = {\n SEARCH: safeParseInt(process.env.CONCURRENCY_SEARCH, 50, 1, 200),\n SCRAPER: safeParseInt(process.env.CONCURRENCY_SCRAPER, 50, 1, 200),\n REDDIT: safeParseInt(process.env.CONCURRENCY_REDDIT, 50, 1, 200),\n LLM_EXTRACTION: safeParseInt(process.env.LLM_CONCURRENCY, 50, 1, 200),\n} as const;\n\nexport const SCRAPER = {\n BATCH_SIZE: 30,\n EXTRACTION_PREFIX: 'Extract from document only \u2014 never hallucinate or add external knowledge.',\n EXTRACTION_SUFFIX: 'First line = content, not preamble. No confirmation messages.',\n} as const;\n\n// ============================================================================\n// Reddit Configuration\n// ============================================================================\n\nexport const REDDIT = {\n BATCH_SIZE: 10,\n MAX_WORDS_PER_POST: 50_000,\n MAX_WORDS_TOTAL: 500_000,\n MIN_POSTS: 1,\n MAX_POSTS: 50,\n RETRY_COUNT: 5,\n RETRY_DELAYS: [2000, 4000, 8000, 16000, 32000] as const,\n} as const;\n\n// ============================================================================\n// CTR Weights for URL Ranking (inspired from CTR research)\n// ============================================================================\n\nexport const CTR_WEIGHTS: Record<number, number> = {\n 1: 100.00,\n 2: 60.00,\n 3: 48.89,\n 4: 33.33,\n 5: 28.89,\n 6: 26.44,\n 7: 24.44,\n 8: 17.78,\n 9: 13.33,\n 10: 12.56,\n} as const;\n\n// ============================================================================\n// LLM Configuration\n//\n// Required vars (all must be set together when LLM is enabled):\n// LLM_API_KEY \u2014 API key for the OpenAI-compatible endpoint\n// LLM_BASE_URL \u2014 endpoint base URL (e.g. 
https://server.up.railway.app/v1)\n// LLM_MODEL \u2014 primary model (e.g. gpt-5.4-mini)\n//\n// Optional:\n// LLM_FALLBACK_MODEL \u2014 model to use after primary exhausts all retries (e.g. gpt-5.4)\n// LLM_CONCURRENCY \u2014 parallel LLM calls (default: 50)\n//\n// Reasoning effort is always 'low' \u2014 not configurable.\n// ============================================================================\n\ninterface LlmExtractionConfig {\n readonly MODEL: string;\n readonly FALLBACK_MODEL: string;\n readonly BASE_URL: string;\n readonly API_KEY: string;\n}\n\nlet cachedLlmExtraction: LlmExtractionConfig | null = null;\n\nfunction getLlmExtraction(): LlmExtractionConfig {\n if (cachedLlmExtraction) return cachedLlmExtraction;\n\n const apiKey = process.env.LLM_API_KEY?.trim() || '';\n const baseUrl = process.env.LLM_BASE_URL?.trim();\n const model = process.env.LLM_MODEL?.trim();\n const fallbackModel = process.env.LLM_FALLBACK_MODEL?.trim() || '';\n\n if (apiKey && !baseUrl) {\n throw new Error(\n 'LLM_BASE_URL is required when LLM_API_KEY is set. ' +\n 'Set LLM_BASE_URL to your OpenAI-compatible endpoint.',\n );\n }\n if (apiKey && !model) {\n throw new Error(\n 'LLM_MODEL is required when LLM_API_KEY is set.',\n );\n }\n\n cachedLlmExtraction = {\n API_KEY: apiKey,\n BASE_URL: baseUrl || '',\n MODEL: model || '',\n FALLBACK_MODEL: fallbackModel,\n };\n return cachedLlmExtraction;\n}\n\nexport const LLM_EXTRACTION: LlmExtractionConfig = new Proxy({} as LlmExtractionConfig, {\n get(_target, prop: string) {\n return getLlmExtraction()[prop as keyof LlmExtractionConfig];\n },\n});\n"],
|
|
5
|
+
"mappings": "AAKA,SAAS,cAAc;AAEvB,SAAS,SAAS,cAAc,2BAA2B;AAS3D,SAAS,aACP,OACA,YACA,KACA,KACQ;AACR,QAAM,SAAS,OAAO,IAAI,QAAQ;AAElC,MAAI,CAAC,OAAO;AACV,WAAO;AAAA,EACT;AAEA,QAAM,SAAS,SAAS,OAAO,EAAE;AAEjC,MAAI,MAAM,MAAM,GAAG;AACjB,WAAO,KAAK,mBAAmB,KAAK,oBAAoB,UAAU,EAAE;AACpE,WAAO;AAAA,EACT;AAEA,MAAI,SAAS,KAAK;AAChB,WAAO,KAAK,SAAS,MAAM,kBAAkB,GAAG,iBAAiB,GAAG,EAAE;AACtE,WAAO;AAAA,EACT;AAEA,MAAI,SAAS,KAAK;AAChB,WAAO,KAAK,SAAS,MAAM,kBAAkB,GAAG,iBAAiB,GAAG,EAAE;AACtE,WAAO;AAAA,EACT;AAEA,SAAO;AACT;AAeA,IAAI,YAA8B;AAE3B,SAAS,WAAsB;AACpC,MAAI,UAAW,QAAO;AACtB,cAAY;AAAA,IACV,iBAAiB,QAAQ,IAAI,oBAAoB;AAAA,IACjD,gBAAgB,QAAQ,IAAI,kBAAkB;AAAA,IAC9C,kBAAkB,QAAQ,IAAI,oBAAoB;AAAA,IAClD,sBAAsB,QAAQ,IAAI,wBAAwB;AAAA,IAC1D,cAAc,QAAQ,IAAI,gBAAgB;AAAA,EAC5C;AACA,SAAO;AACT;AAMO,MAAM,SAAS;AAAA,EACpB,MAAM;AAAA,EACN;AAAA,EACA,aAAa;AACf;AAaO,SAAS,kBAAgC;AAC9C,QAAM,MAAM,SAAS;AACrB,SAAO;AAAA,IACL,QAAQ,CAAC,EAAE,IAAI,oBAAoB,IAAI;AAAA,IACvC,QAAQ,CAAC,CAAC,IAAI;AAAA,IACd,UAAU,CAAC,CAAC,IAAI;AAAA,IAChB,eAAe,CAAC,CAAC,eAAe;AAAA,EAClC;AACF;AAEO,SAAS,qBAAqB,YAAwC;AAC3E,QAAM,WAA+C;AAAA,IACnD,QAAQ;AAAA,IACR,QAAQ;AAAA,IACR,UAAU;AAAA,IACV,eAAe;AAAA,EACjB;AACA,SAAO,SAAS,UAAU;AAC5B;AAMO,MAAM,cAAc;AAAA,EACzB,QAAQ,aAAa,QAAQ,IAAI,oBAAoB,IAAI,GAAG,GAAG;AAAA,EAC/D,SAAS,aAAa,QAAQ,IAAI,qBAAqB,IAAI,GAAG,GAAG;AAAA,EACjE,QAAQ,aAAa,QAAQ,IAAI,oBAAoB,IAAI,GAAG,GAAG;AAAA,EAC/D,gBAAgB,aAAa,QAAQ,IAAI,iBAAiB,IAAI,GAAG,GAAG;AACtE;AAEO,MAAM,UAAU;AAAA,EACrB,YAAY;AAAA,EACZ,mBAAmB;AAAA,EACnB,mBAAmB;AACrB;AAMO,MAAM,SAAS;AAAA,EACpB,YAAY;AAAA,EACZ,oBAAoB;AAAA,EACpB,iBAAiB;AAAA,EACjB,WAAW;AAAA,EACX,WAAW;AAAA,EACX,aAAa;AAAA,EACb,cAAc,CAAC,KAAM,KAAM,KAAM,MAAO,IAAK;AAC/C;AAMO,MAAM,cAAsC;AAAA,EACjD,GAAG;AAAA,EACH,GAAG;AAAA,EACH,GAAG;AAAA,EACH,GAAG;AAAA,EACH,GAAG;AAAA,EACH,GAAG;AAAA,EACH,GAAG;AAAA,EACH,GAAG;AAAA,EACH,GAAG;AAAA,EACH,IAAI;AACN;AAwBA,IAAI,sBAAkD;AAEtD,SAAS,mBAAwC;AAC/C,MAAI,oBAAqB,QAAO;AAEhC,QAAM,SAAS,QAAQ,IAAI,aAAa,KAAK,KAAK;AAClD,QAAM,UAAU,QAAQ,IAAI,cAAc,KAAK;AAC/C,QAAM,QAAQ,QAAQ,IAAI,WAAW,KAAK;AAC1C,QAAM,gBAAgB,QAAQ,IAAI,oBAAo
B,KAAK,KAAK;AAEhE,MAAI,UAAU,CAAC,SAAS;AACtB,UAAM,IAAI;AAAA,MACR;AAAA,IAEF;AAAA,EACF;AACA,MAAI,UAAU,CAAC,OAAO;AACpB,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AAEA,wBAAsB;AAAA,IACpB,SAAS;AAAA,IACT,UAAU,WAAW;AAAA,IACrB,OAAO,SAAS;AAAA,IAChB,gBAAgB;AAAA,EAClB;AACA,SAAO;AACT;AAEO,MAAM,iBAAsC,IAAI,MAAM,CAAC,GAA0B;AAAA,EACtF,IAAI,SAAS,MAAc;AACzB,WAAO,iBAAiB,EAAE,IAAiC;AAAA,EAC7D;AACF,CAAC;",
|
|
6
6
|
"names": []
|
|
7
7
|
}
|
package/dist/src/tools/scrape.js
CHANGED
|
@@ -11,12 +11,14 @@ import {
|
|
|
11
11
|
} from "../schemas/scrape-links.js";
|
|
12
12
|
import { ScraperClient } from "../clients/scraper.js";
|
|
13
13
|
import { RedditClient } from "../clients/reddit.js";
|
|
14
|
+
import { JinaClient } from "../clients/jina.js";
|
|
14
15
|
import { MarkdownCleaner } from "../services/markdown-cleaner.js";
|
|
15
16
|
import { createLLMProcessor, processContentWithLLM } from "../services/llm-processor.js";
|
|
16
17
|
import { removeMetaTags } from "../utils/markdown-formatter.js";
|
|
17
18
|
import { extractReadableContent } from "../utils/content-extractor.js";
|
|
18
|
-
import { classifyError } from "../utils/errors.js";
|
|
19
|
-
import {
|
|
19
|
+
import { classifyError, ErrorCode } from "../utils/errors.js";
|
|
20
|
+
import { isDocumentUrl } from "../utils/source-type.js";
|
|
21
|
+
import { pMap, pMapSettled } from "../utils/concurrency.js";
|
|
20
22
|
import {
|
|
21
23
|
mcpLog,
|
|
22
24
|
formatSuccess,
|
|
@@ -75,6 +77,7 @@ Execution time: ${formatDuration(Date.now() - startTime)}`
|
|
|
75
77
|
function partitionUrls(urls) {
|
|
76
78
|
const webInputs = [];
|
|
77
79
|
const redditInputs = [];
|
|
80
|
+
const documentInputs = [];
|
|
78
81
|
const invalidEntries = [];
|
|
79
82
|
for (let i = 0; i < urls.length; i++) {
|
|
80
83
|
const url = urls[i];
|
|
@@ -84,23 +87,32 @@ function partitionUrls(urls) {
|
|
|
84
87
|
invalidEntries.push({ url, origIndex: i });
|
|
85
88
|
continue;
|
|
86
89
|
}
|
|
87
|
-
if (
|
|
90
|
+
if (isDocumentUrl(url)) {
|
|
91
|
+
documentInputs.push({ url, origIndex: i });
|
|
92
|
+
} else if (isRedditUrl(url)) {
|
|
88
93
|
redditInputs.push({ url, origIndex: i });
|
|
89
94
|
} else {
|
|
90
95
|
webInputs.push({ url, origIndex: i });
|
|
91
96
|
}
|
|
92
97
|
}
|
|
93
|
-
return { webInputs, redditInputs, invalidEntries };
|
|
98
|
+
return { webInputs, redditInputs, documentInputs, invalidEntries };
|
|
94
99
|
}
|
|
95
100
|
async function fetchWebBranch(inputs, client) {
|
|
96
101
|
if (inputs.length === 0) {
|
|
97
|
-
return {
|
|
102
|
+
return {
|
|
103
|
+
successItems: [],
|
|
104
|
+
failedContents: [],
|
|
105
|
+
metrics: { successful: 0, failed: 0, totalCredits: 0 },
|
|
106
|
+
binaryDeferred: []
|
|
107
|
+
};
|
|
98
108
|
}
|
|
99
109
|
mcpLog("info", `[concurrency] web branch: fanning out ${inputs.length} URL(s) with limit=${CONCURRENCY.SCRAPER}`, "scrape");
|
|
100
110
|
const urls = inputs.map((i) => i.url);
|
|
101
111
|
const results = await client.scrapeMultiple(urls, { timeout: 60 });
|
|
112
|
+
const urlToIndex = new Map(inputs.map((i) => [i.url, i.origIndex]));
|
|
102
113
|
const successItems = [];
|
|
103
114
|
const failedContents = [];
|
|
115
|
+
const binaryDeferred = [];
|
|
104
116
|
let successful = 0;
|
|
105
117
|
let failed = 0;
|
|
106
118
|
let totalCredits = 0;
|
|
@@ -114,6 +126,13 @@ async function fetchWebBranch(inputs, client) {
|
|
|
114
126
|
\u274C No result returned`);
|
|
115
127
|
continue;
|
|
116
128
|
}
|
|
129
|
+
if (result.error?.code === ErrorCode.UNSUPPORTED_BINARY_CONTENT) {
|
|
130
|
+
binaryDeferred.push({
|
|
131
|
+
url: result.url,
|
|
132
|
+
origIndex: urlToIndex.get(result.url) ?? origIndex
|
|
133
|
+
});
|
|
134
|
+
continue;
|
|
135
|
+
}
|
|
117
136
|
if (result.error || result.statusCode < 200 || result.statusCode >= 300) {
|
|
118
137
|
failed++;
|
|
119
138
|
const errorMsg = result.error?.message || result.content || `HTTP ${result.statusCode}`;
|
|
@@ -134,7 +153,62 @@ async function fetchWebBranch(inputs, client) {
|
|
|
134
153
|
}
|
|
135
154
|
successItems.push({ url: result.url, content, index: origIndex });
|
|
136
155
|
}
|
|
137
|
-
return {
|
|
156
|
+
return {
|
|
157
|
+
successItems,
|
|
158
|
+
failedContents,
|
|
159
|
+
metrics: { successful, failed, totalCredits },
|
|
160
|
+
binaryDeferred
|
|
161
|
+
};
|
|
162
|
+
}
|
|
163
|
+
async function fetchDocumentBranch(inputs, jinaClient) {
|
|
164
|
+
if (inputs.length === 0) {
|
|
165
|
+
return { successItems: [], failedContents: [], metrics: { successful: 0, failed: 0, totalCredits: 0 } };
|
|
166
|
+
}
|
|
167
|
+
mcpLog(
|
|
168
|
+
"info",
|
|
169
|
+
`[concurrency] document branch (jina): converting ${inputs.length} URL(s) with limit=${CONCURRENCY.SCRAPER}`,
|
|
170
|
+
"scrape"
|
|
171
|
+
);
|
|
172
|
+
const results = await pMapSettled(
|
|
173
|
+
inputs,
|
|
174
|
+
(input) => jinaClient.convert({ url: input.url }),
|
|
175
|
+
CONCURRENCY.SCRAPER
|
|
176
|
+
);
|
|
177
|
+
const successItems = [];
|
|
178
|
+
const failedContents = [];
|
|
179
|
+
let successful = 0;
|
|
180
|
+
let failed = 0;
|
|
181
|
+
for (let i = 0; i < results.length; i++) {
|
|
182
|
+
const settled = results[i];
|
|
183
|
+
const input = inputs[i];
|
|
184
|
+
if (!settled) {
|
|
185
|
+
failed++;
|
|
186
|
+
failedContents.push(`## ${input.url}
|
|
187
|
+
|
|
188
|
+
\u274C No result returned (document conversion)`);
|
|
189
|
+
continue;
|
|
190
|
+
}
|
|
191
|
+
if (settled.status === "rejected") {
|
|
192
|
+
failed++;
|
|
193
|
+
const reason = settled.reason instanceof Error ? settled.reason.message : String(settled.reason);
|
|
194
|
+
failedContents.push(`## ${input.url}
|
|
195
|
+
|
|
196
|
+
\u274C Document conversion failed: ${reason}`);
|
|
197
|
+
continue;
|
|
198
|
+
}
|
|
199
|
+
const result = settled.value;
|
|
200
|
+
if (result.error || result.statusCode < 200 || result.statusCode >= 300) {
|
|
201
|
+
failed++;
|
|
202
|
+
const errorMsg = result.error?.message || `HTTP ${result.statusCode}`;
|
|
203
|
+
failedContents.push(`## ${input.url}
|
|
204
|
+
|
|
205
|
+
\u274C Document conversion failed: ${errorMsg}`);
|
|
206
|
+
continue;
|
|
207
|
+
}
|
|
208
|
+
successful++;
|
|
209
|
+
successItems.push({ url: input.url, content: result.content, index: input.origIndex });
|
|
210
|
+
}
|
|
211
|
+
return { successItems, failedContents, metrics: { successful, failed, totalCredits: 0 } };
|
|
138
212
|
}
|
|
139
213
|
function formatRedditPostAsMarkdown(result) {
|
|
140
214
|
const { post, comments } = result;
|
|
@@ -315,11 +389,11 @@ async function handleScrapeLinks(params, reporter = NOOP_REPORTER) {
|
|
|
315
389
|
if (!params.urls || params.urls.length === 0) {
|
|
316
390
|
return createScrapeErrorResponse("NO_URLS", "No URLs provided", startTime);
|
|
317
391
|
}
|
|
318
|
-
const { webInputs, redditInputs, invalidEntries } = partitionUrls(params.urls);
|
|
319
|
-
const validCount = webInputs.length + redditInputs.length;
|
|
392
|
+
const { webInputs, redditInputs, documentInputs, invalidEntries } = partitionUrls(params.urls);
|
|
393
|
+
const validCount = webInputs.length + redditInputs.length + documentInputs.length;
|
|
320
394
|
await reporter.log(
|
|
321
395
|
"info",
|
|
322
|
-
`Partitioned ${params.urls.length} URL(s): ${webInputs.length} web, ${redditInputs.length} reddit, ${invalidEntries.length} invalid`
|
|
396
|
+
`Partitioned ${params.urls.length} URL(s): ${webInputs.length} web, ${redditInputs.length} reddit, ${documentInputs.length} document, ${invalidEntries.length} invalid`
|
|
323
397
|
);
|
|
324
398
|
if (validCount === 0) {
|
|
325
399
|
return createScrapeErrorResponse(
|
|
@@ -334,17 +408,23 @@ async function handleScrapeLinks(params, reporter = NOOP_REPORTER) {
|
|
|
334
408
|
}
|
|
335
409
|
mcpLog(
|
|
336
410
|
"info",
|
|
337
|
-
`Starting scrape: ${webInputs.length} web + ${redditInputs.length} reddit URL(s)`,
|
|
411
|
+
`Starting scrape: ${webInputs.length} web + ${redditInputs.length} reddit + ${documentInputs.length} document URL(s)`,
|
|
338
412
|
"scrape"
|
|
339
413
|
);
|
|
340
414
|
await reporter.progress(15, 100, "Preparing scraper clients");
|
|
341
415
|
let clients = null;
|
|
342
416
|
try {
|
|
417
|
+
const jinaClient = new JinaClient();
|
|
343
418
|
if (webInputs.length > 0) {
|
|
344
|
-
clients = {
|
|
419
|
+
clients = {
|
|
420
|
+
client: new ScraperClient(),
|
|
421
|
+
jinaClient,
|
|
422
|
+
llmProcessor: createLLMProcessor()
|
|
423
|
+
};
|
|
345
424
|
} else {
|
|
346
425
|
clients = {
|
|
347
426
|
client: null,
|
|
427
|
+
jinaClient,
|
|
348
428
|
llmProcessor: createLLMProcessor()
|
|
349
429
|
};
|
|
350
430
|
}
|
|
@@ -362,20 +442,50 @@ async function handleScrapeLinks(params, reporter = NOOP_REPORTER) {
|
|
|
362
442
|
}
|
|
363
443
|
const enhancedInstruction = enhanceExtractionInstruction(params.extract);
|
|
364
444
|
await reporter.progress(35, 100, "Fetching page content");
|
|
365
|
-
const
|
|
366
|
-
|
|
367
|
-
|
|
445
|
+
const emptyPhase = {
|
|
446
|
+
successItems: [],
|
|
447
|
+
failedContents: [],
|
|
448
|
+
metrics: { successful: 0, failed: 0, totalCredits: 0 },
|
|
449
|
+
binaryDeferred: []
|
|
450
|
+
};
|
|
451
|
+
const [webPhase, redditPhase, documentPhase] = await Promise.all([
|
|
452
|
+
webInputs.length > 0 ? fetchWebBranch(webInputs, clients.client) : Promise.resolve(emptyPhase),
|
|
453
|
+
fetchRedditBranch(redditInputs),
|
|
454
|
+
fetchDocumentBranch(documentInputs, clients.jinaClient)
|
|
368
455
|
]);
|
|
369
|
-
|
|
456
|
+
let deferredPhase = {
|
|
457
|
+
successItems: [],
|
|
458
|
+
failedContents: [],
|
|
459
|
+
metrics: { successful: 0, failed: 0, totalCredits: 0 }
|
|
460
|
+
};
|
|
461
|
+
if (webPhase.binaryDeferred.length > 0) {
|
|
462
|
+
await reporter.log(
|
|
463
|
+
"info",
|
|
464
|
+
`Rerouting ${webPhase.binaryDeferred.length} binary URL(s) from Scrape.do \u2192 Jina Reader`
|
|
465
|
+
);
|
|
466
|
+
deferredPhase = await fetchDocumentBranch(webPhase.binaryDeferred, clients.jinaClient);
|
|
467
|
+
}
|
|
468
|
+
const successItems = [
|
|
469
|
+
...webPhase.successItems,
|
|
470
|
+
...redditPhase.successItems,
|
|
471
|
+
...documentPhase.successItems,
|
|
472
|
+
...deferredPhase.successItems
|
|
473
|
+
];
|
|
370
474
|
const invalidFailed = invalidEntries.map(
|
|
371
475
|
({ url }) => `## ${url}
|
|
372
476
|
|
|
373
477
|
\u274C Invalid URL format`
|
|
374
478
|
);
|
|
375
|
-
const failedContents = [
|
|
479
|
+
const failedContents = [
|
|
480
|
+
...invalidFailed,
|
|
481
|
+
...webPhase.failedContents,
|
|
482
|
+
...redditPhase.failedContents,
|
|
483
|
+
...documentPhase.failedContents,
|
|
484
|
+
...deferredPhase.failedContents
|
|
485
|
+
];
|
|
376
486
|
const metrics = {
|
|
377
|
-
successful: webPhase.metrics.successful + redditPhase.metrics.successful,
|
|
378
|
-
failed: invalidEntries.length + webPhase.metrics.failed + redditPhase.metrics.failed,
|
|
487
|
+
successful: webPhase.metrics.successful + redditPhase.metrics.successful + documentPhase.metrics.successful + deferredPhase.metrics.successful,
|
|
488
|
+
failed: invalidEntries.length + webPhase.metrics.failed + redditPhase.metrics.failed + documentPhase.metrics.failed + deferredPhase.metrics.failed,
|
|
379
489
|
totalCredits: webPhase.metrics.totalCredits
|
|
380
490
|
};
|
|
381
491
|
await reporter.log("info", `Fetched ${metrics.successful} page(s), ${metrics.failed} failed`);
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"version": 3,
|
|
3
3
|
"sources": ["../../../src/tools/scrape.ts"],
|
|
4
|
-
"sourcesContent": ["/**\n * Scrape Links Tool Handler\n *\n * Scrapes many URLs in parallel. Reddit permalinks (reddit.com/r/.../comments/...)\n * are auto-detected and routed through the Reddit API; all other URLs go through\n * the scraper. Both branches feed the same per-URL LLM extraction pipeline.\n *\n * NEVER throws \u2014 every error is returned as a tool-level failure response.\n */\n\nimport type { MCPServer } from 'mcp-use/server';\n\nimport {\n SCRAPER,\n CONCURRENCY,\n getCapabilities,\n getMissingEnvMessage,\n parseEnv,\n} from '../config/index.js';\nimport {\n scrapeLinksOutputSchema,\n scrapeLinksParamsSchema,\n type ScrapeLinksParams,\n type ScrapeLinksOutput,\n} from '../schemas/scrape-links.js';\nimport { ScraperClient } from '../clients/scraper.js';\nimport { RedditClient, type PostResult } from '../clients/reddit.js';\nimport { MarkdownCleaner } from '../services/markdown-cleaner.js';\nimport { createLLMProcessor, processContentWithLLM } from '../services/llm-processor.js';\nimport { removeMetaTags } from '../utils/markdown-formatter.js';\nimport { extractReadableContent } from '../utils/content-extractor.js';\nimport { classifyError } from '../utils/errors.js';\nimport { pMap } from '../utils/concurrency.js';\nimport {\n mcpLog,\n formatSuccess,\n formatError,\n formatBatchHeader,\n formatDuration,\n} from './utils.js';\nimport {\n createToolReporter,\n NOOP_REPORTER,\n toolFailure,\n toolSuccess,\n toToolResponse,\n type ToolExecutionResult,\n type ToolReporter,\n} from './mcp-helpers.js';\n\nconst markdownCleaner = new MarkdownCleaner();\n\nfunction enhanceExtractionInstruction(instruction: string | undefined): string {\n const base = instruction || 'Extract the main content and key information from this page.';\n return `${SCRAPER.EXTRACTION_PREFIX}\\n\\n${base}\\n\\n${SCRAPER.EXTRACTION_SUFFIX}`;\n}\n\n// --- Types ---\n\ninterface ProcessedResult {\n url: string;\n content: string;\n index: number; // original position in 
params.urls[]\n}\n\ninterface ScrapeMetrics {\n successful: number;\n failed: number;\n totalCredits: number;\n}\n\ninterface ScrapePhaseResult {\n successItems: ProcessedResult[];\n failedContents: string[];\n metrics: ScrapeMetrics;\n}\n\ninterface BranchInput {\n url: string;\n origIndex: number;\n}\n\ninterface ScrapeClients {\n client: ScraperClient;\n llmProcessor: ReturnType<typeof createLLMProcessor>;\n}\n\n// --- Reddit URL detection ---\n\nconst REDDIT_HOST = /(?:^|\\.)reddit\\.com$/i;\nconst REDDIT_POST_PERMALINK = /\\/r\\/[^/]+\\/comments\\/[a-z0-9]+/i;\n\nfunction isRedditUrl(url: string): boolean {\n try {\n const u = new URL(url);\n return REDDIT_HOST.test(u.hostname);\n } catch {\n return false;\n }\n}\n\nfunction isRedditPostPermalink(url: string): boolean {\n try {\n const u = new URL(url);\n return REDDIT_HOST.test(u.hostname) && REDDIT_POST_PERMALINK.test(u.pathname);\n } catch {\n return false;\n }\n}\n\n// --- Error helper ---\n\nfunction createScrapeErrorResponse(\n code: string,\n message: string,\n startTime: number,\n retryable = false,\n alternatives?: string[],\n): ToolExecutionResult<ScrapeLinksOutput> {\n return toolFailure(\n `${formatError({\n code,\n message,\n retryable,\n toolName: 'scrape-links',\n howToFix: code === 'NO_URLS' ? 
['Provide at least one valid URL'] : undefined,\n alternatives,\n })}\\n\\nExecution time: ${formatDuration(Date.now() - startTime)}`,\n );\n}\n\n// --- URL partitioning ---\n\ninterface PartitionedUrls {\n webInputs: BranchInput[];\n redditInputs: BranchInput[];\n invalidEntries: { url: string; origIndex: number }[];\n}\n\nfunction partitionUrls(urls: string[]): PartitionedUrls {\n const webInputs: BranchInput[] = [];\n const redditInputs: BranchInput[] = [];\n const invalidEntries: { url: string; origIndex: number }[] = [];\n\n for (let i = 0; i < urls.length; i++) {\n const url = urls[i]!;\n try {\n new URL(url);\n } catch {\n invalidEntries.push({ url, origIndex: i });\n continue;\n }\n if (isRedditUrl(url)) {\n redditInputs.push({ url, origIndex: i });\n } else {\n webInputs.push({ url, origIndex: i });\n }\n }\n\n return { webInputs, redditInputs, invalidEntries };\n}\n\n// --- Web branch ---\n\nasync function fetchWebBranch(\n inputs: BranchInput[],\n client: ScraperClient,\n): Promise<ScrapePhaseResult> {\n if (inputs.length === 0) {\n return { successItems: [], failedContents: [], metrics: { successful: 0, failed: 0, totalCredits: 0 } };\n }\n\n mcpLog('info', `[concurrency] web branch: fanning out ${inputs.length} URL(s) with limit=${CONCURRENCY.SCRAPER}`, 'scrape');\n const urls = inputs.map((i) => i.url);\n const results = await client.scrapeMultiple(urls, { timeout: 60 });\n\n const successItems: ProcessedResult[] = [];\n const failedContents: string[] = [];\n let successful = 0;\n let failed = 0;\n let totalCredits = 0;\n\n for (let i = 0; i < results.length; i++) {\n const result = results[i];\n const origIndex = inputs[i]!.origIndex;\n if (!result) {\n failed++;\n failedContents.push(`## ${inputs[i]!.url}\\n\\n\u274C No result returned`);\n continue;\n }\n\n if (result.error || result.statusCode < 200 || result.statusCode >= 300) {\n failed++;\n const errorMsg = result.error?.message || result.content || `HTTP ${result.statusCode}`;\n 
failedContents.push(`## ${result.url}\\n\\n\u274C Failed to scrape: ${errorMsg}`);\n continue;\n }\n\n successful++;\n totalCredits += result.credits;\n\n let content: string;\n try {\n const readable = extractReadableContent(result.content, result.url);\n const sourceForCleaner = readable.extracted ? readable.content : result.content;\n content = markdownCleaner.processContent(sourceForCleaner);\n } catch {\n content = result.content;\n }\n\n successItems.push({ url: result.url, content, index: origIndex });\n }\n\n return { successItems, failedContents, metrics: { successful, failed, totalCredits } };\n}\n\n// --- Reddit branch ---\n\nfunction formatRedditPostAsMarkdown(result: PostResult): string {\n const { post, comments } = result;\n const lines: string[] = [];\n lines.push(`# ${post.title}`);\n lines.push('');\n lines.push(`**r/${post.subreddit}** \u2022 u/${post.author} \u2022 \u2B06\uFE0F ${post.score} \u2022 \uD83D\uDCAC ${post.commentCount} comments`);\n lines.push(`\uD83D\uDD17 ${post.url}`);\n lines.push('');\n if (post.body) {\n lines.push('## Post content');\n lines.push('');\n lines.push(post.body);\n lines.push('');\n }\n if (comments.length > 0) {\n lines.push(`## Top comments (${comments.length} total)`);\n lines.push('');\n for (const c of comments) {\n const indent = ' '.repeat(c.depth);\n const op = c.isOP ? ' **[OP]**' : '';\n const score = c.score >= 0 ? 
`+${c.score}` : `${c.score}`;\n lines.push(`${indent}- **u/${c.author}**${op} _(${score})_`);\n for (const line of c.body.split('\\n')) {\n lines.push(`${indent} ${line}`);\n }\n lines.push('');\n }\n }\n return lines.join('\\n');\n}\n\nasync function fetchRedditBranch(inputs: BranchInput[]): Promise<ScrapePhaseResult> {\n if (inputs.length === 0) {\n return { successItems: [], failedContents: [], metrics: { successful: 0, failed: 0, totalCredits: 0 } };\n }\n\n const env = parseEnv();\n if (!env.REDDIT_CLIENT_ID || !env.REDDIT_CLIENT_SECRET) {\n const failedContents = inputs.map(\n (i) => `## ${i.url}\\n\\n\u274C Reddit URL detected, but Reddit API is not configured. Set \\`REDDIT_CLIENT_ID\\` and \\`REDDIT_CLIENT_SECRET\\` in the server env to enable threaded Reddit scraping.`,\n );\n return {\n successItems: [],\n failedContents,\n metrics: { successful: 0, failed: inputs.length, totalCredits: 0 },\n };\n }\n\n // Warn for non-permalink Reddit URLs (subreddit homepages, /new, /top, /hot,\n // user profiles). The Reddit API path we call requires /r/.../comments/... \u2014\n // reject upfront so the caller sees a helpful message instead of a 404.\n const [postInputs, nonPermalinks] = inputs.reduce<[BranchInput[], BranchInput[]]>(\n ([posts, rest], input) => {\n if (isRedditPostPermalink(input.url)) posts.push(input);\n else rest.push(input);\n return [posts, rest];\n },\n [[], []],\n );\n\n const nonPermalinkFailed = nonPermalinks.map(\n (i) => `## ${i.url}\\n\\n\u274C Only Reddit post permalinks (/r/<sub>/comments/<id>/...) are supported. 
Use web-search with scope:\"reddit\" to discover post permalinks first.`,\n );\n\n if (postInputs.length === 0) {\n return {\n successItems: [],\n failedContents: nonPermalinkFailed,\n metrics: { successful: 0, failed: nonPermalinks.length, totalCredits: 0 },\n };\n }\n\n mcpLog('info', `[concurrency] reddit branch: fetching ${postInputs.length} post(s) with limit=${CONCURRENCY.REDDIT}`, 'scrape');\n const client = new RedditClient(env.REDDIT_CLIENT_ID, env.REDDIT_CLIENT_SECRET);\n const urls = postInputs.map((i) => i.url);\n const batchResult = await client.batchGetPosts(urls, true);\n const urlToIndex = new Map(postInputs.map((i) => [i.url, i.origIndex]));\n\n const successItems: ProcessedResult[] = [];\n const failedContents: string[] = [...nonPermalinkFailed];\n let successful = 0;\n let failed = nonPermalinks.length;\n\n for (const [url, result] of batchResult.results) {\n const origIndex = urlToIndex.get(url) ?? -1;\n if (result instanceof Error) {\n failed++;\n failedContents.push(`## ${url}\\n\\n\u274C Reddit fetch failed: ${result.message}`);\n continue;\n }\n successful++;\n successItems.push({ url, content: formatRedditPostAsMarkdown(result), index: origIndex });\n }\n\n return { successItems, failedContents, metrics: { successful, failed, totalCredits: 0 } };\n}\n\n// --- LLM extraction (shared by both branches) ---\n\nasync function processItemsWithLlm(\n successItems: ProcessedResult[],\n enhancedInstruction: string,\n llmProcessor: ReturnType<typeof createLLMProcessor>,\n reporter: ToolReporter,\n): Promise<{ items: ProcessedResult[]; llmErrors: number; llmAttempted: number }> {\n let llmErrors = 0;\n\n if (!llmProcessor || successItems.length === 0) {\n if (!llmProcessor && successItems.length > 0) {\n mcpLog('warning', 'LLM unavailable (LLM_API_KEY not set). 
Returning raw scraped content.', 'scrape');\n void reporter.log('warning', 'llm_extractor_unreachable: planner not configured; raw scraped content returned');\n }\n return { items: successItems, llmErrors, llmAttempted: 0 };\n }\n\n mcpLog('info', `[concurrency] llm extraction: fanning out ${successItems.length} item(s) with limit=${CONCURRENCY.LLM_EXTRACTION}`, 'scrape');\n\n const llmResults = await pMap(\n successItems,\n async (item) => {\n mcpLog('debug', `LLM extracting ${item.url}...`, 'scrape');\n\n const llmResult = await processContentWithLLM(\n item.content,\n { enabled: true, extract: enhancedInstruction, url: item.url },\n llmProcessor,\n );\n\n if (llmResult.processed) {\n return { ...item, content: llmResult.content };\n }\n\n llmErrors++;\n mcpLog('warning', `LLM extraction failed for ${item.url}: ${llmResult.error || 'unknown reason'}`, 'scrape');\n void reporter.log('warning', `llm_extractor_unreachable: ${item.url} \u2014 ${llmResult.error || 'unknown reason'}`);\n return item;\n },\n CONCURRENCY.LLM_EXTRACTION,\n );\n\n return { items: llmResults, llmErrors, llmAttempted: successItems.length };\n}\n\n// --- Output assembly ---\n\nfunction assembleContentEntries(successItems: ProcessedResult[], failedContents: string[]): string[] {\n const sorted = [...successItems].sort((a, b) => a.index - b.index);\n const contents = [...failedContents];\n for (const item of sorted) {\n let content = item.content;\n try {\n content = removeMetaTags(content);\n } catch {\n // Use content as-is\n }\n contents.push(`## ${item.url}\\n\\n${content}`);\n }\n return contents;\n}\n\nfunction buildScrapeResponse(\n params: ScrapeLinksParams,\n contents: string[],\n metrics: ScrapeMetrics,\n llmErrors: number,\n executionTime: number,\n llmAccounting: { llmAttempted: number; llmSucceeded: boolean },\n): { content: string; structuredContent: ScrapeLinksOutput } {\n const llmExtras: Record<string, string | number> = {};\n if (llmAccounting.llmAttempted > 0) {\n const ok = 
llmAccounting.llmAttempted - llmErrors;\n llmExtras['LLM extraction'] = `${ok}/${llmAccounting.llmAttempted} succeeded`;\n if (!llmAccounting.llmSucceeded) {\n llmExtras['LLM credit'] = '0 charged (no extraction produced)';\n }\n } else if (llmErrors > 0) {\n llmExtras['LLM extraction failures'] = llmErrors;\n }\n\n const batchHeader = formatBatchHeader({\n title: `Scraped Content (${params.urls.length} URLs)`,\n totalItems: params.urls.length,\n successful: metrics.successful,\n failed: metrics.failed,\n extras: {\n 'Credits used': metrics.totalCredits,\n ...llmExtras,\n },\n });\n\n const formattedContent = formatSuccess({\n title: 'Scraping Complete',\n summary: batchHeader,\n data: contents.join('\\n\\n---\\n\\n'),\n metadata: {\n 'Execution time': formatDuration(executionTime),\n },\n });\n\n const metadata: ScrapeLinksOutput['metadata'] = {\n total_items: params.urls.length,\n successful: metrics.successful,\n failed: metrics.failed,\n execution_time_ms: executionTime,\n total_credits: metrics.totalCredits,\n };\n return { content: formattedContent, structuredContent: { metadata } };\n}\n\n// --- Handler ---\n\nexport async function handleScrapeLinks(\n params: ScrapeLinksParams,\n reporter: ToolReporter = NOOP_REPORTER,\n): Promise<ToolExecutionResult<ScrapeLinksOutput>> {\n const startTime = Date.now();\n\n if (!params.urls || params.urls.length === 0) {\n return createScrapeErrorResponse('NO_URLS', 'No URLs provided', startTime);\n }\n\n const { webInputs, redditInputs, invalidEntries } = partitionUrls(params.urls);\n const validCount = webInputs.length + redditInputs.length;\n\n await reporter.log(\n 'info',\n `Partitioned ${params.urls.length} URL(s): ${webInputs.length} web, ${redditInputs.length} reddit, ${invalidEntries.length} invalid`,\n );\n\n if (validCount === 0) {\n return createScrapeErrorResponse(\n 'INVALID_URLS',\n `All ${params.urls.length} URLs are invalid`,\n startTime,\n false,\n [\n 'web-search(queries=[...], extract=\"...\") \u2014 
search for valid URLs first, then scrape the results',\n ],\n );\n }\n\n mcpLog(\n 'info',\n `Starting scrape: ${webInputs.length} web + ${redditInputs.length} reddit URL(s)`,\n 'scrape',\n );\n await reporter.progress(15, 100, 'Preparing scraper clients');\n\n // Only initialize web clients if we actually have web URLs. Reddit-only\n // batches run without touching the scraper.\n let clients: ScrapeClients | null = null;\n try {\n if (webInputs.length > 0) {\n clients = { client: new ScraperClient(), llmProcessor: createLLMProcessor() };\n } else {\n // Reddit-only: no scraper needed, but still create the LLM processor\n // so the extraction pass runs.\n clients = {\n client: null as unknown as ScraperClient,\n llmProcessor: createLLMProcessor(),\n };\n }\n } catch (error) {\n const err = classifyError(error);\n return createScrapeErrorResponse(\n 'CLIENT_INIT_FAILED',\n `Failed to initialize scraper: ${err.message}`,\n startTime,\n false,\n [\n 'web-search(queries=[\"topic key findings\", \"topic summary\"], extract=\"key findings and summary\") \u2014 search instead of scraping',\n ],\n );\n }\n\n const enhancedInstruction = enhanceExtractionInstruction(params.extract);\n\n await reporter.progress(35, 100, 'Fetching page content');\n\n // Run both branches in parallel. Failures in one branch do not block the other.\n const [webPhase, redditPhase] = await Promise.all([\n webInputs.length > 0\n ? 
fetchWebBranch(webInputs, clients.client)\n : Promise.resolve<ScrapePhaseResult>({ successItems: [], failedContents: [], metrics: { successful: 0, failed: 0, totalCredits: 0 } }),\n fetchRedditBranch(redditInputs),\n ]);\n\n const successItems = [...webPhase.successItems, ...redditPhase.successItems];\n const invalidFailed = invalidEntries.map(\n ({ url }) => `## ${url}\\n\\n\u274C Invalid URL format`,\n );\n const failedContents = [...invalidFailed, ...webPhase.failedContents, ...redditPhase.failedContents];\n const metrics: ScrapeMetrics = {\n successful: webPhase.metrics.successful + redditPhase.metrics.successful,\n failed: invalidEntries.length + webPhase.metrics.failed + redditPhase.metrics.failed,\n totalCredits: webPhase.metrics.totalCredits,\n };\n\n await reporter.log('info', `Fetched ${metrics.successful} page(s), ${metrics.failed} failed`);\n\n if (successItems.length > 0) {\n await reporter.progress(80, 100, 'Running LLM extraction over fetched pages');\n }\n\n const { items: processedItems, llmErrors, llmAttempted } = await processItemsWithLlm(\n successItems,\n enhancedInstruction,\n clients.llmProcessor,\n reporter,\n );\n\n const contents = assembleContentEntries(processedItems, failedContents);\n const executionTime = Date.now() - startTime;\n\n mcpLog(\n 'info',\n `Completed: ${metrics.successful} successful, ${metrics.failed} failed, ${metrics.totalCredits} credits used`,\n 'scrape',\n );\n\n const llmSucceeded = llmAttempted > 0 && llmErrors < llmAttempted;\n const result = buildScrapeResponse(\n params,\n contents,\n metrics,\n llmErrors,\n executionTime,\n { llmAttempted, llmSucceeded },\n );\n\n if (metrics.successful === 0 && metrics.failed > 0) {\n return toolFailure(result.content);\n }\n\n return toolSuccess(result.content, result.structuredContent);\n}\n\nexport function registerScrapeLinksTool(server: MCPServer): void {\n server.tool(\n {\n name: 'scrape-links',\n title: 'Scrape Links',\n description:\n 'Fetch many URLs in parallel and 
run per-URL structured LLM extraction. Auto-detects reddit.com post permalinks and routes them through the Reddit API (threaded post + comments); everything else flows through the HTTP scraper. Safe to call in parallel \u2014 group URLs by context rather than jamming unrelated batches together. Each page returns `## Source`, `## Matches` (verbatim-preserved facts), `## Not found` (explicit gaps), and `## Follow-up signals` (new terms + referenced URLs) that feed the next research loop. Describe the SHAPE of what you want in `extract`, facets separated by `|` (e.g. `root cause | affected versions | fix | workarounds | timeline`).',\n schema: scrapeLinksParamsSchema,\n outputSchema: scrapeLinksOutputSchema,\n annotations: {\n readOnlyHint: true,\n idempotentHint: true,\n destructiveHint: false,\n openWorldHint: true,\n },\n },\n async (args, ctx) => {\n if (!getCapabilities().scraping) {\n return toToolResponse(toolFailure(getMissingEnvMessage('scraping')));\n }\n\n const reporter = createToolReporter(ctx, 'scrape-links');\n const result = await handleScrapeLinks(args, reporter);\n\n await reporter.progress(100, 100, result.isError ? 'Scrape failed' : 'Scrape complete');\n return toToolResponse(result);\n },\n );\n}\n"],
|
|
5
|
-
"mappings": "AAYA;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,OACK;AACP;AAAA,EACE;AAAA,EACA;AAAA,OAGK;AACP,SAAS,qBAAqB;AAC9B,SAAS,oBAAqC;AAC9C,SAAS,uBAAuB;AAChC,SAAS,oBAAoB,6BAA6B;AAC1D,SAAS,sBAAsB;AAC/B,SAAS,8BAA8B;AACvC,SAAS,qBAAqB;AAC9B,SAAS,YAAY;AACrB;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,OACK;AACP;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,OAGK;AAEP,MAAM,kBAAkB,IAAI,gBAAgB;AAE5C,SAAS,6BAA6B,aAAyC;AAC7E,QAAM,OAAO,eAAe;AAC5B,SAAO,GAAG,QAAQ,iBAAiB;AAAA;AAAA,EAAO,IAAI;AAAA;AAAA,EAAO,QAAQ,iBAAiB;AAChF;AAkCA,MAAM,cAAc;AACpB,MAAM,wBAAwB;AAE9B,SAAS,YAAY,KAAsB;AACzC,MAAI;AACF,UAAM,IAAI,IAAI,IAAI,GAAG;AACrB,WAAO,YAAY,KAAK,EAAE,QAAQ;AAAA,EACpC,QAAQ;AACN,WAAO;AAAA,EACT;AACF;AAEA,SAAS,sBAAsB,KAAsB;AACnD,MAAI;AACF,UAAM,IAAI,IAAI,IAAI,GAAG;AACrB,WAAO,YAAY,KAAK,EAAE,QAAQ,KAAK,sBAAsB,KAAK,EAAE,QAAQ;AAAA,EAC9E,QAAQ;AACN,WAAO;AAAA,EACT;AACF;AAIA,SAAS,0BACP,MACA,SACA,WACA,YAAY,OACZ,cACwC;AACxC,SAAO;AAAA,IACL,GAAG,YAAY;AAAA,MACb;AAAA,MACA;AAAA,MACA;AAAA,MACA,UAAU;AAAA,MACV,UAAU,SAAS,YAAY,CAAC,gCAAgC,IAAI;AAAA,MACpE;AAAA,IACF,CAAC,CAAC;AAAA;AAAA,kBAAuB,eAAe,KAAK,IAAI,IAAI,SAAS,CAAC;AAAA,EACjE;AACF;AAUA,SAAS,cAAc,MAAiC;AACtD,QAAM,YAA2B,CAAC;AAClC,QAAM,eAA8B,CAAC;AACrC,QAAM,iBAAuD,CAAC;AAE9D,WAAS,IAAI,GAAG,IAAI,KAAK,QAAQ,KAAK;AACpC,UAAM,MAAM,KAAK,CAAC;AAClB,QAAI;AACF,UAAI,IAAI,GAAG;AAAA,IACb,QAAQ;AACN,qBAAe,KAAK,EAAE,KAAK,WAAW,EAAE,CAAC;AACzC;AAAA,IACF;AACA,QAAI,YAAY,GAAG,GAAG;AACpB,mBAAa,KAAK,EAAE,KAAK,WAAW,EAAE,CAAC;AAAA,IACzC,OAAO;AACL,gBAAU,KAAK,EAAE,KAAK,WAAW,EAAE,CAAC;AAAA,IACtC;AAAA,EACF;AAEA,SAAO,EAAE,WAAW,cAAc,eAAe;AACnD;AAIA,eAAe,eACb,QACA,QAC4B;AAC5B,MAAI,OAAO,WAAW,GAAG;AACvB,WAAO,EAAE,cAAc,CAAC,GAAG,gBAAgB,CAAC,GAAG,SAAS,EAAE,YAAY,GAAG,QAAQ,GAAG,cAAc,EAAE,EAAE;AAAA,EACxG;AAEA,SAAO,QAAQ,yCAAyC,OAAO,MAAM,sBAAsB,YAAY,OAAO,IAAI,QAAQ;AAC1H,QAAM,OAAO,OAAO,IAAI,CAAC,MAAM,EAAE,GAAG;AACpC,QAAM,UAAU,MAAM,OAAO,eAAe,MAAM,EAAE,SAAS,GAAG,CAAC;AAEjE,QAAM,eAAkC,CAAC;AACzC,QAAM,iBAA2B,CAAC;AAClC,MAAI,aAAa;AACjB,MAAI,SAAS;AACb,MAAI,eAAe;AAEnB,WAAS,IAA
I,GAAG,IAAI,QAAQ,QAAQ,KAAK;AACvC,UAAM,SAAS,QAAQ,CAAC;AACxB,UAAM,YAAY,OAAO,CAAC,EAAG;AAC7B,QAAI,CAAC,QAAQ;AACX;AACA,qBAAe,KAAK,MAAM,OAAO,CAAC,EAAG,GAAG;AAAA;AAAA,0BAA0B;AAClE;AAAA,IACF;AAEA,QAAI,OAAO,SAAS,OAAO,aAAa,OAAO,OAAO,cAAc,KAAK;AACvE;AACA,YAAM,WAAW,OAAO,OAAO,WAAW,OAAO,WAAW,QAAQ,OAAO,UAAU;AACrF,qBAAe,KAAK,MAAM,OAAO,GAAG;AAAA;AAAA,2BAA2B,QAAQ,EAAE;AACzE;AAAA,IACF;AAEA;AACA,oBAAgB,OAAO;AAEvB,QAAI;AACJ,QAAI;AACF,YAAM,WAAW,uBAAuB,OAAO,SAAS,OAAO,GAAG;AAClE,YAAM,mBAAmB,SAAS,YAAY,SAAS,UAAU,OAAO;AACxE,gBAAU,gBAAgB,eAAe,gBAAgB;AAAA,IAC3D,QAAQ;AACN,gBAAU,OAAO;AAAA,IACnB;AAEA,iBAAa,KAAK,EAAE,KAAK,OAAO,KAAK,SAAS,OAAO,UAAU,CAAC;AAAA,EAClE;AAEA,SAAO,EAAE,cAAc,gBAAgB,SAAS,EAAE,YAAY,QAAQ,aAAa,EAAE;AACvF;AAIA,SAAS,2BAA2B,QAA4B;AAC9D,QAAM,EAAE,MAAM,SAAS,IAAI;AAC3B,QAAM,QAAkB,CAAC;AACzB,QAAM,KAAK,KAAK,KAAK,KAAK,EAAE;AAC5B,QAAM,KAAK,EAAE;AACb,QAAM,KAAK,OAAO,KAAK,SAAS,eAAU,KAAK,MAAM,wBAAS,KAAK,KAAK,qBAAS,KAAK,YAAY,WAAW;AAC7G,QAAM,KAAK,aAAM,KAAK,GAAG,EAAE;AAC3B,QAAM,KAAK,EAAE;AACb,MAAI,KAAK,MAAM;AACb,UAAM,KAAK,iBAAiB;AAC5B,UAAM,KAAK,EAAE;AACb,UAAM,KAAK,KAAK,IAAI;AACpB,UAAM,KAAK,EAAE;AAAA,EACf;AACA,MAAI,SAAS,SAAS,GAAG;AACvB,UAAM,KAAK,oBAAoB,SAAS,MAAM,SAAS;AACvD,UAAM,KAAK,EAAE;AACb,eAAW,KAAK,UAAU;AACxB,YAAM,SAAS,KAAK,OAAO,EAAE,KAAK;AAClC,YAAM,KAAK,EAAE,OAAO,cAAc;AAClC,YAAM,QAAQ,EAAE,SAAS,IAAI,IAAI,EAAE,KAAK,KAAK,GAAG,EAAE,KAAK;AACvD,YAAM,KAAK,GAAG,MAAM,SAAS,EAAE,MAAM,KAAK,EAAE,MAAM,KAAK,IAAI;AAC3D,iBAAW,QAAQ,EAAE,KAAK,MAAM,IAAI,GAAG;AACrC,cAAM,KAAK,GAAG,MAAM,KAAK,IAAI,EAAE;AAAA,MACjC;AACA,YAAM,KAAK,EAAE;AAAA,IACf;AAAA,EACF;AACA,SAAO,MAAM,KAAK,IAAI;AACxB;AAEA,eAAe,kBAAkB,QAAmD;AAClF,MAAI,OAAO,WAAW,GAAG;AACvB,WAAO,EAAE,cAAc,CAAC,GAAG,gBAAgB,CAAC,GAAG,SAAS,EAAE,YAAY,GAAG,QAAQ,GAAG,cAAc,EAAE,EAAE;AAAA,EACxG;AAEA,QAAM,MAAM,SAAS;AACrB,MAAI,CAAC,IAAI,oBAAoB,CAAC,IAAI,sBAAsB;AACtD,UAAMA,kBAAiB,OAAO;AAAA,MAC5B,CAAC,MAAM,MAAM,EAAE,GAAG;AAAA;AAAA;AAAA,IACpB;AACA,WAAO;AAAA,MACL,cAAc,CAAC;AAAA,MACf,gBAAAA;AAAA,MACA,SAAS,EAAE,YAAY,GAAG,QAAQ,OAAO,QAAQ,cAAc,EAAE;AAAA,IACnE;AAAA,EACF;AAKA,QAAM,CAAC,YA
AY,aAAa,IAAI,OAAO;AAAA,IACzC,CAAC,CAAC,OAAO,IAAI,GAAG,UAAU;AACxB,UAAI,sBAAsB,MAAM,GAAG,EAAG,OAAM,KAAK,KAAK;AAAA,UACjD,MAAK,KAAK,KAAK;AACpB,aAAO,CAAC,OAAO,IAAI;AAAA,IACrB;AAAA,IACA,CAAC,CAAC,GAAG,CAAC,CAAC;AAAA,EACT;AAEA,QAAM,qBAAqB,cAAc;AAAA,IACvC,CAAC,MAAM,MAAM,EAAE,GAAG;AAAA;AAAA;AAAA,EACpB;AAEA,MAAI,WAAW,WAAW,GAAG;AAC3B,WAAO;AAAA,MACL,cAAc,CAAC;AAAA,MACf,gBAAgB;AAAA,MAChB,SAAS,EAAE,YAAY,GAAG,QAAQ,cAAc,QAAQ,cAAc,EAAE;AAAA,IAC1E;AAAA,EACF;AAEA,SAAO,QAAQ,yCAAyC,WAAW,MAAM,uBAAuB,YAAY,MAAM,IAAI,QAAQ;AAC9H,QAAM,SAAS,IAAI,aAAa,IAAI,kBAAkB,IAAI,oBAAoB;AAC9E,QAAM,OAAO,WAAW,IAAI,CAAC,MAAM,EAAE,GAAG;AACxC,QAAM,cAAc,MAAM,OAAO,cAAc,MAAM,IAAI;AACzD,QAAM,aAAa,IAAI,IAAI,WAAW,IAAI,CAAC,MAAM,CAAC,EAAE,KAAK,EAAE,SAAS,CAAC,CAAC;AAEtE,QAAM,eAAkC,CAAC;AACzC,QAAM,iBAA2B,CAAC,GAAG,kBAAkB;AACvD,MAAI,aAAa;AACjB,MAAI,SAAS,cAAc;AAE3B,aAAW,CAAC,KAAK,MAAM,KAAK,YAAY,SAAS;AAC/C,UAAM,YAAY,WAAW,IAAI,GAAG,KAAK;AACzC,QAAI,kBAAkB,OAAO;AAC3B;AACA,qBAAe,KAAK,MAAM,GAAG;AAAA;AAAA,8BAA8B,OAAO,OAAO,EAAE;AAC3E;AAAA,IACF;AACA;AACA,iBAAa,KAAK,EAAE,KAAK,SAAS,2BAA2B,MAAM,GAAG,OAAO,UAAU,CAAC;AAAA,EAC1F;AAEA,SAAO,EAAE,cAAc,gBAAgB,SAAS,EAAE,YAAY,QAAQ,cAAc,EAAE,EAAE;AAC1F;AAIA,eAAe,oBACb,cACA,qBACA,cACA,UACgF;AAChF,MAAI,YAAY;AAEhB,MAAI,CAAC,gBAAgB,aAAa,WAAW,GAAG;AAC9C,QAAI,CAAC,gBAAgB,aAAa,SAAS,GAAG;AAC5C,aAAO,WAAW,yEAAyE,QAAQ;AACnG,WAAK,SAAS,IAAI,WAAW,iFAAiF;AAAA,IAChH;AACA,WAAO,EAAE,OAAO,cAAc,WAAW,cAAc,EAAE;AAAA,EAC3D;AAEA,SAAO,QAAQ,6CAA6C,aAAa,MAAM,uBAAuB,YAAY,cAAc,IAAI,QAAQ;AAE5I,QAAM,aAAa,MAAM;AAAA,IACvB;AAAA,IACA,OAAO,SAAS;AACd,aAAO,SAAS,kBAAkB,KAAK,GAAG,OAAO,QAAQ;AAEzD,YAAM,YAAY,MAAM;AAAA,QACtB,KAAK;AAAA,QACL,EAAE,SAAS,MAAM,SAAS,qBAAqB,KAAK,KAAK,IAAI;AAAA,QAC7D;AAAA,MACF;AAEA,UAAI,UAAU,WAAW;AACvB,eAAO,EAAE,GAAG,MAAM,SAAS,UAAU,QAAQ;AAAA,MAC/C;AAEA;AACA,aAAO,WAAW,6BAA6B,KAAK,GAAG,KAAK,UAAU,SAAS,gBAAgB,IAAI,QAAQ;AAC3G,WAAK,SAAS,IAAI,WAAW,8BAA8B,KAAK,GAAG,WAAM,UAAU,SAAS,gBAAgB,EAAE;AAC9G,aAAO;AAAA,IACT;AAAA,IACA,YAAY;AAAA,EACd;AAEA,SAAO,EAAE,OAAO,YAAY,WAAW,cAAc,aAAa,OAAO;AAC3E;AAIA,SAAS,uBAAuB,cAAiC,gBAAoC;AACn
G,QAAM,SAAS,CAAC,GAAG,YAAY,EAAE,KAAK,CAAC,GAAG,MAAM,EAAE,QAAQ,EAAE,KAAK;AACjE,QAAM,WAAW,CAAC,GAAG,cAAc;AACnC,aAAW,QAAQ,QAAQ;AACzB,QAAI,UAAU,KAAK;AACnB,QAAI;AACF,gBAAU,eAAe,OAAO;AAAA,IAClC,QAAQ;AAAA,IAER;AACA,aAAS,KAAK,MAAM,KAAK,GAAG;AAAA;AAAA,EAAO,OAAO,EAAE;AAAA,EAC9C;AACA,SAAO;AACT;AAEA,SAAS,oBACP,QACA,UACA,SACA,WACA,eACA,eAC2D;AAC3D,QAAM,YAA6C,CAAC;AACpD,MAAI,cAAc,eAAe,GAAG;AAClC,UAAM,KAAK,cAAc,eAAe;AACxC,cAAU,gBAAgB,IAAI,GAAG,EAAE,IAAI,cAAc,YAAY;AACjE,QAAI,CAAC,cAAc,cAAc;AAC/B,gBAAU,YAAY,IAAI;AAAA,IAC5B;AAAA,EACF,WAAW,YAAY,GAAG;AACxB,cAAU,yBAAyB,IAAI;AAAA,EACzC;AAEA,QAAM,cAAc,kBAAkB;AAAA,IACpC,OAAO,oBAAoB,OAAO,KAAK,MAAM;AAAA,IAC7C,YAAY,OAAO,KAAK;AAAA,IACxB,YAAY,QAAQ;AAAA,IACpB,QAAQ,QAAQ;AAAA,IAChB,QAAQ;AAAA,MACN,gBAAgB,QAAQ;AAAA,MACxB,GAAG;AAAA,IACL;AAAA,EACF,CAAC;AAED,QAAM,mBAAmB,cAAc;AAAA,IACrC,OAAO;AAAA,IACP,SAAS;AAAA,IACT,MAAM,SAAS,KAAK,aAAa;AAAA,IACjC,UAAU;AAAA,MACR,kBAAkB,eAAe,aAAa;AAAA,IAChD;AAAA,EACF,CAAC;AAED,QAAM,WAA0C;AAAA,IAC9C,aAAa,OAAO,KAAK;AAAA,IACzB,YAAY,QAAQ;AAAA,IACpB,QAAQ,QAAQ;AAAA,IAChB,mBAAmB;AAAA,IACnB,eAAe,QAAQ;AAAA,EACzB;AACA,SAAO,EAAE,SAAS,kBAAkB,mBAAmB,EAAE,SAAS,EAAE;AACtE;AAIA,eAAsB,kBACpB,QACA,WAAyB,eACwB;AACjD,QAAM,YAAY,KAAK,IAAI;AAE3B,MAAI,CAAC,OAAO,QAAQ,OAAO,KAAK,WAAW,GAAG;AAC5C,WAAO,0BAA0B,WAAW,oBAAoB,SAAS;AAAA,EAC3E;AAEA,QAAM,EAAE,WAAW,cAAc,eAAe,IAAI,cAAc,OAAO,IAAI;AAC7E,QAAM,aAAa,UAAU,SAAS,aAAa;AAEnD,QAAM,SAAS;AAAA,IACb;AAAA,IACA,eAAe,OAAO,KAAK,MAAM,YAAY,UAAU,MAAM,SAAS,aAAa,MAAM,YAAY,eAAe,MAAM;AAAA,EAC5H;AAEA,MAAI,eAAe,GAAG;AACpB,WAAO;AAAA,MACL;AAAA,MACA,OAAO,OAAO,KAAK,MAAM;AAAA,MACzB;AAAA,MACA;AAAA,MACA;AAAA,QACE;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA;AAAA,IACE;AAAA,IACA,oBAAoB,UAAU,MAAM,UAAU,aAAa,MAAM;AAAA,IACjE;AAAA,EACF;AACA,QAAM,SAAS,SAAS,IAAI,KAAK,2BAA2B;AAI5D,MAAI,UAAgC;AACpC,MAAI;AACF,QAAI,UAAU,SAAS,GAAG;AACxB,gBAAU,EAAE,QAAQ,IAAI,cAAc,GAAG,cAAc,mBAAmB,EAAE;AAAA,IAC9E,OAAO;AAGL,gBAAU;AAAA,QACR,QAAQ;AAAA,QACR,cAAc,mBAAmB;AAAA,MACnC;AAAA,IACF;AAAA,EACF,SAAS,OAAO;AACd,UAAM,MAAM,cAAc,KAAK;AAC/B,WAAO;AAAA,MACL;AAAA,MACA,iCAAiC,IAA
I,OAAO;AAAA,MAC5C;AAAA,MACA;AAAA,MACA;AAAA,QACE;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,QAAM,sBAAsB,6BAA6B,OAAO,OAAO;AAEvE,QAAM,SAAS,SAAS,IAAI,KAAK,uBAAuB;AAGxD,QAAM,CAAC,UAAU,WAAW,IAAI,MAAM,QAAQ,IAAI;AAAA,IAChD,UAAU,SAAS,IACf,eAAe,WAAW,QAAQ,MAAM,IACxC,QAAQ,QAA2B,EAAE,cAAc,CAAC,GAAG,gBAAgB,CAAC,GAAG,SAAS,EAAE,YAAY,GAAG,QAAQ,GAAG,cAAc,EAAE,EAAE,CAAC;AAAA,IACvI,kBAAkB,YAAY;AAAA,EAChC,CAAC;AAED,QAAM,eAAe,CAAC,GAAG,SAAS,cAAc,GAAG,YAAY,YAAY;AAC3E,QAAM,gBAAgB,eAAe;AAAA,IACnC,CAAC,EAAE,IAAI,MAAM,MAAM,GAAG;AAAA;AAAA;AAAA,EACxB;AACA,QAAM,iBAAiB,CAAC,GAAG,eAAe,GAAG,SAAS,gBAAgB,GAAG,YAAY,cAAc;AACnG,QAAM,UAAyB;AAAA,IAC7B,YAAY,SAAS,QAAQ,aAAa,YAAY,QAAQ;AAAA,IAC9D,QAAQ,eAAe,SAAS,SAAS,QAAQ,SAAS,YAAY,QAAQ;AAAA,IAC9E,cAAc,SAAS,QAAQ;AAAA,EACjC;AAEA,QAAM,SAAS,IAAI,QAAQ,WAAW,QAAQ,UAAU,aAAa,QAAQ,MAAM,SAAS;AAE5F,MAAI,aAAa,SAAS,GAAG;AAC3B,UAAM,SAAS,SAAS,IAAI,KAAK,2CAA2C;AAAA,EAC9E;AAEA,QAAM,EAAE,OAAO,gBAAgB,WAAW,aAAa,IAAI,MAAM;AAAA,IAC/D;AAAA,IACA;AAAA,IACA,QAAQ;AAAA,IACR;AAAA,EACF;AAEA,QAAM,WAAW,uBAAuB,gBAAgB,cAAc;AACtE,QAAM,gBAAgB,KAAK,IAAI,IAAI;AAEnC;AAAA,IACE;AAAA,IACA,cAAc,QAAQ,UAAU,gBAAgB,QAAQ,MAAM,YAAY,QAAQ,YAAY;AAAA,IAC9F;AAAA,EACF;AAEA,QAAM,eAAe,eAAe,KAAK,YAAY;AACrD,QAAM,SAAS;AAAA,IACb;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,EAAE,cAAc,aAAa;AAAA,EAC/B;AAEA,MAAI,QAAQ,eAAe,KAAK,QAAQ,SAAS,GAAG;AAClD,WAAO,YAAY,OAAO,OAAO;AAAA,EACnC;AAEA,SAAO,YAAY,OAAO,SAAS,OAAO,iBAAiB;AAC7D;AAEO,SAAS,wBAAwB,QAAyB;AAC/D,SAAO;AAAA,IACL;AAAA,MACE,MAAM;AAAA,MACN,OAAO;AAAA,MACP,aACE;AAAA,MACF,QAAQ;AAAA,MACR,cAAc;AAAA,MACd,aAAa;AAAA,QACX,cAAc;AAAA,QACd,gBAAgB;AAAA,QAChB,iBAAiB;AAAA,QACjB,eAAe;AAAA,MACjB;AAAA,IACF;AAAA,IACA,OAAO,MAAM,QAAQ;AACnB,UAAI,CAAC,gBAAgB,EAAE,UAAU;AAC/B,eAAO,eAAe,YAAY,qBAAqB,UAAU,CAAC,CAAC;AAAA,MACrE;AAEA,YAAM,WAAW,mBAAmB,KAAK,cAAc;AACvD,YAAM,SAAS,MAAM,kBAAkB,MAAM,QAAQ;AAErD,YAAM,SAAS,SAAS,KAAK,KAAK,OAAO,UAAU,kBAAkB,iBAAiB;AACtF,aAAO,eAAe,MAAM;AAAA,IAC9B;AAAA,EACF;AACF;",
|
|
4
|
+
"sourcesContent": ["/**\n * Scrape Links Tool Handler\n *\n * Scrapes many URLs in parallel. Reddit permalinks (reddit.com/r/.../comments/...)\n * are auto-detected and routed through the Reddit API; all other URLs go through\n * the scraper. Both branches feed the same per-URL LLM extraction pipeline.\n *\n * NEVER throws \u2014 every error is returned as a tool-level failure response.\n */\n\nimport type { MCPServer } from 'mcp-use/server';\n\nimport {\n SCRAPER,\n CONCURRENCY,\n getCapabilities,\n getMissingEnvMessage,\n parseEnv,\n} from '../config/index.js';\nimport {\n scrapeLinksOutputSchema,\n scrapeLinksParamsSchema,\n type ScrapeLinksParams,\n type ScrapeLinksOutput,\n} from '../schemas/scrape-links.js';\nimport { ScraperClient } from '../clients/scraper.js';\nimport { RedditClient, type PostResult } from '../clients/reddit.js';\nimport { JinaClient } from '../clients/jina.js';\nimport { MarkdownCleaner } from '../services/markdown-cleaner.js';\nimport { createLLMProcessor, processContentWithLLM } from '../services/llm-processor.js';\nimport { removeMetaTags } from '../utils/markdown-formatter.js';\nimport { extractReadableContent } from '../utils/content-extractor.js';\nimport { classifyError, ErrorCode } from '../utils/errors.js';\nimport { isDocumentUrl } from '../utils/source-type.js';\nimport { pMap, pMapSettled } from '../utils/concurrency.js';\nimport {\n mcpLog,\n formatSuccess,\n formatError,\n formatBatchHeader,\n formatDuration,\n} from './utils.js';\nimport {\n createToolReporter,\n NOOP_REPORTER,\n toolFailure,\n toolSuccess,\n toToolResponse,\n type ToolExecutionResult,\n type ToolReporter,\n} from './mcp-helpers.js';\n\nconst markdownCleaner = new MarkdownCleaner();\n\nfunction enhanceExtractionInstruction(instruction: string | undefined): string {\n const base = instruction || 'Extract the main content and key information from this page.';\n return `${SCRAPER.EXTRACTION_PREFIX}\\n\\n${base}\\n\\n${SCRAPER.EXTRACTION_SUFFIX}`;\n}\n\n// --- 
Types ---\n\ninterface ProcessedResult {\n url: string;\n content: string;\n index: number; // original position in params.urls[]\n}\n\ninterface ScrapeMetrics {\n successful: number;\n failed: number;\n totalCredits: number;\n}\n\ninterface ScrapePhaseResult {\n successItems: ProcessedResult[];\n failedContents: string[];\n metrics: ScrapeMetrics;\n}\n\ninterface BranchInput {\n url: string;\n origIndex: number;\n}\n\ninterface ScrapeClients {\n client: ScraperClient;\n jinaClient: JinaClient;\n llmProcessor: ReturnType<typeof createLLMProcessor>;\n}\n\ninterface WebPhaseResult extends ScrapePhaseResult {\n /** URLs that Scrape.do returned as binary content; re-run through Jina. */\n binaryDeferred: BranchInput[];\n}\n\n// --- Reddit URL detection ---\n\nconst REDDIT_HOST = /(?:^|\\.)reddit\\.com$/i;\nconst REDDIT_POST_PERMALINK = /\\/r\\/[^/]+\\/comments\\/[a-z0-9]+/i;\n\nfunction isRedditUrl(url: string): boolean {\n try {\n const u = new URL(url);\n return REDDIT_HOST.test(u.hostname);\n } catch {\n return false;\n }\n}\n\nfunction isRedditPostPermalink(url: string): boolean {\n try {\n const u = new URL(url);\n return REDDIT_HOST.test(u.hostname) && REDDIT_POST_PERMALINK.test(u.pathname);\n } catch {\n return false;\n }\n}\n\n// --- Error helper ---\n\nfunction createScrapeErrorResponse(\n code: string,\n message: string,\n startTime: number,\n retryable = false,\n alternatives?: string[],\n): ToolExecutionResult<ScrapeLinksOutput> {\n return toolFailure(\n `${formatError({\n code,\n message,\n retryable,\n toolName: 'scrape-links',\n howToFix: code === 'NO_URLS' ? 
['Provide at least one valid URL'] : undefined,\n alternatives,\n })}\\n\\nExecution time: ${formatDuration(Date.now() - startTime)}`,\n );\n}\n\n// --- URL partitioning ---\n\ninterface PartitionedUrls {\n webInputs: BranchInput[];\n redditInputs: BranchInput[];\n documentInputs: BranchInput[];\n invalidEntries: { url: string; origIndex: number }[];\n}\n\nfunction partitionUrls(urls: string[]): PartitionedUrls {\n const webInputs: BranchInput[] = [];\n const redditInputs: BranchInput[] = [];\n const documentInputs: BranchInput[] = [];\n const invalidEntries: { url: string; origIndex: number }[] = [];\n\n for (let i = 0; i < urls.length; i++) {\n const url = urls[i]!;\n try {\n new URL(url);\n } catch {\n invalidEntries.push({ url, origIndex: i });\n continue;\n }\n // Document URLs (.pdf/.docx/.pptx/.xlsx) go straight to Jina Reader \u2014\n // bypassing Scrape.do because it cannot decode binary bodies. Ordered\n // before the Reddit check so a hypothetical PDF on a reddit-adjacent host\n // still takes the document path.\n if (isDocumentUrl(url)) {\n documentInputs.push({ url, origIndex: i });\n } else if (isRedditUrl(url)) {\n redditInputs.push({ url, origIndex: i });\n } else {\n webInputs.push({ url, origIndex: i });\n }\n }\n\n return { webInputs, redditInputs, documentInputs, invalidEntries };\n}\n\n// --- Web branch ---\n\nasync function fetchWebBranch(\n inputs: BranchInput[],\n client: ScraperClient,\n): Promise<WebPhaseResult> {\n if (inputs.length === 0) {\n return {\n successItems: [],\n failedContents: [],\n metrics: { successful: 0, failed: 0, totalCredits: 0 },\n binaryDeferred: [],\n };\n }\n\n mcpLog('info', `[concurrency] web branch: fanning out ${inputs.length} URL(s) with limit=${CONCURRENCY.SCRAPER}`, 'scrape');\n const urls = inputs.map((i) => i.url);\n const results = await client.scrapeMultiple(urls, { timeout: 60 });\n const urlToIndex = new Map(inputs.map((i) => [i.url, i.origIndex]));\n\n const successItems: ProcessedResult[] = [];\n 
const failedContents: string[] = [];\n const binaryDeferred: BranchInput[] = [];\n let successful = 0;\n let failed = 0;\n let totalCredits = 0;\n\n for (let i = 0; i < results.length; i++) {\n const result = results[i];\n const origIndex = inputs[i]!.origIndex;\n if (!result) {\n failed++;\n failedContents.push(`## ${inputs[i]!.url}\\n\\n\u274C No result returned`);\n continue;\n }\n\n // Binary document detected by content-type \u2014 defer to Jina Reader.\n // These URLs are not counted as failures yet; the handler will re-run\n // them through the document branch and merge results.\n if (result.error?.code === ErrorCode.UNSUPPORTED_BINARY_CONTENT) {\n binaryDeferred.push({\n url: result.url,\n origIndex: urlToIndex.get(result.url) ?? origIndex,\n });\n continue;\n }\n\n if (result.error || result.statusCode < 200 || result.statusCode >= 300) {\n failed++;\n const errorMsg = result.error?.message || result.content || `HTTP ${result.statusCode}`;\n failedContents.push(`## ${result.url}\\n\\n\u274C Failed to scrape: ${errorMsg}`);\n continue;\n }\n\n successful++;\n totalCredits += result.credits;\n\n let content: string;\n try {\n const readable = extractReadableContent(result.content, result.url);\n const sourceForCleaner = readable.extracted ? 
readable.content : result.content;\n content = markdownCleaner.processContent(sourceForCleaner);\n } catch {\n content = result.content;\n }\n\n successItems.push({ url: result.url, content, index: origIndex });\n }\n\n return {\n successItems,\n failedContents,\n metrics: { successful, failed, totalCredits },\n binaryDeferred,\n };\n}\n\n// --- Document branch (Jina Reader) ---\n\nasync function fetchDocumentBranch(\n inputs: BranchInput[],\n jinaClient: JinaClient,\n): Promise<ScrapePhaseResult> {\n if (inputs.length === 0) {\n return { successItems: [], failedContents: [], metrics: { successful: 0, failed: 0, totalCredits: 0 } };\n }\n\n mcpLog(\n 'info',\n `[concurrency] document branch (jina): converting ${inputs.length} URL(s) with limit=${CONCURRENCY.SCRAPER}`,\n 'scrape',\n );\n\n const results = await pMapSettled(\n inputs,\n (input) => jinaClient.convert({ url: input.url }),\n CONCURRENCY.SCRAPER,\n );\n\n const successItems: ProcessedResult[] = [];\n const failedContents: string[] = [];\n let successful = 0;\n let failed = 0;\n\n for (let i = 0; i < results.length; i++) {\n const settled = results[i];\n const input = inputs[i]!;\n if (!settled) {\n failed++;\n failedContents.push(`## ${input.url}\\n\\n\u274C No result returned (document conversion)`);\n continue;\n }\n if (settled.status === 'rejected') {\n failed++;\n const reason = settled.reason instanceof Error ? 
settled.reason.message : String(settled.reason);\n failedContents.push(`## ${input.url}\\n\\n\u274C Document conversion failed: ${reason}`);\n continue;\n }\n\n const result = settled.value;\n if (result.error || result.statusCode < 200 || result.statusCode >= 300) {\n failed++;\n const errorMsg = result.error?.message || `HTTP ${result.statusCode}`;\n failedContents.push(`## ${input.url}\\n\\n\u274C Document conversion failed: ${errorMsg}`);\n continue;\n }\n\n successful++;\n successItems.push({ url: input.url, content: result.content, index: input.origIndex });\n }\n\n return { successItems, failedContents, metrics: { successful, failed, totalCredits: 0 } };\n}\n\n// --- Reddit branch ---\n\nfunction formatRedditPostAsMarkdown(result: PostResult): string {\n const { post, comments } = result;\n const lines: string[] = [];\n lines.push(`# ${post.title}`);\n lines.push('');\n lines.push(`**r/${post.subreddit}** \u2022 u/${post.author} \u2022 \u2B06\uFE0F ${post.score} \u2022 \uD83D\uDCAC ${post.commentCount} comments`);\n lines.push(`\uD83D\uDD17 ${post.url}`);\n lines.push('');\n if (post.body) {\n lines.push('## Post content');\n lines.push('');\n lines.push(post.body);\n lines.push('');\n }\n if (comments.length > 0) {\n lines.push(`## Top comments (${comments.length} total)`);\n lines.push('');\n for (const c of comments) {\n const indent = ' '.repeat(c.depth);\n const op = c.isOP ? ' **[OP]**' : '';\n const score = c.score >= 0 ? 
`+${c.score}` : `${c.score}`;\n lines.push(`${indent}- **u/${c.author}**${op} _(${score})_`);\n for (const line of c.body.split('\\n')) {\n lines.push(`${indent} ${line}`);\n }\n lines.push('');\n }\n }\n return lines.join('\\n');\n}\n\nasync function fetchRedditBranch(inputs: BranchInput[]): Promise<ScrapePhaseResult> {\n if (inputs.length === 0) {\n return { successItems: [], failedContents: [], metrics: { successful: 0, failed: 0, totalCredits: 0 } };\n }\n\n const env = parseEnv();\n if (!env.REDDIT_CLIENT_ID || !env.REDDIT_CLIENT_SECRET) {\n const failedContents = inputs.map(\n (i) => `## ${i.url}\\n\\n\u274C Reddit URL detected, but Reddit API is not configured. Set \\`REDDIT_CLIENT_ID\\` and \\`REDDIT_CLIENT_SECRET\\` in the server env to enable threaded Reddit scraping.`,\n );\n return {\n successItems: [],\n failedContents,\n metrics: { successful: 0, failed: inputs.length, totalCredits: 0 },\n };\n }\n\n // Warn for non-permalink Reddit URLs (subreddit homepages, /new, /top, /hot,\n // user profiles). The Reddit API path we call requires /r/.../comments/... \u2014\n // reject upfront so the caller sees a helpful message instead of a 404.\n const [postInputs, nonPermalinks] = inputs.reduce<[BranchInput[], BranchInput[]]>(\n ([posts, rest], input) => {\n if (isRedditPostPermalink(input.url)) posts.push(input);\n else rest.push(input);\n return [posts, rest];\n },\n [[], []],\n );\n\n const nonPermalinkFailed = nonPermalinks.map(\n (i) => `## ${i.url}\\n\\n\u274C Only Reddit post permalinks (/r/<sub>/comments/<id>/...) are supported. 
Use web-search with scope:\"reddit\" to discover post permalinks first.`,\n );\n\n if (postInputs.length === 0) {\n return {\n successItems: [],\n failedContents: nonPermalinkFailed,\n metrics: { successful: 0, failed: nonPermalinks.length, totalCredits: 0 },\n };\n }\n\n mcpLog('info', `[concurrency] reddit branch: fetching ${postInputs.length} post(s) with limit=${CONCURRENCY.REDDIT}`, 'scrape');\n const client = new RedditClient(env.REDDIT_CLIENT_ID, env.REDDIT_CLIENT_SECRET);\n const urls = postInputs.map((i) => i.url);\n const batchResult = await client.batchGetPosts(urls, true);\n const urlToIndex = new Map(postInputs.map((i) => [i.url, i.origIndex]));\n\n const successItems: ProcessedResult[] = [];\n const failedContents: string[] = [...nonPermalinkFailed];\n let successful = 0;\n let failed = nonPermalinks.length;\n\n for (const [url, result] of batchResult.results) {\n const origIndex = urlToIndex.get(url) ?? -1;\n if (result instanceof Error) {\n failed++;\n failedContents.push(`## ${url}\\n\\n\u274C Reddit fetch failed: ${result.message}`);\n continue;\n }\n successful++;\n successItems.push({ url, content: formatRedditPostAsMarkdown(result), index: origIndex });\n }\n\n return { successItems, failedContents, metrics: { successful, failed, totalCredits: 0 } };\n}\n\n// --- LLM extraction (shared by both branches) ---\n\nasync function processItemsWithLlm(\n successItems: ProcessedResult[],\n enhancedInstruction: string,\n llmProcessor: ReturnType<typeof createLLMProcessor>,\n reporter: ToolReporter,\n): Promise<{ items: ProcessedResult[]; llmErrors: number; llmAttempted: number }> {\n let llmErrors = 0;\n\n if (!llmProcessor || successItems.length === 0) {\n if (!llmProcessor && successItems.length > 0) {\n mcpLog('warning', 'LLM unavailable (LLM_API_KEY not set). 
Returning raw scraped content.', 'scrape');\n void reporter.log('warning', 'llm_extractor_unreachable: planner not configured; raw scraped content returned');\n }\n return { items: successItems, llmErrors, llmAttempted: 0 };\n }\n\n mcpLog('info', `[concurrency] llm extraction: fanning out ${successItems.length} item(s) with limit=${CONCURRENCY.LLM_EXTRACTION}`, 'scrape');\n\n const llmResults = await pMap(\n successItems,\n async (item) => {\n mcpLog('debug', `LLM extracting ${item.url}...`, 'scrape');\n\n const llmResult = await processContentWithLLM(\n item.content,\n { enabled: true, extract: enhancedInstruction, url: item.url },\n llmProcessor,\n );\n\n if (llmResult.processed) {\n return { ...item, content: llmResult.content };\n }\n\n llmErrors++;\n mcpLog('warning', `LLM extraction failed for ${item.url}: ${llmResult.error || 'unknown reason'}`, 'scrape');\n void reporter.log('warning', `llm_extractor_unreachable: ${item.url} \u2014 ${llmResult.error || 'unknown reason'}`);\n return item;\n },\n CONCURRENCY.LLM_EXTRACTION,\n );\n\n return { items: llmResults, llmErrors, llmAttempted: successItems.length };\n}\n\n// --- Output assembly ---\n\nfunction assembleContentEntries(successItems: ProcessedResult[], failedContents: string[]): string[] {\n const sorted = [...successItems].sort((a, b) => a.index - b.index);\n const contents = [...failedContents];\n for (const item of sorted) {\n let content = item.content;\n try {\n content = removeMetaTags(content);\n } catch {\n // Use content as-is\n }\n contents.push(`## ${item.url}\\n\\n${content}`);\n }\n return contents;\n}\n\nfunction buildScrapeResponse(\n params: ScrapeLinksParams,\n contents: string[],\n metrics: ScrapeMetrics,\n llmErrors: number,\n executionTime: number,\n llmAccounting: { llmAttempted: number; llmSucceeded: boolean },\n): { content: string; structuredContent: ScrapeLinksOutput } {\n const llmExtras: Record<string, string | number> = {};\n if (llmAccounting.llmAttempted > 0) {\n const ok = 
llmAccounting.llmAttempted - llmErrors;\n llmExtras['LLM extraction'] = `${ok}/${llmAccounting.llmAttempted} succeeded`;\n if (!llmAccounting.llmSucceeded) {\n llmExtras['LLM credit'] = '0 charged (no extraction produced)';\n }\n } else if (llmErrors > 0) {\n llmExtras['LLM extraction failures'] = llmErrors;\n }\n\n const batchHeader = formatBatchHeader({\n title: `Scraped Content (${params.urls.length} URLs)`,\n totalItems: params.urls.length,\n successful: metrics.successful,\n failed: metrics.failed,\n extras: {\n 'Credits used': metrics.totalCredits,\n ...llmExtras,\n },\n });\n\n const formattedContent = formatSuccess({\n title: 'Scraping Complete',\n summary: batchHeader,\n data: contents.join('\\n\\n---\\n\\n'),\n metadata: {\n 'Execution time': formatDuration(executionTime),\n },\n });\n\n const metadata: ScrapeLinksOutput['metadata'] = {\n total_items: params.urls.length,\n successful: metrics.successful,\n failed: metrics.failed,\n execution_time_ms: executionTime,\n total_credits: metrics.totalCredits,\n };\n return { content: formattedContent, structuredContent: { metadata } };\n}\n\n// --- Handler ---\n\nexport async function handleScrapeLinks(\n params: ScrapeLinksParams,\n reporter: ToolReporter = NOOP_REPORTER,\n): Promise<ToolExecutionResult<ScrapeLinksOutput>> {\n const startTime = Date.now();\n\n if (!params.urls || params.urls.length === 0) {\n return createScrapeErrorResponse('NO_URLS', 'No URLs provided', startTime);\n }\n\n const { webInputs, redditInputs, documentInputs, invalidEntries } = partitionUrls(params.urls);\n const validCount = webInputs.length + redditInputs.length + documentInputs.length;\n\n await reporter.log(\n 'info',\n `Partitioned ${params.urls.length} URL(s): ${webInputs.length} web, ${redditInputs.length} reddit, ${documentInputs.length} document, ${invalidEntries.length} invalid`,\n );\n\n if (validCount === 0) {\n return createScrapeErrorResponse(\n 'INVALID_URLS',\n `All ${params.urls.length} URLs are invalid`,\n 
startTime,\n false,\n [\n 'web-search(queries=[...], extract=\"...\") \u2014 search for valid URLs first, then scrape the results',\n ],\n );\n }\n\n mcpLog(\n 'info',\n `Starting scrape: ${webInputs.length} web + ${redditInputs.length} reddit + ${documentInputs.length} document URL(s)`,\n 'scrape',\n );\n await reporter.progress(15, 100, 'Preparing scraper clients');\n\n // Only initialize the Scrape.do client if we actually have HTML/web URLs.\n // The Jina client is cheap (no auth needed) and always constructed so the\n // document branch and the web\u2192Jina fallback path both work uniformly.\n let clients: ScrapeClients | null = null;\n try {\n const jinaClient = new JinaClient();\n if (webInputs.length > 0) {\n clients = {\n client: new ScraperClient(),\n jinaClient,\n llmProcessor: createLLMProcessor(),\n };\n } else {\n clients = {\n client: null as unknown as ScraperClient,\n jinaClient,\n llmProcessor: createLLMProcessor(),\n };\n }\n } catch (error) {\n const err = classifyError(error);\n return createScrapeErrorResponse(\n 'CLIENT_INIT_FAILED',\n `Failed to initialize scraper: ${err.message}`,\n startTime,\n false,\n [\n 'web-search(queries=[\"topic key findings\", \"topic summary\"], extract=\"key findings and summary\") \u2014 search instead of scraping',\n ],\n );\n }\n\n const enhancedInstruction = enhanceExtractionInstruction(params.extract);\n\n await reporter.progress(35, 100, 'Fetching page content');\n\n // Phase 1 \u2014 run all three branches in parallel. Failures in one branch do\n // not block the others. The web branch may surface binary-content URLs via\n // `binaryDeferred`, which are re-routed through Jina in Phase 2.\n const emptyPhase: WebPhaseResult = {\n successItems: [], failedContents: [],\n metrics: { successful: 0, failed: 0, totalCredits: 0 },\n binaryDeferred: [],\n };\n const [webPhase, redditPhase, documentPhase] = await Promise.all([\n webInputs.length > 0\n ? 
fetchWebBranch(webInputs, clients.client)\n : Promise.resolve<WebPhaseResult>(emptyPhase),\n fetchRedditBranch(redditInputs),\n fetchDocumentBranch(documentInputs, clients.jinaClient),\n ]);\n\n // Phase 2 \u2014 fallback for URLs that Scrape.do reported as binary.\n let deferredPhase: ScrapePhaseResult = {\n successItems: [], failedContents: [],\n metrics: { successful: 0, failed: 0, totalCredits: 0 },\n };\n if (webPhase.binaryDeferred.length > 0) {\n await reporter.log(\n 'info',\n `Rerouting ${webPhase.binaryDeferred.length} binary URL(s) from Scrape.do \u2192 Jina Reader`,\n );\n deferredPhase = await fetchDocumentBranch(webPhase.binaryDeferred, clients.jinaClient);\n }\n\n const successItems = [\n ...webPhase.successItems,\n ...redditPhase.successItems,\n ...documentPhase.successItems,\n ...deferredPhase.successItems,\n ];\n const invalidFailed = invalidEntries.map(\n ({ url }) => `## ${url}\\n\\n\u274C Invalid URL format`,\n );\n const failedContents = [\n ...invalidFailed,\n ...webPhase.failedContents,\n ...redditPhase.failedContents,\n ...documentPhase.failedContents,\n ...deferredPhase.failedContents,\n ];\n const metrics: ScrapeMetrics = {\n successful:\n webPhase.metrics.successful\n + redditPhase.metrics.successful\n + documentPhase.metrics.successful\n + deferredPhase.metrics.successful,\n failed:\n invalidEntries.length\n + webPhase.metrics.failed\n + redditPhase.metrics.failed\n + documentPhase.metrics.failed\n + deferredPhase.metrics.failed,\n totalCredits: webPhase.metrics.totalCredits,\n };\n\n await reporter.log('info', `Fetched ${metrics.successful} page(s), ${metrics.failed} failed`);\n\n if (successItems.length > 0) {\n await reporter.progress(80, 100, 'Running LLM extraction over fetched pages');\n }\n\n const { items: processedItems, llmErrors, llmAttempted } = await processItemsWithLlm(\n successItems,\n enhancedInstruction,\n clients.llmProcessor,\n reporter,\n );\n\n const contents = assembleContentEntries(processedItems, 
failedContents);\n const executionTime = Date.now() - startTime;\n\n mcpLog(\n 'info',\n `Completed: ${metrics.successful} successful, ${metrics.failed} failed, ${metrics.totalCredits} credits used`,\n 'scrape',\n );\n\n const llmSucceeded = llmAttempted > 0 && llmErrors < llmAttempted;\n const result = buildScrapeResponse(\n params,\n contents,\n metrics,\n llmErrors,\n executionTime,\n { llmAttempted, llmSucceeded },\n );\n\n if (metrics.successful === 0 && metrics.failed > 0) {\n return toolFailure(result.content);\n }\n\n return toolSuccess(result.content, result.structuredContent);\n}\n\nexport function registerScrapeLinksTool(server: MCPServer): void {\n server.tool(\n {\n name: 'scrape-links',\n title: 'Scrape Links',\n description:\n 'Fetch many URLs in parallel and run per-URL structured LLM extraction. Auto-detects reddit.com post permalinks and routes them through the Reddit API (threaded post + comments); everything else flows through the HTTP scraper. Safe to call in parallel \u2014 group URLs by context rather than jamming unrelated batches together. Each page returns `## Source`, `## Matches` (verbatim-preserved facts), `## Not found` (explicit gaps), and `## Follow-up signals` (new terms + referenced URLs) that feed the next research loop. Describe the SHAPE of what you want in `extract`, facets separated by `|` (e.g. `root cause | affected versions | fix | workarounds | timeline`).',\n schema: scrapeLinksParamsSchema,\n outputSchema: scrapeLinksOutputSchema,\n annotations: {\n readOnlyHint: true,\n idempotentHint: true,\n destructiveHint: false,\n openWorldHint: true,\n },\n },\n async (args, ctx) => {\n if (!getCapabilities().scraping) {\n return toToolResponse(toolFailure(getMissingEnvMessage('scraping')));\n }\n\n const reporter = createToolReporter(ctx, 'scrape-links');\n const result = await handleScrapeLinks(args, reporter);\n\n await reporter.progress(100, 100, result.isError ? 
'Scrape failed' : 'Scrape complete');\n return toToolResponse(result);\n },\n );\n}\n"],
|
|
5
|
+
"mappings": "AAYA;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,OACK;AACP;AAAA,EACE;AAAA,EACA;AAAA,OAGK;AACP,SAAS,qBAAqB;AAC9B,SAAS,oBAAqC;AAC9C,SAAS,kBAAkB;AAC3B,SAAS,uBAAuB;AAChC,SAAS,oBAAoB,6BAA6B;AAC1D,SAAS,sBAAsB;AAC/B,SAAS,8BAA8B;AACvC,SAAS,eAAe,iBAAiB;AACzC,SAAS,qBAAqB;AAC9B,SAAS,MAAM,mBAAmB;AAClC;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,OACK;AACP;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,OAGK;AAEP,MAAM,kBAAkB,IAAI,gBAAgB;AAE5C,SAAS,6BAA6B,aAAyC;AAC7E,QAAM,OAAO,eAAe;AAC5B,SAAO,GAAG,QAAQ,iBAAiB;AAAA;AAAA,EAAO,IAAI;AAAA;AAAA,EAAO,QAAQ,iBAAiB;AAChF;AAwCA,MAAM,cAAc;AACpB,MAAM,wBAAwB;AAE9B,SAAS,YAAY,KAAsB;AACzC,MAAI;AACF,UAAM,IAAI,IAAI,IAAI,GAAG;AACrB,WAAO,YAAY,KAAK,EAAE,QAAQ;AAAA,EACpC,QAAQ;AACN,WAAO;AAAA,EACT;AACF;AAEA,SAAS,sBAAsB,KAAsB;AACnD,MAAI;AACF,UAAM,IAAI,IAAI,IAAI,GAAG;AACrB,WAAO,YAAY,KAAK,EAAE,QAAQ,KAAK,sBAAsB,KAAK,EAAE,QAAQ;AAAA,EAC9E,QAAQ;AACN,WAAO;AAAA,EACT;AACF;AAIA,SAAS,0BACP,MACA,SACA,WACA,YAAY,OACZ,cACwC;AACxC,SAAO;AAAA,IACL,GAAG,YAAY;AAAA,MACb;AAAA,MACA;AAAA,MACA;AAAA,MACA,UAAU;AAAA,MACV,UAAU,SAAS,YAAY,CAAC,gCAAgC,IAAI;AAAA,MACpE;AAAA,IACF,CAAC,CAAC;AAAA;AAAA,kBAAuB,eAAe,KAAK,IAAI,IAAI,SAAS,CAAC;AAAA,EACjE;AACF;AAWA,SAAS,cAAc,MAAiC;AACtD,QAAM,YAA2B,CAAC;AAClC,QAAM,eAA8B,CAAC;AACrC,QAAM,iBAAgC,CAAC;AACvC,QAAM,iBAAuD,CAAC;AAE9D,WAAS,IAAI,GAAG,IAAI,KAAK,QAAQ,KAAK;AACpC,UAAM,MAAM,KAAK,CAAC;AAClB,QAAI;AACF,UAAI,IAAI,GAAG;AAAA,IACb,QAAQ;AACN,qBAAe,KAAK,EAAE,KAAK,WAAW,EAAE,CAAC;AACzC;AAAA,IACF;AAKA,QAAI,cAAc,GAAG,GAAG;AACtB,qBAAe,KAAK,EAAE,KAAK,WAAW,EAAE,CAAC;AAAA,IAC3C,WAAW,YAAY,GAAG,GAAG;AAC3B,mBAAa,KAAK,EAAE,KAAK,WAAW,EAAE,CAAC;AAAA,IACzC,OAAO;AACL,gBAAU,KAAK,EAAE,KAAK,WAAW,EAAE,CAAC;AAAA,IACtC;AAAA,EACF;AAEA,SAAO,EAAE,WAAW,cAAc,gBAAgB,eAAe;AACnE;AAIA,eAAe,eACb,QACA,QACyB;AACzB,MAAI,OAAO,WAAW,GAAG;AACvB,WAAO;AAAA,MACL,cAAc,CAAC;AAAA,MACf,gBAAgB,CAAC;AAAA,MACjB,SAAS,EAAE,YAAY,GAAG,QAAQ,GAAG,cAAc,EAAE;AAAA,MACrD,gBAAgB,CAAC;AAAA,IACnB;AAAA,EACF;AAEA,SAAO,QAAQ,yCAAyC,OAAO,MAAM,sBAAsB,YAAY,OAAO,IAAI,QAAQ;AAC1H,QAAM,OA
AO,OAAO,IAAI,CAAC,MAAM,EAAE,GAAG;AACpC,QAAM,UAAU,MAAM,OAAO,eAAe,MAAM,EAAE,SAAS,GAAG,CAAC;AACjE,QAAM,aAAa,IAAI,IAAI,OAAO,IAAI,CAAC,MAAM,CAAC,EAAE,KAAK,EAAE,SAAS,CAAC,CAAC;AAElE,QAAM,eAAkC,CAAC;AACzC,QAAM,iBAA2B,CAAC;AAClC,QAAM,iBAAgC,CAAC;AACvC,MAAI,aAAa;AACjB,MAAI,SAAS;AACb,MAAI,eAAe;AAEnB,WAAS,IAAI,GAAG,IAAI,QAAQ,QAAQ,KAAK;AACvC,UAAM,SAAS,QAAQ,CAAC;AACxB,UAAM,YAAY,OAAO,CAAC,EAAG;AAC7B,QAAI,CAAC,QAAQ;AACX;AACA,qBAAe,KAAK,MAAM,OAAO,CAAC,EAAG,GAAG;AAAA;AAAA,0BAA0B;AAClE;AAAA,IACF;AAKA,QAAI,OAAO,OAAO,SAAS,UAAU,4BAA4B;AAC/D,qBAAe,KAAK;AAAA,QAClB,KAAK,OAAO;AAAA,QACZ,WAAW,WAAW,IAAI,OAAO,GAAG,KAAK;AAAA,MAC3C,CAAC;AACD;AAAA,IACF;AAEA,QAAI,OAAO,SAAS,OAAO,aAAa,OAAO,OAAO,cAAc,KAAK;AACvE;AACA,YAAM,WAAW,OAAO,OAAO,WAAW,OAAO,WAAW,QAAQ,OAAO,UAAU;AACrF,qBAAe,KAAK,MAAM,OAAO,GAAG;AAAA;AAAA,2BAA2B,QAAQ,EAAE;AACzE;AAAA,IACF;AAEA;AACA,oBAAgB,OAAO;AAEvB,QAAI;AACJ,QAAI;AACF,YAAM,WAAW,uBAAuB,OAAO,SAAS,OAAO,GAAG;AAClE,YAAM,mBAAmB,SAAS,YAAY,SAAS,UAAU,OAAO;AACxE,gBAAU,gBAAgB,eAAe,gBAAgB;AAAA,IAC3D,QAAQ;AACN,gBAAU,OAAO;AAAA,IACnB;AAEA,iBAAa,KAAK,EAAE,KAAK,OAAO,KAAK,SAAS,OAAO,UAAU,CAAC;AAAA,EAClE;AAEA,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA,SAAS,EAAE,YAAY,QAAQ,aAAa;AAAA,IAC5C;AAAA,EACF;AACF;AAIA,eAAe,oBACb,QACA,YAC4B;AAC5B,MAAI,OAAO,WAAW,GAAG;AACvB,WAAO,EAAE,cAAc,CAAC,GAAG,gBAAgB,CAAC,GAAG,SAAS,EAAE,YAAY,GAAG,QAAQ,GAAG,cAAc,EAAE,EAAE;AAAA,EACxG;AAEA;AAAA,IACE;AAAA,IACA,oDAAoD,OAAO,MAAM,sBAAsB,YAAY,OAAO;AAAA,IAC1G;AAAA,EACF;AAEA,QAAM,UAAU,MAAM;AAAA,IACpB;AAAA,IACA,CAAC,UAAU,WAAW,QAAQ,EAAE,KAAK,MAAM,IAAI,CAAC;AAAA,IAChD,YAAY;AAAA,EACd;AAEA,QAAM,eAAkC,CAAC;AACzC,QAAM,iBAA2B,CAAC;AAClC,MAAI,aAAa;AACjB,MAAI,SAAS;AAEb,WAAS,IAAI,GAAG,IAAI,QAAQ,QAAQ,KAAK;AACvC,UAAM,UAAU,QAAQ,CAAC;AACzB,UAAM,QAAQ,OAAO,CAAC;AACtB,QAAI,CAAC,SAAS;AACZ;AACA,qBAAe,KAAK,MAAM,MAAM,GAAG;AAAA;AAAA,gDAAgD;AACnF;AAAA,IACF;AACA,QAAI,QAAQ,WAAW,YAAY;AACjC;AACA,YAAM,SAAS,QAAQ,kBAAkB,QAAQ,QAAQ,OAAO,UAAU,OAAO,QAAQ,MAAM;AAC/F,qBAAe,KAAK,MAAM,MAAM,GAAG;AAAA;AAAA,qCAAqC,MAAM,EAAE;AAChF;AAAA,IACF;AAEA,UAAM,SAAS,QAAQ;AACvB,QAAI,OAAO,SAAS,OAAO,aAAa
,OAAO,OAAO,cAAc,KAAK;AACvE;AACA,YAAM,WAAW,OAAO,OAAO,WAAW,QAAQ,OAAO,UAAU;AACnE,qBAAe,KAAK,MAAM,MAAM,GAAG;AAAA;AAAA,qCAAqC,QAAQ,EAAE;AAClF;AAAA,IACF;AAEA;AACA,iBAAa,KAAK,EAAE,KAAK,MAAM,KAAK,SAAS,OAAO,SAAS,OAAO,MAAM,UAAU,CAAC;AAAA,EACvF;AAEA,SAAO,EAAE,cAAc,gBAAgB,SAAS,EAAE,YAAY,QAAQ,cAAc,EAAE,EAAE;AAC1F;AAIA,SAAS,2BAA2B,QAA4B;AAC9D,QAAM,EAAE,MAAM,SAAS,IAAI;AAC3B,QAAM,QAAkB,CAAC;AACzB,QAAM,KAAK,KAAK,KAAK,KAAK,EAAE;AAC5B,QAAM,KAAK,EAAE;AACb,QAAM,KAAK,OAAO,KAAK,SAAS,eAAU,KAAK,MAAM,wBAAS,KAAK,KAAK,qBAAS,KAAK,YAAY,WAAW;AAC7G,QAAM,KAAK,aAAM,KAAK,GAAG,EAAE;AAC3B,QAAM,KAAK,EAAE;AACb,MAAI,KAAK,MAAM;AACb,UAAM,KAAK,iBAAiB;AAC5B,UAAM,KAAK,EAAE;AACb,UAAM,KAAK,KAAK,IAAI;AACpB,UAAM,KAAK,EAAE;AAAA,EACf;AACA,MAAI,SAAS,SAAS,GAAG;AACvB,UAAM,KAAK,oBAAoB,SAAS,MAAM,SAAS;AACvD,UAAM,KAAK,EAAE;AACb,eAAW,KAAK,UAAU;AACxB,YAAM,SAAS,KAAK,OAAO,EAAE,KAAK;AAClC,YAAM,KAAK,EAAE,OAAO,cAAc;AAClC,YAAM,QAAQ,EAAE,SAAS,IAAI,IAAI,EAAE,KAAK,KAAK,GAAG,EAAE,KAAK;AACvD,YAAM,KAAK,GAAG,MAAM,SAAS,EAAE,MAAM,KAAK,EAAE,MAAM,KAAK,IAAI;AAC3D,iBAAW,QAAQ,EAAE,KAAK,MAAM,IAAI,GAAG;AACrC,cAAM,KAAK,GAAG,MAAM,KAAK,IAAI,EAAE;AAAA,MACjC;AACA,YAAM,KAAK,EAAE;AAAA,IACf;AAAA,EACF;AACA,SAAO,MAAM,KAAK,IAAI;AACxB;AAEA,eAAe,kBAAkB,QAAmD;AAClF,MAAI,OAAO,WAAW,GAAG;AACvB,WAAO,EAAE,cAAc,CAAC,GAAG,gBAAgB,CAAC,GAAG,SAAS,EAAE,YAAY,GAAG,QAAQ,GAAG,cAAc,EAAE,EAAE;AAAA,EACxG;AAEA,QAAM,MAAM,SAAS;AACrB,MAAI,CAAC,IAAI,oBAAoB,CAAC,IAAI,sBAAsB;AACtD,UAAMA,kBAAiB,OAAO;AAAA,MAC5B,CAAC,MAAM,MAAM,EAAE,GAAG;AAAA;AAAA;AAAA,IACpB;AACA,WAAO;AAAA,MACL,cAAc,CAAC;AAAA,MACf,gBAAAA;AAAA,MACA,SAAS,EAAE,YAAY,GAAG,QAAQ,OAAO,QAAQ,cAAc,EAAE;AAAA,IACnE;AAAA,EACF;AAKA,QAAM,CAAC,YAAY,aAAa,IAAI,OAAO;AAAA,IACzC,CAAC,CAAC,OAAO,IAAI,GAAG,UAAU;AACxB,UAAI,sBAAsB,MAAM,GAAG,EAAG,OAAM,KAAK,KAAK;AAAA,UACjD,MAAK,KAAK,KAAK;AACpB,aAAO,CAAC,OAAO,IAAI;AAAA,IACrB;AAAA,IACA,CAAC,CAAC,GAAG,CAAC,CAAC;AAAA,EACT;AAEA,QAAM,qBAAqB,cAAc;AAAA,IACvC,CAAC,MAAM,MAAM,EAAE,GAAG;AAAA;AAAA;AAAA,EACpB;AAEA,MAAI,WAAW,WAAW,GAAG;AAC3B,WAAO;AAAA,MACL,cAAc,CAAC;AAAA,MACf,gBAAgB;AAAA,MAChB,SAAS,EAAE,YAAY,GAAG,QA
AQ,cAAc,QAAQ,cAAc,EAAE;AAAA,IAC1E;AAAA,EACF;AAEA,SAAO,QAAQ,yCAAyC,WAAW,MAAM,uBAAuB,YAAY,MAAM,IAAI,QAAQ;AAC9H,QAAM,SAAS,IAAI,aAAa,IAAI,kBAAkB,IAAI,oBAAoB;AAC9E,QAAM,OAAO,WAAW,IAAI,CAAC,MAAM,EAAE,GAAG;AACxC,QAAM,cAAc,MAAM,OAAO,cAAc,MAAM,IAAI;AACzD,QAAM,aAAa,IAAI,IAAI,WAAW,IAAI,CAAC,MAAM,CAAC,EAAE,KAAK,EAAE,SAAS,CAAC,CAAC;AAEtE,QAAM,eAAkC,CAAC;AACzC,QAAM,iBAA2B,CAAC,GAAG,kBAAkB;AACvD,MAAI,aAAa;AACjB,MAAI,SAAS,cAAc;AAE3B,aAAW,CAAC,KAAK,MAAM,KAAK,YAAY,SAAS;AAC/C,UAAM,YAAY,WAAW,IAAI,GAAG,KAAK;AACzC,QAAI,kBAAkB,OAAO;AAC3B;AACA,qBAAe,KAAK,MAAM,GAAG;AAAA;AAAA,8BAA8B,OAAO,OAAO,EAAE;AAC3E;AAAA,IACF;AACA;AACA,iBAAa,KAAK,EAAE,KAAK,SAAS,2BAA2B,MAAM,GAAG,OAAO,UAAU,CAAC;AAAA,EAC1F;AAEA,SAAO,EAAE,cAAc,gBAAgB,SAAS,EAAE,YAAY,QAAQ,cAAc,EAAE,EAAE;AAC1F;AAIA,eAAe,oBACb,cACA,qBACA,cACA,UACgF;AAChF,MAAI,YAAY;AAEhB,MAAI,CAAC,gBAAgB,aAAa,WAAW,GAAG;AAC9C,QAAI,CAAC,gBAAgB,aAAa,SAAS,GAAG;AAC5C,aAAO,WAAW,yEAAyE,QAAQ;AACnG,WAAK,SAAS,IAAI,WAAW,iFAAiF;AAAA,IAChH;AACA,WAAO,EAAE,OAAO,cAAc,WAAW,cAAc,EAAE;AAAA,EAC3D;AAEA,SAAO,QAAQ,6CAA6C,aAAa,MAAM,uBAAuB,YAAY,cAAc,IAAI,QAAQ;AAE5I,QAAM,aAAa,MAAM;AAAA,IACvB;AAAA,IACA,OAAO,SAAS;AACd,aAAO,SAAS,kBAAkB,KAAK,GAAG,OAAO,QAAQ;AAEzD,YAAM,YAAY,MAAM;AAAA,QACtB,KAAK;AAAA,QACL,EAAE,SAAS,MAAM,SAAS,qBAAqB,KAAK,KAAK,IAAI;AAAA,QAC7D;AAAA,MACF;AAEA,UAAI,UAAU,WAAW;AACvB,eAAO,EAAE,GAAG,MAAM,SAAS,UAAU,QAAQ;AAAA,MAC/C;AAEA;AACA,aAAO,WAAW,6BAA6B,KAAK,GAAG,KAAK,UAAU,SAAS,gBAAgB,IAAI,QAAQ;AAC3G,WAAK,SAAS,IAAI,WAAW,8BAA8B,KAAK,GAAG,WAAM,UAAU,SAAS,gBAAgB,EAAE;AAC9G,aAAO;AAAA,IACT;AAAA,IACA,YAAY;AAAA,EACd;AAEA,SAAO,EAAE,OAAO,YAAY,WAAW,cAAc,aAAa,OAAO;AAC3E;AAIA,SAAS,uBAAuB,cAAiC,gBAAoC;AACnG,QAAM,SAAS,CAAC,GAAG,YAAY,EAAE,KAAK,CAAC,GAAG,MAAM,EAAE,QAAQ,EAAE,KAAK;AACjE,QAAM,WAAW,CAAC,GAAG,cAAc;AACnC,aAAW,QAAQ,QAAQ;AACzB,QAAI,UAAU,KAAK;AACnB,QAAI;AACF,gBAAU,eAAe,OAAO;AAAA,IAClC,QAAQ;AAAA,IAER;AACA,aAAS,KAAK,MAAM,KAAK,GAAG;AAAA;AAAA,EAAO,OAAO,EAAE;AAAA,EAC9C;AACA,SAAO;AACT;AAEA,SAAS,oBACP,QACA,UACA,SACA,WACA,eACA,eAC2D;AAC3D,QAAM,YAA6C,CAAC;AACpD,MAAI,cAAc,eAAe,GAAG;AAClC,UAAM,KAAK,cAAc,
eAAe;AACxC,cAAU,gBAAgB,IAAI,GAAG,EAAE,IAAI,cAAc,YAAY;AACjE,QAAI,CAAC,cAAc,cAAc;AAC/B,gBAAU,YAAY,IAAI;AAAA,IAC5B;AAAA,EACF,WAAW,YAAY,GAAG;AACxB,cAAU,yBAAyB,IAAI;AAAA,EACzC;AAEA,QAAM,cAAc,kBAAkB;AAAA,IACpC,OAAO,oBAAoB,OAAO,KAAK,MAAM;AAAA,IAC7C,YAAY,OAAO,KAAK;AAAA,IACxB,YAAY,QAAQ;AAAA,IACpB,QAAQ,QAAQ;AAAA,IAChB,QAAQ;AAAA,MACN,gBAAgB,QAAQ;AAAA,MACxB,GAAG;AAAA,IACL;AAAA,EACF,CAAC;AAED,QAAM,mBAAmB,cAAc;AAAA,IACrC,OAAO;AAAA,IACP,SAAS;AAAA,IACT,MAAM,SAAS,KAAK,aAAa;AAAA,IACjC,UAAU;AAAA,MACR,kBAAkB,eAAe,aAAa;AAAA,IAChD;AAAA,EACF,CAAC;AAED,QAAM,WAA0C;AAAA,IAC9C,aAAa,OAAO,KAAK;AAAA,IACzB,YAAY,QAAQ;AAAA,IACpB,QAAQ,QAAQ;AAAA,IAChB,mBAAmB;AAAA,IACnB,eAAe,QAAQ;AAAA,EACzB;AACA,SAAO,EAAE,SAAS,kBAAkB,mBAAmB,EAAE,SAAS,EAAE;AACtE;AAIA,eAAsB,kBACpB,QACA,WAAyB,eACwB;AACjD,QAAM,YAAY,KAAK,IAAI;AAE3B,MAAI,CAAC,OAAO,QAAQ,OAAO,KAAK,WAAW,GAAG;AAC5C,WAAO,0BAA0B,WAAW,oBAAoB,SAAS;AAAA,EAC3E;AAEA,QAAM,EAAE,WAAW,cAAc,gBAAgB,eAAe,IAAI,cAAc,OAAO,IAAI;AAC7F,QAAM,aAAa,UAAU,SAAS,aAAa,SAAS,eAAe;AAE3E,QAAM,SAAS;AAAA,IACb;AAAA,IACA,eAAe,OAAO,KAAK,MAAM,YAAY,UAAU,MAAM,SAAS,aAAa,MAAM,YAAY,eAAe,MAAM,cAAc,eAAe,MAAM;AAAA,EAC/J;AAEA,MAAI,eAAe,GAAG;AACpB,WAAO;AAAA,MACL;AAAA,MACA,OAAO,OAAO,KAAK,MAAM;AAAA,MACzB;AAAA,MACA;AAAA,MACA;AAAA,QACE;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA;AAAA,IACE;AAAA,IACA,oBAAoB,UAAU,MAAM,UAAU,aAAa,MAAM,aAAa,eAAe,MAAM;AAAA,IACnG;AAAA,EACF;AACA,QAAM,SAAS,SAAS,IAAI,KAAK,2BAA2B;AAK5D,MAAI,UAAgC;AACpC,MAAI;AACF,UAAM,aAAa,IAAI,WAAW;AAClC,QAAI,UAAU,SAAS,GAAG;AACxB,gBAAU;AAAA,QACR,QAAQ,IAAI,cAAc;AAAA,QAC1B;AAAA,QACA,cAAc,mBAAmB;AAAA,MACnC;AAAA,IACF,OAAO;AACL,gBAAU;AAAA,QACR,QAAQ;AAAA,QACR;AAAA,QACA,cAAc,mBAAmB;AAAA,MACnC;AAAA,IACF;AAAA,EACF,SAAS,OAAO;AACd,UAAM,MAAM,cAAc,KAAK;AAC/B,WAAO;AAAA,MACL;AAAA,MACA,iCAAiC,IAAI,OAAO;AAAA,MAC5C;AAAA,MACA;AAAA,MACA;AAAA,QACE;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,QAAM,sBAAsB,6BAA6B,OAAO,OAAO;AAEvE,QAAM,SAAS,SAAS,IAAI,KAAK,uBAAuB;AAKxD,QAAM,aAA6B;AAAA,IACjC,cAAc,CAAC;AAAA,IAAG,gBAAgB,CAAC;AAAA,IACnC,SAAS,EAAE,YAAY,GAAG,QAAQ,GAAG,cAAc,EAAE;AAAA,IACrD,gBAAgB,CAAC;AAAA,EA
CnB;AACA,QAAM,CAAC,UAAU,aAAa,aAAa,IAAI,MAAM,QAAQ,IAAI;AAAA,IAC/D,UAAU,SAAS,IACf,eAAe,WAAW,QAAQ,MAAM,IACxC,QAAQ,QAAwB,UAAU;AAAA,IAC9C,kBAAkB,YAAY;AAAA,IAC9B,oBAAoB,gBAAgB,QAAQ,UAAU;AAAA,EACxD,CAAC;AAGD,MAAI,gBAAmC;AAAA,IACrC,cAAc,CAAC;AAAA,IAAG,gBAAgB,CAAC;AAAA,IACnC,SAAS,EAAE,YAAY,GAAG,QAAQ,GAAG,cAAc,EAAE;AAAA,EACvD;AACA,MAAI,SAAS,eAAe,SAAS,GAAG;AACtC,UAAM,SAAS;AAAA,MACb;AAAA,MACA,aAAa,SAAS,eAAe,MAAM;AAAA,IAC7C;AACA,oBAAgB,MAAM,oBAAoB,SAAS,gBAAgB,QAAQ,UAAU;AAAA,EACvF;AAEA,QAAM,eAAe;AAAA,IACnB,GAAG,SAAS;AAAA,IACZ,GAAG,YAAY;AAAA,IACf,GAAG,cAAc;AAAA,IACjB,GAAG,cAAc;AAAA,EACnB;AACA,QAAM,gBAAgB,eAAe;AAAA,IACnC,CAAC,EAAE,IAAI,MAAM,MAAM,GAAG;AAAA;AAAA;AAAA,EACxB;AACA,QAAM,iBAAiB;AAAA,IACrB,GAAG;AAAA,IACH,GAAG,SAAS;AAAA,IACZ,GAAG,YAAY;AAAA,IACf,GAAG,cAAc;AAAA,IACjB,GAAG,cAAc;AAAA,EACnB;AACA,QAAM,UAAyB;AAAA,IAC7B,YACE,SAAS,QAAQ,aACf,YAAY,QAAQ,aACpB,cAAc,QAAQ,aACtB,cAAc,QAAQ;AAAA,IAC1B,QACE,eAAe,SACb,SAAS,QAAQ,SACjB,YAAY,QAAQ,SACpB,cAAc,QAAQ,SACtB,cAAc,QAAQ;AAAA,IAC1B,cAAc,SAAS,QAAQ;AAAA,EACjC;AAEA,QAAM,SAAS,IAAI,QAAQ,WAAW,QAAQ,UAAU,aAAa,QAAQ,MAAM,SAAS;AAE5F,MAAI,aAAa,SAAS,GAAG;AAC3B,UAAM,SAAS,SAAS,IAAI,KAAK,2CAA2C;AAAA,EAC9E;AAEA,QAAM,EAAE,OAAO,gBAAgB,WAAW,aAAa,IAAI,MAAM;AAAA,IAC/D;AAAA,IACA;AAAA,IACA,QAAQ;AAAA,IACR;AAAA,EACF;AAEA,QAAM,WAAW,uBAAuB,gBAAgB,cAAc;AACtE,QAAM,gBAAgB,KAAK,IAAI,IAAI;AAEnC;AAAA,IACE;AAAA,IACA,cAAc,QAAQ,UAAU,gBAAgB,QAAQ,MAAM,YAAY,QAAQ,YAAY;AAAA,IAC9F;AAAA,EACF;AAEA,QAAM,eAAe,eAAe,KAAK,YAAY;AACrD,QAAM,SAAS;AAAA,IACb;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,EAAE,cAAc,aAAa;AAAA,EAC/B;AAEA,MAAI,QAAQ,eAAe,KAAK,QAAQ,SAAS,GAAG;AAClD,WAAO,YAAY,OAAO,OAAO;AAAA,EACnC;AAEA,SAAO,YAAY,OAAO,SAAS,OAAO,iBAAiB;AAC7D;AAEO,SAAS,wBAAwB,QAAyB;AAC/D,SAAO;AAAA,IACL;AAAA,MACE,MAAM;AAAA,MACN,OAAO;AAAA,MACP,aACE;AAAA,MACF,QAAQ;AAAA,MACR,cAAc;AAAA,MACd,aAAa;AAAA,QACX,cAAc;AAAA,QACd,gBAAgB;AAAA,QAChB,iBAAiB;AAAA,QACjB,eAAe;AAAA,MACjB;AAAA,IACF;AAAA,IACA,OAAO,MAAM,QAAQ;AACnB,UAAI,CAAC,gBAAgB,EAAE,UAAU;AAC/B,eAAO,eAAe,YAAY,qBAAqB,UAAU,CAAC,CAAC;AAAA,MACrE;AAEA,YAAM,WAAW,mB
AAmB,KAAK,cAAc;AACvD,YAAM,SAAS,MAAM,kBAAkB,MAAM,QAAQ;AAErD,YAAM,SAAS,SAAS,KAAK,KAAK,OAAO,UAAU,kBAAkB,iBAAiB;AACtF,aAAO,eAAe,MAAM;AAAA,IAC9B;AAAA,EACF;AACF;",
|
|
6
6
|
"names": ["failedContents"]
|
|
7
7
|
}
|
package/dist/src/utils/errors.js
CHANGED
|
@@ -10,6 +10,7 @@ const ErrorCode = {
|
|
|
10
10
|
INVALID_INPUT: "INVALID_INPUT",
|
|
11
11
|
NOT_FOUND: "NOT_FOUND",
|
|
12
12
|
QUOTA_EXCEEDED: "QUOTA_EXCEEDED",
|
|
13
|
+
UNSUPPORTED_BINARY_CONTENT: "UNSUPPORTED_BINARY_CONTENT",
|
|
13
14
|
// Internal errors
|
|
14
15
|
INTERNAL_ERROR: "INTERNAL_ERROR",
|
|
15
16
|
PARSE_ERROR: "PARSE_ERROR",
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"version": 3,
|
|
3
3
|
"sources": ["../../../src/utils/errors.ts"],
|
|
4
|
-
"sourcesContent": ["/**\n * Robust error handling utilities for MCP server\n * Ensures the server NEVER crashes and always returns structured responses\n */\n\nimport { mcpLog } from './logger.js';\n\n// ============================================================================\n// Error Codes (MCP-compliant)\n// ============================================================================\n\nexport const ErrorCode = {\n // Retryable errors\n RATE_LIMITED: 'RATE_LIMITED',\n TIMEOUT: 'TIMEOUT',\n NETWORK_ERROR: 'NETWORK_ERROR',\n SERVICE_UNAVAILABLE: 'SERVICE_UNAVAILABLE',\n \n // Non-retryable errors\n AUTH_ERROR: 'AUTH_ERROR',\n INVALID_INPUT: 'INVALID_INPUT',\n NOT_FOUND: 'NOT_FOUND',\n QUOTA_EXCEEDED: 'QUOTA_EXCEEDED',\n \n // Internal errors\n INTERNAL_ERROR: 'INTERNAL_ERROR',\n PARSE_ERROR: 'PARSE_ERROR',\n UNKNOWN_ERROR: 'UNKNOWN_ERROR',\n} as const;\n\ntype ErrorCodeType = typeof ErrorCode[keyof typeof ErrorCode];\n\n// ============================================================================\n// Structured Error Types\n// ============================================================================\n\nexport interface StructuredError {\n code: ErrorCodeType;\n message: string;\n retryable: boolean;\n statusCode?: number;\n cause?: string;\n}\n\ninterface RetryOptions {\n readonly maxRetries: number;\n readonly baseDelayMs: number;\n readonly maxDelayMs: number;\n readonly retryableStatuses: readonly number[];\n readonly onRetry?: (attempt: number, error: StructuredError, delayMs: number) => void;\n}\n\nconst DEFAULT_RETRY_OPTIONS: RetryOptions = {\n maxRetries: 3,\n baseDelayMs: 1000,\n maxDelayMs: 30000,\n retryableStatuses: [408, 429, 500, 502, 503, 504, 510],\n};\n\n// ============================================================================\n// Error Classification \u2014 Atomic Classifiers\n// ============================================================================\n\n/**\n * Classify DOMException (AbortError from AbortController timeouts)\n 
*/\nfunction classifyDomException(error: DOMException): StructuredError {\n if (error.name === 'AbortError') {\n return { code: ErrorCode.TIMEOUT, message: 'Request timed out', retryable: true };\n }\n return { code: ErrorCode.UNKNOWN_ERROR, message: error.message, retryable: false };\n}\n\n/**\n * Classify by Node.js error codes (ECONNREFUSED, ENOTFOUND, etc.)\n * Returns null if no matching code is found.\n */\nfunction classifyByErrorCode(error: { code?: string; message?: string }): StructuredError | null {\n const errCode = error.code;\n if (!errCode) return null;\n\n const networkErrorMessages: Record<string, string> = {\n ECONNREFUSED: 'Connection refused \u2014 service may be down',\n ECONNRESET: 'Connection was reset \u2014 please retry',\n ECONNABORTED: 'Connection aborted \u2014 please retry',\n ENOTFOUND: 'Service not reachable \u2014 check your network',\n EPIPE: 'Connection lost \u2014 please retry',\n EAI_AGAIN: 'DNS lookup failed \u2014 check your network',\n };\n\n if (errCode === 'ECONNREFUSED' || errCode === 'ENOTFOUND' || errCode === 'ECONNRESET') {\n return { code: ErrorCode.NETWORK_ERROR, message: networkErrorMessages[errCode] || 'Network connection failed', retryable: true, cause: error.message };\n }\n\n if (errCode === 'ECONNABORTED' || errCode === 'ETIMEDOUT') {\n return { code: ErrorCode.TIMEOUT, message: networkErrorMessages[errCode] || 'Request timed out', retryable: true, cause: error.message };\n }\n\n return null;\n}\n\n/**\n * Classify by HTTP status code extracted from error objects (axios-style, fetch-style, etc.)\n * Returns null if no status code is found.\n */\nfunction classifyByStatusCode(error: { status?: number; statusCode?: number; response?: { status?: number }; message?: string }): StructuredError | null {\n const status = error.response?.status || error.status || error.statusCode;\n if (!status) return null;\n return classifyHttpError(status, error.message || String(error));\n}\n\n/**\n * Classify by error message 
patterns (timeout, rate-limit, auth, parse errors)\n * Returns null if no pattern matches.\n */\nfunction classifyByMessage(message: string): StructuredError | null {\n const lower = message.toLowerCase();\n\n // Timeout patterns\n if (lower.includes('timeout') || lower.includes('timed out') || lower.includes('aborterror')) {\n return { code: ErrorCode.TIMEOUT, message: 'Request timed out', retryable: true, cause: message };\n }\n\n // Rate-limit patterns\n if (lower.includes('rate limit') || lower.includes('too many requests')) {\n return { code: ErrorCode.RATE_LIMITED, message: 'Rate limit exceeded', retryable: true, cause: message };\n }\n\n // API key errors\n if (message.includes('API_KEY') || message.includes('api_key') || message.includes('Invalid API')) {\n return { code: ErrorCode.AUTH_ERROR, message: 'API key missing or invalid', retryable: false, cause: message };\n }\n\n // Parse errors\n if (message.includes('JSON') || message.includes('parse') || message.includes('Unexpected token')) {\n return { code: ErrorCode.PARSE_ERROR, message: 'Failed to parse response', retryable: false, cause: message };\n }\n\n return null;\n}\n\n/**\n * Catch-all fallback classification when no other classifier matches.\n */\nfunction classifyFallback(message: string, cause?: unknown): StructuredError {\n return {\n code: ErrorCode.UNKNOWN_ERROR,\n message,\n retryable: false,\n cause: cause ? 
String(cause) : undefined,\n };\n}\n\n// ============================================================================\n// Main Error Classification Pipeline\n// ============================================================================\n\n/**\n * Classify any error into a structured format.\n * NEVER throws \u2014 always returns a valid StructuredError.\n */\nexport function classifyError(error: unknown): StructuredError {\n if (error == null) {\n return { code: ErrorCode.UNKNOWN_ERROR, message: 'An unknown error occurred', retryable: false };\n }\n\n if (error instanceof DOMException) return classifyDomException(error);\n\n if (!isErrorLike(error)) {\n return { code: ErrorCode.UNKNOWN_ERROR, message: String(error), retryable: false };\n }\n\n return classifyByErrorCode(error)\n ?? classifyByStatusCode(error)\n ?? classifyByMessage(error.message ?? String(error))\n ?? classifyFallback(error.message ?? String(error), error.cause);\n}\n\n/**\n * Type guard for error-like objects with common error properties\n */\nfunction isErrorLike(value: unknown): value is {\n message?: string;\n response?: { status?: number; data?: unknown };\n status?: number;\n statusCode?: number;\n code?: string;\n name?: string;\n cause?: unknown;\n} {\n return typeof value === 'object' && value !== null;\n}\n\n/**\n * Classify HTTP status codes into structured errors.\n * Exhaustive switch with grouped default handling for unknown ranges.\n */\nfunction classifyHttpError(status: number, message: string): StructuredError {\n switch (status) {\n case 400:\n return { code: ErrorCode.INVALID_INPUT, message: 'Bad request', retryable: false, statusCode: status };\n case 401:\n return { code: ErrorCode.AUTH_ERROR, message: 'Invalid API key', retryable: false, statusCode: status };\n case 403:\n return { code: ErrorCode.QUOTA_EXCEEDED, message: 'Access forbidden or quota exceeded', retryable: false, statusCode: status };\n case 404:\n return { code: ErrorCode.NOT_FOUND, message: 'Resource not 
found', retryable: false, statusCode: status };\n case 408:\n return { code: ErrorCode.TIMEOUT, message: 'Request timeout', retryable: true, statusCode: status };\n case 429:\n return { code: ErrorCode.RATE_LIMITED, message: 'Rate limit exceeded', retryable: true, statusCode: status };\n case 500:\n return { code: ErrorCode.INTERNAL_ERROR, message: 'Server error', retryable: true, statusCode: status };\n case 502:\n return { code: ErrorCode.SERVICE_UNAVAILABLE, message: 'Bad gateway', retryable: true, statusCode: status };\n case 503:\n return { code: ErrorCode.SERVICE_UNAVAILABLE, message: 'Service unavailable', retryable: true, statusCode: status };\n case 504:\n return { code: ErrorCode.TIMEOUT, message: 'Gateway timeout', retryable: true, statusCode: status };\n case 510:\n return { code: ErrorCode.SERVICE_UNAVAILABLE, message: 'Request canceled', retryable: true, statusCode: status };\n default:\n if (status >= 500) {\n return { code: ErrorCode.SERVICE_UNAVAILABLE, message: `Server error: ${status}`, retryable: true, statusCode: status };\n }\n if (status >= 400) {\n return { code: ErrorCode.INVALID_INPUT, message: `Client error: ${status}`, retryable: false, statusCode: status };\n }\n return { code: ErrorCode.UNKNOWN_ERROR, message: `HTTP ${status}: ${message}`, retryable: false, statusCode: status };\n }\n}\n\n// ============================================================================\n// Retry Logic with Exponential Backoff\n// ============================================================================\n\n/**\n * Calculate delay with exponential backoff and jitter\n */\nfunction calculateBackoff(attempt: number, options: RetryOptions): number {\n const exponentialDelay = options.baseDelayMs * Math.pow(2, attempt);\n const jitter = Math.random() * 0.3 * exponentialDelay; // 0-30% jitter\n return Math.min(exponentialDelay + jitter, options.maxDelayMs);\n}\n\n/**\n * Sleep utility that respects abort signals\n */\nexport function sleep(ms: number, 
signal?: AbortSignal): Promise<void> {\n return new Promise((resolve, reject) => {\n if (signal?.aborted) {\n reject(new DOMException('Aborted', 'AbortError'));\n return;\n }\n\n function onAbort() {\n clearTimeout(timeout);\n reject(new DOMException('Aborted', 'AbortError'));\n }\n\n const timeout = setTimeout(() => {\n if (signal) signal.removeEventListener('abort', onAbort);\n resolve();\n }, ms);\n\n signal?.addEventListener('abort', onAbort, { once: true });\n // Re-check: signal may have aborted between initial check and listener registration\n if (signal?.aborted) {\n onAbort();\n }\n });\n}\n\n/**\n * Wrap a fetch call with timeout via AbortController\n */\nexport function fetchWithTimeout(\n url: string,\n options: RequestInit & { timeoutMs?: number } = {}\n): Promise<Response> {\n const { timeoutMs = 30000, signal: externalSignal, ...fetchOptions } = options;\n\n const controller = new AbortController();\n const timeoutId = setTimeout(() => controller.abort(), timeoutMs);\n\n let onExternalAbort: (() => void) | undefined;\n if (externalSignal) {\n onExternalAbort = () => controller.abort();\n externalSignal.addEventListener('abort', onExternalAbort, { once: true });\n if (externalSignal.aborted) {\n controller.abort();\n }\n }\n\n return fetch(url, { ...fetchOptions, signal: controller.signal }).finally(() => {\n clearTimeout(timeoutId);\n if (externalSignal && onExternalAbort) {\n externalSignal.removeEventListener('abort', onExternalAbort);\n }\n });\n}\n\n// ============================================================================\n// Stability Wrappers \u2014 Network resilience for LLM API calls\n// ============================================================================\n\n/**\n * Wrap a non-streaming API call with activity-based timeout detection.\n * If the call hasn't completed within `stallMs`, abort and retry.\n * This catches \"stuck\" connections where TCP stays open but no data flows.\n *\n * @param fn - Async function that accepts an 
AbortSignal\n * @param stallMs - Max milliseconds to wait for the call to complete before considering it stuck\n * @param maxAttempts - Max retry attempts for stalled requests\n * @param label - Label for log messages\n * @returns The result of the function\n */\nexport async function withStallProtection<T>(\n fn: (signal: AbortSignal) => Promise<T>,\n stallMs: number,\n maxAttempts: number = 2,\n label: string = 'request',\n): Promise<T> {\n for (let attempt = 0; attempt < maxAttempts; attempt++) {\n const controller = new AbortController();\n let stallTimer: ReturnType<typeof setTimeout> | undefined;\n\n const stallPromise = new Promise<never>((_, reject) => {\n stallTimer = setTimeout(() => {\n controller.abort();\n reject(Object.assign(new Error(`Service temporarily unavailable \u2014 no response received (attempt ${attempt + 1}/${maxAttempts})`), {\n code: 'ESTALLED',\n retryable: attempt < maxAttempts - 1,\n }));\n }, stallMs);\n });\n\n let fnPromise: Promise<T> | undefined;\n try {\n fnPromise = fn(controller.signal);\n const result = await Promise.race([fnPromise, stallPromise]);\n clearTimeout(stallTimer);\n return result;\n } catch (err) {\n // Suppress unhandled rejection from the losing promise\n // (e.g. fnPromise rejects after stallPromise wins the race)\n fnPromise?.catch(() => {});\n clearTimeout(stallTimer);\n const isStall = err instanceof Error && (err as NodeJS.ErrnoException).code === 'ESTALLED';\n if (isStall && attempt < maxAttempts - 1) {\n const backoff = calculateBackoff(attempt, DEFAULT_RETRY_OPTIONS);\n mcpLog('warning', `${label} stalled, retrying in ${backoff}ms (attempt ${attempt + 1})`, 'stability');\n await sleep(backoff);\n continue;\n }\n throw err;\n }\n }\n // Should never reach here, but TypeScript needs it\n throw new Error(`${label} failed after ${maxAttempts} stall-protection attempts`);\n}\n"],
|
|
5
|
-
"mappings": "AAKA,SAAS,cAAc;AAMhB,MAAM,YAAY;AAAA;AAAA,EAEvB,cAAc;AAAA,EACd,SAAS;AAAA,EACT,eAAe;AAAA,EACf,qBAAqB;AAAA;AAAA,EAGrB,YAAY;AAAA,EACZ,eAAe;AAAA,EACf,WAAW;AAAA,EACX,gBAAgB;AAAA;AAAA,
|
|
4
|
+
"sourcesContent": ["/**\n * Robust error handling utilities for MCP server\n * Ensures the server NEVER crashes and always returns structured responses\n */\n\nimport { mcpLog } from './logger.js';\n\n// ============================================================================\n// Error Codes (MCP-compliant)\n// ============================================================================\n\nexport const ErrorCode = {\n // Retryable errors\n RATE_LIMITED: 'RATE_LIMITED',\n TIMEOUT: 'TIMEOUT',\n NETWORK_ERROR: 'NETWORK_ERROR',\n SERVICE_UNAVAILABLE: 'SERVICE_UNAVAILABLE',\n \n // Non-retryable errors\n AUTH_ERROR: 'AUTH_ERROR',\n INVALID_INPUT: 'INVALID_INPUT',\n NOT_FOUND: 'NOT_FOUND',\n QUOTA_EXCEEDED: 'QUOTA_EXCEEDED',\n UNSUPPORTED_BINARY_CONTENT: 'UNSUPPORTED_BINARY_CONTENT',\n\n // Internal errors\n INTERNAL_ERROR: 'INTERNAL_ERROR',\n PARSE_ERROR: 'PARSE_ERROR',\n UNKNOWN_ERROR: 'UNKNOWN_ERROR',\n} as const;\n\ntype ErrorCodeType = typeof ErrorCode[keyof typeof ErrorCode];\n\n// ============================================================================\n// Structured Error Types\n// ============================================================================\n\nexport interface StructuredError {\n code: ErrorCodeType;\n message: string;\n retryable: boolean;\n statusCode?: number;\n cause?: string;\n}\n\ninterface RetryOptions {\n readonly maxRetries: number;\n readonly baseDelayMs: number;\n readonly maxDelayMs: number;\n readonly retryableStatuses: readonly number[];\n readonly onRetry?: (attempt: number, error: StructuredError, delayMs: number) => void;\n}\n\nconst DEFAULT_RETRY_OPTIONS: RetryOptions = {\n maxRetries: 3,\n baseDelayMs: 1000,\n maxDelayMs: 30000,\n retryableStatuses: [408, 429, 500, 502, 503, 504, 510],\n};\n\n// ============================================================================\n// Error Classification \u2014 Atomic Classifiers\n// ============================================================================\n\n/**\n * Classify 
DOMException (AbortError from AbortController timeouts)\n */\nfunction classifyDomException(error: DOMException): StructuredError {\n if (error.name === 'AbortError') {\n return { code: ErrorCode.TIMEOUT, message: 'Request timed out', retryable: true };\n }\n return { code: ErrorCode.UNKNOWN_ERROR, message: error.message, retryable: false };\n}\n\n/**\n * Classify by Node.js error codes (ECONNREFUSED, ENOTFOUND, etc.)\n * Returns null if no matching code is found.\n */\nfunction classifyByErrorCode(error: { code?: string; message?: string }): StructuredError | null {\n const errCode = error.code;\n if (!errCode) return null;\n\n const networkErrorMessages: Record<string, string> = {\n ECONNREFUSED: 'Connection refused \u2014 service may be down',\n ECONNRESET: 'Connection was reset \u2014 please retry',\n ECONNABORTED: 'Connection aborted \u2014 please retry',\n ENOTFOUND: 'Service not reachable \u2014 check your network',\n EPIPE: 'Connection lost \u2014 please retry',\n EAI_AGAIN: 'DNS lookup failed \u2014 check your network',\n };\n\n if (errCode === 'ECONNREFUSED' || errCode === 'ENOTFOUND' || errCode === 'ECONNRESET') {\n return { code: ErrorCode.NETWORK_ERROR, message: networkErrorMessages[errCode] || 'Network connection failed', retryable: true, cause: error.message };\n }\n\n if (errCode === 'ECONNABORTED' || errCode === 'ETIMEDOUT') {\n return { code: ErrorCode.TIMEOUT, message: networkErrorMessages[errCode] || 'Request timed out', retryable: true, cause: error.message };\n }\n\n return null;\n}\n\n/**\n * Classify by HTTP status code extracted from error objects (axios-style, fetch-style, etc.)\n * Returns null if no status code is found.\n */\nfunction classifyByStatusCode(error: { status?: number; statusCode?: number; response?: { status?: number }; message?: string }): StructuredError | null {\n const status = error.response?.status || error.status || error.statusCode;\n if (!status) return null;\n return classifyHttpError(status, error.message || 
String(error));\n}\n\n/**\n * Classify by error message patterns (timeout, rate-limit, auth, parse errors)\n * Returns null if no pattern matches.\n */\nfunction classifyByMessage(message: string): StructuredError | null {\n const lower = message.toLowerCase();\n\n // Timeout patterns\n if (lower.includes('timeout') || lower.includes('timed out') || lower.includes('aborterror')) {\n return { code: ErrorCode.TIMEOUT, message: 'Request timed out', retryable: true, cause: message };\n }\n\n // Rate-limit patterns\n if (lower.includes('rate limit') || lower.includes('too many requests')) {\n return { code: ErrorCode.RATE_LIMITED, message: 'Rate limit exceeded', retryable: true, cause: message };\n }\n\n // API key errors\n if (message.includes('API_KEY') || message.includes('api_key') || message.includes('Invalid API')) {\n return { code: ErrorCode.AUTH_ERROR, message: 'API key missing or invalid', retryable: false, cause: message };\n }\n\n // Parse errors\n if (message.includes('JSON') || message.includes('parse') || message.includes('Unexpected token')) {\n return { code: ErrorCode.PARSE_ERROR, message: 'Failed to parse response', retryable: false, cause: message };\n }\n\n return null;\n}\n\n/**\n * Catch-all fallback classification when no other classifier matches.\n */\nfunction classifyFallback(message: string, cause?: unknown): StructuredError {\n return {\n code: ErrorCode.UNKNOWN_ERROR,\n message,\n retryable: false,\n cause: cause ? 
String(cause) : undefined,\n };\n}\n\n// ============================================================================\n// Main Error Classification Pipeline\n// ============================================================================\n\n/**\n * Classify any error into a structured format.\n * NEVER throws \u2014 always returns a valid StructuredError.\n */\nexport function classifyError(error: unknown): StructuredError {\n if (error == null) {\n return { code: ErrorCode.UNKNOWN_ERROR, message: 'An unknown error occurred', retryable: false };\n }\n\n if (error instanceof DOMException) return classifyDomException(error);\n\n if (!isErrorLike(error)) {\n return { code: ErrorCode.UNKNOWN_ERROR, message: String(error), retryable: false };\n }\n\n return classifyByErrorCode(error)\n ?? classifyByStatusCode(error)\n ?? classifyByMessage(error.message ?? String(error))\n ?? classifyFallback(error.message ?? String(error), error.cause);\n}\n\n/**\n * Type guard for error-like objects with common error properties\n */\nfunction isErrorLike(value: unknown): value is {\n message?: string;\n response?: { status?: number; data?: unknown };\n status?: number;\n statusCode?: number;\n code?: string;\n name?: string;\n cause?: unknown;\n} {\n return typeof value === 'object' && value !== null;\n}\n\n/**\n * Classify HTTP status codes into structured errors.\n * Exhaustive switch with grouped default handling for unknown ranges.\n */\nfunction classifyHttpError(status: number, message: string): StructuredError {\n switch (status) {\n case 400:\n return { code: ErrorCode.INVALID_INPUT, message: 'Bad request', retryable: false, statusCode: status };\n case 401:\n return { code: ErrorCode.AUTH_ERROR, message: 'Invalid API key', retryable: false, statusCode: status };\n case 403:\n return { code: ErrorCode.QUOTA_EXCEEDED, message: 'Access forbidden or quota exceeded', retryable: false, statusCode: status };\n case 404:\n return { code: ErrorCode.NOT_FOUND, message: 'Resource not 
found', retryable: false, statusCode: status };\n case 408:\n return { code: ErrorCode.TIMEOUT, message: 'Request timeout', retryable: true, statusCode: status };\n case 429:\n return { code: ErrorCode.RATE_LIMITED, message: 'Rate limit exceeded', retryable: true, statusCode: status };\n case 500:\n return { code: ErrorCode.INTERNAL_ERROR, message: 'Server error', retryable: true, statusCode: status };\n case 502:\n return { code: ErrorCode.SERVICE_UNAVAILABLE, message: 'Bad gateway', retryable: true, statusCode: status };\n case 503:\n return { code: ErrorCode.SERVICE_UNAVAILABLE, message: 'Service unavailable', retryable: true, statusCode: status };\n case 504:\n return { code: ErrorCode.TIMEOUT, message: 'Gateway timeout', retryable: true, statusCode: status };\n case 510:\n return { code: ErrorCode.SERVICE_UNAVAILABLE, message: 'Request canceled', retryable: true, statusCode: status };\n default:\n if (status >= 500) {\n return { code: ErrorCode.SERVICE_UNAVAILABLE, message: `Server error: ${status}`, retryable: true, statusCode: status };\n }\n if (status >= 400) {\n return { code: ErrorCode.INVALID_INPUT, message: `Client error: ${status}`, retryable: false, statusCode: status };\n }\n return { code: ErrorCode.UNKNOWN_ERROR, message: `HTTP ${status}: ${message}`, retryable: false, statusCode: status };\n }\n}\n\n// ============================================================================\n// Retry Logic with Exponential Backoff\n// ============================================================================\n\n/**\n * Calculate delay with exponential backoff and jitter\n */\nfunction calculateBackoff(attempt: number, options: RetryOptions): number {\n const exponentialDelay = options.baseDelayMs * Math.pow(2, attempt);\n const jitter = Math.random() * 0.3 * exponentialDelay; // 0-30% jitter\n return Math.min(exponentialDelay + jitter, options.maxDelayMs);\n}\n\n/**\n * Sleep utility that respects abort signals\n */\nexport function sleep(ms: number, 
signal?: AbortSignal): Promise<void> {\n return new Promise((resolve, reject) => {\n if (signal?.aborted) {\n reject(new DOMException('Aborted', 'AbortError'));\n return;\n }\n\n function onAbort() {\n clearTimeout(timeout);\n reject(new DOMException('Aborted', 'AbortError'));\n }\n\n const timeout = setTimeout(() => {\n if (signal) signal.removeEventListener('abort', onAbort);\n resolve();\n }, ms);\n\n signal?.addEventListener('abort', onAbort, { once: true });\n // Re-check: signal may have aborted between initial check and listener registration\n if (signal?.aborted) {\n onAbort();\n }\n });\n}\n\n/**\n * Wrap a fetch call with timeout via AbortController\n */\nexport function fetchWithTimeout(\n url: string,\n options: RequestInit & { timeoutMs?: number } = {}\n): Promise<Response> {\n const { timeoutMs = 30000, signal: externalSignal, ...fetchOptions } = options;\n\n const controller = new AbortController();\n const timeoutId = setTimeout(() => controller.abort(), timeoutMs);\n\n let onExternalAbort: (() => void) | undefined;\n if (externalSignal) {\n onExternalAbort = () => controller.abort();\n externalSignal.addEventListener('abort', onExternalAbort, { once: true });\n if (externalSignal.aborted) {\n controller.abort();\n }\n }\n\n return fetch(url, { ...fetchOptions, signal: controller.signal }).finally(() => {\n clearTimeout(timeoutId);\n if (externalSignal && onExternalAbort) {\n externalSignal.removeEventListener('abort', onExternalAbort);\n }\n });\n}\n\n// ============================================================================\n// Stability Wrappers \u2014 Network resilience for LLM API calls\n// ============================================================================\n\n/**\n * Wrap a non-streaming API call with activity-based timeout detection.\n * If the call hasn't completed within `stallMs`, abort and retry.\n * This catches \"stuck\" connections where TCP stays open but no data flows.\n *\n * @param fn - Async function that accepts an 
AbortSignal\n * @param stallMs - Max milliseconds to wait for the call to complete before considering it stuck\n * @param maxAttempts - Max retry attempts for stalled requests\n * @param label - Label for log messages\n * @returns The result of the function\n */\nexport async function withStallProtection<T>(\n fn: (signal: AbortSignal) => Promise<T>,\n stallMs: number,\n maxAttempts: number = 2,\n label: string = 'request',\n): Promise<T> {\n for (let attempt = 0; attempt < maxAttempts; attempt++) {\n const controller = new AbortController();\n let stallTimer: ReturnType<typeof setTimeout> | undefined;\n\n const stallPromise = new Promise<never>((_, reject) => {\n stallTimer = setTimeout(() => {\n controller.abort();\n reject(Object.assign(new Error(`Service temporarily unavailable \u2014 no response received (attempt ${attempt + 1}/${maxAttempts})`), {\n code: 'ESTALLED',\n retryable: attempt < maxAttempts - 1,\n }));\n }, stallMs);\n });\n\n let fnPromise: Promise<T> | undefined;\n try {\n fnPromise = fn(controller.signal);\n const result = await Promise.race([fnPromise, stallPromise]);\n clearTimeout(stallTimer);\n return result;\n } catch (err) {\n // Suppress unhandled rejection from the losing promise\n // (e.g. fnPromise rejects after stallPromise wins the race)\n fnPromise?.catch(() => {});\n clearTimeout(stallTimer);\n const isStall = err instanceof Error && (err as NodeJS.ErrnoException).code === 'ESTALLED';\n if (isStall && attempt < maxAttempts - 1) {\n const backoff = calculateBackoff(attempt, DEFAULT_RETRY_OPTIONS);\n mcpLog('warning', `${label} stalled, retrying in ${backoff}ms (attempt ${attempt + 1})`, 'stability');\n await sleep(backoff);\n continue;\n }\n throw err;\n }\n }\n // Should never reach here, but TypeScript needs it\n throw new Error(`${label} failed after ${maxAttempts} stall-protection attempts`);\n}\n"],
|
|
5
|
+
"mappings": "AAKA,SAAS,cAAc;AAMhB,MAAM,YAAY;AAAA;AAAA,EAEvB,cAAc;AAAA,EACd,SAAS;AAAA,EACT,eAAe;AAAA,EACf,qBAAqB;AAAA;AAAA,EAGrB,YAAY;AAAA,EACZ,eAAe;AAAA,EACf,WAAW;AAAA,EACX,gBAAgB;AAAA,EAChB,4BAA4B;AAAA;AAAA,EAG5B,gBAAgB;AAAA,EAChB,aAAa;AAAA,EACb,eAAe;AACjB;AAwBA,MAAM,wBAAsC;AAAA,EAC1C,YAAY;AAAA,EACZ,aAAa;AAAA,EACb,YAAY;AAAA,EACZ,mBAAmB,CAAC,KAAK,KAAK,KAAK,KAAK,KAAK,KAAK,GAAG;AACvD;AASA,SAAS,qBAAqB,OAAsC;AAClE,MAAI,MAAM,SAAS,cAAc;AAC/B,WAAO,EAAE,MAAM,UAAU,SAAS,SAAS,qBAAqB,WAAW,KAAK;AAAA,EAClF;AACA,SAAO,EAAE,MAAM,UAAU,eAAe,SAAS,MAAM,SAAS,WAAW,MAAM;AACnF;AAMA,SAAS,oBAAoB,OAAoE;AAC/F,QAAM,UAAU,MAAM;AACtB,MAAI,CAAC,QAAS,QAAO;AAErB,QAAM,uBAA+C;AAAA,IACnD,cAAc;AAAA,IACd,YAAY;AAAA,IACZ,cAAc;AAAA,IACd,WAAW;AAAA,IACX,OAAO;AAAA,IACP,WAAW;AAAA,EACb;AAEA,MAAI,YAAY,kBAAkB,YAAY,eAAe,YAAY,cAAc;AACrF,WAAO,EAAE,MAAM,UAAU,eAAe,SAAS,qBAAqB,OAAO,KAAK,6BAA6B,WAAW,MAAM,OAAO,MAAM,QAAQ;AAAA,EACvJ;AAEA,MAAI,YAAY,kBAAkB,YAAY,aAAa;AACzD,WAAO,EAAE,MAAM,UAAU,SAAS,SAAS,qBAAqB,OAAO,KAAK,qBAAqB,WAAW,MAAM,OAAO,MAAM,QAAQ;AAAA,EACzI;AAEA,SAAO;AACT;AAMA,SAAS,qBAAqB,OAA2H;AACvJ,QAAM,SAAS,MAAM,UAAU,UAAU,MAAM,UAAU,MAAM;AAC/D,MAAI,CAAC,OAAQ,QAAO;AACpB,SAAO,kBAAkB,QAAQ,MAAM,WAAW,OAAO,KAAK,CAAC;AACjE;AAMA,SAAS,kBAAkB,SAAyC;AAClE,QAAM,QAAQ,QAAQ,YAAY;AAGlC,MAAI,MAAM,SAAS,SAAS,KAAK,MAAM,SAAS,WAAW,KAAK,MAAM,SAAS,YAAY,GAAG;AAC5F,WAAO,EAAE,MAAM,UAAU,SAAS,SAAS,qBAAqB,WAAW,MAAM,OAAO,QAAQ;AAAA,EAClG;AAGA,MAAI,MAAM,SAAS,YAAY,KAAK,MAAM,SAAS,mBAAmB,GAAG;AACvE,WAAO,EAAE,MAAM,UAAU,cAAc,SAAS,uBAAuB,WAAW,MAAM,OAAO,QAAQ;AAAA,EACzG;AAGA,MAAI,QAAQ,SAAS,SAAS,KAAK,QAAQ,SAAS,SAAS,KAAK,QAAQ,SAAS,aAAa,GAAG;AACjG,WAAO,EAAE,MAAM,UAAU,YAAY,SAAS,8BAA8B,WAAW,OAAO,OAAO,QAAQ;AAAA,EAC/G;AAGA,MAAI,QAAQ,SAAS,MAAM,KAAK,QAAQ,SAAS,OAAO,KAAK,QAAQ,SAAS,kBAAkB,GAAG;AACjG,WAAO,EAAE,MAAM,UAAU,aAAa,SAAS,4BAA4B,WAAW,OAAO,OAAO,QAAQ;AAAA,EAC9G;AAEA,SAAO;AACT;AAKA,SAAS,iBAAiB,SAAiB,OAAkC;AAC3E,SAAO;AAAA,IACL,MAAM,UAAU;AAAA,IAChB;AAAA,IACA,WAAW;AAAA,IACX,OAAO,QAAQ,OAAO,KAAK,IAAI;AAAA,EACjC;AACF;AAUO,SAAS,cAAc,OAAiC;AAC7D,MAAI,SAAS,MAAM;AACjB,WAAO,EAAE,MA
AM,UAAU,eAAe,SAAS,6BAA6B,WAAW,MAAM;AAAA,EACjG;AAEA,MAAI,iBAAiB,aAAc,QAAO,qBAAqB,KAAK;AAEpE,MAAI,CAAC,YAAY,KAAK,GAAG;AACvB,WAAO,EAAE,MAAM,UAAU,eAAe,SAAS,OAAO,KAAK,GAAG,WAAW,MAAM;AAAA,EACnF;AAEA,SAAO,oBAAoB,KAAK,KAC3B,qBAAqB,KAAK,KAC1B,kBAAkB,MAAM,WAAW,OAAO,KAAK,CAAC,KAChD,iBAAiB,MAAM,WAAW,OAAO,KAAK,GAAG,MAAM,KAAK;AACnE;AAKA,SAAS,YAAY,OAQnB;AACA,SAAO,OAAO,UAAU,YAAY,UAAU;AAChD;AAMA,SAAS,kBAAkB,QAAgB,SAAkC;AAC3E,UAAQ,QAAQ;AAAA,IACd,KAAK;AACH,aAAO,EAAE,MAAM,UAAU,eAAe,SAAS,eAAe,WAAW,OAAO,YAAY,OAAO;AAAA,IACvG,KAAK;AACH,aAAO,EAAE,MAAM,UAAU,YAAY,SAAS,mBAAmB,WAAW,OAAO,YAAY,OAAO;AAAA,IACxG,KAAK;AACH,aAAO,EAAE,MAAM,UAAU,gBAAgB,SAAS,sCAAsC,WAAW,OAAO,YAAY,OAAO;AAAA,IAC/H,KAAK;AACH,aAAO,EAAE,MAAM,UAAU,WAAW,SAAS,sBAAsB,WAAW,OAAO,YAAY,OAAO;AAAA,IAC1G,KAAK;AACH,aAAO,EAAE,MAAM,UAAU,SAAS,SAAS,mBAAmB,WAAW,MAAM,YAAY,OAAO;AAAA,IACpG,KAAK;AACH,aAAO,EAAE,MAAM,UAAU,cAAc,SAAS,uBAAuB,WAAW,MAAM,YAAY,OAAO;AAAA,IAC7G,KAAK;AACH,aAAO,EAAE,MAAM,UAAU,gBAAgB,SAAS,gBAAgB,WAAW,MAAM,YAAY,OAAO;AAAA,IACxG,KAAK;AACH,aAAO,EAAE,MAAM,UAAU,qBAAqB,SAAS,eAAe,WAAW,MAAM,YAAY,OAAO;AAAA,IAC5G,KAAK;AACH,aAAO,EAAE,MAAM,UAAU,qBAAqB,SAAS,uBAAuB,WAAW,MAAM,YAAY,OAAO;AAAA,IACpH,KAAK;AACH,aAAO,EAAE,MAAM,UAAU,SAAS,SAAS,mBAAmB,WAAW,MAAM,YAAY,OAAO;AAAA,IACpG,KAAK;AACH,aAAO,EAAE,MAAM,UAAU,qBAAqB,SAAS,oBAAoB,WAAW,MAAM,YAAY,OAAO;AAAA,IACjH;AACE,UAAI,UAAU,KAAK;AACjB,eAAO,EAAE,MAAM,UAAU,qBAAqB,SAAS,iBAAiB,MAAM,IAAI,WAAW,MAAM,YAAY,OAAO;AAAA,MACxH;AACA,UAAI,UAAU,KAAK;AACjB,eAAO,EAAE,MAAM,UAAU,eAAe,SAAS,iBAAiB,MAAM,IAAI,WAAW,OAAO,YAAY,OAAO;AAAA,MACnH;AACA,aAAO,EAAE,MAAM,UAAU,eAAe,SAAS,QAAQ,MAAM,KAAK,OAAO,IAAI,WAAW,OAAO,YAAY,OAAO;AAAA,EACxH;AACF;AASA,SAAS,iBAAiB,SAAiB,SAA+B;AACxE,QAAM,mBAAmB,QAAQ,cAAc,KAAK,IAAI,GAAG,OAAO;AAClE,QAAM,SAAS,KAAK,OAAO,IAAI,MAAM;AACrC,SAAO,KAAK,IAAI,mBAAmB,QAAQ,QAAQ,UAAU;AAC/D;AAKO,SAAS,MAAM,IAAY,QAAqC;AACrE,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,QAAI,QAAQ,SAAS;AACnB,aAAO,IAAI,aAAa,WAAW,YAAY,CAAC;AAChD;AAAA,IACF;AAEA,aAAS,UAAU;AACjB,mBAAa,OAAO;AACpB,aAAO,IAAI,aAAa,WAAW,YAAY,CAAC;AAAA,IAClD;AAEA,UAAM,UAAU,WAAW,MA
AM;AAC/B,UAAI,OAAQ,QAAO,oBAAoB,SAAS,OAAO;AACvD,cAAQ;AAAA,IACV,GAAG,EAAE;AAEL,YAAQ,iBAAiB,SAAS,SAAS,EAAE,MAAM,KAAK,CAAC;AAEzD,QAAI,QAAQ,SAAS;AACnB,cAAQ;AAAA,IACV;AAAA,EACF,CAAC;AACH;AAKO,SAAS,iBACd,KACA,UAAgD,CAAC,GAC9B;AACnB,QAAM,EAAE,YAAY,KAAO,QAAQ,gBAAgB,GAAG,aAAa,IAAI;AAEvE,QAAM,aAAa,IAAI,gBAAgB;AACvC,QAAM,YAAY,WAAW,MAAM,WAAW,MAAM,GAAG,SAAS;AAEhE,MAAI;AACJ,MAAI,gBAAgB;AAClB,sBAAkB,MAAM,WAAW,MAAM;AACzC,mBAAe,iBAAiB,SAAS,iBAAiB,EAAE,MAAM,KAAK,CAAC;AACxE,QAAI,eAAe,SAAS;AAC1B,iBAAW,MAAM;AAAA,IACnB;AAAA,EACF;AAEA,SAAO,MAAM,KAAK,EAAE,GAAG,cAAc,QAAQ,WAAW,OAAO,CAAC,EAAE,QAAQ,MAAM;AAC9E,iBAAa,SAAS;AACtB,QAAI,kBAAkB,iBAAiB;AACrC,qBAAe,oBAAoB,SAAS,eAAe;AAAA,IAC7D;AAAA,EACF,CAAC;AACH;AAiBA,eAAsB,oBACpB,IACA,SACA,cAAsB,GACtB,QAAgB,WACJ;AACZ,WAAS,UAAU,GAAG,UAAU,aAAa,WAAW;AACtD,UAAM,aAAa,IAAI,gBAAgB;AACvC,QAAI;AAEJ,UAAM,eAAe,IAAI,QAAe,CAAC,GAAG,WAAW;AACrD,mBAAa,WAAW,MAAM;AAC5B,mBAAW,MAAM;AACjB,eAAO,OAAO,OAAO,IAAI,MAAM,wEAAmE,UAAU,CAAC,IAAI,WAAW,GAAG,GAAG;AAAA,UAChI,MAAM;AAAA,UACN,WAAW,UAAU,cAAc;AAAA,QACrC,CAAC,CAAC;AAAA,MACJ,GAAG,OAAO;AAAA,IACZ,CAAC;AAED,QAAI;AACJ,QAAI;AACF,kBAAY,GAAG,WAAW,MAAM;AAChC,YAAM,SAAS,MAAM,QAAQ,KAAK,CAAC,WAAW,YAAY,CAAC;AAC3D,mBAAa,UAAU;AACvB,aAAO;AAAA,IACT,SAAS,KAAK;AAGZ,iBAAW,MAAM,MAAM;AAAA,MAAC,CAAC;AACzB,mBAAa,UAAU;AACvB,YAAM,UAAU,eAAe,SAAU,IAA8B,SAAS;AAChF,UAAI,WAAW,UAAU,cAAc,GAAG;AACxC,cAAM,UAAU,iBAAiB,SAAS,qBAAqB;AAC/D,eAAO,WAAW,GAAG,KAAK,yBAAyB,OAAO,eAAe,UAAU,CAAC,KAAK,WAAW;AACpG,cAAM,MAAM,OAAO;AACnB;AAAA,MACF;AACA,YAAM;AAAA,IACR;AAAA,EACF;AAEA,QAAM,IAAI,MAAM,GAAG,KAAK,iBAAiB,WAAW,4BAA4B;AAClF;",
|
|
6
6
|
"names": []
|
|
7
7
|
}
|
|
@@ -35,7 +35,46 @@ function classifySourceByUrl(url) {
|
|
|
35
35
|
}
|
|
36
36
|
return "web";
|
|
37
37
|
}
|
|
38
|
+
const DOCUMENT_PATH_SUFFIXES = [
|
|
39
|
+
".pdf",
|
|
40
|
+
".doc",
|
|
41
|
+
".docx",
|
|
42
|
+
".ppt",
|
|
43
|
+
".pptx",
|
|
44
|
+
".xls",
|
|
45
|
+
".xlsx"
|
|
46
|
+
];
|
|
47
|
+
function isDocumentUrl(url) {
|
|
48
|
+
let pathname;
|
|
49
|
+
try {
|
|
50
|
+
pathname = new URL(url).pathname.toLowerCase();
|
|
51
|
+
} catch {
|
|
52
|
+
return false;
|
|
53
|
+
}
|
|
54
|
+
for (const suffix of DOCUMENT_PATH_SUFFIXES) {
|
|
55
|
+
if (pathname.endsWith(suffix)) return true;
|
|
56
|
+
}
|
|
57
|
+
return false;
|
|
58
|
+
}
|
|
59
|
+
const BINARY_CONTENT_TYPE_PREFIXES = [
|
|
60
|
+
"application/pdf",
|
|
61
|
+
"application/msword",
|
|
62
|
+
"application/vnd.openxmlformats-officedocument.",
|
|
63
|
+
"application/vnd.ms-excel",
|
|
64
|
+
"application/vnd.ms-powerpoint",
|
|
65
|
+
"application/octet-stream"
|
|
66
|
+
];
|
|
67
|
+
function isBinaryDocumentContentType(contentType) {
|
|
68
|
+
if (!contentType) return false;
|
|
69
|
+
const lower = contentType.toLowerCase();
|
|
70
|
+
for (const prefix of BINARY_CONTENT_TYPE_PREFIXES) {
|
|
71
|
+
if (lower.startsWith(prefix)) return true;
|
|
72
|
+
}
|
|
73
|
+
return false;
|
|
74
|
+
}
|
|
38
75
|
export {
|
|
39
|
-
classifySourceByUrl
|
|
76
|
+
classifySourceByUrl,
|
|
77
|
+
isBinaryDocumentContentType,
|
|
78
|
+
isDocumentUrl
|
|
40
79
|
};
|
|
41
80
|
//# sourceMappingURL=source-type.js.map
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"version": 3,
|
|
3
3
|
"sources": ["../../../src/utils/source-type.ts"],
|
|
4
|
-
"sourcesContent": ["/**\n * Hostname/path-heuristic source-type tagging. Works without the LLM\n * classifier so degraded-mode web-search responses still carry a\n * `source_type` field per result. When the LLM classifier IS available,\n * its tag wins (the classifier sees title + snippet as well, not just URL).\n *\n * See: mcp-revisions/output-shaping/06-source-type-tagging-without-llm.md.\n */\n\nexport type SourceType =\n | 'reddit'\n | 'github'\n | 'docs'\n | 'blog'\n | 'paper'\n | 'qa'\n | 'cve'\n | 'news'\n | 'video'\n | 'web';\n\nconst RULES: Array<[RegExp, SourceType]> = [\n // Reddit post permalinks (subreddit homepages are filtered out upstream).\n [/(?:^|\\.)reddit\\.com\\//i, 'reddit'],\n [/(?:^|\\.)github\\.com\\//i, 'github'],\n [/(?:^|\\.)gitlab\\.com\\//i, 'github'],\n // CVE-prefixed paths are unambiguous regardless of host.\n [/\\/CVE-\\d{4}-\\d+/i, 'cve'],\n [/(?:^|\\.)nvd\\.nist\\.gov\\//i, 'cve'],\n [/(?:^|\\.)stackoverflow\\.com\\//i, 'qa'],\n [/(?:^|\\.)stackexchange\\.com\\//i, 'qa'],\n [/(?:^|\\.)arxiv\\.org\\//i, 'paper'],\n [/(?:^|\\.)medium\\.com\\//i, 'blog'],\n [/(?:^|\\.)dev\\.to\\//i, 'blog'],\n [/(?:^|\\.)substack\\.com\\//i, 'blog'],\n // Docs subdomains and /docs/ paths.\n [/^(?:[a-z0-9-]+\\.)*docs\\./i, 'docs'],\n [/\\/docs\\//i, 'docs'],\n [/(?:^|\\.)readthedocs\\.io\\//i, 'docs'],\n // Video.\n [/(?:^|\\.)youtube\\.com\\/watch/i, 'video'],\n [/(?:^|\\.)youtu\\.be\\//i, 'video'],\n // News / engineering blogs (last so it doesn't capture vendor docs).\n [/(?:^|\\.)(?:news|blog|engineering)\\.[a-z0-9-]+\\.[a-z]{2,}\\//i, 'news'],\n];\n\nexport function classifySourceByUrl(url: string): SourceType {\n let candidate: string;\n try {\n const u = new URL(url);\n // Match against `host + pathname` so rules can use either or both.\n candidate = `${u.hostname}${u.pathname}`;\n } catch {\n candidate = url;\n }\n for (const [re, type] of RULES) {\n if (re.test(candidate)) return type;\n }\n return 'web';\n}\n"],
|
|
5
|
-
"mappings": "AAqBA,MAAM,QAAqC;AAAA;AAAA,EAEzC,CAAC,0BAA0B,QAAQ;AAAA,EACnC,CAAC,0BAA0B,QAAQ;AAAA,EACnC,CAAC,0BAA0B,QAAQ;AAAA;AAAA,EAEnC,CAAC,oBAAoB,KAAK;AAAA,EAC1B,CAAC,6BAA6B,KAAK;AAAA,EACnC,CAAC,iCAAiC,IAAI;AAAA,EACtC,CAAC,iCAAiC,IAAI;AAAA,EACtC,CAAC,yBAAyB,OAAO;AAAA,EACjC,CAAC,0BAA0B,MAAM;AAAA,EACjC,CAAC,sBAAsB,MAAM;AAAA,EAC7B,CAAC,4BAA4B,MAAM;AAAA;AAAA,EAEnC,CAAC,6BAA6B,MAAM;AAAA,EACpC,CAAC,aAAa,MAAM;AAAA,EACpB,CAAC,8BAA8B,MAAM;AAAA;AAAA,EAErC,CAAC,gCAAgC,OAAO;AAAA,EACxC,CAAC,wBAAwB,OAAO;AAAA;AAAA,EAEhC,CAAC,+DAA+D,MAAM;AACxE;AAEO,SAAS,oBAAoB,KAAyB;AAC3D,MAAI;AACJ,MAAI;AACF,UAAM,IAAI,IAAI,IAAI,GAAG;AAErB,gBAAY,GAAG,EAAE,QAAQ,GAAG,EAAE,QAAQ;AAAA,EACxC,QAAQ;AACN,gBAAY;AAAA,EACd;AACA,aAAW,CAAC,IAAI,IAAI,KAAK,OAAO;AAC9B,QAAI,GAAG,KAAK,SAAS,EAAG,QAAO;AAAA,EACjC;AACA,SAAO;AACT;",
|
|
4
|
+
"sourcesContent": ["/**\n * Hostname/path-heuristic source-type tagging. Works without the LLM\n * classifier so degraded-mode web-search responses still carry a\n * `source_type` field per result. When the LLM classifier IS available,\n * its tag wins (the classifier sees title + snippet as well, not just URL).\n *\n * See: mcp-revisions/output-shaping/06-source-type-tagging-without-llm.md.\n */\n\nexport type SourceType =\n | 'reddit'\n | 'github'\n | 'docs'\n | 'blog'\n | 'paper'\n | 'qa'\n | 'cve'\n | 'news'\n | 'video'\n | 'web';\n\nconst RULES: Array<[RegExp, SourceType]> = [\n // Reddit post permalinks (subreddit homepages are filtered out upstream).\n [/(?:^|\\.)reddit\\.com\\//i, 'reddit'],\n [/(?:^|\\.)github\\.com\\//i, 'github'],\n [/(?:^|\\.)gitlab\\.com\\//i, 'github'],\n // CVE-prefixed paths are unambiguous regardless of host.\n [/\\/CVE-\\d{4}-\\d+/i, 'cve'],\n [/(?:^|\\.)nvd\\.nist\\.gov\\//i, 'cve'],\n [/(?:^|\\.)stackoverflow\\.com\\//i, 'qa'],\n [/(?:^|\\.)stackexchange\\.com\\//i, 'qa'],\n [/(?:^|\\.)arxiv\\.org\\//i, 'paper'],\n [/(?:^|\\.)medium\\.com\\//i, 'blog'],\n [/(?:^|\\.)dev\\.to\\//i, 'blog'],\n [/(?:^|\\.)substack\\.com\\//i, 'blog'],\n // Docs subdomains and /docs/ paths.\n [/^(?:[a-z0-9-]+\\.)*docs\\./i, 'docs'],\n [/\\/docs\\//i, 'docs'],\n [/(?:^|\\.)readthedocs\\.io\\//i, 'docs'],\n // Video.\n [/(?:^|\\.)youtube\\.com\\/watch/i, 'video'],\n [/(?:^|\\.)youtu\\.be\\//i, 'video'],\n // News / engineering blogs (last so it doesn't capture vendor docs).\n [/(?:^|\\.)(?:news|blog|engineering)\\.[a-z0-9-]+\\.[a-z]{2,}\\//i, 'news'],\n];\n\nexport function classifySourceByUrl(url: string): SourceType {\n let candidate: string;\n try {\n const u = new URL(url);\n // Match against `host + pathname` so rules can use either or both.\n candidate = `${u.hostname}${u.pathname}`;\n } catch {\n candidate = url;\n }\n for (const [re, type] of RULES) {\n if (re.test(candidate)) return type;\n }\n return 'web';\n}\n\n// \u2500\u2500 
Document-format detection (PDF / Office) \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n// Scrape.do + Readability + Turndown assume HTML input. Binary document\n// formats need a markdown-extraction service (Jina Reader) instead. These two\n// helpers give the scrape pipeline both a pre-fetch gate (URL suffix) and a\n// post-fetch gate (response Content-Type header).\n\nconst DOCUMENT_PATH_SUFFIXES = [\n '.pdf',\n '.doc', '.docx',\n '.ppt', '.pptx',\n '.xls', '.xlsx',\n] as const;\n\n/**\n * Pre-fetch gate: does this URL's path end in a known binary-document suffix?\n * Case-insensitive. Trailing query strings / fragments are ignored \u2014 only the\n * pathname is inspected. Invalid URLs return false (handled upstream).\n */\nexport function isDocumentUrl(url: string): boolean {\n let pathname: string;\n try {\n pathname = new URL(url).pathname.toLowerCase();\n } catch {\n return false;\n }\n for (const suffix of DOCUMENT_PATH_SUFFIXES) {\n if (pathname.endsWith(suffix)) return true;\n }\n return false;\n}\n\nconst BINARY_CONTENT_TYPE_PREFIXES = [\n 'application/pdf',\n 'application/msword',\n 'application/vnd.openxmlformats-officedocument.',\n 'application/vnd.ms-excel',\n 'application/vnd.ms-powerpoint',\n 'application/octet-stream',\n] as const;\n\n/**\n * Post-fetch gate: does this Content-Type header indicate a binary document\n * that our HTML pipeline cannot decode? Returns false for HTML/JSON/plain text\n * and for unknown/missing content-types (the upstream pipeline can still try).\n */\nexport function isBinaryDocumentContentType(contentType: string | null | undefined): boolean {\n if (!contentType) return false;\n const lower = contentType.toLowerCase();\n for (const prefix of BINARY_CONTENT_TYPE_PREFIXES) {\n if (lower.startsWith(prefix)) return true;\n }\n return false;\n}\n"],
|
|
5
|
+
"mappings": "AAqBA,MAAM,QAAqC;AAAA;AAAA,EAEzC,CAAC,0BAA0B,QAAQ;AAAA,EACnC,CAAC,0BAA0B,QAAQ;AAAA,EACnC,CAAC,0BAA0B,QAAQ;AAAA;AAAA,EAEnC,CAAC,oBAAoB,KAAK;AAAA,EAC1B,CAAC,6BAA6B,KAAK;AAAA,EACnC,CAAC,iCAAiC,IAAI;AAAA,EACtC,CAAC,iCAAiC,IAAI;AAAA,EACtC,CAAC,yBAAyB,OAAO;AAAA,EACjC,CAAC,0BAA0B,MAAM;AAAA,EACjC,CAAC,sBAAsB,MAAM;AAAA,EAC7B,CAAC,4BAA4B,MAAM;AAAA;AAAA,EAEnC,CAAC,6BAA6B,MAAM;AAAA,EACpC,CAAC,aAAa,MAAM;AAAA,EACpB,CAAC,8BAA8B,MAAM;AAAA;AAAA,EAErC,CAAC,gCAAgC,OAAO;AAAA,EACxC,CAAC,wBAAwB,OAAO;AAAA;AAAA,EAEhC,CAAC,+DAA+D,MAAM;AACxE;AAEO,SAAS,oBAAoB,KAAyB;AAC3D,MAAI;AACJ,MAAI;AACF,UAAM,IAAI,IAAI,IAAI,GAAG;AAErB,gBAAY,GAAG,EAAE,QAAQ,GAAG,EAAE,QAAQ;AAAA,EACxC,QAAQ;AACN,gBAAY;AAAA,EACd;AACA,aAAW,CAAC,IAAI,IAAI,KAAK,OAAO;AAC9B,QAAI,GAAG,KAAK,SAAS,EAAG,QAAO;AAAA,EACjC;AACA,SAAO;AACT;AAQA,MAAM,yBAAyB;AAAA,EAC7B;AAAA,EACA;AAAA,EAAQ;AAAA,EACR;AAAA,EAAQ;AAAA,EACR;AAAA,EAAQ;AACV;AAOO,SAAS,cAAc,KAAsB;AAClD,MAAI;AACJ,MAAI;AACF,eAAW,IAAI,IAAI,GAAG,EAAE,SAAS,YAAY;AAAA,EAC/C,QAAQ;AACN,WAAO;AAAA,EACT;AACA,aAAW,UAAU,wBAAwB;AAC3C,QAAI,SAAS,SAAS,MAAM,EAAG,QAAO;AAAA,EACxC;AACA,SAAO;AACT;AAEA,MAAM,+BAA+B;AAAA,EACnC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAOO,SAAS,4BAA4B,aAAiD;AAC3F,MAAI,CAAC,YAAa,QAAO;AACzB,QAAM,QAAQ,YAAY,YAAY;AACtC,aAAW,UAAU,8BAA8B;AACjD,QAAI,MAAM,WAAW,MAAM,EAAG,QAAO;AAAA,EACvC;AACA,SAAO;AACT;",
|
|
6
6
|
"names": []
|
|
7
7
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "mcp-researchpowerpack",
|
|
3
|
-
"version": "6.0.
|
|
3
|
+
"version": "6.0.6",
|
|
4
4
|
"description": "HTTP-first MCP research server: start-research (goal-tailored brief), web-search (with Reddit scope), scrape-links (auto-detects Reddit URLs) — built on mcp-use.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|