mcp-researchpowerpack 6.0.5 → 6.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/mcp-use.json +2 -2
- package/dist/src/clients/jina.js +165 -0
- package/dist/src/clients/jina.js.map +7 -0
- package/dist/src/clients/scraper.js +21 -0
- package/dist/src/clients/scraper.js.map +2 -2
- package/dist/src/config/index.js +2 -1
- package/dist/src/config/index.js.map +2 -2
- package/dist/src/tools/scrape.js +156 -21
- package/dist/src/tools/scrape.js.map +2 -2
- package/dist/src/utils/errors.js +1 -0
- package/dist/src/utils/errors.js.map +2 -2
- package/dist/src/utils/source-type.js +40 -1
- package/dist/src/utils/source-type.js.map +2 -2
- package/package.json +1 -1
package/dist/mcp-use.json
CHANGED

package/dist/src/clients/jina.js
ADDED
@@ -0,0 +1,165 @@
+import {
+  classifyError,
+  fetchWithTimeout,
+  sleep,
+  ErrorCode
+} from "../utils/errors.js";
+import { calculateBackoff } from "../utils/retry.js";
+import { mcpLog } from "../utils/logger.js";
+const JINA_READER_BASE = "https://r.jina.ai/";
+const DEFAULT_TIMEOUT_MS = 6e4;
+const MAX_RETRIES = 2;
+class JinaClient {
+  apiKey;
+  constructor(apiKey) {
+    const fromEnv = process.env.JINA_API_KEY?.trim();
+    this.apiKey = apiKey?.trim() || fromEnv || void 0;
+  }
+  /**
+   * Convert a URL to markdown via Jina Reader.
+   * NEVER throws — always returns a JinaConvertResponse (possibly with error).
+   */
+  async convert(request) {
+    const { url, timeoutMs = DEFAULT_TIMEOUT_MS } = request;
+    try {
+      new URL(url);
+    } catch {
+      return {
+        content: `Invalid URL: ${url}`,
+        statusCode: 400,
+        credits: 0,
+        error: { code: ErrorCode.INVALID_INPUT, message: `Invalid URL: ${url}`, retryable: false }
+      };
+    }
+    const jinaUrl = `${JINA_READER_BASE}${url}`;
+    const headers = {
+      Accept: "text/markdown"
+    };
+    if (this.apiKey) {
+      headers["Authorization"] = `Bearer ${this.apiKey}`;
+    }
+    let lastError;
+    for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) {
+      try {
+        const response = await fetchWithTimeout(jinaUrl, {
+          method: "GET",
+          headers,
+          timeoutMs
+        });
+        let content;
+        try {
+          content = await response.text();
+        } catch (readError) {
+          content = `Failed to read Jina response: ${readError instanceof Error ? readError.message : String(readError)}`;
+        }
+        const usageHeader = response.headers.get("x-usage-tokens");
+        const usageTokens = usageHeader ? Number(usageHeader) : void 0;
+        if (response.ok) {
+          if (!content.trim()) {
+            return {
+              content: "Jina returned an empty body",
+              statusCode: response.status,
+              credits: 0,
+              usageTokens: Number.isFinite(usageTokens) ? usageTokens : void 0,
+              error: {
+                code: ErrorCode.UNSUPPORTED_BINARY_CONTENT,
+                message: "Jina Reader returned empty content for this URL",
+                retryable: false
+              }
+            };
+          }
+          return {
+            content,
+            statusCode: response.status,
+            credits: 0,
+            usageTokens: Number.isFinite(usageTokens) ? usageTokens : void 0
+          };
+        }
+        if (response.status === 401 || response.status === 403) {
+          return {
+            content: `Jina auth/quota error (${response.status}): ${content.slice(0, 200)}`,
+            statusCode: response.status,
+            credits: 0,
+            error: {
+              code: response.status === 401 ? ErrorCode.AUTH_ERROR : ErrorCode.QUOTA_EXCEEDED,
+              message: response.status === 401 ? "Jina Reader auth failed \u2014 check JINA_API_KEY" : "Jina Reader quota exceeded",
+              retryable: false,
+              statusCode: response.status
+            }
+          };
+        }
+        if (response.status === 404) {
+          return {
+            content: `Jina could not fetch the target URL (404)`,
+            statusCode: 404,
+            credits: 0,
+            error: {
+              code: ErrorCode.NOT_FOUND,
+              message: "Target URL not reachable by Jina Reader",
+              retryable: false,
+              statusCode: 404
+            }
+          };
+        }
+        if (response.status === 429 || response.status >= 500) {
+          lastError = classifyError({ status: response.status, message: content.slice(0, 200) });
+          if (attempt < MAX_RETRIES) {
+            const delayMs = calculateBackoff(attempt);
+            mcpLog(
+              "warning",
+              `Jina ${response.status} on attempt ${attempt + 1}/${MAX_RETRIES + 1}. Retrying in ${delayMs}ms`,
+              "jina"
+            );
+            await sleep(delayMs);
+            continue;
+          }
+          return {
+            content: `Jina Reader error (${response.status}): ${content.slice(0, 200)}`,
+            statusCode: response.status,
+            credits: 0,
+            error: lastError
+          };
+        }
+        return {
+          content: `Jina Reader error (${response.status}): ${content.slice(0, 200)}`,
+          statusCode: response.status,
+          credits: 0,
+          error: {
+            code: ErrorCode.INVALID_INPUT,
+            message: `Jina Reader returned ${response.status}`,
+            retryable: false,
+            statusCode: response.status
+          }
+        };
+      } catch (error) {
+        lastError = classifyError(error);
+        if (lastError.retryable && attempt < MAX_RETRIES) {
+          const delayMs = calculateBackoff(attempt);
+          mcpLog(
+            "warning",
+            `Jina ${lastError.code}: ${lastError.message}. Retry ${attempt + 1}/${MAX_RETRIES + 1} in ${delayMs}ms`,
+            "jina"
+          );
+          await sleep(delayMs);
+          continue;
+        }
+        return {
+          content: `Jina Reader failed: ${lastError.message}`,
+          statusCode: lastError.statusCode ?? 500,
+          credits: 0,
+          error: lastError
+        };
+      }
+    }
+    return {
+      content: `Jina Reader failed after ${MAX_RETRIES + 1} attempts: ${lastError?.message ?? "Unknown error"}`,
+      statusCode: lastError?.statusCode ?? 500,
+      credits: 0,
+      error: lastError ?? { code: ErrorCode.UNKNOWN_ERROR, message: "All retries exhausted", retryable: false }
+    };
+  }
+}
+export {
+  JinaClient
+};
+//# sourceMappingURL=jina.js.map
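For orientation, a minimal sketch of how a caller might use the new client. Only JinaClient, its convert({ url, timeoutMs }) signature, and the never-throws response shape come from the diff above; the import path and the wrapper function are illustrative assumptions.

// Hypothetical caller: the import path and wrapper name are assumptions, not part of the package.
import { JinaClient } from "./clients/jina.js";

async function readAsMarkdown(url: string): Promise<string> {
  // With no explicit key, the constructor falls back to JINA_API_KEY from the environment.
  const jina = new JinaClient();
  const result = await jina.convert({ url, timeoutMs: 60_000 });
  if (result.error) {
    // convert() never throws; failures arrive as a classified error object on the response.
    throw new Error(`${result.error.code}: ${result.error.message}`);
  }
  return result.content; // markdown rendered by r.jina.ai
}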
package/dist/src/clients/jina.js.map
ADDED
@@ -0,0 +1,7 @@
+{
+  "version": 3,
+  "sources": ["../../../src/clients/jina.ts"],
+
"sourcesContent": ["/**\n * Jina Reader Client\n *\n * Converts any URL (including PDFs, DOCX, PPTX, HTML) into clean markdown via\n * the public `https://r.jina.ai/<url>` endpoint. Used by `scrape-links` for\n * document formats that our HTML-assumed pipeline (Scrape.do + Readability +\n * Turndown) cannot decode.\n *\n * NEVER throws \u2014 every failure surfaces as a classified `StructuredError`\n * in the returned response, matching the shape of `ScraperClient.scrape`.\n *\n * Auth: optional `JINA_API_KEY` raises the rate limit from 20 RPM to 200+ RPM.\n * Without a key the endpoint still works; we just retry more aggressively on\n * 429 responses.\n */\n\nimport {\n classifyError,\n fetchWithTimeout,\n sleep,\n ErrorCode,\n type StructuredError,\n} from '../utils/errors.js';\nimport { calculateBackoff } from '../utils/retry.js';\nimport { mcpLog } from '../utils/logger.js';\n\n// \u2500\u2500 Constants \u2500\u2500\n\nconst JINA_READER_BASE = 'https://r.jina.ai/' as const;\nconst DEFAULT_TIMEOUT_MS = 60_000 as const; // Jina can take a while for large PDFs\nconst MAX_RETRIES = 2 as const;\n\n// \u2500\u2500 Interfaces \u2500\u2500\n\nexport interface JinaConvertRequest {\n readonly url: string;\n readonly timeoutMs?: number;\n}\n\nexport interface JinaConvertResponse {\n readonly content: string;\n readonly statusCode: number;\n /** Always 0 \u2014 Jina is a separate service from Scrape.do's credit pool. */\n readonly credits: 0;\n readonly usageTokens?: number;\n readonly error?: StructuredError;\n}\n\n// \u2500\u2500 Client \u2500\u2500\n\nexport class JinaClient {\n private readonly apiKey: string | undefined;\n\n constructor(apiKey?: string) {\n const fromEnv = process.env.JINA_API_KEY?.trim();\n this.apiKey = apiKey?.trim() || fromEnv || undefined;\n }\n\n /**\n * Convert a URL to markdown via Jina Reader.\n * NEVER throws \u2014 always returns a JinaConvertResponse (possibly with error).\n */\n async convert(request: JinaConvertRequest): Promise<JinaConvertResponse> {\n const { url, timeoutMs = DEFAULT_TIMEOUT_MS } = request;\n\n try {\n new URL(url);\n } catch {\n return {\n content: `Invalid URL: ${url}`,\n statusCode: 400,\n credits: 0,\n error: { code: ErrorCode.INVALID_INPUT, message: `Invalid URL: ${url}`, retryable: false },\n };\n }\n\n // Jina Reader parses the full target URL as the path suffix. Query strings\n // and fragments in the target are preserved verbatim; no encoding needed.\n const jinaUrl = `${JINA_READER_BASE}${url}`;\n\n const headers: Record<string, string> = {\n Accept: 'text/markdown',\n };\n if (this.apiKey) {\n headers['Authorization'] = `Bearer ${this.apiKey}`;\n }\n\n let lastError: StructuredError | undefined;\n\n for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) {\n try {\n const response = await fetchWithTimeout(jinaUrl, {\n method: 'GET',\n headers,\n timeoutMs,\n });\n\n let content: string;\n try {\n content = await response.text();\n } catch (readError) {\n content = `Failed to read Jina response: ${readError instanceof Error ? readError.message : String(readError)}`;\n }\n\n const usageHeader = response.headers.get('x-usage-tokens');\n const usageTokens = usageHeader ? Number(usageHeader) : undefined;\n\n if (response.ok) {\n if (!content.trim()) {\n return {\n content: 'Jina returned an empty body',\n statusCode: response.status,\n credits: 0,\n usageTokens: Number.isFinite(usageTokens) ? 
usageTokens : undefined,\n error: {\n code: ErrorCode.UNSUPPORTED_BINARY_CONTENT,\n message: 'Jina Reader returned empty content for this URL',\n retryable: false,\n },\n };\n }\n return {\n content,\n statusCode: response.status,\n credits: 0,\n usageTokens: Number.isFinite(usageTokens) ? usageTokens : undefined,\n };\n }\n\n // 401/403 \u2014 auth or quota problems. Not retryable.\n if (response.status === 401 || response.status === 403) {\n return {\n content: `Jina auth/quota error (${response.status}): ${content.slice(0, 200)}`,\n statusCode: response.status,\n credits: 0,\n error: {\n code: response.status === 401 ? ErrorCode.AUTH_ERROR : ErrorCode.QUOTA_EXCEEDED,\n message: response.status === 401\n ? 'Jina Reader auth failed \u2014 check JINA_API_KEY'\n : 'Jina Reader quota exceeded',\n retryable: false,\n statusCode: response.status,\n },\n };\n }\n\n // 404 \u2014 the target URL itself was not found by Jina.\n if (response.status === 404) {\n return {\n content: `Jina could not fetch the target URL (404)`,\n statusCode: 404,\n credits: 0,\n error: {\n code: ErrorCode.NOT_FOUND,\n message: 'Target URL not reachable by Jina Reader',\n retryable: false,\n statusCode: 404,\n },\n };\n }\n\n // 429 / 5xx \u2014 retryable.\n if (response.status === 429 || response.status >= 500) {\n lastError = classifyError({ status: response.status, message: content.slice(0, 200) });\n if (attempt < MAX_RETRIES) {\n const delayMs = calculateBackoff(attempt);\n mcpLog(\n 'warning',\n `Jina ${response.status} on attempt ${attempt + 1}/${MAX_RETRIES + 1}. Retrying in ${delayMs}ms`,\n 'jina',\n );\n await sleep(delayMs);\n continue;\n }\n return {\n content: `Jina Reader error (${response.status}): ${content.slice(0, 200)}`,\n statusCode: response.status,\n credits: 0,\n error: lastError,\n };\n }\n\n // Anything else \u2014 treat as non-retryable client error.\n return {\n content: `Jina Reader error (${response.status}): ${content.slice(0, 200)}`,\n statusCode: response.status,\n credits: 0,\n error: {\n code: ErrorCode.INVALID_INPUT,\n message: `Jina Reader returned ${response.status}`,\n retryable: false,\n statusCode: response.status,\n },\n };\n } catch (error) {\n lastError = classifyError(error);\n if (lastError.retryable && attempt < MAX_RETRIES) {\n const delayMs = calculateBackoff(attempt);\n mcpLog(\n 'warning',\n `Jina ${lastError.code}: ${lastError.message}. Retry ${attempt + 1}/${MAX_RETRIES + 1} in ${delayMs}ms`,\n 'jina',\n );\n await sleep(delayMs);\n continue;\n }\n return {\n content: `Jina Reader failed: ${lastError.message}`,\n statusCode: lastError.statusCode ?? 500,\n credits: 0,\n error: lastError,\n };\n }\n }\n\n return {\n content: `Jina Reader failed after ${MAX_RETRIES + 1} attempts: ${lastError?.message ?? 'Unknown error'}`,\n statusCode: lastError?.statusCode ?? 500,\n credits: 0,\n error: lastError ?? { code: ErrorCode.UNKNOWN_ERROR, message: 'All retries exhausted', retryable: false },\n };\n }\n}\n"],
+
"mappings": "AAgBA;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,OAEK;AACP,SAAS,wBAAwB;AACjC,SAAS,cAAc;AAIvB,MAAM,mBAAmB;AACzB,MAAM,qBAAqB;AAC3B,MAAM,cAAc;AAoBb,MAAM,WAAW;AAAA,EACL;AAAA,EAEjB,YAAY,QAAiB;AAC3B,UAAM,UAAU,QAAQ,IAAI,cAAc,KAAK;AAC/C,SAAK,SAAS,QAAQ,KAAK,KAAK,WAAW;AAAA,EAC7C;AAAA;AAAA;AAAA;AAAA;AAAA,EAMA,MAAM,QAAQ,SAA2D;AACvE,UAAM,EAAE,KAAK,YAAY,mBAAmB,IAAI;AAEhD,QAAI;AACF,UAAI,IAAI,GAAG;AAAA,IACb,QAAQ;AACN,aAAO;AAAA,QACL,SAAS,gBAAgB,GAAG;AAAA,QAC5B,YAAY;AAAA,QACZ,SAAS;AAAA,QACT,OAAO,EAAE,MAAM,UAAU,eAAe,SAAS,gBAAgB,GAAG,IAAI,WAAW,MAAM;AAAA,MAC3F;AAAA,IACF;AAIA,UAAM,UAAU,GAAG,gBAAgB,GAAG,GAAG;AAEzC,UAAM,UAAkC;AAAA,MACtC,QAAQ;AAAA,IACV;AACA,QAAI,KAAK,QAAQ;AACf,cAAQ,eAAe,IAAI,UAAU,KAAK,MAAM;AAAA,IAClD;AAEA,QAAI;AAEJ,aAAS,UAAU,GAAG,WAAW,aAAa,WAAW;AACvD,UAAI;AACF,cAAM,WAAW,MAAM,iBAAiB,SAAS;AAAA,UAC/C,QAAQ;AAAA,UACR;AAAA,UACA;AAAA,QACF,CAAC;AAED,YAAI;AACJ,YAAI;AACF,oBAAU,MAAM,SAAS,KAAK;AAAA,QAChC,SAAS,WAAW;AAClB,oBAAU,iCAAiC,qBAAqB,QAAQ,UAAU,UAAU,OAAO,SAAS,CAAC;AAAA,QAC/G;AAEA,cAAM,cAAc,SAAS,QAAQ,IAAI,gBAAgB;AACzD,cAAM,cAAc,cAAc,OAAO,WAAW,IAAI;AAExD,YAAI,SAAS,IAAI;AACf,cAAI,CAAC,QAAQ,KAAK,GAAG;AACnB,mBAAO;AAAA,cACL,SAAS;AAAA,cACT,YAAY,SAAS;AAAA,cACrB,SAAS;AAAA,cACT,aAAa,OAAO,SAAS,WAAW,IAAI,cAAc;AAAA,cAC1D,OAAO;AAAA,gBACL,MAAM,UAAU;AAAA,gBAChB,SAAS;AAAA,gBACT,WAAW;AAAA,cACb;AAAA,YACF;AAAA,UACF;AACA,iBAAO;AAAA,YACL;AAAA,YACA,YAAY,SAAS;AAAA,YACrB,SAAS;AAAA,YACT,aAAa,OAAO,SAAS,WAAW,IAAI,cAAc;AAAA,UAC5D;AAAA,QACF;AAGA,YAAI,SAAS,WAAW,OAAO,SAAS,WAAW,KAAK;AACtD,iBAAO;AAAA,YACL,SAAS,0BAA0B,SAAS,MAAM,MAAM,QAAQ,MAAM,GAAG,GAAG,CAAC;AAAA,YAC7E,YAAY,SAAS;AAAA,YACrB,SAAS;AAAA,YACT,OAAO;AAAA,cACL,MAAM,SAAS,WAAW,MAAM,UAAU,aAAa,UAAU;AAAA,cACjE,SAAS,SAAS,WAAW,MACzB,sDACA;AAAA,cACJ,WAAW;AAAA,cACX,YAAY,SAAS;AAAA,YACvB;AAAA,UACF;AAAA,QACF;AAGA,YAAI,SAAS,WAAW,KAAK;AAC3B,iBAAO;AAAA,YACL,SAAS;AAAA,YACT,YAAY;AAAA,YACZ,SAAS;AAAA,YACT,OAAO;AAAA,cACL,MAAM,UAAU;AAAA,cAChB,SAAS;AAAA,cACT,WAAW;AAAA,cACX,YAAY;AAAA,YACd;AAAA,UACF;AAAA,QACF;AAGA,YAAI,SAAS,WAAW,OAAO,SAAS,UAAU,KAAK;AACrD,sBAAY,cAAc,EAAE,QAAQ,SAAS,QAAQ,SAAS,QAAQ,MAAM,GAAG,GAAG,EAAE,CAAC;AACrF,cAAI,UAAU,aAAa;AACzB,kBAAM,UAAU,iBAAiB,OAAO;AACxC;AAAA,cACE;AAAA,cACA,QAAQ,SAAS,MAAM,eAAe,UAAU,CAAC,IAAI,cAAc,CAAC,iBAAiB,OAAO;AAAA,cAC5F;AAAA,YACF;AACA,kBAAM,MAAM,OAAO;AACnB;AAAA,UACF;AACA,iBAAO;AAAA,YACL,SAAS,sBAAsB,SAAS,MAAM,MAAM,QAAQ,MAAM,GAAG,GAAG,CAAC;AAAA,YACzE,YAAY,SAAS;AAAA,YACrB,SAAS;AAAA,YACT,OAAO;AAAA,UACT;AAAA,QACF;AAGA,eAAO;AAAA,UACL,SAAS,sBAAsB,SAAS,MAAM,MAAM,QAAQ,MAAM,GAAG,GAAG,CAAC;AAAA,UACzE,YAAY,SAAS;AAAA,UACrB,SAAS;AAAA,UACT,OAAO;AAAA,YACL,MAAM,UAAU;AAAA,YAChB,SAAS,wBAAwB,SAAS,MAAM;AAAA,YAChD,WAAW;AAAA,YACX,YAAY,SAAS;AAAA,UACvB;AAAA,QACF;AAAA,MACF,SAAS,OAAO;AACd,oBAAY,cAAc,KAAK;AAC/B,YAAI,UAAU,aAAa,UAAU,aAAa;AAChD,gBAAM,UAAU,iBAAiB,OAAO;AACxC;AAAA,YACE;AAAA,YACA,QAAQ,UAAU,IAAI,KAAK,UAAU,OAAO,WAAW,UAAU,CAAC,IAAI,cAAc,CAAC,OAAO,OAAO;AAAA,YACnG;AAAA,UACF;AACA,gBAAM,MAAM,OAAO;AACnB;AAAA,QACF;AACA,eAAO;AAAA,UACL,SAAS,uBAAuB,UAAU,OAAO;AAAA,UACjD,YAAY,UAAU,cAAc;AAAA,UACpC,SAAS;AAAA,UACT,OAAO;AAAA,QACT;AAAA,MACF;AAAA,IACF;AAEA,WAAO;AAAA,MACL,SAAS,4BAA4B,cAAc,CAAC,cAAc,WAAW,WAAW,eAAe;AAAA,MACvG,YAAY,WAAW,cAAc;AAAA,MACrC,SAAS;AAAA,MACT,OAAO,aAAa,EAAE,MAAM,UAAU,eAAe,SAAS,yBAAyB,WAAW,MAAM;AAAA,IAC1G;AAAA,EACF;AACF;",
+  "names": []
+}
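The retry loop in jina.js above delegates its delay to calculateBackoff from src/utils/retry.js, which is unchanged in this release and not part of this diff. As a sketch only, a conventional exponential backoff with jitter (the base delay, cap, and jitter strategy here are assumptions, not the package's implementation):

// Sketch of a typical backoff helper; not the package's actual code.
function calculateBackoff(attempt: number, baseMs = 1000, maxMs = 10_000): number {
  const exponential = baseMs * 2 ** attempt; // 1s, 2s, 4s, ...
  const jitter = Math.random() * baseMs;     // de-synchronize concurrent retries
  return Math.min(exponential + jitter, maxMs);
}

function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

With MAX_RETRIES = 2, the convert() loop above makes at most three attempts per URL, which matches the "failed after N attempts" messages in the client.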
package/dist/src/clients/scraper.js
CHANGED
@@ -8,6 +8,7 @@ import {
 import { calculateBackoff } from "../utils/retry.js";
 import { pMapSettled } from "../utils/concurrency.js";
 import { mcpLog } from "../utils/logger.js";
+import { isBinaryDocumentContentType } from "../utils/source-type.js";
 const SCRAPE_MODES = ["basic", "javascript", "javascript_geo"];
 const CREDIT_COSTS = { basic: 1, javascript: 5, javascript_geo: 5 };
 const SCRAPE_BATCH_SIZE = 30;
@@ -76,6 +77,26 @@ class ScraperClient {
           content = `Failed to read response: ${readError instanceof Error ? readError.message : String(readError)}`;
         }
         if (response.ok) {
+          const contentType = response.headers.get("content-type");
+          if (isBinaryDocumentContentType(contentType)) {
+            mcpLog(
+              "info",
+              `Binary document detected at ${url} (content-type: ${contentType}). Deferring to Jina Reader.`,
+              "scraper"
+            );
+            return {
+              content: `Binary document (${contentType ?? "unknown"}); routed to Jina Reader`,
+              statusCode: 415,
+              credits: 0,
+              headers: Object.fromEntries(response.headers.entries()),
+              error: {
+                code: ErrorCode.UNSUPPORTED_BINARY_CONTENT,
+                message: `Scrape.do cannot decode ${contentType ?? "this binary content-type"}`,
+                retryable: false,
+                statusCode: 415
+              }
+            };
+          }
           return {
             content,
             statusCode: response.status,
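The new gate relies on isBinaryDocumentContentType from src/utils/source-type.js (+40/-1 in this release), whose body is not shown in this diff. A plausible shape for such a check, offered as an assumption based on the formats named in the surrounding comments (PDF/DOCX/PPTX/XLSX/octet-stream) rather than the package's actual code:

// Assumed helper: the real list in src/utils/source-type.ts may differ.
const BINARY_DOCUMENT_PREFIXES = [
  "application/pdf",
  "application/msword",
  "application/vnd.openxmlformats-officedocument", // covers .docx, .pptx, .xlsx
  "application/vnd.ms-",                           // legacy Office formats
  "application/octet-stream",
];

export function isBinaryDocumentContentType(contentType: string | null): boolean {
  if (!contentType) return false;
  const normalized = contentType.toLowerCase().split(";")[0].trim(); // drop "; charset=..."
  return BINARY_DOCUMENT_PREFIXES.some((prefix) => normalized.startsWith(prefix));
}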
package/dist/src/clients/scraper.js.map
CHANGED
@@ -1,7 +1,7 @@
 {
   "version": 3,
   "sources": ["../../../src/clients/scraper.ts"],
-
"sourcesContent": ["/**\n * Web Scraper Client\n * Generic interface for URL scraping with automatic fallback modes\n * Implements robust error handling that NEVER crashes\n */\n\nimport { parseEnv, SCRAPER, CONCURRENCY } from '../config/index.js';\nimport {\n classifyError,\n fetchWithTimeout,\n sleep,\n ErrorCode,\n type StructuredError,\n} from '../utils/errors.js';\nimport { calculateBackoff } from '../utils/retry.js';\nimport { pMapSettled } from '../utils/concurrency.js';\nimport { mcpLog } from '../utils/logger.js';\n\n// \u2500\u2500 Constants \u2500\u2500\n\nconst SCRAPE_MODES = ['basic', 'javascript', 'javascript_geo'] as const;\ntype ScrapeMode = typeof SCRAPE_MODES[number];\n\nconst CREDIT_COSTS: Record<string, number> = { basic: 1, javascript: 5, javascript_geo: 5 } as const;\nconst SCRAPE_BATCH_SIZE = 30 as const;\nconst MAX_RETRIES = 1 as const;\n/** Overall timeout for all fallback attempts on a single URL */\nconst FALLBACK_OVERALL_TIMEOUT_MS = 30_000 as const;\n\n// \u2500\u2500 Interfaces \u2500\u2500\n\ninterface ScrapeRequest {\n readonly url: string;\n readonly mode?: 'basic' | 'javascript';\n readonly timeout?: number;\n readonly country?: string;\n}\n\ninterface ScrapeResponse {\n readonly content: string;\n readonly statusCode: number;\n readonly credits: number;\n readonly headers?: Record<string, string>;\n readonly error?: StructuredError;\n}\n\ninterface BatchScrapeResult {\n readonly results: ReadonlyArray<ScrapeResponse & { readonly url: string }>;\n readonly batchesProcessed: number;\n readonly totalAttempted: number;\n readonly rateLimitHits: number;\n}\n\n// Status codes that indicate we should retry (no credit consumed)\nconst RETRYABLE_STATUS_CODES = new Set([429, 502, 503, 504, 510]);\n// Status codes that are permanent failures (don't retry)\nconst PERMANENT_FAILURE_CODES = new Set([400, 401, 403]);\n\n/** Minimum stripped-text length to consider a scrape successful (filters out empty SPA shells) */\nconst MIN_USEFUL_CONTENT_LENGTH = 200 as const;\n\n/** Fallback attempt descriptor used by scrapeWithFallback */\ninterface FallbackAttempt {\n readonly mode: 'basic' | 'javascript';\n readonly country?: string;\n readonly description: string;\n}\n\nconst FALLBACK_ATTEMPTS: readonly FallbackAttempt[] = [\n { mode: 'basic', description: 'basic mode' },\n { mode: 'javascript', description: 'javascript rendering' },\n { mode: 'javascript', country: 'us', description: 'javascript + US geo-targeting' },\n] as const;\n\nexport class ScraperClient {\n private apiKey: string;\n private baseURL = 'https://api.scrape.do';\n\n constructor(apiKey?: string) {\n const env = parseEnv();\n this.apiKey = apiKey || env.SCRAPER_API_KEY;\n\n if (!this.apiKey) {\n throw new Error('Web scraping capability is not configured. Please set up the required API credentials.');\n }\n }\n\n /**\n * Scrape a single URL with retry logic\n * NEVER throws - always returns a ScrapeResponse (possibly with error)\n */\n async scrape(request: ScrapeRequest, maxRetries = MAX_RETRIES): Promise<ScrapeResponse> {\n const { url, mode = 'basic', timeout = 15, country } = request;\n const credits = CREDIT_COSTS[mode] ?? 
1;\n\n // Validate URL first\n try {\n new URL(url);\n } catch {\n return {\n content: `Invalid URL: ${url}`,\n statusCode: 400,\n credits: 0,\n error: { code: ErrorCode.INVALID_INPUT, message: `Invalid URL: ${url}`, retryable: false },\n };\n }\n\n const params = new URLSearchParams({\n url: url,\n token: this.apiKey,\n timeout: String(timeout * 1000),\n });\n\n if (mode === 'javascript') {\n params.append('render', 'true');\n }\n\n if (country) {\n params.append('geoCode', country.toUpperCase());\n }\n\n const apiUrl = `${this.baseURL}?${params.toString()}`;\n let lastError: StructuredError | undefined;\n\n for (let attempt = 0; attempt < maxRetries; attempt++) {\n try {\n // Use AbortController for timeout\n const timeoutMs = (timeout + 5) * 1000; // Add 5s buffer over scrape timeout\n const response = await fetchWithTimeout(apiUrl, {\n method: 'GET',\n headers: { Accept: 'text/html,application/json' },\n timeoutMs,\n });\n\n // Safely read response body\n let content: string;\n try {\n content = await response.text();\n } catch (readError) {\n content = `Failed to read response: ${readError instanceof Error ? readError.message : String(readError)}`;\n }\n\n // SUCCESS: 2xx - Successful API call\n if (response.ok) {\n return {\n content,\n statusCode: response.status,\n credits,\n headers: Object.fromEntries(response.headers.entries()),\n };\n }\n\n // 404 - Target not found (permanent, but not an error for our purposes)\n if (response.status === 404) {\n return {\n content: '404 - Page not found',\n statusCode: 404,\n credits,\n };\n }\n\n // Permanent failures - don't retry\n if (PERMANENT_FAILURE_CODES.has(response.status)) {\n const errorMsg = response.status === 401\n ? 'No credits remaining or subscription suspended'\n : `Request failed with status ${response.status}`;\n return {\n content: `Error: ${errorMsg}`,\n statusCode: response.status,\n credits: 0,\n error: {\n code: response.status === 401 ? ErrorCode.AUTH_ERROR : ErrorCode.INVALID_INPUT,\n message: errorMsg,\n retryable: false,\n statusCode: response.status,\n },\n };\n }\n\n // Retryable status codes\n if (RETRYABLE_STATUS_CODES.has(response.status)) {\n lastError = {\n code: response.status === 429 ? ErrorCode.RATE_LIMITED : ErrorCode.SERVICE_UNAVAILABLE,\n message: `Server returned ${response.status}`,\n retryable: true,\n statusCode: response.status,\n };\n\n if (attempt < maxRetries - 1) {\n const delayMs = calculateBackoff(attempt);\n mcpLog('warning', `${response.status} on attempt ${attempt + 1}/${maxRetries}. Retrying in ${delayMs}ms`, 'scraper');\n await sleep(delayMs);\n continue;\n }\n }\n\n // Other non-success status - treat as retryable\n lastError = classifyError({ status: response.status, message: content });\n if (attempt < maxRetries - 1 && lastError.retryable) {\n const delayMs = calculateBackoff(attempt);\n mcpLog('warning', `Status ${response.status}. 
Retrying in ${delayMs}ms`, 'scraper');\n await sleep(delayMs);\n continue;\n }\n\n // Final attempt failed\n return {\n content: `Error: ${lastError.message}`,\n statusCode: response.status,\n credits: 0,\n error: lastError,\n };\n\n } catch (error) {\n lastError = classifyError(error);\n\n // Non-retryable errors - return immediately\n if (!lastError.retryable) {\n return {\n content: `Error: ${lastError.message}`,\n statusCode: lastError.statusCode || 500,\n credits: 0,\n error: lastError,\n };\n }\n\n // Retryable error - continue if attempts remaining\n if (attempt < maxRetries - 1) {\n const delayMs = calculateBackoff(attempt);\n mcpLog('warning', `${lastError.code}: ${lastError.message}. Retry ${attempt + 1}/${maxRetries} in ${delayMs}ms`, 'scraper');\n await sleep(delayMs);\n continue;\n }\n }\n }\n\n // All retries exhausted\n return {\n content: `Error: Failed after ${maxRetries} attempts. ${lastError?.message || 'Unknown error'}`,\n statusCode: lastError?.statusCode || 500,\n credits: 0,\n error: lastError || { code: ErrorCode.UNKNOWN_ERROR, message: 'All retries exhausted', retryable: false },\n };\n }\n\n /**\n * Scrape with automatic fallback through different modes\n * NEVER throws - always returns a ScrapeResponse\n */\n async scrapeWithFallback(url: string, options: { timeout?: number } = {}): Promise<ScrapeResponse> {\n const attemptResults: string[] = [];\n let lastResult: ScrapeResponse | null = null;\n const deadline = Date.now() + FALLBACK_OVERALL_TIMEOUT_MS;\n\n for (const attempt of FALLBACK_ATTEMPTS) {\n // Check overall deadline before starting next fallback\n if (Date.now() >= deadline) {\n mcpLog('warning', `Overall fallback timeout reached for ${url} after ${attemptResults.length} attempt(s)`, 'scraper');\n break;\n }\n\n const result = await this.tryFallbackAttempt(url, attempt, options);\n\n if (result.done) {\n if (attemptResults.length > 0) {\n mcpLog('info', `Success with ${attempt.description} after ${attemptResults.length} fallback(s)`, 'scraper');\n }\n return result.response;\n }\n\n lastResult = result.response;\n attemptResults.push(`${attempt.description}: ${result.response.error?.message || result.response.statusCode}`);\n mcpLog('warning', `Failed with ${attempt.description} (${result.response.statusCode}), trying next fallback...`, 'scraper');\n }\n\n // All fallbacks exhausted or deadline reached\n const errorMessage = `Failed after ${attemptResults.length} fallback attempt(s): ${attemptResults.join('; ')}`;\n return {\n content: `Error: ${errorMessage}`,\n statusCode: lastResult?.statusCode || 500,\n credits: 0,\n error: {\n code: ErrorCode.SERVICE_UNAVAILABLE,\n message: errorMessage,\n retryable: false,\n },\n };\n }\n\n /**\n * Execute a single fallback attempt and determine whether to continue.\n * Returns { done: true } on success/terminal or { done: false } to try the next mode.\n */\n private async tryFallbackAttempt(\n url: string,\n attempt: FallbackAttempt,\n options: { timeout?: number },\n ): Promise<{ done: boolean; response: ScrapeResponse }> {\n const result = await this.scrape({\n url,\n mode: attempt.mode,\n timeout: options.timeout,\n country: attempt.country,\n });\n\n // Success \u2014 but verify content isn't an empty SPA shell\n if (result.statusCode >= 200 && result.statusCode < 300 && !result.error) {\n const strippedLength = result.content.replace(/<[^>]*>/g, '').trim().length;\n if (strippedLength < MIN_USEFUL_CONTENT_LENGTH && attempt.mode === 'basic') {\n mcpLog('info', `Basic mode returned only ${strippedLength} 
chars of text for ${url} \u2014 trying JS rendering`, 'scraper');\n return { done: false, response: result };\n }\n return { done: true, response: result };\n }\n\n // 404 is a valid response, not an error\n if (result.statusCode === 404) {\n return { done: true, response: result };\n }\n\n // 502 Bad Gateway \u2014 almost always a WAF/CDN block, not a transient issue.\n // Switching render mode won't bypass CDN protection, so fail fast.\n if (result.statusCode === 502) {\n mcpLog('warning', `502 Bad Gateway for ${url} \u2014 likely WAF/CDN block, skipping fallback modes`, 'scraper');\n return { done: true, response: {\n ...result,\n error: {\n code: ErrorCode.SERVICE_UNAVAILABLE,\n message: 'Bad gateway \u2014 site is blocking automated access',\n retryable: false,\n },\n }};\n }\n\n // Non-retryable errors - don't try other modes\n if (result.error && !result.error.retryable) {\n mcpLog('error', `Non-retryable error with ${attempt.description}: ${result.error.message}`, 'scraper');\n return { done: true, response: result };\n }\n\n return { done: false, response: result };\n }\n\n /**\n * Scrape multiple URLs with batching\n * NEVER throws - always returns results array\n */\n async scrapeMultiple(urls: string[], options: { timeout?: number } = {}): Promise<Array<ScrapeResponse & { url: string }>> {\n if (urls.length === 0) {\n return [];\n }\n\n if (urls.length <= SCRAPE_BATCH_SIZE) {\n return this.processBatch(urls, options);\n }\n\n const result = await this.batchScrape(urls, options);\n return result.results as Array<ScrapeResponse & { url: string }>;\n }\n\n /**\n * Batch scrape with progress callback\n * NEVER throws - uses Promise.allSettled internally\n */\n async batchScrape(\n urls: string[],\n options: { timeout?: number } = {},\n onBatchComplete?: (batchNum: number, totalBatches: number, processed: number) => void\n ): Promise<BatchScrapeResult> {\n const totalBatches = Math.ceil(urls.length / SCRAPE_BATCH_SIZE);\n const allResults: Array<ScrapeResponse & { url: string }> = [];\n let rateLimitHits = 0;\n\n mcpLog('info', `Starting batch processing: ${urls.length} URLs in ${totalBatches} batch(es)`, 'scraper');\n\n for (let batchNum = 0; batchNum < totalBatches; batchNum++) {\n const startIdx = batchNum * SCRAPE_BATCH_SIZE;\n const endIdx = Math.min(startIdx + SCRAPE_BATCH_SIZE, urls.length);\n const batchUrls = urls.slice(startIdx, endIdx);\n\n mcpLog('info', `Processing batch ${batchNum + 1}/${totalBatches} (${batchUrls.length} URLs)`, 'scraper');\n\n const batchResults = await pMapSettled(\n batchUrls,\n url => this.scrapeWithFallback(url, options),\n CONCURRENCY.SCRAPER\n );\n\n for (let i = 0; i < batchResults.length; i++) {\n const result = batchResults[i];\n if (!result) continue;\n const url = batchUrls[i] ?? '';\n\n if (result.status === 'fulfilled') {\n const scrapeResult = result.value;\n allResults.push({ ...scrapeResult, url });\n\n // Track rate limits\n if (scrapeResult.error?.code === ErrorCode.RATE_LIMITED) {\n rateLimitHits++;\n }\n } else {\n // This shouldn't happen since scrapeWithFallback never throws,\n // but handle it gracefully just in case\n const errorMsg = result.reason instanceof Error ? 
result.reason.message : String(result.reason);\n mcpLog('error', `Unexpected rejection for ${url}: ${errorMsg}`, 'scraper');\n\n allResults.push({\n url,\n content: `Error: Unexpected failure - ${errorMsg}`,\n statusCode: 500,\n credits: 0,\n error: classifyError(result.reason),\n });\n }\n }\n\n // Safe callback invocation\n try {\n onBatchComplete?.(batchNum + 1, totalBatches, allResults.length);\n } catch (callbackError) {\n mcpLog('error', `onBatchComplete callback error: ${callbackError}`, 'scraper');\n }\n\n mcpLog('info', `Completed batch ${batchNum + 1}/${totalBatches} (${allResults.length}/${urls.length} total)`, 'scraper');\n\n // Adaptive delay between batches \u2014 back off harder under rate limiting\n if (batchNum < totalBatches - 1) {\n const batchDelay = rateLimitHits > 0 ? 2000 : 500;\n await sleep(batchDelay);\n }\n }\n\n return { results: allResults, batchesProcessed: totalBatches, totalAttempted: urls.length, rateLimitHits };\n }\n\n /**\n * Process a single batch of URLs\n * NEVER throws\n */\n private async processBatch(urls: string[], options: { timeout?: number }): Promise<Array<ScrapeResponse & { url: string }>> {\n const results = await pMapSettled(urls, url => this.scrapeWithFallback(url, options), CONCURRENCY.SCRAPER);\n\n return results.map((result, index) => {\n const url = urls[index] || '';\n\n if (result.status === 'fulfilled') {\n return { ...result.value, url };\n }\n\n // Shouldn't happen, but handle gracefully\n return {\n url,\n content: `Error: ${result.reason instanceof Error ? result.reason.message : String(result.reason)}`,\n statusCode: 500,\n credits: 0,\n error: classifyError(result.reason),\n };\n });\n }\n}\n"],
-
"mappings": "AAMA,SAAS,UAAmB,mBAAmB;AAC/C;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,OAEK;AACP,SAAS,wBAAwB;AACjC,SAAS,mBAAmB;AAC5B,SAAS,cAAc;
+
"sourcesContent": ["/**\n * Web Scraper Client\n * Generic interface for URL scraping with automatic fallback modes\n * Implements robust error handling that NEVER crashes\n */\n\nimport { parseEnv, SCRAPER, CONCURRENCY } from '../config/index.js';\nimport {\n classifyError,\n fetchWithTimeout,\n sleep,\n ErrorCode,\n type StructuredError,\n} from '../utils/errors.js';\nimport { calculateBackoff } from '../utils/retry.js';\nimport { pMapSettled } from '../utils/concurrency.js';\nimport { mcpLog } from '../utils/logger.js';\nimport { isBinaryDocumentContentType } from '../utils/source-type.js';\n\n// \u2500\u2500 Constants \u2500\u2500\n\nconst SCRAPE_MODES = ['basic', 'javascript', 'javascript_geo'] as const;\ntype ScrapeMode = typeof SCRAPE_MODES[number];\n\nconst CREDIT_COSTS: Record<string, number> = { basic: 1, javascript: 5, javascript_geo: 5 } as const;\nconst SCRAPE_BATCH_SIZE = 30 as const;\nconst MAX_RETRIES = 1 as const;\n/** Overall timeout for all fallback attempts on a single URL */\nconst FALLBACK_OVERALL_TIMEOUT_MS = 30_000 as const;\n\n// \u2500\u2500 Interfaces \u2500\u2500\n\ninterface ScrapeRequest {\n readonly url: string;\n readonly mode?: 'basic' | 'javascript';\n readonly timeout?: number;\n readonly country?: string;\n}\n\ninterface ScrapeResponse {\n readonly content: string;\n readonly statusCode: number;\n readonly credits: number;\n readonly headers?: Record<string, string>;\n readonly error?: StructuredError;\n}\n\ninterface BatchScrapeResult {\n readonly results: ReadonlyArray<ScrapeResponse & { readonly url: string }>;\n readonly batchesProcessed: number;\n readonly totalAttempted: number;\n readonly rateLimitHits: number;\n}\n\n// Status codes that indicate we should retry (no credit consumed)\nconst RETRYABLE_STATUS_CODES = new Set([429, 502, 503, 504, 510]);\n// Status codes that are permanent failures (don't retry)\nconst PERMANENT_FAILURE_CODES = new Set([400, 401, 403]);\n\n/** Minimum stripped-text length to consider a scrape successful (filters out empty SPA shells) */\nconst MIN_USEFUL_CONTENT_LENGTH = 200 as const;\n\n/** Fallback attempt descriptor used by scrapeWithFallback */\ninterface FallbackAttempt {\n readonly mode: 'basic' | 'javascript';\n readonly country?: string;\n readonly description: string;\n}\n\nconst FALLBACK_ATTEMPTS: readonly FallbackAttempt[] = [\n { mode: 'basic', description: 'basic mode' },\n { mode: 'javascript', description: 'javascript rendering' },\n { mode: 'javascript', country: 'us', description: 'javascript + US geo-targeting' },\n] as const;\n\nexport class ScraperClient {\n private apiKey: string;\n private baseURL = 'https://api.scrape.do';\n\n constructor(apiKey?: string) {\n const env = parseEnv();\n this.apiKey = apiKey || env.SCRAPER_API_KEY;\n\n if (!this.apiKey) {\n throw new Error('Web scraping capability is not configured. Please set up the required API credentials.');\n }\n }\n\n /**\n * Scrape a single URL with retry logic\n * NEVER throws - always returns a ScrapeResponse (possibly with error)\n */\n async scrape(request: ScrapeRequest, maxRetries = MAX_RETRIES): Promise<ScrapeResponse> {\n const { url, mode = 'basic', timeout = 15, country } = request;\n const credits = CREDIT_COSTS[mode] ?? 
1;\n\n // Validate URL first\n try {\n new URL(url);\n } catch {\n return {\n content: `Invalid URL: ${url}`,\n statusCode: 400,\n credits: 0,\n error: { code: ErrorCode.INVALID_INPUT, message: `Invalid URL: ${url}`, retryable: false },\n };\n }\n\n const params = new URLSearchParams({\n url: url,\n token: this.apiKey,\n timeout: String(timeout * 1000),\n });\n\n if (mode === 'javascript') {\n params.append('render', 'true');\n }\n\n if (country) {\n params.append('geoCode', country.toUpperCase());\n }\n\n const apiUrl = `${this.baseURL}?${params.toString()}`;\n let lastError: StructuredError | undefined;\n\n for (let attempt = 0; attempt < maxRetries; attempt++) {\n try {\n // Use AbortController for timeout\n const timeoutMs = (timeout + 5) * 1000; // Add 5s buffer over scrape timeout\n const response = await fetchWithTimeout(apiUrl, {\n method: 'GET',\n headers: { Accept: 'text/html,application/json' },\n timeoutMs,\n });\n\n // Safely read response body\n let content: string;\n try {\n content = await response.text();\n } catch (readError) {\n content = `Failed to read response: ${readError instanceof Error ? readError.message : String(readError)}`;\n }\n\n // SUCCESS: 2xx - Successful API call\n if (response.ok) {\n // Content-Type gate: if the origin served a binary document format\n // (PDF/DOCX/PPTX/XLSX/octet-stream), discard the body and surface\n // UNSUPPORTED_BINARY_CONTENT so the tool handler can reroute this\n // URL through the Jina Reader path. Reading binary as text produces\n // mojibake that silently passes Readability + Turndown (both of which\n // short-circuit on \"no `<` tag\") and contaminates the LLM.\n const contentType = response.headers.get('content-type');\n if (isBinaryDocumentContentType(contentType)) {\n mcpLog(\n 'info',\n `Binary document detected at ${url} (content-type: ${contentType}). Deferring to Jina Reader.`,\n 'scraper',\n );\n return {\n content: `Binary document (${contentType ?? 'unknown'}); routed to Jina Reader`,\n statusCode: 415,\n credits: 0,\n headers: Object.fromEntries(response.headers.entries()),\n error: {\n code: ErrorCode.UNSUPPORTED_BINARY_CONTENT,\n message: `Scrape.do cannot decode ${contentType ?? 'this binary content-type'}`,\n retryable: false,\n statusCode: 415,\n },\n };\n }\n\n return {\n content,\n statusCode: response.status,\n credits,\n headers: Object.fromEntries(response.headers.entries()),\n };\n }\n\n // 404 - Target not found (permanent, but not an error for our purposes)\n if (response.status === 404) {\n return {\n content: '404 - Page not found',\n statusCode: 404,\n credits,\n };\n }\n\n // Permanent failures - don't retry\n if (PERMANENT_FAILURE_CODES.has(response.status)) {\n const errorMsg = response.status === 401\n ? 'No credits remaining or subscription suspended'\n : `Request failed with status ${response.status}`;\n return {\n content: `Error: ${errorMsg}`,\n statusCode: response.status,\n credits: 0,\n error: {\n code: response.status === 401 ? ErrorCode.AUTH_ERROR : ErrorCode.INVALID_INPUT,\n message: errorMsg,\n retryable: false,\n statusCode: response.status,\n },\n };\n }\n\n // Retryable status codes\n if (RETRYABLE_STATUS_CODES.has(response.status)) {\n lastError = {\n code: response.status === 429 ? 
ErrorCode.RATE_LIMITED : ErrorCode.SERVICE_UNAVAILABLE,\n message: `Server returned ${response.status}`,\n retryable: true,\n statusCode: response.status,\n };\n\n if (attempt < maxRetries - 1) {\n const delayMs = calculateBackoff(attempt);\n mcpLog('warning', `${response.status} on attempt ${attempt + 1}/${maxRetries}. Retrying in ${delayMs}ms`, 'scraper');\n await sleep(delayMs);\n continue;\n }\n }\n\n // Other non-success status - treat as retryable\n lastError = classifyError({ status: response.status, message: content });\n if (attempt < maxRetries - 1 && lastError.retryable) {\n const delayMs = calculateBackoff(attempt);\n mcpLog('warning', `Status ${response.status}. Retrying in ${delayMs}ms`, 'scraper');\n await sleep(delayMs);\n continue;\n }\n\n // Final attempt failed\n return {\n content: `Error: ${lastError.message}`,\n statusCode: response.status,\n credits: 0,\n error: lastError,\n };\n\n } catch (error) {\n lastError = classifyError(error);\n\n // Non-retryable errors - return immediately\n if (!lastError.retryable) {\n return {\n content: `Error: ${lastError.message}`,\n statusCode: lastError.statusCode || 500,\n credits: 0,\n error: lastError,\n };\n }\n\n // Retryable error - continue if attempts remaining\n if (attempt < maxRetries - 1) {\n const delayMs = calculateBackoff(attempt);\n mcpLog('warning', `${lastError.code}: ${lastError.message}. Retry ${attempt + 1}/${maxRetries} in ${delayMs}ms`, 'scraper');\n await sleep(delayMs);\n continue;\n }\n }\n }\n\n // All retries exhausted\n return {\n content: `Error: Failed after ${maxRetries} attempts. ${lastError?.message || 'Unknown error'}`,\n statusCode: lastError?.statusCode || 500,\n credits: 0,\n error: lastError || { code: ErrorCode.UNKNOWN_ERROR, message: 'All retries exhausted', retryable: false },\n };\n }\n\n /**\n * Scrape with automatic fallback through different modes\n * NEVER throws - always returns a ScrapeResponse\n */\n async scrapeWithFallback(url: string, options: { timeout?: number } = {}): Promise<ScrapeResponse> {\n const attemptResults: string[] = [];\n let lastResult: ScrapeResponse | null = null;\n const deadline = Date.now() + FALLBACK_OVERALL_TIMEOUT_MS;\n\n for (const attempt of FALLBACK_ATTEMPTS) {\n // Check overall deadline before starting next fallback\n if (Date.now() >= deadline) {\n mcpLog('warning', `Overall fallback timeout reached for ${url} after ${attemptResults.length} attempt(s)`, 'scraper');\n break;\n }\n\n const result = await this.tryFallbackAttempt(url, attempt, options);\n\n if (result.done) {\n if (attemptResults.length > 0) {\n mcpLog('info', `Success with ${attempt.description} after ${attemptResults.length} fallback(s)`, 'scraper');\n }\n return result.response;\n }\n\n lastResult = result.response;\n attemptResults.push(`${attempt.description}: ${result.response.error?.message || result.response.statusCode}`);\n mcpLog('warning', `Failed with ${attempt.description} (${result.response.statusCode}), trying next fallback...`, 'scraper');\n }\n\n // All fallbacks exhausted or deadline reached\n const errorMessage = `Failed after ${attemptResults.length} fallback attempt(s): ${attemptResults.join('; ')}`;\n return {\n content: `Error: ${errorMessage}`,\n statusCode: lastResult?.statusCode || 500,\n credits: 0,\n error: {\n code: ErrorCode.SERVICE_UNAVAILABLE,\n message: errorMessage,\n retryable: false,\n },\n };\n }\n\n /**\n * Execute a single fallback attempt and determine whether to continue.\n * Returns { done: true } on success/terminal or { done: false } to try the 
next mode.\n */\n private async tryFallbackAttempt(\n url: string,\n attempt: FallbackAttempt,\n options: { timeout?: number },\n ): Promise<{ done: boolean; response: ScrapeResponse }> {\n const result = await this.scrape({\n url,\n mode: attempt.mode,\n timeout: options.timeout,\n country: attempt.country,\n });\n\n // Success \u2014 but verify content isn't an empty SPA shell\n if (result.statusCode >= 200 && result.statusCode < 300 && !result.error) {\n const strippedLength = result.content.replace(/<[^>]*>/g, '').trim().length;\n if (strippedLength < MIN_USEFUL_CONTENT_LENGTH && attempt.mode === 'basic') {\n mcpLog('info', `Basic mode returned only ${strippedLength} chars of text for ${url} \u2014 trying JS rendering`, 'scraper');\n return { done: false, response: result };\n }\n return { done: true, response: result };\n }\n\n // 404 is a valid response, not an error\n if (result.statusCode === 404) {\n return { done: true, response: result };\n }\n\n // 502 Bad Gateway \u2014 almost always a WAF/CDN block, not a transient issue.\n // Switching render mode won't bypass CDN protection, so fail fast.\n if (result.statusCode === 502) {\n mcpLog('warning', `502 Bad Gateway for ${url} \u2014 likely WAF/CDN block, skipping fallback modes`, 'scraper');\n return { done: true, response: {\n ...result,\n error: {\n code: ErrorCode.SERVICE_UNAVAILABLE,\n message: 'Bad gateway \u2014 site is blocking automated access',\n retryable: false,\n },\n }};\n }\n\n // Non-retryable errors - don't try other modes\n if (result.error && !result.error.retryable) {\n mcpLog('error', `Non-retryable error with ${attempt.description}: ${result.error.message}`, 'scraper');\n return { done: true, response: result };\n }\n\n return { done: false, response: result };\n }\n\n /**\n * Scrape multiple URLs with batching\n * NEVER throws - always returns results array\n */\n async scrapeMultiple(urls: string[], options: { timeout?: number } = {}): Promise<Array<ScrapeResponse & { url: string }>> {\n if (urls.length === 0) {\n return [];\n }\n\n if (urls.length <= SCRAPE_BATCH_SIZE) {\n return this.processBatch(urls, options);\n }\n\n const result = await this.batchScrape(urls, options);\n return result.results as Array<ScrapeResponse & { url: string }>;\n }\n\n /**\n * Batch scrape with progress callback\n * NEVER throws - uses Promise.allSettled internally\n */\n async batchScrape(\n urls: string[],\n options: { timeout?: number } = {},\n onBatchComplete?: (batchNum: number, totalBatches: number, processed: number) => void\n ): Promise<BatchScrapeResult> {\n const totalBatches = Math.ceil(urls.length / SCRAPE_BATCH_SIZE);\n const allResults: Array<ScrapeResponse & { url: string }> = [];\n let rateLimitHits = 0;\n\n mcpLog('info', `Starting batch processing: ${urls.length} URLs in ${totalBatches} batch(es)`, 'scraper');\n\n for (let batchNum = 0; batchNum < totalBatches; batchNum++) {\n const startIdx = batchNum * SCRAPE_BATCH_SIZE;\n const endIdx = Math.min(startIdx + SCRAPE_BATCH_SIZE, urls.length);\n const batchUrls = urls.slice(startIdx, endIdx);\n\n mcpLog('info', `Processing batch ${batchNum + 1}/${totalBatches} (${batchUrls.length} URLs)`, 'scraper');\n\n const batchResults = await pMapSettled(\n batchUrls,\n url => this.scrapeWithFallback(url, options),\n CONCURRENCY.SCRAPER\n );\n\n for (let i = 0; i < batchResults.length; i++) {\n const result = batchResults[i];\n if (!result) continue;\n const url = batchUrls[i] ?? 
'';\n\n if (result.status === 'fulfilled') {\n const scrapeResult = result.value;\n allResults.push({ ...scrapeResult, url });\n\n // Track rate limits\n if (scrapeResult.error?.code === ErrorCode.RATE_LIMITED) {\n rateLimitHits++;\n }\n } else {\n // This shouldn't happen since scrapeWithFallback never throws,\n // but handle it gracefully just in case\n const errorMsg = result.reason instanceof Error ? result.reason.message : String(result.reason);\n mcpLog('error', `Unexpected rejection for ${url}: ${errorMsg}`, 'scraper');\n\n allResults.push({\n url,\n content: `Error: Unexpected failure - ${errorMsg}`,\n statusCode: 500,\n credits: 0,\n error: classifyError(result.reason),\n });\n }\n }\n\n // Safe callback invocation\n try {\n onBatchComplete?.(batchNum + 1, totalBatches, allResults.length);\n } catch (callbackError) {\n mcpLog('error', `onBatchComplete callback error: ${callbackError}`, 'scraper');\n }\n\n mcpLog('info', `Completed batch ${batchNum + 1}/${totalBatches} (${allResults.length}/${urls.length} total)`, 'scraper');\n\n // Adaptive delay between batches \u2014 back off harder under rate limiting\n if (batchNum < totalBatches - 1) {\n const batchDelay = rateLimitHits > 0 ? 2000 : 500;\n await sleep(batchDelay);\n }\n }\n\n return { results: allResults, batchesProcessed: totalBatches, totalAttempted: urls.length, rateLimitHits };\n }\n\n /**\n * Process a single batch of URLs\n * NEVER throws\n */\n private async processBatch(urls: string[], options: { timeout?: number }): Promise<Array<ScrapeResponse & { url: string }>> {\n const results = await pMapSettled(urls, url => this.scrapeWithFallback(url, options), CONCURRENCY.SCRAPER);\n\n return results.map((result, index) => {\n const url = urls[index] || '';\n\n if (result.status === 'fulfilled') {\n return { ...result.value, url };\n }\n\n // Shouldn't happen, but handle gracefully\n return {\n url,\n content: `Error: ${result.reason instanceof Error ? result.reason.message : String(result.reason)}`,\n statusCode: 500,\n credits: 0,\n error: classifyError(result.reason),\n };\n });\n }\n}\n"],
+
"mappings": "AAMA,SAAS,UAAmB,mBAAmB;AAC/C;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,OAEK;AACP,SAAS,wBAAwB;AACjC,SAAS,mBAAmB;AAC5B,SAAS,cAAc;AACvB,SAAS,mCAAmC;AAI5C,MAAM,eAAe,CAAC,SAAS,cAAc,gBAAgB;AAG7D,MAAM,eAAuC,EAAE,OAAO,GAAG,YAAY,GAAG,gBAAgB,EAAE;AAC1F,MAAM,oBAAoB;AAC1B,MAAM,cAAc;AAEpB,MAAM,8BAA8B;AA2BpC,MAAM,yBAAyB,oBAAI,IAAI,CAAC,KAAK,KAAK,KAAK,KAAK,GAAG,CAAC;AAEhE,MAAM,0BAA0B,oBAAI,IAAI,CAAC,KAAK,KAAK,GAAG,CAAC;AAGvD,MAAM,4BAA4B;AASlC,MAAM,oBAAgD;AAAA,EACpD,EAAE,MAAM,SAAS,aAAa,aAAa;AAAA,EAC3C,EAAE,MAAM,cAAc,aAAa,uBAAuB;AAAA,EAC1D,EAAE,MAAM,cAAc,SAAS,MAAM,aAAa,gCAAgC;AACpF;AAEO,MAAM,cAAc;AAAA,EACjB;AAAA,EACA,UAAU;AAAA,EAElB,YAAY,QAAiB;AAC3B,UAAM,MAAM,SAAS;AACrB,SAAK,SAAS,UAAU,IAAI;AAE5B,QAAI,CAAC,KAAK,QAAQ;AAChB,YAAM,IAAI,MAAM,wFAAwF;AAAA,IAC1G;AAAA,EACF;AAAA;AAAA;AAAA;AAAA;AAAA,EAMA,MAAM,OAAO,SAAwB,aAAa,aAAsC;AACtF,UAAM,EAAE,KAAK,OAAO,SAAS,UAAU,IAAI,QAAQ,IAAI;AACvD,UAAM,UAAU,aAAa,IAAI,KAAK;AAGtC,QAAI;AACF,UAAI,IAAI,GAAG;AAAA,IACb,QAAQ;AACN,aAAO;AAAA,QACL,SAAS,gBAAgB,GAAG;AAAA,QAC5B,YAAY;AAAA,QACZ,SAAS;AAAA,QACT,OAAO,EAAE,MAAM,UAAU,eAAe,SAAS,gBAAgB,GAAG,IAAI,WAAW,MAAM;AAAA,MAC3F;AAAA,IACF;AAEA,UAAM,SAAS,IAAI,gBAAgB;AAAA,MACjC;AAAA,MACA,OAAO,KAAK;AAAA,MACZ,SAAS,OAAO,UAAU,GAAI;AAAA,IAChC,CAAC;AAED,QAAI,SAAS,cAAc;AACzB,aAAO,OAAO,UAAU,MAAM;AAAA,IAChC;AAEA,QAAI,SAAS;AACX,aAAO,OAAO,WAAW,QAAQ,YAAY,CAAC;AAAA,IAChD;AAEA,UAAM,SAAS,GAAG,KAAK,OAAO,IAAI,OAAO,SAAS,CAAC;AACnD,QAAI;AAEJ,aAAS,UAAU,GAAG,UAAU,YAAY,WAAW;AACrD,UAAI;AAEF,cAAM,aAAa,UAAU,KAAK;AAClC,cAAM,WAAW,MAAM,iBAAiB,QAAQ;AAAA,UAC9C,QAAQ;AAAA,UACR,SAAS,EAAE,QAAQ,6BAA6B;AAAA,UAChD;AAAA,QACF,CAAC;AAGD,YAAI;AACJ,YAAI;AACF,oBAAU,MAAM,SAAS,KAAK;AAAA,QAChC,SAAS,WAAW;AAClB,oBAAU,4BAA4B,qBAAqB,QAAQ,UAAU,UAAU,OAAO,SAAS,CAAC;AAAA,QAC1G;AAGA,YAAI,SAAS,IAAI;AAOf,gBAAM,cAAc,SAAS,QAAQ,IAAI,cAAc;AACvD,cAAI,4BAA4B,WAAW,GAAG;AAC5C;AAAA,cACE;AAAA,cACA,+BAA+B,GAAG,mBAAmB,WAAW;AAAA,cAChE;AAAA,YACF;AACA,mBAAO;AAAA,cACL,SAAS,oBAAoB,eAAe,SAAS;AAAA,cACrD,YAAY;AAAA,cACZ,SAAS;AAAA,cACT,SAAS,OAAO,YAAY,SAAS,QAAQ,QAAQ,CAAC;AAAA,cACtD,OAAO;AAAA,gBACL,MAAM,UAAU;AAAA,gBAChB,SAAS,2BAA2B,eAAe,0BAA0B;AAAA,gBAC7E,WAAW;AAAA,gBACX,YAAY;AAAA,cACd;AAAA,YACF;AAAA,UACF;AAEA,iBAAO;AAAA,YACL;AAAA,YACA,YAAY,SAAS;AAAA,YACrB;AAAA,YACA,SAAS,OAAO,YAAY,SAAS,QAAQ,QAAQ,CAAC;AAAA,UACxD;AAAA,QACF;AAGA,YAAI,SAAS,WAAW,KAAK;AAC3B,iBAAO;AAAA,YACL,SAAS;AAAA,YACT,YAAY;AAAA,YACZ;AAAA,UACF;AAAA,QACF;AAGA,YAAI,wBAAwB,IAAI,SAAS,MAAM,GAAG;AAChD,gBAAM,WAAW,SAAS,WAAW,MACjC,mDACA,8BAA8B,SAAS,MAAM;AACjD,iBAAO;AAAA,YACL,SAAS,UAAU,QAAQ;AAAA,YAC3B,YAAY,SAAS;AAAA,YACrB,SAAS;AAAA,YACT,OAAO;AAAA,cACL,MAAM,SAAS,WAAW,MAAM,UAAU,aAAa,UAAU;AAAA,cACjE,SAAS;AAAA,cACT,WAAW;AAAA,cACX,YAAY,SAAS;AAAA,YACvB;AAAA,UACF;AAAA,QACF;AAGA,YAAI,uBAAuB,IAAI,SAAS,MAAM,GAAG;AAC/C,sBAAY;AAAA,YACV,MAAM,SAAS,WAAW,MAAM,UAAU,eAAe,UAAU;AAAA,YACnE,SAAS,mBAAmB,SAAS,MAAM;AAAA,YAC3C,WAAW;AAAA,YACX,YAAY,SAAS;AAAA,UACvB;AAEA,cAAI,UAAU,aAAa,GAAG;AAC5B,kBAAM,UAAU,iBAAiB,OAAO;AACxC,mBAAO,WAAW,GAAG,SAAS,MAAM,eAAe,UAAU,CAAC,IAAI,UAAU,iBAAiB,OAAO,MAAM,SAAS;AACnH,kBAAM,MAAM,OAAO;AACnB;AAAA,UACF;AAAA,QACF;AAGA,oBAAY,cAAc,EAAE,QAAQ,SAAS,QAAQ,SAAS,QAAQ,CAAC;AACvE,YAAI,UAAU,aAAa,KAAK,UAAU,WAAW;AACnD,gBAAM,UAAU,iBAAiB,OAAO;AACxC,iBAAO,WAAW,UAAU,SAAS,MAAM,iBAAiB,OAAO,MAAM,SAAS;AAClF,gBAAM,MAAM,OAAO;AACnB;AAAA,QACF;AAGA,eAAO;AAAA,UACL,SAAS,UAAU,UAAU,OAAO;AAAA,UACpC,YAAY,SAAS;AAAA,UACrB,SAAS;AAAA,UACT,OAAO;AAAA,QACT;AAAA,MAEF,SAAS,OAAO;AACd,oBAAY,cAAc,KAAK;AAG/B,YAAI,CAAC,UAAU,WAAW;AACxB,iBAAO;AAAA,YACL,SAAS,UAAU,UAAU,OAAO;AAAA,YACpC,YAAY,UAAU,cAAc;AAAA,YACpC,SAAS;AAAA,YACT,OAAO;AAAA,UACT;AAAA,QACF;AAGA,YAAI,UAAU,aAAa,GAAG;AAC5B,gBAAM,UAAU,iBAAiB,OAAO;AACxC,i
BAAO,WAAW,GAAG,UAAU,IAAI,KAAK,UAAU,OAAO,WAAW,UAAU,CAAC,IAAI,UAAU,OAAO,OAAO,MAAM,SAAS;AAC1H,gBAAM,MAAM,OAAO;AACnB;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAGA,WAAO;AAAA,MACL,SAAS,uBAAuB,UAAU,cAAc,WAAW,WAAW,eAAe;AAAA,MAC7F,YAAY,WAAW,cAAc;AAAA,MACrC,SAAS;AAAA,MACT,OAAO,aAAa,EAAE,MAAM,UAAU,eAAe,SAAS,yBAAyB,WAAW,MAAM;AAAA,IAC1G;AAAA,EACF;AAAA;AAAA;AAAA;AAAA;AAAA,EAMA,MAAM,mBAAmB,KAAa,UAAgC,CAAC,GAA4B;AACjG,UAAM,iBAA2B,CAAC;AAClC,QAAI,aAAoC;AACxC,UAAM,WAAW,KAAK,IAAI,IAAI;AAE9B,eAAW,WAAW,mBAAmB;AAEvC,UAAI,KAAK,IAAI,KAAK,UAAU;AAC1B,eAAO,WAAW,wCAAwC,GAAG,UAAU,eAAe,MAAM,eAAe,SAAS;AACpH;AAAA,MACF;AAEA,YAAM,SAAS,MAAM,KAAK,mBAAmB,KAAK,SAAS,OAAO;AAElE,UAAI,OAAO,MAAM;AACf,YAAI,eAAe,SAAS,GAAG;AAC7B,iBAAO,QAAQ,gBAAgB,QAAQ,WAAW,UAAU,eAAe,MAAM,gBAAgB,SAAS;AAAA,QAC5G;AACA,eAAO,OAAO;AAAA,MAChB;AAEA,mBAAa,OAAO;AACpB,qBAAe,KAAK,GAAG,QAAQ,WAAW,KAAK,OAAO,SAAS,OAAO,WAAW,OAAO,SAAS,UAAU,EAAE;AAC7G,aAAO,WAAW,eAAe,QAAQ,WAAW,KAAK,OAAO,SAAS,UAAU,8BAA8B,SAAS;AAAA,IAC5H;AAGA,UAAM,eAAe,gBAAgB,eAAe,MAAM,yBAAyB,eAAe,KAAK,IAAI,CAAC;AAC5G,WAAO;AAAA,MACL,SAAS,UAAU,YAAY;AAAA,MAC/B,YAAY,YAAY,cAAc;AAAA,MACtC,SAAS;AAAA,MACT,OAAO;AAAA,QACL,MAAM,UAAU;AAAA,QAChB,SAAS;AAAA,QACT,WAAW;AAAA,MACb;AAAA,IACF;AAAA,EACF;AAAA;AAAA;AAAA;AAAA;AAAA,EAMA,MAAc,mBACZ,KACA,SACA,SACsD;AACtD,UAAM,SAAS,MAAM,KAAK,OAAO;AAAA,MAC/B;AAAA,MACA,MAAM,QAAQ;AAAA,MACd,SAAS,QAAQ;AAAA,MACjB,SAAS,QAAQ;AAAA,IACnB,CAAC;AAGD,QAAI,OAAO,cAAc,OAAO,OAAO,aAAa,OAAO,CAAC,OAAO,OAAO;AACxE,YAAM,iBAAiB,OAAO,QAAQ,QAAQ,YAAY,EAAE,EAAE,KAAK,EAAE;AACrE,UAAI,iBAAiB,6BAA6B,QAAQ,SAAS,SAAS;AAC1E,eAAO,QAAQ,4BAA4B,cAAc,sBAAsB,GAAG,+BAA0B,SAAS;AACrH,eAAO,EAAE,MAAM,OAAO,UAAU,OAAO;AAAA,MACzC;AACA,aAAO,EAAE,MAAM,MAAM,UAAU,OAAO;AAAA,IACxC;AAGA,QAAI,OAAO,eAAe,KAAK;AAC7B,aAAO,EAAE,MAAM,MAAM,UAAU,OAAO;AAAA,IACxC;AAIA,QAAI,OAAO,eAAe,KAAK;AAC7B,aAAO,WAAW,uBAAuB,GAAG,yDAAoD,SAAS;AACzG,aAAO,EAAE,MAAM,MAAM,UAAU;AAAA,QAC7B,GAAG;AAAA,QACH,OAAO;AAAA,UACL,MAAM,UAAU;AAAA,UAChB,SAAS;AAAA,UACT,WAAW;AAAA,QACb;AAAA,MACF,EAAC;AAAA,IACH;AAGA,QAAI,OAAO,SAAS,CAAC,OAAO,MAAM,WAAW;AAC3C,aAAO,SAAS,4BAA4B,QAAQ,WAAW,KAAK,OAAO,MAAM,OAAO,IAAI,SAAS;AACrG,aAAO,EAAE,MAAM,MAAM,UAAU,OAAO;AAAA,IACxC;AAEA,WAAO,EAAE,MAAM,OAAO,UAAU,OAAO;AAAA,EACzC;AAAA;AAAA;AAAA;AAAA;AAAA,EAMA,MAAM,eAAe,MAAgB,UAAgC,CAAC,GAAqD;AACzH,QAAI,KAAK,WAAW,GAAG;AACrB,aAAO,CAAC;AAAA,IACV;AAEA,QAAI,KAAK,UAAU,mBAAmB;AACpC,aAAO,KAAK,aAAa,MAAM,OAAO;AAAA,IACxC;AAEA,UAAM,SAAS,MAAM,KAAK,YAAY,MAAM,OAAO;AACnD,WAAO,OAAO;AAAA,EAChB;AAAA;AAAA;AAAA;AAAA;AAAA,EAMA,MAAM,YACJ,MACA,UAAgC,CAAC,GACjC,iBAC4B;AAC5B,UAAM,eAAe,KAAK,KAAK,KAAK,SAAS,iBAAiB;AAC9D,UAAM,aAAsD,CAAC;AAC7D,QAAI,gBAAgB;AAEpB,WAAO,QAAQ,8BAA8B,KAAK,MAAM,YAAY,YAAY,cAAc,SAAS;AAEvG,aAAS,WAAW,GAAG,WAAW,cAAc,YAAY;AAC1D,YAAM,WAAW,WAAW;AAC5B,YAAM,SAAS,KAAK,IAAI,WAAW,mBAAmB,KAAK,MAAM;AACjE,YAAM,YAAY,KAAK,MAAM,UAAU,MAAM;AAE7C,aAAO,QAAQ,oBAAoB,WAAW,CAAC,IAAI,YAAY,KAAK,UAAU,MAAM,UAAU,SAAS;AAEvG,YAAM,eAAe,MAAM;AAAA,QACzB;AAAA,QACA,SAAO,KAAK,mBAAmB,KAAK,OAAO;AAAA,QAC3C,YAAY;AAAA,MACd;AAEA,eAAS,IAAI,GAAG,IAAI,aAAa,QAAQ,KAAK;AAC5C,cAAM,SAAS,aAAa,CAAC;AAC7B,YAAI,CAAC,OAAQ;AACb,cAAM,MAAM,UAAU,CAAC,KAAK;AAE5B,YAAI,OAAO,WAAW,aAAa;AACjC,gBAAM,eAAe,OAAO;AAC5B,qBAAW,KAAK,EAAE,GAAG,cAAc,IAAI,CAAC;AAGxC,cAAI,aAAa,OAAO,SAAS,UAAU,cAAc;AACvD;AAAA,UACF;AAAA,QACF,OAAO;AAGL,gBAAM,WAAW,OAAO,kBAAkB,QAAQ,OAAO,OAAO,UAAU,OAAO,OAAO,MAAM;AAC9F,iBAAO,SAAS,4BAA4B,GAAG,KAAK,QAAQ,IAAI,SAAS;AAEzE,qBAAW,KAAK;AAAA,YACd;AAAA,YACA,SAAS,+BAA+B,QAAQ;AAAA,YAChD,YAAY;AAAA,YACZ,SAAS;AAAA,YACT,OAAO,cAAc,OAAO,MAAM;AAAA,UACpC,CAAC;AAAA,QACH;AAAA,MACF;AAGA,UAAI;AACF,0BAAkB,WAAW,GAAG,cAAc,WAAW,MAAM;AAAA,MACjE,SAAS,eAAe;AACtB,eAAO,SAAS,mCAAmC,aAAa,IAAI,SAAS;AAAA,MAC/E;AAEA,aAAO,QA
AQ,mBAAmB,WAAW,CAAC,IAAI,YAAY,KAAK,WAAW,MAAM,IAAI,KAAK,MAAM,WAAW,SAAS;AAGvH,UAAI,WAAW,eAAe,GAAG;AAC/B,cAAM,aAAa,gBAAgB,IAAI,MAAO;AAC9C,cAAM,MAAM,UAAU;AAAA,MACxB;AAAA,IACF;AAEA,WAAO,EAAE,SAAS,YAAY,kBAAkB,cAAc,gBAAgB,KAAK,QAAQ,cAAc;AAAA,EAC3G;AAAA;AAAA;AAAA;AAAA;AAAA,EAMA,MAAc,aAAa,MAAgB,SAAiF;AAC1H,UAAM,UAAU,MAAM,YAAY,MAAM,SAAO,KAAK,mBAAmB,KAAK,OAAO,GAAG,YAAY,OAAO;AAEzG,WAAO,QAAQ,IAAI,CAAC,QAAQ,UAAU;AACpC,YAAM,MAAM,KAAK,KAAK,KAAK;AAE3B,UAAI,OAAO,WAAW,aAAa;AACjC,eAAO,EAAE,GAAG,OAAO,OAAO,IAAI;AAAA,MAChC;AAGA,aAAO;AAAA,QACL;AAAA,QACA,SAAS,UAAU,OAAO,kBAAkB,QAAQ,OAAO,OAAO,UAAU,OAAO,OAAO,MAAM,CAAC;AAAA,QACjG,YAAY;AAAA,QACZ,SAAS;AAAA,QACT,OAAO,cAAc,OAAO,MAAM;AAAA,MACpC;AAAA,IACF,CAAC;AAAA,EACH;AACF;",
   "names": []
 }
package/dist/src/config/index.js
CHANGED
|
@@ -27,7 +27,8 @@ function parseEnv() {
     SCRAPER_API_KEY: process.env.SCRAPEDO_API_KEY || "",
     SEARCH_API_KEY: process.env.SERPER_API_KEY || void 0,
     REDDIT_CLIENT_ID: process.env.REDDIT_CLIENT_ID || void 0,
-    REDDIT_CLIENT_SECRET: process.env.REDDIT_CLIENT_SECRET || void 0
+    REDDIT_CLIENT_SECRET: process.env.REDDIT_CLIENT_SECRET || void 0,
+    JINA_API_KEY: process.env.JINA_API_KEY || void 0
   };
   return cachedEnv;
 }
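Note: the new JINA_API_KEY entry is optional. The embedded TypeScript source in the map hunk below types it as string | undefined, so a missing key simply leaves the field unset, unlike SCRAPER_API_KEY, which falls back to an empty string. As a minimal, purely illustrative sketch of how a caller can observe the new entry (the relative import mirrors the style used inside dist; the logging is not part of the package):

import { parseEnv } from "../config/index.js";

const env = parseEnv();
// env.JINA_API_KEY is either the configured key or undefined.
console.log(env.JINA_API_KEY ? "JINA_API_KEY configured" : "JINA_API_KEY not set");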
package/dist/src/config/index.js.map
CHANGED
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"version": 3,
|
|
3
3
|
"sources": ["../../../src/config/index.ts"],
|
|
4
|
-
"sourcesContent": ["/**\n * Consolidated configuration\n * All environment variables, constants, and LLM config in one place\n */\n\nimport { Logger } from 'mcp-use';\n\nimport { VERSION, PACKAGE_NAME, PACKAGE_DESCRIPTION } from '../version.js';\n\n// ============================================================================\n// Safe Integer Parsing Helper\n// ============================================================================\n\n/**\n * Safely parse an integer from environment variable with bounds checking\n */\nfunction safeParseInt(\n value: string | undefined,\n defaultVal: number,\n min: number,\n max: number\n): number {\n const logger = Logger.get('config');\n\n if (!value) {\n return defaultVal;\n }\n\n const parsed = parseInt(value, 10);\n\n if (isNaN(parsed)) {\n logger.warn(`Invalid number \"${value}\", using default ${defaultVal}`);\n return defaultVal;\n }\n\n if (parsed < min) {\n logger.warn(`Value ${parsed} below minimum ${min}, clamping to ${min}`);\n return min;\n }\n\n if (parsed > max) {\n logger.warn(`Value ${parsed} above maximum ${max}, clamping to ${max}`);\n return max;\n }\n\n return parsed;\n}\n\n\n// ============================================================================\n// Environment Parsing\n// ============================================================================\n\ninterface EnvConfig {\n SCRAPER_API_KEY: string;\n SEARCH_API_KEY: string | undefined;\n REDDIT_CLIENT_ID: string | undefined;\n REDDIT_CLIENT_SECRET: string | undefined;\n}\n\nlet cachedEnv: EnvConfig | null = null;\n\nexport function parseEnv(): EnvConfig {\n if (cachedEnv) return cachedEnv;\n cachedEnv = {\n SCRAPER_API_KEY: process.env.SCRAPEDO_API_KEY || '',\n SEARCH_API_KEY: process.env.SERPER_API_KEY || undefined,\n REDDIT_CLIENT_ID: process.env.REDDIT_CLIENT_ID || undefined,\n REDDIT_CLIENT_SECRET: process.env.REDDIT_CLIENT_SECRET || undefined,\n };\n return cachedEnv;\n}\n\n// ============================================================================\n// MCP Server Configuration\n// ============================================================================\n\nexport const SERVER = {\n NAME: PACKAGE_NAME,\n VERSION: VERSION,\n DESCRIPTION: PACKAGE_DESCRIPTION,\n} as const;\n\n// ============================================================================\n// Capability Detection (which features are available based on ENV)\n// ============================================================================\n\nexport interface Capabilities {\n reddit: boolean; // REDDIT_CLIENT_ID + REDDIT_CLIENT_SECRET\n search: boolean; // SERPER_API_KEY\n scraping: boolean; // SCRAPEDO_API_KEY\n llmExtraction: boolean; // LLM_API_KEY\n}\n\nexport function getCapabilities(): Capabilities {\n const env = parseEnv();\n return {\n reddit: !!(env.REDDIT_CLIENT_ID && env.REDDIT_CLIENT_SECRET),\n search: !!env.SEARCH_API_KEY,\n scraping: !!env.SCRAPER_API_KEY,\n llmExtraction: !!LLM_EXTRACTION.API_KEY,\n };\n}\n\nexport function getMissingEnvMessage(capability: keyof Capabilities): string {\n const messages: Record<keyof Capabilities, string> = {\n reddit: '\u274C **Reddit tools unavailable.** Set `REDDIT_CLIENT_ID` and `REDDIT_CLIENT_SECRET` to enable `get-reddit-post`.\\n\\n\uD83D\uDC49 Create a Reddit app at: https://www.reddit.com/prefs/apps (select \"script\" type)',\n search: '\u274C **Search unavailable.** Set `SERPER_API_KEY` to enable `web-search` (including `scope: \"reddit\"`).\\n\\n\uD83D\uDC49 Get your free API key at: https://serper.dev (2,500 free queries)',\n scraping: 
'\u274C **Web scraping unavailable.** Set `SCRAPEDO_API_KEY` to enable `scrape-links`.\\n\\n\uD83D\uDC49 Sign up at: https://scrape.do (1,000 free credits)',\n llmExtraction: '\u26A0\uFE0F **AI extraction disabled.** Set `LLM_API_KEY`, `LLM_BASE_URL`, and `LLM_MODEL` to enable AI-powered content extraction and search classification.\\n\\nScraping will work but without intelligent content filtering.',\n };\n return messages[capability];\n}\n\n// ============================================================================\n// Concurrency Limits\n// ============================================================================\n\nexport const CONCURRENCY = {\n SEARCH: safeParseInt(process.env.CONCURRENCY_SEARCH, 50, 1, 200),\n SCRAPER: safeParseInt(process.env.CONCURRENCY_SCRAPER, 50, 1, 200),\n REDDIT: safeParseInt(process.env.CONCURRENCY_REDDIT, 50, 1, 200),\n LLM_EXTRACTION: safeParseInt(process.env.LLM_CONCURRENCY, 50, 1, 200),\n} as const;\n\nexport const SCRAPER = {\n BATCH_SIZE: 30,\n EXTRACTION_PREFIX: 'Extract from document only \u2014 never hallucinate or add external knowledge.',\n EXTRACTION_SUFFIX: 'First line = content, not preamble. No confirmation messages.',\n} as const;\n\n// ============================================================================\n// Reddit Configuration\n// ============================================================================\n\nexport const REDDIT = {\n BATCH_SIZE: 10,\n MAX_WORDS_PER_POST: 50_000,\n MAX_WORDS_TOTAL: 500_000,\n MIN_POSTS: 1,\n MAX_POSTS: 50,\n RETRY_COUNT: 5,\n RETRY_DELAYS: [2000, 4000, 8000, 16000, 32000] as const,\n} as const;\n\n// ============================================================================\n// CTR Weights for URL Ranking (inspired from CTR research)\n// ============================================================================\n\nexport const CTR_WEIGHTS: Record<number, number> = {\n 1: 100.00,\n 2: 60.00,\n 3: 48.89,\n 4: 33.33,\n 5: 28.89,\n 6: 26.44,\n 7: 24.44,\n 8: 17.78,\n 9: 13.33,\n 10: 12.56,\n} as const;\n\n// ============================================================================\n// LLM Configuration\n//\n// Required vars (all must be set together when LLM is enabled):\n// LLM_API_KEY \u2014 API key for the OpenAI-compatible endpoint\n// LLM_BASE_URL \u2014 endpoint base URL (e.g. https://server.up.railway.app/v1)\n// LLM_MODEL \u2014 primary model (e.g. gpt-5.4-mini)\n//\n// Optional:\n// LLM_FALLBACK_MODEL \u2014 model to use after primary exhausts all retries (e.g. gpt-5.4)\n// LLM_CONCURRENCY \u2014 parallel LLM calls (default: 50)\n//\n// Reasoning effort is always 'low' \u2014 not configurable.\n// ============================================================================\n\ninterface LlmExtractionConfig {\n readonly MODEL: string;\n readonly FALLBACK_MODEL: string;\n readonly BASE_URL: string;\n readonly API_KEY: string;\n}\n\nlet cachedLlmExtraction: LlmExtractionConfig | null = null;\n\nfunction getLlmExtraction(): LlmExtractionConfig {\n if (cachedLlmExtraction) return cachedLlmExtraction;\n\n const apiKey = process.env.LLM_API_KEY?.trim() || '';\n const baseUrl = process.env.LLM_BASE_URL?.trim();\n const model = process.env.LLM_MODEL?.trim();\n const fallbackModel = process.env.LLM_FALLBACK_MODEL?.trim() || '';\n\n if (apiKey && !baseUrl) {\n throw new Error(\n 'LLM_BASE_URL is required when LLM_API_KEY is set. 
' +\n 'Set LLM_BASE_URL to your OpenAI-compatible endpoint.',\n );\n }\n if (apiKey && !model) {\n throw new Error(\n 'LLM_MODEL is required when LLM_API_KEY is set.',\n );\n }\n\n cachedLlmExtraction = {\n API_KEY: apiKey,\n BASE_URL: baseUrl || '',\n MODEL: model || '',\n FALLBACK_MODEL: fallbackModel,\n };\n return cachedLlmExtraction;\n}\n\nexport const LLM_EXTRACTION: LlmExtractionConfig = new Proxy({} as LlmExtractionConfig, {\n get(_target, prop: string) {\n return getLlmExtraction()[prop as keyof LlmExtractionConfig];\n },\n});\n"],
|
|
5
|
-
"mappings": "AAKA,SAAS,cAAc;AAEvB,SAAS,SAAS,cAAc,2BAA2B;AAS3D,SAAS,aACP,OACA,YACA,KACA,KACQ;AACR,QAAM,SAAS,OAAO,IAAI,QAAQ;AAElC,MAAI,CAAC,OAAO;AACV,WAAO;AAAA,EACT;AAEA,QAAM,SAAS,SAAS,OAAO,EAAE;AAEjC,MAAI,MAAM,MAAM,GAAG;AACjB,WAAO,KAAK,mBAAmB,KAAK,oBAAoB,UAAU,EAAE;AACpE,WAAO;AAAA,EACT;AAEA,MAAI,SAAS,KAAK;AAChB,WAAO,KAAK,SAAS,MAAM,kBAAkB,GAAG,iBAAiB,GAAG,EAAE;AACtE,WAAO;AAAA,EACT;AAEA,MAAI,SAAS,KAAK;AAChB,WAAO,KAAK,SAAS,MAAM,kBAAkB,GAAG,iBAAiB,GAAG,EAAE;AACtE,WAAO;AAAA,EACT;AAEA,SAAO;AACT;
|
|
4
|
+
"sourcesContent": ["/**\n * Consolidated configuration\n * All environment variables, constants, and LLM config in one place\n */\n\nimport { Logger } from 'mcp-use';\n\nimport { VERSION, PACKAGE_NAME, PACKAGE_DESCRIPTION } from '../version.js';\n\n// ============================================================================\n// Safe Integer Parsing Helper\n// ============================================================================\n\n/**\n * Safely parse an integer from environment variable with bounds checking\n */\nfunction safeParseInt(\n value: string | undefined,\n defaultVal: number,\n min: number,\n max: number\n): number {\n const logger = Logger.get('config');\n\n if (!value) {\n return defaultVal;\n }\n\n const parsed = parseInt(value, 10);\n\n if (isNaN(parsed)) {\n logger.warn(`Invalid number \"${value}\", using default ${defaultVal}`);\n return defaultVal;\n }\n\n if (parsed < min) {\n logger.warn(`Value ${parsed} below minimum ${min}, clamping to ${min}`);\n return min;\n }\n\n if (parsed > max) {\n logger.warn(`Value ${parsed} above maximum ${max}, clamping to ${max}`);\n return max;\n }\n\n return parsed;\n}\n\n\n// ============================================================================\n// Environment Parsing\n// ============================================================================\n\ninterface EnvConfig {\n SCRAPER_API_KEY: string;\n SEARCH_API_KEY: string | undefined;\n REDDIT_CLIENT_ID: string | undefined;\n REDDIT_CLIENT_SECRET: string | undefined;\n JINA_API_KEY: string | undefined;\n}\n\nlet cachedEnv: EnvConfig | null = null;\n\nexport function parseEnv(): EnvConfig {\n if (cachedEnv) return cachedEnv;\n cachedEnv = {\n SCRAPER_API_KEY: process.env.SCRAPEDO_API_KEY || '',\n SEARCH_API_KEY: process.env.SERPER_API_KEY || undefined,\n REDDIT_CLIENT_ID: process.env.REDDIT_CLIENT_ID || undefined,\n REDDIT_CLIENT_SECRET: process.env.REDDIT_CLIENT_SECRET || undefined,\n JINA_API_KEY: process.env.JINA_API_KEY || undefined,\n };\n return cachedEnv;\n}\n\n// ============================================================================\n// MCP Server Configuration\n// ============================================================================\n\nexport const SERVER = {\n NAME: PACKAGE_NAME,\n VERSION: VERSION,\n DESCRIPTION: PACKAGE_DESCRIPTION,\n} as const;\n\n// ============================================================================\n// Capability Detection (which features are available based on ENV)\n// ============================================================================\n\nexport interface Capabilities {\n reddit: boolean; // REDDIT_CLIENT_ID + REDDIT_CLIENT_SECRET\n search: boolean; // SERPER_API_KEY\n scraping: boolean; // SCRAPEDO_API_KEY\n llmExtraction: boolean; // LLM_API_KEY\n}\n\nexport function getCapabilities(): Capabilities {\n const env = parseEnv();\n return {\n reddit: !!(env.REDDIT_CLIENT_ID && env.REDDIT_CLIENT_SECRET),\n search: !!env.SEARCH_API_KEY,\n scraping: !!env.SCRAPER_API_KEY,\n llmExtraction: !!LLM_EXTRACTION.API_KEY,\n };\n}\n\nexport function getMissingEnvMessage(capability: keyof Capabilities): string {\n const messages: Record<keyof Capabilities, string> = {\n reddit: '\u274C **Reddit tools unavailable.** Set `REDDIT_CLIENT_ID` and `REDDIT_CLIENT_SECRET` to enable `get-reddit-post`.\\n\\n\uD83D\uDC49 Create a Reddit app at: https://www.reddit.com/prefs/apps (select \"script\" type)',\n search: '\u274C **Search unavailable.** Set `SERPER_API_KEY` to enable `web-search` (including `scope: 
\"reddit\"`).\\n\\n\uD83D\uDC49 Get your free API key at: https://serper.dev (2,500 free queries)',\n scraping: '\u274C **Web scraping unavailable.** Set `SCRAPEDO_API_KEY` to enable `scrape-links`.\\n\\n\uD83D\uDC49 Sign up at: https://scrape.do (1,000 free credits)',\n llmExtraction: '\u26A0\uFE0F **AI extraction disabled.** Set `LLM_API_KEY`, `LLM_BASE_URL`, and `LLM_MODEL` to enable AI-powered content extraction and search classification.\\n\\nScraping will work but without intelligent content filtering.',\n };\n return messages[capability];\n}\n\n// ============================================================================\n// Concurrency Limits\n// ============================================================================\n\nexport const CONCURRENCY = {\n SEARCH: safeParseInt(process.env.CONCURRENCY_SEARCH, 50, 1, 200),\n SCRAPER: safeParseInt(process.env.CONCURRENCY_SCRAPER, 50, 1, 200),\n REDDIT: safeParseInt(process.env.CONCURRENCY_REDDIT, 50, 1, 200),\n LLM_EXTRACTION: safeParseInt(process.env.LLM_CONCURRENCY, 50, 1, 200),\n} as const;\n\nexport const SCRAPER = {\n BATCH_SIZE: 30,\n EXTRACTION_PREFIX: 'Extract from document only \u2014 never hallucinate or add external knowledge.',\n EXTRACTION_SUFFIX: 'First line = content, not preamble. No confirmation messages.',\n} as const;\n\n// ============================================================================\n// Reddit Configuration\n// ============================================================================\n\nexport const REDDIT = {\n BATCH_SIZE: 10,\n MAX_WORDS_PER_POST: 50_000,\n MAX_WORDS_TOTAL: 500_000,\n MIN_POSTS: 1,\n MAX_POSTS: 50,\n RETRY_COUNT: 5,\n RETRY_DELAYS: [2000, 4000, 8000, 16000, 32000] as const,\n} as const;\n\n// ============================================================================\n// CTR Weights for URL Ranking (inspired from CTR research)\n// ============================================================================\n\nexport const CTR_WEIGHTS: Record<number, number> = {\n 1: 100.00,\n 2: 60.00,\n 3: 48.89,\n 4: 33.33,\n 5: 28.89,\n 6: 26.44,\n 7: 24.44,\n 8: 17.78,\n 9: 13.33,\n 10: 12.56,\n} as const;\n\n// ============================================================================\n// LLM Configuration\n//\n// Required vars (all must be set together when LLM is enabled):\n// LLM_API_KEY \u2014 API key for the OpenAI-compatible endpoint\n// LLM_BASE_URL \u2014 endpoint base URL (e.g. https://server.up.railway.app/v1)\n// LLM_MODEL \u2014 primary model (e.g. gpt-5.4-mini)\n//\n// Optional:\n// LLM_FALLBACK_MODEL \u2014 model to use after primary exhausts all retries (e.g. gpt-5.4)\n// LLM_CONCURRENCY \u2014 parallel LLM calls (default: 50)\n//\n// Reasoning effort is always 'low' \u2014 not configurable.\n// ============================================================================\n\ninterface LlmExtractionConfig {\n readonly MODEL: string;\n readonly FALLBACK_MODEL: string;\n readonly BASE_URL: string;\n readonly API_KEY: string;\n}\n\nlet cachedLlmExtraction: LlmExtractionConfig | null = null;\n\nfunction getLlmExtraction(): LlmExtractionConfig {\n if (cachedLlmExtraction) return cachedLlmExtraction;\n\n const apiKey = process.env.LLM_API_KEY?.trim() || '';\n const baseUrl = process.env.LLM_BASE_URL?.trim();\n const model = process.env.LLM_MODEL?.trim();\n const fallbackModel = process.env.LLM_FALLBACK_MODEL?.trim() || '';\n\n if (apiKey && !baseUrl) {\n throw new Error(\n 'LLM_BASE_URL is required when LLM_API_KEY is set. 
' +\n 'Set LLM_BASE_URL to your OpenAI-compatible endpoint.',\n );\n }\n if (apiKey && !model) {\n throw new Error(\n 'LLM_MODEL is required when LLM_API_KEY is set.',\n );\n }\n\n cachedLlmExtraction = {\n API_KEY: apiKey,\n BASE_URL: baseUrl || '',\n MODEL: model || '',\n FALLBACK_MODEL: fallbackModel,\n };\n return cachedLlmExtraction;\n}\n\nexport const LLM_EXTRACTION: LlmExtractionConfig = new Proxy({} as LlmExtractionConfig, {\n get(_target, prop: string) {\n return getLlmExtraction()[prop as keyof LlmExtractionConfig];\n },\n});\n"],
|
|
5
|
+
"mappings": "AAKA,SAAS,cAAc;AAEvB,SAAS,SAAS,cAAc,2BAA2B;AAS3D,SAAS,aACP,OACA,YACA,KACA,KACQ;AACR,QAAM,SAAS,OAAO,IAAI,QAAQ;AAElC,MAAI,CAAC,OAAO;AACV,WAAO;AAAA,EACT;AAEA,QAAM,SAAS,SAAS,OAAO,EAAE;AAEjC,MAAI,MAAM,MAAM,GAAG;AACjB,WAAO,KAAK,mBAAmB,KAAK,oBAAoB,UAAU,EAAE;AACpE,WAAO;AAAA,EACT;AAEA,MAAI,SAAS,KAAK;AAChB,WAAO,KAAK,SAAS,MAAM,kBAAkB,GAAG,iBAAiB,GAAG,EAAE;AACtE,WAAO;AAAA,EACT;AAEA,MAAI,SAAS,KAAK;AAChB,WAAO,KAAK,SAAS,MAAM,kBAAkB,GAAG,iBAAiB,GAAG,EAAE;AACtE,WAAO;AAAA,EACT;AAEA,SAAO;AACT;AAeA,IAAI,YAA8B;AAE3B,SAAS,WAAsB;AACpC,MAAI,UAAW,QAAO;AACtB,cAAY;AAAA,IACV,iBAAiB,QAAQ,IAAI,oBAAoB;AAAA,IACjD,gBAAgB,QAAQ,IAAI,kBAAkB;AAAA,IAC9C,kBAAkB,QAAQ,IAAI,oBAAoB;AAAA,IAClD,sBAAsB,QAAQ,IAAI,wBAAwB;AAAA,IAC1D,cAAc,QAAQ,IAAI,gBAAgB;AAAA,EAC5C;AACA,SAAO;AACT;AAMO,MAAM,SAAS;AAAA,EACpB,MAAM;AAAA,EACN;AAAA,EACA,aAAa;AACf;AAaO,SAAS,kBAAgC;AAC9C,QAAM,MAAM,SAAS;AACrB,SAAO;AAAA,IACL,QAAQ,CAAC,EAAE,IAAI,oBAAoB,IAAI;AAAA,IACvC,QAAQ,CAAC,CAAC,IAAI;AAAA,IACd,UAAU,CAAC,CAAC,IAAI;AAAA,IAChB,eAAe,CAAC,CAAC,eAAe;AAAA,EAClC;AACF;AAEO,SAAS,qBAAqB,YAAwC;AAC3E,QAAM,WAA+C;AAAA,IACnD,QAAQ;AAAA,IACR,QAAQ;AAAA,IACR,UAAU;AAAA,IACV,eAAe;AAAA,EACjB;AACA,SAAO,SAAS,UAAU;AAC5B;AAMO,MAAM,cAAc;AAAA,EACzB,QAAQ,aAAa,QAAQ,IAAI,oBAAoB,IAAI,GAAG,GAAG;AAAA,EAC/D,SAAS,aAAa,QAAQ,IAAI,qBAAqB,IAAI,GAAG,GAAG;AAAA,EACjE,QAAQ,aAAa,QAAQ,IAAI,oBAAoB,IAAI,GAAG,GAAG;AAAA,EAC/D,gBAAgB,aAAa,QAAQ,IAAI,iBAAiB,IAAI,GAAG,GAAG;AACtE;AAEO,MAAM,UAAU;AAAA,EACrB,YAAY;AAAA,EACZ,mBAAmB;AAAA,EACnB,mBAAmB;AACrB;AAMO,MAAM,SAAS;AAAA,EACpB,YAAY;AAAA,EACZ,oBAAoB;AAAA,EACpB,iBAAiB;AAAA,EACjB,WAAW;AAAA,EACX,WAAW;AAAA,EACX,aAAa;AAAA,EACb,cAAc,CAAC,KAAM,KAAM,KAAM,MAAO,IAAK;AAC/C;AAMO,MAAM,cAAsC;AAAA,EACjD,GAAG;AAAA,EACH,GAAG;AAAA,EACH,GAAG;AAAA,EACH,GAAG;AAAA,EACH,GAAG;AAAA,EACH,GAAG;AAAA,EACH,GAAG;AAAA,EACH,GAAG;AAAA,EACH,GAAG;AAAA,EACH,IAAI;AACN;AAwBA,IAAI,sBAAkD;AAEtD,SAAS,mBAAwC;AAC/C,MAAI,oBAAqB,QAAO;AAEhC,QAAM,SAAS,QAAQ,IAAI,aAAa,KAAK,KAAK;AAClD,QAAM,UAAU,QAAQ,IAAI,cAAc,KAAK;AAC/C,QAAM,QAAQ,QAAQ,IAAI,WAAW,KAAK;AAC1C,QAAM,gBAAgB,QAAQ,IAAI,oBAAoB,KAAK,KAAK;AAEhE,MAAI,UAAU,CAAC,SAAS;AACtB,UAAM,IAAI;AAAA,MACR;AAAA,IAEF;AAAA,EACF;AACA,MAAI,UAAU,CAAC,OAAO;AACpB,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AAEA,wBAAsB;AAAA,IACpB,SAAS;AAAA,IACT,UAAU,WAAW;AAAA,IACrB,OAAO,SAAS;AAAA,IAChB,gBAAgB;AAAA,EAClB;AACA,SAAO;AACT;AAEO,MAAM,iBAAsC,IAAI,MAAM,CAAC,GAA0B;AAAA,EACtF,IAAI,SAAS,MAAc;AACzB,WAAO,iBAAiB,EAAE,IAAiC;AAAA,EAC7D;AACF,CAAC;",
|
|
6
6
|
"names": []
|
|
7
7
|
}
|
package/dist/src/tools/scrape.js
CHANGED
|
@@ -11,12 +11,14 @@ import {
 } from "../schemas/scrape-links.js";
 import { ScraperClient } from "../clients/scraper.js";
 import { RedditClient } from "../clients/reddit.js";
+import { JinaClient } from "../clients/jina.js";
 import { MarkdownCleaner } from "../services/markdown-cleaner.js";
 import { createLLMProcessor, processContentWithLLM } from "../services/llm-processor.js";
 import { removeMetaTags } from "../utils/markdown-formatter.js";
 import { extractReadableContent } from "../utils/content-extractor.js";
-import { classifyError } from "../utils/errors.js";
-import {
+import { classifyError, ErrorCode } from "../utils/errors.js";
+import { isDocumentUrl } from "../utils/source-type.js";
+import { pMap, pMapSettled } from "../utils/concurrency.js";
 import {
   mcpLog,
   formatSuccess,
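Note: isDocumentUrl and pMapSettled come from utils/source-type.js and utils/concurrency.js, whose bodies are not part of this excerpt. The sketch below is only the signature that the call sites further down appear to assume for pMapSettled, namely a concurrency-limited Promise.allSettled that preserves input order and may yield undefined slots; the type names are illustrative, not the package's own:

type SettledResult<R> =
  | { status: "fulfilled"; value: R }
  | { status: "rejected"; reason: unknown };

// Assumed signature only; the real helper is not shown in this diff.
declare function pMapSettled<T, R>(
  items: T[],
  mapper: (item: T) => Promise<R>,
  concurrency: number
): Promise<Array<SettledResult<R> | undefined>>;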
@@ -75,6 +77,7 @@ Execution time: ${formatDuration(Date.now() - startTime)}`
 function partitionUrls(urls) {
   const webInputs = [];
   const redditInputs = [];
+  const documentInputs = [];
   const invalidEntries = [];
   for (let i = 0; i < urls.length; i++) {
     const url = urls[i];
@@ -84,23 +87,32 @@ function partitionUrls(urls) {
       invalidEntries.push({ url, origIndex: i });
       continue;
     }
-    if (
+    if (isDocumentUrl(url)) {
+      documentInputs.push({ url, origIndex: i });
+    } else if (isRedditUrl(url)) {
       redditInputs.push({ url, origIndex: i });
     } else {
       webInputs.push({ url, origIndex: i });
     }
   }
-  return { webInputs, redditInputs, invalidEntries };
+  return { webInputs, redditInputs, documentInputs, invalidEntries };
 }
 async function fetchWebBranch(inputs, client) {
   if (inputs.length === 0) {
-    return {
+    return {
+      successItems: [],
+      failedContents: [],
+      metrics: { successful: 0, failed: 0, totalCredits: 0 },
+      jinaFallbacks: []
+    };
   }
   mcpLog("info", `[concurrency] web branch: fanning out ${inputs.length} URL(s) with limit=${CONCURRENCY.SCRAPER}`, "scrape");
   const urls = inputs.map((i) => i.url);
   const results = await client.scrapeMultiple(urls, { timeout: 60 });
+  const urlToIndex = new Map(inputs.map((i) => [i.url, i.origIndex]));
   const successItems = [];
   const failedContents = [];
+  const jinaFallbacks = [];
   let successful = 0;
   let failed = 0;
   let totalCredits = 0;
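Note: partitionUrls now buckets every valid URL three ways (document, reddit, web) before anything is fetched. The real isDocumentUrl ships in utils/source-type.js, which this excerpt does not show; an extension-based check along the following lines is one plausible reading, offered purely as an illustration and not as the package's actual implementation:

const DOCUMENT_EXTENSIONS = [".pdf", ".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx"];

function isDocumentUrlSketch(url: string): boolean {
  try {
    // partitionUrls has already rejected strings that fail new URL(url),
    // so this catch is only a safety net.
    const pathname = new URL(url).pathname.toLowerCase();
    return DOCUMENT_EXTENSIONS.some((ext) => pathname.endsWith(ext));
  } catch {
    return false;
  }
}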
@@ -114,12 +126,29 @@ async function fetchWebBranch(inputs, client) {
 \u274C No result returned`);
       continue;
     }
-    if (result.error
+    if (result.error?.code === ErrorCode.UNSUPPORTED_BINARY_CONTENT) {
+      jinaFallbacks.push({
+        url: result.url,
+        origIndex: urlToIndex.get(result.url) ?? origIndex,
+        reason: "binary_content"
+      });
+      continue;
+    }
+    const scrapeFailed = Boolean(result.error) || result.statusCode < 200 || result.statusCode >= 300;
+    if (scrapeFailed && result.statusCode !== 404) {
+      jinaFallbacks.push({
+        url: result.url,
+        origIndex: urlToIndex.get(result.url) ?? origIndex,
+        reason: "scrape_failed",
+        scrapeError: result.error?.message || result.content || `HTTP ${result.statusCode}`
+      });
+      continue;
+    }
+    if (scrapeFailed) {
       failed++;
-      const errorMsg = result.error?.message || result.content || `HTTP ${result.statusCode}`;
       failedContents.push(`## ${result.url}
 
-\u274C Failed to scrape:
+\u274C Failed to scrape: HTTP 404 \u2014 Page not found`);
       continue;
     }
     successful++;
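Note: with this change, fetchWebBranch resolves each Scrape.do result into one of four outcomes: usable content, a binary payload rerouted to Jina Reader, a non-404 failure rerouted to Jina Reader with the original error kept for context, or a 404 that stays a hard failure. A condensed, illustrative restatement of that decision follows; the real code compares against ErrorCode.UNSUPPORTED_BINARY_CONTENT, whose concrete value this diff does not show:

type WebOutcome = "use_content" | "jina_binary" | "jina_scrape_failed" | "hard_fail_404";

function classifyWebResultSketch(statusCode: number, errorCode?: string): WebOutcome {
  // Binary content (e.g. PDFs) is handed straight to the document branch.
  if (errorCode === "UNSUPPORTED_BINARY_CONTENT") return "jina_binary";
  const scrapeFailed = Boolean(errorCode) || statusCode < 200 || statusCode >= 300;
  // Any other failure except a 404 gets a second chance via Jina Reader.
  if (scrapeFailed && statusCode !== 404) return "jina_scrape_failed";
  if (scrapeFailed) return "hard_fail_404";
  return "use_content";
}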
@@ -134,7 +163,67 @@ async function fetchWebBranch(inputs, client) {
     }
     successItems.push({ url: result.url, content, index: origIndex });
   }
-  return {
+  return {
+    successItems,
+    failedContents,
+    metrics: { successful, failed, totalCredits },
+    jinaFallbacks
+  };
+}
+function formatJinaFailure(url, jinaError, scrapeError) {
+  if (scrapeError) {
+    return `## ${url}
+
+\u274C Both scrapers failed. Scrape.do: ${scrapeError}. Jina Reader: ${jinaError}.`;
+  }
+  return `## ${url}
+
+\u274C Document conversion failed: ${jinaError}`;
+}
+async function fetchDocumentBranch(inputs, jinaClient, scrapeErrorContext) {
+  if (inputs.length === 0) {
+    return { successItems: [], failedContents: [], metrics: { successful: 0, failed: 0, totalCredits: 0 } };
+  }
+  mcpLog(
+    "info",
+    `[concurrency] document branch (jina): converting ${inputs.length} URL(s) with limit=${CONCURRENCY.SCRAPER}`,
+    "scrape"
+  );
+  const results = await pMapSettled(
+    inputs,
+    (input) => jinaClient.convert({ url: input.url }),
+    CONCURRENCY.SCRAPER
+  );
+  const successItems = [];
+  const failedContents = [];
+  let successful = 0;
+  let failed = 0;
+  for (let i = 0; i < results.length; i++) {
+    const settled = results[i];
+    const input = inputs[i];
+    const scrapeError = scrapeErrorContext?.get(input.url);
+    if (!settled) {
+      failed++;
+      failedContents.push(formatJinaFailure(input.url, "No result returned", scrapeError));
+      continue;
+    }
+    if (settled.status === "rejected") {
+      failed++;
+      const reason = settled.reason instanceof Error ? settled.reason.message : String(settled.reason);
+      failedContents.push(formatJinaFailure(input.url, reason, scrapeError));
+      continue;
+    }
+    const result = settled.value;
+    if (result.error || result.statusCode < 200 || result.statusCode >= 300) {
+      failed++;
+      const errorMsg = result.error?.message || `HTTP ${result.statusCode}`;
+      failedContents.push(formatJinaFailure(input.url, errorMsg, scrapeError));
+      continue;
+    }
+    successful++;
+    successItems.push({ url: input.url, content: result.content, index: input.origIndex });
+  }
+  return { successItems, failedContents, metrics: { successful, failed, totalCredits: 0 } };
 }
 function formatRedditPostAsMarkdown(result) {
   const { post, comments } = result;
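Note: formatJinaFailure preserves both error messages when a URL first failed in Scrape.do and then failed in Jina Reader as well. For a hypothetical URL and made-up error strings, the call

formatJinaFailure("https://example.com/whitepaper.pdf", "HTTP 422", "HTTP 500")

produces the markdown entry

## https://example.com/whitepaper.pdf

\u274C Both scrapers failed. Scrape.do: HTTP 500. Jina Reader: HTTP 422.

(the \u274C escape renders as a cross-mark emoji at runtime). Without a scrapeError argument it falls back to the shorter "Document conversion failed" form used by the plain document branch.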
@@ -315,11 +404,11 @@ async function handleScrapeLinks(params, reporter = NOOP_REPORTER) {
   if (!params.urls || params.urls.length === 0) {
     return createScrapeErrorResponse("NO_URLS", "No URLs provided", startTime);
   }
-  const { webInputs, redditInputs, invalidEntries } = partitionUrls(params.urls);
-  const validCount = webInputs.length + redditInputs.length;
+  const { webInputs, redditInputs, documentInputs, invalidEntries } = partitionUrls(params.urls);
+  const validCount = webInputs.length + redditInputs.length + documentInputs.length;
   await reporter.log(
     "info",
-    `Partitioned ${params.urls.length} URL(s): ${webInputs.length} web, ${redditInputs.length} reddit, ${invalidEntries.length} invalid`
+    `Partitioned ${params.urls.length} URL(s): ${webInputs.length} web, ${redditInputs.length} reddit, ${documentInputs.length} document, ${invalidEntries.length} invalid`
   );
   if (validCount === 0) {
     return createScrapeErrorResponse(
@@ -334,17 +423,23 @@ async function handleScrapeLinks(params, reporter = NOOP_REPORTER) {
   }
   mcpLog(
     "info",
-    `Starting scrape: ${webInputs.length} web + ${redditInputs.length} reddit URL(s)`,
+    `Starting scrape: ${webInputs.length} web + ${redditInputs.length} reddit + ${documentInputs.length} document URL(s)`,
     "scrape"
   );
   await reporter.progress(15, 100, "Preparing scraper clients");
   let clients = null;
   try {
+    const jinaClient = new JinaClient();
     if (webInputs.length > 0) {
-      clients = {
+      clients = {
+        client: new ScraperClient(),
+        jinaClient,
+        llmProcessor: createLLMProcessor()
+      };
     } else {
       clients = {
         client: null,
+        jinaClient,
         llmProcessor: createLLMProcessor()
       };
     }
@@ -362,20 +457,59 @@ async function handleScrapeLinks(params, reporter = NOOP_REPORTER) {
   }
   const enhancedInstruction = enhanceExtractionInstruction(params.extract);
   await reporter.progress(35, 100, "Fetching page content");
-  const
-
-
+  const emptyPhase = {
+    successItems: [],
+    failedContents: [],
+    metrics: { successful: 0, failed: 0, totalCredits: 0 },
+    jinaFallbacks: []
+  };
+  const [webPhase, redditPhase, documentPhase] = await Promise.all([
+    webInputs.length > 0 ? fetchWebBranch(webInputs, clients.client) : Promise.resolve(emptyPhase),
+    fetchRedditBranch(redditInputs),
+    fetchDocumentBranch(documentInputs, clients.jinaClient)
   ]);
-
+  let deferredPhase = {
+    successItems: [],
+    failedContents: [],
+    metrics: { successful: 0, failed: 0, totalCredits: 0 }
+  };
+  if (webPhase.jinaFallbacks.length > 0) {
+    const binaryCount = webPhase.jinaFallbacks.filter((f) => f.reason === "binary_content").length;
+    const failedCount = webPhase.jinaFallbacks.length - binaryCount;
+    await reporter.log(
+      "info",
+      `Rerouting ${webPhase.jinaFallbacks.length} URL(s) to Jina Reader: ${binaryCount} binary, ${failedCount} scrape-failed`
+    );
+    const fallbackInputs = webPhase.jinaFallbacks.map((f) => ({
+      url: f.url,
+      origIndex: f.origIndex
+    }));
+    const errorContext = new Map(
+      webPhase.jinaFallbacks.filter((f) => f.scrapeError !== void 0).map((f) => [f.url, f.scrapeError])
+    );
+    deferredPhase = await fetchDocumentBranch(fallbackInputs, clients.jinaClient, errorContext);
+  }
+  const successItems = [
+    ...webPhase.successItems,
+    ...redditPhase.successItems,
+    ...documentPhase.successItems,
+    ...deferredPhase.successItems
+  ];
   const invalidFailed = invalidEntries.map(
     ({ url }) => `## ${url}
 
 \u274C Invalid URL format`
   );
-  const failedContents = [
+  const failedContents = [
+    ...invalidFailed,
+    ...webPhase.failedContents,
+    ...redditPhase.failedContents,
+    ...documentPhase.failedContents,
+    ...deferredPhase.failedContents
+  ];
   const metrics = {
-    successful: webPhase.metrics.successful + redditPhase.metrics.successful,
-    failed: invalidEntries.length + webPhase.metrics.failed + redditPhase.metrics.failed,
+    successful: webPhase.metrics.successful + redditPhase.metrics.successful + documentPhase.metrics.successful + deferredPhase.metrics.successful,
+    failed: invalidEntries.length + webPhase.metrics.failed + redditPhase.metrics.failed + documentPhase.metrics.failed + deferredPhase.metrics.failed,
     totalCredits: webPhase.metrics.totalCredits
   };
   await reporter.log("info", `Fetched ${metrics.successful} page(s), ${metrics.failed} failed`);
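Note: the fetch stage now runs the web, reddit, and document branches in parallel and then, only when the web branch flagged fallbacks, a second sequential Jina Reader pass over those URLs, so rerouted URLs pay one extra round-trip after the parallel phase completes. All four phases are merged the same way; a simplified sketch of the metric merge (field names mirror the code above, and the assumption that only the Scrape.do branch reports credits matches totalCredits being taken from webPhase alone):

interface PhaseMetricsSketch {
  successful: number;
  failed: number;
  totalCredits: number;
}

function mergeMetricsSketch(invalidCount: number, web: PhaseMetricsSketch, others: PhaseMetricsSketch[]): PhaseMetricsSketch {
  const all = [web, ...others];
  return {
    successful: all.reduce((n, p) => n + p.successful, 0),
    failed: invalidCount + all.reduce((n, p) => n + p.failed, 0),
    // reddit, document, and deferred phases report 0 credits in this diff
    totalCredits: web.totalCredits
  };
}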
@@ -436,6 +570,7 @@ function registerScrapeLinksTool(server) {
   );
 }
 export {
+  formatJinaFailure,
   handleScrapeLinks,
   registerScrapeLinksTool
 };
package/dist/src/tools/scrape.js.map
CHANGED
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"version": 3,
|
|
3
3
|
"sources": ["../../../src/tools/scrape.ts"],
|
|
4
|
-
"sourcesContent": ["/**\n * Scrape Links Tool Handler\n *\n * Scrapes many URLs in parallel. Reddit permalinks (reddit.com/r/.../comments/...)\n * are auto-detected and routed through the Reddit API; all other URLs go through\n * the scraper. Both branches feed the same per-URL LLM extraction pipeline.\n *\n * NEVER throws \u2014 every error is returned as a tool-level failure response.\n */\n\nimport type { MCPServer } from 'mcp-use/server';\n\nimport {\n SCRAPER,\n CONCURRENCY,\n getCapabilities,\n getMissingEnvMessage,\n parseEnv,\n} from '../config/index.js';\nimport {\n scrapeLinksOutputSchema,\n scrapeLinksParamsSchema,\n type ScrapeLinksParams,\n type ScrapeLinksOutput,\n} from '../schemas/scrape-links.js';\nimport { ScraperClient } from '../clients/scraper.js';\nimport { RedditClient, type PostResult } from '../clients/reddit.js';\nimport { MarkdownCleaner } from '../services/markdown-cleaner.js';\nimport { createLLMProcessor, processContentWithLLM } from '../services/llm-processor.js';\nimport { removeMetaTags } from '../utils/markdown-formatter.js';\nimport { extractReadableContent } from '../utils/content-extractor.js';\nimport { classifyError } from '../utils/errors.js';\nimport { pMap } from '../utils/concurrency.js';\nimport {\n mcpLog,\n formatSuccess,\n formatError,\n formatBatchHeader,\n formatDuration,\n} from './utils.js';\nimport {\n createToolReporter,\n NOOP_REPORTER,\n toolFailure,\n toolSuccess,\n toToolResponse,\n type ToolExecutionResult,\n type ToolReporter,\n} from './mcp-helpers.js';\n\nconst markdownCleaner = new MarkdownCleaner();\n\nfunction enhanceExtractionInstruction(instruction: string | undefined): string {\n const base = instruction || 'Extract the main content and key information from this page.';\n return `${SCRAPER.EXTRACTION_PREFIX}\\n\\n${base}\\n\\n${SCRAPER.EXTRACTION_SUFFIX}`;\n}\n\n// --- Types ---\n\ninterface ProcessedResult {\n url: string;\n content: string;\n index: number; // original position in params.urls[]\n}\n\ninterface ScrapeMetrics {\n successful: number;\n failed: number;\n totalCredits: number;\n}\n\ninterface ScrapePhaseResult {\n successItems: ProcessedResult[];\n failedContents: string[];\n metrics: ScrapeMetrics;\n}\n\ninterface BranchInput {\n url: string;\n origIndex: number;\n}\n\ninterface ScrapeClients {\n client: ScraperClient;\n llmProcessor: ReturnType<typeof createLLMProcessor>;\n}\n\n// --- Reddit URL detection ---\n\nconst REDDIT_HOST = /(?:^|\\.)reddit\\.com$/i;\nconst REDDIT_POST_PERMALINK = /\\/r\\/[^/]+\\/comments\\/[a-z0-9]+/i;\n\nfunction isRedditUrl(url: string): boolean {\n try {\n const u = new URL(url);\n return REDDIT_HOST.test(u.hostname);\n } catch {\n return false;\n }\n}\n\nfunction isRedditPostPermalink(url: string): boolean {\n try {\n const u = new URL(url);\n return REDDIT_HOST.test(u.hostname) && REDDIT_POST_PERMALINK.test(u.pathname);\n } catch {\n return false;\n }\n}\n\n// --- Error helper ---\n\nfunction createScrapeErrorResponse(\n code: string,\n message: string,\n startTime: number,\n retryable = false,\n alternatives?: string[],\n): ToolExecutionResult<ScrapeLinksOutput> {\n return toolFailure(\n `${formatError({\n code,\n message,\n retryable,\n toolName: 'scrape-links',\n howToFix: code === 'NO_URLS' ? 
['Provide at least one valid URL'] : undefined,\n alternatives,\n })}\\n\\nExecution time: ${formatDuration(Date.now() - startTime)}`,\n );\n}\n\n// --- URL partitioning ---\n\ninterface PartitionedUrls {\n webInputs: BranchInput[];\n redditInputs: BranchInput[];\n invalidEntries: { url: string; origIndex: number }[];\n}\n\nfunction partitionUrls(urls: string[]): PartitionedUrls {\n const webInputs: BranchInput[] = [];\n const redditInputs: BranchInput[] = [];\n const invalidEntries: { url: string; origIndex: number }[] = [];\n\n for (let i = 0; i < urls.length; i++) {\n const url = urls[i]!;\n try {\n new URL(url);\n } catch {\n invalidEntries.push({ url, origIndex: i });\n continue;\n }\n if (isRedditUrl(url)) {\n redditInputs.push({ url, origIndex: i });\n } else {\n webInputs.push({ url, origIndex: i });\n }\n }\n\n return { webInputs, redditInputs, invalidEntries };\n}\n\n// --- Web branch ---\n\nasync function fetchWebBranch(\n inputs: BranchInput[],\n client: ScraperClient,\n): Promise<ScrapePhaseResult> {\n if (inputs.length === 0) {\n return { successItems: [], failedContents: [], metrics: { successful: 0, failed: 0, totalCredits: 0 } };\n }\n\n mcpLog('info', `[concurrency] web branch: fanning out ${inputs.length} URL(s) with limit=${CONCURRENCY.SCRAPER}`, 'scrape');\n const urls = inputs.map((i) => i.url);\n const results = await client.scrapeMultiple(urls, { timeout: 60 });\n\n const successItems: ProcessedResult[] = [];\n const failedContents: string[] = [];\n let successful = 0;\n let failed = 0;\n let totalCredits = 0;\n\n for (let i = 0; i < results.length; i++) {\n const result = results[i];\n const origIndex = inputs[i]!.origIndex;\n if (!result) {\n failed++;\n failedContents.push(`## ${inputs[i]!.url}\\n\\n\u274C No result returned`);\n continue;\n }\n\n if (result.error || result.statusCode < 200 || result.statusCode >= 300) {\n failed++;\n const errorMsg = result.error?.message || result.content || `HTTP ${result.statusCode}`;\n failedContents.push(`## ${result.url}\\n\\n\u274C Failed to scrape: ${errorMsg}`);\n continue;\n }\n\n successful++;\n totalCredits += result.credits;\n\n let content: string;\n try {\n const readable = extractReadableContent(result.content, result.url);\n const sourceForCleaner = readable.extracted ? readable.content : result.content;\n content = markdownCleaner.processContent(sourceForCleaner);\n } catch {\n content = result.content;\n }\n\n successItems.push({ url: result.url, content, index: origIndex });\n }\n\n return { successItems, failedContents, metrics: { successful, failed, totalCredits } };\n}\n\n// --- Reddit branch ---\n\nfunction formatRedditPostAsMarkdown(result: PostResult): string {\n const { post, comments } = result;\n const lines: string[] = [];\n lines.push(`# ${post.title}`);\n lines.push('');\n lines.push(`**r/${post.subreddit}** \u2022 u/${post.author} \u2022 \u2B06\uFE0F ${post.score} \u2022 \uD83D\uDCAC ${post.commentCount} comments`);\n lines.push(`\uD83D\uDD17 ${post.url}`);\n lines.push('');\n if (post.body) {\n lines.push('## Post content');\n lines.push('');\n lines.push(post.body);\n lines.push('');\n }\n if (comments.length > 0) {\n lines.push(`## Top comments (${comments.length} total)`);\n lines.push('');\n for (const c of comments) {\n const indent = ' '.repeat(c.depth);\n const op = c.isOP ? ' **[OP]**' : '';\n const score = c.score >= 0 ? 
`+${c.score}` : `${c.score}`;\n lines.push(`${indent}- **u/${c.author}**${op} _(${score})_`);\n for (const line of c.body.split('\\n')) {\n lines.push(`${indent} ${line}`);\n }\n lines.push('');\n }\n }\n return lines.join('\\n');\n}\n\nasync function fetchRedditBranch(inputs: BranchInput[]): Promise<ScrapePhaseResult> {\n if (inputs.length === 0) {\n return { successItems: [], failedContents: [], metrics: { successful: 0, failed: 0, totalCredits: 0 } };\n }\n\n const env = parseEnv();\n if (!env.REDDIT_CLIENT_ID || !env.REDDIT_CLIENT_SECRET) {\n const failedContents = inputs.map(\n (i) => `## ${i.url}\\n\\n\u274C Reddit URL detected, but Reddit API is not configured. Set \\`REDDIT_CLIENT_ID\\` and \\`REDDIT_CLIENT_SECRET\\` in the server env to enable threaded Reddit scraping.`,\n );\n return {\n successItems: [],\n failedContents,\n metrics: { successful: 0, failed: inputs.length, totalCredits: 0 },\n };\n }\n\n // Warn for non-permalink Reddit URLs (subreddit homepages, /new, /top, /hot,\n // user profiles). The Reddit API path we call requires /r/.../comments/... \u2014\n // reject upfront so the caller sees a helpful message instead of a 404.\n const [postInputs, nonPermalinks] = inputs.reduce<[BranchInput[], BranchInput[]]>(\n ([posts, rest], input) => {\n if (isRedditPostPermalink(input.url)) posts.push(input);\n else rest.push(input);\n return [posts, rest];\n },\n [[], []],\n );\n\n const nonPermalinkFailed = nonPermalinks.map(\n (i) => `## ${i.url}\\n\\n\u274C Only Reddit post permalinks (/r/<sub>/comments/<id>/...) are supported. Use web-search with scope:\"reddit\" to discover post permalinks first.`,\n );\n\n if (postInputs.length === 0) {\n return {\n successItems: [],\n failedContents: nonPermalinkFailed,\n metrics: { successful: 0, failed: nonPermalinks.length, totalCredits: 0 },\n };\n }\n\n mcpLog('info', `[concurrency] reddit branch: fetching ${postInputs.length} post(s) with limit=${CONCURRENCY.REDDIT}`, 'scrape');\n const client = new RedditClient(env.REDDIT_CLIENT_ID, env.REDDIT_CLIENT_SECRET);\n const urls = postInputs.map((i) => i.url);\n const batchResult = await client.batchGetPosts(urls, true);\n const urlToIndex = new Map(postInputs.map((i) => [i.url, i.origIndex]));\n\n const successItems: ProcessedResult[] = [];\n const failedContents: string[] = [...nonPermalinkFailed];\n let successful = 0;\n let failed = nonPermalinks.length;\n\n for (const [url, result] of batchResult.results) {\n const origIndex = urlToIndex.get(url) ?? -1;\n if (result instanceof Error) {\n failed++;\n failedContents.push(`## ${url}\\n\\n\u274C Reddit fetch failed: ${result.message}`);\n continue;\n }\n successful++;\n successItems.push({ url, content: formatRedditPostAsMarkdown(result), index: origIndex });\n }\n\n return { successItems, failedContents, metrics: { successful, failed, totalCredits: 0 } };\n}\n\n// --- LLM extraction (shared by both branches) ---\n\nasync function processItemsWithLlm(\n successItems: ProcessedResult[],\n enhancedInstruction: string,\n llmProcessor: ReturnType<typeof createLLMProcessor>,\n reporter: ToolReporter,\n): Promise<{ items: ProcessedResult[]; llmErrors: number; llmAttempted: number }> {\n let llmErrors = 0;\n\n if (!llmProcessor || successItems.length === 0) {\n if (!llmProcessor && successItems.length > 0) {\n mcpLog('warning', 'LLM unavailable (LLM_API_KEY not set). 
Returning raw scraped content.', 'scrape');\n void reporter.log('warning', 'llm_extractor_unreachable: planner not configured; raw scraped content returned');\n }\n return { items: successItems, llmErrors, llmAttempted: 0 };\n }\n\n mcpLog('info', `[concurrency] llm extraction: fanning out ${successItems.length} item(s) with limit=${CONCURRENCY.LLM_EXTRACTION}`, 'scrape');\n\n const llmResults = await pMap(\n successItems,\n async (item) => {\n mcpLog('debug', `LLM extracting ${item.url}...`, 'scrape');\n\n const llmResult = await processContentWithLLM(\n item.content,\n { enabled: true, extract: enhancedInstruction, url: item.url },\n llmProcessor,\n );\n\n if (llmResult.processed) {\n return { ...item, content: llmResult.content };\n }\n\n llmErrors++;\n mcpLog('warning', `LLM extraction failed for ${item.url}: ${llmResult.error || 'unknown reason'}`, 'scrape');\n void reporter.log('warning', `llm_extractor_unreachable: ${item.url} \u2014 ${llmResult.error || 'unknown reason'}`);\n return item;\n },\n CONCURRENCY.LLM_EXTRACTION,\n );\n\n return { items: llmResults, llmErrors, llmAttempted: successItems.length };\n}\n\n// --- Output assembly ---\n\nfunction assembleContentEntries(successItems: ProcessedResult[], failedContents: string[]): string[] {\n const sorted = [...successItems].sort((a, b) => a.index - b.index);\n const contents = [...failedContents];\n for (const item of sorted) {\n let content = item.content;\n try {\n content = removeMetaTags(content);\n } catch {\n // Use content as-is\n }\n contents.push(`## ${item.url}\\n\\n${content}`);\n }\n return contents;\n}\n\nfunction buildScrapeResponse(\n params: ScrapeLinksParams,\n contents: string[],\n metrics: ScrapeMetrics,\n llmErrors: number,\n executionTime: number,\n llmAccounting: { llmAttempted: number; llmSucceeded: boolean },\n): { content: string; structuredContent: ScrapeLinksOutput } {\n const llmExtras: Record<string, string | number> = {};\n if (llmAccounting.llmAttempted > 0) {\n const ok = llmAccounting.llmAttempted - llmErrors;\n llmExtras['LLM extraction'] = `${ok}/${llmAccounting.llmAttempted} succeeded`;\n if (!llmAccounting.llmSucceeded) {\n llmExtras['LLM credit'] = '0 charged (no extraction produced)';\n }\n } else if (llmErrors > 0) {\n llmExtras['LLM extraction failures'] = llmErrors;\n }\n\n const batchHeader = formatBatchHeader({\n title: `Scraped Content (${params.urls.length} URLs)`,\n totalItems: params.urls.length,\n successful: metrics.successful,\n failed: metrics.failed,\n extras: {\n 'Credits used': metrics.totalCredits,\n ...llmExtras,\n },\n });\n\n const formattedContent = formatSuccess({\n title: 'Scraping Complete',\n summary: batchHeader,\n data: contents.join('\\n\\n---\\n\\n'),\n metadata: {\n 'Execution time': formatDuration(executionTime),\n },\n });\n\n const metadata: ScrapeLinksOutput['metadata'] = {\n total_items: params.urls.length,\n successful: metrics.successful,\n failed: metrics.failed,\n execution_time_ms: executionTime,\n total_credits: metrics.totalCredits,\n };\n return { content: formattedContent, structuredContent: { metadata } };\n}\n\n// --- Handler ---\n\nexport async function handleScrapeLinks(\n params: ScrapeLinksParams,\n reporter: ToolReporter = NOOP_REPORTER,\n): Promise<ToolExecutionResult<ScrapeLinksOutput>> {\n const startTime = Date.now();\n\n if (!params.urls || params.urls.length === 0) {\n return createScrapeErrorResponse('NO_URLS', 'No URLs provided', startTime);\n }\n\n const { webInputs, redditInputs, invalidEntries } = partitionUrls(params.urls);\n 
const validCount = webInputs.length + redditInputs.length;\n\n await reporter.log(\n 'info',\n `Partitioned ${params.urls.length} URL(s): ${webInputs.length} web, ${redditInputs.length} reddit, ${invalidEntries.length} invalid`,\n );\n\n if (validCount === 0) {\n return createScrapeErrorResponse(\n 'INVALID_URLS',\n `All ${params.urls.length} URLs are invalid`,\n startTime,\n false,\n [\n 'web-search(queries=[...], extract=\"...\") \u2014 search for valid URLs first, then scrape the results',\n ],\n );\n }\n\n mcpLog(\n 'info',\n `Starting scrape: ${webInputs.length} web + ${redditInputs.length} reddit URL(s)`,\n 'scrape',\n );\n await reporter.progress(15, 100, 'Preparing scraper clients');\n\n // Only initialize web clients if we actually have web URLs. Reddit-only\n // batches run without touching the scraper.\n let clients: ScrapeClients | null = null;\n try {\n if (webInputs.length > 0) {\n clients = { client: new ScraperClient(), llmProcessor: createLLMProcessor() };\n } else {\n // Reddit-only: no scraper needed, but still create the LLM processor\n // so the extraction pass runs.\n clients = {\n client: null as unknown as ScraperClient,\n llmProcessor: createLLMProcessor(),\n };\n }\n } catch (error) {\n const err = classifyError(error);\n return createScrapeErrorResponse(\n 'CLIENT_INIT_FAILED',\n `Failed to initialize scraper: ${err.message}`,\n startTime,\n false,\n [\n 'web-search(queries=[\"topic key findings\", \"topic summary\"], extract=\"key findings and summary\") \u2014 search instead of scraping',\n ],\n );\n }\n\n const enhancedInstruction = enhanceExtractionInstruction(params.extract);\n\n await reporter.progress(35, 100, 'Fetching page content');\n\n // Run both branches in parallel. Failures in one branch do not block the other.\n const [webPhase, redditPhase] = await Promise.all([\n webInputs.length > 0\n ? 
fetchWebBranch(webInputs, clients.client)\n : Promise.resolve<ScrapePhaseResult>({ successItems: [], failedContents: [], metrics: { successful: 0, failed: 0, totalCredits: 0 } }),\n fetchRedditBranch(redditInputs),\n ]);\n\n const successItems = [...webPhase.successItems, ...redditPhase.successItems];\n const invalidFailed = invalidEntries.map(\n ({ url }) => `## ${url}\\n\\n\u274C Invalid URL format`,\n );\n const failedContents = [...invalidFailed, ...webPhase.failedContents, ...redditPhase.failedContents];\n const metrics: ScrapeMetrics = {\n successful: webPhase.metrics.successful + redditPhase.metrics.successful,\n failed: invalidEntries.length + webPhase.metrics.failed + redditPhase.metrics.failed,\n totalCredits: webPhase.metrics.totalCredits,\n };\n\n await reporter.log('info', `Fetched ${metrics.successful} page(s), ${metrics.failed} failed`);\n\n if (successItems.length > 0) {\n await reporter.progress(80, 100, 'Running LLM extraction over fetched pages');\n }\n\n const { items: processedItems, llmErrors, llmAttempted } = await processItemsWithLlm(\n successItems,\n enhancedInstruction,\n clients.llmProcessor,\n reporter,\n );\n\n const contents = assembleContentEntries(processedItems, failedContents);\n const executionTime = Date.now() - startTime;\n\n mcpLog(\n 'info',\n `Completed: ${metrics.successful} successful, ${metrics.failed} failed, ${metrics.totalCredits} credits used`,\n 'scrape',\n );\n\n const llmSucceeded = llmAttempted > 0 && llmErrors < llmAttempted;\n const result = buildScrapeResponse(\n params,\n contents,\n metrics,\n llmErrors,\n executionTime,\n { llmAttempted, llmSucceeded },\n );\n\n if (metrics.successful === 0 && metrics.failed > 0) {\n return toolFailure(result.content);\n }\n\n return toolSuccess(result.content, result.structuredContent);\n}\n\nexport function registerScrapeLinksTool(server: MCPServer): void {\n server.tool(\n {\n name: 'scrape-links',\n title: 'Scrape Links',\n description:\n 'Fetch many URLs in parallel and run per-URL structured LLM extraction. Auto-detects reddit.com post permalinks and routes them through the Reddit API (threaded post + comments); everything else flows through the HTTP scraper. Safe to call in parallel \u2014 group URLs by context rather than jamming unrelated batches together. Each page returns `## Source`, `## Matches` (verbatim-preserved facts), `## Not found` (explicit gaps), and `## Follow-up signals` (new terms + referenced URLs) that feed the next research loop. Describe the SHAPE of what you want in `extract`, facets separated by `|` (e.g. `root cause | affected versions | fix | workarounds | timeline`).',\n schema: scrapeLinksParamsSchema,\n outputSchema: scrapeLinksOutputSchema,\n annotations: {\n readOnlyHint: true,\n idempotentHint: true,\n destructiveHint: false,\n openWorldHint: true,\n },\n },\n async (args, ctx) => {\n if (!getCapabilities().scraping) {\n return toToolResponse(toolFailure(getMissingEnvMessage('scraping')));\n }\n\n const reporter = createToolReporter(ctx, 'scrape-links');\n const result = await handleScrapeLinks(args, reporter);\n\n await reporter.progress(100, 100, result.isError ? 'Scrape failed' : 'Scrape complete');\n return toToolResponse(result);\n },\n );\n}\n"],
|
|
5
|
-
"mappings": "AAYA;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,OACK;AACP;AAAA,EACE;AAAA,EACA;AAAA,OAGK;AACP,SAAS,qBAAqB;AAC9B,SAAS,oBAAqC;AAC9C,SAAS,uBAAuB;AAChC,SAAS,oBAAoB,6BAA6B;AAC1D,SAAS,sBAAsB;AAC/B,SAAS,8BAA8B;AACvC,SAAS,qBAAqB;AAC9B,SAAS,YAAY;AACrB;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,OACK;AACP;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,OAGK;AAEP,MAAM,kBAAkB,IAAI,gBAAgB;AAE5C,SAAS,6BAA6B,aAAyC;AAC7E,QAAM,OAAO,eAAe;AAC5B,SAAO,GAAG,QAAQ,iBAAiB;AAAA;AAAA,EAAO,IAAI;AAAA;AAAA,EAAO,QAAQ,iBAAiB;AAChF;AAkCA,MAAM,cAAc;AACpB,MAAM,wBAAwB;AAE9B,SAAS,YAAY,KAAsB;AACzC,MAAI;AACF,UAAM,IAAI,IAAI,IAAI,GAAG;AACrB,WAAO,YAAY,KAAK,EAAE,QAAQ;AAAA,EACpC,QAAQ;AACN,WAAO;AAAA,EACT;AACF;AAEA,SAAS,sBAAsB,KAAsB;AACnD,MAAI;AACF,UAAM,IAAI,IAAI,IAAI,GAAG;AACrB,WAAO,YAAY,KAAK,EAAE,QAAQ,KAAK,sBAAsB,KAAK,EAAE,QAAQ;AAAA,EAC9E,QAAQ;AACN,WAAO;AAAA,EACT;AACF;AAIA,SAAS,0BACP,MACA,SACA,WACA,YAAY,OACZ,cACwC;AACxC,SAAO;AAAA,IACL,GAAG,YAAY;AAAA,MACb;AAAA,MACA;AAAA,MACA;AAAA,MACA,UAAU;AAAA,MACV,UAAU,SAAS,YAAY,CAAC,gCAAgC,IAAI;AAAA,MACpE;AAAA,IACF,CAAC,CAAC;AAAA;AAAA,kBAAuB,eAAe,KAAK,IAAI,IAAI,SAAS,CAAC;AAAA,EACjE;AACF;AAUA,SAAS,cAAc,MAAiC;AACtD,QAAM,YAA2B,CAAC;AAClC,QAAM,eAA8B,CAAC;AACrC,QAAM,iBAAuD,CAAC;AAE9D,WAAS,IAAI,GAAG,IAAI,KAAK,QAAQ,KAAK;AACpC,UAAM,MAAM,KAAK,CAAC;AAClB,QAAI;AACF,UAAI,IAAI,GAAG;AAAA,IACb,QAAQ;AACN,qBAAe,KAAK,EAAE,KAAK,WAAW,EAAE,CAAC;AACzC;AAAA,IACF;AACA,QAAI,YAAY,GAAG,GAAG;AACpB,mBAAa,KAAK,EAAE,KAAK,WAAW,EAAE,CAAC;AAAA,IACzC,OAAO;AACL,gBAAU,KAAK,EAAE,KAAK,WAAW,EAAE,CAAC;AAAA,IACtC;AAAA,EACF;AAEA,SAAO,EAAE,WAAW,cAAc,eAAe;AACnD;AAIA,eAAe,eACb,QACA,QAC4B;AAC5B,MAAI,OAAO,WAAW,GAAG;AACvB,WAAO,EAAE,cAAc,CAAC,GAAG,gBAAgB,CAAC,GAAG,SAAS,EAAE,YAAY,GAAG,QAAQ,GAAG,cAAc,EAAE,EAAE;AAAA,EACxG;AAEA,SAAO,QAAQ,yCAAyC,OAAO,MAAM,sBAAsB,YAAY,OAAO,IAAI,QAAQ;AAC1H,QAAM,OAAO,OAAO,IAAI,CAAC,MAAM,EAAE,GAAG;AACpC,QAAM,UAAU,MAAM,OAAO,eAAe,MAAM,EAAE,SAAS,GAAG,CAAC;AAEjE,QAAM,eAAkC,CAAC;AACzC,QAAM,iBAA2B,CAAC;AAClC,MAAI,aAAa;AACjB,MAAI,SAAS;AACb,MAAI,eAAe;AAEnB,WAAS,IAAI,GAAG,IAAI,QAAQ,QAAQ,KAAK;AACvC,UAAM,SAAS,QAAQ,CAAC;AACxB,UAAM,YAAY,OAAO,CAAC,EAAG;AAC7B,QAAI,CAAC,QAAQ;AACX;AACA,qBAAe,KAAK,MAAM,OAAO,CAAC,EAAG,GAAG;AAAA;AAAA,0BAA0B;AAClE;AAAA,IACF;AAEA,QAAI,OAAO,SAAS,OAAO,aAAa,OAAO,OAAO,cAAc,KAAK;AACvE;AACA,YAAM,WAAW,OAAO,OAAO,WAAW,OAAO,WAAW,QAAQ,OAAO,UAAU;AACrF,qBAAe,KAAK,MAAM,OAAO,GAAG;AAAA;AAAA,2BAA2B,QAAQ,EAAE;AACzE;AAAA,IACF;AAEA;AACA,oBAAgB,OAAO;AAEvB,QAAI;AACJ,QAAI;AACF,YAAM,WAAW,uBAAuB,OAAO,SAAS,OAAO,GAAG;AAClE,YAAM,mBAAmB,SAAS,YAAY,SAAS,UAAU,OAAO;AACxE,gBAAU,gBAAgB,eAAe,gBAAgB;AAAA,IAC3D,QAAQ;AACN,gBAAU,OAAO;AAAA,IACnB;AAEA,iBAAa,KAAK,EAAE,KAAK,OAAO,KAAK,SAAS,OAAO,UAAU,CAAC;AAAA,EAClE;AAEA,SAAO,EAAE,cAAc,gBAAgB,SAAS,EAAE,YAAY,QAAQ,aAAa,EAAE;AACvF;AAIA,SAAS,2BAA2B,QAA4B;AAC9D,QAAM,EAAE,MAAM,SAAS,IAAI;AAC3B,QAAM,QAAkB,CAAC;AACzB,QAAM,KAAK,KAAK,KAAK,KAAK,EAAE;AAC5B,QAAM,KAAK,EAAE;AACb,QAAM,KAAK,OAAO,KAAK,SAAS,eAAU,KAAK,MAAM,wBAAS,KAAK,KAAK,qBAAS,KAAK,YAAY,WAAW;AAC7G,QAAM,KAAK,aAAM,KAAK,GAAG,EAAE;AAC3B,QAAM,KAAK,EAAE;AACb,MAAI,KAAK,MAAM;AACb,UAAM,KAAK,iBAAiB;AAC5B,UAAM,KAAK,EAAE;AACb,UAAM,KAAK,KAAK,IAAI;AACpB,UAAM,KAAK,EAAE;AAAA,EACf;AACA,MAAI,SAAS,SAAS,GAAG;AACvB,UAAM,KAAK,oBAAoB,SAAS,MAAM,SAAS;AACvD,UAAM,KAAK,EAAE;AACb,eAAW,KAAK,UAAU;AACxB,YAAM,SAAS,KAAK,OAAO,EAAE,KAAK;AAClC,YAAM,KAAK,EAAE,OAAO,cAAc;AAClC,YAAM,QAAQ,EAAE,SAAS,IAAI,IAAI,EAAE,KAAK,KAAK,GAAG,EAAE,KAAK;AACvD,YAAM,KAAK,GAAG,MAAM,SAAS,EAAE,MAAM,KAAK,EAAE,MAAM,KAAK,IAAI;AAC3D,iBAAW,QAAQ,EAAE,KAAK,MAAM,IAAI,GAAG;AACrC,cAAM,KAAK,GAAG,MAAM,KAAK,IAAI,EAAE;AAAA,MACjC;AACA,YAAM,KAAK,EAAE;AAAA,IACf;AAAA,EACF;AACA,SAAO,MAAM,KAAK,IAAI;
AACxB;AAEA,eAAe,kBAAkB,QAAmD;AAClF,MAAI,OAAO,WAAW,GAAG;AACvB,WAAO,EAAE,cAAc,CAAC,GAAG,gBAAgB,CAAC,GAAG,SAAS,EAAE,YAAY,GAAG,QAAQ,GAAG,cAAc,EAAE,EAAE;AAAA,EACxG;AAEA,QAAM,MAAM,SAAS;AACrB,MAAI,CAAC,IAAI,oBAAoB,CAAC,IAAI,sBAAsB;AACtD,UAAMA,kBAAiB,OAAO;AAAA,MAC5B,CAAC,MAAM,MAAM,EAAE,GAAG;AAAA;AAAA;AAAA,IACpB;AACA,WAAO;AAAA,MACL,cAAc,CAAC;AAAA,MACf,gBAAAA;AAAA,MACA,SAAS,EAAE,YAAY,GAAG,QAAQ,OAAO,QAAQ,cAAc,EAAE;AAAA,IACnE;AAAA,EACF;AAKA,QAAM,CAAC,YAAY,aAAa,IAAI,OAAO;AAAA,IACzC,CAAC,CAAC,OAAO,IAAI,GAAG,UAAU;AACxB,UAAI,sBAAsB,MAAM,GAAG,EAAG,OAAM,KAAK,KAAK;AAAA,UACjD,MAAK,KAAK,KAAK;AACpB,aAAO,CAAC,OAAO,IAAI;AAAA,IACrB;AAAA,IACA,CAAC,CAAC,GAAG,CAAC,CAAC;AAAA,EACT;AAEA,QAAM,qBAAqB,cAAc;AAAA,IACvC,CAAC,MAAM,MAAM,EAAE,GAAG;AAAA;AAAA;AAAA,EACpB;AAEA,MAAI,WAAW,WAAW,GAAG;AAC3B,WAAO;AAAA,MACL,cAAc,CAAC;AAAA,MACf,gBAAgB;AAAA,MAChB,SAAS,EAAE,YAAY,GAAG,QAAQ,cAAc,QAAQ,cAAc,EAAE;AAAA,IAC1E;AAAA,EACF;AAEA,SAAO,QAAQ,yCAAyC,WAAW,MAAM,uBAAuB,YAAY,MAAM,IAAI,QAAQ;AAC9H,QAAM,SAAS,IAAI,aAAa,IAAI,kBAAkB,IAAI,oBAAoB;AAC9E,QAAM,OAAO,WAAW,IAAI,CAAC,MAAM,EAAE,GAAG;AACxC,QAAM,cAAc,MAAM,OAAO,cAAc,MAAM,IAAI;AACzD,QAAM,aAAa,IAAI,IAAI,WAAW,IAAI,CAAC,MAAM,CAAC,EAAE,KAAK,EAAE,SAAS,CAAC,CAAC;AAEtE,QAAM,eAAkC,CAAC;AACzC,QAAM,iBAA2B,CAAC,GAAG,kBAAkB;AACvD,MAAI,aAAa;AACjB,MAAI,SAAS,cAAc;AAE3B,aAAW,CAAC,KAAK,MAAM,KAAK,YAAY,SAAS;AAC/C,UAAM,YAAY,WAAW,IAAI,GAAG,KAAK;AACzC,QAAI,kBAAkB,OAAO;AAC3B;AACA,qBAAe,KAAK,MAAM,GAAG;AAAA;AAAA,8BAA8B,OAAO,OAAO,EAAE;AAC3E;AAAA,IACF;AACA;AACA,iBAAa,KAAK,EAAE,KAAK,SAAS,2BAA2B,MAAM,GAAG,OAAO,UAAU,CAAC;AAAA,EAC1F;AAEA,SAAO,EAAE,cAAc,gBAAgB,SAAS,EAAE,YAAY,QAAQ,cAAc,EAAE,EAAE;AAC1F;AAIA,eAAe,oBACb,cACA,qBACA,cACA,UACgF;AAChF,MAAI,YAAY;AAEhB,MAAI,CAAC,gBAAgB,aAAa,WAAW,GAAG;AAC9C,QAAI,CAAC,gBAAgB,aAAa,SAAS,GAAG;AAC5C,aAAO,WAAW,yEAAyE,QAAQ;AACnG,WAAK,SAAS,IAAI,WAAW,iFAAiF;AAAA,IAChH;AACA,WAAO,EAAE,OAAO,cAAc,WAAW,cAAc,EAAE;AAAA,EAC3D;AAEA,SAAO,QAAQ,6CAA6C,aAAa,MAAM,uBAAuB,YAAY,cAAc,IAAI,QAAQ;AAE5I,QAAM,aAAa,MAAM;AAAA,IACvB;AAAA,IACA,OAAO,SAAS;AACd,aAAO,SAAS,kBAAkB,KAAK,GAAG,OAAO,QAAQ;AAEzD,YAAM,YAAY,MAAM;AAAA,QACtB,KAAK;AAAA,QACL,EAAE,SAAS,MAAM,SAAS,qBAAqB,KAAK,KAAK,IAAI;AAAA,QAC7D;AAAA,MACF;AAEA,UAAI,UAAU,WAAW;AACvB,eAAO,EAAE,GAAG,MAAM,SAAS,UAAU,QAAQ;AAAA,MAC/C;AAEA;AACA,aAAO,WAAW,6BAA6B,KAAK,GAAG,KAAK,UAAU,SAAS,gBAAgB,IAAI,QAAQ;AAC3G,WAAK,SAAS,IAAI,WAAW,8BAA8B,KAAK,GAAG,WAAM,UAAU,SAAS,gBAAgB,EAAE;AAC9G,aAAO;AAAA,IACT;AAAA,IACA,YAAY;AAAA,EACd;AAEA,SAAO,EAAE,OAAO,YAAY,WAAW,cAAc,aAAa,OAAO;AAC3E;AAIA,SAAS,uBAAuB,cAAiC,gBAAoC;AACnG,QAAM,SAAS,CAAC,GAAG,YAAY,EAAE,KAAK,CAAC,GAAG,MAAM,EAAE,QAAQ,EAAE,KAAK;AACjE,QAAM,WAAW,CAAC,GAAG,cAAc;AACnC,aAAW,QAAQ,QAAQ;AACzB,QAAI,UAAU,KAAK;AACnB,QAAI;AACF,gBAAU,eAAe,OAAO;AAAA,IAClC,QAAQ;AAAA,IAER;AACA,aAAS,KAAK,MAAM,KAAK,GAAG;AAAA;AAAA,EAAO,OAAO,EAAE;AAAA,EAC9C;AACA,SAAO;AACT;AAEA,SAAS,oBACP,QACA,UACA,SACA,WACA,eACA,eAC2D;AAC3D,QAAM,YAA6C,CAAC;AACpD,MAAI,cAAc,eAAe,GAAG;AAClC,UAAM,KAAK,cAAc,eAAe;AACxC,cAAU,gBAAgB,IAAI,GAAG,EAAE,IAAI,cAAc,YAAY;AACjE,QAAI,CAAC,cAAc,cAAc;AAC/B,gBAAU,YAAY,IAAI;AAAA,IAC5B;AAAA,EACF,WAAW,YAAY,GAAG;AACxB,cAAU,yBAAyB,IAAI;AAAA,EACzC;AAEA,QAAM,cAAc,kBAAkB;AAAA,IACpC,OAAO,oBAAoB,OAAO,KAAK,MAAM;AAAA,IAC7C,YAAY,OAAO,KAAK;AAAA,IACxB,YAAY,QAAQ;AAAA,IACpB,QAAQ,QAAQ;AAAA,IAChB,QAAQ;AAAA,MACN,gBAAgB,QAAQ;AAAA,MACxB,GAAG;AAAA,IACL;AAAA,EACF,CAAC;AAED,QAAM,mBAAmB,cAAc;AAAA,IACrC,OAAO;AAAA,IACP,SAAS;AAAA,IACT,MAAM,SAAS,KAAK,aAAa;AAAA,IACjC,UAAU;AAAA,MACR,kBAAkB,eAAe,aAAa;AAAA,IAChD;AAAA,EACF,CAAC;AAED,QAAM,WAA0C;AAAA,IAC9C,aAAa,OAAO,KAAK;AAAA,IACzB,YAAY,QAAQ;AAAA,IACpB,QAAQ,QAAQ;AAAA,IAChB,mBAAmB;AAAA,IACnB,eAAe,QAAQ;AAAA,EACzB;AACA,SAAO,EAAE,SAAS,kBAAkB,mBAAmB,EAAE,SAAS,EA
AE;AACtE;AAIA,eAAsB,kBACpB,QACA,WAAyB,eACwB;AACjD,QAAM,YAAY,KAAK,IAAI;AAE3B,MAAI,CAAC,OAAO,QAAQ,OAAO,KAAK,WAAW,GAAG;AAC5C,WAAO,0BAA0B,WAAW,oBAAoB,SAAS;AAAA,EAC3E;AAEA,QAAM,EAAE,WAAW,cAAc,eAAe,IAAI,cAAc,OAAO,IAAI;AAC7E,QAAM,aAAa,UAAU,SAAS,aAAa;AAEnD,QAAM,SAAS;AAAA,IACb;AAAA,IACA,eAAe,OAAO,KAAK,MAAM,YAAY,UAAU,MAAM,SAAS,aAAa,MAAM,YAAY,eAAe,MAAM;AAAA,EAC5H;AAEA,MAAI,eAAe,GAAG;AACpB,WAAO;AAAA,MACL;AAAA,MACA,OAAO,OAAO,KAAK,MAAM;AAAA,MACzB;AAAA,MACA;AAAA,MACA;AAAA,QACE;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA;AAAA,IACE;AAAA,IACA,oBAAoB,UAAU,MAAM,UAAU,aAAa,MAAM;AAAA,IACjE;AAAA,EACF;AACA,QAAM,SAAS,SAAS,IAAI,KAAK,2BAA2B;AAI5D,MAAI,UAAgC;AACpC,MAAI;AACF,QAAI,UAAU,SAAS,GAAG;AACxB,gBAAU,EAAE,QAAQ,IAAI,cAAc,GAAG,cAAc,mBAAmB,EAAE;AAAA,IAC9E,OAAO;AAGL,gBAAU;AAAA,QACR,QAAQ;AAAA,QACR,cAAc,mBAAmB;AAAA,MACnC;AAAA,IACF;AAAA,EACF,SAAS,OAAO;AACd,UAAM,MAAM,cAAc,KAAK;AAC/B,WAAO;AAAA,MACL;AAAA,MACA,iCAAiC,IAAI,OAAO;AAAA,MAC5C;AAAA,MACA;AAAA,MACA;AAAA,QACE;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,QAAM,sBAAsB,6BAA6B,OAAO,OAAO;AAEvE,QAAM,SAAS,SAAS,IAAI,KAAK,uBAAuB;AAGxD,QAAM,CAAC,UAAU,WAAW,IAAI,MAAM,QAAQ,IAAI;AAAA,IAChD,UAAU,SAAS,IACf,eAAe,WAAW,QAAQ,MAAM,IACxC,QAAQ,QAA2B,EAAE,cAAc,CAAC,GAAG,gBAAgB,CAAC,GAAG,SAAS,EAAE,YAAY,GAAG,QAAQ,GAAG,cAAc,EAAE,EAAE,CAAC;AAAA,IACvI,kBAAkB,YAAY;AAAA,EAChC,CAAC;AAED,QAAM,eAAe,CAAC,GAAG,SAAS,cAAc,GAAG,YAAY,YAAY;AAC3E,QAAM,gBAAgB,eAAe;AAAA,IACnC,CAAC,EAAE,IAAI,MAAM,MAAM,GAAG;AAAA;AAAA;AAAA,EACxB;AACA,QAAM,iBAAiB,CAAC,GAAG,eAAe,GAAG,SAAS,gBAAgB,GAAG,YAAY,cAAc;AACnG,QAAM,UAAyB;AAAA,IAC7B,YAAY,SAAS,QAAQ,aAAa,YAAY,QAAQ;AAAA,IAC9D,QAAQ,eAAe,SAAS,SAAS,QAAQ,SAAS,YAAY,QAAQ;AAAA,IAC9E,cAAc,SAAS,QAAQ;AAAA,EACjC;AAEA,QAAM,SAAS,IAAI,QAAQ,WAAW,QAAQ,UAAU,aAAa,QAAQ,MAAM,SAAS;AAE5F,MAAI,aAAa,SAAS,GAAG;AAC3B,UAAM,SAAS,SAAS,IAAI,KAAK,2CAA2C;AAAA,EAC9E;AAEA,QAAM,EAAE,OAAO,gBAAgB,WAAW,aAAa,IAAI,MAAM;AAAA,IAC/D;AAAA,IACA;AAAA,IACA,QAAQ;AAAA,IACR;AAAA,EACF;AAEA,QAAM,WAAW,uBAAuB,gBAAgB,cAAc;AACtE,QAAM,gBAAgB,KAAK,IAAI,IAAI;AAEnC;AAAA,IACE;AAAA,IACA,cAAc,QAAQ,UAAU,gBAAgB,QAAQ,MAAM,YAAY,QAAQ,YAAY;AAAA,IAC9F;AAAA,EACF;AAEA,QAAM,eAAe,eAAe,KAAK,YAAY;AACrD,QAAM,SAAS;AAAA,IACb;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,EAAE,cAAc,aAAa;AAAA,EAC/B;AAEA,MAAI,QAAQ,eAAe,KAAK,QAAQ,SAAS,GAAG;AAClD,WAAO,YAAY,OAAO,OAAO;AAAA,EACnC;AAEA,SAAO,YAAY,OAAO,SAAS,OAAO,iBAAiB;AAC7D;AAEO,SAAS,wBAAwB,QAAyB;AAC/D,SAAO;AAAA,IACL;AAAA,MACE,MAAM;AAAA,MACN,OAAO;AAAA,MACP,aACE;AAAA,MACF,QAAQ;AAAA,MACR,cAAc;AAAA,MACd,aAAa;AAAA,QACX,cAAc;AAAA,QACd,gBAAgB;AAAA,QAChB,iBAAiB;AAAA,QACjB,eAAe;AAAA,MACjB;AAAA,IACF;AAAA,IACA,OAAO,MAAM,QAAQ;AACnB,UAAI,CAAC,gBAAgB,EAAE,UAAU;AAC/B,eAAO,eAAe,YAAY,qBAAqB,UAAU,CAAC,CAAC;AAAA,MACrE;AAEA,YAAM,WAAW,mBAAmB,KAAK,cAAc;AACvD,YAAM,SAAS,MAAM,kBAAkB,MAAM,QAAQ;AAErD,YAAM,SAAS,SAAS,KAAK,KAAK,OAAO,UAAU,kBAAkB,iBAAiB;AACtF,aAAO,eAAe,MAAM;AAAA,IAC9B;AAAA,EACF;AACF;",
+
"sourcesContent": ["/**\n * Scrape Links Tool Handler\n *\n * Scrapes many URLs in parallel. Reddit permalinks (reddit.com/r/.../comments/...)\n * are auto-detected and routed through the Reddit API; all other URLs go through\n * the scraper. Both branches feed the same per-URL LLM extraction pipeline.\n *\n * NEVER throws \u2014 every error is returned as a tool-level failure response.\n */\n\nimport type { MCPServer } from 'mcp-use/server';\n\nimport {\n SCRAPER,\n CONCURRENCY,\n getCapabilities,\n getMissingEnvMessage,\n parseEnv,\n} from '../config/index.js';\nimport {\n scrapeLinksOutputSchema,\n scrapeLinksParamsSchema,\n type ScrapeLinksParams,\n type ScrapeLinksOutput,\n} from '../schemas/scrape-links.js';\nimport { ScraperClient } from '../clients/scraper.js';\nimport { RedditClient, type PostResult } from '../clients/reddit.js';\nimport { JinaClient } from '../clients/jina.js';\nimport { MarkdownCleaner } from '../services/markdown-cleaner.js';\nimport { createLLMProcessor, processContentWithLLM } from '../services/llm-processor.js';\nimport { removeMetaTags } from '../utils/markdown-formatter.js';\nimport { extractReadableContent } from '../utils/content-extractor.js';\nimport { classifyError, ErrorCode } from '../utils/errors.js';\nimport { isDocumentUrl } from '../utils/source-type.js';\nimport { pMap, pMapSettled } from '../utils/concurrency.js';\nimport {\n mcpLog,\n formatSuccess,\n formatError,\n formatBatchHeader,\n formatDuration,\n} from './utils.js';\nimport {\n createToolReporter,\n NOOP_REPORTER,\n toolFailure,\n toolSuccess,\n toToolResponse,\n type ToolExecutionResult,\n type ToolReporter,\n} from './mcp-helpers.js';\n\nconst markdownCleaner = new MarkdownCleaner();\n\nfunction enhanceExtractionInstruction(instruction: string | undefined): string {\n const base = instruction || 'Extract the main content and key information from this page.';\n return `${SCRAPER.EXTRACTION_PREFIX}\\n\\n${base}\\n\\n${SCRAPER.EXTRACTION_SUFFIX}`;\n}\n\n// --- Types ---\n\ninterface ProcessedResult {\n url: string;\n content: string;\n index: number; // original position in params.urls[]\n}\n\ninterface ScrapeMetrics {\n successful: number;\n failed: number;\n totalCredits: number;\n}\n\ninterface ScrapePhaseResult {\n successItems: ProcessedResult[];\n failedContents: string[];\n metrics: ScrapeMetrics;\n}\n\ninterface BranchInput {\n url: string;\n origIndex: number;\n}\n\ninterface ScrapeClients {\n client: ScraperClient;\n jinaClient: JinaClient;\n llmProcessor: ReturnType<typeof createLLMProcessor>;\n}\n\n/**\n * Any URL the web branch decides to hand off to Jina Reader \u2014 either because\n * Scrape.do returned a binary content-type, or because Scrape.do failed\n * outright (non-404 error). 
`scrapeError` is preserved so that, if Jina also\n * fails, the final error message can surface both layers.\n *\n * Genuine 404s are NOT put here \u2014 the URL doesn't exist; Jina won't help.\n */\ninterface JinaFallback {\n url: string;\n origIndex: number;\n reason: 'binary_content' | 'scrape_failed';\n scrapeError?: string;\n}\n\ninterface WebPhaseResult extends ScrapePhaseResult {\n jinaFallbacks: JinaFallback[];\n}\n\n// --- Reddit URL detection ---\n\nconst REDDIT_HOST = /(?:^|\\.)reddit\\.com$/i;\nconst REDDIT_POST_PERMALINK = /\\/r\\/[^/]+\\/comments\\/[a-z0-9]+/i;\n\nfunction isRedditUrl(url: string): boolean {\n try {\n const u = new URL(url);\n return REDDIT_HOST.test(u.hostname);\n } catch {\n return false;\n }\n}\n\nfunction isRedditPostPermalink(url: string): boolean {\n try {\n const u = new URL(url);\n return REDDIT_HOST.test(u.hostname) && REDDIT_POST_PERMALINK.test(u.pathname);\n } catch {\n return false;\n }\n}\n\n// --- Error helper ---\n\nfunction createScrapeErrorResponse(\n code: string,\n message: string,\n startTime: number,\n retryable = false,\n alternatives?: string[],\n): ToolExecutionResult<ScrapeLinksOutput> {\n return toolFailure(\n `${formatError({\n code,\n message,\n retryable,\n toolName: 'scrape-links',\n howToFix: code === 'NO_URLS' ? ['Provide at least one valid URL'] : undefined,\n alternatives,\n })}\\n\\nExecution time: ${formatDuration(Date.now() - startTime)}`,\n );\n}\n\n// --- URL partitioning ---\n\ninterface PartitionedUrls {\n webInputs: BranchInput[];\n redditInputs: BranchInput[];\n documentInputs: BranchInput[];\n invalidEntries: { url: string; origIndex: number }[];\n}\n\nfunction partitionUrls(urls: string[]): PartitionedUrls {\n const webInputs: BranchInput[] = [];\n const redditInputs: BranchInput[] = [];\n const documentInputs: BranchInput[] = [];\n const invalidEntries: { url: string; origIndex: number }[] = [];\n\n for (let i = 0; i < urls.length; i++) {\n const url = urls[i]!;\n try {\n new URL(url);\n } catch {\n invalidEntries.push({ url, origIndex: i });\n continue;\n }\n // Document URLs (.pdf/.docx/.pptx/.xlsx) go straight to Jina Reader \u2014\n // bypassing Scrape.do because it cannot decode binary bodies. 
Ordered\n // before the Reddit check so a hypothetical PDF on a reddit-adjacent host\n // still takes the document path.\n if (isDocumentUrl(url)) {\n documentInputs.push({ url, origIndex: i });\n } else if (isRedditUrl(url)) {\n redditInputs.push({ url, origIndex: i });\n } else {\n webInputs.push({ url, origIndex: i });\n }\n }\n\n return { webInputs, redditInputs, documentInputs, invalidEntries };\n}\n\n// --- Web branch ---\n\nasync function fetchWebBranch(\n inputs: BranchInput[],\n client: ScraperClient,\n): Promise<WebPhaseResult> {\n if (inputs.length === 0) {\n return {\n successItems: [],\n failedContents: [],\n metrics: { successful: 0, failed: 0, totalCredits: 0 },\n jinaFallbacks: [],\n };\n }\n\n mcpLog('info', `[concurrency] web branch: fanning out ${inputs.length} URL(s) with limit=${CONCURRENCY.SCRAPER}`, 'scrape');\n const urls = inputs.map((i) => i.url);\n const results = await client.scrapeMultiple(urls, { timeout: 60 });\n const urlToIndex = new Map(inputs.map((i) => [i.url, i.origIndex]));\n\n const successItems: ProcessedResult[] = [];\n const failedContents: string[] = [];\n const jinaFallbacks: JinaFallback[] = [];\n let successful = 0;\n let failed = 0;\n let totalCredits = 0;\n\n for (let i = 0; i < results.length; i++) {\n const result = results[i];\n const origIndex = inputs[i]!.origIndex;\n if (!result) {\n failed++;\n failedContents.push(`## ${inputs[i]!.url}\\n\\n\u274C No result returned`);\n continue;\n }\n\n // Binary document detected by content-type \u2014 defer to Jina Reader.\n if (result.error?.code === ErrorCode.UNSUPPORTED_BINARY_CONTENT) {\n jinaFallbacks.push({\n url: result.url,\n origIndex: urlToIndex.get(result.url) ?? origIndex,\n reason: 'binary_content',\n });\n continue;\n }\n\n // Scrape.do failure \u2014 only 404s are treated as hard fails (Jina won't\n // help when the page genuinely doesn't exist). Every other failure mode\n // (302 redirect loops, WAF blocks, timeouts, 5xx, service unavailable)\n // gets a second chance through Jina Reader, which uses different IPs\n // and handles many anti-bot surfaces differently.\n const scrapeFailed = Boolean(result.error) || result.statusCode < 200 || result.statusCode >= 300;\n if (scrapeFailed && result.statusCode !== 404) {\n jinaFallbacks.push({\n url: result.url,\n origIndex: urlToIndex.get(result.url) ?? origIndex,\n reason: 'scrape_failed',\n scrapeError: result.error?.message || result.content || `HTTP ${result.statusCode}`,\n });\n continue;\n }\n if (scrapeFailed) {\n failed++;\n failedContents.push(`## ${result.url}\\n\\n\u274C Failed to scrape: HTTP 404 \u2014 Page not found`);\n continue;\n }\n\n successful++;\n totalCredits += result.credits;\n\n let content: string;\n try {\n const readable = extractReadableContent(result.content, result.url);\n const sourceForCleaner = readable.extracted ? readable.content : result.content;\n content = markdownCleaner.processContent(sourceForCleaner);\n } catch {\n content = result.content;\n }\n\n successItems.push({ url: result.url, content, index: origIndex });\n }\n\n return {\n successItems,\n failedContents,\n metrics: { successful, failed, totalCredits },\n jinaFallbacks,\n };\n}\n\n// --- Document branch (Jina Reader) ---\n\n/**\n * Format a Jina-failure line. 
If the URL was deferred here *after* Scrape.do\n * already failed, surface both layers' errors so the caller can see that this\n * isn't just a Jina glitch \u2014 the primary path failed too.\n *\n * Exported for unit testing.\n */\nexport function formatJinaFailure(url: string, jinaError: string, scrapeError?: string): string {\n if (scrapeError) {\n return `## ${url}\\n\\n\u274C Both scrapers failed. Scrape.do: ${scrapeError}. Jina Reader: ${jinaError}.`;\n }\n return `## ${url}\\n\\n\u274C Document conversion failed: ${jinaError}`;\n}\n\nasync function fetchDocumentBranch(\n inputs: BranchInput[],\n jinaClient: JinaClient,\n /** Optional: map url \u2192 original Scrape.do error, for fallback messaging. */\n scrapeErrorContext?: Map<string, string>,\n): Promise<ScrapePhaseResult> {\n if (inputs.length === 0) {\n return { successItems: [], failedContents: [], metrics: { successful: 0, failed: 0, totalCredits: 0 } };\n }\n\n mcpLog(\n 'info',\n `[concurrency] document branch (jina): converting ${inputs.length} URL(s) with limit=${CONCURRENCY.SCRAPER}`,\n 'scrape',\n );\n\n const results = await pMapSettled(\n inputs,\n (input) => jinaClient.convert({ url: input.url }),\n CONCURRENCY.SCRAPER,\n );\n\n const successItems: ProcessedResult[] = [];\n const failedContents: string[] = [];\n let successful = 0;\n let failed = 0;\n\n for (let i = 0; i < results.length; i++) {\n const settled = results[i];\n const input = inputs[i]!;\n const scrapeError = scrapeErrorContext?.get(input.url);\n if (!settled) {\n failed++;\n failedContents.push(formatJinaFailure(input.url, 'No result returned', scrapeError));\n continue;\n }\n if (settled.status === 'rejected') {\n failed++;\n const reason = settled.reason instanceof Error ? settled.reason.message : String(settled.reason);\n failedContents.push(formatJinaFailure(input.url, reason, scrapeError));\n continue;\n }\n\n const result = settled.value;\n if (result.error || result.statusCode < 200 || result.statusCode >= 300) {\n failed++;\n const errorMsg = result.error?.message || `HTTP ${result.statusCode}`;\n failedContents.push(formatJinaFailure(input.url, errorMsg, scrapeError));\n continue;\n }\n\n successful++;\n successItems.push({ url: input.url, content: result.content, index: input.origIndex });\n }\n\n return { successItems, failedContents, metrics: { successful, failed, totalCredits: 0 } };\n}\n\n// --- Reddit branch ---\n\nfunction formatRedditPostAsMarkdown(result: PostResult): string {\n const { post, comments } = result;\n const lines: string[] = [];\n lines.push(`# ${post.title}`);\n lines.push('');\n lines.push(`**r/${post.subreddit}** \u2022 u/${post.author} \u2022 \u2B06\uFE0F ${post.score} \u2022 \uD83D\uDCAC ${post.commentCount} comments`);\n lines.push(`\uD83D\uDD17 ${post.url}`);\n lines.push('');\n if (post.body) {\n lines.push('## Post content');\n lines.push('');\n lines.push(post.body);\n lines.push('');\n }\n if (comments.length > 0) {\n lines.push(`## Top comments (${comments.length} total)`);\n lines.push('');\n for (const c of comments) {\n const indent = ' '.repeat(c.depth);\n const op = c.isOP ? ' **[OP]**' : '';\n const score = c.score >= 0 ? 
`+${c.score}` : `${c.score}`;\n lines.push(`${indent}- **u/${c.author}**${op} _(${score})_`);\n for (const line of c.body.split('\\n')) {\n lines.push(`${indent} ${line}`);\n }\n lines.push('');\n }\n }\n return lines.join('\\n');\n}\n\nasync function fetchRedditBranch(inputs: BranchInput[]): Promise<ScrapePhaseResult> {\n if (inputs.length === 0) {\n return { successItems: [], failedContents: [], metrics: { successful: 0, failed: 0, totalCredits: 0 } };\n }\n\n const env = parseEnv();\n if (!env.REDDIT_CLIENT_ID || !env.REDDIT_CLIENT_SECRET) {\n const failedContents = inputs.map(\n (i) => `## ${i.url}\\n\\n\u274C Reddit URL detected, but Reddit API is not configured. Set \\`REDDIT_CLIENT_ID\\` and \\`REDDIT_CLIENT_SECRET\\` in the server env to enable threaded Reddit scraping.`,\n );\n return {\n successItems: [],\n failedContents,\n metrics: { successful: 0, failed: inputs.length, totalCredits: 0 },\n };\n }\n\n // Warn for non-permalink Reddit URLs (subreddit homepages, /new, /top, /hot,\n // user profiles). The Reddit API path we call requires /r/.../comments/... \u2014\n // reject upfront so the caller sees a helpful message instead of a 404.\n const [postInputs, nonPermalinks] = inputs.reduce<[BranchInput[], BranchInput[]]>(\n ([posts, rest], input) => {\n if (isRedditPostPermalink(input.url)) posts.push(input);\n else rest.push(input);\n return [posts, rest];\n },\n [[], []],\n );\n\n const nonPermalinkFailed = nonPermalinks.map(\n (i) => `## ${i.url}\\n\\n\u274C Only Reddit post permalinks (/r/<sub>/comments/<id>/...) are supported. Use web-search with scope:\"reddit\" to discover post permalinks first.`,\n );\n\n if (postInputs.length === 0) {\n return {\n successItems: [],\n failedContents: nonPermalinkFailed,\n metrics: { successful: 0, failed: nonPermalinks.length, totalCredits: 0 },\n };\n }\n\n mcpLog('info', `[concurrency] reddit branch: fetching ${postInputs.length} post(s) with limit=${CONCURRENCY.REDDIT}`, 'scrape');\n const client = new RedditClient(env.REDDIT_CLIENT_ID, env.REDDIT_CLIENT_SECRET);\n const urls = postInputs.map((i) => i.url);\n const batchResult = await client.batchGetPosts(urls, true);\n const urlToIndex = new Map(postInputs.map((i) => [i.url, i.origIndex]));\n\n const successItems: ProcessedResult[] = [];\n const failedContents: string[] = [...nonPermalinkFailed];\n let successful = 0;\n let failed = nonPermalinks.length;\n\n for (const [url, result] of batchResult.results) {\n const origIndex = urlToIndex.get(url) ?? -1;\n if (result instanceof Error) {\n failed++;\n failedContents.push(`## ${url}\\n\\n\u274C Reddit fetch failed: ${result.message}`);\n continue;\n }\n successful++;\n successItems.push({ url, content: formatRedditPostAsMarkdown(result), index: origIndex });\n }\n\n return { successItems, failedContents, metrics: { successful, failed, totalCredits: 0 } };\n}\n\n// --- LLM extraction (shared by both branches) ---\n\nasync function processItemsWithLlm(\n successItems: ProcessedResult[],\n enhancedInstruction: string,\n llmProcessor: ReturnType<typeof createLLMProcessor>,\n reporter: ToolReporter,\n): Promise<{ items: ProcessedResult[]; llmErrors: number; llmAttempted: number }> {\n let llmErrors = 0;\n\n if (!llmProcessor || successItems.length === 0) {\n if (!llmProcessor && successItems.length > 0) {\n mcpLog('warning', 'LLM unavailable (LLM_API_KEY not set). 
Returning raw scraped content.', 'scrape');\n void reporter.log('warning', 'llm_extractor_unreachable: planner not configured; raw scraped content returned');\n }\n return { items: successItems, llmErrors, llmAttempted: 0 };\n }\n\n mcpLog('info', `[concurrency] llm extraction: fanning out ${successItems.length} item(s) with limit=${CONCURRENCY.LLM_EXTRACTION}`, 'scrape');\n\n const llmResults = await pMap(\n successItems,\n async (item) => {\n mcpLog('debug', `LLM extracting ${item.url}...`, 'scrape');\n\n const llmResult = await processContentWithLLM(\n item.content,\n { enabled: true, extract: enhancedInstruction, url: item.url },\n llmProcessor,\n );\n\n if (llmResult.processed) {\n return { ...item, content: llmResult.content };\n }\n\n llmErrors++;\n mcpLog('warning', `LLM extraction failed for ${item.url}: ${llmResult.error || 'unknown reason'}`, 'scrape');\n void reporter.log('warning', `llm_extractor_unreachable: ${item.url} \u2014 ${llmResult.error || 'unknown reason'}`);\n return item;\n },\n CONCURRENCY.LLM_EXTRACTION,\n );\n\n return { items: llmResults, llmErrors, llmAttempted: successItems.length };\n}\n\n// --- Output assembly ---\n\nfunction assembleContentEntries(successItems: ProcessedResult[], failedContents: string[]): string[] {\n const sorted = [...successItems].sort((a, b) => a.index - b.index);\n const contents = [...failedContents];\n for (const item of sorted) {\n let content = item.content;\n try {\n content = removeMetaTags(content);\n } catch {\n // Use content as-is\n }\n contents.push(`## ${item.url}\\n\\n${content}`);\n }\n return contents;\n}\n\nfunction buildScrapeResponse(\n params: ScrapeLinksParams,\n contents: string[],\n metrics: ScrapeMetrics,\n llmErrors: number,\n executionTime: number,\n llmAccounting: { llmAttempted: number; llmSucceeded: boolean },\n): { content: string; structuredContent: ScrapeLinksOutput } {\n const llmExtras: Record<string, string | number> = {};\n if (llmAccounting.llmAttempted > 0) {\n const ok = llmAccounting.llmAttempted - llmErrors;\n llmExtras['LLM extraction'] = `${ok}/${llmAccounting.llmAttempted} succeeded`;\n if (!llmAccounting.llmSucceeded) {\n llmExtras['LLM credit'] = '0 charged (no extraction produced)';\n }\n } else if (llmErrors > 0) {\n llmExtras['LLM extraction failures'] = llmErrors;\n }\n\n const batchHeader = formatBatchHeader({\n title: `Scraped Content (${params.urls.length} URLs)`,\n totalItems: params.urls.length,\n successful: metrics.successful,\n failed: metrics.failed,\n extras: {\n 'Credits used': metrics.totalCredits,\n ...llmExtras,\n },\n });\n\n const formattedContent = formatSuccess({\n title: 'Scraping Complete',\n summary: batchHeader,\n data: contents.join('\\n\\n---\\n\\n'),\n metadata: {\n 'Execution time': formatDuration(executionTime),\n },\n });\n\n const metadata: ScrapeLinksOutput['metadata'] = {\n total_items: params.urls.length,\n successful: metrics.successful,\n failed: metrics.failed,\n execution_time_ms: executionTime,\n total_credits: metrics.totalCredits,\n };\n return { content: formattedContent, structuredContent: { metadata } };\n}\n\n// --- Handler ---\n\nexport async function handleScrapeLinks(\n params: ScrapeLinksParams,\n reporter: ToolReporter = NOOP_REPORTER,\n): Promise<ToolExecutionResult<ScrapeLinksOutput>> {\n const startTime = Date.now();\n\n if (!params.urls || params.urls.length === 0) {\n return createScrapeErrorResponse('NO_URLS', 'No URLs provided', startTime);\n }\n\n const { webInputs, redditInputs, documentInputs, invalidEntries } = 
partitionUrls(params.urls);\n const validCount = webInputs.length + redditInputs.length + documentInputs.length;\n\n await reporter.log(\n 'info',\n `Partitioned ${params.urls.length} URL(s): ${webInputs.length} web, ${redditInputs.length} reddit, ${documentInputs.length} document, ${invalidEntries.length} invalid`,\n );\n\n if (validCount === 0) {\n return createScrapeErrorResponse(\n 'INVALID_URLS',\n `All ${params.urls.length} URLs are invalid`,\n startTime,\n false,\n [\n 'web-search(queries=[...], extract=\"...\") \u2014 search for valid URLs first, then scrape the results',\n ],\n );\n }\n\n mcpLog(\n 'info',\n `Starting scrape: ${webInputs.length} web + ${redditInputs.length} reddit + ${documentInputs.length} document URL(s)`,\n 'scrape',\n );\n await reporter.progress(15, 100, 'Preparing scraper clients');\n\n // Only initialize the Scrape.do client if we actually have HTML/web URLs.\n // The Jina client is cheap (no auth needed) and always constructed so the\n // document branch and the web\u2192Jina fallback path both work uniformly.\n let clients: ScrapeClients | null = null;\n try {\n const jinaClient = new JinaClient();\n if (webInputs.length > 0) {\n clients = {\n client: new ScraperClient(),\n jinaClient,\n llmProcessor: createLLMProcessor(),\n };\n } else {\n clients = {\n client: null as unknown as ScraperClient,\n jinaClient,\n llmProcessor: createLLMProcessor(),\n };\n }\n } catch (error) {\n const err = classifyError(error);\n return createScrapeErrorResponse(\n 'CLIENT_INIT_FAILED',\n `Failed to initialize scraper: ${err.message}`,\n startTime,\n false,\n [\n 'web-search(queries=[\"topic key findings\", \"topic summary\"], extract=\"key findings and summary\") \u2014 search instead of scraping',\n ],\n );\n }\n\n const enhancedInstruction = enhanceExtractionInstruction(params.extract);\n\n await reporter.progress(35, 100, 'Fetching page content');\n\n // Phase 1 \u2014 run all three branches in parallel. Failures in one branch do\n // not block the others. The web branch may surface URLs to reroute via\n // `jinaFallbacks` (binary content-type OR non-404 Scrape.do failure),\n // which Phase 2 re-runs through Jina Reader.\n const emptyPhase: WebPhaseResult = {\n successItems: [], failedContents: [],\n metrics: { successful: 0, failed: 0, totalCredits: 0 },\n jinaFallbacks: [],\n };\n const [webPhase, redditPhase, documentPhase] = await Promise.all([\n webInputs.length > 0\n ? 
fetchWebBranch(webInputs, clients.client)\n : Promise.resolve<WebPhaseResult>(emptyPhase),\n fetchRedditBranch(redditInputs),\n fetchDocumentBranch(documentInputs, clients.jinaClient),\n ]);\n\n // Phase 2 \u2014 Jina Reader as a fallback for web-branch URLs that either\n // returned binary content or failed outright on Scrape.do.\n let deferredPhase: ScrapePhaseResult = {\n successItems: [], failedContents: [],\n metrics: { successful: 0, failed: 0, totalCredits: 0 },\n };\n if (webPhase.jinaFallbacks.length > 0) {\n const binaryCount = webPhase.jinaFallbacks.filter((f) => f.reason === 'binary_content').length;\n const failedCount = webPhase.jinaFallbacks.length - binaryCount;\n await reporter.log(\n 'info',\n `Rerouting ${webPhase.jinaFallbacks.length} URL(s) to Jina Reader: ${binaryCount} binary, ${failedCount} scrape-failed`,\n );\n const fallbackInputs: BranchInput[] = webPhase.jinaFallbacks.map((f) => ({\n url: f.url,\n origIndex: f.origIndex,\n }));\n const errorContext = new Map<string, string>(\n webPhase.jinaFallbacks\n .filter((f) => f.scrapeError !== undefined)\n .map((f) => [f.url, f.scrapeError as string]),\n );\n deferredPhase = await fetchDocumentBranch(fallbackInputs, clients.jinaClient, errorContext);\n }\n\n const successItems = [\n ...webPhase.successItems,\n ...redditPhase.successItems,\n ...documentPhase.successItems,\n ...deferredPhase.successItems,\n ];\n const invalidFailed = invalidEntries.map(\n ({ url }) => `## ${url}\\n\\n\u274C Invalid URL format`,\n );\n const failedContents = [\n ...invalidFailed,\n ...webPhase.failedContents,\n ...redditPhase.failedContents,\n ...documentPhase.failedContents,\n ...deferredPhase.failedContents,\n ];\n const metrics: ScrapeMetrics = {\n successful:\n webPhase.metrics.successful\n + redditPhase.metrics.successful\n + documentPhase.metrics.successful\n + deferredPhase.metrics.successful,\n failed:\n invalidEntries.length\n + webPhase.metrics.failed\n + redditPhase.metrics.failed\n + documentPhase.metrics.failed\n + deferredPhase.metrics.failed,\n totalCredits: webPhase.metrics.totalCredits,\n };\n\n await reporter.log('info', `Fetched ${metrics.successful} page(s), ${metrics.failed} failed`);\n\n if (successItems.length > 0) {\n await reporter.progress(80, 100, 'Running LLM extraction over fetched pages');\n }\n\n const { items: processedItems, llmErrors, llmAttempted } = await processItemsWithLlm(\n successItems,\n enhancedInstruction,\n clients.llmProcessor,\n reporter,\n );\n\n const contents = assembleContentEntries(processedItems, failedContents);\n const executionTime = Date.now() - startTime;\n\n mcpLog(\n 'info',\n `Completed: ${metrics.successful} successful, ${metrics.failed} failed, ${metrics.totalCredits} credits used`,\n 'scrape',\n );\n\n const llmSucceeded = llmAttempted > 0 && llmErrors < llmAttempted;\n const result = buildScrapeResponse(\n params,\n contents,\n metrics,\n llmErrors,\n executionTime,\n { llmAttempted, llmSucceeded },\n );\n\n if (metrics.successful === 0 && metrics.failed > 0) {\n return toolFailure(result.content);\n }\n\n return toolSuccess(result.content, result.structuredContent);\n}\n\nexport function registerScrapeLinksTool(server: MCPServer): void {\n server.tool(\n {\n name: 'scrape-links',\n title: 'Scrape Links',\n description:\n 'Fetch many URLs in parallel and run per-URL structured LLM extraction. Auto-detects reddit.com post permalinks and routes them through the Reddit API (threaded post + comments); everything else flows through the HTTP scraper. 
Safe to call in parallel \u2014 group URLs by context rather than jamming unrelated batches together. Each page returns `## Source`, `## Matches` (verbatim-preserved facts), `## Not found` (explicit gaps), and `## Follow-up signals` (new terms + referenced URLs) that feed the next research loop. Describe the SHAPE of what you want in `extract`, facets separated by `|` (e.g. `root cause | affected versions | fix | workarounds | timeline`).',\n schema: scrapeLinksParamsSchema,\n outputSchema: scrapeLinksOutputSchema,\n annotations: {\n readOnlyHint: true,\n idempotentHint: true,\n destructiveHint: false,\n openWorldHint: true,\n },\n },\n async (args, ctx) => {\n if (!getCapabilities().scraping) {\n return toToolResponse(toolFailure(getMissingEnvMessage('scraping')));\n }\n\n const reporter = createToolReporter(ctx, 'scrape-links');\n const result = await handleScrapeLinks(args, reporter);\n\n await reporter.progress(100, 100, result.isError ? 'Scrape failed' : 'Scrape complete');\n return toToolResponse(result);\n },\n );\n}\n"],
+
"mappings": "AAYA;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,OACK;AACP;AAAA,EACE;AAAA,EACA;AAAA,OAGK;AACP,SAAS,qBAAqB;AAC9B,SAAS,oBAAqC;AAC9C,SAAS,kBAAkB;AAC3B,SAAS,uBAAuB;AAChC,SAAS,oBAAoB,6BAA6B;AAC1D,SAAS,sBAAsB;AAC/B,SAAS,8BAA8B;AACvC,SAAS,eAAe,iBAAiB;AACzC,SAAS,qBAAqB;AAC9B,SAAS,MAAM,mBAAmB;AAClC;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,OACK;AACP;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,OAGK;AAEP,MAAM,kBAAkB,IAAI,gBAAgB;AAE5C,SAAS,6BAA6B,aAAyC;AAC7E,QAAM,OAAO,eAAe;AAC5B,SAAO,GAAG,QAAQ,iBAAiB;AAAA;AAAA,EAAO,IAAI;AAAA;AAAA,EAAO,QAAQ,iBAAiB;AAChF;AAsDA,MAAM,cAAc;AACpB,MAAM,wBAAwB;AAE9B,SAAS,YAAY,KAAsB;AACzC,MAAI;AACF,UAAM,IAAI,IAAI,IAAI,GAAG;AACrB,WAAO,YAAY,KAAK,EAAE,QAAQ;AAAA,EACpC,QAAQ;AACN,WAAO;AAAA,EACT;AACF;AAEA,SAAS,sBAAsB,KAAsB;AACnD,MAAI;AACF,UAAM,IAAI,IAAI,IAAI,GAAG;AACrB,WAAO,YAAY,KAAK,EAAE,QAAQ,KAAK,sBAAsB,KAAK,EAAE,QAAQ;AAAA,EAC9E,QAAQ;AACN,WAAO;AAAA,EACT;AACF;AAIA,SAAS,0BACP,MACA,SACA,WACA,YAAY,OACZ,cACwC;AACxC,SAAO;AAAA,IACL,GAAG,YAAY;AAAA,MACb;AAAA,MACA;AAAA,MACA;AAAA,MACA,UAAU;AAAA,MACV,UAAU,SAAS,YAAY,CAAC,gCAAgC,IAAI;AAAA,MACpE;AAAA,IACF,CAAC,CAAC;AAAA;AAAA,kBAAuB,eAAe,KAAK,IAAI,IAAI,SAAS,CAAC;AAAA,EACjE;AACF;AAWA,SAAS,cAAc,MAAiC;AACtD,QAAM,YAA2B,CAAC;AAClC,QAAM,eAA8B,CAAC;AACrC,QAAM,iBAAgC,CAAC;AACvC,QAAM,iBAAuD,CAAC;AAE9D,WAAS,IAAI,GAAG,IAAI,KAAK,QAAQ,KAAK;AACpC,UAAM,MAAM,KAAK,CAAC;AAClB,QAAI;AACF,UAAI,IAAI,GAAG;AAAA,IACb,QAAQ;AACN,qBAAe,KAAK,EAAE,KAAK,WAAW,EAAE,CAAC;AACzC;AAAA,IACF;AAKA,QAAI,cAAc,GAAG,GAAG;AACtB,qBAAe,KAAK,EAAE,KAAK,WAAW,EAAE,CAAC;AAAA,IAC3C,WAAW,YAAY,GAAG,GAAG;AAC3B,mBAAa,KAAK,EAAE,KAAK,WAAW,EAAE,CAAC;AAAA,IACzC,OAAO;AACL,gBAAU,KAAK,EAAE,KAAK,WAAW,EAAE,CAAC;AAAA,IACtC;AAAA,EACF;AAEA,SAAO,EAAE,WAAW,cAAc,gBAAgB,eAAe;AACnE;AAIA,eAAe,eACb,QACA,QACyB;AACzB,MAAI,OAAO,WAAW,GAAG;AACvB,WAAO;AAAA,MACL,cAAc,CAAC;AAAA,MACf,gBAAgB,CAAC;AAAA,MACjB,SAAS,EAAE,YAAY,GAAG,QAAQ,GAAG,cAAc,EAAE;AAAA,MACrD,eAAe,CAAC;AAAA,IAClB;AAAA,EACF;AAEA,SAAO,QAAQ,yCAAyC,OAAO,MAAM,sBAAsB,YAAY,OAAO,IAAI,QAAQ;AAC1H,QAAM,OAAO,OAAO,IAAI,CAAC,MAAM,EAAE,GAAG;AACpC,QAAM,UAAU,MAAM,OAAO,eAAe,MAAM,EAAE,SAAS,GAAG,CAAC;AACjE,QAAM,aAAa,IAAI,IAAI,OAAO,IAAI,CAAC,MAAM,CAAC,EAAE,KAAK,EAAE,SAAS,CAAC,CAAC;AAElE,QAAM,eAAkC,CAAC;AACzC,QAAM,iBAA2B,CAAC;AAClC,QAAM,gBAAgC,CAAC;AACvC,MAAI,aAAa;AACjB,MAAI,SAAS;AACb,MAAI,eAAe;AAEnB,WAAS,IAAI,GAAG,IAAI,QAAQ,QAAQ,KAAK;AACvC,UAAM,SAAS,QAAQ,CAAC;AACxB,UAAM,YAAY,OAAO,CAAC,EAAG;AAC7B,QAAI,CAAC,QAAQ;AACX;AACA,qBAAe,KAAK,MAAM,OAAO,CAAC,EAAG,GAAG;AAAA;AAAA,0BAA0B;AAClE;AAAA,IACF;AAGA,QAAI,OAAO,OAAO,SAAS,UAAU,4BAA4B;AAC/D,oBAAc,KAAK;AAAA,QACjB,KAAK,OAAO;AAAA,QACZ,WAAW,WAAW,IAAI,OAAO,GAAG,KAAK;AAAA,QACzC,QAAQ;AAAA,MACV,CAAC;AACD;AAAA,IACF;AAOA,UAAM,eAAe,QAAQ,OAAO,KAAK,KAAK,OAAO,aAAa,OAAO,OAAO,cAAc;AAC9F,QAAI,gBAAgB,OAAO,eAAe,KAAK;AAC7C,oBAAc,KAAK;AAAA,QACjB,KAAK,OAAO;AAAA,QACZ,WAAW,WAAW,IAAI,OAAO,GAAG,KAAK;AAAA,QACzC,QAAQ;AAAA,QACR,aAAa,OAAO,OAAO,WAAW,OAAO,WAAW,QAAQ,OAAO,UAAU;AAAA,MACnF,CAAC;AACD;AAAA,IACF;AACA,QAAI,cAAc;AAChB;AACA,qBAAe,KAAK,MAAM,OAAO,GAAG;AAAA;AAAA,wDAAmD;AACvF;AAAA,IACF;AAEA;AACA,oBAAgB,OAAO;AAEvB,QAAI;AACJ,QAAI;AACF,YAAM,WAAW,uBAAuB,OAAO,SAAS,OAAO,GAAG;AAClE,YAAM,mBAAmB,SAAS,YAAY,SAAS,UAAU,OAAO;AACxE,gBAAU,gBAAgB,eAAe,gBAAgB;AAAA,IAC3D,QAAQ;AACN,gBAAU,OAAO;AAAA,IACnB;AAEA,iBAAa,KAAK,EAAE,KAAK,OAAO,KAAK,SAAS,OAAO,UAAU,CAAC;AAAA,EAClE;AAEA,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA,SAAS,EAAE,YAAY,QAAQ,aAAa;AAAA,IAC5C;AAAA,EACF;AACF;AAWO,SAAS,kBAAkB,KAAa,WAAmB,aAA8B;AAC9F,MAAI,aAAa;AACf,WAAO,MAAM,GAAG;AAAA;AAAA,0CAA0C,WAAW,kBAAkB,SAAS;AAAA,EAClG;AACA,SAAO,MAAM,GAAG;AAAA;AAAA,qCAAqC,SAAS;AAChE;AAEA,eAAe,oBACb,QACA,YAEA,oBAC4B
;AAC5B,MAAI,OAAO,WAAW,GAAG;AACvB,WAAO,EAAE,cAAc,CAAC,GAAG,gBAAgB,CAAC,GAAG,SAAS,EAAE,YAAY,GAAG,QAAQ,GAAG,cAAc,EAAE,EAAE;AAAA,EACxG;AAEA;AAAA,IACE;AAAA,IACA,oDAAoD,OAAO,MAAM,sBAAsB,YAAY,OAAO;AAAA,IAC1G;AAAA,EACF;AAEA,QAAM,UAAU,MAAM;AAAA,IACpB;AAAA,IACA,CAAC,UAAU,WAAW,QAAQ,EAAE,KAAK,MAAM,IAAI,CAAC;AAAA,IAChD,YAAY;AAAA,EACd;AAEA,QAAM,eAAkC,CAAC;AACzC,QAAM,iBAA2B,CAAC;AAClC,MAAI,aAAa;AACjB,MAAI,SAAS;AAEb,WAAS,IAAI,GAAG,IAAI,QAAQ,QAAQ,KAAK;AACvC,UAAM,UAAU,QAAQ,CAAC;AACzB,UAAM,QAAQ,OAAO,CAAC;AACtB,UAAM,cAAc,oBAAoB,IAAI,MAAM,GAAG;AACrD,QAAI,CAAC,SAAS;AACZ;AACA,qBAAe,KAAK,kBAAkB,MAAM,KAAK,sBAAsB,WAAW,CAAC;AACnF;AAAA,IACF;AACA,QAAI,QAAQ,WAAW,YAAY;AACjC;AACA,YAAM,SAAS,QAAQ,kBAAkB,QAAQ,QAAQ,OAAO,UAAU,OAAO,QAAQ,MAAM;AAC/F,qBAAe,KAAK,kBAAkB,MAAM,KAAK,QAAQ,WAAW,CAAC;AACrE;AAAA,IACF;AAEA,UAAM,SAAS,QAAQ;AACvB,QAAI,OAAO,SAAS,OAAO,aAAa,OAAO,OAAO,cAAc,KAAK;AACvE;AACA,YAAM,WAAW,OAAO,OAAO,WAAW,QAAQ,OAAO,UAAU;AACnE,qBAAe,KAAK,kBAAkB,MAAM,KAAK,UAAU,WAAW,CAAC;AACvE;AAAA,IACF;AAEA;AACA,iBAAa,KAAK,EAAE,KAAK,MAAM,KAAK,SAAS,OAAO,SAAS,OAAO,MAAM,UAAU,CAAC;AAAA,EACvF;AAEA,SAAO,EAAE,cAAc,gBAAgB,SAAS,EAAE,YAAY,QAAQ,cAAc,EAAE,EAAE;AAC1F;AAIA,SAAS,2BAA2B,QAA4B;AAC9D,QAAM,EAAE,MAAM,SAAS,IAAI;AAC3B,QAAM,QAAkB,CAAC;AACzB,QAAM,KAAK,KAAK,KAAK,KAAK,EAAE;AAC5B,QAAM,KAAK,EAAE;AACb,QAAM,KAAK,OAAO,KAAK,SAAS,eAAU,KAAK,MAAM,wBAAS,KAAK,KAAK,qBAAS,KAAK,YAAY,WAAW;AAC7G,QAAM,KAAK,aAAM,KAAK,GAAG,EAAE;AAC3B,QAAM,KAAK,EAAE;AACb,MAAI,KAAK,MAAM;AACb,UAAM,KAAK,iBAAiB;AAC5B,UAAM,KAAK,EAAE;AACb,UAAM,KAAK,KAAK,IAAI;AACpB,UAAM,KAAK,EAAE;AAAA,EACf;AACA,MAAI,SAAS,SAAS,GAAG;AACvB,UAAM,KAAK,oBAAoB,SAAS,MAAM,SAAS;AACvD,UAAM,KAAK,EAAE;AACb,eAAW,KAAK,UAAU;AACxB,YAAM,SAAS,KAAK,OAAO,EAAE,KAAK;AAClC,YAAM,KAAK,EAAE,OAAO,cAAc;AAClC,YAAM,QAAQ,EAAE,SAAS,IAAI,IAAI,EAAE,KAAK,KAAK,GAAG,EAAE,KAAK;AACvD,YAAM,KAAK,GAAG,MAAM,SAAS,EAAE,MAAM,KAAK,EAAE,MAAM,KAAK,IAAI;AAC3D,iBAAW,QAAQ,EAAE,KAAK,MAAM,IAAI,GAAG;AACrC,cAAM,KAAK,GAAG,MAAM,KAAK,IAAI,EAAE;AAAA,MACjC;AACA,YAAM,KAAK,EAAE;AAAA,IACf;AAAA,EACF;AACA,SAAO,MAAM,KAAK,IAAI;AACxB;AAEA,eAAe,kBAAkB,QAAmD;AAClF,MAAI,OAAO,WAAW,GAAG;AACvB,WAAO,EAAE,cAAc,CAAC,GAAG,gBAAgB,CAAC,GAAG,SAAS,EAAE,YAAY,GAAG,QAAQ,GAAG,cAAc,EAAE,EAAE;AAAA,EACxG;AAEA,QAAM,MAAM,SAAS;AACrB,MAAI,CAAC,IAAI,oBAAoB,CAAC,IAAI,sBAAsB;AACtD,UAAMA,kBAAiB,OAAO;AAAA,MAC5B,CAAC,MAAM,MAAM,EAAE,GAAG;AAAA;AAAA;AAAA,IACpB;AACA,WAAO;AAAA,MACL,cAAc,CAAC;AAAA,MACf,gBAAAA;AAAA,MACA,SAAS,EAAE,YAAY,GAAG,QAAQ,OAAO,QAAQ,cAAc,EAAE;AAAA,IACnE;AAAA,EACF;AAKA,QAAM,CAAC,YAAY,aAAa,IAAI,OAAO;AAAA,IACzC,CAAC,CAAC,OAAO,IAAI,GAAG,UAAU;AACxB,UAAI,sBAAsB,MAAM,GAAG,EAAG,OAAM,KAAK,KAAK;AAAA,UACjD,MAAK,KAAK,KAAK;AACpB,aAAO,CAAC,OAAO,IAAI;AAAA,IACrB;AAAA,IACA,CAAC,CAAC,GAAG,CAAC,CAAC;AAAA,EACT;AAEA,QAAM,qBAAqB,cAAc;AAAA,IACvC,CAAC,MAAM,MAAM,EAAE,GAAG;AAAA;AAAA;AAAA,EACpB;AAEA,MAAI,WAAW,WAAW,GAAG;AAC3B,WAAO;AAAA,MACL,cAAc,CAAC;AAAA,MACf,gBAAgB;AAAA,MAChB,SAAS,EAAE,YAAY,GAAG,QAAQ,cAAc,QAAQ,cAAc,EAAE;AAAA,IAC1E;AAAA,EACF;AAEA,SAAO,QAAQ,yCAAyC,WAAW,MAAM,uBAAuB,YAAY,MAAM,IAAI,QAAQ;AAC9H,QAAM,SAAS,IAAI,aAAa,IAAI,kBAAkB,IAAI,oBAAoB;AAC9E,QAAM,OAAO,WAAW,IAAI,CAAC,MAAM,EAAE,GAAG;AACxC,QAAM,cAAc,MAAM,OAAO,cAAc,MAAM,IAAI;AACzD,QAAM,aAAa,IAAI,IAAI,WAAW,IAAI,CAAC,MAAM,CAAC,EAAE,KAAK,EAAE,SAAS,CAAC,CAAC;AAEtE,QAAM,eAAkC,CAAC;AACzC,QAAM,iBAA2B,CAAC,GAAG,kBAAkB;AACvD,MAAI,aAAa;AACjB,MAAI,SAAS,cAAc;AAE3B,aAAW,CAAC,KAAK,MAAM,KAAK,YAAY,SAAS;AAC/C,UAAM,YAAY,WAAW,IAAI,GAAG,KAAK;AACzC,QAAI,kBAAkB,OAAO;AAC3B;AACA,qBAAe,KAAK,MAAM,GAAG;AAAA;AAAA,8BAA8B,OAAO,OAAO,EAAE;AAC3E;AAAA,IACF;AACA;AACA,iBAAa,KAAK,EAAE,KAAK,SAAS,2BAA2B,MAAM,GAAG,OAAO,UAAU,CAAC;AAAA,EAC1F;AAEA,SAAO,EAAE,cAAc,gBAAgB,SAAS,EAAE,YAAY,QAAQ,cAAc,EAAE,EAAE;
AAC1F;AAIA,eAAe,oBACb,cACA,qBACA,cACA,UACgF;AAChF,MAAI,YAAY;AAEhB,MAAI,CAAC,gBAAgB,aAAa,WAAW,GAAG;AAC9C,QAAI,CAAC,gBAAgB,aAAa,SAAS,GAAG;AAC5C,aAAO,WAAW,yEAAyE,QAAQ;AACnG,WAAK,SAAS,IAAI,WAAW,iFAAiF;AAAA,IAChH;AACA,WAAO,EAAE,OAAO,cAAc,WAAW,cAAc,EAAE;AAAA,EAC3D;AAEA,SAAO,QAAQ,6CAA6C,aAAa,MAAM,uBAAuB,YAAY,cAAc,IAAI,QAAQ;AAE5I,QAAM,aAAa,MAAM;AAAA,IACvB;AAAA,IACA,OAAO,SAAS;AACd,aAAO,SAAS,kBAAkB,KAAK,GAAG,OAAO,QAAQ;AAEzD,YAAM,YAAY,MAAM;AAAA,QACtB,KAAK;AAAA,QACL,EAAE,SAAS,MAAM,SAAS,qBAAqB,KAAK,KAAK,IAAI;AAAA,QAC7D;AAAA,MACF;AAEA,UAAI,UAAU,WAAW;AACvB,eAAO,EAAE,GAAG,MAAM,SAAS,UAAU,QAAQ;AAAA,MAC/C;AAEA;AACA,aAAO,WAAW,6BAA6B,KAAK,GAAG,KAAK,UAAU,SAAS,gBAAgB,IAAI,QAAQ;AAC3G,WAAK,SAAS,IAAI,WAAW,8BAA8B,KAAK,GAAG,WAAM,UAAU,SAAS,gBAAgB,EAAE;AAC9G,aAAO;AAAA,IACT;AAAA,IACA,YAAY;AAAA,EACd;AAEA,SAAO,EAAE,OAAO,YAAY,WAAW,cAAc,aAAa,OAAO;AAC3E;AAIA,SAAS,uBAAuB,cAAiC,gBAAoC;AACnG,QAAM,SAAS,CAAC,GAAG,YAAY,EAAE,KAAK,CAAC,GAAG,MAAM,EAAE,QAAQ,EAAE,KAAK;AACjE,QAAM,WAAW,CAAC,GAAG,cAAc;AACnC,aAAW,QAAQ,QAAQ;AACzB,QAAI,UAAU,KAAK;AACnB,QAAI;AACF,gBAAU,eAAe,OAAO;AAAA,IAClC,QAAQ;AAAA,IAER;AACA,aAAS,KAAK,MAAM,KAAK,GAAG;AAAA;AAAA,EAAO,OAAO,EAAE;AAAA,EAC9C;AACA,SAAO;AACT;AAEA,SAAS,oBACP,QACA,UACA,SACA,WACA,eACA,eAC2D;AAC3D,QAAM,YAA6C,CAAC;AACpD,MAAI,cAAc,eAAe,GAAG;AAClC,UAAM,KAAK,cAAc,eAAe;AACxC,cAAU,gBAAgB,IAAI,GAAG,EAAE,IAAI,cAAc,YAAY;AACjE,QAAI,CAAC,cAAc,cAAc;AAC/B,gBAAU,YAAY,IAAI;AAAA,IAC5B;AAAA,EACF,WAAW,YAAY,GAAG;AACxB,cAAU,yBAAyB,IAAI;AAAA,EACzC;AAEA,QAAM,cAAc,kBAAkB;AAAA,IACpC,OAAO,oBAAoB,OAAO,KAAK,MAAM;AAAA,IAC7C,YAAY,OAAO,KAAK;AAAA,IACxB,YAAY,QAAQ;AAAA,IACpB,QAAQ,QAAQ;AAAA,IAChB,QAAQ;AAAA,MACN,gBAAgB,QAAQ;AAAA,MACxB,GAAG;AAAA,IACL;AAAA,EACF,CAAC;AAED,QAAM,mBAAmB,cAAc;AAAA,IACrC,OAAO;AAAA,IACP,SAAS;AAAA,IACT,MAAM,SAAS,KAAK,aAAa;AAAA,IACjC,UAAU;AAAA,MACR,kBAAkB,eAAe,aAAa;AAAA,IAChD;AAAA,EACF,CAAC;AAED,QAAM,WAA0C;AAAA,IAC9C,aAAa,OAAO,KAAK;AAAA,IACzB,YAAY,QAAQ;AAAA,IACpB,QAAQ,QAAQ;AAAA,IAChB,mBAAmB;AAAA,IACnB,eAAe,QAAQ;AAAA,EACzB;AACA,SAAO,EAAE,SAAS,kBAAkB,mBAAmB,EAAE,SAAS,EAAE;AACtE;AAIA,eAAsB,kBACpB,QACA,WAAyB,eACwB;AACjD,QAAM,YAAY,KAAK,IAAI;AAE3B,MAAI,CAAC,OAAO,QAAQ,OAAO,KAAK,WAAW,GAAG;AAC5C,WAAO,0BAA0B,WAAW,oBAAoB,SAAS;AAAA,EAC3E;AAEA,QAAM,EAAE,WAAW,cAAc,gBAAgB,eAAe,IAAI,cAAc,OAAO,IAAI;AAC7F,QAAM,aAAa,UAAU,SAAS,aAAa,SAAS,eAAe;AAE3E,QAAM,SAAS;AAAA,IACb;AAAA,IACA,eAAe,OAAO,KAAK,MAAM,YAAY,UAAU,MAAM,SAAS,aAAa,MAAM,YAAY,eAAe,MAAM,cAAc,eAAe,MAAM;AAAA,EAC/J;AAEA,MAAI,eAAe,GAAG;AACpB,WAAO;AAAA,MACL;AAAA,MACA,OAAO,OAAO,KAAK,MAAM;AAAA,MACzB;AAAA,MACA;AAAA,MACA;AAAA,QACE;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA;AAAA,IACE;AAAA,IACA,oBAAoB,UAAU,MAAM,UAAU,aAAa,MAAM,aAAa,eAAe,MAAM;AAAA,IACnG;AAAA,EACF;AACA,QAAM,SAAS,SAAS,IAAI,KAAK,2BAA2B;AAK5D,MAAI,UAAgC;AACpC,MAAI;AACF,UAAM,aAAa,IAAI,WAAW;AAClC,QAAI,UAAU,SAAS,GAAG;AACxB,gBAAU;AAAA,QACR,QAAQ,IAAI,cAAc;AAAA,QAC1B;AAAA,QACA,cAAc,mBAAmB;AAAA,MACnC;AAAA,IACF,OAAO;AACL,gBAAU;AAAA,QACR,QAAQ;AAAA,QACR;AAAA,QACA,cAAc,mBAAmB;AAAA,MACnC;AAAA,IACF;AAAA,EACF,SAAS,OAAO;AACd,UAAM,MAAM,cAAc,KAAK;AAC/B,WAAO;AAAA,MACL;AAAA,MACA,iCAAiC,IAAI,OAAO;AAAA,MAC5C;AAAA,MACA;AAAA,MACA;AAAA,QACE;AAAA,MACF;AAAA,IACF;AAAA,EACF;AAEA,QAAM,sBAAsB,6BAA6B,OAAO,OAAO;AAEvE,QAAM,SAAS,SAAS,IAAI,KAAK,uBAAuB;AAMxD,QAAM,aAA6B;AAAA,IACjC,cAAc,CAAC;AAAA,IAAG,gBAAgB,CAAC;AAAA,IACnC,SAAS,EAAE,YAAY,GAAG,QAAQ,GAAG,cAAc,EAAE;AAAA,IACrD,eAAe,CAAC;AAAA,EAClB;AACA,QAAM,CAAC,UAAU,aAAa,aAAa,IAAI,MAAM,QAAQ,IAAI;AAAA,IAC/D,UAAU,SAAS,IACf,eAAe,WAAW,QAAQ,MAAM,IACxC,QAAQ,QAAwB,UAAU;AAAA,IAC9C,kBAAkB,YAAY;AAAA,IAC9B,oBAAoB,gBAAgB,QAAQ,UAAU;AAAA,EACxD,CAAC;AAID,MAAI,gBAAmC;AAAA,IACrC,cAAc,CAAC;AAAA,IAAG,gBAAgB,CAAC;AAAA,IACnC,SAAS,EAAE,YAAY,GAAG,QAAQ,G
AAG,cAAc,EAAE;AAAA,EACvD;AACA,MAAI,SAAS,cAAc,SAAS,GAAG;AACrC,UAAM,cAAc,SAAS,cAAc,OAAO,CAAC,MAAM,EAAE,WAAW,gBAAgB,EAAE;AACxF,UAAM,cAAc,SAAS,cAAc,SAAS;AACpD,UAAM,SAAS;AAAA,MACb;AAAA,MACA,aAAa,SAAS,cAAc,MAAM,2BAA2B,WAAW,YAAY,WAAW;AAAA,IACzG;AACA,UAAM,iBAAgC,SAAS,cAAc,IAAI,CAAC,OAAO;AAAA,MACvE,KAAK,EAAE;AAAA,MACP,WAAW,EAAE;AAAA,IACf,EAAE;AACF,UAAM,eAAe,IAAI;AAAA,MACvB,SAAS,cACN,OAAO,CAAC,MAAM,EAAE,gBAAgB,MAAS,EACzC,IAAI,CAAC,MAAM,CAAC,EAAE,KAAK,EAAE,WAAqB,CAAC;AAAA,IAChD;AACA,oBAAgB,MAAM,oBAAoB,gBAAgB,QAAQ,YAAY,YAAY;AAAA,EAC5F;AAEA,QAAM,eAAe;AAAA,IACnB,GAAG,SAAS;AAAA,IACZ,GAAG,YAAY;AAAA,IACf,GAAG,cAAc;AAAA,IACjB,GAAG,cAAc;AAAA,EACnB;AACA,QAAM,gBAAgB,eAAe;AAAA,IACnC,CAAC,EAAE,IAAI,MAAM,MAAM,GAAG;AAAA;AAAA;AAAA,EACxB;AACA,QAAM,iBAAiB;AAAA,IACrB,GAAG;AAAA,IACH,GAAG,SAAS;AAAA,IACZ,GAAG,YAAY;AAAA,IACf,GAAG,cAAc;AAAA,IACjB,GAAG,cAAc;AAAA,EACnB;AACA,QAAM,UAAyB;AAAA,IAC7B,YACE,SAAS,QAAQ,aACf,YAAY,QAAQ,aACpB,cAAc,QAAQ,aACtB,cAAc,QAAQ;AAAA,IAC1B,QACE,eAAe,SACb,SAAS,QAAQ,SACjB,YAAY,QAAQ,SACpB,cAAc,QAAQ,SACtB,cAAc,QAAQ;AAAA,IAC1B,cAAc,SAAS,QAAQ;AAAA,EACjC;AAEA,QAAM,SAAS,IAAI,QAAQ,WAAW,QAAQ,UAAU,aAAa,QAAQ,MAAM,SAAS;AAE5F,MAAI,aAAa,SAAS,GAAG;AAC3B,UAAM,SAAS,SAAS,IAAI,KAAK,2CAA2C;AAAA,EAC9E;AAEA,QAAM,EAAE,OAAO,gBAAgB,WAAW,aAAa,IAAI,MAAM;AAAA,IAC/D;AAAA,IACA;AAAA,IACA,QAAQ;AAAA,IACR;AAAA,EACF;AAEA,QAAM,WAAW,uBAAuB,gBAAgB,cAAc;AACtE,QAAM,gBAAgB,KAAK,IAAI,IAAI;AAEnC;AAAA,IACE;AAAA,IACA,cAAc,QAAQ,UAAU,gBAAgB,QAAQ,MAAM,YAAY,QAAQ,YAAY;AAAA,IAC9F;AAAA,EACF;AAEA,QAAM,eAAe,eAAe,KAAK,YAAY;AACrD,QAAM,SAAS;AAAA,IACb;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,EAAE,cAAc,aAAa;AAAA,EAC/B;AAEA,MAAI,QAAQ,eAAe,KAAK,QAAQ,SAAS,GAAG;AAClD,WAAO,YAAY,OAAO,OAAO;AAAA,EACnC;AAEA,SAAO,YAAY,OAAO,SAAS,OAAO,iBAAiB;AAC7D;AAEO,SAAS,wBAAwB,QAAyB;AAC/D,SAAO;AAAA,IACL;AAAA,MACE,MAAM;AAAA,MACN,OAAO;AAAA,MACP,aACE;AAAA,MACF,QAAQ;AAAA,MACR,cAAc;AAAA,MACd,aAAa;AAAA,QACX,cAAc;AAAA,QACd,gBAAgB;AAAA,QAChB,iBAAiB;AAAA,QACjB,eAAe;AAAA,MACjB;AAAA,IACF;AAAA,IACA,OAAO,MAAM,QAAQ;AACnB,UAAI,CAAC,gBAAgB,EAAE,UAAU;AAC/B,eAAO,eAAe,YAAY,qBAAqB,UAAU,CAAC,CAAC;AAAA,MACrE;AAEA,YAAM,WAAW,mBAAmB,KAAK,cAAc;AACvD,YAAM,SAAS,MAAM,kBAAkB,MAAM,QAAQ;AAErD,YAAM,SAAS,SAAS,KAAK,KAAK,OAAO,UAAU,kBAAkB,iBAAiB;AACtF,aAAO,eAAe,MAAM;AAAA,IAC9B;AAAA,EACF;AACF;",
"names": ["failedContents"]
}
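The embedded scrape-links source above centres on one behavioural change: when Scrape.do returns a binary content-type, or fails with anything other than a genuine 404, the URL is rerouted through Jina Reader instead of being reported as a hard failure. Below is a minimal TypeScript sketch of that routing decision only; the ScrapeResult shape and the routeWebResult helper are simplified stand-ins for illustration and are not part of the package.

// Sketch only: mirrors the routing documented in fetchWebBranch.
// In the package the code value comes from ErrorCode.UNSUPPORTED_BINARY_CONTENT
// in dist/src/utils/errors.js; this simplified result shape is assumed here.
const UNSUPPORTED_BINARY_CONTENT = 'UNSUPPORTED_BINARY_CONTENT';

interface ScrapeResult {
  url: string;
  statusCode: number;
  content: string;
  error?: { code: string; message: string; retryable: boolean };
}

type WebRoute =
  | { kind: 'success' }
  | { kind: 'jina_fallback'; reason: 'binary_content' | 'scrape_failed'; scrapeError?: string }
  | { kind: 'hard_fail'; message: string };

function routeWebResult(result: ScrapeResult): WebRoute {
  // Binary document detected by content-type: defer to Jina Reader.
  if (result.error?.code === UNSUPPORTED_BINARY_CONTENT) {
    return { kind: 'jina_fallback', reason: 'binary_content' };
  }
  const failed =
    Boolean(result.error) || result.statusCode < 200 || result.statusCode >= 300;
  // Any non-404 failure (WAF block, redirect loop, timeout, 5xx) gets a second
  // chance through Jina Reader; a genuine 404 stays a hard failure.
  if (failed && result.statusCode !== 404) {
    return {
      kind: 'jina_fallback',
      reason: 'scrape_failed',
      scrapeError: result.error?.message || result.content || `HTTP ${result.statusCode}`,
    };
  }
  if (failed) {
    return { kind: 'hard_fail', message: 'HTTP 404: page not found' };
  }
  return { kind: 'success' };
}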
package/dist/src/utils/errors.js
CHANGED
@@ -10,6 +10,7 @@ const ErrorCode = {
INVALID_INPUT: "INVALID_INPUT",
NOT_FOUND: "NOT_FOUND",
QUOTA_EXCEEDED: "QUOTA_EXCEEDED",
+
UNSUPPORTED_BINARY_CONTENT: "UNSUPPORTED_BINARY_CONTENT",
// Internal errors
INTERNAL_ERROR: "INTERNAL_ERROR",
PARSE_ERROR: "PARSE_ERROR",
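The one-line addition above is the hook the new fallback path hangs on: UNSUPPORTED_BINARY_CONTENT sits in the non-retryable group, so callers are expected to switch transport (to Jina Reader) rather than retry Scrape.do. A small illustrative sketch of that contract follows; the StructuredError shape mirrors the interface in the embedded errors.ts source, while the nextAction helper and the sample error value are assumptions for illustration, not package code.

// StructuredError mirrors the exported interface in src/utils/errors.ts.
interface StructuredError {
  code: string;
  message: string;
  retryable: boolean;
  statusCode?: number;
  cause?: string;
}

// Illustrative policy: the new code signals "switch transport", retryable
// codes signal "back off and retry", everything else is a final failure.
function nextAction(err: StructuredError): 'reroute_to_jina' | 'retry' | 'fail' {
  if (err.code === 'UNSUPPORTED_BINARY_CONTENT') return 'reroute_to_jina';
  return err.retryable ? 'retry' : 'fail';
}

// Hypothetical classified result for a binary response body.
const binaryBody: StructuredError = {
  code: 'UNSUPPORTED_BINARY_CONTENT',
  message: 'Response body is a binary document (e.g. PDF); route through Jina Reader',
  retryable: false,
};
console.log(nextAction(binaryBody)); // "reroute_to_jina"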
package/dist/src/utils/errors.js.map
CHANGED
@@ -1,7 +1,7 @@
{
"version": 3,
"sources": ["../../../src/utils/errors.ts"],
-
"sourcesContent": ["/**\n * Robust error handling utilities for MCP server\n * Ensures the server NEVER crashes and always returns structured responses\n */\n\nimport { mcpLog } from './logger.js';\n\n// ============================================================================\n// Error Codes (MCP-compliant)\n// ============================================================================\n\nexport const ErrorCode = {\n // Retryable errors\n RATE_LIMITED: 'RATE_LIMITED',\n TIMEOUT: 'TIMEOUT',\n NETWORK_ERROR: 'NETWORK_ERROR',\n SERVICE_UNAVAILABLE: 'SERVICE_UNAVAILABLE',\n \n // Non-retryable errors\n AUTH_ERROR: 'AUTH_ERROR',\n INVALID_INPUT: 'INVALID_INPUT',\n NOT_FOUND: 'NOT_FOUND',\n QUOTA_EXCEEDED: 'QUOTA_EXCEEDED',\n \n // Internal errors\n INTERNAL_ERROR: 'INTERNAL_ERROR',\n PARSE_ERROR: 'PARSE_ERROR',\n UNKNOWN_ERROR: 'UNKNOWN_ERROR',\n} as const;\n\ntype ErrorCodeType = typeof ErrorCode[keyof typeof ErrorCode];\n\n// ============================================================================\n// Structured Error Types\n// ============================================================================\n\nexport interface StructuredError {\n code: ErrorCodeType;\n message: string;\n retryable: boolean;\n statusCode?: number;\n cause?: string;\n}\n\ninterface RetryOptions {\n readonly maxRetries: number;\n readonly baseDelayMs: number;\n readonly maxDelayMs: number;\n readonly retryableStatuses: readonly number[];\n readonly onRetry?: (attempt: number, error: StructuredError, delayMs: number) => void;\n}\n\nconst DEFAULT_RETRY_OPTIONS: RetryOptions = {\n maxRetries: 3,\n baseDelayMs: 1000,\n maxDelayMs: 30000,\n retryableStatuses: [408, 429, 500, 502, 503, 504, 510],\n};\n\n// ============================================================================\n// Error Classification \u2014 Atomic Classifiers\n// ============================================================================\n\n/**\n * Classify DOMException (AbortError from AbortController timeouts)\n */\nfunction classifyDomException(error: DOMException): StructuredError {\n if (error.name === 'AbortError') {\n return { code: ErrorCode.TIMEOUT, message: 'Request timed out', retryable: true };\n }\n return { code: ErrorCode.UNKNOWN_ERROR, message: error.message, retryable: false };\n}\n\n/**\n * Classify by Node.js error codes (ECONNREFUSED, ENOTFOUND, etc.)\n * Returns null if no matching code is found.\n */\nfunction classifyByErrorCode(error: { code?: string; message?: string }): StructuredError | null {\n const errCode = error.code;\n if (!errCode) return null;\n\n const networkErrorMessages: Record<string, string> = {\n ECONNREFUSED: 'Connection refused \u2014 service may be down',\n ECONNRESET: 'Connection was reset \u2014 please retry',\n ECONNABORTED: 'Connection aborted \u2014 please retry',\n ENOTFOUND: 'Service not reachable \u2014 check your network',\n EPIPE: 'Connection lost \u2014 please retry',\n EAI_AGAIN: 'DNS lookup failed \u2014 check your network',\n };\n\n if (errCode === 'ECONNREFUSED' || errCode === 'ENOTFOUND' || errCode === 'ECONNRESET') {\n return { code: ErrorCode.NETWORK_ERROR, message: networkErrorMessages[errCode] || 'Network connection failed', retryable: true, cause: error.message };\n }\n\n if (errCode === 'ECONNABORTED' || errCode === 'ETIMEDOUT') {\n return { code: ErrorCode.TIMEOUT, message: networkErrorMessages[errCode] || 'Request timed out', retryable: true, cause: error.message };\n }\n\n return null;\n}\n\n/**\n * Classify by HTTP status code extracted from error objects (axios-style, 
fetch-style, etc.)\n * Returns null if no status code is found.\n */\nfunction classifyByStatusCode(error: { status?: number; statusCode?: number; response?: { status?: number }; message?: string }): StructuredError | null {\n const status = error.response?.status || error.status || error.statusCode;\n if (!status) return null;\n return classifyHttpError(status, error.message || String(error));\n}\n\n/**\n * Classify by error message patterns (timeout, rate-limit, auth, parse errors)\n * Returns null if no pattern matches.\n */\nfunction classifyByMessage(message: string): StructuredError | null {\n const lower = message.toLowerCase();\n\n // Timeout patterns\n if (lower.includes('timeout') || lower.includes('timed out') || lower.includes('aborterror')) {\n return { code: ErrorCode.TIMEOUT, message: 'Request timed out', retryable: true, cause: message };\n }\n\n // Rate-limit patterns\n if (lower.includes('rate limit') || lower.includes('too many requests')) {\n return { code: ErrorCode.RATE_LIMITED, message: 'Rate limit exceeded', retryable: true, cause: message };\n }\n\n // API key errors\n if (message.includes('API_KEY') || message.includes('api_key') || message.includes('Invalid API')) {\n return { code: ErrorCode.AUTH_ERROR, message: 'API key missing or invalid', retryable: false, cause: message };\n }\n\n // Parse errors\n if (message.includes('JSON') || message.includes('parse') || message.includes('Unexpected token')) {\n return { code: ErrorCode.PARSE_ERROR, message: 'Failed to parse response', retryable: false, cause: message };\n }\n\n return null;\n}\n\n/**\n * Catch-all fallback classification when no other classifier matches.\n */\nfunction classifyFallback(message: string, cause?: unknown): StructuredError {\n return {\n code: ErrorCode.UNKNOWN_ERROR,\n message,\n retryable: false,\n cause: cause ? String(cause) : undefined,\n };\n}\n\n// ============================================================================\n// Main Error Classification Pipeline\n// ============================================================================\n\n/**\n * Classify any error into a structured format.\n * NEVER throws \u2014 always returns a valid StructuredError.\n */\nexport function classifyError(error: unknown): StructuredError {\n if (error == null) {\n return { code: ErrorCode.UNKNOWN_ERROR, message: 'An unknown error occurred', retryable: false };\n }\n\n if (error instanceof DOMException) return classifyDomException(error);\n\n if (!isErrorLike(error)) {\n return { code: ErrorCode.UNKNOWN_ERROR, message: String(error), retryable: false };\n }\n\n return classifyByErrorCode(error)\n ?? classifyByStatusCode(error)\n ?? classifyByMessage(error.message ?? String(error))\n ?? classifyFallback(error.message ?? 
String(error), error.cause);\n}\n\n/**\n * Type guard for error-like objects with common error properties\n */\nfunction isErrorLike(value: unknown): value is {\n message?: string;\n response?: { status?: number; data?: unknown };\n status?: number;\n statusCode?: number;\n code?: string;\n name?: string;\n cause?: unknown;\n} {\n return typeof value === 'object' && value !== null;\n}\n\n/**\n * Classify HTTP status codes into structured errors.\n * Exhaustive switch with grouped default handling for unknown ranges.\n */\nfunction classifyHttpError(status: number, message: string): StructuredError {\n switch (status) {\n case 400:\n return { code: ErrorCode.INVALID_INPUT, message: 'Bad request', retryable: false, statusCode: status };\n case 401:\n return { code: ErrorCode.AUTH_ERROR, message: 'Invalid API key', retryable: false, statusCode: status };\n case 403:\n return { code: ErrorCode.QUOTA_EXCEEDED, message: 'Access forbidden or quota exceeded', retryable: false, statusCode: status };\n case 404:\n return { code: ErrorCode.NOT_FOUND, message: 'Resource not found', retryable: false, statusCode: status };\n case 408:\n return { code: ErrorCode.TIMEOUT, message: 'Request timeout', retryable: true, statusCode: status };\n case 429:\n return { code: ErrorCode.RATE_LIMITED, message: 'Rate limit exceeded', retryable: true, statusCode: status };\n case 500:\n return { code: ErrorCode.INTERNAL_ERROR, message: 'Server error', retryable: true, statusCode: status };\n case 502:\n return { code: ErrorCode.SERVICE_UNAVAILABLE, message: 'Bad gateway', retryable: true, statusCode: status };\n case 503:\n return { code: ErrorCode.SERVICE_UNAVAILABLE, message: 'Service unavailable', retryable: true, statusCode: status };\n case 504:\n return { code: ErrorCode.TIMEOUT, message: 'Gateway timeout', retryable: true, statusCode: status };\n case 510:\n return { code: ErrorCode.SERVICE_UNAVAILABLE, message: 'Request canceled', retryable: true, statusCode: status };\n default:\n if (status >= 500) {\n return { code: ErrorCode.SERVICE_UNAVAILABLE, message: `Server error: ${status}`, retryable: true, statusCode: status };\n }\n if (status >= 400) {\n return { code: ErrorCode.INVALID_INPUT, message: `Client error: ${status}`, retryable: false, statusCode: status };\n }\n return { code: ErrorCode.UNKNOWN_ERROR, message: `HTTP ${status}: ${message}`, retryable: false, statusCode: status };\n }\n}\n\n// ============================================================================\n// Retry Logic with Exponential Backoff\n// ============================================================================\n\n/**\n * Calculate delay with exponential backoff and jitter\n */\nfunction calculateBackoff(attempt: number, options: RetryOptions): number {\n const exponentialDelay = options.baseDelayMs * Math.pow(2, attempt);\n const jitter = Math.random() * 0.3 * exponentialDelay; // 0-30% jitter\n return Math.min(exponentialDelay + jitter, options.maxDelayMs);\n}\n\n/**\n * Sleep utility that respects abort signals\n */\nexport function sleep(ms: number, signal?: AbortSignal): Promise<void> {\n return new Promise((resolve, reject) => {\n if (signal?.aborted) {\n reject(new DOMException('Aborted', 'AbortError'));\n return;\n }\n\n function onAbort() {\n clearTimeout(timeout);\n reject(new DOMException('Aborted', 'AbortError'));\n }\n\n const timeout = setTimeout(() => {\n if (signal) signal.removeEventListener('abort', onAbort);\n resolve();\n }, ms);\n\n signal?.addEventListener('abort', onAbort, { once: true });\n // Re-check: 
signal may have aborted between initial check and listener registration\n if (signal?.aborted) {\n onAbort();\n }\n });\n}\n\n/**\n * Wrap a fetch call with timeout via AbortController\n */\nexport function fetchWithTimeout(\n url: string,\n options: RequestInit & { timeoutMs?: number } = {}\n): Promise<Response> {\n const { timeoutMs = 30000, signal: externalSignal, ...fetchOptions } = options;\n\n const controller = new AbortController();\n const timeoutId = setTimeout(() => controller.abort(), timeoutMs);\n\n let onExternalAbort: (() => void) | undefined;\n if (externalSignal) {\n onExternalAbort = () => controller.abort();\n externalSignal.addEventListener('abort', onExternalAbort, { once: true });\n if (externalSignal.aborted) {\n controller.abort();\n }\n }\n\n return fetch(url, { ...fetchOptions, signal: controller.signal }).finally(() => {\n clearTimeout(timeoutId);\n if (externalSignal && onExternalAbort) {\n externalSignal.removeEventListener('abort', onExternalAbort);\n }\n });\n}\n\n// ============================================================================\n// Stability Wrappers \u2014 Network resilience for LLM API calls\n// ============================================================================\n\n/**\n * Wrap a non-streaming API call with activity-based timeout detection.\n * If the call hasn't completed within `stallMs`, abort and retry.\n * This catches \"stuck\" connections where TCP stays open but no data flows.\n *\n * @param fn - Async function that accepts an AbortSignal\n * @param stallMs - Max milliseconds to wait for the call to complete before considering it stuck\n * @param maxAttempts - Max retry attempts for stalled requests\n * @param label - Label for log messages\n * @returns The result of the function\n */\nexport async function withStallProtection<T>(\n fn: (signal: AbortSignal) => Promise<T>,\n stallMs: number,\n maxAttempts: number = 2,\n label: string = 'request',\n): Promise<T> {\n for (let attempt = 0; attempt < maxAttempts; attempt++) {\n const controller = new AbortController();\n let stallTimer: ReturnType<typeof setTimeout> | undefined;\n\n const stallPromise = new Promise<never>((_, reject) => {\n stallTimer = setTimeout(() => {\n controller.abort();\n reject(Object.assign(new Error(`Service temporarily unavailable \u2014 no response received (attempt ${attempt + 1}/${maxAttempts})`), {\n code: 'ESTALLED',\n retryable: attempt < maxAttempts - 1,\n }));\n }, stallMs);\n });\n\n let fnPromise: Promise<T> | undefined;\n try {\n fnPromise = fn(controller.signal);\n const result = await Promise.race([fnPromise, stallPromise]);\n clearTimeout(stallTimer);\n return result;\n } catch (err) {\n // Suppress unhandled rejection from the losing promise\n // (e.g. fnPromise rejects after stallPromise wins the race)\n fnPromise?.catch(() => {});\n clearTimeout(stallTimer);\n const isStall = err instanceof Error && (err as NodeJS.ErrnoException).code === 'ESTALLED';\n if (isStall && attempt < maxAttempts - 1) {\n const backoff = calculateBackoff(attempt, DEFAULT_RETRY_OPTIONS);\n mcpLog('warning', `${label} stalled, retrying in ${backoff}ms (attempt ${attempt + 1})`, 'stability');\n await sleep(backoff);\n continue;\n }\n throw err;\n }\n }\n // Should never reach here, but TypeScript needs it\n throw new Error(`${label} failed after ${maxAttempts} stall-protection attempts`);\n}\n"],
-
"mappings": "AAKA,SAAS,cAAc;AAMhB,MAAM,YAAY;AAAA;AAAA,EAEvB,cAAc;AAAA,EACd,SAAS;AAAA,EACT,eAAe;AAAA,EACf,qBAAqB;AAAA;AAAA,EAGrB,YAAY;AAAA,EACZ,eAAe;AAAA,EACf,WAAW;AAAA,EACX,gBAAgB;AAAA;AAAA,
+
"sourcesContent": ["/**\n * Robust error handling utilities for MCP server\n * Ensures the server NEVER crashes and always returns structured responses\n */\n\nimport { mcpLog } from './logger.js';\n\n// ============================================================================\n// Error Codes (MCP-compliant)\n// ============================================================================\n\nexport const ErrorCode = {\n // Retryable errors\n RATE_LIMITED: 'RATE_LIMITED',\n TIMEOUT: 'TIMEOUT',\n NETWORK_ERROR: 'NETWORK_ERROR',\n SERVICE_UNAVAILABLE: 'SERVICE_UNAVAILABLE',\n \n // Non-retryable errors\n AUTH_ERROR: 'AUTH_ERROR',\n INVALID_INPUT: 'INVALID_INPUT',\n NOT_FOUND: 'NOT_FOUND',\n QUOTA_EXCEEDED: 'QUOTA_EXCEEDED',\n UNSUPPORTED_BINARY_CONTENT: 'UNSUPPORTED_BINARY_CONTENT',\n\n // Internal errors\n INTERNAL_ERROR: 'INTERNAL_ERROR',\n PARSE_ERROR: 'PARSE_ERROR',\n UNKNOWN_ERROR: 'UNKNOWN_ERROR',\n} as const;\n\ntype ErrorCodeType = typeof ErrorCode[keyof typeof ErrorCode];\n\n// ============================================================================\n// Structured Error Types\n// ============================================================================\n\nexport interface StructuredError {\n code: ErrorCodeType;\n message: string;\n retryable: boolean;\n statusCode?: number;\n cause?: string;\n}\n\ninterface RetryOptions {\n readonly maxRetries: number;\n readonly baseDelayMs: number;\n readonly maxDelayMs: number;\n readonly retryableStatuses: readonly number[];\n readonly onRetry?: (attempt: number, error: StructuredError, delayMs: number) => void;\n}\n\nconst DEFAULT_RETRY_OPTIONS: RetryOptions = {\n maxRetries: 3,\n baseDelayMs: 1000,\n maxDelayMs: 30000,\n retryableStatuses: [408, 429, 500, 502, 503, 504, 510],\n};\n\n// ============================================================================\n// Error Classification \u2014 Atomic Classifiers\n// ============================================================================\n\n/**\n * Classify DOMException (AbortError from AbortController timeouts)\n */\nfunction classifyDomException(error: DOMException): StructuredError {\n if (error.name === 'AbortError') {\n return { code: ErrorCode.TIMEOUT, message: 'Request timed out', retryable: true };\n }\n return { code: ErrorCode.UNKNOWN_ERROR, message: error.message, retryable: false };\n}\n\n/**\n * Classify by Node.js error codes (ECONNREFUSED, ENOTFOUND, etc.)\n * Returns null if no matching code is found.\n */\nfunction classifyByErrorCode(error: { code?: string; message?: string }): StructuredError | null {\n const errCode = error.code;\n if (!errCode) return null;\n\n const networkErrorMessages: Record<string, string> = {\n ECONNREFUSED: 'Connection refused \u2014 service may be down',\n ECONNRESET: 'Connection was reset \u2014 please retry',\n ECONNABORTED: 'Connection aborted \u2014 please retry',\n ENOTFOUND: 'Service not reachable \u2014 check your network',\n EPIPE: 'Connection lost \u2014 please retry',\n EAI_AGAIN: 'DNS lookup failed \u2014 check your network',\n };\n\n if (errCode === 'ECONNREFUSED' || errCode === 'ENOTFOUND' || errCode === 'ECONNRESET') {\n return { code: ErrorCode.NETWORK_ERROR, message: networkErrorMessages[errCode] || 'Network connection failed', retryable: true, cause: error.message };\n }\n\n if (errCode === 'ECONNABORTED' || errCode === 'ETIMEDOUT') {\n return { code: ErrorCode.TIMEOUT, message: networkErrorMessages[errCode] || 'Request timed out', retryable: true, cause: error.message };\n }\n\n return null;\n}\n\n/**\n * Classify by 
HTTP status code extracted from error objects (axios-style, fetch-style, etc.)\n * Returns null if no status code is found.\n */\nfunction classifyByStatusCode(error: { status?: number; statusCode?: number; response?: { status?: number }; message?: string }): StructuredError | null {\n const status = error.response?.status || error.status || error.statusCode;\n if (!status) return null;\n return classifyHttpError(status, error.message || String(error));\n}\n\n/**\n * Classify by error message patterns (timeout, rate-limit, auth, parse errors)\n * Returns null if no pattern matches.\n */\nfunction classifyByMessage(message: string): StructuredError | null {\n const lower = message.toLowerCase();\n\n // Timeout patterns\n if (lower.includes('timeout') || lower.includes('timed out') || lower.includes('aborterror')) {\n return { code: ErrorCode.TIMEOUT, message: 'Request timed out', retryable: true, cause: message };\n }\n\n // Rate-limit patterns\n if (lower.includes('rate limit') || lower.includes('too many requests')) {\n return { code: ErrorCode.RATE_LIMITED, message: 'Rate limit exceeded', retryable: true, cause: message };\n }\n\n // API key errors\n if (message.includes('API_KEY') || message.includes('api_key') || message.includes('Invalid API')) {\n return { code: ErrorCode.AUTH_ERROR, message: 'API key missing or invalid', retryable: false, cause: message };\n }\n\n // Parse errors\n if (message.includes('JSON') || message.includes('parse') || message.includes('Unexpected token')) {\n return { code: ErrorCode.PARSE_ERROR, message: 'Failed to parse response', retryable: false, cause: message };\n }\n\n return null;\n}\n\n/**\n * Catch-all fallback classification when no other classifier matches.\n */\nfunction classifyFallback(message: string, cause?: unknown): StructuredError {\n return {\n code: ErrorCode.UNKNOWN_ERROR,\n message,\n retryable: false,\n cause: cause ? String(cause) : undefined,\n };\n}\n\n// ============================================================================\n// Main Error Classification Pipeline\n// ============================================================================\n\n/**\n * Classify any error into a structured format.\n * NEVER throws \u2014 always returns a valid StructuredError.\n */\nexport function classifyError(error: unknown): StructuredError {\n if (error == null) {\n return { code: ErrorCode.UNKNOWN_ERROR, message: 'An unknown error occurred', retryable: false };\n }\n\n if (error instanceof DOMException) return classifyDomException(error);\n\n if (!isErrorLike(error)) {\n return { code: ErrorCode.UNKNOWN_ERROR, message: String(error), retryable: false };\n }\n\n return classifyByErrorCode(error)\n ?? classifyByStatusCode(error)\n ?? classifyByMessage(error.message ?? String(error))\n ?? classifyFallback(error.message ?? 
String(error), error.cause);\n}\n\n/**\n * Type guard for error-like objects with common error properties\n */\nfunction isErrorLike(value: unknown): value is {\n message?: string;\n response?: { status?: number; data?: unknown };\n status?: number;\n statusCode?: number;\n code?: string;\n name?: string;\n cause?: unknown;\n} {\n return typeof value === 'object' && value !== null;\n}\n\n/**\n * Classify HTTP status codes into structured errors.\n * Exhaustive switch with grouped default handling for unknown ranges.\n */\nfunction classifyHttpError(status: number, message: string): StructuredError {\n switch (status) {\n case 400:\n return { code: ErrorCode.INVALID_INPUT, message: 'Bad request', retryable: false, statusCode: status };\n case 401:\n return { code: ErrorCode.AUTH_ERROR, message: 'Invalid API key', retryable: false, statusCode: status };\n case 403:\n return { code: ErrorCode.QUOTA_EXCEEDED, message: 'Access forbidden or quota exceeded', retryable: false, statusCode: status };\n case 404:\n return { code: ErrorCode.NOT_FOUND, message: 'Resource not found', retryable: false, statusCode: status };\n case 408:\n return { code: ErrorCode.TIMEOUT, message: 'Request timeout', retryable: true, statusCode: status };\n case 429:\n return { code: ErrorCode.RATE_LIMITED, message: 'Rate limit exceeded', retryable: true, statusCode: status };\n case 500:\n return { code: ErrorCode.INTERNAL_ERROR, message: 'Server error', retryable: true, statusCode: status };\n case 502:\n return { code: ErrorCode.SERVICE_UNAVAILABLE, message: 'Bad gateway', retryable: true, statusCode: status };\n case 503:\n return { code: ErrorCode.SERVICE_UNAVAILABLE, message: 'Service unavailable', retryable: true, statusCode: status };\n case 504:\n return { code: ErrorCode.TIMEOUT, message: 'Gateway timeout', retryable: true, statusCode: status };\n case 510:\n return { code: ErrorCode.SERVICE_UNAVAILABLE, message: 'Request canceled', retryable: true, statusCode: status };\n default:\n if (status >= 500) {\n return { code: ErrorCode.SERVICE_UNAVAILABLE, message: `Server error: ${status}`, retryable: true, statusCode: status };\n }\n if (status >= 400) {\n return { code: ErrorCode.INVALID_INPUT, message: `Client error: ${status}`, retryable: false, statusCode: status };\n }\n return { code: ErrorCode.UNKNOWN_ERROR, message: `HTTP ${status}: ${message}`, retryable: false, statusCode: status };\n }\n}\n\n// ============================================================================\n// Retry Logic with Exponential Backoff\n// ============================================================================\n\n/**\n * Calculate delay with exponential backoff and jitter\n */\nfunction calculateBackoff(attempt: number, options: RetryOptions): number {\n const exponentialDelay = options.baseDelayMs * Math.pow(2, attempt);\n const jitter = Math.random() * 0.3 * exponentialDelay; // 0-30% jitter\n return Math.min(exponentialDelay + jitter, options.maxDelayMs);\n}\n\n/**\n * Sleep utility that respects abort signals\n */\nexport function sleep(ms: number, signal?: AbortSignal): Promise<void> {\n return new Promise((resolve, reject) => {\n if (signal?.aborted) {\n reject(new DOMException('Aborted', 'AbortError'));\n return;\n }\n\n function onAbort() {\n clearTimeout(timeout);\n reject(new DOMException('Aborted', 'AbortError'));\n }\n\n const timeout = setTimeout(() => {\n if (signal) signal.removeEventListener('abort', onAbort);\n resolve();\n }, ms);\n\n signal?.addEventListener('abort', onAbort, { once: true });\n // Re-check: 
signal may have aborted between initial check and listener registration\n if (signal?.aborted) {\n onAbort();\n }\n });\n}\n\n/**\n * Wrap a fetch call with timeout via AbortController\n */\nexport function fetchWithTimeout(\n url: string,\n options: RequestInit & { timeoutMs?: number } = {}\n): Promise<Response> {\n const { timeoutMs = 30000, signal: externalSignal, ...fetchOptions } = options;\n\n const controller = new AbortController();\n const timeoutId = setTimeout(() => controller.abort(), timeoutMs);\n\n let onExternalAbort: (() => void) | undefined;\n if (externalSignal) {\n onExternalAbort = () => controller.abort();\n externalSignal.addEventListener('abort', onExternalAbort, { once: true });\n if (externalSignal.aborted) {\n controller.abort();\n }\n }\n\n return fetch(url, { ...fetchOptions, signal: controller.signal }).finally(() => {\n clearTimeout(timeoutId);\n if (externalSignal && onExternalAbort) {\n externalSignal.removeEventListener('abort', onExternalAbort);\n }\n });\n}\n\n// ============================================================================\n// Stability Wrappers \u2014 Network resilience for LLM API calls\n// ============================================================================\n\n/**\n * Wrap a non-streaming API call with activity-based timeout detection.\n * If the call hasn't completed within `stallMs`, abort and retry.\n * This catches \"stuck\" connections where TCP stays open but no data flows.\n *\n * @param fn - Async function that accepts an AbortSignal\n * @param stallMs - Max milliseconds to wait for the call to complete before considering it stuck\n * @param maxAttempts - Max retry attempts for stalled requests\n * @param label - Label for log messages\n * @returns The result of the function\n */\nexport async function withStallProtection<T>(\n fn: (signal: AbortSignal) => Promise<T>,\n stallMs: number,\n maxAttempts: number = 2,\n label: string = 'request',\n): Promise<T> {\n for (let attempt = 0; attempt < maxAttempts; attempt++) {\n const controller = new AbortController();\n let stallTimer: ReturnType<typeof setTimeout> | undefined;\n\n const stallPromise = new Promise<never>((_, reject) => {\n stallTimer = setTimeout(() => {\n controller.abort();\n reject(Object.assign(new Error(`Service temporarily unavailable \u2014 no response received (attempt ${attempt + 1}/${maxAttempts})`), {\n code: 'ESTALLED',\n retryable: attempt < maxAttempts - 1,\n }));\n }, stallMs);\n });\n\n let fnPromise: Promise<T> | undefined;\n try {\n fnPromise = fn(controller.signal);\n const result = await Promise.race([fnPromise, stallPromise]);\n clearTimeout(stallTimer);\n return result;\n } catch (err) {\n // Suppress unhandled rejection from the losing promise\n // (e.g. fnPromise rejects after stallPromise wins the race)\n fnPromise?.catch(() => {});\n clearTimeout(stallTimer);\n const isStall = err instanceof Error && (err as NodeJS.ErrnoException).code === 'ESTALLED';\n if (isStall && attempt < maxAttempts - 1) {\n const backoff = calculateBackoff(attempt, DEFAULT_RETRY_OPTIONS);\n mcpLog('warning', `${label} stalled, retrying in ${backoff}ms (attempt ${attempt + 1})`, 'stability');\n await sleep(backoff);\n continue;\n }\n throw err;\n }\n }\n // Should never reach here, but TypeScript needs it\n throw new Error(`${label} failed after ${maxAttempts} stall-protection attempts`);\n}\n"],
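For reference, a minimal sketch of the delay curve that the calculateBackoff helper in the embedded source above produces with its DEFAULT_RETRY_OPTIONS (1000 ms base, 30000 ms cap, 0-30% jitter). calculateBackoff is not exported, so this reimplements the arithmetic rather than importing it; the numbers in the trailing comments follow directly from the formula.

const baseDelayMs = 1_000;   // DEFAULT_RETRY_OPTIONS.baseDelayMs
const maxDelayMs = 30_000;   // DEFAULT_RETRY_OPTIONS.maxDelayMs

// Possible delay range for a given attempt: exponential growth plus up to
// 30% random jitter, clamped at maxDelayMs.
function backoffRange(attempt: number): [number, number] {
  const exponential = baseDelayMs * 2 ** attempt;
  const maxJitter = 0.3 * exponential;
  return [Math.min(exponential, maxDelayMs), Math.min(exponential + maxJitter, maxDelayMs)];
}

backoffRange(0); // [1000, 1300]
backoffRange(2); // [4000, 5200]
backoffRange(5); // [30000, 30000] (32000 ms and its jitter are clamped by maxDelayMs)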
|
|
5
|
+
"mappings": "AAKA,SAAS,cAAc;AAMhB,MAAM,YAAY;AAAA;AAAA,EAEvB,cAAc;AAAA,EACd,SAAS;AAAA,EACT,eAAe;AAAA,EACf,qBAAqB;AAAA;AAAA,EAGrB,YAAY;AAAA,EACZ,eAAe;AAAA,EACf,WAAW;AAAA,EACX,gBAAgB;AAAA,EAChB,4BAA4B;AAAA;AAAA,EAG5B,gBAAgB;AAAA,EAChB,aAAa;AAAA,EACb,eAAe;AACjB;AAwBA,MAAM,wBAAsC;AAAA,EAC1C,YAAY;AAAA,EACZ,aAAa;AAAA,EACb,YAAY;AAAA,EACZ,mBAAmB,CAAC,KAAK,KAAK,KAAK,KAAK,KAAK,KAAK,GAAG;AACvD;AASA,SAAS,qBAAqB,OAAsC;AAClE,MAAI,MAAM,SAAS,cAAc;AAC/B,WAAO,EAAE,MAAM,UAAU,SAAS,SAAS,qBAAqB,WAAW,KAAK;AAAA,EAClF;AACA,SAAO,EAAE,MAAM,UAAU,eAAe,SAAS,MAAM,SAAS,WAAW,MAAM;AACnF;AAMA,SAAS,oBAAoB,OAAoE;AAC/F,QAAM,UAAU,MAAM;AACtB,MAAI,CAAC,QAAS,QAAO;AAErB,QAAM,uBAA+C;AAAA,IACnD,cAAc;AAAA,IACd,YAAY;AAAA,IACZ,cAAc;AAAA,IACd,WAAW;AAAA,IACX,OAAO;AAAA,IACP,WAAW;AAAA,EACb;AAEA,MAAI,YAAY,kBAAkB,YAAY,eAAe,YAAY,cAAc;AACrF,WAAO,EAAE,MAAM,UAAU,eAAe,SAAS,qBAAqB,OAAO,KAAK,6BAA6B,WAAW,MAAM,OAAO,MAAM,QAAQ;AAAA,EACvJ;AAEA,MAAI,YAAY,kBAAkB,YAAY,aAAa;AACzD,WAAO,EAAE,MAAM,UAAU,SAAS,SAAS,qBAAqB,OAAO,KAAK,qBAAqB,WAAW,MAAM,OAAO,MAAM,QAAQ;AAAA,EACzI;AAEA,SAAO;AACT;AAMA,SAAS,qBAAqB,OAA2H;AACvJ,QAAM,SAAS,MAAM,UAAU,UAAU,MAAM,UAAU,MAAM;AAC/D,MAAI,CAAC,OAAQ,QAAO;AACpB,SAAO,kBAAkB,QAAQ,MAAM,WAAW,OAAO,KAAK,CAAC;AACjE;AAMA,SAAS,kBAAkB,SAAyC;AAClE,QAAM,QAAQ,QAAQ,YAAY;AAGlC,MAAI,MAAM,SAAS,SAAS,KAAK,MAAM,SAAS,WAAW,KAAK,MAAM,SAAS,YAAY,GAAG;AAC5F,WAAO,EAAE,MAAM,UAAU,SAAS,SAAS,qBAAqB,WAAW,MAAM,OAAO,QAAQ;AAAA,EAClG;AAGA,MAAI,MAAM,SAAS,YAAY,KAAK,MAAM,SAAS,mBAAmB,GAAG;AACvE,WAAO,EAAE,MAAM,UAAU,cAAc,SAAS,uBAAuB,WAAW,MAAM,OAAO,QAAQ;AAAA,EACzG;AAGA,MAAI,QAAQ,SAAS,SAAS,KAAK,QAAQ,SAAS,SAAS,KAAK,QAAQ,SAAS,aAAa,GAAG;AACjG,WAAO,EAAE,MAAM,UAAU,YAAY,SAAS,8BAA8B,WAAW,OAAO,OAAO,QAAQ;AAAA,EAC/G;AAGA,MAAI,QAAQ,SAAS,MAAM,KAAK,QAAQ,SAAS,OAAO,KAAK,QAAQ,SAAS,kBAAkB,GAAG;AACjG,WAAO,EAAE,MAAM,UAAU,aAAa,SAAS,4BAA4B,WAAW,OAAO,OAAO,QAAQ;AAAA,EAC9G;AAEA,SAAO;AACT;AAKA,SAAS,iBAAiB,SAAiB,OAAkC;AAC3E,SAAO;AAAA,IACL,MAAM,UAAU;AAAA,IAChB;AAAA,IACA,WAAW;AAAA,IACX,OAAO,QAAQ,OAAO,KAAK,IAAI;AAAA,EACjC;AACF;AAUO,SAAS,cAAc,OAAiC;AAC7D,MAAI,SAAS,MAAM;AACjB,WAAO,EAAE,MAAM,UAAU,eAAe,SAAS,6BAA6B,WAAW,MAAM;AAAA,EACjG;AAEA,MAAI,iBAAiB,aAAc,QAAO,qBAAqB,KAAK;AAEpE,MAAI,CAAC,YAAY,KAAK,GAAG;AACvB,WAAO,EAAE,MAAM,UAAU,eAAe,SAAS,OAAO,KAAK,GAAG,WAAW,MAAM;AAAA,EACnF;AAEA,SAAO,oBAAoB,KAAK,KAC3B,qBAAqB,KAAK,KAC1B,kBAAkB,MAAM,WAAW,OAAO,KAAK,CAAC,KAChD,iBAAiB,MAAM,WAAW,OAAO,KAAK,GAAG,MAAM,KAAK;AACnE;AAKA,SAAS,YAAY,OAQnB;AACA,SAAO,OAAO,UAAU,YAAY,UAAU;AAChD;AAMA,SAAS,kBAAkB,QAAgB,SAAkC;AAC3E,UAAQ,QAAQ;AAAA,IACd,KAAK;AACH,aAAO,EAAE,MAAM,UAAU,eAAe,SAAS,eAAe,WAAW,OAAO,YAAY,OAAO;AAAA,IACvG,KAAK;AACH,aAAO,EAAE,MAAM,UAAU,YAAY,SAAS,mBAAmB,WAAW,OAAO,YAAY,OAAO;AAAA,IACxG,KAAK;AACH,aAAO,EAAE,MAAM,UAAU,gBAAgB,SAAS,sCAAsC,WAAW,OAAO,YAAY,OAAO;AAAA,IAC/H,KAAK;AACH,aAAO,EAAE,MAAM,UAAU,WAAW,SAAS,sBAAsB,WAAW,OAAO,YAAY,OAAO;AAAA,IAC1G,KAAK;AACH,aAAO,EAAE,MAAM,UAAU,SAAS,SAAS,mBAAmB,WAAW,MAAM,YAAY,OAAO;AAAA,IACpG,KAAK;AACH,aAAO,EAAE,MAAM,UAAU,cAAc,SAAS,uBAAuB,WAAW,MAAM,YAAY,OAAO;AAAA,IAC7G,KAAK;AACH,aAAO,EAAE,MAAM,UAAU,gBAAgB,SAAS,gBAAgB,WAAW,MAAM,YAAY,OAAO;AAAA,IACxG,KAAK;AACH,aAAO,EAAE,MAAM,UAAU,qBAAqB,SAAS,eAAe,WAAW,MAAM,YAAY,OAAO;AAAA,IAC5G,KAAK;AACH,aAAO,EAAE,MAAM,UAAU,qBAAqB,SAAS,uBAAuB,WAAW,MAAM,YAAY,OAAO;AAAA,IACpH,KAAK;AACH,aAAO,EAAE,MAAM,UAAU,SAAS,SAAS,mBAAmB,WAAW,MAAM,YAAY,OAAO;AAAA,IACpG,KAAK;AACH,aAAO,EAAE,MAAM,UAAU,qBAAqB,SAAS,oBAAoB,WAAW,MAAM,YAAY,OAAO;AAAA,IACjH;AACE,UAAI,UAAU,KAAK;AACjB,eAAO,EAAE,MAAM,UAAU,qBAAqB,SAAS,iBAAiB,MAAM,IAAI,WAAW,MAAM,YAAY,OAAO;AAAA,MACxH;AACA,UAAI,UAAU,KAAK;AACjB,eAAO,EAAE,MAAM,UAAU,eAAe,SAAS,iBAAiB,MAAM,IAAI,WAAW,OAAO,YAAY,OAAO;AAAA,MACnH;AACA,aAAO,EAAE,MAAM,UAAU,eAAe,SAAS,QAAQ,MAAM,KAAK
,OAAO,IAAI,WAAW,OAAO,YAAY,OAAO;AAAA,EACxH;AACF;AASA,SAAS,iBAAiB,SAAiB,SAA+B;AACxE,QAAM,mBAAmB,QAAQ,cAAc,KAAK,IAAI,GAAG,OAAO;AAClE,QAAM,SAAS,KAAK,OAAO,IAAI,MAAM;AACrC,SAAO,KAAK,IAAI,mBAAmB,QAAQ,QAAQ,UAAU;AAC/D;AAKO,SAAS,MAAM,IAAY,QAAqC;AACrE,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,QAAI,QAAQ,SAAS;AACnB,aAAO,IAAI,aAAa,WAAW,YAAY,CAAC;AAChD;AAAA,IACF;AAEA,aAAS,UAAU;AACjB,mBAAa,OAAO;AACpB,aAAO,IAAI,aAAa,WAAW,YAAY,CAAC;AAAA,IAClD;AAEA,UAAM,UAAU,WAAW,MAAM;AAC/B,UAAI,OAAQ,QAAO,oBAAoB,SAAS,OAAO;AACvD,cAAQ;AAAA,IACV,GAAG,EAAE;AAEL,YAAQ,iBAAiB,SAAS,SAAS,EAAE,MAAM,KAAK,CAAC;AAEzD,QAAI,QAAQ,SAAS;AACnB,cAAQ;AAAA,IACV;AAAA,EACF,CAAC;AACH;AAKO,SAAS,iBACd,KACA,UAAgD,CAAC,GAC9B;AACnB,QAAM,EAAE,YAAY,KAAO,QAAQ,gBAAgB,GAAG,aAAa,IAAI;AAEvE,QAAM,aAAa,IAAI,gBAAgB;AACvC,QAAM,YAAY,WAAW,MAAM,WAAW,MAAM,GAAG,SAAS;AAEhE,MAAI;AACJ,MAAI,gBAAgB;AAClB,sBAAkB,MAAM,WAAW,MAAM;AACzC,mBAAe,iBAAiB,SAAS,iBAAiB,EAAE,MAAM,KAAK,CAAC;AACxE,QAAI,eAAe,SAAS;AAC1B,iBAAW,MAAM;AAAA,IACnB;AAAA,EACF;AAEA,SAAO,MAAM,KAAK,EAAE,GAAG,cAAc,QAAQ,WAAW,OAAO,CAAC,EAAE,QAAQ,MAAM;AAC9E,iBAAa,SAAS;AACtB,QAAI,kBAAkB,iBAAiB;AACrC,qBAAe,oBAAoB,SAAS,eAAe;AAAA,IAC7D;AAAA,EACF,CAAC;AACH;AAiBA,eAAsB,oBACpB,IACA,SACA,cAAsB,GACtB,QAAgB,WACJ;AACZ,WAAS,UAAU,GAAG,UAAU,aAAa,WAAW;AACtD,UAAM,aAAa,IAAI,gBAAgB;AACvC,QAAI;AAEJ,UAAM,eAAe,IAAI,QAAe,CAAC,GAAG,WAAW;AACrD,mBAAa,WAAW,MAAM;AAC5B,mBAAW,MAAM;AACjB,eAAO,OAAO,OAAO,IAAI,MAAM,wEAAmE,UAAU,CAAC,IAAI,WAAW,GAAG,GAAG;AAAA,UAChI,MAAM;AAAA,UACN,WAAW,UAAU,cAAc;AAAA,QACrC,CAAC,CAAC;AAAA,MACJ,GAAG,OAAO;AAAA,IACZ,CAAC;AAED,QAAI;AACJ,QAAI;AACF,kBAAY,GAAG,WAAW,MAAM;AAChC,YAAM,SAAS,MAAM,QAAQ,KAAK,CAAC,WAAW,YAAY,CAAC;AAC3D,mBAAa,UAAU;AACvB,aAAO;AAAA,IACT,SAAS,KAAK;AAGZ,iBAAW,MAAM,MAAM;AAAA,MAAC,CAAC;AACzB,mBAAa,UAAU;AACvB,YAAM,UAAU,eAAe,SAAU,IAA8B,SAAS;AAChF,UAAI,WAAW,UAAU,cAAc,GAAG;AACxC,cAAM,UAAU,iBAAiB,SAAS,qBAAqB;AAC/D,eAAO,WAAW,GAAG,KAAK,yBAAyB,OAAO,eAAe,UAAU,CAAC,KAAK,WAAW;AACpG,cAAM,MAAM,OAAO;AACnB;AAAA,MACF;AACA,YAAM;AAAA,IACR;AAAA,EACF;AAEA,QAAM,IAAI,MAAM,GAAG,KAAK,iBAAiB,WAAW,4BAA4B;AAClF;",
|
|
6
6
|
"names": []
|
|
7
7
|
}
|
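A minimal usage sketch of the helpers exported by the errors.ts source embedded above (fetchWithTimeout, withStallProtection, classifyError, ErrorCode). The import path and the fetchBodySafely wrapper are assumptions for illustration; only the exported signatures come from the file itself.

import {
  classifyError,
  fetchWithTimeout,
  withStallProtection,
  ErrorCode,
} from './dist/src/utils/errors.js';

// Hypothetical caller: fetch a page body without letting a hung connection or
// an unclassified exception escape to the tool layer.
async function fetchBodySafely(url: string) {
  try {
    // withStallProtection aborts and retries once if the whole call exceeds 15 s;
    // fetchWithTimeout additionally caps each individual attempt at 10 s.
    const response = await withStallProtection(
      (signal) => fetchWithTimeout(url, { timeoutMs: 10_000, signal }),
      15_000,
      2,
      'example-fetch',
    );
    return { ok: true as const, body: await response.text() };
  } catch (err) {
    // classifyError never throws and always yields a StructuredError.
    const structured = classifyError(err);
    // Non-retryable auth failures are worth surfacing differently from flaky network errors.
    if (structured.code === ErrorCode.AUTH_ERROR) {
      return { ok: false as const, error: structured, hint: 'check the API key' };
    }
    return { ok: false as const, error: structured };
  }
}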
package/dist/src/utils/source-type.js
CHANGED
|
@@ -35,7 +35,46 @@ function classifySourceByUrl(url) {
|
|
|
35
35
|
}
|
|
36
36
|
return "web";
|
|
37
37
|
}
|
|
38
|
+
const DOCUMENT_PATH_SUFFIXES = [
|
|
39
|
+
".pdf",
|
|
40
|
+
".doc",
|
|
41
|
+
".docx",
|
|
42
|
+
".ppt",
|
|
43
|
+
".pptx",
|
|
44
|
+
".xls",
|
|
45
|
+
".xlsx"
|
|
46
|
+
];
|
|
47
|
+
function isDocumentUrl(url) {
|
|
48
|
+
let pathname;
|
|
49
|
+
try {
|
|
50
|
+
pathname = new URL(url).pathname.toLowerCase();
|
|
51
|
+
} catch {
|
|
52
|
+
return false;
|
|
53
|
+
}
|
|
54
|
+
for (const suffix of DOCUMENT_PATH_SUFFIXES) {
|
|
55
|
+
if (pathname.endsWith(suffix)) return true;
|
|
56
|
+
}
|
|
57
|
+
return false;
|
|
58
|
+
}
|
|
59
|
+
const BINARY_CONTENT_TYPE_PREFIXES = [
|
|
60
|
+
"application/pdf",
|
|
61
|
+
"application/msword",
|
|
62
|
+
"application/vnd.openxmlformats-officedocument.",
|
|
63
|
+
"application/vnd.ms-excel",
|
|
64
|
+
"application/vnd.ms-powerpoint",
|
|
65
|
+
"application/octet-stream"
|
|
66
|
+
];
|
|
67
|
+
function isBinaryDocumentContentType(contentType) {
|
|
68
|
+
if (!contentType) return false;
|
|
69
|
+
const lower = contentType.toLowerCase();
|
|
70
|
+
for (const prefix of BINARY_CONTENT_TYPE_PREFIXES) {
|
|
71
|
+
if (lower.startsWith(prefix)) return true;
|
|
72
|
+
}
|
|
73
|
+
return false;
|
|
74
|
+
}
|
|
38
75
|
export {
|
|
39
|
-
classifySourceByUrl
|
|
76
|
+
classifySourceByUrl,
|
|
77
|
+
isBinaryDocumentContentType,
|
|
78
|
+
isDocumentUrl
|
|
40
79
|
};
|
|
41
80
|
//# sourceMappingURL=source-type.js.map
|
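A short sketch of how the two helpers added above behave. The import path and the sample inputs are illustrative assumptions; the expected results in the trailing comments follow from the suffix and prefix lists in the code.

import {
  isDocumentUrl,
  isBinaryDocumentContentType,
} from './dist/src/utils/source-type.js';

// Pre-fetch gate: only the lowercased pathname is inspected, so query strings
// do not matter and invalid URLs simply return false instead of throwing.
isDocumentUrl('https://example.com/reports/q3.pdf');        // true
isDocumentUrl('https://example.com/reports/q3.PDF?dl=1');   // true
isDocumentUrl('https://example.com/reports/q3');            // false
isDocumentUrl('not a url');                                 // false

// Post-fetch gate: matched by Content-Type prefix, so charset parameters and
// the long OOXML subtypes are covered; a missing header returns false.
isBinaryDocumentContentType('application/pdf');                                                          // true
isBinaryDocumentContentType('application/vnd.openxmlformats-officedocument.wordprocessingml.document');  // true
isBinaryDocumentContentType('text/html; charset=utf-8');                                                 // false
isBinaryDocumentContentType(undefined);                                                                  // false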
package/dist/src/utils/source-type.js.map
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"version": 3,
|
|
3
3
|
"sources": ["../../../src/utils/source-type.ts"],
|
|
4
|
-
"sourcesContent": ["/**\n * Hostname/path-heuristic source-type tagging. Works without the LLM\n * classifier so degraded-mode web-search responses still carry a\n * `source_type` field per result. When the LLM classifier IS available,\n * its tag wins (the classifier sees title + snippet as well, not just URL).\n *\n * See: mcp-revisions/output-shaping/06-source-type-tagging-without-llm.md.\n */\n\nexport type SourceType =\n | 'reddit'\n | 'github'\n | 'docs'\n | 'blog'\n | 'paper'\n | 'qa'\n | 'cve'\n | 'news'\n | 'video'\n | 'web';\n\nconst RULES: Array<[RegExp, SourceType]> = [\n // Reddit post permalinks (subreddit homepages are filtered out upstream).\n [/(?:^|\\.)reddit\\.com\\//i, 'reddit'],\n [/(?:^|\\.)github\\.com\\//i, 'github'],\n [/(?:^|\\.)gitlab\\.com\\//i, 'github'],\n // CVE-prefixed paths are unambiguous regardless of host.\n [/\\/CVE-\\d{4}-\\d+/i, 'cve'],\n [/(?:^|\\.)nvd\\.nist\\.gov\\//i, 'cve'],\n [/(?:^|\\.)stackoverflow\\.com\\//i, 'qa'],\n [/(?:^|\\.)stackexchange\\.com\\//i, 'qa'],\n [/(?:^|\\.)arxiv\\.org\\//i, 'paper'],\n [/(?:^|\\.)medium\\.com\\//i, 'blog'],\n [/(?:^|\\.)dev\\.to\\//i, 'blog'],\n [/(?:^|\\.)substack\\.com\\//i, 'blog'],\n // Docs subdomains and /docs/ paths.\n [/^(?:[a-z0-9-]+\\.)*docs\\./i, 'docs'],\n [/\\/docs\\//i, 'docs'],\n [/(?:^|\\.)readthedocs\\.io\\//i, 'docs'],\n // Video.\n [/(?:^|\\.)youtube\\.com\\/watch/i, 'video'],\n [/(?:^|\\.)youtu\\.be\\//i, 'video'],\n // News / engineering blogs (last so it doesn't capture vendor docs).\n [/(?:^|\\.)(?:news|blog|engineering)\\.[a-z0-9-]+\\.[a-z]{2,}\\//i, 'news'],\n];\n\nexport function classifySourceByUrl(url: string): SourceType {\n let candidate: string;\n try {\n const u = new URL(url);\n // Match against `host + pathname` so rules can use either or both.\n candidate = `${u.hostname}${u.pathname}`;\n } catch {\n candidate = url;\n }\n for (const [re, type] of RULES) {\n if (re.test(candidate)) return type;\n }\n return 'web';\n}\n"],
|
|
5
|
-
"mappings": "AAqBA,MAAM,QAAqC;AAAA;AAAA,EAEzC,CAAC,0BAA0B,QAAQ;AAAA,EACnC,CAAC,0BAA0B,QAAQ;AAAA,EACnC,CAAC,0BAA0B,QAAQ;AAAA;AAAA,EAEnC,CAAC,oBAAoB,KAAK;AAAA,EAC1B,CAAC,6BAA6B,KAAK;AAAA,EACnC,CAAC,iCAAiC,IAAI;AAAA,EACtC,CAAC,iCAAiC,IAAI;AAAA,EACtC,CAAC,yBAAyB,OAAO;AAAA,EACjC,CAAC,0BAA0B,MAAM;AAAA,EACjC,CAAC,sBAAsB,MAAM;AAAA,EAC7B,CAAC,4BAA4B,MAAM;AAAA;AAAA,EAEnC,CAAC,6BAA6B,MAAM;AAAA,EACpC,CAAC,aAAa,MAAM;AAAA,EACpB,CAAC,8BAA8B,MAAM;AAAA;AAAA,EAErC,CAAC,gCAAgC,OAAO;AAAA,EACxC,CAAC,wBAAwB,OAAO;AAAA;AAAA,EAEhC,CAAC,+DAA+D,MAAM;AACxE;AAEO,SAAS,oBAAoB,KAAyB;AAC3D,MAAI;AACJ,MAAI;AACF,UAAM,IAAI,IAAI,IAAI,GAAG;AAErB,gBAAY,GAAG,EAAE,QAAQ,GAAG,EAAE,QAAQ;AAAA,EACxC,QAAQ;AACN,gBAAY;AAAA,EACd;AACA,aAAW,CAAC,IAAI,IAAI,KAAK,OAAO;AAC9B,QAAI,GAAG,KAAK,SAAS,EAAG,QAAO;AAAA,EACjC;AACA,SAAO;AACT;",
|
|
4
|
+
"sourcesContent": ["/**\n * Hostname/path-heuristic source-type tagging. Works without the LLM\n * classifier so degraded-mode web-search responses still carry a\n * `source_type` field per result. When the LLM classifier IS available,\n * its tag wins (the classifier sees title + snippet as well, not just URL).\n *\n * See: mcp-revisions/output-shaping/06-source-type-tagging-without-llm.md.\n */\n\nexport type SourceType =\n | 'reddit'\n | 'github'\n | 'docs'\n | 'blog'\n | 'paper'\n | 'qa'\n | 'cve'\n | 'news'\n | 'video'\n | 'web';\n\nconst RULES: Array<[RegExp, SourceType]> = [\n // Reddit post permalinks (subreddit homepages are filtered out upstream).\n [/(?:^|\\.)reddit\\.com\\//i, 'reddit'],\n [/(?:^|\\.)github\\.com\\//i, 'github'],\n [/(?:^|\\.)gitlab\\.com\\//i, 'github'],\n // CVE-prefixed paths are unambiguous regardless of host.\n [/\\/CVE-\\d{4}-\\d+/i, 'cve'],\n [/(?:^|\\.)nvd\\.nist\\.gov\\//i, 'cve'],\n [/(?:^|\\.)stackoverflow\\.com\\//i, 'qa'],\n [/(?:^|\\.)stackexchange\\.com\\//i, 'qa'],\n [/(?:^|\\.)arxiv\\.org\\//i, 'paper'],\n [/(?:^|\\.)medium\\.com\\//i, 'blog'],\n [/(?:^|\\.)dev\\.to\\//i, 'blog'],\n [/(?:^|\\.)substack\\.com\\//i, 'blog'],\n // Docs subdomains and /docs/ paths.\n [/^(?:[a-z0-9-]+\\.)*docs\\./i, 'docs'],\n [/\\/docs\\//i, 'docs'],\n [/(?:^|\\.)readthedocs\\.io\\//i, 'docs'],\n // Video.\n [/(?:^|\\.)youtube\\.com\\/watch/i, 'video'],\n [/(?:^|\\.)youtu\\.be\\//i, 'video'],\n // News / engineering blogs (last so it doesn't capture vendor docs).\n [/(?:^|\\.)(?:news|blog|engineering)\\.[a-z0-9-]+\\.[a-z]{2,}\\//i, 'news'],\n];\n\nexport function classifySourceByUrl(url: string): SourceType {\n let candidate: string;\n try {\n const u = new URL(url);\n // Match against `host + pathname` so rules can use either or both.\n candidate = `${u.hostname}${u.pathname}`;\n } catch {\n candidate = url;\n }\n for (const [re, type] of RULES) {\n if (re.test(candidate)) return type;\n }\n return 'web';\n}\n\n// \u2500\u2500 Document-format detection (PDF / Office) \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n// Scrape.do + Readability + Turndown assume HTML input. Binary document\n// formats need a markdown-extraction service (Jina Reader) instead. These two\n// helpers give the scrape pipeline both a pre-fetch gate (URL suffix) and a\n// post-fetch gate (response Content-Type header).\n\nconst DOCUMENT_PATH_SUFFIXES = [\n '.pdf',\n '.doc', '.docx',\n '.ppt', '.pptx',\n '.xls', '.xlsx',\n] as const;\n\n/**\n * Pre-fetch gate: does this URL's path end in a known binary-document suffix?\n * Case-insensitive. Trailing query strings / fragments are ignored \u2014 only the\n * pathname is inspected. Invalid URLs return false (handled upstream).\n */\nexport function isDocumentUrl(url: string): boolean {\n let pathname: string;\n try {\n pathname = new URL(url).pathname.toLowerCase();\n } catch {\n return false;\n }\n for (const suffix of DOCUMENT_PATH_SUFFIXES) {\n if (pathname.endsWith(suffix)) return true;\n }\n return false;\n}\n\nconst BINARY_CONTENT_TYPE_PREFIXES = [\n 'application/pdf',\n 'application/msword',\n 'application/vnd.openxmlformats-officedocument.',\n 'application/vnd.ms-excel',\n 'application/vnd.ms-powerpoint',\n 'application/octet-stream',\n] as const;\n\n/**\n * Post-fetch gate: does this Content-Type header indicate a binary document\n * that our HTML pipeline cannot decode? 
Returns false for HTML/JSON/plain text\n * and for unknown/missing content-types (the upstream pipeline can still try).\n */\nexport function isBinaryDocumentContentType(contentType: string | null | undefined): boolean {\n if (!contentType) return false;\n const lower = contentType.toLowerCase();\n for (const prefix of BINARY_CONTENT_TYPE_PREFIXES) {\n if (lower.startsWith(prefix)) return true;\n }\n return false;\n}\n"],
|
|
5
|
+
"mappings": "AAqBA,MAAM,QAAqC;AAAA;AAAA,EAEzC,CAAC,0BAA0B,QAAQ;AAAA,EACnC,CAAC,0BAA0B,QAAQ;AAAA,EACnC,CAAC,0BAA0B,QAAQ;AAAA;AAAA,EAEnC,CAAC,oBAAoB,KAAK;AAAA,EAC1B,CAAC,6BAA6B,KAAK;AAAA,EACnC,CAAC,iCAAiC,IAAI;AAAA,EACtC,CAAC,iCAAiC,IAAI;AAAA,EACtC,CAAC,yBAAyB,OAAO;AAAA,EACjC,CAAC,0BAA0B,MAAM;AAAA,EACjC,CAAC,sBAAsB,MAAM;AAAA,EAC7B,CAAC,4BAA4B,MAAM;AAAA;AAAA,EAEnC,CAAC,6BAA6B,MAAM;AAAA,EACpC,CAAC,aAAa,MAAM;AAAA,EACpB,CAAC,8BAA8B,MAAM;AAAA;AAAA,EAErC,CAAC,gCAAgC,OAAO;AAAA,EACxC,CAAC,wBAAwB,OAAO;AAAA;AAAA,EAEhC,CAAC,+DAA+D,MAAM;AACxE;AAEO,SAAS,oBAAoB,KAAyB;AAC3D,MAAI;AACJ,MAAI;AACF,UAAM,IAAI,IAAI,IAAI,GAAG;AAErB,gBAAY,GAAG,EAAE,QAAQ,GAAG,EAAE,QAAQ;AAAA,EACxC,QAAQ;AACN,gBAAY;AAAA,EACd;AACA,aAAW,CAAC,IAAI,IAAI,KAAK,OAAO;AAC9B,QAAI,GAAG,KAAK,SAAS,EAAG,QAAO;AAAA,EACjC;AACA,SAAO;AACT;AAQA,MAAM,yBAAyB;AAAA,EAC7B;AAAA,EACA;AAAA,EAAQ;AAAA,EACR;AAAA,EAAQ;AAAA,EACR;AAAA,EAAQ;AACV;AAOO,SAAS,cAAc,KAAsB;AAClD,MAAI;AACJ,MAAI;AACF,eAAW,IAAI,IAAI,GAAG,EAAE,SAAS,YAAY;AAAA,EAC/C,QAAQ;AACN,WAAO;AAAA,EACT;AACA,aAAW,UAAU,wBAAwB;AAC3C,QAAI,SAAS,SAAS,MAAM,EAAG,QAAO;AAAA,EACxC;AACA,SAAO;AACT;AAEA,MAAM,+BAA+B;AAAA,EACnC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAOO,SAAS,4BAA4B,aAAiD;AAC3F,MAAI,CAAC,YAAa,QAAO;AACzB,QAAM,QAAQ,YAAY,YAAY;AACtC,aAAW,UAAU,8BAA8B;AACjD,QAAI,MAAM,WAAW,MAAM,EAAG,QAAO;AAAA,EACvC;AACA,SAAO;AACT;",
|
|
6
6
|
"names": []
|
|
7
7
|
}
|
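The comment block in the source-type.ts source above frames these helpers as a pre-fetch gate and a post-fetch gate that route binary documents toward a markdown-extraction service instead of the HTML pipeline. A hedged sketch of that routing, where convertViaJina and scrapeHtml are hypothetical stand-ins rather than functions from this package:

import { isDocumentUrl, isBinaryDocumentContentType } from './dist/src/utils/source-type.js';

// Hypothetical stand-ins for this sketch only.
const convertViaJina = async (url: string) => `(markdown extracted from ${url})`;
const scrapeHtml = async (html: string) => html;

async function routeScrape(url: string): Promise<string> {
  // Pre-fetch gate: a .pdf/.docx/... suffix already tells us the HTML pipeline cannot decode the body.
  if (isDocumentUrl(url)) {
    return convertViaJina(url);
  }
  const response = await fetch(url);
  // Post-fetch gate: the Content-Type header catches documents served without a telltale suffix.
  if (isBinaryDocumentContentType(response.headers.get('content-type'))) {
    return convertViaJina(url);
  }
  return scrapeHtml(await response.text());
}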
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "mcp-researchpowerpack",
|
|
3
|
-
"version": "6.0.
|
|
3
|
+
"version": "6.0.7",
|
|
4
4
|
"description": "HTTP-first MCP research server: start-research (goal-tailored brief), web-search (with Reddit scope), scrape-links (auto-detects Reddit URLs) — built on mcp-use.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|