mcp-researchpowerpack-http 3.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +124 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +227 -0
- package/dist/index.js.map +7 -0
- package/dist/mcp-use.json +7 -0
- package/dist/src/clients/github.d.ts +83 -0
- package/dist/src/clients/github.d.ts.map +1 -0
- package/dist/src/clients/github.js +370 -0
- package/dist/src/clients/github.js.map +7 -0
- package/dist/src/clients/reddit.d.ts +60 -0
- package/dist/src/clients/reddit.d.ts.map +1 -0
- package/dist/src/clients/reddit.js +287 -0
- package/dist/src/clients/reddit.js.map +7 -0
- package/dist/src/clients/research.d.ts +67 -0
- package/dist/src/clients/research.d.ts.map +1 -0
- package/dist/src/clients/research.js +282 -0
- package/dist/src/clients/research.js.map +7 -0
- package/dist/src/clients/scraper.d.ts +72 -0
- package/dist/src/clients/scraper.d.ts.map +1 -0
- package/dist/src/clients/scraper.js +327 -0
- package/dist/src/clients/scraper.js.map +7 -0
- package/dist/src/clients/search.d.ts +57 -0
- package/dist/src/clients/search.d.ts.map +1 -0
- package/dist/src/clients/search.js +218 -0
- package/dist/src/clients/search.js.map +7 -0
- package/dist/src/config/index.d.ts +93 -0
- package/dist/src/config/index.d.ts.map +1 -0
- package/dist/src/config/index.js +218 -0
- package/dist/src/config/index.js.map +7 -0
- package/dist/src/schemas/deep-research.d.ts +40 -0
- package/dist/src/schemas/deep-research.d.ts.map +1 -0
- package/dist/src/schemas/deep-research.js +216 -0
- package/dist/src/schemas/deep-research.js.map +7 -0
- package/dist/src/schemas/github-score.d.ts +50 -0
- package/dist/src/schemas/github-score.d.ts.map +1 -0
- package/dist/src/schemas/github-score.js +58 -0
- package/dist/src/schemas/github-score.js.map +7 -0
- package/dist/src/schemas/scrape-links.d.ts +23 -0
- package/dist/src/schemas/scrape-links.d.ts.map +1 -0
- package/dist/src/schemas/scrape-links.js +32 -0
- package/dist/src/schemas/scrape-links.js.map +7 -0
- package/dist/src/schemas/web-search.d.ts +18 -0
- package/dist/src/schemas/web-search.d.ts.map +1 -0
- package/dist/src/schemas/web-search.js +28 -0
- package/dist/src/schemas/web-search.js.map +7 -0
- package/dist/src/scoring/github-quality.d.ts +142 -0
- package/dist/src/scoring/github-quality.d.ts.map +1 -0
- package/dist/src/scoring/github-quality.js +202 -0
- package/dist/src/scoring/github-quality.js.map +7 -0
- package/dist/src/services/file-attachment.d.ts +30 -0
- package/dist/src/services/file-attachment.d.ts.map +1 -0
- package/dist/src/services/file-attachment.js +205 -0
- package/dist/src/services/file-attachment.js.map +7 -0
- package/dist/src/services/llm-processor.d.ts +29 -0
- package/dist/src/services/llm-processor.d.ts.map +1 -0
- package/dist/src/services/llm-processor.js +206 -0
- package/dist/src/services/llm-processor.js.map +7 -0
- package/dist/src/services/markdown-cleaner.d.ts +8 -0
- package/dist/src/services/markdown-cleaner.d.ts.map +1 -0
- package/dist/src/services/markdown-cleaner.js +63 -0
- package/dist/src/services/markdown-cleaner.js.map +7 -0
- package/dist/src/tools/github-score.d.ts +12 -0
- package/dist/src/tools/github-score.d.ts.map +1 -0
- package/dist/src/tools/github-score.js +306 -0
- package/dist/src/tools/github-score.js.map +7 -0
- package/dist/src/tools/mcp-helpers.d.ts +27 -0
- package/dist/src/tools/mcp-helpers.d.ts.map +1 -0
- package/dist/src/tools/mcp-helpers.js +47 -0
- package/dist/src/tools/mcp-helpers.js.map +7 -0
- package/dist/src/tools/reddit.d.ts +54 -0
- package/dist/src/tools/reddit.d.ts.map +1 -0
- package/dist/src/tools/reddit.js +498 -0
- package/dist/src/tools/reddit.js.map +7 -0
- package/dist/src/tools/registry.d.ts +3 -0
- package/dist/src/tools/registry.d.ts.map +1 -0
- package/dist/src/tools/registry.js +17 -0
- package/dist/src/tools/registry.js.map +7 -0
- package/dist/src/tools/research.d.ts +14 -0
- package/dist/src/tools/research.d.ts.map +1 -0
- package/dist/src/tools/research.js +250 -0
- package/dist/src/tools/research.js.map +7 -0
- package/dist/src/tools/scrape.d.ts +14 -0
- package/dist/src/tools/scrape.d.ts.map +1 -0
- package/dist/src/tools/scrape.js +290 -0
- package/dist/src/tools/scrape.js.map +7 -0
- package/dist/src/tools/search.d.ts +10 -0
- package/dist/src/tools/search.d.ts.map +1 -0
- package/dist/src/tools/search.js +197 -0
- package/dist/src/tools/search.js.map +7 -0
- package/dist/src/tools/utils.d.ts +105 -0
- package/dist/src/tools/utils.d.ts.map +1 -0
- package/dist/src/tools/utils.js +96 -0
- package/dist/src/tools/utils.js.map +7 -0
- package/dist/src/utils/concurrency.d.ts +28 -0
- package/dist/src/utils/concurrency.d.ts.map +1 -0
- package/dist/src/utils/concurrency.js +62 -0
- package/dist/src/utils/concurrency.js.map +7 -0
- package/dist/src/utils/errors.d.ts +95 -0
- package/dist/src/utils/errors.d.ts.map +1 -0
- package/dist/src/utils/errors.js +289 -0
- package/dist/src/utils/errors.js.map +7 -0
- package/dist/src/utils/logger.d.ts +33 -0
- package/dist/src/utils/logger.d.ts.map +1 -0
- package/dist/src/utils/logger.js +41 -0
- package/dist/src/utils/logger.js.map +7 -0
- package/dist/src/utils/markdown-formatter.d.ts +5 -0
- package/dist/src/utils/markdown-formatter.d.ts.map +1 -0
- package/dist/src/utils/markdown-formatter.js +15 -0
- package/dist/src/utils/markdown-formatter.js.map +7 -0
- package/dist/src/utils/response.d.ts +83 -0
- package/dist/src/utils/response.d.ts.map +1 -0
- package/dist/src/utils/response.js +109 -0
- package/dist/src/utils/response.js.map +7 -0
- package/dist/src/utils/retry.d.ts +43 -0
- package/dist/src/utils/retry.d.ts.map +1 -0
- package/dist/src/utils/retry.js +37 -0
- package/dist/src/utils/retry.js.map +7 -0
- package/dist/src/utils/url-aggregator.d.ts +92 -0
- package/dist/src/utils/url-aggregator.d.ts.map +1 -0
- package/dist/src/utils/url-aggregator.js +357 -0
- package/dist/src/utils/url-aggregator.js.map +7 -0
- package/dist/src/version.d.ts +28 -0
- package/dist/src/version.d.ts.map +1 -0
- package/dist/src/version.js +32 -0
- package/dist/src/version.js.map +7 -0
- package/package.json +73 -0
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/**
 * LLM Processor for content extraction
 * Uses OpenRouter via OPENROUTER_API_KEY for AI-powered content filtering
 * Implements robust retry logic and NEVER throws
 */
import OpenAI from 'openai';
import { type StructuredError } from '../utils/errors.js';
/** Default concurrency for parallel LLM extractions */
export declare const DEFAULT_LLM_CONCURRENCY: 3;
/** Options controlling whether and how LLM extraction runs. */
interface ProcessingConfig {
    readonly use_llm: boolean;
    readonly what_to_extract: string | undefined;
    readonly max_tokens?: number;
}
/** Outcome of one extraction; `processed` is false on any failure path. */
interface LLMResult {
    readonly content: string;
    readonly processed: boolean;
    readonly error?: string;
    readonly errorDetails?: StructuredError;
}
export declare function createLLMProcessor(): OpenAI | null;
/**
 * Process content with LLM extraction
 * NEVER throws - always returns a valid LLMResult
 * Implements retry logic with exponential backoff for transient failures
 */
export declare function processContentWithLLM(content: string, config: ProcessingConfig, processor?: OpenAI | null, signal?: AbortSignal): Promise<LLMResult>;
export {};
//# sourceMappingURL=llm-processor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"llm-processor.d.ts","sourceRoot":"","sources":["../../../src/services/llm-processor.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,MAAM,MAAM,QAAQ,CAAC;AAE5B,OAAO,EAML,KAAK,eAAe,EACrB,MAAM,oBAAoB,CAAC;AAG5B,uDAAuD;AACvD,eAAO,MAAM,uBAAuB,EAAG,CAAU,CAAC;AAiBlD,UAAU,gBAAgB;IACxB,QAAQ,CAAC,OAAO,EAAE,OAAO,CAAC;IAC1B,QAAQ,CAAC,eAAe,EAAE,MAAM,GAAG,SAAS,CAAC;IAC7C,QAAQ,CAAC,UAAU,CAAC,EAAE,MAAM,CAAC;CAC9B;AAED,UAAU,SAAS;IACjB,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,SAAS,EAAE,OAAO,CAAC;IAC5B,QAAQ,CAAC,KAAK,CAAC,EAAE,MAAM,CAAC;IACxB,QAAQ,CAAC,YAAY,CAAC,EAAE,eAAe,CAAC;CACzC;AA8BD,wBAAgB,kBAAkB,IAAI,MAAM,GAAG,IAAI,CA2BlD;AA8DD;;;;GAIG;AACH,wBAAsB,qBAAqB,CACzC,OAAO,EAAE,MAAM,EACf,MAAM,EAAE,gBAAgB,EACxB,SAAS,CAAC,EAAE,MAAM,GAAG,IAAI,EACzB,MAAM,CAAC,EAAE,WAAW,GACnB,OAAO,CAAC,SAAS,CAAC,CA6IpB"}
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
import OpenAI from "openai";
|
|
2
|
+
import { RESEARCH, LLM_EXTRACTION, CEREBRAS, getCapabilities } from "../config/index.js";
|
|
3
|
+
import {
|
|
4
|
+
classifyError,
|
|
5
|
+
sleep,
|
|
6
|
+
ErrorCode,
|
|
7
|
+
withRequestTimeout,
|
|
8
|
+
withStallProtection
|
|
9
|
+
} from "../utils/errors.js";
|
|
10
|
+
import { mcpLog } from "../utils/logger.js";
|
|
11
|
+
/** Default concurrency for parallel LLM extractions. */
const DEFAULT_LLM_CONCURRENCY = 3;
/** Maximum input characters sent to the LLM (~25k tokens). */
const MAX_LLM_INPUT_CHARS = 100000;
/** Timeout handed to the OpenAI client, in milliseconds. */
const LLM_CLIENT_TIMEOUT_MS = 120000;
/** Random-jitter factor applied to exponential backoff delays. */
const BACKOFF_JITTER_FACTOR = 0.3;
/** Stall detection window: abort when no progress within this time. */
const LLM_STALL_TIMEOUT_MS = 15000;
/** Hard deadline for a single LLM request. */
const LLM_REQUEST_DEADLINE_MS = 30000;
/** Retry policy for transient LLM failures. */
const LLM_RETRY_CONFIG = {
  maxRetries: 2,
  baseDelayMs: 1000,
  maxDelayMs: 5000
};
|
|
22
|
+
// OpenAI/OpenRouter error codes that indicate a transient failure
// worth retrying (membership tested via Set.has).
const RETRYABLE_LLM_ERROR_CODES = /* @__PURE__ */ new Set([
  "rate_limit_exceeded",
  "server_error",
  "timeout",
  "service_unavailable"
]);
|
|
28
|
+
/**
 * Type guard: true when the value is an object carrying a numeric
 * HTTP `status` property (the shape thrown by the OpenAI SDK).
 */
function hasStatus(error) {
  if (typeof error !== "object" || error === null) {
    return false;
  }
  return "status" in error && typeof error.status === "number";
}
|
|
31
|
+
// Lazily-created singleton clients: OpenRouter default and Cerebras override.
let llmClient = null;
let cerebrasClient = null;
|
|
33
|
+
/**
 * Return a memoized OpenAI-compatible client for LLM extraction, or null
 * when extraction capability is not configured. When Cerebras is enabled
 * it takes priority over the default OpenRouter backend.
 */
function createLLMProcessor() {
  if (!getCapabilities().llmExtraction) {
    return null;
  }
  // Cerebras takes priority when enabled.
  if (CEREBRAS.ENABLED) {
    if (cerebrasClient === null) {
      cerebrasClient = new OpenAI({
        baseURL: CEREBRAS.BASE_URL,
        apiKey: CEREBRAS.API_KEY,
        timeout: LLM_CLIENT_TIMEOUT_MS,
        maxRetries: 0
      });
      mcpLog("info", `LLM extraction using Cerebras (${CEREBRAS.MODEL})`, "llm");
    }
    return cerebrasClient;
  }
  // Default backend: OpenRouter.
  if (llmClient === null) {
    llmClient = new OpenAI({
      baseURL: RESEARCH.BASE_URL,
      apiKey: RESEARCH.API_KEY,
      timeout: LLM_CLIENT_TIMEOUT_MS,
      maxRetries: 0
    });
  }
  return llmClient;
}
|
|
57
|
+
/**
 * Decide whether an LLM request failure is transient and worth retrying.
 * Checks, in order: stall/timeout protection codes, retryable HTTP
 * statuses, OpenAI-style error codes (top-level or nested), and finally
 * well-known substrings in the error message.
 */
function isRetryableLLMError(error) {
  if (!error || typeof error !== "object") {
    return false;
  }
  // Stall/timeout-protection errors are always retryable.
  const stallCode = error?.code;
  if (stallCode === "ESTALLED" || stallCode === "ETIMEDOUT") {
    return true;
  }
  // Retryable HTTP statuses: rate limiting and 5xx server-side failures.
  if (hasStatus(error) && [429, 500, 502, 503, 504].includes(error.status)) {
    return true;
  }
  // Error codes may appear at the top level or nested under `error`.
  const record = error;
  const nested = typeof record.error === "object" && record.error !== null ? record.error : null;
  let errorCode = typeof record.code === "string" ? record.code : void 0;
  if (errorCode === void 0 && nested) {
    if (typeof nested.code === "string") {
      errorCode = nested.code;
    } else if (typeof nested.type === "string") {
      errorCode = nested.type;
    }
  }
  if (errorCode && RETRYABLE_LLM_ERROR_CODES.has(errorCode)) {
    return true;
  }
  // Last resort: scan the message for common transient-failure phrases.
  const message = typeof record.message === "string" ? record.message.toLowerCase() : "";
  const transientFragments = [
    "rate limit",
    "timeout",
    "timed out",
    "service unavailable",
    "server error",
    "connection",
    "econnreset"
  ];
  return transientFragments.some((fragment) => message.includes(fragment));
}
|
|
81
|
+
/**
 * Exponential backoff with random jitter for LLM retries,
 * capped at LLM_RETRY_CONFIG.maxDelayMs.
 */
function calculateLLMBackoff(attempt) {
  const baseDelay = LLM_RETRY_CONFIG.baseDelayMs * 2 ** attempt;
  const jittered = baseDelay + Math.random() * BACKOFF_JITTER_FACTOR * baseDelay;
  return Math.min(jittered, LLM_RETRY_CONFIG.maxDelayMs);
}
|
|
86
|
+
/**
 * Process content with LLM extraction.
 * NEVER throws - always returns a valid LLMResult; on any failure the
 * original content is returned with error details attached.
 * Implements retry logic with exponential backoff for transient failures.
 *
 * @param content   Raw text to clean/extract from.
 * @param config    Processing options: use_llm, what_to_extract, max_tokens.
 * @param processor OpenAI-compatible client from createLLMProcessor(), or null.
 * @param signal    Optional external AbortSignal for caller-side cancellation.
 */
async function processContentWithLLM(content, config, processor, signal) {
  // Early returns for skip/invalid conditions.
  if (!config.use_llm) {
    return { content, processed: false };
  }
  if (!processor) {
    return {
      content,
      processed: false,
      error: "LLM processor not available (OPENROUTER_API_KEY not set)",
      errorDetails: {
        code: ErrorCode.AUTH_ERROR,
        message: "LLM processor not available",
        retryable: false
      }
    };
  }
  if (!content?.trim()) {
    return { content: content || "", processed: false, error: "Empty content provided" };
  }
  // Truncate extremely long input to stay within token limits.
  const truncatedContent = content.length > MAX_LLM_INPUT_CHARS ? content.substring(0, MAX_LLM_INPUT_CHARS) + "\n\n[Content truncated due to length]" : content;
  const prompt = config.what_to_extract ? `Extract and clean the following content. Focus on: ${config.what_to_extract}\n\nContent:\n${truncatedContent}` : `Clean and extract the main content from the following text, removing navigation, ads, and irrelevant elements:\n\n${truncatedContent}`;
  // Select model based on Cerebras availability.
  const activeModel = CEREBRAS.ENABLED ? CEREBRAS.MODEL : LLM_EXTRACTION.MODEL;
  const requestBody = {
    model: activeModel,
    messages: [{ role: "user", content: prompt }],
    max_tokens: config.max_tokens || LLM_EXTRACTION.MAX_TOKENS
  };
  // Cerebras doesn't support the reasoning parameter.
  if (!CEREBRAS.ENABLED && LLM_EXTRACTION.ENABLE_REASONING) {
    requestBody.reasoning = { enabled: true };
  }
  let lastError;
  for (let attempt = 0; attempt <= LLM_RETRY_CONFIG.maxRetries; attempt++) {
    try {
      if (attempt === 0) {
        mcpLog("info", `Starting extraction with ${activeModel}${CEREBRAS.ENABLED ? " (Cerebras)" : ""}`, "llm");
      } else {
        mcpLog("warning", `Retry attempt ${attempt}/${LLM_RETRY_CONFIG.maxRetries}`, "llm");
      }
      const response = await withStallProtection(
        (stallSignal) => withRequestTimeout(
          (timeoutSignal) => {
            // Merge the external, stall, and timeout signals into one controller.
            const mergedController = new AbortController();
            const abortMerged = () => mergedController.abort();
            signal?.addEventListener("abort", abortMerged, { once: true });
            stallSignal.addEventListener("abort", abortMerged, { once: true });
            timeoutSignal.addEventListener("abort", abortMerged, { once: true });
            // BUGFIX: an 'abort' listener added to an already-aborted signal
            // never fires, so a pre-existing abort must be propagated
            // explicitly or the request would proceed despite cancellation.
            if (signal?.aborted || stallSignal.aborted || timeoutSignal.aborted) {
              mergedController.abort();
            }
            return processor.chat.completions.create(
              requestBody,
              { signal: mergedController.signal }
            ).finally(() => {
              signal?.removeEventListener("abort", abortMerged);
              stallSignal.removeEventListener("abort", abortMerged);
              timeoutSignal.removeEventListener("abort", abortMerged);
            });
          },
          LLM_REQUEST_DEADLINE_MS,
          `LLM extraction (${activeModel})`
        ),
        LLM_STALL_TIMEOUT_MS,
        3,
        `LLM extraction (${activeModel})`
      );
      const result = response.choices?.[0]?.message?.content;
      if (result && result.trim()) {
        mcpLog("info", `Successfully extracted ${result.length} characters`, "llm");
        return { content: result, processed: true };
      }
      // An empty completion is not retryable - fail fast.
      mcpLog("warning", "Received empty response from LLM", "llm");
      return {
        content,
        processed: false,
        error: "LLM returned empty response",
        errorDetails: {
          code: ErrorCode.INTERNAL_ERROR,
          message: "LLM returned empty response",
          retryable: false
        }
      };
    } catch (err) {
      lastError = classifyError(err);
      const status = hasStatus(err) ? err.status : void 0;
      const code = typeof err === "object" && err !== null && "code" in err ? String(err.code) : void 0;
      mcpLog("error", `Error (attempt ${attempt + 1}): ${lastError.message} [status=${status}, code=${code}, retryable=${isRetryableLLMError(err)}]`, "llm");
      if (isRetryableLLMError(err) && attempt < LLM_RETRY_CONFIG.maxRetries) {
        const delayMs = calculateLLMBackoff(attempt);
        mcpLog("warning", `Retrying in ${delayMs}ms...`, "llm");
        try {
          // sleep rejects when the caller aborts; stop retrying in that case.
          await sleep(delayMs, signal);
        } catch {
          break;
        }
        continue;
      }
      // Non-retryable error or max retries reached.
      break;
    }
  }
  // All attempts failed - return original content with error info.
  const errorMessage = lastError?.message || "Unknown LLM error";
  mcpLog("error", `All attempts failed: ${errorMessage}. Returning original content.`, "llm");
  return {
    content,
    // Return original content as fallback
    processed: false,
    error: `LLM extraction failed: ${errorMessage}`,
    errorDetails: lastError || {
      code: ErrorCode.UNKNOWN_ERROR,
      message: errorMessage,
      retryable: false
    }
  };
}
|
|
201
|
+
export {
|
|
202
|
+
DEFAULT_LLM_CONCURRENCY,
|
|
203
|
+
createLLMProcessor,
|
|
204
|
+
processContentWithLLM
|
|
205
|
+
};
|
|
206
|
+
//# sourceMappingURL=llm-processor.js.map
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
{
|
|
2
|
+
"version": 3,
|
|
3
|
+
"sources": ["../../../src/services/llm-processor.ts"],
|
|
4
|
+
"sourcesContent": ["/**\n * LLM Processor for content extraction\n * Uses OpenRouter via OPENROUTER_API_KEY for AI-powered content filtering\n * Implements robust retry logic and NEVER throws\n */\n\nimport OpenAI from 'openai';\nimport { RESEARCH, LLM_EXTRACTION, CEREBRAS, getCapabilities } from '../config/index.js';\nimport {\n classifyError,\n sleep,\n ErrorCode,\n withRequestTimeout,\n withStallProtection,\n type StructuredError,\n} from '../utils/errors.js';\nimport { mcpLog } from '../utils/logger.js';\n\n/** Default concurrency for parallel LLM extractions */\nexport const DEFAULT_LLM_CONCURRENCY = 3 as const;\n\n/** Maximum input characters for LLM processing (~25k tokens) */\nconst MAX_LLM_INPUT_CHARS = 100_000 as const;\n\n/** LLM client timeout in milliseconds */\nconst LLM_CLIENT_TIMEOUT_MS = 120_000 as const;\n\n/** Jitter factor for exponential backoff */\nconst BACKOFF_JITTER_FACTOR = 0.3 as const;\n\n/** Stall detection timeout \u2014 abort if no response in this time */\nconst LLM_STALL_TIMEOUT_MS = 15_000 as const;\n\n/** Hard request deadline for LLM calls */\nconst LLM_REQUEST_DEADLINE_MS = 30_000 as const;\n\ninterface ProcessingConfig {\n readonly use_llm: boolean;\n readonly what_to_extract: string | undefined;\n readonly max_tokens?: number;\n}\n\ninterface LLMResult {\n readonly content: string;\n readonly processed: boolean;\n readonly error?: string;\n readonly errorDetails?: StructuredError;\n}\n\n// LLM-specific retry configuration\nconst LLM_RETRY_CONFIG = {\n maxRetries: 2,\n baseDelayMs: 1000,\n maxDelayMs: 5000,\n} as const;\n\n// OpenRouter/OpenAI specific retryable error codes (using Set for type-safe lookup)\nconst RETRYABLE_LLM_ERROR_CODES = new Set([\n 'rate_limit_exceeded',\n 'server_error',\n 'timeout',\n 'service_unavailable',\n]);\n\n/** Type guard for errors with an HTTP status code */\nfunction hasStatus(error: unknown): error is { status: number } {\n return (\n typeof error === 'object' &&\n error !== null &&\n 'status' 
in error &&\n typeof (error as Record<string, unknown>).status === 'number'\n );\n}\n\nlet llmClient: OpenAI | null = null;\nlet cerebrasClient: OpenAI | null = null;\n\nexport function createLLMProcessor(): OpenAI | null {\n if (!getCapabilities().llmExtraction) return null;\n\n // Cerebras takes priority when enabled\n if (CEREBRAS.ENABLED) {\n if (!cerebrasClient) {\n cerebrasClient = new OpenAI({\n baseURL: CEREBRAS.BASE_URL,\n apiKey: CEREBRAS.API_KEY,\n timeout: LLM_CLIENT_TIMEOUT_MS,\n maxRetries: 0,\n });\n mcpLog('info', `LLM extraction using Cerebras (${CEREBRAS.MODEL})`, 'llm');\n }\n return cerebrasClient;\n }\n\n // Default: OpenRouter\n if (!llmClient) {\n llmClient = new OpenAI({\n baseURL: RESEARCH.BASE_URL,\n apiKey: RESEARCH.API_KEY,\n timeout: LLM_CLIENT_TIMEOUT_MS,\n maxRetries: 0,\n });\n }\n return llmClient;\n}\n\n/**\n * Check if an LLM error is retryable\n */\nfunction isRetryableLLMError(error: unknown): boolean {\n if (!error || typeof error !== 'object') return false;\n\n // Stall/timeout protection errors - always retry these\n const stallCode = (error as { code?: string })?.code;\n if (stallCode === 'ESTALLED' || stallCode === 'ETIMEDOUT') {\n return true;\n }\n\n // Check HTTP status codes\n if (hasStatus(error)) {\n if (error.status === 429 || error.status === 500 || error.status === 502 || error.status === 503 || error.status === 504) {\n return true;\n }\n }\n\n // Check error codes from OpenAI/OpenRouter\n const record = error as Record<string, unknown>;\n const code = typeof record.code === 'string' ? record.code : undefined;\n const nested =\n typeof record.error === 'object' && record.error !== null\n ? (record.error as Record<string, unknown>)\n : null;\n const errorCode =\n code ??\n (nested && typeof nested.code === 'string' ? nested.code : undefined) ??\n (nested && typeof nested.type === 'string' ? 
nested.type : undefined);\n if (errorCode && RETRYABLE_LLM_ERROR_CODES.has(errorCode)) {\n return true;\n }\n\n // Check message for common patterns\n const message = typeof record.message === 'string' ? record.message.toLowerCase() : '';\n if (\n message.includes('rate limit') ||\n message.includes('timeout') ||\n message.includes('timed out') ||\n message.includes('service unavailable') ||\n message.includes('server error') ||\n message.includes('connection') ||\n message.includes('econnreset')\n ) {\n return true;\n }\n\n return false;\n}\n\n/**\n * Calculate backoff delay with jitter for LLM retries\n */\nfunction calculateLLMBackoff(attempt: number): number {\n const exponentialDelay = LLM_RETRY_CONFIG.baseDelayMs * Math.pow(2, attempt);\n const jitter = Math.random() * BACKOFF_JITTER_FACTOR * exponentialDelay;\n return Math.min(exponentialDelay + jitter, LLM_RETRY_CONFIG.maxDelayMs);\n}\n\n/**\n * Process content with LLM extraction\n * NEVER throws - always returns a valid LLMResult\n * Implements retry logic with exponential backoff for transient failures\n */\nexport async function processContentWithLLM(\n content: string,\n config: ProcessingConfig,\n processor?: OpenAI | null,\n signal?: AbortSignal\n): Promise<LLMResult> {\n // Early returns for invalid/skip conditions\n if (!config.use_llm) {\n return { content, processed: false };\n }\n\n if (!processor) {\n return {\n content,\n processed: false,\n error: 'LLM processor not available (OPENROUTER_API_KEY not set)',\n errorDetails: {\n code: ErrorCode.AUTH_ERROR,\n message: 'LLM processor not available',\n retryable: false,\n },\n };\n }\n\n if (!content?.trim()) {\n return { content: content || '', processed: false, error: 'Empty content provided' };\n }\n\n // Truncate extremely long content to avoid token limits\n const truncatedContent = content.length > MAX_LLM_INPUT_CHARS\n ? 
content.substring(0, MAX_LLM_INPUT_CHARS) + '\\n\\n[Content truncated due to length]'\n : content;\n\n const prompt = config.what_to_extract\n ? `Extract and clean the following content. Focus on: ${config.what_to_extract}\\n\\nContent:\\n${truncatedContent}`\n : `Clean and extract the main content from the following text, removing navigation, ads, and irrelevant elements:\\n\\n${truncatedContent}`;\n\n // Select model based on Cerebras availability\n const activeModel = CEREBRAS.ENABLED ? CEREBRAS.MODEL : LLM_EXTRACTION.MODEL;\n\n // Build request body\n const requestBody: Record<string, unknown> = {\n model: activeModel,\n messages: [{ role: 'user', content: prompt }],\n max_tokens: config.max_tokens || LLM_EXTRACTION.MAX_TOKENS,\n };\n\n // Cerebras doesn't support reasoning parameter\n if (!CEREBRAS.ENABLED && LLM_EXTRACTION.ENABLE_REASONING) {\n requestBody.reasoning = { enabled: true };\n }\n\n let lastError: StructuredError | undefined;\n\n // Retry loop\n for (let attempt = 0; attempt <= LLM_RETRY_CONFIG.maxRetries; attempt++) {\n try {\n if (attempt === 0) {\n mcpLog('info', `Starting extraction with ${activeModel}${CEREBRAS.ENABLED ? 
' (Cerebras)' : ''}`, 'llm');\n } else {\n mcpLog('warning', `Retry attempt ${attempt}/${LLM_RETRY_CONFIG.maxRetries}`, 'llm');\n }\n\n const response = await withStallProtection(\n (stallSignal) => withRequestTimeout(\n (timeoutSignal) => {\n // Merge external signal, stall signal, and timeout signal\n const mergedController = new AbortController();\n const abortMerged = () => mergedController.abort();\n signal?.addEventListener('abort', abortMerged, { once: true });\n stallSignal.addEventListener('abort', abortMerged, { once: true });\n timeoutSignal.addEventListener('abort', abortMerged, { once: true });\n\n return processor.chat.completions.create(\n requestBody as unknown as OpenAI.ChatCompletionCreateParamsNonStreaming,\n { signal: mergedController.signal }\n ).finally(() => {\n signal?.removeEventListener('abort', abortMerged);\n stallSignal.removeEventListener('abort', abortMerged);\n timeoutSignal.removeEventListener('abort', abortMerged);\n });\n },\n LLM_REQUEST_DEADLINE_MS,\n `LLM extraction (${activeModel})`,\n ),\n LLM_STALL_TIMEOUT_MS,\n 3,\n `LLM extraction (${activeModel})`,\n );\n\n const result = response.choices?.[0]?.message?.content;\n if (result && result.trim()) {\n mcpLog('info', `Successfully extracted ${result.length} characters`, 'llm');\n return { content: result, processed: true };\n }\n\n // Empty response - not retryable\n mcpLog('warning', 'Received empty response from LLM', 'llm');\n return {\n content,\n processed: false,\n error: 'LLM returned empty response',\n errorDetails: {\n code: ErrorCode.INTERNAL_ERROR,\n message: 'LLM returned empty response',\n retryable: false,\n },\n };\n\n } catch (err: unknown) {\n lastError = classifyError(err);\n\n // Log the error\n const status = hasStatus(err) ? err.status : undefined;\n const code = typeof err === 'object' && err !== null && 'code' in err\n ? 
String((err as Record<string, unknown>).code)\n : undefined;\n mcpLog('error', `Error (attempt ${attempt + 1}): ${lastError.message} [status=${status}, code=${code}, retryable=${isRetryableLLMError(err)}]`, 'llm');\n\n // Check if we should retry\n if (isRetryableLLMError(err) && attempt < LLM_RETRY_CONFIG.maxRetries) {\n const delayMs = calculateLLMBackoff(attempt);\n mcpLog('warning', `Retrying in ${delayMs}ms...`, 'llm');\n try { await sleep(delayMs, signal); } catch { break; }\n continue;\n }\n\n // Non-retryable or max retries reached\n break;\n }\n }\n\n // All attempts failed - return original content with error info\n const errorMessage = lastError?.message || 'Unknown LLM error';\n mcpLog('error', `All attempts failed: ${errorMessage}. Returning original content.`, 'llm');\n\n return {\n content, // Return original content as fallback\n processed: false,\n error: `LLM extraction failed: ${errorMessage}`,\n errorDetails: lastError || {\n code: ErrorCode.UNKNOWN_ERROR,\n message: errorMessage,\n retryable: false,\n },\n };\n}\n\n"],
|
|
5
|
+
"mappings": "AAMA,OAAO,YAAY;AACnB,SAAS,UAAU,gBAAgB,UAAU,uBAAuB;AACpE;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,OAEK;AACP,SAAS,cAAc;AAGhB,MAAM,0BAA0B;AAGvC,MAAM,sBAAsB;AAG5B,MAAM,wBAAwB;AAG9B,MAAM,wBAAwB;AAG9B,MAAM,uBAAuB;AAG7B,MAAM,0BAA0B;AAgBhC,MAAM,mBAAmB;AAAA,EACvB,YAAY;AAAA,EACZ,aAAa;AAAA,EACb,YAAY;AACd;AAGA,MAAM,4BAA4B,oBAAI,IAAI;AAAA,EACxC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF,CAAC;AAGD,SAAS,UAAU,OAA6C;AAC9D,SACE,OAAO,UAAU,YACjB,UAAU,QACV,YAAY,SACZ,OAAQ,MAAkC,WAAW;AAEzD;AAEA,IAAI,YAA2B;AAC/B,IAAI,iBAAgC;AAE7B,SAAS,qBAAoC;AAClD,MAAI,CAAC,gBAAgB,EAAE,cAAe,QAAO;AAG7C,MAAI,SAAS,SAAS;AACpB,QAAI,CAAC,gBAAgB;AACnB,uBAAiB,IAAI,OAAO;AAAA,QAC1B,SAAS,SAAS;AAAA,QAClB,QAAQ,SAAS;AAAA,QACjB,SAAS;AAAA,QACT,YAAY;AAAA,MACd,CAAC;AACD,aAAO,QAAQ,kCAAkC,SAAS,KAAK,KAAK,KAAK;AAAA,IAC3E;AACA,WAAO;AAAA,EACT;AAGA,MAAI,CAAC,WAAW;AACd,gBAAY,IAAI,OAAO;AAAA,MACrB,SAAS,SAAS;AAAA,MAClB,QAAQ,SAAS;AAAA,MACjB,SAAS;AAAA,MACT,YAAY;AAAA,IACd,CAAC;AAAA,EACH;AACA,SAAO;AACT;AAKA,SAAS,oBAAoB,OAAyB;AACpD,MAAI,CAAC,SAAS,OAAO,UAAU,SAAU,QAAO;AAGhD,QAAM,YAAa,OAA6B;AAChD,MAAI,cAAc,cAAc,cAAc,aAAa;AACzD,WAAO;AAAA,EACT;AAGA,MAAI,UAAU,KAAK,GAAG;AACpB,QAAI,MAAM,WAAW,OAAO,MAAM,WAAW,OAAO,MAAM,WAAW,OAAO,MAAM,WAAW,OAAO,MAAM,WAAW,KAAK;AACxH,aAAO;AAAA,IACT;AAAA,EACF;AAGA,QAAM,SAAS;AACf,QAAM,OAAO,OAAO,OAAO,SAAS,WAAW,OAAO,OAAO;AAC7D,QAAM,SACJ,OAAO,OAAO,UAAU,YAAY,OAAO,UAAU,OAChD,OAAO,QACR;AACN,QAAM,YACJ,SACC,UAAU,OAAO,OAAO,SAAS,WAAW,OAAO,OAAO,YAC1D,UAAU,OAAO,OAAO,SAAS,WAAW,OAAO,OAAO;AAC7D,MAAI,aAAa,0BAA0B,IAAI,SAAS,GAAG;AACzD,WAAO;AAAA,EACT;AAGA,QAAM,UAAU,OAAO,OAAO,YAAY,WAAW,OAAO,QAAQ,YAAY,IAAI;AACpF,MACE,QAAQ,SAAS,YAAY,KAC7B,QAAQ,SAAS,SAAS,KAC1B,QAAQ,SAAS,WAAW,KAC5B,QAAQ,SAAS,qBAAqB,KACtC,QAAQ,SAAS,cAAc,KAC/B,QAAQ,SAAS,YAAY,KAC7B,QAAQ,SAAS,YAAY,GAC7B;AACA,WAAO;AAAA,EACT;AAEA,SAAO;AACT;AAKA,SAAS,oBAAoB,SAAyB;AACpD,QAAM,mBAAmB,iBAAiB,cAAc,KAAK,IAAI,GAAG,OAAO;AAC3E,QAAM,SAAS,KAAK,OAAO,IAAI,wBAAwB;AACvD,SAAO,KAAK,IAAI,mBAAmB,QAAQ,iBAAiB,UAAU;AACxE;AAOA,eAAsB,sBACpB,SACA,QACA,WACA,QACoB;AAEpB,MAAI,CAAC,OAAO,SAAS;AAC
nB,WAAO,EAAE,SAAS,WAAW,MAAM;AAAA,EACrC;AAEA,MAAI,CAAC,WAAW;AACd,WAAO;AAAA,MACL;AAAA,MACA,WAAW;AAAA,MACX,OAAO;AAAA,MACP,cAAc;AAAA,QACZ,MAAM,UAAU;AAAA,QAChB,SAAS;AAAA,QACT,WAAW;AAAA,MACb;AAAA,IACF;AAAA,EACF;AAEA,MAAI,CAAC,SAAS,KAAK,GAAG;AACpB,WAAO,EAAE,SAAS,WAAW,IAAI,WAAW,OAAO,OAAO,yBAAyB;AAAA,EACrF;AAGA,QAAM,mBAAmB,QAAQ,SAAS,sBACtC,QAAQ,UAAU,GAAG,mBAAmB,IAAI,0CAC5C;AAEJ,QAAM,SAAS,OAAO,kBAClB,sDAAsD,OAAO,eAAe;AAAA;AAAA;AAAA,EAAiB,gBAAgB,KAC7G;AAAA;AAAA,EAAqH,gBAAgB;AAGzI,QAAM,cAAc,SAAS,UAAU,SAAS,QAAQ,eAAe;AAGvE,QAAM,cAAuC;AAAA,IAC3C,OAAO;AAAA,IACP,UAAU,CAAC,EAAE,MAAM,QAAQ,SAAS,OAAO,CAAC;AAAA,IAC5C,YAAY,OAAO,cAAc,eAAe;AAAA,EAClD;AAGA,MAAI,CAAC,SAAS,WAAW,eAAe,kBAAkB;AACxD,gBAAY,YAAY,EAAE,SAAS,KAAK;AAAA,EAC1C;AAEA,MAAI;AAGJ,WAAS,UAAU,GAAG,WAAW,iBAAiB,YAAY,WAAW;AACvE,QAAI;AACF,UAAI,YAAY,GAAG;AACjB,eAAO,QAAQ,4BAA4B,WAAW,GAAG,SAAS,UAAU,gBAAgB,EAAE,IAAI,KAAK;AAAA,MACzG,OAAO;AACL,eAAO,WAAW,iBAAiB,OAAO,IAAI,iBAAiB,UAAU,IAAI,KAAK;AAAA,MACpF;AAEA,YAAM,WAAW,MAAM;AAAA,QACrB,CAAC,gBAAgB;AAAA,UACf,CAAC,kBAAkB;AAEjB,kBAAM,mBAAmB,IAAI,gBAAgB;AAC7C,kBAAM,cAAc,MAAM,iBAAiB,MAAM;AACjD,oBAAQ,iBAAiB,SAAS,aAAa,EAAE,MAAM,KAAK,CAAC;AAC7D,wBAAY,iBAAiB,SAAS,aAAa,EAAE,MAAM,KAAK,CAAC;AACjE,0BAAc,iBAAiB,SAAS,aAAa,EAAE,MAAM,KAAK,CAAC;AAEnE,mBAAO,UAAU,KAAK,YAAY;AAAA,cAChC;AAAA,cACA,EAAE,QAAQ,iBAAiB,OAAO;AAAA,YACpC,EAAE,QAAQ,MAAM;AACd,sBAAQ,oBAAoB,SAAS,WAAW;AAChD,0BAAY,oBAAoB,SAAS,WAAW;AACpD,4BAAc,oBAAoB,SAAS,WAAW;AAAA,YACxD,CAAC;AAAA,UACH;AAAA,UACA;AAAA,UACA,mBAAmB,WAAW;AAAA,QAChC;AAAA,QACA;AAAA,QACA;AAAA,QACA,mBAAmB,WAAW;AAAA,MAChC;AAEA,YAAM,SAAS,SAAS,UAAU,CAAC,GAAG,SAAS;AAC/C,UAAI,UAAU,OAAO,KAAK,GAAG;AAC3B,eAAO,QAAQ,0BAA0B,OAAO,MAAM,eAAe,KAAK;AAC1E,eAAO,EAAE,SAAS,QAAQ,WAAW,KAAK;AAAA,MAC5C;AAGA,aAAO,WAAW,oCAAoC,KAAK;AAC3D,aAAO;AAAA,QACL;AAAA,QACA,WAAW;AAAA,QACX,OAAO;AAAA,QACP,cAAc;AAAA,UACZ,MAAM,UAAU;AAAA,UAChB,SAAS;AAAA,UACT,WAAW;AAAA,QACb;AAAA,MACF;AAAA,IAEF,SAAS,KAAc;AACrB,kBAAY,cAAc,GAAG;AAG7B,YAAM,SAAS,UAAU,GAAG,IAAI,IAAI,SAAS;AAC7C,YAAM,OAAO,OAAO,QAAQ,YAAY,QAAQ,QAAQ,UAAU,MAC9D,OAAQ,IAAgC,IAAI,
IAC5C;AACJ,aAAO,SAAS,kBAAkB,UAAU,CAAC,MAAM,UAAU,OAAO,YAAY,MAAM,UAAU,IAAI,eAAe,oBAAoB,GAAG,CAAC,KAAK,KAAK;AAGrJ,UAAI,oBAAoB,GAAG,KAAK,UAAU,iBAAiB,YAAY;AACrE,cAAM,UAAU,oBAAoB,OAAO;AAC3C,eAAO,WAAW,eAAe,OAAO,SAAS,KAAK;AACtD,YAAI;AAAE,gBAAM,MAAM,SAAS,MAAM;AAAA,QAAG,QAAQ;AAAE;AAAA,QAAO;AACrD;AAAA,MACF;AAGA;AAAA,IACF;AAAA,EACF;AAGA,QAAM,eAAe,WAAW,WAAW;AAC3C,SAAO,SAAS,wBAAwB,YAAY,iCAAiC,KAAK;AAE1F,SAAO;AAAA,IACL;AAAA;AAAA,IACA,WAAW;AAAA,IACX,OAAO,0BAA0B,YAAY;AAAA,IAC7C,cAAc,aAAa;AAAA,MACzB,MAAM,UAAU;AAAA,MAChB,SAAS;AAAA,MACT,WAAW;AAAA,IACb;AAAA,EACF;AACF;",
|
|
6
|
+
"names": []
|
|
7
|
+
}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
export declare class MarkdownCleaner {
|
|
2
|
+
/**
|
|
3
|
+
* Process HTML content and convert to clean Markdown
|
|
4
|
+
* NEVER throws - returns original content on any error for graceful degradation
|
|
5
|
+
*/
|
|
6
|
+
processContent(htmlContent: string): string;
|
|
7
|
+
}
|
|
8
|
+
//# sourceMappingURL=markdown-cleaner.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"markdown-cleaner.d.ts","sourceRoot":"","sources":["../../../src/services/markdown-cleaner.ts"],"names":[],"mappings":"AAsCA,qBAAa,eAAe;IAC1B;;;OAGG;IACH,cAAc,CAAC,WAAW,EAAE,MAAM,GAAG,MAAM;CAwC5C"}
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import TurndownService from "turndown";
|
|
2
|
+
const turndown = new TurndownService({
|
|
3
|
+
headingStyle: "atx",
|
|
4
|
+
codeBlockStyle: "fenced",
|
|
5
|
+
bulletListMarker: "-"
|
|
6
|
+
});
|
|
7
|
+
turndown.remove(["script", "style", "nav", "footer", "aside", "noscript"]);
|
|
8
|
+
const MAX_CONTENT_LENGTH = 524288;
|
|
9
|
+
function removeHtmlComments(html) {
|
|
10
|
+
const parts = [];
|
|
11
|
+
let pos = 0;
|
|
12
|
+
while (pos < html.length) {
|
|
13
|
+
const start = html.indexOf("<!--", pos);
|
|
14
|
+
if (start === -1) {
|
|
15
|
+
parts.push(html.substring(pos));
|
|
16
|
+
break;
|
|
17
|
+
}
|
|
18
|
+
if (start > pos) parts.push(html.substring(pos, start));
|
|
19
|
+
const end = html.indexOf("-->", start + 4);
|
|
20
|
+
if (end === -1) {
|
|
21
|
+
parts.push(html.substring(start));
|
|
22
|
+
break;
|
|
23
|
+
}
|
|
24
|
+
pos = end + 3;
|
|
25
|
+
}
|
|
26
|
+
return parts.join("");
|
|
27
|
+
}
|
|
28
|
+
class MarkdownCleaner {
|
|
29
|
+
/**
|
|
30
|
+
* Process HTML content and convert to clean Markdown
|
|
31
|
+
* NEVER throws - returns original content on any error for graceful degradation
|
|
32
|
+
*/
|
|
33
|
+
processContent(htmlContent) {
|
|
34
|
+
try {
|
|
35
|
+
if (!htmlContent || typeof htmlContent !== "string") {
|
|
36
|
+
return htmlContent || "";
|
|
37
|
+
}
|
|
38
|
+
if (!htmlContent.includes("<")) {
|
|
39
|
+
return htmlContent.trim();
|
|
40
|
+
}
|
|
41
|
+
if (htmlContent.length > MAX_CONTENT_LENGTH) {
|
|
42
|
+
htmlContent = htmlContent.substring(0, MAX_CONTENT_LENGTH);
|
|
43
|
+
}
|
|
44
|
+
let content = removeHtmlComments(htmlContent);
|
|
45
|
+
content = turndown.turndown(content);
|
|
46
|
+
content = content.replace(/\n{3,}/g, "\n\n");
|
|
47
|
+
content = content.trim();
|
|
48
|
+
return content;
|
|
49
|
+
} catch (error) {
|
|
50
|
+
console.error(
|
|
51
|
+
"[MarkdownCleaner] processContent failed:",
|
|
52
|
+
error instanceof Error ? error.message : String(error),
|
|
53
|
+
"| Content length:",
|
|
54
|
+
htmlContent?.length ?? 0
|
|
55
|
+
);
|
|
56
|
+
return htmlContent || "";
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
export {
|
|
61
|
+
MarkdownCleaner
|
|
62
|
+
};
|
|
63
|
+
//# sourceMappingURL=markdown-cleaner.js.map
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
{
|
|
2
|
+
"version": 3,
|
|
3
|
+
"sources": ["../../../src/services/markdown-cleaner.ts"],
|
|
4
|
+
"sourcesContent": ["/**\n * Markdown cleaner service using Turndown for HTML to Markdown conversion\n */\nimport TurndownService from 'turndown';\n\nconst turndown = new TurndownService({\n headingStyle: 'atx',\n codeBlockStyle: 'fenced',\n bulletListMarker: '-',\n});\n\n// Remove script, style, nav, footer, aside elements\nturndown.remove(['script', 'style', 'nav', 'footer', 'aside', 'noscript']);\n\n/** Maximum HTML content length in characters \u2014 prevents event loop blocking on huge pages */\nconst MAX_CONTENT_LENGTH = 524_288 as const; // 512 * 1024\n\n/**\n * Remove HTML comments using linear-time indexOf loop.\n * Avoids catastrophic backtracking from /<!--[\\s\\S]*?-->/g on malformed HTML.\n */\nfunction removeHtmlComments(html: string): string {\n const parts: string[] = [];\n let pos = 0;\n while (pos < html.length) {\n const start = html.indexOf('<!--', pos);\n if (start === -1) { parts.push(html.substring(pos)); break; }\n if (start > pos) parts.push(html.substring(pos, start));\n const end = html.indexOf('-->', start + 4);\n if (end === -1) {\n parts.push(html.substring(start)); // preserve unclosed comment + rest\n break;\n }\n pos = end + 3;\n }\n return parts.join('');\n}\n\nexport class MarkdownCleaner {\n /**\n * Process HTML content and convert to clean Markdown\n * NEVER throws - returns original content on any error for graceful degradation\n */\n processContent(htmlContent: string): string {\n try {\n // Handle null/undefined/non-string inputs gracefully\n if (!htmlContent || typeof htmlContent !== 'string') {\n return htmlContent || '';\n }\n\n // If already markdown (no HTML tags), return as-is\n if (!htmlContent.includes('<')) {\n return htmlContent.trim();\n }\n\n // Truncate oversized HTML to prevent blocking the event loop\n if (htmlContent.length > MAX_CONTENT_LENGTH) {\n htmlContent = htmlContent.substring(0, MAX_CONTENT_LENGTH);\n }\n\n // Remove HTML comments before conversion (linear-time)\n let content = 
removeHtmlComments(htmlContent);\n\n // Convert HTML to Markdown using Turndown\n content = turndown.turndown(content);\n\n // Clean up whitespace\n content = content.replace(/\\n{3,}/g, '\\n\\n');\n content = content.trim();\n\n return content;\n } catch (error) {\n // Log error but don't crash - return original content for graceful degradation\n console.error(\n '[MarkdownCleaner] processContent failed:',\n error instanceof Error ? error.message : String(error),\n '| Content length:',\n htmlContent?.length ?? 0\n );\n // Return original content if conversion fails\n return htmlContent || '';\n }\n }\n}\n"],
|
|
5
|
+
"mappings": "AAGA,OAAO,qBAAqB;AAE5B,MAAM,WAAW,IAAI,gBAAgB;AAAA,EACnC,cAAc;AAAA,EACd,gBAAgB;AAAA,EAChB,kBAAkB;AACpB,CAAC;AAGD,SAAS,OAAO,CAAC,UAAU,SAAS,OAAO,UAAU,SAAS,UAAU,CAAC;AAGzE,MAAM,qBAAqB;AAM3B,SAAS,mBAAmB,MAAsB;AAChD,QAAM,QAAkB,CAAC;AACzB,MAAI,MAAM;AACV,SAAO,MAAM,KAAK,QAAQ;AACxB,UAAM,QAAQ,KAAK,QAAQ,QAAQ,GAAG;AACtC,QAAI,UAAU,IAAI;AAAE,YAAM,KAAK,KAAK,UAAU,GAAG,CAAC;AAAG;AAAA,IAAO;AAC5D,QAAI,QAAQ,IAAK,OAAM,KAAK,KAAK,UAAU,KAAK,KAAK,CAAC;AACtD,UAAM,MAAM,KAAK,QAAQ,OAAO,QAAQ,CAAC;AACzC,QAAI,QAAQ,IAAI;AACd,YAAM,KAAK,KAAK,UAAU,KAAK,CAAC;AAChC;AAAA,IACF;AACA,UAAM,MAAM;AAAA,EACd;AACA,SAAO,MAAM,KAAK,EAAE;AACtB;AAEO,MAAM,gBAAgB;AAAA;AAAA;AAAA;AAAA;AAAA,EAK3B,eAAe,aAA6B;AAC1C,QAAI;AAEF,UAAI,CAAC,eAAe,OAAO,gBAAgB,UAAU;AACnD,eAAO,eAAe;AAAA,MACxB;AAGA,UAAI,CAAC,YAAY,SAAS,GAAG,GAAG;AAC9B,eAAO,YAAY,KAAK;AAAA,MAC1B;AAGA,UAAI,YAAY,SAAS,oBAAoB;AAC3C,sBAAc,YAAY,UAAU,GAAG,kBAAkB;AAAA,MAC3D;AAGA,UAAI,UAAU,mBAAmB,WAAW;AAG5C,gBAAU,SAAS,SAAS,OAAO;AAGnC,gBAAU,QAAQ,QAAQ,WAAW,MAAM;AAC3C,gBAAU,QAAQ,KAAK;AAEvB,aAAO;AAAA,IACT,SAAS,OAAO;AAEd,cAAQ;AAAA,QACN;AAAA,QACA,iBAAiB,QAAQ,MAAM,UAAU,OAAO,KAAK;AAAA,QACrD;AAAA,QACA,aAAa,UAAU;AAAA,MACzB;AAEA,aAAO,eAAe;AAAA,IACxB;AAAA,EACF;AACF;",
|
|
6
|
+
"names": []
|
|
7
|
+
}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* GitHub Score Tool Handler
|
|
3
|
+
* Searches GitHub repos by keywords, fetches detailed data, calculates quality metrics,
|
|
4
|
+
* and returns a scored Markdown table.
|
|
5
|
+
* NEVER throws — always returns structured response for graceful degradation.
|
|
6
|
+
*/
|
|
7
|
+
import type { MCPServer } from 'mcp-use/server';
|
|
8
|
+
import { type GitHubScoreParams, type GitHubScoreOutput } from '../schemas/github-score.js';
|
|
9
|
+
import { type ToolExecutionResult, type ToolReporter } from './mcp-helpers.js';
|
|
10
|
+
export declare function handleGitHubScore(params: GitHubScoreParams, reporter?: ToolReporter): Promise<ToolExecutionResult<GitHubScoreOutput>>;
|
|
11
|
+
export declare function registerGitHubScoreTool(server: MCPServer): void;
|
|
12
|
+
//# sourceMappingURL=github-score.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"github-score.d.ts","sourceRoot":"","sources":["../../../src/tools/github-score.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,gBAAgB,CAAC;AAGhD,OAAO,EAGL,KAAK,iBAAiB,EACtB,KAAK,iBAAiB,EACvB,MAAM,4BAA4B,CAAC;AAUpC,OAAO,EAML,KAAK,mBAAmB,EACxB,KAAK,YAAY,EAClB,MAAM,kBAAkB,CAAC;AAiJ1B,wBAAsB,iBAAiB,CACrC,MAAM,EAAE,iBAAiB,EACzB,QAAQ,GAAE,YAA4B,GACrC,OAAO,CAAC,mBAAmB,CAAC,iBAAiB,CAAC,CAAC,CAyIjD;AA0CD,wBAAgB,uBAAuB,CAAC,MAAM,EAAE,SAAS,GAAG,IAAI,CAqC/D"}
|