mcp-researchpowerpack 7.0.10 → 7.0.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +4662 -21
- package/dist/index.js.map +4 -4
- package/dist/mcp-use.json +2 -2
- package/dist/src/clients/jina.js +202 -16
- package/dist/src/clients/jina.js.map +3 -3
- package/dist/src/clients/kernel.js +254 -7
- package/dist/src/clients/kernel.js.map +4 -4
- package/dist/src/clients/reddit.js +326 -23
- package/dist/src/clients/reddit.js.map +4 -4
- package/dist/src/clients/scraper.js +345 -22
- package/dist/src/clients/scraper.js.map +4 -4
- package/dist/src/clients/search.js +316 -20
- package/dist/src/clients/search.js.map +4 -4
- package/dist/src/config/index.js +39 -10
- package/dist/src/config/index.js.map +3 -3
- package/dist/src/effect/errors.js +130 -5
- package/dist/src/effect/errors.js.map +3 -3
- package/dist/src/effect/runtime.js +1893 -4
- package/dist/src/effect/runtime.js.map +4 -4
- package/dist/src/effect/services.js +2124 -22
- package/dist/src/effect/services.js.map +4 -4
- package/dist/src/schemas/scrape-links.js +6 -5
- package/dist/src/schemas/scrape-links.js.map +1 -1
- package/dist/src/schemas/start-research.js +2 -1
- package/dist/src/schemas/start-research.js.map +1 -1
- package/dist/src/schemas/web-search.js +9 -8
- package/dist/src/schemas/web-search.js.map +1 -1
- package/dist/src/services/llm-processor.js +406 -25
- package/dist/src/services/llm-processor.js.map +4 -4
- package/dist/src/services/markdown-cleaner.js +6 -5
- package/dist/src/services/markdown-cleaner.js.map +1 -1
- package/dist/src/tools/mcp-helpers.js +2 -1
- package/dist/src/tools/mcp-helpers.js.map +1 -1
- package/dist/src/tools/registry.js +4629 -3
- package/dist/src/tools/registry.js.map +4 -4
- package/dist/src/tools/scrape.js +2610 -80
- package/dist/src/tools/scrape.js.map +4 -4
- package/dist/src/tools/search.js +2388 -59
- package/dist/src/tools/search.js.map +4 -4
- package/dist/src/tools/start-research.js +2030 -23
- package/dist/src/tools/start-research.js.map +4 -4
- package/dist/src/tools/utils.js +98 -7
- package/dist/src/tools/utils.js.map +3 -3
- package/dist/src/utils/concurrency.js +1 -0
- package/dist/src/utils/concurrency.js.map +1 -1
- package/dist/src/utils/content-extractor.js +27 -2
- package/dist/src/utils/content-extractor.js.map +3 -3
- package/dist/src/utils/content-quality.js +4 -3
- package/dist/src/utils/content-quality.js.map +1 -1
- package/dist/src/utils/errors.js +26 -3
- package/dist/src/utils/errors.js.map +3 -3
- package/dist/src/utils/logger.js +1 -0
- package/dist/src/utils/logger.js.map +1 -1
- package/dist/src/utils/markdown-formatter.js +1 -0
- package/dist/src/utils/markdown-formatter.js.map +1 -1
- package/dist/src/utils/query-relax.js +9 -8
- package/dist/src/utils/query-relax.js.map +1 -1
- package/dist/src/utils/response.js +3 -2
- package/dist/src/utils/response.js.map +1 -1
- package/dist/src/utils/retry.js +5 -4
- package/dist/src/utils/retry.js.map +1 -1
- package/dist/src/utils/sanitize.js +4 -3
- package/dist/src/utils/sanitize.js.map +1 -1
- package/dist/src/utils/source-type.js +4 -3
- package/dist/src/utils/source-type.js.map +1 -1
- package/dist/src/utils/url-aggregator.js +112 -11
- package/dist/src/utils/url-aggregator.js.map +3 -3
- package/dist/src/version.js +7 -6
- package/dist/src/version.js.map +1 -1
- package/package.json +3 -3
package/dist/index.js
CHANGED

@@ -1,22 +1,4663 @@
 #!/usr/bin/env node
-
-
-}
-import { Logger } from "mcp-use";
+
+// index.ts
+import { Logger as Logger5 } from "mcp-use";
 import {
   InMemorySessionStore,
   InMemoryStreamManager,
   MCPServer,
   object
 } from "mcp-use/server";
-
-
-import {
-
-
-
-
-
+
+// src/config/index.ts
+import { Logger } from "mcp-use";
+
+// src/version.ts
+import { createRequire } from "module";
+import { fileURLToPath } from "url";
+import { dirname, join } from "path";
+var DEFAULT_PACKAGE_INFO = {
+  version: "3.9.5",
+  name: "mcp-researchpowerpack-http",
+  description: "Research Powerpack MCP Server"
+};
+var packageJson = { ...DEFAULT_PACKAGE_INFO };
+try {
+  if (typeof import.meta.url === "string" && import.meta.url.startsWith("file:")) {
+    const _require = createRequire(import.meta.url);
+    const _dirname = dirname(fileURLToPath(import.meta.url));
+    try {
+      packageJson = _require(join(_dirname, "..", "package.json"));
+    } catch {
+      packageJson = _require(join(_dirname, "..", "..", "package.json"));
+    }
+  }
+} catch {
+}
+var VERSION = packageJson.version;
+var PACKAGE_NAME = packageJson.name;
+var PACKAGE_DESCRIPTION = packageJson.description;
+var USER_AGENT_VERSION = `${PACKAGE_NAME}/${VERSION}`;
+
+// src/config/index.ts
+function safeParseInt(value, defaultVal, min, max) {
+  const logger2 = Logger.get("config");
+  if (!value) {
+    return defaultVal;
+  }
+  const parsed = parseInt(value, 10);
+  if (isNaN(parsed)) {
+    logger2.warn(`Invalid number "${value}", using default ${defaultVal}`);
+    return defaultVal;
+  }
+  if (parsed < min) {
+    logger2.warn(`Value ${parsed} below minimum ${min}, clamping to ${min}`);
+    return min;
+  }
+  if (parsed > max) {
+    logger2.warn(`Value ${parsed} above maximum ${max}, clamping to ${max}`);
+    return max;
+  }
+  return parsed;
+}
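
A minimal standalone sketch of the parse-then-clamp behavior `safeParseInt` implements above (re-implemented here for illustration; the real function also logs a warning on each fallback or clamp):

```js
// Hypothetical stand-in for safeParseInt's clamping logic.
function clampInt(value, defaultVal, min, max) {
  const parsed = parseInt(value ?? "", 10);
  if (Number.isNaN(parsed)) return defaultVal; // unset or unparseable
  return Math.min(Math.max(parsed, min), max); // clamp into [min, max]
}

clampInt(undefined, 50, 1, 200); // 50  (default)
clampInt("abc", 50, 1, 200);     // 50  (unparseable)
clampInt("500", 50, 1, 200);     // 200 (clamped to max)
clampInt("0", 50, 1, 200);       // 1   (clamped to min)
```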
+var cachedEnv = null;
+function parseEnv() {
+  if (cachedEnv) return cachedEnv;
+  cachedEnv = {
+    SCRAPER_API_KEY: process.env.SCRAPEDO_API_KEY || "",
+    SEARCH_API_KEY: process.env.SERPER_API_KEY || void 0,
+    REDDIT_CLIENT_ID: process.env.REDDIT_CLIENT_ID || void 0,
+    REDDIT_CLIENT_SECRET: process.env.REDDIT_CLIENT_SECRET || void 0,
+    JINA_API_KEY: process.env.JINA_API_KEY || void 0,
+    KERNEL_API_KEY: process.env.KERNEL_API_KEY || void 0,
+    KERNEL_PROJECT: process.env.KERNEL_PROJECT || void 0
+  };
+  return cachedEnv;
+}
+var SERVER = {
+  NAME: PACKAGE_NAME,
+  VERSION,
+  DESCRIPTION: PACKAGE_DESCRIPTION
+};
+function getCapabilities() {
+  const env = parseEnv();
+  return {
+    reddit: !!(env.REDDIT_CLIENT_ID && env.REDDIT_CLIENT_SECRET),
+    search: !!(env.SEARCH_API_KEY || env.JINA_API_KEY),
+    serperSearch: !!env.SEARCH_API_KEY,
+    jina: !!env.JINA_API_KEY,
+    scraping: !!env.SCRAPER_API_KEY,
+    kernel: !!env.KERNEL_API_KEY,
+    llmExtraction: getLLMConfigStatus().configured
+  };
+}
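
Each capability flag above maps directly onto one or two of the environment variables read by `parseEnv`. A quick sketch, assuming the functions above were in scope (the key value is hypothetical):

```js
process.env.SERPER_API_KEY = "example-key"; // hypothetical value
getCapabilities().serperSearch; // true  (SERPER_API_KEY present)
getCapabilities().jina;         // false (JINA_API_KEY unset)
```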
+function getMissingEnvMessage(capability) {
+  const messages = {
+    reddit: '\u274C **Reddit scraping unavailable.** Set `REDDIT_CLIENT_ID` and `REDDIT_CLIENT_SECRET` to enable threaded Reddit post fetching in `raw-scrape-links` and `smart-scrape-links`.\n\n\u{1F449} Create a Reddit app at: https://www.reddit.com/prefs/apps (select "script" type)',
+    search: "\u274C **Search unavailable.** Set `SERPER_API_KEY` or `JINA_API_KEY` to enable `raw-web-search` and `smart-web-search`.\n\n\u{1F449} Serper provides Google SERPs; Jina Search is used as a fallback/search-only provider.",
+    serperSearch: "\u274C **Serper search unavailable.** Set `SERPER_API_KEY` to enable Google-backed primary search.",
+    jina: "\u274C **Jina unavailable.** Set `JINA_API_KEY` to enable Jina Search and authenticated Jina Reader requests.",
+    scraping: "\u26A0\uFE0F **Scrape.do proxy fallback unavailable.** Set `SCRAPEDO_API_KEY` to enable Jina Reader retries through Scrape.do proxy mode.\n\n\u{1F449} Sign up at: https://scrape.do (1,000 free credits)",
+    kernel: "\u274C **Kernel browser rendering unavailable.** Set `KERNEL_API_KEY` to enable optional browser-render fallback for raw/smart scraping.",
+    llmExtraction: "\u26A0\uFE0F **AI extraction disabled.** Set `LLM_API_KEY`, `LLM_BASE_URL`, and `LLM_MODEL` to enable `smart-web-search` and `smart-scrape-links`.\n\nUse the raw tools when you need markdown without LLM processing."
+  };
+  return messages[capability];
+}
+var CONCURRENCY = {
+  SEARCH: safeParseInt(process.env.CONCURRENCY_SEARCH, 50, 1, 200),
+  SCRAPER: safeParseInt(process.env.CONCURRENCY_SCRAPER, 50, 1, 200),
+  JINA_READER: safeParseInt(process.env.CONCURRENCY_JINA_READER, 50, 1, 200),
+  REDDIT: safeParseInt(process.env.CONCURRENCY_REDDIT, 50, 1, 200),
+  LLM_EXTRACTION: safeParseInt(process.env.LLM_CONCURRENCY, 50, 1, 200),
+  KERNEL: safeParseInt(process.env.CONCURRENCY_KERNEL, 3, 1, 20)
+};
+var SCRAPER = {
+  BATCH_SIZE: 30,
+  EXTRACTION_PREFIX: "Extract from document only \u2014 never hallucinate or add external knowledge.",
+  EXTRACTION_SUFFIX: "First line = content, not preamble. No confirmation messages."
+};
+var REDDIT = {
+  BATCH_SIZE: 10,
+  MAX_WORDS_PER_POST: 5e4,
+  MAX_WORDS_TOTAL: 5e5,
+  MIN_POSTS: 1,
+  MAX_POSTS: 50,
+  RETRY_COUNT: 5,
+  RETRY_DELAYS: [2e3, 4e3, 8e3, 16e3, 32e3]
+};
+var CTR_WEIGHTS = {
+  1: 100,
+  2: 60,
+  3: 48.89,
+  4: 33.33,
+  5: 28.89,
+  6: 26.44,
+  7: 24.44,
+  8: 17.78,
+  9: 13.33,
+  10: 12.56
+};
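
`CTR_WEIGHTS` reads as a rank-to-weight table (position 1 worth 100, position 10 worth 12.56); how it is consumed is not visible in this hunk. One plausible use, offered purely as an illustrative assumption, is scoring a URL by summing the weights of every rank it earned across parallel queries:

```js
// Assumption for illustration only -- not taken from this diff.
const CTR_WEIGHTS = { 1: 100, 2: 60, 3: 48.89 }; // excerpt of the table above
const score = (ranks) => ranks.reduce((sum, r) => sum + (CTR_WEIGHTS[r] ?? 1), 0);

score([1, 3]); // 148.89 -- rank 1 in one query, rank 3 in another
score([2]);    // 60
```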
+var cachedLlmConfigStatus = null;
+function getLLMConfigStatus() {
+  if (cachedLlmConfigStatus) return cachedLlmConfigStatus;
+  const apiKeyPresent = !!process.env.LLM_API_KEY?.trim();
+  const baseUrlPresent = !!process.env.LLM_BASE_URL?.trim();
+  const modelPresent = !!process.env.LLM_MODEL?.trim();
+  const missingVars = [];
+  if (!apiKeyPresent) missingVars.push("LLM_API_KEY");
+  if (!baseUrlPresent) missingVars.push("LLM_BASE_URL");
+  if (!modelPresent) missingVars.push("LLM_MODEL");
+  const configured = missingVars.length === 0;
+  cachedLlmConfigStatus = {
+    configured,
+    apiKeyPresent,
+    baseUrlPresent,
+    modelPresent,
+    missingVars,
+    error: configured ? null : `LLM disabled: missing ${missingVars.join(", ")}`
+  };
+  return cachedLlmConfigStatus;
+}
+var cachedLlmExtraction = null;
+function getLlmExtraction() {
+  if (cachedLlmExtraction) return cachedLlmExtraction;
+  const apiKey = process.env.LLM_API_KEY?.trim() || "";
+  const baseUrl = process.env.LLM_BASE_URL?.trim();
+  const model = process.env.LLM_MODEL?.trim();
+  const fallbackModel = process.env.LLM_FALLBACK_MODEL?.trim() || "";
+  if (apiKey && !baseUrl) {
+    throw new Error(
+      "LLM_BASE_URL is required when LLM_API_KEY is set. Set LLM_BASE_URL to your OpenAI-compatible endpoint."
+    );
+  }
+  if (apiKey && !model) {
+    throw new Error(
+      "LLM_MODEL is required when LLM_API_KEY is set."
+    );
+  }
+  cachedLlmExtraction = {
+    API_KEY: apiKey,
+    BASE_URL: baseUrl || "",
+    MODEL: model || "",
+    FALLBACK_MODEL: fallbackModel
+  };
+  return cachedLlmExtraction;
+}
+var LLM_EXTRACTION = new Proxy({}, {
+  get(_target, prop) {
+    return getLlmExtraction()[prop];
+  }
+});
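
`LLM_EXTRACTION` uses a Proxy so every property read routes through the memoized `getLlmExtraction()`, deferring env validation from import time to first access. A standalone sketch of the same pattern:

```js
// Lazy config: load() runs (and can throw) on first property read.
const lazyConfig = (load) => {
  let cached = null;
  return new Proxy({}, {
    get(_target, prop) {
      cached ??= load();
      return cached[prop];
    }
  });
};

const cfg = lazyConfig(() => ({ MODEL: process.env.LLM_MODEL ?? "(unset)" }));
cfg.MODEL; // resolved here, not when the module was imported
```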
+
+// src/services/llm-processor.ts
+import OpenAI from "openai";
+
+// src/schemas/web-search.ts
+import { z } from "zod";
+var QUERY_REWRITE_PAIR_EXAMPLES = [
+  'Bad: `<feature> support` \u2192 Better: `site:<official-docs-domain> "<feature>" "<platform-or-version>"`',
+  'Bad: `<product> pricing` \u2192 Better: `site:<vendor-domain> "<product>" pricing "enterprise" OR "free tier"`',
+  'Bad: `<library> bug fix` \u2192 Better: `"<exact error text>" "<library-or-package>" "<version>" site:github.com`',
+  'Bad: `<tool> reviews` \u2192 Better: `site:reddit.com/r/<community>/comments "<tool>" "migration" OR "regression"`'
+];
+var QUERY_REWRITE_PAIR_GUIDANCE = [
+  "Write Google retrieval probes, not topic labels.",
+  "For each broad idea, rewrite it into a query that names the evidence source class, discriminating anchor terms, and one useful operator when possible.",
+  "Use rewrite-pair thinking before searching:",
+  ...QUERY_REWRITE_PAIR_EXAMPLES,
+  "Do not repeat the same noun phrase with adjectives changed; fan out by source type and evidence need."
+];
+var QUERY_REWRITE_PAIR_GUIDANCE_TEXT = QUERY_REWRITE_PAIR_GUIDANCE.join(" ");
+var keywordSchema = z.string().min(1, { message: "search: Keyword cannot be empty" }).describe(
+  `A single search keyword/query. Each item runs as a separate parallel search. ${QUERY_REWRITE_PAIR_GUIDANCE_TEXT}`
+);
+var keywordsSchema = z.array(keywordSchema).min(1, { message: "search: At least 1 keyword required" }).max(50, { message: "search: At most 50 keywords allowed per call" }).describe(
+  `Search keywords to run in parallel. Serper is primary when configured; Jina Search is fallback when Serper is missing, fails, or yields empty query results. ${QUERY_REWRITE_PAIR_GUIDANCE_TEXT} Think of keywords as retrieval probes, not topic labels. Pack distinct facets in one call: official docs, implementation, failures, comparisons, sentiment, changelog, CVE, pricing, or other source classes.`
+);
+var rawWebSearchParamsSchema = z.object({
+  keywords: keywordsSchema,
+  extract: z.never().optional(),
+  scope: z.never().optional(),
+  verbose: z.never().optional()
+}).strict();
+var smartWebSearchParamsSchema = z.object({
+  keywords: keywordsSchema,
+  extract: z.string().min(1, { message: "smart-web-search: extract cannot be empty" }).describe(
+    'Semantic instruction for the relevance classifier \u2014 what "relevant" means for THIS goal. This is the post-sort target, so name the evidence you need and the source-of-truth expectation: e.g. official docs/release notes for specs, issue/PR/error text for bugs, Reddit/HN/blogs for lived experience, vendor pricing pages for pricing, CVE databases for security. Drives tiering (HIGHLY_RELEVANT / MAYBE_RELEVANT / OTHER), synthesis, gap analysis, and refine-query suggestions. Be specific: "OAuth 2.1 support in TypeScript MCP frameworks \u2014 runnable code, not marketing", not "MCP OAuth".'
+  ),
+  scope: z.enum(["web", "reddit", "both"]).default("web").describe(
+    'Search scope. "web" (default) = open web, no augmentation. "reddit" = server appends `site:reddit.com` to every keyword and filters results to post permalinks (`/r/.+/comments/[a-z0-9]+/`); subreddit homepages are dropped. "both" = runs every keyword twice (open web + reddit-scoped), merges the result set, and tags each row with its source. Use "reddit" for sentiment/migration/lived-experience research; use "both" when opinion-heavy AND official sources also matter.'
+  ),
+  verbose: z.boolean().default(false).describe(
+    "Include per-row scoring/coverage metadata, the trailing Signals block, and CONSENSUS labels even when they carry little signal. Default false."
+  )
+}).strict();
+
+// src/utils/logger.ts
+import { Logger as Logger2 } from "mcp-use";
+function getLogger(name) {
+  return Logger2.get(name);
+}
+function mcpLog(level, message, loggerName) {
+  const logger2 = getLogger(loggerName ?? "research-powerpack");
+  switch (level) {
+    case "debug":
+      logger2.debug(message);
+      break;
+    case "info":
+      logger2.info(message);
+      break;
+    case "warning":
+      logger2.warn(message);
+      break;
+    case "error":
+      logger2.error(message);
+      break;
+  }
+}
+
+// src/utils/errors.ts
+var ErrorCode = {
+  // Retryable errors
+  RATE_LIMITED: "RATE_LIMITED",
+  TIMEOUT: "TIMEOUT",
+  NETWORK_ERROR: "NETWORK_ERROR",
+  SERVICE_UNAVAILABLE: "SERVICE_UNAVAILABLE",
+  // Non-retryable errors
+  AUTH_ERROR: "AUTH_ERROR",
+  INVALID_INPUT: "INVALID_INPUT",
+  NOT_FOUND: "NOT_FOUND",
+  QUOTA_EXCEEDED: "QUOTA_EXCEEDED",
+  UNSUPPORTED_BINARY_CONTENT: "UNSUPPORTED_BINARY_CONTENT",
+  // Internal errors
+  INTERNAL_ERROR: "INTERNAL_ERROR",
+  PARSE_ERROR: "PARSE_ERROR",
+  UNKNOWN_ERROR: "UNKNOWN_ERROR"
+};
+var DEFAULT_RETRY_OPTIONS = {
+  maxRetries: 3,
+  baseDelayMs: 1e3,
+  maxDelayMs: 3e4,
+  retryableStatuses: [408, 429, 500, 502, 503, 504, 510]
+};
+function classifyDomException(error2) {
+  if (error2.name === "AbortError") {
+    return { code: ErrorCode.TIMEOUT, message: "Request timed out", retryable: true };
+  }
+  return { code: ErrorCode.UNKNOWN_ERROR, message: error2.message, retryable: false };
+}
+function classifyByErrorCode(error2) {
+  const errCode = error2.code;
+  if (!errCode) return null;
+  const networkErrorMessages = {
+    ECONNREFUSED: "Connection refused \u2014 service may be down",
+    ECONNRESET: "Connection was reset \u2014 please retry",
+    ECONNABORTED: "Connection aborted \u2014 please retry",
+    ENOTFOUND: "Service not reachable \u2014 check your network",
+    EPIPE: "Connection lost \u2014 please retry",
+    EAI_AGAIN: "DNS lookup failed \u2014 check your network"
+  };
+  if (errCode === "ECONNREFUSED" || errCode === "ENOTFOUND" || errCode === "ECONNRESET") {
+    return { code: ErrorCode.NETWORK_ERROR, message: networkErrorMessages[errCode] || "Network connection failed", retryable: true, cause: error2.message };
+  }
+  if (errCode === "ECONNABORTED" || errCode === "ETIMEDOUT") {
+    return { code: ErrorCode.TIMEOUT, message: networkErrorMessages[errCode] || "Request timed out", retryable: true, cause: error2.message };
+  }
+  return null;
+}
+function classifyByStatusCode(error2) {
+  const status = error2.response?.status || error2.status || error2.statusCode;
+  if (!status) return null;
+  return classifyHttpError(status, error2.message || String(error2));
+}
+function classifyByMessage(message) {
+  const lower = message.toLowerCase();
+  if (lower.includes("timeout") || lower.includes("timed out") || lower.includes("aborterror")) {
+    return { code: ErrorCode.TIMEOUT, message: "Request timed out", retryable: true, cause: message };
+  }
+  if (lower.includes("rate limit") || lower.includes("too many requests")) {
+    return { code: ErrorCode.RATE_LIMITED, message: "Rate limit exceeded", retryable: true, cause: message };
+  }
+  if (message.includes("API_KEY") || message.includes("api_key") || message.includes("Invalid API")) {
+    return { code: ErrorCode.AUTH_ERROR, message: "API key missing or invalid", retryable: false, cause: message };
+  }
+  if (message.includes("JSON") || message.includes("parse") || message.includes("Unexpected token")) {
+    return { code: ErrorCode.PARSE_ERROR, message: "Failed to parse response", retryable: false, cause: message };
+  }
+  return null;
+}
+function classifyFallback(message, cause) {
+  return {
+    code: ErrorCode.UNKNOWN_ERROR,
+    message,
+    retryable: false,
+    cause: cause ? String(cause) : void 0
+  };
+}
+function classifyError(error2) {
+  if (error2 == null) {
+    return { code: ErrorCode.UNKNOWN_ERROR, message: "An unknown error occurred", retryable: false };
+  }
+  if (error2 instanceof DOMException) return classifyDomException(error2);
+  if (!isErrorLike(error2)) {
+    return { code: ErrorCode.UNKNOWN_ERROR, message: String(error2), retryable: false };
+  }
+  return classifyByErrorCode(error2) ?? classifyByStatusCode(error2) ?? classifyByMessage(error2.message ?? String(error2)) ?? classifyFallback(error2.message ?? String(error2), error2.cause);
+}
+function isErrorLike(value) {
+  return typeof value === "object" && value !== null;
+}
+function classifyHttpError(status, message) {
+  switch (status) {
+    case 400:
+      return { code: ErrorCode.INVALID_INPUT, message: "Bad request", retryable: false, statusCode: status };
+    case 401:
+      return { code: ErrorCode.AUTH_ERROR, message: "Invalid API key", retryable: false, statusCode: status };
+    case 403:
+      return { code: ErrorCode.QUOTA_EXCEEDED, message: "Access forbidden or quota exceeded", retryable: false, statusCode: status };
+    case 404:
+      return { code: ErrorCode.NOT_FOUND, message: "Resource not found", retryable: false, statusCode: status };
+    case 408:
+      return { code: ErrorCode.TIMEOUT, message: "Request timeout", retryable: true, statusCode: status };
+    case 429:
+      return { code: ErrorCode.RATE_LIMITED, message: "Rate limit exceeded", retryable: true, statusCode: status };
+    case 500:
+      return { code: ErrorCode.INTERNAL_ERROR, message: "Server error", retryable: true, statusCode: status };
+    case 502:
+      return { code: ErrorCode.SERVICE_UNAVAILABLE, message: "Bad gateway", retryable: true, statusCode: status };
+    case 503:
+      return { code: ErrorCode.SERVICE_UNAVAILABLE, message: "Service unavailable", retryable: true, statusCode: status };
+    case 504:
+      return { code: ErrorCode.TIMEOUT, message: "Gateway timeout", retryable: true, statusCode: status };
+    case 510:
+      return { code: ErrorCode.SERVICE_UNAVAILABLE, message: "Request canceled", retryable: true, statusCode: status };
+    default:
+      if (status >= 500) {
+        return { code: ErrorCode.SERVICE_UNAVAILABLE, message: `Server error: ${status}`, retryable: true, statusCode: status };
+      }
+      if (status >= 400) {
+        return { code: ErrorCode.INVALID_INPUT, message: `Client error: ${status}`, retryable: false, statusCode: status };
+      }
+      return { code: ErrorCode.UNKNOWN_ERROR, message: `HTTP ${status}: ${message}`, retryable: false, statusCode: status };
+  }
+}
+function calculateBackoff(attempt, options) {
+  const exponentialDelay = options.baseDelayMs * Math.pow(2, attempt);
+  const jitter = Math.random() * 0.3 * exponentialDelay;
+  return Math.min(exponentialDelay + jitter, options.maxDelayMs);
+}
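
With `DEFAULT_RETRY_OPTIONS` (base 1000 ms, 30% jitter, 30000 ms cap), `calculateBackoff` yields delays in a predictable envelope. A sketch of the per-attempt bounds:

```js
// Delay envelope for calculateBackoff under DEFAULT_RETRY_OPTIONS.
const base = 1e3, cap = 3e4, jitterFactor = 0.3;
for (let attempt = 0; attempt < 5; attempt++) {
  const exp = base * 2 ** attempt;
  const lo = Math.min(exp, cap);               // jitter can be ~0
  const hi = Math.min(exp * (1 + jitterFactor), cap);
  console.log(`attempt ${attempt}: ${lo} to ${hi} ms`);
}
// attempt 0: 1000 to 1300 ms ... attempt 4: 16000 to 20800 ms
```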
+function sleep(ms, signal) {
+  return new Promise((resolve, reject) => {
+    if (signal?.aborted) {
+      reject(new DOMException("Aborted", "AbortError"));
+      return;
+    }
+    function onAbort() {
+      clearTimeout(timeout);
+      reject(new DOMException("Aborted", "AbortError"));
+    }
+    const timeout = setTimeout(() => {
+      if (signal) signal.removeEventListener("abort", onAbort);
+      resolve();
+    }, ms);
+    signal?.addEventListener("abort", onAbort, { once: true });
+    if (signal?.aborted) {
+      onAbort();
+    }
+  });
+}
+function fetchWithTimeout(url, options = {}) {
+  const { timeoutMs = 3e4, signal: externalSignal, ...fetchOptions } = options;
+  const controller = new AbortController();
+  const timeoutId = setTimeout(() => controller.abort(), timeoutMs);
+  let onExternalAbort;
+  if (externalSignal) {
+    onExternalAbort = () => controller.abort();
+    externalSignal.addEventListener("abort", onExternalAbort, { once: true });
+    if (externalSignal.aborted) {
+      controller.abort();
+    }
+  }
+  return fetch(url, { ...fetchOptions, signal: controller.signal }).finally(() => {
+    clearTimeout(timeoutId);
+    if (externalSignal && onExternalAbort) {
+      externalSignal.removeEventListener("abort", onExternalAbort);
+    }
+  });
+}
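
Usage sketch for `fetchWithTimeout` (hypothetical URL; assumes the helper above is in scope). It layers its own timeout on top of an optional caller-supplied signal, and either one can abort the request:

```js
const external = new AbortController();
const res = await fetchWithTimeout("https://example.com/api", {
  timeoutMs: 5e3,          // abort after 5 s of no response
  signal: external.signal  // caller may also cancel early
});
console.log(res.status);
```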
+async function withStallProtection(fn, stallMs, maxAttempts = 2, label = "request") {
+  for (let attempt = 0; attempt < maxAttempts; attempt++) {
+    const controller = new AbortController();
+    let stallTimer;
+    const stallPromise = new Promise((_, reject) => {
+      stallTimer = setTimeout(() => {
+        controller.abort();
+        reject(Object.assign(new Error(`Service temporarily unavailable \u2014 no response received (attempt ${attempt + 1}/${maxAttempts})`), {
+          code: "ESTALLED",
+          retryable: attempt < maxAttempts - 1
+        }));
+      }, stallMs);
+    });
+    let fnPromise;
+    try {
+      fnPromise = fn(controller.signal);
+      const result = await Promise.race([fnPromise, stallPromise]);
+      clearTimeout(stallTimer);
+      return result;
+    } catch (err) {
+      fnPromise?.catch(() => {
+      });
+      clearTimeout(stallTimer);
+      const isStall = err instanceof Error && err.code === "ESTALLED";
+      if (isStall && attempt < maxAttempts - 1) {
+        const backoff = calculateBackoff(attempt, DEFAULT_RETRY_OPTIONS);
+        mcpLog("warning", `${label} stalled, retrying in ${backoff}ms (attempt ${attempt + 1})`, "stability");
+        await sleep(backoff);
+        continue;
+      }
+      throw err;
+    }
+  }
+  throw new Error(`${label} failed after ${maxAttempts} stall-protection attempts`);
+}
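
Usage sketch for `withStallProtection` (hypothetical URL; assumes the helper above is in scope). The callback receives an AbortSignal that fires if nothing resolves within `stallMs`; a stalled attempt is aborted, backed off, and retried:

```js
const text = await withStallProtection(
  (signal) => fetch("https://example.com", { signal }).then((r) => r.text()),
  1e4,  // treat 10 s of silence as a stall
  2,    // at most two attempts
  "example fetch"
);
```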
+
+// src/services/llm-processor.ts
+var MAX_LLM_INPUT_CHARS = 5e5;
+var MAX_PRIMARY_MODEL_INPUT_CHARS = 1e5;
+var LLM_CLIENT_TIMEOUT_MS = 6e5;
+var BACKOFF_JITTER_FACTOR = 0.3;
+var LLM_STALL_TIMEOUT_MS = 75e3;
+var LLM_REQUEST_DEADLINE_MS = 15e4;
+var llmHealth = {
+  lastPlannerOk: false,
+  lastExtractorOk: false,
+  lastPlannerCheckedAt: null,
+  lastExtractorCheckedAt: null,
+  lastPlannerError: null,
+  lastExtractorError: null,
+  consecutivePlannerFailures: 0,
+  consecutiveExtractorFailures: 0
+};
+function markLLMSuccess(kind) {
+  const ts = (/* @__PURE__ */ new Date()).toISOString();
+  if (kind === "planner") {
+    llmHealth.lastPlannerOk = true;
+    llmHealth.lastPlannerCheckedAt = ts;
+    llmHealth.lastPlannerError = null;
+    llmHealth.consecutivePlannerFailures = 0;
+  } else {
+    llmHealth.lastExtractorOk = true;
+    llmHealth.lastExtractorCheckedAt = ts;
+    llmHealth.lastExtractorError = null;
+    llmHealth.consecutiveExtractorFailures = 0;
+  }
+}
+function markLLMFailure(kind, err) {
+  const ts = (/* @__PURE__ */ new Date()).toISOString();
+  const message = err instanceof Error ? err.message : String(err ?? "unknown error");
+  if (kind === "planner") {
+    llmHealth.lastPlannerOk = false;
+    llmHealth.lastPlannerCheckedAt = ts;
+    llmHealth.lastPlannerError = message;
+    llmHealth.consecutivePlannerFailures += 1;
+  } else {
+    llmHealth.lastExtractorOk = false;
+    llmHealth.lastExtractorCheckedAt = ts;
+    llmHealth.lastExtractorError = message;
+    llmHealth.consecutiveExtractorFailures += 1;
+  }
+}
+function getLLMHealth() {
+  const cap = getCapabilities();
+  return {
+    lastPlannerOk: llmHealth.lastPlannerOk,
+    lastExtractorOk: llmHealth.lastExtractorOk,
+    lastPlannerCheckedAt: llmHealth.lastPlannerCheckedAt,
+    lastExtractorCheckedAt: llmHealth.lastExtractorCheckedAt,
+    lastPlannerError: llmHealth.lastPlannerError,
+    lastExtractorError: llmHealth.lastExtractorError,
+    // Static capability — based on env presence at boot. Runtime health (above)
+    // tells whether the last attempt actually succeeded.
+    plannerConfigured: cap.llmExtraction,
+    extractorConfigured: cap.llmExtraction,
+    consecutivePlannerFailures: llmHealth.consecutivePlannerFailures,
+    consecutiveExtractorFailures: llmHealth.consecutiveExtractorFailures
+  };
+}
+var LLM_RETRY_CONFIG = {
+  maxRetries: 2,
+  baseDelayMs: 1e3,
+  maxDelayMs: 5e3
+};
+var FALLBACK_RETRY_COUNT = 3;
+var RETRYABLE_LLM_ERROR_CODES = /* @__PURE__ */ new Set([
+  "rate_limit_exceeded",
+  "server_error",
+  "timeout",
+  "service_unavailable"
+]);
+function hasStatus(error2) {
+  return typeof error2 === "object" && error2 !== null && "status" in error2 && typeof error2.status === "number";
+}
+var llmClient = null;
+function createLLMProcessor() {
+  if (!getCapabilities().llmExtraction) return null;
+  if (!llmClient) {
+    llmClient = new OpenAI({
+      baseURL: LLM_EXTRACTION.BASE_URL,
+      apiKey: LLM_EXTRACTION.API_KEY,
+      timeout: LLM_CLIENT_TIMEOUT_MS,
+      maxRetries: 0,
+      defaultHeaders: { "X-Title": "mcp-research-powerpack" }
+    });
+    mcpLog("info", `LLM extraction configured (model: ${LLM_EXTRACTION.MODEL}, baseURL: ${LLM_EXTRACTION.BASE_URL})`, "llm");
+  }
+  return llmClient;
+}
+function buildChatRequestBody(model, prompt) {
+  return {
+    model,
+    messages: [{ role: "user", content: prompt }],
+    reasoning_effort: "low"
+  };
+}
+function normalizeProviderError(err, message) {
+  if (typeof err === "object" && err !== null) return err;
+  return new Error(message);
+}
+function getProviderFailure(response) {
+  if (response.content !== null || response.failureKind !== "provider") return null;
+  return response.errorCause;
+}
+function emptyLLMExtractionResult(content) {
+  return {
+    content,
+    processed: false,
+    error: "LLM returned empty response",
+    errorDetails: {
+      code: ErrorCode.INTERNAL_ERROR,
+      message: "LLM returned empty response",
+      retryable: false
+    }
+  };
+}
+async function requestText(processor, prompt, operationLabel, signal, modelOverride) {
+  const model = modelOverride || LLM_EXTRACTION.MODEL;
+  try {
+    const response = await withStallProtection(
+      (stallSignal) => processor.chat.completions.create(
+        buildChatRequestBody(model, prompt),
+        {
+          signal: signal ? AbortSignal.any([stallSignal, signal]) : stallSignal,
+          timeout: LLM_REQUEST_DEADLINE_MS
+        }
+      ),
+      LLM_STALL_TIMEOUT_MS,
+      3,
+      `${operationLabel} (${model})`
+    );
+    const content = response.choices?.[0]?.message?.content?.trim();
+    if (content) {
+      return { content, model };
+    }
+    const err = `Empty response from model ${model}`;
+    mcpLog("warning", `${operationLabel} returned empty content for model ${model}`, "llm");
+    return { content: null, model, error: err, failureKind: "empty" };
+  } catch (err) {
+    const message = err instanceof Error ? err.message : String(err);
+    mcpLog("warning", `${operationLabel} failed for model ${model}: ${message}`, "llm");
+    return {
+      content: null,
+      model,
+      error: message,
+      failureKind: "provider",
+      errorCause: normalizeProviderError(err, message)
+    };
+  }
+}
+async function requestTextWithFallback(processor, prompt, operationLabel, signal) {
+  const primary = await requestText(processor, prompt, operationLabel, signal);
+  if (primary.content !== null) return primary;
+  const fallbackModel = LLM_EXTRACTION.FALLBACK_MODEL;
+  if (!fallbackModel) return primary;
+  mcpLog("warning", `Primary model failed, switching to fallback ${fallbackModel}`, "llm");
+  let lastFailure = primary;
+  for (let attempt = 0; attempt < FALLBACK_RETRY_COUNT; attempt++) {
+    if (attempt > 0) {
+      const delayMs = calculateLLMBackoff(attempt - 1);
+      mcpLog("warning", `Fallback retry ${attempt}/${FALLBACK_RETRY_COUNT - 1} in ${delayMs}ms`, "llm");
+      try {
+        await sleep(delayMs, signal);
+      } catch {
+        break;
+      }
+    }
+    const result = await requestText(processor, prompt, `${operationLabel} [fallback]`, signal, fallbackModel);
+    if (result.content !== null) return result;
+    lastFailure = result;
+  }
+  return lastFailure;
+}
+function isRetryableLLMError(error2) {
+  if (!error2 || typeof error2 !== "object") return false;
+  const stallCode = error2?.code;
+  if (stallCode === "ESTALLED" || stallCode === "ETIMEDOUT") {
+    return true;
+  }
+  if (hasStatus(error2)) {
+    if (error2.status === 429 || error2.status === 500 || error2.status === 502 || error2.status === 503 || error2.status === 504) {
+      return true;
+    }
+  }
+  const record = error2;
+  const code = typeof record.code === "string" ? record.code : void 0;
+  const nested = typeof record.error === "object" && record.error !== null ? record.error : null;
+  const errorCode = code ?? (nested && typeof nested.code === "string" ? nested.code : void 0) ?? (nested && typeof nested.type === "string" ? nested.type : void 0);
+  if (errorCode && RETRYABLE_LLM_ERROR_CODES.has(errorCode)) {
+    return true;
+  }
+  const message = typeof record.message === "string" ? record.message.toLowerCase() : "";
+  if (message.includes("rate limit") || message.includes("timeout") || message.includes("timed out") || message.includes("service unavailable") || message.includes("server error") || message.includes("connection") || message.includes("econnreset")) {
+    return true;
+  }
+  return false;
+}
+function isContextWindowError(error2) {
+  if (!error2 || typeof error2 !== "object") return false;
+  const record = error2;
+  const nested = typeof record.error === "object" && record.error !== null ? record.error : null;
+  const code = typeof record.code === "string" ? record.code : void 0;
+  const nestedCode = nested && typeof nested.code === "string" ? nested.code : void 0;
+  if (code === "context_length_exceeded" || nestedCode === "context_length_exceeded") {
+    return true;
+  }
+  const messages = [];
+  if (typeof record.message === "string") messages.push(record.message);
+  if (nested && typeof nested.message === "string") messages.push(nested.message);
+  const combined = messages.join(" ").toLowerCase();
+  return combined.includes("context length") || combined.includes("context window") || combined.includes("maximum context") || combined.includes("maximum tokens") || combined.includes("token limit") || combined.includes("too many tokens") || combined.includes("prompt is too long") || combined.includes("reduce the length");
+}
+function calculateLLMBackoff(attempt) {
+  const exponentialDelay = LLM_RETRY_CONFIG.baseDelayMs * Math.pow(2, attempt);
+  const jitter = Math.random() * BACKOFF_JITTER_FACTOR * exponentialDelay;
+  return Math.min(exponentialDelay + jitter, LLM_RETRY_CONFIG.maxDelayMs);
+}
+async function processContentWithLLM(content, config, processor, signal) {
+  if (!config.enabled) {
+    return { content, processed: false };
+  }
+  if (!processor) {
+    return {
+      content,
+      processed: false,
+      error: "LLM processor not available (LLM_API_KEY, LLM_BASE_URL, and LLM_MODEL must all be set)",
+      errorDetails: {
+        code: ErrorCode.AUTH_ERROR,
+        message: "LLM processor not available",
+        retryable: false
+      }
+    };
+  }
+  if (!content?.trim()) {
+    return { content: content || "", processed: false, error: "Empty content provided" };
+  }
+  const truncatedContent = content.length > MAX_LLM_INPUT_CHARS ? content.substring(0, MAX_LLM_INPUT_CHARS) + "\n\n[Content truncated due to length]" : content;
+  const skipPrimaryForSize = truncatedContent.length > MAX_PRIMARY_MODEL_INPUT_CHARS && !!LLM_EXTRACTION.FALLBACK_MODEL;
+  const safeUrl = (() => {
+    if (!config.url) return void 0;
+    try {
+      const u = new URL(config.url);
+      return `${u.origin}${u.pathname}`;
+    } catch {
+      return void 0;
+    }
+  })();
+  const urlLine = safeUrl ? `PAGE URL: ${safeUrl}
+
+` : "";
+  const prompt = config.extract ? `You are a factual extractor for a research agent. Extract ONLY the information that matches the instruction below. Do not summarize, interpret, or editorialize.
+
+${urlLine}EXTRACTION INSTRUCTION: ${config.extract}
+
+STEP 1 \u2014 Classify this page. Look at the URL if present, plus structural cues (code blocks, table patterns, comment threads, marketing copy). Pick ONE:
+\`docs | changelog | github-readme | github-thread | reddit | hackernews | forum | blog | marketing | announcement | qa | cve | paper | release-notes | other\`
+
+STEP 2 \u2014 Adjust emphasis by page type:
+- docs / changelog / github-readme / release-notes \u2192 API signatures, version numbers, flags, exact config keys, code blocks. Copy verbatim. Preserve tables as tables.
+- github-thread \u2192 weight MAINTAINER comments (label "[maintainer]") over drive-by commenters. Preserve stacktraces verbatim. Capture chronological resolution \u2014 what was decided and when. Link the accepted-fix commit/PR if referenced.
+- reddit / hackernews / forum \u2192 lived experience. Quote verbatim with attribution ("u/foo wrote: \u2026" or "user <name>"). Prioritize replies with stack details, specific failure stories, or replies that contradict the OP. Record overall sentiment distribution as one bullet if clear skew ("~70% agree / ~20% dissent / rest off-topic"). Drop context-free opinions ("this sucks") from Matches.
+- blog \u2192 prioritize concrete reproductions, code, measurements. If the author makes a claim without evidence, mark "[unsourced claim]".
+- marketing / announcement \u2192 pricing tiers, feature matrices verbatim, free-tier quotas, enterprise contact. Preserve tables as tables. Treat roadmap/future-tense claims skeptically \u2014 note them as "[announced, not shipped]" when framing is future-tense.
+- qa (stackoverflow) \u2192 accepted answer's code + high-voted disagreements. Always note the answer date \u2014 SO rots.
+- cve \u2192 CVSS vector verbatim, CWE, CPE ranges, affected versions, fix version, references. Each with its label.
+- paper \u2192 claim, method, dataset, benchmark numbers, comparison baseline. Preserve numeric deltas verbatim.
+
+STEP 3 \u2014 Emit markdown with these sections, in order:
+
+## Source
+- URL: <verbatim if visible, else "unknown">
+- Page type: <the type you picked>
+- Page date: <verbatim if visible, else "not visible">
+- Author / maintainer (if identifiable): <verbatim>
+
+## Matches
+One bullet per distinct piece of matching info:
+- **<short label>** \u2014 the information. Quote VERBATIM for: numbers, versions, dates, API names, prices, error messages, stacktraces, CVSS vectors, benchmark scores, command flags, proper nouns, and people's words. Backticks for code/identifiers. Preserve tables.
+
+## Not found
+Every part of the extraction instruction this page did NOT answer. Be explicit. Example: "Enterprise pricing contact \u2014 not present on this page."
+
+## Follow-up signals
+Short bullets \u2014 NEW angles this page surfaced that the agent should investigate. Include: new terms, unexpected vendor names, contradicting claims, referenced-but-unscraped URLs. Copy URLs VERBATIM from the source; if only anchor text is visible, write "anchor: <text> (URL not in scraped content)". Skip this section if nothing new surfaced. Do NOT invent.
+
+## Contradictions
+(Include this section only if the page contains internally contradictory claims.) Bullet each contradiction with both sides quoted verbatim.
+
+## Truncation
+(Include only if content appears cut mid-element.) "Content cut mid-<table row / code block / comment / paragraph>; extraction may be incomplete for <section>."
+
+RULES:
+- Never paraphrase numbers, versions, code, or quoted text.
+- If an instruction item is not answered, it goes in "Not found" \u2014 do NOT invent an answer to please the caller.
+- Preserve code blocks, command examples, tables exactly.
+- Do NOT add commentary or recommendations outside "Follow-up signals".
+- Page language \u2260 English: quote verbatim in the original language AND provide a parenthetical gloss in English.
+- Page appears gated (login wall, paywall, JS-render-empty shell) or near-empty: BEFORE dismissing the page, look for ANY visible text \u2014 og:title, og:description, meta description, headline, author name, nav labels, teaser/preview sentences, visible comment snippets. If ANY such text exists, extract it as usual under \`## Source\` + \`## Matches\`, and list the blocked facets under \`## Not found\`. Prefix the first \`## Matches\` bullet with \`**[partial \u2014 <reason>]**\` so the caller knows the body is gated (reasons: \`login-wall | paywall | JS-render-empty | truncated-before-relevant-section\`). ONLY when there is NO visible extractable text at all (< 50 words AND no og:* AND no headline AND no preview), return exactly one line:
+\`## Matches\\n_Page did not load: <reason>_\`
+Valid reasons: \`404 | login-wall | paywall | JS-render-empty | non-text-asset | truncated-before-relevant-section\`.
+
+Content:
+${truncatedContent}` : `Clean the following page content: drop navigation, ads, cookie banners, footers, author bios, related-article lists. Preserve headings, paragraphs, code blocks, tables, and inline links as \`[text](url)\`. Do NOT summarize \u2014 preserve the full body.
+
+${urlLine}Content:
+${truncatedContent}`;
+  let lastError;
+  if (skipPrimaryForSize) {
+    mcpLog(
+      "info",
+      `Input ${truncatedContent.length} chars exceeds primary model cap (${MAX_PRIMARY_MODEL_INPUT_CHARS}); routing directly to fallback`,
+      "llm"
+    );
+  } else {
+    for (let attempt = 0; attempt <= LLM_RETRY_CONFIG.maxRetries; attempt++) {
+      try {
+        if (attempt === 0) {
+          mcpLog("info", `Starting extraction with ${LLM_EXTRACTION.MODEL}`, "llm");
+        } else {
+          mcpLog("warning", `Retry attempt ${attempt}/${LLM_RETRY_CONFIG.maxRetries}`, "llm");
+        }
+        const response = await requestText(processor, prompt, "LLM extraction", signal);
+        if (response.content !== null) {
+          mcpLog("info", `Successfully extracted ${response.content.length} characters`, "llm");
+          markLLMSuccess("extractor");
+          return { content: response.content, processed: true };
+        }
+        const providerFailure = getProviderFailure(response);
+        if (providerFailure) {
+          throw providerFailure;
+        }
+        mcpLog("warning", "Received empty response from LLM", "llm");
+        markLLMFailure("extractor", "LLM returned empty response");
+        return emptyLLMExtractionResult(content);
+      } catch (err) {
+        lastError = classifyError(err);
+        const status = hasStatus(err) ? err.status : void 0;
+        const code = typeof err === "object" && err !== null && "code" in err ? String(err.code) : void 0;
+        const ctxErr = isContextWindowError(err);
+        mcpLog("error", `Error (attempt ${attempt + 1}): ${lastError.message} [status=${status}, code=${code}, retryable=${isRetryableLLMError(err)}, context_window=${ctxErr}]`, "llm");
+        if (ctxErr) {
+          mcpLog("warning", "Context window exceeded on primary \u2014 skipping remaining retries, routing to fallback", "llm");
+          break;
+        }
+        if (isRetryableLLMError(err) && attempt < LLM_RETRY_CONFIG.maxRetries) {
+          const delayMs = calculateLLMBackoff(attempt);
+          mcpLog("warning", `Retrying in ${delayMs}ms...`, "llm");
+          try {
+            await sleep(delayMs, signal);
+          } catch {
+            break;
+          }
+          continue;
+        }
+        break;
+      }
+    }
+  }
+  const fallbackModel = LLM_EXTRACTION.FALLBACK_MODEL;
+  if (fallbackModel) {
+    mcpLog("warning", `Primary exhausted, switching to fallback ${fallbackModel}`, "llm");
+    for (let attempt = 0; attempt < FALLBACK_RETRY_COUNT; attempt++) {
+      if (attempt > 0) {
+        const delayMs = calculateLLMBackoff(attempt - 1);
+        mcpLog("warning", `Fallback retry ${attempt}/${FALLBACK_RETRY_COUNT - 1} in ${delayMs}ms`, "llm");
+        try {
+          await sleep(delayMs, signal);
+        } catch {
+          break;
+        }
+      }
+      try {
+        const response = await requestText(processor, prompt, "LLM extraction [fallback]", signal, fallbackModel);
+        if (response.content !== null) {
+          mcpLog("info", `Fallback extracted ${response.content.length} characters`, "llm");
+          markLLMSuccess("extractor");
+          return { content: response.content, processed: true };
+        }
+        const providerFailure = getProviderFailure(response);
+        if (providerFailure) {
+          throw providerFailure;
+        }
+        mcpLog("warning", "Fallback returned empty response", "llm");
+        markLLMFailure("extractor", "LLM returned empty response");
+        return emptyLLMExtractionResult(content);
+      } catch (err) {
+        lastError = classifyError(err);
+        mcpLog("error", `Fallback error (attempt ${attempt + 1}): ${lastError.message}`, "llm");
+        if (isContextWindowError(err) || !isRetryableLLMError(err)) break;
+      }
+    }
+  }
+  const errorMessage = lastError?.message || "Unknown LLM error";
+  mcpLog("error", `All attempts failed: ${errorMessage}. Returning original content.`, "llm");
+  markLLMFailure("extractor", errorMessage);
+  return {
+    content,
+    processed: false,
+    error: `LLM extraction failed: ${errorMessage}`,
+    errorDetails: lastError || {
+      code: ErrorCode.UNKNOWN_ERROR,
+      message: errorMessage,
+      retryable: false
+    }
+  };
+}
|
|
874
|
+
var MAX_CLASSIFICATION_URLS = 50;
|
|
875
|
+
async function classifySearchResults(rankedUrls, objective, totalQueries, processor, previousQueries = []) {
|
|
876
|
+
const urlsToClassify = rankedUrls.slice(0, MAX_CLASSIFICATION_URLS);
|
|
877
|
+
const STATIC_WEIGHTS = [30, 20, 15, 10, 8, 6, 5, 4, 3, 2];
|
|
878
|
+
const weightForRank = (rank) => STATIC_WEIGHTS[rank - 1] ?? 1;
|
|
879
|
+
const lines = [];
|
|
880
|
+
for (const url of urlsToClassify) {
|
|
881
|
+
let domain;
|
|
882
|
+
try {
|
|
883
|
+
domain = new URL(url.url).hostname.replace(/^www\./, "");
|
|
884
|
+
} catch {
|
|
885
|
+
domain = url.url;
|
|
886
|
+
}
|
|
887
|
+
const snippet = url.snippet.length > 120 ? url.snippet.slice(0, 117) + "..." : url.snippet;
|
|
888
|
+
lines.push(`[${url.rank}] w=${weightForRank(url.rank)} ${url.title} \u2014 ${domain} \u2014 ${snippet}`);
|
|
889
|
+
}
|
|
890
|
+
const prevQueriesBlock = previousQueries.length > 0 ? previousQueries.map((q) => `- ${q}`).join("\n") : "- (none provided)";
|
|
891
|
+
const today = (/* @__PURE__ */ new Date()).toISOString().slice(0, 10);
|
|
892
|
+
const prompt = `You are the relevance filter for a research agent. Classify each search result below against the objective and produce a structured analysis.
|
|
893
|
+
|
|
894
|
+
OBJECTIVE: ${objective}
|
|
895
|
+
TODAY: ${today}
|
|
896
|
+
|
|
897
|
+
PREVIOUS QUERIES (already run \u2014 do NOT paraphrase in refine_queries):
|
|
898
|
+
${prevQueriesBlock}
|
|
899
|
+
|
|
900
|
+
Return ONLY a JSON object (no markdown, no code fences):
|
|
901
|
+
|
|
902
|
+
{
|
|
903
|
+
"title": "2\u20138 word label for this RESULT CLUSTER (not the objective)",
|
|
904
|
+
"synthesis": "3\u20135 sentences grounded in the results. Every non-trivial claim cites a rank in [brackets], e.g. '[3] documents the flag; [7][12] report it is broken on macOS.' A synthesis with zero citations is invalid.",
|
|
905
|
+
"confidence": "high | medium | low",
|
|
906
|
+
"confidence_reason": "one sentence \u2014 why",
|
|
907
|
+
"gaps": [
|
|
908
|
+
{ "id": 0, "description": "specific, actionable thing the current results do NOT answer \u2014 not 'more info needed'" }
|
|
909
|
+
],
|
|
910
|
+
"refine_queries": [
|
|
911
|
+
{ "query": "concrete next search", "gap_id": 0, "rationale": "\u226412 words" }
|
|
912
|
+
],
|
|
913
|
+
"results": [
|
|
914
|
+
{
|
|
915
|
+
"rank": 1,
|
|
916
|
+
"tier": "HIGHLY_RELEVANT | MAYBE_RELEVANT | OTHER",
|
|
917
|
+
"source_type": "vendor_doc | github | reddit | hackernews | blog | news | marketing | stackoverflow | cve | paper | release_notes | aggregator | other",
|
|
918
|
+
"reason": "\u226412 words citing the snippet cue that drove the tier"
|
|
919
|
+
}
|
|
920
|
+
]
|
|
921
|
+
}
|
|
922
|
+
|
|
923
|
+
WEIGHT SCHEME: each row is prefixed with a weight (w=N). Higher weight means the URL ranked better across input queries \u2014 prefer HIGHLY_RELEVANT for high-weight rows when content matches the objective. Weight alone never justifies HIGHLY_RELEVANT; snippet cues still drive the decision.
|
|
924
|
+
|
|
925
|
+
SOURCE-OF-TRUTH RUBRIC (the "primary source" is goal-dependent \u2014 infer goal type from the objective):
|
|
926
|
+
- spec / API / config questions \u2192 vendor_doc, github (README, RFC), release_notes are primary
|
|
927
|
+
- bug / failure-mode questions \u2192 github (issue/PR), stackoverflow are primary
|
|
928
|
+
- migration / sentiment / lived-experience \u2192 reddit, hackernews, blog are primary; docs are secondary
|
|
929
|
+
- pricing / commercial \u2192 marketing (the vendor's own pricing page IS the primary source, but treat feature lists skeptically)
|
|
930
|
+
- security / CVE \u2192 cve databases, distro security trackers (nvd.nist.gov, security-tracker.debian.org, ubuntu.com/security) are primary
|
|
931
|
+
- synthesis / open-ended \u2192 blend; no single type is primary
|
|
932
|
+
- product launch \u2192 vendor_doc + news + marketing for the launch itself; blogs + reddit for independent verification
|
|
933
|
+
|
|
934
|
+
FRESHNESS: proportional to topic velocity. For a week-old release, demote anything older than 30 days. For general tech questions, demote older than 18 months. For stable protocols (HTTP, TCP, POSIX), don't demote by age.
|
|
935
|
+
|
|
936
|
+
CONFIDENCE:
|
|
937
|
+
- high = \u22653 HIGHLY_RELEVANT results from INDEPENDENT domains agree on the core answer
|
|
938
|
+
- medium = \u22652 HIGHLY_RELEVANT exist but disagree or share a domain; OR a single authoritative primary source answers it
|
|
939
|
+
- low = otherwise; snippet-only judgments cap at medium
|
|
940
|
+
|
|
941
|
+
REFINE QUERIES \u2014 each MUST differ from every previousQuery by:
|
|
942
|
+
- a new operator (site:, quotes, verbatim version number), OR
|
|
943
|
+
- a domain-specific noun ABSENT from every prior query
|
|
944
|
+
Adding a year alone does NOT count as differentiation.
|
|
945
|
+
Each refine_query MUST reference a specific gap_id from the gaps array above.
|
|
946
|
+
Produce 4\u20138 refine_queries total. Cover: (a) a primary-source probe, (b) a temporal sharpener, (c) a failure-mode or comparison probe, (d) at least one new-term probe seeded by a specific result's snippet.
|
|
947
|
+
|
|
948
|
+
RULES:
|
|
949
|
+
- Classify ALL ${urlsToClassify.length} results. Do not skip or collapse any.
|
|
950
|
+
- Use only the three tier values.
|
|
951
|
+
- Judge from title + domain + snippet only. Do NOT invent facts not present in the snippet.
|
|
952
|
+
- If ALL results are OTHER: synthesis = "", confidence = "low", and \`gaps\` must explicitly state why the current queries missed the target.
|
|
953
|
+
- Casing: tier = UPPERCASE_WITH_UNDERSCORES, confidence = lowercase.
|
|
954
|
+
|
|
955
|
+
SEARCH RESULTS (${urlsToClassify.length} URLs from ${totalQueries} queries):
|
|
956
|
+
${lines.join("\n")}`;
  try {
    mcpLog("info", `Classifying ${urlsToClassify.length} URLs against objective`, "llm");
    const response = await requestTextWithFallback(
      processor,
      prompt,
      "Search classification"
    );
    if (response.content === null) {
      const errMsg = response.error ?? "LLM returned empty classification response";
      markLLMFailure("planner", errMsg);
      return { result: null, error: errMsg };
    }
    const cleaned = response.content.replace(/^```(?:json)?\s*\n?/m, "").replace(/\n?```\s*$/m, "").trim();
    const parsed = JSON.parse(cleaned);
    if (!parsed.title || typeof parsed.synthesis !== "string" || !Array.isArray(parsed.results)) {
      const errMsg = "LLM response missing required fields (title, synthesis, results)";
      markLLMFailure("planner", errMsg);
      return { result: null, error: errMsg };
    }
    mcpLog("info", `Classification complete: ${parsed.results.filter((r) => r.tier === "HIGHLY_RELEVANT").length} highly relevant`, "llm");
    markLLMSuccess("planner");
    return { result: parsed };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    mcpLog("error", `Classification failed: ${message}`, "llm");
    markLLMFailure("planner", message);
    return { result: null, error: `Classification failed: ${message}` };
  }
}
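// Editorial note: the two fence-stripping regexes above let the model answer
// either as bare JSON or inside a ```json fence before JSON.parse runs. A
// minimal reply that passes the shape check (illustrative placeholder values):
//   { "title": "t", "synthesis": "s", "results": [{ "tier": "HIGHLY_RELEVANT" }] }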
async function suggestRefineQueriesForRawMode(rankedUrls, objective, originalQueries, processor) {
  const urlsToSummarize = rankedUrls.slice(0, 12);
  const lines = urlsToSummarize.map((url) => {
    let domain;
    try {
      domain = new URL(url.url).hostname.replace(/^www\./, "");
    } catch {
      domain = url.url;
    }
    return `[${url.rank}] ${url.title} \u2014 ${domain}`;
  });
  const prompt = `You are generating follow-up search queries for an agent using raw search results.

Return ONLY a JSON object (no markdown, no code fences):
{
"refine_queries": [
{ "query": "next search query", "gap_description": "what gap this closes", "rationale": "\u226412 words on why" }
]
}

OBJECTIVE: ${objective}

PREVIOUS QUERIES (already run \u2014 do NOT paraphrase):
${originalQueries.map((query) => `- ${query}`).join("\n")}

TOP RESULT TITLES (to seed new-term probes):
${lines.join("\n")}

RULES:
- Produce 4\u20136 diverse follow-ups. Cover: (a) a primary-source probe (site:, RFC, vendor docs); (b) a temporal sharpener (changelog, version number); (c) a failure-mode or comparison probe; (d) at least one new-term probe seeded by a specific result title.
- Each query MUST differ from every previousQuery by either a new operator (site:, quotes, a verbatim version number) OR a domain-specific noun absent from every prior query. Adding a year alone does NOT count.
- Each refine_query MUST include a \`gap_description\` naming what the current results don't answer.
- Do not include URLs.
- Keep rationales \u226412 words.`;
  try {
    const response = await requestTextWithFallback(
      processor,
      prompt,
      "Raw-mode refine query generation"
    );
    if (response.content === null) {
      const errMsg = response.error ?? "LLM returned empty raw-mode refine query response";
      markLLMFailure("planner", errMsg);
      return { result: [], error: errMsg };
    }
    const cleaned = response.content.replace(/^```(?:json)?\s*\n?/m, "").replace(/\n?```\s*$/m, "").trim();
    const parsed = JSON.parse(cleaned);
    markLLMSuccess("planner");
    return { result: Array.isArray(parsed.refine_queries) ? parsed.refine_queries : [] };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    mcpLog("error", `Raw-mode refine query generation failed: ${message}`, "llm");
    markLLMFailure("planner", message);
    return { result: [], error: message };
  }
}
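// Editorial note: per the prompt contract above, a well-formed raw-mode reply
// is a bare JSON object such as (illustrative values only):
//   { "refine_queries": [{ "query": "exact error text site:github.com",
//     "gap_description": "no issue-tracker coverage yet", "rationale": "primary-source probe" }] }
// Anything that fails Array.isArray degrades to an empty result array.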
var VALID_GOAL_CLASSES = /* @__PURE__ */ new Set([
  "spec",
  "bug",
  "migration",
  "sentiment",
  "pricing",
  "security",
  "synthesis",
  "product_launch",
  "other"
]);
var VALID_FRESHNESS = /* @__PURE__ */ new Set(["days", "weeks", "months", "years"]);
var VALID_BRANCHES = /* @__PURE__ */ new Set(["reddit", "web", "both"]);
var VALID_STEP_TOOLS = /* @__PURE__ */ new Set(["raw-web-search", "smart-web-search", "raw-scrape-links", "smart-scrape-links"]);
function isStringArray(value) {
  return Array.isArray(value) && value.every((v) => typeof v === "string");
}
function isStepArray(value) {
  return Array.isArray(value) && value.every((s) => {
    if (typeof s !== "object" || s === null) return false;
    const tool = s.tool;
    const reason = s.reason;
    return typeof tool === "string" && VALID_STEP_TOOLS.has(tool) && typeof reason === "string" && reason.trim().length > 0;
  });
}
function parseResearchBrief(raw) {
  try {
    const cleaned = raw.replace(/^```(?:json)?\s*\n?/m, "").replace(/\n?```\s*$/m, "").trim();
    const parsed = JSON.parse(cleaned);
    const goal_class = typeof parsed.goal_class === "string" ? parsed.goal_class : null;
    if (!goal_class || !VALID_GOAL_CLASSES.has(goal_class)) return null;
    const freshness_window = typeof parsed.freshness_window === "string" ? parsed.freshness_window : null;
    if (!freshness_window || !VALID_FRESHNESS.has(freshness_window)) return null;
    const primary_branch = parsed.primary_branch;
    if (typeof primary_branch !== "string" || !VALID_BRANCHES.has(primary_branch)) return null;
    if (!isStepArray(parsed.first_call_sequence) || parsed.first_call_sequence.length === 0) return null;
    if (!isStringArray(parsed.keyword_seeds) || parsed.keyword_seeds.length === 0) return null;
    return {
      goal_class,
      goal_class_reason: typeof parsed.goal_class_reason === "string" ? parsed.goal_class_reason : "",
      primary_branch,
      primary_branch_reason: typeof parsed.primary_branch_reason === "string" ? parsed.primary_branch_reason : "",
      freshness_window,
      first_call_sequence: parsed.first_call_sequence,
      keyword_seeds: parsed.keyword_seeds.filter((s) => s.trim().length > 0),
      iteration_hints: isStringArray(parsed.iteration_hints) ? parsed.iteration_hints : [],
      gaps_to_watch: isStringArray(parsed.gaps_to_watch) ? parsed.gaps_to_watch : [],
      stop_criteria: isStringArray(parsed.stop_criteria) ? parsed.stop_criteria : []
    };
  } catch {
    return null;
  }
}
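// Editorial note: parseResearchBrief accepts only briefs whose goal_class,
// freshness_window, and primary_branch hit the whitelists above and whose
// first_call_sequence / keyword_seeds are non-empty. A minimal brief that
// validates (illustrative values):
//   { "goal_class": "bug", "freshness_window": "weeks", "primary_branch": "web",
//     "first_call_sequence": [{ "tool": "raw-web-search", "reason": "broad URL capture" }],
//     "keyword_seeds": ["exact error text site:github.com"] }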
async function generateResearchBrief(goal, processor, signal) {
  const today = (/* @__PURE__ */ new Date()).toISOString().slice(0, 10);
  const prompt = `You are a research planner. An agent is about to run a multi-pass research loop on the goal below using 5 tools:

- start-research: orientation and this brief
- raw-web-search: raw search fan-out, keywords only, up to 50 keywords per call, no LLM; best for breadth, audit trails, and candidate URL capture
- smart-web-search: search fan-out + required LLM prioritization/classification over titles/snippets, scope: web|reddit|both, up to 50 keywords per call; best for triage after a strong diverse keyword set
- raw-scrape-links: fetch URLs as full markdown, urls only, no LLM; Reddit permalinks return threaded comments; best for complete context and ambiguous sources
- smart-scrape-links: fetch URLs then required LLM extraction over page bodies; Reddit permalinks return threaded comments before extraction; best for focused evidence extraction once facets are known

Produce a tailored JSON brief.

GOAL: ${goal}
TODAY: ${today}

Return ONLY a JSON object (no markdown, no code fences):

{
"goal_class": "spec | bug | migration | sentiment | pricing | security | synthesis | product_launch | other",
"goal_class_reason": "one sentence \u2014 why this class",
"primary_branch": "reddit | web | both",
"primary_branch_reason": "one sentence \u2014 why this branch leads",
"freshness_window": "days | weeks | months | years",
"first_call_sequence": [
{ "tool": "raw-web-search | smart-web-search | raw-scrape-links | smart-scrape-links", "reason": "what this call establishes for the agent" }
],
"keyword_seeds": ["25\u201350 concrete search keywords \u2014 flat list, to be fired in the first search call as keywords"],
"iteration_hints": ["2\u20135 pointers on which harvested terms / follow-up signals to watch for after pass 1"],
"gaps_to_watch": ["2\u20135 concrete questions the agent MUST verify or the answer is incomplete"],
"stop_criteria": ["2\u20134 checkable conditions \u2014 all must hold before the agent declares done"]
}

RULES:

primary_branch:
- "reddit" \u2192 sentiment / migration / lived-experience / community-consensus goals. Usually leads with raw-web-search using Reddit-focused keywords, then raw-scrape-links on post permalinks to preserve full comments.
- "web" \u2192 spec / bug / pricing / CVE / API / primary-source goals. Usually leads with raw-web-search for maximum candidate breadth OR smart-web-search scope:"web" when the initial keyword set is already diverse and needs prioritization.
- "both" \u2192 opinion-heavy AND needs official sources (e.g. product launch + practitioner reception).

first_call_sequence:
- 1\u20133 steps.
- Use raw-web-search when the first need is recall: many distinct source classes, exact candidate URLs, Reddit permalink discovery, or cheap follow-up passes.
- Use smart-web-search when the first need is prioritization: the agent has 10\u201350 distinct keyword probes and needs HIGHLY/MAYBE tiers, gaps, and refine queries. Smart search reads snippets only; never plan it as final evidence.
- Use raw-scrape-links when complete page/thread context is valuable, the extraction shape is unclear, or Reddit comments are the source of truth.
- Use smart-scrape-links when the extraction shape is known (facets separated by |) and the agent needs compact evidence from page bodies; this is usually the highest-value smart tool for answer construction.
- reddit-first: step 1 = raw-web-search with Reddit permalink probes; step 2 = raw-scrape-links on best post permalinks for full comments. Add smart-web-search only when there are many candidate posts to triage.
- web-first: step 1 = raw-web-search for broad URL capture OR smart-web-search scope:"web" for prioritizing a diverse keyword fan-out; step 2 = smart-scrape-links on selected URLs when extraction facets are known, otherwise raw-scrape-links first.
- both: step 1 = parallel search calls split by source need; step 2 = raw-scrape-links for full evidence and smart-scrape-links for final extraction.

keyword_seeds:
- 25\u201350 total. Narrow bug \u2192 fewer. Open synthesis \u2192 more.
- Write Google retrieval probes, not topic labels.
- For each broad idea, first do a bad \u2192 better rewrite in your head: replace a vague phrase with a query that names the evidence source class, discriminating anchor terms, and one useful operator when possible.
- ${QUERY_REWRITE_PAIR_GUIDANCE_TEXT}
- Use operators where helpful (site:, quotes, verbatim version numbers, exact error text, package names, release/version strings).
- DIVERSE facets \u2014 same noun-phrase cannot repeat across seeds with adjectives-only variation.
- Optimize keyword_seeds for distinct coverage first. Smart-web-search can prioritize a broad result set, but it cannot compensate for a narrow or repetitive keyword set.
- Do NOT invent vendor names you are uncertain exist.
- For \`site:<domain>\` filters, ONLY use domains you are highly confident are real. Safe choices: \`github.com\`, \`stackoverflow.com\`, \`reddit.com\`, \`news.ycombinator.com\`, \`arxiv.org\`, \`nvd.nist.gov\`, \`pypi.org\`, \`npmjs.com\`, plus any canonical homepage/docs domain explicitly spelled out in the goal itself (e.g. goal names "Cursor" \u2192 \`cursor.com\`/\`docs.cursor.com\` is acceptable). If you don't know the product's real docs domain, leave the query open (no \`site:\`) instead of guessing.

freshness_window:
- If the goal mentions a recent release / date / version, use "days" or "weeks".
- Stable protocols / APIs \u2192 "months" or "years".`;
  try {
    const response = await requestTextWithFallback(
      processor,
      prompt,
      "Research brief generation",
      signal
    );
    if (response.content === null) {
      mcpLog("warning", `Research brief generation returned no content: ${response.error ?? "unknown"}`, "llm");
      markLLMFailure("planner", response.error ?? "empty response");
      return null;
    }
    const brief = parseResearchBrief(response.content);
    if (!brief) {
      mcpLog("warning", "Research brief JSON parse or shape validation failed", "llm");
      markLLMFailure("planner", "brief parse/validation failed");
      return null;
    }
    markLLMSuccess("planner");
    return brief;
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    mcpLog("warning", `Research brief generation failed: ${message}`, "llm");
    markLLMFailure("planner", message);
    return null;
  }
}
function renderResearchBrief(brief) {
  const lines = [];
  lines.push("## Your research brief (goal-tailored)");
  lines.push("");
  lines.push(`**Goal class**: \`${brief.goal_class}\` \u2014 ${brief.goal_class_reason}`);
  lines.push(`**Primary branch**: \`${brief.primary_branch}\` \u2014 ${brief.primary_branch_reason}`);
  lines.push(`**Freshness**: \`${brief.freshness_window}\``);
  lines.push("");
  if (brief.first_call_sequence.length > 0) {
    lines.push("### First-call sequence");
    brief.first_call_sequence.forEach((step, i) => {
      lines.push(`${i + 1}. \`${step.tool}\` \u2014 ${step.reason}`);
    });
    lines.push("");
  }
  if (brief.keyword_seeds.length > 0) {
    lines.push(`### Keyword seeds (${brief.keyword_seeds.length}) \u2014 fire these in your first search call as a flat \`keywords\` array`);
    for (const seed of brief.keyword_seeds) {
      lines.push(`- ${seed}`);
    }
    lines.push("");
  }
  if (brief.iteration_hints.length > 0) {
    lines.push("### Iteration hints (harvest new terms from scrape extracts' `## Follow-up signals`)");
    for (const hint of brief.iteration_hints) lines.push(`- ${hint}`);
    lines.push("");
  }
  if (brief.gaps_to_watch.length > 0) {
    lines.push("### Gaps to watch");
    for (const gap of brief.gaps_to_watch) lines.push(`- ${gap}`);
    lines.push("");
  }
  if (brief.stop_criteria.length > 0) {
    lines.push("### Stop criteria");
    for (const c of brief.stop_criteria) lines.push(`- ${c}`);
    lines.push("");
  }
  lines.push("---");
  lines.push("");
  lines.push("Fire `first_call_sequence` now. After each smart scrape, harvest new terms from `## Follow-up signals`; after each raw scrape, inspect the full markdown/comments and seed the next search round. Stop when every gap is closed.");
  return lines.join("\n");
}
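// Editorial note: the rendered brief is plain markdown in exactly the push
// order above: the "## Your research brief" header, three bold
// goal/branch/freshness lines, one "###" section per non-empty array, then
// the closing directive after the "---" divider.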

// src/tools/scrape.ts
import { Effect as Effect3, Either } from "effect";

// src/schemas/scrape-links.ts
import { z as z2 } from "zod";
var urlSchema = z2.string().url({ message: "scrape: Invalid URL format" }).refine(
  (url) => url.startsWith("http://") || url.startsWith("https://"),
  { message: "scrape: URL must use http:// or https://" }
).describe("A fully-qualified HTTP or HTTPS URL to scrape.");
var urlsSchema = z2.array(urlSchema).min(1, { message: "scrape: At least 1 URL required" }).max(50, { message: "scrape: At most 50 URLs allowed per call" }).describe("URLs to fetch in parallel. Reddit post permalinks (`reddit.com/r/<sub>/comments/<id>/...`) are auto-detected and routed through the Reddit API (threaded post + comments). Non-Reddit URLs use Jina Reader first, then Jina Reader through Scrape.do proxy when configured, then optional Kernel browser rendering for web pages.");
var rawScrapeLinksParamsSchema = z2.object({
  urls: urlsSchema,
  extract: z2.never().optional()
}).strict();
var smartScrapeLinksParamsSchema = z2.object({
  urls: z2.array(urlSchema).min(1, { message: "scrape: At least 1 URL required" }).max(50, { message: "scrape: At most 50 URLs allowed per call" }).describe("URLs to fetch and extract in parallel. Reddit post permalinks (`reddit.com/r/<sub>/comments/<id>/...`) are auto-detected and routed through the Reddit API (threaded post + comments). Non-Reddit URLs use Jina Reader first, then Jina Reader through Scrape.do proxy when configured, then optional Kernel browser rendering for web pages. Mix reddit + non-reddit URLs freely; branches run concurrently. Prefer contextually grouped batches \u2014 call this tool multiple times in parallel when URL sets are unrelated."),
  extract: z2.string().min(1, { message: "smart-scrape-links: extract cannot be empty" }).describe(
    'Required semantic extraction instruction. Describe the SHAPE of what you want, separated by `|`. The extractor classifies each page (docs / github-thread / reddit / marketing / cve / paper / announcement / qa / blog / changelog / release-notes) and adjusts emphasis per type: preserves numbers/versions/stacktraces verbatim from docs and CVE pages, quotes Reddit/HN with attribution plus sentiment distribution, flags what the page did NOT answer in a "Not found" section, and surfaces referenced-but-unscraped URLs in a "Follow-up signals" section. Good examples: "root cause | affected versions | fix | workarounds | timeline"; "pricing tiers | rate limits | enterprise contact | free-tier quotas"; "maintainer decisions | accepted fix commits | stacktraces | resolved version".'
  )
}).strict();

// src/utils/retry.ts
var JITTER_FACTOR = 0.3;
var EXPONENTIAL_BASE = 2;
var DEFAULT_BASE_DELAY_MS = 1e3;
var DEFAULT_MAX_DELAY_MS = 3e4;
function calculateBackoff2(attempt, baseDelayMs = DEFAULT_BASE_DELAY_MS, maxDelayMs = DEFAULT_MAX_DELAY_MS) {
  const exponentialDelay = baseDelayMs * Math.pow(EXPONENTIAL_BASE, attempt);
  const jitter = JITTER_FACTOR * exponentialDelay * Math.random();
  return Math.min(exponentialDelay + jitter, maxDelayMs);
}
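// Editorial note: worked example of the backoff curve with the defaults above
// (base 1000ms, factor 2, jitter up to 30% of the exponential term):
// attempt 0 => 1000ms + up to 300ms jitter; attempt 1 => 2000ms + up to 600ms;
// attempt 2 => 4000ms + up to 1200ms; the sum is always clamped to
// DEFAULT_MAX_DELAY_MS (30s).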

// src/clients/jina.ts
var JINA_READER_BASE = "https://r.jina.ai/";
var JINA_SEARCH_BASE = "https://s.jina.ai/";
var DEFAULT_TIMEOUT_SECONDS = 15;
var DEFAULT_TIMEOUT_MS = DEFAULT_TIMEOUT_SECONDS * 1e3;
var MAX_RETRIES = 2;
var SEARCH_RESULTS_PER_QUERY = 10;
function buildJinaSearchUrl(query) {
  const params = new URLSearchParams({ q: query });
  return `${JINA_SEARCH_BASE}?${params.toString()}`;
}
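// Editorial note: example output (hypothetical query):
//   buildJinaSearchUrl("jina reader api")
//   // => "https://s.jina.ai/?q=jina+reader+api"
// URLSearchParams.toString() encodes spaces as "+" in the query string.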
function buildScrapeDoProxyUrl(token, parameters = "render=false") {
  const trimmed = token.trim();
  if (!trimmed) return "";
  return `http://${encodeURIComponent(trimmed)}:${parameters}@proxy.scrape.do:8080`;
}
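// Editorial note: the Scrape.do token is embedded as the userinfo of an HTTP
// proxy URI, e.g. (hypothetical token):
//   buildScrapeDoProxyUrl("MY_TOKEN")
//   // => "http://MY_TOKEN:render=false@proxy.scrape.do:8080"
// An empty or whitespace-only token short-circuits to "".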
var JinaClient = class {
  apiKey;
  constructor(apiKey) {
    const fromEnv = process.env.JINA_API_KEY?.trim();
    this.apiKey = apiKey?.trim() || fromEnv || void 0;
  }
  /**
   * Convert a URL to markdown via Jina Reader.
   * NEVER throws — always returns a JinaConvertResponse (possibly with error).
   */
  async convert(request) {
    const {
      url,
      timeoutSeconds = DEFAULT_TIMEOUT_SECONDS,
      proxyUrl,
      noCache = false,
      allowProxyRetry = false
    } = request;
    try {
      new URL(url);
    } catch {
      return {
        content: `Invalid URL: ${url}`,
        statusCode: 400,
        credits: 0,
        error: { code: ErrorCode.INVALID_INPUT, message: `Invalid URL: ${url}`, retryable: false }
      };
    }
    const first = await this.convertOnce({
      url,
      timeoutSeconds,
      proxyUrl,
      noCache
    });
    if (!first.error || !allowProxyRetry || proxyUrl || isTerminalReaderError(first.error)) {
      return first;
    }
    mcpLog("warning", `Jina Reader failed for ${url}; retrying with Jina proxy`, "jina");
    return this.convertOnce({
      url,
      timeoutSeconds,
      proxyUrl: "auto",
      noCache: true
    });
  }
  async searchMultiple(queries) {
    const startTime = Date.now();
    if (queries.length === 0) {
      return {
        searches: [],
        totalQueries: 0,
        executionTime: 0,
        error: { code: ErrorCode.INVALID_INPUT, message: "No queries provided", retryable: false }
      };
    }
    if (!this.apiKey) {
      return {
        searches: [],
        totalQueries: queries.length,
        executionTime: Date.now() - startTime,
        error: { code: ErrorCode.AUTH_ERROR, message: "Jina Search requires JINA_API_KEY", retryable: false }
      };
    }
    const searches = await Promise.all(queries.map((query) => this.searchOne(query)));
    const firstError = searches.find((search) => search.error)?.error;
    const allFailed = searches.every((search) => search.error);
    return {
      searches,
      totalQueries: queries.length,
      executionTime: Date.now() - startTime,
      ...allFailed && firstError ? { error: firstError } : {}
    };
  }
  async convertOnce(request) {
    const headers = {
      Accept: "application/json",
      "Content-Type": "application/json"
    };
    if (this.apiKey) headers["Authorization"] = `Bearer ${this.apiKey}`;
    if (request.proxyUrl && request.proxyUrl !== "auto") {
      headers["X-Proxy-Url"] = request.proxyUrl;
    }
    const body = {
      url: request.url,
      respondWith: "markdown",
      timeout: request.timeoutSeconds,
      base: "final",
      removeOverlay: true
    };
    if (request.proxyUrl === "auto") body["proxy"] = "auto";
    if (request.noCache) body["noCache"] = true;
    return this.fetchReader(body, headers, request.timeoutSeconds);
  }
  async fetchReader(body, headers, timeoutSeconds) {
    let lastError;
    for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) {
      try {
        const response = await fetchWithTimeout(JINA_READER_BASE, {
          method: "POST",
          headers,
          body: JSON.stringify(body),
          timeoutMs: (timeoutSeconds + 5) * 1e3
        });
        const raw = await response.text().catch(
          (readError) => `Failed to read Jina response: ${readError instanceof Error ? readError.message : String(readError)}`
        );
        const usageHeader = response.headers.get("x-usage-tokens");
        const usageTokens = usageHeader ? Number(usageHeader) : void 0;
        const parsed = parseReaderContent(raw);
        if (response.ok) {
          if (!parsed.content.trim()) {
            return emptyReaderResponse(response.status, usageTokens);
          }
          return {
            content: parsed.content,
            statusCode: response.status,
            credits: 0,
            usageTokens: Number.isFinite(usageTokens) ? usageTokens : void 0
          };
        }
        const terminal = terminalReaderResponse(response.status, parsed.content || raw);
        if (terminal) return terminal;
        lastError = classifyError({ status: response.status, message: raw.slice(0, 200) });
        if (lastError.retryable && attempt < MAX_RETRIES) {
          const delayMs = calculateBackoff2(attempt);
          mcpLog(
            "warning",
            `Jina ${response.status} on attempt ${attempt + 1}/${MAX_RETRIES + 1}. Retrying in ${delayMs}ms`,
            "jina"
          );
          await sleep(delayMs);
          continue;
        }
        return {
          content: `Jina Reader error (${response.status}): ${raw.slice(0, 200)}`,
          statusCode: response.status,
          credits: 0,
          error: lastError
        };
      } catch (error2) {
        lastError = classifyError(error2);
        if (lastError.retryable && attempt < MAX_RETRIES) {
          const delayMs = calculateBackoff2(attempt);
          mcpLog(
            "warning",
            `Jina ${lastError.code}: ${lastError.message}. Retry ${attempt + 1}/${MAX_RETRIES + 1} in ${delayMs}ms`,
            "jina"
          );
          await sleep(delayMs);
          continue;
        }
        return {
          content: `Jina Reader failed: ${lastError.message}`,
          statusCode: lastError.statusCode ?? 500,
          credits: 0,
          error: lastError
        };
      }
    }
    return {
      content: `Jina Reader failed after ${MAX_RETRIES + 1} attempts: ${lastError?.message ?? "Unknown error"}`,
      statusCode: lastError?.statusCode ?? 500,
      credits: 0,
      error: lastError ?? { code: ErrorCode.UNKNOWN_ERROR, message: "All retries exhausted", retryable: false }
    };
  }
  async searchOne(query) {
    const headers = {
      Accept: "application/json",
      Authorization: `Bearer ${this.apiKey ?? ""}`
    };
    try {
      const response = await fetchWithTimeout(buildJinaSearchUrl(query), {
        method: "GET",
        headers,
        timeoutMs: DEFAULT_TIMEOUT_MS
      });
      const raw = await response.text().catch(
        (readError) => `Failed to read Jina Search response: ${readError instanceof Error ? readError.message : String(readError)}`
      );
      if (!response.ok) {
        return {
          query,
          results: [],
          totalResults: 0,
          related: [],
          error: classifyError({ status: response.status, message: raw.slice(0, 200) })
        };
      }
      const results = parseSearchResults(raw);
      return { query, results, totalResults: results.length, related: [] };
    } catch (error2) {
      return {
        query,
        results: [],
        totalResults: 0,
        related: [],
        error: classifyError(error2)
      };
    }
  }
};
function parseReaderContent(raw) {
  try {
    const parsed = JSON.parse(raw);
    const data = readRecord(parsed, "data");
    const content = readString(data, "content");
    if (content) return { content };
  } catch {
  }
  return { content: raw };
}
function emptyReaderResponse(statusCode, usageTokens) {
  return {
    content: "Jina returned an empty body",
    statusCode,
    credits: 0,
    usageTokens: Number.isFinite(usageTokens) ? usageTokens : void 0,
    error: {
      code: ErrorCode.UNSUPPORTED_BINARY_CONTENT,
      message: "Jina Reader returned empty content for this URL",
      retryable: false
    }
  };
}
function terminalReaderResponse(statusCode, content) {
  if (statusCode === 401 || statusCode === 403) {
    return {
      content: `Jina auth/quota error (${statusCode}): ${content.slice(0, 200)}`,
      statusCode,
      credits: 0,
      error: {
        code: statusCode === 401 ? ErrorCode.AUTH_ERROR : ErrorCode.QUOTA_EXCEEDED,
        message: statusCode === 401 ? "Jina Reader auth failed \u2014 check JINA_API_KEY" : "Jina Reader quota exceeded",
        retryable: false,
        statusCode
      }
    };
  }
  if (statusCode === 404) {
    return {
      content: "Jina could not fetch the target URL (404)",
      statusCode: 404,
      credits: 0,
      error: {
        code: ErrorCode.NOT_FOUND,
        message: "Target URL not reachable by Jina Reader",
        retryable: false,
        statusCode: 404
      }
    };
  }
  if (statusCode >= 400 && statusCode < 500 && statusCode !== 429) {
    return {
      content: `Jina Reader error (${statusCode}): ${content.slice(0, 200)}`,
      statusCode,
      credits: 0,
      error: {
        code: ErrorCode.INVALID_INPUT,
        message: `Jina Reader returned ${statusCode}`,
        retryable: false,
        statusCode
      }
    };
  }
  return null;
}
function isTerminalReaderError(error2) {
  return !error2.retryable && (error2.code === ErrorCode.AUTH_ERROR || error2.code === ErrorCode.QUOTA_EXCEEDED || error2.code === ErrorCode.NOT_FOUND || error2.code === ErrorCode.INVALID_INPUT);
}
function parseSearchResults(raw) {
  let data;
  try {
    const parsed = JSON.parse(raw);
    data = readUnknown(parsed, "data");
  } catch {
    data = parseMarkdownSearchResults(raw);
  }
  const items = Array.isArray(data) ? data : [];
  return items.map((item, index) => normalizeSearchItem(item, index)).filter((item) => item !== null).slice(0, SEARCH_RESULTS_PER_QUERY);
}
function normalizeSearchItem(item, index) {
  const link = readString(item, "url") ?? readString(item, "link");
  if (!link) return null;
  return {
    title: readString(item, "title") || link,
    link,
    snippet: (readString(item, "snippet") || readString(item, "description") || readString(item, "content") || "").slice(0, 500),
    date: readString(item, "date") ?? readString(item, "publishedTime"),
    position: index + 1
  };
}
function parseMarkdownSearchResults(raw) {
  const items = [];
  const markdownLink = /\[([^\]]+)\]\((https?:\/\/[^)]+)\)/g;
  let match;
  while ((match = markdownLink.exec(raw)) !== null && items.length < SEARCH_RESULTS_PER_QUERY) {
    const title = match[1];
    const url = match[2];
    if (title && url) items.push({ title, url });
  }
  return items;
}
function isRecord(value) {
  return typeof value === "object" && value !== null;
}
function readUnknown(value, key) {
  return isRecord(value) ? value[key] : void 0;
}
function readRecord(value, key) {
  const child = readUnknown(value, key);
  return isRecord(child) ? child : void 0;
}
function readString(value, key) {
  const child = readUnknown(value, key);
  return typeof child === "string" ? child : void 0;
}

// src/services/markdown-cleaner.ts
import { Logger as Logger3 } from "mcp-use";
import TurndownService from "turndown";
var logger = Logger3.get("markdown-cleaner");
var turndown = new TurndownService({
  headingStyle: "atx",
  codeBlockStyle: "fenced",
  bulletListMarker: "-"
});
turndown.remove(["script", "style", "nav", "footer", "aside", "noscript"]);
var MAX_CONTENT_LENGTH = 524288;
function removeHtmlComments(html) {
  const parts = [];
  let pos = 0;
  while (pos < html.length) {
    const start = html.indexOf("<!--", pos);
    if (start === -1) {
      parts.push(html.substring(pos));
      break;
    }
    if (start > pos) parts.push(html.substring(pos, start));
    const end = html.indexOf("-->", start + 4);
    if (end === -1) {
      parts.push(html.substring(start));
      break;
    }
    pos = end + 3;
  }
  return parts.join("");
}
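// Editorial note: removeHtmlComments is a linear indexOf scan rather than a
// regex, so it cannot backtrack pathologically on large pages. Behaviour:
//   removeHtmlComments("a<!-- x -->b")  // => "ab"
//   removeHtmlComments("a<!-- open")    // => "a<!-- open" (unterminated kept)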
var MarkdownCleaner = class {
  /**
   * Process HTML content and convert to clean Markdown
   * NEVER throws - returns original content on any error for graceful degradation
   */
  processContent(htmlContent) {
    try {
      if (!htmlContent || typeof htmlContent !== "string") {
        return htmlContent || "";
      }
      if (!htmlContent.includes("<")) {
        return htmlContent.trim();
      }
      if (htmlContent.length > MAX_CONTENT_LENGTH) {
        htmlContent = htmlContent.substring(0, MAX_CONTENT_LENGTH);
      }
      let content = removeHtmlComments(htmlContent);
      content = turndown.turndown(content);
      content = content.replace(/\n{3,}/g, "\n\n");
      content = content.trim();
      return content;
    } catch (error2) {
      logger.warn(
        `processContent failed: ${error2 instanceof Error ? error2.message : String(error2)} | Content length: ${htmlContent?.length ?? 0}`
      );
      return htmlContent || "";
    }
  }
};

// src/utils/markdown-formatter.ts
function removeMetaTags(content) {
  if (!content || typeof content !== "string") {
    return content;
  }
  const lines = content.split("\n");
  const filteredLines = lines.filter((line) => {
    const trimmed = line.trim();
    return !trimmed.startsWith("- Meta:") && !trimmed.startsWith("Meta:");
  });
  return filteredLines.join("\n");
}

// src/utils/content-extractor.ts
import { Readability } from "@mozilla/readability";
import { JSDOM, VirtualConsole } from "jsdom";
var MAX_READABILITY_BYTES = 15e5;
function extractReadableContent(html, url) {
  if (!html || typeof html !== "string") {
    return { title: "", content: html ?? "", extracted: false };
  }
  if (html.length > MAX_READABILITY_BYTES) {
    return { title: "", content: html, extracted: false };
  }
  if (!html.includes("<")) {
    return { title: "", content: html, extracted: false };
  }
  const virtualConsole = new VirtualConsole();
  virtualConsole.on("error", () => {
  });
  virtualConsole.on("warn", () => {
  });
  virtualConsole.on("jsdomError", () => {
  });
  let dom;
  try {
    dom = new JSDOM(html, {
      url: url && /^https?:/i.test(url) ? url : "https://example.com/",
      virtualConsole
    });
  } catch (err) {
    mcpLog("warning", `JSDOM construction failed: ${err instanceof Error ? err.message : String(err)}`, "content-extractor");
    return { title: "", content: html, extracted: false };
  }
  try {
    const reader = new Readability(dom.window.document, {
      // Keep classes that downstream cleanup may need; Turndown ignores them.
      keepClasses: false
      // Strip <script>/<style> already handled by Readability defaults.
    });
    const article = reader.parse();
    if (!article || !article.content) {
      return { title: article?.title ?? "", content: html, extracted: false };
    }
    return {
      title: article.title ?? "",
      content: article.content,
      byline: article.byline ?? void 0,
      extracted: true
    };
  } catch (err) {
    mcpLog("warning", `Readability.parse failed: ${err instanceof Error ? err.message : String(err)}`, "content-extractor");
    return { title: "", content: html, extracted: false };
  } finally {
    try {
      dom.window.close();
    } catch {
    }
  }
}

// src/utils/source-type.ts
var DOCUMENT_PATH_SUFFIXES = [
  ".pdf",
  ".doc",
  ".docx",
  ".ppt",
  ".pptx",
  ".xls",
  ".xlsx"
];
function isDocumentUrl(url) {
  let pathname;
  try {
    pathname = new URL(url).pathname.toLowerCase();
  } catch {
    return false;
  }
  for (const suffix of DOCUMENT_PATH_SUFFIXES) {
    if (pathname.endsWith(suffix)) return true;
  }
  return false;
}
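// Editorial note: matching is on the lowercased URL path only, so query
// strings and fragments do not defeat it (hypothetical URLs):
//   isDocumentUrl("https://example.com/report.PDF?dl=1")  // => true
//   isDocumentUrl("https://example.com/pdf-viewer")       // => false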

// src/utils/content-quality.ts
var MIN_MARKDOWN_CHARS = 800;
var MIN_MARKDOWN_WORDS = 120;
var BLOCK_PHRASES = [
  "access denied",
  "are you a human",
  "captcha",
  "checking your browser",
  "enable javascript",
  "just a moment",
  "login required",
  "please enable cookies",
  "please verify you are human",
  "sign in to continue",
  "temporarily unavailable"
];
function countWords(content) {
  const matches = content.trim().match(/\S+/g);
  return matches ? matches.length : 0;
}
function findBlockPhrase(content) {
  const lower = content.toLowerCase();
  return BLOCK_PHRASES.find((phrase) => lower.includes(phrase));
}
function assessMarkdownQuality(content) {
  const trimmed = content.trim();
  const charCount = trimmed.length;
  const wordCount = countWords(trimmed);
  const blockPhrase = findBlockPhrase(trimmed);
  if (blockPhrase) {
    return {
      weak: true,
      reason: `blocked_or_interstitial:${blockPhrase}`,
      charCount,
      wordCount,
      blockPhrase
    };
  }
  if (charCount < MIN_MARKDOWN_CHARS) {
    return {
      weak: true,
      reason: `too_few_chars:${charCount}<${MIN_MARKDOWN_CHARS}`,
      charCount,
      wordCount
    };
  }
  if (wordCount < MIN_MARKDOWN_WORDS) {
    return {
      weak: true,
      reason: `too_few_words:${wordCount}<${MIN_MARKDOWN_WORDS}`,
      charCount,
      wordCount
    };
  }
  return {
    weak: false,
    reason: "ok",
    charCount,
    wordCount
  };
}
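// Editorial note: the checks run in priority order: block phrase first, then
// the character floor, then the word floor. A 200-char page with no block
// phrase yields { weak: true, reason: "too_few_chars:200<800", ... }, while
// any "captcha" hit wins regardless of length.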

// src/effect/runtime.ts
import { Effect as Effect2, Layer as Layer2 } from "effect";

// src/effect/services.ts
import { Context, Effect, Layer } from "effect";

// src/utils/concurrency.ts
import pMapLib from "p-map";
async function pMap(items, mapper, concurrency = 6, signal) {
  if (items.length === 0) return [];
  const limit = Math.max(1, Math.min(concurrency, items.length));
  return pMapLib(items, mapper, { concurrency: limit, signal });
}
async function pMapSettled(items, mapper, concurrency = 6, signal) {
  if (items.length === 0) return [];
  const limit = Math.max(1, Math.min(concurrency, items.length));
  return pMapLib(
    items,
    async (item, index) => {
      try {
        const value = await mapper(item, index);
        return { status: "fulfilled", value };
      } catch (reason) {
        return { status: "rejected", reason };
      }
    },
    { concurrency: limit, signal, stopOnError: false }
  );
}
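// Editorial note: pMapSettled mirrors Promise.allSettled but with bounded
// concurrency; the mapper catches its own rejections, so one failure never
// cancels the batch. Each slot is { status: "fulfilled", value } or
// { status: "rejected", reason }, returned in input order.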

// src/clients/search.ts
var SERPER_API_URL = "https://google.serper.dev/search";
var DEFAULT_RESULTS_PER_QUERY = 10;
var MAX_RETRIES2 = 3;
var SEARCH_RETRY_CONFIG = {
  maxRetries: MAX_RETRIES2,
  baseDelayMs: 1e3,
  maxDelayMs: 1e4,
  timeoutMs: 3e4
};
var RETRYABLE_SEARCH_CODES = /* @__PURE__ */ new Set([429, 500, 502, 503, 504]);
var REDDIT_SITE_REGEX = /site:\s*reddit\.com/i;
var REDDIT_SUBREDDIT_SUFFIX_REGEX = / : r\/\w+$/;
var REDDIT_SUFFIX_REGEX = / - Reddit$/;
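// Editorial note: the two suffix regexes strip Google's Reddit title
// decorations, e.g. (hypothetical titles) "Best ORM for Postgres? : r/node"
// and "Best ORM for Postgres? - Reddit" both reduce to
// "Best ORM for Postgres?".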
function parseSearchResponses(responses, queries) {
  return responses.map((resp, index) => {
    try {
      const organic = resp.organic || [];
      const results = organic.map((item, idx) => ({
        title: item.title || "No title",
        link: item.link || "#",
        snippet: item.snippet || "",
        date: item.date,
        position: item.position || idx + 1
      }));
      const searchInfo = resp.searchInformation;
      const totalResults = searchInfo?.totalResults ? parseInt(String(searchInfo.totalResults).replace(/,/g, ""), 10) : results.length;
      const relatedSearches = resp.relatedSearches || [];
      const related = relatedSearches.map((r) => r.query || "");
      return { query: queries[index] || "", results, totalResults, related };
    } catch {
      return { query: queries[index] || "", results: [], totalResults: 0, related: [] };
    }
  });
}
async function executeSearchWithRetry(apiKey, body, isRetryable) {
  let lastError;
  for (let attempt = 0; attempt <= SEARCH_RETRY_CONFIG.maxRetries; attempt++) {
    try {
      if (attempt > 0) {
        mcpLog("warning", `Retry attempt ${attempt}/${SEARCH_RETRY_CONFIG.maxRetries}`, "search");
      }
      const response = await fetchWithTimeout(SERPER_API_URL, {
        method: "POST",
        headers: {
          "X-API-KEY": apiKey,
          "Content-Type": "application/json"
        },
        body: JSON.stringify(body),
        timeoutMs: SEARCH_RETRY_CONFIG.timeoutMs
      });
      if (!response.ok) {
        const errorText = await response.text().catch(() => "");
        lastError = classifyError({ status: response.status, message: errorText });
        if (isRetryable(response.status) && attempt < SEARCH_RETRY_CONFIG.maxRetries) {
          const delayMs = calculateBackoff2(attempt, SEARCH_RETRY_CONFIG.baseDelayMs, SEARCH_RETRY_CONFIG.maxDelayMs);
          mcpLog("warning", `API returned ${response.status}, retrying in ${delayMs}ms...`, "search");
          await sleep(delayMs);
          continue;
        }
        return { data: void 0, error: lastError };
      }
      try {
        const data = await response.json();
        return { data };
      } catch {
        return {
          data: void 0,
          error: { code: ErrorCode.PARSE_ERROR, message: "Failed to parse search response", retryable: false }
        };
      }
    } catch (error2) {
      lastError = classifyError(error2);
      if (isRetryable(void 0, error2) && attempt < SEARCH_RETRY_CONFIG.maxRetries) {
        const delayMs = calculateBackoff2(attempt, SEARCH_RETRY_CONFIG.baseDelayMs, SEARCH_RETRY_CONFIG.maxDelayMs);
        mcpLog("warning", `${lastError.code}: ${lastError.message}, retrying in ${delayMs}ms...`, "search");
        await sleep(delayMs);
        continue;
      }
      return { data: void 0, error: lastError };
    }
  }
  return {
    data: void 0,
    error: lastError || { code: ErrorCode.UNKNOWN_ERROR, message: "Search failed", retryable: false }
  };
}
var SearchClient = class {
  apiKey;
  constructor(apiKey) {
    const env = parseEnv();
    this.apiKey = apiKey || env.SEARCH_API_KEY || "";
    if (!this.apiKey) {
      throw new Error("Web search capability is not configured. Please set up the required API credentials.");
    }
  }
  /**
   * Check if error is retryable
   */
  isRetryable(status, error2) {
    if (status && RETRYABLE_SEARCH_CODES.has(status)) return true;
    if (error2 == null) return false;
    const message = typeof error2 === "object" && "message" in error2 && typeof error2.message === "string" ? error2.message.toLowerCase() : "";
    return message.includes("timeout") || message.includes("rate limit") || message.includes("connection");
  }
  /**
   * Search multiple queries in parallel
   * NEVER throws - always returns a valid response
   */
  async searchMultiple(queries) {
    const startTime = Date.now();
    if (queries.length === 0) {
      return {
        searches: [],
        totalQueries: 0,
        executionTime: 0,
        error: { code: ErrorCode.INVALID_INPUT, message: "No queries provided", retryable: false }
      };
    }
    const searchQueries = queries.map((query) => ({ q: query }));
    const { data, error: error2 } = await executeSearchWithRetry(
      this.apiKey,
      searchQueries,
      (status, err) => this.isRetryable(status, err)
    );
    if (error2 || data === void 0) {
      return {
        searches: [],
        totalQueries: queries.length,
        executionTime: Date.now() - startTime,
        error: error2 ?? { code: ErrorCode.UNKNOWN_ERROR, message: "Search provider returned no data", retryable: false }
      };
    }
    const responses = Array.isArray(data) ? data : [data];
    const searches = parseSearchResponses(responses, queries);
    return { searches, totalQueries: queries.length, executionTime: Date.now() - startTime };
  }
  /**
   * Search Reddit via Google (adds site:reddit.com automatically)
   * NEVER throws - returns empty array on failure
   */
  async searchReddit(query, dateAfter) {
    if (!query?.trim()) {
      return [];
    }
    let q = query.replace(REDDIT_SITE_REGEX, "").trim() + " site:reddit.com";
    if (dateAfter) {
      q += ` after:${dateAfter}`;
    }
    for (let attempt = 0; attempt <= SEARCH_RETRY_CONFIG.maxRetries; attempt++) {
      try {
        const res = await fetchWithTimeout(SERPER_API_URL, {
          method: "POST",
          headers: { "X-API-KEY": this.apiKey, "Content-Type": "application/json" },
          body: JSON.stringify({ q, num: DEFAULT_RESULTS_PER_QUERY }),
          timeoutMs: SEARCH_RETRY_CONFIG.timeoutMs
        });
        if (!res.ok) {
          if (this.isRetryable(res.status) && attempt < SEARCH_RETRY_CONFIG.maxRetries) {
            const delayMs = calculateBackoff2(attempt, SEARCH_RETRY_CONFIG.baseDelayMs, SEARCH_RETRY_CONFIG.maxDelayMs);
            mcpLog("warning", `Reddit search ${res.status}, retrying in ${delayMs}ms...`, "search");
            await sleep(delayMs);
            continue;
          }
          mcpLog("error", `Reddit search failed with status ${res.status}`, "search");
          return [];
        }
        const data = await res.json();
        return (data.organic || []).map((r) => ({
          title: (r.title || "").replace(REDDIT_SUBREDDIT_SUFFIX_REGEX, "").replace(REDDIT_SUFFIX_REGEX, ""),
          url: r.link || "",
          snippet: r.snippet || "",
          date: r.date
        }));
      } catch (error2) {
        const err = classifyError(error2);
        if (this.isRetryable(void 0, error2) && attempt < SEARCH_RETRY_CONFIG.maxRetries) {
          const delayMs = calculateBackoff2(attempt, SEARCH_RETRY_CONFIG.baseDelayMs, SEARCH_RETRY_CONFIG.maxDelayMs);
          mcpLog("warning", `Reddit search ${err.code}, retrying in ${delayMs}ms...`, "search");
          await sleep(delayMs);
          continue;
        }
        mcpLog("error", `Reddit search failed: ${err.message}`, "search");
        return [];
      }
    }
    return [];
  }
  /**
   * Search Reddit with multiple queries (bounded concurrency)
   * NEVER throws - searchReddit never throws, pMap preserves order
   */
  async searchRedditMultiple(queries, dateAfter) {
    if (queries.length === 0) {
      return /* @__PURE__ */ new Map();
    }
    const results = await pMap(
      queries,
      (q) => this.searchReddit(q, dateAfter),
      CONCURRENCY.SEARCH
    );
    return new Map(queries.map((q, i) => [q, results[i] || []]));
  }
};
|
|
2044
|
+
|
|
2045
|
+
// src/clients/kernel.ts
|
|
2046
|
+
import Kernel from "@onkernel/sdk";
|
|
2047
|
+
var DEFAULT_RENDER_TIMEOUT_SECONDS = 15;
|
|
2048
|
+
var BROWSER_IDLE_TIMEOUT_SECONDS = 300;
|
|
2049
|
+
var KernelClient = class {
|
|
2050
|
+
kernel;
|
|
2051
|
+
constructor(apiKey) {
|
|
2052
|
+
const env = parseEnv();
|
|
2053
|
+
const resolvedKey = apiKey?.trim() || env.KERNEL_API_KEY;
|
|
2054
|
+
if (!resolvedKey) {
|
|
2055
|
+
throw new Error("Kernel browser rendering is not configured. Set KERNEL_API_KEY.");
|
|
2056
|
+
}
|
|
2057
|
+
this.kernel = new Kernel({
|
|
2058
|
+
apiKey: resolvedKey,
|
|
2059
|
+
timeout: 3e4,
|
|
2060
|
+
maxRetries: 1,
|
|
2061
|
+
...env.KERNEL_PROJECT ? { defaultHeaders: { "X-Kernel-Project-Id": env.KERNEL_PROJECT } } : {}
|
|
2062
|
+
});
|
|
2063
|
+
}
|
|
2064
|
+
async render(request) {
|
|
2065
|
+
const { url, timeoutSeconds = DEFAULT_RENDER_TIMEOUT_SECONDS } = request;
|
|
2066
|
+
try {
|
|
2067
|
+
new URL(url);
|
|
2068
|
+
} catch {
|
|
2069
|
+
return {
|
|
2070
|
+
content: `Invalid URL: ${url}`,
|
|
2071
|
+
statusCode: 400,
|
|
2072
|
+
credits: 0,
|
|
2073
|
+
error: { code: ErrorCode.INVALID_INPUT, message: `Invalid URL: ${url}`, retryable: false }
|
|
2074
|
+
};
|
|
2075
|
+
}
|
|
2076
|
+
let sessionId;
|
|
2077
|
+
try {
|
|
2078
|
+
const session = await this.kernel.browsers.create({
|
|
2079
|
+
headless: true,
|
|
2080
|
+
stealth: true,
|
|
2081
|
+
timeout_seconds: BROWSER_IDLE_TIMEOUT_SECONDS,
|
|
2082
|
+
viewport: { width: 1280, height: 800 }
|
|
2083
|
+
});
|
|
2084
|
+
sessionId = session.session_id;
|
|
2085
|
+
const response = await this.kernel.browsers.playwright.execute(session.session_id, {
|
|
2086
|
+
code: buildRenderScript(url, timeoutSeconds),
|
|
2087
|
+
timeout_sec: Math.min(timeoutSeconds + 5, 300)
|
|
2088
|
+
});
|
|
2089
|
+
if (!response.success) {
|
|
2090
|
+
const message = response.error || response.stderr || "Kernel Playwright execution failed";
|
|
2091
|
+
return {
|
|
2092
|
+
content: `Kernel render failed: ${message}`,
|
|
2093
|
+
statusCode: 500,
|
|
2094
|
+
credits: 0,
|
|
2095
|
+
error: { code: ErrorCode.SERVICE_UNAVAILABLE, message, retryable: true }
|
|
2096
|
+
};
|
|
2097
|
+
}
|
|
2098
|
+
const rendered = parseRenderedPage(response.result);
|
|
2099
|
+
if (!rendered) {
|
|
2100
|
+
return {
|
|
2101
|
+
content: "Kernel render returned an invalid payload",
|
|
2102
|
+
statusCode: 500,
|
|
2103
|
+
credits: 0,
|
|
2104
|
+
error: {
|
|
2105
|
+
code: ErrorCode.PARSE_ERROR,
|
|
2106
|
+
message: "Kernel render returned an invalid payload",
|
|
2107
|
+
retryable: false
|
|
2108
|
+
}
|
|
2109
|
+
};
|
|
2110
|
+
}
|
|
2111
|
+
return {
|
|
2112
|
+
content: rendered.html,
|
|
2113
|
+
statusCode: 200,
|
|
2114
|
+
credits: 0,
|
|
2115
|
+
finalUrl: rendered.finalUrl,
|
|
2116
|
+
title: rendered.title,
|
|
2117
|
+
text: rendered.text
|
|
2118
|
+
};
|
|
2119
|
+
} catch (error2) {
|
|
2120
|
+
const err = formatKernelError(error2);
|
|
2121
|
+
return {
|
|
2122
|
+
content: `Kernel render failed: ${err.message}`,
|
|
2123
|
+
statusCode: err.statusCode ?? 500,
|
|
2124
|
+
credits: 0,
|
|
2125
|
+
error: err
|
|
2126
|
+
};
|
|
2127
|
+
} finally {
|
|
2128
|
+
if (sessionId) {
|
|
2129
|
+
try {
|
|
2130
|
+
await this.kernel.browsers.deleteByID(sessionId);
|
|
2131
|
+
} catch (deleteError) {
|
|
2132
|
+
const err = formatKernelError(deleteError);
|
|
2133
|
+
mcpLog("warning", `Kernel browser cleanup failed for ${sessionId}: ${err.message}`, "kernel");
|
|
2134
|
+
}
|
|
2135
|
+
}
|
|
2136
|
+
}
|
|
2137
|
+
}
|
|
2138
|
+
};
function buildRenderScript(url, timeoutSeconds) {
  const timeoutMs = timeoutSeconds * 1e3;
  return `
const targetUrl = ${JSON.stringify(url)};
const timeoutMs = ${timeoutMs};
await page.goto(targetUrl, { waitUntil: 'domcontentloaded', timeout: timeoutMs });
await page.waitForLoadState('networkidle', { timeout: Math.min(5000, timeoutMs) }).catch(() => {});
const html = await page.content();
const text = await page.locator('body').innerText({ timeout: 2000 }).catch(() => '');
return {
  html,
  text,
  title: await page.title(),
  finalUrl: page.url(),
};
`;
}
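// For url = "https://example.com" and timeoutSeconds = 15, the generated script
// navigates with a 15000 ms goto timeout, waits up to 5000 ms for network idle, and
// returns { html, text, title, finalUrl }. The script body references `page` without
// declaring it, so it appears to rely on Kernel's Playwright executor providing a
// `page` binding in scope (an inference from how execute() is called above).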
function parseRenderedPage(value) {
  if (!isRecord2(value)) return null;
  const html = readString2(value, "html");
  const finalUrl = readString2(value, "finalUrl");
  const title = readString2(value, "title");
  const text = readString2(value, "text");
  if (!html || !finalUrl || title === void 0 || text === void 0) return null;
  return { html, finalUrl, title, text };
}
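// Validation is asymmetric by design: html and finalUrl must be non-empty strings,
// while title and text only have to be strings, so an empty title ("") passes.
// E.g. { html: "<p>hi</p>", finalUrl: "https://a.io", title: "", text: "" } is
// accepted, but { html: "", finalUrl: "https://a.io", title: "x", text: "y" } is not.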
function formatKernelError(error2) {
  if (error2 instanceof Kernel.APIError) {
    return {
      ...classifyError({ status: error2.status, message: error2.message }),
      cause: error2.message
    };
  }
  return classifyError(error2);
}
function isRecord2(value) {
  return typeof value === "object" && value !== null;
}
function readString2(value, key) {
  const child = value[key];
  return typeof child === "string" ? child : void 0;
}

// src/clients/reddit.ts
import { Logger as Logger4 } from "mcp-use";
var REDDIT_TOKEN_URL = "https://www.reddit.com/api/v1/access_token";
var REDDIT_API_BASE = "https://oauth.reddit.com";
var TOKEN_EXPIRY_MS = 55e3;
var FETCH_LIMIT = 500;
var cachedToken = null;
var cachedTokenExpiry = 0;
var DEBUG_TOKEN_CACHE = process.env.DEBUG_REDDIT === "true";
var clientLogger = Logger4.get("reddit-client");
var pendingAuthPromise = null;
async function fetchRedditJson(sub, id, token, userAgent) {
  const limit = Math.min(FETCH_LIMIT, 500);
  const apiUrl = `${REDDIT_API_BASE}/r/${sub}/comments/${id}?sort=top&limit=${limit}&depth=10&raw_json=1`;
  const res = await fetchWithTimeout(apiUrl, {
    headers: {
      "Authorization": `Bearer ${token}`,
      "User-Agent": userAgent
    },
    timeoutMs: 3e4
  });
  if (res.status === 429) {
    const err = new Error("Rate limited by Reddit API");
    err.status = 429;
    throw err;
  }
  if (res.status === 404) {
    throw new Error(`Post not found: /r/${sub}/comments/${id}`);
  }
  if (!res.ok) {
    const err = new Error(`Reddit API error: ${res.status}`);
    err.status = res.status;
    throw err;
  }
  try {
    return await res.json();
  } catch {
    throw new Error("Failed to parse Reddit API response");
  }
}
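// For sub = "typescript" and id = "1abcd2" (hypothetical values), the request above hits
// https://oauth.reddit.com/r/typescript/comments/1abcd2?sort=top&limit=500&depth=10&raw_json=1
// with a bearer token. raw_json=1 asks Reddit not to HTML-escape body text, and the
// endpoint returns a two-element array, [postListing, commentListing], which is how
// getPost() below destructures it.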
function parsePostData(postListing, sub) {
  const p = postListing?.data?.children?.[0]?.data;
  if (!p) {
    throw new Error(`Post data not found in response for /r/${sub}`);
  }
  return {
    title: p.title || "Untitled",
    author: p.author || "[deleted]",
    subreddit: p.subreddit || sub,
    body: formatBody(p),
    score: p.score || 0,
    commentCount: p.num_comments || 0,
    url: `https://reddit.com${p.permalink || ""}`,
    created: new Date((p.created_utc || 0) * 1e3),
    flair: p.link_flair_text || void 0,
    isNsfw: p.over_18 || false,
    isPinned: p.stickied || false
  };
}
function formatBody(p) {
  if (p.selftext?.trim()) return p.selftext;
  if (p.is_self) return "";
  if (p.url) return `**Link:** ${p.url}`;
  return "";
}
var MAX_COMMENT_DEPTH = 15;
function parseCommentTree(commentListing, opAuthor) {
  const result = [];
  const extract = (items, depth = 0) => {
    if (depth > MAX_COMMENT_DEPTH) return;
    const sorted = [...items].sort((a, b) => (b.data?.score || 0) - (a.data?.score || 0));
    for (const c of sorted) {
      if (c.kind !== "t1" || !c.data?.author || c.data.author === "[deleted]") continue;
      result.push({
        author: c.data.author,
        body: c.data.body || "",
        score: c.data.score || 0,
        depth,
        isOP: c.data.author === opAuthor
      });
      if (typeof c.data.replies === "object" && c.data.replies?.data?.children) {
        extract(c.data.replies.data.children, depth + 1);
      }
    }
  };
  extract(commentListing?.data?.children || []);
  return result;
}
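// parseCommentTree flattens the reply tree via a score-sorted depth-first walk:
// siblings at each level are visited highest-score first, each comment records its
// depth, and recursion stops past MAX_COMMENT_DEPTH (15). Deleted authors and
// non-"t1" nodes are skipped. So a top comment scored 50 with one reply is emitted
// as [{ depth: 0, score: 50, ... }, { depth: 1, ... }] before any lower-scored
// top-level sibling.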
async function processBatch(client, batchUrls) {
  const results = /* @__PURE__ */ new Map();
  let rateLimitHits = 0;
  const batchResults = await pMapSettled(
    batchUrls,
    (url) => client.getPost(url),
    CONCURRENCY.REDDIT
  );
  for (let i = 0; i < batchResults.length; i++) {
    const result = batchResults[i];
    if (!result) continue;
    const url = batchUrls[i] ?? "";
    if (result.status === "fulfilled") {
      results.set(url, result.value);
    } else {
      const errorMsg = result.reason?.message || String(result.reason);
      if (errorMsg.includes("429") || errorMsg.includes("rate")) rateLimitHits++;
      results.set(url, new Error(errorMsg));
    }
  }
  return { results, rateLimitHits };
}
var RedditClient = class {
  constructor(clientId, clientSecret) {
    this.clientId = clientId;
    this.clientSecret = clientSecret;
  }
  userAgent = `script:${USER_AGENT_VERSION} (by /u/research-powerpack)`;
  /**
   * Authenticate with Reddit API with retry logic
   * Uses module-level token cache and promise deduplication to prevent
   * concurrent auth calls from firing multiple token requests
   * Returns null on failure instead of throwing
   */
  async auth() {
    if (cachedToken && Date.now() < cachedTokenExpiry - TOKEN_EXPIRY_MS) {
      if (DEBUG_TOKEN_CACHE) clientLogger.debug("Token cache HIT");
      return cachedToken;
    }
    if (pendingAuthPromise) {
      if (DEBUG_TOKEN_CACHE) clientLogger.debug("Auth already in flight, awaiting...");
      return pendingAuthPromise;
    }
    pendingAuthPromise = this.performAuth();
    try {
      return await pendingAuthPromise;
    } finally {
      pendingAuthPromise = null;
    }
  }
  async performAuth() {
    if (DEBUG_TOKEN_CACHE) clientLogger.debug("Token cache MISS - authenticating");
    const credentials = Buffer.from(`${this.clientId}:${this.clientSecret}`).toString("base64");
    for (let attempt = 0; attempt < 3; attempt++) {
      try {
        const res = await fetchWithTimeout(REDDIT_TOKEN_URL, {
          method: "POST",
          headers: {
            "Authorization": `Basic ${credentials}`,
            "Content-Type": "application/x-www-form-urlencoded",
            "User-Agent": this.userAgent
          },
          body: "grant_type=client_credentials",
          timeoutMs: 15e3
        });
        if (!res.ok) {
          const text = await res.text().catch(() => "");
          mcpLog("error", `Auth failed (${res.status}): ${text}`, "reddit");
          if (res.status === 401 || res.status === 403) {
            cachedToken = null;
            cachedTokenExpiry = 0;
            return null;
          }
          if (res.status >= 500 && attempt < 2) {
            await sleep(calculateBackoff2(attempt));
            continue;
          }
          return null;
        }
        const data = await res.json();
        if (!data.access_token) {
          mcpLog("error", "Auth response missing access_token", "reddit");
          return null;
        }
        cachedToken = data.access_token;
        cachedTokenExpiry = Date.now() + (data.expires_in || 3600) * 1e3;
        return cachedToken;
      } catch (error2) {
        const err = classifyError(error2);
        mcpLog("error", `Auth error (attempt ${attempt + 1}): ${err.message}`, "reddit");
        if (err.code === ErrorCode.AUTH_ERROR) {
          cachedToken = null;
          cachedTokenExpiry = 0;
        }
        if (attempt < 2 && err.retryable) {
          await sleep(calculateBackoff2(attempt));
          continue;
        }
        return null;
      }
    }
    return null;
  }
  parseUrl(url) {
    const m = url.match(/reddit\.com\/r\/([^\/]+)\/comments\/([a-z0-9]+)/i);
    return m ? { sub: m[1], id: m[2] } : null;
  }
  /**
   * Get a single Reddit post with comments
   * Returns PostResult or throws Error (for use with Promise.allSettled)
   */
  async getPost(url) {
    const parsed = this.parseUrl(url);
    if (!parsed) {
      throw new Error(`Invalid Reddit URL format: ${url}`);
    }
    const token = await this.auth();
    if (!token) {
      throw new Error("Reddit authentication failed - check credentials");
    }
    let lastError = null;
    for (let attempt = 0; attempt < REDDIT.RETRY_COUNT; attempt++) {
      try {
        const data = await fetchRedditJson(parsed.sub, parsed.id, token, this.userAgent);
        const [postListing, commentListing] = data;
        const post = parsePostData(postListing, parsed.sub);
        const comments = parseCommentTree(commentListing, post.author);
        return { post, comments, actualComments: post.commentCount };
      } catch (error2) {
        lastError = classifyError(error2);
        const status = error2.status;
        if (status === 429) {
          const delay = REDDIT.RETRY_DELAYS[attempt] || 32e3;
          mcpLog("warning", `Rate limited. Retry ${attempt + 1}/${REDDIT.RETRY_COUNT} after ${delay}ms`, "reddit");
          await sleep(delay);
          continue;
        }
        if (!lastError.retryable) {
          throw error2 instanceof Error ? error2 : new Error(lastError.message);
        }
        if (attempt < REDDIT.RETRY_COUNT - 1) {
          const delay = REDDIT.RETRY_DELAYS[attempt] || 2e3;
          mcpLog("warning", `${lastError.code}: ${lastError.message}. Retry ${attempt + 1}/${REDDIT.RETRY_COUNT}`, "reddit");
          await sleep(delay);
        }
      }
    }
    throw new Error(lastError?.message || "Failed to fetch Reddit post after retries");
  }
  async batchGetPosts(urls, fetchComments = true, onBatchComplete) {
    const allResults = /* @__PURE__ */ new Map();
    let rateLimitHits = 0;
    const totalBatches = Math.ceil(urls.length / REDDIT.BATCH_SIZE);
    mcpLog("info", `Fetching ${urls.length} posts in ${totalBatches} batch(es), up to ${FETCH_LIMIT} comments/post`, "reddit");
    for (let batchNum = 0; batchNum < totalBatches; batchNum++) {
      const startIdx = batchNum * REDDIT.BATCH_SIZE;
      const batchUrls = urls.slice(startIdx, startIdx + REDDIT.BATCH_SIZE);
      mcpLog("info", `Batch ${batchNum + 1}/${totalBatches} (${batchUrls.length} posts)`, "reddit");
      const batchResult = await processBatch(this, batchUrls);
      for (const [url, result] of batchResult.results) {
        allResults.set(url, result);
      }
      rateLimitHits += batchResult.rateLimitHits;
      try {
        onBatchComplete?.(batchNum + 1, totalBatches, allResults.size);
      } catch (callbackError) {
        mcpLog("error", `onBatchComplete callback error: ${callbackError}`, "reddit");
      }
      mcpLog("info", `Batch ${batchNum + 1} complete (${allResults.size}/${urls.length})`, "reddit");
      if (batchNum < totalBatches - 1) {
        await sleep(500);
      }
    }
    return { results: allResults, batchesProcessed: totalBatches, totalPosts: urls.length, rateLimitHits };
  }
};
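// Usage sketch (illustrative): batchGetPosts drives the full pipeline, OAuth with
// module-level token caching and in-flight deduplication, then batched, rate-limit
// aware fetches:
//   const reddit = new RedditClient(clientId, clientSecret);
//   const { results, rateLimitHits } = await reddit.batchGetPosts(permalinks);
//   for (const [url, r] of results) if (!(r instanceof Error)) console.log(url, r.post.title);
// Because cachedToken and pendingAuthPromise live at module scope, every RedditClient
// instance in the process shares one token and at most one auth request in flight.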

// src/effect/errors.ts
import { Data } from "effect";
var ProviderRequestError = class extends Data.TaggedError("ProviderRequestError") {
};
var ProviderTimeoutError = class extends Data.TaggedError("ProviderTimeoutError") {
};
var WeakContentError = class extends Data.TaggedError("WeakContentError") {
};
var AllStrategiesExhaustedError = class extends Data.TaggedError("AllStrategiesExhaustedError") {
};
function providerError(provider, operation, error2) {
  return new ProviderRequestError({
    provider,
    operation,
    error: classifyError(error2)
  });
}

// src/effect/services.ts
var SearchService = Context.GenericTag("SearchService");
var JinaService = Context.GenericTag("JinaService");
var KernelService = Context.GenericTag("KernelService");
var RedditService = Context.GenericTag("RedditService");
var LlmService = Context.GenericTag("LlmService");
var SearchServiceLive = Layer.sync(SearchService, () => {
  const env = parseEnv();
  return {
    serperSearchMultiple: (queries) => Effect.tryPromise({
      try: () => new SearchClient(env.SEARCH_API_KEY).searchMultiple([...queries]),
      catch: (error2) => providerError("serper", "searchMultiple", error2)
    }),
    jinaSearchMultiple: (queries) => Effect.tryPromise({
      try: () => new JinaClient(env.JINA_API_KEY).searchMultiple([...queries]),
      catch: (error2) => providerError("jina", "searchMultiple", error2)
    })
  };
});
var JinaServiceLive = Layer.sync(JinaService, () => {
  const env = parseEnv();
  const client = new JinaClient(env.JINA_API_KEY);
  return {
    convert: (request) => Effect.tryPromise({
      try: () => client.convert(request),
      catch: (error2) => providerError("jina", "convert", error2)
    })
  };
});
var KernelServiceLive = Layer.sync(KernelService, () => {
  const client = new KernelClient();
  return {
    render: (request) => Effect.tryPromise({
      try: () => client.render(request),
      catch: (error2) => providerError("kernel", "render", error2)
    })
  };
});
function RedditServiceLive(clientId, clientSecret) {
  return Layer.sync(RedditService, () => {
    const client = new RedditClient(clientId, clientSecret);
    return {
      batchGetPosts: (urls, includeComments) => Effect.tryPromise({
        try: () => client.batchGetPosts([...urls], includeComments),
        catch: (error2) => providerError("reddit", "batchGetPosts", error2)
      })
    };
  });
}
var LlmServiceLive = Layer.succeed(LlmService, {
  extractContent: (content, config) => Effect.tryPromise({
    try: () => processContentWithLLM(content, config, createLLMProcessor()),
    catch: (error2) => providerError("llm", "extractContent", error2)
  }),
  classifySearchResults: (rankedUrls, objective, totalQueries, processor, previousQueries) => Effect.tryPromise({
    try: () => classifySearchResults(rankedUrls, objective, totalQueries, processor, previousQueries),
    catch: (error2) => providerError("llm", "classifySearchResults", error2)
  }),
  suggestRefineQueriesForRawMode: (rankedUrls, objective, originalQueries, processor) => Effect.tryPromise({
    try: () => suggestRefineQueriesForRawMode(rankedUrls, objective, originalQueries, processor),
    catch: (error2) => providerError("llm", "suggestRefineQueriesForRawMode", error2)
  }),
  generateResearchBrief: (goal, processor) => Effect.tryPromise({
    try: () => generateResearchBrief(goal, processor),
    catch: (error2) => providerError("llm", "generateResearchBrief", error2)
  })
});

// src/effect/runtime.ts
var BaseExternalLive = Layer2.mergeAll(
  JinaServiceLive,
  SearchServiceLive,
  LlmServiceLive
);
function runExternalEffect(effect, layer) {
  return Effect2.runPromise(effect.pipe(Effect2.provide(layer)));
}
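// Illustrative pattern (not new behavior): tools build an Effect program against a
// service tag, then hand it to runExternalEffect together with the Layer that
// supplies the live implementation:
//   await runExternalEffect(
//     Effect2.gen(function* () {
//       const jina = yield* JinaService;
//       return yield* jina.convert({ url: "https://example.com" });
//     }),
//     JinaServiceLive
//   );
// Each provider call is wrapped in Effect.tryPromise above, so failures surface as
// tagged ProviderRequestError values instead of raw promise rejections.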

// src/utils/response.ts
var SECONDS_MS = 1e3;
var MINUTES_MS = 6e4;
function formatSuccess(opts) {
  const parts = [];
  parts.push(`\u2713 ${opts.title}`);
  parts.push("");
  parts.push(opts.summary);
  if (opts.data) {
    parts.push("");
    parts.push("---");
    parts.push(opts.data);
  }
  if (opts.nextSteps?.length) {
    parts.push("");
    parts.push("---");
    parts.push("**Next Steps:**");
    opts.nextSteps.forEach((step) => parts.push(`\u2192 ${step}`));
  }
  if (opts.metadata && Object.keys(opts.metadata).length > 0) {
    parts.push("");
    parts.push("---");
    const metaStr = Object.entries(opts.metadata).map(([k, v]) => `${k}: ${v}`).join(" | ");
    parts.push(`*${metaStr}*`);
  }
  return parts.join("\n");
}
function formatError(opts) {
  const parts = [];
  const prefix = opts.toolName ? `[${opts.toolName}] ` : "";
  parts.push(`\u274C ${prefix}${opts.code}: ${opts.message}`);
  if (opts.retryable) {
    parts.push("*Retryable.*");
  }
  if (opts.howToFix?.length) {
    parts.push("");
    parts.push("**How to Fix:**");
    opts.howToFix.forEach((step, i) => parts.push(`${i + 1}. ${step}`));
  }
  if (opts.alternatives?.length) {
    parts.push("");
    parts.push("**Alternatives:**");
    opts.alternatives.forEach((alt, i) => parts.push(`${i + 1}. ${alt}`));
  }
  return parts.join("\n");
}
function formatBatchHeader(opts) {
  const parts = [];
  const successRate = opts.totalItems > 0 ? opts.successful / opts.totalItems : 0;
  const emoji = successRate === 1 ? "\u2713" : successRate >= 0.5 ? "\u26A0\uFE0F" : "\u274C";
  parts.push(`${emoji} ${opts.title}`);
  parts.push("");
  parts.push(`\u2022 Total: ${opts.totalItems}`);
  parts.push(`\u2022 Successful: ${opts.successful}`);
  if (opts.failed > 0) {
    parts.push(`\u2022 Failed: ${opts.failed}`);
  }
  if (opts.tokensPerItem) {
    parts.push(`\u2022 Tokens/item: ~${opts.tokensPerItem.toLocaleString()}`);
  }
  if (opts.batches) {
    parts.push(`\u2022 Batches: ${opts.batches}`);
  }
  if (opts.extras) {
    Object.entries(opts.extras).forEach(([key, val]) => {
      parts.push(`\u2022 ${key}: ${val}`);
    });
  }
  return parts.join("\n");
}
function formatDuration(ms) {
  if (ms < SECONDS_MS) return `${ms}ms`;
  if (ms < MINUTES_MS) return `${(ms / SECONDS_MS).toFixed(1)}s`;
  return `${(ms / MINUTES_MS).toFixed(1)}m`;
}
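// Worked examples: formatDuration(850) -> "850ms", formatDuration(1234) -> "1.2s",
// formatDuration(90000) -> "1.5m". The unit boundaries come from SECONDS_MS (1000)
// and MINUTES_MS (60000) above.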

// src/tools/mcp-helpers.ts
import { error, markdown } from "mcp-use/server";
var NOOP_REPORTER = {
  async log() {
  },
  async progress() {
  }
};
function toolSuccess(content, structuredContent) {
  return {
    isError: false,
    content,
    structuredContent
  };
}
function toolFailure(content) {
  return {
    isError: true,
    content
  };
}
function createToolReporter(ctx, loggerName) {
  return {
    log(level, message) {
      return ctx.log(level, message, loggerName);
    },
    progress(loaded, total, message) {
      return ctx.reportProgress?.(loaded, total, message) ?? Promise.resolve();
    }
  };
}
function toToolResponse(result) {
  if (result.isError) {
    return error(result.content);
  }
  if (result.structuredContent) {
    return {
      ...markdown(result.content),
      structuredContent: result.structuredContent
    };
  }
  return markdown(result.content);
}

// src/tools/scrape.ts
var markdownCleaner = new MarkdownCleaner();
function formatInputValidationError(toolName, issues) {
  const details = issues.map((issue) => {
    const path = issue.path.length > 0 ? issue.path.map(String).join(".") : "<root>";
    return `- ${path}: ${issue.message}`;
  }).join("\n");
  return `Invalid ${toolName} input.

${details}`;
}
function enhanceExtractionInstruction(instruction) {
  const base = instruction || "Extract the main content and key information from this page.";
  return `${SCRAPER.EXTRACTION_PREFIX}

${base}

${SCRAPER.EXTRACTION_SUFFIX}`;
}
function cleanFetchedContent(rawContent, url) {
  try {
    const readable = extractReadableContent(rawContent, url);
    const sourceForCleaner = readable.extracted ? readable.content : rawContent;
    return markdownCleaner.processContent(sourceForCleaner);
  } catch {
    return rawContent;
  }
}
function effectErrorMessage(error2) {
  if (typeof error2 === "object" && error2 !== null) {
    if ("error" in error2) {
      const structured = error2.error;
      if (typeof structured?.message === "string") return structured.message;
    }
    if ("message" in error2 && typeof error2.message === "string") {
      return error2.message;
    }
    if ("_tag" in error2 && typeof error2._tag === "string") {
      return error2._tag;
    }
  }
  return String(error2);
}
var REDDIT_HOST = /(?:^|\.)reddit\.com$/i;
var REDDIT_POST_PERMALINK = /\/r\/[^/]+\/comments\/[a-z0-9]+/i;
function isRedditUrl(url) {
  try {
    const u = new URL(url);
    return REDDIT_HOST.test(u.hostname);
  } catch {
    return false;
  }
}
function isRedditPostPermalink(url) {
  try {
    const u = new URL(url);
    return REDDIT_HOST.test(u.hostname) && REDDIT_POST_PERMALINK.test(u.pathname);
  } catch {
    return false;
  }
}
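// Examples of the two predicates above (URLs hypothetical):
//   isRedditUrl("https://www.reddit.com/r/rust/")                               -> true
//   isRedditPostPermalink("https://www.reddit.com/r/rust/comments/1abc/title")  -> true
//   isRedditPostPermalink("https://www.reddit.com/r/rust/")                     -> false
//   isRedditUrl("https://notreddit.com/r/rust/comments/1abc")                   -> false
// REDDIT_HOST anchors on the hostname suffix, so old.reddit.com matches but
// notreddit.com does not.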
function createScrapeErrorResponse(code, message, startTime, toolName = "raw-scrape-links", retryable = false, alternatives) {
  return toolFailure(
    `${formatError({
      code,
      message,
      retryable,
      toolName,
      howToFix: code === "NO_URLS" ? ["Provide at least one valid URL"] : void 0,
      alternatives
    })}

Execution time: ${formatDuration(Date.now() - startTime)}`
  );
}
function partitionUrls(urls) {
  const webInputs = [];
  const redditInputs = [];
  const documentInputs = [];
  const invalidEntries = [];
  for (let i = 0; i < urls.length; i++) {
    const url = urls[i];
    try {
      new URL(url);
    } catch {
      invalidEntries.push({ url, origIndex: i });
      continue;
    }
    if (isDocumentUrl(url)) {
      documentInputs.push({ url, origIndex: i });
    } else if (isRedditUrl(url)) {
      redditInputs.push({ url, origIndex: i });
    } else {
      webInputs.push({ url, origIndex: i });
    }
  }
  return { webInputs, redditInputs, documentInputs, invalidEntries };
}
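// Example (assuming isDocumentUrl, defined elsewhere in this bundle, treats .pdf as a
// document): partitionUrls(["https://a.io/x.pdf", "https://reddit.com/r/a/comments/1b/t",
// "https://a.io", "not a url"]) yields documentInputs=[0], redditInputs=[1],
// webInputs=[2], invalidEntries=[3]. Each entry carries its origIndex so merged
// results can later be re-sorted into the caller's original URL order.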
function formatJinaFailure(url, jinaError, proxyError) {
  if (proxyError) {
    return `## ${url}

\u274C Jina Reader failed. Direct: ${jinaError}. Scrape.do proxy: ${proxyError}.`;
  }
  return `## ${url}

\u274C Document conversion failed: ${jinaError}`;
}
function jinaResultError(result) {
  if (result.error || result.statusCode < 200 || result.statusCode >= 300) {
    return result.error?.message || `HTTP ${result.statusCode}`;
  }
  const quality = assessMarkdownQuality(result.content);
  if (quality.weak) {
    return `Weak Jina markdown (${quality.reason})`;
  }
  return null;
}
function canTryKernel(input, directError) {
  if (isDocumentUrl(input.url)) return false;
  if (!directError) return true;
  if (directError.code === ErrorCode.NOT_FOUND || directError.code === ErrorCode.INVALID_INPUT) {
    return false;
  }
  return true;
}
function formatJinaFirstFailure(input) {
  if (input.proxyError) {
    return formatJinaFailure(input.url, input.directError ?? "Unknown direct failure", input.proxyError);
  }
  return formatJinaFailure(input.url, input.directError ?? "Unknown direct failure");
}
async function fetchJinaFirstBranch(inputs, kernelEnabled, scrapeDoProxyUrl) {
  if (inputs.length === 0) {
    return { successItems: [], failedContents: [], metrics: { successful: 0, failed: 0, totalCredits: 0 } };
  }
  mcpLog(
    "info",
    `[concurrency] jina direct branch: converting ${inputs.length} URL(s) with limit=${CONCURRENCY.JINA_READER}`,
    "scrape"
  );
  const directResults = await runExternalEffect(
    Effect3.gen(function* () {
      const jina = yield* JinaService;
      return yield* Effect3.forEach(
        inputs,
        (input) => jina.convert({ url: input.url, timeoutSeconds: 15, allowProxyRetry: false }).pipe(
          Effect3.timeoutFail({
            duration: "20 seconds",
            onTimeout: () => new ProviderTimeoutError({
              provider: "jina",
              operation: "convert:direct",
              durationMs: 2e4
            })
          }),
          Effect3.either
        ),
        { concurrency: CONCURRENCY.JINA_READER }
      );
    }),
    JinaServiceLive
  );
  const successItems = [];
  const failedContents = [];
  const proxyInputs = [];
  const kernelInputs = [];
  let successful = 0;
  let failed = 0;
  const enqueueFailure = (input) => {
    if (kernelEnabled && canTryKernel(input, input.directStructuredError)) {
      kernelInputs.push({
        url: input.url,
        origIndex: input.origIndex,
        proxyError: input.proxyError ? `Scrape.do proxy: ${input.proxyError}` : void 0,
        jinaError: input.directError
      });
      return;
    }
    failed++;
    failedContents.push({ index: input.origIndex, content: formatJinaFirstFailure(input) });
  };
  for (let i = 0; i < directResults.length; i++) {
    const settled = directResults[i];
    const input = inputs[i];
    if (!settled) {
      proxyInputs.push({ ...input, directError: "No result returned" });
      continue;
    }
    if (Either.isLeft(settled)) {
      const reason = effectErrorMessage(settled.left);
      proxyInputs.push({ ...input, directError: reason });
      continue;
    }
    const directError = jinaResultError(settled.right);
    if (!directError) {
      successful++;
      successItems.push({ url: input.url, content: settled.right.content, index: input.origIndex, rawContent: settled.right.content });
      continue;
    }
    const structuredError = settled.right.error;
    if (structuredError && isTerminalReaderError(structuredError)) {
      enqueueFailure({ ...input, directError, directStructuredError: structuredError });
      continue;
    }
    proxyInputs.push({ ...input, directError, directStructuredError: structuredError });
  }
  if (proxyInputs.length > 0) {
    if (!scrapeDoProxyUrl) {
      for (const input of proxyInputs) {
        enqueueFailure({ ...input, proxyError: "SCRAPEDO_API_KEY is not configured" });
      }
    } else {
      mcpLog(
        "info",
        `[concurrency] jina scrape.do proxy branch: retrying ${proxyInputs.length} URL(s) with limit=${CONCURRENCY.JINA_READER}`,
        "scrape"
      );
      const proxyResults = await runExternalEffect(
        Effect3.gen(function* () {
          const jina = yield* JinaService;
          return yield* Effect3.forEach(
            proxyInputs,
            (input) => jina.convert({
              url: input.url,
              timeoutSeconds: 15,
              proxyUrl: scrapeDoProxyUrl,
              noCache: true,
              allowProxyRetry: false
            }).pipe(
              Effect3.timeoutFail({
                duration: "20 seconds",
                onTimeout: () => new ProviderTimeoutError({
                  provider: "jina",
                  operation: "convert:proxy",
                  durationMs: 2e4
                })
              }),
              Effect3.either
            ),
            { concurrency: CONCURRENCY.JINA_READER }
          );
        }),
        JinaServiceLive
      );
      for (let i = 0; i < proxyResults.length; i++) {
        const settled = proxyResults[i];
        const input = proxyInputs[i];
        if (!settled) {
          enqueueFailure({ ...input, proxyError: "No result returned" });
          continue;
        }
        if (Either.isLeft(settled)) {
          enqueueFailure({ ...input, proxyError: effectErrorMessage(settled.left) });
          continue;
        }
        const proxyError = jinaResultError(settled.right);
        if (!proxyError) {
          successful++;
          successItems.push({ url: input.url, content: settled.right.content, index: input.origIndex, rawContent: settled.right.content });
          continue;
        }
        enqueueFailure({ ...input, proxyError });
      }
    }
  }
  if (kernelInputs.length > 0 && kernelEnabled) {
    const kernelPhase = await fetchKernelBranch(kernelInputs);
    successItems.push(...kernelPhase.successItems);
    failedContents.push(...kernelPhase.failedContents);
    successful += kernelPhase.metrics.successful;
    failed += kernelPhase.metrics.failed;
  }
  return { successItems, failedContents, metrics: { successful, failed, totalCredits: 0 } };
}
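// The provider cascade above, in order: (1) direct Jina Reader conversion; (2) for
// non-terminal failures, Jina Reader again through the Scrape.do proxy with caching
// disabled; (3) for URLs that still fail and pass canTryKernel (non-document, not
// NOT_FOUND/INVALID_INPUT), a Kernel headless-browser render. Each stage carries the
// previous stage's error forward so the final failure block lists every layer tried.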
function formatKernelFailure(url, kernelError, proxyError, jinaError) {
  const layers = [
    jinaError ? `Jina Reader: ${jinaError}` : void 0,
    proxyError ? proxyError : void 0,
    `Kernel: ${kernelError}`
  ].filter((line) => Boolean(line));
  return `## ${url}

\u274C All scrape providers failed. ${layers.join(". ")}.`;
}
async function fetchKernelBranch(inputs) {
  if (inputs.length === 0) {
    return { successItems: [], failedContents: [], metrics: { successful: 0, failed: 0, totalCredits: 0 } };
  }
  mcpLog(
    "info",
    `[concurrency] kernel branch: rendering ${inputs.length} URL(s) with limit=${CONCURRENCY.KERNEL}`,
    "scrape"
  );
  const results = await runExternalEffect(
    Effect3.gen(function* () {
      const kernel = yield* KernelService;
      return yield* Effect3.forEach(
        inputs,
        (input) => kernel.render({ url: input.url, timeoutSeconds: 15 }).pipe(
          Effect3.timeoutFail({
            duration: "25 seconds",
            onTimeout: () => new ProviderTimeoutError({
              provider: "kernel",
              operation: "render",
              durationMs: 25e3
            })
          }),
          Effect3.either
        ),
        { concurrency: CONCURRENCY.KERNEL }
      );
    }),
    KernelServiceLive
  );
  const successItems = [];
  const failedContents = [];
  let successful = 0;
  let failed = 0;
  for (let i = 0; i < results.length; i++) {
    const settled = results[i];
    const input = inputs[i];
    if (!settled) {
      failed++;
      failedContents.push({
        index: input.origIndex,
        content: formatKernelFailure(input.url, "No result returned", input.proxyError, input.jinaError)
      });
      continue;
    }
    if (Either.isLeft(settled)) {
      failed++;
      const reason = effectErrorMessage(settled.left);
      failedContents.push({
        index: input.origIndex,
        content: formatKernelFailure(input.url, reason, input.proxyError, input.jinaError)
      });
      continue;
    }
    const result = settled.right;
    if (result.error || result.statusCode < 200 || result.statusCode >= 300) {
      failed++;
      const errorMsg = result.error?.message || `HTTP ${result.statusCode}`;
      failedContents.push({
        index: input.origIndex,
        content: formatKernelFailure(input.url, errorMsg, input.proxyError, input.jinaError)
      });
      continue;
    }
    const content = cleanFetchedContent(result.content, result.finalUrl ?? input.url);
    const quality = assessMarkdownQuality(content);
    if (quality.weak) {
      failed++;
      failedContents.push({
        index: input.origIndex,
        content: formatKernelFailure(
          input.url,
          `Weak Kernel markdown (${quality.reason})`,
          input.proxyError,
          input.jinaError
        )
      });
      continue;
    }
    successful++;
    const finalUrl = result.finalUrl ?? input.url;
    successItems.push({ url: finalUrl, content, index: input.origIndex, rawContent: content });
  }
  return { successItems, failedContents, metrics: { successful, failed, totalCredits: 0 } };
}
function formatRedditPostAsMarkdown(result) {
  const { post, comments } = result;
  const lines = [];
  lines.push(`# ${post.title}`);
  lines.push("");
  lines.push(`**r/${post.subreddit}** \u2022 u/${post.author} \u2022 \u2B06\uFE0F ${post.score} \u2022 \u{1F4AC} ${post.commentCount} comments`);
  lines.push(`\u{1F517} ${post.url}`);
  lines.push("");
  if (post.body) {
    lines.push("## Post content");
    lines.push("");
    lines.push(post.body);
    lines.push("");
  }
  if (comments.length > 0) {
    lines.push(`## Top comments (${comments.length} total)`);
    lines.push("");
    for (const c of comments) {
      const indent = " ".repeat(c.depth);
      const op = c.isOP ? " **[OP]**" : "";
      const score = c.score >= 0 ? `+${c.score}` : `${c.score}`;
      lines.push(`${indent}- **u/${c.author}**${op} _(${score})_`);
      for (const line of c.body.split("\n")) {
        lines.push(`${indent} ${line}`);
      }
      lines.push("");
    }
  }
  return lines.join("\n");
}
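// Sample shape of the markdown this produces (values hypothetical):
//   # Post title
//   **r/rust** \u2022 u/alice \u2022 412 \u2022 97 comments
//   ## Top comments (97 total)
//   - **u/bob** _(+120)_
//     comment body...
// Comment bullets are indented by depth, OP replies are tagged **[OP]**, and scores
// keep an explicit sign so downvoted comments read as _(-3)_.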
async function fetchRedditBranch(inputs) {
  if (inputs.length === 0) {
    return { successItems: [], failedContents: [], metrics: { successful: 0, failed: 0, totalCredits: 0 } };
  }
  const env = parseEnv();
  if (!env.REDDIT_CLIENT_ID || !env.REDDIT_CLIENT_SECRET) {
    const failedContents2 = inputs.map(
      (i) => ({
        index: i.origIndex,
        content: `## ${i.url}

\u274C Reddit URL detected, but Reddit API is not configured. Set \`REDDIT_CLIENT_ID\` and \`REDDIT_CLIENT_SECRET\` in the server env to enable threaded Reddit scraping.`
      })
    );
    return {
      successItems: [],
      failedContents: failedContents2,
      metrics: { successful: 0, failed: inputs.length, totalCredits: 0 }
    };
  }
  const [postInputs, nonPermalinks] = inputs.reduce(
    ([posts, rest], input) => {
      if (isRedditPostPermalink(input.url)) posts.push(input);
      else rest.push(input);
      return [posts, rest];
    },
    [[], []]
  );
  const nonPermalinkFailed = nonPermalinks.map(
    (i) => ({
      index: i.origIndex,
      content: `## ${i.url}

\u274C Only Reddit post permalinks (/r/<sub>/comments/<id>/...) are supported. Use raw-web-search with explicit Reddit permalink probes or smart-web-search with scope:"reddit" to discover post permalinks first.`
    })
  );
  if (postInputs.length === 0) {
    return {
      successItems: [],
      failedContents: nonPermalinkFailed,
      metrics: { successful: 0, failed: nonPermalinks.length, totalCredits: 0 }
    };
  }
  mcpLog("info", `[concurrency] reddit branch: fetching ${postInputs.length} post(s) with limit=${CONCURRENCY.REDDIT}`, "scrape");
  const urls = postInputs.map((i) => i.url);
  const batchResult = await runExternalEffect(
    Effect3.gen(function* () {
      const reddit = yield* RedditService;
      return yield* reddit.batchGetPosts(urls, true).pipe(
        Effect3.timeoutFail({
          duration: "60 seconds",
          onTimeout: () => new ProviderTimeoutError({
            provider: "reddit",
            operation: "batchGetPosts",
            durationMs: 6e4
          })
        })
      );
    }),
    RedditServiceLive(env.REDDIT_CLIENT_ID, env.REDDIT_CLIENT_SECRET)
  );
  const urlToIndex = new Map(postInputs.map((i) => [i.url, i.origIndex]));
  const successItems = [];
  const failedContents = [...nonPermalinkFailed];
  let successful = 0;
  let failed = nonPermalinks.length;
  for (const [url, result] of batchResult.results) {
    const origIndex = urlToIndex.get(url) ?? -1;
    if (result instanceof Error) {
      failed++;
      failedContents.push({ index: origIndex, content: `## ${url}

\u274C Reddit fetch failed: ${result.message}` });
      continue;
    }
    successful++;
    const md = formatRedditPostAsMarkdown(result);
    successItems.push({ url, content: md, index: origIndex, rawContent: md });
  }
  return { successItems, failedContents, metrics: { successful, failed, totalCredits: 0 } };
}
var TERSE_LLM_FAILURE_RE = /^\s*##\s*Matches\s*\n+\s*_Page did not load:\s*([a-z0-9_-]+)_\s*\.?\s*$/i;
var RAW_FALLBACK_CHAR_CAP = 4e3;
function detectTerseFailure(llmOutput) {
  const m = llmOutput.trim().match(TERSE_LLM_FAILURE_RE);
  return m ? m[1] : null;
}
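// Example: an LLM reply consisting solely of
//   ## Matches
//   _Page did not load: paywall_
// matches TERSE_LLM_FAILURE_RE, so detectTerseFailure returns "paywall", which
// mergeLlmWithRawFallback below uses as the reason when re-attaching raw content.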
|
|
3149
|
+
function mergeLlmWithRawFallback(llmOutput, rawContent) {
|
|
3150
|
+
const reason = detectTerseFailure(llmOutput);
|
|
3151
|
+
if (!reason) return llmOutput;
|
|
3152
|
+
const trimmed = rawContent?.trim();
|
|
3153
|
+
if (!trimmed) return llmOutput;
|
|
3154
|
+
const snippet = trimmed.length > RAW_FALLBACK_CHAR_CAP ? trimmed.slice(0, RAW_FALLBACK_CHAR_CAP) + "\n\n\u2026[raw truncated]" : trimmed;
|
|
3155
|
+
return `${llmOutput.trim()}
|
|
3156
|
+
|
|
3157
|
+
## Raw content (LLM flagged page as ${reason})
|
|
3158
|
+
|
|
3159
|
+
${snippet}`;
|
|
3160
|
+
}
|
|
3161
|
+
async function processItemsWithLlm(successItems, enhancedInstruction, llmProcessor, reporter) {
|
|
3162
|
+
let llmErrors = 0;
|
|
3163
|
+
if (!enhancedInstruction) {
|
|
3164
|
+
if (successItems.length > 0) {
|
|
3165
|
+
mcpLog("info", "Raw mode: extract omitted \u2014 returning cleaned scraped content without LLM pass", "scrape");
|
|
3166
|
+
}
|
|
3167
|
+
return { items: successItems, llmErrors, llmAttempted: 0 };
|
|
3168
|
+
}
|
|
3169
|
+
if (!llmProcessor || successItems.length === 0) {
|
|
3170
|
+
if (!llmProcessor && successItems.length > 0) {
|
|
3171
|
+
mcpLog("warning", "LLM unavailable (LLM_API_KEY not set). Returning raw scraped content.", "scrape");
|
|
3172
|
+
void reporter.log("warning", "llm_extractor_unreachable: planner not configured; raw scraped content returned");
|
|
3173
|
+
}
|
|
3174
|
+
return { items: successItems, llmErrors, llmAttempted: 0 };
|
|
3175
|
+
}
|
|
3176
|
+
mcpLog("info", `[concurrency] llm extraction: fanning out ${successItems.length} item(s) with limit=${CONCURRENCY.LLM_EXTRACTION}`, "scrape");
|
|
3177
|
+
const llmResults = await runExternalEffect(
|
|
3178
|
+
Effect3.gen(function* () {
|
|
3179
|
+
const llm = yield* LlmService;
|
|
3180
|
+
return yield* Effect3.forEach(
|
|
3181
|
+
successItems,
|
|
3182
|
+
(item) => llm.extractContent(
|
|
3183
|
+
item.content,
|
|
3184
|
+
{ enabled: true, extract: enhancedInstruction, url: item.url }
|
|
3185
|
+
).pipe(
|
|
3186
|
+
Effect3.timeoutFail({
|
|
3187
|
+
duration: "155 seconds",
|
|
3188
|
+
onTimeout: () => new ProviderTimeoutError({
|
|
3189
|
+
provider: "llm",
|
|
3190
|
+
operation: "extractContent",
|
|
3191
|
+
durationMs: 155e3
|
|
3192
|
+
})
|
|
3193
|
+
}),
|
|
3194
|
+
Effect3.either,
|
|
3195
|
+
Effect3.map((result) => ({ item, result }))
|
|
3196
|
+
),
|
|
3197
|
+
{ concurrency: CONCURRENCY.LLM_EXTRACTION }
|
|
3198
|
+
);
|
|
3199
|
+
}),
|
|
3200
|
+
LlmServiceLive
|
|
3201
|
+
);
|
|
3202
|
+
const processedItems = llmResults.map(({ item, result }) => {
|
|
3203
|
+
mcpLog("debug", `LLM extracting ${item.url}...`, "scrape");
|
|
3204
|
+
if (Either.isLeft(result)) {
|
|
3205
|
+
llmErrors++;
|
|
3206
|
+
const errorMessage = effectErrorMessage(result.left);
|
|
3207
|
+
mcpLog("warning", `LLM extraction failed for ${item.url}: ${errorMessage}`, "scrape");
|
|
3208
|
+
void reporter.log("warning", `llm_extractor_unreachable: ${item.url} \u2014 ${errorMessage}`);
|
|
3209
|
+
const raw2 = item.rawContent?.trim();
|
|
3210
|
+
const rawSnippet2 = raw2 ? `
|
|
3211
|
+
|
|
3212
|
+
## Raw content (unextracted)
|
|
3213
|
+
|
|
3214
|
+
${raw2.length > RAW_FALLBACK_CHAR_CAP ? raw2.slice(0, RAW_FALLBACK_CHAR_CAP) + "\n\n...[raw truncated]" : raw2}` : "";
|
|
3215
|
+
return {
|
|
3216
|
+
...item,
|
|
3217
|
+
content: `\u274C LLM extraction failed: ${errorMessage}${rawSnippet2}`
|
|
3218
|
+
};
|
|
3219
|
+
}
|
|
3220
|
+
const llmResult = result.right;
|
|
3221
|
+
if (llmResult.processed) {
|
|
3222
|
+
const merged = mergeLlmWithRawFallback(llmResult.content, item.rawContent);
|
|
3223
|
+
if (merged !== llmResult.content) {
|
|
3224
|
+
mcpLog("warning", `LLM emitted terse escape line for ${item.url} \u2014 preserved raw fallback`, "scrape");
|
|
3225
|
+
void reporter.log("warning", `llm_terse_escape: ${item.url} \u2014 preserving raw fallback`);
|
|
3226
|
+
}
|
|
3227
|
+
return { ...item, content: merged };
|
|
3228
|
+
}
|
|
3229
|
+
llmErrors++;
|
|
3230
|
+
mcpLog("warning", `LLM extraction failed for ${item.url}: ${llmResult.error || "unknown reason"}`, "scrape");
|
|
3231
|
+
void reporter.log("warning", `llm_extractor_unreachable: ${item.url} \u2014 ${llmResult.error || "unknown reason"}`);
|
|
3232
|
+
const raw = item.rawContent?.trim();
|
|
3233
|
+
const rawSnippet = raw ? `
|
|
3234
|
+
|
|
3235
|
+
## Raw content (unextracted)
|
|
3236
|
+
|
|
3237
|
+
${raw.length > RAW_FALLBACK_CHAR_CAP ? raw.slice(0, RAW_FALLBACK_CHAR_CAP) + "\n\n...[raw truncated]" : raw}` : "";
|
|
3238
|
+
return {
|
|
3239
|
+
...item,
|
|
3240
|
+
content: `\u274C LLM extraction failed: ${llmResult.error || "unknown reason"}${rawSnippet}`
|
|
3241
|
+
};
|
|
3242
|
+
});
|
|
3243
|
+
return { items: processedItems, llmErrors, llmAttempted: successItems.length };
|
|
3244
|
+
}
|
|
3245
|
+
function assembleContentEntries(successItems, failedContents) {
|
|
3246
|
+
const successEntries = successItems.map((item) => {
|
|
3247
|
+
let content = item.content;
|
|
3248
|
+
try {
|
|
3249
|
+
content = removeMetaTags(content);
|
|
3250
|
+
} catch {
|
|
3251
|
+
}
|
|
3252
|
+
return { index: item.index, content: `## ${item.url}
|
|
3253
|
+
|
|
3254
|
+
${content}` };
|
|
3255
|
+
});
|
|
3256
|
+
return [...failedContents, ...successEntries].sort((a, b) => a.index - b.index).map((entry) => entry.content);
|
|
3257
|
+
}
|
|
3258
|
+
function buildScrapeResponse(params, contents, metrics, llmErrors, executionTime, llmAccounting) {
|
|
3259
|
+
const llmExtras = {};
|
|
3260
|
+
if (llmAccounting.llmAttempted > 0) {
|
|
3261
|
+
const ok = llmAccounting.llmAttempted - llmErrors;
|
|
3262
|
+
llmExtras["LLM extraction"] = `${ok}/${llmAccounting.llmAttempted} succeeded`;
|
|
3263
|
+
if (!llmAccounting.llmSucceeded) {
|
|
3264
|
+
llmExtras["LLM credit"] = "0 charged (no extraction produced)";
|
|
3265
|
+
}
|
|
3266
|
+
} else if (llmErrors > 0) {
|
|
3267
|
+
llmExtras["LLM extraction failures"] = llmErrors;
|
|
3268
|
+
}
|
|
3269
|
+
const batchHeader = formatBatchHeader({
|
|
3270
|
+
title: `Scraped Content (${params.urls.length} URLs)`,
|
|
3271
|
+
totalItems: params.urls.length,
|
|
3272
|
+
successful: metrics.successful,
|
|
3273
|
+
failed: metrics.failed,
|
|
3274
|
+
extras: {
|
|
3275
|
+
"Credits used": metrics.totalCredits,
|
|
3276
|
+
...llmExtras
|
|
3277
|
+
}
|
|
3278
|
+
});
|
|
3279
|
+
const formattedContent = formatSuccess({
|
|
3280
|
+
title: "Scraping Complete",
|
|
3281
|
+
summary: batchHeader,
|
|
3282
|
+
data: contents.join("\n\n---\n\n"),
|
|
3283
|
+
metadata: {
|
|
3284
|
+
"Execution time": formatDuration(executionTime)
|
|
3285
|
+
}
|
|
3286
|
+
});
|
|
3287
|
+
return formattedContent;
|
|
3288
|
+
}
|
|
3289
|
+
async function handleScrapeLinksMode(params, reporter = NOOP_REPORTER) {
|
|
3290
|
+
const startTime = Date.now();
|
|
3291
|
+
if (!params.urls || params.urls.length === 0) {
|
|
3292
|
+
return createScrapeErrorResponse("NO_URLS", "No URLs provided", startTime, params.toolName);
|
|
3293
|
+
}
|
|
3294
|
+
if (params.smart && !createLLMProcessor()) {
|
|
3295
|
+
return toolFailure(getMissingEnvMessage("llmExtraction"));
|
|
3296
|
+
}
|
|
3297
|
+
const { webInputs, redditInputs, documentInputs, invalidEntries } = partitionUrls(params.urls);
|
|
3298
|
+
const validCount = webInputs.length + redditInputs.length + documentInputs.length;
|
|
3299
|
+
await reporter.log(
|
|
3300
|
+
"info",
|
|
3301
|
+
`Partitioned ${params.urls.length} URL(s): ${webInputs.length} web, ${redditInputs.length} reddit, ${documentInputs.length} document, ${invalidEntries.length} invalid`
|
|
3302
|
+
);
|
|
3303
|
+
if (validCount === 0) {
|
|
3304
|
+
return createScrapeErrorResponse(
|
|
3305
|
+
"INVALID_URLS",
|
|
3306
|
+
`All ${params.urls.length} URLs are invalid`,
|
|
3307
|
+
startTime,
|
|
3308
|
+
params.toolName,
|
|
3309
|
+
false,
|
|
3310
|
+
[
|
|
3311
|
+
"raw-web-search(keywords=[...]) \u2014 search for valid URLs first, then scrape the results"
|
|
3312
|
+
]
|
|
3313
|
+
);
|
|
3314
|
+
}
|
|
3315
|
+
mcpLog(
|
|
3316
|
+
"info",
|
|
3317
|
+
`Starting ${params.toolName}: ${webInputs.length} web + ${redditInputs.length} reddit + ${documentInputs.length} document URL(s)`,
|
|
3318
|
+
"scrape"
|
|
3319
|
+
);
|
|
3320
|
+
await reporter.progress(15, 100, "Preparing scrape clients");
|
|
3321
|
+
let kernelEnabled;
|
|
3322
|
+
let scrapeDoProxyUrl;
|
|
3323
|
+
try {
|
|
3324
|
+
const env = parseEnv();
|
|
3325
|
+
kernelEnabled = getCapabilities().kernel;
|
|
3326
|
+
scrapeDoProxyUrl = env.SCRAPER_API_KEY ? buildScrapeDoProxyUrl(env.SCRAPER_API_KEY) : void 0;
|
|
3327
|
+
} catch (error2) {
|
|
3328
|
+
const err = classifyError(error2);
|
|
3329
|
+
return createScrapeErrorResponse(
|
|
3330
|
+
"CLIENT_INIT_FAILED",
|
|
3331
|
+
`Failed to initialize scrape providers: ${err.message}`,
|
|
3332
|
+
startTime,
|
|
3333
|
+
params.toolName,
|
|
3334
|
+
false,
|
|
3335
|
+
[
|
|
3336
|
+
'raw-web-search(keywords=["topic key findings", "topic summary"]) \u2014 search instead of scraping'
|
|
3337
|
+
]
|
|
3338
|
+
);
|
|
3339
|
+
}
|
|
3340
|
+
const enhancedInstruction = params.smart ? enhanceExtractionInstruction(params.extract) : void 0;
|
|
3341
|
+
await reporter.progress(35, 100, "Fetching page content");
|
|
3342
|
+
const jinaInputs = [...webInputs, ...documentInputs];
|
|
3343
|
+
const [jinaPhase, redditPhase] = await Promise.all([
|
|
3344
|
+
fetchJinaFirstBranch(jinaInputs, kernelEnabled, scrapeDoProxyUrl),
|
|
3345
|
+
fetchRedditBranch(redditInputs)
|
|
3346
|
+
]);
|
|
3347
|
+
const successItems = [
|
|
3348
|
+
...jinaPhase.successItems,
|
|
3349
|
+
...redditPhase.successItems
|
|
3350
|
+
];
|
|
3351
|
+
const invalidFailed = invalidEntries.map(
|
|
3352
|
+
({ url, origIndex }) => ({ index: origIndex, content: `## ${url}
|
|
3353
|
+
|
|
3354
|
+
\u274C Invalid URL format` })
|
|
3355
|
+
);
|
|
3356
|
+
const failedContents = [
|
|
3357
|
+
...invalidFailed,
|
|
3358
|
+
...jinaPhase.failedContents,
|
|
3359
|
+
...redditPhase.failedContents
|
|
3360
|
+
];
|
|
3361
|
+
const metrics = {
|
|
3362
|
+
successful: jinaPhase.metrics.successful + redditPhase.metrics.successful,
|
|
3363
|
+
failed: invalidEntries.length + jinaPhase.metrics.failed + redditPhase.metrics.failed,
|
|
3364
|
+
totalCredits: 0
|
|
3365
|
+
};
|
|
3366
|
+
await reporter.log("info", `Fetched ${metrics.successful} page(s), ${metrics.failed} failed`);
|
|
3367
|
+
if (successItems.length > 0) {
|
|
3368
|
+
await reporter.progress(80, 100, "Running LLM extraction over fetched pages");
|
|
3369
|
+
}
|
|
3370
|
+
const { items: processedItems, llmErrors, llmAttempted } = await processItemsWithLlm(
|
|
3371
|
+
successItems,
|
|
3372
|
+
enhancedInstruction,
|
|
3373
|
+
createLLMProcessor(),
|
|
3374
|
+
reporter
|
|
3375
|
+
);
|
|
3376
|
+
const contents = assembleContentEntries(processedItems, failedContents);
|
|
3377
|
+
const executionTime = Date.now() - startTime;
|
|
3378
|
+
mcpLog(
|
|
3379
|
+
"info",
|
|
3380
|
+
`Completed: ${metrics.successful} successful, ${metrics.failed} failed, ${metrics.totalCredits} credits used`,
|
|
3381
|
+
"scrape"
|
|
3382
|
+
);
|
|
3383
|
+
const llmSucceeded = llmAttempted > 0 && llmErrors < llmAttempted;
|
|
3384
|
+
const content = buildScrapeResponse(
|
|
3385
|
+
params,
|
|
3386
|
+
contents,
|
|
3387
|
+
metrics,
|
|
3388
|
+
llmErrors,
|
|
3389
|
+
executionTime,
|
|
3390
|
+
{ llmAttempted, llmSucceeded }
|
|
3391
|
+
);
|
|
3392
|
+
if (metrics.successful === 0 && metrics.failed > 0) {
|
|
3393
|
+
return toolFailure(content);
|
|
3394
|
+
}
|
|
3395
|
+
if (params.smart && llmAttempted > 0 && llmErrors === llmAttempted) {
|
|
3396
|
+
return toolFailure(content);
|
|
3397
|
+
}
|
|
3398
|
+
return toolSuccess(content);
|
|
3399
|
+
}
|
|
3400
|
+
function handleRawScrapeLinks(params, reporter = NOOP_REPORTER) {
|
|
3401
|
+
return handleScrapeLinksMode({ ...params, smart: false, toolName: "raw-scrape-links" }, reporter);
|
|
3402
|
+
}
|
|
3403
|
+
function handleSmartScrapeLinks(params, reporter = NOOP_REPORTER) {
|
|
3404
|
+
return handleScrapeLinksMode({ ...params, smart: true, toolName: "smart-scrape-links" }, reporter);
|
|
3405
|
+
}
|
|
3406
|
+
function registerScrapeLinksTools(server) {
  server.tool(
    {
      name: "raw-scrape-links",
      title: "Raw Scrape Links",
      description: "Fetch URLs in parallel and return full markdown directly. Input is only `urls` (1\u201350). Reddit post permalinks route through the Reddit API with threaded comments. Non-Reddit URLs use Jina Reader first, then Jina Reader with Scrape.do proxy mode when SCRAPEDO_API_KEY is configured, then optional Kernel browser rendering for web pages. Use this for full source capture, Reddit comment harvesting, and raw evidence before synthesis.",
      schema: rawScrapeLinksParamsSchema,
      annotations: {
        readOnlyHint: true,
        idempotentHint: true,
        destructiveHint: false,
        openWorldHint: true
      }
    },
    async (args, ctx) => {
      const parsed = rawScrapeLinksParamsSchema.safeParse(args);
      if (!parsed.success) {
        return toToolResponse(toolFailure(formatInputValidationError("raw-scrape-links", parsed.error.issues)));
      }
      const reporter = createToolReporter(ctx, "raw-scrape-links");
      const result = await handleRawScrapeLinks(parsed.data, reporter);
      await reporter.progress(100, 100, result.isError ? "Scrape failed" : "Scrape complete");
      return toToolResponse(result);
    }
  );
  server.tool(
    {
      name: "smart-scrape-links",
      title: "Smart Scrape Links",
      description: "Fetch URLs in parallel, then always run per-URL LLM extraction. Input is `urls` (1\u201350) plus required `extract`. Reddit post permalinks route through the Reddit API with threaded comments. Non-Reddit URLs use Jina Reader first, then Jina Reader with Scrape.do proxy mode when SCRAPEDO_API_KEY is configured, then optional Kernel browser rendering for web pages. Each extracted page returns markdown sections such as `## Source`, `## Matches`, `## Not found`, and `## Follow-up signals`.",
      schema: smartScrapeLinksParamsSchema,
      annotations: {
        readOnlyHint: true,
        idempotentHint: true,
        destructiveHint: false,
        openWorldHint: true
      }
    },
    async (args, ctx) => {
      const parsed = smartScrapeLinksParamsSchema.safeParse(args);
      if (!parsed.success) {
        return toToolResponse(toolFailure(formatInputValidationError("smart-scrape-links", parsed.error.issues)));
      }
      const reporter = createToolReporter(ctx, "smart-scrape-links");
      const result = await handleSmartScrapeLinks(parsed.data, reporter);
      await reporter.progress(100, 100, result.isError ? "Scrape failed" : "Scrape complete");
      return toToolResponse(result);
    }
  );
}

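The block above wires both scrape tools onto the MCP server. A minimal usage sketch (not part of the package) of what the two handlers do once registered; `handleRawScrapeLinks`, `handleSmartScrapeLinks`, and `NOOP_REPORTER` are the bundle's own names, while the URL and extract values are placeholders:

```js
// Hypothetical driver, not from the package: calls the handlers directly,
// the same way the registered tools do after schema validation.
const rawScrape = await handleRawScrapeLinks(
  { urls: ["https://example.com/changelog"] },
  NOOP_REPORTER
);
const smartScrape = await handleSmartScrapeLinks(
  { urls: ["https://example.com/changelog"], extract: "breaking changes | new flags" },
  NOOP_REPORTER
);
console.log(rawScrape.isError, smartScrape.isError);
```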
// src/tools/search.ts
import { Effect as Effect4 } from "effect";

// src/utils/url-aggregator.ts
var MIN_BEYOND_TOP10_WEIGHT = 0;
var BEYOND_TOP10_DECAY = 0.5;
var BEYOND_TOP10_BASE = 10;
var DEFAULT_MIN_CONSENSUS_URLS = 5;
var HIGH_CONSENSUS_THRESHOLD = 4;
var MAX_ALT_SNIPPETS = 3;
var MAX_CONSISTENCY_PENALTY = 0.15;
var CONSISTENCY_STDDEV_SCALE = 5;
function computePositionStats(positions) {
  if (positions.length <= 1) {
    return { mean: positions[0] ?? 0, stdDev: 0, consistencyMultiplier: 1 };
  }
  const mean = positions.reduce((a, b) => a + b, 0) / positions.length;
  const variance = positions.reduce((sum, p) => sum + (p - mean) ** 2, 0) / (positions.length - 1);
  const stdDev = Math.sqrt(variance);
  const consistencyMultiplier = 1 - MAX_CONSISTENCY_PENALTY * Math.min(stdDev / CONSISTENCY_STDDEV_SCALE, 1);
  return { mean, stdDev, consistencyMultiplier };
}
function getCtrWeight(position) {
  if (position >= 1 && position <= 10) {
    return CTR_WEIGHTS[position] ?? 0;
  }
  return Math.max(MIN_BEYOND_TOP10_WEIGHT, BEYOND_TOP10_BASE - (position - BEYOND_TOP10_BASE) * BEYOND_TOP10_DECAY);
}
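A quick numeric check of the two scoring primitives above. This assumes the bundle's definitions are in scope; `CTR_WEIGHTS` itself is declared elsewhere in the file, but positions past 10 never consult it:

```js
// Beyond-top-10 weights decay linearly from BEYOND_TOP10_BASE and floor at 0.
console.log(getCtrWeight(12)); // 10 - (12 - 10) * 0.5 = 9
console.log(getCtrWeight(40)); // 10 - 30 * 0.5 = -5 -> floored to 0
// Scattered positions cap the consistency penalty at MAX_CONSISTENCY_PENALTY.
console.log(computePositionStats([2, 3, 15]).consistencyMultiplier); // 0.85 (stdDev ~7.2 exceeds the scale of 5)
```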
function aggregateResults(searches) {
  const urlMap = /* @__PURE__ */ new Map();
  for (const search of searches) {
    for (const result of search.results) {
      const normalizedUrl = normalizeUrl(result.link);
      const existing = urlMap.get(normalizedUrl);
      if (existing) {
        existing.frequency += 1;
        existing.positions.push(result.position);
        existing.queries.push(search.query);
        const prevBest = existing.bestPosition;
        existing.bestPosition = Math.min(existing.bestPosition, result.position);
        existing.totalScore += getCtrWeight(result.position);
        if (result.snippet && existing.allSnippets.length < MAX_ALT_SNIPPETS && !existing.allSnippets.some((s) => s === result.snippet)) {
          existing.allSnippets.push(result.snippet);
        }
        if (result.position < prevBest) {
          existing.title = result.title;
          existing.snippet = result.snippet;
        }
      } else {
        urlMap.set(normalizedUrl, {
          url: result.link,
          title: result.title,
          snippet: result.snippet,
          allSnippets: result.snippet ? [result.snippet] : [],
          frequency: 1,
          positions: [result.position],
          queries: [search.query],
          bestPosition: result.position,
          totalScore: getCtrWeight(result.position)
        });
      }
    }
  }
  return urlMap;
}
function normalizeUrl(url) {
  try {
    const parsed = new URL(url);
    let host = parsed.hostname.replace(/^www\./, "");
    let path = parsed.pathname.replace(/\/$/, "") || "/";
    return `${host}${path}${parsed.search}`.toLowerCase();
  } catch {
    return url.toLowerCase().replace(/\/$/, "");
  }
}
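A behavior check for `normalizeUrl` (inputs are illustrative): the leading `www.` and trailing slash are dropped, and host, path, and query string are lowercased together before deduplication.

```js
console.log(normalizeUrl("https://www.Example.com/Docs/")); // "example.com/docs"
console.log(normalizeUrl("https://example.com/?q=A"));      // "example.com/?q=a"
console.log(normalizeUrl("not a url/"));                    // "not a url" (fallback path)
```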
function countByFrequency(urlMap, minFrequency) {
  let count = 0;
  for (const url of urlMap.values()) {
    if (url.frequency >= minFrequency) count++;
  }
  return count;
}
function calculateWeightedScores(urls, consensusThreshold, totalQueries) {
  if (urls.length === 0) return [];
  const scored = urls.map((url) => {
    const stats = computePositionStats(url.positions);
    const compositeScore = url.totalScore * stats.consistencyMultiplier;
    return { url, compositeScore, stats };
  });
  scored.sort((a, b) => b.compositeScore - a.compositeScore);
  const maxScore = scored[0].compositeScore;
  return scored.map(({ url, compositeScore, stats }, index) => ({
    url: url.url,
    title: url.title,
    snippet: url.snippet,
    allSnippets: url.allSnippets,
    rank: index + 1,
    score: maxScore > 0 ? compositeScore / maxScore * 100 : 0,
    frequency: url.frequency,
    positions: url.positions,
    queries: url.queries,
    bestPosition: url.bestPosition,
    isConsensus: url.frequency >= consensusThreshold,
    coverageRatio: totalQueries > 0 ? url.frequency / totalQueries : 0,
    positionStdDev: stats.stdDev,
    consistencyMultiplier: stats.consistencyMultiplier
  }));
}
var COVERAGE_TABLE_MAX_ROWS = 20;
function consistencyLabel(stdDev, frequency) {
  if (frequency <= 1) return "n/a";
  if (stdDev < 1.5) return "high";
  if (stdDev < 3.5) return "medium";
  return "variable";
}
function generateUnifiedOutput(rankedUrls, allQueries, queryResults, totalUniqueUrls, frequencyThreshold, thresholdNote, verbose = false) {
  const lines = [];
  const consensusCount = rankedUrls.filter((u) => u.isConsensus).length;
  lines.push(`## Web Search Results (${allQueries.length} queries, ${totalUniqueUrls} unique URLs)`);
  lines.push("");
  if (thresholdNote) {
    lines.push(`> ${thresholdNote}`);
    lines.push("");
  }
  const consensusActive = frequencyThreshold > 1;
  for (const url of rankedUrls) {
    const consensusTag = consensusActive && url.frequency >= HIGH_CONSENSUS_THRESHOLD ? " CONSENSUS+++" : consensusActive && url.isConsensus ? " CONSENSUS" : "";
    const coveragePct = Math.round(url.coverageRatio * 100);
    const consistency = consistencyLabel(url.positionStdDev, url.frequency);
    lines.push(`**${url.rank}. [${url.title}](${url.url})**${consensusTag}`);
    const showRowMetadata = verbose || allQueries.length > 1 && url.frequency > 1 || allQueries.length === 1;
    if (showRowMetadata) {
      const parts = [
        `Score: ${url.score.toFixed(1)}`,
        `Seen in: ${url.frequency}/${allQueries.length} queries (${coveragePct}%)`,
        `Best pos: #${url.bestPosition}`
      ];
      if (url.frequency > 1) {
        parts.push(`Consistency: ${consistency}`);
      }
      lines.push(parts.join(" | "));
    }
    if (url.queries.length > 1 || verbose) {
      lines.push(`Queries: ${url.queries.map((q) => `"${q}"`).join(", ")}`);
    }
    lines.push(`> ${url.snippet}`);
    if (url.allSnippets.length > 1) {
      const alts = url.allSnippets.filter((s) => s !== url.snippet).slice(0, 3).map((s) => s.length > 100 ? s.slice(0, 97) + "..." : s);
      if (alts.length > 0) {
        lines.push(`Alt: ${alts.map((s) => `"${s}"`).join(" | ")}`);
      }
    }
    lines.push("");
  }
  lines.push("---");
  if (allQueries.length <= COVERAGE_TABLE_MAX_ROWS) {
    lines.push("### Query Coverage");
    lines.push("| Query | Results | Top URL | Top Pos |");
    lines.push("|---------|---------|---------|---------|");
    for (const search of queryResults) {
      const topResult = search.results[0];
      let topDomain = "";
      if (topResult) {
        try {
          topDomain = new URL(topResult.link).hostname.replace(/^www\./, "");
        } catch {
          topDomain = topResult.link;
        }
      }
      lines.push(`| "${search.query}" | ${search.results.length} | ${topDomain || "\u2014"} | ${topResult ? `#${topResult.position}` : "\u2014"} |`);
    }
    lines.push("");
  } else {
    const goodCount = queryResults.filter((s) => s.results.length >= 3).length;
    lines.push(`### Query Coverage: ${goodCount}/${allQueries.length} queries returned 3+ results`);
    lines.push("");
  }
  const lowYield = queryResults.filter((s) => s.results.length <= 1);
  if (lowYield.length > 0) {
    lines.push(`**Low-yield queries** (0-1 results): ${lowYield.map((s) => `\`${s.query}\``).join(", ")}`);
    lines.push("");
  }
  const allRelated = /* @__PURE__ */ new Set();
  for (const search of queryResults) {
    if (search.related) {
      for (const r of search.related) {
        allRelated.add(r);
      }
    }
  }
  if (allRelated.size > 0) {
    const related = [...allRelated].slice(0, 10);
    lines.push(`**Related searches:** ${related.map((r) => `\`${r}\``).join(", ")}`);
    lines.push("");
  }
  return lines.join("\n");
}
function aggregateAndRank(searches, minConsensusUrls = DEFAULT_MIN_CONSENSUS_URLS) {
  const urlMap = aggregateResults(searches);
  const totalUniqueUrls = urlMap.size;
  const totalQueries = searches.length;
  const thresholds = [3, 2, 1];
  let usedThreshold = 1;
  let thresholdNote;
  for (const threshold of thresholds) {
    const count = countByFrequency(urlMap, threshold);
    if (count >= minConsensusUrls || threshold === 1) {
      usedThreshold = threshold;
      if (threshold < 3) {
        thresholdNote = `Note: Consensus threshold set to \u2265${threshold} due to result diversity.`;
      }
      break;
    }
  }
  const allUrls = [...urlMap.values()];
  const rankedUrls = calculateWeightedScores(allUrls, usedThreshold, totalQueries);
  return {
    rankedUrls,
    totalUniqueUrls,
    totalQueries,
    frequencyThreshold: usedThreshold,
    thresholdNote
  };
}

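An end-to-end sketch of the aggregator (data is made up; this assumes the whole bundle, including the `CTR_WEIGHTS` table declared elsewhere in the file, is in scope):

```js
const { rankedUrls, frequencyThreshold } = aggregateAndRank([
  { query: "foo pricing", results: [{ link: "https://a.dev/p", title: "A", snippet: "s1", position: 1 }] },
  { query: "foo cost", results: [{ link: "https://www.a.dev/p/", title: "A", snippet: "s2", position: 3 }] }
]);
// Both hits normalize to "a.dev/p", so one entry with frequency 2; with fewer
// than 5 URLs at frequency >= 3 or >= 2, the consensus threshold relaxes to 1.
console.log(rankedUrls[0].frequency, frequencyThreshold); // 2 1
```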
// src/utils/sanitize.ts
var CONTROL_CHARS = /[\x00-\x1f\x7f]/g;
var URLS = /https?:\/\/\S+/gi;
var MARKDOWN_LINKS = /\[([^\]]+)\]\([^)]+\)/g;
function sanitizeSuggestion(input) {
  return input.replace(CONTROL_CHARS, " ").replace(MARKDOWN_LINKS, "$1").replace(URLS, "").replace(/\s+/g, " ").trim();
}

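A behavior check for `sanitizeSuggestion` (input is illustrative): markdown links keep their label, bare URLs are removed, and control characters collapse into single spaces.

```js
console.log(sanitizeSuggestion("see [docs](https://x.dev/a) at https://x.dev/b\tnow"));
// "see docs at now"
```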
// src/utils/query-relax.ts
var QUOTED_PHRASE_RE = /"([^"]*)"/g;
var HAS_BOOLEAN_GROUPING = /\b(?:OR|AND)\b|[()]/;
var OPERATOR_CHAR_IN_PHRASE = /[():[\]]/;
var OPERATOR_CHAR_GLOBAL = /[():[\]]/g;
var PATH_LIKE_IN_PHRASE = /\/|~\/|^@|\.{3,}/;
var URI_SCHEME_IN_PHRASE = /^[a-z][a-z0-9+.-]*:/i;
var HAS_SITE_OPERATOR = /\bsite:\S+/i;
var SITE_OPERATOR_GLOBAL = /\bsite:\S+/gi;
function renderSeg(seg) {
  return seg.type === "raw" ? seg.text : seg.quoted ? `"${seg.text}"` : seg.text;
}
function tokenize(query) {
  const segs = [];
  let last = 0;
  for (const m of query.matchAll(QUOTED_PHRASE_RE)) {
    const start = m.index ?? 0;
    const end = start + m[0].length;
    if (start > last) segs.push({ type: "raw", text: query.slice(last, start) });
    segs.push({ type: "phrase", text: m[1] ?? "", quoted: true });
    last = end;
  }
  if (last < query.length) segs.push({ type: "raw", text: query.slice(last) });
  return segs;
}
function rebuild(segs) {
  return segs.map(renderSeg).join("").replace(/\s+/g, " ").trim();
}
function onlyWhitespaceBetween(segs, fromIndex, toIndex) {
  for (let i = fromIndex + 1; i < toIndex; i += 1) {
    const seg = segs[i];
    if (seg === void 0) continue;
    if (seg.type !== "raw" || seg.text.trim() !== "") {
      return false;
    }
  }
  return true;
}
function buildAnchoredOrGroup(segs, quotedIndices) {
  const groupStart = quotedIndices[1];
  const groupEnd = quotedIndices[quotedIndices.length - 1];
  if (groupStart === void 0 || groupEnd === void 0) {
    return null;
  }
  for (let i = 2; i < quotedIndices.length; i += 1) {
    const previous = quotedIndices[i - 1];
    const current = quotedIndices[i];
    if (previous === void 0 || current === void 0 || !onlyWhitespaceBetween(segs, previous, current)) {
      return null;
    }
  }
  const groupIndices = new Set(quotedIndices.slice(1));
  const parts = [];
  for (let i = 0; i < segs.length; i += 1) {
    const seg = segs[i];
    if (seg === void 0) continue;
    if (seg.type === "raw") {
      if (i > groupStart && i < groupEnd) {
        continue;
      }
      parts.push(seg.text);
      continue;
    }
    if (groupIndices.has(i)) {
      parts.push(i === groupStart ? " (" : " OR ");
      parts.push(renderSeg(seg));
      if (i === groupEnd) {
        parts.push(")");
      }
      continue;
    }
    parts.push(renderSeg(seg));
  }
  return parts.join("").replace(/\s+/g, " ").trim();
}
function normalizeQueryForDispatch(query) {
  const original = query.trim().replace(/\s+/g, " ");
  if (!original) {
    return { rewritten: original, changed: false, rules: [] };
  }
  const segs = tokenize(query);
  const rules = [];
  for (const s of segs) {
    if (s.type === "phrase" && s.quoted && OPERATOR_CHAR_IN_PHRASE.test(s.text) && !URI_SCHEME_IN_PHRASE.test(s.text)) {
      s.quoted = false;
      s.text = s.text.replace(OPERATOR_CHAR_GLOBAL, " ");
      if (!rules.includes("A1")) rules.push("A1");
    }
  }
  for (const s of segs) {
    if (s.type === "phrase" && s.quoted && (URI_SCHEME_IN_PHRASE.test(s.text) || PATH_LIKE_IN_PHRASE.test(s.text))) {
      s.quoted = false;
      if (!rules.includes("A2")) rules.push("A2");
    }
  }
  const stillQuotedIndices = [];
  for (let i = 0; i < segs.length; i += 1) {
    const seg = segs[i];
    if (seg?.type === "phrase" && seg.quoted) {
      stillQuotedIndices.push(i);
    }
  }
  const rawJoined = segs.filter((s) => s.type === "raw").map((s) => s.text).join(" ");
  if (stillQuotedIndices.length >= 3 && !HAS_BOOLEAN_GROUPING.test(rawJoined)) {
    const grouped = buildAnchoredOrGroup(segs, stillQuotedIndices);
    if (grouped !== null) {
      rules.push("A3");
      return { rewritten: grouped, changed: grouped !== original, rules };
    }
  }
  const rewritten = rebuild(segs);
  return { rewritten, changed: rewritten !== original, rules };
}
function relaxQueryForRetry(query, options = {}) {
  const original = query.trim().replace(/\s+/g, " ");
  if (!original) {
    return { rewritten: original, changed: false, rules: [] };
  }
  const dropSite = options.dropSite ?? true;
  const rules = [];
  let result = query;
  if (result.includes('"')) {
    result = result.replace(/"/g, "");
    rules.push("B1");
  }
  if (dropSite && HAS_SITE_OPERATOR.test(result)) {
    result = result.replace(SITE_OPERATOR_GLOBAL, " ");
    rules.push("B2");
  }
  result = result.replace(/\s+/g, " ").trim();
  return { rewritten: result, changed: result !== original, rules };
}

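How the two rewriters divide the work, on illustrative queries: `normalizeQueryForDispatch` cleans queries before the first dispatch (rules A1/A2/A3), while `relaxQueryForRetry` loosens them only after a zero-result pass (rules B1/B2).

```js
// A1: operator characters inside a quoted phrase get unquoted and stripped.
console.log(normalizeQueryForDispatch('"useEffect(fn, [])" cleanup'));
// { rewritten: "useEffect fn, cleanup", changed: true, rules: ["A1"] }

// B1 drops quotes, B2 drops site: operators (unless dropSite is false).
console.log(relaxQueryForRetry('"exact phrase" site:reddit.com budget'));
// { rewritten: "exact phrase budget", changed: true, rules: ["B1", "B2"] }
```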
// src/tools/search.ts
function formatInputValidationError2(toolName, issues) {
  const details = issues.map((issue) => {
    const path = issue.path.length > 0 ? issue.path.map(String).join(".") : "<root>";
    return `- ${path}: ${issue.message}`;
  }).join("\n");
  return `Invalid ${toolName} input.

${details}`;
}
var REDDIT_POST_PERMALINK2 = /\/r\/[^/]+\/comments\/[a-z0-9]+\//i;
var REDDIT_HOST2 = /(?:^|\.)reddit\.com$/i;
function redditScopedQuery(query) {
  return /\bsite:reddit\.com\b/i.test(query) ? query : `${query} site:reddit.com`;
}
function buildScopedQueries(queries, scope) {
  if (scope === "web") {
    return queries.map((query) => ({ query, resultScope: "web", dropSiteOnRetry: true }));
  }
  const reddited = queries.map(
    (q) => ({ query: redditScopedQuery(q), resultScope: "reddit", dropSiteOnRetry: false })
  );
  if (scope === "reddit") return reddited;
  return [
    ...queries.map((query) => ({ query, resultScope: "web", dropSiteOnRetry: true })),
    ...reddited
  ];
}
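Scope fan-out in practice (illustrative input): `"both"` doubles the dispatch list, and only the Reddit-scoped copy blocks `site:` removal on retry.

```js
console.log(buildScopedQueries(["rust orm reviews"], "both"));
// [
//   { query: "rust orm reviews", resultScope: "web", dropSiteOnRetry: true },
//   { query: "rust orm reviews site:reddit.com", resultScope: "reddit", dropSiteOnRetry: false }
// ]
```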
async function executeSearches(queries) {
  const env = parseEnv();
  const hasSerper = Boolean(env.SEARCH_API_KEY);
  const hasJina = Boolean(env.JINA_API_KEY);
  if (!hasSerper && hasJina) {
    mcpLog("info", "SERPER_API_KEY missing; using Jina Search fallback as primary search provider", "search");
    return runExternalEffect(
      Effect4.gen(function* () {
        const search = yield* SearchService;
        return yield* search.jinaSearchMultiple(queries);
      }),
      SearchServiceLive
    );
  }
  if (!hasSerper) {
    return {
      searches: [],
      totalQueries: queries.length,
      executionTime: 0,
      error: {
        code: ErrorCode.AUTH_ERROR,
        message: "No search provider configured. Set SERPER_API_KEY or JINA_API_KEY.",
        retryable: false
      }
    };
  }
  const serperResponse = await runExternalEffect(
    Effect4.gen(function* () {
      const search = yield* SearchService;
      return yield* search.serperSearchMultiple(queries);
    }),
    SearchServiceLive
  );
  if (!hasJina) return serperResponse;
  if (serperResponse.error) {
    mcpLog("warning", `Serper failed (${serperResponse.error.message}); trying Jina Search fallback`, "search");
    const jinaResponse2 = await runExternalEffect(
      Effect4.gen(function* () {
        const search = yield* SearchService;
        return yield* search.jinaSearchMultiple(queries);
      }),
      SearchServiceLive
    );
    return jinaResponse2.error ? serperResponse : jinaResponse2;
  }
  const emptyIndices = serperResponse.searches.map((search, index) => search.results.length === 0 ? index : -1).filter((index) => index !== -1);
  if (emptyIndices.length === 0) return serperResponse;
  const fallbackQueries = emptyIndices.map((index) => serperResponse.searches[index]?.query).filter((query) => typeof query === "string" && query.length > 0);
  if (fallbackQueries.length === 0) return serperResponse;
  mcpLog("info", `${fallbackQueries.length} zero-result Serper query/queries; trying Jina Search fallback`, "search");
  const jinaResponse = await runExternalEffect(
    Effect4.gen(function* () {
      const search = yield* SearchService;
      return yield* search.jinaSearchMultiple(fallbackQueries);
    }),
    SearchServiceLive
  );
  if (jinaResponse.error) return serperResponse;
  const fallbackByQuery = new Map(jinaResponse.searches.map((search) => [search.query, search]));
  const mergedSearches = serperResponse.searches.map((search) => {
    if (search.results.length > 0) return search;
    const fallback = fallbackByQuery.get(search.query);
    if (!fallback || fallback.results.length === 0) return search;
    return { ...fallback, query: search.query };
  });
  return {
    ...serperResponse,
    searches: mergedSearches,
    executionTime: serperResponse.executionTime + jinaResponse.executionTime
  };
}
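The last branch above backfills only the zero-result queries. Restated in isolation with stub data (no providers, no Effect runtime), the merge keeps Serper's query text while adopting Jina's results:

```js
const serper = { searches: [{ query: "q1", results: [] }, { query: "q2", results: [{ link: "x" }] }] };
const jina = { searches: [{ query: "q1", results: [{ link: "y" }] }] };
const byQuery = new Map(jina.searches.map((s) => [s.query, s]));
const merged = serper.searches.map((s) => {
  if (s.results.length > 0) return s;
  const fallback = byQuery.get(s.query);
  return fallback && fallback.results.length > 0 ? { ...fallback, query: s.query } : s;
});
console.log(merged.map((s) => s.results.length)); // [1, 1] - q1 backfilled from Jina
```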
async function executeWithRelaxRetry(dispatched, reporter, searchExecutor = executeSearches, retryOptions = {}) {
  const initial = await searchExecutor(dispatched);
  if (initial.error) {
    return { response: initial, retried: [], failurePhase: "initial" };
  }
  const emptyIndices = initial.searches.map((s, i) => s.results.length === 0 ? i : -1).filter((i) => i !== -1);
  if (emptyIndices.length === 0) {
    return { response: initial, retried: [] };
  }
  const plans = [];
  for (const idx of emptyIndices) {
    const dq = dispatched[idx];
    if (typeof dq !== "string") continue;
    const r = relaxQueryForRetry(dq, { dropSite: retryOptions.dropSiteOnRetry?.[idx] ?? true });
    if (r.changed && r.rewritten !== dq) {
      plans.push({ index: idx, original: dq, relaxed: r.rewritten, rules: [...r.rules] });
    }
  }
  if (plans.length === 0) {
    return { response: initial, retried: [] };
  }
  mcpLog(
    "info",
    `${plans.length}/${emptyIndices.length} empty-result queries eligible for relaxation retry`,
    "search"
  );
  await reporter.log(
    "info",
    `${plans.length} queries returned 0 results; retrying with relaxation`
  );
  const retryResp = await searchExecutor(plans.map((p) => p.relaxed));
  const retried = [];
  const retryByIndex = /* @__PURE__ */ new Map();
  plans.forEach((plan, i) => {
    const r = retryResp.searches[i];
    if (r) retryByIndex.set(plan.index, r);
    retried.push({
      original: plan.original,
      retried_with: plan.relaxed,
      rules: plan.rules,
      recovered_results: r?.results.length ?? 0
    });
  });
  if (retryResp.error) {
    mcpLog(
      "warning",
      `Relaxed retry batch failed; preserving initial search results: ${retryResp.error.message}`,
      "search"
    );
    await reporter.log(
      "warning",
      `search_relax_retry_failed: ${retryResp.error.message}`
    );
    return {
      response: initial,
      retried,
      retryError: retryResp.error
    };
  }
  const mergedSearches = initial.searches.map((s, idx) => {
    const r = retryByIndex.get(idx);
    if (r && r.results.length > 0) {
      return { ...r, query: s.query };
    }
    return s;
  });
  return {
    response: { ...initial, searches: mergedSearches },
    retried
  };
}
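A self-contained sketch of the retry wrapper using a stub executor (no network; `NOOP_REPORTER` is the bundle's no-op reporter). The first batch returns nothing for the quoted, site-scoped query, so the relaxed retry fires with rules B1 and B2 and its hit is merged back under the original index:

```js
let batch = 0;
const stubExecutor = async (queries) => {
  batch += 1;
  return {
    searches: queries.map((q) => ({
      query: q,
      results: batch === 1 ? [] : [{ link: "https://example.com/doc", title: "Doc", snippet: "s", position: 1 }]
    }))
  };
};
const { response, retried } = await executeWithRelaxRetry(
  ['"exact phrase" site:x.dev'],
  NOOP_REPORTER,
  stubExecutor
);
console.log(retried[0].rules, response.searches[0].results.length); // ["B1", "B2"] 1
```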
function filterScopedSearches(response, scope, resultScopes = []) {
  if (scope === "web") return response;
  const filtered = response.searches.map((search, index) => {
    const resultScope = resultScopes[index] ?? (scope === "reddit" ? "reddit" : "web");
    return {
      ...search,
      results: search.results.filter((r) => {
        let host;
        try {
          host = new URL(r.link).hostname;
        } catch {
          return true;
        }
        if (resultScope === "reddit") {
          return REDDIT_HOST2.test(host) && REDDIT_POST_PERMALINK2.test(r.link);
        }
        if (!REDDIT_HOST2.test(host)) return true;
        return REDDIT_POST_PERMALINK2.test(r.link);
      })
    };
  });
  return { ...response, searches: filtered };
}
function processResults(response) {
  const aggregation = aggregateAndRank(response.searches, 5);
  return { aggregation };
}
function buildRawOutput(queries, aggregation, searches, verbose = false) {
  return generateUnifiedOutput(
    aggregation.rankedUrls,
    queries,
    searches,
    aggregation.totalUniqueUrls,
    aggregation.frequencyThreshold,
    aggregation.thresholdNote,
    verbose
  );
}
function buildSignalsSection(aggregation, searches, totalQueries) {
  const coverageCount = searches.filter((search) => search.results.length >= 3).length;
  const lowYield = searches.filter((search) => search.results.length <= 1).map((search) => `"${search.query}"`);
  const consensusCount = aggregation.rankedUrls.filter((url) => url.isConsensus).length;
  const lines = [
    "**Signals**",
    `- Coverage: ${coverageCount}/${totalQueries} queries returned \u22653 results`,
    `- Consensus URLs: ${consensusCount}`
  ];
  if (lowYield.length > 0) {
    lines.push(`- Low-yield: ${lowYield.join(", ")}`);
  }
  return lines.join("\n");
}
function buildSuggestedFollowUpsSection(refineQueries) {
  if (!refineQueries || refineQueries.length === 0) {
    return "";
  }
  const lines = ["## Suggested follow-up searches", ""];
  for (const item of refineQueries) {
    const query = sanitizeSuggestion(item.query ?? "");
    if (!query) continue;
    const rationale = sanitizeSuggestion(item.rationale ?? "");
    const gapTag = typeof item.gap_id === "number" ? ` _(closes gap [${item.gap_id}])_` : item.gap_description ? ` _(${sanitizeSuggestion(item.gap_description)})_` : "";
    lines.push(
      rationale ? `- ${query} \u2014 ${rationale}${gapTag}` : `- ${query}${gapTag}`
    );
  }
  return lines.length === 2 ? "" : lines.join("\n");
}
function appendSignalsAndFollowUps(markdown2, signalsSection, refineQueries, options = {}) {
  const includeSignals = options.includeSignals ?? false;
  const sections = [markdown2];
  if (includeSignals && signalsSection) {
    sections.push("", "---", signalsSection);
  }
  const followUps = buildSuggestedFollowUpsSection(refineQueries);
  if (followUps) {
    sections.push("", followUps);
  }
  return sections.join("\n");
}
var MIN_START_HERE = 3;
var MAX_START_HERE = 5;
function buildStartHereSection(tiers, entryByRank, opts = {}) {
  const min = opts.min ?? MIN_START_HERE;
  const max = opts.max ?? MAX_START_HERE;
  const picks = [];
  for (const candidate of tiers.high) {
    if (picks.length >= max) break;
    picks.push({ candidate, tier: "HIGHLY_RELEVANT" });
  }
  if (picks.length < min) {
    const target = Math.min(min, max);
    for (const candidate of tiers.maybe) {
      if (picks.length >= target) break;
      picks.push({ candidate, tier: "MAYBE_RELEVANT" });
    }
  }
  if (picks.length === 0) return "";
  const lines = [];
  lines.push("## Start here \u2014 best candidates for your extract");
  picks.forEach((pick, i) => {
    const entry = entryByRank.get(pick.candidate.rank);
    const reason = entry?.reason && entry.reason.trim().length > 0 ? entry.reason : "\u2014";
    let domain;
    try {
      domain = new URL(pick.candidate.url).hostname.replace(/^www\./, "");
    } catch {
      domain = pick.candidate.url;
    }
    lines.push(
      `${i + 1}. **[${pick.candidate.title}](${pick.candidate.url})** \u2014 ${domain} \u2014 ${reason} *(${pick.tier}, rank ${pick.candidate.rank})*`
    );
  });
  return lines.join("\n");
}
function buildClassifiedOutput(classification, aggregation, extract, searches, totalQueries, verbose = false) {
  const rankedUrls = aggregation.rankedUrls;
  const entryByRank = new Map(classification.results.map((r) => [r.rank, r]));
  const tiers = {
    high: [],
    maybe: [],
    other: []
  };
  for (const url of rankedUrls) {
    const entry = entryByRank.get(url.rank);
    const tier = entry?.tier;
    if (tier === "HIGHLY_RELEVANT") {
      tiers.high.push(url);
    } else if (tier === "MAYBE_RELEVANT") {
      tiers.maybe.push(url);
    } else {
      tiers.other.push(url);
    }
  }
  const lines = [];
  lines.push(`## ${classification.title}`);
  lines.push(`> Looking for: ${extract}`);
  lines.push(`> ${totalQueries} queries \u2192 ${rankedUrls.length} URLs \u2192 ${tiers.high.length} highly relevant, ${tiers.maybe.length} possibly relevant`);
  if (classification.confidence) {
    const confReason = classification.confidence_reason ? ` \u2014 ${classification.confidence_reason}` : "";
    lines.push(`> Confidence: \`${classification.confidence}\`${confReason}`);
  }
  lines.push("");
  const startHere = buildStartHereSection(
    { high: tiers.high, maybe: tiers.maybe },
    entryByRank
  );
  if (startHere) {
    lines.push(startHere);
    lines.push("");
  }
  lines.push(`**Summary:** ${classification.synthesis}`);
  lines.push("");
  const renderRichRow = (url) => {
    const entry = entryByRank.get(url.rank);
    const coveragePct = Math.round(url.coverageRatio * 100);
    const seenIn = `${url.frequency}/${totalQueries} (${coveragePct}%)`;
    const sourceType = entry?.source_type ? `\`${entry.source_type}\`` : "\u2014";
    const reason = entry?.reason ? entry.reason.replace(/\|/g, "\\|") : "\u2014";
    return `| ${url.rank} | [${url.title}](${url.url}) | ${sourceType} | ${seenIn} | ${reason} |`;
  };
  if (tiers.high.length > 0) {
    lines.push(`### Highly Relevant (${tiers.high.length})`);
    lines.push("| # | URL | Source | Seen in | Why |");
    lines.push("|---|-----|--------|---------|-----|");
    for (const url of tiers.high) lines.push(renderRichRow(url));
    lines.push("");
  }
  if (tiers.maybe.length > 0) {
    lines.push(`### Maybe Relevant (${tiers.maybe.length})`);
    lines.push("| # | URL | Source | Seen in | Why |");
    lines.push("|---|-----|--------|---------|-----|");
    for (const url of tiers.maybe) lines.push(renderRichRow(url));
    lines.push("");
  }
  if (tiers.other.length > 0) {
    lines.push(`### Other Results (${tiers.other.length})`);
    lines.push("| # | URL | Source | Score | Queries |");
    lines.push("|---|-----|--------|-------|---------|");
    for (const url of tiers.other) {
      const entry = entryByRank.get(url.rank);
      const queryList = url.queries.map((q) => `"${q}"`).join(", ");
      const sourceType = entry?.source_type ? `\`${entry.source_type}\`` : "\u2014";
      let domain;
      try {
        domain = new URL(url.url).hostname.replace(/^www\./, "");
      } catch {
        domain = url.url;
      }
      lines.push(`| ${url.rank} | ${domain} | ${sourceType} | ${url.score.toFixed(1)} | ${queryList} |`);
    }
    lines.push("");
  }
  if (verbose) {
    lines.push(buildSignalsSection(aggregation, searches, totalQueries));
  }
  if (classification.gaps && classification.gaps.length > 0) {
    lines.push("");
    lines.push("## Gaps");
    for (const gap of classification.gaps) {
      lines.push(`- **[${gap.id}]** ${gap.description}`);
    }
  }
  const followUps = buildSuggestedFollowUpsSection(classification.refine_queries);
  if (followUps) {
    lines.push("");
    lines.push(followUps);
  }
  return lines.join("\n");
}
function formatSearchFailureMessage(error2, phase) {
  if (phase === "initial") {
    return `Search provider failed during initial batch: ${error2.message}`;
  }
  if (phase === "relax-retry") {
    return `Search provider failed during relaxed retry batch: ${error2.message}`;
  }
  return error2.message;
}
function buildWebSearchError(error2, params, startTime, phase) {
  const message = formatSearchFailureMessage(error2, phase);
  const executionTime = Date.now() - startTime;
  mcpLog("error", `${params.toolName}: ${message}`, "search");
  const errorContent = formatError({
    code: error2.code,
    message,
    retryable: error2.retryable,
    toolName: params.toolName,
    howToFix: ["Verify SERPER_API_KEY or JINA_API_KEY is set correctly"],
    alternatives: [
      'raw-web-search(keywords=["topic recommendations site:reddit.com"]) \u2014 unclassified search results',
      "raw-scrape-links(urls=[...]) \u2014 if you already have URLs, fetch their markdown now"
    ]
  });
  return toolFailure(
    `${errorContent}

Execution time: ${formatDuration(executionTime)}
Keywords: ${params.keywords.length}`
  );
}
async function handleSearch(params, reporter = NOOP_REPORTER, searchExecutor = executeSearches) {
  const startTime = Date.now();
  try {
    if (params.smart && !createLLMProcessor()) {
      return toolFailure(getMissingEnvMessage("llmExtraction"));
    }
    const scopedQueries = buildScopedQueries(params.keywords, params.scope);
    const effectiveQueries = scopedQueries.map((entry) => entry.query);
    if (params.scope !== "web") {
      mcpLog("info", `Searching scope=${params.scope}: ${params.keywords.length} input keywords \u2192 ${effectiveQueries.length} dispatched`, "search");
    } else {
      mcpLog("info", `Searching for ${params.keywords.length} keyword(s)`, "search");
    }
    await reporter.log("info", `Searching for ${effectiveQueries.length} query/queries (scope=${params.scope})`);
    await reporter.progress(15, 100, "Submitting search queries");
    const dispatchPlan = effectiveQueries.map((q) => {
      const r = normalizeQueryForDispatch(q);
      return { original: q, dispatched: r.rewritten, rules: [...r.rules], changed: r.changed };
    });
    const dispatchedQueries = dispatchPlan.map((p) => p.dispatched);
    const resultScopes = scopedQueries.map((entry) => entry.resultScope);
    const dropSiteOnRetry = scopedQueries.map((entry) => entry.dropSiteOnRetry);
    const queryRewrites = dispatchPlan.filter((p) => p.changed).map((p) => ({ original: p.original, rewritten: p.dispatched, rules: p.rules }));
    if (queryRewrites.length > 0) {
      mcpLog(
        "info",
        `Pre-dispatch normalized ${queryRewrites.length}/${effectiveQueries.length} queries`,
        "search"
      );
      await reporter.log(
        "info",
        `Normalized ${queryRewrites.length} queries pre-dispatch`
      );
    }
    const {
      response: rawResponse,
      retried: retriedQueries,
      failurePhase,
      retryError
    } = await executeWithRelaxRetry(
      dispatchedQueries,
      reporter,
      searchExecutor,
      { dropSiteOnRetry }
    );
    if (rawResponse.error) {
      await reporter.log("error", `search_provider_failed: ${rawResponse.error.message}`);
      return buildWebSearchError(rawResponse.error, params, startTime, failurePhase);
    }
    const response = filterScopedSearches(rawResponse, params.scope, resultScopes);
    await reporter.progress(50, 100, "Collected search results");
    const { aggregation } = processResults(response);
    await reporter.log(
      "info",
      `Collected ${aggregation.totalUniqueUrls} unique URLs across ${response.totalQueries} queries`
    );
    let markdown2;
    if (!params.smart) {
      markdown2 = appendSignalsAndFollowUps(
        buildRawOutput(params.keywords, aggregation, response.searches, false),
        buildSignalsSection(aggregation, response.searches, response.totalQueries),
        void 0,
        { includeSignals: false }
      );
      await reporter.progress(80, 100, "Ranking search results");
    } else {
      const llmProcessor = createLLMProcessor();
      if (!llmProcessor) {
        return toolFailure(getMissingEnvMessage("llmExtraction"));
      }
      await reporter.progress(65, 100, "Classifying results by relevance");
      const classification = await runExternalEffect(
        Effect4.gen(function* () {
          const llm = yield* LlmService;
          return yield* llm.classifySearchResults(
            aggregation.rankedUrls,
            params.extract ?? "",
            response.totalQueries,
            llmProcessor,
            params.keywords
          );
        }),
        LlmServiceLive
      );
      if (classification.result) {
        markdown2 = buildClassifiedOutput(
          classification.result,
          aggregation,
          params.extract ?? "",
          response.searches,
          response.totalQueries,
          params.verbose
        );
        await reporter.progress(85, 100, "Formatted classified results");
      } else {
        const llmError = classification.error ?? "Unknown classification error";
        mcpLog("warning", `Classification failed for smart-web-search: ${llmError}`, "search");
        await reporter.log("warning", `llm_classifier_failed: ${llmError}`);
        return toolFailure(
          `${formatError({
            code: ErrorCode.SERVICE_UNAVAILABLE,
            message: `LLM classification failed: ${llmError}`,
            retryable: true,
            toolName: params.toolName,
            alternatives: ["raw-web-search(keywords=[...]) \u2014 return unclassified search results"]
          })}

Execution time: ${formatDuration(Date.now() - startTime)}`
        );
      }
    }
    const executionTime = Date.now() - startTime;
    mcpLog("info", `Search completed: ${aggregation.rankedUrls.length} URLs, smart=${params.smart}`, "search");
    await reporter.log("info", `Search completed with ${aggregation.rankedUrls.length} URLs (smart: ${params.smart})`);
    const footerParts = [
      formatDuration(executionTime),
      `${aggregation.totalUniqueUrls} unique URLs`,
      params.smart ? "LLM classified" : "raw search"
    ];
    if (queryRewrites.length > 0) footerParts.push(`${queryRewrites.length} normalized`);
    if (retriedQueries.length > 0) footerParts.push(`${retriedQueries.length} retried`);
    if (retryError) footerParts.push(`retry warning: ${retryError.code}`);
    const footer = `
---
*${footerParts.join(" | ")}*`;
    const fullMarkdown = markdown2 + footer;
    return toolSuccess(fullMarkdown);
  } catch (error2) {
    return buildWebSearchError(classifyError(error2), params, startTime);
  }
}
function handleRawWebSearch(params, reporter = NOOP_REPORTER, searchExecutor = executeSearches) {
  return handleSearch(
    {
      ...params,
      scope: "web",
      verbose: false,
      smart: false,
      toolName: "raw-web-search"
    },
    reporter,
    searchExecutor
  );
}
function handleSmartWebSearch(params, reporter = NOOP_REPORTER, searchExecutor = executeSearches) {
  return handleSearch(
    {
      ...params,
      smart: true,
      toolName: "smart-web-search"
    },
    reporter,
    searchExecutor
  );
}
function registerWebSearchTools(server) {
  server.tool(
    {
      name: "raw-web-search",
      title: "Raw Web Search",
      description: `Fan out raw search keywords in parallel and return the ranked markdown list directly. Serper is primary when configured; Jina Search is fallback when Serper is missing, fails, or yields empty query results. Input is only \`keywords\` (1\u201350 items). ${QUERY_REWRITE_PAIR_GUIDANCE_TEXT} Use this when you need unclassified result data, Reddit permalink discovery via explicit \`site:reddit.com/r/.../comments\` keywords, or broad reconnaissance before synthesis.`,
      schema: rawWebSearchParamsSchema,
      annotations: {
        readOnlyHint: true,
        idempotentHint: true,
        destructiveHint: false,
        openWorldHint: true
      }
    },
    async (args, ctx) => {
      const parsed = rawWebSearchParamsSchema.safeParse(args);
      if (!parsed.success) {
        return toToolResponse(toolFailure(formatInputValidationError2("raw-web-search", parsed.error.issues)));
      }
      if (!getCapabilities().search) {
        return toToolResponse(toolFailure(getMissingEnvMessage("search")));
      }
      const reporter = createToolReporter(ctx, "raw-web-search");
      const result = await handleRawWebSearch(parsed.data, reporter);
      await reporter.progress(100, 100, result.isError ? "Search failed" : "Search complete");
      return toToolResponse(result);
    }
  );
  server.tool(
    {
      name: "smart-web-search",
      title: "Smart Web Search",
      description: `Fan out search keywords in parallel, then always run LLM classification and synthesis against \`extract\`. Serper is primary when configured; Jina Search is fallback when Serper is missing, fails, or yields empty query results. Input carries 1\u201350 \`keywords\`, required \`extract\`, optional \`scope: "web" | "reddit" | "both"\`, and optional \`verbose\`. ${QUERY_REWRITE_PAIR_GUIDANCE_TEXT} Use this for comprehensive research passes where you need HIGHLY_RELEVANT/MAYBE/OTHER tiers, a grounded synthesis with rank citations, gaps, and suggested follow-up searches.`,
      schema: smartWebSearchParamsSchema,
      annotations: {
        readOnlyHint: true,
        idempotentHint: true,
        destructiveHint: false,
        openWorldHint: true
      }
    },
    async (args, ctx) => {
      const parsed = smartWebSearchParamsSchema.safeParse(args);
      if (!parsed.success) {
        return toToolResponse(toolFailure(formatInputValidationError2("smart-web-search", parsed.error.issues)));
      }
      if (!getCapabilities().search) {
        return toToolResponse(toolFailure(getMissingEnvMessage("search")));
      }
      const reporter = createToolReporter(ctx, "smart-web-search");
      const result = await handleSmartWebSearch(parsed.data, reporter);
      await reporter.progress(100, 100, result.isError ? "Search failed" : "Search complete");
      return toToolResponse(result);
    }
  );
}

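A usage sketch (not from the package) driving the two search handlers directly, as the registered tools do after validation; keyword and extract values are placeholders:

```js
const rawSearch = await handleRawWebSearch({
  keywords: ["drizzle vs prisma 2025", "drizzle orm migration pain"]
});
const smartSearch = await handleSmartWebSearch({
  keywords: ["drizzle vs prisma 2025", "drizzle orm migration pain"],
  extract: "migration blockers | performance numbers | maintainer responses",
  scope: "both",
  verbose: false
});
console.log(rawSearch.isError, smartSearch.isError);
```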
// src/tools/start-research.ts
import { Effect as Effect5 } from "effect";

// src/schemas/start-research.ts
import { z as z3 } from "zod";
var startResearchParamsSchema = z3.object({
  goal: z3.string().min(1, { message: "start-research: goal cannot be empty" }).optional().describe(
    'Research goal for this session. When provided AND the LLM planner is configured (LLM_API_KEY + LLM_BASE_URL + LLM_MODEL all set), the server returns a goal-tailored brief: classified goal type (spec | bug | migration | sentiment | pricing | security | synthesis | product_launch), a `primary_branch` recommendation (reddit for sentiment/migration; web for spec/bug/pricing; both when opinion-heavy AND needs official sources), the exact `first_call_sequence` of raw/smart search and scrape calls to fire, 25\u201350 keyword seeds for the first search call, iteration hints, gaps to watch, and stop criteria. The goal also sets the post-sort relevance target, so state the evidence you need and what "done" means. No goal \u2192 the generic 5-tool playbook (no tailored brief). Write the goal as you would to a human researcher \u2014 one or two sentences, specific about what "done" looks like.'
  ),
  include_playbook: z3.boolean().default(false).describe(
    "Include the full 5-tool research playbook (toolbelt overview, the loop, output discipline). Default false \u2014 when the LLM planner is offline the server emits a compact stub that already names the tools and the loop. Pass true only if the agent needs the verbose tactic reference, or to override the degraded-mode shrink."
  )
}).strict();

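A quick validation check against the schema above (values are illustrative): `.strict()` rejects unknown keys and `min(1)` rejects an empty goal.

```js
const ok = startResearchParamsSchema.safeParse({
  goal: "Find confirmed pricing tiers for Vendor X's hosted API; done means a first-party page per tier."
});
const bad = startResearchParamsSchema.safeParse({ goal: "", extra: true });
console.log(ok.success, bad.success); // true false
```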
// src/tools/start-research.ts
|
|
4461
|
+
var SKILL_INSTALL_HINT = [
|
|
4462
|
+
"> \u{1F4A1} **Pair this server with the `run-research` skill** for the full agentic playbook",
|
|
4463
|
+
"> (single-agent loop, multi-agent orchestrator, mission-prompt templates, output discipline).",
|
|
4464
|
+
"> Install once per machine \u2014 the skill is what teaches the agent how to spend these tools well:",
|
|
4465
|
+
">",
|
|
4466
|
+
"> ```bash",
|
|
4467
|
+
"> npx -y skills add -y -g https://github.com/yigitkonur/skills-by-yigitkonur --skill /run-research",
|
|
4468
|
+
"> ```",
|
|
4469
|
+
">",
|
|
4470
|
+
"> Already installed? Skip this \u2014 the skill auto-loads on relevant prompts. The full pack",
|
|
4471
|
+
"> ships ~50 sibling skills: `npx -y skills add -y -g https://github.com/yigitkonur/skills-by-yigitkonur`."
|
|
4472
|
+
].join("\n");
|
|
4473
|
+
function buildStaticScaffolding(goal, opts = {}) {
|
|
4474
|
+
const plannerAvailable = opts.plannerAvailable ?? true;
|
|
4475
|
+
const focusLine = goal ? `> Focus for this session: ${goal}` : "> Focus for this session: not yet specified \u2014 set one on the next pass";
|
|
4476
|
+
const classifierLoopStep = plannerAvailable ? "3. If you used `smart-web-search`, treat its output as a prioritization layer over your keyword fan-out: read `Start here`, HIGHLY/MAYBE tiers, `gaps[]`, and `refine_queries[]`, then decide what to scrape. Do not treat smart search as evidence; it only sees titles/snippets. If you used `raw-web-search`, use the ranked URLs/snippets as the candidate pool and do your own prioritization." : "3. Classifier output is NOT available (LLM planner offline). Use `raw-web-search` and synthesize the terrain yourself from titles + snippets; then scrape before making claims.";
|
|
4477
|
+
return [
|
|
4478
|
+
SKILL_INSTALL_HINT,
|
|
4479
|
+
"",
|
|
4480
|
+
"# Research session started",
|
|
4481
|
+
"",
|
|
4482
|
+
focusLine,
|
|
4483
|
+
"",
|
|
4484
|
+
"You are running a research LOOP, not answering from memory. Training data is stale; the web is authoritative for anything dated, versioned, priced, or contested. Every non-trivial claim in your final answer must be traceable to a raw or smart scrape excerpt you read. Never cite a URL from a search snippet alone.",
|
|
4485
|
+
"",
|
|
4486
|
+
"## The 5 tools",
|
|
4487
|
+
"",
|
|
4488
|
+
"**1. `start-research`** \u2014 you just called me. I plan this session and return the brief below. Call me again only if the goal materially shifts.",
|
|
4489
|
+
"",
|
|
4490
|
+
"**2. `raw-web-search`** \u2014 fan out search keywords in parallel and return the ranked markdown list directly. Serper is primary when configured; Jina Search is fallback when Serper is missing, fails, or yields empty query results. One call carries **up to 50 keywords** in a flat `keywords` array. Use raw search when recall matters more than interpretation: broad discovery, audit trails, exact candidate URLs, cheap second/third passes, and Reddit permalink discovery.",
|
|
4491
|
+
"",
|
|
4492
|
+
"**3. `smart-web-search`** \u2014 fan out search keywords, then always run LLM classification/synthesis with required `extract`. Its value is **prioritization, not evidence**: it ranks title/snippet results into HIGHLY_RELEVANT / MAYBE / OTHER, surfaces `## Gaps`, and suggests sharper follow-up keywords. Use smart search after you have a clear goal and a deliberately diverse keyword set; use raw search when you mainly need all candidate URLs or maximum distinct keyword coverage without classifier cost. Call search **aggressively** \u2014 2\u20134 rounds per session is normal, not 1. **Parallel-safe**: run multiple search calls in the same turn for orthogonal subtopics. `scope` values:",
|
|
4493
|
+
'- `"reddit"` \u2192 server appends `site:reddit.com` and filters to post permalinks. Use for sentiment / migration / lived experience.',
|
|
4494
|
+
'- `"web"` (default) \u2192 open web. Use for spec / bug / pricing / CVE / API / primary-source hunts.',
|
|
4495
|
+
'- `"both"` \u2192 fans each query across both. Use when the topic is opinion-heavy AND needs official sources.',
|
|
4496
|
+
"",
|
|
4497
|
+
`**Query rewrite discipline** \u2014 ${QUERY_REWRITE_PAIR_GUIDANCE_TEXT}`,
|
|
4498
|
+
"",
|
|
4499
|
+
"## Raw vs smart choice",
|
|
4500
|
+
"",
|
|
4501
|
+
"- Prefer **raw search** for exploration and breadth: maximize distinct keywords, source classes, domains, and exact-result capture. Prefer **smart search** for triage: give it the same diverse keyword fan-out plus a precise `extract` goal so it can prioritize which URLs deserve scraping.",
|
|
4502
|
+
"- Prefer **raw scrape** when the page/comments are the research object, when you need complete context, when extraction shape is unclear, or when preserving Reddit threads matters. Prefer **smart scrape** when you know the fields/facets you need and want compact evidence with `## Matches`, `## Not found`, and `## Follow-up signals`.",
|
|
4503
|
+
"- Smart search is allowed to steer attention, but final claims must come from raw or smart scrape content. Smart scrape is usually the highest-value smart tool for answer construction because it reads the page body, not just snippets.",
|
|
4504
|
+
"",
|
|
4505
|
+
"**4. `raw-scrape-links`** \u2014 fetch URLs in parallel and return full markdown directly. **Auto-detects** `reddit.com/r/.../comments/` permalinks and routes them through the Reddit API (threaded post + comments). Non-Reddit URLs use Jina Reader first, then Jina Reader through Scrape.do proxy mode when configured, then optional Kernel browser rendering for web pages. Use raw scrape when you need complete markdown/comments, when the extraction shape is not yet known, or before narrowing a long/important source.",
|
|
4506
|
+
"",
|
|
4507
|
+
"**5. `smart-scrape-links`** \u2014 fetch URLs with the same provider stack, then always run per-URL LLM extraction with required `extract`. Use smart scrape for focused extraction, final evidence packs, long pages where only specific facts matter, and cross-source comparison. Describe extraction SHAPE in `extract`, facets separated by `|`: `root cause | affected versions | fix | workarounds | timeline`. It should preserve numbers/versions/errors/quotes and explicitly say what the source did not answer.",
|
|
4508
|
+
"",
|
|
4509
|
+
"## The loop",
|
|
4510
|
+
"",
|
|
4511
|
+
"1. Read the brief below (if present). Note `primary_branch`, `keyword_seeds`, `gaps_to_watch`, `stop_criteria`.",
|
|
4512
|
+
'2. Fire `first_call_sequence` in order. For `primary_branch: reddit`, lead with `raw-web-search` using Reddit permalink probes, then `raw-scrape-links` on the best post permalinks for full comments. For `web`, use `raw-web-search` for maximum candidate breadth or `smart-web-search scope:"web"` to prioritize a strong keyword fan-out, then `smart-scrape-links` on selected URLs when the extraction shape is known. For `both`, issue parallel search calls split by source need.',
|
|
4513
|
+
classifierLoopStep,
|
|
4514
|
+
"4. Scrape every HIGHLY_RELEVANT plus the 2\u20133 best MAYBE_RELEVANT. Use `raw-scrape-links` first for full documents/comments and ambiguous sources; use `smart-scrape-links` when you can name the extraction facets precisely.",
|
|
4515
|
+
"5. Harvest from smart scrape `## Follow-up signals` and from raw scrape full markdown/comments \u2014 new terms, version numbers, vendor names, failure modes, referenced URLs. These seed your next search round.",
|
|
4516
|
+
"6. Fire the next search round with harvested terms plus any `refine_queries[]` the classifier suggested. Do NOT paraphrase queries already run \u2014 the classifier tracks them.",
|
|
4517
|
+
"7. **Stop** when every `gaps_to_watch` item is closed AND the last search pass surfaced no new terms, OR when you have completed 4 full passes. State remaining gaps explicitly if you hit the cap.",
|
|
4518
|
+
"",
|
|
4519
|
+
"## Output discipline",
|
|
4520
|
+
"",
|
|
4521
|
+
"- Cite URL (or Reddit permalink) for every non-trivial claim \u2014 only from a raw or smart scrape excerpt you read.",
|
|
4522
|
+
"- Quote verbatim: numbers, versions, API names, prices, error messages, stacktraces, people's words.",
|
|
4523
|
+
"- Separate documented facts from inferred conclusions explicitly.",
|
|
4524
|
+
"- Include the scrape date for time-sensitive claims.",
|
|
4525
|
+
"- If you could not verify something, say so \u2014 do not paper over gaps.",
|
|
4526
|
+
"- Never cite a URL from a search snippet alone.",
|
|
4527
|
+
"",
|
|
4528
|
+
"## Post-cutoff discipline",
|
|
4529
|
+
"",
|
|
4530
|
+
"For anything released / changed after your training cutoff \u2014 new products, versions, prices, benchmarks \u2014 treat your own query suggestions as hypotheses until a scraped first-party page confirms them. Include `site:<vendor-domain>` keywords in your first search call when the goal names a vendor or product."
|
|
4531
|
+
].join("\n");
|
|
4532
|
+
}
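The facet syntax that the playbook strings above prescribe for `extract` is easiest to see as a concrete call. Below is a minimal sketch of a `smart-scrape-links` invocation as a raw MCP `tools/call` request; the argument names `urls` and `extract` are assumptions read off the playbook text, not verified against the package's Zod schemas.

```js
// Hypothetical tools/call payload (argument names assumed, not taken from
// the package's schemas). `extract` describes extraction SHAPE, with facets
// separated by `|`, exactly as the playbook above instructs.
const smartScrapeRequest = {
  jsonrpc: "2.0",
  id: 1,
  method: "tools/call",
  params: {
    name: "smart-scrape-links",
    arguments: {
      urls: ["https://example.com/incident-postmortem"],
      extract: "root cause | affected versions | fix | workarounds | timeline"
    }
  }
};
```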
     4533 + function buildDegradedStub(goal) {
     4534 +   const focusLine = goal ? `> Focus for this session: ${goal}` : "> Focus for this session: not specified \u2014 set one on the next pass.";
     4535 +   return [
     4536 +     SKILL_INSTALL_HINT,
     4537 +     "",
     4538 +     "# Research session started (LLM planner offline \u2014 compact stub)",
     4539 +     "",
     4540 +     focusLine,
     4541 +     "",
     4542 +     "**5 tools**: `start-research` (plans), `raw-web-search` (keywords-only raw search), `smart-web-search` (search + required LLM prioritization over snippets, `scope: web|reddit|both`), `raw-scrape-links` (urls-only full markdown/comments), `smart-scrape-links` (scrape + required LLM extraction over page bodies). All are **parallel-callable** \u2014 fire multiple in the same turn when subtopics are orthogonal.",
     4543 +     "",
     4544 +     "**Loop**: search \u2192 scrape \u2192 harvest terms/signals \u2192 next search round \u2192 stop when gaps close OR after 4 passes. Use raw search for breadth, smart search for prioritization, raw scrape for full context, and smart scrape for focused extraction.",
     4545 +     "",
     4546 +     "**Reddit branch**: use `raw-web-search` with Reddit permalink probes for sentiment / migration / lived experience, then `raw-scrape-links` to capture full threaded comments. Use smart search only to prioritize many candidate posts; use smart scrape only after you know the extraction facets.",
     4547 +     "",
     4548 +     "**Cite**: every non-trivial claim must trace to a raw or smart scrape excerpt, never a search snippet. Quote verbatim for numbers, versions, stacktraces, people's words.",
     4549 +     "",
     4550 +     "Pass `include_playbook: true` to `start-research` for the full tactic reference."
     4551 +   ].join("\n");
     4552 + }
     4553 + var PLANNER_FAILURE_THRESHOLD = 2;
     4554 + var PLANNER_FAILURE_TTL_MS = 6e4;
     4555 + function isPlannerKnownOffline(health, nowMs = Date.now()) {
     4556 +   if (!health.plannerConfigured) {
     4557 +     return true;
     4558 +   }
     4559 +   if (health.consecutivePlannerFailures < PLANNER_FAILURE_THRESHOLD) {
     4560 +     return false;
     4561 +   }
     4562 +   if (health.lastPlannerCheckedAt === null) {
     4563 +     return false;
     4564 +   }
     4565 +   const lastMs = Date.parse(health.lastPlannerCheckedAt);
     4566 +   if (Number.isNaN(lastMs)) {
     4567 +     return false;
     4568 +   }
     4569 +   return nowMs - lastMs < PLANNER_FAILURE_TTL_MS;
     4570 + }
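The gate above reads as: treat the planner as offline if it is unconfigured, or if it has failed at least `PLANNER_FAILURE_THRESHOLD` (2) consecutive times and the last check is fresher than `PLANNER_FAILURE_TTL_MS` (6e4 ms, i.e. 60 s). A small sketch exercising it with hypothetical health snapshots whose field names mirror the checks in the function:

```js
// Hypothetical health snapshots; field names mirror isPlannerKnownOffline above.
const now = Date.parse("2025-01-01T00:01:00Z");

// Unconfigured planner: offline regardless of failure history.
isPlannerKnownOffline(
  { plannerConfigured: false, consecutivePlannerFailures: 0, lastPlannerCheckedAt: null },
  now
); // -> true

// One failure is below the threshold of 2: still considered online.
isPlannerKnownOffline(
  { plannerConfigured: true, consecutivePlannerFailures: 1, lastPlannerCheckedAt: "2025-01-01T00:00:55Z" },
  now
); // -> false

// Two failures, last checked 10 s ago (inside the 60 s TTL): offline.
isPlannerKnownOffline(
  { plannerConfigured: true, consecutivePlannerFailures: 2, lastPlannerCheckedAt: "2025-01-01T00:00:50Z" },
  now
); // -> true

// Same failures, but the TTL has lapsed: the gate opens so the planner is retried.
isPlannerKnownOffline(
  { plannerConfigured: true, consecutivePlannerFailures: 2, lastPlannerCheckedAt: "2024-12-31T23:59:00Z" },
  now
); // -> false
```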
     4571 + async function buildGoalAwareBrief(goal, signal) {
     4572 +   const processor = createLLMProcessor();
     4573 +   if (!processor) {
     4574 +     mcpLog("info", "start-research: LLM unavailable, returning static orientation only", "start-research");
     4575 +     return "";
     4576 +   }
     4577 +   void signal;
     4578 +   const brief = await runExternalEffect(
     4579 +     Effect5.gen(function* () {
     4580 +       const llm = yield* LlmService;
     4581 +       return yield* llm.generateResearchBrief(goal, processor);
     4582 +     }),
     4583 +     LlmServiceLive
     4584 +   );
     4585 +   if (!brief) {
     4586 +     mcpLog("warning", "start-research: brief generation failed, returning static orientation only", "start-research");
     4587 +     return "";
     4588 +   }
     4589 +   return renderResearchBrief(brief);
     4590 + }
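`runExternalEffect(Effect5.gen(...), LlmServiceLive)` is the usual Effect pattern of writing a program against a service tag and then supplying a live layer. Below is a self-contained sketch of that pattern using the public `effect` API; `Greeter` and `GreeterLive` are invented stand-ins for `LlmService` / `LlmServiceLive`, and `runExternalEffect` is assumed to do roughly what the last line does, plus error handling:

```js
import { Effect, Context, Layer } from "effect";

// Invented service tag standing in for LlmService.
const Greeter = Context.GenericTag("Greeter");

// Program written against the tag, like the Effect5.gen block above.
const program = Effect.gen(function* () {
  const greeter = yield* Greeter;
  return yield* greeter.greet("research");
});

// Invented live layer standing in for LlmServiceLive.
const GreeterLive = Layer.succeed(Greeter, {
  greet: (name) => Effect.succeed(`hello, ${name}`)
});

// Provide the layer and run the program to a Promise.
const result = await Effect.runPromise(Effect.provide(program, GreeterLive));
```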
     4591 + async function handleStartResearch(params, signal) {
     4592 +   try {
     4593 +     const llmHealth2 = getLLMHealth();
     4594 +     const plannerKnownOffline = isPlannerKnownOffline(llmHealth2);
     4595 +     if (plannerKnownOffline && !params.include_playbook) {
     4596 +       const stub = buildDegradedStub(params.goal);
     4597 +       return toolSuccess(stub);
     4598 +     }
     4599 +     const scaffolding = buildStaticScaffolding(params.goal, {
     4600 +       plannerAvailable: !plannerKnownOffline
     4601 +     });
     4602 +     let brief = "";
     4603 +     if (params.goal) {
     4604 +       brief = await buildGoalAwareBrief(params.goal, signal);
     4605 +     }
     4606 +     const briefFallbackNote = params.goal && !brief ? "\n\n---\n\n> _Goal-tailored brief unavailable: LLM planner is not configured or failed this call. The static playbook above still applies; you can proceed with it, or retry `start-research` after verifying `LLM_API_KEY`._" : "";
     4607 +     const content = brief ? `${scaffolding}
     4608 + 
     4609 + ---
     4610 + 
     4611 + ${brief}` : `${scaffolding}${briefFallbackNote}`;
     4612 +     return toolSuccess(content);
     4613 +   } catch (err) {
     4614 +     const structuredError = classifyError(err);
     4615 +     mcpLog("error", `start-research: ${structuredError.message}`, "start-research");
     4616 +     return toolFailure(
     4617 +       formatError({
     4618 +         code: structuredError.code,
     4619 +         message: structuredError.message,
     4620 +         retryable: structuredError.retryable,
     4621 +         toolName: "start-research",
     4622 +         howToFix: ["Retry start-research. If the failure persists, verify LLM_API_KEY / LLM_BASE_URL / LLM_MODEL."]
     4623 +       })
     4624 +     );
     4625 +   }
     4626 + }
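Taken together with the gate and builders above, the handler produces a few distinct success shapes. A sketch of the branches with hypothetical arguments (the parameter names `goal` and `include_playbook` come from the code itself):

```js
// Degraded path: when isPlannerKnownOffline(...) is true and include_playbook
// is falsy, the caller gets only buildDegradedStub(goal).
await handleStartResearch({ goal: "compare pgbouncer vs pgcat" });

// Full path with a goal: static scaffolding, then "---", then the LLM brief
// when buildGoalAwareBrief succeeds; otherwise scaffolding plus the
// "Goal-tailored brief unavailable" fallback note.
await handleStartResearch({ goal: "compare pgbouncer vs pgcat", include_playbook: true });

// No goal: scaffolding only, since buildGoalAwareBrief is never called.
await handleStartResearch({});
```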
     4627 + function registerStartResearchTool(server) {
     4628 +   server.tool(
     4629 +     {
     4630 +       name: "start-research",
     4631 +       title: "Start Research Session",
     4632 +       description: `Call this FIRST every research session. Provide a \`goal\`; I return a goal-tailored brief naming (a) \`primary_branch\` (reddit for sentiment/migration, web for spec/bug/pricing, both when opinion-heavy AND needs official sources), (b) the exact \`first_call_sequence\` of raw/smart search and scrape calls to fire, (c) 25\u201350 keyword seeds for your first search call, (d) iteration hints, (e) gaps to watch, (f) stop criteria. ${QUERY_REWRITE_PAIR_GUIDANCE_TEXT} No goal? You still get the generic 5-tool playbook. Other tools work without calling this, but you will use them worse.`,
     4633 +       schema: startResearchParamsSchema,
     4634 +       annotations: {
     4635 +         readOnlyHint: true,
     4636 +         idempotentHint: true,
     4637 +         destructiveHint: false,
     4638 +         openWorldHint: false
     4639 +       }
     4640 +     },
     4641 +     async (args) => toToolResponse(await handleStartResearch(args))
     4642 +   );
     4643 + }
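From the client side, the registration above makes `start-research` callable over MCP at the `/mcp` endpoint the startup code below logs. A minimal sketch, assuming the `@modelcontextprotocol/sdk` client and the default host/port constants (`127.0.0.1`, `3e3` = 3000) that appear further down in this diff:

```js
import { Client } from "@modelcontextprotocol/sdk/client/index.js";
import { StreamableHTTPClientTransport } from "@modelcontextprotocol/sdk/client/streamableHttp.js";

// Connect to the server's /mcp endpoint (host/port from the startup code below).
const client = new Client({ name: "example-client", version: "1.0.0" });
await client.connect(new StreamableHTTPClientTransport(new URL("http://127.0.0.1:3000/mcp")));

// goal / include_playbook are the parameters handleStartResearch reads above.
const result = await client.callTool({
  name: "start-research",
  arguments: { goal: "compare pgbouncer vs pgcat", include_playbook: true }
});
```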
     4644 + 
     4645 + // src/tools/registry.ts
     4646 + function registerAllTools(server) {
     4647 +   registerStartResearchTool(server);
     4648 +   registerWebSearchTools(server);
     4649 +   registerScrapeLinksTools(server);
     4650 + }
     4651 + 
     4652 + // index.ts
     4653 + if (!process.env.UV_THREADPOOL_SIZE) {
     4654 +   process.env.UV_THREADPOOL_SIZE = "8";
     4655 + }
     4656 + var DEFAULT_PORT = 3e3;
     4657 + var SHUTDOWN_TIMEOUT_MS = 1e4;
     4658 + var WEBSITE_URL = "https://github.com/yigitkonur/mcp-researchpowerpack";
     4659 + var LOCAL_DEFAULT_HOST = "127.0.0.1";
     4660 + var startupLogger = Logger5.get("startup");
  20 4661   function parseCsvEnv(value) {
  21 4662     if (!value) return void 0;
  22 4663     const parts = value.split(",").map((part) => part.trim()).filter(Boolean);
@@ -66,15 +4707,15 @@ function buildCors(allowedOrigins) {
  66 4707     };
  67 4708   }
  68 4709   function configureLogging() {
  69      - 
     4710 +   Logger5.configure({
  70 4711       level: process.env.NODE_ENV === "production" ? "info" : "debug",
  71 4712       format: "minimal"
  72 4713     });
  73 4714     const debug = process.env.DEBUG?.trim();
  74 4715     if (debug === "2") {
  75      - 
     4716 +     Logger5.setDebug(2);
  76 4717     } else if (debug) {
  77      - 
     4718 +     Logger5.setDebug(1);
  78 4719     }
  79 4720   }
  80 4721   function normalizeOrigin(value, envName) {
@@ -227,9 +4868,9 @@ async function main() {
 227 4868       }
 228 4869       clearTimeout(forceExit);
 229 4870       process.exit(exitCode);
 230      - } catch (
     4871 +     } catch (error2) {
 231 4872       clearTimeout(forceExit);
 232      - const message =
     4873 +       const message = error2 instanceof Error ? error2.stack ?? error2.message : String(error2);
 233 4874       startupLogger.error(`Error while stopping server: ${message}`);
 234 4875       process.exit(1);
 235 4876     }
@@ -240,8 +4881,8 @@ async function main() {
 240 4881     process.on("SIGINT", () => {
 241 4882       void shutdown("SIGINT", 0);
 242 4883     });
 243      - process.on("uncaughtException", (
 244      - startupLogger.error(`Uncaught exception: ${
     4884 +   process.on("uncaughtException", (error2) => {
     4885 +     startupLogger.error(`Uncaught exception: ${error2.stack ?? error2.message}`);
 245 4886       void shutdown("uncaughtException", 1);
 246 4887     });
 247 4888     process.on("unhandledRejection", (reason) => {
@@ -251,8 +4892,8 @@ async function main() {
 251 4892     await server.listen(port);
 252 4893     startupLogger.info(`${SERVER.NAME} v${SERVER.VERSION} listening on http://${host}:${port}/mcp`);
 253 4894   }
 254      - void main().catch((
 255      - const message =
     4895 + void main().catch((error2) => {
     4896 +   const message = error2 instanceof Error ? error2.stack ?? error2.message : String(error2);
 256 4897     startupLogger.error(`Server failed to start: ${message}`);
 257 4898     process.exit(1);
 258 4899   });
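The expression `error2 instanceof Error ? error2.stack ?? error2.message : String(error2)` now appears in three catch paths of this hunk set. A sketch of the same normalization factored into a helper; the helper name is invented, and the shipped bundle keeps the expression inline:

```js
// Hypothetical helper; the bundle repeats this expression inline instead.
function describeError(err) {
  // Prefer the stack (it embeds the message), fall back to the message,
  // and stringify non-Error throwables as a last resort.
  return err instanceof Error ? err.stack ?? err.message : String(err);
}

void main().catch((err) => {
  startupLogger.error(`Server failed to start: ${describeError(err)}`);
  process.exit(1);
});
```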