membot 0.5.0 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/patches/@evantahler%2Fmcpx@0.21.4.patch +17 -10
- package/scripts/apply-patches.sh +7 -5
- package/src/ingest/agent-fetcher.ts +639 -0
- package/src/ingest/fetcher.ts +83 -205
- package/src/ingest/ingest.ts +33 -6
- package/src/refresh/runner.ts +9 -1
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
diff --git a/src/search/onnx-wasm-paths.ts b/src/search/onnx-wasm-paths.ts
|
|
2
2
|
--- a/src/search/onnx-wasm-paths.ts
|
|
3
3
|
+++ b/src/search/onnx-wasm-paths.ts
|
|
4
|
-
@@ -1,31 +1,
|
|
4
|
+
@@ -1,31 +1,16 @@
|
|
5
5
|
-// Embed the onnxruntime-web WASM runtime files into the compiled binary
|
|
6
6
|
-// (`bun build --compile`) so they survive in a single-binary distribution
|
|
7
7
|
-// where the user has no node_modules.
|
|
@@ -33,12 +33,19 @@ diff --git a/src/search/onnx-wasm-paths.ts b/src/search/onnx-wasm-paths.ts
|
|
|
33
33
|
-};
|
|
34
34
|
-
|
|
35
35
|
-export { wasmBinPath, wasmMjsPath };
|
|
36
|
-
+// PATCHED (membot):
|
|
37
|
-
+// of
|
|
38
|
-
+// resolves
|
|
39
|
-
+//
|
|
40
|
-
+//
|
|
41
|
-
+//
|
|
42
|
-
|
|
43
|
-
+
|
|
44
|
-
+
|
|
36
|
+
+// PATCHED (membot): point mcpx's onnx-wasm-paths at the onnxruntime-web installed
|
|
37
|
+
+// at the top of membot's node_modules. Upstream's `../../node_modules/...` only
|
|
38
|
+
+// resolves in mcpx's standalone repo layout; for consumers we walk up 4 levels:
|
|
39
|
+
+// node_modules/@evantahler/mcpx/src/search → node_modules → onnxruntime-web.
|
|
40
|
+
+// biome-ignore lint/suspicious/noTsIgnore: must stay as ts-ignore — relative path only resolves at runtime in consumer layout
|
|
41
|
+
+// @ts-ignore - dynamic-only import
|
|
42
|
+
+import wasmMjsPath from "../../../../onnxruntime-web/dist/ort-wasm-simd-threaded.asyncify.mjs" with {
|
|
43
|
+
+ type: "file",
|
|
44
|
+
+};
|
|
45
|
+
+// biome-ignore lint/suspicious/noTsIgnore: must stay as ts-ignore — relative path only resolves at runtime in consumer layout
|
|
46
|
+
+// @ts-ignore - dynamic-only import
|
|
47
|
+
+import wasmBinPath from "../../../../onnxruntime-web/dist/ort-wasm-simd-threaded.asyncify.wasm" with {
|
|
48
|
+
+ type: "file",
|
|
49
|
+
+};
|
|
50
|
+
+
|
|
51
|
+
+export { wasmBinPath, wasmMjsPath };
|
package/scripts/apply-patches.sh
CHANGED
|
@@ -38,11 +38,13 @@ apply_patch \
|
|
|
38
38
|
"node_modules/@huggingface/transformers" \
|
|
39
39
|
".membot-transformers-patch-applied"
|
|
40
40
|
|
|
41
|
-
# @evantahler/mcpx —
|
|
42
|
-
# `with { type: "file" }` imports
|
|
43
|
-
#
|
|
44
|
-
#
|
|
45
|
-
#
|
|
41
|
+
# @evantahler/mcpx — rewrite `src/search/onnx-wasm-paths.ts` so its static
|
|
42
|
+
# `with { type: "file" }` imports of onnxruntime-web's WASM resolve from the
|
|
43
|
+
# consumer's hoisted node_modules layout (../../../../onnxruntime-web/...)
|
|
44
|
+
# instead of mcpx's own repo layout (../../node_modules/...). With this
|
|
45
|
+
# patch in place, mcpx's semantic search runs end-to-end inside membot
|
|
46
|
+
# (the agent fetcher's `mcp_search` exercises it) and `bun build --compile`
|
|
47
|
+
# can bundle the WASM assets into the standalone binary.
|
|
46
48
|
apply_patch \
|
|
47
49
|
"patches/@evantahler%2Fmcpx@0.21.4.patch" \
|
|
48
50
|
"node_modules/@evantahler/mcpx" \
|
|
@@ -0,0 +1,639 @@
|
|
|
1
|
+
import Anthropic from "@anthropic-ai/sdk";
|
|
2
|
+
import type {
|
|
3
|
+
Tool as AnthropicTool,
|
|
4
|
+
MessageParam,
|
|
5
|
+
ToolResultBlockParam,
|
|
6
|
+
ToolUseBlock,
|
|
7
|
+
} from "@anthropic-ai/sdk/resources/messages";
|
|
8
|
+
import type { LlmConfig } from "../config/schemas.ts";
|
|
9
|
+
import { HelpfulError } from "../errors.ts";
|
|
10
|
+
import { logger } from "../output/logger.ts";
|
|
11
|
+
import { sha256Hex } from "./local-reader.ts";
|
|
12
|
+
|
|
13
|
+
/** Upper bound on agent iterations; one iteration is a single Claude call plus the dispatch of its tool calls. */
const MAX_TURNS = 10;
/** Number of characters of an mcp_exec result echoed back to the LLM as a preview; the harness keeps the full content. */
const PREVIEW_CHARS = 2000;
/** `max_tokens` sent with every Claude call — sized to fit a tool-use response plus some reasoning. */
const MAX_RESPONSE_TOKENS = 4096;
|
|
19
|
+
|
|
20
|
+
/**
 * Result of a successful agent fetch. Structurally mirrors `FetchedRemote`
 * from fetcher.ts; the shape is re-declared here instead of imported because
 * fetcher.ts imports this module and importing back would create a cycle.
 */
export interface AgentFetchedRemote {
  /** Raw content bytes. */
  bytes: Uint8Array;
  /** Hex SHA-256 digest of `bytes`. */
  sha256: string;
  /** MIME type recorded for the content. */
  mimeType: string;
  /** Fetcher discriminator; always `"mcpx"` for agent results. */
  fetcher: "mcpx";
  /** MCP server whose tool produced the content. */
  fetcherServer: string;
  /** Tool on that server that produced the content. */
  fetcherTool: string;
  /** Arguments the tool was invoked with. */
  fetcherArgs: Record<string, unknown>;
  /** URL the fetch was requested for. */
  sourceUrl: string;
}
|
|
34
|
+
|
|
35
|
+
/**
 * The subset of the mcpx client that the agent loop calls. Deliberately
 * small so tests can supply a stub instead of a real client.
 */
export interface AgentMcpxAdapter {
  /** Search the tool catalog for `query`; resolves to matching `{ server, tool }` entries. */
  search(
    query: string,
    options?: { keywordOnly?: boolean; semanticOnly?: boolean },
  ): Promise<Array<{ server: string; tool: string; description?: string; score?: number; matchType?: string }>>;

  /** List tools, optionally restricted to one server. */
  listTools(server?: string): Promise<Array<{ server: string; tool: { name: string; description?: string } }>>;

  /** Look up one tool's metadata; resolves to `undefined` when it is not found. */
  info(
    server: string,
    tool: string,
  ): Promise<{ name: string; description?: string; inputSchema?: unknown } | undefined>;

  /** Execute a tool and resolve to its result envelope. */
  exec(
    server: string,
    tool: string,
    args?: Record<string, unknown>,
  ): Promise<{ isError?: boolean; content?: unknown[] }>;
}
|
|
55
|
+
|
|
56
|
+
/** Inputs accepted by `agentFetch`. */
export interface AgentFetchOptions {
  /** URL whose content should be fetched; interpolated into the agent's first user message. */
  url: string;
  /** mcpx adapter used to run the agent's discovery and exec tool calls. */
  mcpx: AgentMcpxAdapter;
  /** LLM settings; `agentFetch` reads `anthropic_api_key` and `converter_model`. */
  llm: LlmConfig;
  /** Optional extra guidance appended to the first user message. */
  hint?: string;
  /**
   * Optional progress callback. It is handed short, human-readable
   * sublabels describing the agent's current step (e.g. "mcp_exec
   * linear/list_comments (turn 2)"). Wired to the spinner suffix in TTY
   * mode so users see live progress without `--verbose`.
   */
  onProgress?: (sublabel: string) => void;
  /** Test seam: inject a pre-built Anthropic client. */
  _testClient?: Anthropic;
}
|
|
71
|
+
|
|
72
|
+
/**
 * What the agent loop hands back to its caller:
 * - `accepted` — the agent chose a captured mcp_exec result; the caller stores it as the new version.
 * - `fallback` — the agent gave up on mcpx (request_http_fallback, no tool calls, or max turns); the caller does plain HTTP.
 *
 * Failures are not part of this union: an actionable failure (report_failure)
 * or a hard error in the loop is thrown as a HelpfulError instead.
 */
export type AgentFetchOutcome =
  | { kind: "accepted"; result: AgentFetchedRemote }
  | { kind: "fallback"; reason: string };
|
|
79
|
+
|
|
80
|
+
/**
 * System prompt sent with every Claude call in the fetcher loop. It explains
 * the preview-vs-captured-content contract, the markdown format preference,
 * the expected discovery → info → exec workflow, and the three terminal tools.
 */
const FETCHER_SYSTEM_PROMPT = `You are a content fetcher. Your job is to find the right MCP tool to retrieve the content at the given URL, run it, and tell the harness which result to save.

**Important: the harness captures the full result of every mcp_exec call automatically.** You only see a short preview of each result so you can verify it looks reasonable. You do NOT need to read or copy the full content — you just identify which exec call to save.

**Format preference: markdown, in order of preference.**
1. When searching with mcp_search or mcp_list_tools, prefer tools whose names indicate markdown output: anything containing "markdown", "md", "AsMarkdown", "AsMd", "AsDocmd", or similar.
2. If no markdown-named variant exists, use mcp_info to inspect the tool's input schema for a "format", "mime_type", "output_format", or similar parameter and request "markdown" (or "md") when available.
3. If neither is possible, run the tool anyway. The membot pipeline will normalize the captured content downstream — markdown-native tools are still preferred because they're cheaper and higher fidelity, but you do not have to find one.

Workflow:
1. Use mcp_search or mcp_list_tools to find the best tool for this URL (e.g., Google Docs tools for docs.google.com, Firecrawl for generic web pages, GitHub tools for github.com). Apply the format preference above.
2. Use mcp_info to inspect the tool's input schema. **Required before mcp_exec on any tool you haven't called this session.** Many tools want \`document_id\`, \`repo\`, \`page_id\`, etc. — not \`url\`. Extract the right value from the URL.
3. Call mcp_exec with arguments that conform to the schema.
4. **Multi-step workflows are expected.** Many providers need a sequence of calls — e.g. Firecrawl: \`scrape\` returns a job id, then \`get_job_status\` polls until done, then the final result has the content; some doc providers need a \`prepare/export\` call before \`download\`; large docs may paginate. Make as many mcp_exec calls as needed. Read each preview to decide the next step.
5. If the tool errors (input_error / auth_error / "still processing"), read the error, adjust, and retry — or pivot to a different tool.
6. Once a successful exec preview looks like the FINAL content, call accept_content with the exec_call_id (the tool_use_id of that mcp_exec call) and the actual mime_type the tool returned. Pick the call whose result is the actual content — not an intermediate job id or status response.

Terminal tools (call exactly one):
- accept_content(exec_call_id, mime_type?) — save the content captured from a previous mcp_exec call.
- request_http_fallback() — fall back to a basic HTTP fetch. Use only when no MCP tool can handle the URL after a genuine attempt.
- report_failure(message) — surface an actionable message to the user (e.g., "this Google Doc is private — share it with your service account"). Use only when there is a specific next step the user must take.`;
|
|
101
|
+
|
|
102
|
+
/** Wrap a property map in the `{ type: "object", properties, required }` envelope shared by every agent tool schema. */
function objectSchema(properties: Record<string, unknown>, required: string[]): AnthropicTool["input_schema"] {
  return { type: "object", properties, required };
}

/** Terminal tool: save the captured result of an earlier mcp_exec call. */
const acceptContentTool: AnthropicTool = {
  name: "accept_content",
  description:
    "Save the full content captured by the harness from a previous mcp_exec call. You only need to supply the exec_call_id (the tool_use_id of that mcp_exec call). The harness already has the full content. Do NOT paste content here.",
  input_schema: objectSchema(
    {
      exec_call_id: {
        type: "string",
        description:
          "The tool_use_id of the mcp_exec call whose result should be saved (the harness lists captured ids in mcp_exec previews).",
      },
      mime_type: {
        type: "string",
        description:
          "MIME type the source tool returned (e.g. 'text/markdown', 'text/html', 'application/json'). Defaults to text/markdown.",
      },
    },
    ["exec_call_id"],
  ),
};

/** Terminal tool: give up on mcpx and let the caller do a plain HTTP fetch. */
const requestHttpFallbackTool: AnthropicTool = {
  name: "request_http_fallback",
  description: "Fall back to a basic HTTP fetch. Use only when no MCP tool can handle the URL after a genuine attempt.",
  input_schema: objectSchema({}, []),
};

/** Terminal tool: surface an actionable failure message to the user. */
const reportFailureTool: AnthropicTool = {
  name: "report_failure",
  description:
    "Report a fetch failure with an actionable message for the user (e.g., 'this Google Doc is private — share it with your service account'). Use only when there is a clear next step the user must take.",
  input_schema: objectSchema(
    {
      message: {
        type: "string",
        description: "Clear, actionable, user-facing message explaining what the user needs to do.",
      },
    },
    ["message"],
  ),
};

/** Discovery tool: search the mcpx tool catalog. */
const mcpSearchTool: AnthropicTool = {
  name: "mcp_search",
  description:
    "Search for MCP tools by keyword + semantic similarity over the live mcpx catalog. Returns up to a handful of {server, tool, description, score} entries.",
  input_schema: objectSchema(
    {
      query: { type: "string", description: "Search query (e.g. 'fetch google docs as markdown')." },
    },
    ["query"],
  ),
};

/** Discovery tool: list tools, optionally for a single server. */
const mcpListToolsTool: AnthropicTool = {
  name: "mcp_list_tools",
  description: "List available tools from configured MCP servers. Optionally filter by server name.",
  input_schema: objectSchema(
    {
      server: { type: "string", description: "Optional server name to filter on." },
    },
    [],
  ),
};

/** Discovery tool: fetch one tool's full schema. */
const mcpInfoTool: AnthropicTool = {
  name: "mcp_info",
  description:
    "Get the full schema (name, description, input parameters) for a specific MCP tool. Required before mcp_exec on tools you haven't called this session.",
  input_schema: objectSchema(
    {
      server: { type: "string", description: "MCP server name." },
      tool: { type: "string", description: "Tool name on the server." },
    },
    ["server", "tool"],
  ),
};

/** Exec tool: run an MCP tool; the harness captures the full result under the call's tool_use_id. */
const mcpExecTool: AnthropicTool = {
  name: "mcp_exec",
  description:
    "Execute a tool on an MCP server. The full result is captured by the harness keyed by tool_use_id; you receive a short preview to verify the content. To save the result, call accept_content with the exec_call_id.",
  input_schema: objectSchema(
    {
      server: { type: "string", description: "MCP server name." },
      tool: { type: "string", description: "Tool name on the server." },
      args: {
        type: "object",
        description: "Arguments object that conforms to the tool's input schema (verify via mcp_info).",
      },
    },
    ["server", "tool"],
  ),
};

/** Every tool offered to the model, discovery/exec tools first and terminal tools last. */
const ALL_TOOLS: AnthropicTool[] = [
  mcpSearchTool,
  mcpListToolsTool,
  mcpInfoTool,
  mcpExecTool,
  acceptContentTool,
  requestHttpFallbackTool,
  reportFailureTool,
];
|
|
212
|
+
|
|
213
|
+
/** Full result of one successful mcp_exec call, kept by the harness under the call's tool_use_id. */
interface CapturedExec {
  /** MCP server the tool was executed on. */
  server: string;
  /** Name of the executed tool. */
  tool: string;
  /** Arguments the tool was executed with. */
  args: Record<string, unknown>;
  /** Complete text extracted from the tool result (the model only sees a preview of it). */
  content: string;
  /** MIME type recorded for `content` at capture time. */
  mimeType: string;
}
|
|
220
|
+
|
|
221
|
+
/**
 * Run the multi-turn fetcher agent. Mirrors botholomew's `runFetcherLoop`.
 *
 * Each turn sends the conversation to Claude together with the agent tool
 * catalog and then acts on the tool calls in the reply, in this priority:
 * 1. `report_failure` — throws `HelpfulError` (kind `input_error`) carrying the agent's message.
 * 2. `request_http_fallback` — returns `{ kind: "fallback" }`.
 * 3. `accept_content` — returns `{ kind: "accepted", result }` built from the captured
 *    mcp_exec result it names; an invalid call is reported back to the agent so it can retry.
 * 4. anything else — the calls are dispatched in parallel and their results fed back.
 *
 * @param opts - URL, mcpx adapter, LLM config and optional hint / progress callback / test client.
 * @returns `accepted` with the chosen content, or `fallback` when the agent requests it,
 *   produces no tool calls, or exhausts MAX_TURNS.
 * @throws HelpfulError when the API key is empty (`auth_error`), when a reply stops with
 *   `max_tokens` (`internal_error`), or when the agent calls `report_failure` (`input_error`).
 */
export async function agentFetch(opts: AgentFetchOptions): Promise<AgentFetchOutcome> {
  if (!opts.llm.anthropic_api_key || opts.llm.anthropic_api_key.trim() === "") {
    throw new HelpfulError({
      kind: "auth_error",
      message: `agentFetch requires ANTHROPIC_API_KEY but llm.anthropic_api_key is empty.`,
      hint: `Set ANTHROPIC_API_KEY in your environment or under llm.anthropic_api_key in ~/.membot/config.json.`,
    });
  }

  const client = opts._testClient ?? new Anthropic({ apiKey: opts.llm.anthropic_api_key });

  const userPrompt = opts.hint
    ? `Fetch the content at: ${opts.url}\n\nAdditional guidance:\n${opts.hint}`
    : `Fetch the content at: ${opts.url}`;
  const messages: MessageParam[] = [{ role: "user", content: userPrompt }];

  // Full mcp_exec results keyed by tool_use_id; the model only ever sees previews.
  const captured = new Map<string, CapturedExec>();

  opts.onProgress?.(`fetching via mcpx agent (turn 1)`);

  for (let turn = 0; turn < MAX_TURNS; turn++) {
    if (turn > 0) {
      logger.info(`[fetcher] turn ${turn + 1}/${MAX_TURNS}`);
      opts.onProgress?.(`fetching via mcpx agent (turn ${turn + 1})`);
    }

    const response = await client.messages.create({
      model: opts.llm.converter_model,
      max_tokens: MAX_RESPONSE_TOKENS,
      system: FETCHER_SYSTEM_PROMPT,
      messages,
      tools: ALL_TOOLS,
    });

    for (const block of response.content) {
      if (block.type === "text" && block.text.trim()) {
        logger.debug(`[fetcher] turn ${turn + 1} reasoning: ${block.text.trim()}`);
      }
    }

    if (response.stop_reason === "max_tokens") {
      throw new HelpfulError({
        kind: "internal_error",
        message: `Fetcher agent hit max_tokens (${MAX_RESPONSE_TOKENS}) on turn ${turn + 1}.`,
        hint: `The fetched document or the agent's reasoning is too long. Try \`membot add ${opts.url} --fetcher http\` or fetch a more specific section.`,
      });
    }

    const toolUseBlocks = response.content.filter((b): b is ToolUseBlock => b.type === "tool_use");
    if (toolUseBlocks.length === 0) {
      logger.info(`[fetcher] turn ${turn + 1}: no tool calls — falling back to HTTP`);
      return { kind: "fallback", reason: "agent stopped without selecting an outcome" };
    }

    messages.push({ role: "assistant", content: response.content });

    // Report each selected tool: mcp_exec calls are logged at info level (and
    // forwarded to onProgress) because they show which provider was chosen;
    // discovery calls (search / info / list) are logged at debug level only.
    for (const tu of toolUseBlocks) {
      logToolSelection(tu, turn + 1, opts.onProgress);
    }

    // Terminal tools — checked in priority order.
    const failureCall = toolUseBlocks.find((b) => b.name === "report_failure");
    if (failureCall) {
      const input = failureCall.input as Partial<{ message: string }>;
      const message =
        typeof input.message === "string" && input.message.trim()
          ? input.message.trim()
          : "Fetch failed but the agent did not provide a message.";
      logger.info(`[fetcher] turn ${turn + 1}: report_failure: ${message}`);
      throw new HelpfulError({
        kind: "input_error",
        message: `Fetcher agent reported failure for ${opts.url}: ${message}`,
        hint: message,
      });
    }

    const fallbackCall = toolUseBlocks.find((b) => b.name === "request_http_fallback");
    if (fallbackCall) {
      logger.info(`[fetcher] turn ${turn + 1}: agent requested HTTP fallback`);
      return { kind: "fallback", reason: "agent requested HTTP fallback" };
    }

    const acceptCall = toolUseBlocks.find((b) => b.name === "accept_content");
    if (acceptCall) {
      // Model-supplied input: treat both fields as unknown until type-checked.
      const input = acceptCall.input as Partial<{ exec_call_id: unknown; mime_type: unknown }>;
      const execCallId = typeof input.exec_call_id === "string" ? input.exec_call_id : undefined;
      const cached = execCallId === undefined ? undefined : captured.get(execCallId);

      if (!cached) {
        const validIds = [...captured.keys()];
        const rejection =
          execCallId === undefined
            ? "Invalid accept_content call: 'exec_call_id' is required."
            : `No mcp_exec call with id "${execCallId}" was captured. Captured ids: ${validIds.length ? validIds.join(", ") : "(none yet — run mcp_exec first)"}.`;
        // Every tool_use block in the assistant turn must be answered by a
        // tool_result in the next user message, otherwise the following API
        // request is rejected. So besides rejecting accept_content, run any
        // sibling discovery/exec calls from the same turn and return their results.
        const toolResults: ToolResultBlockParam[] = await Promise.all(
          toolUseBlocks.map(
            (toolUse): ToolResultBlockParam | Promise<ToolResultBlockParam> =>
              toolUse.name === "accept_content"
                ? { type: "tool_result", tool_use_id: toolUse.id, content: rejection, is_error: true }
                : dispatchAgentTool(toolUse, opts.mcpx, captured),
          ),
        );
        messages.push({ role: "user", content: toolResults });
        continue;
      }

      // Ignore a non-string mime_type instead of throwing on `.trim()`.
      const requestedMime = typeof input.mime_type === "string" ? input.mime_type : undefined;
      const claimedMime = (requestedMime ?? cached.mimeType ?? "text/markdown").trim() || "text/markdown";
      const bytes = new TextEncoder().encode(cached.content);
      logger.info(`[fetcher] accepted: ${cached.server}/${cached.tool} (${bytes.byteLength} bytes, ${claimedMime})`);
      logger.debug(`[fetcher] accepted args: ${truncateJson(cached.args, 500)}`);
      logger.debug(`[fetcher] accepted preview: ${truncate(cached.content, 200)}`);
      opts.onProgress?.(`accepted ${cached.server}/${cached.tool}`);
      return {
        kind: "accepted",
        result: {
          bytes,
          sha256: sha256Hex(bytes),
          mimeType: claimedMime,
          fetcher: "mcpx",
          fetcherServer: cached.server,
          fetcherTool: cached.tool,
          fetcherArgs: cached.args,
          sourceUrl: opts.url,
        },
      };
    }

    // Discovery / exec tools — execute in parallel, feed results back.
    const toolResults: ToolResultBlockParam[] = await Promise.all(
      toolUseBlocks.map((toolUse) => dispatchAgentTool(toolUse, opts.mcpx, captured)),
    );
    messages.push({ role: "user", content: toolResults });
  }

  logger.info(`[fetcher] max turns (${MAX_TURNS}) exceeded — falling back to HTTP`);
  return { kind: "fallback", reason: `agent exceeded MAX_TURNS=${MAX_TURNS}` };
}
|
|
379
|
+
|
|
380
|
+
/**
 * Log which tool the agent selected on this turn. An mcp_exec call reveals
 * the chosen provider, so it is logged at info level and forwarded to
 * `onProgress`; its arguments and the discovery tools (search / info / list)
 * are logged at debug level. Any other tool name is ignored.
 */
function logToolSelection(tu: ToolUseBlock, turn: number, onProgress?: (s: string) => void): void {
  switch (tu.name) {
    case "mcp_exec": {
      const call = tu.input as Partial<{ server: string; tool: string; args: Record<string, unknown> }>;
      const target = `${call.server ?? "?"}/${call.tool ?? "?"}`;
      logger.info(`[fetcher] turn ${turn}: mcp_exec ${target}`);
      logger.debug(`[fetcher] turn ${turn}: mcp_exec args: ${truncateJson(call.args ?? {}, 500)}`);
      onProgress?.(`mcp_exec ${target} (turn ${turn})`);
      return;
    }
    case "mcp_search": {
      const call = tu.input as Partial<{ query: string }>;
      logger.debug(`[fetcher] turn ${turn}: mcp_search "${call.query ?? ""}"`);
      return;
    }
    case "mcp_info": {
      const call = tu.input as Partial<{ server: string; tool: string }>;
      logger.debug(`[fetcher] turn ${turn}: mcp_info ${call.server ?? "?"}/${call.tool ?? "?"}`);
      return;
    }
    case "mcp_list_tools": {
      const call = tu.input as Partial<{ server: string }>;
      const suffix = call.server ? ` ${call.server}` : "";
      logger.debug(`[fetcher] turn ${turn}: mcp_list_tools${suffix}`);
      return;
    }
    default:
      return;
  }
}
|
|
404
|
+
|
|
405
|
+
/**
 * JSON-stringify `value` with a length cap so a giant args payload doesn't bloat logs.
 *
 * @param value - Anything; values JSON cannot represent fall back to `String(value)`.
 * @param max - Maximum number of characters to keep (negative values are treated as 0).
 * @returns The serialized text, or its first `max` characters followed by `… (+N chars)`.
 */
function truncateJson(value: unknown, max: number): string {
  let s: string;
  try {
    // JSON.stringify returns undefined (not a string) for undefined, functions
    // and symbols, and throws for circular structures and BigInt.
    s = (JSON.stringify(value) as string | undefined) ?? String(value);
  } catch {
    s = String(value);
  }
  if (s.length <= max) return s;

  let cut = Math.max(0, max);
  // Don't cut between the two halves of a surrogate pair (would leave a lone high surrogate).
  const last = s.charCodeAt(cut - 1);
  if (last >= 0xd800 && last <= 0xdbff) cut -= 1;
  return `${s.slice(0, cut)}… (+${s.length - cut} chars)`;
}
|
|
415
|
+
|
|
416
|
+
/**
 * Single-line truncation for debug previews; collapses whitespace.
 *
 * @param s - Text to shorten; runs of whitespace become one space and the ends are trimmed.
 * @param max - Maximum number of characters to keep (negative values are treated as 0).
 * @returns The collapsed text, or its first `max` characters followed by `… (+N chars)`.
 */
function truncate(s: string, max: number): string {
  const oneLine = s.replace(/\s+/g, " ").trim();
  if (oneLine.length <= max) return oneLine;

  let cut = Math.max(0, max);
  // Don't cut between the two halves of a surrogate pair (would leave a lone high surrogate).
  const last = oneLine.charCodeAt(cut - 1);
  if (last >= 0xd800 && last <= 0xdbff) cut -= 1;
  return `${oneLine.slice(0, cut)}… (+${oneLine.length - cut} chars)`;
}
|
|
421
|
+
|
|
422
|
+
/**
 * Run a single agent tool call and build the tool_result block that is fed
 * back to Claude. Unknown tool names and exceptions thrown by a runner are
 * turned into `is_error` results rather than propagated.
 */
async function dispatchAgentTool(
  toolUse: ToolUseBlock,
  mcpx: AgentMcpxAdapter,
  captured: Map<string, CapturedExec>,
): Promise<ToolResultBlockParam> {
  const failure = (content: string): ToolResultBlockParam => ({
    type: "tool_result",
    tool_use_id: toolUse.id,
    content,
    is_error: true,
  });

  try {
    // `return await` keeps runner rejections inside this try block.
    if (toolUse.name === "mcp_search") return await runMcpSearch(toolUse, mcpx);
    if (toolUse.name === "mcp_list_tools") return await runMcpListTools(toolUse, mcpx);
    if (toolUse.name === "mcp_info") return await runMcpInfo(toolUse, mcpx);
    if (toolUse.name === "mcp_exec") return await runMcpExec(toolUse, mcpx, captured);
    return failure(`Unknown tool: ${toolUse.name}`);
  } catch (err) {
    return failure(`Error: ${err instanceof Error ? err.message : String(err)}`);
  }
}
|
|
455
|
+
|
|
456
|
+
/**
 * Handle an `mcp_search` tool call.
 *
 * Validates that `query` is a non-blank string, runs `mcpx.search`, logs the
 * top three hits at debug level, and returns up to ten hits as pretty-printed
 * JSON with a hint about the next step. A search that throws is reported as an
 * `is_error` result suggesting `mcp_list_tools` instead.
 *
 * @param toolUse - The model's tool_use block; `input.query` is model-supplied.
 * @param mcpx - Adapter used to run the search.
 * @returns The tool_result block to send back to the model.
 */
async function runMcpSearch(toolUse: ToolUseBlock, mcpx: AgentMcpxAdapter): Promise<ToolResultBlockParam> {
  const input = toolUse.input as Partial<{ query: string }>;
  if (typeof input.query !== "string" || !input.query.trim()) {
    return { type: "tool_result", tool_use_id: toolUse.id, content: "mcp_search requires 'query'.", is_error: true };
  }
  try {
    const results = await mcpx.search(input.query);
    // Check the type rather than truthiness so a legitimate score of 0 is still shown.
    const top = results
      .slice(0, 3)
      .map((r) => `${r.server}/${r.tool}${typeof r.score === "number" ? ` (${r.score.toFixed(2)})` : ""}`);
    logger.debug(`[fetcher] mcp_search "${input.query}" → ${top.length ? top.join(", ") : "(no hits)"}`);
    return {
      type: "tool_result",
      tool_use_id: toolUse.id,
      content: JSON.stringify(
        {
          results: results.slice(0, 10).map((r) => ({
            server: r.server,
            tool: r.tool,
            description: r.description ?? "",
            score: r.score ?? 0,
          })),
          hint:
            results.length > 0
              ? "Use mcp_info to read the input schema before mcp_exec."
              : "No results. Try broader terms or mcp_list_tools.",
        },
        null,
        2,
      ),
    };
  } catch (err) {
    return {
      type: "tool_result",
      tool_use_id: toolUse.id,
      content: `mcp_search failed: ${err instanceof Error ? err.message : String(err)}. Try mcp_list_tools instead.`,
      is_error: true,
    };
  }
}
|
|
494
|
+
|
|
495
|
+
/**
 * Handle an `mcp_list_tools` tool call.
 *
 * Lists tools through `mcpx.listTools` and returns them as pretty-printed JSON
 * (`{ server, name, description }` per tool) plus a hint about the next step.
 * The optional `server` filter is model-supplied, so it is only forwarded when
 * it is a non-blank string; anything else (missing, null, wrong type, blank)
 * is treated as "no filter".
 *
 * @param toolUse - The model's tool_use block.
 * @param mcpx - Adapter used to list the tools.
 * @returns The tool_result block to send back to the model.
 */
async function runMcpListTools(toolUse: ToolUseBlock, mcpx: AgentMcpxAdapter): Promise<ToolResultBlockParam> {
  const input = (toolUse.input ?? {}) as Partial<{ server: unknown }>;
  const server = typeof input.server === "string" && input.server.trim() ? input.server : undefined;
  const tools = await mcpx.listTools(server);
  const mapped = tools.map((t) => ({ server: t.server, name: t.tool.name, description: t.tool.description ?? "" }));
  return {
    type: "tool_result",
    tool_use_id: toolUse.id,
    content: JSON.stringify(
      {
        tools: mapped,
        hint:
          mapped.length > 0
            ? "Use mcp_info on a {server, name} pair before mcp_exec."
            : "No tools. mcpx may not be configured.",
      },
      null,
      2,
    ),
  };
}
|
|
515
|
+
|
|
516
|
+
/**
 * Handle an `mcp_info` tool call: look up one tool through `mcpx.info` and
 * return its name, description and input schema as pretty-printed JSON with a
 * hint on how to call `mcp_exec`. A missing `server`/`tool` argument or an
 * unknown tool yields an `is_error` result instead.
 */
async function runMcpInfo(toolUse: ToolUseBlock, mcpx: AgentMcpxAdapter): Promise<ToolResultBlockParam> {
  const failure = (content: string): ToolResultBlockParam => ({
    type: "tool_result",
    tool_use_id: toolUse.id,
    content,
    is_error: true,
  });

  const request = toolUse.input as Partial<{ server: string; tool: string }>;
  if (!(typeof request.server === "string" && typeof request.tool === "string")) {
    return failure("mcp_info requires 'server' and 'tool'.");
  }

  const described = await mcpx.info(request.server, request.tool);
  if (!described) {
    return failure(`Tool "${request.tool}" not found on server "${request.server}". Use mcp_search or mcp_list_tools.`);
  }

  const payload = {
    name: described.name,
    description: described.description ?? "",
    input_schema: described.inputSchema ?? {},
    hint: `Call mcp_exec with server='${request.server}', tool='${described.name}', and args matching this schema.`,
  };
  return { type: "tool_result", tool_use_id: toolUse.id, content: JSON.stringify(payload, null, 2) };
}
|
|
550
|
+
|
|
551
|
+
/**
 * Handle an `mcp_exec` tool call.
 *
 * Executes the requested tool through `mcpx.exec`. A thrown error, a result
 * flagged `isError`, or blank extracted text is reported back as an
 * `is_error` tool_result so the agent can adjust and retry. On success the
 * full text is stored in `captured` under the call's tool_use_id and the
 * model receives at most PREVIEW_CHARS characters of it plus a note naming
 * the exec_call_id to pass to accept_content.
 */
async function runMcpExec(
  toolUse: ToolUseBlock,
  mcpx: AgentMcpxAdapter,
  captured: Map<string, CapturedExec>,
): Promise<ToolResultBlockParam> {
  const failure = (content: string): ToolResultBlockParam => ({
    type: "tool_result",
    tool_use_id: toolUse.id,
    content,
    is_error: true,
  });

  const request = toolUse.input as Partial<{ server: string; tool: string; args: Record<string, unknown> }>;
  if (!(typeof request.server === "string" && typeof request.tool === "string")) {
    return failure("mcp_exec requires 'server' and 'tool'.");
  }
  const server = request.server;
  const tool = request.tool;
  const target = `${server}/${tool}`;
  const args = (request.args ?? {}) as Record<string, unknown>;

  let result: { isError?: boolean; content?: unknown[] };
  try {
    result = await mcpx.exec(server, tool, args);
  } catch (err) {
    const msg = err instanceof Error ? err.message : String(err);
    logger.info(`[fetcher] → ${target} threw: ${truncate(msg, 200)}`);
    return failure(
      `mcp_exec ${target} threw: ${msg}. Use mcp_info to verify the schema, then retry — or pivot to a different tool.`,
    );
  }

  const text = extractText(result);

  if (result.isError === true) {
    logger.info(`[fetcher] → ${target} error: ${truncate(text, 200)}`);
    return failure(
      `mcp_exec ${target} returned isError=true: ${text}\n\nUse mcp_info to check the schema, fix the args, and retry — or try a different tool.`,
    );
  }

  if (!text?.trim()) {
    logger.info(`[fetcher] → ${target} empty result`);
    return failure(`mcp_exec ${target} returned empty content. Try a different tool or different args.`);
  }

  logger.info(`[fetcher] → ${target} ok (${text.length} chars)`);
  captured.set(toolUse.id, { server, tool, args, content: text, mimeType: "text/markdown" });

  // The model only gets a preview; the trailing note tells it which id to accept.
  let preview: string;
  if (text.length > PREVIEW_CHARS) {
    preview = `${text.slice(0, PREVIEW_CHARS)}\n\n[... ${text.length - PREVIEW_CHARS} more chars truncated. Full content (${text.length} chars total) is captured by the harness with exec_call_id="${toolUse.id}". Call accept_content with this id to save it.]`;
  } else {
    preview = `${text}\n\n[Full content (${text.length} chars) captured by the harness with exec_call_id="${toolUse.id}". Call accept_content with this id to save it.]`;
  }
  return { type: "tool_result", tool_use_id: toolUse.id, content: preview };
}
|
|
611
|
+
|
|
612
|
+
/**
 * Extract a single string out of an MCP CallToolResult envelope. Mirrors
 * the heterogeneous shapes mcpx tools return; tolerates string content,
 * `text` / `markdown` fields, and the array-of-content-blocks shape.
 *
 * Resolution order:
 *  1. a string is returned as-is; other non-objects (and `null`) yield `""`;
 *  2. a string-valued `text`, `content`, or `markdown` field, in that order;
 *  3. an array `content`: the `text` of every block that has one, joined by
 *     a blank line;
 *  4. an empty `content` array with no `structuredContent` yields `""`, so
 *     callers that test for blank output see it as blank instead of
 *     receiving the serialized envelope;
 *  5. otherwise the whole value serialized as JSON, or `""` when it cannot
 *     be serialized.
 *
 * @param result - Value returned by a tool call.
 * @returns The extracted text; never throws.
 */
function extractText(result: unknown): string {
	if (typeof result === "string") return result;
	if (!result || typeof result !== "object") return "";

	const r = result as Record<string, unknown>;
	if (typeof r.text === "string") return r.text;
	if (typeof r.content === "string") return r.content;
	if (typeof r.markdown === "string") return r.markdown;

	if (Array.isArray(r.content)) {
		// Nothing was returned at all: report blank rather than `{"content":[]}`.
		if (r.content.length === 0 && r.structuredContent === undefined) return "";

		const out: string[] = [];
		for (const c of r.content) {
			if (c && typeof c === "object") {
				const inner = c as Record<string, unknown>;
				if (typeof inner.text === "string") out.push(inner.text);
			}
		}
		if (out.length > 0) return out.join("\n\n");
	}

	try {
		// JSON.stringify can yield undefined at runtime (e.g. a `toJSON` that
		// returns undefined); keep the declared string return type honest.
		return JSON.stringify(result) ?? "";
	} catch {
		// Circular structures, BigInt values, and similar.
		return "";
	}
}
|
package/src/ingest/fetcher.ts
CHANGED
|
@@ -1,6 +1,9 @@
|
|
|
1
|
+
import type { LlmConfig } from "../config/schemas.ts";
|
|
1
2
|
import { DEFAULTS } from "../constants.ts";
|
|
2
3
|
import { asHelpful, HelpfulError } from "../errors.ts";
|
|
3
4
|
import { logger } from "../output/logger.ts";
|
|
5
|
+
import type { AgentMcpxAdapter } from "./agent-fetcher.ts";
|
|
6
|
+
import { agentFetch } from "./agent-fetcher.ts";
|
|
4
7
|
import { sha256Hex } from "./local-reader.ts";
|
|
5
8
|
|
|
6
9
|
export interface FetchedRemote {
|
|
@@ -14,38 +17,45 @@ export interface FetchedRemote {
|
|
|
14
17
|
sourceUrl: string;
|
|
15
18
|
}
|
|
16
19
|
|
|
17
|
-
export interface McpxToolDescriptor {
|
|
18
|
-
server: string;
|
|
19
|
-
tool: { name: string; description?: string };
|
|
20
|
-
}
|
|
21
|
-
|
|
22
|
-
export interface McpxSearchHit {
|
|
23
|
-
server: string;
|
|
24
|
-
tool: { name: string; description?: string };
|
|
25
|
-
score?: number;
|
|
26
|
-
}
|
|
27
|
-
|
|
28
20
|
export interface FetchOptions {
|
|
29
21
|
/**
|
|
30
22
|
* User-provided hint. Free-form keyword (e.g. "firecrawl", "github",
|
|
31
23
|
* "google-docs", "http"). Special-cased: "http" forces plain fetch.
|
|
32
|
-
* Otherwise the hint is
|
|
33
|
-
*
|
|
24
|
+
* Otherwise the hint is passed verbatim to the agent loop as extra
|
|
25
|
+
* guidance about which provider to prefer.
|
|
34
26
|
*/
|
|
35
27
|
hint?: string;
|
|
36
|
-
/** Live mcpx adapter
|
|
37
|
-
mcpx?:
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
28
|
+
/** Live mcpx adapter the agent loop drives via search/list/info/exec. */
|
|
29
|
+
mcpx?: AgentMcpxAdapter | null;
|
|
30
|
+
/**
|
|
31
|
+
* LLM config. The agent loop needs an Anthropic key; without one the
|
|
32
|
+
* mcpx path is skipped and we fall back to plain HTTP.
|
|
33
|
+
*/
|
|
34
|
+
llm?: LlmConfig;
|
|
35
|
+
/**
|
|
36
|
+
* Forwarded to the agent loop so callers (e.g. the ingest progress
|
|
37
|
+
* reporter) can drive a spinner sublabel from per-turn agent activity.
|
|
38
|
+
*/
|
|
39
|
+
onProgress?: (sublabel: string) => void;
|
|
42
40
|
}
|
|
43
41
|
|
|
44
42
|
/**
|
|
45
|
-
* Fetch a remote URL
|
|
46
|
-
*
|
|
47
|
-
*
|
|
48
|
-
*
|
|
43
|
+
* Fetch a remote URL.
|
|
44
|
+
*
|
|
45
|
+
* - `--fetcher http` (or no mcpx, or no LLM key) → plain HTTP.
|
|
46
|
+
* - Otherwise → multi-turn agent loop: Claude is given mcpx tools
|
|
47
|
+
* (search/list/info/exec) and decides how to retrieve the URL,
|
|
48
|
+
* including multi-step flows (start a job → poll → download).
|
|
49
|
+
* The agent's selected mcp_exec invocation is recorded on the
|
|
50
|
+
* returned row so refresh can replay it deterministically without
|
|
51
|
+
* another agent round-trip.
|
|
52
|
+
*
|
|
53
|
+
* If the agent decides plain HTTP is the right call (`request_http_fallback`,
|
|
54
|
+
* no tool calls, max turns) we transparently fall through to `httpFetch`.
|
|
55
|
+
* If the agent reports an actionable failure, we surface that as a
|
|
56
|
+
* `HelpfulError`. If mcpx is configured but the LLM key is missing AND
|
|
57
|
+
* the HTTP fallback also fails, we surface an `auth_error` naming the env
|
|
58
|
+
* var so users see the real cause instead of a misleading 401.
|
|
49
59
|
*/
|
|
50
60
|
export async function fetchRemote(url: string, options: FetchOptions = {}): Promise<FetchedRemote> {
|
|
51
61
|
const mcpx = options.mcpx;
|
|
@@ -54,8 +64,46 @@ export async function fetchRemote(url: string, options: FetchOptions = {}): Prom
|
|
|
54
64
|
if (hint === "http") return httpFetch(url);
|
|
55
65
|
if (!mcpx) return httpFetch(url);
|
|
56
66
|
|
|
57
|
-
const
|
|
58
|
-
if (
|
|
67
|
+
const apiKey = options.llm?.anthropic_api_key?.trim();
|
|
68
|
+
if (!apiKey) {
|
|
69
|
+
// No way to drive the agent. Try HTTP; if that fails, the user
|
|
70
|
+
// almost certainly wanted mcpx — surface a clear key-missing error.
|
|
71
|
+
try {
|
|
72
|
+
return await httpFetch(url);
|
|
73
|
+
} catch (err) {
|
|
74
|
+
if (err instanceof HelpfulError && err.kind === "network_error") {
|
|
75
|
+
throw new HelpfulError({
|
|
76
|
+
kind: "auth_error",
|
|
77
|
+
message: `${url} couldn't be fetched directly (${err.message}). Membot has mcpx configured, but routing through it requires Claude to translate the URL into the right tool arguments — and ANTHROPIC_API_KEY isn't set.`,
|
|
78
|
+
hint: `Set ANTHROPIC_API_KEY in your environment (or under llm.anthropic_api_key in ~/.membot/config.json), then retry. To force the HTTP path explicitly, run \`membot add ${url} --fetcher http\`.`,
|
|
79
|
+
});
|
|
80
|
+
}
|
|
81
|
+
throw err;
|
|
82
|
+
}
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
let outcome: Awaited<ReturnType<typeof agentFetch>>;
|
|
86
|
+
try {
|
|
87
|
+
outcome = await agentFetch({ url, mcpx, llm: options.llm!, hint, onProgress: options.onProgress });
|
|
88
|
+
} catch (err) {
|
|
89
|
+
if (err instanceof HelpfulError) throw err;
|
|
90
|
+
logger.warn(`agent-fetch failed (${err instanceof Error ? err.message : String(err)}) — falling back to HTTP`);
|
|
91
|
+
return httpFetch(url);
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
if (outcome.kind === "accepted") {
|
|
95
|
+
return {
|
|
96
|
+
bytes: outcome.result.bytes,
|
|
97
|
+
sha256: outcome.result.sha256,
|
|
98
|
+
mimeType: outcome.result.mimeType,
|
|
99
|
+
fetcher: "mcpx",
|
|
100
|
+
fetcherServer: outcome.result.fetcherServer,
|
|
101
|
+
fetcherTool: outcome.result.fetcherTool,
|
|
102
|
+
fetcherArgs: outcome.result.fetcherArgs,
|
|
103
|
+
sourceUrl: url,
|
|
104
|
+
};
|
|
105
|
+
}
|
|
106
|
+
logger.info(`[fetcher] falling back to HTTP: ${outcome.reason}`);
|
|
59
107
|
return httpFetch(url);
|
|
60
108
|
}
|
|
61
109
|
|
|
@@ -71,7 +119,7 @@ async function httpFetch(url: string): Promise<FetchedRemote> {
|
|
|
71
119
|
throw asHelpful(
|
|
72
120
|
err,
|
|
73
121
|
`while fetching ${url}`,
|
|
74
|
-
`Check your network and that ${url} is reachable. For mcpx-managed sources (gdocs/github/firecrawl), set
|
|
122
|
+
`Check your network and that ${url} is reachable. For mcpx-managed sources (gdocs/github/firecrawl), set ANTHROPIC_API_KEY so membot can drive an mcpx tool.`,
|
|
75
123
|
"network_error",
|
|
76
124
|
);
|
|
77
125
|
}
|
|
@@ -79,7 +127,7 @@ async function httpFetch(url: string): Promise<FetchedRemote> {
|
|
|
79
127
|
throw new HelpfulError({
|
|
80
128
|
kind: "network_error",
|
|
81
129
|
message: `HTTP ${resp.status} ${resp.statusText}: ${url}`,
|
|
82
|
-
hint: "Verify the URL is reachable and not gated behind auth. For private docs use mcpx
|
|
130
|
+
hint: "Verify the URL is reachable and not gated behind auth. For private docs use mcpx (set ANTHROPIC_API_KEY).",
|
|
83
131
|
});
|
|
84
132
|
}
|
|
85
133
|
const bytes = new Uint8Array(await resp.arrayBuffer());
|
|
@@ -98,183 +146,13 @@ async function httpFetch(url: string): Promise<FetchedRemote> {
|
|
|
98
146
|
}
|
|
99
147
|
|
|
100
148
|
/**
|
|
101
|
-
*
|
|
102
|
-
*
|
|
103
|
-
*
|
|
104
|
-
*
|
|
105
|
-
*
|
|
106
|
-
* label for which provider they want — we never assume server names.
|
|
107
|
-
* 2. Otherwise, fall back to a host-based search query (e.g. URL host
|
|
108
|
-
* "github.com" → search for "github fetch markdown").
|
|
109
|
-
* 3. From the returned candidates, prefer tools whose name or description
|
|
110
|
-
* signals markdown output. Failing that, the first tool that takes a
|
|
111
|
-
* URL-shaped argument.
|
|
112
|
-
* 4. Execute the tool with `{ url, format: "markdown" }`-shaped args.
|
|
113
|
-
* If exec fails, return null so the caller falls back to plain HTTP.
|
|
114
|
-
*/
|
|
115
|
-
async function tryMcpx(
|
|
116
|
-
url: string,
|
|
117
|
-
mcpx: NonNullable<FetchOptions["mcpx"]>,
|
|
118
|
-
hint: string | undefined,
|
|
119
|
-
): Promise<FetchedRemote | null> {
|
|
120
|
-
const candidates = await discoverCandidates(url, mcpx, hint);
|
|
121
|
-
if (candidates.length === 0) return null;
|
|
122
|
-
|
|
123
|
-
const chosen = pickTool(candidates);
|
|
124
|
-
if (!chosen) return null;
|
|
125
|
-
|
|
126
|
-
const args = buildArgs(chosen.tool.name, url);
|
|
127
|
-
let result: unknown;
|
|
128
|
-
try {
|
|
129
|
-
result = await mcpx.exec(chosen.server, chosen.tool.name, args);
|
|
130
|
-
} catch (err) {
|
|
131
|
-
logger.warn(
|
|
132
|
-
`mcpx: ${chosen.server}/${chosen.tool.name} failed (${err instanceof Error ? err.message : String(err)})`,
|
|
133
|
-
);
|
|
134
|
-
return null;
|
|
135
|
-
}
|
|
136
|
-
|
|
137
|
-
const text = extractText(result);
|
|
138
|
-
if (!text || text.trim().length === 0) return null;
|
|
139
|
-
const bytes = new TextEncoder().encode(text);
|
|
140
|
-
return {
|
|
141
|
-
bytes,
|
|
142
|
-
sha256: sha256Hex(bytes),
|
|
143
|
-
mimeType: "text/markdown",
|
|
144
|
-
fetcher: "mcpx",
|
|
145
|
-
fetcherServer: chosen.server,
|
|
146
|
-
fetcherTool: chosen.tool.name,
|
|
147
|
-
fetcherArgs: args,
|
|
148
|
-
sourceUrl: url,
|
|
149
|
-
};
|
|
150
|
-
}
|
|
151
|
-
|
|
152
|
-
/**
|
|
153
|
-
* Build a list of candidate fetcher tools by querying mcpx's live catalog.
|
|
154
|
-
* Tries semantic search first (using the hint or the URL's host as the
|
|
155
|
-
* query) then falls back to listing all tools and filtering by name. Never
|
|
156
|
-
* hardcodes a server name — the catalog is the source of truth.
|
|
149
|
+
* Detect MCP `CallToolResult` envelopes that signal tool failure. MCP
|
|
150
|
+
* tool errors don't throw — they return `{ isError: true, content: [...] }`
|
|
151
|
+
* — so callers must check this explicitly before treating the content
|
|
152
|
+
* as a successful payload. Used by the refresh runner; the agent loop
|
|
153
|
+
* has its own preview-aware check.
|
|
157
154
|
*/
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
hint: string | undefined,
|
|
162
|
-
): Promise<McpxToolDescriptor[]> {
|
|
163
|
-
const host = safeHost(url);
|
|
164
|
-
const queries = buildQueries(hint, host);
|
|
165
|
-
|
|
166
|
-
if (mcpx.search) {
|
|
167
|
-
for (const q of queries) {
|
|
168
|
-
try {
|
|
169
|
-
const hits = await mcpx.search(q);
|
|
170
|
-
if (hits.length > 0) {
|
|
171
|
-
return hits.slice(0, 5).map((h) => ({ server: h.server, tool: h.tool }));
|
|
172
|
-
}
|
|
173
|
-
} catch (err) {
|
|
174
|
-
logger.debug(`mcpx: search(${q}) failed (${err instanceof Error ? err.message : String(err)})`);
|
|
175
|
-
}
|
|
176
|
-
}
|
|
177
|
-
}
|
|
178
|
-
|
|
179
|
-
let tools: McpxToolDescriptor[];
|
|
180
|
-
try {
|
|
181
|
-
tools = await mcpx.listTools();
|
|
182
|
-
} catch (err) {
|
|
183
|
-
logger.debug(`mcpx: listTools failed (${err instanceof Error ? err.message : String(err)})`);
|
|
184
|
-
return [];
|
|
185
|
-
}
|
|
186
|
-
|
|
187
|
-
const lowercaseHaystack = (t: McpxToolDescriptor) =>
|
|
188
|
-
`${t.server} ${t.tool.name} ${t.tool.description ?? ""}`.toLowerCase();
|
|
189
|
-
|
|
190
|
-
if (hint) {
|
|
191
|
-
const needle = hint.toLowerCase();
|
|
192
|
-
const matched = tools.filter((t) => lowercaseHaystack(t).includes(needle));
|
|
193
|
-
if (matched.length > 0) return matched;
|
|
194
|
-
}
|
|
195
|
-
|
|
196
|
-
if (host) {
|
|
197
|
-
const tokens = host.split(".");
|
|
198
|
-
const matched = tools.filter((t) => tokens.some((tok) => tok.length > 2 && lowercaseHaystack(t).includes(tok)));
|
|
199
|
-
if (matched.length > 0) return matched;
|
|
200
|
-
}
|
|
201
|
-
|
|
202
|
-
// Fall back to any tool that looks like a URL fetcher.
|
|
203
|
-
return tools.filter((t) => /fetch|scrape|http|url/i.test(`${t.tool.name} ${t.tool.description ?? ""}`));
|
|
204
|
-
}
|
|
205
|
-
|
|
206
|
-
/** Compose semantic-search queries to feed mcpx.search. */
|
|
207
|
-
function buildQueries(hint: string | undefined, host: string | null): string[] {
|
|
208
|
-
const out: string[] = [];
|
|
209
|
-
if (hint) out.push(`${hint} fetch markdown`);
|
|
210
|
-
if (host) out.push(`fetch ${host} as markdown`, `scrape ${host}`);
|
|
211
|
-
out.push("fetch URL as markdown", "scrape webpage to markdown");
|
|
212
|
-
return out;
|
|
213
|
-
}
|
|
214
|
-
|
|
215
|
-
/** URL → hostname or null. */
|
|
216
|
-
function safeHost(url: string): string | null {
|
|
217
|
-
try {
|
|
218
|
-
return new URL(url).hostname.toLowerCase();
|
|
219
|
-
} catch {
|
|
220
|
-
return null;
|
|
221
|
-
}
|
|
222
|
-
}
|
|
223
|
-
|
|
224
|
-
/**
|
|
225
|
-
* Among the candidate tools, prefer one whose name or description signals
|
|
226
|
-
* markdown output (contains "markdown", "md", "Docmd", etc.). Falls back
|
|
227
|
-
* to anything that looks like a generic fetch/scrape verb, and finally
|
|
228
|
-
* to the first candidate so we always try something.
|
|
229
|
-
*/
|
|
230
|
-
function pickTool(tools: McpxToolDescriptor[]): McpxToolDescriptor | null {
|
|
231
|
-
const score = (t: McpxToolDescriptor) => {
|
|
232
|
-
const hay = `${t.tool.name} ${t.tool.description ?? ""}`.toLowerCase();
|
|
233
|
-
let s = 0;
|
|
234
|
-
if (/markdown|docmd|asmd|\bmd\b/.test(hay)) s += 5;
|
|
235
|
-
if (/scrape|extract|fetch|get|read/.test(hay)) s += 2;
|
|
236
|
-
if (/url|web|html|page/.test(hay)) s += 1;
|
|
237
|
-
return s;
|
|
238
|
-
};
|
|
239
|
-
const sorted = [...tools].sort((a, b) => score(b) - score(a));
|
|
240
|
-
return sorted[0] ?? null;
|
|
241
|
-
}
|
|
242
|
-
|
|
243
|
-
/**
|
|
244
|
-
* Build the argument object the mcpx fetcher tool likely accepts. We can't
|
|
245
|
-
* know the schema without calling info(), so we build a permissive bag with
|
|
246
|
-
* the common shapes (`{url, format: "markdown", formats: ["markdown"]}`)
|
|
247
|
-
* and trust the underlying tool to ignore unknown fields.
|
|
248
|
-
*/
|
|
249
|
-
function buildArgs(toolName: string, url: string): Record<string, unknown> {
|
|
250
|
-
const args: Record<string, unknown> = { url };
|
|
251
|
-
if (/markdown|md/i.test(toolName)) args.format = "markdown";
|
|
252
|
-
args.formats = ["markdown"];
|
|
253
|
-
return args;
|
|
254
|
-
}
|
|
255
|
-
|
|
256
|
-
/** Pull a string out of the heterogeneous shapes mcpx tools return. */
|
|
257
|
-
function extractText(result: unknown): string {
|
|
258
|
-
if (typeof result === "string") return result;
|
|
259
|
-
if (result && typeof result === "object") {
|
|
260
|
-
const maybe = result as Record<string, unknown>;
|
|
261
|
-
if (typeof maybe.text === "string") return maybe.text;
|
|
262
|
-
if (typeof maybe.content === "string") return maybe.content;
|
|
263
|
-
if (typeof maybe.markdown === "string") return maybe.markdown;
|
|
264
|
-
if (Array.isArray(maybe.content)) {
|
|
265
|
-
const out: string[] = [];
|
|
266
|
-
for (const c of maybe.content) {
|
|
267
|
-
if (c && typeof c === "object") {
|
|
268
|
-
const inner = c as Record<string, unknown>;
|
|
269
|
-
if (typeof inner.text === "string") out.push(inner.text);
|
|
270
|
-
}
|
|
271
|
-
}
|
|
272
|
-
if (out.length > 0) return out.join("\n\n");
|
|
273
|
-
}
|
|
274
|
-
}
|
|
275
|
-
try {
|
|
276
|
-
return JSON.stringify(result);
|
|
277
|
-
} catch {
|
|
278
|
-
return "";
|
|
279
|
-
}
|
|
155
|
+
export function isMcpToolError(result: unknown): boolean {
	// Only a non-null object can be a result envelope; everything else is
	// treated as "not an error".
	if (typeof result !== "object" || result === null) return false;

	// Strict comparison: truthy values other than the boolean `true`
	// (e.g. the string "true" or 1) do not count as a tool error.
	const { isError } = result as { isError?: unknown };
	return isError === true;
}
|
package/src/ingest/ingest.ts
CHANGED
|
@@ -3,6 +3,7 @@ import { upsertBlob } from "../db/blobs.ts";
|
|
|
3
3
|
import { insertChunksForVersion, rebuildFts } from "../db/chunks.ts";
|
|
4
4
|
import { type FetcherKind, getCurrent, insertVersion, millisIso, type SourceType } from "../db/files.ts";
|
|
5
5
|
import { asHelpful, HelpfulError } from "../errors.ts";
|
|
6
|
+
import { logger } from "../output/logger.ts";
|
|
6
7
|
import { chunkDeterministic } from "./chunker.ts";
|
|
7
8
|
import { convert } from "./converter/index.ts";
|
|
8
9
|
import { describe } from "./describer.ts";
|
|
@@ -188,12 +189,32 @@ async function ingestUrl(
|
|
|
188
189
|
): Promise<IngestResult> {
|
|
189
190
|
const mcpxAdapter = ctx.mcpx
|
|
190
191
|
? {
|
|
191
|
-
async
|
|
192
|
-
|
|
193
|
-
|
|
192
|
+
async search(query: string, options?: { keywordOnly?: boolean; semanticOnly?: boolean }) {
|
|
193
|
+
try {
|
|
194
|
+
const results = await ctx.mcpx!.search(query, options);
|
|
195
|
+
return results.map((r) => ({
|
|
196
|
+
server: r.server,
|
|
197
|
+
tool: r.tool,
|
|
198
|
+
description: r.description ?? undefined,
|
|
199
|
+
score: r.score,
|
|
200
|
+
matchType: r.matchType ?? undefined,
|
|
201
|
+
}));
|
|
202
|
+
} catch (err) {
|
|
203
|
+
logger.debug(`mcpx.search(${query}) failed: ${err instanceof Error ? err.message : String(err)}`);
|
|
204
|
+
return [];
|
|
205
|
+
}
|
|
194
206
|
},
|
|
195
|
-
async
|
|
196
|
-
|
|
207
|
+
async listTools(server?: string) {
|
|
208
|
+
const tools = await ctx.mcpx!.listTools(server);
|
|
209
|
+
return tools.map((t) => ({ server: t.server, tool: { name: t.tool.name, description: t.tool.description } }));
|
|
210
|
+
},
|
|
211
|
+
async info(server: string, tool: string) {
|
|
212
|
+
const t = await ctx.mcpx!.info(server, tool);
|
|
213
|
+
if (!t) return undefined;
|
|
214
|
+
return { name: t.name, description: t.description, inputSchema: t.inputSchema };
|
|
215
|
+
},
|
|
216
|
+
async exec(server: string, tool: string, args?: Record<string, unknown>) {
|
|
217
|
+
return ctx.mcpx!.exec(server, tool, args ?? {});
|
|
197
218
|
},
|
|
198
219
|
}
|
|
199
220
|
: null;
|
|
@@ -212,7 +233,12 @@ async function ingestUrl(
|
|
|
212
233
|
};
|
|
213
234
|
|
|
214
235
|
try {
|
|
215
|
-
const fetched = await fetchRemote(url, {
|
|
236
|
+
const fetched = await fetchRemote(url, {
|
|
237
|
+
hint: input.fetcher_hint,
|
|
238
|
+
mcpx: mcpxAdapter,
|
|
239
|
+
llm: ctx.config.llm,
|
|
240
|
+
onProgress: (sublabel) => callbacks?.onEntryProgress?.(url, sublabel),
|
|
241
|
+
});
|
|
216
242
|
result.mime_type = fetched.mimeType;
|
|
217
243
|
result.size_bytes = fetched.bytes.byteLength;
|
|
218
244
|
result.fetcher = fetched.fetcher;
|
|
@@ -583,6 +609,7 @@ function summarize(entries: IngestEntryResult[]): IngestResult {
|
|
|
583
609
|
}
|
|
584
610
|
|
|
585
611
|
/**
 * Render any thrown value as a one-line message for an ingest result.
 * A `HelpfulError` contributes both its message and its hint, separated by
 * an em dash; any other `Error` contributes its message; everything else
 * is stringified.
 */
function errorMessage(err: unknown): string {
	// Checked first so the hint is not dropped by the generic Error branch.
	if (err instanceof HelpfulError) {
		const { message, hint } = err;
		return `${message} — ${hint}`;
	}
	return err instanceof Error ? err.message : String(err);
}
|
package/src/refresh/runner.ts
CHANGED
|
@@ -8,7 +8,7 @@ import { chunkDeterministic } from "../ingest/chunker.ts";
|
|
|
8
8
|
import { convert } from "../ingest/converter/index.ts";
|
|
9
9
|
import { describe } from "../ingest/describer.ts";
|
|
10
10
|
import { embed } from "../ingest/embedder.ts";
|
|
11
|
-
import { fetchRemote } from "../ingest/fetcher.ts";
|
|
11
|
+
import { fetchRemote, isMcpToolError } from "../ingest/fetcher.ts";
|
|
12
12
|
import { mimeFromPath, readLocalFile, sha256Hex } from "../ingest/local-reader.ts";
|
|
13
13
|
import { buildSearchText } from "../ingest/search-text.ts";
|
|
14
14
|
|
|
@@ -192,6 +192,14 @@ async function replayFetch(
|
|
|
192
192
|
if (cur.fetcher === "mcpx" && cur.fetcher_server && cur.fetcher_tool && mcpx) {
|
|
193
193
|
const args = cur.fetcher_args ?? {};
|
|
194
194
|
const result = await mcpx.exec(cur.fetcher_server, cur.fetcher_tool, args);
|
|
195
|
+
if (isMcpToolError(result)) {
|
|
196
|
+
const detail = extractText(result).trim();
|
|
197
|
+
throw new HelpfulError({
|
|
198
|
+
kind: "network_error",
|
|
199
|
+
message: `mcpx tool ${cur.fetcher_server}/${cur.fetcher_tool} returned isError=true${detail ? `: ${detail}` : ""}`,
|
|
200
|
+
hint: `Re-add with a working fetcher: \`membot remove ${cur.logical_path}\` then \`membot add ${cur.source_path} --fetcher http\` (or another --fetcher hint).`,
|
|
201
|
+
});
|
|
202
|
+
}
|
|
195
203
|
const text = extractText(result);
|
|
196
204
|
const bytes = new TextEncoder().encode(text);
|
|
197
205
|
return {
|