membot 0.0.1 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/skills/membot.md +137 -0
- package/.cursor/rules/membot.mdc +137 -0
- package/README.md +131 -0
- package/package.json +83 -24
- package/patches/@huggingface%2Ftransformers@4.2.0.patch +137 -0
- package/scripts/apply-transformers-patch.sh +35 -0
- package/src/cli.ts +72 -0
- package/src/commands/check-update.ts +69 -0
- package/src/commands/mcpx.ts +112 -0
- package/src/commands/reindex.ts +53 -0
- package/src/commands/serve.ts +58 -0
- package/src/commands/skill.ts +131 -0
- package/src/commands/upgrade.ts +220 -0
- package/src/config/loader.ts +100 -0
- package/src/config/schemas.ts +39 -0
- package/src/constants.ts +42 -0
- package/src/context.ts +80 -0
- package/src/db/blobs.ts +53 -0
- package/src/db/chunks.ts +176 -0
- package/src/db/connection.ts +173 -0
- package/src/db/files.ts +325 -0
- package/src/db/migrations/001-init.ts +63 -0
- package/src/db/migrations/002-fts.ts +12 -0
- package/src/db/migrations.ts +45 -0
- package/src/errors.ts +87 -0
- package/src/ingest/chunker.ts +117 -0
- package/src/ingest/converter/docx.ts +15 -0
- package/src/ingest/converter/html.ts +20 -0
- package/src/ingest/converter/image.ts +71 -0
- package/src/ingest/converter/index.ts +119 -0
- package/src/ingest/converter/llm.ts +66 -0
- package/src/ingest/converter/ocr.ts +51 -0
- package/src/ingest/converter/pdf.ts +38 -0
- package/src/ingest/converter/text.ts +8 -0
- package/src/ingest/describer.ts +72 -0
- package/src/ingest/embedder.ts +98 -0
- package/src/ingest/fetcher.ts +280 -0
- package/src/ingest/ingest.ts +444 -0
- package/src/ingest/local-reader.ts +64 -0
- package/src/ingest/search-text.ts +18 -0
- package/src/ingest/source-resolver.ts +186 -0
- package/src/mcp/instructions.ts +34 -0
- package/src/mcp/server.ts +101 -0
- package/src/mount/commander.ts +174 -0
- package/src/mount/mcp.ts +111 -0
- package/src/mount/zod-to-cli.ts +158 -0
- package/src/operations/add.ts +69 -0
- package/src/operations/diff.ts +105 -0
- package/src/operations/index.ts +38 -0
- package/src/operations/info.ts +95 -0
- package/src/operations/list.ts +87 -0
- package/src/operations/move.ts +83 -0
- package/src/operations/prune.ts +80 -0
- package/src/operations/read.ts +102 -0
- package/src/operations/refresh.ts +72 -0
- package/src/operations/remove.ts +35 -0
- package/src/operations/search.ts +72 -0
- package/src/operations/tree.ts +103 -0
- package/src/operations/types.ts +81 -0
- package/src/operations/versions.ts +78 -0
- package/src/operations/write.ts +77 -0
- package/src/output/formatter.ts +68 -0
- package/src/output/logger.ts +114 -0
- package/src/output/progress.ts +78 -0
- package/src/output/tty.ts +91 -0
- package/src/refresh/runner.ts +296 -0
- package/src/refresh/scheduler.ts +54 -0
- package/src/sdk.ts +27 -0
- package/src/search/hybrid.ts +100 -0
- package/src/search/keyword.ts +62 -0
- package/src/search/semantic.ts +56 -0
- package/src/types/text-modules.d.ts +9 -0
- package/src/update/background.ts +73 -0
- package/src/update/cache.ts +40 -0
- package/src/update/checker.ts +117 -0
- package/.claude/settings.local.json +0 -7
- package/CLAUDE.md +0 -139
- package/docs/plan.md +0 -905
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import { extractText, getDocumentProxy } from "unpdf";
|
|
2
|
+
import { logger } from "../../output/logger.ts";
|
|
3
|
+
|
|
4
|
+
/** Result of a PDF text-layer extraction attempt (see convertPdf). */
export interface PdfConversion {
  // "## Page N" sections assembled from the text layer; "" when extraction failed.
  markdown: string;
  // Extracted chars / file bytes; 0 for empty input or extraction failure.
  textRatio: number;
  // Always false from convertPdf — presumably flipped by the OCR dispatcher
  // in converter/index.ts; confirm against that module.
  usedOcrFallback: boolean;
}

// Threshold on textRatio below which shouldOcrPdf recommends OCR.
const LOW_TEXT_RATIO = 0.005; // < ~5 chars per kB → very likely scanned
|
|
11
|
+
|
|
12
|
+
/**
|
|
13
|
+
* Extract the text layer from a PDF using unpdf. Returns the extracted
|
|
14
|
+
* markdown and a `textRatio` (chars / file-bytes) so the dispatcher can
|
|
15
|
+
* decide whether to fall through to OCR. The OCR step itself happens in
|
|
16
|
+
* converter/index.ts so this module stays free of WASM dependencies.
|
|
17
|
+
*/
|
|
18
|
+
export async function convertPdf(bytes: Uint8Array): Promise<PdfConversion> {
|
|
19
|
+
try {
|
|
20
|
+
const pdf = await getDocumentProxy(bytes);
|
|
21
|
+
const { text } = await extractText(pdf, { mergePages: false });
|
|
22
|
+
const pages: string[] = Array.isArray(text) ? text : [String(text)];
|
|
23
|
+
const md = pages
|
|
24
|
+
.map((p, i) => `## Page ${i + 1}\n\n${p.trim()}`)
|
|
25
|
+
.filter((p) => p.length > 0)
|
|
26
|
+
.join("\n\n");
|
|
27
|
+
const ratio = bytes.byteLength === 0 ? 0 : md.length / bytes.byteLength;
|
|
28
|
+
return { markdown: md, textRatio: ratio, usedOcrFallback: false };
|
|
29
|
+
} catch (err) {
|
|
30
|
+
logger.warn(`pdf: text extraction failed (${err instanceof Error ? err.message : String(err)})`);
|
|
31
|
+
return { markdown: "", textRatio: 0, usedOcrFallback: false };
|
|
32
|
+
}
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
/** Decide whether unpdf's output is "low text ratio" enough to warrant OCR fallback. */
|
|
36
|
+
export function shouldOcrPdf(conversion: PdfConversion): boolean {
|
|
37
|
+
return conversion.markdown.trim().length === 0 || conversion.textRatio < LOW_TEXT_RATIO;
|
|
38
|
+
}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Plain-text / markdown passthrough converter. Decodes bytes as UTF-8 and
|
|
3
|
+
* returns them unchanged — the chunker downstream handles paragraph
|
|
4
|
+
* boundaries the same way as it would for an LLM-converted file.
|
|
5
|
+
*/
|
|
6
|
+
export function convertText(bytes: Uint8Array): string {
|
|
7
|
+
return new TextDecoder("utf-8").decode(bytes);
|
|
8
|
+
}
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
import Anthropic from "@anthropic-ai/sdk";
|
|
2
|
+
import type { LlmConfig } from "../config/schemas.ts";
|
|
3
|
+
import { logger } from "../output/logger.ts";
|
|
4
|
+
|
|
5
|
+
const DESCRIBER_PROMPT = `You write a one-paragraph description of a file for use in a search index.
|
|
6
|
+
|
|
7
|
+
Rules:
|
|
8
|
+
- One paragraph, 1-3 sentences.
|
|
9
|
+
- Plain prose, no headings, no markdown formatting.
|
|
10
|
+
- Cover what the file IS and what it's ABOUT — both subject and shape.
|
|
11
|
+
- For images, focus on the visual subject. For documents, focus on the topic and intended reader.
|
|
12
|
+
- Output the description ONLY — no preamble, no quoting, no labels.`;
|
|
13
|
+
|
|
14
|
+
/**
|
|
15
|
+
* Generate a one-paragraph description for the file's surrogate, used
|
|
16
|
+
* as the `<description>` line in chunks.search_text. Falls back to a
|
|
17
|
+
* deterministic heuristic when no API key is configured so the pipeline
|
|
18
|
+
* still produces a non-empty description offline.
|
|
19
|
+
*/
|
|
20
|
+
export async function describe(
|
|
21
|
+
logicalPath: string,
|
|
22
|
+
mimeType: string,
|
|
23
|
+
surrogate: string,
|
|
24
|
+
llm: LlmConfig,
|
|
25
|
+
): Promise<string> {
|
|
26
|
+
if (!llm.anthropic_api_key || llm.anthropic_api_key.trim() === "") {
|
|
27
|
+
return deterministicDescription(logicalPath, mimeType, surrogate);
|
|
28
|
+
}
|
|
29
|
+
const client = new Anthropic({ apiKey: llm.anthropic_api_key });
|
|
30
|
+
const sample = surrogate.slice(0, 4_000);
|
|
31
|
+
try {
|
|
32
|
+
const resp = await client.messages.create({
|
|
33
|
+
model: llm.describer_model,
|
|
34
|
+
max_tokens: 300,
|
|
35
|
+
system: DESCRIBER_PROMPT,
|
|
36
|
+
messages: [
|
|
37
|
+
{
|
|
38
|
+
role: "user",
|
|
39
|
+
content: `Logical path: ${logicalPath}\nMIME type: ${mimeType}\n\nFile body:\n${sample}`,
|
|
40
|
+
},
|
|
41
|
+
],
|
|
42
|
+
});
|
|
43
|
+
const text = resp.content
|
|
44
|
+
.flatMap((b) => (b.type === "text" ? [b.text] : []))
|
|
45
|
+
.join("")
|
|
46
|
+
.trim();
|
|
47
|
+
if (!text) return deterministicDescription(logicalPath, mimeType, surrogate);
|
|
48
|
+
return text;
|
|
49
|
+
} catch (err) {
|
|
50
|
+
logger.warn(`describer: failed (${err instanceof Error ? err.message : String(err)}) — falling back`);
|
|
51
|
+
return deterministicDescription(logicalPath, mimeType, surrogate);
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
/**
|
|
56
|
+
* Cheap, deterministic description used when the LLM isn't available.
|
|
57
|
+
* For markdown/text it's the first heading + a 200-char prefix; for
|
|
58
|
+
* binaries it's `<mime> · <size> bytes`.
|
|
59
|
+
*/
|
|
60
|
+
export function deterministicDescription(logicalPath: string, mimeType: string, surrogate: string): string {
|
|
61
|
+
if (mimeType.startsWith("text/") || mimeType === "application/json" || mimeType === "application/yaml") {
|
|
62
|
+
const trimmed = surrogate.trim();
|
|
63
|
+
const headingMatch = trimmed.match(/^#+\s+(.+)$/m);
|
|
64
|
+
const heading = headingMatch?.[1]?.trim();
|
|
65
|
+
const prefix = trimmed.slice(0, 200).replace(/\s+/g, " ").trim();
|
|
66
|
+
if (heading && prefix) return `${heading} — ${prefix}`;
|
|
67
|
+
if (heading) return heading;
|
|
68
|
+
if (prefix) return prefix;
|
|
69
|
+
return `${logicalPath} (${mimeType})`;
|
|
70
|
+
}
|
|
71
|
+
return `${mimeType} · ${surrogate.length} chars`;
|
|
72
|
+
}
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
import { existsSync } from "node:fs";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
import { env, type FeatureExtractionPipeline, pipeline } from "@huggingface/transformers";
|
|
4
|
+
import { EMBEDDING_DIMENSION, EMBEDDING_MODEL } from "../constants.ts";
|
|
5
|
+
import { HelpfulError } from "../errors.ts";
|
|
6
|
+
import { logger } from "../output/logger.ts";
|
|
7
|
+
|
|
8
|
+
// We patch @huggingface/transformers to use onnxruntime-web (WASM). Pin the
// loader to the on-disk copy so we stay offline-capable.
const ortWasm = env.backends.onnx?.wasm;
if (ortWasm) {
  // import.meta.resolve yields URLs into the locally installed
  // onnxruntime-web package, so the runtime is never fetched from a CDN.
  ortWasm.wasmPaths = {
    mjs: import.meta.resolve("onnxruntime-web/ort-wasm-simd-threaded.asyncify.mjs"),
    wasm: import.meta.resolve("onnxruntime-web/ort-wasm-simd-threaded.asyncify.wasm"),
  };
}

// One in-flight (or settled) pipeline load per model name, held for the life
// of the process — populated lazily by getPipeline below.
const pipelinePromises = new Map<string, Promise<FeatureExtractionPipeline>>();
|
|
19
|
+
|
|
20
|
+
/** Configure where transformers caches downloaded model weights. */
|
|
21
|
+
export function setEmbeddingCacheDir(dir: string): void {
|
|
22
|
+
env.cacheDir = dir.endsWith("/") ? dir : `${dir}/`;
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
function isModelCached(model: string): boolean {
|
|
26
|
+
if (!env.cacheDir) return false;
|
|
27
|
+
return existsSync(join(env.cacheDir, model));
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
/**
|
|
31
|
+
* Lazily load (and cache) the feature-extraction pipeline for a model. Loading
|
|
32
|
+
* is expensive (downloads weights on first run, ~100s of ms to instantiate
|
|
33
|
+
* ONNX), so we hold one promise per model name for the life of the process.
|
|
34
|
+
*
|
|
35
|
+
* Try `wasm` first, fall back to `cpu` on "Unsupported device". The transformers
|
|
36
|
+
* patch (applied for `bun build --compile` and via `bun run prebuild` for local
|
|
37
|
+
* dev) registers `wasm` as a supported device backed by onnxruntime-web — that's
|
|
38
|
+
* mandatory for the single-binary build because native bindings can't be
|
|
39
|
+
* bundled. When the package is unpatched (npm-installed membot, or `bun dev`
|
|
40
|
+
* before `prebuild`), `wasm` is rejected and we fall back to the default `cpu`
|
|
41
|
+
* device, which uses the onnxruntime-node native bindings that ship with the
|
|
42
|
+
* unpatched package.
|
|
43
|
+
*/
|
|
44
|
+
async function getPipeline(model: string): Promise<FeatureExtractionPipeline> {
|
|
45
|
+
let p = pipelinePromises.get(model);
|
|
46
|
+
if (!p) {
|
|
47
|
+
if (isModelCached(model)) {
|
|
48
|
+
logger.debug(`embedder: loading cached model ${model}`);
|
|
49
|
+
} else {
|
|
50
|
+
logger.info(`embedder: loading model ${model} (first run, downloading weights)`);
|
|
51
|
+
}
|
|
52
|
+
p = (async () => {
|
|
53
|
+
try {
|
|
54
|
+
return (await pipeline("feature-extraction", model, { device: "wasm" })) as FeatureExtractionPipeline;
|
|
55
|
+
} catch (err) {
|
|
56
|
+
if (!String((err as Error)?.message ?? "").includes("Unsupported device")) throw err;
|
|
57
|
+
logger.debug("embedder: wasm backend unavailable, falling back to cpu (onnxruntime-node)");
|
|
58
|
+
return (await pipeline("feature-extraction", model, { device: "cpu" })) as FeatureExtractionPipeline;
|
|
59
|
+
}
|
|
60
|
+
})();
|
|
61
|
+
pipelinePromises.set(model, p);
|
|
62
|
+
}
|
|
63
|
+
return p;
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
/**
|
|
67
|
+
* Embed an array of texts to L2-normalized vectors with the configured
|
|
68
|
+
* model. Throws a HelpfulError when the model's dimension doesn't match
|
|
69
|
+
* EMBEDDING_DIMENSION (the value baked into the DB schema).
|
|
70
|
+
*/
|
|
71
|
+
export async function embed(texts: string[], model: string = EMBEDDING_MODEL): Promise<number[][]> {
|
|
72
|
+
if (texts.length === 0) return [];
|
|
73
|
+
const extractor = await getPipeline(model);
|
|
74
|
+
const output = await extractor(texts, { pooling: "mean", normalize: true });
|
|
75
|
+
const data = output.tolist() as number[][];
|
|
76
|
+
if (data[0] && data[0].length !== EMBEDDING_DIMENSION) {
|
|
77
|
+
throw new HelpfulError({
|
|
78
|
+
kind: "internal_error",
|
|
79
|
+
message: `embedding model ${model} returned ${data[0].length}-dim vectors, expected ${EMBEDDING_DIMENSION}`,
|
|
80
|
+
hint: `Set config.embedding_model to a ${EMBEDDING_DIMENSION}-dim model (default: ${EMBEDDING_MODEL}).`,
|
|
81
|
+
});
|
|
82
|
+
}
|
|
83
|
+
return data;
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
/** Embed a single text — convenience wrapper for query-time embedding. */
|
|
87
|
+
export async function embedSingle(text: string, model: string = EMBEDDING_MODEL): Promise<number[]> {
|
|
88
|
+
const all = await embed([text], model);
|
|
89
|
+
const vec = all[0];
|
|
90
|
+
if (!vec) {
|
|
91
|
+
throw new HelpfulError({
|
|
92
|
+
kind: "internal_error",
|
|
93
|
+
message: "embed() returned no vectors",
|
|
94
|
+
hint: "This is likely a transformers WASM patch issue. Run `bun run prebuild` and retry.",
|
|
95
|
+
});
|
|
96
|
+
}
|
|
97
|
+
return vec;
|
|
98
|
+
}
|
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
import { DEFAULTS } from "../constants.ts";
|
|
2
|
+
import { asHelpful, HelpfulError } from "../errors.ts";
|
|
3
|
+
import { logger } from "../output/logger.ts";
|
|
4
|
+
import { sha256Hex } from "./local-reader.ts";
|
|
5
|
+
|
|
6
|
+
/** A fetched remote document plus the provenance needed to replay the fetch on refresh. */
export interface FetchedRemote {
  // Raw payload: HTTP response body, or UTF-8 markdown for mcpx fetches.
  bytes: Uint8Array;
  // Hex SHA-256 of `bytes`.
  sha256: string;
  // From the Content-Type header for HTTP; always "text/markdown" for mcpx.
  mimeType: string;
  // Which path produced the bytes.
  fetcher: "http" | "mcpx";
  // mcpx server name, or null for plain HTTP.
  fetcherServer: string | null;
  // mcpx tool name, or null for plain HTTP.
  fetcherTool: string | null;
  // Arguments passed to the mcpx tool, or null for plain HTTP.
  fetcherArgs: Record<string, unknown> | null;
  // The URL the caller asked to fetch.
  sourceUrl: string;
}

/** A tool from the live mcpx catalog, qualified by the server that exposes it. */
export interface McpxToolDescriptor {
  server: string;
  tool: { name: string; description?: string };
}

/** A hit from mcpx semantic tool search — a catalog entry plus an optional relevance score. */
export interface McpxSearchHit {
  server: string;
  tool: { name: string; description?: string };
  score?: number;
}

export interface FetchOptions {
  /**
   * User-provided hint. Free-form keyword (e.g. "firecrawl", "github",
   * "google-docs", "http"). Special-cased: "http" forces plain fetch.
   * Otherwise the hint is used as a search query against the live
   * mcpx tool catalog — we never hardcode server names.
   */
  hint?: string;
  /** Live mcpx adapter. Use listTools/search/exec to find a fetcher on the fly. */
  mcpx?: {
    exec(server: string, tool: string, args: Record<string, unknown>): Promise<unknown>;
    listTools(): Promise<McpxToolDescriptor[]>;
    search?(query: string): Promise<McpxSearchHit[]>;
  } | null;
}
|
|
43
|
+
|
|
44
|
+
/**
|
|
45
|
+
* Fetch a remote URL, preferring an mcpx-managed server (Firecrawl, Google
|
|
46
|
+
* Docs, GitHub, …) for known providers and falling back to a plain `fetch`
|
|
47
|
+
* otherwise. The chosen invocation (server/tool/args) is returned alongside
|
|
48
|
+
* the bytes so the caller can persist it on the row for replay-on-refresh.
|
|
49
|
+
*/
|
|
50
|
+
export async function fetchRemote(url: string, options: FetchOptions = {}): Promise<FetchedRemote> {
|
|
51
|
+
const mcpx = options.mcpx;
|
|
52
|
+
const hint = options.hint?.trim();
|
|
53
|
+
|
|
54
|
+
if (hint === "http") return httpFetch(url);
|
|
55
|
+
if (!mcpx) return httpFetch(url);
|
|
56
|
+
|
|
57
|
+
const tried = await tryMcpx(url, mcpx, hint);
|
|
58
|
+
if (tried) return tried;
|
|
59
|
+
return httpFetch(url);
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
/** Plain `fetch` fallback. Used when mcpx isn't configured or the hint says so. */
|
|
63
|
+
async function httpFetch(url: string): Promise<FetchedRemote> {
|
|
64
|
+
let resp: Response;
|
|
65
|
+
try {
|
|
66
|
+
resp = await fetch(url, {
|
|
67
|
+
headers: { "User-Agent": "membot/0.1" },
|
|
68
|
+
signal: AbortSignal.timeout(DEFAULTS.HTTP_TIMEOUT_MS),
|
|
69
|
+
});
|
|
70
|
+
} catch (err) {
|
|
71
|
+
throw asHelpful(
|
|
72
|
+
err,
|
|
73
|
+
`while fetching ${url}`,
|
|
74
|
+
`Check your network and that ${url} is reachable. For mcpx-managed sources (gdocs/github/firecrawl), set --fetcher firecrawl etc.`,
|
|
75
|
+
"network_error",
|
|
76
|
+
);
|
|
77
|
+
}
|
|
78
|
+
if (!resp.ok) {
|
|
79
|
+
throw new HelpfulError({
|
|
80
|
+
kind: "network_error",
|
|
81
|
+
message: `HTTP ${resp.status} ${resp.statusText}: ${url}`,
|
|
82
|
+
hint: "Verify the URL is reachable and not gated behind auth. For private docs use mcpx via --fetcher.",
|
|
83
|
+
});
|
|
84
|
+
}
|
|
85
|
+
const bytes = new Uint8Array(await resp.arrayBuffer());
|
|
86
|
+
const ct = resp.headers.get("content-type") ?? "";
|
|
87
|
+
const mime = ct.split(";")[0]?.trim() || "application/octet-stream";
|
|
88
|
+
return {
|
|
89
|
+
bytes,
|
|
90
|
+
sha256: sha256Hex(bytes),
|
|
91
|
+
mimeType: mime,
|
|
92
|
+
fetcher: "http",
|
|
93
|
+
fetcherServer: null,
|
|
94
|
+
fetcherTool: null,
|
|
95
|
+
fetcherArgs: null,
|
|
96
|
+
sourceUrl: url,
|
|
97
|
+
};
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
/**
|
|
101
|
+
* Attempt to fetch via mcpx by discovering a suitable tool at runtime.
|
|
102
|
+
*
|
|
103
|
+
* Strategy:
|
|
104
|
+
* 1. If the user passed a hint, search for it via mcpx.search() (semantic
|
|
105
|
+
* tool search over the live catalog). The hint is the user's free-text
|
|
106
|
+
* label for which provider they want — we never assume server names.
|
|
107
|
+
* 2. Otherwise, fall back to a host-based search query (e.g. URL host
|
|
108
|
+
* "github.com" → search for "github fetch markdown").
|
|
109
|
+
* 3. From the returned candidates, prefer tools whose name or description
|
|
110
|
+
* signals markdown output. Failing that, the first tool that takes a
|
|
111
|
+
* URL-shaped argument.
|
|
112
|
+
* 4. Execute the tool with `{ url, format: "markdown" }`-shaped args.
|
|
113
|
+
* If exec fails, return null so the caller falls back to plain HTTP.
|
|
114
|
+
*/
|
|
115
|
+
async function tryMcpx(
|
|
116
|
+
url: string,
|
|
117
|
+
mcpx: NonNullable<FetchOptions["mcpx"]>,
|
|
118
|
+
hint: string | undefined,
|
|
119
|
+
): Promise<FetchedRemote | null> {
|
|
120
|
+
const candidates = await discoverCandidates(url, mcpx, hint);
|
|
121
|
+
if (candidates.length === 0) return null;
|
|
122
|
+
|
|
123
|
+
const chosen = pickTool(candidates);
|
|
124
|
+
if (!chosen) return null;
|
|
125
|
+
|
|
126
|
+
const args = buildArgs(chosen.tool.name, url);
|
|
127
|
+
let result: unknown;
|
|
128
|
+
try {
|
|
129
|
+
result = await mcpx.exec(chosen.server, chosen.tool.name, args);
|
|
130
|
+
} catch (err) {
|
|
131
|
+
logger.warn(
|
|
132
|
+
`mcpx: ${chosen.server}/${chosen.tool.name} failed (${err instanceof Error ? err.message : String(err)})`,
|
|
133
|
+
);
|
|
134
|
+
return null;
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
const text = extractText(result);
|
|
138
|
+
if (!text || text.trim().length === 0) return null;
|
|
139
|
+
const bytes = new TextEncoder().encode(text);
|
|
140
|
+
return {
|
|
141
|
+
bytes,
|
|
142
|
+
sha256: sha256Hex(bytes),
|
|
143
|
+
mimeType: "text/markdown",
|
|
144
|
+
fetcher: "mcpx",
|
|
145
|
+
fetcherServer: chosen.server,
|
|
146
|
+
fetcherTool: chosen.tool.name,
|
|
147
|
+
fetcherArgs: args,
|
|
148
|
+
sourceUrl: url,
|
|
149
|
+
};
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
/**
|
|
153
|
+
* Build a list of candidate fetcher tools by querying mcpx's live catalog.
|
|
154
|
+
* Tries semantic search first (using the hint or the URL's host as the
|
|
155
|
+
* query) then falls back to listing all tools and filtering by name. Never
|
|
156
|
+
* hardcodes a server name — the catalog is the source of truth.
|
|
157
|
+
*/
|
|
158
|
+
async function discoverCandidates(
|
|
159
|
+
url: string,
|
|
160
|
+
mcpx: NonNullable<FetchOptions["mcpx"]>,
|
|
161
|
+
hint: string | undefined,
|
|
162
|
+
): Promise<McpxToolDescriptor[]> {
|
|
163
|
+
const host = safeHost(url);
|
|
164
|
+
const queries = buildQueries(hint, host);
|
|
165
|
+
|
|
166
|
+
if (mcpx.search) {
|
|
167
|
+
for (const q of queries) {
|
|
168
|
+
try {
|
|
169
|
+
const hits = await mcpx.search(q);
|
|
170
|
+
if (hits.length > 0) {
|
|
171
|
+
return hits.slice(0, 5).map((h) => ({ server: h.server, tool: h.tool }));
|
|
172
|
+
}
|
|
173
|
+
} catch (err) {
|
|
174
|
+
logger.debug(`mcpx: search(${q}) failed (${err instanceof Error ? err.message : String(err)})`);
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
let tools: McpxToolDescriptor[];
|
|
180
|
+
try {
|
|
181
|
+
tools = await mcpx.listTools();
|
|
182
|
+
} catch (err) {
|
|
183
|
+
logger.debug(`mcpx: listTools failed (${err instanceof Error ? err.message : String(err)})`);
|
|
184
|
+
return [];
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
const lowercaseHaystack = (t: McpxToolDescriptor) =>
|
|
188
|
+
`${t.server} ${t.tool.name} ${t.tool.description ?? ""}`.toLowerCase();
|
|
189
|
+
|
|
190
|
+
if (hint) {
|
|
191
|
+
const needle = hint.toLowerCase();
|
|
192
|
+
const matched = tools.filter((t) => lowercaseHaystack(t).includes(needle));
|
|
193
|
+
if (matched.length > 0) return matched;
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
if (host) {
|
|
197
|
+
const tokens = host.split(".");
|
|
198
|
+
const matched = tools.filter((t) => tokens.some((tok) => tok.length > 2 && lowercaseHaystack(t).includes(tok)));
|
|
199
|
+
if (matched.length > 0) return matched;
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
// Fall back to any tool that looks like a URL fetcher.
|
|
203
|
+
return tools.filter((t) => /fetch|scrape|http|url/i.test(`${t.tool.name} ${t.tool.description ?? ""}`));
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
/** Compose semantic-search queries to feed mcpx.search. */
|
|
207
|
+
function buildQueries(hint: string | undefined, host: string | null): string[] {
|
|
208
|
+
const out: string[] = [];
|
|
209
|
+
if (hint) out.push(`${hint} fetch markdown`);
|
|
210
|
+
if (host) out.push(`fetch ${host} as markdown`, `scrape ${host}`);
|
|
211
|
+
out.push("fetch URL as markdown", "scrape webpage to markdown");
|
|
212
|
+
return out;
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
/** URL → hostname or null. */
|
|
216
|
+
function safeHost(url: string): string | null {
|
|
217
|
+
try {
|
|
218
|
+
return new URL(url).hostname.toLowerCase();
|
|
219
|
+
} catch {
|
|
220
|
+
return null;
|
|
221
|
+
}
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
/**
|
|
225
|
+
* Among the candidate tools, prefer one whose name or description signals
|
|
226
|
+
* markdown output (contains "markdown", "md", "Docmd", etc.). Falls back
|
|
227
|
+
* to anything that looks like a generic fetch/scrape verb, and finally
|
|
228
|
+
* to the first candidate so we always try something.
|
|
229
|
+
*/
|
|
230
|
+
function pickTool(tools: McpxToolDescriptor[]): McpxToolDescriptor | null {
|
|
231
|
+
const score = (t: McpxToolDescriptor) => {
|
|
232
|
+
const hay = `${t.tool.name} ${t.tool.description ?? ""}`.toLowerCase();
|
|
233
|
+
let s = 0;
|
|
234
|
+
if (/markdown|docmd|asmd|\bmd\b/.test(hay)) s += 5;
|
|
235
|
+
if (/scrape|extract|fetch|get|read/.test(hay)) s += 2;
|
|
236
|
+
if (/url|web|html|page/.test(hay)) s += 1;
|
|
237
|
+
return s;
|
|
238
|
+
};
|
|
239
|
+
const sorted = [...tools].sort((a, b) => score(b) - score(a));
|
|
240
|
+
return sorted[0] ?? null;
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
/**
|
|
244
|
+
* Build the argument object the mcpx fetcher tool likely accepts. We can't
|
|
245
|
+
* know the schema without calling info(), so we build a permissive bag with
|
|
246
|
+
* the common shapes (`{url, format: "markdown", formats: ["markdown"]}`)
|
|
247
|
+
* and trust the underlying tool to ignore unknown fields.
|
|
248
|
+
*/
|
|
249
|
+
function buildArgs(toolName: string, url: string): Record<string, unknown> {
|
|
250
|
+
const args: Record<string, unknown> = { url };
|
|
251
|
+
if (/markdown|md/i.test(toolName)) args.format = "markdown";
|
|
252
|
+
args.formats = ["markdown"];
|
|
253
|
+
return args;
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
/** Pull a string out of the heterogeneous shapes mcpx tools return. */
|
|
257
|
+
function extractText(result: unknown): string {
|
|
258
|
+
if (typeof result === "string") return result;
|
|
259
|
+
if (result && typeof result === "object") {
|
|
260
|
+
const maybe = result as Record<string, unknown>;
|
|
261
|
+
if (typeof maybe.text === "string") return maybe.text;
|
|
262
|
+
if (typeof maybe.content === "string") return maybe.content;
|
|
263
|
+
if (typeof maybe.markdown === "string") return maybe.markdown;
|
|
264
|
+
if (Array.isArray(maybe.content)) {
|
|
265
|
+
const out: string[] = [];
|
|
266
|
+
for (const c of maybe.content) {
|
|
267
|
+
if (c && typeof c === "object") {
|
|
268
|
+
const inner = c as Record<string, unknown>;
|
|
269
|
+
if (typeof inner.text === "string") out.push(inner.text);
|
|
270
|
+
}
|
|
271
|
+
}
|
|
272
|
+
if (out.length > 0) return out.join("\n\n");
|
|
273
|
+
}
|
|
274
|
+
}
|
|
275
|
+
try {
|
|
276
|
+
return JSON.stringify(result);
|
|
277
|
+
} catch {
|
|
278
|
+
return "";
|
|
279
|
+
}
|
|
280
|
+
}
|