membot 0.0.1 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/skills/membot.md +137 -0
- package/.cursor/rules/membot.mdc +137 -0
- package/README.md +131 -0
- package/package.json +83 -24
- package/patches/@huggingface%2Ftransformers@4.2.0.patch +137 -0
- package/scripts/apply-transformers-patch.sh +35 -0
- package/src/cli.ts +72 -0
- package/src/commands/check-update.ts +69 -0
- package/src/commands/mcpx.ts +112 -0
- package/src/commands/reindex.ts +53 -0
- package/src/commands/serve.ts +58 -0
- package/src/commands/skill.ts +131 -0
- package/src/commands/upgrade.ts +220 -0
- package/src/config/loader.ts +100 -0
- package/src/config/schemas.ts +39 -0
- package/src/constants.ts +42 -0
- package/src/context.ts +80 -0
- package/src/db/blobs.ts +53 -0
- package/src/db/chunks.ts +176 -0
- package/src/db/connection.ts +173 -0
- package/src/db/files.ts +325 -0
- package/src/db/migrations/001-init.ts +63 -0
- package/src/db/migrations/002-fts.ts +12 -0
- package/src/db/migrations.ts +45 -0
- package/src/errors.ts +87 -0
- package/src/ingest/chunker.ts +117 -0
- package/src/ingest/converter/docx.ts +15 -0
- package/src/ingest/converter/html.ts +20 -0
- package/src/ingest/converter/image.ts +71 -0
- package/src/ingest/converter/index.ts +119 -0
- package/src/ingest/converter/llm.ts +66 -0
- package/src/ingest/converter/ocr.ts +51 -0
- package/src/ingest/converter/pdf.ts +38 -0
- package/src/ingest/converter/text.ts +8 -0
- package/src/ingest/describer.ts +72 -0
- package/src/ingest/embedder.ts +98 -0
- package/src/ingest/fetcher.ts +280 -0
- package/src/ingest/ingest.ts +444 -0
- package/src/ingest/local-reader.ts +64 -0
- package/src/ingest/search-text.ts +18 -0
- package/src/ingest/source-resolver.ts +186 -0
- package/src/mcp/instructions.ts +34 -0
- package/src/mcp/server.ts +101 -0
- package/src/mount/commander.ts +174 -0
- package/src/mount/mcp.ts +111 -0
- package/src/mount/zod-to-cli.ts +158 -0
- package/src/operations/add.ts +69 -0
- package/src/operations/diff.ts +105 -0
- package/src/operations/index.ts +38 -0
- package/src/operations/info.ts +95 -0
- package/src/operations/list.ts +87 -0
- package/src/operations/move.ts +83 -0
- package/src/operations/prune.ts +80 -0
- package/src/operations/read.ts +102 -0
- package/src/operations/refresh.ts +72 -0
- package/src/operations/remove.ts +35 -0
- package/src/operations/search.ts +72 -0
- package/src/operations/tree.ts +103 -0
- package/src/operations/types.ts +81 -0
- package/src/operations/versions.ts +78 -0
- package/src/operations/write.ts +77 -0
- package/src/output/formatter.ts +68 -0
- package/src/output/logger.ts +114 -0
- package/src/output/progress.ts +78 -0
- package/src/output/tty.ts +91 -0
- package/src/refresh/runner.ts +296 -0
- package/src/refresh/scheduler.ts +54 -0
- package/src/sdk.ts +27 -0
- package/src/search/hybrid.ts +100 -0
- package/src/search/keyword.ts +62 -0
- package/src/search/semantic.ts +56 -0
- package/src/types/text-modules.d.ts +9 -0
- package/src/update/background.ts +73 -0
- package/src/update/cache.ts +40 -0
- package/src/update/checker.ts +117 -0
- package/.claude/settings.local.json +0 -7
- package/CLAUDE.md +0 -139
- package/docs/plan.md +0 -905
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import { rebuildFts } from "../db/chunks.ts";
|
|
2
|
+
import type { DbConnection } from "../db/connection.ts";
|
|
3
|
+
|
|
4
|
+
/** A single keyword-search result as handed back by `searchKeyword`. */
export interface KeywordHit {
  /** Virtual path of the file the chunk belongs to. */
  logical_path: string;
  /** Version identifier of that file, normalised to a string. */
  version_id: string;
  /** Position of the chunk within its file version. */
  chunk_index: number;
  /** Raw chunk body. */
  chunk_content: string;
  /** Text that was indexed for this chunk. */
  search_text: string;
  /** BM25 score reported by the FTS index. */
  score: number;
}
|
|
12
|
+
|
|
13
|
+
/** Row shape selected by the keyword-search SQL, before values are coerced. */
interface RawKeywordRow {
  /** Key column passed to `match_bm25`. */
  row_key: string;
  logical_path: string;
  version_id: string;
  chunk_index: number;
  chunk_content: string;
  search_text: string;
  /** Value of the `bm25_score` alias in the SELECT list. */
  bm25_score: number;
  /** Any further columns the driver returns are tolerated. */
  [key: string]: unknown;
}
|
|
23
|
+
|
|
24
|
+
/**
 * BM25 keyword search over `chunks.search_text` via the FTS extension.
 * Returns an empty list when FTS isn't available on this platform — the
 * hybrid layer treats missing keyword hits as "no signal" and degrades
 * to semantic-only.
 *
 * @param db - Connection used to rebuild the FTS index and run the query.
 * @param query - Free-text query handed to `match_bm25`.
 * @param options - `limit` caps the number of rows (default 50);
 *   `pathPrefix` restricts hits to logical paths starting with that literal
 *   prefix.
 * @returns Hits ordered by descending BM25 score; `[]` when the index could
 *   not be rebuilt or the query fails.
 */
export async function searchKeyword(
  db: DbConnection,
  query: string,
  options: { limit?: number; pathPrefix?: string } = {},
): Promise<KeywordHit[]> {
  const result = await rebuildFts(db);
  if (result.kind !== "rebuilt") return [];

  const limit = options.limit ?? 50;
  // `%` and `_` are LIKE wildcards. Escape them (and the escape character
  // itself) so a prefix such as `docs/my_file` only matches that literal
  // prefix instead of, say, `docs/myXfile`.
  const likePattern = options.pathPrefix ? `${options.pathPrefix.replace(/[\\%_]/g, "\\$&")}%` : undefined;

  try {
    const sql = `SELECT row_key, logical_path, version_id, chunk_index,
       chunk_content, search_text,
       fts_main__current_chunks_fts.match_bm25(row_key, ?1) AS bm25_score
     FROM _current_chunks_fts
     WHERE fts_main__current_chunks_fts.match_bm25(row_key, ?1) IS NOT NULL
     ${likePattern !== undefined ? "AND logical_path LIKE ?2 ESCAPE '\\'" : ""}
     ORDER BY bm25_score DESC
     LIMIT ${Number(limit)}`;
    const rows: RawKeywordRow[] =
      likePattern !== undefined
        ? await db.queryAll<RawKeywordRow>(sql, query, likePattern)
        : await db.queryAll<RawKeywordRow>(sql, query);
    return rows.map((r) => ({
      logical_path: r.logical_path,
      version_id: String(r.version_id),
      chunk_index: Number(r.chunk_index),
      chunk_content: r.chunk_content,
      search_text: r.search_text,
      score: Number(r.bm25_score),
    }));
  } catch {
    // Deliberate best-effort: a failing FTS query counts as "no keyword signal".
    return [];
  }
}
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import { EMBEDDING_DIMENSION } from "../constants.ts";
|
|
2
|
+
import type { DbConnection } from "../db/connection.ts";
|
|
3
|
+
|
|
4
|
+
/** A single vector-search result as handed back by `searchSemantic`. */
export interface SemanticHit {
  /** Virtual path of the file the chunk belongs to. */
  logical_path: string;
  /** Version identifier of that file, as a string. */
  version_id: string;
  /** Position of the chunk within its file version. */
  chunk_index: number;
  /** Raw chunk body. */
  chunk_content: string;
  /** Indexed text stored alongside the chunk. */
  search_text: string;
  /** `1 - cosine distance` between the query vector and the chunk embedding. */
  score: number;
}
|
|
12
|
+
|
|
13
|
+
/** Row shape selected by the semantic-search SQL, before values are coerced. */
interface RawSemanticRow {
  logical_path: string;
  /** Selected as `CAST(version_id AS VARCHAR)`. */
  version_id: string;
  chunk_index: number;
  chunk_content: string;
  search_text: string;
  /** Result of `array_cosine_distance` against the query vector. */
  distance: number;
  /** Any further columns the driver returns are tolerated. */
  [key: string]: unknown;
}
|
|
22
|
+
|
|
23
|
+
/**
 * Cosine-similarity search over the chunks' embedding vectors. Searches
 * `current_chunks` by default; pass `includeHistory=true` to query the
 * `chunks` table instead, which covers every version.
 *
 * @param db - Connection the query runs on.
 * @param queryVec - Query embedding; cast to `FLOAT[EMBEDDING_DIMENSION]` in SQL.
 * @param options - `limit` caps the number of rows (default 50);
 *   `pathPrefix` restricts hits to logical paths starting with that literal
 *   prefix; `includeHistory` switches the source relation.
 * @returns Hits ordered by ascending distance, with `score = 1 - distance`.
 */
export async function searchSemantic(
  db: DbConnection,
  queryVec: number[],
  options: { limit?: number; pathPrefix?: string; includeHistory?: boolean } = {},
): Promise<SemanticHit[]> {
  const limit = options.limit ?? 50;
  const view = options.includeHistory ? "chunks" : "current_chunks";
  // `%` and `_` are LIKE wildcards. Escape them (and the escape character
  // itself) so the prefix is matched literally.
  const likePattern = options.pathPrefix ? `${options.pathPrefix.replace(/[\\%_]/g, "\\$&")}%` : undefined;
  const prefixClause = likePattern !== undefined ? "WHERE logical_path LIKE ?2 ESCAPE '\\'" : "";
  const sql = `SELECT logical_path,
       CAST(version_id AS VARCHAR) AS version_id,
       chunk_index, chunk_content, search_text,
       array_cosine_distance(embedding, ?1::FLOAT[${EMBEDDING_DIMENSION}]) AS distance
     FROM ${view}
     ${prefixClause}
     ORDER BY distance ASC
     LIMIT ${Number(limit)}`;
  const rows: RawSemanticRow[] =
    likePattern !== undefined
      ? await db.queryAll<RawSemanticRow>(sql, queryVec, likePattern)
      : await db.queryAll<RawSemanticRow>(sql, queryVec);

  return rows.map((r) => ({
    logical_path: r.logical_path,
    version_id: String(r.version_id),
    chunk_index: Number(r.chunk_index),
    chunk_content: r.chunk_content,
    search_text: r.search_text,
    score: 1 - Number(r.distance),
  }));
}
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import { cyan, dim, yellow } from "ansis";
|
|
2
|
+
import pkg from "../../package.json" with { type: "json" };
|
|
3
|
+
import { DEFAULTS, ENV } from "../constants.ts";
|
|
4
|
+
import { loadUpdateCache, saveUpdateCache } from "./cache.ts";
|
|
5
|
+
import { checkForUpdate, needsCheck, type UpdateCache } from "./checker.ts";
|
|
6
|
+
|
|
7
|
+
/**
 * Build the multi-line update notice: a yellow "Update available" header, the
 * changelog in dim text when one is supplied, and a cyan call-to-action. The
 * result starts and ends with an empty line.
 */
function formatNotice(currentVersion: string, latestVersion: string, changelog?: string): string {
  const header = yellow(`Update available: ${currentVersion} → ${latestVersion}`);
  // An empty or missing changelog contributes no lines at all.
  const changelogSection = changelog ? ["", dim(changelog)] : [];
  const callToAction = cyan("Run `membot upgrade` to update");

  return ["", header, ...changelogSection, "", callToAction, ""].join("\n");
}
|
|
22
|
+
|
|
23
|
+
/**
 * Best-effort update check. Resolves to a formatted notice string when a
 * newer version is available, or `null` otherwise. Never throws.
 *
 * Skipped entirely when:
 * - the `ENV.NO_UPDATE_CHECK` environment variable is `"1"`,
 * - the first non-flag argument is `check-update` or `upgrade`,
 * - stderr is not a TTY.
 *
 * While the cache is fresh (per `needsCheck`) no network request is made. The
 * cached latest version is re-compared against the running version, so a
 * cache entry written before an upgrade cannot produce a stale notice.
 * Otherwise `checkForUpdate` is called, aborted after
 * `DEFAULTS.UPDATE_CHECK_TIMEOUT_MS`, and its result is written to the cache.
 */
export async function maybeCheckForUpdate(): Promise<string | null> {
  try {
    if (process.env[ENV.NO_UPDATE_CHECK] === "1") return null;

    const args = process.argv.slice(2);
    const command = args.find((a) => !a.startsWith("-"));
    if (command === "check-update" || command === "upgrade") return null;

    if (!(process.stderr.isTTY ?? false)) return null;

    const cache = await loadUpdateCache();

    if (!needsCheck(cache)) {
      // `hasUpdate` was computed against whatever version was running when the
      // cache was written. Re-check against the current version so the notice
      // disappears once the user has upgraded.
      if (cache?.hasUpdate && Bun.semver.order(pkg.version, cache.latestVersion) === -1) {
        return formatNotice(pkg.version, cache.latestVersion, cache.changelog);
      }
      return null;
    }

    const controller = new AbortController();
    const timeout = setTimeout(() => controller.abort(), DEFAULTS.UPDATE_CHECK_TIMEOUT_MS);

    try {
      const info = await checkForUpdate(pkg.version, controller.signal);

      const newCache: UpdateCache = {
        lastCheckAt: new Date().toISOString(),
        latestVersion: info.latestVersion,
        hasUpdate: info.hasUpdate,
        changelog: info.changelog,
      };
      await saveUpdateCache(newCache);

      if (info.hasUpdate) {
        return formatNotice(pkg.version, info.latestVersion, info.changelog);
      }
    } finally {
      // Always release the timer so it cannot keep the process alive.
      clearTimeout(timeout);
    }

    return null;
  } catch {
    // The update notice is advisory; any failure simply means "no notice".
    return null;
  }
}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import { join } from "node:path";
|
|
2
|
+
import { defaultMembotHome } from "../constants.ts";
|
|
3
|
+
import type { UpdateCache } from "./checker.ts";
|
|
4
|
+
|
|
5
|
+
/** Location of `update.json`, the file holding the latest update-check result, inside the membot home directory. */
function updateCachePath(): string {
  const homeDir = defaultMembotHome();
  return join(homeDir, "update.json");
}
|
|
9
|
+
|
|
10
|
+
/**
 * Load the cached update-check result.
 *
 * @returns The parsed cache, or `undefined` when the file is missing,
 *   unreadable, not valid JSON, or does not have the `UpdateCache` shape
 *   (string `lastCheckAt` and `latestVersion`, boolean `hasUpdate`, optional
 *   string `changelog`).
 */
export async function loadUpdateCache(): Promise<UpdateCache | undefined> {
  try {
    const file = Bun.file(updateCachePath());
    if (!(await file.exists())) return undefined;

    // The file lives on disk and may have been truncated or hand-edited, so
    // validate it before trusting the cast.
    const parsed: unknown = JSON.parse(await file.text());
    if (typeof parsed !== "object" || parsed === null) return undefined;

    const { lastCheckAt, latestVersion, hasUpdate, changelog } = parsed as Record<string, unknown>;
    const shapeOk =
      typeof lastCheckAt === "string" &&
      typeof latestVersion === "string" &&
      typeof hasUpdate === "boolean" &&
      (changelog === undefined || typeof changelog === "string");

    return shapeOk ? (parsed as UpdateCache) : undefined;
  } catch {
    return undefined;
  }
}
|
|
20
|
+
|
|
21
|
+
/**
 * Write an update-check result to the cache file as pretty-printed JSON
 * followed by a newline. Failures (e.g. permission denied) are swallowed on
 * purpose.
 */
export async function saveUpdateCache(cache: UpdateCache): Promise<void> {
  try {
    const serialized = JSON.stringify(cache, null, 2);
    await Bun.write(updateCachePath(), `${serialized}\n`);
  } catch {
    // Best effort only — a cache that cannot be written is not an error.
  }
}
|
|
29
|
+
|
|
30
|
+
/**
 * Truncate the cache file to zero bytes if it exists, so the next load finds
 * no usable cache. Does nothing when the file is absent; errors are ignored.
 */
export async function clearUpdateCache(): Promise<void> {
  try {
    const cacheFile = Bun.file(updateCachePath());
    const present = await cacheFile.exists();
    if (!present) return;

    await Bun.write(updateCachePath(), "");
  } catch {
    // Best effort only.
  }
}
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
import pkg from "../../package.json" with { type: "json" };
|
|
2
|
+
import { DEFAULTS } from "../constants.ts";
|
|
3
|
+
|
|
4
|
+
/** npm registry endpoint describing the latest published version of this package. */
const NPM_REGISTRY_URL = `https://registry.npmjs.org/${pkg.name}/latest`;

/**
 * `owner/repo` slug derived from `package.json`'s `repository.url`.
 * Accepts both the plain `https://github.com/...` form and the
 * `git+https://github.com/...` form, with or without a trailing `.git`.
 * Without the `git+` handling the slug would keep the full URL and the
 * GitHub API request built from it would be invalid.
 */
const GITHUB_REPO = pkg.repository.url.replace(/^(?:git\+)?https?:\/\/github\.com\//, "").replace(/\.git$/, "");
|
|
6
|
+
|
|
7
|
+
/** Outcome of a live update check, as produced by `checkForUpdate`. */
export interface UpdateInfo {
  /** Version the check was run for. */
  currentVersion: string;
  /** Version reported by `fetchLatestVersion`. */
  latestVersion: string;
  /** True when `latestVersion` is newer than `currentVersion`. */
  hasUpdate: boolean;
  /** True when `currentVersion` is newer than `latestVersion`. */
  aheadOfLatest: boolean;
  /** Release notes; only fetched when `hasUpdate` is true and may still be absent. */
  changelog?: string;
}
|
|
14
|
+
|
|
15
|
+
/** Persisted result of the most recent update check. */
export interface UpdateCache {
  /** Timestamp string of the check; parsed with `new Date(...)` by `needsCheck`. */
  lastCheckAt: string;
  /** Latest version seen at that time. */
  latestVersion: string;
  /** Whether that version was newer than the one running when the check ran. */
  hasUpdate: boolean;
  /** Release notes captured with the check, if any. */
  changelog?: string;
}
|
|
21
|
+
|
|
22
|
+
/** The installation kinds that `detectInstallMethod` can report. */
export type InstallMethod =
  | "npm"
  | "bun"
  | "binary"
  | "local-dev";
|
|
23
|
+
|
|
24
|
+
/**
 * Semver comparison helper.
 *
 * @returns `true` only when `Bun.semver.order(current, latest)` is `-1`,
 *   i.e. `latest` sorts after `current`.
 */
export function isNewerVersion(current: string, latest: string): boolean {
  const ordering = Bun.semver.order(current, latest);
  return ordering === -1;
}
|
|
28
|
+
|
|
29
|
+
/**
 * Fetch the latest published version from the npm registry.
 *
 * Falls back to the bundled `pkg.version` when the request fails, the
 * response is not OK, or the payload does not carry a non-empty string
 * `version`. Callers therefore always receive a string.
 *
 * @param signal - Optional abort signal forwarded to `fetch`.
 */
export async function fetchLatestVersion(signal?: AbortSignal): Promise<string> {
  try {
    const res = await fetch(NPM_REGISTRY_URL, { signal });
    if (!res.ok) return pkg.version;

    // The registry response is external input: narrow it instead of casting.
    const data: unknown = await res.json();
    const version = typeof data === "object" && data !== null ? (data as { version?: unknown }).version : undefined;
    return typeof version === "string" && version.length > 0 ? version : pkg.version;
  } catch {
    return pkg.version;
  }
}
|
|
40
|
+
|
|
41
|
+
/**
 * Collect GitHub release notes for every release newer than `fromVersion`
 * and not newer than `toVersion`.
 *
 * Requests up to 20 releases of `GITHUB_REPO`, strips a leading `v` from each
 * tag before comparing, and renders each match as `## <tag>` followed by its
 * body.
 *
 * @returns The joined, trimmed notes, or `undefined` when the request fails,
 *   the response is not OK, or no release falls inside the range.
 */
export async function fetchChangelog(
  fromVersion: string,
  toVersion: string,
  signal?: AbortSignal,
): Promise<string | undefined> {
  try {
    const endpoint = `https://api.github.com/repos/${GITHUB_REPO}/releases?per_page=20`;
    const res = await fetch(endpoint, {
      signal,
      headers: { Accept: "application/vnd.github.v3+json" },
    });
    if (!res.ok) return undefined;

    const releases = (await res.json()) as Array<{
      tag_name: string;
      body: string | null;
    }>;

    const sections: string[] = [];
    for (const release of releases) {
      const version = release.tag_name.replace(/^v/, "");
      // Keep releases in the half-open range (fromVersion, toVersion].
      if (!isNewerVersion(fromVersion, version)) continue;
      if (isNewerVersion(toVersion, version)) continue;
      sections.push(`## ${release.tag_name}\n${release.body ?? ""}`);
    }

    if (sections.length === 0) return undefined;

    return sections.join("\n\n").trim();
  } catch {
    return undefined;
  }
}
|
|
74
|
+
|
|
75
|
+
/**
 * Compare `currentVersion` against the latest version reported by
 * `fetchLatestVersion` and, only when an update exists, fetch the changelog
 * between the two.
 *
 * @param currentVersion - Version to compare against the registry.
 * @param signal - Optional abort signal forwarded to both fetch helpers.
 * @returns Both versions, the two comparison flags and the optional changelog.
 */
export async function checkForUpdate(currentVersion: string, signal?: AbortSignal): Promise<UpdateInfo> {
  const latestVersion = await fetchLatestVersion(signal);

  const hasUpdate = isNewerVersion(currentVersion, latestVersion);
  const aheadOfLatest = isNewerVersion(latestVersion, currentVersion);

  // The changelog request is skipped entirely when there is nothing to announce.
  const changelog = hasUpdate ? await fetchChangelog(currentVersion, latestVersion, signal) : undefined;

  return { currentVersion, latestVersion, hasUpdate, aheadOfLatest, changelog };
}
|
|
88
|
+
|
|
89
|
+
/**
 * Decide whether a fresh update check is due.
 *
 * @returns `true` when the cache is missing, has no `lastCheckAt`, carries a
 *   `lastCheckAt` that cannot be parsed as a date, or is older than
 *   `DEFAULTS.UPDATE_CHECK_INTERVAL_MS`.
 */
export function needsCheck(cache?: UpdateCache): boolean {
  if (!cache?.lastCheckAt) return true;

  const lastCheckMs = new Date(cache.lastCheckAt).getTime();
  // An unparseable timestamp yields NaN, and every comparison with NaN is
  // false — without this guard a corrupt cache would suppress checks forever.
  if (Number.isNaN(lastCheckMs)) return true;

  return Date.now() - lastCheckMs > DEFAULTS.UPDATE_CHECK_INTERVAL_MS;
}
|
|
94
|
+
|
|
95
|
+
/**
 * Detect how membot was installed by inspecting `process.execPath` and `process.argv[1]`.
 * Used to pick the right upgrade strategy: package-manager reinstall vs binary download
 * vs no-op for source checkouts.
 *
 * Rules, in order:
 * 1. script path contains `src/cli.ts` outside `node_modules` → `"local-dev"`
 * 2. the executable's file name mentions neither `bun` nor `node` → `"binary"`
 * 3. script path lies under `.bun/install` or `.bun/bin` → `"bun"`
 * 4. anything else → `"npm"`
 *
 * Paths are compared with `/` separators so Windows paths match too, and only
 * the executable's file name is inspected so a directory such as `/home/node/`
 * does not hide a standalone binary.
 */
export function detectInstallMethod(): InstallMethod {
  const script = (process.argv[1] ?? "").replace(/\\/g, "/");
  const execPath = process.execPath.replace(/\\/g, "/");
  // File name of the running executable, e.g. `bun`, `node.exe`, `membot`.
  const runtimeName = execPath.slice(execPath.lastIndexOf("/") + 1).toLowerCase();

  if (script.includes("src/cli.ts") && !script.includes("node_modules")) {
    return "local-dev";
  }

  if (!runtimeName.includes("bun") && !runtimeName.includes("node")) {
    return "binary";
  }

  if (script.includes(".bun/install") || script.includes(".bun/bin")) {
    return "bun";
  }

  return "npm";
}
|
package/CLAUDE.md
DELETED
|
@@ -1,139 +0,0 @@
|
|
|
1
|
-
# CLAUDE.md — `ctx`
|
|
2
|
-
|
|
3
|
-
Guidance for Claude Code when working in this repo. Pair with `docs/plan.md` (the source-of-truth design doc).
|
|
4
|
-
|
|
5
|
-
## What this project is
|
|
6
|
-
|
|
7
|
-
`ctx` is a standalone Bun CLI + MCP server that gives AI agents a persistent, versioned, searchable context store. Files (markdown, PDF, DOCX, HTML, URLs) are ingested, converted to markdown, chunked, embedded locally with `@huggingface/transformers` (WASM, 384-dim `Xenova/bge-small-en-v1.5`), and indexed in DuckDB with hybrid search (vector + BM25). Every agent-visible artifact is a row in `files`, addressed by a virtual `logical_path` — there is **no** on-disk tree of stored content.
|
|
8
|
-
|
|
9
|
-
Reference projects (read these to understand the conventions before changing anything):
|
|
10
|
-
|
|
11
|
-
- `/Users/evan/workspace/botholomew` — origin of the context system. The chunker, embedder, fetcher, markdown-converter, and hybrid search live in `src/context/` and `src/tools/search/`.
|
|
12
|
-
- `/Users/evan/workspace/mcpx` — the project this one mirrors for layout, build, distribution, logger, and CLI shape.
|
|
13
|
-
|
|
14
|
-
## Hard constraints
|
|
15
|
-
|
|
16
|
-
- **Bun-only.** No Node-only deps. `bun build --compile` produces standalone binaries; the runtime must not require Bun installed.
|
|
17
|
-
- **Local embeddings only.** `@huggingface/transformers` WASM, `Xenova/bge-small-en-v1.5`, 384-dim. Never reach for cloud embedding APIs (OpenAI/Voyage/Cohere/Anthropic embeddings) even if a reference project uses them.
|
|
18
|
-
- **DuckDB is the only store.** Content AND original bytes live in rows (`files.content`, `blobs.bytes`), not in a filesystem tree. `~/.ctx/index.duckdb` holds everything except cached model weights. The DB will get large — that's accepted.
|
|
19
|
-
- **Append-only versioning.** Every ingest, refresh that finds new bytes, write, or rename creates a new `(logical_path, version_id)` row. `version_id` is a `TIMESTAMP` (ms precision). Default queries flow through `current_files` / `current_chunks` views. Delete = tombstone, not a row removal.
|
|
20
|
-
- **MCP defaults to current.** Every MCP tool acts on the latest non-tombstoned version unless `version` is passed explicitly.
|
|
21
|
-
- **Mcpx invocations are persisted.** When `ctx_add` fetches a remote URL via mcpx, store `fetcher_server`, `fetcher_tool`, and `fetcher_args` on the row so refresh re-invokes the exact same tool — never re-route through the agent.
|
|
22
|
-
- **Native conversion first, LLM fallback for messy/binary input.** `unpdf`, `mammoth`, `turndown` handle the common cases. Tesseract WASM (`tesseract.js`) does OCR for `image/*` and for PDFs whose text extraction came back empty. Claude vision captions images; Claude markdown-converter is the last-resort fallback. Missing `ANTHROPIC_API_KEY` is not a hard error — the pipeline degrades to deterministic surrogates.
|
|
23
|
-
- **Textual surrogate is the universal interface.** Every artifact (markdown, PDF, image, audio, anything) produces a markdown body that flows through chunking + embedding + FTS. Original bytes live in `blobs` and are reachable via `ctx_read bytes=true`. Search has zero special cases for binary content.
|
|
24
|
-
- **Always describe.** `files.description` is generated for every ingested file, including plain markdown. The string `<logical_path>\n<description>\n\n<chunk_content>` is what gets embedded and FTS-indexed (stored as `chunks.search_text`); `chunks.chunk_content` keeps the raw body for clean snippet rendering.
|
|
25
|
-
- **`ctx_add` accepts directories and globs.** Single arg, polymorphic: file path, directory (recursive walk, symlinks followed via realpath dedupe), glob (`docs/**/*.md`), URL, or `inline:<text>`. Each matched entry becomes its own version under its own logical_path; partial failures are reported per-entry, not all-or-nothing.
|
|
26
|
-
- **CLI auto-renders for the environment.** TTY → spinners, progress bars, ANSI colors. Piped/`--json`/`CI=true`/`NO_COLOR` → JSON to stdout, structured logs to stderr, no ANSI bytes. One code path; `src/output/tty.ts` is the single source of truth for which mode is active.
|
|
27
|
-
- **All errors are `HelpfulError`.** Bare `throw new Error(...)` is forbidden. `HelpfulError` requires a non-empty `hint` (statically and at runtime); the hint must name the next action concretely. The same hint string lands in front of both humans (CLI stderr) and LLMs (MCP `structuredContent.error.hint` and the rendered text content).
|
|
28
|
-
|
|
29
|
-
## Architecture at a glance
|
|
30
|
-
|
|
31
|
-
```
|
|
32
|
-
ctx_add ──► local-reader OR fetcher (mcpx) ──► converter (mime dispatch)
|
|
33
|
-
│
|
|
34
|
-
▼
|
|
35
|
-
chunker ──► embedder (WASM)
|
|
36
|
-
│
|
|
37
|
-
▼
|
|
38
|
-
db.files.insertVersion + db.chunks.insertForVersion
|
|
39
|
-
│
|
|
40
|
-
▼
|
|
41
|
-
FTS index rebuild (current_chunks)
|
|
42
|
-
|
|
43
|
-
ctx_refresh ──► re-read source ──► sha256 compare
|
|
44
|
-
│
|
|
45
|
-
unchanged ◄──┴──► changed ──► same pipeline as ctx_add
|
|
46
|
-
(status only) (creates new version_id)
|
|
47
|
-
```
|
|
48
|
-
|
|
49
|
-
Daemon mode (`ctx serve --watch`) ticks every `tick_interval_sec` and runs the no-arg refresh path against rows whose `refresh_frequency_sec` has elapsed.
|
|
50
|
-
|
|
51
|
-
## Layout
|
|
52
|
-
|
|
53
|
-
```
|
|
54
|
-
src/
|
|
55
|
-
cli.ts # commander entry; iterates operations registry
|
|
56
|
-
sdk.ts # programmatic API for embedding ctx
|
|
57
|
-
context.ts # AppContext: config + db + embedder + mcpx + logger
|
|
58
|
-
constants.ts # CTX_HOME, EMBEDDING_DIMENSION=384, defaults
|
|
59
|
-
operations/ # ★ one file per user-facing capability; single source of truth
|
|
60
|
-
types.ts # Operation<I,O>, defineOperation()
|
|
61
|
-
index.ts # ordered registry; cli + mcp both iterate this
|
|
62
|
-
add.ts list.ts tree.ts read.ts write.ts search.ts remove.ts
|
|
63
|
-
move.ts refresh.ts info.ts versions.ts diff.ts prune.ts
|
|
64
|
-
mount/
|
|
65
|
-
mcp.ts # mountAsMcpTool — registers an Operation as an MCP tool
|
|
66
|
-
commander.ts # mountAsCommanderCommand — registers an Operation as a CLI subcommand
|
|
67
|
-
zod-to-cli.ts # introspects zod schema → commander .argument()/.option() calls
|
|
68
|
-
commands/ # CLI-only commands with no MCP equivalent (serve, reindex)
|
|
69
|
-
config/ # zod schema + loader (~/.ctx/config.json)
|
|
70
|
-
db/ # DuckDB connection, migrations, files.ts, chunks.ts
|
|
71
|
-
ingest/ # source-resolver (file/dir/glob/url/inline), local-reader, fetcher, chunker, embedder, describer, search-text, converter/ (pdf/docx/html/image/text/ocr/llm)
|
|
72
|
-
search/ # semantic.ts, keyword.ts, hybrid.ts (RRF)
|
|
73
|
-
refresh/ # runner.ts (per-row), scheduler.ts (daemon)
|
|
74
|
-
mcp/ # server.ts, instructions.ts
|
|
75
|
-
output/ # tty.ts (mode detection), logger.ts (spinner-aware), progress.ts (multi-entry bar), formatter.ts (table/markdown/json)
|
|
76
|
-
errors.ts # HelpfulError class — the only error type allowed in handlers
|
|
77
|
-
test/ # bun test, _preload.ts applies transformers patch
|
|
78
|
-
patches/ # @huggingface/transformers WASM patch (copy from mcpx)
|
|
79
|
-
scripts/ # apply-transformers-patch.sh (pre-build hook)
|
|
80
|
-
docs/plan.md # source-of-truth design
|
|
81
|
-
```
|
|
82
|
-
|
|
83
|
-
## Coding conventions
|
|
84
|
-
|
|
85
|
-
- **One Operation, two surfaces.** Every user-facing capability is a single `Operation` in `src/operations/` with a zod input schema, zod output schema, description string, and handler. The MCP server and the commander CLI both consume this — never write a tool description twice, never define an input shape twice. The description string is the LLM-facing docstring AND the `--help` text. Field-level help comes from `.describe()` on each zod field.
|
|
86
|
-
- **Zod everywhere.** Operation I/O schemas, config schema, fetcher response shapes. Use `.describe()` on every field — that text is what the agent and human both read.
|
|
87
|
-
- **Errors are `HelpfulError` only.** See `src/errors.ts`. Required fields: `kind`, `message`, `hint`. The constructor refuses an empty hint at runtime, and the type system refuses to omit it at compile time. The mount adapters render `kind` + `message` + `hint` for both surfaces — humans see colorized output on a TTY, LLMs get the same fields back as MCP `structuredContent.error`. Hint quality bar: name a concrete next action (a command to run, a flag to set, a path to check). Vague hints like "Check your config" should fail review.
|
|
88
|
-
- **No log-and-rethrow.** Errors propagate to the mount boundary, are rendered there exactly once, then exit. Logging the error before throwing produces double-output and breaks JSON-mode parseability.
|
|
89
|
-
- **Spinners & progress are advisory.** Operations call `ctx.progress.tick(...)` and `ctx.logger.info(...)` without checking whether they're rendered. The renderer in `src/output/` decides; non-interactive mode coerces both into stderr lines or no-ops.
|
|
90
|
-
- **No duplicated handlers.** If you find yourself writing logic in `src/commands/*.ts` that an MCP tool would also want, it belongs in `src/operations/` instead. The only legitimate `src/commands/*.ts` files are CLI-only behaviors with no agent-facing meaning (`serve`, `reindex`).
|
|
91
|
-
- **Logger, not console.** Use `src/output/logger.ts` (spinner-aware, JSON/TTY-aware). `console.log` in production code is a bug.
|
|
92
|
-
- **Colors via `ansis`, spinners via `nanospinner`.** Same as mcpx.
|
|
93
|
-
- **No premature abstractions.** Three similar lines beat a generic helper. Don't build for hypothetical fetchers, hypothetical embedders, or hypothetical storage backends.
|
|
94
|
-
|
|
95
|
-
## Tool / command descriptions
|
|
96
|
-
|
|
97
|
-
Operation descriptions are the user interface — for the LLM AND for the human running `ctx <cmd> --help`. The same string is shown in both places. Every operation description follows this shape:
|
|
98
|
-
|
|
99
|
-
1. Bash-equivalent prefix where applicable: `[[ bash equivalent: cat ]]`.
|
|
100
|
-
2. One-line purpose.
|
|
101
|
-
3. When-to-use guidance — what to call before/after, what tool to prefer instead in adjacent cases.
|
|
102
|
-
4. Constraints, recovery hints, and links to other operations by name.
|
|
103
|
-
|
|
104
|
-
Server-level `instructions` (the string handed to the MCP client when it connects) is defined in `src/mcp/instructions.ts`. It frames the discovery → ingest → consume → write workflow and explicitly tells the agent how versioning, refresh, and the `version` parameter behave. CLI users get the same framing through `ctx --help` (commander's top-level help). Update both that file and `docs/plan.md` together if you change the operation surface.
|
|
105
|
-
|
|
106
|
-
## Testing
|
|
107
|
-
|
|
108
|
-
- `bun test`. Test preload at `test/_preload.ts` applies the transformers WASM patch.
|
|
109
|
-
- Use a real ephemeral DuckDB file per test (don't mock the DB).
|
|
110
|
-
- Real fixtures for converters (`test/fixtures/sample.pdf`, `sample.docx`, `sample.html`).
|
|
111
|
-
- Mock the network only for fetcher tests; everything else hits the real local pipeline.
|
|
112
|
-
- Versioning paths to cover: insert creates v1, refresh-unchanged creates no new version, refresh-changed creates v2, `current_files` returns v2, explicit `version=v1` returns v1, tombstone hides from `current_files` but `versions` still lists it, `prune --before` drops non-current rows.
|
|
113
|
-
|
|
114
|
-
## Build & distribution
|
|
115
|
-
|
|
116
|
-
- Pre-build: `scripts/apply-transformers-patch.sh` (copy verbatim from mcpx).
|
|
117
|
-
- Build: `bun build --compile --minify --sourcemap ./src/cli.ts --outfile dist/ctx`.
|
|
118
|
-
- Targets: darwin-arm64, darwin-x64, linux-arm64, linux-x64, windows-x64, windows-arm64.
|
|
119
|
-
- Distribution: `install.sh` / `install.ps1` mirror mcpx; published to NPM as well.
|
|
120
|
-
|
|
121
|
-
## Things to avoid
|
|
122
|
-
|
|
123
|
-
- Re-introducing a filesystem store under `~/.ctx/context/`. The store is rows.
|
|
124
|
-
- Cloud embeddings. Local WASM only.
|
|
125
|
-
- Mutating an existing version's `content` / `content_sha256` / `chunks`. Those fields are immutable once the row is written — corrections are new versions.
|
|
126
|
-
- Re-routing a remote refresh through the LLM/agent. Replay the stored `fetcher_*` columns directly via mcpx.
|
|
127
|
-
- Tools that return content blobs without a `version_id` — every read-shaped response must echo which version it served.
|
|
128
|
-
- A separate `ctx_read_blob` tool. Bytes are reachable via `ctx_read` with `bytes=true`. One read tool, one mental model.
|
|
129
|
-
- Embedding `chunk_content` raw. Always embed `search_text` (the prepended `<path>\n<description>\n\n<body>`) — that's what `chunks.search_text` holds and what FTS is built on.
|
|
130
|
-
- Aborting a directory/glob ingest because one entry failed. Stream per-entry results; report failures alongside successes.
|
|
131
|
-
- Throwing `new Error(...)` anywhere in `src/operations/`, `src/ingest/`, `src/db/`, `src/refresh/`, or `src/mcp/`. Always `HelpfulError`. Wrap external errors with `asHelpful(cause, context, hint, kind)`.
|
|
132
|
-
- Writing colorized output unconditionally. Always go through `src/output/` so non-interactive callers get clean JSON.
|
|
133
|
-
- A `HelpfulError` whose hint just paraphrases the message ("File not found. Hint: file is missing."). Hint must name a concrete next step — a command, a flag, a path to inspect.
|
|
134
|
-
- **Defining a tool description in two places.** If you catch yourself writing copy in `src/mcp/...` that also exists in `src/commands/...`, stop — make it an `Operation`.
|
|
135
|
-
- Hand-rolling a JSON Schema for an MCP tool. Always derive it from the zod input schema via the mount adapter.
|
|
136
|
-
|
|
137
|
-
## When in doubt
|
|
138
|
-
|
|
139
|
-
Read `docs/plan.md`. If the plan and code disagree, the plan wins until a deliberate update lands in both.
|