botholomew 0.16.4 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. package/README.md +46 -41
  2. package/package.json +4 -9
  3. package/src/chat/agent.ts +37 -40
  4. package/src/chat/session.ts +10 -10
  5. package/src/cli.ts +0 -2
  6. package/src/commands/capabilities.ts +35 -33
  7. package/src/commands/context.ts +133 -221
  8. package/src/commands/init.ts +22 -1
  9. package/src/commands/mcpx.ts +21 -8
  10. package/src/commands/nuke.ts +52 -15
  11. package/src/commands/prepare.ts +16 -13
  12. package/src/config/loader.ts +1 -8
  13. package/src/config/schemas.ts +6 -0
  14. package/src/constants.ts +16 -32
  15. package/src/init/index.ts +52 -27
  16. package/src/mcpx/client.ts +21 -5
  17. package/src/mem/client.ts +33 -0
  18. package/src/{context → prompts}/capabilities.ts +11 -7
  19. package/src/schedules/store.ts +1 -1
  20. package/src/tasks/store.ts +1 -1
  21. package/src/threads/store.ts +1 -1
  22. package/src/tools/capabilities/refresh.ts +1 -1
  23. package/src/tools/membot/adapter.ts +111 -0
  24. package/src/tools/membot/copy.ts +59 -0
  25. package/src/tools/membot/count_lines.ts +53 -0
  26. package/src/tools/membot/edit.ts +72 -0
  27. package/src/tools/membot/exists.ts +54 -0
  28. package/src/tools/membot/index.ts +26 -0
  29. package/src/tools/{context → membot}/pipe.ts +34 -32
  30. package/src/tools/registry.ts +6 -37
  31. package/src/tools/tool.ts +6 -8
  32. package/src/tui/App.tsx +3 -4
  33. package/src/tui/components/ContextPanel.tsx +109 -226
  34. package/src/tui/components/HelpPanel.tsx +2 -2
  35. package/src/tui/components/StatusBar.tsx +0 -6
  36. package/src/tui/components/ThreadPanel.tsx +8 -7
  37. package/src/tui/wrapDetail.ts +11 -0
  38. package/src/worker/heartbeat.ts +0 -20
  39. package/src/worker/index.ts +13 -13
  40. package/src/worker/llm.ts +7 -9
  41. package/src/worker/prompt.ts +25 -13
  42. package/src/worker/spawn.ts +1 -1
  43. package/src/worker/tick.ts +10 -9
  44. package/src/commands/db.ts +0 -119
  45. package/src/commands/with-db.ts +0 -22
  46. package/src/context/chunker.ts +0 -275
  47. package/src/context/embedder-impl.ts +0 -100
  48. package/src/context/embedder.ts +0 -9
  49. package/src/context/fetcher-errors.ts +0 -8
  50. package/src/context/fetcher.ts +0 -515
  51. package/src/context/locks.ts +0 -146
  52. package/src/context/markdown-converter.ts +0 -186
  53. package/src/context/reindex.ts +0 -198
  54. package/src/context/store.ts +0 -841
  55. package/src/context/url-utils.ts +0 -25
  56. package/src/db/connection.ts +0 -255
  57. package/src/db/doctor.ts +0 -235
  58. package/src/db/embeddings.ts +0 -317
  59. package/src/db/query.ts +0 -56
  60. package/src/db/schema.ts +0 -93
  61. package/src/db/sql/1-core_tables.sql +0 -53
  62. package/src/db/sql/10-dedupe_context_items.sql +0 -26
  63. package/src/db/sql/11-rebuild_hnsw.sql +0 -8
  64. package/src/db/sql/12-workers.sql +0 -66
  65. package/src/db/sql/13-drive-paths.sql +0 -47
  66. package/src/db/sql/14-drop_hnsw_index.sql +0 -8
  67. package/src/db/sql/15-fts_index.sql +0 -8
  68. package/src/db/sql/16-source_url.sql +0 -7
  69. package/src/db/sql/17-worker_log_path.sql +0 -3
  70. package/src/db/sql/18-reset_embeddings_for_local.sql +0 -39
  71. package/src/db/sql/19-disk_backed_index.sql +0 -36
  72. package/src/db/sql/2-logging_tables.sql +0 -24
  73. package/src/db/sql/20-drop_db_tables_for_files.sql +0 -19
  74. package/src/db/sql/3-daemon_state.sql +0 -5
  75. package/src/db/sql/4-unique_context_path.sql +0 -1
  76. package/src/db/sql/5-reset_embeddings_for_openai.sql +0 -1
  77. package/src/db/sql/6-vss_index.sql +0 -7
  78. package/src/db/sql/7-drop_embeddings_fk.sql +0 -23
  79. package/src/db/sql/8-task_output.sql +0 -1
  80. package/src/db/sql/9-source-type.sql +0 -1
  81. package/src/tools/context/read-large-result.ts +0 -33
  82. package/src/tools/dir/create.ts +0 -47
  83. package/src/tools/dir/size.ts +0 -77
  84. package/src/tools/dir/tree.ts +0 -124
  85. package/src/tools/file/copy.ts +0 -73
  86. package/src/tools/file/count-lines.ts +0 -54
  87. package/src/tools/file/delete.ts +0 -83
  88. package/src/tools/file/edit.ts +0 -76
  89. package/src/tools/file/exists.ts +0 -33
  90. package/src/tools/file/info.ts +0 -66
  91. package/src/tools/file/move.ts +0 -66
  92. package/src/tools/file/read.ts +0 -67
  93. package/src/tools/file/write.ts +0 -58
  94. package/src/tools/search/fuse.ts +0 -96
  95. package/src/tools/search/index.ts +0 -127
  96. package/src/tools/search/regexp.ts +0 -82
  97. package/src/tools/search/semantic.ts +0 -167
  98. package/src/{db → utils}/uuid.ts +0 -0
@@ -1,186 +0,0 @@
1
- import type { BotholomewConfig } from "../config/schemas.ts";
2
- import { logger } from "../utils/logger.ts";
3
- import { createLlmClient } from "../worker/llm-client.ts";
4
- import { FetchFailureError } from "./fetcher-errors.ts";
5
-
6
- const CONVERTER_MAX_TOKENS = 16_384;
7
-
8
- const CONVERTER_SYSTEM_PROMPT = `You normalize documents to clean, well-structured Markdown.
9
-
10
- **If the input is already clean, valid Markdown, return it verbatim with no edits.** Look for ATX headings (#, ##), bullet/numbered lists, fenced code blocks, inline code, links in [text](url) form, blockquotes, GFM tables. If the structure is consistently markdown-shaped, echo it back unchanged.
11
-
12
- Otherwise, convert it. The input mime_type is a hint, not a guarantee — verify the actual content. Common non-markdown formats to recognize and convert:
13
- - **HTML** — strip tags, scripts, styles, navigation/footer chrome; preserve headings, paragraphs, lists, tables, links, code.
14
- - **JSON / XML / YAML** — render the structure as readable Markdown (headings/lists for objects, tables where appropriate, fenced code blocks for inline values).
15
- - **DocMD (Google Docs structured format)** — lines like \`[H1 1-31 HEADING_1 tabId=t.0 ...] Title text\` or \`[P5 884-937 PARAGRAPH ...] Body text\`. Strip the bracket annotations entirely; map H1→#, H2→##, H3→###, P→paragraph; preserve the trailing text content.
16
- - **RTF, plain text with mixed structure, ad-hoc formats** — extract the semantic content, drop the noise.
17
-
18
- Rules for the output:
19
- - Preserve all semantic content: headings, paragraphs, lists, tables, links, inline code, code blocks, blockquotes.
20
- - Use ATX headings (#, ##, ###), fenced code blocks (\`\`\`lang), GFM-style tables, and reference- or inline-style links — whichever is cleanest.
21
- - Strip metadata headers/IDs that aren't part of the document body (e.g. \`@document_id: ...\`, \`@revision_id: ...\`).
22
- - Output **only** the Markdown. No preamble ("Here is the converted markdown:"), no trailing commentary, no wrapping the entire output in a code fence.`;
23
-
24
- const MARKDOWN_MIME_TYPES = new Set([
25
- "text/markdown",
26
- "text/x-markdown",
27
- "text/md",
28
- ]);
29
-
30
- export function isMarkdownMimeType(mimeType: string): boolean {
31
- const base = mimeType.split(";")[0]?.trim().toLowerCase() ?? "";
32
- return MARKDOWN_MIME_TYPES.has(base);
33
- }
34
-
35
- /**
36
- * Sniff content for a non-markdown structure. Returns a mime type when the
37
- * content has unmistakable markers of HTML / XML / JSON / etc., otherwise
38
- * null. Used to verify a tool's claim of `text/markdown` — if the agent (or
39
- * a defaulted mime type) lies about the format, we want to convert anyway.
40
- *
41
- * Markdown is a superset of plain text, so a null return ≠ "definitely
42
- * markdown". It just means we found no strong contradicting signal.
43
- */
44
- export function sniffNonMarkdownMimeType(content: string): string | null {
45
- const head = content.trimStart().slice(0, 4096);
46
- if (!head) return null;
47
-
48
- if (/^<!doctype\s+html/i.test(head)) return "text/html";
49
- if (/^<html[\s>]/i.test(head)) return "text/html";
50
- if (/^<\?xml[\s?]/i.test(head)) return "application/xml";
51
-
52
- // JSON: parses as JSON top-to-bottom (use the full content, not the head).
53
- const trimmed = content.trim();
54
- if (
55
- (trimmed.startsWith("{") && trimmed.endsWith("}")) ||
56
- (trimmed.startsWith("[") && trimmed.endsWith("]"))
57
- ) {
58
- try {
59
- JSON.parse(trimmed);
60
- return "application/json";
61
- } catch {
62
- // fall through
63
- }
64
- }
65
-
66
- // Heuristic HTML: dense tag markup. Markdown can contain occasional inline
67
- // HTML, so we only flag it when tags dominate the sample.
68
- const tagMatches = head.match(/<\/?[a-z][a-z0-9]*[\s/>]/gi) ?? [];
69
- if (tagMatches.length >= 10) {
70
- const charsPerTag = head.length / tagMatches.length;
71
- if (charsPerTag < 80) return "text/html";
72
- }
73
-
74
- return null;
75
- }
76
-
77
- /**
78
- * Decide the effective mime type for a piece of content. If the claim is
79
- * markdown but the content sniffs as something else, trust the sniff so we
80
- * convert instead of saving mislabeled garbage.
81
- */
82
- export function resolveEffectiveMimeType(
83
- claimedMimeType: string,
84
- content: string,
85
- ): { mimeType: string; sniffed: boolean } {
86
- if (!isMarkdownMimeType(claimedMimeType)) {
87
- return { mimeType: claimedMimeType, sniffed: false };
88
- }
89
- const sniffed = sniffNonMarkdownMimeType(content);
90
- if (sniffed) return { mimeType: sniffed, sniffed: true };
91
- return { mimeType: claimedMimeType, sniffed: false };
92
- }
93
-
94
- function stripLeadingMarkdownFence(text: string): string {
95
- const trimmed = text.trim();
96
- const fenceMatch = trimmed.match(
97
- /^```(?:markdown|md)?\s*\n([\s\S]*?)\n```\s*$/,
98
- );
99
- if (fenceMatch?.[1]) return fenceMatch[1];
100
- return text;
101
- }
102
-
103
- /**
104
- * Convert arbitrary content to Markdown via a single-shot LLM call.
105
- *
106
- * Does **not** short-circuit on `mimeType === "text/markdown"` — tools
107
- * frequently mislabel their output (e.g. Google Docs' "DocMD" tool returns
108
- * structured `[H1 ...]` annotations, not real markdown). The mime type is
109
- * passed in as a hint for the model; the model decides whether the content
110
- * is already markdown (echo unchanged) or needs converting.
111
- *
112
- * - Throws FetchFailureError when the response hits max_tokens (silently
113
- * truncating the saved file would be worse than failing loudly).
114
- * - On transient API errors, logs a warning and returns the raw content so
115
- * the import still produces *something* the user can edit.
116
- */
117
- export async function convertToMarkdown(
118
- content: string,
119
- mimeType: string,
120
- sourceUrl: string,
121
- config: Required<BotholomewConfig>,
122
- ): Promise<string> {
123
- if (!config.anthropic_api_key) return content;
124
-
125
- const client = createLlmClient(config);
126
- // Conversion is mechanical text-shaping — Haiku (the chunker model) is
127
- // plenty smart for this and ~5x faster than Opus on long documents.
128
- const model = config.chunker_model || config.model;
129
-
130
- try {
131
- const stream = client.messages.stream({
132
- model,
133
- max_tokens: CONVERTER_MAX_TOKENS,
134
- system: CONVERTER_SYSTEM_PROMPT,
135
- messages: [
136
- {
137
- role: "user",
138
- content: `Convert this ${mimeType} content to Markdown. Source URL: ${sourceUrl}\n\n${content}`,
139
- },
140
- ],
141
- });
142
-
143
- let charsReceived = 0;
144
- let lastLogged = 0;
145
- const PROGRESS_INTERVAL_CHARS = 2_000;
146
- for await (const event of stream) {
147
- if (
148
- event.type === "content_block_delta" &&
149
- event.delta.type === "text_delta"
150
- ) {
151
- charsReceived += event.delta.text.length;
152
- if (charsReceived - lastLogged >= PROGRESS_INTERVAL_CHARS) {
153
- logger.dim(` ...converted ${charsReceived} chars`);
154
- lastLogged = charsReceived;
155
- }
156
- }
157
- }
158
-
159
- const final = await stream.finalMessage();
160
-
161
- if (final.stop_reason === "max_tokens") {
162
- throw new FetchFailureError(
163
- `Markdown conversion exceeded token budget (max_tokens=${CONVERTER_MAX_TOKENS}). The source document is too large to convert in one pass — try fetching a smaller section or a tool that supports pagination.`,
164
- );
165
- }
166
-
167
- const text = final.content
168
- .flatMap((block) => (block.type === "text" ? [block.text] : []))
169
- .join("");
170
-
171
- if (!text.trim()) {
172
- logger.warn(
173
- "markdown conversion returned empty output — saving raw content",
174
- );
175
- return content;
176
- }
177
-
178
- return stripLeadingMarkdownFence(text);
179
- } catch (err) {
180
- if (err instanceof FetchFailureError) throw err;
181
- logger.warn(
182
- `markdown conversion failed (${err instanceof Error ? err.message : String(err)}) — saving raw content`,
183
- );
184
- return content;
185
- }
186
- }
@@ -1,198 +0,0 @@
1
- import { createHash } from "node:crypto";
2
- import { readFile, stat } from "node:fs/promises";
3
- import { join } from "node:path";
4
- import type { BotholomewConfig } from "../config/schemas.ts";
5
- import { CONTEXT_DIR } from "../constants.ts";
6
- import { withDb } from "../db/connection.ts";
7
- import {
8
- type ChunkInput,
9
- deleteIndexedPath,
10
- getIndexedPath,
11
- listIndexedPaths,
12
- rebuildSearchIndex,
13
- upsertChunksForPath,
14
- } from "../db/embeddings.ts";
15
- import { logger } from "../utils/logger.ts";
16
- import { chunkByTextSplit } from "./chunker.ts";
17
- import { embed as defaultEmbed } from "./embedder.ts";
18
- import { isContextPathLocked } from "./locks.ts";
19
- import { listContextDir } from "./store.ts";
20
-
21
- /** Embed function shape — exported for tests that want to inject a fake. */
22
- export type EmbedFn = (
23
- texts: string[],
24
- config: Required<BotholomewConfig>,
25
- ) => Promise<number[][]>;
26
-
27
- /**
28
- * Walk every textual file under `<projectDir>/context/` and reconcile the
29
- * disk-backed search index. Adds new files, replaces stale ones whose
30
- * content_hash changed, and drops index rows for files that no longer exist.
31
- *
32
- * Uses the deterministic text splitter (`chunkByTextSplit`) — never the LLM
33
- * chunker — so a fresh project with no API key still indexes successfully.
34
- */
35
- export async function reindexContext(
36
- projectDir: string,
37
- config: Required<BotholomewConfig>,
38
- dbPath: string,
39
- opts: {
40
- onProgress?: (msg: string) => void;
41
- /** Override embed for tests; defaults to the real WASM embedder. */
42
- embedFn?: EmbedFn;
43
- } = {},
44
- ): Promise<ReindexSummary> {
45
- const onProgress = opts.onProgress ?? (() => {});
46
- const embed = opts.embedFn ?? defaultEmbed;
47
-
48
- // 1. Walk context/ for every textual file along with its current
49
- // (path, hash, mtime, size). Binary files are intentionally skipped —
50
- // embeddings on bytes are meaningless and would just consume storage.
51
- onProgress("scanning files");
52
- const onDisk = await collectDiskFiles(projectDir);
53
-
54
- // 2. Read the existing index so we can decide what's add / update / skip /
55
- // remove without re-embedding files that haven't changed.
56
- const indexed = await withDb(dbPath, listIndexedPaths);
57
- const indexedByPath = new Map(indexed.map((r) => [r.path, r]));
58
-
59
- let added = 0;
60
- let updated = 0;
61
- let unchanged = 0;
62
- let removed = 0;
63
- let chunksWritten = 0;
64
-
65
- // 3. For each file on disk: skip if (path, hash) is already indexed and the
66
- // on-disk content hash matches; otherwise (re)embed.
67
- for (const file of onDisk) {
68
- const existing = indexedByPath.get(file.path);
69
- if (existing && existing.content_hash === file.contentHash) {
70
- unchanged++;
71
- indexedByPath.delete(file.path);
72
- continue;
73
- }
74
-
75
- onProgress(`embedding ${file.path}`);
76
- const text = await readFile(
77
- join(projectDir, CONTEXT_DIR, file.path),
78
- "utf-8",
79
- );
80
- const chunks = chunkByTextSplit(text);
81
- if (chunks.length === 0) {
82
- // Empty/whitespace-only file. Drop any stale rows for it; otherwise
83
- // there's nothing to index.
84
- if (existing) {
85
- await withDb(dbPath, (conn) => deleteIndexedPath(conn, file.path));
86
- }
87
- continue;
88
- }
89
- const vectors = await embed(
90
- chunks.map((c) => c.content),
91
- config,
92
- );
93
- const inputs: ChunkInput[] = chunks.map((c, i) => ({
94
- chunk_index: c.index,
95
- chunk_content: c.content,
96
- embedding: vectors[i] ?? new Array(config.embedding_dimension).fill(0),
97
- }));
98
- await withDb(dbPath, (conn) =>
99
- upsertChunksForPath(conn, {
100
- path: file.path,
101
- contentHash: file.contentHash,
102
- mtimeMs: file.mtimeMs,
103
- sizeBytes: file.sizeBytes,
104
- chunks: inputs,
105
- }),
106
- );
107
- if (existing) updated++;
108
- else added++;
109
- chunksWritten += inputs.length;
110
- indexedByPath.delete(file.path);
111
- }
112
-
113
- // 4. Anything left in indexedByPath is in the index but not on disk →
114
- // delete its rows so search results don't surface ghost files. Skip
115
- // paths with an active per-path write lock: a worker may have just
116
- // written the file *after* our `collectDiskFiles` walk snapshot, and
117
- // pruning now would drop the index row for a real file. Best-effort —
118
- // the next reindex will reconcile.
119
- for (const orphan of indexedByPath.keys()) {
120
- if (await isContextPathLocked(projectDir, orphan)) {
121
- logger.debug(`reindex: skipping orphan-prune for in-flight ${orphan}`);
122
- continue;
123
- }
124
- await withDb(dbPath, (conn) => deleteIndexedPath(conn, orphan));
125
- removed++;
126
- }
127
-
128
- if (added + updated + removed > 0) {
129
- onProgress("rebuilding FTS index");
130
- await withDb(dbPath, rebuildSearchIndex);
131
- }
132
-
133
- return { added, updated, unchanged, removed, chunksWritten };
134
- }
135
-
136
- export interface ReindexSummary {
137
- added: number;
138
- updated: number;
139
- unchanged: number;
140
- removed: number;
141
- chunksWritten: number;
142
- }
143
-
144
- interface DiskFile {
145
- path: string;
146
- contentHash: string;
147
- mtimeMs: number;
148
- sizeBytes: number;
149
- }
150
-
151
- async function collectDiskFiles(projectDir: string): Promise<DiskFile[]> {
152
- const entries = await listContextDir(projectDir, "", { recursive: true });
153
- const out: DiskFile[] = [];
154
- for (const e of entries) {
155
- if (e.is_directory) continue;
156
- if (!e.is_textual) continue;
157
- const abs = join(projectDir, CONTEXT_DIR, e.path);
158
- let st: Awaited<ReturnType<typeof stat>>;
159
- try {
160
- st = await stat(abs);
161
- } catch (err) {
162
- logger.warn(`reindex: skipping ${e.path}: ${err}`);
163
- continue;
164
- }
165
- const buf = await readFile(abs);
166
- const contentHash = createHash("sha256").update(buf).digest("hex");
167
- out.push({
168
- path: e.path,
169
- contentHash,
170
- mtimeMs: st.mtimeMs,
171
- sizeBytes: st.size,
172
- });
173
- }
174
- return out;
175
- }
176
-
177
- /**
178
- * Drop a single path from the index. Used by file/dir tool callers when
179
- * they delete or move a file and want the index to reflect it immediately
180
- * instead of waiting for the next reindex.
181
- */
182
- export async function dropIndexedPath(
183
- dbPath: string,
184
- path: string,
185
- ): Promise<void> {
186
- await withDb(dbPath, async (conn) => {
187
- await deleteIndexedPath(conn, path);
188
- await rebuildSearchIndex(conn);
189
- });
190
- }
191
-
192
- export async function getIndexEntry(
193
- dbPath: string,
194
- path: string,
195
- ): Promise<{ chunks: number } | null> {
196
- const row = await withDb(dbPath, (conn) => getIndexedPath(conn, path));
197
- return row ? { chunks: row.chunk_count } : null;
198
- }