npm - botholomew - Versions diffs - 0.16.4 → 0.18.0 - Mend

botholomew 0.16.4 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (98)

package/README.md +46 -41
package/package.json +4 -9
package/src/chat/agent.ts +37 -40
package/src/chat/session.ts +10 -10
package/src/cli.ts +0 -2
package/src/commands/capabilities.ts +35 -33
package/src/commands/context.ts +133 -221
package/src/commands/init.ts +22 -1
package/src/commands/mcpx.ts +21 -8
package/src/commands/nuke.ts +52 -15
package/src/commands/prepare.ts +16 -13
package/src/config/loader.ts +1 -8
package/src/config/schemas.ts +6 -0
package/src/constants.ts +16 -32
package/src/init/index.ts +52 -27
package/src/mcpx/client.ts +21 -5
package/src/mem/client.ts +33 -0
package/src/{context → prompts}/capabilities.ts +11 -7
package/src/schedules/store.ts +1 -1
package/src/tasks/store.ts +1 -1
package/src/threads/store.ts +1 -1
package/src/tools/capabilities/refresh.ts +1 -1
package/src/tools/membot/adapter.ts +111 -0
package/src/tools/membot/copy.ts +59 -0
package/src/tools/membot/count_lines.ts +53 -0
package/src/tools/membot/edit.ts +72 -0
package/src/tools/membot/exists.ts +54 -0
package/src/tools/membot/index.ts +26 -0
package/src/tools/{context → membot}/pipe.ts +34 -32
package/src/tools/registry.ts +6 -37
package/src/tools/tool.ts +6 -8
package/src/tui/App.tsx +3 -4
package/src/tui/components/ContextPanel.tsx +109 -226
package/src/tui/components/HelpPanel.tsx +2 -2
package/src/tui/components/StatusBar.tsx +0 -6
package/src/tui/components/ThreadPanel.tsx +8 -7
package/src/tui/wrapDetail.ts +11 -0
package/src/worker/heartbeat.ts +0 -20
package/src/worker/index.ts +13 -13
package/src/worker/llm.ts +7 -9
package/src/worker/prompt.ts +25 -13
package/src/worker/spawn.ts +1 -1
package/src/worker/tick.ts +10 -9
package/src/commands/db.ts +0 -119
package/src/commands/with-db.ts +0 -22
package/src/context/chunker.ts +0 -275
package/src/context/embedder-impl.ts +0 -100
package/src/context/embedder.ts +0 -9
package/src/context/fetcher-errors.ts +0 -8
package/src/context/fetcher.ts +0 -515
package/src/context/locks.ts +0 -146
package/src/context/markdown-converter.ts +0 -186
package/src/context/reindex.ts +0 -198
package/src/context/store.ts +0 -841
package/src/context/url-utils.ts +0 -25
package/src/db/connection.ts +0 -255
package/src/db/doctor.ts +0 -235
package/src/db/embeddings.ts +0 -317
package/src/db/query.ts +0 -56
package/src/db/schema.ts +0 -93
package/src/db/sql/1-core_tables.sql +0 -53
package/src/db/sql/10-dedupe_context_items.sql +0 -26
package/src/db/sql/11-rebuild_hnsw.sql +0 -8
package/src/db/sql/12-workers.sql +0 -66
package/src/db/sql/13-drive-paths.sql +0 -47
package/src/db/sql/14-drop_hnsw_index.sql +0 -8
package/src/db/sql/15-fts_index.sql +0 -8
package/src/db/sql/16-source_url.sql +0 -7
package/src/db/sql/17-worker_log_path.sql +0 -3
package/src/db/sql/18-reset_embeddings_for_local.sql +0 -39
package/src/db/sql/19-disk_backed_index.sql +0 -36
package/src/db/sql/2-logging_tables.sql +0 -24
package/src/db/sql/20-drop_db_tables_for_files.sql +0 -19
package/src/db/sql/3-daemon_state.sql +0 -5
package/src/db/sql/4-unique_context_path.sql +0 -1
package/src/db/sql/5-reset_embeddings_for_openai.sql +0 -1
package/src/db/sql/6-vss_index.sql +0 -7
package/src/db/sql/7-drop_embeddings_fk.sql +0 -23
package/src/db/sql/8-task_output.sql +0 -1
package/src/db/sql/9-source-type.sql +0 -1
package/src/tools/context/read-large-result.ts +0 -33
package/src/tools/dir/create.ts +0 -47
package/src/tools/dir/size.ts +0 -77
package/src/tools/dir/tree.ts +0 -124
package/src/tools/file/copy.ts +0 -73
package/src/tools/file/count-lines.ts +0 -54
package/src/tools/file/delete.ts +0 -83
package/src/tools/file/edit.ts +0 -76
package/src/tools/file/exists.ts +0 -33
package/src/tools/file/info.ts +0 -66
package/src/tools/file/move.ts +0 -66
package/src/tools/file/read.ts +0 -67
package/src/tools/file/write.ts +0 -58
package/src/tools/search/fuse.ts +0 -96
package/src/tools/search/index.ts +0 -127
package/src/tools/search/regexp.ts +0 -82
package/src/tools/search/semantic.ts +0 -167
package/src/{db → utils}/uuid.ts +0 -0

package/src/context/chunker.ts DELETED Viewed

@@ -1,275 +0,0 @@
-import Anthropic from "@anthropic-ai/sdk";
-import type { BotholomewConfig } from "../config/schemas.ts";
-import { logger } from "../utils/logger.ts";
-export interface Chunk {
-  index: number;
-  content: string;
-}
-const SHORT_CONTENT_THRESHOLD = 200;
-const LLM_TIMEOUT_MS = 10_000;
-const DEFAULT_OVERLAP_LINES = 2;
-// OpenAI's embedding endpoint caps inputs at 8192 tokens. The cl100k_base
-// tokenizer averages ~4 chars/token on plain English but can drop to ~2
-// chars/token on dense/code/non-ASCII content. We cap at 15k chars so even
-// at the worst-case ~2.5 chars/token (~6k tokens) we stay well under the
-// 8192-token limit, leaving headroom for the title/description prefix
-// prepended at embed time.
-const MAX_CHUNK_CHARS = 15_000;
-// Target size for deterministic fallback chunks. Smaller than MAX_CHUNK_CHARS
-// so a large doc produces multiple chunks of reasonable granularity when the
-// LLM chunker fails.
-const FALLBACK_TARGET_CHARS = 4_000;
-const CHUNKER_TOOL_NAME = "return_chunks";
-const CHUNKER_TOOL = {
-  name: CHUNKER_TOOL_NAME,
-  description:
-    "Return the chunk boundaries for this document. Each chunk should be a coherent semantic section.",
-  input_schema: {
-    type: "object" as const,
-    properties: {
-      chunks: {
-        type: "array",
-        description: "Array of chunk boundaries (1-based, inclusive)",
-        items: {
-          type: "object",
-          properties: {
-            start_line: {
-              type: "number",
-              description: "1-based start line (inclusive)",
-            },
-            end_line: {
-              type: "number",
-              description: "1-based end line (inclusive)",
-            },
-          },
-          required: ["start_line", "end_line"],
-        },
-      },
-    },
-    required: ["chunks"],
-  },
-};
-/**
- * Split text into pieces no larger than `maxChars`, preferring paragraph,
- * line, and finally hard-character boundaries.
- */
-function splitText(text: string, maxChars: number): string[] {
-  if (text.length <= maxChars) return [text];
-  // Try paragraph splits first.
-  const paragraphs = text.split(/\n\n+/);
-  if (paragraphs.length > 1) {
-    const out: string[] = [];
-    let buf = "";
-    for (const p of paragraphs) {
-      const candidate = buf ? `${buf}\n\n${p}` : p;
-      if (candidate.length <= maxChars) {
-        buf = candidate;
-      } else {
-        if (buf) out.push(buf);
-        if (p.length <= maxChars) {
-          buf = p;
-        } else {
-          out.push(...splitText(p, maxChars));
-          buf = "";
-        }
-      }
-    }
-    if (buf) out.push(buf);
-    return out;
-  }
-  // Fall back to line splits.
-  const lines = text.split("\n");
-  if (lines.length > 1) {
-    const out: string[] = [];
-    let buf = "";
-    for (const line of lines) {
-      const candidate = buf ? `${buf}\n${line}` : line;
-      if (candidate.length <= maxChars) {
-        buf = candidate;
-      } else {
-        if (buf) out.push(buf);
-        if (line.length <= maxChars) {
-          buf = line;
-        } else {
-          // Single line longer than maxChars — slice it.
-          for (let i = 0; i < line.length; i += maxChars) {
-            out.push(line.slice(i, i + maxChars));
-          }
-          buf = "";
-        }
-      }
-    }
-    if (buf) out.push(buf);
-    return out;
-  }
-  // Last resort: hard slice.
-  const out: string[] = [];
-  for (let i = 0; i < text.length; i += maxChars) {
-    out.push(text.slice(i, i + maxChars));
-  }
-  return out;
-}
-/**
- * Re-chunk any chunks larger than `maxChars`, preserving order and reindexing.
- */
-export function enforceMaxChunkSize(
-  chunks: Chunk[],
-  maxChars = MAX_CHUNK_CHARS,
-): Chunk[] {
-  const out: Chunk[] = [];
-  for (const c of chunks) {
-    if (c.content.length <= maxChars) {
-      out.push({ index: out.length, content: c.content });
-      continue;
-    }
-    for (const piece of splitText(c.content, maxChars)) {
-      out.push({ index: out.length, content: piece });
-    }
-  }
-  return out;
-}
-/**
- * Add overlapping lines from the end of each chunk to the start of the next.
- * Improves retrieval when concepts span chunk boundaries.
- */
-export function addOverlapToChunks(
-  chunks: Chunk[],
-  overlapLines = DEFAULT_OVERLAP_LINES,
-): Chunk[] {
-  if (chunks.length <= 1 || overlapLines <= 0) return chunks;
-  return chunks.map((c, i) => {
-    if (i === 0) return { ...c };
-    const prevChunk = chunks[i - 1];
-    if (!prevChunk) return { ...c };
-    const prevLines = prevChunk.content.split("\n");
-    const overlap = prevLines.slice(-overlapLines).join("\n");
-    return { ...c, content: `${overlap}\n${c.content}` };
-  });
-}
-export type LLMChunkerFn = (
-  content: string,
-  mimeType: string,
-  config: Required<BotholomewConfig>,
-) => Promise<Chunk[]>;
-/**
- * Deterministic fallback that splits content on paragraph / line /
- * hard-char boundaries. Used when the LLM chunker errors or times out.
- */
-export function chunkByTextSplit(
-  content: string,
-  targetChars = FALLBACK_TARGET_CHARS,
-): Chunk[] {
-  return splitText(content, targetChars).map((c, i) => ({
-    index: i,
-    content: c,
-  }));
-}
-/**
- * LLM-driven chunker that asks Claude to identify semantic boundaries.
- * Uses structured outputs via tool_use with forced tool_choice.
- */
-export async function chunkWithLLM(
-  content: string,
-  mimeType: string,
-  config: Required<BotholomewConfig>,
-): Promise<Chunk[]> {
-  const client = new Anthropic({ apiKey: config.anthropic_api_key });
-  const lines = content.split("\n");
-  const response = await Promise.race([
-    client.messages.create({
-      model: config.chunker_model,
-      max_tokens: 2048,
-      tools: [CHUNKER_TOOL],
-      tool_choice: { type: "tool", name: CHUNKER_TOOL_NAME },
-      messages: [
-        {
-          role: "user",
-          content: `You are a document chunker. Given the following ${mimeType} document with ${lines.length} lines, identify semantic chunk boundaries. Each chunk should be a coherent section (100-500 lines preferred). Cover all lines with no gaps.
-Document:
-${content}`,
-        },
-      ],
-    }),
-    new Promise<never>((_, reject) =>
-      setTimeout(
-        () => reject(new Error("LLM chunker timeout")),
-        LLM_TIMEOUT_MS,
-      ),
-    ),
-  ]);
-  // Extract the tool_use block
-  const toolBlock = response.content.find((b) => b.type === "tool_use");
-  if (!toolBlock || toolBlock.type !== "tool_use") {
-    throw new Error("LLM chunker returned no tool_use block");
-  }
-  const input = toolBlock.input as {
-    chunks: Array<{ start_line: number; end_line: number }>;
-  };
-  if (!Array.isArray(input.chunks) || input.chunks.length === 0) {
-    throw new Error("LLM chunker returned empty boundaries");
-  }
-  return input.chunks.map((b, i) => ({
-    index: i,
-    content: lines.slice(b.start_line - 1, b.end_line).join("\n"),
-  }));
-}
-/**
- * Chunk content using the LLM chunker, with a deterministic fallback
- * when the LLM call fails (timeout, empty boundaries, API error, …).
- * Short content (<200 chars) is returned as a single chunk.
- */
-export async function chunk(
-  content: string,
-  mimeType: string,
-  config: Required<BotholomewConfig>,
-  llmChunker: LLMChunkerFn = chunkWithLLM,
-): Promise<Chunk[]> {
-  if (content.length < SHORT_CONTENT_THRESHOLD) {
-    return [{ index: 0, content }];
-  }
-  if (!config.anthropic_api_key) {
-    throw new Error(
-      "Anthropic API key is required for chunking. Set anthropic_api_key in config.",
-    );
-  }
-  let chunks: Chunk[];
-  try {
-    chunks = await llmChunker(content, mimeType, config);
-  } catch (err) {
-    const msg = err instanceof Error ? err.message : String(err);
-    logger.warn(
-      `chunker: LLM chunking failed (${msg}); falling back to deterministic text split`,
-    );
-    chunks = chunkByTextSplit(content);
-  }
-  // Enforce a hard size cap before AND after overlap. The first pass handles
-  // oversize chunks from the LLM (common for docs with very long lines); the
-  // second pass handles the rare case where added overlap pushes a near-limit
-  // chunk over.
-  const sized = enforceMaxChunkSize(chunks);
-  const withOverlap = addOverlapToChunks(sized);
-  return enforceMaxChunkSize(withOverlap);
-}

package/src/context/embedder-impl.ts DELETED Viewed

@@ -1,100 +0,0 @@
-import { existsSync } from "node:fs";
-import { join } from "node:path";
-import {
-  env,
-  type FeatureExtractionPipeline,
-  pipeline,
-} from "@huggingface/transformers";
-import type { BotholomewConfig } from "../config/schemas.ts";
-import { logger } from "../utils/logger.ts";
-// We patch @huggingface/transformers to use onnxruntime-web (WASM) instead of
-// onnxruntime-node (which segfaults under Bun — oven-sh/bun#26081). By default
-// transformers.js then points the WASM loader at jsDelivr; pin it to the
-// onnxruntime-web copy already on disk so the chat path stays offline-capable.
-const ortWasm = env.backends.onnx?.wasm;
-if (ortWasm) {
-  ortWasm.wasmPaths = {
-    mjs: import.meta.resolve(
-      "onnxruntime-web/ort-wasm-simd-threaded.asyncify.mjs",
-    ),
-    wasm: import.meta.resolve(
-      "onnxruntime-web/ort-wasm-simd-threaded.asyncify.wasm",
-    ),
-  };
-}
-type EmbedFn = (
-  texts: string[],
-  config: Required<BotholomewConfig>,
-) => Promise<number[][]>;
-// Singleton pipeline keyed by model name. Loading the model is expensive
-// (downloads weights on first run, then ~hundreds of ms to instantiate the
-// ONNX runtime), so we hold one per model for the life of the process.
-const pipelinePromises = new Map<string, Promise<FeatureExtractionPipeline>>();
-export function setEmbeddingCacheDir(dir: string): void {
-  // Trailing separator matters: transformers.js builds paths as `${cacheDir}${rel}` (no separator).
-  env.cacheDir = dir.endsWith("/") ? dir : `${dir}/`;
-}
-function isModelCached(model: string): boolean {
-  if (!env.cacheDir) return false;
-  return existsSync(join(env.cacheDir, model));
-}
-async function getPipeline(model: string): Promise<FeatureExtractionPipeline> {
-  let p = pipelinePromises.get(model);
-  if (!p) {
-    if (isModelCached(model)) {
-      logger.debug(`Loading embedding model ${model}`);
-    } else {
-      logger.info(
-        `Loading embedding model ${model} (first run, downloading weights)`,
-      );
-    }
-    p = pipeline("feature-extraction", model);
-    pipelinePromises.set(model, p);
-  }
-  return p;
-}
-/**
- * Embed multiple texts using a local @huggingface/transformers feature-extraction
- * pipeline. Returns an array of L2-normalized float vectors with the model's
- * native dimension (must match `config.embedding_dimension`).
- */
-export async function embed(
-  texts: string[],
-  config: Required<BotholomewConfig>,
-): Promise<number[][]> {
-  if (texts.length === 0) return [];
-  const extractor = await getPipeline(config.embedding_model);
-  const output = await extractor(texts, { pooling: "mean", normalize: true });
-  const data = output.tolist() as number[][];
-  if (data[0] && data[0].length !== config.embedding_dimension) {
-    throw new Error(
-      `Embedding model ${config.embedding_model} returned ${data[0].length}-dim vectors, but embedding_dimension is set to ${config.embedding_dimension}. Update embedding_dimension in config and re-embed.`,
-    );
-  }
-  return data;
-}
-/**
- * Embed a single text string.
- */
-export async function embedSingle(
-  text: string,
-  config: Required<BotholomewConfig>,
-): Promise<number[]> {
-  const results = await embed([text], config);
-  const vec = results[0];
-  if (!vec) throw new Error("embed returned empty results");
-  return vec;
-}
-export type { EmbedFn };

package/src/context/embedder.ts DELETED Viewed

@@ -1,9 +0,0 @@
-// Re-exports the real embedder implementation from `embedder-impl.ts`.
-//
-// Why the indirection: tests that touch code importing from this file (e.g.,
-// `src/chat/agent.ts`, `src/worker/prompt.ts`) use Bun's `mock.module()` to
-// stub the embedder so they don't hit OpenAI. Bun's module mocks are
-// process-wide and can leak into subsequent test files. By keeping the real
-// implementation in `embedder-impl.ts`, `test/context/embedder.test.ts` can
-// import the real embedder from a path that nothing mocks.
-export * from "./embedder-impl.ts";

package/src/context/fetcher-errors.ts DELETED Viewed

@@ -1,8 +0,0 @@
-export class FetchFailureError extends Error {
-  readonly userMessage: string;
-  constructor(message: string) {
-    super(message);
-    this.name = "FetchFailureError";
-    this.userMessage = message;
-  }
-}