mikoshi 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,90 @@
1
+ import fs from "node:fs";
2
+ import path from "node:path";
3
+
4
// Default ignore patterns applied to every repository scan, in addition to
// whatever is listed in the repo's own .gitignore.
// Entries ending in "/" are directory prefixes, entries containing "*" are
// glob patterns, all others are exact file names.
const COMMON_IGNORES = [
  // VCS, dependency, and build-output directories.
  ".git/",
  "node_modules/",
  "dist/",
  "build/",
  "coverage/",
  ".pytest_cache/",
  ".venv/",
  ".mypy_cache/",
  ".ruff_cache/",
  // Editor / IDE metadata.
  ".idea/",
  ".vscode/",
  // Compiled and native artifacts.
  "*.pyc",
  "*.pyo",
  "*.pyd",
  "*.so",
  "*.dylib",
  "*.dll",
  "*.exe",
  // OS cruft, logs, temp and swap files.
  "*.DS_Store",
  "*.log",
  "*.tmp",
  "*.swp",
  "*.swo",
  // Archives.
  "*.zip",
  "*.tar",
  "*.gz",
  "*.tgz",
  // Binary media — never useful for text indexing.
  "*.jpg",
  "*.jpeg",
  "*.png",
  "*.gif",
  "*.bmp",
  "*.ico",
  "*.mp3",
  "*.mp4",
  "*.mov",
  "*.avi",
  "*.pdf",
];
44
+
45
// Compiles a simple glob ("*" matches any run of characters, "?" matches a
// single character) into an anchored RegExp.  All other regex
// metacharacters are escaped literally.
function globToRegex(glob) {
  let pattern = "";
  for (const ch of glob) {
    if (ch === "*") {
      pattern += ".*";
    } else if (ch === "?") {
      pattern += ".";
    } else if (".+^${}()|[]\\".includes(ch)) {
      pattern += `\\${ch}`;
    } else {
      pattern += ch;
    }
  }
  return new RegExp(`^${pattern}$`);
}
52
+
53
export class IgnoreMatcher {
  /**
   * Loads ignore patterns for a repository: the built-in COMMON_IGNORES list
   * plus any patterns found in `<repoRoot>/.gitignore`.
   * @param {string} repoRoot - Path to the repository root.
   */
  constructor(repoRoot) {
    this.repoRoot = repoRoot;
    this.patterns = [];
    // Compiled glob regexes keyed by pattern, so each glob is compiled once
    // instead of on every ignores() call.
    this._regexCache = new Map();
    this._loadPatterns();
  }

  // Collects non-empty, non-comment patterns from COMMON_IGNORES and the
  // repository's .gitignore (if present).
  _loadPatterns() {
    const addPattern = (pattern) => {
      const trimmed = pattern.trim();
      if (!trimmed || trimmed.startsWith("#")) return;
      this.patterns.push(trimmed);
    };

    COMMON_IGNORES.forEach(addPattern);

    const gitignorePath = path.join(this.repoRoot, ".gitignore");
    if (fs.existsSync(gitignorePath)) {
      const content = fs.readFileSync(gitignorePath, "utf8");
      content.split(/\r?\n/).forEach(addPattern);
    }
  }

  // Tests a glob pattern against relPath, caching the compiled RegExp.
  _matchesGlob(pattern, relPath) {
    this._regexCache ??= new Map();
    let regex = this._regexCache.get(pattern);
    if (!regex) {
      const escaped = pattern
        .replace(/[.+^${}()|[\]\\]/g, "\\$&")
        .replace(/\*/g, ".*")
        .replace(/\?/g, ".");
      regex = new RegExp(`^${escaped}$`);
      this._regexCache.set(pattern, regex);
    }
    return regex.test(relPath);
  }

  /**
   * Returns true when relPath (slash-separated, relative to the repo root)
   * matches any loaded pattern.
   * @param {string} relPath
   * @returns {boolean}
   */
  ignores(relPath) {
    for (const pattern of this.patterns) {
      if (pattern.endsWith("/")) {
        // Directory pattern: match at the top level OR at any depth.
        // (Fix: previously "node_modules/" did not ignore
        // "pkg/node_modules/..." because only startsWith was checked.)
        if (relPath.startsWith(pattern) || relPath.includes(`/${pattern}`)) return true;
        continue;
      }
      if (pattern.includes("*")) {
        if (this._matchesGlob(pattern, relPath)) return true;
        continue;
      }
      if (relPath === pattern || relPath.endsWith(`/${pattern}`)) return true;
    }
    return false;
  }
}
@@ -0,0 +1,39 @@
1
+ import fs from "node:fs";
2
+ import path from "node:path";
3
+
4
/**
 * Heuristic binary check: a buffer is treated as binary when a NUL byte
 * appears within its first 8000 bytes.
 * @param {Buffer} buffer
 * @returns {boolean}
 */
function looksBinary(buffer) {
  const sample = buffer.subarray(0, 8000);
  for (const byte of sample) {
    if (byte === 0) return true;
  }
  return false;
}

/**
 * Recursively collects indexable file paths under `dir` into `out`.
 * Skips ignored paths, files larger than maxBytes, binary-looking files,
 * and anything that cannot be read.
 */
function scanDir(dir, repoRoot, matcher, maxBytes, out) {
  let entries;
  try {
    // Fix: an unreadable directory (permissions, TOCTOU removal) should not
    // abort the entire scan — just skip it.
    entries = fs.readdirSync(dir, { withFileTypes: true });
  } catch {
    return;
  }
  for (const entry of entries) {
    const fullPath = path.join(dir, entry.name);
    // Normalize to slash-separated repo-relative paths for the matcher.
    const relPath = path.relative(repoRoot, fullPath).split(path.sep).join("/");
    if (matcher.ignores(relPath)) continue;

    if (entry.isDirectory()) {
      scanDir(fullPath, repoRoot, matcher, maxBytes, out);
    } else if (entry.isFile()) {
      try {
        const stat = fs.statSync(fullPath);
        if (stat.size > maxBytes) continue;
        const data = fs.readFileSync(fullPath);
        if (looksBinary(data)) continue;
        out.push(fullPath);
      } catch {
        // Unreadable file: skip, best-effort scan.
        continue;
      }
    }
  }
}

/**
 * Scans a repository tree and returns the absolute paths of text files
 * worth indexing.
 * @param {string} repoRoot - Repository root directory.
 * @param {{ignores: (relPath: string) => boolean}} matcher - Ignore matcher.
 * @param {number} maxBytes - Files larger than this are skipped.
 * @returns {string[]}
 */
export function scanRepo(repoRoot, matcher, maxBytes) {
  const files = [];
  scanDir(repoRoot, repoRoot, matcher, maxBytes, files);
  return files;
}
@@ -0,0 +1,82 @@
1
+ import fs from "node:fs";
2
+ import path from "node:path";
3
+ import { sha256Text } from "../hashing.js";
4
+
5
export class IndexStore {
  /**
   * On-disk store for one repository's index: metadata (meta.json), chunk
   * records (chunks.jsonl) and embedding vectors (embeddings.bin), all kept
   * under `<indexRoot>/<repo_id>/`.
   * @param {string} repoRoot - Path to the repository being indexed.
   * @param {string} indexRoot - Root directory holding all index stores.
   */
  constructor(repoRoot, indexRoot) {
    this.repo_root = repoRoot;
    this.repo_id = repoIdForPath(repoRoot);
    this.store_dir = path.join(indexRoot, this.repo_id);
    this.meta_path = path.join(this.store_dir, "meta.json");
    this.chunks_path = path.join(this.store_dir, "chunks.jsonl");
    this.embeddings_path = path.join(this.store_dir, "embeddings.bin");
  }

  // True when both the metadata and chunk files are present.
  exists() {
    return fs.existsSync(this.meta_path) && fs.existsSync(this.chunks_path);
  }

  // Creates the store directory (and parents) if missing.
  ensureDir() {
    fs.mkdirSync(this.store_dir, { recursive: true });
  }

  // Removes the whole store directory; a later index run recreates it.
  clear() {
    if (fs.existsSync(this.store_dir)) {
      fs.rmSync(this.store_dir, { recursive: true, force: true });
    }
  }

  /**
   * Loads the metadata object, or null when missing/corrupt.
   * @returns {object|null}
   */
  loadMeta() {
    if (!fs.existsSync(this.meta_path)) return null;
    try {
      const data = JSON.parse(fs.readFileSync(this.meta_path, "utf8"));
      return data;
    } catch {
      return null;
    }
  }

  // Persists the metadata object as pretty-printed JSON.
  saveMeta(meta) {
    this.ensureDir();
    fs.writeFileSync(this.meta_path, JSON.stringify(meta, null, 2));
  }

  /**
   * Loads chunk records from the JSONL file, skipping blank or corrupt lines.
   * @returns {object[]}
   */
  loadChunks() {
    if (!fs.existsSync(this.chunks_path)) return [];
    const lines = fs.readFileSync(this.chunks_path, "utf8").split(/\r?\n/);
    const chunks = [];
    for (const line of lines) {
      if (!line.trim()) continue;
      try {
        chunks.push(JSON.parse(line));
      } catch {
        continue;
      }
    }
    return chunks;
  }

  // Writes chunks as JSONL (one JSON object per line, trailing newline).
  saveChunks(chunks) {
    this.ensureDir();
    const lines = chunks.map((chunk) => JSON.stringify(chunk)).join("\n");
    fs.writeFileSync(this.chunks_path, lines + (lines ? "\n" : ""));
  }

  /**
   * Loads the embedding matrix as a flat Float32Array.
   * @param {object} meta - Index metadata (supplies embedding_dim).
   * @returns {{embeddings: Float32Array, dim: number}|null}
   */
  loadEmbeddings(meta) {
    if (!fs.existsSync(this.embeddings_path)) return null;
    const buffer = fs.readFileSync(this.embeddings_path);
    // Copy into a fresh ArrayBuffer.  Fix: a Node Buffer's byteOffset may
    // not be 4-byte aligned, and a trailing partial float (truncated file)
    // would make `new Float32Array(buffer.buffer, offset, len/4)` throw.
    const floatCount = Math.floor(buffer.byteLength / 4);
    const embeddings = new Float32Array(
      buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + floatCount * 4)
    );
    const dim = meta?.embedding_dim || 0;
    return { embeddings, dim };
  }

  // Persists a flat Float32Array of embeddings as raw little-endian bytes.
  saveEmbeddings(embeddings) {
    this.ensureDir();
    // Fix: respect the view's offset/length — Buffer.from(embeddings.buffer)
    // alone would dump the whole underlying ArrayBuffer for subarray views.
    const buffer = Buffer.from(embeddings.buffer, embeddings.byteOffset, embeddings.byteLength);
    fs.writeFileSync(this.embeddings_path, buffer);
  }
}
79
+
80
/**
 * Stable 16-hex-char identifier for a repository, derived from the SHA-256
 * of its absolute path.
 * @param {string} repoRoot
 * @returns {string}
 */
export function repoIdForPath(repoRoot) {
  const absolute = path.resolve(repoRoot);
  const digest = sha256Text(absolute);
  return digest.slice(0, 16);
}
@@ -0,0 +1,198 @@
1
+ import fs from "node:fs";
2
+ import path from "node:path";
3
+ import { chunkText } from "../chunking.js";
4
+ import { loadConfig } from "../config.js";
5
+ import { sha256Bytes, sha256Text } from "../hashing.js";
6
+ import { IgnoreMatcher } from "../ignore.js";
7
+ import { IndexStore } from "./index_store.js";
8
+ import { scanRepo } from "./file_scanner.js";
9
+ import { getEmbeddingsProvider, normalizeEmbeddings } from "../retrieval/semantic.js";
10
+ import { Timer } from "../utils/timer.js";
11
+
12
// Reads a file once and derives everything the indexer needs from it: the
// content hash, the decoded UTF-8 text, and the slash-separated
// repo-relative path.
function prepareFile(filePath, repoRoot) {
  const data = fs.readFileSync(filePath);
  const relpath = path
    .relative(repoRoot, filePath)
    .split(path.sep)
    .join("/");
  return {
    path: filePath,
    relpath,
    file_hash: sha256Bytes(data),
    text: data.toString("utf8"),
  };
}
19
+
20
// Splits a file's text into line spans and wraps each span in a chunk
// record with a content-derived stable id.  vector_idx is assigned later
// (see withVectorIdx); it starts as null.
function chunksFromText(relpath, fileHash, text, maxLines, overlap) {
  const chunks = [];
  for (const span of chunkText(text, maxLines, overlap)) {
    const chunkId = sha256Text(`${relpath}:${span.start_line}:${span.end_line}:${span.text}`);
    chunks.push({
      id: chunkId,
      relpath,
      start_line: span.start_line,
      end_line: span.end_line,
      text: span.text,
      file_hash: fileHash,
      vector_idx: null,
    });
  }
  return chunks;
}
35
+
36
// Returns shallow copies of the chunks with vector_idx set to each chunk's
// position in the array (its row in the embedding matrix).
function withVectorIdx(chunks) {
  const out = [];
  for (let i = 0; i < chunks.length; i += 1) {
    out.push({ ...chunks[i], vector_idx: i });
  }
  return out;
}
39
+
40
// Indexes the previous run's chunks for reuse: groups them by file, and —
// when previous embeddings exist — records each chunk's row offset in the
// embedding matrix by chunk id.
function reuseChunks(prevChunks, prevEmbeddings) {
  const chunksByFile = new Map();
  const embeddingById = new Map();

  if (prevEmbeddings) {
    for (const chunk of prevChunks) {
      const offset = chunk.vector_idx;
      if (offset == null || offset < 0) continue;
      embeddingById.set(chunk.id, offset);
    }
  }

  for (const chunk of prevChunks) {
    const bucket = chunksByFile.get(chunk.relpath);
    if (bucket) bucket.push(chunk);
    else chunksByFile.set(chunk.relpath, [chunk]);
  }

  return { chunksByFile, embeddingById };
}
59
+
60
/**
 * Builds or incrementally refreshes the search index for a repository.
 * Chunk records and embedding rows are reused for files whose content hash
 * is unchanged; only the chunks of new or modified files are re-embedded.
 * @param {string} repoPath - Repository to index.
 * @param {object|null} configOverride - Optional pre-loaded config.
 * @returns {Promise<{repo_id: string, chunks_indexed: number, took_ms: number}>}
 */
export async function indexRepo(repoPath, configOverride = null) {
  const config = configOverride || loadConfig();
  const repoRoot = path.resolve(repoPath);

  const matcher = new IgnoreMatcher(repoRoot);
  const store = new IndexStore(repoRoot, config.index_root);

  // Previous index state, used for incremental reuse.
  const prevMeta = store.loadMeta();
  const prevChunks = store.loadChunks();
  const prevEmbeddingsData = prevMeta ? store.loadEmbeddings(prevMeta) : null;
  const prevFiles = prevMeta?.files || {};

  // Any change to chunking or embedding settings invalidates the old index.
  let reset = false;
  if (prevMeta) {
    if (
      prevMeta.chunk_lines !== config.chunk_lines ||
      prevMeta.chunk_overlap !== config.chunk_overlap ||
      prevMeta.max_bytes !== config.max_bytes ||
      prevMeta.embedding_provider !== config.embeddings.provider ||
      prevMeta.model !== config.embeddings.model
    ) {
      reset = true;
    }
  }

  const prevChunksSafe = reset ? [] : prevChunks;
  const prevEmbeddingsSafe = reset ? null : prevEmbeddingsData;
  const prevFilesSafe = reset ? {} : prevFiles;

  const { chunksByFile, embeddingById } = reuseChunks(prevChunksSafe, prevEmbeddingsSafe?.embeddings);

  const timer = new Timer();
  const files = scanRepo(repoRoot, matcher, config.max_bytes);
  const indexedFiles = [];
  for (const filePath of files) {
    try {
      indexedFiles.push(prepareFile(filePath, repoRoot));
    } catch {
      // Unreadable files are skipped rather than failing the whole run.
      continue;
    }
  }

  const fileHashes = {};
  for (const item of indexedFiles) fileHashes[item.relpath] = item.file_hash;

  const newChunks = [];
  // embeddingSlots[i]: previous embedding row to reuse for newChunks[i], or null.
  const embeddingSlots = [];
  // Texts still needing embedding, and the newChunks positions they fill.
  const pendingTexts = [];
  const pendingIndices = [];

  for (const item of indexedFiles) {
    const relpath = item.relpath;
    const unchanged = prevFilesSafe[relpath] === item.file_hash;
    if (unchanged && chunksByFile.has(relpath) && embeddingById.size) {
      const existing = chunksByFile.get(relpath);
      // Reuse only when every chunk of the file has a stored embedding.
      if (existing.every((chunk) => embeddingById.has(chunk.id))) {
        for (const chunk of existing) {
          newChunks.push(chunk);
          embeddingSlots.push(embeddingById.get(chunk.id));
        }
        continue;
      }
    }

    const chunks = chunksFromText(
      relpath,
      item.file_hash,
      item.text,
      config.chunk_lines,
      config.chunk_overlap
    );
    for (const chunk of chunks) {
      pendingIndices.push(newChunks.length);
      newChunks.push(chunk);
      embeddingSlots.push(null);
      pendingTexts.push(chunk.text);
    }
  }

  let provider = null;
  let pendingEmbeddings = [];
  if (pendingTexts.length) {
    provider = getEmbeddingsProvider(config);
    pendingEmbeddings = await provider.embedTexts(pendingTexts);
  }

  // Embedding dimensionality, in decreasing order of trust; the final 1 is
  // a degenerate fallback when nothing was ever embedded.
  const dimension =
    provider?.dimension ||
    prevEmbeddingsSafe?.dim ||
    prevMeta?.embedding_dim ||
    (pendingEmbeddings[0] ? pendingEmbeddings[0].length : 1);

  // Slot position -> order in pendingEmbeddings.  Fix: replaces a linear
  // pendingIndices.indexOf(i) scan per chunk (accidental O(n^2) overall).
  const pendingOrder = new Map();
  for (let order = 0; order < pendingIndices.length; order += 1) {
    pendingOrder.set(pendingIndices[order], order);
  }

  const allEmbeddings = new Float32Array(newChunks.length * dimension);
  for (let i = 0; i < embeddingSlots.length; i += 1) {
    const slot = embeddingSlots[i];
    let vector = null;
    if (slot !== null && prevEmbeddingsSafe) {
      const offset = slot * dimension;
      vector = prevEmbeddingsSafe.embeddings.subarray(offset, offset + dimension);
    }
    if (!vector) {
      const order = pendingOrder.get(i);
      if (order !== undefined) vector = pendingEmbeddings[order];
    }
    if (!vector) {
      // Defensive: leave a zero vector if neither reuse nor a fresh
      // embedding produced a row for this chunk.
      vector = new Float32Array(dimension);
    }
    allEmbeddings.set(vector, i * dimension);
  }

  normalizeEmbeddings(allEmbeddings, dimension);
  const finalChunks = withVectorIdx(newChunks);

  const now = new Date().toISOString();
  const meta = {
    repo_id: store.repo_id,
    repo_path: repoRoot,
    created_at: prevMeta?.created_at || now,
    updated_at: now,
    embedding_provider: config.embeddings.provider,
    model: config.embeddings.model,
    embedding_dim: dimension,
    chunk_lines: config.chunk_lines,
    chunk_overlap: config.chunk_overlap,
    max_bytes: config.max_bytes,
    files: fileHashes,
    chunks: finalChunks.length,
  };

  store.saveChunks(finalChunks);
  store.saveEmbeddings(allEmbeddings);
  store.saveMeta(meta);

  return {
    repo_id: store.repo_id,
    chunks_indexed: finalChunks.length,
    took_ms: timer.ms,
  };
}
@@ -0,0 +1,121 @@
1
+ #!/usr/bin/env node
2
+ import readline from "node:readline";
3
+ import { indexRepo } from "../indexing/indexer.js";
4
+ import { searchRepo } from "../retrieval/hybrid.js";
5
+ import { loadConfig } from "../config.js";
6
+ import { IndexStore } from "../indexing/index_store.js";
7
+ import path from "node:path";
8
+
9
// Name under which the single tool is advertised to MCP clients.
const TOOL_NAME = "codebase-retrieval";
// MCP tool descriptor returned from tools/list.  `path` is the repository
// to search, `query` the natural-language/code query, and `k` the number of
// results to return (default 8).
const TOOL_SCHEMA = {
  name: TOOL_NAME,
  description: "Index (if needed) and search a local codebase.",
  inputSchema: {
    type: "object",
    properties: {
      path: { type: "string" },
      query: { type: "string" },
      k: { type: "integer", default: 8 },
    },
    required: ["path", "query"],
  },
  outputSchema: {
    type: "object",
    properties: {
      result: { type: "array" },
    },
    required: ["result"],
  },
};
30
+
31
+ function send(message) {
32
+ process.stdout.write(JSON.stringify(message) + "\n");
33
+ }
34
+
35
+ function sendError(id, code, message) {
36
+ send({ jsonrpc: "2.0", id, error: { code, message } });
37
+ }
38
+
39
+ async function handleToolCall(params) {
40
+ const { name, arguments: args } = params || {};
41
+ if (name !== TOOL_NAME) {
42
+ return { isError: true, content: [{ type: "text", text: `Unknown tool: ${name}` }] };
43
+ }
44
+ const repoPath = args?.path;
45
+ const query = args?.query;
46
+ const k = args?.k ?? 8;
47
+ if (!repoPath || !query) {
48
+ return {
49
+ isError: true,
50
+ content: [{ type: "text", text: "Missing required arguments." }],
51
+ };
52
+ }
53
+
54
+ const config = loadConfig();
55
+ const repoRoot = path.resolve(repoPath);
56
+ const store = new IndexStore(repoRoot, config.index_root);
57
+ if (!store.exists()) {
58
+ await indexRepo(repoRoot, config);
59
+ } else {
60
+ await indexRepo(repoRoot, config);
61
+ }
62
+ const results = await searchRepo(repoRoot, query, k);
63
+ return {
64
+ isError: false,
65
+ content: [{ type: "text", text: JSON.stringify(results) }],
66
+ structuredContent: { result: results },
67
+ };
68
+ }
69
+
70
+ const rl = readline.createInterface({ input: process.stdin, crlfDelay: Infinity });
71
+
72
+ rl.on("line", async (line) => {
73
+ if (!line.trim()) return;
74
+ let payload;
75
+ try {
76
+ payload = JSON.parse(line);
77
+ } catch {
78
+ return;
79
+ }
80
+
81
+ const { id, method, params } = payload;
82
+
83
+ try {
84
+ if (method === "initialize") {
85
+ send({
86
+ jsonrpc: "2.0",
87
+ id,
88
+ result: {
89
+ protocolVersion: params?.protocolVersion || "2024-11-05",
90
+ capabilities: { tools: {} },
91
+ serverInfo: { name: "mikoshi", version: "0.1.0" },
92
+ },
93
+ });
94
+ return;
95
+ }
96
+
97
+ if (method === "tools/list") {
98
+ send({ jsonrpc: "2.0", id, result: { tools: [TOOL_SCHEMA] } });
99
+ return;
100
+ }
101
+
102
+ if (method === "tools/call") {
103
+ const result = await handleToolCall(params);
104
+ send({ jsonrpc: "2.0", id, result });
105
+ return;
106
+ }
107
+
108
+ if (method === "ping") {
109
+ send({ jsonrpc: "2.0", id, result: {} });
110
+ return;
111
+ }
112
+
113
+ if (id !== undefined) {
114
+ sendError(id, -32601, `Method not found: ${method}`);
115
+ }
116
+ } catch (err) {
117
+ if (id !== undefined) {
118
+ sendError(id, -32000, err?.message || "Internal error");
119
+ }
120
+ }
121
+ });
@@ -0,0 +1,85 @@
1
+ import path from "node:path";
2
+ import { loadConfig } from "../config.js";
3
+ import { IndexStore } from "../indexing/index_store.js";
4
+ import { LexicalIndex } from "./lexical.js";
5
+ import { rerank } from "./rerank.js";
6
+ import { getEmbeddingsProvider, SemanticSearcher } from "./semantic.js";
7
+
8
// Min-max normalizes an array of [index, score] pairs into a Map of
// index -> value in [0, 1].  When all scores are equal, every entry maps
// to 1.  An empty input yields an empty Map.
function normalize(scores) {
  const out = new Map();
  if (scores.length === 0) return out;
  let min = Infinity;
  let max = -Infinity;
  for (const [, score] of scores) {
    if (score < min) min = score;
    if (score > max) max = score;
  }
  const range = max - min;
  for (const [idx, score] of scores) {
    out.set(idx, range === 0 ? 1.0 : (score - min) / range);
  }
  return out;
}

/**
 * Blends lexical and semantic hit lists into a single score map.
 * Each list is min-max normalized first; alpha weights the lexical side and
 * (1 - alpha) the semantic side.
 * @param {Array<[number, number]>} lexical - [chunkIdx, score] pairs.
 * @param {Array<[number, number]>} semantic - [chunkIdx, score] pairs.
 * @param {number} alpha - Lexical weight in [0, 1].
 * @returns {Map<number, number>}
 */
export function mergeScores(lexical, semantic, alpha = 0.5) {
  const combined = new Map();
  const accumulate = (normalized, weight) => {
    for (const [idx, score] of normalized.entries()) {
      combined.set(idx, (combined.get(idx) || 0) + weight * score);
    }
  };
  accumulate(normalize(lexical), alpha);
  accumulate(normalize(semantic), 1 - alpha);
  return combined;
}
34
+
35
// Returns the k highest-scoring [index, score] entries, best first.
function topK(combined, k) {
  const entries = Array.from(combined.entries());
  entries.sort((a, b) => b[1] - a[1]);
  return entries.slice(0, k);
}
38
+
39
/**
 * Builds a short preview of a chunk: the first maxLines lines, capped at
 * maxChars characters with a trailing "..." marker when truncated.
 * @param {string} text
 * @param {number} maxLines
 * @param {number} maxChars
 * @returns {string}
 */
export function makeSnippet(text, maxLines = 6, maxChars = 400) {
  const firstLines = text.split(/\r?\n/).slice(0, maxLines);
  const joined = firstLines.join("\n");
  if (joined.length <= maxChars) return joined;
  return `${joined.slice(0, maxChars - 3)}...`;
}
45
+
46
/**
 * Runs lexical and semantic retrieval over an expanded candidate pool,
 * blends the two rankings, reranks, and returns the top k hits.
 * @returns {Promise<Array<[number, number]>>} [chunkIdx, score] pairs.
 */
export async function hybridSearch(query, chunks, lexical, semantic, k = 8, alpha = 0.5) {
  if (chunks.length === 0) return [];
  // Over-fetch so the blend/rerank stage has candidates to choose from.
  const poolSize = Math.max(k * 3, k);
  const lexicalHits = lexical.search(query, poolSize);
  const semanticHits = await semantic.search(query, poolSize);
  const merged = topK(mergeScores(lexicalHits, semanticHits, alpha), poolSize);
  const reranked = rerank(query, merged, chunks);
  return reranked.slice(0, k);
}
56
+
57
/**
 * End-to-end search over a previously indexed repository.
 * @param {string} repoPath - Repository root.
 * @param {string} query - Search query.
 * @param {number} k - Maximum number of hits to return.
 * @throws when the repo was never indexed, when the index was built with
 *   different embedding settings, or when the embedding data is missing.
 */
export async function searchRepo(repoPath, query, k = 8) {
  const config = loadConfig();
  const repoRoot = path.resolve(repoPath);
  const store = new IndexStore(repoRoot, config.index_root);

  const meta = store.loadMeta();
  if (!meta) throw new Error("Repository has not been indexed yet.");

  const sameProvider = meta.embedding_provider === config.embeddings.provider;
  const sameModel = meta.model === config.embeddings.model;
  if (!sameProvider || !sameModel) {
    throw new Error("Index was built with different embeddings settings. Re-index the repository.");
  }

  const chunks = store.loadChunks();
  const embeddingsData = store.loadEmbeddings(meta);
  if (!embeddingsData) throw new Error("Index data missing. Re-run indexing.");
  if (!chunks.length) return [];

  const lexical = new LexicalIndex(chunks.map((chunk) => chunk.text));
  const semantic = new SemanticSearcher(
    embeddingsData.embeddings,
    embeddingsData.dim,
    getEmbeddingsProvider(config)
  );

  const hits = await hybridSearch(query, chunks, lexical, semantic, k);
  return hits.map(([idx, score]) => {
    const chunk = chunks[idx];
    return {
      relpath: chunk.relpath,
      start_line: chunk.start_line,
      end_line: chunk.end_line,
      score: Number(score),
      snippet: makeSnippet(chunk.text),
    };
  });
}
@@ -0,0 +1,53 @@
1
// Lowercases and splits on runs of characters outside [a-z0-9_]; drops
// empty tokens.
function tokenize(text) {
  return text.toLowerCase().split(/[^a-z0-9_]+/).filter(Boolean);
}

/**
 * Tiny in-memory BM25 index (k1 = 1.2, b = 0.75) over an array of strings.
 */
export class LexicalIndex {
  /**
   * @param {string[]} documents - One string per chunk/document.
   */
  constructor(documents) {
    this.documents = documents;
    this.docTokens = documents.map(tokenize);
    this.docLengths = this.docTokens.map((tokens) => tokens.length);
    this.avgDocLength =
      this.docLengths.reduce((a, b) => a + b, 0) / (this.docLengths.length || 1);

    // Per-document term frequencies.  Fix: precomputed so search() no
    // longer rescans every token of every document for every query term
    // (previously O(queryTerms * totalTokens) per search).
    this.docTermFreq = this.docTokens.map((tokens) => {
      const freq = new Map();
      for (const token of tokens) {
        freq.set(token, (freq.get(token) || 0) + 1);
      }
      return freq;
    });

    // Document frequency: number of documents containing each term.
    this.termDocFreq = new Map();
    for (const freq of this.docTermFreq) {
      for (const token of freq.keys()) {
        this.termDocFreq.set(token, (this.termDocFreq.get(token) || 0) + 1);
      }
    }
  }

  /**
   * BM25 search; returns up to k [docIndex, score] pairs, best first.
   * Documents with zero score are omitted.
   * @param {string} query
   * @param {number} k
   * @returns {Array<[number, number]>}
   */
  search(query, k = 8) {
    const tokens = tokenize(query);
    const scores = new Array(this.documents.length).fill(0);
    const N = this.documents.length;
    const k1 = 1.2;
    const b = 0.75;

    for (const term of tokens) {
      const df = this.termDocFreq.get(term) || 0.5; // smoothed when unseen
      const idf = Math.log(1 + (N - df + 0.5) / (df + 0.5));
      for (let i = 0; i < this.docTermFreq.length; i += 1) {
        const tf = this.docTermFreq[i].get(term) || 0;
        if (!tf) continue;
        const denom = tf + k1 * (1 - b + (b * this.docLengths[i]) / this.avgDocLength);
        scores[i] += idf * ((tf * (k1 + 1)) / denom);
      }
    }

    const results = [];
    for (let i = 0; i < scores.length; i += 1) {
      if (scores[i] > 0) results.push([i, scores[i]]);
    }
    results.sort((a, b) => b[1] - a[1]);
    return results.slice(0, k);
  }
}
+ }
@@ -0,0 +1,3 @@
1
/**
 * Reranking hook for hybrid search results.  Currently the identity
 * function: the merged candidate list is returned unchanged.  The query and
 * chunks parameters are accepted so future implementations can use them.
 * @param {string} _query
 * @param {Array<[number, number]>} merged - [chunkIdx, score] candidates.
 * @param {object[]} _chunks
 * @returns {Array<[number, number]>}
 */
export function rerank(_query, merged, _chunks) {
  return merged;
}