opencode-semantic-search 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.ts ADDED
@@ -0,0 +1,138 @@
1
+ import type { Plugin } from "@opencode-ai/plugin";
2
+ import { loadConfig } from "./src/config";
3
+ import { deltaSync } from "./src/indexer/delta";
4
+ import { indexSingleFile } from "./src/indexer/incremental";
5
+ import { createLogger } from "./src/logger";
6
+ import { applyDeltaProgress, clearIndexingProgress, createRuntime, setIncrementalIndexingProgress } from "./src/runtime";
7
+ import { showIndexingCompleteToast } from "./src/tui_toast";
8
+ import { createDiagnosticBundleTool } from "./src/tools/diagnostic_bundle";
9
+ import { createIndexStatusTool } from "./src/tools/index_status";
10
+ import { createReindexTool } from "./src/tools/reindex";
11
+ import { createSemanticSearchTool } from "./src/tools/semantic_search";
12
+ import { createSmartGrepTool } from "./src/tools/smart_grep";
13
+
14
+ async function runLockedIndexing(runtime: Awaited<ReturnType<typeof createRuntime>>, operation: () => Promise<void>): Promise<void> {
15
+ if (runtime.indexingLock) {
16
+ await runtime.logger.debug("indexing", { message: "Skipped because indexing lock is active" });
17
+ return;
18
+ }
19
+ runtime.indexingLock = true;
20
+ try {
21
+ await operation();
22
+ } finally {
23
+ runtime.indexingLock = false;
24
+ }
25
+ }
26
+
27
+ export const OpenCodeSemanticSearchPlugin: Plugin = async ({ client, worktree, directory }) => {
28
+ const config = await loadConfig(worktree);
29
+ const logger = createLogger({
30
+ enabled: config.logging.enabled,
31
+ level: config.logging.level,
32
+ verbosePaths: config.logging.verbose_paths ?? [],
33
+ logFile: config.logging.log_file,
34
+ client
35
+ });
36
+ const runtime = await createRuntime(worktree, config, logger);
37
+ runtime.opencodeClient = client;
38
+ runtime.projectDirectory = directory;
39
+
40
+ await logger.info("plugin", {
41
+ message: "Plugin initialized",
42
+ extra: {
43
+ provider: runtime.config.embedding.provider,
44
+ model: runtime.config.embedding.model
45
+ }
46
+ });
47
+
48
+ return {
49
+ tool: {
50
+ semantic_search: createSemanticSearchTool(runtime),
51
+ index_status: createIndexStatusTool(runtime),
52
+ reindex: createReindexTool(runtime),
53
+ grep: createSmartGrepTool(runtime),
54
+ diagnostic_bundle: createDiagnosticBundleTool(runtime),
55
+ },
56
+ event: async ({ event }) => {
57
+ if (event.type === "session.created") {
58
+ const startedAt = Date.now();
59
+ await runtime.logger.info("session.created", { message: "Delta sync started" });
60
+ await runLockedIndexing(runtime, async () => {
61
+ await deltaSync(worktree, runtime.store, runtime.embedder, runtime.config, {
62
+ logger: runtime.logger,
63
+ onProgress: (update) => applyDeltaProgress(runtime, update, "background"),
64
+ });
65
+ });
66
+ const elapsedMs = Date.now() - startedAt;
67
+ const statsAfterSync = runtime.store.stats();
68
+ await runtime.logger.info("session.created", {
69
+ message: "Delta sync finished",
70
+ extra: { elapsedMs }
71
+ });
72
+ void showIndexingCompleteToast(runtime, {
73
+ title: "Semantic index",
74
+ files: statsAfterSync.files,
75
+ chunks: statsAfterSync.chunks,
76
+ elapsedMs,
77
+ failedFiles: runtime.indexingProgress.failedFiles,
78
+ flavor: "background",
79
+ });
80
+ } else if (event.type === "file.edited") {
81
+ const filePath = event.properties.file;
82
+ await Bun.sleep(500);
83
+ const startedAt = Date.now();
84
+ await runtime.logger.info("file.edited", {
85
+ message: "Incremental indexing started",
86
+ extra: { filePath }
87
+ });
88
+ await runLockedIndexing(runtime, async () => {
89
+ setIncrementalIndexingProgress(runtime, filePath);
90
+ try {
91
+ await indexSingleFile(filePath, runtime.store, runtime.embedder, runtime.config, runtime.logger);
92
+ } finally {
93
+ clearIndexingProgress(runtime);
94
+ }
95
+ });
96
+ await runtime.logger.info("file.edited", {
97
+ message: "Incremental indexing finished",
98
+ extra: { filePath, elapsedMs: Date.now() - startedAt }
99
+ });
100
+ } else if (event.type === "file.watcher.updated") {
101
+ if (event.properties.event === "unlink") return;
102
+ const filePath = event.properties.file;
103
+ const startedAt = Date.now();
104
+ await runtime.logger.info("file.watcher.updated", {
105
+ message: "Watcher-triggered indexing started",
106
+ extra: { filePath }
107
+ });
108
+ await runLockedIndexing(runtime, async () => {
109
+ setIncrementalIndexingProgress(runtime, filePath);
110
+ try {
111
+ await indexSingleFile(filePath, runtime.store, runtime.embedder, runtime.config, runtime.logger);
112
+ } finally {
113
+ clearIndexingProgress(runtime);
114
+ }
115
+ });
116
+ await runtime.logger.info("file.watcher.updated", {
117
+ message: "Watcher-triggered indexing finished",
118
+ extra: { filePath, elapsedMs: Date.now() - startedAt }
119
+ });
120
+ }
121
+ },
122
+ "experimental.session.compacting": async (_input, output) => {
123
+ const stats = runtime.store.stats();
124
+ output.context.push(
125
+ `## Semantic Search Status\nfiles=${stats.files} chunks=${stats.chunks} last_sync=${stats.lastSync ?? "never"}\nUse semantic_search for conceptual queries; grep override falls back to ripgrep automatically.`,
126
+ );
127
+ },
128
+ "experimental.chat.system.transform": async (_input, output) => {
129
+ const { files, chunks } = runtime.store.stats();
130
+ if (chunks === 0) return;
131
+ output.system.push(
132
+ `Semantic search index ready (${files} files, ${chunks} chunks). Prefer \`semantic_search\` for conceptual/behavioural queries; use \`grep\` only for exact symbols or literals.`,
133
+ );
134
+ },
135
+ };
136
+ };
137
+
138
+ export default OpenCodeSemanticSearchPlugin;
package/install.sh ADDED
@@ -0,0 +1,260 @@
1
#!/usr/bin/env bash
# OpenCode Semantic Search Plugin — install script.
# Usage: bash install.sh [--global] [--ollama-model MODEL] [--openai-key-env VAR]
set -euo pipefail

# ─── Color helpers ─────────────────────────────────────────────────────────────
# ANSI escape sequences are stored literally; `echo -e` expands them on output.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
BOLD='\033[1m'
RESET='\033[0m'

# Status-line printers: success, warning, error, note, and section header.
ok()     { echo -e " ${GREEN}✔${RESET} $*"; }
warn()   { echo -e " ${YELLOW}!${RESET} $*"; }
err()    { echo -e " ${RED}✘${RESET} $*"; }
info()   { echo -e " ${CYAN}→${RESET} $*"; }
header() { echo -e "\n${BOLD}$*${RESET}"; }
13
+
14
# ─── Defaults ──────────────────────────────────────────────────────────────────
INSTALL_GLOBAL=true               # default target: ~/.config/opencode/plugins
OLLAMA_MODEL="nomic-embed-text"   # embedding model pulled for the Ollama provider
OPENAI_KEY_ENV=""                 # non-empty => configure OpenAI instead of Ollama
SKIP_OLLAMA=false

# ─── Parse args ────────────────────────────────────────────────────────────────
while [[ $# -gt 0 ]]; do
  case $1 in
    --global) INSTALL_GLOBAL=true ;;
    --local) INSTALL_GLOBAL=false ;;
    --ollama-model)
      # Fail clearly when the value is missing; under `set -u` a bare "$2"
      # would abort with an opaque "unbound variable" error instead.
      [[ $# -ge 2 ]] || { err "--ollama-model requires a value"; exit 1; }
      OLLAMA_MODEL="$2"; shift ;;
    --openai-key-env)
      [[ $# -ge 2 ]] || { err "--openai-key-env requires a value"; exit 1; }
      OPENAI_KEY_ENV="$2"; shift ;;
    --skip-ollama) SKIP_OLLAMA=true ;;
    -h|--help)
      echo "Usage: bash install.sh [OPTIONS]"
      echo ""
      echo "Options:"
      echo " --global Install to ~/.config/opencode/plugins/ (default)"
      echo " --local Install to ./.opencode/plugins/ in the current directory"
      echo " --ollama-model MODEL Embedding model to pull (default: nomic-embed-text)"
      echo " --openai-key-env VAR Use OpenAI instead of Ollama; pass the env var name (e.g. OPENAI_API_KEY)"
      echo " --skip-ollama Skip Ollama checks and model pull"
      echo " -h, --help Show this help"
      exit 0
      ;;
    *) err "Unknown option: $1"; exit 1 ;;
  esac
  shift
done
44
+
45
# When invoked via bunx/npm bin, PLUGIN_DIR is set to the installed package root.
PLUGIN_DIR="${PLUGIN_DIR:-$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)}"

echo ""
echo -e "${BOLD}OpenCode Semantic Search Plugin — Setup${RESET}"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

# ─── Step 1: Check prerequisites ──────────────────────────────────────────────
header "Step 1 — Checking prerequisites"

# Bun is required to run the plugin itself.
if ! command -v bun >/dev/null 2>&1; then
  err "bun not found. Install from https://bun.sh"
  exit 1
fi
ok "bun $(bun --version)"

# OpenCode is optional at install time; the shim just waits for it.
if command -v opencode >/dev/null 2>&1; then
  ok "opencode $(opencode --version 2>/dev/null || echo "unknown")"
else
  warn "opencode not found in PATH. Install from https://opencode.ai"
  warn "Plugin will still be installed; start OpenCode after installing it."
fi

# ripgrep backs the grep fallback and is mandatory.
if ! command -v rg >/dev/null 2>&1; then
  err "ripgrep (rg) not found — required for grep fallback."
  echo ""
  echo " Install ripgrep:"
  echo " macOS: brew install ripgrep"
  echo " Ubuntu: sudo apt install ripgrep"
  echo " WSL: sudo apt install ripgrep"
  echo " Cargo: cargo install ripgrep"
  exit 1
fi
ok "ripgrep $(rg --version | head -1)"

# Embeddings come from OpenAI (when a key env var was given) or local Ollama.
if [[ -n "$OPENAI_KEY_ENV" ]]; then
  info "Using OpenAI embeddings (env: $OPENAI_KEY_ENV) — skipping Ollama checks."
  SKIP_OLLAMA=true
fi

if [[ "$SKIP_OLLAMA" == false ]]; then
  if command -v ollama >/dev/null 2>&1; then
    ok "ollama $(ollama --version 2>/dev/null | head -1)"
  else
    warn "ollama not found. Install from https://ollama.com"
    warn "Plugin will use keyword-only fallback until Ollama is available."
    SKIP_OLLAMA=true
  fi
fi
102
+
103
# ─── Step 2: Install npm dependencies ─────────────────────────────────────────
header "Step 2 — Installing dependencies"

cd "$PLUGIN_DIR"
# Respect the lockfile when the package ships one.
if [[ -f "$PLUGIN_DIR/bun.lock" ]]; then
  bun install --frozen-lockfile 2>&1 | tail -3
else
  bun install 2>&1 | tail -3
fi
ok "dependencies installed"

# ─── Step 3: Set up plugin directory ──────────────────────────────────────────
header "Step 3 — Installing plugin"

if [[ "$INSTALL_GLOBAL" == true ]]; then
  PLUGINS_TARGET="$HOME/.config/opencode/plugins"
else
  PLUGINS_TARGET="$(pwd)/.opencode/plugins"
fi

mkdir -p "$PLUGINS_TARGET"
LINK_TARGET="$PLUGINS_TARGET/opencode-semantic-search.ts"

# Write a loader shim that re-exports from the installed plugin directory.
{
  echo "// Auto-generated by install.sh — do not edit."
  echo "// Loads the semantic search plugin from its installed location."
  echo "export { default, OpenCodeSemanticSearchPlugin } from \"${PLUGIN_DIR}/index.ts\";"
} > "$LINK_TARGET"

ok "plugin shim created at $LINK_TARGET"

if [[ "$INSTALL_GLOBAL" == true ]]; then
  info "Plugin registered globally (all projects)."
else
  info "Plugin registered for this project only (current directory)."
  info "To install globally, re-run with: bash install.sh --global"
fi
141
+
142
# ─── Step 4: Pull Ollama embedding model ──────────────────────────────────────
if [[ "$SKIP_OLLAMA" == false ]]; then
  header "Step 4 — Pulling Ollama embedding model"

  # -F: model names contain dots/colons that grep would otherwise treat as
  # regex metacharacters; -- guards against names starting with a dash.
  if ollama list 2>/dev/null | grep -qF -- "$OLLAMA_MODEL"; then
    ok "$OLLAMA_MODEL already available"
  else
    info "Pulling $OLLAMA_MODEL (this may take a minute)..."
    if ! ollama pull "$OLLAMA_MODEL"; then
      warn "Could not pull $OLLAMA_MODEL. Make sure Ollama is running: ollama serve"
      warn "Run 'ollama pull $OLLAMA_MODEL' manually when Ollama is available."
    else
      ok "$OLLAMA_MODEL ready"
    fi
  fi
else
  header "Step 4 — Embedding provider setup"
  if [[ -n "$OPENAI_KEY_ENV" ]]; then
    ok "Using OpenAI provider (model: text-embedding-3-small)"
    ok "Make sure $OPENAI_KEY_ENV is set in your environment."
  else
    warn "Skipped Ollama check. Run 'ollama pull $OLLAMA_MODEL' to enable semantic search."
  fi
fi
166
+
167
# ─── Step 5: Write config (project or global defaults) ────────────────────────
header "Step 5 — Config"

if [[ "$INSTALL_GLOBAL" == true ]]; then
  OPENCODE_DIR="$HOME/.config/opencode"
  CONFIG_LABEL="global default"
else
  OPENCODE_DIR="$(pwd)/.opencode"
  CONFIG_LABEL="project"
fi
CONFIG_PATH="$OPENCODE_DIR/semantic-search.json"
mkdir -p "$OPENCODE_DIR"

# Never clobber an existing config; only seed a fresh one.
if [[ ! -f "$CONFIG_PATH" ]]; then
  if [[ -n "$OPENAI_KEY_ENV" ]]; then
    cat > "$CONFIG_PATH" << JSON
{
  "embedding": {
    "provider": "openai",
    "api_base": "https://api.openai.com/v1",
    "model": "text-embedding-3-small",
    "api_key_env": "$OPENAI_KEY_ENV",
    "dimensions": 1536
  }
}
JSON
  else
    cat > "$CONFIG_PATH" << JSON
{
  "embedding": {
    "provider": "ollama",
    "api_base": "http://localhost:11434/v1",
    "model": "$OLLAMA_MODEL",
    "dimensions": 768
  }
}
JSON
  fi
  ok "Config written to $CONFIG_PATH ($CONFIG_LABEL)"
  info "Edit it anytime to change models or add advanced settings."
else
  ok "Config already exists at $CONFIG_PATH — skipping."
fi

if [[ "$INSTALL_GLOBAL" == true ]]; then
  info "Per-project overrides: add .opencode/semantic-search.json in a repo."
fi
214
+
215
# ─── Step 6: Ignore the generated plugin shim in project installs ─────────────
if [[ "$INSTALL_GLOBAL" == false ]]; then
  GITIGNORE="$(pwd)/.gitignore"
  SHIM_ENTRY=".opencode/plugins/opencode-semantic-search.ts"
  # Append (creating .gitignore if needed) unless the exact line is already
  # present. Previously a missing .gitignore silently skipped this step, and
  # a substring match could be fooled by e.g. a commented-out entry.
  if [[ ! -f "$GITIGNORE" ]] || ! grep -qxF "$SHIM_ENTRY" "$GITIGNORE"; then
    echo "$SHIM_ENTRY" >> "$GITIGNORE"
    ok "Added plugin shim to .gitignore"
  fi
fi
224
+
225
# ─── Self-test (printed as "Step 6": the .gitignore step above is silent) ─────
header "Step 6 — Self-test"

# Integration scripts are stripped from some published bundles.
if [[ ! -f "$PLUGIN_DIR/scripts/integration-index.ts" ]]; then
  info "Skipping integration tests (not included in this install bundle)."
else
  info "Running integration tests..."
  if bun run test:integration 2>&1; then
    ok "All integration tests passed."
  else
    warn "Integration tests had failures. Check above output."
    warn "This may be expected if Ollama is not running; the plugin degrades gracefully."
  fi
fi
239
+
240
# ─── Done ──────────────────────────────────────────────────────────────────────
DIVIDER="━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo ""
echo "$DIVIDER"
echo -e "${GREEN}${BOLD}Installation complete!${RESET}"
echo ""
# Remind about the daemon only when the Ollama provider is actually in use.
if [[ "$SKIP_OLLAMA" == false ]]; then
  echo -e " ${BOLD}Before starting OpenCode, make sure Ollama is running:${RESET}"
  echo " ollama serve"
  echo ""
fi
echo -e " ${BOLD}Start OpenCode in your project:${RESET}"
echo " opencode"
echo ""
echo -e " ${BOLD}Available tools once OpenCode starts:${RESET}"
echo " semantic_search — search by concept or intent"
echo " index_status — check indexing coverage and health"
echo " reindex — force a full rebuild of the index"
echo " grep (override) — smart routing: semantic-first, ripgrep fallback"
echo ""
echo -e " ${BOLD}Docs:${RESET} SETUP.md"
echo ""
package/package.json ADDED
@@ -0,0 +1,67 @@
1
+ {
2
+ "name": "opencode-semantic-search",
3
+ "version": "0.1.0",
4
+ "description": "Local-first semantic search plugin for OpenCode — hybrid vector + BM25 search, AST chunking, smart grep override.",
5
+ "module": "index.ts",
6
+ "type": "module",
7
+ "repository": {
8
+ "type": "git",
9
+ "url": "git+https://github.com/jainprashul/opencode-semantic-search.git"
10
+ },
11
+ "homepage": "https://github.com/jainprashul/opencode-semantic-search#readme",
12
+ "bugs": {
13
+ "url": "https://github.com/jainprashul/opencode-semantic-search/issues"
14
+ },
15
+ "bin": {
16
+ "opencode-semantic-search": "bin/opencode-semantic-search.mjs"
17
+ },
18
+ "files": [
19
+ "index.ts",
20
+ "src",
21
+ "install.sh",
22
+ "bin/opencode-semantic-search.mjs",
23
+ "bun.lock",
24
+ "README.md",
25
+ "SETUP.md",
26
+ "AGENTS.md"
27
+ ],
28
+ "keywords": [
29
+ "opencode",
30
+ "opencode-plugin",
31
+ "semantic-search",
32
+ "embeddings",
33
+ "rag",
34
+ "code-search",
35
+ "tree-sitter",
36
+ "ollama",
37
+ "sqlite-vec",
38
+ "bun"
39
+ ],
40
+ "scripts": {
41
+ "prepublishOnly": "bun run typecheck",
42
+ "install:plugin": "bash install.sh",
43
+ "install:plugin:global": "bash install.sh --global",
44
+ "diagnostic:bundle": "bun run scripts/diagnostic-bundle.ts",
45
+ "typecheck": "bunx tsc --noEmit",
46
+ "check": "bun run typecheck",
47
+ "test:integration:index": "bun run scripts/integration-index.ts",
48
+ "test:integration:routing": "bun run scripts/integration-routing.ts",
49
+ "test:integration": "bun run test:integration:index && bun run test:integration:routing",
50
+ "test:unit": "bun test src"
51
+ },
52
+ "devDependencies": {
53
+ "@types/bun": "latest"
54
+ },
55
+ "peerDependencies": {
56
+ "typescript": "^5"
57
+ },
58
+ "dependencies": {
59
+ "@opencode-ai/plugin": "^1.3.2",
60
+ "@opencode-ai/sdk": "1.3.2",
61
+ "@vscode/tree-sitter-wasm": "^0.3.0",
62
+ "ignore": "^7.0.5",
63
+ "picomatch": "^4.0.4",
64
+ "sqlite-vec": "^0.1.7",
65
+ "web-tree-sitter": "^0.26.7"
66
+ }
67
+ }
@@ -0,0 +1,77 @@
1
+ export interface Chunk {
2
+ startLine: number;
3
+ endLine: number;
4
+ text: string;
5
+ }
6
+
7
+ /** Conservative char budget so estimated tokens stay under `maxContextTokens` (≈3 chars/token worst case). */
8
+ export function maxEmbeddingChunkChars(
9
+ chunkingMaxTokens: number,
10
+ maxContextTokens: number
11
+ ): number {
12
+ const fromChunking = Math.max(256, chunkingMaxTokens * 4);
13
+ const hardCeiling = maxContextTokens * 3;
14
+ return Math.min(fromChunking, hardCeiling);
15
+ }
16
+
17
+ export function fallbackChunk(
18
+ text: string,
19
+ maxTokens: number,
20
+ overlapTokens: number,
21
+ maxContextTokens = 8192
22
+ ): Chunk[] {
23
+ const lines = text.split("\n");
24
+ const chunks: Chunk[] = [];
25
+ const approxChars = maxEmbeddingChunkChars(maxTokens, maxContextTokens);
26
+ const overlapChars = Math.max(0, overlapTokens * 4);
27
+
28
+ let startLine = 1;
29
+ let cursor = 0;
30
+ while (cursor < text.length) {
31
+ const end = Math.min(text.length, cursor + approxChars);
32
+ const body = text.slice(cursor, end);
33
+ const consumedLines = body.split("\n").length;
34
+ chunks.push({
35
+ startLine,
36
+ endLine: Math.min(lines.length, startLine + consumedLines - 1),
37
+ text: body,
38
+ });
39
+ if (end >= text.length) break;
40
+ cursor = Math.max(0, end - overlapChars);
41
+ startLine = Math.min(lines.length, startLine + consumedLines - 1);
42
+ }
43
+ return chunks;
44
+ }
45
+
46
+ /** Splits one chunk if it exceeds the embedding char budget (e.g. large tree-sitter boundaries). */
47
+ export function splitChunkToMaxSize(
48
+ chunk: Chunk,
49
+ maxTokens: number,
50
+ overlapTokens: number,
51
+ maxContextTokens: number
52
+ ): Chunk[] {
53
+ const approxChars = maxEmbeddingChunkChars(maxTokens, maxContextTokens);
54
+ if (chunk.text.length <= approxChars) {
55
+ return [chunk];
56
+ }
57
+ const parts = fallbackChunk(chunk.text, maxTokens, overlapTokens, maxContextTokens);
58
+ const lineOffset = chunk.startLine - 1;
59
+ return parts.map((p) => ({
60
+ startLine: p.startLine + lineOffset,
61
+ endLine: p.endLine + lineOffset,
62
+ text: p.text,
63
+ }));
64
+ }
65
+
66
+ export function enforceChunkSizeLimits(
67
+ chunks: Chunk[],
68
+ maxTokens: number,
69
+ overlapTokens: number,
70
+ maxContextTokens: number
71
+ ): Chunk[] {
72
+ const out: Chunk[] = [];
73
+ for (const c of chunks) {
74
+ out.push(...splitChunkToMaxSize(c, maxTokens, overlapTokens, maxContextTokens));
75
+ }
76
+ return out;
77
+ }
@@ -0,0 +1,16 @@
1
+ import path from "node:path";
2
+ import type { PluginConfig } from "../config";
3
+ import { enforceChunkSizeLimits, fallbackChunk, type Chunk } from "./fallback";
4
+ import { treeSitterChunk } from "./treesitter";
5
+
6
+ const astExtensions = new Set([".ts", ".tsx", ".js", ".jsx", ".py", ".go", ".rs", ".java"]);
7
+
8
+ export async function chunkFile(filePath: string, text: string, config: PluginConfig): Promise<Chunk[]> {
9
+ const { max_tokens: maxTokens, overlap_tokens: overlapTokens } = config.chunking;
10
+ const maxContextTokens = config.embedding.max_context_tokens;
11
+ const ext = path.extname(filePath).toLowerCase();
12
+ const raw = astExtensions.has(ext)
13
+ ? await treeSitterChunk(filePath, text, maxTokens, overlapTokens, maxContextTokens)
14
+ : fallbackChunk(text, maxTokens, overlapTokens, maxContextTokens);
15
+ return enforceChunkSizeLimits(raw, maxTokens, overlapTokens, maxContextTokens);
16
+ }
@@ -0,0 +1,119 @@
1
+ import { fallbackChunk, type Chunk } from "./fallback";
2
+ import path from "node:path";
3
+ import { Language, Parser } from "web-tree-sitter";
4
+
5
+ interface AstLikeNode {
6
+ startPosition?: { row: number };
7
+ endPosition?: { row: number };
8
+ text?: string;
9
+ namedChildren?: AstLikeNode[];
10
+ type?: string;
11
+ }
12
+
13
+ function isBoundaryNode(nodeType: string | undefined): boolean {
14
+ if (!nodeType) return false;
15
+ return (
16
+ nodeType.includes("function") ||
17
+ nodeType.includes("class") ||
18
+ nodeType.includes("method") ||
19
+ nodeType.includes("interface") ||
20
+ nodeType.includes("impl")
21
+ );
22
+ }
23
+
24
+ const EXT_TO_WASM: Record<string, string> = {
25
+ ".ts": "tree-sitter-typescript.wasm",
26
+ ".tsx": "tree-sitter-tsx.wasm",
27
+ ".js": "tree-sitter-javascript.wasm",
28
+ ".jsx": "tree-sitter-javascript.wasm",
29
+ ".py": "tree-sitter-python.wasm",
30
+ ".go": "tree-sitter-go.wasm",
31
+ ".rs": "tree-sitter-rust.wasm",
32
+ ".java": "tree-sitter-java.wasm",
33
+ };
34
+
35
+ let parserReady = false;
36
+ const languageCache = new Map<string, Language>();
37
+
38
+ const pluginRoot = path.resolve(import.meta.dir, "../..");
39
+
40
+ async function initParser(): Promise<void> {
41
+ if (parserReady) return;
42
+ const wasmPath = path.join(pluginRoot, "node_modules", "web-tree-sitter", "web-tree-sitter.wasm");
43
+ await Parser.init({
44
+ locateFile: () => wasmPath,
45
+ });
46
+ parserReady = true;
47
+ }
48
+
49
+ async function loadLanguageForExtension(extension: string): Promise<Language | null> {
50
+ const wasmFile = EXT_TO_WASM[extension];
51
+ if (!wasmFile) return null;
52
+ const cached = languageCache.get(wasmFile);
53
+ if (cached) return cached;
54
+ const wasmPath = path.join(pluginRoot, "node_modules", "@vscode", "tree-sitter-wasm", "wasm", wasmFile);
55
+ const lang = await Language.load(wasmPath);
56
+ languageCache.set(wasmFile, lang);
57
+ return lang;
58
+ }
59
+
60
+ function maybeCollectBoundary(node: AstLikeNode): boolean {
61
+ return isBoundaryNode(node.type) && Boolean(node.text);
62
+ }
63
+
64
+ function collectBoundaryChunks(root: AstLikeNode): Chunk[] {
65
+ const out: Chunk[] = [];
66
+ const walk = (node: AstLikeNode): void => {
67
+ if (maybeCollectBoundary(node)) {
68
+ out.push({
69
+ startLine: (node.startPosition?.row ?? 0) + 1,
70
+ endLine: (node.endPosition?.row ?? 0) + 1,
71
+ text: node.text ?? "",
72
+ });
73
+ }
74
+ for (const child of node.namedChildren ?? []) walk(child);
75
+ };
76
+ walk(root);
77
+ return out;
78
+ }
79
+
80
+ export async function treeSitterChunk(
81
+ filePath: string,
82
+ text: string,
83
+ maxTokens: number,
84
+ overlapTokens: number,
85
+ maxContextTokens: number
86
+ ): Promise<Chunk[]> {
87
+ try {
88
+ await initParser();
89
+ const ext = path.extname(filePath).toLowerCase();
90
+ const language = await loadLanguageForExtension(ext);
91
+ if (!language) return fallbackChunk(text, maxTokens, overlapTokens, maxContextTokens);
92
+ const parser = new Parser();
93
+ parser.setLanguage(language);
94
+ const tree = parser.parse(text);
95
+ const boundaries = tree?.rootNode ? collectBoundaryChunks(tree.rootNode as unknown as AstLikeNode) : [];
96
+ parser.delete();
97
+ tree?.delete();
98
+ if (boundaries.length === 0) return fallbackChunk(text, maxTokens, overlapTokens, maxContextTokens);
99
+ return boundaries;
100
+ } catch {
101
+ return fallbackChunk(text, maxTokens, overlapTokens, maxContextTokens);
102
+ }
103
+ }
104
+
105
+ export function extractBoundaryChunks(root: AstLikeNode, source: string): Chunk[] {
106
+ const chunks: Chunk[] = [];
107
+ const walk = (node: AstLikeNode): void => {
108
+ if (isBoundaryNode(node.type) && node.text) {
109
+ chunks.push({
110
+ startLine: (node.startPosition?.row ?? 0) + 1,
111
+ endLine: (node.endPosition?.row ?? 0) + 1,
112
+ text: node.text,
113
+ });
114
+ }
115
+ for (const child of node.namedChildren ?? []) walk(child);
116
+ };
117
+ walk(root);
118
+ return chunks.length > 0 ? chunks : fallbackChunk(source, 512, 50, 8192);
119
+ }