opencode-semantic-search 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/index.ts ADDED
@@ -0,0 +1,138 @@
1
+ import type { Plugin } from "@opencode-ai/plugin";
2
+ import { loadConfig } from "./src/config";
3
+ import { deltaSync } from "./src/indexer/delta";
4
+ import { indexSingleFile } from "./src/indexer/incremental";
5
+ import { createLogger } from "./src/logger";
6
+ import { applyDeltaProgress, clearIndexingProgress, createRuntime, setIncrementalIndexingProgress } from "./src/runtime";
7
+ import { showIndexingCompleteToast } from "./src/tui_toast";
8
+ import { createDiagnosticBundleTool } from "./src/tools/diagnostic_bundle";
9
+ import { createIndexStatusTool } from "./src/tools/index_status";
10
+ import { createReindexTool } from "./src/tools/reindex";
11
+ import { createSemanticSearchTool } from "./src/tools/semantic_search";
12
+ import { createSmartGrepTool } from "./src/tools/smart_grep";
13
+
14
+ async function runLockedIndexing(runtime: Awaited<ReturnType<typeof createRuntime>>, operation: () => Promise<void>): Promise<void> {
15
+ if (runtime.indexingLock) {
16
+ await runtime.logger.debug("indexing", { message: "Skipped because indexing lock is active" });
17
+ return;
18
+ }
19
+ runtime.indexingLock = true;
20
+ try {
21
+ await operation();
22
+ } finally {
23
+ runtime.indexingLock = false;
24
+ }
25
+ }
26
+
27
+ export const OpenCodeSemanticSearchPlugin: Plugin = async ({ client, worktree, directory }) => {
28
+ const config = await loadConfig(worktree);
29
+ const logger = createLogger({
30
+ enabled: config.logging.enabled,
31
+ level: config.logging.level,
32
+ verbosePaths: config.logging.verbose_paths ?? [],
33
+ logFile: config.logging.log_file,
34
+ client
35
+ });
36
+ const runtime = await createRuntime(worktree, config, logger);
37
+ runtime.opencodeClient = client;
38
+ runtime.projectDirectory = directory;
39
+
40
+ await logger.info("plugin", {
41
+ message: "Plugin initialized",
42
+ extra: {
43
+ provider: runtime.config.embedding.provider,
44
+ model: runtime.config.embedding.model
45
+ }
46
+ });
47
+
48
+ return {
49
+ tool: {
50
+ semantic_search: createSemanticSearchTool(runtime),
51
+ index_status: createIndexStatusTool(runtime),
52
+ reindex: createReindexTool(runtime),
53
+ grep: createSmartGrepTool(runtime),
54
+ diagnostic_bundle: createDiagnosticBundleTool(runtime),
55
+ },
56
+ event: async ({ event }) => {
57
+ if (event.type === "session.created") {
58
+ const startedAt = Date.now();
59
+ await runtime.logger.info("session.created", { message: "Delta sync started" });
60
+ await runLockedIndexing(runtime, async () => {
61
+ await deltaSync(worktree, runtime.store, runtime.embedder, runtime.config, {
62
+ logger: runtime.logger,
63
+ onProgress: (update) => applyDeltaProgress(runtime, update, "background"),
64
+ });
65
+ });
66
+ const elapsedMs = Date.now() - startedAt;
67
+ const statsAfterSync = runtime.store.stats();
68
+ await runtime.logger.info("session.created", {
69
+ message: "Delta sync finished",
70
+ extra: { elapsedMs }
71
+ });
72
+ void showIndexingCompleteToast(runtime, {
73
+ title: "Semantic index",
74
+ files: statsAfterSync.files,
75
+ chunks: statsAfterSync.chunks,
76
+ elapsedMs,
77
+ failedFiles: runtime.indexingProgress.failedFiles,
78
+ flavor: "background",
79
+ });
80
+ } else if (event.type === "file.edited") {
81
+ const filePath = event.properties.file;
82
+ await Bun.sleep(500);
83
+ const startedAt = Date.now();
84
+ await runtime.logger.info("file.edited", {
85
+ message: "Incremental indexing started",
86
+ extra: { filePath }
87
+ });
88
+ await runLockedIndexing(runtime, async () => {
89
+ setIncrementalIndexingProgress(runtime, filePath);
90
+ try {
91
+ await indexSingleFile(filePath, runtime.store, runtime.embedder, runtime.config, runtime.logger);
92
+ } finally {
93
+ clearIndexingProgress(runtime);
94
+ }
95
+ });
96
+ await runtime.logger.info("file.edited", {
97
+ message: "Incremental indexing finished",
98
+ extra: { filePath, elapsedMs: Date.now() - startedAt }
99
+ });
100
+ } else if (event.type === "file.watcher.updated") {
101
+ if (event.properties.event === "unlink") return;
102
+ const filePath = event.properties.file;
103
+ const startedAt = Date.now();
104
+ await runtime.logger.info("file.watcher.updated", {
105
+ message: "Watcher-triggered indexing started",
106
+ extra: { filePath }
107
+ });
108
+ await runLockedIndexing(runtime, async () => {
109
+ setIncrementalIndexingProgress(runtime, filePath);
110
+ try {
111
+ await indexSingleFile(filePath, runtime.store, runtime.embedder, runtime.config, runtime.logger);
112
+ } finally {
113
+ clearIndexingProgress(runtime);
114
+ }
115
+ });
116
+ await runtime.logger.info("file.watcher.updated", {
117
+ message: "Watcher-triggered indexing finished",
118
+ extra: { filePath, elapsedMs: Date.now() - startedAt }
119
+ });
120
+ }
121
+ },
122
+ "experimental.session.compacting": async (_input, output) => {
123
+ const stats = runtime.store.stats();
124
+ output.context.push(
125
+ `## Semantic Search Status\nfiles=${stats.files} chunks=${stats.chunks} last_sync=${stats.lastSync ?? "never"}\nUse semantic_search for conceptual queries; grep override falls back to ripgrep automatically.`,
126
+ );
127
+ },
128
+ "experimental.chat.system.transform": async (_input, output) => {
129
+ const { files, chunks } = runtime.store.stats();
130
+ if (chunks === 0) return;
131
+ output.system.push(
132
+ `Semantic search index ready (${files} files, ${chunks} chunks). Prefer \`semantic_search\` for conceptual/behavioural queries; use \`grep\` only for exact symbols or literals.`,
133
+ );
134
+ },
135
+ };
136
+ };
137
+
138
+ export default OpenCodeSemanticSearchPlugin;
package/install.sh ADDED
@@ -0,0 +1,260 @@
1
#!/usr/bin/env bash
# OpenCode Semantic Search Plugin — install script.
# Usage: bash install.sh [--global] [--ollama-model MODEL] [--openai-key-env VAR]
set -euo pipefail

# ─── Color helpers ─────────────────────────────────────────────────────────────
# ANSI escape sequences are stored literally; `echo -e` expands them on output.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
BOLD='\033[1m'
RESET='\033[0m'

# Status-line printers: success, warning, error, note, and section header.
ok()     { echo -e " ${GREEN}✔${RESET} $*"; }
warn()   { echo -e " ${YELLOW}!${RESET} $*"; }
err()    { echo -e " ${RED}✘${RESET} $*"; }
info()   { echo -e " ${CYAN}→${RESET} $*"; }
header() { echo -e "\n${BOLD}$*${RESET}"; }
13
+
14
# ─── Defaults ──────────────────────────────────────────────────────────────────
INSTALL_GLOBAL=true               # default target: ~/.config/opencode/plugins
OLLAMA_MODEL="nomic-embed-text"   # embedding model pulled for the Ollama provider
OPENAI_KEY_ENV=""                 # non-empty => configure OpenAI instead of Ollama
SKIP_OLLAMA=false

# ─── Parse args ────────────────────────────────────────────────────────────────
while [[ $# -gt 0 ]]; do
  case $1 in
    --global) INSTALL_GLOBAL=true ;;
    --local) INSTALL_GLOBAL=false ;;
    --ollama-model)
      # Fail clearly when the value is missing; under `set -u` a bare "$2"
      # would abort with an opaque "unbound variable" error instead.
      [[ $# -ge 2 ]] || { err "--ollama-model requires a value"; exit 1; }
      OLLAMA_MODEL="$2"; shift ;;
    --openai-key-env)
      [[ $# -ge 2 ]] || { err "--openai-key-env requires a value"; exit 1; }
      OPENAI_KEY_ENV="$2"; shift ;;
    --skip-ollama) SKIP_OLLAMA=true ;;
    -h|--help)
      echo "Usage: bash install.sh [OPTIONS]"
      echo ""
      echo "Options:"
      echo " --global Install to ~/.config/opencode/plugins/ (default)"
      echo " --local Install to ./.opencode/plugins/ in the current directory"
      echo " --ollama-model MODEL Embedding model to pull (default: nomic-embed-text)"
      echo " --openai-key-env VAR Use OpenAI instead of Ollama; pass the env var name (e.g. OPENAI_API_KEY)"
      echo " --skip-ollama Skip Ollama checks and model pull"
      echo " -h, --help Show this help"
      exit 0
      ;;
    *) err "Unknown option: $1"; exit 1 ;;
  esac
  shift
done
44
+
45
# When invoked via bunx/npm bin, PLUGIN_DIR is set to the installed package root.
PLUGIN_DIR="${PLUGIN_DIR:-$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)}"

echo ""
echo -e "${BOLD}OpenCode Semantic Search Plugin — Setup${RESET}"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

# ─── Step 1: Check prerequisites ──────────────────────────────────────────────
header "Step 1 — Checking prerequisites"

# Bun is required to run the plugin itself.
if ! command -v bun >/dev/null 2>&1; then
  err "bun not found. Install from https://bun.sh"
  exit 1
fi
ok "bun $(bun --version)"

# OpenCode is optional at install time; the shim just waits for it.
if command -v opencode >/dev/null 2>&1; then
  ok "opencode $(opencode --version 2>/dev/null || echo "unknown")"
else
  warn "opencode not found in PATH. Install from https://opencode.ai"
  warn "Plugin will still be installed; start OpenCode after installing it."
fi

# ripgrep backs the grep fallback and is mandatory.
if ! command -v rg >/dev/null 2>&1; then
  err "ripgrep (rg) not found — required for grep fallback."
  echo ""
  echo " Install ripgrep:"
  echo " macOS: brew install ripgrep"
  echo " Ubuntu: sudo apt install ripgrep"
  echo " WSL: sudo apt install ripgrep"
  echo " Cargo: cargo install ripgrep"
  exit 1
fi
ok "ripgrep $(rg --version | head -1)"

# Embeddings come from OpenAI (when a key env var was given) or local Ollama.
if [[ -n "$OPENAI_KEY_ENV" ]]; then
  info "Using OpenAI embeddings (env: $OPENAI_KEY_ENV) — skipping Ollama checks."
  SKIP_OLLAMA=true
fi

if [[ "$SKIP_OLLAMA" == false ]]; then
  if command -v ollama >/dev/null 2>&1; then
    ok "ollama $(ollama --version 2>/dev/null | head -1)"
  else
    warn "ollama not found. Install from https://ollama.com"
    warn "Plugin will use keyword-only fallback until Ollama is available."
    SKIP_OLLAMA=true
  fi
fi
102
+
103
# ─── Step 2: Install npm dependencies ─────────────────────────────────────────
header "Step 2 — Installing dependencies"

cd "$PLUGIN_DIR"
# Respect the lockfile when the package ships one.
if [[ -f "$PLUGIN_DIR/bun.lock" ]]; then
  bun install --frozen-lockfile 2>&1 | tail -3
else
  bun install 2>&1 | tail -3
fi
ok "dependencies installed"

# ─── Step 3: Set up plugin directory ──────────────────────────────────────────
header "Step 3 — Installing plugin"

if [[ "$INSTALL_GLOBAL" == true ]]; then
  PLUGINS_TARGET="$HOME/.config/opencode/plugins"
else
  PLUGINS_TARGET="$(pwd)/.opencode/plugins"
fi

mkdir -p "$PLUGINS_TARGET"
LINK_TARGET="$PLUGINS_TARGET/opencode-semantic-search.ts"

# Write a loader shim that re-exports from the installed plugin directory.
{
  echo "// Auto-generated by install.sh — do not edit."
  echo "// Loads the semantic search plugin from its installed location."
  echo "export { default, OpenCodeSemanticSearchPlugin } from \"${PLUGIN_DIR}/index.ts\";"
} > "$LINK_TARGET"

ok "plugin shim created at $LINK_TARGET"

if [[ "$INSTALL_GLOBAL" == true ]]; then
  info "Plugin registered globally (all projects)."
else
  info "Plugin registered for this project only (current directory)."
  info "To install globally, re-run with: bash install.sh --global"
fi
141
+
142
# ─── Step 4: Pull Ollama embedding model ──────────────────────────────────────
if [[ "$SKIP_OLLAMA" == false ]]; then
  header "Step 4 — Pulling Ollama embedding model"

  # -F: model names contain dots/colons that grep would otherwise treat as
  # regex metacharacters; -- guards against names starting with a dash.
  if ollama list 2>/dev/null | grep -qF -- "$OLLAMA_MODEL"; then
    ok "$OLLAMA_MODEL already available"
  else
    info "Pulling $OLLAMA_MODEL (this may take a minute)..."
    if ! ollama pull "$OLLAMA_MODEL"; then
      warn "Could not pull $OLLAMA_MODEL. Make sure Ollama is running: ollama serve"
      warn "Run 'ollama pull $OLLAMA_MODEL' manually when Ollama is available."
    else
      ok "$OLLAMA_MODEL ready"
    fi
  fi
else
  header "Step 4 — Embedding provider setup"
  if [[ -n "$OPENAI_KEY_ENV" ]]; then
    ok "Using OpenAI provider (model: text-embedding-3-small)"
    ok "Make sure $OPENAI_KEY_ENV is set in your environment."
  else
    warn "Skipped Ollama check. Run 'ollama pull $OLLAMA_MODEL' to enable semantic search."
  fi
fi
166
+
167
# ─── Step 5: Write config (project or global defaults) ────────────────────────
header "Step 5 — Config"

if [[ "$INSTALL_GLOBAL" == true ]]; then
  OPENCODE_DIR="$HOME/.config/opencode"
  CONFIG_LABEL="global default"
else
  OPENCODE_DIR="$(pwd)/.opencode"
  CONFIG_LABEL="project"
fi
CONFIG_PATH="$OPENCODE_DIR/semantic-search.json"
mkdir -p "$OPENCODE_DIR"

# Never clobber an existing config; only seed a fresh one.
if [[ ! -f "$CONFIG_PATH" ]]; then
  if [[ -n "$OPENAI_KEY_ENV" ]]; then
    cat > "$CONFIG_PATH" << JSON
{
  "embedding": {
    "provider": "openai",
    "api_base": "https://api.openai.com/v1",
    "model": "text-embedding-3-small",
    "api_key_env": "$OPENAI_KEY_ENV",
    "dimensions": 1536
  }
}
JSON
  else
    cat > "$CONFIG_PATH" << JSON
{
  "embedding": {
    "provider": "ollama",
    "api_base": "http://localhost:11434/v1",
    "model": "$OLLAMA_MODEL",
    "dimensions": 768
  }
}
JSON
  fi
  ok "Config written to $CONFIG_PATH ($CONFIG_LABEL)"
  info "Edit it anytime to change models or add advanced settings."
else
  ok "Config already exists at $CONFIG_PATH — skipping."
fi

if [[ "$INSTALL_GLOBAL" == true ]]; then
  info "Per-project overrides: add .opencode/semantic-search.json in a repo."
fi
214
+
215
# ─── Step 6: Ignore the generated plugin shim in project installs ─────────────
if [[ "$INSTALL_GLOBAL" == false ]]; then
  GITIGNORE="$(pwd)/.gitignore"
  SHIM_ENTRY=".opencode/plugins/opencode-semantic-search.ts"
  # Append (creating .gitignore if needed) unless the exact line is already
  # present. Previously a missing .gitignore silently skipped this step, and
  # a substring match could be fooled by e.g. a commented-out entry.
  if [[ ! -f "$GITIGNORE" ]] || ! grep -qxF "$SHIM_ENTRY" "$GITIGNORE"; then
    echo "$SHIM_ENTRY" >> "$GITIGNORE"
    ok "Added plugin shim to .gitignore"
  fi
fi
224
+
225
# ─── Self-test (printed as "Step 6": the .gitignore step above is silent) ─────
header "Step 6 — Self-test"

# Integration scripts are stripped from some published bundles.
if [[ ! -f "$PLUGIN_DIR/scripts/integration-index.ts" ]]; then
  info "Skipping integration tests (not included in this install bundle)."
else
  info "Running integration tests..."
  if bun run test:integration 2>&1; then
    ok "All integration tests passed."
  else
    warn "Integration tests had failures. Check above output."
    warn "This may be expected if Ollama is not running; the plugin degrades gracefully."
  fi
fi
239
+
240
# ─── Done ──────────────────────────────────────────────────────────────────────
DIVIDER="━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo ""
echo "$DIVIDER"
echo -e "${GREEN}${BOLD}Installation complete!${RESET}"
echo ""
# Remind about the daemon only when the Ollama provider is actually in use.
if [[ "$SKIP_OLLAMA" == false ]]; then
  echo -e " ${BOLD}Before starting OpenCode, make sure Ollama is running:${RESET}"
  echo " ollama serve"
  echo ""
fi
echo -e " ${BOLD}Start OpenCode in your project:${RESET}"
echo " opencode"
echo ""
echo -e " ${BOLD}Available tools once OpenCode starts:${RESET}"
echo " semantic_search — search by concept or intent"
echo " index_status — check indexing coverage and health"
echo " reindex — force a full rebuild of the index"
echo " grep (override) — smart routing: semantic-first, ripgrep fallback"
echo ""
echo -e " ${BOLD}Docs:${RESET} SETUP.md"
echo ""
package/package.json ADDED
@@ -0,0 +1,67 @@
1
+ {
2
+ "name": "opencode-semantic-search",
3
+ "version": "0.1.0",
4
+ "description": "Local-first semantic search plugin for OpenCode — hybrid vector + BM25 search, AST chunking, smart grep override.",
5
+ "module": "index.ts",
6
+ "type": "module",
7
+ "repository": {
8
+ "type": "git",
9
+ "url": "git+https://github.com/jainprashul/opencode-semantic-search.git"
10
+ },
11
+ "homepage": "https://github.com/jainprashul/opencode-semantic-search#readme",
12
+ "bugs": {
13
+ "url": "https://github.com/jainprashul/opencode-semantic-search/issues"
14
+ },
15
+ "bin": {
16
+ "opencode-semantic-search": "bin/opencode-semantic-search.mjs"
17
+ },
18
+ "files": [
19
+ "index.ts",
20
+ "src",
21
+ "install.sh",
22
+ "bin/opencode-semantic-search.mjs",
23
+ "bun.lock",
24
+ "README.md",
25
+ "SETUP.md",
26
+ "AGENTS.md"
27
+ ],
28
+ "keywords": [
29
+ "opencode",
30
+ "opencode-plugin",
31
+ "semantic-search",
32
+ "embeddings",
33
+ "rag",
34
+ "code-search",
35
+ "tree-sitter",
36
+ "ollama",
37
+ "sqlite-vec",
38
+ "bun"
39
+ ],
40
+ "scripts": {
41
+ "prepublishOnly": "bun run typecheck",
42
+ "install:plugin": "bash install.sh",
43
+ "install:plugin:global": "bash install.sh --global",
44
+ "diagnostic:bundle": "bun run scripts/diagnostic-bundle.ts",
45
+ "typecheck": "bunx tsc --noEmit",
46
+ "check": "bun run typecheck",
47
+ "test:integration:index": "bun run scripts/integration-index.ts",
48
+ "test:integration:routing": "bun run scripts/integration-routing.ts",
49
+ "test:integration": "bun run test:integration:index && bun run test:integration:routing",
50
+ "test:unit": "bun test src"
51
+ },
52
+ "devDependencies": {
53
+ "@types/bun": "latest"
54
+ },
55
+ "peerDependencies": {
56
+ "typescript": "^5"
57
+ },
58
+ "dependencies": {
59
+ "@opencode-ai/plugin": "^1.3.2",
60
+ "@opencode-ai/sdk": "1.3.2",
61
+ "@vscode/tree-sitter-wasm": "^0.3.0",
62
+ "ignore": "^7.0.5",
63
+ "picomatch": "^4.0.4",
64
+ "sqlite-vec": "^0.1.7",
65
+ "web-tree-sitter": "^0.26.7"
66
+ }
67
+ }
@@ -0,0 +1,77 @@
1
+ export interface Chunk {
2
+ startLine: number;
3
+ endLine: number;
4
+ text: string;
5
+ }
6
+
7
+ /** Conservative char budget so estimated tokens stay under `maxContextTokens` (≈3 chars/token worst case). */
8
+ export function maxEmbeddingChunkChars(
9
+ chunkingMaxTokens: number,
10
+ maxContextTokens: number
11
+ ): number {
12
+ const fromChunking = Math.max(256, chunkingMaxTokens * 4);
13
+ const hardCeiling = maxContextTokens * 3;
14
+ return Math.min(fromChunking, hardCeiling);
15
+ }
16
+
17
+ export function fallbackChunk(
18
+ text: string,
19
+ maxTokens: number,
20
+ overlapTokens: number,
21
+ maxContextTokens = 8192
22
+ ): Chunk[] {
23
+ const lines = text.split("\n");
24
+ const chunks: Chunk[] = [];
25
+ const approxChars = maxEmbeddingChunkChars(maxTokens, maxContextTokens);
26
+ const overlapChars = Math.max(0, overlapTokens * 4);
27
+
28
+ let startLine = 1;
29
+ let cursor = 0;
30
+ while (cursor < text.length) {
31
+ const end = Math.min(text.length, cursor + approxChars);
32
+ const body = text.slice(cursor, end);
33
+ const consumedLines = body.split("\n").length;
34
+ chunks.push({
35
+ startLine,
36
+ endLine: Math.min(lines.length, startLine + consumedLines - 1),
37
+ text: body,
38
+ });
39
+ if (end >= text.length) break;
40
+ cursor = Math.max(0, end - overlapChars);
41
+ startLine = Math.min(lines.length, startLine + consumedLines - 1);
42
+ }
43
+ return chunks;
44
+ }
45
+
46
+ /** Splits one chunk if it exceeds the embedding char budget (e.g. large tree-sitter boundaries). */
47
+ export function splitChunkToMaxSize(
48
+ chunk: Chunk,
49
+ maxTokens: number,
50
+ overlapTokens: number,
51
+ maxContextTokens: number
52
+ ): Chunk[] {
53
+ const approxChars = maxEmbeddingChunkChars(maxTokens, maxContextTokens);
54
+ if (chunk.text.length <= approxChars) {
55
+ return [chunk];
56
+ }
57
+ const parts = fallbackChunk(chunk.text, maxTokens, overlapTokens, maxContextTokens);
58
+ const lineOffset = chunk.startLine - 1;
59
+ return parts.map((p) => ({
60
+ startLine: p.startLine + lineOffset,
61
+ endLine: p.endLine + lineOffset,
62
+ text: p.text,
63
+ }));
64
+ }
65
+
66
+ export function enforceChunkSizeLimits(
67
+ chunks: Chunk[],
68
+ maxTokens: number,
69
+ overlapTokens: number,
70
+ maxContextTokens: number
71
+ ): Chunk[] {
72
+ const out: Chunk[] = [];
73
+ for (const c of chunks) {
74
+ out.push(...splitChunkToMaxSize(c, maxTokens, overlapTokens, maxContextTokens));
75
+ }
76
+ return out;
77
+ }
@@ -0,0 +1,16 @@
1
+ import path from "node:path";
2
+ import type { PluginConfig } from "../config";
3
+ import { enforceChunkSizeLimits, fallbackChunk, type Chunk } from "./fallback";
4
+ import { treeSitterChunk } from "./treesitter";
5
+
6
+ const astExtensions = new Set([".ts", ".tsx", ".js", ".jsx", ".py", ".go", ".rs", ".java"]);
7
+
8
+ export async function chunkFile(filePath: string, text: string, config: PluginConfig): Promise<Chunk[]> {
9
+ const { max_tokens: maxTokens, overlap_tokens: overlapTokens } = config.chunking;
10
+ const maxContextTokens = config.embedding.max_context_tokens;
11
+ const ext = path.extname(filePath).toLowerCase();
12
+ const raw = astExtensions.has(ext)
13
+ ? await treeSitterChunk(filePath, text, maxTokens, overlapTokens, maxContextTokens)
14
+ : fallbackChunk(text, maxTokens, overlapTokens, maxContextTokens);
15
+ return enforceChunkSizeLimits(raw, maxTokens, overlapTokens, maxContextTokens);
16
+ }
@@ -0,0 +1,119 @@
1
+ import { fallbackChunk, type Chunk } from "./fallback";
2
+ import path from "node:path";
3
+ import { Language, Parser } from "web-tree-sitter";
4
+
5
+ interface AstLikeNode {
6
+ startPosition?: { row: number };
7
+ endPosition?: { row: number };
8
+ text?: string;
9
+ namedChildren?: AstLikeNode[];
10
+ type?: string;
11
+ }
12
+
13
+ function isBoundaryNode(nodeType: string | undefined): boolean {
14
+ if (!nodeType) return false;
15
+ return (
16
+ nodeType.includes("function") ||
17
+ nodeType.includes("class") ||
18
+ nodeType.includes("method") ||
19
+ nodeType.includes("interface") ||
20
+ nodeType.includes("impl")
21
+ );
22
+ }
23
+
24
+ const EXT_TO_WASM: Record<string, string> = {
25
+ ".ts": "tree-sitter-typescript.wasm",
26
+ ".tsx": "tree-sitter-tsx.wasm",
27
+ ".js": "tree-sitter-javascript.wasm",
28
+ ".jsx": "tree-sitter-javascript.wasm",
29
+ ".py": "tree-sitter-python.wasm",
30
+ ".go": "tree-sitter-go.wasm",
31
+ ".rs": "tree-sitter-rust.wasm",
32
+ ".java": "tree-sitter-java.wasm",
33
+ };
34
+
35
+ let parserReady = false;
36
+ const languageCache = new Map<string, Language>();
37
+
38
+ const pluginRoot = path.resolve(import.meta.dir, "../..");
39
+
40
+ async function initParser(): Promise<void> {
41
+ if (parserReady) return;
42
+ const wasmPath = path.join(pluginRoot, "node_modules", "web-tree-sitter", "web-tree-sitter.wasm");
43
+ await Parser.init({
44
+ locateFile: () => wasmPath,
45
+ });
46
+ parserReady = true;
47
+ }
48
+
49
+ async function loadLanguageForExtension(extension: string): Promise<Language | null> {
50
+ const wasmFile = EXT_TO_WASM[extension];
51
+ if (!wasmFile) return null;
52
+ const cached = languageCache.get(wasmFile);
53
+ if (cached) return cached;
54
+ const wasmPath = path.join(pluginRoot, "node_modules", "@vscode", "tree-sitter-wasm", "wasm", wasmFile);
55
+ const lang = await Language.load(wasmPath);
56
+ languageCache.set(wasmFile, lang);
57
+ return lang;
58
+ }
59
+
60
+ function maybeCollectBoundary(node: AstLikeNode): boolean {
61
+ return isBoundaryNode(node.type) && Boolean(node.text);
62
+ }
63
+
64
+ function collectBoundaryChunks(root: AstLikeNode): Chunk[] {
65
+ const out: Chunk[] = [];
66
+ const walk = (node: AstLikeNode): void => {
67
+ if (maybeCollectBoundary(node)) {
68
+ out.push({
69
+ startLine: (node.startPosition?.row ?? 0) + 1,
70
+ endLine: (node.endPosition?.row ?? 0) + 1,
71
+ text: node.text ?? "",
72
+ });
73
+ }
74
+ for (const child of node.namedChildren ?? []) walk(child);
75
+ };
76
+ walk(root);
77
+ return out;
78
+ }
79
+
80
+ export async function treeSitterChunk(
81
+ filePath: string,
82
+ text: string,
83
+ maxTokens: number,
84
+ overlapTokens: number,
85
+ maxContextTokens: number
86
+ ): Promise<Chunk[]> {
87
+ try {
88
+ await initParser();
89
+ const ext = path.extname(filePath).toLowerCase();
90
+ const language = await loadLanguageForExtension(ext);
91
+ if (!language) return fallbackChunk(text, maxTokens, overlapTokens, maxContextTokens);
92
+ const parser = new Parser();
93
+ parser.setLanguage(language);
94
+ const tree = parser.parse(text);
95
+ const boundaries = tree?.rootNode ? collectBoundaryChunks(tree.rootNode as unknown as AstLikeNode) : [];
96
+ parser.delete();
97
+ tree?.delete();
98
+ if (boundaries.length === 0) return fallbackChunk(text, maxTokens, overlapTokens, maxContextTokens);
99
+ return boundaries;
100
+ } catch {
101
+ return fallbackChunk(text, maxTokens, overlapTokens, maxContextTokens);
102
+ }
103
+ }
104
+
105
+ export function extractBoundaryChunks(root: AstLikeNode, source: string): Chunk[] {
106
+ const chunks: Chunk[] = [];
107
+ const walk = (node: AstLikeNode): void => {
108
+ if (isBoundaryNode(node.type) && node.text) {
109
+ chunks.push({
110
+ startLine: (node.startPosition?.row ?? 0) + 1,
111
+ endLine: (node.endPosition?.row ?? 0) + 1,
112
+ text: node.text,
113
+ });
114
+ }
115
+ for (const child of node.namedChildren ?? []) walk(child);
116
+ };
117
+ walk(root);
118
+ return chunks.length > 0 ? chunks : fallbackChunk(source, 512, 50, 8192);
119
+ }