opencode-codebase-index 0.1.10 → 0.2.0

This diff compares the published contents of the two package versions as they appear in their public registry. It is provided for informational purposes only.
package/dist/index.js CHANGED
@@ -688,6 +688,8 @@ function getDefaultIndexingConfig() {
  autoIndex: false,
  watchFiles: true,
  maxFileSize: 1048576,
+ maxChunksPerFile: 100,
+ semanticOnly: false,
  retries: 3,
  retryDelayMs: 1e3
  };
@@ -721,6 +723,8 @@ function parseConfig(raw) {
  autoIndex: typeof rawIndexing.autoIndex === "boolean" ? rawIndexing.autoIndex : defaultIndexing.autoIndex,
  watchFiles: typeof rawIndexing.watchFiles === "boolean" ? rawIndexing.watchFiles : defaultIndexing.watchFiles,
  maxFileSize: typeof rawIndexing.maxFileSize === "number" ? rawIndexing.maxFileSize : defaultIndexing.maxFileSize,
+ maxChunksPerFile: typeof rawIndexing.maxChunksPerFile === "number" ? Math.max(1, rawIndexing.maxChunksPerFile) : defaultIndexing.maxChunksPerFile,
+ semanticOnly: typeof rawIndexing.semanticOnly === "boolean" ? rawIndexing.semanticOnly : defaultIndexing.semanticOnly,
  retries: typeof rawIndexing.retries === "number" ? rawIndexing.retries : defaultIndexing.retries,
  retryDelayMs: typeof rawIndexing.retryDelayMs === "number" ? rawIndexing.retryDelayMs : defaultIndexing.retryDelayMs
  };
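0.2.0 adds two indexing options: maxChunksPerFile caps how many chunks a single file may contribute (values below 1 are clamped to 1), and semanticOnly skips chunks whose type is "other" (both are enforced in the indexing loop further down in this diff). A hypothetical user config, assuming the plugin accepts an indexing object shaped like the parseConfig branch above; omitted fields keep the defaults (100 and false):

const config = {
  indexing: {
    maxChunksPerFile: 50, // parseConfig clamps this to at least 1
    semanticOnly: true    // drop chunks parsed as chunkType "other"
  }
};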
@@ -802,7 +806,7 @@ function getDefaultModelForProvider(provider) {
  }

  // src/indexer/index.ts
- import { existsSync as existsSync4, readFileSync as readFileSync4, writeFileSync as writeFileSync2, promises as fsPromises2 } from "fs";
+ import { existsSync as existsSync4, readFileSync as readFileSync4, writeFileSync, promises as fsPromises2 } from "fs";
  import * as path5 from "path";

  // node_modules/eventemitter3/index.mjs
@@ -2184,7 +2188,10 @@ function shouldIncludeFile(filePath, projectRoot, includePatterns, excludePatter
  return false;
  }
  function matchGlob(filePath, pattern) {
- const regexPattern = pattern.replace(/\*\*/g, "<<<DOUBLESTAR>>>").replace(/\*/g, "[^/]*").replace(/<<<DOUBLESTAR>>>/g, ".*").replace(/\?/g, ".").replace(/\{([^}]+)\}/g, (_, p1) => `(${p1.split(",").join("|")})`);
+ let regexPattern = pattern.replace(/\*\*/g, "<<<DOUBLESTAR>>>").replace(/\*/g, "[^/]*").replace(/<<<DOUBLESTAR>>>/g, ".*").replace(/\?/g, ".").replace(/\{([^}]+)\}/g, (_, p1) => `(${p1.split(",").join("|")})`);
+ if (regexPattern.startsWith(".*/")) {
+ regexPattern = `(.*\\/)?${regexPattern.slice(3)}`;
+ }
  const regex = new RegExp(`^${regexPattern}$`);
  return regex.test(filePath);
  }
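This matchGlob change fixes patterns with a leading **/: previously **/ compiled to .*/, which requires at least one path separator, so a pattern like **/*.ts could never match a root-level file. The new (.*\/)? prefix makes the directory portion optional. A standalone check, with matchGlob copied from the new code above:

function matchGlob(filePath, pattern) {
  let regexPattern = pattern.replace(/\*\*/g, "<<<DOUBLESTAR>>>").replace(/\*/g, "[^/]*").replace(/<<<DOUBLESTAR>>>/g, ".*").replace(/\?/g, ".").replace(/\{([^}]+)\}/g, (_, p1) => `(${p1.split(",").join("|")})`);
  if (regexPattern.startsWith(".*/")) {
    regexPattern = `(.*\\/)?${regexPattern.slice(3)}`;
  }
  return new RegExp(`^${regexPattern}$`).test(filePath);
}
matchGlob("index.ts", "**/*.ts");   // true in 0.2.0 (false in 0.1.10)
matchGlob("src/a/b.ts", "**/*.ts"); // true in both versions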
@@ -2620,158 +2627,204 @@ function generateChunkId(filePath, chunk) {
  function generateChunkHash(chunk) {
  return hashContent(chunk.content);
  }
-
- // src/indexer/inverted-index.ts
- import { existsSync as existsSync3, readFileSync as readFileSync3, writeFileSync } from "fs";
- import * as path4 from "path";
  var InvertedIndex = class {
- indexPath;
- termToChunks = /* @__PURE__ */ new Map();
- chunkTokens = /* @__PURE__ */ new Map();
- totalTokenCount = 0;
+ inner;
  constructor(indexPath) {
- this.indexPath = path4.join(indexPath, "inverted-index.json");
+ this.inner = new native.InvertedIndex(indexPath);
  }
  load() {
- if (!existsSync3(this.indexPath)) {
- return;
- }
- try {
- const content = readFileSync3(this.indexPath, "utf-8");
- const data = JSON.parse(content);
- for (const [term, chunkIds] of Object.entries(data.termToChunks)) {
- this.termToChunks.set(term, new Set(chunkIds));
- }
- for (const [chunkId, tokens] of Object.entries(data.chunkTokens)) {
- const tokenMap = new Map(Object.entries(tokens).map(([k, v]) => [k, v]));
- this.chunkTokens.set(chunkId, tokenMap);
- for (const count of tokenMap.values()) {
- this.totalTokenCount += count;
- }
- }
- } catch {
- this.termToChunks.clear();
- this.chunkTokens.clear();
- this.totalTokenCount = 0;
- }
+ this.inner.load();
  }
  save() {
- const data = {
- termToChunks: {},
- chunkTokens: {},
- avgDocLength: this.getAvgDocLength()
- };
- for (const [term, chunkIds] of this.termToChunks) {
- data.termToChunks[term] = Array.from(chunkIds);
- }
- for (const [chunkId, tokens] of this.chunkTokens) {
- data.chunkTokens[chunkId] = Object.fromEntries(tokens);
- }
- writeFileSync(this.indexPath, JSON.stringify(data));
+ this.inner.save();
  }
  addChunk(chunkId, content) {
- const tokens = this.tokenize(content);
- const termFreq = /* @__PURE__ */ new Map();
- for (const token of tokens) {
- termFreq.set(token, (termFreq.get(token) || 0) + 1);
- const chunks = this.termToChunks.get(token) || /* @__PURE__ */ new Set();
- chunks.add(chunkId);
- this.termToChunks.set(token, chunks);
- }
- this.chunkTokens.set(chunkId, termFreq);
- this.totalTokenCount += tokens.length;
+ this.inner.addChunk(chunkId, content);
  }
  removeChunk(chunkId) {
- const tokens = this.chunkTokens.get(chunkId);
- if (!tokens) return;
- for (const [token, count] of tokens) {
- this.totalTokenCount -= count;
- const chunks = this.termToChunks.get(token);
- if (chunks) {
- chunks.delete(chunkId);
- if (chunks.size === 0) {
- this.termToChunks.delete(token);
- }
- }
- }
- this.chunkTokens.delete(chunkId);
+ return this.inner.removeChunk(chunkId);
  }
- search(query) {
- const queryTokens = this.tokenize(query);
- if (queryTokens.length === 0) {
- return /* @__PURE__ */ new Map();
- }
- const candidateChunks = /* @__PURE__ */ new Set();
- for (const token of queryTokens) {
- const chunks = this.termToChunks.get(token);
- if (chunks) {
- for (const chunkId of chunks) {
- candidateChunks.add(chunkId);
- }
- }
- }
- const scores = /* @__PURE__ */ new Map();
- const k1 = 1.2;
- const b = 0.75;
- const N = this.chunkTokens.size;
- const avgDocLength = this.getAvgDocLength();
- for (const chunkId of candidateChunks) {
- const termFreq = this.chunkTokens.get(chunkId);
- if (!termFreq) continue;
- const docLength = Array.from(termFreq.values()).reduce((a, b2) => a + b2, 0);
- let score = 0;
- for (const term of queryTokens) {
- const tf = termFreq.get(term) || 0;
- if (tf === 0) continue;
- const df = this.termToChunks.get(term)?.size || 0;
- const idf = Math.log((N - df + 0.5) / (df + 0.5) + 1);
- const tfNorm = tf * (k1 + 1) / (tf + k1 * (1 - b + b * (docLength / avgDocLength)));
- score += idf * tfNorm;
- }
- scores.set(chunkId, score);
- }
- const maxScore = Math.max(...scores.values(), 1);
- for (const [chunkId, score] of scores) {
- scores.set(chunkId, score / maxScore);
+ search(query, limit) {
+ const results = this.inner.search(query, limit ?? 100);
+ const map = /* @__PURE__ */ new Map();
+ for (const r of results) {
+ map.set(r.chunkId, r.score);
  }
- return scores;
+ return map;
  }
  hasChunk(chunkId) {
- return this.chunkTokens.has(chunkId);
+ return this.inner.hasChunk(chunkId);
  }
  clear() {
- this.termToChunks.clear();
- this.chunkTokens.clear();
- this.totalTokenCount = 0;
+ this.inner.clear();
  }
  getDocumentCount() {
- return this.chunkTokens.size;
+ return this.inner.documentCount();
+ }
+ };
+ var Database = class {
+ inner;
+ constructor(dbPath) {
+ this.inner = new native.Database(dbPath);
+ }
+ embeddingExists(contentHash) {
+ return this.inner.embeddingExists(contentHash);
+ }
+ getEmbedding(contentHash) {
+ return this.inner.getEmbedding(contentHash) ?? null;
+ }
+ upsertEmbedding(contentHash, embedding, chunkText, model) {
+ this.inner.upsertEmbedding(contentHash, embedding, chunkText, model);
+ }
+ getMissingEmbeddings(contentHashes) {
+ return this.inner.getMissingEmbeddings(contentHashes);
+ }
+ upsertChunk(chunk) {
+ this.inner.upsertChunk(chunk);
+ }
+ getChunk(chunkId) {
+ return this.inner.getChunk(chunkId) ?? null;
  }
- getAvgDocLength() {
- const count = this.chunkTokens.size;
- return count > 0 ? this.totalTokenCount / count : 100;
+ getChunksByFile(filePath) {
+ return this.inner.getChunksByFile(filePath);
  }
- tokenize(text) {
- return text.toLowerCase().replace(/[^\w\s]/g, " ").split(/\s+/).filter((t) => t.length > 2);
+ deleteChunksByFile(filePath) {
+ return this.inner.deleteChunksByFile(filePath);
+ }
+ addChunksToBranch(branch, chunkIds) {
+ this.inner.addChunksToBranch(branch, chunkIds);
+ }
+ clearBranch(branch) {
+ return this.inner.clearBranch(branch);
+ }
+ getBranchChunkIds(branch) {
+ return this.inner.getBranchChunkIds(branch);
+ }
+ getBranchDelta(branch, baseBranch) {
+ return this.inner.getBranchDelta(branch, baseBranch);
+ }
+ chunkExistsOnBranch(branch, chunkId) {
+ return this.inner.chunkExistsOnBranch(branch, chunkId);
+ }
+ getAllBranches() {
+ return this.inner.getAllBranches();
+ }
+ getMetadata(key) {
+ return this.inner.getMetadata(key) ?? null;
+ }
+ setMetadata(key, value) {
+ this.inner.setMetadata(key, value);
+ }
+ deleteMetadata(key) {
+ return this.inner.deleteMetadata(key);
+ }
+ gcOrphanEmbeddings() {
+ return this.inner.gcOrphanEmbeddings();
+ }
+ gcOrphanChunks() {
+ return this.inner.gcOrphanChunks();
+ }
+ getStats() {
+ return this.inner.getStats();
  }
  };

+ // src/git/index.ts
+ import { existsSync as existsSync3, readFileSync as readFileSync3, readdirSync, statSync } from "fs";
+ import * as path4 from "path";
+ import { execSync } from "child_process";
+ function isGitRepo(dir) {
+ return existsSync3(path4.join(dir, ".git"));
+ }
+ function getCurrentBranch(repoRoot) {
+ const headPath = path4.join(repoRoot, ".git", "HEAD");
+ if (!existsSync3(headPath)) {
+ return null;
+ }
+ try {
+ const headContent = readFileSync3(headPath, "utf-8").trim();
+ const match = headContent.match(/^ref: refs\/heads\/(.+)$/);
+ if (match) {
+ return match[1];
+ }
+ if (/^[0-9a-f]{40}$/i.test(headContent)) {
+ return headContent.slice(0, 7);
+ }
+ return null;
+ } catch {
+ return null;
+ }
+ }
+ function getBaseBranch(repoRoot) {
+ const candidates = ["main", "master", "develop", "trunk"];
+ for (const candidate of candidates) {
+ const refPath = path4.join(repoRoot, ".git", "refs", "heads", candidate);
+ if (existsSync3(refPath)) {
+ return candidate;
+ }
+ const packedRefsPath = path4.join(repoRoot, ".git", "packed-refs");
+ if (existsSync3(packedRefsPath)) {
+ try {
+ const content = readFileSync3(packedRefsPath, "utf-8");
+ if (content.includes(`refs/heads/${candidate}`)) {
+ return candidate;
+ }
+ } catch {
+ }
+ }
+ }
+ try {
+ const result = execSync("git remote show origin", {
+ cwd: repoRoot,
+ encoding: "utf-8",
+ stdio: ["pipe", "pipe", "pipe"]
+ });
+ const match = result.match(/HEAD branch: (.+)/);
+ if (match) {
+ return match[1].trim();
+ }
+ } catch {
+ }
+ return getCurrentBranch(repoRoot) ?? "main";
+ }
+ function getBranchOrDefault(repoRoot) {
+ if (!isGitRepo(repoRoot)) {
+ return "default";
+ }
+ return getCurrentBranch(repoRoot) ?? "default";
+ }
+ function getHeadPath(repoRoot) {
+ return path4.join(repoRoot, ".git", "HEAD");
+ }
+
  // src/indexer/index.ts
+ function float32ArrayToBuffer(arr) {
+ const float32 = new Float32Array(arr);
+ return Buffer.from(float32.buffer);
+ }
+ function bufferToFloat32Array(buf) {
+ return new Float32Array(buf.buffer, buf.byteOffset, buf.byteLength / 4);
+ }
  var Indexer = class {
  config;
  projectRoot;
  indexPath;
  store = null;
  invertedIndex = null;
+ database = null;
  provider = null;
  detectedProvider = null;
  fileHashCache = /* @__PURE__ */ new Map();
  fileHashCachePath = "";
+ failedBatchesPath = "";
+ currentBranch = "default";
+ baseBranch = "main";
  constructor(projectRoot, config) {
  this.projectRoot = projectRoot;
  this.config = config;
  this.indexPath = this.getIndexPath();
  this.fileHashCachePath = path5.join(this.indexPath, "file-hashes.json");
+ this.failedBatchesPath = path5.join(this.indexPath, "failed-batches.json");
  }
  getIndexPath() {
  if (this.config.scope === "global") {
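The InvertedIndex above is now a thin wrapper over a native module, and the new float32 helpers serialize embedding vectors to raw bytes for the native database. Note that bufferToFloat32Array honors buf.byteOffset: Node.js Buffers are frequently views into a shared pooled ArrayBuffer, so reading from buf.buffer at offset 0 could return the wrong bytes. A standalone round-trip using the two helpers exactly as defined above:

function float32ArrayToBuffer(arr) {
  const float32 = new Float32Array(arr);
  return Buffer.from(float32.buffer);
}
function bufferToFloat32Array(buf) {
  // byteOffset matters: this Buffer may be a view into a pooled ArrayBuffer
  return new Float32Array(buf.buffer, buf.byteOffset, buf.byteLength / 4);
}
const vec = [0.25, -1.5, 3];
const buf = float32ArrayToBuffer(vec);              // 12 bytes: 3 floats x 4 bytes
const back = Array.from(bufferToFloat32Array(buf)); // equals vec, to float32 precision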
@@ -2796,7 +2849,38 @@ var Indexer = class {
  for (const [k, v] of this.fileHashCache) {
  obj[k] = v;
  }
- writeFileSync2(this.fileHashCachePath, JSON.stringify(obj));
+ writeFileSync(this.fileHashCachePath, JSON.stringify(obj));
+ }
+ loadFailedBatches() {
+ try {
+ if (existsSync4(this.failedBatchesPath)) {
+ const data = readFileSync4(this.failedBatchesPath, "utf-8");
+ return JSON.parse(data);
+ }
+ } catch {
+ return [];
+ }
+ return [];
+ }
+ saveFailedBatches(batches) {
+ if (batches.length === 0) {
+ if (existsSync4(this.failedBatchesPath)) {
+ fsPromises2.unlink(this.failedBatchesPath).catch(() => {
+ });
+ }
+ return;
+ }
+ writeFileSync(this.failedBatchesPath, JSON.stringify(batches, null, 2));
+ }
+ addFailedBatch(batch, error) {
+ const existing = this.loadFailedBatches();
+ existing.push({
+ chunks: batch,
+ error,
+ attemptCount: 1,
+ lastAttempt: (/* @__PURE__ */ new Date()).toISOString()
+ });
+ this.saveFailedBatches(existing);
  }
  async initialize() {
  this.detectedProvider = await detectEmbeddingProvider(this.config.embeddingProvider);
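Failed embedding batches are now persisted beside the index so they can be retried later via retryFailedBatches (further down in this diff). Given addFailedBatch above and the pendingChunks shape from the indexing loop, an entry in failed-batches.json would look roughly like the object below before JSON.stringify (illustrative values; the exact chunk id and metadata fields come from generateChunkId and the loop, which are not fully shown here):

const failedBatchEntry = {
  chunks: [
    {
      id: "...",          // from generateChunkId
      text: "...",        // text sent to the embedding provider
      content: "...",     // raw chunk content, used for the inverted index
      contentHash: "...", // hash used to dedupe embeddings
      metadata: { filePath: "src/example.ts", startLine: 1, endLine: 20 }
    }
  ],
  error: "Error: ...",
  attemptCount: 1,
  lastAttempt: "2025-01-01T00:00:00.000Z"
};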
@@ -2817,18 +2901,60 @@ var Indexer = class {
  if (existsSync4(indexFilePath)) {
  this.store.load();
  }
- this.invertedIndex = new InvertedIndex(this.indexPath);
- this.invertedIndex.load();
+ const invertedIndexPath = path5.join(this.indexPath, "inverted-index.json");
+ this.invertedIndex = new InvertedIndex(invertedIndexPath);
+ try {
+ this.invertedIndex.load();
+ } catch {
+ if (existsSync4(invertedIndexPath)) {
+ await fsPromises2.unlink(invertedIndexPath);
+ }
+ this.invertedIndex = new InvertedIndex(invertedIndexPath);
+ }
+ const dbPath = path5.join(this.indexPath, "codebase.db");
+ const dbIsNew = !existsSync4(dbPath);
+ this.database = new Database(dbPath);
+ if (dbIsNew && this.store.count() > 0) {
+ this.migrateFromLegacyIndex();
+ }
+ if (isGitRepo(this.projectRoot)) {
+ this.currentBranch = getBranchOrDefault(this.projectRoot);
+ this.baseBranch = getBaseBranch(this.projectRoot);
+ } else {
+ this.currentBranch = "default";
+ this.baseBranch = "default";
+ }
+ }
+ migrateFromLegacyIndex() {
+ if (!this.store || !this.database) return;
+ const allMetadata = this.store.getAllMetadata();
+ const chunkIds = [];
+ for (const { key, metadata } of allMetadata) {
+ const chunkData = {
+ chunkId: key,
+ contentHash: metadata.hash,
+ filePath: metadata.filePath,
+ startLine: metadata.startLine,
+ endLine: metadata.endLine,
+ nodeType: metadata.chunkType,
+ name: metadata.name,
+ language: metadata.language
+ };
+ this.database.upsertChunk(chunkData);
+ chunkIds.push(key);
+ }
+ this.database.addChunksToBranch(this.currentBranch || "default", chunkIds);
  }
  async ensureInitialized() {
- if (!this.store || !this.provider || !this.invertedIndex || !this.detectedProvider) {
+ if (!this.store || !this.provider || !this.invertedIndex || !this.detectedProvider || !this.database) {
  await this.initialize();
  }
  return {
  store: this.store,
  provider: this.provider,
  invertedIndex: this.invertedIndex,
- detectedProvider: this.detectedProvider
+ detectedProvider: this.detectedProvider,
+ database: this.database
  };
  }
  async estimateCost() {
@@ -2842,7 +2968,7 @@ var Indexer = class {
  return createCostEstimate(files, detectedProvider);
  }
  async index(onProgress) {
- const { store, provider, invertedIndex } = await this.ensureInitialized();
+ const { store, provider, invertedIndex, database, detectedProvider } = await this.ensureInitialized();
  const startTime = Date.now();
  const stats = {
  totalFiles: 0,
@@ -2919,11 +3045,30 @@ var Indexer = class {
  const relativePath = path5.relative(this.projectRoot, parsed.path);
  stats.parseFailures.push(relativePath);
  }
+ let fileChunkCount = 0;
  for (const chunk of parsed.chunks) {
+ if (fileChunkCount >= this.config.indexing.maxChunksPerFile) {
+ break;
+ }
+ if (this.config.indexing.semanticOnly && chunk.chunkType === "other") {
+ continue;
+ }
  const id = generateChunkId(parsed.path, chunk);
  const contentHash = generateChunkHash(chunk);
  currentChunkIds.add(id);
+ const chunkData = {
+ chunkId: id,
+ contentHash,
+ filePath: parsed.path,
+ startLine: chunk.startLine,
+ endLine: chunk.endLine,
+ nodeType: chunk.chunkType,
+ name: chunk.name,
+ language: chunk.language
+ };
+ database.upsertChunk(chunkData);
  if (existingChunks.get(id) === contentHash) {
+ fileChunkCount++;
  continue;
  }
  const text = createEmbeddingText(chunk, parsed.path);
@@ -2936,7 +3081,8 @@ var Indexer = class {
  language: chunk.language,
  hash: contentHash
  };
- pendingChunks.push({ id, text, content: chunk.content, metadata });
+ pendingChunks.push({ id, text, content: chunk.content, contentHash, metadata });
+ fileChunkCount++;
  }
  }
  let removedCount = 0;
@@ -2951,6 +3097,8 @@ var Indexer = class {
  stats.existingChunks = currentChunkIds.size - pendingChunks.length;
  stats.removedChunks = removedCount;
  if (pendingChunks.length === 0 && removedCount === 0) {
+ database.clearBranch(this.currentBranch);
+ database.addChunksToBranch(this.currentBranch, Array.from(currentChunkIds));
  this.fileHashCache = currentFileHashes;
  this.saveFileHashCache();
  stats.durationMs = Date.now() - startTime;
@@ -2964,6 +3112,8 @@ var Indexer = class {
  return stats;
  }
  if (pendingChunks.length === 0) {
+ database.clearBranch(this.currentBranch);
+ database.addChunksToBranch(this.currentBranch, Array.from(currentChunkIds));
  store.save();
  invertedIndex.save();
  this.fileHashCache = currentFileHashes;
@@ -2985,8 +3135,22 @@ var Indexer = class {
  chunksProcessed: 0,
  totalChunks: pendingChunks.length
  });
+ const allContentHashes = pendingChunks.map((c) => c.contentHash);
+ const missingHashes = new Set(database.getMissingEmbeddings(allContentHashes));
+ const chunksNeedingEmbedding = pendingChunks.filter((c) => missingHashes.has(c.contentHash));
+ const chunksWithExistingEmbedding = pendingChunks.filter((c) => !missingHashes.has(c.contentHash));
+ for (const chunk of chunksWithExistingEmbedding) {
+ const embeddingBuffer = database.getEmbedding(chunk.contentHash);
+ if (embeddingBuffer) {
+ const vector = bufferToFloat32Array(embeddingBuffer);
+ store.add(chunk.id, Array.from(vector), chunk.metadata);
+ invertedIndex.removeChunk(chunk.id);
+ invertedIndex.addChunk(chunk.id, chunk.content);
+ stats.indexedChunks++;
+ }
+ }
  const queue = new PQueue({ concurrency: 3 });
- const dynamicBatches = createDynamicBatches(pendingChunks);
+ const dynamicBatches = createDynamicBatches(chunksNeedingEmbedding);
  for (const batch of dynamicBatches) {
  queue.add(async () => {
  try {
@@ -3011,7 +3175,15 @@ var Indexer = class {
  metadata: chunk.metadata
  }));
  store.addBatch(items);
- for (const chunk of batch) {
+ for (let i = 0; i < batch.length; i++) {
+ const chunk = batch[i];
+ const embedding = result.embeddings[i];
+ database.upsertEmbedding(
+ chunk.contentHash,
+ float32ArrayToBuffer(embedding),
+ chunk.text,
+ detectedProvider.modelInfo.model
+ );
  invertedIndex.removeChunk(chunk.id);
  invertedIndex.addChunk(chunk.id, chunk.content);
  }
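Embeddings are now content-addressed: the indexer first asks the database which content hashes have no stored embedding, rehydrates everything else straight from the cache, and only sends genuinely new content to the provider (then writes the fresh vectors back with upsertEmbedding). A toy in-memory model of that flow, with a Map standing in for the native Database:

const cache = new Map(); // contentHash -> embedding vector
const getMissingEmbeddings = (hashes) => hashes.filter((h) => !cache.has(h));

cache.set("h1", [0.1, 0.2]); // embedded in an earlier run, possibly for another file
const pending = [
  { id: "a", contentHash: "h1" }, // e.g. a chunk that moved files: new id, same content
  { id: "b", contentHash: "h2" },
];
const missing = new Set(getMissingEmbeddings(pending.map((c) => c.contentHash)));
const toEmbed = pending.filter((c) => missing.has(c.contentHash));    // only "b"
const fromCache = pending.filter((c) => !missing.has(c.contentHash)); // "a": no provider call

Moved or renamed chunks keep their contentHash, so re-indexing them costs no new embedding tokens.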
@@ -3026,6 +3198,7 @@ var Indexer = class {
  });
  } catch (error) {
  stats.failedChunks += batch.length;
+ this.addFailedBatch(batch, String(error));
  console.error(`Failed to embed batch after retries: ${error}`);
  }
  });
@@ -3038,11 +3211,16 @@ var Indexer = class {
  chunksProcessed: stats.indexedChunks,
  totalChunks: pendingChunks.length
  });
+ database.clearBranch(this.currentBranch);
+ database.addChunksToBranch(this.currentBranch, Array.from(currentChunkIds));
  store.save();
  invertedIndex.save();
  this.fileHashCache = currentFileHashes;
  this.saveFileHashCache();
  stats.durationMs = Date.now() - startTime;
+ if (stats.failedChunks > 0) {
+ stats.failedBatchesPath = this.failedBatchesPath;
+ }
  onProgress?.({
  phase: "complete",
  filesProcessed: files.length,
@@ -3053,18 +3231,24 @@ var Indexer = class {
  return stats;
  }
  async search(query, limit, options) {
- const { store, provider } = await this.ensureInitialized();
+ const { store, provider, database } = await this.ensureInitialized();
  if (store.count() === 0) {
  return [];
  }
  const maxResults = limit ?? this.config.search.maxResults;
  const hybridWeight = options?.hybridWeight ?? this.config.search.hybridWeight;
+ const filterByBranch = options?.filterByBranch ?? true;
  const { embedding } = await provider.embed(query);
  const semanticResults = store.search(embedding, maxResults * 4);
  const keywordResults = await this.keywordSearch(query, maxResults * 4);
  const combined = this.fuseResults(semanticResults, keywordResults, hybridWeight, maxResults * 4);
+ let branchChunkIds = null;
+ if (filterByBranch && this.currentBranch !== "default") {
+ branchChunkIds = new Set(database.getBranchChunkIds(this.currentBranch));
+ }
  const filtered = combined.filter((r) => {
  if (r.score < this.config.search.minScore) return false;
+ if (branchChunkIds && !branchChunkIds.has(r.id)) return false;
  if (options?.fileType) {
  const ext = r.metadata.filePath.split(".").pop()?.toLowerCase();
  if (ext !== options.fileType.toLowerCase().replace(/^\./, "")) return false;
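search() now defaults to restricting results to chunks recorded for the current git branch; callers can opt out. A hypothetical call using the option names visible above, with indexer being an initialized Indexer:

const results = await indexer.search("where are embeddings cached", 10, {
  filterByBranch: false, // search chunks from all branches, as 0.1.x did
  fileType: "ts",        // keep only .ts results
  hybridWeight: 0.5,     // blend of semantic and keyword scores in fuseResults
});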
@@ -3166,7 +3350,9 @@ var Indexer = class {
  vectorCount: store.count(),
  provider: detectedProvider.provider,
  model: detectedProvider.modelInfo.model,
- indexPath: this.indexPath
+ indexPath: this.indexPath,
+ currentBranch: this.currentBranch,
+ baseBranch: this.baseBranch
  };
  }
  async clearIndex() {
@@ -3177,7 +3363,7 @@ var Indexer = class {
  invertedIndex.save();
  }
  async healthCheck() {
- const { store, invertedIndex } = await this.ensureInitialized();
+ const { store, invertedIndex, database } = await this.ensureInitialized();
  const allMetadata = store.getAllMetadata();
  const filePathsToChunkKeys = /* @__PURE__ */ new Map();
  for (const { key, metadata } of allMetadata) {
@@ -3194,6 +3380,7 @@ var Indexer = class {
  invertedIndex.removeChunk(key);
  removedCount++;
  }
+ database.deleteChunksByFile(filePath);
  removedFilePaths.push(filePath);
  }
  }
@@ -3201,7 +3388,77 @@ var Indexer = class {
  store.save();
  invertedIndex.save();
  }
- return { removed: removedCount, filePaths: removedFilePaths };
+ const gcOrphanEmbeddings = database.gcOrphanEmbeddings();
+ const gcOrphanChunks = database.gcOrphanChunks();
+ return { removed: removedCount, filePaths: removedFilePaths, gcOrphanEmbeddings, gcOrphanChunks };
+ }
+ async retryFailedBatches() {
+ const { store, provider, invertedIndex } = await this.ensureInitialized();
+ const failedBatches = this.loadFailedBatches();
+ if (failedBatches.length === 0) {
+ return { succeeded: 0, failed: 0, remaining: 0 };
+ }
+ let succeeded = 0;
+ let failed = 0;
+ const stillFailing = [];
+ for (const batch of failedBatches) {
+ try {
+ const result = await pRetry(
+ async () => {
+ const texts = batch.chunks.map((c) => c.text);
+ return provider.embedBatch(texts);
+ },
+ {
+ retries: this.config.indexing.retries,
+ minTimeout: this.config.indexing.retryDelayMs
+ }
+ );
+ const items = batch.chunks.map((chunk, idx) => ({
+ id: chunk.id,
+ vector: result.embeddings[idx],
+ metadata: chunk.metadata
+ }));
+ store.addBatch(items);
+ for (const chunk of batch.chunks) {
+ invertedIndex.removeChunk(chunk.id);
+ invertedIndex.addChunk(chunk.id, chunk.content);
+ }
+ succeeded += batch.chunks.length;
+ } catch (error) {
+ failed += batch.chunks.length;
+ stillFailing.push({
+ ...batch,
+ attemptCount: batch.attemptCount + 1,
+ lastAttempt: (/* @__PURE__ */ new Date()).toISOString(),
+ error: String(error)
+ });
+ }
+ }
+ this.saveFailedBatches(stillFailing);
+ if (succeeded > 0) {
+ store.save();
+ invertedIndex.save();
+ }
+ return { succeeded, failed, remaining: stillFailing.length };
+ }
+ getFailedBatchesCount() {
+ return this.loadFailedBatches().length;
+ }
+ getCurrentBranch() {
+ return this.currentBranch;
+ }
+ getBaseBranch() {
+ return this.baseBranch;
+ }
+ refreshBranchInfo() {
+ if (isGitRepo(this.projectRoot)) {
+ this.currentBranch = getBranchOrDefault(this.projectRoot);
+ this.baseBranch = getBaseBranch(this.projectRoot);
+ }
+ }
+ async getDatabaseStats() {
+ const { database } = await this.ensureInitialized();
+ return database.getStats();
  }
  };

@@ -5028,9 +5285,82 @@ var FileWatcher = class {
  return this.watcher !== null;
  }
  };
+ var GitHeadWatcher = class {
+ watcher = null;
+ projectRoot;
+ currentBranch = null;
+ onBranchChange = null;
+ debounceTimer = null;
+ debounceMs = 100;
+ // Short debounce for git operations
+ constructor(projectRoot) {
+ this.projectRoot = projectRoot;
+ }
+ start(handler) {
+ if (this.watcher) {
+ return;
+ }
+ if (!isGitRepo(this.projectRoot)) {
+ return;
+ }
+ this.onBranchChange = handler;
+ this.currentBranch = getCurrentBranch(this.projectRoot);
+ const headPath = getHeadPath(this.projectRoot);
+ const refsPath = path6.join(this.projectRoot, ".git", "refs", "heads");
+ this.watcher = chokidar_default.watch([headPath, refsPath], {
+ persistent: true,
+ ignoreInitial: true,
+ awaitWriteFinish: {
+ stabilityThreshold: 50,
+ pollInterval: 10
+ }
+ });
+ this.watcher.on("change", () => this.handleHeadChange());
+ this.watcher.on("add", () => this.handleHeadChange());
+ }
+ handleHeadChange() {
+ if (this.debounceTimer) {
+ clearTimeout(this.debounceTimer);
+ }
+ this.debounceTimer = setTimeout(() => {
+ this.checkBranchChange();
+ }, this.debounceMs);
+ }
+ async checkBranchChange() {
+ const newBranch = getCurrentBranch(this.projectRoot);
+ if (newBranch && newBranch !== this.currentBranch && this.onBranchChange) {
+ const oldBranch = this.currentBranch;
+ this.currentBranch = newBranch;
+ try {
+ await this.onBranchChange(oldBranch, newBranch);
+ } catch (error) {
+ console.error("Error handling branch change:", error);
+ }
+ } else if (newBranch) {
+ this.currentBranch = newBranch;
+ }
+ }
+ getCurrentBranch() {
+ return this.currentBranch;
+ }
+ stop() {
+ if (this.debounceTimer) {
+ clearTimeout(this.debounceTimer);
+ this.debounceTimer = null;
+ }
+ if (this.watcher) {
+ this.watcher.close();
+ this.watcher = null;
+ }
+ this.onBranchChange = null;
+ }
+ isRunning() {
+ return this.watcher !== null;
+ }
+ };
  function createWatcherWithIndexer(indexer, projectRoot, config) {
- const watcher = new FileWatcher(projectRoot, config);
- watcher.start(async (changes) => {
+ const fileWatcher = new FileWatcher(projectRoot, config);
+ fileWatcher.start(async (changes) => {
  const hasAddOrChange = changes.some(
  (c) => c.type === "add" || c.type === "change"
  );
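GitHeadWatcher watches .git/HEAD plus .git/refs/heads with chokidar, debounces the burst of file events a checkout produces, and invokes its handler only when the branch name actually changes. Standalone usage per the class API above:

const watcher = new GitHeadWatcher("/path/to/repo");
watcher.start(async (oldBranch, newBranch) => {
  console.log(`switched ${oldBranch ?? "(none)"} -> ${newBranch}`);
});
// ...later
watcher.stop();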
@@ -5039,7 +5369,22 @@ function createWatcherWithIndexer(indexer, projectRoot, config) {
  await indexer.index();
  }
  });
- return watcher;
+ let gitWatcher = null;
+ if (isGitRepo(projectRoot)) {
+ gitWatcher = new GitHeadWatcher(projectRoot);
+ gitWatcher.start(async (oldBranch, newBranch) => {
+ console.log(`Branch changed: ${oldBranch ?? "(none)"} -> ${newBranch}`);
+ await indexer.index();
+ });
+ }
+ return {
+ fileWatcher,
+ gitWatcher,
+ stop() {
+ fileWatcher.stop();
+ gitWatcher?.stop();
+ }
+ };
  }

  // src/tools/index.ts
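Note the changed return type: 0.1.10 returned the FileWatcher itself, while 0.2.0 returns a composite handle. Any caller outside this bundle that held the old return value would need an update along these lines (a sketch; this diff does not show the function's callers):

// 0.1.10: const watcher = createWatcherWithIndexer(indexer, root, config); watcher.stop();
const handle = createWatcherWithIndexer(indexer, root, config);
handle.fileWatcher; // the FileWatcher, as before
handle.gitWatcher;  // a GitHeadWatcher, or null outside a git repo
handle.stop();      // stops both watchers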
@@ -5123,13 +5468,19 @@ var index_health_check = tool({
  async execute() {
  const indexer = getIndexer();
  const result = await indexer.healthCheck();
- if (result.removed === 0) {
+ if (result.removed === 0 && result.gcOrphanEmbeddings === 0 && result.gcOrphanChunks === 0) {
  return "Index is healthy. No stale entries found.";
  }
- const lines = [
- `Health check complete:`,
- ` Removed stale entries: ${result.removed}`
- ];
+ const lines = [`Health check complete:`];
+ if (result.removed > 0) {
+ lines.push(` Removed stale entries: ${result.removed}`);
+ }
+ if (result.gcOrphanEmbeddings > 0) {
+ lines.push(` Garbage collected orphan embeddings: ${result.gcOrphanEmbeddings}`);
+ }
+ if (result.gcOrphanChunks > 0) {
+ lines.push(` Garbage collected orphan chunks: ${result.gcOrphanChunks}`);
+ }
  if (result.filePaths.length > 0) {
  lines.push(` Cleaned paths: ${result.filePaths.join(", ")}`);
  }
@@ -5184,13 +5535,18 @@ function formatStatus(status) {
  if (!status.indexed) {
  return "Codebase is not indexed. Run index_codebase to create an index.";
  }
- return [
+ const lines = [
  `Index status:`,
  ` Indexed chunks: ${status.vectorCount.toLocaleString()}`,
  ` Provider: ${status.provider}`,
  ` Model: ${status.model}`,
  ` Location: ${status.indexPath}`
- ].join("\n");
+ ];
+ if (status.currentBranch !== "default") {
+ lines.push(` Current branch: ${status.currentBranch}`);
+ lines.push(` Base branch: ${status.baseBranch}`);
+ }
+ return lines.join("\n");
  }

  // src/index.ts
@@ -5227,6 +5583,39 @@ var plugin = async ({ directory }) => {
  index_codebase,
  index_status,
  index_health_check
+ },
+ async config(cfg) {
+ cfg.command = cfg.command ?? {};
+ cfg.command["search"] = {
+ description: "Search codebase by meaning using semantic search",
+ template: `Use the \`codebase_search\` tool to find code related to: $ARGUMENTS
+
+ If the index doesn't exist yet, run \`index_codebase\` first.
+
+ Return the most relevant results with file paths and line numbers.`
+ };
+ cfg.command["find"] = {
+ description: "Find code using hybrid approach (semantic + grep)",
+ template: `Find code related to: $ARGUMENTS
+
+ Strategy:
+ 1. First use \`codebase_search\` to find semantically related code
+ 2. From the results, identify specific function/class names
+ 3. Use grep to find all occurrences of those identifiers
+ 4. Combine findings into a comprehensive answer
+
+ If the semantic index doesn't exist, run \`index_codebase\` first.`
+ };
+ cfg.command["index"] = {
+ description: "Index the codebase for semantic search",
+ template: `Run the \`index_codebase\` tool to create or update the semantic search index.
+
+ Show progress and final statistics including:
+ - Number of files processed
+ - Number of chunks indexed
+ - Tokens used
+ - Duration`
+ };
  }
  };
  };
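The new config hook registers three prompt templates under cfg.command, with $ARGUMENTS standing in for the user's input. Assuming opencode surfaces command entries as slash commands, a session could invoke:

/search where are failed embedding batches stored
/find GitHeadWatcher
/index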