opencode-codebase-index 0.1.10 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -693,6 +693,8 @@ function getDefaultIndexingConfig() {
  autoIndex: false,
  watchFiles: true,
  maxFileSize: 1048576,
+ maxChunksPerFile: 100,
+ semanticOnly: false,
  retries: 3,
  retryDelayMs: 1e3
  };
@@ -726,6 +728,8 @@ function parseConfig(raw) {
  autoIndex: typeof rawIndexing.autoIndex === "boolean" ? rawIndexing.autoIndex : defaultIndexing.autoIndex,
  watchFiles: typeof rawIndexing.watchFiles === "boolean" ? rawIndexing.watchFiles : defaultIndexing.watchFiles,
  maxFileSize: typeof rawIndexing.maxFileSize === "number" ? rawIndexing.maxFileSize : defaultIndexing.maxFileSize,
+ maxChunksPerFile: typeof rawIndexing.maxChunksPerFile === "number" ? Math.max(1, rawIndexing.maxChunksPerFile) : defaultIndexing.maxChunksPerFile,
+ semanticOnly: typeof rawIndexing.semanticOnly === "boolean" ? rawIndexing.semanticOnly : defaultIndexing.semanticOnly,
  retries: typeof rawIndexing.retries === "number" ? rawIndexing.retries : defaultIndexing.retries,
  retryDelayMs: typeof rawIndexing.retryDelayMs === "number" ? rawIndexing.retryDelayMs : defaultIndexing.retryDelayMs
  };
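
Both new options are optional in user config: getDefaultIndexingConfig() supplies maxChunksPerFile: 100 and semanticOnly: false, and parseConfig clamps maxChunksPerFile to a minimum of 1. A minimal sketch of input exercising the new fields (assuming the indexing block of the raw config is what parseConfig reads as rawIndexing, which this hunk does not itself show; values illustrative):

    const raw = {
      indexing: {
        maxChunksPerFile: 50, // cap on chunks taken from any single file
        semanticOnly: true    // drop chunks whose chunkType is "other"
      }
    };
    // parseConfig(raw) fills every other field from the defaults above;
    // Math.max(1, ...) guards against zero or negative values.
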
@@ -2189,7 +2193,10 @@ function shouldIncludeFile(filePath, projectRoot, includePatterns, excludePatter
  return false;
  }
  function matchGlob(filePath, pattern) {
- const regexPattern = pattern.replace(/\*\*/g, "<<<DOUBLESTAR>>>").replace(/\*/g, "[^/]*").replace(/<<<DOUBLESTAR>>>/g, ".*").replace(/\?/g, ".").replace(/\{([^}]+)\}/g, (_, p1) => `(${p1.split(",").join("|")})`);
+ let regexPattern = pattern.replace(/\*\*/g, "<<<DOUBLESTAR>>>").replace(/\*/g, "[^/]*").replace(/<<<DOUBLESTAR>>>/g, ".*").replace(/\?/g, ".").replace(/\{([^}]+)\}/g, (_, p1) => `(${p1.split(",").join("|")})`);
+ if (regexPattern.startsWith(".*/")) {
+ regexPattern = `(.*\\/)?${regexPattern.slice(3)}`;
+ }
  const regex = new RegExp(`^${regexPattern}$`);
  return regex.test(filePath);
  }
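
The new branch rewrites a leading ".*/" (produced by a "**/" prefix) into an optional group, so such patterns now also match files at the project root instead of requiring at least one directory separator. A sketch of the behavior change, reusing the transformation verbatim (toRegex is an illustrative stand-in for matchGlob):

    function toRegex(pattern) {
      let p = pattern.replace(/\*\*/g, "<<<DOUBLESTAR>>>").replace(/\*/g, "[^/]*").replace(/<<<DOUBLESTAR>>>/g, ".*").replace(/\?/g, ".").replace(/\{([^}]+)\}/g, (_, p1) => `(${p1.split(",").join("|")})`);
      if (p.startsWith(".*/")) {
        p = `(.*\\/)?${p.slice(3)}`; // leading "**/" becomes optional
      }
      return new RegExp(`^${p}$`);
    }
    toRegex("**/*.test.ts").test("foo.test.ts");       // true — was false in 0.1.10
    toRegex("**/*.test.ts").test("src/a/foo.test.ts"); // true in both versions
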
@@ -2626,158 +2633,204 @@ function generateChunkId(filePath, chunk) {
  function generateChunkHash(chunk) {
  return hashContent(chunk.content);
  }
-
- // src/indexer/inverted-index.ts
- var import_fs3 = require("fs");
- var path4 = __toESM(require("path"), 1);
  var InvertedIndex = class {
- indexPath;
- termToChunks = /* @__PURE__ */ new Map();
- chunkTokens = /* @__PURE__ */ new Map();
- totalTokenCount = 0;
+ inner;
  constructor(indexPath) {
- this.indexPath = path4.join(indexPath, "inverted-index.json");
+ this.inner = new native.InvertedIndex(indexPath);
  }
  load() {
- if (!(0, import_fs3.existsSync)(this.indexPath)) {
- return;
- }
- try {
- const content = (0, import_fs3.readFileSync)(this.indexPath, "utf-8");
- const data = JSON.parse(content);
- for (const [term, chunkIds] of Object.entries(data.termToChunks)) {
- this.termToChunks.set(term, new Set(chunkIds));
- }
- for (const [chunkId, tokens] of Object.entries(data.chunkTokens)) {
- const tokenMap = new Map(Object.entries(tokens).map(([k, v]) => [k, v]));
- this.chunkTokens.set(chunkId, tokenMap);
- for (const count of tokenMap.values()) {
- this.totalTokenCount += count;
- }
- }
- } catch {
- this.termToChunks.clear();
- this.chunkTokens.clear();
- this.totalTokenCount = 0;
- }
+ this.inner.load();
  }
  save() {
- const data = {
- termToChunks: {},
- chunkTokens: {},
- avgDocLength: this.getAvgDocLength()
- };
- for (const [term, chunkIds] of this.termToChunks) {
- data.termToChunks[term] = Array.from(chunkIds);
- }
- for (const [chunkId, tokens] of this.chunkTokens) {
- data.chunkTokens[chunkId] = Object.fromEntries(tokens);
- }
- (0, import_fs3.writeFileSync)(this.indexPath, JSON.stringify(data));
+ this.inner.save();
  }
  addChunk(chunkId, content) {
- const tokens = this.tokenize(content);
- const termFreq = /* @__PURE__ */ new Map();
- for (const token of tokens) {
- termFreq.set(token, (termFreq.get(token) || 0) + 1);
- const chunks = this.termToChunks.get(token) || /* @__PURE__ */ new Set();
- chunks.add(chunkId);
- this.termToChunks.set(token, chunks);
- }
- this.chunkTokens.set(chunkId, termFreq);
- this.totalTokenCount += tokens.length;
+ this.inner.addChunk(chunkId, content);
  }
  removeChunk(chunkId) {
- const tokens = this.chunkTokens.get(chunkId);
- if (!tokens) return;
- for (const [token, count] of tokens) {
- this.totalTokenCount -= count;
- const chunks = this.termToChunks.get(token);
- if (chunks) {
- chunks.delete(chunkId);
- if (chunks.size === 0) {
- this.termToChunks.delete(token);
- }
- }
- }
- this.chunkTokens.delete(chunkId);
+ return this.inner.removeChunk(chunkId);
  }
- search(query) {
- const queryTokens = this.tokenize(query);
- if (queryTokens.length === 0) {
- return /* @__PURE__ */ new Map();
- }
- const candidateChunks = /* @__PURE__ */ new Set();
- for (const token of queryTokens) {
- const chunks = this.termToChunks.get(token);
- if (chunks) {
- for (const chunkId of chunks) {
- candidateChunks.add(chunkId);
- }
- }
+ search(query, limit) {
+ const results = this.inner.search(query, limit ?? 100);
+ const map = /* @__PURE__ */ new Map();
+ for (const r of results) {
+ map.set(r.chunkId, r.score);
  }
- const scores = /* @__PURE__ */ new Map();
- const k1 = 1.2;
- const b = 0.75;
- const N = this.chunkTokens.size;
- const avgDocLength = this.getAvgDocLength();
- for (const chunkId of candidateChunks) {
- const termFreq = this.chunkTokens.get(chunkId);
- if (!termFreq) continue;
- const docLength = Array.from(termFreq.values()).reduce((a, b2) => a + b2, 0);
- let score = 0;
- for (const term of queryTokens) {
- const tf = termFreq.get(term) || 0;
- if (tf === 0) continue;
- const df = this.termToChunks.get(term)?.size || 0;
- const idf = Math.log((N - df + 0.5) / (df + 0.5) + 1);
- const tfNorm = tf * (k1 + 1) / (tf + k1 * (1 - b + b * (docLength / avgDocLength)));
- score += idf * tfNorm;
- }
- scores.set(chunkId, score);
- }
- const maxScore = Math.max(...scores.values(), 1);
- for (const [chunkId, score] of scores) {
- scores.set(chunkId, score / maxScore);
- }
- return scores;
+ return map;
  }
  hasChunk(chunkId) {
- return this.chunkTokens.has(chunkId);
+ return this.inner.hasChunk(chunkId);
  }
  clear() {
- this.termToChunks.clear();
- this.chunkTokens.clear();
- this.totalTokenCount = 0;
+ this.inner.clear();
  }
  getDocumentCount() {
- return this.chunkTokens.size;
+ return this.inner.documentCount();
+ }
+ };
+ var Database = class {
+ inner;
+ constructor(dbPath) {
+ this.inner = new native.Database(dbPath);
+ }
+ embeddingExists(contentHash) {
+ return this.inner.embeddingExists(contentHash);
+ }
+ getEmbedding(contentHash) {
+ return this.inner.getEmbedding(contentHash) ?? null;
+ }
+ upsertEmbedding(contentHash, embedding, chunkText, model) {
+ this.inner.upsertEmbedding(contentHash, embedding, chunkText, model);
+ }
+ getMissingEmbeddings(contentHashes) {
+ return this.inner.getMissingEmbeddings(contentHashes);
+ }
+ upsertChunk(chunk) {
+ this.inner.upsertChunk(chunk);
+ }
+ getChunk(chunkId) {
+ return this.inner.getChunk(chunkId) ?? null;
  }
- getAvgDocLength() {
- const count = this.chunkTokens.size;
- return count > 0 ? this.totalTokenCount / count : 100;
+ getChunksByFile(filePath) {
+ return this.inner.getChunksByFile(filePath);
  }
- tokenize(text) {
- return text.toLowerCase().replace(/[^\w\s]/g, " ").split(/\s+/).filter((t) => t.length > 2);
+ deleteChunksByFile(filePath) {
+ return this.inner.deleteChunksByFile(filePath);
+ }
+ addChunksToBranch(branch, chunkIds) {
+ this.inner.addChunksToBranch(branch, chunkIds);
+ }
+ clearBranch(branch) {
+ return this.inner.clearBranch(branch);
+ }
+ getBranchChunkIds(branch) {
+ return this.inner.getBranchChunkIds(branch);
+ }
+ getBranchDelta(branch, baseBranch) {
+ return this.inner.getBranchDelta(branch, baseBranch);
+ }
+ chunkExistsOnBranch(branch, chunkId) {
+ return this.inner.chunkExistsOnBranch(branch, chunkId);
+ }
+ getAllBranches() {
+ return this.inner.getAllBranches();
+ }
+ getMetadata(key) {
+ return this.inner.getMetadata(key) ?? null;
+ }
+ setMetadata(key, value) {
+ this.inner.setMetadata(key, value);
+ }
+ deleteMetadata(key) {
+ return this.inner.deleteMetadata(key);
+ }
+ gcOrphanEmbeddings() {
+ return this.inner.gcOrphanEmbeddings();
+ }
+ gcOrphanChunks() {
+ return this.inner.gcOrphanChunks();
+ }
+ getStats() {
+ return this.inner.getStats();
  }
  };
 
+ // src/git/index.ts
+ var import_fs3 = require("fs");
+ var path4 = __toESM(require("path"), 1);
+ var import_child_process = require("child_process");
+ function isGitRepo(dir) {
+ return (0, import_fs3.existsSync)(path4.join(dir, ".git"));
+ }
+ function getCurrentBranch(repoRoot) {
+ const headPath = path4.join(repoRoot, ".git", "HEAD");
+ if (!(0, import_fs3.existsSync)(headPath)) {
+ return null;
+ }
+ try {
+ const headContent = (0, import_fs3.readFileSync)(headPath, "utf-8").trim();
+ const match = headContent.match(/^ref: refs\/heads\/(.+)$/);
+ if (match) {
+ return match[1];
+ }
+ if (/^[0-9a-f]{40}$/i.test(headContent)) {
+ return headContent.slice(0, 7);
+ }
+ return null;
+ } catch {
+ return null;
+ }
+ }
+ function getBaseBranch(repoRoot) {
+ const candidates = ["main", "master", "develop", "trunk"];
+ for (const candidate of candidates) {
+ const refPath = path4.join(repoRoot, ".git", "refs", "heads", candidate);
+ if ((0, import_fs3.existsSync)(refPath)) {
+ return candidate;
+ }
+ const packedRefsPath = path4.join(repoRoot, ".git", "packed-refs");
+ if ((0, import_fs3.existsSync)(packedRefsPath)) {
+ try {
+ const content = (0, import_fs3.readFileSync)(packedRefsPath, "utf-8");
+ if (content.includes(`refs/heads/${candidate}`)) {
+ return candidate;
+ }
+ } catch {
+ }
+ }
+ }
+ try {
+ const result = (0, import_child_process.execSync)("git remote show origin", {
+ cwd: repoRoot,
+ encoding: "utf-8",
+ stdio: ["pipe", "pipe", "pipe"]
+ });
+ const match = result.match(/HEAD branch: (.+)/);
+ if (match) {
+ return match[1].trim();
+ }
+ } catch {
+ }
+ return getCurrentBranch(repoRoot) ?? "main";
+ }
+ function getBranchOrDefault(repoRoot) {
+ if (!isGitRepo(repoRoot)) {
+ return "default";
+ }
+ return getCurrentBranch(repoRoot) ?? "default";
+ }
+ function getHeadPath(repoRoot) {
+ return path4.join(repoRoot, ".git", "HEAD");
+ }
+
  // src/indexer/index.ts
+ function float32ArrayToBuffer(arr) {
+ const float32 = new Float32Array(arr);
+ return Buffer.from(float32.buffer);
+ }
+ function bufferToFloat32Array(buf) {
+ return new Float32Array(buf.buffer, buf.byteOffset, buf.byteLength / 4);
+ }
  var Indexer = class {
  config;
  projectRoot;
  indexPath;
  store = null;
  invertedIndex = null;
+ database = null;
  provider = null;
  detectedProvider = null;
  fileHashCache = /* @__PURE__ */ new Map();
  fileHashCachePath = "";
+ failedBatchesPath = "";
+ currentBranch = "default";
+ baseBranch = "main";
  constructor(projectRoot, config) {
  this.projectRoot = projectRoot;
  this.config = config;
  this.indexPath = this.getIndexPath();
  this.fileHashCachePath = path5.join(this.indexPath, "file-hashes.json");
+ this.failedBatchesPath = path5.join(this.indexPath, "failed-batches.json");
  }
  getIndexPath() {
  if (this.config.scope === "global") {
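
float32ArrayToBuffer and bufferToFloat32Array are the serialization boundary for embeddings cached in codebase.db: four bytes per component, with byteOffset honored on the way back so a Buffer that is a view into Node's pooled allocation still reads correctly. A quick round trip (values chosen to be exactly representable in float32):

    const buf = float32ArrayToBuffer([0.25, -0.5, 1]);
    buf.byteLength;   // 12
    const vec = bufferToFloat32Array(buf);
    Array.from(vec);  // [0.25, -0.5, 1]
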
@@ -2804,6 +2857,37 @@ var Indexer = class {
  }
  (0, import_fs4.writeFileSync)(this.fileHashCachePath, JSON.stringify(obj));
  }
+ loadFailedBatches() {
+ try {
+ if ((0, import_fs4.existsSync)(this.failedBatchesPath)) {
+ const data = (0, import_fs4.readFileSync)(this.failedBatchesPath, "utf-8");
+ return JSON.parse(data);
+ }
+ } catch {
+ return [];
+ }
+ return [];
+ }
+ saveFailedBatches(batches) {
+ if (batches.length === 0) {
+ if ((0, import_fs4.existsSync)(this.failedBatchesPath)) {
+ import_fs4.promises.unlink(this.failedBatchesPath).catch(() => {
+ });
+ }
+ return;
+ }
+ (0, import_fs4.writeFileSync)(this.failedBatchesPath, JSON.stringify(batches, null, 2));
+ }
+ addFailedBatch(batch, error) {
+ const existing = this.loadFailedBatches();
+ existing.push({
+ chunks: batch,
+ error,
+ attemptCount: 1,
+ lastAttempt: (/* @__PURE__ */ new Date()).toISOString()
+ });
+ this.saveFailedBatches(existing);
+ }
  async initialize() {
  this.detectedProvider = await detectEmbeddingProvider(this.config.embeddingProvider);
  if (!this.detectedProvider) {
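
Batches that exhaust their retries are persisted to failed-batches.json inside the index directory, carrying the full pending chunks plus bookkeeping so retryFailedBatches (further down) can replay them. A sketch of the on-disk shape (all values illustrative):

    [
      {
        chunks: [
          {
            id: "...",            // generateChunkId output
            text: "...",          // embedding text
            content: "...",       // raw chunk content
            contentHash: "...",
            metadata: { /* filePath, startLine, endLine, ... */ }
          }
        ],
        error: "Error: 429 Too Many Requests",
        attemptCount: 1,
        lastAttempt: "2025-01-01T12:00:00.000Z"
      }
    ]
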
@@ -2823,18 +2907,60 @@ var Indexer = class {
  if ((0, import_fs4.existsSync)(indexFilePath)) {
  this.store.load();
  }
- this.invertedIndex = new InvertedIndex(this.indexPath);
- this.invertedIndex.load();
+ const invertedIndexPath = path5.join(this.indexPath, "inverted-index.json");
+ this.invertedIndex = new InvertedIndex(invertedIndexPath);
+ try {
+ this.invertedIndex.load();
+ } catch {
+ if ((0, import_fs4.existsSync)(invertedIndexPath)) {
+ await import_fs4.promises.unlink(invertedIndexPath);
+ }
+ this.invertedIndex = new InvertedIndex(invertedIndexPath);
+ }
+ const dbPath = path5.join(this.indexPath, "codebase.db");
+ const dbIsNew = !(0, import_fs4.existsSync)(dbPath);
+ this.database = new Database(dbPath);
+ if (dbIsNew && this.store.count() > 0) {
+ this.migrateFromLegacyIndex();
+ }
+ if (isGitRepo(this.projectRoot)) {
+ this.currentBranch = getBranchOrDefault(this.projectRoot);
+ this.baseBranch = getBaseBranch(this.projectRoot);
+ } else {
+ this.currentBranch = "default";
+ this.baseBranch = "default";
+ }
+ }
+ migrateFromLegacyIndex() {
+ if (!this.store || !this.database) return;
+ const allMetadata = this.store.getAllMetadata();
+ const chunkIds = [];
+ for (const { key, metadata } of allMetadata) {
+ const chunkData = {
+ chunkId: key,
+ contentHash: metadata.hash,
+ filePath: metadata.filePath,
+ startLine: metadata.startLine,
+ endLine: metadata.endLine,
+ nodeType: metadata.chunkType,
+ name: metadata.name,
+ language: metadata.language
+ };
+ this.database.upsertChunk(chunkData);
+ chunkIds.push(key);
+ }
+ this.database.addChunksToBranch(this.currentBranch || "default", chunkIds);
  }
  async ensureInitialized() {
- if (!this.store || !this.provider || !this.invertedIndex || !this.detectedProvider) {
+ if (!this.store || !this.provider || !this.invertedIndex || !this.detectedProvider || !this.database) {
  await this.initialize();
  }
  return {
  store: this.store,
  provider: this.provider,
  invertedIndex: this.invertedIndex,
- detectedProvider: this.detectedProvider
+ detectedProvider: this.detectedProvider,
+ database: this.database
  };
  }
  async estimateCost() {
@@ -2848,7 +2974,7 @@ var Indexer = class {
  return createCostEstimate(files, detectedProvider);
  }
  async index(onProgress) {
- const { store, provider, invertedIndex } = await this.ensureInitialized();
+ const { store, provider, invertedIndex, database, detectedProvider } = await this.ensureInitialized();
  const startTime = Date.now();
  const stats = {
  totalFiles: 0,
@@ -2925,11 +3051,30 @@ var Indexer = class {
  const relativePath = path5.relative(this.projectRoot, parsed.path);
  stats.parseFailures.push(relativePath);
  }
+ let fileChunkCount = 0;
  for (const chunk of parsed.chunks) {
+ if (fileChunkCount >= this.config.indexing.maxChunksPerFile) {
+ break;
+ }
+ if (this.config.indexing.semanticOnly && chunk.chunkType === "other") {
+ continue;
+ }
  const id = generateChunkId(parsed.path, chunk);
  const contentHash = generateChunkHash(chunk);
  currentChunkIds.add(id);
+ const chunkData = {
+ chunkId: id,
+ contentHash,
+ filePath: parsed.path,
+ startLine: chunk.startLine,
+ endLine: chunk.endLine,
+ nodeType: chunk.chunkType,
+ name: chunk.name,
+ language: chunk.language
+ };
+ database.upsertChunk(chunkData);
  if (existingChunks.get(id) === contentHash) {
+ fileChunkCount++;
  continue;
  }
  const text = createEmbeddingText(chunk, parsed.path);
@@ -2942,7 +3087,8 @@ var Indexer = class {
  language: chunk.language,
  hash: contentHash
  };
- pendingChunks.push({ id, text, content: chunk.content, metadata });
+ pendingChunks.push({ id, text, content: chunk.content, contentHash, metadata });
+ fileChunkCount++;
  }
  }
  let removedCount = 0;
@@ -2957,6 +3103,8 @@ var Indexer = class {
  stats.existingChunks = currentChunkIds.size - pendingChunks.length;
  stats.removedChunks = removedCount;
  if (pendingChunks.length === 0 && removedCount === 0) {
+ database.clearBranch(this.currentBranch);
+ database.addChunksToBranch(this.currentBranch, Array.from(currentChunkIds));
  this.fileHashCache = currentFileHashes;
  this.saveFileHashCache();
  stats.durationMs = Date.now() - startTime;
@@ -2970,6 +3118,8 @@ var Indexer = class {
  return stats;
  }
  if (pendingChunks.length === 0) {
+ database.clearBranch(this.currentBranch);
+ database.addChunksToBranch(this.currentBranch, Array.from(currentChunkIds));
  store.save();
  invertedIndex.save();
  this.fileHashCache = currentFileHashes;
@@ -2991,8 +3141,22 @@ var Indexer = class {
  chunksProcessed: 0,
  totalChunks: pendingChunks.length
  });
+ const allContentHashes = pendingChunks.map((c) => c.contentHash);
+ const missingHashes = new Set(database.getMissingEmbeddings(allContentHashes));
+ const chunksNeedingEmbedding = pendingChunks.filter((c) => missingHashes.has(c.contentHash));
+ const chunksWithExistingEmbedding = pendingChunks.filter((c) => !missingHashes.has(c.contentHash));
+ for (const chunk of chunksWithExistingEmbedding) {
+ const embeddingBuffer = database.getEmbedding(chunk.contentHash);
+ if (embeddingBuffer) {
+ const vector = bufferToFloat32Array(embeddingBuffer);
+ store.add(chunk.id, Array.from(vector), chunk.metadata);
+ invertedIndex.removeChunk(chunk.id);
+ invertedIndex.addChunk(chunk.id, chunk.content);
+ stats.indexedChunks++;
+ }
+ }
  const queue = new PQueue({ concurrency: 3 });
- const dynamicBatches = createDynamicBatches(pendingChunks);
+ const dynamicBatches = createDynamicBatches(chunksNeedingEmbedding);
  for (const batch of dynamicBatches) {
  queue.add(async () => {
  try {
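
The hunk above is the cache cut-over: before anything reaches the provider, pending chunks are partitioned by content hash, hashes already in codebase.db are rehydrated straight into the vector store, and only genuinely new content is batched for embedding. Condensed, the effect is:

    const hashes = pendingChunks.map((c) => c.contentHash);
    const missing = new Set(database.getMissingEmbeddings(hashes));
    const toEmbed = pendingChunks.filter((c) => missing.has(c.contentHash));
    // Everything else is served from the cache. Because the hash covers
    // content rather than path, a renamed or moved file re-indexes
    // without a single new embedding call.
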
@@ -3017,7 +3181,15 @@ var Indexer = class {
  metadata: chunk.metadata
  }));
  store.addBatch(items);
- for (const chunk of batch) {
+ for (let i = 0; i < batch.length; i++) {
+ const chunk = batch[i];
+ const embedding = result.embeddings[i];
+ database.upsertEmbedding(
+ chunk.contentHash,
+ float32ArrayToBuffer(embedding),
+ chunk.text,
+ detectedProvider.modelInfo.model
+ );
  invertedIndex.removeChunk(chunk.id);
  invertedIndex.addChunk(chunk.id, chunk.content);
  }
@@ -3032,6 +3204,7 @@ var Indexer = class {
  });
  } catch (error) {
  stats.failedChunks += batch.length;
+ this.addFailedBatch(batch, String(error));
  console.error(`Failed to embed batch after retries: ${error}`);
  }
  });
@@ -3044,11 +3217,16 @@ var Indexer = class {
  chunksProcessed: stats.indexedChunks,
  totalChunks: pendingChunks.length
  });
+ database.clearBranch(this.currentBranch);
+ database.addChunksToBranch(this.currentBranch, Array.from(currentChunkIds));
  store.save();
  invertedIndex.save();
  this.fileHashCache = currentFileHashes;
  this.saveFileHashCache();
  stats.durationMs = Date.now() - startTime;
+ if (stats.failedChunks > 0) {
+ stats.failedBatchesPath = this.failedBatchesPath;
+ }
  onProgress?.({
  phase: "complete",
  filesProcessed: files.length,
@@ -3059,18 +3237,24 @@ var Indexer = class {
  return stats;
  }
  async search(query, limit, options) {
- const { store, provider } = await this.ensureInitialized();
+ const { store, provider, database } = await this.ensureInitialized();
  if (store.count() === 0) {
  return [];
  }
  const maxResults = limit ?? this.config.search.maxResults;
  const hybridWeight = options?.hybridWeight ?? this.config.search.hybridWeight;
+ const filterByBranch = options?.filterByBranch ?? true;
  const { embedding } = await provider.embed(query);
  const semanticResults = store.search(embedding, maxResults * 4);
  const keywordResults = await this.keywordSearch(query, maxResults * 4);
  const combined = this.fuseResults(semanticResults, keywordResults, hybridWeight, maxResults * 4);
+ let branchChunkIds = null;
+ if (filterByBranch && this.currentBranch !== "default") {
+ branchChunkIds = new Set(database.getBranchChunkIds(this.currentBranch));
+ }
  const filtered = combined.filter((r) => {
  if (r.score < this.config.search.minScore) return false;
+ if (branchChunkIds && !branchChunkIds.has(r.id)) return false;
  if (options?.fileType) {
  const ext = r.metadata.filePath.split(".").pop()?.toLowerCase();
  if (ext !== options.fileType.toLowerCase().replace(/^\./, "")) return false;
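
Search results are now intersected with the current branch's chunk set by default; the filter is skipped when the branch is "default" (i.e. outside a git repository). Hypothetical call sites, assuming an initialized Indexer:

    await indexer.search("jwt token refresh"); // branch filter on by default
    await indexer.search("retry backoff", 20, {
      filterByBranch: false, // consider chunks recorded on any branch
      fileType: "ts",        // compared to the file extension; leading dot optional
      hybridWeight: 0.7      // semantic/keyword blend passed to fuseResults
    });
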
@@ -3172,7 +3356,9 @@ var Indexer = class {
  vectorCount: store.count(),
  provider: detectedProvider.provider,
  model: detectedProvider.modelInfo.model,
- indexPath: this.indexPath
+ indexPath: this.indexPath,
+ currentBranch: this.currentBranch,
+ baseBranch: this.baseBranch
  };
  }
  async clearIndex() {
@@ -3183,7 +3369,7 @@ var Indexer = class {
  invertedIndex.save();
  }
  async healthCheck() {
- const { store, invertedIndex } = await this.ensureInitialized();
+ const { store, invertedIndex, database } = await this.ensureInitialized();
  const allMetadata = store.getAllMetadata();
  const filePathsToChunkKeys = /* @__PURE__ */ new Map();
  for (const { key, metadata } of allMetadata) {
@@ -3200,6 +3386,7 @@ var Indexer = class {
  invertedIndex.removeChunk(key);
  removedCount++;
  }
+ database.deleteChunksByFile(filePath);
  removedFilePaths.push(filePath);
  }
  }
@@ -3207,7 +3394,77 @@ var Indexer = class {
  store.save();
  invertedIndex.save();
  }
- return { removed: removedCount, filePaths: removedFilePaths };
+ const gcOrphanEmbeddings = database.gcOrphanEmbeddings();
+ const gcOrphanChunks = database.gcOrphanChunks();
+ return { removed: removedCount, filePaths: removedFilePaths, gcOrphanEmbeddings, gcOrphanChunks };
+ }
+ async retryFailedBatches() {
+ const { store, provider, invertedIndex } = await this.ensureInitialized();
+ const failedBatches = this.loadFailedBatches();
+ if (failedBatches.length === 0) {
+ return { succeeded: 0, failed: 0, remaining: 0 };
+ }
+ let succeeded = 0;
+ let failed = 0;
+ const stillFailing = [];
+ for (const batch of failedBatches) {
+ try {
+ const result = await pRetry(
+ async () => {
+ const texts = batch.chunks.map((c) => c.text);
+ return provider.embedBatch(texts);
+ },
+ {
+ retries: this.config.indexing.retries,
+ minTimeout: this.config.indexing.retryDelayMs
+ }
+ );
+ const items = batch.chunks.map((chunk, idx) => ({
+ id: chunk.id,
+ vector: result.embeddings[idx],
+ metadata: chunk.metadata
+ }));
+ store.addBatch(items);
+ for (const chunk of batch.chunks) {
+ invertedIndex.removeChunk(chunk.id);
+ invertedIndex.addChunk(chunk.id, chunk.content);
+ }
+ succeeded += batch.chunks.length;
+ } catch (error) {
+ failed += batch.chunks.length;
+ stillFailing.push({
+ ...batch,
+ attemptCount: batch.attemptCount + 1,
+ lastAttempt: (/* @__PURE__ */ new Date()).toISOString(),
+ error: String(error)
+ });
+ }
+ }
+ this.saveFailedBatches(stillFailing);
+ if (succeeded > 0) {
+ store.save();
+ invertedIndex.save();
+ }
+ return { succeeded, failed, remaining: stillFailing.length };
+ }
+ getFailedBatchesCount() {
+ return this.loadFailedBatches().length;
+ }
+ getCurrentBranch() {
+ return this.currentBranch;
+ }
+ getBaseBranch() {
+ return this.baseBranch;
+ }
+ refreshBranchInfo() {
+ if (isGitRepo(this.projectRoot)) {
+ this.currentBranch = getBranchOrDefault(this.projectRoot);
+ this.baseBranch = getBaseBranch(this.projectRoot);
+ }
+ }
+ async getDatabaseStats() {
+ const { database } = await this.ensureInitialized();
+ return database.getStats();
  }
  };
 
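
retryFailedBatches drains the queue persisted by addFailedBatch: batches that now succeed are folded into the vector store and inverted index, and batches that fail again are re-saved with attemptCount incremented, so the call is safe to repeat. A hypothetical follow-up after an index() run that reported failures:

    if (indexer.getFailedBatchesCount() > 0) {
      const { succeeded, failed, remaining } = await indexer.retryFailedBatches();
      console.log(`Retried: ${succeeded} embedded, ${failed} still failing, ${remaining} batches queued`);
    }
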
@@ -5034,9 +5291,82 @@ var FileWatcher = class {
  return this.watcher !== null;
  }
  };
+ var GitHeadWatcher = class {
+ watcher = null;
+ projectRoot;
+ currentBranch = null;
+ onBranchChange = null;
+ debounceTimer = null;
+ debounceMs = 100;
+ // Short debounce for git operations
+ constructor(projectRoot) {
+ this.projectRoot = projectRoot;
+ }
+ start(handler) {
+ if (this.watcher) {
+ return;
+ }
+ if (!isGitRepo(this.projectRoot)) {
+ return;
+ }
+ this.onBranchChange = handler;
+ this.currentBranch = getCurrentBranch(this.projectRoot);
+ const headPath = getHeadPath(this.projectRoot);
+ const refsPath = path6.join(this.projectRoot, ".git", "refs", "heads");
+ this.watcher = chokidar_default.watch([headPath, refsPath], {
+ persistent: true,
+ ignoreInitial: true,
+ awaitWriteFinish: {
+ stabilityThreshold: 50,
+ pollInterval: 10
+ }
+ });
+ this.watcher.on("change", () => this.handleHeadChange());
+ this.watcher.on("add", () => this.handleHeadChange());
+ }
+ handleHeadChange() {
+ if (this.debounceTimer) {
+ clearTimeout(this.debounceTimer);
+ }
+ this.debounceTimer = setTimeout(() => {
+ this.checkBranchChange();
+ }, this.debounceMs);
+ }
+ async checkBranchChange() {
+ const newBranch = getCurrentBranch(this.projectRoot);
+ if (newBranch && newBranch !== this.currentBranch && this.onBranchChange) {
+ const oldBranch = this.currentBranch;
+ this.currentBranch = newBranch;
+ try {
+ await this.onBranchChange(oldBranch, newBranch);
+ } catch (error) {
+ console.error("Error handling branch change:", error);
+ }
+ } else if (newBranch) {
+ this.currentBranch = newBranch;
+ }
+ }
+ getCurrentBranch() {
+ return this.currentBranch;
+ }
+ stop() {
+ if (this.debounceTimer) {
+ clearTimeout(this.debounceTimer);
+ this.debounceTimer = null;
+ }
+ if (this.watcher) {
+ this.watcher.close();
+ this.watcher = null;
+ }
+ this.onBranchChange = null;
+ }
+ isRunning() {
+ return this.watcher !== null;
+ }
+ };
  function createWatcherWithIndexer(indexer, projectRoot, config) {
- const watcher = new FileWatcher(projectRoot, config);
- watcher.start(async (changes) => {
+ const fileWatcher = new FileWatcher(projectRoot, config);
+ fileWatcher.start(async (changes) => {
  const hasAddOrChange = changes.some(
  (c) => c.type === "add" || c.type === "change"
  );
@@ -5045,7 +5375,22 @@ function createWatcherWithIndexer(indexer, projectRoot, config) {
  await indexer.index();
  }
  });
- return watcher;
+ let gitWatcher = null;
+ if (isGitRepo(projectRoot)) {
+ gitWatcher = new GitHeadWatcher(projectRoot);
+ gitWatcher.start(async (oldBranch, newBranch) => {
+ console.log(`Branch changed: ${oldBranch ?? "(none)"} -> ${newBranch}`);
+ await indexer.index();
+ });
+ }
+ return {
+ fileWatcher,
+ gitWatcher,
+ stop() {
+ fileWatcher.stop();
+ gitWatcher?.stop();
+ }
+ };
  }
 
  // src/tools/index.ts
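
Note the breaking change just above: createWatcherWithIndexer used to return the FileWatcher itself and now returns an object wrapping both watchers. A hypothetical caller:

    const watchers = createWatcherWithIndexer(indexer, projectRoot, config);
    watchers.gitWatcher?.getCurrentBranch(); // null outside a git repository
    watchers.stop(); // tears down the file watcher and, when present, the HEAD watcher
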
@@ -5129,13 +5474,19 @@ var index_health_check = (0, import_plugin.tool)({
  async execute() {
  const indexer = getIndexer();
  const result = await indexer.healthCheck();
- if (result.removed === 0) {
+ if (result.removed === 0 && result.gcOrphanEmbeddings === 0 && result.gcOrphanChunks === 0) {
  return "Index is healthy. No stale entries found.";
  }
- const lines = [
- `Health check complete:`,
- ` Removed stale entries: ${result.removed}`
- ];
+ const lines = [`Health check complete:`];
+ if (result.removed > 0) {
+ lines.push(` Removed stale entries: ${result.removed}`);
+ }
+ if (result.gcOrphanEmbeddings > 0) {
+ lines.push(` Garbage collected orphan embeddings: ${result.gcOrphanEmbeddings}`);
+ }
+ if (result.gcOrphanChunks > 0) {
+ lines.push(` Garbage collected orphan chunks: ${result.gcOrphanChunks}`);
+ }
  if (result.filePaths.length > 0) {
  lines.push(` Cleaned paths: ${result.filePaths.join(", ")}`);
  }
@@ -5190,13 +5541,18 @@ function formatStatus(status) {
  if (!status.indexed) {
  return "Codebase is not indexed. Run index_codebase to create an index.";
  }
- return [
+ const lines = [
  `Index status:`,
  ` Indexed chunks: ${status.vectorCount.toLocaleString()}`,
  ` Provider: ${status.provider}`,
  ` Model: ${status.model}`,
  ` Location: ${status.indexPath}`
- ].join("\n");
+ ];
+ if (status.currentBranch !== "default") {
+ lines.push(` Current branch: ${status.currentBranch}`);
+ lines.push(` Base branch: ${status.baseBranch}`);
+ }
+ return lines.join("\n");
  }
 
  // src/index.ts
@@ -5233,6 +5589,39 @@ var plugin = async ({ directory }) => {
  index_codebase,
  index_status,
  index_health_check
+ },
+ async config(cfg) {
+ cfg.command = cfg.command ?? {};
+ cfg.command["search"] = {
+ description: "Search codebase by meaning using semantic search",
+ template: `Use the \`codebase_search\` tool to find code related to: $ARGUMENTS
+
+ If the index doesn't exist yet, run \`index_codebase\` first.
+
+ Return the most relevant results with file paths and line numbers.`
+ };
+ cfg.command["find"] = {
+ description: "Find code using hybrid approach (semantic + grep)",
+ template: `Find code related to: $ARGUMENTS
+
+ Strategy:
+ 1. First use \`codebase_search\` to find semantically related code
+ 2. From the results, identify specific function/class names
+ 3. Use grep to find all occurrences of those identifiers
+ 4. Combine findings into a comprehensive answer
+
+ If the semantic index doesn't exist, run \`index_codebase\` first.`
+ };
+ cfg.command["index"] = {
+ description: "Index the codebase for semantic search",
+ template: `Run the \`index_codebase\` tool to create or update the semantic search index.
+
+ Show progress and final statistics including:
+ - Number of files processed
+ - Number of chunks indexed
+ - Tokens used
+ - Duration`
+ };
  }
  };
  };
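
The new config hook registers three slash commands with the host, so an opencode session gains /search, /find, and /index. The /search and /find templates expand $ARGUMENTS, and all three route through the codebase_search and index_codebase tools registered earlier in the plugin.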