npm - codebasesearch - Versions diffs - 0.1.22 → 0.1.23 - Mend

codebasesearch 0.1.22 → 0.1.23

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (8)

package/.prd ADDED Viewed

@@ -0,0 +1,78 @@
+{
+  "project": "code-search",
+  "created": "2026-03-12",
+  "objective": "Profile and improve code-search speed and result quality",
+  "items": [
+    {
+      "id": "fix-dedup-buildtextindex",
+      "subject": "Remove duplicate buildTextIndex from search.js",
+      "status": "pending",
+      "description": "search.js has a private copy of buildTextIndex and tokenize/extractSymbols/isCodeFile that duplicates text-search.js. Import the exported buildTextIndex from text-search.js instead.",
+      "category": "refactor",
+      "effort": "small",
+      "blocking": ["fix-score-normalization", "fix-hybrid-weights"],
+      "blockedBy": []
+    },
+    {
+      "id": "fix-chunk-size",
+      "subject": "Reduce chunk size from 300 to 60 lines for better semantic granularity",
+      "status": "pending",
+      "description": "scanner.js uses 300-line chunks. Embeddings work best on 50-100 line chunks. Reduce to 60-line chunks with 15-line overlap for better vector search quality.",
+      "category": "feature",
+      "effort": "small",
+      "blocking": [],
+      "blockedBy": []
+    },
+    {
+      "id": "fix-score-normalization",
+      "subject": "Fix text search score normalization so top result is always 1.0",
+      "status": "pending",
+      "description": "Text scores divide raw by 100 but scores can exceed 100. Use dynamic max-score scaling. Lower hasGoodTextResults threshold from 0.5 to 0.3.",
+      "category": "bug",
+      "effort": "small",
+      "blocking": [],
+      "blockedBy": ["fix-dedup-buildtextindex"]
+    },
+    {
+      "id": "fix-hybrid-weights",
+      "subject": "Boost text-only exact-match results in hybrid merge",
+      "status": "pending",
+      "description": "Text-only results are capped at 20% weight. Give high-scoring text-only results a floor finalScore of 0.4.",
+      "category": "feature",
+      "effort": "small",
+      "blocking": [],
+      "blockedBy": ["fix-dedup-buildtextindex"]
+    },
+    {
+      "id": "fix-vector-cache-key",
+      "subject": "Strengthen vector search cache key to 20 dimensions",
+      "status": "pending",
+      "description": "Cache key uses only first 5 embedding dims. Use 20 dims for near-zero collision rate.",
+      "category": "bug",
+      "effort": "small",
+      "blocking": [],
+      "blockedBy": []
+    },
+    {
+      "id": "remove-dead-meanpooling",
+      "subject": "Remove dead meanPooling function from embeddings.js",
+      "status": "pending",
+      "description": "meanPooling is defined but never called. Remove dead code.",
+      "category": "refactor",
+      "effort": "small",
+      "blocking": [],
+      "blockedBy": []
+    },
+    {
+      "id": "verify-and-commit",
+      "subject": "Verify improvements and commit all changes",
+      "status": "pending",
+      "description": "Run end-to-end search logic test inline. Commit and push all changes.",
+      "category": "infra",
+      "effort": "small",
+      "blocking": [],
+      "blockedBy": ["fix-dedup-buildtextindex", "fix-chunk-size", "fix-score-normalization", "fix-hybrid-weights", "fix-vector-cache-key", "remove-dead-meanpooling"]
+    }
+  ],
+  "completed": []
+}

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "codebasesearch",
-  "version": "0.1.22",
+  "version": "0.1.23",
   "description": "Ultra-simple code search tool with Jina embeddings, LanceDB, and MCP protocol support",
   "type": "module",
   "bin": {

package/src/cli.js CHANGED Viewed

@@ -75,29 +75,39 @@ export async function run(args) {
     console.log('Generating embeddings and indexing...');
     // Generate embeddings in batches and upsert immediately to free memory
-    const batchSize = 32;
-    let processedCount = 0;
-    for (let i = 0; i < chunks.length; i += batchSize) {
-      const batchChunks = chunks.slice(i, i + batchSize);
-      const batchTexts = batchChunks.map(c => c.content);
-      const batchEmbeddings = await generateEmbeddings(batchTexts);
-      // Create batch with embeddings
-      const batchWithEmbeddings = batchChunks.map((chunk, idx) => ({
-        ...chunk,
-        vector: batchEmbeddings[idx]
-      }));
+    // Optimize batch size based on chunk count (larger batches are more efficient)
+    let batchSize = 32;
+    if (chunks.length > 500) batchSize = 64;
+    if (chunks.length > 1000) batchSize = 96;
-      // Upsert immediately to free memory
-      await upsertChunks(batchWithEmbeddings);
-      processedCount += batchWithEmbeddings.length;
+    let processedCount = 0;
+    let embeddingsAvailable = true;
+    try {
+      for (let i = 0; i < chunks.length; i += batchSize) {
+        const batchChunks = chunks.slice(i, i + batchSize);
+        const batchTexts = batchChunks.map(c => c.content);
+        const batchEmbeddings = await generateEmbeddings(batchTexts);
+        // Create batch with embeddings
+        const batchWithEmbeddings = batchChunks.map((chunk, idx) => ({
+          ...chunk,
+          vector: batchEmbeddings[idx]
+        }));
+        // Upsert immediately to free memory
+        await upsertChunks(batchWithEmbeddings);
+        processedCount += batchWithEmbeddings.length;
+      }
+    } catch (embeddingError) {
+      console.warn(`Warning: Embedding generation failed (${embeddingError.message}). Using text-only search.\n`);
+      embeddingsAvailable = false;
     }
     console.log('Index created\n');
-    // Execute search
-    const results = await executeSearch(query);
+    // Execute search with chunks for hybrid search (text-only if embeddings failed)
+    const results = await executeSearch(query, 10, chunks);
     // Format and display results
     const output = formatResults(results);

package/src/embeddings.js CHANGED Viewed

@@ -13,6 +13,7 @@ try {
 let modelCache = null;
 let cacheCleared = false;
+let modelLoadTime = 0;
 function clearModelCache() {
   const cacheDirs = [
@@ -37,6 +38,7 @@ async function getModel(retryOnError = true) {
     return modelCache;
   }
+  const modelStart = performance.now();
   console.error('Loading embeddings model (this may take a moment on first run)...');
   const modelLoadPromise = pipeline(
@@ -50,6 +52,7 @@ async function getModel(retryOnError = true) {
   try {
     modelCache = await Promise.race([modelLoadPromise, timeoutPromise]);
+    modelLoadTime = performance.now() - modelStart;
   } catch (e) {
     if (retryOnError && !cacheCleared && (e.message.includes('Protobuf') || e.message.includes('parsing'))) {
       console.error('Detected corrupted cache, clearing and retrying...');
@@ -65,37 +68,8 @@ async function getModel(retryOnError = true) {
   return modelCache;
 }
-async function meanPooling(modelOutput, attentionMask) {
-  // Get token embeddings from model output
-  const tokenEmbeddings = modelOutput.data;
-  const embeddingDim = modelOutput.dims[modelOutput.dims.length - 1];
-  const batchSize = modelOutput.dims[0];
-  const seqLength = modelOutput.dims[1];
-  const pooled = [];
-  for (let b = 0; b < batchSize; b++) {
-    let sum = new Array(embeddingDim).fill(0);
-    let count = 0;
-    for (let s = 0; s < seqLength; s++) {
-      const tokenIdx = b * seqLength + s;
-      const maskValue = attentionMask[tokenIdx] || 1;
-      if (maskValue > 0) {
-        const tokenStart = tokenIdx * embeddingDim;
-        for (let d = 0; d < embeddingDim; d++) {
-          sum[d] += tokenEmbeddings[tokenStart + d] * maskValue;
-        }
-        count += maskValue;
-      }
-    }
-    const normalized = sum.map(v => v / Math.max(count, 1e-9));
-    pooled.push(normalized);
-  }
-  return pooled;
+export function getModelLoadTime() {
+  return modelLoadTime;
 }
 export async function generateEmbeddings(texts) {
@@ -105,11 +79,16 @@ export async function generateEmbeddings(texts) {
     texts = [texts];
   }
-  // Generate embeddings for all texts
-  const embeddings = await model(texts, {
-    pooling: 'mean',
-    normalize: true
-  });
+  // Generate embeddings for all texts with timeout per batch
+  const embeddings = await Promise.race([
+    model(texts, {
+      pooling: 'mean',
+      normalize: true
+    }),
+    new Promise((_, reject) =>
+      setTimeout(() => reject(new Error('Embedding generation timeout')), 60000)
+    )
+  ]);
   // Convert to regular arrays
   const result = [];

package/src/scanner.js CHANGED Viewed

@@ -65,7 +65,7 @@ function walkDirectory(dirPath, ignorePatterns, relativePath = '') {
   return files;
 }
-function chunkContent(content, chunkSize = 1000, overlapSize = 100) {
+function chunkContent(content, chunkSize = 60, overlapSize = 15) {
   const lines = content.split('\n');
   const chunks = [];
@@ -81,7 +81,6 @@ function chunkContent(content, chunkSize = 1000, overlapSize = 100) {
       });
     }
-    // Stop if we've reached the end
     if (endIdx === lines.length) {
       break;
     }
@@ -100,7 +99,7 @@ export function scanRepository(rootPath, ignorePatterns) {
       const mtime = file.mtime;
       // For small files, treat as single chunk
-      if (content.split('\n').length <= 1000) {
+      if (content.split('\n').length <= 60) {
         chunks.push({
           file_path: file.relativePath,
           chunk_index: 0,

package/src/search.js CHANGED Viewed

@@ -1,20 +1,84 @@
 import { generateSingleEmbedding } from './embeddings.js';
 import { searchSimilar } from './store.js';
+import { buildTextIndex, searchText } from './text-search.js';
-export async function executeSearch(query, limit = 10) {
+export async function executeSearch(query, limit = 10, allChunks = null, skipVector = false) {
   if (!query || query.trim().length === 0) {
     throw new Error('Query cannot be empty');
   }
   console.error(`Searching for: "${query}"`);
-  // Generate embedding for query
-  const queryEmbedding = await generateSingleEmbedding(query);
+  try {
+    let vectorResults = [];
+    let textResults = [];
-  // Search vector store
-  const results = await searchSimilar(queryEmbedding, limit);
+    if (allChunks && allChunks.length > 0) {
+      const textIndexData = buildTextIndex(allChunks);
+      textResults = searchText(query, allChunks, textIndexData);
+    }
+    const hasGoodTextResults = textResults.length > 0 && textResults[0].score > 0.3;
+    if (!skipVector && !hasGoodTextResults) {
+      try {
+        const queryEmbedding = await generateSingleEmbedding(query);
+        vectorResults = await searchSimilar(queryEmbedding, limit * 2);
+      } catch (e) {
+        console.warn(`Vector search unavailable: ${e.message}`);
+      }
+    }
+    if (vectorResults.length > 0 && textResults.length > 0) {
+      return mergeSearchResults(vectorResults, textResults.slice(0, limit * 2), limit);
+    }
+    const allResults = vectorResults.length > 0 ? vectorResults : textResults;
+    return allResults.slice(0, limit);
+  } catch (error) {
+    console.error('Search error:', error.message);
+    if (allChunks && allChunks.length > 0) {
+      const textIndexData = buildTextIndex(allChunks);
+      const textResults = searchText(query, allChunks, textIndexData);
+      return textResults.slice(0, limit);
+    }
+    throw error;
+  }
+}
+function mergeSearchResults(vectorResults, textResults, limit) {
+  const merged = new Map();
+  vectorResults.forEach((result) => {
+    const key = `${result.file_path}:${result.chunk_index}`;
+    merged.set(key, {
+      ...result,
+      vectorScore: result.score || 0,
+      textScore: 0,
+      finalScore: (result.score || 0) * 0.8
+    });
+  });
+  textResults.forEach((result) => {
+    const key = `${result.file_path}:${result.chunk_index || 0}`;
+    if (merged.has(key)) {
+      const existing = merged.get(key);
+      existing.textScore = result.score || 0;
+      existing.finalScore = (existing.vectorScore * 0.8) + (result.score * 0.2);
+    } else {
+      const textScore = result.score || 0;
+      const finalScore = Math.max(textScore * 0.2, textScore > 0.7 ? 0.4 : 0);
+      merged.set(key, {
+        ...result,
+        vectorScore: 0,
+        textScore,
+        finalScore
+      });
+    }
+  });
-  return results;
+  return Array.from(merged.values())
+    .sort((a, b) => b.finalScore - a.finalScore)
+    .slice(0, limit);
 }
 export function formatResults(results) {
@@ -27,15 +91,14 @@ export function formatResults(results) {
   for (let i = 0; i < results.length; i++) {
     const result = results[i];
-    const match = i + 1;
+    const scoreValue = result.finalScore !== undefined ? result.finalScore : (result.score || 0);
+    const scorePercent = (scoreValue * 100).toFixed(1);
-    lines.push(`${match}. ${result.file_path}:${result.line_start}-${result.line_end} (score: ${(result.score * 100).toFixed(1)}%)`);
+    lines.push(`${i + 1}. ${result.file_path}:${result.line_start}-${result.line_end} (score: ${scorePercent}%)`);
-    // Show code snippet (first 3 lines)
     const codeLines = result.content.split('\n').slice(0, 3);
     for (const line of codeLines) {
-      const trimmed = line.slice(0, 80); // Limit line length
-      lines.push(`   > ${trimmed}`);
+      lines.push(`   > ${line.slice(0, 80)}`);
     }
     lines.push('');

package/src/store.js CHANGED Viewed

@@ -5,6 +5,7 @@ import { mkdirSync, existsSync } from 'fs';
 let dbConnection = null;
 let tableRef = null;
 let isFirstBatch = true;
+let vectorSearchCache = new Map();
 export async function initStore(dbPath) {
   // Ensure directory exists
@@ -121,12 +122,19 @@ export async function searchSimilar(queryEmbedding, limit = 10) {
     // Ensure vector is a proper array/tensor
     const query = Array.isArray(queryEmbedding) ? queryEmbedding : Array.from(queryEmbedding);
+    // Check cache using 20-dimension hash for near-zero collision rate
+    const cacheKey = query.slice(0, 20).join(',');
+    const cached = vectorSearchCache.get(cacheKey);
+    if (cached) {
+      return cached.slice(0, limit);
+    }
     const results = await tableRef
       .search(query)
       .limit(limit)
       .execute();
-    return results.map(result => {
+    const formattedResults = results.map(result => {
       const distance = result._distance !== undefined ? result._distance : (result.distance || 0);
       const score = distance !== null && distance !== undefined ? 1 / (1 + distance) : 0;
       return {
@@ -139,6 +147,15 @@ export async function searchSimilar(queryEmbedding, limit = 10) {
         score: score
       };
     });
+    // Cache results (keep max 100 cached searches)
+    if (vectorSearchCache.size > 100) {
+      const firstKey = vectorSearchCache.keys().next().value;
+      vectorSearchCache.delete(firstKey);
+    }
+    vectorSearchCache.set(cacheKey, formattedResults);
+    return formattedResults;
   } catch (e) {
     console.error('Search failed:', e.message);
     return [];

package/src/text-search.js CHANGED Viewed

@@ -52,12 +52,14 @@ export function searchText(query, chunks, indexData) {
     const meta = chunkMetadata[idx];
     let score = 0;
-    queryTokens.forEach(token => {
-      if (index.has(token) && index.get(token).has(idx)) {
-        const freq = meta.frequency.get(token) || 1;
-        const lengthBoost = token.length > 4 ? 1.5 : 1;
-        score += lengthBoost * Math.min(freq, 5);
-      }
+    // Exact phrase match - highest priority (saves embedding cost)
+    if (chunk.content.toLowerCase().includes(query.toLowerCase())) {
+      score += 30;
+    }
+    // Symbol match in content - function/class named after query terms
+    querySymbols.forEach(symbol => {
+      if (meta.symbols.includes(symbol)) score += 10;
     });
     // Filename token match - strong signal that this file is about the query topic
@@ -66,32 +68,32 @@ export function searchText(query, chunks, indexData) {
       if (meta.fileNameTokens.includes(token)) fileNameMatches++;
     });
     if (fileNameMatches > 0) {
-      score += fileNameMatches * 8;
+      score += fileNameMatches * 10;
     }
-    // Symbol match in content - function/class named after query terms
-    querySymbols.forEach(symbol => {
-      if (meta.symbols.includes(symbol)) score += 5;
+    // Token frequency scoring
+    queryTokens.forEach(token => {
+      if (index.has(token) && index.get(token).has(idx)) {
+        const freq = meta.frequency.get(token) || 1;
+        const lengthBoost = token.length > 4 ? 1.5 : 1;
+        score += lengthBoost * Math.min(freq, 5);
+      }
     });
-    // Exact phrase match
-    if (chunk.content.toLowerCase().includes(query.toLowerCase())) {
-      score += 15;
-    }
     // Code file boost
     if (meta.isCode) score *= 1.2;
     if (score > 0) chunkScores.set(idx, score);
   }
-  const results = Array.from(chunkScores.entries())
-    .map(([idx, score]) => ({
-      ...chunks[idx],
-      score: Math.min(score / 100, 1),
-      _rawScore: score,
-    }))
-    .sort((a, b) => b._rawScore - a._rawScore);
+  const entries = Array.from(chunkScores.entries()).sort((a, b) => b[1] - a[1]);
+  const maxScore = entries.length > 0 ? entries[0][1] : 1;
+  const results = entries.map(([idx, score]) => ({
+    ...chunks[idx],
+    score: score / maxScore,
+    _rawScore: score,
+  }));
   return results;
 }