npm - codebasesearch - Versions diffs - 0.1.10 → 0.1.12 - Mend

codebasesearch 0.1.10 → 0.1.12

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "codebasesearch",
-  "version": "0.1.10",
+  "version": "0.1.12",
   "description": "Ultra-simple code search tool with Jina embeddings, LanceDB, and MCP protocol support",
   "type": "module",
   "bin": {

package/src/cli.js CHANGED Viewed

@@ -74,25 +74,26 @@ export async function run(args) {
     // Always reindex to ensure freshness
     console.log('Generating embeddings and indexing...');
-    // Generate embeddings in batches
+    // Generate embeddings in batches and upsert immediately to free memory
     const batchSize = 32;
-    const chunkTexts = chunks.map(c => c.content);
-    const allEmbeddings = [];
+    let processedCount = 0;
-    for (let i = 0; i < chunkTexts.length; i += batchSize) {
-      const batchTexts = chunkTexts.slice(i, i + batchSize);
+    for (let i = 0; i < chunks.length; i += batchSize) {
+      const batchChunks = chunks.slice(i, i + batchSize);
+      const batchTexts = batchChunks.map(c => c.content);
       const batchEmbeddings = await generateEmbeddings(batchTexts);
-      allEmbeddings.push(...batchEmbeddings);
-    }
-    // Create chunks with embeddings
-    const chunksWithEmbeddings = chunks.map((chunk, idx) => ({
-      ...chunk,
-      vector: allEmbeddings[idx]
-    }));
+      // Create batch with embeddings
+      const batchWithEmbeddings = batchChunks.map((chunk, idx) => ({
+        ...chunk,
+        vector: batchEmbeddings[idx]
+      }));
+      // Upsert immediately to free memory
+      await upsertChunks(batchWithEmbeddings);
+      processedCount += batchWithEmbeddings.length;
+    }
-    // Upsert to store
-    await upsertChunks(chunksWithEmbeddings);
     console.log('Index created\n');
     // Execute search

package/src/ignore-parser.js CHANGED Viewed

@@ -32,7 +32,97 @@ function getHardcodedIgnores() {
     'temp', 'tmp', '.tmp', '.DS_Store', 'Thumbs.db',
     '.swp', '.swo', '*.swp', '*.swo', '.tern-port',
     'dist-server', 'out-tsc', '.cache', '.parcel-cache',
-    'typings', '.env', '.env.local', '.env.*.local'
+    'typings', '.env', '.env.local', '.env.*.local',
+    // JSON files - PRIMARY PRIORITY for memory reduction
+    '*.json', 'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml',
+    'Gemfile.lock', 'poetry.lock', 'Pipfile.lock',
+    // Lock files
+    '*.lock',
+    // Build outputs
+    'public', 'static', 'site', '_site', '.docusaurus', '.gatsby',
+    // Cache/dependency directories
+    '.rush', '.lerna', '.nx',
+    // IDE/editor configs
+    '.cursor', '.replit', '.sublime-project', '.sublime-workspace',
+    '*.iml', '.project', '.classpath', '.settings', '*.sublime-*',
+    // OS files
+    '.Spotlight-V100', '.Trashes', 'ehthumbs.db', '.fseventsd',
+    '.TemporaryItems', '.AppleDouble', '.LSOverride', 'desktop.ini',
+    // Large data files
+    '*.db', '*.sqlite', '*.sqlite3', '*.bak', '*.dump',
+    '*.backup', '*.data', '*.orig',
+    // Logs and temp
+    '*.log', 'logs', 'npm-debug.log', 'yarn-error.log',
+    // Test coverage and reports
+    'lcov.info', '.coverage', 'test-results',
+    // Database related
+    'storage', 'fixtures',
+    // LLM/Vector related
+    '.llamaindex', '.chroma', '.vectorstore', '.embeddings',
+    '.langchain', '.autogen', '.semantic-kernel', '.openai-cache',
+    '.anthropic-cache', 'embeddings', 'vector-db', 'faiss-index',
+    'chromadb', 'pinecone-cache', 'weaviate-data',
+    // Compiled output
+    '*.min.js', '*.min.css', '*.bundle.js', '*.chunk.js', '*.map',
+    // Generated/build artifacts
+    '.assets', 'out-tsc', 'cmake_build_debug', 'cmake_build_release',
+    // Version managers
+    '.rbenv', '.nvm', '.nvmrc',
+    // Ruby specific
+    '*.gem', '*.rbc', '/pkg', '/spec/reports', '/spec/examples.txt',
+    '/test/tmp', '/test/version_tmp', 'lib/bundler/man', '.ruby-version',
+    // Go specific
+    'go.work',
+    // Rust specific
+    'Cargo.lock', '**/*.rs.bk', '*.pdb',
+    // Java specific
+    '*.class', '*.jar', '*.war', '*.ear', '*.nar', '*.nupkg', '*.snupkg',
+    // C# specific
+    '*.suo', '*.user', '*.userosscache', '*.sln.docstates',
+    'project.lock.json', 'project.fragment.lock.json', 'artifacts',
+    // C/C++ specific
+    '*.o', '*.a', '*.so', '*.exe', '*.obj', '*.dll', '*.dylib',
+    'CMakeFiles', 'CMakeCache.txt', '*.cmake',
+    // Swift/Xcode specific
+    '*.xcodeproj', '*.xcworkspace', '*.moved-aside', '*.pbxuser',
+    '*.mode1v3', '*.mode2v3', '*.perspectivev3',
+    // Scala/SBT specific
+    'lib_managed', 'src_managed', 'project/boot', 'project/plugins/project',
+    '.history', '.lib',
+    // PHP specific
+    'composer.lock', '*.phar',
+    // Docker
+    '.dockerignore', 'docker-compose.override.yml', '.docker',
+    // Documentation build
+    'docs/_build', '.vuepress',
+    // Testing frameworks
+    'jest.config', 'vitest.config', 'pytest.ini', 'tox.ini',
+    '__tests__', '__mocks__', 'spec', 'cypress', 'playwright',
+    // Monorepo workspace patterns (implicit through directory coverage)
+    '.turbo', '.nx',
+    // Python package patterns
+    '*.py[cod]', '*$py.class', '.Python', 'pip-log.txt',
+    'pip-delete-this-directory.txt', '.hypothesis', '.pyre', '.pytype',
+    '*.whl',
+    // Config/metadata that are typically low-value
+    '*.config.js', '*.config.ts', 'webpack.config.js', 'rollup.config.js',
+    'vite.config.js', 'tsconfig.json', 'jsconfig.json', 'babel.config',
+    '.babelrc', '.eslintrc', '.prettierrc', '.stylelintrc', '.editorconfig',
+    '*.local', '*.development', '*.production',
+    // Node specific
+    '.npm', '.node_repl_history', '*.tsbuildinfo', 'yarn-error.log',
+    // Documentation/reference files that don't help with search
+    '*.md', '*.txt', '*.rst', '*.adoc', 'docs', 'documentation', 'wiki',
+    'CHANGELOG', 'HISTORY', 'NEWS', 'UPGRADING', 'FAQ', 'CONTRIBUTING',
+    'SECURITY', 'LICENSE', 'LICENCE', 'COPYRIGHT', 'NOTICE', 'AUTHORS',
+    'THIRDPARTY',
+    // Test and coverage files
+    '*.test', '*.spec', 'test', 'tests', 'htmlcov',
+    // Profiling
+    '*.prof', '*.cpuprofile', '*.heapprofile',
+    // Misc
+    '.tern-port', 'firebase-debug.log', 'firestore-debug.log',
+    'ui-debug.log', '.firebaserc', '.stackdump'
   ]);
 }
@@ -110,15 +200,30 @@ export function loadIgnorePatterns(rootPath) {
 export function shouldIgnore(filePath, ignorePatterns) {
   const normalizedPath = filePath.replace(/\\/g, '/');
   const pathParts = normalizedPath.split('/');
+  const fileName = pathParts[pathParts.length - 1];
   for (const pattern of ignorePatterns) {
+    // Handle path patterns (contain /)
     if (pattern.includes('/')) {
       if (normalizedPath.includes(pattern)) {
         return true;
       }
-    } else {
+    }
+    // Handle extension patterns (*.ext)
+    else if (pattern.startsWith('*.')) {
+      const ext = pattern.slice(1);
+      if (fileName.endsWith(ext)) {
+        return true;
+      }
+    }
+    // Handle exact file name patterns
+    else if (fileName === pattern) {
+      return true;
+    }
+    // Handle directory name patterns (match any path part)
+    else {
       for (const part of pathParts) {
-        if (part === pattern) {
+        if (part === pattern || part.startsWith(pattern + '/')) {
           return true;
         }
       }

package/src/scanner.js CHANGED Viewed

@@ -23,7 +23,6 @@ const SUPPORTED_EXTENSIONS = new Set([
   '.groovy',
   '.gradle',
   '.xml', '.xsd',
-  '.json', '.jsonc',
   '.yaml', '.yml',
   '.toml',
   '.html', '.htm',
@@ -95,7 +94,7 @@ function walkDirectory(dirPath, ignorePatterns, relativePath = '') {
   return files;
 }
-function chunkContent(content, chunkSize = 1000, overlapSize = 200) {
+function chunkContent(content, chunkSize = 1000, overlapSize = 100) {
   const lines = content.split('\n');
   const chunks = [];

package/src/search-worker.js CHANGED Viewed

@@ -20,16 +20,16 @@ async function initializeIndex(repositoryPath) {
     const chunks = scanRepository(absolutePath, ignorePatterns);
     if (chunks.length === 0) {
-      return { error: 'No code chunks found', chunks: [], index: null };
+      return { error: 'No code chunks found', chunks: [], indexData: null };
     }
-    const index = buildTextIndex(chunks);
-    const indexData = { chunks, index };
-    indexCache.set(cacheKey, indexData);
+    const indexData = buildTextIndex(chunks);
+    const result = { chunks, indexData };
+    indexCache.set(cacheKey, result);
-    return indexData;
+    return result;
   } catch (error) {
-    return { error: error.message, chunks: [], index: null };
+    return { error: error.message, chunks: [], indexData: null };
   }
 }
@@ -47,7 +47,7 @@ async function performSearch(repositoryPath, query) {
       return { error: indexData.error, results: [] };
     }
-    const results = searchText(query, indexData.chunks, indexData.index);
+    const results = searchText(query, indexData.chunks, indexData.indexData);
     return {
       query,

package/src/store.js CHANGED Viewed

@@ -4,6 +4,7 @@ import { mkdirSync, existsSync } from 'fs';
 let dbConnection = null;
 let tableRef = null;
+let isFirstBatch = true;
 export async function initStore(dbPath) {
   // Ensure directory exists
@@ -19,6 +20,7 @@ export async function initStore(dbPath) {
       uri: dbDir,
       mode: 'overwrite'
     });
+    isFirstBatch = true;
     console.error('Vector store initialized');
     return true;
   } catch (e) {
@@ -68,14 +70,24 @@ export async function upsertChunks(chunks) {
   try {
     let table = null;
-    // Try to open existing table
-    try {
-      table = await dbConnection.openTable(tableName);
-      // Overwrite existing table with new data
-      await table.overwrite(data);
-    } catch (e) {
-      // Table doesn't exist, create new one
-      table = await dbConnection.createTable(tableName, data);
+    if (isFirstBatch) {
+      // First batch: try to open existing table, or create new one
+      try {
+        table = await dbConnection.openTable(tableName);
+        await table.overwrite(data);
+      } catch (e) {
+        table = await dbConnection.createTable(tableName, data);
+      }
+      isFirstBatch = false;
+    } else {
+      // Subsequent batches: add to existing table
+      try {
+        table = await dbConnection.openTable(tableName);
+        await table.add(data);
+      } catch (e) {
+        console.error('Failed to add to table:', e.message);
+        throw e;
+      }
     }
     tableRef = table;

package/src/text-search.js CHANGED Viewed

@@ -1,48 +1,129 @@
 export function buildTextIndex(chunks) {
   const index = new Map();
+  const chunkMetadata = [];
   chunks.forEach((chunk, idx) => {
     const tokens = tokenize(chunk.content);
+    const symbols = extractSymbols(chunk.content);
+    const frequency = new Map();
     tokens.forEach(token => {
+      frequency.set(token, (frequency.get(token) || 0) + 1);
       if (!index.has(token)) {
         index.set(token, []);
       }
       index.get(token).push(idx);
     });
+    chunkMetadata[idx] = {
+      tokens,
+      symbols,
+      frequency,
+      isCode: isCodeFile(chunk.file_path),
+    };
   });
-  return index;
+  return { index, chunkMetadata };
 }
-export function searchText(query, chunks, index) {
+export function searchText(query, chunks, indexData) {
+  const { index, chunkMetadata } = indexData;
   const queryTokens = tokenize(query);
+  const querySymbols = extractSymbols(query);
   const chunkScores = new Map();
-  queryTokens.forEach(token => {
-    if (index.has(token)) {
-      index.get(token).forEach(chunkIdx => {
-        if (!chunkScores.has(chunkIdx)) {
-          chunkScores.set(chunkIdx, 0);
+  chunks.forEach((chunk, idx) => {
+    let score = 0;
+    queryTokens.forEach(token => {
+      if (index.has(token)) {
+        if (index.get(token).includes(idx)) {
+          const freq = chunkMetadata[idx].frequency.get(token) || 1;
+          const boost = token.length > 4 ? 1.5 : 1;
+          score += boost * freq;
         }
-        chunkScores.set(chunkIdx, chunkScores.get(chunkIdx) + 1);
-      });
+      }
+    });
+    querySymbols.forEach(symbol => {
+      if (chunkMetadata[idx].symbols.includes(symbol)) {
+        score += 5;
+      }
+    });
+    const exactMatch = chunk.content.includes(query);
+    if (exactMatch) {
+      score += 10;
+    }
+    if (chunkMetadata[idx].isCode) {
+      score *= 1.2;
+    }
+    if (score > 0) {
+      chunkScores.set(idx, score);
     }
   });
   const results = Array.from(chunkScores.entries())
     .map(([idx, score]) => ({
       ...chunks[idx],
-      score: score / queryTokens.length,
-      matchCount: score,
+      score: Math.min(score / 100, 1),
+      _rawScore: score,
     }))
     .filter(r => r.score > 0)
-    .sort((a, b) => b.score - a.score);
+    .sort((a, b) => b._rawScore - a._rawScore);
   return results;
 }
 function tokenize(text) {
-  return text
-    .toLowerCase()
-    .match(/\b\w+\b/g) || [];
+  const tokens = new Set();
+  text.toLowerCase().split(/\s+/).forEach(word => {
+    if (word.length === 0) return;
+    tokens.add(word.replace(/[^\w]/g, ''));
+    const camelCaseTokens = word.match(/[a-z]+|[A-Z][a-z]*|[0-9]+/g) || [];
+    camelCaseTokens.forEach(t => {
+      if (t.length > 1) tokens.add(t.toLowerCase());
+    });
+    const snakeCaseTokens = word.split(/[-_]/).filter(t => t.length > 0);
+    snakeCaseTokens.forEach(t => {
+      if (t.length > 1) tokens.add(t.toLowerCase());
+    });
+  });
+  return Array.from(tokens).filter(t => t.length > 1);
+}
+function extractSymbols(text) {
+  const symbols = new Set();
+  const functionMatches = text.match(/(?:async\s+)?function\s+(\w+)|const\s+(\w+)\s*=\s*(?:async\s*)?\(/g) || [];
+  functionMatches.forEach(match => {
+    const name = match.match(/\w+(?=\s*[=\(])/)?.[0];
+    if (name) symbols.add(name.toLowerCase());
+  });
+  const classMatches = text.match(/class\s+(\w+)/g) || [];
+  classMatches.forEach(match => {
+    const name = match.match(/\w+$/)?.[0];
+    if (name) symbols.add(name.toLowerCase());
+  });
+  const exportMatches = text.match(/export\s+(?:async\s+)?(?:function|class)\s+(\w+)/g) || [];
+  exportMatches.forEach(match => {
+    const name = match.match(/\w+$/)?.[0];
+    if (name) symbols.add(name.toLowerCase());
+  });
+  return Array.from(symbols);
+}
+function isCodeFile(filePath) {
+  const codeExtensions = ['.js', '.ts', '.jsx', '.tsx', '.py', '.java', '.go', '.rs', '.rb'];
+  return codeExtensions.some(ext => filePath.endsWith(ext));
 }