codebasesearch 0.1.10 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "codebasesearch",
3
- "version": "0.1.10",
3
+ "version": "0.1.12",
4
4
  "description": "Ultra-simple code search tool with Jina embeddings, LanceDB, and MCP protocol support",
5
5
  "type": "module",
6
6
  "bin": {
package/src/cli.js CHANGED
@@ -74,25 +74,26 @@ export async function run(args) {
74
74
  // Always reindex to ensure freshness
75
75
  console.log('Generating embeddings and indexing...');
76
76
 
77
- // Generate embeddings in batches
77
+ // Generate embeddings in batches and upsert immediately to free memory
78
78
  const batchSize = 32;
79
- const chunkTexts = chunks.map(c => c.content);
80
- const allEmbeddings = [];
79
+ let processedCount = 0;
81
80
 
82
- for (let i = 0; i < chunkTexts.length; i += batchSize) {
83
- const batchTexts = chunkTexts.slice(i, i + batchSize);
81
+ for (let i = 0; i < chunks.length; i += batchSize) {
82
+ const batchChunks = chunks.slice(i, i + batchSize);
83
+ const batchTexts = batchChunks.map(c => c.content);
84
84
  const batchEmbeddings = await generateEmbeddings(batchTexts);
85
- allEmbeddings.push(...batchEmbeddings);
86
- }
87
85
 
88
- // Create chunks with embeddings
89
- const chunksWithEmbeddings = chunks.map((chunk, idx) => ({
90
- ...chunk,
91
- vector: allEmbeddings[idx]
92
- }));
86
+ // Create batch with embeddings
87
+ const batchWithEmbeddings = batchChunks.map((chunk, idx) => ({
88
+ ...chunk,
89
+ vector: batchEmbeddings[idx]
90
+ }));
91
+
92
+ // Upsert immediately to free memory
93
+ await upsertChunks(batchWithEmbeddings);
94
+ processedCount += batchWithEmbeddings.length;
95
+ }
93
96
 
94
- // Upsert to store
95
- await upsertChunks(chunksWithEmbeddings);
96
97
  console.log('Index created\n');
97
98
 
98
99
  // Execute search
@@ -32,7 +32,97 @@ function getHardcodedIgnores() {
32
32
  'temp', 'tmp', '.tmp', '.DS_Store', 'Thumbs.db',
33
33
  '.swp', '.swo', '*.swp', '*.swo', '.tern-port',
34
34
  'dist-server', 'out-tsc', '.cache', '.parcel-cache',
35
- 'typings', '.env', '.env.local', '.env.*.local'
35
+ 'typings', '.env', '.env.local', '.env.*.local',
36
+ // JSON files - PRIMARY PRIORITY for memory reduction
37
+ '*.json', 'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml',
38
+ 'Gemfile.lock', 'poetry.lock', 'Pipfile.lock',
39
+ // Lock files
40
+ '*.lock',
41
+ // Build outputs
42
+ 'public', 'static', 'site', '_site', '.docusaurus', '.gatsby',
43
+ // Cache/dependency directories
44
+ '.rush', '.lerna', '.nx',
45
+ // IDE/editor configs
46
+ '.cursor', '.replit', '.sublime-project', '.sublime-workspace',
47
+ '*.iml', '.project', '.classpath', '.settings', '*.sublime-*',
48
+ // OS files
49
+ '.Spotlight-V100', '.Trashes', 'ehthumbs.db', '.fseventsd',
50
+ '.TemporaryItems', '.AppleDouble', '.LSOverride', 'desktop.ini',
51
+ // Large data files
52
+ '*.db', '*.sqlite', '*.sqlite3', '*.bak', '*.dump',
53
+ '*.backup', '*.data', '*.orig',
54
+ // Logs and temp
55
+ '*.log', 'logs', 'npm-debug.log', 'yarn-error.log',
56
+ // Test coverage and reports
57
+ 'lcov.info', '.coverage', 'test-results',
58
+ // Database related
59
+ 'storage', 'fixtures',
60
+ // LLM/Vector related
61
+ '.llamaindex', '.chroma', '.vectorstore', '.embeddings',
62
+ '.langchain', '.autogen', '.semantic-kernel', '.openai-cache',
63
+ '.anthropic-cache', 'embeddings', 'vector-db', 'faiss-index',
64
+ 'chromadb', 'pinecone-cache', 'weaviate-data',
65
+ // Compiled output
66
+ '*.min.js', '*.min.css', '*.bundle.js', '*.chunk.js', '*.map',
67
+ // Generated/build artifacts
68
+ '.assets', 'out-tsc', 'cmake_build_debug', 'cmake_build_release',
69
+ // Version managers
70
+ '.rbenv', '.nvm', '.nvmrc',
71
+ // Ruby specific
72
+ '*.gem', '*.rbc', '/pkg', '/spec/reports', '/spec/examples.txt',
73
+ '/test/tmp', '/test/version_tmp', 'lib/bundler/man', '.ruby-version',
74
+ // Go specific
75
+ 'go.work',
76
+ // Rust specific
77
+ 'Cargo.lock', '**/*.rs.bk', '*.pdb',
78
+ // Java specific
79
+ '*.class', '*.jar', '*.war', '*.ear', '*.nar', '*.nupkg', '*.snupkg',
80
+ // C# specific
81
+ '*.suo', '*.user', '*.userosscache', '*.sln.docstates',
82
+ 'project.lock.json', 'project.fragment.lock.json', 'artifacts',
83
+ // C/C++ specific
84
+ '*.o', '*.a', '*.so', '*.exe', '*.obj', '*.dll', '*.dylib',
85
+ 'CMakeFiles', 'CMakeCache.txt', '*.cmake',
86
+ // Swift/Xcode specific
87
+ '*.xcodeproj', '*.xcworkspace', '*.moved-aside', '*.pbxuser',
88
+ '*.mode1v3', '*.mode2v3', '*.perspectivev3',
89
+ // Scala/SBT specific
90
+ 'lib_managed', 'src_managed', 'project/boot', 'project/plugins/project',
91
+ '.history', '.lib',
92
+ // PHP specific
93
+ 'composer.lock', '*.phar',
94
+ // Docker
95
+ '.dockerignore', 'docker-compose.override.yml', '.docker',
96
+ // Documentation build
97
+ 'docs/_build', '.vuepress',
98
+ // Testing frameworks
99
+ 'jest.config', 'vitest.config', 'pytest.ini', 'tox.ini',
100
+ '__tests__', '__mocks__', 'spec', 'cypress', 'playwright',
101
+ // Monorepo workspace patterns (implicit through directory coverage)
102
+ '.turbo', '.nx',
103
+ // Python package patterns
104
+ '*.py[cod]', '*$py.class', '.Python', 'pip-log.txt',
105
+ 'pip-delete-this-directory.txt', '.hypothesis', '.pyre', '.pytype',
106
+ '*.whl',
107
+ // Config/metadata that are typically low-value
108
+ '*.config.js', '*.config.ts', 'webpack.config.js', 'rollup.config.js',
109
+ 'vite.config.js', 'tsconfig.json', 'jsconfig.json', 'babel.config',
110
+ '.babelrc', '.eslintrc', '.prettierrc', '.stylelintrc', '.editorconfig',
111
+ '*.local', '*.development', '*.production',
112
+ // Node specific
113
+ '.npm', '.node_repl_history', '*.tsbuildinfo', 'yarn-error.log',
114
+ // Documentation/reference files that don't help with search
115
+ '*.md', '*.txt', '*.rst', '*.adoc', 'docs', 'documentation', 'wiki',
116
+ 'CHANGELOG', 'HISTORY', 'NEWS', 'UPGRADING', 'FAQ', 'CONTRIBUTING',
117
+ 'SECURITY', 'LICENSE', 'LICENCE', 'COPYRIGHT', 'NOTICE', 'AUTHORS',
118
+ 'THIRDPARTY',
119
+ // Test and coverage files
120
+ '*.test', '*.spec', 'test', 'tests', 'htmlcov',
121
+ // Profiling
122
+ '*.prof', '*.cpuprofile', '*.heapprofile',
123
+ // Misc
124
+ '.tern-port', 'firebase-debug.log', 'firestore-debug.log',
125
+ 'ui-debug.log', '.firebaserc', '.stackdump'
36
126
  ]);
37
127
  }
38
128
 
@@ -110,15 +200,30 @@ export function loadIgnorePatterns(rootPath) {
110
200
  export function shouldIgnore(filePath, ignorePatterns) {
111
201
  const normalizedPath = filePath.replace(/\\/g, '/');
112
202
  const pathParts = normalizedPath.split('/');
203
+ const fileName = pathParts[pathParts.length - 1];
113
204
 
114
205
  for (const pattern of ignorePatterns) {
206
+ // Handle path patterns (contain /)
115
207
  if (pattern.includes('/')) {
116
208
  if (normalizedPath.includes(pattern)) {
117
209
  return true;
118
210
  }
119
- } else {
211
+ }
212
+ // Handle extension patterns (*.ext)
213
+ else if (pattern.startsWith('*.')) {
214
+ const ext = pattern.slice(1);
215
+ if (fileName.endsWith(ext)) {
216
+ return true;
217
+ }
218
+ }
219
+ // Handle exact file name patterns
220
+ else if (fileName === pattern) {
221
+ return true;
222
+ }
223
+ // Handle directory name patterns (match any path part)
224
+ else {
120
225
  for (const part of pathParts) {
121
- if (part === pattern) {
226
+ if (part === pattern || part.startsWith(pattern + '/')) {
122
227
  return true;
123
228
  }
124
229
  }
package/src/scanner.js CHANGED
@@ -23,7 +23,6 @@ const SUPPORTED_EXTENSIONS = new Set([
23
23
  '.groovy',
24
24
  '.gradle',
25
25
  '.xml', '.xsd',
26
- '.json', '.jsonc',
27
26
  '.yaml', '.yml',
28
27
  '.toml',
29
28
  '.html', '.htm',
@@ -95,7 +94,7 @@ function walkDirectory(dirPath, ignorePatterns, relativePath = '') {
95
94
  return files;
96
95
  }
97
96
 
98
- function chunkContent(content, chunkSize = 1000, overlapSize = 200) {
97
+ function chunkContent(content, chunkSize = 1000, overlapSize = 100) {
99
98
  const lines = content.split('\n');
100
99
  const chunks = [];
101
100
 
@@ -20,16 +20,16 @@ async function initializeIndex(repositoryPath) {
20
20
  const chunks = scanRepository(absolutePath, ignorePatterns);
21
21
 
22
22
  if (chunks.length === 0) {
23
- return { error: 'No code chunks found', chunks: [], index: null };
23
+ return { error: 'No code chunks found', chunks: [], indexData: null };
24
24
  }
25
25
 
26
- const index = buildTextIndex(chunks);
27
- const indexData = { chunks, index };
28
- indexCache.set(cacheKey, indexData);
26
+ const indexData = buildTextIndex(chunks);
27
+ const result = { chunks, indexData };
28
+ indexCache.set(cacheKey, result);
29
29
 
30
- return indexData;
30
+ return result;
31
31
  } catch (error) {
32
- return { error: error.message, chunks: [], index: null };
32
+ return { error: error.message, chunks: [], indexData: null };
33
33
  }
34
34
  }
35
35
 
@@ -47,7 +47,7 @@ async function performSearch(repositoryPath, query) {
47
47
  return { error: indexData.error, results: [] };
48
48
  }
49
49
 
50
- const results = searchText(query, indexData.chunks, indexData.index);
50
+ const results = searchText(query, indexData.chunks, indexData.indexData);
51
51
 
52
52
  return {
53
53
  query,
package/src/store.js CHANGED
@@ -4,6 +4,7 @@ import { mkdirSync, existsSync } from 'fs';
4
4
 
5
5
  let dbConnection = null;
6
6
  let tableRef = null;
7
+ let isFirstBatch = true;
7
8
 
8
9
  export async function initStore(dbPath) {
9
10
  // Ensure directory exists
@@ -19,6 +20,7 @@ export async function initStore(dbPath) {
19
20
  uri: dbDir,
20
21
  mode: 'overwrite'
21
22
  });
23
+ isFirstBatch = true;
22
24
  console.error('Vector store initialized');
23
25
  return true;
24
26
  } catch (e) {
@@ -68,14 +70,24 @@ export async function upsertChunks(chunks) {
68
70
  try {
69
71
  let table = null;
70
72
 
71
- // Try to open existing table
72
- try {
73
- table = await dbConnection.openTable(tableName);
74
- // Overwrite existing table with new data
75
- await table.overwrite(data);
76
- } catch (e) {
77
- // Table doesn't exist, create new one
78
- table = await dbConnection.createTable(tableName, data);
73
+ if (isFirstBatch) {
74
+ // First batch: try to open existing table, or create new one
75
+ try {
76
+ table = await dbConnection.openTable(tableName);
77
+ await table.overwrite(data);
78
+ } catch (e) {
79
+ table = await dbConnection.createTable(tableName, data);
80
+ }
81
+ isFirstBatch = false;
82
+ } else {
83
+ // Subsequent batches: add to existing table
84
+ try {
85
+ table = await dbConnection.openTable(tableName);
86
+ await table.add(data);
87
+ } catch (e) {
88
+ console.error('Failed to add to table:', e.message);
89
+ throw e;
90
+ }
79
91
  }
80
92
 
81
93
  tableRef = table;
@@ -1,48 +1,129 @@
1
1
  export function buildTextIndex(chunks) {
2
2
  const index = new Map();
3
+ const chunkMetadata = [];
3
4
 
4
5
  chunks.forEach((chunk, idx) => {
5
6
  const tokens = tokenize(chunk.content);
7
+ const symbols = extractSymbols(chunk.content);
8
+ const frequency = new Map();
9
+
6
10
  tokens.forEach(token => {
11
+ frequency.set(token, (frequency.get(token) || 0) + 1);
7
12
  if (!index.has(token)) {
8
13
  index.set(token, []);
9
14
  }
10
15
  index.get(token).push(idx);
11
16
  });
17
+
18
+ chunkMetadata[idx] = {
19
+ tokens,
20
+ symbols,
21
+ frequency,
22
+ isCode: isCodeFile(chunk.file_path),
23
+ };
12
24
  });
13
25
 
14
- return index;
26
+ return { index, chunkMetadata };
15
27
  }
16
28
 
17
/**
 * Score every chunk against a query and return the matches, best first.
 *
 * Scoring per chunk:
 *   - each query token present in the chunk adds `frequency` (x1.5 when the
 *     token is longer than 4 characters),
 *   - each query symbol that the chunk defines adds 5,
 *   - a verbatim occurrence of the query in the chunk content adds 10,
 *   - the total is multiplied by 1.2 for chunks flagged as code.
 *
 * Token membership is read from the chunk's own frequency map, which is an
 * O(1) lookup, instead of scanning the token's postings list for every chunk.
 *
 * @param {string} query - Free-text query.
 * @param {Array<{content: string}>} chunks - The chunks the index was built from, in the same order.
 * @param {{index: Map<string, number[]>, chunkMetadata: Array<{symbols: string[], frequency: Map<string, number>, isCode: boolean}>}} indexData
 *   Value returned by `buildTextIndex` for `chunks`.
 * @returns {Array<object>} Matching chunks, each extended with `score`
 *   (raw score / 100, capped at 1) and `_rawScore`, sorted by `_rawScore`
 *   in descending order. Empty for a blank or non-string query.
 */
export function searchText(query, chunks, indexData) {
  // A blank query is a substring of every chunk; without this guard every
  // chunk would receive the exact-match bonus and be returned.
  if (typeof query !== 'string' || query.trim().length === 0) {
    return [];
  }

  const { chunkMetadata } = indexData;
  const queryTokens = tokenize(query);
  const querySymbols = extractSymbols(query);
  const results = [];

  chunks.forEach((chunk, idx) => {
    const metadata = chunkMetadata[idx];
    let score = 0;

    for (const token of queryTokens) {
      const freq = metadata.frequency.get(token);
      if (freq !== undefined) {
        const boost = token.length > 4 ? 1.5 : 1;
        score += boost * freq;
      }
    }

    for (const symbol of querySymbols) {
      if (metadata.symbols.includes(symbol)) {
        score += 5;
      }
    }

    if (chunk.content.includes(query)) {
      score += 10;
    }

    if (metadata.isCode) {
      score *= 1.2;
    }

    if (score > 0) {
      results.push({
        ...chunk,
        score: Math.min(score / 100, 1),
        _rawScore: score,
      });
    }
  });

  // Rank on the raw score: the normalised score saturates at 1.
  results.sort((a, b) => b._rawScore - a._rawScore);
  return results;
}
43
79
 
44
80
  function tokenize(text) {
45
- return text
46
- .toLowerCase()
47
- .match(/\b\w+\b/g) || [];
81
+ const tokens = new Set();
82
+
83
+ text.toLowerCase().split(/\s+/).forEach(word => {
84
+ if (word.length === 0) return;
85
+
86
+ tokens.add(word.replace(/[^\w]/g, ''));
87
+
88
+ const camelCaseTokens = word.match(/[a-z]+|[A-Z][a-z]*|[0-9]+/g) || [];
89
+ camelCaseTokens.forEach(t => {
90
+ if (t.length > 1) tokens.add(t.toLowerCase());
91
+ });
92
+
93
+ const snakeCaseTokens = word.split(/[-_]/).filter(t => t.length > 0);
94
+ snakeCaseTokens.forEach(t => {
95
+ if (t.length > 1) tokens.add(t.toLowerCase());
96
+ });
97
+ });
98
+
99
+ return Array.from(tokens).filter(t => t.length > 1);
100
+ }
101
+
102
/**
 * Collect the lowercase names of functions and classes declared in a text.
 *
 * Recognised forms:
 *   - `function name` and `async function name` declarations,
 *   - `const name = (` and `const name = async (` function expressions,
 *   - `class Name` declarations.
 * Exported declarations (`export function …`, `export class …`) are covered by
 * the same patterns, so no separate export pass is needed.
 *
 * Names are read straight from the capture groups. Re-scanning the matched
 * text for "a word followed by `=` or `(`" cannot work for function
 * declarations, because the match ends at the name and never contains the `(`.
 *
 * @param {string} text - Source text or a query. `null` and `undefined` are
 *   treated as empty text.
 * @returns {string[]} Unique lowercase symbol names.
 */
function extractSymbols(text) {
  const symbols = new Set();
  const source = String(text ?? '');

  // `\b` keeps the keywords from matching inside longer identifiers.
  const callablePattern =
    /\b(?:async\s+)?function\s+(\w+)|\bconst\s+(\w+)\s*=\s*(?:async\s*)?\(/g;
  for (const match of source.matchAll(callablePattern)) {
    const name = match[1] ?? match[2];
    if (name) {
      symbols.add(name.toLowerCase());
    }
  }

  for (const match of source.matchAll(/\bclass\s+(\w+)/g)) {
    symbols.add(match[1].toLowerCase());
  }

  return Array.from(symbols);
}
125
+
126
/**
 * Tell whether a path ends in one of the known source-code extensions.
 *
 * The comparison ignores case, so `Main.JAVA` counts as code. A missing or
 * non-string path is reported as "not code" instead of throwing.
 *
 * @param {string} filePath - Path of the file a chunk came from.
 * @returns {boolean} `true` when the path ends with a known code extension.
 */
function isCodeFile(filePath) {
  if (typeof filePath !== 'string') {
    return false;
  }

  const codeExtensions = ['.js', '.ts', '.jsx', '.tsx', '.py', '.java', '.go', '.rs', '.rb'];
  const lowerPath = filePath.toLowerCase();
  return codeExtensions.some((ext) => lowerPath.endsWith(ext));
}