codebasesearch 0.1.21 → 0.1.23

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/.prd ADDED
@@ -0,0 +1,78 @@
1
+ {
2
+ "project": "code-search",
3
+ "created": "2026-03-12",
4
+ "objective": "Profile and improve code-search speed and result quality",
5
+ "items": [
6
+ {
7
+ "id": "fix-dedup-buildtextindex",
8
+ "subject": "Remove duplicate buildTextIndex from search.js",
9
+ "status": "pending",
10
+ "description": "search.js has a private copy of buildTextIndex and tokenize/extractSymbols/isCodeFile that duplicates text-search.js. Import the exported buildTextIndex from text-search.js instead.",
11
+ "category": "refactor",
12
+ "effort": "small",
13
+ "blocking": ["fix-score-normalization", "fix-hybrid-weights"],
14
+ "blockedBy": []
15
+ },
16
+ {
17
+ "id": "fix-chunk-size",
18
+ "subject": "Reduce chunk size from 300 to 60 lines for better semantic granularity",
19
+ "status": "pending",
20
+ "description": "scanner.js uses 300-line chunks. Embeddings work best on 50-100 line chunks. Reduce to 60-line chunks with 15-line overlap for better vector search quality.",
21
+ "category": "feature",
22
+ "effort": "small",
23
+ "blocking": [],
24
+ "blockedBy": []
25
+ },
26
+ {
27
+ "id": "fix-score-normalization",
28
+ "subject": "Fix text search score normalization so top result is always 1.0",
29
+ "status": "pending",
30
+ "description": "Text scores divide raw by 100 but scores can exceed 100. Use dynamic max-score scaling. Lower hasGoodTextResults threshold from 0.5 to 0.3.",
31
+ "category": "bug",
32
+ "effort": "small",
33
+ "blocking": [],
34
+ "blockedBy": ["fix-dedup-buildtextindex"]
35
+ },
36
+ {
37
+ "id": "fix-hybrid-weights",
38
+ "subject": "Boost text-only exact-match results in hybrid merge",
39
+ "status": "pending",
40
+ "description": "Text-only results are capped at 20% weight. Give high-scoring text-only results a floor finalScore of 0.4.",
41
+ "category": "feature",
42
+ "effort": "small",
43
+ "blocking": [],
44
+ "blockedBy": ["fix-dedup-buildtextindex"]
45
+ },
46
+ {
47
+ "id": "fix-vector-cache-key",
48
+ "subject": "Strengthen vector search cache key to 20 dimensions",
49
+ "status": "pending",
50
+ "description": "Cache key uses only first 5 embedding dims. Use 20 dims for near-zero collision rate.",
51
+ "category": "bug",
52
+ "effort": "small",
53
+ "blocking": [],
54
+ "blockedBy": []
55
+ },
56
+ {
57
+ "id": "remove-dead-meanpooling",
58
+ "subject": "Remove dead meanPooling function from embeddings.js",
59
+ "status": "pending",
60
+ "description": "meanPooling is defined but never called. Remove dead code.",
61
+ "category": "refactor",
62
+ "effort": "small",
63
+ "blocking": [],
64
+ "blockedBy": []
65
+ },
66
+ {
67
+ "id": "verify-and-commit",
68
+ "subject": "Verify improvements and commit all changes",
69
+ "status": "pending",
70
+ "description": "Run end-to-end search logic test inline. Commit and push all changes.",
71
+ "category": "infra",
72
+ "effort": "small",
73
+ "blocking": [],
74
+ "blockedBy": ["fix-dedup-buildtextindex", "fix-chunk-size", "fix-score-normalization", "fix-hybrid-weights", "fix-vector-cache-key", "remove-dead-meanpooling"]
75
+ }
76
+ ],
77
+ "completed": []
78
+ }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "codebasesearch",
3
- "version": "0.1.21",
3
+ "version": "0.1.23",
4
4
  "description": "Ultra-simple code search tool with Jina embeddings, LanceDB, and MCP protocol support",
5
5
  "type": "module",
6
6
  "bin": {
package/src/cli.js CHANGED
@@ -75,29 +75,39 @@ export async function run(args) {
75
75
  console.log('Generating embeddings and indexing...');
76
76
 
77
77
  // Generate embeddings in batches and upsert immediately to free memory
78
- const batchSize = 32;
79
- let processedCount = 0;
80
-
81
- for (let i = 0; i < chunks.length; i += batchSize) {
82
- const batchChunks = chunks.slice(i, i + batchSize);
83
- const batchTexts = batchChunks.map(c => c.content);
84
- const batchEmbeddings = await generateEmbeddings(batchTexts);
85
-
86
- // Create batch with embeddings
87
- const batchWithEmbeddings = batchChunks.map((chunk, idx) => ({
88
- ...chunk,
89
- vector: batchEmbeddings[idx]
90
- }));
78
+ // Optimize batch size based on chunk count (larger batches are more efficient)
79
+ let batchSize = 32;
80
+ if (chunks.length > 500) batchSize = 64;
81
+ if (chunks.length > 1000) batchSize = 96;
91
82
 
92
- // Upsert immediately to free memory
93
- await upsertChunks(batchWithEmbeddings);
94
- processedCount += batchWithEmbeddings.length;
83
+ let processedCount = 0;
84
+ let embeddingsAvailable = true;
85
+
86
+ try {
87
+ for (let i = 0; i < chunks.length; i += batchSize) {
88
+ const batchChunks = chunks.slice(i, i + batchSize);
89
+ const batchTexts = batchChunks.map(c => c.content);
90
+ const batchEmbeddings = await generateEmbeddings(batchTexts);
91
+
92
+ // Create batch with embeddings
93
+ const batchWithEmbeddings = batchChunks.map((chunk, idx) => ({
94
+ ...chunk,
95
+ vector: batchEmbeddings[idx]
96
+ }));
97
+
98
+ // Upsert immediately to free memory
99
+ await upsertChunks(batchWithEmbeddings);
100
+ processedCount += batchWithEmbeddings.length;
101
+ }
102
+ } catch (embeddingError) {
103
+ console.warn(`Warning: Embedding generation failed (${embeddingError.message}). Using text-only search.\n`);
104
+ embeddingsAvailable = false;
95
105
  }
96
106
 
97
107
  console.log('Index created\n');
98
108
 
99
- // Execute search
100
- const results = await executeSearch(query);
109
+ // Execute search with chunks for hybrid search (text-only if embeddings failed)
110
+ const results = await executeSearch(query, 10, chunks);
101
111
 
102
112
  // Format and display results
103
113
  const output = formatResults(results);
package/src/embeddings.js CHANGED
@@ -13,6 +13,7 @@ try {
13
13
 
14
14
  let modelCache = null;
15
15
  let cacheCleared = false;
16
+ let modelLoadTime = 0;
16
17
 
17
18
  function clearModelCache() {
18
19
  const cacheDirs = [
@@ -37,6 +38,7 @@ async function getModel(retryOnError = true) {
37
38
  return modelCache;
38
39
  }
39
40
 
41
+ const modelStart = performance.now();
40
42
  console.error('Loading embeddings model (this may take a moment on first run)...');
41
43
 
42
44
  const modelLoadPromise = pipeline(
@@ -50,6 +52,7 @@ async function getModel(retryOnError = true) {
50
52
 
51
53
  try {
52
54
  modelCache = await Promise.race([modelLoadPromise, timeoutPromise]);
55
+ modelLoadTime = performance.now() - modelStart;
53
56
  } catch (e) {
54
57
  if (retryOnError && !cacheCleared && (e.message.includes('Protobuf') || e.message.includes('parsing'))) {
55
58
  console.error('Detected corrupted cache, clearing and retrying...');
@@ -65,37 +68,8 @@ async function getModel(retryOnError = true) {
65
68
  return modelCache;
66
69
  }
67
70
 
68
- async function meanPooling(modelOutput, attentionMask) {
69
- // Get token embeddings from model output
70
- const tokenEmbeddings = modelOutput.data;
71
- const embeddingDim = modelOutput.dims[modelOutput.dims.length - 1];
72
- const batchSize = modelOutput.dims[0];
73
- const seqLength = modelOutput.dims[1];
74
-
75
- const pooled = [];
76
-
77
- for (let b = 0; b < batchSize; b++) {
78
- let sum = new Array(embeddingDim).fill(0);
79
- let count = 0;
80
-
81
- for (let s = 0; s < seqLength; s++) {
82
- const tokenIdx = b * seqLength + s;
83
- const maskValue = attentionMask[tokenIdx] || 1;
84
-
85
- if (maskValue > 0) {
86
- const tokenStart = tokenIdx * embeddingDim;
87
- for (let d = 0; d < embeddingDim; d++) {
88
- sum[d] += tokenEmbeddings[tokenStart + d] * maskValue;
89
- }
90
- count += maskValue;
91
- }
92
- }
93
-
94
- const normalized = sum.map(v => v / Math.max(count, 1e-9));
95
- pooled.push(normalized);
96
- }
97
-
98
- return pooled;
71
+ export function getModelLoadTime() {
72
+ return modelLoadTime;
99
73
  }
100
74
 
101
75
  export async function generateEmbeddings(texts) {
@@ -105,11 +79,16 @@ export async function generateEmbeddings(texts) {
105
79
  texts = [texts];
106
80
  }
107
81
 
108
- // Generate embeddings for all texts
109
- const embeddings = await model(texts, {
110
- pooling: 'mean',
111
- normalize: true
112
- });
82
+ // Generate embeddings for all texts with timeout per batch
83
+ const embeddings = await Promise.race([
84
+ model(texts, {
85
+ pooling: 'mean',
86
+ normalize: true
87
+ }),
88
+ new Promise((_, reject) =>
89
+ setTimeout(() => reject(new Error('Embedding generation timeout')), 60000)
90
+ )
91
+ ]);
113
92
 
114
93
  // Convert to regular arrays
115
94
  const result = [];
package/src/ignore-parser.js CHANGED
@@ -106,109 +106,17 @@ function loadDefaultIgnores() {
106
106
 
107
107
  function getHardcodedIgnores() {
108
108
  return new Set([
109
- 'node_modules', '.git', '.svn', '.hg', 'dist', 'build', 'out',
110
- 'target', 'vendor', '__pycache__', '.pytest_cache', '.mypy_cache',
111
- '.next', '.nuxt', '.cache', '.parcel-cache', '.vite', '.turbo',
112
- 'coverage', '.nyc_output', '.firebase', '.terraform', '.aws',
113
- '.azure', '.gcloud', '.vscode', '.idea', '.vs', 'bin', 'obj',
114
- '.gradle', '.mvn', 'Pods', 'DerivedData', '.bundle',
115
- '.yarn', '.pnp', 'pnpm-lock.yaml', '.pnpm-store',
116
- '.tox', '.eggs', '*.egg-info', '.venv', 'venv', 'env',
117
- '.tsc', '.eslintcache', '.stylelintcache', '.parcel-cache',
118
- 'temp', 'tmp', '.tmp', '.DS_Store', 'Thumbs.db',
119
- '.swp', '.swo', '*.swp', '*.swo', '.tern-port',
120
- 'dist-server', 'out-tsc', '.cache', '.parcel-cache',
121
- 'typings', '.env', '.env.local', '.env.*.local',
122
- // JSON files - PRIMARY PRIORITY for memory reduction
123
- '*.json', 'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml',
124
- 'Gemfile.lock', 'poetry.lock', 'Pipfile.lock',
125
- // Lock files
126
- '*.lock',
127
- // Build outputs
128
- 'public', 'static', 'site', '_site', '.docusaurus', '.gatsby',
129
- // Cache/dependency directories
130
- '.rush', '.lerna', '.nx',
131
- // IDE/editor configs
132
- '.cursor', '.replit', '.sublime-project', '.sublime-workspace',
133
- '*.iml', '.project', '.classpath', '.settings', '*.sublime-*',
109
+ // Lock files / package manager artifacts
110
+ 'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml',
111
+ 'Gemfile.lock', 'poetry.lock', 'Pipfile.lock', 'Cargo.lock',
112
+ 'composer.lock', 'go.sum',
134
113
  // OS files
135
- '.Spotlight-V100', '.Trashes', 'ehthumbs.db', '.fseventsd',
136
- '.TemporaryItems', '.AppleDouble', '.LSOverride', 'desktop.ini',
137
- // Large data files
138
- '*.db', '*.sqlite', '*.sqlite3', '*.bak', '*.dump',
139
- '*.backup', '*.data', '*.orig',
140
- // Logs and temp
141
- '*.log', 'logs', 'npm-debug.log', 'yarn-error.log',
142
- // Test coverage and reports
143
- 'lcov.info', '.coverage', 'test-results',
144
- // Database related
145
- 'storage', 'fixtures',
146
- // LLM/Vector related
147
- '.llamaindex', '.chroma', '.vectorstore', '.embeddings',
148
- '.langchain', '.autogen', '.semantic-kernel', '.openai-cache',
149
- '.anthropic-cache', 'embeddings', 'vector-db', 'faiss-index',
150
- 'chromadb', 'pinecone-cache', 'weaviate-data',
151
- // Compiled output
114
+ '.DS_Store', 'Thumbs.db', 'desktop.ini',
115
+ // Editor swap files
116
+ '.tern-port',
117
+ // Compiled binary artifacts (files, not dirs)
152
118
  '*.min.js', '*.min.css', '*.bundle.js', '*.chunk.js', '*.map',
153
- // Generated/build artifacts
154
- '.assets', 'out-tsc', 'cmake_build_debug', 'cmake_build_release',
155
- // Version managers
156
- '.rbenv', '.nvm', '.nvmrc',
157
- // Ruby specific
158
- '*.gem', '*.rbc', '/pkg', '/spec/reports', '/spec/examples.txt',
159
- '/test/tmp', '/test/version_tmp', 'lib/bundler/man', '.ruby-version',
160
- // Go specific
161
- 'go.work',
162
- // Rust specific
163
- 'Cargo.lock', '**/*.rs.bk', '*.pdb',
164
- // Java specific
165
- '*.class', '*.jar', '*.war', '*.ear', '*.nar', '*.nupkg', '*.snupkg',
166
- // C# specific
167
- '*.suo', '*.user', '*.userosscache', '*.sln.docstates',
168
- 'project.lock.json', 'project.fragment.lock.json', 'artifacts',
169
- // C/C++ specific
170
- '*.o', '*.a', '*.so', '*.exe', '*.obj', '*.dll', '*.dylib',
171
- 'CMakeFiles', 'CMakeCache.txt', '*.cmake',
172
- // Swift/Xcode specific
173
- '*.xcodeproj', '*.xcworkspace', '*.moved-aside', '*.pbxuser',
174
- '*.mode1v3', '*.mode2v3', '*.perspectivev3',
175
- // Scala/SBT specific
176
- 'lib_managed', 'src_managed', 'project/boot', 'project/plugins/project',
177
- '.history', '.lib',
178
- // PHP specific
179
- 'composer.lock', '*.phar',
180
- // Docker
181
- '.dockerignore', 'docker-compose.override.yml', '.docker',
182
- // Documentation build
183
- 'docs/_build', '.vuepress',
184
- // Testing frameworks
185
- 'jest.config', 'vitest.config', 'pytest.ini', 'tox.ini',
186
- '__tests__', '__mocks__', 'spec', 'cypress', 'playwright',
187
- // Monorepo workspace patterns (implicit through directory coverage)
188
- '.turbo', '.nx',
189
- // Python package patterns
190
- '*.py[cod]', '*$py.class', '.Python', 'pip-log.txt',
191
- 'pip-delete-this-directory.txt', '.hypothesis', '.pyre', '.pytype',
192
- '*.whl',
193
- // Config/metadata that are typically low-value
194
- '*.config.js', '*.config.ts', 'webpack.config.js', 'rollup.config.js',
195
- 'vite.config.js', 'tsconfig.json', 'jsconfig.json', 'babel.config',
196
- '.babelrc', '.eslintrc', '.prettierrc', '.stylelintrc', '.editorconfig',
197
- '*.local', '*.development', '*.production',
198
- // Node specific
199
- '.npm', '.node_repl_history', '*.tsbuildinfo', 'yarn-error.log',
200
- // Documentation/reference files that don't help with search
201
- '*.md', '*.txt', '*.rst', '*.adoc', 'docs', 'documentation', 'wiki',
202
- 'CHANGELOG', 'HISTORY', 'NEWS', 'UPGRADING', 'FAQ', 'CONTRIBUTING',
203
- 'SECURITY', 'LICENSE', 'LICENCE', 'COPYRIGHT', 'NOTICE', 'AUTHORS',
204
- 'THIRDPARTY',
205
- // Test and coverage files
206
- '*.test', '*.spec', 'test', 'tests', 'htmlcov',
207
- // Profiling
208
- '*.prof', '*.cpuprofile', '*.heapprofile',
209
- // Misc
210
- '.tern-port', 'firebase-debug.log', 'firestore-debug.log',
211
- 'ui-debug.log', '.firebaserc', '.stackdump'
119
+ '*.tsbuildinfo',
212
120
  ]);
213
121
  }
214
122
 
@@ -283,76 +191,54 @@ export function loadIgnorePatterns(rootPath) {
283
191
  return merged;
284
192
  }
285
193
 
286
- // Directories to always ignore
194
+ // Directories to always ignore - only clear non-source directories
287
195
  const IGNORED_DIRECTORIES = new Set([
288
- // Dependencies - NEVER include
196
+ // Dependencies
289
197
  'node_modules', 'bower_components', 'jspm_packages', 'web_modules',
290
198
  // Version control
291
- '.git', '.svn', '.hg', '.bzr', '.vscode', '.idea', '.vs', '.atom', '.sublime-project',
292
- // Build outputs - comprehensive list
293
- 'dist', 'dist-server', 'dist-ssr', 'dist-client', 'dist-server',
294
- 'build', 'built', 'Build', 'BUILD',
295
- 'out', 'output', 'Output', 'OUT', 'release', 'Release', 'RELEASE',
296
- 'target', 'Target', 'TARGET',
297
- 'bin', 'Bin', 'BIN', 'obj', 'Obj', 'OBJ',
298
- 'public', 'static', 'assets', 'www', 'wwwroot',
299
- 'site', '_site', '.site', '.docusaurus', '.gatsby', '.vuepress',
300
- 'storybook-static', '.nuxt', 'nuxt', '.next', 'next',
301
- 'out-tsc', 'tsc', '.tsc',
199
+ '.git', '.svn', '.hg', '.bzr',
200
+ // IDE
201
+ '.vscode', '.idea', '.vs', '.atom',
202
+ // Build outputs (unambiguous names only)
203
+ 'dist', 'dist-server', 'dist-ssr', 'dist-client',
204
+ 'build', 'built',
205
+ 'out', 'out-tsc',
206
+ 'target',
207
+ 'storybook-static', '.docusaurus', '.gatsby', '.vuepress',
208
+ '.nuxt', '.next',
209
+ '.tsc',
302
210
  // Cache directories
303
- '.cache', 'cache', '.parcel-cache', '.vite', 'vite', '.turbo', 'turbo',
211
+ '.cache', '.parcel-cache', '.vite', '.turbo',
304
212
  '.npm', '.yarn', '.pnp', '.pnpm-store', '.rush', '.lerna', '.nx',
305
213
  // Testing
306
214
  'coverage', '.nyc_output', '.coverage', 'htmlcov', 'test-results',
307
- 'test', 'tests', 'Test', 'Tests', 'TEST', 'TESTS',
308
215
  '__tests__', '__mocks__', '__snapshots__', '__fixtures__',
309
- 'cypress', 'playwright', 'e2e', 'integration', 'spec', 'specs',
216
+ 'cypress', 'playwright',
310
217
  '.tox', '.eggs', '.hypothesis', '.pyre', '.pytype',
311
218
  // Python
312
- '__pycache__', '.pytest_cache', '.mypy_cache', '.venv', 'venv', 'env',
313
- 'env.bak', 'venv.bak', '.Python', 'pip-wheel-metadata', '*.egg-info',
219
+ '__pycache__', '.pytest_cache', '.mypy_cache', '.venv', 'venv',
314
220
  // Java/Gradle/Maven
315
- '.gradle', '.mvn', 'gradle', 'mvn', '.settings', '.project', '.classpath',
221
+ '.gradle', '.mvn',
316
222
  // iOS/Android
317
- 'Pods', 'DerivedData', 'build', '.bundle', 'xcuserdata', '.xcodeproj', '.xcworkspace',
223
+ 'Pods', 'DerivedData', '.bundle', 'xcuserdata',
318
224
  // Ruby
319
- 'vendor', '.bundle', '.ruby-version', 'pkg',
320
- // Rust
321
- 'target', 'Cargo.lock',
322
- // Go
323
- 'vendor', 'Godeps',
324
- // PHP
325
- 'vendor', 'composer',
225
+ '.bundle', 'pkg',
326
226
  // Infrastructure
327
227
  '.terraform', '.terragrunt-cache', '.pulumi', '.serverless', '.firebase',
328
- '.aws', '.azure', '.gcloud', '.vercel', '.netlify', '.now',
329
- // Docker
330
- '.docker', 'docker', '.dockerignore',
228
+ '.aws', '.azure', '.gcloud', '.vercel', '.netlify',
331
229
  // Temp files
332
- 'temp', 'tmp', '.tmp', '.temp', 'tmpfs', 'scratch', '.scratch',
333
- // Documentation
334
- 'docs', 'doc', 'documentation', 'wiki', 'guides', 'examples', 'demo', 'demos',
335
- 'CHANGELOG', 'HISTORY', 'NEWS', 'LICENSE', 'LICENCE', 'COPYING', 'AUTHORS',
336
- // IDE/Editor
337
- '.vs', '.vscode', '.idea', '.eclipse', '.settings', '.classpath', '.project',
338
- // Logs
339
- 'logs', 'log', '*.log',
340
- // Data/Storage
341
- 'storage', 'data', 'database', 'db', 'fixtures', 'seeds',
342
- 'uploads', 'files', 'media', 'resources', 'assets', 'images', 'img',
343
- // LLM/AI
230
+ 'temp', 'tmp', '.tmp', '.temp',
231
+ // LLM/AI artifacts
344
232
  '.llamaindex', '.chroma', '.vectorstore', '.embeddings',
345
233
  '.langchain', '.autogen', '.semantic-kernel', '.openai-cache',
346
234
  '.anthropic-cache', 'embeddings', 'vector-db', 'faiss-index',
347
235
  'chromadb', 'pinecone-cache', 'weaviate-data',
348
- // Package managers
349
- '.yarn', '.pnpm', '.npm', '.bun',
350
- // Compiled outputs
351
- 'typings', 'types', '@types', 'type-definitions',
352
- // Misc
353
- 'public', 'static', 'site', '_site',
354
- 'cmake_build_debug', 'cmake_build_release', 'CMakeFiles', 'CMakeCache.txt',
355
- 'out-tsc', 'dist-server', 'server', 'client', 'browser', 'esm', 'cjs', 'umd', 'lib', 'es'
236
+ // Package manager caches
237
+ '.pnpm', '.bun',
238
+ // Static/built asset directories
239
+ 'assets', 'static', 'public', 'wwwroot', 'www',
240
+ // Misc generated
241
+ 'cmake_build_debug', 'cmake_build_release', 'CMakeFiles',
356
242
  ]);
357
243
 
358
244
  export function isCodeFile(filePath) {
@@ -373,28 +259,34 @@ export function isCodeFile(filePath) {
373
259
  export function shouldIgnoreDirectory(dirPath) {
374
260
  const normalizedPath = dirPath.replace(/\\/g, '/');
375
261
  const pathParts = normalizedPath.split('/');
376
-
377
262
  for (const part of pathParts) {
378
263
  if (IGNORED_DIRECTORIES.has(part)) {
379
264
  return true;
380
265
  }
381
266
  }
382
-
383
267
  return false;
384
268
  }
385
269
 
386
- export function shouldIgnore(filePath, ignorePatterns) {
270
+ export function shouldIgnore(filePath, ignorePatterns, isDirectory = false) {
387
271
  const normalizedPath = filePath.replace(/\\/g, '/');
388
272
  const pathParts = normalizedPath.split('/');
389
273
  const fileName = pathParts[pathParts.length - 1];
390
-
391
- // Check if any directory in path should be ignored
274
+
275
+ if (isDirectory) {
276
+ if (IGNORED_DIRECTORIES.has(fileName)) return true;
277
+ for (const pattern of ignorePatterns) {
278
+ if (!pattern.includes('/') && fileName === pattern) return true;
279
+ }
280
+ return false;
281
+ }
282
+
283
+ // For files: check all ancestor directories
392
284
  for (const part of pathParts.slice(0, -1)) {
393
285
  if (IGNORED_DIRECTORIES.has(part)) {
394
286
  return true;
395
287
  }
396
288
  }
397
-
289
+
398
290
  // Check if it's a code file using whitelist
399
291
  if (!isCodeFile(filePath)) {
400
292
  return true;
@@ -402,22 +294,13 @@ export function shouldIgnore(filePath, ignorePatterns) {
402
294
 
403
295
  // Check against additional ignore patterns
404
296
  for (const pattern of ignorePatterns) {
405
- // Handle path patterns (contain /)
406
297
  if (pattern.includes('/')) {
407
- if (normalizedPath.includes(pattern)) {
408
- return true;
409
- }
410
- }
411
- // Handle exact file name patterns
412
- else if (fileName === pattern) {
298
+ if (normalizedPath.includes(pattern)) return true;
299
+ } else if (fileName === pattern) {
413
300
  return true;
414
- }
415
- // Handle directory name patterns (match any path part)
416
- else {
301
+ } else {
417
302
  for (const part of pathParts) {
418
- if (part === pattern || part.startsWith(pattern + '/')) {
419
- return true;
420
- }
303
+ if (part === pattern) return true;
421
304
  }
422
305
  }
423
306
  }
package/src/scanner.js CHANGED
@@ -1,6 +1,6 @@
1
1
  import { readdirSync, statSync, readFileSync } from 'fs';
2
2
  import { join, relative } from 'path';
3
- import { shouldIgnore, isCodeFile } from './ignore-parser.js';
3
+ import { shouldIgnore, shouldIgnoreDirectory, isCodeFile } from './ignore-parser.js';
4
4
 
5
5
  function getFileExtension(filePath) {
6
6
  const lastDot = filePath.lastIndexOf('.');
@@ -33,14 +33,15 @@ function walkDirectory(dirPath, ignorePatterns, relativePath = '') {
33
33
  // Normalize to forward slashes for consistent ignore pattern matching
34
34
  const normalizedRelPath = relPath.replace(/\\/g, '/');
35
35
 
36
- // Check if should ignore
37
- if (shouldIgnore(normalizedRelPath, ignorePatterns)) {
38
- continue;
39
- }
40
-
41
36
  if (entry.isDirectory()) {
37
+ if (shouldIgnoreDirectory(normalizedRelPath) || shouldIgnore(normalizedRelPath, ignorePatterns, true)) {
38
+ continue;
39
+ }
42
40
  files.push(...walkDirectory(fullPath, ignorePatterns, relPath));
43
41
  } else if (entry.isFile()) {
42
+ if (shouldIgnore(normalizedRelPath, ignorePatterns, false)) {
43
+ continue;
44
+ }
44
45
  if (isCodeFile(normalizedRelPath) && !isBinaryFile(entry.name)) {
45
46
  try {
46
47
  const stat = entry.isSymbolicLink ? null : statSync(fullPath);
@@ -64,7 +65,7 @@ function walkDirectory(dirPath, ignorePatterns, relativePath = '') {
64
65
  return files;
65
66
  }
66
67
 
67
- function chunkContent(content, chunkSize = 1000, overlapSize = 100) {
68
+ function chunkContent(content, chunkSize = 60, overlapSize = 15) {
68
69
  const lines = content.split('\n');
69
70
  const chunks = [];
70
71
 
@@ -80,7 +81,6 @@ function chunkContent(content, chunkSize = 1000, overlapSize = 100) {
80
81
  });
81
82
  }
82
83
 
83
- // Stop if we've reached the end
84
84
  if (endIdx === lines.length) {
85
85
  break;
86
86
  }
@@ -99,7 +99,7 @@ export function scanRepository(rootPath, ignorePatterns) {
99
99
  const mtime = file.mtime;
100
100
 
101
101
  // For small files, treat as single chunk
102
- if (content.split('\n').length <= 1000) {
102
+ if (content.split('\n').length <= 60) {
103
103
  chunks.push({
104
104
  file_path: file.relativePath,
105
105
  chunk_index: 0,
package/src/search.js CHANGED
@@ -1,20 +1,84 @@
1
1
  import { generateSingleEmbedding } from './embeddings.js';
2
2
  import { searchSimilar } from './store.js';
3
+ import { buildTextIndex, searchText } from './text-search.js';
3
4
 
4
- export async function executeSearch(query, limit = 10) {
5
+ export async function executeSearch(query, limit = 10, allChunks = null, skipVector = false) {
5
6
  if (!query || query.trim().length === 0) {
6
7
  throw new Error('Query cannot be empty');
7
8
  }
8
9
 
9
10
  console.error(`Searching for: "${query}"`);
10
11
 
11
- // Generate embedding for query
12
- const queryEmbedding = await generateSingleEmbedding(query);
12
+ try {
13
+ let vectorResults = [];
14
+ let textResults = [];
13
15
 
14
- // Search vector store
15
- const results = await searchSimilar(queryEmbedding, limit);
16
+ if (allChunks && allChunks.length > 0) {
17
+ const textIndexData = buildTextIndex(allChunks);
18
+ textResults = searchText(query, allChunks, textIndexData);
19
+ }
20
+
21
+ const hasGoodTextResults = textResults.length > 0 && textResults[0].score > 0.3;
22
+ if (!skipVector && !hasGoodTextResults) {
23
+ try {
24
+ const queryEmbedding = await generateSingleEmbedding(query);
25
+ vectorResults = await searchSimilar(queryEmbedding, limit * 2);
26
+ } catch (e) {
27
+ console.warn(`Vector search unavailable: ${e.message}`);
28
+ }
29
+ }
30
+
31
+ if (vectorResults.length > 0 && textResults.length > 0) {
32
+ return mergeSearchResults(vectorResults, textResults.slice(0, limit * 2), limit);
33
+ }
34
+
35
+ const allResults = vectorResults.length > 0 ? vectorResults : textResults;
36
+ return allResults.slice(0, limit);
37
+ } catch (error) {
38
+ console.error('Search error:', error.message);
39
+ if (allChunks && allChunks.length > 0) {
40
+ const textIndexData = buildTextIndex(allChunks);
41
+ const textResults = searchText(query, allChunks, textIndexData);
42
+ return textResults.slice(0, limit);
43
+ }
44
+ throw error;
45
+ }
46
+ }
47
+
48
+ function mergeSearchResults(vectorResults, textResults, limit) {
49
+ const merged = new Map();
50
+
51
+ vectorResults.forEach((result) => {
52
+ const key = `${result.file_path}:${result.chunk_index}`;
53
+ merged.set(key, {
54
+ ...result,
55
+ vectorScore: result.score || 0,
56
+ textScore: 0,
57
+ finalScore: (result.score || 0) * 0.8
58
+ });
59
+ });
60
+
61
+ textResults.forEach((result) => {
62
+ const key = `${result.file_path}:${result.chunk_index || 0}`;
63
+ if (merged.has(key)) {
64
+ const existing = merged.get(key);
65
+ existing.textScore = result.score || 0;
66
+ existing.finalScore = (existing.vectorScore * 0.8) + (result.score * 0.2);
67
+ } else {
68
+ const textScore = result.score || 0;
69
+ const finalScore = Math.max(textScore * 0.2, textScore > 0.7 ? 0.4 : 0);
70
+ merged.set(key, {
71
+ ...result,
72
+ vectorScore: 0,
73
+ textScore,
74
+ finalScore
75
+ });
76
+ }
77
+ });
16
78
 
17
- return results;
79
+ return Array.from(merged.values())
80
+ .sort((a, b) => b.finalScore - a.finalScore)
81
+ .slice(0, limit);
18
82
  }
19
83
 
20
84
  export function formatResults(results) {
@@ -27,15 +91,14 @@ export function formatResults(results) {
27
91
 
28
92
  for (let i = 0; i < results.length; i++) {
29
93
  const result = results[i];
30
- const match = i + 1;
94
+ const scoreValue = result.finalScore !== undefined ? result.finalScore : (result.score || 0);
95
+ const scorePercent = (scoreValue * 100).toFixed(1);
31
96
 
32
- lines.push(`${match}. ${result.file_path}:${result.line_start}-${result.line_end} (score: ${(result.score * 100).toFixed(1)}%)`);
97
+ lines.push(`${i + 1}. ${result.file_path}:${result.line_start}-${result.line_end} (score: ${scorePercent}%)`);
33
98
 
34
- // Show code snippet (first 3 lines)
35
99
  const codeLines = result.content.split('\n').slice(0, 3);
36
100
  for (const line of codeLines) {
37
- const trimmed = line.slice(0, 80); // Limit line length
38
- lines.push(` > ${trimmed}`);
101
+ lines.push(` > ${line.slice(0, 80)}`);
39
102
  }
40
103
 
41
104
  lines.push('');
package/src/store.js CHANGED
@@ -5,6 +5,7 @@ import { mkdirSync, existsSync } from 'fs';
5
5
  let dbConnection = null;
6
6
  let tableRef = null;
7
7
  let isFirstBatch = true;
8
+ let vectorSearchCache = new Map();
8
9
 
9
10
  export async function initStore(dbPath) {
10
11
  // Ensure directory exists
@@ -121,12 +122,19 @@ export async function searchSimilar(queryEmbedding, limit = 10) {
121
122
  // Ensure vector is a proper array/tensor
122
123
  const query = Array.isArray(queryEmbedding) ? queryEmbedding : Array.from(queryEmbedding);
123
124
 
125
+ // Check cache using 20-dimension hash for near-zero collision rate
126
+ const cacheKey = query.slice(0, 20).join(',');
127
+ const cached = vectorSearchCache.get(cacheKey);
128
+ if (cached) {
129
+ return cached.slice(0, limit);
130
+ }
131
+
124
132
  const results = await tableRef
125
133
  .search(query)
126
134
  .limit(limit)
127
135
  .execute();
128
136
 
129
- return results.map(result => {
137
+ const formattedResults = results.map(result => {
130
138
  const distance = result._distance !== undefined ? result._distance : (result.distance || 0);
131
139
  const score = distance !== null && distance !== undefined ? 1 / (1 + distance) : 0;
132
140
  return {
@@ -139,6 +147,15 @@ export async function searchSimilar(queryEmbedding, limit = 10) {
139
147
  score: score
140
148
  };
141
149
  });
150
+
151
+ // Cache results (keep max 100 cached searches)
152
+ if (vectorSearchCache.size > 100) {
153
+ const firstKey = vectorSearchCache.keys().next().value;
154
+ vectorSearchCache.delete(firstKey);
155
+ }
156
+ vectorSearchCache.set(cacheKey, formattedResults);
157
+
158
+ return formattedResults;
142
159
  } catch (e) {
143
160
  console.error('Search failed:', e.message);
144
161
  return [];
package/src/text-search.js CHANGED
@@ -4,19 +4,21 @@ export function buildTextIndex(chunks) {
4
4
 
5
5
  chunks.forEach((chunk, idx) => {
6
6
  const tokens = tokenize(chunk.content);
7
+ const fileNameTokens = tokenize(chunk.file_path);
7
8
  const symbols = extractSymbols(chunk.content);
8
9
  const frequency = new Map();
9
10
 
10
11
  tokens.forEach(token => {
11
12
  frequency.set(token, (frequency.get(token) || 0) + 1);
12
13
  if (!index.has(token)) {
13
- index.set(token, []);
14
+ index.set(token, new Set());
14
15
  }
15
- index.get(token).push(idx);
16
+ index.get(token).add(idx);
16
17
  });
17
18
 
18
19
  chunkMetadata[idx] = {
19
20
  tokens,
21
+ fileNameTokens,
20
22
  symbols,
21
23
  frequency,
22
24
  isCode: isCodeFile(chunk.file_path),
@@ -32,47 +34,66 @@ export function searchText(query, chunks, indexData) {
32
34
  const querySymbols = extractSymbols(query);
33
35
  const chunkScores = new Map();
34
36
 
35
- chunks.forEach((chunk, idx) => {
37
+ // Use index to find candidate chunks efficiently
38
+ const candidates = new Set();
39
+ queryTokens.forEach(token => {
40
+ if (index.has(token)) {
41
+ for (const idx of index.get(token)) candidates.add(idx);
42
+ }
43
+ });
44
+ querySymbols.forEach(sym => {
45
+ if (index.has(sym)) {
46
+ for (const idx of index.get(sym)) candidates.add(idx);
47
+ }
48
+ });
49
+
50
+ for (const idx of candidates) {
51
+ const chunk = chunks[idx];
52
+ const meta = chunkMetadata[idx];
36
53
  let score = 0;
37
54
 
55
+ // Exact phrase match - highest priority (saves embedding cost)
56
+ if (chunk.content.toLowerCase().includes(query.toLowerCase())) {
57
+ score += 30;
58
+ }
59
+
60
+ // Symbol match in content - function/class named after query terms
61
+ querySymbols.forEach(symbol => {
62
+ if (meta.symbols.includes(symbol)) score += 10;
63
+ });
64
+
65
+ // Filename token match - strong signal that this file is about the query topic
66
+ let fileNameMatches = 0;
38
67
  queryTokens.forEach(token => {
39
- if (index.has(token)) {
40
- if (index.get(token).includes(idx)) {
41
- const freq = chunkMetadata[idx].frequency.get(token) || 1;
42
- const boost = token.length > 4 ? 1.5 : 1;
43
- score += boost * freq;
44
- }
45
- }
68
+ if (meta.fileNameTokens.includes(token)) fileNameMatches++;
46
69
  });
70
+ if (fileNameMatches > 0) {
71
+ score += fileNameMatches * 10;
72
+ }
47
73
 
48
- querySymbols.forEach(symbol => {
49
- if (chunkMetadata[idx].symbols.includes(symbol)) {
50
- score += 5;
74
+ // Token frequency scoring
75
+ queryTokens.forEach(token => {
76
+ if (index.has(token) && index.get(token).has(idx)) {
77
+ const freq = meta.frequency.get(token) || 1;
78
+ const lengthBoost = token.length > 4 ? 1.5 : 1;
79
+ score += lengthBoost * Math.min(freq, 5);
51
80
  }
52
81
  });
53
82
 
54
- const exactMatch = chunk.content.includes(query);
55
- if (exactMatch) {
56
- score += 10;
57
- }
83
+ // Code file boost
84
+ if (meta.isCode) score *= 1.2;
58
85
 
59
- if (chunkMetadata[idx].isCode) {
60
- score *= 1.2;
61
- }
86
+ if (score > 0) chunkScores.set(idx, score);
87
+ }
62
88
 
63
- if (score > 0) {
64
- chunkScores.set(idx, score);
65
- }
66
- });
89
+ const entries = Array.from(chunkScores.entries()).sort((a, b) => b[1] - a[1]);
90
+ const maxScore = entries.length > 0 ? entries[0][1] : 1;
67
91
 
68
- const results = Array.from(chunkScores.entries())
69
- .map(([idx, score]) => ({
70
- ...chunks[idx],
71
- score: Math.min(score / 100, 1),
72
- _rawScore: score,
73
- }))
74
- .filter(r => r.score > 0)
75
- .sort((a, b) => b._rawScore - a._rawScore);
92
+ const results = entries.map(([idx, score]) => ({
93
+ ...chunks[idx],
94
+ score: score / maxScore,
95
+ _rawScore: score,
96
+ }));
76
97
 
77
98
  return results;
78
99
  }
@@ -80,20 +101,24 @@ export function searchText(query, chunks, indexData) {
80
101
  function tokenize(text) {
81
102
  const tokens = new Set();
82
103
 
83
- text.toLowerCase().split(/\s+/).forEach(word => {
104
+ text.split(/\s+/).forEach(word => {
84
105
  if (word.length === 0) return;
85
106
 
86
- tokens.add(word.replace(/[^\w]/g, ''));
87
-
88
- const camelCaseTokens = word.match(/[a-z]+|[A-Z][a-z]*|[0-9]+/g) || [];
107
+ // camelCase/PascalCase split BEFORE lowercasing so uppercase boundaries are visible
108
+ const camelCaseTokens = word.match(/[A-Z]?[a-z]+|[A-Z]+(?=[A-Z][a-z]|\d|\W|$)|[0-9]+/g) || [];
89
109
  camelCaseTokens.forEach(t => {
90
110
  if (t.length > 1) tokens.add(t.toLowerCase());
91
111
  });
92
112
 
93
- const snakeCaseTokens = word.split(/[-_]/).filter(t => t.length > 0);
94
- snakeCaseTokens.forEach(t => {
95
- if (t.length > 1) tokens.add(t.toLowerCase());
113
+ // snake_case and kebab-case split
114
+ word.split(/[-_.]/).forEach(t => {
115
+ const cleaned = t.replace(/[^\w]/g, '').toLowerCase();
116
+ if (cleaned.length > 1) tokens.add(cleaned);
96
117
  });
118
+
119
+ // Full word lowercased (stripped of punctuation)
120
+ const cleaned = word.replace(/[^\w]/g, '').toLowerCase();
121
+ if (cleaned.length > 1) tokens.add(cleaned);
97
122
  });
98
123
 
99
124
  return Array.from(tokens).filter(t => t.length > 1);