codebasesearch 0.1.22 → 0.1.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.prd ADDED
@@ -0,0 +1,78 @@
1
+ {
2
+ "project": "code-search",
3
+ "created": "2026-03-12",
4
+ "objective": "Profile and improve code-search speed and result quality",
5
+ "items": [
6
+ {
7
+ "id": "fix-dedup-buildtextindex",
8
+ "subject": "Remove duplicate buildTextIndex from search.js",
9
+ "status": "pending",
10
+ "description": "search.js has a private copy of buildTextIndex and tokenize/extractSymbols/isCodeFile that duplicates text-search.js. Import the exported buildTextIndex from text-search.js instead.",
11
+ "category": "refactor",
12
+ "effort": "small",
13
+ "blocking": ["fix-score-normalization", "fix-hybrid-weights"],
14
+ "blockedBy": []
15
+ },
16
+ {
17
+ "id": "fix-chunk-size",
18
+ "subject": "Reduce chunk size from 300 to 60 lines for better semantic granularity",
19
+ "status": "pending",
20
+ "description": "scanner.js uses 300-line chunks. Embeddings work best on 50-100 line chunks. Reduce to 60-line chunks with 15-line overlap for better vector search quality.",
21
+ "category": "feature",
22
+ "effort": "small",
23
+ "blocking": [],
24
+ "blockedBy": []
25
+ },
26
+ {
27
+ "id": "fix-score-normalization",
28
+ "subject": "Fix text search score normalization so top result is always 1.0",
29
+ "status": "pending",
30
+ "description": "Text scores divide raw by 100 but scores can exceed 100. Use dynamic max-score scaling. Lower hasGoodTextResults threshold from 0.5 to 0.3.",
31
+ "category": "bug",
32
+ "effort": "small",
33
+ "blocking": [],
34
+ "blockedBy": ["fix-dedup-buildtextindex"]
35
+ },
36
+ {
37
+ "id": "fix-hybrid-weights",
38
+ "subject": "Boost text-only exact-match results in hybrid merge",
39
+ "status": "pending",
40
+ "description": "Text-only results are capped at 20% weight. Give high-scoring text-only results a floor finalScore of 0.4.",
41
+ "category": "feature",
42
+ "effort": "small",
43
+ "blocking": [],
44
+ "blockedBy": ["fix-dedup-buildtextindex"]
45
+ },
46
+ {
47
+ "id": "fix-vector-cache-key",
48
+ "subject": "Strengthen vector search cache key to 20 dimensions",
49
+ "status": "pending",
50
+ "description": "Cache key uses only first 5 embedding dims. Use 20 dims for near-zero collision rate.",
51
+ "category": "bug",
52
+ "effort": "small",
53
+ "blocking": [],
54
+ "blockedBy": []
55
+ },
56
+ {
57
+ "id": "remove-dead-meanpooling",
58
+ "subject": "Remove dead meanPooling function from embeddings.js",
59
+ "status": "pending",
60
+ "description": "meanPooling is defined but never called. Remove dead code.",
61
+ "category": "refactor",
62
+ "effort": "small",
63
+ "blocking": [],
64
+ "blockedBy": []
65
+ },
66
+ {
67
+ "id": "verify-and-commit",
68
+ "subject": "Verify improvements and commit all changes",
69
+ "status": "pending",
70
+ "description": "Run end-to-end search logic test inline. Commit and push all changes.",
71
+ "category": "infra",
72
+ "effort": "small",
73
+ "blocking": [],
74
+ "blockedBy": ["fix-dedup-buildtextindex", "fix-chunk-size", "fix-score-normalization", "fix-hybrid-weights", "fix-vector-cache-key", "remove-dead-meanpooling"]
75
+ }
76
+ ],
77
+ "completed": []
78
+ }
package/mcp.js CHANGED
@@ -1,6 +1,5 @@
1
1
  #!/usr/bin/env node
2
2
 
3
- // MUST patch sharp before any other imports
4
3
  import fs from 'fs';
5
4
  import path from 'path';
6
5
  import { fileURLToPath } from 'url';
@@ -23,178 +22,136 @@ import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js'
23
22
  import { CallToolRequestSchema, ListToolsRequestSchema } from '@modelcontextprotocol/sdk/types.js';
24
23
  import { cwd } from 'process';
25
24
  import { join } from 'path';
26
- import { existsSync, readFileSync, appendFileSync, writeFileSync } from 'fs';
25
+ import { existsSync, readFileSync, appendFileSync, writeFileSync, readdirSync } from 'fs';
26
+ import { homedir } from 'os';
27
27
  import { supervisor } from './src/supervisor.js';
28
28
 
29
- async function ensureIgnoreEntry(rootPath) {
29
+ const WORKSPACE_PATH = join(homedir(), 'workspace');
30
+
31
+ function getWorkspaceFolders() {
32
+ try {
33
+ return readdirSync(WORKSPACE_PATH, { withFileTypes: true })
34
+ .filter(e => e.isDirectory() && !e.name.startsWith('.'))
35
+ .map(e => join(WORKSPACE_PATH, e.name));
36
+ } catch { return []; }
37
+ }
38
+
39
+ function ensureIgnoreEntry(rootPath) {
30
40
  const gitignorePath = join(rootPath, '.gitignore');
31
41
  const entry = '.code-search/';
32
-
33
42
  try {
34
43
  if (existsSync(gitignorePath)) {
35
44
  const content = readFileSync(gitignorePath, 'utf8');
36
- if (!content.includes(entry)) {
37
- appendFileSync(gitignorePath, `\n${entry}`);
38
- }
45
+ if (!content.includes(entry)) appendFileSync(gitignorePath, `\n${entry}`);
39
46
  } else {
40
47
  writeFileSync(gitignorePath, `${entry}\n`);
41
48
  }
42
- } catch (e) {
43
- // Ignore write errors
44
- }
49
+ } catch (e) {}
45
50
  }
46
51
 
47
- const server = new Server(
48
- {
49
- name: 'code-search-mcp',
50
- version: '0.1.0',
51
- },
52
- {
53
- capabilities: {
54
- tools: {},
55
- },
56
- }
57
- );
52
+ function formatResults(result, query, scope) {
53
+ if (result.resultsCount === 0) return `No results found${scope ? ` across ${scope}` : ''} for: "${query}"`;
54
+ const plural = result.resultsCount !== 1 ? 's' : '';
55
+ const header = `Found ${result.resultsCount} result${plural}${scope ? ` across ${scope}` : ''} for: "${query}"\n\n`;
56
+ const body = result.results.map((r) => {
57
+ const pathPart = r.relativePath || r.absolutePath;
58
+ const lineCount = r.totalLines ? ` [${r.totalLines}L]` : '';
59
+ const ctx = r.enclosingContext ? ` (in: ${r.enclosingContext})` : '';
60
+ const rHeader = `${r.rank}. ${pathPart}${lineCount}:${r.lines}${ctx} (score: ${r.score}%)`;
61
+ const rBody = r.snippet.split('\n').map((line) => ` ${line}`).join('\n');
62
+ return `${rHeader}\n${rBody}`;
63
+ }).join('\n\n');
64
+ return header + body;
65
+ }
66
+
67
+ function errResponse(msg) {
68
+ return { content: [{ type: 'text', text: msg }], isError: true };
69
+ }
70
+
71
+ function okResponse(text) {
72
+ return { content: [{ type: 'text', text }] };
73
+ }
58
74
 
59
- server.setRequestHandler(ListToolsRequestSchema, async () => {
60
- return {
61
- tools: [
62
- {
63
- name: 'search',
64
- description:
65
- 'Search through a code repository using semantic search with Jina embeddings. Automatically indexes the repository before searching.',
66
- inputSchema: {
67
- type: 'object',
68
- properties: {
69
- repository_path: {
70
- type: 'string',
71
- description:
72
- 'Absolute or relative path to the repository to search in (defaults to current directory)',
73
- },
74
- query: {
75
- type: 'string',
76
- description:
77
- 'Natural language search query (e.g., "authentication middleware", "database connection")',
78
- },
79
- },
80
- required: ['query'],
75
+ const server = new Server({ name: 'code-search-mcp', version: '0.1.0' }, { capabilities: { tools: {} } });
76
+
77
+ server.setRequestHandler(ListToolsRequestSchema, async () => ({
78
+ tools: [
79
+ {
80
+ name: 'search',
81
+ description: 'Search through a code repository. Automatically indexes before searching.',
82
+ inputSchema: {
83
+ type: 'object',
84
+ properties: {
85
+ repository_path: { type: 'string', description: 'Path to repository (defaults to current directory)' },
86
+ query: { type: 'string', description: 'Natural language search query' },
81
87
  },
88
+ required: ['query'],
82
89
  },
83
- ],
84
- };
85
- });
90
+ },
91
+ {
92
+ name: 'search_workspace',
93
+ description: 'Search across ALL repositories in ~/workspace simultaneously. Returns ranked results with repo name prefix.',
94
+ inputSchema: {
95
+ type: 'object',
96
+ properties: {
97
+ query: { type: 'string', description: 'Natural language search query' },
98
+ limit: { type: 'number', description: 'Max results to return (default: 10)' },
99
+ },
100
+ required: ['query'],
101
+ },
102
+ },
103
+ ],
104
+ }));
86
105
 
87
106
  server.setRequestHandler(CallToolRequestSchema, async (request) => {
88
107
  const { name, arguments: args } = request.params;
89
-
90
- if (name !== 'search') {
91
- return {
92
- content: [
93
- {
94
- type: 'text',
95
- text: `Unknown tool: ${name}`,
96
- },
97
- ],
98
- isError: true,
99
- };
100
- }
101
-
102
108
  const query = args?.query;
103
- const repositoryPath = args?.repository_path || cwd();
104
-
105
- if (!query || typeof query !== 'string') {
106
- return {
107
- content: [
108
- {
109
- type: 'text',
110
- text: 'Error: query is required and must be a string',
111
- },
112
- ],
113
- isError: true,
114
- };
115
- }
109
+
110
+ if (!['search', 'search_workspace'].includes(name)) return errResponse(`Unknown tool: ${name}`);
111
+ if (!query || typeof query !== 'string') return errResponse('Error: query is required and must be a string');
116
112
 
117
113
  try {
118
- await ensureIgnoreEntry(repositoryPath);
119
- const result = await supervisor.sendRequest({
120
- type: 'search',
121
- query,
122
- repositoryPath,
123
- });
124
-
125
- if (result.error) {
126
- return {
127
- content: [
128
- {
129
- type: 'text',
130
- text: `Error: ${result.error}`,
131
- },
132
- ],
133
- isError: true,
134
- };
114
+ if (name === 'search_workspace') {
115
+ const result = await supervisor.sendRequest({
116
+ type: 'search-all',
117
+ query,
118
+ workspacePaths: getWorkspaceFolders(),
119
+ limit: args?.limit || 10,
120
+ });
121
+ if (result.error) return errResponse(`Error: ${result.error}`);
122
+ return okResponse(formatResults(result, query, 'workspace'));
135
123
  }
136
124
 
137
- const repoPath = typeof repositoryPath === 'string' ? repositoryPath : null;
138
- const text =
139
- result.resultsCount === 0
140
- ? `No results found for query: "${query}"`
141
- : `Found ${result.resultsCount} result${result.resultsCount !== 1 ? 's' : ''} for query: "${query}"\n\n${result.results
142
- .map((r) => {
143
- const pathPart = r.relativePath || r.absolutePath;
144
- const lineCount = r.totalLines ? ` [${r.totalLines}L]` : '';
145
- const ctx = r.enclosingContext ? ` (in: ${r.enclosingContext})` : '';
146
- const header = `${r.rank}. ${pathPart}${lineCount}:${r.lines}${ctx} (score: ${r.score}%)`;
147
- const body = r.snippet.split('\n').map((line) => ` ${line}`).join('\n');
148
- return `${header}\n${body}`;
149
- })
150
- .join('\n\n')}`;
151
-
152
- return {
153
- content: [
154
- {
155
- type: 'text',
156
- text,
157
- },
158
- ],
159
- };
125
+ const repositoryPath = args?.repository_path || cwd();
126
+ ensureIgnoreEntry(repositoryPath);
127
+ const result = await supervisor.sendRequest({ type: 'search', query, repositoryPath });
128
+ if (result.error) return errResponse(`Error: ${result.error}`);
129
+ return okResponse(formatResults(result, query, null));
160
130
  } catch (error) {
161
- return {
162
- content: [
163
- {
164
- type: 'text',
165
- text: `Error: ${error.message}`,
166
- },
167
- ],
168
- isError: true,
169
- };
131
+ return errResponse(`Error: ${error.message}`);
170
132
  }
171
133
  });
172
134
 
173
135
  export async function startMcpServer() {
174
136
  const transport = new StdioServerTransport();
175
137
  await server.connect(transport);
138
+
139
+ const workspacePaths = getWorkspaceFolders();
140
+ if (workspacePaths.length > 0) {
141
+ supervisor.sendRequest({ type: 'index-all', workspacePaths })
142
+ .then(r => console.error(`[MCP] Pre-indexed workspace: ${r.message || JSON.stringify(r)}`))
143
+ .catch(e => console.error(`[MCP] Pre-index warning: ${e.message}`));
144
+ }
176
145
  }
177
146
 
178
147
  const isMain = process.argv[1] && (
179
- process.argv[1] === fileURLToPath(import.meta.url) ||
180
- process.argv[1].endsWith('mcp.js') ||
148
+ process.argv[1] === fileURLToPath(import.meta.url) ||
149
+ process.argv[1].endsWith('mcp.js') ||
181
150
  process.argv[1].endsWith('code-search-mcp')
182
151
  );
183
152
 
184
- if (isMain) {
185
- main().catch((error) => {
186
- console.error('Server error:', error);
187
- });
188
- }
153
+ process.on('uncaughtException', (error) => console.error('Uncaught exception:', error));
154
+ process.on('unhandledRejection', (reason) => console.error('Unhandled rejection:', reason));
189
155
 
190
- process.on('uncaughtException', (error) => {
191
- console.error('Uncaught exception:', error);
192
- });
193
-
194
- process.on('unhandledRejection', (reason) => {
195
- console.error('Unhandled rejection:', reason);
196
- });
197
-
198
- async function main() {
199
- await startMcpServer();
200
- }
156
+ async function main() { await startMcpServer(); }
157
+ if (isMain) main().catch((error) => console.error('Server error:', error));
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "codebasesearch",
3
- "version": "0.1.22",
3
+ "version": "0.1.24",
4
4
  "description": "Ultra-simple code search tool with Jina embeddings, LanceDB, and MCP protocol support",
5
5
  "type": "module",
6
6
  "bin": {
package/src/cli.js CHANGED
@@ -75,29 +75,39 @@ export async function run(args) {
75
75
  console.log('Generating embeddings and indexing...');
76
76
 
77
77
  // Generate embeddings in batches and upsert immediately to free memory
78
- const batchSize = 32;
79
- let processedCount = 0;
80
-
81
- for (let i = 0; i < chunks.length; i += batchSize) {
82
- const batchChunks = chunks.slice(i, i + batchSize);
83
- const batchTexts = batchChunks.map(c => c.content);
84
- const batchEmbeddings = await generateEmbeddings(batchTexts);
85
-
86
- // Create batch with embeddings
87
- const batchWithEmbeddings = batchChunks.map((chunk, idx) => ({
88
- ...chunk,
89
- vector: batchEmbeddings[idx]
90
- }));
78
+ // Optimize batch size based on chunk count (larger batches are more efficient)
79
+ let batchSize = 32;
80
+ if (chunks.length > 500) batchSize = 64;
81
+ if (chunks.length > 1000) batchSize = 96;
91
82
 
92
- // Upsert immediately to free memory
93
- await upsertChunks(batchWithEmbeddings);
94
- processedCount += batchWithEmbeddings.length;
83
+ let processedCount = 0;
84
+ let embeddingsAvailable = true;
85
+
86
+ try {
87
+ for (let i = 0; i < chunks.length; i += batchSize) {
88
+ const batchChunks = chunks.slice(i, i + batchSize);
89
+ const batchTexts = batchChunks.map(c => c.content);
90
+ const batchEmbeddings = await generateEmbeddings(batchTexts);
91
+
92
+ // Create batch with embeddings
93
+ const batchWithEmbeddings = batchChunks.map((chunk, idx) => ({
94
+ ...chunk,
95
+ vector: batchEmbeddings[idx]
96
+ }));
97
+
98
+ // Upsert immediately to free memory
99
+ await upsertChunks(batchWithEmbeddings);
100
+ processedCount += batchWithEmbeddings.length;
101
+ }
102
+ } catch (embeddingError) {
103
+ console.warn(`Warning: Embedding generation failed (${embeddingError.message}). Using text-only search.\n`);
104
+ embeddingsAvailable = false;
95
105
  }
96
106
 
97
107
  console.log('Index created\n');
98
108
 
99
- // Execute search
100
- const results = await executeSearch(query);
109
+ // Execute search with chunks for hybrid search (text-only if embeddings failed)
110
+ const results = await executeSearch(query, 10, chunks);
101
111
 
102
112
  // Format and display results
103
113
  const output = formatResults(results);
package/src/embeddings.js CHANGED
@@ -13,6 +13,7 @@ try {
13
13
 
14
14
  let modelCache = null;
15
15
  let cacheCleared = false;
16
+ let modelLoadTime = 0;
16
17
 
17
18
  function clearModelCache() {
18
19
  const cacheDirs = [
@@ -37,6 +38,7 @@ async function getModel(retryOnError = true) {
37
38
  return modelCache;
38
39
  }
39
40
 
41
+ const modelStart = performance.now();
40
42
  console.error('Loading embeddings model (this may take a moment on first run)...');
41
43
 
42
44
  const modelLoadPromise = pipeline(
@@ -50,6 +52,7 @@ async function getModel(retryOnError = true) {
50
52
 
51
53
  try {
52
54
  modelCache = await Promise.race([modelLoadPromise, timeoutPromise]);
55
+ modelLoadTime = performance.now() - modelStart;
53
56
  } catch (e) {
54
57
  if (retryOnError && !cacheCleared && (e.message.includes('Protobuf') || e.message.includes('parsing'))) {
55
58
  console.error('Detected corrupted cache, clearing and retrying...');
@@ -65,37 +68,8 @@ async function getModel(retryOnError = true) {
65
68
  return modelCache;
66
69
  }
67
70
 
68
- async function meanPooling(modelOutput, attentionMask) {
69
- // Get token embeddings from model output
70
- const tokenEmbeddings = modelOutput.data;
71
- const embeddingDim = modelOutput.dims[modelOutput.dims.length - 1];
72
- const batchSize = modelOutput.dims[0];
73
- const seqLength = modelOutput.dims[1];
74
-
75
- const pooled = [];
76
-
77
- for (let b = 0; b < batchSize; b++) {
78
- let sum = new Array(embeddingDim).fill(0);
79
- let count = 0;
80
-
81
- for (let s = 0; s < seqLength; s++) {
82
- const tokenIdx = b * seqLength + s;
83
- const maskValue = attentionMask[tokenIdx] || 1;
84
-
85
- if (maskValue > 0) {
86
- const tokenStart = tokenIdx * embeddingDim;
87
- for (let d = 0; d < embeddingDim; d++) {
88
- sum[d] += tokenEmbeddings[tokenStart + d] * maskValue;
89
- }
90
- count += maskValue;
91
- }
92
- }
93
-
94
- const normalized = sum.map(v => v / Math.max(count, 1e-9));
95
- pooled.push(normalized);
96
- }
97
-
98
- return pooled;
71
+ export function getModelLoadTime() {
72
+ return modelLoadTime;
99
73
  }
100
74
 
101
75
  export async function generateEmbeddings(texts) {
@@ -105,11 +79,16 @@ export async function generateEmbeddings(texts) {
105
79
  texts = [texts];
106
80
  }
107
81
 
108
- // Generate embeddings for all texts
109
- const embeddings = await model(texts, {
110
- pooling: 'mean',
111
- normalize: true
112
- });
82
+ // Generate embeddings for all texts with timeout per batch
83
+ const embeddings = await Promise.race([
84
+ model(texts, {
85
+ pooling: 'mean',
86
+ normalize: true
87
+ }),
88
+ new Promise((_, reject) =>
89
+ setTimeout(() => reject(new Error('Embedding generation timeout')), 60000)
90
+ )
91
+ ]);
113
92
 
114
93
  // Convert to regular arrays
115
94
  const result = [];
package/src/scanner.js CHANGED
@@ -65,7 +65,7 @@ function walkDirectory(dirPath, ignorePatterns, relativePath = '') {
65
65
  return files;
66
66
  }
67
67
 
68
- function chunkContent(content, chunkSize = 1000, overlapSize = 100) {
68
+ function chunkContent(content, chunkSize = 60, overlapSize = 15) {
69
69
  const lines = content.split('\n');
70
70
  const chunks = [];
71
71
 
@@ -81,7 +81,6 @@ function chunkContent(content, chunkSize = 1000, overlapSize = 100) {
81
81
  });
82
82
  }
83
83
 
84
- // Stop if we've reached the end
85
84
  if (endIdx === lines.length) {
86
85
  break;
87
86
  }
@@ -100,7 +99,7 @@ export function scanRepository(rootPath, ignorePatterns) {
100
99
  const mtime = file.mtime;
101
100
 
102
101
  // For small files, treat as single chunk
103
- if (content.split('\n').length <= 1000) {
102
+ if (content.split('\n').length <= 60) {
104
103
  chunks.push({
105
104
  file_path: file.relativePath,
106
105
  chunk_index: 0,
@@ -1,6 +1,6 @@
1
1
  import { parentPort } from 'worker_threads';
2
2
  import { resolve, relative } from 'path';
3
- import { existsSync, readFileSync } from 'fs';
3
+ import { existsSync, readFileSync, readdirSync, statSync } from 'fs';
4
4
  import { loadIgnorePatterns } from './ignore-parser.js';
5
5
  import { scanRepository } from './scanner.js';
6
6
  import { buildTextIndex, searchText } from './text-search.js';
@@ -31,12 +31,28 @@ function getFileTotalLines(absoluteFilePath) {
31
31
 
32
32
  let indexCache = new Map();
33
33
 
34
+ function getWorkspaceFolders(workspacePath) {
35
+ try {
36
+ const entries = readdirSync(workspacePath, { withFileTypes: true });
37
+ return entries
38
+ .filter(e => e.isDirectory() && !e.name.startsWith('.'))
39
+ .map(e => resolve(workspacePath, e.name));
40
+ } catch {
41
+ return [];
42
+ }
43
+ }
44
+
34
45
  async function initializeIndex(repositoryPath) {
35
46
  const absolutePath = resolve(repositoryPath);
36
- const cacheKey = absolutePath;
37
47
 
38
- if (indexCache.has(cacheKey)) {
39
- return indexCache.get(cacheKey);
48
+ const cached = indexCache.get(absolutePath);
49
+ if (cached) {
50
+ try {
51
+ const dirStat = statSync(absolutePath);
52
+ if (dirStat.mtimeMs <= cached.indexedAt) return cached;
53
+ } catch {
54
+ return cached;
55
+ }
40
56
  }
41
57
 
42
58
  try {
@@ -48,8 +64,8 @@ async function initializeIndex(repositoryPath) {
48
64
  }
49
65
 
50
66
  const indexData = buildTextIndex(chunks);
51
- const result = { chunks, indexData };
52
- indexCache.set(cacheKey, result);
67
+ const result = { chunks, indexData, indexedAt: Date.now() };
68
+ indexCache.set(absolutePath, result);
53
69
 
54
70
  return result;
55
71
  } catch (error) {
@@ -99,6 +115,53 @@ async function performSearch(repositoryPath, query) {
99
115
  }
100
116
  }
101
117
 
118
+ async function performSearchAll(workspacePaths, query, limit = 10) {
119
+ const allResults = [];
120
+
121
+ for (const repoPath of workspacePaths) {
122
+ const absolutePath = resolve(repoPath);
123
+ if (!existsSync(absolutePath)) continue;
124
+
125
+ const indexData = await initializeIndex(absolutePath);
126
+ if (indexData.error || !indexData.chunks) continue;
127
+
128
+ const results = searchText(query, indexData.chunks, indexData.indexData);
129
+ const repoName = absolutePath.split('/').pop();
130
+
131
+ const seenFiles = new Set();
132
+ for (const r of results) {
133
+ if (!seenFiles.has(r.file_path)) {
134
+ seenFiles.add(r.file_path);
135
+ allResults.push({ ...r, repoName, repoPath: absolutePath });
136
+ }
137
+ if (seenFiles.size >= limit) break;
138
+ }
139
+ }
140
+
141
+ allResults.sort((a, b) => b.score - a.score);
142
+ const top = allResults.slice(0, limit);
143
+
144
+ return {
145
+ query,
146
+ resultsCount: top.length,
147
+ results: top.map((r, idx) => {
148
+ const absoluteFilePath = resolve(r.repoPath, r.file_path);
149
+ const totalLines = getFileTotalLines(absoluteFilePath);
150
+ const enclosingContext = findEnclosingContext(r.content, r.line_start);
151
+ return {
152
+ rank: idx + 1,
153
+ absolutePath: absoluteFilePath,
154
+ relativePath: `${r.repoName}/${r.file_path}`,
155
+ lines: `${r.line_start}-${r.line_end}`,
156
+ totalLines,
157
+ enclosingContext,
158
+ score: (r.score * 100).toFixed(1),
159
+ snippet: r.content.split('\n').slice(0, 30).join('\n'),
160
+ };
161
+ }),
162
+ };
163
+ }
164
+
102
165
  if (parentPort) {
103
166
  parentPort.on('message', async (msg) => {
104
167
  try {
@@ -107,6 +170,26 @@ if (parentPort) {
107
170
  return;
108
171
  }
109
172
 
173
+ if (msg.type === 'index-all') {
174
+ const folders = msg.workspacePaths || getWorkspaceFolders(msg.workspacePath || '');
175
+ let indexed = 0;
176
+ for (const folder of folders) {
177
+ if (existsSync(folder)) {
178
+ await initializeIndex(folder);
179
+ indexed++;
180
+ }
181
+ }
182
+ parentPort.postMessage({ id: msg.id, result: { indexed, message: `Indexed ${indexed} repositories` } });
183
+ return;
184
+ }
185
+
186
+ if (msg.type === 'search-all') {
187
+ const folders = msg.workspacePaths || getWorkspaceFolders(msg.workspacePath || '');
188
+ const result = await performSearchAll(folders, msg.query, msg.limit || 10);
189
+ parentPort.postMessage({ id: msg.id, result });
190
+ return;
191
+ }
192
+
110
193
  if (msg.type === 'search') {
111
194
  const result = await performSearch(msg.repositoryPath || process.cwd(), msg.query);
112
195
  parentPort.postMessage({ id: msg.id, result });
package/src/search.js CHANGED
@@ -1,20 +1,84 @@
1
1
  import { generateSingleEmbedding } from './embeddings.js';
2
2
  import { searchSimilar } from './store.js';
3
+ import { buildTextIndex, searchText } from './text-search.js';
3
4
 
4
- export async function executeSearch(query, limit = 10) {
5
+ export async function executeSearch(query, limit = 10, allChunks = null, skipVector = false) {
5
6
  if (!query || query.trim().length === 0) {
6
7
  throw new Error('Query cannot be empty');
7
8
  }
8
9
 
9
10
  console.error(`Searching for: "${query}"`);
10
11
 
11
- // Generate embedding for query
12
- const queryEmbedding = await generateSingleEmbedding(query);
12
+ try {
13
+ let vectorResults = [];
14
+ let textResults = [];
13
15
 
14
- // Search vector store
15
- const results = await searchSimilar(queryEmbedding, limit);
16
+ if (allChunks && allChunks.length > 0) {
17
+ const textIndexData = buildTextIndex(allChunks);
18
+ textResults = searchText(query, allChunks, textIndexData);
19
+ }
20
+
21
+ const hasGoodTextResults = textResults.length > 0 && textResults[0].score > 0.3;
22
+ if (!skipVector && !hasGoodTextResults) {
23
+ try {
24
+ const queryEmbedding = await generateSingleEmbedding(query);
25
+ vectorResults = await searchSimilar(queryEmbedding, limit * 2);
26
+ } catch (e) {
27
+ console.warn(`Vector search unavailable: ${e.message}`);
28
+ }
29
+ }
30
+
31
+ if (vectorResults.length > 0 && textResults.length > 0) {
32
+ return mergeSearchResults(vectorResults, textResults.slice(0, limit * 2), limit);
33
+ }
34
+
35
+ const allResults = vectorResults.length > 0 ? vectorResults : textResults;
36
+ return allResults.slice(0, limit);
37
+ } catch (error) {
38
+ console.error('Search error:', error.message);
39
+ if (allChunks && allChunks.length > 0) {
40
+ const textIndexData = buildTextIndex(allChunks);
41
+ const textResults = searchText(query, allChunks, textIndexData);
42
+ return textResults.slice(0, limit);
43
+ }
44
+ throw error;
45
+ }
46
+ }
47
+
48
+ function mergeSearchResults(vectorResults, textResults, limit) {
49
+ const merged = new Map();
50
+
51
+ vectorResults.forEach((result) => {
52
+ const key = `${result.file_path}:${result.chunk_index}`;
53
+ merged.set(key, {
54
+ ...result,
55
+ vectorScore: result.score || 0,
56
+ textScore: 0,
57
+ finalScore: (result.score || 0) * 0.8
58
+ });
59
+ });
60
+
61
+ textResults.forEach((result) => {
62
+ const key = `${result.file_path}:${result.chunk_index || 0}`;
63
+ if (merged.has(key)) {
64
+ const existing = merged.get(key);
65
+ existing.textScore = result.score || 0;
66
+ existing.finalScore = (existing.vectorScore * 0.8) + (result.score * 0.2);
67
+ } else {
68
+ const textScore = result.score || 0;
69
+ const finalScore = Math.max(textScore * 0.2, textScore > 0.7 ? 0.4 : 0);
70
+ merged.set(key, {
71
+ ...result,
72
+ vectorScore: 0,
73
+ textScore,
74
+ finalScore
75
+ });
76
+ }
77
+ });
16
78
 
17
- return results;
79
+ return Array.from(merged.values())
80
+ .sort((a, b) => b.finalScore - a.finalScore)
81
+ .slice(0, limit);
18
82
  }
19
83
 
20
84
  export function formatResults(results) {
@@ -27,15 +91,14 @@ export function formatResults(results) {
27
91
 
28
92
  for (let i = 0; i < results.length; i++) {
29
93
  const result = results[i];
30
- const match = i + 1;
94
+ const scoreValue = result.finalScore !== undefined ? result.finalScore : (result.score || 0);
95
+ const scorePercent = (scoreValue * 100).toFixed(1);
31
96
 
32
- lines.push(`${match}. ${result.file_path}:${result.line_start}-${result.line_end} (score: ${(result.score * 100).toFixed(1)}%)`);
97
+ lines.push(`${i + 1}. ${result.file_path}:${result.line_start}-${result.line_end} (score: ${scorePercent}%)`);
33
98
 
34
- // Show code snippet (first 3 lines)
35
99
  const codeLines = result.content.split('\n').slice(0, 3);
36
100
  for (const line of codeLines) {
37
- const trimmed = line.slice(0, 80); // Limit line length
38
- lines.push(` > ${trimmed}`);
101
+ lines.push(` > ${line.slice(0, 80)}`);
39
102
  }
40
103
 
41
104
  lines.push('');
package/src/store.js CHANGED
@@ -5,6 +5,7 @@ import { mkdirSync, existsSync } from 'fs';
5
5
  let dbConnection = null;
6
6
  let tableRef = null;
7
7
  let isFirstBatch = true;
8
+ let vectorSearchCache = new Map();
8
9
 
9
10
  export async function initStore(dbPath) {
10
11
  // Ensure directory exists
@@ -121,12 +122,19 @@ export async function searchSimilar(queryEmbedding, limit = 10) {
121
122
  // Ensure vector is a proper array/tensor
122
123
  const query = Array.isArray(queryEmbedding) ? queryEmbedding : Array.from(queryEmbedding);
123
124
 
125
+ // Check cache using 20-dimension hash for near-zero collision rate
126
+ const cacheKey = query.slice(0, 20).join(',');
127
+ const cached = vectorSearchCache.get(cacheKey);
128
+ if (cached) {
129
+ return cached.slice(0, limit);
130
+ }
131
+
124
132
  const results = await tableRef
125
133
  .search(query)
126
134
  .limit(limit)
127
135
  .execute();
128
136
 
129
- return results.map(result => {
137
+ const formattedResults = results.map(result => {
130
138
  const distance = result._distance !== undefined ? result._distance : (result.distance || 0);
131
139
  const score = distance !== null && distance !== undefined ? 1 / (1 + distance) : 0;
132
140
  return {
@@ -139,6 +147,15 @@ export async function searchSimilar(queryEmbedding, limit = 10) {
139
147
  score: score
140
148
  };
141
149
  });
150
+
151
+ // Cache results (keep max 100 cached searches)
152
+ if (vectorSearchCache.size > 100) {
153
+ const firstKey = vectorSearchCache.keys().next().value;
154
+ vectorSearchCache.delete(firstKey);
155
+ }
156
+ vectorSearchCache.set(cacheKey, formattedResults);
157
+
158
+ return formattedResults;
142
159
  } catch (e) {
143
160
  console.error('Search failed:', e.message);
144
161
  return [];
package/src/text-search.js CHANGED
@@ -52,12 +52,14 @@ export function searchText(query, chunks, indexData) {
52
52
  const meta = chunkMetadata[idx];
53
53
  let score = 0;
54
54
 
55
- queryTokens.forEach(token => {
56
- if (index.has(token) && index.get(token).has(idx)) {
57
- const freq = meta.frequency.get(token) || 1;
58
- const lengthBoost = token.length > 4 ? 1.5 : 1;
59
- score += lengthBoost * Math.min(freq, 5);
60
- }
55
+ // Exact phrase match - highest priority (saves embedding cost)
56
+ if (chunk.content.toLowerCase().includes(query.toLowerCase())) {
57
+ score += 30;
58
+ }
59
+
60
+ // Symbol match in content - function/class named after query terms
61
+ querySymbols.forEach(symbol => {
62
+ if (meta.symbols.includes(symbol)) score += 10;
61
63
  });
62
64
 
63
65
  // Filename token match - strong signal that this file is about the query topic
@@ -66,32 +68,32 @@ export function searchText(query, chunks, indexData) {
66
68
  if (meta.fileNameTokens.includes(token)) fileNameMatches++;
67
69
  });
68
70
  if (fileNameMatches > 0) {
69
- score += fileNameMatches * 8;
71
+ score += fileNameMatches * 10;
70
72
  }
71
73
 
72
- // Symbol match in content - function/class named after query terms
73
- querySymbols.forEach(symbol => {
74
- if (meta.symbols.includes(symbol)) score += 5;
74
+ // Token frequency scoring
75
+ queryTokens.forEach(token => {
76
+ if (index.has(token) && index.get(token).has(idx)) {
77
+ const freq = meta.frequency.get(token) || 1;
78
+ const lengthBoost = token.length > 4 ? 1.5 : 1;
79
+ score += lengthBoost * Math.min(freq, 5);
80
+ }
75
81
  });
76
82
 
77
- // Exact phrase match
78
- if (chunk.content.toLowerCase().includes(query.toLowerCase())) {
79
- score += 15;
80
- }
81
-
82
83
  // Code file boost
83
84
  if (meta.isCode) score *= 1.2;
84
85
 
85
86
  if (score > 0) chunkScores.set(idx, score);
86
87
  }
87
88
 
88
- const results = Array.from(chunkScores.entries())
89
- .map(([idx, score]) => ({
90
- ...chunks[idx],
91
- score: Math.min(score / 100, 1),
92
- _rawScore: score,
93
- }))
94
- .sort((a, b) => b._rawScore - a._rawScore);
89
+ const entries = Array.from(chunkScores.entries()).sort((a, b) => b[1] - a[1]);
90
+ const maxScore = entries.length > 0 ? entries[0][1] : 1;
91
+
92
+ const results = entries.map(([idx, score]) => ({
93
+ ...chunks[idx],
94
+ score: score / maxScore,
95
+ _rawScore: score,
96
+ }));
95
97
 
96
98
  return results;
97
99
  }