codebasesearch 0.1.20 → 0.1.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/ignore-parser.js +53 -170
- package/src/scanner.js +7 -6
- package/src/text-search.js +53 -30
- package/.prd +0 -58
package/package.json
CHANGED
package/src/ignore-parser.js
CHANGED
|
@@ -106,109 +106,17 @@ function loadDefaultIgnores() {
|
|
|
106
106
|
|
|
107
107
|
function getHardcodedIgnores() {
|
|
108
108
|
return new Set([
|
|
109
|
-
|
|
110
|
-
'
|
|
111
|
-
'.
|
|
112
|
-
'
|
|
113
|
-
'.azure', '.gcloud', '.vscode', '.idea', '.vs', 'bin', 'obj',
|
|
114
|
-
'.gradle', '.mvn', 'Pods', 'DerivedData', '.bundle',
|
|
115
|
-
'.yarn', '.pnp', 'pnpm-lock.yaml', '.pnpm-store',
|
|
116
|
-
'.tox', '.eggs', '*.egg-info', '.venv', 'venv', 'env',
|
|
117
|
-
'.tsc', '.eslintcache', '.stylelintcache', '.parcel-cache',
|
|
118
|
-
'temp', 'tmp', '.tmp', '.DS_Store', 'Thumbs.db',
|
|
119
|
-
'.swp', '.swo', '*.swp', '*.swo', '.tern-port',
|
|
120
|
-
'dist-server', 'out-tsc', '.cache', '.parcel-cache',
|
|
121
|
-
'typings', '.env', '.env.local', '.env.*.local',
|
|
122
|
-
// JSON files - PRIMARY PRIORITY for memory reduction
|
|
123
|
-
'*.json', 'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml',
|
|
124
|
-
'Gemfile.lock', 'poetry.lock', 'Pipfile.lock',
|
|
125
|
-
// Lock files
|
|
126
|
-
'*.lock',
|
|
127
|
-
// Build outputs
|
|
128
|
-
'public', 'static', 'site', '_site', '.docusaurus', '.gatsby',
|
|
129
|
-
// Cache/dependency directories
|
|
130
|
-
'.rush', '.lerna', '.nx',
|
|
131
|
-
// IDE/editor configs
|
|
132
|
-
'.cursor', '.replit', '.sublime-project', '.sublime-workspace',
|
|
133
|
-
'*.iml', '.project', '.classpath', '.settings', '*.sublime-*',
|
|
109
|
+
// Lock files / package manager artifacts
|
|
110
|
+
'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml',
|
|
111
|
+
'Gemfile.lock', 'poetry.lock', 'Pipfile.lock', 'Cargo.lock',
|
|
112
|
+
'composer.lock', 'go.sum',
|
|
134
113
|
// OS files
|
|
135
|
-
'.
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
'*.backup', '*.data', '*.orig',
|
|
140
|
-
// Logs and temp
|
|
141
|
-
'*.log', 'logs', 'npm-debug.log', 'yarn-error.log',
|
|
142
|
-
// Test coverage and reports
|
|
143
|
-
'lcov.info', '.coverage', 'test-results',
|
|
144
|
-
// Database related
|
|
145
|
-
'storage', 'fixtures',
|
|
146
|
-
// LLM/Vector related
|
|
147
|
-
'.llamaindex', '.chroma', '.vectorstore', '.embeddings',
|
|
148
|
-
'.langchain', '.autogen', '.semantic-kernel', '.openai-cache',
|
|
149
|
-
'.anthropic-cache', 'embeddings', 'vector-db', 'faiss-index',
|
|
150
|
-
'chromadb', 'pinecone-cache', 'weaviate-data',
|
|
151
|
-
// Compiled output
|
|
114
|
+
'.DS_Store', 'Thumbs.db', 'desktop.ini',
|
|
115
|
+
// Editor swap files
|
|
116
|
+
'.tern-port',
|
|
117
|
+
// Compiled binary artifacts (files, not dirs)
|
|
152
118
|
'*.min.js', '*.min.css', '*.bundle.js', '*.chunk.js', '*.map',
|
|
153
|
-
|
|
154
|
-
'.assets', 'out-tsc', 'cmake_build_debug', 'cmake_build_release',
|
|
155
|
-
// Version managers
|
|
156
|
-
'.rbenv', '.nvm', '.nvmrc',
|
|
157
|
-
// Ruby specific
|
|
158
|
-
'*.gem', '*.rbc', '/pkg', '/spec/reports', '/spec/examples.txt',
|
|
159
|
-
'/test/tmp', '/test/version_tmp', 'lib/bundler/man', '.ruby-version',
|
|
160
|
-
// Go specific
|
|
161
|
-
'go.work',
|
|
162
|
-
// Rust specific
|
|
163
|
-
'Cargo.lock', '**/*.rs.bk', '*.pdb',
|
|
164
|
-
// Java specific
|
|
165
|
-
'*.class', '*.jar', '*.war', '*.ear', '*.nar', '*.nupkg', '*.snupkg',
|
|
166
|
-
// C# specific
|
|
167
|
-
'*.suo', '*.user', '*.userosscache', '*.sln.docstates',
|
|
168
|
-
'project.lock.json', 'project.fragment.lock.json', 'artifacts',
|
|
169
|
-
// C/C++ specific
|
|
170
|
-
'*.o', '*.a', '*.so', '*.exe', '*.obj', '*.dll', '*.dylib',
|
|
171
|
-
'CMakeFiles', 'CMakeCache.txt', '*.cmake',
|
|
172
|
-
// Swift/Xcode specific
|
|
173
|
-
'*.xcodeproj', '*.xcworkspace', '*.moved-aside', '*.pbxuser',
|
|
174
|
-
'*.mode1v3', '*.mode2v3', '*.perspectivev3',
|
|
175
|
-
// Scala/SBT specific
|
|
176
|
-
'lib_managed', 'src_managed', 'project/boot', 'project/plugins/project',
|
|
177
|
-
'.history', '.lib',
|
|
178
|
-
// PHP specific
|
|
179
|
-
'composer.lock', '*.phar',
|
|
180
|
-
// Docker
|
|
181
|
-
'.dockerignore', 'docker-compose.override.yml', '.docker',
|
|
182
|
-
// Documentation build
|
|
183
|
-
'docs/_build', '.vuepress',
|
|
184
|
-
// Testing frameworks
|
|
185
|
-
'jest.config', 'vitest.config', 'pytest.ini', 'tox.ini',
|
|
186
|
-
'__tests__', '__mocks__', 'spec', 'cypress', 'playwright',
|
|
187
|
-
// Monorepo workspace patterns (implicit through directory coverage)
|
|
188
|
-
'.turbo', '.nx',
|
|
189
|
-
// Python package patterns
|
|
190
|
-
'*.py[cod]', '*$py.class', '.Python', 'pip-log.txt',
|
|
191
|
-
'pip-delete-this-directory.txt', '.hypothesis', '.pyre', '.pytype',
|
|
192
|
-
'*.whl',
|
|
193
|
-
// Config/metadata that are typically low-value
|
|
194
|
-
'*.config.js', '*.config.ts', 'webpack.config.js', 'rollup.config.js',
|
|
195
|
-
'vite.config.js', 'tsconfig.json', 'jsconfig.json', 'babel.config',
|
|
196
|
-
'.babelrc', '.eslintrc', '.prettierrc', '.stylelintrc', '.editorconfig',
|
|
197
|
-
'*.local', '*.development', '*.production',
|
|
198
|
-
// Node specific
|
|
199
|
-
'.npm', '.node_repl_history', '*.tsbuildinfo', 'yarn-error.log',
|
|
200
|
-
// Documentation/reference files that don't help with search
|
|
201
|
-
'*.md', '*.txt', '*.rst', '*.adoc', 'docs', 'documentation', 'wiki',
|
|
202
|
-
'CHANGELOG', 'HISTORY', 'NEWS', 'UPGRADING', 'FAQ', 'CONTRIBUTING',
|
|
203
|
-
'SECURITY', 'LICENSE', 'LICENCE', 'COPYRIGHT', 'NOTICE', 'AUTHORS',
|
|
204
|
-
'THIRDPARTY',
|
|
205
|
-
// Test and coverage files
|
|
206
|
-
'*.test', '*.spec', 'test', 'tests', 'htmlcov',
|
|
207
|
-
// Profiling
|
|
208
|
-
'*.prof', '*.cpuprofile', '*.heapprofile',
|
|
209
|
-
// Misc
|
|
210
|
-
'.tern-port', 'firebase-debug.log', 'firestore-debug.log',
|
|
211
|
-
'ui-debug.log', '.firebaserc', '.stackdump'
|
|
119
|
+
'*.tsbuildinfo',
|
|
212
120
|
]);
|
|
213
121
|
}
|
|
214
122
|
|
|
@@ -283,76 +191,54 @@ export function loadIgnorePatterns(rootPath) {
|
|
|
283
191
|
return merged;
|
|
284
192
|
}
|
|
285
193
|
|
|
286
|
-
// Directories to always ignore
|
|
194
|
+
// Directories to always ignore - only clear non-source directories
|
|
287
195
|
const IGNORED_DIRECTORIES = new Set([
|
|
288
|
-
// Dependencies
|
|
196
|
+
// Dependencies
|
|
289
197
|
'node_modules', 'bower_components', 'jspm_packages', 'web_modules',
|
|
290
198
|
// Version control
|
|
291
|
-
'.git', '.svn', '.hg', '.bzr',
|
|
292
|
-
//
|
|
293
|
-
'
|
|
294
|
-
|
|
295
|
-
'
|
|
296
|
-
'
|
|
297
|
-
'
|
|
298
|
-
'
|
|
299
|
-
'
|
|
300
|
-
'
|
|
301
|
-
'
|
|
199
|
+
'.git', '.svn', '.hg', '.bzr',
|
|
200
|
+
// IDE
|
|
201
|
+
'.vscode', '.idea', '.vs', '.atom',
|
|
202
|
+
// Build outputs (unambiguous names only)
|
|
203
|
+
'dist', 'dist-server', 'dist-ssr', 'dist-client',
|
|
204
|
+
'build', 'built',
|
|
205
|
+
'out', 'out-tsc',
|
|
206
|
+
'target',
|
|
207
|
+
'storybook-static', '.docusaurus', '.gatsby', '.vuepress',
|
|
208
|
+
'.nuxt', '.next',
|
|
209
|
+
'.tsc',
|
|
302
210
|
// Cache directories
|
|
303
|
-
'.cache', '
|
|
211
|
+
'.cache', '.parcel-cache', '.vite', '.turbo',
|
|
304
212
|
'.npm', '.yarn', '.pnp', '.pnpm-store', '.rush', '.lerna', '.nx',
|
|
305
213
|
// Testing
|
|
306
214
|
'coverage', '.nyc_output', '.coverage', 'htmlcov', 'test-results',
|
|
307
|
-
'test', 'tests', 'Test', 'Tests', 'TEST', 'TESTS',
|
|
308
215
|
'__tests__', '__mocks__', '__snapshots__', '__fixtures__',
|
|
309
|
-
'cypress', 'playwright',
|
|
216
|
+
'cypress', 'playwright',
|
|
310
217
|
'.tox', '.eggs', '.hypothesis', '.pyre', '.pytype',
|
|
311
218
|
// Python
|
|
312
|
-
'__pycache__', '.pytest_cache', '.mypy_cache', '.venv', 'venv',
|
|
313
|
-
'env.bak', 'venv.bak', '.Python', 'pip-wheel-metadata', '*.egg-info',
|
|
219
|
+
'__pycache__', '.pytest_cache', '.mypy_cache', '.venv', 'venv',
|
|
314
220
|
// Java/Gradle/Maven
|
|
315
|
-
'.gradle', '.mvn',
|
|
221
|
+
'.gradle', '.mvn',
|
|
316
222
|
// iOS/Android
|
|
317
|
-
'Pods', 'DerivedData', '
|
|
223
|
+
'Pods', 'DerivedData', '.bundle', 'xcuserdata',
|
|
318
224
|
// Ruby
|
|
319
|
-
'
|
|
320
|
-
// Rust
|
|
321
|
-
'target', 'Cargo.lock',
|
|
322
|
-
// Go
|
|
323
|
-
'vendor', 'Godeps',
|
|
324
|
-
// PHP
|
|
325
|
-
'vendor', 'composer',
|
|
225
|
+
'.bundle', 'pkg',
|
|
326
226
|
// Infrastructure
|
|
327
227
|
'.terraform', '.terragrunt-cache', '.pulumi', '.serverless', '.firebase',
|
|
328
|
-
'.aws', '.azure', '.gcloud', '.vercel', '.netlify',
|
|
329
|
-
// Docker
|
|
330
|
-
'.docker', 'docker', '.dockerignore',
|
|
228
|
+
'.aws', '.azure', '.gcloud', '.vercel', '.netlify',
|
|
331
229
|
// Temp files
|
|
332
|
-
'temp', 'tmp', '.tmp', '.temp',
|
|
333
|
-
//
|
|
334
|
-
'docs', 'doc', 'documentation', 'wiki', 'guides', 'examples', 'demo', 'demos',
|
|
335
|
-
'CHANGELOG', 'HISTORY', 'NEWS', 'LICENSE', 'LICENCE', 'COPYING', 'AUTHORS',
|
|
336
|
-
// IDE/Editor
|
|
337
|
-
'.vs', '.vscode', '.idea', '.eclipse', '.settings', '.classpath', '.project',
|
|
338
|
-
// Logs
|
|
339
|
-
'logs', 'log', '*.log',
|
|
340
|
-
// Data/Storage
|
|
341
|
-
'storage', 'data', 'database', 'db', 'fixtures', 'seeds',
|
|
342
|
-
'uploads', 'files', 'media', 'resources', 'assets', 'images', 'img',
|
|
343
|
-
// LLM/AI
|
|
230
|
+
'temp', 'tmp', '.tmp', '.temp',
|
|
231
|
+
// LLM/AI artifacts
|
|
344
232
|
'.llamaindex', '.chroma', '.vectorstore', '.embeddings',
|
|
345
233
|
'.langchain', '.autogen', '.semantic-kernel', '.openai-cache',
|
|
346
234
|
'.anthropic-cache', 'embeddings', 'vector-db', 'faiss-index',
|
|
347
235
|
'chromadb', 'pinecone-cache', 'weaviate-data',
|
|
348
|
-
// Package
|
|
349
|
-
'.
|
|
350
|
-
//
|
|
351
|
-
'
|
|
352
|
-
// Misc
|
|
353
|
-
'
|
|
354
|
-
'cmake_build_debug', 'cmake_build_release', 'CMakeFiles', 'CMakeCache.txt',
|
|
355
|
-
'out-tsc', 'dist-server', 'server', 'client', 'browser', 'esm', 'cjs', 'umd', 'lib', 'es'
|
|
236
|
+
// Package manager caches
|
|
237
|
+
'.pnpm', '.bun',
|
|
238
|
+
// Static/built asset directories
|
|
239
|
+
'assets', 'static', 'public', 'wwwroot', 'www',
|
|
240
|
+
// Misc generated
|
|
241
|
+
'cmake_build_debug', 'cmake_build_release', 'CMakeFiles',
|
|
356
242
|
]);
|
|
357
243
|
|
|
358
244
|
export function isCodeFile(filePath) {
|
|
@@ -373,28 +259,34 @@ export function isCodeFile(filePath) {
|
|
|
373
259
|
export function shouldIgnoreDirectory(dirPath) {
|
|
374
260
|
const normalizedPath = dirPath.replace(/\\/g, '/');
|
|
375
261
|
const pathParts = normalizedPath.split('/');
|
|
376
|
-
|
|
377
262
|
for (const part of pathParts) {
|
|
378
263
|
if (IGNORED_DIRECTORIES.has(part)) {
|
|
379
264
|
return true;
|
|
380
265
|
}
|
|
381
266
|
}
|
|
382
|
-
|
|
383
267
|
return false;
|
|
384
268
|
}
|
|
385
269
|
|
|
386
|
-
export function shouldIgnore(filePath, ignorePatterns) {
|
|
270
|
+
export function shouldIgnore(filePath, ignorePatterns, isDirectory = false) {
|
|
387
271
|
const normalizedPath = filePath.replace(/\\/g, '/');
|
|
388
272
|
const pathParts = normalizedPath.split('/');
|
|
389
273
|
const fileName = pathParts[pathParts.length - 1];
|
|
390
|
-
|
|
391
|
-
|
|
274
|
+
|
|
275
|
+
if (isDirectory) {
|
|
276
|
+
if (IGNORED_DIRECTORIES.has(fileName)) return true;
|
|
277
|
+
for (const pattern of ignorePatterns) {
|
|
278
|
+
if (!pattern.includes('/') && fileName === pattern) return true;
|
|
279
|
+
}
|
|
280
|
+
return false;
|
|
281
|
+
}
|
|
282
|
+
|
|
283
|
+
// For files: check all ancestor directories
|
|
392
284
|
for (const part of pathParts.slice(0, -1)) {
|
|
393
285
|
if (IGNORED_DIRECTORIES.has(part)) {
|
|
394
286
|
return true;
|
|
395
287
|
}
|
|
396
288
|
}
|
|
397
|
-
|
|
289
|
+
|
|
398
290
|
// Check if it's a code file using whitelist
|
|
399
291
|
if (!isCodeFile(filePath)) {
|
|
400
292
|
return true;
|
|
@@ -402,22 +294,13 @@ export function shouldIgnore(filePath, ignorePatterns) {
|
|
|
402
294
|
|
|
403
295
|
// Check against additional ignore patterns
|
|
404
296
|
for (const pattern of ignorePatterns) {
|
|
405
|
-
// Handle path patterns (contain /)
|
|
406
297
|
if (pattern.includes('/')) {
|
|
407
|
-
if (normalizedPath.includes(pattern))
|
|
408
|
-
|
|
409
|
-
}
|
|
410
|
-
}
|
|
411
|
-
// Handle exact file name patterns
|
|
412
|
-
else if (fileName === pattern) {
|
|
298
|
+
if (normalizedPath.includes(pattern)) return true;
|
|
299
|
+
} else if (fileName === pattern) {
|
|
413
300
|
return true;
|
|
414
|
-
}
|
|
415
|
-
// Handle directory name patterns (match any path part)
|
|
416
|
-
else {
|
|
301
|
+
} else {
|
|
417
302
|
for (const part of pathParts) {
|
|
418
|
-
if (part === pattern
|
|
419
|
-
return true;
|
|
420
|
-
}
|
|
303
|
+
if (part === pattern) return true;
|
|
421
304
|
}
|
|
422
305
|
}
|
|
423
306
|
}
|
package/src/scanner.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { readdirSync, statSync, readFileSync } from 'fs';
|
|
2
2
|
import { join, relative } from 'path';
|
|
3
|
-
import { shouldIgnore, isCodeFile } from './ignore-parser.js';
|
|
3
|
+
import { shouldIgnore, shouldIgnoreDirectory, isCodeFile } from './ignore-parser.js';
|
|
4
4
|
|
|
5
5
|
function getFileExtension(filePath) {
|
|
6
6
|
const lastDot = filePath.lastIndexOf('.');
|
|
@@ -33,14 +33,15 @@ function walkDirectory(dirPath, ignorePatterns, relativePath = '') {
|
|
|
33
33
|
// Normalize to forward slashes for consistent ignore pattern matching
|
|
34
34
|
const normalizedRelPath = relPath.replace(/\\/g, '/');
|
|
35
35
|
|
|
36
|
-
// Check if should ignore
|
|
37
|
-
if (shouldIgnore(normalizedRelPath, ignorePatterns)) {
|
|
38
|
-
continue;
|
|
39
|
-
}
|
|
40
|
-
|
|
41
36
|
if (entry.isDirectory()) {
|
|
37
|
+
if (shouldIgnoreDirectory(normalizedRelPath) || shouldIgnore(normalizedRelPath, ignorePatterns, true)) {
|
|
38
|
+
continue;
|
|
39
|
+
}
|
|
42
40
|
files.push(...walkDirectory(fullPath, ignorePatterns, relPath));
|
|
43
41
|
} else if (entry.isFile()) {
|
|
42
|
+
if (shouldIgnore(normalizedRelPath, ignorePatterns, false)) {
|
|
43
|
+
continue;
|
|
44
|
+
}
|
|
44
45
|
if (isCodeFile(normalizedRelPath) && !isBinaryFile(entry.name)) {
|
|
45
46
|
try {
|
|
46
47
|
const stat = entry.isSymbolicLink ? null : statSync(fullPath);
|
package/src/text-search.js
CHANGED
|
@@ -4,19 +4,21 @@ export function buildTextIndex(chunks) {
|
|
|
4
4
|
|
|
5
5
|
chunks.forEach((chunk, idx) => {
|
|
6
6
|
const tokens = tokenize(chunk.content);
|
|
7
|
+
const fileNameTokens = tokenize(chunk.file_path);
|
|
7
8
|
const symbols = extractSymbols(chunk.content);
|
|
8
9
|
const frequency = new Map();
|
|
9
10
|
|
|
10
11
|
tokens.forEach(token => {
|
|
11
12
|
frequency.set(token, (frequency.get(token) || 0) + 1);
|
|
12
13
|
if (!index.has(token)) {
|
|
13
|
-
index.set(token,
|
|
14
|
+
index.set(token, new Set());
|
|
14
15
|
}
|
|
15
|
-
index.get(token).
|
|
16
|
+
index.get(token).add(idx);
|
|
16
17
|
});
|
|
17
18
|
|
|
18
19
|
chunkMetadata[idx] = {
|
|
19
20
|
tokens,
|
|
21
|
+
fileNameTokens,
|
|
20
22
|
symbols,
|
|
21
23
|
frequency,
|
|
22
24
|
isCode: isCodeFile(chunk.file_path),
|
|
@@ -32,38 +34,56 @@ export function searchText(query, chunks, indexData) {
|
|
|
32
34
|
const querySymbols = extractSymbols(query);
|
|
33
35
|
const chunkScores = new Map();
|
|
34
36
|
|
|
35
|
-
|
|
37
|
+
// Use index to find candidate chunks efficiently
|
|
38
|
+
const candidates = new Set();
|
|
39
|
+
queryTokens.forEach(token => {
|
|
40
|
+
if (index.has(token)) {
|
|
41
|
+
for (const idx of index.get(token)) candidates.add(idx);
|
|
42
|
+
}
|
|
43
|
+
});
|
|
44
|
+
querySymbols.forEach(sym => {
|
|
45
|
+
if (index.has(sym)) {
|
|
46
|
+
for (const idx of index.get(sym)) candidates.add(idx);
|
|
47
|
+
}
|
|
48
|
+
});
|
|
49
|
+
|
|
50
|
+
for (const idx of candidates) {
|
|
51
|
+
const chunk = chunks[idx];
|
|
52
|
+
const meta = chunkMetadata[idx];
|
|
36
53
|
let score = 0;
|
|
37
54
|
|
|
38
55
|
queryTokens.forEach(token => {
|
|
39
|
-
if (index.has(token)) {
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
score += boost * freq;
|
|
44
|
-
}
|
|
56
|
+
if (index.has(token) && index.get(token).has(idx)) {
|
|
57
|
+
const freq = meta.frequency.get(token) || 1;
|
|
58
|
+
const lengthBoost = token.length > 4 ? 1.5 : 1;
|
|
59
|
+
score += lengthBoost * Math.min(freq, 5);
|
|
45
60
|
}
|
|
46
61
|
});
|
|
47
62
|
|
|
63
|
+
// Filename token match - strong signal that this file is about the query topic
|
|
64
|
+
let fileNameMatches = 0;
|
|
65
|
+
queryTokens.forEach(token => {
|
|
66
|
+
if (meta.fileNameTokens.includes(token)) fileNameMatches++;
|
|
67
|
+
});
|
|
68
|
+
if (fileNameMatches > 0) {
|
|
69
|
+
score += fileNameMatches * 8;
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
// Symbol match in content - function/class named after query terms
|
|
48
73
|
querySymbols.forEach(symbol => {
|
|
49
|
-
if (
|
|
50
|
-
score += 5;
|
|
51
|
-
}
|
|
74
|
+
if (meta.symbols.includes(symbol)) score += 5;
|
|
52
75
|
});
|
|
53
76
|
|
|
54
|
-
|
|
55
|
-
if (
|
|
56
|
-
score +=
|
|
77
|
+
// Exact phrase match
|
|
78
|
+
if (chunk.content.toLowerCase().includes(query.toLowerCase())) {
|
|
79
|
+
score += 15;
|
|
57
80
|
}
|
|
58
81
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
}
|
|
82
|
+
// Code file boost
|
|
83
|
+
if (meta.isCode) score *= 1.2;
|
|
62
84
|
|
|
63
|
-
if (score > 0)
|
|
64
|
-
|
|
65
|
-
}
|
|
66
|
-
});
|
|
85
|
+
if (score > 0) chunkScores.set(idx, score);
|
|
86
|
+
}
|
|
67
87
|
|
|
68
88
|
const results = Array.from(chunkScores.entries())
|
|
69
89
|
.map(([idx, score]) => ({
|
|
@@ -71,7 +91,6 @@ export function searchText(query, chunks, indexData) {
|
|
|
71
91
|
score: Math.min(score / 100, 1),
|
|
72
92
|
_rawScore: score,
|
|
73
93
|
}))
|
|
74
|
-
.filter(r => r.score > 0)
|
|
75
94
|
.sort((a, b) => b._rawScore - a._rawScore);
|
|
76
95
|
|
|
77
96
|
return results;
|
|
@@ -80,20 +99,24 @@ export function searchText(query, chunks, indexData) {
|
|
|
80
99
|
function tokenize(text) {
|
|
81
100
|
const tokens = new Set();
|
|
82
101
|
|
|
83
|
-
text.
|
|
102
|
+
text.split(/\s+/).forEach(word => {
|
|
84
103
|
if (word.length === 0) return;
|
|
85
104
|
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
const camelCaseTokens = word.match(/[a-z]+|[A-Z][a-z]*|[0-9]+/g) || [];
|
|
105
|
+
// camelCase/PascalCase split BEFORE lowercasing so uppercase boundaries are visible
|
|
106
|
+
const camelCaseTokens = word.match(/[A-Z]?[a-z]+|[A-Z]+(?=[A-Z][a-z]|\d|\W|$)|[0-9]+/g) || [];
|
|
89
107
|
camelCaseTokens.forEach(t => {
|
|
90
108
|
if (t.length > 1) tokens.add(t.toLowerCase());
|
|
91
109
|
});
|
|
92
110
|
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
111
|
+
// snake_case and kebab-case split
|
|
112
|
+
word.split(/[-_.]/).forEach(t => {
|
|
113
|
+
const cleaned = t.replace(/[^\w]/g, '').toLowerCase();
|
|
114
|
+
if (cleaned.length > 1) tokens.add(cleaned);
|
|
96
115
|
});
|
|
116
|
+
|
|
117
|
+
// Full word lowercased (stripped of punctuation)
|
|
118
|
+
const cleaned = word.replace(/[^\w]/g, '').toLowerCase();
|
|
119
|
+
if (cleaned.length > 1) tokens.add(cleaned);
|
|
97
120
|
});
|
|
98
121
|
|
|
99
122
|
return Array.from(tokens).filter(t => t.length > 1);
|
package/.prd
DELETED
|
@@ -1,58 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"project": "thorns-mcp",
|
|
3
|
-
"created": "2026-03-06",
|
|
4
|
-
"objective": "Improve search output to be maximally revealing so agents need minimal follow-up exploration",
|
|
5
|
-
"items": [
|
|
6
|
-
{
|
|
7
|
-
"id": "1",
|
|
8
|
-
"subject": "Expand snippet from 3 lines to full chunk content in search-worker.js",
|
|
9
|
-
"status": "pending",
|
|
10
|
-
"description": "In search-worker.js line 63, snippet is truncated to 3 lines. Show full chunk content (up to ~30 lines) so agents see the complete context. Also add totalLines to each result by counting newlines in full file content.",
|
|
11
|
-
"blocking": ["3"],
|
|
12
|
-
"blockedBy": [],
|
|
13
|
-
"effort": "small",
|
|
14
|
-
"category": "feature"
|
|
15
|
-
},
|
|
16
|
-
{
|
|
17
|
-
"id": "2",
|
|
18
|
-
"subject": "Add enclosing function/class context detection to search-worker.js",
|
|
19
|
-
"status": "pending",
|
|
20
|
-
"description": "For each result, detect the nearest enclosing function or class name above the match line. Pass this as 'context' field in the result object. Use regex scan backwards through lines above line_start.",
|
|
21
|
-
"blocking": ["3"],
|
|
22
|
-
"blockedBy": [],
|
|
23
|
-
"effort": "medium",
|
|
24
|
-
"category": "feature"
|
|
25
|
-
},
|
|
26
|
-
{
|
|
27
|
-
"id": "3",
|
|
28
|
-
"subject": "Update mcp.js output formatter to display all new fields",
|
|
29
|
-
"status": "pending",
|
|
30
|
-
"description": "Update the result formatting in mcp.js to show: totalLines alongside path (e.g. 'file.js [142 lines]:5-20'), enclosing context (e.g. 'in: functionName'), and the full snippet. Also show relative path when repository_path is provided.",
|
|
31
|
-
"blocking": ["4"],
|
|
32
|
-
"blockedBy": ["1", "2"],
|
|
33
|
-
"effort": "small",
|
|
34
|
-
"category": "feature"
|
|
35
|
-
},
|
|
36
|
-
{
|
|
37
|
-
"id": "4",
|
|
38
|
-
"subject": "Verify output quality by running CLI on /mnt/c/dev/spawnpoint",
|
|
39
|
-
"status": "pending",
|
|
40
|
-
"description": "Run: node src/cli.js /mnt/c/dev/spawnpoint with a test query and inspect the output quality. Confirm snippets are longer, context is shown, line counts are shown.",
|
|
41
|
-
"blocking": ["5"],
|
|
42
|
-
"blockedBy": ["3"],
|
|
43
|
-
"effort": "small",
|
|
44
|
-
"category": "feature"
|
|
45
|
-
},
|
|
46
|
-
{
|
|
47
|
-
"id": "5",
|
|
48
|
-
"subject": "Bump version to 0.1.19 and git push",
|
|
49
|
-
"status": "pending",
|
|
50
|
-
"description": "Update package.json version from 0.1.18 to 0.1.19, git add -A, commit, push.",
|
|
51
|
-
"blocking": [],
|
|
52
|
-
"blockedBy": ["4"],
|
|
53
|
-
"effort": "small",
|
|
54
|
-
"category": "infra"
|
|
55
|
-
}
|
|
56
|
-
],
|
|
57
|
-
"completed": []
|
|
58
|
-
}
|