codebasesearch 0.1.21 → 0.1.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.prd +78 -0
- package/package.json +1 -1
- package/src/cli.js +28 -18
- package/src/embeddings.js +15 -36
- package/src/ignore-parser.js +53 -170
- package/src/scanner.js +9 -9
- package/src/search.js +74 -11
- package/src/store.js +18 -1
- package/src/text-search.js +64 -39
package/.prd
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
{
|
|
2
|
+
"project": "code-search",
|
|
3
|
+
"created": "2026-03-12",
|
|
4
|
+
"objective": "Profile and improve code-search speed and result quality",
|
|
5
|
+
"items": [
|
|
6
|
+
{
|
|
7
|
+
"id": "fix-dedup-buildtextindex",
|
|
8
|
+
"subject": "Remove duplicate buildTextIndex from search.js",
|
|
9
|
+
"status": "pending",
|
|
10
|
+
"description": "search.js has a private copy of buildTextIndex and tokenize/extractSymbols/isCodeFile that duplicates text-search.js. Import the exported buildTextIndex from text-search.js instead.",
|
|
11
|
+
"category": "refactor",
|
|
12
|
+
"effort": "small",
|
|
13
|
+
"blocking": ["fix-score-normalization", "fix-hybrid-weights"],
|
|
14
|
+
"blockedBy": []
|
|
15
|
+
},
|
|
16
|
+
{
|
|
17
|
+
"id": "fix-chunk-size",
|
|
18
|
+
"subject": "Reduce chunk size from 300 to 60 lines for better semantic granularity",
|
|
19
|
+
"status": "pending",
|
|
20
|
+
"description": "scanner.js uses 300-line chunks. Embeddings work best on 50-100 line chunks. Reduce to 60-line chunks with 15-line overlap for better vector search quality.",
|
|
21
|
+
"category": "feature",
|
|
22
|
+
"effort": "small",
|
|
23
|
+
"blocking": [],
|
|
24
|
+
"blockedBy": []
|
|
25
|
+
},
|
|
26
|
+
{
|
|
27
|
+
"id": "fix-score-normalization",
|
|
28
|
+
"subject": "Fix text search score normalization so top result is always 1.0",
|
|
29
|
+
"status": "pending",
|
|
30
|
+
"description": "Text scores divide raw by 100 but scores can exceed 100. Use dynamic max-score scaling. Lower hasGoodTextResults threshold from 0.5 to 0.3.",
|
|
31
|
+
"category": "bug",
|
|
32
|
+
"effort": "small",
|
|
33
|
+
"blocking": [],
|
|
34
|
+
"blockedBy": ["fix-dedup-buildtextindex"]
|
|
35
|
+
},
|
|
36
|
+
{
|
|
37
|
+
"id": "fix-hybrid-weights",
|
|
38
|
+
"subject": "Boost text-only exact-match results in hybrid merge",
|
|
39
|
+
"status": "pending",
|
|
40
|
+
"description": "Text-only results are capped at 20% weight. Give high-scoring text-only results a floor finalScore of 0.4.",
|
|
41
|
+
"category": "feature",
|
|
42
|
+
"effort": "small",
|
|
43
|
+
"blocking": [],
|
|
44
|
+
"blockedBy": ["fix-dedup-buildtextindex"]
|
|
45
|
+
},
|
|
46
|
+
{
|
|
47
|
+
"id": "fix-vector-cache-key",
|
|
48
|
+
"subject": "Strengthen vector search cache key to 20 dimensions",
|
|
49
|
+
"status": "pending",
|
|
50
|
+
"description": "Cache key uses only first 5 embedding dims. Use 20 dims for near-zero collision rate.",
|
|
51
|
+
"category": "bug",
|
|
52
|
+
"effort": "small",
|
|
53
|
+
"blocking": [],
|
|
54
|
+
"blockedBy": []
|
|
55
|
+
},
|
|
56
|
+
{
|
|
57
|
+
"id": "remove-dead-meanpooling",
|
|
58
|
+
"subject": "Remove dead meanPooling function from embeddings.js",
|
|
59
|
+
"status": "pending",
|
|
60
|
+
"description": "meanPooling is defined but never called. Remove dead code.",
|
|
61
|
+
"category": "refactor",
|
|
62
|
+
"effort": "small",
|
|
63
|
+
"blocking": [],
|
|
64
|
+
"blockedBy": []
|
|
65
|
+
},
|
|
66
|
+
{
|
|
67
|
+
"id": "verify-and-commit",
|
|
68
|
+
"subject": "Verify improvements and commit all changes",
|
|
69
|
+
"status": "pending",
|
|
70
|
+
"description": "Run end-to-end search logic test inline. Commit and push all changes.",
|
|
71
|
+
"category": "infra",
|
|
72
|
+
"effort": "small",
|
|
73
|
+
"blocking": [],
|
|
74
|
+
"blockedBy": ["fix-dedup-buildtextindex", "fix-chunk-size", "fix-score-normalization", "fix-hybrid-weights", "fix-vector-cache-key", "remove-dead-meanpooling"]
|
|
75
|
+
}
|
|
76
|
+
],
|
|
77
|
+
"completed": []
|
|
78
|
+
}
|
package/package.json
CHANGED
package/src/cli.js
CHANGED
|
@@ -75,29 +75,39 @@ export async function run(args) {
|
|
|
75
75
|
console.log('Generating embeddings and indexing...');
|
|
76
76
|
|
|
77
77
|
// Generate embeddings in batches and upsert immediately to free memory
|
|
78
|
-
|
|
79
|
-
let
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
const batchChunks = chunks.slice(i, i + batchSize);
|
|
83
|
-
const batchTexts = batchChunks.map(c => c.content);
|
|
84
|
-
const batchEmbeddings = await generateEmbeddings(batchTexts);
|
|
85
|
-
|
|
86
|
-
// Create batch with embeddings
|
|
87
|
-
const batchWithEmbeddings = batchChunks.map((chunk, idx) => ({
|
|
88
|
-
...chunk,
|
|
89
|
-
vector: batchEmbeddings[idx]
|
|
90
|
-
}));
|
|
78
|
+
// Optimize batch size based on chunk count (larger batches are more efficient)
|
|
79
|
+
let batchSize = 32;
|
|
80
|
+
if (chunks.length > 500) batchSize = 64;
|
|
81
|
+
if (chunks.length > 1000) batchSize = 96;
|
|
91
82
|
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
83
|
+
let processedCount = 0;
|
|
84
|
+
let embeddingsAvailable = true;
|
|
85
|
+
|
|
86
|
+
try {
|
|
87
|
+
for (let i = 0; i < chunks.length; i += batchSize) {
|
|
88
|
+
const batchChunks = chunks.slice(i, i + batchSize);
|
|
89
|
+
const batchTexts = batchChunks.map(c => c.content);
|
|
90
|
+
const batchEmbeddings = await generateEmbeddings(batchTexts);
|
|
91
|
+
|
|
92
|
+
// Create batch with embeddings
|
|
93
|
+
const batchWithEmbeddings = batchChunks.map((chunk, idx) => ({
|
|
94
|
+
...chunk,
|
|
95
|
+
vector: batchEmbeddings[idx]
|
|
96
|
+
}));
|
|
97
|
+
|
|
98
|
+
// Upsert immediately to free memory
|
|
99
|
+
await upsertChunks(batchWithEmbeddings);
|
|
100
|
+
processedCount += batchWithEmbeddings.length;
|
|
101
|
+
}
|
|
102
|
+
} catch (embeddingError) {
|
|
103
|
+
console.warn(`Warning: Embedding generation failed (${embeddingError.message}). Using text-only search.\n`);
|
|
104
|
+
embeddingsAvailable = false;
|
|
95
105
|
}
|
|
96
106
|
|
|
97
107
|
console.log('Index created\n');
|
|
98
108
|
|
|
99
|
-
// Execute search
|
|
100
|
-
const results = await executeSearch(query);
|
|
109
|
+
// Execute search with chunks for hybrid search (text-only if embeddings failed)
|
|
110
|
+
const results = await executeSearch(query, 10, chunks);
|
|
101
111
|
|
|
102
112
|
// Format and display results
|
|
103
113
|
const output = formatResults(results);
|
package/src/embeddings.js
CHANGED
|
@@ -13,6 +13,7 @@ try {
|
|
|
13
13
|
|
|
14
14
|
let modelCache = null;
|
|
15
15
|
let cacheCleared = false;
|
|
16
|
+
let modelLoadTime = 0;
|
|
16
17
|
|
|
17
18
|
function clearModelCache() {
|
|
18
19
|
const cacheDirs = [
|
|
@@ -37,6 +38,7 @@ async function getModel(retryOnError = true) {
|
|
|
37
38
|
return modelCache;
|
|
38
39
|
}
|
|
39
40
|
|
|
41
|
+
const modelStart = performance.now();
|
|
40
42
|
console.error('Loading embeddings model (this may take a moment on first run)...');
|
|
41
43
|
|
|
42
44
|
const modelLoadPromise = pipeline(
|
|
@@ -50,6 +52,7 @@ async function getModel(retryOnError = true) {
|
|
|
50
52
|
|
|
51
53
|
try {
|
|
52
54
|
modelCache = await Promise.race([modelLoadPromise, timeoutPromise]);
|
|
55
|
+
modelLoadTime = performance.now() - modelStart;
|
|
53
56
|
} catch (e) {
|
|
54
57
|
if (retryOnError && !cacheCleared && (e.message.includes('Protobuf') || e.message.includes('parsing'))) {
|
|
55
58
|
console.error('Detected corrupted cache, clearing and retrying...');
|
|
@@ -65,37 +68,8 @@ async function getModel(retryOnError = true) {
|
|
|
65
68
|
return modelCache;
|
|
66
69
|
}
|
|
67
70
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
const tokenEmbeddings = modelOutput.data;
|
|
71
|
-
const embeddingDim = modelOutput.dims[modelOutput.dims.length - 1];
|
|
72
|
-
const batchSize = modelOutput.dims[0];
|
|
73
|
-
const seqLength = modelOutput.dims[1];
|
|
74
|
-
|
|
75
|
-
const pooled = [];
|
|
76
|
-
|
|
77
|
-
for (let b = 0; b < batchSize; b++) {
|
|
78
|
-
let sum = new Array(embeddingDim).fill(0);
|
|
79
|
-
let count = 0;
|
|
80
|
-
|
|
81
|
-
for (let s = 0; s < seqLength; s++) {
|
|
82
|
-
const tokenIdx = b * seqLength + s;
|
|
83
|
-
const maskValue = attentionMask[tokenIdx] || 1;
|
|
84
|
-
|
|
85
|
-
if (maskValue > 0) {
|
|
86
|
-
const tokenStart = tokenIdx * embeddingDim;
|
|
87
|
-
for (let d = 0; d < embeddingDim; d++) {
|
|
88
|
-
sum[d] += tokenEmbeddings[tokenStart + d] * maskValue;
|
|
89
|
-
}
|
|
90
|
-
count += maskValue;
|
|
91
|
-
}
|
|
92
|
-
}
|
|
93
|
-
|
|
94
|
-
const normalized = sum.map(v => v / Math.max(count, 1e-9));
|
|
95
|
-
pooled.push(normalized);
|
|
96
|
-
}
|
|
97
|
-
|
|
98
|
-
return pooled;
|
|
71
|
+
export function getModelLoadTime() {
|
|
72
|
+
return modelLoadTime;
|
|
99
73
|
}
|
|
100
74
|
|
|
101
75
|
export async function generateEmbeddings(texts) {
|
|
@@ -105,11 +79,16 @@ export async function generateEmbeddings(texts) {
|
|
|
105
79
|
texts = [texts];
|
|
106
80
|
}
|
|
107
81
|
|
|
108
|
-
// Generate embeddings for all texts
|
|
109
|
-
const embeddings = await
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
82
|
+
// Generate embeddings for all texts with timeout per batch
|
|
83
|
+
const embeddings = await Promise.race([
|
|
84
|
+
model(texts, {
|
|
85
|
+
pooling: 'mean',
|
|
86
|
+
normalize: true
|
|
87
|
+
}),
|
|
88
|
+
new Promise((_, reject) =>
|
|
89
|
+
setTimeout(() => reject(new Error('Embedding generation timeout')), 60000)
|
|
90
|
+
)
|
|
91
|
+
]);
|
|
113
92
|
|
|
114
93
|
// Convert to regular arrays
|
|
115
94
|
const result = [];
|
package/src/ignore-parser.js
CHANGED
|
@@ -106,109 +106,17 @@ function loadDefaultIgnores() {
|
|
|
106
106
|
|
|
107
107
|
function getHardcodedIgnores() {
|
|
108
108
|
return new Set([
|
|
109
|
-
|
|
110
|
-
'
|
|
111
|
-
'.
|
|
112
|
-
'
|
|
113
|
-
'.azure', '.gcloud', '.vscode', '.idea', '.vs', 'bin', 'obj',
|
|
114
|
-
'.gradle', '.mvn', 'Pods', 'DerivedData', '.bundle',
|
|
115
|
-
'.yarn', '.pnp', 'pnpm-lock.yaml', '.pnpm-store',
|
|
116
|
-
'.tox', '.eggs', '*.egg-info', '.venv', 'venv', 'env',
|
|
117
|
-
'.tsc', '.eslintcache', '.stylelintcache', '.parcel-cache',
|
|
118
|
-
'temp', 'tmp', '.tmp', '.DS_Store', 'Thumbs.db',
|
|
119
|
-
'.swp', '.swo', '*.swp', '*.swo', '.tern-port',
|
|
120
|
-
'dist-server', 'out-tsc', '.cache', '.parcel-cache',
|
|
121
|
-
'typings', '.env', '.env.local', '.env.*.local',
|
|
122
|
-
// JSON files - PRIMARY PRIORITY for memory reduction
|
|
123
|
-
'*.json', 'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml',
|
|
124
|
-
'Gemfile.lock', 'poetry.lock', 'Pipfile.lock',
|
|
125
|
-
// Lock files
|
|
126
|
-
'*.lock',
|
|
127
|
-
// Build outputs
|
|
128
|
-
'public', 'static', 'site', '_site', '.docusaurus', '.gatsby',
|
|
129
|
-
// Cache/dependency directories
|
|
130
|
-
'.rush', '.lerna', '.nx',
|
|
131
|
-
// IDE/editor configs
|
|
132
|
-
'.cursor', '.replit', '.sublime-project', '.sublime-workspace',
|
|
133
|
-
'*.iml', '.project', '.classpath', '.settings', '*.sublime-*',
|
|
109
|
+
// Lock files / package manager artifacts
|
|
110
|
+
'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml',
|
|
111
|
+
'Gemfile.lock', 'poetry.lock', 'Pipfile.lock', 'Cargo.lock',
|
|
112
|
+
'composer.lock', 'go.sum',
|
|
134
113
|
// OS files
|
|
135
|
-
'.
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
'*.backup', '*.data', '*.orig',
|
|
140
|
-
// Logs and temp
|
|
141
|
-
'*.log', 'logs', 'npm-debug.log', 'yarn-error.log',
|
|
142
|
-
// Test coverage and reports
|
|
143
|
-
'lcov.info', '.coverage', 'test-results',
|
|
144
|
-
// Database related
|
|
145
|
-
'storage', 'fixtures',
|
|
146
|
-
// LLM/Vector related
|
|
147
|
-
'.llamaindex', '.chroma', '.vectorstore', '.embeddings',
|
|
148
|
-
'.langchain', '.autogen', '.semantic-kernel', '.openai-cache',
|
|
149
|
-
'.anthropic-cache', 'embeddings', 'vector-db', 'faiss-index',
|
|
150
|
-
'chromadb', 'pinecone-cache', 'weaviate-data',
|
|
151
|
-
// Compiled output
|
|
114
|
+
'.DS_Store', 'Thumbs.db', 'desktop.ini',
|
|
115
|
+
// Editor swap files
|
|
116
|
+
'.tern-port',
|
|
117
|
+
// Compiled binary artifacts (files, not dirs)
|
|
152
118
|
'*.min.js', '*.min.css', '*.bundle.js', '*.chunk.js', '*.map',
|
|
153
|
-
|
|
154
|
-
'.assets', 'out-tsc', 'cmake_build_debug', 'cmake_build_release',
|
|
155
|
-
// Version managers
|
|
156
|
-
'.rbenv', '.nvm', '.nvmrc',
|
|
157
|
-
// Ruby specific
|
|
158
|
-
'*.gem', '*.rbc', '/pkg', '/spec/reports', '/spec/examples.txt',
|
|
159
|
-
'/test/tmp', '/test/version_tmp', 'lib/bundler/man', '.ruby-version',
|
|
160
|
-
// Go specific
|
|
161
|
-
'go.work',
|
|
162
|
-
// Rust specific
|
|
163
|
-
'Cargo.lock', '**/*.rs.bk', '*.pdb',
|
|
164
|
-
// Java specific
|
|
165
|
-
'*.class', '*.jar', '*.war', '*.ear', '*.nar', '*.nupkg', '*.snupkg',
|
|
166
|
-
// C# specific
|
|
167
|
-
'*.suo', '*.user', '*.userosscache', '*.sln.docstates',
|
|
168
|
-
'project.lock.json', 'project.fragment.lock.json', 'artifacts',
|
|
169
|
-
// C/C++ specific
|
|
170
|
-
'*.o', '*.a', '*.so', '*.exe', '*.obj', '*.dll', '*.dylib',
|
|
171
|
-
'CMakeFiles', 'CMakeCache.txt', '*.cmake',
|
|
172
|
-
// Swift/Xcode specific
|
|
173
|
-
'*.xcodeproj', '*.xcworkspace', '*.moved-aside', '*.pbxuser',
|
|
174
|
-
'*.mode1v3', '*.mode2v3', '*.perspectivev3',
|
|
175
|
-
// Scala/SBT specific
|
|
176
|
-
'lib_managed', 'src_managed', 'project/boot', 'project/plugins/project',
|
|
177
|
-
'.history', '.lib',
|
|
178
|
-
// PHP specific
|
|
179
|
-
'composer.lock', '*.phar',
|
|
180
|
-
// Docker
|
|
181
|
-
'.dockerignore', 'docker-compose.override.yml', '.docker',
|
|
182
|
-
// Documentation build
|
|
183
|
-
'docs/_build', '.vuepress',
|
|
184
|
-
// Testing frameworks
|
|
185
|
-
'jest.config', 'vitest.config', 'pytest.ini', 'tox.ini',
|
|
186
|
-
'__tests__', '__mocks__', 'spec', 'cypress', 'playwright',
|
|
187
|
-
// Monorepo workspace patterns (implicit through directory coverage)
|
|
188
|
-
'.turbo', '.nx',
|
|
189
|
-
// Python package patterns
|
|
190
|
-
'*.py[cod]', '*$py.class', '.Python', 'pip-log.txt',
|
|
191
|
-
'pip-delete-this-directory.txt', '.hypothesis', '.pyre', '.pytype',
|
|
192
|
-
'*.whl',
|
|
193
|
-
// Config/metadata that are typically low-value
|
|
194
|
-
'*.config.js', '*.config.ts', 'webpack.config.js', 'rollup.config.js',
|
|
195
|
-
'vite.config.js', 'tsconfig.json', 'jsconfig.json', 'babel.config',
|
|
196
|
-
'.babelrc', '.eslintrc', '.prettierrc', '.stylelintrc', '.editorconfig',
|
|
197
|
-
'*.local', '*.development', '*.production',
|
|
198
|
-
// Node specific
|
|
199
|
-
'.npm', '.node_repl_history', '*.tsbuildinfo', 'yarn-error.log',
|
|
200
|
-
// Documentation/reference files that don't help with search
|
|
201
|
-
'*.md', '*.txt', '*.rst', '*.adoc', 'docs', 'documentation', 'wiki',
|
|
202
|
-
'CHANGELOG', 'HISTORY', 'NEWS', 'UPGRADING', 'FAQ', 'CONTRIBUTING',
|
|
203
|
-
'SECURITY', 'LICENSE', 'LICENCE', 'COPYRIGHT', 'NOTICE', 'AUTHORS',
|
|
204
|
-
'THIRDPARTY',
|
|
205
|
-
// Test and coverage files
|
|
206
|
-
'*.test', '*.spec', 'test', 'tests', 'htmlcov',
|
|
207
|
-
// Profiling
|
|
208
|
-
'*.prof', '*.cpuprofile', '*.heapprofile',
|
|
209
|
-
// Misc
|
|
210
|
-
'.tern-port', 'firebase-debug.log', 'firestore-debug.log',
|
|
211
|
-
'ui-debug.log', '.firebaserc', '.stackdump'
|
|
119
|
+
'*.tsbuildinfo',
|
|
212
120
|
]);
|
|
213
121
|
}
|
|
214
122
|
|
|
@@ -283,76 +191,54 @@ export function loadIgnorePatterns(rootPath) {
|
|
|
283
191
|
return merged;
|
|
284
192
|
}
|
|
285
193
|
|
|
286
|
-
// Directories to always ignore
|
|
194
|
+
// Directories to always ignore - only clear non-source directories
|
|
287
195
|
const IGNORED_DIRECTORIES = new Set([
|
|
288
|
-
// Dependencies
|
|
196
|
+
// Dependencies
|
|
289
197
|
'node_modules', 'bower_components', 'jspm_packages', 'web_modules',
|
|
290
198
|
// Version control
|
|
291
|
-
'.git', '.svn', '.hg', '.bzr',
|
|
292
|
-
//
|
|
293
|
-
'
|
|
294
|
-
|
|
295
|
-
'
|
|
296
|
-
'
|
|
297
|
-
'
|
|
298
|
-
'
|
|
299
|
-
'
|
|
300
|
-
'
|
|
301
|
-
'
|
|
199
|
+
'.git', '.svn', '.hg', '.bzr',
|
|
200
|
+
// IDE
|
|
201
|
+
'.vscode', '.idea', '.vs', '.atom',
|
|
202
|
+
// Build outputs (unambiguous names only)
|
|
203
|
+
'dist', 'dist-server', 'dist-ssr', 'dist-client',
|
|
204
|
+
'build', 'built',
|
|
205
|
+
'out', 'out-tsc',
|
|
206
|
+
'target',
|
|
207
|
+
'storybook-static', '.docusaurus', '.gatsby', '.vuepress',
|
|
208
|
+
'.nuxt', '.next',
|
|
209
|
+
'.tsc',
|
|
302
210
|
// Cache directories
|
|
303
|
-
'.cache', '
|
|
211
|
+
'.cache', '.parcel-cache', '.vite', '.turbo',
|
|
304
212
|
'.npm', '.yarn', '.pnp', '.pnpm-store', '.rush', '.lerna', '.nx',
|
|
305
213
|
// Testing
|
|
306
214
|
'coverage', '.nyc_output', '.coverage', 'htmlcov', 'test-results',
|
|
307
|
-
'test', 'tests', 'Test', 'Tests', 'TEST', 'TESTS',
|
|
308
215
|
'__tests__', '__mocks__', '__snapshots__', '__fixtures__',
|
|
309
|
-
'cypress', 'playwright',
|
|
216
|
+
'cypress', 'playwright',
|
|
310
217
|
'.tox', '.eggs', '.hypothesis', '.pyre', '.pytype',
|
|
311
218
|
// Python
|
|
312
|
-
'__pycache__', '.pytest_cache', '.mypy_cache', '.venv', 'venv',
|
|
313
|
-
'env.bak', 'venv.bak', '.Python', 'pip-wheel-metadata', '*.egg-info',
|
|
219
|
+
'__pycache__', '.pytest_cache', '.mypy_cache', '.venv', 'venv',
|
|
314
220
|
// Java/Gradle/Maven
|
|
315
|
-
'.gradle', '.mvn',
|
|
221
|
+
'.gradle', '.mvn',
|
|
316
222
|
// iOS/Android
|
|
317
|
-
'Pods', 'DerivedData', '
|
|
223
|
+
'Pods', 'DerivedData', '.bundle', 'xcuserdata',
|
|
318
224
|
// Ruby
|
|
319
|
-
'
|
|
320
|
-
// Rust
|
|
321
|
-
'target', 'Cargo.lock',
|
|
322
|
-
// Go
|
|
323
|
-
'vendor', 'Godeps',
|
|
324
|
-
// PHP
|
|
325
|
-
'vendor', 'composer',
|
|
225
|
+
'.bundle', 'pkg',
|
|
326
226
|
// Infrastructure
|
|
327
227
|
'.terraform', '.terragrunt-cache', '.pulumi', '.serverless', '.firebase',
|
|
328
|
-
'.aws', '.azure', '.gcloud', '.vercel', '.netlify',
|
|
329
|
-
// Docker
|
|
330
|
-
'.docker', 'docker', '.dockerignore',
|
|
228
|
+
'.aws', '.azure', '.gcloud', '.vercel', '.netlify',
|
|
331
229
|
// Temp files
|
|
332
|
-
'temp', 'tmp', '.tmp', '.temp',
|
|
333
|
-
//
|
|
334
|
-
'docs', 'doc', 'documentation', 'wiki', 'guides', 'examples', 'demo', 'demos',
|
|
335
|
-
'CHANGELOG', 'HISTORY', 'NEWS', 'LICENSE', 'LICENCE', 'COPYING', 'AUTHORS',
|
|
336
|
-
// IDE/Editor
|
|
337
|
-
'.vs', '.vscode', '.idea', '.eclipse', '.settings', '.classpath', '.project',
|
|
338
|
-
// Logs
|
|
339
|
-
'logs', 'log', '*.log',
|
|
340
|
-
// Data/Storage
|
|
341
|
-
'storage', 'data', 'database', 'db', 'fixtures', 'seeds',
|
|
342
|
-
'uploads', 'files', 'media', 'resources', 'assets', 'images', 'img',
|
|
343
|
-
// LLM/AI
|
|
230
|
+
'temp', 'tmp', '.tmp', '.temp',
|
|
231
|
+
// LLM/AI artifacts
|
|
344
232
|
'.llamaindex', '.chroma', '.vectorstore', '.embeddings',
|
|
345
233
|
'.langchain', '.autogen', '.semantic-kernel', '.openai-cache',
|
|
346
234
|
'.anthropic-cache', 'embeddings', 'vector-db', 'faiss-index',
|
|
347
235
|
'chromadb', 'pinecone-cache', 'weaviate-data',
|
|
348
|
-
// Package
|
|
349
|
-
'.
|
|
350
|
-
//
|
|
351
|
-
'
|
|
352
|
-
// Misc
|
|
353
|
-
'
|
|
354
|
-
'cmake_build_debug', 'cmake_build_release', 'CMakeFiles', 'CMakeCache.txt',
|
|
355
|
-
'out-tsc', 'dist-server', 'server', 'client', 'browser', 'esm', 'cjs', 'umd', 'lib', 'es'
|
|
236
|
+
// Package manager caches
|
|
237
|
+
'.pnpm', '.bun',
|
|
238
|
+
// Static/built asset directories
|
|
239
|
+
'assets', 'static', 'public', 'wwwroot', 'www',
|
|
240
|
+
// Misc generated
|
|
241
|
+
'cmake_build_debug', 'cmake_build_release', 'CMakeFiles',
|
|
356
242
|
]);
|
|
357
243
|
|
|
358
244
|
export function isCodeFile(filePath) {
|
|
@@ -373,28 +259,34 @@ export function isCodeFile(filePath) {
|
|
|
373
259
|
export function shouldIgnoreDirectory(dirPath) {
|
|
374
260
|
const normalizedPath = dirPath.replace(/\\/g, '/');
|
|
375
261
|
const pathParts = normalizedPath.split('/');
|
|
376
|
-
|
|
377
262
|
for (const part of pathParts) {
|
|
378
263
|
if (IGNORED_DIRECTORIES.has(part)) {
|
|
379
264
|
return true;
|
|
380
265
|
}
|
|
381
266
|
}
|
|
382
|
-
|
|
383
267
|
return false;
|
|
384
268
|
}
|
|
385
269
|
|
|
386
|
-
export function shouldIgnore(filePath, ignorePatterns) {
|
|
270
|
+
export function shouldIgnore(filePath, ignorePatterns, isDirectory = false) {
|
|
387
271
|
const normalizedPath = filePath.replace(/\\/g, '/');
|
|
388
272
|
const pathParts = normalizedPath.split('/');
|
|
389
273
|
const fileName = pathParts[pathParts.length - 1];
|
|
390
|
-
|
|
391
|
-
|
|
274
|
+
|
|
275
|
+
if (isDirectory) {
|
|
276
|
+
if (IGNORED_DIRECTORIES.has(fileName)) return true;
|
|
277
|
+
for (const pattern of ignorePatterns) {
|
|
278
|
+
if (!pattern.includes('/') && fileName === pattern) return true;
|
|
279
|
+
}
|
|
280
|
+
return false;
|
|
281
|
+
}
|
|
282
|
+
|
|
283
|
+
// For files: check all ancestor directories
|
|
392
284
|
for (const part of pathParts.slice(0, -1)) {
|
|
393
285
|
if (IGNORED_DIRECTORIES.has(part)) {
|
|
394
286
|
return true;
|
|
395
287
|
}
|
|
396
288
|
}
|
|
397
|
-
|
|
289
|
+
|
|
398
290
|
// Check if it's a code file using whitelist
|
|
399
291
|
if (!isCodeFile(filePath)) {
|
|
400
292
|
return true;
|
|
@@ -402,22 +294,13 @@ export function shouldIgnore(filePath, ignorePatterns) {
|
|
|
402
294
|
|
|
403
295
|
// Check against additional ignore patterns
|
|
404
296
|
for (const pattern of ignorePatterns) {
|
|
405
|
-
// Handle path patterns (contain /)
|
|
406
297
|
if (pattern.includes('/')) {
|
|
407
|
-
if (normalizedPath.includes(pattern))
|
|
408
|
-
|
|
409
|
-
}
|
|
410
|
-
}
|
|
411
|
-
// Handle exact file name patterns
|
|
412
|
-
else if (fileName === pattern) {
|
|
298
|
+
if (normalizedPath.includes(pattern)) return true;
|
|
299
|
+
} else if (fileName === pattern) {
|
|
413
300
|
return true;
|
|
414
|
-
}
|
|
415
|
-
// Handle directory name patterns (match any path part)
|
|
416
|
-
else {
|
|
301
|
+
} else {
|
|
417
302
|
for (const part of pathParts) {
|
|
418
|
-
if (part === pattern
|
|
419
|
-
return true;
|
|
420
|
-
}
|
|
303
|
+
if (part === pattern) return true;
|
|
421
304
|
}
|
|
422
305
|
}
|
|
423
306
|
}
|
package/src/scanner.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { readdirSync, statSync, readFileSync } from 'fs';
|
|
2
2
|
import { join, relative } from 'path';
|
|
3
|
-
import { shouldIgnore, isCodeFile } from './ignore-parser.js';
|
|
3
|
+
import { shouldIgnore, shouldIgnoreDirectory, isCodeFile } from './ignore-parser.js';
|
|
4
4
|
|
|
5
5
|
function getFileExtension(filePath) {
|
|
6
6
|
const lastDot = filePath.lastIndexOf('.');
|
|
@@ -33,14 +33,15 @@ function walkDirectory(dirPath, ignorePatterns, relativePath = '') {
|
|
|
33
33
|
// Normalize to forward slashes for consistent ignore pattern matching
|
|
34
34
|
const normalizedRelPath = relPath.replace(/\\/g, '/');
|
|
35
35
|
|
|
36
|
-
// Check if should ignore
|
|
37
|
-
if (shouldIgnore(normalizedRelPath, ignorePatterns)) {
|
|
38
|
-
continue;
|
|
39
|
-
}
|
|
40
|
-
|
|
41
36
|
if (entry.isDirectory()) {
|
|
37
|
+
if (shouldIgnoreDirectory(normalizedRelPath) || shouldIgnore(normalizedRelPath, ignorePatterns, true)) {
|
|
38
|
+
continue;
|
|
39
|
+
}
|
|
42
40
|
files.push(...walkDirectory(fullPath, ignorePatterns, relPath));
|
|
43
41
|
} else if (entry.isFile()) {
|
|
42
|
+
if (shouldIgnore(normalizedRelPath, ignorePatterns, false)) {
|
|
43
|
+
continue;
|
|
44
|
+
}
|
|
44
45
|
if (isCodeFile(normalizedRelPath) && !isBinaryFile(entry.name)) {
|
|
45
46
|
try {
|
|
46
47
|
const stat = entry.isSymbolicLink ? null : statSync(fullPath);
|
|
@@ -64,7 +65,7 @@ function walkDirectory(dirPath, ignorePatterns, relativePath = '') {
|
|
|
64
65
|
return files;
|
|
65
66
|
}
|
|
66
67
|
|
|
67
|
-
function chunkContent(content, chunkSize =
|
|
68
|
+
function chunkContent(content, chunkSize = 60, overlapSize = 15) {
|
|
68
69
|
const lines = content.split('\n');
|
|
69
70
|
const chunks = [];
|
|
70
71
|
|
|
@@ -80,7 +81,6 @@ function chunkContent(content, chunkSize = 1000, overlapSize = 100) {
|
|
|
80
81
|
});
|
|
81
82
|
}
|
|
82
83
|
|
|
83
|
-
// Stop if we've reached the end
|
|
84
84
|
if (endIdx === lines.length) {
|
|
85
85
|
break;
|
|
86
86
|
}
|
|
@@ -99,7 +99,7 @@ export function scanRepository(rootPath, ignorePatterns) {
|
|
|
99
99
|
const mtime = file.mtime;
|
|
100
100
|
|
|
101
101
|
// For small files, treat as single chunk
|
|
102
|
-
if (content.split('\n').length <=
|
|
102
|
+
if (content.split('\n').length <= 60) {
|
|
103
103
|
chunks.push({
|
|
104
104
|
file_path: file.relativePath,
|
|
105
105
|
chunk_index: 0,
|
package/src/search.js
CHANGED
|
@@ -1,20 +1,84 @@
|
|
|
1
1
|
import { generateSingleEmbedding } from './embeddings.js';
|
|
2
2
|
import { searchSimilar } from './store.js';
|
|
3
|
+
import { buildTextIndex, searchText } from './text-search.js';
|
|
3
4
|
|
|
4
|
-
export async function executeSearch(query, limit = 10) {
|
|
5
|
+
export async function executeSearch(query, limit = 10, allChunks = null, skipVector = false) {
|
|
5
6
|
if (!query || query.trim().length === 0) {
|
|
6
7
|
throw new Error('Query cannot be empty');
|
|
7
8
|
}
|
|
8
9
|
|
|
9
10
|
console.error(`Searching for: "${query}"`);
|
|
10
11
|
|
|
11
|
-
|
|
12
|
-
|
|
12
|
+
try {
|
|
13
|
+
let vectorResults = [];
|
|
14
|
+
let textResults = [];
|
|
13
15
|
|
|
14
|
-
|
|
15
|
-
|
|
16
|
+
if (allChunks && allChunks.length > 0) {
|
|
17
|
+
const textIndexData = buildTextIndex(allChunks);
|
|
18
|
+
textResults = searchText(query, allChunks, textIndexData);
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
const hasGoodTextResults = textResults.length > 0 && textResults[0].score > 0.3;
|
|
22
|
+
if (!skipVector && !hasGoodTextResults) {
|
|
23
|
+
try {
|
|
24
|
+
const queryEmbedding = await generateSingleEmbedding(query);
|
|
25
|
+
vectorResults = await searchSimilar(queryEmbedding, limit * 2);
|
|
26
|
+
} catch (e) {
|
|
27
|
+
console.warn(`Vector search unavailable: ${e.message}`);
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
if (vectorResults.length > 0 && textResults.length > 0) {
|
|
32
|
+
return mergeSearchResults(vectorResults, textResults.slice(0, limit * 2), limit);
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
const allResults = vectorResults.length > 0 ? vectorResults : textResults;
|
|
36
|
+
return allResults.slice(0, limit);
|
|
37
|
+
} catch (error) {
|
|
38
|
+
console.error('Search error:', error.message);
|
|
39
|
+
if (allChunks && allChunks.length > 0) {
|
|
40
|
+
const textIndexData = buildTextIndex(allChunks);
|
|
41
|
+
const textResults = searchText(query, allChunks, textIndexData);
|
|
42
|
+
return textResults.slice(0, limit);
|
|
43
|
+
}
|
|
44
|
+
throw error;
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
function mergeSearchResults(vectorResults, textResults, limit) {
|
|
49
|
+
const merged = new Map();
|
|
50
|
+
|
|
51
|
+
vectorResults.forEach((result) => {
|
|
52
|
+
const key = `${result.file_path}:${result.chunk_index}`;
|
|
53
|
+
merged.set(key, {
|
|
54
|
+
...result,
|
|
55
|
+
vectorScore: result.score || 0,
|
|
56
|
+
textScore: 0,
|
|
57
|
+
finalScore: (result.score || 0) * 0.8
|
|
58
|
+
});
|
|
59
|
+
});
|
|
60
|
+
|
|
61
|
+
textResults.forEach((result) => {
|
|
62
|
+
const key = `${result.file_path}:${result.chunk_index || 0}`;
|
|
63
|
+
if (merged.has(key)) {
|
|
64
|
+
const existing = merged.get(key);
|
|
65
|
+
existing.textScore = result.score || 0;
|
|
66
|
+
existing.finalScore = (existing.vectorScore * 0.8) + (result.score * 0.2);
|
|
67
|
+
} else {
|
|
68
|
+
const textScore = result.score || 0;
|
|
69
|
+
const finalScore = Math.max(textScore * 0.2, textScore > 0.7 ? 0.4 : 0);
|
|
70
|
+
merged.set(key, {
|
|
71
|
+
...result,
|
|
72
|
+
vectorScore: 0,
|
|
73
|
+
textScore,
|
|
74
|
+
finalScore
|
|
75
|
+
});
|
|
76
|
+
}
|
|
77
|
+
});
|
|
16
78
|
|
|
17
|
-
return
|
|
79
|
+
return Array.from(merged.values())
|
|
80
|
+
.sort((a, b) => b.finalScore - a.finalScore)
|
|
81
|
+
.slice(0, limit);
|
|
18
82
|
}
|
|
19
83
|
|
|
20
84
|
export function formatResults(results) {
|
|
@@ -27,15 +91,14 @@ export function formatResults(results) {
|
|
|
27
91
|
|
|
28
92
|
for (let i = 0; i < results.length; i++) {
|
|
29
93
|
const result = results[i];
|
|
30
|
-
const
|
|
94
|
+
const scoreValue = result.finalScore !== undefined ? result.finalScore : (result.score || 0);
|
|
95
|
+
const scorePercent = (scoreValue * 100).toFixed(1);
|
|
31
96
|
|
|
32
|
-
lines.push(`${
|
|
97
|
+
lines.push(`${i + 1}. ${result.file_path}:${result.line_start}-${result.line_end} (score: ${scorePercent}%)`);
|
|
33
98
|
|
|
34
|
-
// Show code snippet (first 3 lines)
|
|
35
99
|
const codeLines = result.content.split('\n').slice(0, 3);
|
|
36
100
|
for (const line of codeLines) {
|
|
37
|
-
|
|
38
|
-
lines.push(` > ${trimmed}`);
|
|
101
|
+
lines.push(` > ${line.slice(0, 80)}`);
|
|
39
102
|
}
|
|
40
103
|
|
|
41
104
|
lines.push('');
|
package/src/store.js
CHANGED
|
@@ -5,6 +5,7 @@ import { mkdirSync, existsSync } from 'fs';
|
|
|
5
5
|
let dbConnection = null;
|
|
6
6
|
let tableRef = null;
|
|
7
7
|
let isFirstBatch = true;
|
|
8
|
+
let vectorSearchCache = new Map();
|
|
8
9
|
|
|
9
10
|
export async function initStore(dbPath) {
|
|
10
11
|
// Ensure directory exists
|
|
@@ -121,12 +122,19 @@ export async function searchSimilar(queryEmbedding, limit = 10) {
|
|
|
121
122
|
// Ensure vector is a proper array/tensor
|
|
122
123
|
const query = Array.isArray(queryEmbedding) ? queryEmbedding : Array.from(queryEmbedding);
|
|
123
124
|
|
|
125
|
+
// Check cache using 20-dimension hash for near-zero collision rate
|
|
126
|
+
const cacheKey = query.slice(0, 20).join(',');
|
|
127
|
+
const cached = vectorSearchCache.get(cacheKey);
|
|
128
|
+
if (cached) {
|
|
129
|
+
return cached.slice(0, limit);
|
|
130
|
+
}
|
|
131
|
+
|
|
124
132
|
const results = await tableRef
|
|
125
133
|
.search(query)
|
|
126
134
|
.limit(limit)
|
|
127
135
|
.execute();
|
|
128
136
|
|
|
129
|
-
|
|
137
|
+
const formattedResults = results.map(result => {
|
|
130
138
|
const distance = result._distance !== undefined ? result._distance : (result.distance || 0);
|
|
131
139
|
const score = distance !== null && distance !== undefined ? 1 / (1 + distance) : 0;
|
|
132
140
|
return {
|
|
@@ -139,6 +147,15 @@ export async function searchSimilar(queryEmbedding, limit = 10) {
|
|
|
139
147
|
score: score
|
|
140
148
|
};
|
|
141
149
|
});
|
|
150
|
+
|
|
151
|
+
// Cache results (keep max 100 cached searches)
|
|
152
|
+
if (vectorSearchCache.size > 100) {
|
|
153
|
+
const firstKey = vectorSearchCache.keys().next().value;
|
|
154
|
+
vectorSearchCache.delete(firstKey);
|
|
155
|
+
}
|
|
156
|
+
vectorSearchCache.set(cacheKey, formattedResults);
|
|
157
|
+
|
|
158
|
+
return formattedResults;
|
|
142
159
|
} catch (e) {
|
|
143
160
|
console.error('Search failed:', e.message);
|
|
144
161
|
return [];
|
package/src/text-search.js
CHANGED
|
@@ -4,19 +4,21 @@ export function buildTextIndex(chunks) {
|
|
|
4
4
|
|
|
5
5
|
chunks.forEach((chunk, idx) => {
|
|
6
6
|
const tokens = tokenize(chunk.content);
|
|
7
|
+
const fileNameTokens = tokenize(chunk.file_path);
|
|
7
8
|
const symbols = extractSymbols(chunk.content);
|
|
8
9
|
const frequency = new Map();
|
|
9
10
|
|
|
10
11
|
tokens.forEach(token => {
|
|
11
12
|
frequency.set(token, (frequency.get(token) || 0) + 1);
|
|
12
13
|
if (!index.has(token)) {
|
|
13
|
-
index.set(token,
|
|
14
|
+
index.set(token, new Set());
|
|
14
15
|
}
|
|
15
|
-
index.get(token).
|
|
16
|
+
index.get(token).add(idx);
|
|
16
17
|
});
|
|
17
18
|
|
|
18
19
|
chunkMetadata[idx] = {
|
|
19
20
|
tokens,
|
|
21
|
+
fileNameTokens,
|
|
20
22
|
symbols,
|
|
21
23
|
frequency,
|
|
22
24
|
isCode: isCodeFile(chunk.file_path),
|
|
@@ -32,47 +34,66 @@ export function searchText(query, chunks, indexData) {
|
|
|
32
34
|
const querySymbols = extractSymbols(query);
|
|
33
35
|
const chunkScores = new Map();
|
|
34
36
|
|
|
35
|
-
|
|
37
|
+
// Use index to find candidate chunks efficiently
|
|
38
|
+
const candidates = new Set();
|
|
39
|
+
queryTokens.forEach(token => {
|
|
40
|
+
if (index.has(token)) {
|
|
41
|
+
for (const idx of index.get(token)) candidates.add(idx);
|
|
42
|
+
}
|
|
43
|
+
});
|
|
44
|
+
querySymbols.forEach(sym => {
|
|
45
|
+
if (index.has(sym)) {
|
|
46
|
+
for (const idx of index.get(sym)) candidates.add(idx);
|
|
47
|
+
}
|
|
48
|
+
});
|
|
49
|
+
|
|
50
|
+
for (const idx of candidates) {
|
|
51
|
+
const chunk = chunks[idx];
|
|
52
|
+
const meta = chunkMetadata[idx];
|
|
36
53
|
let score = 0;
|
|
37
54
|
|
|
55
|
+
// Exact phrase match - highest priority (saves embedding cost)
|
|
56
|
+
if (chunk.content.toLowerCase().includes(query.toLowerCase())) {
|
|
57
|
+
score += 30;
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
// Symbol match in content - function/class named after query terms
|
|
61
|
+
querySymbols.forEach(symbol => {
|
|
62
|
+
if (meta.symbols.includes(symbol)) score += 10;
|
|
63
|
+
});
|
|
64
|
+
|
|
65
|
+
// Filename token match - strong signal that this file is about the query topic
|
|
66
|
+
let fileNameMatches = 0;
|
|
38
67
|
queryTokens.forEach(token => {
|
|
39
|
-
if (
|
|
40
|
-
if (index.get(token).includes(idx)) {
|
|
41
|
-
const freq = chunkMetadata[idx].frequency.get(token) || 1;
|
|
42
|
-
const boost = token.length > 4 ? 1.5 : 1;
|
|
43
|
-
score += boost * freq;
|
|
44
|
-
}
|
|
45
|
-
}
|
|
68
|
+
if (meta.fileNameTokens.includes(token)) fileNameMatches++;
|
|
46
69
|
});
|
|
70
|
+
if (fileNameMatches > 0) {
|
|
71
|
+
score += fileNameMatches * 10;
|
|
72
|
+
}
|
|
47
73
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
74
|
+
// Token frequency scoring
|
|
75
|
+
queryTokens.forEach(token => {
|
|
76
|
+
if (index.has(token) && index.get(token).has(idx)) {
|
|
77
|
+
const freq = meta.frequency.get(token) || 1;
|
|
78
|
+
const lengthBoost = token.length > 4 ? 1.5 : 1;
|
|
79
|
+
score += lengthBoost * Math.min(freq, 5);
|
|
51
80
|
}
|
|
52
81
|
});
|
|
53
82
|
|
|
54
|
-
|
|
55
|
-
if (
|
|
56
|
-
score += 10;
|
|
57
|
-
}
|
|
83
|
+
// Code file boost
|
|
84
|
+
if (meta.isCode) score *= 1.2;
|
|
58
85
|
|
|
59
|
-
if (
|
|
60
|
-
|
|
61
|
-
}
|
|
86
|
+
if (score > 0) chunkScores.set(idx, score);
|
|
87
|
+
}
|
|
62
88
|
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
}
|
|
66
|
-
});
|
|
89
|
+
const entries = Array.from(chunkScores.entries()).sort((a, b) => b[1] - a[1]);
|
|
90
|
+
const maxScore = entries.length > 0 ? entries[0][1] : 1;
|
|
67
91
|
|
|
68
|
-
const results =
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
}))
|
|
74
|
-
.filter(r => r.score > 0)
|
|
75
|
-
.sort((a, b) => b._rawScore - a._rawScore);
|
|
92
|
+
const results = entries.map(([idx, score]) => ({
|
|
93
|
+
...chunks[idx],
|
|
94
|
+
score: score / maxScore,
|
|
95
|
+
_rawScore: score,
|
|
96
|
+
}));
|
|
76
97
|
|
|
77
98
|
return results;
|
|
78
99
|
}
|
|
@@ -80,20 +101,24 @@ export function searchText(query, chunks, indexData) {
|
|
|
80
101
|
function tokenize(text) {
|
|
81
102
|
const tokens = new Set();
|
|
82
103
|
|
|
83
|
-
text.
|
|
104
|
+
text.split(/\s+/).forEach(word => {
|
|
84
105
|
if (word.length === 0) return;
|
|
85
106
|
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
const camelCaseTokens = word.match(/[a-z]+|[A-Z][a-z]*|[0-9]+/g) || [];
|
|
107
|
+
// camelCase/PascalCase split BEFORE lowercasing so uppercase boundaries are visible
|
|
108
|
+
const camelCaseTokens = word.match(/[A-Z]?[a-z]+|[A-Z]+(?=[A-Z][a-z]|\d|\W|$)|[0-9]+/g) || [];
|
|
89
109
|
camelCaseTokens.forEach(t => {
|
|
90
110
|
if (t.length > 1) tokens.add(t.toLowerCase());
|
|
91
111
|
});
|
|
92
112
|
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
113
|
+
// snake_case and kebab-case split
|
|
114
|
+
word.split(/[-_.]/).forEach(t => {
|
|
115
|
+
const cleaned = t.replace(/[^\w]/g, '').toLowerCase();
|
|
116
|
+
if (cleaned.length > 1) tokens.add(cleaned);
|
|
96
117
|
});
|
|
118
|
+
|
|
119
|
+
// Full word lowercased (stripped of punctuation)
|
|
120
|
+
const cleaned = word.replace(/[^\w]/g, '').toLowerCase();
|
|
121
|
+
if (cleaned.length > 1) tokens.add(cleaned);
|
|
97
122
|
});
|
|
98
123
|
|
|
99
124
|
return Array.from(tokens).filter(t => t.length > 1);
|