smart-coding-mcp 1.3.0 → 1.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -1
- package/features/clear-cache.js +30 -7
- package/features/index-codebase.js +66 -13
- package/index.js +1 -1
- package/lib/cache.js +5 -0
- package/lib/config.js +2 -1
- package/package.json +6 -1
- package/test/clear-cache.test.js +288 -0
- package/test/embedding-model.test.js +230 -0
- package/test/helpers.js +128 -0
- package/test/hybrid-search.test.js +243 -0
- package/test/index-codebase.test.js +246 -0
- package/test/integration.test.js +223 -0
- package/test/tokenizer.test.js +225 -0
- package/vitest.config.js +29 -0
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for CodebaseIndexer feature
|
|
3
|
+
*
|
|
4
|
+
* Tests the indexing functionality including:
|
|
5
|
+
* - File discovery and filtering
|
|
6
|
+
* - Chunk generation and embedding
|
|
7
|
+
* - Concurrent indexing protection
|
|
8
|
+
* - Force reindex behavior
|
|
9
|
+
* - Progress notifications
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import { describe, it, expect, beforeAll, afterAll, beforeEach } from 'vitest';
|
|
13
|
+
import {
|
|
14
|
+
createTestFixtures,
|
|
15
|
+
cleanupFixtures,
|
|
16
|
+
clearTestCache,
|
|
17
|
+
createMockRequest,
|
|
18
|
+
measureTime
|
|
19
|
+
} from './helpers.js';
|
|
20
|
+
import * as IndexCodebaseFeature from '../features/index-codebase.js';
|
|
21
|
+
import { CodebaseIndexer } from '../features/index-codebase.js';
|
|
22
|
+
|
|
23
|
+
/**
 * Exercises CodebaseIndexer directly against shared test fixtures:
 * forced and incremental indexing, the isIndexing guard against
 * overlapping runs, file discovery filters, and worker start-up/teardown.
 */
describe('CodebaseIndexer', () => {
  let fixtures;

  // Gives a just-started indexAll() run time to get going before the test
  // inspects the indexer.
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  // Clears the test cache through the helper and empties the in-memory
  // vector store and file hashes held by the fixture cache.
  const resetIndexState = async () => {
    await clearTestCache(fixtures.config);
    fixtures.cache.setVectorStore([]);
    fixtures.cache.fileHashes = new Map();
  };

  beforeAll(async () => {
    fixtures = await createTestFixtures({ workerThreads: 2 });
  });

  afterAll(async () => {
    await cleanupFixtures(fixtures);
  });

  beforeEach(async () => {
    // Reset state
    fixtures.indexer.isIndexing = false;
    fixtures.indexer.terminateWorkers();
  });

  describe('Basic Indexing', () => {
    it('should index files and create embeddings', async () => {
      // Clear cache first
      await resetIndexState();

      // Run indexing
      const result = await fixtures.indexer.indexAll(true);

      // Should have processed files
      expect(result.skipped).toBe(false);
      expect(result.filesProcessed).toBeGreaterThan(0);
      expect(result.chunksCreated).toBeGreaterThan(0);
      expect(result.totalFiles).toBeGreaterThan(0);
      expect(result.totalChunks).toBeGreaterThan(0);
    });

    it('should skip unchanged files on subsequent indexing', async () => {
      // First index
      await fixtures.indexer.indexAll(true);

      // Second index without force
      const result = await fixtures.indexer.indexAll(false);

      // The run itself is not reported as skipped; it simply finds
      // nothing to process because the files are unchanged.
      expect(result.skipped).toBe(false);
      expect(result.filesProcessed).toBe(0);
      expect(result.message).toContain('up to date');
    });

    it('should reindex all files when force is true', async () => {
      // First index
      await fixtures.indexer.indexAll(true);

      // Force reindex
      const result = await fixtures.indexer.indexAll(true);

      // Should have processed all files again
      expect(result.filesProcessed).toBeGreaterThan(0);
      expect(result.chunksCreated).toBeGreaterThan(0);
    });
  });

  describe('Concurrent Indexing Protection', () => {
    it('should prevent concurrent indexing', async () => {
      // Clear for clean state
      await resetIndexState();

      // Start first indexing
      const firstRun = fixtures.indexer.indexAll(true);

      try {
        // Wait for it to start
        await pause(100);
        expect(fixtures.indexer.isIndexing).toBe(true);

        // Second call should be skipped
        const secondResult = await fixtures.indexer.indexAll(false);

        expect(secondResult.skipped).toBe(true);
        expect(secondResult.reason).toContain('already in progress');
      } finally {
        // Always wait for the background run, even when an assertion above
        // fails; otherwise it would keep indexing while the next test's
        // beforeEach resets isIndexing.
        await firstRun;
      }
    });

    it('should set and clear isIndexing flag correctly', async () => {
      // Clear cache to ensure indexing actually runs
      await resetIndexState();

      expect(fixtures.indexer.isIndexing).toBe(false);

      const run = fixtures.indexer.indexAll(true);

      try {
        // Should be set during indexing
        await pause(100);
        expect(fixtures.indexer.isIndexing).toBe(true);
      } finally {
        // Keep the run from outliving this test if the assertion fails.
        await run;
      }

      // Should be cleared after indexing
      expect(fixtures.indexer.isIndexing).toBe(false);
    });
  });

  describe('File Discovery', () => {
    it('should discover files matching configured extensions', async () => {
      const files = await fixtures.indexer.discoverFiles();

      expect(files.length).toBeGreaterThan(0);

      // All files should have valid extensions
      const extensions = fixtures.config.fileExtensions.map((ext) => `.${ext}`);
      for (const file of files) {
        const ext = file.substring(file.lastIndexOf('.'));
        expect(extensions).toContain(ext);
      }
    });

    it('should exclude files in excluded directories', async () => {
      const files = await fixtures.indexer.discoverFiles();

      // No files from node_modules
      const nodeModulesFiles = files.filter((f) => f.includes('node_modules'));
      expect(nodeModulesFiles.length).toBe(0);

      // No files from .smart-coding-cache
      const cacheFiles = files.filter((f) => f.includes('.smart-coding-cache'));
      expect(cacheFiles.length).toBe(0);
    });
  });

  describe('Worker Thread Management', () => {
    it('should initialize workers when CPU count > 1', async () => {
      await fixtures.indexer.initializeWorkers();

      // NOTE(review): `>= 0` holds for any length, so this only proves that
      // initializeWorkers() resolves and `workers` is populated or empty.
      // Tighten to `>= 1` once it is confirmed that workers always start
      // with workerThreads: 2 in the test environment.
      expect(fixtures.indexer.workers.length).toBeGreaterThanOrEqual(0);

      fixtures.indexer.terminateWorkers();
      expect(fixtures.indexer.workers.length).toBe(0);
    });
  });
});
|
|
168
|
+
|
|
169
|
+
/**
 * Tests the MCP-facing surface of the index feature: the tool definition
 * and the text responses produced by handleToolCall for completed,
 * concurrent and non-forced requests.
 */
describe('Index Codebase Tool Handler', () => {
  let fixtures;

  // Builds a mock b_index_codebase request and sends it through the handler.
  const callIndexTool = (args) =>
    IndexCodebaseFeature.handleToolCall(
      createMockRequest('b_index_codebase', args),
      fixtures.indexer
    );

  beforeAll(async () => {
    fixtures = await createTestFixtures({ workerThreads: 2 });
  });

  afterAll(async () => {
    await cleanupFixtures(fixtures);
  });

  beforeEach(async () => {
    fixtures.indexer.isIndexing = false;
  });

  describe('Tool Definition', () => {
    it('should have correct tool definition', () => {
      const toolDef = IndexCodebaseFeature.getToolDefinition();

      expect(toolDef.name).toBe('b_index_codebase');
      expect(toolDef.description).toContain('reindex');
      expect(toolDef.inputSchema.properties.force).toBeDefined();
      expect(toolDef.inputSchema.properties.force.type).toBe('boolean');
    });
  });

  describe('Tool Handler', () => {
    it('should return success message on completed indexing', async () => {
      const result = await callIndexTool({ force: false });
      const { text } = result.content[0];

      expect(text).toContain('reindexed successfully');
      expect(text).toContain('Total files in index');
      expect(text).toContain('Total code chunks');
    });

    it('should return skipped message on concurrent calls', async () => {
      // Empty the cache before starting the first (forced) call
      await clearTestCache(fixtures.config);
      fixtures.cache.setVectorStore([]);
      fixtures.cache.fileHashes = new Map();

      // Start first indexing
      const firstCall = callIndexTool({ force: true });

      try {
        await new Promise((resolve) => setTimeout(resolve, 100));

        // Second concurrent call
        const secondResult = await callIndexTool({ force: false });

        expect(secondResult.content[0].text).toContain('Indexing skipped');
        expect(secondResult.content[0].text).toContain('already in progress');
      } finally {
        // Wait for the first call even when an assertion fails, so the
        // background run cannot overlap the next test or fixture cleanup.
        await firstCall;
      }
    });

    it('should handle force parameter correctly', async () => {
      // First index
      await callIndexTool({ force: true });

      // Non-force should skip unchanged
      const result = await callIndexTool({ force: false });

      expect(result.content[0].text).toContain('up to date');
    });
  });
});
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Integration tests for cross-feature interactions
|
|
3
|
+
*
|
|
4
|
+
* Tests scenarios that involve multiple features working together:
|
|
5
|
+
* 1. Concurrent indexing protection across MCP tool calls
|
|
6
|
+
* 2. Clear cache interaction with indexing
|
|
7
|
+
* 3. Tool handler response quality
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import { describe, it, expect, beforeAll, afterAll, beforeEach } from 'vitest';
|
|
11
|
+
import {
|
|
12
|
+
createTestFixtures,
|
|
13
|
+
cleanupFixtures,
|
|
14
|
+
clearTestCache,
|
|
15
|
+
createMockRequest,
|
|
16
|
+
measureTime
|
|
17
|
+
} from './helpers.js';
|
|
18
|
+
import * as IndexCodebaseFeature from '../features/index-codebase.js';
|
|
19
|
+
import * as ClearCacheFeature from '../features/clear-cache.js';
|
|
20
|
+
|
|
21
|
+
/**
 * Integration tests for overlapping indexing requests, issued both through
 * the MCP tool handler and directly on the indexer.
 */
describe('Concurrent Indexing', () => {
  let fixtures;

  // Gives a just-started indexing run time to get going before it is probed.
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  beforeAll(async () => {
    fixtures = await createTestFixtures({ workerThreads: 2 });
  });

  afterAll(async () => {
    await cleanupFixtures(fixtures);
  });

  beforeEach(async () => {
    // Reset indexing state
    fixtures.indexer.isIndexing = false;
    // Clear cache for clean state
    await clearTestCache(fixtures.config);
    fixtures.cache.setVectorStore([]);
    fixtures.cache.fileHashes = new Map();
  });

  it('should only run one indexer at a time', async () => {
    const request1 = createMockRequest('b_index_codebase', { force: true });
    const request2 = createMockRequest('b_index_codebase', { force: false });

    // Start first indexing
    const firstCall = IndexCodebaseFeature.handleToolCall(request1, fixtures.indexer);

    try {
      // Wait a bit for first to start
      await pause(100);

      // Verify first is running
      expect(fixtures.indexer.isIndexing).toBe(true);

      // Start second indexing while first is running
      const secondCall = IndexCodebaseFeature.handleToolCall(request2, fixtures.indexer);

      // Wait for both to complete
      const [result1, result2] = await Promise.all([firstCall, secondCall]);

      // First should complete with stats
      expect(result1.content[0].text).toContain('reindexed successfully');
      expect(result1.content[0].text).toContain('Total files in index');

      // Second should clearly indicate it was skipped
      expect(result2.content[0].text).toContain('Indexing skipped');
      expect(result2.content[0].text).toContain('already in progress');
    } finally {
      // Already settled on the success path. If an early assertion fails,
      // this keeps the first run from continuing into the next test.
      await firstCall;
    }
  });

  it('should set isIndexing flag during indexing', async () => {
    // Check initial state
    expect(fixtures.indexer.isIndexing).toBe(false);

    // Start indexing
    const run = fixtures.indexer.indexAll(true);

    try {
      // Wait for it to start
      await pause(50);

      // Check flag is set
      expect(fixtures.indexer.isIndexing).toBe(true);
    } finally {
      // Wait for completion, whether or not the assertion passed
      await run;
    }

    // Check flag is cleared
    expect(fixtures.indexer.isIndexing).toBe(false);
  });

  it('should skip concurrent indexing calls gracefully', async () => {
    // Start first indexing
    const firstRun = fixtures.indexer.indexAll(true);

    try {
      await pause(50);

      // Second call should return immediately with skipped status
      const { result, duration } = await measureTime(() => fixtures.indexer.indexAll(false));

      // Second call should return very quickly (not run full indexing)
      expect(duration).toBeLessThan(100);

      // Should indicate it was skipped
      expect(result.skipped).toBe(true);
      expect(result.reason).toContain('already in progress');
    } finally {
      // Do not leave the first run unresolved when an assertion fails.
      await firstRun;
    }
  });
});
|
|
108
|
+
|
|
109
|
+
/**
 * Integration tests for how cache clearing interacts with indexing and
 * with other clear requests issued at the same time.
 */
describe('Clear Cache Operations', () => {
  let fixtures;

  // Sends a mock c_clear_cache request through the clear-cache tool handler.
  const clearViaTool = () =>
    ClearCacheFeature.handleToolCall(
      createMockRequest('c_clear_cache', {}),
      fixtures.cacheClearer
    );

  beforeAll(async () => {
    fixtures = await createTestFixtures({ workerThreads: 2 });
  });

  afterAll(async () => {
    await cleanupFixtures(fixtures);
  });

  beforeEach(async () => {
    fixtures.indexer.isIndexing = false;
  });

  it('should prevent clear cache while indexing', async () => {
    // Start indexing
    const indexPromise = fixtures.indexer.indexAll(true);

    try {
      await new Promise((resolve) => setTimeout(resolve, 50));

      // Try to clear cache
      const result = await clearViaTool();

      // Should fail with appropriate message
      expect(result.content[0].text).toContain('indexing is in progress');
    } finally {
      // Wait for the run even if the assertion fails, so indexing cannot
      // continue into the following tests.
      await indexPromise;
    }
  });

  it('should allow clear cache after indexing completes', async () => {
    // First index
    await fixtures.indexer.indexAll(true);

    // Verify indexing is done
    expect(fixtures.indexer.isIndexing).toBe(false);

    // Now clear cache
    const result = await clearViaTool();

    // Should succeed
    expect(result.content[0].text).toContain('Cache cleared successfully');
  });

  it('should clear cache immediately after indexing without crash', async () => {
    // This tests the race condition scenario
    await fixtures.indexer.indexAll(true);

    // Immediately clear (potential race with cache.save())
    const result = await fixtures.cacheClearer.execute();

    expect(result.success).toBe(true);
    expect(result.message).toContain('Cache cleared successfully');
  });

  it('should handle multiple concurrent clear cache calls', async () => {
    // First index to have something to clear
    await fixtures.indexer.indexAll(true);

    // Reset the isClearing flag
    fixtures.cacheClearer.isClearing = false;

    // Multiple concurrent clears - with new mutex, only first should succeed
    const outcomes = await Promise.allSettled([
      fixtures.cacheClearer.execute(),
      fixtures.cacheClearer.execute(),
      fixtures.cacheClearer.execute(),
    ]);

    // First should succeed, others should fail with "already in progress"
    const successes = outcomes.filter(({ status }) => status === 'fulfilled');
    const failures = outcomes.filter(({ status }) => status === 'rejected');

    expect(successes.length).toBe(1);
    expect(failures.length).toBe(2);

    // Verify failure message
    for (const { reason } of failures) {
      expect(reason.message).toContain('already in progress');
    }
  });
});
|
|
195
|
+
|
|
196
|
+
/**
 * Checks the wording of the handler response when an index request
 * arrives while another indexing run is still active.
 */
describe('Tool Handler Response Quality', () => {
  let fixtures;

  beforeAll(async () => {
    fixtures = await createTestFixtures({ workerThreads: 2 });
  });

  afterAll(async () => {
    await cleanupFixtures(fixtures);
  });

  it('should return meaningful response when indexing is skipped', async () => {
    // Kick off a forced run and give it a moment to start
    const runningIndex = fixtures.indexer.indexAll(true);
    await new Promise((done) => setTimeout(done, 50));

    // Issue the overlapping request through the tool handler
    const overlappingRequest = createMockRequest('b_index_codebase', { force: false });
    const response = await IndexCodebaseFeature.handleToolCall(
      overlappingRequest,
      fixtures.indexer
    );

    await runningIndex;

    // The reply must say what happened, why, and what to do next
    const { text } = response.content[0];
    for (const phrase of ['Indexing skipped', 'already in progress', 'Please wait']) {
      expect(text).toContain(phrase);
    }
  });
});
|
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for Tokenizer utilities
|
|
3
|
+
*
|
|
4
|
+
* Tests the token estimation and model-specific limits including:
|
|
5
|
+
* - Token estimation for various text types
|
|
6
|
+
* - Model token limits lookup
|
|
7
|
+
* - Chunking parameters calculation
|
|
8
|
+
* - Token limit checking
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { describe, it, expect } from 'vitest';
|
|
12
|
+
import {
|
|
13
|
+
estimateTokens,
|
|
14
|
+
getModelTokenLimit,
|
|
15
|
+
getChunkingParams,
|
|
16
|
+
exceedsTokenLimit,
|
|
17
|
+
MODEL_TOKEN_LIMITS
|
|
18
|
+
} from '../lib/tokenizer.js';
|
|
19
|
+
|
|
20
|
+
/**
 * Sanity checks for estimateTokens across empty input, plain words,
 * long words, punctuation-heavy text, code and multiline strings.
 */
describe('Token Estimation', () => {
  describe('estimateTokens', () => {
    it('should return 0 for empty string', () => {
      // Missing input is expected to be treated like an empty string
      for (const blank of ['', null, undefined]) {
        expect(estimateTokens(blank)).toBe(0);
      }
    });

    it('should count simple words correctly', () => {
      // Simple words get ~1 token each + 2 for CLS/SEP
      const estimate = estimateTokens('hello world');

      expect(estimate).toBeGreaterThanOrEqual(4); // 2 words + 2 special tokens
      expect(estimate).toBeLessThanOrEqual(6);
    });

    it('should add extra tokens for long words', () => {
      const shortEstimate = estimateTokens('cat');
      const longEstimate = estimateTokens('internationalization');

      // Long words should have more tokens due to subword splitting
      expect(longEstimate).toBeGreaterThan(shortEstimate);
    });

    it('should count special characters', () => {
      const plainEstimate = estimateTokens('hello world');
      const punctuatedEstimate = estimateTokens('hello(); world{}');

      // Special characters add to token count
      expect(punctuatedEstimate).toBeGreaterThan(plainEstimate);
    });

    it('should handle code snippets', () => {
      const snippet = `
        function test() {
          const x = 10;
          return x * 2;
        }
      `;

      const estimate = estimateTokens(snippet);

      // Code has many special chars, should have reasonable token count
      expect(estimate).toBeGreaterThan(10);
      expect(estimate).toBeLessThan(100);
    });

    it('should handle multiline text', () => {
      const estimate = estimateTokens('line one\nline two\nline three');

      expect(estimate).toBeGreaterThan(5);
    });
  });
});
|
|
74
|
+
|
|
75
|
+
/**
 * Verifies the per-model token limit table and the getModelTokenLimit
 * lookup, including its fallback for unknown or missing model names.
 */
describe('Model Token Limits', () => {
  // Asserts that every listed model maps to the same limit in the table.
  const expectTableLimit = (models, limit) => {
    for (const model of models) {
      expect(MODEL_TOKEN_LIMITS[model]).toBe(limit);
    }
  };

  describe('MODEL_TOKEN_LIMITS', () => {
    it('should have default limit', () => {
      const fallback = MODEL_TOKEN_LIMITS.default;

      expect(fallback).toBeDefined();
      expect(fallback).toBe(256);
    });

    it('should have limits for MiniLM models', () => {
      expectTableLimit(['Xenova/all-MiniLM-L6-v2', 'Xenova/all-MiniLM-L12-v2'], 256);
    });

    it('should have limits for code-specific models', () => {
      expectTableLimit(['Xenova/codebert-base', 'Xenova/graphcodebert-base'], 512);
    });

    it('should have limits for E5 and BGE models', () => {
      expectTableLimit(['Xenova/e5-small-v2', 'Xenova/bge-base-en-v1.5'], 512);
    });
  });

  describe('getModelTokenLimit', () => {
    it('should return correct limit for known models', () => {
      expect(getModelTokenLimit('Xenova/all-MiniLM-L6-v2')).toBe(256);
      expect(getModelTokenLimit('Xenova/codebert-base')).toBe(512);
    });

    it('should return default for unknown models', () => {
      expect(getModelTokenLimit('unknown/model-name')).toBe(256);
    });

    it('should return default for null/undefined', () => {
      for (const missing of [null, undefined]) {
        expect(getModelTokenLimit(missing)).toBe(256);
      }
    });

    it('should be case-insensitive', () => {
      const canonical = getModelTokenLimit('Xenova/all-MiniLM-L6-v2');
      const lowered = getModelTokenLimit('xenova/all-minilm-l6-v2');

      expect(lowered).toBe(canonical);
    });
  });
});
|
|
121
|
+
|
|
122
|
+
/**
 * Verifies the chunk sizing values returned by getChunkingParams:
 * the model maximum, the target size and the overlap between chunks.
 */
describe('Chunking Parameters', () => {
  describe('getChunkingParams', () => {
    it('should return correct params for default model', () => {
      const { maxTokens, targetTokens, overlapTokens } =
        getChunkingParams('Xenova/all-MiniLM-L6-v2');

      expect(maxTokens).toBe(256);
      expect(targetTokens).toBeLessThan(256); // 85% of max
      expect(targetTokens).toBeGreaterThan(200);
      expect(overlapTokens).toBeLessThan(targetTokens);
    });

    it('should calculate ~85% for target tokens', () => {
      const { targetTokens } = getChunkingParams('Xenova/codebert-base'); // 512 limit

      // 85% of 512 = 435.2 -> floor = 435
      const expectedTarget = Math.floor(512 * 0.85);
      expect(targetTokens).toBe(expectedTarget);
    });

    it('should calculate ~18% overlap', () => {
      const { targetTokens, overlapTokens } = getChunkingParams('Xenova/all-MiniLM-L6-v2');

      // Overlap is expected to be the floored 18% share of the target size
      expect(overlapTokens).toBe(Math.floor(targetTokens * 0.18));
    });

    it('should return all three parameters', () => {
      const params = getChunkingParams('Xenova/all-MiniLM-L6-v2');

      for (const key of ['maxTokens', 'targetTokens', 'overlapTokens']) {
        expect(params).toHaveProperty(key);
      }
    });

    it('should handle unknown models with defaults', () => {
      const { maxTokens, targetTokens } = getChunkingParams('unknown/model');

      expect(maxTokens).toBe(256);
      expect(targetTokens).toBeLessThan(256);
    });
  });
});
|
|
163
|
+
|
|
164
|
+
/**
 * Verifies exceedsTokenLimit for short, long, mid-sized and empty text
 * against models with different token limits.
 */
describe('Token Limit Checking', () => {
  // Model names used throughout; the tests treat them as 256- and 512-token models.
  const SMALL_MODEL = 'Xenova/all-MiniLM-L6-v2';
  const LARGE_MODEL = 'Xenova/codebert-base';

  describe('exceedsTokenLimit', () => {
    it('should return false for short text', () => {
      expect(exceedsTokenLimit('hello world', SMALL_MODEL)).toBe(false);
    });

    it('should return true for very long text', () => {
      // Create text that definitely exceeds 256 tokens
      const oversized = 'word '.repeat(500);

      expect(exceedsTokenLimit(oversized, SMALL_MODEL)).toBe(true);
    });

    it('should consider different model limits', () => {
      // Create text that exceeds 256 but not 512
      const midSized = 'word '.repeat(300);

      // Should exceed small model limit
      const overSmall = exceedsTokenLimit(midSized, SMALL_MODEL);
      expect(overSmall).toBe(true);

      // Should not exceed large model limit
      const overLarge = exceedsTokenLimit(midSized, LARGE_MODEL);
      expect(overLarge).toBe(false);
    });

    it('should handle empty text', () => {
      expect(exceedsTokenLimit('', SMALL_MODEL)).toBe(false);
    });
  });
});
|
|
193
|
+
|
|
194
|
+
/**
 * End-to-end plausibility checks: estimates for realistic code stay in a
 * sensible range and small chunks stay under the model limit.
 */
describe('Integration: Token Estimation Accuracy', () => {
  it('should estimate reasonable tokens for typical code chunks', () => {
    const sampleChunk = `
      import { pipeline } from '@xenova/transformers';

      export class MyClass {
        constructor(config) {
          this.config = config;
          this.data = [];
        }

        async process(input) {
          const result = await this.transform(input);
          return result.map(item => item.value);
        }
      }
    `;

    const estimate = estimateTokens(sampleChunk);

    // Should be within typical chunk size
    expect(estimate).toBeGreaterThan(30);
    expect(estimate).toBeLessThan(200);
  });

  it('should keep small code chunks under model limits', () => {
    // A small chunk should definitely be under the limit
    const tenShortLines = 'const x = 1;\n'.repeat(10);
    const overLimit = exceedsTokenLimit(tenShortLines, 'Xenova/all-MiniLM-L6-v2');

    expect(overLimit).toBe(false);
  });
});
|