bluera-knowledge 0.12.10 → 0.13.0
- package/.claude-plugin/plugin.json +1 -1
- package/CHANGELOG.md +2 -0
- package/README.md +3 -2
- package/dist/{chunk-VTATT3IR.js → chunk-6ZVW2P2F.js} +100 -38
- package/dist/chunk-6ZVW2P2F.js.map +1 -0
- package/dist/{chunk-6777ULXC.js → chunk-GCUKVV33.js} +2 -2
- package/dist/{chunk-JET33NMA.js → chunk-H5AKKHY7.js} +3 -2
- package/dist/chunk-H5AKKHY7.js.map +1 -0
- package/dist/index.js +3 -3
- package/dist/mcp/server.js +2 -2
- package/dist/workers/background-worker-cli.js +2 -2
- package/package.json +1 -1
- package/src/mcp/handlers/job.handler.ts +5 -0
- package/src/services/index.service.test.ts +347 -0
- package/src/services/index.service.ts +93 -44
- package/src/services/job.service.test.ts +87 -0
- package/src/services/job.service.ts +43 -0
- package/dist/chunk-JET33NMA.js.map +0 -1
- package/dist/chunk-VTATT3IR.js.map +0 -1
- package/dist/{chunk-6777ULXC.js.map → chunk-GCUKVV33.js.map} +0 -0
package/src/services/index.service.test.ts

```diff
@@ -1952,3 +1952,350 @@ describe('IndexService - Symlink Handling', () => {
     // (on most systems, readdir with withFileTypes shows symlinks as isFile() if target is file)
   });
 });
+
+describe('IndexService - Batch Embedding', () => {
+  let indexService: IndexService;
+  let lanceStore: LanceStore;
+  let embeddingEngine: EmbeddingEngine;
+  let tempDir: string;
+  let testFilesDir: string;
+  const storeId = createStoreId('batch-embed-test');
+
+  beforeAll(async () => {
+    tempDir = await mkdtemp(join(tmpdir(), 'index-batch-embed-test-'));
+    testFilesDir = join(tempDir, 'files');
+    await mkdir(testFilesDir, { recursive: true });
+
+    lanceStore = new LanceStore(tempDir);
+    embeddingEngine = new EmbeddingEngine();
+
+    await embeddingEngine.initialize();
+    await lanceStore.initialize(storeId);
+
+    indexService = new IndexService(lanceStore, embeddingEngine);
+  }, 120000);
+
+  afterAll(async () => {
+    await rm(tempDir, { recursive: true, force: true });
+  });
+
+  beforeEach(async () => {
+    // Clear test directory for fresh state
+    await rm(testFilesDir, { recursive: true, force: true });
+    await mkdir(testFilesDir, { recursive: true });
+  });
+
+  it('calls embedBatch instead of sequential embed for multiple chunks', async () => {
+    // Create a file large enough to produce multiple chunks (>1500 chars)
+    const largeContent = Array(50)
+      .fill('This is a paragraph of text that will be chunked. ')
+      .join('\n\n');
+    await writeFile(join(testFilesDir, 'large.md'), largeContent);
+
+    const embedBatchSpy = vi.spyOn(embeddingEngine, 'embedBatch');
+
+    const store: FileStore = {
+      type: 'file',
+      id: storeId,
+      name: 'Batch Embed Test Store',
+      path: testFilesDir,
+      createdAt: new Date(),
+      updatedAt: new Date(),
+    };
+
+    const result = await indexService.indexStore(store);
+
+    expect(result.success).toBe(true);
+    if (result.success) {
+      // Should have created multiple chunks
+      expect(result.data.chunksCreated).toBeGreaterThan(1);
+    }
+
+    // embedBatch should be called (it internally uses embed via Promise.all)
+    expect(embedBatchSpy).toHaveBeenCalled();
+    // Verify batch was called with multiple items
+    const callArgs = embedBatchSpy.mock.calls[0];
+    expect(callArgs).toBeDefined();
+    expect(callArgs[0].length).toBeGreaterThan(1);
+
+    embedBatchSpy.mockRestore();
+  });
+
+  it('preserves chunk order when using batch embedding', async () => {
+    // Create file with distinct, ordered sections that will produce multiple chunks
+    const sections = Array(10)
+      .fill(null)
+      .map((_, i) => `# Section ${String(i + 1)}\n\n${'Content for section. '.repeat(50)}`)
+      .join('\n\n');
+
+    await writeFile(join(testFilesDir, 'ordered.md'), sections);
+
+    const embedBatchSpy = vi.spyOn(embeddingEngine, 'embedBatch');
+
+    const store: FileStore = {
+      type: 'file',
+      id: storeId,
+      name: 'Order Test Store',
+      path: testFilesDir,
+      createdAt: new Date(),
+      updatedAt: new Date(),
+    };
+
+    const result = await indexService.indexStore(store);
+
+    expect(result.success).toBe(true);
+    if (result.success) {
+      // Verify chunks are in correct order
+      expect(result.data.chunksCreated).toBeGreaterThan(1);
+    }
+
+    // embedBatch should be called with chunks in order
+    expect(embedBatchSpy).toHaveBeenCalled();
+    const callArgs = embedBatchSpy.mock.calls[0];
+    expect(callArgs).toBeDefined();
+
+    // Verify that if content has "Section 1", it comes before "Section 2" in the array
+    const batchedTexts = callArgs[0];
+    const section1Index = batchedTexts.findIndex((t: string) => t.includes('Section 1'));
+    const section2Index = batchedTexts.findIndex((t: string) => t.includes('Section 2'));
+
+    // Section 1 should appear before Section 2 in the batch (or they may be in different chunks)
+    if (section1Index !== -1 && section2Index !== -1) {
+      expect(section1Index).toBeLessThan(section2Index);
+    }
+
+    embedBatchSpy.mockRestore();
+  });
+
+  it('handles single-chunk files correctly', async () => {
+    // Create a small file that won't be chunked
+    await writeFile(join(testFilesDir, 'small.ts'), 'export const x = 42;');
+
+    const embedBatchSpy = vi.spyOn(embeddingEngine, 'embedBatch');
+
+    const store: FileStore = {
+      type: 'file',
+      id: storeId,
+      name: 'Single Chunk Test Store',
+      path: testFilesDir,
+      createdAt: new Date(),
+      updatedAt: new Date(),
+    };
+
+    const result = await indexService.indexStore(store);
+
+    expect(result.success).toBe(true);
+    if (result.success) {
+      // Should have exactly one chunk
+      expect(result.data.chunksCreated).toBe(1);
+    }
+
+    // embedBatch should still be called (with a single item)
+    expect(embedBatchSpy).toHaveBeenCalled();
+
+    embedBatchSpy.mockRestore();
+  });
+
+  it('handles multiple files with batch embedding', async () => {
+    // Create multiple files to verify batch embedding works across files
+    await writeFile(join(testFilesDir, 'file1.ts'), 'export const a = 1;');
+    await writeFile(join(testFilesDir, 'file2.ts'), 'export const b = 2;');
+    await writeFile(join(testFilesDir, 'file3.ts'), 'export const c = 3;');
+
+    const embedBatchSpy = vi.spyOn(embeddingEngine, 'embedBatch');
+
+    const multiStoreId = createStoreId('multi-file-test');
+    await lanceStore.initialize(multiStoreId);
+
+    const multiIndexService = new IndexService(lanceStore, embeddingEngine);
+
+    const store: FileStore = {
+      type: 'file',
+      id: multiStoreId,
+      name: 'Multi File Test Store',
+      path: testFilesDir,
+      createdAt: new Date(),
+      updatedAt: new Date(),
+    };
+
+    const result = await multiIndexService.indexStore(store);
+
+    expect(result.success).toBe(true);
+    if (result.success) {
+      // Should index all 3 files
+      expect(result.data.documentsIndexed).toBe(3);
+    }
+
+    // embedBatch should be called once per file (3 times)
+    expect(embedBatchSpy).toHaveBeenCalledTimes(3);
+
+    embedBatchSpy.mockRestore();
+  });
+});
+
+describe('IndexService - Parallel File Processing', () => {
+  let lanceStore: LanceStore;
+  let embeddingEngine: EmbeddingEngine;
+  let tempDir: string;
+  let testFilesDir: string;
+  const storeId = createStoreId('parallel-test');
+
+  beforeAll(async () => {
+    tempDir = await mkdtemp(join(tmpdir(), 'index-parallel-test-'));
+    testFilesDir = join(tempDir, 'files');
+    await mkdir(testFilesDir, { recursive: true });
+
+    lanceStore = new LanceStore(tempDir);
+    embeddingEngine = new EmbeddingEngine();
+
+    await embeddingEngine.initialize();
+    await lanceStore.initialize(storeId);
+  }, 120000);
+
+  afterAll(async () => {
+    await rm(tempDir, { recursive: true, force: true });
+  });
+
+  beforeEach(async () => {
+    // Clear test directory for fresh state
+    await rm(testFilesDir, { recursive: true, force: true });
+    await mkdir(testFilesDir, { recursive: true });
+  });
+
+  it('uses concurrency option from IndexService constructor', async () => {
+    // Create 10 test files
+    for (let i = 0; i < 10; i++) {
+      await writeFile(
+        join(testFilesDir, `file${String(i)}.ts`),
+        `export const x${String(i)} = ${String(i)};`
+      );
+    }
+
+    // Track when files start being processed
+    const processingTimestamps: number[] = [];
+    const originalEmbedBatch = embeddingEngine.embedBatch.bind(embeddingEngine);
+
+    vi.spyOn(embeddingEngine, 'embedBatch').mockImplementation(async (texts: string[]) => {
+      processingTimestamps.push(Date.now());
+      // Small delay to simulate processing time
+      await new Promise((resolve) => setTimeout(resolve, 50));
+      return originalEmbedBatch(texts);
+    });
+
+    const concurrency = 4;
+    const indexService = new IndexService(lanceStore, embeddingEngine, { concurrency });
+
+    const parallelStoreId = createStoreId('parallel-concurrency-test');
+    await lanceStore.initialize(parallelStoreId);
+
+    const store: FileStore = {
+      type: 'file',
+      id: parallelStoreId,
+      name: 'Parallel Test Store',
+      path: testFilesDir,
+      createdAt: new Date(),
+      updatedAt: new Date(),
+    };
+
+    const result = await indexService.indexStore(store);
+
+    expect(result.success).toBe(true);
+    if (result.success) {
+      expect(result.data.documentsIndexed).toBe(10);
+    }
+
+    vi.restoreAllMocks();
+  });
+
+  it('reports progress correctly with parallel processing', async () => {
+    // Create test files
+    for (let i = 0; i < 5; i++) {
+      await writeFile(
+        join(testFilesDir, `progress${String(i)}.ts`),
+        `export const p${String(i)} = ${String(i)};`
+      );
+    }
+
+    const concurrency = 2;
+    const indexService = new IndexService(lanceStore, embeddingEngine, { concurrency });
+
+    const progressStoreId = createStoreId('progress-test');
+    await lanceStore.initialize(progressStoreId);
+
+    const store: FileStore = {
+      type: 'file',
+      id: progressStoreId,
+      name: 'Progress Test Store',
+      path: testFilesDir,
+      createdAt: new Date(),
+      updatedAt: new Date(),
+    };
+
+    const progressEvents: Array<{ type: string; current: number; total: number }> = [];
+    const onProgress = (event: { type: string; current: number; total: number }): void => {
+      progressEvents.push(event);
+    };
+
+    const result = await indexService.indexStore(store, onProgress);
+
+    expect(result.success).toBe(true);
+
+    // Should have start event
+    expect(progressEvents.some((e) => e.type === 'start')).toBe(true);
+
+    // Should have progress events
+    const progressOnly = progressEvents.filter((e) => e.type === 'progress');
+    expect(progressOnly.length).toBeGreaterThan(0);
+
+    // Should have complete event
+    expect(progressEvents.some((e) => e.type === 'complete')).toBe(true);
+
+    // Current should never exceed total
+    for (const event of progressEvents) {
+      expect(event.current).toBeLessThanOrEqual(event.total);
+    }
+  });
+
+  it('continues processing remaining files if one file fails to read', async () => {
+    // Create valid test files
+    await writeFile(join(testFilesDir, 'valid1.ts'), 'export const a = 1;');
+    await writeFile(join(testFilesDir, 'valid2.ts'), 'export const b = 2;');
+    await writeFile(join(testFilesDir, 'valid3.ts'), 'export const c = 3;');
+
+    // Create a file that will fail to read (remove read permission)
+    const unreadablePath = join(testFilesDir, 'unreadable.ts');
+    await writeFile(unreadablePath, 'export const x = 999;');
+    await chmod(unreadablePath, 0o000);
+
+    const concurrency = 2;
+    const indexService = new IndexService(lanceStore, embeddingEngine, { concurrency });
+
+    const errorStoreId = createStoreId('error-handling-test');
+    await lanceStore.initialize(errorStoreId);
+
+    const store: FileStore = {
+      type: 'file',
+      id: errorStoreId,
+      name: 'Error Handling Test Store',
+      path: testFilesDir,
+      createdAt: new Date(),
+      updatedAt: new Date(),
+    };
+
+    // The indexing should either succeed with partial results or fail gracefully
+    const result = await indexService.indexStore(store);
+
+    // Restore permissions for cleanup
+    await chmod(unreadablePath, 0o644);
+
+    // With current implementation, it may fail completely on the first error
+    // This test documents the current behavior
+    if (result.success) {
+      // If it succeeds, it should have indexed at least the valid files
+      expect(result.data.documentsIndexed).toBeGreaterThanOrEqual(0);
+    } else {
+      // If it fails, it should have an error
+      expect(result.error).toBeDefined();
+    }
+  });
+});
```
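These suites lean on one assumption about the embedding engine: `embedBatch` accepts an array of chunk texts and resolves to one vector per input, in input order (a test comment notes it fans out to `embed` via `Promise.all`). A minimal illustrative stand-in for that contract, not the package's actual `EmbeddingEngine`:

```typescript
// Illustrative stand-in for the embedBatch contract the tests above rely on.
// The real EmbeddingEngine in this package runs an actual model; the vector
// values here are placeholders.
interface BatchEmbedder {
  embed(text: string): Promise<number[]>;
  embedBatch(texts: string[]): Promise<number[][]>;
}

class SketchEmbedder implements BatchEmbedder {
  async embed(text: string): Promise<number[]> {
    // Placeholder "embedding": fixed-size vector derived from the text length.
    return Array.from({ length: 8 }, (_, i) => ((text.length + i) % 97) / 97);
  }

  // Promise.all preserves input order, which is what lets a caller zip
  // chunks[i] with vectors[i] after the batch resolves.
  async embedBatch(texts: string[]): Promise<number[][]> {
    return Promise.all(texts.map((t) => this.embed(t)));
  }
}
```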
package/src/services/index.service.ts

```diff
@@ -25,6 +25,7 @@ interface IndexOptions {
   chunkSize?: number;
   chunkOverlap?: number;
   codeGraphService?: CodeGraphService;
+  concurrency?: number;
 }
 
 const TEXT_EXTENSIONS = new Set([
@@ -62,6 +63,7 @@ export class IndexService {
   private readonly embeddingEngine: EmbeddingEngine;
   private readonly chunker: ChunkingService;
   private readonly codeGraphService: CodeGraphService | undefined;
+  private readonly concurrency: number;
 
   constructor(
     lanceStore: LanceStore,
@@ -75,6 +77,7 @@ export class IndexService {
       chunkOverlap: options.chunkOverlap ?? 100,
     });
     this.codeGraphService = options.codeGraphService;
+    this.concurrency = options.concurrency ?? 4;
   }
 
   async indexStore(store: Store, onProgress?: ProgressCallback): Promise<Result<IndexResult>> {
@@ -123,6 +126,7 @@ export class IndexService {
         storeId: store.id,
         path: store.path,
         fileCount: files.length,
+        concurrency: this.concurrency,
       },
       'Files scanned for indexing'
     );
@@ -138,59 +142,30 @@ export class IndexService {
       message: 'Starting index',
     });
 
-
-
-    const
-      // Pass file path for semantic Markdown chunking
-      const chunks = this.chunker.chunk(content, filePath);
+    // Process files in parallel batches
+    for (let i = 0; i < files.length; i += this.concurrency) {
+      const batch = files.slice(i, i + this.concurrency);
 
-
-
-
-      const fileType = this.classifyFileType(ext, fileName, filePath);
+      const batchResults = await Promise.all(
+        batch.map((filePath) => this.processFile(filePath, store))
+      );
 
-      // Collect
-
-
+      // Collect results from batch
+      for (const result of batchResults) {
+        documents.push(...result.documents);
+        if (result.sourceFile !== undefined) {
+          sourceFiles.push(result.sourceFile);
+        }
       }
 
-
-        const vector = await this.embeddingEngine.embed(chunk.content);
-        const chunkId =
-          chunks.length > 1
-            ? `${store.id}-${fileHash}-${String(chunk.chunkIndex)}`
-            : `${store.id}-${fileHash}`;
-
-        const doc: Document = {
-          id: createDocumentId(chunkId),
-          content: chunk.content,
-          vector,
-          metadata: {
-            type: chunks.length > 1 ? 'chunk' : 'file',
-            storeId: store.id,
-            path: filePath,
-            indexedAt: new Date(),
-            fileHash,
-            chunkIndex: chunk.chunkIndex,
-            totalChunks: chunk.totalChunks,
-            // New metadata for ranking
-            fileType,
-            sectionHeader: chunk.sectionHeader,
-            functionName: chunk.functionName,
-            hasDocComments: /\/\*\*[\s\S]*?\*\//.test(chunk.content),
-            docSummary: chunk.docSummary,
-          },
-        };
-        documents.push(doc);
-      }
-      filesProcessed++;
+      filesProcessed += batch.length;
 
-      // Emit progress event
+      // Emit progress event after each batch
       onProgress?.({
         type: 'progress',
         current: filesProcessed,
         total: files.length,
-        message: `
+        message: `Indexed ${String(filesProcessed)}/${String(files.length)} files`,
       });
     }
 
@@ -235,6 +210,80 @@ export class IndexService {
     });
   }
 
+  /**
+   * Process a single file: read, chunk, embed, and return documents.
+   * Extracted for parallel processing.
+   */
+  private async processFile(
+    filePath: string,
+    store: FileStore | RepoStore
+  ): Promise<{
+    documents: Document[];
+    sourceFile: { path: string; content: string } | undefined;
+  }> {
+    const content = await readFile(filePath, 'utf-8');
+    const fileHash = createHash('md5').update(content).digest('hex');
+    const chunks = this.chunker.chunk(content, filePath);
+
+    const ext = extname(filePath).toLowerCase();
+    const fileName = basename(filePath).toLowerCase();
+    const fileType = this.classifyFileType(ext, fileName, filePath);
+
+    // Track source file for code graph
+    const sourceFile = ['.ts', '.tsx', '.js', '.jsx'].includes(ext)
+      ? { path: filePath, content }
+      : undefined;
+
+    // Skip files with no chunks (empty files)
+    if (chunks.length === 0) {
+      return { documents: [], sourceFile };
+    }
+
+    // Batch embed all chunks from this file
+    const chunkContents = chunks.map((c) => c.content);
+    const vectors = await this.embeddingEngine.embedBatch(chunkContents);
+
+    const documents: Document[] = [];
+    for (let i = 0; i < chunks.length; i++) {
+      const chunk = chunks[i];
+      const vector = vectors[i];
+
+      // Fail fast if chunk/vector mismatch (should never happen)
+      if (chunk === undefined || vector === undefined) {
+        throw new Error(
+          `Chunk/vector mismatch at index ${String(i)}: chunk=${String(chunk !== undefined)}, vector=${String(vector !== undefined)}`
+        );
+      }
+
+      const chunkId =
+        chunks.length > 1
+          ? `${store.id}-${fileHash}-${String(chunk.chunkIndex)}`
+          : `${store.id}-${fileHash}`;
+
+      documents.push({
+        id: createDocumentId(chunkId),
+        content: chunk.content,
+        vector,
+        metadata: {
+          type: chunks.length > 1 ? 'chunk' : 'file',
+          storeId: store.id,
+          path: filePath,
+          indexedAt: new Date(),
+          fileHash,
+          chunkIndex: chunk.chunkIndex,
+          totalChunks: chunk.totalChunks,
+          fileType,
+          sectionHeader: chunk.sectionHeader,
+          functionName: chunk.functionName,
+          hasDocComments: /\/\*\*[\s\S]*?\*\//.test(chunk.content),
+          docSummary: chunk.docSummary,
+        },
+      });
+    }
+
+    return { documents, sourceFile };
+  }
+
   private async scanDirectory(dir: string): Promise<string[]> {
     const files: string[] = [];
     const entries = await readdir(dir, { withFileTypes: true });
```
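The new loop in `indexStore` is a slice-based concurrency limiter: take the next `concurrency` files, run them through `processFile` with `Promise.all`, collect the results, then move to the next slice. A standalone sketch of that pattern, with a hypothetical helper name (the package wires the loop directly into `indexStore` rather than using a helper like this):

```typescript
// Generic version of the slice-based batching used by the new indexStore loop.
// mapInBatches is a hypothetical name for illustration only.
async function mapInBatches<T, R>(
  items: T[],
  concurrency: number,
  fn: (item: T) => Promise<R>
): Promise<R[]> {
  const results: R[] = [];
  for (let i = 0; i < items.length; i += concurrency) {
    const batch = items.slice(i, i + concurrency);
    // At most `concurrency` promises are in flight; the next slice does not
    // start until the current slice settles, and results keep input order.
    results.push(...(await Promise.all(batch.map(fn))));
  }
  return results;
}
```

One consequence of this shape, visible in the diff, is that progress is reported once per slice (`filesProcessed += batch.length`) rather than per file, and a single slow file holds back the rest of its slice; a worker-pool would avoid that at the cost of more bookkeeping.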
package/src/services/job.service.test.ts

```diff
@@ -422,6 +422,93 @@ describe('JobService', () => {
     });
   });
 
+  describe('cleanupStalePendingJobs', () => {
+    it('should clean up pending jobs that have been stale for too long', () => {
+      const job = jobService.createJob({
+        type: 'index',
+        details: { storeId: 'test' },
+      });
+      // Job stays in pending status (never started)
+
+      // Make it stale (3 hours old)
+      const jobFile = join(tempDir, 'jobs', `${job.id}.json`);
+      const jobData = JSON.parse(readFileSync(jobFile, 'utf-8'));
+      jobData.updatedAt = new Date(Date.now() - 3 * 60 * 60 * 1000).toISOString();
+      writeFileSync(jobFile, JSON.stringify(jobData), 'utf-8');
+
+      const cleaned = jobService.cleanupStalePendingJobs(2);
+      expect(cleaned).toBe(1);
+      expect(existsSync(jobFile)).toBe(false);
+    });
+
+    it('should mark stale pending jobs as failed before cleanup', () => {
+      const job = jobService.createJob({
+        type: 'index',
+        details: { storeId: 'test' },
+      });
+
+      // Make it stale
+      const jobFile = join(tempDir, 'jobs', `${job.id}.json`);
+      const jobData = JSON.parse(readFileSync(jobFile, 'utf-8'));
+      jobData.updatedAt = new Date(Date.now() - 3 * 60 * 60 * 1000).toISOString();
+      writeFileSync(jobFile, JSON.stringify(jobData), 'utf-8');
+
+      // Should mark as failed (not just delete)
+      jobService.cleanupStalePendingJobs(2, { markAsFailed: true });
+
+      const updated = jobService.getJob(job.id);
+      expect(updated?.status).toBe('failed');
+      expect(updated?.message).toContain('stale');
+    });
+
+    it('should not clean up recently created pending jobs', () => {
+      const job = jobService.createJob({
+        type: 'index',
+        details: { storeId: 'test' },
+      });
+      // Job is pending but recent
+
+      const cleaned = jobService.cleanupStalePendingJobs(2);
+      expect(cleaned).toBe(0);
+
+      const retrieved = jobService.getJob(job.id);
+      expect(retrieved).not.toBeNull();
+    });
+
+    it('should not clean up running jobs even if old', () => {
+      const job = jobService.createJob({
+        type: 'index',
+        details: { storeId: 'test' },
+      });
+      jobService.updateJob(job.id, { status: 'running' });
+
+      // Make it old
+      const jobFile = join(tempDir, 'jobs', `${job.id}.json`);
+      const jobData = JSON.parse(readFileSync(jobFile, 'utf-8'));
+      jobData.updatedAt = new Date(Date.now() - 5 * 60 * 60 * 1000).toISOString();
+      writeFileSync(jobFile, JSON.stringify(jobData), 'utf-8');
+
+      const cleaned = jobService.cleanupStalePendingJobs(2);
+      expect(cleaned).toBe(0);
+    });
+
+    it('should use default 2 hours if not specified', () => {
+      const job = jobService.createJob({
+        type: 'index',
+        details: { storeId: 'test' },
+      });
+
+      // Make it 3 hours old
+      const jobFile = join(tempDir, 'jobs', `${job.id}.json`);
+      const jobData = JSON.parse(readFileSync(jobFile, 'utf-8'));
+      jobData.updatedAt = new Date(Date.now() - 3 * 60 * 60 * 1000).toISOString();
+      writeFileSync(jobFile, JSON.stringify(jobData), 'utf-8');
+
+      const cleaned = jobService.cleanupStalePendingJobs();
+      expect(cleaned).toBe(1);
+    });
+  });
+
   describe('deleteJob', () => {
     it('should delete a job file', () => {
       const job = jobService.createJob({
```
package/src/services/job.service.ts

```diff
@@ -225,6 +225,49 @@ export class JobService {
     return cleaned;
   }
 
+  /**
+   * Clean up stale pending jobs that never started or got stuck
+   *
+   * @param olderThanHours - Consider pending jobs stale after this many hours (default 2)
+   * @param options - Options for cleanup behavior
+   * @param options.markAsFailed - If true, mark jobs as failed instead of deleting
+   * @returns Number of jobs cleaned up or marked as failed
+   */
+  cleanupStalePendingJobs(
+    olderThanHours: number = 2,
+    options: { markAsFailed?: boolean } = {}
+  ): number {
+    const jobs = this.listJobs();
+    const cutoffTime = Date.now() - olderThanHours * 60 * 60 * 1000;
+    let cleaned = 0;
+
+    for (const job of jobs) {
+      if (job.status === 'pending' && new Date(job.updatedAt).getTime() < cutoffTime) {
+        const jobFile = path.join(this.jobsDir, `${job.id}.json`);
+
+        if (options.markAsFailed === true) {
+          // Mark as failed instead of deleting
+          this.updateJob(job.id, {
+            status: 'failed',
+            message: `Job marked as stale - pending for over ${String(olderThanHours)} hours without progress`,
+          });
+        } else {
+          // Delete the job file
+          try {
+            fs.unlinkSync(jobFile);
+          } catch (error) {
+            throw new Error(
+              `Failed to delete stale job ${job.id}: ${error instanceof Error ? error.message : String(error)}`
+            );
+          }
+        }
+        cleaned++;
+      }
+    }
+
+    return cleaned;
+  }
+
   /**
    * Delete a specific job
    */
```
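The release also adds five lines to package/src/mcp/handlers/job.handler.ts, which this diff does not show, so how the handler invokes the new method is not visible here. A hypothetical call site, using only the signature and defaults above (the import path is assumed for illustration):

```typescript
import { JobService } from './job.service'; // path assumed for illustration

// Hypothetical maintenance sweep using the new cleanup API.
function sweepStaleJobs(jobService: JobService): number {
  // Keep the job records but flip anything pending for over 2 hours to
  // 'failed', so a client polling the job sees a terminal status.
  const marked = jobService.cleanupStalePendingJobs(2, { markAsFailed: true });

  // The default call (no arguments) instead deletes pending jobs that have
  // been stale for more than 2 hours:
  // const removed = jobService.cleanupStalePendingJobs();

  return marked;
}
```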