lance-context 0.1.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +232 -23
- package/dist/__tests__/ast-chunker.test.d.ts +2 -0
- package/dist/__tests__/ast-chunker.test.d.ts.map +1 -0
- package/dist/__tests__/ast-chunker.test.js +307 -0
- package/dist/__tests__/ast-chunker.test.js.map +1 -0
- package/dist/__tests__/config.test.d.ts +2 -0
- package/dist/__tests__/config.test.d.ts.map +1 -0
- package/dist/__tests__/config.test.js +242 -0
- package/dist/__tests__/config.test.js.map +1 -0
- package/dist/__tests__/dashboard/beads.test.d.ts +2 -0
- package/dist/__tests__/dashboard/beads.test.d.ts.map +1 -0
- package/dist/__tests__/dashboard/beads.test.js +151 -0
- package/dist/__tests__/dashboard/beads.test.js.map +1 -0
- package/dist/__tests__/dashboard/index.test.d.ts +2 -0
- package/dist/__tests__/dashboard/index.test.d.ts.map +1 -0
- package/dist/__tests__/dashboard/index.test.js +116 -0
- package/dist/__tests__/dashboard/index.test.js.map +1 -0
- package/dist/__tests__/dashboard/routes.test.d.ts +2 -0
- package/dist/__tests__/dashboard/routes.test.d.ts.map +1 -0
- package/dist/__tests__/dashboard/routes.test.js +125 -0
- package/dist/__tests__/dashboard/routes.test.js.map +1 -0
- package/dist/__tests__/dashboard/server.test.d.ts +2 -0
- package/dist/__tests__/dashboard/server.test.d.ts.map +1 -0
- package/dist/__tests__/dashboard/server.test.js +75 -0
- package/dist/__tests__/dashboard/server.test.js.map +1 -0
- package/dist/__tests__/dashboard/state.test.d.ts +2 -0
- package/dist/__tests__/dashboard/state.test.d.ts.map +1 -0
- package/dist/__tests__/dashboard/state.test.js +124 -0
- package/dist/__tests__/dashboard/state.test.js.map +1 -0
- package/dist/__tests__/embeddings/factory.test.d.ts +2 -0
- package/dist/__tests__/embeddings/factory.test.d.ts.map +1 -0
- package/dist/__tests__/embeddings/factory.test.js +100 -0
- package/dist/__tests__/embeddings/factory.test.js.map +1 -0
- package/dist/__tests__/embeddings/jina.test.d.ts +2 -0
- package/dist/__tests__/embeddings/jina.test.d.ts.map +1 -0
- package/dist/__tests__/embeddings/jina.test.js +156 -0
- package/dist/__tests__/embeddings/jina.test.js.map +1 -0
- package/dist/__tests__/embeddings/ollama.test.d.ts +2 -0
- package/dist/__tests__/embeddings/ollama.test.d.ts.map +1 -0
- package/dist/__tests__/embeddings/ollama.test.js +172 -0
- package/dist/__tests__/embeddings/ollama.test.js.map +1 -0
- package/dist/__tests__/embeddings/rate-limiter.test.d.ts +2 -0
- package/dist/__tests__/embeddings/rate-limiter.test.d.ts.map +1 -0
- package/dist/__tests__/embeddings/rate-limiter.test.js +163 -0
- package/dist/__tests__/embeddings/rate-limiter.test.js.map +1 -0
- package/dist/__tests__/embeddings/retry.test.d.ts +2 -0
- package/dist/__tests__/embeddings/retry.test.d.ts.map +1 -0
- package/dist/__tests__/embeddings/retry.test.js +260 -0
- package/dist/__tests__/embeddings/retry.test.js.map +1 -0
- package/dist/__tests__/embeddings/types.test.d.ts +2 -0
- package/dist/__tests__/embeddings/types.test.d.ts.map +1 -0
- package/dist/__tests__/embeddings/types.test.js +31 -0
- package/dist/__tests__/embeddings/types.test.js.map +1 -0
- package/dist/__tests__/mocks/embedding-backend.mock.d.ts +10 -0
- package/dist/__tests__/mocks/embedding-backend.mock.d.ts.map +1 -0
- package/dist/__tests__/mocks/embedding-backend.mock.js +39 -0
- package/dist/__tests__/mocks/embedding-backend.mock.js.map +1 -0
- package/dist/__tests__/mocks/fetch.mock.d.ts +38 -0
- package/dist/__tests__/mocks/fetch.mock.d.ts.map +1 -0
- package/dist/__tests__/mocks/fetch.mock.js +74 -0
- package/dist/__tests__/mocks/fetch.mock.js.map +1 -0
- package/dist/__tests__/mocks/lancedb.mock.d.ts +38 -0
- package/dist/__tests__/mocks/lancedb.mock.d.ts.map +1 -0
- package/dist/__tests__/mocks/lancedb.mock.js +63 -0
- package/dist/__tests__/mocks/lancedb.mock.js.map +1 -0
- package/dist/__tests__/search/clustering.test.d.ts +2 -0
- package/dist/__tests__/search/clustering.test.d.ts.map +1 -0
- package/dist/__tests__/search/clustering.test.js +230 -0
- package/dist/__tests__/search/clustering.test.js.map +1 -0
- package/dist/__tests__/search/hybrid-search.test.d.ts +2 -0
- package/dist/__tests__/search/hybrid-search.test.d.ts.map +1 -0
- package/dist/__tests__/search/hybrid-search.test.js +186 -0
- package/dist/__tests__/search/hybrid-search.test.js.map +1 -0
- package/dist/__tests__/search/indexer.test.d.ts +2 -0
- package/dist/__tests__/search/indexer.test.d.ts.map +1 -0
- package/dist/__tests__/search/indexer.test.js +878 -0
- package/dist/__tests__/search/indexer.test.js.map +1 -0
- package/dist/__tests__/search/tree-sitter-chunker.test.d.ts +2 -0
- package/dist/__tests__/search/tree-sitter-chunker.test.d.ts.map +1 -0
- package/dist/__tests__/search/tree-sitter-chunker.test.js +228 -0
- package/dist/__tests__/search/tree-sitter-chunker.test.js.map +1 -0
- package/dist/__tests__/setup.d.ts +2 -0
- package/dist/__tests__/setup.d.ts.map +1 -0
- package/dist/__tests__/setup.js +11 -0
- package/dist/__tests__/setup.js.map +1 -0
- package/dist/__tests__/utils/concurrency.test.d.ts +2 -0
- package/dist/__tests__/utils/concurrency.test.d.ts.map +1 -0
- package/dist/__tests__/utils/concurrency.test.js +83 -0
- package/dist/__tests__/utils/concurrency.test.js.map +1 -0
- package/dist/__tests__/utils/errors.test.d.ts +2 -0
- package/dist/__tests__/utils/errors.test.d.ts.map +1 -0
- package/dist/__tests__/utils/errors.test.js +136 -0
- package/dist/__tests__/utils/errors.test.js.map +1 -0
- package/dist/__tests__/utils/type-guards.test.d.ts +2 -0
- package/dist/__tests__/utils/type-guards.test.d.ts.map +1 -0
- package/dist/__tests__/utils/type-guards.test.js +80 -0
- package/dist/__tests__/utils/type-guards.test.js.map +1 -0
- package/dist/__tests__/worktree/worktree-manager.test.d.ts +2 -0
- package/dist/__tests__/worktree/worktree-manager.test.d.ts.map +1 -0
- package/dist/__tests__/worktree/worktree-manager.test.js +403 -0
- package/dist/__tests__/worktree/worktree-manager.test.js.map +1 -0
- package/dist/config.d.ts +122 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +508 -0
- package/dist/config.js.map +1 -0
- package/dist/dashboard/beads.d.ts +35 -0
- package/dist/dashboard/beads.d.ts.map +1 -0
- package/dist/dashboard/beads.js +102 -0
- package/dist/dashboard/beads.js.map +1 -0
- package/dist/dashboard/events.d.ts +46 -0
- package/dist/dashboard/events.d.ts.map +1 -0
- package/dist/dashboard/events.js +141 -0
- package/dist/dashboard/events.js.map +1 -0
- package/dist/dashboard/index.d.ts +69 -0
- package/dist/dashboard/index.d.ts.map +1 -0
- package/dist/dashboard/index.js +93 -0
- package/dist/dashboard/index.js.map +1 -0
- package/dist/dashboard/routes.d.ts +6 -0
- package/dist/dashboard/routes.d.ts.map +1 -0
- package/dist/dashboard/routes.js +245 -0
- package/dist/dashboard/routes.js.map +1 -0
- package/dist/dashboard/server.d.ts +27 -0
- package/dist/dashboard/server.d.ts.map +1 -0
- package/dist/dashboard/server.js +72 -0
- package/dist/dashboard/server.js.map +1 -0
- package/dist/dashboard/state.d.ts +125 -0
- package/dist/dashboard/state.d.ts.map +1 -0
- package/dist/dashboard/state.js +264 -0
- package/dist/dashboard/state.js.map +1 -0
- package/dist/dashboard/ui.d.ts +6 -0
- package/dist/dashboard/ui.d.ts.map +1 -0
- package/dist/dashboard/ui.js +1421 -0
- package/dist/dashboard/ui.js.map +1 -0
- package/dist/embeddings/index.d.ts +20 -2
- package/dist/embeddings/index.d.ts.map +1 -1
- package/dist/embeddings/index.js +49 -6
- package/dist/embeddings/index.js.map +1 -1
- package/dist/embeddings/jina.d.ts +9 -0
- package/dist/embeddings/jina.d.ts.map +1 -1
- package/dist/embeddings/jina.js +42 -2
- package/dist/embeddings/jina.js.map +1 -1
- package/dist/embeddings/ollama.d.ts +2 -0
- package/dist/embeddings/ollama.d.ts.map +1 -1
- package/dist/embeddings/ollama.js +21 -5
- package/dist/embeddings/ollama.js.map +1 -1
- package/dist/embeddings/rate-limiter.d.ts +75 -0
- package/dist/embeddings/rate-limiter.d.ts.map +1 -0
- package/dist/embeddings/rate-limiter.js +145 -0
- package/dist/embeddings/rate-limiter.js.map +1 -0
- package/dist/embeddings/retry.d.ts +14 -0
- package/dist/embeddings/retry.d.ts.map +1 -0
- package/dist/embeddings/retry.js +89 -0
- package/dist/embeddings/retry.js.map +1 -0
- package/dist/embeddings/types.d.ts +56 -2
- package/dist/embeddings/types.d.ts.map +1 -1
- package/dist/embeddings/types.js +16 -0
- package/dist/embeddings/types.js.map +1 -1
- package/dist/index.js +1871 -44
- package/dist/index.js.map +1 -1
- package/dist/memory/index.d.ts +63 -0
- package/dist/memory/index.d.ts.map +1 -0
- package/dist/memory/index.js +168 -0
- package/dist/memory/index.js.map +1 -0
- package/dist/search/ast-chunker.d.ts +34 -0
- package/dist/search/ast-chunker.d.ts.map +1 -0
- package/dist/search/ast-chunker.js +261 -0
- package/dist/search/ast-chunker.js.map +1 -0
- package/dist/search/clustering.d.ts +77 -0
- package/dist/search/clustering.d.ts.map +1 -0
- package/dist/search/clustering.js +455 -0
- package/dist/search/clustering.js.map +1 -0
- package/dist/search/indexer.d.ts +239 -3
- package/dist/search/indexer.d.ts.map +1 -1
- package/dist/search/indexer.js +941 -45
- package/dist/search/indexer.js.map +1 -1
- package/dist/search/tree-sitter-chunker.d.ts +69 -0
- package/dist/search/tree-sitter-chunker.d.ts.map +1 -0
- package/dist/search/tree-sitter-chunker.js +436 -0
- package/dist/search/tree-sitter-chunker.js.map +1 -0
- package/dist/symbols/index.d.ts +14 -0
- package/dist/symbols/index.d.ts.map +1 -0
- package/dist/symbols/index.js +19 -0
- package/dist/symbols/index.js.map +1 -0
- package/dist/symbols/name-path.d.ts +113 -0
- package/dist/symbols/name-path.d.ts.map +1 -0
- package/dist/symbols/name-path.js +194 -0
- package/dist/symbols/name-path.js.map +1 -0
- package/dist/symbols/pattern-search.d.ts +14 -0
- package/dist/symbols/pattern-search.d.ts.map +1 -0
- package/dist/symbols/pattern-search.js +224 -0
- package/dist/symbols/pattern-search.js.map +1 -0
- package/dist/symbols/reference-finder.d.ts +38 -0
- package/dist/symbols/reference-finder.d.ts.map +1 -0
- package/dist/symbols/reference-finder.js +376 -0
- package/dist/symbols/reference-finder.js.map +1 -0
- package/dist/symbols/symbol-editor.d.ts +81 -0
- package/dist/symbols/symbol-editor.d.ts.map +1 -0
- package/dist/symbols/symbol-editor.js +257 -0
- package/dist/symbols/symbol-editor.js.map +1 -0
- package/dist/symbols/symbol-extractor.d.ts +49 -0
- package/dist/symbols/symbol-extractor.d.ts.map +1 -0
- package/dist/symbols/symbol-extractor.js +593 -0
- package/dist/symbols/symbol-extractor.js.map +1 -0
- package/dist/symbols/symbol-renamer.d.ts +81 -0
- package/dist/symbols/symbol-renamer.d.ts.map +1 -0
- package/dist/symbols/symbol-renamer.js +204 -0
- package/dist/symbols/symbol-renamer.js.map +1 -0
- package/dist/symbols/types.d.ts +234 -0
- package/dist/symbols/types.d.ts.map +1 -0
- package/dist/symbols/types.js +106 -0
- package/dist/symbols/types.js.map +1 -0
- package/dist/utils/concurrency.d.ts +32 -0
- package/dist/utils/concurrency.d.ts.map +1 -0
- package/dist/utils/concurrency.js +57 -0
- package/dist/utils/concurrency.js.map +1 -0
- package/dist/utils/errors.d.ts +36 -0
- package/dist/utils/errors.d.ts.map +1 -0
- package/dist/utils/errors.js +91 -0
- package/dist/utils/errors.js.map +1 -0
- package/dist/utils/type-guards.d.ts +17 -0
- package/dist/utils/type-guards.d.ts.map +1 -0
- package/dist/utils/type-guards.js +25 -0
- package/dist/utils/type-guards.js.map +1 -0
- package/dist/worktree/index.d.ts +6 -0
- package/dist/worktree/index.d.ts.map +1 -0
- package/dist/worktree/index.js +6 -0
- package/dist/worktree/index.js.map +1 -0
- package/dist/worktree/types.d.ts +101 -0
- package/dist/worktree/types.d.ts.map +1 -0
- package/dist/worktree/types.js +6 -0
- package/dist/worktree/types.js.map +1 -0
- package/dist/worktree/worktree-manager.d.ts +80 -0
- package/dist/worktree/worktree-manager.d.ts.map +1 -0
- package/dist/worktree/worktree-manager.js +407 -0
- package/dist/worktree/worktree-manager.js.map +1 -0
- package/package.json +39 -5
- package/scripts/postinstall.js +48 -0
package/dist/search/indexer.js
CHANGED
|
@@ -1,17 +1,70 @@
|
|
|
1
1
|
import * as lancedb from '@lancedb/lancedb';
|
|
2
|
+
import * as crypto from 'crypto';
|
|
2
3
|
import * as fs from 'fs/promises';
|
|
3
4
|
import * as path from 'path';
|
|
4
|
-
|
|
5
|
-
|
|
5
|
+
import { ASTChunker } from './ast-chunker.js';
|
|
6
|
+
import { TreeSitterChunker } from './tree-sitter-chunker.js';
|
|
7
|
+
import { loadConfig, getDefaultPatterns, getDefaultExcludePatterns, getChunkingConfig, getSearchConfig, } from '../config.js';
|
|
8
|
+
import { minimatch } from 'minimatch';
|
|
9
|
+
import { mapInBatches } from '../utils/concurrency.js';
|
|
10
|
+
import { kMeansClustering, calculateSilhouetteScore, } from './clustering.js';
|
|
11
|
+
/** Default concurrency for parallel file processing */
|
|
12
|
+
const FILE_PROCESSING_CONCURRENCY = 10;
|
|
6
13
|
/**
|
|
7
|
-
*
|
|
14
|
+
* Compute a checksum for index integrity validation.
|
|
15
|
+
* Based on sorted file list and chunk count.
|
|
8
16
|
*/
|
|
17
|
+
function computeIndexChecksum(files, chunkCount) {
|
|
18
|
+
const sortedFiles = [...files].sort();
|
|
19
|
+
const data = JSON.stringify({ files: sortedFiles, chunkCount });
|
|
20
|
+
return crypto.createHash('sha256').update(data).digest('hex').slice(0, 16);
|
|
21
|
+
}
|
|
22
|
+
/**
|
|
23
|
+
* Sanitize a file path for use in LanceDB filter expressions.
|
|
24
|
+
* Prevents SQL injection by only allowing safe path characters.
|
|
25
|
+
*/
|
|
26
|
+
function sanitizePathForFilter(filepath) {
|
|
27
|
+
// Only allow safe file path characters: alphanumeric, /, ., -, _, space
|
|
28
|
+
// This is more restrictive than escaping and prevents injection attacks
|
|
29
|
+
if (!/^[\w\s./-]+$/.test(filepath)) {
|
|
30
|
+
// If path contains unusual characters, escape single quotes and backslashes
|
|
31
|
+
return filepath.replace(/\\/g, '\\\\').replace(/'/g, "''");
|
|
32
|
+
}
|
|
33
|
+
return filepath.replace(/'/g, "''");
|
|
34
|
+
}
|
|
35
|
+
/**
|
|
36
|
+
* Code indexer that uses LanceDB for vector storage and semantic search.
|
|
37
|
+
*
|
|
38
|
+
* Provides functionality to:
|
|
39
|
+
* - Index a codebase by chunking files and generating embeddings
|
|
40
|
+
* - Perform hybrid semantic + keyword search
|
|
41
|
+
* - Support incremental indexing (only re-index changed files)
|
|
42
|
+
*
|
|
43
|
+
* @example
|
|
44
|
+
* ```typescript
|
|
45
|
+
* const backend = await createEmbeddingBackend();
|
|
46
|
+
* const indexer = new CodeIndexer('/path/to/project', backend);
|
|
47
|
+
* await indexer.initialize();
|
|
48
|
+
*
|
|
49
|
+
* // Index the codebase
|
|
50
|
+
* await indexer.indexCodebase();
|
|
51
|
+
*
|
|
52
|
+
* // Search for code
|
|
53
|
+
* const results = await indexer.search('authentication middleware');
|
|
54
|
+
* ```
|
|
55
|
+
*/
|
|
56
|
+
/** Maximum number of query embeddings to cache */
|
|
57
|
+
const QUERY_CACHE_MAX_SIZE = 100;
|
|
9
58
|
export class CodeIndexer {
|
|
10
59
|
db = null;
|
|
11
60
|
table = null;
|
|
61
|
+
metadataTable = null;
|
|
12
62
|
embeddingBackend;
|
|
13
63
|
indexPath;
|
|
14
64
|
projectPath;
|
|
65
|
+
config = null;
|
|
66
|
+
/** LRU cache for query embeddings to avoid recomputing identical queries */
|
|
67
|
+
queryEmbeddingCache = new Map();
|
|
15
68
|
constructor(projectPath, embeddingBackend) {
|
|
16
69
|
this.projectPath = projectPath;
|
|
17
70
|
this.embeddingBackend = embeddingBackend;
|
|
@@ -19,6 +72,118 @@ export class CodeIndexer {
|
|
|
19
72
|
}
|
|
20
73
|
async initialize() {
|
|
21
74
|
this.db = await lancedb.connect(this.indexPath);
|
|
75
|
+
this.config = await loadConfig(this.projectPath);
|
|
76
|
+
console.error(`[lance-context] Loaded config with ${this.config.patterns?.length} patterns`);
|
|
77
|
+
}
|
|
78
|
+
get metadataPath() {
|
|
79
|
+
return path.join(this.indexPath, 'index-metadata.json');
|
|
80
|
+
}
|
|
81
|
+
/**
|
|
82
|
+
* Save index metadata to disk
|
|
83
|
+
*/
|
|
84
|
+
async saveIndexMetadata(fileCount, chunkCount, indexedFiles) {
|
|
85
|
+
// Convert to relative paths for checksum
|
|
86
|
+
const relativePaths = indexedFiles.map((f) => path.isAbsolute(f) ? path.relative(this.projectPath, f) : f);
|
|
87
|
+
const metadata = {
|
|
88
|
+
lastUpdated: new Date().toISOString(),
|
|
89
|
+
fileCount,
|
|
90
|
+
chunkCount,
|
|
91
|
+
embeddingBackend: this.embeddingBackend.name,
|
|
92
|
+
embeddingModel: this.embeddingBackend.getModel(),
|
|
93
|
+
embeddingDimensions: this.embeddingBackend.getDimensions(),
|
|
94
|
+
version: '1.0.0',
|
|
95
|
+
checksum: computeIndexChecksum(relativePaths, chunkCount),
|
|
96
|
+
};
|
|
97
|
+
await fs.writeFile(this.metadataPath, JSON.stringify(metadata, null, 2));
|
|
98
|
+
}
|
|
99
|
+
/**
|
|
100
|
+
* Load index metadata from disk
|
|
101
|
+
*/
|
|
102
|
+
async loadIndexMetadata() {
|
|
103
|
+
try {
|
|
104
|
+
const content = await fs.readFile(this.metadataPath, 'utf-8');
|
|
105
|
+
return JSON.parse(content);
|
|
106
|
+
}
|
|
107
|
+
catch {
|
|
108
|
+
return null;
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
/**
|
|
112
|
+
* Get the modification time of a file
|
|
113
|
+
*/
|
|
114
|
+
async getFileMtime(filepath) {
|
|
115
|
+
const stats = await fs.stat(filepath);
|
|
116
|
+
return stats.mtimeMs;
|
|
117
|
+
}
|
|
118
|
+
/**
|
|
119
|
+
* Get stored metadata for all indexed files
|
|
120
|
+
*/
|
|
121
|
+
async getStoredMetadata() {
|
|
122
|
+
const tableNames = await this.db.tableNames();
|
|
123
|
+
if (!tableNames.includes('file_metadata')) {
|
|
124
|
+
return new Map();
|
|
125
|
+
}
|
|
126
|
+
this.metadataTable = await this.db.openTable('file_metadata');
|
|
127
|
+
const rows = await this.metadataTable.query().toArray();
|
|
128
|
+
const metadata = new Map();
|
|
129
|
+
for (const row of rows) {
|
|
130
|
+
metadata.set(row.filepath, row.mtime);
|
|
131
|
+
}
|
|
132
|
+
return metadata;
|
|
133
|
+
}
|
|
134
|
+
/**
|
|
135
|
+
* Detect which files have been added, modified, or deleted
|
|
136
|
+
*/
|
|
137
|
+
async detectFileChanges(currentFiles) {
|
|
138
|
+
const storedMetadata = await this.getStoredMetadata();
|
|
139
|
+
const changes = {
|
|
140
|
+
added: [],
|
|
141
|
+
modified: [],
|
|
142
|
+
deleted: [],
|
|
143
|
+
unchanged: [],
|
|
144
|
+
};
|
|
145
|
+
const currentFilesSet = new Set();
|
|
146
|
+
for (const filepath of currentFiles) {
|
|
147
|
+
const relativePath = path.relative(this.projectPath, filepath);
|
|
148
|
+
currentFilesSet.add(relativePath);
|
|
149
|
+
const currentMtime = await this.getFileMtime(filepath);
|
|
150
|
+
const storedMtime = storedMetadata.get(relativePath);
|
|
151
|
+
if (storedMtime === undefined) {
|
|
152
|
+
changes.added.push(filepath);
|
|
153
|
+
}
|
|
154
|
+
else if (currentMtime > storedMtime) {
|
|
155
|
+
changes.modified.push(filepath);
|
|
156
|
+
}
|
|
157
|
+
else {
|
|
158
|
+
changes.unchanged.push(filepath);
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
// Find deleted files
|
|
162
|
+
for (const [relativePath] of storedMetadata) {
|
|
163
|
+
if (!currentFilesSet.has(relativePath)) {
|
|
164
|
+
changes.deleted.push(relativePath);
|
|
165
|
+
}
|
|
166
|
+
}
|
|
167
|
+
return changes;
|
|
168
|
+
}
|
|
169
|
+
/**
|
|
170
|
+
* Save metadata for indexed files
|
|
171
|
+
*/
|
|
172
|
+
async saveFileMetadata(files) {
|
|
173
|
+
const metadata = [];
|
|
174
|
+
for (const filepath of files) {
|
|
175
|
+
const relativePath = path.relative(this.projectPath, filepath);
|
|
176
|
+
const mtime = await this.getFileMtime(filepath);
|
|
177
|
+
metadata.push({ filepath: relativePath, mtime });
|
|
178
|
+
}
|
|
179
|
+
// Drop and recreate metadata table
|
|
180
|
+
const tableNames = await this.db.tableNames();
|
|
181
|
+
if (tableNames.includes('file_metadata')) {
|
|
182
|
+
await this.db.dropTable('file_metadata');
|
|
183
|
+
}
|
|
184
|
+
if (metadata.length > 0) {
|
|
185
|
+
this.metadataTable = await this.db.createTable('file_metadata', metadata);
|
|
186
|
+
}
|
|
22
187
|
}
|
|
23
188
|
async getStatus() {
|
|
24
189
|
const tableNames = await this.db?.tableNames();
|
|
@@ -30,54 +195,199 @@ export class CodeIndexer {
|
|
|
30
195
|
chunkCount: 0,
|
|
31
196
|
lastUpdated: null,
|
|
32
197
|
indexPath: this.indexPath,
|
|
198
|
+
embeddingBackend: this.embeddingBackend.name,
|
|
199
|
+
embeddingModel: this.embeddingBackend.getModel(),
|
|
33
200
|
};
|
|
34
201
|
}
|
|
35
202
|
this.table = await this.db.openTable('code_chunks');
|
|
36
203
|
const count = await this.table.countRows();
|
|
204
|
+
// Load persisted metadata
|
|
205
|
+
const metadata = await this.loadIndexMetadata();
|
|
206
|
+
// Validate index integrity
|
|
207
|
+
const corruptionCheck = await this.validateIndexIntegrity(metadata, count);
|
|
208
|
+
// Check for backend mismatch
|
|
209
|
+
const backendMismatch = this.checkBackendMismatch(metadata);
|
|
37
210
|
return {
|
|
38
211
|
indexed: true,
|
|
39
|
-
fileCount: 0,
|
|
212
|
+
fileCount: metadata?.fileCount ?? 0,
|
|
40
213
|
chunkCount: count,
|
|
41
|
-
lastUpdated:
|
|
214
|
+
lastUpdated: metadata?.lastUpdated ?? null,
|
|
42
215
|
indexPath: this.indexPath,
|
|
216
|
+
embeddingBackend: metadata?.embeddingBackend ?? this.embeddingBackend.name,
|
|
217
|
+
embeddingModel: metadata?.embeddingModel ?? this.embeddingBackend.getModel(),
|
|
218
|
+
corrupted: corruptionCheck.corrupted,
|
|
219
|
+
corruptionReason: corruptionCheck.reason,
|
|
220
|
+
backendMismatch: backendMismatch.mismatch,
|
|
221
|
+
backendMismatchReason: backendMismatch.reason,
|
|
43
222
|
};
|
|
44
223
|
}
|
|
45
|
-
|
|
224
|
+
/**
|
|
225
|
+
* Check if the current embedding backend differs from the one used to create the index.
|
|
226
|
+
* Returns mismatch status and reason if mismatched.
|
|
227
|
+
*/
|
|
228
|
+
checkBackendMismatch(metadata) {
|
|
229
|
+
if (!metadata) {
|
|
230
|
+
return { mismatch: false };
|
|
231
|
+
}
|
|
232
|
+
const currentBackend = this.embeddingBackend.name;
|
|
233
|
+
const currentModel = this.embeddingBackend.getModel();
|
|
234
|
+
const currentDimensions = this.embeddingBackend.getDimensions();
|
|
235
|
+
// Check dimension mismatch (critical - will cause search failures)
|
|
236
|
+
if (metadata.embeddingDimensions && metadata.embeddingDimensions !== currentDimensions) {
|
|
237
|
+
return {
|
|
238
|
+
mismatch: true,
|
|
239
|
+
reason: `Embedding dimension mismatch: index has ${metadata.embeddingDimensions}-dim vectors, ` +
|
|
240
|
+
`current backend (${currentBackend}) produces ${currentDimensions}-dim vectors. Reindex required.`,
|
|
241
|
+
};
|
|
242
|
+
}
|
|
243
|
+
// Check model mismatch (different models produce incompatible embeddings)
|
|
244
|
+
if (metadata.embeddingModel && metadata.embeddingModel !== currentModel) {
|
|
245
|
+
return {
|
|
246
|
+
mismatch: true,
|
|
247
|
+
reason: `Embedding model mismatch: index uses '${metadata.embeddingModel}', ` +
|
|
248
|
+
`current backend uses '${currentModel}'. Reindex required.`,
|
|
249
|
+
};
|
|
250
|
+
}
|
|
251
|
+
// Check backend mismatch (even same dimensions may have different embedding spaces)
|
|
252
|
+
if (metadata.embeddingBackend && metadata.embeddingBackend !== currentBackend) {
|
|
253
|
+
return {
|
|
254
|
+
mismatch: true,
|
|
255
|
+
reason: `Embedding backend changed from '${metadata.embeddingBackend}' to '${currentBackend}'. Reindex required.`,
|
|
256
|
+
};
|
|
257
|
+
}
|
|
258
|
+
return { mismatch: false };
|
|
259
|
+
}
|
|
260
|
+
/**
|
|
261
|
+
* Validate index integrity by checking metadata consistency.
|
|
262
|
+
* Returns corruption status and reason if corrupted.
|
|
263
|
+
*/
|
|
264
|
+
async validateIndexIntegrity(metadata, actualChunkCount) {
|
|
265
|
+
// No metadata file - possible incomplete indexing
|
|
266
|
+
if (!metadata) {
|
|
267
|
+
return {
|
|
268
|
+
corrupted: true,
|
|
269
|
+
reason: 'Missing index metadata file. Index may be incomplete. Run clear_index followed by index_codebase to rebuild.',
|
|
270
|
+
};
|
|
271
|
+
}
|
|
272
|
+
// Check if chunk count matches
|
|
273
|
+
if (metadata.chunkCount !== actualChunkCount) {
|
|
274
|
+
return {
|
|
275
|
+
corrupted: true,
|
|
276
|
+
reason: `Chunk count mismatch: metadata says ${metadata.chunkCount}, index has ${actualChunkCount}. Run clear_index followed by index_codebase to rebuild.`,
|
|
277
|
+
};
|
|
278
|
+
}
|
|
279
|
+
// Validate checksum if present
|
|
280
|
+
if (metadata.checksum) {
|
|
281
|
+
const storedFiles = await this.getStoredMetadata();
|
|
282
|
+
const fileList = Array.from(storedFiles.keys());
|
|
283
|
+
const computedChecksum = computeIndexChecksum(fileList, actualChunkCount);
|
|
284
|
+
if (computedChecksum !== metadata.checksum) {
|
|
285
|
+
return {
|
|
286
|
+
corrupted: true,
|
|
287
|
+
reason: `Checksum mismatch: file metadata does not match index. Run clear_index followed by index_codebase to rebuild.`,
|
|
288
|
+
};
|
|
289
|
+
}
|
|
290
|
+
}
|
|
291
|
+
return { corrupted: false };
|
|
292
|
+
}
|
|
293
|
+
async indexCodebase(patterns, excludePatterns, forceReindex = false, onProgress, autoRepair = false) {
|
|
46
294
|
const { glob } = await import('glob');
|
|
295
|
+
const report = (progress) => {
|
|
296
|
+
console.error(`[lance-context] ${progress.message}`);
|
|
297
|
+
onProgress?.(progress);
|
|
298
|
+
};
|
|
299
|
+
// Check for corruption if autoRepair is enabled
|
|
300
|
+
if (autoRepair) {
|
|
301
|
+
const status = await this.getStatus();
|
|
302
|
+
if (status.corrupted) {
|
|
303
|
+
console.error(`[lance-context] Index corruption detected: ${status.corruptionReason}`);
|
|
304
|
+
console.error('[lance-context] Auto-repair enabled, clearing and rebuilding index...');
|
|
305
|
+
await this.clearIndex();
|
|
306
|
+
// Recursively call with forceReindex but without autoRepair to avoid loops
|
|
307
|
+
const result = await this.indexCodebase(patterns, excludePatterns, true, onProgress, false);
|
|
308
|
+
return { ...result, repaired: true };
|
|
309
|
+
}
|
|
310
|
+
}
|
|
311
|
+
// Use provided patterns or fall back to config/defaults
|
|
312
|
+
const effectivePatterns = patterns || this.config?.patterns || getDefaultPatterns();
|
|
313
|
+
const effectiveExcludePatterns = excludePatterns || this.config?.excludePatterns || getDefaultExcludePatterns();
|
|
314
|
+
report({ phase: 'scanning', current: 0, total: 0, message: 'Scanning for files...' });
|
|
47
315
|
// Find all matching files
|
|
48
316
|
const files = [];
|
|
49
|
-
for (const pattern of
|
|
317
|
+
for (const pattern of effectivePatterns) {
|
|
50
318
|
const matches = await glob(pattern, {
|
|
51
319
|
cwd: this.projectPath,
|
|
52
|
-
ignore:
|
|
320
|
+
ignore: effectiveExcludePatterns,
|
|
53
321
|
absolute: true,
|
|
54
322
|
});
|
|
55
323
|
files.push(...matches);
|
|
56
324
|
}
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
325
|
+
report({
|
|
326
|
+
phase: 'scanning',
|
|
327
|
+
current: files.length,
|
|
328
|
+
total: files.length,
|
|
329
|
+
message: `Found ${files.length} files to index`,
|
|
330
|
+
});
|
|
331
|
+
// Check if we can do incremental indexing
|
|
332
|
+
const tableNames = await this.db.tableNames();
|
|
333
|
+
const hasExistingIndex = tableNames.includes('code_chunks');
|
|
334
|
+
// Check for embedding dimension or model mismatch
|
|
335
|
+
let embeddingMismatch = false;
|
|
336
|
+
if (hasExistingIndex && !forceReindex) {
|
|
337
|
+
const metadata = await this.loadIndexMetadata();
|
|
338
|
+
const currentDimensions = this.embeddingBackend.getDimensions();
|
|
339
|
+
const currentModel = this.embeddingBackend.getModel();
|
|
340
|
+
// Check dimension mismatch
|
|
341
|
+
if (metadata?.embeddingDimensions && metadata.embeddingDimensions !== currentDimensions) {
|
|
342
|
+
console.error(`[lance-context] Embedding dimension mismatch: index has ${metadata.embeddingDimensions}, ` +
|
|
343
|
+
`current backend (${this.embeddingBackend.name}) uses ${currentDimensions}. Forcing full reindex.`);
|
|
344
|
+
embeddingMismatch = true;
|
|
345
|
+
}
|
|
346
|
+
// Check model mismatch (even if dimensions match, different models produce incompatible embeddings)
|
|
347
|
+
if (metadata?.embeddingModel && metadata.embeddingModel !== currentModel) {
|
|
348
|
+
console.error(`[lance-context] Embedding model mismatch: index uses '${metadata.embeddingModel}', ` +
|
|
349
|
+
`current backend uses '${currentModel}'. Forcing full reindex.`);
|
|
350
|
+
embeddingMismatch = true;
|
|
351
|
+
}
|
|
352
|
+
}
|
|
353
|
+
const canDoIncremental = hasExistingIndex && !forceReindex && !embeddingMismatch;
|
|
354
|
+
if (canDoIncremental) {
|
|
355
|
+
return this.indexIncremental(files, onProgress);
|
|
63
356
|
}
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
357
|
+
// Full reindex
|
|
358
|
+
return this.indexFull(files, onProgress);
|
|
359
|
+
}
|
|
360
|
+
/**
|
|
361
|
+
* Perform a full reindex of all files
|
|
362
|
+
*/
|
|
363
|
+
async indexFull(files, onProgress) {
|
|
364
|
+
const report = (progress) => {
|
|
365
|
+
console.error(`[lance-context] ${progress.message}`);
|
|
366
|
+
onProgress?.(progress);
|
|
367
|
+
};
|
|
368
|
+
// Process files into chunks (parallelized for I/O efficiency)
|
|
369
|
+
report({ phase: 'chunking', current: 0, total: files.length, message: 'Chunking files...' });
|
|
370
|
+
const chunkResults = await mapInBatches(files, async (filepath) => this.chunkFile(filepath), FILE_PROCESSING_CONCURRENCY, (completed, total) => {
|
|
371
|
+
report({
|
|
372
|
+
phase: 'chunking',
|
|
373
|
+
current: completed,
|
|
374
|
+
total,
|
|
375
|
+
message: `Chunked ${completed}/${total} files`,
|
|
73
376
|
});
|
|
74
|
-
|
|
75
|
-
|
|
377
|
+
});
|
|
378
|
+
const allChunks = chunkResults.flat();
|
|
379
|
+
report({
|
|
380
|
+
phase: 'chunking',
|
|
381
|
+
current: files.length,
|
|
382
|
+
total: files.length,
|
|
383
|
+
message: `Created ${allChunks.length} chunks`,
|
|
384
|
+
});
|
|
385
|
+
// Generate embeddings in batches
|
|
386
|
+
await this.embedChunks(allChunks, onProgress);
|
|
76
387
|
// Store in LanceDB
|
|
77
|
-
const dimensions = this.embeddingBackend.getDimensions();
|
|
78
388
|
const data = allChunks.map((chunk) => ({
|
|
79
389
|
id: chunk.id,
|
|
80
|
-
|
|
390
|
+
filepath: chunk.filepath,
|
|
81
391
|
content: chunk.content,
|
|
82
392
|
startLine: chunk.startLine,
|
|
83
393
|
endLine: chunk.endLine,
|
|
@@ -90,26 +400,215 @@ export class CodeIndexer {
|
|
|
90
400
|
await this.db.dropTable('code_chunks');
|
|
91
401
|
}
|
|
92
402
|
this.table = await this.db.createTable('code_chunks', data);
|
|
403
|
+
// Save file metadata for future incremental indexing
|
|
404
|
+
await this.saveFileMetadata(files);
|
|
405
|
+
// Save index metadata with checksum
|
|
406
|
+
await this.saveIndexMetadata(files.length, allChunks.length, files);
|
|
93
407
|
return {
|
|
94
408
|
filesIndexed: files.length,
|
|
95
409
|
chunksCreated: allChunks.length,
|
|
410
|
+
incremental: false,
|
|
96
411
|
};
|
|
97
412
|
}
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
413
|
+
/**
|
|
414
|
+
* Perform incremental indexing - only process changed files
|
|
415
|
+
*/
|
|
416
|
+
async indexIncremental(files, onProgress) {
    // Progress is mirrored to stderr (stdout is reserved for protocol traffic)
    // and forwarded to the optional caller callback.
    const report = (progress) => {
        console.error(`[lance-context] ${progress.message}`);
        onProgress?.(progress);
    };
    const changes = await this.detectFileChanges(files);
    // Added and modified files both need (re-)chunking and (re-)embedding.
    const filesToProcess = [...changes.added, ...changes.modified];
    const hasChanges = filesToProcess.length > 0 || changes.deleted.length > 0;
    if (!hasChanges) {
        // Nothing changed: report completion and return counts from the existing table.
        report({
            phase: 'complete',
            current: 0,
            total: 0,
            message: 'No changes detected, index is up to date',
        });
        this.table = await this.db.openTable('code_chunks');
        const count = await this.table.countRows();
        return {
            filesIndexed: 0,
            chunksCreated: count,
            incremental: true,
        };
    }
    report({
        phase: 'scanning',
        current: 0,
        total: filesToProcess.length,
        message: `Incremental update: ${changes.added.length} added, ${changes.modified.length} modified, ${changes.deleted.length} deleted`,
    });
    // Open the existing table
    this.table = await this.db.openTable('code_chunks');
    // Delete chunks from modified and deleted files.
    // NOTE(review): modified paths are converted to project-relative form here;
    // this assumes changes.deleted entries are already relative — confirm in detectFileChanges.
    const filesToRemove = [
        ...changes.modified.map((f) => path.relative(this.projectPath, f)),
        ...changes.deleted,
    ];
    if (filesToRemove.length > 0) {
        for (const relativePath of filesToRemove) {
            // Sanitize before interpolating into the LanceDB filter string.
            const sanitizedPath = sanitizePathForFilter(relativePath);
            await this.table.delete(`filepath = '${sanitizedPath}'`);
        }
        report({
            phase: 'chunking',
            current: 0,
            total: filesToProcess.length,
            message: `Removed chunks from ${filesToRemove.length} files`,
        });
    }
    // Process new and modified files (parallelized for I/O efficiency)
    if (filesToProcess.length > 0) {
        const chunkResults = await mapInBatches(filesToProcess, async (filepath) => this.chunkFile(filepath), FILE_PROCESSING_CONCURRENCY, (completed, total) => {
            report({
                phase: 'chunking',
                current: completed,
                total,
                message: `Chunked ${completed}/${total} files`,
            });
        });
        const newChunks = chunkResults.flat();
        report({
            phase: 'chunking',
            current: filesToProcess.length,
            total: filesToProcess.length,
            message: `Created ${newChunks.length} new chunks`,
        });
        // Generate embeddings
        await this.embedChunks(newChunks, onProgress);
        // Add new chunks to the table
        const data = newChunks.map((chunk) => ({
            id: chunk.id,
            filepath: chunk.filepath,
            content: chunk.content,
            startLine: chunk.startLine,
            endLine: chunk.endLine,
            language: chunk.language,
            vector: chunk.embedding,
        }));
        if (data.length > 0) {
            await this.table.add(data);
        }
    }
    // Update file metadata: the new snapshot is every file still present
    // (unchanged + added + modified); deleted files drop out.
    const allCurrentFiles = [...changes.unchanged, ...changes.added, ...changes.modified];
    await this.saveFileMetadata(allCurrentFiles);
    const totalChunks = await this.table.countRows();
    // Save index metadata with checksum
    await this.saveIndexMetadata(allCurrentFiles.length, totalChunks, allCurrentFiles);
    // chunksCreated reflects the TOTAL row count after the update, not just new chunks.
    return {
        filesIndexed: filesToProcess.length,
        chunksCreated: totalChunks,
        incremental: true,
    };
}
|
|
509
|
+
/**
|
|
510
|
+
* Generate embeddings for chunks in batches
|
|
511
|
+
*/
|
|
512
|
+
async embedChunks(chunks, onProgress) {
|
|
513
|
+
const report = (progress) => {
|
|
514
|
+
console.error(`[lance-context] ${progress.message}`);
|
|
515
|
+
onProgress?.(progress);
|
|
516
|
+
};
|
|
517
|
+
const batchSize = 32;
|
|
518
|
+
for (let i = 0; i < chunks.length; i += batchSize) {
|
|
519
|
+
const batch = chunks.slice(i, i + batchSize);
|
|
520
|
+
const texts = batch.map((c) => c.content);
|
|
521
|
+
const embeddings = await this.embeddingBackend.embedBatch(texts);
|
|
522
|
+
batch.forEach((chunk, idx) => {
|
|
523
|
+
chunk.embedding = embeddings[idx];
|
|
524
|
+
});
|
|
525
|
+
report({
|
|
526
|
+
phase: 'embedding',
|
|
527
|
+
current: i + batch.length,
|
|
528
|
+
total: chunks.length,
|
|
529
|
+
message: `Embedded ${i + batch.length}/${chunks.length} chunks`,
|
|
530
|
+
});
|
|
531
|
+
}
|
|
532
|
+
}
|
|
533
|
+
async chunkFile(filepath) {
|
|
534
|
+
const ext = path.extname(filepath).slice(1);
|
|
102
535
|
const language = this.getLanguage(ext);
|
|
103
|
-
const relativePath = path.relative(this.projectPath,
|
|
536
|
+
const relativePath = path.relative(this.projectPath, filepath);
|
|
537
|
+
// Try AST-aware chunking for TypeScript/JavaScript
|
|
538
|
+
if (ASTChunker.canParse(filepath)) {
|
|
539
|
+
try {
|
|
540
|
+
return await this.chunkFileWithAST(filepath, relativePath, language);
|
|
541
|
+
}
|
|
542
|
+
catch {
|
|
543
|
+
// Fall back to line-based chunking if AST parsing fails
|
|
544
|
+
console.error(`[lance-context] AST parsing failed for ${relativePath}, falling back to line-based chunking`);
|
|
545
|
+
}
|
|
546
|
+
}
|
|
547
|
+
// Try tree-sitter chunking for other languages (Python, Go, Rust, Java, Kotlin)
|
|
548
|
+
if (TreeSitterChunker.canParse(filepath)) {
|
|
549
|
+
try {
|
|
550
|
+
return await this.chunkFileWithTreeSitter(filepath, relativePath, language);
|
|
551
|
+
}
|
|
552
|
+
catch (error) {
|
|
553
|
+
// Fall back to line-based chunking if tree-sitter parsing fails
|
|
554
|
+
console.error(`[lance-context] Tree-sitter parsing failed for ${relativePath}, falling back to line-based chunking:`, error);
|
|
555
|
+
}
|
|
556
|
+
}
|
|
557
|
+
// Line-based chunking for unsupported languages or as fallback
|
|
558
|
+
return this.chunkFileByLines(filepath, relativePath, language);
|
|
559
|
+
}
|
|
560
|
+
/**
|
|
561
|
+
* Chunk a file using AST-aware parsing
|
|
562
|
+
*/
|
|
563
|
+
async chunkFileWithAST(filepath, relativePath, language) {
|
|
564
|
+
const astChunker = new ASTChunker();
|
|
565
|
+
const astChunks = await astChunker.chunkFile(filepath);
|
|
566
|
+
return astChunks.map((chunk) => ({
|
|
567
|
+
id: `${relativePath}:${chunk.startLine}-${chunk.endLine}${chunk.name ? `:${chunk.name}` : ''}`,
|
|
568
|
+
filepath: relativePath,
|
|
569
|
+
content: chunk.content,
|
|
570
|
+
startLine: chunk.startLine,
|
|
571
|
+
endLine: chunk.endLine,
|
|
572
|
+
language,
|
|
573
|
+
symbolType: chunk.type,
|
|
574
|
+
symbolName: chunk.name,
|
|
575
|
+
}));
|
|
576
|
+
}
|
|
577
|
+
/**
|
|
578
|
+
* Chunk a file using tree-sitter AST parsing (Python, Go, Rust, Java, Kotlin)
|
|
579
|
+
*/
|
|
580
|
+
async chunkFileWithTreeSitter(filepath, relativePath, language) {
|
|
581
|
+
const treeSitterChunker = new TreeSitterChunker();
|
|
582
|
+
const treeSitterChunks = await treeSitterChunker.chunkFile(filepath);
|
|
583
|
+
return treeSitterChunks.map((chunk) => ({
|
|
584
|
+
id: `${relativePath}:${chunk.startLine}-${chunk.endLine}${chunk.name ? `:${chunk.name}` : ''}`,
|
|
585
|
+
filepath: relativePath,
|
|
586
|
+
content: chunk.content,
|
|
587
|
+
startLine: chunk.startLine,
|
|
588
|
+
endLine: chunk.endLine,
|
|
589
|
+
language,
|
|
590
|
+
symbolType: chunk.type,
|
|
591
|
+
symbolName: chunk.name,
|
|
592
|
+
}));
|
|
593
|
+
}
|
|
594
|
+
/**
|
|
595
|
+
* Chunk a file using line-based splitting (fallback)
|
|
596
|
+
*/
|
|
597
|
+
async chunkFileByLines(filepath, relativePath, language) {
|
|
598
|
+
const content = await fs.readFile(filepath, 'utf-8');
|
|
599
|
+
const lines = content.split('\n');
|
|
600
|
+
const chunkingConfig = getChunkingConfig(this.config);
|
|
601
|
+
const chunkSize = chunkingConfig.maxLines;
|
|
602
|
+
const chunkOverlap = chunkingConfig.overlap;
|
|
104
603
|
const chunks = [];
|
|
105
|
-
for (let i = 0; i < lines.length; i +=
|
|
106
|
-
const chunkLines = lines.slice(i, i +
|
|
604
|
+
for (let i = 0; i < lines.length; i += chunkSize - chunkOverlap) {
|
|
605
|
+
const chunkLines = lines.slice(i, i + chunkSize);
|
|
107
606
|
const chunkContent = chunkLines.join('\n');
|
|
108
607
|
if (chunkContent.trim().length === 0)
|
|
109
608
|
continue;
|
|
110
609
|
chunks.push({
|
|
111
610
|
id: `${relativePath}:${i + 1}-${i + chunkLines.length}`,
|
|
112
|
-
|
|
611
|
+
filepath: relativePath,
|
|
113
612
|
content: chunkContent,
|
|
114
613
|
startLine: i + 1,
|
|
115
614
|
endLine: i + chunkLines.length,
|
|
@@ -144,30 +643,427 @@ export class CodeIndexer {
|
|
|
144
643
|
};
|
|
145
644
|
return langMap[ext] || ext;
|
|
146
645
|
}
|
|
147
|
-
|
|
646
|
+
/**
|
|
647
|
+
* Get query embedding from cache or compute it.
|
|
648
|
+
* Uses LRU eviction when cache is full.
|
|
649
|
+
*/
|
|
650
|
+
async getQueryEmbedding(query) {
|
|
651
|
+
// Check cache first
|
|
652
|
+
const cached = this.queryEmbeddingCache.get(query);
|
|
653
|
+
if (cached) {
|
|
654
|
+
// Move to end for LRU (delete and re-insert)
|
|
655
|
+
this.queryEmbeddingCache.delete(query);
|
|
656
|
+
this.queryEmbeddingCache.set(query, cached);
|
|
657
|
+
return cached;
|
|
658
|
+
}
|
|
659
|
+
// Compute embedding
|
|
660
|
+
const embedding = await this.embeddingBackend.embed(query);
|
|
661
|
+
// Evict oldest entry if cache is full (first entry in Map)
|
|
662
|
+
if (this.queryEmbeddingCache.size >= QUERY_CACHE_MAX_SIZE) {
|
|
663
|
+
const oldestKey = this.queryEmbeddingCache.keys().next().value;
|
|
664
|
+
if (oldestKey !== undefined) {
|
|
665
|
+
this.queryEmbeddingCache.delete(oldestKey);
|
|
666
|
+
}
|
|
667
|
+
}
|
|
668
|
+
// Store in cache
|
|
669
|
+
this.queryEmbeddingCache.set(query, embedding);
|
|
670
|
+
return embedding;
|
|
671
|
+
}
|
|
672
|
+
/**
|
|
673
|
+
* Check if a filepath matches a glob pattern.
|
|
674
|
+
* Supports negation patterns starting with '!'.
|
|
675
|
+
*/
|
|
676
|
+
matchesPathPattern(filepath, pattern) {
|
|
677
|
+
// Handle negation pattern
|
|
678
|
+
if (pattern.startsWith('!')) {
|
|
679
|
+
return !minimatch(filepath, pattern.slice(1));
|
|
680
|
+
}
|
|
681
|
+
return minimatch(filepath, pattern);
|
|
682
|
+
}
|
|
683
|
+
async search(queryOrOptions, limit) {
|
|
684
|
+
// Normalize arguments
|
|
685
|
+
const options = typeof queryOrOptions === 'string'
|
|
686
|
+
? { query: queryOrOptions, limit: limit ?? 10 }
|
|
687
|
+
: queryOrOptions;
|
|
688
|
+
const { query, limit: resultLimit = 10, pathPattern, languages } = options;
|
|
148
689
|
if (!this.table) {
|
|
149
690
|
const status = await this.getStatus();
|
|
150
691
|
if (!status.indexed) {
|
|
151
692
|
throw new Error('Codebase not indexed. Run index_codebase first.');
|
|
152
693
|
}
|
|
153
694
|
}
|
|
154
|
-
const queryEmbedding = await this.
|
|
155
|
-
const
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
695
|
+
const queryEmbedding = await this.getQueryEmbedding(query);
|
|
696
|
+
const searchConfig = getSearchConfig(this.config);
|
|
697
|
+
// Fetch more results than needed for re-ranking and filtering
|
|
698
|
+
// If we have filters, fetch even more to account for filtered-out results
|
|
699
|
+
const hasFilters = pathPattern !== undefined || (languages && languages.length > 0);
|
|
700
|
+
const fetchMultiplier = hasFilters ? 5 : 3;
|
|
701
|
+
const fetchLimit = Math.min(resultLimit * fetchMultiplier, hasFilters ? 100 : 50);
|
|
702
|
+
const results = await this.table.search(queryEmbedding).limit(fetchLimit).toArray();
|
|
703
|
+
// Apply filters
|
|
704
|
+
let filteredResults = results;
|
|
705
|
+
if (pathPattern) {
|
|
706
|
+
filteredResults = filteredResults.filter((r) => this.matchesPathPattern(r.filepath, pathPattern));
|
|
707
|
+
}
|
|
708
|
+
if (languages && languages.length > 0) {
|
|
709
|
+
const normalizedLanguages = languages.map((l) => l.toLowerCase());
|
|
710
|
+
filteredResults = filteredResults.filter((r) => normalizedLanguages.includes(r.language.toLowerCase()));
|
|
711
|
+
}
|
|
712
|
+
// Hybrid scoring: combine semantic similarity with keyword matching
|
|
713
|
+
const scoredResults = filteredResults.map((r, index) => {
|
|
714
|
+
// Semantic score: inverse of rank (higher is better)
|
|
715
|
+
const semanticScore = 1 - index / fetchLimit;
|
|
716
|
+
// Keyword score: based on query term matches
|
|
717
|
+
const keywordScore = this.calculateKeywordScore(query, r.content, r.filepath);
|
|
718
|
+
// Combined score using configurable weights
|
|
719
|
+
const combinedScore = searchConfig.semanticWeight * semanticScore + searchConfig.keywordWeight * keywordScore;
|
|
720
|
+
return { result: r, score: combinedScore };
|
|
721
|
+
});
|
|
722
|
+
// Sort by combined score and take top results
|
|
723
|
+
scoredResults.sort((a, b) => b.score - a.score);
|
|
724
|
+
return scoredResults.slice(0, resultLimit).map((sr) => ({
|
|
725
|
+
id: sr.result.id,
|
|
726
|
+
filepath: sr.result.filepath,
|
|
727
|
+
content: sr.result.content,
|
|
728
|
+
startLine: sr.result.startLine,
|
|
729
|
+
endLine: sr.result.endLine,
|
|
730
|
+
language: sr.result.language,
|
|
731
|
+
symbolType: sr.result.symbolType,
|
|
732
|
+
symbolName: sr.result.symbolName,
|
|
163
733
|
}));
|
|
164
734
|
}
|
|
735
|
+
/**
|
|
736
|
+
* Calculate keyword match score for hybrid search
|
|
737
|
+
*/
|
|
738
|
+
calculateKeywordScore(query, content, filepath) {
|
|
739
|
+
const queryTerms = query
|
|
740
|
+
.toLowerCase()
|
|
741
|
+
.split(/\s+/)
|
|
742
|
+
.filter((t) => t.length > 2);
|
|
743
|
+
if (queryTerms.length === 0)
|
|
744
|
+
return 0;
|
|
745
|
+
const contentLower = content.toLowerCase();
|
|
746
|
+
const filepathLower = filepath.toLowerCase();
|
|
747
|
+
let matchCount = 0;
|
|
748
|
+
let exactMatchBonus = 0;
|
|
749
|
+
for (const term of queryTerms) {
|
|
750
|
+
// Check content matches
|
|
751
|
+
if (contentLower.includes(term)) {
|
|
752
|
+
matchCount++;
|
|
753
|
+
// Bonus for exact word match (not just substring)
|
|
754
|
+
const wordBoundaryRegex = new RegExp(`\\b${term}\\b`, 'i');
|
|
755
|
+
if (wordBoundaryRegex.test(content)) {
|
|
756
|
+
exactMatchBonus += 0.5;
|
|
757
|
+
}
|
|
758
|
+
}
|
|
759
|
+
// Bonus for filename/path match
|
|
760
|
+
if (filepathLower.includes(term)) {
|
|
761
|
+
matchCount += 0.5;
|
|
762
|
+
}
|
|
763
|
+
}
|
|
764
|
+
// Normalize score to 0-1 range
|
|
765
|
+
const baseScore = matchCount / queryTerms.length;
|
|
766
|
+
const bonusScore = Math.min(exactMatchBonus / queryTerms.length, 0.5);
|
|
767
|
+
return Math.min(baseScore + bonusScore, 1);
|
|
768
|
+
}
|
|
769
|
+
/**
|
|
770
|
+
* Find code chunks semantically similar to a given code snippet or file location.
|
|
771
|
+
* This is useful for finding duplicate logic, similar implementations, or related code.
|
|
772
|
+
*/
|
|
773
|
+
async searchSimilar(options) {
|
|
774
|
+
const { filepath, startLine, endLine, code, limit = 10, threshold = 0, excludeSelf = true, } = options;
|
|
775
|
+
// Validate input first - need either code or filepath
|
|
776
|
+
if (!code && !filepath) {
|
|
777
|
+
throw new Error('Either code or filepath must be provided');
|
|
778
|
+
}
|
|
779
|
+
if (!this.table) {
|
|
780
|
+
const status = await this.getStatus();
|
|
781
|
+
if (!status.indexed) {
|
|
782
|
+
throw new Error('Codebase not indexed. Run index_codebase first.');
|
|
783
|
+
}
|
|
784
|
+
}
|
|
785
|
+
// Get the source code to find similar chunks for
|
|
786
|
+
let sourceCode;
|
|
787
|
+
let sourceId = null;
|
|
788
|
+
if (code) {
|
|
789
|
+
sourceCode = code;
|
|
790
|
+
}
|
|
791
|
+
else {
|
|
792
|
+
// Read from file
|
|
793
|
+
const fullPath = path.join(this.projectPath, filepath);
|
|
794
|
+
const fileContent = await fs.readFile(fullPath, 'utf-8');
|
|
795
|
+
const lines = fileContent.split('\n');
|
|
796
|
+
const start = startLine ? startLine - 1 : 0;
|
|
797
|
+
const end = endLine ? endLine : lines.length;
|
|
798
|
+
sourceCode = lines.slice(start, end).join('\n');
|
|
799
|
+
// Build source ID for exclusion
|
|
800
|
+
if (startLine && endLine) {
|
|
801
|
+
sourceId = `${filepath}:${startLine}-${endLine}`;
|
|
802
|
+
}
|
|
803
|
+
}
|
|
804
|
+
if (!sourceCode.trim()) {
|
|
805
|
+
throw new Error('Source code is empty');
|
|
806
|
+
}
|
|
807
|
+
// Embed the source code
|
|
808
|
+
const sourceEmbedding = await this.embeddingBackend.embed(sourceCode);
|
|
809
|
+
// Search for similar chunks - fetch extra to account for filtering
|
|
810
|
+
const fetchLimit = Math.min((limit + 5) * 2, 100);
|
|
811
|
+
const results = await this.table.search(sourceEmbedding).limit(fetchLimit).toArray();
|
|
812
|
+
// LanceDB returns results sorted by distance (ascending)
|
|
813
|
+
// Convert distance to similarity score (1 - normalized_distance)
|
|
814
|
+
const maxDistance = results.length > 0 ? Math.max(...results.map((r) => r._distance || 0)) : 1;
|
|
815
|
+
const scoredResults = [];
|
|
816
|
+
for (const r of results) {
|
|
817
|
+
// Skip self if requested
|
|
818
|
+
if (excludeSelf && sourceId && r.id === sourceId) {
|
|
819
|
+
continue;
|
|
820
|
+
}
|
|
821
|
+
// Also skip if content is identical (for code-based search)
|
|
822
|
+
if (excludeSelf && code && r.content.trim() === code.trim()) {
|
|
823
|
+
continue;
|
|
824
|
+
}
|
|
825
|
+
// Convert distance to similarity (0 = far, 1 = identical)
|
|
826
|
+
const distance = r._distance || 0;
|
|
827
|
+
const similarity = maxDistance > 0 ? 1 - distance / maxDistance : 1;
|
|
828
|
+
// Apply threshold filter
|
|
829
|
+
if (similarity < threshold) {
|
|
830
|
+
continue;
|
|
831
|
+
}
|
|
832
|
+
scoredResults.push({
|
|
833
|
+
id: r.id,
|
|
834
|
+
filepath: r.filepath,
|
|
835
|
+
content: r.content,
|
|
836
|
+
startLine: r.startLine,
|
|
837
|
+
endLine: r.endLine,
|
|
838
|
+
language: r.language,
|
|
839
|
+
similarity,
|
|
840
|
+
symbolType: r.symbolType,
|
|
841
|
+
symbolName: r.symbolName,
|
|
842
|
+
});
|
|
843
|
+
if (scoredResults.length >= limit) {
|
|
844
|
+
break;
|
|
845
|
+
}
|
|
846
|
+
}
|
|
847
|
+
return scoredResults;
|
|
848
|
+
}
|
|
165
849
|
async clearIndex() {
|
|
166
850
|
const tableNames = await this.db.tableNames();
|
|
167
851
|
if (tableNames.includes('code_chunks')) {
|
|
168
852
|
await this.db.dropTable('code_chunks');
|
|
169
853
|
}
|
|
170
854
|
this.table = null;
|
|
855
|
+
// Clear query embedding cache to prevent stale embeddings
|
|
856
|
+
this.queryEmbeddingCache.clear();
|
|
857
|
+
// Clear clustering metadata
|
|
858
|
+
await this.clearClusteringMetadata();
|
|
859
|
+
}
|
|
860
|
+
/**
 * Path of the JSON file that persists clustering results inside the index
 * directory (written by saveClusteringMetadata, read by loadClusteringMetadata,
 * removed by clearClusteringMetadata).
 */
get clusteringMetadataPath() {
    return path.join(this.indexPath, 'clustering-metadata.json');
}
|
|
863
|
+
/**
|
|
864
|
+
* Clear clustering metadata file
|
|
865
|
+
*/
|
|
866
|
+
async clearClusteringMetadata() {
|
|
867
|
+
try {
|
|
868
|
+
await fs.unlink(this.clusteringMetadataPath);
|
|
869
|
+
}
|
|
870
|
+
catch {
|
|
871
|
+
// Ignore errors if file doesn't exist
|
|
872
|
+
}
|
|
873
|
+
}
|
|
874
|
+
/**
|
|
875
|
+
* Save clustering result to metadata file
|
|
876
|
+
*/
|
|
877
|
+
async saveClusteringMetadata(result) {
|
|
878
|
+
await fs.mkdir(this.indexPath, { recursive: true });
|
|
879
|
+
const data = {
|
|
880
|
+
clusterCount: result.clusterCount,
|
|
881
|
+
clusters: result.clusters,
|
|
882
|
+
// Convert Map to object for JSON serialization
|
|
883
|
+
assignments: Object.fromEntries(result.assignments),
|
|
884
|
+
generatedAt: new Date().toISOString(),
|
|
885
|
+
};
|
|
886
|
+
await fs.writeFile(this.clusteringMetadataPath, JSON.stringify(data, null, 2));
|
|
887
|
+
}
|
|
888
|
+
/**
|
|
889
|
+
* Load clustering result from metadata file
|
|
890
|
+
*/
|
|
891
|
+
async loadClusteringMetadata() {
|
|
892
|
+
try {
|
|
893
|
+
const content = await fs.readFile(this.clusteringMetadataPath, 'utf-8');
|
|
894
|
+
const data = JSON.parse(content);
|
|
895
|
+
return {
|
|
896
|
+
clusterCount: data.clusterCount,
|
|
897
|
+
clusters: data.clusters,
|
|
898
|
+
// Convert object back to Map
|
|
899
|
+
assignments: new Map(Object.entries(data.assignments).map(([k, v]) => [k, v])),
|
|
900
|
+
};
|
|
901
|
+
}
|
|
902
|
+
catch {
|
|
903
|
+
return null;
|
|
904
|
+
}
|
|
905
|
+
}
|
|
906
|
+
/**
|
|
907
|
+
* Cluster the indexed codebase into semantic concept areas.
|
|
908
|
+
* Uses k-means clustering on embeddings to discover related code groups.
|
|
909
|
+
*/
|
|
910
|
+
async clusterConcepts(options = {}) {
|
|
911
|
+
if (!this.table) {
|
|
912
|
+
const status = await this.getStatus();
|
|
913
|
+
if (!status.indexed) {
|
|
914
|
+
throw new Error('Codebase not indexed. Run index_codebase first.');
|
|
915
|
+
}
|
|
916
|
+
this.table = await this.db.openTable('code_chunks');
|
|
917
|
+
}
|
|
918
|
+
// Fetch all chunks with embeddings
|
|
919
|
+
const rows = await this.table.query().toArray();
|
|
920
|
+
const chunks = rows.map((row) => ({
|
|
921
|
+
id: row.id,
|
|
922
|
+
content: row.content,
|
|
923
|
+
filepath: row.filepath,
|
|
924
|
+
embedding: row.vector,
|
|
925
|
+
symbolName: row.symbolName,
|
|
926
|
+
symbolType: row.symbolType,
|
|
927
|
+
}));
|
|
928
|
+
// Perform clustering
|
|
929
|
+
const result = kMeansClustering(chunks, options);
|
|
930
|
+
// Save to metadata file
|
|
931
|
+
await this.saveClusteringMetadata(result);
|
|
932
|
+
return result;
|
|
933
|
+
}
|
|
934
|
+
/**
|
|
935
|
+
* List all discovered concept clusters.
|
|
936
|
+
* Returns cached clustering result if available, otherwise clusters first.
|
|
937
|
+
*/
|
|
938
|
+
async listConcepts(forceRecluster = false) {
|
|
939
|
+
if (!forceRecluster) {
|
|
940
|
+
const cached = await this.loadClusteringMetadata();
|
|
941
|
+
if (cached) {
|
|
942
|
+
return cached.clusters;
|
|
943
|
+
}
|
|
944
|
+
}
|
|
945
|
+
const result = await this.clusterConcepts();
|
|
946
|
+
return result.clusters;
|
|
947
|
+
}
|
|
948
|
+
/**
|
|
949
|
+
* Search for code within a specific concept cluster.
|
|
950
|
+
* Returns chunks that belong to the specified cluster, optionally filtered by query.
|
|
951
|
+
*/
|
|
952
|
+
async searchByConcept(conceptId, query, limit = 10) {
|
|
953
|
+
const clustering = await this.loadClusteringMetadata();
|
|
954
|
+
if (!clustering) {
|
|
955
|
+
throw new Error('No clustering data available. Run clusterConcepts first.');
|
|
956
|
+
}
|
|
957
|
+
// Get chunk IDs in this cluster
|
|
958
|
+
const chunkIds = new Set();
|
|
959
|
+
for (const [chunkId, clusterId] of clustering.assignments) {
|
|
960
|
+
if (clusterId === conceptId) {
|
|
961
|
+
chunkIds.add(chunkId);
|
|
962
|
+
}
|
|
963
|
+
}
|
|
964
|
+
if (chunkIds.size === 0) {
|
|
965
|
+
return [];
|
|
966
|
+
}
|
|
967
|
+
if (!this.table) {
|
|
968
|
+
this.table = await this.db.openTable('code_chunks');
|
|
969
|
+
}
|
|
970
|
+
// If query provided, use semantic search and filter to cluster
|
|
971
|
+
if (query) {
|
|
972
|
+
const queryEmbedding = await this.getQueryEmbedding(query);
|
|
973
|
+
const results = await this.table
|
|
974
|
+
.search(queryEmbedding)
|
|
975
|
+
.limit(limit * 3)
|
|
976
|
+
.toArray();
|
|
977
|
+
return results
|
|
978
|
+
.filter((r) => chunkIds.has(r.id))
|
|
979
|
+
.slice(0, limit)
|
|
980
|
+
.map((r) => ({
|
|
981
|
+
id: r.id,
|
|
982
|
+
filepath: r.filepath,
|
|
983
|
+
content: r.content,
|
|
984
|
+
startLine: r.startLine,
|
|
985
|
+
endLine: r.endLine,
|
|
986
|
+
language: r.language,
|
|
987
|
+
symbolType: r.symbolType,
|
|
988
|
+
symbolName: r.symbolName,
|
|
989
|
+
}));
|
|
990
|
+
}
|
|
991
|
+
// Without query, return representative chunks from the cluster
|
|
992
|
+
const cluster = clustering.clusters.find((c) => c.id === conceptId);
|
|
993
|
+
if (!cluster) {
|
|
994
|
+
return [];
|
|
995
|
+
}
|
|
996
|
+
const results = [];
|
|
997
|
+
for (const chunkId of cluster.representativeChunks.slice(0, limit)) {
|
|
998
|
+
// Fetch chunk by ID - LanceDB doesn't have direct ID lookup, so we filter
|
|
999
|
+
const rows = await this.table.query().where(`id = '${chunkId}'`).limit(1).toArray();
|
|
1000
|
+
if (rows.length > 0) {
|
|
1001
|
+
const r = rows[0];
|
|
1002
|
+
results.push({
|
|
1003
|
+
id: r.id,
|
|
1004
|
+
filepath: r.filepath,
|
|
1005
|
+
content: r.content,
|
|
1006
|
+
startLine: r.startLine,
|
|
1007
|
+
endLine: r.endLine,
|
|
1008
|
+
language: r.language,
|
|
1009
|
+
symbolType: r.symbolType,
|
|
1010
|
+
symbolName: r.symbolName,
|
|
1011
|
+
});
|
|
1012
|
+
}
|
|
1013
|
+
}
|
|
1014
|
+
return results;
|
|
1015
|
+
}
|
|
1016
|
+
/**
|
|
1017
|
+
* Generate a comprehensive summary of the codebase including concept areas.
|
|
1018
|
+
*/
|
|
1019
|
+
async summarizeCodebase(clusteringOptions) {
|
|
1020
|
+
const status = await this.getStatus();
|
|
1021
|
+
if (!status.indexed) {
|
|
1022
|
+
throw new Error('Codebase not indexed. Run index_codebase first.');
|
|
1023
|
+
}
|
|
1024
|
+
if (!this.table) {
|
|
1025
|
+
this.table = await this.db.openTable('code_chunks');
|
|
1026
|
+
}
|
|
1027
|
+
// Gather language statistics
|
|
1028
|
+
const rows = await this.table.query().toArray();
|
|
1029
|
+
const languageStats = new Map();
|
|
1030
|
+
for (const row of rows) {
|
|
1031
|
+
const lang = row.language;
|
|
1032
|
+
if (!languageStats.has(lang)) {
|
|
1033
|
+
languageStats.set(lang, { fileCount: new Set(), chunkCount: 0 });
|
|
1034
|
+
}
|
|
1035
|
+
const stats = languageStats.get(lang);
|
|
1036
|
+
stats.fileCount.add(row.filepath);
|
|
1037
|
+
stats.chunkCount++;
|
|
1038
|
+
}
|
|
1039
|
+
const languages = Array.from(languageStats.entries())
|
|
1040
|
+
.map(([language, stats]) => ({
|
|
1041
|
+
language,
|
|
1042
|
+
fileCount: stats.fileCount.size,
|
|
1043
|
+
chunkCount: stats.chunkCount,
|
|
1044
|
+
}))
|
|
1045
|
+
.sort((a, b) => b.chunkCount - a.chunkCount);
|
|
1046
|
+
// Perform clustering
|
|
1047
|
+
const chunks = rows.map((row) => ({
|
|
1048
|
+
id: row.id,
|
|
1049
|
+
content: row.content,
|
|
1050
|
+
filepath: row.filepath,
|
|
1051
|
+
embedding: row.vector,
|
|
1052
|
+
symbolName: row.symbolName,
|
|
1053
|
+
symbolType: row.symbolType,
|
|
1054
|
+
}));
|
|
1055
|
+
const clusteringResult = kMeansClustering(chunks, clusteringOptions);
|
|
1056
|
+
await this.saveClusteringMetadata(clusteringResult);
|
|
1057
|
+
// Calculate clustering quality
|
|
1058
|
+
const silhouetteScore = calculateSilhouetteScore(chunks, clusteringResult.assignments, clusteringResult.clusters);
|
|
1059
|
+
return {
|
|
1060
|
+
totalFiles: status.fileCount,
|
|
1061
|
+
totalChunks: status.chunkCount,
|
|
1062
|
+
languages,
|
|
1063
|
+
concepts: clusteringResult.clusters,
|
|
1064
|
+
clusteringQuality: silhouetteScore,
|
|
1065
|
+
generatedAt: new Date().toISOString(),
|
|
1066
|
+
};
|
|
171
1067
|
}
|
|
172
1068
|
}
|
|
173
1069
|
//# sourceMappingURL=indexer.js.map
|