ragcode-context-engine 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +366 -0
- package/README.zh-CN.md +363 -0
- package/dist/src/cli/configure/app.d.ts +6 -0
- package/dist/src/cli/configure/app.js +81 -0
- package/dist/src/cli/configure/run.d.ts +5 -0
- package/dist/src/cli/configure/run.js +85 -0
- package/dist/src/cli/configure/state.d.ts +42 -0
- package/dist/src/cli/configure/state.js +174 -0
- package/dist/src/cli/configure.d.ts +31 -0
- package/dist/src/cli/configure.js +101 -0
- package/dist/src/cli/index.d.ts +2 -0
- package/dist/src/cli/index.js +503 -0
- package/dist/src/cli/tui/index-progress.d.ts +12 -0
- package/dist/src/cli/tui/index-progress.js +49 -0
- package/dist/src/cli/tui/watch-status.d.ts +10 -0
- package/dist/src/cli/tui/watch-status.js +27 -0
- package/dist/src/cli/update.d.ts +18 -0
- package/dist/src/cli/update.js +111 -0
- package/dist/src/config/dotenv.d.ts +1 -0
- package/dist/src/config/dotenv.js +14 -0
- package/dist/src/config/graph-runtime.d.ts +13 -0
- package/dist/src/config/graph-runtime.js +29 -0
- package/dist/src/config/runtime-config.d.ts +87 -0
- package/dist/src/config/runtime-config.js +215 -0
- package/dist/src/config/semantic-runtime.d.ts +24 -0
- package/dist/src/config/semantic-runtime.js +89 -0
- package/dist/src/context/context-builder.d.ts +20 -0
- package/dist/src/context/context-builder.js +277 -0
- package/dist/src/context/expansion-policy.d.ts +6 -0
- package/dist/src/context/expansion-policy.js +49 -0
- package/dist/src/context/skeletonizer.d.ts +2 -0
- package/dist/src/context/skeletonizer.js +79 -0
- package/dist/src/context/snippet-renderer.d.ts +2 -0
- package/dist/src/context/snippet-renderer.js +67 -0
- package/dist/src/core/contracts.d.ts +74 -0
- package/dist/src/core/contracts.js +1 -0
- package/dist/src/core/engine.d.ts +64 -0
- package/dist/src/core/engine.js +442 -0
- package/dist/src/core/types.d.ts +490 -0
- package/dist/src/core/types.js +1 -0
- package/dist/src/diagnostics/doctor.d.ts +66 -0
- package/dist/src/diagnostics/doctor.js +193 -0
- package/dist/src/diagnostics/embedding-test.d.ts +24 -0
- package/dist/src/diagnostics/embedding-test.js +83 -0
- package/dist/src/graph/diff-files.d.ts +1 -0
- package/dist/src/graph/diff-files.js +14 -0
- package/dist/src/graph/impact-report.d.ts +10 -0
- package/dist/src/graph/impact-report.js +173 -0
- package/dist/src/graph/in-memory-graph-store.d.ts +36 -0
- package/dist/src/graph/in-memory-graph-store.js +395 -0
- package/dist/src/graph/owner-ranking.d.ts +2 -0
- package/dist/src/graph/owner-ranking.js +41 -0
- package/dist/src/graph/sqlite-graph-store.d.ts +51 -0
- package/dist/src/graph/sqlite-graph-store.js +724 -0
- package/dist/src/graph/sqlite-statements.d.ts +36 -0
- package/dist/src/graph/sqlite-statements.js +105 -0
- package/dist/src/graph/target-matcher.d.ts +13 -0
- package/dist/src/graph/target-matcher.js +64 -0
- package/dist/src/index.d.ts +32 -0
- package/dist/src/index.js +32 -0
- package/dist/src/indexing/analyzers/fallback-analyzer.d.ts +6 -0
- package/dist/src/indexing/analyzers/fallback-analyzer.js +45 -0
- package/dist/src/indexing/analyzers/go-treesitter-analyzer.d.ts +2 -0
- package/dist/src/indexing/analyzers/go-treesitter-analyzer.js +87 -0
- package/dist/src/indexing/analyzers/java-treesitter-analyzer.d.ts +2 -0
- package/dist/src/indexing/analyzers/java-treesitter-analyzer.js +88 -0
- package/dist/src/indexing/analyzers/python-treesitter-analyzer.d.ts +2 -0
- package/dist/src/indexing/analyzers/python-treesitter-analyzer.js +96 -0
- package/dist/src/indexing/analyzers/registry.d.ts +5 -0
- package/dist/src/indexing/analyzers/registry.js +23 -0
- package/dist/src/indexing/analyzers/rust-treesitter-analyzer.d.ts +2 -0
- package/dist/src/indexing/analyzers/rust-treesitter-analyzer.js +96 -0
- package/dist/src/indexing/analyzers/tree-sitter-base.d.ts +30 -0
- package/dist/src/indexing/analyzers/tree-sitter-base.js +163 -0
- package/dist/src/indexing/analyzers/types.d.ts +17 -0
- package/dist/src/indexing/analyzers/types.js +1 -0
- package/dist/src/indexing/analyzers/typescript-analyzer.d.ts +5 -0
- package/dist/src/indexing/analyzers/typescript-analyzer.js +199 -0
- package/dist/src/indexing/ast-analyzer.d.ts +11 -0
- package/dist/src/indexing/ast-analyzer.js +11 -0
- package/dist/src/indexing/chunker.d.ts +11 -0
- package/dist/src/indexing/chunker.js +157 -0
- package/dist/src/indexing/ignore-policy.d.ts +6 -0
- package/dist/src/indexing/ignore-policy.js +40 -0
- package/dist/src/indexing/indexer.d.ts +13 -0
- package/dist/src/indexing/indexer.js +189 -0
- package/dist/src/indexing/language.d.ts +3 -0
- package/dist/src/indexing/language.js +24 -0
- package/dist/src/indexing/scanner.d.ts +13 -0
- package/dist/src/indexing/scanner.js +87 -0
- package/dist/src/lsp/definition-resolver.d.ts +6 -0
- package/dist/src/lsp/definition-resolver.js +60 -0
- package/dist/src/lsp/typescript-language-service.d.ts +21 -0
- package/dist/src/lsp/typescript-language-service.js +82 -0
- package/dist/src/mcp/server.d.ts +11 -0
- package/dist/src/mcp/server.js +64 -0
- package/dist/src/mcp/tools.d.ts +266 -0
- package/dist/src/mcp/tools.js +309 -0
- package/dist/src/project/project-identity.d.ts +2 -0
- package/dist/src/project/project-identity.js +24 -0
- package/dist/src/project/project-registry.d.ts +12 -0
- package/dist/src/project/project-registry.js +49 -0
- package/dist/src/project/workspace-resolver.d.ts +20 -0
- package/dist/src/project/workspace-resolver.js +62 -0
- package/dist/src/retrieval/graph-reranker.d.ts +11 -0
- package/dist/src/retrieval/graph-reranker.js +0 -0
- package/dist/src/retrieval/hybrid-retriever.d.ts +31 -0
- package/dist/src/retrieval/hybrid-retriever.js +111 -0
- package/dist/src/retrieval/path-classification.d.ts +6 -0
- package/dist/src/retrieval/path-classification.js +22 -0
- package/dist/src/retrieval/query-matching.d.ts +22 -0
- package/dist/src/retrieval/query-matching.js +166 -0
- package/dist/src/retrieval/query-planner.d.ts +5 -0
- package/dist/src/retrieval/query-planner.js +77 -0
- package/dist/src/retrieval/ranking-signals.d.ts +19 -0
- package/dist/src/retrieval/ranking-signals.js +97 -0
- package/dist/src/retrieval/topology-distance.d.ts +21 -0
- package/dist/src/retrieval/topology-distance.js +116 -0
- package/dist/src/reuse/reuse-detector.d.ts +12 -0
- package/dist/src/reuse/reuse-detector.js +564 -0
- package/dist/src/semantic/deterministic-embedding.d.ts +7 -0
- package/dist/src/semantic/deterministic-embedding.js +31 -0
- package/dist/src/semantic/in-memory-semantic-store.d.ts +11 -0
- package/dist/src/semantic/in-memory-semantic-store.js +65 -0
- package/dist/src/semantic/lance-semantic-store.d.ts +131 -0
- package/dist/src/semantic/lance-semantic-store.js +623 -0
- package/dist/src/semantic/openai-compatible-embedding.d.ts +19 -0
- package/dist/src/semantic/openai-compatible-embedding.js +75 -0
- package/dist/src/service/service-identity.d.ts +13 -0
- package/dist/src/service/service-identity.js +48 -0
- package/dist/src/service/service-manager.d.ts +29 -0
- package/dist/src/service/service-manager.js +231 -0
- package/dist/src/service/service-templates.d.ts +22 -0
- package/dist/src/service/service-templates.js +101 -0
- package/dist/src/subgraph/impact-explainer.d.ts +2 -0
- package/dist/src/subgraph/impact-explainer.js +54 -0
- package/dist/src/subgraph/node-expander.d.ts +13 -0
- package/dist/src/subgraph/node-expander.js +139 -0
- package/dist/src/subgraph/output-preset.d.ts +3 -0
- package/dist/src/subgraph/output-preset.js +102 -0
- package/dist/src/subgraph/subgraph-builder.d.ts +17 -0
- package/dist/src/subgraph/subgraph-builder.js +688 -0
- package/dist/src/topology/export-index.d.ts +7 -0
- package/dist/src/topology/export-index.js +14 -0
- package/dist/src/topology/framework-topology.d.ts +3 -0
- package/dist/src/topology/framework-topology.js +460 -0
- package/dist/src/topology/import-resolver.d.ts +2 -0
- package/dist/src/topology/import-resolver.js +29 -0
- package/dist/src/topology/orm-topology.d.ts +3 -0
- package/dist/src/topology/orm-topology.js +200 -0
- package/dist/src/topology/runtime-topology.d.ts +3 -0
- package/dist/src/topology/runtime-topology.js +204 -0
- package/dist/src/topology/symbol-resolver.d.ts +6 -0
- package/dist/src/topology/symbol-resolver.js +74 -0
- package/dist/src/topology/test-topology.d.ts +2 -0
- package/dist/src/topology/test-topology.js +82 -0
- package/dist/src/utils/hash.d.ts +2 -0
- package/dist/src/utils/hash.js +7 -0
- package/dist/src/utils/path.d.ts +2 -0
- package/dist/src/utils/path.js +7 -0
- package/dist/src/watch/event-journal.d.ts +17 -0
- package/dist/src/watch/event-journal.js +81 -0
- package/dist/src/watch/file-event-coalescer.d.ts +9 -0
- package/dist/src/watch/file-event-coalescer.js +39 -0
- package/dist/src/watch/index-scheduler.d.ts +52 -0
- package/dist/src/watch/index-scheduler.js +190 -0
- package/dist/src/watch/watch-daemon.d.ts +73 -0
- package/dist/src/watch/watch-daemon.js +368 -0
- package/dist/src/watch/watcher-liveness.d.ts +47 -0
- package/dist/src/watch/watcher-liveness.js +168 -0
- package/dist/src/web/server.d.ts +1 -0
- package/dist/src/web/server.js +375 -0
- package/package.json +94 -0
|
@@ -0,0 +1,623 @@
|
|
|
1
|
+
import fs from "node:fs/promises";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { renderChunkForEmbedding } from "./in-memory-semantic-store.js";
|
|
4
|
+
/**
 * Semantic (vector) store that persists code chunks and their embeddings in a LanceDB table.
 *
 * Alongside the table it keeps an "embedding profile" (provider, model, dimensions, ...) in
 * `profileStore`. Whenever the profile or the table schema no longer matches what the current
 * embedding configuration produces, the table is either dropped for re-creation
 * (`repairOnMismatch`, the default) or an error is thrown.
 */
export class LanceSemanticStore {
    uri;
    /** Cached promise for the opened/created table; cleared on repair and on failed opens. */
    tablePromise;
    tableName;
    /** Injected or lazily created LanceDB connection. */
    connection;
    /** Optional injected LanceDB module; when absent the module is loaded on demand. */
    module;
    vectorDimensions;
    embeddingProfile;
    profileStore;
    embeddingBatchSize;
    embeddingConcurrency;
    embeddingRetryAttempts;
    embeddingRetryBaseDelayMs;
    repairOnMismatch;
    maxChunks;
    onProgress;
    /**
     * @param uri LanceDB URI (also used to derive the default profile store).
     * @param tableNameOrOptions Either the table name, or an options object
     *   (`tableName`, `connection`, `module`, `vectorDimensions`, `embeddingProfile`,
     *   `profileStore`, `embeddingBatchSize`, `embeddingConcurrency`, `embeddingRetryAttempts`,
     *   `embeddingRetryBaseDelayMs`, `repairOnMismatch`, `maxChunks`, `onProgress`).
     * @throws Error when a numeric option is not a positive integer.
     */
    constructor(uri, tableNameOrOptions = "code_chunks") {
        this.uri = uri;
        if (typeof tableNameOrOptions === "string") {
            // String shorthand: only the table name is customised, everything else is default.
            this.tableName = tableNameOrOptions;
            this.embeddingProfile = { provider: "unknown" };
            this.profileStore = defaultProfileStore(uri, this.tableName);
            this.embeddingBatchSize = 64;
            this.embeddingConcurrency = 1;
            this.embeddingRetryAttempts = 3;
            this.embeddingRetryBaseDelayMs = 100;
            this.repairOnMismatch = true;
            return;
        }
        this.tableName = tableNameOrOptions.tableName ?? "code_chunks";
        this.connection = tableNameOrOptions.connection;
        this.module = tableNameOrOptions.module;
        this.vectorDimensions = tableNameOrOptions.vectorDimensions;
        this.embeddingProfile = tableNameOrOptions.embeddingProfile ?? { provider: "unknown" };
        this.profileStore = tableNameOrOptions.profileStore ?? defaultProfileStore(uri, this.tableName);
        this.embeddingBatchSize = positiveInteger(tableNameOrOptions.embeddingBatchSize, 64);
        this.embeddingConcurrency = positiveInteger(tableNameOrOptions.embeddingConcurrency, 1);
        this.embeddingRetryAttempts = positiveInteger(tableNameOrOptions.embeddingRetryAttempts, 3);
        this.embeddingRetryBaseDelayMs = positiveInteger(tableNameOrOptions.embeddingRetryBaseDelayMs, 100);
        this.repairOnMismatch = tableNameOrOptions.repairOnMismatch ?? true;
        this.maxChunks = tableNameOrOptions.maxChunks;
        this.onProgress = tableNameOrOptions.onProgress;
    }
    /**
     * Reports whether the semantic index has to be rebuilt.
     *
     * @returns `true` when no embedding profile is stored, or when the table is missing,
     *   cannot be opened, or fails schema validation against the stored dimensions.
     */
    async needsRebuild(_repoRoot, _projectId) {
        const profile = await this.profileStore.read();
        if (!profile)
            return true;
        // repair: false makes a schema mismatch throw instead of dropping the table; any
        // failure is mapped to "no usable table".
        const table = await this.getExistingTable(profile.dimensions, { repair: false }).catch(() => undefined);
        return !table;
    }
    /**
     * Removes every row stored for `repoRoot`. When the table does not exist yet it is created
     * with a single placeholder seed row (using `vectorDimensions`, or 64 when unset).
     */
    async resetRepo(repoRoot) {
        const table = await this.getExistingTable();
        if (!table) {
            const dimensions = this.vectorDimensions ?? 64;
            await this.getTable(dimensions, [emptySeedRecord(dimensions)]);
            return;
        }
        await table.delete(equalsPredicate("repoRoot", repoRoot));
    }
    /** Deletes all rows of one file (matched by project id and file path); no-op without a table. */
    async deleteFile(_repoRoot, projectId, filePath) {
        const table = await this.getExistingTable();
        if (!table)
            return;
        await table.delete(andPredicate(equalsPredicate("projectId", projectId), equalsPredicate("filePath", filePath)));
    }
    /**
     * Replaces the stored rows of every file touched by `chunks`.
     *
     * Vectors of chunks whose content hash is already stored for the same project are reused;
     * the remaining chunks are embedded in batches of `embeddingBatchSize`, with up to
     * `embeddingConcurrency` batches in flight, and written as each window completes.
     *
     * @param chunks Chunks to store; at most `maxChunks` of them are kept (by kind priority).
     * @param provider Embedding provider exposing `embed` and optionally `embedBatch`/`dimensions`.
     * @param generation Generation number recorded on every written row.
     * @throws The first embedding failure of a window, after the window's successful batches
     *   have been persisted.
     */
    async upsertChunks(chunks, provider, generation = 1) {
        if (chunks.length === 0)
            return;
        const selectedChunks = selectChunksForEmbedding(chunks, this.maxChunks);
        const knownDimensions = provider.dimensions ?? this.vectorDimensions;
        // If the profile check already dropped the table there is nothing to open or reuse.
        const repairedBeforeReuse = knownDimensions ? await this.ensureCompatibleProfile(knownDimensions) : false;
        let table = repairedBeforeReuse ? undefined : await this.getExistingTable(knownDimensions);
        let { chunksToEmbed, reusedRows } = table
            ? await this.planChunkEmbeddings(table, selectedChunks, generation)
            : { chunksToEmbed: selectedChunks, reusedRows: [] };
        // Old rows are removed for every incoming file, including files whose chunks were
        // trimmed away by maxChunks.
        if (table)
            await this.deleteFileScopesForChunks(table, chunks);
        const reusedVectorDimensions = reusedRows[0]?.vector.length;
        if (reusedVectorDimensions) {
            const repaired = await this.ensureCompatibleProfile(reusedVectorDimensions);
            if (repaired) {
                // The table was dropped, so the reused vectors are gone: embed everything again.
                table = undefined;
                chunksToEmbed = selectedChunks;
                reusedRows = [];
            }
            else {
                table = await this.addRows(table, reusedRows);
            }
        }
        if (chunksToEmbed.length === 0)
            return;
        const batches = chunkArray(chunksToEmbed, this.embeddingBatchSize);
        let completedChunks = 0;
        const startedAt = Date.now();
        for (let start = 0; start < batches.length; start += this.embeddingConcurrency) {
            const window = batches.slice(start, start + this.embeddingConcurrency);
            const settled = await Promise.allSettled(window.map((batch) => this.embedChunkBatch(batch, provider, generation)));
            for (let offset = 0; offset < settled.length; offset += 1) {
                const outcome = settled[offset];
                if (outcome.status !== "fulfilled")
                    continue;
                const rows = outcome.value;
                const vectorDimensions = rows[0]?.vector.length;
                // Batches whose first vector is empty are skipped without being written.
                if (!vectorDimensions)
                    continue;
                const repaired = await this.ensureCompatibleProfile(vectorDimensions);
                if (repaired)
                    table = undefined;
                table = await this.addRows(table, rows);
                completedChunks += rows.length;
                this.onProgress?.({
                    totalChunks: chunksToEmbed.length,
                    completedChunks,
                    batchChunks: rows.length,
                    batchIndex: start + offset + 1,
                    batchCount: batches.length,
                    elapsedMs: Date.now() - startedAt
                });
            }
            // Surface a failure only after persisting every batch that succeeded in this window, so
            // one transient error never discards (and forces re-embedding of) already-completed batches.
            const failure = settled.find((outcome) => outcome.status === "rejected");
            if (failure)
                throw failure.reason;
        }
    }
    /**
     * Embeds one batch of chunks (with retries) and returns the rows to store.
     *
     * Uses `provider.embedBatch` when available, otherwise one `provider.embed` call per chunk.
     *
     * @throws Error when the provider returns a different number of vectors than chunks.
     */
    async embedChunkBatch(chunks, provider, generation) {
        const texts = chunks.map((chunk) => renderChunkForEmbedding(chunk));
        const vectors = await retryEmbedding(() => provider.embedBatch ? provider.embedBatch(texts) : Promise.all(texts.map((text) => provider.embed(text))), {
            attempts: this.embeddingRetryAttempts,
            baseDelayMs: this.embeddingRetryBaseDelayMs
        });
        if (vectors.length !== chunks.length) {
            throw new Error(`Embedding provider returned ${vectors.length} vector(s), expected ${chunks.length}.`);
        }
        // Same row shape as reused rows: build it with the shared helper instead of a copy.
        return chunks.map((chunk, index) => rowForChunk(chunk, vectors[index], generation));
    }
    /** Deletes the stored rows of every distinct (projectId, filePath) in `chunks`, keeping the seed row. */
    async deleteFileScopesForChunks(table, chunks) {
        // JSON-encode the pair so the Set de-duplicates by value.
        const fileScopes = new Set(chunks.map((chunk) => JSON.stringify([chunk.projectId, chunk.filePath])));
        for (const fileScope of fileScopes) {
            const [projectId, filePath] = JSON.parse(fileScope);
            await table.delete(andPredicate(equalsPredicate("projectId", projectId), equalsPredicate("filePath", filePath), "id != '__seed__'"));
        }
    }
    /**
     * Splits `chunks` into those whose vector can be reused from the table and those that
     * still have to be embedded.
     *
     * @returns `{ chunksToEmbed, reusedRows }`; everything must be embedded when the table
     *   handle has no `query` method.
     */
    async planChunkEmbeddings(table, chunks, generation) {
        if (!table.query)
            return { chunksToEmbed: chunks, reusedRows: [] };
        const reusableVectors = await this.loadReusableVectors(table, chunks);
        const chunksToEmbed = [];
        const reusedRows = [];
        for (const chunk of chunks) {
            const vector = reusableVectors.get(reuseVectorKey(chunk.projectId, chunk.contentHash));
            if (vector)
                reusedRows.push(rowForChunk(chunk, vector, generation));
            else
                chunksToEmbed.push(chunk);
        }
        return { chunksToEmbed, reusedRows };
    }
    /**
     * Looks up already-stored vectors for the content hashes of `chunks`.
     *
     * @returns Map keyed by `reuseVectorKey(projectId, contentHash)` holding plain `number[]` vectors.
     */
    async loadReusableVectors(table, chunks) {
        const vectors = new Map();
        if (!table.query)
            return vectors;
        const hashesByProject = new Map();
        for (const chunk of chunks) {
            const existing = hashesByProject.get(chunk.projectId);
            if (existing)
                existing.add(chunk.contentHash);
            else
                hashesByProject.set(chunk.projectId, new Set([chunk.contentHash]));
        }
        for (const [projectId, hashSet] of hashesByProject) {
            // One batched IN-query per project (chunked to keep predicates bounded) replaces the
            // former one-round-trip-per-chunk lookup. A truncated batch only lowers the reuse hit
            // rate (those chunks get re-embedded) and never reuses a stale vector.
            for (const batch of chunkArray([...hashSet], REUSE_LOOKUP_BATCH)) {
                const rows = await table.query()
                    .where(andPredicate(equalsPredicate("projectId", projectId), inPredicate("contentHash", batch)))
                    .limit(batch.length * REUSE_LOOKUP_ROW_MULTIPLIER)
                    .toArray();
                for (const reusable of rows) {
                    const key = reuseVectorKey(projectId, reusable.contentHash);
                    // A vector read back from LanceDB is an Arrow Vector, not a plain number[]; re-adding it
                    // verbatim makes the next table.add() fail Arrow schema inference ("vector.isValid").
                    // Materialize a plain number[] so reused rows write back cleanly on real LanceDB.
                    if (!vectors.has(key))
                        vectors.set(key, Array.from(reusable.vector));
                }
            }
        }
        return vectors;
    }
    /**
     * Embeds `query.query` and returns the nearest stored chunks.
     *
     * @returns Matches with `score = 1 / (1 + distance)`; an empty array when the table is
     *   missing or was just dropped because of a profile mismatch.
     */
    async search(query, provider) {
        const vector = await retryEmbedding(() => provider.embed(query.query), {
            attempts: this.embeddingRetryAttempts,
            baseDelayMs: this.embeddingRetryBaseDelayMs
        });
        const repaired = await this.ensureCompatibleProfile(vector.length);
        if (repaired)
            return [];
        const table = await this.getExistingTable(vector.length);
        if (!table)
            return [];
        const predicate = searchPredicate(query);
        const rows = await table
            .search(vector)
            .where(predicate)
            .limit(query.limit ?? 20)
            .toArray();
        // The placeholder seed row is never a real result.
        return rows.filter((row) => row.id !== "__seed__").map((row) => ({
            chunk: {
                id: row.id,
                projectId: row.projectId,
                repoRoot: row.repoRoot,
                filePath: row.filePath,
                language: row.language,
                kind: row.kind,
                // Empty symbol names are stored as "" and surfaced as undefined.
                symbolName: row.symbolName || undefined,
                startLine: row.startLine,
                endLine: row.endLine,
                content: row.content,
                contentHash: row.contentHash
            },
            score: 1 / (1 + (row._distance ?? 0)),
            source: "semantic",
            reason: "LanceDB vector similarity match"
        }));
    }
    /**
     * Stores `pending` as the cached table promise and forgets it again if it rejects, so a
     * transient open/create failure is retried on the next call instead of being replayed forever.
     */
    cacheTablePromise(pending) {
        this.tablePromise = pending;
        pending.catch(() => {
            // Only clear our own entry; a repair may already have replaced or reset the cache.
            if (this.tablePromise === pending)
                this.tablePromise = undefined;
        });
        return pending;
    }
    /**
     * Returns the cached table, opening or creating it on first use.
     *
     * NOTE(review): when a table promise is already cached, `seedRows` are not written by this
     * method — callers appear to rely on the cache being cleared whenever they pass rows; confirm.
     */
    async getTable(vectorDimensions, seedRows) {
        if (!this.tablePromise) {
            this.cacheTablePromise(this.openOrCreateTable(vectorDimensions, seedRows));
        }
        return this.tablePromise;
    }
    /**
     * Opens the table only if it already exists and validates its schema.
     *
     * @param vectorDimensions Expected vector width passed to the schema check.
     * @param options `repair` overrides `repairOnMismatch` for this call.
     * @returns The table, or `undefined` when it does not exist or was dropped for repair.
     * @throws Error on schema mismatch when repair is disabled.
     */
    async getExistingTable(vectorDimensions, options = {}) {
        const db = await this.getConnection();
        const names = await db.tableNames();
        if (!names.includes(this.tableName))
            return undefined;
        if (!this.tablePromise)
            this.cacheTablePromise(Promise.resolve(db.openTable(this.tableName)));
        const table = await this.tablePromise;
        const problems = await tableSchemaProblems(table, vectorDimensions);
        if (problems.length === 0)
            return table;
        const repair = options.repair ?? this.repairOnMismatch;
        if (!repair)
            throw new Error(`LanceDB table "${this.tableName}" schema mismatch: ${problems.join("; ")}.`);
        await this.dropTableForRepair(db, problems);
        return undefined;
    }
    /**
     * Appends `rows` to `table`, creating the table from the rows when `table` is undefined.
     *
     * @returns The table the rows were written to.
     * @throws Error when the first row has no (or an empty) vector.
     */
    async addRows(table, rows) {
        const vectorDimensions = rows[0]?.vector.length;
        if (!vectorDimensions)
            throw new Error("Cannot write LanceDB rows without vector dimensions.");
        if (!table)
            return this.getTable(vectorDimensions, rows);
        // Real data is arriving, so the placeholder row is no longer needed.
        await deleteSeedRecord(table);
        await table.add(rows);
        return table;
    }
    /**
     * Opens the table, or creates it from `seedRows` (or a placeholder seed row when none are
     * given). An existing table with schema problems is dropped and re-created.
     */
    async openOrCreateTable(vectorDimensions, seedRows) {
        const db = await this.getConnection();
        const names = await db.tableNames();
        if (names.includes(this.tableName)) {
            const table = await db.openTable(this.tableName);
            const problems = await tableSchemaProblems(table, vectorDimensions);
            if (problems.length > 0) {
                await this.dropTableForRepair(db, problems);
                if (seedRows?.length) {
                    return db.createTable(this.tableName, seedRows);
                }
                const seedRow = emptySeedRecord(vectorDimensions);
                const created = await db.createTable(this.tableName, [seedRow]);
                return created;
            }
            if (seedRows?.length) {
                await deleteSeedRecord(table);
                await table.add(seedRows);
            }
            return table;
        }
        if (seedRows?.length) {
            return db.createTable(this.tableName, seedRows);
        }
        const seedRow = emptySeedRecord(vectorDimensions);
        return db.createTable(this.tableName, [seedRow]);
    }
    /**
     * Returns the LanceDB connection, connecting once and reusing the result afterwards so
     * repeated table lookups do not open a new connection per call.
     */
    async getConnection() {
        if (!this.connection) {
            const lance = this.module ?? await loadLanceDb();
            // Assigned only after a successful connect, so a failed attempt is retried next time.
            this.connection = await lance.connect(this.uri);
        }
        return this.connection;
    }
    /**
     * Makes sure the stored embedding profile matches the current configuration.
     *
     * Writes the profile when none exists. On mismatch either throws (repair disabled) or
     * drops the table and stores the new profile.
     *
     * @returns `true` only when the table was dropped for repair.
     */
    async ensureCompatibleProfile(dimensions) {
        const expected = this.expectedProfile(dimensions);
        const existing = await this.profileStore.read();
        if (!existing) {
            await this.profileStore.write(expected);
            return false;
        }
        const mismatches = profileMismatches(existing, expected);
        if (mismatches.length > 0) {
            if (!this.repairOnMismatch) {
                throw new Error(`LanceDB embedding profile mismatch for table "${this.tableName}": ${mismatches.join("; ")}. Re-index into a new LanceDB URI/table or clear the existing semantic table/profile.`);
            }
            await this.dropTableForRepair(await this.getConnection(), mismatches);
            await this.profileStore.write(expected);
            return true;
        }
        return false;
    }
    /**
     * Drops the table and clears the cached handle.
     *
     * @throws Error when the connection has no `dropTable` method.
     */
    async dropTableForRepair(db, reasons) {
        if (!db.dropTable) {
            throw new Error(`LanceDB table "${this.tableName}" requires repair but the current connection cannot drop/recreate tables: ${reasons.join("; ")}.`);
        }
        this.tablePromise = undefined;
        await db.dropTable(this.tableName);
    }
    /** Builds the profile the current configuration would produce for vectors of `dimensions`. */
    expectedProfile(dimensions) {
        const now = Date.now();
        return {
            schemaVersion: 1,
            tableName: this.tableName,
            provider: this.embeddingProfile.provider,
            model: this.embeddingProfile.model,
            baseUrl: this.embeddingProfile.baseUrl,
            requestDimensions: this.embeddingProfile.requestDimensions,
            dimensions,
            createdAtMs: now,
            updatedAtMs: now
        };
    }
}
|
|
349
|
+
/**
 * Builds the table row for one chunk.
 *
 * @param chunk Source chunk; a missing `symbolName` is stored as the empty string.
 * @param vector Embedding vector stored with the row.
 * @param generation Generation number stored with the row.
 * @returns A flat row object carrying the chunk fields plus `generation` and `vector`.
 */
function rowForChunk(chunk, vector, generation) {
    const { id, projectId, repoRoot, filePath, language, kind, startLine, endLine, content, contentHash } = chunk;
    const symbolName = chunk.symbolName ?? "";
    return {
        id,
        projectId,
        repoRoot,
        filePath,
        language,
        kind,
        symbolName,
        startLine,
        endLine,
        content,
        contentHash,
        generation,
        vector
    };
}
|
|
366
|
+
/**
 * Runs `operation`, retrying retryable failures with exponential backoff.
 *
 * @param operation Async function to call; its result is returned on the first success.
 * @param options `attempts` (maximum number of calls) and `baseDelayMs` (delay before the
 *   second call; doubled before each further call).
 * @throws The last error once attempts are exhausted or the error is not retryable.
 */
async function retryEmbedding(operation, options) {
    const { attempts, baseDelayMs } = options;
    let failure;
    let attempt = 1;
    while (attempt <= attempts) {
        try {
            return await operation();
        }
        catch (error) {
            failure = error;
            const exhausted = attempt >= attempts;
            if (exhausted || !isRetryableEmbeddingError(error))
                break;
            // Backoff: baseDelayMs, 2x, 4x, ...
            await sleep(baseDelayMs * 2 ** (attempt - 1));
        }
        attempt += 1;
    }
    throw failure;
}
|
|
381
|
+
/**
 * Decides whether an embedding failure is worth retrying.
 *
 * An error is retryable when its `status` is 408, 429 or >= 500, or when its `code`,
 * `cause.code` or `message` mentions a rate limit, a timeout, a temporary condition, a
 * connection reset, or a standalone 429/5xx number.
 *
 * @param error Anything caught from the embedding call; non-objects are never retryable.
 * @returns `true` when the failure looks transient.
 */
function isRetryableEmbeddingError(error) {
    if (!error || typeof error !== "object")
        return false;
    const candidate = error;
    if (candidate.status === 429 || candidate.status === 408 || (candidate.status !== undefined && candidate.status >= 500))
        return true;
    // Node/undici network failures surface the code on error.cause, not the top-level error.
    const text = `${candidate.code ?? ""} ${candidate.cause?.code ?? ""} ${candidate.message ?? ""}`.toLowerCase();
    // "rate" must start a word (so "generate"/"separate" do not count, while "rate limit" and
    // "err_rate_limit" do), and 429/5xx must be a whole number (so "8500 tokens" does not count).
    return /(?<![a-z])rate|timeout|temporar|econnreset|etimedout|(?<!\d)(?:429|5\d\d)(?!\d)/.test(text);
}
|
|
391
|
+
/**
 * Returns a promise that resolves (with no value) after `ms` milliseconds.
 */
function sleep(ms) {
    return new Promise((done) => {
        setTimeout(done, ms);
    });
}
|
|
394
|
+
/**
 * Trims `chunks` to at most `maxChunks` entries, preferring higher-priority chunk kinds.
 *
 * @param chunks Candidate chunks in their original order.
 * @param maxChunks Upper bound; `undefined` disables trimming.
 * @returns The input array itself when it already fits, otherwise a new array holding the
 *   best-ranked chunks (ties keep the earlier chunk) in their original relative order.
 */
function selectChunksForEmbedding(chunks, maxChunks) {
    const fitsBudget = maxChunks === undefined || chunks.length <= maxChunks;
    if (fitsBudget)
        return chunks;
    const ranked = chunks.map((chunk, index) => ({ chunk, index, priority: semanticChunkPriority(chunk) }));
    // Lowest priority number first; the original position breaks ties.
    ranked.sort((left, right) => left.priority - right.priority || left.index - right.index);
    const kept = ranked.slice(0, maxChunks);
    // Restore source order for the survivors.
    kept.sort((left, right) => left.index - right.index);
    return kept.map((entry) => entry.chunk);
}
|
|
404
|
+
/** Rank per chunk kind; a lower number is kept first when chunks have to be trimmed. */
const SEMANTIC_CHUNK_PRIORITY = new Map([
    ["function", 0],
    ["method", 0],
    ["class", 1],
    ["type", 1],
    ["file", 2],
    ["block", 3],
    ["variable", 4]
]);
/**
 * Returns the trimming priority of a chunk based on its `kind`.
 *
 * @returns 0 for functions/methods, 1 for classes/types, 2 for files, 3 for blocks,
 *   4 for variables and 5 for any other kind.
 */
function semanticChunkPriority(chunk) {
    return SEMANTIC_CHUNK_PRIORITY.get(chunk.kind) ?? 5;
}
|
|
422
|
+
/**
 * Splits `items` into consecutive batches of at most `size` elements.
 *
 * @param items Array to split; it is not modified.
 * @param size Batch length; must be a positive integer.
 * @returns The batches in order (empty when `items` is empty); the last one may be shorter.
 * @throws RangeError when `size` is not a positive integer. Without this guard a size of 0
 *   or a negative size would never advance the index and loop forever.
 */
function chunkArray(items, size) {
    if (!Number.isInteger(size) || size <= 0)
        throw new RangeError(`chunkArray size must be a positive integer, got ${size}`);
    const batches = [];
    for (let index = 0; index < items.length; index += size) {
        batches.push(items.slice(index, index + size));
    }
    return batches;
}
|
|
429
|
+
/**
 * Validates an optional positive-integer option.
 *
 * @param value Supplied option value, or `undefined` when the option was omitted.
 * @param fallback Value returned when `value` is `undefined`.
 * @returns `value` when it is an integer greater than zero, otherwise `fallback` for `undefined`.
 * @throws Error for any other value.
 */
function positiveInteger(value, fallback) {
    if (value === undefined)
        return fallback;
    const isValid = Number.isInteger(value) && value > 0;
    if (isValid)
        return value;
    throw new Error(`Invalid positive integer: ${value}`);
}
|
|
436
|
+
/**
 * Lists the differences between a stored embedding profile and the expected one.
 *
 * `model` and `baseUrl` treat a missing value and the empty string as equal;
 * `requestDimensions` is compared only by truthiness.
 *
 * @returns One human-readable "field stored != expected" message per differing field, in the
 *   order schemaVersion, tableName, provider, model, baseUrl, requestDimensions, dimensions.
 */
function profileMismatches(existing, expected) {
    const orUnset = (value) => value ?? "<unset>";
    const storedRequestsDimensions = Boolean(existing.requestDimensions);
    const expectedRequestsDimensions = Boolean(expected.requestDimensions);
    // Each entry: [label, differs, stored value as shown, expected value as shown].
    const comparisons = [
        ["schemaVersion", existing.schemaVersion !== expected.schemaVersion, existing.schemaVersion, expected.schemaVersion],
        ["tableName", existing.tableName !== expected.tableName, existing.tableName, expected.tableName],
        ["provider", existing.provider !== expected.provider, existing.provider, expected.provider],
        ["model", (existing.model ?? "") !== (expected.model ?? ""), orUnset(existing.model), orUnset(expected.model)],
        ["baseUrl", (existing.baseUrl ?? "") !== (expected.baseUrl ?? ""), orUnset(existing.baseUrl), orUnset(expected.baseUrl)],
        ["requestDimensions", storedRequestsDimensions !== expectedRequestsDimensions, storedRequestsDimensions, expectedRequestsDimensions],
        ["dimensions", existing.dimensions !== expected.dimensions, existing.dimensions, expected.dimensions]
    ];
    return comparisons
        .filter(([, differs]) => differs)
        .map(([label, , stored, wanted]) => `${label} ${stored} != ${wanted}`);
}
|
|
454
|
+
/**
 * Picks the profile store used when the caller does not supply one.
 *
 * @param uri LanceDB URI; one that starts with a `scheme://` prefix gets an in-memory store,
 *   anything else is treated as a directory path.
 * @param tableName Table the profile belongs to.
 * @returns A memory store keyed by `uri::tableName`, or a file store at
 *   `<uri>/<tableName>.embedding-profile.json`.
 */
function defaultProfileStore(uri, tableName) {
    const hasUrlScheme = /^[a-z][a-z0-9+.-]*:\/\//i.test(uri);
    return hasUrlScheme
        ? new MemoryLanceProfileStore(`${uri}::${tableName}`)
        : new FileLanceProfileStore(path.join(uri, `${tableName}.embedding-profile.json`));
}
|
|
460
|
+
/** Module-wide profile registry shared by every MemoryLanceProfileStore, keyed by store key. */
const memoryProfiles = new Map();
/**
 * Profile store that keeps embedding profiles in the module-level `memoryProfiles` map.
 * Stores created with the same key see the same profile.
 */
class MemoryLanceProfileStore {
    key;
    constructor(key) {
        this.key = key;
    }
    /** Returns the profile saved under this store's key, or `undefined` when there is none. */
    async read() {
        return memoryProfiles.get(this.key);
    }
    /** Saves `profile`, keeping the `createdAtMs` of a previously saved profile if present. */
    async write(profile) {
        const previous = memoryProfiles.get(this.key);
        const createdAtMs = previous?.createdAtMs ?? profile.createdAtMs;
        memoryProfiles.set(this.key, Object.assign({}, profile, { createdAtMs }));
    }
}
|
|
477
|
+
/**
 * Profile store that persists the embedding profile as a JSON file.
 */
class FileLanceProfileStore {
    filePath;
    /** @param filePath Location of the profile JSON file. */
    constructor(filePath) {
        this.filePath = filePath;
    }
    /**
     * Loads the stored profile.
     *
     * @returns The parsed profile, or `undefined` when the file cannot be read, is empty, or
     *   does not contain a JSON object. A damaged file is reported as "no profile" so that
     *   `write()` (which reads first) can still replace it instead of failing forever.
     */
    async read() {
        const content = await fs.readFile(this.filePath, "utf8").catch(() => undefined);
        if (!content)
            return undefined;
        try {
            const parsed = JSON.parse(content);
            return parsed !== null && typeof parsed === "object" ? parsed : undefined;
        }
        catch {
            // Truncated or otherwise invalid JSON: behave as if no profile had been saved.
            return undefined;
        }
    }
    /**
     * Saves `profile` as pretty-printed JSON, creating the parent directory when needed.
     * The `createdAtMs` of an existing profile is preserved.
     */
    async write(profile) {
        const existing = await this.read();
        await fs.mkdir(path.dirname(this.filePath), { recursive: true });
        await fs.writeFile(this.filePath, JSON.stringify({
            ...profile,
            createdAtMs: existing?.createdAtMs ?? profile.createdAtMs,
            updatedAtMs: profile.updatedAtMs
        }, null, 2));
    }
}
|
|
499
|
+
/**
 * Dynamically imports `@lancedb/lancedb`.
 *
 * @returns The imported module namespace.
 * @throws {Error} When the import fails; the message gives install guidance
 *   and includes the underlying failure text.
 */
async function loadLanceDb() {
    return import("@lancedb/lancedb").catch((cause) => {
        const detail = cause instanceof Error ? cause.message : String(cause);
        throw new Error(`LanceDB is not installed. Install @lancedb/lancedb to use LanceSemanticStore. Cause: ${detail}`);
    });
}
|
|
508
|
+
/**
 * Builds the placeholder row whose id is "__seed__", carrying a zero-filled
 * vector of the requested length.
 * NOTE(review): presumably inserted so a new table gets its schema, then
 * removed again by id — confirm at the call site.
 *
 * @param vectorDimensions Length of the zero vector.
 * @returns A record with every column populated by placeholder values.
 */
function emptySeedRecord(vectorDimensions) {
    const placeholder = "__seed__";
    const zeroVector = new Array(vectorDimensions).fill(0);
    return {
        id: placeholder,
        projectId: placeholder,
        repoRoot: placeholder,
        filePath: placeholder,
        language: "unknown",
        kind: "block",
        symbolName: "",
        startLine: 0,
        endLine: 0,
        content: "seed",
        contentHash: "seed",
        generation: 1,
        vector: zeroVector
    };
}
|
|
525
|
+
/**
 * Compares a table's schema with the columns listed in `requiredLanceFields`
 * and, when both are known, with the expected vector width.
 *
 * A table that has no `schema` member is not inspected and yields no problems.
 *
 * @param table Table handle; `table.schema()` is awaited when present.
 * @param vectorDimensions Expected vector length, or `undefined` to skip that check.
 * @returns Human-readable problem descriptions; empty when nothing was found.
 */
async function tableSchemaProblems(table, vectorDimensions) {
    if (!table.schema)
        return [];
    const schema = await table.schema();
    const columns = schema.fields ?? [];
    // Only string names count as present columns.
    const present = new Set();
    for (const column of columns) {
        const columnName = column.name;
        if (typeof columnName === "string")
            present.add(columnName);
    }
    const problems = requiredLanceFields
        .filter((required) => !present.has(required))
        .map((required) => `missing column ${required}`);
    const actualDimensions = vectorDimensionsFromField(columns.find((column) => column.name === "vector"));
    // The width is compared only when both the expected and the actual value are known.
    const comparable = vectorDimensions !== undefined && actualDimensions !== undefined;
    if (comparable && actualDimensions !== vectorDimensions) {
        problems.push(`vector dimensions ${actualDimensions} != ${vectorDimensions}`);
    }
    return problems;
}
|
|
543
|
+
/** Column names a table schema must contain to pass the schema check. */
const requiredLanceFields = [
    // identity and location
    "id", "projectId", "repoRoot", "filePath",
    // classification
    "language", "kind", "symbolName",
    // source span
    "startLine", "endLine",
    // payload
    "content", "contentHash", "generation", "vector"
];
|
|
558
|
+
/**
 * Extracts the vector length from a schema field descriptor.
 *
 * A numeric `vectorDimensions` property on the field wins. Otherwise the
 * field's `type` (or `dataType` when `type` is nullish) is passed to
 * `vectorDimensionsFromType`.
 *
 * @param field Schema field descriptor; anything that is not an object yields `undefined`.
 * @returns The vector length, or `undefined` when it cannot be determined.
 */
function vectorDimensionsFromField(field) {
    const isObject = Boolean(field) && typeof field === "object";
    if (!isObject)
        return undefined;
    const { vectorDimensions: declared } = field;
    return typeof declared === "number"
        ? declared
        : vectorDimensionsFromType(field.type ?? field.dataType);
}
|
|
567
|
+
/**
 * Best-effort extraction of a vector length from a type descriptor.
 *
 * - Object: the first numeric property among `listSize`, `fixedSize`,
 *   `dimension`, `dimensions`, `length` is returned. Failing that, the
 *   object's `toString()` output is parsed as a string.
 * - String: the first run of digits following `fixed_size_list`, `vector`,
 *   `float32` or `float64` (case-insensitive) is returned as a number.
 * - Anything else, including falsy input: `undefined`.
 *
 * @param type Type descriptor object or its textual form.
 * @returns The vector length, or `undefined` when none is recognised.
 */
function vectorDimensionsFromType(type) {
    if (!type)
        return undefined;
    switch (typeof type) {
        case "object": {
            const sizeKey = ["listSize", "fixedSize", "dimension", "dimensions", "length"]
                .find((key) => typeof type[key] === "number");
            if (sizeKey !== undefined)
                return type[sizeKey];
            // Fall back to the textual form of the descriptor, if it has one.
            return typeof type.toString === "function"
                ? vectorDimensionsFromType(type.toString())
                : undefined;
        }
        case "string": {
            const found = /(?:fixed_size_list|vector|float32|float64)[^0-9]*(\d+)/i.exec(type);
            return found ? Number(found[1]) : undefined;
        }
        default:
            return undefined;
    }
}
|
|
586
|
+
/**
 * Deletes every row whose `id` equals "__seed__" from the table.
 *
 * @param table Table handle exposing an async `delete(predicate)` method.
 */
async function deleteSeedRecord(table) {
    const seedRowFilter = equalsPredicate("id", "__seed__");
    await table.delete(seedRowFilter);
}
|
|
589
|
+
/**
 * Joins predicate fragments with ` AND `.
 * Fragments are inserted verbatim; no parentheses are added around them.
 *
 * @param predicates Predicate fragments to combine.
 * @returns The combined predicate; an empty string when no fragments are given.
 */
function andPredicate(...predicates) {
    const conjunction = " AND ";
    return predicates.join(conjunction);
}
|
|
592
|
+
// NOTE(review): presumably the number of values sent per reuse-lookup query — confirm at the call site.
const REUSE_LOOKUP_BATCH = 512;
// NOTE(review): presumably scales the row limit of a reuse lookup relative to the batch size — confirm at the call site.
const REUSE_LOOKUP_ROW_MULTIPLIER = 4;
|
|
594
|
+
/**
 * Builds a `column IN ('v1', 'v2', ...)` predicate with escaped string literals.
 *
 * An empty `values` array yields the always-false predicate `1 = 0`, and in
 * that case the column name is not validated.
 *
 * @param column Column name; validated by `identifier`.
 * @param values String values to match.
 * @returns The predicate text.
 * @throws {Error} When `column` is not a valid identifier and `values` is non-empty.
 */
function inPredicate(column, values) {
    if (values.length === 0)
        return "1 = 0";
    // Literals are quoted before the column is validated, matching the original evaluation order.
    const quoted = [];
    for (const value of values) {
        quoted.push(`'${escapeSqlLiteral(value)}'`);
    }
    const columnName = identifier(column);
    return `${columnName} IN (${quoted.join(", ")})`;
}
|
|
600
|
+
/**
 * Builds the composite string key for a (projectId, contentHash) pair.
 *
 * The two parts are separated by a NUL character. Plain concatenation is
 * ambiguous: ("ab", "c") and ("a", "bc") would both produce "abc" and could
 * be mistaken for the same entry.
 *
 * @param projectId Project identifier.
 * @param contentHash Content hash of the chunk.
 * @returns A key that is distinct for distinct pairs, provided neither part contains NUL.
 */
function reuseVectorKey(projectId, contentHash) {
    return `${projectId}\u0000${contentHash}`;
}
|
|
603
|
+
/**
 * Builds a `column = 'value'` predicate with the value escaped as a string literal.
 *
 * @param column Column name; validated by `identifier`.
 * @param value String value to compare against.
 * @returns The predicate text.
 * @throws {Error} When `column` is not a valid identifier.
 */
function equalsPredicate(column, value) {
    const columnName = identifier(column);
    const literal = escapeSqlLiteral(value);
    return `${columnName} = '${literal}'`;
}
|
|
606
|
+
/**
 * Validates a column name before it is interpolated into a predicate.
 * Only ASCII letters, digits and underscores are accepted, and the first
 * character must not be a digit.
 *
 * @param value Candidate identifier.
 * @returns `value` unchanged when it is valid.
 * @throws {Error} When `value` contains any other character or is empty.
 */
function identifier(value) {
    const isPlainIdentifier = /^[A-Za-z_][A-Za-z0-9_]*$/.test(value);
    if (isPlainIdentifier)
        return value;
    throw new Error(`Invalid LanceDB predicate identifier: ${value}`);
}
|
|
611
|
+
/**
 * Escapes a string for use inside a single-quoted SQL string literal.
 *
 * @param value Raw string value.
 * @returns `value` with every single quote doubled.
 */
function escapeSqlLiteral(value) {
    // LanceDB's query engine (DataFusion) uses standard SQL literal rules: a
    // backslash is an ordinary character and only the single quote needs
    // escaping, by doubling it. Backslashes must stay untouched — doubling them
    // turned Windows paths (repoRoot from path.resolve) into `\\`, which no
    // longer matched the single `\` that was stored.
    return value.replace(/'/g, "''");
}
|
|
617
|
+
/**
 * Builds the scoping predicate for a semantic search.
 * `projectId` is preferred; `repoRoot` is used only when `projectId` is falsy.
 *
 * @param query Search query carrying `projectId` and/or `repoRoot`.
 * @returns An equality predicate on the first scope field that is set.
 * @throws {Error} When neither `projectId` nor `repoRoot` is set.
 */
function searchPredicate(query) {
    // Order matters: projectId takes precedence over repoRoot.
    for (const column of ["projectId", "repoRoot"]) {
        const scope = query[column];
        if (scope)
            return equalsPredicate(column, scope);
    }
    throw new Error("Internal error: LanceDB semantic search requires a resolved projectId or repoRoot.");
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import type { EmbeddingProvider } from "../core/contracts.js";
|
|
2
|
+
/**
 * Options accepted by the `OpenAICompatibleEmbeddingProvider` constructor.
 * Only `apiKey` and `model` are required.
 */
export interface OpenAICompatibleEmbeddingProviderOptions {
    /** API key for the embedding service. */
    apiKey: string;
    /** Name of the embedding model. */
    model: string;
    /** Optional base URL of the service. NOTE(review): the default is not visible in this declaration. */
    baseUrl?: string;
    /** Optional embedding vector length. */
    dimensions?: number;
    /**
     * Optional flag. NOTE(review): presumably controls whether `dimensions`
     * is sent with each request — confirm in the implementation.
     */
    requestDimensions?: boolean;
    /** Optional replacement for the global `fetch` function. */
    fetch?: typeof fetch;
}
|
|
10
|
+
/**
 * `EmbeddingProvider` implementation configured through
 * `OpenAICompatibleEmbeddingProviderOptions`.
 */
export declare class OpenAICompatibleEmbeddingProvider implements EmbeddingProvider {
    private readonly options;
    /** Embedding vector length, when known. */
    readonly dimensions?: number;
    private readonly baseUrl;
    private readonly fetchImpl;
    /**
     * @param options Provider configuration.
     */
    constructor(options: OpenAICompatibleEmbeddingProviderOptions);
    /**
     * Embeds a single text.
     *
     * @param text Text to embed.
     * @returns A promise resolving to the embedding vector.
     */
    embed(text: string): Promise<number[]>;
    /**
     * Embeds several texts.
     *
     * @param texts Texts to embed.
     * @returns A promise resolving to an array of embedding vectors.
     *   NOTE(review): presumably one vector per input, in input order — confirm in the implementation.
     */
    embedBatch(texts: string[]): Promise<number[][]>;
    private requestEmbeddings;
}
|