@eclipse-lyra/extension-rag-system 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/api.d.ts +9 -0
- package/dist/api.d.ts.map +1 -0
- package/dist/api.js +90 -0
- package/dist/api.js.map +1 -0
- package/dist/chunkers/chunker-interface.d.ts +21 -0
- package/dist/chunkers/chunker-interface.d.ts.map +1 -0
- package/dist/chunkers/document-chunker.d.ts +12 -0
- package/dist/chunkers/document-chunker.d.ts.map +1 -0
- package/dist/chunkers/fallback-chunker.d.ts +10 -0
- package/dist/chunkers/fallback-chunker.d.ts.map +1 -0
- package/dist/chunkers/langchain-chunker.d.ts +12 -0
- package/dist/chunkers/langchain-chunker.d.ts.map +1 -0
- package/dist/document-index-service.d.ts +102 -0
- package/dist/document-index-service.d.ts.map +1 -0
- package/dist/embedding-service.d.ts +18 -0
- package/dist/embedding-service.d.ts.map +1 -0
- package/dist/extractors/document-extractor-interface.d.ts +26 -0
- package/dist/extractors/document-extractor-interface.d.ts.map +1 -0
- package/dist/extractors/document-extractor.d.ts +13 -0
- package/dist/extractors/document-extractor.d.ts.map +1 -0
- package/dist/extractors/llm-ocr-extractor.d.ts +16 -0
- package/dist/extractors/llm-ocr-extractor.d.ts.map +1 -0
- package/dist/extractors/pdfjs-extractor.d.ts +7 -0
- package/dist/extractors/pdfjs-extractor.d.ts.map +1 -0
- package/dist/i18n.json.d.ts +13 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +22 -0
- package/dist/index.js.map +1 -0
- package/dist/rag-integration-DO7-zvk2.js +191 -0
- package/dist/rag-integration-DO7-zvk2.js.map +1 -0
- package/dist/rag-integration.d.ts +6 -0
- package/dist/rag-integration.d.ts.map +1 -0
- package/dist/rag-service-BKBGCuO-.js +1872 -0
- package/dist/rag-service-BKBGCuO-.js.map +1 -0
- package/dist/rag-service.d.ts +25 -0
- package/dist/rag-service.d.ts.map +1 -0
- package/dist/rag-system-extension-DfD6H8Vr.js +1142 -0
- package/dist/rag-system-extension-DfD6H8Vr.js.map +1 -0
- package/dist/rag-system-extension.d.ts +2 -0
- package/dist/rag-system-extension.d.ts.map +1 -0
- package/dist/rag-system-manager.d.ts +41 -0
- package/dist/rag-system-manager.d.ts.map +1 -0
- package/dist/rxdb-loader.d.ts +9 -0
- package/dist/rxdb-loader.d.ts.map +1 -0
- package/dist/services/rag-result-formatter.d.ts +23 -0
- package/dist/services/rag-result-formatter.d.ts.map +1 -0
- package/dist/services/relevance-calculator.d.ts +7 -0
- package/dist/services/relevance-calculator.d.ts.map +1 -0
- package/dist/utils/constants.d.ts +35 -0
- package/dist/utils/constants.d.ts.map +1 -0
- package/dist/utils/context-scopes.d.ts +39 -0
- package/dist/utils/context-scopes.d.ts.map +1 -0
- package/dist/utils/query-utils.d.ts +7 -0
- package/dist/utils/query-utils.d.ts.map +1 -0
- package/dist/utils/snippet-extractor.d.ts +22 -0
- package/dist/utils/snippet-extractor.d.ts.map +1 -0
- package/dist/utils/workspace-utils.d.ts +8 -0
- package/dist/utils/workspace-utils.d.ts.map +1 -0
- package/dist/vector-utils.d.ts +28 -0
- package/dist/vector-utils.d.ts.map +1 -0
- package/package.json +39 -0
|
@@ -0,0 +1,1872 @@
|
|
|
1
|
+
import { createLogger, toastWarning, subscribe, TOPIC_WORKSPACE_CONNECTED, TOPIC_WORKSPACE_CHANGED, FileContentType, File, Directory, workspaceService, rootContext } from "@eclipse-lyra/core";
|
|
2
|
+
import { MLModel, inBrowserMLService, MLTask } from "@eclipse-lyra/extension-in-browser-ml/api";
|
|
3
|
+
import { aiService } from "@eclipse-lyra/extension-ai-system/api";
|
|
4
|
+
// Module-level logger used by EmbeddingService for its info/warn/error messages.
const logger$8 = createLogger("EmbeddingService");
|
|
5
|
+
/**
 * Produces text embeddings with the in-browser feature-extraction pipeline.
 *
 * The pipeline is requested lazily from `inBrowserMLService`; the pending
 * promise is cached on `pipePromise` so repeated calls share a single load.
 */
class EmbeddingService {
  constructor() {
    this.modelName = MLModel.FEATURE_EXTRACTION;
    // Expected vector length; generateEmbedding only warns when it differs.
    this.EMBEDDING_DIMENSION = 384;
    this.DEFAULT_OPTIONS = {
      pooling: "mean",
      normalize: true
    };
  }

  /**
   * Loads the feature-extraction pipeline unless a load is already cached.
   * A cached load that failed is discarded and retried.
   *
   * @returns {Promise<void>}
   * @throws {Error} "Embedding service initialization failed: …" with the
   *   underlying error attached as `cause`.
   */
  async initialize() {
    if (this.pipePromise) {
      try {
        await this.pipePromise;
        return;
      } catch (error) {
        logger$8.warn("Previous initialization failed, retrying...");
        this.pipePromise = void 0;
      }
    }
    logger$8.info(`Initializing embedding service with model: ${this.modelName}`);
    try {
      this.pipePromise = inBrowserMLService.getPipeline(
        MLTask.FEATURE_EXTRACTION,
        this.modelName,
        { quantized: false }
      );
      await this.pipePromise;
      logger$8.info("Embedding service initialized successfully");
    } catch (error) {
      // Reset first so nothing below can leave a rejected promise cached.
      this.pipePromise = void 0;
      const errorMessage = error?.message || String(error);
      let errorDetails = "";
      try {
        // JSON.stringify throws on circular structures and yields "{}" for
        // plain Error instances; neither may mask the original failure.
        const serialized = error ? JSON.stringify(error) : "";
        errorDetails = serialized && serialized !== "{}" ? serialized : "";
      } catch {
        errorDetails = "";
      }
      logger$8.error(`Failed to initialize embedding service: ${errorMessage}${errorDetails ? ` - ${errorDetails}` : ""}`);
      throw new Error(`Embedding service initialization failed: ${errorMessage}`, { cause: error });
    }
  }

  /**
   * Embeds a single text.
   *
   * @param {string} text - Text handed to the pipeline.
   * @param {{pooling?: string, normalize?: boolean}} [options] - Overrides
   *   for DEFAULT_OPTIONS.
   * @returns {Promise<number[]>} The pipeline output's `data` as a plain array.
   * @throws Re-throws any pipeline error after logging it.
   */
  async generateEmbedding(text, options = {}) {
    if (!this.pipePromise) {
      await this.initialize();
    }
    const pipe = await this.pipePromise;
    const opts = { ...this.DEFAULT_OPTIONS, ...options };
    try {
      const output = await pipe(text, {
        pooling: opts.pooling,
        normalize: opts.normalize
      });
      const embedding = Array.from(output.data);
      if (embedding.length !== this.EMBEDDING_DIMENSION) {
        logger$8.warn(`Unexpected embedding dimension: ${embedding.length}, expected ${this.EMBEDDING_DIMENSION}`);
      }
      return embedding;
    } catch (error) {
      logger$8.error(`Failed to generate embedding: ${error}`);
      throw error;
    }
  }

  /**
   * Embeds several texts one after another, preserving input order.
   *
   * @param {Iterable<string>} texts
   * @param {{pooling?: string, normalize?: boolean}} [options]
   * @returns {Promise<number[][]>}
   */
  async generateEmbeddings(texts, options = {}) {
    const embeddings = [];
    for (const text of texts) {
      const embedding = await this.generateEmbedding(text, options);
      embeddings.push(embedding);
    }
    return embeddings;
  }

  /** @returns {number} The expected embedding length (384). */
  getEmbeddingDimension() {
    return this.EMBEDDING_DIMENSION;
  }

  /** @returns {*} The model identifier passed to the pipeline loader. */
  getModelName() {
    return this.modelName;
  }
}
|
|
77
|
+
// Shared module-level EmbeddingService instance; its pipeline is only loaded on first use.
const embeddingService = new EmbeddingService();
|
|
78
|
+
// Character budgets for text excerpts. CONTEXT is the number of characters of
// a neighbouring chunk quoted by the chunkers' getChunkContext.
const SNIPPET_LENGTHS = { PREVIEW: 500, CONTEXT: 150 };

// Scoring weights; presumably consumed by the relevance calculator — the
// consumer is not visible in this part of the file.
const RELEVANCE_WEIGHTS = {
  FILE_NAME_MATCH: 10,
  FILE_PATH_MATCH: 5,
  CONTENT_MATCH: 1,
  FILE_NAME_EXACT: 20,
  FILE_PATH_EXACT: 10,
  EXACT_PHRASE: 5,
  TERM_COVERAGE: 15
};

// Result-count limits for searches.
const SEARCH_CONFIG = { DEFAULT_LIMIT: 5, MAX_LIMIT: 20 };

// Number of sample (reference) vectors; equals INDEX_FIELD_NAMES.length.
const VECTOR_SEARCH_CONFIG = { SAMPLE_VECTOR_COUNT: 5 };

// One index field per sample vector; calculateIndexValues stores the distance
// to sample vector i under INDEX_FIELD_NAMES[i].
const INDEX_FIELD_NAMES = ["idx0", "idx1", "idx2", "idx3", "idx4"];

const CONTENT_PREVIEW_LENGTHS = { LONG: 1e3 };
|
|
108
|
+
/**
 * Straight-line (L2) distance between two equally sized vectors.
 *
 * @param {ArrayLike<number>} vec1
 * @param {ArrayLike<number>} vec2
 * @returns {number} Square root of the summed squared component differences.
 * @throws {Error} When the two lengths differ.
 */
function euclideanDistance(vec1, vec2) {
  const size = vec1.length;
  if (size !== vec2.length) {
    throw new Error(`Vector dimensions must match: ${vec1.length} vs ${vec2.length}`);
  }
  // Accumulate in ascending index order so floating-point results are stable.
  let squaredTotal = 0;
  let position = 0;
  while (position < size) {
    const delta = vec1[position] - vec2[position];
    squaredTotal += delta * delta;
    position += 1;
  }
  return Math.sqrt(squaredTotal);
}
|
|
119
|
+
/**
 * Builds `count` sample vectors, each tagged with its position as `idx`.
 *
 * The first `existingEmbeddings.length` samples reuse the given embeddings
 * (by reference, not copied); any remaining samples are filled with
 * `dimension` random components in the range [-1, 1).
 *
 * @param {number} count - Number of samples to produce.
 * @param {number} dimension - Length of each randomly generated vector.
 * @param {number[][]} [existingEmbeddings] - Embeddings to reuse first.
 * @returns {{vector: number[], idx: number}[]}
 */
function generateSampleVectors(count, dimension, existingEmbeddings) {
  // Single home for the random fill that the original repeated per branch.
  const randomVector = () => {
    const vector = [];
    for (let j = 0; j < dimension; j++) {
      vector.push(Math.random() * 2 - 1);
    }
    return vector;
  };
  const reusable = existingEmbeddings ? existingEmbeddings.length : 0;
  const samples = [];
  for (let i = 0; i < count; i++) {
    samples.push({
      vector: i < reusable ? existingEmbeddings[i] : randomVector(),
      idx: i
    });
  }
  return samples;
}
|
|
153
|
+
/**
 * Computes one index value per index field: the Euclidean distance from
 * `embedding` to the sample vector at the same position.
 *
 * @param {number[]} embedding
 * @param {{vector: number[]}[]} sampleVectors - Must contain exactly one
 *   entry per name in INDEX_FIELD_NAMES.
 * @returns {Object<string, number>} Distances keyed by index field name.
 * @throws {Error} When the sample count differs from the index field count.
 */
function calculateIndexValues(embedding, sampleVectors) {
  if (sampleVectors.length !== INDEX_FIELD_NAMES.length) {
    throw new Error(
      `Sample vectors count (${sampleVectors.length}) must match index field count (${INDEX_FIELD_NAMES.length})`
    );
  }
  const pairs = INDEX_FIELD_NAMES.map((fieldName, position) => [
    fieldName,
    euclideanDistance(embedding, sampleVectors[position].vector)
  ]);
  return Object.fromEntries(pairs);
}
|
|
165
|
+
/**
 * Flattens an index-values object into an array ordered like
 * INDEX_FIELD_NAMES. Missing fields come out as `undefined`.
 *
 * @param {Object<string, number>} indexValues
 * @returns {number[]}
 */
function getIndexValueArray(indexValues) {
  const ordered = [];
  for (const fieldName of INDEX_FIELD_NAMES) {
    ordered.push(indexValues[fieldName]);
  }
  return ordered;
}
|
|
168
|
+
/**
 * Inverse of getIndexValueArray: pairs each value with the index field name
 * at the same position.
 *
 * @param {number[]} values - Exactly one value per index field.
 * @returns {Object<string, number>}
 * @throws {Error} When the array length differs from the index field count.
 */
function createIndexValuesFromArray(values) {
  const expected = INDEX_FIELD_NAMES.length;
  if (values.length !== expected) {
    throw new Error(
      `Values array length (${values.length}) must match index field count (${INDEX_FIELD_NAMES.length})`
    );
  }
  return Object.fromEntries(
    INDEX_FIELD_NAMES.map((fieldName, position) => [fieldName, values[position]])
  );
}
|
|
180
|
+
/**
 * Cosine of the angle between two equally sized vectors.
 *
 * @param {ArrayLike<number>} vec1
 * @param {ArrayLike<number>} vec2
 * @returns {number} dot(vec1, vec2) / (|vec1| * |vec2|), or 0 when either
 *   vector has zero magnitude.
 * @throws {Error} When the two lengths differ.
 */
function cosineSimilarity(vec1, vec2) {
  if (vec1.length !== vec2.length) {
    throw new Error(`Vector dimensions must match: ${vec1.length} vs ${vec2.length}`);
  }
  let dot = 0;
  let squaredA = 0;
  let squaredB = 0;
  let position = 0;
  while (position < vec1.length) {
    const a = vec1[position];
    const b = vec2[position];
    dot += a * b;
    squaredA += a * a;
    squaredB += b * b;
    position += 1;
  }
  const scale = Math.sqrt(squaredA) * Math.sqrt(squaredB);
  // A zero-length vector has no direction; report "no similarity".
  return scale === 0 ? 0 : dot / scale;
}
|
|
198
|
+
// Defaults for FallbackChunker; all three are measured in characters.
const DEFAULT_CHUNK_SIZE$1 = 500;
const DEFAULT_CHUNK_OVERLAP$1 = 50;
const DEFAULT_MIN_CHUNK_SIZE = 100;

/**
 * Dependency-free chunker that slices a document into fixed-size,
 * overlapping character windows.
 */
class FallbackChunker {
  /**
   * @param {{chunkSize?: number, chunkOverlap?: number, minChunkSize?: number}} [options]
   */
  constructor(options = {}) {
    this.chunkSize = options.chunkSize ?? DEFAULT_CHUNK_SIZE$1;
    this.chunkOverlap = options.chunkOverlap ?? DEFAULT_CHUNK_OVERLAP$1;
    this.minChunkSize = options.minChunkSize ?? DEFAULT_MIN_CHUNK_SIZE;
  }

  /**
   * Splits `content` into chunks. The first chunk's text is prefixed with
   * the file name; offsets always refer to `content` itself.
   *
   * @param {string} documentId - Used to build chunk ids (`<id>:chunk:<n>`).
   * @param {string} content
   * @param {string} fileName
   * @returns {{id: string, documentId: string, chunkIndex: number, text: string, startOffset: number, endOffset: number}[]}
   */
  chunkDocument(documentId, content, fileName) {
    if (content.length <= this.chunkSize) {
      const text = `${fileName} ${content}`;
      return [{
        id: `${documentId}:chunk:0`,
        documentId,
        chunkIndex: 0,
        text,
        startOffset: 0,
        endOffset: content.length
      }];
    }
    const chunks = [];
    // An overlap >= chunk size would give a non-positive step and loop
    // forever; in that case advance by a whole chunk (no overlap) instead.
    const rawStep = this.chunkSize - this.chunkOverlap;
    const step = rawStep > 0 ? rawStep : Math.max(1, this.chunkSize);
    let offset = 0;
    let chunkIndex = 0;
    while (offset < content.length) {
      const endOffset = Math.min(offset + this.chunkSize, content.length);
      const chunkText = content.substring(offset, endOffset);
      if (chunkText.trim().length < this.minChunkSize && chunks.length > 0) {
        // The tail is too small for its own chunk. Fold whatever the
        // previous chunk did not already cover into it, so no text is lost.
        const last = chunks[chunks.length - 1];
        if (last.endOffset < content.length) {
          last.text += content.substring(last.endOffset);
          last.endOffset = content.length;
        }
        break;
      }
      const text = chunkIndex === 0 ? `${fileName} ${chunkText}` : chunkText;
      chunks.push({
        id: `${documentId}:chunk:${chunkIndex}`,
        documentId,
        chunkIndex,
        text,
        startOffset: offset,
        endOffset
      });
      offset += step;
      chunkIndex++;
    }
    return chunks;
  }

  /**
   * Renders a chunk together with excerpts of its neighbours: the last
   * SNIPPET_LENGTHS.CONTEXT characters of the previous chunk and the first
   * SNIPPET_LENGTHS.CONTEXT characters of the next one.
   *
   * @param {{chunkIndex: number, text: string}} chunk
   * @param {{text: string}[]} allChunks - Indexed by chunkIndex.
   * @returns {string}
   */
  getChunkContext(chunk, allChunks) {
    const prevChunk = chunk.chunkIndex > 0 ? allChunks[chunk.chunkIndex - 1] : null;
    const nextChunk = chunk.chunkIndex < allChunks.length - 1 ? allChunks[chunk.chunkIndex + 1] : null;
    let context = "";
    if (prevChunk) {
      context += `[Previous: ${prevChunk.text.substring(Math.max(0, prevChunk.text.length - SNIPPET_LENGTHS.CONTEXT))}]\n\n`;
    }
    context += chunk.text;
    if (nextChunk) {
      context += `\n\n[Next: ${nextChunk.text.substring(0, SNIPPET_LENGTHS.CONTEXT)}]`;
    }
    return context;
  }
}
|
|
261
|
+
// Module-level logger used by LangChainChunker.
const logger$7 = createLogger("LangChainChunker");
|
|
262
|
+
// Default LangChainChunker chunk size. The splitter measures length with countWords, so this is in words.
const DEFAULT_CHUNK_SIZE = 500;
|
|
263
|
+
// Default LangChainChunker overlap between consecutive chunks, in the same unit as DEFAULT_CHUNK_SIZE.
const DEFAULT_CHUNK_OVERLAP = 75;
|
|
264
|
+
/**
 * Counts whitespace-separated words; used as the text splitter's length
 * function.
 *
 * @param {string} text
 * @returns {number} 0 for empty, blank or missing text.
 */
function countWords(text) {
  if (!text) {
    return 0;
  }
  const trimmed = text.trim();
  if (trimmed.length === 0) {
    return 0;
  }
  const tokens = trimmed.split(/\s+/);
  return tokens.filter(Boolean).length;
}
|
|
270
|
+
/**
 * Chunker backed by LangChain's RecursiveCharacterTextSplitter. The splitter
 * is imported lazily and sized in words (see `lengthFunction`).
 */
class LangChainChunker {
  /**
   * @param {{chunkSize?: number, chunkOverlap?: number}} [options]
   */
  constructor(options = {}) {
    this.textSplitter = null;
    this.chunkSize = options.chunkSize ?? DEFAULT_CHUNK_SIZE;
    this.chunkOverlap = options.chunkOverlap ?? DEFAULT_CHUNK_OVERLAP;
  }

  /**
   * Creates the text splitter on first use and caches it.
   *
   * @returns {Promise<object>} The cached splitter instance.
   */
  async getTextSplitter() {
    if (!this.textSplitter) {
      const { RecursiveCharacterTextSplitter } = await import("@langchain/textsplitters");
      this.textSplitter = new RecursiveCharacterTextSplitter({
        chunkSize: this.chunkSize,
        chunkOverlap: this.chunkOverlap,
        lengthFunction: countWords,
        separators: [
          "\n\n",
          "\n",
          ". ",
          "! ",
          "? ",
          " "
        ],
        keepSeparator: false
      });
    }
    return this.textSplitter;
  }

  /**
   * Splits `content` and maps every chunk back to character offsets in
   * `content`. The first chunk's text is prefixed with the file name.
   *
   * @param {string} documentId - Used to build chunk ids (`<id>:chunk:<n>`).
   * @param {string} content
   * @param {string} fileName
   * @returns {Promise<{id: string, documentId: string, chunkIndex: number, text: string, startOffset: number, endOffset: number}[]>}
   * @throws Re-throws any splitter error after logging it, so the caller can
   *   fall back to another chunker.
   */
  async chunkDocument(documentId, content, fileName) {
    try {
      const splitter = await this.getTextSplitter();
      const chunks = await splitter.splitText(content);
      const documentChunks = [];
      for (let index = 0; index < chunks.length; index++) {
        const chunkText = chunks[index];
        const startOffset = this.locateChunkStart(content, chunkText, documentChunks[index - 1]);
        const endOffset = Math.min(startOffset + chunkText.length, content.length);
        const text = index === 0 ? `${fileName} ${chunkText}` : chunkText;
        documentChunks.push({
          id: `${documentId}:chunk:${index}`,
          documentId,
          chunkIndex: index,
          text,
          startOffset,
          endOffset
        });
      }
      this.validateNoWordSplitting(documentChunks, content, documentId);
      logger$7.debug(`Document ${documentId} split into ${documentChunks.length} chunks using LangChain`);
      return documentChunks;
    } catch (error) {
      logger$7.warn(`LangChain chunking failed for ${documentId}, falling back to simple chunking: ${error}`);
      throw error;
    }
  }

  /**
   * Finds where `chunkText` starts inside `content`.
   *
   * The first chunk takes its first occurrence (or 0 when absent). A later
   * chunk is searched for from the overlap region of the previous chunk
   * (never before that chunk's start) and accepted only when it begins
   * before `prevChunk.endOffset + chunkText.length`. Unlike a window cut off
   * at that position, this also finds a chunk that begins just after a
   * separator the splitter dropped. When nothing qualifies, the chunk is
   * assumed to start at the previous chunk's end.
   *
   * @param {string} content
   * @param {string} chunkText
   * @param {{startOffset: number, endOffset: number}} [prevChunk]
   * @returns {number}
   */
  locateChunkStart(content, chunkText, prevChunk) {
    if (!prevChunk) {
      const foundPos = content.indexOf(chunkText);
      return foundPos !== -1 ? foundPos : 0;
    }
    // chunkOverlap counts words; ten characters per word is the file's own
    // rough budget for how far back the overlap may reach.
    const overlapBuffer = Math.max(this.chunkOverlap * 10, chunkText.length);
    const searchStart = Math.max(0, prevChunk.startOffset, prevChunk.endOffset - overlapBuffer);
    const candidateOffset = content.indexOf(chunkText, searchStart);
    if (candidateOffset !== -1 && candidateOffset < prevChunk.endOffset + chunkText.length) {
      return candidateOffset;
    }
    return prevChunk.endOffset;
  }

  /**
   * Logs a warning when two consecutive chunks are separated by
   * non-whitespace text and both sides of that gap are alphanumeric.
   * Diagnostic only; nothing is modified.
   *
   * @param {{startOffset: number, endOffset: number}[]} chunks
   * @param {string} originalContent
   * @param {string} documentId
   */
  validateNoWordSplitting(chunks, originalContent, documentId) {
    const wordChar = /[a-zA-Z0-9]/;
    for (let i = 0; i < chunks.length - 1; i++) {
      const currentChunk = chunks[i];
      const nextChunk = chunks[i + 1];
      if (currentChunk.endOffset >= originalContent.length || nextChunk.startOffset <= currentChunk.endOffset) {
        continue;
      }
      const gap = originalContent.substring(currentChunk.endOffset, nextChunk.startOffset);
      if (gap.trim().length === 0) {
        continue;
      }
      const beforeChar = originalContent[currentChunk.endOffset - 1];
      const afterChar = originalContent[nextChunk.startOffset];
      if (beforeChar && afterChar && wordChar.test(beforeChar) && wordChar.test(afterChar)) {
        logger$7.warn(`Potential word split detected in document ${documentId} between chunks ${i} and ${i + 1}`);
      }
    }
  }

  /**
   * Renders a chunk together with excerpts of its neighbours: the last
   * SNIPPET_LENGTHS.CONTEXT characters of the previous chunk and the first
   * SNIPPET_LENGTHS.CONTEXT characters of the next one.
   *
   * @param {{chunkIndex: number, text: string}} chunk
   * @param {{text: string}[]} allChunks - Indexed by chunkIndex.
   * @returns {string}
   */
  getChunkContext(chunk, allChunks) {
    const prevChunk = chunk.chunkIndex > 0 ? allChunks[chunk.chunkIndex - 1] : null;
    const nextChunk = chunk.chunkIndex < allChunks.length - 1 ? allChunks[chunk.chunkIndex + 1] : null;
    let context = "";
    if (prevChunk) {
      context += `[Previous: ${prevChunk.text.substring(Math.max(0, prevChunk.text.length - SNIPPET_LENGTHS.CONTEXT))}]\n\n`;
    }
    context += chunk.text;
    if (nextChunk) {
      context += `\n\n[Next: ${nextChunk.text.substring(0, SNIPPET_LENGTHS.CONTEXT)}]`;
    }
    return context;
  }
}
|
|
382
|
+
// Module-level logger used by DocumentChunker.
const logger$6 = createLogger("DocumentChunker");
|
|
383
|
+
/**
 * Facade that chunks with LangChainChunker and falls back to FallbackChunker
 * whenever the primary chunker cannot be created or fails at runtime.
 */
class DocumentChunker {
  /**
   * @param {object} [options] - Passed unchanged to whichever chunker is used.
   */
  constructor(options = {}) {
    // Kept so a runtime fallback honours the same options as the primary.
    this.fallbackOptions = options;
    this.fallbackChunker = null;
    try {
      this.chunker = new LangChainChunker(options);
      logger$6.debug("Using LangChain chunker");
    } catch (error) {
      logger$6.warn(`Failed to initialize LangChain chunker, using fallback: ${error}`);
      this.chunker = new FallbackChunker(options);
    }
  }

  /**
   * Chunks a document with the primary chunker; on any failure (thrown or
   * rejected) the fallback chunker is used instead.
   *
   * @param {string} documentId
   * @param {string} content
   * @param {string} fileName
   * @returns {Promise<object[]>} The chunks produced by the chunker that ran.
   */
  async chunkDocument(documentId, content, fileName) {
    try {
      return await this.chunker.chunkDocument(documentId, content, fileName);
    } catch (error) {
      logger$6.warn(`Primary chunker failed, falling back: ${error}`);
      if (!this.fallbackChunker) {
        this.fallbackChunker = new FallbackChunker(this.fallbackOptions);
      }
      return this.fallbackChunker.chunkDocument(documentId, content, fileName);
    }
  }

  /**
   * Delegates to the primary chunker's getChunkContext.
   *
   * @param {object} chunk
   * @param {object[]} allChunks
   * @returns {string}
   */
  getChunkContext(chunk, allChunks) {
    return this.chunker.getChunkContext(chunk, allChunks);
  }
}
|
|
407
|
+
// Constructs a DocumentChunker and discards the instance.
// NOTE(review): looks like a bundler leftover of an unused exported singleton — confirm before removing.
new DocumentChunker();
|
|
408
|
+
// Module-level logger used by PDFJSExtractor.
const logger$5 = createLogger("PDFJSExtractor");
|
|
409
|
+
/**
 * Extracts the embedded text layer of PDF files with pdf.js (`pdfjs-dist`),
 * which is imported lazily on first use.
 */
class PDFJSExtractor {
  /**
   * @param {string} fileType - File extension without the dot.
   * @returns {boolean} True only for "pdf" (case-insensitive).
   */
  canExtract(fileType) {
    return fileType.toLowerCase() === "pdf";
  }

  /**
   * Reads every page's text content and joins the non-empty pages.
   *
   * @param {object} file - Must provide `getContents({ blob: true })` and
   *   `getName()`.
   * @param {{includePageNumbers?: boolean, pageSeparator?: string}} [options]
   *   Page markers (`[Page N]`) are on unless `includePageNumbers` is false;
   *   `pageSeparator` defaults to a blank line and may be the empty string.
   * @returns {Promise<string>}
   * @throws {Error} "PDF text extraction failed: …" (original error attached
   *   as `cause`), including when the PDF holds no extractable text.
   */
  async extractText(file, options = {}) {
    let loadingTask;
    try {
      const pdfjsLib = await import("pdfjs-dist");
      if (!pdfjsLib.GlobalWorkerOptions.workerSrc) {
        // No worker configured by the host application: load the matching
        // worker build from the unpkg CDN.
        pdfjsLib.GlobalWorkerOptions.workerSrc = `https://unpkg.com/pdfjs-dist@${pdfjsLib.version}/build/pdf.worker.min.mjs`;
      }
      const pdfBlob = await file.getContents({ blob: true });
      const arrayBuffer = await pdfBlob.arrayBuffer();
      loadingTask = pdfjsLib.getDocument({
        data: arrayBuffer,
        useSystemFonts: true
      });
      const pdf = await loadingTask.promise;
      const numPages = pdf.numPages;
      const textParts = [];
      const includePageNumbers = options.includePageNumbers !== false;
      // `??` rather than `||` so an explicit "" separator is honoured.
      const pageSeparator = options.pageSeparator ?? "\n\n";
      for (let pageNum = 1; pageNum <= numPages; pageNum++) {
        const page = await pdf.getPage(pageNum);
        const textContent = await page.getTextContent();
        const pageText = textContent.items.map((item) => item.str).join(" ");
        if (pageText.trim()) {
          textParts.push(includePageNumbers ? `[Page ${pageNum}]\n${pageText}` : pageText);
        }
      }
      const extractedText = textParts.join(pageSeparator);
      if (!extractedText || extractedText.trim().length === 0) {
        throw new Error("PDF appears to contain no extractable text (may be image-based or scanned)");
      }
      logger$5.debug(`Extracted ${numPages} pages from PDF: ${file.getName()}`);
      return extractedText;
    } catch (error) {
      logger$5.warn(`Failed to extract text from PDF ${file.getName()}: ${error}`);
      throw new Error(`PDF text extraction failed: ${error}`, { cause: error });
    } finally {
      // Release the document and its worker resources whether or not the
      // extraction succeeded; a cleanup failure must not hide the result.
      if (loadingTask) {
        try {
          await loadingTask.destroy();
        } catch (destroyError) {
          logger$5.debug(`Failed to release PDF resources for ${file.getName()}: ${destroyError}`);
        }
      }
    }
  }
}
|
|
455
|
+
// Module-level logger used by LLMOCRExtractor.
const logger$4 = createLogger("LLMOCRExtractor");
|
|
456
|
+
/**
 * Extracts text from PDFs and images by posting them to the OCR endpoint of
 * a configured AI provider whose name contains "mistral" and which has an
 * `ocrApiEndpoint` parameter.
 */
class LLMOCRExtractor {
  /**
   * @param {string} fileType - File extension without the dot.
   * @returns {boolean}
   */
  canExtract(fileType) {
    return ["pdf", "png", "jpg", "jpeg", "tiff", "tif"].includes(fileType.toLowerCase());
  }

  /**
   * Sends the file (base64 data URL) to the OCR endpoint and joins the text
   * of all non-empty pages in the response.
   *
   * @param {object} file - Must provide `getName()` and
   *   `getContents({ blob: true })`.
   * @param {{fileType?: string, includePageNumbers?: boolean, pageSeparator?: string}} [options]
   *   `fileType` defaults to the file name's extension, then "pdf".
   * @returns {Promise<string>}
   * @throws {Error} When no OCR provider is configured, or
   *   "OCR text extraction failed: …" (original error attached as `cause`).
   */
  async extractText(file, options = {}) {
    const fileName = file.getName();
    const fileType = options?.fileType || fileName.split(".").pop()?.toLowerCase() || "pdf";
    const providers = await aiService.getProviders();
    const ocrProvider = providers.find((p) => {
      const endpoint = p.parameters?.["ocrApiEndpoint"];
      return endpoint && p.name.toLowerCase().includes("mistral");
    });
    const ocrEndpoint = ocrProvider?.parameters?.["ocrApiEndpoint"];
    if (!ocrProvider || !ocrEndpoint) {
      throw new Error("Mistral OCR provider not configured. Please add ocrApiEndpoint to the provider parameters in AI settings.");
    }
    try {
      const fileBlob = await file.getContents({ blob: true });
      const base64Content = await this.blobToBase64(fileBlob);
      const mimeType = this.getMimeType(fileType);
      const dataUrl = `data:${mimeType};base64,${base64Content}`;
      // The OCR API distinguishes image inputs from document inputs.
      const document = mimeType.startsWith("image/")
        ? { type: "image_url", image_url: dataUrl }
        : { type: "document_url", document_url: dataUrl };
      const response = await fetch(ocrEndpoint, {
        method: "POST",
        headers: {
          "Authorization": `Bearer ${ocrProvider.apiKey}`,
          "Content-Type": "application/json"
        },
        body: JSON.stringify({
          model: ocrProvider.parameters?.["ocrModel"] || ocrProvider.model || "mistral-ocr-latest",
          document,
          include_image_base64: false
        })
      });
      if (!response.ok) {
        const errorText = await response.text().catch(() => "Unknown error");
        throw new Error(`OCR request failed: HTTP ${response.status}: ${errorText}`);
      }
      const result = await response.json();
      if (!result.pages || !Array.isArray(result.pages)) {
        throw new Error("Invalid OCR response format: missing pages array");
      }
      const includePageNumbers = options.includePageNumbers !== false;
      // `??` rather than `||` so an explicit "" separator is honoured.
      const pageSeparator = options.pageSeparator ?? "\n\n";
      const textParts = result.pages.map((page, index) => {
        const pageText = page?.markdown || page?.text || "";
        if (!pageText.trim()) {
          return null;
        }
        return includePageNumbers ? `[Page ${index + 1}]\n${pageText}` : pageText;
      }).filter((text) => text !== null);
      if (textParts.length === 0) {
        throw new Error("No text content found in OCR response");
      }
      const extractedText = textParts.join(pageSeparator);
      logger$4.debug(`Extracted ${result.pages.length} pages from ${fileType} file: ${fileName}`);
      return extractedText;
    } catch (error) {
      logger$4.warn(`Failed to extract text using OCR from ${fileName}: ${error}`);
      throw new Error(`OCR text extraction failed: ${error}`, { cause: error });
    }
  }

  /**
   * Reads a Blob and resolves with the base64 payload of its data URL.
   *
   * @param {Blob} blob
   * @returns {Promise<string>}
   */
  async blobToBase64(blob) {
    return new Promise((resolve, reject) => {
      const reader = new FileReader();
      // `load` fires only on success. `loadend` also fires after an error
      // or abort, when `result` is null and splitting it would throw.
      reader.onload = () => {
        const result = reader.result;
        if (typeof result !== "string") {
          reject(new Error("Failed to read file contents as a data URL"));
          return;
        }
        resolve(result.split(",")[1]);
      };
      reader.onerror = () => reject(reader.error ?? new Error("Failed to read file contents"));
      reader.onabort = () => reject(new Error("Reading file contents was aborted"));
      reader.readAsDataURL(blob);
    });
  }

  /**
   * @param {string} fileType - File extension without the dot.
   * @returns {string} MIME type, or "application/octet-stream" when unknown.
   */
  getMimeType(fileType) {
    const mimeTypes = {
      "pdf": "application/pdf",
      "png": "image/png",
      "jpg": "image/jpeg",
      "jpeg": "image/jpeg",
      "tiff": "image/tiff",
      "tif": "image/tiff"
    };
    return mimeTypes[fileType.toLowerCase()] || "application/octet-stream";
  }
}
|
|
547
|
+
// Module-level logger used by DocumentExtractor.
const logger$3 = createLogger("DocumentExtractor");
|
|
548
|
+
/**
 * Tries the registered extractors in priority order (OCR first, then pdf.js)
 * and returns the first successful result.
 */
class DocumentExtractor {
  constructor() {
    this.extractors = [
      new LLMOCRExtractor(),
      new PDFJSExtractor()
    ];
  }

  /**
   * @param {string} fileType - File extension without the dot.
   * @returns {boolean} True when at least one extractor accepts the type.
   */
  canExtract(fileType) {
    return this.extractors.some((extractor) => extractor.canExtract(fileType));
  }

  /**
   * Extracts text with the first extractor that succeeds. When the OCR
   * extractor fails and a pdf.js extractor is still to be tried, the user is
   * notified of the fallback with a toast.
   *
   * @param {object} file - Must provide `getName()`; passed on to extractors.
   * @param {{fileType?: string}} [options] - `fileType` defaults to the file
   *   name's extension, then "txt". Forwarded to the extractors.
   * @returns {Promise<string>}
   * @throws {Error} When no extractor accepts the type, or the last
   *   extractor's error when all of them fail.
   */
  async extractText(file, options) {
    const fileName = file.getName();
    const fileType = options?.fileType || fileName.split(".").pop()?.toLowerCase() || "txt";
    const availableExtractors = this.extractors.filter((ext) => ext.canExtract(fileType));
    if (availableExtractors.length === 0) {
      throw new Error(`No extractor available for file type: ${fileType}`);
    }
    let lastError = null;
    for (let index = 0; index < availableExtractors.length; index++) {
      const extractor = availableExtractors[index];
      try {
        logger$3.debug(`Using ${extractor.constructor.name} for file type: ${fileType}`);
        return await extractor.extractText(file, { ...options, fileType });
      } catch (error) {
        lastError = error instanceof Error ? error : new Error(String(error));
        logger$3.warn(`${extractor.constructor.name} failed for ${fileName}: ${lastError.message}`);
        // The loop itself performs the fallback, so each extractor runs at
        // most once; here we only announce it.
        const pdfFallbackPending = availableExtractors
          .slice(index + 1)
          .some((ext) => ext instanceof PDFJSExtractor);
        if (extractor instanceof LLMOCRExtractor && pdfFallbackPending) {
          const warningMsg = `Mistral OCR extraction failed, falling back to PDF.js extractor for ${fileName}`;
          logger$3.warn(warningMsg);
          toastWarning(warningMsg);
        }
      }
    }
    throw lastError || new Error(`All extractors failed for file type: ${fileType}`);
  }
}
|
|
593
|
+
// Cached promise for the loaded RxDB modules; set by getRxDbModules on first use.
let rxdbModulesPromise = null;
|
|
594
|
+
// Set to true once getRxDbModules has registered the RxDB plugins, so they are added only once.
let pluginsRegistered = false;
|
|
595
|
+
/**
 * Dynamically imports RxDB and the plugins this service uses, in parallel.
 *
 * @returns {Promise<object[]>} Module namespaces in this order: rxdb,
 *   storage-dexie, query-builder, migration-schema, update.
 */
async function importRxdbModules() {
  const pendingModules = [
    import("rxdb"),
    import("rxdb/plugins/storage-dexie"),
    import("rxdb/plugins/query-builder"),
    import("rxdb/plugins/migration-schema"),
    import("rxdb/plugins/update")
  ];
  return Promise.all(pendingModules);
}
|
|
604
|
+
/**
 * Loads RxDB once, registers its query-builder, migration-schema and update
 * plugins, and caches the result for later callers.
 *
 * A failed load is not cached: the next call starts a fresh import instead
 * of returning the same rejection forever.
 *
 * @returns {Promise<{rxdb: object, storageDexie: object}>}
 */
async function getRxDbModules() {
  if (!rxdbModulesPromise) {
    const pending = importRxdbModules().then(([
      rxdb,
      storageDexie,
      queryBuilder,
      migrationSchema,
      update
    ]) => {
      if (!pluginsRegistered) {
        rxdb.addRxPlugin(queryBuilder.RxDBQueryBuilderPlugin);
        rxdb.addRxPlugin(migrationSchema.RxDBMigrationSchemaPlugin);
        rxdb.addRxPlugin(update.RxDBUpdatePlugin);
        pluginsRegistered = true;
      }
      return {
        rxdb,
        storageDexie
      };
    });
    rxdbModulesPromise = pending;
    // Drop the cache on failure (unless a newer attempt already replaced
    // it). Callers still receive the rejection through `pending`.
    pending.catch(() => {
      if (rxdbModulesPromise === pending) {
        rxdbModulesPromise = null;
      }
    });
  }
  return rxdbModulesPromise;
}
|
|
627
|
+
// Module-level logger used by DocumentIndexService.
const logger$2 = createLogger("DocumentIndexService");
|
|
628
|
+
class DocumentIndexService {
|
|
629
|
+
constructor() {
|
|
630
|
+
this.sampleVectors = [];
|
|
631
|
+
this.isInitialized = false;
|
|
632
|
+
this.DEFAULT_MAX_FILE_SIZE = 5 * 1024 * 1024;
|
|
633
|
+
this.chunker = new DocumentChunker();
|
|
634
|
+
this.documentExtractor = new DocumentExtractor();
|
|
635
|
+
this.DEFAULT_INDEXABLE_TYPES = [
|
|
636
|
+
"md",
|
|
637
|
+
"txt",
|
|
638
|
+
"ts",
|
|
639
|
+
"tsx",
|
|
640
|
+
"js",
|
|
641
|
+
"jsx",
|
|
642
|
+
"json",
|
|
643
|
+
"geojson",
|
|
644
|
+
"kml",
|
|
645
|
+
"gpx",
|
|
646
|
+
"py",
|
|
647
|
+
"html",
|
|
648
|
+
"css",
|
|
649
|
+
"sql",
|
|
650
|
+
"xml",
|
|
651
|
+
"yaml",
|
|
652
|
+
"yml",
|
|
653
|
+
"pdf"
|
|
654
|
+
];
|
|
655
|
+
}
|
|
656
|
+
async initialize() {
|
|
657
|
+
if (this.isInitialized) {
|
|
658
|
+
return;
|
|
659
|
+
}
|
|
660
|
+
logger$2.info("Initializing document index service with RxDB...");
|
|
661
|
+
try {
|
|
662
|
+
const { rxdb, storageDexie } = await getRxDbModules();
|
|
663
|
+
this.db = await rxdb.createRxDatabase({
|
|
664
|
+
name: "document-index-db",
|
|
665
|
+
storage: storageDexie.getRxStorageDexie(),
|
|
666
|
+
ignoreDuplicate: true
|
|
667
|
+
});
|
|
668
|
+
const documentsSchema = {
|
|
669
|
+
version: 0,
|
|
670
|
+
primaryKey: "id",
|
|
671
|
+
type: "object",
|
|
672
|
+
properties: {
|
|
673
|
+
id: { type: "string", maxLength: 500 },
|
|
674
|
+
workspacePath: { type: "string" },
|
|
675
|
+
filePath: { type: "string" },
|
|
676
|
+
fileName: { type: "string" },
|
|
677
|
+
fileType: { type: "string" },
|
|
678
|
+
content: { type: "string" },
|
|
679
|
+
contentHash: { type: "string" },
|
|
680
|
+
metadata: {
|
|
681
|
+
type: "object",
|
|
682
|
+
properties: {
|
|
683
|
+
size: { type: "number" },
|
|
684
|
+
lastModified: { type: "number" },
|
|
685
|
+
language: { type: "string" },
|
|
686
|
+
tags: {
|
|
687
|
+
type: "array",
|
|
688
|
+
items: { type: "string" }
|
|
689
|
+
}
|
|
690
|
+
}
|
|
691
|
+
},
|
|
692
|
+
indexedAt: { type: "number" },
|
|
693
|
+
updatedAt: { type: "number" }
|
|
694
|
+
},
|
|
695
|
+
required: ["id", "workspacePath", "filePath", "content", "contentHash"],
|
|
696
|
+
indexes: ["workspacePath", "filePath", "fileType"]
|
|
697
|
+
};
|
|
698
|
+
const vectorsSchema = {
|
|
699
|
+
version: 1,
|
|
700
|
+
primaryKey: "id",
|
|
701
|
+
type: "object",
|
|
702
|
+
properties: {
|
|
703
|
+
id: { type: "string", maxLength: 500 },
|
|
704
|
+
documentId: { type: "string", maxLength: 500 },
|
|
705
|
+
embedding: {
|
|
706
|
+
type: "array",
|
|
707
|
+
items: { type: "number" }
|
|
708
|
+
},
|
|
709
|
+
idx0: { type: "number" },
|
|
710
|
+
idx1: { type: "number" },
|
|
711
|
+
idx2: { type: "number" },
|
|
712
|
+
idx3: { type: "number" },
|
|
713
|
+
idx4: { type: "number" },
|
|
714
|
+
chunkIndex: { type: "number" },
|
|
715
|
+
chunkStartOffset: { type: "number" },
|
|
716
|
+
chunkEndOffset: { type: "number" }
|
|
717
|
+
},
|
|
718
|
+
required: ["id", "documentId", "embedding", "idx0", "idx1", "idx2", "idx3", "idx4"],
|
|
719
|
+
indexes: ["documentId", "chunkIndex", "idx0", "idx1", "idx2", "idx3", "idx4"]
|
|
720
|
+
};
|
|
721
|
+
const collectionsToAdd = {
|
|
722
|
+
documents: { schema: documentsSchema },
|
|
723
|
+
vectors: { schema: vectorsSchema }
|
|
724
|
+
};
|
|
725
|
+
try {
|
|
726
|
+
await this.db.addCollections(collectionsToAdd);
|
|
727
|
+
} catch (error) {
|
|
728
|
+
if (error?.code === "DB8" || error?.message?.includes("already exists")) {
|
|
729
|
+
logger$2.debug("Collections already exist, using existing collections");
|
|
730
|
+
} else {
|
|
731
|
+
throw error;
|
|
732
|
+
}
|
|
733
|
+
}
|
|
734
|
+
this.documentsCollection = this.db.documents;
|
|
735
|
+
this.vectorsCollection = this.db.vectors;
|
|
736
|
+
await this.initializeSampleVectors();
|
|
737
|
+
await this.handleSchemaMigration();
|
|
738
|
+
this.isInitialized = true;
|
|
739
|
+
const count = await this.documentsCollection.count().exec();
|
|
740
|
+
const vectorCount = await this.vectorsCollection.count().exec();
|
|
741
|
+
logger$2.info(`Document index service initialized with ${count} documents and ${vectorCount} embeddings`);
|
|
742
|
+
subscribe(TOPIC_WORKSPACE_CONNECTED, (workspace) => {
|
|
743
|
+
if (workspace) {
|
|
744
|
+
this.handleWorkspaceChange(workspace).catch((err) => {
|
|
745
|
+
logger$2.error(`Failed to handle workspace connection: ${err}`);
|
|
746
|
+
});
|
|
747
|
+
}
|
|
748
|
+
});
|
|
749
|
+
subscribe(TOPIC_WORKSPACE_CHANGED, (workspace) => {
|
|
750
|
+
if (workspace) {
|
|
751
|
+
this.handleWorkspaceChange(workspace).catch((err) => {
|
|
752
|
+
logger$2.error(`Failed to handle workspace change: ${err}`);
|
|
753
|
+
});
|
|
754
|
+
}
|
|
755
|
+
});
|
|
756
|
+
logger$2.info("Document index service initialized");
|
|
757
|
+
} catch (error) {
|
|
758
|
+
logger$2.error(`Failed to initialize document index service: ${error}`);
|
|
759
|
+
throw error;
|
|
760
|
+
}
|
|
761
|
+
}
|
|
762
|
+
ensureInitialized() {
|
|
763
|
+
if (!this.isInitialized || !this.documentsCollection || !this.vectorsCollection) {
|
|
764
|
+
throw new Error("Document index service not initialized. Call initialize() first.");
|
|
765
|
+
}
|
|
766
|
+
}
|
|
767
|
+
async initializeSampleVectors() {
|
|
768
|
+
if (this.sampleVectors.length > 0) {
|
|
769
|
+
return;
|
|
770
|
+
}
|
|
771
|
+
const existingEmbeddings = await this.vectorsCollection.find().limit(1e3).exec();
|
|
772
|
+
const embeddingArrays = existingEmbeddings.map((v) => v.embedding);
|
|
773
|
+
this.sampleVectors = generateSampleVectors(
|
|
774
|
+
VECTOR_SEARCH_CONFIG.SAMPLE_VECTOR_COUNT,
|
|
775
|
+
embeddingService.getEmbeddingDimension(),
|
|
776
|
+
embeddingArrays.length > 0 ? embeddingArrays : void 0
|
|
777
|
+
);
|
|
778
|
+
logger$2.info(`Sample vectors initialized for index range method: ${this.sampleVectors.length} vectors, ${existingEmbeddings.length} existing embeddings`);
|
|
779
|
+
}
|
|
780
|
+
generateDocumentId(workspacePath, filePath) {
|
|
781
|
+
return `${workspacePath}:${filePath}`;
|
|
782
|
+
}
|
|
783
|
+
async computeContentHash(content) {
|
|
784
|
+
const encoder = new TextEncoder();
|
|
785
|
+
const data = encoder.encode(content);
|
|
786
|
+
const hashBuffer = await crypto.subtle.digest("SHA-256", data);
|
|
787
|
+
const hashArray = Array.from(new Uint8Array(hashBuffer));
|
|
788
|
+
return hashArray.map((b) => b.toString(16).padStart(2, "0")).join("");
|
|
789
|
+
}
|
|
790
|
+
isIndexableFile(file, options) {
|
|
791
|
+
const fileName = file.getName().toLowerCase();
|
|
792
|
+
const fileTypes = options?.fileTypes || this.DEFAULT_INDEXABLE_TYPES;
|
|
793
|
+
const extension = fileName.split(".").pop();
|
|
794
|
+
if (!extension || !fileTypes.includes(extension)) {
|
|
795
|
+
return false;
|
|
796
|
+
}
|
|
797
|
+
if (options?.excludePatterns) {
|
|
798
|
+
for (const pattern of options.excludePatterns) {
|
|
799
|
+
if (fileName.includes(pattern) || file.getWorkspacePath().includes(pattern)) {
|
|
800
|
+
return false;
|
|
801
|
+
}
|
|
802
|
+
}
|
|
803
|
+
}
|
|
804
|
+
return true;
|
|
805
|
+
}
|
|
806
|
+
detectLanguage(fileName) {
|
|
807
|
+
const ext = fileName.split(".").pop()?.toLowerCase();
|
|
808
|
+
const langMap = {
|
|
809
|
+
"ts": "typescript",
|
|
810
|
+
"tsx": "typescript",
|
|
811
|
+
"js": "javascript",
|
|
812
|
+
"jsx": "javascript",
|
|
813
|
+
"py": "python",
|
|
814
|
+
"md": "markdown",
|
|
815
|
+
"json": "json",
|
|
816
|
+
"geojson": "geojson",
|
|
817
|
+
"kml": "xml",
|
|
818
|
+
"gpx": "xml",
|
|
819
|
+
"html": "html",
|
|
820
|
+
"css": "css",
|
|
821
|
+
"sql": "sql",
|
|
822
|
+
"xml": "xml",
|
|
823
|
+
"yaml": "yaml",
|
|
824
|
+
"yml": "yaml",
|
|
825
|
+
"pdf": "pdf"
|
|
826
|
+
};
|
|
827
|
+
return langMap[ext || ""] || "text";
|
|
828
|
+
}
|
|
829
|
+
async indexDocument(file, options = {}) {
|
|
830
|
+
if (!this.isInitialized) {
|
|
831
|
+
await this.initialize();
|
|
832
|
+
}
|
|
833
|
+
this.ensureInitialized();
|
|
834
|
+
const workspace = file.getWorkspace();
|
|
835
|
+
const workspacePath = workspace.getName();
|
|
836
|
+
const filePath = file.getWorkspacePath();
|
|
837
|
+
const fileName = file.getName();
|
|
838
|
+
const id = this.generateDocumentId(workspacePath, filePath);
|
|
839
|
+
if (!this.isIndexableFile(file, options)) {
|
|
840
|
+
throw new Error(`File type not indexable: ${fileName}`);
|
|
841
|
+
}
|
|
842
|
+
try {
|
|
843
|
+
let content;
|
|
844
|
+
const fileType = fileName.split(".").pop()?.toLowerCase() || "txt";
|
|
845
|
+
if (this.documentExtractor.canExtract(fileType)) {
|
|
846
|
+
content = await this.documentExtractor.extractText(file, { fileType });
|
|
847
|
+
} else {
|
|
848
|
+
const fileContent = await file.getContents({ contentType: FileContentType.TEXT });
|
|
849
|
+
if (typeof fileContent !== "string") {
|
|
850
|
+
throw new Error(`File content is not text: ${fileName}`);
|
|
851
|
+
}
|
|
852
|
+
content = fileContent;
|
|
853
|
+
}
|
|
854
|
+
if (!content || content.trim().length === 0) {
|
|
855
|
+
throw new Error(`File appears to be empty or text extraction failed: ${fileName}`);
|
|
856
|
+
}
|
|
857
|
+
const maxSize = options.maxFileSize || this.DEFAULT_MAX_FILE_SIZE;
|
|
858
|
+
if (content.length > maxSize) {
|
|
859
|
+
throw new Error(`File too large to index: ${fileName} (${content.length} bytes)`);
|
|
860
|
+
}
|
|
861
|
+
const contentHash = await this.computeContentHash(content);
|
|
862
|
+
const now = Date.now();
|
|
863
|
+
const existing = await this.documentsCollection.findOne(id).exec();
|
|
864
|
+
const existingDoc = existing ? existing.toJSON() : null;
|
|
865
|
+
const existingTags = existingDoc?.metadata.tags || [];
|
|
866
|
+
const newTags = options.tags || [];
|
|
867
|
+
const mergedTags = [.../* @__PURE__ */ new Set([...existingTags, ...newTags])];
|
|
868
|
+
const tagsChanged = mergedTags.length !== existingTags.length || newTags.some((tag) => !existingTags.includes(tag));
|
|
869
|
+
if (existingDoc && existingDoc.contentHash === contentHash && !tagsChanged) {
|
|
870
|
+
logger$2.debug(`Document already indexed and unchanged: ${id}`);
|
|
871
|
+
return existingDoc;
|
|
872
|
+
}
|
|
873
|
+
const language = this.detectLanguage(fileName);
|
|
874
|
+
let lastModified = now;
|
|
875
|
+
try {
|
|
876
|
+
const fileHandle = file.getHandle?.();
|
|
877
|
+
if (fileHandle) {
|
|
878
|
+
const fileInfo = await fileHandle.getFile();
|
|
879
|
+
lastModified = fileInfo.lastModified;
|
|
880
|
+
}
|
|
881
|
+
} catch (err) {
|
|
882
|
+
logger$2.debug(`Could not get file modification time: ${err}`);
|
|
883
|
+
}
|
|
884
|
+
const contentChanged = !existingDoc || existingDoc.contentHash !== contentHash;
|
|
885
|
+
const document = {
|
|
886
|
+
id,
|
|
887
|
+
workspacePath,
|
|
888
|
+
filePath,
|
|
889
|
+
fileName,
|
|
890
|
+
fileType,
|
|
891
|
+
content: options.includeContent !== false ? content : "",
|
|
892
|
+
contentHash,
|
|
893
|
+
metadata: {
|
|
894
|
+
size: content.length,
|
|
895
|
+
lastModified,
|
|
896
|
+
language,
|
|
897
|
+
tags: mergedTags
|
|
898
|
+
},
|
|
899
|
+
indexedAt: existingDoc?.indexedAt || now,
|
|
900
|
+
updatedAt: now
|
|
901
|
+
};
|
|
902
|
+
await this.documentsCollection.upsert(document);
|
|
903
|
+
if (contentChanged) {
|
|
904
|
+
await this.generateAndStoreEmbedding(document);
|
|
905
|
+
} else {
|
|
906
|
+
logger$2.debug(`Document content unchanged, skipping embedding regeneration: ${id}`);
|
|
907
|
+
}
|
|
908
|
+
logger$2.debug(`Indexed document: ${id}`);
|
|
909
|
+
return document;
|
|
910
|
+
} catch (error) {
|
|
911
|
+
logger$2.error(`Failed to index document ${id}: ${error}`);
|
|
912
|
+
throw error;
|
|
913
|
+
}
|
|
914
|
+
}
|
|
915
|
+
async getDocument(id) {
|
|
916
|
+
if (!this.isInitialized) {
|
|
917
|
+
await this.initialize();
|
|
918
|
+
}
|
|
919
|
+
this.ensureInitialized();
|
|
920
|
+
const doc = await this.documentsCollection.findOne(id).exec();
|
|
921
|
+
return doc ? doc.toJSON() : null;
|
|
922
|
+
}
|
|
923
|
+
async getDocumentByPath(workspacePath, filePath) {
|
|
924
|
+
const id = this.generateDocumentId(workspacePath, filePath);
|
|
925
|
+
return this.getDocument(id);
|
|
926
|
+
}
|
|
927
|
+
async listDocuments(workspacePath) {
|
|
928
|
+
if (!this.isInitialized) {
|
|
929
|
+
await this.initialize();
|
|
930
|
+
}
|
|
931
|
+
this.ensureInitialized();
|
|
932
|
+
let query = this.documentsCollection.find();
|
|
933
|
+
if (workspacePath) {
|
|
934
|
+
query = query.where("workspacePath").eq(workspacePath);
|
|
935
|
+
}
|
|
936
|
+
const docs = await query.exec();
|
|
937
|
+
return docs.map((doc) => doc.toJSON());
|
|
938
|
+
}
|
|
939
|
+
async deleteDocument(id) {
|
|
940
|
+
if (!this.isInitialized) {
|
|
941
|
+
await this.initialize();
|
|
942
|
+
}
|
|
943
|
+
this.ensureInitialized();
|
|
944
|
+
const doc = await this.documentsCollection.findOne(id).exec();
|
|
945
|
+
if (doc) {
|
|
946
|
+
await doc.remove();
|
|
947
|
+
const vectors = await this.vectorsCollection.find().where("documentId").eq(id).exec();
|
|
948
|
+
for (const vector of vectors) {
|
|
949
|
+
await vector.remove();
|
|
950
|
+
}
|
|
951
|
+
logger$2.debug(`Deleted document ${id} and ${vectors.length} associated embeddings`);
|
|
952
|
+
return true;
|
|
953
|
+
}
|
|
954
|
+
return false;
|
|
955
|
+
}
|
|
956
|
+
async deleteDocumentByPath(workspacePath, filePath) {
|
|
957
|
+
const id = this.generateDocumentId(workspacePath, filePath);
|
|
958
|
+
return this.deleteDocument(id);
|
|
959
|
+
}
|
|
960
|
+
async handleSchemaMigration() {
|
|
961
|
+
if (!this.vectorsCollection || !this.documentsCollection) {
|
|
962
|
+
return;
|
|
963
|
+
}
|
|
964
|
+
try {
|
|
965
|
+
const allVectors = await this.vectorsCollection.find().exec();
|
|
966
|
+
const vectorsWithoutChunks = allVectors.filter((v) => {
|
|
967
|
+
const data = v.toJSON();
|
|
968
|
+
return data.chunkIndex === void 0 && data.chunkStartOffset === void 0 && data.chunkEndOffset === void 0;
|
|
969
|
+
});
|
|
970
|
+
if (vectorsWithoutChunks.length === 0) {
|
|
971
|
+
logger$2.debug("No vectors need migration - all have chunk information");
|
|
972
|
+
return;
|
|
973
|
+
}
|
|
974
|
+
logger$2.info(`Detected ${vectorsWithoutChunks.length} vectors without chunk information. Invalidating and reindexing...`);
|
|
975
|
+
const documentIdsToReindex = /* @__PURE__ */ new Set();
|
|
976
|
+
for (const vector of vectorsWithoutChunks) {
|
|
977
|
+
const data = vector.toJSON();
|
|
978
|
+
documentIdsToReindex.add(data.documentId);
|
|
979
|
+
await vector.remove();
|
|
980
|
+
}
|
|
981
|
+
logger$2.info(`Removed ${vectorsWithoutChunks.length} old vectors. Reindexing ${documentIdsToReindex.size} documents...`);
|
|
982
|
+
for (const documentId of documentIdsToReindex) {
|
|
983
|
+
const doc = await this.documentsCollection.findOne(documentId).exec();
|
|
984
|
+
if (doc) {
|
|
985
|
+
const document = doc.toJSON();
|
|
986
|
+
logger$2.debug(`Reindexing document: ${document.fileName}`);
|
|
987
|
+
await this.generateAndStoreEmbedding(document);
|
|
988
|
+
}
|
|
989
|
+
}
|
|
990
|
+
logger$2.info(`Schema migration completed. Reindexed ${documentIdsToReindex.size} documents.`);
|
|
991
|
+
} catch (error) {
|
|
992
|
+
logger$2.error(`Error during schema migration: ${error}`);
|
|
993
|
+
throw error;
|
|
994
|
+
}
|
|
995
|
+
}
|
|
996
|
+
async deleteWorkspace(workspacePath) {
|
|
997
|
+
if (!this.isInitialized) {
|
|
998
|
+
await this.initialize();
|
|
999
|
+
}
|
|
1000
|
+
this.ensureInitialized();
|
|
1001
|
+
const docs = await this.documentsCollection.find().where("workspacePath").eq(workspacePath).exec();
|
|
1002
|
+
const count = docs.length;
|
|
1003
|
+
for (const doc of docs) {
|
|
1004
|
+
await doc.remove();
|
|
1005
|
+
}
|
|
1006
|
+
if (count > 0) {
|
|
1007
|
+
logger$2.info(`Deleted ${count} documents for workspace: ${workspacePath}`);
|
|
1008
|
+
}
|
|
1009
|
+
return count;
|
|
1010
|
+
}
|
|
1011
|
+
async updateDocumentMetadata(id, updates) {
|
|
1012
|
+
if (!this.isInitialized) {
|
|
1013
|
+
await this.initialize();
|
|
1014
|
+
}
|
|
1015
|
+
this.ensureInitialized();
|
|
1016
|
+
const doc = await this.documentsCollection.findOne(id).exec();
|
|
1017
|
+
if (!doc) {
|
|
1018
|
+
return null;
|
|
1019
|
+
}
|
|
1020
|
+
const current = doc.toJSON();
|
|
1021
|
+
const updated = {
|
|
1022
|
+
...current,
|
|
1023
|
+
metadata: {
|
|
1024
|
+
...current.metadata,
|
|
1025
|
+
...updates.metadata
|
|
1026
|
+
},
|
|
1027
|
+
updatedAt: Date.now()
|
|
1028
|
+
};
|
|
1029
|
+
await doc.update({ $set: updated });
|
|
1030
|
+
logger$2.debug(`Updated document metadata: ${id}`);
|
|
1031
|
+
return updated;
|
|
1032
|
+
}
|
|
1033
|
+
async indexWorkspace(workspace, options = {}) {
|
|
1034
|
+
if (!this.isInitialized) {
|
|
1035
|
+
await this.initialize();
|
|
1036
|
+
}
|
|
1037
|
+
const workspacePath = workspace.getName();
|
|
1038
|
+
logger$2.info(`Starting workspace indexing: ${workspacePath}`);
|
|
1039
|
+
const files = await this.collectFiles(workspace, options);
|
|
1040
|
+
logger$2.info(`Found ${files.length} files to index`);
|
|
1041
|
+
let indexed = 0;
|
|
1042
|
+
let failed = 0;
|
|
1043
|
+
const errors = [];
|
|
1044
|
+
for (const file of files) {
|
|
1045
|
+
try {
|
|
1046
|
+
await this.indexDocument(file, options);
|
|
1047
|
+
indexed++;
|
|
1048
|
+
} catch (error) {
|
|
1049
|
+
failed++;
|
|
1050
|
+
const errorMsg = `Failed to index ${file.getName()}: ${error}`;
|
|
1051
|
+
errors.push(errorMsg);
|
|
1052
|
+
logger$2.warn(errorMsg);
|
|
1053
|
+
}
|
|
1054
|
+
}
|
|
1055
|
+
logger$2.info(`Workspace indexing complete: ${indexed} indexed, ${failed} failed`);
|
|
1056
|
+
return { indexed, failed, errors };
|
|
1057
|
+
}
|
|
1058
|
+
async collectFiles(directory, options, files = []) {
|
|
1059
|
+
try {
|
|
1060
|
+
const children = await directory.listChildren(false);
|
|
1061
|
+
for (const child of children) {
|
|
1062
|
+
if (child instanceof File) {
|
|
1063
|
+
if (this.isIndexableFile(child, options)) {
|
|
1064
|
+
files.push(child);
|
|
1065
|
+
}
|
|
1066
|
+
} else if (child instanceof Directory) {
|
|
1067
|
+
await this.collectFiles(child, options, files);
|
|
1068
|
+
}
|
|
1069
|
+
}
|
|
1070
|
+
} catch (error) {
|
|
1071
|
+
logger$2.warn(`Failed to collect files from ${directory.getName()}: ${error}`);
|
|
1072
|
+
}
|
|
1073
|
+
return files;
|
|
1074
|
+
}
|
|
1075
|
+
async reindexDocument(file, options = {}) {
|
|
1076
|
+
const workspace = file.getWorkspace();
|
|
1077
|
+
const workspacePath = workspace.getName();
|
|
1078
|
+
const filePath = file.getWorkspacePath();
|
|
1079
|
+
const id = this.generateDocumentId(workspacePath, filePath);
|
|
1080
|
+
const existing = await this.getDocument(id);
|
|
1081
|
+
const existingTags = existing?.metadata.tags || [];
|
|
1082
|
+
const newTags = options.tags || [];
|
|
1083
|
+
const mergedTags = [.../* @__PURE__ */ new Set([...existingTags, ...newTags])];
|
|
1084
|
+
await this.deleteDocument(id);
|
|
1085
|
+
return this.indexDocument(file, {
|
|
1086
|
+
...options,
|
|
1087
|
+
tags: mergedTags
|
|
1088
|
+
});
|
|
1089
|
+
}
|
|
1090
|
+
async reindexAllDocuments(options = {}) {
|
|
1091
|
+
if (!this.isInitialized) {
|
|
1092
|
+
await this.initialize();
|
|
1093
|
+
}
|
|
1094
|
+
this.ensureInitialized();
|
|
1095
|
+
const allDocs = await this.listDocuments();
|
|
1096
|
+
let succeeded = 0;
|
|
1097
|
+
let failed = 0;
|
|
1098
|
+
for (const doc of allDocs) {
|
|
1099
|
+
try {
|
|
1100
|
+
const workspace = await workspaceService.getWorkspace();
|
|
1101
|
+
if (!workspace || workspace.getName() !== doc.workspacePath) {
|
|
1102
|
+
logger$2.warn(`Workspace not found: ${doc.workspacePath}`);
|
|
1103
|
+
failed++;
|
|
1104
|
+
continue;
|
|
1105
|
+
}
|
|
1106
|
+
const resource = await workspace.getResource(doc.filePath);
|
|
1107
|
+
if (!resource || !(resource instanceof File)) {
|
|
1108
|
+
logger$2.warn(`File not found: ${doc.filePath}`);
|
|
1109
|
+
failed++;
|
|
1110
|
+
continue;
|
|
1111
|
+
}
|
|
1112
|
+
await this.reindexDocument(resource, options);
|
|
1113
|
+
succeeded++;
|
|
1114
|
+
} catch (error) {
|
|
1115
|
+
logger$2.error(`Failed to reindex document ${doc.id}: ${error}`);
|
|
1116
|
+
failed++;
|
|
1117
|
+
}
|
|
1118
|
+
}
|
|
1119
|
+
return {
|
|
1120
|
+
total: allDocs.length,
|
|
1121
|
+
succeeded,
|
|
1122
|
+
failed
|
|
1123
|
+
};
|
|
1124
|
+
}
|
|
1125
|
+
async getStats() {
|
|
1126
|
+
if (!this.isInitialized) {
|
|
1127
|
+
await this.initialize();
|
|
1128
|
+
}
|
|
1129
|
+
this.ensureInitialized();
|
|
1130
|
+
const totalDocuments = await this.documentsCollection.count().exec();
|
|
1131
|
+
const allDocs = await this.documentsCollection.find().exec();
|
|
1132
|
+
const byWorkspace = {};
|
|
1133
|
+
for (const doc of allDocs) {
|
|
1134
|
+
const docData = doc.toJSON();
|
|
1135
|
+
const workspacePath = docData.workspacePath;
|
|
1136
|
+
byWorkspace[workspacePath] = (byWorkspace[workspacePath] || 0) + 1;
|
|
1137
|
+
}
|
|
1138
|
+
return {
|
|
1139
|
+
totalDocuments,
|
|
1140
|
+
byWorkspace
|
|
1141
|
+
};
|
|
1142
|
+
}
|
|
1143
|
+
/**
 * Callback for the workspace-connected and workspace-changed topics.
 * Currently it only writes a debug log entry; the `workspace` argument is
 * not used.
 *
 * @param {object} workspace - Workspace passed by the topic subscription.
 * @returns {Promise<void>}
 */
async handleWorkspaceChange(workspace) {
  logger$2.debug("Workspace changed, checking for document updates...");
}
|
|
1146
|
+
async generateAndStoreEmbedding(document) {
|
|
1147
|
+
try {
|
|
1148
|
+
if (!this.vectorsCollection) {
|
|
1149
|
+
logger$2.warn(`Vectors collection not initialized, cannot generate embedding for ${document.id}`);
|
|
1150
|
+
return;
|
|
1151
|
+
}
|
|
1152
|
+
await embeddingService.initialize();
|
|
1153
|
+
if (this.sampleVectors.length === 0) {
|
|
1154
|
+
await this.initializeSampleVectors();
|
|
1155
|
+
}
|
|
1156
|
+
if (this.sampleVectors.length === 0) {
|
|
1157
|
+
logger$2.warn(`Sample vectors not initialized, cannot generate embedding for ${document.id}`);
|
|
1158
|
+
return;
|
|
1159
|
+
}
|
|
1160
|
+
const chunks = await this.chunker.chunkDocument(document.id, document.content, document.fileName);
|
|
1161
|
+
logger$2.debug(`Document ${document.id} split into ${chunks.length} chunks`);
|
|
1162
|
+
for (const chunk of chunks) {
|
|
1163
|
+
const embedding = await embeddingService.generateEmbedding(chunk.text);
|
|
1164
|
+
const indexValues = calculateIndexValues(embedding, this.sampleVectors);
|
|
1165
|
+
const vectorDoc = {
|
|
1166
|
+
id: chunk.id,
|
|
1167
|
+
documentId: document.id,
|
|
1168
|
+
chunkIndex: chunk.chunkIndex,
|
|
1169
|
+
chunkStartOffset: chunk.startOffset,
|
|
1170
|
+
chunkEndOffset: chunk.endOffset,
|
|
1171
|
+
embedding,
|
|
1172
|
+
...indexValues
|
|
1173
|
+
};
|
|
1174
|
+
await this.vectorsCollection.upsert(vectorDoc);
|
|
1175
|
+
}
|
|
1176
|
+
logger$2.debug(`Generated and stored ${chunks.length} embeddings for document: ${document.id}`);
|
|
1177
|
+
} catch (error) {
|
|
1178
|
+
logger$2.warn(`Failed to generate embedding for document ${document.id}: ${error}`);
|
|
1179
|
+
}
|
|
1180
|
+
}
|
|
1181
|
+
async searchSimilar(queryText, options = {}) {
|
|
1182
|
+
if (!this.isInitialized) {
|
|
1183
|
+
await this.initialize();
|
|
1184
|
+
}
|
|
1185
|
+
this.ensureInitialized();
|
|
1186
|
+
const limit = options.limit || 10;
|
|
1187
|
+
const indexDistance = options.indexDistance || 2;
|
|
1188
|
+
const docsPerIndexSide = options.docsPerIndexSide || 100;
|
|
1189
|
+
if (!this.vectorsCollection || this.sampleVectors.length === 0) {
|
|
1190
|
+
logger$2.warn("Vector search not available: vectors collection or sample vectors not initialized");
|
|
1191
|
+
throw new Error("Vector search not available");
|
|
1192
|
+
}
|
|
1193
|
+
const totalVectors = await this.vectorsCollection.find().exec();
|
|
1194
|
+
logger$2.debug(`Starting vector search with indexDistance=${indexDistance}, limit=${limit}, sampleVectors=${this.sampleVectors.length}, totalIndexedVectors=${totalVectors.length}`);
|
|
1195
|
+
try {
|
|
1196
|
+
await embeddingService.initialize();
|
|
1197
|
+
} catch (error) {
|
|
1198
|
+
logger$2.error(`Failed to initialize embedding service for vector search: ${error}`);
|
|
1199
|
+
throw new Error(`Embedding service initialization failed: ${error}`);
|
|
1200
|
+
}
|
|
1201
|
+
let queryEmbedding;
|
|
1202
|
+
try {
|
|
1203
|
+
queryEmbedding = await embeddingService.generateEmbedding(queryText);
|
|
1204
|
+
} catch (error) {
|
|
1205
|
+
logger$2.error(`Failed to generate query embedding: ${error}`);
|
|
1206
|
+
throw new Error(`Query embedding generation failed: ${error}`);
|
|
1207
|
+
}
|
|
1208
|
+
if (this.sampleVectors.length === 0) {
|
|
1209
|
+
logger$2.warn("Sample vectors not initialized, cannot perform vector search");
|
|
1210
|
+
throw new Error("Sample vectors not initialized");
|
|
1211
|
+
}
|
|
1212
|
+
const queryIndexValues = calculateIndexValues(queryEmbedding, this.sampleVectors);
|
|
1213
|
+
logger$2.debug(`Query index values: ${JSON.stringify(queryIndexValues)}`);
|
|
1214
|
+
const candidateIds = /* @__PURE__ */ new Set();
|
|
1215
|
+
try {
|
|
1216
|
+
for (const idxKey of INDEX_FIELD_NAMES) {
|
|
1217
|
+
const queryValue = queryIndexValues[idxKey];
|
|
1218
|
+
const minValue = queryValue - indexDistance;
|
|
1219
|
+
const maxValue = queryValue + indexDistance;
|
|
1220
|
+
logger$2.debug(`Querying index ${idxKey}: range [${minValue}, ${maxValue}]`);
|
|
1221
|
+
const candidates = await this.vectorsCollection.find().where(idxKey).gte(minValue).lte(maxValue).limit(docsPerIndexSide).exec();
|
|
1222
|
+
logger$2.debug(`Found ${candidates.length} candidates in index ${idxKey}`);
|
|
1223
|
+
for (const candidate of candidates) {
|
|
1224
|
+
candidateIds.add(candidate.documentId);
|
|
1225
|
+
}
|
|
1226
|
+
}
|
|
1227
|
+
} catch (error) {
|
|
1228
|
+
logger$2.error(`Failed to query vector index: ${error}`);
|
|
1229
|
+
throw new Error(`Vector index query failed: ${error}`);
|
|
1230
|
+
}
|
|
1231
|
+
logger$2.debug(`Total unique candidate IDs: ${candidateIds.size} (out of ${totalVectors.length} indexed vectors)`);
|
|
1232
|
+
const candidateVectors = [];
|
|
1233
|
+
try {
|
|
1234
|
+
for (const docId of candidateIds) {
|
|
1235
|
+
const vectorDocs = await this.vectorsCollection.find().where("documentId").eq(docId).exec();
|
|
1236
|
+
for (const vectorDoc of vectorDocs) {
|
|
1237
|
+
const vectorData = vectorDoc.toJSON();
|
|
1238
|
+
if (vectorData && vectorData.embedding) {
|
|
1239
|
+
candidateVectors.push(vectorData);
|
|
1240
|
+
} else {
|
|
1241
|
+
logger$2.warn(`Invalid vector data for document ${docId}`);
|
|
1242
|
+
}
|
|
1243
|
+
}
|
|
1244
|
+
}
|
|
1245
|
+
} catch (error) {
|
|
1246
|
+
logger$2.error(`Failed to fetch candidate vectors: ${error}`);
|
|
1247
|
+
throw new Error(`Failed to fetch candidate vectors: ${error}`);
|
|
1248
|
+
}
|
|
1249
|
+
logger$2.debug(`Fetched ${candidateVectors.length} candidate vectors`);
|
|
1250
|
+
const results = [];
|
|
1251
|
+
for (const vectorDoc of candidateVectors) {
|
|
1252
|
+
const cosineSim = cosineSimilarity(queryEmbedding, vectorDoc.embedding);
|
|
1253
|
+
const similarity = (cosineSim + 1) / 2;
|
|
1254
|
+
results.push({
|
|
1255
|
+
documentId: vectorDoc.documentId,
|
|
1256
|
+
similarity,
|
|
1257
|
+
chunkIndex: vectorDoc.chunkIndex,
|
|
1258
|
+
chunkStartOffset: vectorDoc.chunkStartOffset,
|
|
1259
|
+
chunkEndOffset: vectorDoc.chunkEndOffset
|
|
1260
|
+
});
|
|
1261
|
+
}
|
|
1262
|
+
results.sort((a, b) => b.similarity - a.similarity);
|
|
1263
|
+
logger$2.debug(`Computed similarities for ${results.length} candidates, top similarity: ${results[0]?.similarity || "N/A"}`);
|
|
1264
|
+
const topResults = results.slice(0, limit);
|
|
1265
|
+
const documentResults = [];
|
|
1266
|
+
for (const result of topResults) {
|
|
1267
|
+
const doc = await this.documentsCollection.findOne(result.documentId).exec();
|
|
1268
|
+
if (doc) {
|
|
1269
|
+
const document = doc.toJSON();
|
|
1270
|
+
if (options.workspacePath && document.workspacePath !== options.workspacePath) {
|
|
1271
|
+
continue;
|
|
1272
|
+
}
|
|
1273
|
+
if (options.fileType && document.fileType !== options.fileType) {
|
|
1274
|
+
continue;
|
|
1275
|
+
}
|
|
1276
|
+
documentResults.push({
|
|
1277
|
+
document,
|
|
1278
|
+
similarity: result.similarity,
|
|
1279
|
+
chunkIndex: result.chunkIndex,
|
|
1280
|
+
chunkStartOffset: result.chunkStartOffset,
|
|
1281
|
+
chunkEndOffset: result.chunkEndOffset
|
|
1282
|
+
});
|
|
1283
|
+
}
|
|
1284
|
+
}
|
|
1285
|
+
return documentResults;
|
|
1286
|
+
}
|
|
1287
|
+
async indexFileInContext(file, context, options = {}) {
|
|
1288
|
+
if (!this.isInitialized) {
|
|
1289
|
+
await this.initialize();
|
|
1290
|
+
}
|
|
1291
|
+
this.ensureInitialized();
|
|
1292
|
+
const workspace = file.getWorkspace();
|
|
1293
|
+
const workspacePath = workspace.getName();
|
|
1294
|
+
const filePath = file.getWorkspacePath();
|
|
1295
|
+
const id = this.generateDocumentId(workspacePath, filePath);
|
|
1296
|
+
const existing = await this.documentsCollection.findOne(id).exec();
|
|
1297
|
+
const existingDoc = existing ? existing.toJSON() : null;
|
|
1298
|
+
const contextTags = context.tags || [];
|
|
1299
|
+
const newTags = [...options.tags || [], ...contextTags];
|
|
1300
|
+
if (existingDoc) {
|
|
1301
|
+
const existingTags = existingDoc.metadata.tags || [];
|
|
1302
|
+
const mergedTags = [.../* @__PURE__ */ new Set([...existingTags, ...newTags])];
|
|
1303
|
+
const tagsChanged = mergedTags.length !== existingTags.length || newTags.some((tag) => !existingTags.includes(tag));
|
|
1304
|
+
if (tagsChanged) {
|
|
1305
|
+
await this.updateDocumentMetadata(existingDoc.id, {
|
|
1306
|
+
metadata: {
|
|
1307
|
+
...existingDoc.metadata,
|
|
1308
|
+
tags: mergedTags
|
|
1309
|
+
}
|
|
1310
|
+
});
|
|
1311
|
+
logger$2.debug(`Added tags to existing document: ${id}`);
|
|
1312
|
+
return {
|
|
1313
|
+
...existingDoc,
|
|
1314
|
+
metadata: {
|
|
1315
|
+
...existingDoc.metadata,
|
|
1316
|
+
tags: mergedTags
|
|
1317
|
+
}
|
|
1318
|
+
};
|
|
1319
|
+
} else {
|
|
1320
|
+
logger$2.debug(`Document already has all tags: ${id}`);
|
|
1321
|
+
return existingDoc;
|
|
1322
|
+
}
|
|
1323
|
+
}
|
|
1324
|
+
return this.indexDocument(file, {
|
|
1325
|
+
...options,
|
|
1326
|
+
tags: newTags
|
|
1327
|
+
});
|
|
1328
|
+
}
|
|
1329
|
+
async indexFilesInContext(files, context, options = {}) {
|
|
1330
|
+
let succeeded = 0;
|
|
1331
|
+
let failed = 0;
|
|
1332
|
+
const contextTags = context.tags || [];
|
|
1333
|
+
for (const file of files) {
|
|
1334
|
+
try {
|
|
1335
|
+
await this.indexDocument(file, {
|
|
1336
|
+
...options,
|
|
1337
|
+
tags: [...options.tags || [], ...contextTags]
|
|
1338
|
+
});
|
|
1339
|
+
succeeded++;
|
|
1340
|
+
logger$2.debug(`Indexed file with context tags: ${file.getWorkspacePath()}`);
|
|
1341
|
+
} catch (error) {
|
|
1342
|
+
logger$2.error(`Failed to index file ${file.getWorkspacePath()}: ${error}`);
|
|
1343
|
+
failed++;
|
|
1344
|
+
}
|
|
1345
|
+
}
|
|
1346
|
+
return { succeeded, failed };
|
|
1347
|
+
}
|
|
1348
|
+
async reindexFileInContext(file, context, options = {}) {
|
|
1349
|
+
const contextTags = context.tags || [];
|
|
1350
|
+
return this.reindexDocument(file, {
|
|
1351
|
+
...options,
|
|
1352
|
+
tags: [...options.tags || [], ...contextTags]
|
|
1353
|
+
});
|
|
1354
|
+
}
|
|
1355
|
+
async removeFileFromContext(file, context) {
|
|
1356
|
+
if (!this.isInitialized) {
|
|
1357
|
+
await this.initialize();
|
|
1358
|
+
}
|
|
1359
|
+
this.ensureInitialized();
|
|
1360
|
+
const document = await this.getDocumentByPath(
|
|
1361
|
+
file.getWorkspace().getName(),
|
|
1362
|
+
file.getWorkspacePath()
|
|
1363
|
+
);
|
|
1364
|
+
if (document && context.tags && context.tags.length > 0) {
|
|
1365
|
+
const contextTags = new Set(context.tags);
|
|
1366
|
+
const updatedTags = (document.metadata.tags || []).filter((tag) => !contextTags.has(tag));
|
|
1367
|
+
if (updatedTags.length !== document.metadata.tags?.length) {
|
|
1368
|
+
await this.updateDocumentMetadata(document.id, {
|
|
1369
|
+
metadata: {
|
|
1370
|
+
...document.metadata,
|
|
1371
|
+
tags: updatedTags
|
|
1372
|
+
}
|
|
1373
|
+
});
|
|
1374
|
+
}
|
|
1375
|
+
}
|
|
1376
|
+
}
|
|
1377
|
+
async clearContext(context) {
|
|
1378
|
+
if (!context.tags || context.tags.length === 0) {
|
|
1379
|
+
return;
|
|
1380
|
+
}
|
|
1381
|
+
const contextTags = new Set(context.tags);
|
|
1382
|
+
const allDocs = await this.listDocuments();
|
|
1383
|
+
for (const doc of allDocs) {
|
|
1384
|
+
const hasContextTag = doc.metadata.tags?.some((tag) => contextTags.has(tag));
|
|
1385
|
+
if (hasContextTag) {
|
|
1386
|
+
const updatedTags = doc.metadata.tags.filter((tag) => !contextTags.has(tag));
|
|
1387
|
+
try {
|
|
1388
|
+
const workspace = await workspaceService.getWorkspace();
|
|
1389
|
+
if (workspace && workspace.getName() === doc.workspacePath) {
|
|
1390
|
+
const resource = await workspace.getResource(doc.filePath);
|
|
1391
|
+
if (resource instanceof File) {
|
|
1392
|
+
await this.indexDocument(resource, {
|
|
1393
|
+
tags: updatedTags
|
|
1394
|
+
});
|
|
1395
|
+
}
|
|
1396
|
+
}
|
|
1397
|
+
} catch (error) {
|
|
1398
|
+
logger$2.warn(`Failed to clear context tags from ${doc.filePath}: ${error}`);
|
|
1399
|
+
}
|
|
1400
|
+
}
|
|
1401
|
+
}
|
|
1402
|
+
}
|
|
1403
|
+
async getFilePathsInContext(context) {
|
|
1404
|
+
if (!context.tags || context.tags.length === 0) {
|
|
1405
|
+
return [];
|
|
1406
|
+
}
|
|
1407
|
+
const contextTags = new Set(context.tags);
|
|
1408
|
+
const allDocs = await this.listDocuments();
|
|
1409
|
+
return allDocs.filter((doc) => doc.metadata.tags?.some((tag) => contextTags.has(tag))).map((doc) => doc.filePath);
|
|
1410
|
+
}
|
|
1411
|
+
}
|
|
1412
|
+
// Module-level instance of DocumentIndexService, created when the module loads.
const documentIndexService = new DocumentIndexService();
|
|
1413
|
+
// Stores the instance on rootContext under the key "documentIndexService".
rootContext.put("documentIndexService", documentIndexService);
|
|
1414
|
+
// Logger named "WorkspaceUtils"; used by getWorkspacePath.
const logger$1 = createLogger("WorkspaceUtils");
|
|
1415
|
+
/**
 * Resolves the connected workspace together with the path to use for it.
 *
 * @param {string} [providedPath] - Preferred path; when falsy the workspace
 *   name is used instead.
 * @returns {Promise<{workspace: object, workspacePath: string}|null>} `null`
 *   (after a warning) when no workspace is connected or no path is available.
 */
async function getWorkspacePath(providedPath) {
  const workspace = await workspaceService.getWorkspace();
  if (workspace) {
    const workspacePath = providedPath || workspace.getName();
    if (workspacePath) {
      return { workspace, workspacePath };
    }
    logger$1.warn("No workspace path available");
    return null;
  }
  logger$1.warn("No workspace connected");
  return null;
}
|
|
1428
|
+
// Terms shorter than this are dropped unless `options.minTermLength` overrides it.
const DEFAULT_MIN_TERM_LENGTH = 2;

/**
 * Splits a query into whitespace-separated search terms.
 *
 * The query is trimmed before splitting so that leading/trailing whitespace
 * can never produce empty-string terms. An empty term is dangerous for
 * callers that scan with `indexOf(term, from)`: `indexOf("")` always
 * succeeds, so such a scan never terminates.
 *
 * @param {string} query - Raw query text; null/undefined/blank yields `[]`.
 * @param {{ minTermLength?: number, caseSensitive?: boolean }} [options]
 *   `minTermLength` defaults to DEFAULT_MIN_TERM_LENGTH; terms are
 *   lower-cased unless `caseSensitive` is truthy.
 * @returns {string[]} Non-empty terms of at least `minTermLength` characters,
 *   in query order (duplicates are kept).
 */
function extractQueryTerms(query, options = {}) {
  if (!query || !query.trim()) {
    return [];
  }
  const minLength = options.minTermLength ?? DEFAULT_MIN_TERM_LENGTH;
  const text = options.caseSensitive ? query : query.toLowerCase();
  return text
    .trim()
    .split(/\s+/)
    .filter((term) => term.length > 0 && term.length >= minLength);
}
|
|
1437
|
+
/**
 * Canonical form of a query for phrase comparisons: surrounding whitespace
 * removed and all characters lower-cased.
 *
 * @param {string} query - Raw query text.
 * @returns {string} The trimmed, lower-cased query.
 */
function normalizeQuery(query) {
  const trimmed = query.trim();
  return trimmed.toLowerCase();
}
|
|
1440
|
+
// Defaults used when the SnippetExtractor constructor options omit a value.
const DEFAULT_MAX_SNIPPETS = 10;
const DEFAULT_SNIPPET_LENGTH = 400;
const DEFAULT_MIN_GAP = 400;

/**
 * Cuts short excerpts ("snippets") out of document text around occurrences
 * of query terms, scores them, and returns the best non-overlapping ones.
 */
class SnippetExtractor {
  /**
   * @param {{ maxSnippets?: number, snippetLength?: number, minGap?: number }} [options]
   *   `snippetLength` is the total amount of context (split evenly before
   *   and after a match); `minGap` is the minimum distance kept between two
   *   selected snippets.
   */
  constructor(options = {}) {
    this.maxSnippets = options.maxSnippets ?? DEFAULT_MAX_SNIPPETS;
    this.snippetLength = options.snippetLength ?? DEFAULT_SNIPPET_LENGTH;
    this.minGap = options.minGap ?? DEFAULT_MIN_GAP;
  }

  /**
   * Finds every occurrence of every term (case-insensitively), builds a
   * snippet around it, and returns the highest-scoring snippets that keep
   * at least `minGap` characters between each other.
   *
   * @param {string} content - Text to search.
   * @param {string[]} queryTerms - Terms to look for. They are lower-cased
   *   here, and empty strings are ignored (an empty term would make the
   *   `indexOf` scan below loop forever).
   * @param {number} [maxSnippets] - Overrides the instance's `maxSnippets`.
   * @returns {string[]} Selected snippet texts, best first.
   */
  extractSnippets(content, queryTerms, maxSnippets) {
    const effectiveMaxSnippets = maxSnippets ?? this.maxSnippets;
    // Content is matched in lower case, so terms must be lower case too.
    const terms = queryTerms
      .map((term) => term.toLowerCase())
      .filter((term) => term.length > 0);
    if (terms.length === 0) {
      return [];
    }
    const contentLower = content.toLowerCase();
    const halfWindow = this.snippetLength / 2;
    const snippetCandidates = [];
    const seenSnippets = /* @__PURE__ */ new Set();
    for (const term of terms) {
      let index = contentLower.indexOf(term);
      while (index !== -1) {
        // Floor to whole offsets: `substring` truncates fractions anyway, and
        // integer starts keep the range bookkeeping below exact.
        const start = Math.max(0, Math.floor(index - halfWindow));
        const end = Math.min(content.length, Math.floor(index + term.length + halfWindow));
        const snippet = content.substring(start, end).trim();
        const snippetKey = `${start}-${end}`;
        if (snippet && !seenSnippets.has(snippetKey)) {
          seenSnippets.add(snippetKey);
          const score = this.calculateSnippetScore(snippet, terms);
          snippetCandidates.push({ snippet, score, start });
        }
        index = contentLower.indexOf(term, index + 1);
      }
    }
    if (snippetCandidates.length === 0) {
      // Defensive retry with a wider window around the first term, for the
      // case where every narrow window trimmed down to nothing.
      const firstTerm = terms[0];
      const index = contentLower.indexOf(firstTerm);
      if (index !== -1) {
        const start = Math.max(0, index - this.snippetLength);
        const end = Math.min(content.length, index + firstTerm.length + this.snippetLength);
        const snippet = content.substring(start, end).trim();
        if (snippet) {
          snippetCandidates.push({ snippet, score: 10, start });
        }
      }
    }
    // Best score first; ties resolved by position in the document.
    snippetCandidates.sort((a, b) => {
      if (b.score !== a.score) {
        return b.score - a.score;
      }
      return a.start - b.start;
    });
    return this.selectNonOverlappingSnippets(snippetCandidates, effectiveMaxSnippets);
  }

  /**
   * Scores a snippet: 10 per term occurrence, 50 if the terms joined by a
   * single space appear verbatim, 20 per distinct term present, plus up to
   * 30 scaled by the fraction of terms present.
   *
   * @param {string} snippet - Snippet text.
   * @param {string[]} queryTerms - Terms to score against (compared in lower case).
   * @returns {number} The score; 0 when there are no non-empty terms.
   */
  calculateSnippetScore(snippet, queryTerms) {
    const terms = queryTerms
      .map((term) => term.toLowerCase())
      .filter((term) => term.length > 0);
    if (terms.length === 0) {
      // Avoids the 0/0 term-density division, which would yield NaN.
      return 0;
    }
    const snippetLower = snippet.toLowerCase();
    let score = 0;
    for (const qTerm of terms) {
      const termMatches = (snippetLower.match(
        new RegExp(qTerm.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"), "gi")
      ) || []).length;
      score += termMatches * 10;
    }
    const fullQuery = terms.join(" ");
    if (snippetLower.includes(fullQuery)) {
      score += 50;
    }
    const uniqueTermsFound = terms.filter((t) => snippetLower.includes(t)).length;
    score += uniqueTermsFound * 20;
    const termDensity = uniqueTermsFound / terms.length;
    score += termDensity * 30;
    return score;
  }

  /**
   * Walks candidates in the given order and keeps those whose range lies
   * more than `minGap` characters away from every already-kept range.
   *
   * @param {{ snippet: string, score: number, start: number }[]} candidates
   * @param {number} maxSnippets - Maximum number of snippets to keep.
   * @returns {string[]} Texts of the kept snippets.
   */
  selectNonOverlappingSnippets(candidates, maxSnippets) {
    const selectedSnippets = [];
    const usedRanges = [];
    for (const candidate of candidates) {
      if (selectedSnippets.length >= maxSnippets) break;
      const candidateStart = candidate.start;
      const candidateEnd = candidate.start + candidate.snippet.length;
      const overlaps = usedRanges.some((range) => {
        return !(candidateEnd < range.start - this.minGap || candidateStart > range.end + this.minGap);
      });
      if (!overlaps) {
        selectedSnippets.push(candidate.snippet);
        usedRanges.push({ start: candidateStart, end: candidateEnd });
      }
    }
    return selectedSnippets;
  }

  /**
   * Returns the first `maxLength` characters of `content`, followed by
   * "..." when the content was longer than that.
   *
   * @param {string} content
   * @param {number} [maxLength=500]
   * @returns {string}
   */
  extractSimpleSnippet(content, maxLength = 500) {
    const preview = content.substring(0, maxLength);
    return preview + (content.length > maxLength ? "..." : "");
  }

  /**
   * Locates every case-insensitive occurrence of the whole query and
   * returns the character range of its surrounding context.
   *
   * @param {string} content - Text to search.
   * @param {string} query - Phrase to find; an empty query yields `[]`
   *   (scanning for "" with `indexOf` would never terminate).
   * @param {number} [contextLength=150] - Characters of context on each side.
   * @returns {{ start: number, end: number, matchIndex: number }[]}
   */
  extractContextSnippets(content, query, contextLength = 150) {
    if (!query) {
      return [];
    }
    const queryLower = query.toLowerCase();
    const contentLower = content.toLowerCase();
    const snippets = [];
    let matchIndex = contentLower.indexOf(queryLower);
    while (matchIndex !== -1) {
      const start = Math.max(0, matchIndex - contextLength);
      const end = Math.min(content.length, matchIndex + query.length + contextLength);
      snippets.push({ start, end, matchIndex });
      matchIndex = contentLower.indexOf(queryLower, matchIndex + 1);
    }
    return snippets;
  }
}

// Instance is constructed and immediately discarded (no binding is kept).
new SnippetExtractor();
|
|
1554
|
+
/**
 * Text-based relevance scoring used when ranking documents without
 * vector similarity.
 */
class RelevanceCalculator {
  /**
   * Scores a document against a free-text query by combining per-term hits
   * in file name, file path and content with whole-phrase hits and the
   * fraction of terms found in the content. Weights come from
   * RELEVANCE_WEIGHTS.
   *
   * @param {{ fileName: string, filePath: string, content: string }} doc
   * @param {string} query - Raw query; a missing or blank query scores 0
   *   (an empty phrase would otherwise "match" at every position).
   * @returns {number} Relevance score (higher is better).
   */
  calculateRelevance(doc, query) {
    if (!query || !query.trim()) {
      return 0;
    }
    const queryLower = normalizeQuery(query);
    const queryTerms = extractQueryTerms(query);
    const fileNameLower = doc.fileName.toLowerCase();
    const filePathLower = doc.filePath.toLowerCase();
    const contentLower = doc.content.toLowerCase();
    // Number of query terms that occur at least once in the given text.
    const countTermsIn = (text) => queryTerms.filter((term) => text.includes(term)).length;
    const fileNameMatches = countTermsIn(fileNameLower);
    const filePathMatches = countTermsIn(filePathLower);
    const contentMatches = countTermsIn(contentLower);
    let score = 0;
    score += fileNameMatches * RELEVANCE_WEIGHTS.FILE_NAME_MATCH;
    score += filePathMatches * RELEVANCE_WEIGHTS.FILE_PATH_MATCH;
    score += contentMatches * RELEVANCE_WEIGHTS.CONTENT_MATCH;
    if (fileNameLower.includes(queryLower)) {
      score += RELEVANCE_WEIGHTS.FILE_NAME_EXACT;
    }
    if (filePathLower.includes(queryLower)) {
      score += RELEVANCE_WEIGHTS.FILE_PATH_EXACT;
    }
    // Non-overlapping occurrences of the whole phrase in the content.
    const exactPhraseMatches = contentLower.split(queryLower).length - 1;
    score += exactPhraseMatches * RELEVANCE_WEIGHTS.EXACT_PHRASE;
    const totalTerms = queryTerms.length;
    const termCoverage = totalTerms > 0 ? contentMatches / totalTerms : 0;
    score += termCoverage * RELEVANCE_WEIGHTS.TERM_COVERAGE;
    return score;
  }

  /**
   * Scores how well a document's file name matches a requested name,
   * ignoring case.
   *
   * @param {{ fileName: string }} doc
   * @param {string} fileName - Requested file name (or part of one).
   * @returns {number} 100 for an exact match, 80 when the document name
   *   contains the requested name, otherwise 0. An empty requested name
   *   always scores 0 instead of matching every document.
   */
  calculateFileNameRelevance(doc, fileName) {
    const fileNameLower = fileName.toLowerCase();
    if (fileNameLower.length === 0) {
      return 0;
    }
    const docNameLower = doc.fileName.toLowerCase();
    if (docNameLower === fileNameLower) {
      return 100;
    }
    if (docNameLower.includes(fileNameLower)) {
      return 80;
    }
    return 0;
  }
}

// Shared calculator instance used by the search code in this module.
const relevanceCalculator = new RelevanceCalculator();
|
|
1597
|
+
/**
 * Turns search results (`{ document, relevance, matchedSnippets }`) into
 * prompt text or plain result records.
 */
class RAGResultFormatter {
  /**
   * @param {{ extractSimpleSnippet: (content: string, maxLength?: number) => string }} snippetExtractor2
   *   Extractor used to build content previews.
   */
  constructor(snippetExtractor2) {
    this.snippetExtractor = snippetExtractor2;
  }

  /**
   * Renders results as a text block: an introductory sentence, one section
   * per document (name, path, relevance, then either its numbered snippets
   * or a content preview), and a closing instruction paragraph.
   *
   * @param {{ document: object, relevance: number, matchedSnippets: string[] }[]} results
   * @returns {string} The rendered text, or "" when there are no results.
   */
  formatRAGContext(results) {
    if (results.length === 0) {
      return "";
    }
    const sections = results.map((result, idx) => {
      const doc = result.document;
      const snippets = result.matchedSnippets
        .map((s, i) => ` [Snippet ${i + 1}]\n${s}`)
        .join("\n\n");
      // Fall back to a plain preview when no snippet text is available.
      const body = snippets.length > 0
        ? `Relevant snippets:\n${snippets}`
        : `Content preview: ${this.snippetExtractor.extractSimpleSnippet(doc.content, SNIPPET_LENGTHS.PREVIEW)}`;
      return `[Document ${idx + 1}: ${doc.fileName} (${doc.filePath})]\nRelevance: ${result.relevance.toFixed(2)}\n${body}\n---`;
    }).join("\n\n");
    return `Here are relevant documents from the workspace that might help answer the question:\n\n${sections}\n\nUse the information from these documents to provide a helpful answer. Pay special attention to numbers, percentages, dates, and specific values mentioned in the snippets. If the documents don't contain relevant information, you can still answer based on your general knowledge.`;
  }

  /**
   * Maps results to detailed records including a 200-character preview.
   *
   * @param {{ document: object, relevance: number, matchedSnippets: string[] }[]} results
   * @returns {{ file: string, path: string, relevance: string, language: *, size: *, snippets: string[], preview: string }[]}
   *   `relevance` is formatted with two decimals.
   */
  formatSearchResults(results) {
    return results.map((r) => ({
      file: r.document.fileName,
      path: r.document.filePath,
      relevance: r.relevance.toFixed(2),
      language: r.document.metadata.language,
      size: r.document.metadata.size,
      snippets: r.matchedSnippets,
      // extractSimpleSnippet already appends "..." when it truncates, so no
      // second ellipsis is added here.
      preview: this.snippetExtractor.extractSimpleSnippet(r.document.content, 200)
    }));
  }

  /**
   * Maps results to compact records with the numeric relevance untouched.
   *
   * @param {{ document: object, relevance: number, matchedSnippets: string[] }[]} results
   * @returns {{ file: string, path: string, relevance: number, snippets: string[] }[]}
   */
  formatCommandResults(results) {
    return results.map((r) => ({
      file: r.document.fileName,
      path: r.document.filePath,
      relevance: r.relevance,
      snippets: r.matchedSnippets
    }));
  }
}
|
|
1641
|
+
// Logger named "RAGService"; used by RAGService and searchWorkspaceDocuments.
const logger = createLogger("RAGService");
|
|
1642
|
+
// Module-level SnippetExtractor built with its default options.
const snippetExtractor = new SnippetExtractor();
|
|
1643
|
+
// Formatter that uses the snippetExtractor above for content previews.
const resultFormatter = new RAGResultFormatter(snippetExtractor);
|
|
1644
|
+
/**
 * Document search over the workspace index: direct lookup by path or file
 * name, vector similarity search, and a plain text-scoring fallback.
 */
class RAGService {
  /**
   * Searches indexed documents.
   *
   * Resolution order: `options.filePath` (single document lookup), then
   * `options.fileName` (name match), then a vector similarity search on
   * `query`. If the vector search throws or yields no usable result, the
   * text search in `fallbackTextSearch` is used instead.
   *
   * @param {string} query - Free-text query.
   * @param {object} [options] - Reads `limit`, `workspacePath`, `filePath`,
   *   `fileName`, `fileType`, `minRelevance`, `documentSearchScope`.
   * @returns {Promise<{ document: object, relevance: number, matchedSnippets: string[] }[]>}
   *   Empty when no workspace is available, nothing matched, or neither a
   *   query nor a filePath/fileName was given.
   */
  async searchDocuments(query, options = {}) {
    const limit = Math.min(options.limit || SEARCH_CONFIG.DEFAULT_LIMIT, SEARCH_CONFIG.MAX_LIMIT);
    const workspaceResult = await getWorkspacePath(options.workspacePath);
    if (!workspaceResult) {
      logger.warn("No workspace connected, cannot search documents");
      return [];
    }
    const { workspacePath } = workspaceResult;
    if (options.filePath) {
      return this.searchByFilePath(query, workspacePath, options.filePath);
    }
    if (options.fileName) {
      return this.searchByFileName(query, workspacePath, options.fileName, limit);
    }
    if (!query || !query.trim()) {
      logger.warn("No query provided and no filePath/fileName specified");
      return [];
    }
    try {
      // Over-fetch so that scope/relevance filtering can still fill `limit`.
      const vectorResults = await documentIndexService.searchSimilar(query, {
        limit: limit * 2,
        workspacePath,
        fileType: options.fileType
      });
      // Query-derived values do not change per result; compute them once.
      const queryLower = normalizeQuery(query);
      const queryTerms = extractQueryTerms(query);
      const results = [];
      for (const { document, similarity, chunkStartOffset, chunkEndOffset } of vectorResults) {
        if (!this.matchesContextScope(document, options.documentSearchScope)) {
          continue;
        }
        const relevance = similarity * 100;
        if (options.minRelevance && relevance < options.minRelevance) {
          continue;
        }
        // Prefer the matched chunk's own text; otherwise derive snippets from
        // the query terms, and as a last resort from the whole phrase.
        const chunkText = this.extractChunkText(document.content, chunkStartOffset, chunkEndOffset);
        let matchedSnippets = chunkText.length > 0
          ? [chunkText]
          : snippetExtractor.extractSnippets(document.content, queryTerms, 15);
        if (matchedSnippets.length === 0 && queryLower.length > 0) {
          matchedSnippets = snippetExtractor.extractSnippets(document.content, [queryLower], 10);
        }
        results.push({
          document,
          relevance,
          matchedSnippets
        });
        if (results.length >= limit) {
          break;
        }
      }
      if (results.length === 0) {
        logger.debug("Vector search returned no results, falling back to text search");
        return this.fallbackTextSearch(query, options);
      }
      return results;
    } catch (error) {
      logger.warn(`Vector search failed, falling back to text search: ${error}`);
      logger.debug(`Vector search error details: ${error}`);
      return this.fallbackTextSearch(query, options);
    }
  }

  /**
   * Looks up one document by path and returns it as a single result with
   * relevance 100.
   *
   * @param {string} query
   * @param {string} workspacePath
   * @param {string} filePath
   * @returns {Promise<object[]>} One result, or `[]` when the path is unknown.
   */
  async searchByFilePath(query, workspacePath, filePath) {
    const specificDoc = await documentIndexService.getDocumentByPath(workspacePath, filePath);
    if (!specificDoc) {
      logger.warn(`Document not found: ${filePath}`);
      return [];
    }
    return [{
      document: specificDoc,
      relevance: 100,
      matchedSnippets: this.snippetsForDocument(specificDoc.content, extractQueryTerms(query))
    }];
  }

  /**
   * Finds documents whose file name contains `fileName` (case-insensitive).
   * Matches are ranked by name relevance BEFORE the limit is applied, so an
   * exact name match is never cut off by earlier partial matches.
   *
   * @param {string} query
   * @param {string} workspacePath
   * @param {string} fileName
   * @param {number} limit - Maximum number of results.
   * @returns {Promise<object[]>}
   */
  async searchByFileName(query, workspacePath, fileName, limit) {
    const allDocs = await documentIndexService.listDocuments(workspacePath);
    const fileNameLower = fileName.toLowerCase();
    const ranked = allDocs
      .filter((doc) => doc.fileName.toLowerCase().includes(fileNameLower))
      .map((doc) => ({
        document: doc,
        relevance: relevanceCalculator.calculateFileNameRelevance(doc, fileName)
      }))
      .sort((a, b) => b.relevance - a.relevance);
    if (ranked.length === 0) {
      logger.warn(`No documents found with name: ${fileName}`);
      return [];
    }
    const queryTerms = extractQueryTerms(query);
    return ranked.slice(0, limit).map(({ document, relevance }) => ({
      document,
      relevance,
      matchedSnippets: this.snippetsForDocument(document.content, queryTerms)
    }));
  }

  /**
   * Up to three term-based snippets, or a single content preview when the
   * query produced no terms.
   *
   * @param {string} content
   * @param {string[]} queryTerms
   * @returns {string[]}
   */
  snippetsForDocument(content, queryTerms) {
    if (queryTerms.length > 0) {
      return snippetExtractor.extractSnippets(content, queryTerms, 3);
    }
    return [snippetExtractor.extractSimpleSnippet(content, SNIPPET_LENGTHS.PREVIEW)];
  }

  /**
   * Returns the trimmed text between two chunk offsets (clamped to the
   * content bounds), or "" when either offset is undefined.
   *
   * @param {string} content
   * @param {number} [chunkStartOffset]
   * @param {number} [chunkEndOffset]
   * @returns {string}
   */
  extractChunkText(content, chunkStartOffset, chunkEndOffset) {
    if (chunkStartOffset === void 0 || chunkEndOffset === void 0) {
      return "";
    }
    return content.substring(
      Math.max(0, chunkStartOffset),
      Math.min(content.length, chunkEndOffset)
    ).trim();
  }

  /**
   * Scores every listed document with the text relevance calculator and
   * returns the best `limit` of them, honouring `fileType`, the search
   * scope and `minRelevance`.
   *
   * @param {string} query
   * @param {object} [options] - Same options as `searchDocuments`.
   * @returns {Promise<{ document: object, relevance: number, matchedSnippets: string[] }[]>}
   */
  async fallbackTextSearch(query, options = {}) {
    const limit = Math.min(options.limit || SEARCH_CONFIG.DEFAULT_LIMIT, SEARCH_CONFIG.MAX_LIMIT);
    const workspaceResult = await getWorkspacePath(options.workspacePath);
    if (!workspaceResult) {
      logger.warn("No workspace connected, cannot perform text search");
      return [];
    }
    const { workspacePath } = workspaceResult;
    const allDocuments = await documentIndexService.listDocuments(workspacePath);
    const queryTerms = extractQueryTerms(query);
    const results = [];
    for (const doc of allDocuments) {
      if (options.fileType && doc.fileType !== options.fileType) {
        continue;
      }
      if (!this.matchesContextScope(doc, options.documentSearchScope)) {
        continue;
      }
      const relevance = relevanceCalculator.calculateRelevance(doc, query);
      if (options.minRelevance && relevance < options.minRelevance) {
        continue;
      }
      const matchedSnippets = snippetExtractor.extractSnippets(doc.content, queryTerms, 3);
      results.push({
        document: doc,
        relevance,
        matchedSnippets
      });
    }
    results.sort((a, b) => b.relevance - a.relevance);
    return results.slice(0, limit);
  }

  /**
   * Delegates to the module's result formatter.
   *
   * @param {object[]} results
   * @returns {string}
   */
  formatRAGContext(results) {
    return resultFormatter.formatRAGContext(results);
  }

  /**
   * Tests one path against one include/exclude pattern. A pattern with `*`
   * or `?` is a glob matched against the whole path (`*` = any run of
   * characters, `?` = exactly one character); every other character is
   * matched literally. A pattern without wildcards matches as a prefix.
   *
   * @param {string} filePath
   * @param {string} pattern
   * @returns {boolean}
   */
  pathMatchesPattern(filePath, pattern) {
    if (pattern.includes("*") || pattern.includes("?")) {
      // Escape regex metacharacters first so that e.g. "." or "(" in a
      // path pattern are literal; only then expand the two wildcards.
      const source = pattern
        .replace(/[.+^${}()|[\]\\]/g, "\\$&")
        .replace(/\*/g, ".*")
        .replace(/\?/g, ".");
      return new RegExp("^" + source + "$").test(filePath);
    }
    return filePath.startsWith(pattern);
  }

  /**
   * Decides whether a document falls inside a search scope. All provided
   * criteria must hold: at least one `includePaths` pattern matches, no
   * `excludePaths` pattern matches, `pathPattern` matches, the document has
   * every tag in `tags`, and `metadataFilter(doc)` is truthy.
   *
   * @param {{ filePath: string, metadata: { tags?: string[] } }} doc
   * @param {object} [scope] - No scope means everything matches.
   * @returns {boolean}
   */
  matchesContextScope(doc, scope) {
    if (!scope) {
      return true;
    }
    if (scope.includePaths && scope.includePaths.length > 0) {
      const included = scope.includePaths.some((pattern) => this.pathMatchesPattern(doc.filePath, pattern));
      if (!included) {
        return false;
      }
    }
    if (scope.excludePaths && scope.excludePaths.length > 0) {
      const excluded = scope.excludePaths.some((pattern) => this.pathMatchesPattern(doc.filePath, pattern));
      if (excluded) {
        return false;
      }
    }
    if (scope.pathPattern) {
      const regex = scope.pathPattern instanceof RegExp ? scope.pathPattern : new RegExp(scope.pathPattern);
      // A caller-supplied /g or /y regex remembers `lastIndex` between
      // `test` calls; reset it so each document is tested from the start.
      regex.lastIndex = 0;
      if (!regex.test(doc.filePath)) {
        return false;
      }
    }
    if (scope.tags && scope.tags.length > 0) {
      const docTags = doc.metadata.tags || [];
      const hasAllTags = scope.tags.every((tag) => docTags.includes(tag));
      if (!hasAllTags) {
        return false;
      }
    }
    if (scope.metadataFilter && !scope.metadataFilter(doc)) {
      return false;
    }
    return true;
  }
}

// Shared service instance; exported from this module as `r`.
const ragService = new RAGService();
|
|
1845
|
+
/**
 * Convenience wrapper: resolves the workspace path first, then runs
 * `ragService.searchDocuments` with that path filled into the options.
 *
 * @param {string} query - Free-text query.
 * @param {object} [options] - Search options; `workspacePath` is replaced
 *   by the resolved path.
 * @returns {Promise<object[]>} Search results, or `[]` (after a warning)
 *   when no workspace path could be resolved.
 */
async function searchWorkspaceDocuments(query, options = {}) {
  const resolved = await getWorkspacePath(options.workspacePath);
  if (resolved) {
    const searchOptions = { ...options, workspacePath: resolved.workspacePath };
    return ragService.searchDocuments(query, searchOptions);
  }
  logger.warn("No workspace connected, cannot search documents");
  return [];
}
|
|
1856
|
+
export {
|
|
1857
|
+
CONTENT_PREVIEW_LENGTHS as C,
|
|
1858
|
+
RAGResultFormatter as R,
|
|
1859
|
+
SnippetExtractor as S,
|
|
1860
|
+
createIndexValuesFromArray as a,
|
|
1861
|
+
euclideanDistance as b,
|
|
1862
|
+
cosineSimilarity as c,
|
|
1863
|
+
documentIndexService as d,
|
|
1864
|
+
embeddingService as e,
|
|
1865
|
+
getWorkspacePath as f,
|
|
1866
|
+
getIndexValueArray as g,
|
|
1867
|
+
SNIPPET_LENGTHS as h,
|
|
1868
|
+
SEARCH_CONFIG as i,
|
|
1869
|
+
ragService as r,
|
|
1870
|
+
searchWorkspaceDocuments as s
|
|
1871
|
+
};
|
|
1872
|
+
//# sourceMappingURL=rag-service-BKBGCuO-.js.map
|