@eclipse-lyra/extension-rag-system 0.7.57 → 0.7.59

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1653 @@
1
+ import { Directory, File, FileContentType, TOPIC_WORKSPACE_CHANGED, TOPIC_WORKSPACE_CONNECTED, createLogger, rootContext, subscribe, toastWarning, workspaceService } from "@eclipse-lyra/core";
2
+ import { MLModel, MLTask, inBrowserMLService } from "@eclipse-lyra/extension-in-browser-ml/api";
3
+ import { aiService } from "@eclipse-lyra/extension-ai-system/api";
4
//#region src/embedding-service.ts
var logger$8 = createLogger("EmbeddingService");

/**
 * Lazily-initialized wrapper around the in-browser feature-extraction
 * pipeline that produces text embeddings.
 *
 * Initialization is memoized via `pipePromise` so concurrent callers share a
 * single pipeline load; a failed load clears the memo so it can be retried.
 */
var EmbeddingService = class {
	constructor() {
		this.modelName = MLModel.FEATURE_EXTRACTION;
		// Expected output dimension of the model; mismatches only warn (see generateEmbedding).
		this.EMBEDDING_DIMENSION = 384;
		this.DEFAULT_OPTIONS = {
			pooling: "mean",
			normalize: true
		};
	}
	/**
	 * Loads the feature-extraction pipeline if it is not already loading.
	 * Safe to call repeatedly; a previously failed load is retried.
	 * @throws {Error} when the pipeline cannot be created.
	 */
	async initialize() {
		if (this.pipePromise) try {
			await this.pipePromise;
			return;
		} catch (error) {
			// Previous attempt rejected: drop the memo and fall through to retry.
			logger$8.warn("Previous initialization failed, retrying...");
			this.pipePromise = void 0;
		}
		logger$8.info(`Initializing embedding service with model: ${this.modelName}`);
		try {
			this.pipePromise = inBrowserMLService.getPipeline(MLTask.FEATURE_EXTRACTION, this.modelName, { quantized: false });
			await this.pipePromise;
			logger$8.info("Embedding service initialized successfully");
		} catch (error) {
			const errorMessage = error?.message || String(error);
			const errorDetails = error ? JSON.stringify(error) : "";
			logger$8.error(`Failed to initialize embedding service: ${errorMessage}${errorDetails ? ` - ${errorDetails}` : ""}`);
			this.pipePromise = void 0;
			throw new Error(`Embedding service initialization failed: ${errorMessage}`);
		}
	}
	/**
	 * Embeds a single string.
	 * @param {string} text - input text.
	 * @param {{pooling?: string, normalize?: boolean}} [options] - overrides merged over DEFAULT_OPTIONS.
	 * @returns {Promise<number[]>} the embedding vector.
	 */
	async generateEmbedding(text, options = {}) {
		// Always go through initialize(): it is a cheap no-op when the pipeline
		// is ready and transparently retries after a failed initialization.
		// (The previous `if (!this.pipePromise)` guard re-awaited a stale
		// rejected promise forever instead of retrying.)
		await this.initialize();
		const pipe = await this.pipePromise;
		const opts = {
			...this.DEFAULT_OPTIONS,
			...options
		};
		try {
			const output = await pipe(text, {
				pooling: opts.pooling,
				normalize: opts.normalize
			});
			const embedding = Array.from(output.data);
			if (embedding.length !== this.EMBEDDING_DIMENSION) logger$8.warn(`Unexpected embedding dimension: ${embedding.length}, expected ${this.EMBEDDING_DIMENSION}`);
			return embedding;
		} catch (error) {
			logger$8.error(`Failed to generate embedding: ${error}`);
			throw error;
		}
	}
	/**
	 * Embeds several strings. Requests are deliberately serialized
	 * (presumably to bound model load — confirm before parallelizing).
	 * @returns {Promise<number[][]>}
	 */
	async generateEmbeddings(texts, options = {}) {
		const embeddings = [];
		for (const text of texts) {
			const embedding = await this.generateEmbedding(text, options);
			embeddings.push(embedding);
		}
		return embeddings;
	}
	/** @returns {number} the expected embedding dimension. */
	getEmbeddingDimension() {
		return this.EMBEDDING_DIMENSION;
	}
	/** @returns {string} the configured model identifier. */
	getModelName() {
		return this.modelName;
	}
};
var embeddingService = new EmbeddingService();
//#endregion
73
+ //#region src/utils/constants.ts
74
+ var SNIPPET_LENGTHS = {
75
+ DEFAULT: 400,
76
+ PREVIEW: 500,
77
+ LONG_PREVIEW: 1e3,
78
+ CONTEXT: 150
79
+ };
80
+ var RELEVANCE_WEIGHTS = {
81
+ FILE_NAME_MATCH: 10,
82
+ FILE_PATH_MATCH: 5,
83
+ CONTENT_MATCH: 1,
84
+ FILE_NAME_EXACT: 20,
85
+ FILE_PATH_EXACT: 10,
86
+ EXACT_PHRASE: 5,
87
+ TERM_COVERAGE: 15
88
+ };
89
+ var SEARCH_CONFIG = {
90
+ DEFAULT_LIMIT: 5,
91
+ MAX_LIMIT: 20,
92
+ DEFAULT_MIN_RELEVANCE: 0,
93
+ QUERY_TERM_MIN_LENGTH: 2
94
+ };
95
+ var VECTOR_SEARCH_CONFIG = {
96
+ DEFAULT_INDEX_DISTANCE: 2,
97
+ DEFAULT_DOCS_PER_INDEX_SIDE: 100,
98
+ DEFAULT_LIMIT: 10,
99
+ SAMPLE_VECTOR_COUNT: 5
100
+ };
101
+ var INDEX_FIELD_NAMES = [
102
+ "idx0",
103
+ "idx1",
104
+ "idx2",
105
+ "idx3",
106
+ "idx4"
107
+ ];
108
+ var CONTENT_PREVIEW_LENGTHS = {
109
+ SHORT: 200,
110
+ MEDIUM: 500,
111
+ LONG: 1e3
112
+ };
113
+ //#endregion
114
//#region src/vector-utils.ts
/**
 * Euclidean (L2) distance between two equal-length numeric vectors.
 * @throws {Error} when the vectors have different lengths.
 */
function euclideanDistance(vec1, vec2) {
	if (vec1.length !== vec2.length) throw new Error(`Vector dimensions must match: ${vec1.length} vs ${vec2.length}`);
	const total = vec1.reduce((acc, value, index) => {
		const delta = value - vec2[index];
		return acc + delta * delta;
	}, 0);
	return Math.sqrt(total);
}
124
/**
 * Builds `count` reference ("sample") vectors of the given dimension,
 * preferring real embeddings where available and filling the remainder with
 * random vectors whose components lie in [-1, 1).
 *
 * Behavior is unchanged from the original; the duplicated random-vector
 * generation in both branches has been unified into one loop.
 *
 * @param {number} count - number of sample vectors to produce.
 * @param {number} dimension - length of each vector.
 * @param {number[][]} [existingEmbeddings] - stored embeddings to reuse positionally.
 * @returns {{vector: number[], idx: number}[]}
 */
function generateSampleVectors(count, dimension, existingEmbeddings) {
	// Uniform random component in [-1, 1); used whenever no real embedding exists at position i.
	const randomVector = () => Array.from({ length: dimension }, () => Math.random() * 2 - 1);
	const samples = [];
	for (let i = 0; i < count; i++) {
		const existing = existingEmbeddings && i < existingEmbeddings.length ? existingEmbeddings[i] : null;
		samples.push({
			vector: existing ?? randomVector(),
			idx: i
		});
	}
	return samples;
}
148
/**
 * Computes this embedding's distance to each sample vector; the resulting
 * map (idx0..idx4) is stored on vector docs so range queries can prefilter.
 * @throws {Error} when the sample count differs from INDEX_FIELD_NAMES.
 */
function calculateIndexValues(embedding, sampleVectors) {
	if (sampleVectors.length !== INDEX_FIELD_NAMES.length) throw new Error(`Sample vectors count (${sampleVectors.length}) must match index field count (${INDEX_FIELD_NAMES.length})`);
	const result = {};
	INDEX_FIELD_NAMES.forEach((fieldName, position) => {
		result[fieldName] = euclideanDistance(embedding, sampleVectors[position].vector);
	});
	return result;
}
154
/** Returns the named index values as an array in canonical idx0..idx4 order. */
function getIndexValueArray(indexValues) {
	const values = [];
	for (const fieldName of INDEX_FIELD_NAMES) values.push(indexValues[fieldName]);
	return values;
}
157
/**
 * Inverse of getIndexValueArray: rebuilds the named index map from an
 * ordered array of distance values.
 * @throws {Error} when the array length differs from INDEX_FIELD_NAMES.
 */
function createIndexValuesFromArray(values) {
	if (values.length !== INDEX_FIELD_NAMES.length) throw new Error(`Values array length (${values.length}) must match index field count (${INDEX_FIELD_NAMES.length})`);
	return Object.fromEntries(INDEX_FIELD_NAMES.map((fieldName, position) => [fieldName, values[position]]));
}
163
/**
 * Cosine similarity between two equal-length vectors, in [-1, 1].
 * Returns 0 when either vector has zero magnitude.
 * @throws {Error} when the vectors have different lengths.
 */
function cosineSimilarity(vec1, vec2) {
	if (vec1.length !== vec2.length) throw new Error(`Vector dimensions must match: ${vec1.length} vs ${vec2.length}`);
	let dotProduct = 0;
	let norm1 = 0;
	let norm2 = 0;
	vec1.forEach((a, i) => {
		const b = vec2[i];
		dotProduct += a * b;
		norm1 += a * a;
		norm2 += b * b;
	});
	const denominator = Math.sqrt(norm1) * Math.sqrt(norm2);
	return denominator === 0 ? 0 : dotProduct / denominator;
}
//#endregion
178
//#region src/chunkers/fallback-chunker.ts
var DEFAULT_CHUNK_SIZE$1 = 500;
var DEFAULT_CHUNK_OVERLAP$1 = 50;
var DEFAULT_MIN_CHUNK_SIZE = 100;

/**
 * Simple sliding-window chunker used when the LangChain splitter is
 * unavailable or fails. Splits by raw character offsets with a fixed
 * overlap; the first chunk is prefixed with the file name to boost
 * name-based retrieval.
 */
var FallbackChunker = class {
	constructor(options = {}) {
		this.chunkSize = options.chunkSize ?? DEFAULT_CHUNK_SIZE$1;
		this.chunkOverlap = options.chunkOverlap ?? DEFAULT_CHUNK_OVERLAP$1;
		this.minChunkSize = options.minChunkSize ?? DEFAULT_MIN_CHUNK_SIZE;
	}
	/**
	 * Splits `content` into overlapping fixed-size chunks.
	 * @param {string} documentId - id used to derive per-chunk ids.
	 * @param {string} content - document text.
	 * @param {string} fileName - prefixed onto the first chunk's text.
	 * @returns {Array<{id: string, documentId: string, chunkIndex: number, text: string, startOffset: number, endOffset: number}>}
	 */
	chunkDocument(documentId, content, fileName) {
		// Small documents become a single chunk.
		if (content.length <= this.chunkSize) {
			const text = `${fileName} ${content}`;
			return [{
				id: `${documentId}:chunk:0`,
				documentId,
				chunkIndex: 0,
				text,
				startOffset: 0,
				endOffset: content.length
			}];
		}
		const chunks = [];
		// Guard against a non-positive step (chunkOverlap >= chunkSize), which
		// previously made this loop spin forever; always advance by at least 1.
		const step = Math.max(1, this.chunkSize - this.chunkOverlap);
		let offset = 0;
		let chunkIndex = 0;
		while (offset < content.length) {
			const endOffset = Math.min(offset + this.chunkSize, content.length);
			const chunkText = content.substring(offset, endOffset);
			// Drop a trailing sliver that is below the minimum useful size.
			if (chunkText.trim().length < this.minChunkSize && chunks.length > 0) break;
			const text = chunkIndex === 0 ? `${fileName} ${chunkText}` : chunkText;
			chunks.push({
				id: `${documentId}:chunk:${chunkIndex}`,
				documentId,
				chunkIndex,
				text,
				startOffset: offset,
				endOffset
			});
			offset += step;
			chunkIndex++;
		}
		return chunks;
	}
	/**
	 * Returns the chunk's text framed with trailing/leading context from its
	 * neighbors (SNIPPET_LENGTHS.CONTEXT characters on each side).
	 */
	getChunkContext(chunk, allChunks) {
		const prevChunk = chunk.chunkIndex > 0 ? allChunks[chunk.chunkIndex - 1] : null;
		const nextChunk = chunk.chunkIndex < allChunks.length - 1 ? allChunks[chunk.chunkIndex + 1] : null;
		let context = "";
		if (prevChunk) context += `[Previous: ${prevChunk.text.substring(Math.max(0, prevChunk.text.length - SNIPPET_LENGTHS.CONTEXT))}]\n\n`;
		context += chunk.text;
		if (nextChunk) context += `\n\n[Next: ${nextChunk.text.substring(0, SNIPPET_LENGTHS.CONTEXT)}]`;
		return context;
	}
};
//#endregion
233
//#region src/chunkers/langchain-chunker.ts
var logger$7 = createLogger("LangChainChunker");
// Chunking defaults; sizes are measured in WORDS (the splitter below uses
// countWords as its lengthFunction), not characters.
var DEFAULT_CHUNK_SIZE = 500;
var DEFAULT_CHUNK_OVERLAP = 75;
237
/** Counts whitespace-separated words; empty or blank input yields 0. */
function countWords(text) {
	if (!text || text.trim().length === 0) return 0;
	const tokens = text.trim().split(/\s+/);
	return tokens.filter((token) => token.length > 0).length;
}
241
/**
 * Word-based document chunker backed by LangChain's
 * RecursiveCharacterTextSplitter (imported lazily on first use).
 * Reconstructs approximate character offsets for each chunk since the
 * splitter itself only returns text.
 */
var LangChainChunker = class {
constructor(options = {}) {
// Created lazily in getTextSplitter() so the @langchain import is deferred.
this.textSplitter = null;
this.chunkSize = options.chunkSize ?? DEFAULT_CHUNK_SIZE;
this.chunkOverlap = options.chunkOverlap ?? DEFAULT_CHUNK_OVERLAP;
}
// Lazily imports @langchain/textsplitters and builds a splitter that
// measures length in words (countWords) and prefers paragraph, line and
// sentence boundaries before falling back to single spaces.
async getTextSplitter() {
if (!this.textSplitter) {
const { RecursiveCharacterTextSplitter } = await import("@langchain/textsplitters");
this.textSplitter = new RecursiveCharacterTextSplitter({
chunkSize: this.chunkSize,
chunkOverlap: this.chunkOverlap,
lengthFunction: countWords,
separators: [
"\n\n",
"\n",
". ",
"! ",
"? ",
" "
],
keepSeparator: false
});
}
return this.textSplitter;
}
/**
 * Splits `content` into chunks and reconstructs each chunk's character
 * offsets in the original text. Offset recovery is best-effort: each chunk
 * is searched for in a window around the previous chunk's end, falling back
 * to prevChunk.endOffset when no match is found.
 * @throws rethrows splitter errors so the caller can fall back to FallbackChunker.
 */
async chunkDocument(documentId, content, fileName) {
try {
const chunks = await (await this.getTextSplitter()).splitText(content);
const documentChunks = [];
for (let index = 0; index < chunks.length; index++) {
const chunkText = chunks[index];
let startOffset;
if (index === 0) {
const foundPos = content.indexOf(chunkText);
startOffset = foundPos !== -1 ? foundPos : 0;
} else {
const prevChunk = documentChunks[index - 1];
// Overlap is measured in words, so allow a generous character buffer
// (10 chars per overlap word) when searching backwards.
const overlapBuffer = Math.max(this.chunkOverlap * 10, chunkText.length);
const searchStart = Math.max(0, prevChunk.endOffset - overlapBuffer);
const searchEnd = Math.min(content.length, prevChunk.endOffset + chunkText.length);
const relativePos = content.substring(searchStart, searchEnd).indexOf(chunkText);
if (relativePos !== -1) {
const candidateOffset = searchStart + relativePos;
// Only accept a match that plausibly continues the previous chunk;
// otherwise assume this chunk starts where the previous one ended.
if (candidateOffset >= prevChunk.startOffset && candidateOffset < prevChunk.endOffset + chunkText.length) startOffset = candidateOffset;
else startOffset = prevChunk.endOffset;
} else startOffset = prevChunk.endOffset;
}
const endOffset = Math.min(startOffset + chunkText.length, content.length);
// The first chunk is prefixed with the file name to boost name-based retrieval.
const text = index === 0 ? `${fileName} ${chunkText}` : chunkText;
documentChunks.push({
id: `${documentId}:chunk:${index}`,
documentId,
chunkIndex: index,
text,
startOffset,
endOffset
});
}
this.validateNoWordSplitting(documentChunks, content, documentId);
logger$7.debug(`Document ${documentId} split into ${documentChunks.length} chunks using LangChain`);
return documentChunks;
} catch (error) {
logger$7.warn(`LangChain chunking failed for ${documentId}, falling back to simple chunking: ${error}`);
throw error;
}
}
// Diagnostic only: warns when adjacent chunk offsets suggest a word was cut
// in half (alphanumeric characters on both sides of a non-whitespace gap).
validateNoWordSplitting(chunks, originalContent, documentId) {
for (let i = 0; i < chunks.length - 1; i++) {
const currentChunk = chunks[i];
const nextChunk = chunks[i + 1];
if (currentChunk.endOffset < originalContent.length && nextChunk.startOffset > currentChunk.endOffset) {
const gap = originalContent.substring(currentChunk.endOffset, nextChunk.startOffset);
if (gap.trim().length > 0 && !/^\s+$/.test(gap)) {
const beforeChar = originalContent[currentChunk.endOffset - 1];
const afterChar = originalContent[nextChunk.startOffset];
if (beforeChar && afterChar && /[a-zA-Z0-9]/.test(beforeChar) && /[a-zA-Z0-9]/.test(afterChar)) logger$7.warn(`Potential word split detected in document ${documentId} between chunks ${i} and ${i + 1}`);
}
}
}
}
// Returns the chunk's text framed with trailing/leading context from its
// neighbors (SNIPPET_LENGTHS.CONTEXT characters each side).
getChunkContext(chunk, allChunks) {
const prevChunk = chunk.chunkIndex > 0 ? allChunks[chunk.chunkIndex - 1] : null;
const nextChunk = chunk.chunkIndex < allChunks.length - 1 ? allChunks[chunk.chunkIndex + 1] : null;
let context = "";
if (prevChunk) context += `[Previous: ${prevChunk.text.substring(Math.max(0, prevChunk.text.length - SNIPPET_LENGTHS.CONTEXT))}]\n\n`;
context += chunk.text;
if (nextChunk) context += `\n\n[Next: ${nextChunk.text.substring(0, SNIPPET_LENGTHS.CONTEXT)}]`;
return context;
}
};
//#endregion
333
//#region src/chunkers/document-chunker.ts
var logger$6 = createLogger("DocumentChunker");

/**
 * Facade that prefers the LangChain-based chunker and transparently falls
 * back to the simple sliding-window chunker when LangChain is unavailable
 * or fails at runtime.
 */
var DocumentChunker = class {
	constructor(options = {}) {
		// Remember the options so the runtime fallback respects the caller's
		// configured chunk size/overlap (previously the fallback silently
		// reverted to default sizes).
		this.options = options;
		try {
			this.chunker = new LangChainChunker(options);
			logger$6.debug("Using LangChain chunker");
		} catch (error) {
			logger$6.warn(`Failed to initialize LangChain chunker, using fallback: ${error}`);
			this.chunker = new FallbackChunker(options);
		}
	}
	/**
	 * Chunks a document with the primary chunker, falling back to
	 * FallbackChunker when the primary throws or rejects.
	 */
	async chunkDocument(documentId, content, fileName) {
		try {
			const result = this.chunker.chunkDocument(documentId, content, fileName);
			// The primary may be sync (FallbackChunker) or async (LangChain);
			// normalize and await inside the try so async rejections are caught too.
			return await Promise.resolve(result);
		} catch (error) {
			logger$6.warn(`Primary chunker failed, falling back: ${error}`);
			return new FallbackChunker(this.options).chunkDocument(documentId, content, fileName);
		}
	}
	/** Delegates neighbor-context assembly to the active chunker. */
	getChunkContext(chunk, allChunks) {
		return this.chunker.getChunkContext(chunk, allChunks);
	}
};
new DocumentChunker();
//#endregion
360
//#region src/extractors/pdfjs-extractor.ts
var logger$5 = createLogger("PDFJSExtractor");

/** Extracts text from PDF files via the pdfjs-dist library. */
var PDFJSExtractor = class {
	/** Only handles the "pdf" file type (case-insensitive). */
	canExtract(fileType) {
		return fileType.toLowerCase() === "pdf";
	}
	/**
	 * Reads the file as a binary blob, parses it with PDF.js and joins the
	 * text of every page.
	 * @param {{includePageNumbers?: boolean, pageSeparator?: string}} [options]
	 * @throws {Error} when parsing fails or the PDF yields no extractable text.
	 */
	async extractText(file, options = {}) {
		try {
			const pdfjsLib = await import("pdfjs-dist");
			// Point the worker at a CDN build matching the loaded library version.
			if (!pdfjsLib.GlobalWorkerOptions.workerSrc) {
				pdfjsLib.GlobalWorkerOptions.workerSrc = `https://unpkg.com/pdfjs-dist@${pdfjsLib.version}/build/pdf.worker.min.mjs`;
			}
			const blob = await file.getContents({ blob: true });
			const arrayBuffer = await blob.arrayBuffer();
			const pdf = await pdfjsLib.getDocument({
				data: arrayBuffer,
				useSystemFonts: true
			}).promise;
			const numPages = pdf.numPages;
			const includePageNumbers = options.includePageNumbers !== false;
			const pageSeparator = options.pageSeparator || "\n\n";
			const textParts = [];
			for (let pageNum = 1; pageNum <= numPages; pageNum++) {
				const page = await pdf.getPage(pageNum);
				const textContent = await page.getTextContent();
				const pageText = textContent.items.map((item) => item.str).join(" ");
				// Blank pages are skipped entirely.
				if (!pageText.trim()) continue;
				textParts.push(includePageNumbers ? `[Page ${pageNum}]\n${pageText}` : pageText);
			}
			const extractedText = textParts.join(pageSeparator);
			if (!extractedText || extractedText.trim().length === 0) throw new Error("PDF appears to contain no extractable text (may be image-based or scanned)");
			logger$5.debug(`Extracted ${numPages} pages from PDF: ${file.getName()}`);
			return extractedText;
		} catch (error) {
			logger$5.warn(`Failed to extract text from PDF ${file.getName()}: ${error}`);
			throw new Error(`PDF text extraction failed: ${error}`);
		}
	}
};
//#endregion
395
//#region src/extractors/llm-ocr-extractor.ts
var logger$4 = createLogger("LLMOCRExtractor");
/**
 * LLM-based OCR extractor for document files using Mistral OCR.
 *
 * This extractor uses Mistral OCR API to perform OCR on document files,
 * particularly useful for scanned documents or image-based files
 * that cannot be processed by standard text extraction methods.
 */
var LLMOCRExtractor = class {
// Handles PDFs plus common raster image formats (case-insensitive).
canExtract(fileType) {
return [
"pdf",
"png",
"jpg",
"jpeg",
"tiff",
"tif"
].includes(fileType.toLowerCase());
}
/**
 * Sends the file (as a base64 data URL) to the configured Mistral OCR
 * endpoint and joins the per-page markdown/text it returns.
 * Requires an AI provider whose parameters include `ocrApiEndpoint` and
 * whose name contains "mistral"; throws when none is configured.
 * @param {{fileType?: string, includePageNumbers?: boolean, pageSeparator?: string}} [options]
 * @throws {Error} on missing configuration, HTTP failure, or empty OCR output.
 */
async extractText(file, options = {}) {
const fileName = file.getName();
const fileType = options?.fileType || fileName.split(".").pop()?.toLowerCase() || "pdf";
const ocrProvider = (await aiService.getProviders()).find((p) => {
return p.parameters?.["ocrApiEndpoint"] && p.name.toLowerCase().includes("mistral");
});
const ocrEndpoint = ocrProvider?.parameters?.["ocrApiEndpoint"];
if (!ocrProvider || !ocrEndpoint) throw new Error("Mistral OCR provider not configured. Please add ocrApiEndpoint to the provider parameters in AI settings.");
try {
const fileBlob = await file.getContents({ blob: true });
const base64Content = await this.blobToBase64(fileBlob);
const mimeType = this.getMimeType(fileType);
const response = await fetch(ocrEndpoint, {
method: "POST",
headers: {
"Authorization": `Bearer ${ocrProvider.apiKey}`,
"Content-Type": "application/json"
},
body: JSON.stringify({
// Model resolution order: explicit ocrModel parameter, provider's
// configured model, then the generic Mistral OCR alias.
model: ocrProvider.parameters?.["ocrModel"] || ocrProvider.model || "mistral-ocr-latest",
document: {
type: "document_url",
document_url: `data:${mimeType};base64,${base64Content}`
},
include_image_base64: false
})
});
if (!response.ok) {
const errorText = await response.text().catch(() => "Unknown error");
throw new Error(`OCR request failed: HTTP ${response.status}: ${errorText}`);
}
const result = await response.json();
if (!result.pages || !Array.isArray(result.pages)) throw new Error("Invalid OCR response format: missing pages array");
const includePageNumbers = options.includePageNumbers !== false;
const pageSeparator = options.pageSeparator || "\n\n";
// A page may carry either markdown or plain text; blank pages are dropped.
const textParts = result.pages.map((page, index) => {
const pageText = page?.markdown || page?.text || "";
if (!pageText.trim()) return null;
if (includePageNumbers) return `[Page ${index + 1}]\n${pageText}`;
return pageText;
}).filter((text) => text !== null);
if (textParts.length === 0) throw new Error("No text content found in OCR response");
const extractedText = textParts.join(pageSeparator);
logger$4.debug(`Extracted ${result.pages.length} pages from ${fileType} file: ${fileName}`);
return extractedText;
} catch (error) {
logger$4.warn(`Failed to extract text using OCR from ${fileName}: ${error}`);
throw new Error(`OCR text extraction failed: ${error}`);
}
}
// Converts a Blob to its base64 payload (the data-URL prefix is stripped).
async blobToBase64(blob) {
return new Promise((resolve, reject) => {
const reader = new FileReader();
reader.onloadend = () => {
const base64 = reader.result.split(",")[1];
resolve(base64);
};
reader.onerror = reject;
reader.readAsDataURL(blob);
});
}
// Maps a file extension to its MIME type; unknown types fall back to
// application/octet-stream.
getMimeType(fileType) {
return {
"pdf": "application/pdf",
"png": "image/png",
"jpg": "image/jpeg",
"jpeg": "image/jpeg",
"tiff": "image/tiff",
"tif": "image/tiff"
}[fileType.toLowerCase()] || "application/octet-stream";
}
};
//#endregion
488
//#region src/extractors/document-extractor.ts
var logger$3 = createLogger("DocumentExtractor");

/**
 * Tries each registered extractor for a file type until one succeeds.
 * When Mistral OCR fails it warns the user and immediately falls back to
 * PDF.js; each extractor is attempted at most once per call.
 */
var DocumentExtractor = class {
	constructor() {
		// Order matters: OCR is preferred over plain PDF.js text extraction.
		this.extractors = [new LLMOCRExtractor(), new PDFJSExtractor()];
	}
	/** True when at least one registered extractor supports the file type. */
	canExtract(fileType) {
		return this.extractors.some((extractor) => extractor.canExtract(fileType));
	}
	/**
	 * Extracts text using the first extractor that succeeds.
	 * @throws the last extractor error when all attempts fail.
	 */
	async extractText(file, options) {
		const fileName = file.getName();
		const fileType = options?.fileType || fileName.split(".").pop()?.toLowerCase() || "txt";
		const availableExtractors = this.extractors.filter((ext) => ext.canExtract(fileType));
		if (availableExtractors.length === 0) throw new Error(`No extractor available for file type: ${fileType}`);
		let lastError = null;
		// Track attempts so the eager OCR→PDF.js fallback below does not cause
		// the outer loop to run the same extractor a second time (previously
		// PDFJSExtractor could be invoked twice for one file).
		const attempted = new Set();
		for (const extractor of availableExtractors) {
			if (attempted.has(extractor)) continue;
			attempted.add(extractor);
			try {
				logger$3.debug(`Using ${extractor.constructor.name} for file type: ${fileType}`);
				return await extractor.extractText(file, {
					...options,
					fileType
				});
			} catch (error) {
				lastError = error instanceof Error ? error : new Error(String(error));
				logger$3.warn(`${extractor.constructor.name} failed for ${fileName}: ${lastError.message}`);
				if (extractor instanceof LLMOCRExtractor && availableExtractors.length > 1) {
					const fallbackExtractor = availableExtractors.find((ext) => ext instanceof PDFJSExtractor);
					if (fallbackExtractor && !attempted.has(fallbackExtractor)) {
						attempted.add(fallbackExtractor);
						const warningMsg = `Mistral OCR extraction failed, falling back to PDF.js extractor for ${fileName}`;
						logger$3.warn(warningMsg);
						toastWarning(warningMsg);
						try {
							logger$3.debug(`Using ${fallbackExtractor.constructor.name} as fallback for file type: ${fileType}`);
							return await fallbackExtractor.extractText(file, {
								...options,
								fileType
							});
						} catch (fallbackError) {
							lastError = fallbackError instanceof Error ? fallbackError : new Error(String(fallbackError));
							logger$3.warn(`Fallback extraction also failed for ${fileName}: ${lastError.message}`);
						}
					}
				}
			}
		}
		throw lastError || /* @__PURE__ */ new Error(`All extractors failed for file type: ${fileType}`);
	}
};
new DocumentExtractor();
//#endregion
536
//#region src/rxdb-loader.ts
// Memoized promise for the dynamically imported RxDB modules; cleared on
// failure so a transient load error can be retried (see getRxDbModules).
var rxdbModulesPromise = null;
// Plugins must be registered exactly once, even across repeated loads.
var pluginsRegistered = false;

/** Dynamically imports RxDB core plus the plugins this extension needs. */
async function importRxdbModules() {
	return Promise.all([
		import("rxdb"),
		import("rxdb/plugins/storage-dexie"),
		import("rxdb/plugins/query-builder"),
		import("rxdb/plugins/migration-schema"),
		import("rxdb/plugins/update")
	]);
}

/**
 * Loads RxDB lazily, registers the required plugins once, and returns the
 * core and Dexie-storage modules. Concurrent callers share one load.
 * @returns {Promise<{rxdb: object, storageDexie: object}>}
 */
async function getRxDbModules() {
	if (!rxdbModulesPromise) rxdbModulesPromise = importRxdbModules().then(([rxdb, storageDexie, queryBuilder, migrationSchema, update]) => {
		if (!pluginsRegistered) {
			rxdb.addRxPlugin(queryBuilder.RxDBQueryBuilderPlugin);
			rxdb.addRxPlugin(migrationSchema.RxDBMigrationSchemaPlugin);
			rxdb.addRxPlugin(update.RxDBUpdatePlugin);
			pluginsRegistered = true;
		}
		return {
			rxdb,
			storageDexie
		};
	}).catch((error) => {
		// Drop the rejected promise so the next caller retries the import
		// instead of receiving the same stale rejection forever.
		rxdbModulesPromise = null;
		throw error;
	});
	return rxdbModulesPromise;
}
//#endregion
564
+ //#region src/document-index-service.ts
565
+ var logger$2 = createLogger("DocumentIndexService");
566
+ var DocumentIndexService = class {
567
/**
 * Builds the chunker/extractor helpers and in-memory defaults. The RxDB
 * database and collections themselves are created later in initialize().
 */
constructor() {
// Reference vectors for the idx0..idx4 distance index; filled in initializeSampleVectors().
this.sampleVectors = [];
this.isInitialized = false;
// Maximum extracted-content length accepted for indexing (5 MiB of text).
this.DEFAULT_MAX_FILE_SIZE = 5 * 1024 * 1024;
this.chunker = new DocumentChunker();
this.documentExtractor = new DocumentExtractor();
// Extensions indexed when the caller passes no fileTypes option.
this.DEFAULT_INDEXABLE_TYPES = [
"md",
"txt",
"ts",
"tsx",
"js",
"jsx",
"json",
"geojson",
"kml",
"gpx",
"py",
"html",
"css",
"sql",
"xml",
"yaml",
"yml",
"pdf"
];
}
594
/**
 * Idempotently creates the RxDB database and its `documents` and `vectors`
 * collections, seeds the distance-index sample vectors, runs schema
 * migration (handleSchemaMigration — defined later in this class), and
 * subscribes to workspace connect/change topics.
 * @throws rethrows any database setup failure after logging it.
 */
async initialize() {
if (this.isInitialized) return;
logger$2.info("Initializing document index service with RxDB...");
try {
const { rxdb, storageDexie } = await getRxDbModules();
// ignoreDuplicate tolerates a second createRxDatabase with the same name.
this.db = await rxdb.createRxDatabase({
name: "document-index-db",
storage: storageDexie.getRxStorageDexie(),
ignoreDuplicate: true
});
const collectionsToAdd = {
// One record per indexed file; id is "<workspace>:<filePath>".
documents: { schema: {
version: 0,
primaryKey: "id",
type: "object",
properties: {
id: {
type: "string",
maxLength: 500
},
workspacePath: { type: "string" },
filePath: { type: "string" },
fileName: { type: "string" },
fileType: { type: "string" },
content: { type: "string" },
contentHash: { type: "string" },
metadata: {
type: "object",
properties: {
size: { type: "number" },
lastModified: { type: "number" },
language: { type: "string" },
tags: {
type: "array",
items: { type: "string" }
}
}
},
indexedAt: { type: "number" },
updatedAt: { type: "number" }
},
required: [
"id",
"workspacePath",
"filePath",
"content",
"contentHash"
],
indexes: [
"workspacePath",
"filePath",
"fileType"
]
} },
// One record per chunk embedding; idx0..idx4 are distances to the
// sample vectors and are indexed to support range prefiltering.
vectors: { schema: {
version: 1,
primaryKey: "id",
type: "object",
properties: {
id: {
type: "string",
maxLength: 500
},
documentId: {
type: "string",
maxLength: 500
},
embedding: {
type: "array",
items: { type: "number" }
},
idx0: { type: "number" },
idx1: { type: "number" },
idx2: { type: "number" },
idx3: { type: "number" },
idx4: { type: "number" },
chunkIndex: { type: "number" },
chunkStartOffset: { type: "number" },
chunkEndOffset: { type: "number" }
},
required: [
"id",
"documentId",
"embedding",
"idx0",
"idx1",
"idx2",
"idx3",
"idx4"
],
indexes: [
"documentId",
"chunkIndex",
"idx0",
"idx1",
"idx2",
"idx3",
"idx4"
]
} }
};
try {
await this.db.addCollections(collectionsToAdd);
} catch (error) {
// DB8 = collection already exists (e.g. after a hot reload); reuse it.
if (error?.code === "DB8" || error?.message?.includes("already exists")) logger$2.debug("Collections already exist, using existing collections");
else throw error;
}
this.documentsCollection = this.db.documents;
this.vectorsCollection = this.db.vectors;
await this.initializeSampleVectors();
await this.handleSchemaMigration();
this.isInitialized = true;
const count = await this.documentsCollection.count().exec();
const vectorCount = await this.vectorsCollection.count().exec();
logger$2.info(`Document index service initialized with ${count} documents and ${vectorCount} embeddings`);
// Re-index / react when a workspace connects or changes; failures are
// logged but never rejected into the subscription machinery.
subscribe(TOPIC_WORKSPACE_CONNECTED, (workspace) => {
if (workspace) this.handleWorkspaceChange(workspace).catch((err) => {
logger$2.error(`Failed to handle workspace connection: ${err}`);
});
});
subscribe(TOPIC_WORKSPACE_CHANGED, (workspace) => {
if (workspace) this.handleWorkspaceChange(workspace).catch((err) => {
logger$2.error(`Failed to handle workspace change: ${err}`);
});
});
logger$2.info("Document index service initialized");
} catch (error) {
logger$2.error(`Failed to initialize document index service: ${error}`);
throw error;
}
}
725
// Throws unless initialize() has completed and both collections exist.
ensureInitialized() {
if (!this.isInitialized || !this.documentsCollection || !this.vectorsCollection) throw new Error("Document index service not initialized. Call initialize() first.");
}
// Seeds the sample vectors used for the idx0..idx4 distance index, reusing
// up to 1000 stored embeddings so the references reflect real data; any
// shortfall is filled with random vectors (see generateSampleVectors).
async initializeSampleVectors() {
if (this.sampleVectors.length > 0) return;
const existingEmbeddings = await this.vectorsCollection.find().limit(1e3).exec();
const embeddingArrays = existingEmbeddings.map((v) => v.embedding);
this.sampleVectors = generateSampleVectors(VECTOR_SEARCH_CONFIG.SAMPLE_VECTOR_COUNT, embeddingService.getEmbeddingDimension(), embeddingArrays.length > 0 ? embeddingArrays : void 0);
logger$2.info(`Sample vectors initialized for index range method: ${this.sampleVectors.length} vectors, ${existingEmbeddings.length} existing embeddings`);
}
// Document ids are "<workspacePath>:<filePath>" — stable across re-indexing.
generateDocumentId(workspacePath, filePath) {
return `${workspacePath}:${filePath}`;
}
// SHA-256 hex digest of the content; used to detect unchanged documents.
async computeContentHash(content) {
const data = new TextEncoder().encode(content);
const hashBuffer = await crypto.subtle.digest("SHA-256", data);
return Array.from(new Uint8Array(hashBuffer)).map((b) => b.toString(16).padStart(2, "0")).join("");
}
743
// True when the file's extension is in the allowed list and neither its
// (lowercased) name nor its workspace path contains an exclude pattern
// (plain substring match, not glob).
isIndexableFile(file, options) {
const fileName = file.getName().toLowerCase();
const fileTypes = options?.fileTypes || this.DEFAULT_INDEXABLE_TYPES;
const extension = fileName.split(".").pop();
if (!extension || !fileTypes.includes(extension)) return false;
if (options?.excludePatterns) {
for (const pattern of options.excludePatterns) if (fileName.includes(pattern) || file.getWorkspacePath().includes(pattern)) return false;
}
return true;
}
// Maps a file extension to a language label stored in document metadata;
// unknown extensions are labeled "text".
detectLanguage(fileName) {
const ext = fileName.split(".").pop()?.toLowerCase();
return {
"ts": "typescript",
"tsx": "typescript",
"js": "javascript",
"jsx": "javascript",
"py": "python",
"md": "markdown",
"json": "json",
"geojson": "geojson",
"kml": "xml",
"gpx": "xml",
"html": "html",
"css": "css",
"sql": "sql",
"xml": "xml",
"yaml": "yaml",
"yml": "yaml",
"pdf": "pdf"
}[ext || ""] || "text";
}
775
/**
 * Extracts, hashes and upserts one file into the document index.
 * Skips work when both content hash and tags are unchanged; regenerates
 * embeddings (generateAndStoreEmbedding — defined later in this class)
 * only when the content itself changed.
 * @throws when the file is not indexable, empty, too large, or extraction fails.
 */
async indexDocument(file, options = {}) {
if (!this.isInitialized) await this.initialize();
this.ensureInitialized();
const workspacePath = file.getWorkspace().getName();
const filePath = file.getWorkspacePath();
const fileName = file.getName();
const id = this.generateDocumentId(workspacePath, filePath);
if (!this.isIndexableFile(file, options)) throw new Error(`File type not indexable: ${fileName}`);
try {
let content;
const fileType = fileName.split(".").pop()?.toLowerCase() || "txt";
// Binary formats (pdf/images) go through the extractor; everything else
// is read as plain text.
if (this.documentExtractor.canExtract(fileType)) content = await this.documentExtractor.extractText(file, { fileType });
else {
const fileContent = await file.getContents({ contentType: FileContentType.TEXT });
if (typeof fileContent !== "string") throw new Error(`File content is not text: ${fileName}`);
content = fileContent;
}
if (!content || content.trim().length === 0) throw new Error(`File appears to be empty or text extraction failed: ${fileName}`);
const maxSize = options.maxFileSize || this.DEFAULT_MAX_FILE_SIZE;
if (content.length > maxSize) throw new Error(`File too large to index: ${fileName} (${content.length} bytes)`);
const contentHash = await this.computeContentHash(content);
const now = Date.now();
const existing = await this.documentsCollection.findOne(id).exec();
const existingDoc = existing ? existing.toJSON() : null;
// Tags are merged (union) with whatever was stored previously.
const existingTags = existingDoc?.metadata.tags || [];
const newTags = options.tags || [];
const mergedTags = [...new Set([...existingTags, ...newTags])];
const tagsChanged = mergedTags.length !== existingTags.length || newTags.some((tag) => !existingTags.includes(tag));
// Fast path: nothing changed, return the stored record untouched.
if (existingDoc && existingDoc.contentHash === contentHash && !tagsChanged) {
logger$2.debug(`Document already indexed and unchanged: ${id}`);
return existingDoc;
}
const language = this.detectLanguage(fileName);
let lastModified = now;
try {
// getHandle is optional on the File abstraction; fall back to "now"
// when no native handle (and thus no mtime) is available.
const fileHandle = file.getHandle?.();
if (fileHandle) lastModified = (await fileHandle.getFile()).lastModified;
} catch (err) {
logger$2.debug(`Could not get file modification time: ${err}`);
}
const contentChanged = !existingDoc || existingDoc.contentHash !== contentHash;
const document = {
id,
workspacePath,
filePath,
fileName,
fileType,
// Content storage can be disabled; hash/metadata are always kept.
content: options.includeContent !== false ? content : "",
contentHash,
metadata: {
size: content.length,
lastModified,
language,
tags: mergedTags
},
indexedAt: existingDoc?.indexedAt || now,
updatedAt: now
};
await this.documentsCollection.upsert(document);
// Embeddings are expensive: only regenerate on a real content change
// (a tags-only update keeps the existing vectors).
if (contentChanged) await this.generateAndStoreEmbedding(document);
else logger$2.debug(`Document content unchanged, skipping embedding regeneration: ${id}`);
logger$2.debug(`Indexed document: ${id}`);
return document;
} catch (error) {
logger$2.error(`Failed to index document ${id}: ${error}`);
throw error;
}
}
843
+ async getDocument(id) {
844
+ if (!this.isInitialized) await this.initialize();
845
+ this.ensureInitialized();
846
+ const doc = await this.documentsCollection.findOne(id).exec();
847
+ return doc ? doc.toJSON() : null;
848
+ }
849
+ async getDocumentByPath(workspacePath, filePath) {
850
+ const id = this.generateDocumentId(workspacePath, filePath);
851
+ return this.getDocument(id);
852
+ }
853
+ async listDocuments(workspacePath) {
854
+ if (!this.isInitialized) await this.initialize();
855
+ this.ensureInitialized();
856
+ let query = this.documentsCollection.find();
857
+ if (workspacePath) query = query.where("workspacePath").eq(workspacePath);
858
+ return (await query.exec()).map((doc) => doc.toJSON());
859
+ }
860
+ async deleteDocument(id) {
861
+ if (!this.isInitialized) await this.initialize();
862
+ this.ensureInitialized();
863
+ const doc = await this.documentsCollection.findOne(id).exec();
864
+ if (doc) {
865
+ await doc.remove();
866
+ const vectors = await this.vectorsCollection.find().where("documentId").eq(id).exec();
867
+ for (const vector of vectors) await vector.remove();
868
+ logger$2.debug(`Deleted document ${id} and ${vectors.length} associated embeddings`);
869
+ return true;
870
+ }
871
+ return false;
872
+ }
873
+ async deleteDocumentByPath(workspacePath, filePath) {
874
+ const id = this.generateDocumentId(workspacePath, filePath);
875
+ return this.deleteDocument(id);
876
+ }
877
/**
 * One-shot migration for vectors created before chunk support existed.
 * Detects vectors that carry none of the chunk fields, removes them, and
 * regenerates chunked embeddings for the affected documents.
 * No-op when both collections are absent or every vector already has chunk info.
 * @throws Re-throws any error encountered during migration.
 */
async handleSchemaMigration() {
  if (!this.vectorsCollection || !this.documentsCollection) return;
  try {
    // A pre-chunking vector has none of the three chunk fields set.
    const vectorsWithoutChunks = (await this.vectorsCollection.find().exec()).filter((v) => {
      const data = v.toJSON();
      return data.chunkIndex === void 0 && data.chunkStartOffset === void 0 && data.chunkEndOffset === void 0;
    });
    if (vectorsWithoutChunks.length === 0) {
      logger$2.debug("No vectors need migration - all have chunk information");
      return;
    }
    logger$2.info(`Detected ${vectorsWithoutChunks.length} vectors without chunk information. Invalidating and reindexing...`);
    // Remove stale vectors first, remembering which documents they belonged to.
    const documentIdsToReindex = /* @__PURE__ */ new Set();
    for (const vector of vectorsWithoutChunks) {
      const data = vector.toJSON();
      documentIdsToReindex.add(data.documentId);
      await vector.remove();
    }
    logger$2.info(`Removed ${vectorsWithoutChunks.length} old vectors. Reindexing ${documentIdsToReindex.size} documents...`);
    // Regenerate chunked embeddings for every affected document still indexed.
    for (const documentId of documentIdsToReindex) {
      const doc = await this.documentsCollection.findOne(documentId).exec();
      if (doc) {
        const document = doc.toJSON();
        logger$2.debug(`Reindexing document: ${document.fileName}`);
        await this.generateAndStoreEmbedding(document);
      }
    }
    logger$2.info(`Schema migration completed. Reindexed ${documentIdsToReindex.size} documents.`);
  } catch (error) {
    logger$2.error(`Error during schema migration: ${error}`);
    throw error;
  }
}
+ async deleteWorkspace(workspacePath) {
911
+ if (!this.isInitialized) await this.initialize();
912
+ this.ensureInitialized();
913
+ const docs = await this.documentsCollection.find().where("workspacePath").eq(workspacePath).exec();
914
+ const count = docs.length;
915
+ for (const doc of docs) await doc.remove();
916
+ if (count > 0) logger$2.info(`Deleted ${count} documents for workspace: ${workspacePath}`);
917
+ return count;
918
+ }
919
+ async updateDocumentMetadata(id, updates) {
920
+ if (!this.isInitialized) await this.initialize();
921
+ this.ensureInitialized();
922
+ const doc = await this.documentsCollection.findOne(id).exec();
923
+ if (!doc) return null;
924
+ const current = doc.toJSON();
925
+ const updated = {
926
+ ...current,
927
+ metadata: {
928
+ ...current.metadata,
929
+ ...updates.metadata
930
+ },
931
+ updatedAt: Date.now()
932
+ };
933
+ await doc.update({ $set: updated });
934
+ logger$2.debug(`Updated document metadata: ${id}`);
935
+ return updated;
936
+ }
937
+ async indexWorkspace(workspace, options = {}) {
938
+ if (!this.isInitialized) await this.initialize();
939
+ const workspacePath = workspace.getName();
940
+ logger$2.info(`Starting workspace indexing: ${workspacePath}`);
941
+ const files = await this.collectFiles(workspace, options);
942
+ logger$2.info(`Found ${files.length} files to index`);
943
+ let indexed = 0;
944
+ let failed = 0;
945
+ const errors = [];
946
+ for (const file of files) try {
947
+ await this.indexDocument(file, options);
948
+ indexed++;
949
+ } catch (error) {
950
+ failed++;
951
+ const errorMsg = `Failed to index ${file.getName()}: ${error}`;
952
+ errors.push(errorMsg);
953
+ logger$2.warn(errorMsg);
954
+ }
955
+ logger$2.info(`Workspace indexing complete: ${indexed} indexed, ${failed} failed`);
956
+ return {
957
+ indexed,
958
+ failed,
959
+ errors
960
+ };
961
+ }
962
+ async collectFiles(directory, options, files = []) {
963
+ try {
964
+ const children = await directory.listChildren(false);
965
+ for (const child of children) if (child instanceof File) {
966
+ if (this.isIndexableFile(child, options)) files.push(child);
967
+ } else if (child instanceof Directory) await this.collectFiles(child, options, files);
968
+ } catch (error) {
969
+ logger$2.warn(`Failed to collect files from ${directory.getName()}: ${error}`);
970
+ }
971
+ return files;
972
+ }
973
+ async reindexDocument(file, options = {}) {
974
+ const workspacePath = file.getWorkspace().getName();
975
+ const filePath = file.getWorkspacePath();
976
+ const id = this.generateDocumentId(workspacePath, filePath);
977
+ const existingTags = (await this.getDocument(id))?.metadata.tags || [];
978
+ const newTags = options.tags || [];
979
+ const mergedTags = [...new Set([...existingTags, ...newTags])];
980
+ await this.deleteDocument(id);
981
+ return this.indexDocument(file, {
982
+ ...options,
983
+ tags: mergedTags
984
+ });
985
+ }
986
+ async reindexAllDocuments(options = {}) {
987
+ if (!this.isInitialized) await this.initialize();
988
+ this.ensureInitialized();
989
+ const allDocs = await this.listDocuments();
990
+ let succeeded = 0;
991
+ let failed = 0;
992
+ for (const doc of allDocs) try {
993
+ const workspace = await workspaceService.getWorkspace();
994
+ if (!workspace || workspace.getName() !== doc.workspacePath) {
995
+ logger$2.warn(`Workspace not found: ${doc.workspacePath}`);
996
+ failed++;
997
+ continue;
998
+ }
999
+ const resource = await workspace.getResource(doc.filePath);
1000
+ if (!resource || !(resource instanceof File)) {
1001
+ logger$2.warn(`File not found: ${doc.filePath}`);
1002
+ failed++;
1003
+ continue;
1004
+ }
1005
+ await this.reindexDocument(resource, options);
1006
+ succeeded++;
1007
+ } catch (error) {
1008
+ logger$2.error(`Failed to reindex document ${doc.id}: ${error}`);
1009
+ failed++;
1010
+ }
1011
+ return {
1012
+ total: allDocs.length,
1013
+ succeeded,
1014
+ failed
1015
+ };
1016
+ }
1017
+ async getStats() {
1018
+ if (!this.isInitialized) await this.initialize();
1019
+ this.ensureInitialized();
1020
+ const totalDocuments = await this.documentsCollection.count().exec();
1021
+ const allDocs = await this.documentsCollection.find().exec();
1022
+ const byWorkspace = {};
1023
+ for (const doc of allDocs) {
1024
+ const workspacePath = doc.toJSON().workspacePath;
1025
+ byWorkspace[workspacePath] = (byWorkspace[workspacePath] || 0) + 1;
1026
+ }
1027
+ return {
1028
+ totalDocuments,
1029
+ byWorkspace
1030
+ };
1031
+ }
1032
/**
 * Hook invoked when the active workspace changes.
 * Currently a placeholder: it only logs; no incremental re-sync of indexed
 * documents is performed here.
 * @param {object} workspace - The newly active workspace (unused).
 */
async handleWorkspaceChange(workspace) {
  logger$2.debug("Workspace changed, checking for document updates...");
}
/**
 * Chunk a document's content and upsert one embedding vector per chunk.
 * Best-effort by design: any failure is logged as a warning and swallowed so
 * that document indexing still succeeds without embeddings.
 * @param {object} document - Indexed document (id, content, fileName).
 */
async generateAndStoreEmbedding(document) {
  try {
    if (!this.vectorsCollection) {
      logger$2.warn(`Vectors collection not initialized, cannot generate embedding for ${document.id}`);
      return;
    }
    await embeddingService.initialize();
    // Sample vectors anchor the scalar index values; try once to build them.
    if (this.sampleVectors.length === 0) await this.initializeSampleVectors();
    if (this.sampleVectors.length === 0) {
      logger$2.warn(`Sample vectors not initialized, cannot generate embedding for ${document.id}`);
      return;
    }
    const chunks = await this.chunker.chunkDocument(document.id, document.content, document.fileName);
    logger$2.debug(`Document ${document.id} split into ${chunks.length} chunks`);
    for (const chunk of chunks) {
      const embedding = await embeddingService.generateEmbedding(chunk.text);
      // Index values are distances to the sample vectors, spread onto the
      // vector doc so range queries can prefilter candidates.
      const indexValues = calculateIndexValues(embedding, this.sampleVectors);
      const vectorDoc = {
        id: chunk.id,
        documentId: document.id,
        chunkIndex: chunk.chunkIndex,
        chunkStartOffset: chunk.startOffset,
        chunkEndOffset: chunk.endOffset,
        embedding,
        ...indexValues
      };
      await this.vectorsCollection.upsert(vectorDoc);
    }
    logger$2.debug(`Generated and stored ${chunks.length} embeddings for document: ${document.id}`);
  } catch (error) {
    // Deliberately non-fatal: see docstring.
    logger$2.warn(`Failed to generate embedding for document ${document.id}: ${error}`);
  }
}
+ async searchSimilar(queryText, options = {}) {
1069
+ if (!this.isInitialized) await this.initialize();
1070
+ this.ensureInitialized();
1071
+ const limit = options.limit || 10;
1072
+ const indexDistance = options.indexDistance || 2;
1073
+ const docsPerIndexSide = options.docsPerIndexSide || 100;
1074
+ if (!this.vectorsCollection || this.sampleVectors.length === 0) {
1075
+ logger$2.warn("Vector search not available: vectors collection or sample vectors not initialized");
1076
+ throw new Error("Vector search not available");
1077
+ }
1078
+ const totalVectors = await this.vectorsCollection.find().exec();
1079
+ logger$2.debug(`Starting vector search with indexDistance=${indexDistance}, limit=${limit}, sampleVectors=${this.sampleVectors.length}, totalIndexedVectors=${totalVectors.length}`);
1080
+ try {
1081
+ await embeddingService.initialize();
1082
+ } catch (error) {
1083
+ logger$2.error(`Failed to initialize embedding service for vector search: ${error}`);
1084
+ throw new Error(`Embedding service initialization failed: ${error}`);
1085
+ }
1086
+ let queryEmbedding;
1087
+ try {
1088
+ queryEmbedding = await embeddingService.generateEmbedding(queryText);
1089
+ } catch (error) {
1090
+ logger$2.error(`Failed to generate query embedding: ${error}`);
1091
+ throw new Error(`Query embedding generation failed: ${error}`);
1092
+ }
1093
+ if (this.sampleVectors.length === 0) {
1094
+ logger$2.warn("Sample vectors not initialized, cannot perform vector search");
1095
+ throw new Error("Sample vectors not initialized");
1096
+ }
1097
+ const queryIndexValues = calculateIndexValues(queryEmbedding, this.sampleVectors);
1098
+ logger$2.debug(`Query index values: ${JSON.stringify(queryIndexValues)}`);
1099
+ const candidateIds = /* @__PURE__ */ new Set();
1100
+ try {
1101
+ for (const idxKey of INDEX_FIELD_NAMES) {
1102
+ const queryValue = queryIndexValues[idxKey];
1103
+ const minValue = queryValue - indexDistance;
1104
+ const maxValue = queryValue + indexDistance;
1105
+ logger$2.debug(`Querying index ${idxKey}: range [${minValue}, ${maxValue}]`);
1106
+ const candidates = await this.vectorsCollection.find().where(idxKey).gte(minValue).lte(maxValue).limit(docsPerIndexSide).exec();
1107
+ logger$2.debug(`Found ${candidates.length} candidates in index ${idxKey}`);
1108
+ for (const candidate of candidates) candidateIds.add(candidate.documentId);
1109
+ }
1110
+ } catch (error) {
1111
+ logger$2.error(`Failed to query vector index: ${error}`);
1112
+ throw new Error(`Vector index query failed: ${error}`);
1113
+ }
1114
+ logger$2.debug(`Total unique candidate IDs: ${candidateIds.size} (out of ${totalVectors.length} indexed vectors)`);
1115
+ const candidateVectors = [];
1116
+ try {
1117
+ for (const docId of candidateIds) {
1118
+ const vectorDocs = await this.vectorsCollection.find().where("documentId").eq(docId).exec();
1119
+ for (const vectorDoc of vectorDocs) {
1120
+ const vectorData = vectorDoc.toJSON();
1121
+ if (vectorData && vectorData.embedding) candidateVectors.push(vectorData);
1122
+ else logger$2.warn(`Invalid vector data for document ${docId}`);
1123
+ }
1124
+ }
1125
+ } catch (error) {
1126
+ logger$2.error(`Failed to fetch candidate vectors: ${error}`);
1127
+ throw new Error(`Failed to fetch candidate vectors: ${error}`);
1128
+ }
1129
+ logger$2.debug(`Fetched ${candidateVectors.length} candidate vectors`);
1130
+ const results = [];
1131
+ for (const vectorDoc of candidateVectors) {
1132
+ const similarity = (cosineSimilarity(queryEmbedding, vectorDoc.embedding) + 1) / 2;
1133
+ results.push({
1134
+ documentId: vectorDoc.documentId,
1135
+ similarity,
1136
+ chunkIndex: vectorDoc.chunkIndex,
1137
+ chunkStartOffset: vectorDoc.chunkStartOffset,
1138
+ chunkEndOffset: vectorDoc.chunkEndOffset
1139
+ });
1140
+ }
1141
+ results.sort((a, b) => b.similarity - a.similarity);
1142
+ logger$2.debug(`Computed similarities for ${results.length} candidates, top similarity: ${results[0]?.similarity || "N/A"}`);
1143
+ const topResults = results.slice(0, limit);
1144
+ const documentResults = [];
1145
+ for (const result of topResults) {
1146
+ const doc = await this.documentsCollection.findOne(result.documentId).exec();
1147
+ if (doc) {
1148
+ const document = doc.toJSON();
1149
+ if (options.workspacePath && document.workspacePath !== options.workspacePath) continue;
1150
+ if (options.fileType && document.fileType !== options.fileType) continue;
1151
+ documentResults.push({
1152
+ document,
1153
+ similarity: result.similarity,
1154
+ chunkIndex: result.chunkIndex,
1155
+ chunkStartOffset: result.chunkStartOffset,
1156
+ chunkEndOffset: result.chunkEndOffset
1157
+ });
1158
+ }
1159
+ }
1160
+ return documentResults;
1161
+ }
1162
/**
 * Index a file on behalf of a context (e.g. a chat session), tagging the
 * document with the context's tags.
 * Already-indexed documents are NOT re-read: only their tag set is extended
 * (union of existing + new tags). Unindexed files fall through to a full
 * indexDocument call.
 * @param {File} file - File to index.
 * @param {{tags?: string[]}} context - Context whose tags are applied.
 * @param {object} [options] - Extra index options; options.tags are also applied.
 * @returns {Promise<object>} The (possibly tag-updated) document.
 */
async indexFileInContext(file, context, options = {}) {
  if (!this.isInitialized) await this.initialize();
  this.ensureInitialized();
  const workspacePath = file.getWorkspace().getName();
  const filePath = file.getWorkspacePath();
  const id = this.generateDocumentId(workspacePath, filePath);
  const existing = await this.documentsCollection.findOne(id).exec();
  const existingDoc = existing ? existing.toJSON() : null;
  const contextTags = context.tags || [];
  const newTags = [...options.tags || [], ...contextTags];
  if (existingDoc) {
    const existingTags = existingDoc.metadata.tags || [];
    const mergedTags = [...new Set([...existingTags, ...newTags])];
    // Only persist when the union actually adds a tag.
    if (mergedTags.length !== existingTags.length || newTags.some((tag) => !existingTags.includes(tag))) {
      await this.updateDocumentMetadata(existingDoc.id, { metadata: {
        ...existingDoc.metadata,
        tags: mergedTags
      } });
      logger$2.debug(`Added tags to existing document: ${id}`);
      // Return the updated shape without re-fetching from the collection.
      return {
        ...existingDoc,
        metadata: {
          ...existingDoc.metadata,
          tags: mergedTags
        }
      };
    } else {
      logger$2.debug(`Document already has all tags: ${id}`);
      return existingDoc;
    }
  }
  // Not indexed yet: do a full index with the combined tag set.
  return this.indexDocument(file, {
    ...options,
    tags: newTags
  });
}
+ async indexFilesInContext(files, context, options = {}) {
1199
+ let succeeded = 0;
1200
+ let failed = 0;
1201
+ const contextTags = context.tags || [];
1202
+ for (const file of files) try {
1203
+ await this.indexDocument(file, {
1204
+ ...options,
1205
+ tags: [...options.tags || [], ...contextTags]
1206
+ });
1207
+ succeeded++;
1208
+ logger$2.debug(`Indexed file with context tags: ${file.getWorkspacePath()}`);
1209
+ } catch (error) {
1210
+ logger$2.error(`Failed to index file ${file.getWorkspacePath()}: ${error}`);
1211
+ failed++;
1212
+ }
1213
+ return {
1214
+ succeeded,
1215
+ failed
1216
+ };
1217
+ }
1218
+ async reindexFileInContext(file, context, options = {}) {
1219
+ const contextTags = context.tags || [];
1220
+ return this.reindexDocument(file, {
1221
+ ...options,
1222
+ tags: [...options.tags || [], ...contextTags]
1223
+ });
1224
+ }
1225
+ async removeFileFromContext(file, context) {
1226
+ if (!this.isInitialized) await this.initialize();
1227
+ this.ensureInitialized();
1228
+ const document = await this.getDocumentByPath(file.getWorkspace().getName(), file.getWorkspacePath());
1229
+ if (document && context.tags && context.tags.length > 0) {
1230
+ const contextTags = new Set(context.tags);
1231
+ const updatedTags = (document.metadata.tags || []).filter((tag) => !contextTags.has(tag));
1232
+ if (updatedTags.length !== document.metadata.tags?.length) await this.updateDocumentMetadata(document.id, { metadata: {
1233
+ ...document.metadata,
1234
+ tags: updatedTags
1235
+ } });
1236
+ }
1237
+ }
1238
+ async clearContext(context) {
1239
+ if (!context.tags || context.tags.length === 0) return;
1240
+ const contextTags = new Set(context.tags);
1241
+ const allDocs = await this.listDocuments();
1242
+ for (const doc of allDocs) if (doc.metadata.tags?.some((tag) => contextTags.has(tag))) {
1243
+ const updatedTags = doc.metadata.tags.filter((tag) => !contextTags.has(tag));
1244
+ try {
1245
+ const workspace = await workspaceService.getWorkspace();
1246
+ if (workspace && workspace.getName() === doc.workspacePath) {
1247
+ const resource = await workspace.getResource(doc.filePath);
1248
+ if (resource instanceof File) await this.indexDocument(resource, { tags: updatedTags });
1249
+ }
1250
+ } catch (error) {
1251
+ logger$2.warn(`Failed to clear context tags from ${doc.filePath}: ${error}`);
1252
+ }
1253
+ }
1254
+ }
1255
+ async getFilePathsInContext(context) {
1256
+ if (!context.tags || context.tags.length === 0) return [];
1257
+ const contextTags = new Set(context.tags);
1258
+ return (await this.listDocuments()).filter((doc) => doc.metadata.tags?.some((tag) => contextTags.has(tag))).map((doc) => doc.filePath);
1259
+ }
1260
+ };
1261
// Shared singleton, also registered in the root DI context so other
// extensions can resolve it by name.
var documentIndexService = new DocumentIndexService();
rootContext.put("documentIndexService", documentIndexService);
+ //#endregion
1264
+ //#region src/utils/workspace-utils.ts
1265
var logger$1 = createLogger("WorkspaceUtils");
/**
 * Resolve the currently connected workspace and the workspace path to use.
 * @param {string} [providedPath] - Optional explicit path; falls back to the
 *   connected workspace's name when omitted or empty.
 * @returns {Promise<{workspace: object, workspacePath: string} | null>}
 *   null when no workspace is connected or no usable path exists.
 */
async function getWorkspacePath(providedPath) {
  const workspace = await workspaceService.getWorkspace();
  if (!workspace) {
    logger$1.warn("No workspace connected");
    return null;
  }
  // `||` (not `??`): an empty-string providedPath also falls back to the name.
  const workspacePath = providedPath || workspace.getName();
  if (!workspacePath) {
    logger$1.warn("No workspace path available");
    return null;
  }
  return {
    workspace,
    workspacePath
  };
}
+ //#endregion
1283
+ //#region src/utils/query-utils.ts
1284
var DEFAULT_MIN_TERM_LENGTH = 2;
/**
 * Split a free-text query into search terms.
 * The query is lower-cased unless options.caseSensitive is set, split on
 * whitespace, and terms shorter than options.minTermLength (default 2) are
 * discarded. Blank queries yield an empty array.
 * @param {string} query
 * @param {{minTermLength?: number, caseSensitive?: boolean}} [options]
 * @returns {string[]}
 */
function extractQueryTerms(query, options = {}) {
  if (!query || !query.trim()) return [];
  const minLength = options.minTermLength ?? DEFAULT_MIN_TERM_LENGTH;
  const normalized = options.caseSensitive ? query : query.toLowerCase();
  const tokens = normalized.split(/\s+/);
  return tokens.filter((term) => term.length >= minLength);
}
/**
 * Canonical form of a query: lower-cased and stripped of surrounding whitespace.
 * @param {string} query
 * @returns {string}
 */
function normalizeQuery(query) {
  const lowered = query.toLowerCase();
  return lowered.trim();
}
+ //#endregion
1294
+ //#region src/utils/snippet-extractor.ts
1295
var DEFAULT_MAX_SNIPPETS = 10;
var DEFAULT_SNIPPET_LENGTH = 400;
var DEFAULT_MIN_GAP = 400;
// Extracts short, high-signal text excerpts ("snippets") around query-term
// matches so search results can show relevant context instead of whole files.
var SnippetExtractor = class {
  constructor(options = {}) {
    // maxSnippets: default cap on returned snippets per call.
    // snippetLength: target window size (chars) around a match.
    // minGap: minimum char distance required between two selected snippets.
    this.maxSnippets = options.maxSnippets ?? DEFAULT_MAX_SNIPPETS;
    this.snippetLength = options.snippetLength ?? DEFAULT_SNIPPET_LENGTH;
    this.minGap = options.minGap ?? DEFAULT_MIN_GAP;
  }
  // Find every occurrence of every term (case-insensitive), build a window of
  // ~snippetLength chars centered on each match, score the windows, then keep
  // the best non-overlapping ones (up to maxSnippets).
  extractSnippets(content, queryTerms, maxSnippets) {
    const effectiveMaxSnippets = maxSnippets ?? this.maxSnippets;
    if (queryTerms.length === 0) return [];
    const contentLower = content.toLowerCase();
    const snippetCandidates = [];
    const seenSnippets = /* @__PURE__ */ new Set();
    for (const term of queryTerms) {
      let index = contentLower.indexOf(term);
      while (index !== -1) {
        // Center the window on the match; clamp to content bounds.
        const start = Math.max(0, index - this.snippetLength / 2);
        const end = Math.min(content.length, index + term.length + this.snippetLength / 2);
        const snippet = content.substring(start, end).trim();
        // Deduplicate by character range, not by text.
        const snippetKey = `${start}-${end}`;
        if (snippet && !seenSnippets.has(snippetKey)) {
          seenSnippets.add(snippetKey);
          const score = this.calculateSnippetScore(snippet, queryTerms);
          snippetCandidates.push({
            snippet,
            score,
            start
          });
        }
        index = contentLower.indexOf(term, index + 1);
      }
    }
    // Fallback: if no term produced a candidate, take one wider window around
    // the first term's first occurrence (if it exists at all).
    if (snippetCandidates.length === 0 && queryTerms.length > 0) {
      const firstTerm = queryTerms[0];
      const index = contentLower.indexOf(firstTerm);
      if (index !== -1) {
        const start = Math.max(0, index - this.snippetLength);
        const end = Math.min(content.length, index + firstTerm.length + this.snippetLength);
        const snippet = content.substring(start, end).trim();
        if (snippet) snippetCandidates.push({
          snippet,
          score: 10,
          start
        });
      }
    }
    // Highest score first; ties broken by earliest position in the document.
    snippetCandidates.sort((a, b) => {
      if (b.score !== a.score) return b.score - a.score;
      return a.start - b.start;
    });
    return this.selectNonOverlappingSnippets(snippetCandidates, effectiveMaxSnippets);
  }
  // Heuristic relevance score combining: per-term frequency, a full-phrase
  // bonus, the number of distinct terms present, and term-coverage density.
  calculateSnippetScore(snippet, queryTerms) {
    const snippetLower = snippet.toLowerCase();
    let score = 0;
    for (const qTerm of queryTerms) {
      // Escape the term so it is matched literally inside the regex.
      const termMatches = (snippetLower.match(new RegExp(qTerm.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"), "gi")) || []).length;
      score += termMatches * 10;
    }
    const fullQuery = queryTerms.join(" ");
    if (snippetLower.includes(fullQuery)) score += 50;
    const uniqueTermsFound = queryTerms.filter((t) => snippetLower.includes(t)).length;
    score += uniqueTermsFound * 20;
    const termDensity = uniqueTermsFound / queryTerms.length;
    score += termDensity * 30;
    return score;
  }
  // Greedy selection in score order: accept a candidate only if its range is
  // at least minGap chars away from every already-accepted snippet's range.
  selectNonOverlappingSnippets(candidates, maxSnippets) {
    const selectedSnippets = [];
    const usedRanges = [];
    for (const candidate of candidates) {
      if (selectedSnippets.length >= maxSnippets) break;
      const candidateStart = candidate.start;
      const candidateEnd = candidate.start + candidate.snippet.length;
      if (!usedRanges.some((range) => {
        // true when the candidate is within minGap of this used range.
        return !(candidateEnd < range.start - this.minGap || candidateStart > range.end + this.minGap);
      })) {
        selectedSnippets.push(candidate.snippet);
        usedRanges.push({
          start: candidateStart,
          end: candidateEnd
        });
      }
    }
    return selectedSnippets;
  }
  // Plain prefix of the content, with "..." appended when truncated.
  extractSimpleSnippet(content, maxLength = 500) {
    return content.substring(0, maxLength) + (content.length > maxLength ? "..." : "");
  }
  // Returns the character ranges (start/end/matchIndex objects, NOT text)
  // around each case-insensitive occurrence of the whole query string.
  extractContextSnippets(content, query, contextLength = 150) {
    const queryLower = query.toLowerCase();
    const contentLower = content.toLowerCase();
    const matches = [];
    let index = contentLower.indexOf(queryLower);
    while (index !== -1) {
      matches.push(index);
      index = contentLower.indexOf(queryLower, index + 1);
    }
    if (matches.length === 0) return [];
    const snippets = [];
    for (const matchIndex of matches) {
      const start = Math.max(0, matchIndex - contextLength);
      const end = Math.min(content.length, matchIndex + query.length + contextLength);
      snippets.push({
        start,
        end,
        matchIndex
      });
    }
    return snippets;
  }
};
+ new SnippetExtractor();
1410
+ //#endregion
1411
+ //#region src/services/relevance-calculator.ts
1412
// Text-match relevance scoring for indexed documents (used by the fallback
// keyword search when vector search is unavailable).
var RelevanceCalculator = class {
  /**
   * Score how well a document matches a free-text query.
   * Combines per-term hits on file name / path / content, exact substring
   * matches of the whole query, exact-phrase frequency in the content, and
   * overall term coverage — each weighted via RELEVANCE_WEIGHTS.
   * @param {object} doc - Indexed document (fileName, filePath, content).
   * @param {string} query - Free-text query.
   * @returns {number} Weighted relevance score.
   */
  calculateRelevance(doc, query) {
    const queryLower = normalizeQuery(query);
    const queryTerms = extractQueryTerms(query);
    const fileNameLower = doc.fileName.toLowerCase();
    const filePathLower = doc.filePath.toLowerCase();
    const contentLower = doc.content.toLowerCase();
    const countHits = (haystack) => queryTerms.filter((term) => haystack.includes(term)).length;
    const contentHits = countHits(contentLower);
    let score = 0;
    score += countHits(fileNameLower) * RELEVANCE_WEIGHTS.FILE_NAME_MATCH;
    score += countHits(filePathLower) * RELEVANCE_WEIGHTS.FILE_PATH_MATCH;
    score += contentHits * RELEVANCE_WEIGHTS.CONTENT_MATCH;
    if (fileNameLower.includes(queryLower)) score += RELEVANCE_WEIGHTS.FILE_NAME_EXACT;
    if (filePathLower.includes(queryLower)) score += RELEVANCE_WEIGHTS.FILE_PATH_EXACT;
    // Escape the query so it is matched literally as a phrase.
    const escapedQuery = queryLower.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    const exactPhraseMatches = (contentLower.match(new RegExp(escapedQuery, "g")) || []).length;
    score += exactPhraseMatches * RELEVANCE_WEIGHTS.EXACT_PHRASE;
    // Fraction of query terms found anywhere in the content.
    const termCoverage = queryTerms.length > 0 ? contentHits / queryTerms.length : 0;
    score += termCoverage * RELEVANCE_WEIGHTS.TERM_COVERAGE;
    return score;
  }
  /**
   * Relevance of a document for a file-name lookup: 100 for an exact
   * (case-insensitive) match, 80 for a substring match, 0 otherwise.
   * @param {object} doc - Indexed document (fileName).
   * @param {string} fileName - Name being searched for.
   * @returns {number}
   */
  calculateFileNameRelevance(doc, fileName) {
    const docNameLower = doc.fileName.toLowerCase();
    const targetLower = fileName.toLowerCase();
    if (docNameLower === targetLower) return 100;
    return docNameLower.includes(targetLower) ? 80 : 0;
  }
};
var relevanceCalculator = new RelevanceCalculator();
+ //#endregion
1446
+ //#region src/services/rag-result-formatter.ts
1447
// Renders RAG search results for three consumers: LLM prompt context,
// UI search results, and command output.
var RAGResultFormatter = class {
  constructor(snippetExtractor) {
    // Used for content previews when a result has no matched snippets.
    this.snippetExtractor = snippetExtractor;
  }
  // Build the prompt-context string injected ahead of the user's question.
  // The literal line breaks and leading spaces inside the template are part of
  // the emitted prompt text — do not reformat them.
  formatRAGContext(results) {
    if (results.length === 0) return "";
    return `Here are relevant documents from the workspace that might help answer the question:

${results.map((result, idx) => {
      const doc = result.document;
      const snippets = result.matchedSnippets.map((s, i) => `  [Snippet ${i + 1}]\n  ${s}`).join("\n\n");
      return `[Document ${idx + 1}: ${doc.fileName} (${doc.filePath})]
Relevance: ${result.relevance.toFixed(2)}
${snippets.length > 0 ? `Relevant snippets:\n${snippets}` : `Content preview: ${this.snippetExtractor.extractSimpleSnippet(doc.content, SNIPPET_LENGTHS.PREVIEW)}`}
---`;
    }).join("\n\n")}

Use the information from these documents to provide a helpful answer. Pay special attention to numbers, percentages, dates, and specific values mentioned in the snippets. If the documents don't contain relevant information, you can still answer based on your general knowledge.`;
  }
  // Shape results for UI display; relevance rendered to two decimal places.
  // NOTE(review): extractSimpleSnippet already appends "..." when it
  // truncates, so `preview` can end with six dots — confirm intended.
  formatSearchResults(results) {
    return results.map((r) => ({
      file: r.document.fileName,
      path: r.document.filePath,
      relevance: r.relevance.toFixed(2),
      language: r.document.metadata.language,
      size: r.document.metadata.size,
      snippets: r.matchedSnippets,
      preview: this.snippetExtractor.extractSimpleSnippet(r.document.content, 200) + "..."
    }));
  }
  // Compact shape used by command handlers; relevance left numeric.
  formatCommandResults(results) {
    return results.map((r) => ({
      file: r.document.fileName,
      path: r.document.filePath,
      relevance: r.relevance,
      snippets: r.matchedSnippets
    }));
  }
};
+ //#endregion
1487
+ //#region src/rag-service.ts
1488
var logger = createLogger("RAGService");
// Module-level collaborators shared by RAGService below: the snippet
// extractor and the formatter that renders results around it.
var snippetExtractor = new SnippetExtractor();
var resultFormatter = new RAGResultFormatter(snippetExtractor);
+ var RAGService = class {
1492
+ async searchDocuments(query, options = {}) {
1493
+ const limit = Math.min(options.limit || SEARCH_CONFIG.DEFAULT_LIMIT, SEARCH_CONFIG.MAX_LIMIT);
1494
+ const workspaceResult = await getWorkspacePath(options.workspacePath);
1495
+ if (!workspaceResult) {
1496
+ logger.warn("No workspace connected, cannot search documents");
1497
+ return [];
1498
+ }
1499
+ const { workspacePath } = workspaceResult;
1500
+ if (options.filePath) {
1501
+ const specificDoc = await documentIndexService.getDocumentByPath(workspacePath, options.filePath);
1502
+ if (specificDoc) {
1503
+ const queryTerms = extractQueryTerms(query);
1504
+ return [{
1505
+ document: specificDoc,
1506
+ relevance: 100,
1507
+ matchedSnippets: queryTerms.length > 0 ? snippetExtractor.extractSnippets(specificDoc.content, queryTerms, 3) : [snippetExtractor.extractSimpleSnippet(specificDoc.content, SNIPPET_LENGTHS.PREVIEW)]
1508
+ }];
1509
+ }
1510
+ logger.warn(`Document not found: ${options.filePath}`);
1511
+ return [];
1512
+ }
1513
+ if (options.fileName) {
1514
+ const allDocs = await documentIndexService.listDocuments(workspacePath);
1515
+ const fileNameLower = options.fileName.toLowerCase();
1516
+ const matchingDocs = allDocs.filter((doc) => {
1517
+ const docNameLower = doc.fileName.toLowerCase();
1518
+ return docNameLower === fileNameLower || docNameLower.includes(fileNameLower);
1519
+ });
1520
+ if (matchingDocs.length > 0) {
1521
+ const results = [];
1522
+ const queryTerms = extractQueryTerms(query);
1523
+ for (const doc of matchingDocs.slice(0, limit)) {
1524
+ const matchedSnippets = queryTerms.length > 0 ? snippetExtractor.extractSnippets(doc.content, queryTerms, 3) : [snippetExtractor.extractSimpleSnippet(doc.content, SNIPPET_LENGTHS.PREVIEW)];
1525
+ results.push({
1526
+ document: doc,
1527
+ relevance: relevanceCalculator.calculateFileNameRelevance(doc, options.fileName),
1528
+ matchedSnippets
1529
+ });
1530
+ }
1531
+ return results;
1532
+ }
1533
+ logger.warn(`No documents found with name: ${options.fileName}`);
1534
+ return [];
1535
+ }
1536
+ if (!query || !query.trim()) {
1537
+ logger.warn("No query provided and no filePath/fileName specified");
1538
+ return [];
1539
+ }
1540
+ try {
1541
+ const vectorResults = await documentIndexService.searchSimilar(query, {
1542
+ limit: limit * 2,
1543
+ workspacePath,
1544
+ fileType: options.fileType
1545
+ });
1546
+ const results = [];
1547
+ for (const { document, similarity, chunkStartOffset, chunkEndOffset } of vectorResults) {
1548
+ if (!this.matchesContextScope(document, options.documentSearchScope)) continue;
1549
+ const relevance = similarity * 100;
1550
+ if (options.minRelevance && relevance < options.minRelevance) continue;
1551
+ const queryLower = normalizeQuery(query);
1552
+ const queryTerms = extractQueryTerms(query);
1553
+ let matchedSnippets;
1554
+ if (chunkStartOffset !== void 0 && chunkEndOffset !== void 0) {
1555
+ const chunkText = document.content.substring(Math.max(0, chunkStartOffset), Math.min(document.content.length, chunkEndOffset));
1556
+ if (chunkText.trim().length > 0) matchedSnippets = [chunkText.trim()];
1557
+ else matchedSnippets = snippetExtractor.extractSnippets(document.content, queryTerms, 15);
1558
+ } else matchedSnippets = snippetExtractor.extractSnippets(document.content, queryTerms, 15);
1559
+ if (matchedSnippets.length === 0 && queryLower.length > 0) if (chunkStartOffset !== void 0 && chunkEndOffset !== void 0) {
1560
+ const chunkText = document.content.substring(Math.max(0, chunkStartOffset), Math.min(document.content.length, chunkEndOffset));
1561
+ if (chunkText.trim().length > 0) matchedSnippets = [chunkText.trim()];
1562
+ else matchedSnippets = snippetExtractor.extractSnippets(document.content, [queryLower], 10);
1563
+ } else matchedSnippets = snippetExtractor.extractSnippets(document.content, [queryLower], 10);
1564
+ results.push({
1565
+ document,
1566
+ relevance,
1567
+ matchedSnippets
1568
+ });
1569
+ if (results.length >= limit) break;
1570
+ }
1571
+ if (results.length === 0) {
1572
+ logger.debug("Vector search returned no results, falling back to text search");
1573
+ return this.fallbackTextSearch(query, options);
1574
+ }
1575
+ return results;
1576
+ } catch (error) {
1577
+ logger.warn(`Vector search failed, falling back to text search: ${error}`);
1578
+ logger.debug(`Vector search error details: ${error}`);
1579
+ return this.fallbackTextSearch(query, options);
1580
+ }
1581
+ }
1582
+ async fallbackTextSearch(query, options = {}) {
1583
+ const limit = Math.min(options.limit || SEARCH_CONFIG.DEFAULT_LIMIT, SEARCH_CONFIG.MAX_LIMIT);
1584
+ const workspaceResult = await getWorkspacePath(options.workspacePath);
1585
+ if (!workspaceResult) {
1586
+ logger.warn("No workspace connected, cannot perform text search");
1587
+ return [];
1588
+ }
1589
+ const { workspacePath } = workspaceResult;
1590
+ const allDocuments = await documentIndexService.listDocuments(workspacePath);
1591
+ const queryTerms = extractQueryTerms(query);
1592
+ const results = [];
1593
+ for (const doc of allDocuments) {
1594
+ if (options.fileType && doc.fileType !== options.fileType) continue;
1595
+ if (!this.matchesContextScope(doc, options.documentSearchScope)) continue;
1596
+ const relevance = relevanceCalculator.calculateRelevance(doc, query);
1597
+ if (options.minRelevance && relevance < options.minRelevance) continue;
1598
+ const matchedSnippets = snippetExtractor.extractSnippets(doc.content, queryTerms, 3);
1599
+ results.push({
1600
+ document: doc,
1601
+ relevance,
1602
+ matchedSnippets
1603
+ });
1604
+ }
1605
+ results.sort((a, b) => b.relevance - a.relevance);
1606
+ return results.slice(0, limit);
1607
+ }
1608
+ formatRAGContext(results) {
1609
+ return resultFormatter.formatRAGContext(results);
1610
+ }
1611
+ matchesContextScope(doc, scope) {
1612
+ if (!scope) return true;
1613
+ if (scope.includePaths && scope.includePaths.length > 0) {
1614
+ if (!scope.includePaths.some((pattern) => {
1615
+ if (pattern.includes("*") || pattern.includes("?")) return new RegExp("^" + pattern.replace(/\*/g, ".*").replace(/\?/g, ".") + "$").test(doc.filePath);
1616
+ return doc.filePath.startsWith(pattern) || doc.filePath === pattern;
1617
+ })) return false;
1618
+ }
1619
+ if (scope.excludePaths && scope.excludePaths.length > 0) {
1620
+ if (scope.excludePaths.some((pattern) => {
1621
+ if (pattern.includes("*") || pattern.includes("?")) return new RegExp("^" + pattern.replace(/\*/g, ".*").replace(/\?/g, ".") + "$").test(doc.filePath);
1622
+ return doc.filePath.startsWith(pattern) || doc.filePath === pattern;
1623
+ })) return false;
1624
+ }
1625
+ if (scope.pathPattern) {
1626
+ if (!(scope.pathPattern instanceof RegExp ? scope.pathPattern : new RegExp(scope.pathPattern)).test(doc.filePath)) return false;
1627
+ }
1628
+ if (scope.tags && scope.tags.length > 0) {
1629
+ const docTags = doc.metadata.tags || [];
1630
+ if (!scope.tags.every((tag) => docTags.includes(tag))) return false;
1631
+ }
1632
+ if (scope.metadataFilter) {
1633
+ if (!scope.metadataFilter(doc)) return false;
1634
+ }
1635
+ return true;
1636
+ }
1637
+ };
1638
// Module-level singleton used by searchWorkspaceDocuments and re-exported below.
var ragService = new RAGService();
1639
/**
 * Module-level convenience wrapper around ragService.searchDocuments.
 *
 * Resolves the target workspace first; when no workspace is connected it
 * logs a warning and returns an empty result set instead of searching.
 *
 * @param {string} query - free-text search query
 * @param {object} [options] - search options; `workspacePath` is overridden
 *   with the resolved workspace path before delegating
 * @returns {Promise<Array>} search results, or [] when no workspace is connected
 */
async function searchWorkspaceDocuments(query, options = {}) {
	const workspace = await getWorkspacePath(options.workspacePath);
	if (workspace) {
		const searchOptions = {
			...options,
			workspacePath: workspace.workspacePath
		};
		return ragService.searchDocuments(query, searchOptions);
	}
	logger.warn("No workspace connected, cannot search documents");
	return [];
}
1650
+ //#endregion
1651
// Public export surface of this generated chunk; the single-letter aliases
// (a, c, d, ...) are the bundler's mangled export names.
export { getWorkspacePath as a, createIndexValuesFromArray as c, CONTENT_PREVIEW_LENGTHS as d, SEARCH_CONFIG as f, SnippetExtractor as i, euclideanDistance as l, embeddingService as m, searchWorkspaceDocuments as n, documentIndexService as o, SNIPPET_LENGTHS as p, RAGResultFormatter as r, cosineSimilarity as s, ragService as t, getIndexValueArray as u };
1652
+
1653
+ //# sourceMappingURL=rag-service-Chw9PNZn.js.map