@chatbot-packages/rag 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunking/index.d.ts +51 -0
- package/dist/chunking/index.js +248 -0
- package/dist/chunking/index.js.map +1 -0
- package/dist/embeddings/index.d.ts +103 -0
- package/dist/embeddings/index.js +195 -0
- package/dist/embeddings/index.js.map +1 -0
- package/dist/extractors/index.d.ts +95 -0
- package/dist/extractors/index.js +343 -0
- package/dist/extractors/index.js.map +1 -0
- package/dist/index.d.ts +78 -0
- package/dist/index.js +1576 -0
- package/dist/index.js.map +1 -0
- package/dist/retrieval/index.d.ts +65 -0
- package/dist/retrieval/index.js +144 -0
- package/dist/retrieval/index.js.map +1 -0
- package/dist/types-CjnplPJD.d.ts +242 -0
- package/dist/vectorstore/index.d.ts +109 -0
- package/dist/vectorstore/index.js +422 -0
- package/dist/vectorstore/index.js.map +1 -0
- package/package.json +83 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,1576 @@
|
|
|
1
|
+
// src/rag-service.ts
|
|
2
|
+
import { generateId as generateId2 } from "@chatbot-packages/utils";
|
|
3
|
+
import { OpenAIProvider, AnthropicProvider } from "@chatbot-packages/ai";
|
|
4
|
+
|
|
5
|
+
// src/extractors/base.ts
|
|
6
|
+
/**
 * Common base for document extractors. Stores the extractor options and
 * provides small path/text helpers shared by the concrete extractors.
 */
var BaseExtractor = class {
  options;
  /**
   * @param {object} options - Extractor options, stored unchanged on `this.options`.
   */
  constructor(options) {
    this.options = options;
  }
  /**
   * Get the lower-cased file extension of a path, without the leading dot.
   *
   * Only the final path segment is inspected, so a dot inside a directory
   * name (e.g. "/docs.v2/readme") is not mistaken for an extension.
   *
   * @param {string} path - File name or path ("/" and "\\" are both treated as separators).
   * @returns {string} The extension, or "" when the file name contains no dot.
   */
  getExtension(path) {
    const lastSeparator = Math.max(path.lastIndexOf("/"), path.lastIndexOf("\\"));
    const fileName = path.slice(lastSeparator + 1);
    const dotIndex = fileName.lastIndexOf(".");
    return dotIndex === -1 ? "" : fileName.slice(dotIndex + 1).toLowerCase();
  }
  /**
   * Normalize line endings: CRLF and lone CR both become LF.
   *
   * @param {string} text
   * @returns {string}
   */
  normalizeLineEndings(text) {
    return text.replace(/\r\n/g, "\n").replace(/\r/g, "\n");
  }
  /**
   * Clean excessive whitespace: runs of three or more newlines collapse to a
   * single blank line, runs of spaces/tabs collapse to one space, and the
   * result is trimmed.
   *
   * @param {string} text
   * @returns {string}
   */
  cleanWhitespace(text) {
    return text.replace(/\n{3,}/g, "\n\n").replace(/[ \t]+/g, " ").trim();
  }
};
|
|
25
|
+
|
|
26
|
+
// src/extractors/chm.ts
|
|
27
|
+
import { exec, execFile } from "child_process";
|
|
28
|
+
import { promisify } from "util";
|
|
29
|
+
import { readdir as readdir2, readFile as readFile2, mkdir, rm } from "fs/promises";
|
|
30
|
+
import { existsSync } from "fs";
|
|
31
|
+
import { join as join2, basename as basename2, dirname } from "path";
|
|
32
|
+
|
|
33
|
+
// src/extractors/html.ts
|
|
34
|
+
import { readFile, readdir, stat } from "fs/promises";
|
|
35
|
+
import { join, basename } from "path";
|
|
36
|
+
import * as cheerio from "cheerio";
|
|
37
|
+
import { convert } from "html-to-text";
|
|
38
|
+
/**
 * Extractor for HTML sources. Accepts a single HTML file or a directory that
 * is scanned recursively for `.html` / `.htm` files.
 */
var HTMLExtractor = class extends BaseExtractor {
  htmlOptions;
  /**
   * @param {object} options - Extractor options. They are spread over the
   *   HTML defaults below, so any key given here overrides the default.
   */
  constructor(options) {
    super(options);
    this.htmlOptions = {
      preserveHeadings: true,
      includeLinks: false,
      removeTags: ["script", "style", "nav", "footer", "header", "aside", "meta", "link"],
      ...options
    };
  }
  /** @returns {string} The extractor type identifier, "html". */
  getType() {
    return "html";
  }
  /**
   * @param {string} path - File name or path.
   * @returns {boolean} True when the extension is "html" or "htm".
   */
  canHandle(path) {
    const ext = this.getExtension(path);
    return ext === "html" || ext === "htm";
  }
  /**
   * Extract documents from `options.sourcePath`.
   *
   * @returns {Promise<object[]>} One document for a file; for a directory,
   *   every document found by the recursive scan.
   * @throws {Error} When the path is neither a regular file nor a directory.
   */
  async extract() {
    const { sourcePath } = this.options;
    const stats = await stat(sourcePath);
    if (stats.isFile()) {
      const content = await readFile(sourcePath, "utf-8");
      return [await this.extractFromString(content, sourcePath)];
    }
    if (stats.isDirectory()) {
      return this.extractFromDirectory(sourcePath);
    }
    throw new Error(`Invalid path: ${sourcePath}`);
  }
  /**
   * Convert an HTML string into a document record.
   *
   * The title is taken from `<title>`, then the first `<h1>`, then the file
   * name. Note that the tags in `removeTags` are removed before the title
   * lookup.
   *
   * @param {string} html - Raw HTML markup.
   * @param {string} filePath - Path recorded on the document and used for the title fallback.
   * @returns {Promise<object>} `{ path, title, content, format: "html", metadata }`.
   */
  async extractFromString(html, filePath) {
    const $ = cheerio.load(html);
    for (const tag of this.htmlOptions.removeTags || []) {
      $(tag).remove();
    }
    let title = $("title").text().trim();
    if (!title) {
      title = $("h1").first().text().trim();
    }
    if (!title) {
      // Strip ".html" as well as ".htm", in any letter case, so the
      // extension never leaks into the derived title.
      title = basename(filePath).replace(/\.html?$/i, "").replace(/-|_/g, " ");
    }
    const text = convert($.html(), {
      wordwrap: false,
      preserveNewlines: true,
      selectors: [
        { selector: "h1", options: { uppercase: false, prefix: "\n# " } },
        { selector: "h2", options: { uppercase: false, prefix: "\n## " } },
        { selector: "h3", options: { uppercase: false, prefix: "\n### " } },
        { selector: "h4", options: { uppercase: false, prefix: "\n#### " } },
        { selector: "h5", options: { uppercase: false, prefix: "\n##### " } },
        { selector: "h6", options: { uppercase: false, prefix: "\n###### " } },
        { selector: "ul", options: { itemPrefix: " - " } },
        { selector: "ol", options: { itemPrefix: " 1. " } },
        { selector: "table", format: "dataTable" },
        { selector: "a", options: { ignoreHref: !this.htmlOptions.includeLinks } },
        { selector: "img", format: "skip" }
      ]
    });
    const content = this.cleanWhitespace(this.normalizeLineEndings(text));
    return {
      path: filePath,
      title,
      content,
      format: "html",
      metadata: {
        originalLength: html.length,
        extractedLength: content.length
      }
    };
  }
  /**
   * Recursively extract every HTML file below `dir`.
   *
   * Files that fail to read or parse are logged with `console.warn` and
   * skipped; documents whose extracted content is empty are omitted.
   *
   * @param {string} dir - Directory to scan.
   * @returns {Promise<object[]>} Extracted documents.
   */
  async extractFromDirectory(dir) {
    const documents = [];
    const scanDir = async (currentDir) => {
      const entries = await readdir(currentDir, { withFileTypes: true });
      for (const entry of entries) {
        const fullPath = join(currentDir, entry.name);
        if (entry.isDirectory()) {
          await scanDir(fullPath);
        } else if (entry.isFile() && this.canHandle(entry.name)) {
          try {
            const content = await readFile(fullPath, "utf-8");
            const doc = await this.extractFromString(content, fullPath);
            if (doc.content.trim()) {
              documents.push(doc);
            }
          } catch (error) {
            console.warn(`Failed to extract ${fullPath}:`, error);
          }
        }
      }
    };
    await scanDir(dir);
    return documents;
  }
};
|
|
134
|
+
|
|
135
|
+
// src/extractors/chm.ts
|
|
136
|
+
var execAsync = promisify(exec);
// Shell-free runner: arguments are passed as a vector, so characters such as
// quotes, "$" or backticks in a path are never interpreted by a shell.
var execFileAsync = promisify(execFile);
/**
 * Extractor for CHM (compiled HTML help) files. The archive is unpacked with
 * the external `7z` / `7za` tool into a working directory, and each unpacked
 * HTML file is converted with `HTMLExtractor`.
 */
var CHMExtractor = class extends BaseExtractor {
  tempDir;
  /**
   * @param {object} options - Must provide `sourcePath`. `outputDir`, when
   *   given, is used as the working directory; otherwise
   *   `.chm-extract-temp` next to the source file is used.
   */
  constructor(options) {
    super(options);
    this.tempDir = options.outputDir || join2(dirname(options.sourcePath), ".chm-extract-temp");
  }
  /** @returns {string} The extractor type identifier, "chm". */
  getType() {
    return "chm";
  }
  /**
   * @param {string} path - File name or path.
   * @returns {boolean} True when the extension is "chm".
   */
  canHandle(path) {
    return this.getExtension(path) === "chm";
  }
  /**
   * Unpack the CHM file and extract one document per non-empty HTML page.
   *
   * HTML files that fail to read or parse are logged with `console.warn` and
   * skipped.
   *
   * @returns {Promise<object[]>} Documents with `format: "chm"` and metadata
   *   extended by `sourceChm` and `originalPath`.
   * @throws {Error} When the source file does not exist or unpacking fails.
   */
  async extract() {
    const { sourcePath } = this.options;
    if (!existsSync(sourcePath)) {
      throw new Error(`CHM file not found: ${sourcePath}`);
    }
    await mkdir(this.tempDir, { recursive: true });
    try {
      await this.extractWithSevenZip(sourcePath, this.tempDir);
      const htmlFiles = await this.findHTMLFiles(this.tempDir);
      const documents = [];
      for (const htmlFile of htmlFiles) {
        try {
          const content = await readFile2(htmlFile, "utf-8");
          const htmlExtractor = new HTMLExtractor({ sourcePath: htmlFile });
          const extracted = await htmlExtractor.extractFromString(content, htmlFile);
          if (extracted.content.trim()) {
            documents.push({
              ...extracted,
              format: "chm",
              metadata: {
                ...extracted.metadata,
                sourceChm: basename2(sourcePath),
                originalPath: htmlFile.replace(this.tempDir, "")
              }
            });
          }
        } catch (error) {
          console.warn(`Failed to parse HTML file: ${htmlFile}`, error);
        }
      }
      return documents;
    } finally {
      // NOTE: the working directory is always removed here, including when it
      // was supplied by the caller through `options.outputDir`.
      if (existsSync(this.tempDir)) {
        await rm(this.tempDir, { recursive: true, force: true });
      }
    }
  }
  /**
   * Unpack `chmPath` into `outputDir` using `7z`, falling back to `7za`.
   *
   * The tool is started without a shell, and `--` ends switch parsing so a
   * path that begins with "-" is still treated as the archive name.
   *
   * @param {string} chmPath - Path of the CHM archive.
   * @param {string} outputDir - Directory to unpack into.
   * @throws {Error} When both tools fail; the message carries the error of
   *   the first (`7z`) attempt.
   */
  async extractWithSevenZip(chmPath, outputDir) {
    const args = ["x", "-y", `-o${outputDir}`, "--", chmPath];
    const execOptions = {
      maxBuffer: 50 * 1024 * 1024
      // 50MB buffer
    };
    try {
      await execFileAsync("7z", args, execOptions);
    } catch (error) {
      try {
        await execFileAsync("7za", args, execOptions);
      } catch {
        throw new Error(
          `Failed to extract CHM file. Please ensure 7z is installed.
- On Ubuntu/Debian: sudo apt-get install p7zip-full
- On macOS: brew install p7zip
- On Windows: Install 7-Zip from https://7-zip.org/
Original error: ${error}`
        );
      }
    }
  }
  /**
   * Recursively collect `.html` / `.htm` files below `dir`, skipping
   * directories whose names start with "$" or "#".
   *
   * @param {string} dir - Directory to scan.
   * @returns {Promise<string[]>} Sorted list of file paths.
   */
  async findHTMLFiles(dir) {
    const htmlFiles = [];
    const scanDir = async (currentDir) => {
      const entries = await readdir2(currentDir, { withFileTypes: true });
      for (const entry of entries) {
        const fullPath = join2(currentDir, entry.name);
        if (entry.isDirectory()) {
          if (!entry.name.startsWith("$") && !entry.name.startsWith("#")) {
            await scanDir(fullPath);
          }
        } else if (entry.isFile()) {
          const ext = this.getExtension(entry.name);
          if (ext === "html" || ext === "htm") {
            htmlFiles.push(fullPath);
          }
        }
      }
    };
    await scanDir(dir);
    return htmlFiles.sort();
  }
};
|
|
230
|
+
|
|
231
|
+
// src/extractors/markdown.ts
|
|
232
|
+
import { readFile as readFile3, readdir as readdir3, stat as stat2 } from "fs/promises";
|
|
233
|
+
import { join as join3, basename as basename3 } from "path";
|
|
234
|
+
/**
 * Extractor for Markdown sources. Accepts a single file or a directory that
 * is scanned recursively for `.md` / `.markdown` files.
 */
var MarkdownExtractor = class extends BaseExtractor {
  /** @returns {string} The extractor type identifier, "markdown". */
  getType() {
    return "markdown";
  }
  /**
   * @param {string} path - File name or path.
   * @returns {boolean} True when the extension is "md" or "markdown".
   */
  canHandle(path) {
    const ext = this.getExtension(path);
    return ext === "md" || ext === "markdown";
  }
  /**
   * Extract documents from `options.sourcePath`.
   *
   * @returns {Promise<object[]>} One document for a file; for a directory,
   *   every document found by the recursive scan.
   * @throws {Error} When the path is neither a regular file nor a directory.
   */
  async extract() {
    const { sourcePath } = this.options;
    const stats = await stat2(sourcePath);
    if (stats.isFile()) {
      return [await this.extractFromFile(sourcePath)];
    }
    if (stats.isDirectory()) {
      return this.extractFromDirectory(sourcePath);
    }
    throw new Error(`Invalid path: ${sourcePath}`);
  }
  /**
   * Read one Markdown file and build its document record.
   *
   * @param {string} filePath - File to read as UTF-8.
   * @returns {Promise<object>} `{ path, title, content, format: "markdown", metadata }`.
   */
  async extractFromFile(filePath) {
    const content = await readFile3(filePath, "utf-8");
    const normalizedContent = this.normalizeLineEndings(content);
    let title = this.extractTitle(normalizedContent);
    if (!title) {
      // Strip ".md" as well as ".markdown", in any letter case, so the
      // extension never leaks into the derived title.
      title = basename3(filePath).replace(/\.(?:md|markdown)$/i, "").replace(/-|_/g, " ");
    }
    return {
      path: filePath,
      title,
      content: this.cleanWhitespace(normalizedContent),
      format: "markdown",
      metadata: {
        originalLength: content.length
      }
    };
  }
  /**
   * Find a title in Markdown text: the first ATX "# Heading" line, otherwise
   * the first line underlined with "=" characters.
   *
   * @param {string} content - Markdown text with LF line endings.
   * @returns {string|undefined} The trimmed title, or undefined when none is found.
   */
  extractTitle(content) {
    const h1Match = content.match(/^#\s+(.+)$/m);
    if (h1Match) {
      return h1Match[1].trim();
    }
    const underlineMatch = content.match(/^(.+)\n=+\s*$/m);
    if (underlineMatch) {
      return underlineMatch[1].trim();
    }
    return void 0;
  }
  /**
   * Recursively extract every Markdown file below `dir`, skipping
   * dot-directories and `node_modules`.
   *
   * Files that fail to read are logged with `console.warn` and skipped;
   * documents whose content is empty are omitted.
   *
   * @param {string} dir - Directory to scan.
   * @returns {Promise<object[]>} Extracted documents.
   */
  async extractFromDirectory(dir) {
    const documents = [];
    const scanDir = async (currentDir) => {
      const entries = await readdir3(currentDir, { withFileTypes: true });
      for (const entry of entries) {
        const fullPath = join3(currentDir, entry.name);
        if (entry.isDirectory()) {
          if (!entry.name.startsWith(".") && entry.name !== "node_modules") {
            await scanDir(fullPath);
          }
        } else if (entry.isFile() && this.canHandle(entry.name)) {
          try {
            const doc = await this.extractFromFile(fullPath);
            if (doc.content.trim()) {
              documents.push(doc);
            }
          } catch (error) {
            console.warn(`Failed to extract ${fullPath}:`, error);
          }
        }
      }
    };
    await scanDir(dir);
    return documents;
  }
};
|
|
307
|
+
|
|
308
|
+
// src/extractors/index.ts
|
|
309
|
+
import { stat as stat3 } from "fs/promises";
|
|
310
|
+
/**
 * Extract documents from a file or a directory, choosing the extractor by
 * file extension.
 *
 * - File: `.chm`, `.html`/`.htm` and `.md`/`.markdown` are supported.
 * - Directory: HTML documents are collected first, then Markdown documents.
 *
 * @param {string} sourcePath - File or directory to extract from.
 * @returns {Promise<object[]>} Extracted documents.
 * @throws {Error} For an unsupported file extension, or a path that is
 *   neither a regular file nor a directory.
 */
async function extractDocuments(sourcePath) {
  const stats = await stat3(sourcePath);
  if (stats.isFile()) {
    // Derive the extension from the final path segment only, so dots in
    // directory names and extension-less files are handled correctly.
    const lastSeparator = Math.max(sourcePath.lastIndexOf("/"), sourcePath.lastIndexOf("\\"));
    const fileName = sourcePath.slice(lastSeparator + 1);
    const dotIndex = fileName.lastIndexOf(".");
    const ext = dotIndex === -1 ? "" : fileName.slice(dotIndex + 1).toLowerCase();
    if (ext === "chm") {
      const extractor = new CHMExtractor({ sourcePath });
      return extractor.extract();
    }
    if (ext === "html" || ext === "htm") {
      const extractor = new HTMLExtractor({ sourcePath });
      return extractor.extract();
    }
    if (ext === "md" || ext === "markdown") {
      const extractor = new MarkdownExtractor({ sourcePath });
      return extractor.extract();
    }
    throw new Error(`Unsupported file format: ${ext || "(no extension)"}`);
  }
  if (stats.isDirectory()) {
    const allDocuments = [];
    const htmlExtractor = new HTMLExtractor({ sourcePath });
    const htmlDocs = await htmlExtractor.extract();
    allDocuments.push(...htmlDocs);
    const mdExtractor = new MarkdownExtractor({ sourcePath });
    const mdDocs = await mdExtractor.extract();
    allDocuments.push(...mdDocs);
    return allDocuments;
  }
  throw new Error(`Invalid path: ${sourcePath}`);
}
|
|
340
|
+
|
|
341
|
+
// src/chunking/header-aware-chunker.ts
|
|
342
|
+
import { generateId } from "@chatbot-packages/utils";
|
|
343
|
+
/**
 * Splits document text into chunks along Markdown heading boundaries, then
 * by paragraph and sentence when a section is too large. Sizes are measured
 * in estimated tokens (see `estimateTokens`).
 */
var HeaderAwareChunker = class {
  options;
  /**
   * @param {object} [options] - Overrides for the defaults below
   *   (`chunkSize`, `chunkOverlap`, `minChunkSize`, `maxChunkSize`,
   *   `respectHeadings`, `splitOnHeadings`).
   */
  constructor(options) {
    this.options = {
      chunkSize: 512,
      chunkOverlap: 50,
      minChunkSize: 100,
      maxChunkSize: 800,
      respectHeadings: true,
      splitOnHeadings: [1, 2],
      ...options
    };
  }
  /**
   * Chunk a document into smaller pieces.
   *
   * @param {object} document - Must provide `content`; `path` is recorded on each chunk.
   * @param {string} [documentId] - Id stored on every chunk; generated when omitted.
   * @returns {{chunks: object[], stats: object}} The chunks plus size
   *   statistics (`totalChunks`, `avgChunkSize`, `minChunkSize`,
   *   `maxChunkSize`) measured in characters of chunk text.
   */
  chunk(document, documentId) {
    const docId = documentId || generateId("doc");
    const { content } = document;
    if (!content.trim()) {
      return {
        chunks: [],
        stats: { totalChunks: 0, avgChunkSize: 0, minChunkSize: 0, maxChunkSize: 0 }
      };
    }
    const chunks = [];
    const sections = this.splitBySections(content);
    for (const section of sections) {
      const sectionChunks = this.chunkSection(section, docId, document.path);
      chunks.push(...sectionChunks);
    }
    const sizes = chunks.map((c) => c.text.length);
    const stats = {
      totalChunks: chunks.length,
      avgChunkSize: sizes.length > 0 ? Math.round(sizes.reduce((a, b) => a + b, 0) / sizes.length) : 0,
      minChunkSize: sizes.length > 0 ? Math.min(...sizes) : 0,
      maxChunkSize: sizes.length > 0 ? Math.max(...sizes) : 0
    };
    return { chunks, stats };
  }
  /**
   * Split content by heading boundaries.
   *
   * A new section starts at every heading whose level is listed in
   * `options.splitOnHeadings`. Each section carries the heading context
   * (`h1`, `h2`, `h3`, `sectionPath`) in effect for its text.
   *
   * @param {string} content - Text with LF line endings.
   * @returns {{text: string, context: object}[]} Sections in document order.
   */
  splitBySections(content) {
    const lines = content.split("\n");
    const sections = [];
    let currentContext = { sectionPath: "" };
    let currentText = [];
    const flush = () => {
      if (currentText.length > 0) {
        sections.push({
          text: currentText.join("\n").trim(),
          context: { ...currentContext }
        });
        currentText = [];
      }
    };
    for (const line of lines) {
      const heading = this.parseHeading(line);
      if (heading && this.options.splitOnHeadings.includes(heading.level)) {
        flush();
        if (heading.level === 1) {
          // A new h1 starts a fresh context, dropping any previous h2/h3.
          currentContext = {
            h1: heading.text,
            sectionPath: heading.text
          };
        } else if (heading.level === 2) {
          currentContext = {
            ...currentContext,
            h2: heading.text,
            h3: void 0,
            sectionPath: currentContext.h1 ? `${currentContext.h1} > ${heading.text}` : heading.text
          };
        } else if (heading.level === 3) {
          // Rebuild the path from the ancestors instead of appending to the
          // previous path; appending would make sibling h3 headings
          // accumulate ("A > B > C1 > C2") and would lose the h1 when no h2
          // is present.
          currentContext = {
            ...currentContext,
            h3: heading.text,
            sectionPath: [currentContext.h1, currentContext.h2, heading.text].filter(Boolean).join(" > ")
          };
        }
      }
      // The heading line itself stays part of the section it opens.
      currentText.push(line);
    }
    flush();
    return sections;
  }
  /**
   * Parse a heading line of the form "#", "##", ... "######" followed by text.
   *
   * @param {string} line
   * @returns {{level: number, text: string}|null} Null when the line is not a heading.
   */
  parseHeading(line) {
    const match = line.match(/^(#{1,6})\s+(.+)$/);
    if (match) {
      return {
        level: match[1].length,
        text: match[2].trim()
      };
    }
    return null;
  }
  /**
   * Chunk a single section.
   *
   * A section that fits within `maxChunkSize` becomes one chunk, or no chunk
   * at all when it is below `minChunkSize`. Larger sections are packed
   * paragraph by paragraph up to `chunkSize`, with overlap carried over from
   * the previous chunk; paragraphs above `maxChunkSize` are split by
   * sentence. A trailing remainder below `minChunkSize` is not emitted.
   *
   * @param {{text: string, context: object}} section
   * @param {string} documentId
   * @param {string} sourcePath
   * @returns {object[]} Chunks for this section.
   */
  chunkSection(section, documentId, sourcePath) {
    const { text, context } = section;
    const chunks = [];
    if (this.estimateTokens(text) <= this.options.maxChunkSize) {
      if (this.estimateTokens(text) >= this.options.minChunkSize) {
        chunks.push(this.createChunk(text, context, documentId, sourcePath, 0));
      }
      return chunks;
    }
    const paragraphs = text.split(/\n\n+/);
    let currentChunk = [];
    let currentTokens = 0;
    for (const para of paragraphs) {
      const paraTokens = this.estimateTokens(para);
      if (paraTokens > this.options.maxChunkSize) {
        // Emit what has been collected so far, then split the oversized
        // paragraph on its own.
        if (currentChunk.length > 0) {
          chunks.push(
            this.createChunk(
              currentChunk.join("\n\n"),
              context,
              documentId,
              sourcePath,
              chunks.length
            )
          );
          currentChunk = [];
          currentTokens = 0;
        }
        const sentenceChunks = this.splitLargeParagraph(para);
        for (const sentenceChunk of sentenceChunks) {
          chunks.push(
            this.createChunk(sentenceChunk, context, documentId, sourcePath, chunks.length)
          );
        }
        continue;
      }
      if (currentTokens + paraTokens > this.options.chunkSize) {
        if (currentChunk.length > 0) {
          chunks.push(
            this.createChunk(
              currentChunk.join("\n\n"),
              context,
              documentId,
              sourcePath,
              chunks.length
            )
          );
          const overlapText = this.getOverlapText(currentChunk);
          currentChunk = overlapText ? [overlapText, para] : [para];
          currentTokens = this.estimateTokens(currentChunk.join("\n\n"));
        } else {
          currentChunk = [para];
          currentTokens = paraTokens;
        }
      } else {
        currentChunk.push(para);
        currentTokens += paraTokens;
      }
    }
    if (currentChunk.length > 0 && currentTokens >= this.options.minChunkSize) {
      chunks.push(
        this.createChunk(currentChunk.join("\n\n"), context, documentId, sourcePath, chunks.length)
      );
    }
    return chunks;
  }
  /**
   * Split a large paragraph into sentence-based chunks of at most
   * `chunkSize` estimated tokens (a single longer sentence is kept whole).
   *
   * @param {string} paragraph
   * @returns {string[]} Chunk texts.
   */
  splitLargeParagraph(paragraph) {
    // The second alternative keeps trailing text that has no terminal
    // punctuation; without it that text would be silently dropped.
    const sentences = paragraph.match(/[^.!?]+[.!?]+|[^.!?]+$/g) || [paragraph];
    const chunks = [];
    let currentChunk = [];
    let currentTokens = 0;
    for (const sentence of sentences) {
      const trimmed = sentence.trim();
      if (!trimmed) {
        continue;
      }
      const sentenceTokens = this.estimateTokens(sentence);
      if (currentTokens + sentenceTokens > this.options.chunkSize && currentChunk.length > 0) {
        chunks.push(currentChunk.join(" ").trim());
        currentChunk = [];
        currentTokens = 0;
      }
      currentChunk.push(trimmed);
      currentTokens += sentenceTokens;
    }
    if (currentChunk.length > 0) {
      chunks.push(currentChunk.join(" ").trim());
    }
    return chunks;
  }
  /**
   * Get overlap text from the previous chunk: as many trailing parts as fit
   * within `chunkOverlap` estimated tokens, in original order.
   *
   * @param {string[]} previousChunk - Parts (paragraphs) of the previous chunk.
   * @returns {string|null} Parts joined by a blank line, or null when overlap
   *   is disabled or not even the last part fits.
   */
  getOverlapText(previousChunk) {
    if (!this.options.chunkOverlap || previousChunk.length === 0) {
      return null;
    }
    const reversed = [...previousChunk].reverse();
    const overlapParts = [];
    let tokens = 0;
    for (const part of reversed) {
      const partTokens = this.estimateTokens(part);
      if (tokens + partTokens > this.options.chunkOverlap) {
        break;
      }
      overlapParts.unshift(part);
      tokens += partTokens;
    }
    return overlapParts.length > 0 ? overlapParts.join("\n\n") : null;
  }
  /**
   * Create a chunk object.
   *
   * @param {string} text - Chunk text (stored trimmed).
   * @param {object} context - Heading context (`sectionPath`, `h1`, `h2`, `h3`).
   * @param {string} documentId
   * @param {string} sourcePath
   * @param {number} index - Position of the chunk within its section.
   * @returns {object} `{ id, documentId, text, metadata, createdAt }`.
   */
  createChunk(text, context, documentId, sourcePath, index) {
    const metadata = {
      sectionPath: context.sectionPath,
      headingH1: context.h1,
      headingH2: context.h2,
      headingH3: context.h3,
      sourcePath,
      chunkIndex: index
    };
    return {
      id: generateId("chunk"),
      documentId,
      text: text.trim(),
      metadata,
      createdAt: /* @__PURE__ */ new Date()
    };
  }
  /**
   * Estimate token count (rough approximation: ~4 chars per token).
   *
   * @param {string} text
   * @returns {number}
   */
  estimateTokens(text) {
    return Math.ceil(text.length / 4);
  }
};
|
|
585
|
+
|
|
586
|
+
// src/embeddings/local.ts
|
|
587
|
+
var pipeline = null;
// Most recently loaded embedder; kept for anything that reads this binding.
var embedder = null;
// Load promises keyed by model name. Caching the promise (not the resolved
// embedder) lets concurrent callers share one load instead of each starting
// their own, and keeps different models from being mixed up.
var embedderLoads = /* @__PURE__ */ new Map();
/**
 * Lazily load the feature-extraction pipeline for a model.
 *
 * The `@xenova/transformers` module is imported on first use. Each model is
 * loaded at most once; a failed load is evicted from the cache so a later
 * call can retry.
 *
 * @param {string} model - Model identifier passed to the pipeline factory.
 * @returns {Promise<Function>} The embedder for `model`.
 */
async function loadPipeline(model) {
  let load = embedderLoads.get(model);
  if (!load) {
    load = (async () => {
      if (!pipeline) {
        const transformers = await import("@xenova/transformers");
        pipeline = transformers.pipeline;
      }
      console.log(`[LocalEmbeddings] Loading model: ${model}...`);
      const loaded = await pipeline("feature-extraction", model, {
        quantized: true
        // Use quantized model for faster inference
      });
      console.log(`[LocalEmbeddings] Model loaded successfully`);
      embedder = loaded;
      return loaded;
    })();
    embedderLoads.set(model, load);
    load.catch(() => {
      // Evict only if no newer load has replaced this one.
      if (embedderLoads.get(model) === load) {
        embedderLoads.delete(model);
      }
    });
  }
  return load;
}
|
|
604
|
+
/**
 * Embedding backend that runs a model locally through the pipeline returned
 * by `loadPipeline`.
 */
var LocalEmbeddingBackend = class _LocalEmbeddingBackend {
  model;
  batchSize;
  normalize;
  dimensions;
  // Known output sizes per model; unknown models fall back to 768.
  static MODEL_DIMENSIONS = {
    "Xenova/bge-large-en-v1.5": 1024,
    "Xenova/bge-base-en-v1.5": 768,
    "Xenova/bge-small-en-v1.5": 384,
    "Xenova/all-MiniLM-L6-v2": 384,
    "Xenova/all-MiniLM-L12-v2": 384,
    "Xenova/all-mpnet-base-v2": 768
  };
  /**
   * @param {object} [options]
   * @param {string} [options.model] - Defaults to "Xenova/bge-base-en-v1.5".
   * @param {number} [options.batchSize] - Defaults to 32.
   * @param {boolean} [options.normalize] - Defaults to true.
   */
  constructor(options) {
    const { model, batchSize, normalize } = options ?? {};
    this.model = model || "Xenova/bge-base-en-v1.5";
    this.batchSize = batchSize || 32;
    this.normalize = normalize ?? true;
    const knownSize = _LocalEmbeddingBackend.MODEL_DIMENSIONS[this.model];
    this.dimensions = knownSize || 768;
  }
  /**
   * Embed one text. For models whose name contains "bge" the text is
   * prefixed with a retrieval instruction before embedding.
   *
   * @param {string} text
   * @returns {Promise<{embedding: number[], tokens: number}>} `tokens` is a
   *   rough estimate (~4 characters per token) of the unprefixed text.
   */
  async embed(text) {
    const extractor = await loadPipeline(this.model);
    let input = text;
    if (this.model.includes("bge")) {
      input = `Represent this sentence for searching relevant passages: ${text}`;
    }
    const { data } = await extractor(input, {
      pooling: "mean",
      normalize: this.normalize
    });
    return {
      embedding: Array.from(data),
      tokens: Math.ceil(text.length / 4)
    };
  }
  /**
   * Embed many texts, `batchSize` at a time. Texts within a batch are
   * embedded concurrently; batches run one after another.
   *
   * @param {string[]} texts
   * @returns {Promise<object[]>} Results in input order.
   */
  async embedBatch(texts) {
    const collected = [];
    let offset = 0;
    while (offset < texts.length) {
      const slice = texts.slice(offset, offset + this.batchSize);
      const embedded = await Promise.all(slice.map((item) => this.embed(item)));
      for (const entry of embedded) {
        collected.push(entry);
      }
      offset += this.batchSize;
    }
    return collected;
  }
  /** @returns {number} Embedding vector length for the configured model. */
  getDimensions() {
    return this.dimensions;
  }
  /** @returns {string} The configured model identifier. */
  getModel() {
    return this.model;
  }
};
|
|
654
|
+
// Identifiers of the local embedding models with known dimensions.
var LOCAL_MODELS = {
  // 1024 dimensions: best quality, slower
  BGE_LARGE: "Xenova/bge-large-en-v1.5",
  // 768 dimensions: good balance
  BGE_BASE: "Xenova/bge-base-en-v1.5",
  // 384 dimensions: fastest BGE variant
  BGE_SMALL: "Xenova/bge-small-en-v1.5",
  // 384 dimensions: very fast
  MINILM_L6: "Xenova/all-MiniLM-L6-v2",
  // 384 dimensions: good quality
  MINILM_L12: "Xenova/all-MiniLM-L12-v2",
  // 768 dimensions: high quality
  MPNET: "Xenova/all-mpnet-base-v2"
};
|
|
668
|
+
|
|
669
|
+
// src/embeddings/openai.ts
|
|
670
|
+
/**
 * Embedding backend that calls an OpenAI-compatible `/embeddings` HTTP
 * endpoint using the global `fetch`.
 */
var OpenAIEmbeddingBackend = class _OpenAIEmbeddingBackend {
  apiKey;
  model;
  dimensions;
  batchSize;
  baseUrl;
  // Model dimension defaults
  static MODEL_DIMENSIONS = {
    "text-embedding-3-large": 3072,
    "text-embedding-3-small": 1536,
    "text-embedding-ada-002": 1536
  };
  /**
   * @param {object} options
   * @param {string} options.apiKey - Sent as a Bearer token.
   * @param {string} [options.model] - Defaults to "text-embedding-3-small".
   * @param {number} [options.dimensions] - Defaults to the model's known size, else 1536.
   * @param {number} [options.batchSize] - Inputs per request; defaults to 100.
   * @param {string} [options.baseUrl] - Defaults to "https://api.openai.com/v1".
   */
  constructor(options) {
    this.apiKey = options.apiKey;
    this.model = options.model || "text-embedding-3-small";
    this.dimensions = options.dimensions || _OpenAIEmbeddingBackend.MODEL_DIMENSIONS[this.model] || 1536;
    this.batchSize = options.batchSize || 100;
    // Drop trailing slashes so the request URL never contains "//embeddings".
    this.baseUrl = (options.baseUrl || "https://api.openai.com/v1").replace(/\/+$/, "");
  }
  /**
   * Embed one text.
   *
   * @param {string} text
   * @returns {Promise<{embedding: number[], tokens: number}>}
   */
  async embed(text) {
    const results = await this.embedBatch([text]);
    return results[0];
  }
  /**
   * Embed many texts, issuing one request per `batchSize` inputs, one after
   * another.
   *
   * @param {string[]} texts
   * @returns {Promise<object[]>} Results in input order.
   */
  async embedBatch(texts) {
    const allResults = [];
    for (let i = 0; i < texts.length; i += this.batchSize) {
      const batch = texts.slice(i, i + this.batchSize);
      const batchResults = await this.callAPI(batch);
      allResults.push(...batchResults);
    }
    return allResults;
  }
  /**
   * Send one embeddings request.
   *
   * @param {string[]} texts - Inputs for a single request.
   * @returns {Promise<object[]>} One `{ embedding, tokens }` per input,
   *   ordered by the `index` field of the response items; `tokens` is a
   *   rough estimate (~4 characters per token).
   * @throws {Error} On a non-OK HTTP status, or when the response does not
   *   contain exactly one embedding per input.
   */
  async callAPI(texts) {
    const body = {
      model: this.model,
      input: texts
    };
    // `dimensions` is only sent for the text-embedding-3-* family.
    if (this.model.startsWith("text-embedding-3-") && this.dimensions) {
      body.dimensions = this.dimensions;
    }
    const response = await fetch(`${this.baseUrl}/embeddings`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Bearer ${this.apiKey}`
      },
      body: JSON.stringify(body)
    });
    if (!response.ok) {
      const error = await response.text();
      throw new Error(`OpenAI API error: ${response.status} - ${error}`);
    }
    const data = await response.json();
    // Fail loudly instead of hitting a TypeError or returning misaligned
    // results when the payload is malformed or short.
    if (!Array.isArray(data?.data) || data.data.length !== texts.length) {
      throw new Error(
        `OpenAI API error: unexpected response shape (expected ${texts.length} embeddings)`
      );
    }
    // Sort a copy so the parsed response object is left untouched.
    const sorted = [...data.data].sort((a, b) => a.index - b.index);
    return sorted.map((item, i) => ({
      embedding: item.embedding,
      tokens: Math.ceil(texts[i].length / 4)
      // Rough estimate
    }));
  }
  /** @returns {number} The configured embedding vector length. */
  getDimensions() {
    return this.dimensions;
  }
  /** @returns {string} The configured model identifier. */
  getModel() {
    return this.model;
  }
};
|
|
737
|
+
// Identifiers of the OpenAI embedding models with known default dimensions.
var OPENAI_MODELS = {
  // 3072 dimensions (reducible): highest quality
  EMBEDDING_3_LARGE: "text-embedding-3-large",
  // 1536 dimensions (reducible): good balance
  EMBEDDING_3_SMALL: "text-embedding-3-small",
  // 1536 dimensions: legacy model
  ADA_002: "text-embedding-ada-002"
};
|
|
745
|
+
|
|
746
|
+
// src/embeddings/index.ts
|
|
747
|
+
/**
 * Build an embedding backend for the requested provider.
 *
 * - "local": `LocalEmbeddingBackend` with the given model and batch size.
 * - "openai": `OpenAIEmbeddingBackend`; an API key is required.
 * - "huggingface": `LocalEmbeddingBackend`, defaulting the model to
 *   "Xenova/all-MiniLM-L6-v2".
 *
 * @param {object} options - `provider` plus provider-specific settings
 *   (`model`, `batchSize`, `apiKey`, `dimensions`).
 * @returns {object} The backend instance.
 * @throws {Error} For a missing OpenAI API key or an unknown provider.
 */
function createEmbeddingBackend(options) {
  const { provider } = options;
  if (provider === "local") {
    return new LocalEmbeddingBackend({
      model: options.model,
      batchSize: options.batchSize
    });
  }
  if (provider === "openai") {
    if (!options.apiKey) {
      throw new Error("OpenAI embedding requires an API key");
    }
    const { apiKey, model, dimensions, batchSize } = options;
    return new OpenAIEmbeddingBackend({ apiKey, model, dimensions, batchSize });
  }
  if (provider === "huggingface") {
    const fallbackModel = "Xenova/all-MiniLM-L6-v2";
    return new LocalEmbeddingBackend({
      model: options.model || fallbackModel,
      batchSize: options.batchSize
    });
  }
  throw new Error(`Unknown embedding provider: ${options.provider}`);
}
|
|
773
|
+
|
|
774
|
+
// src/vectorstore/sqlite.ts
|
|
775
|
+
import Database from "better-sqlite3";
|
|
776
|
+
/**
 * Vector store backed by a SQLite database (better-sqlite3).
 *
 * Chunks are stored in a regular table; embeddings are kept as JSON text and
 * scored in JavaScript, while keyword search uses an FTS5 external-content
 * index that is kept in sync with the chunk table by triggers.
 */
var SQLiteVectorStore = class {
  db;
  tableName;
  dimensions;
  /**
   * @param {object} options
   * @param {string} [options.path] Database file; defaults to ":memory:".
   * @param {string} [options.tableName] Chunk table name; defaults to "chunks".
   * @param {number} [options.dimensions] Embedding size (stored, not enforced by this class).
   */
  constructor(options) {
    this.db = new Database(options.path || ":memory:");
    this.tableName = options.tableName || "chunks";
    this.dimensions = options.dimensions;
    this.db.pragma("journal_mode = WAL");
  }
  /**
   * Create the chunk table, its document_id index, the FTS5 index and the
   * insert/delete triggers that mirror chunk rows into the FTS index.
   * Every statement uses IF NOT EXISTS, so calling this repeatedly is safe.
   */
  async initialize() {
    this.db.exec(`
      CREATE TABLE IF NOT EXISTS ${this.tableName} (
        id TEXT PRIMARY KEY,
        document_id TEXT NOT NULL,
        text TEXT NOT NULL,
        embedding TEXT,
        metadata TEXT,
        created_at TEXT DEFAULT CURRENT_TIMESTAMP
      )
    `);
    this.db.exec(`
      CREATE INDEX IF NOT EXISTS idx_${this.tableName}_document_id
      ON ${this.tableName}(document_id)
    `);
    this.db.exec(`
      CREATE VIRTUAL TABLE IF NOT EXISTS ${this.tableName}_fts
      USING fts5(id, text, content='${this.tableName}', content_rowid='rowid')
    `);
    this.db.exec(`
      CREATE TRIGGER IF NOT EXISTS ${this.tableName}_ai AFTER INSERT ON ${this.tableName} BEGIN
        INSERT INTO ${this.tableName}_fts(rowid, id, text) VALUES (new.rowid, new.id, new.text);
      END
    `);
    this.db.exec(`
      CREATE TRIGGER IF NOT EXISTS ${this.tableName}_ad AFTER DELETE ON ${this.tableName} BEGIN
        INSERT INTO ${this.tableName}_fts(${this.tableName}_fts, rowid, id, text)
        VALUES('delete', old.rowid, old.id, old.text);
      END
    `);
  }
  /**
   * Insert chunks, replacing any existing chunk with the same id.
   * All rows are written inside a single transaction.
   *
   * @param {Array<object>} chunks Objects with id, documentId, text and
   *   optional embedding (number[]), metadata and createdAt (Date).
   */
  async insert(chunks) {
    // REPLACE conflict resolution only fires DELETE triggers when
    // recursive_triggers is enabled (it is off by default), so relying on
    // INSERT OR REPLACE alone would leave the old text in the FTS index.
    // Deleting explicitly first makes the AFTER DELETE trigger run.
    const removeStmt = this.db.prepare(`DELETE FROM ${this.tableName} WHERE id = ?`);
    const insertStmt = this.db.prepare(`
      INSERT OR REPLACE INTO ${this.tableName} (id, document_id, text, embedding, metadata, created_at)
      VALUES (?, ?, ?, ?, ?, ?)
    `);
    const insertMany = this.db.transaction((items) => {
      for (const chunk of items) {
        removeStmt.run(chunk.id);
        insertStmt.run(
          chunk.id,
          chunk.documentId,
          chunk.text,
          chunk.embedding ? JSON.stringify(chunk.embedding) : null,
          // JSON.stringify(undefined) is undefined, which is not a bindable value.
          JSON.stringify(chunk.metadata ?? {}),
          (chunk.createdAt ?? new Date()).toISOString()
        );
      }
    });
    insertMany(chunks);
  }
  /**
   * Brute-force cosine-similarity search over every stored embedding.
   *
   * @param {number[]} embedding Query vector.
   * @param {number} topK Maximum number of results.
   * @param {object} [filter] Only `filter.documentId` is applied by this store.
   * @returns {Promise<Array<object>>} `{ chunk, score, searchType: "dense" }`, best first.
   * @throws {Error} When a stored vector's length differs from the query's.
   */
  async denseSearch(embedding, topK, filter) {
    let query = `SELECT * FROM ${this.tableName} WHERE embedding IS NOT NULL`;
    const params = [];
    if (filter?.documentId) {
      query += ` AND document_id = ?`;
      params.push(filter.documentId);
    }
    const rows = this.db.prepare(query).all(...params);
    const results = [];
    for (const row of rows) {
      // rowToChunk already parses the embedding; reuse it instead of parsing twice.
      const chunk = this.rowToChunk(row);
      results.push({
        chunk,
        score: this.cosineSimilarity(embedding, chunk.embedding)
      });
    }
    results.sort((a, b) => b.score - a.score);
    return results.slice(0, topK).map((r) => ({
      ...r,
      searchType: "dense"
    }));
  }
  /**
   * Keyword search through the FTS5 index, ranked by bm25().
   *
   * @param {string} query Free-text query; it is sanitised by escapeFTSQuery.
   * @param {number} topK Maximum number of results.
   * @param {object} [filter] Only `filter.documentId` is applied by this store.
   * @returns {Promise<Array<object>>} `{ chunk, score, searchType: "sparse" }`;
   *   an empty array when the query has no searchable terms or the FTS query fails.
   */
  async sparseSearch(query, topK, filter) {
    const matchExpr = this.escapeFTSQuery(query);
    if (matchExpr === "") {
      // Nothing left to match on; an empty MATCH expression is an FTS5 syntax error.
      return [];
    }
    let ftsQuery = `
      SELECT c.*, bm25(${this.tableName}_fts) as score
      FROM ${this.tableName}_fts fts
      JOIN ${this.tableName} c ON fts.id = c.id
      WHERE ${this.tableName}_fts MATCH ?
    `;
    const params = [matchExpr];
    if (filter?.documentId) {
      ftsQuery += ` AND c.document_id = ?`;
      params.push(filter.documentId);
    }
    // bm25() is smaller (more negative) for better matches, so ascending order puts the best first.
    ftsQuery += ` ORDER BY score LIMIT ?`;
    params.push(topK);
    try {
      const rows = this.db.prepare(ftsQuery).all(...params);
      return rows.map((row) => ({
        chunk: this.rowToChunk(row),
        // BM25 returns negative scores
        score: Math.abs(row.score),
        searchType: "sparse"
      }));
    } catch (error) {
      // Best effort: keyword search problems must not break callers, but leave a trace.
      console.warn("SQLite sparse search failed:", error);
      return [];
    }
  }
  /**
   * Delete every chunk belonging to a document.
   * @param {string} documentId
   * @returns {Promise<number>} Number of rows removed.
   */
  async deleteByDocumentId(documentId) {
    const result = this.db.prepare(`DELETE FROM ${this.tableName} WHERE document_id = ?`).run(documentId);
    return result.changes;
  }
  /**
   * Fetch a single chunk by id.
   * @param {string} id
   * @returns {Promise<object|null>} The chunk, or null when no row matches.
   */
  async getById(id) {
    const row = this.db.prepare(`SELECT * FROM ${this.tableName} WHERE id = ?`).get(id);
    return row ? this.rowToChunk(row) : null;
  }
  /** Close the underlying database handle. */
  async close() {
    this.db.close();
  }
  /**
   * Calculate cosine similarity between two vectors.
   * @returns {number} Similarity, or 0 when either vector has zero magnitude.
   * @throws {Error} When the vectors differ in length.
   */
  cosineSimilarity(a, b) {
    if (a.length !== b.length) {
      throw new Error("Vectors must have same length");
    }
    let dotProduct = 0;
    let normA = 0;
    let normB = 0;
    for (let i = 0; i < a.length; i++) {
      dotProduct += a[i] * b[i];
      normA += a[i] * a[i];
      normB += b[i] * b[i];
    }
    const denominator = Math.sqrt(normA) * Math.sqrt(normB);
    return denominator === 0 ? 0 : dotProduct / denominator;
  }
  /**
   * Convert a database row to a DocumentChunk-shaped object.
   * JSON columns are parsed; a NULL metadata column becomes an empty object.
   */
  rowToChunk(row) {
    return {
      id: row.id,
      documentId: row.document_id,
      text: row.text,
      embedding: row.embedding ? JSON.parse(row.embedding) : void 0,
      metadata: row.metadata ? JSON.parse(row.metadata) : {},
      createdAt: new Date(row.created_at)
    };
  }
  /**
   * Turn free text into an FTS5 MATCH expression: FTS syntax characters are
   * stripped and each remaining word is quoted and OR-ed together.
   * @returns {string} The expression, or "" when no words remain.
   */
  escapeFTSQuery(query) {
    const cleaned = query.replace(/['"(){}[\]^~*?:\\]/g, " ").trim();
    return cleaned.split(/\s+/).filter((w) => w.length > 0).map((w) => `"${w}"`).join(" OR ");
  }
};
|
|
935
|
+
|
|
936
|
+
// src/vectorstore/postgres.ts
|
|
937
|
+
import { Pool } from "pg";
|
|
938
|
+
/**
 * Vector store backed by PostgreSQL with the pgvector and pg_trgm extensions.
 *
 * Dense search uses pgvector's cosine-distance operator (`<=>`); sparse
 * search uses pg_trgm trigram similarity on the chunk text.
 */
var PostgresVectorStore = class {
  pool;
  tableName;
  schema;
  dimensions;
  fullTableName;
  /**
   * @param {object} options
   * @param {object} [options.poolConfig] Passed to `new Pool(...)` as-is when given.
   * @param {string} [options.connectionString] Used when poolConfig is absent.
   * @param {string} [options.tableName] Chunk table name; defaults to "chunks".
   * @param {string} [options.schema] Schema name; defaults to "public".
   * @param {number} options.dimensions Width of the `vector(n)` column.
   */
  constructor(options) {
    const poolConfig = options.poolConfig || {
      connectionString: options.connectionString,
      max: 10,
      idleTimeoutMillis: 3e4
    };
    this.pool = new Pool(poolConfig);
    this.tableName = options.tableName || "chunks";
    this.schema = options.schema || "public";
    this.dimensions = options.dimensions;
    this.fullTableName = `${this.schema}.${this.tableName}`;
  }
  /**
   * Create the required extensions, the documents and chunk tables, and the
   * document_id, HNSW, trigram and metadata indexes (all IF NOT EXISTS).
   */
  async initialize() {
    const client = await this.pool.connect();
    try {
      await client.query("CREATE EXTENSION IF NOT EXISTS vector");
      await client.query("CREATE EXTENSION IF NOT EXISTS pg_trgm");
      await client.query(`
        CREATE TABLE IF NOT EXISTS ${this.schema}.documents (
          id TEXT PRIMARY KEY,
          source_id TEXT NOT NULL,
          path TEXT NOT NULL,
          title TEXT NOT NULL,
          metadata JSONB DEFAULT '{}',
          created_at TIMESTAMPTZ DEFAULT NOW(),
          updated_at TIMESTAMPTZ DEFAULT NOW()
        )
      `);
      await client.query(`
        CREATE TABLE IF NOT EXISTS ${this.fullTableName} (
          id TEXT PRIMARY KEY,
          document_id TEXT NOT NULL REFERENCES ${this.schema}.documents(id) ON DELETE CASCADE,
          text TEXT NOT NULL,
          embedding vector(${this.dimensions}),
          metadata JSONB DEFAULT '{}',
          created_at TIMESTAMPTZ DEFAULT NOW()
        )
      `);
      await client.query(`
        CREATE INDEX IF NOT EXISTS idx_${this.tableName}_document_id
        ON ${this.fullTableName}(document_id)
      `);
      await client.query(`
        CREATE INDEX IF NOT EXISTS idx_${this.tableName}_embedding_hnsw
        ON ${this.fullTableName}
        USING hnsw (embedding vector_cosine_ops)
        WITH (m = 16, ef_construction = 64)
      `);
      await client.query(`
        CREATE INDEX IF NOT EXISTS idx_${this.tableName}_text_trgm
        ON ${this.fullTableName}
        USING gin (text gin_trgm_ops)
      `);
      await client.query(`
        CREATE INDEX IF NOT EXISTS idx_${this.tableName}_metadata
        ON ${this.fullTableName}
        USING gin (metadata)
      `);
    } finally {
      client.release();
    }
  }
  /**
   * Upsert chunks inside a single transaction.
   * The chunk table references `documents(id)`, so each chunk's document must
   * already exist (see upsertDocument) or the insert fails.
   *
   * @param {Array<object>} chunks Objects with id, documentId, text and
   *   optional embedding (number[]), metadata and createdAt.
   * @throws Re-throws the first query error after attempting a rollback.
   */
  async insert(chunks) {
    if (chunks.length === 0) return;
    const client = await this.pool.connect();
    try {
      await client.query("BEGIN");
      for (const chunk of chunks) {
        // pgvector accepts the textual form "[1,2,3]".
        const embedding = chunk.embedding ? `[${chunk.embedding.join(",")}]` : null;
        await client.query(
          `
          INSERT INTO ${this.fullTableName} (id, document_id, text, embedding, metadata, created_at)
          VALUES ($1, $2, $3, $4::vector, $5, $6)
          ON CONFLICT (id) DO UPDATE SET
            document_id = EXCLUDED.document_id,
            text = EXCLUDED.text,
            embedding = EXCLUDED.embedding,
            metadata = EXCLUDED.metadata
        `,
          [
            chunk.id,
            chunk.documentId,
            chunk.text,
            embedding,
            // An undefined parameter would be sent as NULL and override the '{}' default.
            JSON.stringify(chunk.metadata ?? {}),
            chunk.createdAt ?? new Date()
          ]
        );
      }
      await client.query("COMMIT");
    } catch (error) {
      try {
        await client.query("ROLLBACK");
      } catch (rollbackError) {
        // Keep the original failure as the reported error.
        console.warn("Rollback failed:", rollbackError);
      }
      throw error;
    } finally {
      client.release();
    }
  }
  /**
   * Nearest-neighbour search by cosine distance.
   *
   * @param {number[]} embedding Query vector.
   * @param {number} topK Maximum number of results.
   * @param {object} [filter] Optional `documentId` and/or `metadata`
   *   (matched with JSONB containment, `@>`).
   * @returns {Promise<Array<object>>} `{ chunk, score, searchType: "dense" }`
   *   where score is `1 - cosine distance`.
   */
  async denseSearch(embedding, topK, filter) {
    const embeddingStr = `[${embedding.join(",")}]`;
    let query = `
      SELECT
        id,
        document_id,
        text,
        metadata,
        created_at,
        1 - (embedding <=> $1::vector) as score
      FROM ${this.fullTableName}
      WHERE embedding IS NOT NULL
    `;
    const params = [embeddingStr];
    let paramIndex = 2;
    if (filter?.documentId) {
      query += ` AND document_id = $${paramIndex}`;
      params.push(filter.documentId);
      paramIndex++;
    }
    if (filter?.metadata) {
      query += ` AND metadata @> $${paramIndex}::jsonb`;
      params.push(JSON.stringify(filter.metadata));
      paramIndex++;
    }
    query += ` ORDER BY embedding <=> $1::vector LIMIT $${paramIndex}`;
    params.push(topK);
    const result = await this.pool.query(query, params);
    return result.rows.map((row) => ({
      chunk: this.rowToChunk(row),
      score: row.score,
      searchType: "dense"
    }));
  }
  /**
   * Trigram-similarity text search (pg_trgm `%` operator and `similarity()`).
   *
   * @param {string} query Search text.
   * @param {number} topK Maximum number of results.
   * @param {object} [filter] Optional `documentId` and/or `metadata`, applied
   *   the same way as in denseSearch.
   * @returns {Promise<Array<object>>} `{ chunk, score, searchType: "sparse" }`, best first.
   */
  async sparseSearch(query, topK, filter) {
    let sql = `
      SELECT
        id,
        document_id,
        text,
        metadata,
        created_at,
        similarity(text, $1) as score
      FROM ${this.fullTableName}
      WHERE text % $1
    `;
    const params = [query];
    let paramIndex = 2;
    if (filter?.documentId) {
      sql += ` AND document_id = $${paramIndex}`;
      params.push(filter.documentId);
      paramIndex++;
    }
    if (filter?.metadata) {
      // Keep both search modes consistent: a metadata filter must restrict sparse hits too.
      sql += ` AND metadata @> $${paramIndex}::jsonb`;
      params.push(JSON.stringify(filter.metadata));
      paramIndex++;
    }
    sql += ` ORDER BY score DESC LIMIT $${paramIndex}`;
    params.push(topK);
    const result = await this.pool.query(sql, params);
    return result.rows.map((row) => ({
      chunk: this.rowToChunk(row),
      score: row.score,
      searchType: "sparse"
    }));
  }
  /**
   * Delete every chunk belonging to a document.
   * @param {string} documentId
   * @returns {Promise<number>} Number of rows removed.
   */
  async deleteByDocumentId(documentId) {
    const result = await this.pool.query(
      `DELETE FROM ${this.fullTableName} WHERE document_id = $1`,
      [documentId]
    );
    return result.rowCount || 0;
  }
  /**
   * Fetch a single chunk by id.
   * @param {string} id
   * @returns {Promise<object|null>} The chunk, or null when no row matches.
   */
  async getById(id) {
    const result = await this.pool.query(
      `SELECT * FROM ${this.fullTableName} WHERE id = $1`,
      [id]
    );
    return result.rows.length > 0 ? this.rowToChunk(result.rows[0]) : null;
  }
  /** Shut down the connection pool. */
  async close() {
    await this.pool.end();
  }
  /**
   * Get chunk and document counts.
   * @returns {Promise<{chunks: number, documents: number}>}
   */
  async getStats() {
    const chunksResult = await this.pool.query(
      `SELECT COUNT(*) FROM ${this.fullTableName}`
    );
    const docsResult = await this.pool.query(
      `SELECT COUNT(*) FROM ${this.schema}.documents`
    );
    return {
      chunks: parseInt(chunksResult.rows[0].count, 10),
      documents: parseInt(docsResult.rows[0].count, 10)
    };
  }
  /**
   * Insert or update a document row.
   * @param {object} doc Object with id, sourceId, path, title and optional metadata.
   */
  async upsertDocument(doc) {
    await this.pool.query(
      `
      INSERT INTO ${this.schema}.documents (id, source_id, path, title, metadata)
      VALUES ($1, $2, $3, $4, $5)
      ON CONFLICT (id) DO UPDATE SET
        path = EXCLUDED.path,
        title = EXCLUDED.title,
        metadata = EXCLUDED.metadata,
        updated_at = NOW()
    `,
      [doc.id, doc.sourceId, doc.path, doc.title, JSON.stringify(doc.metadata || {})]
    );
  }
  /**
   * Convert a result row to a DocumentChunk-shaped object.
   * The embedding is included only when the row carries one (e.g. `SELECT *`
   * in getById); the search queries do not select it.
   */
  rowToChunk(row) {
    const chunk = {
      id: row.id,
      documentId: row.document_id,
      text: row.text,
      metadata: row.metadata,
      createdAt: new Date(row.created_at)
    };
    if (row.embedding != null) {
      // Without a registered type parser the vector arrives as the text "[1,2,3]".
      chunk.embedding = typeof row.embedding === "string" ? JSON.parse(row.embedding) : row.embedding;
    }
    return chunk;
  }
};
|
|
1161
|
+
|
|
1162
|
+
// src/vectorstore/index.ts
|
|
1163
|
+
/**
 * Build a vector store from a configuration object.
 *
 * - "sqlite":   SQLite store at `connectionString`, or in memory when it is unset.
 * - "memory":   SQLite store that is always in memory.
 * - "postgres": PostgreSQL store; `connectionString` is required.
 *
 * @param {object} options Object with `type` plus optional `connectionString`,
 *   `tableName` and `dimensions`.
 * @returns {SQLiteVectorStore|PostgresVectorStore}
 * @throws {Error} For a postgres store without a connection string, or an unknown type.
 */
function createVectorStore(options) {
  const kind = options.type;
  if (kind === "sqlite" || kind === "memory") {
    const dbPath = kind === "memory" ? ":memory:" : options.connectionString || ":memory:";
    return new SQLiteVectorStore({
      path: dbPath,
      tableName: options.tableName,
      dimensions: options.dimensions
    });
  }
  if (kind === "postgres") {
    if (!options.connectionString) {
      throw new Error("PostgreSQL requires a connection string");
    }
    return new PostgresVectorStore({
      connectionString: options.connectionString,
      tableName: options.tableName,
      dimensions: options.dimensions
    });
  }
  throw new Error(`Unknown vector store type: ${options.type}`);
}
|
|
1190
|
+
|
|
1191
|
+
// src/retrieval/hybrid.ts
|
|
1192
|
+
/**
 * Retriever that runs a dense (embedding) search and a sparse (text) search
 * against the same vector store and merges the two rankings with weighted
 * Reciprocal Rank Fusion.
 */
var HybridRetriever = class {
  vectorStore;
  embeddings;
  defaultTopK;
  denseWeight;
  sparseWeight;
  rrfK;
  /**
   * @param {object} options
   * @param {object} options.vectorStore Store exposing denseSearch/sparseSearch.
   * @param {object} options.embeddings Backend exposing `embed(text)`.
   * @param {number} [options.defaultTopK] Defaults to 10.
   * @param {number} [options.denseWeight] Defaults to 0.7.
   * @param {number} [options.sparseWeight] Defaults to 0.3.
   * @param {number} [options.rrfK] RRF rank constant; defaults to 60.
   */
  constructor(options) {
    this.vectorStore = options.vectorStore;
    this.embeddings = options.embeddings;
    this.defaultTopK = options.defaultTopK || 10;
    this.denseWeight = options.denseWeight ?? 0.7;
    this.sparseWeight = options.sparseWeight ?? 0.3;
    this.rrfK = options.rrfK || 60;
  }
  /**
   * Perform hybrid search combining dense and sparse retrieval.
   *
   * @param {string} query
   * @param {object} [options] Optional topK, filter, denseWeight, sparseWeight.
   * @returns {Promise<{results: Array<object>, stats: object}>} Up to topK fused
   *   results plus per-search counts and elapsed milliseconds.
   */
  async search(query, options) {
    const startTime = Date.now();
    const topK = options?.topK || this.defaultTopK;
    const filter = options?.filter;
    const denseWeight = options?.denseWeight ?? this.denseWeight;
    const sparseWeight = options?.sparseWeight ?? this.sparseWeight;
    // Over-fetch (3x, capped at 100) so fusion has candidates to reorder, but
    // never fetch fewer than topK per list — otherwise a request for more
    // than 100 results could come back short.
    const candidateK = Math.max(topK, Math.min(topK * 3, 100));
    const [denseResults, sparseResults] = await Promise.all([
      this.denseSearch(query, candidateK, filter),
      this.sparseSearch(query, candidateK, filter)
    ]);
    const fusedResults = this.rrfFusion(
      denseResults,
      sparseResults,
      denseWeight,
      sparseWeight
    );
    const finalResults = fusedResults.slice(0, topK);
    return {
      results: finalResults,
      stats: {
        denseCount: denseResults.length,
        sparseCount: sparseResults.length,
        rerankingApplied: false,
        totalTime: Date.now() - startTime
      }
    };
  }
  /**
   * Dense (vector) search: embeds the query, then asks the store.
   * Failures are logged and yield an empty list so the other search still counts.
   */
  async denseSearch(query, topK, filter) {
    try {
      const embeddingResult = await this.embeddings.embed(query);
      return await this.vectorStore.denseSearch(embeddingResult.embedding, topK, filter);
    } catch (error) {
      console.warn("Dense search failed:", error);
      return [];
    }
  }
  /**
   * Sparse (BM25/text) search.
   * Failures are logged and yield an empty list so the other search still counts.
   */
  async sparseSearch(query, topK, filter) {
    try {
      return await this.vectorStore.sparseSearch(query, topK, filter);
    } catch (error) {
      console.warn("Sparse search failed:", error);
      return [];
    }
  }
  /**
   * Weighted Reciprocal Rank Fusion.
   *
   * Combines the two ranked lists into a single ranking.
   * Formula: score = sum(weight_i / (k + rank_i)) over each list i that
   * contains the chunk, with 1-based ranks.
   *
   * @returns {Array<object>} Results tagged `searchType: "hybrid"`, sorted by
   *   fused score (descending); `score` is the fused score.
   */
  rrfFusion(denseResults, sparseResults, denseWeight, sparseWeight) {
    const k = this.rrfK;
    const scores = /* @__PURE__ */ new Map();
    const accumulate = (results, weight) => {
      results.forEach((result, rank) => {
        const rrfScore = weight / (k + rank + 1);
        const existing = scores.get(result.chunk.id);
        if (existing) {
          existing.score += rrfScore;
        } else {
          scores.set(result.chunk.id, {
            score: rrfScore,
            result: { ...result, searchType: "hybrid" }
          });
        }
      });
    };
    accumulate(denseResults, denseWeight);
    accumulate(sparseResults, sparseWeight);
    return Array.from(scores.values()).sort((a, b) => b.score - a.score).map((entry) => ({
      ...entry.result,
      score: entry.score
    }));
  }
};
|
|
1301
|
+
/**
 * Retriever that uses embedding similarity only: the query is embedded and
 * handed to the vector store's dense search.
 */
var DenseRetriever = class {
  vectorStore;
  embeddings;
  defaultTopK;
  /**
   * @param {object} options Object with `vectorStore`, `embeddings` and an
   *   optional `defaultTopK` (10 when omitted or falsy).
   */
  constructor(options) {
    this.vectorStore = options.vectorStore;
    this.embeddings = options.embeddings;
    this.defaultTopK = options.defaultTopK || 10;
  }
  /**
   * Embed the query and return the store's nearest chunks.
   *
   * @param {string} query
   * @param {object} [options] Optional `topK` and `filter`.
   * @returns {Promise<{results: Array<object>, stats: object}>} The store's
   *   results untouched, plus counts and elapsed milliseconds.
   */
  async search(query, options) {
    const begunAt = Date.now();
    const limit = options?.topK || this.defaultTopK;
    const embedded = await this.embeddings.embed(query);
    const results = await this.vectorStore.denseSearch(embedded.embedding, limit, options?.filter);
    const stats = {
      denseCount: results.length,
      sparseCount: 0,
      rerankingApplied: false,
      totalTime: Date.now() - begunAt
    };
    return { results, stats };
  }
};
|
|
1330
|
+
|
|
1331
|
+
// src/rag-service.ts
|
|
1332
|
+
/**
 * Retrieval-augmented generation service: extracts and chunks documents,
 * embeds and stores the chunks, retrieves context for a question with the
 * hybrid retriever and asks the configured LLM for a cited answer.
 */
var RAG = class {
  vectorStore;
  embeddings;
  retriever;
  llm;
  chunker;
  systemPrompt;
  initialized = false;
  /**
   * @param {object} options Object with `embeddings`, `vectorStore` and `llm`
   *   configuration, plus optional `retrieval`, `chunking` and `systemPrompt`.
   */
  constructor(options) {
    this.options = options;
    this.embeddings = createEmbeddingBackend(options.embeddings);
    // The store's vector width always follows the embedding backend.
    this.vectorStore = createVectorStore({
      ...options.vectorStore,
      dimensions: this.embeddings.getDimensions()
    });
    this.retriever = new HybridRetriever({
      vectorStore: this.vectorStore,
      embeddings: this.embeddings,
      defaultTopK: options.retrieval?.topK || 8,
      denseWeight: options.retrieval?.denseWeight,
      sparseWeight: options.retrieval?.sparseWeight,
      rrfK: options.retrieval?.rrfK
    });
    this.chunker = new HeaderAwareChunker(options.chunking);
    this.llm = this.createLLM(options.llm);
    this.systemPrompt = options.systemPrompt || `You are a helpful documentation assistant. Answer questions based on the provided context.
Always cite your sources using [1], [2], etc. format when referencing specific information.
If the context doesn't contain enough information to answer, say so clearly.
Be concise and accurate.`;
  }
  /**
   * Build the LLM provider for the given configuration.
   * "openai", "cerebras" and "groq" all go through OpenAIProvider (with an
   * optional baseUrl); "anthropic" uses AnthropicProvider.
   *
   * @throws {Error} For an unknown provider.
   */
  createLLM(config) {
    switch (config.provider) {
      case "openai":
      case "cerebras":
      case "groq":
        return new OpenAIProvider({
          apiKey: config.apiKey || process.env.OPENAI_API_KEY || "",
          baseUrl: config.baseUrl,
          model: config.model || "gpt-4o-mini"
        });
      case "anthropic":
        return new AnthropicProvider({
          apiKey: config.apiKey || process.env.ANTHROPIC_API_KEY || "",
          model: config.model || "claude-sonnet-4-20250514"
        });
      default:
        throw new Error(`Unknown LLM provider: ${config.provider}`);
    }
  }
  /**
   * Initialize the RAG service (idempotent): prepares the vector store once.
   */
  async initialize() {
    if (this.initialized) return;
    await this.vectorStore.initialize();
    this.initialized = true;
  }
  /**
   * Index documents from a path.
   *
   * @param {string} path Location handed to extractDocuments.
   * @param {object} [options] Optional `sourceId`; one is generated when absent.
   * @returns {Promise<{documentsIndexed: number, chunksCreated: number}>}
   * @throws {Error} When the embedding backend does not return one result per chunk.
   */
  async index(path, options) {
    await this.initialize();
    const sourceId = options?.sourceId || generateId2("source");
    console.log(`[RAG] Indexing documents from: ${path}`);
    const documents = await extractDocuments(path);
    console.log(`[RAG] Extracted ${documents.length} documents`);
    let totalChunks = 0;
    for (const doc of documents) {
      const documentId = generateId2("doc");
      const { chunks, stats } = this.chunker.chunk(doc, documentId);
      console.log(
        `[RAG] Chunked "${doc.title}": ${stats.totalChunks} chunks (avg: ${stats.avgChunkSize} chars)`
      );
      if (chunks.length === 0) continue;
      console.log(`[RAG] Generating embeddings for ${chunks.length} chunks...`);
      const texts = chunks.map((c) => c.text);
      const embeddingResults = await this.embeddings.embedBatch(texts);
      if (embeddingResults.length !== chunks.length) {
        throw new Error(
          `Embedding backend returned ${embeddingResults.length} embeddings for ${chunks.length} chunks`
        );
      }
      for (let i = 0; i < chunks.length; i++) {
        chunks[i].embedding = embeddingResults[i].embedding;
      }
      // Stores that track documents (the PostgreSQL store's chunk table has a
      // foreign key to documents) need the document row before its chunks.
      if (typeof this.vectorStore.upsertDocument === "function") {
        await this.vectorStore.upsertDocument({
          id: documentId,
          sourceId,
          // NOTE(review): extracted documents are assumed to possibly carry
          // `path`/`metadata`; fall back to the indexed path — confirm against the extractors.
          path: doc.path ?? path,
          title: doc.title ?? path,
          metadata: doc.metadata
        });
      }
      await this.vectorStore.insert(chunks);
      totalChunks += chunks.length;
    }
    console.log(`[RAG] Indexing complete: ${documents.length} docs, ${totalChunks} chunks`);
    return {
      documentsIndexed: documents.length,
      chunksCreated: totalChunks
    };
  }
  /**
   * Ask a question and get an answer with citations.
   *
   * @param {string} question
   * @param {object} [options] Forwarded to the retriever's search.
   * @returns {Promise<object>} `{ question, answer, citations, context, metadata }`;
   *   a fixed "not found" answer is returned when retrieval yields nothing.
   */
  async ask(question, options) {
    await this.initialize();
    const startTime = Date.now();
    const retrievalStart = Date.now();
    const { results } = await this.retriever.search(question, options);
    const retrievalTime = Date.now() - retrievalStart;
    if (results.length === 0) {
      return {
        question,
        answer: "I couldn't find any relevant information in the documentation to answer your question.",
        citations: [],
        context: [],
        metadata: {
          totalTime: Date.now() - startTime,
          retrievalTime,
          generationTime: 0,
          cached: false,
          model: this.llm.name || "unknown"
        }
      };
    }
    const context = results.map((r) => r.chunk);
    const contextText = this.buildContextText(context);
    const generationStart = Date.now();
    const answer = await this.generateAnswer(question, contextText);
    const generationTime = Date.now() - generationStart;
    const citations = this.extractCitations(answer, context);
    return {
      question,
      answer,
      citations,
      context,
      metadata: {
        totalTime: Date.now() - startTime,
        retrievalTime,
        generationTime,
        cached: false,
        model: this.llm.name || "unknown"
      }
    };
  }
  /**
   * Build context text from chunks: each chunk becomes a "[n] header" line
   * followed by its text, and chunks are separated by a "---" rule.
   * The numbering matches the indexes used by extractCitations.
   */
  buildContextText(chunks) {
    return chunks.map((chunk, i) => {
      const header = chunk.metadata.sectionPath || chunk.metadata.headingH1 || "Document";
      return `[${i + 1}] ${header}\n${chunk.text}`;
    }).join("\n\n---\n\n");
  }
  /**
   * Generate answer using LLM.
   *
   * @param {string} question
   * @param {string} context Text produced by buildContextText.
   * @returns {Promise<*>} The `content` of the LLM response.
   */
  async generateAnswer(question, context) {
    const prompt = `Based on the following documentation context, answer the user's question.
Cite sources using [1], [2], etc. format when referencing specific information.

CONTEXT:
${context}

QUESTION: ${question}

ANSWER:`;
    const response = await this.llm.complete({
      messages: [
        {
          id: generateId2("msg"),
          role: "user",
          content: [{ type: "text", text: prompt }],
          timestamp: /* @__PURE__ */ new Date()
        }
      ],
      systemPrompt: this.systemPrompt,
      maxTokens: 1e3
    });
    return response.content;
  }
  /**
   * Extract citations from answer text.
   * Finds "[n]" markers, ignores duplicates and numbers outside 1..context.length,
   * and returns one citation per referenced chunk sorted by index.
   */
  extractCitations(answer, context) {
    const citations = [];
    const citationRegex = /\[(\d+)\]/g;
    const matches = answer.matchAll(citationRegex);
    const seenIndexes = /* @__PURE__ */ new Set();
    for (const match of matches) {
      const index = parseInt(match[1], 10);
      if (!seenIndexes.has(index) && index > 0 && index <= context.length) {
        seenIndexes.add(index);
        const chunk = context[index - 1];
        citations.push({
          index,
          chunkId: chunk.id,
          sectionPath: chunk.metadata.sectionPath,
          heading: chunk.metadata.headingH3 || chunk.metadata.headingH2 || chunk.metadata.headingH1,
          snippet: chunk.text.substring(0, 200) + (chunk.text.length > 200 ? "..." : "")
        });
      }
    }
    return citations.sort((a, b) => a.index - b.index);
  }
  /**
   * Read `{ chunks, documents }` from the vector store when it exposes
   * getStats(); returns null for stores that do not.
   */
  async readStoreStats() {
    if (typeof this.vectorStore.getStats !== "function") return null;
    await this.initialize();
    return this.vectorStore.getStats();
  }
  /**
   * Get document count.
   * @returns {Promise<number>} The store's document count, or 0 when the
   *   store cannot report statistics.
   */
  async getDocumentCount() {
    const stats = await this.readStoreStats();
    return stats?.documents ?? 0;
  }
  /**
   * Get chunk count.
   * @returns {Promise<number>} The store's chunk count, or 0 when the store
   *   cannot report statistics.
   */
  async getChunkCount() {
    const stats = await this.readStoreStats();
    return stats?.chunks ?? 0;
  }
  /**
   * Clear all indexed data (not implemented; only logs a notice).
   */
  async clear() {
    console.log("[RAG] Clear not implemented yet");
  }
  /**
   * Close connections held by the vector store.
   */
  async close() {
    await this.vectorStore.close();
  }
};
|
|
1551
|
+
/**
 * Construct a RAG service and initialize it before handing it back.
 *
 * @param {object} options Passed unchanged to the RAG constructor.
 * @returns {Promise<RAG>} The initialized service.
 */
async function createRAG(options) {
  const service = new RAG(options);
  await service.initialize();
  return service;
}
|
|
1556
|
+
export {
|
|
1557
|
+
BaseExtractor,
|
|
1558
|
+
CHMExtractor,
|
|
1559
|
+
DenseRetriever,
|
|
1560
|
+
HTMLExtractor,
|
|
1561
|
+
HeaderAwareChunker,
|
|
1562
|
+
HybridRetriever,
|
|
1563
|
+
LOCAL_MODELS,
|
|
1564
|
+
LocalEmbeddingBackend,
|
|
1565
|
+
MarkdownExtractor,
|
|
1566
|
+
OPENAI_MODELS,
|
|
1567
|
+
OpenAIEmbeddingBackend,
|
|
1568
|
+
PostgresVectorStore,
|
|
1569
|
+
RAG,
|
|
1570
|
+
SQLiteVectorStore,
|
|
1571
|
+
createEmbeddingBackend,
|
|
1572
|
+
createRAG,
|
|
1573
|
+
createVectorStore,
|
|
1574
|
+
extractDocuments
|
|
1575
|
+
};
|
|
1576
|
+
//# sourceMappingURL=index.js.map
|