@operor/knowledge 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +457 -0
- package/dist/index.d.ts +437 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +1442 -0
- package/dist/index.js.map +1 -0
- package/package.json +42 -0
- package/src/EmbeddingService.ts +92 -0
- package/src/IngestionPipeline.ts +357 -0
- package/src/QueryNormalizer.ts +59 -0
- package/src/QueryRewriter.ts +73 -0
- package/src/RankFusion.ts +72 -0
- package/src/RetrievalPipeline.ts +388 -0
- package/src/SQLiteKnowledgeStore.ts +379 -0
- package/src/TextChunker.ts +34 -0
- package/src/__tests__/cli-integration.test.ts +134 -0
- package/src/__tests__/content-fetcher.test.ts +156 -0
- package/src/__tests__/knowledge.test.ts +493 -0
- package/src/__tests__/retrieval-layers.test.ts +672 -0
- package/src/index.ts +41 -0
- package/src/ingestors/FileIngestor.ts +85 -0
- package/src/ingestors/SiteCrawler.ts +153 -0
- package/src/ingestors/UrlIngestor.ts +106 -0
- package/src/ingestors/WatiFaqSync.ts +75 -0
- package/src/ingestors/content-fetcher.ts +142 -0
- package/src/types.ts +62 -0
- package/tsconfig.json +9 -0
- package/tsdown.config.ts +10 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,1442 @@
|
|
|
1
|
+
import { createHash, randomUUID } from "node:crypto";
import { statSync } from "node:fs";
import { readFile } from "node:fs/promises";
import { extname } from "node:path";
import { embed, embedMany, generateText } from "ai";
import { createOpenAI } from "@ai-sdk/openai";
import { createGoogleGenerativeAI } from "@ai-sdk/google";
import { createMistral, mistral } from "@ai-sdk/mistral";
import { cohere, createCohere } from "@ai-sdk/cohere";
import Database from "better-sqlite3";
import * as sqliteVec from "sqlite-vec";
import { MarkdownTextSplitter, RecursiveCharacterTextSplitter } from "@langchain/textsplitters";
import { Readability } from "@mozilla/readability";
import { parseHTML } from "linkedom";
|
|
15
|
+
|
|
16
|
+
//#region src/EmbeddingService.ts
|
|
17
|
+
/**
 * Wraps the AI SDK embedding helpers behind a provider-agnostic interface.
 *
 * `config` fields read by this class: `provider`, `apiKey`, `baseURL`,
 * `model`, `dimensions`.
 */
var EmbeddingService = class EmbeddingService {
  /**
   * Native output size of well-known embedding models, keyed by lower-cased
   * model id (without an Ollama-style `:tag` suffix or a `models/` prefix).
   */
  static #KNOWN_MODEL_DIMENSIONS = new Map([
    ["text-embedding-3-small", 1536],
    ["text-embedding-3-large", 3072],
    ["text-embedding-ada-002", 1536],
    ["text-embedding-004", 768],
    ["embedding-001", 768],
    ["gemini-embedding-001", 3072],
    ["mistral-embed", 1024],
    ["embed-english-v3.0", 1024],
    ["embed-multilingual-v3.0", 1024],
    ["embed-english-light-v3.0", 384],
    ["embed-multilingual-light-v3.0", 384],
    ["nomic-embed-text", 768],
    ["mxbai-embed-large", 1024],
    ["all-minilm", 384],
  ]);

  config;

  constructor(config) {
    this.config = config;
  }

  /**
   * Build the AI SDK embedding model for the configured provider.
   *
   * @returns the provider-specific embedding model instance
   * @throws {Error} when `config.provider` is not one of the supported names
   */
  getModel() {
    const { provider, apiKey, baseURL, model } = this.config;
    switch (provider) {
      case "openai":
        return createOpenAI({ apiKey, baseURL }).embedding(model || "text-embedding-3-small", { dimensions: this.config.dimensions });
      case "google":
        return createGoogleGenerativeAI({ apiKey, baseURL }).textEmbeddingModel(model || "text-embedding-004");
      case "mistral":
        // The API key is a provider-level option; passing it as a model
        // setting (as before) meant the configured key was never used.
        return createMistral({ apiKey }).embedding(model || "mistral-embed");
      case "cohere":
        return createCohere({ apiKey }).embedding(model || "embed-english-v3.0");
      case "ollama":
        // Ollama is reached through its OpenAI-compatible endpoint.
        return createOpenAI({
          apiKey: apiKey || "ollama",
          baseURL: baseURL || "http://localhost:11434/v1",
        }).embedding(model || "nomic-embed-text");
      default:
        throw new Error(`Unknown embedding provider: ${provider}`);
    }
  }

  get provider() {
    return this.config.provider;
  }

  /** Explicitly configured dimensions, or the default for provider/model. */
  get dimensions() {
    if (this.config.dimensions) return this.config.dimensions;
    return EmbeddingService.defaultDimensions(this.config.provider, this.config.model);
  }

  /**
   * Best-effort vector size for a provider/model pair.
   *
   * A recognised `model` wins over the provider default, so that e.g.
   * `text-embedding-3-large` reports 3072 rather than the 1536 of OpenAI's
   * default model. Unrecognised models fall back to the provider default.
   *
   * @param provider - provider name
   * @param [model] - optional model id
   * @returns number of dimensions
   */
  static defaultDimensions(provider, model) {
    if (typeof model === "string" && model.trim() !== "") {
      const modelId = model.trim().toLowerCase().replace(/^models\//, "").split(":")[0];
      const known = EmbeddingService.#KNOWN_MODEL_DIMENSIONS.get(modelId);
      if (known !== void 0) return known;
    }
    switch (provider) {
      case "openai": return 1536;
      case "google": return 768;
      case "mistral": return 1024;
      case "cohere": return 1024;
      case "ollama": return 768;
      default: return 1536;
    }
  }

  /** Embed a single text and return its vector. */
  async embed(text) {
    const { embedding } = await embed({
      model: this.getModel(),
      value: text,
    });
    return embedding;
  }

  /** Embed several texts; an empty input returns `[]` without calling the provider. */
  async embedMany(texts) {
    if (texts.length === 0) return [];
    const { embeddings } = await embedMany({
      model: this.getModel(),
      values: texts,
    });
    return embeddings;
  }
};
|
|
73
|
+
|
|
74
|
+
//#endregion
|
|
75
|
+
//#region src/SQLiteKnowledgeStore.ts
|
|
76
|
+
/**
 * Knowledge store backed by a single SQLite file.
 *
 * Tables used: `kb_documents` and `kb_chunks` (relational data), `vec_chunks`
 * (sqlite-vec virtual table holding one embedding per chunk) and `fts_chunks`
 * (FTS5 virtual table holding chunk text for keyword search).
 */
var SQLiteKnowledgeStore = class SQLiteKnowledgeStore {
  db;
  dbPath;
  dimensions;
  dimensionWarned = false;

  /**
   * @param dbPath - database file path
   * @param dimensions - embedding vector size used for `vec_chunks`
   * @throws {TypeError} when `dimensions` is not a positive integer
   */
  constructor(dbPath = "./knowledge.db", dimensions = 1536) {
    // Validate before opening the file so a bad value cannot leak a handle.
    const validated = SQLiteKnowledgeStore.#toDimensions(dimensions);
    this.db = new Database(dbPath);
    this.dbPath = dbPath;
    this.dimensions = validated;
    this.db.pragma("journal_mode = WAL");
    this.db.pragma("foreign_keys = ON");
    sqliteVec.load(this.db);
  }

  /**
   * The dimension count is interpolated into DDL, so it must be a plain
   * positive integer. Numeric strings (e.g. from env vars) are accepted.
   */
  static #toDimensions(value) {
    const candidate = typeof value === "string" && value.trim() !== "" ? Number(value) : value;
    if (!Number.isInteger(candidate) || candidate <= 0) {
      throw new TypeError(`Embedding dimensions must be a positive integer, got: ${String(value)}`);
    }
    return candidate;
  }

  /** Add a column to kb_documents, tolerating "already exists". */
  static #addDocumentColumn(db, columnDdl) {
    try {
      db.exec(`ALTER TABLE kb_documents ADD COLUMN ${columnDdl}`);
    } catch (err) {
      // The column already existing is the expected case on every run after
      // the first; anything else is worth surfacing, but must not abort.
      if (!/duplicate column name/i.test(String(err?.message))) {
        console.warn(`[KB] Could not add column "${columnDdl}" to kb_documents: ${err?.message ?? err}`);
      }
    }
  }

  /** Map a kb_chunks row to the public chunk shape. */
  static #rowToChunk(row) {
    return {
      id: row.id,
      documentId: row.document_id,
      content: row.content,
      chunkIndex: row.chunk_index,
      metadata: row.metadata ? JSON.parse(row.metadata) : void 0,
    };
  }

  /**
   * Turn free text into an FTS5 query made only of quoted terms, so that
   * punctuation and operator keywords cannot cause a syntax error.
   * Terms are space-joined, which FTS5 treats as an implicit AND.
   */
  static #toLiteralFtsQuery(text) {
    const terms = String(text).match(/[\p{L}\p{N}]+/gu) ?? [];
    return terms.map((term) => `"${term}"`).join(" ");
  }

  getDimensions() {
    return this.dimensions;
  }

  /** Create tables, indexes and virtual tables if they do not exist yet. */
  async initialize() {
    this.db.exec(`
      CREATE TABLE IF NOT EXISTS kb_documents (
        id TEXT PRIMARY KEY,
        source_type TEXT NOT NULL,
        source_url TEXT,
        file_name TEXT,
        title TEXT,
        content TEXT NOT NULL,
        metadata TEXT,
        created_at INTEGER NOT NULL,
        updated_at INTEGER NOT NULL,
        priority INTEGER DEFAULT 2,
        content_hash TEXT
      );

      CREATE TABLE IF NOT EXISTS kb_chunks (
        id TEXT PRIMARY KEY,
        document_id TEXT NOT NULL,
        content TEXT NOT NULL,
        chunk_index INTEGER NOT NULL,
        metadata TEXT,
        FOREIGN KEY (document_id) REFERENCES kb_documents(id) ON DELETE CASCADE
      );
      CREATE INDEX IF NOT EXISTS idx_chunks_document ON kb_chunks(document_id);
      CREATE INDEX IF NOT EXISTS idx_documents_source_url ON kb_documents(source_url);
    `);
    // Migrations for databases created before these columns were part of the
    // CREATE TABLE statement above.
    SQLiteKnowledgeStore.#addDocumentColumn(this.db, "priority INTEGER DEFAULT 2");
    SQLiteKnowledgeStore.#addDocumentColumn(this.db, "content_hash TEXT");
    this.db.exec(`
      CREATE VIRTUAL TABLE IF NOT EXISTS vec_chunks USING vec0(
        chunk_id TEXT PRIMARY KEY,
        embedding float[${this.dimensions}]
      );
    `);
    this.db.exec(`
      CREATE VIRTUAL TABLE IF NOT EXISTS fts_chunks USING fts5(
        chunk_id UNINDEXED,
        content,
        tokenize='porter unicode61'
      );
    `);
  }

  async close() {
    this.db.close();
  }

  /** Insert a document, replacing any existing row with the same id. */
  async addDocument(doc) {
    this.db.prepare(`
      INSERT OR REPLACE INTO kb_documents (id, source_type, source_url, file_name, title, content, metadata, created_at, updated_at, priority, content_hash)
      VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    `).run(
      doc.id,
      doc.sourceType,
      doc.sourceUrl || null,
      doc.fileName || null,
      doc.title || null,
      doc.content,
      doc.metadata ? JSON.stringify(doc.metadata) : null,
      doc.createdAt,
      doc.updatedAt,
      doc.priority ?? 2,
      doc.contentHash || null,
    );
  }

  async getDocument(id) {
    const row = this.db.prepare("SELECT * FROM kb_documents WHERE id = ?").get(id);
    return row ? this.rowToDocument(row) : null;
  }

  /** All documents, newest first. */
  async listDocuments() {
    return this.db.prepare("SELECT * FROM kb_documents ORDER BY created_at DESC").all().map((r) => this.rowToDocument(r));
  }

  /**
   * Delete a document together with its chunks, vectors and FTS rows.
   * Runs in one transaction so a failure cannot leave partial index data.
   */
  async deleteDocument(id) {
    const selectChunkIds = this.db.prepare("SELECT id FROM kb_chunks WHERE document_id = ?");
    const deleteVec = this.db.prepare("DELETE FROM vec_chunks WHERE chunk_id = ?");
    const deleteFts = this.db.prepare("DELETE FROM fts_chunks WHERE chunk_id = ?");
    const deleteChunks = this.db.prepare("DELETE FROM kb_chunks WHERE document_id = ?");
    const deleteDoc = this.db.prepare("DELETE FROM kb_documents WHERE id = ?");
    this.db.transaction((documentId) => {
      for (const chunk of selectChunkIds.all(documentId)) {
        deleteVec.run(chunk.id);
        deleteFts.run(chunk.id);
      }
      deleteChunks.run(documentId);
      deleteDoc.run(documentId);
    })(id);
  }

  /**
   * Store chunks with their (optional) embeddings and index their text for
   * keyword search, all within a single transaction.
   */
  async addChunks(chunks) {
    const insertChunk = this.db.prepare(`
      INSERT OR REPLACE INTO kb_chunks (id, document_id, content, chunk_index, metadata)
      VALUES (?, ?, ?, ?, ?)
    `);
    const insertVec = this.db.prepare(`
      INSERT OR REPLACE INTO vec_chunks (chunk_id, embedding)
      VALUES (?, ?)
    `);
    const deleteFts = this.db.prepare(`
      DELETE FROM fts_chunks WHERE chunk_id = ?
    `);
    const insertFts = this.db.prepare(`
      INSERT INTO fts_chunks (chunk_id, content)
      VALUES (?, ?)
    `);
    this.db.transaction((items) => {
      for (const chunk of items) {
        insertChunk.run(chunk.id, chunk.documentId, chunk.content, chunk.chunkIndex, chunk.metadata ? JSON.stringify(chunk.metadata) : null);
        if (chunk.embedding) {
          // Warn once per store instance rather than once per chunk.
          if (chunk.embedding.length !== this.dimensions && !this.dimensionWarned) {
            this.dimensionWarned = true;
            console.warn(`[KB] Dimension mismatch: store expects ${this.dimensions}d vectors but received ${chunk.embedding.length}d. This will cause search errors. Re-ingest your documents after switching embedding providers, or set the correct dimensions when creating the store.`);
          }
          insertVec.run(chunk.id, new Float32Array(chunk.embedding));
        }
        // The FTS insert is a plain INSERT, so remove any previous row first.
        deleteFts.run(chunk.id);
        insertFts.run(chunk.id, chunk.content);
      }
    })(chunks);
  }

  getChunkCount(documentId) {
    return this.db.prepare("SELECT COUNT(*) as count FROM kb_chunks WHERE document_id = ?").get(documentId)?.count || 0;
  }

  /** Vector search; `query` is accepted for interface parity but unused. */
  async search(query, embedding, options) {
    return this.searchByEmbedding(embedding, options);
  }

  /**
   * Nearest-neighbour search over `vec_chunks`.
   *
   * @param embedding - query vector
   * @param [options] - `limit` (default 5), `sourceTypes`, `scoreThreshold`
   * @returns results with `score = 1 / (1 + distance)`, at most `limit` items
   */
  async searchByEmbedding(embedding, options) {
    const limit = options?.limit || 5;
    const sourceTypes = options?.sourceTypes;
    // Over-fetch when filtering by source type, since the filter runs after
    // the vector query. Never fetch fewer rows than the caller asked for.
    const fetchLimit = sourceTypes ? Math.max(limit, Math.min(limit * 10, 100)) : limit;
    const vecRows = this.db.prepare(`
      SELECT chunk_id, distance
      FROM vec_chunks
      WHERE embedding MATCH ?
      ORDER BY distance
      LIMIT ?
    `).all(new Float32Array(embedding), fetchLimit);
    if (vecRows.length === 0) return [];
    const selectChunk = this.db.prepare("SELECT * FROM kb_chunks WHERE id = ?");
    const selectDoc = this.db.prepare("SELECT * FROM kb_documents WHERE id = ?");
    const results = [];
    for (const vecRow of vecRows) {
      if (results.length >= limit) break;
      const distance = vecRow.distance;
      const score = 1 / (1 + distance);
      if (options?.scoreThreshold && score < options.scoreThreshold) continue;
      const chunk = selectChunk.get(vecRow.chunk_id);
      if (!chunk) continue;
      const doc = selectDoc.get(chunk.document_id);
      if (!doc) continue;
      if (sourceTypes && !sourceTypes.includes(doc.source_type)) continue;
      results.push({
        chunk: SQLiteKnowledgeStore.#rowToChunk(chunk),
        document: this.rowToDocument(doc),
        score,
        distance,
      });
    }
    return results;
  }

  /**
   * Keyword (BM25) search over `fts_chunks`.
   *
   * The query is first passed to FTS5 as-is, so callers may use FTS5 syntax.
   * If FTS5 rejects it (typical for natural-language text containing `?`,
   * apostrophes, hyphens, ...), the search is retried with every word quoted
   * as a literal term instead of silently returning nothing.
   *
   * @returns results with `score = -rank` and `distance = 0`; `[]` when the
   *          search cannot be executed at all
   */
  async searchByKeyword(query, options) {
    const limit = options?.limit || 5;
    const safeQuery = query.replace(/"/g, "\"\"");
    let ftsRows;
    try {
      const ftsSearch = this.db.prepare(`
        SELECT chunk_id, rank
        FROM fts_chunks
        WHERE fts_chunks MATCH ?
        ORDER BY rank
        LIMIT ?
      `);
      try {
        ftsRows = ftsSearch.all(safeQuery, limit * 2);
      } catch {
        const literalQuery = SQLiteKnowledgeStore.#toLiteralFtsQuery(query);
        if (!literalQuery) return [];
        ftsRows = ftsSearch.all(literalQuery, limit * 2);
      }
    } catch {
      // Keyword search is best-effort; callers treat "no rows" and
      // "unsearchable" alike.
      return [];
    }
    if (ftsRows.length === 0) return [];
    const selectChunk = this.db.prepare("SELECT * FROM kb_chunks WHERE id = ?");
    const selectDoc = this.db.prepare("SELECT * FROM kb_documents WHERE id = ?");
    const results = [];
    for (const ftsRow of ftsRows) {
      if (results.length >= limit) break;
      const chunk = selectChunk.get(ftsRow.chunk_id);
      if (!chunk) continue;
      const doc = selectDoc.get(chunk.document_id);
      if (!doc) continue;
      if (options?.sourceTypes && !options.sourceTypes.includes(doc.source_type)) continue;
      // FTS5 rank is "lower is better" (negative BM25); flip the sign.
      const bm25Score = -ftsRow.rank;
      if (options?.scoreThreshold && bm25Score < options.scoreThreshold) continue;
      results.push({
        chunk: SQLiteKnowledgeStore.#rowToChunk(chunk),
        document: this.rowToDocument(doc),
        score: bm25Score,
        distance: 0,
      });
    }
    return results;
  }

  /**
   * Get all chunks from kb_chunks (text content only, no embeddings).
   * Used by rebuild to re-embed all content. `metadata` is returned as the
   * raw stored JSON string.
   */
  getAllChunks() {
    return this.db.prepare("SELECT id, document_id AS documentId, content, chunk_index AS chunkIndex, metadata FROM kb_chunks ORDER BY document_id, chunk_index").all();
  }

  /**
   * Drop and recreate the vec_chunks virtual table with new dimensions.
   * Preserves kb_chunks, kb_documents, and fts_chunks — only vector data is affected.
   *
   * @throws {TypeError} when `newDimensions` is not a positive integer
   *         (checked before anything is dropped)
   */
  rebuildVecTable(newDimensions) {
    const validated = SQLiteKnowledgeStore.#toDimensions(newDimensions);
    this.db.exec("DROP TABLE IF EXISTS vec_chunks");
    this.db.exec(`
      CREATE VIRTUAL TABLE vec_chunks USING vec0(
        chunk_id TEXT PRIMARY KEY,
        embedding float[${validated}]
      );
    `);
    this.dimensions = validated;
    this.dimensionWarned = false;
  }

  /**
   * Batch-insert embeddings into vec_chunks.
   * Expects an array of { chunkId, embedding } pairs.
   */
  batchInsertEmbeddings(items) {
    const insert = this.db.prepare("INSERT OR REPLACE INTO vec_chunks (chunk_id, embedding) VALUES (?, ?)");
    this.db.transaction((batch) => {
      for (const item of batch) insert.run(item.chunkId, new Float32Array(item.embedding));
    })(items);
  }

  /** Document/chunk counts, configured dimensions and database file size. */
  async getStats() {
    const docCount = this.db.prepare("SELECT COUNT(*) as count FROM kb_documents").get();
    const chunkCount = this.db.prepare("SELECT COUNT(*) as count FROM kb_chunks").get();
    let dbSizeBytes = 0;
    try {
      dbSizeBytes = statSync(this.dbPath).size;
    } catch {
      // The path may not be a real file; report 0 rather than failing.
    }
    return {
      documentCount: docCount.count,
      chunkCount: chunkCount.count,
      embeddingDimensions: this.dimensions,
      dbSizeBytes,
    };
  }

  async findBySourceUrl(url) {
    const row = this.db.prepare("SELECT * FROM kb_documents WHERE source_url = ?").get(url);
    return row ? this.rowToDocument(row) : null;
  }

  async findByContentHash(hash) {
    const row = this.db.prepare("SELECT * FROM kb_documents WHERE content_hash = ?").get(hash);
    return row ? this.rowToDocument(row) : null;
  }

  /**
   * Update the provided fields of a document; `updated_at` is always bumped.
   * Fields left `undefined` in `updates` are not touched.
   */
  async updateDocument(id, updates) {
    const assignable = [
      ["content", "content"],
      ["title", "title"],
      ["contentHash", "content_hash"],
      ["priority", "priority"],
      ["metadata", "metadata"],
    ];
    const sets = [];
    const values = [];
    for (const [key, column] of assignable) {
      if (updates[key] === void 0) continue;
      sets.push(`${column} = ?`);
      values.push(key === "metadata" ? JSON.stringify(updates.metadata) : updates[key]);
    }
    sets.push("updated_at = ?");
    values.push(Date.now());
    values.push(id);
    this.db.prepare(`UPDATE kb_documents SET ${sets.join(", ")} WHERE id = ?`).run(...values);
  }

  /** Closest FAQ chunk whose score is at least `threshold`, else `null`. */
  async findSimilarFaq(embedding, threshold) {
    const results = await this.searchByEmbedding(embedding, {
      sourceTypes: ["faq"],
      limit: 1,
    });
    if (results.length > 0 && results[0].score >= threshold) return results[0];
    return null;
  }

  /** Map a kb_documents row to the public document shape. */
  rowToDocument(row) {
    return {
      id: row.id,
      sourceType: row.source_type,
      sourceUrl: row.source_url || void 0,
      fileName: row.file_name || void 0,
      title: row.title || void 0,
      content: row.content,
      metadata: row.metadata ? JSON.parse(row.metadata) : void 0,
      createdAt: row.created_at,
      updatedAt: row.updated_at,
      priority: row.priority ?? 2,
      contentHash: row.content_hash || void 0,
    };
  }
};
|
|
387
|
+
|
|
388
|
+
//#endregion
|
|
389
|
+
//#region src/TextChunker.ts
|
|
390
|
+
/**
 * Splits text into chunks using the LangChain text splitters.
 * Sizes are passed straight through to the splitters.
 */
var TextChunker = class {
  defaultChunkSize;
  defaultChunkOverlap;

  /**
   * @param [options] - `chunkSize` (default 3200) and `chunkOverlap`
   *        (default 200). An explicit `chunkOverlap` of 0 disables overlap.
   */
  constructor(options) {
    // A chunk size of 0 is meaningless, so any falsy value means "default".
    this.defaultChunkSize = options?.chunkSize || 3200;
    // 0 is a valid overlap, so only null/undefined fall back to the default.
    this.defaultChunkOverlap = options?.chunkOverlap ?? 200;
  }

  /**
   * Split plain text.
   * @returns array of chunk strings
   */
  async chunk(text, options) {
    const splitter = new RecursiveCharacterTextSplitter({
      chunkSize: options?.chunkSize || this.defaultChunkSize,
      chunkOverlap: options?.chunkOverlap ?? this.defaultChunkOverlap,
    });
    const pieces = await splitter.createDocuments([text]);
    return pieces.map((d) => d.pageContent);
  }

  /**
   * Split Markdown text with the Markdown-aware splitter.
   * @returns array of chunk strings
   */
  async chunkMarkdown(markdown, options) {
    const splitter = new MarkdownTextSplitter({
      chunkSize: options?.chunkSize || this.defaultChunkSize,
      chunkOverlap: options?.chunkOverlap ?? this.defaultChunkOverlap,
    });
    const pieces = await splitter.createDocuments([markdown]);
    return pieces.map((d) => d.pageContent);
  }
};
|
|
410
|
+
|
|
411
|
+
//#endregion
|
|
412
|
+
//#region src/QueryNormalizer.ts
|
|
413
|
+
/**
 * Query normalization for improved KB retrieval.
 * Expands chat abbreviations and normalizes whitespace before embedding.
 *
 * Each entry is [pattern, replacement]; entries are applied in order, so a
 * longer abbreviation must come before any shorter one it starts with.
 */
const ABBREVIATIONS = [
  [/\bu\b/gi, "you"],
  [/\bur\b/gi, "your"],
  [/\br\b/gi, "are"],
  [/\bpls\b/gi, "please"],
  [/\bplz\b/gi, "please"],
  [/\bthx\b/gi, "thanks"],
  [/\bthnx\b/gi, "thanks"],
  [/\bty\b/gi, "thank you"],
  [/\bwat\b/gi, "what"],
  [/\bbc\b/gi, "because"],
  [/\bcuz\b/gi, "because"],
  [/\bgonna\b/gi, "going to"],
  [/\bwanna\b/gi, "want to"],
  [/\bgotta\b/gi, "got to"],
  [/\blemme\b/gi, "let me"],
  [/\bgimme\b/gi, "give me"],
  [/\bdunno\b/gi, "do not know"],
  [/\bhrs\b/gi, "hours"],
  [/\bmins\b/gi, "minutes"],
  [/\bmsg\b/gi, "message"],
  [/\bmsgs\b/gi, "messages"],
  [/\binfo\b/gi, "information"],
  [/\btmr\b/gi, "tomorrow"],
  [/\btmrw\b/gi, "tomorrow"],
  // "w/o" must be handled before "w/", otherwise it is rewritten to "witho".
  [/\bw\/o\b/gi, "without"],
  // No trailing \b: "/" followed by a space is not a word boundary, so the
  // common "w/ cheese" form would never match. The trailing space keeps
  // "w/cheese" from fusing into "withcheese"; extra whitespace is collapsed
  // by normalizeQuery afterwards.
  [/\bw\/\s*/gi, "with "],
  [/\bidk\b/gi, "I do not know"],
  [/\bimo\b/gi, "in my opinion"],
  [/\bbtw\b/gi, "by the way"],
  [/\basap\b/gi, "as soon as possible"],
  [/\b4\b/g, "for"],
  [/\b2\b/g, "to"]
];
/**
 * Normalize a user query for better embedding similarity.
 * - Lowercases the input
 * - Expands chat abbreviations with word-boundary awareness
 * - Collapses whitespace and trims
 *
 * @param query - raw user text
 * @returns the normalized query string
 */
function normalizeQuery(query) {
  let normalized = query.toLowerCase();
  for (const [pattern, replacement] of ABBREVIATIONS) normalized = normalized.replace(pattern, replacement);
  normalized = normalized.replace(/\s+/g, " ").trim();
  return normalized;
}
|
|
463
|
+
|
|
464
|
+
//#endregion
|
|
465
|
+
//#region src/IngestionPipeline.ts
|
|
466
|
+
/**
 * Turns raw content into stored documents, chunks and embeddings.
 *
 * Collaborators (all injected):
 * - `store`: knowledge store; optional capabilities such as `findBySourceUrl`
 *   or `findByContentHash` are feature-detected before use
 * - `embedder`: provides `embed`, `embedMany` and `dimensions`
 * - `chunker`: provides `chunk` and `chunkMarkdown`
 * - `llmProvider` (optional): provides `complete(messages)` for Q&A extraction
 */
var IngestionPipeline = class IngestionPipeline {
  /** Maximum number of characters of page content sent to the LLM. */
  static #MAX_PROMPT_CHARS = 15e3;

  store;
  embedder;
  chunker;
  llmProvider;

  constructor(store, embedder, chunker, llmProvider) {
    this.store = store;
    this.embedder = embedder;
    this.chunker = chunker;
    this.llmProvider = llmProvider;
  }

  /**
   * Strip Markdown images, collapse runs of spaces/tabs and blank lines,
   * drop consecutive duplicate lines, and trim. No length limit.
   */
  static #normalizeContent(text) {
    return text
      .replace(/!\[.*?\]\(.*?\)/g, "")
      .replace(/[ \t]+/g, " ")
      .replace(/(\n\s*){3,}/g, "\n\n")
      .split("\n")
      .filter((line, i, arr) => i === 0 || line !== arr[i - 1])
      .join("\n")
      .trim();
  }

  /** Parse JSON, returning `null` instead of throwing on malformed input. */
  static #parseJsonOrNull(text) {
    try {
      return JSON.parse(text);
    } catch {
      return null;
    }
  }

  /** Normalized content, truncated to the LLM prompt budget. */
  cleanContent(text) {
    return IngestionPipeline.#normalizeContent(text).slice(0, IngestionPipeline.#MAX_PROMPT_CHARS);
  }

  /**
   * Ask the LLM to extract self-contained Q&A pairs from page content.
   *
   * The reply is parsed leniently: first as JSON with any code fence
   * stripped, then as the outermost `[...]` span found in the text.
   *
   * @returns array of `{ question, answer }`; `[]` when nothing usable
   *          could be parsed
   */
  async extractQAPairs(content, title) {
    const cleaned = this.cleanContent(content);
    const titleHint = title ? `\nPage title: "${title}"` : "";
    const response = await this.llmProvider.complete([{
      role: "system",
      content: `You extract self-contained Q&A pairs from web page content. Each answer must include ALL relevant details (names, numbers, prices, dates) so it can be understood without the original page. Output ONLY a JSON array of {"question":"...","answer":"..."} objects. No markdown fences.`
    }, {
      role: "user",
      content: `Extract Q&A pairs from this content.${titleHint}\n\n${cleaned}`
    }]);
    // A provider returning no text must yield "no pairs", not a TypeError.
    const raw = typeof response?.text === "string" ? response.text : "";
    const candidates = [
      raw.replace(/^```(?:json)?\s*/m, "").replace(/\s*```\s*$/m, "").trim(),
      raw.match(/\[[\s\S]*\]/)?.[0],
    ];
    for (const candidate of candidates) {
      if (!candidate) continue;
      const parsed = IngestionPipeline.#parseJsonOrNull(candidate);
      if (Array.isArray(parsed)) return parsed.filter((p) => p?.question && p?.answer);
    }
    return [];
  }

  /** SHA-256 hex digest of `content`. */
  computeHash(content) {
    return createHash("sha256").update(content).digest("hex");
  }

  /**
   * Ingest one piece of content.
   *
   * Steps: replace any document previously ingested from the same
   * `sourceUrl`; skip content whose hash is already stored; optionally
   * extract FAQs via the LLM; otherwise chunk, embed and store.
   *
   * @param input - fields read: `content`, `title`, `sourceUrl`,
   *        `sourceType`, `fileName`, `metadata`, `priority`, `extractQA`,
   *        `isMarkdown`
   * @returns the stored document, or the already-stored duplicate
   * @throws {Error} when `input.content` is empty or whitespace only
   */
  async ingest(input) {
    if (!input.content || input.content.trim().length === 0) throw new Error(`No content to ingest for "${input.title || input.sourceUrl || "unknown"}"`);
    const sqliteStore = this.store;
    // Hash the full normalized content. Hashing the prompt-truncated text
    // made documents that differ only after the cut-off look identical, so
    // the later one was dropped as a "duplicate". Content shorter than the
    // cut-off hashes exactly as before.
    const contentHash = this.computeHash(IngestionPipeline.#normalizeContent(input.content));
    const priority = input.priority ?? (input.sourceType === "faq" ? 1 : 2);
    if (input.sourceUrl && sqliteStore.findBySourceUrl) {
      const previous = await sqliteStore.findBySourceUrl(input.sourceUrl);
      // Re-ingesting a URL replaces the earlier document outright.
      if (previous) await this.store.deleteDocument(previous.id);
    }
    if (sqliteStore.findByContentHash) {
      const existing = await sqliteStore.findByContentHash(contentHash);
      if (existing) {
        console.log(`[KB] Duplicate content detected (hash match), skipping: "${input.title || input.sourceUrl || "unknown"}"`);
        return existing;
      }
    }
    if (input.extractQA && this.llmProvider) {
      const pairs = await this.extractQAPairs(input.content, input.title);
      if (pairs.length > 0) {
        for (const pair of pairs) await this.ingestFaq(pair.question, pair.answer, { sourceUrl: input.sourceUrl });
        const now = Date.now();
        // Placeholder parent: records that the source was processed and
        // carries the hash for future duplicate detection.
        const parentDoc = {
          id: randomUUID(),
          sourceType: input.sourceType,
          sourceUrl: input.sourceUrl,
          fileName: input.fileName,
          title: input.title,
          content: `Extracted ${pairs.length} Q&A pairs`,
          metadata: {
            ...input.metadata,
            faqCount: pairs.length
          },
          createdAt: now,
          updatedAt: now,
          priority,
          contentHash
        };
        await this.store.addDocument(parentDoc);
        return parentDoc;
      }
    }
    // Chunk and embed BEFORE writing the document: if the provider fails, no
    // chunk-less row is left behind to block a retry through the hash check.
    const texts = input.isMarkdown || input.sourceType === "url" ? await this.chunker.chunkMarkdown(input.content) : await this.chunker.chunk(input.content);
    const embeddings = await this.embedder.embedMany(texts);
    const now = Date.now();
    const doc = {
      id: randomUUID(),
      sourceType: input.sourceType,
      sourceUrl: input.sourceUrl,
      fileName: input.fileName,
      title: input.title,
      content: input.content,
      metadata: input.metadata,
      createdAt: now,
      updatedAt: now,
      priority,
      contentHash
    };
    const chunks = texts.map((text, i) => ({
      id: randomUUID(),
      documentId: doc.id,
      content: text,
      chunkIndex: i,
      embedding: embeddings[i],
      metadata: input.metadata
    }));
    await this.store.addDocument(doc);
    try {
      await this.store.addChunks(chunks);
    } catch (err) {
      try {
        await this.store.deleteDocument(doc.id);
      } catch {
        // Cleanup is best-effort; the original failure is the one to report.
      }
      throw err;
    }
    if (this.store.getChunkCount) {
      if (this.store.getChunkCount(doc.id) === 0) console.warn(`[KB] WARNING: Document "${input.title || doc.id}" was saved but NO vector embeddings were stored.`);
    }
    return doc;
  }

  /**
   * Ingest a single question/answer pair as an FAQ document with one chunk.
   * The embedding is computed from the normalized question only.
   *
   * If the store reports a similar FAQ (score >= 0.9) and
   * `metadata.forceReplace` is not set, nothing is stored; the returned
   * object describes the candidate and carries an `existingMatch` field.
   * With `forceReplace` and `replaceId`, that document is deleted first.
   *
   * @returns the stored FAQ document, or the unsaved candidate with
   *          `existingMatch`
   */
  async ingestFaq(question, answer, metadata) {
    const embedding = await this.embedder.embed(normalizeQuery(question));
    const sqliteStore = this.store;
    const content = `Q: ${question}\nA: ${answer}`;
    const buildFaqDocument = () => {
      const now = Date.now();
      return {
        id: randomUUID(),
        sourceType: "faq",
        sourceUrl: metadata?.sourceUrl,
        title: question,
        content,
        metadata: {
          ...metadata,
          question,
          answer
        },
        priority: 1,
        createdAt: now,
        updatedAt: now
      };
    };
    if (sqliteStore.findSimilarFaq && !metadata?.forceReplace) {
      const match = await sqliteStore.findSimilarFaq(embedding, .9);
      if (match) {
        return {
          ...buildFaqDocument(),
          existingMatch: {
            id: match.document.id,
            question: match.chunk.metadata?.question || match.document.title,
            answer: match.chunk.metadata?.answer,
            score: match.score
          }
        };
      }
    }
    if (metadata?.forceReplace && metadata?.replaceId) await this.store.deleteDocument(metadata.replaceId);
    const doc = buildFaqDocument();
    await this.store.addDocument(doc);
    await this.store.addChunks([{
      id: randomUUID(),
      documentId: doc.id,
      content,
      chunkIndex: 0,
      embedding,
      metadata: {
        question,
        answer
      }
    }]);
    return doc;
  }

  /**
   * Rebuild all vector embeddings using the current embedding provider.
   * Preserves all document content, chunks, and FTS data — only replaces vectors.
   *
   * Requires the store to be a SQLiteKnowledgeStore (uses rebuild-specific methods).
   *
   * The existing vector table is dropped only after the first batch has been
   * embedded and its vector size checked, so a misconfigured provider cannot
   * wipe the vectors before the problem is detected.
   *
   * @param [onProgress] - called as `(processedDocs, totalDocs, title)`
   *        before each document and once more with `"done"` at the end
   * @returns `{ documentsRebuilt, chunksRebuilt, oldDimensions, newDimensions }`
   * @throws {Error} when the store lacks the rebuild methods, or the provider
   *         returns vectors whose size differs from `embedder.dimensions`
   */
  async rebuild(onProgress) {
    const sqliteStore = this.store;
    if (!sqliteStore.getAllChunks || !sqliteStore.rebuildVecTable || !sqliteStore.batchInsertEmbeddings) throw new Error("Rebuild requires a SQLiteKnowledgeStore with rebuild methods.");
    const oldDimensions = sqliteStore.getDimensions();
    const newDimensions = this.embedder.dimensions;
    const documents = await this.store.listDocuments();
    const docMap = new Map(documents.map((d) => [d.id, d]));
    const allChunks = sqliteStore.getAllChunks();
    if (allChunks.length === 0) return {
      documentsRebuilt: 0,
      chunksRebuilt: 0,
      oldDimensions,
      newDimensions
    };
    const chunksByDoc = /* @__PURE__ */ new Map();
    for (const chunk of allChunks) {
      const list = chunksByDoc.get(chunk.documentId) || [];
      list.push(chunk);
      chunksByDoc.set(chunk.documentId, list);
    }
    let processedDocs = 0;
    const totalDocs = chunksByDoc.size;
    let totalChunksRebuilt = 0;
    let vecTableRebuilt = false;
    for (const [docId, chunks] of chunksByDoc) {
      const doc = docMap.get(docId);
      const docTitle = doc?.title || docId.slice(0, 8);
      onProgress?.(processedDocs, totalDocs, docTitle);
      // FAQ chunks are embedded by their normalized question (as at ingest
      // time); every other chunk is embedded by its content.
      const textsToEmbed = chunks.map((chunk) => {
        if (doc?.sourceType !== "faq") return chunk.content;
        const stored = chunk.metadata ? IngestionPipeline.#parseJsonOrNull(chunk.metadata) : null;
        return normalizeQuery(stored?.question || doc.title || chunk.content);
      });
      const embeddings = await this.embedder.embedMany(textsToEmbed);
      if (!vecTableRebuilt) {
        const actual = embeddings[0]?.length;
        if (actual !== newDimensions) throw new Error(`Embedding provider returned ${actual}d vectors but reports ${newDimensions} dimensions. Existing vectors were left untouched; set the correct dimensions and retry.`);
        sqliteStore.rebuildVecTable(newDimensions);
        vecTableRebuilt = true;
      }
      sqliteStore.batchInsertEmbeddings(chunks.map((chunk, i) => ({
        chunkId: chunk.id,
        embedding: embeddings[i]
      })));
      totalChunksRebuilt += chunks.length;
      processedDocs++;
    }
    onProgress?.(totalDocs, totalDocs, "done");
    return {
      documentsRebuilt: totalDocs,
      chunksRebuilt: totalChunksRebuilt,
      oldDimensions,
      newDimensions
    };
  }
};
|
|
711
|
+
|
|
712
|
+
//#endregion
|
|
713
|
+
//#region src/RankFusion.ts
|
|
714
|
+
/**
|
|
715
|
+
* Reciprocal Rank Fusion (RRF) for combining multiple ranked result sets.
|
|
716
|
+
* Standard technique for hybrid search (vector + keyword).
|
|
717
|
+
*/
|
|
718
|
+
/**
 * Merge several rankings into a single one with Reciprocal Rank Fusion.
 *
 * Every appearance of an item adds `1 / (k + rank)` to that item's total.
 *
 * @param resultSets - Iterable of rankings; each ranking yields `[itemId, rank]` pairs (rank is 0-based)
 * @param k - Smoothing constant added to every rank (default 60)
 * @returns Map of item ID → summed RRF score, with entries ordered from highest to lowest score
 */
function reciprocalRankFusion(resultSets, k = 60) {
  const totals = /* @__PURE__ */ new Map();
  for (const ranking of resultSets) {
    for (const [itemId, position] of ranking) {
      const soFar = totals.get(itemId) ?? 0;
      totals.set(itemId, soFar + 1 / (k + position));
    }
  }
  const ordered = Array.from(totals.entries());
  ordered.sort((left, right) => right[1] - left[1]);
  return new Map(ordered);
}
|
|
733
|
+
/**
 * Weighted Score Fusion: combine vector and keyword scores using a weighted sum.
 * Keyword (BM25) scores are min-max normalized to 0-1 before combining; vector
 * scores are used as given.
 *
 * When every keyword score is identical (including the single-hit case) the
 * min-max range is zero; those hits are then given a normalized score of 1 so
 * that a keyword match still contributes `keywordWeight` instead of nothing.
 *
 * @param vectorResults - Array of `{ id, score }` from vector search
 * @param keywordResults - Array of `{ id, score }` from keyword search
 * @param vectorWeight - Multiplier for the vector score (default 0.7)
 * @param keywordWeight - Multiplier for the normalized keyword score (default 0.3)
 * @returns Map of item ID → fused score, sorted descending
 */
function weightedScoreFusion(vectorResults, keywordResults, vectorWeight = .7, keywordWeight = .3) {
  const bm25Scores = /* @__PURE__ */ new Map();
  if (keywordResults.length > 0) {
    // Plain loop instead of Math.min(...scores): spreading a very large array
    // into a call can exceed the engine's argument limit.
    let min = Infinity;
    let max = -Infinity;
    for (const { score } of keywordResults) {
      if (score < min) min = score;
      if (score > max) max = score;
    }
    const range = max - min;
    for (const r of keywordResults) {
      bm25Scores.set(r.id, range > 0 ? (r.score - min) / range : 1);
    }
  }
  const vecScores = /* @__PURE__ */ new Map();
  for (const r of vectorResults) vecScores.set(r.id, r.score);
  const allIds = new Set([...vecScores.keys(), ...bm25Scores.keys()]);
  const fused = /* @__PURE__ */ new Map();
  for (const id of allIds) {
    // An item missing from one of the two result lists scores 0 on that side.
    const vs = vecScores.get(id) ?? 0;
    const ks = bm25Scores.get(id) ?? 0;
    fused.set(id, vectorWeight * vs + keywordWeight * ks);
  }
  return new Map([...fused.entries()].sort((a, b) => b[1] - a[1]));
}
|
|
758
|
+
|
|
759
|
+
//#endregion
|
|
760
|
+
//#region src/RetrievalPipeline.ts
|
|
761
|
+
/**
 * Cheap, LLM-free splitter for questions that bundle several asks together.
 *
 * Two separators are tried in order: a "?" (with any whitespace after it), then
 * the word "and" surrounded by whitespace (case-insensitive). Pieces whose
 * trimmed length is 3 characters or fewer are discarded. The first separator
 * that leaves more than one piece wins; at most 4 trimmed pieces are returned.
 * If neither separator produces a split, the untouched query is returned as a
 * single-element array.
 */
function splitCompoundQuery(query) {
  const maxParts = 4;
  const separators = [/\?\s*/, /\s+and\s+/i];
  for (const separator of separators) {
    const pieces = query.split(separator).filter((piece) => piece.trim().length > 3);
    if (pieces.length > 1) {
      return pieces.slice(0, maxParts).map((piece) => piece.trim());
    }
  }
  return [query];
}
|
|
774
|
+
/**
 * Answers a query from the knowledge store: FAQ entries are tried first, then a
 * general (optionally hybrid vector + keyword) search, with an optional
 * query-rewrite retry when the best score falls in a configurable band.
 */
var RetrievalPipeline = class {
  store;
  embedder;
  faqThreshold;
  faqLowThreshold;
  faqScoreGap;
  useHybridSearch;
  queryRewriter;
  rewriteHighThreshold;
  rewriteLowThreshold;
  fusionStrategy;
  /**
   * @param store - Knowledge store; must offer `searchByEmbedding`, may offer `searchByKeyword`
   * @param embedder - Object with an async `embed(text)` method
   * @param thresholdOrOptions - Either a bare number (used as `faqThreshold`, all
   *   other settings take their defaults) or an options object
   */
  constructor(store, embedder, thresholdOrOptions) {
    this.store = store;
    this.embedder = embedder;
    // A bare number is shorthand for `{ faqThreshold: n }`.
    const settings = typeof thresholdOrOptions === "number" ? { faqThreshold: thresholdOrOptions } : thresholdOrOptions ?? {};
    this.faqThreshold = settings.faqThreshold ?? .85;
    this.faqLowThreshold = settings.faqLowThreshold ?? .7;
    this.faqScoreGap = settings.faqScoreGap ?? .15;
    this.useHybridSearch = settings.useHybridSearch ?? true;
    this.queryRewriter = settings.queryRewriter;
    this.rewriteHighThreshold = settings.rewriteHighThreshold ?? .7;
    this.rewriteLowThreshold = settings.rewriteLowThreshold ?? .5;
    this.fusionStrategy = settings.fusionStrategy ?? "rrf";
  }
  /**
   * Retrieve for a possibly compound query. The query is split with
   * `splitCompoundQuery`; when at least two sub-queries hit FAQ entries from
   * different documents the merged result is returned, otherwise the whole
   * query is retrieved as one.
   */
  async retrieve(query, options) {
    const parts = splitCompoundQuery(query);
    if (parts.length <= 1) return this.retrieveSingle(query, options);
    const partResults = await Promise.all(parts.map((part) => this.retrieveSingle(part, options)));
    const seenDocIds = /* @__PURE__ */ new Set();
    const faqMatches = [];
    for (const partResult of partResults) {
      if (!(partResult.isFaqMatch && partResult.faqAnswer && partResult.faqQuestion)) continue;
      const lead = partResult.results[0];
      const docId = lead?.document?.id;
      if (!docId || seenDocIds.has(docId)) continue;
      seenDocIds.add(docId);
      faqMatches.push({
        faqQuestion: partResult.faqQuestion,
        faqAnswer: partResult.faqAnswer,
        score: lead?.score ?? 0
      });
    }
    if (faqMatches.length < 2) return this.retrieveSingle(query, options);
    // Merge every sub-result, keeping only the first occurrence of each chunk.
    const seenChunkIds = /* @__PURE__ */ new Set();
    const merged = [];
    for (const partResult of partResults) {
      for (const hit of partResult.results) {
        if (seenChunkIds.has(hit.chunk.id)) continue;
        seenChunkIds.add(hit.chunk.id);
        merged.push(hit);
      }
    }
    return {
      results: merged,
      context: this.formatContext(merged),
      isFaqMatch: true,
      faqMatches,
      faqAnswer: faqMatches[0].faqAnswer,
      faqQuestion: faqMatches[0].faqQuestion
    };
  }
  /**
   * Retrieve for one query: normalize, embed, try the top two FAQ hits, then
   * fall back to `hybridSearch`, optionally retrying with a rewritten query.
   */
  async retrieveSingle(query, options) {
    const normalized = normalizeQuery(query);
    const embedding = await this.embedder.embed(normalized);
    // A fresh options object per search, as each call site originally built its own.
    const faqSearchOptions = () => ({
      ...options,
      sourceTypes: ["faq"],
      limit: 2
    });
    // Chunk-level metadata wins over document-level metadata.
    const readFaqFields = (hit) => ({
      faqAnswer: hit.chunk.metadata?.answer || hit.document.metadata?.answer,
      faqQuestion: hit.chunk.metadata?.question || hit.document.metadata?.question
    });
    const faqHits = await this.store.searchByEmbedding(embedding, faqSearchOptions());
    if (faqHits.length > 0) {
      const hasRunnerUp = faqHits.length > 1;
      let best = faqHits[0];
      if (hasRunnerUp) {
        // Near-tie (≤ 0.02 apart): prefer the more recently updated document.
        const runnerUp = faqHits[1];
        const nearTie = best.score - runnerUp.score <= .02;
        if (nearTie && (runnerUp.document.updatedAt ?? 0) > (best.document.updatedAt ?? 0)) best = runnerUp;
      }
      const { faqAnswer, faqQuestion } = readFaqFields(best);
      const isConfident = best.score >= this.faqThreshold;
      // The gap is always measured against faqHits[1], even after a tie-break swap.
      const isClearWinner = () => best.score >= this.faqLowThreshold && (hasRunnerUp ? best.score - faqHits[1].score : 1) > this.faqScoreGap;
      if (isConfident || isClearWinner()) {
        return {
          results: [best],
          context: this.formatContext([best]),
          isFaqMatch: true,
          faqAnswer,
          faqQuestion
        };
      }
    }
    const results = await this.hybridSearch(normalized, embedding, options);
    const topScore = results.length > 0 ? results[0].score : 0;
    const inRewriteBand = topScore >= this.rewriteLowThreshold && topScore < this.rewriteHighThreshold;
    if (this.queryRewriter && inRewriteBand) {
      try {
        const { rewritten } = await this.queryRewriter.rewrite(normalized);
        const rewrittenEmbedding = await this.embedder.embed(rewritten);
        const rewrittenFaqHits = await this.store.searchByEmbedding(rewrittenEmbedding, faqSearchOptions());
        if (rewrittenFaqHits.length > 0 && rewrittenFaqHits[0].score >= this.faqLowThreshold) {
          const best = rewrittenFaqHits[0];
          const gap = rewrittenFaqHits.length > 1 ? best.score - rewrittenFaqHits[1].score : 1;
          if (best.score >= this.faqThreshold || gap > this.faqScoreGap) {
            const { faqAnswer, faqQuestion } = readFaqFields(best);
            return {
              results: [best],
              context: this.formatContext([best]),
              isFaqMatch: true,
              rewritten,
              faqAnswer,
              faqQuestion
            };
          }
        }
        const rewrittenResults = await this.hybridSearch(rewritten, rewrittenEmbedding, options);
        if (rewrittenResults.length > 0 && rewrittenResults[0].score > topScore) {
          return {
            results: rewrittenResults,
            context: this.formatContext(rewrittenResults),
            isFaqMatch: false,
            rewritten
          };
        }
      } catch {}
      // Any failure in the rewrite attempt falls through to the original results.
    }
    return {
      results,
      context: this.formatContext(results),
      isFaqMatch: false
    };
  }
  /**
   * Vector search, fused with keyword search when hybrid search is enabled and
   * the store implements `searchByKeyword`. Returns at most `options.limit`
   * (default 5) boosted results.
   */
  async hybridSearch(query, embedding, options) {
    const limit = options?.limit || 5;
    const keywordSearchAvailable = this.useHybridSearch && this.store.searchByKeyword;
    if (!keywordSearchAvailable) {
      const vectorOnly = await this.store.searchByEmbedding(embedding, {
        ...options,
        limit
      });
      return this.applyBoosts(vectorOnly.slice(0, limit));
    }
    // Over-fetch (2x) from both sources so fusion has candidates to choose from.
    const searchOpts = {
      ...options,
      limit: limit * 2
    };
    const [vecResults, ftsResults] = await Promise.all([
      this.store.searchByEmbedding(embedding, searchOpts),
      this.store.searchByKeyword(query, searchOpts)
    ]);
    if (ftsResults.length === 0) return this.applyBoosts(vecResults.slice(0, limit));
    const useWeighted = this.fusionStrategy === "weighted";
    let fused;
    if (useWeighted) {
      const toIdScore = (hit) => ({
        id: hit.chunk.id,
        score: hit.score
      });
      fused = weightedScoreFusion(vecResults.map(toIdScore), ftsResults.map(toIdScore));
    } else {
      const vecRanks = /* @__PURE__ */ new Map();
      vecResults.forEach((hit, rank) => vecRanks.set(hit.chunk.id, rank));
      const ftsRanks = /* @__PURE__ */ new Map();
      ftsResults.forEach((hit, rank) => ftsRanks.set(hit.chunk.id, rank));
      fused = reciprocalRankFusion([vecRanks, ftsRanks]);
    }
    // Vector hits take precedence; keyword hits only fill in chunks not yet seen.
    const hitByChunkId = /* @__PURE__ */ new Map();
    for (const hit of vecResults) hitByChunkId.set(hit.chunk.id, hit);
    for (const hit of ftsResults) {
      if (!hitByChunkId.has(hit.chunk.id)) hitByChunkId.set(hit.chunk.id, hit);
    }
    const fusedResults = [];
    for (const [chunkId, fusedScore] of fused) {
      if (fusedResults.length >= limit) break;
      const hit = hitByChunkId.get(chunkId);
      if (!hit) continue;
      // Weighted fusion replaces the score; RRF keeps each hit's own search score.
      // NOTE(review): in the RRF case applyBoosts re-sorts on those raw scores,
      // so the RRF order only decides which hits make the cut — confirm intended.
      fusedResults.push(useWeighted ? {
        ...hit,
        score: fusedScore
      } : hit);
    }
    return this.applyBoosts(fusedResults);
  }
  /**
   * Apply freshness and priority boosts to search results, then re-sort.
   * +0.05 when the document was updated within the last 720 hours (30 days);
   * +0.03 for priority 1, -0.02 for priority 3 (missing priority counts as 2).
   */
  applyBoosts(results) {
    if (results.length === 0) return results;
    const thirtyDaysAgo = Date.now() - 720 * 60 * 60 * 1e3;
    const boosted = [];
    for (const hit of results) {
      const { updatedAt, priority = null } = hit.document;
      let score = hit.score;
      if (updatedAt && updatedAt > thirtyDaysAgo) score += .05;
      const effectivePriority = priority ?? 2;
      if (effectivePriority === 1) score += .03;
      else if (effectivePriority === 3) score -= .02;
      boosted.push({
        ...hit,
        score
      });
    }
    boosted.sort((a, b) => b.score - a.score);
    return boosted;
  }
  /**
   * Render results as a markdown context block; empty string when there are none.
   * Each source is labelled with the document title, URL or file name.
   */
  formatContext(results) {
    if (results.length === 0) return "";
    const sections = results.map((hit, index) => {
      const { title, sourceUrl, fileName } = hit.document;
      const source = title || sourceUrl || fileName || "Unknown";
      return `### Source ${index + 1}: ${source} (score: ${hit.score.toFixed(2)})\n${hit.chunk.content}`;
    });
    return `## Knowledge Base Context\n\n${sections.join("\n\n")}`;
  }
};
|
|
998
|
+
|
|
999
|
+
//#endregion
|
|
1000
|
+
//#region src/QueryRewriter.ts
|
|
1001
|
+
const SYSTEM_PROMPT = "You are a query normalizer. Rewrite the following informal/casual query into a clear, well-formed question. Only output the rewritten question, nothing else.";
const MAX_CACHE_SIZE = 1e3;
/**
 * Rewrites informal queries into well-formed questions via `generateText`,
 * memoising results in an in-memory Map with first-in-first-out eviction.
 */
var QueryRewriter = class {
  model;
  cache;
  maxCacheSize;
  /**
   * @param options.model - Model handle passed straight through to `generateText`
   * @param options.maxCacheSize - Maximum number of cached rewrites (default 1000)
   */
  constructor(options) {
    this.model = options.model;
    this.cache = /* @__PURE__ */ new Map();
    this.maxCacheSize = options.maxCacheSize ?? MAX_CACHE_SIZE;
  }
  /**
   * Rewrite `query`, serving repeat queries (case-insensitive, trimmed) from cache.
   *
   * @returns `{ original, rewritten, cached }`, plus `tokenUsage` when the model
   *   call reported usage. If the model returns only whitespace, `rewritten`
   *   falls back to the original query.
   */
  async rewrite(query) {
    const cacheKey = query.toLowerCase().trim();
    // Test for presence, not truthiness, so every stored entry counts as a hit.
    if (this.cache.has(cacheKey)) {
      return {
        original: query,
        rewritten: this.cache.get(cacheKey),
        cached: true
      };
    }
    const { text, usage } = await generateText({
      model: this.model,
      system: SYSTEM_PROMPT,
      prompt: query
    });
    // An empty rewrite is useless downstream (it would be embedded as ""), so
    // keep the caller's query in that case.
    const rewritten = (text ?? "").trim() || query;
    // Map iteration follows insertion order, so the first key is the oldest.
    // Skip eviction if a concurrent call already stored this key meanwhile.
    if (!this.cache.has(cacheKey) && this.cache.size >= this.maxCacheSize) {
      const oldest = this.cache.keys().next().value;
      this.cache.delete(oldest);
    }
    this.cache.set(cacheKey, rewritten);
    return {
      original: query,
      rewritten,
      cached: false,
      tokenUsage: usage ? {
        prompt: usage.promptTokens,
        completion: usage.completionTokens
      } : void 0
    };
  }
  /** Number of rewrites currently cached. */
  get cacheSize() {
    return this.cache.size;
  }
  /** Drop every cached rewrite. */
  clearCache() {
    this.cache.clear();
  }
};
|
|
1048
|
+
|
|
1049
|
+
//#endregion
|
|
1050
|
+
//#region src/ingestors/content-fetcher.ts
|
|
1051
|
+
// Outcome of the most recent Crawl4AI health probe; null until one has run.
let crawl4aiHealthy = null;
// Date.now() timestamp of the most recent probe; 0 when there has been none.
let crawl4aiHealthCheckedAt = 0;
// How long a probe result is reused before probing again: 5 minutes.
const HEALTH_CACHE_MS = 5 * 60 * 1e3;
/** Forget the cached Crawl4AI health probe so the next check probes again. Used in tests. */
function resetCrawl4aiHealthCache() {
  crawl4aiHealthCheckedAt = 0;
  crawl4aiHealthy = null;
}
|
|
1059
|
+
/**
 * Fetch the raw response body of a URL as text. Used for link extraction, sitemaps, etc.
 *
 * The request is aborted after `options.timeoutMs` (default 30 000 ms) so that a
 * server which accepts the connection but never answers cannot stall a crawl.
 *
 * @param url - URL to fetch
 * @param options - Optional `{ timeoutMs }`
 * @returns The response body as text
 * @throws Error when the response status is not OK; the fetch rejection
 *   (including the timeout abort) is propagated unchanged
 */
async function fetchHtml(url, options) {
  const timeoutMs = options?.timeoutMs ?? 3e4;
  const response = await fetch(url, {
    headers: { "User-Agent": "Operor-KB/1.0" },
    signal: AbortSignal.timeout(timeoutMs)
  });
  if (!response.ok) throw new Error(`Failed to fetch ${url}: ${response.status}`);
  return response.text();
}
|
|
1067
|
+
/**
 * Fetch a page's title and content.
 *
 * When `options.crawl4aiUrl` is set and the Crawl4AI service reports healthy,
 * its markdown result is returned. If that path is unavailable or throws, the
 * page is fetched directly and run through `extractFromHtml`, in which case
 * `isMarkdown` is false.
 */
async function fetchContent(url, options) {
  const crawl4aiUrl = options?.crawl4aiUrl;
  if (crawl4aiUrl) {
    try {
      const serviceIsUp = await isCrawl4aiHealthy(crawl4aiUrl);
      if (serviceIsUp) {
        const rendered = await fetchViaCrawl4AI(url, crawl4aiUrl);
        return rendered;
      }
    } catch {}
    // Crawl4AI is best-effort: any failure falls through to the plain fetch below.
  }
  const html = await fetchHtml(url);
  const extracted = extractFromHtml(html, url);
  return {
    title: extracted.title,
    content: extracted.content,
    isMarkdown: false
  };
}
|
|
1081
|
+
/**
 * Pull the readable article out of already-fetched HTML with @mozilla/readability.
 *
 * @param html - HTML source
 * @param url - Page URL, handed to Readability as its `url` option
 * @returns `{ title, content }`; both are empty strings when Readability finds no article
 */
function extractFromHtml(html, url) {
  const parsed = parseHTML(html);
  const reader = new Readability(parsed.document, { url });
  const article = reader.parse();
  const title = article?.title || "";
  const content = article?.textContent?.trim() || "";
  return {
    title,
    content
  };
}
|
|
1092
|
+
/**
 * Collect the distinct http(s) links in `html` that point at the same hostname
 * as `baseUrl`. Relative hrefs are resolved against `baseUrl`, fragments are
 * dropped, and hrefs that fail to parse are skipped.
 */
function extractLinks(html, baseUrl) {
  const { document } = parseHTML(html);
  const collected = [];
  const { hostname: baseHostname } = new URL(baseUrl);
  for (const anchor of document.querySelectorAll("a[href]")) {
    try {
      const href = anchor.getAttribute("href");
      if (!href) continue;
      const target = new URL(href, baseUrl);
      const sameHost = target.hostname === baseHostname;
      const isWeb = target.protocol.startsWith("http");
      if (sameHost && isWeb) {
        const [withoutFragment] = target.href.split("#");
        collected.push(withoutFragment);
      }
    } catch {}
  }
  return Array.from(new Set(collected));
}
|
|
1107
|
+
/**
 * Report whether the Crawl4AI service answers its `/health` endpoint.
 *
 * The outcome is cached in module state for HEALTH_CACHE_MS; within that window
 * the cached value is returned without probing. A probe that fails or takes
 * longer than 2 seconds counts as unhealthy.
 *
 * Trailing slashes on `baseUrl` are stripped before `/health` is appended, so
 * "http://host:11235/" does not turn into "http://host:11235//health".
 *
 * NOTE(review): the cache is one module-level value, not keyed by `baseUrl`, so
 * a result obtained for one base URL is reused for any other within the window.
 *
 * @param baseUrl - Base URL of the Crawl4AI service
 * @returns true when the health endpoint responded with an OK status
 */
async function isCrawl4aiHealthy(baseUrl) {
  const cacheIsFresh = crawl4aiHealthy !== null && Date.now() - crawl4aiHealthCheckedAt < HEALTH_CACHE_MS;
  if (cacheIsFresh) return crawl4aiHealthy;
  const healthUrl = `${String(baseUrl).replace(/\/+$/, "")}/health`;
  try {
    const response = await fetch(healthUrl, { signal: AbortSignal.timeout(2e3) });
    crawl4aiHealthy = response.ok;
  } catch {
    crawl4aiHealthy = false;
  }
  crawl4aiHealthCheckedAt = Date.now();
  return crawl4aiHealthy;
}
|
|
1117
|
+
/**
 * Ask a Crawl4AI service to render `url` and return its markdown.
 *
 * POSTs to `<baseUrl>/crawl` (trailing slashes on `baseUrl` are stripped first
 * so the path never becomes `//crawl`) with a 30 second timeout. The filtered
 * `fit_markdown` is preferred, falling back to `raw_markdown`. The title is the
 * text of the first level-1 markdown heading, or "" when there is none.
 *
 * @param url - Page to crawl
 * @param baseUrl - Base URL of the Crawl4AI service
 * @returns `{ title, content, isMarkdown: true }`
 * @throws Error on a non-OK response, a missing result, or empty markdown
 */
async function fetchViaCrawl4AI(url, baseUrl) {
  const crawlUrl = `${String(baseUrl).replace(/\/+$/, "")}/crawl`;
  const res = await fetch(crawlUrl, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      urls: [url],
      browser_config: {
        type: "BrowserConfig",
        params: { headless: true }
      },
      crawler_config: {
        type: "CrawlerRunConfig",
        params: {
          cache_mode: "bypass",
          markdown_generator: {
            type: "DefaultMarkdownGenerator",
            params: { content_filter: {
              type: "PruningContentFilter",
              params: { threshold: .48 }
            } }
          }
        }
      }
    }),
    signal: AbortSignal.timeout(3e4)
  });
  if (!res.ok) throw new Error(`Crawl4AI error: ${res.status}`);
  const payload = await res.json();
  const result = payload.results?.[0];
  if (!result) throw new Error("Crawl4AI returned no results");
  const markdown = result.markdown?.fit_markdown || result.markdown?.raw_markdown || "";
  if (!markdown) throw new Error("Crawl4AI returned empty markdown");
  return {
    title: markdown.match(/^#\s+(.+)$/m)?.[1] || "",
    content: markdown,
    isMarkdown: true
  };
}
|
|
1154
|
+
|
|
1155
|
+
//#endregion
|
|
1156
|
+
//#region src/ingestors/UrlIngestor.ts
|
|
1157
|
+
/**
 * Ingests web pages into the pipeline: single URLs, every page listed in a
 * sitemap, or a breadth-first crawl of same-host links.
 */
var UrlIngestor = class {
  pipeline;
  fetchOptions;
  /**
   * @param pipeline - Object with an async `ingest(document)` method
   * @param options - Optional `{ crawl4aiUrl }` forwarded to `fetchContent`
   */
  constructor(pipeline, options) {
    this.pipeline = pipeline;
    this.fetchOptions = { crawl4aiUrl: options?.crawl4aiUrl };
  }
  /**
   * Fetch one URL and ingest it.
   *
   * @param url - Page to ingest
   * @param options - Optional `{ priority, extractQA }` passed on to the pipeline
   * @returns Whatever `pipeline.ingest` resolves to
   */
  async ingestUrl(url, options) {
    const { title, content, isMarkdown } = await fetchContent(url, this.fetchOptions);
    return this.pipeline.ingest({
      sourceType: "url",
      sourceUrl: url,
      title,
      content,
      isMarkdown,
      priority: options?.priority,
      extractQA: options?.extractQA
    });
  }
  /**
   * Ingest the pages listed in a sitemap, one after another.
   * Pages that fail to fetch or ingest are skipped.
   *
   * @param sitemapUrl - URL of the sitemap XML
   * @param options - Optional `{ maxPages }` (default 50)
   * @returns The documents that were ingested successfully
   */
  async ingestSitemap(sitemapUrl, options) {
    const maxPages = options?.maxPages || 50;
    const xml = await fetchHtml(sitemapUrl);
    const urls = this.parseSitemapUrls(xml).slice(0, maxPages);
    const docs = [];
    for (const url of urls) {
      try {
        docs.push(await this.ingestUrl(url));
      } catch {}
      // Best-effort: one bad page must not abort the rest of the sitemap.
    }
    return docs;
  }
  /**
   * Breadth-first crawl starting at `startUrl`, following same-host links.
   *
   * @param startUrl - First page to ingest (depth 0)
   * @param options - Optional `{ maxPages, maxDepth }`. `maxPages` defaults to 20.
   *   `maxDepth` defaults to 2; an explicit 0 is honoured and ingests only the
   *   start page.
   * @returns The documents that were ingested successfully
   */
  async crawl(startUrl, options) {
    const maxPages = options?.maxPages || 20;
    // `??` rather than `||`: 0 is a meaningful depth ("start page only").
    const maxDepth = options?.maxDepth ?? 2;
    const visited = /* @__PURE__ */ new Set();
    const docs = [];
    const queue = [{
      url: startUrl,
      depth: 0
    }];
    while (queue.length > 0 && docs.length < maxPages) {
      const item = queue.shift();
      if (visited.has(item.url) || item.depth > maxDepth) continue;
      visited.add(item.url);
      try {
        // The raw HTML is fetched separately from the content because link
        // extraction needs the markup, whichever way the content is obtained.
        const html = await fetchHtml(item.url);
        const { title, content, isMarkdown } = await fetchContent(item.url, this.fetchOptions);
        const doc = await this.pipeline.ingest({
          sourceType: "url",
          sourceUrl: item.url,
          title,
          content,
          isMarkdown
        });
        docs.push(doc);
        if (item.depth < maxDepth) {
          for (const link of extractLinks(html, item.url)) {
            if (!visited.has(link)) queue.push({
              url: link,
              depth: item.depth + 1
            });
          }
        }
      } catch {}
      // Best-effort: a page that fails is skipped and the crawl continues.
    }
    return docs;
  }
  /**
   * Extract page URLs from sitemap XML.
   *
   * `<loc>` values may span lines, be padded with whitespace, be wrapped in
   * CDATA, and carry XML entities (`&amp;` etc.); all of these are normalised.
   * Entries ending in ".xml" (nested sitemaps) and empty entries are dropped.
   *
   * @param xml - Sitemap document text
   * @returns Array of page URLs
   */
  parseSitemapUrls(xml) {
    const entities = {
      amp: "&",
      lt: "<",
      gt: ">",
      quot: "\"",
      apos: "'"
    };
    const decodeLoc = (raw) => raw.trim().replace(/^<!\[CDATA\[([\s\S]*?)\]\]>$/, "$1").trim().replace(/&(amp|lt|gt|quot|apos);/g, (_, name) => entities[name]);
    const urls = [];
    for (const match of xml.matchAll(/<loc>([\s\S]*?)<\/loc>/g)) {
      const loc = decodeLoc(match[1]);
      if (loc) urls.push(loc);
    }
    return urls.filter((u) => !u.endsWith(".xml"));
  }
};
|
|
1230
|
+
|
|
1231
|
+
//#endregion
|
|
1232
|
+
//#region src/ingestors/FileIngestor.ts
|
|
1233
|
+
/**
 * Ingests local files into the pipeline, converting PDF, DOCX, spreadsheet and
 * HTML files to text first. Any other extension is read as UTF-8 text.
 */
var FileIngestor = class {
  pipeline;
  /** @param pipeline - Object with an async `ingest(document)` method */
  constructor(pipeline) {
    this.pipeline = pipeline;
  }
  /**
   * Extract a file's text and ingest it.
   *
   * @param filePath - Path of the file to read
   * @param title - Optional title; defaults to the file name
   * @param options - Optional `{ priority }` passed on to the pipeline
   * @returns Whatever `pipeline.ingest` resolves to
   */
  async ingestFile(filePath, title, options) {
    const ext = extname(filePath).toLowerCase();
    const content = await this.extractContent(filePath, ext);
    // Split on both separators so Windows paths ("C:\\kb\\guide.txt") yield the
    // bare file name instead of the whole path.
    const fileName = filePath.split(/[\\/]/).pop() || filePath;
    return this.pipeline.ingest({
      sourceType: "file",
      fileName,
      title: title || fileName,
      content,
      priority: options?.priority
    });
  }
  /**
   * Dispatch to the extractor for the given (lower-cased, dot-prefixed) extension.
   * Unrecognised extensions are read as UTF-8 text, like .csv/.txt/.md.
   */
  async extractContent(filePath, ext) {
    switch (ext) {
      case ".pdf": return this.extractPdf(filePath);
      case ".docx": return this.extractDocx(filePath);
      case ".xlsx":
      case ".xls": return this.extractXlsx(filePath);
      case ".html":
      case ".htm": return this.extractHtml(filePath);
      case ".csv":
      case ".txt":
      case ".md":
      default: return readFile(filePath, "utf-8");
    }
  }
  /** Extract the text of a PDF (all pages merged) using the lazily imported `unpdf`. */
  async extractPdf(filePath) {
    const { getDocumentProxy, extractText } = await import("unpdf");
    const buffer = await readFile(filePath);
    const { text } = await extractText(await getDocumentProxy(new Uint8Array(buffer)), { mergePages: true });
    return text;
  }
  /** Extract the raw text of a .docx file using the lazily imported `mammoth`. */
  async extractDocx(filePath) {
    const mammoth = await import("mammoth");
    const buffer = await readFile(filePath);
    return (await mammoth.extractRawText({ buffer })).value;
  }
  /**
   * Convert every sheet of a workbook to CSV using the lazily imported `xlsx`.
   * Each sheet becomes a "## <sheet name>" heading followed by its CSV.
   */
  async extractXlsx(filePath) {
    const XLSX = await import("xlsx");
    const buffer = await readFile(filePath);
    const workbook = XLSX.read(buffer, { type: "buffer" });
    const sections = workbook.SheetNames.map((sheetName) => {
      const csv = XLSX.utils.sheet_to_csv(workbook.Sheets[sheetName]);
      return `## ${sheetName}\n${csv}`;
    });
    return sections.join("\n\n");
  }
  /**
   * Extract the readable text of an HTML file with Readability; when no
   * article text is found the raw HTML is returned instead.
   */
  async extractHtml(filePath) {
    const { parseHTML } = await import("linkedom");
    const { Readability } = await import("@mozilla/readability");
    const html = await readFile(filePath, "utf-8");
    const { document } = parseHTML(html);
    return new Readability(document).parse()?.textContent?.trim() || html;
  }
};
|
|
1295
|
+
|
|
1296
|
+
//#endregion
|
|
1297
|
+
//#region src/ingestors/SiteCrawler.ts
|
|
1298
|
+
var SiteCrawler = class {
|
|
1299
|
+
pipeline;
|
|
1300
|
+
fetchOptions;
|
|
1301
|
+
constructor(pipeline, options) {
|
|
1302
|
+
this.pipeline = pipeline;
|
|
1303
|
+
this.fetchOptions = { crawl4aiUrl: options?.crawl4aiUrl };
|
|
1304
|
+
}
|
|
1305
|
+
async crawlSite(startUrl, options = {}) {
|
|
1306
|
+
const { maxDepth = 2, maxPages = 50, useSitemap = true, delayMs = 500, onProgress } = options;
|
|
1307
|
+
const docs = [];
|
|
1308
|
+
const visited = /* @__PURE__ */ new Set();
|
|
1309
|
+
if (useSitemap) {
|
|
1310
|
+
const sitemapUrls = await this.tryFetchSitemap(startUrl);
|
|
1311
|
+
if (sitemapUrls.length > 0) {
|
|
1312
|
+
const urlsToIngest = sitemapUrls.slice(0, maxPages);
|
|
1313
|
+
for (const url of urlsToIngest) {
|
|
1314
|
+
if (visited.has(url)) continue;
|
|
1315
|
+
visited.add(url);
|
|
1316
|
+
try {
|
|
1317
|
+
onProgress?.(docs.length + 1, urlsToIngest.length, url);
|
|
1318
|
+
const doc = await this.ingestPage(url);
|
|
1319
|
+
docs.push(doc);
|
|
1320
|
+
if (delayMs > 0) await this.delay(delayMs);
|
|
1321
|
+
} catch {}
|
|
1322
|
+
if (docs.length >= maxPages) break;
|
|
1323
|
+
}
|
|
1324
|
+
return docs;
|
|
1325
|
+
}
|
|
1326
|
+
}
|
|
1327
|
+
const queue = [{
|
|
1328
|
+
url: startUrl,
|
|
1329
|
+
depth: 0
|
|
1330
|
+
}];
|
|
1331
|
+
while (queue.length > 0 && docs.length < maxPages) {
|
|
1332
|
+
const item = queue.shift();
|
|
1333
|
+
if (visited.has(item.url) || item.depth > maxDepth) continue;
|
|
1334
|
+
visited.add(item.url);
|
|
1335
|
+
try {
|
|
1336
|
+
onProgress?.(docs.length + 1, docs.length + queue.length + 1, item.url);
|
|
1337
|
+
const html = await fetchHtml(item.url);
|
|
1338
|
+
const { title, content, isMarkdown } = await fetchContent(item.url, this.fetchOptions);
|
|
1339
|
+
const doc = await this.pipeline.ingest({
|
|
1340
|
+
sourceType: "url",
|
|
1341
|
+
sourceUrl: item.url,
|
|
1342
|
+
title,
|
|
1343
|
+
content,
|
|
1344
|
+
isMarkdown
|
|
1345
|
+
});
|
|
1346
|
+
docs.push(doc);
|
|
1347
|
+
if (delayMs > 0) await this.delay(delayMs);
|
|
1348
|
+
if (item.depth < maxDepth) {
|
|
1349
|
+
for (const link of extractLinks(html, item.url)) if (!visited.has(link)) queue.push({
|
|
1350
|
+
url: link,
|
|
1351
|
+
depth: item.depth + 1
|
|
1352
|
+
});
|
|
1353
|
+
}
|
|
1354
|
+
} catch {}
|
|
1355
|
+
}
|
|
1356
|
+
return docs;
|
|
1357
|
+
}
|
|
1358
|
+
async tryFetchSitemap(baseUrl) {
|
|
1359
|
+
try {
|
|
1360
|
+
const url = new URL(baseUrl);
|
|
1361
|
+
const xml = await fetchHtml(`${url.protocol}//${url.hostname}/sitemap.xml`);
|
|
1362
|
+
return await this.parseSitemapUrls(xml);
|
|
1363
|
+
} catch {
|
|
1364
|
+
return [];
|
|
1365
|
+
}
|
|
1366
|
+
}
|
|
1367
|
+
async parseSitemapUrls(xml) {
|
|
1368
|
+
const urls = [];
|
|
1369
|
+
const locRegex = /<loc>(.*?)<\/loc>/g;
|
|
1370
|
+
let match;
|
|
1371
|
+
while ((match = locRegex.exec(xml)) !== null) urls.push(match[1]);
|
|
1372
|
+
if (xml.includes("<sitemapindex") || urls.every((u) => u.endsWith(".xml"))) {
|
|
1373
|
+
const pageUrls = [];
|
|
1374
|
+
for (const childSitemapUrl of urls) try {
|
|
1375
|
+
const childXml = await fetchHtml(childSitemapUrl);
|
|
1376
|
+
const childUrls = [];
|
|
1377
|
+
const childRegex = /<loc>(.*?)<\/loc>/g;
|
|
1378
|
+
let childMatch;
|
|
1379
|
+
while ((childMatch = childRegex.exec(childXml)) !== null) childUrls.push(childMatch[1]);
|
|
1380
|
+
pageUrls.push(...childUrls.filter((u) => !u.endsWith(".xml")));
|
|
1381
|
+
} catch {}
|
|
1382
|
+
return pageUrls;
|
|
1383
|
+
}
|
|
1384
|
+
return urls.filter((u) => !u.endsWith(".xml"));
|
|
1385
|
+
}
|
|
1386
|
+
async ingestPage(url) {
|
|
1387
|
+
const { title, content, isMarkdown } = await fetchContent(url, this.fetchOptions);
|
|
1388
|
+
return this.pipeline.ingest({
|
|
1389
|
+
sourceType: "url",
|
|
1390
|
+
sourceUrl: url,
|
|
1391
|
+
title,
|
|
1392
|
+
content,
|
|
1393
|
+
isMarkdown
|
|
1394
|
+
});
|
|
1395
|
+
}
|
|
1396
|
+
delay(ms) {
|
|
1397
|
+
return new Promise((resolve) => setTimeout(resolve, ms));
|
|
1398
|
+
}
|
|
1399
|
+
};
|
|
1400
|
+
|
|
1401
|
+
//#endregion
|
|
1402
|
+
//#region src/ingestors/WatiFaqSync.ts
|
|
1403
|
+
/**
 * Turns question/answer pairs (given directly, or extracted from conversations
 * by a caller-supplied function) into FAQ documents via `pipeline.ingestFaq`.
 */
var WatiFaqSync = class {
  pipeline;
  llmExtract;
  /**
   * @param pipeline - Object with an async `ingestFaq(question, answer, metadata)` method
   * @param llmExtract - Optional async function mapping one conversation to an
   *   array of `{ question, answer }` pairs; required by `syncFromConversations`
   */
  constructor(pipeline, llmExtract) {
    this.pipeline = pipeline;
    this.llmExtract = llmExtract;
  }
  /**
   * Extract Q&A pairs from conversations and ingest them as FAQs.
   *
   * @param conversations - Conversations handed one by one to `llmExtract`
   * @param options - Optional `{ minAnswerLength, maxPairs }`. `minAnswerLength`
   *   defaults to 20 (an explicit 0 disables the length check); `maxPairs`
   *   defaults to 100 and stops extraction once that many pairs are collected.
   * @returns The ingested FAQ documents
   * @throws Error when no `llmExtract` function was supplied
   */
  async syncFromConversations(conversations, options) {
    const minLen = options?.minAnswerLength ?? 20;
    const maxPairs = options?.maxPairs || 100;
    if (!this.llmExtract) throw new Error("LLM extract function required for FAQ extraction");
    const allPairs = [];
    for (const convo of conversations) {
      const pairs = await this.llmExtract(convo);
      // Extractor output is untrusted; ignore anything that is not an array.
      if (Array.isArray(pairs)) {
        for (const pair of pairs) allPairs.push(pair);
      }
      if (allPairs.length >= maxPairs) break;
    }
    const usable = allPairs.filter((pair) => this.isUsablePair(pair, minLen)).slice(0, maxPairs);
    return this.ingestPairs(usable);
  }
  /**
   * Ingest ready-made Q&A pairs as FAQs.
   *
   * @param pairs - Array of `{ question, answer }`
   * @param options - Optional `{ minAnswerLength }` (default 20; 0 disables the check)
   * @returns The ingested FAQ documents
   */
  async syncFromPairs(pairs, options) {
    const minLen = options?.minAnswerLength ?? 20;
    const usable = pairs.filter((pair) => this.isUsablePair(pair, minLen));
    return this.ingestPairs(usable);
  }
  /**
   * True when the pair has a string answer of at least `minLen` characters and
   * a non-blank string question. Malformed entries (null, missing fields) are
   * rejected instead of throwing.
   */
  isUsablePair(pair, minLen) {
    return typeof pair?.answer === "string" && typeof pair?.question === "string" && pair.answer.length >= minLen && pair.question.trim().length > 0;
  }
  /** Ingest pairs sequentially, tagging each with `{ source: "wati-sync" }`. */
  async ingestPairs(pairs) {
    const docs = [];
    for (const pair of pairs) {
      const doc = await this.pipeline.ingestFaq(pair.question, pair.answer, { source: "wati-sync" });
      docs.push(doc);
    }
    return docs;
  }
};
|
|
1439
|
+
|
|
1440
|
+
//#endregion
|
|
1441
|
+
export { EmbeddingService, FileIngestor, IngestionPipeline, QueryRewriter, RetrievalPipeline, SQLiteKnowledgeStore, SiteCrawler, TextChunker, UrlIngestor, WatiFaqSync, extractFromHtml, extractLinks, fetchContent, fetchHtml, normalizeQuery, reciprocalRankFusion, resetCrawl4aiHealthCache, splitCompoundQuery, weightedScoreFusion };
|
|
1442
|
+
//# sourceMappingURL=index.js.map
|