@operor/knowledge 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,357 @@
1
+ import { randomUUID, createHash } from 'node:crypto';
2
+ import type { KBDocument, KBChunk, KnowledgeStore } from './types.js';
3
+ import type { EmbeddingService } from './EmbeddingService.js';
4
+ import type { TextChunker } from './TextChunker.js';
5
+ import type { SQLiteKnowledgeStore } from './SQLiteKnowledgeStore.js';
6
+ import { normalizeQuery } from './QueryNormalizer.js';
7
+
8
+ export interface ContentReformatter {
9
+ complete(messages: { role: 'system' | 'user'; content: string }[]): Promise<{ text: string }>;
10
+ }
11
+
12
+ export interface IngestInput {
13
+ sourceType: KBDocument['sourceType'];
14
+ content: string;
15
+ title?: string;
16
+ sourceUrl?: string;
17
+ fileName?: string;
18
+ metadata?: Record<string, any>;
19
+ isMarkdown?: boolean;
20
+ /** Opt-in to LLM Q&A extraction (expensive). Default: false (chunking path). */
21
+ extractQA?: boolean;
22
+ /** Document priority: 1=official, 2=supplementary, 3=archived. Auto-assigned if omitted. */
23
+ priority?: number;
24
+ }
25
+
26
+ export interface IngestFaqOptions {
27
+ sourceUrl?: string;
28
+ [key: string]: any;
29
+ }
30
+
31
+ export interface IngestFaqResult extends KBDocument {
32
+ existingMatch?: { id: string; question: string; answer: string; score: number };
33
+ }
34
+
35
+ export interface RebuildResult {
36
+ documentsRebuilt: number;
37
+ chunksRebuilt: number;
38
+ oldDimensions: number;
39
+ newDimensions: number;
40
+ }
41
+
42
+ export class IngestionPipeline {
43
+ private store: KnowledgeStore;
44
+ private embedder: EmbeddingService;
45
+ private chunker: TextChunker;
46
+ private llmProvider?: ContentReformatter;
47
+
48
+ constructor(store: KnowledgeStore, embedder: EmbeddingService, chunker: TextChunker, llmProvider?: ContentReformatter) {
49
+ this.store = store;
50
+ this.embedder = embedder;
51
+ this.chunker = chunker;
52
+ this.llmProvider = llmProvider;
53
+ }
54
+
55
+ private cleanContent(text: string): string {
56
+ return text
57
+ .replace(/!\[.*?\]\(.*?\)/g, '') // strip image markdown
58
+ .replace(/[ \t]+/g, ' ') // collapse horizontal whitespace
59
+ .replace(/(\n\s*){3,}/g, '\n\n') // collapse 3+ newlines to 2
60
+ .split('\n').filter((line, i, arr) => i === 0 || line !== arr[i - 1]).join('\n') // dedup consecutive identical lines
61
+ .trim()
62
+ .slice(0, 15000);
63
+ }
64
+
65
+ private async extractQAPairs(content: string, title?: string): Promise<Array<{ question: string; answer: string }>> {
66
+ const cleaned = this.cleanContent(content);
67
+ const titleHint = title ? `\nPage title: "${title}"` : '';
68
+ const response = await this.llmProvider!.complete([
69
+ {
70
+ role: 'system',
71
+ content: `You extract self-contained Q&A pairs from web page content. Each answer must include ALL relevant details (names, numbers, prices, dates) so it can be understood without the original page. Output ONLY a JSON array of {"question":"...","answer":"..."} objects. No markdown fences.`,
72
+ },
73
+ {
74
+ role: 'user',
75
+ content: `Extract Q&A pairs from this content.${titleHint}\n\n${cleaned}`,
76
+ },
77
+ ]);
78
+
79
+ try {
80
+ // Strip markdown fences if present
81
+ const text = response.text.replace(/^```(?:json)?\s*/m, '').replace(/\s*```\s*$/m, '').trim();
82
+ const parsed = JSON.parse(text);
83
+ if (Array.isArray(parsed)) return parsed.filter((p: any) => p.question && p.answer);
84
+ } catch {
85
+ // Try to find JSON array in response
86
+ const match = response.text.match(/\[[\s\S]*\]/);
87
+ if (match) {
88
+ try {
89
+ const parsed = JSON.parse(match[0]);
90
+ if (Array.isArray(parsed)) return parsed.filter((p: any) => p.question && p.answer);
91
+ } catch { /* fall through */ }
92
+ }
93
+ }
94
+ return [];
95
+ }
96
+
97
+ private computeHash(content: string): string {
98
+ return createHash('sha256').update(content).digest('hex');
99
+ }
100
+
101
+ async ingest(input: IngestInput): Promise<KBDocument> {
102
+ if (!input.content || input.content.trim().length === 0) {
103
+ throw new Error(`No content to ingest for "${input.title || input.sourceUrl || 'unknown'}"`);
104
+ }
105
+
106
+ const sqliteStore = this.store as SQLiteKnowledgeStore;
107
+ const cleaned = this.cleanContent(input.content);
108
+ const contentHash = this.computeHash(cleaned);
109
+
110
+ // Auto-assign priority by source type if not specified
111
+ const priority = input.priority ?? (input.sourceType === 'faq' ? 1 : 2);
112
+
113
+ // Dedup: check by source URL first
114
+ if (input.sourceUrl && sqliteStore.findBySourceUrl) {
115
+ const existing = await sqliteStore.findBySourceUrl(input.sourceUrl);
116
+ if (existing) {
117
+ // Update existing document instead of duplicating
118
+ await sqliteStore.updateDocument(existing.id, {
119
+ content: input.content,
120
+ title: input.title,
121
+ contentHash,
122
+ priority,
123
+ metadata: input.metadata,
124
+ });
125
+ // Delete old chunks and re-chunk
126
+ await this.store.deleteDocument(existing.id);
127
+ // deleteDocument removes the document and its chunks entirely, so fall through
128
+ // and recreate it below as a fresh document with the updated content
129
+ }
130
+ }
131
+
132
+ // Dedup: check by content hash
133
+ if (sqliteStore.findByContentHash) {
134
+ const existing = await sqliteStore.findByContentHash(contentHash);
135
+ if (existing) {
136
+ console.log(`[KB] Duplicate content detected (hash match), skipping: "${input.title || input.sourceUrl || 'unknown'}"`);
137
+ return existing;
138
+ }
139
+ }
140
+
141
+ // LLM Q&A extraction: only when explicitly opted in
142
+ if (input.extractQA && this.llmProvider) {
143
+ const pairs = await this.extractQAPairs(input.content, input.title);
144
+ if (pairs.length > 0) {
145
+ for (const pair of pairs) {
146
+ await this.ingestFaq(pair.question, pair.answer, { sourceUrl: input.sourceUrl });
147
+ }
148
+ const now = Date.now();
149
+ const parentDoc: KBDocument = {
150
+ id: randomUUID(),
151
+ sourceType: input.sourceType,
152
+ sourceUrl: input.sourceUrl,
153
+ fileName: input.fileName,
154
+ title: input.title,
155
+ content: `Extracted ${pairs.length} Q&A pairs`,
156
+ metadata: { ...input.metadata, faqCount: pairs.length },
157
+ createdAt: now,
158
+ updatedAt: now,
159
+ priority,
160
+ contentHash,
161
+ };
162
+ await this.store.addDocument(parentDoc);
163
+ return parentDoc;
164
+ }
165
+ }
166
+
167
+ // Default path: chunk content → embed → store
168
+ const now = Date.now();
169
+ const doc: KBDocument = {
170
+ id: randomUUID(),
171
+ sourceType: input.sourceType,
172
+ sourceUrl: input.sourceUrl,
173
+ fileName: input.fileName,
174
+ title: input.title,
175
+ content: input.content,
176
+ metadata: input.metadata,
177
+ createdAt: now,
178
+ updatedAt: now,
179
+ priority,
180
+ contentHash,
181
+ };
182
+
183
+ await this.store.addDocument(doc);
184
+
185
+ // Use MarkdownTextSplitter for URL content or explicit markdown
186
+ const useMarkdown = input.isMarkdown || input.sourceType === 'url';
187
+ const texts = useMarkdown
188
+ ? await this.chunker.chunkMarkdown(input.content)
189
+ : await this.chunker.chunk(input.content);
190
+ const embeddings = await this.embedder.embedMany(texts);
191
+
192
+ const chunks: KBChunk[] = texts.map((text, i) => ({
193
+ id: randomUUID(),
194
+ documentId: doc.id,
195
+ content: text,
196
+ chunkIndex: i,
197
+ embedding: embeddings[i],
198
+ metadata: input.metadata,
199
+ }));
200
+
201
+ await this.store.addChunks(chunks);
202
+
203
+ if (this.store.getChunkCount) {
204
+ const storedCount = this.store.getChunkCount(doc.id);
205
+ if (storedCount === 0) {
206
+ console.warn(`[KB] WARNING: Document "${input.title || doc.id}" was saved but NO vector embeddings were stored.`);
207
+ }
208
+ }
209
+
210
+ return doc;
211
+ }
212
+
213
+ async ingestFaq(question: string, answer: string, metadata?: Record<string, any> & { forceReplace?: boolean }): Promise<IngestFaqResult> {
214
+ const embedding = await this.embedder.embed(normalizeQuery(question));
215
+ const sqliteStore = this.store as SQLiteKnowledgeStore;
216
+
217
+ // FAQ dedup: check for similar existing FAQ
218
+ if (sqliteStore.findSimilarFaq && !metadata?.forceReplace) {
219
+ const match = await sqliteStore.findSimilarFaq(embedding, 0.90);
220
+ if (match) {
221
+ const existingQ = match.chunk.metadata?.question || match.document.title;
222
+ const existingA = match.chunk.metadata?.answer;
223
+ // Return the new doc with existingMatch info so caller can decide
224
+ const now = Date.now();
225
+ const content = `Q: ${question}\nA: ${answer}`;
226
+ const doc: IngestFaqResult = {
227
+ id: randomUUID(),
228
+ sourceType: 'faq',
229
+ sourceUrl: metadata?.sourceUrl,
230
+ title: question,
231
+ content,
232
+ metadata: { ...metadata, question, answer },
233
+ priority: 1,
234
+ createdAt: now,
235
+ updatedAt: now,
236
+ existingMatch: { id: match.document.id, question: existingQ, answer: existingA, score: match.score },
237
+ };
238
+ return doc;
239
+ }
240
+ }
241
+
242
+ // If forceReplace, delete the existing FAQ first (caller provides the ID via metadata)
243
+ if (metadata?.forceReplace && metadata?.replaceId) {
244
+ await this.store.deleteDocument(metadata.replaceId);
245
+ }
246
+
247
+ const now = Date.now();
248
+ const content = `Q: ${question}\nA: ${answer}`;
249
+ const doc: IngestFaqResult = {
250
+ id: randomUUID(),
251
+ sourceType: 'faq',
252
+ sourceUrl: metadata?.sourceUrl,
253
+ title: question,
254
+ content,
255
+ metadata: { ...metadata, question, answer },
256
+ priority: 1,
257
+ createdAt: now,
258
+ updatedAt: now,
259
+ };
260
+
261
+ await this.store.addDocument(doc);
262
+
263
+ const chunk: KBChunk = {
264
+ id: randomUUID(),
265
+ documentId: doc.id,
266
+ content,
267
+ chunkIndex: 0,
268
+ embedding,
269
+ metadata: { question, answer },
270
+ };
271
+
272
+ await this.store.addChunks([chunk]);
273
+ return doc;
274
+ }
275
+
276
+ /**
277
+ * Rebuild all vector embeddings using the current embedding provider.
278
+ * Preserves all document content, chunks, and FTS data — only replaces vectors.
279
+ *
280
+ * Requires the store to be a SQLiteKnowledgeStore (uses rebuild-specific methods).
281
+ */
282
+ async rebuild(onProgress?: (current: number, total: number, docTitle: string) => void): Promise<RebuildResult> {
283
+ const sqliteStore = this.store as SQLiteKnowledgeStore;
284
+ if (!sqliteStore.getAllChunks || !sqliteStore.rebuildVecTable || !sqliteStore.batchInsertEmbeddings) {
285
+ throw new Error('Rebuild requires a SQLiteKnowledgeStore with rebuild methods.');
286
+ }
287
+
288
+ const oldDimensions = sqliteStore.getDimensions();
289
+ const newDimensions = this.embedder.dimensions;
290
+
291
+ // Get all documents (for sourceType lookup) and all chunks
292
+ const documents = await this.store.listDocuments();
293
+ const docMap = new Map(documents.map(d => [d.id, d]));
294
+ const allChunks = sqliteStore.getAllChunks();
295
+
296
+ if (allChunks.length === 0) {
297
+ return { documentsRebuilt: 0, chunksRebuilt: 0, oldDimensions, newDimensions };
298
+ }
299
+
300
+ // Drop and recreate vec_chunks with new dimensions
301
+ sqliteStore.rebuildVecTable(newDimensions);
302
+
303
+ // Group chunks by document for progress reporting
304
+ const chunksByDoc = new Map<string, typeof allChunks>();
305
+ for (const chunk of allChunks) {
306
+ const list = chunksByDoc.get(chunk.documentId) || [];
307
+ list.push(chunk);
308
+ chunksByDoc.set(chunk.documentId, list);
309
+ }
310
+
311
+ let processedDocs = 0;
312
+ const totalDocs = chunksByDoc.size;
313
+ let totalChunksRebuilt = 0;
314
+
315
+ for (const [docId, chunks] of chunksByDoc) {
316
+ const doc = docMap.get(docId);
317
+ const docTitle = doc?.title || docId.slice(0, 8);
318
+
319
+ onProgress?.(processedDocs, totalDocs, docTitle);
320
+
321
+ // Determine what text to embed per chunk
322
+ const textsToEmbed: string[] = [];
323
+ for (const chunk of chunks) {
324
+ if (doc?.sourceType === 'faq') {
325
+ // FAQ: embed the normalized question for consistent matching
326
+ const meta = chunk.metadata ? JSON.parse(chunk.metadata) : null;
327
+ const question = meta?.question || doc.title || chunk.content;
328
+ textsToEmbed.push(normalizeQuery(question));
329
+ } else {
330
+ textsToEmbed.push(chunk.content);
331
+ }
332
+ }
333
+
334
+ // Embed all chunks for this document in one batch
335
+ const embeddings = await this.embedder.embedMany(textsToEmbed);
336
+
337
+ // Insert new embeddings
338
+ const items = chunks.map((chunk, i) => ({
339
+ chunkId: chunk.id,
340
+ embedding: embeddings[i],
341
+ }));
342
+ sqliteStore.batchInsertEmbeddings(items);
343
+
344
+ totalChunksRebuilt += chunks.length;
345
+ processedDocs++;
346
+ }
347
+
348
+ onProgress?.(totalDocs, totalDocs, 'done');
349
+
350
+ return {
351
+ documentsRebuilt: totalDocs,
352
+ chunksRebuilt: totalChunksRebuilt,
353
+ oldDimensions,
354
+ newDimensions,
355
+ };
356
+ }
357
+ }
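
For orientation, here is a minimal usage sketch of the pipeline above. The import path and the store/embedder/chunker wiring are assumptions for illustration; only the IngestionPipeline constructor and the ingest, ingestFaq, and rebuild calls mirror the code in this diff.

import type { KnowledgeStore } from './types.js';
import type { EmbeddingService } from './EmbeddingService.js';
import type { TextChunker } from './TextChunker.js';
// Import path assumed from the package's other relative imports.
import { IngestionPipeline } from './IngestionPipeline.js';

async function ingestExample(store: KnowledgeStore, embedder: EmbeddingService, chunker: TextChunker) {
  const pipeline = new IngestionPipeline(store, embedder, chunker);

  // Default path: content is cleaned, hashed for dedup, chunked, embedded, and stored.
  const doc = await pipeline.ingest({
    sourceType: 'url',
    sourceUrl: 'https://example.com/pricing',
    title: 'Pricing',
    content: '# Pricing\n\nThe basic plan costs $10/month.',
    priority: 1, // official source
  });

  // FAQ path: the question is embedded after normalizeQuery(); a near-duplicate
  // (similarity >= 0.90) is reported via existingMatch instead of being stored twice.
  const faq = await pipeline.ingestFaq('What are your opening hours?', 'We are open 9am to 5pm, Monday to Friday.');
  if (faq.existingMatch) {
    console.log('Similar FAQ already stored:', faq.existingMatch.question);
  }

  // After switching embedding providers, rebuild() replaces every stored vector in place.
  // Requires the SQLite-backed store with the rebuild methods; it throws otherwise.
  const result = await pipeline.rebuild((done, total, title) => console.log(`${done}/${total} ${title}`));
  console.log(`Rebuilt ${result.chunksRebuilt} chunks at ${result.newDimensions} dimensions`);

  return doc;
}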
@@ -0,0 +1,59 @@
1
+ /**
2
+ * Query normalization for improved KB retrieval.
3
+ * Expands chat abbreviations and normalizes whitespace before embedding.
4
+ */
5
+
6
+ const ABBREVIATIONS: [RegExp, string][] = [
7
+ [/\bu\b/gi, 'you'],
8
+ [/\bur\b/gi, 'your'],
9
+ [/\br\b/gi, 'are'],
10
+ [/\bpls\b/gi, 'please'],
11
+ [/\bplz\b/gi, 'please'],
12
+ [/\bthx\b/gi, 'thanks'],
13
+ [/\bthnx\b/gi, 'thanks'],
14
+ [/\bty\b/gi, 'thank you'],
15
+ [/\bwat\b/gi, 'what'],
16
+ [/\bbc\b/gi, 'because'],
17
+ [/\bcuz\b/gi, 'because'],
18
+ [/\bgonna\b/gi, 'going to'],
19
+ [/\bwanna\b/gi, 'want to'],
20
+ [/\bgotta\b/gi, 'got to'],
21
+ [/\blemme\b/gi, 'let me'],
22
+ [/\bgimme\b/gi, 'give me'],
23
+ [/\bdunno\b/gi, 'do not know'],
24
+ [/\bhrs\b/gi, 'hours'],
25
+ [/\bmins\b/gi, 'minutes'],
26
+ [/\bmsg\b/gi, 'message'],
27
+ [/\bmsgs\b/gi, 'messages'],
28
+ [/\binfo\b/gi, 'information'],
29
+ [/\btmr\b/gi, 'tomorrow'],
30
+ [/\btmrw\b/gi, 'tomorrow'],
31
+ [/\bw\/o\b/gi, 'without'],
32
+ [/\bw\/(?=\s|$)/gi, 'with'],
33
+ [/\bidk\b/gi, 'I do not know'],
34
+ [/\bimo\b/gi, 'in my opinion'],
35
+ [/\bbtw\b/gi, 'by the way'],
36
+ [/\basap\b/gi, 'as soon as possible'],
37
+ // Digit substitutions — word-boundary aware
38
+ [/\b4\b/g, 'for'],
39
+ [/\b2\b/g, 'to'],
40
+ ];
41
+
42
+ /**
43
+ * Normalize a user query for better embedding similarity.
44
+ * - Expands chat abbreviations with word-boundary awareness
45
+ * - Lowercases
46
+ * - Collapses whitespace
47
+ */
48
+ export function normalizeQuery(query: string): string {
49
+ let normalized = query.toLowerCase();
50
+
51
+ for (const [pattern, replacement] of ABBREVIATIONS) {
52
+ normalized = normalized.replace(pattern, replacement);
53
+ }
54
+
55
+ // Collapse whitespace
56
+ normalized = normalized.replace(/\s+/g, ' ').trim();
57
+
58
+ return normalized;
59
+ }
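
As a quick illustration, tracing the substitution table above on a typical chat-style query (the expected output follows directly from the rules):

import { normalizeQuery } from './QueryNormalizer.js';

// 'pls' -> 'please', 'ur' -> 'your', 'hrs' -> 'hours', '4' -> 'for', 'tmrw' -> 'tomorrow';
// the input is lowercased and extra whitespace is collapsed before embedding.
console.log(normalizeQuery('Pls tell me ur hrs   4 tmrw'));
// "please tell me your hours for tomorrow"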
@@ -0,0 +1,73 @@
1
+ import type { LanguageModelV1 } from 'ai';
2
+ import { generateText } from 'ai';
3
+
4
+ const SYSTEM_PROMPT =
5
+ 'You are a query normalizer. Rewrite the following informal/casual query into a clear, well-formed question. Only output the rewritten question, nothing else.';
6
+
7
+ const MAX_CACHE_SIZE = 1000;
8
+
9
+ export interface QueryRewriterOptions {
10
+ model: LanguageModelV1;
11
+ maxCacheSize?: number;
12
+ }
13
+
14
+ export interface RewriteResult {
15
+ original: string;
16
+ rewritten: string;
17
+ cached: boolean;
18
+ tokenUsage?: { prompt: number; completion: number };
19
+ }
20
+
21
+ export class QueryRewriter {
22
+ private model: LanguageModelV1;
23
+ private cache: Map<string, string>;
24
+ private maxCacheSize: number;
25
+
26
+ constructor(options: QueryRewriterOptions) {
27
+ this.model = options.model;
28
+ this.cache = new Map();
29
+ this.maxCacheSize = options.maxCacheSize ?? MAX_CACHE_SIZE;
30
+ }
31
+
32
+ async rewrite(query: string): Promise<RewriteResult> {
33
+ const cacheKey = query.toLowerCase().trim();
34
+
35
+ // Check cache first
36
+ const cached = this.cache.get(cacheKey);
37
+ if (cached) {
38
+ return { original: query, rewritten: cached, cached: true };
39
+ }
40
+
41
+ const { text, usage } = await generateText({
42
+ model: this.model,
43
+ system: SYSTEM_PROMPT,
44
+ prompt: query,
45
+ });
46
+
47
+ const rewritten = text.trim();
48
+
49
+ // Bound the cache: evict the oldest inserted entry at capacity (insertion order; hits do not refresh recency)
50
+ if (this.cache.size >= this.maxCacheSize) {
51
+ const oldest = this.cache.keys().next().value!;
52
+ this.cache.delete(oldest);
53
+ }
54
+ this.cache.set(cacheKey, rewritten);
55
+
56
+ return {
57
+ original: query,
58
+ rewritten,
59
+ cached: false,
60
+ tokenUsage: usage
61
+ ? { prompt: usage.promptTokens, completion: usage.completionTokens }
62
+ : undefined,
63
+ };
64
+ }
65
+
66
+ get cacheSize(): number {
67
+ return this.cache.size;
68
+ }
69
+
70
+ clearCache(): void {
71
+ this.cache.clear();
72
+ }
73
+ }
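
A short usage sketch, assuming an AI SDK provider such as @ai-sdk/openai supplies the LanguageModelV1 instance. The provider, model name, and file path are assumptions; the rewriter itself accepts any LanguageModelV1.

import { openai } from '@ai-sdk/openai'; // any LanguageModelV1-compatible provider works
import { QueryRewriter } from './QueryRewriter.js'; // file path assumed

async function rewriteExample() {
  const rewriter = new QueryRewriter({ model: openai('gpt-4o-mini'), maxCacheSize: 500 });

  const first = await rewriter.rewrite('wat time u open tmrw??');
  console.log(first.rewritten, first.cached); // e.g. "What time do you open tomorrow?" false

  // The same query again (lowercased/trimmed cache key) is served from the in-memory cache.
  const second = await rewriter.rewrite('wat time u open tmrw??');
  console.log(second.cached); // true
}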
@@ -0,0 +1,72 @@
1
+ /**
2
+ * Reciprocal Rank Fusion (RRF) for combining multiple ranked result sets.
3
+ * Standard technique for hybrid search (vector + keyword).
4
+ */
5
+
6
+ /**
7
+ * Fuse multiple ranked result sets using Reciprocal Rank Fusion.
8
+ *
9
+ * @param resultSets - Array of Maps where key = item ID, value = rank (0-based)
10
+ * @param k - Smoothing constant (default 60, industry standard)
11
+ * @returns Map of item ID → fused RRF score, sorted descending by score
12
+ */
13
+ export function reciprocalRankFusion(
14
+ resultSets: Map<string, number>[],
15
+ k: number = 60,
16
+ ): Map<string, number> {
17
+ const scores = new Map<string, number>();
18
+
19
+ for (const rankMap of resultSets) {
20
+ for (const [id, rank] of rankMap) {
21
+ const prev = scores.get(id) ?? 0;
22
+ scores.set(id, prev + 1 / (k + rank));
23
+ }
24
+ }
25
+
26
+ // Sort by score descending
27
+ const sorted = new Map(
28
+ [...scores.entries()].sort((a, b) => b[1] - a[1]),
29
+ );
30
+
31
+ return sorted;
32
+ }
33
+
34
+ /**
35
+ * Weighted Score Fusion: combine vector and keyword scores using weighted average.
36
+ * BM25 scores are min-max normalized to 0-1 before combining.
37
+ *
38
+ * @returns Map of item ID → fused score, sorted descending
39
+ */
40
+ export function weightedScoreFusion(
41
+ vectorResults: { id: string; score: number }[],
42
+ keywordResults: { id: string; score: number }[],
43
+ vectorWeight: number = 0.7,
44
+ keywordWeight: number = 0.3,
45
+ ): Map<string, number> {
46
+ // Min-max normalize BM25 scores to 0-1
47
+ const bm25Scores = new Map<string, number>();
48
+ if (keywordResults.length > 0) {
49
+ const scores = keywordResults.map(r => r.score);
50
+ const min = Math.min(...scores);
51
+ const max = Math.max(...scores);
52
+ const range = max - min || 1;
53
+ for (const r of keywordResults) {
54
+ bm25Scores.set(r.id, (r.score - min) / range);
55
+ }
56
+ }
57
+
58
+ const vecScores = new Map<string, number>();
59
+ for (const r of vectorResults) vecScores.set(r.id, r.score);
60
+
61
+ // Combine all IDs
62
+ const allIds = new Set([...vecScores.keys(), ...bm25Scores.keys()]);
63
+ const fused = new Map<string, number>();
64
+
65
+ for (const id of allIds) {
66
+ const vs = vecScores.get(id) ?? 0;
67
+ const ks = bm25Scores.get(id) ?? 0;
68
+ fused.set(id, vectorWeight * vs + keywordWeight * ks);
69
+ }
70
+
71
+ return new Map([...fused.entries()].sort((a, b) => b[1] - a[1]));
72
+ }
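
To show how the two helpers might be wired into a hybrid search path, a small sketch follows. The result shapes, ids, and file name are assumptions; only the two exported functions come from the code above.

import { reciprocalRankFusion, weightedScoreFusion } from './fusion.js'; // file name assumed

// Vector and keyword (BM25) hits for the same query.
const vectorResults = [
  { id: 'chunk-a', score: 0.91 },
  { id: 'chunk-b', score: 0.78 },
  { id: 'chunk-c', score: 0.55 },
];
const keywordResults = [
  { id: 'chunk-b', score: 12.4 },
  { id: 'chunk-d', score: 9.1 },
];

// RRF only needs ranks: build 0-based rank maps from each ordered result list.
const toRanks = (rs: { id: string }[]) =>
  new Map(rs.map((r, i) => [r.id, i] as [string, number]));
const rrf = reciprocalRankFusion([toRanks(vectorResults), toRanks(keywordResults)]);
// chunk-b appears in both lists, so it accumulates 1/(60+1) + 1/(60+0) and ranks first.

// Weighted fusion uses the raw scores instead; BM25 scores are min-max normalized internally.
const weighted = weightedScoreFusion(vectorResults, keywordResults, 0.7, 0.3);
console.log([...rrf.keys()][0], [...weighted.keys()][0]); // 'chunk-b' 'chunk-b'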