@operor/knowledge 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +457 -0
- package/dist/index.d.ts +437 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +1442 -0
- package/dist/index.js.map +1 -0
- package/package.json +42 -0
- package/src/EmbeddingService.ts +92 -0
- package/src/IngestionPipeline.ts +357 -0
- package/src/QueryNormalizer.ts +59 -0
- package/src/QueryRewriter.ts +73 -0
- package/src/RankFusion.ts +72 -0
- package/src/RetrievalPipeline.ts +388 -0
- package/src/SQLiteKnowledgeStore.ts +379 -0
- package/src/TextChunker.ts +34 -0
- package/src/__tests__/cli-integration.test.ts +134 -0
- package/src/__tests__/content-fetcher.test.ts +156 -0
- package/src/__tests__/knowledge.test.ts +493 -0
- package/src/__tests__/retrieval-layers.test.ts +672 -0
- package/src/index.ts +41 -0
- package/src/ingestors/FileIngestor.ts +85 -0
- package/src/ingestors/SiteCrawler.ts +153 -0
- package/src/ingestors/UrlIngestor.ts +106 -0
- package/src/ingestors/WatiFaqSync.ts +75 -0
- package/src/ingestors/content-fetcher.ts +142 -0
- package/src/types.ts +62 -0
- package/tsconfig.json +9 -0
- package/tsdown.config.ts +10 -0
|
@@ -0,0 +1,388 @@
|
|
|
1
|
+
import type { KBSearchResult, KBSearchOptions, KnowledgeStore } from './types.js';
|
|
2
|
+
import type { EmbeddingService } from './EmbeddingService.js';
|
|
3
|
+
import type { QueryRewriter } from './QueryRewriter.js';
|
|
4
|
+
import { normalizeQuery } from './QueryNormalizer.js';
|
|
5
|
+
import { reciprocalRankFusion, weightedScoreFusion } from './RankFusion.js';
|
|
6
|
+
|
|
7
|
+
/**
 * Outcome of a single `RetrievalPipeline.retrieve()` call.
 */
export interface RetrievalResult {
  /** Search hits backing this result, in the order the pipeline returned them. */
  results: KBSearchResult[];

  /** Markdown rendering of `results` for prompt injection; empty string when there are no results. */
  context: string;

  /** True when a FAQ entry passed the pipeline's confidence checks. */
  isFaqMatch: boolean;

  /** Rewritten query text; set only when the returned results were produced from the rewritten query. */
  rewritten?: string;

  /** Raw FAQ answer extracted from metadata (only set when isFaqMatch is true). */
  faqAnswer?: string;

  /** Raw FAQ question extracted from metadata (only set when isFaqMatch is true). */
  faqQuestion?: string;

  /** Multiple FAQ matches from compound query splitting. */
  faqMatches?: { faqQuestion: string; faqAnswer: string; score: number }[];
}
|
|
19
|
+
|
|
20
|
+
/**
 * Tuning knobs for `RetrievalPipeline`. Every field is optional; the
 * constructor falls back to the default noted on each field.
 */
export interface RetrievalPipelineOptions {
  /** FAQ score at or above which the top FAQ hit is accepted outright. Default: 0.85. */
  faqThreshold?: number;

  /** Minimum FAQ score for the "clear standout" path (also requires `faqScoreGap`). Default: 0.70. */
  faqLowThreshold?: number;

  /** Lead the top FAQ hit must have over the runner-up on the standout path. Default: 0.15. */
  faqScoreGap?: number;

  /** When false, only vector search is used even if the store supports keyword search. Default: true. */
  useHybridSearch?: boolean;

  /** Optional rewriter; consulted only when the top search score falls in the rewrite band. */
  queryRewriter?: QueryRewriter;

  /** Exclusive upper bound of the score band that triggers a rewrite. Default: 0.70. */
  rewriteHighThreshold?: number;

  /** Inclusive lower bound of the score band that triggers a rewrite. Default: 0.50. */
  rewriteLowThreshold?: number;

  /** How vector and keyword hits are merged: rank-based ('rrf') or score-based ('weighted'). Default: 'rrf'. */
  fusionStrategy?: 'rrf' | 'weighted';
}
|
|
30
|
+
|
|
31
|
+
/**
 * Cheap, LLM-free heuristic that breaks a compound question into sub-queries.
 *
 * Two delimiters are tried in order, and the first one that yields at least
 * two usable segments wins:
 *   1. a question mark (plus any whitespace after it);
 *   2. the word "and" (case-insensitive) with whitespace on both sides, so
 *      words that merely contain "and" ("android", "band") are left intact.
 *
 * Segments are trimmed, and segments of 3 characters or fewer are discarded
 * before counting. At most 4 sub-queries are returned.
 *
 * @param query Raw user query.
 * @returns The trimmed sub-queries, or `[query]` (unmodified) when no split applies.
 */
export function splitCompoundQuery(query: string): string[] {
  const maxSubQueries = 4;
  const minSegmentLength = 3;
  const delimiters = [/\?\s*/, /\s+and\s+/i];

  for (const delimiter of delimiters) {
    const segments: string[] = [];
    for (const raw of query.split(delimiter)) {
      const segment = raw.trim();
      if (segment.length > minSegmentLength) {
        segments.push(segment);
      }
    }
    if (segments.length > 1) {
      return segments.slice(0, maxSubQueries);
    }
  }

  return [query];
}
|
|
53
|
+
|
|
54
|
+
/**
 * Layered retrieval over a knowledge store.
 *
 * For each (sub-)query the pipeline runs, in order:
 *   1. query normalization via `normalizeQuery`;
 *   2. a FAQ fast-path (vector search restricted to `sourceTypes: ['faq']`)
 *      with threshold / score-gap confidence checks;
 *   3. hybrid search (vector + keyword) fused with RRF or weighted scores,
 *      falling back to vector-only when keyword search is unavailable;
 *   4. an optional query rewrite when the top score lands in the
 *      `[rewriteLowThreshold, rewriteHighThreshold)` band.
 *
 * Compound queries are split with `splitCompoundQuery` and each part is
 * retrieved independently.
 */
export class RetrievalPipeline {
  /** Result limit used when the caller supplies none (or a falsy one). */
  private static readonly DEFAULT_LIMIT = 5;
  /** Number of FAQ candidates fetched so the top/runner-up gap can be measured. */
  private static readonly FAQ_CANDIDATES = 2;
  /** Top two FAQ hits closer than this are treated as tied. */
  private static readonly FAQ_TIE_WINDOW = 0.02;
  /** Documents updated within this window receive the freshness boost. */
  private static readonly FRESHNESS_WINDOW_MS = 30 * 24 * 60 * 60 * 1000;
  private static readonly FRESHNESS_BOOST = 0.05;
  private static readonly HIGH_PRIORITY_BOOST = 0.03;
  private static readonly LOW_PRIORITY_PENALTY = 0.02;

  private readonly store: KnowledgeStore;
  private readonly embedder: EmbeddingService;
  private readonly faqThreshold: number;
  private readonly faqLowThreshold: number;
  private readonly faqScoreGap: number;
  private readonly useHybridSearch: boolean;
  private readonly queryRewriter?: QueryRewriter;
  private readonly rewriteHighThreshold: number;
  private readonly rewriteLowThreshold: number;
  private readonly fusionStrategy: 'rrf' | 'weighted';

  constructor(store: KnowledgeStore, embedder: EmbeddingService, faqThreshold?: number);
  constructor(store: KnowledgeStore, embedder: EmbeddingService, options?: RetrievalPipelineOptions);
  /**
   * @param store Knowledge store to search.
   * @param embedder Service used to embed query text.
   * @param thresholdOrOptions Either a bare FAQ threshold (legacy form) or a
   *   full options object. Omitted fields take the defaults listed below.
   */
  constructor(
    store: KnowledgeStore,
    embedder: EmbeddingService,
    thresholdOrOptions?: number | RetrievalPipelineOptions,
  ) {
    this.store = store;
    this.embedder = embedder;

    // Normalise the legacy numeric form into an options object so there is a
    // single table of defaults instead of two that can drift apart.
    const opts: RetrievalPipelineOptions =
      typeof thresholdOrOptions === 'number'
        ? { faqThreshold: thresholdOrOptions }
        : (thresholdOrOptions ?? {});

    this.faqThreshold = opts.faqThreshold ?? 0.85;
    this.faqLowThreshold = opts.faqLowThreshold ?? 0.70;
    this.faqScoreGap = opts.faqScoreGap ?? 0.15;
    this.useHybridSearch = opts.useHybridSearch ?? true;
    this.queryRewriter = opts.queryRewriter;
    this.rewriteHighThreshold = opts.rewriteHighThreshold ?? 0.70;
    this.rewriteLowThreshold = opts.rewriteLowThreshold ?? 0.50;
    this.fusionStrategy = opts.fusionStrategy ?? 'rrf';
  }

  /**
   * Retrieve knowledge for a user query.
   *
   * A query that does not split is handled by the single-query path. A
   * compound query has each part retrieved in parallel; if at least two
   * distinct FAQ documents match, a combined FAQ result is returned,
   * otherwise the original, unsplit query is retrieved as a whole.
   *
   * @param query Raw user query.
   * @param options Search options forwarded to the store.
   * @returns The retrieval result, including formatted context.
   */
  async retrieve(query: string, options?: KBSearchOptions): Promise<RetrievalResult> {
    const subQueries = splitCompoundQuery(query);
    if (subQueries.length <= 1) {
      return this.retrieveSingle(query, options);
    }

    const subResults = await Promise.all(
      subQueries.map(subQuery => this.retrieveSingle(subQuery, options)),
    );

    // Collect FAQ matches, keeping one entry per FAQ document.
    const seenDocIds = new Set<string>();
    const faqMatches: Array<{ faqQuestion: string; faqAnswer: string; score: number }> = [];
    for (const sub of subResults) {
      if (!sub.isFaqMatch || !sub.faqAnswer || !sub.faqQuestion) continue;

      const docId = sub.results[0]?.document?.id;
      if (!docId || seenDocIds.has(docId)) continue;

      seenDocIds.add(docId);
      faqMatches.push({
        faqQuestion: sub.faqQuestion,
        faqAnswer: sub.faqAnswer,
        score: sub.results[0]?.score ?? 0,
      });
    }

    // 0-1 FAQ matches: the split did not help, so answer the original query.
    if (faqMatches.length < 2) {
      return this.retrieveSingle(query, options);
    }

    // Merge every sub-result's hits, keeping the first occurrence of each chunk.
    const seenChunkIds = new Set<string>();
    const mergedResults: KBSearchResult[] = [];
    for (const sub of subResults) {
      for (const hit of sub.results) {
        if (seenChunkIds.has(hit.chunk.id)) continue;
        seenChunkIds.add(hit.chunk.id);
        mergedResults.push(hit);
      }
    }

    return {
      results: mergedResults,
      context: this.formatContext(mergedResults),
      isFaqMatch: true,
      faqMatches,
      // First match is mirrored into the scalar fields for backward compatibility.
      faqAnswer: faqMatches[0].faqAnswer,
      faqQuestion: faqMatches[0].faqQuestion,
    };
  }

  /**
   * Run the full layered pipeline for one (already split) query.
   */
  private async retrieveSingle(query: string, options?: KBSearchOptions): Promise<RetrievalResult> {
    // Layer 1: normalize, then embed the normalized text.
    const normalized = normalizeQuery(query);
    const embedding = await this.embedder.embed(normalized);

    // Layer 2: FAQ fast-path with score-gap analysis.
    const faqResults = await this.searchFaq(embedding, options);
    const confidentFaq = this.selectConfidentFaq(faqResults);
    if (confidentFaq) {
      return this.toFaqResult(confidentFaq);
    }

    // Layer 3: hybrid search (vector + keyword).
    const results = await this.hybridSearch(normalized, embedding, options);

    // Layer 4: conditional query rewrite. Only attempted when a rewriter is
    // configured and the top score sits in the "uncertain" band.
    const topScore = results.length > 0 ? results[0].score : 0;
    const rewriter = this.queryRewriter;
    const isUncertain =
      topScore >= this.rewriteLowThreshold && topScore < this.rewriteHighThreshold;

    if (rewriter && isUncertain) {
      try {
        const { rewritten } = await rewriter.rewrite(normalized);
        const rewrittenEmbedding = await this.embedder.embed(rewritten);

        // Re-run the FAQ fast-path with the rewritten query.
        const rewrittenFaq = await this.searchFaq(rewrittenEmbedding, options);
        const faqTop = rewrittenFaq[0];
        if (faqTop && faqTop.score >= this.faqLowThreshold) {
          const gap = rewrittenFaq.length > 1 ? faqTop.score - rewrittenFaq[1].score : 1;
          if (faqTop.score >= this.faqThreshold || gap > this.faqScoreGap) {
            return this.toFaqResult(faqTop, rewritten);
          }
        }

        // Re-run hybrid search; keep the rewrite only if it scores strictly better.
        const rewrittenResults = await this.hybridSearch(rewritten, rewrittenEmbedding, options);
        if (rewrittenResults.length > 0 && rewrittenResults[0].score > topScore) {
          return {
            results: rewrittenResults,
            context: this.formatContext(rewrittenResults),
            isFaqMatch: false,
            rewritten,
          };
        }
      } catch {
        // Deliberate best-effort: a failed rewrite must not fail retrieval,
        // so fall through and return the original results.
      }
    }

    return {
      results,
      context: this.formatContext(results),
      isFaqMatch: false,
    };
  }

  /** Vector search restricted to FAQ sources, fetching the top two candidates. */
  private async searchFaq(embedding: number[], options?: KBSearchOptions): Promise<KBSearchResult[]> {
    return this.store.searchByEmbedding(embedding, {
      ...options,
      sourceTypes: ['faq'],
      limit: RetrievalPipeline.FAQ_CANDIDATES,
    });
  }

  /**
   * Decide whether the FAQ candidates contain a confident match and, if so,
   * which entry to return.
   *
   * Confidence rules:
   *   - score >= faqThreshold, or
   *   - score >= faqLowThreshold and the lead over the runner-up exceeds
   *     faqScoreGap (a lone candidate counts as a lead of 1).
   *
   * Freshness tiebreak: when the top two are within FAQ_TIE_WINDOW and the
   * runner-up was updated more recently, the runner-up is returned instead.
   * The tiebreak only picks WHICH entry is returned; confidence is measured
   * on the best score of the pair and the gap on the raw ranking. Checking
   * the thresholds against the swapped-in (lower) score would let a newer
   * near-duplicate FAQ turn a confident match into a miss.
   *
   * @returns The FAQ entry to answer with, or undefined when not confident.
   */
  private selectConfidentFaq(faqResults: KBSearchResult[]): KBSearchResult | undefined {
    if (faqResults.length === 0) return undefined;

    const best = faqResults[0];
    const runnerUp = faqResults.length > 1 ? faqResults[1] : undefined;
    const gap = runnerUp ? best.score - runnerUp.score : 1;

    let chosen = best;
    if (
      runnerUp &&
      gap <= RetrievalPipeline.FAQ_TIE_WINDOW &&
      (runnerUp.document.updatedAt ?? 0) > (best.document.updatedAt ?? 0)
    ) {
      chosen = runnerUp;
    }

    const confidence = Math.max(best.score, chosen.score);
    if (confidence >= this.faqThreshold) return chosen;
    if (confidence >= this.faqLowThreshold && gap > this.faqScoreGap) return chosen;
    return undefined;
  }

  /**
   * Build the FAQ-match result for a single FAQ hit. Question and answer are
   * read from chunk metadata first, then from document metadata.
   *
   * @param top The FAQ hit to answer with.
   * @param rewritten Rewritten query text, when the hit came from a rewrite.
   */
  private toFaqResult(top: KBSearchResult, rewritten?: string): RetrievalResult {
    const result: RetrievalResult = {
      results: [top],
      context: this.formatContext([top]),
      isFaqMatch: true,
      faqAnswer: top.chunk.metadata?.answer || top.document.metadata?.answer,
      faqQuestion: top.chunk.metadata?.question || top.document.metadata?.question,
    };
    if (rewritten !== undefined) {
      result.rewritten = rewritten;
    }
    return result;
  }

  /**
   * Vector + keyword search with rank/score fusion, followed by boosts.
   *
   * Falls back to vector-only search when hybrid search is disabled, the
   * store has no `searchByKeyword`, or the keyword search returns nothing.
   *
   * @param query Text passed to the keyword search.
   * @param embedding Embedding passed to the vector search.
   * @param options Search options; `limit` defaults to 5 when absent or falsy.
   * @returns At most `limit` results, boosted and sorted by score descending.
   */
  private async hybridSearch(
    query: string,
    embedding: number[],
    options?: KBSearchOptions,
  ): Promise<KBSearchResult[]> {
    // `||` (not `??`) is intentional here: a limit of 0 also falls back to the default.
    const limit = options?.limit || RetrievalPipeline.DEFAULT_LIMIT;

    if (!this.useHybridSearch || !this.store.searchByKeyword) {
      const vectorOnly = await this.store.searchByEmbedding(embedding, {
        ...options,
        limit,
      });
      return this.applyBoosts(vectorOnly.slice(0, limit));
    }

    // Over-fetch from both searches so fusion has candidates to reorder.
    const searchOpts = { ...options, limit: limit * 2 };
    const [vecResults, ftsResults] = await Promise.all([
      this.store.searchByEmbedding(embedding, searchOpts),
      this.store.searchByKeyword(query, searchOpts),
    ]);

    if (ftsResults.length === 0) {
      return this.applyBoosts(vecResults.slice(0, limit));
    }

    // chunk id -> result; the vector hit wins when both searches return a
    // chunk, so its similarity score is the one carried forward.
    const resultByChunkId = new Map<string, KBSearchResult>();
    for (const hit of vecResults) resultByChunkId.set(hit.chunk.id, hit);
    for (const hit of ftsResults) {
      if (!resultByChunkId.has(hit.chunk.id)) resultByChunkId.set(hit.chunk.id, hit);
    }

    const fusedResults: KBSearchResult[] = [];

    if (this.fusionStrategy === 'weighted') {
      // Weighted score fusion: the fused score replaces the result score.
      const toItems = (hits: KBSearchResult[]) =>
        hits.map(hit => ({ id: hit.chunk.id, score: hit.score }));
      const fused = weightedScoreFusion(toItems(vecResults), toItems(ftsResults));

      for (const [chunkId, fusedScore] of fused) {
        if (fusedResults.length >= limit) break;
        const hit = resultByChunkId.get(chunkId);
        if (hit) {
          fusedResults.push({ ...hit, score: fusedScore });
        }
      }
    } else {
      // RRF (default): only the fused ORDER is used; each result keeps its
      // original score so downstream thresholds still see similarity scores.
      const toRanks = (hits: KBSearchResult[]): Map<string, number> => {
        const ranks = new Map<string, number>();
        hits.forEach((hit, index) => ranks.set(hit.chunk.id, index));
        return ranks;
      };
      const fused = reciprocalRankFusion([toRanks(vecResults), toRanks(ftsResults)]);

      for (const [chunkId] of fused) {
        if (fusedResults.length >= limit) break;
        const hit = resultByChunkId.get(chunkId);
        if (hit) {
          fusedResults.push(hit);
        }
      }
    }

    return this.applyBoosts(fusedResults);
  }

  /**
   * Apply freshness and priority boosts to search results, then re-sort.
   *
   * - +0.05 when the document was updated within the last 30 days.
   * - +0.03 for priority 1, -0.02 for priority 3; a missing priority is
   *   treated as 2 (no adjustment).
   *
   * The input array and its elements are not mutated (an empty input is
   * returned as-is).
   */
  private applyBoosts(results: KBSearchResult[]): KBSearchResult[] {
    if (results.length === 0) return results;

    const freshnessCutoff = Date.now() - RetrievalPipeline.FRESHNESS_WINDOW_MS;

    return results
      .map(hit => {
        let score = hit.score;

        const updatedAt = hit.document.updatedAt;
        if (updatedAt && updatedAt > freshnessCutoff) {
          score += RetrievalPipeline.FRESHNESS_BOOST;
        }

        // Narrow structural assertion instead of `as any`: only `priority` is read.
        const priority = (hit.document as { priority?: number | null }).priority ?? 2;
        if (priority === 1) {
          score += RetrievalPipeline.HIGH_PRIORITY_BOOST;
        } else if (priority === 3) {
          score -= RetrievalPipeline.LOW_PRIORITY_PENALTY;
        }

        return { ...hit, score };
      })
      .sort((a, b) => b.score - a.score);
  }

  /**
   * Render results as a markdown context block: one "### Source N" section
   * per result, labelled with the document title, source URL, file name or
   * 'Unknown' (first non-empty wins) and the score to two decimals.
   *
   * @returns The markdown block, or an empty string when there are no results.
   */
  private formatContext(results: KBSearchResult[]): string {
    if (results.length === 0) return '';

    const sections = results.map((hit, index) => {
      const source =
        hit.document.title || hit.document.sourceUrl || hit.document.fileName || 'Unknown';
      return `### Source ${index + 1}: ${source} (score: ${hit.score.toFixed(2)})\n${hit.chunk.content}`;
    });

    return `## Knowledge Base Context\n\n${sections.join('\n\n')}`;
  }
}
|