ralph-hero-knowledge-index 0.1.24 → 0.1.25
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/.mcp.json +1 -1
- package/dist/hybrid-search.js +41 -15
- package/dist/hybrid-search.js.map +1 -1
- package/dist/vector-search.d.ts +6 -0
- package/dist/vector-search.js +5 -1
- package/dist/vector-search.js.map +1 -1
- package/package.json +1 -1
- package/src/__tests__/hybrid-search.test.ts +227 -0
- package/src/__tests__/vector-search.test.ts +26 -0
- package/src/hybrid-search.ts +59 -17
- package/src/vector-search.ts +11 -1
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "ralph-knowledge",
|
|
3
|
-
"version": "0.1.24",
|
|
3
|
+
"version": "0.1.25",
|
|
4
4
|
"description": "Knowledge graph for ralph-hero: semantic search, relationship traversal, and document indexing across thoughts/ documents. Optional companion to ralph-hero.",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "Chad Dubiel",
|
package/.mcp.json
CHANGED
package/dist/hybrid-search.js
CHANGED
|
@@ -1,3 +1,8 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Maximum snippet length (in characters) when the snippet is sourced from a
|
|
3
|
+
* chunk's content. Keeps the MCP payload compact while still representative.
|
|
4
|
+
*/
|
|
5
|
+
const SNIPPET_MAX_CHARS = 300;
|
|
1
6
|
export class HybridSearch {
|
|
2
7
|
db;
|
|
3
8
|
fts;
|
|
@@ -53,9 +58,22 @@ export class HybridSearch {
|
|
|
53
58
|
});
|
|
54
59
|
const queryEmbedding = await this.embedFn(query);
|
|
55
60
|
const vecResults = this.vec.search(queryEmbedding, limit * 2);
|
|
56
|
-
//
|
|
57
|
-
// like `{doc}#c{n}`, we collapse to the parent doc for scoring but
|
|
58
|
-
// remember the best-scoring chunk id per doc for later meta enrichment.
|
|
61
|
+
// Bucket vector results by doc_id, keeping the best-ranked chunk per doc.
|
|
62
|
+
// vecResults is already sorted by distance ascending, so the first
|
|
63
|
+
// occurrence of a given doc_id has the smallest rank (best match).
|
|
64
|
+
const buckets = new Map();
|
|
65
|
+
for (let i = 0; i < vecResults.length; i++) {
|
|
66
|
+
const hit = vecResults[i];
|
|
67
|
+
const docId = this.docIdFromVecId(hit.id);
|
|
68
|
+
if (buckets.has(docId))
|
|
69
|
+
continue; // Already have best rank for this doc
|
|
70
|
+
buckets.set(docId, {
|
|
71
|
+
bestRank: i,
|
|
72
|
+
bestChunkId: hit.id,
|
|
73
|
+
bestContent: hit.content ?? "",
|
|
74
|
+
});
|
|
75
|
+
}
|
|
76
|
+
// Build RRF score map (keyed by doc_id for both FTS and vector buckets)
|
|
59
77
|
const scores = new Map();
|
|
60
78
|
const bestChunkByDoc = new Map();
|
|
61
79
|
for (let i = 0; i < ftsResults.length; i++) {
|
|
@@ -63,16 +81,13 @@ export class HybridSearch {
|
|
|
63
81
|
const rrfScore = 1 / (HybridSearch.RRF_K + i + 1);
|
|
64
82
|
scores.set(id, (scores.get(id) ?? 0) + rrfScore);
|
|
65
83
|
}
|
|
66
|
-
for (
|
|
67
|
-
const
|
|
68
|
-
const docId = this.docIdFromVecId(vecId);
|
|
69
|
-
const rrfScore = 1 / (HybridSearch.RRF_K + i + 1);
|
|
84
|
+
for (const [docId, bucket] of buckets) {
|
|
85
|
+
const rrfScore = 1 / (HybridSearch.RRF_K + bucket.bestRank + 1);
|
|
70
86
|
scores.set(docId, (scores.get(docId) ?? 0) + rrfScore);
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
}
|
|
87
|
+
// Track best chunk for later enrichment
|
|
88
|
+
const existing = bestChunkByDoc.get(docId);
|
|
89
|
+
if (!existing || bucket.bestRank < existing.rank) {
|
|
90
|
+
bestChunkByDoc.set(docId, { chunkId: bucket.bestChunkId, rank: bucket.bestRank });
|
|
76
91
|
}
|
|
77
92
|
}
|
|
78
93
|
// Build a lookup of FTS results by id for quick access
|
|
@@ -80,12 +95,20 @@ export class HybridSearch {
|
|
|
80
95
|
for (const r of ftsResults) {
|
|
81
96
|
ftsById.set(r.id, r);
|
|
82
97
|
}
|
|
83
|
-
// Assemble combined results
|
|
98
|
+
// Assemble combined results. For vector-hit docs, replace the snippet
|
|
99
|
+
// with the winning chunk's content (truncated). FTS-only hits keep the
|
|
100
|
+
// FTS snippet.
|
|
84
101
|
const combined = [];
|
|
85
102
|
for (const [id, rrfScore] of scores) {
|
|
86
103
|
const ftsHit = ftsById.get(id);
|
|
104
|
+
const bucket = buckets.get(id);
|
|
87
105
|
if (ftsHit) {
|
|
88
|
-
|
|
106
|
+
// FTS hit (possibly also a vector hit): prefer the chunk snippet when
|
|
107
|
+
// the vector side contributed real chunk content.
|
|
108
|
+
const snippet = bucket && bucket.bestContent
|
|
109
|
+
? bucket.bestContent.slice(0, SNIPPET_MAX_CHARS)
|
|
110
|
+
: ftsHit.snippet;
|
|
111
|
+
combined.push({ ...ftsHit, score: rrfScore, snippet });
|
|
89
112
|
}
|
|
90
113
|
else {
|
|
91
114
|
// Vector-only result: fetch document metadata from db
|
|
@@ -93,6 +116,9 @@ export class HybridSearch {
|
|
|
93
116
|
// Skip stub documents — they have no real content or path
|
|
94
117
|
if (!doc || doc.isStub)
|
|
95
118
|
continue;
|
|
119
|
+
const snippet = bucket
|
|
120
|
+
? bucket.bestContent.slice(0, SNIPPET_MAX_CHARS)
|
|
121
|
+
: "";
|
|
96
122
|
combined.push({
|
|
97
123
|
id: doc.id,
|
|
98
124
|
path: doc.path,
|
|
@@ -101,7 +127,7 @@ export class HybridSearch {
|
|
|
101
127
|
status: doc.status,
|
|
102
128
|
date: doc.date,
|
|
103
129
|
score: rrfScore,
|
|
104
|
-
snippet
|
|
130
|
+
snippet,
|
|
105
131
|
});
|
|
106
132
|
}
|
|
107
133
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"hybrid-search.js","sourceRoot":"","sources":["../src/hybrid-search.ts"],"names":[],"mappings":"AAgBA,MAAM,OAAO,YAAY;IAIJ;IACA;IACA;IACA;IANX,MAAM,CAAU,KAAK,GAAG,EAAE,CAAC;IAEnC,YACmB,EAAe,EACf,GAAc,EACd,GAAiB,EACjB,OAAgB;QAHhB,OAAE,GAAF,EAAE,CAAa;QACf,QAAG,GAAH,GAAG,CAAW;QACd,QAAG,GAAH,GAAG,CAAc;QACjB,YAAO,GAAP,OAAO,CAAS;IAChC,CAAC;IAEJ;;;OAGG;IACK,iBAAiB;QACvB,MAAM,GAAG,GAAG,IAAI,CAAC,EAAE,CAAC,EAAE;aACnB,OAAO,CACN,qEAAqE,CACtE;aACA,GAAG,EAAE,CAAC;QACT,OAAO,GAAG,KAAK,SAAS,CAAC;IAC3B,CAAC;IAED;;;;OAIG;IACK,cAAc,CAAC,KAAa;QAClC,MAAM,MAAM,GAAG,KAAK,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;QACvC,IAAI,MAAM,KAAK,CAAC,CAAC;YAAE,OAAO,KAAK,CAAC;QAChC,MAAM,MAAM,GAAG,KAAK,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QACvC,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC;YAAE,OAAO,KAAK,CAAC;QAC/D,OAAO,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,MAAM,CAAC,CAAC;IAChC,CAAC;IAEO,UAAU,CAAC,OAAe;QAChC,IAAI,CAAC,IAAI,CAAC,iBAAiB,EAAE;YAAE,OAAO,SAAS,CAAC;QAChD,OAAO,IAAI,CAAC,EAAE,CAAC,EAAE;aACd,OAAO,CACN;kCAC0B,CAC3B;aACA,GAAG,CAAC,OAAO,CAAyB,CAAC;IAC1C,CAAC;IAED,KAAK,CAAC,MAAM,CACV,KAAa,EACb,UAAyB,EAAE;QAE3B,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,iBAAiB,GAAG,KAAK,EAAE,KAAK,GAAG,EAAE,EAAE,UAAU,EAAE,GAAG,OAAO,CAAC;QAElF,0EAA0E;QAC1E,gCAAgC;QAChC,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,KAAK,EAAE;YACxC,iBAAiB,EAAE,IAAI;YACvB,KAAK,EAAE,KAAK,GAAG,CAAC;YAChB,UAAU;SACX,CAAC,CAAC;QAEH,MAAM,cAAc,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;QACjD,MAAM,UAAU,
|
|
1
|
+
{"version":3,"file":"hybrid-search.js","sourceRoot":"","sources":["../src/hybrid-search.ts"],"names":[],"mappings":"AAgBA;;;GAGG;AACH,MAAM,iBAAiB,GAAG,GAAG,CAAC;AAa9B,MAAM,OAAO,YAAY;IAIJ;IACA;IACA;IACA;IANX,MAAM,CAAU,KAAK,GAAG,EAAE,CAAC;IAEnC,YACmB,EAAe,EACf,GAAc,EACd,GAAiB,EACjB,OAAgB;QAHhB,OAAE,GAAF,EAAE,CAAa;QACf,QAAG,GAAH,GAAG,CAAW;QACd,QAAG,GAAH,GAAG,CAAc;QACjB,YAAO,GAAP,OAAO,CAAS;IAChC,CAAC;IAEJ;;;OAGG;IACK,iBAAiB;QACvB,MAAM,GAAG,GAAG,IAAI,CAAC,EAAE,CAAC,EAAE;aACnB,OAAO,CACN,qEAAqE,CACtE;aACA,GAAG,EAAE,CAAC;QACT,OAAO,GAAG,KAAK,SAAS,CAAC;IAC3B,CAAC;IAED;;;;OAIG;IACK,cAAc,CAAC,KAAa;QAClC,MAAM,MAAM,GAAG,KAAK,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;QACvC,IAAI,MAAM,KAAK,CAAC,CAAC;YAAE,OAAO,KAAK,CAAC;QAChC,MAAM,MAAM,GAAG,KAAK,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QACvC,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC;YAAE,OAAO,KAAK,CAAC;QAC/D,OAAO,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,MAAM,CAAC,CAAC;IAChC,CAAC;IAEO,UAAU,CAAC,OAAe;QAChC,IAAI,CAAC,IAAI,CAAC,iBAAiB,EAAE;YAAE,OAAO,SAAS,CAAC;QAChD,OAAO,IAAI,CAAC,EAAE,CAAC,EAAE;aACd,OAAO,CACN;kCAC0B,CAC3B;aACA,GAAG,CAAC,OAAO,CAAyB,CAAC;IAC1C,CAAC;IAED,KAAK,CAAC,MAAM,CACV,KAAa,EACb,UAAyB,EAAE;QAE3B,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,iBAAiB,GAAG,KAAK,EAAE,KAAK,GAAG,EAAE,EAAE,UAAU,EAAE,GAAG,OAAO,CAAC;QAElF,0EAA0E;QAC1E,gCAAgC;QAChC,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,KAAK,EAAE;YACxC,iBAAiB,EAAE,IAAI;YACvB,KAAK,EAAE,KAAK,GAAG,CAAC;YAChB,UAAU;SACX,CAAC,CAAC;QAEH,MAAM,cAAc,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;QACjD,MAAM,UAAU,GAAmB,IAAI,CAAC,GAAG,CAAC,MAAM,CAChD,cAAc,EACd,KAAK,GAAG,CAAC,CACV,CAAC;QAEF,0EAA0E;QAC1E,mEAAmE;QACnE,mEAAmE;QACnE,MAAM,OAAO,GAAG,IAAI,GAAG,EAAqB,CAAC;QAC7C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC3C,MAAM,GAAG,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;YAC1B,MAAM,KAAK,GAAG,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YAC1C,IAAI,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC;gBAAE,SAAS,CAAC,sCAAsC;YACxE,OAAO,CAAC,GAAG,CAAC,KAAK,EAAE;gBACjB,QAAQ,EAAE,CAAC;gBACX,WAAW,EAAE,GAAG,CA
AC,EAAE;gBACnB,WAAW,EAAE,GAAG,CAAC,OAAO,IAAI,EAAE;aAC/B,CAAC,CAAC;QACL,CAAC;QAED,wEAAwE;QACxE,MAAM,MAAM,GAAG,IAAI,GAAG,EAAkB,CAAC;QACzC,MAAM,cAAc,GAAG,IAAI,GAAG,EAA6C,CAAC;QAE5E,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC3C,MAAM,EAAE,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YAC5B,MAAM,QAAQ,GAAG,CAAC,GAAG,CAAC,YAAY,CAAC,KAAK,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC;YAClD,MAAM,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,CAAC,GAAG,QAAQ,CAAC,CAAC;QACnD,CAAC;QAED,KAAK,MAAM,CAAC,KAAK,EAAE,MAAM,CAAC,IAAI,OAAO,EAAE,CAAC;YACtC,MAAM,QAAQ,GAAG,CAAC,GAAG,CAAC,YAAY,CAAC,KAAK,GAAG,MAAM,CAAC,QAAQ,GAAG,CAAC,CAAC,CAAC;YAChE,MAAM,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,QAAQ,CAAC,CAAC;YACvD,wCAAwC;YACxC,MAAM,QAAQ,GAAG,cAAc,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;YAC3C,IAAI,CAAC,QAAQ,IAAI,MAAM,CAAC,QAAQ,GAAG,QAAQ,CAAC,IAAI,EAAE,CAAC;gBACjD,cAAc,CAAC,GAAG,CAAC,KAAK,EAAE,EAAE,OAAO,EAAE,MAAM,CAAC,WAAW,EAAE,IAAI,EAAE,MAAM,CAAC,QAAQ,EAAE,CAAC,CAAC;YACpF,CAAC;QACH,CAAC;QAED,uDAAuD;QACvD,MAAM,OAAO,GAAG,IAAI,GAAG,EAAwB,CAAC;QAChD,KAAK,MAAM,CAAC,IAAI,UAAU,EAAE,CAAC;YAC3B,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC;QACvB,CAAC;QAED,sEAAsE;QACtE,uEAAuE;QACvE,eAAe;QACf,MAAM,QAAQ,GAAmB,EAAE,CAAC;QAEpC,KAAK,MAAM,CAAC,EAAE,EAAE,QAAQ,CAAC,IAAI,MAAM,EAAE,CAAC;YACpC,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YAC/B,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YAC/B,IAAI,MAAM,EAAE,CAAC;gBACX,sEAAsE;gBACtE,kDAAkD;gBAClD,MAAM,OAAO,GACX,MAAM,IAAI,MAAM,CAAC,WAAW;oBAC1B,CAAC,CAAC,MAAM,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC,EAAE,iBAAiB,CAAC;oBAChD,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC;gBACrB,QAAQ,CAAC,IAAI,CAAC,EAAE,GAAG,MAAM,EAAE,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAE,CAAC,CAAC;YACzD,CAAC;iBAAM,CAAC;gBACN,sDAAsD;gBACtD,MAAM,GAAG,GAAG,IAAI,CAAC,EAAE,CAAC,WAAW,CAAC,EAAE,CAAC,CAAC;gBACpC,0DAA0D;gBAC1D,IAAI,CAAC,GAAG,IAAI,GAAG,CAAC,MAAM;oBAAE,SAAS;gBACjC,MAAM,OAAO,GAAG,MAAM;oBACpB,CAAC,CAAC,MAAM,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC,EAAE,iBAAi
B,CAAC;oBAChD,CAAC,CAAC,EAAE,CAAC;gBACP,QAAQ,CAAC,IAAI,CAAC;oBACZ,EAAE,EAAE,GAAG,CAAC,EAAE;oBACV,IAAI,EAAE,GAAG,CAAC,IAAc;oBACxB,KAAK,EAAE,GAAG,CAAC,KAAK;oBAChB,IAAI,EAAE,GAAG,CAAC,IAAI;oBACd,MAAM,EAAE,GAAG,CAAC,MAAM;oBAClB,IAAI,EAAE,GAAG,CAAC,IAAI;oBACd,KAAK,EAAE,QAAQ;oBACf,OAAO;iBACR,CAAC,CAAC;YACL,CAAC;QACH,CAAC;QAED,+BAA+B;QAC/B,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;QAE3C,0BAA0B;QAC1B,IAAI,QAAQ,GAAG,QAAQ,CAAC;QACxB,IAAI,CAAC,iBAAiB,EAAE,CAAC;YACvB,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,YAAY,CAAC,CAAC;QAC/D,CAAC;QAED,oBAAoB;QACpB,IAAI,IAAI,EAAE,CAAC;YACT,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,IAAI,CAAC,CAAC;QACrD,CAAC;QAED,oBAAoB;QACpB,IAAI,IAAI,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC5B,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,IAAI,CAAC,CAAC;YAC7B,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE;gBAC/B,MAAM,OAAO,GAAG,IAAI,CAAC,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;gBACtC,OAAO,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;YAC5C,CAAC,CAAC,CAAC;QACL,CAAC;QAED,sEAAsE;QACtE,uEAAuE;QACvE,sDAAsD;QACtD,IAAI,UAAU,IAAI,UAAU,KAAK,KAAK,EAAE,CAAC;YACvC,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE;gBAC/B,MAAM,IAAI,GAAG,IAAI,CAAC,EAAE,CAAC,aAAa,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;gBACzC,gDAAgD;gBAChD,OAAO,CAAC,IAAI,IAAI,KAAK,CAAC,KAAK,UAAU,CAAC;YACxC,CAAC,CAAC,CAAC;QACL,CAAC;QAED,oEAAoE;QACpE,kBAAkB;QAClB,KAAK,MAAM,CAAC,IAAI,QAAQ,EAAE,CAAC;YACzB,MAAM,IAAI,GAAG,cAAc,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;YACtC,IAAI,CAAC,IAAI;gBAAE,SAAS;YACpB,MAAM,KAAK,GAAG,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;YAC5C,IAAI,CAAC,KAAK;gBAAE,SAAS;YACrB,CAAC,CAAC,WAAW,GAAG,KAAK,CAAC,EAAE,CAAC;YACzB,CAAC,CAAC,UAAU,GAAG,KAAK,CAAC,WAAW,CAAC;YACjC,CAAC,CAAC,SAAS,GAAG,KAAK,CAAC,UAAU,CAAC;YAC/B,CAAC,CAAC,OAAO,GAAG,KAAK,CAAC,QAAQ,CAAC;YAC3B,CAAC,CAAC,aAAa,GAAG,KAAK,CAAC,cAAc,CAAC;QACzC,CAAC;QAED,OAAO,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,
KAAK,CAAC,CAAC;IAClC,CAAC"}
|
package/dist/vector-search.d.ts
CHANGED
|
@@ -2,6 +2,12 @@ import type { KnowledgeDB } from "./db.js";
|
|
|
2
2
|
export interface VectorResult {
|
|
3
3
|
id: string;
|
|
4
4
|
distance: number;
|
|
5
|
+
/**
|
|
6
|
+
* Chunk content populated via LEFT JOIN to `chunks` table when the vec id
|
|
7
|
+
* matches a chunk row. When the vec id is doc-level (back-compat / legacy
|
|
8
|
+
* fixtures) or no matching chunks row exists, this is `null`.
|
|
9
|
+
*/
|
|
10
|
+
content?: string | null;
|
|
5
11
|
}
|
|
6
12
|
export declare class VectorSearch {
|
|
7
13
|
private knowledgeDb;
|
package/dist/vector-search.js
CHANGED
|
@@ -60,10 +60,14 @@ export class VectorSearch {
|
|
|
60
60
|
search(queryEmbedding, limit = 10) {
|
|
61
61
|
this.ensureVecLoaded();
|
|
62
62
|
const buf = float32ToBuffer(queryEmbedding);
|
|
63
|
+
// LEFT JOIN to `chunks` so chunk-level vec rows surface their content.
|
|
64
|
+
// Doc-level vec ids (no matching chunks row) return content = NULL, which
|
|
65
|
+
// preserves back-compat for pre-chunks callers and legacy test fixtures.
|
|
63
66
|
return this.knowledgeDb.db
|
|
64
67
|
.prepare(`
|
|
65
|
-
SELECT id, distance
|
|
68
|
+
SELECT documents_vec.id, distance, chunks.content
|
|
66
69
|
FROM documents_vec
|
|
70
|
+
LEFT JOIN chunks ON chunks.id = documents_vec.id
|
|
67
71
|
WHERE embedding MATCH ? AND k = ?
|
|
68
72
|
ORDER BY distance
|
|
69
73
|
`)
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"vector-search.js","sourceRoot":"","sources":["../src/vector-search.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,SAAS,MAAM,YAAY,CAAC;
|
|
1
|
+
{"version":3,"file":"vector-search.js","sourceRoot":"","sources":["../src/vector-search.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,SAAS,MAAM,YAAY,CAAC;AAcxC,SAAS,eAAe,CAAC,GAAiB;IACxC,OAAO,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,EAAE,GAAG,CAAC,UAAU,EAAE,GAAG,CAAC,UAAU,CAAC,CAAC;AACjE,CAAC;AAED,MAAM,OAAO,YAAY;IAGH;IAFZ,SAAS,GAAG,KAAK,CAAC;IAE1B,YAAoB,WAAwB;QAAxB,gBAAW,GAAX,WAAW,CAAa;IAAG,CAAC;IAExC,eAAe;QACrB,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,CAAC;YACpB,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,WAAW,CAAC,EAAE,CAAC,CAAC;YACpC,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC;QACxB,CAAC;IACH,CAAC;IAED,WAAW;QACT,IAAI,CAAC,eAAe,EAAE,CAAC;QACvB,IAAI,CAAC,WAAW,CAAC,EAAE,CAAC,IAAI,CAAC;;;;;KAKxB,CAAC,CAAC;IACL,CAAC;IAED,SAAS;QACP,IAAI,CAAC,WAAW,CAAC,EAAE,CAAC,IAAI,CAAC,oCAAoC,CAAC,CAAC;IACjE,CAAC;IAED,eAAe,CAAC,EAAU,EAAE,SAAuB;QACjD,IAAI,CAAC,eAAe,EAAE,CAAC;QACvB,MAAM,GAAG,GAAG,eAAe,CAAC,SAAS,CAAC,CAAC;QACvC,IAAI,CAAC,WAAW,CAAC,EAAE;aAChB,OAAO,CAAC,wCAAwC,CAAC;aACjD,GAAG,CAAC,EAAE,CAAC,CAAC;QACX,IAAI,CAAC,WAAW,CAAC,EAAE;aAChB,OAAO,CAAC,yDAAyD,CAAC;aAClE,GAAG,CAAC,EAAE,EAAE,GAAG,CAAC,CAAC;IAClB,CAAC;IAED,eAAe,CAAC,EAAU;QACxB,IAAI,CAAC,eAAe,EAAE,CAAC;QACvB,IAAI,CAAC,WAAW,CAAC,EAAE;aAChB,OAAO,CAAC,wCAAwC,CAAC;aACjD,GAAG,CAAC,EAAE,CAAC,CAAC;IACb,CAAC;IAED;;;;;;;;OAQG;IACH,oBAAoB,CAAC,KAAa;QAChC,IAAI,CAAC,eAAe,EAAE,CAAC;QACvB,IAAI,CAAC,WAAW,CAAC,EAAE;aAChB,OAAO,CAAC,2CAA2C,CAAC;aACpD,GAAG,CAAC,GAAG,KAAK,KAAK,CAAC,CAAC;IACxB,CAAC;IAED,MAAM,CAAC,cAA4B,EAAE,QAAgB,EAAE;QACrD,IAAI,CAAC,eAAe,EAAE,CAAC;QACvB,MAAM,GAAG,GAAG,eAAe,CAAC,cAAc,CAAC,CAAC;QAC5C,uEAAuE;QACvE,0EAA0E;QAC1E,yEAAyE;QACzE,OAAO,IAAI,CAAC,WAAW,CAAC,EAAE;aACvB,OAAO,CACN;;;;;;KAMH,CACE;aACA,GAAG,CAAC,GAAG,EAAE,KAAK,CAAmB,CAAC;IACvC,CAAC;CACF"}
|
package/package.json
CHANGED
|
@@ -217,3 +217,230 @@ describe("HybridSearch chunk metadata enrichment", () => {
|
|
|
217
217
|
expect(hit!.chunkIndex).toBeUndefined();
|
|
218
218
|
});
|
|
219
219
|
});
|
|
220
|
+
|
|
221
|
+
describe("HybridSearch chunk-to-doc dedup", () => {
|
|
222
|
+
let dedupDb: KnowledgeDB;
|
|
223
|
+
let dedupFts: FtsSearch;
|
|
224
|
+
let dedupVec: VectorSearch;
|
|
225
|
+
let dedupHybrid: HybridSearch;
|
|
226
|
+
|
|
227
|
+
/**
|
|
228
|
+
* Deterministic embed function for dedup tests: always returns the same
|
|
229
|
+
* vector as mockEmbedding(42). Paired with chunk embeddings that use the
|
|
230
|
+
* same seed so the chunks rank near-perfectly for any query.
|
|
231
|
+
*/
|
|
232
|
+
const fixedEmbedFn: EmbedFn = async () => mockEmbedding(42);
|
|
233
|
+
|
|
234
|
+
/** Insert a chunk row into the chunks table. */
|
|
235
|
+
function insertChunk(
|
|
236
|
+
db: KnowledgeDB,
|
|
237
|
+
chunkId: string,
|
|
238
|
+
docId: string,
|
|
239
|
+
index: number,
|
|
240
|
+
content: string,
|
|
241
|
+
): void {
|
|
242
|
+
db.db
|
|
243
|
+
.prepare(
|
|
244
|
+
`INSERT INTO chunks (id, document_id, chunk_index, content, char_start, char_end)
|
|
245
|
+
VALUES (?, ?, ?, ?, ?, ?)`
|
|
246
|
+
)
|
|
247
|
+
.run(chunkId, docId, index, content, 0, content.length);
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
beforeEach(() => {
|
|
251
|
+
dedupDb = new KnowledgeDB(":memory:");
|
|
252
|
+
|
|
253
|
+
dedupDb.upsertDocument({
|
|
254
|
+
id: "chunk-doc",
|
|
255
|
+
path: "thoughts/shared/research/chunking-strategies.md",
|
|
256
|
+
title: "Chunking Strategies Deep Dive",
|
|
257
|
+
date: "2026-03-01",
|
|
258
|
+
type: "research",
|
|
259
|
+
status: "draft",
|
|
260
|
+
githubIssue: null,
|
|
261
|
+
content:
|
|
262
|
+
"Header paragraph not a chunk match. Body discusses recursive character splitter tradeoffs.",
|
|
263
|
+
});
|
|
264
|
+
|
|
265
|
+
dedupDb.upsertDocument({
|
|
266
|
+
id: "other-doc",
|
|
267
|
+
path: "thoughts/shared/plans/other.md",
|
|
268
|
+
title: "Unrelated Plan",
|
|
269
|
+
date: "2026-03-02",
|
|
270
|
+
type: "plan",
|
|
271
|
+
status: "draft",
|
|
272
|
+
githubIssue: null,
|
|
273
|
+
content: "This is a completely different topic unrelated to the query.",
|
|
274
|
+
});
|
|
275
|
+
|
|
276
|
+
ensureV3Schema(dedupDb);
|
|
277
|
+
|
|
278
|
+
dedupFts = new FtsSearch(dedupDb);
|
|
279
|
+
dedupFts.rebuildIndex();
|
|
280
|
+
|
|
281
|
+
dedupVec = new VectorSearch(dedupDb);
|
|
282
|
+
dedupVec.createIndex();
|
|
283
|
+
|
|
284
|
+
// Five chunks from chunk-doc, seeded almost identically so they rank as
|
|
285
|
+
// the top-5 vector hits for fixedEmbedFn. Distinct content per chunk so
|
|
286
|
+
// we can verify which one becomes the snippet.
|
|
287
|
+
for (let i = 0; i < 5; i++) {
|
|
288
|
+
const id = `chunk-doc#c${i}`;
|
|
289
|
+
const content = `Chunk ${i} content about recursive character splitter tradeoffs — paragraph ${i}.`;
|
|
290
|
+
insertChunk(dedupDb, id, "chunk-doc", i, content);
|
|
291
|
+
// Slight seed variation so distance differs per chunk; chunk 0 is best.
|
|
292
|
+
dedupVec.upsertEmbedding(id, mockEmbedding(42 + i * 0.0001));
|
|
293
|
+
}
|
|
294
|
+
|
|
295
|
+
// other-doc has a single chunk embedded with a very different seed so
|
|
296
|
+
// it ranks well below chunk-doc's chunks.
|
|
297
|
+
insertChunk(
|
|
298
|
+
dedupDb,
|
|
299
|
+
"other-doc#c0",
|
|
300
|
+
"other-doc",
|
|
301
|
+
0,
|
|
302
|
+
"Unrelated single chunk content.",
|
|
303
|
+
);
|
|
304
|
+
dedupVec.upsertEmbedding("other-doc#c0", mockEmbedding(900));
|
|
305
|
+
|
|
306
|
+
dedupHybrid = new HybridSearch(
|
|
307
|
+
dedupDb,
|
|
308
|
+
dedupFts,
|
|
309
|
+
dedupVec,
|
|
310
|
+
fixedEmbedFn,
|
|
311
|
+
);
|
|
312
|
+
});
|
|
313
|
+
|
|
314
|
+
it("deduplicates: 5 chunks from same doc yield exactly 1 result entry", async () => {
|
|
315
|
+
const results = await dedupHybrid.search("anything");
|
|
316
|
+
|
|
317
|
+
const chunkDocHits = results.filter((r) => r.id === "chunk-doc");
|
|
318
|
+
expect(chunkDocHits).toHaveLength(1);
|
|
319
|
+
// Also ensure no chunk-level id leaks into the results
|
|
320
|
+
const chunkIds = results.filter((r) => r.id.includes("#c"));
|
|
321
|
+
expect(chunkIds).toHaveLength(0);
|
|
322
|
+
});
|
|
323
|
+
|
|
324
|
+
it("surfaced entry's snippet comes from the highest-ranked chunk", async () => {
|
|
325
|
+
const results = await dedupHybrid.search("anything");
|
|
326
|
+
|
|
327
|
+
const chunkDocHit = results.find((r) => r.id === "chunk-doc");
|
|
328
|
+
expect(chunkDocHit).toBeDefined();
|
|
329
|
+
// Chunk 0 has the smallest seed offset (mockEmbedding(42 + 0)) so it
|
|
330
|
+
// should have the smallest distance to fixedEmbedFn = mockEmbedding(42).
|
|
331
|
+
expect(chunkDocHit!.snippet).toContain("Chunk 0");
|
|
332
|
+
});
|
|
333
|
+
|
|
334
|
+
it("snippet length is at most 300 characters", async () => {
|
|
335
|
+
// Add a chunk with very long content to chunk-doc and re-embed so it
|
|
336
|
+
// becomes chunk 0's rival.
|
|
337
|
+
const longContent = "X".repeat(5000);
|
|
338
|
+
dedupDb.db
|
|
339
|
+
.prepare(
|
|
340
|
+
`INSERT INTO chunks (id, document_id, chunk_index, content, char_start, char_end)
|
|
341
|
+
VALUES (?, ?, ?, ?, ?, ?)`
|
|
342
|
+
)
|
|
343
|
+
.run("chunk-doc#c99", "chunk-doc", 99, longContent, 0, longContent.length);
|
|
344
|
+
// Embed with seed exactly 42 (distance 0). Note: chunk 0 is also seeded with exactly 42, so this ties with it rather than strictly outranking it.
|
|
345
|
+
dedupVec.upsertEmbedding("chunk-doc#c99", mockEmbedding(42));
|
|
346
|
+
|
|
347
|
+
const results = await dedupHybrid.search("anything");
|
|
348
|
+
const hit = results.find((r) => r.id === "chunk-doc");
|
|
349
|
+
expect(hit).toBeDefined();
|
|
350
|
+
expect(hit!.snippet.length).toBeLessThanOrEqual(300);
|
|
351
|
+
});
|
|
352
|
+
|
|
353
|
+
it("title-only FTS match still returns the doc (no regression on legacy doc-level hits)", async () => {
|
|
354
|
+
// Document with a doc-level vec row (no chunks) — simulates a legacy
|
|
355
|
+
// record that predates the chunks table.
|
|
356
|
+
dedupDb.upsertDocument({
|
|
357
|
+
id: "legacy-doc",
|
|
358
|
+
path: "thoughts/legacy.md",
|
|
359
|
+
title: "Legacy Title Only Matching Query",
|
|
360
|
+
date: "2026-02-01",
|
|
361
|
+
type: "research",
|
|
362
|
+
status: "draft",
|
|
363
|
+
githubIssue: null,
|
|
364
|
+
content: "Legacy body text",
|
|
365
|
+
});
|
|
366
|
+
dedupFts.rebuildIndex();
|
|
367
|
+
dedupVec.upsertEmbedding("legacy-doc", mockEmbedding(800));
|
|
368
|
+
|
|
369
|
+
const results = await dedupHybrid.search("Legacy");
|
|
370
|
+
const legacyHit = results.find((r) => r.id === "legacy-doc");
|
|
371
|
+
expect(legacyHit).toBeDefined();
|
|
372
|
+
// FTS contributed the snippet (no chunk content to override it).
|
|
373
|
+
expect(legacyHit!.snippet).toBeDefined();
|
|
374
|
+
});
|
|
375
|
+
|
|
376
|
+
it("RRF score: bucketed rank 0 + FTS rank 2 equals 1/(60+1) + 1/(60+3)", async () => {
|
|
377
|
+
// Force a known configuration by clearing and rebuilding:
|
|
378
|
+
// - chunk-doc is the #1 vector hit (bucketed rank 0)
|
|
379
|
+
// - chunk-doc is the #3 FTS hit (index 2)
|
|
380
|
+
// We arrange this by inserting three docs that match "match" in FTS,
|
|
381
|
+
// ordered by BM25 so chunk-doc ends up at rank 2.
|
|
382
|
+
//
|
|
383
|
+
// Simpler: test this using a fresh controlled fixture.
|
|
384
|
+
const tdb = new KnowledgeDB(":memory:");
|
|
385
|
+
|
|
386
|
+
tdb.upsertDocument({
|
|
387
|
+
id: "d-fts-top",
|
|
388
|
+
path: "a.md",
|
|
389
|
+
title: "match match match match",
|
|
390
|
+
date: "2026-01-01",
|
|
391
|
+
type: "research",
|
|
392
|
+
status: "draft",
|
|
393
|
+
githubIssue: null,
|
|
394
|
+
content: "match match match match match",
|
|
395
|
+
});
|
|
396
|
+
tdb.upsertDocument({
|
|
397
|
+
id: "d-fts-second",
|
|
398
|
+
path: "b.md",
|
|
399
|
+
title: "match match match",
|
|
400
|
+
date: "2026-01-02",
|
|
401
|
+
type: "research",
|
|
402
|
+
status: "draft",
|
|
403
|
+
githubIssue: null,
|
|
404
|
+
content: "match match match",
|
|
405
|
+
});
|
|
406
|
+
tdb.upsertDocument({
|
|
407
|
+
id: "target",
|
|
408
|
+
path: "c.md",
|
|
409
|
+
title: "Target Doc",
|
|
410
|
+
date: "2026-01-03",
|
|
411
|
+
type: "research",
|
|
412
|
+
status: "draft",
|
|
413
|
+
githubIssue: null,
|
|
414
|
+
content: "target doc with match keyword once",
|
|
415
|
+
});
|
|
416
|
+
|
|
417
|
+
const tfts = new FtsSearch(tdb);
|
|
418
|
+
tfts.rebuildIndex();
|
|
419
|
+
|
|
420
|
+
const tvec = new VectorSearch(tdb);
|
|
421
|
+
tvec.createIndex();
|
|
422
|
+
|
|
423
|
+
// One chunk for target, seeded exactly 42 so it's the only/best vec hit.
|
|
424
|
+
insertChunk(tdb, "target#c0", "target", 0, "Chunk content for target.");
|
|
425
|
+
tvec.upsertEmbedding("target#c0", mockEmbedding(42));
|
|
426
|
+
// Add far-away embeddings for the other docs so they don't contribute
|
|
427
|
+
// to the vector bucket's top ranks for target.
|
|
428
|
+
tvec.upsertEmbedding("d-fts-top", mockEmbedding(900));
|
|
429
|
+
tvec.upsertEmbedding("d-fts-second", mockEmbedding(901));
|
|
430
|
+
|
|
431
|
+
const thybrid = new HybridSearch(tdb, tfts, tvec, fixedEmbedFn);
|
|
432
|
+
const results = await thybrid.search("match");
|
|
433
|
+
|
|
434
|
+
const target = results.find((r) => r.id === "target");
|
|
435
|
+
expect(target).toBeDefined();
|
|
436
|
+
|
|
437
|
+
// Verify FTS rank of target is 2 (third position) by fetching raw FTS.
|
|
438
|
+
const ftsRaw = tfts.search("match", { includeSuperseded: true, limit: 40 });
|
|
439
|
+
const ftsRankOfTarget = ftsRaw.findIndex((r) => r.id === "target");
|
|
440
|
+
expect(ftsRankOfTarget).toBe(2);
|
|
441
|
+
|
|
442
|
+
const K = 60;
|
|
443
|
+
const expected = 1 / (K + 0 + 1) + 1 / (K + 2 + 1);
|
|
444
|
+
expect(target!.score).toBeCloseTo(expected, 10);
|
|
445
|
+
});
|
|
446
|
+
});
|
|
@@ -63,4 +63,30 @@ describe("VectorSearch", () => {
|
|
|
63
63
|
const results = vecSearch.search(mockEmbedding(1), 1);
|
|
64
64
|
expect(results).toHaveLength(1);
|
|
65
65
|
});
|
|
66
|
+
|
|
67
|
+
it("returns content = null when vec id has no matching chunks row (back-compat)", () => {
|
|
68
|
+
// doc-1 has no chunks row; vec id is doc-level. LEFT JOIN should yield null.
|
|
69
|
+
const results = vecSearch.search(mockEmbedding(1), 5);
|
|
70
|
+
const hit = results.find((r) => r.id === "doc-1");
|
|
71
|
+
expect(hit).toBeDefined();
|
|
72
|
+
expect(hit!.content).toBeNull();
|
|
73
|
+
});
|
|
74
|
+
|
|
75
|
+
it("returns content populated when vec id matches a chunks row", () => {
|
|
76
|
+
// Insert a chunk-level vec row + matching chunks row for doc-1
|
|
77
|
+
db.db
|
|
78
|
+
.prepare(
|
|
79
|
+
`INSERT INTO chunks (id, document_id, chunk_index, content, char_start, char_end)
|
|
80
|
+
VALUES (?, ?, ?, ?, ?, ?)`
|
|
81
|
+
)
|
|
82
|
+
.run("doc-1#c0", "doc-1", 0, "This is the first chunk content.", 0, 32);
|
|
83
|
+
vecSearch.upsertEmbedding("doc-1#c0", mockEmbedding(1));
|
|
84
|
+
// Remove the doc-level vec row so chunk-level wins
|
|
85
|
+
vecSearch.deleteEmbedding("doc-1");
|
|
86
|
+
|
|
87
|
+
const results = vecSearch.search(mockEmbedding(1), 5);
|
|
88
|
+
const hit = results.find((r) => r.id === "doc-1#c0");
|
|
89
|
+
expect(hit).toBeDefined();
|
|
90
|
+
expect(hit!.content).toBe("This is the first chunk content.");
|
|
91
|
+
});
|
|
66
92
|
});
|
package/src/hybrid-search.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import type { KnowledgeDB } from "./db.js";
|
|
2
2
|
import type { FtsSearch, SearchOptions, SearchResult } from "./search.js";
|
|
3
|
-
import type { VectorSearch } from "./vector-search.js";
|
|
3
|
+
import type { VectorResult, VectorSearch } from "./vector-search.js";
|
|
4
4
|
|
|
5
5
|
export type EmbedFn = (text: string) => Promise<Float32Array>;
|
|
6
6
|
|
|
@@ -14,6 +14,23 @@ interface ChunkRow {
|
|
|
14
14
|
content: string;
|
|
15
15
|
}
|
|
16
16
|
|
|
17
|
+
/**
|
|
18
|
+
* Maximum snippet length (in characters) when the snippet is sourced from a
|
|
19
|
+
* chunk's content. Keeps the MCP payload compact while still representative.
|
|
20
|
+
*/
|
|
21
|
+
const SNIPPET_MAX_CHARS = 300;
|
|
22
|
+
|
|
23
|
+
/**
|
|
24
|
+
* Per-document bucket tracking the best-ranked chunk for a given doc_id in
|
|
25
|
+
* the vector result list. The "rank" is the index of the first occurrence of
|
|
26
|
+
* the document in the distance-sorted vector results (smaller = better).
|
|
27
|
+
*/
|
|
28
|
+
interface DocBucket {
|
|
29
|
+
bestRank: number;
|
|
30
|
+
bestChunkId: string;
|
|
31
|
+
bestContent: string;
|
|
32
|
+
}
|
|
33
|
+
|
|
17
34
|
export class HybridSearch {
|
|
18
35
|
private static readonly RRF_K = 60;
|
|
19
36
|
|
|
@@ -75,11 +92,27 @@ export class HybridSearch {
|
|
|
75
92
|
});
|
|
76
93
|
|
|
77
94
|
const queryEmbedding = await this.embedFn(query);
|
|
78
|
-
const vecResults = this.vec.search(
|
|
95
|
+
const vecResults: VectorResult[] = this.vec.search(
|
|
96
|
+
queryEmbedding,
|
|
97
|
+
limit * 2,
|
|
98
|
+
);
|
|
99
|
+
|
|
100
|
+
// Bucket vector results by doc_id, keeping the best-ranked chunk per doc.
|
|
101
|
+
// vecResults is already sorted by distance ascending, so the first
|
|
102
|
+
// occurrence of a given doc_id has the smallest rank (best match).
|
|
103
|
+
const buckets = new Map<string, DocBucket>();
|
|
104
|
+
for (let i = 0; i < vecResults.length; i++) {
|
|
105
|
+
const hit = vecResults[i];
|
|
106
|
+
const docId = this.docIdFromVecId(hit.id);
|
|
107
|
+
if (buckets.has(docId)) continue; // Already have best rank for this doc
|
|
108
|
+
buckets.set(docId, {
|
|
109
|
+
bestRank: i,
|
|
110
|
+
bestChunkId: hit.id,
|
|
111
|
+
bestContent: hit.content ?? "",
|
|
112
|
+
});
|
|
113
|
+
}
|
|
79
114
|
|
|
80
|
-
// Build RRF score map
|
|
81
|
-
// like `{doc}#c{n}`, we collapse to the parent doc for scoring but
|
|
82
|
-
// remember the best-scoring chunk id per doc for later meta enrichment.
|
|
115
|
+
// Build RRF score map (keyed by doc_id for both FTS and vector buckets)
|
|
83
116
|
const scores = new Map<string, number>();
|
|
84
117
|
const bestChunkByDoc = new Map<string, { chunkId: string; rank: number }>();
|
|
85
118
|
|
|
@@ -89,16 +122,13 @@ export class HybridSearch {
|
|
|
89
122
|
scores.set(id, (scores.get(id) ?? 0) + rrfScore);
|
|
90
123
|
}
|
|
91
124
|
|
|
92
|
-
for (
|
|
93
|
-
const
|
|
94
|
-
const docId = this.docIdFromVecId(vecId);
|
|
95
|
-
const rrfScore = 1 / (HybridSearch.RRF_K + i + 1);
|
|
125
|
+
for (const [docId, bucket] of buckets) {
|
|
126
|
+
const rrfScore = 1 / (HybridSearch.RRF_K + bucket.bestRank + 1);
|
|
96
127
|
scores.set(docId, (scores.get(docId) ?? 0) + rrfScore);
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
}
|
|
128
|
+
// Track best chunk for later enrichment
|
|
129
|
+
const existing = bestChunkByDoc.get(docId);
|
|
130
|
+
if (!existing || bucket.bestRank < existing.rank) {
|
|
131
|
+
bestChunkByDoc.set(docId, { chunkId: bucket.bestChunkId, rank: bucket.bestRank });
|
|
102
132
|
}
|
|
103
133
|
}
|
|
104
134
|
|
|
@@ -108,18 +138,30 @@ export class HybridSearch {
|
|
|
108
138
|
ftsById.set(r.id, r);
|
|
109
139
|
}
|
|
110
140
|
|
|
111
|
-
// Assemble combined results
|
|
141
|
+
// Assemble combined results. For vector-hit docs, replace the snippet
|
|
142
|
+
// with the winning chunk's content (truncated). FTS-only hits keep the
|
|
143
|
+
// FTS snippet.
|
|
112
144
|
const combined: SearchResult[] = [];
|
|
113
145
|
|
|
114
146
|
for (const [id, rrfScore] of scores) {
|
|
115
147
|
const ftsHit = ftsById.get(id);
|
|
148
|
+
const bucket = buckets.get(id);
|
|
116
149
|
if (ftsHit) {
|
|
117
|
-
|
|
150
|
+
// FTS hit (possibly also a vector hit): prefer the chunk snippet when
|
|
151
|
+
// the vector side contributed real chunk content.
|
|
152
|
+
const snippet =
|
|
153
|
+
bucket && bucket.bestContent
|
|
154
|
+
? bucket.bestContent.slice(0, SNIPPET_MAX_CHARS)
|
|
155
|
+
: ftsHit.snippet;
|
|
156
|
+
combined.push({ ...ftsHit, score: rrfScore, snippet });
|
|
118
157
|
} else {
|
|
119
158
|
// Vector-only result: fetch document metadata from db
|
|
120
159
|
const doc = this.db.getDocument(id);
|
|
121
160
|
// Skip stub documents — they have no real content or path
|
|
122
161
|
if (!doc || doc.isStub) continue;
|
|
162
|
+
const snippet = bucket
|
|
163
|
+
? bucket.bestContent.slice(0, SNIPPET_MAX_CHARS)
|
|
164
|
+
: "";
|
|
123
165
|
combined.push({
|
|
124
166
|
id: doc.id,
|
|
125
167
|
path: doc.path as string,
|
|
@@ -128,7 +170,7 @@ export class HybridSearch {
|
|
|
128
170
|
status: doc.status,
|
|
129
171
|
date: doc.date,
|
|
130
172
|
score: rrfScore,
|
|
131
|
-
snippet
|
|
173
|
+
snippet,
|
|
132
174
|
});
|
|
133
175
|
}
|
|
134
176
|
}
|
package/src/vector-search.ts
CHANGED
|
@@ -4,6 +4,12 @@ import type { KnowledgeDB } from "./db.js";
|
|
|
4
4
|
export interface VectorResult {
|
|
5
5
|
id: string;
|
|
6
6
|
distance: number;
|
|
7
|
+
/**
|
|
8
|
+
* Chunk content populated via LEFT JOIN to `chunks` table when the vec id
|
|
9
|
+
* matches a chunk row. When the vec id is doc-level (back-compat / legacy
|
|
10
|
+
* fixtures) or no matching chunks row exists, this is `null`.
|
|
11
|
+
*/
|
|
12
|
+
content?: string | null;
|
|
7
13
|
}
|
|
8
14
|
|
|
9
15
|
function float32ToBuffer(arr: Float32Array): Buffer {
|
|
@@ -73,11 +79,15 @@ export class VectorSearch {
|
|
|
73
79
|
search(queryEmbedding: Float32Array, limit: number = 10): VectorResult[] {
|
|
74
80
|
this.ensureVecLoaded();
|
|
75
81
|
const buf = float32ToBuffer(queryEmbedding);
|
|
82
|
+
// LEFT JOIN to `chunks` so chunk-level vec rows surface their content.
|
|
83
|
+
// Doc-level vec ids (no matching chunks row) return content = NULL, which
|
|
84
|
+
// preserves back-compat for pre-chunks callers and legacy test fixtures.
|
|
76
85
|
return this.knowledgeDb.db
|
|
77
86
|
.prepare(
|
|
78
87
|
`
|
|
79
|
-
SELECT id, distance
|
|
88
|
+
SELECT documents_vec.id, distance, chunks.content
|
|
80
89
|
FROM documents_vec
|
|
90
|
+
LEFT JOIN chunks ON chunks.id = documents_vec.id
|
|
81
91
|
WHERE embedding MATCH ? AND k = ?
|
|
82
92
|
ORDER BY distance
|
|
83
93
|
`
|