ralph-hero-knowledge-index 0.1.24 → 0.1.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "ralph-knowledge",
3
- "version": "0.1.24",
3
+ "version": "0.1.25",
4
4
  "description": "Knowledge graph for ralph-hero: semantic search, relationship traversal, and document indexing across thoughts/ documents. Optional companion to ralph-hero.",
5
5
  "author": {
6
6
  "name": "Chad Dubiel",
package/.mcp.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "mcpServers": {
3
3
  "ralph-knowledge": {
4
4
  "command": "npx",
5
- "args": ["-y", "ralph-hero-knowledge-index@0.1.24"]
5
+ "args": ["-y", "ralph-hero-knowledge-index@0.1.25"]
6
6
  }
7
7
  }
8
8
  }
@@ -1,3 +1,8 @@
1
+ /**
2
+ * Maximum snippet length (in characters) when the snippet is sourced from a
3
+ * chunk's content. Keeps the MCP payload compact while still representative.
4
+ */
5
+ const SNIPPET_MAX_CHARS = 300;
1
6
  export class HybridSearch {
2
7
  db;
3
8
  fts;
@@ -53,9 +58,22 @@ export class HybridSearch {
53
58
  });
54
59
  const queryEmbedding = await this.embedFn(query);
55
60
  const vecResults = this.vec.search(queryEmbedding, limit * 2);
56
- // Build RRF score map, keyed by document_id. When vec ids are chunk ids
57
- // like `{doc}#c{n}`, we collapse to the parent doc for scoring but
58
- // remember the best-scoring chunk id per doc for later meta enrichment.
61
+ // Bucket vector results by doc_id, keeping the best-ranked chunk per doc.
62
+ // vecResults is already sorted by distance ascending, so the first
63
+ // occurrence of a given doc_id has the smallest rank (best match).
64
+ const buckets = new Map();
65
+ for (let i = 0; i < vecResults.length; i++) {
66
+ const hit = vecResults[i];
67
+ const docId = this.docIdFromVecId(hit.id);
68
+ if (buckets.has(docId))
69
+ continue; // Already have best rank for this doc
70
+ buckets.set(docId, {
71
+ bestRank: i,
72
+ bestChunkId: hit.id,
73
+ bestContent: hit.content ?? "",
74
+ });
75
+ }
76
+ // Build RRF score map (keyed by doc_id for both FTS and vector buckets)
59
77
  const scores = new Map();
60
78
  const bestChunkByDoc = new Map();
61
79
  for (let i = 0; i < ftsResults.length; i++) {
@@ -63,16 +81,13 @@ export class HybridSearch {
63
81
  const rrfScore = 1 / (HybridSearch.RRF_K + i + 1);
64
82
  scores.set(id, (scores.get(id) ?? 0) + rrfScore);
65
83
  }
66
- for (let i = 0; i < vecResults.length; i++) {
67
- const vecId = vecResults[i].id;
68
- const docId = this.docIdFromVecId(vecId);
69
- const rrfScore = 1 / (HybridSearch.RRF_K + i + 1);
84
+ for (const [docId, bucket] of buckets) {
85
+ const rrfScore = 1 / (HybridSearch.RRF_K + bucket.bestRank + 1);
70
86
  scores.set(docId, (scores.get(docId) ?? 0) + rrfScore);
71
- if (vecId !== docId) {
72
- const existing = bestChunkByDoc.get(docId);
73
- if (!existing || i < existing.rank) {
74
- bestChunkByDoc.set(docId, { chunkId: vecId, rank: i });
75
- }
87
+ // Track best chunk for later enrichment
88
+ const existing = bestChunkByDoc.get(docId);
89
+ if (!existing || bucket.bestRank < existing.rank) {
90
+ bestChunkByDoc.set(docId, { chunkId: bucket.bestChunkId, rank: bucket.bestRank });
76
91
  }
77
92
  }
78
93
  // Build a lookup of FTS results by id for quick access
@@ -80,12 +95,20 @@ export class HybridSearch {
80
95
  for (const r of ftsResults) {
81
96
  ftsById.set(r.id, r);
82
97
  }
83
- // Assemble combined results
98
+ // Assemble combined results. For vector-hit docs, replace the snippet
99
+ // with the winning chunk's content (truncated). FTS-only hits keep the
100
+ // FTS snippet.
84
101
  const combined = [];
85
102
  for (const [id, rrfScore] of scores) {
86
103
  const ftsHit = ftsById.get(id);
104
+ const bucket = buckets.get(id);
87
105
  if (ftsHit) {
88
- combined.push({ ...ftsHit, score: rrfScore });
106
+ // FTS hit (possibly also a vector hit): prefer the chunk snippet when
107
+ // the vector side contributed real chunk content.
108
+ const snippet = bucket && bucket.bestContent
109
+ ? bucket.bestContent.slice(0, SNIPPET_MAX_CHARS)
110
+ : ftsHit.snippet;
111
+ combined.push({ ...ftsHit, score: rrfScore, snippet });
89
112
  }
90
113
  else {
91
114
  // Vector-only result: fetch document metadata from db
@@ -93,6 +116,9 @@ export class HybridSearch {
93
116
  // Skip stub documents — they have no real content or path
94
117
  if (!doc || doc.isStub)
95
118
  continue;
119
+ const snippet = bucket
120
+ ? bucket.bestContent.slice(0, SNIPPET_MAX_CHARS)
121
+ : "";
96
122
  combined.push({
97
123
  id: doc.id,
98
124
  path: doc.path,
@@ -101,7 +127,7 @@ export class HybridSearch {
101
127
  status: doc.status,
102
128
  date: doc.date,
103
129
  score: rrfScore,
104
- snippet: "",
130
+ snippet,
105
131
  });
106
132
  }
107
133
  }
@@ -1 +1 @@
1
- {"version":3,"file":"hybrid-search.js","sourceRoot":"","sources":["../src/hybrid-search.ts"],"names":[],"mappings":"AAgBA,MAAM,OAAO,YAAY;IAIJ;IACA;IACA;IACA;IANX,MAAM,CAAU,KAAK,GAAG,EAAE,CAAC;IAEnC,YACmB,EAAe,EACf,GAAc,EACd,GAAiB,EACjB,OAAgB;QAHhB,OAAE,GAAF,EAAE,CAAa;QACf,QAAG,GAAH,GAAG,CAAW;QACd,QAAG,GAAH,GAAG,CAAc;QACjB,YAAO,GAAP,OAAO,CAAS;IAChC,CAAC;IAEJ;;;OAGG;IACK,iBAAiB;QACvB,MAAM,GAAG,GAAG,IAAI,CAAC,EAAE,CAAC,EAAE;aACnB,OAAO,CACN,qEAAqE,CACtE;aACA,GAAG,EAAE,CAAC;QACT,OAAO,GAAG,KAAK,SAAS,CAAC;IAC3B,CAAC;IAED;;;;OAIG;IACK,cAAc,CAAC,KAAa;QAClC,MAAM,MAAM,GAAG,KAAK,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;QACvC,IAAI,MAAM,KAAK,CAAC,CAAC;YAAE,OAAO,KAAK,CAAC;QAChC,MAAM,MAAM,GAAG,KAAK,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QACvC,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC;YAAE,OAAO,KAAK,CAAC;QAC/D,OAAO,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,MAAM,CAAC,CAAC;IAChC,CAAC;IAEO,UAAU,CAAC,OAAe;QAChC,IAAI,CAAC,IAAI,CAAC,iBAAiB,EAAE;YAAE,OAAO,SAAS,CAAC;QAChD,OAAO,IAAI,CAAC,EAAE,CAAC,EAAE;aACd,OAAO,CACN;kCAC0B,CAC3B;aACA,GAAG,CAAC,OAAO,CAAyB,CAAC;IAC1C,CAAC;IAED,KAAK,CAAC,MAAM,CACV,KAAa,EACb,UAAyB,EAAE;QAE3B,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,iBAAiB,GAAG,KAAK,EAAE,KAAK,GAAG,EAAE,EAAE,UAAU,EAAE,GAAG,OAAO,CAAC;QAElF,0EAA0E;QAC1E,gCAAgC;QAChC,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,KAAK,EAAE;YACxC,iBAAiB,EAAE,IAAI;YACvB,KAAK,EAAE,KAAK,GAAG,CAAC;YAChB,UAAU;SACX,CAAC,CAAC;QAEH,MAAM,cAAc,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;QACjD,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,cAAc,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC;QAE9D,wEAAwE;QACxE,mEAAmE;QACnE,wEAAwE;QACxE,MAAM,MAAM,GAAG,IAAI,GAAG,EAAkB,CAAC;QACzC,MAAM,cAAc,GAAG,IAAI,GAAG,EAA6C,CAAC;QAE5E,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC3C,MAAM,EAAE,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YAC5B,MAAM,QAAQ,GAAG,CAAC,GAAG,CAAC,YAAY,CAAC,KAAK,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC;YAClD,MAAM,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,CAAC,GAAG,QAAQ,CAAC,CAAC;QACnD,CAAC;QAED,
KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC3C,MAAM,KAAK,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YAC/B,MAAM,KAAK,GAAG,IAAI,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC;YACzC,MAAM,QAAQ,GAAG,CAAC,GAAG,CAAC,YAAY,CAAC,KAAK,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC;YAClD,MAAM,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,QAAQ,CAAC,CAAC;YACvD,IAAI,KAAK,KAAK,KAAK,EAAE,CAAC;gBACpB,MAAM,QAAQ,GAAG,cAAc,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;gBAC3C,IAAI,CAAC,QAAQ,IAAI,CAAC,GAAG,QAAQ,CAAC,IAAI,EAAE,CAAC;oBACnC,cAAc,CAAC,GAAG,CAAC,KAAK,EAAE,EAAE,OAAO,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,EAAE,CAAC,CAAC;gBACzD,CAAC;YACH,CAAC;QACH,CAAC;QAED,uDAAuD;QACvD,MAAM,OAAO,GAAG,IAAI,GAAG,EAAwB,CAAC;QAChD,KAAK,MAAM,CAAC,IAAI,UAAU,EAAE,CAAC;YAC3B,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC;QACvB,CAAC;QAED,4BAA4B;QAC5B,MAAM,QAAQ,GAAmB,EAAE,CAAC;QAEpC,KAAK,MAAM,CAAC,EAAE,EAAE,QAAQ,CAAC,IAAI,MAAM,EAAE,CAAC;YACpC,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YAC/B,IAAI,MAAM,EAAE,CAAC;gBACX,QAAQ,CAAC,IAAI,CAAC,EAAE,GAAG,MAAM,EAAE,KAAK,EAAE,QAAQ,EAAE,CAAC,CAAC;YAChD,CAAC;iBAAM,CAAC;gBACN,sDAAsD;gBACtD,MAAM,GAAG,GAAG,IAAI,CAAC,EAAE,CAAC,WAAW,CAAC,EAAE,CAAC,CAAC;gBACpC,0DAA0D;gBAC1D,IAAI,CAAC,GAAG,IAAI,GAAG,CAAC,MAAM;oBAAE,SAAS;gBACjC,QAAQ,CAAC,IAAI,CAAC;oBACZ,EAAE,EAAE,GAAG,CAAC,EAAE;oBACV,IAAI,EAAE,GAAG,CAAC,IAAc;oBACxB,KAAK,EAAE,GAAG,CAAC,KAAK;oBAChB,IAAI,EAAE,GAAG,CAAC,IAAI;oBACd,MAAM,EAAE,GAAG,CAAC,MAAM;oBAClB,IAAI,EAAE,GAAG,CAAC,IAAI;oBACd,KAAK,EAAE,QAAQ;oBACf,OAAO,EAAE,EAAE;iBACZ,CAAC,CAAC;YACL,CAAC;QACH,CAAC;QAED,+BAA+B;QAC/B,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;QAE3C,0BAA0B;QAC1B,IAAI,QAAQ,GAAG,QAAQ,CAAC;QACxB,IAAI,CAAC,iBAAiB,EAAE,CAAC;YACvB,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,YAAY,CAAC,CAAC;QAC/D,CAAC;QAED,oBAAoB;QACpB,IAAI,IAAI,EAAE,CAAC;YACT,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,IAAI,CAAC,CAAC;QACrD,CAAC;QAE
D,oBAAoB;QACpB,IAAI,IAAI,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC5B,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,IAAI,CAAC,CAAC;YAC7B,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE;gBAC/B,MAAM,OAAO,GAAG,IAAI,CAAC,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;gBACtC,OAAO,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;YAC5C,CAAC,CAAC,CAAC;QACL,CAAC;QAED,sEAAsE;QACtE,uEAAuE;QACvE,sDAAsD;QACtD,IAAI,UAAU,IAAI,UAAU,KAAK,KAAK,EAAE,CAAC;YACvC,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE;gBAC/B,MAAM,IAAI,GAAG,IAAI,CAAC,EAAE,CAAC,aAAa,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;gBACzC,gDAAgD;gBAChD,OAAO,CAAC,IAAI,IAAI,KAAK,CAAC,KAAK,UAAU,CAAC;YACxC,CAAC,CAAC,CAAC;QACL,CAAC;QAED,oEAAoE;QACpE,kBAAkB;QAClB,KAAK,MAAM,CAAC,IAAI,QAAQ,EAAE,CAAC;YACzB,MAAM,IAAI,GAAG,cAAc,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;YACtC,IAAI,CAAC,IAAI;gBAAE,SAAS;YACpB,MAAM,KAAK,GAAG,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;YAC5C,IAAI,CAAC,KAAK;gBAAE,SAAS;YACrB,CAAC,CAAC,WAAW,GAAG,KAAK,CAAC,EAAE,CAAC;YACzB,CAAC,CAAC,UAAU,GAAG,KAAK,CAAC,WAAW,CAAC;YACjC,CAAC,CAAC,SAAS,GAAG,KAAK,CAAC,UAAU,CAAC;YAC/B,CAAC,CAAC,OAAO,GAAG,KAAK,CAAC,QAAQ,CAAC;YAC3B,CAAC,CAAC,aAAa,GAAG,KAAK,CAAC,cAAc,CAAC;QACzC,CAAC;QAED,OAAO,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC;IAClC,CAAC"}
1
+ {"version":3,"file":"hybrid-search.js","sourceRoot":"","sources":["../src/hybrid-search.ts"],"names":[],"mappings":"AAgBA;;;GAGG;AACH,MAAM,iBAAiB,GAAG,GAAG,CAAC;AAa9B,MAAM,OAAO,YAAY;IAIJ;IACA;IACA;IACA;IANX,MAAM,CAAU,KAAK,GAAG,EAAE,CAAC;IAEnC,YACmB,EAAe,EACf,GAAc,EACd,GAAiB,EACjB,OAAgB;QAHhB,OAAE,GAAF,EAAE,CAAa;QACf,QAAG,GAAH,GAAG,CAAW;QACd,QAAG,GAAH,GAAG,CAAc;QACjB,YAAO,GAAP,OAAO,CAAS;IAChC,CAAC;IAEJ;;;OAGG;IACK,iBAAiB;QACvB,MAAM,GAAG,GAAG,IAAI,CAAC,EAAE,CAAC,EAAE;aACnB,OAAO,CACN,qEAAqE,CACtE;aACA,GAAG,EAAE,CAAC;QACT,OAAO,GAAG,KAAK,SAAS,CAAC;IAC3B,CAAC;IAED;;;;OAIG;IACK,cAAc,CAAC,KAAa;QAClC,MAAM,MAAM,GAAG,KAAK,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;QACvC,IAAI,MAAM,KAAK,CAAC,CAAC;YAAE,OAAO,KAAK,CAAC;QAChC,MAAM,MAAM,GAAG,KAAK,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QACvC,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC;YAAE,OAAO,KAAK,CAAC;QAC/D,OAAO,KAAK,CAAC,KAAK,CAAC,CAAC,EAAE,MAAM,CAAC,CAAC;IAChC,CAAC;IAEO,UAAU,CAAC,OAAe;QAChC,IAAI,CAAC,IAAI,CAAC,iBAAiB,EAAE;YAAE,OAAO,SAAS,CAAC;QAChD,OAAO,IAAI,CAAC,EAAE,CAAC,EAAE;aACd,OAAO,CACN;kCAC0B,CAC3B;aACA,GAAG,CAAC,OAAO,CAAyB,CAAC;IAC1C,CAAC;IAED,KAAK,CAAC,MAAM,CACV,KAAa,EACb,UAAyB,EAAE;QAE3B,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,iBAAiB,GAAG,KAAK,EAAE,KAAK,GAAG,EAAE,EAAE,UAAU,EAAE,GAAG,OAAO,CAAC;QAElF,0EAA0E;QAC1E,gCAAgC;QAChC,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,KAAK,EAAE;YACxC,iBAAiB,EAAE,IAAI;YACvB,KAAK,EAAE,KAAK,GAAG,CAAC;YAChB,UAAU;SACX,CAAC,CAAC;QAEH,MAAM,cAAc,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;QACjD,MAAM,UAAU,GAAmB,IAAI,CAAC,GAAG,CAAC,MAAM,CAChD,cAAc,EACd,KAAK,GAAG,CAAC,CACV,CAAC;QAEF,0EAA0E;QAC1E,mEAAmE;QACnE,mEAAmE;QACnE,MAAM,OAAO,GAAG,IAAI,GAAG,EAAqB,CAAC;QAC7C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC3C,MAAM,GAAG,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;YAC1B,MAAM,KAAK,GAAG,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YAC1C,IAAI,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC;gBAAE,SAAS,CAAC,sCAAsC;YACxE,OAAO,CAAC,GAAG,CAAC,KAAK,EAAE;gBACjB,QAAQ,EAAE,CAAC;gBACX,WAAW,EAAE,GAAG,
CAAC,EAAE;gBACnB,WAAW,EAAE,GAAG,CAAC,OAAO,IAAI,EAAE;aAC/B,CAAC,CAAC;QACL,CAAC;QAED,wEAAwE;QACxE,MAAM,MAAM,GAAG,IAAI,GAAG,EAAkB,CAAC;QACzC,MAAM,cAAc,GAAG,IAAI,GAAG,EAA6C,CAAC;QAE5E,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC3C,MAAM,EAAE,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YAC5B,MAAM,QAAQ,GAAG,CAAC,GAAG,CAAC,YAAY,CAAC,KAAK,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC;YAClD,MAAM,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,CAAC,GAAG,QAAQ,CAAC,CAAC;QACnD,CAAC;QAED,KAAK,MAAM,CAAC,KAAK,EAAE,MAAM,CAAC,IAAI,OAAO,EAAE,CAAC;YACtC,MAAM,QAAQ,GAAG,CAAC,GAAG,CAAC,YAAY,CAAC,KAAK,GAAG,MAAM,CAAC,QAAQ,GAAG,CAAC,CAAC,CAAC;YAChE,MAAM,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,GAAG,QAAQ,CAAC,CAAC;YACvD,wCAAwC;YACxC,MAAM,QAAQ,GAAG,cAAc,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;YAC3C,IAAI,CAAC,QAAQ,IAAI,MAAM,CAAC,QAAQ,GAAG,QAAQ,CAAC,IAAI,EAAE,CAAC;gBACjD,cAAc,CAAC,GAAG,CAAC,KAAK,EAAE,EAAE,OAAO,EAAE,MAAM,CAAC,WAAW,EAAE,IAAI,EAAE,MAAM,CAAC,QAAQ,EAAE,CAAC,CAAC;YACpF,CAAC;QACH,CAAC;QAED,uDAAuD;QACvD,MAAM,OAAO,GAAG,IAAI,GAAG,EAAwB,CAAC;QAChD,KAAK,MAAM,CAAC,IAAI,UAAU,EAAE,CAAC;YAC3B,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC;QACvB,CAAC;QAED,sEAAsE;QACtE,uEAAuE;QACvE,eAAe;QACf,MAAM,QAAQ,GAAmB,EAAE,CAAC;QAEpC,KAAK,MAAM,CAAC,EAAE,EAAE,QAAQ,CAAC,IAAI,MAAM,EAAE,CAAC;YACpC,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YAC/B,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YAC/B,IAAI,MAAM,EAAE,CAAC;gBACX,sEAAsE;gBACtE,kDAAkD;gBAClD,MAAM,OAAO,GACX,MAAM,IAAI,MAAM,CAAC,WAAW;oBAC1B,CAAC,CAAC,MAAM,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC,EAAE,iBAAiB,CAAC;oBAChD,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC;gBACrB,QAAQ,CAAC,IAAI,CAAC,EAAE,GAAG,MAAM,EAAE,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAE,CAAC,CAAC;YACzD,CAAC;iBAAM,CAAC;gBACN,sDAAsD;gBACtD,MAAM,GAAG,GAAG,IAAI,CAAC,EAAE,CAAC,WAAW,CAAC,EAAE,CAAC,CAAC;gBACpC,0DAA0D;gBAC1D,IAAI,CAAC,GAAG,IAAI,GAAG,CAAC,MAAM;oBAAE,SAAS;gBACjC,MAAM,OAAO,GAAG,MAAM;oBACpB,CAAC,CAAC,MAAM,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC,EAAE,iBA
AiB,CAAC;oBAChD,CAAC,CAAC,EAAE,CAAC;gBACP,QAAQ,CAAC,IAAI,CAAC;oBACZ,EAAE,EAAE,GAAG,CAAC,EAAE;oBACV,IAAI,EAAE,GAAG,CAAC,IAAc;oBACxB,KAAK,EAAE,GAAG,CAAC,KAAK;oBAChB,IAAI,EAAE,GAAG,CAAC,IAAI;oBACd,MAAM,EAAE,GAAG,CAAC,MAAM;oBAClB,IAAI,EAAE,GAAG,CAAC,IAAI;oBACd,KAAK,EAAE,QAAQ;oBACf,OAAO;iBACR,CAAC,CAAC;YACL,CAAC;QACH,CAAC;QAED,+BAA+B;QAC/B,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;QAE3C,0BAA0B;QAC1B,IAAI,QAAQ,GAAG,QAAQ,CAAC;QACxB,IAAI,CAAC,iBAAiB,EAAE,CAAC;YACvB,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,KAAK,YAAY,CAAC,CAAC;QAC/D,CAAC;QAED,oBAAoB;QACpB,IAAI,IAAI,EAAE,CAAC;YACT,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,IAAI,CAAC,CAAC;QACrD,CAAC;QAED,oBAAoB;QACpB,IAAI,IAAI,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC5B,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,IAAI,CAAC,CAAC;YAC7B,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE;gBAC/B,MAAM,OAAO,GAAG,IAAI,CAAC,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;gBACtC,OAAO,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;YAC5C,CAAC,CAAC,CAAC;QACL,CAAC;QAED,sEAAsE;QACtE,uEAAuE;QACvE,sDAAsD;QACtD,IAAI,UAAU,IAAI,UAAU,KAAK,KAAK,EAAE,CAAC;YACvC,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE;gBAC/B,MAAM,IAAI,GAAG,IAAI,CAAC,EAAE,CAAC,aAAa,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;gBACzC,gDAAgD;gBAChD,OAAO,CAAC,IAAI,IAAI,KAAK,CAAC,KAAK,UAAU,CAAC;YACxC,CAAC,CAAC,CAAC;QACL,CAAC;QAED,oEAAoE;QACpE,kBAAkB;QAClB,KAAK,MAAM,CAAC,IAAI,QAAQ,EAAE,CAAC;YACzB,MAAM,IAAI,GAAG,cAAc,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;YACtC,IAAI,CAAC,IAAI;gBAAE,SAAS;YACpB,MAAM,KAAK,GAAG,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;YAC5C,IAAI,CAAC,KAAK;gBAAE,SAAS;YACrB,CAAC,CAAC,WAAW,GAAG,KAAK,CAAC,EAAE,CAAC;YACzB,CAAC,CAAC,UAAU,GAAG,KAAK,CAAC,WAAW,CAAC;YACjC,CAAC,CAAC,SAAS,GAAG,KAAK,CAAC,UAAU,CAAC;YAC/B,CAAC,CAAC,OAAO,GAAG,KAAK,CAAC,QAAQ,CAAC;YAC3B,CAAC,CAAC,aAAa,GAAG,KAAK,CAAC,cAAc,CAAC;QACzC,CAAC;QAED,OAAO,QAAQ,CAAC,KAAK,CAAC,CAAC,EAA
E,KAAK,CAAC,CAAC;IAClC,CAAC"}
@@ -2,6 +2,12 @@ import type { KnowledgeDB } from "./db.js";
2
2
  export interface VectorResult {
3
3
  id: string;
4
4
  distance: number;
5
+ /**
6
+ * Chunk content populated via LEFT JOIN to `chunks` table when the vec id
7
+ * matches a chunk row. When the vec id is doc-level (back-compat / legacy
8
+ * fixtures) or no matching chunks row exists, this is `null`.
9
+ */
10
+ content?: string | null;
5
11
  }
6
12
  export declare class VectorSearch {
7
13
  private knowledgeDb;
@@ -60,10 +60,14 @@ export class VectorSearch {
60
60
  search(queryEmbedding, limit = 10) {
61
61
  this.ensureVecLoaded();
62
62
  const buf = float32ToBuffer(queryEmbedding);
63
+ // LEFT JOIN to `chunks` so chunk-level vec rows surface their content.
64
+ // Doc-level vec ids (no matching chunks row) return content = NULL, which
65
+ // preserves back-compat for pre-chunks callers and legacy test fixtures.
63
66
  return this.knowledgeDb.db
64
67
  .prepare(`
65
- SELECT id, distance
68
+ SELECT documents_vec.id, distance, chunks.content
66
69
  FROM documents_vec
70
+ LEFT JOIN chunks ON chunks.id = documents_vec.id
67
71
  WHERE embedding MATCH ? AND k = ?
68
72
  ORDER BY distance
69
73
  `)
@@ -1 +1 @@
1
- {"version":3,"file":"vector-search.js","sourceRoot":"","sources":["../src/vector-search.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,SAAS,MAAM,YAAY,CAAC;AAQxC,SAAS,eAAe,CAAC,GAAiB;IACxC,OAAO,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,EAAE,GAAG,CAAC,UAAU,EAAE,GAAG,CAAC,UAAU,CAAC,CAAC;AACjE,CAAC;AAED,MAAM,OAAO,YAAY;IAGH;IAFZ,SAAS,GAAG,KAAK,CAAC;IAE1B,YAAoB,WAAwB;QAAxB,gBAAW,GAAX,WAAW,CAAa;IAAG,CAAC;IAExC,eAAe;QACrB,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,CAAC;YACpB,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,WAAW,CAAC,EAAE,CAAC,CAAC;YACpC,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC;QACxB,CAAC;IACH,CAAC;IAED,WAAW;QACT,IAAI,CAAC,eAAe,EAAE,CAAC;QACvB,IAAI,CAAC,WAAW,CAAC,EAAE,CAAC,IAAI,CAAC;;;;;KAKxB,CAAC,CAAC;IACL,CAAC;IAED,SAAS;QACP,IAAI,CAAC,WAAW,CAAC,EAAE,CAAC,IAAI,CAAC,oCAAoC,CAAC,CAAC;IACjE,CAAC;IAED,eAAe,CAAC,EAAU,EAAE,SAAuB;QACjD,IAAI,CAAC,eAAe,EAAE,CAAC;QACvB,MAAM,GAAG,GAAG,eAAe,CAAC,SAAS,CAAC,CAAC;QACvC,IAAI,CAAC,WAAW,CAAC,EAAE;aAChB,OAAO,CAAC,wCAAwC,CAAC;aACjD,GAAG,CAAC,EAAE,CAAC,CAAC;QACX,IAAI,CAAC,WAAW,CAAC,EAAE;aAChB,OAAO,CAAC,yDAAyD,CAAC;aAClE,GAAG,CAAC,EAAE,EAAE,GAAG,CAAC,CAAC;IAClB,CAAC;IAED,eAAe,CAAC,EAAU;QACxB,IAAI,CAAC,eAAe,EAAE,CAAC;QACvB,IAAI,CAAC,WAAW,CAAC,EAAE;aAChB,OAAO,CAAC,wCAAwC,CAAC;aACjD,GAAG,CAAC,EAAE,CAAC,CAAC;IACb,CAAC;IAED;;;;;;;;OAQG;IACH,oBAAoB,CAAC,KAAa;QAChC,IAAI,CAAC,eAAe,EAAE,CAAC;QACvB,IAAI,CAAC,WAAW,CAAC,EAAE;aAChB,OAAO,CAAC,2CAA2C,CAAC;aACpD,GAAG,CAAC,GAAG,KAAK,KAAK,CAAC,CAAC;IACxB,CAAC;IAED,MAAM,CAAC,cAA4B,EAAE,QAAgB,EAAE;QACrD,IAAI,CAAC,eAAe,EAAE,CAAC;QACvB,MAAM,GAAG,GAAG,eAAe,CAAC,cAAc,CAAC,CAAC;QAC5C,OAAO,IAAI,CAAC,WAAW,CAAC,EAAE;aACvB,OAAO,CACN;;;;;KAKH,CACE;aACA,GAAG,CAAC,GAAG,EAAE,KAAK,CAAmB,CAAC;IACvC,CAAC;CACF"}
1
+ {"version":3,"file":"vector-search.js","sourceRoot":"","sources":["../src/vector-search.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,SAAS,MAAM,YAAY,CAAC;AAcxC,SAAS,eAAe,CAAC,GAAiB;IACxC,OAAO,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,EAAE,GAAG,CAAC,UAAU,EAAE,GAAG,CAAC,UAAU,CAAC,CAAC;AACjE,CAAC;AAED,MAAM,OAAO,YAAY;IAGH;IAFZ,SAAS,GAAG,KAAK,CAAC;IAE1B,YAAoB,WAAwB;QAAxB,gBAAW,GAAX,WAAW,CAAa;IAAG,CAAC;IAExC,eAAe;QACrB,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,CAAC;YACpB,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,WAAW,CAAC,EAAE,CAAC,CAAC;YACpC,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC;QACxB,CAAC;IACH,CAAC;IAED,WAAW;QACT,IAAI,CAAC,eAAe,EAAE,CAAC;QACvB,IAAI,CAAC,WAAW,CAAC,EAAE,CAAC,IAAI,CAAC;;;;;KAKxB,CAAC,CAAC;IACL,CAAC;IAED,SAAS;QACP,IAAI,CAAC,WAAW,CAAC,EAAE,CAAC,IAAI,CAAC,oCAAoC,CAAC,CAAC;IACjE,CAAC;IAED,eAAe,CAAC,EAAU,EAAE,SAAuB;QACjD,IAAI,CAAC,eAAe,EAAE,CAAC;QACvB,MAAM,GAAG,GAAG,eAAe,CAAC,SAAS,CAAC,CAAC;QACvC,IAAI,CAAC,WAAW,CAAC,EAAE;aAChB,OAAO,CAAC,wCAAwC,CAAC;aACjD,GAAG,CAAC,EAAE,CAAC,CAAC;QACX,IAAI,CAAC,WAAW,CAAC,EAAE;aAChB,OAAO,CAAC,yDAAyD,CAAC;aAClE,GAAG,CAAC,EAAE,EAAE,GAAG,CAAC,CAAC;IAClB,CAAC;IAED,eAAe,CAAC,EAAU;QACxB,IAAI,CAAC,eAAe,EAAE,CAAC;QACvB,IAAI,CAAC,WAAW,CAAC,EAAE;aAChB,OAAO,CAAC,wCAAwC,CAAC;aACjD,GAAG,CAAC,EAAE,CAAC,CAAC;IACb,CAAC;IAED;;;;;;;;OAQG;IACH,oBAAoB,CAAC,KAAa;QAChC,IAAI,CAAC,eAAe,EAAE,CAAC;QACvB,IAAI,CAAC,WAAW,CAAC,EAAE;aAChB,OAAO,CAAC,2CAA2C,CAAC;aACpD,GAAG,CAAC,GAAG,KAAK,KAAK,CAAC,CAAC;IACxB,CAAC;IAED,MAAM,CAAC,cAA4B,EAAE,QAAgB,EAAE;QACrD,IAAI,CAAC,eAAe,EAAE,CAAC;QACvB,MAAM,GAAG,GAAG,eAAe,CAAC,cAAc,CAAC,CAAC;QAC5C,uEAAuE;QACvE,0EAA0E;QAC1E,yEAAyE;QACzE,OAAO,IAAI,CAAC,WAAW,CAAC,EAAE;aACvB,OAAO,CACN;;;;;;KAMH,CACE;aACA,GAAG,CAAC,GAAG,EAAE,KAAK,CAAmB,CAAC;IACvC,CAAC;CACF"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "ralph-hero-knowledge-index",
3
- "version": "0.1.24",
3
+ "version": "0.1.25",
4
4
  "type": "module",
5
5
  "main": "dist/index.js",
6
6
  "bin": {
@@ -217,3 +217,230 @@ describe("HybridSearch chunk metadata enrichment", () => {
217
217
  expect(hit!.chunkIndex).toBeUndefined();
218
218
  });
219
219
  });
220
+
221
+ describe("HybridSearch chunk-to-doc dedup", () => {
222
+ let dedupDb: KnowledgeDB;
223
+ let dedupFts: FtsSearch;
224
+ let dedupVec: VectorSearch;
225
+ let dedupHybrid: HybridSearch;
226
+
227
+ /**
228
+ * Deterministic embed function for dedup tests: always returns the same
229
+ * vector as mockEmbedding(42). Paired with chunk embeddings that use the
230
+ * same seed so the chunks rank near-perfectly for any query.
231
+ */
232
+ const fixedEmbedFn: EmbedFn = async () => mockEmbedding(42);
233
+
234
+ /** Insert a chunk row into the chunks table. */
235
+ function insertChunk(
236
+ db: KnowledgeDB,
237
+ chunkId: string,
238
+ docId: string,
239
+ index: number,
240
+ content: string,
241
+ ): void {
242
+ db.db
243
+ .prepare(
244
+ `INSERT INTO chunks (id, document_id, chunk_index, content, char_start, char_end)
245
+ VALUES (?, ?, ?, ?, ?, ?)`
246
+ )
247
+ .run(chunkId, docId, index, content, 0, content.length);
248
+ }
249
+
250
+ beforeEach(() => {
251
+ dedupDb = new KnowledgeDB(":memory:");
252
+
253
+ dedupDb.upsertDocument({
254
+ id: "chunk-doc",
255
+ path: "thoughts/shared/research/chunking-strategies.md",
256
+ title: "Chunking Strategies Deep Dive",
257
+ date: "2026-03-01",
258
+ type: "research",
259
+ status: "draft",
260
+ githubIssue: null,
261
+ content:
262
+ "Header paragraph not a chunk match. Body discusses recursive character splitter tradeoffs.",
263
+ });
264
+
265
+ dedupDb.upsertDocument({
266
+ id: "other-doc",
267
+ path: "thoughts/shared/plans/other.md",
268
+ title: "Unrelated Plan",
269
+ date: "2026-03-02",
270
+ type: "plan",
271
+ status: "draft",
272
+ githubIssue: null,
273
+ content: "This is a completely different topic unrelated to the query.",
274
+ });
275
+
276
+ ensureV3Schema(dedupDb);
277
+
278
+ dedupFts = new FtsSearch(dedupDb);
279
+ dedupFts.rebuildIndex();
280
+
281
+ dedupVec = new VectorSearch(dedupDb);
282
+ dedupVec.createIndex();
283
+
284
+ // Five chunks from chunk-doc, seeded almost identically so they rank as
285
+ // the top-5 vector hits for fixedEmbedFn. Distinct content per chunk so
286
+ // we can verify which one becomes the snippet.
287
+ for (let i = 0; i < 5; i++) {
288
+ const id = `chunk-doc#c${i}`;
289
+ const content = `Chunk ${i} content about recursive character splitter tradeoffs — paragraph ${i}.`;
290
+ insertChunk(dedupDb, id, "chunk-doc", i, content);
291
+ // Slight seed variation so distance differs per chunk; chunk 0 is best.
292
+ dedupVec.upsertEmbedding(id, mockEmbedding(42 + i * 0.0001));
293
+ }
294
+
295
+ // other-doc has a single chunk embedded with a very different seed so
296
+ // it ranks well below chunk-doc's chunks.
297
+ insertChunk(
298
+ dedupDb,
299
+ "other-doc#c0",
300
+ "other-doc",
301
+ 0,
302
+ "Unrelated single chunk content.",
303
+ );
304
+ dedupVec.upsertEmbedding("other-doc#c0", mockEmbedding(900));
305
+
306
+ dedupHybrid = new HybridSearch(
307
+ dedupDb,
308
+ dedupFts,
309
+ dedupVec,
310
+ fixedEmbedFn,
311
+ );
312
+ });
313
+
314
+ it("deduplicates: 5 chunks from same doc yield exactly 1 result entry", async () => {
315
+ const results = await dedupHybrid.search("anything");
316
+
317
+ const chunkDocHits = results.filter((r) => r.id === "chunk-doc");
318
+ expect(chunkDocHits).toHaveLength(1);
319
+ // Also ensure no chunk-level id leaks into the results
320
+ const chunkIds = results.filter((r) => r.id.includes("#c"));
321
+ expect(chunkIds).toHaveLength(0);
322
+ });
323
+
324
+ it("surfaced entry's snippet comes from the highest-ranked chunk", async () => {
325
+ const results = await dedupHybrid.search("anything");
326
+
327
+ const chunkDocHit = results.find((r) => r.id === "chunk-doc");
328
+ expect(chunkDocHit).toBeDefined();
329
+ // Chunk 0 has the smallest seed offset (mockEmbedding(42 + 0)) so it
330
+ // should have the smallest distance to fixedEmbedFn = mockEmbedding(42).
331
+ expect(chunkDocHit!.snippet).toContain("Chunk 0");
332
+ });
333
+
334
+ it("snippet length is at most 300 characters", async () => {
335
+ // Add a chunk with very long content to chunk-doc and re-embed so it
336
+ // becomes chunk 0's rival.
337
+ const longContent = "X".repeat(5000);
338
+ dedupDb.db
339
+ .prepare(
340
+ `INSERT INTO chunks (id, document_id, chunk_index, content, char_start, char_end)
341
+ VALUES (?, ?, ?, ?, ?, ?)`
342
+ )
343
+ .run("chunk-doc#c99", "chunk-doc", 99, longContent, 0, longContent.length);
344
+ // Embed with seed exactly 42 (distance 0). chunk-doc#c0 uses the same seed, so the two tie for best hit.
345
+ dedupVec.upsertEmbedding("chunk-doc#c99", mockEmbedding(42));
346
+
347
+ const results = await dedupHybrid.search("anything");
348
+ const hit = results.find((r) => r.id === "chunk-doc");
349
+ expect(hit).toBeDefined();
350
+ expect(hit!.snippet.length).toBeLessThanOrEqual(300);
351
+ });
352
+
353
+ it("title-only FTS match still returns the doc (no regression on legacy doc-level hits)", async () => {
354
+ // Document with a doc-level vec row (no chunks) — simulates a legacy
355
+ // record that predates the chunks table.
356
+ dedupDb.upsertDocument({
357
+ id: "legacy-doc",
358
+ path: "thoughts/legacy.md",
359
+ title: "Legacy Title Only Matching Query",
360
+ date: "2026-02-01",
361
+ type: "research",
362
+ status: "draft",
363
+ githubIssue: null,
364
+ content: "Legacy body text",
365
+ });
366
+ dedupFts.rebuildIndex();
367
+ dedupVec.upsertEmbedding("legacy-doc", mockEmbedding(800));
368
+
369
+ const results = await dedupHybrid.search("Legacy");
370
+ const legacyHit = results.find((r) => r.id === "legacy-doc");
371
+ expect(legacyHit).toBeDefined();
372
+ // FTS contributed the snippet (no chunk content to override it).
373
+ expect(legacyHit!.snippet).toBeDefined();
374
+ });
375
+
376
+ it("RRF score: bucketed rank 0 + FTS rank 2 equals 1/(60+1) + 1/(60+3)", async () => {
377
+ // Force a known configuration in a fresh, isolated fixture:
378
+ // - target is the #1 vector hit (bucketed rank 0)
379
+ // - target is the #3 FTS hit (index 2)
380
+ // We arrange this by inserting three docs that match "match" in FTS,
381
+ // ordered by BM25 so target ends up at rank 2.
382
+ //
383
+ // The fixture uses its own in-memory KnowledgeDB rather than the shared dedupDb.
384
+ const tdb = new KnowledgeDB(":memory:");
385
+
386
+ tdb.upsertDocument({
387
+ id: "d-fts-top",
388
+ path: "a.md",
389
+ title: "match match match match",
390
+ date: "2026-01-01",
391
+ type: "research",
392
+ status: "draft",
393
+ githubIssue: null,
394
+ content: "match match match match match",
395
+ });
396
+ tdb.upsertDocument({
397
+ id: "d-fts-second",
398
+ path: "b.md",
399
+ title: "match match match",
400
+ date: "2026-01-02",
401
+ type: "research",
402
+ status: "draft",
403
+ githubIssue: null,
404
+ content: "match match match",
405
+ });
406
+ tdb.upsertDocument({
407
+ id: "target",
408
+ path: "c.md",
409
+ title: "Target Doc",
410
+ date: "2026-01-03",
411
+ type: "research",
412
+ status: "draft",
413
+ githubIssue: null,
414
+ content: "target doc with match keyword once",
415
+ });
416
+
417
+ const tfts = new FtsSearch(tdb);
418
+ tfts.rebuildIndex();
419
+
420
+ const tvec = new VectorSearch(tdb);
421
+ tvec.createIndex();
422
+
423
+ // One chunk for target, seeded exactly 42 so it's the only/best vec hit.
424
+ insertChunk(tdb, "target#c0", "target", 0, "Chunk content for target.");
425
+ tvec.upsertEmbedding("target#c0", mockEmbedding(42));
426
+ // Add far-away embeddings for the other docs so they don't contribute
427
+ // to the vector bucket's top ranks for target.
428
+ tvec.upsertEmbedding("d-fts-top", mockEmbedding(900));
429
+ tvec.upsertEmbedding("d-fts-second", mockEmbedding(901));
430
+
431
+ const thybrid = new HybridSearch(tdb, tfts, tvec, fixedEmbedFn);
432
+ const results = await thybrid.search("match");
433
+
434
+ const target = results.find((r) => r.id === "target");
435
+ expect(target).toBeDefined();
436
+
437
+ // Verify FTS rank of target is 2 (third position) by fetching raw FTS.
438
+ const ftsRaw = tfts.search("match", { includeSuperseded: true, limit: 40 });
439
+ const ftsRankOfTarget = ftsRaw.findIndex((r) => r.id === "target");
440
+ expect(ftsRankOfTarget).toBe(2);
441
+
442
+ const K = 60;
443
+ const expected = 1 / (K + 0 + 1) + 1 / (K + 2 + 1);
444
+ expect(target!.score).toBeCloseTo(expected, 10);
445
+ });
446
+ });
@@ -63,4 +63,30 @@ describe("VectorSearch", () => {
63
63
  const results = vecSearch.search(mockEmbedding(1), 1);
64
64
  expect(results).toHaveLength(1);
65
65
  });
66
+
67
+ it("returns content = null when vec id has no matching chunks row (back-compat)", () => {
68
+ // doc-1 has no chunks row; vec id is doc-level. LEFT JOIN should yield null.
69
+ const results = vecSearch.search(mockEmbedding(1), 5);
70
+ const hit = results.find((r) => r.id === "doc-1");
71
+ expect(hit).toBeDefined();
72
+ expect(hit!.content).toBeNull();
73
+ });
74
+
75
+ it("returns content populated when vec id matches a chunks row", () => {
76
+ // Insert a chunk-level vec row + matching chunks row for doc-1
77
+ db.db
78
+ .prepare(
79
+ `INSERT INTO chunks (id, document_id, chunk_index, content, char_start, char_end)
80
+ VALUES (?, ?, ?, ?, ?, ?)`
81
+ )
82
+ .run("doc-1#c0", "doc-1", 0, "This is the first chunk content.", 0, 32);
83
+ vecSearch.upsertEmbedding("doc-1#c0", mockEmbedding(1));
84
+ // Remove the doc-level vec row so chunk-level wins
85
+ vecSearch.deleteEmbedding("doc-1");
86
+
87
+ const results = vecSearch.search(mockEmbedding(1), 5);
88
+ const hit = results.find((r) => r.id === "doc-1#c0");
89
+ expect(hit).toBeDefined();
90
+ expect(hit!.content).toBe("This is the first chunk content.");
91
+ });
66
92
  });
@@ -1,6 +1,6 @@
1
1
  import type { KnowledgeDB } from "./db.js";
2
2
  import type { FtsSearch, SearchOptions, SearchResult } from "./search.js";
3
- import type { VectorSearch } from "./vector-search.js";
3
+ import type { VectorResult, VectorSearch } from "./vector-search.js";
4
4
 
5
5
  export type EmbedFn = (text: string) => Promise<Float32Array>;
6
6
 
@@ -14,6 +14,23 @@ interface ChunkRow {
14
14
  content: string;
15
15
  }
16
16
 
17
+ /**
18
+ * Maximum snippet length (in characters) when the snippet is sourced from a
19
+ * chunk's content. Keeps the MCP payload compact while still representative.
20
+ */
21
+ const SNIPPET_MAX_CHARS = 300;
22
+
23
+ /**
24
+ * Per-document bucket tracking the best-ranked chunk for a given doc_id in
25
+ * the vector result list. The "rank" is the index of the first occurrence of
26
+ * the document in the distance-sorted vector results (smaller = better).
27
+ */
28
+ interface DocBucket {
29
+ bestRank: number;
30
+ bestChunkId: string;
31
+ bestContent: string;
32
+ }
33
+
17
34
  export class HybridSearch {
18
35
  private static readonly RRF_K = 60;
19
36
 
@@ -75,11 +92,27 @@ export class HybridSearch {
75
92
  });
76
93
 
77
94
  const queryEmbedding = await this.embedFn(query);
78
- const vecResults = this.vec.search(queryEmbedding, limit * 2);
95
+ const vecResults: VectorResult[] = this.vec.search(
96
+ queryEmbedding,
97
+ limit * 2,
98
+ );
99
+
100
+ // Bucket vector results by doc_id, keeping the best-ranked chunk per doc.
101
+ // vecResults is already sorted by distance ascending, so the first
102
+ // occurrence of a given doc_id has the smallest rank (best match).
103
+ const buckets = new Map<string, DocBucket>();
104
+ for (let i = 0; i < vecResults.length; i++) {
105
+ const hit = vecResults[i];
106
+ const docId = this.docIdFromVecId(hit.id);
107
+ if (buckets.has(docId)) continue; // Already have best rank for this doc
108
+ buckets.set(docId, {
109
+ bestRank: i,
110
+ bestChunkId: hit.id,
111
+ bestContent: hit.content ?? "",
112
+ });
113
+ }
79
114
 
80
- // Build RRF score map, keyed by document_id. When vec ids are chunk ids
81
- // like `{doc}#c{n}`, we collapse to the parent doc for scoring but
82
- // remember the best-scoring chunk id per doc for later meta enrichment.
115
+ // Build RRF score map (keyed by doc_id for both FTS and vector buckets)
83
116
  const scores = new Map<string, number>();
84
117
  const bestChunkByDoc = new Map<string, { chunkId: string; rank: number }>();
85
118
 
@@ -89,16 +122,13 @@ export class HybridSearch {
89
122
  scores.set(id, (scores.get(id) ?? 0) + rrfScore);
90
123
  }
91
124
 
92
- for (let i = 0; i < vecResults.length; i++) {
93
- const vecId = vecResults[i].id;
94
- const docId = this.docIdFromVecId(vecId);
95
- const rrfScore = 1 / (HybridSearch.RRF_K + i + 1);
125
+ for (const [docId, bucket] of buckets) {
126
+ const rrfScore = 1 / (HybridSearch.RRF_K + bucket.bestRank + 1);
96
127
  scores.set(docId, (scores.get(docId) ?? 0) + rrfScore);
97
- if (vecId !== docId) {
98
- const existing = bestChunkByDoc.get(docId);
99
- if (!existing || i < existing.rank) {
100
- bestChunkByDoc.set(docId, { chunkId: vecId, rank: i });
101
- }
128
+ // Track best chunk for later enrichment
129
+ const existing = bestChunkByDoc.get(docId);
130
+ if (!existing || bucket.bestRank < existing.rank) {
131
+ bestChunkByDoc.set(docId, { chunkId: bucket.bestChunkId, rank: bucket.bestRank });
102
132
  }
103
133
  }
104
134
 
@@ -108,18 +138,30 @@ export class HybridSearch {
108
138
  ftsById.set(r.id, r);
109
139
  }
110
140
 
111
- // Assemble combined results
141
+ // Assemble combined results. For vector-hit docs, replace the snippet
142
+ // with the winning chunk's content (truncated). FTS-only hits keep the
143
+ // FTS snippet.
112
144
  const combined: SearchResult[] = [];
113
145
 
114
146
  for (const [id, rrfScore] of scores) {
115
147
  const ftsHit = ftsById.get(id);
148
+ const bucket = buckets.get(id);
116
149
  if (ftsHit) {
117
- combined.push({ ...ftsHit, score: rrfScore });
150
+ // FTS hit (possibly also a vector hit): prefer the chunk snippet when
151
+ // the vector side contributed real chunk content.
152
+ const snippet =
153
+ bucket && bucket.bestContent
154
+ ? bucket.bestContent.slice(0, SNIPPET_MAX_CHARS)
155
+ : ftsHit.snippet;
156
+ combined.push({ ...ftsHit, score: rrfScore, snippet });
118
157
  } else {
119
158
  // Vector-only result: fetch document metadata from db
120
159
  const doc = this.db.getDocument(id);
121
160
  // Skip stub documents — they have no real content or path
122
161
  if (!doc || doc.isStub) continue;
162
+ const snippet = bucket
163
+ ? bucket.bestContent.slice(0, SNIPPET_MAX_CHARS)
164
+ : "";
123
165
  combined.push({
124
166
  id: doc.id,
125
167
  path: doc.path as string,
@@ -128,7 +170,7 @@ export class HybridSearch {
128
170
  status: doc.status,
129
171
  date: doc.date,
130
172
  score: rrfScore,
131
- snippet: "",
173
+ snippet,
132
174
  });
133
175
  }
134
176
  }
@@ -4,6 +4,12 @@ import type { KnowledgeDB } from "./db.js";
4
4
  export interface VectorResult {
5
5
  id: string;
6
6
  distance: number;
7
+ /**
8
+ * Chunk content populated via LEFT JOIN to `chunks` table when the vec id
9
+ * matches a chunk row. When the vec id is doc-level (back-compat / legacy
10
+ * fixtures) or no matching chunks row exists, this is `null`.
11
+ */
12
+ content?: string | null;
7
13
  }
8
14
 
9
15
  function float32ToBuffer(arr: Float32Array): Buffer {
@@ -73,11 +79,15 @@ export class VectorSearch {
73
79
  search(queryEmbedding: Float32Array, limit: number = 10): VectorResult[] {
74
80
  this.ensureVecLoaded();
75
81
  const buf = float32ToBuffer(queryEmbedding);
82
+ // LEFT JOIN to `chunks` so chunk-level vec rows surface their content.
83
+ // Doc-level vec ids (no matching chunks row) return content = NULL, which
84
+ // preserves back-compat for pre-chunks callers and legacy test fixtures.
76
85
  return this.knowledgeDb.db
77
86
  .prepare(
78
87
  `
79
- SELECT id, distance
88
+ SELECT documents_vec.id, distance, chunks.content
80
89
  FROM documents_vec
90
+ LEFT JOIN chunks ON chunks.id = documents_vec.id
81
91
  WHERE embedding MATCH ? AND k = ?
82
92
  ORDER BY distance
83
93
  `