@loreai/core 0.17.1 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (248) hide show
  1. package/dist/bun/agents-file.d.ts +4 -0
  2. package/dist/bun/agents-file.d.ts.map +1 -1
  3. package/dist/bun/config.d.ts +2 -0
  4. package/dist/bun/config.d.ts.map +1 -1
  5. package/dist/bun/curator.d.ts +45 -0
  6. package/dist/bun/curator.d.ts.map +1 -1
  7. package/dist/bun/data-dir.d.ts +18 -0
  8. package/dist/bun/data-dir.d.ts.map +1 -0
  9. package/dist/bun/db.d.ts +85 -0
  10. package/dist/bun/db.d.ts.map +1 -1
  11. package/dist/bun/distillation.d.ts +2 -13
  12. package/dist/bun/distillation.d.ts.map +1 -1
  13. package/dist/bun/embedding-vendor.d.ts +22 -38
  14. package/dist/bun/embedding-vendor.d.ts.map +1 -1
  15. package/dist/bun/embedding-worker-types.d.ts +17 -12
  16. package/dist/bun/embedding-worker-types.d.ts.map +1 -1
  17. package/dist/bun/embedding-worker.d.ts +9 -2
  18. package/dist/bun/embedding-worker.d.ts.map +1 -1
  19. package/dist/bun/embedding-worker.js +38864 -33
  20. package/dist/bun/embedding-worker.js.map +4 -4
  21. package/dist/bun/embedding.d.ts +35 -23
  22. package/dist/bun/embedding.d.ts.map +1 -1
  23. package/dist/bun/gradient.d.ts +17 -1
  24. package/dist/bun/gradient.d.ts.map +1 -1
  25. package/dist/bun/import/detect.d.ts +14 -0
  26. package/dist/bun/import/detect.d.ts.map +1 -0
  27. package/dist/bun/import/extract.d.ts +43 -0
  28. package/dist/bun/import/extract.d.ts.map +1 -0
  29. package/dist/bun/import/history.d.ts +40 -0
  30. package/dist/bun/import/history.d.ts.map +1 -0
  31. package/dist/bun/import/index.d.ts +17 -0
  32. package/dist/bun/import/index.d.ts.map +1 -0
  33. package/dist/bun/import/providers/aider.d.ts +2 -0
  34. package/dist/bun/import/providers/aider.d.ts.map +1 -0
  35. package/dist/bun/import/providers/claude-code.d.ts +2 -0
  36. package/dist/bun/import/providers/claude-code.d.ts.map +1 -0
  37. package/dist/bun/import/providers/cline.d.ts +2 -0
  38. package/dist/bun/import/providers/cline.d.ts.map +1 -0
  39. package/dist/bun/import/providers/codex.d.ts +2 -0
  40. package/dist/bun/import/providers/codex.d.ts.map +1 -0
  41. package/dist/bun/import/providers/continue.d.ts +2 -0
  42. package/dist/bun/import/providers/continue.d.ts.map +1 -0
  43. package/dist/bun/import/providers/index.d.ts +19 -0
  44. package/dist/bun/import/providers/index.d.ts.map +1 -0
  45. package/dist/bun/import/providers/opencode.d.ts +2 -0
  46. package/dist/bun/import/providers/opencode.d.ts.map +1 -0
  47. package/dist/bun/import/providers/pi.d.ts +2 -0
  48. package/dist/bun/import/providers/pi.d.ts.map +1 -0
  49. package/dist/bun/import/types.d.ts +82 -0
  50. package/dist/bun/import/types.d.ts.map +1 -0
  51. package/dist/bun/index.d.ts +5 -2
  52. package/dist/bun/index.d.ts.map +1 -1
  53. package/dist/bun/index.js +3150 -439
  54. package/dist/bun/index.js.map +4 -4
  55. package/dist/bun/instruction-detect.d.ts +66 -0
  56. package/dist/bun/instruction-detect.d.ts.map +1 -0
  57. package/dist/bun/log.d.ts +9 -0
  58. package/dist/bun/log.d.ts.map +1 -1
  59. package/dist/bun/ltm.d.ts +139 -5
  60. package/dist/bun/ltm.d.ts.map +1 -1
  61. package/dist/bun/pattern-extract.d.ts +7 -0
  62. package/dist/bun/pattern-extract.d.ts.map +1 -1
  63. package/dist/bun/prompt.d.ts +1 -1
  64. package/dist/bun/prompt.d.ts.map +1 -1
  65. package/dist/bun/recall.d.ts.map +1 -1
  66. package/dist/bun/search.d.ts +5 -3
  67. package/dist/bun/search.d.ts.map +1 -1
  68. package/dist/bun/session-limiter.d.ts +26 -0
  69. package/dist/bun/session-limiter.d.ts.map +1 -0
  70. package/dist/bun/temporal.d.ts +2 -0
  71. package/dist/bun/temporal.d.ts.map +1 -1
  72. package/dist/bun/types.d.ts +1 -1
  73. package/dist/node/agents-file.d.ts +4 -0
  74. package/dist/node/agents-file.d.ts.map +1 -1
  75. package/dist/node/config.d.ts +2 -0
  76. package/dist/node/config.d.ts.map +1 -1
  77. package/dist/node/curator.d.ts +45 -0
  78. package/dist/node/curator.d.ts.map +1 -1
  79. package/dist/node/data-dir.d.ts +18 -0
  80. package/dist/node/data-dir.d.ts.map +1 -0
  81. package/dist/node/db.d.ts +85 -0
  82. package/dist/node/db.d.ts.map +1 -1
  83. package/dist/node/distillation.d.ts +2 -13
  84. package/dist/node/distillation.d.ts.map +1 -1
  85. package/dist/node/embedding-vendor.d.ts +22 -38
  86. package/dist/node/embedding-vendor.d.ts.map +1 -1
  87. package/dist/node/embedding-worker-types.d.ts +17 -12
  88. package/dist/node/embedding-worker-types.d.ts.map +1 -1
  89. package/dist/node/embedding-worker.d.ts +9 -2
  90. package/dist/node/embedding-worker.d.ts.map +1 -1
  91. package/dist/node/embedding-worker.js +38864 -33
  92. package/dist/node/embedding-worker.js.map +4 -4
  93. package/dist/node/embedding.d.ts +35 -23
  94. package/dist/node/embedding.d.ts.map +1 -1
  95. package/dist/node/gradient.d.ts +17 -1
  96. package/dist/node/gradient.d.ts.map +1 -1
  97. package/dist/node/import/detect.d.ts +14 -0
  98. package/dist/node/import/detect.d.ts.map +1 -0
  99. package/dist/node/import/extract.d.ts +43 -0
  100. package/dist/node/import/extract.d.ts.map +1 -0
  101. package/dist/node/import/history.d.ts +40 -0
  102. package/dist/node/import/history.d.ts.map +1 -0
  103. package/dist/node/import/index.d.ts +17 -0
  104. package/dist/node/import/index.d.ts.map +1 -0
  105. package/dist/node/import/providers/aider.d.ts +2 -0
  106. package/dist/node/import/providers/aider.d.ts.map +1 -0
  107. package/dist/node/import/providers/claude-code.d.ts +2 -0
  108. package/dist/node/import/providers/claude-code.d.ts.map +1 -0
  109. package/dist/node/import/providers/cline.d.ts +2 -0
  110. package/dist/node/import/providers/cline.d.ts.map +1 -0
  111. package/dist/node/import/providers/codex.d.ts +2 -0
  112. package/dist/node/import/providers/codex.d.ts.map +1 -0
  113. package/dist/node/import/providers/continue.d.ts +2 -0
  114. package/dist/node/import/providers/continue.d.ts.map +1 -0
  115. package/dist/node/import/providers/index.d.ts +19 -0
  116. package/dist/node/import/providers/index.d.ts.map +1 -0
  117. package/dist/node/import/providers/opencode.d.ts +2 -0
  118. package/dist/node/import/providers/opencode.d.ts.map +1 -0
  119. package/dist/node/import/providers/pi.d.ts +2 -0
  120. package/dist/node/import/providers/pi.d.ts.map +1 -0
  121. package/dist/node/import/types.d.ts +82 -0
  122. package/dist/node/import/types.d.ts.map +1 -0
  123. package/dist/node/index.d.ts +5 -2
  124. package/dist/node/index.d.ts.map +1 -1
  125. package/dist/node/index.js +3150 -439
  126. package/dist/node/index.js.map +4 -4
  127. package/dist/node/instruction-detect.d.ts +66 -0
  128. package/dist/node/instruction-detect.d.ts.map +1 -0
  129. package/dist/node/log.d.ts +9 -0
  130. package/dist/node/log.d.ts.map +1 -1
  131. package/dist/node/ltm.d.ts +139 -5
  132. package/dist/node/ltm.d.ts.map +1 -1
  133. package/dist/node/pattern-extract.d.ts +7 -0
  134. package/dist/node/pattern-extract.d.ts.map +1 -1
  135. package/dist/node/prompt.d.ts +1 -1
  136. package/dist/node/prompt.d.ts.map +1 -1
  137. package/dist/node/recall.d.ts.map +1 -1
  138. package/dist/node/search.d.ts +5 -3
  139. package/dist/node/search.d.ts.map +1 -1
  140. package/dist/node/session-limiter.d.ts +26 -0
  141. package/dist/node/session-limiter.d.ts.map +1 -0
  142. package/dist/node/temporal.d.ts +2 -0
  143. package/dist/node/temporal.d.ts.map +1 -1
  144. package/dist/node/types.d.ts +1 -1
  145. package/dist/types/agents-file.d.ts +4 -0
  146. package/dist/types/agents-file.d.ts.map +1 -1
  147. package/dist/types/config.d.ts +2 -0
  148. package/dist/types/config.d.ts.map +1 -1
  149. package/dist/types/curator.d.ts +45 -0
  150. package/dist/types/curator.d.ts.map +1 -1
  151. package/dist/types/data-dir.d.ts +18 -0
  152. package/dist/types/data-dir.d.ts.map +1 -0
  153. package/dist/types/db.d.ts +85 -0
  154. package/dist/types/db.d.ts.map +1 -1
  155. package/dist/types/distillation.d.ts +2 -13
  156. package/dist/types/distillation.d.ts.map +1 -1
  157. package/dist/types/embedding-vendor.d.ts +22 -38
  158. package/dist/types/embedding-vendor.d.ts.map +1 -1
  159. package/dist/types/embedding-worker-types.d.ts +17 -12
  160. package/dist/types/embedding-worker-types.d.ts.map +1 -1
  161. package/dist/types/embedding-worker.d.ts +9 -2
  162. package/dist/types/embedding-worker.d.ts.map +1 -1
  163. package/dist/types/embedding.d.ts +35 -23
  164. package/dist/types/embedding.d.ts.map +1 -1
  165. package/dist/types/gradient.d.ts +17 -1
  166. package/dist/types/gradient.d.ts.map +1 -1
  167. package/dist/types/import/detect.d.ts +14 -0
  168. package/dist/types/import/detect.d.ts.map +1 -0
  169. package/dist/types/import/extract.d.ts +43 -0
  170. package/dist/types/import/extract.d.ts.map +1 -0
  171. package/dist/types/import/history.d.ts +40 -0
  172. package/dist/types/import/history.d.ts.map +1 -0
  173. package/dist/types/import/index.d.ts +17 -0
  174. package/dist/types/import/index.d.ts.map +1 -0
  175. package/dist/types/import/providers/aider.d.ts +2 -0
  176. package/dist/types/import/providers/aider.d.ts.map +1 -0
  177. package/dist/types/import/providers/claude-code.d.ts +2 -0
  178. package/dist/types/import/providers/claude-code.d.ts.map +1 -0
  179. package/dist/types/import/providers/cline.d.ts +2 -0
  180. package/dist/types/import/providers/cline.d.ts.map +1 -0
  181. package/dist/types/import/providers/codex.d.ts +2 -0
  182. package/dist/types/import/providers/codex.d.ts.map +1 -0
  183. package/dist/types/import/providers/continue.d.ts +2 -0
  184. package/dist/types/import/providers/continue.d.ts.map +1 -0
  185. package/dist/types/import/providers/index.d.ts +19 -0
  186. package/dist/types/import/providers/index.d.ts.map +1 -0
  187. package/dist/types/import/providers/opencode.d.ts +2 -0
  188. package/dist/types/import/providers/opencode.d.ts.map +1 -0
  189. package/dist/types/import/providers/pi.d.ts +2 -0
  190. package/dist/types/import/providers/pi.d.ts.map +1 -0
  191. package/dist/types/import/types.d.ts +82 -0
  192. package/dist/types/import/types.d.ts.map +1 -0
  193. package/dist/types/index.d.ts +5 -2
  194. package/dist/types/index.d.ts.map +1 -1
  195. package/dist/types/instruction-detect.d.ts +66 -0
  196. package/dist/types/instruction-detect.d.ts.map +1 -0
  197. package/dist/types/log.d.ts +9 -0
  198. package/dist/types/log.d.ts.map +1 -1
  199. package/dist/types/ltm.d.ts +139 -5
  200. package/dist/types/ltm.d.ts.map +1 -1
  201. package/dist/types/pattern-extract.d.ts +7 -0
  202. package/dist/types/pattern-extract.d.ts.map +1 -1
  203. package/dist/types/prompt.d.ts +1 -1
  204. package/dist/types/prompt.d.ts.map +1 -1
  205. package/dist/types/recall.d.ts.map +1 -1
  206. package/dist/types/search.d.ts +5 -3
  207. package/dist/types/search.d.ts.map +1 -1
  208. package/dist/types/session-limiter.d.ts +26 -0
  209. package/dist/types/session-limiter.d.ts.map +1 -0
  210. package/dist/types/temporal.d.ts +2 -0
  211. package/dist/types/temporal.d.ts.map +1 -1
  212. package/dist/types/types.d.ts +1 -1
  213. package/package.json +3 -4
  214. package/src/agents-file.ts +41 -13
  215. package/src/config.ts +31 -18
  216. package/src/curator.ts +163 -75
  217. package/src/data-dir.ts +76 -0
  218. package/src/db.ts +457 -11
  219. package/src/distillation.ts +65 -16
  220. package/src/embedding-vendor.ts +23 -40
  221. package/src/embedding-worker-types.ts +19 -11
  222. package/src/embedding-worker.ts +111 -47
  223. package/src/embedding.ts +224 -174
  224. package/src/gradient.ts +192 -75
  225. package/src/import/detect.ts +37 -0
  226. package/src/import/extract.ts +137 -0
  227. package/src/import/history.ts +99 -0
  228. package/src/import/index.ts +45 -0
  229. package/src/import/providers/aider.ts +207 -0
  230. package/src/import/providers/claude-code.ts +339 -0
  231. package/src/import/providers/cline.ts +324 -0
  232. package/src/import/providers/codex.ts +369 -0
  233. package/src/import/providers/continue.ts +304 -0
  234. package/src/import/providers/index.ts +32 -0
  235. package/src/import/providers/opencode.ts +272 -0
  236. package/src/import/providers/pi.ts +332 -0
  237. package/src/import/types.ts +91 -0
  238. package/src/index.ts +13 -0
  239. package/src/instruction-detect.ts +275 -0
  240. package/src/log.ts +91 -3
  241. package/src/ltm.ts +789 -41
  242. package/src/pattern-extract.ts +41 -0
  243. package/src/prompt.ts +7 -1
  244. package/src/recall.ts +43 -5
  245. package/src/search.ts +7 -5
  246. package/src/session-limiter.ts +47 -0
  247. package/src/temporal.ts +18 -6
  248. package/src/types.ts +1 -1
package/src/ltm.ts CHANGED
@@ -1,7 +1,7 @@
1
1
  import { uuidv7 } from "uuidv7";
2
- import { db, ensureProject } from "./db";
2
+ import { db, ensureProject, getKV, setKV } from "./db";
3
3
  import { config } from "./config";
4
- import { ftsQuery, EMPTY_QUERY, extractTopTerms, runRelaxedSearch } from "./search";
4
+ import { ftsQuery, ftsQueryOr, EMPTY_QUERY, extractTopTerms, filterTerms, runRelaxedSearch } from "./search";
5
5
  import * as embedding from "./embedding";
6
6
  import * as latReader from "./lat-reader";
7
7
  import * as log from "./log";
@@ -50,6 +50,10 @@ export function create(input: {
50
50
  ? ensureProject(input.projectPath)
51
51
  : null;
52
52
 
53
+ // IF-2: Global entries (pid=null) must be cross-project to avoid a data hole
54
+ // where forSession() can't find them in either the project or cross-project pool.
55
+ const crossProject = pid === null ? true : (input.crossProject ?? false);
56
+
53
57
  // Dedup guard: if an entry with the same project_id + title already exists,
54
58
  // update its content instead of inserting a duplicate. This prevents the
55
59
  // curator from creating multiple entries for the same concept across sessions.
@@ -90,6 +94,16 @@ export function create(input: {
90
94
  update(crossExisting.id, { content: input.content });
91
95
  return crossExisting.id;
92
96
  }
97
+
98
+ // Fuzzy dedup: check for title-similar entries via FTS5 + word-overlap.
99
+ // This catches near-duplicates the curator creates with slightly different
100
+ // titles for the same concept (e.g. "Upgrade lock bug" vs "Upgrade binary
101
+ // lock re-entry bug"). Placed after exact checks (cheaper checks first).
102
+ const fuzzyMatch = findFuzzyDuplicate({ title: input.title, projectId: pid });
103
+ if (fuzzyMatch) {
104
+ update(fuzzyMatch.id, { content: input.content });
105
+ return fuzzyMatch.id;
106
+ }
93
107
  }
94
108
 
95
109
  const id = input.id ?? uuidv7();
@@ -106,7 +120,7 @@ export function create(input: {
106
120
  input.title,
107
121
  input.content,
108
122
  input.session ?? null,
109
- (input.crossProject ?? false) ? 1 : 0,
123
+ crossProject ? 1 : 0,
110
124
  now,
111
125
  now,
112
126
  );
@@ -130,8 +144,10 @@ export function update(
130
144
  params.push(input.content);
131
145
  }
132
146
  if (input.confidence !== undefined) {
147
+ // Clamp to [0.0, 1.0] — an LLM-provided value outside this range would
148
+ // give disproportionate scoring weight (>1) or silently soft-delete (<0.2).
133
149
  sets.push("confidence = ?");
134
- params.push(input.confidence);
150
+ params.push(Math.max(0, Math.min(1, input.confidence)));
135
151
  }
136
152
  sets.push("updated_at = ?");
137
153
  params.push(Date.now());
@@ -153,6 +169,100 @@ export function remove(id: string) {
153
169
  db().query("DELETE FROM knowledge WHERE id = ?").run(id);
154
170
  }
155
171
 
172
+ // ---------------------------------------------------------------------------
173
+ // Fuzzy title dedup — word-overlap similarity
174
+ // ---------------------------------------------------------------------------
175
+
176
+ /**
177
+ * Compute title word-overlap between two titles.
178
+ * Returns { coefficient, intersectionSize } where:
179
+ * - coefficient = |A ∩ B| / min(|A|, |B|) (0–1)
180
+ * - intersectionSize = number of shared meaningful words
181
+ * Filters stopwords and single-char tokens for meaningful comparison.
182
+ */
183
+ function titleOverlap(a: string, b: string): { coefficient: number; intersectionSize: number } {
184
+ const wordsA = new Set(filterTerms(a).map((w) => w.toLowerCase()));
185
+ const wordsB = new Set(filterTerms(b).map((w) => w.toLowerCase()));
186
+ if (wordsA.size === 0 || wordsB.size === 0) return { coefficient: 0, intersectionSize: 0 };
187
+ const intersection = [...wordsA].filter((w) => wordsB.has(w));
188
+ return {
189
+ coefficient: intersection.length / Math.min(wordsA.size, wordsB.size),
190
+ intersectionSize: intersection.length,
191
+ };
192
+ }
193
+
194
+ /** Minimum word-overlap coefficient to consider two titles as duplicates. */
195
+ const FUZZY_DEDUP_THRESHOLD = 0.7;
196
+ /** Minimum number of overlapping meaningful words required for a fuzzy match.
197
+ * Prevents false positives on short titles where 2-3 common words produce
198
+ * a high overlap coefficient despite being genuinely different entries. */
199
+ const FUZZY_DEDUP_MIN_OVERLAP = 4;
200
+ /** Minimum cosine similarity for embedding-based dedup. Empirically tuned
201
+ * against 312 Nomic v1.5 entries:
202
+ * - 0.935+: all genuine duplicates (same topic, different wording)
203
+ * - 0.92–0.935: contains false positives from same-subsystem entries
204
+ * (e.g. "BGE Small unusable" ↔ "Nomic OOM" scored 0.9326 — related
205
+ * but distinct bugs). Star clustering amplifies this by bridging.
206
+ * - <0.92: mixed or unrelated entries */
207
+ const EMBEDDING_DEDUP_THRESHOLD = 0.935;
208
+
209
+ /**
210
+ * Find an existing knowledge entry whose title is fuzzy-similar to the given title.
211
+ *
212
+ * Uses FTS5 to find up to 5 candidates, then applies word-overlap filtering.
213
+ * This is the same algorithm used by `check()` but returns a single match
214
+ * for use in the `create()` dedup guard.
215
+ *
216
+ * @returns The first matching entry (id + title), or null if no fuzzy match.
217
+ */
218
+ export function findFuzzyDuplicate(input: {
219
+ title: string;
220
+ projectId: string | null;
221
+ excludeId?: string;
222
+ }): { id: string; title: string } | null {
223
+ const q = ftsQueryOr(input.title);
224
+ if (q === EMPTY_QUERY) return null;
225
+
226
+ const { title: tw, content: cw, category: catw } = config().search.ftsWeights;
227
+
228
+ try {
229
+ // Build query scoped to the same project + cross-project entries
230
+ const excludeClause = input.excludeId ? "AND k.id != ?" : "";
231
+ const sql = input.projectId !== null
232
+ ? `SELECT k.id, k.title FROM knowledge_fts f
233
+ CROSS JOIN knowledge k ON k.rowid = f.rowid
234
+ WHERE knowledge_fts MATCH ?
235
+ AND (k.project_id = ? OR k.cross_project = 1)
236
+ AND k.confidence > 0.2
237
+ ${excludeClause}
238
+ ORDER BY bm25(knowledge_fts, ?, ?, ?) LIMIT 5`
239
+ : `SELECT k.id, k.title FROM knowledge_fts f
240
+ CROSS JOIN knowledge k ON k.rowid = f.rowid
241
+ WHERE knowledge_fts MATCH ?
242
+ AND (k.project_id IS NULL OR k.cross_project = 1)
243
+ AND k.confidence > 0.2
244
+ ${excludeClause}
245
+ ORDER BY bm25(knowledge_fts, ?, ?, ?) LIMIT 5`;
246
+
247
+ const params: (string | number)[] = input.projectId !== null
248
+ ? [q, input.projectId, ...(input.excludeId ? [input.excludeId] : []), tw, cw, catw]
249
+ : [q, ...(input.excludeId ? [input.excludeId] : []), tw, cw, catw];
250
+
251
+ const candidates = db().query(sql).all(...params) as Array<{ id: string; title: string }>;
252
+
253
+ for (const candidate of candidates) {
254
+ const { coefficient, intersectionSize } = titleOverlap(input.title, candidate.title);
255
+ if (coefficient >= FUZZY_DEDUP_THRESHOLD && intersectionSize >= FUZZY_DEDUP_MIN_OVERLAP) {
256
+ return candidate;
257
+ }
258
+ }
259
+ } catch {
260
+ // FTS5 error — fall through to no match
261
+ }
262
+
263
+ return null;
264
+ }
265
+
156
266
  export function forProject(
157
267
  projectPath: string,
158
268
  includeCross = true,
@@ -245,6 +355,26 @@ function scoreEntriesFTS(sessionContext: string): Map<string, number> {
245
355
  }
246
356
  }
247
357
 
358
+ /**
359
+ * Well-known knowledge entry categories managed by the curator.
360
+ * The DB column is a free-form string, but these are the standard values.
361
+ */
362
+ export type KnowledgeCategory = "decision" | "pattern" | "preference" | "architecture" | "gotcha";
363
+
364
+ /** Options for `forSession()` to control entry selection. */
365
+ export type ForSessionOptions = {
366
+ /** Caller-provided context (e.g., user's current message) for relevance
367
+ * scoring when no session context exists in the DB yet. */
368
+ contextHint?: string;
369
+ /** Restrict to these categories (e.g., `['preference']` for turn 1). */
370
+ categories?: (KnowledgeCategory | (string & {}))[];
371
+ /** Exclude these categories (e.g., `['preference']` for context-bound
372
+ * entries when preferences are already injected in a separate block).
373
+ * Mutually exclusive with `categories` — if both are provided,
374
+ * `categories` (include) wins. */
375
+ excludeCategories?: (KnowledgeCategory | (string & {}))[];
376
+ };
377
+
248
378
  /**
249
379
  * Build a relevance-ranked, budget-capped list of knowledge entries for injection
250
380
  * into the system prompt of a live session.
@@ -252,43 +382,61 @@ function scoreEntriesFTS(sessionContext: string): Map<string, number> {
252
382
  * Strategy:
253
383
  * 1. Both project-specific and cross-project entries are scored for relevance
254
384
  * against recent session context (last distillation + recent raw messages).
255
- * 2. Project entries get a safety net: the top PROJECT_SAFETY_NET entries by
385
+ * 2. When embeddings are available, vector cosine similarity is used for scoring
386
+ * (captures semantic matches that keyword overlap misses). Falls back to
387
+ * FTS5 BM25 when embeddings are unavailable.
388
+ * 3. Project entries get a safety net: the top PROJECT_SAFETY_NET entries by
256
389
  * confidence are always included even if they have zero relevance score.
257
390
  * This ensures the most important project knowledge is never lost to
258
- * coarse term-overlap scoring.
259
- * 3. All scored entries are merged into a single pool and greedily packed
391
+ * coarse scoring.
392
+ * 4. All scored entries are merged into a single pool and greedily packed
260
393
  * into the token budget by score descending.
261
- * 4. If there's no session context yet (first turn), fall back to top entries
394
+ * 5. If there's no session context yet (first turn), fall back to top entries
262
395
  * by confidence only (capped at NO_CONTEXT_FALLBACK_CAP per pool).
263
396
  *
264
397
  * @param projectPath Current project path
265
398
  * @param sessionID Current session ID (for context extraction)
266
399
  * @param maxTokens Hard token budget for the entire formatted block
400
+ * @param options Optional category filter and context hint
267
401
  */
268
- export function forSession(
402
+ export async function forSession(
269
403
  projectPath: string,
270
404
  sessionID: string | undefined,
271
405
  maxTokens: number,
272
- ): KnowledgeEntry[] {
406
+ options?: ForSessionOptions,
407
+ ): Promise<KnowledgeEntry[]> {
273
408
  const pid = ensureProject(projectPath);
409
+ const categoryFilter = options?.categories;
410
+ const excludeFilter = options?.excludeCategories;
411
+
412
+ // Build optional SQL category clauses (include / exclude are mutually exclusive)
413
+ let categoryClause = "";
414
+ let categoryParams: string[] = [];
415
+ if (categoryFilter?.length) {
416
+ categoryClause = ` AND category IN (${categoryFilter.map(() => "?").join(",")})`;
417
+ categoryParams = categoryFilter;
418
+ } else if (excludeFilter?.length) {
419
+ categoryClause = ` AND category NOT IN (${excludeFilter.map(() => "?").join(",")})`;
420
+ categoryParams = excludeFilter;
421
+ }
274
422
 
275
423
  // --- 1. Load project-specific entries ---
276
424
  const projectEntries = db()
277
425
  .query(
278
426
  `SELECT ${KNOWLEDGE_COLS} FROM knowledge
279
- WHERE project_id = ? AND cross_project = 0 AND confidence > 0.2
427
+ WHERE project_id = ? AND cross_project = 0 AND confidence > 0.2${categoryClause}
280
428
  ORDER BY confidence DESC, updated_at DESC`,
281
429
  )
282
- .all(pid) as KnowledgeEntry[];
430
+ .all(pid, ...categoryParams) as KnowledgeEntry[];
283
431
 
284
432
  // --- 2. Load cross-project candidates ---
285
433
  const crossEntries = db()
286
434
  .query(
287
435
  `SELECT ${KNOWLEDGE_COLS} FROM knowledge
288
- WHERE (project_id IS NULL OR cross_project = 1) AND confidence > 0.2
436
+ WHERE (project_id IS NULL OR cross_project = 1) AND confidence > 0.2${categoryClause}
289
437
  ORDER BY confidence DESC, updated_at DESC`,
290
438
  )
291
- .all() as KnowledgeEntry[];
439
+ .all(...categoryParams) as KnowledgeEntry[];
292
440
 
293
441
  if (!crossEntries.length && !projectEntries.length) return [];
294
442
 
@@ -317,38 +465,82 @@ export function forSession(
317
465
  }
318
466
  }
319
467
 
468
+ // Fall back to caller-provided context hint (e.g., user's first message)
469
+ if (!sessionContext.trim() && options?.contextHint) {
470
+ sessionContext = options.contextHint;
471
+ }
472
+
320
473
  // --- 4. Score both pools by relevance ---
321
474
  let scoredProject: Scored[];
322
475
  let scoredCross: Scored[];
323
476
 
324
- if (sessionContext.trim().length > 20) {
325
- // Use FTS5 BM25 to score all knowledge entries against session context
326
- const ftsScores = scoreEntriesFTS(sessionContext);
327
-
328
- // Score project entries: FTS relevance × confidence, with safety net
329
- const rawScored: Scored[] = projectEntries.map((entry) => ({
330
- entry,
331
- score: (ftsScores.get(entry.id) ?? 0) * entry.confidence,
332
- }));
333
- const matched = rawScored.filter((s) => s.score > 0);
334
- const matchedIds = new Set(matched.map((s) => s.entry.id));
335
-
336
- // Safety net: top PROJECT_SAFETY_NET entries by confidence that weren't already matched.
337
- // Given a tiny score (0.001 * confidence) so they sort below genuinely matched entries.
338
- const safetyNet = projectEntries
339
- .filter((e) => !matchedIds.has(e.id))
340
- .slice(0, PROJECT_SAFETY_NET)
341
- .map((e) => ({ entry: e, score: 0.001 * e.confidence }));
342
-
343
- scoredProject = [...matched, ...safetyNet];
477
+ if (sessionContext.trim().length > 20 && embedding.isAvailable()) {
478
+ // Vector scoring: embed session context, score entries by cosine similarity.
479
+ // Captures semantic matches (e.g., "OpenAI Batch API" ↔ "batch queue worker")
480
+ // that keyword-based FTS5 misses.
481
+ let vectorScores: Map<string, number>;
482
+ try {
483
+ const [contextVec] = await embedding.embed([sessionContext], "query");
484
+ const hits = embedding.vectorSearch(contextVec, 50, excludeFilter);
485
+ vectorScores = new Map(hits.map((h) => [h.id, h.similarity]));
486
+ } catch (err) {
487
+ log.warn("Vector scoring failed, falling back to FTS5:", err);
488
+ vectorScores = new Map();
489
+ }
344
490
 
345
- // Score cross-project entries — only include entries with FTS match
346
- scoredCross = crossEntries
347
- .filter((e) => ftsScores.has(e.id))
348
- .map((e) => ({
349
- entry: e,
350
- score: (ftsScores.get(e.id) ?? 0) * e.confidence,
351
- }));
491
+ if (vectorScores.size > 0) {
492
+ // Hybrid scoring: vector search only covers entries with stored embeddings.
493
+ // Entries without embeddings (e.g. newly created, async embed not yet done)
494
+ // fall back to FTS5 so they aren't invisible to scoring.
495
+ const ftsScores = scoreEntriesFTS(sessionContext);
496
+
497
+ // Score project entries: prefer vector similarity, fall back to FTS5
498
+ const rawScored: Scored[] = projectEntries.map((entry) => {
499
+ const vecScore = vectorScores.get(entry.id);
500
+ const score = vecScore != null
501
+ ? vecScore * entry.confidence
502
+ : (ftsScores.get(entry.id) ?? 0) * entry.confidence;
503
+ return { entry, score };
504
+ });
505
+ const matched = rawScored.filter((s) => s.score > 0);
506
+ const matchedIds = new Set(matched.map((s) => s.entry.id));
507
+
508
+ // Safety net: top PROJECT_SAFETY_NET entries by confidence that weren't already matched.
509
+ // Given a tiny score (0.001 * confidence) so they sort below genuinely matched entries.
510
+ const safetyNet = projectEntries
511
+ .filter((e) => !matchedIds.has(e.id))
512
+ .slice(0, PROJECT_SAFETY_NET)
513
+ .map((e) => ({ entry: e, score: 0.001 * e.confidence }));
514
+
515
+ scoredProject = [...matched, ...safetyNet];
516
+
517
+ // Cross-project: include entries matched by vector OR FTS5
518
+ scoredCross = crossEntries
519
+ .filter((e) => vectorScores.has(e.id) || ftsScores.has(e.id))
520
+ .map((e) => {
521
+ const vecScore = vectorScores.get(e.id);
522
+ const score = vecScore != null
523
+ ? vecScore * e.confidence
524
+ : (ftsScores.get(e.id) ?? 0) * e.confidence;
525
+ return { entry: e, score };
526
+ });
527
+ } else {
528
+ // Vector failed — fall through to FTS5
529
+ const ftsScores = scoreEntriesFTS(sessionContext);
530
+ ({ scoredProject, scoredCross } = scoreFTS(
531
+ projectEntries,
532
+ crossEntries,
533
+ ftsScores,
534
+ ));
535
+ }
536
+ } else if (sessionContext.trim().length > 20) {
537
+ // Embeddings unavailable — use FTS5 BM25 as fallback
538
+ const ftsScores = scoreEntriesFTS(sessionContext);
539
+ ({ scoredProject, scoredCross } = scoreFTS(
540
+ projectEntries,
541
+ crossEntries,
542
+ ftsScores,
543
+ ));
352
544
  } else {
353
545
  // No session context — fall back to top entries by confidence, capped
354
546
  scoredProject = projectEntries
@@ -410,6 +602,36 @@ export function forSession(
410
602
  return result;
411
603
  }
412
604
 
605
+ /** Score entries using FTS5 BM25 — extracted for reuse in the vector-fallback path. */
606
+ function scoreFTS(
607
+ projectEntries: KnowledgeEntry[],
608
+ crossEntries: KnowledgeEntry[],
609
+ ftsScores: Map<string, number>,
610
+ ): { scoredProject: Scored[]; scoredCross: Scored[] } {
611
+ const rawScored: Scored[] = projectEntries.map((entry) => ({
612
+ entry,
613
+ score: (ftsScores.get(entry.id) ?? 0) * entry.confidence,
614
+ }));
615
+ const matched = rawScored.filter((s) => s.score > 0);
616
+ const matchedIds = new Set(matched.map((s) => s.entry.id));
617
+
618
+ const safetyNet = projectEntries
619
+ .filter((e) => !matchedIds.has(e.id))
620
+ .slice(0, PROJECT_SAFETY_NET)
621
+ .map((e) => ({ entry: e, score: 0.001 * e.confidence }));
622
+
623
+ const scoredProject = [...matched, ...safetyNet];
624
+
625
+ const scoredCross = crossEntries
626
+ .filter((e) => ftsScores.has(e.id))
627
+ .map((e) => ({
628
+ entry: e,
629
+ score: (ftsScores.get(e.id) ?? 0) * e.confidence,
630
+ }));
631
+
632
+ return { scoredProject, scoredCross };
633
+ }
634
+
413
635
  export function all(): KnowledgeEntry[] {
414
636
  return db()
415
637
  .query(
@@ -418,6 +640,17 @@ export function all(): KnowledgeEntry[] {
418
640
  .all() as KnowledgeEntry[];
419
641
  }
420
642
 
643
+ /** Return all cross-project and global (user-level) knowledge entries. */
644
+ export function crossProject(): KnowledgeEntry[] {
645
+ return db()
646
+ .query(
647
+ `SELECT ${KNOWLEDGE_COLS} FROM knowledge
648
+ WHERE (project_id IS NULL OR cross_project = 1) AND confidence > 0.2
649
+ ORDER BY confidence DESC, updated_at DESC`,
650
+ )
651
+ .all() as KnowledgeEntry[];
652
+ }
653
+
421
654
  // LIKE-based fallback for when FTS5 fails unexpectedly.
422
655
  function searchLike(input: {
423
656
  query: string;
@@ -832,3 +1065,518 @@ export function check(projectPath: string): IntegrityIssue[] {
832
1065
 
833
1066
  return issues;
834
1067
  }
1068
+
1069
// ---------------------------------------------------------------------------
// Deduplication — embedding-based semantic clustering with word-overlap fallback
// ---------------------------------------------------------------------------

/**
 * One merge group found by deduplication: the entry kept plus the entries
 * folded into it (removed, or slated for removal in dry-run mode).
 */
export type DedupCluster = {
  /** The entry chosen to survive the merge. */
  surviving: { id: string; title: string };
  /** The entries merged away into the survivor. */
  merged: Array<{ id: string; title: string }>;
};
1077
+
1078
+ /** Stable pair key for two entry IDs — sorted to ensure order-independence. */
1079
+ export function dedupPairKey(idA: string, idB: string): string {
1080
+ return idA < idB ? `${idA}:${idB}` : `${idB}:${idA}`;
1081
+ }
1082
+
1083
/** Full report returned by a dedup run (dry-run or applied). */
export type DedupResult = {
  /** Merge groups found, sorted largest-first. */
  clusters: DedupCluster[];
  /** Number of entries removed (or that would be removed in dry-run mode). */
  totalRemoved: number;
  /** Pairwise embedding cosine similarities. Key: dedupPairKey(idA, idB). */
  pairSimilarities: Map<string, number>;
  /** All entry titles by ID — for feedback recording after entries are deleted. */
  entryTitles: Map<string, string>;
};
1091
+
1092
/**
 * Deduplicate knowledge entries for a project.
 *
 * Uses two complementary signals with "star" clustering (no transitive
 * chains) to prevent snowball merging:
 *
 * 1. **Title word-overlap** (Jaccard on meaningful words) — catches entries
 *    with similar titles regardless of content wording.
 * 2. **Embedding cosine similarity** (when embeddings are available) — catches
 *    entries with different titles but semantically identical content. Nomic
 *    v1.5 produces a same-domain spread of 0.46–0.70 for distinct entries,
 *    making threshold-based dedup viable at 0.935+ (lower thresholds catch
 *    related-but-distinct entries as false positives, especially via star
 *    clustering where a hub entry bridges two distinct topics).
 *
 * Pairs matching either signal are clustered together. For each cluster,
 * picks a survivor (highest confidence, then most recently updated, then
 * shortest title) and removes the rest.
 *
 * NOTE(review): the tags below describe the public `deduplicate()` /
 * `deduplicateGlobal()` wrappers further down, not `_dedup`'s own
 * signature (which takes an entry list directly).
 *
 * @param projectPath Project root path
 * @param opts.dryRun If true (default), report clusters without deleting
 * @returns Cluster report and count of removed entries
 */
/** Core dedup logic — operates on an arbitrary list of entries. */
function _dedup(
  entries: KnowledgeEntry[],
  dryRun: boolean,
  embeddingThreshold: number = EMBEDDING_DEDUP_THRESHOLD,
): DedupResult {
  // Fewer than two entries — nothing to compare; return an empty result.
  if (entries.length < 2) return { clusters: [], totalRemoved: 0, pairSimilarities: new Map(), entryTitles: new Map() };

  // --- Build neighbor map using title overlap + embedding similarity ---
  // Two entries are considered neighbors (potential duplicates) if EITHER:
  //   (a) title word-overlap ≥ FUZZY_DEDUP_THRESHOLD with ≥ FUZZY_DEDUP_MIN_OVERLAP shared words, OR
  //   (b) embedding cosine similarity ≥ embeddingThreshold
  // Star clustering (no transitivity) prevents snowball merging.
  // O(n²) pairwise comparison — acceptable for small n (entry caps upstream).

  // Load embeddings for the given entries (if available).
  // We query directly rather than using vectorSearch() because we need
  // pairwise comparison among entries, not a query-vs-all search.
  const embeddingMap = new Map<string, Float32Array>();
  {
    const entryIds = entries.map((e) => e.id);
    // Build parameterized IN clause for the entry IDs
    const placeholders = entryIds.map(() => "?").join(",");
    const rows = db()
      .query(`SELECT id, embedding FROM knowledge WHERE embedding IS NOT NULL AND id IN (${placeholders})`)
      .all(...entryIds) as Array<{ id: string; embedding: Buffer }>;
    for (const row of rows) {
      try {
        embeddingMap.set(row.id, embedding.fromBlob(row.embedding));
      } catch {
        // Skip corrupted embeddings — entry falls back to title-overlap only.
        log.info(`skipping corrupted embedding for entry ${row.id}`);
      }
    }
  }

  // Pre-compute neighbors for all pairs
  type DedupHit = { id: string; score: number };
  const neighborMap = new Map<string, DedupHit[]>();
  // Collect all pairwise embedding similarities (for feedback/calibration).
  const pairSimilarities = new Map<string, number>();

  for (const entry of entries) {
    const neighbors: DedupHit[] = [];
    const entryVec = embeddingMap.get(entry.id);

    for (const other of entries) {
      if (other.id === entry.id) continue;

      // Signal 1: title word-overlap
      const { coefficient, intersectionSize } = titleOverlap(entry.title, other.title);
      const titleMatch = coefficient >= FUZZY_DEDUP_THRESHOLD && intersectionSize >= FUZZY_DEDUP_MIN_OVERLAP;

      // Signal 2: embedding cosine similarity (only when both vectors exist
      // and have matching dimensionality — mixed-model embeddings are skipped)
      let embeddingMatch = false;
      let similarity = 0;
      if (entryVec) {
        const otherVec = embeddingMap.get(other.id);
        if (otherVec && entryVec.length === otherVec.length) {
          similarity = embedding.cosineSimilarity(entryVec, otherVec);
          embeddingMatch = similarity >= embeddingThreshold;
        }
      }

      // Track all pairwise embedding similarities for calibration signals.
      // has() guard: each unordered pair is visited twice; record once.
      if (similarity > 0) {
        const pk = dedupPairKey(entry.id, other.id);
        if (!pairSimilarities.has(pk)) {
          pairSimilarities.set(pk, similarity);
        }
      }

      if (titleMatch || embeddingMatch) {
        // Use the stronger signal as the match score for cluster priority
        neighbors.push({ id: other.id, score: Math.max(coefficient, similarity) });
      }
    }
    neighbors.sort((a, b) => b.score - a.score);
    neighborMap.set(entry.id, neighbors);
  }

  // Greedy star clustering — process entries with most neighbors first.
  // Each entry is claimed by at most one cluster; no transitive chaining.
  const claimed = new Set<string>();
  const rawClusters = new Map<string, string[]>();

  const sortedIds = [...neighborMap.keys()].sort(
    (a, b) => neighborMap.get(b)!.length - neighborMap.get(a)!.length,
  );

  for (const centerId of sortedIds) {
    if (claimed.has(centerId)) continue;
    claimed.add(centerId);
    const members = [centerId];

    for (const { id: neighborId } of neighborMap.get(centerId)!) {
      if (claimed.has(neighborId)) continue;
      claimed.add(neighborId);
      members.push(neighborId);
    }

    if (members.length > 1) {
      rawClusters.set(centerId, members);
    }
  }

  // Build clusters and pick survivors
  const entryById = new Map(entries.map((e) => [e.id, e]));
  const result: DedupCluster[] = [];
  let totalRemoved = 0;

  for (const members of rawClusters.values()) {
    if (members.length < 2) continue;

    // Pick survivor: highest confidence → most recent → shortest title
    const sorted = members
      .map((id) => entryById.get(id)!)
      .filter(Boolean)
      .sort((a, b) => {
        if (b.confidence !== a.confidence) return b.confidence - a.confidence;
        if (b.updated_at !== a.updated_at) return b.updated_at - a.updated_at;
        return a.title.length - b.title.length;
      });

    const survivor = sorted[0];
    const merged = sorted.slice(1);

    result.push({
      surviving: { id: survivor.id, title: survivor.title },
      merged: merged.map((e) => ({ id: e.id, title: e.title })),
    });

    // Deletion only happens outside dry-run; the report is built either way.
    if (!dryRun) {
      for (const entry of merged) {
        remove(entry.id);
      }
    }

    totalRemoved += merged.length;
  }

  // Sort clusters by size descending for readability
  result.sort((a, b) => b.merged.length - a.merged.length);

  // Build title map from all input entries — survives entry deletion.
  const entryTitles = new Map(entries.map((e) => [e.id, e.title]));

  return { clusters: result, totalRemoved, pairSimilarities, entryTitles };
}
1263
+
1264
+ export async function deduplicate(
1265
+ projectPath: string,
1266
+ opts?: { dryRun?: boolean },
1267
+ ): Promise<DedupResult> {
1268
+ const pid = ensureProject(projectPath);
1269
+ const threshold = loadCalibratedThreshold(pid) ?? EMBEDDING_DEDUP_THRESHOLD;
1270
+ const entries = forProject(projectPath, false);
1271
+ return _dedup(entries, opts?.dryRun ?? true, threshold);
1272
+ }
1273
+
1274
+ /** Deduplicate global (cross-project) entries that have no project_id. */
1275
+ export async function deduplicateGlobal(
1276
+ opts?: { dryRun?: boolean },
1277
+ ): Promise<DedupResult> {
1278
+ const threshold = loadCalibratedThreshold(null) ?? EMBEDDING_DEDUP_THRESHOLD;
1279
+ const entries = db()
1280
+ .query(
1281
+ `SELECT ${KNOWLEDGE_COLS} FROM knowledge
1282
+ WHERE project_id IS NULL
1283
+ AND confidence > 0.2
1284
+ ORDER BY confidence DESC, updated_at DESC`,
1285
+ )
1286
+ .all() as KnowledgeEntry[];
1287
+ return _dedup(entries, opts?.dryRun ?? true, threshold);
1288
+ }
1289
+
1290
// ---------------------------------------------------------------------------
// Dedup feedback & adaptive threshold calibration
// ---------------------------------------------------------------------------

/** Where a feedback signal came from: automatic sweep or explicit CLI action. */
export type DedupFeedbackSource = "auto_dedup" | "cli_yes" | "cli_interactive";

/** Minimum feedback rows before calibration overrides the default threshold. */
const MIN_CALIBRATION_SAMPLES = 20;
/** Alias of the library default — used when calibration is unavailable. */
const DEFAULT_EMBEDDING_DEDUP_THRESHOLD = EMBEDDING_DEDUP_THRESHOLD;
/** Only record auto-signals for pairs with similarity >= this floor. */
const AUTO_SIGNAL_MIN_SIMILARITY = 0.80;
/** Max auto-signal pairs to record per dedup run (closest to threshold). */
const AUTO_SIGNAL_MAX_PAIRS = 50;
1302
+
1303
+ /** Record a single dedup feedback row. */
1304
+ export function recordDedupFeedback(input: {
1305
+ projectId: string | null;
1306
+ entryATitle: string;
1307
+ entryBTitle: string;
1308
+ similarity: number;
1309
+ accepted: boolean;
1310
+ source: DedupFeedbackSource;
1311
+ }): void {
1312
+ db()
1313
+ .query(
1314
+ `INSERT INTO dedup_feedback
1315
+ (project_id, entry_a_title, entry_b_title, similarity, accepted, source, created_at)
1316
+ VALUES (?, ?, ?, ?, ?, ?, ?)`,
1317
+ )
1318
+ .run(
1319
+ input.projectId,
1320
+ input.entryATitle,
1321
+ input.entryBTitle,
1322
+ input.similarity,
1323
+ input.accepted ? 1 : 0,
1324
+ input.source,
1325
+ Date.now(),
1326
+ );
1327
+ }
1328
+
1329
+ /**
1330
+ * Bulk-record feedback for all merged pairs in a DedupResult.
1331
+ * Only records pairs with embedding similarity > 0 (title-overlap-only
1332
+ * matches are excluded from calibration).
1333
+ */
1334
+ export function recordDedupResultFeedback(
1335
+ projectId: string | null,
1336
+ result: DedupResult,
1337
+ accepted: boolean,
1338
+ source: DedupFeedbackSource,
1339
+ ): void {
1340
+ for (const cluster of result.clusters) {
1341
+ for (const merged of cluster.merged) {
1342
+ const pk = dedupPairKey(cluster.surviving.id, merged.id);
1343
+ const similarity = result.pairSimilarities.get(pk);
1344
+ if (similarity != null && similarity > 0) {
1345
+ recordDedupFeedback({
1346
+ projectId,
1347
+ entryATitle: cluster.surviving.title,
1348
+ entryBTitle: merged.title,
1349
+ similarity,
1350
+ accepted,
1351
+ source,
1352
+ });
1353
+ }
1354
+ }
1355
+ }
1356
+ }
1357
+
1358
+ /**
1359
+ * Record automatic calibration signals from a post-curation dedup sweep.
1360
+ *
1361
+ * Only records **reject** signals — non-merged pairs with similarity in
1362
+ * [0.80, threshold). Accept signals from auto-dedup are tautological (the
1363
+ * pair was merged *because* its similarity exceeded the threshold), so they
1364
+ * provide no new information and would create a self-reinforcing feedback
1365
+ * loop. Manual signals (cli_yes, cli_interactive) provide the accept side.
1366
+ *
1367
+ * Caps at AUTO_SIGNAL_MAX_PAIRS most interesting pairs per run (closest
1368
+ * to the threshold boundary) to avoid table bloat.
1369
+ */
1370
+ export function recordAutoSignals(
1371
+ projectId: string | null,
1372
+ result: DedupResult,
1373
+ ): void {
1374
+ // Collect merged pair IDs for quick lookup (to exclude from reject signals)
1375
+ const mergedPairs = new Set<string>();
1376
+ for (const cluster of result.clusters) {
1377
+ for (const merged of cluster.merged) {
1378
+ mergedPairs.add(dedupPairKey(cluster.surviving.id, merged.id));
1379
+ }
1380
+ }
1381
+
1382
+ // Build a title map — we need titles for reject signals (non-merged pairs).
1383
+ // Use entryTitles from result first, then fall back to cluster data.
1384
+ const titleMap = new Map<string, string>(result.entryTitles);
1385
+ for (const cluster of result.clusters) {
1386
+ if (!titleMap.has(cluster.surviving.id)) {
1387
+ titleMap.set(cluster.surviving.id, cluster.surviving.title);
1388
+ }
1389
+ for (const m of cluster.merged) {
1390
+ if (!titleMap.has(m.id)) titleMap.set(m.id, m.title);
1391
+ }
1392
+ }
1393
+
1394
+ // Collect reject signals: non-merged pairs with high similarity
1395
+ type Signal = { entryATitle: string; entryBTitle: string; similarity: number };
1396
+ const signals: Signal[] = [];
1397
+
1398
+ for (const [pk, sim] of result.pairSimilarities) {
1399
+ if (sim < AUTO_SIGNAL_MIN_SIMILARITY) continue;
1400
+ if (mergedPairs.has(pk)) continue; // merged pair — skip (tautological accept)
1401
+
1402
+ const [idA, idB] = pk.split(":");
1403
+ const titleA = titleMap.get(idA);
1404
+ const titleB = titleMap.get(idB);
1405
+ if (!titleA || !titleB) continue;
1406
+
1407
+ signals.push({ entryATitle: titleA, entryBTitle: titleB, similarity: sim });
1408
+ }
1409
+
1410
+ // Sort by distance to threshold boundary (most informative first), cap
1411
+ const currentThreshold = loadCalibratedThreshold(projectId) ?? DEFAULT_EMBEDDING_DEDUP_THRESHOLD;
1412
+ signals.sort((a, b) => Math.abs(a.similarity - currentThreshold) - Math.abs(b.similarity - currentThreshold));
1413
+ const capped = signals.slice(0, AUTO_SIGNAL_MAX_PAIRS);
1414
+
1415
+ // Prune old feedback to prevent unbounded table growth
1416
+ pruneDedupFeedback(projectId);
1417
+
1418
+ for (const s of capped) {
1419
+ recordDedupFeedback({
1420
+ projectId,
1421
+ entryATitle: s.entryATitle,
1422
+ entryBTitle: s.entryBTitle,
1423
+ similarity: s.similarity,
1424
+ accepted: false,
1425
+ source: "auto_dedup",
1426
+ });
1427
+ }
1428
+ }
1429
+
1430
+ /** Get all feedback for a project (for calibration). */
1431
+ export function getDedupFeedback(
1432
+ projectId: string | null,
1433
+ ): Array<{ similarity: number; accepted: boolean; source: string }> {
1434
+ const rows = (
1435
+ projectId !== null
1436
+ ? db()
1437
+ .query(
1438
+ "SELECT similarity, accepted, source FROM dedup_feedback WHERE project_id = ? ORDER BY similarity",
1439
+ )
1440
+ .all(projectId)
1441
+ : db()
1442
+ .query(
1443
+ "SELECT similarity, accepted, source FROM dedup_feedback WHERE project_id IS NULL ORDER BY similarity",
1444
+ )
1445
+ .all()
1446
+ ) as Array<{ similarity: number; accepted: number; source: string }>;
1447
+ return rows.map((r) => ({ similarity: r.similarity, accepted: r.accepted === 1, source: r.source }));
1448
+ }
1449
+
1450
+ /** Quick count of feedback rows for a project. */
1451
+ export function getDedupFeedbackCount(projectId: string | null): number {
1452
+ const row = (
1453
+ projectId !== null
1454
+ ? db()
1455
+ .query("SELECT COUNT(*) as cnt FROM dedup_feedback WHERE project_id = ?")
1456
+ .get(projectId)
1457
+ : db()
1458
+ .query("SELECT COUNT(*) as cnt FROM dedup_feedback WHERE project_id IS NULL")
1459
+ .get()
1460
+ ) as { cnt: number } | null;
1461
+ return row?.cnt ?? 0;
1462
+ }
1463
+
1464
+ /** Max feedback rows to keep per project (prevents unbounded growth). */
1465
+ const MAX_FEEDBACK_ROWS_PER_PROJECT = 500;
1466
+
1467
+ /**
1468
+ * Prune old feedback rows for a project, keeping the most recent
1469
+ * MAX_FEEDBACK_ROWS_PER_PROJECT rows. Called from recordAutoSignals
1470
+ * to prevent unbounded table growth.
1471
+ */
1472
+ export function pruneDedupFeedback(projectId: string | null): void {
1473
+ const count = getDedupFeedbackCount(projectId);
1474
+ if (count <= MAX_FEEDBACK_ROWS_PER_PROJECT) return;
1475
+
1476
+ const excess = count - MAX_FEEDBACK_ROWS_PER_PROJECT;
1477
+ if (projectId !== null) {
1478
+ db()
1479
+ .query(
1480
+ `DELETE FROM dedup_feedback WHERE id IN (
1481
+ SELECT id FROM dedup_feedback WHERE project_id = ?
1482
+ ORDER BY created_at ASC LIMIT ?
1483
+ )`,
1484
+ )
1485
+ .run(projectId, excess);
1486
+ } else {
1487
+ db()
1488
+ .query(
1489
+ `DELETE FROM dedup_feedback WHERE id IN (
1490
+ SELECT id FROM dedup_feedback WHERE project_id IS NULL
1491
+ ORDER BY created_at ASC LIMIT ?
1492
+ )`,
1493
+ )
1494
+ .run(excess);
1495
+ }
1496
+ }
1497
+
1498
+ /**
1499
+ * Compute an optimal embedding dedup threshold from user feedback.
1500
+ *
1501
+ * Algorithm:
1502
+ * 1. Load all (similarity, accepted) pairs for the project.
1503
+ * 2. If fewer than MIN_CALIBRATION_SAMPLES, return null (use default).
1504
+ * 3. If all feedback is "accept" (no rejects), return the minimum
1505
+ * accepted similarity minus a small margin (0.005).
1506
+ * 4. If all feedback is "reject" (no accepts), return null.
1507
+ * 5. Otherwise, find the threshold that maximizes separation:
1508
+ * - For each candidate threshold (midpoint between consecutive
1509
+ * distinct similarity values), compute accuracy:
1510
+ * correct = accepted_pairs_above + rejected_pairs_below
1511
+ * accuracy = correct / total
1512
+ * - Pick the threshold with highest accuracy.
1513
+ * - Tie-break: prefer higher threshold (conservative).
1514
+ * - Clamp to [0.85, 0.98].
1515
+ */
1516
+ export function calibrateDedupThreshold(projectId: string | null): number | null {
1517
+ const feedback = getDedupFeedback(projectId);
1518
+ if (feedback.length < MIN_CALIBRATION_SAMPLES) return null;
1519
+
1520
+ const accepted = feedback.filter((f) => f.accepted);
1521
+ const rejected = feedback.filter((f) => !f.accepted);
1522
+
1523
+ // Edge case: all accept, no rejects
1524
+ if (rejected.length === 0) {
1525
+ const minAccepted = Math.min(...accepted.map((f) => f.similarity));
1526
+ return Math.max(0.85, minAccepted - 0.005);
1527
+ }
1528
+
1529
+ // Edge case: all reject, no accepts
1530
+ if (accepted.length === 0) {
1531
+ log.warn("dedup calibration: all feedback is reject — keeping default threshold");
1532
+ return null;
1533
+ }
1534
+
1535
+ // Find optimal threshold via accuracy maximization
1536
+ const allSims = [...new Set(feedback.map((f) => f.similarity))].sort((a, b) => a - b);
1537
+
1538
+ let bestThreshold = DEFAULT_EMBEDDING_DEDUP_THRESHOLD;
1539
+ let bestAccuracy = -1;
1540
+
1541
+ for (let i = 0; i < allSims.length - 1; i++) {
1542
+ const candidate = (allSims[i] + allSims[i + 1]) / 2;
1543
+
1544
+ // Pairs above threshold are predicted "merge" — should be accepted
1545
+ // Pairs below threshold are predicted "keep separate" — should be rejected
1546
+ const correctAccepted = accepted.filter((f) => f.similarity >= candidate).length;
1547
+ const correctRejected = rejected.filter((f) => f.similarity < candidate).length;
1548
+ const accuracy = (correctAccepted + correctRejected) / feedback.length;
1549
+
1550
+ // Tie-break: prefer higher threshold (conservative — fewer false merges)
1551
+ if (accuracy > bestAccuracy || (accuracy === bestAccuracy && candidate > bestThreshold)) {
1552
+ bestAccuracy = accuracy;
1553
+ bestThreshold = candidate;
1554
+ }
1555
+ }
1556
+
1557
+ // Clamp to sane range
1558
+ return Math.max(0.85, Math.min(0.98, bestThreshold));
1559
+ }
1560
+
1561
+ /** Persist the calibrated threshold for a project. */
1562
+ export function saveCalibratedThreshold(
1563
+ projectId: string | null,
1564
+ threshold: number,
1565
+ sampleSize: number,
1566
+ ): void {
1567
+ const key = `dedup_threshold:${projectId ?? "global"}`;
1568
+ setKV(key, JSON.stringify({ threshold, sampleSize, calibratedAt: Date.now() }));
1569
+ }
1570
+
1571
+ /** Load the calibrated threshold for a project, or null if not calibrated. */
1572
+ export function loadCalibratedThreshold(projectId: string | null): number | null {
1573
+ const key = `dedup_threshold:${projectId ?? "global"}`;
1574
+ const raw = getKV(key);
1575
+ if (!raw) return null;
1576
+ try {
1577
+ const parsed = JSON.parse(raw);
1578
+ return typeof parsed.threshold === "number" ? parsed.threshold : null;
1579
+ } catch {
1580
+ return null;
1581
+ }
1582
+ }