@loreai/core 0.17.0 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (235) hide show
  1. package/dist/bun/agents-file.d.ts +4 -0
  2. package/dist/bun/agents-file.d.ts.map +1 -1
  3. package/dist/bun/config.d.ts +2 -0
  4. package/dist/bun/config.d.ts.map +1 -1
  5. package/dist/bun/curator.d.ts +45 -0
  6. package/dist/bun/curator.d.ts.map +1 -1
  7. package/dist/bun/data-dir.d.ts +18 -0
  8. package/dist/bun/data-dir.d.ts.map +1 -0
  9. package/dist/bun/db.d.ts +12 -0
  10. package/dist/bun/db.d.ts.map +1 -1
  11. package/dist/bun/distillation.d.ts.map +1 -1
  12. package/dist/bun/embedding-vendor.d.ts +22 -38
  13. package/dist/bun/embedding-vendor.d.ts.map +1 -1
  14. package/dist/bun/embedding-worker-types.d.ts +17 -12
  15. package/dist/bun/embedding-worker-types.d.ts.map +1 -1
  16. package/dist/bun/embedding-worker.d.ts +9 -2
  17. package/dist/bun/embedding-worker.d.ts.map +1 -1
  18. package/dist/bun/embedding-worker.js +38864 -33
  19. package/dist/bun/embedding-worker.js.map +4 -4
  20. package/dist/bun/embedding.d.ts +30 -22
  21. package/dist/bun/embedding.d.ts.map +1 -1
  22. package/dist/bun/gradient.d.ts +8 -1
  23. package/dist/bun/gradient.d.ts.map +1 -1
  24. package/dist/bun/import/detect.d.ts +14 -0
  25. package/dist/bun/import/detect.d.ts.map +1 -0
  26. package/dist/bun/import/extract.d.ts +43 -0
  27. package/dist/bun/import/extract.d.ts.map +1 -0
  28. package/dist/bun/import/history.d.ts +40 -0
  29. package/dist/bun/import/history.d.ts.map +1 -0
  30. package/dist/bun/import/index.d.ts +17 -0
  31. package/dist/bun/import/index.d.ts.map +1 -0
  32. package/dist/bun/import/providers/aider.d.ts +2 -0
  33. package/dist/bun/import/providers/aider.d.ts.map +1 -0
  34. package/dist/bun/import/providers/claude-code.d.ts +2 -0
  35. package/dist/bun/import/providers/claude-code.d.ts.map +1 -0
  36. package/dist/bun/import/providers/cline.d.ts +2 -0
  37. package/dist/bun/import/providers/cline.d.ts.map +1 -0
  38. package/dist/bun/import/providers/codex.d.ts +2 -0
  39. package/dist/bun/import/providers/codex.d.ts.map +1 -0
  40. package/dist/bun/import/providers/continue.d.ts +2 -0
  41. package/dist/bun/import/providers/continue.d.ts.map +1 -0
  42. package/dist/bun/import/providers/index.d.ts +19 -0
  43. package/dist/bun/import/providers/index.d.ts.map +1 -0
  44. package/dist/bun/import/providers/opencode.d.ts +2 -0
  45. package/dist/bun/import/providers/opencode.d.ts.map +1 -0
  46. package/dist/bun/import/providers/pi.d.ts +2 -0
  47. package/dist/bun/import/providers/pi.d.ts.map +1 -0
  48. package/dist/bun/import/types.d.ts +82 -0
  49. package/dist/bun/import/types.d.ts.map +1 -0
  50. package/dist/bun/index.d.ts +4 -1
  51. package/dist/bun/index.d.ts.map +1 -1
  52. package/dist/bun/index.js +2217 -224
  53. package/dist/bun/index.js.map +4 -4
  54. package/dist/bun/instruction-detect.d.ts +66 -0
  55. package/dist/bun/instruction-detect.d.ts.map +1 -0
  56. package/dist/bun/log.d.ts +9 -0
  57. package/dist/bun/log.d.ts.map +1 -1
  58. package/dist/bun/ltm.d.ts +40 -0
  59. package/dist/bun/ltm.d.ts.map +1 -1
  60. package/dist/bun/pattern-extract.d.ts +7 -0
  61. package/dist/bun/pattern-extract.d.ts.map +1 -1
  62. package/dist/bun/prompt.d.ts +1 -1
  63. package/dist/bun/prompt.d.ts.map +1 -1
  64. package/dist/bun/recall.d.ts.map +1 -1
  65. package/dist/bun/search.d.ts +5 -3
  66. package/dist/bun/search.d.ts.map +1 -1
  67. package/dist/bun/temporal.d.ts.map +1 -1
  68. package/dist/bun/types.d.ts +1 -1
  69. package/dist/node/agents-file.d.ts +4 -0
  70. package/dist/node/agents-file.d.ts.map +1 -1
  71. package/dist/node/config.d.ts +2 -0
  72. package/dist/node/config.d.ts.map +1 -1
  73. package/dist/node/curator.d.ts +45 -0
  74. package/dist/node/curator.d.ts.map +1 -1
  75. package/dist/node/data-dir.d.ts +18 -0
  76. package/dist/node/data-dir.d.ts.map +1 -0
  77. package/dist/node/db.d.ts +12 -0
  78. package/dist/node/db.d.ts.map +1 -1
  79. package/dist/node/distillation.d.ts.map +1 -1
  80. package/dist/node/embedding-vendor.d.ts +22 -38
  81. package/dist/node/embedding-vendor.d.ts.map +1 -1
  82. package/dist/node/embedding-worker-types.d.ts +17 -12
  83. package/dist/node/embedding-worker-types.d.ts.map +1 -1
  84. package/dist/node/embedding-worker.d.ts +9 -2
  85. package/dist/node/embedding-worker.d.ts.map +1 -1
  86. package/dist/node/embedding-worker.js +38864 -33
  87. package/dist/node/embedding-worker.js.map +4 -4
  88. package/dist/node/embedding.d.ts +30 -22
  89. package/dist/node/embedding.d.ts.map +1 -1
  90. package/dist/node/gradient.d.ts +8 -1
  91. package/dist/node/gradient.d.ts.map +1 -1
  92. package/dist/node/import/detect.d.ts +14 -0
  93. package/dist/node/import/detect.d.ts.map +1 -0
  94. package/dist/node/import/extract.d.ts +43 -0
  95. package/dist/node/import/extract.d.ts.map +1 -0
  96. package/dist/node/import/history.d.ts +40 -0
  97. package/dist/node/import/history.d.ts.map +1 -0
  98. package/dist/node/import/index.d.ts +17 -0
  99. package/dist/node/import/index.d.ts.map +1 -0
  100. package/dist/node/import/providers/aider.d.ts +2 -0
  101. package/dist/node/import/providers/aider.d.ts.map +1 -0
  102. package/dist/node/import/providers/claude-code.d.ts +2 -0
  103. package/dist/node/import/providers/claude-code.d.ts.map +1 -0
  104. package/dist/node/import/providers/cline.d.ts +2 -0
  105. package/dist/node/import/providers/cline.d.ts.map +1 -0
  106. package/dist/node/import/providers/codex.d.ts +2 -0
  107. package/dist/node/import/providers/codex.d.ts.map +1 -0
  108. package/dist/node/import/providers/continue.d.ts +2 -0
  109. package/dist/node/import/providers/continue.d.ts.map +1 -0
  110. package/dist/node/import/providers/index.d.ts +19 -0
  111. package/dist/node/import/providers/index.d.ts.map +1 -0
  112. package/dist/node/import/providers/opencode.d.ts +2 -0
  113. package/dist/node/import/providers/opencode.d.ts.map +1 -0
  114. package/dist/node/import/providers/pi.d.ts +2 -0
  115. package/dist/node/import/providers/pi.d.ts.map +1 -0
  116. package/dist/node/import/types.d.ts +82 -0
  117. package/dist/node/import/types.d.ts.map +1 -0
  118. package/dist/node/index.d.ts +4 -1
  119. package/dist/node/index.d.ts.map +1 -1
  120. package/dist/node/index.js +2217 -224
  121. package/dist/node/index.js.map +4 -4
  122. package/dist/node/instruction-detect.d.ts +66 -0
  123. package/dist/node/instruction-detect.d.ts.map +1 -0
  124. package/dist/node/log.d.ts +9 -0
  125. package/dist/node/log.d.ts.map +1 -1
  126. package/dist/node/ltm.d.ts +40 -0
  127. package/dist/node/ltm.d.ts.map +1 -1
  128. package/dist/node/pattern-extract.d.ts +7 -0
  129. package/dist/node/pattern-extract.d.ts.map +1 -1
  130. package/dist/node/prompt.d.ts +1 -1
  131. package/dist/node/prompt.d.ts.map +1 -1
  132. package/dist/node/recall.d.ts.map +1 -1
  133. package/dist/node/search.d.ts +5 -3
  134. package/dist/node/search.d.ts.map +1 -1
  135. package/dist/node/temporal.d.ts.map +1 -1
  136. package/dist/node/types.d.ts +1 -1
  137. package/dist/types/agents-file.d.ts +4 -0
  138. package/dist/types/agents-file.d.ts.map +1 -1
  139. package/dist/types/config.d.ts +2 -0
  140. package/dist/types/config.d.ts.map +1 -1
  141. package/dist/types/curator.d.ts +45 -0
  142. package/dist/types/curator.d.ts.map +1 -1
  143. package/dist/types/data-dir.d.ts +18 -0
  144. package/dist/types/data-dir.d.ts.map +1 -0
  145. package/dist/types/db.d.ts +12 -0
  146. package/dist/types/db.d.ts.map +1 -1
  147. package/dist/types/distillation.d.ts.map +1 -1
  148. package/dist/types/embedding-vendor.d.ts +22 -38
  149. package/dist/types/embedding-vendor.d.ts.map +1 -1
  150. package/dist/types/embedding-worker-types.d.ts +17 -12
  151. package/dist/types/embedding-worker-types.d.ts.map +1 -1
  152. package/dist/types/embedding-worker.d.ts +9 -2
  153. package/dist/types/embedding-worker.d.ts.map +1 -1
  154. package/dist/types/embedding.d.ts +30 -22
  155. package/dist/types/embedding.d.ts.map +1 -1
  156. package/dist/types/gradient.d.ts +8 -1
  157. package/dist/types/gradient.d.ts.map +1 -1
  158. package/dist/types/import/detect.d.ts +14 -0
  159. package/dist/types/import/detect.d.ts.map +1 -0
  160. package/dist/types/import/extract.d.ts +43 -0
  161. package/dist/types/import/extract.d.ts.map +1 -0
  162. package/dist/types/import/history.d.ts +40 -0
  163. package/dist/types/import/history.d.ts.map +1 -0
  164. package/dist/types/import/index.d.ts +17 -0
  165. package/dist/types/import/index.d.ts.map +1 -0
  166. package/dist/types/import/providers/aider.d.ts +2 -0
  167. package/dist/types/import/providers/aider.d.ts.map +1 -0
  168. package/dist/types/import/providers/claude-code.d.ts +2 -0
  169. package/dist/types/import/providers/claude-code.d.ts.map +1 -0
  170. package/dist/types/import/providers/cline.d.ts +2 -0
  171. package/dist/types/import/providers/cline.d.ts.map +1 -0
  172. package/dist/types/import/providers/codex.d.ts +2 -0
  173. package/dist/types/import/providers/codex.d.ts.map +1 -0
  174. package/dist/types/import/providers/continue.d.ts +2 -0
  175. package/dist/types/import/providers/continue.d.ts.map +1 -0
  176. package/dist/types/import/providers/index.d.ts +19 -0
  177. package/dist/types/import/providers/index.d.ts.map +1 -0
  178. package/dist/types/import/providers/opencode.d.ts +2 -0
  179. package/dist/types/import/providers/opencode.d.ts.map +1 -0
  180. package/dist/types/import/providers/pi.d.ts +2 -0
  181. package/dist/types/import/providers/pi.d.ts.map +1 -0
  182. package/dist/types/import/types.d.ts +82 -0
  183. package/dist/types/import/types.d.ts.map +1 -0
  184. package/dist/types/index.d.ts +4 -1
  185. package/dist/types/index.d.ts.map +1 -1
  186. package/dist/types/instruction-detect.d.ts +66 -0
  187. package/dist/types/instruction-detect.d.ts.map +1 -0
  188. package/dist/types/log.d.ts +9 -0
  189. package/dist/types/log.d.ts.map +1 -1
  190. package/dist/types/ltm.d.ts +40 -0
  191. package/dist/types/ltm.d.ts.map +1 -1
  192. package/dist/types/pattern-extract.d.ts +7 -0
  193. package/dist/types/pattern-extract.d.ts.map +1 -1
  194. package/dist/types/prompt.d.ts +1 -1
  195. package/dist/types/prompt.d.ts.map +1 -1
  196. package/dist/types/recall.d.ts.map +1 -1
  197. package/dist/types/search.d.ts +5 -3
  198. package/dist/types/search.d.ts.map +1 -1
  199. package/dist/types/temporal.d.ts.map +1 -1
  200. package/dist/types/types.d.ts +1 -1
  201. package/package.json +2 -4
  202. package/src/agents-file.ts +41 -13
  203. package/src/config.ts +31 -18
  204. package/src/curator.ts +111 -75
  205. package/src/data-dir.ts +76 -0
  206. package/src/db.ts +110 -11
  207. package/src/distillation.ts +10 -2
  208. package/src/embedding-vendor.ts +23 -40
  209. package/src/embedding-worker-types.ts +19 -11
  210. package/src/embedding-worker.ts +111 -47
  211. package/src/embedding.ts +196 -171
  212. package/src/gradient.ts +9 -1
  213. package/src/import/detect.ts +37 -0
  214. package/src/import/extract.ts +137 -0
  215. package/src/import/history.ts +99 -0
  216. package/src/import/index.ts +45 -0
  217. package/src/import/providers/aider.ts +207 -0
  218. package/src/import/providers/claude-code.ts +339 -0
  219. package/src/import/providers/cline.ts +324 -0
  220. package/src/import/providers/codex.ts +369 -0
  221. package/src/import/providers/continue.ts +304 -0
  222. package/src/import/providers/index.ts +32 -0
  223. package/src/import/providers/opencode.ts +272 -0
  224. package/src/import/providers/pi.ts +332 -0
  225. package/src/import/types.ts +91 -0
  226. package/src/index.ts +5 -0
  227. package/src/instruction-detect.ts +275 -0
  228. package/src/log.ts +91 -3
  229. package/src/ltm.ts +316 -3
  230. package/src/pattern-extract.ts +41 -0
  231. package/src/prompt.ts +7 -1
  232. package/src/recall.ts +43 -5
  233. package/src/search.ts +7 -5
  234. package/src/temporal.ts +8 -6
  235. package/src/types.ts +1 -1
package/src/ltm.ts CHANGED
@@ -1,7 +1,7 @@
1
1
  import { uuidv7 } from "uuidv7";
2
2
  import { db, ensureProject } from "./db";
3
3
  import { config } from "./config";
4
- import { ftsQuery, EMPTY_QUERY, extractTopTerms, runRelaxedSearch } from "./search";
4
+ import { ftsQuery, ftsQueryOr, EMPTY_QUERY, extractTopTerms, filterTerms, runRelaxedSearch } from "./search";
5
5
  import * as embedding from "./embedding";
6
6
  import * as latReader from "./lat-reader";
7
7
  import * as log from "./log";
@@ -50,6 +50,10 @@ export function create(input: {
50
50
  ? ensureProject(input.projectPath)
51
51
  : null;
52
52
 
53
+ // IF-2: Global entries (pid=null) must be cross-project to avoid a data hole
54
+ // where forSession() can't find them in either the project or cross-project pool.
55
+ const crossProject = pid === null ? true : (input.crossProject ?? false);
56
+
53
57
  // Dedup guard: if an entry with the same project_id + title already exists,
54
58
  // update its content instead of inserting a duplicate. This prevents the
55
59
  // curator from creating multiple entries for the same concept across sessions.
@@ -90,6 +94,16 @@ export function create(input: {
90
94
  update(crossExisting.id, { content: input.content });
91
95
  return crossExisting.id;
92
96
  }
97
+
98
+ // Fuzzy dedup: check for title-similar entries via FTS5 + word-overlap.
99
+ // This catches near-duplicates the curator creates with slightly different
100
+ // titles for the same concept (e.g. "Upgrade lock bug" vs "Upgrade binary
101
+ // lock re-entry bug"). Placed after exact checks (cheaper checks first).
102
+ const fuzzyMatch = findFuzzyDuplicate({ title: input.title, projectId: pid });
103
+ if (fuzzyMatch) {
104
+ update(fuzzyMatch.id, { content: input.content });
105
+ return fuzzyMatch.id;
106
+ }
93
107
  }
94
108
 
95
109
  const id = input.id ?? uuidv7();
@@ -106,7 +120,7 @@ export function create(input: {
106
120
  input.title,
107
121
  input.content,
108
122
  input.session ?? null,
109
- (input.crossProject ?? false) ? 1 : 0,
123
+ crossProject ? 1 : 0,
110
124
  now,
111
125
  now,
112
126
  );
@@ -130,8 +144,10 @@ export function update(
130
144
  params.push(input.content);
131
145
  }
132
146
  if (input.confidence !== undefined) {
147
+ // Clamp to [0.0, 1.0] — an LLM-provided value above 1 would give
148
+ // disproportionate scoring weight, and a negative value is outside the valid range (note: anything at or below 0.2 is still hidden by the `confidence > 0.2` filters).
133
149
  sets.push("confidence = ?");
134
- params.push(input.confidence);
150
+ params.push(Math.max(0, Math.min(1, input.confidence)));
135
151
  }
136
152
  sets.push("updated_at = ?");
137
153
  params.push(Date.now());
@@ -153,6 +169,100 @@ export function remove(id: string) {
153
169
  db().query("DELETE FROM knowledge WHERE id = ?").run(id);
154
170
  }
155
171
 
172
+ // ---------------------------------------------------------------------------
173
+ // Fuzzy title dedup — word-overlap similarity
174
+ // ---------------------------------------------------------------------------
175
+
176
+ /**
177
+ * Compute title word-overlap between two titles.
178
+ * Returns { coefficient, intersectionSize } where:
179
+ * - coefficient = |A ∩ B| / min(|A|, |B|) (0–1)
180
+ * - intersectionSize = number of shared meaningful words
181
+ * Filters stopwords and single-char tokens for meaningful comparison.
182
+ */
183
+ function titleOverlap(a: string, b: string): { coefficient: number; intersectionSize: number } {
184
+ const wordsA = new Set(filterTerms(a).map((w) => w.toLowerCase()));
185
+ const wordsB = new Set(filterTerms(b).map((w) => w.toLowerCase()));
186
+ if (wordsA.size === 0 || wordsB.size === 0) return { coefficient: 0, intersectionSize: 0 };
187
+ const intersection = [...wordsA].filter((w) => wordsB.has(w));
188
+ return {
189
+ coefficient: intersection.length / Math.min(wordsA.size, wordsB.size),
190
+ intersectionSize: intersection.length,
191
+ };
192
+ }
193
+
194
+ /** Minimum word-overlap coefficient to consider two titles as duplicates. */
195
+ const FUZZY_DEDUP_THRESHOLD = 0.7;
196
+ /** Minimum number of overlapping meaningful words required for a fuzzy match.
197
+ * Prevents false positives on short titles where 2-3 common words produce
198
+ * a high overlap coefficient despite being genuinely different entries. */
199
+ const FUZZY_DEDUP_MIN_OVERLAP = 4;
200
+ /** Minimum cosine similarity for embedding-based dedup. Empirically tuned
201
+ * against 312 Nomic v1.5 entries:
202
+ * - 0.935+: all genuine duplicates (same topic, different wording)
203
+ * - 0.92–0.935: contains false positives from same-subsystem entries
204
+ * (e.g. "BGE Small unusable" ↔ "Nomic OOM" scored 0.9326 — related
205
+ * but distinct bugs). Star clustering amplifies this by bridging.
206
+ * - <0.92: mixed or unrelated entries */
207
+ const EMBEDDING_DEDUP_THRESHOLD = 0.935;
208
+
209
+ /**
210
+ * Find an existing knowledge entry whose title is fuzzy-similar to the given title.
211
+ *
212
+ * Uses FTS5 to find up to 5 candidates, then applies word-overlap filtering.
213
+ * This is the same algorithm used by `check()` but returns a single match
214
+ * for use in the `create()` dedup guard.
215
+ *
216
+ * @returns The first matching entry (id + title), or null if no fuzzy match.
217
+ */
218
+ export function findFuzzyDuplicate(input: {
219
+ title: string;
220
+ projectId: string | null;
221
+ excludeId?: string;
222
+ }): { id: string; title: string } | null {
223
+ const q = ftsQueryOr(input.title);
224
+ if (q === EMPTY_QUERY) return null;
225
+
226
+ const { title: tw, content: cw, category: catw } = config().search.ftsWeights;
227
+
228
+ try {
229
+ // Build query scoped to the same project + cross-project entries
230
+ const excludeClause = input.excludeId ? "AND k.id != ?" : "";
231
+ const sql = input.projectId !== null
232
+ ? `SELECT k.id, k.title FROM knowledge_fts f
233
+ CROSS JOIN knowledge k ON k.rowid = f.rowid
234
+ WHERE knowledge_fts MATCH ?
235
+ AND (k.project_id = ? OR k.cross_project = 1)
236
+ AND k.confidence > 0.2
237
+ ${excludeClause}
238
+ ORDER BY bm25(knowledge_fts, ?, ?, ?) LIMIT 5`
239
+ : `SELECT k.id, k.title FROM knowledge_fts f
240
+ CROSS JOIN knowledge k ON k.rowid = f.rowid
241
+ WHERE knowledge_fts MATCH ?
242
+ AND (k.project_id IS NULL OR k.cross_project = 1)
243
+ AND k.confidence > 0.2
244
+ ${excludeClause}
245
+ ORDER BY bm25(knowledge_fts, ?, ?, ?) LIMIT 5`;
246
+
247
+ const params: (string | number)[] = input.projectId !== null
248
+ ? [q, input.projectId, ...(input.excludeId ? [input.excludeId] : []), tw, cw, catw]
249
+ : [q, ...(input.excludeId ? [input.excludeId] : []), tw, cw, catw];
250
+
251
+ const candidates = db().query(sql).all(...params) as Array<{ id: string; title: string }>;
252
+
253
+ for (const candidate of candidates) {
254
+ const { coefficient, intersectionSize } = titleOverlap(input.title, candidate.title);
255
+ if (coefficient >= FUZZY_DEDUP_THRESHOLD && intersectionSize >= FUZZY_DEDUP_MIN_OVERLAP) {
256
+ return candidate;
257
+ }
258
+ }
259
+ } catch {
260
+ // FTS5 error — fall through to no match
261
+ }
262
+
263
+ return null;
264
+ }
265
+
156
266
  export function forProject(
157
267
  projectPath: string,
158
268
  includeCross = true,
@@ -418,6 +528,17 @@ export function all(): KnowledgeEntry[] {
418
528
  .all() as KnowledgeEntry[];
419
529
  }
420
530
 
531
+ /** Return all cross-project and global (user-level) knowledge entries. */
532
+ export function crossProject(): KnowledgeEntry[] {
533
+ return db()
534
+ .query(
535
+ `SELECT ${KNOWLEDGE_COLS} FROM knowledge
536
+ WHERE (project_id IS NULL OR cross_project = 1) AND confidence > 0.2
537
+ ORDER BY confidence DESC, updated_at DESC`,
538
+ )
539
+ .all() as KnowledgeEntry[];
540
+ }
541
+
421
542
  // LIKE-based fallback for when FTS5 fails unexpectedly.
422
543
  function searchLike(input: {
423
544
  query: string;
@@ -832,3 +953,195 @@ export function check(projectPath: string): IntegrityIssue[] {
832
953
 
833
954
  return issues;
834
955
  }
956
+
957
+ // ---------------------------------------------------------------------------
958
+ // Deduplication — embedding-based semantic clustering with word-overlap fallback
959
+ // ---------------------------------------------------------------------------
960
+
961
+ export type DedupCluster = {
962
+ surviving: { id: string; title: string };
963
+ merged: Array<{ id: string; title: string }>;
964
+ };
965
+
966
+ export type DedupResult = {
967
+ clusters: DedupCluster[];
968
+ totalRemoved: number;
969
+ };
970
+
971
+ /**
972
+ * Deduplicate knowledge entries for a project.
973
+ *
974
+ * Uses two complementary signals with "star" clustering (no transitive
975
+ * chains) to prevent snowball merging:
976
+ *
977
+ * 1. **Title word-overlap** (overlap coefficient |A ∩ B| / min(|A|, |B|) on meaningful words) — catches entries
978
+ * with similar titles regardless of content wording.
979
+ * 2. **Embedding cosine similarity** (when embeddings are available) — catches
980
+ * entries with different titles but semantically identical content. Nomic
981
+ * v1.5 produces a same-domain spread of 0.46–0.70 for distinct entries,
982
+ * making threshold-based dedup viable at 0.935+ (lower thresholds catch
983
+ * related-but-distinct entries as false positives, especially via star
984
+ * clustering where a hub entry bridges two distinct topics).
985
+ *
986
+ * Pairs matching either signal are clustered together. For each cluster,
987
+ * picks a survivor (highest confidence, then most recently updated, then
988
+ * shortest title) and removes the rest.
989
+ *
990
+ * @param projectPath Project root path
991
+ * @param opts.dryRun If true (default), report clusters without deleting
992
+ * @returns Cluster report and count of removed entries
993
+ */
994
+ /** Core dedup logic — operates on an arbitrary list of entries. */
995
+ function _dedup(entries: KnowledgeEntry[], dryRun: boolean): DedupResult {
996
+ if (entries.length < 2) return { clusters: [], totalRemoved: 0 };
997
+
998
+ // --- Build neighbor map using title overlap + embedding similarity ---
999
+ // Two entries are considered neighbors (potential duplicates) if EITHER:
1000
+ // (a) title word-overlap ≥ 0.7 with ≥ 4 shared words, OR
1001
+ // (b) embedding cosine similarity ≥ 0.935
1002
+ // Star clustering (no transitivity) prevents snowball merging.
1003
+ // O(n²) pairwise comparison — acceptable for n ≤ 25 (maxEntries cap).
1004
+
1005
+ // Load embeddings for the given entries (if available).
1006
+ // We query directly rather than using vectorSearch() because we need
1007
+ // pairwise comparison among entries, not a query-vs-all search.
1008
+ const embeddingMap = new Map<string, Float32Array>();
1009
+ {
1010
+ const entryIds = entries.map((e) => e.id);
1011
+ // Build parameterized IN clause for the entry IDs
1012
+ const placeholders = entryIds.map(() => "?").join(",");
1013
+ const rows = db()
1014
+ .query(`SELECT id, embedding FROM knowledge WHERE embedding IS NOT NULL AND id IN (${placeholders})`)
1015
+ .all(...entryIds) as Array<{ id: string; embedding: Buffer }>;
1016
+ for (const row of rows) {
1017
+ try {
1018
+ embeddingMap.set(row.id, embedding.fromBlob(row.embedding));
1019
+ } catch {
1020
+ // Skip corrupted embeddings — entry falls back to title-overlap only.
1021
+ log.info(`skipping corrupted embedding for entry ${row.id}`);
1022
+ }
1023
+ }
1024
+ }
1025
+
1026
+ // Pre-compute neighbors for all pairs
1027
+ type DedupHit = { id: string; score: number };
1028
+ const neighborMap = new Map<string, DedupHit[]>();
1029
+
1030
+ for (const entry of entries) {
1031
+ const neighbors: DedupHit[] = [];
1032
+ const entryVec = embeddingMap.get(entry.id);
1033
+
1034
+ for (const other of entries) {
1035
+ if (other.id === entry.id) continue;
1036
+
1037
+ // Signal 1: title word-overlap
1038
+ const { coefficient, intersectionSize } = titleOverlap(entry.title, other.title);
1039
+ const titleMatch = coefficient >= FUZZY_DEDUP_THRESHOLD && intersectionSize >= FUZZY_DEDUP_MIN_OVERLAP;
1040
+
1041
+ // Signal 2: embedding cosine similarity
1042
+ let embeddingMatch = false;
1043
+ let similarity = 0;
1044
+ if (entryVec) {
1045
+ const otherVec = embeddingMap.get(other.id);
1046
+ if (otherVec && entryVec.length === otherVec.length) {
1047
+ similarity = embedding.cosineSimilarity(entryVec, otherVec);
1048
+ embeddingMatch = similarity >= EMBEDDING_DEDUP_THRESHOLD;
1049
+ }
1050
+ }
1051
+
1052
+ if (titleMatch || embeddingMatch) {
1053
+ // Use the stronger signal as the match score for cluster priority
1054
+ neighbors.push({ id: other.id, score: Math.max(coefficient, similarity) });
1055
+ }
1056
+ }
1057
+ neighbors.sort((a, b) => b.score - a.score);
1058
+ neighborMap.set(entry.id, neighbors);
1059
+ }
1060
+
1061
+ // Greedy star clustering — process entries with most neighbors first
1062
+ const claimed = new Set<string>();
1063
+ const rawClusters = new Map<string, string[]>();
1064
+
1065
+ const sortedIds = [...neighborMap.keys()].sort(
1066
+ (a, b) => neighborMap.get(b)!.length - neighborMap.get(a)!.length,
1067
+ );
1068
+
1069
+ for (const centerId of sortedIds) {
1070
+ if (claimed.has(centerId)) continue;
1071
+ claimed.add(centerId);
1072
+ const members = [centerId];
1073
+
1074
+ for (const { id: neighborId } of neighborMap.get(centerId)!) {
1075
+ if (claimed.has(neighborId)) continue;
1076
+ claimed.add(neighborId);
1077
+ members.push(neighborId);
1078
+ }
1079
+
1080
+ if (members.length > 1) {
1081
+ rawClusters.set(centerId, members);
1082
+ }
1083
+ }
1084
+
1085
+ // Build clusters and pick survivors
1086
+ const entryById = new Map(entries.map((e) => [e.id, e]));
1087
+ const result: DedupCluster[] = [];
1088
+ let totalRemoved = 0;
1089
+
1090
+ for (const members of rawClusters.values()) {
1091
+ if (members.length < 2) continue;
1092
+
1093
+ // Pick survivor: highest confidence → most recent → shortest title
1094
+ const sorted = members
1095
+ .map((id) => entryById.get(id)!)
1096
+ .filter(Boolean)
1097
+ .sort((a, b) => {
1098
+ if (b.confidence !== a.confidence) return b.confidence - a.confidence;
1099
+ if (b.updated_at !== a.updated_at) return b.updated_at - a.updated_at;
1100
+ return a.title.length - b.title.length;
1101
+ });
1102
+
1103
+ const survivor = sorted[0];
1104
+ const merged = sorted.slice(1);
1105
+
1106
+ result.push({
1107
+ surviving: { id: survivor.id, title: survivor.title },
1108
+ merged: merged.map((e) => ({ id: e.id, title: e.title })),
1109
+ });
1110
+
1111
+ if (!dryRun) {
1112
+ for (const entry of merged) {
1113
+ remove(entry.id);
1114
+ }
1115
+ }
1116
+
1117
+ totalRemoved += merged.length;
1118
+ }
1119
+
1120
+ // Sort clusters by size descending for readability
1121
+ result.sort((a, b) => b.merged.length - a.merged.length);
1122
+
1123
+ return { clusters: result, totalRemoved };
1124
+ }
1125
+
1126
+ export async function deduplicate(
1127
+ projectPath: string,
1128
+ opts?: { dryRun?: boolean },
1129
+ ): Promise<DedupResult> {
1130
+ const entries = forProject(projectPath, false);
1131
+ return _dedup(entries, opts?.dryRun ?? true);
1132
+ }
1133
+
1134
+ /** Deduplicate global (user-level) entries, i.e. those with no project_id; cross-project entries that belong to a project are not included. */
1135
+ export async function deduplicateGlobal(
1136
+ opts?: { dryRun?: boolean },
1137
+ ): Promise<DedupResult> {
1138
+ const entries = db()
1139
+ .query(
1140
+ `SELECT ${KNOWLEDGE_COLS} FROM knowledge
1141
+ WHERE project_id IS NULL
1142
+ AND confidence > 0.2
1143
+ ORDER BY confidence DESC, updated_at DESC`,
1144
+ )
1145
+ .all() as KnowledgeEntry[];
1146
+ return _dedup(entries, opts?.dryRun ?? true);
1147
+ }
package/src/pattern-extract.ts CHANGED
@@ -12,6 +12,13 @@
12
12
  * - "prefers X for Y"
13
13
  * - "going with X because Y"
14
14
  *
15
+ * Also matches process instruction patterns from distilled observations
16
+ * where the observer normalizes user assertions:
17
+ * - "User stated always X"
18
+ * - "User said never Y"
19
+ * - "User stated make sure to X"
20
+ * - "User stated don't forget to X"
21
+ *
15
22
  * Extracted entries participate in the normal curator cycle — the curator
16
23
  * can consolidate or remove them based on actual value. The extraction is
17
24
  * a cheap seed, not a permanent fixture.
@@ -76,6 +83,33 @@ const PATTERNS: PatternDef[] = [
76
83
  category: "preference",
77
84
  titleFn: (m) => `Typically uses ${m[1].trim()}`,
78
85
  },
86
+
87
+ // Process instruction patterns — match distilled observations recording
88
+ // user assertions about workflow/process rules. The distillation observer
89
+ // normalizes user instructions into "User stated always X" phrasing.
90
+ // These require "stated/asserted/said" to avoid overlapping with the
91
+ // existing "typically uses" pattern above (which already handles
92
+ // "user always use/prefer/go with X").
93
+ {
94
+ regex: /(?:user |team |we )(?:stated |asserted |said )(?:to )?always (.+?)(?:\.|,|$)/gi,
95
+ category: "preference",
96
+ titleFn: (m) => `Always ${m[1].trim()}`,
97
+ },
98
+ {
99
+ regex: /(?:user |team |we )(?:stated |asserted |said )(?:to )?never (.+?)(?:\.|,|$)/gi,
100
+ category: "preference",
101
+ titleFn: (m) => `Never ${m[1].trim()}`,
102
+ },
103
+ {
104
+ regex: /(?:user |team |we )(?:stated |asserted |said )(?:to )?make sure to (.+?)(?:\.|,|$)/gi,
105
+ category: "preference",
106
+ titleFn: (m) => `Make sure to ${m[1].trim()}`,
107
+ },
108
+ {
109
+ regex: /(?:user |team |we )(?:stated |asserted |said )(?:to )?(?:don't|do not) forget (?:to )?(.+?)(?:\.|,|$)/gi,
110
+ category: "preference",
111
+ titleFn: (m) => `Always ${m[1].trim()}`,
112
+ },
79
113
  ];
80
114
 
81
115
  /**
@@ -96,6 +130,13 @@ export function extractPatterns(observations: string): ExtractedPattern[] {
96
130
  regex.lastIndex = 0;
97
131
  let match: RegExpMatchArray | null;
98
132
  while ((match = regex.exec(observations)) !== null) {
133
+ // Skip false positives: template placeholders (e.g. "X", "Y"),
134
+ // quoted fragments, or very short captures that are clearly not
135
+ // real technology/tool names. Plain apostrophes (') are allowed
136
+ // since they appear in valid names like "Bun's test runner".
137
+ const captures = match.slice(1);
138
+ if (captures.some((c) => c && (c.trim().length <= 2 || /["\u201C\u201D`\u2018\u2019]/.test(c)))) continue;
139
+
99
140
  const title = titleFn(match);
100
141
  const key = title.toLowerCase();
101
142
  if (seen.has(key)) continue;
package/src/prompt.ts CHANGED
@@ -222,6 +222,10 @@ Focus ONLY on knowledge that helps a coding agent work effectively on THIS codeb
222
222
  - Environment/tooling setup details that affect development
223
223
  - Important relationships between components that aren't obvious from reading the code
224
224
  - User preferences and working style specific to how they use this project
225
+ - Repeated user instructions — when the user says things like "always", "never",
226
+ "make sure to", "don't forget to", these are high-value preference candidates.
227
+ If you see instruction-like language, prioritize extracting it as a "preference" entry.
228
+ These instructions represent how the user wants to work and should persist across sessions.
225
229
 
226
230
  Do NOT extract:
227
231
  - Task-specific details (file currently being edited, current bug being fixed)
@@ -316,7 +320,9 @@ IMPORTANT:
316
320
  2. When updating, REPLACE the content with a complete rewrite — never append.
317
321
  3. If entries cover the same system from different angles, merge them: update one, delete the rest.
318
322
  4. Only create a new entry for genuinely distinct knowledge with no existing home.
319
- 5. Keep all entries under 150 words. If an existing entry is too long, use an update op to trim it.`;
323
+ 5. Keep all entries under 150 words. If an existing entry is too long, use an update op to trim it.
324
+ 6. Pay special attention to user instructions ("always do X", "never do Y", "make sure to X").
325
+ These are strong signals for "preference" entries with high confidence.`;
320
326
  }
321
327
 
322
328
  /**
package/src/recall.ts CHANGED
@@ -475,14 +475,27 @@ export async function searchRecall(
475
475
  }
476
476
  }
477
477
 
478
+ // Determine vector boost weight: for queries with enough meaningful terms,
479
+ // boost vector search lists so semantic similarity outweighs keyword noise.
480
+ const queryTermCount = filterTerms(query).length;
481
+ const vectorWeight =
482
+ queryTermCount >= (searchConfig?.vectorBoostMinTerms ?? 3)
483
+ ? (searchConfig?.vectorBoostWeight ?? 1.5)
484
+ : 1;
485
+
478
486
  // Collect per-query RRF lists. Original query is always first; if expansion
479
487
  // produced extras, we still weight the original twice by adding both original
480
488
  // and expanded lists (RRF naturally weights items appearing in more lists).
481
489
  const allRrfLists: Array<{
482
490
  items: TaggedResult[];
483
491
  key: (r: TaggedResult) => string;
492
+ weight?: number;
484
493
  }> = [];
485
494
 
495
+ // Track where primary (first-query) lists end so the MAX_RRF_LISTS cap
496
+ // trims expanded-query lists first, preserving vector/supplemental lists.
497
+ let primaryListEnd = 0;
498
+
486
499
  for (const q of queries) {
487
500
  const knowledgeResults: ltm.ScoredKnowledgeEntry[] = [];
488
501
  if (knowledgeEnabled && scope !== "session") {
@@ -568,7 +581,15 @@ export async function searchRecall(
568
581
  key: (r) => `t:${r.item.id}`,
569
582
  });
570
583
  }
584
+
585
+ // Mark the end of the first (original) query's lists. Supplemental lists
586
+ // (vector, lat.md, cross-project, quality, exact-match) are appended after
587
+ // the loop and should be preserved over expanded-query lists when capping.
588
+ if (primaryListEnd === 0) {
589
+ primaryListEnd = allRrfLists.length;
590
+ }
571
591
  }
592
+ const perQueryListEnd = allRrfLists.length;
572
593
 
573
594
  // Vector search on the original query (not expansions — avoid redundant embeds).
574
595
  if (embedding.isAvailable() && scope !== "session") {
@@ -593,6 +614,7 @@ export async function searchRecall(
593
614
  allRrfLists.push({
594
615
  items: vectorTagged,
595
616
  key: (r) => `k:${r.item.id}`,
617
+ weight: vectorWeight,
596
618
  });
597
619
  }
598
620
  }
@@ -618,6 +640,7 @@ export async function searchRecall(
618
640
  allRrfLists.push({
619
641
  items: distVectorTagged,
620
642
  key: (r) => `d:${r.item.id}`,
643
+ weight: vectorWeight,
621
644
  });
622
645
  }
623
646
  }
@@ -648,6 +671,7 @@ export async function searchRecall(
648
671
  allRrfLists.push({
649
672
  items: temporalVectorTagged,
650
673
  key: (r) => `t:${r.item.id}`,
674
+ weight: vectorWeight,
651
675
  });
652
676
  }
653
677
  }
@@ -786,6 +810,25 @@ export async function searchRecall(
786
810
  }
787
811
  }
788
812
 
813
+ // Cap the number of RRF lists to prevent score inflation from marginal items.
814
+ // With query expansion (3 queries × 4 sources + supplemental lists), the list
815
+ // count can exceed 15. Each list gives marginal items enough cumulative RRF
816
+ // score to clear the relevance floor.
817
+ //
818
+ // Priority: primary (original query BM25 + recency) and supplemental
819
+ // (vector, lat.md, cross-project, quality, exact-match) are high-value.
820
+ // Expanded-query BM25 lists are lowest priority — trim those first.
821
+ const MAX_RRF_LISTS = 10;
822
+ if (allRrfLists.length > MAX_RRF_LISTS) {
823
+ // Layout: [0..primaryListEnd) = primary, [primaryListEnd..perQueryEnd) = expanded, [perQueryEnd..) = supplemental
824
+ const primary = allRrfLists.slice(0, primaryListEnd);
825
+ const expanded = allRrfLists.slice(primaryListEnd, perQueryListEnd);
826
+ const supplemental = allRrfLists.slice(perQueryListEnd);
827
+ const budget = Math.max(0, MAX_RRF_LISTS - primary.length - supplemental.length);
828
+ allRrfLists.length = 0;
829
+ allRrfLists.push(...primary, ...expanded.slice(0, budget), ...supplemental);
830
+ }
831
+
789
832
  const fused = reciprocalRankFusion<TaggedResult>(allRrfLists);
790
833
 
791
834
  // Cap output: return at most 3x the per-source limit. With 7+ RRF sources
@@ -885,11 +928,6 @@ export async function runRecall(input: RecallInput): Promise<RecallResult> {
885
928
  return recallById(input.id);
886
929
  }
887
930
 
888
- // Short-circuit vague queries — stopwords-only would match everything.
889
- if (ftsQuery(input.query) === EMPTY_QUERY) {
890
- return "Query too vague — try using specific keywords, file names, or technical terms.";
891
- }
892
-
893
931
  const fused = await searchRecall(input);
894
932
  const recallCfg = input.searchConfig?.recall;
895
933
  return formatFusedResults(fused, {
package/src/search.ts CHANGED
@@ -302,29 +302,31 @@ export function normalizeRank(
302
302
  /**
303
303
  * Reciprocal Rank Fusion: merge multiple ranked lists into a single ranked list.
304
304
  *
305
- * RRF score = Σ(1 / (k + rank_i)) for each list where the item appears.
305
+ * RRF score = Σ(weight / (k + rank_i)) for each list where the item appears.
306
306
  * k = 60 is standard (from Cormack et al., 2009; also used by QMD).
307
307
  *
308
308
  * RRF is rank-based, not score-based — raw score magnitude differences across
309
309
  * different FTS5 tables don't matter. Only relative ordering within each list.
310
310
  *
311
- * @param lists Each list provides items (in ranked order) and a key function
312
- * for deduplication. Items at the front of the array are rank 0.
311
+ * @param lists Each list provides items (in ranked order), a key function
312
+ * for deduplication, and an optional weight (default 1).
313
+ * Items at the front of the array are rank 0.
313
314
  * @param k Smoothing constant. Default 60.
314
315
  * @returns Fused list sorted by RRF score descending. When items appear
315
316
  * in multiple lists, the first occurrence's item is kept.
316
317
  */
317
318
  export function reciprocalRankFusion<T>(
318
- lists: Array<{ items: T[]; key: (item: T) => string }>,
319
+ lists: Array<{ items: T[]; key: (item: T) => string; weight?: number }>,
319
320
  k = 60,
320
321
  ): Array<{ item: T; score: number }> {
321
322
  const scores = new Map<string, { item: T; score: number }>();
322
323
 
323
324
  for (const list of lists) {
325
+ const w = list.weight ?? 1;
324
326
  for (let rank = 0; rank < list.items.length; rank++) {
325
327
  const item = list.items[rank];
326
328
  const id = list.key(item);
327
- const rrfScore = 1 / (k + rank);
329
+ const rrfScore = w / (k + rank);
328
330
  const existing = scores.get(id);
329
331
  if (existing) {
330
332
  existing.score += rrfScore;
package/src/temporal.ts CHANGED
@@ -171,6 +171,8 @@ export function markDistilled(ids: string[]) {
171
171
  .run(...ids);
172
172
  }
173
173
 
174
+ // Only searches undistilled messages — distilled content is already represented
175
+ // in distillation search results and would duplicate/dilute temporal hits.
174
176
  // LIKE-based fallback for when FTS5 fails unexpectedly.
175
177
  function searchLike(input: {
176
178
  pid: string;
@@ -186,8 +188,8 @@ function searchLike(input: {
186
188
  const conditions = terms.map(() => "LOWER(content) LIKE ?").join(" AND ");
187
189
  const likeParams = terms.map((t) => `%${t}%`);
188
190
  const query = input.sessionID
189
- ? `SELECT * FROM temporal_messages WHERE project_id = ? AND session_id = ? AND ${conditions} ORDER BY created_at DESC LIMIT ?`
190
- : `SELECT * FROM temporal_messages WHERE project_id = ? AND ${conditions} ORDER BY created_at DESC LIMIT ?`;
191
+ ? `SELECT * FROM temporal_messages WHERE project_id = ? AND session_id = ? AND distilled = 0 AND ${conditions} ORDER BY created_at DESC LIMIT ?`
192
+ : `SELECT * FROM temporal_messages WHERE project_id = ? AND distilled = 0 AND ${conditions} ORDER BY created_at DESC LIMIT ?`;
191
193
  const params = input.sessionID
192
194
  ? [input.pid, input.sessionID, ...likeParams, input.limit]
193
195
  : [input.pid, ...likeParams, input.limit];
@@ -208,11 +210,11 @@ export function search(input: {
208
210
  const ftsSQL = input.sessionID
209
211
  ? `SELECT m.* FROM temporal_fts f
210
212
  CROSS JOIN temporal_messages m ON m.rowid = f.rowid
211
- WHERE f.content MATCH ? AND m.project_id = ? AND m.session_id = ?
213
+ WHERE f.content MATCH ? AND m.project_id = ? AND m.session_id = ? AND m.distilled = 0
212
214
  ORDER BY rank LIMIT ?`
213
215
  : `SELECT m.* FROM temporal_fts f
214
216
  CROSS JOIN temporal_messages m ON m.rowid = f.rowid
215
- WHERE f.content MATCH ? AND m.project_id = ?
217
+ WHERE f.content MATCH ? AND m.project_id = ? AND m.distilled = 0
216
218
  ORDER BY rank LIMIT ?`;
217
219
 
218
220
  try {
@@ -251,11 +253,11 @@ export function searchScored(input: {
251
253
  const ftsSQL = input.sessionID
252
254
  ? `SELECT m.*, rank FROM temporal_fts f
253
255
  CROSS JOIN temporal_messages m ON m.rowid = f.rowid
254
- WHERE f.content MATCH ? AND m.project_id = ? AND m.session_id = ?
256
+ WHERE f.content MATCH ? AND m.project_id = ? AND m.session_id = ? AND m.distilled = 0
255
257
  ORDER BY rank LIMIT ?`
256
258
  : `SELECT m.*, rank FROM temporal_fts f
257
259
  CROSS JOIN temporal_messages m ON m.rowid = f.rowid
258
- WHERE f.content MATCH ? AND m.project_id = ?
260
+ WHERE f.content MATCH ? AND m.project_id = ? AND m.distilled = 0
259
261
  ORDER BY rank LIMIT ?`;
260
262
 
261
263
  try {
package/src/types.ts CHANGED
@@ -183,7 +183,7 @@ export type LoreMessageWithParts = {
183
183
  * Host adapters implement this:
184
184
  * - OpenCode: wraps `client.session.create()` + `client.session.prompt()`
185
185
  * - Pi: wraps `complete()` from `@mariozechner/pi-ai`
186
- * - Standalone: direct `fetch()` to provider APIs
186
+ * - Gateway: direct `fetch()` to provider APIs
187
187
  */
188
188
  export interface LLMClient {
189
189
  /**