@loreai/core 0.18.0 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. package/dist/bun/curator.d.ts.map +1 -1
  2. package/dist/bun/db.d.ts +73 -0
  3. package/dist/bun/db.d.ts.map +1 -1
  4. package/dist/bun/distillation.d.ts +2 -13
  5. package/dist/bun/distillation.d.ts.map +1 -1
  6. package/dist/bun/embedding.d.ts +5 -1
  7. package/dist/bun/embedding.d.ts.map +1 -1
  8. package/dist/bun/gradient.d.ts +9 -0
  9. package/dist/bun/gradient.d.ts.map +1 -1
  10. package/dist/bun/index.d.ts +2 -2
  11. package/dist/bun/index.d.ts.map +1 -1
  12. package/dist/bun/index.js +817 -99
  13. package/dist/bun/index.js.map +4 -4
  14. package/dist/bun/ltm.d.ts +99 -5
  15. package/dist/bun/ltm.d.ts.map +1 -1
  16. package/dist/bun/session-limiter.d.ts +26 -0
  17. package/dist/bun/session-limiter.d.ts.map +1 -0
  18. package/dist/bun/temporal.d.ts +2 -0
  19. package/dist/bun/temporal.d.ts.map +1 -1
  20. package/dist/node/curator.d.ts.map +1 -1
  21. package/dist/node/db.d.ts +73 -0
  22. package/dist/node/db.d.ts.map +1 -1
  23. package/dist/node/distillation.d.ts +2 -13
  24. package/dist/node/distillation.d.ts.map +1 -1
  25. package/dist/node/embedding.d.ts +5 -1
  26. package/dist/node/embedding.d.ts.map +1 -1
  27. package/dist/node/gradient.d.ts +9 -0
  28. package/dist/node/gradient.d.ts.map +1 -1
  29. package/dist/node/index.d.ts +2 -2
  30. package/dist/node/index.d.ts.map +1 -1
  31. package/dist/node/index.js +817 -99
  32. package/dist/node/index.js.map +4 -4
  33. package/dist/node/ltm.d.ts +99 -5
  34. package/dist/node/ltm.d.ts.map +1 -1
  35. package/dist/node/session-limiter.d.ts +26 -0
  36. package/dist/node/session-limiter.d.ts.map +1 -0
  37. package/dist/node/temporal.d.ts +2 -0
  38. package/dist/node/temporal.d.ts.map +1 -1
  39. package/dist/types/curator.d.ts.map +1 -1
  40. package/dist/types/db.d.ts +73 -0
  41. package/dist/types/db.d.ts.map +1 -1
  42. package/dist/types/distillation.d.ts +2 -13
  43. package/dist/types/distillation.d.ts.map +1 -1
  44. package/dist/types/embedding.d.ts +5 -1
  45. package/dist/types/embedding.d.ts.map +1 -1
  46. package/dist/types/gradient.d.ts +9 -0
  47. package/dist/types/gradient.d.ts.map +1 -1
  48. package/dist/types/index.d.ts +2 -2
  49. package/dist/types/index.d.ts.map +1 -1
  50. package/dist/types/ltm.d.ts +99 -5
  51. package/dist/types/ltm.d.ts.map +1 -1
  52. package/dist/types/session-limiter.d.ts +26 -0
  53. package/dist/types/session-limiter.d.ts.map +1 -0
  54. package/dist/types/temporal.d.ts +2 -0
  55. package/dist/types/temporal.d.ts.map +1 -1
  56. package/package.json +2 -1
  57. package/src/curator.ts +54 -2
  58. package/src/db.ts +347 -0
  59. package/src/distillation.ts +55 -14
  60. package/src/embedding.ts +28 -3
  61. package/src/gradient.ts +183 -74
  62. package/src/index.ts +8 -0
  63. package/src/ltm.ts +480 -45
  64. package/src/session-limiter.ts +47 -0
  65. package/src/temporal.ts +10 -0
package/src/ltm.ts CHANGED
@@ -1,5 +1,5 @@
1
1
  import { uuidv7 } from "uuidv7";
2
- import { db, ensureProject } from "./db";
2
+ import { db, ensureProject, getKV, setKV } from "./db";
3
3
  import { config } from "./config";
4
4
  import { ftsQuery, ftsQueryOr, EMPTY_QUERY, extractTopTerms, filterTerms, runRelaxedSearch } from "./search";
5
5
  import * as embedding from "./embedding";
@@ -355,6 +355,26 @@ function scoreEntriesFTS(sessionContext: string): Map<string, number> {
355
355
  }
356
356
  }
357
357
 
358
+ /**
359
+ * Well-known knowledge entry categories managed by the curator.
360
+ * The DB column is a free-form string, but these are the standard values.
361
+ */
362
+ export type KnowledgeCategory = "decision" | "pattern" | "preference" | "architecture" | "gotcha";
363
+
364
+ /** Options for `forSession()` to control entry selection. */
365
+ export type ForSessionOptions = {
366
+ /** Caller-provided context (e.g., user's current message) for relevance
367
+ * scoring when no session context exists in the DB yet. */
368
+ contextHint?: string;
369
+ /** Restrict to these categories (e.g., `['preference']` for turn 1). */
370
+ categories?: (KnowledgeCategory | (string & {}))[];
371
+ /** Exclude these categories (e.g., `['preference']` for context-bound
372
+ * entries when preferences are already injected in a separate block).
373
+ * Mutually exclusive with `categories` — if both are provided,
374
+ * `categories` (include) wins. */
375
+ excludeCategories?: (KnowledgeCategory | (string & {}))[];
376
+ };
377
+
358
378
  /**
359
379
  * Build a relevance-ranked, budget-capped list of knowledge entries for injection
360
380
  * into the system prompt of a live session.
@@ -362,43 +382,61 @@ function scoreEntriesFTS(sessionContext: string): Map<string, number> {
362
382
  * Strategy:
363
383
  * 1. Both project-specific and cross-project entries are scored for relevance
364
384
  * against recent session context (last distillation + recent raw messages).
365
- * 2. Project entries get a safety net: the top PROJECT_SAFETY_NET entries by
385
+ * 2. When embeddings are available, vector cosine similarity is used for scoring
386
+ * (captures semantic matches that keyword overlap misses). Falls back to
387
+ * FTS5 BM25 when embeddings are unavailable.
388
+ * 3. Project entries get a safety net: the top PROJECT_SAFETY_NET entries by
366
389
  * confidence are always included even if they have zero relevance score.
367
390
  * This ensures the most important project knowledge is never lost to
368
- * coarse term-overlap scoring.
369
- * 3. All scored entries are merged into a single pool and greedily packed
391
+ * coarse scoring.
392
+ * 4. All scored entries are merged into a single pool and greedily packed
370
393
  * into the token budget by score descending.
371
- * 4. If there's no session context yet (first turn), fall back to top entries
394
+ * 5. If there's no session context yet (first turn), fall back to top entries
372
395
  * by confidence only (capped at NO_CONTEXT_FALLBACK_CAP per pool).
373
396
  *
374
397
  * @param projectPath Current project path
375
398
  * @param sessionID Current session ID (for context extraction)
376
399
  * @param maxTokens Hard token budget for the entire formatted block
400
+ * @param options Optional category filter and context hint
377
401
  */
378
- export function forSession(
402
+ export async function forSession(
379
403
  projectPath: string,
380
404
  sessionID: string | undefined,
381
405
  maxTokens: number,
382
- ): KnowledgeEntry[] {
406
+ options?: ForSessionOptions,
407
+ ): Promise<KnowledgeEntry[]> {
383
408
  const pid = ensureProject(projectPath);
409
+ const categoryFilter = options?.categories;
410
+ const excludeFilter = options?.excludeCategories;
411
+
412
+ // Build optional SQL category clauses (include / exclude are mutually exclusive)
413
+ let categoryClause = "";
414
+ let categoryParams: string[] = [];
415
+ if (categoryFilter?.length) {
416
+ categoryClause = ` AND category IN (${categoryFilter.map(() => "?").join(",")})`;
417
+ categoryParams = categoryFilter;
418
+ } else if (excludeFilter?.length) {
419
+ categoryClause = ` AND category NOT IN (${excludeFilter.map(() => "?").join(",")})`;
420
+ categoryParams = excludeFilter;
421
+ }
384
422
 
385
423
  // --- 1. Load project-specific entries ---
386
424
  const projectEntries = db()
387
425
  .query(
388
426
  `SELECT ${KNOWLEDGE_COLS} FROM knowledge
389
- WHERE project_id = ? AND cross_project = 0 AND confidence > 0.2
427
+ WHERE project_id = ? AND cross_project = 0 AND confidence > 0.2${categoryClause}
390
428
  ORDER BY confidence DESC, updated_at DESC`,
391
429
  )
392
- .all(pid) as KnowledgeEntry[];
430
+ .all(pid, ...categoryParams) as KnowledgeEntry[];
393
431
 
394
432
  // --- 2. Load cross-project candidates ---
395
433
  const crossEntries = db()
396
434
  .query(
397
435
  `SELECT ${KNOWLEDGE_COLS} FROM knowledge
398
- WHERE (project_id IS NULL OR cross_project = 1) AND confidence > 0.2
436
+ WHERE (project_id IS NULL OR cross_project = 1) AND confidence > 0.2${categoryClause}
399
437
  ORDER BY confidence DESC, updated_at DESC`,
400
438
  )
401
- .all() as KnowledgeEntry[];
439
+ .all(...categoryParams) as KnowledgeEntry[];
402
440
 
403
441
  if (!crossEntries.length && !projectEntries.length) return [];
404
442
 
@@ -427,38 +465,82 @@ export function forSession(
427
465
  }
428
466
  }
429
467
 
468
+ // Fall back to caller-provided context hint (e.g., user's first message)
469
+ if (!sessionContext.trim() && options?.contextHint) {
470
+ sessionContext = options.contextHint;
471
+ }
472
+
430
473
  // --- 4. Score both pools by relevance ---
431
474
  let scoredProject: Scored[];
432
475
  let scoredCross: Scored[];
433
476
 
434
- if (sessionContext.trim().length > 20) {
435
- // Use FTS5 BM25 to score all knowledge entries against session context
436
- const ftsScores = scoreEntriesFTS(sessionContext);
437
-
438
- // Score project entries: FTS relevance × confidence, with safety net
439
- const rawScored: Scored[] = projectEntries.map((entry) => ({
440
- entry,
441
- score: (ftsScores.get(entry.id) ?? 0) * entry.confidence,
442
- }));
443
- const matched = rawScored.filter((s) => s.score > 0);
444
- const matchedIds = new Set(matched.map((s) => s.entry.id));
445
-
446
- // Safety net: top PROJECT_SAFETY_NET entries by confidence that weren't already matched.
447
- // Given a tiny score (0.001 * confidence) so they sort below genuinely matched entries.
448
- const safetyNet = projectEntries
449
- .filter((e) => !matchedIds.has(e.id))
450
- .slice(0, PROJECT_SAFETY_NET)
451
- .map((e) => ({ entry: e, score: 0.001 * e.confidence }));
452
-
453
- scoredProject = [...matched, ...safetyNet];
477
+ if (sessionContext.trim().length > 20 && embedding.isAvailable()) {
478
+ // Vector scoring: embed session context, score entries by cosine similarity.
479
+ // Captures semantic matches (e.g., "OpenAI Batch API" ↔ "batch queue worker")
480
+ // that keyword-based FTS5 misses.
481
+ let vectorScores: Map<string, number>;
482
+ try {
483
+ const [contextVec] = await embedding.embed([sessionContext], "query");
484
+ const hits = embedding.vectorSearch(contextVec, 50, excludeFilter);
485
+ vectorScores = new Map(hits.map((h) => [h.id, h.similarity]));
486
+ } catch (err) {
487
+ log.warn("Vector scoring failed, falling back to FTS5:", err);
488
+ vectorScores = new Map();
489
+ }
454
490
 
455
- // Score cross-project entries — only include entries with FTS match
456
- scoredCross = crossEntries
457
- .filter((e) => ftsScores.has(e.id))
458
- .map((e) => ({
459
- entry: e,
460
- score: (ftsScores.get(e.id) ?? 0) * e.confidence,
461
- }));
491
+ if (vectorScores.size > 0) {
492
+ // Hybrid scoring: vector search only covers entries with stored embeddings.
493
+ // Entries without embeddings (e.g. newly created, async embed not yet done)
494
+ // fall back to FTS5 so they aren't invisible to scoring.
495
+ const ftsScores = scoreEntriesFTS(sessionContext);
496
+
497
+ // Score project entries: prefer vector similarity, fall back to FTS5
498
+ const rawScored: Scored[] = projectEntries.map((entry) => {
499
+ const vecScore = vectorScores.get(entry.id);
500
+ const score = vecScore != null
501
+ ? vecScore * entry.confidence
502
+ : (ftsScores.get(entry.id) ?? 0) * entry.confidence;
503
+ return { entry, score };
504
+ });
505
+ const matched = rawScored.filter((s) => s.score > 0);
506
+ const matchedIds = new Set(matched.map((s) => s.entry.id));
507
+
508
+ // Safety net: top PROJECT_SAFETY_NET entries by confidence that weren't already matched.
509
+ // Given a tiny score (0.001 * confidence) so they sort below genuinely matched entries.
510
+ const safetyNet = projectEntries
511
+ .filter((e) => !matchedIds.has(e.id))
512
+ .slice(0, PROJECT_SAFETY_NET)
513
+ .map((e) => ({ entry: e, score: 0.001 * e.confidence }));
514
+
515
+ scoredProject = [...matched, ...safetyNet];
516
+
517
+ // Cross-project: include entries matched by vector OR FTS5
518
+ scoredCross = crossEntries
519
+ .filter((e) => vectorScores.has(e.id) || ftsScores.has(e.id))
520
+ .map((e) => {
521
+ const vecScore = vectorScores.get(e.id);
522
+ const score = vecScore != null
523
+ ? vecScore * e.confidence
524
+ : (ftsScores.get(e.id) ?? 0) * e.confidence;
525
+ return { entry: e, score };
526
+ });
527
+ } else {
528
+ // Vector failed — fall through to FTS5
529
+ const ftsScores = scoreEntriesFTS(sessionContext);
530
+ ({ scoredProject, scoredCross } = scoreFTS(
531
+ projectEntries,
532
+ crossEntries,
533
+ ftsScores,
534
+ ));
535
+ }
536
+ } else if (sessionContext.trim().length > 20) {
537
+ // Embeddings unavailable — use FTS5 BM25 as fallback
538
+ const ftsScores = scoreEntriesFTS(sessionContext);
539
+ ({ scoredProject, scoredCross } = scoreFTS(
540
+ projectEntries,
541
+ crossEntries,
542
+ ftsScores,
543
+ ));
462
544
  } else {
463
545
  // No session context — fall back to top entries by confidence, capped
464
546
  scoredProject = projectEntries
@@ -520,6 +602,36 @@ export function forSession(
520
602
  return result;
521
603
  }
522
604
 
605
+ /** Score entries using FTS5 BM25 — extracted for reuse in the vector-fallback path. */
606
+ function scoreFTS(
607
+ projectEntries: KnowledgeEntry[],
608
+ crossEntries: KnowledgeEntry[],
609
+ ftsScores: Map<string, number>,
610
+ ): { scoredProject: Scored[]; scoredCross: Scored[] } {
611
+ const rawScored: Scored[] = projectEntries.map((entry) => ({
612
+ entry,
613
+ score: (ftsScores.get(entry.id) ?? 0) * entry.confidence,
614
+ }));
615
+ const matched = rawScored.filter((s) => s.score > 0);
616
+ const matchedIds = new Set(matched.map((s) => s.entry.id));
617
+
618
+ const safetyNet = projectEntries
619
+ .filter((e) => !matchedIds.has(e.id))
620
+ .slice(0, PROJECT_SAFETY_NET)
621
+ .map((e) => ({ entry: e, score: 0.001 * e.confidence }));
622
+
623
+ const scoredProject = [...matched, ...safetyNet];
624
+
625
+ const scoredCross = crossEntries
626
+ .filter((e) => ftsScores.has(e.id))
627
+ .map((e) => ({
628
+ entry: e,
629
+ score: (ftsScores.get(e.id) ?? 0) * e.confidence,
630
+ }));
631
+
632
+ return { scoredProject, scoredCross };
633
+ }
634
+
523
635
  export function all(): KnowledgeEntry[] {
524
636
  return db()
525
637
  .query(
@@ -963,9 +1075,18 @@ export type DedupCluster = {
963
1075
  merged: Array<{ id: string; title: string }>;
964
1076
  };
965
1077
 
1078
+ /** Stable pair key for two entry IDs — sorted to ensure order-independence. */
1079
+ export function dedupPairKey(idA: string, idB: string): string {
1080
+ return idA < idB ? `${idA}:${idB}` : `${idB}:${idA}`;
1081
+ }
1082
+
966
1083
  export type DedupResult = {
967
1084
  clusters: DedupCluster[];
968
1085
  totalRemoved: number;
1086
+ /** Pairwise embedding cosine similarities. Key: dedupPairKey(idA, idB). */
1087
+ pairSimilarities: Map<string, number>;
1088
+ /** All entry titles by ID — for feedback recording after entries are deleted. */
1089
+ entryTitles: Map<string, string>;
969
1090
  };
970
1091
 
971
1092
  /**
@@ -992,13 +1113,17 @@ export type DedupResult = {
992
1113
  * @returns Cluster report and count of removed entries
993
1114
  */
994
1115
  /** Core dedup logic — operates on an arbitrary list of entries. */
995
- function _dedup(entries: KnowledgeEntry[], dryRun: boolean): DedupResult {
996
- if (entries.length < 2) return { clusters: [], totalRemoved: 0 };
1116
+ function _dedup(
1117
+ entries: KnowledgeEntry[],
1118
+ dryRun: boolean,
1119
+ embeddingThreshold: number = EMBEDDING_DEDUP_THRESHOLD,
1120
+ ): DedupResult {
1121
+ if (entries.length < 2) return { clusters: [], totalRemoved: 0, pairSimilarities: new Map(), entryTitles: new Map() };
997
1122
 
998
1123
  // --- Build neighbor map using title overlap + embedding similarity ---
999
1124
  // Two entries are considered neighbors (potential duplicates) if EITHER:
1000
1125
  // (a) title word-overlap ≥ 0.7 with ≥ 4 shared words, OR
1001
- // (b) embedding cosine similarity ≥ 0.935
1126
+ // (b) embedding cosine similarity ≥ embeddingThreshold (default 0.935)
1002
1127
  // Star clustering (no transitivity) prevents snowball merging.
1003
1128
  // O(n²) pairwise comparison — acceptable for n ≤ 25 (maxEntries cap).
1004
1129
 
@@ -1026,6 +1151,8 @@ function _dedup(entries: KnowledgeEntry[], dryRun: boolean): DedupResult {
1026
1151
  // Pre-compute neighbors for all pairs
1027
1152
  type DedupHit = { id: string; score: number };
1028
1153
  const neighborMap = new Map<string, DedupHit[]>();
1154
+ // Collect all pairwise embedding similarities (for feedback/calibration).
1155
+ const pairSimilarities = new Map<string, number>();
1029
1156
 
1030
1157
  for (const entry of entries) {
1031
1158
  const neighbors: DedupHit[] = [];
@@ -1045,7 +1172,15 @@ function _dedup(entries: KnowledgeEntry[], dryRun: boolean): DedupResult {
1045
1172
  const otherVec = embeddingMap.get(other.id);
1046
1173
  if (otherVec && entryVec.length === otherVec.length) {
1047
1174
  similarity = embedding.cosineSimilarity(entryVec, otherVec);
1048
- embeddingMatch = similarity >= EMBEDDING_DEDUP_THRESHOLD;
1175
+ embeddingMatch = similarity >= embeddingThreshold;
1176
+ }
1177
+ }
1178
+
1179
+ // Track all pairwise embedding similarities for calibration signals
1180
+ if (similarity > 0) {
1181
+ const pk = dedupPairKey(entry.id, other.id);
1182
+ if (!pairSimilarities.has(pk)) {
1183
+ pairSimilarities.set(pk, similarity);
1049
1184
  }
1050
1185
  }
1051
1186
 
@@ -1120,21 +1255,27 @@ function _dedup(entries: KnowledgeEntry[], dryRun: boolean): DedupResult {
1120
1255
  // Sort clusters by size descending for readability
1121
1256
  result.sort((a, b) => b.merged.length - a.merged.length);
1122
1257
 
1123
- return { clusters: result, totalRemoved };
1258
+ // Build title map from all input entries — survives entry deletion.
1259
+ const entryTitles = new Map(entries.map((e) => [e.id, e.title]));
1260
+
1261
+ return { clusters: result, totalRemoved, pairSimilarities, entryTitles };
1124
1262
  }
1125
1263
 
1126
1264
  export async function deduplicate(
1127
1265
  projectPath: string,
1128
1266
  opts?: { dryRun?: boolean },
1129
1267
  ): Promise<DedupResult> {
1268
+ const pid = ensureProject(projectPath);
1269
+ const threshold = loadCalibratedThreshold(pid) ?? EMBEDDING_DEDUP_THRESHOLD;
1130
1270
  const entries = forProject(projectPath, false);
1131
- return _dedup(entries, opts?.dryRun ?? true);
1271
+ return _dedup(entries, opts?.dryRun ?? true, threshold);
1132
1272
  }
1133
1273
 
1134
1274
  /** Deduplicate global (cross-project) entries that have no project_id. */
1135
1275
  export async function deduplicateGlobal(
1136
1276
  opts?: { dryRun?: boolean },
1137
1277
  ): Promise<DedupResult> {
1278
+ const threshold = loadCalibratedThreshold(null) ?? EMBEDDING_DEDUP_THRESHOLD;
1138
1279
  const entries = db()
1139
1280
  .query(
1140
1281
  `SELECT ${KNOWLEDGE_COLS} FROM knowledge
@@ -1143,5 +1284,299 @@ export async function deduplicateGlobal(
1143
1284
  ORDER BY confidence DESC, updated_at DESC`,
1144
1285
  )
1145
1286
  .all() as KnowledgeEntry[];
1146
- return _dedup(entries, opts?.dryRun ?? true);
1287
+ return _dedup(entries, opts?.dryRun ?? true, threshold);
1288
+ }
1289
+
1290
+ // ---------------------------------------------------------------------------
1291
+ // Dedup feedback & adaptive threshold calibration
1292
+ // ---------------------------------------------------------------------------
1293
+
1294
+ export type DedupFeedbackSource = "auto_dedup" | "cli_yes" | "cli_interactive";
1295
+
1296
+ const MIN_CALIBRATION_SAMPLES = 20;
1297
+ const DEFAULT_EMBEDDING_DEDUP_THRESHOLD = EMBEDDING_DEDUP_THRESHOLD;
1298
+ /** Only record auto-signals for pairs with similarity >= this floor. */
1299
+ const AUTO_SIGNAL_MIN_SIMILARITY = 0.80;
1300
+ /** Max auto-signal pairs to record per dedup run (closest to threshold). */
1301
+ const AUTO_SIGNAL_MAX_PAIRS = 50;
1302
+
1303
+ /** Record a single dedup feedback row. */
1304
+ export function recordDedupFeedback(input: {
1305
+ projectId: string | null;
1306
+ entryATitle: string;
1307
+ entryBTitle: string;
1308
+ similarity: number;
1309
+ accepted: boolean;
1310
+ source: DedupFeedbackSource;
1311
+ }): void {
1312
+ db()
1313
+ .query(
1314
+ `INSERT INTO dedup_feedback
1315
+ (project_id, entry_a_title, entry_b_title, similarity, accepted, source, created_at)
1316
+ VALUES (?, ?, ?, ?, ?, ?, ?)`,
1317
+ )
1318
+ .run(
1319
+ input.projectId,
1320
+ input.entryATitle,
1321
+ input.entryBTitle,
1322
+ input.similarity,
1323
+ input.accepted ? 1 : 0,
1324
+ input.source,
1325
+ Date.now(),
1326
+ );
1327
+ }
1328
+
1329
+ /**
1330
+ * Bulk-record feedback for all merged pairs in a DedupResult.
1331
+ * Only records pairs with embedding similarity > 0 (title-overlap-only
1332
+ * matches are excluded from calibration).
1333
+ */
1334
+ export function recordDedupResultFeedback(
1335
+ projectId: string | null,
1336
+ result: DedupResult,
1337
+ accepted: boolean,
1338
+ source: DedupFeedbackSource,
1339
+ ): void {
1340
+ for (const cluster of result.clusters) {
1341
+ for (const merged of cluster.merged) {
1342
+ const pk = dedupPairKey(cluster.surviving.id, merged.id);
1343
+ const similarity = result.pairSimilarities.get(pk);
1344
+ if (similarity != null && similarity > 0) {
1345
+ recordDedupFeedback({
1346
+ projectId,
1347
+ entryATitle: cluster.surviving.title,
1348
+ entryBTitle: merged.title,
1349
+ similarity,
1350
+ accepted,
1351
+ source,
1352
+ });
1353
+ }
1354
+ }
1355
+ }
1356
+ }
1357
+
1358
+ /**
1359
+ * Record automatic calibration signals from a post-curation dedup sweep.
1360
+ *
1361
+ * Only records **reject** signals — non-merged pairs with similarity in
1362
+ * [0.80, threshold). Accept signals from auto-dedup are tautological (the
1363
+ * pair was merged *because* its similarity exceeded the threshold), so they
1364
+ * provide no new information and would create a self-reinforcing feedback
1365
+ * loop. Manual signals (cli_yes, cli_interactive) provide the accept side.
1366
+ *
1367
+ * Caps at AUTO_SIGNAL_MAX_PAIRS most interesting pairs per run (closest
1368
+ * to the threshold boundary) to avoid table bloat.
1369
+ */
1370
+ export function recordAutoSignals(
1371
+ projectId: string | null,
1372
+ result: DedupResult,
1373
+ ): void {
1374
+ // Collect merged pair IDs for quick lookup (to exclude from reject signals)
1375
+ const mergedPairs = new Set<string>();
1376
+ for (const cluster of result.clusters) {
1377
+ for (const merged of cluster.merged) {
1378
+ mergedPairs.add(dedupPairKey(cluster.surviving.id, merged.id));
1379
+ }
1380
+ }
1381
+
1382
+ // Build a title map — we need titles for reject signals (non-merged pairs).
1383
+ // Use entryTitles from result first, then fall back to cluster data.
1384
+ const titleMap = new Map<string, string>(result.entryTitles);
1385
+ for (const cluster of result.clusters) {
1386
+ if (!titleMap.has(cluster.surviving.id)) {
1387
+ titleMap.set(cluster.surviving.id, cluster.surviving.title);
1388
+ }
1389
+ for (const m of cluster.merged) {
1390
+ if (!titleMap.has(m.id)) titleMap.set(m.id, m.title);
1391
+ }
1392
+ }
1393
+
1394
+ // Collect reject signals: non-merged pairs with high similarity
1395
+ type Signal = { entryATitle: string; entryBTitle: string; similarity: number };
1396
+ const signals: Signal[] = [];
1397
+
1398
+ for (const [pk, sim] of result.pairSimilarities) {
1399
+ if (sim < AUTO_SIGNAL_MIN_SIMILARITY) continue;
1400
+ if (mergedPairs.has(pk)) continue; // merged pair — skip (tautological accept)
1401
+
1402
+ const [idA, idB] = pk.split(":");
1403
+ const titleA = titleMap.get(idA);
1404
+ const titleB = titleMap.get(idB);
1405
+ if (!titleA || !titleB) continue;
1406
+
1407
+ signals.push({ entryATitle: titleA, entryBTitle: titleB, similarity: sim });
1408
+ }
1409
+
1410
+ // Sort by distance to threshold boundary (most informative first), cap
1411
+ const currentThreshold = loadCalibratedThreshold(projectId) ?? DEFAULT_EMBEDDING_DEDUP_THRESHOLD;
1412
+ signals.sort((a, b) => Math.abs(a.similarity - currentThreshold) - Math.abs(b.similarity - currentThreshold));
1413
+ const capped = signals.slice(0, AUTO_SIGNAL_MAX_PAIRS);
1414
+
1415
+ // Prune old feedback to prevent unbounded table growth
1416
+ pruneDedupFeedback(projectId);
1417
+
1418
+ for (const s of capped) {
1419
+ recordDedupFeedback({
1420
+ projectId,
1421
+ entryATitle: s.entryATitle,
1422
+ entryBTitle: s.entryBTitle,
1423
+ similarity: s.similarity,
1424
+ accepted: false,
1425
+ source: "auto_dedup",
1426
+ });
1427
+ }
1428
+ }
1429
+
1430
+ /** Get all feedback for a project (for calibration). */
1431
+ export function getDedupFeedback(
1432
+ projectId: string | null,
1433
+ ): Array<{ similarity: number; accepted: boolean; source: string }> {
1434
+ const rows = (
1435
+ projectId !== null
1436
+ ? db()
1437
+ .query(
1438
+ "SELECT similarity, accepted, source FROM dedup_feedback WHERE project_id = ? ORDER BY similarity",
1439
+ )
1440
+ .all(projectId)
1441
+ : db()
1442
+ .query(
1443
+ "SELECT similarity, accepted, source FROM dedup_feedback WHERE project_id IS NULL ORDER BY similarity",
1444
+ )
1445
+ .all()
1446
+ ) as Array<{ similarity: number; accepted: number; source: string }>;
1447
+ return rows.map((r) => ({ similarity: r.similarity, accepted: r.accepted === 1, source: r.source }));
1448
+ }
1449
+
1450
+ /** Quick count of feedback rows for a project. */
1451
+ export function getDedupFeedbackCount(projectId: string | null): number {
1452
+ const row = (
1453
+ projectId !== null
1454
+ ? db()
1455
+ .query("SELECT COUNT(*) as cnt FROM dedup_feedback WHERE project_id = ?")
1456
+ .get(projectId)
1457
+ : db()
1458
+ .query("SELECT COUNT(*) as cnt FROM dedup_feedback WHERE project_id IS NULL")
1459
+ .get()
1460
+ ) as { cnt: number } | null;
1461
+ return row?.cnt ?? 0;
1462
+ }
1463
+
1464
+ /** Max feedback rows to keep per project (prevents unbounded growth). */
1465
+ const MAX_FEEDBACK_ROWS_PER_PROJECT = 500;
1466
+
1467
+ /**
1468
+ * Prune old feedback rows for a project, keeping the most recent
1469
+ * MAX_FEEDBACK_ROWS_PER_PROJECT rows. Called from recordAutoSignals
1470
+ * to prevent unbounded table growth.
1471
+ */
1472
+ export function pruneDedupFeedback(projectId: string | null): void {
1473
+ const count = getDedupFeedbackCount(projectId);
1474
+ if (count <= MAX_FEEDBACK_ROWS_PER_PROJECT) return;
1475
+
1476
+ const excess = count - MAX_FEEDBACK_ROWS_PER_PROJECT;
1477
+ if (projectId !== null) {
1478
+ db()
1479
+ .query(
1480
+ `DELETE FROM dedup_feedback WHERE id IN (
1481
+ SELECT id FROM dedup_feedback WHERE project_id = ?
1482
+ ORDER BY created_at ASC LIMIT ?
1483
+ )`,
1484
+ )
1485
+ .run(projectId, excess);
1486
+ } else {
1487
+ db()
1488
+ .query(
1489
+ `DELETE FROM dedup_feedback WHERE id IN (
1490
+ SELECT id FROM dedup_feedback WHERE project_id IS NULL
1491
+ ORDER BY created_at ASC LIMIT ?
1492
+ )`,
1493
+ )
1494
+ .run(excess);
1495
+ }
1496
+ }
1497
+
1498
+ /**
1499
+ * Compute an optimal embedding dedup threshold from user feedback.
1500
+ *
1501
+ * Algorithm:
1502
+ * 1. Load all (similarity, accepted) pairs for the project.
1503
+ * 2. If fewer than MIN_CALIBRATION_SAMPLES, return null (use default).
1504
+ * 3. If all feedback is "accept" (no rejects), return the minimum
1505
+ * accepted similarity minus a small margin (0.005).
1506
+ * 4. If all feedback is "reject" (no accepts), return null.
1507
+ * 5. Otherwise, find the threshold that maximizes separation:
1508
+ * - For each candidate threshold (midpoint between consecutive
1509
+ * distinct similarity values), compute accuracy:
1510
+ * correct = accepted_pairs_above + rejected_pairs_below
1511
+ * accuracy = correct / total
1512
+ * - Pick the threshold with highest accuracy.
1513
+ * - Tie-break: prefer higher threshold (conservative).
1514
+ * - Clamp to [0.85, 0.98].
1515
+ */
1516
+ export function calibrateDedupThreshold(projectId: string | null): number | null {
1517
+ const feedback = getDedupFeedback(projectId);
1518
+ if (feedback.length < MIN_CALIBRATION_SAMPLES) return null;
1519
+
1520
+ const accepted = feedback.filter((f) => f.accepted);
1521
+ const rejected = feedback.filter((f) => !f.accepted);
1522
+
1523
+ // Edge case: all accept, no rejects
1524
+ if (rejected.length === 0) {
1525
+ const minAccepted = Math.min(...accepted.map((f) => f.similarity));
1526
+ return Math.max(0.85, minAccepted - 0.005);
1527
+ }
1528
+
1529
+ // Edge case: all reject, no accepts
1530
+ if (accepted.length === 0) {
1531
+ log.warn("dedup calibration: all feedback is reject — keeping default threshold");
1532
+ return null;
1533
+ }
1534
+
1535
+ // Find optimal threshold via accuracy maximization
1536
+ const allSims = [...new Set(feedback.map((f) => f.similarity))].sort((a, b) => a - b);
1537
+
1538
+ let bestThreshold = DEFAULT_EMBEDDING_DEDUP_THRESHOLD;
1539
+ let bestAccuracy = -1;
1540
+
1541
+ for (let i = 0; i < allSims.length - 1; i++) {
1542
+ const candidate = (allSims[i] + allSims[i + 1]) / 2;
1543
+
1544
+ // Pairs above threshold are predicted "merge" — should be accepted
1545
+ // Pairs below threshold are predicted "keep separate" — should be rejected
1546
+ const correctAccepted = accepted.filter((f) => f.similarity >= candidate).length;
1547
+ const correctRejected = rejected.filter((f) => f.similarity < candidate).length;
1548
+ const accuracy = (correctAccepted + correctRejected) / feedback.length;
1549
+
1550
+ // Tie-break: prefer higher threshold (conservative — fewer false merges)
1551
+ if (accuracy > bestAccuracy || (accuracy === bestAccuracy && candidate > bestThreshold)) {
1552
+ bestAccuracy = accuracy;
1553
+ bestThreshold = candidate;
1554
+ }
1555
+ }
1556
+
1557
+ // Clamp to sane range
1558
+ return Math.max(0.85, Math.min(0.98, bestThreshold));
1559
+ }
1560
+
1561
+ /** Persist the calibrated threshold for a project. */
1562
+ export function saveCalibratedThreshold(
1563
+ projectId: string | null,
1564
+ threshold: number,
1565
+ sampleSize: number,
1566
+ ): void {
1567
+ const key = `dedup_threshold:${projectId ?? "global"}`;
1568
+ setKV(key, JSON.stringify({ threshold, sampleSize, calibratedAt: Date.now() }));
1569
+ }
1570
+
1571
+ /** Load the calibrated threshold for a project, or null if not calibrated. */
1572
+ export function loadCalibratedThreshold(projectId: string | null): number | null {
1573
+ const key = `dedup_threshold:${projectId ?? "global"}`;
1574
+ const raw = getKV(key);
1575
+ if (!raw) return null;
1576
+ try {
1577
+ const parsed = JSON.parse(raw);
1578
+ return typeof parsed.threshold === "number" ? parsed.threshold : null;
1579
+ } catch {
1580
+ return null;
1581
+ }
1147
1582
  }
@@ -0,0 +1,47 @@
1
+ /**
2
+ * Per-key concurrency limiter using p-limit.
3
+ *
4
+ * Each key (typically a session ID) gets its own p-limit(1) instance,
5
+ * serializing async operations on the same key while allowing different
6
+ * keys to run fully in parallel.
7
+ *
8
+ * Two independent limiter pools are provided — one for distillation and
9
+ * one for curation — so they don't block each other.
10
+ */
11
+
12
+ import pLimit from "p-limit";
13
+
14
+ type LimitFunction = ReturnType<typeof pLimit>;
15
+
16
+ function createLimiterPool() {
17
+ const limiters = new Map<string, LimitFunction>();
18
+
19
+ /** Get or create a p-limit(1) limiter for the given key. */
20
+ function get(key: string): LimitFunction {
21
+ let limiter = limiters.get(key);
22
+ if (!limiter) {
23
+ limiter = pLimit(1);
24
+ limiters.set(key, limiter);
25
+ }
26
+ return limiter;
27
+ }
28
+
29
+ /** Check if a limiter for `key` is currently busy (active or pending work). */
30
+ function isBusy(key: string): boolean {
31
+ const limiter = limiters.get(key);
32
+ return limiter ? limiter.activeCount + limiter.pendingCount > 0 : false;
33
+ }
34
+
35
+ /** Clear all limiters (for test cleanup). */
36
+ function clear(): void {
37
+ limiters.clear();
38
+ }
39
+
40
+ return { get, isBusy, clear };
41
+ }
42
+
43
+ /** Serializes distillation.run() and metaDistill() per session. */
44
+ export const distillLimiter = createLimiterPool();
45
+
46
+ /** Serializes curator.run() per session with skip-if-busy semantics. */
47
+ export const curatorLimiter = createLimiterPool();