@ijfw/memory-server 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106) hide show
  1. package/bin/ijfw +27 -0
  2. package/bin/ijfw-dashboard +180 -0
  3. package/bin/ijfw-dispatch-plan +41 -0
  4. package/bin/ijfw-memorize +273 -0
  5. package/bin/ijfw-memory +51 -0
  6. package/fixtures/demo-target.js +28 -0
  7. package/package.json +53 -0
  8. package/src/api-client.js +190 -0
  9. package/src/audit-roster.js +315 -0
  10. package/src/caps.js +37 -0
  11. package/src/cold-scan-runner.mjs +37 -0
  12. package/src/compute/edges.js +155 -0
  13. package/src/compute/extract.js +560 -0
  14. package/src/compute/fts5.js +420 -0
  15. package/src/compute/graph-auto-index.js +191 -0
  16. package/src/compute/graph-lock.js +114 -0
  17. package/src/compute/index.js +18 -0
  18. package/src/compute/migration-runner.js +116 -0
  19. package/src/compute/migrations/001-initial.js +23 -0
  20. package/src/compute/migrations/002-porter-stemming-source.js +139 -0
  21. package/src/compute/migrations/003-tier-semantic.js +69 -0
  22. package/src/compute/migrations/004-kg-tables.js +83 -0
  23. package/src/compute/migrations/005-stale-candidate.js +72 -0
  24. package/src/compute/python-resolver.js +106 -0
  25. package/src/compute/runner-vm.js +185 -0
  26. package/src/compute/runner.js +416 -0
  27. package/src/compute/sandbox-detect.js +122 -0
  28. package/src/compute/sandbox-linux.js +164 -0
  29. package/src/compute/sandbox-macos.js +167 -0
  30. package/src/compute/sandbox-windows.js +63 -0
  31. package/src/compute/schema.sql +118 -0
  32. package/src/compute/staleness.js +239 -0
  33. package/src/compute/synonyms.js +367 -0
  34. package/src/compute/traverse.js +180 -0
  35. package/src/cost/aggregator.js +229 -0
  36. package/src/cost/pricing.js +134 -0
  37. package/src/cost/readers/claude.js +179 -0
  38. package/src/cost/readers/codex.js +131 -0
  39. package/src/cost/readers/gemini.js +111 -0
  40. package/src/cost/savings.js +243 -0
  41. package/src/cross-dispatcher.js +437 -0
  42. package/src/cross-orchestrator-cli.js +1885 -0
  43. package/src/cross-orchestrator.js +598 -0
  44. package/src/cross-project-search.js +114 -0
  45. package/src/dashboard-client.html +1180 -0
  46. package/src/dashboard-server.js +895 -0
  47. package/src/design-companion.js +81 -0
  48. package/src/dispatch/colon-syntax.js +732 -0
  49. package/src/dispatch-planner.js +235 -0
  50. package/src/dream/cooldown.js +105 -0
  51. package/src/dream/runner.mjs +373 -0
  52. package/src/dream/staleness-wiring.js +195 -0
  53. package/src/feedback-detector.js +57 -0
  54. package/src/hero-line.js +115 -0
  55. package/src/importers/claude-mem.js +152 -0
  56. package/src/importers/cli.js +311 -0
  57. package/src/importers/common.js +84 -0
  58. package/src/importers/discover.js +235 -0
  59. package/src/importers/rtk.js +107 -0
  60. package/src/intent-router.js +221 -0
  61. package/src/lib/atomic-io.js +201 -0
  62. package/src/lib/cache.js +33 -0
  63. package/src/lib/npm-view.js +104 -0
  64. package/src/lib/status-card.js +95 -0
  65. package/src/lib/token.js +85 -0
  66. package/src/memory/fts5.js +349 -0
  67. package/src/memory/migration-runner.js +116 -0
  68. package/src/memory/migrations/001-fts5-init.js +26 -0
  69. package/src/memory/migrations/002-tier-semantic.js +60 -0
  70. package/src/memory/migrations/003-stale-candidate.js +60 -0
  71. package/src/memory/reader.js +300 -0
  72. package/src/memory/recall-counter.js +76 -0
  73. package/src/memory/schema.sql +79 -0
  74. package/src/memory/search.js +431 -0
  75. package/src/memory/staleness.js +237 -0
  76. package/src/memory/tier-promotion.js +377 -0
  77. package/src/memory/tokenize.js +63 -0
  78. package/src/project-type-detector.js +866 -0
  79. package/src/prompt-check.js +171 -0
  80. package/src/ralph-allowlist.js +88 -0
  81. package/src/receipts.js +129 -0
  82. package/src/redactor.js +107 -0
  83. package/src/sandbox.js +275 -0
  84. package/src/sanitizer.js +69 -0
  85. package/src/scan-resume.js +167 -0
  86. package/src/schema.js +82 -0
  87. package/src/search-bm25.js +108 -0
  88. package/src/server.js +1414 -0
  89. package/src/swarm-config.js +80 -0
  90. package/src/trident/dispatch.js +211 -0
  91. package/src/trident/lens-health.js +253 -0
  92. package/src/update-apply.js +79 -0
  93. package/src/update-check.js +136 -0
  94. package/src/vectors.js +178 -0
  95. package/templates/design/bento-grid.md +84 -0
  96. package/templates/design/brutalist-luxe.md +82 -0
  97. package/templates/design/cinematic-dark.md +82 -0
  98. package/templates/design/data-dense-dashboard.md +88 -0
  99. package/templates/design/editorial-warm.md +81 -0
  100. package/templates/design/glassmorphic.md +84 -0
  101. package/templates/design/magazine-editorial.md +84 -0
  102. package/templates/design/maximalist-vibrant.md +85 -0
  103. package/templates/design/neo-swiss-tech.md +85 -0
  104. package/templates/design/swiss-minimal.md +80 -0
  105. package/templates/design/terminal-native.md +83 -0
  106. package/templates/design/warm-organic.md +84 -0
@@ -0,0 +1,377 @@
1
+ // IJFW v1.3.0 -- D1 tier promotion logic.
2
+ //
3
+ // Source authority: .planning/1.3.0/D-PILLAR-SPEC.md §1 (tier promotion rules).
4
+ //
5
+ // Implements the four promotion edges defined in the spec:
6
+ //
7
+ // Working -> Episodic at SessionEnd boundary (called by D3 hook)
8
+ // Episodic -> Semantic when supersession fires in dream cycle
9
+ // (Jaccard similarity > 0.7 OR explicit
10
+ // `promote:semantic` tag)
11
+ // Working -> Procedural from TaskUpdate completed events with task
12
+ // duration >= 5 minutes (then dream-cycle
13
+ // pattern matcher confirms 3+ similar
14
+ // task->commit chains -- tracked here as
15
+ // `procedural_candidate` rows; final
16
+ // confirmation deferred to dream cycle module)
17
+ // Semantic -> archived NO promotion in alpha
18
+ //
19
+ // Each promotion function returns `{ promoted: <int>, errors: [<string>...] }` (promoteEpisodicToSemantic additionally returns `superseded: [...]`).
20
+ // Errors are caught per call (each call wraps its writes in one transaction) and reported in `errors` instead of being thrown --
21
+ // the dream cycle and SessionEnd hook are best-effort consolidation, not
22
+ // integrity-critical writes.
23
+ //
24
+ // Promotions are ADDITIVE: the source row is preserved in place (audit
25
+ // trail). The destination row is a NEW INSERT with the new tier_semantic
26
+ // label and a `source` pointer referencing the originating record.
27
+ //
28
+ // Concurrency: each promotion function opens its own write tx via
29
+ // db.txn(...) (BEGIN IMMEDIATE) so a SessionEnd consolidation racing a
30
+ // dream-cycle scan serialises cleanly through the busy_timeout=5000 +
31
+ // RESERVED lock pattern from fts5.js.
32
+
33
+ import { tokenizeBody, jaccardSimilarity } from './tokenize.js';
34
+
35
// Promotion thresholds, taken from D-PILLAR-SPEC §1.

// Token-set similarity that two bodies must exceed to count as a match.
const JACCARD_THRESHOLD = 0.7;

// A completed task must have run at least this long (five minutes, in ms)
// before it is recorded as a procedural candidate.
const MS_PER_MINUTE = 60 * 1000;
const PROCEDURAL_MIN_DURATION_MS = 5 * MS_PER_MINUTE;

// Number of similar candidates needed to confirm a Procedural pattern.
const PROCEDURAL_PATTERN_MIN_CHAINS = 3;
39
+
40
// Canonical tier_semantic labels. Filter and insert through these constants
// rather than string literals so the labels cannot drift between call sites.
const TIER_LABELS = {
  WORKING: 'working',
  EPISODIC: 'episodic',
  SEMANTIC: 'semantic',
  PROCEDURAL: 'procedural',
  PROCEDURAL_CANDIDATE: 'procedural_candidate',
};

// Frozen so importers cannot add or rewrite labels at runtime.
export const TIERS = Object.freeze(TIER_LABELS);
49
+
50
+ // --- Working -> Episodic ---------------------------------------------------
51
+
52
/**
 * Consolidate the Working-tier rows of one session into a single Episodic
 * summary row (D-PILLAR-SPEC §1, Working -> Episodic edge).
 *
 * Steps, all executed inside ONE write transaction so the idempotency check
 * and the insert cannot be interleaved with another writer:
 *   1. If an Episodic row already exists for this session_id, do nothing.
 *   2. Read the session's Working rows ordered by created_at; if there are
 *      none, do nothing.
 *   3. INSERT one new row with tier_semantic='episodic', body = the Working
 *      bodies joined by a `---` separator, source = `session:<id>:episodic`
 *      and the same session_id. Working rows are left untouched.
 *
 * Never throws for database failures: they are reported in `errors`, and
 * `promoted` is only non-zero when the transaction completed.
 *
 * @param {object} db                handle exposing `prepare(sql)` and `txn(fn)`
 * @param {object} [opts]
 * @param {string} opts.session_id   the just-ended session
 * @returns {{ promoted: number, errors: string[] }}
 */
export function promoteWorkingToEpisodic(db, opts = {}) {
  if (!db || typeof db.prepare !== 'function') {
    return { promoted: 0, errors: ['promoteWorkingToEpisodic: invalid db handle'] };
  }
  // Tolerate an explicit `null` opts instead of throwing a TypeError.
  const session_id = (opts || {}).session_id;
  if (typeof session_id !== 'string' || session_id.length === 0) {
    return { promoted: 0, errors: ['promoteWorkingToEpisodic: session_id required'] };
  }

  const errors = [];
  let inserted = 0;
  try {
    const tx = db.txn(() => {
      // Reset in case the transaction wrapper re-invokes the callback.
      inserted = 0;

      // Idempotency check happens under the write lock, so two callers
      // racing on the same session cannot both pass it.
      const existing = db.prepare(
        `SELECT id FROM memory_entries
         WHERE session_id = ? AND tier_semantic = ? LIMIT 1`
      ).get(session_id, TIERS.EPISODIC);
      if (existing) return;

      const workingRows = db.prepare(
        `SELECT id, body, source FROM memory_entries
         WHERE session_id = ? AND tier_semantic = ?
         ORDER BY created_at ASC`
      ).all(session_id, TIERS.WORKING);
      if (workingRows.length === 0) return;

      // Minimal summary: the bodies joined in chronological order.
      const summary = workingRows.map((row) => row.body).join('\n\n---\n\n');
      const sourcePtr = `session:${session_id}:episodic`;

      db.prepare(
        `INSERT INTO memory_entries (body, source, session_id, created_at, tier_semantic)
         VALUES (?, ?, ?, ?, ?)`
      ).run(summary, sourcePtr, session_id, Date.now(), TIERS.EPISODIC);
      inserted = 1;
    });
    tx();
  } catch (err) {
    // The transaction did not complete, so nothing may be reported as
    // promoted. NOTE(review): assumes db.txn rolls back when it throws --
    // confirm against the wrapper in fts5.js.
    inserted = 0;
    errors.push(`promoteWorkingToEpisodic: ${err && err.message ? err.message : String(err)}`);
  }

  return { promoted: inserted, errors };
}
121
+
122
+ // --- Episodic -> Semantic --------------------------------------------------
123
+
124
/**
 * Promote Episodic rows to Semantic (D-PILLAR-SPEC §1, Episodic -> Semantic).
 *
 * Two triggers are evaluated over all Episodic rows, ordered by created_at:
 *   A. explicit    -- the row's `source` contains the literal substring
 *                     `promote:semantic`.
 *   B. supersession -- for a pair (earlier, later) whose token-set Jaccard
 *                     similarity is > 0.7, the LATER row is promoted and the
 *                     earlier row is recorded as `source_id`.
 * Trigger A wins when both apply to the same row.
 *
 * Each promotion INSERTs a new row with tier_semantic='semantic', the
 * Episodic body and session_id, and source = `episodic:<id>:semantic`. The
 * Episodic rows themselves are not modified.
 *
 * Idempotency: rows that already have a Semantic counterpart (matched by that
 * source pointer) are skipped, both in the initial sweep and again inside the
 * write transaction, so a run that raced this one cannot cause a duplicate.
 *
 * All inserts share one transaction. If it fails, the error is reported in
 * `errors` and the result is `promoted: 0, superseded: []` -- partial counts
 * are never returned.
 *
 * @param {object} db   handle exposing `prepare(sql)` and `txn(fn)`
 * @returns {{
 *   promoted: number,
 *   errors: string[],
 *   superseded: Array<{ id: number, body: string, session_id: *, source_id: number, reason: string }>
 * }} `superseded` has one entry per row promoted by this call; `reason` is
 *    'explicit' or 'jaccard=<sim>'.
 */
export function promoteEpisodicToSemantic(db) {
  if (!db || typeof db.prepare !== 'function') {
    // Same shape as every other return path, so callers can always iterate
    // `superseded`.
    return {
      promoted: 0,
      errors: ['promoteEpisodicToSemantic: invalid db handle'],
      superseded: [],
    };
  }

  const errors = [];
  try {
    const episodics = db.prepare(
      `SELECT id, body, source, session_id FROM memory_entries
       WHERE tier_semantic = ?
       ORDER BY created_at ASC`
    ).all(TIERS.EPISODIC);
    if (episodics.length === 0) return { promoted: 0, errors, superseded: [] };

    // Episodic ids that already have a Semantic row, recovered from the
    // `episodic:<id>:semantic` source pointers.
    const alreadyPromoted = new Set();
    const semantics = db.prepare(
      `SELECT source FROM memory_entries WHERE tier_semantic = ?`
    ).all(TIERS.SEMANTIC);
    for (const s of semantics) {
      const m = String(s.source || '').match(/^episodic:(\d+):semantic$/);
      if (m) alreadyPromoted.add(Number(m[1]));
    }

    // Tokenize each body once; the pairwise sweep below reuses the sets.
    const tokenized = episodics.map((r) => ({ ...r, tokens: tokenizeBody(r.body) }));

    const toPromote = new Map(); // id -> { source_id, reason }

    // Trigger A: explicit tag.
    for (const r of tokenized) {
      if (alreadyPromoted.has(r.id)) continue;
      if (String(r.source || '').includes('promote:semantic')) {
        toPromote.set(r.id, { source_id: r.id, reason: 'explicit' });
      }
    }

    // Trigger B: pairwise supersession; `a` precedes `b` in created_at order
    // and only `b` (the later row) is ever promoted.
    for (let i = 0; i < tokenized.length; i++) {
      const a = tokenized[i];
      for (let j = i + 1; j < tokenized.length; j++) {
        const b = tokenized[j];
        if (alreadyPromoted.has(b.id) || toPromote.has(b.id)) continue;
        const sim = jaccardSimilarity(a.tokens, b.tokens);
        if (sim > JACCARD_THRESHOLD) {
          toPromote.set(b.id, { source_id: a.id, reason: `jaccard=${sim.toFixed(3)}` });
        }
      }
    }

    if (toPromote.size === 0) return { promoted: 0, errors, superseded: [] };

    // GA-B1: the per-promotion records are returned so the caller can act on
    // each freshly promoted Episodic body.
    const rowsById = new Map(tokenized.map((r) => [r.id, r]));
    let committed = [];
    const tx = db.txn(() => {
      // Stage locally; only published once the callback finishes, and only
      // returned once tx() itself has returned without throwing.
      const staged = [];
      const findExisting = db.prepare(
        `SELECT id FROM memory_entries WHERE source = ? AND tier_semantic = ? LIMIT 1`
      );
      const insert = db.prepare(
        `INSERT INTO memory_entries (body, source, session_id, created_at, tier_semantic)
         VALUES (?, ?, ?, ?, ?)`
      );
      for (const [id, info] of toPromote) {
        const row = rowsById.get(id);
        if (!row) continue;
        const sourcePtr = `episodic:${id}:semantic`;
        // Re-check inside the transaction: the sweep above ran without a
        // write lock, so another run may have promoted this row meanwhile.
        if (findExisting.get(sourcePtr, TIERS.SEMANTIC)) continue;
        insert.run(row.body, sourcePtr, row.session_id, Date.now(), TIERS.SEMANTIC);
        staged.push({
          id,
          body: row.body,
          session_id: row.session_id,
          source_id: info.source_id,
          reason: info.reason,
        });
      }
      committed = staged;
    });
    tx();
    return { promoted: committed.length, errors, superseded: committed };
  } catch (err) {
    // NOTE(review): assumes db.txn rolls the batch back when it throws --
    // confirm against the wrapper in fts5.js.
    errors.push(`promoteEpisodicToSemantic: ${err && err.message ? err.message : String(err)}`);
    return { promoted: 0, errors, superseded: [] };
  }
}
241
+
242
+ // --- Working -> Procedural -------------------------------------------------
243
+
244
/**
 * Record a completed long-running task as a `procedural_candidate` and, once
 * enough similar candidates exist, write a confirmed Procedural row
 * (D-PILLAR-SPEC §1, Working -> Procedural edge).
 *
 * Expected event shape:
 *   { task_id, status, start_ts, end_ts, body, session_id, commit_tags? }
 * (`commit_tags` is not read by this function.)
 *
 * Behaviour:
 *   - status !== 'completed'                         -> no-op
 *   - start_ts or end_ts missing / not numeric       -> no-op (a duration
 *     cannot be established, so the 5-minute rule cannot be satisfied)
 *   - end_ts - start_ts < 5 minutes                  -> no-op
 *   - task_id or body empty                          -> error result
 *   - otherwise, inside ONE write transaction:
 *       1. INSERT a procedural_candidate row with source
 *          `task:<task_id>:procedural_candidate`, unless a row with that
 *          source already exists.
 *       2. Compare the body (token-set Jaccard > 0.7) with every
 *          procedural_candidate row, including the one just written. If 3 or
 *          more match, INSERT a Procedural row whose source is
 *          `procedural:from-candidates:<matched ids>`, unless that exact
 *          pointer already exists. Candidate rows are left in place.
 *
 * NOTE(review): the Procedural pointer embeds the matched id list, so a later
 * similar task (a larger matched set) yields a new pointer and another
 * Procedural row -- confirm that is intended.
 *
 * Never throws for database failures: they are reported in `errors`, and
 * `promoted` only counts rows written by a transaction that completed.
 *
 * @param {object} db           handle exposing `prepare(sql)` and `txn(fn)`
 * @param {object} [taskUpdate] TaskUpdate event (see shape above)
 * @returns {{ promoted: number, errors: string[] }} `promoted` is the number
 *          of rows inserted by this call (0, 1 or 2).
 */
export function promoteWorkingToProcedural(db, taskUpdate = {}) {
  if (!db || typeof db.prepare !== 'function') {
    return { promoted: 0, errors: ['promoteWorkingToProcedural: invalid db handle'] };
  }

  // Tolerate an explicit `null` event instead of throwing a TypeError.
  const update = taskUpdate || {};

  if (String(update.status || '') !== 'completed') return { promoted: 0, errors: [] };

  // A missing timestamp must not be coerced to 0: with an epoch-millisecond
  // end_ts that would make every task look hours long.
  if (update.start_ts == null || update.end_ts == null) {
    return { promoted: 0, errors: [] };
  }
  const duration = Number(update.end_ts) - Number(update.start_ts);
  if (!Number.isFinite(duration) || duration < PROCEDURAL_MIN_DURATION_MS) {
    return { promoted: 0, errors: [] };
  }

  const task_id = update.task_id || '';
  const session_id = update.session_id || null;
  const body = String(update.body || '');
  if (!task_id || !body) {
    return { promoted: 0, errors: ['promoteWorkingToProcedural: task_id and body required'] };
  }

  const errors = [];
  let written = 0;
  try {
    const candidatePtr = `task:${task_id}:procedural_candidate`;

    const tx = db.txn(() => {
      // Counted locally and published only when the callback finishes.
      let staged = 0;
      const findBySource = db.prepare(
        `SELECT id FROM memory_entries WHERE source = ? LIMIT 1`
      );
      const insert = db.prepare(
        `INSERT INTO memory_entries (body, source, session_id, created_at, tier_semantic)
         VALUES (?, ?, ?, ?, ?)`
      );

      // Idempotency check under the write lock: one candidate row per task.
      if (!findBySource.get(candidatePtr)) {
        insert.run(body, candidatePtr, session_id, Date.now(), TIERS.PROCEDURAL_CANDIDATE);
        staged++;
      }

      // Pattern match against every candidate (the new one included).
      const candidates = db.prepare(
        `SELECT id, body, source FROM memory_entries
         WHERE tier_semantic = ?
         ORDER BY created_at ASC`
      ).all(TIERS.PROCEDURAL_CANDIDATE);

      const myTokens = tokenizeBody(body);
      const matchedIds = [];
      for (const c of candidates) {
        if (jaccardSimilarity(myTokens, tokenizeBody(c.body)) > JACCARD_THRESHOLD) {
          matchedIds.push(c.id);
        }
      }

      if (matchedIds.length >= PROCEDURAL_PATTERN_MIN_CHAINS) {
        const proceduralPtr = `procedural:from-candidates:${matchedIds.join(',')}`;
        // Don't double-promote the same candidate set.
        if (!findBySource.get(proceduralPtr)) {
          insert.run(body, proceduralPtr, session_id, Date.now(), TIERS.PROCEDURAL);
          staged++;
        }
      }

      written = staged;
    });
    tx();
  } catch (err) {
    // NOTE(review): assumes db.txn rolls back when it throws -- confirm
    // against the wrapper in fts5.js.
    written = 0;
    errors.push(`promoteWorkingToProcedural: ${err && err.message ? err.message : String(err)}`);
  }

  return { promoted: written, errors };
}
359
+
360
+ // --- Semantic -> archived (alpha no-op) ------------------------------------
361
+
362
/**
 * Semantic -> archived edge. D-PILLAR-SPEC §1 defines no promotion for this
 * edge in alpha, so this is a deliberate no-op kept only so that callers
 * wiring all four edges have something to import.
 *
 * @param {object} _db  unused
 * @returns {{ promoted: number, errors: string[] }} always zero promotions
 *          and an empty (fresh) error list.
 */
export function promoteSemanticToArchived(_db) {
  const nothingPromoted = { promoted: 0, errors: [] };
  return nothingPromoted;
}
370
+
371
// Default export: the same members as the named exports, bundled into one
// object for callers that import the module as a whole.
const tierPromotion = {
  TIERS,
  promoteWorkingToEpisodic,
  promoteEpisodicToSemantic,
  promoteWorkingToProcedural,
  promoteSemanticToArchived,
};

export default tierPromotion;
@@ -0,0 +1,63 @@
1
+ // IJFW v1.3.0 -- shared tokenization + Jaccard similarity for tier-promotion.
2
+ //
3
+ // Source authority: .planning/1.3.0/D-PILLAR-SPEC.md §1 (Episodic ->
4
+ // Semantic supersession trigger B uses token-set Jaccard > 0.7).
5
+ //
6
+ // Zero-deps, deterministic. Lowercases, strips non-word chars, drops
7
+ // length-1 tokens (noise) and a small English stopword set so common
8
+ // glue words don't dominate similarity scores.
9
+
10
// English glue words ignored by tokenizeBody. Deliberately short: the aim is
// to drop connective noise, not to do language-specific NLP.
const STOPWORDS = new Set(
  [
    // articles, conjunctions, prepositions
    'a an the and or but of in on at to for',
    'with by from as',
    // forms of "to be"
    'is are was were be been being',
    // demonstratives and pronouns
    'this that these those it its i we you they',
    // misc. connectives and auxiliaries
    'so if then than do did does',
  ]
    .join(' ')
    .split(' ')
);
18
+
19
/**
 * Tokenize a body string into a Set of normalised tokens.
 *
 *  - Normalises to NFC so composed and decomposed forms of the same
 *    character produce the same token.
 *  - Lowercases.
 *  - Splits on every run of characters that are not Unicode letters,
 *    combining marks, numbers or `_`. For ASCII-only input this is exactly
 *    the `[^a-z0-9_]` split; non-ASCII words (e.g. "café", "größe") are now
 *    kept whole instead of being cut at the first non-ASCII letter.
 *  - Drops empty / length-1 tokens.
 *  - Drops stopwords.
 *
 * @param {string} body
 * @returns {Set<string>} empty when `body` is not a non-empty string
 */
export function tokenizeBody(body) {
  if (typeof body !== 'string' || body.length === 0) return new Set();
  const out = new Set();
  const normalised = body.normalize('NFC').toLowerCase();
  for (const tok of normalised.split(/[^\p{L}\p{M}\p{N}_]+/u)) {
    if (tok.length <= 1) continue;
    if (STOPWORDS.has(tok)) continue;
    out.add(tok);
  }
  return out;
}
40
+
41
/**
 * Jaccard similarity of two token collections:
 *   |A intersect B| / |A union B|
 *
 * Arguments that are not already Sets are converted with `new Set(x || [])`,
 * so arrays (or other iterables) and null/undefined are accepted.
 *
 * @param {Set<string>} a
 * @param {Set<string>} b
 * @returns {number} a value in [0.0, 1.0]; 0 when both sets are empty
 */
export function jaccardSimilarity(a, b) {
  const asSet = (value) => (value instanceof Set ? value : new Set(value || []));
  const left = asSet(a);
  const right = asSet(b);

  if (left.size === 0 && right.size === 0) return 0;

  // Probe from the smaller set so the membership loop is as short as possible.
  const leftIsSmaller = left.size <= right.size;
  const probe = leftIsSmaller ? left : right;
  const target = leftIsSmaller ? right : left;

  let shared = 0;
  probe.forEach((token) => {
    if (target.has(token)) shared += 1;
  });

  const unionSize = left.size + right.size - shared;
  if (unionSize === 0) return 0;
  return shared / unionSize;
}
62
+
63
// Default export: the two helpers bundled for whole-module imports.
const tokenizeApi = { tokenizeBody, jaccardSimilarity };

export default tokenizeApi;