mindforge-cc 2.3.5 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. package/.agent/skills/mindforge-plan-phase/SKILL.md +1 -0
  2. package/.agent/skills/mindforge-system-architecture/SKILL.md +136 -0
  3. package/.agent/skills/mindforge-system-architecture/examples.md +120 -0
  4. package/.agent/skills/mindforge-system-architecture/scaling-checklist.md +76 -0
  5. package/.agent/skills/mindforge-tdd/SKILL.md +112 -0
  6. package/.agent/skills/mindforge-tdd/deep-modules.md +21 -0
  7. package/.agent/skills/mindforge-tdd/interface-design.md +22 -0
  8. package/.agent/skills/mindforge-tdd/mocking.md +24 -0
  9. package/.agent/skills/mindforge-tdd/refactoring.md +21 -0
  10. package/.agent/skills/mindforge-tdd/tests.md +28 -0
  11. package/.agent/workflows/mindforge-plan-phase.md +30 -1
  12. package/.agent/workflows/mindforge:architecture.md +40 -0
  13. package/.agent/workflows/mindforge:executor.md +18 -0
  14. package/.agent/workflows/mindforge:identity.md +18 -0
  15. package/.agent/workflows/mindforge:memory.md +18 -0
  16. package/.agent/workflows/mindforge:planner.md +18 -0
  17. package/.agent/workflows/mindforge:researcher.md +18 -0
  18. package/.agent/workflows/mindforge:reviewer.md +18 -0
  19. package/.agent/workflows/mindforge:tdd.md +41 -0
  20. package/.agent/workflows/mindforge:tool.md +18 -0
  21. package/.mindforge/engine/ads-protocol.md +54 -0
  22. package/.mindforge/engine/compaction-protocol.md +21 -36
  23. package/.mindforge/engine/context-injector.md +26 -0
  24. package/.mindforge/engine/knowledge-graph-protocol.md +125 -0
  25. package/.mindforge/engine/shard-controller.md +53 -0
  26. package/.mindforge/engine/temporal-protocol.md +40 -0
  27. package/.mindforge/personas/mf-executor.md +40 -0
  28. package/.mindforge/personas/mf-memory.md +33 -0
  29. package/.mindforge/personas/mf-planner.md +45 -0
  30. package/.mindforge/personas/mf-researcher.md +39 -0
  31. package/.mindforge/personas/mf-reviewer.md +35 -0
  32. package/.mindforge/personas/mf-tool.md +33 -0
  33. package/.planning/AUDIT.jsonl +1 -0
  34. package/.planning/TEMPORAL-TEST.md +1 -0
  35. package/.planning/history/36525e1d9da1b674/ARCHITECTURE.md +0 -0
  36. package/.planning/history/36525e1d9da1b674/HANDOFF.json +8 -0
  37. package/.planning/history/36525e1d9da1b674/PROJECT.md +33 -0
  38. package/.planning/history/36525e1d9da1b674/RELEASE-CHECKLIST.md +68 -0
  39. package/.planning/history/36525e1d9da1b674/REQUIREMENTS.md +0 -0
  40. package/.planning/history/36525e1d9da1b674/ROADMAP.md +12 -0
  41. package/.planning/history/36525e1d9da1b674/SNAPSHOT-META.json +18 -0
  42. package/.planning/history/36525e1d9da1b674/STATE.md +31 -0
  43. package/.planning/history/36525e1d9da1b674/TEMPORAL-TEST.md +1 -0
  44. package/.planning/history/36525e1d9da1b674/jira-sync.json +5 -0
  45. package/.planning/history/36525e1d9da1b674/slack-threads.json +3 -0
  46. package/.planning/history/test-audit-001/ARCHITECTURE.md +0 -0
  47. package/.planning/history/test-audit-001/HANDOFF.json +8 -0
  48. package/.planning/history/test-audit-001/PROJECT.md +33 -0
  49. package/.planning/history/test-audit-001/RELEASE-CHECKLIST.md +68 -0
  50. package/.planning/history/test-audit-001/REQUIREMENTS.md +0 -0
  51. package/.planning/history/test-audit-001/ROADMAP.md +12 -0
  52. package/.planning/history/test-audit-001/SNAPSHOT-META.json +17 -0
  53. package/.planning/history/test-audit-001/STATE.md +31 -0
  54. package/.planning/history/test-audit-001/TEMPORAL-TEST.md +1 -0
  55. package/.planning/history/test-audit-001/jira-sync.json +5 -0
  56. package/.planning/history/test-audit-001/slack-threads.json +3 -0
  57. package/CHANGELOG.md +101 -0
  58. package/README.md +57 -23
  59. package/bin/autonomous/auto-runner.js +23 -0
  60. package/bin/dashboard/server.js +2 -0
  61. package/bin/dashboard/temporal-api.js +82 -0
  62. package/bin/engine/temporal-cli.js +52 -0
  63. package/bin/engine/temporal-hub.js +138 -0
  64. package/bin/hindsight-injector.js +59 -0
  65. package/bin/memory/auto-shadow.js +274 -0
  66. package/bin/memory/embedding-engine.js +326 -0
  67. package/bin/memory/knowledge-capture.js +122 -5
  68. package/bin/memory/knowledge-graph.js +572 -0
  69. package/bin/memory/knowledge-store.js +15 -3
  70. package/bin/mindforge-cli.js +19 -0
  71. package/bin/models/model-router.js +1 -0
  72. package/bin/review/ads-engine.js +126 -0
  73. package/bin/review/ads-synthesizer.js +117 -0
  74. package/bin/shard-helper.js +134 -0
  75. package/bin/spawn-agent.js +61 -0
  76. package/docs/PERSONAS.md +71 -5
  77. package/docs/adr/ADR-042-ads-protocol.md +30 -0
  78. package/docs/architecture/README.md +55 -0
  79. package/docs/architecture/V3-CORE.md +52 -0
  80. package/docs/commands-reference.md +3 -2
  81. package/docs/usp-features.md +33 -15
  82. package/package.json +1 -1
@@ -0,0 +1,274 @@
1
+ /**
2
+ * MindForge v2.4.0 — Auto-Shadow Engine (RAG 2.0)
3
+ * Proactive "ghost pattern" injection — surfaces relevant knowledge
4
+ * before subagent execution WITHOUT manual /mindforge:remember calls.
5
+ *
6
+ * Design:
7
+ * - Runs automatically before each subagent spawn (context-injector hook)
8
+ * - Queries both the Knowledge Graph (traversal) and Embedding Engine (similarity)
9
+ * - Formats top results into a structured context section
10
+ * - Budget-capped at ~8000 chars (≈2K tokens) to prevent context bloat
11
+ * - Deduplicates against Hot/Warm context already loaded
12
+ * - Never shadows secrets, credentials, or deprecated entries
13
+ */
14
+ 'use strict';
15
+
16
+ const fs = require('fs');
17
+ const path = require('path');
18
+ const Store = require('./knowledge-store');
19
+ const Graph = require('./knowledge-graph');
20
+ const Embedder = require('./embedding-engine');
21
+
22
// ── Configuration ─────────────────────────────────────────────────────────────
const MAX_SHADOW_CHARS = 8000; // Char budget for the shadow section (~2K tokens)
const MAX_SHADOW_ITEMS = 5; // Max items in shadow section
const MIN_SHADOW_SCORE = 0.35; // Minimum combined score to include
// Keyword blocklist: entries mentioning any of these are never shadowed
// (substring match against lowercased topic + content + tags — see containsSecrets()).
const SECURITY_KEYWORDS = new Set([
  'password', 'secret', 'token', 'api_key', 'apikey', 'private_key',
  'credential', 'auth_token', 'bearer', 'encryption_key', 'ssh',
]);
30
+
31
+ // ── Core Shadow Logic ─────────────────────────────────────────────────────────
32
+
33
/**
 * Generate auto-shadow context for a given task description.
 * Primary entry point, called by the context-injector before subagent spawn.
 *
 * Pipeline: embed the active knowledge base, run a hybrid graph +
 * similarity query, filter (exclusions, score floor, secrets), then
 * format the top results into a budget-capped markdown section.
 *
 * @param {object} opts
 * @param {string} opts.taskDescription - Current task/plan description
 * @param {string[]} [opts.excludeIds] - Entry IDs already in hot/warm context
 * @param {string[]} [opts.techStack] - Tech stack for relevance boosting
 * @param {number} [opts.maxItems] - Override max items (default: 5)
 * @returns {{ formatted: string, items: object[], count: number, budgetUsed: number }}
 */
function generateShadowContext(opts = {}) {
  const {
    taskDescription = '',
    excludeIds = [],
    techStack = [],
    maxItems = MAX_SHADOW_ITEMS,
  } = opts;

  // Too-short descriptions produce meaningless queries — bail out early.
  if (!taskDescription || taskDescription.length < 10) {
    return { formatted: '', items: [], count: 0, budgetUsed: 0 };
  }

  // 1. Load the knowledge base (project + global) and drop entries that
  //    are deprecated or below the confidence floor.
  const allEntries = Store.readAll(true); // Include global
  const activeEntries = allEntries.filter(e => !e.deprecated && e.confidence >= 0.3);

  if (activeEntries.length === 0) {
    return { formatted: '', items: [], count: 0, budgetUsed: 0 };
  }

  // Index entries by id once — replaces an O(n) find() per result below.
  const entryById = new Map(activeEntries.map(e => [e.id, e]));

  const { vectors, df, N } = Embedder.buildEmbeddings(activeEntries);

  // 2. Hybrid query: embedding similarity + graph traversal
  const queryText = `${taskDescription} ${techStack.join(' ')}`;
  const related = Graph.findRelated(queryText, vectors, df, N, {
    maxHops: 2,
    topK: maxItems * 2, // Over-fetch so filtering still leaves enough items
  });

  // 3. Filter and enrich results
  const excludeSet = new Set(excludeIds);
  const enriched = [];

  for (const result of related) {
    if (excludeSet.has(result.id)) continue;
    if (result.score < MIN_SHADOW_SCORE) continue;

    const entry = entryById.get(result.id);
    if (!entry) continue;

    // Security guard: never shadow secrets
    if (containsSecrets(entry)) continue;

    enriched.push({
      id: entry.id,
      type: entry.type,
      topic: entry.topic,
      content: entry.content,
      confidence: entry.confidence,
      score: result.score,
      source: result.source,
      tags: entry.tags || [],
      edges: getEdgeSummary(entry.id),
    });
  }

  // 4. Sort by score (descending) and cap to the item budget
  enriched.sort((a, b) => b.score - a.score);
  const capped = enriched.slice(0, maxItems);

  // NOTE(review): flagContradictions() is defined and exported in this file
  // but never invoked here — confirm whether the context-injector applies it,
  // or whether it should be called on `capped` before formatting.

  // 5. Format for context injection (char-budget-capped)
  const formatted = formatShadowSection(capped);

  return {
    formatted,
    items: capped,
    count: capped.length,
    budgetUsed: formatted.length,
  };
}
114
+
115
+ // ── Formatting ────────────────────────────────────────────────────────────────
116
+
117
/**
 * Render shadow items as a markdown section for context injection.
 * Appending stops once the running character total would exceed
 * MAX_SHADOW_CHARS, so the section never blows the context budget.
 * @param {object[]} items - Enriched shadow items
 * @returns {string} Markdown section, or '' when there is nothing to show
 */
function formatShadowSection(items) {
  if (items.length === 0) return '';

  const out = [
    '## Auto-Shadow Context (RAG 2.0)',
    '',
    '> These are automatically surfaced "ghost patterns" from past sessions.',
    '> Use them as background context — do not explicitly reference them unless relevant.',
    '',
  ];

  let used = out.join('\n').length;

  for (const entry of items) {
    const suffix = entry.edges ? ` [${entry.edges}]` : '';
    const pct = `${(entry.confidence * 100).toFixed(0)}%`;

    const chunk = [
      `### ${getTypeIcon(entry.type)} ${entry.topic} (${pct} confidence)${suffix}`,
      truncateContent(entry.content, 300),
      entry.tags.length > 0 ? `Tags: ${entry.tags.join(', ')}` : '',
      `Source: ${entry.source} | Score: ${entry.score.toFixed(2)}`,
      '',
    ].join('\n');

    // Respect the hard character budget; drop the rest once full.
    if (used + chunk.length > MAX_SHADOW_CHARS) break;
    out.push(chunk);
    used += chunk.length;
  }

  return out.join('\n');
}
155
+
156
/**
 * Map a knowledge-entry type to its display icon.
 * Unknown types fall back to the generic bulb.
 * @param {string} type - Entry type identifier
 * @returns {string} Emoji icon
 */
function getTypeIcon(type) {
  const iconByType = {
    architectural_decision: '🏛️',
    code_pattern: '🔧',
    bug_pattern: '🐛',
    team_preference: '👥',
    domain_knowledge: '📚',
  };
  // Falsy lookup (unknown type) yields the default icon.
  return iconByType[type] || '💡';
}
171
+
172
/**
 * Summarize a node's edges as "count type" pairs, comma-separated
 * (e.g. "2 relates-to, 1 contradicts"). Edge-type names are lowercased
 * with underscores turned into hyphens.
 * Graph errors are swallowed and yield '' — the summary is decorative.
 * @param {string} nodeId - Knowledge-graph node ID
 * @returns {string} Edge-count summary, or '' when none/unavailable
 */
function getEdgeSummary(nodeId) {
  try {
    const edges = Graph.getNodeEdges(nodeId);
    if (edges.length === 0) return '';

    // Tally edges by type.
    const countByType = {};
    for (const edge of edges) {
      countByType[edge.type] = (countByType[edge.type] || 0) + 1;
    }

    const parts = [];
    for (const [type, count] of Object.entries(countByType)) {
      parts.push(`${count} ${type.toLowerCase().replace(/_/g, '-')}`);
    }
    return parts.join(', ');
  } catch {
    return '';
  }
}
194
+
195
/**
 * Cap a string at maxLen characters, ending with '...' when cut.
 * The ellipsis counts toward maxLen, so output never exceeds it.
 * @param {string} content - Text to truncate (may be null/undefined)
 * @param {number} maxLen - Maximum output length
 * @returns {string} Original or truncated text; '' for falsy input
 */
function truncateContent(content, maxLen) {
  if (!content) return '';
  return content.length <= maxLen
    ? content
    : `${content.slice(0, maxLen - 3)}...`;
}
206
+
207
+ // ── Security ──────────────────────────────────────────────────────────────────
208
+
209
/**
 * Heuristic check for secret/credential material in a knowledge entry.
 * Scans topic, content, and tags for security keywords, then runs
 * pattern checks against the raw content. Errs toward false positives:
 * over-matching only hides a shadow item, under-matching leaks a secret.
 *
 * @param {object} entry - Knowledge entry { topic, content, tags }
 * @returns {boolean} true when the entry looks like it contains a secret
 */
function containsSecrets(entry) {
  const text = `${entry.topic} ${entry.content} ${(entry.tags || []).join(' ')}`.toLowerCase();

  for (const keyword of SECURITY_KEYWORDS) {
    if (text.includes(keyword)) return true;
  }

  // Check for common secret patterns. The long-token check is now
  // case-insensitive: real API keys are usually mixed-case, which the
  // previous lowercase-only pattern missed entirely (it also matched
  // any long lowercase word, not just "hex" as its comment claimed).
  const secretPatterns = [
    /[a-z0-9]{32,}/i, // Long alphanumeric runs (API keys, hex digests)
    /-----BEGIN/, // PEM keys
    /sk_[a-z]+_[a-z0-9]/i, // Stripe-style keys
  ];

  for (const pattern of secretPatterns) {
    if (pattern.test(entry.content || '')) return true;
  }

  return false;
}
234
+
235
+ // ── Contradiction Detection ───────────────────────────────────────────────────
236
+
237
/**
 * Flag shadow items that contradict another item in the same shadow set.
 * Only intra-set contradictions count: a CONTRADICTS edge whose other
 * endpoint is NOT among `items` is ignored.
 *
 * NOTE: mutates the given items in place (sets `.contradiction = true`
 * and prefixes `.topic` with a warning) and returns the same array.
 * Calling it twice on the same array prefixes the topic twice.
 *
 * @param {object[]} items - Shadow items (mutated in place)
 * @returns {object[]} The same array, with contradiction flags applied
 */
function flagContradictions(items) {
  // IDs present in this shadow set, for membership tests below.
  const itemIds = new Set(items.map(i => i.id));

  for (const item of items) {
    const edges = Graph.getNodeEdges(item.id, {
      edgeTypes: [Graph.EDGE_TYPES.CONTRADICTS],
    });

    // An edge may point either way; check whichever endpoint isn't this item.
    const contradicted = edges.some(e =>
      itemIds.has(e.sourceId === item.id ? e.targetId : e.sourceId)
    );

    if (contradicted) {
      item.contradiction = true;
      item.topic = `⚠️ ${item.topic} [CONTRADICTED]`;
    }
  }

  return items;
}
263
+
264
// ── Exports ───────────────────────────────────────────────────────────────────
// generateShadowContext is the primary entry point; helpers and the
// tuning constants are exposed alongside it.
module.exports = {
  generateShadowContext,
  formatShadowSection,
  containsSecrets,
  flagContradictions,
  getEdgeSummary,
  MAX_SHADOW_CHARS,
  MAX_SHADOW_ITEMS,
  MIN_SHADOW_SCORE,
};
@@ -0,0 +1,326 @@
1
+ /**
2
+ * MindForge v2.4.0 — Embedding Engine (RAG 2.0)
3
+ * Local-first TF-IDF vector space for semantic similarity.
4
+ *
5
+ * No external API dependencies — runs entirely on local compute.
6
+ * Provides vectorization and cosine similarity for the Knowledge Graph.
7
+ *
8
+ * Design:
9
+ * - Sparse TF-IDF vectors stored as { token → weight } objects
10
+ * - Cosine similarity for semantic matching between entries
11
+ * - Embedding cache persisted to disk for fast session restarts
12
+ * - Auto-edge inference when similarity exceeds threshold
13
+ */
14
+ 'use strict';
15
+
16
+ const fs = require('fs');
17
+ const path = require('path');
18
+ const crypto = require('crypto');
19
+
20
// ── Configuration ─────────────────────────────────────────────────────────────
const SIMILARITY_THRESHOLD = 0.65; // Auto-edge creation threshold
const SHADOW_THRESHOLD = 0.50; // Minimum similarity for auto-shadow retrieval
const MAX_VECTOR_TERMS = 200; // Cap sparse vector dimensionality
const CACHE_SCHEMA_VERSION = '1.0.0';

// ── Stopwords (expanded for technical content) ────────────────────────────────
// Filtered out by tokenize(). The duplicate 'will' entry was removed —
// harmless in a Set, but noise for maintainers.
const STOPWORDS = new Set([
  'the', 'a', 'an', 'is', 'it', 'in', 'on', 'at', 'to', 'for', 'of', 'and',
  'or', 'but', 'not', 'this', 'that', 'with', 'from', 'by', 'be', 'are',
  'was', 'were', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
  'could', 'should', 'may', 'might', 'can', 'use', 'using', 'used', 'when',
  'where', 'which', 'what', 'how', 'why', 'who', 'all', 'any', 'some', 'we',
  'our', 'they', 'their', 'you', 'your', 'my', 'its', 'also', 'just', 'more',
  'very', 'been', 'being', 'each', 'then', 'than', 'into', 'only', 'other',
  'such', 'like', 'over', 'after', 'before', 'between', 'through', 'about',
  'shall', 'must', 'need', 'make', 'made', 'get', 'got', 'set',
  'new', 'old', 'see', 'way', 'well', 'back', 'even', 'give', 'most',
]);
39
+
40
+ // ── Tokenizer ─────────────────────────────────────────────────────────────────
41
+
42
/**
 * Tokenize text into normalized, filtered terms.
 * Splits camelCase, snake_case and kebab-case, lowercases, strips
 * punctuation (dots are kept so version strings survive), and drops
 * stopwords and tokens shorter than three characters.
 * @param {string} text - Raw text to tokenize
 * @returns {string[]} Filtered tokens
 */
function tokenize(text) {
  if (!text || typeof text !== 'string') return [];

  // "userId" → "user Id", then unify _ and - into spaces.
  const decamelized = text.replace(/([a-z])([A-Z])/g, '$1 $2');
  const flattened = decamelized.replace(/[_-]/g, ' ');
  // Keep letters, digits, dots and whitespace only.
  const cleaned = flattened.replace(/[^a-zA-Z0-9.\s]/g, ' ');

  const words = cleaned.toLowerCase().split(/\s+/);
  return words.filter(word => word.length > 2 && !STOPWORDS.has(word));
}
62
+
63
/**
 * Build adjacent-pair features from a token list.
 * ["react", "memo"] → ["react_memo"], so compound terms match as units.
 * @param {string[]} tokens - Unigram tokens
 * @returns {string[]} Bigram tokens (empty for fewer than two inputs)
 */
function bigrams(tokens) {
  const pairs = [];
  tokens.forEach((token, idx) => {
    if (idx > 0) pairs.push(`${tokens[idx - 1]}_${token}`);
  });
  return pairs;
}
76
+
77
+ // ── TF-IDF Vectorization ──────────────────────────────────────────────────────
78
+
79
/**
 * Count, for every token, how many documents contain it at least once.
 * @param {Array<{id: string, tokens: string[]}>} corpus - Tokenized docs
 * @returns {Map<string, number>} token → document frequency
 */
function buildDocumentFrequency(corpus) {
  const frequency = new Map();
  for (const document of corpus) {
    // Set-ify so repeated tokens within one document count once.
    for (const token of new Set(document.tokens)) {
      frequency.set(token, (frequency.get(token) ?? 0) + 1);
    }
  }
  return frequency;
}
94
+
95
/**
 * Compute a sparse TF-IDF vector for one document.
 * TF is count/length; IDF is the smoothed log((N+1)/(df+1)) + 1.
 * Only the MAX_VECTOR_TERMS highest-weighted terms are kept.
 * @param {string[]} tokens - Document tokens
 * @param {Map<string, number>} df - Document frequency map
 * @param {number} N - Total document count
 * @returns {Object<string, number>} Sparse vector { token → weight }
 */
function computeTfIdfVector(tokens, df, N) {
  if (tokens.length === 0) return {};

  // Raw term counts.
  const counts = {};
  for (const token of tokens) {
    counts[token] = (counts[token] || 0) + 1;
  }

  // Weight each term: normalized TF × smoothed IDF.
  const weighted = Object.entries(counts).map(([term, count]) => {
    const idf = Math.log((N + 1) / ((df.get(term) || 1) + 1)) + 1;
    return [term, (count / tokens.length) * idf];
  });

  // Keep only the strongest MAX_VECTOR_TERMS dimensions.
  weighted.sort((a, b) => b[1] - a[1]);
  return Object.fromEntries(weighted.slice(0, MAX_VECTOR_TERMS));
}
132
+
133
+ // ── Similarity ────────────────────────────────────────────────────────────────
134
+
135
/**
 * Cosine similarity between two sparse { token → weight } vectors.
 * Returns 0 for missing/empty vectors, no shared terms, or a zero norm.
 * @param {Object<string, number>} vecA - Sparse vector A
 * @param {Object<string, number>} vecB - Sparse vector B
 * @returns {number} Cosine similarity in [0, 1]
 */
function cosineSimilarity(vecA, vecB) {
  if (!vecA || !vecB) return 0;
  if (Object.keys(vecA).length === 0 || Object.keys(vecB).length === 0) return 0;

  // Dot product over terms present (with truthy weight) in both vectors.
  let dotProduct = 0;
  for (const [term, weightA] of Object.entries(vecA)) {
    const weightB = vecB[term];
    if (weightB) {
      dotProduct += weightA * weightB;
    }
  }
  if (dotProduct === 0) return 0;

  // Euclidean norm of a sparse vector.
  const norm = (vec) => {
    let sum = 0;
    for (const w of Object.values(vec)) sum += w * w;
    return Math.sqrt(sum);
  };

  const normA = norm(vecA);
  const normB = norm(vecB);
  if (normA === 0 || normB === 0) return 0;

  return dotProduct / (normA * normB);
}
171
+
172
+ // ── Corpus Manager ────────────────────────────────────────────────────────────
173
+
174
/**
 * Build TF-IDF embeddings for every non-deprecated knowledge entry.
 * Features are unigrams plus bigrams over topic + content + tags.
 * @param {object[]} entries - Knowledge entries with { id, topic, content, tags }
 * @returns {{ vectors: Map<string, object>, df: Map<string, number>, N: number }}
 */
function buildEmbeddings(entries) {
  // Tokenize each live entry into its feature list.
  const corpus = [];
  for (const entry of entries) {
    if (entry.deprecated) continue;
    const text = `${entry.topic || ''} ${entry.content || ''} ${(entry.tags || []).join(' ')}`;
    const unigrams = tokenize(text);
    corpus.push({ id: entry.id, tokens: unigrams.concat(bigrams(unigrams)) });
  }

  const df = buildDocumentFrequency(corpus);
  const N = corpus.length;

  const vectors = new Map(
    corpus.map(doc => [doc.id, computeTfIdfVector(doc.tokens, df, N)])
  );

  return { vectors, df, N };
}
200
+
201
/**
 * Vectorize a free-text query against an existing corpus's statistics.
 * Uses the same unigram + bigram featurization as buildEmbeddings().
 * @param {string} queryText - Natural language query
 * @param {Map<string, number>} df - Document frequency from corpus
 * @param {number} N - Total document count
 * @returns {Object<string, number>} Query vector
 */
function embedQuery(queryText, df, N) {
  const terms = tokenize(queryText);
  return computeTfIdfVector(terms.concat(bigrams(terms)), df, N);
}
213
+
214
/**
 * Rank corpus entries by cosine similarity to a query, best first.
 * @param {string} queryText - Natural language query
 * @param {Map<string, object>} vectors - Precomputed entry vectors
 * @param {Map<string, number>} df - Document frequency
 * @param {number} N - Corpus size
 * @param {number} [topK=10] - Max results
 * @param {number} [minSimilarity=SHADOW_THRESHOLD] - Similarity floor
 * @returns {Array<{id: string, similarity: number}>}
 */
function findSimilar(queryText, vectors, df, N, topK = 10, minSimilarity = SHADOW_THRESHOLD) {
  const queryVec = embedQuery(queryText, df, N);

  const matches = [];
  vectors.forEach((vec, id) => {
    const similarity = cosineSimilarity(queryVec, vec);
    if (similarity >= minSimilarity) {
      matches.push({ id, similarity });
    }
  });

  matches.sort((a, b) => b.similarity - a.similarity);
  return matches.slice(0, topK);
}
239
+
240
/**
 * Propose auto-link edges: every other entry whose similarity to the
 * given entry meets SIMILARITY_THRESHOLD, ordered strongest first.
 * @param {string} entryId - The new entry's ID
 * @param {Map<string, object>} vectors - All entry vectors
 * @returns {Array<{targetId: string, similarity: number}>} Candidate edges
 */
function inferEdges(entryId, vectors) {
  const sourceVec = vectors.get(entryId);
  if (!sourceVec) return [];

  const candidates = [];
  for (const [otherId, otherVec] of vectors) {
    if (otherId === entryId) continue; // Never link an entry to itself
    const similarity = cosineSimilarity(sourceVec, otherVec);
    if (similarity >= SIMILARITY_THRESHOLD) {
      candidates.push({ targetId: otherId, similarity });
    }
  }

  candidates.sort((a, b) => b.similarity - a.similarity);
  return candidates;
}
261
+
262
+ // ── Embedding Cache ───────────────────────────────────────────────────────────
263
+
264
/**
 * Save the embedding cache to disk as pretty-printed JSON, creating the
 * parent directory if needed.
 *
 * The checksum is a SHA-256 of the compact JSON serialization taken
 * while `checksum` was still '' — a verifier must reset `checksum` to ''
 * and re-stringify compactly (not pretty-printed) to reproduce it.
 * NOTE(review): loadCache() below never verifies this checksum — the
 * field is currently write-only; confirm whether verification was
 * intended.
 *
 * @param {string} cachePath - Absolute path to cache file
 * @param {Map<string, object>} vectors - Entry vectors
 * @param {Map<string, number>} df - Document frequency
 * @param {number} N - Corpus size
 */
function saveCache(cachePath, vectors, df, N) {
  const data = {
    schema_version: CACHE_SCHEMA_VERSION,
    updated_at: new Date().toISOString(),
    corpus_size: N,
    df: Object.fromEntries(df),
    vectors: Object.fromEntries(vectors),
    checksum: '', // placeholder so the hashed payload has a stable shape
  };

  // Hash the compact payload, then fill the checksum in before writing.
  const payload = JSON.stringify(data);
  data.checksum = crypto.createHash('sha256').update(payload).digest('hex');

  const dir = path.dirname(cachePath);
  if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
  fs.writeFileSync(cachePath, JSON.stringify(data, null, 2));
}
288
+
289
/**
 * Load the embedding cache from disk.
 * Returns null when the file is absent, unparsable, or written by a
 * different schema version — callers then rebuild from scratch.
 * Note: the stored checksum is not validated here.
 * @param {string} cachePath - Absolute path to cache file
 * @returns {{ vectors: Map<string, object>, df: Map<string, number>, N: number } | null}
 */
function loadCache(cachePath) {
  if (!fs.existsSync(cachePath)) return null;

  let raw;
  try {
    raw = JSON.parse(fs.readFileSync(cachePath, 'utf8'));
  } catch {
    return null; // Corrupt cache — rebuild
  }

  // Reject non-object payloads and schema mismatches alike.
  if (!raw || raw.schema_version !== CACHE_SCHEMA_VERSION) return null;

  return {
    vectors: new Map(Object.entries(raw.vectors || {})),
    df: new Map(Object.entries(raw.df || {})),
    N: raw.corpus_size || 0,
  };
}
310
+
311
// ── Exports ───────────────────────────────────────────────────────────────────
// Full pipeline surface: tokenization → TF-IDF → similarity → cache,
// plus the tuning thresholds so callers can reuse the same cutoffs.
module.exports = {
  tokenize,
  bigrams,
  buildDocumentFrequency,
  computeTfIdfVector,
  cosineSimilarity,
  buildEmbeddings,
  embedQuery,
  findSimilar,
  inferEdges,
  saveCache,
  loadCache,
  SIMILARITY_THRESHOLD,
  SHADOW_THRESHOLD,
};