mindforge-cc 2.3.5 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent/skills/mindforge-plan-phase/SKILL.md +1 -0
- package/.agent/skills/mindforge-system-architecture/SKILL.md +136 -0
- package/.agent/skills/mindforge-system-architecture/examples.md +120 -0
- package/.agent/skills/mindforge-system-architecture/scaling-checklist.md +76 -0
- package/.agent/skills/mindforge-tdd/SKILL.md +112 -0
- package/.agent/skills/mindforge-tdd/deep-modules.md +21 -0
- package/.agent/skills/mindforge-tdd/interface-design.md +22 -0
- package/.agent/skills/mindforge-tdd/mocking.md +24 -0
- package/.agent/skills/mindforge-tdd/refactoring.md +21 -0
- package/.agent/skills/mindforge-tdd/tests.md +28 -0
- package/.agent/workflows/mindforge-plan-phase.md +30 -1
- package/.agent/workflows/mindforge:architecture.md +40 -0
- package/.agent/workflows/mindforge:executor.md +18 -0
- package/.agent/workflows/mindforge:identity.md +18 -0
- package/.agent/workflows/mindforge:memory.md +18 -0
- package/.agent/workflows/mindforge:planner.md +18 -0
- package/.agent/workflows/mindforge:researcher.md +18 -0
- package/.agent/workflows/mindforge:reviewer.md +18 -0
- package/.agent/workflows/mindforge:tdd.md +41 -0
- package/.agent/workflows/mindforge:tool.md +18 -0
- package/.mindforge/engine/ads-protocol.md +54 -0
- package/.mindforge/engine/compaction-protocol.md +21 -36
- package/.mindforge/engine/context-injector.md +26 -0
- package/.mindforge/engine/knowledge-graph-protocol.md +125 -0
- package/.mindforge/engine/shard-controller.md +53 -0
- package/.mindforge/engine/temporal-protocol.md +40 -0
- package/.mindforge/personas/mf-executor.md +40 -0
- package/.mindforge/personas/mf-memory.md +33 -0
- package/.mindforge/personas/mf-planner.md +45 -0
- package/.mindforge/personas/mf-researcher.md +39 -0
- package/.mindforge/personas/mf-reviewer.md +35 -0
- package/.mindforge/personas/mf-tool.md +33 -0
- package/.planning/AUDIT.jsonl +1 -0
- package/.planning/TEMPORAL-TEST.md +1 -0
- package/.planning/history/36525e1d9da1b674/ARCHITECTURE.md +0 -0
- package/.planning/history/36525e1d9da1b674/HANDOFF.json +8 -0
- package/.planning/history/36525e1d9da1b674/PROJECT.md +33 -0
- package/.planning/history/36525e1d9da1b674/RELEASE-CHECKLIST.md +68 -0
- package/.planning/history/36525e1d9da1b674/REQUIREMENTS.md +0 -0
- package/.planning/history/36525e1d9da1b674/ROADMAP.md +12 -0
- package/.planning/history/36525e1d9da1b674/SNAPSHOT-META.json +18 -0
- package/.planning/history/36525e1d9da1b674/STATE.md +31 -0
- package/.planning/history/36525e1d9da1b674/TEMPORAL-TEST.md +1 -0
- package/.planning/history/36525e1d9da1b674/jira-sync.json +5 -0
- package/.planning/history/36525e1d9da1b674/slack-threads.json +3 -0
- package/.planning/history/test-audit-001/ARCHITECTURE.md +0 -0
- package/.planning/history/test-audit-001/HANDOFF.json +8 -0
- package/.planning/history/test-audit-001/PROJECT.md +33 -0
- package/.planning/history/test-audit-001/RELEASE-CHECKLIST.md +68 -0
- package/.planning/history/test-audit-001/REQUIREMENTS.md +0 -0
- package/.planning/history/test-audit-001/ROADMAP.md +12 -0
- package/.planning/history/test-audit-001/SNAPSHOT-META.json +17 -0
- package/.planning/history/test-audit-001/STATE.md +31 -0
- package/.planning/history/test-audit-001/TEMPORAL-TEST.md +1 -0
- package/.planning/history/test-audit-001/jira-sync.json +5 -0
- package/.planning/history/test-audit-001/slack-threads.json +3 -0
- package/CHANGELOG.md +101 -0
- package/README.md +57 -23
- package/bin/autonomous/auto-runner.js +23 -0
- package/bin/dashboard/server.js +2 -0
- package/bin/dashboard/temporal-api.js +82 -0
- package/bin/engine/temporal-cli.js +52 -0
- package/bin/engine/temporal-hub.js +138 -0
- package/bin/hindsight-injector.js +59 -0
- package/bin/memory/auto-shadow.js +274 -0
- package/bin/memory/embedding-engine.js +326 -0
- package/bin/memory/knowledge-capture.js +122 -5
- package/bin/memory/knowledge-graph.js +572 -0
- package/bin/memory/knowledge-store.js +15 -3
- package/bin/mindforge-cli.js +19 -0
- package/bin/models/model-router.js +1 -0
- package/bin/review/ads-engine.js +126 -0
- package/bin/review/ads-synthesizer.js +117 -0
- package/bin/shard-helper.js +134 -0
- package/bin/spawn-agent.js +61 -0
- package/docs/PERSONAS.md +71 -5
- package/docs/adr/ADR-042-ads-protocol.md +30 -0
- package/docs/architecture/README.md +55 -0
- package/docs/architecture/V3-CORE.md +52 -0
- package/docs/commands-reference.md +3 -2
- package/docs/usp-features.md +33 -15
- package/package.json +1 -1
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MindForge v2.4.0 — Auto-Shadow Engine (RAG 2.0)
|
|
3
|
+
* Proactive "ghost pattern" injection — surfaces relevant knowledge
|
|
4
|
+
* before subagent execution WITHOUT manual /mindforge:remember calls.
|
|
5
|
+
*
|
|
6
|
+
* Design:
|
|
7
|
+
* - Runs automatically before each subagent spawn (context-injector hook)
|
|
8
|
+
* - Queries both the Knowledge Graph (traversal) and Embedding Engine (similarity)
|
|
9
|
+
* - Formats top results into a structured context section
|
|
10
|
+
* - Budget-capped at ~8000 chars (≈2K tokens) to prevent context bloat
|
|
11
|
+
* - Deduplicates against Hot/Warm context already loaded
|
|
12
|
+
* - Never shadows secrets, credentials, or deprecated entries
|
|
13
|
+
*/
|
|
14
|
+
'use strict';
|
|
15
|
+
|
|
16
|
+
const fs = require('fs');
|
|
17
|
+
const path = require('path');
|
|
18
|
+
const Store = require('./knowledge-store');
|
|
19
|
+
const Graph = require('./knowledge-graph');
|
|
20
|
+
const Embedder = require('./embedding-engine');
|
|
21
|
+
|
|
22
|
+
// ── Configuration ─────────────────────────────────────────────────────────────
const MAX_SHADOW_CHARS = 8000; // Hard cap on the formatted shadow section, in characters (≈2K tokens)
const MAX_SHADOW_ITEMS = 5; // Max items in shadow section
const MIN_SHADOW_SCORE = 0.35; // Minimum combined score to include
// Keyword denylist: entries whose text mentions any of these are withheld from
// shadowing (enforced by containsSecrets below; matching is case-insensitive).
const SECURITY_KEYWORDS = new Set([
  'password', 'secret', 'token', 'api_key', 'apikey', 'private_key',
  'credential', 'auth_token', 'bearer', 'encryption_key', 'ssh',
]);
|
|
30
|
+
|
|
31
|
+
// ── Core Shadow Logic ─────────────────────────────────────────────────────────
|
|
32
|
+
|
|
33
|
+
/**
|
|
34
|
+
* Generate auto-shadow context for a given task description.
|
|
35
|
+
* This is the primary entry point, called by the context-injector.
|
|
36
|
+
*
|
|
37
|
+
* @param {object} opts
|
|
38
|
+
* @param {string} opts.taskDescription - Current task/plan description
|
|
39
|
+
* @param {string[]} [opts.excludeIds] - Entry IDs already in hot/warm context
|
|
40
|
+
* @param {string[]} [opts.techStack] - Tech stack for relevance boosting
|
|
41
|
+
* @param {number} [opts.maxItems] - Override max items (default: 5)
|
|
42
|
+
* @returns {{ formatted: string, items: object[], count: number, budgetUsed: number }}
|
|
43
|
+
*/
|
|
44
|
+
/**
 * Generate the auto-shadow context for a task description.
 * Primary entry point, called by the context-injector before subagent spawn.
 *
 * Pipeline: load active knowledge entries → embed corpus → hybrid
 * graph/similarity lookup → filter (dedupe against loaded context, score
 * floor, secrets guard) → sort best-first, cap, and format.
 *
 * @param {object} opts
 * @param {string} opts.taskDescription - Current task/plan description
 * @param {string[]} [opts.excludeIds] - Entry IDs already in hot/warm context
 * @param {string[]} [opts.techStack] - Tech stack terms appended to the query
 * @param {number} [opts.maxItems] - Override max items (default: MAX_SHADOW_ITEMS)
 * @returns {{ formatted: string, items: object[], count: number, budgetUsed: number }}
 */
function generateShadowContext(opts = {}) {
  const taskDescription = opts.taskDescription || '';
  const excludeIds = opts.excludeIds || [];
  const techStack = opts.techStack || [];
  const maxItems = opts.maxItems === undefined ? MAX_SHADOW_ITEMS : opts.maxItems;

  const emptyResult = () => ({ formatted: '', items: [], count: 0, budgetUsed: 0 });

  // Too little signal to build a useful query.
  if (!taskDescription || taskDescription.length < 10) return emptyResult();

  // 1. Load the knowledge base (including global) and keep live, trusted entries.
  const activeEntries = Store.readAll(true)
    .filter((entry) => !entry.deprecated && entry.confidence >= 0.3);
  if (activeEntries.length === 0) return emptyResult();

  // 2. Embed the corpus, then run the hybrid query (similarity + graph hops).
  const { vectors, df, N } = Embedder.buildEmbeddings(activeEntries);
  const query = `${taskDescription} ${techStack.join(' ')}`;
  const related = Graph.findRelated(query, vectors, df, N, {
    maxHops: 2,
    topK: maxItems * 2, // Over-fetch; the filters below thin the list
  });

  // 3. Filter and enrich: drop already-loaded entries, weak matches,
  //    and anything that looks like secret material.
  const excluded = new Set(excludeIds);
  const byId = new Map(activeEntries.map((entry) => [entry.id, entry]));
  const enriched = [];

  for (const hit of related) {
    if (excluded.has(hit.id)) continue;
    if (hit.score < MIN_SHADOW_SCORE) continue;

    const entry = byId.get(hit.id);
    if (!entry) continue;

    // Security guard: never shadow secrets.
    if (containsSecrets(entry)) continue;

    enriched.push({
      id: entry.id,
      type: entry.type,
      topic: entry.topic,
      content: entry.content,
      confidence: entry.confidence,
      score: hit.score,
      source: hit.source,
      tags: entry.tags || [],
      edges: getEdgeSummary(entry.id),
    });
  }

  // 4. Best matches first, capped at maxItems.
  enriched.sort((a, b) => b.score - a.score);
  const capped = enriched.slice(0, maxItems);

  // 5. Budget-capped formatting for context injection.
  const formatted = formatShadowSection(capped);

  return {
    formatted,
    items: capped,
    count: capped.length,
    budgetUsed: formatted.length,
  };
}
|
|
114
|
+
|
|
115
|
+
// ── Formatting ────────────────────────────────────────────────────────────────
|
|
116
|
+
|
|
117
|
+
/**
|
|
118
|
+
* Format shadow items into a structured context section.
|
|
119
|
+
* Budget-capped at MAX_SHADOW_CHARS.
|
|
120
|
+
* @param {object[]} items
|
|
121
|
+
* @returns {string}
|
|
122
|
+
*/
|
|
123
|
+
/**
 * Format shadow items into a structured markdown context section.
 * Budget-capped at MAX_SHADOW_CHARS; items that would overflow the budget
 * are dropped (items are assumed pre-sorted by descending score).
 * @param {object[]} items - Enriched shadow items (see generateShadowContext)
 * @returns {string} Markdown section, or '' when nothing fits the budget
 */
function formatShadowSection(items) {
  if (items.length === 0) return '';

  const lines = [
    '## Auto-Shadow Context (RAG 2.0)',
    '',
    '> These are automatically surfaced "ghost patterns" from past sessions.',
    '> Use them as background context — do not explicitly reference them unless relevant.',
    '',
  ];

  // Track the budget including the '\n' separators added by the final join()
  // (the previous accounting ignored them and could overshoot the cap).
  let totalChars = lines.join('\n').length;
  let included = 0;

  for (const item of items) {
    const icon = getTypeIcon(item.type);
    const edgeNote = item.edges ? ` [${item.edges}]` : '';
    const confidenceBar = `${(item.confidence * 100).toFixed(0)}%`;

    const header = `### ${icon} ${item.topic} (${confidenceBar} confidence)${edgeNote}`;
    const content = truncateContent(item.content, 300);
    const tags = item.tags.length > 0 ? `Tags: ${item.tags.join(', ')}` : '';
    const sourceLabel = `Source: ${item.source} | Score: ${item.score.toFixed(2)}`;

    const block = [header, content, tags, sourceLabel, ''].join('\n');

    // +1 accounts for the join separator preceding this block.
    if (totalChars + 1 + block.length > MAX_SHADOW_CHARS) break;
    lines.push(block);
    totalChars += block.length + 1;
    included += 1;
  }

  // Don't emit header boilerplate when every item was dropped by the cap.
  if (included === 0) return '';

  return lines.join('\n');
}
|
|
155
|
+
|
|
156
|
+
/**
|
|
157
|
+
* Get icon for entry type.
|
|
158
|
+
* @param {string} type
|
|
159
|
+
* @returns {string}
|
|
160
|
+
*/
|
|
161
|
+
/**
 * Map a knowledge-entry type to its display icon.
 * Unknown types fall back to a generic lightbulb.
 * @param {string} type - Entry type identifier
 * @returns {string} Emoji icon
 */
function getTypeIcon(type) {
  switch (type) {
    case 'architectural_decision':
      return '🏛️';
    case 'code_pattern':
      return '🔧';
    case 'bug_pattern':
      return '🐛';
    case 'team_preference':
      return '👥';
    case 'domain_knowledge':
      return '📚';
    default:
      return '💡';
  }
}
|
|
171
|
+
|
|
172
|
+
/**
|
|
173
|
+
* Get a brief edge summary for a node.
|
|
174
|
+
* @param {string} nodeId
|
|
175
|
+
* @returns {string}
|
|
176
|
+
*/
|
|
177
|
+
/**
 * Summarize a node's edges as "<count> <edge-type>" fragments,
 * e.g. "2 relates-to, 1 contradicts". Returns '' on any failure
 * (including an unavailable graph) — the summary is cosmetic only.
 * @param {string} nodeId - Knowledge-graph node ID
 * @returns {string} Comma-separated edge summary, or ''
 */
function getEdgeSummary(nodeId) {
  try {
    const edges = Graph.getNodeEdges(nodeId);
    if (edges.length === 0) return '';

    // Count edges per type, preserving first-seen order.
    const counts = new Map();
    for (const edge of edges) {
      counts.set(edge.type, (counts.get(edge.type) || 0) + 1);
    }

    const parts = [];
    for (const [type, count] of counts) {
      parts.push(`${count} ${type.toLowerCase().replace(/_/g, '-')}`);
    }
    return parts.join(', ');
  } catch {
    return '';
  }
}
|
|
194
|
+
|
|
195
|
+
/**
|
|
196
|
+
* Truncate content to maxLen characters, adding ellipsis.
|
|
197
|
+
* @param {string} content
|
|
198
|
+
* @param {number} maxLen
|
|
199
|
+
* @returns {string}
|
|
200
|
+
*/
|
|
201
|
+
/**
 * Truncate content to at most maxLen characters, appending an ellipsis
 * when text was cut. Guarantees the result never exceeds maxLen (the
 * previous version returned over-long strings for maxLen <= 3, because
 * slice(0, maxLen - 3) went negative before '...' was appended).
 * @param {string} content - Text to truncate (falsy → '')
 * @param {number} maxLen - Maximum result length
 * @returns {string}
 */
function truncateContent(content, maxLen) {
  if (!content) return '';
  if (content.length <= maxLen) return content;
  // No room for an ellipsis: hard-cut instead.
  if (maxLen <= 3) return content.slice(0, Math.max(0, maxLen));
  return content.slice(0, maxLen - 3) + '...';
}
|
|
206
|
+
|
|
207
|
+
// ── Security ──────────────────────────────────────────────────────────────────
|
|
208
|
+
|
|
209
|
+
/**
|
|
210
|
+
* Check if an entry might contain secrets/credentials.
|
|
211
|
+
* @param {object} entry
|
|
212
|
+
* @returns {boolean}
|
|
213
|
+
*/
|
|
214
|
+
/**
 * Heuristically check whether an entry might contain secrets/credentials.
 * Used as a hard filter: flagged entries are never auto-shadowed.
 *
 * Keywords are matched as whole tokens — the previous substring check
 * flagged innocent words such as "tokenize"/"tokens" via 'token' — and the
 * raw content is then scanned for common secret-material patterns.
 * @param {object} entry - Knowledge entry with { topic, content, tags }
 * @returns {boolean} true when the entry should be withheld
 */
function containsSecrets(entry) {
  const text = `${entry.topic} ${entry.content} ${(entry.tags || []).join(' ')}`.toLowerCase();

  // Whole-token keyword match; '_' is kept as a word character so compound
  // keywords like 'api_key' survive the split.
  const tokens = new Set(text.split(/[^a-z0-9_]+/));
  for (const keyword of SECURITY_KEYWORDS) {
    if (tokens.has(keyword)) return true;
  }

  // Patterns for secret material itself (checked against raw content).
  const secretPatterns = [
    /[a-z0-9]{32,}/, // Long lowercase alphanumeric runs (hashes, API keys)
    /-----BEGIN/, // PEM-encoded keys
    /sk_[a-z]+_[a-z0-9]/i, // Stripe-style keys
  ];

  for (const pattern of secretPatterns) {
    if (pattern.test(entry.content || '')) return true;
  }

  return false;
}
|
|
234
|
+
|
|
235
|
+
// ── Contradiction Detection ───────────────────────────────────────────────────
|
|
236
|
+
|
|
237
|
+
/**
|
|
238
|
+
* Check shadow items for contradictions and flag them.
|
|
239
|
+
* Looks for CONTRADICTS edges between shadow items.
|
|
240
|
+
* @param {object[]} items - Shadow items
|
|
241
|
+
* @returns {object[]} Items with contradiction flags
|
|
242
|
+
*/
|
|
243
|
+
/**
 * Flag shadow items that are contradicted by another item in the same batch.
 * Looks for CONTRADICTS edges whose other endpoint is also being shadowed,
 * mutating flagged items in place (boolean flag + topic marker).
 * @param {object[]} items - Shadow items (mutated in place)
 * @returns {object[]} The same array, with contradiction flags applied
 */
function flagContradictions(items) {
  const batchIds = new Set(items.map((item) => item.id));

  for (const item of items) {
    const edges = Graph.getNodeEdges(item.id, {
      edgeTypes: [Graph.EDGE_TYPES.CONTRADICTS],
    });

    for (const edge of edges) {
      // The edge may point either way; resolve the far endpoint.
      const otherId = edge.sourceId === item.id ? edge.targetId : edge.sourceId;
      if (batchIds.has(otherId)) {
        item.contradiction = true;
        item.topic = `⚠️ ${item.topic} [CONTRADICTED]`;
        break;
      }
    }
  }

  return items;
}
|
|
263
|
+
|
|
264
|
+
// ── Exports ───────────────────────────────────────────────────────────────────
|
|
265
|
+
// ── Exports ───────────────────────────────────────────────────────────────────
// Public surface: generateShadowContext is the main entry point (used by the
// context-injector); helpers and budget constants are exported for reuse and
// testing.
module.exports = {
  generateShadowContext,
  formatShadowSection,
  containsSecrets,
  flagContradictions,
  getEdgeSummary,
  MAX_SHADOW_CHARS,
  MAX_SHADOW_ITEMS,
  MIN_SHADOW_SCORE,
};
|
|
@@ -0,0 +1,326 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MindForge v2.4.0 — Embedding Engine (RAG 2.0)
|
|
3
|
+
* Local-first TF-IDF vector space for semantic similarity.
|
|
4
|
+
*
|
|
5
|
+
* No external API dependencies — runs entirely on local compute.
|
|
6
|
+
* Provides vectorization and cosine similarity for the Knowledge Graph.
|
|
7
|
+
*
|
|
8
|
+
* Design:
|
|
9
|
+
* - Sparse TF-IDF vectors stored as { token → weight } objects
|
|
10
|
+
* - Cosine similarity for semantic matching between entries
|
|
11
|
+
* - Embedding cache persisted to disk for fast session restarts
|
|
12
|
+
* - Auto-edge inference when similarity exceeds threshold
|
|
13
|
+
*/
|
|
14
|
+
'use strict';
|
|
15
|
+
|
|
16
|
+
const fs = require('fs');
|
|
17
|
+
const path = require('path');
|
|
18
|
+
const crypto = require('crypto');
|
|
19
|
+
|
|
20
|
+
// ── Configuration ─────────────────────────────────────────────────────────────
const SIMILARITY_THRESHOLD = 0.65; // Auto-edge creation threshold (see inferEdges)
const SHADOW_THRESHOLD = 0.50; // Minimum similarity for auto-shadow retrieval (findSimilar default)
const MAX_VECTOR_TERMS = 200; // Cap sparse vector dimensionality (top terms by weight)
const CACHE_SCHEMA_VERSION = '1.0.0'; // Bump to invalidate on-disk embedding caches

// ── Stopwords (expanded for technical content) ────────────────────────────────
// Dropped during tokenization; the duplicate 'will' below is harmless in a Set.
const STOPWORDS = new Set([
  'the', 'a', 'an', 'is', 'it', 'in', 'on', 'at', 'to', 'for', 'of', 'and',
  'or', 'but', 'not', 'this', 'that', 'with', 'from', 'by', 'be', 'are',
  'was', 'were', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
  'could', 'should', 'may', 'might', 'can', 'use', 'using', 'used', 'when',
  'where', 'which', 'what', 'how', 'why', 'who', 'all', 'any', 'some', 'we',
  'our', 'they', 'their', 'you', 'your', 'my', 'its', 'also', 'just', 'more',
  'very', 'been', 'being', 'each', 'then', 'than', 'into', 'only', 'other',
  'such', 'like', 'over', 'after', 'before', 'between', 'through', 'about',
  'will', 'shall', 'must', 'need', 'make', 'made', 'get', 'got', 'set',
  'new', 'old', 'see', 'way', 'well', 'back', 'even', 'give', 'most',
]);
|
|
39
|
+
|
|
40
|
+
// ── Tokenizer ─────────────────────────────────────────────────────────────────
|
|
41
|
+
|
|
42
|
+
/**
|
|
43
|
+
* Tokenize text into normalized, filtered terms.
|
|
44
|
+
* Handles camelCase, snake_case, and kebab-case splitting.
|
|
45
|
+
* @param {string} text - Raw text to tokenize
|
|
46
|
+
* @returns {string[]} Filtered tokens
|
|
47
|
+
*/
|
|
48
|
+
/**
 * Tokenize raw text into normalized terms for TF-IDF.
 * Splits camelCase / snake_case / kebab-case, strips punctuation (dots are
 * kept so version strings survive), lowercases, and drops short tokens and
 * stopwords.
 * @param {string} text - Raw text to tokenize
 * @returns {string[]} Filtered tokens
 */
function tokenize(text) {
  if (!text || typeof text !== 'string') return [];

  const normalized = text
    .replace(/([a-z])([A-Z])/g, '$1 $2') // camelCase → "camel Case"
    .replace(/[_-]/g, ' ') // snake_case / kebab-case → spaces
    .replace(/[^a-zA-Z0-9.\s]/g, ' ') // strip punctuation, keep dots
    .toLowerCase();

  const tokens = [];
  for (const word of normalized.split(/\s+/)) {
    if (word.length > 2 && !STOPWORDS.has(word)) tokens.push(word);
  }
  return tokens;
}
|
|
62
|
+
|
|
63
|
+
/**
|
|
64
|
+
* Extract n-grams (bigrams) from tokens for compound term matching.
|
|
65
|
+
* "react memo" stays as a single feature "react_memo".
|
|
66
|
+
* @param {string[]} tokens - Unigram tokens
|
|
67
|
+
* @returns {string[]} Bigram tokens
|
|
68
|
+
*/
|
|
69
|
+
/**
 * Build adjacent-pair bigram features from unigram tokens, so compound
 * terms match as a single feature ("react memo" → "react_memo").
 * @param {string[]} tokens - Unigram tokens
 * @returns {string[]} Bigram tokens (empty for fewer than two inputs)
 */
function bigrams(tokens) {
  if (tokens.length < 2) return [];
  return tokens.slice(0, -1).map((token, i) => `${token}_${tokens[i + 1]}`);
}
|
|
76
|
+
|
|
77
|
+
// ── TF-IDF Vectorization ──────────────────────────────────────────────────────
|
|
78
|
+
|
|
79
|
+
/**
|
|
80
|
+
* Build a document-frequency map from a corpus of documents.
|
|
81
|
+
* @param {Array<{id: string, tokens: string[]}>} corpus
|
|
82
|
+
* @returns {Map<string, number>} token → document frequency
|
|
83
|
+
*/
|
|
84
|
+
/**
 * Count, for every token, how many documents in the corpus contain it
 * (each document contributes at most once per token).
 * @param {Array<{id: string, tokens: string[]}>} corpus
 * @returns {Map<string, number>} token → document frequency
 */
function buildDocumentFrequency(corpus) {
  const df = new Map();
  corpus.forEach((doc) => {
    new Set(doc.tokens).forEach((token) => {
      df.set(token, (df.get(token) || 0) + 1);
    });
  });
  return df;
}
|
|
94
|
+
|
|
95
|
+
/**
|
|
96
|
+
* Compute TF-IDF vector for a single document.
|
|
97
|
+
* @param {string[]} tokens - Document tokens
|
|
98
|
+
* @param {Map<string, number>} df - Document frequency map
|
|
99
|
+
* @param {number} N - Total document count
|
|
100
|
+
* @returns {Object<string, number>} Sparse vector { token → weight }
|
|
101
|
+
*/
|
|
102
|
+
/**
 * Compute a sparse TF-IDF vector for one document.
 * Uses length-normalized TF and smoothed IDF (log((N+1)/(df+1)) + 1),
 * then keeps only the MAX_VECTOR_TERMS highest-weight terms.
 * @param {string[]} tokens - Document tokens
 * @param {Map<string, number>} df - Document frequency map
 * @param {number} N - Total document count
 * @returns {Object<string, number>} Sparse vector { token → weight }
 */
function computeTfIdfVector(tokens, df, N) {
  if (tokens.length === 0) return {};

  // Raw term counts.
  const counts = new Map();
  for (const token of tokens) {
    counts.set(token, (counts.get(token) || 0) + 1);
  }

  // Weight each term, then cap dimensionality at the heaviest terms.
  const weighted = [...counts.entries()].map(([term, count]) => {
    const tf = count / tokens.length; // Normalized TF
    const idf = Math.log((N + 1) / ((df.get(term) || 1) + 1)) + 1; // Smoothed IDF
    return [term, tf * idf];
  });
  weighted.sort((a, b) => b[1] - a[1]);

  return Object.fromEntries(weighted.slice(0, MAX_VECTOR_TERMS));
}
|
|
132
|
+
|
|
133
|
+
// ── Similarity ────────────────────────────────────────────────────────────────
|
|
134
|
+
|
|
135
|
+
/**
|
|
136
|
+
* Compute cosine similarity between two sparse vectors.
|
|
137
|
+
* @param {Object<string, number>} vecA - Sparse vector A
|
|
138
|
+
* @param {Object<string, number>} vecB - Sparse vector B
|
|
139
|
+
* @returns {number} Cosine similarity [0, 1]
|
|
140
|
+
*/
|
|
141
|
+
/**
 * Cosine similarity between two sparse { token → weight } vectors.
 * @param {Object<string, number>} vecA - Sparse vector A
 * @param {Object<string, number>} vecB - Sparse vector B
 * @returns {number} Similarity in [0, 1]; 0 for empty or degenerate input
 */
function cosineSimilarity(vecA, vecB) {
  if (!vecA || !vecB) return 0;

  const entriesA = Object.entries(vecA);
  const valuesB = Object.values(vecB);
  if (entriesA.length === 0 || valuesB.length === 0) return 0;

  // Dot product over shared terms only (sparse representation).
  let dot = 0;
  for (const [term, weight] of entriesA) {
    const other = vecB[term];
    if (other) dot += weight * other;
  }
  if (dot === 0) return 0;

  // Euclidean norms.
  const normOf = (values) => Math.sqrt(values.reduce((sum, v) => sum + v * v, 0));
  const magA = normOf(entriesA.map(([, weight]) => weight));
  const magB = normOf(valuesB);
  if (magA === 0 || magB === 0) return 0;

  return dot / (magA * magB);
}
|
|
171
|
+
|
|
172
|
+
// ── Corpus Manager ────────────────────────────────────────────────────────────
|
|
173
|
+
|
|
174
|
+
/**
|
|
175
|
+
* Build embeddings for all knowledge entries.
|
|
176
|
+
* @param {object[]} entries - Knowledge entries with { id, topic, content, tags }
|
|
177
|
+
* @returns {{ vectors: Map<string, object>, df: Map<string, number> }}
|
|
178
|
+
*/
|
|
179
|
+
/**
 * Build TF-IDF embeddings for all non-deprecated knowledge entries.
 * Each entry's topic, content, and tags are tokenized into unigrams plus
 * bigrams before vectorization.
 * @param {object[]} entries - Knowledge entries with { id, topic, content, tags }
 * @returns {{ vectors: Map<string, object>, df: Map<string, number>, N: number }}
 */
function buildEmbeddings(entries) {
  const corpus = [];
  for (const entry of entries) {
    if (entry.deprecated) continue;
    const text = `${entry.topic || ''} ${entry.content || ''} ${(entry.tags || []).join(' ')}`;
    const unigrams = tokenize(text);
    corpus.push({ id: entry.id, tokens: unigrams.concat(bigrams(unigrams)) });
  }

  const df = buildDocumentFrequency(corpus);
  const N = corpus.length;

  const vectors = new Map(
    corpus.map((doc) => [doc.id, computeTfIdfVector(doc.tokens, df, N)])
  );

  return { vectors, df, N };
}
|
|
200
|
+
|
|
201
|
+
/**
|
|
202
|
+
* Compute embedding for a single query string against an existing corpus.
|
|
203
|
+
* @param {string} queryText - Natural language query
|
|
204
|
+
* @param {Map<string, number>} df - Document frequency from corpus
|
|
205
|
+
* @param {number} N - Total document count
|
|
206
|
+
* @returns {Object<string, number>} Query vector
|
|
207
|
+
*/
|
|
208
|
+
/**
 * Embed a free-text query in the same TF-IDF space as the corpus
 * (unigrams + bigrams, weighted against the corpus document frequencies).
 * @param {string} queryText - Natural language query
 * @param {Map<string, number>} df - Document frequency from the corpus
 * @param {number} N - Total document count
 * @returns {Object<string, number>} Sparse query vector
 */
function embedQuery(queryText, df, N) {
  const unigrams = tokenize(queryText);
  const features = unigrams.concat(bigrams(unigrams));
  return computeTfIdfVector(features, df, N);
}
|
|
213
|
+
|
|
214
|
+
/**
|
|
215
|
+
* Find the top-K most similar entries to a query.
|
|
216
|
+
* @param {string} queryText - Natural language query
|
|
217
|
+
* @param {Map<string, object>} vectors - Precomputed entry vectors
|
|
218
|
+
* @param {Map<string, number>} df - Document frequency
|
|
219
|
+
* @param {number} N - Corpus size
|
|
220
|
+
* @param {number} topK - Max results
|
|
221
|
+
* @param {number} minSimilarity - Minimum cosine similarity (default: SHADOW_THRESHOLD)
|
|
222
|
+
* @returns {Array<{id: string, similarity: number}>}
|
|
223
|
+
*/
|
|
224
|
+
/**
 * Rank corpus entries by cosine similarity to a free-text query.
 * @param {string} queryText - Natural language query
 * @param {Map<string, object>} vectors - Precomputed entry vectors
 * @param {Map<string, number>} df - Document frequency
 * @param {number} N - Corpus size
 * @param {number} [topK] - Max results (default 10)
 * @param {number} [minSimilarity] - Similarity floor (default SHADOW_THRESHOLD)
 * @returns {Array<{id: string, similarity: number}>} Best-first matches
 */
function findSimilar(queryText, vectors, df, N, topK = 10, minSimilarity = SHADOW_THRESHOLD) {
  const queryVec = embedQuery(queryText, df, N);

  const matches = [];
  vectors.forEach((vec, id) => {
    const similarity = cosineSimilarity(queryVec, vec);
    if (similarity >= minSimilarity) matches.push({ id, similarity });
  });

  matches.sort((a, b) => b.similarity - a.similarity);
  return matches.slice(0, topK);
}
|
|
239
|
+
|
|
240
|
+
/**
|
|
241
|
+
* Find entries that should be auto-linked (similarity > SIMILARITY_THRESHOLD).
|
|
242
|
+
* @param {string} entryId - The new entry's ID
|
|
243
|
+
* @param {Map<string, object>} vectors - All entry vectors
|
|
244
|
+
* @returns {Array<{targetId: string, similarity: number}>} Candidate edges
|
|
245
|
+
*/
|
|
246
|
+
/**
 * Propose auto-link edges for an entry: every other entry whose cosine
 * similarity meets SIMILARITY_THRESHOLD becomes a candidate, best-first.
 * @param {string} entryId - The new entry's ID
 * @param {Map<string, object>} vectors - All entry vectors
 * @returns {Array<{targetId: string, similarity: number}>} Candidate edges
 */
function inferEdges(entryId, vectors) {
  const sourceVec = vectors.get(entryId);
  if (!sourceVec) return [];

  const candidates = [];
  for (const [otherId, otherVec] of vectors) {
    if (otherId === entryId) continue; // Never self-link
    const similarity = cosineSimilarity(sourceVec, otherVec);
    if (similarity >= SIMILARITY_THRESHOLD) {
      candidates.push({ targetId: otherId, similarity });
    }
  }

  candidates.sort((a, b) => b.similarity - a.similarity);
  return candidates;
}
|
|
261
|
+
|
|
262
|
+
// ── Embedding Cache ───────────────────────────────────────────────────────────
|
|
263
|
+
|
|
264
|
+
/**
|
|
265
|
+
* Save embedding cache to disk.
|
|
266
|
+
* @param {string} cachePath - Absolute path to cache file
|
|
267
|
+
* @param {Map<string, object>} vectors - Entry vectors
|
|
268
|
+
* @param {Map<string, number>} df - Document frequency
|
|
269
|
+
* @param {number} N - Corpus size
|
|
270
|
+
*/
|
|
271
|
+
/**
 * Persist the embedding cache to disk as pretty-printed JSON.
 * The checksum is a SHA-256 over the serialized payload, computed while the
 * `checksum` field is still empty. NOTE(review): loadCache does not verify
 * this checksum — it is informational only until a verifier is added.
 * @param {string} cachePath - Absolute path to cache file
 * @param {Map<string, object>} vectors - Entry vectors
 * @param {Map<string, number>} df - Document frequency
 * @param {number} N - Corpus size
 */
function saveCache(cachePath, vectors, df, N) {
  const data = {
    schema_version: CACHE_SCHEMA_VERSION,
    updated_at: new Date().toISOString(),
    corpus_size: N,
    df: Object.fromEntries(df),
    vectors: Object.fromEntries(vectors),
    checksum: '',
  };

  // Hash the payload with an empty checksum field, then fill it in.
  data.checksum = crypto
    .createHash('sha256')
    .update(JSON.stringify(data))
    .digest('hex');

  const dir = path.dirname(cachePath);
  if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
  fs.writeFileSync(cachePath, JSON.stringify(data, null, 2));
}
|
|
288
|
+
|
|
289
|
+
/**
|
|
290
|
+
* Load embedding cache from disk.
|
|
291
|
+
* @param {string} cachePath - Absolute path to cache file
|
|
292
|
+
* @returns {{ vectors: Map<string, object>, df: Map<string, number>, N: number } | null}
|
|
293
|
+
*/
|
|
294
|
+
/**
 * Load the embedding cache from disk.
 * Returns null when the file is missing, unparsable, or from a different
 * schema version — callers then rebuild the embeddings from scratch.
 * @param {string} cachePath - Absolute path to cache file
 * @returns {{ vectors: Map<string, object>, df: Map<string, number>, N: number } | null}
 */
function loadCache(cachePath) {
  if (!fs.existsSync(cachePath)) return null;

  let raw;
  try {
    raw = JSON.parse(fs.readFileSync(cachePath, 'utf8'));
  } catch {
    return null; // Corrupt cache — caller rebuilds
  }

  if (!raw || raw.schema_version !== CACHE_SCHEMA_VERSION) return null;

  return {
    vectors: new Map(Object.entries(raw.vectors || {})),
    df: new Map(Object.entries(raw.df || {})),
    N: raw.corpus_size || 0,
  };
}
|
|
310
|
+
|
|
311
|
+
// ── Exports ───────────────────────────────────────────────────────────────────
|
|
312
|
+
// ── Exports ───────────────────────────────────────────────────────────────────
// Public surface: buildEmbeddings / embedQuery / findSimilar / inferEdges are
// the primary entry points; tokenizer and cache helpers plus thresholds are
// exported for reuse and testing.
module.exports = {
  tokenize,
  bigrams,
  buildDocumentFrequency,
  computeTfIdfVector,
  cosineSimilarity,
  buildEmbeddings,
  embedQuery,
  findSimilar,
  inferEdges,
  saveCache,
  loadCache,
  SIMILARITY_THRESHOLD,
  SHADOW_THRESHOLD,
};
|