mindforge-cc 11.0.0 → 11.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent/hooks/mindforge-statusline.js +2 -2
- package/.mindforge/config.json +13 -4
- package/CHANGELOG.md +101 -0
- package/MINDFORGE.md +3 -3
- package/RELEASENOTES.md +1 -1
- package/bin/autonomous/audit-writer.js +108 -86
- package/bin/autonomous/auto-runner.js +304 -19
- package/bin/autonomous/dependency-dag.js +59 -0
- package/bin/autonomous/wave-executor.js +20 -1
- package/bin/council-cli.js +161 -0
- package/bin/dashboard/approval-handler.js +3 -1
- package/bin/dashboard/server.js +1 -1
- package/bin/dashboard/sse-bridge.js +9 -12
- package/bin/engine/council-runtime.js +124 -0
- package/bin/engine/otel-exporter.js +123 -0
- package/bin/engine/remediation-engine.js +1 -1
- package/bin/engine/self-corrective-synthesizer.js +1 -1
- package/bin/engine/temporal-cli.js +4 -2
- package/bin/engine/verification-runner.js +131 -0
- package/bin/engine/verify-cli.js +34 -0
- package/bin/eval/eval-harness.js +82 -0
- package/bin/eval/golden-set-retrieval.json +46 -0
- package/bin/governance/audit-hash.js +12 -0
- package/bin/governance/audit-verifier.js +60 -0
- package/bin/governance/quantum-crypto.js +63 -9
- package/bin/governance/ztai-manager.js +30 -2
- package/bin/hindsight-injector.js +5 -6
- package/bin/hooks/instinct-capture-hook.js +186 -0
- package/bin/memory/auto-shadow.js +32 -3
- package/bin/memory/identity-synthesizer.js +2 -2
- package/bin/memory/knowledge-store.js +30 -6
- package/bin/memory/retrieval-fusion.js +58 -0
- package/bin/memory/semantic-hub.js +2 -2
- package/bin/memory/vector-hub.js +111 -6
- package/bin/mindforge-cli.js +4 -5
- package/bin/models/anthropic-provider.js +13 -4
- package/bin/models/cost-tracker.js +3 -1
- package/bin/models/difficulty-scorer.js +54 -0
- package/bin/models/gemini-provider.js +6 -2
- package/bin/models/model-router.js +31 -18
- package/bin/models/openai-provider.js +6 -3
- package/bin/models/pricing-registry.js +128 -0
- package/bin/review/ads-engine.js +1 -1
- package/bin/security/trust-boundaries.js +102 -0
- package/bin/security/trust-gate-hook.js +39 -0
- package/bin/skill-registry.js +3 -2
- package/bin/skills-builder/marketplace-cli.js +5 -3
- package/bin/skills-builder/skill-registrar.js +4 -6
- package/bin/sre/sentinel.js +7 -5
- package/bin/utils/append-queue.js +55 -0
- package/bin/utils/file-io.js +27 -37
- package/bin/utils/version-check.js +59 -0
- package/bin/verify-audit.js +12 -0
- package/bin/wizard/theme.js +1 -2
- package/package.json +1 -1
- package/bin/dashboard/team-tracker.js +0 -0
package/bin/dashboard/server.js
CHANGED
|
@@ -161,7 +161,7 @@ app.use((req, res, next) => {
|
|
|
161
161
|
res.setHeader('X-Content-Type-Options', 'nosniff');
|
|
162
162
|
res.setHeader('X-Frame-Options', 'DENY');
|
|
163
163
|
res.setHeader('Cache-Control', 'no-store'); // Never cache dashboard responses
|
|
164
|
-
res.setHeader('Content-Security-Policy',
|
|
164
|
+
res.setHeader('Content-Security-Policy', 'default-src \'self\'; script-src \'self\'; style-src \'self\' \'unsafe-inline\'; connect-src \'self\'');
|
|
165
165
|
res.setHeader('X-XSS-Protection', '1; mode=block');
|
|
166
166
|
res.setHeader('Referrer-Policy', 'strict-origin-when-cross-origin');
|
|
167
167
|
next();
|
|
@@ -21,7 +21,6 @@ const APPROVAL_DIR = path.join(process.cwd(), '.planning', 'approvals');
|
|
|
21
21
|
const clients = new Set(); // Connected SSE response objects
|
|
22
22
|
|
|
23
23
|
let _lastAuditSize = 0;
|
|
24
|
-
let _auditInode = 0; // Track file inode for rotation detection
|
|
25
24
|
let _lastAutoState = '';
|
|
26
25
|
let _lastApprovals = '';
|
|
27
26
|
|
|
@@ -78,14 +77,16 @@ function pollAuditLog() {
|
|
|
78
77
|
try {
|
|
79
78
|
const stat = fs.statSync(AUDIT_PATH);
|
|
80
79
|
const newSize = stat.size;
|
|
81
|
-
const newIno = stat.ino;
|
|
82
80
|
|
|
83
|
-
//
|
|
84
|
-
|
|
85
|
-
|
|
81
|
+
// Truncation / recreation recovery (NOT rotation — audit rotation was retired in
|
|
82
|
+
// UC-04b because it broke the hash chain; AUDIT.jsonl grows unbounded). If the
|
|
83
|
+
// file ever SHRINKS (manually truncated, .planning wiped, or replaced), reset the
|
|
84
|
+
// read offset to 0 so the live tail keeps working instead of stalling forever on
|
|
85
|
+
// the `newSize <= _lastAuditSize` early-return below or reading at a stale offset.
|
|
86
|
+
if (newSize < _lastAuditSize) {
|
|
87
|
+
process.stderr.write(`[sse-bridge] AUDIT.jsonl shrank (size: ${_lastAuditSize} -> ${newSize}) — re-tailing from start\n`);
|
|
86
88
|
_lastAuditSize = 0;
|
|
87
89
|
}
|
|
88
|
-
_auditInode = newIno;
|
|
89
90
|
|
|
90
91
|
if (newSize <= _lastAuditSize) return;
|
|
91
92
|
|
|
@@ -169,9 +170,7 @@ function startPolling() {
|
|
|
169
170
|
|
|
170
171
|
// Initialize AUDIT position on first start
|
|
171
172
|
if (!_initialized && fs.existsSync(AUDIT_PATH)) {
|
|
172
|
-
|
|
173
|
-
_lastAuditSize = stat.size;
|
|
174
|
-
_auditInode = stat.ino;
|
|
173
|
+
_lastAuditSize = fs.statSync(AUDIT_PATH).size;
|
|
175
174
|
_initialized = true;
|
|
176
175
|
}
|
|
177
176
|
|
|
@@ -207,9 +206,7 @@ function stopPolling() {
|
|
|
207
206
|
function start() {
|
|
208
207
|
// Pre-initialize AUDIT position so first client gets instant data
|
|
209
208
|
if (!_initialized && fs.existsSync(AUDIT_PATH)) {
|
|
210
|
-
|
|
211
|
-
_lastAuditSize = stat.size;
|
|
212
|
-
_auditInode = stat.ino;
|
|
209
|
+
_lastAuditSize = fs.statSync(AUDIT_PATH).size;
|
|
213
210
|
_initialized = true;
|
|
214
211
|
}
|
|
215
212
|
// Polling starts lazily when addClient() is called
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
/**
|
|
3
|
+
* MindForge — Council Runtime (UC-10). Thin multi-voice decision harness (ADS).
|
|
4
|
+
*
|
|
5
|
+
* Activates the Adversarial Decision Loop (ADS) mandated by CLAUDE.md: instead of
|
|
6
|
+
* a multi-round debate simulator, this is a THIN runtime — parallel position
|
|
7
|
+
* collection (one per voice) + consensus scoring + dissent capture.
|
|
8
|
+
*
|
|
9
|
+
* The model is INJECTABLE (no hard LLM dependency) so callers/tests can supply a
|
|
10
|
+
* mock. The Semaphore from wave-executor is reused to bound concurrent voice calls.
|
|
11
|
+
*
|
|
12
|
+
* Design decision — NO challenge round (kept thin per adversarial review + YAGNI):
|
|
13
|
+
* A second model call per dissenter that folds a revised confidence back into the
|
|
14
|
+
* consensus adds ordering/weighting complexity and extra failure modes without
|
|
15
|
+
* changing the activation goal. Position-collection + consensus fully satisfies the
|
|
16
|
+
* ADS loop and the verdict contract. Add a challenge round only if a concrete need
|
|
17
|
+
* arises (an explicit `opts.challengeRound` flag would be the clean extension point).
|
|
18
|
+
*
|
|
19
|
+
* Filenames use opts.decisionId (NOT Date.now(), which is unavailable in some
|
|
20
|
+
* MindForge execution contexts); falls back to "council-latest.json".
|
|
21
|
+
*/
|
|
22
|
+
const fs = require('fs');
|
|
23
|
+
const path = require('path');
|
|
24
|
+
const { Semaphore } = require('../autonomous/wave-executor');
|
|
25
|
+
|
|
26
|
+
const DEFAULT_VOICES = ['architect', 'skeptic', 'pragmatist', 'critic'];
|
|
27
|
+
const VALID_RECOMMENDATIONS = ['PROCEED', 'REVISE'];
|
|
28
|
+
|
|
29
|
+
/**
 * Guard for a single council voice's answer.
 *
 * Rejects anything that is not an object carrying a known recommendation and a
 * finite confidence inside [0, 1], so a bad payload surfaces as a voice-named
 * error instead of turning the consensus average into NaN.
 *
 * @param {string} voice — Persona that produced the position (used in the error text only).
 * @param {object} position — Raw value returned by the model for that voice.
 * @throws {Error} If the payload is not an object, the recommendation is not in
 *   VALID_RECOMMENDATIONS, or the confidence is not a finite number in [0, 1].
 */
function validatePosition(voice, position) {
  const prefix = `Council voice "${voice}" returned an invalid position:`;

  const isObject = Boolean(position) && typeof position === 'object';
  if (!isObject) {
    // typeof null is 'object', so name it explicitly in the message.
    const actual = position === null ? 'null' : typeof position;
    throw new Error(`${prefix} expected an object, got ${actual}`);
  }

  const { recommendation, confidence } = position;

  if (!VALID_RECOMMENDATIONS.includes(recommendation)) {
    throw new Error(`${prefix} recommendation must be one of ${VALID_RECOMMENDATIONS.join('/')}, got ${JSON.stringify(recommendation)}`);
  }

  const inUnitInterval = typeof confidence === 'number'
    && Number.isFinite(confidence)
    && confidence >= 0
    && confidence <= 1;
  if (!inUnitInterval) {
    throw new Error(`${prefix} confidence must be a number in [0,1], got ${confidence}`);
  }
}
|
|
46
|
+
|
|
47
|
+
/**
 * Ask every council voice for one position, then reduce the answers to a verdict.
 *
 * Each voice is queried once through the injected model. Calls run in parallel
 * but are gated by a Semaphore, and every returned position is validated before
 * it is accepted. A failing model call or an invalid position rejects the whole
 * council run.
 *
 * @param {string} question — The decision/question put to the council.
 * @param {object} [opts]
 * @param {string[]} [opts.voices] — Personas to consult; a missing or empty list
 *   falls back to DEFAULT_VOICES.
 * @param {number} [opts.consensusThreshold=0.75] — Consensus at or above this is
 *   PROCEED; at or below (1 - threshold) is REVISE; anything between is NO_CONSENSUS.
 * @param {function} opts.model — REQUIRED. async ({voice, question}) =>
 *   { recommendation: 'PROCEED'|'REVISE', confidence: number(0..1), rationale: string }
 * @param {number} [opts.maxConcurrency] — Semaphore size (default: number of voices).
 * @param {boolean} [opts.writeDecision=true] — Write the result as JSON unless exactly false.
 * @param {string} [opts.outputPath] — Target directory (default: <cwd>/.planning/decisions).
 * @param {string} [opts.decisionId] — When set, the file is council-<decisionId>.json;
 *   otherwise council-latest.json.
 * @returns {Promise<{question,positions,consensus,verdict,dissent}>}
 * @throws {Error} If opts.model is not a function.
 */
async function runCouncil(question, opts = {}) {
  const hasCustomVoices = Array.isArray(opts.voices) && opts.voices.length > 0;
  const voices = hasCustomVoices ? opts.voices : DEFAULT_VOICES;
  const consensusThreshold = opts.consensusThreshold ?? 0.75;
  const model = opts.model;
  if (typeof model !== 'function') {
    throw new Error('runCouncil requires an injectable model function (opts.model)');
  }
  const maxConcurrency = opts.maxConcurrency || voices.length;

  // One gated model call per voice; the permit is always handed back, even on failure.
  const gate = new Semaphore(maxConcurrency);
  const askVoice = async (voice) => {
    await gate.acquire();
    try {
      const position = await model({ voice, question });
      // Fail loudly here so a malformed payload never reaches the average below.
      validatePosition(voice, position);
      return { voice, ...position };
    } finally {
      gate.release();
    }
  };
  const positions = await Promise.all(voices.map((voice) => askVoice(voice)));

  // Approval signal per voice: PROCEED counts its confidence, REVISE counts
  // (1 - confidence), so a confident REVISE drags the mean towards 0.
  let approvalTotal = 0;
  for (const p of positions) {
    approvalTotal += p.recommendation === 'PROCEED' ? p.confidence : (1 - p.confidence);
  }
  const consensus = approvalTotal / positions.length;

  let verdict = 'NO_CONSENSUS';
  if (consensus >= consensusThreshold) {
    verdict = 'PROCEED';
  } else if (consensus <= (1 - consensusThreshold)) {
    verdict = 'REVISE';
  }

  // Dissent: on a deadlock keep every voice (with its recommendation) so both
  // camps are recorded; on a decisive verdict keep only the voices on the other side.
  let dissent;
  if (verdict === 'NO_CONSENSUS') {
    dissent = positions.map(({ voice, recommendation, rationale }) => ({ voice, recommendation, rationale }));
  } else {
    const verdictIsProceed = verdict === 'PROCEED';
    dissent = positions
      .filter((p) => (p.recommendation === 'PROCEED') !== verdictIsProceed)
      .map(({ voice, rationale }) => ({ voice, rationale }));
  }

  const result = { question, positions, consensus, verdict, dissent };

  if (opts.writeDecision === false) {
    return result;
  }

  const dir = opts.outputPath || path.join(process.cwd(), '.planning', 'decisions');
  fs.mkdirSync(dir, { recursive: true });
  const name = opts.decisionId ? `council-${opts.decisionId}.json` : 'council-latest.json';
  fs.writeFileSync(path.join(dir, name), JSON.stringify(result, null, 2));

  return result;
}
|
|
123
|
+
|
|
124
|
+
module.exports = { runCouncil };
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
/**
|
|
3
|
+
* MindForge — OTel GenAI Exporter (UC-18).
|
|
4
|
+
* Translates NexusTracer spans to OpenTelemetry GenAI semantic conventions.
|
|
5
|
+
* Active only when OTEL_EXPORTER_OTLP_ENDPOINT is set.
|
|
6
|
+
*
|
|
7
|
+
* NexusTracer span shape (from nexus-tracer.js startSpan/endSpan):
|
|
8
|
+
* {
|
|
9
|
+
* id: 'sp_<hex>',
|
|
10
|
+
* trace_id: 'tr_<hex>',
|
|
11
|
+
* parent_id: string|null,
|
|
12
|
+
* name: string,
|
|
13
|
+
* status: 'active'|'success'|'error',
|
|
14
|
+
* start_time: ISO-8601,
|
|
15
|
+
* end_time: ISO-8601,
|
|
16
|
+
* attributes: {
|
|
17
|
+
* service: string,
|
|
18
|
+
* host: string,
|
|
19
|
+
* pid: number,
|
|
20
|
+
* model_id?: string,
|
|
21
|
+
* skill?: string,
|
|
22
|
+
* input_tokens?: number,
|
|
23
|
+
* output_tokens?: number,
|
|
24
|
+
* ...
|
|
25
|
+
* }
|
|
26
|
+
* }
|
|
27
|
+
*
|
|
28
|
+
* Mapping to OTel GenAI semantic conventions:
|
|
29
|
+
* span.name → name
|
|
30
|
+
* span.attributes.model_id → gen_ai.request.model, gen_ai.response.model
|
|
31
|
+
* span.attributes.input_tokens → gen_ai.usage.input_tokens
|
|
32
|
+
* span.attributes.output_tokens → gen_ai.usage.output_tokens
|
|
33
|
+
* span.name → gen_ai.operation.name
|
|
34
|
+
* 'mindforge' → gen_ai.system (or span.attributes.provider if present)
|
|
35
|
+
*/
|
|
36
|
+
|
|
37
|
+
const crypto = require('crypto');
|
|
38
|
+
|
|
39
|
+
/**
 * Report whether span export is switched on.
 * The only gate is the OTEL_EXPORTER_OTLP_ENDPOINT environment variable:
 * any non-empty value enables the exporter.
 * @returns {boolean}
 */
function isEnabled() {
  const endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
  return Boolean(endpoint);
}
|
|
46
|
+
|
|
47
|
+
/**
 * Translate a NexusTracer span to OTel GenAI-compatible format.
 *
 * Identifiers are derived deterministically (SHA-256, truncated) from the
 * NexusTracer ids, so that:
 *   - every span sharing a Nexus trace_id gets the same 16-byte hex traceId, and
 *   - a child's parentSpanId equals the spanId produced for its parent.
 * A span with no id / trace_id falls back to random bytes, so the output is
 * still a well-formed 16-byte hex traceId and 8-byte hex spanId.
 *
 * Timestamps that are missing or cannot be parsed by Date become 0n instead of
 * throwing (BigInt(NaN) raises a RangeError).
 *
 * Status mapping: 'success' → 1 (OK), 'error' → 2 (ERROR), anything else
 * (including the in-flight 'active' state) → 0 (UNSET).
 *
 * @param {object} nexusSpan - A span object from NexusTracer
 * @returns {object} OTel-compatible span object; the two *UnixNano fields are BigInt
 */
function toOtelSpan(nexusSpan) {
  const attrs = nexusSpan.attributes || {};

  // Stable fixed-width lowercase hex id for a Nexus identifier; random when absent.
  const deriveId = (nexusId, byteLength) => (nexusId
    ? crypto.createHash('sha256').update(String(nexusId)).digest('hex').slice(0, byteLength * 2)
    : crypto.randomBytes(byteLength).toString('hex'));

  // ISO-8601 string → nanoseconds since the epoch; 0n when missing or unparseable.
  const toUnixNano = (isoTime) => {
    if (!isoTime) return 0n;
    const millis = new Date(isoTime).getTime();
    return Number.isFinite(millis) ? BigInt(millis) * 1_000_000n : 0n;
  };

  let statusCode = 0; // STATUS_CODE_UNSET
  if (nexusSpan.status === 'success') {
    statusCode = 1; // STATUS_CODE_OK
  } else if (nexusSpan.status === 'error') {
    statusCode = 2; // STATUS_CODE_ERROR
  }

  return {
    traceId: deriveId(nexusSpan.trace_id, 16),
    spanId: deriveId(nexusSpan.id, 8),
    parentSpanId: nexusSpan.parent_id ? deriveId(nexusSpan.parent_id, 8) : '',
    name: nexusSpan.name || 'unknown',
    kind: 1, // SPAN_KIND_INTERNAL
    startTimeUnixNano: toUnixNano(nexusSpan.start_time),
    endTimeUnixNano: toUnixNano(nexusSpan.end_time),
    status: { code: statusCode },
    attributes: {
      'gen_ai.system': attrs.provider || 'mindforge',
      'gen_ai.request.model': attrs.model_id || '',
      'gen_ai.response.model': attrs.model_id || '',
      'gen_ai.usage.input_tokens': attrs.input_tokens || 0,
      'gen_ai.usage.output_tokens': attrs.output_tokens || 0,
      'gen_ai.operation.name': nexusSpan.name || '',
      'service.name': attrs.service || 'mindforge-nexus',
    },
  };
}
|
|
81
|
+
|
|
82
|
+
/**
 * Produce a shallow copy of an OTel span whose two nanosecond timestamps are
 * strings, because JSON.stringify cannot serialize BigInt values.
 * The input span is not modified.
 * @param {object} otelSpan
 * @returns {object}
 */
function toJsonSafe(otelSpan) {
  const jsonSafe = { ...otelSpan };
  jsonSafe.startTimeUnixNano = String(otelSpan.startTimeUnixNano);
  jsonSafe.endTimeUnixNano = String(otelSpan.endTimeUnixNano);
  return jsonSafe;
}
|
|
94
|
+
|
|
95
|
+
/**
 * Export a NexusTracer span to the OTel-compatible local file.
 * In production, this would POST to OTEL_EXPORTER_OTLP_ENDPOINT/v1/traces.
 * For now, appends one JSON line per span to
 * <cwd>/.mindforge/metrics/otel-spans.jsonl for verification.
 *
 * Does nothing when the exporter is disabled (see isEnabled()).
 * The whole export — span conversion, path resolution, directory creation and
 * the append — is best-effort: any failure is swallowed so that the returned
 * promise never rejects and observability cannot break the caller.
 *
 * @param {object} nexusSpan - A span from NexusTracer
 * @returns {Promise<void>}
 */
async function exportSpan(nexusSpan) {
  if (!isEnabled()) return;

  const fs = require('fs');
  const path = require('path');

  try {
    // Conversion lives inside the try: a malformed span must not reject the promise.
    const line = JSON.stringify(toJsonSafe(toOtelSpan(nexusSpan)));
    const outPath = path.join(process.cwd(), '.mindforge', 'metrics', 'otel-spans.jsonl');

    // recursive mkdir is a no-op when the directory already exists.
    fs.mkdirSync(path.dirname(outPath), { recursive: true });
    fs.appendFileSync(outPath, line + '\n');
  } catch {
    // Non-fatal: observability export should never break the main flow
  }
}
|
|
122
|
+
|
|
123
|
+
module.exports = { isEnabled, toOtelSpan, toJsonSafe, exportSpan };
|
|
@@ -12,7 +12,7 @@ const SUBCOMMAND = ARGS[0];
|
|
|
12
12
|
|
|
13
13
|
async function main() {
|
|
14
14
|
switch (SUBCOMMAND) {
|
|
15
|
-
case 'status':
|
|
15
|
+
case 'status': {
|
|
16
16
|
const history = TemporalHub.getHistory();
|
|
17
17
|
console.log('\n⏳ MindForge Temporal Status');
|
|
18
18
|
console.log(` Snapshots: ${history.length}`);
|
|
@@ -20,6 +20,7 @@ async function main() {
|
|
|
20
20
|
console.log(` Latest: ${history[0].id} (${history[0].timestamp})`);
|
|
21
21
|
}
|
|
22
22
|
break;
|
|
23
|
+
}
|
|
23
24
|
|
|
24
25
|
case 'cleanup':
|
|
25
26
|
console.log('🧹 Cleaning up old temporal snapshots...');
|
|
@@ -27,7 +28,7 @@ async function main() {
|
|
|
27
28
|
console.log('✅ Cleanup complete.');
|
|
28
29
|
break;
|
|
29
30
|
|
|
30
|
-
case 'inject':
|
|
31
|
+
case 'inject': {
|
|
31
32
|
const auditId = ARGS[1];
|
|
32
33
|
const fix = ARGS.slice(2).join(' ');
|
|
33
34
|
if (!auditId || !fix) {
|
|
@@ -42,6 +43,7 @@ async function main() {
|
|
|
42
43
|
process.exit(1);
|
|
43
44
|
}
|
|
44
45
|
break;
|
|
46
|
+
}
|
|
45
47
|
|
|
46
48
|
default:
|
|
47
49
|
console.log('Usage: /mindforge:temporal <status|cleanup|inject>');
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const { execSync } = require('child_process');
|
|
4
|
+
const path = require('path');
|
|
5
|
+
const fs = require('fs');
|
|
6
|
+
|
|
7
|
+
const MAX_OUTPUT_LENGTH = 2000;
|
|
8
|
+
|
|
9
|
+
/**
 * Registry of verification stages, keyed by stage name. Every entry carries the
 * shell command to run and a `skipIf` predicate (or null for "never skip").
 *
 * - tests:     skipped when MINDFORGE_VERIFICATION_ACTIVE is '1' or NODE_ENV is
 *              'test'. The runner sets MINDFORGE_VERIFICATION_ACTIVE for its
 *              child processes, so this guards against the test suite
 *              re-entering the runner recursively.
 * - lint:      always runs.
 * - audit:     always runs.
 * - typecheck: skipped when the working directory has no tsconfig.json.
 */
const STAGE_DEFS = {
  tests: {
    command: 'node tests/run-all.js',
    skipIf() {
      const { MINDFORGE_VERIFICATION_ACTIVE: active, NODE_ENV: nodeEnv } = process.env;
      return active === '1' || nodeEnv === 'test';
    },
  },
  lint: {
    command: 'npx eslint . --max-warnings=0',
    skipIf: null,
  },
  audit: {
    command: 'node bin/verify-audit.js',
    skipIf: null,
  },
  typecheck: {
    command: 'npx tsc --noEmit',
    skipIf(cwd) {
      const tsconfig = path.join(cwd, 'tsconfig.json');
      return !fs.existsSync(tsconfig);
    },
  },
};
|
|
34
|
+
|
|
35
|
+
/**
 * Run a single stage, returning a structured result object.
 *
 * - A name that is not an own key of STAGE_DEFS yields a 'skip' result whose
 *   output is "Unknown stage: <name>" (inherited keys such as "constructor"
 *   are treated as unknown too).
 * - A stage whose skipIf predicate returns truthy yields a 'skip' result.
 * - Otherwise the stage command runs synchronously in `cwd` with a 120 s
 *   timeout and MINDFORGE_VERIFICATION_ACTIVE=1 added to the environment.
 *   A throwing execSync (non-zero exit, timeout, spawn failure) is reported
 *   as 'fail', never re-thrown.
 *
 * Output is truncated to MAX_OUTPUT_LENGTH characters. On failure it is the
 * captured stdout + stderr; when nothing was captured (e.g. a timeout or a
 * spawn error) the error message is used so the failure reason is not lost.
 *
 * @param {string} name - Stage name, looked up in STAGE_DEFS
 * @param {string} cwd - Directory the command runs in
 * @returns {{name: string, status: 'pass'|'fail'|'skip', durationMs: number, output: string}}
 */
function executeStage(name, cwd) {
  const def = Object.prototype.hasOwnProperty.call(STAGE_DEFS, name)
    ? STAGE_DEFS[name]
    : undefined;
  if (!def) {
    return { name, status: 'skip', durationMs: 0, output: `Unknown stage: ${name}` };
  }

  // Check skip condition
  if (def.skipIf && def.skipIf(cwd)) {
    return { name, status: 'skip', durationMs: 0, output: '' };
  }

  const start = Date.now();
  let output = '';
  let status = 'pass';

  try {
    // Flag child processes so a nested test run can detect it was started by this runner.
    const env = { ...process.env, MINDFORGE_VERIFICATION_ACTIVE: '1' };
    const result = execSync(def.command, {
      cwd,
      encoding: 'utf8',
      stdio: ['pipe', 'pipe', 'pipe'],
      timeout: 120000,
      env,
    });
    output = (result || '').slice(0, MAX_OUTPUT_LENGTH);
  } catch (err) {
    status = 'fail';
    const captured = `${err.stdout || ''}\n${err.stderr || ''}`.trim();
    // Timeouts and spawn failures carry no stdout/stderr; surface the error text instead.
    output = (captured || err.message || String(err)).slice(0, MAX_OUTPUT_LENGTH);
  }

  const durationMs = Date.now() - start;
  return { name, status, durationMs, output };
}
|
|
75
|
+
|
|
76
|
+
/**
 * Execute the requested stages one after another and tally the outcomes.
 *
 * Stages run sequentially, in the order given, against the absolute form of
 * `cwd`. The summary counts results by status and adds up their durations.
 *
 * @param {{ cwd: string, stages: string[] }} opts
 * @returns {Promise<object>} `{ stages, summary: { passed, failed, skipped,
 *   totalDurationMs }, timestamp }` where timestamp is an ISO-8601 string
 *   taken after the last stage finished.
 */
async function runVerification({ cwd, stages }) {
  const root = path.resolve(cwd);
  const stageResults = [];
  const summary = { passed: 0, failed: 0, skipped: 0, totalDurationMs: 0 };

  for (const stageName of stages) {
    const outcome = executeStage(stageName, root);
    stageResults.push(outcome);

    switch (outcome.status) {
      case 'pass':
        summary.passed += 1;
        break;
      case 'fail':
        summary.failed += 1;
        break;
      case 'skip':
        summary.skipped += 1;
        break;
      default:
        break;
    }
    summary.totalDurationMs += outcome.durationMs;
  }

  return {
    stages: stageResults,
    summary,
    timestamp: new Date().toISOString(),
  };
}
|
|
101
|
+
|
|
102
|
+
/**
 * Render a verification result as a markdown document: a title, the run
 * timestamp, one table row per stage, and a one-line summary. The text ends
 * with a trailing newline.
 *
 * A stage with a duration of 0 ms (or less) shows "-" in the Duration column;
 * a status without a known emoji shows "?".
 *
 * @param {object} result — output from runVerification
 * @returns {string} Markdown report
 */
function formatReport(result) {
  const statusEmoji = { pass: '✅', fail: '❌', skip: '⏭️' };

  const header = [
    '# Verification Report',
    '',
    `**Timestamp:** ${result.timestamp}`,
    '',
    '| Stage | Status | Duration |',
    '|-------|--------|----------|',
  ];

  const rows = [];
  for (const { name, status, durationMs } of result.stages) {
    const icon = statusEmoji[status] || '?';
    const elapsed = durationMs > 0 ? `${durationMs}ms` : '-';
    rows.push(`| ${name} | ${icon} ${status} | ${elapsed} |`);
  }

  const { passed, failed, skipped, totalDurationMs } = result.summary;
  const footer = [
    '',
    `**Summary:** ${passed} passed, ${failed} failed, ${skipped} skipped (${totalDurationMs}ms total)`,
    '',
  ];

  return [...header, ...rows, ...footer].join('\n');
}
|
|
130
|
+
|
|
131
|
+
module.exports = { runVerification, formatReport };
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
'use strict';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* verify-cli.js — Entrypoint for the `verify` CLI command.
|
|
6
|
+
* Calls the unified verification runner across all stages and writes
|
|
7
|
+
* the formatted report to .planning/VERIFICATION.md.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
const path = require('path');
|
|
11
|
+
const fs = require('fs');
|
|
12
|
+
const { runVerification, formatReport } = require('./verification-runner');
|
|
13
|
+
|
|
14
|
+
const STAGES = ['tests', 'lint', 'audit', 'typecheck'];
|
|
15
|
+
const CWD = process.env.MINDFORGE_ROOT || path.resolve(__dirname, '../..');
|
|
16
|
+
|
|
17
|
+
/**
 * CLI entry point: run every stage in STAGES against CWD, write the markdown
 * report to <CWD>/.planning/VERIFICATION.md, echo it to stdout, then exit
 * with status 1 if any stage failed and 0 otherwise.
 */
async function main() {
  const planningDir = path.join(CWD, '.planning');
  const planningDirMissing = !fs.existsSync(planningDir);
  if (planningDirMissing) {
    fs.mkdirSync(planningDir, { recursive: true });
  }

  const verification = await runVerification({ cwd: CWD, stages: STAGES });
  const markdown = formatReport(verification);

  const reportPath = path.join(planningDir, 'VERIFICATION.md');
  fs.writeFileSync(reportPath, markdown);
  process.stdout.write(markdown + '\n');

  const exitCode = verification.summary.failed > 0 ? 1 : 0;
  process.exit(exitCode);
}
|
|
30
|
+
|
|
31
|
+
main().catch(err => {
|
|
32
|
+
console.error('Verification runner failed:', err.message);
|
|
33
|
+
process.exit(1);
|
|
34
|
+
});
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
 * Recall@K — fraction of the distinct relevant items that appear in the top-k
 * retrieved results.
 *
 * Both sides are treated as sets: an id repeated in `retrieved` is counted as
 * one hit, and an id repeated in `relevant` is counted once in the
 * denominator. This keeps the result inside [0, 1] even for retrievers that
 * return duplicates.
 *
 * @param {string[]} retrieved - IDs in ranked order
 * @param {string[]} relevant - ground-truth relevant IDs
 * @param {number} k - cutoff
 * @returns {number} recall in [0, 1]; 0 when there are no relevant IDs
 */
function recallAtK(retrieved, relevant, k) {
  const relevantSet = new Set(relevant);
  if (relevantSet.size === 0) return 0;

  const hits = new Set();
  for (const id of retrieved.slice(0, k)) {
    if (relevantSet.has(id)) hits.add(id);
  }
  return hits.size / relevantSet.size;
}
|
|
17
|
+
|
|
18
|
+
/**
 * nDCG (Normalized Discounted Cumulative Gain) with graded relevance.
 *
 *   DCG  = Σ (2^rel_i - 1) / log2(i + 2)   for i = 0..k-1 over the retrieved list
 *   IDCG = the same sum over the positive grades sorted descending, top k
 *   nDCG = DCG / IDCG  (0 when IDCG is 0)
 *
 * Only the first occurrence of an id earns gain; a repeated id keeps its rank
 * slot but contributes nothing, so duplicates in `retrieved` cannot push the
 * score above 1. Grades are read from own properties of `relevanceMap` only,
 * so ids such as "constructor" are not confused with inherited members.
 *
 * @param {string[]} retrieved - IDs in ranked order
 * @param {Object.<string, number>} relevanceMap - {id: grade} where grade is 0-3
 * @param {number} k - cutoff
 * @returns {number} nDCG in [0, 1]
 */
function ndcg(retrieved, relevanceMap, k) {
  const gain = (grade) => Math.pow(2, grade) - 1;
  const discount = (rank) => Math.log2(rank + 2);
  const gradeOf = (id) => (Object.prototype.hasOwnProperty.call(relevanceMap, id)
    ? relevanceMap[id] || 0
    : 0);

  const credited = new Set();
  let dcg = 0;
  retrieved.slice(0, k).forEach((id, rank) => {
    if (credited.has(id)) return; // duplicate: occupies the slot, earns nothing
    credited.add(id);
    dcg += gain(gradeOf(id)) / discount(rank);
  });

  // IDCG — ideal ordering: sort all relevance grades descending, take top-k
  const idcg = Object.values(relevanceMap)
    .filter((grade) => grade > 0)
    .sort((a, b) => b - a)
    .slice(0, k)
    .reduce((sum, grade, rank) => sum + gain(grade) / discount(rank), 0);

  if (idcg === 0) return 0;
  return dcg / idcg;
}
|
|
47
|
+
|
|
48
|
+
/**
 * Score a retriever against a golden set, one query at a time.
 *
 * For each entry the retriever is called with the query (its result is
 * awaited, so it may be sync or async), then Recall@K and nDCG@K are computed
 * with binary relevance: every id listed in `relevant` has grade 1.
 * Queries are evaluated sequentially, in golden-set order.
 *
 * @param {Object} opts
 * @param {Array<{query: string, relevant: string[]}>} opts.goldenSet
 * @param {function(string): string[]} opts.retriever
 * @param {number} opts.k
 * @returns {Promise<{meanRecallAtK: number, meanNDCG: number, perQuery: Array}>}
 *   Means are 0 and perQuery is empty when the golden set has no entries.
 */
async function runEval({ goldenSet, retriever, k }) {
  const perQuery = [];
  let recallTotal = 0;
  let ndcgTotal = 0;

  for (const entry of goldenSet) {
    const { query, relevant } = entry;
    const retrieved = await retriever(query);

    // Binary relevance map: relevant items get grade 1, others 0
    const relevanceMap = {};
    for (const id of relevant) {
      relevanceMap[id] = 1;
    }

    const scores = {
      recall: recallAtK(retrieved, relevant, k),
      ndcg: ndcg(retrieved, relevanceMap, k),
    };
    recallTotal += scores.recall;
    ndcgTotal += scores.ndcg;

    perQuery.push({ query, recall: scores.recall, ndcg: scores.ndcg, retrieved });
  }

  const evaluated = perQuery.length;
  if (evaluated === 0) {
    return { meanRecallAtK: 0, meanNDCG: 0, perQuery: [] };
  }

  return {
    meanRecallAtK: recallTotal / evaluated,
    meanNDCG: ndcgTotal / evaluated,
    perQuery,
  };
}
|
|
81
|
+
|
|
82
|
+
module.exports = { recallAtK, ndcg, runEval };
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
{
|
|
2
|
+
"description": "Golden set for retrieval quality evaluation. Each entry has a natural-language query and the IDs of documents that SHOULD be retrieved.",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"queries": [
|
|
5
|
+
{
|
|
6
|
+
"query": "how does the audit hash chain work",
|
|
7
|
+
"relevant": ["audit-hash", "audit-verifier", "verify-audit"]
|
|
8
|
+
},
|
|
9
|
+
{
|
|
10
|
+
"query": "what model should I use for a security-sensitive task",
|
|
11
|
+
"relevant": ["difficulty-scorer", "model-router", "pricing-registry", "trust-boundaries"]
|
|
12
|
+
},
|
|
13
|
+
{
|
|
14
|
+
"query": "how does wave execution and parallel orchestration work",
|
|
15
|
+
"relevant": ["wave-executor", "swarm-controller", "auto-executor"]
|
|
16
|
+
},
|
|
17
|
+
{
|
|
18
|
+
"query": "how do I track token costs and budget enforcement",
|
|
19
|
+
"relevant": ["cost-tracker", "token-ledger", "budget-enforcer", "finops-hub"]
|
|
20
|
+
},
|
|
21
|
+
{
|
|
22
|
+
"query": "how does the knowledge store persist and retrieve entries",
|
|
23
|
+
"relevant": ["knowledge-store", "knowledge-graph-protocol", "shard-controller"]
|
|
24
|
+
},
|
|
25
|
+
{
|
|
26
|
+
"query": "what happens during council consensus and synthesis",
|
|
27
|
+
"relevant": ["council-protocol", "synthesis-engine", "council-templates"]
|
|
28
|
+
},
|
|
29
|
+
{
|
|
30
|
+
"query": "how do instincts get captured and promoted to skills",
|
|
31
|
+
"relevant": ["capture-engine", "promotion-engine", "instinct-schema", "skill-registry"]
|
|
32
|
+
},
|
|
33
|
+
{
|
|
34
|
+
"query": "what verification checks run before marking a task complete",
|
|
35
|
+
"relevant": ["verification-pipeline", "trust-verifier", "policy-engine"]
|
|
36
|
+
},
|
|
37
|
+
{
|
|
38
|
+
"query": "how does the autonomous stuck detector recover failed tasks",
|
|
39
|
+
"relevant": ["stuck-detector", "node-repair", "steering-manager", "progress-reporter"]
|
|
40
|
+
},
|
|
41
|
+
{
|
|
42
|
+
"query": "how are hooks triggered and what security gates exist pre-commit",
|
|
43
|
+
"relevant": ["trust-gate-hook", "instinct-capture-hook", "policy-gate-hardened", "impact-analyzer"]
|
|
44
|
+
}
|
|
45
|
+
]
|
|
46
|
+
}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
const crypto = require('crypto');
|
|
3
|
+
/**
 * Canonical audit hash material — the one function both the audit writer and
 * the audit verifier must use, so that they can never disagree on the digest.
 *
 * The digest is the hex SHA-256 of JSON.stringify({...entry, previous_hash}).
 * The caller must pass the entry WITHOUT its _hash field (the writer before it
 * adds _hash, the verifier after stripping it). Key order of the entry is part
 * of the hashed material, because JSON.stringify serializes keys in insertion order.
 *
 * @param {object} entryWithoutHash - Audit entry with no _hash property
 * @param {string} previousHash - Hash of the preceding entry in the chain
 * @returns {string} Lowercase hex SHA-256 digest
 */
function hashAuditEntry(entryWithoutHash, previousHash) {
  const chained = { ...entryWithoutHash, previous_hash: previousHash };
  const hasher = crypto.createHash('sha256');
  hasher.update(JSON.stringify(chained));
  return hasher.digest('hex');
}
|
|
12
|
+
module.exports = { hashAuditEntry };
|