rlhf-feedback-loop 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. package/CHANGELOG.md +26 -0
  2. package/LICENSE +21 -0
  3. package/README.md +308 -0
  4. package/adapters/README.md +8 -0
  5. package/adapters/amp/skills/rlhf-feedback/SKILL.md +20 -0
  6. package/adapters/chatgpt/INSTALL.md +80 -0
  7. package/adapters/chatgpt/openapi.yaml +292 -0
  8. package/adapters/claude/.mcp.json +8 -0
  9. package/adapters/codex/config.toml +4 -0
  10. package/adapters/gemini/function-declarations.json +95 -0
  11. package/adapters/mcp/server-stdio.js +444 -0
  12. package/bin/cli.js +167 -0
  13. package/config/mcp-allowlists.json +29 -0
  14. package/config/policy-bundles/constrained-v1.json +53 -0
  15. package/config/policy-bundles/default-v1.json +80 -0
  16. package/config/rubrics/default-v1.json +52 -0
  17. package/config/subagent-profiles.json +32 -0
  18. package/openapi/openapi.yaml +292 -0
  19. package/package.json +91 -0
  20. package/plugins/amp-skill/INSTALL.md +52 -0
  21. package/plugins/amp-skill/SKILL.md +31 -0
  22. package/plugins/claude-skill/INSTALL.md +55 -0
  23. package/plugins/claude-skill/SKILL.md +46 -0
  24. package/plugins/codex-profile/AGENTS.md +20 -0
  25. package/plugins/codex-profile/INSTALL.md +57 -0
  26. package/plugins/gemini-extension/INSTALL.md +74 -0
  27. package/plugins/gemini-extension/gemini_prompt.txt +10 -0
  28. package/plugins/gemini-extension/tool_contract.json +28 -0
  29. package/scripts/billing.js +471 -0
  30. package/scripts/budget-guard.js +173 -0
  31. package/scripts/code-reasoning.js +307 -0
  32. package/scripts/context-engine.js +547 -0
  33. package/scripts/contextfs.js +513 -0
  34. package/scripts/contract-audit.js +198 -0
  35. package/scripts/dpo-optimizer.js +208 -0
  36. package/scripts/export-dpo-pairs.js +316 -0
  37. package/scripts/export-training.js +448 -0
  38. package/scripts/feedback-attribution.js +313 -0
  39. package/scripts/feedback-inbox-read.js +162 -0
  40. package/scripts/feedback-loop.js +838 -0
  41. package/scripts/feedback-schema.js +300 -0
  42. package/scripts/feedback-to-memory.js +165 -0
  43. package/scripts/feedback-to-rules.js +109 -0
  44. package/scripts/generate-paperbanana-diagrams.sh +99 -0
  45. package/scripts/hybrid-feedback-context.js +676 -0
  46. package/scripts/intent-router.js +164 -0
  47. package/scripts/mcp-policy.js +92 -0
  48. package/scripts/meta-policy.js +194 -0
  49. package/scripts/plan-gate.js +154 -0
  50. package/scripts/prove-adapters.js +364 -0
  51. package/scripts/prove-attribution.js +364 -0
  52. package/scripts/prove-automation.js +393 -0
  53. package/scripts/prove-data-quality.js +219 -0
  54. package/scripts/prove-intelligence.js +256 -0
  55. package/scripts/prove-lancedb.js +370 -0
  56. package/scripts/prove-loop-closure.js +255 -0
  57. package/scripts/prove-rlaif.js +404 -0
  58. package/scripts/prove-subway-upgrades.js +250 -0
  59. package/scripts/prove-training-export.js +324 -0
  60. package/scripts/prove-v2-milestone.js +273 -0
  61. package/scripts/prove-v3-milestone.js +381 -0
  62. package/scripts/rlaif-self-audit.js +123 -0
  63. package/scripts/rubric-engine.js +230 -0
  64. package/scripts/self-heal.js +127 -0
  65. package/scripts/self-healing-check.js +111 -0
  66. package/scripts/skill-quality-tracker.js +284 -0
  67. package/scripts/subagent-profiles.js +79 -0
  68. package/scripts/sync-gh-secrets-from-env.sh +29 -0
  69. package/scripts/thompson-sampling.js +331 -0
  70. package/scripts/train_from_feedback.py +914 -0
  71. package/scripts/validate-feedback.js +580 -0
  72. package/scripts/vector-store.js +100 -0
  73. package/src/api/server.js +497 -0
@@ -0,0 +1,255 @@
1
+ 'use strict';
2
+ /**
3
+ * Phase 8: Loop Closure — Proof Gate
4
+ *
5
+ * Validates all LOOP-01 through LOOP-05 requirements offline.
6
+ * Mirrors the pattern of prove-attribution.js (mkdtempSync + env override + execSync).
7
+ *
8
+ * Usage:
9
+ * node scripts/prove-loop-closure.js
10
+ *
11
+ * Produces:
12
+ * proof/loop-closure-report.json
13
+ * proof/loop-closure-report.md
14
+ */
15
+
16
+ const { execSync } = require('child_process');
17
+ const fs = require('fs');
18
+ const os = require('os');
19
+ const path = require('path');
20
+
21
+ const PROOF_DIR = path.join(__dirname, '..', 'proof');
22
+ const REPORT_JSON = path.join(PROOF_DIR, 'loop-closure-report.json');
23
+ const REPORT_MD = path.join(PROOF_DIR, 'loop-closure-report.md');
24
+
25
+ function run() {
26
+ const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'rlhf-loop-proof-'));
27
+ const results = { passed: 0, failed: 0, requirements: {} };
28
+
29
+ const checks = [
30
+ {
31
+ id: 'LOOP-01',
32
+ desc: 'feedback-to-rules.js: analyze() produces recurringIssues + toRules() emits NEVER bullets',
33
+ fn: () => {
34
+ delete require.cache[require.resolve('./feedback-to-rules')];
35
+ const m = require('./feedback-to-rules');
36
+ if (typeof m.parseFeedbackFile !== 'function') throw new Error('parseFeedbackFile not exported');
37
+ if (typeof m.classifySignal !== 'function') throw new Error('classifySignal not exported');
38
+ if (typeof m.analyze !== 'function') throw new Error('analyze not exported');
39
+ if (typeof m.toRules !== 'function') throw new Error('toRules not exported');
40
+
41
+ const ctx = 'Agent claimed done without running tests first';
42
+ const entries = [
43
+ { signal: 'negative', context: ctx },
44
+ { signal: 'negative', context: ctx },
45
+ ];
46
+ const report = m.analyze(entries);
47
+ if (report.recurringIssues.length < 1) throw new Error('Expected at least 1 recurring issue');
48
+ const rules = m.toRules(report);
49
+ if (!rules.includes('NEVER')) throw new Error('toRules must emit NEVER bullets');
50
+ if (!rules.startsWith('# Suggested Rules from Feedback Analysis')) {
51
+ throw new Error('toRules must start with header');
52
+ }
53
+ },
54
+ },
55
+ {
56
+ id: 'LOOP-02',
57
+ desc: 'plan-gate.js: validatePlan() rejects structurally invalid PRD, passes valid one',
58
+ fn: () => {
59
+ delete require.cache[require.resolve('./plan-gate')];
60
+ const m = require('./plan-gate');
61
+ if (typeof m.validatePlan !== 'function') throw new Error('validatePlan not exported');
62
+ if (typeof m.formatReport !== 'function') throw new Error('formatReport not exported');
63
+
64
+ // Invalid: missing required sections
65
+ const invalid = m.validatePlan('# Minimal plan\nNo sections here');
66
+ if (invalid.allPass) throw new Error('Expected allPass=false for structurally invalid PRD');
67
+
68
+ // Valid: all gates satisfied
69
+ const valid = m.validatePlan([
70
+ '# My Plan',
71
+ '',
72
+ '## Status',
73
+ 'DRAFT',
74
+ '',
75
+ '## Clarifying Questions Resolved',
76
+ '| Q | A |',
77
+ '|---|---|',
78
+ '| q1 | a1 |',
79
+ '| q2 | a2 |',
80
+ '| q3 | a3 |',
81
+ '',
82
+ '## Contracts',
83
+ '```',
84
+ 'interface Foo { bar: string }',
85
+ '```',
86
+ '',
87
+ '## Validation Checklist',
88
+ '- [ ] scenario 1',
89
+ '- [ ] scenario 2',
90
+ ].join('\n'));
91
+ if (!valid.allPass) throw new Error('Expected allPass=true for valid PRD');
92
+
93
+ const report = m.formatReport(valid);
94
+ if (!report.includes('RESULT: PASS')) throw new Error('formatReport must include RESULT: PASS');
95
+ },
96
+ },
97
+ {
98
+ id: 'LOOP-03',
99
+ desc: 'feedback-inbox-read.js: getNewEntries reads in cursor order, no re-reads on next call',
100
+ fn: () => {
101
+ delete require.cache[require.resolve('./feedback-inbox-read')];
102
+ const m = require('./feedback-inbox-read');
103
+ if (typeof m.getNewEntries !== 'function') throw new Error('getNewEntries not exported');
104
+ if (typeof m.readInbox !== 'function') throw new Error('readInbox not exported');
105
+ if (typeof m.loadCursor !== 'function') throw new Error('loadCursor not exported');
106
+ if (typeof m.saveCursor !== 'function') throw new Error('saveCursor not exported');
107
+
108
+ // Verify cursor filtering logic
109
+ const allEntries = [
110
+ { _lineIndex: 0, signal: 'negative' },
111
+ { _lineIndex: 1, signal: 'positive' },
112
+ { _lineIndex: 2, signal: 'negative' },
113
+ ];
114
+ const cursor = { lastLineIndex: 0 };
115
+ const afterFirst = allEntries.filter((e) => e._lineIndex > cursor.lastLineIndex);
116
+ if (afterFirst.length !== 2) throw new Error('Expected 2 entries after cursor=0');
117
+
118
+ const cursor2 = { lastLineIndex: 2 };
119
+ const afterAll = allEntries.filter((e) => e._lineIndex > cursor2.lastLineIndex);
120
+ if (afterAll.length !== 0) throw new Error('Expected 0 entries after cursor=2 (no re-reads)');
121
+
122
+ // Verify paths are exported
123
+ if (typeof m.INBOX_PATH !== 'string') throw new Error('INBOX_PATH must be exported string');
124
+ if (typeof m.CURSOR_PATH !== 'string') throw new Error('CURSOR_PATH must be exported string');
125
+ },
126
+ },
127
+ {
128
+ id: 'LOOP-04',
129
+ desc: 'feedback-to-memory.js: convertFeedbackToMemory() emits valid MCP memory format on round-trip',
130
+ fn: () => {
131
+ delete require.cache[require.resolve('./feedback-to-memory')];
132
+ const m = require('./feedback-to-memory');
133
+ if (typeof m.convertFeedbackToMemory !== 'function') {
134
+ throw new Error('convertFeedbackToMemory not exported');
135
+ }
136
+
137
+ // Valid negative → memory
138
+ const neg = m.convertFeedbackToMemory({
139
+ signal: 'negative',
140
+ context: 'Agent claimed fix without test evidence',
141
+ whatWentWrong: 'No tests were run before claiming done',
142
+ whatToChange: 'Always run tests before claiming done',
143
+ tags: ['verification', 'testing'],
144
+ });
145
+ if (!neg.ok) throw new Error(`Valid negative should return ok=true: ${neg.reason}`);
146
+ if (neg.actionType !== 'store-mistake') throw new Error('Expected actionType=store-mistake');
147
+ if (!neg.memory.title.startsWith('MISTAKE:')) throw new Error('Expected MISTAKE: prefix');
148
+ if (neg.memory.category !== 'error') throw new Error('Expected category=error');
149
+ if (!Array.isArray(neg.memory.tags)) throw new Error('Expected tags array');
150
+
151
+ // Valid positive → memory
152
+ const pos = m.convertFeedbackToMemory({
153
+ signal: 'positive',
154
+ whatWorked: 'Ran full test suite before claiming done',
155
+ tags: ['verification'],
156
+ });
157
+ if (!pos.ok) throw new Error(`Valid positive should return ok=true: ${pos.reason}`);
158
+ if (pos.actionType !== 'store-learning') throw new Error('Expected actionType=store-learning');
159
+ if (!pos.memory.title.startsWith('SUCCESS:')) throw new Error('Expected SUCCESS: prefix');
160
+
161
+ // Bare negative → rejected (no context)
162
+ const bare = m.convertFeedbackToMemory({ signal: 'negative' });
163
+ if (bare.ok) throw new Error('Bare negative without context should be rejected');
164
+ },
165
+ },
166
+ {
167
+ id: 'LOOP-05',
168
+ desc: 'test:loop-closure (node --test tests/loop-closure.test.js) passes with 0 failures',
169
+ fn: () => {
170
+ const out = execSync('node --test tests/loop-closure.test.js', {
171
+ cwd: path.join(__dirname, '..'),
172
+ env: { ...process.env, RLHF_FEEDBACK_DIR: tmpDir },
173
+ encoding: 'utf8',
174
+ stdio: 'pipe',
175
+ });
176
+ const failMatch = out.match(/ℹ fail (\d+)/);
177
+ if (failMatch && parseInt(failMatch[1], 10) > 0) {
178
+ throw new Error(`Tests failed: ${failMatch[1]} failure(s)\n${out.slice(-500)}`);
179
+ }
180
+ },
181
+ },
182
+ ];
183
+
184
+ console.log('Phase 8: Loop Closure — Proof Gate\n');
185
+ console.log('Checking requirements:\n');
186
+
187
+ for (const check of checks) {
188
+ try {
189
+ check.fn();
190
+ results.passed++;
191
+ results.requirements[check.id] = { status: 'pass', desc: check.desc };
192
+ console.log(` PASS ${check.id}: ${check.desc}`);
193
+ } catch (err) {
194
+ results.failed++;
195
+ results.requirements[check.id] = {
196
+ status: 'fail',
197
+ desc: check.desc,
198
+ error: err.message,
199
+ };
200
+ console.error(` FAIL ${check.id}: ${err.message}`);
201
+ }
202
+ }
203
+
204
+ // Cleanup tmp dir
205
+ try {
206
+ fs.rmSync(tmpDir, { recursive: true, force: true });
207
+ } catch {}
208
+
209
+ // Write proof artifacts
210
+ fs.mkdirSync(PROOF_DIR, { recursive: true });
211
+
212
+ const report = {
213
+ phase: '08-loop-closure',
214
+ generatedAt: new Date().toISOString(),
215
+ passed: results.passed,
216
+ failed: results.failed,
217
+ total: checks.length,
218
+ requirements: results.requirements,
219
+ };
220
+
221
+ fs.writeFileSync(REPORT_JSON, JSON.stringify(report, null, 2) + '\n');
222
+
223
+ const md = [
224
+ '# Phase 8: Loop Closure — Proof Report',
225
+ '',
226
+ `Generated: ${report.generatedAt}`,
227
+ `Result: ${results.passed}/${checks.length} passed`,
228
+ '',
229
+ '## Requirements',
230
+ '',
231
+ ...Object.entries(results.requirements).map(([id, r]) => {
232
+ const checkbox = r.status === 'pass' ? '[x]' : '[ ]';
233
+ const errLine = r.error ? `\n - Error: \`${r.error}\`` : '';
234
+ return `- ${checkbox} **${id}**: ${r.desc}${errLine}`;
235
+ }),
236
+ '',
237
+ '## Evidence',
238
+ '',
239
+ '- `scripts/feedback-to-rules.js` — Feedback pattern analysis + CLAUDE.md-compatible rule generation',
240
+ '- `scripts/plan-gate.js` — PRD structural validation gate (questions, contracts, checklist, status)',
241
+ '- `scripts/feedback-inbox-read.js` — Cursor-based inbox reader with no re-read guarantee',
242
+ '- `scripts/feedback-to-memory.js` — Stdin JSON → MCP memory format bridge with schema validation',
243
+ '- `tests/loop-closure.test.js` — 44 node:test cases covering all LOOP requirements',
244
+ '',
245
+ ].join('\n');
246
+
247
+ fs.writeFileSync(REPORT_MD, md);
248
+
249
+ console.log(`\nPhase 8 proof: ${results.passed} passed, ${results.failed} failed`);
250
+ console.log(`Report: ${REPORT_JSON}`);
251
+
252
+ if (results.failed > 0) process.exit(1);
253
+ }
254
+
255
+ run();
@@ -0,0 +1,404 @@
1
+ #!/usr/bin/env node
2
+ 'use strict';
3
+
4
+ /**
5
+ * prove-rlaif.js — Phase 5 gate proof script.
6
+ *
7
+ * Generates proof/rlaif-report.md and proof/rlaif-report.json documenting
8
+ * per-requirement evidence for DPO-01 through DPO-04.
9
+ *
10
+ * Mirrors the prove-lancedb.js structure exactly.
11
+ *
12
+ * Exit 0 if no 'fail' statuses; exit 1 if any 'fail'.
13
+ */
14
+
15
+ const fs = require('fs');
16
+ const path = require('path');
17
+ const os = require('os');
18
+ const { execSync } = require('child_process');
19
+
20
+ const ROOT = path.join(__dirname, '..');
21
+ const PROOF_DIR = path.join(ROOT, 'proof');
22
+
23
+ // Phase 4 node-runner test baseline (before Phase 5 tests)
24
+ const PHASE4_BASELINE = 93;
25
+
26
+ function ensureDir(dirPath) {
27
+ if (!fs.existsSync(dirPath)) {
28
+ fs.mkdirSync(dirPath, { recursive: true });
29
+ }
30
+ }
31
+
32
+ async function runProof() {
33
+ const report = {
34
+ phase: '05-rlaif-and-dpo-optimization',
35
+ generated: new Date().toISOString(),
36
+ requirements: {},
37
+ summary: { passed: 0, failed: 0, warned: 0 },
38
+ };
39
+
40
+ function addResult(reqId, reqStatus, evidence) {
41
+ report.requirements[reqId] = { status: reqStatus, evidence };
42
+ if (reqStatus === 'pass') report.summary.passed += 1;
43
+ else if (reqStatus === 'warn') report.summary.warned += 1;
44
+ else report.summary.failed += 1;
45
+ }
46
+
47
+ const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'prove-rlaif-'));
48
+
49
+ // ─────────────────────────────────────────────────────────────────────────
50
+ // DPO-01: selfAudit() returns float score in [0,1] and writes self-score-log.jsonl
51
+ // Evidence: create well-formed event, call selfAudit + selfAuditAndLog, verify
52
+ // ─────────────────────────────────────────────────────────────────────────
53
+ let dpo01Status = 'fail';
54
+ let dpo01Evidence = '';
55
+ try {
56
+ delete require.cache[require.resolve('./rlaif-self-audit')];
57
+ const { selfAudit, selfAuditAndLog } = require('./rlaif-self-audit');
58
+
59
+ const event = {
60
+ id: 'proof-dpo01',
61
+ signal: 'positive',
62
+ context: 'RLAIF proof: selfAudit smoke test for DPO-01 verification',
63
+ whatWorked: 'selfAudit returns score in [0,1] with constraint breakdown',
64
+ tags: ['proof', 'rlaif', 'dpo01'],
65
+ rubric: { promotionEligible: true, failingGuardrails: [] },
66
+ timestamp: new Date().toISOString(),
67
+ };
68
+
69
+ const auditResult = selfAudit(event);
70
+
71
+ // Verify score is a finite float in [0, 1]
72
+ const scoreOk = typeof auditResult.score === 'number' &&
73
+ isFinite(auditResult.score) &&
74
+ auditResult.score >= 0 &&
75
+ auditResult.score <= 1;
76
+
77
+ const constraintsOk = Array.isArray(auditResult.constraints) &&
78
+ auditResult.constraints.length === 6;
79
+
80
+ // Test selfAuditAndLog — writes self-score-log.jsonl to tmpDir
81
+ const logResult = selfAuditAndLog(event, { FEEDBACK_DIR: tmpDir });
82
+ const logPath = path.join(tmpDir, 'self-score-log.jsonl');
83
+ const logExists = fs.existsSync(logPath);
84
+
85
+ let logEntryOk = false;
86
+ if (logExists) {
87
+ const line = fs.readFileSync(logPath, 'utf-8').trim().split('\n')[0];
88
+ try {
89
+ const parsed = JSON.parse(line);
90
+ logEntryOk = parsed.feedbackId === 'proof-dpo01' && typeof parsed.score === 'number';
91
+ } catch (_) {
92
+ logEntryOk = false;
93
+ }
94
+ }
95
+
96
+ if (scoreOk && constraintsOk && logExists && logEntryOk) {
97
+ dpo01Status = 'pass';
98
+ dpo01Evidence =
99
+ `selfAudit() returned score=${auditResult.score} (float in [0,1]), ` +
100
+ `constraints.length=${auditResult.constraints.length} (6 CLAUDE.md constraints). ` +
101
+ `selfAuditAndLog() wrote self-score-log.jsonl to ${tmpDir}. ` +
102
+ `Log entry: feedbackId=proof-dpo01, score present. ` +
103
+ `Module: scripts/rlaif-self-audit.js. No API calls — pure heuristic evaluation.`;
104
+ } else {
105
+ dpo01Status = 'fail';
106
+ const issues = [];
107
+ if (!scoreOk) issues.push(`score not in [0,1]: ${auditResult.score}`);
108
+ if (!constraintsOk) issues.push(`constraints.length=${auditResult.constraints ? auditResult.constraints.length : 'none'}, expected 6`);
109
+ if (!logExists) issues.push(`self-score-log.jsonl not written to ${tmpDir}`);
110
+ if (!logEntryOk) issues.push(`log entry invalid or missing feedbackId`);
111
+ dpo01Evidence = `DPO-01 smoke test failed: ${issues.join('; ')}`;
112
+ }
113
+ } catch (err) {
114
+ dpo01Status = 'fail';
115
+ dpo01Evidence = `selfAudit() threw: ${err.message}`;
116
+ }
117
+ addResult('DPO-01', dpo01Status, dpo01Evidence);
118
+
119
+ // ─────────────────────────────────────────────────────────────────────────
120
+ // DPO-02: dpoOptimizer.run() writes dpo-model.json
121
+ // Evidence: call run() with tmpDir, verify dpo-model.json is written
122
+ // ─────────────────────────────────────────────────────────────────────────
123
+ const tmpDirDpo02 = fs.mkdtempSync(path.join(os.tmpdir(), 'prove-dpo02-'));
124
+ let dpo02Status = 'fail';
125
+ let dpo02Evidence = '';
126
+ try {
127
+ delete require.cache[require.resolve('./dpo-optimizer')];
128
+ const { run: dpoRun } = require('./dpo-optimizer');
129
+
130
+ const result = dpoRun({
131
+ feedbackDir: tmpDirDpo02,
132
+ modelPath: path.join(tmpDirDpo02, 'feedback_model.json'),
133
+ });
134
+
135
+ const dpoModelPath = path.join(tmpDirDpo02, 'dpo-model.json');
136
+ const dpoModelExists = fs.existsSync(dpoModelPath);
137
+
138
+ let modelOk = false;
139
+ let modelData = null;
140
+ if (dpoModelExists) {
141
+ try {
142
+ modelData = JSON.parse(fs.readFileSync(dpoModelPath, 'utf-8'));
143
+ modelOk = 'generated' in modelData && 'pairs_processed' in modelData;
144
+ } catch (_) {
145
+ modelOk = false;
146
+ }
147
+ }
148
+
149
+ if (dpoModelExists && modelOk) {
150
+ dpo02Status = 'pass';
151
+ dpo02Evidence =
152
+ `dpoOptimizer.run() completed: pairs_processed=${result.pairs_processed}. ` +
153
+ `dpo-model.json written to ${tmpDirDpo02}. ` +
154
+ `Model fields: generated=${modelData.generated}, pairs_processed=${modelData.pairs_processed}. ` +
155
+ `adjustments=${JSON.stringify(modelData.adjustments || {})}. ` +
156
+ `Module: scripts/dpo-optimizer.js. dpoLogRatio range: [-1, +1]. Pure offline batch optimization.`;
157
+ } else {
158
+ dpo02Status = 'fail';
159
+ const issues = [];
160
+ if (!dpoModelExists) issues.push(`dpo-model.json not written to ${tmpDirDpo02}`);
161
+ if (!modelOk) issues.push(`dpo-model.json missing required fields (generated, pairs_processed)`);
162
+ dpo02Evidence = `DPO-02 smoke test failed: ${issues.join('; ')}`;
163
+ }
164
+ } catch (err) {
165
+ dpo02Status = 'fail';
166
+ dpo02Evidence = `dpoOptimizer.run() threw: ${err.message}`;
167
+ } finally {
168
+ try { fs.rmSync(tmpDirDpo02, { recursive: true, force: true }); } catch (_) {}
169
+ }
170
+ addResult('DPO-02', dpo02Status, dpo02Evidence);
171
+
172
+ // ─────────────────────────────────────────────────────────────────────────
173
+ // DPO-03: extractMetaPolicyRules() produces rules.json when data exists
174
+ // Evidence: seed 3 negative entries in same domain, call run(), verify output
175
+ // ─────────────────────────────────────────────────────────────────────────
176
+ const tmpDirDpo03 = fs.mkdtempSync(path.join(os.tmpdir(), 'prove-dpo03-'));
177
+ let dpo03Status = 'fail';
178
+ let dpo03Evidence = '';
179
+ try {
180
+ // Seed 3 negative memory entries with same domain tags
181
+ const memoryLogPath = path.join(tmpDirDpo03, 'memory-log.jsonl');
182
+ const oldDate = new Date(Date.now() - 10 * 24 * 3600 * 1000).toISOString();
183
+ const seedEntries = [
184
+ {
185
+ id: 'proof-err-1',
186
+ signal: 'negative',
187
+ category: 'error',
188
+ title: 'MISTAKE: verification skipped',
189
+ content: 'How to avoid: Always run tests before claiming done.',
190
+ tags: ['verification', 'testing'],
191
+ context: 'Proof seed entry 1 for DPO-03 meta-policy rule extraction',
192
+ timestamp: oldDate,
193
+ },
194
+ {
195
+ id: 'proof-err-2',
196
+ signal: 'negative',
197
+ category: 'error',
198
+ title: 'MISTAKE: verification skipped again',
199
+ content: 'How to avoid: Run npm test before claiming completion.',
200
+ tags: ['verification', 'testing'],
201
+ context: 'Proof seed entry 2 for DPO-03 meta-policy rule extraction',
202
+ timestamp: oldDate,
203
+ },
204
+ {
205
+ id: 'proof-err-3',
206
+ signal: 'negative',
207
+ category: 'error',
208
+ title: 'MISTAKE: test output not included',
209
+ content: 'How to avoid: Always include test output in evidence.',
210
+ tags: ['verification', 'testing'],
211
+ context: 'Proof seed entry 3 for DPO-03 meta-policy rule extraction',
212
+ timestamp: oldDate,
213
+ },
214
+ ];
215
+ fs.writeFileSync(
216
+ memoryLogPath,
217
+ seedEntries.map((e) => JSON.stringify(e)).join('\n') + '\n',
218
+ );
219
+
220
+ // Invalidate meta-policy + its dependencies so feedbackDir is picked up fresh
221
+ for (const key of Object.keys(require.cache)) {
222
+ if (key.includes('meta-policy') || key.includes('feedback-loop') || key.includes('thompson-sampling')) {
223
+ delete require.cache[key];
224
+ }
225
+ }
226
+ const { run: metaRun } = require('./meta-policy');
227
+ const metaResult = metaRun({ feedbackDir: tmpDirDpo03 });
228
+
229
+ const outPath = path.join(tmpDirDpo03, 'meta-policy-rules.json');
230
+ const outExists = fs.existsSync(outPath);
231
+
232
+ let outOk = false;
233
+ let parsedOut = null;
234
+ if (outExists) {
235
+ try {
236
+ parsedOut = JSON.parse(fs.readFileSync(outPath, 'utf-8'));
237
+ outOk = Array.isArray(parsedOut.rules);
238
+ } catch (_) {
239
+ outOk = false;
240
+ }
241
+ }
242
+
243
+ const ruleCount = outOk ? parsedOut.rules.length : 0;
244
+ const hasRequiredFields = outOk && ruleCount > 0 &&
245
+ parsedOut.rules.every((r) =>
246
+ 'category' in r && 'confidence' in r && 'trend' in r && 'occurrence_count' in r
247
+ );
248
+
249
+ if (outExists && outOk && ruleCount >= 1 && hasRequiredFields) {
250
+ dpo03Status = 'pass';
251
+ dpo03Evidence =
252
+ `extractMetaPolicyRules() produced ${ruleCount} rule(s) from 3 seeded negative entries. ` +
253
+ `meta-policy-rules.json written to ${tmpDirDpo03}. ` +
254
+ `Rules: ${JSON.stringify(parsedOut.rules.map((r) => ({ category: r.category, confidence: r.confidence, trend: r.trend, count: r.occurrence_count })))}. ` +
255
+ `All rules have required fields: category, confidence, trend, occurrence_count. ` +
256
+ `Module: scripts/meta-policy.js. MIN_OCCURRENCES threshold: 2.`;
257
+ } else {
258
+ dpo03Status = 'fail';
259
+ const issues = [];
260
+ if (!outExists) issues.push(`meta-policy-rules.json not written to ${tmpDirDpo03}`);
261
+ if (!outOk) issues.push(`output JSON missing rules array`);
262
+ if (ruleCount < 1) issues.push(`extracted 0 rules from 3 seeded negative entries (expected >= 1)`);
263
+ if (!hasRequiredFields) issues.push(`rules missing required fields`);
264
+ dpo03Evidence = `DPO-03 smoke test failed: ${issues.join('; ')}`;
265
+ }
266
+ } catch (err) {
267
+ dpo03Status = 'fail';
268
+ dpo03Evidence = `meta-policy run() threw: ${err.message}`;
269
+ } finally {
270
+ try { fs.rmSync(tmpDirDpo03, { recursive: true, force: true }); } catch (_) {}
271
+ }
272
+ addResult('DPO-03', dpo03Status, dpo03Evidence);
273
+
274
+ // ─────────────────────────────────────────────────────────────────────────
275
+ // DPO-04: node --test exits 0 for all RLAIF test files; report test count delta
276
+ // Evidence: execSync node --test on 3 RLAIF test files, parse pass/fail counts
277
+ // ─────────────────────────────────────────────────────────────────────────
278
+ let dpo04Status = 'fail';
279
+ let dpo04Evidence = '';
280
+ let rlaifPassCount = 0;
281
+ let rlaifFailCount = 0;
282
+ try {
283
+ const testOutput = execSync(
284
+ 'node --test tests/rlaif-self-audit.test.js tests/dpo-optimizer.test.js tests/meta-policy.test.js 2>&1',
285
+ { cwd: ROOT, timeout: 60000, encoding: 'utf-8' }
286
+ );
287
+
288
+ const passMatch = testOutput.match(/pass\s+(\d+)/);
289
+ const failMatch = testOutput.match(/fail\s+(\d+)/);
290
+ rlaifPassCount = passMatch ? parseInt(passMatch[1], 10) : 0;
291
+ rlaifFailCount = failMatch ? parseInt(failMatch[1], 10) : 0;
292
+
293
+ const meetsRequirement = rlaifPassCount >= 6 && rlaifFailCount === 0;
294
+
295
+ if (meetsRequirement) {
296
+ dpo04Status = 'pass';
297
+ dpo04Evidence =
298
+ `node --test (3 RLAIF test files): pass=${rlaifPassCount}, fail=${rlaifFailCount}. ` +
299
+ `Phase 4 baseline (test:api): ${PHASE4_BASELINE} tests. ` +
300
+ `Phase 5 adds ${rlaifPassCount} new RLAIF tests. ` +
301
+ `Total with RLAIF: ${PHASE4_BASELINE + rlaifPassCount} tests (node-runner only). ` +
302
+ `Files: tests/rlaif-self-audit.test.js (selfAudit, selfAuditAndLog), ` +
303
+ `tests/dpo-optimizer.test.js (dpoLogRatio, buildPreferencePairs, run, applyDpoAdjustments), ` +
304
+ `tests/meta-policy.test.js (extractMetaPolicyRules, run). ` +
305
+ `All tests use tmpdir pattern — zero production feedback dirs touched.`;
306
+ } else {
307
+ dpo04Status = 'fail';
308
+ dpo04Evidence =
309
+ `node --test RLAIF files: pass=${rlaifPassCount}, fail=${rlaifFailCount}. ` +
310
+ `Expected >= 6 passing and 0 failures. ` +
311
+ `${rlaifFailCount > 0 ? `${rlaifFailCount} test(s) failing.` : `Only ${rlaifPassCount} tests passing (need >= 6).`}`;
312
+ }
313
+ } catch (err) {
314
+ // execSync throws if node --test exits non-zero
315
+ const output = err.stdout || err.stderr || err.message || '';
316
+ const outStr = String(output);
317
+ const failMatch = outStr.match(/fail\s+(\d+)/);
318
+ rlaifFailCount = failMatch ? parseInt(failMatch[1], 10) : 1;
319
+ dpo04Status = 'fail';
320
+ dpo04Evidence = `node --test RLAIF files exited non-zero (${rlaifFailCount} failures). Output: ${outStr.slice(0, 500)}`;
321
+ } finally {
322
+ // Clean up DPO-01 tmpDir
323
+ try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch (_) {}
324
+ delete process.env.RLHF_FEEDBACK_DIR;
325
+ }
326
+ addResult('DPO-04', dpo04Status, dpo04Evidence);
327
+
328
+ // ─────────────────────────────────────────────────────────────────────────
329
+ // Write proof artifacts
330
+ // ─────────────────────────────────────────────────────────────────────────
331
+ ensureDir(PROOF_DIR);
332
+
333
+ const jsonPath = path.join(PROOF_DIR, 'rlaif-report.json');
334
+ fs.writeFileSync(jsonPath, `${JSON.stringify(report, null, 2)}\n`);
335
+
336
+ const mdLines = [
337
+ '# RLAIF and DPO Optimization — Proof Report',
338
+ '',
339
+ `Generated: ${report.generated}`,
340
+ `Phase: ${report.phase}`,
341
+ '',
342
+ `**Passed: ${report.summary.passed} | Failed: ${report.summary.failed} | Warned: ${report.summary.warned}**`,
343
+ '',
344
+ '## Requirements',
345
+ '',
346
+ '| Requirement | Status | Evidence |',
347
+ '|-------------|--------|----------|',
348
+ ...Object.entries(report.requirements).map(
349
+ ([reqId, { status: s, evidence }]) =>
350
+ `| ${reqId} | ${s.toUpperCase()} | ${evidence.replace(/\|/g, '\\|').replace(/\n/g, ' ')} |`
351
+ ),
352
+ '',
353
+ '## Requirement Details',
354
+ '',
355
+ ];
356
+
357
+ for (const [reqId, { status: s, evidence }] of Object.entries(report.requirements)) {
358
+ mdLines.push(`### ${reqId} — ${s.toUpperCase()}`);
359
+ mdLines.push('');
360
+ mdLines.push(evidence);
361
+ mdLines.push('');
362
+ }
363
+
364
+ mdLines.push('## Test Count Delta');
365
+ mdLines.push('');
366
+ mdLines.push('| Baseline (Phase 4 test:api) | Phase 5 RLAIF Addition | Total (node-runner) |');
367
+ mdLines.push('|----------------------------|------------------------|---------------------|');
368
+ mdLines.push(`| ${PHASE4_BASELINE} node-runner tests | +${rlaifPassCount} RLAIF tests (3 test files) | ${PHASE4_BASELINE + rlaifPassCount} |`);
369
+ mdLines.push('');
370
+ mdLines.push('Phase 5 (plan-03) added RLAIF test coverage:');
371
+ mdLines.push('- `tests/rlaif-self-audit.test.js` — CONSTRAINTS, selfAudit(), selfAuditAndLog()');
372
+ mdLines.push('- `tests/dpo-optimizer.test.js` — dpoLogRatio(), buildPreferencePairs(), run(), applyDpoAdjustments()');
373
+ mdLines.push('- `tests/meta-policy.test.js` — extractMetaPolicyRules(), run()');
374
+ mdLines.push('');
375
+ mdLines.push('All tests use `fs.mkdtempSync()` tmpdir isolation. Zero production feedback dirs touched.');
376
+ mdLines.push('');
377
+
378
+ const mdPath = path.join(PROOF_DIR, 'rlaif-report.md');
379
+ fs.writeFileSync(mdPath, `${mdLines.join('\n')}\n`);
380
+
381
+ console.log(`Proof written to ${mdPath}`);
382
+ console.log(` and ${jsonPath}`);
383
+ console.log('');
384
+ console.log(JSON.stringify(report.summary, null, 2));
385
+
386
+ const hasFail = report.summary.failed > 0;
387
+ if (hasFail) {
388
+ process.exitCode = 1;
389
+ console.error('\nFAIL — one or more requirements did not pass. See proof/rlaif-report.md for details.');
390
+ } else {
391
+ console.log('\nPASS — all requirements satisfied (warns are acceptable).');
392
+ }
393
+
394
+ return report;
395
+ }
396
+
397
+ module.exports = { runProof };
398
+
399
+ if (require.main === module) {
400
+ runProof().catch((err) => {
401
+ console.error('Fatal error in prove-rlaif.js:', err);
402
+ process.exitCode = 1;
403
+ });
404
+ }