principles-disciple 1.18.0 → 1.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/openclaw.plugin.json +1 -1
- package/package.json +1 -1
- package/src/commands/nocturnal-rollout.ts +2 -0
- package/src/core/merge-gate-audit.ts +506 -0
- package/src/core/nocturnal-export.ts +106 -6
- package/src/core/nocturnal-snapshot-contract.ts +12 -26
- package/src/core/nocturnal-trajectory-extractor.ts +9 -6
- package/src/core/nocturnal-trinity.ts +111 -28
- package/src/core/promotion-gate.ts +33 -0
- package/src/core/replay-engine.ts +25 -0
- package/src/service/evolution-worker.ts +47 -11
- package/src/service/subagent-workflow/nocturnal-workflow-manager.ts +2 -6
- package/tests/core/merge-gate-audit.test.ts +284 -0
- package/tests/core/nocturnal-export.test.ts +55 -0
- package/tests/core/nocturnal-snapshot-contract.test.ts +53 -2
- package/tests/core/nocturnal-trinity.test.ts +77 -4
- package/tests/core/pain-integration.test.ts +27 -0
- package/tests/core/promotion-gate.test.ts +5 -0
- package/tests/core/replay-engine.test.ts +19 -0
- package/tests/service/nocturnal-workflow-manager.test.ts +2 -0
package/openclaw.plugin.json
CHANGED
package/package.json
CHANGED
|
@@ -222,6 +222,8 @@ Checkpoint: ${checkpointId.substring(0, 8)}...
|
|
|
222
222
|
Profile: ${profile}
|
|
223
223
|
Result: ${result.passes ? 'PASS' : 'FAIL'}
|
|
224
224
|
Suggested State: ${result.suggestedState ? formatPromotionState(result.suggestedState, zh) : 'N/A'}
|
|
225
|
+
Evidence Mode: ${result.evidenceSummary.evidenceMode}
|
|
226
|
+
Shadow Samples: ${result.evidenceSummary.shadowSampleCount}
|
|
225
227
|
|
|
226
228
|
--- Delta Check ---
|
|
227
229
|
${result.deltaCheck.passed ? 'PASS' : 'FAIL'} Delta: ${result.deltaCheck.actual >= 0 ? '+' : ''}${result.deltaCheck.actual.toFixed(4)} (threshold: ${result.deltaCheck.threshold.toFixed(4)})
|
|
@@ -0,0 +1,506 @@
|
|
|
1
|
+
import * as fs from 'fs';
|
|
2
|
+
import * as path from 'path';
|
|
3
|
+
import { getImplementationAssetRoot } from './code-implementation-storage.js';
|
|
4
|
+
import { listDatasetRecords } from './nocturnal-dataset.js';
|
|
5
|
+
import { listArtifactLineageRecords } from './nocturnal-artifact-lineage.js';
|
|
6
|
+
import { listExports, verifyExportIntegrity } from './nocturnal-export.js';
|
|
7
|
+
import { OpenClawTrinityRuntimeAdapter } from './nocturnal-trinity.js';
|
|
8
|
+
import { resolvePdPath } from './paths.js';
|
|
9
|
+
import type { ReplayReport } from './replay-engine.js';
|
|
10
|
+
|
|
11
|
+
/**
 * Verdict of a single audit check; the same three values are reused for the
 * aggregated verdict of a whole report.
 */
export type MergeGateAuditStatus = 'pass' | 'block' | 'defer';

/** Outcome of one merge-gate audit check. */
export interface MergeGateAuditCheck {
  /** Identifier of the check, e.g. `queue_path_contract`. */
  id: string;
  /** Verdict produced by the check. */
  status: MergeGateAuditStatus;
  /** One-line human-readable explanation of the verdict. */
  summary: string;
  /** Optional structured data backing the verdict (paths, counts, offending ids). */
  details?: Record<string, unknown>;
}

/** Aggregated result of running every merge-gate audit check. */
export interface MergeGateAuditReport {
  /** Verdict derived from all entries in `checks`. */
  overallStatus: MergeGateAuditStatus;
  /** ISO-8601 timestamp taken when the report was assembled. */
  generatedAt: string;
  /** Absolute workspace directory the audit ran against. */
  workspaceDir: string;
  /** Absolute state directory the audit ran against. */
  stateDir: string;
  /** Individual check results, in execution order. */
  checks: MergeGateAuditCheck[];
  /** Number of checks per verdict. */
  counts: Record<MergeGateAuditStatus, number>;
}
|
|
32
|
+
|
|
33
|
+
/**
 * Report whether `candidatePath` is `parentDir` itself or lies beneath it.
 *
 * The comparison is purely lexical: both paths are made absolute with
 * `path.resolve` and compared via `path.relative`; the filesystem is not
 * consulted.
 *
 * @param parentDir - Directory that should contain the candidate.
 * @param candidatePath - Path to test; relative paths are resolved against the
 *   current working directory.
 * @returns `true` when the candidate equals or is nested inside `parentDir`.
 */
function isWithinDir(parentDir: string, candidatePath: string): boolean {
  const relative = path.relative(path.resolve(parentDir), path.resolve(candidatePath));

  if (relative === '') {
    // Same directory.
    return true;
  }
  if (path.isAbsolute(relative)) {
    // path.relative could not express the target relative to the parent.
    return false;
  }

  // Only a leading ".." *segment* escapes the parent. A plain prefix test would
  // also reject legitimate entries whose names merely begin with two dots
  // (for example "..data" or "..hidden/file.json").
  return relative !== '..' && !relative.startsWith(`..${path.sep}`);
}
|
|
37
|
+
|
|
38
|
+
/**
 * Collapse individual check verdicts into one report verdict.
 *
 * Precedence is `block` over `defer` over `pass`; an empty list yields `pass`.
 *
 * @param checks - Check results to aggregate.
 * @returns The most severe status present among the checks.
 */
function computeOverallStatus(checks: MergeGateAuditCheck[]): MergeGateAuditStatus {
  const observed = new Set(checks.map((entry) => entry.status));

  if (observed.has('block')) {
    return 'block';
  }
  return observed.has('defer') ? 'defer' : 'pass';
}
|
|
47
|
+
|
|
48
|
+
/**
 * Tally how many checks ended in each status.
 *
 * @param checks - Check results to count.
 * @returns Counters for `pass`, `block` and `defer`, each starting at zero.
 */
function countStatuses(checks: MergeGateAuditCheck[]): MergeGateAuditReport['counts'] {
  return checks.reduce<MergeGateAuditReport['counts']>(
    (tally, { status }) => {
      tally[status] += 1;
      return tally;
    },
    { pass: 0, block: 0, defer: 0 },
  );
}
|
|
55
|
+
|
|
56
|
+
/**
 * Verify that the `PAIN_FLAG` path resolved by `resolvePdPath` is exactly
 * `<workspace>/.state/.pain_flag`.
 *
 * @param workspaceDir - Workspace root used for both the resolved and expected path.
 * @returns A `pass` check when the normalized paths match, otherwise a `block`
 *   check whose details carry both paths.
 */
function auditPainFlagPathContract(workspaceDir: string): MergeGateAuditCheck {
  const id = 'pain_flag_path_contract';
  const resolvedPath = path.normalize(resolvePdPath(workspaceDir, 'PAIN_FLAG'));
  const expectedPath = path.normalize(
    path.join(path.resolve(workspaceDir), '.state', '.pain_flag'),
  );

  if (resolvedPath === expectedPath) {
    return {
      id,
      status: 'pass',
      summary: 'Canonical pain flag path resolves to workspace/.state/.pain_flag.',
      details: { resolvedPath },
    };
  }

  return {
    id,
    status: 'block',
    summary: 'Canonical pain flag path does not resolve under workspace/.state/.pain_flag.',
    details: { resolvedPath, expectedPath },
  };
}
|
|
83
|
+
|
|
84
|
+
/**
 * Verify that the `EVOLUTION_QUEUE` path resolved by `resolvePdPath` is exactly
 * `<workspace>/.state/evolution_queue.json`.
 *
 * @param workspaceDir - Workspace root used for both the resolved and expected path.
 * @returns A `pass` check when the normalized paths match, otherwise a `block`
 *   check whose details carry both paths.
 */
function auditQueuePathContract(workspaceDir: string): MergeGateAuditCheck {
  const id = 'queue_path_contract';
  const resolvedPath = path.normalize(resolvePdPath(workspaceDir, 'EVOLUTION_QUEUE'));
  const expectedPath = path.normalize(
    path.join(path.resolve(workspaceDir), '.state', 'evolution_queue.json'),
  );

  if (resolvedPath === expectedPath) {
    return {
      id,
      status: 'pass',
      summary: 'Canonical evolution queue path resolves to workspace/.state/evolution_queue.json.',
      details: { resolvedPath },
    };
  }

  return {
    id,
    status: 'block',
    summary: 'Canonical evolution queue path does not resolve under workspace/.state/evolution_queue.json.',
    details: { resolvedPath, expectedPath },
  };
}
|
|
111
|
+
|
|
112
|
+
/**
 * Check that `OpenClawTrinityRuntimeAdapter` exposes the methods the audit
 * relies on (`isRuntimeAvailable`, `getLastFailureReason`).
 *
 * Only the prototype is inspected. The adapter is deliberately NOT
 * instantiated: construction triggers cleanupStaleTempDirs(), which scans
 * os.tmpdir() and could remove stale temp dirs belonging to other processes.
 *
 * @returns `pass` when both methods are functions on the prototype, else `block`.
 */
function auditRuntimeAdapterContract(): MergeGateAuditCheck {
  const id = 'runtime_adapter_contract';
  const adapterPrototype = OpenClawTrinityRuntimeAdapter.prototype;

  if (
    typeof adapterPrototype.isRuntimeAvailable === 'function' &&
    typeof adapterPrototype.getLastFailureReason === 'function'
  ) {
    return {
      id,
      status: 'pass',
      summary: 'OpenClaw runtime adapter exposes the expected contract-check surface (isRuntimeAvailable, getLastFailureReason).',
    };
  }

  return {
    id,
    status: 'block',
    summary: 'OpenClaw runtime adapter does not expose the expected contract-check surface.',
  };
}
|
|
134
|
+
|
|
135
|
+
/**
 * Verify that every dataset record points at an artifact file that exists and
 * sits inside the workspace.
 *
 * A record whose artifact is missing is reported as missing only; the
 * workspace-boundary test runs solely for artifacts that exist.
 *
 * @param workspaceDir - Workspace whose dataset registry is inspected.
 * @returns `defer` when there are no records, `block` when any record is
 *   missing or out of bounds (details list the sample fingerprints), else `pass`.
 */
function auditDatasetArtifactIntegrity(workspaceDir: string): MergeGateAuditCheck {
  const id = 'dataset_artifact_integrity';
  const records = listDatasetRecords(workspaceDir);
  const recordCount = records.length;

  if (recordCount === 0) {
    return {
      id,
      status: 'defer',
      summary: 'No dataset records found. Dataset artifact integrity cannot be verified yet.',
    };
  }

  const missingArtifacts: string[] = [];
  const outOfWorkspaceArtifacts: string[] = [];
  for (const { artifactPath, sampleFingerprint } of records) {
    if (!fs.existsSync(artifactPath)) {
      missingArtifacts.push(sampleFingerprint);
    } else if (!isWithinDir(workspaceDir, artifactPath)) {
      outOfWorkspaceArtifacts.push(sampleFingerprint);
    }
  }

  const allClean = missingArtifacts.length === 0 && outOfWorkspaceArtifacts.length === 0;
  if (allClean) {
    return {
      id,
      status: 'pass',
      summary: 'All dataset artifacts exist and remain inside the workspace boundary.',
      details: { recordCount },
    };
  }

  return {
    id,
    status: 'block',
    summary: 'Dataset registry points to missing artifacts or paths outside the workspace boundary.',
    details: { recordCount, missingArtifacts, outOfWorkspaceArtifacts },
  };
}
|
|
180
|
+
|
|
181
|
+
/**
 * Verify that every artifact lineage record points at a storage path that
 * exists and sits inside the workspace.
 *
 * A record whose storage path is missing is reported as missing only; the
 * workspace-boundary test runs solely for paths that exist.
 *
 * @param workspaceDir - Workspace whose lineage records are inspected.
 * @returns `defer` when there are no records, `block` when any record is
 *   missing or out of bounds (details list the artifact ids), else `pass`.
 */
function auditArtifactLineageIntegrity(workspaceDir: string): MergeGateAuditCheck {
  const id = 'artifact_lineage_integrity';
  const records = listArtifactLineageRecords(workspaceDir);
  const recordCount = records.length;

  if (recordCount === 0) {
    return {
      id,
      status: 'defer',
      summary: 'No artifact lineage records found. Lineage integrity cannot be verified yet.',
    };
  }

  const missingStoragePaths: string[] = [];
  const outOfWorkspaceStoragePaths: string[] = [];
  for (const { storagePath, artifactId } of records) {
    if (!fs.existsSync(storagePath)) {
      missingStoragePaths.push(artifactId);
    } else if (!isWithinDir(workspaceDir, storagePath)) {
      outOfWorkspaceStoragePaths.push(artifactId);
    }
  }

  const allClean = missingStoragePaths.length === 0 && outOfWorkspaceStoragePaths.length === 0;
  if (allClean) {
    return {
      id,
      status: 'pass',
      summary: 'All lineage storage paths exist and remain inside the workspace boundary.',
      details: { recordCount },
    };
  }

  return {
    id,
    status: 'block',
    summary: 'Artifact lineage points to missing files or paths outside the workspace boundary.',
    details: { recordCount, missingStoragePaths, outOfWorkspaceStoragePaths },
  };
}
|
|
226
|
+
|
|
227
|
+
/**
 * Verify every ORPO export listed for the workspace.
 *
 * An export whose payload file is absent is reported as missing and is not
 * passed to `verifyExportIntegrity`; every other export must come back from
 * `verifyExportIntegrity` with a truthy `valid` flag.
 *
 * @param workspaceDir - Workspace whose exports are inspected.
 * @returns `defer` when no exports exist, `block` when any export is missing
 *   or invalid (details list the export ids), else `pass`.
 */
function auditOrpoExportIntegrity(workspaceDir: string): MergeGateAuditCheck {
  const id = 'orpo_export_integrity';
  const manifests = listExports(workspaceDir);
  const exportCount = manifests.length;

  if (exportCount === 0) {
    return {
      id,
      status: 'defer',
      summary: 'No ORPO exports found. Export integrity cannot be verified yet.',
    };
  }

  const invalidExportIds: string[] = [];
  const missingExportFiles: string[] = [];
  for (const { exportPath, exportId } of manifests) {
    if (!fs.existsSync(exportPath)) {
      missingExportFiles.push(exportId);
      continue;
    }

    const integrity = verifyExportIntegrity(workspaceDir, exportId);
    const verified = Boolean(integrity && integrity.valid);
    if (!verified) {
      invalidExportIds.push(exportId);
    }
  }

  const allClean = invalidExportIds.length === 0 && missingExportFiles.length === 0;
  if (allClean) {
    return {
      id,
      status: 'pass',
      summary: 'All ORPO exports pass manifest fingerprint verification.',
      details: { exportCount },
    };
  }

  return {
    id,
    status: 'block',
    summary: 'ORPO export manifests or payloads failed integrity verification.',
    details: { exportCount, invalidExportIds, missingExportFiles },
  };
}
|
|
274
|
+
|
|
275
|
+
/**
 * Shallow structural guard for a parsed replay report.
 *
 * Requires string `overallDecision`, `generatedAt` and `implementationId`, an
 * `evidenceSummary` that is not `undefined`, and an array `blockers`. The
 * contents of `evidenceSummary` and `blockers` are not inspected here.
 *
 * @param value - Arbitrary parsed value.
 * @returns `true` when the value has the fields listed above.
 */
function isReplayReportShape(value: unknown): value is ReplayReport {
  if (typeof value !== 'object' || value === null) {
    return false;
  }

  const candidate = value as Partial<ReplayReport>;
  if (typeof candidate.overallDecision !== 'string') return false;
  if (typeof candidate.generatedAt !== 'string') return false;
  if (typeof candidate.implementationId !== 'string') return false;
  if (candidate.evidenceSummary === undefined) return false;
  return Array.isArray(candidate.blockers);
}
|
|
289
|
+
|
|
290
|
+
/**
 * Gather the paths of all replay report files stored for any implementation.
 *
 * Every sub-directory of `<stateDir>/principles/implementations` is treated as
 * an implementation id; for each one, the regular `*.json` files directly
 * inside the `replays` folder of its asset root are collected.
 *
 * @param stateDir - State directory to scan.
 * @returns Paths of the replay report files; empty when the implementations
 *   directory does not exist.
 */
function collectReplayReportPaths(stateDir: string): string[] {
  const implementationsRoot = path.join(stateDir, 'principles', 'implementations');
  if (!fs.existsSync(implementationsRoot)) {
    return [];
  }

  const collected: string[] = [];
  for (const implementationEntry of fs.readdirSync(implementationsRoot, { withFileTypes: true })) {
    if (!implementationEntry.isDirectory()) {
      continue;
    }

    const replaysDir = path.join(
      getImplementationAssetRoot(stateDir, implementationEntry.name),
      'replays',
    );
    if (!fs.existsSync(replaysDir)) {
      continue;
    }

    for (const replayEntry of fs.readdirSync(replaysDir, { withFileTypes: true })) {
      if (replayEntry.isFile() && replayEntry.name.endsWith('.json')) {
        collected.push(path.join(replaysDir, replayEntry.name));
      }
    }
  }
  return collected;
}
|
|
315
|
+
|
|
316
|
+
/**
 * Classification assigned to one replay report file:
 * - `io_error`: the file could not be read;
 * - `malformed`: not valid JSON, or lacking the basic report fields;
 * - `missing_evidence_summary`: the evidence summary has the wrong shape;
 * - `unsupported_pass`: decision `pass` backed by zero samples;
 * - `empty_needs_review`: decision `needs-review` backed by zero samples;
 * - `valid`: none of the above.
 */
type ReplayValidationCategory =
  | 'io_error'
  | 'malformed'
  | 'missing_evidence_summary'
  | 'unsupported_pass'
  | 'empty_needs_review'
  | 'valid';
|
|
326
|
+
|
|
327
|
+
/**
 * Check that a parsed report carries a usable `evidenceSummary`.
 *
 * The summary must be truthy, have a string `evidenceStatus` and a numeric
 * `totalSamples`.
 *
 * @param parsed - Arbitrary parsed value.
 * @returns `true` when the evidence summary has both fields with those types.
 */
function hasValidEvidenceSummary(parsed: unknown): boolean {
  if (typeof parsed !== 'object' || parsed === null) {
    return false;
  }

  const summary = (parsed as Partial<ReplayReport>).evidenceSummary as
    | Partial<ReplayReport['evidenceSummary']>
    | undefined;
  if (!summary) {
    return false;
  }

  return typeof summary.evidenceStatus === 'string' && typeof summary.totalSamples === 'number';
}
|
|
340
|
+
|
|
341
|
+
/**
 * Read one replay report file and classify it.
 *
 * @param reportPath - Path of the replay report JSON file.
 * @returns `io_error` when the file cannot be read; `malformed` when it is not
 *   JSON or fails `isReplayReportShape`; `missing_evidence_summary` when
 *   `hasValidEvidenceSummary` rejects it; `unsupported_pass` or
 *   `empty_needs_review` when the respective decision rests on zero samples;
 *   otherwise `valid`.
 */
function validateSingleReplayReport(reportPath: string): ReplayValidationCategory {
  let fileText: string;
  try {
    fileText = fs.readFileSync(reportPath, 'utf-8');
  } catch {
    return 'io_error';
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(fileText);
  } catch {
    return 'malformed';
  }

  if (!isReplayReportShape(parsed)) {
    return 'malformed';
  }
  if (!hasValidEvidenceSummary(parsed)) {
    return 'missing_evidence_summary';
  }

  // Reports backed by at least one sample are accepted whatever the decision.
  if (parsed.evidenceSummary.totalSamples !== 0) {
    return 'valid';
  }

  // Zero samples: only these two decisions are treated as unsafe.
  switch (parsed.overallDecision) {
    case 'pass':
      return 'unsupported_pass';
    case 'needs-review':
      return 'empty_needs_review';
    default:
      return 'valid';
  }
}
|
|
378
|
+
|
|
379
|
+
/**
 * Replay report paths grouped by the validation problem found in each file.
 * Reports classified as `valid` are not recorded.
 */
interface ReplayValidationResults {
  /** Files that could not be read. */
  ioErrorReports: string[];
  /** Files that are not JSON or lack the basic report fields. */
  malformedReports: string[];
  /** Files whose evidence summary has the wrong shape. */
  missingEvidenceSummary: string[];
  /** Files with decision `pass` backed by zero samples. */
  unsupportedPassingReports: string[];
  /** Files with decision `needs-review` backed by zero samples. */
  emptyEvidenceNeedsReview: string[];
}
|
|
389
|
+
|
|
390
|
+
/**
 * Validate each replay report and group the failing paths by category.
 *
 * @param reportPaths - Replay report files to validate.
 * @returns One list of paths per failure category; reports classified as
 *   `valid` appear in no list.
 */
function categorizeReplayReports(reportPaths: string[]): ReplayValidationResults {
  const results: ReplayValidationResults = {
    ioErrorReports: [],
    malformedReports: [],
    missingEvidenceSummary: [],
    unsupportedPassingReports: [],
    emptyEvidenceNeedsReview: [],
  };

  // 'valid' intentionally has no bucket: such reports need no action.
  const bucketByCategory: Partial<Record<ReplayValidationCategory, string[]>> = {
    io_error: results.ioErrorReports,
    malformed: results.malformedReports,
    missing_evidence_summary: results.missingEvidenceSummary,
    unsupported_pass: results.unsupportedPassingReports,
    empty_needs_review: results.emptyEvidenceNeedsReview,
  };

  for (const reportPath of reportPaths) {
    const bucket = bucketByCategory[validateSingleReplayReport(reportPath)];
    if (bucket) {
      bucket.push(reportPath);
    }
  }

  return results;
}
|
|
423
|
+
|
|
424
|
+
/**
 * Tell whether any failure category contains at least one report.
 *
 * @param results - Categorized validation outcome.
 * @returns `true` when at least one of the five lists is non-empty.
 */
function hasValidationFailures(results: ReplayValidationResults): boolean {
  const failureBuckets = [
    results.malformedReports,
    results.ioErrorReports,
    results.missingEvidenceSummary,
    results.unsupportedPassingReports,
    results.emptyEvidenceNeedsReview,
  ];
  return failureBuckets.some((bucket) => bucket.length > 0);
}
|
|
433
|
+
|
|
434
|
+
/**
 * Audit the replay reports found under the state directory.
 *
 * @param stateDir - State directory whose replay reports are validated.
 * @returns `defer` when no reports exist, `block` when any report falls into a
 *   failure category (details carry the per-category path lists), else `pass`.
 */
function auditReplayEvidenceIntegrity(stateDir: string): MergeGateAuditCheck {
  const id = 'replay_evidence_integrity';
  const replayReportPaths = collectReplayReportPaths(stateDir);
  const reportCount = replayReportPaths.length;

  if (reportCount === 0) {
    return {
      id,
      status: 'defer',
      summary: 'No replay reports found. Replay evidence integrity cannot be verified yet.',
    };
  }

  const results = categorizeReplayReports(replayReportPaths);
  if (!hasValidationFailures(results)) {
    return {
      id,
      status: 'pass',
      summary: 'Replay reports include evidence summaries and no empty-evidence unsafe verdicts.',
      details: { reportCount },
    };
  }

  return {
    id,
    status: 'block',
    summary: 'Replay reports contain malformed payloads, I/O errors, empty-evidence passes, or zero-evidence needs-review verdicts.',
    details: { reportCount, ...results },
  };
}
|
|
468
|
+
|
|
469
|
+
/**
 * Run every merge-gate audit check and assemble the report.
 *
 * Checks run in a fixed order: pain-flag path, queue path, runtime adapter,
 * dataset artifacts, artifact lineage, ORPO exports, replay evidence.
 *
 * @param workspaceDir - Workspace directory handed to the workspace-scoped checks.
 * @param stateDir - State directory handed to the replay evidence check.
 * @returns The report with per-check results, status counts and overall
 *   verdict; both directories are recorded as absolute paths.
 */
export function runMergeGateAudit(workspaceDir: string, stateDir: string): MergeGateAuditReport {
  const checks: MergeGateAuditCheck[] = [];

  // Path and adapter contracts.
  checks.push(auditPainFlagPathContract(workspaceDir));
  checks.push(auditQueuePathContract(workspaceDir));
  checks.push(auditRuntimeAdapterContract());

  // On-disk integrity of datasets, lineage, exports and replay evidence.
  checks.push(auditDatasetArtifactIntegrity(workspaceDir));
  checks.push(auditArtifactLineageIntegrity(workspaceDir));
  checks.push(auditOrpoExportIntegrity(workspaceDir));
  checks.push(auditReplayEvidenceIntegrity(stateDir));

  const overallStatus = computeOverallStatus(checks);
  const counts = countStatuses(checks);

  return {
    overallStatus,
    generatedAt: new Date().toISOString(),
    workspaceDir: path.resolve(workspaceDir),
    stateDir: path.resolve(stateDir),
    checks,
    counts,
  };
}
|
|
489
|
+
|
|
490
|
+
/**
 * Render an audit report as plain text.
 *
 * The output is a six-line header, a blank line, then one
 * `[STATUS] id: summary` line per check, terminated by a newline. Check
 * `details` are not rendered.
 *
 * @param report - Report to render.
 * @returns The multi-line text representation.
 */
export function formatMergeGateAuditReport(report: MergeGateAuditReport): string {
  const { pass, block, defer } = report.counts;

  const headerLines = [
    '=== Merge Gate Audit ===',
    `Overall Status: ${report.overallStatus.toUpperCase()}`,
    `Generated At: ${report.generatedAt}`,
    `Workspace: ${report.workspaceDir}`,
    `State Dir: ${report.stateDir}`,
    `Counts: pass=${pass}, block=${block}, defer=${defer}`,
  ];
  const checkLines = report.checks.map(
    (entry) => `[${entry.status.toUpperCase()}] ${entry.id}: ${entry.summary}`,
  );

  return [...headerLines, '', ...checkLines].join('\n') + '\n';
}
|
|
@@ -52,6 +52,10 @@ import {
|
|
|
52
52
|
readDatasetArtifact,
|
|
53
53
|
type NocturnalDatasetRecord,
|
|
54
54
|
} from './nocturnal-dataset.js';
|
|
55
|
+
import {
|
|
56
|
+
listArtifactLineageRecords,
|
|
57
|
+
type ArtifactLineageRecord,
|
|
58
|
+
} from './nocturnal-artifact-lineage.js';
|
|
55
59
|
import { NocturnalPathResolver } from './nocturnal-paths.js';
|
|
56
60
|
|
|
57
61
|
// ---------------------------------------------------------------------------
|
|
@@ -81,6 +85,23 @@ export interface ORPOSample {
|
|
|
81
85
|
exportedAt: string;
|
|
82
86
|
exportId: string;
|
|
83
87
|
datasetFingerprint: string;
|
|
88
|
+
evidenceSummary: ORPOEvidenceSummary;
|
|
89
|
+
};
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
/** Whether a kind of source evidence was seen, was not seen, or cannot be determined. */
export type EvidenceState = 'observed' | 'not_observed' | 'unknown';

/** Evidence attached to an exported ORPO sample's metadata. */
export interface ORPOEvidenceSummary {
  /** `observed` when a lineage record was found for the sample, otherwise `unknown`. */
  lineageStatus: 'observed' | 'unknown';
  /** Pain signals recorded for the sample; `count` may be `null`. */
  painSignals: { status: EvidenceState; count: number | null; ids: string[] };
  /** Gate blocks recorded for the sample; `count` may be `null`. */
  gateBlocks: { status: EvidenceState; count: number | null; ids: string[] };
}
|
|
86
107
|
|
|
@@ -140,10 +161,12 @@ function computeDatasetFingerprint(sampleFingerprints: string[]): string {
|
|
|
140
161
|
function serializeORPOSample(
|
|
141
162
|
record: NocturnalDatasetRecord,
|
|
142
163
|
artifact: ReturnType<typeof readDatasetArtifact>,
|
|
164
|
+
evidenceSummary: ORPOEvidenceSummary,
|
|
143
165
|
exportId: string,
|
|
144
166
|
datasetFingerprint: string
|
|
145
167
|
): ORPOSample {
|
|
146
168
|
const now = new Date().toISOString();
|
|
169
|
+
const rejected = buildEvidenceBoundedRejected(artifact, evidenceSummary);
|
|
147
170
|
|
|
148
171
|
return {
|
|
149
172
|
sampleFingerprint: record.sampleFingerprint,
|
|
@@ -151,12 +174,11 @@ function serializeORPOSample(
|
|
|
151
174
|
sessionId: record.sessionId,
|
|
152
175
|
principleId: record.principleId,
|
|
153
176
|
targetModelFamily: record.targetModelFamily as string, // validated non-null by caller
|
|
154
|
-
//
|
|
155
|
-
|
|
156
|
-
prompt: artifact.badDecision,
|
|
177
|
+
// Export only evidence-bounded narratives. Free-form artifact text can overstate what was observed.
|
|
178
|
+
prompt: rejected,
|
|
157
179
|
chosen: artifact.betterDecision,
|
|
158
|
-
rejected
|
|
159
|
-
rationale:
|
|
180
|
+
rejected,
|
|
181
|
+
rationale: buildEvidenceBoundedRationale(evidenceSummary),
|
|
160
182
|
datasetMetadata: {
|
|
161
183
|
sampleFingerprint: record.sampleFingerprint,
|
|
162
184
|
artifactPath: record.artifactPath,
|
|
@@ -164,10 +186,83 @@ function serializeORPOSample(
|
|
|
164
186
|
exportedAt: now,
|
|
165
187
|
exportId,
|
|
166
188
|
datasetFingerprint,
|
|
189
|
+
evidenceSummary,
|
|
190
|
+
},
|
|
191
|
+
};
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
/**
 * Derive the evidence summary for a sample from its lineage record.
 *
 * @param lineageRecord - Lineage record of the sample, or `null` when none was found.
 * @returns With no record, everything is `unknown` with `null` counts. With a
 *   record, lineage is `observed` and each evidence kind is `observed` or
 *   `not_observed` depending on whether its id list is non-empty; the id lists
 *   are copied.
 */
function buildEvidenceSummary(
  lineageRecord: ArtifactLineageRecord | null
): ORPOEvidenceSummary {
  if (!lineageRecord) {
    return {
      lineageStatus: 'unknown',
      painSignals: { status: 'unknown', count: null, ids: [] },
      gateBlocks: { status: 'unknown', count: null, ids: [] },
    };
  }

  // Defensive: old lineage files on disk may lack the id fields entirely.
  const summarize = (
    sourceIds: string[] | null | undefined
  ): ORPOEvidenceSummary['painSignals'] => {
    const known = sourceIds ?? [];
    return {
      status: known.length > 0 ? 'observed' : 'not_observed',
      count: known.length,
      ids: [...known],
    };
  };

  return {
    lineageStatus: 'observed',
    painSignals: summarize(lineageRecord.sourcePainIds),
    gateBlocks: summarize(lineageRecord.sourceGateBlockIds),
  };
}
|
|
170
225
|
|
|
226
|
+
/**
 * Build the "rejected" narrative for an export using only recorded evidence.
 *
 * The artifact's own text is never copied into the result; `badDecision` is
 * consulted only to choose between two fixed sentence openings.
 *
 * @param artifact - Dataset artifact of the sample.
 * @param evidenceSummary - Evidence recorded for the sample.
 * @returns A fixed sentence when lineage is unknown or no evidence kind is
 *   observed with a non-zero count; otherwise a sentence citing the observed
 *   pain-signal and/or gate-block counts.
 */
function buildEvidenceBoundedRejected(
  artifact: ReturnType<typeof readDatasetArtifact>,
  evidenceSummary: ORPOEvidenceSummary
): string {
  if (evidenceSummary.lineageStatus === 'unknown') {
    return 'Take the next action without verified source evidence.';
  }

  const { painSignals, gateBlocks } = evidenceSummary;
  const candidateClauses = [
    painSignals.status === 'observed' && painSignals.count
      ? `continue despite ${painSignals.count} observed pain signals`
      : null,
    gateBlocks.status === 'observed' && gateBlocks.count
      ? `ignore ${gateBlocks.count} observed gate blocks`
      : null,
  ];
  const clauses = candidateClauses.filter((clause): clause is string => clause !== null);

  if (clauses.length === 0) {
    return 'Proceed without first verifying the relevant state from the source session.';
  }

  const hasBadDecisionText = artifact.badDecision.trim().length > 0;
  const opening = hasBadDecisionText
    ? 'Proceed with the rejected action and '
    : 'Take the rejected action and ';
  return `${opening}${clauses.join(' and ')}.`;
}
|
|
251
|
+
|
|
252
|
+
/**
 * Build the rationale text for an export using only recorded evidence.
 *
 * @param evidenceSummary - Evidence recorded for the sample; `null` counts are
 *   treated as zero.
 * @returns A neutral sentence when lineage is unknown, a "no recorded
 *   failures" sentence when both counts are zero, otherwise a sentence citing
 *   both counts.
 */
function buildEvidenceBoundedRationale(evidenceSummary: ORPOEvidenceSummary): string {
  const { lineageStatus, painSignals, gateBlocks } = evidenceSummary;
  if (lineageStatus === 'unknown') {
    return 'Source evidence is unknown. Export uses a neutral rationale instead of narrating unverified failures or violations.';
  }

  const painCount = painSignals.count ?? 0;
  const gateCount = gateBlocks.count ?? 0;
  const nothingRecorded = painCount === 0 && gateCount === 0;

  return nothingRecorded
    ? 'Source lineage is present but records no pain signals or gate blocks. Export keeps the corrective preference while avoiding invented failure narratives.'
    : `Observed source evidence: ${painCount} pain signals and ${gateCount} gate blocks. Prefer the bounded corrective action over repeating the rejected choice.`;
}
|
|
265
|
+
|
|
171
266
|
// ---------------------------------------------------------------------------
|
|
172
267
|
// Core Export Function
|
|
173
268
|
// ---------------------------------------------------------------------------
|
|
@@ -187,6 +282,7 @@ export function exportORPOSamples(
|
|
|
187
282
|
): ExportResult {
|
|
188
283
|
const exportId = crypto.randomUUID();
|
|
189
284
|
const now = new Date().toISOString();
|
|
285
|
+
const lineageRecords = listArtifactLineageRecords(workspaceDir, 'behavioral-sample');
|
|
190
286
|
|
|
191
287
|
// Step 1: Collect eligible records
|
|
192
288
|
// Use listDatasetRecords directly to have full control over the family filter
|
|
@@ -253,8 +349,12 @@ export function exportORPOSamples(
|
|
|
253
349
|
continue;
|
|
254
350
|
}
|
|
255
351
|
|
|
352
|
+
const lineageRecord =
|
|
353
|
+
lineageRecords.find((candidate) => candidate.artifactId === record.artifactId) ?? null;
|
|
354
|
+
const evidenceSummary = buildEvidenceSummary(lineageRecord);
|
|
355
|
+
|
|
256
356
|
// Serialize
|
|
257
|
-
orpoSamples.push(serializeORPOSample(record, artifact, exportId, ''));
|
|
357
|
+
orpoSamples.push(serializeORPOSample(record, artifact, evidenceSummary, exportId, ''));
|
|
258
358
|
}
|
|
259
359
|
|
|
260
360
|
// Step 4: Fail if all samples failed validation
|