principles-disciple 1.17.0 → 1.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/openclaw.plugin.json +1 -1
- package/package.json +1 -1
- package/src/commands/nocturnal-rollout.ts +2 -0
- package/src/core/merge-gate-audit.ts +506 -0
- package/src/core/nocturnal-compliance.ts +1 -0
- package/src/core/nocturnal-export.ts +106 -6
- package/src/core/nocturnal-trinity.ts +559 -153
- package/src/core/promotion-gate.ts +33 -0
- package/src/core/replay-engine.ts +25 -0
- package/src/service/evolution-worker.ts +13 -6
- package/src/service/nocturnal-target-selector.ts +9 -2
- package/src/service/subagent-workflow/nocturnal-workflow-manager.ts +2 -6
- package/templates/langs/zh/skills/ai-sprint-orchestration/references/specs/nocturnal-trinity-quality-enhancement.json +111 -0
- package/templates/langs/zh/skills/ai-sprint-orchestration/scripts/lib/task-specs.mjs +1 -1
- package/templates/langs/zh/skills/ai-sprint-orchestration/scripts/run.mjs +1 -1
- package/tests/core/merge-gate-audit.test.ts +284 -0
- package/tests/core/nocturnal-export.test.ts +55 -0
- package/tests/core/nocturnal-trinity.test.ts +77 -4
- package/tests/core/pain-integration.test.ts +27 -0
- package/tests/core/promotion-gate.test.ts +5 -0
- package/tests/core/replay-engine.test.ts +19 -0
- package/tests/service/evolution-worker.nocturnal.test.ts +0 -547
- package/tests/service/nocturnal-workflow-manager.test.ts +2 -0
|
@@ -277,6 +277,12 @@ export interface PromotionGateResult {
|
|
|
277
277
|
threshold: number;
|
|
278
278
|
passed: boolean;
|
|
279
279
|
};
|
|
280
|
+
|
|
281
|
+
evidenceSummary: {
|
|
282
|
+
evidenceMode: 'shadow' | 'eval-proxy' | 'mixed';
|
|
283
|
+
shadowSampleCount: number;
|
|
284
|
+
deltaSource: 'eval';
|
|
285
|
+
};
|
|
280
286
|
}
|
|
281
287
|
|
|
282
288
|
/**
|
|
@@ -337,6 +343,11 @@ export function evaluatePromotionGate(
|
|
|
337
343
|
blockers,
|
|
338
344
|
constraintChecks: [],
|
|
339
345
|
deltaCheck: { actual: 0, threshold: minDelta, passed: false },
|
|
346
|
+
evidenceSummary: {
|
|
347
|
+
evidenceMode: 'eval-proxy',
|
|
348
|
+
shadowSampleCount: 0,
|
|
349
|
+
deltaSource: 'eval',
|
|
350
|
+
},
|
|
340
351
|
};
|
|
341
352
|
}
|
|
342
353
|
|
|
@@ -351,6 +362,11 @@ export function evaluatePromotionGate(
|
|
|
351
362
|
blockers,
|
|
352
363
|
constraintChecks: [],
|
|
353
364
|
deltaCheck: { actual: 0, threshold: minDelta, passed: false },
|
|
365
|
+
evidenceSummary: {
|
|
366
|
+
evidenceMode: 'eval-proxy',
|
|
367
|
+
shadowSampleCount: 0,
|
|
368
|
+
deltaSource: 'eval',
|
|
369
|
+
},
|
|
354
370
|
};
|
|
355
371
|
}
|
|
356
372
|
|
|
@@ -366,6 +382,11 @@ export function evaluatePromotionGate(
|
|
|
366
382
|
blockers,
|
|
367
383
|
constraintChecks: [],
|
|
368
384
|
deltaCheck: { actual: 0, threshold: minDelta, passed: false },
|
|
385
|
+
evidenceSummary: {
|
|
386
|
+
evidenceMode: 'eval-proxy',
|
|
387
|
+
shadowSampleCount: 0,
|
|
388
|
+
deltaSource: 'eval',
|
|
389
|
+
},
|
|
369
390
|
};
|
|
370
391
|
}
|
|
371
392
|
|
|
@@ -496,12 +517,24 @@ export function evaluatePromotionGate(
|
|
|
496
517
|
suggestedState = 'rejected';
|
|
497
518
|
}
|
|
498
519
|
|
|
520
|
+
const evidenceMode =
|
|
521
|
+
arbiterRejectSource === 'shadow' && executabilityRejectSource === 'shadow'
|
|
522
|
+
? 'shadow'
|
|
523
|
+
: arbiterRejectSource === 'eval-proxy' && executabilityRejectSource === 'eval-proxy'
|
|
524
|
+
? 'eval-proxy'
|
|
525
|
+
: 'mixed';
|
|
526
|
+
|
|
499
527
|
return {
|
|
500
528
|
passes: allPassed,
|
|
501
529
|
suggestedState,
|
|
502
530
|
blockers,
|
|
503
531
|
constraintChecks,
|
|
504
532
|
deltaCheck,
|
|
533
|
+
evidenceSummary: {
|
|
534
|
+
evidenceMode,
|
|
535
|
+
shadowSampleCount: shadowStats?.totalCount ?? 0,
|
|
536
|
+
deltaSource: 'eval',
|
|
537
|
+
},
|
|
505
538
|
};
|
|
506
539
|
}
|
|
507
540
|
|
|
@@ -63,6 +63,15 @@ export interface ReplayReport {
|
|
|
63
63
|
principleAnchor: ClassificationSummary;
|
|
64
64
|
};
|
|
65
65
|
blockers: string[];
|
|
66
|
+
evidenceSummary: {
|
|
67
|
+
evidenceStatus: 'observed' | 'empty';
|
|
68
|
+
totalSamples: number;
|
|
69
|
+
classifiedCounts: {
|
|
70
|
+
painNegative: number;
|
|
71
|
+
successPositive: number;
|
|
72
|
+
principleAnchor: number;
|
|
73
|
+
};
|
|
74
|
+
};
|
|
66
75
|
generatedAt: string;
|
|
67
76
|
implementationId: string;
|
|
68
77
|
sampleFingerprints: string[];
|
|
@@ -432,6 +441,11 @@ export class ReplayEngine {
|
|
|
432
441
|
const successSummary = toSummary(successPositive);
|
|
433
442
|
const anchorSummary = toSummary(principleAnchor);
|
|
434
443
|
const blockers: string[] = [];
|
|
444
|
+
const totalSamples = results.length;
|
|
445
|
+
|
|
446
|
+
if (totalSamples === 0) {
|
|
447
|
+
blockers.push('NO REPLAY EVIDENCE: No classified replay samples were available. Report cannot justify promotion-quality conclusions.');
|
|
448
|
+
}
|
|
435
449
|
|
|
436
450
|
for (const leak of painSummary.details.filter((result) => !result.passed)) {
|
|
437
451
|
blockers.push(
|
|
@@ -459,6 +473,15 @@ export class ReplayEngine {
|
|
|
459
473
|
principleAnchor: anchorSummary,
|
|
460
474
|
},
|
|
461
475
|
blockers,
|
|
476
|
+
evidenceSummary: {
|
|
477
|
+
evidenceStatus: totalSamples > 0 ? 'observed' : 'empty',
|
|
478
|
+
totalSamples,
|
|
479
|
+
classifiedCounts: {
|
|
480
|
+
painNegative: painSummary.total,
|
|
481
|
+
successPositive: successSummary.total,
|
|
482
|
+
principleAnchor: anchorSummary.total,
|
|
483
|
+
},
|
|
484
|
+
},
|
|
462
485
|
generatedAt: new Date().toISOString(),
|
|
463
486
|
implementationId,
|
|
464
487
|
sampleFingerprints: results.map((result) => result.sampleFingerprint),
|
|
@@ -471,6 +494,7 @@ export class ReplayEngine {
|
|
|
471
494
|
success: ClassificationSummary,
|
|
472
495
|
anchor: ClassificationSummary
|
|
473
496
|
): 'pass' | 'fail' | 'needs-review' {
|
|
497
|
+
if (pain.total + success.total + anchor.total === 0) return 'needs-review';
|
|
474
498
|
if (pain.failed > 0) return 'fail';
|
|
475
499
|
if (anchor.failed > 0) return 'fail';
|
|
476
500
|
if (success.failed > 0) return 'needs-review';
|
|
@@ -526,6 +550,7 @@ export function formatReplayReport(report: ReplayReport): string {
|
|
|
526
550
|
output += `Implementation: ${report.implementationId}\n`;
|
|
527
551
|
output += `Generated At: ${report.generatedAt}\n`;
|
|
528
552
|
output += `Overall Decision: [${decisionEmoji}]\n\n`;
|
|
553
|
+
output += `Evidence Status: ${report.evidenceSummary.evidenceStatus} (samples=${report.evidenceSummary.totalSamples})\n\n`;
|
|
529
554
|
|
|
530
555
|
const formatSection = (
|
|
531
556
|
label: string,
|
|
@@ -173,7 +173,7 @@ let timeoutId: NodeJS.Timeout | null = null;
|
|
|
173
173
|
* Old queue items (without taskKind) are migrated to pain_diagnosis for compatibility.
|
|
174
174
|
*/
|
|
175
175
|
export type QueueStatus = 'pending' | 'in_progress' | 'completed' | 'failed' | 'canceled';
|
|
176
|
-
export type TaskResolution = 'marker_detected' | 'auto_completed_timeout' | 'failed_max_retries' | 'runtime_unavailable' | 'canceled' | 'late_marker_principle_created' | 'late_marker_no_principle' | 'stub_fallback';
|
|
176
|
+
export type TaskResolution = 'marker_detected' | 'auto_completed_timeout' | 'failed_max_retries' | 'runtime_unavailable' | 'canceled' | 'late_marker_principle_created' | 'late_marker_no_principle' | 'stub_fallback' | 'skipped_thin_violation';
|
|
177
177
|
|
|
178
178
|
/**
|
|
179
179
|
* Recent pain context attached to sleep_reflection tasks.
|
|
@@ -1595,13 +1595,14 @@ async function processEvolutionQueue(wctx: WorkspaceContext, logger: PluginLogge
|
|
|
1595
1595
|
const errorReason = lastEvent?.reason ?? 'unknown';
|
|
1596
1596
|
// #219: Include payload details for better diagnostics
|
|
1597
1597
|
let detailedError = `Workflow terminal_error: ${errorReason}`;
|
|
1598
|
+
let payload: unknown = {};
|
|
1598
1599
|
try {
|
|
1599
|
-
|
|
1600
|
-
if (payload.skipReason) {
|
|
1601
|
-
detailedError += ` (skipReason: ${payload.skipReason})`;
|
|
1600
|
+
payload = lastEvent?.payload ?? {};
|
|
1601
|
+
if ((payload as any).skipReason) {
|
|
1602
|
+
detailedError += ` (skipReason: ${(payload as any).skipReason})`;
|
|
1602
1603
|
}
|
|
1603
|
-
if (payload.failures && Array.isArray(payload.failures) && payload.failures.length > 0) {
|
|
1604
|
-
detailedError += ` | failures: ${(payload.failures as string[]).slice(0, 3).join(', ')}`;
|
|
1604
|
+
if ((payload as any).failures && Array.isArray((payload as any).failures) && (payload as any).failures.length > 0) {
|
|
1605
|
+
detailedError += ` | failures: ${((payload as any).failures as string[]).slice(0, 3).join(', ')}`;
|
|
1605
1606
|
}
|
|
1606
1607
|
} catch { /* ignore parse errors */ }
|
|
1607
1608
|
sleepTask.lastError = detailedError;
|
|
@@ -1613,6 +1614,12 @@ async function processEvolutionQueue(wctx: WorkspaceContext, logger: PluginLogge
|
|
|
1613
1614
|
sleepTask.completed_at = new Date().toISOString();
|
|
1614
1615
|
sleepTask.resolution = 'stub_fallback';
|
|
1615
1616
|
logger?.warn?.(`[PD:EvolutionWorker] sleep_reflection task ${sleepTask.id} background runtime unavailable, using stub fallback: ${errorReason}`);
|
|
1617
|
+
} else if ((payload as any).skipReason === 'no_violating_sessions') {
|
|
1618
|
+
// #244: No meaningful violations found (thin filter) → skip without failure
|
|
1619
|
+
sleepTask.status = 'completed';
|
|
1620
|
+
sleepTask.completed_at = new Date().toISOString();
|
|
1621
|
+
sleepTask.resolution = 'skipped_thin_violation';
|
|
1622
|
+
logger?.info?.(`[PD:EvolutionWorker] sleep_reflection task ${sleepTask.id} completed: no sessions with meaningful violations found`);
|
|
1616
1623
|
} else {
|
|
1617
1624
|
sleepTask.status = 'failed';
|
|
1618
1625
|
sleepTask.completed_at = new Date().toISOString();
|
|
@@ -302,7 +302,7 @@ export class NocturnalTargetSelector {
|
|
|
302
302
|
this.recentPainContext = recentPainContext;
|
|
303
303
|
this.opts = {
|
|
304
304
|
minViolationDensity: restOptions.minViolationDensity ?? 0.1,
|
|
305
|
-
maxSessionCandidates: restOptions.maxSessionCandidates ??
|
|
305
|
+
maxSessionCandidates: restOptions.maxSessionCandidates ?? 300,
|
|
306
306
|
idleThresholdMs: restOptions.idleThresholdMs ?? DEFAULT_IDLE_THRESHOLD_MS,
|
|
307
307
|
};
|
|
308
308
|
}
|
|
@@ -440,7 +440,14 @@ export class NocturnalTargetSelector {
|
|
|
440
440
|
}
|
|
441
441
|
|
|
442
442
|
// Compute violation signals for each session
|
|
443
|
-
|
|
443
|
+
// #244: Filter out sessions that are too thin for meaningful reflection
|
|
444
|
+
// A session needs enough violation context (failures + pain + gates >= 2)
|
|
445
|
+
const MIN_VIOLATION_DEPTH = 2;
|
|
446
|
+
const richSessions = recentSessions.filter(
|
|
447
|
+
s => (s.failureCount ?? 0) + (s.painEventCount ?? 0) + (s.gateBlockCount ?? 0) >= MIN_VIOLATION_DEPTH
|
|
448
|
+
);
|
|
449
|
+
|
|
450
|
+
const violatingSessions: ViolationSignal[] = richSessions.map((session) => {
|
|
444
451
|
const violationDensity = computeViolationDensity(session);
|
|
445
452
|
const snapshot = this.extractor.getNocturnalSessionSnapshot(session.sessionId);
|
|
446
453
|
|
|
@@ -40,7 +40,6 @@ import type { NocturnalSessionSnapshot } from '../../core/nocturnal-trajectory-e
|
|
|
40
40
|
import type { RecentPainContext } from '../evolution-worker.js';
|
|
41
41
|
import * as fs from 'fs';
|
|
42
42
|
import * as path from 'path';
|
|
43
|
-
import { isSubagentRuntimeAvailable } from '../../utils/subagent-probe.js';
|
|
44
43
|
import { validateNocturnalSnapshotIngress } from '../../core/nocturnal-snapshot-contract.js';
|
|
45
44
|
|
|
46
45
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
@@ -173,11 +172,8 @@ export class NocturnalWorkflowManager implements WorkflowManager {
|
|
|
173
172
|
metadata?: Record<string, unknown>;
|
|
174
173
|
}
|
|
175
174
|
): Promise<WorkflowHandle> {
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- Reason: TrinityRuntimeAdapter interface doesn't expose api.runtime.subagent, but OpenClawTrinityRuntimeAdapter has it
|
|
179
|
-
const subagent = (this.runtimeAdapter as any).api?.runtime?.subagent;
|
|
180
|
-
if (!isSubagentRuntimeAvailable(subagent)) {
|
|
175
|
+
const runtimeAvailable = this.runtimeAdapter.isRuntimeAvailable();
|
|
176
|
+
if (!runtimeAvailable) {
|
|
181
177
|
this.logger.warn(`[PD:NocturnalWorkflow] Subagent runtime unavailable, skipping workflow`);
|
|
182
178
|
throw new Error(`NocturnalWorkflowManager: subagent runtime unavailable`);
|
|
183
179
|
}
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "nocturnal-trinity-quality-enhancement",
|
|
3
|
+
"title": "Enhance nocturnal Trinity prompt quality",
|
|
4
|
+
"description": "Enhance nocturnal Trinity prompt quality — add Dreamer perspective diversity constraints and Scribe rejected-decision analysis",
|
|
5
|
+
"workspace": "/home/csuzngjh/code/principles",
|
|
6
|
+
"branch": "fix/bugs-231-228",
|
|
7
|
+
"requiresTaskContract": true,
|
|
8
|
+
"maxRoundsPerStage": 2,
|
|
9
|
+
"maxRuntimeMinutes": 60,
|
|
10
|
+
"stages": [
|
|
11
|
+
"investigate",
|
|
12
|
+
"implement-pass-1",
|
|
13
|
+
"verify"
|
|
14
|
+
],
|
|
15
|
+
"taskContract": {
|
|
16
|
+
"goal": "Improve nocturnal Trinity output quality by adding perspective diversity to Dreamer and rejected-decision analysis to Scribe",
|
|
17
|
+
"inScope": [
|
|
18
|
+
"nocturnal-trinity.ts prompt modifications",
|
|
19
|
+
"nocturnal-trinity.test.ts assertion updates",
|
|
20
|
+
"nocturnal-arbiter.ts compatibility verification"
|
|
21
|
+
],
|
|
22
|
+
"outOfScope": [
|
|
23
|
+
"Runtime or infrastructure changes",
|
|
24
|
+
"New file creation",
|
|
25
|
+
"Non-Trinity prompt changes"
|
|
26
|
+
],
|
|
27
|
+
"validationCommands": [
|
|
28
|
+
"npx vitest run packages/openclaw-plugin/tests/core/nocturnal --reporter=verbose"
|
|
29
|
+
],
|
|
30
|
+
"expectedArtifacts": [
|
|
31
|
+
"packages/openclaw-plugin/src/core/nocturnal-trinity.ts"
|
|
32
|
+
]
|
|
33
|
+
},
|
|
34
|
+
"producer": {
|
|
35
|
+
"agent": "iflow",
|
|
36
|
+
"model": "glm-5",
|
|
37
|
+
"timeoutSeconds": 1800
|
|
38
|
+
},
|
|
39
|
+
"reviewerA": {
|
|
40
|
+
"agent": "iflow",
|
|
41
|
+
"model": "glm-4.7",
|
|
42
|
+
"timeoutSeconds": 1200,
|
|
43
|
+
"role": "code-quality",
|
|
44
|
+
"focus": "Verify prompt changes are minimal, backward-compatible, and don't break existing arbiter validation"
|
|
45
|
+
},
|
|
46
|
+
"reviewerB": {
|
|
47
|
+
"agent": "iflow",
|
|
48
|
+
"model": "glm-4.7",
|
|
49
|
+
"timeoutSeconds": 1200,
|
|
50
|
+
"role": "functional-correctness",
|
|
51
|
+
"focus": "Verify tests pass and the new prompt constraints produce structurally valid Trinity output"
|
|
52
|
+
},
|
|
53
|
+
"escalationReviewer": {
|
|
54
|
+
"agent": "iflow",
|
|
55
|
+
"model": "glm-5",
|
|
56
|
+
"timeoutSeconds": 1800
|
|
57
|
+
},
|
|
58
|
+
"stageGoals": {
|
|
59
|
+
"investigate": [
|
|
60
|
+
"Read nocturnal-trinity.ts lines 64-298 (all three prompts) and nocturnal-trinity.test.ts",
|
|
61
|
+
"Identify exact insertion points for Dreamer diversity section and Scribe analysis section",
|
|
62
|
+
"List all test assertions that reference prompt content",
|
|
63
|
+
"Report findings in producer.md"
|
|
64
|
+
],
|
|
65
|
+
"implement-pass-1": [
|
|
66
|
+
"Apply Dreamer perspective diversity constraints to NOCTURNAL_DREAMER_PROMPT",
|
|
67
|
+
"Apply Scribe rejected-decision analysis to NOCTURNAL_SCRIBE_PROMPT",
|
|
68
|
+
"Update test assertions in nocturnal-trinity.test.ts if needed",
|
|
69
|
+
"Run nocturnal-trinity and nocturnal-arbiter tests to verify no breakage"
|
|
70
|
+
],
|
|
71
|
+
"verify": [
|
|
72
|
+
"Run full nocturnal test suite: npx vitest run packages/openclaw-plugin/tests/core/nocturnal --reporter=verbose",
|
|
73
|
+
"Verify all tests pass with 0 failures",
|
|
74
|
+
"Confirm arbiter validation is unchanged",
|
|
75
|
+
"Confirm no new files were created"
|
|
76
|
+
]
|
|
77
|
+
},
|
|
78
|
+
"stageCriteria": {
|
|
79
|
+
"investigate": {
|
|
80
|
+
"scoringDimensions": [
|
|
81
|
+
"completeness",
|
|
82
|
+
"accuracy"
|
|
83
|
+
],
|
|
84
|
+
"dimensionThreshold": 3,
|
|
85
|
+
"requiredDeliverables": [
|
|
86
|
+
"producer.md"
|
|
87
|
+
]
|
|
88
|
+
},
|
|
89
|
+
"implement-pass-1": {
|
|
90
|
+
"scoringDimensions": [
|
|
91
|
+
"correctness",
|
|
92
|
+
"completeness"
|
|
93
|
+
],
|
|
94
|
+
"dimensionThreshold": 3,
|
|
95
|
+
"requiredDeliverables": [
|
|
96
|
+
"producer.md",
|
|
97
|
+
"reviewer-a.md",
|
|
98
|
+
"reviewer-b.md"
|
|
99
|
+
]
|
|
100
|
+
},
|
|
101
|
+
"verify": {
|
|
102
|
+
"scoringDimensions": [
|
|
103
|
+
"correctness"
|
|
104
|
+
],
|
|
105
|
+
"dimensionThreshold": 3,
|
|
106
|
+
"requiredDeliverables": [
|
|
107
|
+
"producer.md"
|
|
108
|
+
]
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
}
|
|
@@ -325,7 +325,7 @@ export function buildStageBrief(spec, stage, round, previousDecision, handoff =
|
|
|
325
325
|
carryForward.trimEnd(),
|
|
326
326
|
'',
|
|
327
327
|
`## Constraints`,
|
|
328
|
-
...spec.context.map((line) => `- ${line}`),
|
|
328
|
+
...((spec.context ?? []).map((line) => `- ${line}`)),
|
|
329
329
|
'',
|
|
330
330
|
...(spec.taskContract
|
|
331
331
|
? [
|
|
@@ -3413,7 +3413,7 @@ if (process.argv[1] === fileURLToPath(import.meta.url)) {
|
|
|
3413
3413
|
main().catch((err) => {
|
|
3414
3414
|
// main() is async and may throw. The try/catch inside main() handles
|
|
3415
3415
|
// errors within its body, but rejections from the Promise itself land here.
|
|
3416
|
-
console.error('Fatal error:', err.message);
|
|
3416
|
+
console.error('Fatal error:', err.message, err.stack);
|
|
3417
3417
|
process.exit(1);
|
|
3418
3418
|
});
|
|
3419
3419
|
}
|
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
import { afterEach, beforeEach, describe, expect, it } from 'vitest';
|
|
2
|
+
import * as fs from 'fs';
|
|
3
|
+
import * as os from 'os';
|
|
4
|
+
import * as path from 'path';
|
|
5
|
+
import {
|
|
6
|
+
formatMergeGateAuditReport,
|
|
7
|
+
runMergeGateAudit,
|
|
8
|
+
} from '../../src/core/merge-gate-audit.js';
|
|
9
|
+
import type { NocturnalArtifact } from '../../src/core/nocturnal-arbiter.js';
|
|
10
|
+
import {
|
|
11
|
+
registerSample,
|
|
12
|
+
updateReviewStatus,
|
|
13
|
+
} from '../../src/core/nocturnal-dataset.js';
|
|
14
|
+
import { appendArtifactLineageRecord } from '../../src/core/nocturnal-artifact-lineage.js';
|
|
15
|
+
import { exportORPOSamples } from '../../src/core/nocturnal-export.js';
|
|
16
|
+
import { createImplementationAssetDir, getImplementationAssetRoot } from '../../src/core/code-implementation-storage.js';
|
|
17
|
+
import { safeRmDir } from '../test-utils.js';
|
|
18
|
+
|
|
19
|
+
function makeArtifact(overrides: Partial<NocturnalArtifact> = {}): NocturnalArtifact {
|
|
20
|
+
return {
|
|
21
|
+
artifactId: 'artifact-1',
|
|
22
|
+
sessionId: 'session-1',
|
|
23
|
+
principleId: 'T-08',
|
|
24
|
+
sourceSnapshotRef: 'snapshot-1',
|
|
25
|
+
badDecision: 'Retried without checking state',
|
|
26
|
+
betterDecision: 'Inspect state before retrying',
|
|
27
|
+
rationale: 'Evidence first.',
|
|
28
|
+
createdAt: '2026-04-12T09:00:00.000Z',
|
|
29
|
+
...overrides,
|
|
30
|
+
};
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
describe('merge-gate-audit', () => {
|
|
34
|
+
let tempDir: string;
|
|
35
|
+
let workspaceDir: string;
|
|
36
|
+
let stateDir: string;
|
|
37
|
+
|
|
38
|
+
beforeEach(() => {
|
|
39
|
+
tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'pd-merge-gate-audit-'));
|
|
40
|
+
workspaceDir = path.join(tempDir, 'workspace');
|
|
41
|
+
stateDir = path.join(tempDir, '.state');
|
|
42
|
+
fs.mkdirSync(workspaceDir, { recursive: true });
|
|
43
|
+
fs.mkdirSync(stateDir, { recursive: true });
|
|
44
|
+
});
|
|
45
|
+
|
|
46
|
+
afterEach(() => {
|
|
47
|
+
safeRmDir(tempDir);
|
|
48
|
+
});
|
|
49
|
+
|
|
50
|
+
function registerApprovedArtifact(artifactId = 'artifact-1'): string {
|
|
51
|
+
const artifact = makeArtifact({ artifactId });
|
|
52
|
+
const artifactPath = path.join(
|
|
53
|
+
workspaceDir,
|
|
54
|
+
'.state',
|
|
55
|
+
'nocturnal',
|
|
56
|
+
'samples',
|
|
57
|
+
`${artifactId}.json`,
|
|
58
|
+
);
|
|
59
|
+
fs.mkdirSync(path.dirname(artifactPath), { recursive: true });
|
|
60
|
+
fs.writeFileSync(artifactPath, JSON.stringify(artifact, null, 2), 'utf-8');
|
|
61
|
+
|
|
62
|
+
const record = registerSample(workspaceDir, artifact, artifactPath, 'gpt-4').record;
|
|
63
|
+
updateReviewStatus(
|
|
64
|
+
workspaceDir,
|
|
65
|
+
record.sampleFingerprint,
|
|
66
|
+
'approved_for_training',
|
|
67
|
+
'approved for merge gate audit',
|
|
68
|
+
);
|
|
69
|
+
|
|
70
|
+
appendArtifactLineageRecord(workspaceDir, {
|
|
71
|
+
artifactKind: 'behavioral-sample',
|
|
72
|
+
artifactId: record.artifactId,
|
|
73
|
+
principleId: record.principleId,
|
|
74
|
+
ruleId: null,
|
|
75
|
+
sessionId: record.sessionId,
|
|
76
|
+
sourceSnapshotRef: record.sourceSnapshotRef,
|
|
77
|
+
sourcePainIds: ['pain-1'],
|
|
78
|
+
sourceGateBlockIds: ['gate-1'],
|
|
79
|
+
storagePath: artifactPath,
|
|
80
|
+
implementationId: null,
|
|
81
|
+
createdAt: record.createdAt,
|
|
82
|
+
});
|
|
83
|
+
|
|
84
|
+
return record.sampleFingerprint;
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
it('returns defer when audit surfaces are not populated yet', () => {
|
|
88
|
+
const report = runMergeGateAudit(workspaceDir, stateDir);
|
|
89
|
+
|
|
90
|
+
expect(report.overallStatus).toBe('defer');
|
|
91
|
+
expect(report.checks.find((check) => check.id === 'pain_flag_path_contract')?.status).toBe('pass');
|
|
92
|
+
expect(report.checks.find((check) => check.id === 'queue_path_contract')?.status).toBe('pass');
|
|
93
|
+
expect(report.checks.find((check) => check.id === 'runtime_adapter_contract')?.status).toBe('pass');
|
|
94
|
+
expect(report.counts.defer).toBeGreaterThan(0);
|
|
95
|
+
});
|
|
96
|
+
|
|
97
|
+
it('blocks malformed replay reports that claim pass without evidence', () => {
|
|
98
|
+
createImplementationAssetDir(stateDir, 'IMPL-1', '1.0.0');
|
|
99
|
+
const replayDir = path.join(getImplementationAssetRoot(stateDir, 'IMPL-1'), 'replays');
|
|
100
|
+
fs.mkdirSync(replayDir, { recursive: true });
|
|
101
|
+
fs.writeFileSync(
|
|
102
|
+
path.join(replayDir, 'bad-report.json'),
|
|
103
|
+
JSON.stringify(
|
|
104
|
+
{
|
|
105
|
+
overallDecision: 'pass',
|
|
106
|
+
blockers: [],
|
|
107
|
+
generatedAt: '2026-04-12T09:00:00.000Z',
|
|
108
|
+
implementationId: 'IMPL-1',
|
|
109
|
+
evidenceSummary: {
|
|
110
|
+
evidenceStatus: 'empty',
|
|
111
|
+
totalSamples: 0,
|
|
112
|
+
classifiedCounts: {
|
|
113
|
+
painNegative: 0,
|
|
114
|
+
successPositive: 0,
|
|
115
|
+
principleAnchor: 0,
|
|
116
|
+
},
|
|
117
|
+
},
|
|
118
|
+
},
|
|
119
|
+
null,
|
|
120
|
+
2,
|
|
121
|
+
),
|
|
122
|
+
'utf-8',
|
|
123
|
+
);
|
|
124
|
+
|
|
125
|
+
const report = runMergeGateAudit(workspaceDir, stateDir);
|
|
126
|
+
const replayCheck = report.checks.find((check) => check.id === 'replay_evidence_integrity');
|
|
127
|
+
|
|
128
|
+
expect(report.overallStatus).toBe('block');
|
|
129
|
+
expect(replayCheck?.status).toBe('block');
|
|
130
|
+
});
|
|
131
|
+
|
|
132
|
+
it('passes populated dataset, lineage, export, and replay evidence surfaces', () => {
|
|
133
|
+
registerApprovedArtifact('artifact-pass');
|
|
134
|
+
const exportResult = exportORPOSamples(workspaceDir, 'gpt-4');
|
|
135
|
+
expect(exportResult.success).toBe(true);
|
|
136
|
+
|
|
137
|
+
createImplementationAssetDir(stateDir, 'IMPL-1', '1.0.0');
|
|
138
|
+
const replayDir = path.join(getImplementationAssetRoot(stateDir, 'IMPL-1'), 'replays');
|
|
139
|
+
fs.mkdirSync(replayDir, { recursive: true });
|
|
140
|
+
fs.writeFileSync(
|
|
141
|
+
path.join(replayDir, 'good-report.json'),
|
|
142
|
+
JSON.stringify(
|
|
143
|
+
{
|
|
144
|
+
overallDecision: 'pass',
|
|
145
|
+
replayResults: {
|
|
146
|
+
painNegative: { total: 1, passed: 1, failed: 0, details: [] },
|
|
147
|
+
successPositive: { total: 0, passed: 0, failed: 0, details: [] },
|
|
148
|
+
principleAnchor: { total: 0, passed: 0, failed: 0, details: [] },
|
|
149
|
+
},
|
|
150
|
+
blockers: [],
|
|
151
|
+
generatedAt: '2026-04-12T09:00:00.000Z',
|
|
152
|
+
implementationId: 'IMPL-1',
|
|
153
|
+
sampleFingerprints: ['sample-1'],
|
|
154
|
+
evidenceSummary: {
|
|
155
|
+
evidenceStatus: 'observed',
|
|
156
|
+
totalSamples: 1,
|
|
157
|
+
classifiedCounts: {
|
|
158
|
+
painNegative: 1,
|
|
159
|
+
successPositive: 0,
|
|
160
|
+
principleAnchor: 0,
|
|
161
|
+
},
|
|
162
|
+
},
|
|
163
|
+
},
|
|
164
|
+
null,
|
|
165
|
+
2,
|
|
166
|
+
),
|
|
167
|
+
'utf-8',
|
|
168
|
+
);
|
|
169
|
+
|
|
170
|
+
const report = runMergeGateAudit(workspaceDir, stateDir);
|
|
171
|
+
|
|
172
|
+
expect(report.overallStatus).toBe('pass');
|
|
173
|
+
expect(report.counts.block).toBe(0);
|
|
174
|
+
expect(report.counts.defer).toBe(0);
|
|
175
|
+
expect(formatMergeGateAuditReport(report)).toContain('Overall Status: PASS');
|
|
176
|
+
});
|
|
177
|
+
|
|
178
|
+
it('blocks when dataset artifacts are missing', () => {
|
|
179
|
+
const artifactPath = path.join(
|
|
180
|
+
workspaceDir,
|
|
181
|
+
'.state',
|
|
182
|
+
'nocturnal',
|
|
183
|
+
'samples',
|
|
184
|
+
'artifact-missing.json',
|
|
185
|
+
);
|
|
186
|
+
fs.mkdirSync(path.dirname(artifactPath), { recursive: true });
|
|
187
|
+
const artifact = makeArtifact({ artifactId: 'artifact-missing' });
|
|
188
|
+
fs.writeFileSync(artifactPath, JSON.stringify(artifact, null, 2), 'utf-8');
|
|
189
|
+
|
|
190
|
+
const record = registerSample(workspaceDir, artifact, artifactPath, 'gpt-4').record;
|
|
191
|
+
updateReviewStatus(workspaceDir, record.sampleFingerprint, 'approved_for_training', 'approved');
|
|
192
|
+
|
|
193
|
+
// Append lineage pointing to a real file (so lineage passes)
|
|
194
|
+
appendArtifactLineageRecord(workspaceDir, {
|
|
195
|
+
artifactKind: 'behavioral-sample',
|
|
196
|
+
artifactId: record.artifactId,
|
|
197
|
+
principleId: record.principleId,
|
|
198
|
+
ruleId: null,
|
|
199
|
+
sessionId: record.sessionId,
|
|
200
|
+
sourceSnapshotRef: record.sourceSnapshotRef,
|
|
201
|
+
sourcePainIds: [],
|
|
202
|
+
sourceGateBlockIds: [],
|
|
203
|
+
storagePath: artifactPath,
|
|
204
|
+
implementationId: null,
|
|
205
|
+
createdAt: record.createdAt,
|
|
206
|
+
});
|
|
207
|
+
|
|
208
|
+
// Delete the artifact to simulate a missing file
|
|
209
|
+
fs.unlinkSync(artifactPath);
|
|
210
|
+
|
|
211
|
+
const report = runMergeGateAudit(workspaceDir, stateDir);
|
|
212
|
+
const datasetCheck = report.checks.find((c) => c.id === 'dataset_artifact_integrity');
|
|
213
|
+
|
|
214
|
+
expect(report.overallStatus).toBe('block');
|
|
215
|
+
expect(datasetCheck?.status).toBe('block');
|
|
216
|
+
});
|
|
217
|
+
|
|
218
|
+
it('blocks when artifact lineage storage paths are missing', () => {
|
|
219
|
+
const badPath = path.join(workspaceDir, '.state', 'nocturnal', 'samples', 'nonexistent.json');
|
|
220
|
+
appendArtifactLineageRecord(workspaceDir, {
|
|
221
|
+
artifactKind: 'behavioral-sample',
|
|
222
|
+
artifactId: 'lineage-missing',
|
|
223
|
+
principleId: 'T-08',
|
|
224
|
+
ruleId: null,
|
|
225
|
+
sessionId: 'session-1',
|
|
226
|
+
sourceSnapshotRef: 'snap-1',
|
|
227
|
+
sourcePainIds: [],
|
|
228
|
+
sourceGateBlockIds: [],
|
|
229
|
+
storagePath: badPath,
|
|
230
|
+
implementationId: null,
|
|
231
|
+
createdAt: new Date().toISOString(),
|
|
232
|
+
});
|
|
233
|
+
|
|
234
|
+
const report = runMergeGateAudit(workspaceDir, stateDir);
|
|
235
|
+
const lineageCheck = report.checks.find((c) => c.id === 'artifact_lineage_integrity');
|
|
236
|
+
|
|
237
|
+
expect(report.overallStatus).toBe('block');
|
|
238
|
+
expect(lineageCheck?.status).toBe('block');
|
|
239
|
+
});
|
|
240
|
+
|
|
241
|
+
it('blocks when replay reports are malformed', () => {
|
|
242
|
+
createImplementationAssetDir(stateDir, 'IMPL-BAD', '1.0.0');
|
|
243
|
+
const replayDir = path.join(getImplementationAssetRoot(stateDir, 'IMPL-BAD'), 'replays');
|
|
244
|
+
fs.mkdirSync(replayDir, { recursive: true });
|
|
245
|
+
fs.writeFileSync(
|
|
246
|
+
path.join(replayDir, 'malformed.json'),
|
|
247
|
+
'{bad json',
|
|
248
|
+
'utf-8',
|
|
249
|
+
);
|
|
250
|
+
|
|
251
|
+
const report = runMergeGateAudit(workspaceDir, stateDir);
|
|
252
|
+
const replayCheck = report.checks.find((c) => c.id === 'replay_evidence_integrity');
|
|
253
|
+
const details = replayCheck?.details as Record<string, string[]> | undefined;
|
|
254
|
+
|
|
255
|
+
expect(report.overallStatus).toBe('block');
|
|
256
|
+
expect(replayCheck?.status).toBe('block');
|
|
257
|
+
expect(details?.malformedReports).toHaveLength(1);
|
|
258
|
+
});
|
|
259
|
+
|
|
260
|
+
it('blocks when replay reports have invalid evidenceSummary shape', () => {
|
|
261
|
+
createImplementationAssetDir(stateDir, 'IMPL-NOEVID', '1.0.0');
|
|
262
|
+
const replayDir = path.join(getImplementationAssetRoot(stateDir, 'IMPL-NOEVID'), 'replays');
|
|
263
|
+
fs.mkdirSync(replayDir, { recursive: true });
|
|
264
|
+
fs.writeFileSync(
|
|
265
|
+
path.join(replayDir, 'bad-evidence.json'),
|
|
266
|
+
JSON.stringify({
|
|
267
|
+
overallDecision: 'pass',
|
|
268
|
+
blockers: [],
|
|
269
|
+
generatedAt: '2026-04-12T09:00:00.000Z',
|
|
270
|
+
implementationId: 'IMPL-NOEVID',
|
|
271
|
+
evidenceSummary: { evidenceStatus: 'observed' }, // missing totalSamples
|
|
272
|
+
}),
|
|
273
|
+
'utf-8',
|
|
274
|
+
);
|
|
275
|
+
|
|
276
|
+
const report = runMergeGateAudit(workspaceDir, stateDir);
|
|
277
|
+
const replayCheck = report.checks.find((c) => c.id === 'replay_evidence_integrity');
|
|
278
|
+
const details = replayCheck?.details as Record<string, string[]> | undefined;
|
|
279
|
+
|
|
280
|
+
expect(report.overallStatus).toBe('block');
|
|
281
|
+
expect(replayCheck?.status).toBe('block');
|
|
282
|
+
expect(details?.missingEvidenceSummary).toHaveLength(1);
|
|
283
|
+
});
|
|
284
|
+
});
|