principles-disciple 1.52.0 → 1.53.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/openclaw.plugin.json +1 -1
- package/package.json +1 -1
- package/src/core/bootstrap-rules.ts +41 -4
- package/src/core/evolution-hook.ts +74 -0
- package/src/core/file-storage-adapter.ts +203 -0
- package/src/core/init.ts +29 -2
- package/src/core/nocturnal-trinity.ts +230 -0
- package/src/core/observability.ts +242 -0
- package/src/core/pain-signal-adapter.ts +42 -0
- package/src/core/pain-signal.ts +136 -0
- package/src/core/principle-injection.ts +208 -0
- package/src/core/principle-injector.ts +84 -0
- package/src/core/storage-adapter.ts +65 -0
- package/src/core/telemetry-event.ts +109 -0
- package/src/hooks/prompt.ts +18 -3
- package/src/service/evolution-worker.ts +52 -2
- package/tests/core/evolution-hook.test.ts +123 -0
- package/tests/core/file-storage-adapter.test.ts +285 -0
- package/tests/core/nocturnal-trinity.test.ts +236 -0
- package/tests/core/observability.test.ts +383 -0
- package/tests/core/pain-signal-adapter.test.ts +116 -0
- package/tests/core/pain-signal.test.ts +190 -0
- package/tests/core/principle-injection.test.ts +223 -0
- package/tests/core/principle-injector.test.ts +90 -0
- package/tests/core/storage-conformance.test.ts +429 -0
- package/tests/core/telemetry-event.test.ts +119 -0
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
import { describe, it, expect, beforeEach, afterEach } from 'vitest';
|
|
2
|
+
import * as fs from 'fs';
|
|
3
|
+
import * as os from 'os';
|
|
4
|
+
import * as path from 'path';
|
|
5
|
+
import { FileStorageAdapter } from '../../src/core/file-storage-adapter.js';
|
|
6
|
+
import type { HybridLedgerStore } from '../../src/core/principle-tree-ledger.js';
|
|
7
|
+
import { TREE_NAMESPACE, loadLedger } from '../../src/core/principle-tree-ledger.js';
|
|
8
|
+
import { safeRmDir } from '../test-utils.js';
|
|
9
|
+
|
|
10
|
+
// ---------------------------------------------------------------------------
|
|
11
|
+
// Helpers
|
|
12
|
+
// ---------------------------------------------------------------------------
|
|
13
|
+
|
|
14
|
+
/**
 * Creates a uniquely named scratch directory under the OS temp dir.
 *
 * @returns Absolute path of the newly created directory; the caller is
 *   responsible for removing it.
 */
function createTmpDir(): string {
  const prefix = path.join(os.tmpdir(), 'pd-file-storage-adapter-test-');
  // mkdtempSync appends random characters to the prefix and creates the dir.
  return fs.mkdtempSync(prefix);
}
|
|
17
|
+
|
|
18
|
+
/**
 * Builds a fresh ledger store with no training entries and an empty tree.
 *
 * Every call returns new objects, so tests can mutate the result freely.
 * `lastUpdated` is pinned to the Unix epoch to keep the fixture deterministic.
 */
function createEmptyStore(): HybridLedgerStore {
  const epochIso = new Date(0).toISOString();
  const tree: HybridLedgerStore['tree'] = {
    principles: {},
    rules: {},
    implementations: {},
    metrics: {},
    lastUpdated: epochIso,
  };
  return { trainingStore: {}, tree };
}
|
|
30
|
+
|
|
31
|
+
// ---------------------------------------------------------------------------
|
|
32
|
+
// Tests
|
|
33
|
+
// ---------------------------------------------------------------------------
|
|
34
|
+
|
|
35
|
+
describe('FileStorageAdapter', () => {
  /** Shape of one record stored under `store.tree.principles`. */
  type PrincipleRecord = HybridLedgerStore['tree']['principles'][string];
  /** Shape of one record stored under `store.trainingStore`. */
  type TrainingRecord = HybridLedgerStore['trainingStore'][string];

  /** Timestamp shared by every principle fixture in this suite. */
  const FIXTURE_TIMESTAMP = '2026-04-17T00:00:00.000Z';
  /** File name (inside the state dir) that the tests expect the ledger to be written to. */
  const LEDGER_FILE_NAME = 'principle_training_state.json';

  let tmpDir: string;
  let adapter: FileStorageAdapter;

  /**
   * Builds a principle record for tests.
   *
   * Fields that were identical across every hand-written fixture are defaulted
   * here; `fields` supplies the per-test values and may override any default.
   */
  function makePrinciple(
    id: string,
    fields: Pick<
      PrincipleRecord,
      'text' | 'triggerPattern' | 'action' | 'status' | 'priority' | 'evaluability'
    > &
      Partial<PrincipleRecord>,
  ): PrincipleRecord {
    return {
      id,
      version: 1,
      scope: 'general',
      valueScore: 0,
      adherenceRate: 0,
      painPreventedCount: 0,
      derivedFromPainIds: [],
      ruleIds: [],
      conflictsWithPrincipleIds: [],
      createdAt: FIXTURE_TIMESTAMP,
      updatedAt: FIXTURE_TIMESTAMP,
      ...fields,
    };
  }

  /**
   * Builds a training-store entry for tests. Defaults describe an untouched,
   * prompt-only principle; `overrides` replaces any of them.
   */
  function makeTrainingEntry(
    principleId: string,
    overrides: Partial<TrainingRecord> = {},
  ): TrainingRecord {
    return {
      principleId,
      evaluability: 'manual_only',
      applicableOpportunityCount: 0,
      observedViolationCount: 0,
      complianceRate: 0,
      violationTrend: 0,
      generatedSampleCount: 0,
      approvedSampleCount: 0,
      includedTrainRunIds: [],
      deployedCheckpointIds: [],
      internalizationStatus: 'prompt_only',
      ...overrides,
    };
  }

  beforeEach(() => {
    // Each test gets its own scratch directory; the same path is passed for
    // both constructor arguments.
    tmpDir = createTmpDir();
    adapter = new FileStorageAdapter(tmpDir, tmpDir);
  });

  afterEach(() => {
    safeRmDir(tmpDir);
  });

  // -------------------------------------------------------------------------
  // loadLedger
  // -------------------------------------------------------------------------

  describe('loadLedger', () => {
    it('returns empty store when no file exists', async () => {
      const store = await adapter.loadLedger();
      expect(store.trainingStore).toEqual({});
      expect(store.tree.principles).toEqual({});
      expect(store.tree.rules).toEqual({});
    });

    it('loads existing persisted store', async () => {
      const original = createEmptyStore();
      original.tree.principles['P-001'] = makePrinciple('P-001', {
        text: 'Write before delete',
        triggerPattern: 'delete',
        action: 'write first',
        status: 'active',
        priority: 'P1',
        evaluability: 'deterministic',
      });

      // Seed the ledger file through the adapter's own saveLedger, then read it back.
      await adapter.saveLedger(original);
      const loaded = await adapter.loadLedger();
      expect(loaded.tree.principles['P-001']).toBeDefined();
      expect(loaded.tree.principles['P-001'].text).toBe('Write before delete');
    });
  });

  // -------------------------------------------------------------------------
  // saveLedger
  // -------------------------------------------------------------------------

  describe('saveLedger', () => {
    it('persists store to disk', async () => {
      const store = createEmptyStore();
      store.trainingStore['test-principle'] = makeTrainingEntry('test-principle');

      await adapter.saveLedger(store);

      // Verify file exists and contains the data
      const filePath = path.join(tmpDir, LEDGER_FILE_NAME);
      expect(fs.existsSync(filePath)).toBe(true);
      const raw = JSON.parse(fs.readFileSync(filePath, 'utf8'));
      expect(raw['test-principle']).toBeDefined();
      expect(raw[TREE_NAMESPACE]).toBeDefined();
    });

    it('round-trips data through save and load', async () => {
      const store = createEmptyStore();
      store.trainingStore['p-1'] = makeTrainingEntry('p-1', {
        evaluability: 'weak_heuristic',
        applicableOpportunityCount: 5,
        observedViolationCount: 2,
        complianceRate: 0.6,
        violationTrend: -0.1,
        generatedSampleCount: 3,
        approvedSampleCount: 2,
        includedTrainRunIds: ['run-1'],
        internalizationStatus: 'in_training',
      });

      await adapter.saveLedger(store);
      const loaded = await adapter.loadLedger();
      expect(loaded.trainingStore['p-1'].evaluability).toBe('weak_heuristic');
      expect(loaded.trainingStore['p-1'].applicableOpportunityCount).toBe(5);
    });
  });

  // -------------------------------------------------------------------------
  // mutateLedger
  // -------------------------------------------------------------------------

  describe('mutateLedger', () => {
    it('reads, mutates, and writes atomically', async () => {
      // Start with empty store
      await adapter.saveLedger(createEmptyStore());

      const result = await adapter.mutateLedger((store) => {
        store.tree.principles['P-002'] = makePrinciple('P-002', {
          text: 'Test principle',
          triggerPattern: 'test',
          action: 'do something',
          status: 'candidate',
          priority: 'P2',
          evaluability: 'manual_only',
        });
        return 42;
      });

      expect(result).toBe(42);
      const loaded = await adapter.loadLedger();
      expect(loaded.tree.principles['P-002']).toBeDefined();
      expect(loaded.tree.principles['P-002'].text).toBe('Test principle');
    });

    it('returns the value from the mutate function', async () => {
      await adapter.saveLedger(createEmptyStore());

      const count = await adapter.mutateLedger((store) => {
        return Object.keys(store.tree.principles).length;
      });

      expect(count).toBe(0);
    });

    it('supports async mutate functions', async () => {
      await adapter.saveLedger(createEmptyStore());

      const result = await adapter.mutateLedger(async (store) => {
        // Simulate async work
        await new Promise((r) => setTimeout(r, 10));
        store.trainingStore['async-test'] = makeTrainingEntry('async-test', {
          evaluability: 'deterministic',
          applicableOpportunityCount: 1,
          complianceRate: 1.0,
        });
        return 'async-done';
      });

      expect(result).toBe('async-done');
      const loaded = await adapter.loadLedger();
      expect(loaded.trainingStore['async-test']).toBeDefined();
    });

    it('persists via atomicWriteFileSync (file is not corrupted)', async () => {
      await adapter.saveLedger(createEmptyStore());

      await adapter.mutateLedger((store) => {
        store.tree.principles['P-003'] = makePrinciple('P-003', {
          text: 'Atomic write test',
          triggerPattern: 'test',
          action: 'verify atomicity',
          status: 'candidate',
          priority: 'P1',
          evaluability: 'manual_only',
        });
      });

      // Verify no leftover temp file
      const filePath = path.join(tmpDir, LEDGER_FILE_NAME);
      expect(fs.existsSync(filePath + '.tmp')).toBe(false);
      expect(fs.existsSync(filePath)).toBe(true);

      // File is valid JSON
      const raw = JSON.parse(fs.readFileSync(filePath, 'utf8'));
      expect(raw[TREE_NAMESPACE].principles['P-003']).toBeDefined();
    });

    it('is compatible with low-level loadLedger', async () => {
      await adapter.saveLedger(createEmptyStore());

      await adapter.mutateLedger((store) => {
        store.tree.principles['P-COMPAT'] = makePrinciple('P-COMPAT', {
          text: 'Compatibility test',
          triggerPattern: 'compat',
          action: 'verify',
          status: 'active',
          priority: 'P1',
          evaluability: 'deterministic',
          valueScore: 10,
          adherenceRate: 0.8,
          painPreventedCount: 5,
          derivedFromPainIds: ['pain-1'],
        });
      });

      // Load with the low-level function — should see the same data
      const ledger = loadLedger(tmpDir);
      expect(ledger.tree.principles['P-COMPAT']).toBeDefined();
      expect(ledger.tree.principles['P-COMPAT'].text).toBe('Compatibility test');
    });
  });
});
|
|
@@ -12,6 +12,7 @@ import {
|
|
|
12
12
|
formatReasoningContext,
|
|
13
13
|
invokeStubDreamer,
|
|
14
14
|
invokeStubPhilosopher,
|
|
15
|
+
validateExtraction,
|
|
15
16
|
type TrinityConfig,
|
|
16
17
|
type DreamerOutput,
|
|
17
18
|
type DreamerCandidate,
|
|
@@ -1815,3 +1816,238 @@ describe('Scribe Backward Compatibility (SCRIBE-04)', () => {
|
|
|
1815
1816
|
expect(result.artifact!.chosenJustification).toBeUndefined();
|
|
1816
1817
|
});
|
|
1817
1818
|
});
|
|
1819
|
+
|
|
1820
|
+
// ---------------------------------------------------------------------------
|
|
1821
|
+
// Tests: validateExtraction — Hallucination Detection (SDK-QUAL-02)
|
|
1822
|
+
// ---------------------------------------------------------------------------
|
|
1823
|
+
|
|
1824
|
+
describe('validateExtraction — Hallucination Detection (SDK-QUAL-02)', () => {
|
|
1825
|
+
/**
 * Builds a draft artifact fixture whose `badDecision` is the text under test.
 *
 * All other fields carry fixed placeholder values (stub-mode telemetry with
 * every stage marked as passed).
 *
 * @param badDecision Text placed in the artifact's `badDecision` field.
 * @param overrides Optional top-level fields that replace the defaults. Typed
 *   as a partial artifact so a misspelled key or wrongly typed value is
 *   rejected at compile time instead of silently producing a bad fixture.
 * @returns A new artifact object; nothing is shared between calls.
 */
function makeArtifact(
  badDecision: string,
  overrides: Partial<TrinityDraftArtifact> = {},
): TrinityDraftArtifact {
  return {
    selectedCandidateIndex: 0,
    badDecision,
    betterDecision: 'Do it right instead',
    rationale: 'Because the principle says so and this is the correct approach',
    sessionId: 'session-test-123',
    principleId: 'T-01',
    sourceSnapshotRef: 'snapshot-test-001',
    telemetry: {
      chainMode: 'trinity',
      usedStubs: true,
      dreamerPassed: true,
      philosopherPassed: true,
      scribePassed: true,
      candidateCount: 1,
      selectedCandidateIndex: 0,
      stageFailures: [],
    },
    // Spread last so caller-supplied fields win over the defaults above.
    ...overrides,
  };
}
|
|
1847
|
+
|
|
1848
|
+
/**
 * Builds a session-snapshot fixture populated with the requested evidence.
 *
 * - `failedToolCalls` become failed tool-call rows (exit code 1,
 *   `runtime_error`); a missing `filePath` becomes `null` and a missing
 *   `errorMessage` becomes `'unknown error'`.
 * - `painEvents` become medium-severity pain rows; a missing `reason` becomes `null`.
 * - `gateBlocks` become gate-block rows with no file path or plan status.
 * - `userCorrections` is the number of user turns to emit, each flagged as a
 *   detected correction.
 *
 * Failure, pain and gate counts in `stats` follow the generated rows; the
 * assistant-turn and tool-call totals are fixed placeholders.
 */
function makeSnapshotWithEvidence(overrides: {
  failedToolCalls?: Array<{ toolName: string; filePath?: string; errorMessage?: string }>;
  painEvents?: Array<{ source: string; score: number; reason?: string }>;
  gateBlocks?: Array<{ toolName: string; reason: string }>;
  userCorrections?: number;
} = {}) {
  const stamp = '2026-04-17T00:00:00.000Z';

  const failedCallSpecs = overrides.failedToolCalls ?? [];
  const toolCallRows = failedCallSpecs.map(({ toolName, filePath, errorMessage }) => ({
    toolName,
    outcome: 'failure' as const,
    filePath: filePath ?? null,
    durationMs: null,
    exitCode: 1,
    errorType: 'runtime_error',
    errorMessage: errorMessage ?? 'unknown error',
    createdAt: stamp,
  }));

  const painSpecs = overrides.painEvents ?? [];
  const painRows = painSpecs.map(({ source, score, reason }) => ({
    source,
    score,
    severity: 'medium' as const,
    reason: reason ?? null,
    createdAt: stamp,
  }));

  const gateSpecs = overrides.gateBlocks ?? [];
  const gateRows = gateSpecs.map(({ toolName, reason }) => ({
    toolName,
    filePath: null,
    reason,
    planStatus: null,
    createdAt: stamp,
  }));

  const correctionCount = overrides.userCorrections ?? 0;
  const correctionTurns = Array.from({ length: correctionCount }, (_unused, turnIndex) => ({
    turnIndex,
    correctionDetected: true,
    correctionCue: 'wrong approach',
    createdAt: stamp,
  }));

  return {
    sessionId: 'session-test-123',
    startedAt: stamp,
    updatedAt: '2026-04-17T00:05:00.000Z',
    assistantTurns: [],
    userTurns: correctionTurns,
    toolCalls: toolCallRows,
    painEvents: painRows,
    gateBlocks: gateRows,
    stats: {
      failureCount: toolCallRows.length,
      totalPainEvents: painRows.length,
      totalGateBlocks: gateRows.length,
      totalAssistantTurns: 5,
      totalToolCalls: 10,
    },
  };
}
|
|
1906
|
+
|
|
1907
|
+
it('passes when badDecision references a tool failure from the snapshot', () => {
  // Evidence: one failed Edit on src/config.ts; the bad decision names both.
  const evidence = makeSnapshotWithEvidence({
    failedToolCalls: [{ toolName: 'Edit', filePath: 'src/config.ts', errorMessage: 'permission denied' }],
  });
  const draft = makeArtifact('Proceeded with Edit on src/config.ts without checking permission');

  const { isGrounded, evidenceTypes } = validateExtraction(draft, evidence as any);

  expect(isGrounded).toBe(true);
  expect(evidenceTypes).toContain('tool_failures');
});

it('passes when badDecision references a pain event from the snapshot', () => {
  // Evidence: a single pain event whose reason shares wording with the bad decision.
  const evidence = makeSnapshotWithEvidence({
    painEvents: [{ source: 'gate', score: 70, reason: 'accumulated friction from repeated file operation failures' }],
  });
  const draft = makeArtifact('Ignored accumulated friction from file operations');

  const { isGrounded, evidenceTypes } = validateExtraction(draft, evidence as any);

  expect(isGrounded).toBe(true);
  expect(evidenceTypes).toContain('pain_events');
});

it('passes when badDecision references a gate block from the snapshot', () => {
  // Evidence: a blocked Bash call; the bad decision mentions the tool and the block.
  const evidence = makeSnapshotWithEvidence({
    gateBlocks: [{ toolName: 'Bash', reason: 'destructive command blocked by safety gate' }],
  });
  const draft = makeArtifact('Attempted to execute a destructive Bash command that was blocked by the gate');

  const { isGrounded, evidenceTypes } = validateExtraction(draft, evidence as any);

  expect(isGrounded).toBe(true);
  expect(evidenceTypes).toContain('gate_blocks');
});

it('passes when badDecision references user corrections', () => {
  // Evidence: two user turns flagged as corrections (cue text: "wrong approach").
  const evidence = makeSnapshotWithEvidence({
    userCorrections: 2,
  });
  const draft = makeArtifact('Continued with the wrong approach despite user corrections');

  const { isGrounded, evidenceTypes } = validateExtraction(draft, evidence as any);

  expect(isGrounded).toBe(true);
  expect(evidenceTypes).toContain('user_corrections');
});

it('detects hallucination when badDecision has no overlap with snapshot evidence', () => {
  // Evidence is about a failed Read of package.json; the bad decision is about a deployment.
  const evidence = makeSnapshotWithEvidence({
    failedToolCalls: [{ toolName: 'Read', filePath: 'package.json', errorMessage: 'file not found' }],
  });
  const draft = makeArtifact('Deployed production database without running migration scripts first');

  const { isGrounded, reason } = validateExtraction(draft, evidence as any);

  expect(isGrounded).toBe(false);
  expect(reason).toContain('Hallucinated extraction');
});

it('passes when snapshot has no evidence at all (no signal to validate against)', () => {
  const evidence = makeSnapshotWithEvidence();
  const draft = makeArtifact('Made an incorrect decision during the session');

  const { isGrounded, evidenceTypes } = validateExtraction(draft, evidence as any);

  // With nothing to compare against, the extraction is expected to be let through.
  expect(isGrounded).toBe(true);
  expect(evidenceTypes).toHaveLength(0);
});

it('provides evidence preview for telemetry', () => {
  // Two evidence sources at once: a failed Write and a related pain event.
  const evidence = makeSnapshotWithEvidence({
    failedToolCalls: [{ toolName: 'Write', filePath: 'output.log', errorMessage: 'permission denied for write operation' }],
    painEvents: [{ source: 'hook', score: 80, reason: 'repeated permission denied failures during write operation' }],
  });
  const draft = makeArtifact('Proceeded with write operation on output.log despite permission denied error');

  const { isGrounded, evidencePreview, evidenceTypes } = validateExtraction(draft, evidence as any);

  expect(isGrounded).toBe(true);
  expect(evidencePreview.length).toBeGreaterThan(0);
  expect(evidenceTypes).toContain('tool_failures');
  expect(evidenceTypes).toContain('pain_events');
});

it('detects hallucination with unrelated but specific badDecision text', () => {
  // Evidence is a rate-limit pain event; the bad decision is about deleting a database.
  const evidence = makeSnapshotWithEvidence({
    painEvents: [{ source: 'gate', score: 60, reason: 'rate limit exceeded for API calls' }],
  });
  const draft = makeArtifact('Deleted the primary database without creating a backup first');

  const { isGrounded } = validateExtraction(draft, evidence as any);

  expect(isGrounded).toBe(false);
});

it('runTrinity stub path fails when hallucination is detected', () => {
  // The snapshot carries exactly one failure signal (a Grep timeout) so that,
  // per the original author's note, the stub Dreamer still produces candidates,
  // but with wording ("failing operation") unrelated to this evidence.
  const hallucinationSnapshot = {
    sessionId: 'session-hallucination-test',
    startedAt: '2026-04-17T00:00:00.000Z',
    updatedAt: '2026-04-17T00:05:00.000Z',
    assistantTurns: [],
    userTurns: [],
    toolCalls: [
      {
        toolName: 'Grep',
        outcome: 'failure' as const,
        filePath: null,
        durationMs: null,
        exitCode: 1,
        errorType: 'timeout',
        errorMessage: 'search timed out after 30 seconds',
        createdAt: '2026-04-17T00:00:00.000Z',
      },
    ],
    painEvents: [],
    gateBlocks: [],
    stats: {
      failureCount: 1,
      totalPainEvents: 0,
      totalGateBlocks: 0,
      totalAssistantTurns: 2,
      totalToolCalls: 1,
    },
  };

  const stubConfig: TrinityConfig = {
    useTrinity: true,
    maxCandidates: 3,
    useStubs: true,
  };

  const { success, failures } = runTrinity({
    snapshot: hallucinationSnapshot as any,
    principleId: 'T-08',
    config: stubConfig,
  });

  // Author's reasoning, as recorded with this test: after token normalisation
  // the stub's badDecision yields {retry,faili,oper,diagnos,root,caus} while the
  // evidence yields {search,timed,after,seconds,timedout}. The sets are disjoint,
  // so the run must fail with a "Hallucinated" failure reason.
  expect(success).toBe(false);
  expect(failures.some(failure => failure.reason?.includes('Hallucinated'))).toBe(true);
});
|
|
2053
|
+
});
|