outcome-cli 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +261 -0
- package/package.json +95 -0
- package/src/agents/README.md +139 -0
- package/src/agents/adapters/anthropic.adapter.ts +166 -0
- package/src/agents/adapters/dalle.adapter.ts +145 -0
- package/src/agents/adapters/gemini.adapter.ts +134 -0
- package/src/agents/adapters/imagen.adapter.ts +106 -0
- package/src/agents/adapters/nano-banana.adapter.ts +129 -0
- package/src/agents/adapters/openai.adapter.ts +165 -0
- package/src/agents/adapters/veo.adapter.ts +130 -0
- package/src/agents/agent.schema.property.test.ts +379 -0
- package/src/agents/agent.schema.test.ts +148 -0
- package/src/agents/agent.schema.ts +263 -0
- package/src/agents/index.ts +60 -0
- package/src/agents/registered-agent.schema.ts +356 -0
- package/src/agents/registry.ts +97 -0
- package/src/agents/tournament-configs.property.test.ts +266 -0
- package/src/cli/README.md +145 -0
- package/src/cli/commands/define.ts +79 -0
- package/src/cli/commands/list.ts +46 -0
- package/src/cli/commands/logs.ts +83 -0
- package/src/cli/commands/run.ts +416 -0
- package/src/cli/commands/verify.ts +110 -0
- package/src/cli/index.ts +81 -0
- package/src/config/README.md +128 -0
- package/src/config/env.ts +262 -0
- package/src/config/index.ts +19 -0
- package/src/eval/README.md +318 -0
- package/src/eval/ai-judge.test.ts +435 -0
- package/src/eval/ai-judge.ts +368 -0
- package/src/eval/code-validators.ts +414 -0
- package/src/eval/evaluateOutcome.property.test.ts +1174 -0
- package/src/eval/evaluateOutcome.ts +591 -0
- package/src/eval/immigration-validators.ts +122 -0
- package/src/eval/index.ts +90 -0
- package/src/eval/judge-cache.ts +402 -0
- package/src/eval/tournament-validators.property.test.ts +439 -0
- package/src/eval/validators.property.test.ts +1118 -0
- package/src/eval/validators.ts +1199 -0
- package/src/eval/weighted-scorer.ts +285 -0
- package/src/index.ts +17 -0
- package/src/league/README.md +188 -0
- package/src/league/health-check.ts +353 -0
- package/src/league/index.ts +93 -0
- package/src/league/killAgent.ts +151 -0
- package/src/league/league.test.ts +1151 -0
- package/src/league/runLeague.ts +843 -0
- package/src/league/scoreAgent.ts +175 -0
- package/src/modules/omnibridge/__tests__/.gitkeep +1 -0
- package/src/modules/omnibridge/__tests__/auth-tunnel.property.test.ts +524 -0
- package/src/modules/omnibridge/__tests__/deterministic-logger.property.test.ts +965 -0
- package/src/modules/omnibridge/__tests__/ghost-api.property.test.ts +461 -0
- package/src/modules/omnibridge/__tests__/omnibridge-integration.test.ts +542 -0
- package/src/modules/omnibridge/__tests__/parallel-executor.property.test.ts +671 -0
- package/src/modules/omnibridge/__tests__/semantic-normalizer.property.test.ts +521 -0
- package/src/modules/omnibridge/__tests__/semantic-normalizer.test.ts +254 -0
- package/src/modules/omnibridge/__tests__/session-vault.property.test.ts +367 -0
- package/src/modules/omnibridge/__tests__/shadow-session.property.test.ts +523 -0
- package/src/modules/omnibridge/__tests__/triangulation-engine.property.test.ts +292 -0
- package/src/modules/omnibridge/__tests__/verification-engine.property.test.ts +769 -0
- package/src/modules/omnibridge/api/.gitkeep +1 -0
- package/src/modules/omnibridge/api/ghost-api.ts +1087 -0
- package/src/modules/omnibridge/auth/.gitkeep +1 -0
- package/src/modules/omnibridge/auth/auth-tunnel.ts +843 -0
- package/src/modules/omnibridge/auth/session-vault.ts +577 -0
- package/src/modules/omnibridge/core/.gitkeep +1 -0
- package/src/modules/omnibridge/core/semantic-normalizer.ts +702 -0
- package/src/modules/omnibridge/core/triangulation-engine.ts +530 -0
- package/src/modules/omnibridge/core/types.ts +610 -0
- package/src/modules/omnibridge/execution/.gitkeep +1 -0
- package/src/modules/omnibridge/execution/deterministic-logger.ts +629 -0
- package/src/modules/omnibridge/execution/parallel-executor.ts +542 -0
- package/src/modules/omnibridge/execution/shadow-session.ts +794 -0
- package/src/modules/omnibridge/index.ts +212 -0
- package/src/modules/omnibridge/omnibridge.ts +510 -0
- package/src/modules/omnibridge/verification/.gitkeep +1 -0
- package/src/modules/omnibridge/verification/verification-engine.ts +783 -0
- package/src/outcomes/README.md +75 -0
- package/src/outcomes/acquire-pilot-customer.ts +297 -0
- package/src/outcomes/code-delivery-outcomes.ts +89 -0
- package/src/outcomes/code-outcomes.ts +256 -0
- package/src/outcomes/code_review_battle.test.ts +135 -0
- package/src/outcomes/code_review_battle.ts +135 -0
- package/src/outcomes/cold_email_battle.ts +97 -0
- package/src/outcomes/content_creation_battle.ts +160 -0
- package/src/outcomes/f1_stem_opt_compliance.ts +61 -0
- package/src/outcomes/index.ts +107 -0
- package/src/outcomes/lead_gen_battle.test.ts +113 -0
- package/src/outcomes/lead_gen_battle.ts +99 -0
- package/src/outcomes/outcome.schema.property.test.ts +229 -0
- package/src/outcomes/outcome.schema.ts +187 -0
- package/src/outcomes/qualified_sales_interest.ts +118 -0
- package/src/outcomes/swarm_planner.property.test.ts +370 -0
- package/src/outcomes/swarm_planner.ts +96 -0
- package/src/outcomes/web_extraction.ts +234 -0
- package/src/runtime/README.md +220 -0
- package/src/runtime/agentRunner.test.ts +341 -0
- package/src/runtime/agentRunner.ts +746 -0
- package/src/runtime/claudeAdapter.ts +232 -0
- package/src/runtime/costTracker.ts +123 -0
- package/src/runtime/index.ts +34 -0
- package/src/runtime/modelAdapter.property.test.ts +305 -0
- package/src/runtime/modelAdapter.ts +144 -0
- package/src/runtime/openaiAdapter.ts +235 -0
- package/src/utils/README.md +122 -0
- package/src/utils/command-runner.ts +134 -0
- package/src/utils/cost-guard.ts +379 -0
- package/src/utils/errors.test.ts +290 -0
- package/src/utils/errors.ts +442 -0
- package/src/utils/index.ts +37 -0
- package/src/utils/logger.test.ts +361 -0
- package/src/utils/logger.ts +419 -0
- package/src/utils/output-parsers.ts +216 -0
|
@@ -0,0 +1,965 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Deterministic Logger Property Tests
|
|
3
|
+
*
|
|
4
|
+
* Property-based tests for action logging, cryptographic verification,
|
|
5
|
+
* and hallucination detection.
|
|
6
|
+
*
|
|
7
|
+
* Requirements: 7.3, 8.1, 8.2, 8.5
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import { describe, test, expect, beforeEach } from 'vitest';
|
|
11
|
+
import * as fc from 'fast-check';
|
|
12
|
+
import {
|
|
13
|
+
DeterministicLogger,
|
|
14
|
+
createDeterministicLogger,
|
|
15
|
+
type KeyDecisionPoint,
|
|
16
|
+
} from '../execution/deterministic-logger.js';
|
|
17
|
+
import type { ActionLogEntry } from '../core/types.js';
|
|
18
|
+
|
|
19
|
+
// =============================================================================
|
|
20
|
+
// Test Setup
|
|
21
|
+
// =============================================================================
|
|
22
|
+
|
|
23
|
+
// Note: Each test creates its own logger instance to ensure isolation
|
|
24
|
+
// The beforeEach logger is only used for tests that explicitly need shared state
|
|
25
|
+
|
|
26
|
+
// Shared logger handle; it is reassigned before every test so no state carries over.
let logger: DeterministicLogger;

beforeEach(() => {
  const sharedLogger = createDeterministicLogger({
    maxScreenshotsPerSession: 50,
    captureScreenshots: true,
    hashAlgorithm: 'sha256',
  });
  logger = sharedLogger;
});
|
|
35
|
+
|
|
36
|
+
// =============================================================================
|
|
37
|
+
// Arbitraries
|
|
38
|
+
// =============================================================================
|
|
39
|
+
|
|
40
|
+
/**
|
|
41
|
+
* Generate arbitrary session IDs.
|
|
42
|
+
*/
|
|
43
|
+
// Produces strings of the form `session_` followed by 8–16 characters from [a-z0-9].
const sessionIdArbitrary = fc.stringMatching(/^session_[a-z0-9]{8,16}$/);
|
|
44
|
+
|
|
45
|
+
/**
|
|
46
|
+
* Generate arbitrary intent IDs.
|
|
47
|
+
*/
|
|
48
|
+
// Intent IDs have the shape `<KIND>_ID:<NAME>`, where KIND is ACTION, INPUT, DISPLAY or NAV
// and NAME is 3–20 characters drawn from A–Z and underscore.
const intentIdArbitrary = fc.stringMatching(/^(ACTION|INPUT|DISPLAY|NAV)_ID:[A-Z_]{3,20}$/);
|
|
51
|
+
|
|
52
|
+
/**
|
|
53
|
+
* Generate arbitrary action types.
|
|
54
|
+
*/
|
|
55
|
+
// The five action kinds used by these tests, kept as a literal tuple so the element
// type stays a union of string literals.
const ACTION_TYPE_VALUES = ['click', 'type', 'navigate', 'wait', 'extract'] as const;

const actionTypeArbitrary = fc.constantFrom(...ACTION_TYPE_VALUES) as fc.Arbitrary<
  (typeof ACTION_TYPE_VALUES)[number]
>;
|
|
62
|
+
|
|
63
|
+
/**
|
|
64
|
+
* Generate arbitrary action results.
|
|
65
|
+
*/
|
|
66
|
+
// An action either succeeded or failed; no other outcome is generated.
type GeneratedActionResult = 'success' | 'failure';

const actionResultArbitrary = fc.constantFrom(
  'success',
  'failure'
) as fc.Arbitrary<GeneratedActionResult>;
|
|
69
|
+
|
|
70
|
+
/**
|
|
71
|
+
* Generate arbitrary action log entries (without timestamp).
|
|
72
|
+
*/
|
|
73
|
+
// `value` is optional: either absent (undefined) or a string of at most 100 characters.
const optionalActionValueArbitrary = fc.option(fc.string({ minLength: 0, maxLength: 100 }), {
  nil: undefined,
});

// A complete log entry as passed to the logger; no timestamp field is generated.
const actionEntryArbitrary = fc.record({
  sessionId: sessionIdArbitrary,
  action: actionTypeArbitrary,
  intentId: intentIdArbitrary,
  value: optionalActionValueArbitrary,
  result: actionResultArbitrary,
});
|
|
80
|
+
|
|
81
|
+
/**
|
|
82
|
+
* Generate arbitrary screenshot data (base64-like string).
|
|
83
|
+
*/
|
|
84
|
+
// 20–100 characters from the base64 alphabet followed by up to two `=` padding characters.
// The length is not constrained to a multiple of four, so this is base64-like, not valid base64.
const screenshotArbitrary = fc.stringMatching(/^[A-Za-z0-9+/]{20,100}={0,2}$/);
|
|
85
|
+
|
|
86
|
+
/**
|
|
87
|
+
* Generate arbitrary key decision points.
|
|
88
|
+
*/
|
|
89
|
+
// Decision-point labels passed to logActionWithScreenshot in the screenshot test.
const DECISION_POINT_VALUES = [
  'form_submit',
  'navigation',
  'authentication',
  'data_extraction',
  'error_recovery',
  'mfa_challenge',
] as const;

const decisionPointArbitrary = fc.constantFrom(
  ...DECISION_POINT_VALUES
) as fc.Arbitrary<KeyDecisionPoint>;
|
|
97
|
+
|
|
98
|
+
/**
|
|
99
|
+
* Generate arbitrary claimed results.
|
|
100
|
+
*/
|
|
101
|
+
// Payload half of a claimed result: up to five arbitrary strings.
const claimedDataArbitrary = fc.record({
  items: fc.array(fc.string(), { minLength: 0, maxLength: 5 }),
});

// Metadata half of a claimed result.
const claimedMetadataArbitrary = fc.record({
  confidence: fc.float({ min: 0, max: 1 }),
  executionTimeMs: fc.integer({ min: 0, max: 10000 }),
  actionsPerformed: fc.integer({ min: 0, max: 100 }),
});

// A claimed result is either null or a { data, metadata } object.
const claimedResultArbitrary = fc.oneof(
  fc.constant(null),
  fc.record({
    data: claimedDataArbitrary,
    metadata: claimedMetadataArbitrary,
  })
);
|
|
114
|
+
|
|
115
|
+
// =============================================================================
|
|
116
|
+
// Property 16: Deterministic Action Logging
|
|
117
|
+
// =============================================================================
|
|
118
|
+
|
|
119
|
+
describe('Property 16: Deterministic Action Logging', () => {
|
|
120
|
+
/**
|
|
121
|
+
* **Feature: omnibridge, Property 16: Deterministic Action Logging**
|
|
122
|
+
*
|
|
123
|
+
* *For any* DOM interaction performed by OmniBridge, the DeterministicLogger
|
|
124
|
+
* SHALL record an ActionLogEntry with timestamp, sessionId, action type,
|
|
125
|
+
* intentId, and result.
|
|
126
|
+
*
|
|
127
|
+
* **Validates: Requirements 7.3, 8.1**
|
|
128
|
+
*/
|
|
129
|
+
test('Property 16: every logged action contains required fields', async () => {
  // Property: a generated entry is accepted and returned with every field intact
  // plus a positive numeric timestamp.
  const loggedEntryIsComplete = fc.asyncProperty(actionEntryArbitrary, async (input) => {
    // A new logger per run keeps iterations independent of one another.
    const isolatedLogger = createDeterministicLogger({
      maxScreenshotsPerSession: 50,
      captureScreenshots: true,
      hashAlgorithm: 'sha256',
    });

    const outcome = isolatedLogger.logAction(input);
    expect(outcome.success).toBe(true);
    expect(outcome.entry).toBeDefined();

    const stored = outcome.entry!;

    // The timestamp is not part of the input, so it must come from the logger.
    expect(stored.timestamp).toBeDefined();
    expect(typeof stored.timestamp).toBe('number');
    expect(stored.timestamp).toBeGreaterThan(0);

    // Caller-supplied fields are echoed back unchanged.
    expect(stored.sessionId).toBe(input.sessionId);
    expect(stored.action).toBe(input.action);
    expect(stored.intentId).toBe(input.intentId);
    expect(stored.result).toBe(input.result);

    // `value` is optional; it is only compared when the input carried one.
    if (input.value !== undefined) {
      expect(stored.value).toBe(input.value);
    }

    return true;
  });

  await fc.assert(loggedEntryIsComplete, { numRuns: 100 });
});
|
|
170
|
+
|
|
171
|
+
/**
|
|
172
|
+
* Actions are retrievable by session ID.
|
|
173
|
+
*/
|
|
174
|
+
test('logged actions are retrievable by session ID', async () => {
  await fc.assert(
    fc.asyncProperty(
      sessionIdArbitrary,
      fc.array(actionEntryArbitrary, { minLength: 1, maxLength: 10 }),
      async (sessionId, entries) => {
        const isolatedLogger = createDeterministicLogger();

        // Force every generated entry onto the one session under test.
        const sameSessionEntries = entries.map((generated) => ({ ...generated, sessionId }));
        sameSessionEntries.forEach((item) => {
          isolatedLogger.logAction(item);
        });

        const retrieved = isolatedLogger.getLog(sessionId);

        // Nothing lost, nothing extra.
        expect(retrieved.length).toBe(sameSessionEntries.length);

        // Every stored entry belongs to the requested session.
        retrieved.forEach((stored) => {
          expect(stored.sessionId).toBe(sessionId);
        });

        return true;
      }
    ),
    { numRuns: 100 }
  );
});
|
|
213
|
+
|
|
214
|
+
/**
|
|
215
|
+
* Timestamps are monotonically non-decreasing within a session (equal timestamps are allowed).
|
|
216
|
+
*/
|
|
217
|
+
test('timestamps are monotonically increasing within a session', async () => {
  await fc.assert(
    fc.asyncProperty(
      sessionIdArbitrary,
      fc.array(actionEntryArbitrary, { minLength: 2, maxLength: 10 }),
      async (sessionId, entries) => {
        // Create a fresh logger for each iteration to ensure isolation
        const testLogger = createDeterministicLogger();

        // Entries are logged back-to-back with no delay between them, so two
        // consecutive entries may carry the same timestamp. That is why the
        // check below is "non-decreasing" rather than "strictly increasing".
        const entriesWithSession = entries.map((e) => ({
          ...e,
          sessionId,
        }));

        for (const entry of entriesWithSession) {
          testLogger.logAction(entry);
        }

        const log = testLogger.getLog(sessionId);

        // Without this guard the ordering loop would pass vacuously if the
        // log came back empty or short.
        expect(log.length).toBe(entriesWithSession.length);

        // Verify timestamps are non-decreasing in log order
        for (let i = 1; i < log.length; i++) {
          expect(log[i].timestamp).toBeGreaterThanOrEqual(log[i - 1].timestamp);
        }

        return true;
      }
    ),
    { numRuns: 100 }
  );
});
|
|
252
|
+
|
|
253
|
+
/**
|
|
254
|
+
* Screenshots are captured and stored correctly.
|
|
255
|
+
*/
|
|
256
|
+
test('screenshots are captured at key decision points', async () => {
  // Property: a screenshot logged with an action is attached to the returned entry
  // and is also returned by the per-session screenshot lookup.
  const screenshotIsStored = fc.asyncProperty(
    actionEntryArbitrary,
    screenshotArbitrary,
    decisionPointArbitrary,
    async (input, image, checkpoint) => {
      // A new logger per run keeps iterations independent of one another.
      const isolatedLogger = createDeterministicLogger({
        maxScreenshotsPerSession: 50,
        captureScreenshots: true,
        hashAlgorithm: 'sha256',
      });

      const outcome = isolatedLogger.logActionWithScreenshot(input, image, checkpoint);

      expect(outcome.success).toBe(true);
      expect(outcome.entry).toBeDefined();
      expect(outcome.entry!.screenshot).toBe(image);

      const storedImages = isolatedLogger.getScreenshots(input.sessionId);
      expect(storedImages).toContain(image);

      return true;
    }
  );

  await fc.assert(screenshotIsStored, { numRuns: 100 });
});
|
|
293
|
+
|
|
294
|
+
/**
|
|
295
|
+
* Missing required fields cause logging to fail.
|
|
296
|
+
*/
|
|
297
|
+
test('missing required fields cause logging to fail', async () => {
  // Property: dropping any one of sessionId / intentId / action makes logAction
  // report failure together with an error.
  const incompleteEntryIsRejected = fc.asyncProperty(
    fc.constantFrom('sessionId', 'intentId', 'action'),
    actionEntryArbitrary,
    async (missingField, entry) => {
      const isolatedLogger = createDeterministicLogger();

      // Remove the field from a shallow copy; the generated entry stays untouched.
      const stripped = { ...entry };
      delete (stripped as Record<string, unknown>)[missingField];

      const outcome = isolatedLogger.logAction(stripped as Omit<ActionLogEntry, 'timestamp'>);

      expect(outcome.success).toBe(false);
      expect(outcome.error).toBeDefined();

      return true;
    }
  );

  await fc.assert(incompleteEntryIsRejected, { numRuns: 100 });
});
|
|
326
|
+
|
|
327
|
+
/**
|
|
328
|
+
* Different sessions have isolated logs.
|
|
329
|
+
*/
|
|
330
|
+
test('different sessions have isolated logs', async () => {
  await fc.assert(
    fc.asyncProperty(
      sessionIdArbitrary,
      sessionIdArbitrary,
      actionEntryArbitrary,
      actionEntryArbitrary,
      async (sessionA, sessionB, entryA, entryB) => {
        // Discard (rather than silently pass) runs where both generated IDs
        // collide, so only runs that exercise two sessions are counted.
        fc.pre(sessionA !== sessionB);

        // Create a fresh logger for each iteration to ensure isolation
        const testLogger = createDeterministicLogger();

        // Log exactly one action to each session
        testLogger.logAction({ ...entryA, sessionId: sessionA });
        testLogger.logAction({ ...entryB, sessionId: sessionB });

        const logA = testLogger.getLog(sessionA);
        const logB = testLogger.getLog(sessionB);

        // Each session must hold its own single entry. Checking the length
        // first keeps the `every` assertions below from passing vacuously on
        // an empty log.
        expect(logA.length).toBe(1);
        expect(logB.length).toBe(1);

        // Logs should be isolated: no entry from the other session leaks in
        expect(logA.every((e) => e.sessionId === sessionA)).toBe(true);
        expect(logB.every((e) => e.sessionId === sessionB)).toBe(true);

        return true;
      }
    ),
    { numRuns: 100 }
  );
});
|
|
366
|
+
});
|
|
367
|
+
|
|
368
|
+
// =============================================================================
|
|
369
|
+
// Property 18: Cryptographic Verification
|
|
370
|
+
// =============================================================================
|
|
371
|
+
|
|
372
|
+
describe('Property 18: Cryptographic Verification', () => {
|
|
373
|
+
/**
|
|
374
|
+
* **Feature: omnibridge, Property 18: Cryptographic Verification**
|
|
375
|
+
*
|
|
376
|
+
* *For any* completed agent execution, the VerificationProof SHALL include
|
|
377
|
+
* a cryptographic hash that uniquely identifies the action sequence, such
|
|
378
|
+
* that any modification to the action log would produce a different hash.
|
|
379
|
+
*
|
|
380
|
+
* **Validates: Requirements 8.2**
|
|
381
|
+
*/
|
|
382
|
+
test('Property 18: any modification to action log produces different hash', async () => {
  await fc.assert(
    fc.asyncProperty(
      sessionIdArbitrary,
      fc.array(actionEntryArbitrary, { minLength: 2, maxLength: 10 }),
      fc.integer({ min: 0, max: 9 }),
      claimedResultArbitrary,
      async (sessionId, entries, modifyIndexRaw, claimedResult) => {
        // Baseline logger: receives the sequence exactly as generated.
        const baselineLogger = createDeterministicLogger();
        const sameSessionEntries = entries.map((generated) => ({ ...generated, sessionId }));
        sameSessionEntries.forEach((item) => {
          baselineLogger.logAction(item);
        });

        const baselineProof = baselineLogger.generateProof(sessionId, claimedResult);
        expect(baselineProof.success).toBe(true);
        const baselineHash = baselineProof.proof!.hash;

        // Tampered logger: same sequence, except one entry has its result flipped.
        // The raw index is folded into range because the array may be shorter than 10.
        const tamperedLogger = createDeterministicLogger();
        const tamperAt = modifyIndexRaw % sameSessionEntries.length;

        sameSessionEntries.forEach((item, position) => {
          if (position !== tamperAt) {
            tamperedLogger.logAction(item);
            return;
          }
          const flippedResult =
            item.result === 'success' ? ('failure' as const) : ('success' as const);
          tamperedLogger.logAction({ ...item, result: flippedResult });
        });

        const tamperedProof = tamperedLogger.generateProof(sessionId, claimedResult);
        expect(tamperedProof.success).toBe(true);
        const tamperedHash = tamperedProof.proof!.hash;

        // The two proofs must not share a hash.
        expect(tamperedHash).not.toBe(baselineHash);

        return true;
      }
    ),
    { numRuns: 100 }
  );
});
|
|
451
|
+
|
|
452
|
+
/**
|
|
453
|
+
* Identical action sequences produce identical hashes.
|
|
454
|
+
*/
|
|
455
|
+
test('identical action sequences produce identical hashes', async () => {
  await fc.assert(
    fc.asyncProperty(
      sessionIdArbitrary,
      fc.array(actionEntryArbitrary, { minLength: 1, maxLength: 10 }),
      async (sessionId, entries) => {
        // Create a fresh logger for each iteration to ensure isolation
        const testLogger = createDeterministicLogger();

        for (const entry of entries) {
          testLogger.logAction({ ...entry, sessionId });
        }

        const log = testLogger.getLog(sessionId);

        // Replaying the entries through a second logger would stamp new
        // timestamps, so the two logs would not be identical. Instead, build
        // a field-for-field copy of the recorded log (timestamps included)
        // and hash that.
        const identicalCopy = log.map((recorded) => ({ ...recorded }));

        const originalHash = testLogger.hashActionSequence(log);

        // Hashing the same log twice must be deterministic
        expect(testLogger.hashActionSequence(log)).toBe(originalHash);

        // A distinct but content-identical sequence must hash the same,
        // i.e. the hash depends on content, not on object identity
        expect(testLogger.hashActionSequence(identicalCopy)).toBe(originalHash);

        return true;
      }
    ),
    { numRuns: 100 }
  );
});
|
|
509
|
+
|
|
510
|
+
/**
|
|
511
|
+
* Proof verification succeeds for unmodified logs.
|
|
512
|
+
*/
|
|
513
|
+
test('proof verification succeeds for unmodified logs', async () => {
  // Property: a proof generated by a logger is accepted by that same logger's
  // verifyProof when nothing has been changed in between.
  const untouchedProofVerifies = fc.asyncProperty(
    sessionIdArbitrary,
    fc.array(actionEntryArbitrary, { minLength: 1, maxLength: 10 }),
    claimedResultArbitrary,
    async (sessionId, entries, claimedResult) => {
      const isolatedLogger = createDeterministicLogger();

      // Force every generated entry onto the one session under test.
      const sameSessionEntries = entries.map((generated) => ({ ...generated, sessionId }));
      sameSessionEntries.forEach((item) => {
        isolatedLogger.logAction(item);
      });

      const generation = isolatedLogger.generateProof(sessionId, claimedResult);
      expect(generation.success).toBe(true);

      const accepted = isolatedLogger.verifyProof(generation.proof!);
      expect(accepted).toBe(true);

      return true;
    }
  );

  await fc.assert(untouchedProofVerifies, { numRuns: 100 });
});
|
|
548
|
+
|
|
549
|
+
/**
|
|
550
|
+
* Proof contains correct action count.
|
|
551
|
+
*/
|
|
552
|
+
test('proof contains correct action count', async () => {
  // Property: proof.actionCount equals the number of actions logged for the session.
  const actionCountMatches = fc.asyncProperty(
    sessionIdArbitrary,
    fc.array(actionEntryArbitrary, { minLength: 1, maxLength: 20 }),
    claimedResultArbitrary,
    async (sessionId, entries, claimedResult) => {
      const isolatedLogger = createDeterministicLogger();

      // Each generated entry carries its own session ID; overwrite it so all
      // actions land in the session the proof is generated for.
      entries.forEach((generated) => {
        isolatedLogger.logAction({ ...generated, sessionId });
      });

      const generation = isolatedLogger.generateProof(sessionId, claimedResult);
      expect(generation.success).toBe(true);
      expect(generation.proof!.actionCount).toBe(entries.length);

      return true;
    }
  );

  await fc.assert(actionCountMatches, { numRuns: 100 });
});
|
|
583
|
+
|
|
584
|
+
/**
|
|
585
|
+
* Empty session fails proof generation.
|
|
586
|
+
*/
|
|
587
|
+
test('empty session fails proof generation', async () => {
  // Property: asking for a proof of a session with no logged actions reports
  // failure together with an error.
  const emptySessionIsRejected = fc.asyncProperty(
    sessionIdArbitrary,
    claimedResultArbitrary,
    async (sessionId, claimedResult) => {
      // Deliberately log nothing before requesting the proof.
      const isolatedLogger = createDeterministicLogger();
      const generation = isolatedLogger.generateProof(sessionId, claimedResult);

      expect(generation.success).toBe(false);
      expect(generation.error).toBeDefined();

      return true;
    }
  );

  await fc.assert(emptySessionIsRejected, { numRuns: 100 });
});
|
|
611
|
+
});
|
|
612
|
+
|
|
613
|
+
// =============================================================================
|
|
614
|
+
// Property 19: Hallucination Detection
|
|
615
|
+
// =============================================================================
|
|
616
|
+
|
|
617
|
+
describe('Property 19: Hallucination Detection', () => {
|
|
618
|
+
/**
|
|
619
|
+
* **Feature: omnibridge, Property 19: Hallucination Detection**
|
|
620
|
+
*
|
|
621
|
+
* *For any* agent result where the claimed outcome cannot be derived from
|
|
622
|
+
* the recorded action sequence, the verification SHALL flag
|
|
623
|
+
* `resultMatchesActions: false`.
|
|
624
|
+
*
|
|
625
|
+
* **Validates: Requirements 8.5**
|
|
626
|
+
*/
|
|
627
|
+
test('Property 19: claimed data without extract actions is flagged as hallucination', async () => {
  // Entries restricted to non-extracting actions: 'extract' is left out on purpose.
  const nonExtractEntryArbitrary = fc.record({
    sessionId: sessionIdArbitrary,
    action: fc.constantFrom('click', 'type', 'navigate', 'wait') as fc.Arbitrary<
      'click' | 'type' | 'navigate' | 'wait'
    >,
    intentId: intentIdArbitrary,
    result: actionResultArbitrary,
  });

  await fc.assert(
    fc.asyncProperty(
      sessionIdArbitrary,
      fc.array(nonExtractEntryArbitrary, { minLength: 1, maxLength: 5 }),
      async (sessionId, entries) => {
        const isolatedLogger = createDeterministicLogger();

        // Force every generated entry onto the one session under test.
        const sameSessionEntries = entries.map((generated) => ({ ...generated, sessionId }));
        sameSessionEntries.forEach((item) => {
          isolatedLogger.logAction(item);
        });

        // The claim carries data even though no 'extract' action was logged.
        const claimedResult = {
          data: {
            items: ['item1', 'item2'],
            total: 2,
          },
        };

        const generation = isolatedLogger.generateProof(sessionId, claimedResult);

        // A proof is still produced, but it must be marked as not matching the actions.
        expect(generation.success).toBe(true);
        expect(generation.proof!.resultMatchesActions).toBe(false);

        return true;
      }
    ),
    { numRuns: 100 }
  );
});
|
|
679
|
+
|
|
680
|
+
/**
|
|
681
|
+
* Claimed data with extract actions is valid.
|
|
682
|
+
*/
|
|
683
|
+
test('claimed data with extract actions is valid', async () => {
  // Property: with one successful 'extract' action logged, a claim that carries
  // data is reported as matching the recorded actions.
  const extractBackedClaimIsAccepted = fc.asyncProperty(sessionIdArbitrary, async (sessionId) => {
    const isolatedLogger = createDeterministicLogger();

    // The only logged action is a successful extract.
    isolatedLogger.logAction({
      sessionId,
      action: 'extract',
      intentId: 'DISPLAY_ID:DATA_TABLE',
      result: 'success',
    });

    const claimedResult = {
      data: {
        items: ['item1', 'item2'],
      },
    };

    const generation = isolatedLogger.generateProof(sessionId, claimedResult);

    expect(generation.success).toBe(true);
    expect(generation.proof!.resultMatchesActions).toBe(true);

    return true;
  });

  await fc.assert(extractBackedClaimIsAccepted, { numRuns: 100 });
});
|
|
718
|
+
|
|
719
|
+
/**
|
|
720
|
+
* All failed actions means result cannot be valid.
|
|
721
|
+
*/
|
|
722
|
+
test('all failed actions means result cannot be valid', async () => {
  // Entries of any action type whose result is always 'failure'.
  const failedEntryArbitrary = fc.record({
    sessionId: sessionIdArbitrary,
    action: actionTypeArbitrary,
    intentId: intentIdArbitrary,
    result: fc.constant('failure' as const),
  });

  await fc.assert(
    fc.asyncProperty(
      sessionIdArbitrary,
      fc.array(failedEntryArbitrary, { minLength: 1, maxLength: 5 }),
      async (sessionId, entries) => {
        const isolatedLogger = createDeterministicLogger();

        // Force every generated entry onto the one session under test.
        const sameSessionEntries = entries.map((generated) => ({ ...generated, sessionId }));
        sameSessionEntries.forEach((item) => {
          isolatedLogger.logAction(item);
        });

        // The claim reports data although every logged action failed.
        const claimedResult = {
          data: { success: true },
        };

        const generation = isolatedLogger.generateProof(sessionId, claimedResult);

        // A proof is still produced, but it must be marked as not matching the actions.
        expect(generation.success).toBe(true);
        expect(generation.proof!.resultMatchesActions).toBe(false);

        return true;
      }
    ),
    { numRuns: 100 }
  );
});
|
|
769
|
+
|
|
770
|
+
/**
 * Property: for any non-empty batch of logged actions (one to five entries),
 * generating a proof with a `null` claimed result succeeds and reports
 * `resultMatchesActions` as true.
 */
test(
  'null claimed result is always valid',
  async () => {
    const loggedActionsArbitrary = fc.array(actionEntryArbitrary, {
      minLength: 1,
      maxLength: 5,
    });

    const nullClaimIsAccepted = fc.asyncProperty(
      sessionIdArbitrary,
      loggedActionsArbitrary,
      async (sessionId, entries) => {
        // A new logger per run keeps iterations from seeing each other's entries.
        const logger = createDeterministicLogger();

        // Re-home every generated entry onto the session under test, then log
        // them in their generated order.
        const sessionEntries = entries.map((generated) => ({
          ...generated,
          sessionId,
        }));
        sessionEntries.forEach((sessionEntry) => {
          logger.logAction(sessionEntry);
        });

        const outcome = logger.generateProof(sessionId, null);

        // With nothing claimed, the proof is expected to be generated and to
        // agree with the log.
        expect(outcome.success).toBe(true);
        expect(outcome.proof!.resultMatchesActions).toBe(true);

        return true;
      }
    );

    await fc.assert(nullClaimIsAccepted, { numRuns: 100 });
  }
);
|
|
806
|
+
});
|
|
807
|
+
|
|
808
|
+
// =============================================================================
|
|
809
|
+
// Diff Analysis Tests
|
|
810
|
+
// =============================================================================
|
|
811
|
+
|
|
812
|
+
describe('Diff Analysis', () => {
  /**
   * Property: two distinct sessions that log the same prefix of actions and
   * then log one more action with differing results ('success' vs 'failure')
   * are reported by `compareAgents` as diverging at the index of that
   * differing action.
   */
  test(
    'divergence point is correctly identified',
    async () => {
      await fc.assert(
        fc.asyncProperty(
          sessionIdArbitrary,
          sessionIdArbitrary,
          fc.array(actionEntryArbitrary, { minLength: 2, maxLength: 5 }),
          fc.integer({ min: 0, max: 4 }),
          async (sessionA, sessionB, commonEntries, divergeIndexRaw) => {
            // The scenario needs two different sessions. fc.pre discards a
            // colliding input and draws a new one, instead of counting it as
            // a passing run that checked nothing.
            fc.pre(sessionA !== sessionB);

            // Create a fresh logger for this test
            const testLogger = createDeterministicLogger();

            // Fold the raw index into range. The result is always a valid
            // index into commonEntries, so a divergent entry always exists.
            const divergeIndex = divergeIndexRaw % commonEntries.length;

            // Log the shared prefix to both sessions, A before B for each entry.
            for (const { action, intentId, value, result } of commonEntries.slice(0, divergeIndex)) {
              for (const sessionId of [sessionA, sessionB]) {
                testLogger.logAction({ action, intentId, value, result, sessionId });
              }
            }

            // Log the divergent step: same action and intent, opposite results.
            const { action, intentId, value } = commonEntries[divergeIndex];
            testLogger.logAction({
              action,
              intentId,
              value,
              sessionId: sessionA,
              result: 'success',
            });
            testLogger.logAction({
              action,
              intentId,
              value,
              sessionId: sessionB,
              result: 'failure',
            });

            const result = testLogger.compareAgents(sessionA, sessionB);
            expect(result.success).toBe(true);
            expect(result.analysis).toBeDefined();
            expect(result.analysis!.divergencePoint).toBe(divergeIndex);

            return true;
          }
        ),
        { numRuns: 100 }
      );
    }
  );

  /**
   * Property: when two distinct sessions log exactly the same sequence of
   * actions, `compareAgents` succeeds and reports the divergence point as the
   * number of logged entries (i.e. one past the last shared entry).
   */
  test(
    'identical logs have divergence at the end',
    async () => {
      await fc.assert(
        fc.asyncProperty(
          sessionIdArbitrary,
          sessionIdArbitrary,
          fc.array(actionEntryArbitrary, { minLength: 1, maxLength: 5 }),
          async (sessionA, sessionB, entries) => {
            // Discard inputs where both ids collide rather than passing vacuously.
            fc.pre(sessionA !== sessionB);

            // Create a fresh logger for this test
            const testLogger = createDeterministicLogger();

            // Log identical entries to both sessions, A before B for each entry.
            for (const { action, intentId, value, result } of entries) {
              for (const sessionId of [sessionA, sessionB]) {
                testLogger.logAction({ action, intentId, value, result, sessionId });
              }
            }

            const result = testLogger.compareAgents(sessionA, sessionB);
            expect(result.success).toBe(true);
            expect(result.analysis).toBeDefined();
            expect(result.analysis!.divergencePoint).toBe(entries.length);

            return true;
          }
        ),
        { numRuns: 100 }
      );
    }
  );

  /**
   * Property: comparing two distinct sessions on a logger that has no logged
   * actions does not succeed and returns an error.
   */
  test(
    'empty logs comparison fails',
    async () => {
      await fc.assert(
        fc.asyncProperty(
          sessionIdArbitrary,
          sessionIdArbitrary,
          async (sessionA, sessionB) => {
            // Discard inputs where both ids collide rather than passing vacuously.
            fc.pre(sessionA !== sessionB);

            // Create a fresh logger for this test; nothing is logged to it.
            const testLogger = createDeterministicLogger();

            const result = testLogger.compareAgents(sessionA, sessionB);
            expect(result.success).toBe(false);
            expect(result.error).toBeDefined();

            return true;
          }
        ),
        { numRuns: 100 }
      );
    }
  );
});
|