outcome-cli 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +261 -0
- package/package.json +95 -0
- package/src/agents/README.md +139 -0
- package/src/agents/adapters/anthropic.adapter.ts +166 -0
- package/src/agents/adapters/dalle.adapter.ts +145 -0
- package/src/agents/adapters/gemini.adapter.ts +134 -0
- package/src/agents/adapters/imagen.adapter.ts +106 -0
- package/src/agents/adapters/nano-banana.adapter.ts +129 -0
- package/src/agents/adapters/openai.adapter.ts +165 -0
- package/src/agents/adapters/veo.adapter.ts +130 -0
- package/src/agents/agent.schema.property.test.ts +379 -0
- package/src/agents/agent.schema.test.ts +148 -0
- package/src/agents/agent.schema.ts +263 -0
- package/src/agents/index.ts +60 -0
- package/src/agents/registered-agent.schema.ts +356 -0
- package/src/agents/registry.ts +97 -0
- package/src/agents/tournament-configs.property.test.ts +266 -0
- package/src/cli/README.md +145 -0
- package/src/cli/commands/define.ts +79 -0
- package/src/cli/commands/list.ts +46 -0
- package/src/cli/commands/logs.ts +83 -0
- package/src/cli/commands/run.ts +416 -0
- package/src/cli/commands/verify.ts +110 -0
- package/src/cli/index.ts +81 -0
- package/src/config/README.md +128 -0
- package/src/config/env.ts +262 -0
- package/src/config/index.ts +19 -0
- package/src/eval/README.md +318 -0
- package/src/eval/ai-judge.test.ts +435 -0
- package/src/eval/ai-judge.ts +368 -0
- package/src/eval/code-validators.ts +414 -0
- package/src/eval/evaluateOutcome.property.test.ts +1174 -0
- package/src/eval/evaluateOutcome.ts +591 -0
- package/src/eval/immigration-validators.ts +122 -0
- package/src/eval/index.ts +90 -0
- package/src/eval/judge-cache.ts +402 -0
- package/src/eval/tournament-validators.property.test.ts +439 -0
- package/src/eval/validators.property.test.ts +1118 -0
- package/src/eval/validators.ts +1199 -0
- package/src/eval/weighted-scorer.ts +285 -0
- package/src/index.ts +17 -0
- package/src/league/README.md +188 -0
- package/src/league/health-check.ts +353 -0
- package/src/league/index.ts +93 -0
- package/src/league/killAgent.ts +151 -0
- package/src/league/league.test.ts +1151 -0
- package/src/league/runLeague.ts +843 -0
- package/src/league/scoreAgent.ts +175 -0
- package/src/modules/omnibridge/__tests__/.gitkeep +1 -0
- package/src/modules/omnibridge/__tests__/auth-tunnel.property.test.ts +524 -0
- package/src/modules/omnibridge/__tests__/deterministic-logger.property.test.ts +965 -0
- package/src/modules/omnibridge/__tests__/ghost-api.property.test.ts +461 -0
- package/src/modules/omnibridge/__tests__/omnibridge-integration.test.ts +542 -0
- package/src/modules/omnibridge/__tests__/parallel-executor.property.test.ts +671 -0
- package/src/modules/omnibridge/__tests__/semantic-normalizer.property.test.ts +521 -0
- package/src/modules/omnibridge/__tests__/semantic-normalizer.test.ts +254 -0
- package/src/modules/omnibridge/__tests__/session-vault.property.test.ts +367 -0
- package/src/modules/omnibridge/__tests__/shadow-session.property.test.ts +523 -0
- package/src/modules/omnibridge/__tests__/triangulation-engine.property.test.ts +292 -0
- package/src/modules/omnibridge/__tests__/verification-engine.property.test.ts +769 -0
- package/src/modules/omnibridge/api/.gitkeep +1 -0
- package/src/modules/omnibridge/api/ghost-api.ts +1087 -0
- package/src/modules/omnibridge/auth/.gitkeep +1 -0
- package/src/modules/omnibridge/auth/auth-tunnel.ts +843 -0
- package/src/modules/omnibridge/auth/session-vault.ts +577 -0
- package/src/modules/omnibridge/core/.gitkeep +1 -0
- package/src/modules/omnibridge/core/semantic-normalizer.ts +702 -0
- package/src/modules/omnibridge/core/triangulation-engine.ts +530 -0
- package/src/modules/omnibridge/core/types.ts +610 -0
- package/src/modules/omnibridge/execution/.gitkeep +1 -0
- package/src/modules/omnibridge/execution/deterministic-logger.ts +629 -0
- package/src/modules/omnibridge/execution/parallel-executor.ts +542 -0
- package/src/modules/omnibridge/execution/shadow-session.ts +794 -0
- package/src/modules/omnibridge/index.ts +212 -0
- package/src/modules/omnibridge/omnibridge.ts +510 -0
- package/src/modules/omnibridge/verification/.gitkeep +1 -0
- package/src/modules/omnibridge/verification/verification-engine.ts +783 -0
- package/src/outcomes/README.md +75 -0
- package/src/outcomes/acquire-pilot-customer.ts +297 -0
- package/src/outcomes/code-delivery-outcomes.ts +89 -0
- package/src/outcomes/code-outcomes.ts +256 -0
- package/src/outcomes/code_review_battle.test.ts +135 -0
- package/src/outcomes/code_review_battle.ts +135 -0
- package/src/outcomes/cold_email_battle.ts +97 -0
- package/src/outcomes/content_creation_battle.ts +160 -0
- package/src/outcomes/f1_stem_opt_compliance.ts +61 -0
- package/src/outcomes/index.ts +107 -0
- package/src/outcomes/lead_gen_battle.test.ts +113 -0
- package/src/outcomes/lead_gen_battle.ts +99 -0
- package/src/outcomes/outcome.schema.property.test.ts +229 -0
- package/src/outcomes/outcome.schema.ts +187 -0
- package/src/outcomes/qualified_sales_interest.ts +118 -0
- package/src/outcomes/swarm_planner.property.test.ts +370 -0
- package/src/outcomes/swarm_planner.ts +96 -0
- package/src/outcomes/web_extraction.ts +234 -0
- package/src/runtime/README.md +220 -0
- package/src/runtime/agentRunner.test.ts +341 -0
- package/src/runtime/agentRunner.ts +746 -0
- package/src/runtime/claudeAdapter.ts +232 -0
- package/src/runtime/costTracker.ts +123 -0
- package/src/runtime/index.ts +34 -0
- package/src/runtime/modelAdapter.property.test.ts +305 -0
- package/src/runtime/modelAdapter.ts +144 -0
- package/src/runtime/openaiAdapter.ts +235 -0
- package/src/utils/README.md +122 -0
- package/src/utils/command-runner.ts +134 -0
- package/src/utils/cost-guard.ts +379 -0
- package/src/utils/errors.test.ts +290 -0
- package/src/utils/errors.ts +442 -0
- package/src/utils/index.ts +37 -0
- package/src/utils/logger.test.ts +361 -0
- package/src/utils/logger.ts +419 -0
- package/src/utils/output-parsers.ts +216 -0
|
@@ -0,0 +1,671 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Parallel Executor Property Tests
|
|
3
|
+
*
|
|
4
|
+
* Property-based tests for competition orchestration, identical starting states,
|
|
5
|
+
* and crash isolation.
|
|
6
|
+
*
|
|
7
|
+
* Requirements: 7.2, 7.6
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import { describe, test, expect, beforeEach, afterEach } from 'vitest';
|
|
11
|
+
import * as fc from 'fast-check';
|
|
12
|
+
import {
|
|
13
|
+
ParallelExecutor,
|
|
14
|
+
createParallelExecutor,
|
|
15
|
+
type CompetitionConfig,
|
|
16
|
+
type AgentExecutionTask,
|
|
17
|
+
type AgentExecutionResult,
|
|
18
|
+
} from '../execution/parallel-executor.js';
|
|
19
|
+
import { createSessionVault } from '../auth/session-vault.js';
|
|
20
|
+
import { createShadowSessionOrchestrator } from '../execution/shadow-session.js';
|
|
21
|
+
import type { SerializedSession, EncryptedBlob } from '../core/types.js';
|
|
22
|
+
|
|
23
|
+
// =============================================================================
|
|
24
|
+
// Test Setup
|
|
25
|
+
// =============================================================================
|
|
26
|
+
|
|
27
|
+
/**
 * Executor under test. It is reassigned by the `beforeEach` hook below, so
 * each test starts with a newly constructed instance.
 */
let executor: ParallelExecutor;

beforeEach(() => {
  // A single vault instance is handed to both the orchestrator and the executor.
  const sessionVault = createSessionVault();
  const sessionOrchestrator = createShadowSessionOrchestrator({
    vault: sessionVault,
    heartbeatIntervalMs: 60 * 1000,
    sessionTimeoutMs: 60 * 60 * 1000,
  });

  executor = createParallelExecutor({
    sessionOrchestrator,
    sessionVault,
    maxConcurrentCompetitions: 10,
    defaultTimeoutMs: 5 * 1000,
  });
});

afterEach(async () => {
  // Invoke the executor's own `clear()` once each test has finished.
  await executor.clear();
});
|
|
48
|
+
|
|
49
|
+
// =============================================================================
|
|
50
|
+
// Arbitraries
|
|
51
|
+
// =============================================================================
|
|
52
|
+
|
|
53
|
+
/** Bounty IDs: the literal prefix `bounty_` followed by 8 to 16 characters from `[a-z0-9]`. */
const bountyIdArbitrary = fc.stringMatching(/^bounty_[a-z0-9]{8,16}$/);

/**
 * Domain names: a lowercase letter, then 2 to 20 characters from `[a-z0-9-]`,
 * then one of the TLDs `.com`, `.org`, `.net` or `.io`.
 */
const domainArbitrary = fc.stringMatching(/^[a-z][a-z0-9-]{2,20}\.(com|org|net|io)$/);

/** Number of agents per competition, between 2 and 10 inclusive. */
const agentCountArbitrary = fc.integer({ min: 2, max: 10 });

/** Agent IDs: the literal prefix `agent_` followed by 8 to 16 characters from `[a-z0-9]`. */
const agentIdArbitrary = fc.stringMatching(/^agent_[a-z0-9]{8,16}$/);

/** Builds one string arbitrary (0 to 100 characters) for a single session-data field. */
const sessionFieldArbitrary = () => fc.string({ minLength: 0, maxLength: 100 });

/**
 * Plain-text session payload with three independently generated string fields:
 * `cookies`, `localStorage` and `sessionStorage`.
 */
const sessionDataArbitrary = fc.record({
  cookies: sessionFieldArbitrary(),
  localStorage: sessionFieldArbitrary(),
  sessionStorage: sessionFieldArbitrary(),
});
|
|
81
|
+
|
|
82
|
+
// =============================================================================
|
|
83
|
+
// Property 15: Identical Starting States
|
|
84
|
+
// =============================================================================
|
|
85
|
+
|
|
86
|
+
describe('Property 15: Identical Starting States', () => {
|
|
87
|
+
/**
 * **Feature: omnibridge, Property 15: Identical Starting States**
 *
 * *For any* Competition with N agents, all N Shadow_Sessions SHALL have
 * byte-identical `startState` serializations.
 *
 * **Validates: Requirements 7.2**
 *
 * For every generated (bountyId, domain, agentCount) the test creates a
 * competition, checks that one session exists per agent, and checks that
 * re-serializing the competition's `startState` always yields the same JSON.
 */
test(
  'Property 15: all agents in a competition have byte-identical starting states',
  async () => {
    await fc.assert(
      fc.asyncProperty(
        bountyIdArbitrary,
        domainArbitrary,
        agentCountArbitrary,
        async (bountyId, domain, agentCount) => {
          const config: CompetitionConfig = {
            bountyId,
            agentCount,
            targetDomain: domain,
          };

          const result = await executor.createCompetition(config);

          expect(result.success).toBe(true);
          expect(result.competition).toBeDefined();

          const competition = result.competition!;

          // All runs (and shrink attempts) of this property share one executor,
          // so the competition is released in `finally` even when an assertion
          // throws; otherwise failed runs would leave competitions behind.
          try {
            // One session per requested agent.
            expect(competition.sessions.length).toBe(agentCount);

            // Reference serialization of the shared start state.
            const startState = competition.startState;
            const startStateStr = JSON.stringify(startState);

            // NOTE(review): the sessions themselves are not inspected here; the
            // shared startState is re-serialized once per session and compared
            // against the reference string.
            for (let i = 0; i < competition.sessions.length; i++) {
              const sessionStartState = executor.serializeStartState(startState);
              const sessionStartStateStr = JSON.stringify(sessionStartState);

              // Byte-identical check
              expect(sessionStartStateStr).toBe(startStateStr);
            }

            // Two independent serializations must also satisfy the executor's
            // own areStatesIdentical helper.
            for (let i = 0; i < competition.sessions.length - 1; i++) {
              const state1 = executor.serializeStartState(startState);
              const state2 = executor.serializeStartState(startState);
              expect(executor.areStatesIdentical(state1, state2)).toBe(true);
            }
          } finally {
            await executor.cleanupCompetition(competition.id);
          }

          return true;
        }
      ),
      { numRuns: 100 }
    );
  }
);
|
|
152
|
+
|
|
153
|
+
/**
 * Custom initial state should be preserved identically for all agents.
 *
 * A `SerializedSession` is built from generated plain-text fields (encrypted
 * with a vault created inside this test) and passed as `initialState`. The
 * competition's `startState` must serialize to the same JSON as that custom
 * state, and so must every copy produced by `serializeStartState`.
 */
test(
  'custom initial state is byte-identical for all agents',
  async () => {
    // Vault used only to produce the encrypted blobs for the custom state.
    const vault = createSessionVault();

    await fc.assert(
      fc.asyncProperty(
        bountyIdArbitrary,
        domainArbitrary,
        agentCountArbitrary,
        sessionDataArbitrary,
        async (bountyId, domain, agentCount, sessionData) => {
          // Create a custom initial state that expires 24 hours from now.
          const customState: SerializedSession = {
            cookies: vault.encrypt(sessionData.cookies),
            localStorage: vault.encrypt(sessionData.localStorage),
            sessionStorage: vault.encrypt(sessionData.sessionStorage),
            expiresAt: Date.now() + 86400000,
          };

          const config: CompetitionConfig = {
            bountyId,
            agentCount,
            targetDomain: domain,
            initialState: customState,
          };

          const result = await executor.createCompetition(config);

          expect(result.success).toBe(true);
          expect(result.competition).toBeDefined();

          const competition = result.competition!;

          // Cleanup lives in `finally` so a failing assertion does not leave
          // the competition registered on the executor shared by all runs.
          try {
            // The start state should be a serialized copy of the custom state.
            const startStateStr = JSON.stringify(competition.startState);
            const customStateStr = JSON.stringify(customState);
            expect(startStateStr).toBe(customStateStr);

            // One serialized copy per agent; each must match the reference.
            for (let i = 0; i < agentCount; i++) {
              const copy = executor.serializeStartState(competition.startState);
              expect(JSON.stringify(copy)).toBe(startStateStr);
            }
          } finally {
            await executor.cleanupCompetition(competition.id);
          }

          return true;
        }
      ),
      { numRuns: 100 }
    );
  }
);
|
|
213
|
+
|
|
214
|
+
/**
 * Sessions in a competition should all be isolated from each other.
 *
 * Every unordered pair of sessions in the created competition is passed to
 * the orchestrator's `areSessionsIsolated`, which must report `true`.
 */
test(
  'all sessions in a competition are isolated',
  async () => {
    await fc.assert(
      fc.asyncProperty(
        bountyIdArbitrary,
        domainArbitrary,
        agentCountArbitrary,
        async (bountyId, domain, agentCount) => {
          const config: CompetitionConfig = {
            bountyId,
            agentCount,
            targetDomain: domain,
          };

          const result = await executor.createCompetition(config);

          expect(result.success).toBe(true);
          expect(result.competition).toBeDefined();

          const competition = result.competition!;

          // Release the competition even if a pair check fails, so later runs
          // on the same executor are not affected by leftovers.
          try {
            const orchestrator = executor.getSessionOrchestrator();
            const { sessions } = competition;

            // Check all pairs (i < j) of sessions are isolated.
            for (let i = 0; i < sessions.length; i++) {
              for (let j = i + 1; j < sessions.length; j++) {
                const session1 = sessions[i];
                const session2 = sessions[j];

                expect(orchestrator.areSessionsIsolated(session1.id, session2.id)).toBe(true);
              }
            }
          } finally {
            await executor.cleanupCompetition(competition.id);
          }

          return true;
        }
      ),
      { numRuns: 100 }
    );
  }
);
|
|
260
|
+
});
|
|
261
|
+
|
|
262
|
+
// =============================================================================
|
|
263
|
+
// Property 17: Crash Isolation
|
|
264
|
+
// =============================================================================
|
|
265
|
+
|
|
266
|
+
describe('Property 17: Crash Isolation', () => {
|
|
267
|
+
/**
 * **Feature: omnibridge, Property 17: Crash Isolation**
 *
 * *For any* Competition where agent A's session crashes,
 * all other agents' sessions SHALL continue executing and return results independently.
 *
 * **Validates: Requirements 7.6**
 *
 * Exactly one task (chosen by `crashIndex`) throws; every other task returns a
 * `completed` result. Outcomes are collected through `onAgentComplete`.
 */
test(
  'Property 17: crashed session does not affect other sessions',
  async () => {
    await fc.assert(
      fc.asyncProperty(
        bountyIdArbitrary,
        domainArbitrary,
        fc.integer({ min: 3, max: 8 }), // Need at least 3 agents to test isolation
        fc.integer({ min: 0, max: 7 }), // Index of agent to crash
        async (bountyId, domain, agentCount, crashIndexRaw) => {
          // Fold the raw index into the valid range [0, agentCount).
          const crashIndex = crashIndexRaw % agentCount;

          const config: CompetitionConfig = {
            bountyId,
            agentCount,
            targetDomain: domain,
          };

          const result = await executor.createCompetition(config);

          expect(result.success).toBe(true);
          expect(result.competition).toBeDefined();

          const competition = result.competition!;

          // Cleanup is in `finally` so a failed expectation does not leave the
          // competition behind on the executor shared across property runs.
          try {
            // Create execution tasks - one will crash
            const tasks: AgentExecutionTask[] = competition.sessions.map((session, index) => ({
              agentId: `agent_${index}`,
              sessionId: session.id,
              execute: async (): Promise<AgentExecutionResult> => {
                if (index === crashIndex) {
                  // Simulate a crash
                  throw new Error('Simulated crash');
                }

                // Simulate successful execution
                return {
                  agentId: `agent_${index}`,
                  sessionId: session.id,
                  status: 'completed',
                  actionLog: [],
                  executionTimeMs: 100,
                  result: {
                    data: { success: true },
                    metadata: {
                      confidence: 1.0,
                      executionTimeMs: 100,
                      actionsPerformed: 1,
                      triangulationHeals: 0,
                    },
                    verificationHash: 'test_hash',
                  },
                };
              },
            }));

            // Agent IDs bucketed by the status reported to onAgentComplete.
            const completedResults: string[] = [];
            const failedResults: string[] = [];

            // Execute all tasks
            await executor.startAll(competition, tasks, {
              timeoutMs: 5000,
              onAgentComplete: (agentResult) => {
                if (agentResult.status === 'completed') {
                  completedResults.push(agentResult.agentId);
                } else {
                  failedResults.push(agentResult.agentId);
                }
              },
            });

            // Verify crash isolation:
            // 1. The crashed agent should be marked as failed
            expect(failedResults).toContain(`agent_${crashIndex}`);

            // 2. All other agents should have completed successfully
            for (let i = 0; i < agentCount; i++) {
              if (i !== crashIndex) {
                expect(completedResults).toContain(`agent_${i}`);
              }
            }

            // 3. Total completed should be agentCount - 1
            expect(completedResults.length).toBe(agentCount - 1);

            // 4. Other sessions should not be affected by the crash
            expect(
              executor.areOtherSessionsAffected(competition.id, competition.sessions[crashIndex].id)
            ).toBe(false);
          } finally {
            await executor.cleanupCompetition(competition.id);
          }

          return true;
        }
      ),
      { numRuns: 100 }
    );
  }
);
|
|
374
|
+
|
|
375
|
+
/**
 * Multiple crashes should not cascade to other agents.
 *
 * The first `actualCrashCount` agents (by index) throw; the rest return a
 * `completed` result. The counts reported through `onAgentComplete` must
 * match exactly.
 */
test(
  'multiple crashes do not cascade to other agents',
  async () => {
    await fc.assert(
      fc.asyncProperty(
        bountyIdArbitrary,
        domainArbitrary,
        fc.integer({ min: 5, max: 10 }), // Need enough agents
        fc.integer({ min: 1, max: 3 }), // Number of crashes
        async (bountyId, domain, agentCount, crashCount) => {
          // Ensure we don't crash more agents than we have
          const actualCrashCount = Math.min(crashCount, agentCount - 1);

          const config: CompetitionConfig = {
            bountyId,
            agentCount,
            targetDomain: domain,
          };

          const result = await executor.createCompetition(config);

          expect(result.success).toBe(true);
          expect(result.competition).toBeDefined();

          const competition = result.competition!;

          // Cleanup is in `finally` so a failed expectation does not leave the
          // competition behind on the executor shared across property runs.
          try {
            // Agents with index 0 .. actualCrashCount - 1 are the ones that crash.
            const willCrash = (index: number): boolean => index < actualCrashCount;

            // Create execution tasks
            const tasks: AgentExecutionTask[] = competition.sessions.map((session, index) => ({
              agentId: `agent_${index}`,
              sessionId: session.id,
              execute: async (): Promise<AgentExecutionResult> => {
                if (willCrash(index)) {
                  throw new Error(`Simulated crash for agent ${index}`);
                }

                return {
                  agentId: `agent_${index}`,
                  sessionId: session.id,
                  status: 'completed',
                  actionLog: [],
                  executionTimeMs: 50,
                  result: {
                    data: { success: true },
                    metadata: {
                      confidence: 1.0,
                      executionTimeMs: 50,
                      actionsPerformed: 1,
                      triangulationHeals: 0,
                    },
                    verificationHash: 'test_hash',
                  },
                };
              },
            }));

            let completedCount = 0;
            let failedCount = 0;

            await executor.startAll(competition, tasks, {
              timeoutMs: 5000,
              onAgentComplete: (agentResult) => {
                if (agentResult.status === 'completed') {
                  completedCount++;
                } else {
                  failedCount++;
                }
              },
            });

            // Verify:
            // 1. Exactly actualCrashCount agents failed
            expect(failedCount).toBe(actualCrashCount);

            // 2. All other agents completed successfully
            expect(completedCount).toBe(agentCount - actualCrashCount);
          } finally {
            await executor.cleanupCompetition(competition.id);
          }

          return true;
        }
      ),
      { numRuns: 100 }
    );
  }
);
|
|
470
|
+
|
|
471
|
+
/**
 * Timeout should be treated as a crash and isolated.
 *
 * Agent 0 waits 500 ms before returning while `startAll` is given a 50 ms
 * timeout; every other agent returns immediately. Exactly one `timeout`
 * status and `agentCount - 1` `completed` statuses are expected.
 */
test(
  'timeout is isolated like a crash',
  { timeout: 30000 },
  async () => {
    await fc.assert(
      fc.asyncProperty(
        bountyIdArbitrary,
        domainArbitrary,
        fc.integer({ min: 3, max: 6 }),
        async (bountyId, domain, agentCount) => {
          const config: CompetitionConfig = {
            bountyId,
            agentCount,
            targetDomain: domain,
          };

          const result = await executor.createCompetition(config);

          expect(result.success).toBe(true);
          expect(result.competition).toBeDefined();

          const competition = result.competition!;

          // Cleanup is in `finally` so a failed expectation does not leave the
          // competition behind on the executor shared across property runs.
          try {
            // First agent will timeout, others complete quickly
            const tasks: AgentExecutionTask[] = competition.sessions.map((session, index) => ({
              agentId: `agent_${index}`,
              sessionId: session.id,
              execute: async (): Promise<AgentExecutionResult> => {
                if (index === 0) {
                  // Simulate a long-running task that will timeout
                  await new Promise((resolve) => setTimeout(resolve, 500));
                }

                return {
                  agentId: `agent_${index}`,
                  sessionId: session.id,
                  status: 'completed',
                  actionLog: [],
                  executionTimeMs: 10,
                  result: {
                    data: { success: true },
                    metadata: {
                      confidence: 1.0,
                      executionTimeMs: 10,
                      actionsPerformed: 1,
                      triangulationHeals: 0,
                    },
                    verificationHash: 'test_hash',
                  },
                };
              },
            }));

            let completedCount = 0;
            let timeoutCount = 0;

            await executor.startAll(competition, tasks, {
              timeoutMs: 50, // Very short timeout to trigger timeout quickly
              onAgentComplete: (agentResult) => {
                // Any status other than these two is deliberately not counted.
                if (agentResult.status === 'completed') {
                  completedCount++;
                } else if (agentResult.status === 'timeout') {
                  timeoutCount++;
                }
              },
            });

            // Agent 0 should timeout
            expect(timeoutCount).toBe(1);

            // All other agents should complete
            expect(completedCount).toBe(agentCount - 1);
          } finally {
            await executor.cleanupCompetition(competition.id);
          }

          return true;
        }
      ),
      { numRuns: 20 } // Reduced runs for timeout test
    );
  }
);
|
|
557
|
+
});
|
|
558
|
+
|
|
559
|
+
// =============================================================================
|
|
560
|
+
// Additional Property Tests
|
|
561
|
+
// =============================================================================
|
|
562
|
+
|
|
563
|
+
describe('Competition Creation', () => {
|
|
564
|
+
/**
 * Competition should create the correct number of sessions.
 *
 * For each generated `agentCount`, the created competition must expose exactly
 * that many entries in `sessions`.
 */
test(
  'competition creates correct number of sessions',
  async () => {
    await fc.assert(
      fc.asyncProperty(
        bountyIdArbitrary,
        domainArbitrary,
        agentCountArbitrary,
        async (bountyId, domain, agentCount) => {
          const config: CompetitionConfig = {
            bountyId,
            agentCount,
            targetDomain: domain,
          };

          const result = await executor.createCompetition(config);

          expect(result.success).toBe(true);
          expect(result.competition).toBeDefined();

          const competition = result.competition!;

          // Release the competition even when the length check fails, so the
          // executor shared by all runs is not left holding it.
          try {
            expect(competition.sessions.length).toBe(agentCount);
          } finally {
            await executor.cleanupCompetition(competition.id);
          }

          return true;
        }
      ),
      { numRuns: 100 }
    );
  }
);
|
|
598
|
+
|
|
599
|
+
/**
 * Competition with custom agent IDs should use those IDs.
 *
 * Generates 2 to 5 distinct agent IDs, passes them as `agentIds`, and checks
 * that the competition contains one session per supplied ID.
 *
 * NOTE(review): only the session count is asserted. Checking that each session
 * is actually bound to its supplied ID needs a field on the session object
 * that is not visible from this test file; confirm and extend.
 */
test(
  'competition uses provided agent IDs',
  async () => {
    await fc.assert(
      fc.asyncProperty(
        bountyIdArbitrary,
        domainArbitrary,
        // uniqueArray guarantees distinct IDs up front, so no run is skipped
        // because de-duplication left fewer than two IDs.
        fc.uniqueArray(agentIdArbitrary, { minLength: 2, maxLength: 5 }),
        async (bountyId, domain, agentIds) => {
          const config: CompetitionConfig = {
            bountyId,
            agentCount: agentIds.length,
            targetDomain: domain,
            agentIds,
          };

          const result = await executor.createCompetition(config);

          expect(result.success).toBe(true);
          expect(result.competition).toBeDefined();

          const competition = result.competition!;

          // Release the competition even when the assertion fails, so the
          // executor shared by all runs is not left holding it.
          try {
            expect(competition.sessions.length).toBe(agentIds.length);
          } finally {
            await executor.cleanupCompetition(competition.id);
          }

          return true;
        }
      ),
      { numRuns: 100 }
    );
  }
);
|
|
640
|
+
|
|
641
|
+
/**
 * A zero or negative agent count must yield an unsuccessful result that
 * carries an error, rather than a created competition.
 */
test('invalid agent count fails gracefully', async () => {
  // Counts from -10 up to and including 0.
  const nonPositiveCount = fc.integer({ min: -10, max: 0 });

  const rejectsInvalidCount = fc.asyncProperty(
    bountyIdArbitrary,
    domainArbitrary,
    nonPositiveCount,
    async (bountyId, targetDomain, agentCount) => {
      const request: CompetitionConfig = { bountyId, agentCount, targetDomain };
      const outcome = await executor.createCompetition(request);

      expect(outcome.success).toBe(false);
      expect(outcome.error).toBeDefined();

      return true;
    }
  );

  await fc.assert(rejectsInvalidCount, { numRuns: 100 });
});
|
|
671
|
+
});
|