outcome-cli 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113):
  1. package/README.md +261 -0
  2. package/package.json +95 -0
  3. package/src/agents/README.md +139 -0
  4. package/src/agents/adapters/anthropic.adapter.ts +166 -0
  5. package/src/agents/adapters/dalle.adapter.ts +145 -0
  6. package/src/agents/adapters/gemini.adapter.ts +134 -0
  7. package/src/agents/adapters/imagen.adapter.ts +106 -0
  8. package/src/agents/adapters/nano-banana.adapter.ts +129 -0
  9. package/src/agents/adapters/openai.adapter.ts +165 -0
  10. package/src/agents/adapters/veo.adapter.ts +130 -0
  11. package/src/agents/agent.schema.property.test.ts +379 -0
  12. package/src/agents/agent.schema.test.ts +148 -0
  13. package/src/agents/agent.schema.ts +263 -0
  14. package/src/agents/index.ts +60 -0
  15. package/src/agents/registered-agent.schema.ts +356 -0
  16. package/src/agents/registry.ts +97 -0
  17. package/src/agents/tournament-configs.property.test.ts +266 -0
  18. package/src/cli/README.md +145 -0
  19. package/src/cli/commands/define.ts +79 -0
  20. package/src/cli/commands/list.ts +46 -0
  21. package/src/cli/commands/logs.ts +83 -0
  22. package/src/cli/commands/run.ts +416 -0
  23. package/src/cli/commands/verify.ts +110 -0
  24. package/src/cli/index.ts +81 -0
  25. package/src/config/README.md +128 -0
  26. package/src/config/env.ts +262 -0
  27. package/src/config/index.ts +19 -0
  28. package/src/eval/README.md +318 -0
  29. package/src/eval/ai-judge.test.ts +435 -0
  30. package/src/eval/ai-judge.ts +368 -0
  31. package/src/eval/code-validators.ts +414 -0
  32. package/src/eval/evaluateOutcome.property.test.ts +1174 -0
  33. package/src/eval/evaluateOutcome.ts +591 -0
  34. package/src/eval/immigration-validators.ts +122 -0
  35. package/src/eval/index.ts +90 -0
  36. package/src/eval/judge-cache.ts +402 -0
  37. package/src/eval/tournament-validators.property.test.ts +439 -0
  38. package/src/eval/validators.property.test.ts +1118 -0
  39. package/src/eval/validators.ts +1199 -0
  40. package/src/eval/weighted-scorer.ts +285 -0
  41. package/src/index.ts +17 -0
  42. package/src/league/README.md +188 -0
  43. package/src/league/health-check.ts +353 -0
  44. package/src/league/index.ts +93 -0
  45. package/src/league/killAgent.ts +151 -0
  46. package/src/league/league.test.ts +1151 -0
  47. package/src/league/runLeague.ts +843 -0
  48. package/src/league/scoreAgent.ts +175 -0
  49. package/src/modules/omnibridge/__tests__/.gitkeep +1 -0
  50. package/src/modules/omnibridge/__tests__/auth-tunnel.property.test.ts +524 -0
  51. package/src/modules/omnibridge/__tests__/deterministic-logger.property.test.ts +965 -0
  52. package/src/modules/omnibridge/__tests__/ghost-api.property.test.ts +461 -0
  53. package/src/modules/omnibridge/__tests__/omnibridge-integration.test.ts +542 -0
  54. package/src/modules/omnibridge/__tests__/parallel-executor.property.test.ts +671 -0
  55. package/src/modules/omnibridge/__tests__/semantic-normalizer.property.test.ts +521 -0
  56. package/src/modules/omnibridge/__tests__/semantic-normalizer.test.ts +254 -0
  57. package/src/modules/omnibridge/__tests__/session-vault.property.test.ts +367 -0
  58. package/src/modules/omnibridge/__tests__/shadow-session.property.test.ts +523 -0
  59. package/src/modules/omnibridge/__tests__/triangulation-engine.property.test.ts +292 -0
  60. package/src/modules/omnibridge/__tests__/verification-engine.property.test.ts +769 -0
  61. package/src/modules/omnibridge/api/.gitkeep +1 -0
  62. package/src/modules/omnibridge/api/ghost-api.ts +1087 -0
  63. package/src/modules/omnibridge/auth/.gitkeep +1 -0
  64. package/src/modules/omnibridge/auth/auth-tunnel.ts +843 -0
  65. package/src/modules/omnibridge/auth/session-vault.ts +577 -0
  66. package/src/modules/omnibridge/core/.gitkeep +1 -0
  67. package/src/modules/omnibridge/core/semantic-normalizer.ts +702 -0
  68. package/src/modules/omnibridge/core/triangulation-engine.ts +530 -0
  69. package/src/modules/omnibridge/core/types.ts +610 -0
  70. package/src/modules/omnibridge/execution/.gitkeep +1 -0
  71. package/src/modules/omnibridge/execution/deterministic-logger.ts +629 -0
  72. package/src/modules/omnibridge/execution/parallel-executor.ts +542 -0
  73. package/src/modules/omnibridge/execution/shadow-session.ts +794 -0
  74. package/src/modules/omnibridge/index.ts +212 -0
  75. package/src/modules/omnibridge/omnibridge.ts +510 -0
  76. package/src/modules/omnibridge/verification/.gitkeep +1 -0
  77. package/src/modules/omnibridge/verification/verification-engine.ts +783 -0
  78. package/src/outcomes/README.md +75 -0
  79. package/src/outcomes/acquire-pilot-customer.ts +297 -0
  80. package/src/outcomes/code-delivery-outcomes.ts +89 -0
  81. package/src/outcomes/code-outcomes.ts +256 -0
  82. package/src/outcomes/code_review_battle.test.ts +135 -0
  83. package/src/outcomes/code_review_battle.ts +135 -0
  84. package/src/outcomes/cold_email_battle.ts +97 -0
  85. package/src/outcomes/content_creation_battle.ts +160 -0
  86. package/src/outcomes/f1_stem_opt_compliance.ts +61 -0
  87. package/src/outcomes/index.ts +107 -0
  88. package/src/outcomes/lead_gen_battle.test.ts +113 -0
  89. package/src/outcomes/lead_gen_battle.ts +99 -0
  90. package/src/outcomes/outcome.schema.property.test.ts +229 -0
  91. package/src/outcomes/outcome.schema.ts +187 -0
  92. package/src/outcomes/qualified_sales_interest.ts +118 -0
  93. package/src/outcomes/swarm_planner.property.test.ts +370 -0
  94. package/src/outcomes/swarm_planner.ts +96 -0
  95. package/src/outcomes/web_extraction.ts +234 -0
  96. package/src/runtime/README.md +220 -0
  97. package/src/runtime/agentRunner.test.ts +341 -0
  98. package/src/runtime/agentRunner.ts +746 -0
  99. package/src/runtime/claudeAdapter.ts +232 -0
  100. package/src/runtime/costTracker.ts +123 -0
  101. package/src/runtime/index.ts +34 -0
  102. package/src/runtime/modelAdapter.property.test.ts +305 -0
  103. package/src/runtime/modelAdapter.ts +144 -0
  104. package/src/runtime/openaiAdapter.ts +235 -0
  105. package/src/utils/README.md +122 -0
  106. package/src/utils/command-runner.ts +134 -0
  107. package/src/utils/cost-guard.ts +379 -0
  108. package/src/utils/errors.test.ts +290 -0
  109. package/src/utils/errors.ts +442 -0
  110. package/src/utils/index.ts +37 -0
  111. package/src/utils/logger.test.ts +361 -0
  112. package/src/utils/logger.ts +419 -0
  113. package/src/utils/output-parsers.ts +216 -0
@@ -0,0 +1,435 @@
1
+ /**
2
+ * AI Judge Unit Tests
3
+ *
4
+ * Tests for the AI-powered evaluation system.
5
+ *
6
+ * @module eval/ai-judge.test
7
+ */
8
+
9
+ import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest';
10
+ import {
11
+ hashArtifact,
12
+ validateJudgeConfig,
13
+ AIJudgeError,
14
+ evaluateWithAIJudge,
15
+ type JudgeConfig,
16
+ } from './ai-judge.js';
17
+ import {
18
+ InMemoryJudgeCache,
19
+ resetJudgeCache,
20
+ setJudgeCache,
21
+ type JudgeResult,
22
+ } from './judge-cache.js';
23
+
24
/**
 * Test suite for the exports of `./ai-judge.js`: artifact hashing, judge
 * config validation, the AIJudgeError class, and the cache-hit path of
 * evaluateWithAIJudge.
 */
describe('AI Judge', () => {
  /** Expected digest length; the original suite describes it as the SHA-256 hex length. */
  const DIGEST_HEX_LENGTH = 64;

  /**
   * Produces the baseline judge config used throughout this suite, with any
   * per-test overrides applied on top.
   */
  const buildConfig = (overrides: Partial<JudgeConfig> = {}): JudgeConfig => ({
    model: 'gpt-4o',
    rubric: 'Evaluate for quality',
    maxScore: 10,
    ...overrides,
  });

  describe('hashArtifact', () => {
    it('should generate consistent hash for same artifact and rubric', () => {
      const payload = { message: 'Hello world', quality: 'high' };
      const criteria = 'Evaluate for clarity';

      const first = hashArtifact(payload, criteria);
      const second = hashArtifact(payload, criteria);

      expect(first).toBe(second);
      expect(first).toHaveLength(DIGEST_HEX_LENGTH);
    });

    it('should generate different hash for different artifacts', () => {
      const criteria = 'Evaluate for clarity';

      const helloDigest = hashArtifact({ message: 'Hello world' }, criteria);
      const goodbyeDigest = hashArtifact({ message: 'Goodbye world' }, criteria);

      expect(helloDigest).not.toBe(goodbyeDigest);
    });

    it('should generate different hash for different rubrics', () => {
      const payload = { message: 'Hello world' };

      const clarityDigest = hashArtifact(payload, 'Evaluate for clarity');
      const professionalismDigest = hashArtifact(payload, 'Evaluate for professionalism');

      expect(clarityDigest).not.toBe(professionalismDigest);
    });

    it('should handle string artifacts', () => {
      const digest = hashArtifact('Simple string artifact', 'Evaluate this');

      expect(digest).toHaveLength(DIGEST_HEX_LENGTH);
    });

    it('should handle complex nested artifacts', () => {
      const nested = {
        level1: { level2: { level3: ['a', 'b', 'c'] } },
        numbers: [1, 2, 3],
      };

      const digest = hashArtifact(nested, 'Evaluate structure');

      expect(digest).toHaveLength(DIGEST_HEX_LENGTH);
    });
  });

  describe('validateJudgeConfig', () => {
    // Configs that validateJudgeConfig is expected to accept (it returns true).
    const acceptedCases: ReadonlyArray<{ title: string; overrides: Partial<JudgeConfig> }> = [
      { title: 'should accept valid gpt-4o config', overrides: {} },
      {
        title: 'should accept valid claude-opus config',
        overrides: { model: 'claude-opus', maxScore: 100 },
      },
      { title: 'should accept config with optional temperature', overrides: { temperature: 0.5 } },
      { title: 'should accept config with optional maxTokens', overrides: { maxTokens: 2048 } },
    ];

    // Configs that validateJudgeConfig is expected to reject, paired with the
    // substring the thrown error message must contain.
    const rejectedCases: ReadonlyArray<{
      title: string;
      overrides: Partial<JudgeConfig>;
      message: string;
    }> = [
      {
        title: 'should reject invalid model',
        // The assertion deliberately smuggles an unsupported model past the type checker.
        overrides: { model: 'invalid-model' as 'gpt-4o' },
        message: 'Invalid judge model',
      },
      {
        title: 'should reject empty rubric',
        overrides: { rubric: '' },
        message: 'Rubric must be a non-empty string',
      },
      {
        title: 'should reject whitespace-only rubric',
        overrides: { rubric: ' ' },
        message: 'Rubric must be a non-empty string',
      },
      {
        title: 'should reject zero maxScore',
        overrides: { maxScore: 0 },
        message: 'maxScore must be a positive number',
      },
      {
        title: 'should reject negative maxScore',
        overrides: { maxScore: -5 },
        message: 'maxScore must be a positive number',
      },
      {
        title: 'should reject temperature below 0',
        overrides: { temperature: -0.1 },
        message: 'temperature must be between 0 and 2',
      },
      {
        title: 'should reject temperature above 2',
        overrides: { temperature: 2.5 },
        message: 'temperature must be between 0 and 2',
      },
      {
        title: 'should reject maxTokens below 1',
        overrides: { maxTokens: 0 },
        message: 'maxTokens must be between 1 and 4096',
      },
      {
        title: 'should reject maxTokens above 4096',
        overrides: { maxTokens: 5000 },
        message: 'maxTokens must be between 1 and 4096',
      },
    ];

    for (const { title, overrides } of acceptedCases) {
      it(title, () => {
        expect(validateJudgeConfig(buildConfig(overrides))).toBe(true);
      });
    }

    for (const { title, overrides, message } of rejectedCases) {
      it(title, () => {
        const invalid = buildConfig(overrides);

        expect(() => validateJudgeConfig(invalid)).toThrow(message);
      });
    }
  });

  describe('AIJudgeError', () => {
    it('should create error with model information', () => {
      const failure = new AIJudgeError('Test error', 'gpt-4o');

      expect(failure.message).toBe('Test error');
      expect(failure.model).toBe('gpt-4o');
      expect(failure.name).toBe('AIJudgeError');
    });

    it('should include cause when provided', () => {
      const rootCause = new Error('Original error');
      const wrapped = new AIJudgeError('Wrapped error', 'claude-opus', rootCause);

      expect(wrapped.message).toBe('Wrapped error');
      expect(wrapped.model).toBe('claude-opus');
      expect(wrapped.cause).toBe(rootCause);
    });
  });

  describe('evaluateWithAIJudge - caching', () => {
    let cache: InMemoryJudgeCache;

    /**
     * Stores a verdict in the cache under the key derived from the artifact
     * and rubric, then returns the artifact/config pair so the test can call
     * evaluateWithAIJudge with the same inputs.
     */
    const seedCachedVerdict = async (
      highlights: JudgeResult['highlights'],
    ): Promise<{ artifact: { message: string }; config: JudgeConfig }> => {
      const artifact = { message: 'Test message' };
      const config = buildConfig();

      const key = hashArtifact(artifact, config.rubric);
      const stored: JudgeResult = {
        score: 8,
        normalizedScore: 0.8,
        reasoning: 'Good quality message',
        highlights,
        model: 'gpt-4o',
        // Stored as "not cached"; the first test expects the returned copy to report cached: true.
        cached: false,
        evaluatedAt: new Date().toISOString(),
      };
      await cache.set(key, stored);

      return { artifact, config };
    };

    beforeEach(() => {
      cache = new InMemoryJudgeCache();
      setJudgeCache(cache);
    });

    afterEach(() => {
      resetJudgeCache();
    });

    it('should return cached result when available', async () => {
      const { artifact, config } = await seedCachedVerdict(['Clear', 'Concise']);

      const verdict = await evaluateWithAIJudge(artifact, config, cache);

      expect(verdict.cached).toBe(true);
      expect(verdict.score).toBe(8);
      expect(verdict.reasoning).toBe('Good quality message');
    });

    it('should track cache statistics', async () => {
      const { artifact, config } = await seedCachedVerdict([]);

      // The seeded entry makes this single evaluation a cache hit.
      await evaluateWithAIJudge(artifact, config, cache);

      const { hits, misses } = await cache.stats();
      expect(hits).toBe(1);
      expect(misses).toBe(0);
    });
  });
});
313
+
314
/**
 * Test suite for InMemoryJudgeCache: set/get round-trips, deletion, clearing,
 * hit/miss statistics, and TTL-based expiry.
 */
describe('InMemoryJudgeCache', () => {
  let cache: InMemoryJudgeCache;

  /**
   * Builds the judge result stored by these tests; `overrides` replaces
   * individual fields where a test needs a different value.
   */
  const makeResult = (overrides: Partial<JudgeResult> = {}): JudgeResult => ({
    score: 8,
    normalizedScore: 0.8,
    reasoning: 'Good quality',
    highlights: [],
    model: 'gpt-4o',
    cached: false,
    evaluatedAt: new Date().toISOString(),
    ...overrides,
  });

  beforeEach(() => {
    cache = new InMemoryJudgeCache();
  });

  afterEach(() => {
    // Always restore the real clock, even when a fake-timer test fails
    // part-way through, so later tests are not affected.
    vi.useRealTimers();
  });

  it('should store and retrieve results', async () => {
    await cache.set('test-key', makeResult({ highlights: ['Clear'] }));
    const retrieved = await cache.get('test-key');

    expect(retrieved).not.toBeNull();
    expect(retrieved?.score).toBe(8);
    expect(retrieved?.reasoning).toBe('Good quality');
  });

  it('should return null for missing keys', async () => {
    const result = await cache.get('nonexistent-key');
    expect(result).toBeNull();
  });

  it('should delete entries', async () => {
    await cache.set('test-key', makeResult());
    const deleted = await cache.delete('test-key');
    const retrieved = await cache.get('test-key');

    expect(deleted).toBe(true);
    expect(retrieved).toBeNull();
  });

  it('should clear all entries', async () => {
    const result = makeResult();

    await cache.set('key1', result);
    await cache.set('key2', result);
    await cache.clear();

    const stats = await cache.stats();
    expect(stats.size).toBe(0);
  });

  it('should track hit/miss statistics', async () => {
    await cache.set('existing-key', makeResult());

    // One hit
    await cache.get('existing-key');
    // Two misses
    await cache.get('missing-key-1');
    await cache.get('missing-key-2');

    const stats = await cache.stats();
    expect(stats.hits).toBe(1);
    expect(stats.misses).toBe(2);
    expect(stats.hitRate).toBeCloseTo(1 / 3, 2);
  });

  it('should expire entries after TTL', async () => {
    // Use a mocked clock instead of sleeping on the real one: the test stays
    // fast and does not depend on scheduler timing.
    // NOTE(review): assumes the cache reads time via Date/timers — confirm
    // against judge-cache.ts.
    vi.useFakeTimers();

    // Constructor argument is the TTL; the original test describes 1 as "1 second".
    const shortTtlCache = new InMemoryJudgeCache(1);

    await shortTtlCache.set('test-key', makeResult());

    // Should be available immediately
    const immediate = await shortTtlCache.get('test-key');
    expect(immediate).not.toBeNull();

    // Move the mocked clock past the TTL (same 1100 ms margin the original waited).
    vi.advanceTimersByTime(1100);

    // Should be expired now
    const expired = await shortTtlCache.get('test-key');
    expect(expired).toBeNull();
  });
});