outcome-cli 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +261 -0
- package/package.json +95 -0
- package/src/agents/README.md +139 -0
- package/src/agents/adapters/anthropic.adapter.ts +166 -0
- package/src/agents/adapters/dalle.adapter.ts +145 -0
- package/src/agents/adapters/gemini.adapter.ts +134 -0
- package/src/agents/adapters/imagen.adapter.ts +106 -0
- package/src/agents/adapters/nano-banana.adapter.ts +129 -0
- package/src/agents/adapters/openai.adapter.ts +165 -0
- package/src/agents/adapters/veo.adapter.ts +130 -0
- package/src/agents/agent.schema.property.test.ts +379 -0
- package/src/agents/agent.schema.test.ts +148 -0
- package/src/agents/agent.schema.ts +263 -0
- package/src/agents/index.ts +60 -0
- package/src/agents/registered-agent.schema.ts +356 -0
- package/src/agents/registry.ts +97 -0
- package/src/agents/tournament-configs.property.test.ts +266 -0
- package/src/cli/README.md +145 -0
- package/src/cli/commands/define.ts +79 -0
- package/src/cli/commands/list.ts +46 -0
- package/src/cli/commands/logs.ts +83 -0
- package/src/cli/commands/run.ts +416 -0
- package/src/cli/commands/verify.ts +110 -0
- package/src/cli/index.ts +81 -0
- package/src/config/README.md +128 -0
- package/src/config/env.ts +262 -0
- package/src/config/index.ts +19 -0
- package/src/eval/README.md +318 -0
- package/src/eval/ai-judge.test.ts +435 -0
- package/src/eval/ai-judge.ts +368 -0
- package/src/eval/code-validators.ts +414 -0
- package/src/eval/evaluateOutcome.property.test.ts +1174 -0
- package/src/eval/evaluateOutcome.ts +591 -0
- package/src/eval/immigration-validators.ts +122 -0
- package/src/eval/index.ts +90 -0
- package/src/eval/judge-cache.ts +402 -0
- package/src/eval/tournament-validators.property.test.ts +439 -0
- package/src/eval/validators.property.test.ts +1118 -0
- package/src/eval/validators.ts +1199 -0
- package/src/eval/weighted-scorer.ts +285 -0
- package/src/index.ts +17 -0
- package/src/league/README.md +188 -0
- package/src/league/health-check.ts +353 -0
- package/src/league/index.ts +93 -0
- package/src/league/killAgent.ts +151 -0
- package/src/league/league.test.ts +1151 -0
- package/src/league/runLeague.ts +843 -0
- package/src/league/scoreAgent.ts +175 -0
- package/src/modules/omnibridge/__tests__/.gitkeep +1 -0
- package/src/modules/omnibridge/__tests__/auth-tunnel.property.test.ts +524 -0
- package/src/modules/omnibridge/__tests__/deterministic-logger.property.test.ts +965 -0
- package/src/modules/omnibridge/__tests__/ghost-api.property.test.ts +461 -0
- package/src/modules/omnibridge/__tests__/omnibridge-integration.test.ts +542 -0
- package/src/modules/omnibridge/__tests__/parallel-executor.property.test.ts +671 -0
- package/src/modules/omnibridge/__tests__/semantic-normalizer.property.test.ts +521 -0
- package/src/modules/omnibridge/__tests__/semantic-normalizer.test.ts +254 -0
- package/src/modules/omnibridge/__tests__/session-vault.property.test.ts +367 -0
- package/src/modules/omnibridge/__tests__/shadow-session.property.test.ts +523 -0
- package/src/modules/omnibridge/__tests__/triangulation-engine.property.test.ts +292 -0
- package/src/modules/omnibridge/__tests__/verification-engine.property.test.ts +769 -0
- package/src/modules/omnibridge/api/.gitkeep +1 -0
- package/src/modules/omnibridge/api/ghost-api.ts +1087 -0
- package/src/modules/omnibridge/auth/.gitkeep +1 -0
- package/src/modules/omnibridge/auth/auth-tunnel.ts +843 -0
- package/src/modules/omnibridge/auth/session-vault.ts +577 -0
- package/src/modules/omnibridge/core/.gitkeep +1 -0
- package/src/modules/omnibridge/core/semantic-normalizer.ts +702 -0
- package/src/modules/omnibridge/core/triangulation-engine.ts +530 -0
- package/src/modules/omnibridge/core/types.ts +610 -0
- package/src/modules/omnibridge/execution/.gitkeep +1 -0
- package/src/modules/omnibridge/execution/deterministic-logger.ts +629 -0
- package/src/modules/omnibridge/execution/parallel-executor.ts +542 -0
- package/src/modules/omnibridge/execution/shadow-session.ts +794 -0
- package/src/modules/omnibridge/index.ts +212 -0
- package/src/modules/omnibridge/omnibridge.ts +510 -0
- package/src/modules/omnibridge/verification/.gitkeep +1 -0
- package/src/modules/omnibridge/verification/verification-engine.ts +783 -0
- package/src/outcomes/README.md +75 -0
- package/src/outcomes/acquire-pilot-customer.ts +297 -0
- package/src/outcomes/code-delivery-outcomes.ts +89 -0
- package/src/outcomes/code-outcomes.ts +256 -0
- package/src/outcomes/code_review_battle.test.ts +135 -0
- package/src/outcomes/code_review_battle.ts +135 -0
- package/src/outcomes/cold_email_battle.ts +97 -0
- package/src/outcomes/content_creation_battle.ts +160 -0
- package/src/outcomes/f1_stem_opt_compliance.ts +61 -0
- package/src/outcomes/index.ts +107 -0
- package/src/outcomes/lead_gen_battle.test.ts +113 -0
- package/src/outcomes/lead_gen_battle.ts +99 -0
- package/src/outcomes/outcome.schema.property.test.ts +229 -0
- package/src/outcomes/outcome.schema.ts +187 -0
- package/src/outcomes/qualified_sales_interest.ts +118 -0
- package/src/outcomes/swarm_planner.property.test.ts +370 -0
- package/src/outcomes/swarm_planner.ts +96 -0
- package/src/outcomes/web_extraction.ts +234 -0
- package/src/runtime/README.md +220 -0
- package/src/runtime/agentRunner.test.ts +341 -0
- package/src/runtime/agentRunner.ts +746 -0
- package/src/runtime/claudeAdapter.ts +232 -0
- package/src/runtime/costTracker.ts +123 -0
- package/src/runtime/index.ts +34 -0
- package/src/runtime/modelAdapter.property.test.ts +305 -0
- package/src/runtime/modelAdapter.ts +144 -0
- package/src/runtime/openaiAdapter.ts +235 -0
- package/src/utils/README.md +122 -0
- package/src/utils/command-runner.ts +134 -0
- package/src/utils/cost-guard.ts +379 -0
- package/src/utils/errors.test.ts +290 -0
- package/src/utils/errors.ts +442 -0
- package/src/utils/index.ts +37 -0
- package/src/utils/logger.test.ts +361 -0
- package/src/utils/logger.ts +419 -0
- package/src/utils/output-parsers.ts +216 -0
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Claude Adapter - Anthropic Claude API integration
|
|
3
|
+
*
|
|
4
|
+
* Implements the ModelAdapter interface for Claude models.
|
|
5
|
+
* Uses the official @anthropic-ai/sdk package following Messages API.
|
|
6
|
+
*
|
|
7
|
+
* @module runtime/claudeAdapter
|
|
8
|
+
* @see https://docs.anthropic.com/en/api/getting-started
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import Anthropic from '@anthropic-ai/sdk';
|
|
12
|
+
import type {
|
|
13
|
+
ModelAdapter,
|
|
14
|
+
ModelOptions,
|
|
15
|
+
ModelResponse,
|
|
16
|
+
ConversationMessage,
|
|
17
|
+
ToolDefinition,
|
|
18
|
+
ToolCall,
|
|
19
|
+
} from './modelAdapter.js';
|
|
20
|
+
|
|
21
|
+
/**
|
|
22
|
+
* Default max tokens for Claude responses.
|
|
23
|
+
*/
|
|
24
|
+
const DEFAULT_MAX_TOKENS = 1024;
|
|
25
|
+
|
|
26
|
+
/**
|
|
27
|
+
* Approximate characters per token for estimation.
|
|
28
|
+
* Claude uses ~4 characters per token on average.
|
|
29
|
+
*/
|
|
30
|
+
const CHARS_PER_TOKEN = 4;
|
|
31
|
+
|
|
32
|
+
/**
|
|
33
|
+
* Converts our tool definitions to Claude's tool format.
|
|
34
|
+
*
|
|
35
|
+
* @param tools - Our normalized tool definitions
|
|
36
|
+
* @returns Claude-formatted tools
|
|
37
|
+
*/
|
|
38
|
+
function toClaudeTools(tools: ToolDefinition[]): Anthropic.Tool[] {
|
|
39
|
+
return tools.map((tool) => ({
|
|
40
|
+
name: tool.name,
|
|
41
|
+
description: tool.description,
|
|
42
|
+
input_schema: {
|
|
43
|
+
type: 'object' as const,
|
|
44
|
+
properties: tool.inputSchema.properties,
|
|
45
|
+
required: tool.inputSchema.required,
|
|
46
|
+
},
|
|
47
|
+
}));
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
/**
|
|
51
|
+
* Converts our conversation messages to Claude's message format.
|
|
52
|
+
*
|
|
53
|
+
* @param messages - Our normalized conversation messages
|
|
54
|
+
* @returns Claude-formatted messages
|
|
55
|
+
*/
|
|
56
|
+
function toClaudeMessages(messages: ConversationMessage[]): Anthropic.MessageParam[] {
|
|
57
|
+
const claudeMessages: Anthropic.MessageParam[] = [];
|
|
58
|
+
|
|
59
|
+
for (const msg of messages) {
|
|
60
|
+
if (msg.role === 'user') {
|
|
61
|
+
claudeMessages.push({ role: 'user', content: msg.content });
|
|
62
|
+
} else if (msg.role === 'assistant') {
|
|
63
|
+
// Assistant message with potential tool calls
|
|
64
|
+
const content: Anthropic.ContentBlockParam[] = [];
|
|
65
|
+
if (msg.content) {
|
|
66
|
+
content.push({ type: 'text', text: msg.content });
|
|
67
|
+
}
|
|
68
|
+
if (msg.toolCalls) {
|
|
69
|
+
for (const tc of msg.toolCalls) {
|
|
70
|
+
content.push({
|
|
71
|
+
type: 'tool_use',
|
|
72
|
+
id: tc.id,
|
|
73
|
+
name: tc.name,
|
|
74
|
+
input: tc.arguments,
|
|
75
|
+
});
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
claudeMessages.push({ role: 'assistant', content });
|
|
79
|
+
} else if (msg.role === 'tool') {
|
|
80
|
+
// Tool result - Claude expects this as a user message with tool_result content
|
|
81
|
+
claudeMessages.push({
|
|
82
|
+
role: 'user',
|
|
83
|
+
content: [
|
|
84
|
+
{
|
|
85
|
+
type: 'tool_result',
|
|
86
|
+
tool_use_id: msg.toolCallId!,
|
|
87
|
+
content: msg.content,
|
|
88
|
+
},
|
|
89
|
+
],
|
|
90
|
+
});
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
return claudeMessages;
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
/**
|
|
98
|
+
* Extracts tool calls from Claude's response content.
|
|
99
|
+
*
|
|
100
|
+
* @param content - Claude response content blocks
|
|
101
|
+
* @returns Array of tool calls
|
|
102
|
+
*/
|
|
103
|
+
function extractToolCalls(content: Anthropic.ContentBlock[]): ToolCall[] {
|
|
104
|
+
const toolCalls: ToolCall[] = [];
|
|
105
|
+
|
|
106
|
+
for (const block of content) {
|
|
107
|
+
if (block.type === 'tool_use') {
|
|
108
|
+
toolCalls.push({
|
|
109
|
+
id: block.id,
|
|
110
|
+
name: block.name,
|
|
111
|
+
arguments: block.input as Record<string, unknown>,
|
|
112
|
+
});
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
return toolCalls;
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
/**
|
|
120
|
+
* Creates a Claude model adapter.
|
|
121
|
+
*
|
|
122
|
+
* @param apiKey - Anthropic API key
|
|
123
|
+
* @param modelId - Claude model ID (e.g., 'claude-3-sonnet-20240229')
|
|
124
|
+
* @returns ModelAdapter implementation for Claude
|
|
125
|
+
*
|
|
126
|
+
* @example
|
|
127
|
+
* const adapter = createClaudeAdapter(process.env.ANTHROPIC_API_KEY!, 'claude-3-sonnet-20240229');
|
|
128
|
+
* const response = await adapter.complete('Hello, Claude');
|
|
129
|
+
*
|
|
130
|
+
* @see Requirements 11.1, 11.2, 11.4
|
|
131
|
+
*/
|
|
132
|
+
export function createClaudeAdapter(apiKey: string, modelId: string): ModelAdapter {
|
|
133
|
+
const client = new Anthropic({ apiKey });
|
|
134
|
+
|
|
135
|
+
return {
|
|
136
|
+
provider: 'claude',
|
|
137
|
+
modelId,
|
|
138
|
+
|
|
139
|
+
async complete(prompt: string, options?: ModelOptions): Promise<ModelResponse> {
|
|
140
|
+
const messages: Anthropic.MessageParam[] = [
|
|
141
|
+
{ role: 'user', content: prompt },
|
|
142
|
+
];
|
|
143
|
+
|
|
144
|
+
const requestParams: Anthropic.MessageCreateParams = {
|
|
145
|
+
model: modelId,
|
|
146
|
+
max_tokens: options?.maxTokens ?? DEFAULT_MAX_TOKENS,
|
|
147
|
+
...(options?.temperature !== undefined && { temperature: options.temperature }),
|
|
148
|
+
...(options?.systemPrompt && { system: options.systemPrompt }),
|
|
149
|
+
messages,
|
|
150
|
+
};
|
|
151
|
+
|
|
152
|
+
// Add tools if provided
|
|
153
|
+
if (options?.tools && options.tools.length > 0) {
|
|
154
|
+
requestParams.tools = toClaudeTools(options.tools);
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
const message = await client.messages.create(requestParams);
|
|
158
|
+
|
|
159
|
+
// Extract text content from response
|
|
160
|
+
let content = '';
|
|
161
|
+
for (const block of message.content) {
|
|
162
|
+
if (block.type === 'text') {
|
|
163
|
+
content += block.text;
|
|
164
|
+
}
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
// Extract tool calls if any
|
|
168
|
+
const toolCalls = extractToolCalls(message.content);
|
|
169
|
+
|
|
170
|
+
// Calculate total tokens used (input + output)
|
|
171
|
+
const tokensUsed = message.usage.input_tokens + message.usage.output_tokens;
|
|
172
|
+
|
|
173
|
+
return {
|
|
174
|
+
content,
|
|
175
|
+
tokensUsed,
|
|
176
|
+
model: message.model,
|
|
177
|
+
toolCalls: toolCalls.length > 0 ? toolCalls : undefined,
|
|
178
|
+
requiresToolResponse: message.stop_reason === 'tool_use',
|
|
179
|
+
};
|
|
180
|
+
},
|
|
181
|
+
|
|
182
|
+
async continueWithToolResults(
|
|
183
|
+
messages: ConversationMessage[],
|
|
184
|
+
options?: ModelOptions
|
|
185
|
+
): Promise<ModelResponse> {
|
|
186
|
+
const claudeMessages = toClaudeMessages(messages);
|
|
187
|
+
|
|
188
|
+
const requestParams: Anthropic.MessageCreateParams = {
|
|
189
|
+
model: modelId,
|
|
190
|
+
max_tokens: options?.maxTokens ?? DEFAULT_MAX_TOKENS,
|
|
191
|
+
...(options?.temperature !== undefined && { temperature: options.temperature }),
|
|
192
|
+
...(options?.systemPrompt && { system: options.systemPrompt }),
|
|
193
|
+
messages: claudeMessages,
|
|
194
|
+
};
|
|
195
|
+
|
|
196
|
+
// Add tools if provided
|
|
197
|
+
if (options?.tools && options.tools.length > 0) {
|
|
198
|
+
requestParams.tools = toClaudeTools(options.tools);
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
const message = await client.messages.create(requestParams);
|
|
202
|
+
|
|
203
|
+
// Extract text content from response
|
|
204
|
+
let content = '';
|
|
205
|
+
for (const block of message.content) {
|
|
206
|
+
if (block.type === 'text') {
|
|
207
|
+
content += block.text;
|
|
208
|
+
}
|
|
209
|
+
}
|
|
210
|
+
|
|
211
|
+
// Extract tool calls if any
|
|
212
|
+
const toolCalls = extractToolCalls(message.content);
|
|
213
|
+
|
|
214
|
+
// Calculate total tokens used (input + output)
|
|
215
|
+
const tokensUsed = message.usage.input_tokens + message.usage.output_tokens;
|
|
216
|
+
|
|
217
|
+
return {
|
|
218
|
+
content,
|
|
219
|
+
tokensUsed,
|
|
220
|
+
model: message.model,
|
|
221
|
+
toolCalls: toolCalls.length > 0 ? toolCalls : undefined,
|
|
222
|
+
requiresToolResponse: message.stop_reason === 'tool_use',
|
|
223
|
+
};
|
|
224
|
+
},
|
|
225
|
+
|
|
226
|
+
countTokens(text: string): number {
|
|
227
|
+
// Approximate token count based on character length
|
|
228
|
+
// Claude uses ~4 characters per token on average
|
|
229
|
+
return Math.ceil(text.length / CHARS_PER_TOKEN);
|
|
230
|
+
},
|
|
231
|
+
};
|
|
232
|
+
}
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Cost Tracker - Real-time token and cost tracking per agent
|
|
3
|
+
*
|
|
4
|
+
* Tracks tokens spent per agent in real-time and enforces cost ceilings.
|
|
5
|
+
* Used by the league system to monitor and terminate agents that exceed limits.
|
|
6
|
+
*
|
|
7
|
+
* @module runtime/costTracker
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
/**
|
|
11
|
+
* Default cost per token in USD (approximate for Claude/OpenAI models).
|
|
12
|
+
* This is a simplified rate - actual costs vary by model and token type.
|
|
13
|
+
*/
|
|
14
|
+
const DEFAULT_COST_PER_TOKEN_USD = 0.00001;
|
|
15
|
+
|
|
16
|
+
/**
|
|
17
|
+
* Represents a cost tracker for a single agent.
|
|
18
|
+
* Tracks tokens spent and calculates cost in real-time.
|
|
19
|
+
*/
|
|
20
|
+
export interface CostTracker {
|
|
21
|
+
/** Unique identifier for the agent being tracked */
|
|
22
|
+
agentId: string;
|
|
23
|
+
/** Total tokens spent by this agent */
|
|
24
|
+
tokensSpent: number;
|
|
25
|
+
/** Estimated cost in USD based on tokens spent */
|
|
26
|
+
costUsd: number;
|
|
27
|
+
/** Maximum token ceiling for this agent */
|
|
28
|
+
ceiling: number;
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
/**
|
|
32
|
+
* Creates a new cost tracker for an agent.
|
|
33
|
+
*
|
|
34
|
+
* @param agentId - Unique identifier for the agent
|
|
35
|
+
* @param ceiling - Maximum token ceiling for this agent
|
|
36
|
+
* @returns A new CostTracker initialized with zero usage
|
|
37
|
+
*
|
|
38
|
+
* @example
|
|
39
|
+
* const tracker = createCostTracker('agent-1', 10000);
|
|
40
|
+
* // { agentId: 'agent-1', tokensSpent: 0, costUsd: 0, ceiling: 10000 }
|
|
41
|
+
*
|
|
42
|
+
* @see Requirements 10.1, 10.5
|
|
43
|
+
*/
|
|
44
|
+
export function createCostTracker(agentId: string, ceiling: number): CostTracker {
|
|
45
|
+
return {
|
|
46
|
+
agentId,
|
|
47
|
+
tokensSpent: 0,
|
|
48
|
+
costUsd: 0,
|
|
49
|
+
ceiling,
|
|
50
|
+
};
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
/**
|
|
54
|
+
* Records token usage for an agent.
|
|
55
|
+
*
|
|
56
|
+
* Updates the tracker's tokensSpent and costUsd fields in place.
|
|
57
|
+
* Cost is calculated using a default rate per token.
|
|
58
|
+
*
|
|
59
|
+
* @param tracker - The cost tracker to update
|
|
60
|
+
* @param tokens - Number of tokens to record
|
|
61
|
+
*
|
|
62
|
+
* @example
|
|
63
|
+
* const tracker = createCostTracker('agent-1', 10000);
|
|
64
|
+
* recordUsage(tracker, 500);
|
|
65
|
+
* // tracker.tokensSpent === 500
|
|
66
|
+
* // tracker.costUsd === 0.005
|
|
67
|
+
*
|
|
68
|
+
* @see Requirements 10.5
|
|
69
|
+
*/
|
|
70
|
+
export function recordUsage(tracker: CostTracker, tokens: number): void {
|
|
71
|
+
tracker.tokensSpent += tokens;
|
|
72
|
+
tracker.costUsd = tracker.tokensSpent * DEFAULT_COST_PER_TOKEN_USD;
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
/**
|
|
76
|
+
* Checks if an agent has exceeded its token ceiling.
|
|
77
|
+
*
|
|
78
|
+
* @param tracker - The cost tracker to check
|
|
79
|
+
* @returns True if tokensSpent exceeds ceiling
|
|
80
|
+
*
|
|
81
|
+
* @example
|
|
82
|
+
* const tracker = createCostTracker('agent-1', 1000);
|
|
83
|
+
* recordUsage(tracker, 1001);
|
|
84
|
+
* isOverBudget(tracker); // true
|
|
85
|
+
*
|
|
86
|
+
* @see Requirements 10.1
|
|
87
|
+
*/
|
|
88
|
+
export function isOverBudget(tracker: CostTracker): boolean {
|
|
89
|
+
return tracker.tokensSpent > tracker.ceiling;
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
/**
|
|
93
|
+
* Gets the remaining token budget for an agent.
|
|
94
|
+
*
|
|
95
|
+
* @param tracker - The cost tracker to check
|
|
96
|
+
* @returns Number of tokens remaining (can be negative if over budget)
|
|
97
|
+
*
|
|
98
|
+
* @example
|
|
99
|
+
* const tracker = createCostTracker('agent-1', 10000);
|
|
100
|
+
* recordUsage(tracker, 3000);
|
|
101
|
+
* getRemainingBudget(tracker); // 7000
|
|
102
|
+
*/
|
|
103
|
+
export function getRemainingBudget(tracker: CostTracker): number {
|
|
104
|
+
return tracker.ceiling - tracker.tokensSpent;
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
/**
|
|
108
|
+
* Gets the percentage of budget used.
|
|
109
|
+
*
|
|
110
|
+
* @param tracker - The cost tracker to check
|
|
111
|
+
* @returns Percentage of budget used (0-100+, can exceed 100 if over budget)
|
|
112
|
+
*
|
|
113
|
+
* @example
|
|
114
|
+
* const tracker = createCostTracker('agent-1', 10000);
|
|
115
|
+
* recordUsage(tracker, 5000);
|
|
116
|
+
* getBudgetUsagePercent(tracker); // 50
|
|
117
|
+
*/
|
|
118
|
+
export function getBudgetUsagePercent(tracker: CostTracker): number {
|
|
119
|
+
if (tracker.ceiling === 0) {
|
|
120
|
+
return tracker.tokensSpent > 0 ? 100 : 0;
|
|
121
|
+
}
|
|
122
|
+
return (tracker.tokensSpent / tracker.ceiling) * 100;
|
|
123
|
+
}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Runtime Module - Agent execution, model adapters, and cost tracking
|
|
3
|
+
*
|
|
4
|
+
* @module runtime
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
export {
|
|
8
|
+
CostTracker,
|
|
9
|
+
createCostTracker,
|
|
10
|
+
recordUsage,
|
|
11
|
+
isOverBudget,
|
|
12
|
+
getRemainingBudget,
|
|
13
|
+
getBudgetUsagePercent,
|
|
14
|
+
} from './costTracker.js';
|
|
15
|
+
|
|
16
|
+
export {
|
|
17
|
+
ModelAdapter,
|
|
18
|
+
ModelOptions,
|
|
19
|
+
ModelResponse,
|
|
20
|
+
ModelAdapterConfig,
|
|
21
|
+
createAdapter,
|
|
22
|
+
} from './modelAdapter.js';
|
|
23
|
+
|
|
24
|
+
export { createClaudeAdapter } from './claudeAdapter.js';
|
|
25
|
+
export { createOpenAIAdapter } from './openaiAdapter.js';
|
|
26
|
+
|
|
27
|
+
export {
|
|
28
|
+
type KillReason,
|
|
29
|
+
type AgentRunStatus,
|
|
30
|
+
type AgentRun,
|
|
31
|
+
type AgentRunConfig,
|
|
32
|
+
runAgent,
|
|
33
|
+
runAgentMock,
|
|
34
|
+
} from './agentRunner.js';
|
|
@@ -0,0 +1,305 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Property-based tests for Model Response Normalization
|
|
3
|
+
*
|
|
4
|
+
* **Feature: earnd-bounty-engine, Property 24: Model Response Normalization**
|
|
5
|
+
* **Validates: Requirements 11.4**
|
|
6
|
+
*
|
|
7
|
+
* Property 24: Model Response Normalization
|
|
8
|
+
* *For any* model response from Claude or OpenAI, the adapter SHALL return
|
|
9
|
+
* a normalized ModelResponse with content, tokensUsed, and model fields.
|
|
10
|
+
*
|
|
11
|
+
* Since we cannot call actual APIs in tests, we test the normalization logic
|
|
12
|
+
* by verifying that:
|
|
13
|
+
* 1. ModelResponse interface structure is correct
|
|
14
|
+
* 2. Adapters expose correct provider and modelId
|
|
15
|
+
* 3. countTokens function returns consistent results
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
import { describe, test, expect } from 'vitest';
|
|
19
|
+
import * as fc from 'fast-check';
|
|
20
|
+
import type { ModelResponse, ModelAdapter } from './modelAdapter.js';
|
|
21
|
+
|
|
22
|
+
/**
|
|
23
|
+
* Arbitrary for generating valid ModelResponse objects.
|
|
24
|
+
* This represents what a normalized response should look like.
|
|
25
|
+
*/
|
|
26
|
+
const modelResponseArb = fc.record({
|
|
27
|
+
content: fc.string(),
|
|
28
|
+
tokensUsed: fc.integer({ min: 0, max: 1000000 }),
|
|
29
|
+
model: fc.string({ minLength: 1 }),
|
|
30
|
+
});
|
|
31
|
+
|
|
32
|
+
/**
|
|
33
|
+
* Arbitrary for generating provider types.
|
|
34
|
+
*/
|
|
35
|
+
const providerArb = fc.constantFrom('claude', 'openai') as fc.Arbitrary<'claude' | 'openai'>;
|
|
36
|
+
|
|
37
|
+
/**
|
|
38
|
+
* Arbitrary for generating model IDs.
|
|
39
|
+
*/
|
|
40
|
+
const modelIdArb = fc.constantFrom(
|
|
41
|
+
'claude-3-sonnet-20240229',
|
|
42
|
+
'claude-3-opus-20240229',
|
|
43
|
+
'claude-3-haiku-20240307',
|
|
44
|
+
'gpt-4-turbo-preview',
|
|
45
|
+
'gpt-4',
|
|
46
|
+
'gpt-3.5-turbo'
|
|
47
|
+
);
|
|
48
|
+
|
|
49
|
+
describe('Model Response Normalization - Property Tests', () => {
|
|
50
|
+
// **Feature: earnd-bounty-engine, Property 24: Model Response Normalization**
|
|
51
|
+
test('ModelResponse has required fields with correct types', () => {
|
|
52
|
+
fc.assert(
|
|
53
|
+
fc.property(modelResponseArb, (response) => {
|
|
54
|
+
// Verify content is a string
|
|
55
|
+
expect(typeof response.content).toBe('string');
|
|
56
|
+
|
|
57
|
+
// Verify tokensUsed is a non-negative number
|
|
58
|
+
expect(typeof response.tokensUsed).toBe('number');
|
|
59
|
+
expect(response.tokensUsed).toBeGreaterThanOrEqual(0);
|
|
60
|
+
|
|
61
|
+
// Verify model is a non-empty string
|
|
62
|
+
expect(typeof response.model).toBe('string');
|
|
63
|
+
expect(response.model.length).toBeGreaterThan(0);
|
|
64
|
+
}),
|
|
65
|
+
{ numRuns: 100 }
|
|
66
|
+
);
|
|
67
|
+
});
|
|
68
|
+
|
|
69
|
+
// **Feature: earnd-bounty-engine, Property 24: Model Response Normalization**
|
|
70
|
+
test('ModelResponse structure is consistent across all generated responses', () => {
|
|
71
|
+
fc.assert(
|
|
72
|
+
fc.property(modelResponseArb, (response) => {
|
|
73
|
+
// All required keys must be present
|
|
74
|
+
const requiredKeys = ['content', 'tokensUsed', 'model'];
|
|
75
|
+
const responseKeys = Object.keys(response);
|
|
76
|
+
|
|
77
|
+
for (const key of requiredKeys) {
|
|
78
|
+
expect(responseKeys).toContain(key);
|
|
79
|
+
}
|
|
80
|
+
}),
|
|
81
|
+
{ numRuns: 100 }
|
|
82
|
+
);
|
|
83
|
+
});
|
|
84
|
+
|
|
85
|
+
// **Feature: earnd-bounty-engine, Property 24: Model Response Normalization**
|
|
86
|
+
test('tokensUsed is always a non-negative integer', () => {
|
|
87
|
+
fc.assert(
|
|
88
|
+
fc.property(
|
|
89
|
+
fc.integer({ min: 0, max: 1000000 }),
|
|
90
|
+
(tokens) => {
|
|
91
|
+
// Simulating what the adapter should return
|
|
92
|
+
const response: ModelResponse = {
|
|
93
|
+
content: 'test content',
|
|
94
|
+
tokensUsed: tokens,
|
|
95
|
+
model: 'test-model',
|
|
96
|
+
};
|
|
97
|
+
|
|
98
|
+
expect(Number.isInteger(response.tokensUsed)).toBe(true);
|
|
99
|
+
expect(response.tokensUsed).toBeGreaterThanOrEqual(0);
|
|
100
|
+
}
|
|
101
|
+
),
|
|
102
|
+
{ numRuns: 100 }
|
|
103
|
+
);
|
|
104
|
+
});
|
|
105
|
+
|
|
106
|
+
// **Feature: earnd-bounty-engine, Property 24: Model Response Normalization**
|
|
107
|
+
test('content can be empty string but must be defined', () => {
|
|
108
|
+
fc.assert(
|
|
109
|
+
fc.property(fc.string(), (content) => {
|
|
110
|
+
const response: ModelResponse = {
|
|
111
|
+
content,
|
|
112
|
+
tokensUsed: 100,
|
|
113
|
+
model: 'test-model',
|
|
114
|
+
};
|
|
115
|
+
|
|
116
|
+
expect(response.content).toBeDefined();
|
|
117
|
+
expect(typeof response.content).toBe('string');
|
|
118
|
+
}),
|
|
119
|
+
{ numRuns: 100 }
|
|
120
|
+
);
|
|
121
|
+
});
|
|
122
|
+
|
|
123
|
+
// **Feature: earnd-bounty-engine, Property 24: Model Response Normalization**
|
|
124
|
+
test('model field preserves the model identifier', () => {
|
|
125
|
+
fc.assert(
|
|
126
|
+
fc.property(modelIdArb, (modelId) => {
|
|
127
|
+
const response: ModelResponse = {
|
|
128
|
+
content: 'test',
|
|
129
|
+
tokensUsed: 50,
|
|
130
|
+
model: modelId,
|
|
131
|
+
};
|
|
132
|
+
|
|
133
|
+
expect(response.model).toBe(modelId);
|
|
134
|
+
expect(response.model.length).toBeGreaterThan(0);
|
|
135
|
+
}),
|
|
136
|
+
{ numRuns: 100 }
|
|
137
|
+
);
|
|
138
|
+
});
|
|
139
|
+
});
|
|
140
|
+
|
|
141
|
+
describe('Model Adapter Interface - Property Tests', () => {
|
|
142
|
+
// **Feature: earnd-bounty-engine, Property 24: Model Response Normalization**
|
|
143
|
+
test('countTokens returns consistent results for same input', () => {
|
|
144
|
+
// Test the token counting approximation logic used by both adapters
|
|
145
|
+
const CHARS_PER_TOKEN = 4;
|
|
146
|
+
|
|
147
|
+
const countTokens = (text: string): number => {
|
|
148
|
+
return Math.ceil(text.length / CHARS_PER_TOKEN);
|
|
149
|
+
};
|
|
150
|
+
|
|
151
|
+
fc.assert(
|
|
152
|
+
fc.property(fc.string(), (text) => {
|
|
153
|
+
const count1 = countTokens(text);
|
|
154
|
+
const count2 = countTokens(text);
|
|
155
|
+
|
|
156
|
+
// Same input should always produce same output (determinism)
|
|
157
|
+
expect(count1).toBe(count2);
|
|
158
|
+
|
|
159
|
+
// Token count should be non-negative
|
|
160
|
+
expect(count1).toBeGreaterThanOrEqual(0);
|
|
161
|
+
|
|
162
|
+
// Token count should be an integer
|
|
163
|
+
expect(Number.isInteger(count1)).toBe(true);
|
|
164
|
+
}),
|
|
165
|
+
{ numRuns: 100 }
|
|
166
|
+
);
|
|
167
|
+
});
|
|
168
|
+
|
|
169
|
+
// **Feature: earnd-bounty-engine, Property 24: Model Response Normalization**
|
|
170
|
+
test('countTokens scales with text length', () => {
|
|
171
|
+
const CHARS_PER_TOKEN = 4;
|
|
172
|
+
|
|
173
|
+
const countTokens = (text: string): number => {
|
|
174
|
+
return Math.ceil(text.length / CHARS_PER_TOKEN);
|
|
175
|
+
};
|
|
176
|
+
|
|
177
|
+
fc.assert(
|
|
178
|
+
fc.property(
|
|
179
|
+
fc.string({ minLength: 0, maxLength: 100 }),
|
|
180
|
+
fc.string({ minLength: 0, maxLength: 100 }),
|
|
181
|
+
(text1, text2) => {
|
|
182
|
+
const count1 = countTokens(text1);
|
|
183
|
+
const count2 = countTokens(text2);
|
|
184
|
+
|
|
185
|
+
// Longer text should have >= token count
|
|
186
|
+
if (text1.length > text2.length) {
|
|
187
|
+
expect(count1).toBeGreaterThanOrEqual(count2);
|
|
188
|
+
} else if (text2.length > text1.length) {
|
|
189
|
+
expect(count2).toBeGreaterThanOrEqual(count1);
|
|
190
|
+
}
|
|
191
|
+
}
|
|
192
|
+
),
|
|
193
|
+
{ numRuns: 100 }
|
|
194
|
+
);
|
|
195
|
+
});
|
|
196
|
+
|
|
197
|
+
// **Feature: earnd-bounty-engine, Property 24: Model Response Normalization**
|
|
198
|
+
test('empty text returns zero or minimal tokens', () => {
|
|
199
|
+
const CHARS_PER_TOKEN = 4;
|
|
200
|
+
|
|
201
|
+
const countTokens = (text: string): number => {
|
|
202
|
+
return Math.ceil(text.length / CHARS_PER_TOKEN);
|
|
203
|
+
};
|
|
204
|
+
|
|
205
|
+
const count = countTokens('');
|
|
206
|
+
expect(count).toBe(0);
|
|
207
|
+
});
|
|
208
|
+
|
|
209
|
+
// **Feature: earnd-bounty-engine, Property 24: Model Response Normalization**
|
|
210
|
+
test('provider field is valid enum value', () => {
|
|
211
|
+
fc.assert(
|
|
212
|
+
fc.property(providerArb, (provider) => {
|
|
213
|
+
expect(['claude', 'openai']).toContain(provider);
|
|
214
|
+
}),
|
|
215
|
+
{ numRuns: 100 }
|
|
216
|
+
);
|
|
217
|
+
});
|
|
218
|
+
});
|
|
219
|
+
|
|
220
|
+
describe('Response Normalization Invariants - Property Tests', () => {
|
|
221
|
+
// **Feature: earnd-bounty-engine, Property 24: Model Response Normalization**
|
|
222
|
+
test('normalized response preserves content integrity', () => {
|
|
223
|
+
fc.assert(
|
|
224
|
+
fc.property(fc.string(), (originalContent) => {
|
|
225
|
+
// Simulate normalization: content should be preserved exactly
|
|
226
|
+
const response: ModelResponse = {
|
|
227
|
+
content: originalContent,
|
|
228
|
+
tokensUsed: 100,
|
|
229
|
+
model: 'test-model',
|
|
230
|
+
};
|
|
231
|
+
|
|
232
|
+
expect(response.content).toBe(originalContent);
|
|
233
|
+
}),
|
|
234
|
+
{ numRuns: 100 }
|
|
235
|
+
);
|
|
236
|
+
});
|
|
237
|
+
|
|
238
|
+
// **Feature: earnd-bounty-engine, Property 24: Model Response Normalization**
|
|
239
|
+
test('tokensUsed reflects combined input and output tokens', () => {
|
|
240
|
+
fc.assert(
|
|
241
|
+
fc.property(
|
|
242
|
+
fc.integer({ min: 0, max: 500000 }),
|
|
243
|
+
fc.integer({ min: 0, max: 500000 }),
|
|
244
|
+
(inputTokens, outputTokens) => {
|
|
245
|
+
// Simulate how adapters calculate total tokens
|
|
246
|
+
const totalTokens = inputTokens + outputTokens;
|
|
247
|
+
|
|
248
|
+
const response: ModelResponse = {
|
|
249
|
+
content: 'test',
|
|
250
|
+
tokensUsed: totalTokens,
|
|
251
|
+
model: 'test-model',
|
|
252
|
+
};
|
|
253
|
+
|
|
254
|
+
expect(response.tokensUsed).toBe(inputTokens + outputTokens);
|
|
255
|
+
expect(response.tokensUsed).toBeGreaterThanOrEqual(0);
|
|
256
|
+
}
|
|
257
|
+
),
|
|
258
|
+
{ numRuns: 100 }
|
|
259
|
+
);
|
|
260
|
+
});
|
|
261
|
+
|
|
262
|
+
// **Feature: earnd-bounty-engine, Property 24: Model Response Normalization**
|
|
263
|
+
test('model field matches the configured model ID', () => {
|
|
264
|
+
fc.assert(
|
|
265
|
+
fc.property(modelIdArb, (configuredModelId) => {
|
|
266
|
+
// The response model should match what was configured
|
|
267
|
+
const response: ModelResponse = {
|
|
268
|
+
content: 'test',
|
|
269
|
+
tokensUsed: 100,
|
|
270
|
+
model: configuredModelId,
|
|
271
|
+
};
|
|
272
|
+
|
|
273
|
+
expect(response.model).toBe(configuredModelId);
|
|
274
|
+
}),
|
|
275
|
+
{ numRuns: 100 }
|
|
276
|
+
);
|
|
277
|
+
});
|
|
278
|
+
|
|
279
|
+
// **Feature: earnd-bounty-engine, Property 24: Model Response Normalization**
|
|
280
|
+
test('response structure is identical regardless of provider', () => {
|
|
281
|
+
fc.assert(
|
|
282
|
+
fc.property(
|
|
283
|
+
providerArb,
|
|
284
|
+
fc.string(),
|
|
285
|
+
fc.integer({ min: 0, max: 100000 }),
|
|
286
|
+
modelIdArb,
|
|
287
|
+
(provider, content, tokens, modelId) => {
|
|
288
|
+
// Both Claude and OpenAI should produce the same structure
|
|
289
|
+
const response: ModelResponse = {
|
|
290
|
+
content,
|
|
291
|
+
tokensUsed: tokens,
|
|
292
|
+
model: modelId,
|
|
293
|
+
};
|
|
294
|
+
|
|
295
|
+
// Verify structure is consistent
|
|
296
|
+
expect(Object.keys(response).sort()).toEqual(['content', 'model', 'tokensUsed']);
|
|
297
|
+
expect(typeof response.content).toBe('string');
|
|
298
|
+
expect(typeof response.tokensUsed).toBe('number');
|
|
299
|
+
expect(typeof response.model).toBe('string');
|
|
300
|
+
}
|
|
301
|
+
),
|
|
302
|
+
{ numRuns: 100 }
|
|
303
|
+
);
|
|
304
|
+
});
|
|
305
|
+
});
|