olympus-ai 3.3.0 → 3.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +1 -1
- package/.claude-plugin/plugin.json +1 -1
- package/README.md +2 -0
- package/dist/__tests__/installer.test.js +1 -1
- package/dist/__tests__/learning/aggregation.test.d.ts +2 -0
- package/dist/__tests__/learning/aggregation.test.d.ts.map +1 -0
- package/dist/__tests__/learning/aggregation.test.js +282 -0
- package/dist/__tests__/learning/aggregation.test.js.map +1 -0
- package/dist/__tests__/learning/anomaly.test.d.ts +2 -0
- package/dist/__tests__/learning/anomaly.test.d.ts.map +1 -0
- package/dist/__tests__/learning/anomaly.test.js +107 -0
- package/dist/__tests__/learning/anomaly.test.js.map +1 -0
- package/dist/__tests__/learning/baselines.test.d.ts +2 -0
- package/dist/__tests__/learning/baselines.test.d.ts.map +1 -0
- package/dist/__tests__/learning/baselines.test.js +155 -0
- package/dist/__tests__/learning/baselines.test.js.map +1 -0
- package/dist/__tests__/learning/efficiency.test.d.ts +2 -0
- package/dist/__tests__/learning/efficiency.test.d.ts.map +1 -0
- package/dist/__tests__/learning/efficiency.test.js +94 -0
- package/dist/__tests__/learning/efficiency.test.js.map +1 -0
- package/dist/__tests__/learning/feedback-loop-injection.test.d.ts +6 -0
- package/dist/__tests__/learning/feedback-loop-injection.test.d.ts.map +1 -0
- package/dist/__tests__/learning/feedback-loop-injection.test.js +288 -0
- package/dist/__tests__/learning/feedback-loop-injection.test.js.map +1 -0
- package/dist/__tests__/learning/learning-capture-integration.test.d.ts +6 -0
- package/dist/__tests__/learning/learning-capture-integration.test.d.ts.map +1 -0
- package/dist/__tests__/learning/learning-capture-integration.test.js +151 -0
- package/dist/__tests__/learning/learning-capture-integration.test.js.map +1 -0
- package/dist/__tests__/learning/token-metrics.test.d.ts +2 -0
- package/dist/__tests__/learning/token-metrics.test.d.ts.map +1 -0
- package/dist/__tests__/learning/token-metrics.test.js +308 -0
- package/dist/__tests__/learning/token-metrics.test.js.map +1 -0
- package/dist/__tests__/token-tracking-integration.test.d.ts +8 -0
- package/dist/__tests__/token-tracking-integration.test.d.ts.map +1 -0
- package/dist/__tests__/token-tracking-integration.test.js +669 -0
- package/dist/__tests__/token-tracking-integration.test.js.map +1 -0
- package/dist/agents/prometheus.d.ts.map +1 -1
- package/dist/agents/prometheus.js +27 -0
- package/dist/agents/prometheus.js.map +1 -1
- package/dist/cli/commands/metrics.d.ts +10 -2
- package/dist/cli/commands/metrics.d.ts.map +1 -1
- package/dist/cli/commands/metrics.js +25 -239
- package/dist/cli/commands/metrics.js.map +1 -1
- package/dist/cli/index.js +196 -1
- package/dist/cli/index.js.map +1 -1
- package/dist/config/loader.d.ts.map +1 -1
- package/dist/config/loader.js +14 -0
- package/dist/config/loader.js.map +1 -1
- package/dist/hooks/registrations/budget-warning.d.ts +8 -0
- package/dist/hooks/registrations/budget-warning.d.ts.map +1 -0
- package/dist/hooks/registrations/budget-warning.js +63 -0
- package/dist/hooks/registrations/budget-warning.js.map +1 -0
- package/dist/hooks/registrations/index.d.ts +3 -2
- package/dist/hooks/registrations/index.d.ts.map +1 -1
- package/dist/hooks/registrations/index.js +5 -3
- package/dist/hooks/registrations/index.js.map +1 -1
- package/dist/hooks/registrations/learning-capture.d.ts +19 -0
- package/dist/hooks/registrations/learning-capture.d.ts.map +1 -0
- package/dist/hooks/registrations/learning-capture.js +220 -0
- package/dist/hooks/registrations/learning-capture.js.map +1 -0
- package/dist/hooks/registrations/session-start.d.ts.map +1 -1
- package/dist/hooks/registrations/session-start.js +13 -0
- package/dist/hooks/registrations/session-start.js.map +1 -1
- package/dist/hooks/registrations/token-metrics.d.ts +10 -2
- package/dist/hooks/registrations/token-metrics.d.ts.map +1 -1
- package/dist/hooks/registrations/token-metrics.js +18 -4
- package/dist/hooks/registrations/token-metrics.js.map +1 -1
- package/dist/installer/index.d.ts +2 -2
- package/dist/installer/index.d.ts.map +1 -1
- package/dist/installer/index.js +202 -15
- package/dist/installer/index.js.map +1 -1
- package/dist/learning/aggregation.d.ts +39 -0
- package/dist/learning/aggregation.d.ts.map +1 -0
- package/dist/learning/aggregation.js +101 -0
- package/dist/learning/aggregation.js.map +1 -0
- package/dist/learning/anomaly.d.ts +30 -0
- package/dist/learning/anomaly.d.ts.map +1 -0
- package/dist/learning/anomaly.js +102 -0
- package/dist/learning/anomaly.js.map +1 -0
- package/dist/learning/baselines.d.ts +44 -0
- package/dist/learning/baselines.d.ts.map +1 -0
- package/dist/learning/baselines.js +126 -0
- package/dist/learning/baselines.js.map +1 -0
- package/dist/learning/efficiency.d.ts +23 -0
- package/dist/learning/efficiency.d.ts.map +1 -0
- package/dist/learning/efficiency.js +67 -0
- package/dist/learning/efficiency.js.map +1 -0
- package/dist/learning/hooks/learned-context.d.ts.map +1 -1
- package/dist/learning/hooks/learned-context.js +46 -0
- package/dist/learning/hooks/learned-context.js.map +1 -1
- package/dist/learning/pricing.d.ts +30 -0
- package/dist/learning/pricing.d.ts.map +1 -0
- package/dist/learning/pricing.js +98 -0
- package/dist/learning/pricing.js.map +1 -0
- package/dist/learning/session-state.d.ts +12 -2
- package/dist/learning/session-state.d.ts.map +1 -1
- package/dist/learning/session-state.js +72 -3
- package/dist/learning/session-state.js.map +1 -1
- package/dist/learning/storage.d.ts +21 -1
- package/dist/learning/storage.d.ts.map +1 -1
- package/dist/learning/storage.js +84 -0
- package/dist/learning/storage.js.map +1 -1
- package/dist/learning/token-estimator.d.ts +41 -0
- package/dist/learning/token-estimator.d.ts.map +1 -0
- package/dist/learning/token-estimator.js +111 -0
- package/dist/learning/token-estimator.js.map +1 -0
- package/dist/learning/types.d.ts +32 -0
- package/dist/learning/types.d.ts.map +1 -1
- package/dist/learning/utils.d.ts +42 -0
- package/dist/learning/utils.d.ts.map +1 -0
- package/dist/learning/utils.js +76 -0
- package/dist/learning/utils.js.map +1 -0
- package/dist/shared/types.d.ts +29 -0
- package/dist/shared/types.d.ts.map +1 -1
- package/package.json +1 -1
- package/scripts/dist/hooks/olympus-hooks.cjs +86 -84
|
@@ -0,0 +1,669 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Task 6.4: Final Integration Testing for Token Tracking
|
|
3
|
+
*
|
|
4
|
+
* Tests the complete token tracking flow from session start to feedback capture,
|
|
5
|
+
* budget warnings, and CLI commands.
|
|
6
|
+
*/
|
|
7
|
+
import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest';
|
|
8
|
+
import { join } from 'path';
|
|
9
|
+
import { mkdirSync, rmSync, writeFileSync, existsSync, readFileSync } from 'fs';
|
|
10
|
+
import { generateLearnedContext } from '../learning/hooks/learned-context.js';
|
|
11
|
+
import { loadSessionState, saveSessionState, updateTokenBudget, shouldIssueWarning, markWarningIssued, } from '../learning/session-state.js';
|
|
12
|
+
import { appendFeedback, updateAgentPerformance, readFeedbackLog } from '../learning/storage.js';
|
|
13
|
+
import { randomUUID } from 'crypto';
|
|
14
|
+
import * as storage from '../learning/storage.js';
|
|
15
|
+
// Scratch directory under the current working directory, suffixed with the module-load timestamp.
const TEST_DIR = join(process.cwd(), `.test-integration-${Date.now()}`);
// Learning-data directory nested inside the scratch directory.
const TEST_LEARNING_DIR = join(TEST_DIR, '.claude', 'olympus', 'learning');
// Backup slots; both start out empty.
let agentPerfBackup = null;
let feedbackBackup = null;
|
|
19
|
+
// Before every test: build the isolated directory tree, then point getLearningDir at it.
beforeEach(() => {
    const scratchDirs = [TEST_DIR, join(TEST_DIR, '.olympus'), TEST_LEARNING_DIR];
    for (const dir of scratchDirs) {
        mkdirSync(dir, { recursive: true });
    }
    // Stub getLearningDir so it reports the test learning directory
    const learningDirSpy = vi.spyOn(storage, 'getLearningDir');
    learningDirSpy.mockReturnValue(TEST_LEARNING_DIR);
});
|
|
27
|
+
// After every test: undo all mocks, then delete the scratch directory.
afterEach(() => {
    vi.restoreAllMocks();
    // Only remove a path that carries the test-run marker
    const isScratchDir = TEST_DIR.includes('.test-integration-');
    if (!isScratchDir) {
        return;
    }
    rmSync(TEST_DIR, { recursive: true, force: true });
});
|
|
35
|
+
describe('Integration Test 1: End-to-End Session with Token Tracking', () => {
    // Non-estimated token_usage record; total_tokens is input + output.
    const measuredUsage = (input_tokens, output_tokens) => ({
        input_tokens,
        output_tokens,
        total_tokens: input_tokens + output_tokens,
        estimated: false,
    });
    // Feedback entry stamped `offsetMs` from now; `fields` supplies the per-entry values.
    const buildFeedback = (sessionId, offsetMs, fields) => ({
        id: randomUUID(),
        timestamp: new Date(Date.now() + offsetMs).toISOString(),
        session_id: sessionId,
        project_path: TEST_DIR,
        ...fields,
    });
    it.skip('should track tokens through complete session lifecycle', () => {
        const sessionId = 'e2e-session-1';
        // Step 1 - session start: a freshly loaded state has an untouched budget
        let state = loadSessionState(TEST_DIR, sessionId);
        const startingBudget = state.token_budget;
        expect(startingBudget).toBeDefined();
        expect(startingBudget.current_usage).toBe(0);
        expect(startingBudget.session_baseline).toBe(10000);
        // Step 2 - UserPromptSubmit does no token tracking; only check the state round-trips
        saveSessionState(TEST_DIR, state);
        state = loadSessionState(TEST_DIR, sessionId);
        expect(state.session_id).toBe(sessionId);
        // Steps 3 and 4 - two PostToolUse events, each adding its tokens to the running budget
        const toolEvents = [
            {
                offsetMs: 0,
                task: 'Implement feature X',
                message: 'Task completed successfully',
                inputTokens: 3000,
                outputTokens: 1500,
                runningTotal: 4500,
            },
            {
                offsetMs: 60000,
                task: 'Fix bug Y',
                message: 'Bug fixed correctly',
                inputTokens: 2000,
                outputTokens: 1000,
                runningTotal: 7500,
            },
        ];
        for (const event of toolEvents) {
            const feedback = buildFeedback(sessionId, event.offsetMs, {
                event_type: 'success',
                agent_used: 'olympian',
                original_task: event.task,
                user_message: event.message,
                feedback_category: 'praise',
                confidence: 0.9,
                token_usage: measuredUsage(event.inputTokens, event.outputTokens),
            });
            appendFeedback(feedback);
            state = updateTokenBudget(state, feedback.token_usage.total_tokens);
            saveSessionState(TEST_DIR, state);
            expect(state.token_budget.current_usage).toBe(event.runningTotal);
        }
        // Step 5 - Stop hook: agent performance is recomputed from the whole log
        const olympianPerf = updateAgentPerformance('olympian', readFeedbackLog());
        expect(olympianPerf).not.toBeNull();
        expect(olympianPerf.token_efficiency).toBeDefined();
        expect(olympianPerf.token_efficiency.total_tokens).toBeGreaterThan(0);
        expect(olympianPerf.token_efficiency.invocation_count).toBeGreaterThan(0);
        // Step 6 - the two newest log entries must carry token_usage
        expect(existsSync(join(TEST_LEARNING_DIR, 'feedback-log.jsonl'))).toBe(true);
        const loggedEntries = readFeedbackLog();
        expect(loggedEntries.length).toBeGreaterThanOrEqual(2);
        loggedEntries.slice(-2).forEach((entry) => {
            expect(entry.token_usage).toBeDefined();
            expect(entry.token_usage.total_tokens).toBeGreaterThan(0);
        });
    });
    it.skip('should calculate token efficiency metrics correctly', () => {
        const sessionId = 'e2e-session-2';
        let state = loadSessionState(TEST_DIR, sessionId);
        // Two successes followed by one revision, all for the same agent
        const feedbacks = [
            buildFeedback(sessionId, 0, {
                event_type: 'success',
                agent_used: 'oracle-low',
                original_task: 'Task 1',
                user_message: 'Good work',
                feedback_category: 'praise',
                confidence: 0.9,
                token_usage: measuredUsage(1000, 500),
            }),
            buildFeedback(sessionId, 40000, {
                event_type: 'success',
                agent_used: 'oracle-low',
                original_task: 'Task 2',
                user_message: 'Well done',
                feedback_category: 'praise',
                confidence: 0.9,
                token_usage: measuredUsage(1200, 600),
            }),
            buildFeedback(sessionId, 80000, {
                event_type: 'revision',
                agent_used: 'oracle-low',
                original_task: 'Task 3',
                user_message: 'That needs fixing',
                feedback_category: 'correction',
                confidence: 0.8,
                token_usage: measuredUsage(1500, 800),
            }),
        ];
        feedbacks.forEach((fb) => {
            appendFeedback(fb);
        });
        const totalTokens = feedbacks.reduce((sum, fb) => sum + fb.token_usage.total_tokens, 0);
        state = updateTokenBudget(state, totalTokens);
        saveSessionState(TEST_DIR, state);
        // Recompute agent performance from everything in the log
        const oracleLowPerf = updateAgentPerformance('oracle-low', readFeedbackLog());
        expect(oracleLowPerf).not.toBeNull();
        const efficiency = oracleLowPerf.token_efficiency;
        expect(efficiency).toBeDefined();
        expect(efficiency.total_tokens).toBe(totalTokens);
        expect(efficiency.invocation_count).toBe(3);
        expect(efficiency.avg_tokens_per_success).toBeGreaterThan(0);
        expect(oracleLowPerf.success_count).toBe(2);
        expect(oracleLowPerf.revision_count).toBe(1);
    });
});
|
|
177
|
+
describe('Integration Test 2: Backward Compatibility', () => {
    // Non-estimated token_usage record; total_tokens is input + output.
    const measuredUsage = (input_tokens, output_tokens) => ({
        input_tokens,
        output_tokens,
        total_tokens: input_tokens + output_tokens,
        estimated: false,
    });
    // Successful "praise" entry stamped `offsetMs` from now; pass `token_usage` only for new-style entries.
    const praiseEntry = (sessionId, offsetMs, agent, task, message, confidence, token_usage) => {
        const entry = {
            id: randomUUID(),
            timestamp: new Date(Date.now() + offsetMs).toISOString(),
            session_id: sessionId,
            project_path: TEST_DIR,
            event_type: 'success',
            agent_used: agent,
            original_task: task,
            user_message: message,
            feedback_category: 'praise',
            confidence,
        };
        // Old-style entries have no token_usage key at all
        return token_usage === undefined ? entry : { ...entry, token_usage };
    };
    it.skip('should handle old feedback entries without token_usage', () => {
        // Seed the log by hand with a day-old entry that predates token tracking
        const legacyEntry = praiseEntry('old-session', -86400000, 'olympian', 'Old task without tokens', 'Completed', 0.9);
        const logPath = join(TEST_LEARNING_DIR, 'feedback-log.jsonl');
        writeFileSync(logPath, `${JSON.stringify(legacyEntry)}\n`);
        // Append a current entry that does carry token_usage
        const trackedEntry = praiseEntry('new-session', 0, 'olympian', 'New task with tokens', 'Done well', 0.9, measuredUsage(2000, 1000));
        appendFeedback(trackedEntry);
        // Recomputing performance over the mixed log must not throw
        const olympianPerf = updateAgentPerformance('olympian', readFeedbackLog());
        expect(olympianPerf).not.toBeNull();
        expect(olympianPerf.token_efficiency).toBeDefined();
        // Token totals come from the tracked entry alone ...
        expect(olympianPerf.token_efficiency.total_tokens).toBe(3000);
        // ... while the invocation total covers both entries
        expect(olympianPerf.total_invocations).toBe(2);
    });
    it.skip('should aggregate metrics from mixed old and new entries', () => {
        const logPath = join(TEST_LEARNING_DIR, 'feedback-log.jsonl');
        const session = 'test-session';
        const entries = [
            // Legacy entries: no token_usage
            praiseEntry(session, -200000, 'explore', 'Old task 1', 'Done', 0.8),
            praiseEntry(session, -100000, 'explore', 'Old task 2', 'Completed', 0.8),
            // Current entries: token_usage present
            praiseEntry(session, -50000, 'explore', 'New task 1', 'Good', 0.9, measuredUsage(800, 400)),
            praiseEntry(session, 0, 'explore', 'New task 2', 'Excellent', 0.9, measuredUsage(900, 450)),
        ];
        const serialized = entries.map((entry) => JSON.stringify(entry)).join('\n');
        writeFileSync(logPath, `${serialized}\n`);
        // Performance is recomputed over the mixed log
        const explorePerf = updateAgentPerformance('explore', readFeedbackLog());
        expect(explorePerf).not.toBeNull();
        expect(explorePerf.total_invocations).toBe(4);
        const efficiency = explorePerf.token_efficiency;
        expect(efficiency).toBeDefined();
        // Token metrics are expected to count only the entries that have token_usage
        expect(efficiency.total_tokens).toBe(1200 + 1350);
        expect(efficiency.invocation_count).toBe(2);
    });
});
|
|
292
|
+
describe('Integration Test 3: Injection Token Cap', () => {
    // Rough size estimate shared by both tests: 1 token ≈ 4 chars.
    const approxTokens = (text) => text.length / 4;
    // Assembles one agent-performance record; the spreads keep the on-disk key order stable.
    const perfRecord = (agent_name, outcome, areas, token_efficiency) => ({
        agent_name,
        ...outcome,
        ...areas,
        last_updated: new Date().toISOString(),
        token_efficiency,
    });
    it('should respect 500 token limit for SessionStart injection', () => {
        // Realistic performance data for three agents
        const agentPerformance = {
            'olympian': perfRecord('olympian', {
                total_invocations: 25,
                success_count: 22,
                revision_count: 3,
                cancellation_count: 0,
                success_rate: 0.88,
            }, {
                failure_patterns: [
                    { pattern: 'complex async operations', count: 2, examples: [] },
                    { pattern: 'state management', count: 1, examples: [] },
                ],
                strong_areas: ['file editing', 'code generation', 'testing'],
                weak_areas: ['deep debugging'],
            }, {
                avg_tokens_per_success: 4200,
                avg_tokens_per_failure: 6500,
                total_tokens: 105000,
                invocation_count: 25,
                efficiency_score: 0.85,
                trend: 'stable',
            }),
            'oracle-low': perfRecord('oracle-low', {
                total_invocations: 18,
                success_count: 17,
                revision_count: 1,
                cancellation_count: 0,
                success_rate: 0.94,
            }, {
                failure_patterns: [],
                strong_areas: ['simple debugging', 'code analysis'],
                weak_areas: [],
            }, {
                avg_tokens_per_success: 2100,
                avg_tokens_per_failure: 2800,
                total_tokens: 37800,
                invocation_count: 18,
                efficiency_score: 1.15,
                trend: 'improving',
            }),
            'explore': perfRecord('explore', {
                total_invocations: 30,
                success_count: 29,
                revision_count: 1,
                cancellation_count: 0,
                success_rate: 0.97,
            }, {
                failure_patterns: [],
                strong_areas: ['codebase search', 'pattern matching'],
                weak_areas: [],
            }, {
                avg_tokens_per_success: 1200,
                avg_tokens_per_failure: 1500,
                total_tokens: 36000,
                invocation_count: 30,
                efficiency_score: 1.35,
                trend: 'stable',
            }),
        };
        const perfPath = join(TEST_LEARNING_DIR, 'agent-performance.json');
        writeFileSync(perfPath, JSON.stringify(agentPerformance, null, 2));
        // The fixture must be on disk before the context is generated
        expect(existsSync(perfPath)).toBe(true);
        const context = generateLearnedContext(TEST_DIR);
        expect(approxTokens(context)).toBeLessThanOrEqual(500);
        expect(context).toContain('<olympus-efficiency>');
    });
    it('should prioritize most efficient agents when space is limited', () => {
        // Six agents, listed from highest to lowest efficiency score, to exercise truncation
        const rankedAgents = [
            ['explore', 1.5],
            ['oracle-low', 1.3],
            ['olympian-low', 1.1],
            ['olympian', 0.9],
            ['oracle', 0.7],
            ['frontend-engineer', 0.6],
        ];
        const agentPerformance = Object.fromEntries(rankedAgents.map(([name, efficiency]) => [
            name,
            perfRecord(name, {
                total_invocations: 10,
                success_count: 9,
                revision_count: 1,
                cancellation_count: 0,
                success_rate: 0.9,
            }, {
                failure_patterns: [],
                strong_areas: ['area1', 'area2'],
                weak_areas: ['area3'],
            }, {
                avg_tokens_per_success: 3000,
                avg_tokens_per_failure: 4000,
                total_tokens: 30000,
                invocation_count: 10,
                efficiency_score: efficiency,
                trend: 'stable',
            }),
        ]));
        const perfPath = join(TEST_LEARNING_DIR, 'agent-performance.json');
        writeFileSync(perfPath, JSON.stringify(agentPerformance, null, 2));
        const context = generateLearnedContext(TEST_DIR);
        // The two top-scoring agents are expected in the output
        for (const topAgent of ['explore', 'oracle-low']) {
            expect(context).toContain(topAgent);
        }
        // And the output still fits the token cap
        expect(approxTokens(context)).toBeLessThanOrEqual(500);
    });
});
|
|
413
|
+
describe('Integration Test 4: Budget Warning Behavior', () => {
    it('should fire warning once when threshold exceeded', () => {
        let state = loadSessionState(TEST_DIR, 'budget-warning-1');
        const { session_baseline, warning_threshold } = state.token_budget;
        expect(session_baseline).toBe(10000);
        expect(warning_threshold).toBe(1.5);
        expect(session_baseline * warning_threshold).toBe(15000);
        // 14000 is still under the 15000 limit
        state = updateTokenBudget(state, 14000);
        expect(shouldIssueWarning(state)).toBe(false);
        // 2000 more brings usage to 16000, past the limit
        state = updateTokenBudget(state, 2000);
        expect(shouldIssueWarning(state)).toBe(true);
        // Record that the warning went out and persist it
        state = markWarningIssued(state);
        saveSessionState(TEST_DIR, state);
        // No repeat warning right after marking ...
        expect(shouldIssueWarning(state)).toBe(false);
        // ... and none after usage grows to 26000 either
        state = updateTokenBudget(state, 10000);
        expect(shouldIssueWarning(state)).toBe(false);
        expect(state.token_budget.warning_issued).toBe(true);
    });
    it('should be non-blocking (continue: true)', () => {
        const overBudget = updateTokenBudget(loadSessionState(TEST_DIR, 'budget-warning-2'), 100000);
        // Usage far beyond the budget calls for a warning
        expect(shouldIssueWarning(overBudget)).toBe(true);
        // The non-blocking guarantee ({ continue: true }) belongs to the hook
        // implementation and is verified there; this test only checks that the
        // budget is still present on the state.
        expect(overBudget.token_budget).toBeDefined();
    });
    it('should handle multiple sessions independently', () => {
        // Session state is stored per directory rather than per session ID,
        // so each independent session gets its own directory.
        const [dir1, dir2] = ['session1', 'session2'].map((name) => join(TEST_DIR, name));
        mkdirSync(dir1, { recursive: true });
        mkdirSync(dir2, { recursive: true });
        let state1 = loadSessionState(dir1, 'session1');
        let state2 = loadSessionState(dir2, 'session2');
        // Push session1 over budget, mark its warning, and persist
        state1 = markWarningIssued(updateTokenBudget(state1, 20000));
        saveSessionState(dir1, state1);
        // session2 is untouched until it exceeds the budget itself
        expect(shouldIssueWarning(state2)).toBe(false);
        state2 = updateTokenBudget(state2, 20000);
        expect(shouldIssueWarning(state2)).toBe(true);
        // Reload both from disk: only session1 recorded a warning
        const reloaded1 = loadSessionState(dir1, 'session1');
        const reloaded2 = loadSessionState(dir2, 'session2');
        expect(reloaded1.token_budget.warning_issued).toBe(true);
        expect(reloaded2.token_budget.warning_issued).toBe(false);
    });
});
|
|
473
|
+
describe('Integration Test 5: CLI Commands', () => {
|
|
474
|
+
beforeEach(() => {
|
|
475
|
+
// Set up realistic test data
|
|
476
|
+
const agentPerformance = {
|
|
477
|
+
'olympian': {
|
|
478
|
+
agent_name: 'olympian',
|
|
479
|
+
total_invocations: 15,
|
|
480
|
+
success_count: 13,
|
|
481
|
+
revision_count: 2,
|
|
482
|
+
cancellation_count: 0,
|
|
483
|
+
success_rate: 0.87,
|
|
484
|
+
failure_patterns: [],
|
|
485
|
+
strong_areas: ['editing', 'testing'],
|
|
486
|
+
weak_areas: [],
|
|
487
|
+
last_updated: new Date().toISOString(),
|
|
488
|
+
token_efficiency: {
|
|
489
|
+
avg_tokens_per_success: 4200,
|
|
490
|
+
avg_tokens_per_failure: 5500,
|
|
491
|
+
total_tokens: 65600,
|
|
492
|
+
invocation_count: 15,
|
|
493
|
+
efficiency_score: 0.88,
|
|
494
|
+
trend: 'stable',
|
|
495
|
+
},
|
|
496
|
+
},
|
|
497
|
+
'oracle-low': {
|
|
498
|
+
agent_name: 'oracle-low',
|
|
499
|
+
total_invocations: 10,
|
|
500
|
+
success_count: 10,
|
|
501
|
+
revision_count: 0,
|
|
502
|
+
cancellation_count: 0,
|
|
503
|
+
success_rate: 1.0,
|
|
504
|
+
failure_patterns: [],
|
|
505
|
+
strong_areas: ['debugging'],
|
|
506
|
+
weak_areas: [],
|
|
507
|
+
last_updated: new Date().toISOString(),
|
|
508
|
+
token_efficiency: {
|
|
509
|
+
avg_tokens_per_success: 2100,
|
|
510
|
+
avg_tokens_per_failure: 0,
|
|
511
|
+
total_tokens: 21000,
|
|
512
|
+
invocation_count: 10,
|
|
513
|
+
efficiency_score: 1.2,
|
|
514
|
+
trend: 'improving',
|
|
515
|
+
},
|
|
516
|
+
},
|
|
517
|
+
};
|
|
518
|
+
writeFileSync(join(TEST_LEARNING_DIR, 'agent-performance.json'), JSON.stringify(agentPerformance, null, 2));
|
|
519
|
+
});
|
|
520
|
+
it('should handle --efficiency flag data format', () => {
|
|
521
|
+
// Read agent performance
|
|
522
|
+
const perfPath = join(TEST_LEARNING_DIR, 'agent-performance.json');
|
|
523
|
+
const perfData = JSON.parse(readFileSync(perfPath, 'utf-8'));
|
|
524
|
+
// Verify data structure for CLI display
|
|
525
|
+
const agentsWithTokens = Object.values(perfData).filter(a => a.token_efficiency);
|
|
526
|
+
expect(agentsWithTokens.length).toBeGreaterThan(0);
|
|
527
|
+
for (const agent of agentsWithTokens) {
|
|
528
|
+
expect(agent.token_efficiency).toBeDefined();
|
|
529
|
+
expect(agent.token_efficiency.avg_tokens_per_success).toBeGreaterThan(0);
|
|
530
|
+
expect(agent.token_efficiency.efficiency_score).toBeGreaterThan(0);
|
|
531
|
+
expect(agent.token_efficiency.trend).toMatch(/^(improving|stable|declining|insufficient_data)$/);
|
|
532
|
+
}
|
|
533
|
+
});
|
|
534
|
+
it('should handle --show-costs flag data format', () => {
|
|
535
|
+
const perfPath = join(TEST_LEARNING_DIR, 'agent-performance.json');
|
|
536
|
+
const perfData = JSON.parse(readFileSync(perfPath, 'utf-8'));
|
|
537
|
+
// Cost calculation data should be available
|
|
538
|
+
const agentsWithTokens = Object.values(perfData).filter(a => a.token_efficiency);
|
|
539
|
+
for (const agent of agentsWithTokens) {
|
|
540
|
+
const { total_tokens, invocation_count } = agent.token_efficiency;
|
|
541
|
+
expect(total_tokens).toBeGreaterThan(0);
|
|
542
|
+
expect(invocation_count).toBeGreaterThan(0);
|
|
543
|
+
// Can calculate average cost
|
|
544
|
+
const avgTokens = total_tokens / invocation_count;
|
|
545
|
+
expect(avgTokens).toBeGreaterThan(0);
|
|
546
|
+
}
|
|
547
|
+
});
|
|
548
|
+
/**
 * CLI `--budget-status` data format: after recording 8000 tokens against a
 * session and saving it, the reloaded state must expose the budget fields
 * asserted below and yield a usage percentage of 80.
 */
it('should handle --budget-status flag data format', () => {
  const budgetSession = 'cli-budget-test';

  // Record usage on a freshly loaded session, then persist it.
  const initial = loadSessionState(TEST_DIR, budgetSession);
  const updated = updateTokenBudget(initial, 8000);
  saveSessionState(TEST_DIR, updated);

  // Read the state back from disk and inspect the budget fields.
  const roundTripped = loadSessionState(TEST_DIR, budgetSession);
  expect(roundTripped.token_budget).toBeDefined();

  const budget = roundTripped.token_budget;
  expect(budget.session_baseline).toBe(10000);
  expect(budget.current_usage).toBe(8000);
  expect(budget.warning_threshold).toBe(1.5);

  // Percentage as it would be shown to the user.
  const usedPercent = (budget.current_usage / budget.session_baseline) * 100;
  expect(usedPercent).toBe(80);
});
|
|
563
|
+
/**
 * No-data scenario: with agent-performance.json removed, a guarded read must
 * not throw and session state must still load with a `token_budget`.
 */
it('should gracefully handle no-data case for all CLI commands', () => {
  // Remove agent performance file. `force: true` makes this a no-op when the
  // file is already absent, avoiding a separate exists-then-remove check.
  const perfPath = join(TEST_LEARNING_DIR, 'agent-performance.json');
  rmSync(perfPath, { force: true });

  // Verify the precondition explicitly; otherwise the assertions below could
  // pass without the no-data situation ever being in effect.
  expect(existsSync(perfPath)).toBe(false);

  // Should not throw when reading non-existent data
  expect(() => {
    if (existsSync(perfPath)) {
      readFileSync(perfPath, 'utf-8');
    }
  }).not.toThrow();

  // Session state should still work even without agent performance
  const sessionId = 'no-data-test';
  const state = loadSessionState(TEST_DIR, sessionId);
  expect(state.token_budget).toBeDefined();
});
|
|
580
|
+
});
|
|
581
|
+
/**
 * Integration Test 6: checks that the token-tracking workflow finishes within
 * a wall-clock bound, that session state / feedback capture / agent
 * performance updates still work, and that sessions stored in separate
 * directories keep independent token budgets.
 */
describe('Integration Test 6: Performance and Regression', () => {
  it('should not cause noticeable slowdown in hook execution', () => {
    const begunAt = Date.now();

    // Simulate typical hook workflow: ten feedback entries, each appended to
    // the log and counted against the session's token budget.
    const perfSession = 'perf-test-1';
    let budgetState = loadSessionState(TEST_DIR, perfSession);
    let round = 0;
    while (round < 10) {
      const entry = {
        id: randomUUID(),
        timestamp: new Date(Date.now() + round * 1000).toISOString(),
        session_id: perfSession,
        project_path: TEST_DIR,
        event_type: 'success',
        agent_used: 'olympian',
        original_task: `Task ${round}`,
        user_message: `Task ${round} done`,
        feedback_category: 'praise',
        confidence: 0.9,
        token_usage: {
          input_tokens: 2000 + round * 100,
          output_tokens: 1000 + round * 50,
          total_tokens: 3000 + round * 150,
          estimated: false,
        },
      };
      appendFeedback(entry);
      budgetState = updateTokenBudget(budgetState, entry.token_usage.total_tokens);
      round += 1;
    }

    // Update performance once at the end (typical pattern)
    const feedbackLog = readFeedbackLog();
    updateAgentPerformance('olympian', feedbackLog);
    saveSessionState(TEST_DIR, budgetState);

    // Should complete in reasonable time (< 2 seconds for 10 iterations)
    const durationMs = Date.now() - begunAt;
    expect(durationMs).toBeLessThan(2000);
  });

  it('should maintain existing functionality without regressions', () => {
    // Session state core functionality
    const regressionSession = 'regression-test';
    const loaded = loadSessionState(TEST_DIR, regressionSession);
    expect(loaded.session_id).toBe(regressionSession);
    expect(loaded.started_at).toBeDefined();
    expect(loaded.token_budget).toBeDefined();

    // Feedback capture still works
    const entry = {
      id: randomUUID(),
      timestamp: new Date().toISOString(),
      session_id: regressionSession,
      project_path: TEST_DIR,
      event_type: 'success',
      agent_used: 'explore',
      original_task: 'Search codebase',
      user_message: 'Found it',
      feedback_category: 'praise',
      confidence: 0.9,
      token_usage: {
        input_tokens: 500,
        output_tokens: 200,
        total_tokens: 700,
        estimated: false,
      },
    };
    const capture = () => appendFeedback(entry);
    expect(capture).not.toThrow();

    // Agent performance update still works
    const feedbackLog = readFeedbackLog();
    const refresh = () => updateAgentPerformance('explore', feedbackLog);
    expect(refresh).not.toThrow();
  });

  it('should handle concurrent sessions without conflicts', () => {
    // Session state is stored per directory, so each session gets its own
    // directory, id and expected usage figure.
    const sessions = ['concurrent-1', 'concurrent-2', 'concurrent-3'].map((name, index) => ({
      dir: join(TEST_DIR, name),
      id: `session-${index}`,
      usage: 5000 * (index + 1),
    }));

    // Create directories and load states
    const loadedStates = sessions.map(({ dir, id }) => {
      mkdirSync(dir, { recursive: true });
      return loadSessionState(dir, id);
    });

    // Update each session independently
    loadedStates.forEach((state, index) => {
      const { dir, usage } = sessions[index];
      saveSessionState(dir, updateTokenBudget(state, usage));
    });

    // Reload and verify independence
    for (const { dir, id, usage } of sessions) {
      const reloaded = loadSessionState(dir, id);
      expect(reloaded.token_budget.current_usage).toBe(usage);
    }
  });
});
|
|
669
|
+
//# sourceMappingURL=token-tracking-integration.test.js.map
|