@elizaos/training 2.0.0-alpha.76 → 2.0.0-alpha.78
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/.turbo/turbo-lint.log +0 -3
- package/.turbo/turbo-typecheck.log +0 -1
- package/dist/.tsbuildinfo +0 -1
- package/dist/adapter.js +0 -59
- package/dist/archetypes/ArchetypeConfigService.js +0 -510
- package/dist/archetypes/derive-archetype.js +0 -196
- package/dist/archetypes/index.js +0 -7
- package/dist/benchmark/ArchetypeMatchupBenchmark.js +0 -547
- package/dist/benchmark/BenchmarkChartGenerator.js +0 -632
- package/dist/benchmark/BenchmarkDataGenerator.js +0 -825
- package/dist/benchmark/BenchmarkDataViewer.js +0 -197
- package/dist/benchmark/BenchmarkHistoryService.js +0 -135
- package/dist/benchmark/BenchmarkRunner.js +0 -483
- package/dist/benchmark/BenchmarkValidator.js +0 -158
- package/dist/benchmark/FastEvalRunner.js +0 -133
- package/dist/benchmark/MetricsValidator.js +0 -104
- package/dist/benchmark/MetricsVisualizer.js +0 -775
- package/dist/benchmark/ModelBenchmarkService.js +0 -433
- package/dist/benchmark/ModelRegistry.js +0 -122
- package/dist/benchmark/RulerBenchmarkIntegration.js +0 -168
- package/dist/benchmark/SimulationA2AInterface.js +0 -683
- package/dist/benchmark/SimulationEngine.js +0 -522
- package/dist/benchmark/TaskRunner.js +0 -60
- package/dist/benchmark/__tests__/BenchmarkRunner.test.js +0 -409
- package/dist/benchmark/__tests__/HeadToHead.test.js +0 -105
- package/dist/benchmark/index.js +0 -23
- package/dist/benchmark/parseSimulationMetrics.js +0 -86
- package/dist/benchmark/simulation-types.js +0 -1
- package/dist/dependencies.js +0 -197
- package/dist/generation/TrajectoryGenerator.js +0 -244
- package/dist/generation/index.js +0 -6
- package/dist/huggingface/HuggingFaceDatasetUploader.js +0 -463
- package/dist/huggingface/HuggingFaceIntegrationService.js +0 -272
- package/dist/huggingface/HuggingFaceModelUploader.js +0 -385
- package/dist/huggingface/index.js +0 -9
- package/dist/huggingface/shared/HuggingFaceUploadUtil.js +0 -144
- package/dist/index.js +0 -41
- package/dist/init-training.js +0 -43
- package/dist/metrics/TrajectoryMetricsExtractor.js +0 -523
- package/dist/metrics/__tests__/TrajectoryMetricsExtractor.test.js +0 -628
- package/dist/metrics/index.js +0 -7
- package/dist/metrics/types.js +0 -21
- package/dist/rubrics/__tests__/index.test.js +0 -150
- package/dist/rubrics/ass-kisser.js +0 -83
- package/dist/rubrics/degen.js +0 -78
- package/dist/rubrics/goody-twoshoes.js +0 -82
- package/dist/rubrics/index.js +0 -184
- package/dist/rubrics/information-trader.js +0 -82
- package/dist/rubrics/infosec.js +0 -99
- package/dist/rubrics/liar.js +0 -102
- package/dist/rubrics/perps-trader.js +0 -85
- package/dist/rubrics/researcher.js +0 -79
- package/dist/rubrics/scammer.js +0 -80
- package/dist/rubrics/social-butterfly.js +0 -71
- package/dist/rubrics/super-predictor.js +0 -95
- package/dist/rubrics/trader.js +0 -65
- package/dist/scoring/ArchetypeScoringService.js +0 -301
- package/dist/scoring/JudgePromptBuilder.js +0 -401
- package/dist/scoring/LLMJudgeCache.js +0 -263
- package/dist/scoring/index.js +0 -8
- package/dist/training/AutomationPipeline.js +0 -714
- package/dist/training/BenchmarkService.js +0 -370
- package/dist/training/ConfigValidator.js +0 -153
- package/dist/training/MarketOutcomesTracker.js +0 -142
- package/dist/training/ModelDeployer.js +0 -128
- package/dist/training/ModelFetcher.js +0 -48
- package/dist/training/ModelSelectionService.js +0 -248
- package/dist/training/ModelUsageVerifier.js +0 -106
- package/dist/training/MultiModelOrchestrator.js +0 -349
- package/dist/training/RLModelConfig.js +0 -295
- package/dist/training/RewardBackpropagationService.js +0 -117
- package/dist/training/RulerScoringService.js +0 -450
- package/dist/training/TrainingMonitor.js +0 -108
- package/dist/training/TrajectoryRecorder.js +0 -281
- package/dist/training/__tests__/TrajectoryRecorder.test.js +0 -363
- package/dist/training/index.js +0 -30
- package/dist/training/logRLConfig.js +0 -29
- package/dist/training/pipeline.js +0 -80
- package/dist/training/storage/ModelStorageService.js +0 -190
- package/dist/training/storage/TrainingDataArchiver.js +0 -136
- package/dist/training/storage/index.js +0 -7
- package/dist/training/types.js +0 -6
- package/dist/training/window-utils.js +0 -100
- package/dist/utils/index.js +0 -73
- package/dist/utils/logger.js +0 -55
- package/dist/utils/snowflake.js +0 -15
- package/dist/utils/synthetic-detector.js +0 -67
- package/vitest.config.ts +0 -8
|
@@ -1,628 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Trajectory Metrics Extractor Tests
|
|
3
|
-
*
|
|
4
|
-
* Validates that all metrics are properly extracted and never null/undefined/NaN.
|
|
5
|
-
*/
|
|
6
|
-
import { beforeEach, describe, expect, it } from "vitest";
|
|
7
|
-
import { TrajectoryMetricsExtractor, trajectoryMetricsExtractor, } from "../TrajectoryMetricsExtractor";
|
|
8
|
-
describe("TrajectoryMetricsExtractor", () => {
|
|
9
|
-
let extractor;
|
|
10
|
-
beforeEach(() => {
|
|
11
|
-
extractor = new TrajectoryMetricsExtractor();
|
|
12
|
-
});
|
|
13
|
-
/**
 * Asserts that an extracted metrics object is completely populated.
 *
 * Root identifiers must be defined and `extractedAt` must be a Date. Every
 * numeric field in the social, trading, influence, behavior and information
 * groups must be a finite number; depending on the field it must also be
 * non-negative, or lie within [0, 1]. `behavior.actionTypesUsed` must be an
 * array and `behavior.dominantActionType` a string.
 *
 * Checks run in a fixed order, so the first violated expectation is the one
 * that is reported.
 *
 * @param {object} metrics - Result of the metrics extractor under test.
 */
function assertValidMetrics(metrics) {
    // Range tags for numeric fields:
    //   FINITE       -> number + finite
    //   NON_NEGATIVE -> number + finite + >= 0
    //   UNIT         -> number + finite + >= 0 + <= 1
    const FINITE = 0;
    const NON_NEGATIVE = 1;
    const UNIT = 2;
    // Runs the per-field expectations for one metrics group, in table order.
    const verifyGroup = (group, table) => {
        for (const [field, range] of table) {
            expect(typeof group[field]).toBe("number");
            expect(Number.isFinite(group[field])).toBe(true);
            if (range !== FINITE) {
                expect(group[field]).toBeGreaterThanOrEqual(0);
            }
            if (range === UNIT) {
                expect(group[field]).toBeLessThanOrEqual(1);
            }
        }
    };
    // Root level
    expect(metrics.trajectoryId).toBeDefined();
    expect(metrics.agentId).toBeDefined();
    expect(metrics.extractedAt).toBeInstanceOf(Date);
    // Social metrics
    verifyGroup(metrics.social, [
        ["groupChatsJoined", NON_NEGATIVE],
        ["groupChatsCreated", NON_NEGATIVE],
        ["groupMessagesSent", NON_NEGATIVE],
        ["dmsInitiated", NON_NEGATIVE],
        ["dmsReceived", NON_NEGATIVE],
        ["dmResponseRate", UNIT],
        ["uniqueUsersInteracted", NON_NEGATIVE],
        ["postsCreated", NON_NEGATIVE],
        ["commentsMade", NON_NEGATIVE],
        ["mentionsGiven", NON_NEGATIVE],
        ["mentionsReceived", NON_NEGATIVE],
        ["invitationsSent", NON_NEGATIVE],
    ]);
    // Trading metrics
    verifyGroup(metrics.trading, [
        ["tradesExecuted", NON_NEGATIVE],
        ["profitableTrades", NON_NEGATIVE],
        ["winRate", UNIT],
        ["totalPnL", FINITE],
        ["maxDrawdown", NON_NEGATIVE],
        ["sharpeRatio", FINITE],
        ["avgPositionSize", NON_NEGATIVE],
        ["avgHoldingPeriod", NON_NEGATIVE],
        ["marketsTraded", NON_NEGATIVE],
        ["buyTrades", NON_NEGATIVE],
        ["sellTrades", NON_NEGATIVE],
        ["largestWin", FINITE],
        ["largestLoss", FINITE],
    ]);
    // Influence metrics
    verifyGroup(metrics.influence, [
        ["followersGained", FINITE],
        ["reputationDelta", FINITE],
        ["trustLevelDelta", FINITE],
        ["influenceScore", FINITE],
        ["informationSpread", NON_NEGATIVE],
        ["positiveReactions", NON_NEGATIVE],
        ["negativeReactions", NON_NEGATIVE],
    ]);
    // Behavior metrics (numeric fields first, then the two non-numeric ones)
    const behaviorGroup = metrics.behavior;
    verifyGroup(behaviorGroup, [
        ["actionsPerTick", NON_NEGATIVE],
        ["socialToTradeRatio", NON_NEGATIVE],
        ["avgResponseTime", NON_NEGATIVE],
        ["consistencyScore", UNIT],
        ["totalActions", NON_NEGATIVE],
        ["failedActions", NON_NEGATIVE],
        ["actionSuccessRate", UNIT],
        ["episodeLength", NON_NEGATIVE],
    ]);
    expect(Array.isArray(behaviorGroup.actionTypesUsed)).toBe(true);
    expect(typeof behaviorGroup.dominantActionType).toBe("string");
    // Information metrics
    verifyGroup(metrics.information, [
        ["researchActions", NON_NEGATIVE],
        ["newsConsumed", NON_NEGATIVE],
        ["marketDataQueries", NON_NEGATIVE],
        ["infoRequestsSent", NON_NEGATIVE],
        ["infoShared", NON_NEGATIVE],
        ["predictionsMade", NON_NEGATIVE],
        ["correctPredictions", NON_NEGATIVE],
        ["predictionAccuracy", UNIT],
    ]);
}
|
|
175
|
-
describe("extract()", () => {
|
|
176
|
-
it("should return valid metrics for empty steps array", () => {
|
|
177
|
-
const metrics = extractor.extract({
|
|
178
|
-
trajectoryId: "test-traj-1",
|
|
179
|
-
agentId: "test-agent-1",
|
|
180
|
-
steps: [],
|
|
181
|
-
});
|
|
182
|
-
assertValidMetrics(metrics);
|
|
183
|
-
expect(metrics.behavior.episodeLength).toBe(0);
|
|
184
|
-
expect(metrics.behavior.totalActions).toBe(0);
|
|
185
|
-
});
|
|
186
|
-
it("should return valid metrics for minimal step with no action", () => {
|
|
187
|
-
const steps = [
|
|
188
|
-
{
|
|
189
|
-
stepNumber: 0,
|
|
190
|
-
timestamp: Date.now(),
|
|
191
|
-
environmentState: {
|
|
192
|
-
agentBalance: 1000,
|
|
193
|
-
agentPnL: 0,
|
|
194
|
-
openPositions: 0,
|
|
195
|
-
},
|
|
196
|
-
providerAccesses: [],
|
|
197
|
-
llmCalls: [],
|
|
198
|
-
action: {
|
|
199
|
-
actionType: "idle",
|
|
200
|
-
parameters: {},
|
|
201
|
-
success: true,
|
|
202
|
-
},
|
|
203
|
-
reward: 0,
|
|
204
|
-
},
|
|
205
|
-
];
|
|
206
|
-
const metrics = extractor.extract({
|
|
207
|
-
trajectoryId: "test-traj-2",
|
|
208
|
-
agentId: "test-agent-2",
|
|
209
|
-
steps,
|
|
210
|
-
});
|
|
211
|
-
assertValidMetrics(metrics);
|
|
212
|
-
expect(metrics.behavior.episodeLength).toBe(1);
|
|
213
|
-
});
|
|
214
|
-
it("should correctly count trading actions", () => {
|
|
215
|
-
const steps = [
|
|
216
|
-
createStep({
|
|
217
|
-
actionType: "buy",
|
|
218
|
-
parameters: { marketId: "BTC", amount: 100, side: "buy" },
|
|
219
|
-
result: { pnl: 10 },
|
|
220
|
-
success: true,
|
|
221
|
-
}),
|
|
222
|
-
createStep({
|
|
223
|
-
actionType: "sell",
|
|
224
|
-
parameters: { marketId: "ETH", amount: 50, side: "sell" },
|
|
225
|
-
result: { pnl: -5 },
|
|
226
|
-
success: true,
|
|
227
|
-
}),
|
|
228
|
-
createStep({
|
|
229
|
-
actionType: "trade",
|
|
230
|
-
parameters: { marketId: "BTC", amount: 200 },
|
|
231
|
-
result: { pnl: 20 },
|
|
232
|
-
success: true,
|
|
233
|
-
}),
|
|
234
|
-
];
|
|
235
|
-
const metrics = extractor.extract({
|
|
236
|
-
trajectoryId: "test-traj-3",
|
|
237
|
-
agentId: "test-agent-3",
|
|
238
|
-
steps,
|
|
239
|
-
});
|
|
240
|
-
assertValidMetrics(metrics);
|
|
241
|
-
expect(metrics.trading.tradesExecuted).toBe(3);
|
|
242
|
-
expect(metrics.trading.buyTrades).toBe(1);
|
|
243
|
-
expect(metrics.trading.sellTrades).toBe(1);
|
|
244
|
-
expect(metrics.trading.marketsTraded).toBe(2);
|
|
245
|
-
expect(metrics.trading.profitableTrades).toBe(2);
|
|
246
|
-
expect(metrics.trading.totalPnL).toBe(25);
|
|
247
|
-
expect(metrics.trading.winRate).toBeCloseTo(2 / 3, 2);
|
|
248
|
-
expect(metrics.trading.largestWin).toBe(20);
|
|
249
|
-
expect(metrics.trading.largestLoss).toBe(-5);
|
|
250
|
-
});
|
|
251
|
-
it("should correctly count social actions", () => {
|
|
252
|
-
const steps = [
|
|
253
|
-
createStep({
|
|
254
|
-
actionType: "join_group_chat",
|
|
255
|
-
parameters: { groupId: "group-1" },
|
|
256
|
-
success: true,
|
|
257
|
-
}),
|
|
258
|
-
createStep({
|
|
259
|
-
actionType: "post_group_message",
|
|
260
|
-
parameters: { groupId: "group-1", message: "Hello" },
|
|
261
|
-
success: true,
|
|
262
|
-
}),
|
|
263
|
-
createStep({
|
|
264
|
-
actionType: "send_dm",
|
|
265
|
-
parameters: { toUserId: "user-2", initiator: "test-agent-4" },
|
|
266
|
-
success: true,
|
|
267
|
-
}),
|
|
268
|
-
createStep({
|
|
269
|
-
actionType: "create_post",
|
|
270
|
-
parameters: {},
|
|
271
|
-
success: true,
|
|
272
|
-
}),
|
|
273
|
-
createStep({
|
|
274
|
-
actionType: "comment",
|
|
275
|
-
parameters: { authorId: "user-3" },
|
|
276
|
-
success: true,
|
|
277
|
-
}),
|
|
278
|
-
];
|
|
279
|
-
const metrics = extractor.extract({
|
|
280
|
-
trajectoryId: "test-traj-4",
|
|
281
|
-
agentId: "test-agent-4",
|
|
282
|
-
steps,
|
|
283
|
-
});
|
|
284
|
-
assertValidMetrics(metrics);
|
|
285
|
-
expect(metrics.social.groupChatsJoined).toBe(1);
|
|
286
|
-
expect(metrics.social.groupMessagesSent).toBe(1);
|
|
287
|
-
expect(metrics.social.dmsInitiated).toBe(1);
|
|
288
|
-
expect(metrics.social.postsCreated).toBe(1);
|
|
289
|
-
expect(metrics.social.commentsMade).toBe(1);
|
|
290
|
-
expect(metrics.social.uniqueUsersInteracted).toBeGreaterThanOrEqual(2);
|
|
291
|
-
});
|
|
292
|
-
it("should handle failed actions correctly", () => {
|
|
293
|
-
const steps = [
|
|
294
|
-
createStep({ actionType: "trade", success: true }),
|
|
295
|
-
createStep({ actionType: "trade", success: false }),
|
|
296
|
-
createStep({ actionType: "trade", success: false }),
|
|
297
|
-
createStep({ actionType: "trade", success: true }),
|
|
298
|
-
];
|
|
299
|
-
const metrics = extractor.extract({
|
|
300
|
-
trajectoryId: "test-traj-5",
|
|
301
|
-
agentId: "test-agent-5",
|
|
302
|
-
steps,
|
|
303
|
-
});
|
|
304
|
-
assertValidMetrics(metrics);
|
|
305
|
-
expect(metrics.behavior.totalActions).toBe(4);
|
|
306
|
-
expect(metrics.behavior.failedActions).toBe(2);
|
|
307
|
-
expect(metrics.behavior.actionSuccessRate).toBe(0.5);
|
|
308
|
-
});
|
|
309
|
-
it("should calculate socialToTradeRatio correctly", () => {
|
|
310
|
-
const steps = [
|
|
311
|
-
createStep({ actionType: "send_dm", success: true }),
|
|
312
|
-
createStep({ actionType: "send_dm", success: true }),
|
|
313
|
-
createStep({ actionType: "send_dm", success: true }),
|
|
314
|
-
createStep({ actionType: "trade", success: true }),
|
|
315
|
-
];
|
|
316
|
-
const metrics = extractor.extract({
|
|
317
|
-
trajectoryId: "test-traj-6",
|
|
318
|
-
agentId: "test-agent-6",
|
|
319
|
-
steps,
|
|
320
|
-
});
|
|
321
|
-
assertValidMetrics(metrics);
|
|
322
|
-
expect(metrics.behavior.socialToTradeRatio).toBe(3);
|
|
323
|
-
});
|
|
324
|
-
it("should handle social-only trajectories without weird ratios", () => {
|
|
325
|
-
const steps = [
|
|
326
|
-
createStep({ actionType: "send_dm", success: true }),
|
|
327
|
-
createStep({ actionType: "create_post", success: true }),
|
|
328
|
-
createStep({ actionType: "comment", success: true }),
|
|
329
|
-
];
|
|
330
|
-
const metrics = extractor.extract({
|
|
331
|
-
trajectoryId: "test-traj-7",
|
|
332
|
-
agentId: "test-agent-7",
|
|
333
|
-
steps,
|
|
334
|
-
});
|
|
335
|
-
assertValidMetrics(metrics);
|
|
336
|
-
// When no trades, socialToTradeRatio equals the social action count
|
|
337
|
-
expect(metrics.behavior.socialToTradeRatio).toBe(3);
|
|
338
|
-
expect(Number.isFinite(metrics.behavior.socialToTradeRatio)).toBe(true);
|
|
339
|
-
});
|
|
340
|
-
it("should track reputation changes", () => {
|
|
341
|
-
const steps = [
|
|
342
|
-
createStepWithEnvState({ reputation: 100, trustLevel: 50 }),
|
|
343
|
-
createStepWithEnvState({ reputation: 110, trustLevel: 55 }),
|
|
344
|
-
createStepWithEnvState({ reputation: 105, trustLevel: 60 }),
|
|
345
|
-
];
|
|
346
|
-
const metrics = extractor.extract({
|
|
347
|
-
trajectoryId: "test-traj-8",
|
|
348
|
-
agentId: "test-agent-8",
|
|
349
|
-
steps,
|
|
350
|
-
});
|
|
351
|
-
assertValidMetrics(metrics);
|
|
352
|
-
expect(metrics.influence.reputationDelta).toBe(5); // 105 - 100
|
|
353
|
-
expect(metrics.influence.trustLevelDelta).toBe(10); // 60 - 50
|
|
354
|
-
});
|
|
355
|
-
it("should calculate consistency score correctly", () => {
|
|
356
|
-
// Perfectly consistent (all same action)
|
|
357
|
-
const consistentSteps = [
|
|
358
|
-
createStep({ actionType: "trade", success: true }),
|
|
359
|
-
createStep({ actionType: "trade", success: true }),
|
|
360
|
-
createStep({ actionType: "trade", success: true }),
|
|
361
|
-
];
|
|
362
|
-
const consistentMetrics = extractor.extract({
|
|
363
|
-
trajectoryId: "test-traj-9",
|
|
364
|
-
agentId: "test-agent-9",
|
|
365
|
-
steps: consistentSteps,
|
|
366
|
-
});
|
|
367
|
-
assertValidMetrics(consistentMetrics);
|
|
368
|
-
expect(consistentMetrics.behavior.consistencyScore).toBe(1);
|
|
369
|
-
// Less consistent (varied actions)
|
|
370
|
-
const variedSteps = [
|
|
371
|
-
createStep({ actionType: "trade", success: true }),
|
|
372
|
-
createStep({ actionType: "trade", success: true }),
|
|
373
|
-
createStep({ actionType: "trade", success: true }),
|
|
374
|
-
createStep({ actionType: "send_dm", success: true }),
|
|
375
|
-
createStep({ actionType: "create_post", success: true }),
|
|
376
|
-
];
|
|
377
|
-
const variedMetrics = extractor.extract({
|
|
378
|
-
trajectoryId: "test-traj-10",
|
|
379
|
-
agentId: "test-agent-10",
|
|
380
|
-
steps: variedSteps,
|
|
381
|
-
});
|
|
382
|
-
assertValidMetrics(variedMetrics);
|
|
383
|
-
expect(variedMetrics.behavior.consistencyScore).toBeLessThan(1);
|
|
384
|
-
expect(variedMetrics.behavior.consistencyScore).toBeGreaterThan(0);
|
|
385
|
-
});
|
|
386
|
-
it("should correctly identify dominant action type", () => {
|
|
387
|
-
const steps = [
|
|
388
|
-
createStep({ actionType: "trade", success: true }),
|
|
389
|
-
createStep({ actionType: "trade", success: true }),
|
|
390
|
-
createStep({ actionType: "trade", success: true }),
|
|
391
|
-
createStep({ actionType: "send_dm", success: true }),
|
|
392
|
-
createStep({ actionType: "send_dm", success: true }),
|
|
393
|
-
];
|
|
394
|
-
const metrics = extractor.extract({
|
|
395
|
-
trajectoryId: "test-traj-11",
|
|
396
|
-
agentId: "test-agent-11",
|
|
397
|
-
steps,
|
|
398
|
-
});
|
|
399
|
-
assertValidMetrics(metrics);
|
|
400
|
-
expect(metrics.behavior.dominantActionType).toBe("trade");
|
|
401
|
-
});
|
|
402
|
-
it("should handle prediction correctness tracking", () => {
|
|
403
|
-
const steps = [
|
|
404
|
-
createStep({
|
|
405
|
-
actionType: "predict",
|
|
406
|
-
success: true,
|
|
407
|
-
correctness: { predictionCorrect: true },
|
|
408
|
-
}),
|
|
409
|
-
createStep({
|
|
410
|
-
actionType: "predict",
|
|
411
|
-
success: true,
|
|
412
|
-
correctness: { predictionCorrect: false },
|
|
413
|
-
}),
|
|
414
|
-
createStep({
|
|
415
|
-
actionType: "predict",
|
|
416
|
-
success: true,
|
|
417
|
-
correctness: { predictionCorrect: true },
|
|
418
|
-
}),
|
|
419
|
-
];
|
|
420
|
-
const metrics = extractor.extract({
|
|
421
|
-
trajectoryId: "test-traj-12",
|
|
422
|
-
agentId: "test-agent-12",
|
|
423
|
-
steps,
|
|
424
|
-
});
|
|
425
|
-
assertValidMetrics(metrics);
|
|
426
|
-
expect(metrics.information.predictionsMade).toBe(3);
|
|
427
|
-
expect(metrics.information.correctPredictions).toBe(2);
|
|
428
|
-
expect(metrics.information.predictionAccuracy).toBeCloseTo(2 / 3, 2);
|
|
429
|
-
});
|
|
430
|
-
});
|
|
431
|
-
describe("extractFromRaw()", () => {
|
|
432
|
-
it("should return null for invalid JSON", () => {
|
|
433
|
-
const result = extractor.extractFromRaw({
|
|
434
|
-
trajectoryId: "test-traj-13",
|
|
435
|
-
agentId: "test-agent-13",
|
|
436
|
-
stepsJson: "not valid json",
|
|
437
|
-
});
|
|
438
|
-
expect(result).toBeNull();
|
|
439
|
-
});
|
|
440
|
-
it("should return null for empty array JSON", () => {
|
|
441
|
-
const result = extractor.extractFromRaw({
|
|
442
|
-
trajectoryId: "test-traj-14",
|
|
443
|
-
agentId: "test-agent-14",
|
|
444
|
-
stepsJson: "[]",
|
|
445
|
-
});
|
|
446
|
-
expect(result).toBeNull();
|
|
447
|
-
});
|
|
448
|
-
it("should return null for null JSON", () => {
|
|
449
|
-
const result = extractor.extractFromRaw({
|
|
450
|
-
trajectoryId: "test-traj-15",
|
|
451
|
-
agentId: "test-agent-15",
|
|
452
|
-
stepsJson: "null",
|
|
453
|
-
});
|
|
454
|
-
expect(result).toBeNull();
|
|
455
|
-
});
|
|
456
|
-
it("should correctly parse valid JSON steps", () => {
|
|
457
|
-
const steps = [
|
|
458
|
-
createStep({ actionType: "trade", success: true }),
|
|
459
|
-
createStep({ actionType: "send_dm", success: true }),
|
|
460
|
-
];
|
|
461
|
-
const result = extractor.extractFromRaw({
|
|
462
|
-
trajectoryId: "test-traj-16",
|
|
463
|
-
agentId: "test-agent-16",
|
|
464
|
-
stepsJson: JSON.stringify(steps),
|
|
465
|
-
});
|
|
466
|
-
expect(result).not.toBeNull();
|
|
467
|
-
if (result) {
|
|
468
|
-
assertValidMetrics(result);
|
|
469
|
-
expect(result.behavior.totalActions).toBe(2);
|
|
470
|
-
}
|
|
471
|
-
});
|
|
472
|
-
it("should use finalPnL when provided", () => {
|
|
473
|
-
const steps = [
|
|
474
|
-
{
|
|
475
|
-
stepNumber: 0,
|
|
476
|
-
timestamp: Date.now(),
|
|
477
|
-
environmentState: {
|
|
478
|
-
agentBalance: 1000,
|
|
479
|
-
agentPnL: 0,
|
|
480
|
-
openPositions: 0,
|
|
481
|
-
},
|
|
482
|
-
providerAccesses: [],
|
|
483
|
-
llmCalls: [],
|
|
484
|
-
action: { actionType: "idle", parameters: {}, success: true },
|
|
485
|
-
reward: 0,
|
|
486
|
-
},
|
|
487
|
-
];
|
|
488
|
-
const result = extractor.extractFromRaw({
|
|
489
|
-
trajectoryId: "test-traj-17",
|
|
490
|
-
agentId: "test-agent-17",
|
|
491
|
-
stepsJson: JSON.stringify(steps),
|
|
492
|
-
finalPnL: 150.5,
|
|
493
|
-
});
|
|
494
|
-
expect(result).not.toBeNull();
|
|
495
|
-
if (result) {
|
|
496
|
-
assertValidMetrics(result);
|
|
497
|
-
}
|
|
498
|
-
});
|
|
499
|
-
});
|
|
500
|
-
describe("singleton instance", () => {
|
|
501
|
-
it("should export a singleton instance", () => {
|
|
502
|
-
expect(trajectoryMetricsExtractor).toBeInstanceOf(TrajectoryMetricsExtractor);
|
|
503
|
-
});
|
|
504
|
-
});
|
|
505
|
-
describe("edge cases", () => {
|
|
506
|
-
it("should handle undefined parameters gracefully", () => {
|
|
507
|
-
const steps = [
|
|
508
|
-
{
|
|
509
|
-
stepNumber: 0,
|
|
510
|
-
timestamp: Date.now(),
|
|
511
|
-
environmentState: {
|
|
512
|
-
agentBalance: 1000,
|
|
513
|
-
agentPnL: 0,
|
|
514
|
-
openPositions: 0,
|
|
515
|
-
},
|
|
516
|
-
providerAccesses: [],
|
|
517
|
-
llmCalls: [],
|
|
518
|
-
action: {
|
|
519
|
-
actionType: "trade",
|
|
520
|
-
parameters: {},
|
|
521
|
-
success: true,
|
|
522
|
-
},
|
|
523
|
-
reward: 0,
|
|
524
|
-
},
|
|
525
|
-
];
|
|
526
|
-
const metrics = extractor.extract({
|
|
527
|
-
trajectoryId: "test-edge-1",
|
|
528
|
-
agentId: "test-agent-edge-1",
|
|
529
|
-
steps,
|
|
530
|
-
});
|
|
531
|
-
assertValidMetrics(metrics);
|
|
532
|
-
});
|
|
533
|
-
it("should handle very large numbers without overflow", () => {
|
|
534
|
-
const steps = [
|
|
535
|
-
createStep({
|
|
536
|
-
actionType: "trade",
|
|
537
|
-
parameters: { amount: 1e15 },
|
|
538
|
-
result: { pnl: 1e12 },
|
|
539
|
-
success: true,
|
|
540
|
-
}),
|
|
541
|
-
];
|
|
542
|
-
const metrics = extractor.extract({
|
|
543
|
-
trajectoryId: "test-edge-2",
|
|
544
|
-
agentId: "test-agent-edge-2",
|
|
545
|
-
steps,
|
|
546
|
-
});
|
|
547
|
-
assertValidMetrics(metrics);
|
|
548
|
-
expect(Number.isFinite(metrics.trading.totalPnL)).toBe(true);
|
|
549
|
-
expect(Number.isFinite(metrics.trading.avgPositionSize)).toBe(true);
|
|
550
|
-
});
|
|
551
|
-
it("should handle negative PnL correctly", () => {
|
|
552
|
-
const steps = [
|
|
553
|
-
createStep({
|
|
554
|
-
actionType: "trade",
|
|
555
|
-
result: { pnl: -100 },
|
|
556
|
-
success: true,
|
|
557
|
-
}),
|
|
558
|
-
createStep({
|
|
559
|
-
actionType: "trade",
|
|
560
|
-
result: { pnl: -50 },
|
|
561
|
-
success: true,
|
|
562
|
-
}),
|
|
563
|
-
];
|
|
564
|
-
const metrics = extractor.extract({
|
|
565
|
-
trajectoryId: "test-edge-3",
|
|
566
|
-
agentId: "test-agent-edge-3",
|
|
567
|
-
steps,
|
|
568
|
-
});
|
|
569
|
-
assertValidMetrics(metrics);
|
|
570
|
-
expect(metrics.trading.totalPnL).toBe(-150);
|
|
571
|
-
expect(metrics.trading.profitableTrades).toBe(0);
|
|
572
|
-
expect(metrics.trading.winRate).toBe(0);
|
|
573
|
-
expect(metrics.trading.largestLoss).toBe(-100);
|
|
574
|
-
});
|
|
575
|
-
it("should handle mixed case action types", () => {
|
|
576
|
-
const steps = [
|
|
577
|
-
createStep({ actionType: "TRADE", success: true }),
|
|
578
|
-
createStep({ actionType: "Trade", success: true }),
|
|
579
|
-
createStep({ actionType: "trade", success: true }),
|
|
580
|
-
];
|
|
581
|
-
const metrics = extractor.extract({
|
|
582
|
-
trajectoryId: "test-edge-4",
|
|
583
|
-
agentId: "test-agent-edge-4",
|
|
584
|
-
steps,
|
|
585
|
-
});
|
|
586
|
-
assertValidMetrics(metrics);
|
|
587
|
-
expect(metrics.trading.tradesExecuted).toBe(3);
|
|
588
|
-
});
|
|
589
|
-
});
|
|
590
|
-
});
|
|
591
|
-
// Helper functions to create test steps
|
|
592
|
-
/**
 * Builds a single trajectory-step fixture around one action.
 *
 * The step always uses step number 0, the current time as its timestamp, a
 * fixed environment state (balance 1000, PnL 0, no open positions), empty
 * provider/LLM call lists and a reward of 0.
 *
 * @param {object} options - Action description.
 * @param {string} options.actionType - Copied to `action.actionType`.
 * @param {object} [options.parameters] - Falls back to `{}` when falsy.
 * @param {*} [options.result] - Copied to `action.result` as-is.
 * @param {boolean} [options.success] - Falls back to `true` only when null/undefined.
 * @param {*} [options.correctness] - Copied to `action.correctness` as-is.
 * @returns {object} The step fixture.
 */
function createStep(options) {
    const timestamp = Date.now();
    const { actionType, parameters, result, success, correctness } = options;
    // Key order matters for callers that serialise steps with JSON.stringify.
    const action = {
        actionType,
        parameters: parameters || {},
        result,
        success: success ?? true,
        correctness,
    };
    const environmentState = { agentBalance: 1000, agentPnL: 0, openPositions: 0 };
    return {
        stepNumber: 0,
        timestamp,
        environmentState,
        providerAccesses: [],
        llmCalls: [],
        action,
        reward: 0,
    };
}
|
|
609
|
-
/**
 * Builds an "idle" trajectory-step fixture with a customised environment state.
 *
 * The supplied fields are layered over the defaults (balance 1000, PnL 0,
 * no open positions); caller-provided keys win on conflict.
 *
 * @param {object} envState - Extra or overriding environment-state fields.
 * @returns {object} The step fixture.
 */
function createStepWithEnvState(envState) {
    const timestamp = Date.now();
    const baseline = { agentBalance: 1000, agentPnL: 0, openPositions: 0 };
    // Merge into the fresh baseline object so default keys keep their position.
    const environmentState = Object.assign(baseline, envState);
    const idleAction = { actionType: "idle", parameters: {}, success: true };
    return {
        stepNumber: 0,
        timestamp,
        environmentState,
        providerAccesses: [],
        llmCalls: [],
        action: idleAction,
        reward: 0,
    };
}
|
package/dist/metrics/index.js
DELETED
package/dist/metrics/types.js
DELETED
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Trajectory Metrics Types
|
|
3
|
-
*
|
|
4
|
-
* Comprehensive behavioral metrics extracted from agent trajectories
|
|
5
|
-
* for use in multi-criteria evaluation with LLM-as-judge.
|
|
6
|
-
*/
|
|
7
|
-
/**
 * Reduces a full metrics object to its headline figures.
 *
 * Picks three trading values, one social value, three behavior values and
 * one influence value and returns them as a flat object.
 *
 * @param {object} metrics - Full metrics with `trading`, `social`,
 *   `behavior` and `influence` groups.
 * @returns {object} Flat summary of the selected fields.
 */
export function getMetricsSummary(metrics) {
    const { trading, social, behavior, influence } = metrics;
    const { totalPnL, winRate, tradesExecuted } = trading;
    const { uniqueUsersInteracted } = social;
    const { socialToTradeRatio, actionSuccessRate } = behavior;
    const { reputationDelta } = influence;
    const { episodeLength } = behavior;
    return {
        totalPnL,
        winRate,
        tradesExecuted,
        uniqueUsersInteracted,
        socialToTradeRatio,
        actionSuccessRate,
        reputationDelta,
        episodeLength,
    };
}
|