@elizaos/training 2.0.0-alpha.21 → 2.0.0-alpha.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. package/.turbo/turbo-lint.log +2 -0
  2. package/.turbo/turbo-typecheck.log +1 -0
  3. package/dist/.tsbuildinfo +1 -0
  4. package/dist/adapter.js +59 -0
  5. package/dist/archetypes/ArchetypeConfigService.js +510 -0
  6. package/dist/archetypes/derive-archetype.js +196 -0
  7. package/dist/archetypes/index.js +7 -0
  8. package/dist/benchmark/ArchetypeMatchupBenchmark.js +547 -0
  9. package/dist/benchmark/BenchmarkChartGenerator.js +632 -0
  10. package/dist/benchmark/BenchmarkDataGenerator.js +825 -0
  11. package/dist/benchmark/BenchmarkDataViewer.js +197 -0
  12. package/dist/benchmark/BenchmarkHistoryService.js +135 -0
  13. package/dist/benchmark/BenchmarkRunner.js +483 -0
  14. package/dist/benchmark/BenchmarkValidator.js +158 -0
  15. package/dist/benchmark/FastEvalRunner.js +133 -0
  16. package/dist/benchmark/MetricsValidator.js +104 -0
  17. package/dist/benchmark/MetricsVisualizer.js +775 -0
  18. package/dist/benchmark/ModelBenchmarkService.js +433 -0
  19. package/dist/benchmark/ModelRegistry.js +122 -0
  20. package/dist/benchmark/RulerBenchmarkIntegration.js +168 -0
  21. package/dist/benchmark/SimulationA2AInterface.js +683 -0
  22. package/dist/benchmark/SimulationEngine.js +522 -0
  23. package/dist/benchmark/TaskRunner.js +60 -0
  24. package/dist/benchmark/__tests__/BenchmarkRunner.test.js +409 -0
  25. package/dist/benchmark/__tests__/HeadToHead.test.js +105 -0
  26. package/dist/benchmark/index.js +23 -0
  27. package/dist/benchmark/parseSimulationMetrics.js +86 -0
  28. package/dist/benchmark/simulation-types.js +1 -0
  29. package/dist/dependencies.js +197 -0
  30. package/dist/generation/TrajectoryGenerator.js +244 -0
  31. package/dist/generation/index.js +6 -0
  32. package/dist/huggingface/HuggingFaceDatasetUploader.js +463 -0
  33. package/dist/huggingface/HuggingFaceIntegrationService.js +272 -0
  34. package/dist/huggingface/HuggingFaceModelUploader.js +385 -0
  35. package/dist/huggingface/index.js +9 -0
  36. package/dist/huggingface/shared/HuggingFaceUploadUtil.js +144 -0
  37. package/dist/index.js +41 -0
  38. package/dist/init-training.js +43 -0
  39. package/dist/metrics/TrajectoryMetricsExtractor.js +523 -0
  40. package/dist/metrics/__tests__/TrajectoryMetricsExtractor.test.js +628 -0
  41. package/dist/metrics/index.js +7 -0
  42. package/dist/metrics/types.js +21 -0
  43. package/dist/rubrics/__tests__/index.test.js +150 -0
  44. package/dist/rubrics/ass-kisser.js +83 -0
  45. package/dist/rubrics/degen.js +78 -0
  46. package/dist/rubrics/goody-twoshoes.js +82 -0
  47. package/dist/rubrics/index.js +184 -0
  48. package/dist/rubrics/information-trader.js +82 -0
  49. package/dist/rubrics/infosec.js +99 -0
  50. package/dist/rubrics/liar.js +102 -0
  51. package/dist/rubrics/perps-trader.js +85 -0
  52. package/dist/rubrics/researcher.js +79 -0
  53. package/dist/rubrics/scammer.js +80 -0
  54. package/dist/rubrics/social-butterfly.js +71 -0
  55. package/dist/rubrics/super-predictor.js +95 -0
  56. package/dist/rubrics/trader.js +65 -0
  57. package/dist/scoring/ArchetypeScoringService.js +301 -0
  58. package/dist/scoring/JudgePromptBuilder.js +401 -0
  59. package/dist/scoring/LLMJudgeCache.js +263 -0
  60. package/dist/scoring/index.js +8 -0
  61. package/dist/training/AutomationPipeline.js +714 -0
  62. package/dist/training/BenchmarkService.js +370 -0
  63. package/dist/training/ConfigValidator.js +153 -0
  64. package/dist/training/MarketOutcomesTracker.js +142 -0
  65. package/dist/training/ModelDeployer.js +128 -0
  66. package/dist/training/ModelFetcher.js +48 -0
  67. package/dist/training/ModelSelectionService.js +248 -0
  68. package/dist/training/ModelUsageVerifier.js +106 -0
  69. package/dist/training/MultiModelOrchestrator.js +349 -0
  70. package/dist/training/RLModelConfig.js +295 -0
  71. package/dist/training/RewardBackpropagationService.js +117 -0
  72. package/dist/training/RulerScoringService.js +450 -0
  73. package/dist/training/TrainingMonitor.js +108 -0
  74. package/dist/training/TrajectoryRecorder.js +281 -0
  75. package/dist/training/__tests__/TrajectoryRecorder.test.js +363 -0
  76. package/dist/training/index.js +30 -0
  77. package/dist/training/logRLConfig.js +29 -0
  78. package/dist/training/pipeline.js +80 -0
  79. package/dist/training/storage/ModelStorageService.js +190 -0
  80. package/dist/training/storage/TrainingDataArchiver.js +136 -0
  81. package/dist/training/storage/index.js +7 -0
  82. package/dist/training/types.js +6 -0
  83. package/dist/training/window-utils.js +100 -0
  84. package/dist/utils/index.js +73 -0
  85. package/dist/utils/logger.js +55 -0
  86. package/dist/utils/snowflake.js +15 -0
  87. package/dist/utils/synthetic-detector.js +67 -0
  88. package/package.json +2 -2
  89. package/research-output/training-runs/training-run-1773742857616.json +38 -0
  90. package/research-output/training-runs/training-run-1773742946977.json +38 -0
  91. package/research-output/training-runs/training-run-1773743278891.json +38 -0
  92. package/research-output/training-runs/training-run-1773743409754.json +38 -0
  93. package/research-output/training-runs/training-run-1773743651086.json +38 -0
  94. package/research-output/training-runs/training-run-1773743782883.json +38 -0
@@ -0,0 +1,628 @@
1
+ /**
2
+ * Trajectory Metrics Extractor Tests
3
+ *
4
+ * Validates that all metrics are properly extracted and never null/undefined/NaN.
5
+ */
6
+ import { beforeEach, describe, expect, it } from "bun:test";
7
+ import { TrajectoryMetricsExtractor, trajectoryMetricsExtractor, } from "../TrajectoryMetricsExtractor";
8
+ describe("TrajectoryMetricsExtractor", () => {
9
+ let extractor;
10
+ beforeEach(() => {
11
+ extractor = new TrajectoryMetricsExtractor();
12
+ });
13
/**
 * Helper to check all metrics are valid numbers (not null, undefined, NaN, Infinity).
 *
 * Verifies the root identifiers, then walks every numeric field of the
 * `social`, `trading`, `influence`, `behavior` and `information` sections.
 * Each numeric field must have `typeof === "number"` and be finite; fields
 * tagged NON_NEGATIVE must also be >= 0, and fields tagged UNIT_INTERVAL must
 * lie within [0, 1]. Fields tagged FINITE may be negative (PnL, deltas, ...).
 *
 * A failing numeric check is rethrown with the `section.field` path in the
 * message (the matcher's own error is kept as `cause`), so a failure points at
 * the offending metric instead of at an anonymous matcher call.
 *
 * @param {object} metrics - Metrics object as returned by the extractor.
 * @throws {Error} When any check fails.
 */
function assertValidMetrics(metrics) {
    const FINITE = "finite";
    const NON_NEGATIVE = "nonNegative";
    const UNIT_INTERVAL = "unitInterval";

    // Applies the number/finite/range checks to each listed field of one section.
    const checkNumericFields = (section, fields) => {
        const values = metrics[section];
        for (const [field, constraint] of Object.entries(fields)) {
            try {
                const value = values[field];
                expect(typeof value).toBe("number");
                expect(Number.isFinite(value)).toBe(true);
                if (constraint !== FINITE) {
                    expect(value).toBeGreaterThanOrEqual(0);
                }
                if (constraint === UNIT_INTERVAL) {
                    expect(value).toBeLessThanOrEqual(1);
                }
            }
            catch (err) {
                throw new Error(`Invalid metric ${section}.${field}: ${err?.message ?? err}`, { cause: err });
            }
        }
    };

    // Check root level
    expect(metrics.trajectoryId).toBeDefined();
    expect(metrics.agentId).toBeDefined();
    expect(metrics.extractedAt).toBeInstanceOf(Date);

    // Check social metrics - all should be finite numbers
    checkNumericFields("social", {
        groupChatsJoined: NON_NEGATIVE,
        groupChatsCreated: NON_NEGATIVE,
        groupMessagesSent: NON_NEGATIVE,
        dmsInitiated: NON_NEGATIVE,
        dmsReceived: NON_NEGATIVE,
        dmResponseRate: UNIT_INTERVAL,
        uniqueUsersInteracted: NON_NEGATIVE,
        postsCreated: NON_NEGATIVE,
        commentsMade: NON_NEGATIVE,
        mentionsGiven: NON_NEGATIVE,
        mentionsReceived: NON_NEGATIVE,
        invitationsSent: NON_NEGATIVE,
    });

    // Check trading metrics
    checkNumericFields("trading", {
        tradesExecuted: NON_NEGATIVE,
        profitableTrades: NON_NEGATIVE,
        winRate: UNIT_INTERVAL,
        totalPnL: FINITE,
        maxDrawdown: NON_NEGATIVE,
        sharpeRatio: FINITE,
        avgPositionSize: NON_NEGATIVE,
        avgHoldingPeriod: NON_NEGATIVE,
        marketsTraded: NON_NEGATIVE,
        buyTrades: NON_NEGATIVE,
        sellTrades: NON_NEGATIVE,
        largestWin: FINITE,
        largestLoss: FINITE,
    });

    // Check influence metrics
    checkNumericFields("influence", {
        followersGained: FINITE,
        reputationDelta: FINITE,
        trustLevelDelta: FINITE,
        influenceScore: FINITE,
        informationSpread: NON_NEGATIVE,
        positiveReactions: NON_NEGATIVE,
        negativeReactions: NON_NEGATIVE,
    });

    // Check behavior metrics
    checkNumericFields("behavior", {
        actionsPerTick: NON_NEGATIVE,
        socialToTradeRatio: NON_NEGATIVE,
        avgResponseTime: NON_NEGATIVE,
        consistencyScore: UNIT_INTERVAL,
        totalActions: NON_NEGATIVE,
        failedActions: NON_NEGATIVE,
        actionSuccessRate: UNIT_INTERVAL,
        episodeLength: NON_NEGATIVE,
    });
    // The two non-numeric behavior fields only get a shape check.
    expect(Array.isArray(metrics.behavior.actionTypesUsed)).toBe(true);
    expect(typeof metrics.behavior.dominantActionType).toBe("string");

    // Check information metrics
    checkNumericFields("information", {
        researchActions: NON_NEGATIVE,
        newsConsumed: NON_NEGATIVE,
        marketDataQueries: NON_NEGATIVE,
        infoRequestsSent: NON_NEGATIVE,
        infoShared: NON_NEGATIVE,
        predictionsMade: NON_NEGATIVE,
        correctPredictions: NON_NEGATIVE,
        predictionAccuracy: UNIT_INTERVAL,
    });
}
175
// Covers extract(): counting, ratios, deltas and derived scores computed from in-memory steps.
describe("extract()", () => {
    // Builds `count` independent successful steps that share one action type.
    const repeatStep = (actionType, count) => Array.from({ length: count }, () => createStep({ actionType, success: true }));

    it("should return valid metrics for empty steps array", () => {
        const metrics = extractor.extract({
            trajectoryId: "test-traj-1",
            agentId: "test-agent-1",
            steps: [],
        });
        assertValidMetrics(metrics);
        const { episodeLength, totalActions } = metrics.behavior;
        expect(episodeLength).toBe(0);
        expect(totalActions).toBe(0);
    });

    // NOTE(review): despite the title, the step below does carry an action
    // (actionType "idle"); confirm whether a step with no `action` at all was intended.
    it("should return valid metrics for minimal step with no action", () => {
        const idleStep = {
            stepNumber: 0,
            timestamp: Date.now(),
            environmentState: {
                agentBalance: 1000,
                agentPnL: 0,
                openPositions: 0,
            },
            providerAccesses: [],
            llmCalls: [],
            action: {
                actionType: "idle",
                parameters: {},
                success: true,
            },
            reward: 0,
        };
        const metrics = extractor.extract({
            trajectoryId: "test-traj-2",
            agentId: "test-agent-2",
            steps: [idleStep],
        });
        assertValidMetrics(metrics);
        expect(metrics.behavior.episodeLength).toBe(1);
    });

    it("should correctly count trading actions", () => {
        const metrics = extractor.extract({
            trajectoryId: "test-traj-3",
            agentId: "test-agent-3",
            steps: [
                createStep({
                    actionType: "buy",
                    parameters: { marketId: "BTC", amount: 100, side: "buy" },
                    result: { pnl: 10 },
                    success: true,
                }),
                createStep({
                    actionType: "sell",
                    parameters: { marketId: "ETH", amount: 50, side: "sell" },
                    result: { pnl: -5 },
                    success: true,
                }),
                createStep({
                    actionType: "trade",
                    parameters: { marketId: "BTC", amount: 200 },
                    result: { pnl: 20 },
                    success: true,
                }),
            ],
        });
        assertValidMetrics(metrics);
        const { trading } = metrics;
        expect(trading.tradesExecuted).toBe(3);
        expect(trading.buyTrades).toBe(1);
        expect(trading.sellTrades).toBe(1);
        expect(trading.marketsTraded).toBe(2);
        expect(trading.profitableTrades).toBe(2);
        expect(trading.totalPnL).toBe(25);
        expect(trading.winRate).toBeCloseTo(2 / 3, 2);
        expect(trading.largestWin).toBe(20);
        expect(trading.largestLoss).toBe(-5);
    });

    it("should correctly count social actions", () => {
        const metrics = extractor.extract({
            trajectoryId: "test-traj-4",
            agentId: "test-agent-4",
            steps: [
                createStep({
                    actionType: "join_group_chat",
                    parameters: { groupId: "group-1" },
                    success: true,
                }),
                createStep({
                    actionType: "post_group_message",
                    parameters: { groupId: "group-1", message: "Hello" },
                    success: true,
                }),
                createStep({
                    actionType: "send_dm",
                    parameters: { toUserId: "user-2", initiator: "test-agent-4" },
                    success: true,
                }),
                createStep({
                    actionType: "create_post",
                    parameters: {},
                    success: true,
                }),
                createStep({
                    actionType: "comment",
                    parameters: { authorId: "user-3" },
                    success: true,
                }),
            ],
        });
        assertValidMetrics(metrics);
        const { social } = metrics;
        expect(social.groupChatsJoined).toBe(1);
        expect(social.groupMessagesSent).toBe(1);
        expect(social.dmsInitiated).toBe(1);
        expect(social.postsCreated).toBe(1);
        expect(social.commentsMade).toBe(1);
        expect(social.uniqueUsersInteracted).toBeGreaterThanOrEqual(2);
    });

    it("should handle failed actions correctly", () => {
        // Two successes and two failures, interleaved.
        const steps = [true, false, false, true].map((success) => createStep({ actionType: "trade", success }));
        const metrics = extractor.extract({
            trajectoryId: "test-traj-5",
            agentId: "test-agent-5",
            steps,
        });
        assertValidMetrics(metrics);
        const { totalActions, failedActions, actionSuccessRate } = metrics.behavior;
        expect(totalActions).toBe(4);
        expect(failedActions).toBe(2);
        expect(actionSuccessRate).toBe(0.5);
    });

    it("should calculate socialToTradeRatio correctly", () => {
        const metrics = extractor.extract({
            trajectoryId: "test-traj-6",
            agentId: "test-agent-6",
            steps: [...repeatStep("send_dm", 3), ...repeatStep("trade", 1)],
        });
        assertValidMetrics(metrics);
        expect(metrics.behavior.socialToTradeRatio).toBe(3);
    });

    it("should handle social-only trajectories without weird ratios", () => {
        const steps = ["send_dm", "create_post", "comment"].map((actionType) => createStep({ actionType, success: true }));
        const metrics = extractor.extract({
            trajectoryId: "test-traj-7",
            agentId: "test-agent-7",
            steps,
        });
        assertValidMetrics(metrics);
        // When no trades, socialToTradeRatio equals the social action count
        const ratio = metrics.behavior.socialToTradeRatio;
        expect(ratio).toBe(3);
        expect(Number.isFinite(ratio)).toBe(true);
    });

    it("should track reputation changes", () => {
        // [reputation, trustLevel] observed at each successive step.
        const observations = [
            [100, 50],
            [110, 55],
            [105, 60],
        ];
        const metrics = extractor.extract({
            trajectoryId: "test-traj-8",
            agentId: "test-agent-8",
            steps: observations.map(([reputation, trustLevel]) => createStepWithEnvState({ reputation, trustLevel })),
        });
        assertValidMetrics(metrics);
        expect(metrics.influence.reputationDelta).toBe(5); // 105 - 100
        expect(metrics.influence.trustLevelDelta).toBe(10); // 60 - 50
    });

    it("should calculate consistency score correctly", () => {
        // Perfectly consistent (all same action)
        const consistentMetrics = extractor.extract({
            trajectoryId: "test-traj-9",
            agentId: "test-agent-9",
            steps: repeatStep("trade", 3),
        });
        assertValidMetrics(consistentMetrics);
        expect(consistentMetrics.behavior.consistencyScore).toBe(1);

        // Less consistent (varied actions)
        const variedMetrics = extractor.extract({
            trajectoryId: "test-traj-10",
            agentId: "test-agent-10",
            steps: [
                ...repeatStep("trade", 3),
                ...repeatStep("send_dm", 1),
                ...repeatStep("create_post", 1),
            ],
        });
        assertValidMetrics(variedMetrics);
        const variedScore = variedMetrics.behavior.consistencyScore;
        expect(variedScore).toBeLessThan(1);
        expect(variedScore).toBeGreaterThan(0);
    });

    it("should correctly identify dominant action type", () => {
        const metrics = extractor.extract({
            trajectoryId: "test-traj-11",
            agentId: "test-agent-11",
            steps: [...repeatStep("trade", 3), ...repeatStep("send_dm", 2)],
        });
        assertValidMetrics(metrics);
        expect(metrics.behavior.dominantActionType).toBe("trade");
    });

    it("should handle prediction correctness tracking", () => {
        // Two correct predictions out of three.
        const steps = [true, false, true].map((predictionCorrect) => createStep({
            actionType: "predict",
            success: true,
            correctness: { predictionCorrect },
        }));
        const metrics = extractor.extract({
            trajectoryId: "test-traj-12",
            agentId: "test-agent-12",
            steps,
        });
        assertValidMetrics(metrics);
        const { information } = metrics;
        expect(information.predictionsMade).toBe(3);
        expect(information.correctPredictions).toBe(2);
        expect(information.predictionAccuracy).toBeCloseTo(2 / 3, 2);
    });
});
431
// Covers extractFromRaw(): JSON-string input, which yields null when the payload is unusable.
describe("extractFromRaw()", () => {
    // Payloads for which extractFromRaw is expected to return null.
    const rejectedPayloads = [
        {
            title: "should return null for invalid JSON",
            input: { trajectoryId: "test-traj-13", agentId: "test-agent-13", stepsJson: "not valid json" },
        },
        {
            title: "should return null for empty array JSON",
            input: { trajectoryId: "test-traj-14", agentId: "test-agent-14", stepsJson: "[]" },
        },
        {
            title: "should return null for null JSON",
            input: { trajectoryId: "test-traj-15", agentId: "test-agent-15", stepsJson: "null" },
        },
    ];
    for (const { title, input } of rejectedPayloads) {
        it(title, () => {
            expect(extractor.extractFromRaw(input)).toBeNull();
        });
    }

    it("should correctly parse valid JSON steps", () => {
        const steps = ["trade", "send_dm"].map((actionType) => createStep({ actionType, success: true }));
        const result = extractor.extractFromRaw({
            trajectoryId: "test-traj-16",
            agentId: "test-agent-16",
            stepsJson: JSON.stringify(steps),
        });
        expect(result).not.toBeNull();
        // No `if (result)` guard: `not.toBeNull()` also passes for `undefined`,
        // and a guard would then let the test succeed without checking anything.
        assertValidMetrics(result);
        expect(result.behavior.totalActions).toBe(2);
    });

    // NOTE(review): this only checks that the result is well-formed; it never
    // asserts that `finalPnL` actually reached the output. Confirm which field
    // should reflect 150.5 and assert on it.
    it("should use finalPnL when provided", () => {
        // Serialises to the same JSON as a hand-written idle step: the
        // undefined `result`/`correctness` keys are dropped by JSON.stringify.
        const steps = [createStep({ actionType: "idle" })];
        const result = extractor.extractFromRaw({
            trajectoryId: "test-traj-17",
            agentId: "test-agent-17",
            stepsJson: JSON.stringify(steps),
            finalPnL: 150.5,
        });
        expect(result).not.toBeNull();
        assertValidMetrics(result);
    });
});
500
// The module exports a ready-made extractor alongside the class itself.
describe("singleton instance", () => {
    it("should export a singleton instance", () => {
        expect(trajectoryMetricsExtractor).toBeInstanceOf(TrajectoryMetricsExtractor);
    });
});
505
// Inputs at the boundaries: sparse parameters, huge magnitudes, losses only, inconsistent casing.
describe("edge cases", () => {
    // NOTE(review): the title says "undefined parameters" but the step passes an
    // empty object (`parameters: {}`); confirm whether `parameters: undefined`
    // was the intended input.
    it("should handle undefined parameters gracefully", () => {
        const bareTradeStep = {
            stepNumber: 0,
            timestamp: Date.now(),
            environmentState: {
                agentBalance: 1000,
                agentPnL: 0,
                openPositions: 0,
            },
            providerAccesses: [],
            llmCalls: [],
            action: {
                actionType: "trade",
                parameters: {},
                success: true,
            },
            reward: 0,
        };
        const metrics = extractor.extract({
            trajectoryId: "test-edge-1",
            agentId: "test-agent-edge-1",
            steps: [bareTradeStep],
        });
        assertValidMetrics(metrics);
    });

    it("should handle very large numbers without overflow", () => {
        const hugeTrade = createStep({
            actionType: "trade",
            parameters: { amount: 1e15 },
            result: { pnl: 1e12 },
            success: true,
        });
        const metrics = extractor.extract({
            trajectoryId: "test-edge-2",
            agentId: "test-agent-edge-2",
            steps: [hugeTrade],
        });
        assertValidMetrics(metrics);
        const { totalPnL, avgPositionSize } = metrics.trading;
        expect(Number.isFinite(totalPnL)).toBe(true);
        expect(Number.isFinite(avgPositionSize)).toBe(true);
    });

    it("should handle negative PnL correctly", () => {
        const losingTrades = [-100, -50].map((pnl) => createStep({
            actionType: "trade",
            result: { pnl },
            success: true,
        }));
        const metrics = extractor.extract({
            trajectoryId: "test-edge-3",
            agentId: "test-agent-edge-3",
            steps: losingTrades,
        });
        assertValidMetrics(metrics);
        const { trading } = metrics;
        expect(trading.totalPnL).toBe(-150);
        expect(trading.profitableTrades).toBe(0);
        expect(trading.winRate).toBe(0);
        expect(trading.largestLoss).toBe(-100);
    });

    it("should handle mixed case action types", () => {
        const steps = ["TRADE", "Trade", "trade"].map((actionType) => createStep({ actionType, success: true }));
        const metrics = extractor.extract({
            trajectoryId: "test-edge-4",
            agentId: "test-agent-edge-4",
            steps,
        });
        assertValidMetrics(metrics);
        expect(metrics.trading.tradesExecuted).toBe(3);
    });
});
590
+ });
591
+ // Helper functions to create test steps
592
/**
 * Builds a single trajectory step fixture around the given action.
 *
 * Everything outside `action` is fixed: step number 0, the current time as
 * timestamp, a balance of 1000 with zero PnL and no open positions, empty
 * provider/LLM call lists, and a reward of 0.
 *
 * @param {object} options - Action description.
 * @param {string} options.actionType - Copied to `action.actionType` unchanged.
 * @param {object} [options.parameters] - Action parameters; any falsy value becomes `{}`.
 * @param {*} [options.result] - Copied to `action.result` (left `undefined` when omitted).
 * @param {boolean} [options.success] - Defaults to `true` only when null/undefined, so an explicit `false` is kept.
 * @param {*} [options.correctness] - Copied to `action.correctness` (left `undefined` when omitted).
 * @returns {object} A freshly built step object.
 * @throws {TypeError} If `options` is null or undefined.
 */
function createStep(options) {
    const { actionType, parameters, result, success, correctness } = options;
    const action = {
        actionType,
        parameters: parameters || {},
        result,
        success: success ?? true,
        correctness,
    };
    return {
        stepNumber: 0,
        timestamp: Date.now(),
        environmentState: { agentBalance: 1000, agentPnL: 0, openPositions: 0 },
        providerAccesses: [],
        llmCalls: [],
        action,
        reward: 0,
    };
}
609
/**
 * Builds an idle trajectory step fixture with a customised environment state.
 *
 * The environment starts from a balance of 1000, zero PnL and no open
 * positions; the properties of `envState` are spread on top, so they may
 * override those defaults or add new keys. The action is always a successful
 * "idle" with empty parameters.
 *
 * @param {object} [envState] - Environment-state overrides/additions; a nullish value contributes nothing.
 * @returns {object} A freshly built step object.
 */
function createStepWithEnvState(envState) {
    const environmentState = {
        agentBalance: 1000,
        agentPnL: 0,
        openPositions: 0,
        ...envState,
    };
    const idleAction = {
        actionType: "idle",
        parameters: {},
        success: true,
    };
    return {
        stepNumber: 0,
        timestamp: Date.now(),
        environmentState,
        providerAccesses: [],
        llmCalls: [],
        action: idleAction,
        reward: 0,
    };
}
@@ -0,0 +1,7 @@
1
+ /**
2
+ * Trajectory Metrics Module
3
+ *
4
+ * Exports for behavioral metrics extraction and types.
5
+ */
6
+ export * from "./TrajectoryMetricsExtractor";
7
+ export * from "./types";
@@ -0,0 +1,21 @@
1
+ /**
2
+ * Trajectory Metrics Types
3
+ *
4
+ * Comprehensive behavioral metrics extracted from agent trajectories
5
+ * for use in multi-criteria evaluation with LLM-as-judge.
6
+ */
7
/**
 * Extract summary from full metrics.
 *
 * Copies eight headline values out of the `trading`, `social`, `behavior`
 * and `influence` sections into one flat object. Values are passed through
 * untouched; nothing is recomputed or validated here.
 *
 * @param {object} metrics - Full metrics object containing `trading`, `social`, `behavior` and `influence` sections.
 * @returns {{totalPnL: *, winRate: *, tradesExecuted: *, uniqueUsersInteracted: *, socialToTradeRatio: *, actionSuccessRate: *, reputationDelta: *, episodeLength: *}}
 *   Flat summary; a field missing from its section comes back as `undefined`.
 * @throws {TypeError} If `metrics`, or any of the four sections read here, is null or undefined.
 */
export function getMetricsSummary(metrics) {
    const { trading, social, behavior, influence } = metrics;
    return {
        totalPnL: trading.totalPnL,
        winRate: trading.winRate,
        tradesExecuted: trading.tradesExecuted,
        uniqueUsersInteracted: social.uniqueUsersInteracted,
        socialToTradeRatio: behavior.socialToTradeRatio,
        actionSuccessRate: behavior.actionSuccessRate,
        reputationDelta: influence.reputationDelta,
        episodeLength: behavior.episodeLength,
    };
}