@elizaos/training 2.0.0-alpha.13 → 2.0.0-alpha.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/research-output/training-runs/training-run-1773726941205.json +38 -0
- package/scripts/rank_trajectories.ts +0 -1
- package/scripts/run_task_benchmark.ts +4 -11
- package/src/adapter.ts +96 -49
- package/src/archetypes/ArchetypeConfigService.ts +188 -185
- package/src/archetypes/derive-archetype.ts +47 -47
- package/src/archetypes/index.ts +2 -2
- package/src/benchmark/ArchetypeMatchupBenchmark.ts +70 -70
- package/src/benchmark/BenchmarkChartGenerator.ts +70 -69
- package/src/benchmark/BenchmarkDataGenerator.ts +136 -136
- package/src/benchmark/BenchmarkDataViewer.ts +32 -30
- package/src/benchmark/BenchmarkHistoryService.ts +13 -12
- package/src/benchmark/BenchmarkRunner.ts +87 -83
- package/src/benchmark/BenchmarkValidator.ts +48 -46
- package/src/benchmark/FastEvalRunner.ts +17 -16
- package/src/benchmark/MetricsValidator.ts +20 -21
- package/src/benchmark/MetricsVisualizer.ts +92 -85
- package/src/benchmark/ModelBenchmarkService.ts +90 -82
- package/src/benchmark/ModelRegistry.ts +44 -44
- package/src/benchmark/RulerBenchmarkIntegration.ts +24 -24
- package/src/benchmark/SimulationA2AInterface.ts +118 -118
- package/src/benchmark/SimulationEngine.ts +51 -51
- package/src/benchmark/TaskRunner.ts +87 -79
- package/src/benchmark/__tests__/BenchmarkRunner.test.ts +80 -80
- package/src/benchmark/__tests__/HeadToHead.test.ts +26 -26
- package/src/benchmark/index.ts +27 -27
- package/src/benchmark/parseSimulationMetrics.ts +32 -32
- package/src/benchmark/simulation-types.ts +10 -10
- package/src/dependencies.ts +34 -34
- package/src/generation/TrajectoryGenerator.ts +39 -37
- package/src/generation/index.ts +1 -1
- package/src/huggingface/HuggingFaceDatasetUploader.ts +72 -72
- package/src/huggingface/HuggingFaceIntegrationService.ts +59 -53
- package/src/huggingface/HuggingFaceModelUploader.ts +60 -59
- package/src/huggingface/index.ts +6 -6
- package/src/huggingface/shared/HuggingFaceUploadUtil.ts +32 -32
- package/src/index.ts +27 -27
- package/src/init-training.ts +6 -6
- package/src/metrics/TrajectoryMetricsExtractor.ts +70 -71
- package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +182 -182
- package/src/metrics/index.ts +2 -2
- package/src/rubrics/__tests__/index.test.ts +73 -73
- package/src/rubrics/ass-kisser.ts +6 -6
- package/src/rubrics/degen.ts +6 -6
- package/src/rubrics/goody-twoshoes.ts +6 -6
- package/src/rubrics/index.ts +50 -50
- package/src/rubrics/information-trader.ts +6 -6
- package/src/rubrics/infosec.ts +6 -6
- package/src/rubrics/liar.ts +6 -6
- package/src/rubrics/perps-trader.ts +6 -6
- package/src/rubrics/researcher.ts +6 -6
- package/src/rubrics/scammer.ts +6 -6
- package/src/rubrics/social-butterfly.ts +7 -7
- package/src/rubrics/super-predictor.ts +6 -6
- package/src/rubrics/trader.ts +5 -5
- package/src/scoring/ArchetypeScoringService.ts +56 -54
- package/src/scoring/JudgePromptBuilder.ts +96 -96
- package/src/scoring/LLMJudgeCache.ts +26 -23
- package/src/scoring/index.ts +3 -3
- package/src/training/AutomationPipeline.ts +149 -140
- package/src/training/BenchmarkService.ts +49 -45
- package/src/training/ConfigValidator.ts +38 -32
- package/src/training/MarketOutcomesTracker.ts +22 -12
- package/src/training/ModelDeployer.ts +15 -15
- package/src/training/ModelFetcher.ts +7 -7
- package/src/training/ModelSelectionService.ts +32 -32
- package/src/training/ModelUsageVerifier.ts +31 -24
- package/src/training/MultiModelOrchestrator.ts +44 -44
- package/src/training/RLModelConfig.ts +57 -57
- package/src/training/RewardBackpropagationService.ts +18 -17
- package/src/training/RulerScoringService.ts +73 -72
- package/src/training/TrainingMonitor.ts +29 -29
- package/src/training/TrajectoryRecorder.ts +25 -27
- package/src/training/__tests__/TrajectoryRecorder.test.ts +105 -105
- package/src/training/index.ts +36 -36
- package/src/training/logRLConfig.ts +7 -7
- package/src/training/pipeline.ts +13 -16
- package/src/training/storage/ModelStorageService.ts +32 -32
- package/src/training/storage/TrainingDataArchiver.ts +21 -21
- package/src/training/storage/index.ts +2 -2
- package/src/training/types.ts +6 -6
- package/src/training/window-utils.ts +14 -14
- package/src/utils/index.ts +7 -7
- package/src/utils/logger.ts +5 -5
- package/src/utils/snowflake.ts +1 -1
- package/src/utils/synthetic-detector.ts +7 -7
|
@@ -7,10 +7,10 @@
|
|
|
7
7
|
* @packageDocumentation
|
|
8
8
|
*/
|
|
9
9
|
|
|
10
|
-
import type { BehavioralMetrics } from
|
|
11
|
-
import { getMetricsSummary } from
|
|
12
|
-
import { getPriorityMetrics, getRubric } from
|
|
13
|
-
import type { TrajectoryStep } from
|
|
10
|
+
import type { BehavioralMetrics } from "../metrics/types";
|
|
11
|
+
import { getMetricsSummary } from "../metrics/types";
|
|
12
|
+
import { getPriorityMetrics, getRubric } from "../rubrics";
|
|
13
|
+
import type { TrajectoryStep } from "../training/types";
|
|
14
14
|
|
|
15
15
|
/**
|
|
16
16
|
* Context for trajectory evaluation.
|
|
@@ -56,10 +56,10 @@ export class JudgePromptBuilder {
|
|
|
56
56
|
*/
|
|
57
57
|
buildSinglePrompt(
|
|
58
58
|
trajectory: TrajectoryContext,
|
|
59
|
-
options: JudgePromptOptions = {}
|
|
59
|
+
options: JudgePromptOptions = {},
|
|
60
60
|
): { system: string; user: string } {
|
|
61
61
|
const opts = { ...DEFAULT_OPTIONS, ...options };
|
|
62
|
-
const archetype = trajectory.archetype ||
|
|
62
|
+
const archetype = trajectory.archetype || "default";
|
|
63
63
|
const rubric = getRubric(archetype);
|
|
64
64
|
const priorityMetrics = getPriorityMetrics(archetype);
|
|
65
65
|
|
|
@@ -75,12 +75,12 @@ export class JudgePromptBuilder {
|
|
|
75
75
|
buildComparisonPrompt(
|
|
76
76
|
trajectories: TrajectoryContext[],
|
|
77
77
|
scenarioId: string,
|
|
78
|
-
options: JudgePromptOptions = {}
|
|
78
|
+
options: JudgePromptOptions = {},
|
|
79
79
|
): { system: string; user: string } {
|
|
80
80
|
const opts = { ...DEFAULT_OPTIONS, ...options };
|
|
81
81
|
|
|
82
82
|
// Get archetype from first trajectory (assume all same archetype for comparison)
|
|
83
|
-
const archetype = trajectories[0]?.archetype ||
|
|
83
|
+
const archetype = trajectories[0]?.archetype || "default";
|
|
84
84
|
const rubric = getRubric(archetype);
|
|
85
85
|
const priorityMetrics = getPriorityMetrics(archetype);
|
|
86
86
|
|
|
@@ -89,7 +89,7 @@ export class JudgePromptBuilder {
|
|
|
89
89
|
trajectories,
|
|
90
90
|
scenarioId,
|
|
91
91
|
priorityMetrics,
|
|
92
|
-
opts
|
|
92
|
+
opts,
|
|
93
93
|
);
|
|
94
94
|
|
|
95
95
|
return { system, user };
|
|
@@ -115,7 +115,7 @@ IMPORTANT: The metrics provided are CONTEXT to inform your judgment. Use them to
|
|
|
115
115
|
*/
|
|
116
116
|
private buildComparisonSystemPrompt(
|
|
117
117
|
archetype: string,
|
|
118
|
-
rubric: string
|
|
118
|
+
rubric: string,
|
|
119
119
|
): string {
|
|
120
120
|
return `You are an expert evaluator of AI agent performance. All trajectories below were given the same scenario and are from "${archetype}" archetype agents.
|
|
121
121
|
|
|
@@ -139,28 +139,28 @@ The metrics provided are CONTEXT to inform your judgment. Use them to understand
|
|
|
139
139
|
private buildUserPrompt(
|
|
140
140
|
trajectory: TrajectoryContext,
|
|
141
141
|
priorityMetrics: string[],
|
|
142
|
-
options: JudgePromptOptions
|
|
142
|
+
options: JudgePromptOptions,
|
|
143
143
|
): string {
|
|
144
144
|
const parts: string[] = [];
|
|
145
145
|
|
|
146
146
|
// Agent info
|
|
147
147
|
parts.push(`## Agent Information`);
|
|
148
148
|
parts.push(`- Agent ID: ${trajectory.agentId}`);
|
|
149
|
-
parts.push(`- Archetype: ${trajectory.archetype ||
|
|
149
|
+
parts.push(`- Archetype: ${trajectory.archetype || "unknown"}`);
|
|
150
150
|
parts.push(
|
|
151
|
-
`- Episode Length: ${trajectory.episodeLength || trajectory.steps.length} ticks
|
|
151
|
+
`- Episode Length: ${trajectory.episodeLength || trajectory.steps.length} ticks`,
|
|
152
152
|
);
|
|
153
|
-
parts.push(
|
|
153
|
+
parts.push("");
|
|
154
154
|
|
|
155
155
|
// Metrics section
|
|
156
156
|
parts.push(`## Behavioral Metrics`);
|
|
157
157
|
parts.push(this.formatMetrics(trajectory.metrics, priorityMetrics));
|
|
158
|
-
parts.push(
|
|
158
|
+
parts.push("");
|
|
159
159
|
|
|
160
160
|
// Action summary
|
|
161
161
|
parts.push(`## Action Summary`);
|
|
162
162
|
parts.push(this.summarizeActions(trajectory.steps));
|
|
163
|
-
parts.push(
|
|
163
|
+
parts.push("");
|
|
164
164
|
|
|
165
165
|
// Key decisions (if requested)
|
|
166
166
|
if (options.includeKeyDecisions) {
|
|
@@ -168,7 +168,7 @@ The metrics provided are CONTEXT to inform your judgment. Use them to understand
|
|
|
168
168
|
if (keyDecisions) {
|
|
169
169
|
parts.push(`## Key Decisions`);
|
|
170
170
|
parts.push(keyDecisions);
|
|
171
|
-
parts.push(
|
|
171
|
+
parts.push("");
|
|
172
172
|
}
|
|
173
173
|
}
|
|
174
174
|
|
|
@@ -178,18 +178,18 @@ The metrics provided are CONTEXT to inform your judgment. Use them to understand
|
|
|
178
178
|
parts.push(
|
|
179
179
|
this.formatRecentActions(
|
|
180
180
|
trajectory.steps,
|
|
181
|
-
options.maxActionsToShow || 20
|
|
182
|
-
)
|
|
181
|
+
options.maxActionsToShow || 20,
|
|
182
|
+
),
|
|
183
183
|
);
|
|
184
|
-
parts.push(
|
|
184
|
+
parts.push("");
|
|
185
185
|
}
|
|
186
186
|
|
|
187
187
|
// Instructions
|
|
188
188
|
parts.push(`## Instructions`);
|
|
189
189
|
parts.push(
|
|
190
|
-
`Score this trajectory on a scale of 0.0 to 1.0 based on how well it embodies the ${trajectory.archetype ||
|
|
190
|
+
`Score this trajectory on a scale of 0.0 to 1.0 based on how well it embodies the ${trajectory.archetype || "agent"} archetype's values.`,
|
|
191
191
|
);
|
|
192
|
-
parts.push(
|
|
192
|
+
parts.push("");
|
|
193
193
|
parts.push(`Respond with JSON:`);
|
|
194
194
|
parts.push(`{
|
|
195
195
|
"score": <float 0-1>,
|
|
@@ -198,7 +198,7 @@ The metrics provided are CONTEXT to inform your judgment. Use them to understand
|
|
|
198
198
|
"weaknesses": ["<weakness 1>", "<weakness 2>"]
|
|
199
199
|
}`);
|
|
200
200
|
|
|
201
|
-
return parts.join(
|
|
201
|
+
return parts.join("\n");
|
|
202
202
|
}
|
|
203
203
|
|
|
204
204
|
/**
|
|
@@ -208,18 +208,18 @@ The metrics provided are CONTEXT to inform your judgment. Use them to understand
|
|
|
208
208
|
trajectories: TrajectoryContext[],
|
|
209
209
|
scenarioId: string,
|
|
210
210
|
priorityMetrics: string[],
|
|
211
|
-
_options: JudgePromptOptions
|
|
211
|
+
_options: JudgePromptOptions,
|
|
212
212
|
): string {
|
|
213
213
|
const parts: string[] = [];
|
|
214
214
|
|
|
215
215
|
parts.push(`## Scenario: ${scenarioId}`);
|
|
216
216
|
parts.push(`## Number of Trajectories: ${trajectories.length}`);
|
|
217
|
-
parts.push(
|
|
217
|
+
parts.push("");
|
|
218
218
|
|
|
219
219
|
// Performance context for all trajectories
|
|
220
220
|
parts.push(`## Trajectory Performance Context`);
|
|
221
221
|
parts.push(`(Use this to inform your scoring)`);
|
|
222
|
-
parts.push(
|
|
222
|
+
parts.push("");
|
|
223
223
|
|
|
224
224
|
for (let i = 0; i < trajectories.length; i++) {
|
|
225
225
|
const traj = trajectories[i];
|
|
@@ -227,30 +227,30 @@ The metrics provided are CONTEXT to inform your judgment. Use them to understand
|
|
|
227
227
|
|
|
228
228
|
const trajId = `trajectory-${i + 1}`;
|
|
229
229
|
parts.push(`### ${trajId}`);
|
|
230
|
-
parts.push(`- Archetype: ${traj.archetype ||
|
|
230
|
+
parts.push(`- Archetype: ${traj.archetype || "unknown"}`);
|
|
231
231
|
parts.push(
|
|
232
|
-
`- Episode Length: ${traj.episodeLength || traj.steps.length} steps
|
|
232
|
+
`- Episode Length: ${traj.episodeLength || traj.steps.length} steps`,
|
|
233
233
|
);
|
|
234
|
-
parts.push(`- Total Reward: ${traj.totalReward?.toFixed(2) ||
|
|
235
|
-
parts.push(
|
|
234
|
+
parts.push(`- Total Reward: ${traj.totalReward?.toFixed(2) || "0.00"}`);
|
|
235
|
+
parts.push("");
|
|
236
236
|
|
|
237
237
|
// Key metrics for this trajectory
|
|
238
238
|
parts.push(`**Key Metrics:**`);
|
|
239
239
|
parts.push(this.formatMetrics(traj.metrics, priorityMetrics));
|
|
240
|
-
parts.push(
|
|
240
|
+
parts.push("");
|
|
241
241
|
|
|
242
242
|
// Action summary
|
|
243
243
|
parts.push(`**Actions:**`);
|
|
244
244
|
parts.push(this.summarizeActions(traj.steps));
|
|
245
|
-
parts.push(
|
|
245
|
+
parts.push("");
|
|
246
246
|
}
|
|
247
247
|
|
|
248
248
|
// Instructions
|
|
249
249
|
parts.push(`## Instructions`);
|
|
250
250
|
parts.push(
|
|
251
|
-
`Score each trajectory from 0.0 to 1.0 RELATIVE to each other based on the archetype rubric
|
|
251
|
+
`Score each trajectory from 0.0 to 1.0 RELATIVE to each other based on the archetype rubric.`,
|
|
252
252
|
);
|
|
253
|
-
parts.push(
|
|
253
|
+
parts.push("");
|
|
254
254
|
parts.push(`Respond with ONLY valid JSON:`);
|
|
255
255
|
parts.push(`{
|
|
256
256
|
"scores": [
|
|
@@ -267,7 +267,7 @@ The metrics provided are CONTEXT to inform your judgment. Use them to understand
|
|
|
267
267
|
]
|
|
268
268
|
}`);
|
|
269
269
|
|
|
270
|
-
return parts.join(
|
|
270
|
+
return parts.join("\n");
|
|
271
271
|
}
|
|
272
272
|
|
|
273
273
|
/**
|
|
@@ -275,48 +275,48 @@ The metrics provided are CONTEXT to inform your judgment. Use them to understand
|
|
|
275
275
|
*/
|
|
276
276
|
private formatMetrics(
|
|
277
277
|
metrics: BehavioralMetrics,
|
|
278
|
-
priorityMetrics: string[]
|
|
278
|
+
priorityMetrics: string[],
|
|
279
279
|
): string {
|
|
280
280
|
const lines: string[] = [];
|
|
281
281
|
|
|
282
282
|
// Show priority metrics first with emphasis
|
|
283
283
|
if (priorityMetrics.length > 0) {
|
|
284
|
-
lines.push(
|
|
284
|
+
lines.push("### ⭐ KEY METRICS FOR THIS ARCHETYPE");
|
|
285
285
|
for (const metricPath of priorityMetrics.slice(0, 6)) {
|
|
286
286
|
const value = this.getMetricValue(metrics, metricPath);
|
|
287
287
|
const label = this.formatMetricLabel(metricPath);
|
|
288
288
|
lines.push(`- **${label}**: ${value}`);
|
|
289
289
|
}
|
|
290
|
-
lines.push(
|
|
290
|
+
lines.push("");
|
|
291
291
|
}
|
|
292
292
|
|
|
293
293
|
// Summary metrics
|
|
294
294
|
const summary = getMetricsSummary(metrics);
|
|
295
|
-
lines.push(
|
|
295
|
+
lines.push("### Performance Summary");
|
|
296
296
|
lines.push(`- Total P&L: $${summary.totalPnL.toFixed(2)}`);
|
|
297
297
|
lines.push(`- Win Rate: ${(summary.winRate * 100).toFixed(1)}%`);
|
|
298
298
|
lines.push(`- Trades Executed: ${summary.tradesExecuted}`);
|
|
299
299
|
lines.push(
|
|
300
|
-
`- Action Success Rate: ${(summary.actionSuccessRate * 100).toFixed(1)}
|
|
300
|
+
`- Action Success Rate: ${(summary.actionSuccessRate * 100).toFixed(1)}%`,
|
|
301
301
|
);
|
|
302
|
-
lines.push(
|
|
302
|
+
lines.push("");
|
|
303
303
|
|
|
304
304
|
// Social metrics
|
|
305
|
-
lines.push(
|
|
305
|
+
lines.push("### Social Activity");
|
|
306
306
|
lines.push(
|
|
307
|
-
`- Unique Users Interacted: ${metrics.social.uniqueUsersInteracted}
|
|
307
|
+
`- Unique Users Interacted: ${metrics.social.uniqueUsersInteracted}`,
|
|
308
308
|
);
|
|
309
309
|
lines.push(`- Group Chats Joined: ${metrics.social.groupChatsJoined}`);
|
|
310
310
|
lines.push(`- DMs Initiated: ${metrics.social.dmsInitiated}`);
|
|
311
311
|
lines.push(`- Posts Created: ${metrics.social.postsCreated}`);
|
|
312
312
|
lines.push(`- Comments Made: ${metrics.social.commentsMade}`);
|
|
313
313
|
lines.push(
|
|
314
|
-
`- Social to Trade Ratio: ${metrics.behavior.socialToTradeRatio.toFixed(2)}
|
|
314
|
+
`- Social to Trade Ratio: ${metrics.behavior.socialToTradeRatio.toFixed(2)}`,
|
|
315
315
|
);
|
|
316
|
-
lines.push(
|
|
316
|
+
lines.push("");
|
|
317
317
|
|
|
318
318
|
// Trading metrics
|
|
319
|
-
lines.push(
|
|
319
|
+
lines.push("### Trading Performance");
|
|
320
320
|
lines.push(`- Total P&L: $${metrics.trading.totalPnL.toFixed(2)}`);
|
|
321
321
|
lines.push(`- Win Rate: ${(metrics.trading.winRate * 100).toFixed(1)}%`);
|
|
322
322
|
lines.push(`- Sharpe Ratio: ${metrics.trading.sharpeRatio.toFixed(2)}`);
|
|
@@ -324,94 +324,94 @@ The metrics provided are CONTEXT to inform your judgment. Use them to understand
|
|
|
324
324
|
lines.push(`- Markets Traded: ${metrics.trading.marketsTraded}`);
|
|
325
325
|
lines.push(`- Largest Win: $${metrics.trading.largestWin.toFixed(2)}`);
|
|
326
326
|
lines.push(`- Largest Loss: $${metrics.trading.largestLoss.toFixed(2)}`);
|
|
327
|
-
lines.push(
|
|
327
|
+
lines.push("");
|
|
328
328
|
|
|
329
329
|
// Influence metrics
|
|
330
|
-
lines.push(
|
|
330
|
+
lines.push("### Influence");
|
|
331
331
|
lines.push(`- Followers Gained: ${metrics.influence.followersGained}`);
|
|
332
332
|
lines.push(
|
|
333
|
-
`- Reputation Delta: ${metrics.influence.reputationDelta > 0 ?
|
|
333
|
+
`- Reputation Delta: ${metrics.influence.reputationDelta > 0 ? "+" : ""}${metrics.influence.reputationDelta}`,
|
|
334
334
|
);
|
|
335
335
|
lines.push(`- Positive Reactions: ${metrics.influence.positiveReactions}`);
|
|
336
336
|
lines.push(`- Information Spread: ${metrics.influence.informationSpread}`);
|
|
337
|
-
lines.push(
|
|
337
|
+
lines.push("");
|
|
338
338
|
|
|
339
339
|
// Behavior metrics
|
|
340
|
-
lines.push(
|
|
340
|
+
lines.push("### Behavior Patterns");
|
|
341
341
|
lines.push(
|
|
342
|
-
`- Actions Per Tick: ${metrics.behavior.actionsPerTick.toFixed(2)}
|
|
342
|
+
`- Actions Per Tick: ${metrics.behavior.actionsPerTick.toFixed(2)}`,
|
|
343
343
|
);
|
|
344
344
|
lines.push(
|
|
345
|
-
`- Consistency Score: ${(metrics.behavior.consistencyScore * 100).toFixed(1)}
|
|
345
|
+
`- Consistency Score: ${(metrics.behavior.consistencyScore * 100).toFixed(1)}%`,
|
|
346
346
|
);
|
|
347
347
|
lines.push(
|
|
348
|
-
`- Dominant Action: ${metrics.behavior.dominantActionType ||
|
|
348
|
+
`- Dominant Action: ${metrics.behavior.dominantActionType || "none"}`,
|
|
349
349
|
);
|
|
350
|
-
lines.push(
|
|
350
|
+
lines.push("");
|
|
351
351
|
|
|
352
352
|
// Information metrics
|
|
353
|
-
lines.push(
|
|
353
|
+
lines.push("### Information Activity");
|
|
354
354
|
lines.push(`- Research Actions: ${metrics.information.researchActions}`);
|
|
355
355
|
lines.push(`- Predictions Made: ${metrics.information.predictionsMade}`);
|
|
356
356
|
lines.push(
|
|
357
|
-
`- Prediction Accuracy: ${(metrics.information.predictionAccuracy * 100).toFixed(1)}
|
|
357
|
+
`- Prediction Accuracy: ${(metrics.information.predictionAccuracy * 100).toFixed(1)}%`,
|
|
358
358
|
);
|
|
359
359
|
|
|
360
|
-
return lines.join(
|
|
360
|
+
return lines.join("\n");
|
|
361
361
|
}
|
|
362
362
|
|
|
363
363
|
/**
|
|
364
364
|
* Get a metric value from the metrics object using a dot-path
|
|
365
365
|
*/
|
|
366
366
|
private getMetricValue(metrics: BehavioralMetrics, path: string): string {
|
|
367
|
-
const [category, key] = path.split(
|
|
368
|
-
if (!category || !key) return
|
|
367
|
+
const [category, key] = path.split(".");
|
|
368
|
+
if (!category || !key) return "N/A";
|
|
369
369
|
|
|
370
370
|
// Access nested metric value based on category
|
|
371
371
|
let value: number | string | string[] | undefined;
|
|
372
372
|
switch (category) {
|
|
373
|
-
case
|
|
373
|
+
case "trading":
|
|
374
374
|
value = metrics.trading[key as keyof typeof metrics.trading];
|
|
375
375
|
break;
|
|
376
|
-
case
|
|
376
|
+
case "social":
|
|
377
377
|
value = metrics.social[key as keyof typeof metrics.social];
|
|
378
378
|
break;
|
|
379
|
-
case
|
|
379
|
+
case "influence":
|
|
380
380
|
value = metrics.influence[key as keyof typeof metrics.influence];
|
|
381
381
|
break;
|
|
382
|
-
case
|
|
382
|
+
case "behavior":
|
|
383
383
|
value = metrics.behavior[key as keyof typeof metrics.behavior];
|
|
384
384
|
break;
|
|
385
|
-
case
|
|
385
|
+
case "information":
|
|
386
386
|
value = metrics.information[key as keyof typeof metrics.information];
|
|
387
387
|
break;
|
|
388
388
|
default:
|
|
389
|
-
return
|
|
389
|
+
return "N/A";
|
|
390
390
|
}
|
|
391
391
|
|
|
392
|
-
if (value === undefined || value === null) return
|
|
392
|
+
if (value === undefined || value === null) return "N/A";
|
|
393
393
|
|
|
394
394
|
// Format based on value type
|
|
395
|
-
if (typeof value ===
|
|
395
|
+
if (typeof value === "number") {
|
|
396
396
|
// Check if it's a rate/percentage
|
|
397
397
|
if (
|
|
398
|
-
key.includes(
|
|
399
|
-
key.includes(
|
|
400
|
-
key.includes(
|
|
398
|
+
key.includes("Rate") ||
|
|
399
|
+
key.includes("Accuracy") ||
|
|
400
|
+
key.includes("Score")
|
|
401
401
|
) {
|
|
402
402
|
return `${(value * 100).toFixed(1)}%`;
|
|
403
403
|
}
|
|
404
404
|
// Check if it's a currency
|
|
405
405
|
if (
|
|
406
|
-
key.includes(
|
|
407
|
-
key.includes(
|
|
408
|
-
key.includes(
|
|
409
|
-
key.includes(
|
|
406
|
+
key.includes("PnL") ||
|
|
407
|
+
key.includes("Win") ||
|
|
408
|
+
key.includes("Loss") ||
|
|
409
|
+
key.includes("Drawdown")
|
|
410
410
|
) {
|
|
411
411
|
return `$${value.toFixed(2)}`;
|
|
412
412
|
}
|
|
413
413
|
// Check if it's a ratio
|
|
414
|
-
if (key.includes(
|
|
414
|
+
if (key.includes("Ratio")) {
|
|
415
415
|
return value.toFixed(2);
|
|
416
416
|
}
|
|
417
417
|
// Integer-like values
|
|
@@ -428,12 +428,12 @@ The metrics provided are CONTEXT to inform your judgment. Use them to understand
|
|
|
428
428
|
* Format a metric path into a human-readable label
|
|
429
429
|
*/
|
|
430
430
|
private formatMetricLabel(path: string): string {
|
|
431
|
-
const [, key] = path.split(
|
|
431
|
+
const [, key] = path.split(".");
|
|
432
432
|
if (!key) return path;
|
|
433
433
|
|
|
434
434
|
// Convert camelCase to Title Case with spaces
|
|
435
435
|
return key
|
|
436
|
-
.replace(/([A-Z])/g,
|
|
436
|
+
.replace(/([A-Z])/g, " $1")
|
|
437
437
|
.replace(/^./, (str) => str.toUpperCase())
|
|
438
438
|
.trim();
|
|
439
439
|
}
|
|
@@ -461,18 +461,18 @@ The metrics provided are CONTEXT to inform your judgment. Use them to understand
|
|
|
461
461
|
}
|
|
462
462
|
|
|
463
463
|
const sortedActions = Array.from(actionCounts.entries()).sort(
|
|
464
|
-
(a, b) => b[1] - a[1]
|
|
464
|
+
(a, b) => b[1] - a[1],
|
|
465
465
|
);
|
|
466
466
|
|
|
467
467
|
const lines: string[] = [];
|
|
468
468
|
lines.push(
|
|
469
|
-
`- Total Actions: ${steps.length} (${successCount} successful, ${errorCount} failed)
|
|
469
|
+
`- Total Actions: ${steps.length} (${successCount} successful, ${errorCount} failed)`,
|
|
470
470
|
);
|
|
471
471
|
lines.push(
|
|
472
|
-
`- Action Types: ${sortedActions.map(([type, count]) => `${type}(${count})`).join(
|
|
472
|
+
`- Action Types: ${sortedActions.map(([type, count]) => `${type}(${count})`).join(", ")}`,
|
|
473
473
|
);
|
|
474
474
|
|
|
475
|
-
return lines.join(
|
|
475
|
+
return lines.join("\n");
|
|
476
476
|
}
|
|
477
477
|
|
|
478
478
|
/**
|
|
@@ -481,12 +481,12 @@ The metrics provided are CONTEXT to inform your judgment. Use them to understand
|
|
|
481
481
|
private extractKeyDecisions(steps: TrajectoryStep[]): string | null {
|
|
482
482
|
const keyActions: string[] = [];
|
|
483
483
|
const keyActionTypes = new Set([
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
484
|
+
"trade",
|
|
485
|
+
"buy",
|
|
486
|
+
"sell",
|
|
487
|
+
"predict",
|
|
488
|
+
"create_group_chat",
|
|
489
|
+
"post",
|
|
490
490
|
]);
|
|
491
491
|
|
|
492
492
|
for (const step of steps) {
|
|
@@ -508,10 +508,10 @@ The metrics provided are CONTEXT to inform your judgment. Use them to understand
|
|
|
508
508
|
}
|
|
509
509
|
if (result.pnl !== undefined) {
|
|
510
510
|
const pnl = Number(result.pnl);
|
|
511
|
-
description += ` → P&L: ${pnl >= 0 ?
|
|
511
|
+
description += ` → P&L: ${pnl >= 0 ? "+" : ""}$${pnl.toFixed(2)}`;
|
|
512
512
|
}
|
|
513
513
|
|
|
514
|
-
keyActions.push(`- ${description} ${action.success ?
|
|
514
|
+
keyActions.push(`- ${description} ${action.success ? "✓" : "✗"}`);
|
|
515
515
|
}
|
|
516
516
|
}
|
|
517
517
|
|
|
@@ -520,7 +520,7 @@ The metrics provided are CONTEXT to inform your judgment. Use them to understand
|
|
|
520
520
|
}
|
|
521
521
|
|
|
522
522
|
// Limit to most recent 10 key actions
|
|
523
|
-
return keyActions.slice(-10).join(
|
|
523
|
+
return keyActions.slice(-10).join("\n");
|
|
524
524
|
}
|
|
525
525
|
|
|
526
526
|
/**
|
|
@@ -528,7 +528,7 @@ The metrics provided are CONTEXT to inform your judgment. Use them to understand
|
|
|
528
528
|
*/
|
|
529
529
|
private formatRecentActions(
|
|
530
530
|
steps: TrajectoryStep[],
|
|
531
|
-
maxActions: number
|
|
531
|
+
maxActions: number,
|
|
532
532
|
): string {
|
|
533
533
|
const recentSteps = steps.slice(-maxActions);
|
|
534
534
|
const lines: string[] = [];
|
|
@@ -537,16 +537,16 @@ The metrics provided are CONTEXT to inform your judgment. Use them to understand
|
|
|
537
537
|
const action = step.action;
|
|
538
538
|
if (!action) continue;
|
|
539
539
|
|
|
540
|
-
const success = action.success ?
|
|
540
|
+
const success = action.success ? "✓" : "✗";
|
|
541
541
|
const reasoning = action.reasoning
|
|
542
542
|
? ` | Reason: ${action.reasoning.substring(0, 50)}...`
|
|
543
|
-
:
|
|
543
|
+
: "";
|
|
544
544
|
lines.push(
|
|
545
|
-
`- [${step.stepNumber}] ${action.actionType} ${success}${reasoning}
|
|
545
|
+
`- [${step.stepNumber}] ${action.actionType} ${success}${reasoning}`,
|
|
546
546
|
);
|
|
547
547
|
}
|
|
548
548
|
|
|
549
|
-
return lines.join(
|
|
549
|
+
return lines.join("\n") || "No actions recorded";
|
|
550
550
|
}
|
|
551
551
|
}
|
|
552
552
|
|
|
@@ -11,10 +11,10 @@
|
|
|
11
11
|
* @packageDocumentation
|
|
12
12
|
*/
|
|
13
13
|
|
|
14
|
-
import {
|
|
15
|
-
import {
|
|
16
|
-
import { getRubricHash, RUBRICS_VERSION } from
|
|
17
|
-
import { logger } from
|
|
14
|
+
import { createHash } from "node:crypto";
|
|
15
|
+
import { getTrainingDataAdapter } from "../adapter";
|
|
16
|
+
import { getRubricHash, RUBRICS_VERSION } from "../rubrics";
|
|
17
|
+
import { logger } from "../utils/logger";
|
|
18
18
|
|
|
19
19
|
/**
|
|
20
20
|
* Cached score entry
|
|
@@ -84,10 +84,10 @@ export class LLMJudgeCache {
|
|
|
84
84
|
private generateCacheKey(
|
|
85
85
|
trajectoryId: string,
|
|
86
86
|
stepsJson: string,
|
|
87
|
-
archetype: string
|
|
87
|
+
archetype: string,
|
|
88
88
|
): string {
|
|
89
89
|
const content = `${trajectoryId}:${stepsJson}:${archetype}:${RUBRICS_VERSION}`;
|
|
90
|
-
return createHash(
|
|
90
|
+
return createHash("sha256").update(content).digest("hex").substring(0, 32);
|
|
91
91
|
}
|
|
92
92
|
|
|
93
93
|
/**
|
|
@@ -121,7 +121,7 @@ export class LLMJudgeCache {
|
|
|
121
121
|
get(
|
|
122
122
|
trajectoryId: string,
|
|
123
123
|
stepsJson: string,
|
|
124
|
-
archetype: string
|
|
124
|
+
archetype: string,
|
|
125
125
|
): CachedScore | null {
|
|
126
126
|
const cacheKey = this.generateCacheKey(trajectoryId, stepsJson, archetype);
|
|
127
127
|
const cached = this.cache.get(cacheKey);
|
|
@@ -144,9 +144,9 @@ export class LLMJudgeCache {
|
|
|
144
144
|
this.updateHitRate();
|
|
145
145
|
|
|
146
146
|
logger.debug(
|
|
147
|
-
|
|
147
|
+
"Cache hit",
|
|
148
148
|
{ trajectoryId, archetype, cacheKey: cacheKey.substring(0, 8) },
|
|
149
|
-
|
|
149
|
+
"LLMJudgeCache",
|
|
150
150
|
);
|
|
151
151
|
|
|
152
152
|
return cached;
|
|
@@ -162,7 +162,7 @@ export class LLMJudgeCache {
|
|
|
162
162
|
score: number,
|
|
163
163
|
reasoning: string,
|
|
164
164
|
strengths: string[] = [],
|
|
165
|
-
weaknesses: string[] = []
|
|
165
|
+
weaknesses: string[] = [],
|
|
166
166
|
): void {
|
|
167
167
|
// Enforce max entries limit
|
|
168
168
|
if (this.cache.size >= this.config.maxEntries) {
|
|
@@ -172,7 +172,7 @@ export class LLMJudgeCache {
|
|
|
172
172
|
const cacheKey = this.generateCacheKey(trajectoryId, stepsJson, archetype);
|
|
173
173
|
const now = new Date();
|
|
174
174
|
const expiresAt = new Date(
|
|
175
|
-
now.getTime() + this.config.ttlHours * 60 * 60 * 1000
|
|
175
|
+
now.getTime() + this.config.ttlHours * 60 * 60 * 1000,
|
|
176
176
|
);
|
|
177
177
|
|
|
178
178
|
const entry: CachedScore = {
|
|
@@ -192,9 +192,9 @@ export class LLMJudgeCache {
|
|
|
192
192
|
this.cache.set(cacheKey, entry);
|
|
193
193
|
|
|
194
194
|
logger.debug(
|
|
195
|
-
|
|
195
|
+
"Cache set",
|
|
196
196
|
{ trajectoryId, archetype, score, cacheKey: cacheKey.substring(0, 8) },
|
|
197
|
-
|
|
197
|
+
"LLMJudgeCache",
|
|
198
198
|
);
|
|
199
199
|
}
|
|
200
200
|
|
|
@@ -242,9 +242,9 @@ export class LLMJudgeCache {
|
|
|
242
242
|
this.stats.invalidations += invalidated;
|
|
243
243
|
|
|
244
244
|
logger.info(
|
|
245
|
-
|
|
245
|
+
"Invalidated cache entries",
|
|
246
246
|
{ archetype, count: invalidated },
|
|
247
|
-
|
|
247
|
+
"LLMJudgeCache",
|
|
248
248
|
);
|
|
249
249
|
|
|
250
250
|
return invalidated;
|
|
@@ -258,7 +258,7 @@ export class LLMJudgeCache {
|
|
|
258
258
|
this.cache.clear();
|
|
259
259
|
this.stats.invalidations += count;
|
|
260
260
|
|
|
261
|
-
logger.info(
|
|
261
|
+
logger.info("Cleared cache", { count }, "LLMJudgeCache");
|
|
262
262
|
}
|
|
263
263
|
|
|
264
264
|
/**
|
|
@@ -290,18 +290,18 @@ export class LLMJudgeCache {
|
|
|
290
290
|
this.set(
|
|
291
291
|
row.trajectoryId,
|
|
292
292
|
row.stepsJson,
|
|
293
|
-
|
|
293
|
+
"default",
|
|
294
294
|
row.aiJudgeReward,
|
|
295
|
-
row.aiJudgeReasoning
|
|
295
|
+
row.aiJudgeReasoning,
|
|
296
296
|
);
|
|
297
297
|
loaded++;
|
|
298
298
|
}
|
|
299
299
|
}
|
|
300
300
|
|
|
301
301
|
logger.info(
|
|
302
|
-
|
|
302
|
+
"Warmed cache from database",
|
|
303
303
|
{ loaded, attempted: results.length },
|
|
304
|
-
|
|
304
|
+
"LLMJudgeCache",
|
|
305
305
|
);
|
|
306
306
|
|
|
307
307
|
return loaded;
|
|
@@ -322,7 +322,10 @@ export const scoreValidator = {
|
|
|
322
322
|
*/
|
|
323
323
|
isValidScore(score: number): boolean {
|
|
324
324
|
return (
|
|
325
|
-
typeof score ===
|
|
325
|
+
typeof score === "number" &&
|
|
326
|
+
!Number.isNaN(score) &&
|
|
327
|
+
score >= 0 &&
|
|
328
|
+
score <= 1
|
|
326
329
|
);
|
|
327
330
|
},
|
|
328
331
|
|
|
@@ -331,7 +334,7 @@ export const scoreValidator = {
|
|
|
331
334
|
*/
|
|
332
335
|
isValidReasoning(reasoning: string): boolean {
|
|
333
336
|
return (
|
|
334
|
-
typeof reasoning ===
|
|
337
|
+
typeof reasoning === "string" &&
|
|
335
338
|
reasoning.length >= 20 &&
|
|
336
339
|
reasoning.length <= 5000
|
|
337
340
|
);
|
|
@@ -356,7 +359,7 @@ export const scoreValidator = {
|
|
|
356
359
|
* Check if scores are consistent (similar trajectories should have similar scores)
|
|
357
360
|
*/
|
|
358
361
|
checkScoreConsistency(
|
|
359
|
-
scores: Array<{ trajectoryId: string; score: number; metricsHash: string }
|
|
362
|
+
scores: Array<{ trajectoryId: string; score: number; metricsHash: string }>,
|
|
360
363
|
): { consistent: boolean; outliers: string[] } {
|
|
361
364
|
if (scores.length < 3) {
|
|
362
365
|
return { consistent: true, outliers: [] };
|
package/src/scoring/index.ts
CHANGED
|
@@ -4,6 +4,6 @@
|
|
|
4
4
|
* LLM-as-judge scoring with archetype-specific rubrics.
|
|
5
5
|
*/
|
|
6
6
|
|
|
7
|
-
export * from
|
|
8
|
-
export * from
|
|
9
|
-
export * from
|
|
7
|
+
export * from "./ArchetypeScoringService";
|
|
8
|
+
export * from "./JudgePromptBuilder";
|
|
9
|
+
export * from "./LLMJudgeCache";
|