@elizaos/training 2.0.0-alpha.77 → 2.0.0-alpha.78

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. package/package.json +2 -2
  2. package/.turbo/turbo-lint.log +0 -3
  3. package/.turbo/turbo-typecheck.log +0 -1
  4. package/dist/.tsbuildinfo +0 -1
  5. package/dist/adapter.js +0 -59
  6. package/dist/archetypes/ArchetypeConfigService.js +0 -510
  7. package/dist/archetypes/derive-archetype.js +0 -196
  8. package/dist/archetypes/index.js +0 -7
  9. package/dist/benchmark/ArchetypeMatchupBenchmark.js +0 -547
  10. package/dist/benchmark/BenchmarkChartGenerator.js +0 -632
  11. package/dist/benchmark/BenchmarkDataGenerator.js +0 -825
  12. package/dist/benchmark/BenchmarkDataViewer.js +0 -197
  13. package/dist/benchmark/BenchmarkHistoryService.js +0 -135
  14. package/dist/benchmark/BenchmarkRunner.js +0 -483
  15. package/dist/benchmark/BenchmarkValidator.js +0 -158
  16. package/dist/benchmark/FastEvalRunner.js +0 -133
  17. package/dist/benchmark/MetricsValidator.js +0 -104
  18. package/dist/benchmark/MetricsVisualizer.js +0 -775
  19. package/dist/benchmark/ModelBenchmarkService.js +0 -433
  20. package/dist/benchmark/ModelRegistry.js +0 -122
  21. package/dist/benchmark/RulerBenchmarkIntegration.js +0 -168
  22. package/dist/benchmark/SimulationA2AInterface.js +0 -683
  23. package/dist/benchmark/SimulationEngine.js +0 -522
  24. package/dist/benchmark/TaskRunner.js +0 -60
  25. package/dist/benchmark/__tests__/BenchmarkRunner.test.js +0 -409
  26. package/dist/benchmark/__tests__/HeadToHead.test.js +0 -105
  27. package/dist/benchmark/index.js +0 -23
  28. package/dist/benchmark/parseSimulationMetrics.js +0 -86
  29. package/dist/benchmark/simulation-types.js +0 -1
  30. package/dist/dependencies.js +0 -197
  31. package/dist/generation/TrajectoryGenerator.js +0 -244
  32. package/dist/generation/index.js +0 -6
  33. package/dist/huggingface/HuggingFaceDatasetUploader.js +0 -463
  34. package/dist/huggingface/HuggingFaceIntegrationService.js +0 -272
  35. package/dist/huggingface/HuggingFaceModelUploader.js +0 -385
  36. package/dist/huggingface/index.js +0 -9
  37. package/dist/huggingface/shared/HuggingFaceUploadUtil.js +0 -144
  38. package/dist/index.js +0 -41
  39. package/dist/init-training.js +0 -43
  40. package/dist/metrics/TrajectoryMetricsExtractor.js +0 -523
  41. package/dist/metrics/__tests__/TrajectoryMetricsExtractor.test.js +0 -628
  42. package/dist/metrics/index.js +0 -7
  43. package/dist/metrics/types.js +0 -21
  44. package/dist/rubrics/__tests__/index.test.js +0 -150
  45. package/dist/rubrics/ass-kisser.js +0 -83
  46. package/dist/rubrics/degen.js +0 -78
  47. package/dist/rubrics/goody-twoshoes.js +0 -82
  48. package/dist/rubrics/index.js +0 -184
  49. package/dist/rubrics/information-trader.js +0 -82
  50. package/dist/rubrics/infosec.js +0 -99
  51. package/dist/rubrics/liar.js +0 -102
  52. package/dist/rubrics/perps-trader.js +0 -85
  53. package/dist/rubrics/researcher.js +0 -79
  54. package/dist/rubrics/scammer.js +0 -80
  55. package/dist/rubrics/social-butterfly.js +0 -71
  56. package/dist/rubrics/super-predictor.js +0 -95
  57. package/dist/rubrics/trader.js +0 -65
  58. package/dist/scoring/ArchetypeScoringService.js +0 -301
  59. package/dist/scoring/JudgePromptBuilder.js +0 -401
  60. package/dist/scoring/LLMJudgeCache.js +0 -263
  61. package/dist/scoring/index.js +0 -8
  62. package/dist/training/AutomationPipeline.js +0 -714
  63. package/dist/training/BenchmarkService.js +0 -370
  64. package/dist/training/ConfigValidator.js +0 -153
  65. package/dist/training/MarketOutcomesTracker.js +0 -142
  66. package/dist/training/ModelDeployer.js +0 -128
  67. package/dist/training/ModelFetcher.js +0 -48
  68. package/dist/training/ModelSelectionService.js +0 -248
  69. package/dist/training/ModelUsageVerifier.js +0 -106
  70. package/dist/training/MultiModelOrchestrator.js +0 -349
  71. package/dist/training/RLModelConfig.js +0 -295
  72. package/dist/training/RewardBackpropagationService.js +0 -117
  73. package/dist/training/RulerScoringService.js +0 -450
  74. package/dist/training/TrainingMonitor.js +0 -108
  75. package/dist/training/TrajectoryRecorder.js +0 -281
  76. package/dist/training/__tests__/TrajectoryRecorder.test.js +0 -363
  77. package/dist/training/index.js +0 -30
  78. package/dist/training/logRLConfig.js +0 -29
  79. package/dist/training/pipeline.js +0 -80
  80. package/dist/training/storage/ModelStorageService.js +0 -190
  81. package/dist/training/storage/TrainingDataArchiver.js +0 -136
  82. package/dist/training/storage/index.js +0 -7
  83. package/dist/training/types.js +0 -6
  84. package/dist/training/window-utils.js +0 -100
  85. package/dist/utils/index.js +0 -73
  86. package/dist/utils/logger.js +0 -55
  87. package/dist/utils/snowflake.js +0 -15
  88. package/dist/utils/synthetic-detector.js +0 -67
  89. package/vitest.config.ts +0 -8
@@ -1,117 +0,0 @@
1
- /**
2
- * Reward Backpropagation Service
3
- *
4
- * Updates trajectory rewards when market outcomes become known.
5
- * This allows the RL model to learn from actual results, not just immediate actions.
6
- */
7
- import { getMarketDataAdapter, getTrainingDataAdapter } from "../adapter";
8
- import { logger } from "../utils/logger";
9
- import { MarketOutcomesTracker } from "./MarketOutcomesTracker";
10
/**
 * Rewrites per-step trajectory rewards once the market outcomes of a window
 * are known, so training sees realized results instead of only the rewards
 * recorded at action time.
 */
export class RewardBackpropagationService {
  /** Substrings of `action.actionType` that mark a step as a trade. */
  static #TRADE_MARKERS = ["TRADING", "BUY", "SELL"];

  outcomesTracker;

  constructor() {
    this.outcomesTracker = new MarketOutcomesTracker();
  }

  /**
   * Parse a stored steps payload.
   *
   * @param {string} stepsJson - Serialized steps.
   * @returns {Array|null} The parsed steps, or null when the payload is not
   *   valid JSON or does not decode to an array (e.g. the literal "null").
   */
  static #parseSteps(stepsJson) {
    try {
      const parsed = JSON.parse(stepsJson);
      return Array.isArray(parsed) ? parsed : null;
    } catch {
      // The caller logs and skips; one corrupt row must not abort the window.
      return null;
    }
  }

  /** Clamp a reward into the normalized [-1, 1] range. */
  static #clampReward(value) {
    return Math.max(-1, Math.min(1, value));
  }

  /**
   * Compute the outcome-adjusted reward for a single step.
   *
   * @param {object} step - Parsed step; `step.action` may be absent.
   * @param {object} outcomes - Window outcomes exposing `predictions` and `stocks`.
   * @returns {*} The new reward, or the step's current reward when no known
   *   outcome applies to it.
   */
  static #rewardFromOutcomes(step, outcomes) {
    const currentReward = step.reward;
    const action = step.action;
    const actionType = action?.actionType;
    if (typeof actionType !== "string") {
      return currentReward;
    }
    const isTrade = RewardBackpropagationService.#TRADE_MARKERS.some((marker) =>
      actionType.includes(marker),
    );
    if (!isTrade) {
      return currentReward;
    }

    const marketId = action.parameters?.marketId;
    const ticker = action.parameters?.ticker;
    const side = action.parameters?.side;

    if (marketId) {
      // Prediction market: +1 for a correct side, -1 for an incorrect one.
      const prediction = outcomes.predictions.find((p) => p.marketId === marketId);
      const outcome = prediction?.outcome;
      const isResolved = outcome === "YES" || outcome === "NO";
      const hasKnownSide = side === "YES" || side === "NO";
      // Like the perps branch below, an unrecognized side (or an outcome that
      // is neither YES nor NO) leaves the recorded reward untouched instead of
      // being penalized as a wrong trade.
      if (isResolved && hasKnownSide) {
        return side === outcome ? 1.0 : -1.0;
      }
      return currentReward;
    }

    if (ticker) {
      // Perpetual: reward follows the price move in the position's direction.
      const stock = outcomes.stocks.find((s) => s.ticker === ticker);
      if (!stock) {
        return currentReward;
      }
      const priceChange = Number(stock.changePercent);
      if (!Number.isFinite(priceChange)) {
        // A missing change would otherwise turn the reward into NaN.
        return currentReward;
      }
      if (side === "long") {
        return RewardBackpropagationService.#clampReward(priceChange / 10); // Normalize to -1 to 1
      }
      if (side === "short") {
        return RewardBackpropagationService.#clampReward(-priceChange / 10); // Inverted for short
      }
    }
    return currentReward;
  }

  /**
   * Update rewards for trajectories in a window when outcomes become known.
   *
   * Only trajectories flagged `isTrainingData` are considered. Trajectories
   * whose `stepsJson` is empty are skipped silently; ones whose `stepsJson`
   * cannot be decoded into an array are skipped with a warning.
   *
   * @param {string} windowId - Window whose trajectories should be updated.
   * @returns {Promise<number>} Number of trajectories whose rewards changed.
   */
  async updateRewardsForWindow(windowId) {
    logger.info("Updating rewards for window", { windowId });

    const outcomes = await this.outcomesTracker.getWindowOutcomes(windowId);
    if (!outcomes) {
      logger.info("No outcomes found for window", { windowId });
      return 0;
    }

    const allTrajectories = await getTrainingDataAdapter().getTrajectoriesByWindow(windowId);
    const trajectoriesResult = allTrajectories.filter((t) => t.isTrainingData);

    let updated = 0;
    for (const traj of trajectoriesResult) {
      if (!traj.stepsJson) {
        continue;
      }
      const steps = RewardBackpropagationService.#parseSteps(traj.stepsJson);
      if (!steps) {
        logger.warn("Skipping trajectory with invalid stepsJson", {
          windowId,
          trajectoryId: traj.id,
        });
        continue;
      }

      let totalReward = 0;
      let hasUpdates = false;
      for (const step of steps) {
        if (step === null || typeof step !== "object") {
          continue;
        }
        const updatedReward = RewardBackpropagationService.#rewardFromOutcomes(step, outcomes);
        if (updatedReward !== step.reward) {
          step.reward = updatedReward;
          hasUpdates = true;
        }
        // Steps without a numeric reward contribute nothing instead of
        // poisoning the trajectory total with NaN.
        if (Number.isFinite(step.reward)) {
          totalReward += step.reward;
        }
      }

      // Persist only trajectories whose rewards actually changed.
      if (hasUpdates) {
        await getTrainingDataAdapter().updateTrajectoryRewards(
          traj.id,
          JSON.stringify(steps),
          totalReward,
        );
        updated++;
      }
    }

    logger.info("Updated rewards for trajectories", {
      windowId,
      updated,
      total: trajectoriesResult.length,
    });
    return updated;
  }

  /**
   * Run `updateRewardsForWindow` for every window the market data adapter
   * reports as having outcomes.
   *
   * @returns {Promise<number>} Number of windows in which at least one
   *   trajectory was updated; 0 when no market data adapter is available.
   */
  async processPendingWindows() {
    const marketAdapter = getMarketDataAdapter();
    if (!marketAdapter) {
      return 0;
    }
    const windowIds = await marketAdapter.getDistinctWindowsWithOutcomes();

    let processed = 0;
    for (const windowId of windowIds) {
      const updated = await this.updateRewardsForWindow(windowId);
      if (updated > 0) {
        processed++;
      }
    }
    return processed;
  }
}
117
// Shared module-level instance; creating it also constructs a MarketOutcomesTracker.
- export const rewardBackpropagationService = new RewardBackpropagationService();
@@ -1,450 +0,0 @@
1
- /**
2
- * RULER Scoring Service
3
- *
4
- * Implements RULER (Relative Universal LLM-Elicited Rewards) using LLM-as-judge.
5
- *
6
- * Key features:
7
- * - Groups trajectories by scenarioId for relative comparison
8
- * - Uses LLM judge to score trajectories relative to each other (0-1)
9
- * - Injects game context (P&L, episode length, actions) into judge prompt
10
- * - Deduplicates common message prefixes to save tokens
11
- * - Works with any LiteLLM-compatible provider (Groq, OpenAI, etc.)
12
- *
13
- * Based on: https://art.openpipe.ai/fundamentals/ruler
14
- */
15
- import { getTrainingDataAdapter } from "../adapter";
16
/**
 * Local stand-in for the `asUUID` helper from @elizaos/core.
 *
 * At runtime this is a pure pass-through: the argument is handed back
 * untouched, with no validation or normalization.
 *
 * @param {string} id - Identifier to treat as a UUID.
 * @returns {string} The same `id` value.
 */
function asUUID(id) {
  return id;
}
20
- import { v4 as uuidv4 } from "uuid";
21
- import { getLLMCaller, getToTrainingMessages, } from "../dependencies";
22
- import { getRubric, sanitizeArchetype } from "../rubrics";
23
- import { logger, splitIntoBatches } from "../utils";
24
/**
 * Generic RULER grading rubric, used when no archetype-specific rubric
 * applies. The text starts and ends with a newline, with one bullet per line.
 */
const DEFAULT_RUBRIC = [
  "",
  "- A trajectory that achieves its goal should always get a significantly higher score than a trajectory that does not achieve its goal.",
  "- A trajectory that achieves its goal more efficiently (eg. by avoiding unproductive detours) should get a higher score than a trajectory that achieves its goal less efficiently.",
  "- If one trajectory is only slightly better than another, the difference in scores should be small. If it is significantly better, the difference in scores should be large.",
  "- You may give some partial credit for a trajectory that makes progress towards its goal but does not complete it.",
  "",
].join("\n");
33
/**
 * RULER scoring: groups trajectories by scenario and asks an LLM judge to
 * score the members of each group relative to each other on a 0-1 scale.
 */
export class RulerScoringService {
  minGroupSize = 2; // Minimum trajectories per group for comparison
  maxGroupSize = 8; // Optimal group size per RULER docs

  /**
   * Decode a stored steps payload into a usable step list.
   *
   * @param {string|null|undefined} stepsJson - Serialized steps.
   * @returns {Array|null} The steps, or null when the payload is missing, is
   *   not valid JSON, is not a non-empty array, or contains a step without an
   *   `action` object (the conversion below reads `step.action.*`).
   */
  static #parseSteps(stepsJson) {
    if (!stepsJson) {
      return null;
    }
    let parsed;
    try {
      parsed = JSON.parse(stepsJson);
    } catch {
      // The caller logs and skips the trajectory.
      return null;
    }
    if (!Array.isArray(parsed) || parsed.length === 0) {
      return null;
    }
    const isWellFormed = parsed.every(
      (step) =>
        step !== null &&
        typeof step === "object" &&
        step.action !== null &&
        typeof step.action === "object",
    );
    return isWellFormed ? parsed : null;
  }

  /**
   * Expand a stored trajectory row into the rich trajectory shape handed to
   * the training-message converter.
   *
   * @param {object} dbTraj - Stored trajectory row.
   * @param {Array} steps - Steps decoded from `dbTraj.stepsJson`.
   * @returns {object} Rich trajectory with freshly generated step/call ids.
   */
  static #toRichTrajectory(dbTraj, steps) {
    // Steps without their own timestamp get "now + index" so ordering is kept.
    const fallbackTimestamp = Date.now();
    // Non-numeric rewards count as 0 so the total never becomes NaN.
    const totalReward = steps.reduce(
      (sum, s) => sum + (Number.isFinite(s.reward) ? s.reward : 0),
      0,
    );
    return {
      trajectoryId: asUUID(dbTraj.trajectoryId),
      agentId: asUUID(uuidv4()),
      startTime: 0,
      endTime: 0,
      durationMs: 0,
      scenarioId: dbTraj.scenarioId || undefined,
      steps: steps.map((s, idx) => {
        const timestamp = s.timestamp || fallbackTimestamp + idx;
        return {
          stepId: asUUID(uuidv4()),
          stepNumber: idx,
          timestamp,
          environmentState: {
            ...s.environmentState,
            timestamp,
            // `?.` because the spread above already tolerates a missing state.
            agentPoints: s.environmentState?.agentPoints ?? 0,
          },
          observation: {},
          providerAccesses: (s.providerAccesses || []).map((p) => ({
            providerId: uuidv4(),
            providerName: p.providerName,
            timestamp,
            // NOTE(review): `query` mirrors `data`, as in the original code —
            // confirm this is intended.
            query: p.data,
            data: p.data,
            purpose: p.purpose,
          })),
          llmCalls: (s.llmCalls || []).map((l) => ({
            callId: uuidv4(),
            timestamp,
            model: l.model,
            modelVersion: l.modelVersion,
            systemPrompt: l.systemPrompt,
            userPrompt: l.userPrompt,
            response: l.response,
            reasoning: l.reasoning,
            temperature: l.temperature,
            maxTokens: l.maxTokens,
            latencyMs: l.latencyMs,
            purpose: l.purpose,
            actionType: l.actionType,
          })),
          action: {
            attemptId: uuidv4(),
            timestamp,
            actionType: s.action.actionType,
            actionName: s.action.actionType,
            parameters: s.action.parameters,
            reasoning: s.action.reasoning,
            success: s.action.success,
            result: s.action.result,
            error: s.action.error,
          },
          reward: s.reward,
          done: idx === steps.length - 1,
          metadata: {},
        };
      }),
      totalReward,
      rewardComponents: {
        environmentReward: totalReward,
      },
      metrics: {
        episodeLength: dbTraj.episodeLength || steps.length,
        finalStatus: "completed",
        // `??` keeps a legitimate P&L of 0 instead of dropping it.
        finalPnL: dbTraj.finalPnL ?? undefined,
      },
      metadata: {
        isTrainingData: true,
      },
    };
  }

  /**
   * Make sure the last batch is large enough to be compared.
   *
   * When the final batch has fewer than `minSize` members (e.g. 9 items split
   * as 8 + 1), items are moved over from the previous batch (8 + 1 -> 7 + 2)
   * so the tail is not skipped as an undersized group.
   *
   * @param {Array<Array>} batches - Batches in scoring order.
   * @param {number} minSize - Minimum viable batch size.
   * @returns {Array<Array>} Rebalanced batches; the input is not mutated.
   */
  static #rebalanceBatches(batches, minSize) {
    if (!Array.isArray(batches) || batches.length < 2) {
      return batches;
    }
    const tail = batches[batches.length - 1];
    const previous = batches[batches.length - 2];
    if (!Array.isArray(tail) || !Array.isArray(previous)) {
      return batches;
    }
    const deficit = minSize - tail.length;
    if (deficit <= 0 || previous.length - deficit < minSize) {
      return batches;
    }
    const splitAt = previous.length - deficit;
    return [
      ...batches.slice(0, -2),
      previous.slice(0, splitAt),
      [...previous.slice(splitAt), ...tail],
    ];
  }

  /**
   * Coerce a judge-provided score into a number within [0, 1].
   *
   * @param {*} raw - Score as returned by the judge (number or numeric string).
   * @returns {number|null} The clamped score, or null when it is not numeric.
   */
  static #normalizeScore(raw) {
    const value = typeof raw === "string" && raw.trim() !== "" ? Number(raw) : raw;
    if (typeof value !== "number" || !Number.isFinite(value)) {
      return null;
    }
    return Math.max(0, Math.min(1, value));
  }

  /**
   * Score trajectories using RULER (LLM-as-judge with relative comparison).
   *
   * Trajectories are grouped by scenarioId, each group is split into batches
   * of at most `maxGroupSize`, and every batch is scored by `scoreGroup`.
   * Groups smaller than `minGroupSize` are skipped with a warning.
   *
   * @param {string[]} [trajectoryIds] - Specific trajectory IDs to score. When
   *   omitted or empty, the adapter's unscored trajectories are used.
   * @returns {Promise<number>} Number of trajectories successfully scored.
   */
  async scoreTrajectories(trajectoryIds) {
    const trajectoriesResult = await this.getTrajectoriesToScore(trajectoryIds);
    if (trajectoriesResult.length === 0) {
      logger.info("No trajectories to score", {}, "RulerScoring");
      return 0;
    }

    const groups = this.groupByScenario(trajectoriesResult);
    logger.info("Grouped trajectories for RULER scoring", {
      totalTrajectories: trajectoriesResult.length,
      groups: groups.length,
      avgGroupSize: groups.length > 0 ? trajectoriesResult.length / groups.length : 0,
    }, "RulerScoring");

    let totalScored = 0;
    for (const group of groups) {
      if (group.trajectories.length < this.minGroupSize) {
        logger.warn("Skipping group with insufficient trajectories", {
          scenarioId: group.scenarioId,
          count: group.trajectories.length,
          minRequired: this.minGroupSize,
        }, "RulerScoring");
        continue;
      }
      const batches = RulerScoringService.#rebalanceBatches(
        splitIntoBatches(group.trajectories, this.maxGroupSize),
        this.minGroupSize,
      );
      for (const batch of batches) {
        totalScored += await this.scoreGroup(batch, group.scenarioId);
      }
    }

    logger.info("RULER scoring complete", {
      totalScored,
      totalTrajectories: trajectoriesResult.length,
    }, "RulerScoring");
    return totalScored;
  }

  /**
   * Score a single trajectory (for backward compatibility).
   *
   * Delegates to `scoreTrajectories` with a one-element id list, then reads
   * the stored judge result back from the adapter.
   *
   * @param {string} trajectoryId - Trajectory to score.
   * @returns {Promise<object|null>} `{ trajectoryId, overallScore, reasoning,
   *   scoredAt }`, or null when nothing was scored or no judge reward is stored.
   */
  async scoreTrajectory(trajectoryId) {
    const scored = await this.scoreTrajectories([trajectoryId]);
    if (scored === 0) {
      return null;
    }
    const updated = await getTrainingDataAdapter().getTrajectoryById(trajectoryId);
    // `== null` also rejects an undefined reward.
    if (!updated || updated.aiJudgeReward == null) {
      return null;
    }
    return {
      trajectoryId: updated.trajectoryId,
      overallScore: updated.aiJudgeReward,
      reasoning: updated.aiJudgeReasoning || "",
      scoredAt: updated.judgedAt || new Date(),
    };
  }

  /**
   * Score a group of trajectories using RULER.
   *
   * 1. Convert trajectories to message format
   * 2. Extract common prefix (deduplication)
   * 3. Build judge prompt with context (P&L, episode length, etc.)
   * 4. Call LLM judge to score trajectories relative to each other
   * 5. Save scores through the training data adapter
   *
   * Trajectories whose steps cannot be decoded are skipped with a warning
   * rather than aborting the whole group.
   *
   * @param {object[]} trajectoriesData - Stored trajectory rows of one batch.
   * @param {string} scenarioId - Scenario shared by the batch.
   * @returns {Promise<number>} Number of trajectories whose score was saved.
   */
  async scoreGroup(trajectoriesData, scenarioId) {
    const richTrajectories = [];
    for (const dbTraj of trajectoriesData) {
      const steps = RulerScoringService.#parseSteps(dbTraj.stepsJson);
      if (!steps) {
        logger.warn("Skipping trajectory with invalid stepsJson", {
          trajectoryId: dbTraj.trajectoryId,
        }, "RulerScoring");
        continue;
      }
      const richTraj = RulerScoringService.#toRichTrajectory(dbTraj, steps);
      const toARTMessages = getToTrainingMessages();
      const messages = toARTMessages(richTraj);
      // Sanitize archetype to prevent prompt injection and handle null/empty values
      const archetype = sanitizeArchetype(dbTraj.archetype);
      richTrajectories.push({ traj: richTraj, messages, archetype });
    }

    if (richTrajectories.length < this.minGroupSize) {
      logger.warn("Insufficient valid trajectories in group", {
        scenarioId,
        validCount: richTrajectories.length,
      }, "RulerScoring");
      return 0;
    }

    const commonPrefix = this.extractCommonPrefix(richTrajectories.map((rt) => rt.messages));
    const judgePrompt = this.buildJudgePrompt(richTrajectories, commonPrefix, scenarioId);
    const judgeResponse = await this.callJudge(judgePrompt);
    if (!judgeResponse || judgeResponse.scores.length !== richTrajectories.length) {
      logger.error("Invalid judge response", {
        expectedScores: richTrajectories.length,
        receivedScores: judgeResponse?.scores.length || 0,
      }, "RulerScoring");
      return 0;
    }

    const scoreMap = new Map();
    for (const score of judgeResponse.scores) {
      scoreMap.set(score.trajectory_id, score);
    }

    let scored = 0;
    for (const [index, rt] of richTrajectories.entries()) {
      // Ids follow the "trajectory-N" labels used in the judge prompt.
      const expectedTrajId = `trajectory-${index + 1}`;
      const scoreData = scoreMap.get(expectedTrajId);
      if (!scoreData) {
        logger.warn("Judge did not return score for trajectory", {
          expectedTrajId,
          receivedIds: judgeResponse.scores.map((s) => s.trajectory_id),
        }, "RulerScoring");
        continue;
      }
      await getTrainingDataAdapter().updateTrajectoryScore(
        rt.traj.trajectoryId,
        Math.max(0, Math.min(1, scoreData.score)),
        scoreData.explanation,
      );
      scored++;
    }

    logger.info("Scored trajectory group", {
      scenarioId,
      scored,
      groupSize: richTrajectories.length,
    }, "RulerScoring");
    return scored;
  }

  /**
   * Build judge prompt with trajectory context.
   *
   * Injects game knowledge (P&L, episode length, actions) into the prompt
   * so the judge can make informed relative comparisons.
   *
   * @param {Array<{traj: object, messages: Array, archetype: string}>} richTrajectories
   * @param {Array} commonPrefix - Messages shared by every trajectory.
   * @param {string} scenarioId - Scenario shared by the group.
   * @returns {string} JSON string of `{ system, user }` prompt parts.
   */
  buildJudgePrompt(richTrajectories, commonPrefix, scenarioId) {
    // Build context section with game knowledge (injected into prompt)
    const contextParts = [];
    contextParts.push(`Scenario: ${scenarioId}`);
    contextParts.push(`\nTrajectory Performance Context (use this to inform your scoring):`);
    for (const [index, rt] of richTrajectories.entries()) {
      if (!rt) {
        continue;
      }
      const trajId = `trajectory-${index + 1}`;
      // Number() lets a P&L stored as a numeric string format cleanly.
      const finalPnL = Number(rt.traj.metrics.finalPnL);
      const pnlText = Number.isFinite(finalPnL) ? finalPnL.toFixed(2) : "0.00";
      contextParts.push(`\n${trajId}:`);
      contextParts.push(` - Archetype: ${rt.archetype}`);
      contextParts.push(` - Final P&L: $${pnlText}`);
      contextParts.push(` - Episode Length: ${rt.traj.metrics.episodeLength || 0} steps`);
      contextParts.push(` - Total Reward: ${rt.traj.totalReward.toFixed(2)}`);

      const actionTypes = rt.traj.steps
        .filter((s) => !!s.action)
        .map((s) => s.action?.actionType);
      const uniqueActions = [...new Set(actionTypes)];
      contextParts.push(` - Actions Taken: ${uniqueActions.join(", ")} (${actionTypes.length} total)`);

      // Add success/error info
      const stepCount = rt.traj.steps.length;
      const errors = rt.traj.steps.filter((s) => !!s.action && !s.action.success).length;
      const successRate = stepCount > 0
        ? (((stepCount - errors) / stepCount) * 100).toFixed(1)
        : "0";
      contextParts.push(` - Success Rate: ${successRate}%`);
      if (errors > 0) {
        contextParts.push(` - Errors: ${errors}`);
      }
    }

    // Build trajectory messages (with deduplicated prefix)
    const trajectorySections = [];
    for (const [index, rt] of richTrajectories.entries()) {
      if (!rt) {
        continue;
      }
      const trajId = `trajectory-${index + 1}`;
      // Remove common prefix from messages
      const uniqueMessages = rt.messages.slice(commonPrefix.length);
      // Truncate very long messages to save tokens (keep last 20 messages max)
      const truncatedMessages = uniqueMessages.slice(-20);
      trajectorySections.push(`<trajectory id="${trajId}">`);
      trajectorySections.push(JSON.stringify(truncatedMessages, null, 2));
      trajectorySections.push(`</trajectory>`);
    }

    // Build full prompt
    const userContent = commonPrefix.length > 0
      ? `<context>\n${JSON.stringify(commonPrefix, null, 2)}\n</context>\n\n`
      : "";
    const prompt = `${userContent}${contextParts.join("\n")}\n\nTrajectories:\n\n${trajectorySections.join("\n\n")}`;

    // If all trajectories share one non-default archetype, use that
    // archetype's rubric; otherwise fall back to the default rubric.
    const archetypes = [...new Set(richTrajectories.map((rt) => rt.archetype))];
    const isSingleArchetype = archetypes.length === 1 && archetypes[0] !== "default";
    const singleArchetype = archetypes[0];
    const rubric = isSingleArchetype && singleArchetype
      ? getRubric(singleArchetype)
      : DEFAULT_RUBRIC;
    const archetypeContext = isSingleArchetype
      ? `\n\nYou are evaluating ${archetypes[0]?.toUpperCase()} agents. Score them based on how well they embody that archetype's behavior and goals.`
      : archetypes.length > 1
        ? `\n\nNote: This group contains mixed archetypes (${archetypes.join(", ")}). Consider each agent's archetype when scoring.`
        : "";
    const systemPrompt = `You are an expert evaluator of AI agent performance. All trajectories below were given the same goal/scenario. Your job is to compare them and assign scores from 0 to 1 based on how well each trajectory achieved its goal.${archetypeContext}

Grading standards:
${rubric}

Important: Use the performance context provided (P&L, episode length, success rate, archetype) to inform your scoring, but also consider the quality of decision-making, efficiency, and goal achievement shown in the trajectory messages.`;
    return JSON.stringify({
      system: systemPrompt,
      user: prompt,
    });
  }

  /**
   * Call LLM judge to score trajectories.
   *
   * The judge is asked for a JSON object; code fences are stripped and the
   * outermost `{...}` span is parsed. Scores are coerced to numbers and
   * clamped to [0, 1].
   *
   * @param {string} promptJson - Output of `buildJudgePrompt`.
   * @returns {Promise<{scores: Array}|null>} Parsed response, or null when
   *   the reply holds no JSON object, is not valid JSON, lacks a `scores`
   *   array, or contains an entry without a numeric score.
   */
  async callJudge(promptJson) {
    const promptData = JSON.parse(promptJson);
    const structuredPrompt = `${promptData.user}

Please respond with ONLY a valid JSON object in this exact format:
{
"scores": [
{
"trajectory_id": "trajectory-1",
"explanation": "Brief explanation of score",
"score": 0.85
},
{
"trajectory_id": "trajectory-2",
"explanation": "Brief explanation of score",
"score": 0.65
}
]
}

Return ONLY the JSON, no other text.`;
    const llmCaller = getLLMCaller();
    const response = await llmCaller.callGroqDirect({
      prompt: structuredPrompt,
      system: promptData.system,
      modelSize: "large",
      temperature: 0.3,
      maxTokens: 2000,
      actionType: "ruler_score_trajectories",
    });

    const responseText = typeof response === "string" ? response : "";
    const jsonText = responseText
      .trim()
      .replace(/```json\n?/g, "")
      .replace(/```\n?/g, "")
      .trim();
    const jsonMatch = jsonText.match(/\{[\s\S]*\}/);
    if (!jsonMatch) {
      logger.error("Judge response does not contain JSON", {
        response: responseText.substring(0, 500),
      }, "RulerScoring");
      return null;
    }

    let parsed;
    try {
      parsed = JSON.parse(jsonMatch[0]);
    } catch (err) {
      // LLM output is untrusted text; malformed JSON is an expected failure.
      logger.error("Judge response is not valid JSON", {
        response: responseText.substring(0, 500),
        error: err instanceof Error ? err.message : String(err),
      }, "RulerScoring");
      return null;
    }
    if (!parsed || !Array.isArray(parsed.scores)) {
      logger.error("Invalid judge response structure", { parsed }, "RulerScoring");
      return null;
    }

    for (const entry of parsed.scores) {
      const normalized =
        entry !== null && typeof entry === "object"
          ? RulerScoringService.#normalizeScore(entry.score)
          : null;
      if (normalized === null) {
        // A non-numeric score would otherwise be persisted as NaN.
        logger.error("Invalid judge response structure", { parsed }, "RulerScoring");
        return null;
      }
      entry.score = normalized;
    }
    return parsed;
  }

  /**
   * Extract common message prefix from trajectories.
   *
   * A message belongs to the prefix when every list has a message at the same
   * index with the same `role` and `content`.
   *
   * @param {Array<Array<{role: string, content: *}>>} messageLists
   * @returns {Array} Leading messages shared by all lists (possibly empty).
   */
  extractCommonPrefix(messageLists) {
    if (messageLists.length === 0) {
      return [];
    }
    const first = messageLists[0];
    if (!first) {
      return [];
    }
    const prefix = [];
    for (const [index, msg] of first.entries()) {
      // Stop at the first gap or mismatch: callers slice by prefix length, so
      // the prefix has to be contiguous.
      if (!msg) {
        break;
      }
      const allMatch = messageLists.every(
        (msgs) =>
          msgs[index] &&
          msgs[index]?.role === msg.role &&
          msgs[index]?.content === msg.content,
      );
      if (!allMatch) {
        break;
      }
      prefix.push(msg);
    }
    return prefix;
  }

  /**
   * Group trajectories by scenarioId; rows without one go to "default".
   *
   * @param {object[]} trajectoriesData - Stored trajectory rows.
   * @returns {Array<{scenarioId: string, trajectories: object[]}>}
   */
  groupByScenario(trajectoriesData) {
    const groups = new Map();
    for (const traj of trajectoriesData) {
      const scenarioId = traj.scenarioId || "default";
      if (!groups.has(scenarioId)) {
        groups.set(scenarioId, []);
      }
      groups.get(scenarioId)?.push(traj);
    }
    return Array.from(groups.entries()).map(([scenarioId, trajs]) => ({
      scenarioId,
      trajectories: trajs,
    }));
  }

  /**
   * Get trajectories to score from the training data adapter.
   *
   * @param {string[]} [trajectoryIds] - Optional id filter; ignored when empty.
   * @returns {Promise<object[]>} Unscored trajectory rows.
   */
  async getTrajectoriesToScore(trajectoryIds) {
    const adapter = getTrainingDataAdapter();
    return await adapter.getUnscoredTrajectories(
      trajectoryIds && trajectoryIds.length > 0 ? { trajectoryIds } : undefined,
    );
  }

  /**
   * Score all unscored trajectories in a time window.
   *
   * @param {string} windowId - Window to score.
   * @returns {Promise<number>} Number of trajectories scored.
   */
  async scoreWindow(windowId) {
    const trajectoryIds = await getTrainingDataAdapter().getUnscoredWindowTrajectoryIds(windowId);
    if (trajectoryIds.length === 0) {
      return 0;
    }
    return await this.scoreTrajectories(trajectoryIds);
  }
}

/**
 * Singleton instance of RulerScoringService
 */
export const rulerScoringService = new RulerScoringService();
@@ -1,108 +0,0 @@
1
- /**
2
- * Training Monitor Service
3
- *
4
- * Tracks training job progress and updates database with status.
5
- * Monitors Python training process and W&B runs.
6
- */
7
- import { getTrainingDataAdapter } from "../adapter";
8
- import { logger } from "../utils/logger";
9
/**
 * Thin service over the training data adapter for recording and reading the
 * status of training batches.
 */
export class TrainingMonitor {
  /** Coarse completion fraction per batch status; unlisted statuses map to 0. */
  static #STATUS_PROGRESS = new Map([
    ["pending", 0],
    ["preparing", 0.1],
    ["scoring", 0.3],
    ["training", 0.6],
    ["uploading", 0.9],
    ["completed", 1.0],
    ["failed", 0],
  ]);

  /**
   * Mark a batch as "training" and log that monitoring began.
   *
   * @param {string} batchId - Batch to monitor.
   * @returns {Promise<void>}
   */
  async startMonitoring(batchId) {
    await getTrainingDataAdapter().updateBatchStatus(batchId, "training");
    logger.info("Started monitoring training job", { batchId }, "TrainingMonitor");
  }

  /**
   * Record a progress report for a batch.
   *
   * The batch status is written only when the report carries a status; the
   * report's error is forwarded only for the "failed" status.
   *
   * @param {string} batchId - Batch being reported on.
   * @param {{status?: string, progress?: number, error?: string}} progress
   * @returns {Promise<void>}
   */
  async updateProgress(batchId, progress) {
    if (progress.status) {
      const adapter = getTrainingDataAdapter();
      let failureMessage;
      if (progress.status === "failed") {
        failureMessage = progress.error;
      }
      await adapter.updateBatchStatus(batchId, progress.status, failureMessage);
    }
    const details = {
      batchId,
      status: progress.status,
      progress: progress.progress,
    };
    logger.info("Updated training progress", details, "TrainingMonitor");
  }

  /**
   * Read the current progress snapshot of a batch.
   *
   * @param {string} batchId - Batch to look up.
   * @returns {Promise<object|null>} `{ batchId, status, progress, loss, eta,
   *   error }`, or null when the adapter has no such batch. `eta` is the
   *   remaining milliseconds against a fixed 2-hour average and is only set
   *   for a "training" batch that has a `startedAt`.
   */
  async getProgress(batchId) {
    const batch = await getTrainingDataAdapter().getBatchById(batchId);
    if (!batch) {
      return null;
    }

    // Progress is derived purely from the status.
    const progress = TrainingMonitor.#STATUS_PROGRESS.get(batch.status) ?? 0;

    // Estimate ETA based on average training time
    const isTimedRun = batch.status === "training" && batch.startedAt;
    let eta;
    if (isTimedRun) {
      const averageRunMs = 2 * 60 * 60 * 1000; // 2 hours average
      const elapsedMs = Date.now() - batch.startedAt.getTime();
      eta = Math.max(0, averageRunMs - elapsedMs);
    }

    return {
      batchId,
      status: batch.status,
      progress,
      loss: batch.trainingLoss ?? undefined,
      eta,
      error: batch.error ?? undefined,
    };
  }

  /**
   * Ask the adapter for training batches considered stuck, using a 4-hour
   * threshold, and warn when any are found.
   *
   * @returns {Promise<Array>} The stuck batches as returned by the adapter.
   */
  async checkForStuckJobs() {
    const stuckThresholdMs = 4 * 60 * 60 * 1000;
    const stuckJobs = await getTrainingDataAdapter().getStuckTrainingBatches(stuckThresholdMs);
    const hasStuckJobs = stuckJobs.length > 0;
    if (hasStuckJobs) {
      logger.warn("Found stuck training jobs", {
        count: stuckJobs.length,
        jobs: stuckJobs,
      }, "TrainingMonitor");
    }
    return stuckJobs;
  }

  /**
   * Cancel a training job by marking its batch "failed" with the reason.
   *
   * @param {string} batchId - Batch to cancel.
   * @param {string} reason - Human-readable cancellation reason.
   * @returns {Promise<void>}
   */
  async cancelJob(batchId, reason) {
    const message = `Cancelled: ${reason}`;
    await getTrainingDataAdapter().updateBatchStatus(batchId, "failed", message);
    logger.warn("Training job cancelled", { batchId, reason }, "TrainingMonitor");
  }
}

// Singleton
export const trainingMonitor = new TrainingMonitor();