@elizaos/training 2.0.0-alpha.77 → 2.0.0-alpha.78
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/.turbo/turbo-lint.log +0 -3
- package/.turbo/turbo-typecheck.log +0 -1
- package/dist/.tsbuildinfo +0 -1
- package/dist/adapter.js +0 -59
- package/dist/archetypes/ArchetypeConfigService.js +0 -510
- package/dist/archetypes/derive-archetype.js +0 -196
- package/dist/archetypes/index.js +0 -7
- package/dist/benchmark/ArchetypeMatchupBenchmark.js +0 -547
- package/dist/benchmark/BenchmarkChartGenerator.js +0 -632
- package/dist/benchmark/BenchmarkDataGenerator.js +0 -825
- package/dist/benchmark/BenchmarkDataViewer.js +0 -197
- package/dist/benchmark/BenchmarkHistoryService.js +0 -135
- package/dist/benchmark/BenchmarkRunner.js +0 -483
- package/dist/benchmark/BenchmarkValidator.js +0 -158
- package/dist/benchmark/FastEvalRunner.js +0 -133
- package/dist/benchmark/MetricsValidator.js +0 -104
- package/dist/benchmark/MetricsVisualizer.js +0 -775
- package/dist/benchmark/ModelBenchmarkService.js +0 -433
- package/dist/benchmark/ModelRegistry.js +0 -122
- package/dist/benchmark/RulerBenchmarkIntegration.js +0 -168
- package/dist/benchmark/SimulationA2AInterface.js +0 -683
- package/dist/benchmark/SimulationEngine.js +0 -522
- package/dist/benchmark/TaskRunner.js +0 -60
- package/dist/benchmark/__tests__/BenchmarkRunner.test.js +0 -409
- package/dist/benchmark/__tests__/HeadToHead.test.js +0 -105
- package/dist/benchmark/index.js +0 -23
- package/dist/benchmark/parseSimulationMetrics.js +0 -86
- package/dist/benchmark/simulation-types.js +0 -1
- package/dist/dependencies.js +0 -197
- package/dist/generation/TrajectoryGenerator.js +0 -244
- package/dist/generation/index.js +0 -6
- package/dist/huggingface/HuggingFaceDatasetUploader.js +0 -463
- package/dist/huggingface/HuggingFaceIntegrationService.js +0 -272
- package/dist/huggingface/HuggingFaceModelUploader.js +0 -385
- package/dist/huggingface/index.js +0 -9
- package/dist/huggingface/shared/HuggingFaceUploadUtil.js +0 -144
- package/dist/index.js +0 -41
- package/dist/init-training.js +0 -43
- package/dist/metrics/TrajectoryMetricsExtractor.js +0 -523
- package/dist/metrics/__tests__/TrajectoryMetricsExtractor.test.js +0 -628
- package/dist/metrics/index.js +0 -7
- package/dist/metrics/types.js +0 -21
- package/dist/rubrics/__tests__/index.test.js +0 -150
- package/dist/rubrics/ass-kisser.js +0 -83
- package/dist/rubrics/degen.js +0 -78
- package/dist/rubrics/goody-twoshoes.js +0 -82
- package/dist/rubrics/index.js +0 -184
- package/dist/rubrics/information-trader.js +0 -82
- package/dist/rubrics/infosec.js +0 -99
- package/dist/rubrics/liar.js +0 -102
- package/dist/rubrics/perps-trader.js +0 -85
- package/dist/rubrics/researcher.js +0 -79
- package/dist/rubrics/scammer.js +0 -80
- package/dist/rubrics/social-butterfly.js +0 -71
- package/dist/rubrics/super-predictor.js +0 -95
- package/dist/rubrics/trader.js +0 -65
- package/dist/scoring/ArchetypeScoringService.js +0 -301
- package/dist/scoring/JudgePromptBuilder.js +0 -401
- package/dist/scoring/LLMJudgeCache.js +0 -263
- package/dist/scoring/index.js +0 -8
- package/dist/training/AutomationPipeline.js +0 -714
- package/dist/training/BenchmarkService.js +0 -370
- package/dist/training/ConfigValidator.js +0 -153
- package/dist/training/MarketOutcomesTracker.js +0 -142
- package/dist/training/ModelDeployer.js +0 -128
- package/dist/training/ModelFetcher.js +0 -48
- package/dist/training/ModelSelectionService.js +0 -248
- package/dist/training/ModelUsageVerifier.js +0 -106
- package/dist/training/MultiModelOrchestrator.js +0 -349
- package/dist/training/RLModelConfig.js +0 -295
- package/dist/training/RewardBackpropagationService.js +0 -117
- package/dist/training/RulerScoringService.js +0 -450
- package/dist/training/TrainingMonitor.js +0 -108
- package/dist/training/TrajectoryRecorder.js +0 -281
- package/dist/training/__tests__/TrajectoryRecorder.test.js +0 -363
- package/dist/training/index.js +0 -30
- package/dist/training/logRLConfig.js +0 -29
- package/dist/training/pipeline.js +0 -80
- package/dist/training/storage/ModelStorageService.js +0 -190
- package/dist/training/storage/TrainingDataArchiver.js +0 -136
- package/dist/training/storage/index.js +0 -7
- package/dist/training/types.js +0 -6
- package/dist/training/window-utils.js +0 -100
- package/dist/utils/index.js +0 -73
- package/dist/utils/logger.js +0 -55
- package/dist/utils/snowflake.js +0 -15
- package/dist/utils/synthetic-detector.js +0 -67
- package/vitest.config.ts +0 -8
|
@@ -1,117 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Reward Backpropagation Service
|
|
3
|
-
*
|
|
4
|
-
* Updates trajectory rewards when market outcomes become known.
|
|
5
|
-
* This allows the RL model to learn from actual results, not just immediate actions.
|
|
6
|
-
*/
|
|
7
|
-
import { getMarketDataAdapter, getTrainingDataAdapter } from "../adapter";
|
|
8
|
-
import { logger } from "../utils/logger";
|
|
9
|
-
import { MarketOutcomesTracker } from "./MarketOutcomesTracker";
|
|
10
|
-
/** Substrings of `actionType` that mark a step as a trade eligible for outcome-based rewards. */
const TRADE_ACTION_MARKERS = ["TRADING", "BUY", "SELL"];
/** Percentage price move at which the perpetual reward saturates at +/-1. */
const PRICE_CHANGE_SATURATION_PERCENT = 10;
/**
 * Index a list of outcome records by one of their properties.
 * The first record wins on duplicate keys, matching `Array.prototype.find`.
 *
 * @param {Array<object>|undefined|null} items - Outcome records (may be absent).
 * @param {string} key - Property name to index by.
 * @returns {Map<unknown, object>} Lookup from key value to record.
 */
function indexFirstBy(items, key) {
    const index = new Map();
    for (const item of items ?? []) {
        const value = item?.[key];
        if (!index.has(value)) {
            index.set(value, item);
        }
    }
    return index;
}
/**
 * Parse the serialized steps of a trajectory.
 *
 * @param {object} traj - Trajectory row; only `id` and `stepsJson` are read.
 * @returns {Array|null} The parsed step array, or null when `stepsJson` is
 *   empty, malformed, or does not contain an array.
 */
function parseTrajectorySteps(traj) {
    if (!traj.stepsJson) {
        return null;
    }
    try {
        const parsed = JSON.parse(traj.stepsJson);
        if (Array.isArray(parsed)) {
            return parsed;
        }
    }
    catch {
        // Fall through to the shared warning below.
    }
    // One corrupt row must not abort reward updates for the rest of the window.
    logger.warn("Skipping trajectory with unparseable stepsJson", {
        trajectoryId: traj.id,
    });
    return null;
}
/**
 * Work out the outcome-based reward for one step.
 *
 * @param {object} step - Parsed trajectory step.
 * @param {Map} predictionsByMarket - Prediction outcomes keyed by marketId.
 * @param {Map} stocksByTicker - Stock outcomes keyed by ticker.
 * @returns {number|undefined} Reward in [-1, 1], or undefined when the step
 *   is not a trade or no usable outcome applies (reward stays untouched).
 */
function resolveOutcomeReward(step, predictionsByMarket, stocksByTicker) {
    const actionType = step?.action?.actionType;
    if (typeof actionType !== "string" ||
        !TRADE_ACTION_MARKERS.some((marker) => actionType.includes(marker))) {
        return undefined;
    }
    const { marketId, ticker, side } = step.action.parameters ?? {};
    if (marketId) {
        // Prediction market: +1 when the traded side matches the resolved outcome, -1 otherwise.
        const prediction = predictionsByMarket.get(marketId);
        // An unknown side cannot be judged, so it is left alone instead of being penalized.
        if (!prediction || (side !== "YES" && side !== "NO")) {
            return undefined;
        }
        return side === prediction.outcome ? 1.0 : -1.0;
    }
    if (ticker) {
        // Perpetual: reward follows the price move, inverted for shorts.
        const stock = stocksByTicker.get(ticker);
        if (!stock) {
            return undefined;
        }
        const priceChange = Number(stock.changePercent);
        if (!Number.isFinite(priceChange)) {
            return undefined; // would otherwise persist NaN as the reward
        }
        const normalized = priceChange / PRICE_CHANGE_SATURATION_PERCENT;
        if (side === "long") {
            return Math.max(-1, Math.min(1, normalized));
        }
        if (side === "short") {
            return Math.max(-1, Math.min(1, -normalized));
        }
    }
    return undefined;
}
export class RewardBackpropagationService {
    outcomesTracker;
    constructor() {
        this.outcomesTracker = new MarketOutcomesTracker();
    }
    /**
     * Update rewards for trajectories in a window when outcomes become known.
     *
     * Only trajectories flagged `isTrainingData` are considered. Trading steps
     * get their reward replaced by an outcome-based value; a trajectory is
     * written back (steps JSON plus summed reward) only if at least one step
     * reward actually changed.
     *
     * @param {string} windowId - Window whose trajectories should be updated.
     * @returns {Promise<number>} Number of trajectories written back.
     */
    async updateRewardsForWindow(windowId) {
        logger.info("Updating rewards for window", { windowId });
        const outcomes = await this.outcomesTracker.getWindowOutcomes(windowId);
        if (!outcomes) {
            logger.info("No outcomes found for window", { windowId });
            return 0;
        }
        const adapter = getTrainingDataAdapter();
        const allTrajectories = await adapter.getTrajectoriesByWindow(windowId);
        const trajectoriesResult = allTrajectories.filter((t) => t.isTrainingData);
        // Build the lookups once instead of scanning the outcome lists for every step.
        const predictionsByMarket = indexFirstBy(outcomes.predictions, "marketId");
        const stocksByTicker = indexFirstBy(outcomes.stocks, "ticker");
        let updated = 0;
        for (const traj of trajectoriesResult) {
            const steps = parseTrajectorySteps(traj);
            if (!steps) {
                continue;
            }
            let totalReward = 0;
            let hasUpdates = false;
            for (const step of steps) {
                if (step === null || typeof step !== "object") {
                    continue;
                }
                const outcomeReward = resolveOutcomeReward(step, predictionsByMarket, stocksByTicker);
                if (outcomeReward !== undefined && outcomeReward !== step.reward) {
                    step.reward = outcomeReward;
                    hasUpdates = true;
                }
                // A step without a numeric reward contributes nothing rather than poisoning the sum with NaN.
                totalReward += Number.isFinite(step.reward) ? step.reward : 0;
            }
            if (hasUpdates) {
                await adapter.updateTrajectoryRewards(traj.id, JSON.stringify(steps), totalReward);
                updated++;
            }
        }
        logger.info("Updated rewards for trajectories", {
            windowId,
            updated,
            total: trajectoriesResult.length,
        });
        return updated;
    }
    /**
     * Run `updateRewardsForWindow` for every window the market adapter reports
     * as having outcomes.
     *
     * @returns {Promise<number>} Number of windows in which at least one
     *   trajectory was updated; 0 when no market adapter is available.
     */
    async processPendingWindows() {
        const marketAdapter = getMarketDataAdapter();
        if (!marketAdapter) {
            return 0;
        }
        const windowIds = await marketAdapter.getDistinctWindowsWithOutcomes();
        let processed = 0;
        for (const windowId of windowIds) {
            const updated = await this.updateRewardsForWindow(windowId);
            if (updated > 0) {
                processed++;
            }
        }
        return processed;
    }
}
|
|
117
|
-
/** Shared module-level instance; constructing it creates a MarketOutcomesTracker as soon as this module is loaded. */
export const rewardBackpropagationService = new RewardBackpropagationService();
|
|
@@ -1,450 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* RULER Scoring Service
|
|
3
|
-
*
|
|
4
|
-
* Implements RULER (Relative Universal LLM-Elicited Rewards) using LLM-as-judge.
|
|
5
|
-
*
|
|
6
|
-
* Key features:
|
|
7
|
-
* - Groups trajectories by scenarioId for relative comparison
|
|
8
|
-
* - Uses LLM judge to score trajectories relative to each other (0-1)
|
|
9
|
-
* - Injects game context (P&L, episode length, actions) into judge prompt
|
|
10
|
-
* - Deduplicates common message prefixes to save tokens
|
|
11
|
-
* - Works with any LiteLLM-compatible provider (Groq, OpenAI, etc.)
|
|
12
|
-
*
|
|
13
|
-
* Based on: https://art.openpipe.ai/fundamentals/ruler
|
|
14
|
-
*/
|
|
15
|
-
import { getTrainingDataAdapter } from "../adapter";
|
|
16
|
-
/**
 * Identity helper standing in for the `asUUID` cast from @elizaos/core.
 * Performs no validation or conversion.
 *
 * @param {string} id - Identifier to treat as a UUID.
 * @returns {string} The same value that was passed in.
 */
function asUUID(id) {
    const uuid = id;
    return uuid;
}
|
|
20
|
-
import { v4 as uuidv4 } from "uuid";
|
|
21
|
-
import { getLLMCaller, getToTrainingMessages, } from "../dependencies";
|
|
22
|
-
import { getRubric, sanitizeArchetype } from "../rubrics";
|
|
23
|
-
import { logger, splitIntoBatches } from "../utils";
|
|
24
|
-
/**
 * Generic RULER grading guidance, used when no archetype-specific rubric applies.
 * The value starts and ends with a newline, with one bullet per line in between.
 */
const DEFAULT_RUBRIC = [
    "",
    "- A trajectory that achieves its goal should always get a significantly higher score than a trajectory that does not achieve its goal.",
    "- A trajectory that achieves its goal more efficiently (eg. by avoiding unproductive detours) should get a higher score than a trajectory that achieves its goal less efficiently.",
    "- If one trajectory is only slightly better than another, the difference in scores should be small. If it is significantly better, the difference in scores should be large.",
    "- You may give some partial credit for a trajectory that makes progress towards its goal but does not complete it.",
    "",
].join("\n");
|
|
33
|
-
export class RulerScoringService {
    minGroupSize = 2; // Minimum trajectories per group for comparison
    maxGroupSize = 8; // Optimal group size per RULER docs
    /**
     * Score trajectories using RULER (LLM-as-judge with relative comparison).
     *
     * Trajectories are grouped by scenarioId, each group is split into batches
     * of at most `maxGroupSize`, and every batch is judged on its own.
     *
     * @param trajectoryIds - Optional: specific trajectory IDs to score. If not provided, scores all unscored trajectories.
     * @returns Number of trajectories successfully scored
     */
    async scoreTrajectories(trajectoryIds) {
        const trajectoriesResult = await this.getTrajectoriesToScore(trajectoryIds);
        if (trajectoriesResult.length === 0) {
            logger.info("No trajectories to score", {}, "RulerScoring");
            return 0;
        }
        const groups = this.groupByScenario(trajectoriesResult);
        logger.info("Grouped trajectories for RULER scoring", {
            totalTrajectories: trajectoriesResult.length,
            groups: groups.length,
            avgGroupSize: groups.length > 0 ? trajectoriesResult.length / groups.length : 0,
        }, "RulerScoring");
        let totalScored = 0;
        for (const group of groups) {
            if (group.trajectories.length < this.minGroupSize) {
                logger.warn("Skipping group with insufficient trajectories", {
                    scenarioId: group.scenarioId,
                    count: group.trajectories.length,
                    minRequired: this.minGroupSize,
                }, "RulerScoring");
                continue;
            }
            const batches = splitIntoBatches(group.trajectories, this.balancedBatchSize(group.trajectories.length));
            for (const batch of batches) {
                totalScored += await this.scoreGroup(batch, group.scenarioId);
            }
        }
        logger.info("RULER scoring complete", {
            totalScored,
            totalTrajectories: trajectoriesResult.length,
        }, "RulerScoring");
        return totalScored;
    }
    /**
     * Pick a batch size that spreads `count` trajectories evenly over the
     * smallest number of batches that respects `maxGroupSize`.
     *
     * Chunking by `maxGroupSize` directly can leave a tiny remainder batch
     * (9 -> 8 + 1) that falls below `minGroupSize` and is never scored;
     * balancing gives 5 + 4 instead.
     *
     * @param {number} count - Number of trajectories in the group.
     * @returns {number} Batch size, never above `maxGroupSize` and at least 1.
     */
    balancedBatchSize(count) {
        const cap = Math.max(1, Math.floor(this.maxGroupSize));
        const batchCount = Math.max(1, Math.ceil(count / cap));
        return Math.max(1, Math.ceil(count / batchCount));
    }
    /**
     * Score a single trajectory (for backward compatibility).
     *
     * Delegates to `scoreTrajectories` with a one-element ID list, then reads
     * the stored judge result back from the adapter.
     *
     * @param {string} trajectoryId - Trajectory to score.
     * @returns {Promise<object|null>} `{ trajectoryId, overallScore, reasoning, scoredAt }`,
     *   or null when nothing was scored or no judge reward is stored.
     */
    async scoreTrajectory(trajectoryId) {
        const scored = await this.scoreTrajectories([trajectoryId]);
        if (scored === 0) {
            return null;
        }
        const updated = await getTrainingDataAdapter().getTrajectoryById(trajectoryId);
        // `== null` also covers adapters that report a missing reward as undefined.
        if (!updated || updated.aiJudgeReward == null) {
            return null;
        }
        return {
            trajectoryId: updated.trajectoryId,
            overallScore: updated.aiJudgeReward,
            reasoning: updated.aiJudgeReasoning || "",
            scoredAt: updated.judgedAt || new Date(),
        };
    }
    /**
     * Parse a stored `stepsJson` value.
     *
     * @param {string|null|undefined} stepsJson - Serialized step list.
     * @returns {Array|null} Non-empty step array, or null when the value is
     *   missing, malformed, not an array, or empty.
     */
    parseStepsJson(stepsJson) {
        if (!stepsJson || stepsJson === "null" || stepsJson === "[]") {
            return null;
        }
        try {
            const parsed = JSON.parse(stepsJson);
            return Array.isArray(parsed) && parsed.length > 0 ? parsed : null;
        }
        catch {
            return null;
        }
    }
    /**
     * Convert a stored trajectory row and its parsed steps into the rich
     * trajectory shape handed to the training-message converter.
     *
     * Sparse steps are tolerated: a missing `environmentState` yields
     * `agentPoints: 0`, a missing `action` is passed through as undefined
     * (buildJudgePrompt already treats it as optional), and a non-numeric
     * reward counts as 0 so totals never become NaN.
     *
     * @param {object} dbTraj - Stored trajectory row.
     * @param {Array} steps - Parsed, non-empty step list.
     * @returns {object} Rich trajectory with freshly generated step/call IDs.
     */
    toRichTrajectory(dbTraj, steps) {
        // Steps without their own timestamp get "now + index" so ordering is preserved.
        const fallbackTimestamp = Date.now();
        const rewardOf = (step) => (Number.isFinite(step?.reward) ? step.reward : 0);
        const totalReward = steps.reduce((sum, step) => sum + rewardOf(step), 0);
        return {
            trajectoryId: asUUID(dbTraj.trajectoryId),
            agentId: asUUID(uuidv4()),
            startTime: 0,
            endTime: 0,
            durationMs: 0,
            scenarioId: dbTraj.scenarioId || undefined,
            steps: steps.map((rawStep, idx) => {
                const s = rawStep ?? {};
                const timestamp = s.timestamp || fallbackTimestamp + idx;
                return {
                    stepId: asUUID(uuidv4()),
                    stepNumber: idx,
                    timestamp,
                    environmentState: {
                        ...s.environmentState,
                        timestamp,
                        agentPoints: s.environmentState?.agentPoints ?? 0,
                    },
                    observation: {},
                    providerAccesses: (s.providerAccesses || []).map((p) => ({
                        providerId: uuidv4(),
                        providerName: p.providerName,
                        timestamp,
                        // NOTE(review): query mirrors data, as in the original; confirm stored accesses carry no separate query.
                        query: p.data,
                        data: p.data,
                        purpose: p.purpose,
                    })),
                    llmCalls: (s.llmCalls || []).map((l) => ({
                        callId: uuidv4(),
                        timestamp,
                        model: l.model,
                        modelVersion: l.modelVersion,
                        systemPrompt: l.systemPrompt,
                        userPrompt: l.userPrompt,
                        response: l.response,
                        reasoning: l.reasoning,
                        temperature: l.temperature,
                        maxTokens: l.maxTokens,
                        latencyMs: l.latencyMs,
                        purpose: l.purpose,
                        actionType: l.actionType,
                    })),
                    action: s.action
                        ? {
                            attemptId: uuidv4(),
                            timestamp,
                            actionType: s.action.actionType,
                            actionName: s.action.actionType,
                            parameters: s.action.parameters,
                            reasoning: s.action.reasoning,
                            success: s.action.success,
                            result: s.action.result,
                            error: s.action.error,
                        }
                        : undefined,
                    reward: rewardOf(s),
                    done: idx === steps.length - 1,
                    metadata: {},
                };
            }),
            totalReward,
            rewardComponents: {
                environmentReward: totalReward,
            },
            metrics: {
                episodeLength: dbTraj.episodeLength || steps.length,
                finalStatus: "completed",
                finalPnL: dbTraj.finalPnL || undefined,
            },
            metadata: {
                isTrainingData: true,
            },
        };
    }
    /**
     * Score a group of trajectories using RULER.
     *
     * 1. Convert trajectories to message format
     * 2. Extract common prefix (deduplication)
     * 3. Build judge prompt with context (P&L, episode length, etc.)
     * 4. Call LLM judge to score trajectories relative to each other
     * 5. Save scores through the training data adapter
     *
     * @param {Array<object>} trajectoriesData - Stored trajectory rows of one batch.
     * @param {string} scenarioId - Scenario the batch belongs to.
     * @returns {Promise<number>} Number of trajectories whose score was saved;
     *   0 when too few rows are usable or the judge reply is unusable.
     */
    async scoreGroup(trajectoriesData, scenarioId) {
        const richTrajectories = [];
        let toARTMessages;
        for (const dbTraj of trajectoriesData) {
            const steps = this.parseStepsJson(dbTraj.stepsJson);
            if (!steps) {
                logger.warn("Skipping trajectory with invalid stepsJson", {
                    trajectoryId: dbTraj.trajectoryId,
                }, "RulerScoring");
                continue;
            }
            // Resolved lazily so the converter is only required once a usable trajectory exists.
            toARTMessages ??= getToTrainingMessages();
            const richTraj = this.toRichTrajectory(dbTraj, steps);
            const messages = toARTMessages(richTraj);
            // Sanitize archetype to prevent prompt injection and handle null/empty values
            const archetype = sanitizeArchetype(dbTraj.archetype);
            richTrajectories.push({ traj: richTraj, messages, archetype });
        }
        if (richTrajectories.length < this.minGroupSize) {
            logger.warn("Insufficient valid trajectories in group", {
                scenarioId,
                validCount: richTrajectories.length,
            }, "RulerScoring");
            return 0;
        }
        const commonPrefix = this.extractCommonPrefix(richTrajectories.map((rt) => rt.messages));
        const judgePrompt = this.buildJudgePrompt(richTrajectories, commonPrefix, scenarioId);
        const judgeResponse = await this.callJudge(judgePrompt);
        if (!judgeResponse ||
            judgeResponse.scores.length !== richTrajectories.length) {
            logger.error("Invalid judge response", {
                expectedScores: richTrajectories.length,
                receivedScores: judgeResponse?.scores.length || 0,
            }, "RulerScoring");
            return 0;
        }
        const scoreMap = new Map(judgeResponse.scores.map((score) => [score.trajectory_id, score]));
        const adapter = getTrainingDataAdapter();
        let scored = 0;
        for (const [index, rt] of richTrajectories.entries()) {
            // The prompt labels trajectories "trajectory-1..N" in this same order.
            const expectedTrajId = `trajectory-${index + 1}`;
            const scoreData = scoreMap.get(expectedTrajId);
            if (!scoreData) {
                logger.warn("Judge did not return score for trajectory", {
                    expectedTrajId,
                    receivedIds: judgeResponse.scores.map((s) => s.trajectory_id),
                }, "RulerScoring");
                continue;
            }
            await adapter.updateTrajectoryScore(rt.traj.trajectoryId, Math.max(0, Math.min(1, scoreData.score)), scoreData.explanation);
            scored++;
        }
        logger.info("Scored trajectory group", {
            scenarioId,
            scored,
            groupSize: richTrajectories.length,
        }, "RulerScoring");
        return scored;
    }
    /**
     * Build judge prompt with trajectory context.
     *
     * Injects game knowledge (P&L, episode length, actions) into the prompt
     * so the judge can make informed relative comparisons.
     *
     * @param {Array<{traj: object, messages: Array, archetype: string}>} richTrajectories
     * @param {Array} commonPrefix - Messages shared by every trajectory; emitted once.
     * @param {string} scenarioId - Scenario label shown to the judge.
     * @returns {string} JSON string of the form `{ system, user }`.
     */
    buildJudgePrompt(richTrajectories, commonPrefix, scenarioId) {
        const contextParts = [];
        contextParts.push(`Scenario: ${scenarioId}`);
        contextParts.push(`\nTrajectory Performance Context (use this to inform your scoring):`);
        const trajectorySections = [];
        for (const [index, rt] of richTrajectories.entries()) {
            if (!rt) {
                continue;
            }
            const trajId = `trajectory-${index + 1}`;
            const { steps, metrics, totalReward } = rt.traj;
            contextParts.push(`\n${trajId}:`);
            contextParts.push(` - Archetype: ${rt.archetype}`);
            contextParts.push(` - Final P&L: $${metrics.finalPnL?.toFixed(2) || "0.00"}`);
            contextParts.push(` - Episode Length: ${metrics.episodeLength || 0} steps`);
            contextParts.push(` - Total Reward: ${totalReward.toFixed(2)}`);
            const actionTypes = steps
                .filter((s) => !!s.action)
                .map((s) => s.action?.actionType);
            const uniqueActions = [...new Set(actionTypes)];
            contextParts.push(` - Actions Taken: ${uniqueActions.join(", ")} (${actionTypes.length} total)`);
            // Add success/error info
            const errors = steps.filter((s) => !!s.action && !s.action.success).length;
            const successRate = steps.length > 0
                ? (((steps.length - errors) / steps.length) * 100).toFixed(1)
                : "0";
            contextParts.push(` - Success Rate: ${successRate}%`);
            if (errors > 0) {
                contextParts.push(` - Errors: ${errors}`);
            }
            // Drop the shared prefix, then keep only the last 20 messages to save tokens.
            const truncatedMessages = rt.messages.slice(commonPrefix.length).slice(-20);
            trajectorySections.push(`<trajectory id="${trajId}">`);
            trajectorySections.push(JSON.stringify(truncatedMessages, null, 2));
            trajectorySections.push(`</trajectory>`);
        }
        const userContent = commonPrefix.length > 0
            ? `<context>\n${JSON.stringify(commonPrefix, null, 2)}\n</context>\n\n`
            : "";
        const prompt = `${userContent}${contextParts.join("\n")}\n\nTrajectories:\n\n${trajectorySections.join("\n\n")}`;
        // If all trajectories share one non-default archetype, use that archetype's rubric;
        // otherwise fall back to the default rubric.
        const archetypes = [...new Set(richTrajectories.map((rt) => rt.archetype))];
        const isSingleArchetype = archetypes.length === 1 && archetypes[0] !== "default";
        const singleArchetype = archetypes[0];
        // `|| DEFAULT_RUBRIC` keeps "undefined" out of the prompt if no rubric is registered.
        const rubric = isSingleArchetype && singleArchetype
            ? getRubric(singleArchetype) || DEFAULT_RUBRIC
            : DEFAULT_RUBRIC;
        const archetypeContext = isSingleArchetype
            ? `\n\nYou are evaluating ${archetypes[0]?.toUpperCase()} agents. Score them based on how well they embody that archetype's behavior and goals.`
            : archetypes.length > 1
                ? `\n\nNote: This group contains mixed archetypes (${archetypes.join(", ")}). Consider each agent's archetype when scoring.`
                : "";
        const systemPrompt = `You are an expert evaluator of AI agent performance. All trajectories below were given the same goal/scenario. Your job is to compare them and assign scores from 0 to 1 based on how well each trajectory achieved its goal.${archetypeContext}

Grading standards:
${rubric}

Important: Use the performance context provided (P&L, episode length, success rate, archetype) to inform your scoring, but also consider the quality of decision-making, efficiency, and goal achievement shown in the trajectory messages.`;
        return JSON.stringify({
            system: systemPrompt,
            user: prompt,
        });
    }
    /**
     * Call LLM judge to score trajectories.
     *
     * Asks for a JSON-only reply, strips Markdown code fences, and validates
     * the result. Any unusable reply (no JSON, malformed JSON, missing
     * `scores` array, non-numeric score) is logged and reported as null
     * instead of throwing, so one bad reply cannot abort a scoring run.
     *
     * @param {string} promptJson - Output of `buildJudgePrompt`.
     * @returns {Promise<{scores: Array<{trajectory_id: string, explanation: string, score: number}>}|null>}
     *   Parsed reply with every score clamped to [0, 1], or null.
     */
    async callJudge(promptJson) {
        const promptData = JSON.parse(promptJson);
        const structuredPrompt = `${promptData.user}

Please respond with ONLY a valid JSON object in this exact format:
{
"scores": [
{
"trajectory_id": "trajectory-1",
"explanation": "Brief explanation of score",
"score": 0.85
},
{
"trajectory_id": "trajectory-2",
"explanation": "Brief explanation of score",
"score": 0.65
}
]
}

Return ONLY the JSON, no other text.`;
        const llmCaller = getLLMCaller();
        const response = await llmCaller.callGroqDirect({
            prompt: structuredPrompt,
            system: promptData.system,
            modelSize: "large",
            temperature: 0.3,
            maxTokens: 2000,
            actionType: "ruler_score_trajectories",
        });
        const responseText = typeof response === "string" ? response : String(response ?? "");
        const jsonText = responseText
            .trim()
            .replace(/```json\n?/g, "")
            .replace(/```\n?/g, "")
            .trim();
        const jsonMatch = jsonText.match(/\{[\s\S]*\}/);
        if (!jsonMatch) {
            logger.error("Judge response does not contain JSON", {
                response: responseText.substring(0, 500),
            }, "RulerScoring");
            return null;
        }
        let parsed;
        try {
            parsed = JSON.parse(jsonMatch[0]);
        }
        catch (error) {
            logger.error("Judge response is not valid JSON", {
                error: error instanceof Error ? error.message : String(error),
                response: responseText.substring(0, 500),
            }, "RulerScoring");
            return null;
        }
        if (!parsed?.scores || !Array.isArray(parsed.scores)) {
            logger.error("Invalid judge response structure", { parsed }, "RulerScoring");
            return null;
        }
        for (const entry of parsed.scores) {
            // Judges sometimes quote numbers; accept "0.4" but reject anything non-numeric.
            const value = typeof entry?.score === "string"
                ? Number.parseFloat(entry.score)
                : entry?.score;
            if (typeof value !== "number" || !Number.isFinite(value)) {
                logger.error("Judge returned a non-numeric score", { entry }, "RulerScoring");
                return null;
            }
            entry.score = Math.max(0, Math.min(1, value));
        }
        return parsed;
    }
    /**
     * Extract common message prefix from trajectories.
     *
     * Two messages match when both `role` and `content` are strictly equal.
     * The scan stops at the first index that does not match in every list,
     * or that is empty in the first list, so the result is always a
     * contiguous prefix whose length is safe to use as a slice offset.
     *
     * @param {Array<Array<{role: string, content: unknown}>>} messageLists
     * @returns {Array} Leading messages shared by all lists.
     */
    extractCommonPrefix(messageLists) {
        const [first, ...rest] = messageLists;
        if (!first) {
            return [];
        }
        const prefix = [];
        for (let i = 0; i < first.length; i++) {
            const msg = first[i];
            if (!msg) {
                break;
            }
            const sharedByAll = rest.every((msgs) => {
                const other = msgs[i];
                return Boolean(other) && other.role === msg.role && other.content === msg.content;
            });
            if (!sharedByAll) {
                break;
            }
            prefix.push(msg);
        }
        return prefix;
    }
    /**
     * Group trajectories by scenarioId.
     * Rows without a scenarioId are collected under "default".
     *
     * @param {Array<object>} trajectoriesData - Stored trajectory rows.
     * @returns {Array<{scenarioId: string, trajectories: Array<object>}>}
     *   Groups in first-seen order.
     */
    groupByScenario(trajectoriesData) {
        const groups = new Map();
        for (const traj of trajectoriesData) {
            const scenarioId = traj.scenarioId || "default";
            const bucket = groups.get(scenarioId);
            if (bucket) {
                bucket.push(traj);
            }
            else {
                groups.set(scenarioId, [traj]);
            }
        }
        return Array.from(groups, ([scenarioId, trajectories]) => ({
            scenarioId,
            trajectories,
        }));
    }
    /**
     * Get trajectories to score.
     *
     * @param {string[]} [trajectoryIds] - Restrict to these IDs; when omitted
     *   or empty the adapter is queried without a filter.
     * @returns {Promise<Array<object>>} Unscored trajectory rows.
     */
    async getTrajectoriesToScore(trajectoryIds) {
        const adapter = getTrainingDataAdapter();
        return await adapter.getUnscoredTrajectories(trajectoryIds && trajectoryIds.length > 0 ? { trajectoryIds } : undefined);
    }
    /**
     * Score all unscored trajectories in a time window.
     *
     * @param {string} windowId - Window to score.
     * @returns {Promise<number>} Number of trajectories successfully scored.
     */
    async scoreWindow(windowId) {
        const trajectoryIds = await getTrainingDataAdapter().getUnscoredWindowTrajectoryIds(windowId);
        if (trajectoryIds.length === 0) {
            return 0;
        }
        return await this.scoreTrajectories(trajectoryIds);
    }
}
|
|
447
|
-
/**
|
|
448
|
-
* Singleton instance of RulerScoringService
|
|
449
|
-
*/
|
|
450
|
-
/** Created eagerly when this module loads; the class declares no constructor, so this only initialises the group-size fields. */
export const rulerScoringService = new RulerScoringService();
|
|
@@ -1,108 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Training Monitor Service
|
|
3
|
-
*
|
|
4
|
-
* Tracks training job progress and updates database with status.
|
|
5
|
-
* Monitors Python training process and W&B runs.
|
|
6
|
-
*/
|
|
7
|
-
import { getTrainingDataAdapter } from "../adapter";
|
|
8
|
-
import { logger } from "../utils/logger";
|
|
9
|
-
export class TrainingMonitor {
    /**
     * Mark a batch as "training" and log that monitoring has begun.
     *
     * @param {string} batchId - Batch whose status is updated.
     * @returns {Promise<void>}
     */
    async startMonitoring(batchId) {
        await getTrainingDataAdapter().updateBatchStatus(batchId, "training");
        logger.info("Started monitoring training job", { batchId }, "TrainingMonitor");
    }
    /**
     * Persist a status change (when one is supplied) and log the progress report.
     *
     * @param {string} batchId - Batch being reported on.
     * @param {object} progress - Report; `status`, `error` and `progress` are read.
     *   `error` is forwarded to the adapter only for the "failed" status.
     * @returns {Promise<void>}
     */
    async updateProgress(batchId, progress) {
        const { status } = progress;
        if (status) {
            const failureReason = status === "failed" ? progress.error : undefined;
            await getTrainingDataAdapter().updateBatchStatus(batchId, status, failureReason);
        }
        const summary = { batchId, status, progress: progress.progress };
        logger.info("Updated training progress", summary, "TrainingMonitor");
    }
    /**
     * Report coarse progress for a batch, derived purely from its status.
     *
     * @param {string} batchId - Batch to look up.
     * @returns {Promise<object|null>} `{ batchId, status, progress, loss, eta, error }`,
     *   or null when the adapter has no such batch. `eta` is only set while the
     *   status is "training" and a start time is recorded.
     */
    async getProgress(batchId) {
        const batch = await getTrainingDataAdapter().getBatchById(batchId);
        if (!batch) {
            return null;
        }
        // Fixed completion fraction per pipeline stage; anything unlisted counts as 0.
        const fractionByStatus = new Map([
            ["pending", 0],
            ["preparing", 0.1],
            ["scoring", 0.3],
            ["training", 0.6],
            ["uploading", 0.9],
            ["completed", 1.0],
            ["failed", 0],
        ]);
        const progress = fractionByStatus.get(batch.status) ?? 0;
        // ETA assumes a two-hour average run and never goes negative.
        const assumedRunMs = 2 * 60 * 60 * 1000;
        const isTimedRun = batch.status === "training" && batch.startedAt;
        const eta = isTimedRun
            ? Math.max(0, assumedRunMs - (Date.now() - batch.startedAt.getTime()))
            : undefined;
        return {
            batchId,
            status: batch.status,
            progress,
            loss: batch.trainingLoss ?? undefined,
            eta,
            error: batch.error ?? undefined,
        };
    }
    /**
     * Ask the adapter for stuck training batches using a four-hour threshold,
     * and log a warning when any are returned.
     *
     * @returns {Promise<Array>} Whatever the adapter reported, unchanged.
     */
    async checkForStuckJobs() {
        const thresholdMs = 4 * 60 * 60 * 1000;
        const stuckJobs = await getTrainingDataAdapter().getStuckTrainingBatches(thresholdMs);
        if (stuckJobs.length === 0) {
            return stuckJobs;
        }
        logger.warn("Found stuck training jobs", {
            count: stuckJobs.length,
            jobs: stuckJobs,
        }, "TrainingMonitor");
        return stuckJobs;
    }
    /**
     * Cancel a job by recording it as "failed" with a "Cancelled: ..." message.
     *
     * @param {string} batchId - Batch to cancel.
     * @param {string} reason - Reason appended to the stored message.
     * @returns {Promise<void>}
     */
    async cancelJob(batchId, reason) {
        const message = `Cancelled: ${reason}`;
        await getTrainingDataAdapter().updateBatchStatus(batchId, "failed", message);
        logger.warn("Training job cancelled", { batchId, reason }, "TrainingMonitor");
    }
}
|
|
107
|
-
// Singleton
|
|
108
|
-
/** Shared instance for callers that do not need their own TrainingMonitor; the class declares no instance fields. */
export const trainingMonitor = new TrainingMonitor();
|