@elizaos/training 2.0.0-alpha.21 → 2.0.0-alpha.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-lint.log +2 -0
- package/.turbo/turbo-typecheck.log +1 -0
- package/dist/.tsbuildinfo +1 -0
- package/dist/adapter.js +59 -0
- package/dist/archetypes/ArchetypeConfigService.js +510 -0
- package/dist/archetypes/derive-archetype.js +196 -0
- package/dist/archetypes/index.js +7 -0
- package/dist/benchmark/ArchetypeMatchupBenchmark.js +547 -0
- package/dist/benchmark/BenchmarkChartGenerator.js +632 -0
- package/dist/benchmark/BenchmarkDataGenerator.js +825 -0
- package/dist/benchmark/BenchmarkDataViewer.js +197 -0
- package/dist/benchmark/BenchmarkHistoryService.js +135 -0
- package/dist/benchmark/BenchmarkRunner.js +483 -0
- package/dist/benchmark/BenchmarkValidator.js +158 -0
- package/dist/benchmark/FastEvalRunner.js +133 -0
- package/dist/benchmark/MetricsValidator.js +104 -0
- package/dist/benchmark/MetricsVisualizer.js +775 -0
- package/dist/benchmark/ModelBenchmarkService.js +433 -0
- package/dist/benchmark/ModelRegistry.js +122 -0
- package/dist/benchmark/RulerBenchmarkIntegration.js +168 -0
- package/dist/benchmark/SimulationA2AInterface.js +683 -0
- package/dist/benchmark/SimulationEngine.js +522 -0
- package/dist/benchmark/TaskRunner.js +60 -0
- package/dist/benchmark/__tests__/BenchmarkRunner.test.js +409 -0
- package/dist/benchmark/__tests__/HeadToHead.test.js +105 -0
- package/dist/benchmark/index.js +23 -0
- package/dist/benchmark/parseSimulationMetrics.js +86 -0
- package/dist/benchmark/simulation-types.js +1 -0
- package/dist/dependencies.js +197 -0
- package/dist/generation/TrajectoryGenerator.js +244 -0
- package/dist/generation/index.js +6 -0
- package/dist/huggingface/HuggingFaceDatasetUploader.js +463 -0
- package/dist/huggingface/HuggingFaceIntegrationService.js +272 -0
- package/dist/huggingface/HuggingFaceModelUploader.js +385 -0
- package/dist/huggingface/index.js +9 -0
- package/dist/huggingface/shared/HuggingFaceUploadUtil.js +144 -0
- package/dist/index.js +41 -0
- package/dist/init-training.js +43 -0
- package/dist/metrics/TrajectoryMetricsExtractor.js +523 -0
- package/dist/metrics/__tests__/TrajectoryMetricsExtractor.test.js +628 -0
- package/dist/metrics/index.js +7 -0
- package/dist/metrics/types.js +21 -0
- package/dist/rubrics/__tests__/index.test.js +150 -0
- package/dist/rubrics/ass-kisser.js +83 -0
- package/dist/rubrics/degen.js +78 -0
- package/dist/rubrics/goody-twoshoes.js +82 -0
- package/dist/rubrics/index.js +184 -0
- package/dist/rubrics/information-trader.js +82 -0
- package/dist/rubrics/infosec.js +99 -0
- package/dist/rubrics/liar.js +102 -0
- package/dist/rubrics/perps-trader.js +85 -0
- package/dist/rubrics/researcher.js +79 -0
- package/dist/rubrics/scammer.js +80 -0
- package/dist/rubrics/social-butterfly.js +71 -0
- package/dist/rubrics/super-predictor.js +95 -0
- package/dist/rubrics/trader.js +65 -0
- package/dist/scoring/ArchetypeScoringService.js +301 -0
- package/dist/scoring/JudgePromptBuilder.js +401 -0
- package/dist/scoring/LLMJudgeCache.js +263 -0
- package/dist/scoring/index.js +8 -0
- package/dist/training/AutomationPipeline.js +714 -0
- package/dist/training/BenchmarkService.js +370 -0
- package/dist/training/ConfigValidator.js +153 -0
- package/dist/training/MarketOutcomesTracker.js +142 -0
- package/dist/training/ModelDeployer.js +128 -0
- package/dist/training/ModelFetcher.js +48 -0
- package/dist/training/ModelSelectionService.js +248 -0
- package/dist/training/ModelUsageVerifier.js +106 -0
- package/dist/training/MultiModelOrchestrator.js +349 -0
- package/dist/training/RLModelConfig.js +295 -0
- package/dist/training/RewardBackpropagationService.js +117 -0
- package/dist/training/RulerScoringService.js +450 -0
- package/dist/training/TrainingMonitor.js +108 -0
- package/dist/training/TrajectoryRecorder.js +281 -0
- package/dist/training/__tests__/TrajectoryRecorder.test.js +363 -0
- package/dist/training/index.js +30 -0
- package/dist/training/logRLConfig.js +29 -0
- package/dist/training/pipeline.js +80 -0
- package/dist/training/storage/ModelStorageService.js +190 -0
- package/dist/training/storage/TrainingDataArchiver.js +136 -0
- package/dist/training/storage/index.js +7 -0
- package/dist/training/types.js +6 -0
- package/dist/training/window-utils.js +100 -0
- package/dist/utils/index.js +73 -0
- package/dist/utils/logger.js +55 -0
- package/dist/utils/snowflake.js +15 -0
- package/dist/utils/synthetic-detector.js +67 -0
- package/package.json +2 -2
- package/research-output/training-runs/training-run-1773742857616.json +38 -0
- package/research-output/training-runs/training-run-1773742946977.json +38 -0
- package/research-output/training-runs/training-run-1773743278891.json +38 -0
- package/research-output/training-runs/training-run-1773743409754.json +38 -0
- package/research-output/training-runs/training-run-1773743651086.json +38 -0
- package/research-output/training-runs/training-run-1773743782883.json +38 -0
|
@@ -0,0 +1,401 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* JudgePromptBuilder
|
|
3
|
+
*
|
|
4
|
+
* Builds LLM judge prompts with trajectory metrics context and archetype-specific rubrics.
|
|
5
|
+
* Metrics are included as CONTEXT for the judge, not weighted directly.
|
|
6
|
+
*
|
|
7
|
+
* @packageDocumentation
|
|
8
|
+
*/
|
|
9
|
+
import { getMetricsSummary } from "../metrics/types";
|
|
10
|
+
import { getPriorityMetrics, getRubric } from "../rubrics";
|
|
11
|
+
/**
 * Default prompt options; caller-supplied options are spread on top of these.
 * Frozen so an accidental write cannot change the defaults for later calls.
 */
const DEFAULT_OPTIONS = Object.freeze({
  includeActionDetails: false,
  maxActionsToShow: 20,
  includeKeyDecisions: true,
});

/** Metric categories that may appear as the first segment of a metric dot-path. */
const METRIC_CATEGORIES = new Set([
  "trading",
  "social",
  "influence",
  "behavior",
  "information",
]);

/** Lower-cased action types that are reported in the "Key Decisions" section. */
const KEY_ACTION_TYPES = new Set([
  "trade",
  "buy",
  "sell",
  "predict",
  "create_group_chat",
  "post",
]);

/** Maximum number of key decisions listed in a prompt (most recent are kept). */
const MAX_KEY_DECISIONS = 10;

/** Maximum number of reasoning characters shown per action in the detailed view. */
const REASONING_PREVIEW_LENGTH = 50;

/**
 * Builds prompts for LLM-as-judge scoring.
 */
export class JudgePromptBuilder {
  /**
   * Build prompt for single trajectory scoring.
   * @param trajectory - Trajectory context (agentId, archetype, steps, metrics, ...)
   * @param options - Prompt options, merged over DEFAULT_OPTIONS
   * @returns System and user prompts as `{ system, user }`
   */
  buildSinglePrompt(trajectory, options = {}) {
    const opts = { ...DEFAULT_OPTIONS, ...options };
    const archetype = trajectory.archetype || "default";
    return {
      system: this.buildSystemPrompt(archetype, getRubric(archetype)),
      user: this.buildUserPrompt(trajectory, getPriorityMetrics(archetype), opts),
    };
  }

  /**
   * Build a judge prompt for comparing multiple trajectories (RULER style).
   * The rubric and priority metrics are taken from the first trajectory's
   * archetype; all trajectories are assumed to share it.
   * @param trajectories - Trajectories to compare
   * @param scenarioId - Identifier printed in the prompt header
   * @param options - Prompt options, merged over DEFAULT_OPTIONS
   * @returns System and user prompts as `{ system, user }`
   */
  buildComparisonPrompt(trajectories, scenarioId, options = {}) {
    const opts = { ...DEFAULT_OPTIONS, ...options };
    const archetype = trajectories[0]?.archetype || "default";
    return {
      system: this.buildComparisonSystemPrompt(archetype, getRubric(archetype)),
      user: this.buildComparisonUserPrompt(
        trajectories,
        scenarioId,
        getPriorityMetrics(archetype),
        opts,
      ),
    };
  }

  /**
   * Build system prompt for single trajectory evaluation.
   * @param archetype - Archetype name interpolated into the prompt
   * @param rubric - Rubric text embedded verbatim
   * @returns The system prompt string
   */
  buildSystemPrompt(archetype, rubric) {
    return `You are an expert evaluator of AI agent performance in prediction market simulations.

You are evaluating an agent with the "${archetype}" archetype. This archetype has specific goals and behaviors that should be evaluated differently than a generic agent.

${rubric}

Your task is to score this trajectory on a scale of 0.0 to 1.0 based on how well the agent embodied the "${archetype}" archetype's values and achieved its goals.

IMPORTANT: The metrics provided are CONTEXT to inform your judgment. Use them to understand what happened, but make a holistic evaluation based on the rubric - don't just calculate a weighted average of metrics.`;
  }

  /**
   * Build system prompt for RULER comparison.
   * @param archetype - Archetype name interpolated into the prompt
   * @param rubric - Rubric text embedded verbatim
   * @returns The system prompt string
   */
  buildComparisonSystemPrompt(archetype, rubric) {
    return `You are an expert evaluator of AI agent performance. All trajectories below were given the same scenario and are from "${archetype}" archetype agents.

Your job is to compare them RELATIVE to each other and assign scores from 0 to 1 based on how well each trajectory achieved the archetype's goals.

${rubric}

IMPORTANT RULER PRINCIPLES:
- A trajectory that achieves its archetype's goals should score significantly higher than one that doesn't
- A trajectory that achieves goals more efficiently should score higher
- If one trajectory is only slightly better, score differences should be small
- If one is significantly better, score differences should be large
- You may give partial credit for progress towards goals

The metrics provided are CONTEXT to inform your judgment. Use them to understand what happened, then make holistic evaluations based on the archetype rubric.`;
  }

  /**
   * Build user prompt with trajectory context and metrics.
   * @param trajectory - Trajectory whose agent info, metrics and steps are rendered
   * @param priorityMetrics - Metric dot-paths highlighted first
   * @param options - Resolved options (includeKeyDecisions, includeActionDetails, maxActionsToShow)
   * @returns The user prompt, sections joined with newlines
   */
  buildUserPrompt(trajectory, priorityMetrics, options) {
    const parts = [];

    // Agent info
    parts.push(
      `## Agent Information`,
      `- Agent ID: ${trajectory.agentId}`,
      `- Archetype: ${trajectory.archetype || "unknown"}`,
      `- Episode Length: ${trajectory.episodeLength || trajectory.steps.length} ticks`,
      "",
    );

    // Metrics section
    parts.push(
      `## Behavioral Metrics`,
      this.formatMetrics(trajectory.metrics, priorityMetrics),
      "",
    );

    // Action summary
    parts.push(`## Action Summary`, this.summarizeActions(trajectory.steps), "");

    // Key decisions (if requested and any exist)
    if (options.includeKeyDecisions) {
      const keyDecisions = this.extractKeyDecisions(trajectory.steps);
      if (keyDecisions) {
        parts.push(`## Key Decisions`, keyDecisions, "");
      }
    }

    // Recent actions (if requested)
    if (options.includeActionDetails) {
      // Resolve the limit once so the header always matches the number of
      // actions that are actually listed (a missing or zero limit falls back
      // to the default).
      const limit = options.maxActionsToShow || DEFAULT_OPTIONS.maxActionsToShow;
      parts.push(
        `## Recent Actions (last ${limit})`,
        this.formatRecentActions(trajectory.steps, limit),
        "",
      );
    }

    // Instructions
    parts.push(
      `## Instructions`,
      `Score this trajectory on a scale of 0.0 to 1.0 based on how well it embodies the ${trajectory.archetype || "agent"} archetype's values.`,
      "",
      `Respond with JSON:`,
      `{
"score": <float 0-1>,
"reasoning": "<2-3 sentence explanation>",
"strengths": ["<strength 1>", "<strength 2>"],
"weaknesses": ["<weakness 1>", "<weakness 2>"]
}`,
    );
    return parts.join("\n");
  }

  /**
   * Build user prompt for RULER comparison.
   * Falsy entries in `trajectories` are skipped, but numbering still follows
   * the array index so ids stay aligned with the caller's array.
   * @param trajectories - Trajectories to render
   * @param scenarioId - Identifier printed in the prompt header
   * @param priorityMetrics - Metric dot-paths highlighted first
   * @param _options - Currently unused
   * @returns The user prompt, sections joined with newlines
   */
  buildComparisonUserPrompt(trajectories, scenarioId, priorityMetrics, _options) {
    const parts = [
      `## Scenario: ${scenarioId}`,
      `## Number of Trajectories: ${trajectories.length}`,
      "",
      // Performance context for all trajectories
      `## Trajectory Performance Context`,
      `(Use this to inform your scoring)`,
      "",
    ];

    trajectories.forEach((traj, index) => {
      if (!traj) {
        return;
      }
      parts.push(
        `### trajectory-${index + 1}`,
        `- Archetype: ${traj.archetype || "unknown"}`,
        `- Episode Length: ${traj.episodeLength || traj.steps.length} steps`,
        `- Total Reward: ${traj.totalReward?.toFixed(2) || "0.00"}`,
        "",
        // Key metrics for this trajectory
        `**Key Metrics:**`,
        this.formatMetrics(traj.metrics, priorityMetrics),
        "",
        // Action summary
        `**Actions:**`,
        this.summarizeActions(traj.steps),
        "",
      );
    });

    // Instructions
    parts.push(
      `## Instructions`,
      `Score each trajectory from 0.0 to 1.0 RELATIVE to each other based on the archetype rubric.`,
      "",
      `Respond with ONLY valid JSON:`,
      `{
"scores": [
{
"trajectory_id": "trajectory-1",
"explanation": "Brief explanation",
"score": 0.85
},
{
"trajectory_id": "trajectory-2",
"explanation": "Brief explanation",
"score": 0.65
}
]
}`,
    );
    return parts.join("\n");
  }

  /**
   * Format metrics for prompt, highlighting priority metrics first.
   * At most the first six priority metrics are shown.
   * @param metrics - Metrics object with trading/social/influence/behavior/information sections
   * @param priorityMetrics - Metric dot-paths such as "trading.winRate"
   * @returns Markdown-formatted metrics text
   */
  formatMetrics(metrics, priorityMetrics) {
    const lines = [];

    // Show priority metrics first with emphasis
    if (priorityMetrics.length > 0) {
      lines.push("### ⭐ KEY METRICS FOR THIS ARCHETYPE");
      for (const metricPath of priorityMetrics.slice(0, 6)) {
        const label = this.formatMetricLabel(metricPath);
        const value = this.getMetricValue(metrics, metricPath);
        lines.push(`- **${label}**: ${value}`);
      }
      lines.push("");
    }

    // Summary metrics
    const summary = getMetricsSummary(metrics);
    lines.push(
      "### Performance Summary",
      `- Total P&L: $${summary.totalPnL.toFixed(2)}`,
      `- Win Rate: ${(summary.winRate * 100).toFixed(1)}%`,
      `- Trades Executed: ${summary.tradesExecuted}`,
      `- Action Success Rate: ${(summary.actionSuccessRate * 100).toFixed(1)}%`,
      "",
    );

    const { social, trading, influence, behavior, information } = metrics;

    // Social metrics
    lines.push(
      "### Social Activity",
      `- Unique Users Interacted: ${social.uniqueUsersInteracted}`,
      `- Group Chats Joined: ${social.groupChatsJoined}`,
      `- DMs Initiated: ${social.dmsInitiated}`,
      `- Posts Created: ${social.postsCreated}`,
      `- Comments Made: ${social.commentsMade}`,
      `- Social to Trade Ratio: ${behavior.socialToTradeRatio.toFixed(2)}`,
      "",
    );

    // Trading metrics
    lines.push(
      "### Trading Performance",
      `- Total P&L: $${trading.totalPnL.toFixed(2)}`,
      `- Win Rate: ${(trading.winRate * 100).toFixed(1)}%`,
      `- Sharpe Ratio: ${trading.sharpeRatio.toFixed(2)}`,
      `- Max Drawdown: $${trading.maxDrawdown.toFixed(2)}`,
      `- Markets Traded: ${trading.marketsTraded}`,
      `- Largest Win: $${trading.largestWin.toFixed(2)}`,
      `- Largest Loss: $${trading.largestLoss.toFixed(2)}`,
      "",
    );

    // Influence metrics
    lines.push(
      "### Influence",
      `- Followers Gained: ${influence.followersGained}`,
      `- Reputation Delta: ${influence.reputationDelta > 0 ? "+" : ""}${influence.reputationDelta}`,
      `- Positive Reactions: ${influence.positiveReactions}`,
      `- Information Spread: ${influence.informationSpread}`,
      "",
    );

    // Behavior metrics
    lines.push(
      "### Behavior Patterns",
      `- Actions Per Tick: ${behavior.actionsPerTick.toFixed(2)}`,
      `- Consistency Score: ${(behavior.consistencyScore * 100).toFixed(1)}%`,
      `- Dominant Action: ${behavior.dominantActionType || "none"}`,
      "",
    );

    // Information metrics
    lines.push(
      "### Information Activity",
      `- Research Actions: ${information.researchActions}`,
      `- Predictions Made: ${information.predictionsMade}`,
      `- Prediction Accuracy: ${(information.predictionAccuracy * 100).toFixed(1)}%`,
    );
    return lines.join("\n");
  }

  /**
   * Get a formatted metric value from the metrics object using a
   * "category.key" dot-path.
   *
   * Returns "N/A" when the path is malformed, the category is unknown, the
   * category object or the value is missing, or a numeric value is not finite.
   * Number formatting is chosen from the key name: Rate/Accuracy/Score become
   * percentages, PnL/Win/Loss/Drawdown become dollar amounts, Ratio and other
   * non-integers get two decimals, integers are printed as-is.
   * @param metrics - Metrics object
   * @param path - Dot-path such as "trading.winRate"
   * @returns Display string for the metric
   */
  getMetricValue(metrics, path) {
    const [category, key] = path.split(".");
    if (!category || !key || !METRIC_CATEGORIES.has(category)) {
      return "N/A";
    }

    // Optional chaining keeps a missing category object from throwing.
    const value = metrics?.[category]?.[key];
    if (value === undefined || value === null) {
      return "N/A";
    }
    if (typeof value !== "number") {
      return String(value);
    }
    // NaN / Infinity would otherwise be rendered as "NaN%" or "$NaN".
    if (!Number.isFinite(value)) {
      return "N/A";
    }

    const keyMentions = (...tokens) => tokens.some((token) => key.includes(token));
    if (keyMentions("Rate", "Accuracy", "Score")) {
      return `${(value * 100).toFixed(1)}%`;
    }
    if (keyMentions("PnL", "Win", "Loss", "Drawdown")) {
      return `$${value.toFixed(2)}`;
    }
    if (keyMentions("Ratio") || !Number.isInteger(value)) {
      return value.toFixed(2);
    }
    return String(value);
  }

  /**
   * Format a metric path into a human-readable label by splitting the
   * camelCase key on capitals and upper-casing the first character.
   * @param path - Dot-path such as "trading.winRate"
   * @returns Label such as "Win Rate", or the path itself when it has no key segment
   */
  formatMetricLabel(path) {
    const [, key] = path.split(".");
    if (!key) {
      return path;
    }
    const spaced = key.replace(/([A-Z])/g, " $1");
    return spaced.replace(/^./, (first) => first.toUpperCase()).trim();
  }

  /**
   * Summarize actions in trajectory: totals plus a per-type count sorted by
   * descending frequency. Steps without an action are ignored, and the total
   * only counts steps that carry an action so it always equals
   * successful + failed.
   * @param steps - Trajectory steps
   * @returns Two-line markdown summary
   */
  summarizeActions(steps) {
    const countsByType = new Map();
    let successCount = 0;
    let errorCount = 0;

    for (const { action } of steps) {
      if (!action) {
        continue;
      }
      const { actionType } = action;
      countsByType.set(actionType, (countsByType.get(actionType) || 0) + 1);
      if (action.success) {
        successCount += 1;
      } else {
        errorCount += 1;
      }
    }

    const typeSummary = [...countsByType.entries()]
      .sort(([, countA], [, countB]) => countB - countA)
      .map(([type, count]) => `${type}(${count})`)
      .join(", ");

    return [
      `- Total Actions: ${successCount + errorCount} (${successCount} successful, ${errorCount} failed)`,
      `- Action Types: ${typeSummary}`,
    ].join("\n");
  }

  /**
   * Extract key decisions (trades, significant social actions).
   * Only the most recent ten matching actions are returned. Actions without a
   * string `actionType` are skipped, and a P&L suffix is added only when
   * `result.pnl` converts to a finite number.
   * @param steps - Trajectory steps
   * @returns Bullet list of key decisions, or null when there are none
   */
  extractKeyDecisions(steps) {
    const keyActions = [];

    for (const { action } of steps) {
      if (!action || typeof action.actionType !== "string") {
        continue;
      }
      if (!KEY_ACTION_TYPES.has(action.actionType.toLowerCase())) {
        continue;
      }

      const params = action.parameters || {};
      const result = action.result || {};
      let description = `${action.actionType}`;

      // Add relevant details
      if (params.amount || params.size) {
        description += ` (size: ${params.amount || params.size})`;
      }
      if (params.marketId || params.market) {
        description += ` on ${params.marketId || params.market}`;
      }
      if (result.pnl !== undefined && result.pnl !== null) {
        const pnl = Number(result.pnl);
        if (Number.isFinite(pnl)) {
          description += ` → P&L: ${pnl >= 0 ? "+" : ""}$${pnl.toFixed(2)}`;
        }
      }
      keyActions.push(`- ${description} ${action.success ? "✓" : "✗"}`);
    }

    if (keyActions.length === 0) {
      return null;
    }
    return keyActions.slice(-MAX_KEY_DECISIONS).join("\n");
  }

  /**
   * Format recent actions for detailed view.
   * The reasoning preview is cut to 50 characters, and the trailing "..." is
   * added only when text was actually cut off.
   * @param steps - Trajectory steps
   * @param maxActions - Number of trailing steps to consider
   * @returns One line per action, or "No actions recorded"
   */
  formatRecentActions(steps, maxActions) {
    const lines = [];

    for (const step of steps.slice(-maxActions)) {
      const { action } = step;
      if (!action) {
        continue;
      }
      const mark = action.success ? "✓" : "✗";
      let reasoning = "";
      if (action.reasoning) {
        const text = String(action.reasoning);
        const preview =
          text.length > REASONING_PREVIEW_LENGTH
            ? `${text.substring(0, REASONING_PREVIEW_LENGTH)}...`
            : text;
        reasoning = ` | Reason: ${preview}`;
      }
      lines.push(`- [${step.stepNumber}] ${action.actionType} ${mark}${reasoning}`);
    }

    return lines.join("\n") || "No actions recorded";
  }
}

/**
 * Singleton instance
 */
export const judgePromptBuilder = new JudgePromptBuilder();
|
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLMJudgeCache
|
|
3
|
+
*
|
|
4
|
+
* Caches LLM-as-judge scoring results to:
|
|
5
|
+
* 1. Avoid redundant API calls for identical trajectories
|
|
6
|
+
* 2. Enable fast re-scoring when rubrics change
|
|
7
|
+
* 3. Provide validation of cached scores
|
|
8
|
+
*
|
|
9
|
+
 * Uses hashed cache keys: cache key = hash(trajectory_id + steps_json + archetype + rubric_version)
|
|
10
|
+
*
|
|
11
|
+
* @packageDocumentation
|
|
12
|
+
*/
|
|
13
|
+
import { createHash } from "node:crypto";
|
|
14
|
+
import { getTrainingDataAdapter } from "../adapter";
|
|
15
|
+
import { getRubricHash, RUBRICS_VERSION } from "../rubrics";
|
|
16
|
+
import { logger } from "../utils/logger";
|
|
17
|
+
/**
 * Default cache configuration; constructor-supplied values are spread on top.
 * Frozen so an accidental write cannot change the defaults for later instances.
 */
const DEFAULT_CONFIG = Object.freeze({
  ttlHours: 168, // 1 week
  maxEntries: 10000,
  validateRubricVersion: true,
});

/**
 * In-memory LLM judge cache with validation
 */
export class LLMJudgeCache {
  /** Map of cache key -> cached score entry. */
  cache = new Map();
  /** Effective configuration (DEFAULT_CONFIG overlaid with constructor config). */
  config;
  /** Running counters; hitRate is recomputed after every lookup. */
  stats = {
    hits: 0,
    misses: 0,
    invalidations: 0,
    hitRate: 0,
  };

  /**
   * @param config - Partial configuration (ttlHours, maxEntries, validateRubricVersion)
   */
  constructor(config = {}) {
    this.config = { ...DEFAULT_CONFIG, ...config };
  }

  /**
   * Generate a cache key from the trajectory id, its serialized steps, the
   * archetype and the current rubrics version.
   * @returns First 32 hex characters of the SHA-256 digest
   */
  generateCacheKey(trajectoryId, stepsJson, archetype) {
    const content = `${trajectoryId}:${stepsJson}:${archetype}:${RUBRICS_VERSION}`;
    return createHash("sha256").update(content).digest("hex").substring(0, 32);
  }

  /**
   * Check if a cached score is valid: not expired and, when
   * `validateRubricVersion` is enabled, scored with the current rubrics
   * version and the current rubric hash for the archetype.
   * @param cached - Cache entry
   * @param archetype - Archetype used to look up the current rubric hash
   * @returns true when the entry may still be served
   */
  isValid(cached, archetype) {
    if (new Date() > cached.expiresAt) {
      return false;
    }
    if (!this.config.validateRubricVersion) {
      return true;
    }
    return (
      cached.rubricVersion === RUBRICS_VERSION &&
      cached.rubricHash === getRubricHash(archetype)
    );
  }

  /**
   * Get a cached score if available and valid.
   * An entry that fails validation is removed and counted as both an
   * invalidation and a miss.
   * @returns The cache entry, or null on a miss
   */
  get(trajectoryId, stepsJson, archetype) {
    const cacheKey = this.generateCacheKey(trajectoryId, stepsJson, archetype);
    const cached = this.cache.get(cacheKey);

    if (cached && this.isValid(cached, archetype)) {
      this.stats.hits++;
      this.updateHitRate();
      logger.debug("Cache hit", { trajectoryId, archetype, cacheKey: cacheKey.substring(0, 8) }, "LLMJudgeCache");
      return cached;
    }

    if (cached) {
      // Present but stale: drop it so it cannot be served later.
      this.cache.delete(cacheKey);
      this.stats.invalidations++;
    }
    this.stats.misses++;
    this.updateHitRate();
    return null;
  }

  /**
   * Store a score in the cache. The entry is stamped with the current time,
   * rubrics version and rubric hash, and expires after `ttlHours`.
   *
   * When the cache is full, the oldest entry is evicted first - but only if
   * this call adds a new key. Overwriting an existing key does not grow the
   * cache, so nothing is evicted in that case.
   */
  set(trajectoryId, stepsJson, archetype, score, reasoning, strengths = [], weaknesses = []) {
    const cacheKey = this.generateCacheKey(trajectoryId, stepsJson, archetype);

    // Enforce max entries limit
    if (!this.cache.has(cacheKey) && this.cache.size >= this.config.maxEntries) {
      this.evictOldest();
    }

    const now = new Date();
    const ttlMs = this.config.ttlHours * 60 * 60 * 1000;
    this.cache.set(cacheKey, {
      cacheKey,
      trajectoryId,
      archetype,
      score,
      reasoning,
      strengths,
      weaknesses,
      rubricVersion: RUBRICS_VERSION,
      rubricHash: getRubricHash(archetype),
      scoredAt: now,
      expiresAt: new Date(now.getTime() + ttlMs),
    });
    logger.debug("Cache set", { trajectoryId, archetype, score, cacheKey: cacheKey.substring(0, 8) }, "LLMJudgeCache");
  }

  /**
   * Evict the cache entry with the earliest `scoredAt` (the first such entry
   * in iteration order wins ties). Does nothing on an empty cache.
   */
  evictOldest() {
    let oldestKey = null;
    let oldestTime = Infinity;
    for (const [key, { scoredAt }] of this.cache) {
      const entryTime = scoredAt.getTime();
      if (entryTime < oldestTime) {
        oldestTime = entryTime;
        oldestKey = key;
      }
    }
    if (oldestKey !== null) {
      this.cache.delete(oldestKey);
    }
  }

  /**
   * Update hit rate statistic (hits / (hits + misses), or 0 before any lookup).
   */
  updateHitRate() {
    const { hits, misses } = this.stats;
    const total = hits + misses;
    this.stats.hitRate = total > 0 ? hits / total : 0;
  }

  /**
   * Invalidate all cache entries for an archetype (when rubric changes).
   * @returns Number of entries removed
   */
  invalidateArchetype(archetype) {
    let invalidated = 0;
    // Deleting the current entry while iterating a Map is safe.
    for (const [key, entry] of this.cache) {
      if (entry.archetype === archetype) {
        this.cache.delete(key);
        invalidated++;
      }
    }
    this.stats.invalidations += invalidated;
    logger.info("Invalidated cache entries", { archetype, count: invalidated }, "LLMJudgeCache");
    return invalidated;
  }

  /**
   * Invalidate all cache entries; every removed entry counts as an invalidation.
   */
  clear() {
    const count = this.cache.size;
    this.cache.clear();
    this.stats.invalidations += count;
    logger.info("Cleared cache", { count }, "LLMJudgeCache");
  }

  /**
   * Get cache statistics.
   * @returns A shallow copy, so callers cannot mutate the live counters
   */
  getStats() {
    return { ...this.stats };
  }

  /**
   * Get cache size (number of entries currently stored).
   */
  size() {
    return this.cache.size;
  }

  /**
   * Warm cache from database.
   * Loads previously scored trajectories into cache. Rows without a reward
   * (null or undefined), without reasoning, or without a judgedAt value are
   * skipped. Entries are stored through set(), so they are stamped with the
   * current time rather than the row's judgedAt, and are keyed under the
   * "default" archetype.
   * @param limit - Maximum number of rows requested from the adapter
   * @returns Number of rows loaded into the cache
   */
  async warmFromDatabase(limit = 1000) {
    const results = await getTrainingDataAdapter().getScoredTrajectories(limit);
    let loaded = 0;
    for (const row of results) {
      const hasReward = row.aiJudgeReward !== null && row.aiJudgeReward !== undefined;
      if (!hasReward || !row.aiJudgeReasoning || !row.judgedAt) {
        continue;
      }
      // Use 'default' archetype for warmed entries since archetype isn't stored
      this.set(row.trajectoryId, row.stepsJson, "default", row.aiJudgeReward, row.aiJudgeReasoning);
      loaded++;
    }
    logger.info("Warmed cache from database", { loaded, attempted: results.length }, "LLMJudgeCache");
    return loaded;
  }
}

/**
 * Singleton cache instance
 */
export const llmJudgeCache = new LLMJudgeCache();
|
|
197
|
+
/**
 * Score validation utilities
 */
export const scoreValidator = {
  /**
   * Validate a score is in valid range.
   * @param score - Candidate score
   * @returns true only for a non-NaN number between 0 and 1 inclusive
   */
  isValidScore(score) {
    return typeof score === "number" && !Number.isNaN(score) && score >= 0 && score <= 1;
  },

  /**
   * Validate reasoning is meaningful.
   * @param reasoning - Candidate reasoning text
   * @returns true only for a string of 20 to 5000 characters
   */
  isValidReasoning(reasoning) {
    return typeof reasoning === "string" && reasoning.length >= 20 && reasoning.length <= 5000;
  },

  /**
   * Validate a complete score response.
   * A null/undefined response is reported as invalid instead of throwing, and
   * the sibling validators are referenced through `scoreValidator` rather than
   * `this` so the method also works when passed around detached.
   * @param response - Object expected to carry `score` and `reasoning`
   * @returns true when both the score and the reasoning are valid
   */
  isValidScoreResponse(response) {
    if (response === null || response === undefined) {
      return false;
    }
    return (
      scoreValidator.isValidScore(response.score) &&
      scoreValidator.isValidReasoning(response.reasoning)
    );
  },

  /**
   * Check if scores are consistent (similar trajectories should have similar scores).
   *
   * Scores are grouped by `metricsHash`. For every group with at least two
   * scores whose population standard deviation exceeds 0.2, each trajectory
   * lying more than two standard deviations from the group mean is reported.
   * Fewer than three scores in total are always considered consistent.
   * @param scores - Items with `trajectoryId`, `metricsHash` and `score`
   * @returns `{ consistent, outliers }` where outliers are trajectory ids
   */
  checkScoreConsistency(scores) {
    if (scores.length < 3) {
      return { consistent: true, outliers: [] };
    }

    // Group whole entries by metrics hash so outliers can be found without
    // rescanning every score for each group.
    const groups = new Map();
    for (const entry of scores) {
      const group = groups.get(entry.metricsHash);
      if (group) {
        group.push(entry);
      } else {
        groups.set(entry.metricsHash, [entry]);
      }
    }

    const outliers = [];
    for (const group of groups.values()) {
      if (group.length < 2) {
        continue;
      }
      const mean = group.reduce((sum, entry) => sum + entry.score, 0) / group.length;
      const variance =
        group.reduce((sum, entry) => sum + (entry.score - mean) ** 2, 0) / group.length;
      const stdDev = Math.sqrt(variance);

      // Flag if std deviation is too high (inconsistent scoring)
      if (stdDev <= 0.2) {
        continue;
      }
      for (const entry of group) {
        if (Math.abs(entry.score - mean) > 2 * stdDev) {
          outliers.push(entry.trajectoryId);
        }
      }
    }

    return {
      consistent: outliers.length === 0,
      outliers,
    };
  },
};
|