@elizaos/training 2.0.0-alpha.21 → 2.0.0-alpha.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. package/.turbo/turbo-lint.log +2 -0
  2. package/.turbo/turbo-typecheck.log +1 -0
  3. package/dist/.tsbuildinfo +1 -0
  4. package/dist/adapter.js +59 -0
  5. package/dist/archetypes/ArchetypeConfigService.js +510 -0
  6. package/dist/archetypes/derive-archetype.js +196 -0
  7. package/dist/archetypes/index.js +7 -0
  8. package/dist/benchmark/ArchetypeMatchupBenchmark.js +547 -0
  9. package/dist/benchmark/BenchmarkChartGenerator.js +632 -0
  10. package/dist/benchmark/BenchmarkDataGenerator.js +825 -0
  11. package/dist/benchmark/BenchmarkDataViewer.js +197 -0
  12. package/dist/benchmark/BenchmarkHistoryService.js +135 -0
  13. package/dist/benchmark/BenchmarkRunner.js +483 -0
  14. package/dist/benchmark/BenchmarkValidator.js +158 -0
  15. package/dist/benchmark/FastEvalRunner.js +133 -0
  16. package/dist/benchmark/MetricsValidator.js +104 -0
  17. package/dist/benchmark/MetricsVisualizer.js +775 -0
  18. package/dist/benchmark/ModelBenchmarkService.js +433 -0
  19. package/dist/benchmark/ModelRegistry.js +122 -0
  20. package/dist/benchmark/RulerBenchmarkIntegration.js +168 -0
  21. package/dist/benchmark/SimulationA2AInterface.js +683 -0
  22. package/dist/benchmark/SimulationEngine.js +522 -0
  23. package/dist/benchmark/TaskRunner.js +60 -0
  24. package/dist/benchmark/__tests__/BenchmarkRunner.test.js +409 -0
  25. package/dist/benchmark/__tests__/HeadToHead.test.js +105 -0
  26. package/dist/benchmark/index.js +23 -0
  27. package/dist/benchmark/parseSimulationMetrics.js +86 -0
  28. package/dist/benchmark/simulation-types.js +1 -0
  29. package/dist/dependencies.js +197 -0
  30. package/dist/generation/TrajectoryGenerator.js +244 -0
  31. package/dist/generation/index.js +6 -0
  32. package/dist/huggingface/HuggingFaceDatasetUploader.js +463 -0
  33. package/dist/huggingface/HuggingFaceIntegrationService.js +272 -0
  34. package/dist/huggingface/HuggingFaceModelUploader.js +385 -0
  35. package/dist/huggingface/index.js +9 -0
  36. package/dist/huggingface/shared/HuggingFaceUploadUtil.js +144 -0
  37. package/dist/index.js +41 -0
  38. package/dist/init-training.js +43 -0
  39. package/dist/metrics/TrajectoryMetricsExtractor.js +523 -0
  40. package/dist/metrics/__tests__/TrajectoryMetricsExtractor.test.js +628 -0
  41. package/dist/metrics/index.js +7 -0
  42. package/dist/metrics/types.js +21 -0
  43. package/dist/rubrics/__tests__/index.test.js +150 -0
  44. package/dist/rubrics/ass-kisser.js +83 -0
  45. package/dist/rubrics/degen.js +78 -0
  46. package/dist/rubrics/goody-twoshoes.js +82 -0
  47. package/dist/rubrics/index.js +184 -0
  48. package/dist/rubrics/information-trader.js +82 -0
  49. package/dist/rubrics/infosec.js +99 -0
  50. package/dist/rubrics/liar.js +102 -0
  51. package/dist/rubrics/perps-trader.js +85 -0
  52. package/dist/rubrics/researcher.js +79 -0
  53. package/dist/rubrics/scammer.js +80 -0
  54. package/dist/rubrics/social-butterfly.js +71 -0
  55. package/dist/rubrics/super-predictor.js +95 -0
  56. package/dist/rubrics/trader.js +65 -0
  57. package/dist/scoring/ArchetypeScoringService.js +301 -0
  58. package/dist/scoring/JudgePromptBuilder.js +401 -0
  59. package/dist/scoring/LLMJudgeCache.js +263 -0
  60. package/dist/scoring/index.js +8 -0
  61. package/dist/training/AutomationPipeline.js +714 -0
  62. package/dist/training/BenchmarkService.js +370 -0
  63. package/dist/training/ConfigValidator.js +153 -0
  64. package/dist/training/MarketOutcomesTracker.js +142 -0
  65. package/dist/training/ModelDeployer.js +128 -0
  66. package/dist/training/ModelFetcher.js +48 -0
  67. package/dist/training/ModelSelectionService.js +248 -0
  68. package/dist/training/ModelUsageVerifier.js +106 -0
  69. package/dist/training/MultiModelOrchestrator.js +349 -0
  70. package/dist/training/RLModelConfig.js +295 -0
  71. package/dist/training/RewardBackpropagationService.js +117 -0
  72. package/dist/training/RulerScoringService.js +450 -0
  73. package/dist/training/TrainingMonitor.js +108 -0
  74. package/dist/training/TrajectoryRecorder.js +281 -0
  75. package/dist/training/__tests__/TrajectoryRecorder.test.js +363 -0
  76. package/dist/training/index.js +30 -0
  77. package/dist/training/logRLConfig.js +29 -0
  78. package/dist/training/pipeline.js +80 -0
  79. package/dist/training/storage/ModelStorageService.js +190 -0
  80. package/dist/training/storage/TrainingDataArchiver.js +136 -0
  81. package/dist/training/storage/index.js +7 -0
  82. package/dist/training/types.js +6 -0
  83. package/dist/training/window-utils.js +100 -0
  84. package/dist/utils/index.js +73 -0
  85. package/dist/utils/logger.js +55 -0
  86. package/dist/utils/snowflake.js +15 -0
  87. package/dist/utils/synthetic-detector.js +67 -0
  88. package/package.json +2 -2
  89. package/research-output/training-runs/training-run-1773742857616.json +38 -0
  90. package/research-output/training-runs/training-run-1773742946977.json +38 -0
  91. package/research-output/training-runs/training-run-1773743278891.json +38 -0
  92. package/research-output/training-runs/training-run-1773743409754.json +38 -0
  93. package/research-output/training-runs/training-run-1773743651086.json +38 -0
  94. package/research-output/training-runs/training-run-1773743782883.json +38 -0
@@ -0,0 +1,401 @@
1
+ /**
2
+ * JudgePromptBuilder
3
+ *
4
+ * Builds LLM judge prompts with trajectory metrics context and archetype-specific rubrics.
5
+ * Metrics are included as CONTEXT for the judge, not weighted directly.
6
+ *
7
+ * @packageDocumentation
8
+ */
9
+ import { getMetricsSummary } from "../metrics/types";
10
+ import { getPriorityMetrics, getRubric } from "../rubrics";
11
/**
 * Default prompt options, merged under caller-supplied options.
 *
 * - includeActionDetails: when true, a "Recent Actions" section is appended.
 * - maxActionsToShow: how many trailing steps that section lists.
 * - includeKeyDecisions: when true, a "Key Decisions" section is appended
 *   (only if the trajectory contains any key actions).
 *
 * Frozen so that no consumer can accidentally change the shared defaults;
 * callers always work on a spread copy.
 */
const DEFAULT_OPTIONS = Object.freeze({
  includeActionDetails: false,
  maxActionsToShow: 20,
  includeKeyDecisions: true,
});
16
/**
 * Builds prompts for LLM-as-judge scoring.
 *
 * Two prompt flavours are produced, each as a `{ system, user }` pair:
 * - single-trajectory scoring (`buildSinglePrompt`)
 * - relative scoring of several trajectories (`buildComparisonPrompt`)
 *
 * The rubric text and the list of priority metrics are looked up per
 * archetype through `getRubric` / `getPriorityMetrics`.
 */
export class JudgePromptBuilder {
  /**
   * Build prompt for single trajectory scoring.
   * @param trajectory - Trajectory context (agentId, archetype, steps, metrics, ...)
   * @param options - Prompt options, merged over DEFAULT_OPTIONS
   * @returns System and user prompts
   */
  buildSinglePrompt(trajectory, options = {}) {
    const opts = { ...DEFAULT_OPTIONS, ...options };
    const archetype = trajectory.archetype || "default";
    const rubric = getRubric(archetype);
    const priorityMetrics = getPriorityMetrics(archetype);
    const system = this.buildSystemPrompt(archetype, rubric);
    const user = this.buildUserPrompt(trajectory, priorityMetrics, opts);
    return { system, user };
  }

  /**
   * Build a judge prompt for comparing multiple trajectories (RULER style).
   * @param trajectories - Trajectories to compare
   * @param scenarioId - Identifier printed in the prompt header
   * @param options - Prompt options, merged over DEFAULT_OPTIONS
   * @returns System and user prompts
   */
  buildComparisonPrompt(trajectories, scenarioId, options = {}) {
    const opts = { ...DEFAULT_OPTIONS, ...options };
    // Get archetype from first trajectory (assume all same archetype for comparison)
    const archetype = trajectories[0]?.archetype || "default";
    const rubric = getRubric(archetype);
    const priorityMetrics = getPriorityMetrics(archetype);
    const system = this.buildComparisonSystemPrompt(archetype, rubric);
    const user = this.buildComparisonUserPrompt(trajectories, scenarioId, priorityMetrics, opts);
    return { system, user };
  }

  /**
   * Build system prompt for single trajectory evaluation.
   * @param archetype - Archetype name interpolated into the prompt
   * @param rubric - Rubric text embedded verbatim
   * @returns System prompt text
   */
  buildSystemPrompt(archetype, rubric) {
    return `You are an expert evaluator of AI agent performance in prediction market simulations.

You are evaluating an agent with the "${archetype}" archetype. This archetype has specific goals and behaviors that should be evaluated differently than a generic agent.

${rubric}

Your task is to score this trajectory on a scale of 0.0 to 1.0 based on how well the agent embodied the "${archetype}" archetype's values and achieved its goals.

IMPORTANT: The metrics provided are CONTEXT to inform your judgment. Use them to understand what happened, but make a holistic evaluation based on the rubric - don't just calculate a weighted average of metrics.`;
  }

  /**
   * Build system prompt for RULER comparison.
   * @param archetype - Archetype name interpolated into the prompt
   * @param rubric - Rubric text embedded verbatim
   * @returns System prompt text
   */
  buildComparisonSystemPrompt(archetype, rubric) {
    return `You are an expert evaluator of AI agent performance. All trajectories below were given the same scenario and are from "${archetype}" archetype agents.

Your job is to compare them RELATIVE to each other and assign scores from 0 to 1 based on how well each trajectory achieved the archetype's goals.

${rubric}

IMPORTANT RULER PRINCIPLES:
- A trajectory that achieves its archetype's goals should score significantly higher than one that doesn't
- A trajectory that achieves goals more efficiently should score higher
- If one trajectory is only slightly better, score differences should be small
- If one is significantly better, score differences should be large
- You may give partial credit for progress towards goals

The metrics provided are CONTEXT to inform your judgment. Use them to understand what happened, then make holistic evaluations based on the archetype rubric.`;
  }

  /**
   * Build user prompt with trajectory context and metrics.
   * @param trajectory - Trajectory context; `metrics` must be fully populated
   * @param priorityMetrics - Dot-paths ("category.key") of metrics to highlight
   * @param options - Prompt options; defaults to DEFAULT_OPTIONS when omitted
   * @returns User prompt text
   */
  buildUserPrompt(trajectory, priorityMetrics, options = DEFAULT_OPTIONS) {
    const steps = trajectory.steps ?? [];
    const parts = [];
    // Agent info
    parts.push(`## Agent Information`);
    parts.push(`- Agent ID: ${trajectory.agentId}`);
    parts.push(`- Archetype: ${trajectory.archetype || "unknown"}`);
    parts.push(`- Episode Length: ${trajectory.episodeLength || steps.length} ticks`);
    parts.push("");
    // Metrics section
    parts.push(`## Behavioral Metrics`);
    parts.push(this.formatMetrics(trajectory.metrics, priorityMetrics));
    parts.push("");
    // Action summary
    parts.push(`## Action Summary`);
    parts.push(this.summarizeActions(steps));
    parts.push("");
    // Key decisions (if requested)
    if (options.includeKeyDecisions) {
      const keyDecisions = this.extractKeyDecisions(steps);
      if (keyDecisions) {
        parts.push(`## Key Decisions`);
        parts.push(keyDecisions);
        parts.push("");
      }
    }
    // Recent actions (if requested)
    if (options.includeActionDetails) {
      // Resolve the limit once so the header and the listing always agree.
      const maxActions = options.maxActionsToShow || 20;
      parts.push(`## Recent Actions (last ${maxActions})`);
      parts.push(this.formatRecentActions(steps, maxActions));
      parts.push("");
    }
    // Instructions
    parts.push(`## Instructions`);
    parts.push(`Score this trajectory on a scale of 0.0 to 1.0 based on how well it embodies the ${trajectory.archetype || "agent"} archetype's values.`);
    parts.push("");
    parts.push(`Respond with JSON:`);
    parts.push(`{
  "score": <float 0-1>,
  "reasoning": "<2-3 sentence explanation>",
  "strengths": ["<strength 1>", "<strength 2>"],
  "weaknesses": ["<weakness 1>", "<weakness 2>"]
}`);
    return parts.join("\n");
  }

  /**
   * Build user prompt for RULER comparison.
   *
   * Trajectories are labelled `trajectory-<index + 1>`; falsy entries are
   * skipped but keep their index so labels stay aligned with the input array.
   * @param trajectories - Trajectories to compare
   * @param scenarioId - Identifier printed in the prompt header
   * @param priorityMetrics - Dot-paths of metrics to highlight
   * @param _options - Currently unused
   * @returns User prompt text
   */
  buildComparisonUserPrompt(trajectories, scenarioId, priorityMetrics, _options) {
    const parts = [];
    parts.push(`## Scenario: ${scenarioId}`);
    parts.push(`## Number of Trajectories: ${trajectories.length}`);
    parts.push("");
    // Performance context for all trajectories
    parts.push(`## Trajectory Performance Context`);
    parts.push(`(Use this to inform your scoring)`);
    parts.push("");
    trajectories.forEach((traj, index) => {
      if (!traj) {
        return;
      }
      const steps = traj.steps ?? [];
      const totalReward = Number.isFinite(traj.totalReward)
        ? traj.totalReward.toFixed(2)
        : "0.00";
      parts.push(`### trajectory-${index + 1}`);
      parts.push(`- Archetype: ${traj.archetype || "unknown"}`);
      parts.push(`- Episode Length: ${traj.episodeLength || steps.length} steps`);
      parts.push(`- Total Reward: ${totalReward}`);
      parts.push("");
      // Key metrics for this trajectory
      parts.push(`**Key Metrics:**`);
      parts.push(this.formatMetrics(traj.metrics, priorityMetrics));
      parts.push("");
      // Action summary
      parts.push(`**Actions:**`);
      parts.push(this.summarizeActions(steps));
      parts.push("");
    });
    // Instructions
    parts.push(`## Instructions`);
    parts.push(`Score each trajectory from 0.0 to 1.0 RELATIVE to each other based on the archetype rubric.`);
    parts.push("");
    parts.push(`Respond with ONLY valid JSON:`);
    parts.push(`{
  "scores": [
    {
      "trajectory_id": "trajectory-1",
      "explanation": "Brief explanation",
      "score": 0.85
    },
    {
      "trajectory_id": "trajectory-2",
      "explanation": "Brief explanation",
      "score": 0.65
    }
  ]
}`);
    return parts.join("\n");
  }

  /**
   * Format metrics for prompt, highlighting priority metrics first.
   *
   * At most six priority metrics are shown. The remaining sections read
   * `metrics.social`, `.trading`, `.influence`, `.behavior` and `.information`
   * directly, so all five groups must be present.
   * @param metrics - Trajectory metrics object
   * @param priorityMetrics - Dot-paths of metrics to highlight (may be empty)
   * @returns Markdown-formatted metrics text
   */
  formatMetrics(metrics, priorityMetrics = []) {
    const lines = [];
    // Show priority metrics first with emphasis
    if (priorityMetrics.length > 0) {
      lines.push("### ⭐ KEY METRICS FOR THIS ARCHETYPE");
      for (const metricPath of priorityMetrics.slice(0, 6)) {
        const value = this.getMetricValue(metrics, metricPath);
        const label = this.formatMetricLabel(metricPath);
        lines.push(`- **${label}**: ${value}`);
      }
      lines.push("");
    }
    // Summary metrics
    const summary = getMetricsSummary(metrics);
    lines.push("### Performance Summary");
    lines.push(`- Total P&L: $${summary.totalPnL.toFixed(2)}`);
    lines.push(`- Win Rate: ${(summary.winRate * 100).toFixed(1)}%`);
    lines.push(`- Trades Executed: ${summary.tradesExecuted}`);
    lines.push(`- Action Success Rate: ${(summary.actionSuccessRate * 100).toFixed(1)}%`);
    lines.push("");
    // Social metrics
    lines.push("### Social Activity");
    lines.push(`- Unique Users Interacted: ${metrics.social.uniqueUsersInteracted}`);
    lines.push(`- Group Chats Joined: ${metrics.social.groupChatsJoined}`);
    lines.push(`- DMs Initiated: ${metrics.social.dmsInitiated}`);
    lines.push(`- Posts Created: ${metrics.social.postsCreated}`);
    lines.push(`- Comments Made: ${metrics.social.commentsMade}`);
    lines.push(`- Social to Trade Ratio: ${metrics.behavior.socialToTradeRatio.toFixed(2)}`);
    lines.push("");
    // Trading metrics
    lines.push("### Trading Performance");
    lines.push(`- Total P&L: $${metrics.trading.totalPnL.toFixed(2)}`);
    lines.push(`- Win Rate: ${(metrics.trading.winRate * 100).toFixed(1)}%`);
    lines.push(`- Sharpe Ratio: ${metrics.trading.sharpeRatio.toFixed(2)}`);
    lines.push(`- Max Drawdown: $${metrics.trading.maxDrawdown.toFixed(2)}`);
    lines.push(`- Markets Traded: ${metrics.trading.marketsTraded}`);
    lines.push(`- Largest Win: $${metrics.trading.largestWin.toFixed(2)}`);
    lines.push(`- Largest Loss: $${metrics.trading.largestLoss.toFixed(2)}`);
    lines.push("");
    // Influence metrics
    lines.push("### Influence");
    lines.push(`- Followers Gained: ${metrics.influence.followersGained}`);
    lines.push(`- Reputation Delta: ${metrics.influence.reputationDelta > 0 ? "+" : ""}${metrics.influence.reputationDelta}`);
    lines.push(`- Positive Reactions: ${metrics.influence.positiveReactions}`);
    lines.push(`- Information Spread: ${metrics.influence.informationSpread}`);
    lines.push("");
    // Behavior metrics
    lines.push("### Behavior Patterns");
    lines.push(`- Actions Per Tick: ${metrics.behavior.actionsPerTick.toFixed(2)}`);
    lines.push(`- Consistency Score: ${(metrics.behavior.consistencyScore * 100).toFixed(1)}%`);
    lines.push(`- Dominant Action: ${metrics.behavior.dominantActionType || "none"}`);
    lines.push("");
    // Information metrics
    lines.push("### Information Activity");
    lines.push(`- Research Actions: ${metrics.information.researchActions}`);
    lines.push(`- Predictions Made: ${metrics.information.predictionsMade}`);
    lines.push(`- Prediction Accuracy: ${(metrics.information.predictionAccuracy * 100).toFixed(1)}%`);
    return lines.join("\n");
  }

  /**
   * Get a metric value from the metrics object using a dot-path.
   *
   * Only the first two path segments ("category.key") are used. Unknown
   * categories, missing groups/keys and non-finite numbers yield "N/A".
   * Number formatting is chosen from the key name: Rate/Accuracy/Score ->
   * percentage, PnL/Win/Loss/Drawdown -> dollars, Ratio -> two decimals.
   * @param metrics - Trajectory metrics object
   * @param path - Metric path such as "trading.winRate"
   * @returns Formatted value, or "N/A"
   */
  getMetricValue(metrics, path) {
    const [category, key] = path.split(".");
    if (!category || !key) {
      return "N/A";
    }
    const knownCategories = ["trading", "social", "influence", "behavior", "information"];
    if (!knownCategories.includes(category)) {
      return "N/A";
    }
    // Optional chaining: a metrics object lacking this category is "N/A", not a crash.
    const value = metrics?.[category]?.[key];
    if (value === undefined || value === null) {
      return "N/A";
    }
    if (typeof value !== "number") {
      return String(value);
    }
    if (!Number.isFinite(value)) {
      return "N/A";
    }
    const keyHasAny = (fragments) => fragments.some((fragment) => key.includes(fragment));
    // Rates/percentages are stored as fractions of 1
    if (keyHasAny(["Rate", "Accuracy", "Score"])) {
      return `${(value * 100).toFixed(1)}%`;
    }
    // Currency amounts
    if (keyHasAny(["PnL", "Win", "Loss", "Drawdown"])) {
      return `$${value.toFixed(2)}`;
    }
    // Ratios
    if (key.includes("Ratio")) {
      return value.toFixed(2);
    }
    // Integer-like values are printed without decimals
    return Number.isInteger(value) ? String(value) : value.toFixed(2);
  }

  /**
   * Format a metric path into a human-readable label.
   *
   * The key part is converted from camelCase to Title Case with spaces; the
   * "PnL" fragment, which the per-capital split would render as "Pn L", is
   * normalised to "P&L" to match the rest of the prompt.
   * @param path - Metric path such as "trading.totalPnL"
   * @returns Label such as "Total P&L", or the path itself if it has no key part
   */
  formatMetricLabel(path) {
    const [, key] = path.split(".");
    if (!key) {
      return path;
    }
    return key
      .replace(/([A-Z])/g, " $1")
      .replace(/^./, (first) => first.toUpperCase())
      .replace(/\bPn L\b/g, "P&L")
      .trim();
  }

  /**
   * Summarize actions in trajectory.
   *
   * Steps without an `action` are ignored everywhere, so the total always
   * equals successful + failed.
   * @param steps - Trajectory steps
   * @returns Two-line summary: totals and per-type counts (most frequent first)
   */
  summarizeActions(steps = []) {
    const actionCounts = new Map();
    let successCount = 0;
    let errorCount = 0;
    for (const step of steps) {
      const action = step.action;
      if (!action) {
        continue;
      }
      const actionType = action.actionType ?? "unknown";
      actionCounts.set(actionType, (actionCounts.get(actionType) || 0) + 1);
      if (action.success) {
        successCount++;
      } else {
        errorCount++;
      }
    }
    const totalActions = successCount + errorCount;
    const actionTypes = Array.from(actionCounts.entries())
      .sort((a, b) => b[1] - a[1])
      .map(([type, count]) => `${type}(${count})`)
      .join(", ");
    return [
      `- Total Actions: ${totalActions} (${successCount} successful, ${errorCount} failed)`,
      `- Action Types: ${actionTypes || "none"}`,
    ].join("\n");
  }

  /**
   * Extract key decisions (trades, significant social actions).
   *
   * An action is "key" when its lower-cased type is one of: trade, buy, sell,
   * predict, create_group_chat, post. Only the 10 most recent are returned.
   * @param steps - Trajectory steps
   * @returns Bullet list text, or null when there are no key actions
   */
  extractKeyDecisions(steps = []) {
    const keyActionTypes = new Set([
      "trade",
      "buy",
      "sell",
      "predict",
      "create_group_chat",
      "post",
    ]);
    const keyActions = [];
    for (const step of steps) {
      const action = step.action;
      if (!action) {
        continue;
      }
      // Coerce so an action without a type is skipped instead of throwing.
      if (!keyActionTypes.has(String(action.actionType ?? "").toLowerCase())) {
        continue;
      }
      const params = action.parameters || {};
      const result = action.result || {};
      let description = `${action.actionType}`;
      // Add relevant details
      if (params.amount || params.size) {
        description += ` (size: ${params.amount || params.size})`;
      }
      if (params.marketId || params.market) {
        description += ` on ${params.marketId || params.market}`;
      }
      if (result.pnl !== undefined && result.pnl !== null) {
        const pnl = Number(result.pnl);
        // Skip unparseable values rather than printing "$NaN"; keep the sign
        // in front of the dollar symbol for both gains and losses.
        if (Number.isFinite(pnl)) {
          description += ` → P&L: ${pnl >= 0 ? "+" : "-"}$${Math.abs(pnl).toFixed(2)}`;
        }
      }
      keyActions.push(`- ${description} ${action.success ? "✓" : "✗"}`);
    }
    if (keyActions.length === 0) {
      return null;
    }
    // Limit to most recent 10 key actions
    return keyActions.slice(-10).join("\n");
  }

  /**
   * Format recent actions for detailed view.
   *
   * Takes the last `maxActions` steps (steps without an action are skipped
   * after slicing). Reasoning is cut to 50 characters, with "..." appended
   * only when something was actually cut.
   * @param steps - Trajectory steps
   * @param maxActions - Number of trailing steps to consider; values <= 0 list nothing
   * @returns One line per action, or "No actions recorded"
   */
  formatRecentActions(steps = [], maxActions = DEFAULT_OPTIONS.maxActionsToShow) {
    const maxReasoningLength = 50;
    // slice(-0) would return every step, so handle non-positive limits explicitly.
    const recentSteps = maxActions > 0 ? steps.slice(-maxActions) : [];
    const lines = [];
    for (const step of recentSteps) {
      const action = step.action;
      if (!action) {
        continue;
      }
      const success = action.success ? "✓" : "✗";
      let reasoning = "";
      if (action.reasoning) {
        const text = String(action.reasoning);
        const ellipsis = text.length > maxReasoningLength ? "..." : "";
        reasoning = ` | Reason: ${text.substring(0, maxReasoningLength)}${ellipsis}`;
      }
      lines.push(`- [${step.stepNumber}] ${action.actionType} ${success}${reasoning}`);
    }
    return lines.join("\n") || "No actions recorded";
  }
}
/**
 * Singleton instance
 */
export const judgePromptBuilder = new JudgePromptBuilder();
@@ -0,0 +1,263 @@
1
+ /**
2
+ * LLMJudgeCache
3
+ *
4
+ * Caches LLM-as-judge scoring results to:
5
+ * 1. Avoid redundant API calls for identical trajectories
6
+ * 2. Enable fast re-scoring when rubrics change
7
+ * 3. Provide validation of cached scores
8
+ *
9
+ * Uses content-addressable hashing: cache key = hash(trajectory_content + rubric_version)
10
+ *
11
+ * @packageDocumentation
12
+ */
13
+ import { createHash } from "node:crypto";
14
+ import { getTrainingDataAdapter } from "../adapter";
15
+ import { getRubricHash, RUBRICS_VERSION } from "../rubrics";
16
+ import { logger } from "../utils/logger";
17
/**
 * Default cache configuration, merged under the constructor argument.
 * Frozen so the shared defaults cannot be mutated through a reference.
 */
const DEFAULT_CONFIG = Object.freeze({
  ttlHours: 168, // 1 week
  maxEntries: 10000,
  validateRubricVersion: true,
});
/**
 * In-memory LLM judge cache with validation.
 *
 * Entries are keyed by a hash of trajectory id, serialized steps, archetype
 * and RUBRICS_VERSION. An entry is served only while it has not expired and,
 * when `validateRubricVersion` is enabled, while its stored rubric version
 * and per-archetype rubric hash still match the current ones.
 */
export class LLMJudgeCache {
  cache = new Map();
  config;
  stats = {
    hits: 0,
    misses: 0,
    invalidations: 0,
    hitRate: 0,
  };

  /**
   * @param config - Partial configuration; missing fields fall back to DEFAULT_CONFIG
   */
  constructor(config = {}) {
    this.config = { ...DEFAULT_CONFIG, ...config };
  }

  /**
   * Generate a cache key from trajectory content and archetype.
   * @returns First 32 hex characters of the SHA-256 of the combined content
   */
  generateCacheKey(trajectoryId, stepsJson, archetype) {
    const content = `${trajectoryId}:${stepsJson}:${archetype}:${RUBRICS_VERSION}`;
    return createHash("sha256").update(content).digest("hex").substring(0, 32);
  }

  /**
   * Check if a cached score is valid.
   * @param cached - Cache entry
   * @param archetype - Archetype whose current rubric hash is compared
   * @returns false when expired or (if enabled) when the rubric changed
   */
  isValid(cached, archetype) {
    // Check expiration
    if (new Date() > cached.expiresAt) {
      return false;
    }
    if (!this.config.validateRubricVersion) {
      return true;
    }
    // Both the global rubric version and this archetype's rubric hash must match
    return (
      cached.rubricVersion === RUBRICS_VERSION &&
      cached.rubricHash === getRubricHash(archetype)
    );
  }

  /**
   * Get a cached score if available and valid.
   * Updates hit/miss statistics; an invalid entry is deleted and counted as
   * both an invalidation and a miss.
   * @returns The cache entry, or null
   */
  get(trajectoryId, stepsJson, archetype) {
    const cacheKey = this.generateCacheKey(trajectoryId, stepsJson, archetype);
    const cached = this.cache.get(cacheKey);
    if (!cached) {
      this.stats.misses++;
      this.updateHitRate();
      return null;
    }
    if (!this.isValid(cached, archetype)) {
      this.cache.delete(cacheKey);
      this.stats.invalidations++;
      this.stats.misses++;
      this.updateHitRate();
      return null;
    }
    this.stats.hits++;
    this.updateHitRate();
    logger.debug("Cache hit", { trajectoryId, archetype, cacheKey: cacheKey.substring(0, 8) }, "LLMJudgeCache");
    return cached;
  }

  /**
   * Store a score in the cache.
   *
   * The entry is stamped with the current time, the current rubric version
   * and the archetype's current rubric hash, and expires after `ttlHours`.
   */
  set(trajectoryId, stepsJson, archetype, score, reasoning, strengths = [], weaknesses = []) {
    const cacheKey = this.generateCacheKey(trajectoryId, stepsJson, archetype);
    // Enforce max entries limit. Overwriting an existing key does not grow
    // the cache, so evicting in that case would drop a valid entry for nothing.
    if (!this.cache.has(cacheKey) && this.cache.size >= this.config.maxEntries) {
      this.evictOldest();
    }
    const now = new Date();
    const expiresAt = new Date(now.getTime() + this.config.ttlHours * 60 * 60 * 1000);
    const entry = {
      cacheKey,
      trajectoryId,
      archetype,
      score,
      reasoning,
      strengths,
      weaknesses,
      rubricVersion: RUBRICS_VERSION,
      rubricHash: getRubricHash(archetype),
      scoredAt: now,
      expiresAt,
    };
    this.cache.set(cacheKey, entry);
    logger.debug("Cache set", { trajectoryId, archetype, score, cacheKey: cacheKey.substring(0, 8) }, "LLMJudgeCache");
  }

  /**
   * Evict the oldest cache entry (smallest `scoredAt`; on ties the one
   * encountered first during iteration). No-op on an empty cache.
   */
  evictOldest() {
    let oldestKey = null;
    let oldestTime = Infinity;
    for (const [key, entry] of this.cache) {
      const entryTime = entry.scoredAt.getTime();
      if (entryTime < oldestTime) {
        oldestTime = entryTime;
        oldestKey = key;
      }
    }
    if (oldestKey) {
      this.cache.delete(oldestKey);
    }
  }

  /**
   * Update hit rate statistic (hits / (hits + misses), 0 when nothing was looked up).
   */
  updateHitRate() {
    const total = this.stats.hits + this.stats.misses;
    this.stats.hitRate = total > 0 ? this.stats.hits / total : 0;
  }

  /**
   * Invalidate all cache entries for an archetype (when rubric changes).
   * @returns Number of entries removed
   */
  invalidateArchetype(archetype) {
    let invalidated = 0;
    for (const [key, entry] of this.cache) {
      if (entry.archetype === archetype) {
        this.cache.delete(key);
        invalidated++;
      }
    }
    this.stats.invalidations += invalidated;
    logger.info("Invalidated cache entries", { archetype, count: invalidated }, "LLMJudgeCache");
    return invalidated;
  }

  /**
   * Invalidate all cache entries; each removed entry counts as an invalidation.
   */
  clear() {
    const count = this.cache.size;
    this.cache.clear();
    this.stats.invalidations += count;
    logger.info("Cleared cache", { count }, "LLMJudgeCache");
  }

  /**
   * Get cache statistics.
   * @returns A shallow copy, so callers cannot alter the live counters
   */
  getStats() {
    return { ...this.stats };
  }

  /**
   * Get cache size (number of stored entries).
   */
  size() {
    return this.cache.size;
  }

  /**
   * Warm cache from database.
   * Loads previously scored trajectories into cache.
   *
   * NOTE(review): entries are stored under the "default" archetype, so a
   * later get() with the trajectory's real archetype computes a different
   * key — confirm callers look warmed entries up with "default".
   * NOTE(review): entries go through set(), so they are stamped with the
   * current time and current rubric version; `row.judgedAt` is only used as
   * a presence check.
   * @param limit - Maximum number of rows requested from the adapter
   * @returns Number of rows loaded into the cache
   */
  async warmFromDatabase(limit = 1000) {
    const results = await getTrainingDataAdapter().getScoredTrajectories(limit);
    let loaded = 0;
    for (const row of results) {
      if (row.aiJudgeReward !== null && row.aiJudgeReasoning && row.judgedAt) {
        // Use 'default' archetype for warmed entries since archetype isn't stored
        this.set(row.trajectoryId, row.stepsJson, "default", row.aiJudgeReward, row.aiJudgeReasoning);
        loaded++;
      }
    }
    logger.info("Warmed cache from database", { loaded, attempted: results.length }, "LLMJudgeCache");
    return loaded;
  }
}
/**
 * Singleton cache instance
 */
export const llmJudgeCache = new LLMJudgeCache();
197
/**
 * Score validation utilities.
 *
 * The methods reference `scoreValidator` by name rather than `this`, so they
 * keep working when destructured or passed as callbacks.
 */
export const scoreValidator = {
  /**
   * Validate a score is in valid range.
   * @returns true only for a finite number within [0, 1]
   */
  isValidScore(score) {
    return typeof score === "number" && Number.isFinite(score) && score >= 0 && score <= 1;
  },

  /**
   * Validate reasoning is meaningful.
   * @returns true only for a string of 20 to 5000 characters
   */
  isValidReasoning(reasoning) {
    return (
      typeof reasoning === "string" &&
      reasoning.length >= 20 &&
      reasoning.length <= 5000
    );
  },

  /**
   * Validate a complete score response.
   * @param response - Object with `score` and `reasoning`; null/non-objects are invalid
   */
  isValidScoreResponse(response) {
    if (response === null || typeof response !== "object") {
      return false;
    }
    return (
      scoreValidator.isValidScore(response.score) &&
      scoreValidator.isValidReasoning(response.reasoning)
    );
  },

  /**
   * Check if scores are consistent (similar trajectories should have similar scores).
   *
   * Scores are grouped by `metricsHash`. Within a group of at least two
   * scores whose population standard deviation exceeds 0.2, every score more
   * than two standard deviations from the group mean is reported. Fewer than
   * three scores overall are always considered consistent.
   * @param scores - Items with `trajectoryId`, `metricsHash` and `score`
   * @returns `{ consistent, outliers }` where outliers are trajectory ids
   */
  checkScoreConsistency(scores) {
    const maxStdDev = 0.2;
    const outlierSigmas = 2;
    if (scores.length < 3) {
      return { consistent: true, outliers: [] };
    }
    // Group whole entries by metrics hash so outliers can be found without
    // rescanning the full list for every group.
    const byMetrics = new Map();
    for (const entry of scores) {
      const group = byMetrics.get(entry.metricsHash);
      if (group) {
        group.push(entry);
      } else {
        byMetrics.set(entry.metricsHash, [entry]);
      }
    }
    const outliers = [];
    // Check variance within groups
    for (const group of byMetrics.values()) {
      if (group.length < 2) {
        continue;
      }
      const mean = group.reduce((sum, entry) => sum + entry.score, 0) / group.length;
      const variance =
        group.reduce((sum, entry) => sum + (entry.score - mean) ** 2, 0) / group.length;
      const stdDev = Math.sqrt(variance);
      // Flag if std deviation is too high (inconsistent scoring)
      if (stdDev <= maxStdDev) {
        continue;
      }
      for (const entry of group) {
        if (Math.abs(entry.score - mean) > outlierSigmas * stdDev) {
          outliers.push(entry.trajectoryId);
        }
      }
    }
    return {
      consistent: outliers.length === 0,
      outliers,
    };
  },
};
@@ -0,0 +1,8 @@
1
+ /**
2
+ * Scoring Module
3
+ *
4
+ * LLM-as-judge scoring with archetype-specific rubrics.
5
+ */
6
+ export * from "./ArchetypeScoringService";
7
+ export * from "./JudgePromptBuilder";
8
+ export * from "./LLMJudgeCache";