@elizaos/training 2.0.0-alpha.76 → 2.0.0-alpha.78

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89):
  1. package/package.json +2 -2
  2. package/.turbo/turbo-lint.log +0 -3
  3. package/.turbo/turbo-typecheck.log +0 -1
  4. package/dist/.tsbuildinfo +0 -1
  5. package/dist/adapter.js +0 -59
  6. package/dist/archetypes/ArchetypeConfigService.js +0 -510
  7. package/dist/archetypes/derive-archetype.js +0 -196
  8. package/dist/archetypes/index.js +0 -7
  9. package/dist/benchmark/ArchetypeMatchupBenchmark.js +0 -547
  10. package/dist/benchmark/BenchmarkChartGenerator.js +0 -632
  11. package/dist/benchmark/BenchmarkDataGenerator.js +0 -825
  12. package/dist/benchmark/BenchmarkDataViewer.js +0 -197
  13. package/dist/benchmark/BenchmarkHistoryService.js +0 -135
  14. package/dist/benchmark/BenchmarkRunner.js +0 -483
  15. package/dist/benchmark/BenchmarkValidator.js +0 -158
  16. package/dist/benchmark/FastEvalRunner.js +0 -133
  17. package/dist/benchmark/MetricsValidator.js +0 -104
  18. package/dist/benchmark/MetricsVisualizer.js +0 -775
  19. package/dist/benchmark/ModelBenchmarkService.js +0 -433
  20. package/dist/benchmark/ModelRegistry.js +0 -122
  21. package/dist/benchmark/RulerBenchmarkIntegration.js +0 -168
  22. package/dist/benchmark/SimulationA2AInterface.js +0 -683
  23. package/dist/benchmark/SimulationEngine.js +0 -522
  24. package/dist/benchmark/TaskRunner.js +0 -60
  25. package/dist/benchmark/__tests__/BenchmarkRunner.test.js +0 -409
  26. package/dist/benchmark/__tests__/HeadToHead.test.js +0 -105
  27. package/dist/benchmark/index.js +0 -23
  28. package/dist/benchmark/parseSimulationMetrics.js +0 -86
  29. package/dist/benchmark/simulation-types.js +0 -1
  30. package/dist/dependencies.js +0 -197
  31. package/dist/generation/TrajectoryGenerator.js +0 -244
  32. package/dist/generation/index.js +0 -6
  33. package/dist/huggingface/HuggingFaceDatasetUploader.js +0 -463
  34. package/dist/huggingface/HuggingFaceIntegrationService.js +0 -272
  35. package/dist/huggingface/HuggingFaceModelUploader.js +0 -385
  36. package/dist/huggingface/index.js +0 -9
  37. package/dist/huggingface/shared/HuggingFaceUploadUtil.js +0 -144
  38. package/dist/index.js +0 -41
  39. package/dist/init-training.js +0 -43
  40. package/dist/metrics/TrajectoryMetricsExtractor.js +0 -523
  41. package/dist/metrics/__tests__/TrajectoryMetricsExtractor.test.js +0 -628
  42. package/dist/metrics/index.js +0 -7
  43. package/dist/metrics/types.js +0 -21
  44. package/dist/rubrics/__tests__/index.test.js +0 -150
  45. package/dist/rubrics/ass-kisser.js +0 -83
  46. package/dist/rubrics/degen.js +0 -78
  47. package/dist/rubrics/goody-twoshoes.js +0 -82
  48. package/dist/rubrics/index.js +0 -184
  49. package/dist/rubrics/information-trader.js +0 -82
  50. package/dist/rubrics/infosec.js +0 -99
  51. package/dist/rubrics/liar.js +0 -102
  52. package/dist/rubrics/perps-trader.js +0 -85
  53. package/dist/rubrics/researcher.js +0 -79
  54. package/dist/rubrics/scammer.js +0 -80
  55. package/dist/rubrics/social-butterfly.js +0 -71
  56. package/dist/rubrics/super-predictor.js +0 -95
  57. package/dist/rubrics/trader.js +0 -65
  58. package/dist/scoring/ArchetypeScoringService.js +0 -301
  59. package/dist/scoring/JudgePromptBuilder.js +0 -401
  60. package/dist/scoring/LLMJudgeCache.js +0 -263
  61. package/dist/scoring/index.js +0 -8
  62. package/dist/training/AutomationPipeline.js +0 -714
  63. package/dist/training/BenchmarkService.js +0 -370
  64. package/dist/training/ConfigValidator.js +0 -153
  65. package/dist/training/MarketOutcomesTracker.js +0 -142
  66. package/dist/training/ModelDeployer.js +0 -128
  67. package/dist/training/ModelFetcher.js +0 -48
  68. package/dist/training/ModelSelectionService.js +0 -248
  69. package/dist/training/ModelUsageVerifier.js +0 -106
  70. package/dist/training/MultiModelOrchestrator.js +0 -349
  71. package/dist/training/RLModelConfig.js +0 -295
  72. package/dist/training/RewardBackpropagationService.js +0 -117
  73. package/dist/training/RulerScoringService.js +0 -450
  74. package/dist/training/TrainingMonitor.js +0 -108
  75. package/dist/training/TrajectoryRecorder.js +0 -281
  76. package/dist/training/__tests__/TrajectoryRecorder.test.js +0 -363
  77. package/dist/training/index.js +0 -30
  78. package/dist/training/logRLConfig.js +0 -29
  79. package/dist/training/pipeline.js +0 -80
  80. package/dist/training/storage/ModelStorageService.js +0 -190
  81. package/dist/training/storage/TrainingDataArchiver.js +0 -136
  82. package/dist/training/storage/index.js +0 -7
  83. package/dist/training/types.js +0 -6
  84. package/dist/training/window-utils.js +0 -100
  85. package/dist/utils/index.js +0 -73
  86. package/dist/utils/logger.js +0 -55
  87. package/dist/utils/snowflake.js +0 -15
  88. package/dist/utils/synthetic-detector.js +0 -67
  89. package/vitest.config.ts +0 -8
@@ -1,301 +0,0 @@
1
- /**
2
- * ArchetypeScoringService
3
- *
4
- * Scores trajectories using LLM-as-judge with archetype-specific rubrics.
5
- * Supports both single trajectory scoring and RULER-style relative comparison.
6
- *
7
- * @packageDocumentation
8
- */
9
- import { getTrainingDataAdapter } from "../adapter";
10
- import { getLLMCaller } from "../dependencies";
11
- import { trajectoryMetricsExtractor } from "../metrics";
12
- import { hasCustomRubric } from "../rubrics";
13
- import { logger, splitIntoBatches } from "../utils";
14
- import { judgePromptBuilder, } from "./JudgePromptBuilder";
15
/**
 * Default scoring options, merged underneath any caller-supplied options.
 *
 * Frozen so the shared defaults cannot be altered at runtime; consumers are
 * expected to spread it into a fresh object rather than mutate it.
 */
const DEFAULT_OPTIONS = Object.freeze({
  // Passed through to the single-trajectory prompt builder.
  includeActionDetails: false,
  // When true, computed scores are written back through the training data adapter.
  saveToDatabase: true,
});
19
/**
 * Service for scoring trajectories with archetype-aware evaluation.
 *
 * Trajectories are loaded through the training data adapter, turned into a
 * judging context (parsed steps plus extracted metrics), sent to an LLM judge,
 * and the clamped score is optionally written back through the adapter.
 */
export class ArchetypeScoringService {
  /** Instruction appended to every judge prompt so the reply can be parsed as JSON. */
  static #JSON_ONLY_SUFFIX = "\n\nReturn ONLY valid JSON, no other text.";

  /** Fewest trajectories accepted for a group comparison. */
  minGroupSize = 2;
  /** Most trajectories placed in a single comparison prompt. */
  maxGroupSize = 8;

  /**
   * Build the judging context for one stored trajectory row.
   *
   * @param traj - Trajectory row as returned by the training data adapter
   * @param fallbackArchetype - Archetype used when the row has none
   * @returns Context with parsed steps and extracted metrics
   * @throws {SyntaxError} If `traj.stepsJson` is not valid JSON
   * @throws {Error} If the metrics extractor returns nothing
   */
  static #buildContext(traj, fallbackArchetype) {
    const archetype = traj.archetype || fallbackArchetype;
    const steps = JSON.parse(traj.stepsJson);
    // `??` (not `||`) so that a break-even PnL of exactly 0 is kept as a value
    // instead of being reported as missing.
    const finalPnL = traj.finalPnL ?? undefined;
    const metrics = trajectoryMetricsExtractor.extractFromRaw({
      trajectoryId: traj.trajectoryId,
      agentId: traj.agentId,
      stepsJson: traj.stepsJson,
      scenarioId: traj.scenarioId || undefined,
      finalPnL,
    });
    if (!metrics) {
      throw new Error(`Failed to extract metrics for trajectory ${traj.trajectoryId}`);
    }
    return {
      trajectoryId: traj.trajectoryId,
      agentId: traj.agentId,
      archetype,
      steps,
      metrics,
      finalPnL,
      episodeLength: traj.episodeLength,
      totalReward: traj.totalReward,
    };
  }

  /**
   * Coerce a judge-supplied score to a number and clamp it into [0, 1].
   *
   * @param rawScore - Score value taken from the parsed judge reply
   * @param label - Identifier used in the error message
   * @returns The clamped score
   * @throws {Error} If the value is missing or not numeric (would otherwise become NaN)
   */
  static #clampScore(rawScore, label) {
    const value = Number(rawScore);
    if (Number.isNaN(value)) {
      throw new Error(`Judge returned a non-numeric score for ${label}`);
    }
    return Math.max(0, Math.min(1, value));
  }

  /**
   * Send one prompt to the LLM judge and return its raw reply.
   *
   * @param system - System prompt
   * @param user - User prompt (the JSON-only instruction is appended here)
   * @param maxTokens - Token budget for the reply
   * @param actionType - Action label forwarded to the LLM caller
   * @returns Whatever the LLM caller resolves with
   */
  static #requestJudgement(system, user, maxTokens, actionType) {
    return getLLMCaller().callGroqDirect({
      prompt: `${user}${ArchetypeScoringService.#JSON_ONLY_SUFFIX}`,
      system,
      modelSize: "large",
      temperature: 0.3,
      maxTokens,
      actionType,
    });
  }

  /**
   * Score a single trajectory.
   * @param trajectoryId - ID of the trajectory to score
   * @param options - Scoring options (`archetype`, `includeActionDetails`, `saveToDatabase`)
   * @returns The score or null if trajectory not found
   * @throws {Error} If metrics cannot be extracted, the judge reply cannot be
   *   parsed, or the judge reply carries no numeric score
   */
  async scoreTrajectory(trajectoryId, options = {}) {
    const opts = { ...DEFAULT_OPTIONS, ...options };
    const traj = await getTrainingDataAdapter().getTrajectoryById(trajectoryId);
    if (!traj) {
      logger.warn("Trajectory not found", { trajectoryId }, "ArchetypeScoring");
      return null;
    }

    // The archetype stored on the trajectory wins over the caller's option.
    const context = ArchetypeScoringService.#buildContext(traj, opts.archetype || "default");
    const { system, user } = judgePromptBuilder.buildSinglePrompt(context, {
      includeActionDetails: opts.includeActionDetails,
    });

    const response = await this.callSingleJudge(system, user);
    if (!response) {
      throw new Error(`Judge returned no response for trajectory ${trajectoryId}`);
    }

    const score = {
      trajectoryId: context.trajectoryId,
      agentId: context.agentId,
      archetype: context.archetype,
      score: ArchetypeScoringService.#clampScore(response.score, `trajectory ${trajectoryId}`),
      reasoning: response.reasoning,
      strengths: response.strengths || [],
      weaknesses: response.weaknesses || [],
      metrics: context.metrics,
      scoredAt: new Date(),
    };

    if (opts.saveToDatabase) {
      await getTrainingDataAdapter().updateTrajectoryScore(trajectoryId, score.score, score.reasoning);
    }
    logger.info("Scored trajectory", {
      trajectoryId,
      archetype: score.archetype,
      score: score.score,
    }, "ArchetypeScoring");
    return score;
  }

  /**
   * Score multiple trajectories using RULER comparison.
   * @param trajectoryIds - IDs of trajectories to score
   * @param options - Scoring options (`archetype`, `saveToDatabase`)
   * @returns Array of scores; empty when fewer than `minGroupSize` trajectories
   *   were requested or found
   * @throws {Error} If metrics cannot be extracted, a judge reply cannot be
   *   parsed, or a judge reply lacks a usable score for one of the trajectories
   */
  async scoreTrajectoryGroup(trajectoryIds, options = {}) {
    const opts = { ...DEFAULT_OPTIONS, ...options };
    if (trajectoryIds.length < this.minGroupSize) {
      logger.warn("Group too small for RULER scoring", {
        size: trajectoryIds.length,
        minRequired: this.minGroupSize,
      }, "ArchetypeScoring");
      return [];
    }

    const trajResults = await getTrainingDataAdapter().getTrajectoriesByIds(trajectoryIds);
    if (trajResults.length < this.minGroupSize) {
      logger.warn("Not enough valid trajectories", {
        requested: trajectoryIds.length,
        found: trajResults.length,
      }, "ArchetypeScoring");
      return [];
    }

    const fallbackArchetype = opts.archetype || "default";
    const contexts = trajResults.map((traj) => ArchetypeScoringService.#buildContext(traj, fallbackArchetype));

    // NOTE(review): if splitIntoBatches chunks sequentially, the final batch can
    // hold fewer than minGroupSize contexts — confirm that is acceptable.
    const batches = splitIntoBatches(contexts, this.maxGroupSize);
    const scores = [];
    for (const batch of batches) {
      // The first context's archetype is what gets passed as the prompt
      // builder's second argument.
      const scenarioId = batch[0]?.archetype || "unknown";
      const { system, user } = judgePromptBuilder.buildComparisonPrompt(batch, scenarioId);
      const response = await this.callComparisonJudge(system, user);
      if (!response) {
        throw new Error("Judge returned no response for batch");
      }
      if (!Array.isArray(response.scores)) {
        throw new Error("Judge response for batch is missing a scores array");
      }

      for (const [index, ctx] of batch.entries()) {
        if (!ctx) {
          continue;
        }
        // The judge is expected to label its scores by 1-based position in the batch.
        const expectedId = `trajectory-${index + 1}`;
        const scoreData = response.scores.find((s) => s?.trajectory_id === expectedId);
        if (!scoreData) {
          throw new Error(`Missing score for ${expectedId}`);
        }
        const score = {
          trajectoryId: ctx.trajectoryId,
          agentId: ctx.agentId,
          archetype: ctx.archetype || "default",
          score: ArchetypeScoringService.#clampScore(scoreData.score, expectedId),
          reasoning: scoreData.explanation,
          strengths: [],
          weaknesses: [],
          metrics: ctx.metrics,
          scoredAt: new Date(),
        };
        scores.push(score);
        if (opts.saveToDatabase) {
          await getTrainingDataAdapter().updateTrajectoryScore(ctx.trajectoryId, score.score, score.reasoning);
        }
      }
    }

    logger.info("Scored trajectory group", {
      requested: trajectoryIds.length,
      scored: scores.length,
    }, "ArchetypeScoring");
    return scores;
  }

  /**
   * Score trajectories by archetype.
   *
   * A missing custom rubric only produces a warning; scoring still proceeds.
   *
   * @param archetype - Archetype to use for scoring
   * @param trajectoryIds - IDs to score
   * @returns Count of scored and errors, where `errors` is the number of
   *   requested IDs that did not come back with a score
   */
  async scoreByArchetype(archetype, trajectoryIds) {
    if (!hasCustomRubric(archetype)) {
      logger.warn("No custom rubric for archetype, using default", { archetype }, "ArchetypeScoring");
    }
    if (trajectoryIds.length === 0) {
      return { scored: 0, errors: 0 };
    }
    const scores = await this.scoreTrajectoryGroup(trajectoryIds, {
      archetype,
      saveToDatabase: true,
    });
    const scored = scores.length;
    return { scored, errors: trajectoryIds.length - scored };
  }

  /**
   * Score all unscored trajectories.
   * @param archetype - Default archetype to use
   * @param limit - Maximum trajectories to score
   * @returns Count of scored and errors
   */
  async scoreUnscoredTrajectories(archetype = "default", limit = 100) {
    const unscoredResult = await getTrainingDataAdapter().getUnscoredTrajectories({ limit });
    if (unscoredResult.length === 0) {
      logger.info("No unscored trajectories found", {}, "ArchetypeScoring");
      return { scored: 0, errors: 0 };
    }
    const trajectoryIds = unscoredResult.map((row) => row.trajectoryId);
    return this.scoreByArchetype(archetype, trajectoryIds);
  }

  /**
   * Score trajectories in parallel with rate limiting.
   *
   * IDs are processed in batches of `concurrency`; each batch runs with
   * `Promise.all`, and a 100 ms pause separates consecutive batches. Because
   * `Promise.all` is used, the first failing trajectory rejects the whole call.
   *
   * @param trajectoryIds - IDs to score
   * @param options - Scoring options
   * @param concurrency - Maximum concurrent calls (values below 1 or
   *   non-numeric values are treated as 1)
   * @returns Array of scores (trajectories that were not found are omitted)
   */
  async scoreTrajectoriesParallel(trajectoryIds, options = {}, concurrency = 5) {
    // Guard the batch size: a zero, negative or NaN size has no sensible meaning.
    const batchSize = Math.max(1, Math.floor(concurrency) || 1);
    const batches = splitIntoBatches(trajectoryIds, batchSize);
    logger.info("Starting parallel scoring", {
      total: trajectoryIds.length,
      batches: batches.length,
      concurrency: batchSize,
    }, "ArchetypeScoring");

    const results = [];
    for (let i = 0; i < batches.length; i++) {
      const batch = batches[i] ?? [];
      const batchResults = await Promise.all(batch.map((id) => this.scoreTrajectory(id, options)));
      // scoreTrajectory resolves with null for unknown IDs; drop those.
      results.push(...batchResults.filter(Boolean));
      if (i < batches.length - 1) {
        await new Promise((resolve) => setTimeout(resolve, 100));
      }
    }

    logger.info("Parallel scoring complete", {
      requested: trajectoryIds.length,
      scored: results.length,
    }, "ArchetypeScoring");
    return results;
  }

  /**
   * Call LLM judge for single trajectory.
   * @param system - System prompt
   * @param user - User prompt
   * @returns Parsed judge reply, or null if it could not be parsed
   */
  async callSingleJudge(system, user) {
    const reply = await ArchetypeScoringService.#requestJudgement(system, user, 1000, "archetype_score_trajectory");
    return this.parseJudgeResponse(reply);
  }

  /**
   * Call LLM judge for trajectory comparison.
   * @param system - System prompt
   * @param user - User prompt
   * @returns Parsed judge reply, or null if it could not be parsed
   */
  async callComparisonJudge(system, user) {
    const reply = await ArchetypeScoringService.#requestJudgement(system, user, 2000, "archetype_ruler_score");
    return this.parseJudgeResponse(reply);
  }

  /**
   * Parse JSON response from judge.
   *
   * Markdown code fences are stripped, then the text from the first `{` to the
   * last `}` is parsed. If that fails (for example because the judge appended
   * text containing a `}`), progressively shorter candidates ending at earlier
   * `}` characters are tried.
   *
   * @param response - Raw text returned by the LLM caller
   * @returns The parsed object, or null when the reply is not a string or
   *   contains no parseable JSON object (the failure is logged)
   */
  parseJudgeResponse(response) {
    if (typeof response !== "string") {
      logger.error("Judge response is not a string", {
        type: typeof response,
      }, "ArchetypeScoring");
      return null;
    }

    const jsonText = response
      .trim()
      .replace(/```json\n?/g, "")
      .replace(/```\n?/g, "")
      .trim();
    const start = jsonText.indexOf("{");
    let end = jsonText.lastIndexOf("}");
    if (start === -1 || end < start) {
      logger.error("No JSON found in response", {
        preview: response.substring(0, 200),
      }, "ArchetypeScoring");
      return null;
    }

    // Judge replies are bounded by maxTokens, so retrying on each earlier
    // closing brace stays cheap.
    let lastError;
    while (end > start) {
      try {
        return JSON.parse(jsonText.slice(start, end + 1));
      } catch (err) {
        lastError = err;
        end = jsonText.lastIndexOf("}", end - 1);
      }
    }
    logger.error("Invalid JSON in response", {
      preview: response.substring(0, 200),
      error: lastError?.message,
    }, "ArchetypeScoring");
    return null;
  }
}
300
- /** Shared ArchetypeScoringService instance, constructed once when this module is evaluated. */
301
- export const archetypeScoringService = new ArchetypeScoringService();