@elizaos/training 2.0.0-alpha.77 → 2.0.0-alpha.78

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. package/package.json +2 -2
  2. package/.turbo/turbo-lint.log +0 -3
  3. package/.turbo/turbo-typecheck.log +0 -1
  4. package/dist/.tsbuildinfo +0 -1
  5. package/dist/adapter.js +0 -59
  6. package/dist/archetypes/ArchetypeConfigService.js +0 -510
  7. package/dist/archetypes/derive-archetype.js +0 -196
  8. package/dist/archetypes/index.js +0 -7
  9. package/dist/benchmark/ArchetypeMatchupBenchmark.js +0 -547
  10. package/dist/benchmark/BenchmarkChartGenerator.js +0 -632
  11. package/dist/benchmark/BenchmarkDataGenerator.js +0 -825
  12. package/dist/benchmark/BenchmarkDataViewer.js +0 -197
  13. package/dist/benchmark/BenchmarkHistoryService.js +0 -135
  14. package/dist/benchmark/BenchmarkRunner.js +0 -483
  15. package/dist/benchmark/BenchmarkValidator.js +0 -158
  16. package/dist/benchmark/FastEvalRunner.js +0 -133
  17. package/dist/benchmark/MetricsValidator.js +0 -104
  18. package/dist/benchmark/MetricsVisualizer.js +0 -775
  19. package/dist/benchmark/ModelBenchmarkService.js +0 -433
  20. package/dist/benchmark/ModelRegistry.js +0 -122
  21. package/dist/benchmark/RulerBenchmarkIntegration.js +0 -168
  22. package/dist/benchmark/SimulationA2AInterface.js +0 -683
  23. package/dist/benchmark/SimulationEngine.js +0 -522
  24. package/dist/benchmark/TaskRunner.js +0 -60
  25. package/dist/benchmark/__tests__/BenchmarkRunner.test.js +0 -409
  26. package/dist/benchmark/__tests__/HeadToHead.test.js +0 -105
  27. package/dist/benchmark/index.js +0 -23
  28. package/dist/benchmark/parseSimulationMetrics.js +0 -86
  29. package/dist/benchmark/simulation-types.js +0 -1
  30. package/dist/dependencies.js +0 -197
  31. package/dist/generation/TrajectoryGenerator.js +0 -244
  32. package/dist/generation/index.js +0 -6
  33. package/dist/huggingface/HuggingFaceDatasetUploader.js +0 -463
  34. package/dist/huggingface/HuggingFaceIntegrationService.js +0 -272
  35. package/dist/huggingface/HuggingFaceModelUploader.js +0 -385
  36. package/dist/huggingface/index.js +0 -9
  37. package/dist/huggingface/shared/HuggingFaceUploadUtil.js +0 -144
  38. package/dist/index.js +0 -41
  39. package/dist/init-training.js +0 -43
  40. package/dist/metrics/TrajectoryMetricsExtractor.js +0 -523
  41. package/dist/metrics/__tests__/TrajectoryMetricsExtractor.test.js +0 -628
  42. package/dist/metrics/index.js +0 -7
  43. package/dist/metrics/types.js +0 -21
  44. package/dist/rubrics/__tests__/index.test.js +0 -150
  45. package/dist/rubrics/ass-kisser.js +0 -83
  46. package/dist/rubrics/degen.js +0 -78
  47. package/dist/rubrics/goody-twoshoes.js +0 -82
  48. package/dist/rubrics/index.js +0 -184
  49. package/dist/rubrics/information-trader.js +0 -82
  50. package/dist/rubrics/infosec.js +0 -99
  51. package/dist/rubrics/liar.js +0 -102
  52. package/dist/rubrics/perps-trader.js +0 -85
  53. package/dist/rubrics/researcher.js +0 -79
  54. package/dist/rubrics/scammer.js +0 -80
  55. package/dist/rubrics/social-butterfly.js +0 -71
  56. package/dist/rubrics/super-predictor.js +0 -95
  57. package/dist/rubrics/trader.js +0 -65
  58. package/dist/scoring/ArchetypeScoringService.js +0 -301
  59. package/dist/scoring/JudgePromptBuilder.js +0 -401
  60. package/dist/scoring/LLMJudgeCache.js +0 -263
  61. package/dist/scoring/index.js +0 -8
  62. package/dist/training/AutomationPipeline.js +0 -714
  63. package/dist/training/BenchmarkService.js +0 -370
  64. package/dist/training/ConfigValidator.js +0 -153
  65. package/dist/training/MarketOutcomesTracker.js +0 -142
  66. package/dist/training/ModelDeployer.js +0 -128
  67. package/dist/training/ModelFetcher.js +0 -48
  68. package/dist/training/ModelSelectionService.js +0 -248
  69. package/dist/training/ModelUsageVerifier.js +0 -106
  70. package/dist/training/MultiModelOrchestrator.js +0 -349
  71. package/dist/training/RLModelConfig.js +0 -295
  72. package/dist/training/RewardBackpropagationService.js +0 -117
  73. package/dist/training/RulerScoringService.js +0 -450
  74. package/dist/training/TrainingMonitor.js +0 -108
  75. package/dist/training/TrajectoryRecorder.js +0 -281
  76. package/dist/training/__tests__/TrajectoryRecorder.test.js +0 -363
  77. package/dist/training/index.js +0 -30
  78. package/dist/training/logRLConfig.js +0 -29
  79. package/dist/training/pipeline.js +0 -80
  80. package/dist/training/storage/ModelStorageService.js +0 -190
  81. package/dist/training/storage/TrainingDataArchiver.js +0 -136
  82. package/dist/training/storage/index.js +0 -7
  83. package/dist/training/types.js +0 -6
  84. package/dist/training/window-utils.js +0 -100
  85. package/dist/utils/index.js +0 -73
  86. package/dist/utils/logger.js +0 -55
  87. package/dist/utils/snowflake.js +0 -15
  88. package/dist/utils/synthetic-detector.js +0 -67
  89. package/vitest.config.ts +0 -8
@@ -1,197 +0,0 @@
1
- /**
2
- * Benchmark Data Viewer
3
- *
4
- * Provides utilities to view and inspect benchmark data.
5
- * Useful for validation and understanding benchmark structure.
6
- */
7
- import { promises as fs } from "node:fs";
8
- import * as BenchmarkValidator from "./BenchmarkValidator";
9
/**
 * Summarize the events carried by a list of ticks.
 *
 * @param {Array<{events: Array<{type: string}>}>} ticks - Ticks to inspect. Each
 *   tick is read for its `events` array and each event for its `type`.
 * @returns {{total: number, withEvents: number, eventTypes: Object<string, number>}}
 *   `total` is the number of ticks, `withEvents` the number of ticks that carry
 *   at least one event, and `eventTypes` maps each event type to how many times
 *   it occurred across all ticks.
 */
function analyzeTicks(ticks) {
    // Count in a Map instead of a plain object. With `{}`, an event type named
    // like an inherited property ("constructor", "toString", "__proto__", ...)
    // reads back a truthy inherited value, so `(value || 0) + 1` would turn the
    // counter into a string (or be silently dropped for "__proto__").
    const countsByType = new Map();
    let withEvents = 0;
    for (const tick of ticks) {
        if (tick.events.length > 0) {
            withEvents++;
        }
        for (const event of tick.events) {
            // Plain-object keys are strings, so normalize the key the same way.
            const key = String(event.type);
            countsByType.set(key, (countsByType.get(key) ?? 0) + 1);
        }
    }
    return {
        total: ticks.length,
        withEvents,
        // Object.fromEntries defines own data properties, so every key
        // (including "__proto__") ends up as a normal enumerable entry.
        eventTypes: Object.fromEntries(countsByType),
    };
}
26
/**
 * Condense a ground-truth record into counts suitable for display.
 *
 * @param {object} groundTruth - Ground-truth section of a benchmark snapshot.
 *   `marketOutcomes`, `priceHistory`, `optimalActions` and `socialOpportunities`
 *   are read unconditionally; `hiddenFacts`, `hiddenEvents` and `trueFacts` may
 *   be absent.
 * @returns {object} Counts for every section, except `trueFacts`, which is the
 *   list of its keys, and `priceHistory`, which maps each ticker to the length
 *   of its history.
 */
function analyzeGroundTruth(groundTruth) {
    const marketOutcomeCount = Object.keys(groundTruth.marketOutcomes).length;

    // One [ticker, historyLength] pair per ticker.
    const historyLengthPairs = Object.entries(groundTruth.priceHistory).map(
        (pair) => [pair[0], pair[1].length],
    );

    const optimalActionCount = groundTruth.optimalActions.length;
    const socialOpportunityCount = groundTruth.socialOpportunities.length;

    // These sections are optional: report zero / an empty list when missing.
    const hiddenFactCount = groundTruth.hiddenFacts?.length || 0;
    const hiddenEventCount = groundTruth.hiddenEvents?.length || 0;
    const trueFactKeys = Object.keys(groundTruth.trueFacts || {});

    return {
        marketOutcomes: marketOutcomeCount,
        priceHistory: Object.fromEntries(historyLengthPairs),
        optimalActions: optimalActionCount,
        socialOpportunities: socialOpportunityCount,
        hiddenFacts: hiddenFactCount,
        hiddenEvents: hiddenEventCount,
        trueFacts: trueFactKeys,
    };
}
40
/**
 * Load a benchmark snapshot file and build a summary view of it.
 *
 * The snapshot is validated with `BenchmarkValidator.validate` and the result
 * is attached to the view as `validation`. Missing top-level sections
 * (`initialState`, `ticks`, `groundTruth`) are reported as empty instead of
 * throwing, so the validation result is still returned for incomplete
 * snapshots.
 *
 * @param {string} filePath - Path of the JSON snapshot file to read.
 * @param {{showGroundTruth?: boolean, verbose?: boolean}} [options] - When
 *   either flag is set and the snapshot has a `groundTruth` section, a
 *   ground-truth summary is added to the view.
 * @returns {Promise<object>} The view: identifying fields copied from the
 *   snapshot, element counts for `initialState`, a tick summary, the validation
 *   result and, optionally, `groundTruth`.
 * @throws Rejects when the file cannot be read or does not contain valid JSON.
 */
export async function viewBenchmark(filePath, options = {}) {
    const data = await fs.readFile(filePath, "utf-8");
    const snapshot = JSON.parse(data);
    // Validate
    const validation = BenchmarkValidator.validate(snapshot);
    // Build view. Fall back to empty sections so an incomplete snapshot still
    // yields a view carrying its validation errors.
    const initialState = snapshot.initialState ?? {};
    const ticks = Array.isArray(snapshot.ticks) ? snapshot.ticks : [];
    const view = {
        id: snapshot.id,
        version: snapshot.version,
        createdAt: snapshot.createdAt,
        duration: snapshot.duration,
        tickInterval: snapshot.tickInterval,
        initialState: {
            predictionMarkets: initialState.predictionMarkets?.length ?? 0,
            perpetualMarkets: initialState.perpetualMarkets?.length ?? 0,
            agents: initialState.agents?.length ?? 0,
            posts: initialState.posts?.length ?? 0,
            groupChats: initialState.groupChats?.length ?? 0,
        },
        ticks: analyzeTicks(ticks),
        validation,
    };
    const wantsGroundTruth = options.showGroundTruth || options.verbose;
    if (wantsGroundTruth && snapshot.groundTruth) {
        view.groundTruth = analyzeGroundTruth(snapshot.groundTruth);
    }
    return view;
}
70
/**
 * Print a benchmark view to the console.
 *
 * @param {object} view - View object with `id`, `version`, `createdAt`,
 *   `duration`, `tickInterval`, `initialState`, `ticks`, `validation` and an
 *   optional `groundTruth` summary.
 * @param {{verbose?: boolean, showHidden?: boolean}} [options] - `verbose`
 *   additionally lists event types and individual validation errors/warnings;
 *   `showHidden` additionally prints hidden facts, hidden events and true facts
 *   when a ground-truth summary is present.
 * @returns {void}
 */
export function printBenchmarkView(view, options = {}) {
    console.log("\n📊 Benchmark Data View\n");
    console.log(`ID: ${view.id}`);
    console.log(`Version: ${view.version}`);
    // toISOString() throws RangeError on an invalid date, which would abort the
    // whole printout for a view whose createdAt is missing or unparsable.
    const createdAt = new Date(view.createdAt);
    const createdLabel = Number.isNaN(createdAt.getTime())
        ? `Invalid date (${String(view.createdAt)})`
        : createdAt.toISOString();
    console.log(`Created: ${createdLabel}`);
    console.log(`Duration: ${(view.duration / 60).toFixed(1)} minutes`);
    console.log(`Tick Interval: ${view.tickInterval}s`);
    console.log("\n📈 Initial State:");
    console.log(` Prediction Markets: ${view.initialState.predictionMarkets}`);
    console.log(` Perpetual Markets: ${view.initialState.perpetualMarkets}`);
    console.log(` Agents: ${view.initialState.agents}`);
    console.log(` Posts: ${view.initialState.posts}`);
    console.log(` Group Chats: ${view.initialState.groupChats}`);
    console.log("\n⏱️ Ticks:");
    console.log(` Total: ${view.ticks.total}`);
    console.log(` With Events: ${view.ticks.withEvents}`);
    if (options.verbose) {
        console.log(` Event Types:`);
        for (const [type, count] of Object.entries(view.ticks.eventTypes)) {
            console.log(` ${type}: ${count}`);
        }
    }
    if (view.groundTruth) {
        console.log("\n🎯 Ground Truth:");
        console.log(` Market Outcomes: ${view.groundTruth.marketOutcomes}`);
        console.log(` Price History:`);
        for (const [ticker, count] of Object.entries(view.groundTruth.priceHistory)) {
            console.log(` ${ticker}: ${count} ticks`);
        }
        console.log(` Optimal Actions: ${view.groundTruth.optimalActions}`);
        console.log(` Social Opportunities: ${view.groundTruth.socialOpportunities}`);
        // Hidden information is only printed on explicit request.
        if (options.showHidden) {
            console.log(` Hidden Facts: ${view.groundTruth.hiddenFacts}`);
            console.log(` Hidden Events: ${view.groundTruth.hiddenEvents}`);
            console.log(` True Facts: ${view.groundTruth.trueFacts.join(", ")}`);
        }
    }
    console.log("\n✅ Validation:");
    console.log(` Valid: ${view.validation.valid ? "✅" : "❌"}`);
    if (view.validation.errors.length > 0) {
        console.log(` Errors: ${view.validation.errors.length}`);
        if (options.verbose) {
            for (const error of view.validation.errors) {
                console.log(` ❌ ${error}`);
            }
        }
    }
    if (view.validation.warnings.length > 0) {
        console.log(` Warnings: ${view.validation.warnings.length}`);
        if (options.verbose) {
            for (const warning of view.validation.warnings) {
                console.log(` ⚠️ ${warning}`);
            }
        }
    }
    console.log("");
}
130
/**
 * Look up one tick of a snapshot by its position in `snapshot.ticks`.
 *
 * @param {{ticks: Array<object>}} snapshot - Benchmark snapshot.
 * @param {number} tickNumber - Index into `snapshot.ticks`.
 * @returns {{tick: object|null, state: *, events: Array<{type: *, data: *}>}}
 *   The tick itself, its `state`, and its events reduced to `type` and `data`.
 *   When no tick exists at that index, `tick` and `state` are `null` and
 *   `events` is empty.
 */
export function getTickDetails(snapshot, tickNumber) {
    const found = snapshot.ticks[tickNumber];
    if (found) {
        // Expose only the type and payload of each event.
        const toSummary = ({ type, data }) => ({ type, data });
        return {
            tick: found,
            state: found.state,
            events: found.events.map(toSummary),
        };
    }
    return { tick: null, state: null, events: [] };
}
147
/**
 * Collect the ground-truth entries recorded for one tick.
 *
 * @param {{groundTruth: object}} snapshot - Benchmark snapshot.
 * @param {number} tickNumber - Tick value compared (strictly) against each
 *   entry's `tick` field.
 * @returns {{hiddenFacts: Array<{fact: *, category: *}>,
 *   hiddenEvents: Array<{type: *, description: *}>, marketOutcomes: *}}
 *   Hidden facts and hidden events whose `tick` equals `tickNumber`, plus the
 *   snapshot's `marketOutcomes` as-is (not filtered by tick).
 */
export function getGroundTruthForTick(snapshot, tickNumber) {
    const { groundTruth } = snapshot;
    const belongsToTick = (entry) => entry.tick === tickNumber;

    // Both lists are optional; treat a missing list as empty.
    const factsAtTick = (groundTruth.hiddenFacts || []).filter(belongsToTick);
    const eventsAtTick = (groundTruth.hiddenEvents || []).filter(belongsToTick);

    return {
        hiddenFacts: factsAtTick.map(({ fact, category }) => ({ fact, category })),
        hiddenEvents: eventsAtTick.map(({ type, description }) => ({ type, description })),
        marketOutcomes: groundTruth.marketOutcomes,
    };
}
162
/**
 * Check whether ground-truth data has leaked into the initial game state
 * (`canAccess` should always be false).
 *
 * The check looks only at the top-level keys of `snapshot.initialState`: if any
 * of `groundTruth`, `hiddenFacts` or `hiddenEvents` is present there, the
 * snapshot is reported as leaking.
 *
 * @param {{initialState: object, groundTruth?: object}} snapshot - Benchmark
 *   snapshot to inspect.
 * @returns {{canAccess: boolean, reason: string}} `canAccess` is true only when
 *   a ground-truth key was found in the initial state; `reason` is a
 *   human-readable explanation.
 */
export function verifyAgentCannotAccessHiddenFacts(snapshot) {
    // NOTE(review): per the original author's note, agents only see game state
    // through SimulationA2AInterface and ground truth is stored separately;
    // this function only verifies the initial state of the snapshot itself.
    const forbiddenKeys = ["groundTruth", "hiddenFacts", "hiddenEvents"];
    const stateKeys = Object.keys(snapshot.initialState);
    // Check if ground truth is accidentally in state
    if (forbiddenKeys.some((key) => stateKeys.includes(key))) {
        return {
            canAccess: true,
            reason: "Ground truth found in game state (security issue!)",
        };
    }
    const groundTruth = snapshot.groundTruth;
    let reason;
    if (!groundTruth) {
        reason = "No ground truth data found";
    } else if (groundTruth.hiddenFacts?.length) {
        reason = "Ground truth exists but is properly isolated from game state";
    } else {
        // Previously reported as "No ground truth data found", which is wrong:
        // ground truth is present, it just carries no hidden facts.
        reason = "Ground truth exists (without hidden facts) and is properly isolated from game state";
    }
    return { canAccess: false, reason };
}
190
/**
 * Object-style facade that exposes this module's functions under one name:
 * `view` and `print` alias `viewBenchmark` and `printBenchmarkView`; the
 * remaining members keep their function names.
 *
 * @deprecated Use viewBenchmark, printBenchmarkView, etc. instead
 */
export const BenchmarkDataViewer = {
    view: viewBenchmark,
    print: printBenchmarkView,
    getTickDetails: getTickDetails,
    getGroundTruthForTick: getGroundTruthForTick,
    verifyAgentCannotAccessHiddenFacts: verifyAgentCannotAccessHiddenFacts,
};
@@ -1,135 +0,0 @@
1
- /**
2
- * Benchmark History Service
3
- *
4
- * Persists benchmark results to the database for historical tracking and analysis.
5
- */
6
- import { getTrainingDataAdapter, } from "../adapter";
7
- import { logger } from "../utils/logger";
8
- import { generateSnowflakeId } from "../utils/snowflake";
9
/**
 * Service for managing benchmark result history.
 *
 * All persistence goes through the adapter returned by
 * `getTrainingDataAdapter()`; this class only shapes inputs and outputs.
 */
// biome-ignore lint/complexity/noStaticOnlyClass: Service namespace - methods are logically grouped
export class BenchmarkHistoryService {
    /**
     * Save a benchmark result to the database.
     *
     * @param {object} input - Result to store: `modelId`, `benchmarkId`,
     *   `benchmarkPath`, `metrics` (with `totalPnl`, `predictionMetrics.accuracy`,
     *   `perpMetrics.winRate`, `optimalityScore`), `duration`, and an optional
     *   `baselineComparison` (`pnlDelta`, `accuracyDelta`, `improved`).
     * @returns {Promise<object>} The inserted record plus `createdAt`.
     */
    static async saveResult(input) {
        const id = await generateSnowflakeId();
        const now = new Date();
        const insertData = {
            id,
            modelId: input.modelId,
            benchmarkId: input.benchmarkId,
            benchmarkPath: input.benchmarkPath,
            runAt: now,
            totalPnl: input.metrics.totalPnl,
            predictionAccuracy: input.metrics.predictionMetrics.accuracy,
            perpWinRate: input.metrics.perpMetrics.winRate,
            optimalityScore: input.metrics.optimalityScore,
            // JSON round-trip: stores a detached, JSON-only copy of the metrics.
            detailedMetrics: JSON.parse(JSON.stringify(input.metrics)),
            // Baseline fields are stored as null when no comparison was made.
            baselinePnlDelta: input.baselineComparison?.pnlDelta ?? null,
            baselineAccuracyDelta: input.baselineComparison?.accuracyDelta ?? null,
            improved: input.baselineComparison?.improved ?? null,
            duration: input.duration,
        };
        await getTrainingDataAdapter().insertBenchmarkResult(insertData);
        logger.info("Saved benchmark result", {
            id,
            modelId: input.modelId,
            benchmarkId: input.benchmarkId,
            totalPnl: input.metrics.totalPnl,
        });
        return { ...insertData, createdAt: now };
    }
    /**
     * Get benchmark results by query.
     *
     * @param {object} [query] - Optional filters: `modelId`, `benchmarkId`,
     *   `startDate`, `endDate`, `limit` (defaults to 100). Omitting the query
     *   applies no filters.
     * @returns {Promise<Array<object>>} Matching results from the adapter.
     */
    static async getResults(query = {}) {
        return getTrainingDataAdapter().queryBenchmarkResults({
            modelId: query.modelId,
            benchmarkId: query.benchmarkId,
            startDate: query.startDate,
            endDate: query.endDate,
            limit: query.limit ?? 100,
        });
    }
    /**
     * Get the latest result for a model.
     *
     * @param {string} modelId - Model to look up.
     * @returns {Promise<object|null>} The first result returned by the adapter
     *   for this model, or null when there is none.
     */
    static async getLatestResult(modelId) {
        // NOTE(review): "latest" relies on the adapter returning newest first.
        const results = await getTrainingDataAdapter().queryBenchmarkResults({
            modelId,
            limit: 1,
        });
        return results[0] ?? null;
    }
    /**
     * Get trend data for a model.
     *
     * @param {string} modelId - Model to look up.
     * @param {number} [limit=20] - Maximum number of results to include.
     * @returns {Promise<{modelId: string, dates: Array, pnlHistory: Array,
     *   accuracyHistory: Array, optimalityHistory: Array}>} Parallel series in
     *   chronological order.
     */
    static async getTrendData(modelId, limit = 20) {
        const results = await getTrainingDataAdapter().queryBenchmarkResults({
            modelId,
            limit,
        });
        // queryBenchmarkResults returns desc by runAt, reverse for chronological.
        // Reverse a copy: Array.prototype.reverse mutates in place and the
        // array belongs to the adapter, which may hand it to other callers.
        const chronological = [...results].reverse();
        return {
            modelId,
            dates: chronological.map((r) => r.runAt),
            pnlHistory: chronological.map((r) => r.totalPnl),
            accuracyHistory: chronological.map((r) => r.predictionAccuracy),
            optimalityHistory: chronological.map((r) => r.optimalityScore),
        };
    }
    /**
     * Get comparison data for multiple models.
     *
     * @param {Array<string>} modelIds - Models to compare.
     * @param {string} benchmarkId - Benchmark the results must belong to.
     * @returns {Promise<Map<string, Array<object>>>} Up to 10 results per model,
     *   keyed by model id in the order given.
     */
    static async getModelComparison(modelIds, benchmarkId) {
        const adapter = getTrainingDataAdapter();
        // The per-model queries are independent, so issue them together.
        const resultsPerModel = await Promise.all(modelIds.map((modelId) => adapter.queryBenchmarkResults({
            modelId,
            benchmarkId,
            limit: 10,
        })));
        const comparison = new Map();
        modelIds.forEach((modelId, index) => {
            comparison.set(modelId, resultsPerModel[index]);
        });
        return comparison;
    }
    /**
     * Get summary statistics for all models.
     *
     * @returns {Promise<*>} Whatever the adapter's `getBenchmarkModelSummary`
     *   returns.
     */
    static async getModelSummary() {
        return getTrainingDataAdapter().getBenchmarkModelSummary();
    }
    /**
     * Check if a model improved vs baseline.
     *
     * Compares the `totalPnl` of the first result of each model on the given
     * benchmark.
     *
     * @param {string} modelId - Candidate model.
     * @param {string} baselineModelId - Model to compare against.
     * @param {string} benchmarkId - Benchmark both results must belong to.
     * @returns {Promise<{improved: boolean, modelPnl: number,
     *   baselinePnl: number, delta: number}|null>} Null when either model has
     *   no result; otherwise `improved` is true only for a strictly positive
     *   PnL delta.
     */
    static async checkImprovement(modelId, baselineModelId, benchmarkId) {
        const adapter = getTrainingDataAdapter();
        // The two lookups do not depend on each other, so run them together.
        const [modelResults, baselineResults] = await Promise.all([
            adapter.queryBenchmarkResults({
                modelId,
                benchmarkId,
                limit: 1,
            }),
            adapter.queryBenchmarkResults({
                modelId: baselineModelId,
                benchmarkId,
                limit: 1,
            }),
        ]);
        const modelResult = modelResults[0];
        const baselineResult = baselineResults[0];
        if (!modelResult || !baselineResult) {
            return null;
        }
        const delta = modelResult.totalPnl - baselineResult.totalPnl;
        return {
            improved: delta > 0,
            modelPnl: modelResult.totalPnl,
            baselinePnl: baselineResult.totalPnl,
            delta,
        };
    }
}