@elizaos/training 2.0.0-alpha.76 → 2.0.0-alpha.78

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. package/package.json +2 -2
  2. package/.turbo/turbo-lint.log +0 -3
  3. package/.turbo/turbo-typecheck.log +0 -1
  4. package/dist/.tsbuildinfo +0 -1
  5. package/dist/adapter.js +0 -59
  6. package/dist/archetypes/ArchetypeConfigService.js +0 -510
  7. package/dist/archetypes/derive-archetype.js +0 -196
  8. package/dist/archetypes/index.js +0 -7
  9. package/dist/benchmark/ArchetypeMatchupBenchmark.js +0 -547
  10. package/dist/benchmark/BenchmarkChartGenerator.js +0 -632
  11. package/dist/benchmark/BenchmarkDataGenerator.js +0 -825
  12. package/dist/benchmark/BenchmarkDataViewer.js +0 -197
  13. package/dist/benchmark/BenchmarkHistoryService.js +0 -135
  14. package/dist/benchmark/BenchmarkRunner.js +0 -483
  15. package/dist/benchmark/BenchmarkValidator.js +0 -158
  16. package/dist/benchmark/FastEvalRunner.js +0 -133
  17. package/dist/benchmark/MetricsValidator.js +0 -104
  18. package/dist/benchmark/MetricsVisualizer.js +0 -775
  19. package/dist/benchmark/ModelBenchmarkService.js +0 -433
  20. package/dist/benchmark/ModelRegistry.js +0 -122
  21. package/dist/benchmark/RulerBenchmarkIntegration.js +0 -168
  22. package/dist/benchmark/SimulationA2AInterface.js +0 -683
  23. package/dist/benchmark/SimulationEngine.js +0 -522
  24. package/dist/benchmark/TaskRunner.js +0 -60
  25. package/dist/benchmark/__tests__/BenchmarkRunner.test.js +0 -409
  26. package/dist/benchmark/__tests__/HeadToHead.test.js +0 -105
  27. package/dist/benchmark/index.js +0 -23
  28. package/dist/benchmark/parseSimulationMetrics.js +0 -86
  29. package/dist/benchmark/simulation-types.js +0 -1
  30. package/dist/dependencies.js +0 -197
  31. package/dist/generation/TrajectoryGenerator.js +0 -244
  32. package/dist/generation/index.js +0 -6
  33. package/dist/huggingface/HuggingFaceDatasetUploader.js +0 -463
  34. package/dist/huggingface/HuggingFaceIntegrationService.js +0 -272
  35. package/dist/huggingface/HuggingFaceModelUploader.js +0 -385
  36. package/dist/huggingface/index.js +0 -9
  37. package/dist/huggingface/shared/HuggingFaceUploadUtil.js +0 -144
  38. package/dist/index.js +0 -41
  39. package/dist/init-training.js +0 -43
  40. package/dist/metrics/TrajectoryMetricsExtractor.js +0 -523
  41. package/dist/metrics/__tests__/TrajectoryMetricsExtractor.test.js +0 -628
  42. package/dist/metrics/index.js +0 -7
  43. package/dist/metrics/types.js +0 -21
  44. package/dist/rubrics/__tests__/index.test.js +0 -150
  45. package/dist/rubrics/ass-kisser.js +0 -83
  46. package/dist/rubrics/degen.js +0 -78
  47. package/dist/rubrics/goody-twoshoes.js +0 -82
  48. package/dist/rubrics/index.js +0 -184
  49. package/dist/rubrics/information-trader.js +0 -82
  50. package/dist/rubrics/infosec.js +0 -99
  51. package/dist/rubrics/liar.js +0 -102
  52. package/dist/rubrics/perps-trader.js +0 -85
  53. package/dist/rubrics/researcher.js +0 -79
  54. package/dist/rubrics/scammer.js +0 -80
  55. package/dist/rubrics/social-butterfly.js +0 -71
  56. package/dist/rubrics/super-predictor.js +0 -95
  57. package/dist/rubrics/trader.js +0 -65
  58. package/dist/scoring/ArchetypeScoringService.js +0 -301
  59. package/dist/scoring/JudgePromptBuilder.js +0 -401
  60. package/dist/scoring/LLMJudgeCache.js +0 -263
  61. package/dist/scoring/index.js +0 -8
  62. package/dist/training/AutomationPipeline.js +0 -714
  63. package/dist/training/BenchmarkService.js +0 -370
  64. package/dist/training/ConfigValidator.js +0 -153
  65. package/dist/training/MarketOutcomesTracker.js +0 -142
  66. package/dist/training/ModelDeployer.js +0 -128
  67. package/dist/training/ModelFetcher.js +0 -48
  68. package/dist/training/ModelSelectionService.js +0 -248
  69. package/dist/training/ModelUsageVerifier.js +0 -106
  70. package/dist/training/MultiModelOrchestrator.js +0 -349
  71. package/dist/training/RLModelConfig.js +0 -295
  72. package/dist/training/RewardBackpropagationService.js +0 -117
  73. package/dist/training/RulerScoringService.js +0 -450
  74. package/dist/training/TrainingMonitor.js +0 -108
  75. package/dist/training/TrajectoryRecorder.js +0 -281
  76. package/dist/training/__tests__/TrajectoryRecorder.test.js +0 -363
  77. package/dist/training/index.js +0 -30
  78. package/dist/training/logRLConfig.js +0 -29
  79. package/dist/training/pipeline.js +0 -80
  80. package/dist/training/storage/ModelStorageService.js +0 -190
  81. package/dist/training/storage/TrainingDataArchiver.js +0 -136
  82. package/dist/training/storage/index.js +0 -7
  83. package/dist/training/types.js +0 -6
  84. package/dist/training/window-utils.js +0 -100
  85. package/dist/utils/index.js +0 -73
  86. package/dist/utils/logger.js +0 -55
  87. package/dist/utils/snowflake.js +0 -15
  88. package/dist/utils/synthetic-detector.js +0 -67
  89. package/vitest.config.ts +0 -8
@@ -1,168 +0,0 @@
1
- /**
2
- * RULER Benchmark Integration
3
- *
4
- * Provides utilities to integrate benchmark ground truth data with RULER scoring.
5
- * This allows RULER to evaluate agent trajectories against known benchmark outcomes.
6
- */
/**
 * Extract market outcomes from benchmark ground truth for RULER scoring.
 *
 * Builds two arrays from `snapshot.groundTruth`:
 * - `predictions`: one entry per key of `marketOutcomes`, with a truthy
 *   outcome mapped to "YES" and a falsy one to "NO".
 * - `stocks`: one entry per key of `priceHistory`, with the percentage
 *   change between the first and the last recorded price.
 *
 * @param snapshot - Benchmark game snapshot with ground truth data
 * @returns MarketOutcomes with stocks and predictions arrays
 *
 * @remarks
 * - A missing `marketOutcomes` or `priceHistory` map yields an empty array
 *   instead of throwing.
 * - A final price of exactly 0 is treated as a real price (a -100% move).
 *   Only a missing (null/undefined/NaN) final price falls back to the
 *   start price.
 * - An empty history, or a missing or non-positive start price, yields a
 *   change of 0 because there is no usable base for a percentage.
 *
 * @example
 * ```typescript
 * const outcomes = extractMarketOutcomesFromBenchmark(snapshot);
 * // Returns: { stocks: [...], predictions: [...] }
 * ```
 */
export function extractMarketOutcomesFromBenchmark(snapshot) {
  const gt = snapshot.groundTruth;

  // Prediction market outcomes: boolean-ish value -> "YES" / "NO".
  const predictions = Object.entries(gt.marketOutcomes ?? {}).map(([marketId, outcome]) => ({
    marketId,
    outcome: outcome ? "YES" : "NO",
  }));

  // Stock/perpetual outcomes: percentage change across the price history.
  const stocks = Object.entries(gt.priceHistory ?? {}).map(([ticker, history]) => {
    const points = history ?? [];
    const startPrice = points[0]?.price ?? 0;
    const lastPrice = points[points.length - 1]?.price;
    // Do not use `||` here: it would discard a legitimate closing price of 0.
    const endPrice = lastPrice == null || Number.isNaN(lastPrice) ? startPrice : lastPrice;
    const changePercent = startPrice > 0 ? ((endPrice - startPrice) / startPrice) * 100 : 0;
    return {
      ticker,
      changePercent,
    };
  });

  return {
    stocks,
    predictions,
  };
}
/**
 * Get hidden facts for a specific tick (for RULER analysis).
 *
 * Reads `snapshot.groundTruth.hiddenFacts` and keeps only the entries whose
 * `tick` strictly equals `tickNumber`. A missing (falsy) list is treated as
 * empty.
 *
 * @param snapshot - Benchmark game snapshot
 * @param tickNumber - Tick number to get facts for
 * @returns Array of hidden facts for that tick (empty when none match)
 */
export function getHiddenFactsForTick(snapshot, tickNumber) {
  const { hiddenFacts } = snapshot.groundTruth;
  const allFacts = hiddenFacts || [];
  return allFacts.filter((fact) => fact.tick === tickNumber);
}
/**
 * Get hidden events for a specific tick (for RULER analysis).
 *
 * Reads `snapshot.groundTruth.hiddenEvents` and keeps only the entries whose
 * `tick` strictly equals `tickNumber`. A missing (falsy) list is treated as
 * empty.
 *
 * @param snapshot - Benchmark game snapshot
 * @param tickNumber - Tick number to get events for
 * @returns Array of hidden events for that tick (empty when none match)
 */
export function getHiddenEventsForTick(snapshot, tickNumber) {
  const { hiddenEvents } = snapshot.groundTruth;
  const allEvents = hiddenEvents || [];
  return allEvents.filter((event) => event.tick === tickNumber);
}
/**
 * Check if agent decision was optimal given ground truth.
 *
 * Looks for an entry in `snapshot.groundTruth.optimalActions` with the same
 * `type` and `target` whose `tick` is at most `tickWindow` ticks away from
 * `tickNumber` (in either direction).
 *
 * @param snapshot - Benchmark game snapshot
 * @param tickNumber - Tick when action occurred
 * @param actionType - Type of action taken
 * @param target - Target of the action (market ID, ticker, etc.)
 * @param tickWindow - Maximum allowed tick distance, inclusive (default 2)
 * @returns True if action matches an optimal action within the time window
 *
 * @remarks
 * A snapshot without `optimalActions` returns false rather than throwing,
 * matching how missing hidden facts/events are treated as empty lists.
 */
export function wasDecisionOptimal(snapshot, tickNumber, actionType, target, tickWindow = 2) {
  const optimalActions = snapshot.groundTruth.optimalActions ?? [];
  // `some` stops at the first match; no need to collect every candidate.
  return optimalActions.some(
    (action) =>
      Math.abs(action.tick - tickNumber) <= tickWindow &&
      action.type === actionType &&
      action.target === target,
  );
}
/**
 * Get true facts about the world state (for RULER context).
 *
 * Returns `snapshot.groundTruth.trueFacts` as-is (the same object reference)
 * when it is set, and a fresh empty object when it is missing or falsy.
 *
 * @param snapshot - Benchmark game snapshot
 * @returns Object containing true facts about the world state
 */
export function getTrueFacts(snapshot) {
  const { trueFacts } = snapshot.groundTruth;
  if (trueFacts) {
    return trueFacts;
  }
  return {};
}
/**
 * Create RULER evaluation context from benchmark.
 *
 * Aggregates the snapshot's ground truth into a single object with the keys
 * `marketOutcomes`, `trueFacts`, `hiddenFacts`, `hiddenEvents` and
 * `optimalActions`.
 *
 * @param snapshot - Benchmark game snapshot
 * @returns Complete RULER evaluation context with all ground truth data
 *
 * @remarks
 * - `marketOutcomes` comes from `extractMarketOutcomesFromBenchmark` and
 *   `trueFacts` from `getTrueFacts`.
 * - The three list fields are always arrays: a missing list in the ground
 *   truth becomes `[]`, so consumers can iterate without null checks.
 * - The arrays and `trueFacts` are passed through by reference, not copied.
 */
export function createRulerContext(snapshot) {
  const { hiddenFacts, hiddenEvents, optimalActions } = snapshot.groundTruth;
  return {
    marketOutcomes: extractMarketOutcomesFromBenchmark(snapshot),
    trueFacts: getTrueFacts(snapshot),
    hiddenFacts: hiddenFacts || [],
    hiddenEvents: hiddenEvents || [],
    // Defaulted like its two siblings above; previously could be undefined.
    optimalActions: optimalActions || [],
  };
}
/**
 * Score agent action against ground truth.
 *
 * Grades a single action in three tiers, checked in order.
 *
 * @param snapshot - Benchmark game snapshot
 * @param tickNumber - Tick when action occurred
 * @param actionType - Type of action taken
 * @param target - Target of the action (market ID, ticker, etc.)
 * @returns Score from 0-1 (1.0 = optimal, 0.5 = reasonable, 0.0 = poor)
 *
 * @remarks
 * - 1.0 when `wasDecisionOptimal` reports a match for the action.
 * - 0.5 when at least one hidden fact recorded at exactly `tickNumber` has
 *   an object `value` whose `marketId` equals `target`.
 * - 0.0 otherwise.
 */
export function scoreActionAgainstGroundTruth(snapshot, tickNumber, actionType, target) {
  // Tier 1: the action matches an optimal action.
  if (wasDecisionOptimal(snapshot, tickNumber, actionType, target)) {
    return 1.0;
  }

  // Tier 2: a hidden fact at this tick points at the same market.
  const pointsAtTarget = (fact) => {
    const payload = fact.value;
    return (
      Boolean(payload) &&
      typeof payload === "object" &&
      "marketId" in payload &&
      payload.marketId === target
    );
  };
  const factsAtTick = getHiddenFactsForTick(snapshot, tickNumber);
  const hasRelevantFact = factsAtTick.some(pointsAtTarget);

  // Tier 3: no credit when neither check matched.
  return hasRelevantFact ? 0.5 : 0.0;
}