@elizaos/training 2.0.0-alpha.13 → 2.0.0-alpha.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/research-output/training-runs/training-run-1773726941205.json +38 -0
- package/scripts/rank_trajectories.ts +0 -1
- package/scripts/run_task_benchmark.ts +4 -11
- package/src/adapter.ts +96 -49
- package/src/archetypes/ArchetypeConfigService.ts +188 -185
- package/src/archetypes/derive-archetype.ts +47 -47
- package/src/archetypes/index.ts +2 -2
- package/src/benchmark/ArchetypeMatchupBenchmark.ts +70 -70
- package/src/benchmark/BenchmarkChartGenerator.ts +70 -69
- package/src/benchmark/BenchmarkDataGenerator.ts +136 -136
- package/src/benchmark/BenchmarkDataViewer.ts +32 -30
- package/src/benchmark/BenchmarkHistoryService.ts +13 -12
- package/src/benchmark/BenchmarkRunner.ts +87 -83
- package/src/benchmark/BenchmarkValidator.ts +48 -46
- package/src/benchmark/FastEvalRunner.ts +17 -16
- package/src/benchmark/MetricsValidator.ts +20 -21
- package/src/benchmark/MetricsVisualizer.ts +92 -85
- package/src/benchmark/ModelBenchmarkService.ts +90 -82
- package/src/benchmark/ModelRegistry.ts +44 -44
- package/src/benchmark/RulerBenchmarkIntegration.ts +24 -24
- package/src/benchmark/SimulationA2AInterface.ts +118 -118
- package/src/benchmark/SimulationEngine.ts +51 -51
- package/src/benchmark/TaskRunner.ts +87 -79
- package/src/benchmark/__tests__/BenchmarkRunner.test.ts +80 -80
- package/src/benchmark/__tests__/HeadToHead.test.ts +26 -26
- package/src/benchmark/index.ts +27 -27
- package/src/benchmark/parseSimulationMetrics.ts +32 -32
- package/src/benchmark/simulation-types.ts +10 -10
- package/src/dependencies.ts +34 -34
- package/src/generation/TrajectoryGenerator.ts +39 -37
- package/src/generation/index.ts +1 -1
- package/src/huggingface/HuggingFaceDatasetUploader.ts +72 -72
- package/src/huggingface/HuggingFaceIntegrationService.ts +59 -53
- package/src/huggingface/HuggingFaceModelUploader.ts +60 -59
- package/src/huggingface/index.ts +6 -6
- package/src/huggingface/shared/HuggingFaceUploadUtil.ts +32 -32
- package/src/index.ts +27 -27
- package/src/init-training.ts +6 -6
- package/src/metrics/TrajectoryMetricsExtractor.ts +70 -71
- package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +182 -182
- package/src/metrics/index.ts +2 -2
- package/src/rubrics/__tests__/index.test.ts +73 -73
- package/src/rubrics/ass-kisser.ts +6 -6
- package/src/rubrics/degen.ts +6 -6
- package/src/rubrics/goody-twoshoes.ts +6 -6
- package/src/rubrics/index.ts +50 -50
- package/src/rubrics/information-trader.ts +6 -6
- package/src/rubrics/infosec.ts +6 -6
- package/src/rubrics/liar.ts +6 -6
- package/src/rubrics/perps-trader.ts +6 -6
- package/src/rubrics/researcher.ts +6 -6
- package/src/rubrics/scammer.ts +6 -6
- package/src/rubrics/social-butterfly.ts +7 -7
- package/src/rubrics/super-predictor.ts +6 -6
- package/src/rubrics/trader.ts +5 -5
- package/src/scoring/ArchetypeScoringService.ts +56 -54
- package/src/scoring/JudgePromptBuilder.ts +96 -96
- package/src/scoring/LLMJudgeCache.ts +26 -23
- package/src/scoring/index.ts +3 -3
- package/src/training/AutomationPipeline.ts +149 -140
- package/src/training/BenchmarkService.ts +49 -45
- package/src/training/ConfigValidator.ts +38 -32
- package/src/training/MarketOutcomesTracker.ts +22 -12
- package/src/training/ModelDeployer.ts +15 -15
- package/src/training/ModelFetcher.ts +7 -7
- package/src/training/ModelSelectionService.ts +32 -32
- package/src/training/ModelUsageVerifier.ts +31 -24
- package/src/training/MultiModelOrchestrator.ts +44 -44
- package/src/training/RLModelConfig.ts +57 -57
- package/src/training/RewardBackpropagationService.ts +18 -17
- package/src/training/RulerScoringService.ts +73 -72
- package/src/training/TrainingMonitor.ts +29 -29
- package/src/training/TrajectoryRecorder.ts +25 -27
- package/src/training/__tests__/TrajectoryRecorder.test.ts +105 -105
- package/src/training/index.ts +36 -36
- package/src/training/logRLConfig.ts +7 -7
- package/src/training/pipeline.ts +13 -16
- package/src/training/storage/ModelStorageService.ts +32 -32
- package/src/training/storage/TrainingDataArchiver.ts +21 -21
- package/src/training/storage/index.ts +2 -2
- package/src/training/types.ts +6 -6
- package/src/training/window-utils.ts +14 -14
- package/src/utils/index.ts +7 -7
- package/src/utils/logger.ts +5 -5
- package/src/utils/snowflake.ts +1 -1
- package/src/utils/synthetic-detector.ts +7 -7
|
@@ -5,11 +5,11 @@
|
|
|
5
5
|
* This allows RULER to evaluate agent trajectories against known benchmark outcomes.
|
|
6
6
|
*/
|
|
7
7
|
|
|
8
|
-
import type { MarketOutcomes } from '../training/RulerScoringService';
|
|
8
|
+
import type { MarketOutcomes } from "../training/RulerScoringService";
|
|
9
9
|
import type {
|
|
10
10
|
BenchmarkGameSnapshot,
|
|
11
11
|
GroundTruth,
|
|
12
|
-
} from './BenchmarkDataGenerator';
|
|
12
|
+
} from "./BenchmarkDataGenerator";
|
|
13
13
|
|
|
14
14
|
/**
|
|
15
15
|
* Extract market outcomes from benchmark ground truth for RULER scoring
|
|
@@ -28,15 +28,15 @@ import type {
|
|
|
28
28
|
* ```
|
|
29
29
|
*/
|
|
30
30
|
export function extractMarketOutcomesFromBenchmark(
|
|
31
|
-
snapshot: BenchmarkGameSnapshot
|
|
31
|
+
snapshot: BenchmarkGameSnapshot,
|
|
32
32
|
): MarketOutcomes {
|
|
33
33
|
const gt = snapshot.groundTruth;
|
|
34
34
|
|
|
35
35
|
// Extract prediction market outcomes
|
|
36
|
-
const predictions: Array<{ marketId: string; outcome: 'YES' | 'NO' }> =
|
|
36
|
+
const predictions: Array<{ marketId: string; outcome: "YES" | "NO" }> =
|
|
37
37
|
Object.entries(gt.marketOutcomes).map(([marketId, outcome]) => ({
|
|
38
38
|
marketId,
|
|
39
|
-
outcome: outcome ? 'YES' : 'NO',
|
|
39
|
+
outcome: outcome ? "YES" : "NO",
|
|
40
40
|
}));
|
|
41
41
|
|
|
42
42
|
// Extract stock/perpetual outcomes from price history
|
|
@@ -77,10 +77,10 @@ export function extractMarketOutcomesFromBenchmark(
|
|
|
77
77
|
*/
|
|
78
78
|
export function getHiddenFactsForTick(
|
|
79
79
|
snapshot: BenchmarkGameSnapshot,
|
|
80
|
-
tickNumber: number
|
|
81
|
-
): GroundTruth['hiddenFacts'] {
|
|
80
|
+
tickNumber: number,
|
|
81
|
+
): GroundTruth["hiddenFacts"] {
|
|
82
82
|
return (snapshot.groundTruth.hiddenFacts || []).filter(
|
|
83
|
-
(f) => f.tick === tickNumber
|
|
83
|
+
(f) => f.tick === tickNumber,
|
|
84
84
|
);
|
|
85
85
|
}
|
|
86
86
|
|
|
@@ -96,10 +96,10 @@ export function getHiddenFactsForTick(
|
|
|
96
96
|
*/
|
|
97
97
|
export function getHiddenEventsForTick(
|
|
98
98
|
snapshot: BenchmarkGameSnapshot,
|
|
99
|
-
tickNumber: number
|
|
100
|
-
): GroundTruth['hiddenEvents'] {
|
|
99
|
+
tickNumber: number,
|
|
100
|
+
): GroundTruth["hiddenEvents"] {
|
|
101
101
|
return (snapshot.groundTruth.hiddenEvents || []).filter(
|
|
102
|
-
(e) => e.tick === tickNumber
|
|
102
|
+
(e) => e.tick === tickNumber,
|
|
103
103
|
);
|
|
104
104
|
}
|
|
105
105
|
|
|
@@ -119,7 +119,7 @@ export function wasDecisionOptimal(
|
|
|
119
119
|
snapshot: BenchmarkGameSnapshot,
|
|
120
120
|
tickNumber: number,
|
|
121
121
|
actionType: string,
|
|
122
|
-
target: string
|
|
122
|
+
target: string,
|
|
123
123
|
): boolean {
|
|
124
124
|
const optimalActions = snapshot.groundTruth.optimalActions;
|
|
125
125
|
|
|
@@ -129,7 +129,7 @@ export function wasDecisionOptimal(
|
|
|
129
129
|
(a) =>
|
|
130
130
|
Math.abs(a.tick - tickNumber) <= window &&
|
|
131
131
|
a.type === actionType &&
|
|
132
|
-
a.target === target
|
|
132
|
+
a.target === target,
|
|
133
133
|
);
|
|
134
134
|
|
|
135
135
|
return relevantActions.length > 0;
|
|
@@ -145,8 +145,8 @@ export function wasDecisionOptimal(
|
|
|
145
145
|
* @returns Object containing true facts about the world state
|
|
146
146
|
*/
|
|
147
147
|
export function getTrueFacts(
|
|
148
|
-
snapshot: BenchmarkGameSnapshot
|
|
149
|
-
): GroundTruth['trueFacts'] {
|
|
148
|
+
snapshot: BenchmarkGameSnapshot,
|
|
149
|
+
): GroundTruth["trueFacts"] {
|
|
150
150
|
return snapshot.groundTruth.trueFacts || {};
|
|
151
151
|
}
|
|
152
152
|
|
|
@@ -166,10 +166,10 @@ export function getTrueFacts(
|
|
|
166
166
|
*/
|
|
167
167
|
export function createRulerContext(snapshot: BenchmarkGameSnapshot): {
|
|
168
168
|
marketOutcomes: MarketOutcomes;
|
|
169
|
-
trueFacts: GroundTruth['trueFacts'];
|
|
170
|
-
hiddenFacts: GroundTruth['hiddenFacts'];
|
|
171
|
-
hiddenEvents: GroundTruth['hiddenEvents'];
|
|
172
|
-
optimalActions: GroundTruth['optimalActions'];
|
|
169
|
+
trueFacts: GroundTruth["trueFacts"];
|
|
170
|
+
hiddenFacts: GroundTruth["hiddenFacts"];
|
|
171
|
+
hiddenEvents: GroundTruth["hiddenEvents"];
|
|
172
|
+
optimalActions: GroundTruth["optimalActions"];
|
|
173
173
|
} {
|
|
174
174
|
return {
|
|
175
175
|
marketOutcomes: extractMarketOutcomesFromBenchmark(snapshot),
|
|
@@ -201,14 +201,14 @@ export function scoreActionAgainstGroundTruth(
|
|
|
201
201
|
snapshot: BenchmarkGameSnapshot,
|
|
202
202
|
tickNumber: number,
|
|
203
203
|
actionType: string,
|
|
204
|
-
target: string
|
|
204
|
+
target: string,
|
|
205
205
|
): number {
|
|
206
206
|
// Check if action was optimal
|
|
207
207
|
const wasOptimal = wasDecisionOptimal(
|
|
208
208
|
snapshot,
|
|
209
209
|
tickNumber,
|
|
210
210
|
actionType,
|
|
211
|
-
target
|
|
211
|
+
target,
|
|
212
212
|
);
|
|
213
213
|
|
|
214
214
|
if (wasOptimal) {
|
|
@@ -220,9 +220,9 @@ export function scoreActionAgainstGroundTruth(
|
|
|
220
220
|
const relevantFacts = hiddenFacts.filter(
|
|
221
221
|
(f) =>
|
|
222
222
|
f.value &&
|
|
223
|
-
typeof f.value === 'object' &&
|
|
224
|
-
'marketId' in f.value &&
|
|
225
|
-
(f.value as { marketId: string }).marketId === target
|
|
223
|
+
typeof f.value === "object" &&
|
|
224
|
+
"marketId" in f.value &&
|
|
225
|
+
(f.value as { marketId: string }).marketId === target,
|
|
226
226
|
);
|
|
227
227
|
|
|
228
228
|
if (relevantFacts.length > 0) {
|