@elizaos/training 2.0.0-alpha.13 → 2.0.0-alpha.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/research-output/training-runs/training-run-1773726941205.json +38 -0
- package/scripts/rank_trajectories.ts +0 -1
- package/scripts/run_task_benchmark.ts +4 -11
- package/src/adapter.ts +96 -49
- package/src/archetypes/ArchetypeConfigService.ts +188 -185
- package/src/archetypes/derive-archetype.ts +47 -47
- package/src/archetypes/index.ts +2 -2
- package/src/benchmark/ArchetypeMatchupBenchmark.ts +70 -70
- package/src/benchmark/BenchmarkChartGenerator.ts +70 -69
- package/src/benchmark/BenchmarkDataGenerator.ts +136 -136
- package/src/benchmark/BenchmarkDataViewer.ts +32 -30
- package/src/benchmark/BenchmarkHistoryService.ts +13 -12
- package/src/benchmark/BenchmarkRunner.ts +87 -83
- package/src/benchmark/BenchmarkValidator.ts +48 -46
- package/src/benchmark/FastEvalRunner.ts +17 -16
- package/src/benchmark/MetricsValidator.ts +20 -21
- package/src/benchmark/MetricsVisualizer.ts +92 -85
- package/src/benchmark/ModelBenchmarkService.ts +90 -82
- package/src/benchmark/ModelRegistry.ts +44 -44
- package/src/benchmark/RulerBenchmarkIntegration.ts +24 -24
- package/src/benchmark/SimulationA2AInterface.ts +118 -118
- package/src/benchmark/SimulationEngine.ts +51 -51
- package/src/benchmark/TaskRunner.ts +87 -79
- package/src/benchmark/__tests__/BenchmarkRunner.test.ts +80 -80
- package/src/benchmark/__tests__/HeadToHead.test.ts +26 -26
- package/src/benchmark/index.ts +27 -27
- package/src/benchmark/parseSimulationMetrics.ts +32 -32
- package/src/benchmark/simulation-types.ts +10 -10
- package/src/dependencies.ts +34 -34
- package/src/generation/TrajectoryGenerator.ts +39 -37
- package/src/generation/index.ts +1 -1
- package/src/huggingface/HuggingFaceDatasetUploader.ts +72 -72
- package/src/huggingface/HuggingFaceIntegrationService.ts +59 -53
- package/src/huggingface/HuggingFaceModelUploader.ts +60 -59
- package/src/huggingface/index.ts +6 -6
- package/src/huggingface/shared/HuggingFaceUploadUtil.ts +32 -32
- package/src/index.ts +27 -27
- package/src/init-training.ts +6 -6
- package/src/metrics/TrajectoryMetricsExtractor.ts +70 -71
- package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +182 -182
- package/src/metrics/index.ts +2 -2
- package/src/rubrics/__tests__/index.test.ts +73 -73
- package/src/rubrics/ass-kisser.ts +6 -6
- package/src/rubrics/degen.ts +6 -6
- package/src/rubrics/goody-twoshoes.ts +6 -6
- package/src/rubrics/index.ts +50 -50
- package/src/rubrics/information-trader.ts +6 -6
- package/src/rubrics/infosec.ts +6 -6
- package/src/rubrics/liar.ts +6 -6
- package/src/rubrics/perps-trader.ts +6 -6
- package/src/rubrics/researcher.ts +6 -6
- package/src/rubrics/scammer.ts +6 -6
- package/src/rubrics/social-butterfly.ts +7 -7
- package/src/rubrics/super-predictor.ts +6 -6
- package/src/rubrics/trader.ts +5 -5
- package/src/scoring/ArchetypeScoringService.ts +56 -54
- package/src/scoring/JudgePromptBuilder.ts +96 -96
- package/src/scoring/LLMJudgeCache.ts +26 -23
- package/src/scoring/index.ts +3 -3
- package/src/training/AutomationPipeline.ts +149 -140
- package/src/training/BenchmarkService.ts +49 -45
- package/src/training/ConfigValidator.ts +38 -32
- package/src/training/MarketOutcomesTracker.ts +22 -12
- package/src/training/ModelDeployer.ts +15 -15
- package/src/training/ModelFetcher.ts +7 -7
- package/src/training/ModelSelectionService.ts +32 -32
- package/src/training/ModelUsageVerifier.ts +31 -24
- package/src/training/MultiModelOrchestrator.ts +44 -44
- package/src/training/RLModelConfig.ts +57 -57
- package/src/training/RewardBackpropagationService.ts +18 -17
- package/src/training/RulerScoringService.ts +73 -72
- package/src/training/TrainingMonitor.ts +29 -29
- package/src/training/TrajectoryRecorder.ts +25 -27
- package/src/training/__tests__/TrajectoryRecorder.test.ts +105 -105
- package/src/training/index.ts +36 -36
- package/src/training/logRLConfig.ts +7 -7
- package/src/training/pipeline.ts +13 -16
- package/src/training/storage/ModelStorageService.ts +32 -32
- package/src/training/storage/TrainingDataArchiver.ts +21 -21
- package/src/training/storage/index.ts +2 -2
- package/src/training/types.ts +6 -6
- package/src/training/window-utils.ts +14 -14
- package/src/utils/index.ts +7 -7
- package/src/utils/logger.ts +5 -5
- package/src/utils/snowflake.ts +1 -1
- package/src/utils/synthetic-detector.ts +7 -7
|
@@ -5,9 +5,9 @@
|
|
|
5
5
|
* and contains all required fields.
|
|
6
6
|
*/
|
|
7
7
|
|
|
8
|
-
import type { JsonValue } from
|
|
9
|
-
import { logger } from
|
|
10
|
-
import type { BenchmarkGameSnapshot } from
|
|
8
|
+
import type { JsonValue } from "../adapter";
|
|
9
|
+
import { logger } from "../utils/logger";
|
|
10
|
+
import type { BenchmarkGameSnapshot } from "./BenchmarkDataGenerator";
|
|
11
11
|
|
|
12
12
|
export interface BenchmarkValidationResult {
|
|
13
13
|
valid: boolean;
|
|
@@ -23,58 +23,58 @@ export function validate(snapshot: unknown): BenchmarkValidationResult {
|
|
|
23
23
|
const warnings: string[] = [];
|
|
24
24
|
|
|
25
25
|
// 1. Check required top-level fields
|
|
26
|
-
if (!snapshot || typeof snapshot !==
|
|
27
|
-
errors.push(
|
|
26
|
+
if (!snapshot || typeof snapshot !== "object") {
|
|
27
|
+
errors.push("Snapshot is null, undefined, or not an object");
|
|
28
28
|
return { valid: false, errors, warnings };
|
|
29
29
|
}
|
|
30
30
|
|
|
31
31
|
const snap = snapshot as Record<string, JsonValue>;
|
|
32
32
|
|
|
33
|
-
if (!snap.id) errors.push(
|
|
34
|
-
if (!snap.version) errors.push(
|
|
35
|
-
if (typeof snap.duration !==
|
|
36
|
-
errors.push(
|
|
37
|
-
if (typeof snap.tickInterval !==
|
|
38
|
-
errors.push(
|
|
39
|
-
if (!snap.initialState) errors.push(
|
|
33
|
+
if (!snap.id) errors.push("Missing required field: id");
|
|
34
|
+
if (!snap.version) errors.push("Missing required field: version");
|
|
35
|
+
if (typeof snap.duration !== "number")
|
|
36
|
+
errors.push("Missing or invalid field: duration");
|
|
37
|
+
if (typeof snap.tickInterval !== "number")
|
|
38
|
+
errors.push("Missing or invalid field: tickInterval");
|
|
39
|
+
if (!snap.initialState) errors.push("Missing required field: initialState");
|
|
40
40
|
if (!Array.isArray(snap.ticks))
|
|
41
|
-
errors.push(
|
|
42
|
-
if (!snap.groundTruth) errors.push(
|
|
41
|
+
errors.push("Missing or invalid field: ticks (must be array)");
|
|
42
|
+
if (!snap.groundTruth) errors.push("Missing required field: groundTruth");
|
|
43
43
|
|
|
44
44
|
// 2. Validate initial state
|
|
45
|
-
if (snap.initialState && typeof snap.initialState ===
|
|
45
|
+
if (snap.initialState && typeof snap.initialState === "object") {
|
|
46
46
|
const state = snap.initialState as Record<string, JsonValue>;
|
|
47
47
|
|
|
48
|
-
if (typeof state.tick !==
|
|
49
|
-
errors.push(
|
|
50
|
-
if (state.tick !== 0) warnings.push(
|
|
48
|
+
if (typeof state.tick !== "number")
|
|
49
|
+
errors.push("initialState.tick must be a number");
|
|
50
|
+
if (state.tick !== 0) warnings.push("initialState.tick should be 0");
|
|
51
51
|
|
|
52
52
|
if (!Array.isArray(state.predictionMarkets)) {
|
|
53
|
-
errors.push(
|
|
53
|
+
errors.push("initialState.predictionMarkets must be an array");
|
|
54
54
|
}
|
|
55
55
|
|
|
56
56
|
if (!Array.isArray(state.perpetualMarkets)) {
|
|
57
|
-
errors.push(
|
|
57
|
+
errors.push("initialState.perpetualMarkets must be an array");
|
|
58
58
|
}
|
|
59
59
|
|
|
60
60
|
if (!Array.isArray(state.agents)) {
|
|
61
|
-
errors.push(
|
|
61
|
+
errors.push("initialState.agents must be an array");
|
|
62
62
|
}
|
|
63
63
|
}
|
|
64
64
|
|
|
65
65
|
// 3. Validate ticks
|
|
66
66
|
if (Array.isArray(snap.ticks)) {
|
|
67
67
|
if (snap.ticks.length === 0) {
|
|
68
|
-
warnings.push(
|
|
68
|
+
warnings.push("Ticks array is empty");
|
|
69
69
|
}
|
|
70
70
|
|
|
71
71
|
snap.ticks.forEach((tick: JsonValue, index: number) => {
|
|
72
|
-
if (!tick || typeof tick !==
|
|
72
|
+
if (!tick || typeof tick !== "object") {
|
|
73
73
|
errors.push(`Tick ${index}: invalid tick object`);
|
|
74
74
|
return;
|
|
75
75
|
}
|
|
76
76
|
const tickObj = tick as Record<string, JsonValue>;
|
|
77
|
-
if (typeof tickObj.number !==
|
|
77
|
+
if (typeof tickObj.number !== "number") {
|
|
78
78
|
errors.push(`Tick ${index}: missing or invalid 'number' field`);
|
|
79
79
|
}
|
|
80
80
|
|
|
@@ -90,51 +90,51 @@ export function validate(snapshot: unknown): BenchmarkValidationResult {
|
|
|
90
90
|
// Check tick numbering is sequential
|
|
91
91
|
for (let i = 0; i < snap.ticks.length; i++) {
|
|
92
92
|
const tick = snap.ticks[i] as Record<string, JsonValue> | undefined;
|
|
93
|
-
if (tick && typeof tick.number ===
|
|
93
|
+
if (tick && typeof tick.number === "number" && tick.number !== i) {
|
|
94
94
|
warnings.push(`Tick ${i}: number ${tick.number} doesn't match index`);
|
|
95
95
|
}
|
|
96
96
|
}
|
|
97
97
|
}
|
|
98
98
|
|
|
99
99
|
// 4. Validate ground truth
|
|
100
|
-
if (snap.groundTruth && typeof snap.groundTruth ===
|
|
100
|
+
if (snap.groundTruth && typeof snap.groundTruth === "object") {
|
|
101
101
|
const gt = snap.groundTruth as Record<string, JsonValue>;
|
|
102
102
|
|
|
103
|
-
if (!gt.marketOutcomes || typeof gt.marketOutcomes !==
|
|
104
|
-
errors.push(
|
|
103
|
+
if (!gt.marketOutcomes || typeof gt.marketOutcomes !== "object") {
|
|
104
|
+
errors.push("groundTruth.marketOutcomes must be an object");
|
|
105
105
|
}
|
|
106
106
|
|
|
107
|
-
if (!gt.priceHistory || typeof gt.priceHistory !==
|
|
108
|
-
errors.push(
|
|
107
|
+
if (!gt.priceHistory || typeof gt.priceHistory !== "object") {
|
|
108
|
+
errors.push("groundTruth.priceHistory must be an object");
|
|
109
109
|
}
|
|
110
110
|
|
|
111
111
|
if (!Array.isArray(gt.optimalActions)) {
|
|
112
|
-
errors.push(
|
|
112
|
+
errors.push("groundTruth.optimalActions must be an array");
|
|
113
113
|
}
|
|
114
114
|
|
|
115
115
|
if (!Array.isArray(gt.socialOpportunities)) {
|
|
116
|
-
errors.push(
|
|
116
|
+
errors.push("groundTruth.socialOpportunities must be an array");
|
|
117
117
|
}
|
|
118
118
|
|
|
119
119
|
if (!Array.isArray(gt.hiddenFacts)) {
|
|
120
|
-
errors.push(
|
|
120
|
+
errors.push("groundTruth.hiddenFacts must be an array");
|
|
121
121
|
}
|
|
122
122
|
|
|
123
123
|
if (!Array.isArray(gt.hiddenEvents)) {
|
|
124
|
-
errors.push(
|
|
124
|
+
errors.push("groundTruth.hiddenEvents must be an array");
|
|
125
125
|
}
|
|
126
126
|
|
|
127
|
-
if (!gt.trueFacts || typeof gt.trueFacts !==
|
|
128
|
-
errors.push(
|
|
127
|
+
if (!gt.trueFacts || typeof gt.trueFacts !== "object") {
|
|
128
|
+
errors.push("groundTruth.trueFacts must be an object");
|
|
129
129
|
}
|
|
130
130
|
}
|
|
131
131
|
|
|
132
132
|
// 5. Cross-validate: markets in initialState should have outcomes in groundTruth
|
|
133
133
|
if (
|
|
134
134
|
snap.initialState &&
|
|
135
|
-
typeof snap.initialState ===
|
|
135
|
+
typeof snap.initialState === "object" &&
|
|
136
136
|
snap.groundTruth &&
|
|
137
|
-
typeof snap.groundTruth ===
|
|
137
|
+
typeof snap.groundTruth === "object"
|
|
138
138
|
) {
|
|
139
139
|
const initialState = snap.initialState as Record<string, JsonValue>;
|
|
140
140
|
const groundTruth = snap.groundTruth as Record<string, JsonValue>;
|
|
@@ -145,7 +145,7 @@ export function validate(snapshot: unknown): BenchmarkValidationResult {
|
|
|
145
145
|
) as Array<Record<string, JsonValue>>;
|
|
146
146
|
const outcomes = (
|
|
147
147
|
groundTruth.marketOutcomes &&
|
|
148
|
-
typeof groundTruth.marketOutcomes ===
|
|
148
|
+
typeof groundTruth.marketOutcomes === "object"
|
|
149
149
|
? groundTruth.marketOutcomes
|
|
150
150
|
: {}
|
|
151
151
|
) as Record<string, JsonValue>;
|
|
@@ -153,17 +153,17 @@ export function validate(snapshot: unknown): BenchmarkValidationResult {
|
|
|
153
153
|
markets.forEach((market) => {
|
|
154
154
|
if (
|
|
155
155
|
market.id &&
|
|
156
|
-
typeof market.id ===
|
|
156
|
+
typeof market.id === "string" &&
|
|
157
157
|
!(market.id in outcomes)
|
|
158
158
|
) {
|
|
159
159
|
warnings.push(
|
|
160
|
-
`Market ${market.id} in initialState but no outcome in groundTruth
|
|
160
|
+
`Market ${market.id} in initialState but no outcome in groundTruth`,
|
|
161
161
|
);
|
|
162
162
|
}
|
|
163
163
|
});
|
|
164
164
|
}
|
|
165
165
|
|
|
166
|
-
logger.info(
|
|
166
|
+
logger.info("Benchmark validation complete", {
|
|
167
167
|
valid: errors.length === 0,
|
|
168
168
|
errors: errors.length,
|
|
169
169
|
warnings: warnings.length,
|
|
@@ -179,8 +179,10 @@ export function validate(snapshot: unknown): BenchmarkValidationResult {
|
|
|
179
179
|
/**
|
|
180
180
|
* Quick sanity check (fast, minimal validation)
|
|
181
181
|
*/
|
|
182
|
-
export function sanityCheck(
|
|
183
|
-
|
|
182
|
+
export function sanityCheck(
|
|
183
|
+
snapshot: unknown,
|
|
184
|
+
): snapshot is BenchmarkGameSnapshot {
|
|
185
|
+
if (!snapshot || typeof snapshot !== "object") return false;
|
|
184
186
|
const snap = snapshot as Record<string, JsonValue>;
|
|
185
187
|
return !!(
|
|
186
188
|
snap.id &&
|
|
@@ -194,11 +196,11 @@ export function sanityCheck(snapshot: unknown): snapshot is BenchmarkGameSnapsho
|
|
|
194
196
|
* Validate and throw if invalid
|
|
195
197
|
*/
|
|
196
198
|
export function validateOrThrow(
|
|
197
|
-
snapshot: unknown
|
|
199
|
+
snapshot: unknown,
|
|
198
200
|
): asserts snapshot is BenchmarkGameSnapshot {
|
|
199
201
|
const result = validate(snapshot);
|
|
200
202
|
|
|
201
203
|
if (!result.valid) {
|
|
202
|
-
throw new Error(`Invalid benchmark data: ${result.errors.join(
|
|
204
|
+
throw new Error(`Invalid benchmark data: ${result.errors.join(", ")}`);
|
|
203
205
|
}
|
|
204
206
|
}
|
|
@@ -8,16 +8,16 @@
|
|
|
8
8
|
* - Progress tracking
|
|
9
9
|
*/
|
|
10
10
|
|
|
11
|
-
import { logger } from
|
|
12
|
-
import { type BenchmarkRunConfig, BenchmarkRunner } from
|
|
13
|
-
import type { SimulationResult } from
|
|
11
|
+
import { logger } from "../utils/logger";
|
|
12
|
+
import { type BenchmarkRunConfig, BenchmarkRunner } from "./BenchmarkRunner";
|
|
13
|
+
import type { SimulationResult } from "./SimulationEngine";
|
|
14
14
|
|
|
15
15
|
export interface FastEvalConfig {
|
|
16
16
|
/** Benchmark file path */
|
|
17
17
|
benchmarkPath: string;
|
|
18
18
|
|
|
19
19
|
/** Agent runtime to test */
|
|
20
|
-
agentRuntime: BenchmarkRunConfig[
|
|
20
|
+
agentRuntime: BenchmarkRunConfig["agentRuntime"];
|
|
21
21
|
|
|
22
22
|
/** Agent user ID */
|
|
23
23
|
agentUserId: string;
|
|
@@ -63,6 +63,7 @@ export interface FastEvalResult {
|
|
|
63
63
|
worstRun: SimulationResult;
|
|
64
64
|
}
|
|
65
65
|
|
|
66
|
+
// biome-ignore lint/complexity/noStaticOnlyClass: Runner namespace - run/runWithProgress are logically grouped
|
|
66
67
|
export class FastEvalRunner {
|
|
67
68
|
/**
|
|
68
69
|
* Run fast evaluation
|
|
@@ -98,7 +99,7 @@ export class FastEvalRunner {
|
|
|
98
99
|
const iterations = config.iterations || 1;
|
|
99
100
|
const parallelRuns = config.parallelRuns || 1;
|
|
100
101
|
|
|
101
|
-
logger.info(
|
|
102
|
+
logger.info("Starting fast evaluation", {
|
|
102
103
|
benchmarkPath: config.benchmarkPath,
|
|
103
104
|
agentUserId: config.agentUserId,
|
|
104
105
|
iterations,
|
|
@@ -118,7 +119,7 @@ export class FastEvalRunner {
|
|
|
118
119
|
const batchSize = batchEnd - batchStart;
|
|
119
120
|
|
|
120
121
|
logger.info(
|
|
121
|
-
`Running batch ${batchStart + 1}-${batchEnd} of ${iterations}
|
|
122
|
+
`Running batch ${batchStart + 1}-${batchEnd} of ${iterations}`,
|
|
122
123
|
);
|
|
123
124
|
|
|
124
125
|
// Run batch in parallel
|
|
@@ -157,18 +158,18 @@ export class FastEvalRunner {
|
|
|
157
158
|
const avgAccuracy =
|
|
158
159
|
results.reduce(
|
|
159
160
|
(sum, r) => sum + r.metrics.predictionMetrics.accuracy,
|
|
160
|
-
0
|
|
161
|
+
0,
|
|
161
162
|
) / results.length;
|
|
162
163
|
const avgOptimality =
|
|
163
164
|
results.reduce((sum, r) => sum + r.metrics.optimalityScore, 0) /
|
|
164
165
|
results.length;
|
|
165
166
|
|
|
166
167
|
const bestRun = results.reduce((best, current) =>
|
|
167
|
-
current.metrics.totalPnl > best.metrics.totalPnl ? current : best
|
|
168
|
+
current.metrics.totalPnl > best.metrics.totalPnl ? current : best,
|
|
168
169
|
);
|
|
169
170
|
|
|
170
171
|
const worstRun = results.reduce((worst, current) =>
|
|
171
|
-
current.metrics.totalPnl < worst.metrics.totalPnl ? current : worst
|
|
172
|
+
current.metrics.totalPnl < worst.metrics.totalPnl ? current : worst,
|
|
172
173
|
);
|
|
173
174
|
|
|
174
175
|
const summary = {
|
|
@@ -179,7 +180,7 @@ export class FastEvalRunner {
|
|
|
179
180
|
runsCompleted: results.length,
|
|
180
181
|
};
|
|
181
182
|
|
|
182
|
-
logger.info(
|
|
183
|
+
logger.info("Fast evaluation completed", summary);
|
|
183
184
|
|
|
184
185
|
return {
|
|
185
186
|
results,
|
|
@@ -193,22 +194,22 @@ export class FastEvalRunner {
|
|
|
193
194
|
* Run evaluation with progress bar
|
|
194
195
|
*/
|
|
195
196
|
static async runWithProgress(
|
|
196
|
-
config: FastEvalConfig
|
|
197
|
+
config: FastEvalConfig,
|
|
197
198
|
): Promise<FastEvalResult> {
|
|
198
199
|
let lastProgress = 0;
|
|
199
200
|
|
|
200
|
-
return
|
|
201
|
+
return FastEvalRunner.run({
|
|
201
202
|
...config,
|
|
202
203
|
onProgress: (progress) => {
|
|
203
204
|
const percent = Math.round((progress.completed / progress.total) * 100);
|
|
204
205
|
if (percent !== lastProgress) {
|
|
205
206
|
const barLength = 40;
|
|
206
207
|
const filled = Math.round(
|
|
207
|
-
(progress.completed / progress.total) * barLength
|
|
208
|
+
(progress.completed / progress.total) * barLength,
|
|
208
209
|
);
|
|
209
|
-
const bar =
|
|
210
|
+
const bar = "█".repeat(filled) + "░".repeat(barLength - filled);
|
|
210
211
|
process.stdout.write(
|
|
211
|
-
`\r[${bar}] ${percent}% (${progress.completed}/${progress.total})
|
|
212
|
+
`\r[${bar}] ${percent}% (${progress.completed}/${progress.total})`,
|
|
212
213
|
);
|
|
213
214
|
lastProgress = percent;
|
|
214
215
|
}
|
|
@@ -218,7 +219,7 @@ export class FastEvalRunner {
|
|
|
218
219
|
}
|
|
219
220
|
},
|
|
220
221
|
}).then((result) => {
|
|
221
|
-
process.stdout.write(
|
|
222
|
+
process.stdout.write("\n");
|
|
222
223
|
return result;
|
|
223
224
|
});
|
|
224
225
|
}
|
|
@@ -4,10 +4,10 @@
|
|
|
4
4
|
* Validates that benchmark metrics are calculated correctly against ground truth.
|
|
5
5
|
*/
|
|
6
6
|
|
|
7
|
-
import type { ValidationResult } from
|
|
8
|
-
import { logger } from
|
|
9
|
-
import type { GroundTruth } from
|
|
10
|
-
import type { AgentAction, SimulationMetrics } from
|
|
7
|
+
import type { ValidationResult } from "../training/ConfigValidator";
|
|
8
|
+
import { logger } from "../utils/logger";
|
|
9
|
+
import type { GroundTruth } from "./BenchmarkDataGenerator";
|
|
10
|
+
import type { AgentAction, SimulationMetrics } from "./simulation-types";
|
|
11
11
|
|
|
12
12
|
export class MetricsValidator {
|
|
13
13
|
/**
|
|
@@ -16,16 +16,16 @@ export class MetricsValidator {
|
|
|
16
16
|
static validate(
|
|
17
17
|
metrics: SimulationMetrics,
|
|
18
18
|
actions: AgentAction[],
|
|
19
|
-
groundTruth: GroundTruth
|
|
19
|
+
groundTruth: GroundTruth,
|
|
20
20
|
): ValidationResult {
|
|
21
21
|
const errors: string[] = [];
|
|
22
22
|
const warnings: string[] = [];
|
|
23
23
|
|
|
24
24
|
// 1. Validate prediction accuracy calculation
|
|
25
|
-
const predictionValidation =
|
|
25
|
+
const predictionValidation = MetricsValidator.validatePredictionMetrics(
|
|
26
26
|
metrics.predictionMetrics,
|
|
27
27
|
actions,
|
|
28
|
-
groundTruth
|
|
28
|
+
groundTruth,
|
|
29
29
|
);
|
|
30
30
|
errors.push(...predictionValidation.errors);
|
|
31
31
|
warnings.push(...predictionValidation.warnings);
|
|
@@ -38,23 +38,23 @@ export class MetricsValidator {
|
|
|
38
38
|
// 3. Validate timing metrics are reasonable
|
|
39
39
|
if (metrics.timing.avgResponseTime < 0) {
|
|
40
40
|
errors.push(
|
|
41
|
-
`Invalid average response time: ${metrics.timing.avgResponseTime}
|
|
41
|
+
`Invalid average response time: ${metrics.timing.avgResponseTime}`,
|
|
42
42
|
);
|
|
43
43
|
}
|
|
44
44
|
|
|
45
45
|
if (metrics.timing.maxResponseTime < metrics.timing.avgResponseTime) {
|
|
46
46
|
errors.push(
|
|
47
|
-
`Max response time less than average: ${metrics.timing.maxResponseTime} < ${metrics.timing.avgResponseTime}
|
|
47
|
+
`Max response time less than average: ${metrics.timing.maxResponseTime} < ${metrics.timing.avgResponseTime}`,
|
|
48
48
|
);
|
|
49
49
|
}
|
|
50
50
|
|
|
51
51
|
// 4. Validate action counts match
|
|
52
52
|
const predictionActions = actions.filter(
|
|
53
|
-
(a) => a.type ===
|
|
53
|
+
(a) => a.type === "buy_prediction",
|
|
54
54
|
);
|
|
55
55
|
if (predictionActions.length !== metrics.predictionMetrics.totalPositions) {
|
|
56
56
|
warnings.push(
|
|
57
|
-
`Prediction action count mismatch: ${predictionActions.length} actions vs ${metrics.predictionMetrics.totalPositions} positions
|
|
57
|
+
`Prediction action count mismatch: ${predictionActions.length} actions vs ${metrics.predictionMetrics.totalPositions} positions`,
|
|
58
58
|
);
|
|
59
59
|
}
|
|
60
60
|
|
|
@@ -64,20 +64,20 @@ export class MetricsValidator {
|
|
|
64
64
|
const calculatedAccuracy =
|
|
65
65
|
totalPositions > 0 ? correctPredictions / totalPositions : 0;
|
|
66
66
|
const accuracyDiff = Math.abs(
|
|
67
|
-
calculatedAccuracy - metrics.predictionMetrics.accuracy
|
|
67
|
+
calculatedAccuracy - metrics.predictionMetrics.accuracy,
|
|
68
68
|
);
|
|
69
69
|
|
|
70
70
|
if (accuracyDiff > 0.01) {
|
|
71
71
|
// Allow 1% tolerance for floating point
|
|
72
72
|
errors.push(
|
|
73
|
-
`Accuracy calculation mismatch: reported ${metrics.predictionMetrics.accuracy}, calculated ${calculatedAccuracy}
|
|
73
|
+
`Accuracy calculation mismatch: reported ${metrics.predictionMetrics.accuracy}, calculated ${calculatedAccuracy}`,
|
|
74
74
|
);
|
|
75
75
|
}
|
|
76
76
|
|
|
77
77
|
// 6. Validate correct + incorrect = total
|
|
78
78
|
if (correctPredictions + incorrectPredictions !== totalPositions) {
|
|
79
79
|
errors.push(
|
|
80
|
-
`Prediction count mismatch: ${correctPredictions} + ${incorrectPredictions} != ${totalPositions}
|
|
80
|
+
`Prediction count mismatch: ${correctPredictions} + ${incorrectPredictions} != ${totalPositions}`,
|
|
81
81
|
);
|
|
82
82
|
}
|
|
83
83
|
|
|
@@ -86,17 +86,17 @@ export class MetricsValidator {
|
|
|
86
86
|
const calculatedWinRate =
|
|
87
87
|
metrics.perpMetrics.profitableTrades / metrics.perpMetrics.totalTrades;
|
|
88
88
|
const winRateDiff = Math.abs(
|
|
89
|
-
calculatedWinRate - metrics.perpMetrics.winRate
|
|
89
|
+
calculatedWinRate - metrics.perpMetrics.winRate,
|
|
90
90
|
);
|
|
91
91
|
|
|
92
92
|
if (winRateDiff > 0.01) {
|
|
93
93
|
errors.push(
|
|
94
|
-
`Win rate calculation mismatch: reported ${metrics.perpMetrics.winRate}, calculated ${calculatedWinRate}
|
|
94
|
+
`Win rate calculation mismatch: reported ${metrics.perpMetrics.winRate}, calculated ${calculatedWinRate}`,
|
|
95
95
|
);
|
|
96
96
|
}
|
|
97
97
|
}
|
|
98
98
|
|
|
99
|
-
logger.info(
|
|
99
|
+
logger.info("Metrics validation complete", {
|
|
100
100
|
valid: errors.length === 0,
|
|
101
101
|
errors: errors.length,
|
|
102
102
|
warnings: warnings.length,
|
|
@@ -113,16 +113,16 @@ export class MetricsValidator {
|
|
|
113
113
|
* Validate prediction metrics against ground truth
|
|
114
114
|
*/
|
|
115
115
|
private static validatePredictionMetrics(
|
|
116
|
-
_predictionMetrics: SimulationMetrics[
|
|
116
|
+
_predictionMetrics: SimulationMetrics["predictionMetrics"],
|
|
117
117
|
actions: AgentAction[],
|
|
118
|
-
groundTruth: GroundTruth
|
|
118
|
+
groundTruth: GroundTruth,
|
|
119
119
|
): ValidationResult {
|
|
120
120
|
const errors: string[] = [];
|
|
121
121
|
const warnings: string[] = [];
|
|
122
122
|
|
|
123
123
|
// Get all prediction actions
|
|
124
124
|
const predictionActions = actions.filter(
|
|
125
|
-
(a) => a.type ===
|
|
125
|
+
(a) => a.type === "buy_prediction",
|
|
126
126
|
);
|
|
127
127
|
|
|
128
128
|
// Validate each action against ground truth
|
|
@@ -133,7 +133,6 @@ export class MetricsValidator {
|
|
|
133
133
|
// Check if we have ground truth for this market
|
|
134
134
|
if (!(marketId in groundTruth.marketOutcomes)) {
|
|
135
135
|
warnings.push(`No ground truth for market ${marketId}`);
|
|
136
|
-
continue;
|
|
137
136
|
}
|
|
138
137
|
|
|
139
138
|
// Verify the outcome exists in ground truth
|