@elizaos/training 2.0.0-alpha.41 → 2.0.0-alpha.44
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/research-output/training-runs/training-run-1773726941205.json +38 -0
- package/research-output/training-runs/training-run-1773742857616.json +38 -0
- package/research-output/training-runs/training-run-1773742946977.json +38 -0
- package/research-output/training-runs/training-run-1773743278891.json +38 -0
- package/research-output/training-runs/training-run-1773743409754.json +38 -0
- package/research-output/training-runs/training-run-1773743651086.json +38 -0
- package/research-output/training-runs/training-run-1773743782883.json +38 -0
- package/research-output/training-runs/training-run-1773755075895.json +38 -0
- package/research-output/training-runs/training-run-1773755142682.json +38 -0
- package/scripts/rank_trajectories.ts +20 -6
- package/scripts/run_task_benchmark.ts +7 -13
- package/src/adapter.ts +96 -49
- package/src/archetypes/ArchetypeConfigService.ts +276 -264
- package/src/archetypes/derive-archetype.ts +47 -47
- package/src/archetypes/index.ts +2 -2
- package/src/benchmark/ArchetypeMatchupBenchmark.ts +70 -70
- package/src/benchmark/BenchmarkChartGenerator.ts +70 -69
- package/src/benchmark/BenchmarkDataGenerator.ts +162 -152
- package/src/benchmark/BenchmarkDataViewer.ts +98 -97
- package/src/benchmark/BenchmarkHistoryService.ts +13 -12
- package/src/benchmark/BenchmarkRunner.ts +94 -85
- package/src/benchmark/BenchmarkValidator.ts +48 -46
- package/src/benchmark/FastEvalRunner.ts +17 -16
- package/src/benchmark/MetricsValidator.ts +141 -141
- package/src/benchmark/MetricsVisualizer.ts +92 -85
- package/src/benchmark/ModelBenchmarkService.ts +90 -82
- package/src/benchmark/ModelRegistry.ts +44 -44
- package/src/benchmark/RulerBenchmarkIntegration.ts +24 -24
- package/src/benchmark/SimulationA2AInterface.ts +118 -118
- package/src/benchmark/SimulationEngine.ts +55 -54
- package/src/benchmark/TaskRunner.ts +87 -79
- package/src/benchmark/__tests__/BenchmarkRunner.test.ts +82 -82
- package/src/benchmark/__tests__/HeadToHead.test.ts +26 -26
- package/src/benchmark/index.ts +27 -27
- package/src/benchmark/parseSimulationMetrics.ts +32 -32
- package/src/benchmark/simulation-types.ts +10 -10
- package/src/dependencies.ts +34 -34
- package/src/generation/TrajectoryGenerator.ts +39 -37
- package/src/generation/index.ts +1 -1
- package/src/huggingface/HuggingFaceDatasetUploader.ts +74 -73
- package/src/huggingface/HuggingFaceIntegrationService.ts +59 -53
- package/src/huggingface/HuggingFaceModelUploader.ts +60 -59
- package/src/huggingface/index.ts +6 -6
- package/src/huggingface/shared/HuggingFaceUploadUtil.ts +66 -59
- package/src/index.ts +30 -27
- package/src/init-training.ts +6 -6
- package/src/metrics/TrajectoryMetricsExtractor.ts +70 -71
- package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +182 -182
- package/src/metrics/index.ts +2 -2
- package/src/rubrics/__tests__/index.test.ts +73 -73
- package/src/rubrics/ass-kisser.ts +6 -6
- package/src/rubrics/degen.ts +6 -6
- package/src/rubrics/goody-twoshoes.ts +6 -6
- package/src/rubrics/index.ts +50 -50
- package/src/rubrics/information-trader.ts +6 -6
- package/src/rubrics/infosec.ts +6 -6
- package/src/rubrics/liar.ts +6 -6
- package/src/rubrics/perps-trader.ts +6 -6
- package/src/rubrics/researcher.ts +6 -6
- package/src/rubrics/scammer.ts +6 -6
- package/src/rubrics/social-butterfly.ts +7 -7
- package/src/rubrics/super-predictor.ts +6 -6
- package/src/rubrics/trader.ts +5 -5
- package/src/scoring/ArchetypeScoringService.ts +56 -54
- package/src/scoring/JudgePromptBuilder.ts +96 -96
- package/src/scoring/LLMJudgeCache.ts +26 -23
- package/src/scoring/index.ts +3 -3
- package/src/training/AutomationPipeline.ts +166 -154
- package/src/training/BenchmarkService.ts +53 -47
- package/src/training/ConfigValidator.ts +202 -190
- package/src/training/MarketOutcomesTracker.ts +22 -12
- package/src/training/ModelDeployer.ts +15 -15
- package/src/training/ModelFetcher.ts +7 -7
- package/src/training/ModelSelectionService.ts +32 -32
- package/src/training/ModelUsageVerifier.ts +31 -24
- package/src/training/MultiModelOrchestrator.ts +44 -44
- package/src/training/RLModelConfig.ts +57 -57
- package/src/training/RewardBackpropagationService.ts +18 -17
- package/src/training/RulerScoringService.ts +86 -79
- package/src/training/TrainingMonitor.ts +29 -29
- package/src/training/TrajectoryRecorder.ts +40 -30
- package/src/training/__tests__/TrajectoryRecorder.test.ts +105 -105
- package/src/training/index.ts +36 -36
- package/src/training/logRLConfig.ts +7 -7
- package/src/training/pipeline.ts +13 -16
- package/src/training/storage/ModelStorageService.ts +32 -32
- package/src/training/storage/TrainingDataArchiver.ts +21 -21
- package/src/training/storage/index.ts +2 -2
- package/src/training/types.ts +6 -6
- package/src/training/window-utils.ts +14 -14
- package/src/utils/index.ts +7 -7
- package/src/utils/logger.ts +5 -5
- package/src/utils/snowflake.ts +1 -1
- package/src/utils/synthetic-detector.ts +7 -7
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@elizaos/training",
|
|
3
|
-
"version": "2.0.0-alpha.41",
|
|
3
|
+
"version": "2.0.0-alpha.44",
|
|
4
4
|
"description": "ElizaOS RL training pipeline with benchmarking and model publishing support",
|
|
5
5
|
"main": "./src/index.ts",
|
|
6
6
|
"types": "./src/index.ts",
|
|
@@ -53,5 +53,5 @@
|
|
|
53
53
|
"bun-types": "^1.3.2",
|
|
54
54
|
"typescript": "^5.9.3"
|
|
55
55
|
},
|
|
56
|
-
"gitHead": "
|
|
56
|
+
"gitHead": "2b27a4e70ebdf054b117b87ed9e8f9f709fe006b"
|
|
57
57
|
}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
{
|
|
2
|
+
"timestamp": "2026-03-17T05:55:41.205Z",
|
|
3
|
+
"config": {
|
|
4
|
+
"skipTraining": true,
|
|
5
|
+
"skipBenchmark": true,
|
|
6
|
+
"ticks": 100,
|
|
7
|
+
"archetype": "trader",
|
|
8
|
+
"verbose": false
|
|
9
|
+
},
|
|
10
|
+
"results": [
|
|
11
|
+
{
|
|
12
|
+
"name": "Check Prerequisites",
|
|
13
|
+
"success": true,
|
|
14
|
+
"message": "Prerequisites satisfied (Ollama: yes)",
|
|
15
|
+
"details": {
|
|
16
|
+
"python": true,
|
|
17
|
+
"trainingDir": true,
|
|
18
|
+
"mlx": false,
|
|
19
|
+
"ollama": true
|
|
20
|
+
},
|
|
21
|
+
"duration": 139
|
|
22
|
+
},
|
|
23
|
+
{
|
|
24
|
+
"name": "Install Dependencies",
|
|
25
|
+
"success": true,
|
|
26
|
+
"message": "Python dependencies installed",
|
|
27
|
+
"duration": 2130
|
|
28
|
+
},
|
|
29
|
+
{
|
|
30
|
+
"name": "Train Model",
|
|
31
|
+
"success": true,
|
|
32
|
+
"message": "Training skipped (--skip-training)",
|
|
33
|
+
"duration": 0
|
|
34
|
+
}
|
|
35
|
+
],
|
|
36
|
+
"totalDuration": 2270,
|
|
37
|
+
"success": true
|
|
38
|
+
}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
{
|
|
2
|
+
"timestamp": "2026-03-17T10:20:57.616Z",
|
|
3
|
+
"config": {
|
|
4
|
+
"skipTraining": true,
|
|
5
|
+
"skipBenchmark": true,
|
|
6
|
+
"ticks": 100,
|
|
7
|
+
"archetype": "trader",
|
|
8
|
+
"verbose": false
|
|
9
|
+
},
|
|
10
|
+
"results": [
|
|
11
|
+
{
|
|
12
|
+
"name": "Check Prerequisites",
|
|
13
|
+
"success": true,
|
|
14
|
+
"message": "Prerequisites satisfied (Ollama: yes)",
|
|
15
|
+
"details": {
|
|
16
|
+
"python": true,
|
|
17
|
+
"trainingDir": true,
|
|
18
|
+
"mlx": false,
|
|
19
|
+
"ollama": true
|
|
20
|
+
},
|
|
21
|
+
"duration": 132
|
|
22
|
+
},
|
|
23
|
+
{
|
|
24
|
+
"name": "Install Dependencies",
|
|
25
|
+
"success": true,
|
|
26
|
+
"message": "Python dependencies installed",
|
|
27
|
+
"duration": 1528
|
|
28
|
+
},
|
|
29
|
+
{
|
|
30
|
+
"name": "Train Model",
|
|
31
|
+
"success": true,
|
|
32
|
+
"message": "Training skipped (--skip-training)",
|
|
33
|
+
"duration": 0
|
|
34
|
+
}
|
|
35
|
+
],
|
|
36
|
+
"totalDuration": 1660,
|
|
37
|
+
"success": true
|
|
38
|
+
}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
{
|
|
2
|
+
"timestamp": "2026-03-17T10:22:26.977Z",
|
|
3
|
+
"config": {
|
|
4
|
+
"skipTraining": true,
|
|
5
|
+
"skipBenchmark": true,
|
|
6
|
+
"ticks": 100,
|
|
7
|
+
"archetype": "trader",
|
|
8
|
+
"verbose": false
|
|
9
|
+
},
|
|
10
|
+
"results": [
|
|
11
|
+
{
|
|
12
|
+
"name": "Check Prerequisites",
|
|
13
|
+
"success": true,
|
|
14
|
+
"message": "Prerequisites satisfied (Ollama: yes)",
|
|
15
|
+
"details": {
|
|
16
|
+
"python": true,
|
|
17
|
+
"trainingDir": true,
|
|
18
|
+
"mlx": false,
|
|
19
|
+
"ollama": true
|
|
20
|
+
},
|
|
21
|
+
"duration": 55
|
|
22
|
+
},
|
|
23
|
+
{
|
|
24
|
+
"name": "Install Dependencies",
|
|
25
|
+
"success": true,
|
|
26
|
+
"message": "Python dependencies installed",
|
|
27
|
+
"duration": 1215
|
|
28
|
+
},
|
|
29
|
+
{
|
|
30
|
+
"name": "Train Model",
|
|
31
|
+
"success": true,
|
|
32
|
+
"message": "Training skipped (--skip-training)",
|
|
33
|
+
"duration": 0
|
|
34
|
+
}
|
|
35
|
+
],
|
|
36
|
+
"totalDuration": 1270,
|
|
37
|
+
"success": true
|
|
38
|
+
}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
{
|
|
2
|
+
"timestamp": "2026-03-17T10:27:58.891Z",
|
|
3
|
+
"config": {
|
|
4
|
+
"skipTraining": true,
|
|
5
|
+
"skipBenchmark": true,
|
|
6
|
+
"ticks": 100,
|
|
7
|
+
"archetype": "trader",
|
|
8
|
+
"verbose": false
|
|
9
|
+
},
|
|
10
|
+
"results": [
|
|
11
|
+
{
|
|
12
|
+
"name": "Check Prerequisites",
|
|
13
|
+
"success": true,
|
|
14
|
+
"message": "Prerequisites satisfied (Ollama: yes)",
|
|
15
|
+
"details": {
|
|
16
|
+
"python": true,
|
|
17
|
+
"trainingDir": true,
|
|
18
|
+
"mlx": false,
|
|
19
|
+
"ollama": true
|
|
20
|
+
},
|
|
21
|
+
"duration": 315
|
|
22
|
+
},
|
|
23
|
+
{
|
|
24
|
+
"name": "Install Dependencies",
|
|
25
|
+
"success": true,
|
|
26
|
+
"message": "Python dependencies installed",
|
|
27
|
+
"duration": 3870
|
|
28
|
+
},
|
|
29
|
+
{
|
|
30
|
+
"name": "Train Model",
|
|
31
|
+
"success": true,
|
|
32
|
+
"message": "Training skipped (--skip-training)",
|
|
33
|
+
"duration": 0
|
|
34
|
+
}
|
|
35
|
+
],
|
|
36
|
+
"totalDuration": 4185,
|
|
37
|
+
"success": true
|
|
38
|
+
}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
{
|
|
2
|
+
"timestamp": "2026-03-17T10:30:09.754Z",
|
|
3
|
+
"config": {
|
|
4
|
+
"skipTraining": true,
|
|
5
|
+
"skipBenchmark": true,
|
|
6
|
+
"ticks": 100,
|
|
7
|
+
"archetype": "trader",
|
|
8
|
+
"verbose": false
|
|
9
|
+
},
|
|
10
|
+
"results": [
|
|
11
|
+
{
|
|
12
|
+
"name": "Check Prerequisites",
|
|
13
|
+
"success": true,
|
|
14
|
+
"message": "Prerequisites satisfied (Ollama: yes)",
|
|
15
|
+
"details": {
|
|
16
|
+
"python": true,
|
|
17
|
+
"trainingDir": true,
|
|
18
|
+
"mlx": false,
|
|
19
|
+
"ollama": true
|
|
20
|
+
},
|
|
21
|
+
"duration": 145
|
|
22
|
+
},
|
|
23
|
+
{
|
|
24
|
+
"name": "Install Dependencies",
|
|
25
|
+
"success": true,
|
|
26
|
+
"message": "Python dependencies installed",
|
|
27
|
+
"duration": 2265
|
|
28
|
+
},
|
|
29
|
+
{
|
|
30
|
+
"name": "Train Model",
|
|
31
|
+
"success": true,
|
|
32
|
+
"message": "Training skipped (--skip-training)",
|
|
33
|
+
"duration": 1
|
|
34
|
+
}
|
|
35
|
+
],
|
|
36
|
+
"totalDuration": 2412,
|
|
37
|
+
"success": true
|
|
38
|
+
}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
{
|
|
2
|
+
"timestamp": "2026-03-17T10:34:11.086Z",
|
|
3
|
+
"config": {
|
|
4
|
+
"skipTraining": true,
|
|
5
|
+
"skipBenchmark": true,
|
|
6
|
+
"ticks": 100,
|
|
7
|
+
"archetype": "trader",
|
|
8
|
+
"verbose": false
|
|
9
|
+
},
|
|
10
|
+
"results": [
|
|
11
|
+
{
|
|
12
|
+
"name": "Check Prerequisites",
|
|
13
|
+
"success": true,
|
|
14
|
+
"message": "Prerequisites satisfied (Ollama: yes)",
|
|
15
|
+
"details": {
|
|
16
|
+
"python": true,
|
|
17
|
+
"trainingDir": true,
|
|
18
|
+
"mlx": false,
|
|
19
|
+
"ollama": true
|
|
20
|
+
},
|
|
21
|
+
"duration": 138
|
|
22
|
+
},
|
|
23
|
+
{
|
|
24
|
+
"name": "Install Dependencies",
|
|
25
|
+
"success": true,
|
|
26
|
+
"message": "Python dependencies installed",
|
|
27
|
+
"duration": 1809
|
|
28
|
+
},
|
|
29
|
+
{
|
|
30
|
+
"name": "Train Model",
|
|
31
|
+
"success": true,
|
|
32
|
+
"message": "Training skipped (--skip-training)",
|
|
33
|
+
"duration": 1
|
|
34
|
+
}
|
|
35
|
+
],
|
|
36
|
+
"totalDuration": 1949,
|
|
37
|
+
"success": true
|
|
38
|
+
}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
{
|
|
2
|
+
"timestamp": "2026-03-17T10:36:22.883Z",
|
|
3
|
+
"config": {
|
|
4
|
+
"skipTraining": true,
|
|
5
|
+
"skipBenchmark": true,
|
|
6
|
+
"ticks": 100,
|
|
7
|
+
"archetype": "trader",
|
|
8
|
+
"verbose": false
|
|
9
|
+
},
|
|
10
|
+
"results": [
|
|
11
|
+
{
|
|
12
|
+
"name": "Check Prerequisites",
|
|
13
|
+
"success": true,
|
|
14
|
+
"message": "Prerequisites satisfied (Ollama: yes)",
|
|
15
|
+
"details": {
|
|
16
|
+
"python": true,
|
|
17
|
+
"trainingDir": true,
|
|
18
|
+
"mlx": false,
|
|
19
|
+
"ollama": true
|
|
20
|
+
},
|
|
21
|
+
"duration": 40
|
|
22
|
+
},
|
|
23
|
+
{
|
|
24
|
+
"name": "Install Dependencies",
|
|
25
|
+
"success": true,
|
|
26
|
+
"message": "Python dependencies installed",
|
|
27
|
+
"duration": 1138
|
|
28
|
+
},
|
|
29
|
+
{
|
|
30
|
+
"name": "Train Model",
|
|
31
|
+
"success": true,
|
|
32
|
+
"message": "Training skipped (--skip-training)",
|
|
33
|
+
"duration": 0
|
|
34
|
+
}
|
|
35
|
+
],
|
|
36
|
+
"totalDuration": 1178,
|
|
37
|
+
"success": true
|
|
38
|
+
}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
{
|
|
2
|
+
"timestamp": "2026-03-17T13:44:35.895Z",
|
|
3
|
+
"config": {
|
|
4
|
+
"skipTraining": true,
|
|
5
|
+
"skipBenchmark": true,
|
|
6
|
+
"ticks": 100,
|
|
7
|
+
"archetype": "trader",
|
|
8
|
+
"verbose": false
|
|
9
|
+
},
|
|
10
|
+
"results": [
|
|
11
|
+
{
|
|
12
|
+
"name": "Check Prerequisites",
|
|
13
|
+
"success": true,
|
|
14
|
+
"message": "Prerequisites satisfied (Ollama: yes)",
|
|
15
|
+
"details": {
|
|
16
|
+
"python": true,
|
|
17
|
+
"trainingDir": true,
|
|
18
|
+
"mlx": false,
|
|
19
|
+
"ollama": true
|
|
20
|
+
},
|
|
21
|
+
"duration": 138
|
|
22
|
+
},
|
|
23
|
+
{
|
|
24
|
+
"name": "Install Dependencies",
|
|
25
|
+
"success": true,
|
|
26
|
+
"message": "Python dependencies installed",
|
|
27
|
+
"duration": 2068
|
|
28
|
+
},
|
|
29
|
+
{
|
|
30
|
+
"name": "Train Model",
|
|
31
|
+
"success": true,
|
|
32
|
+
"message": "Training skipped (--skip-training)",
|
|
33
|
+
"duration": 0
|
|
34
|
+
}
|
|
35
|
+
],
|
|
36
|
+
"totalDuration": 2208,
|
|
37
|
+
"success": true
|
|
38
|
+
}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
{
|
|
2
|
+
"timestamp": "2026-03-17T13:45:42.682Z",
|
|
3
|
+
"config": {
|
|
4
|
+
"skipTraining": true,
|
|
5
|
+
"skipBenchmark": true,
|
|
6
|
+
"ticks": 100,
|
|
7
|
+
"archetype": "trader",
|
|
8
|
+
"verbose": false
|
|
9
|
+
},
|
|
10
|
+
"results": [
|
|
11
|
+
{
|
|
12
|
+
"name": "Check Prerequisites",
|
|
13
|
+
"success": true,
|
|
14
|
+
"message": "Prerequisites satisfied (Ollama: yes)",
|
|
15
|
+
"details": {
|
|
16
|
+
"python": true,
|
|
17
|
+
"trainingDir": true,
|
|
18
|
+
"mlx": false,
|
|
19
|
+
"ollama": true
|
|
20
|
+
},
|
|
21
|
+
"duration": 132
|
|
22
|
+
},
|
|
23
|
+
{
|
|
24
|
+
"name": "Install Dependencies",
|
|
25
|
+
"success": true,
|
|
26
|
+
"message": "Python dependencies installed",
|
|
27
|
+
"duration": 2688
|
|
28
|
+
},
|
|
29
|
+
{
|
|
30
|
+
"name": "Train Model",
|
|
31
|
+
"success": true,
|
|
32
|
+
"message": "Training skipped (--skip-training)",
|
|
33
|
+
"duration": 0
|
|
34
|
+
}
|
|
35
|
+
],
|
|
36
|
+
"totalDuration": 2820,
|
|
37
|
+
"success": true
|
|
38
|
+
}
|
|
@@ -67,7 +67,6 @@ async function main() {
|
|
|
67
67
|
// Initialize Judge Runtime
|
|
68
68
|
const character = {
|
|
69
69
|
name: 'JudgeAgent',
|
|
70
|
-
modelProvider: "openai" as any,
|
|
71
70
|
bio: ['I am an impartial AI judge.'],
|
|
72
71
|
settings: {
|
|
73
72
|
secrets: {
|
|
@@ -138,7 +137,18 @@ async function main() {
|
|
|
138
137
|
|
|
139
138
|
console.log(`Found ${lines.length} trajectories to rank.`);
|
|
140
139
|
|
|
141
|
-
|
|
140
|
+
interface ScoredTrajectory {
|
|
141
|
+
trajectoryId?: string;
|
|
142
|
+
steps?: Array<{
|
|
143
|
+
action?: { parameters?: { text?: string } };
|
|
144
|
+
}>;
|
|
145
|
+
metadata?: { task?: string };
|
|
146
|
+
score?: number;
|
|
147
|
+
reasoning?: string;
|
|
148
|
+
isScored?: boolean;
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
const scoredTrajectories: ScoredTrajectory[] = [];
|
|
142
152
|
|
|
143
153
|
// Clear output file first if overwriting
|
|
144
154
|
if (fs.existsSync(outputFile)) {
|
|
@@ -147,12 +157,12 @@ async function main() {
|
|
|
147
157
|
|
|
148
158
|
for (const line of lines) {
|
|
149
159
|
try {
|
|
150
|
-
const trajectory = JSON.parse(line);
|
|
160
|
+
const trajectory = JSON.parse(line) as ScoredTrajectory;
|
|
151
161
|
const { steps, metadata } = trajectory;
|
|
152
162
|
const task = metadata?.task || 'Unknown Task';
|
|
153
163
|
|
|
154
164
|
// Extract the last step's action/response
|
|
155
|
-
const lastStep = steps[steps.length - 1];
|
|
165
|
+
const lastStep = steps && steps.length > 0 ? steps[steps.length - 1] : undefined;
|
|
156
166
|
const response = lastStep?.action?.parameters?.text || "No response found";
|
|
157
167
|
|
|
158
168
|
console.log(`Ranking trajectory ${trajectory.trajectoryId}...`);
|
|
@@ -177,11 +187,15 @@ Return ONLY valid JSON.
|
|
|
177
187
|
const resultText = typeof result === 'string' ? result : result.text;
|
|
178
188
|
|
|
179
189
|
// Parse JSON
|
|
180
|
-
|
|
190
|
+
interface ScoreData {
|
|
191
|
+
score: number;
|
|
192
|
+
reasoning: string;
|
|
193
|
+
}
|
|
194
|
+
let scoreData: ScoreData;
|
|
181
195
|
try {
|
|
182
196
|
// simple cleanup for markdown code blocks
|
|
183
197
|
const jsonStr = resultText.replace(/```json/g, '').replace(/```/g, '').trim();
|
|
184
|
-
scoreData = JSON.parse(jsonStr);
|
|
198
|
+
scoreData = JSON.parse(jsonStr) as ScoreData;
|
|
185
199
|
} catch (e) {
|
|
186
200
|
console.warn(`Failed to parse judge output for ${trajectory.trajectoryId}: ${resultText}`);
|
|
187
201
|
scoreData = { score: 0, reasoning: "Parse Error" };
|
|
@@ -50,8 +50,7 @@ class BenchmarkRuntimeManager implements IAgentRuntimeManager {
|
|
|
50
50
|
// Create a new runtime
|
|
51
51
|
const character = {
|
|
52
52
|
name: 'BenchmarkAgent',
|
|
53
|
-
|
|
54
|
-
bio: 'A helpful assistant for benchmarking.',
|
|
53
|
+
bio: ['A helpful assistant for benchmarking.'],
|
|
55
54
|
settings: {
|
|
56
55
|
secrets: {
|
|
57
56
|
OPENAI_API_KEY: process.env.OPENAI_API_KEY || ''
|
|
@@ -60,13 +59,7 @@ class BenchmarkRuntimeManager implements IAgentRuntimeManager {
|
|
|
60
59
|
};
|
|
61
60
|
|
|
62
61
|
const runtime = new AgentRuntime({
|
|
63
|
-
token: process.env.OPENAI_API_KEY || '',
|
|
64
|
-
modelProvider: "openai" as any,
|
|
65
62
|
character,
|
|
66
|
-
plugins: [],
|
|
67
|
-
providers: [],
|
|
68
|
-
actions: [],
|
|
69
|
-
evaluators: [],
|
|
70
63
|
});
|
|
71
64
|
|
|
72
65
|
// We must initialize with allowNoDatabase to avoid DB error
|
|
@@ -116,7 +109,7 @@ class BenchmarkTaskInteractor implements ITaskInteractor {
|
|
|
116
109
|
|
|
117
110
|
const userMemory: Memory = {
|
|
118
111
|
id: messageId as `${string}-${string}-${string}-${string}-${string}`,
|
|
119
|
-
|
|
112
|
+
entityId: userId as `${string}-${string}-${string}-${string}-${string}`,
|
|
120
113
|
agentId: runtime.agentId,
|
|
121
114
|
roomId: roomId as `${string}-${string}-${string}-${string}-${string}`,
|
|
122
115
|
content: {
|
|
@@ -143,6 +136,7 @@ Assistant:`;
|
|
|
143
136
|
// Signature: generateText(input: string, options?: GenerateTextOptions)
|
|
144
137
|
const result = await runtime.generateText(context, {
|
|
145
138
|
modelType: ModelType.TEXT_SMALL,
|
|
139
|
+
stopSequences: [],
|
|
146
140
|
});
|
|
147
141
|
// Handle both string and object return types for safety
|
|
148
142
|
const response = typeof result === 'string' ? result : result.text;
|
|
@@ -227,11 +221,11 @@ async function main() {
|
|
|
227
221
|
agentService: new BenchmarkAgentService(),
|
|
228
222
|
agentRuntimeManager: new BenchmarkRuntimeManager(),
|
|
229
223
|
autonomousCoordinator: {
|
|
230
|
-
executeAutonomousTick: async () => ({ success: true })
|
|
231
|
-
}
|
|
224
|
+
executeAutonomousTick: async () => ({ success: true }),
|
|
225
|
+
},
|
|
232
226
|
llmCaller: {
|
|
233
|
-
callGroqDirect: async () => "mock response"
|
|
234
|
-
}
|
|
227
|
+
callGroqDirect: async () => "mock response",
|
|
228
|
+
},
|
|
235
229
|
});
|
|
236
230
|
|
|
237
231
|
// Import task interactor config
|