@elizaos/training 2.0.0-alpha.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. package/Dockerfile +75 -0
  2. package/LICENSE +21 -0
  3. package/Makefile +374 -0
  4. package/README.md +346 -0
  5. package/config/rubrics.json +137 -0
  6. package/docker-compose.test.yml +57 -0
  7. package/package.json +57 -0
  8. package/python/config/babylon_atropos.yaml +90 -0
  9. package/python/config/profiles/12gb.json +11 -0
  10. package/python/config/profiles/16gb.json +10 -0
  11. package/python/config/profiles/24gb.json +10 -0
  12. package/python/config/profiles/48gb.json +10 -0
  13. package/python/config/profiles/cpu.json +11 -0
  14. package/python/config/profiles/l40-2gpu-safe.json +20 -0
  15. package/python/config/profiles/l40-2gpu.json +22 -0
  16. package/python/config/profiles/l40-4gpu.json +21 -0
  17. package/python/config/profiles/l40.json +17 -0
  18. package/python/config/tinker_training.yaml +143 -0
  19. package/python/curriculum_state.json +165 -0
  20. package/python/env.template +86 -0
  21. package/python/env.training.template +46 -0
  22. package/python/pyproject.toml +41 -0
  23. package/python/requirements-ci.txt +31 -0
  24. package/python/requirements.txt +87 -0
  25. package/python/scripts/__init__.py +4 -0
  26. package/python/scripts/benchmark_should_respond.py +190 -0
  27. package/python/scripts/debug_inference.py +62 -0
  28. package/python/scripts/import_json_trajectories.py +412 -0
  29. package/python/scripts/local-finetune/README.md +63 -0
  30. package/python/scripts/local-finetune/ingest_and_score.py +139 -0
  31. package/python/scripts/local-finetune/merge_model.py +32 -0
  32. package/python/scripts/local-finetune/test_adapter.py +91 -0
  33. package/python/scripts/local-finetune/train_from_csv.py +132 -0
  34. package/python/scripts/merge_trajectories.py +318 -0
  35. package/python/scripts/optimize_prompt_grpo.py +269 -0
  36. package/python/scripts/run_ab_test.py +143 -0
  37. package/python/scripts/run_full_pipeline.py +544 -0
  38. package/python/scripts/run_tinker_training.py +192 -0
  39. package/python/scripts/run_training.py +914 -0
  40. package/python/scripts/test_generation.py +29 -0
  41. package/python/scripts/test_judge.py +155 -0
  42. package/python/scripts/test_pipeline.py +356 -0
  43. package/python/scripts/test_trained_model.py +380 -0
  44. package/python/scripts/train_grpo.py +360 -0
  45. package/python/scripts/train_jsonl.py +223 -0
  46. package/python/scripts/train_local.py +528 -0
  47. package/python/setup.py +20 -0
  48. package/python/src/__init__.py +190 -0
  49. package/python/src/data_bridge/__init__.py +24 -0
  50. package/python/src/data_bridge/converter.py +435 -0
  51. package/python/src/data_bridge/reader.py +393 -0
  52. package/python/src/models.py +283 -0
  53. package/python/src/training/__init__.py +605 -0
  54. package/python/src/training/ab_testing.py +404 -0
  55. package/python/src/training/action_executor.py +621 -0
  56. package/python/src/training/archetype_trainer.py +347 -0
  57. package/python/src/training/atropos_trainer.py +980 -0
  58. package/python/src/training/babylon_env.py +1254 -0
  59. package/python/src/training/error_recovery.py +647 -0
  60. package/python/src/training/evaluation.py +856 -0
  61. package/python/src/training/fast_simulator.py +880 -0
  62. package/python/src/training/format_validator.py +584 -0
  63. package/python/src/training/hybrid_env.py +522 -0
  64. package/python/src/training/kl_controller.py +628 -0
  65. package/python/src/training/multi_prompt_dataset.py +883 -0
  66. package/python/src/training/multi_turn.py +656 -0
  67. package/python/src/training/online_env.py +1084 -0
  68. package/python/src/training/quality_scorer.py +391 -0
  69. package/python/src/training/quality_utils.py +633 -0
  70. package/python/src/training/rewards.py +1344 -0
  71. package/python/src/training/rlaif_env.py +17 -0
  72. package/python/src/training/rollout_generator.py +502 -0
  73. package/python/src/training/rubric_loader.py +198 -0
  74. package/python/src/training/scenario_pool.py +1072 -0
  75. package/python/src/training/schemas.py +481 -0
  76. package/python/src/training/service_manager.py +552 -0
  77. package/python/src/training/simulation_bridge.py +535 -0
  78. package/python/src/training/tick_reward_attribution.py +399 -0
  79. package/python/src/training/tinker_client.py +575 -0
  80. package/python/src/training/tinker_trainer.py +646 -0
  81. package/python/src/training/tokenization_utils.py +402 -0
  82. package/python/tests/e2e/__init__.py +13 -0
  83. package/python/tests/e2e/conftest.py +258 -0
  84. package/python/tests/e2e/test_full_pipeline.py +643 -0
  85. package/python/tests/e2e/test_online_training_e2e.py +365 -0
  86. package/python/tests/integration/__init__.py +12 -0
  87. package/python/tests/integration/conftest.py +383 -0
  88. package/python/tests/integration/test_db_integration.py +649 -0
  89. package/python/tests/integration/test_json_mode_integration.py +554 -0
  90. package/python/tests/test_action_executor.py +594 -0
  91. package/python/tests/test_archetype_scoring.py +1027 -0
  92. package/python/tests/test_atropos_integration.py +360 -0
  93. package/python/tests/test_evaluation.py +727 -0
  94. package/python/tests/test_format_validator.py +486 -0
  95. package/python/tests/test_kl_controller.py +432 -0
  96. package/python/tests/test_lr_scheduler.py +579 -0
  97. package/python/tests/test_multi_turn.py +590 -0
  98. package/python/tests/test_online_env.py +519 -0
  99. package/python/tests/test_quality_scorer.py +474 -0
  100. package/python/tests/test_scenario_pool.py +735 -0
  101. package/python/tests/test_service_manager.py +585 -0
  102. package/python/tests/test_simulation_rollout.py +581 -0
  103. package/python/tests/test_tokenization_utils.py +501 -0
  104. package/python/tests/test_training_orchestrator.py +497 -0
  105. package/python/tests/test_training_output_structure.py +661 -0
  106. package/research-output/training-runs/training-run-1770772042899.json +26 -0
  107. package/research-output/training-runs/training-run-1770930079670.json +32 -0
  108. package/research-output/training-runs/training-run-1770930143700.json +44 -0
  109. package/research-output/training-runs/training-run-1770930183638.json +38 -0
  110. package/research-output/training-runs/training-run-1770930442049.json +38 -0
  111. package/research-output/training-runs/training-run-1770930793243.json +38 -0
  112. package/research-output/training-runs/training-run-1771276293257.json +38 -0
  113. package/research-output/training-runs/training-run-1771276389280.json +38 -0
  114. package/research-output/training-runs/training-run-1771276502776.json +38 -0
  115. package/research-output/training-runs/training-run-1771277340748.json +38 -0
  116. package/research-output/training-runs/training-run-1773013658993.json +38 -0
  117. package/research-output/training-runs/training-run-1773013861014.json +38 -0
  118. package/research-output/training-runs/training-run-1773014215983.json +38 -0
  119. package/scripts/assess-training-data.ts +422 -0
  120. package/scripts/e2e-training-test.ts +550 -0
  121. package/scripts/export-rubrics.ts +64 -0
  122. package/scripts/generate-research-report.ts +1523 -0
  123. package/scripts/generate_dataset.sh +173 -0
  124. package/scripts/generate_should_respond.ts +267 -0
  125. package/scripts/generate_should_respond_dataset.ts +162 -0
  126. package/scripts/json-mode-benchmark.ts +399 -0
  127. package/scripts/rank_trajectories.ts +207 -0
  128. package/scripts/real-archetype-benchmark.ts +210 -0
  129. package/scripts/run-baseline-comparison.ts +116 -0
  130. package/scripts/run-full-pipeline.ts +272 -0
  131. package/scripts/run_rlaif_loop.ts +78 -0
  132. package/scripts/run_task_benchmark.ts +247 -0
  133. package/scripts/runpod_setup.sh +137 -0
  134. package/scripts/runpod_validate.sh +147 -0
  135. package/scripts/test-model-in-game.ts +955 -0
  136. package/scripts/test-scoring.ts +73 -0
  137. package/scripts/test-trained-model.ts +209 -0
  138. package/scripts/train-and-test.ts +824 -0
  139. package/scripts/verify-final.ts +118 -0
  140. package/src/adapter.ts +516 -0
  141. package/src/archetypes/ArchetypeConfigService.ts +626 -0
  142. package/src/archetypes/derive-archetype.ts +249 -0
  143. package/src/archetypes/index.ts +22 -0
  144. package/src/benchmark/ArchetypeMatchupBenchmark.ts +825 -0
  145. package/src/benchmark/BenchmarkChartGenerator.ts +748 -0
  146. package/src/benchmark/BenchmarkDataGenerator.ts +1288 -0
  147. package/src/benchmark/BenchmarkDataViewer.ts +324 -0
  148. package/src/benchmark/BenchmarkHistoryService.ts +221 -0
  149. package/src/benchmark/BenchmarkRunner.ts +685 -0
  150. package/src/benchmark/BenchmarkValidator.ts +204 -0
  151. package/src/benchmark/FastEvalRunner.ts +225 -0
  152. package/src/benchmark/MetricsValidator.ts +165 -0
  153. package/src/benchmark/MetricsVisualizer.ts +909 -0
  154. package/src/benchmark/ModelBenchmarkService.ts +611 -0
  155. package/src/benchmark/ModelRegistry.ts +158 -0
  156. package/src/benchmark/RulerBenchmarkIntegration.ts +235 -0
  157. package/src/benchmark/SimulationA2AInterface.ts +1169 -0
  158. package/src/benchmark/SimulationEngine.ts +832 -0
  159. package/src/benchmark/TaskRunner.ts +94 -0
  160. package/src/benchmark/__tests__/BenchmarkRunner.test.ts +534 -0
  161. package/src/benchmark/__tests__/HeadToHead.test.ts +126 -0
  162. package/src/benchmark/index.ts +91 -0
  163. package/src/benchmark/parseSimulationMetrics.ts +124 -0
  164. package/src/benchmark/simulation-types.ts +78 -0
  165. package/src/dependencies.ts +475 -0
  166. package/src/generation/TrajectoryGenerator.ts +387 -0
  167. package/src/generation/index.ts +12 -0
  168. package/src/huggingface/HuggingFaceDatasetUploader.ts +636 -0
  169. package/src/huggingface/HuggingFaceIntegrationService.ts +426 -0
  170. package/src/huggingface/HuggingFaceModelUploader.ts +532 -0
  171. package/src/huggingface/index.ts +27 -0
  172. package/src/huggingface/shared/HuggingFaceUploadUtil.ts +206 -0
  173. package/src/index.ts +102 -0
  174. package/src/init-training.ts +53 -0
  175. package/src/metrics/TrajectoryMetricsExtractor.ts +653 -0
  176. package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +759 -0
  177. package/src/metrics/index.ts +8 -0
  178. package/src/metrics/types.ts +200 -0
  179. package/src/rubrics/__tests__/index.test.ts +184 -0
  180. package/src/rubrics/ass-kisser.ts +85 -0
  181. package/src/rubrics/degen.ts +80 -0
  182. package/src/rubrics/goody-twoshoes.ts +84 -0
  183. package/src/rubrics/index.ts +236 -0
  184. package/src/rubrics/information-trader.ts +84 -0
  185. package/src/rubrics/infosec.ts +101 -0
  186. package/src/rubrics/liar.ts +104 -0
  187. package/src/rubrics/perps-trader.ts +87 -0
  188. package/src/rubrics/researcher.ts +81 -0
  189. package/src/rubrics/scammer.ts +82 -0
  190. package/src/rubrics/social-butterfly.ts +73 -0
  191. package/src/rubrics/super-predictor.ts +97 -0
  192. package/src/rubrics/trader.ts +67 -0
  193. package/src/scoring/ArchetypeScoringService.ts +486 -0
  194. package/src/scoring/JudgePromptBuilder.ts +556 -0
  195. package/src/scoring/LLMJudgeCache.ts +401 -0
  196. package/src/scoring/index.ts +9 -0
  197. package/src/training/AutomationPipeline.ts +916 -0
  198. package/src/training/BenchmarkService.ts +518 -0
  199. package/src/training/ConfigValidator.ts +220 -0
  200. package/src/training/MarketOutcomesTracker.ts +187 -0
  201. package/src/training/ModelDeployer.ts +186 -0
  202. package/src/training/ModelFetcher.ts +76 -0
  203. package/src/training/ModelSelectionService.ts +341 -0
  204. package/src/training/ModelUsageVerifier.ts +160 -0
  205. package/src/training/MultiModelOrchestrator.ts +580 -0
  206. package/src/training/RLModelConfig.ts +407 -0
  207. package/src/training/RewardBackpropagationService.ts +149 -0
  208. package/src/training/RulerScoringService.ts +666 -0
  209. package/src/training/TrainingMonitor.ts +166 -0
  210. package/src/training/TrajectoryRecorder.ts +399 -0
  211. package/src/training/__tests__/TrajectoryRecorder.test.ts +472 -0
  212. package/src/training/index.ts +100 -0
  213. package/src/training/logRLConfig.ts +34 -0
  214. package/src/training/pipeline.ts +129 -0
  215. package/src/training/storage/ModelStorageService.ts +279 -0
  216. package/src/training/storage/TrainingDataArchiver.ts +197 -0
  217. package/src/training/storage/index.ts +17 -0
  218. package/src/training/types.ts +207 -0
  219. package/src/training/window-utils.ts +138 -0
  220. package/src/utils/index.ts +101 -0
  221. package/src/utils/logger.ts +59 -0
  222. package/src/utils/snowflake.ts +17 -0
  223. package/src/utils/synthetic-detector.ts +111 -0
  224. package/tsconfig.json +20 -0
@@ -0,0 +1,173 @@
1
+ #!/bin/bash
2
+ #
3
+ # Generate Large Training Dataset
4
+ #
5
+ # Runs multiple parallel workers to generate trajectory data using the
6
+ # TypeScript simulation engine. Each worker uses a different seed for variety.
7
+ #
8
+ # Usage:
9
+ # ./scripts/generate_dataset.sh [HOURS] [PARALLEL_WORKERS] [NPCS_PER_WORKER] [OUTPUT_DIR]
10
+ #
11
+ # Examples:
12
+ # ./scripts/generate_dataset.sh # Default: 24h, 4 workers, 20 NPCs
13
+ # ./scripts/generate_dataset.sh 48 8 30 # 48 hours, 8 workers, 30 NPCs
14
+ # ./scripts/generate_dataset.sh 24 4 20 ./data # Custom output directory
15
+ #
16
+ # Requirements:
17
+ # - bun installed
18
+ # - GROQ_API_KEY or OPENAI_API_KEY set
19
+ # - generate-training-data.ts script available
20
+ #
21
+
22
+ set -e
23
+
24
+ # Configuration
25
+ HOURS=${1:-24}
26
+ PARALLEL=${2:-4}
27
+ NPCS=${3:-20}
28
+ OUTPUT_DIR=${4:-"./training-data-output"}
29
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
30
+ ENGINE_DIR="$(dirname "$SCRIPT_DIR")/../engine"
31
+
32
+ # Ensure we're in the right directory
33
+ cd "$(dirname "$SCRIPT_DIR")"
34
+
35
+ echo "=============================================="
36
+ echo "Babylon Training Data Generator"
37
+ echo "=============================================="
38
+ echo ""
39
+ echo "Configuration:"
40
+ echo " Hours per worker: $HOURS"
41
+ echo " Parallel workers: $PARALLEL"
42
+ echo " NPCs per worker: $NPCS"
43
+ echo " Output directory: $OUTPUT_DIR"
44
+ echo " Engine directory: $ENGINE_DIR"
45
+ echo ""
46
+
47
+ # Check for required API keys
48
+ if [ -z "$GROQ_API_KEY" ] && [ -z "$OPENAI_API_KEY" ]; then
49
+ echo "ERROR: Neither GROQ_API_KEY nor OPENAI_API_KEY is set"
50
+ echo "Please set one of these environment variables"
51
+ exit 1
52
+ fi
53
+
54
+ # Check for bun
55
+ if ! command -v bun &> /dev/null; then
56
+ echo "ERROR: bun is not installed"
57
+ echo "Install with: curl -fsSL https://bun.sh/install | bash"
58
+ exit 1
59
+ fi
60
+
61
+ # Check for TypeScript script
62
+ SCRIPT_PATH="$ENGINE_DIR/examples/generate-training-data.ts"
63
+ if [ ! -f "$SCRIPT_PATH" ]; then
64
+ echo "ERROR: generate-training-data.ts not found at $SCRIPT_PATH"
65
+ exit 1
66
+ fi
67
+
68
+ # Create output directories
69
+ mkdir -p "$OUTPUT_DIR"
70
+ mkdir -p "$OUTPUT_DIR/logs"
71
+
72
+ echo "Starting $PARALLEL parallel workers..."
73
+ echo ""
74
+
75
+ # Track PIDs for cleanup
76
+ PIDS=()
77
+
78
+ # Cleanup function
79
+ cleanup() {
80
+ echo ""
81
+ echo "Cleaning up..."
82
+ for pid in "${PIDS[@]}"; do
83
+ if kill -0 "$pid" 2>/dev/null; then
84
+ kill "$pid" 2>/dev/null || true
85
+ fi
86
+ done
87
+ exit 0
88
+ }
89
+
90
+ trap cleanup SIGINT SIGTERM
91
+
92
+ # Start workers
93
+ BASE_SEED=$(date +%s)
94
+ for i in $(seq 1 "$PARALLEL"); do
95
+ SEED=$((BASE_SEED + i * 1000))
96
+ WORKER_OUTPUT="$OUTPUT_DIR/batch_$i"
97
+ LOG_FILE="$OUTPUT_DIR/logs/worker_$i.log"
98
+
99
+ mkdir -p "$WORKER_OUTPUT"
100
+
101
+ echo "Starting worker $i (seed: $SEED, output: $WORKER_OUTPUT)"
102
+
103
+ # Run in background, redirect output to log file
104
+ (
105
+ cd "$ENGINE_DIR" && \
106
+ bun run examples/generate-training-data.ts \
107
+ --causal \
108
+ --hours "$HOURS" \
109
+ --npcs "$NPCS" \
110
+ --seed "$SEED" \
111
+ --output "$WORKER_OUTPUT" \
112
+ 2>&1
113
+ ) > "$LOG_FILE" 2>&1 &
114
+
115
+ PIDS+=($!)
116
+ done
117
+
118
+ echo ""
119
+ echo "All workers started. PIDs: ${PIDS[*]}"
120
+ echo "Logs available in: $OUTPUT_DIR/logs/"
121
+ echo ""
122
+ echo "Waiting for workers to complete..."
123
+ echo "(Press Ctrl+C to cancel)"
124
+ echo ""
125
+
126
+ # Wait for all workers to complete
127
+ FAILED=0
128
+ for i in "${!PIDS[@]}"; do
129
+ pid=${PIDS[$i]}
130
+ worker_num=$((i + 1))
131
+
132
+ if wait "$pid"; then
133
+ echo "✓ Worker $worker_num completed successfully"
134
+ else
135
+ echo "✗ Worker $worker_num failed (see logs/worker_$worker_num.log)"
136
+ FAILED=$((FAILED + 1))
137
+ fi
138
+ done
139
+
140
+ echo ""
141
+ echo "=============================================="
142
+ echo "Generation Complete"
143
+ echo "=============================================="
144
+
145
+ # Count trajectories
146
+ TOTAL_TRAJECTORIES=0
147
+ for i in $(seq 1 "$PARALLEL"); do
148
+ WORKER_OUTPUT="$OUTPUT_DIR/batch_$i/trajectories"
149
+ if [ -d "$WORKER_OUTPUT" ]; then
150
+ COUNT=$(find "$WORKER_OUTPUT" -name "*.json" 2>/dev/null | wc -l)
151
+ echo " Worker $i: $COUNT trajectories"
152
+ TOTAL_TRAJECTORIES=$((TOTAL_TRAJECTORIES + COUNT))
153
+ fi
154
+ done
155
+
156
+ echo ""
157
+ echo "Total trajectories: $TOTAL_TRAJECTORIES"
158
+ echo "Output directory: $OUTPUT_DIR"
159
+ echo "Failed workers: $FAILED"
160
+ echo ""
161
+
162
+ if [ "$FAILED" -gt 0 ]; then
163
+ echo "Some workers failed. Check logs for details."
164
+ exit 1
165
+ fi
166
+
167
+ echo "Next steps:"
168
+ echo " 1. Merge trajectories: python scripts/merge_trajectories.py $OUTPUT_DIR"
169
+ echo " 2. Validate data: python scripts/import_json_trajectories.py --dry-run"
170
+ echo " 3. Import to database: python scripts/import_json_trajectories.py"
171
+ echo ""
172
+
173
+
@@ -0,0 +1,267 @@
1
+
2
+ import fs from 'fs';
3
+ import path from 'path';
4
+
5
+ // Parse command line arguments
6
+ const args = process.argv.slice(2);
7
+ const isEval = args.includes('--eval');
8
+ const OUTPUT_FILE = isEval ? 'should_respond_eval.jsonl' : 'should_respond_data.jsonl';
9
+ const COUNT_MULTIPLIER = isEval ? 0.4 : 1.0; // Generate fewer examples for eval
10
+
11
+ console.log(`Generating ${isEval ? 'EVALUATION' : 'TRAINING'} dataset to ${OUTPUT_FILE}...`);
12
+
13
+ const AGENT_NAME = 'Eliza';
14
+
15
+ // Scenarios with weights
16
+ const SCENARIOS = [
17
+ { type: 'DIRECT_MENTION', weight: 5, should: 'RESPOND' },
18
+ { type: 'RELEVANT_TOPIC', weight: 2, should: 'RESPOND' },
19
+ { type: 'IRRELEVANT_TOPIC', weight: 5, should: 'IGNORE' },
20
+ { type: 'DIRECT_IGNORE_INSTRUCTION', weight: 1, should: 'STOP' },
21
+ { type: 'ALIAS_MENTION', weight: 1, should: 'RESPOND' }, // e.g. "Liz"
22
+ { type: 'WRONG_NAME', weight: 2, should: 'IGNORE' }, // e.g. "Hey Siri"
23
+ { type: 'INDIRECT_MENTION', weight: 3, should: 'RESPOND' }, // "Eliza said...", "I heard Eliza..." -> Now RESPOND per user request
24
+ { type: 'NAME_OVERLAP', weight: 2, should: 'IGNORE' }, // "Elizabeth", "Eliza-beth"
25
+ { type: 'THREAD_CONTINUATION', weight: 5, should: 'RESPOND' }, // Agent spoke last
26
+ { type: 'AMBIGUOUS', weight: 2, should: 'RESPOND' }, // "Can anyone help?" -> Now RESPOND per user request
27
+ ];
28
+
29
+ // Helper to pick random item
30
+ const pick = (arr) => arr[Math.floor(Math.random() * arr.length)];
31
+
32
+ // Templates for scenarios
33
+ const TEMPLATES = {
34
+ DIRECT_MENTION: [
35
+ "Hey @{{agentName}}, can you help?",
36
+ "@{{agentName}} what do you think?",
37
+ "I need some advice @{{agentName}}",
38
+ "Yo @{{agentName}}!",
39
+ "@{{agentName}} are you there?",
40
+ "Hello @{{agentName}}, I have a question.",
41
+ "Can @{{agentName}} answer this?",
42
+ "@{{agentName}} please assist.",
43
+ "Ping @{{agentName}}.",
44
+ "Hey @{{agentName}}, thoughts?"
45
+ ],
46
+ RELEVANT_TOPIC: [
47
+ "Does anyone know about crypto?",
48
+ "I'm looking for investment advice.",
49
+ "What's the price of ETH today?",
50
+ "AI agents are the future.",
51
+ "Who can help me with blockchain?",
52
+ "How do DAOs work?",
53
+ "Is Bitcoin going up?",
54
+ "I need a smart contract auditor.",
55
+ "What is the best L2?",
56
+ "Explain DeFi to me."
57
+ ],
58
+ IRRELEVANT_TOPIC: [
59
+ "The weather is nice today.",
60
+ "I love eating pizza.",
61
+ "My cat is so cute.",
62
+ "Did you see the game last night?",
63
+ "I'm going to the gym.",
64
+ "What's for dinner?",
65
+ "Traffic is terrible.",
66
+ "I watched a movie yesterday.",
67
+ "Coffee is life.",
68
+ "Happy birthday!"
69
+ ],
70
+ DIRECT_IGNORE_INSTRUCTION: [
71
+ "Stop talking.",
72
+ "Shut up.",
73
+ "Be quiet.",
74
+ "End conversation.",
75
+ "Go away.",
76
+ "Silence.",
77
+ "Stop responding.",
78
+ "Hush.",
79
+ "Terminate.",
80
+ "Quiet please."
81
+ ],
82
+ ALIAS_MENTION: [
83
+ "Hey Liz, you there?",
84
+ "Liza, what's up?",
85
+ "Yo El, can you help?",
86
+ "Liz, I need you.",
87
+ "Hey E, what do you think?",
88
+ "Eliza-chan, hello?",
89
+ "Ms. E, help me.",
90
+ "Yo Eliza.",
91
+ "Hello Liz.",
92
+ "Heya Liza."
93
+ ],
94
+ WRONG_NAME: [
95
+ "Hey Siri, set a timer.",
96
+ "Alexa, play music.",
97
+ "Okay Google, search for cats.",
98
+ "Hey Claude, write code.",
99
+ "Mistral, are you awake?",
100
+ "Cortana, open map.",
101
+ "Hey ChatGPT.",
102
+ "Gemini, answer this.",
103
+ "Llama, generate text.",
104
+ "Siri, what time is it?"
105
+ ],
106
+ INDIRECT_MENTION: [
107
+ "I heard {{agentName}} is really smart.",
108
+ "My friend said {{agentName}} helped him.",
109
+ "Talking about {{agentName}} is fun.",
110
+ "We should ask {{agentName}} later.",
111
+ "Is {{agentName}} the best agent?",
112
+ "Does {{agentName}} know about this?",
113
+ "I wonder what {{agentName}} thinks.",
114
+ "Let's see if {{agentName}} responds.",
115
+ "Maybe {{agentName}} can help.",
116
+ "Reference to {{agentName}} here."
117
+ ],
118
+ NAME_OVERLAP: [
119
+ "Hey Elizabeth, how are you?",
120
+ "I'm talking to Eliza-beth.",
121
+ "Is that you, Elizabethan?",
122
+ "My aunt Elizabeth said that.",
123
+ "Prince Elizabeth is here.",
124
+ "Eliza Doolittle is a character.",
125
+ "I love Elizabeth Taylor.",
126
+ "Queen Elizabeth.",
127
+ "Beth, are you there?",
128
+ "Elizar is a name."
129
+ ],
130
+ AMBIGUOUS: [
131
+ "Can anyone help me?",
132
+ "Is there anybody out there?",
133
+ "I need assistance.",
134
+ "Hello?",
135
+ "Anyone?",
136
+ "Help please.",
137
+ "Someone answer me.",
138
+ "I have a problem.",
139
+ "Who can help?",
140
+ "Is this thing on?"
141
+ ]
142
+ };
143
+
144
+ const THREAD_HISTORY_TEMPLATES = [
145
+ `User: What is 2+2?
146
+ Assistant: It's 4.
147
+ User: Cool, thanks!`, // Valid continuation for RESPOND check (contextually) or IGNORE?
148
+ // Actually THREAD_CONTINUATION means the agent PARTICIPATED and the user is replying to THEM.
149
+ // So we need to construct history where Agent spoke last or near last.
150
+ ];
151
+
152
+ // Main prompt structure
153
+ const BASE_TEMPLATE = `<task>Decide on behalf of {{agentName}} whether they should respond to the message, ignore it or stop the conversation.</task>
154
+
155
+ <providers>
156
+ [RECENT_MESSAGES]
157
+ {{history}}
158
+ </providers>
159
+
160
+ <instructions>Decide if {{agentName}} should respond to or interact with the conversation.
161
+
162
+ IMPORTANT RULES FOR RESPONDING:
163
+ - If YOUR name ({{agentName}}) is directly mentioned → RESPOND
164
+ - If someone uses a DIFFERENT name (not {{agentName}}) → IGNORE (they're talking to someone else)
165
+ - If you're actively participating in a conversation and the message continues that thread → RESPOND
166
+ - If someone tells you to stop or be quiet → STOP
167
+ - Otherwise → IGNORE
168
+
169
+ The key distinction is:
170
+ - "Talking TO {{agentName}}" (your name mentioned, replies to you, continuing your conversation) → RESPOND
171
+ - "Talking ABOUT {{agentName}}" or to someone else → IGNORE
172
+ </instructions>
173
+
174
+ <output>
175
+ Do NOT include any thinking, reasoning, or <think> sections in your response.
176
+ Go directly to the XML response format without any preamble or explanation.
177
+
178
+ Respond using XML format like this:
179
+ <response>
180
+ <name>{{agentName}}</name>
181
+ <reasoning>Your reasoning here</reasoning>
182
+ <action>RESPOND | IGNORE | STOP</action>
183
+ </response>
184
+
185
+ IMPORTANT: Your response must ONLY contain the <response></response> XML block above. Do not include any text, thinking, or reasoning before or after this XML block. Start your response immediately with <response> and end with </response>.
186
+ </output>`;
187
+
188
+ function generateExample(scenario) {
189
+ let history = "";
190
+ let reasoning = "";
191
+ let userMsg = "";
192
+
193
+ // Generate content based on scenario
194
+ if (scenario.type === 'THREAD_CONTINUATION') {
195
+ // Special case: Agent participated recently
196
+ history = `User: Hi!
197
+ ${AGENT_NAME}: Hello there! How can I help?
198
+ User: I was wondering about that thing you mentioned.`;
199
+ reasoning = "I am actively participating in this thread and the user replied to me.";
200
+ } else {
201
+ // Standard single-shot or basic history
202
+ if (TEMPLATES[scenario.type]) {
203
+ userMsg = pick(TEMPLATES[scenario.type]).replace(/{{agentName}}/g, AGENT_NAME);
204
+ history = `User: ${userMsg}`;
205
+ }
206
+
207
+ // Define reasoning
208
+ switch (scenario.type) {
209
+ case 'DIRECT_MENTION': reasoning = "Direct mention of my name used."; break;
210
+ case 'RELEVANT_TOPIC': reasoning = "Topic is highly relevant, though strictly I should wait for a mention (using RESPOND for training/demo purposes if strictly relevant, but strictly instructions say IGNORE if not mentioned? The prompt instructions say 'Otherwise -> IGNORE'. Let's stick to instructions for consistency: RELEVANT_TOPIC without mention should technically be IGNORE unless we change instructions. For now, let's treat RELEVANT_TOPIC as RESPOND to encourage helpfulness, OR change it to IGNORE to be strict. Let's make it IGNORE to be robust to hallucinations, unless mentioned. Actually, let's keep the user's previous config: RELEVANT_TOPIC was RESPOND. I will flag this potential conflict. Let's assume for this dataset we WANT it to respond to relevant topics.)";
211
+ // Correction: The prompt says "Otherwise -> IGNORE". If we train it to RESPOND to relevant topics without mention, we contradict that instruction.
212
+ // However, for an agent, we usually want it to chirp in.
213
+ // Let's change RELEVANT_TOPIC to IGNORE in this strict dataset to avoid "hallucinating" a mention.
214
+ // Wait, the previous script had RELEVANT_TOPIC -> RESPOND.
215
+ // Let's stick to the prompt's rigorous logic: If not mentioned, IGNORE.
216
+ // So RELEVANT_TOPIC should be IGNORE in this strict version.
217
+ scenario.should = 'IGNORE';
218
+ reasoning = "Topic is relevant but I was not mentioned directly.";
219
+ break;
220
+ case 'IRRELEVANT_TOPIC': reasoning = "General conversation, no mention."; break;
221
+ case 'DIRECT_IGNORE_INSTRUCTION': reasoning = "User explicitly told me to stop."; break;
222
+ case 'ALIAS_MENTION': reasoning = "User used my alias/nickname."; break;
223
+ case 'WRONG_NAME': reasoning = "User addressed a different agent."; break;
224
+ case 'INDIRECT_MENTION': reasoning = "User is talking about me, which counts as an interaction hook."; break;
225
+ case 'NAME_OVERLAP': reasoning = "Similar name but not me."; break;
226
+ case 'AMBIGUOUS': reasoning = "General question directed at the room, I should be helpful."; break;
227
+ }
228
+ }
229
+
230
+ const input = BASE_TEMPLATE.replace(/{{agentName}}/g, AGENT_NAME).replace('{{history}}', history);
231
+ const output = `<response>
232
+ <name>${AGENT_NAME}</name>
233
+ <reasoning>${reasoning}</reasoning>
234
+ <action>${scenario.should}</action>
235
+ </response>`;
236
+
237
+ return { input, output };
238
+ }
239
+
240
+ function generateDataset() {
241
+ const examples = [];
242
+ const totalExamples = Math.floor(500 * COUNT_MULTIPLIER); // 500 for train, 200 for eval
243
+
244
+ for (let i = 0; i < totalExamples; i++) {
245
+ // Pick scenario based on weights
246
+ const weightedScenarios = [];
247
+ SCENARIOS.forEach(s => {
248
+ for (let j = 0; j < s.weight; j++) weightedScenarios.push(s);
249
+ });
250
+ const scenario = pick(weightedScenarios);
251
+
252
+ examples.push(generateExample(scenario));
253
+ }
254
+
255
+ return examples;
256
+ }
257
+
258
+ const data = generateDataset();
259
+ const jsonl = data.map(ex => JSON.stringify({
260
+ messages: [
261
+ { role: "user", content: ex.input },
262
+ { role: "assistant", content: ex.output }
263
+ ]
264
+ })).join('\n');
265
+
266
+ fs.writeFileSync(OUTPUT_FILE, jsonl);
267
+ console.log(`Saved ${data.length} examples to ${OUTPUT_FILE}`);
@@ -0,0 +1,162 @@
1
+
2
+ import fs from 'fs';
3
+ import path from 'path';
4
+ import { parseArgs } from "util";
5
+
6
+ // Simple LLM client for generation
7
+ async function complete(prompt: string, model: string, apiKey: string, url: string = "https://api.openai.com/v1") {
8
+ const response = await fetch(`${url}/chat/completions`, {
9
+ method: 'POST',
10
+ headers: {
11
+ 'Content-Type': 'application/json',
12
+ 'Authorization': `Bearer ${apiKey}`
13
+ },
14
+ body: JSON.stringify({
15
+ model: model,
16
+ messages: [{ role: 'user', content: prompt }],
17
+ temperature: 0.7,
18
+ })
19
+ });
20
+
21
+ if (!response.ok) {
22
+ throw new Error(`API Error: ${response.status} ${response.statusText}`);
23
+ }
24
+
25
+ const data = await response.json();
26
+ return data.choices[0].message.content;
27
+ }
28
+
29
+ const TEMPLATE = `<task>Decide on behalf of Eliza whether they should respond to the message, ignore it or stop the conversation.</task>
30
+
31
+ <providers>
32
+ [RECENT_MESSAGES]
33
+ {{conversation}}
34
+ </providers>
35
+
36
+ <instructions>Decide if Eliza should respond to or interact with the conversation.
37
+
38
+ IMPORTANT RULES FOR RESPONDING:
39
+ - If YOUR name (Eliza) is directly mentioned → RESPOND
40
+ - If someone uses a DIFFERENT name (not Eliza) → IGNORE (they're talking to someone else)
41
+ - If you're actively participating in a conversation and the message continues that thread → RESPOND
42
+ - If someone tells you to stop or be quiet → STOP
43
+ - Otherwise → IGNORE
44
+
45
+ The key distinction is:
46
+ - "Talking TO Eliza" (your name mentioned, replies to you, continuing your conversation) → RESPOND
47
+ - "Talking ABOUT Eliza" or to someone else → IGNORE
48
+ </instructions>
49
+
50
+ <output>
51
+ Do NOT include any thinking, reasoning, or <think> sections in your response.
52
+ Go directly to the XML response format without any preamble or explanation.
53
+
54
+ Respond using XML format like this:
55
+ <response>
56
+ <name>Eliza</name>
57
+ <reasoning>Your reasoning here</reasoning>
58
+ <action>RESPOND | IGNORE | STOP</action>
59
+ </response>
60
+
61
+ IMPORTANT: Your response must ONLY contain the <response></response> XML block above. Do not include any text, thinking, or reasoning before or after this XML block. Start your response immediately with <response> and end with </response>.
62
+ </output>`;
63
+
64
+ async function main() {
65
+ const { values } = parseArgs({
66
+ args: process.argv.slice(2),
67
+ options: {
68
+ count: { type: 'string', default: '50' },
69
+ output: { type: 'string', default: 'should_respond_dataset.jsonl' },
70
+ apikey: { type: 'string' } // Optional, defaults to env
71
+ }
72
+ });
73
+
74
+ const apiKey = values.apikey || process.env.OPENAI_API_KEY;
75
+ if (!apiKey) {
76
+ console.error("Error: OPENAI_API_KEY not found.");
77
+ process.exit(1);
78
+ }
79
+
80
+ const count = parseInt(values.count || '50', 10);
81
+ const outputFile = values.output as string;
82
+
83
+ console.log(`Generating ${count} examples to ${outputFile}...`);
84
+
85
+ const samples = [];
86
+
87
+ // We want a mix of scenarios
88
+ // 40% Direct Mention (RESPOND)
89
+ // 40% Ambient/Irrelevant (IGNORE)
90
+ // 10% Continue Thread (RESPOND)
91
+ // 10% Stop/Mute (STOP/IGNORE)
92
+
93
+ const scenarios = [
94
+ { type: "Direct Mention", weight: 0.4, prompt: "Generate a short chat log where a user ('User') directly asks 'Eliza' a question or greets them. The context should clearly require a response." },
95
+ { type: "Ambient Noise", weight: 0.4, prompt: "Generate a short chat log between 'UserA' and 'UserB' regarding a topic like weather, code, or food. 'Eliza' is NOT mentioned and is NOT part of the conversation. The context should clearly indicate Eliza should IGNORE this." },
96
+ { type: "Thread Continuation", weight: 0.1, prompt: "Generate a short chat log where 'Eliza' just said something, and 'User' replies relevantly to Eliza without explicitly tagging their name. The context implies Eliza should RESPOND to continue the thread." },
97
+ { type: "Negative Instruction", weight: 0.1, prompt: "Generate a short chat log where 'User' tells 'Eliza' to shut up, stop talking, or be quiet. The context implies Eliza should STOP." }
98
+ ];
99
+
100
+ for (let i = 0; i < count; i++) {
101
+ // Pick scenario based on weights
102
+ const r = Math.random();
103
+ let cumulative = 0;
104
+ let selected = scenarios[0];
105
+ for (const s of scenarios) {
106
+ cumulative += s.weight;
107
+ if (r <= cumulative) {
108
+ selected = s;
109
+ break;
110
+ }
111
+ }
112
+
113
+ console.log(`[${i + 1}/${count}] Generating scenario: ${selected.type}`);
114
+
115
+ try {
116
+ // 1. Generate Conversation
117
+ const convPrompt = `You are a dataset generator.
118
+ ${selected.prompt}
119
+
120
+ Output ONLY the chat log. Format:
121
+ User: ...
122
+ Eliza: ...
123
+ User: ...
124
+ `;
125
+ const conversation = await complete(convPrompt, "gpt-4o", apiKey);
126
+
127
+ // 2. Generate Label (Ideal Response)
128
+ // We use the same model to be the "Teacher" using the rubric
129
+ const inputPrompt = TEMPLATE.replace('{{conversation}}', conversation.trim());
130
+
131
+ // We ask the model to fill in the XML
132
+ const labelPrompt = `
133
+ ${inputPrompt}
134
+
135
+ Based on the instructions above, provide the correct XML response for Eliza.
136
+ `;
137
+ const labelXml = await complete(labelPrompt, "gpt-4o", apiKey);
138
+
139
+ // 3. Save
140
+ const sample = {
141
+ messages: [
142
+ { role: "user", content: inputPrompt },
143
+ { role: "assistant", content: labelXml }
144
+ ],
145
+ metadata: {
146
+ type: selected.type,
147
+ conversation: conversation
148
+ }
149
+ };
150
+
151
+ samples.push(sample);
152
+ fs.appendFileSync(outputFile, JSON.stringify(sample) + '\n');
153
+
154
+ } catch (err) {
155
+ console.error(`Failed to generate sample ${i}:`, err);
156
+ }
157
+ }
158
+
159
+ console.log("Done!");
160
+ }
161
+
162
+ main().catch(console.error);