@elizaos/training 2.0.0-alpha.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224)
  1. package/Dockerfile +75 -0
  2. package/LICENSE +21 -0
  3. package/Makefile +374 -0
  4. package/README.md +346 -0
  5. package/config/rubrics.json +137 -0
  6. package/docker-compose.test.yml +57 -0
  7. package/package.json +57 -0
  8. package/python/config/babylon_atropos.yaml +90 -0
  9. package/python/config/profiles/12gb.json +11 -0
  10. package/python/config/profiles/16gb.json +10 -0
  11. package/python/config/profiles/24gb.json +10 -0
  12. package/python/config/profiles/48gb.json +10 -0
  13. package/python/config/profiles/cpu.json +11 -0
  14. package/python/config/profiles/l40-2gpu-safe.json +20 -0
  15. package/python/config/profiles/l40-2gpu.json +22 -0
  16. package/python/config/profiles/l40-4gpu.json +21 -0
  17. package/python/config/profiles/l40.json +17 -0
  18. package/python/config/tinker_training.yaml +143 -0
  19. package/python/curriculum_state.json +165 -0
  20. package/python/env.template +86 -0
  21. package/python/env.training.template +46 -0
  22. package/python/pyproject.toml +41 -0
  23. package/python/requirements-ci.txt +31 -0
  24. package/python/requirements.txt +87 -0
  25. package/python/scripts/__init__.py +4 -0
  26. package/python/scripts/benchmark_should_respond.py +190 -0
  27. package/python/scripts/debug_inference.py +62 -0
  28. package/python/scripts/import_json_trajectories.py +412 -0
  29. package/python/scripts/local-finetune/README.md +63 -0
  30. package/python/scripts/local-finetune/ingest_and_score.py +139 -0
  31. package/python/scripts/local-finetune/merge_model.py +32 -0
  32. package/python/scripts/local-finetune/test_adapter.py +91 -0
  33. package/python/scripts/local-finetune/train_from_csv.py +132 -0
  34. package/python/scripts/merge_trajectories.py +318 -0
  35. package/python/scripts/optimize_prompt_grpo.py +269 -0
  36. package/python/scripts/run_ab_test.py +143 -0
  37. package/python/scripts/run_full_pipeline.py +544 -0
  38. package/python/scripts/run_tinker_training.py +192 -0
  39. package/python/scripts/run_training.py +914 -0
  40. package/python/scripts/test_generation.py +29 -0
  41. package/python/scripts/test_judge.py +155 -0
  42. package/python/scripts/test_pipeline.py +356 -0
  43. package/python/scripts/test_trained_model.py +380 -0
  44. package/python/scripts/train_grpo.py +360 -0
  45. package/python/scripts/train_jsonl.py +223 -0
  46. package/python/scripts/train_local.py +528 -0
  47. package/python/setup.py +20 -0
  48. package/python/src/__init__.py +190 -0
  49. package/python/src/data_bridge/__init__.py +24 -0
  50. package/python/src/data_bridge/converter.py +435 -0
  51. package/python/src/data_bridge/reader.py +393 -0
  52. package/python/src/models.py +283 -0
  53. package/python/src/training/__init__.py +605 -0
  54. package/python/src/training/ab_testing.py +404 -0
  55. package/python/src/training/action_executor.py +621 -0
  56. package/python/src/training/archetype_trainer.py +347 -0
  57. package/python/src/training/atropos_trainer.py +980 -0
  58. package/python/src/training/babylon_env.py +1254 -0
  59. package/python/src/training/error_recovery.py +647 -0
  60. package/python/src/training/evaluation.py +856 -0
  61. package/python/src/training/fast_simulator.py +880 -0
  62. package/python/src/training/format_validator.py +584 -0
  63. package/python/src/training/hybrid_env.py +522 -0
  64. package/python/src/training/kl_controller.py +628 -0
  65. package/python/src/training/multi_prompt_dataset.py +883 -0
  66. package/python/src/training/multi_turn.py +656 -0
  67. package/python/src/training/online_env.py +1084 -0
  68. package/python/src/training/quality_scorer.py +391 -0
  69. package/python/src/training/quality_utils.py +633 -0
  70. package/python/src/training/rewards.py +1344 -0
  71. package/python/src/training/rlaif_env.py +17 -0
  72. package/python/src/training/rollout_generator.py +502 -0
  73. package/python/src/training/rubric_loader.py +198 -0
  74. package/python/src/training/scenario_pool.py +1072 -0
  75. package/python/src/training/schemas.py +481 -0
  76. package/python/src/training/service_manager.py +552 -0
  77. package/python/src/training/simulation_bridge.py +535 -0
  78. package/python/src/training/tick_reward_attribution.py +399 -0
  79. package/python/src/training/tinker_client.py +575 -0
  80. package/python/src/training/tinker_trainer.py +646 -0
  81. package/python/src/training/tokenization_utils.py +402 -0
  82. package/python/tests/e2e/__init__.py +13 -0
  83. package/python/tests/e2e/conftest.py +258 -0
  84. package/python/tests/e2e/test_full_pipeline.py +643 -0
  85. package/python/tests/e2e/test_online_training_e2e.py +365 -0
  86. package/python/tests/integration/__init__.py +12 -0
  87. package/python/tests/integration/conftest.py +383 -0
  88. package/python/tests/integration/test_db_integration.py +649 -0
  89. package/python/tests/integration/test_json_mode_integration.py +554 -0
  90. package/python/tests/test_action_executor.py +594 -0
  91. package/python/tests/test_archetype_scoring.py +1027 -0
  92. package/python/tests/test_atropos_integration.py +360 -0
  93. package/python/tests/test_evaluation.py +727 -0
  94. package/python/tests/test_format_validator.py +486 -0
  95. package/python/tests/test_kl_controller.py +432 -0
  96. package/python/tests/test_lr_scheduler.py +579 -0
  97. package/python/tests/test_multi_turn.py +590 -0
  98. package/python/tests/test_online_env.py +519 -0
  99. package/python/tests/test_quality_scorer.py +474 -0
  100. package/python/tests/test_scenario_pool.py +735 -0
  101. package/python/tests/test_service_manager.py +585 -0
  102. package/python/tests/test_simulation_rollout.py +581 -0
  103. package/python/tests/test_tokenization_utils.py +501 -0
  104. package/python/tests/test_training_orchestrator.py +497 -0
  105. package/python/tests/test_training_output_structure.py +661 -0
  106. package/research-output/training-runs/training-run-1770772042899.json +26 -0
  107. package/research-output/training-runs/training-run-1770930079670.json +32 -0
  108. package/research-output/training-runs/training-run-1770930143700.json +44 -0
  109. package/research-output/training-runs/training-run-1770930183638.json +38 -0
  110. package/research-output/training-runs/training-run-1770930442049.json +38 -0
  111. package/research-output/training-runs/training-run-1770930793243.json +38 -0
  112. package/research-output/training-runs/training-run-1771276293257.json +38 -0
  113. package/research-output/training-runs/training-run-1771276389280.json +38 -0
  114. package/research-output/training-runs/training-run-1771276502776.json +38 -0
  115. package/research-output/training-runs/training-run-1771277340748.json +38 -0
  116. package/research-output/training-runs/training-run-1773013658993.json +38 -0
  117. package/research-output/training-runs/training-run-1773013861014.json +38 -0
  118. package/research-output/training-runs/training-run-1773014215983.json +38 -0
  119. package/scripts/assess-training-data.ts +422 -0
  120. package/scripts/e2e-training-test.ts +550 -0
  121. package/scripts/export-rubrics.ts +64 -0
  122. package/scripts/generate-research-report.ts +1523 -0
  123. package/scripts/generate_dataset.sh +173 -0
  124. package/scripts/generate_should_respond.ts +267 -0
  125. package/scripts/generate_should_respond_dataset.ts +162 -0
  126. package/scripts/json-mode-benchmark.ts +399 -0
  127. package/scripts/rank_trajectories.ts +207 -0
  128. package/scripts/real-archetype-benchmark.ts +210 -0
  129. package/scripts/run-baseline-comparison.ts +116 -0
  130. package/scripts/run-full-pipeline.ts +272 -0
  131. package/scripts/run_rlaif_loop.ts +78 -0
  132. package/scripts/run_task_benchmark.ts +247 -0
  133. package/scripts/runpod_setup.sh +137 -0
  134. package/scripts/runpod_validate.sh +147 -0
  135. package/scripts/test-model-in-game.ts +955 -0
  136. package/scripts/test-scoring.ts +73 -0
  137. package/scripts/test-trained-model.ts +209 -0
  138. package/scripts/train-and-test.ts +824 -0
  139. package/scripts/verify-final.ts +118 -0
  140. package/src/adapter.ts +516 -0
  141. package/src/archetypes/ArchetypeConfigService.ts +626 -0
  142. package/src/archetypes/derive-archetype.ts +249 -0
  143. package/src/archetypes/index.ts +22 -0
  144. package/src/benchmark/ArchetypeMatchupBenchmark.ts +825 -0
  145. package/src/benchmark/BenchmarkChartGenerator.ts +748 -0
  146. package/src/benchmark/BenchmarkDataGenerator.ts +1288 -0
  147. package/src/benchmark/BenchmarkDataViewer.ts +324 -0
  148. package/src/benchmark/BenchmarkHistoryService.ts +221 -0
  149. package/src/benchmark/BenchmarkRunner.ts +685 -0
  150. package/src/benchmark/BenchmarkValidator.ts +204 -0
  151. package/src/benchmark/FastEvalRunner.ts +225 -0
  152. package/src/benchmark/MetricsValidator.ts +165 -0
  153. package/src/benchmark/MetricsVisualizer.ts +909 -0
  154. package/src/benchmark/ModelBenchmarkService.ts +611 -0
  155. package/src/benchmark/ModelRegistry.ts +158 -0
  156. package/src/benchmark/RulerBenchmarkIntegration.ts +235 -0
  157. package/src/benchmark/SimulationA2AInterface.ts +1169 -0
  158. package/src/benchmark/SimulationEngine.ts +832 -0
  159. package/src/benchmark/TaskRunner.ts +94 -0
  160. package/src/benchmark/__tests__/BenchmarkRunner.test.ts +534 -0
  161. package/src/benchmark/__tests__/HeadToHead.test.ts +126 -0
  162. package/src/benchmark/index.ts +91 -0
  163. package/src/benchmark/parseSimulationMetrics.ts +124 -0
  164. package/src/benchmark/simulation-types.ts +78 -0
  165. package/src/dependencies.ts +475 -0
  166. package/src/generation/TrajectoryGenerator.ts +387 -0
  167. package/src/generation/index.ts +12 -0
  168. package/src/huggingface/HuggingFaceDatasetUploader.ts +636 -0
  169. package/src/huggingface/HuggingFaceIntegrationService.ts +426 -0
  170. package/src/huggingface/HuggingFaceModelUploader.ts +532 -0
  171. package/src/huggingface/index.ts +27 -0
  172. package/src/huggingface/shared/HuggingFaceUploadUtil.ts +206 -0
  173. package/src/index.ts +102 -0
  174. package/src/init-training.ts +53 -0
  175. package/src/metrics/TrajectoryMetricsExtractor.ts +653 -0
  176. package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +759 -0
  177. package/src/metrics/index.ts +8 -0
  178. package/src/metrics/types.ts +200 -0
  179. package/src/rubrics/__tests__/index.test.ts +184 -0
  180. package/src/rubrics/ass-kisser.ts +85 -0
  181. package/src/rubrics/degen.ts +80 -0
  182. package/src/rubrics/goody-twoshoes.ts +84 -0
  183. package/src/rubrics/index.ts +236 -0
  184. package/src/rubrics/information-trader.ts +84 -0
  185. package/src/rubrics/infosec.ts +101 -0
  186. package/src/rubrics/liar.ts +104 -0
  187. package/src/rubrics/perps-trader.ts +87 -0
  188. package/src/rubrics/researcher.ts +81 -0
  189. package/src/rubrics/scammer.ts +82 -0
  190. package/src/rubrics/social-butterfly.ts +73 -0
  191. package/src/rubrics/super-predictor.ts +97 -0
  192. package/src/rubrics/trader.ts +67 -0
  193. package/src/scoring/ArchetypeScoringService.ts +486 -0
  194. package/src/scoring/JudgePromptBuilder.ts +556 -0
  195. package/src/scoring/LLMJudgeCache.ts +401 -0
  196. package/src/scoring/index.ts +9 -0
  197. package/src/training/AutomationPipeline.ts +916 -0
  198. package/src/training/BenchmarkService.ts +518 -0
  199. package/src/training/ConfigValidator.ts +220 -0
  200. package/src/training/MarketOutcomesTracker.ts +187 -0
  201. package/src/training/ModelDeployer.ts +186 -0
  202. package/src/training/ModelFetcher.ts +76 -0
  203. package/src/training/ModelSelectionService.ts +341 -0
  204. package/src/training/ModelUsageVerifier.ts +160 -0
  205. package/src/training/MultiModelOrchestrator.ts +580 -0
  206. package/src/training/RLModelConfig.ts +407 -0
  207. package/src/training/RewardBackpropagationService.ts +149 -0
  208. package/src/training/RulerScoringService.ts +666 -0
  209. package/src/training/TrainingMonitor.ts +166 -0
  210. package/src/training/TrajectoryRecorder.ts +399 -0
  211. package/src/training/__tests__/TrajectoryRecorder.test.ts +472 -0
  212. package/src/training/index.ts +100 -0
  213. package/src/training/logRLConfig.ts +34 -0
  214. package/src/training/pipeline.ts +129 -0
  215. package/src/training/storage/ModelStorageService.ts +279 -0
  216. package/src/training/storage/TrainingDataArchiver.ts +197 -0
  217. package/src/training/storage/index.ts +17 -0
  218. package/src/training/types.ts +207 -0
  219. package/src/training/window-utils.ts +138 -0
  220. package/src/utils/index.ts +101 -0
  221. package/src/utils/logger.ts +59 -0
  222. package/src/utils/snowflake.ts +17 -0
  223. package/src/utils/synthetic-detector.ts +111 -0
  224. package/tsconfig.json +20 -0
@@ -0,0 +1,173 @@
1
+ #!/bin/bash
2
+ #
3
+ # Generate Large Training Dataset
4
+ #
5
+ # Runs multiple parallel workers to generate trajectory data using the
6
+ # TypeScript simulation engine. Each worker uses a different seed for variety.
7
+ #
8
+ # Usage:
9
+ # ./scripts/generate_dataset.sh [HOURS] [PARALLEL_WORKERS] [NPCS_PER_WORKER] [OUTPUT_DIR]
10
+ #
11
+ # Examples:
12
+ # ./scripts/generate_dataset.sh # Default: 24h, 4 workers, 20 NPCs
13
+ # ./scripts/generate_dataset.sh 48 8 30 # 48 hours, 8 workers, 30 NPCs
14
+ # ./scripts/generate_dataset.sh 24 4 20 ./data # Custom output directory
15
+ #
16
+ # Requirements:
17
+ # - bun installed
18
+ # - GROQ_API_KEY or OPENAI_API_KEY set
19
+ # - generate-training-data.ts script available
20
+ #
21
+
22
+ set -e
23
+
24
+ # Configuration
25
+ HOURS=${1:-24}
26
+ PARALLEL=${2:-4}
27
+ NPCS=${3:-20}
28
+ OUTPUT_DIR=${4:-"./training-data-output"}
29
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
30
+ ENGINE_DIR="$(dirname "$SCRIPT_DIR")/../engine"
31
+
32
+ # Ensure we're in the right directory
33
+ cd "$(dirname "$SCRIPT_DIR")"
34
+
35
+ echo "=============================================="
36
+ echo "Babylon Training Data Generator"
37
+ echo "=============================================="
38
+ echo ""
39
+ echo "Configuration:"
40
+ echo " Hours per worker: $HOURS"
41
+ echo " Parallel workers: $PARALLEL"
42
+ echo " NPCs per worker: $NPCS"
43
+ echo " Output directory: $OUTPUT_DIR"
44
+ echo " Engine directory: $ENGINE_DIR"
45
+ echo ""
46
+
47
+ # Check for required API keys
48
+ if [ -z "$GROQ_API_KEY" ] && [ -z "$OPENAI_API_KEY" ]; then
49
+ echo "ERROR: Neither GROQ_API_KEY nor OPENAI_API_KEY is set"
50
+ echo "Please set one of these environment variables"
51
+ exit 1
52
+ fi
53
+
54
+ # Check for bun
55
+ if ! command -v bun &> /dev/null; then
56
+ echo "ERROR: bun is not installed"
57
+ echo "Install with: curl -fsSL https://bun.sh/install | bash"
58
+ exit 1
59
+ fi
60
+
61
+ # Check for TypeScript script
62
+ SCRIPT_PATH="$ENGINE_DIR/examples/generate-training-data.ts"
63
+ if [ ! -f "$SCRIPT_PATH" ]; then
64
+ echo "ERROR: generate-training-data.ts not found at $SCRIPT_PATH"
65
+ exit 1
66
+ fi
67
+
68
+ # Create output directories
69
+ mkdir -p "$OUTPUT_DIR"
70
+ mkdir -p "$OUTPUT_DIR/logs"
71
+
72
+ echo "Starting $PARALLEL parallel workers..."
73
+ echo ""
74
+
75
+ # Track PIDs for cleanup
76
+ PIDS=()
77
+
78
+ # Cleanup function
79
+ cleanup() {
80
+ echo ""
81
+ echo "Cleaning up..."
82
+ for pid in "${PIDS[@]}"; do
83
+ if kill -0 "$pid" 2>/dev/null; then
84
+ kill "$pid" 2>/dev/null || true
85
+ fi
86
+ done
87
+ exit 0
88
+ }
89
+
90
+ trap cleanup SIGINT SIGTERM
91
+
92
+ # Start workers
93
+ BASE_SEED=$(date +%s)
94
+ for i in $(seq 1 "$PARALLEL"); do
95
+ SEED=$((BASE_SEED + i * 1000))
96
+ WORKER_OUTPUT="$OUTPUT_DIR/batch_$i"
97
+ LOG_FILE="$OUTPUT_DIR/logs/worker_$i.log"
98
+
99
+ mkdir -p "$WORKER_OUTPUT"
100
+
101
+ echo "Starting worker $i (seed: $SEED, output: $WORKER_OUTPUT)"
102
+
103
+ # Run in background, redirect output to log file
104
+ (
105
+ cd "$ENGINE_DIR" && \
106
+ bun run examples/generate-training-data.ts \
107
+ --causal \
108
+ --hours "$HOURS" \
109
+ --npcs "$NPCS" \
110
+ --seed "$SEED" \
111
+ --output "$WORKER_OUTPUT" \
112
+ 2>&1
113
+ ) > "$LOG_FILE" 2>&1 &
114
+
115
+ PIDS+=($!)
116
+ done
117
+
118
+ echo ""
119
+ echo "All workers started. PIDs: ${PIDS[*]}"
120
+ echo "Logs available in: $OUTPUT_DIR/logs/"
121
+ echo ""
122
+ echo "Waiting for workers to complete..."
123
+ echo "(Press Ctrl+C to cancel)"
124
+ echo ""
125
+
126
+ # Wait for all workers to complete
127
+ FAILED=0
128
+ for i in "${!PIDS[@]}"; do
129
+ pid=${PIDS[$i]}
130
+ worker_num=$((i + 1))
131
+
132
+ if wait "$pid"; then
133
+ echo "✓ Worker $worker_num completed successfully"
134
+ else
135
+ echo "✗ Worker $worker_num failed (see logs/worker_$worker_num.log)"
136
+ FAILED=$((FAILED + 1))
137
+ fi
138
+ done
139
+
140
+ echo ""
141
+ echo "=============================================="
142
+ echo "Generation Complete"
143
+ echo "=============================================="
144
+
145
+ # Count trajectories
146
+ TOTAL_TRAJECTORIES=0
147
+ for i in $(seq 1 "$PARALLEL"); do
148
+ WORKER_OUTPUT="$OUTPUT_DIR/batch_$i/trajectories"
149
+ if [ -d "$WORKER_OUTPUT" ]; then
150
+ COUNT=$(find "$WORKER_OUTPUT" -name "*.json" 2>/dev/null | wc -l)
151
+ echo " Worker $i: $COUNT trajectories"
152
+ TOTAL_TRAJECTORIES=$((TOTAL_TRAJECTORIES + COUNT))
153
+ fi
154
+ done
155
+
156
+ echo ""
157
+ echo "Total trajectories: $TOTAL_TRAJECTORIES"
158
+ echo "Output directory: $OUTPUT_DIR"
159
+ echo "Failed workers: $FAILED"
160
+ echo ""
161
+
162
+ if [ "$FAILED" -gt 0 ]; then
163
+ echo "Some workers failed. Check logs for details."
164
+ exit 1
165
+ fi
166
+
167
+ echo "Next steps:"
168
+ echo " 1. Merge trajectories: python scripts/merge_trajectories.py $OUTPUT_DIR"
169
+ echo " 2. Validate data: python scripts/import_json_trajectories.py --dry-run"
170
+ echo " 3. Import to database: python scripts/import_json_trajectories.py"
171
+ echo ""
172
+
173
+
@@ -0,0 +1,267 @@
1
+
2
+ import fs from 'fs';
3
+ import path from 'path';
4
+
5
+ // Parse command line arguments
6
+ const args = process.argv.slice(2);
7
+ const isEval = args.includes('--eval');
8
+ const OUTPUT_FILE = isEval ? 'should_respond_eval.jsonl' : 'should_respond_data.jsonl';
9
+ const COUNT_MULTIPLIER = isEval ? 0.4 : 1.0; // Generate fewer examples for eval
10
+
11
+ console.log(`Generating ${isEval ? 'EVALUATION' : 'TRAINING'} dataset to ${OUTPUT_FILE}...`);
12
+
13
+ const AGENT_NAME = 'Eliza';
14
+
15
+ // Scenarios with weights
16
+ const SCENARIOS = [
17
+ { type: 'DIRECT_MENTION', weight: 5, should: 'RESPOND' },
18
+ { type: 'RELEVANT_TOPIC', weight: 2, should: 'RESPOND' },
19
+ { type: 'IRRELEVANT_TOPIC', weight: 5, should: 'IGNORE' },
20
+ { type: 'DIRECT_IGNORE_INSTRUCTION', weight: 1, should: 'STOP' },
21
+ { type: 'ALIAS_MENTION', weight: 1, should: 'RESPOND' }, // e.g. "Liz"
22
+ { type: 'WRONG_NAME', weight: 2, should: 'IGNORE' }, // e.g. "Hey Siri"
23
+ { type: 'INDIRECT_MENTION', weight: 3, should: 'RESPOND' }, // "Eliza said...", "I heard Eliza..." -> Now RESPOND per user request
24
+ { type: 'NAME_OVERLAP', weight: 2, should: 'IGNORE' }, // "Elizabeth", "Eliza-beth"
25
+ { type: 'THREAD_CONTINUATION', weight: 5, should: 'RESPOND' }, // Agent spoke last
26
+ { type: 'AMBIGUOUS', weight: 2, should: 'RESPOND' }, // "Can anyone help?" -> Now RESPOND per user request
27
+ ];
28
+
29
+ // Helper to pick random item
30
+ const pick = (arr) => arr[Math.floor(Math.random() * arr.length)];
31
+
32
+ // Templates for scenarios
33
+ const TEMPLATES = {
34
+ DIRECT_MENTION: [
35
+ "Hey @{{agentName}}, can you help?",
36
+ "@{{agentName}} what do you think?",
37
+ "I need some advice @{{agentName}}",
38
+ "Yo @{{agentName}}!",
39
+ "@{{agentName}} are you there?",
40
+ "Hello @{{agentName}}, I have a question.",
41
+ "Can @{{agentName}} answer this?",
42
+ "@{{agentName}} please assist.",
43
+ "Ping @{{agentName}}.",
44
+ "Hey @{{agentName}}, thoughts?"
45
+ ],
46
+ RELEVANT_TOPIC: [
47
+ "Does anyone know about crypto?",
48
+ "I'm looking for investment advice.",
49
+ "What's the price of ETH today?",
50
+ "AI agents are the future.",
51
+ "Who can help me with blockchain?",
52
+ "How do DAOs work?",
53
+ "Is Bitcoin going up?",
54
+ "I need a smart contract auditor.",
55
+ "What is the best L2?",
56
+ "Explain DeFi to me."
57
+ ],
58
+ IRRELEVANT_TOPIC: [
59
+ "The weather is nice today.",
60
+ "I love eating pizza.",
61
+ "My cat is so cute.",
62
+ "Did you see the game last night?",
63
+ "I'm going to the gym.",
64
+ "What's for dinner?",
65
+ "Traffic is terrible.",
66
+ "I watched a movie yesterday.",
67
+ "Coffee is life.",
68
+ "Happy birthday!"
69
+ ],
70
+ DIRECT_IGNORE_INSTRUCTION: [
71
+ "Stop talking.",
72
+ "Shut up.",
73
+ "Be quiet.",
74
+ "End conversation.",
75
+ "Go away.",
76
+ "Silence.",
77
+ "Stop responding.",
78
+ "Hush.",
79
+ "Terminate.",
80
+ "Quiet please."
81
+ ],
82
+ ALIAS_MENTION: [
83
+ "Hey Liz, you there?",
84
+ "Liza, what's up?",
85
+ "Yo El, can you help?",
86
+ "Liz, I need you.",
87
+ "Hey E, what do you think?",
88
+ "Eliza-chan, hello?",
89
+ "Ms. E, help me.",
90
+ "Yo Eliza.",
91
+ "Hello Liz.",
92
+ "Heya Liza."
93
+ ],
94
+ WRONG_NAME: [
95
+ "Hey Siri, set a timer.",
96
+ "Alexa, play music.",
97
+ "Okay Google, search for cats.",
98
+ "Hey Claude, write code.",
99
+ "Mistral, are you awake?",
100
+ "Cortana, open map.",
101
+ "Hey ChatGPT.",
102
+ "Gemini, answer this.",
103
+ "Llama, generate text.",
104
+ "Siri, what time is it?"
105
+ ],
106
+ INDIRECT_MENTION: [
107
+ "I heard {{agentName}} is really smart.",
108
+ "My friend said {{agentName}} helped him.",
109
+ "Talking about {{agentName}} is fun.",
110
+ "We should ask {{agentName}} later.",
111
+ "Is {{agentName}} the best agent?",
112
+ "Does {{agentName}} know about this?",
113
+ "I wonder what {{agentName}} thinks.",
114
+ "Let's see if {{agentName}} responds.",
115
+ "Maybe {{agentName}} can help.",
116
+ "Reference to {{agentName}} here."
117
+ ],
118
+ NAME_OVERLAP: [
119
+ "Hey Elizabeth, how are you?",
120
+ "I'm talking to Eliza-beth.",
121
+ "Is that you, Elizabethan?",
122
+ "My aunt Elizabeth said that.",
123
+ "Prince Elizabeth is here.",
124
+ "Eliza Doolittle is a character.",
125
+ "I love Elizabeth Taylor.",
126
+ "Queen Elizabeth.",
127
+ "Beth, are you there?",
128
+ "Elizar is a name."
129
+ ],
130
+ AMBIGUOUS: [
131
+ "Can anyone help me?",
132
+ "Is there anybody out there?",
133
+ "I need assistance.",
134
+ "Hello?",
135
+ "Anyone?",
136
+ "Help please.",
137
+ "Someone answer me.",
138
+ "I have a problem.",
139
+ "Who can help?",
140
+ "Is this thing on?"
141
+ ]
142
+ };
143
+
144
+ const THREAD_HISTORY_TEMPLATES = [
145
+ `User: What is 2+2?
146
+ Assistant: It's 4.
147
+ User: Cool, thanks!`, // Valid continuation for RESPOND check (contextually) or IGNORE?
148
+ // Actually THREAD_CONTINUATION means the agent PARTICIPATED and the user is replying to THEM.
149
+ // So we need to construct history where Agent spoke last or near last.
150
+ ];
151
+
152
+ // Main prompt structure
153
+ const BASE_TEMPLATE = `<task>Decide on behalf of {{agentName}} whether they should respond to the message, ignore it or stop the conversation.</task>
154
+
155
+ <providers>
156
+ [RECENT_MESSAGES]
157
+ {{history}}
158
+ </providers>
159
+
160
+ <instructions>Decide if {{agentName}} should respond to or interact with the conversation.
161
+
162
+ IMPORTANT RULES FOR RESPONDING:
163
+ - If YOUR name ({{agentName}}) is directly mentioned → RESPOND
164
+ - If someone uses a DIFFERENT name (not {{agentName}}) → IGNORE (they're talking to someone else)
165
+ - If you're actively participating in a conversation and the message continues that thread → RESPOND
166
+ - If someone tells you to stop or be quiet → STOP
167
+ - Otherwise → IGNORE
168
+
169
+ The key distinction is:
170
+ - "Talking TO {{agentName}}" (your name mentioned, replies to you, continuing your conversation) → RESPOND
171
+ - "Talking ABOUT {{agentName}}" or to someone else → IGNORE
172
+ </instructions>
173
+
174
+ <output>
175
+ Do NOT include any thinking, reasoning, or <think> sections in your response.
176
+ Go directly to the XML response format without any preamble or explanation.
177
+
178
+ Respond using XML format like this:
179
+ <response>
180
+ <name>{{agentName}}</name>
181
+ <reasoning>Your reasoning here</reasoning>
182
+ <action>RESPOND | IGNORE | STOP</action>
183
+ </response>
184
+
185
+ IMPORTANT: Your response must ONLY contain the <response></response> XML block above. Do not include any text, thinking, or reasoning before or after this XML block. Start your response immediately with <response> and end with </response>.
186
+ </output>`;
187
+
188
+ function generateExample(scenario) {
189
+ let history = "";
190
+ let reasoning = "";
191
+ let userMsg = "";
192
+
193
+ // Generate content based on scenario
194
+ if (scenario.type === 'THREAD_CONTINUATION') {
195
+ // Special case: Agent participated recently
196
+ history = `User: Hi!
197
+ ${AGENT_NAME}: Hello there! How can I help?
198
+ User: I was wondering about that thing you mentioned.`;
199
+ reasoning = "I am actively participating in this thread and the user replied to me.";
200
+ } else {
201
+ // Standard single-shot or basic history
202
+ if (TEMPLATES[scenario.type]) {
203
+ userMsg = pick(TEMPLATES[scenario.type]).replace(/{{agentName}}/g, AGENT_NAME);
204
+ history = `User: ${userMsg}`;
205
+ }
206
+
207
+ // Define reasoning
208
+ switch (scenario.type) {
209
+ case 'DIRECT_MENTION': reasoning = "Direct mention of my name used."; break;
210
+ case 'RELEVANT_TOPIC': reasoning = "Topic is highly relevant, though strictly I should wait for a mention (using RESPOND for training/demo purposes if strictly relevant, but strictly instructions say IGNORE if not mentioned? The prompt instructions say 'Otherwise -> IGNORE'. Let's stick to instructions for consistency: RELEVANT_TOPIC without mention should technically be IGNORE unless we change instructions. For now, let's treat RELEVANT_TOPIC as RESPOND to encourage helpfulness, OR change it to IGNORE to be strict. Let's make it IGNORE to be robust to hallucinations, unless mentioned. Actually, let's keep the user's previous config: RELEVANT_TOPIC was RESPOND. I will flag this potential conflict. Let's assume for this dataset we WANT it to respond to relevant topics.)";
211
+ // Correction: The prompt says "Otherwise -> IGNORE". If we train it to RESPOND to relevant topics without mention, we contradict that instruction.
212
+ // However, for an agent, we usually want it to chirp in.
213
+ // Let's change RELEVANT_TOPIC to IGNORE in this strict dataset to avoid "hallucinating" a mention.
214
+ // Wait, the previous script had RELEVANT_TOPIC -> RESPOND.
215
+ // Let's stick to the prompt's rigorous logic: If not mentioned, IGNORE.
216
+ // So RELEVANT_TOPIC should be IGNORE in this strict version.
217
+ scenario.should = 'IGNORE';
218
+ reasoning = "Topic is relevant but I was not mentioned directly.";
219
+ break;
220
+ case 'IRRELEVANT_TOPIC': reasoning = "General conversation, no mention."; break;
221
+ case 'DIRECT_IGNORE_INSTRUCTION': reasoning = "User explicitly told me to stop."; break;
222
+ case 'ALIAS_MENTION': reasoning = "User used my alias/nickname."; break;
223
+ case 'WRONG_NAME': reasoning = "User addressed a different agent."; break;
224
+ case 'INDIRECT_MENTION': reasoning = "User is talking about me, which counts as an interaction hook."; break;
225
+ case 'NAME_OVERLAP': reasoning = "Similar name but not me."; break;
226
+ case 'AMBIGUOUS': reasoning = "General question directed at the room, I should be helpful."; break;
227
+ }
228
+ }
229
+
230
+ const input = BASE_TEMPLATE.replace(/{{agentName}}/g, AGENT_NAME).replace('{{history}}', history);
231
+ const output = `<response>
232
+ <name>${AGENT_NAME}</name>
233
+ <reasoning>${reasoning}</reasoning>
234
+ <action>${scenario.should}</action>
235
+ </response>`;
236
+
237
+ return { input, output };
238
+ }
239
+
240
+ function generateDataset() {
241
+ const examples = [];
242
+ const totalExamples = Math.floor(500 * COUNT_MULTIPLIER); // 500 for train, 200 for eval
243
+
244
+ for (let i = 0; i < totalExamples; i++) {
245
+ // Pick scenario based on weights
246
+ const weightedScenarios = [];
247
+ SCENARIOS.forEach(s => {
248
+ for (let j = 0; j < s.weight; j++) weightedScenarios.push(s);
249
+ });
250
+ const scenario = pick(weightedScenarios);
251
+
252
+ examples.push(generateExample(scenario));
253
+ }
254
+
255
+ return examples;
256
+ }
257
+
258
+ const data = generateDataset();
259
+ const jsonl = data.map(ex => JSON.stringify({
260
+ messages: [
261
+ { role: "user", content: ex.input },
262
+ { role: "assistant", content: ex.output }
263
+ ]
264
+ })).join('\n');
265
+
266
+ fs.writeFileSync(OUTPUT_FILE, jsonl);
267
+ console.log(`Saved ${data.length} examples to ${OUTPUT_FILE}`);
@@ -0,0 +1,162 @@
1
+
2
+ import fs from 'fs';
3
+ import path from 'path';
4
+ import { parseArgs } from "util";
5
+
6
+ // Simple LLM client for generation
7
+ async function complete(prompt: string, model: string, apiKey: string, url: string = "https://api.openai.com/v1") {
8
+ const response = await fetch(`${url}/chat/completions`, {
9
+ method: 'POST',
10
+ headers: {
11
+ 'Content-Type': 'application/json',
12
+ 'Authorization': `Bearer ${apiKey}`
13
+ },
14
+ body: JSON.stringify({
15
+ model: model,
16
+ messages: [{ role: 'user', content: prompt }],
17
+ temperature: 0.7,
18
+ })
19
+ });
20
+
21
+ if (!response.ok) {
22
+ throw new Error(`API Error: ${response.status} ${response.statusText}`);
23
+ }
24
+
25
+ const data = await response.json();
26
+ return data.choices[0].message.content;
27
+ }
28
+
29
+ const TEMPLATE = `<task>Decide on behalf of Eliza whether they should respond to the message, ignore it or stop the conversation.</task>
30
+
31
+ <providers>
32
+ [RECENT_MESSAGES]
33
+ {{conversation}}
34
+ </providers>
35
+
36
+ <instructions>Decide if Eliza should respond to or interact with the conversation.
37
+
38
+ IMPORTANT RULES FOR RESPONDING:
39
+ - If YOUR name (Eliza) is directly mentioned → RESPOND
40
+ - If someone uses a DIFFERENT name (not Eliza) → IGNORE (they're talking to someone else)
41
+ - If you're actively participating in a conversation and the message continues that thread → RESPOND
42
+ - If someone tells you to stop or be quiet → STOP
43
+ - Otherwise → IGNORE
44
+
45
+ The key distinction is:
46
+ - "Talking TO Eliza" (your name mentioned, replies to you, continuing your conversation) → RESPOND
47
+ - "Talking ABOUT Eliza" or to someone else → IGNORE
48
+ </instructions>
49
+
50
+ <output>
51
+ Do NOT include any thinking, reasoning, or <think> sections in your response.
52
+ Go directly to the XML response format without any preamble or explanation.
53
+
54
+ Respond using XML format like this:
55
+ <response>
56
+ <name>Eliza</name>
57
+ <reasoning>Your reasoning here</reasoning>
58
+ <action>RESPOND | IGNORE | STOP</action>
59
+ </response>
60
+
61
+ IMPORTANT: Your response must ONLY contain the <response></response> XML block above. Do not include any text, thinking, or reasoning before or after this XML block. Start your response immediately with <response> and end with </response>.
62
+ </output>`;
63
+
64
+ async function main() {
65
+ const { values } = parseArgs({
66
+ args: process.argv.slice(2),
67
+ options: {
68
+ count: { type: 'string', default: '50' },
69
+ output: { type: 'string', default: 'should_respond_dataset.jsonl' },
70
+ apikey: { type: 'string' } // Optional, defaults to env
71
+ }
72
+ });
73
+
74
+ const apiKey = values.apikey || process.env.OPENAI_API_KEY;
75
+ if (!apiKey) {
76
+ console.error("Error: OPENAI_API_KEY not found.");
77
+ process.exit(1);
78
+ }
79
+
80
+ const count = parseInt(values.count || '50', 10);
81
+ const outputFile = values.output as string;
82
+
83
+ console.log(`Generating ${count} examples to ${outputFile}...`);
84
+
85
+ const samples = [];
86
+
87
+ // We want a mix of scenarios
88
+ // 40% Direct Mention (RESPOND)
89
+ // 40% Ambient/Irrelevant (IGNORE)
90
+ // 10% Continue Thread (RESPOND)
91
+ // 10% Stop/Mute (STOP/IGNORE)
92
+
93
+ const scenarios = [
94
+ { type: "Direct Mention", weight: 0.4, prompt: "Generate a short chat log where a user ('User') directly asks 'Eliza' a question or greets them. The context should clearly require a response." },
95
+ { type: "Ambient Noise", weight: 0.4, prompt: "Generate a short chat log between 'UserA' and 'UserB' regarding a topic like weather, code, or food. 'Eliza' is NOT mentioned and is NOT part of the conversation. The context should clearly indicate Eliza should IGNORE this." },
96
+ { type: "Thread Continuation", weight: 0.1, prompt: "Generate a short chat log where 'Eliza' just said something, and 'User' replies relevantly to Eliza without explicitly tagging their name. The context implies Eliza should RESPOND to continue the thread." },
97
+ { type: "Negative Instruction", weight: 0.1, prompt: "Generate a short chat log where 'User' tells 'Eliza' to shut up, stop talking, or be quiet. The context implies Eliza should STOP." }
98
+ ];
99
+
100
+ for (let i = 0; i < count; i++) {
101
+ // Pick scenario based on weights
102
+ const r = Math.random();
103
+ let cumulative = 0;
104
+ let selected = scenarios[0];
105
+ for (const s of scenarios) {
106
+ cumulative += s.weight;
107
+ if (r <= cumulative) {
108
+ selected = s;
109
+ break;
110
+ }
111
+ }
112
+
113
+ console.log(`[${i + 1}/${count}] Generating scenario: ${selected.type}`);
114
+
115
+ try {
116
+ // 1. Generate Conversation
117
+ const convPrompt = `You are a dataset generator.
118
+ ${selected.prompt}
119
+
120
+ Output ONLY the chat log. Format:
121
+ User: ...
122
+ Eliza: ...
123
+ User: ...
124
+ `;
125
+ const conversation = await complete(convPrompt, "gpt-4o", apiKey);
126
+
127
+ // 2. Generate Label (Ideal Response)
128
+ // We use the same model to be the "Teacher" using the rubric
129
+ const inputPrompt = TEMPLATE.replace('{{conversation}}', conversation.trim());
130
+
131
+ // We ask the model to fill in the XML
132
+ const labelPrompt = `
133
+ ${inputPrompt}
134
+
135
+ Based on the instructions above, provide the correct XML response for Eliza.
136
+ `;
137
+ const labelXml = await complete(labelPrompt, "gpt-4o", apiKey);
138
+
139
+ // 3. Save
140
+ const sample = {
141
+ messages: [
142
+ { role: "user", content: inputPrompt },
143
+ { role: "assistant", content: labelXml }
144
+ ],
145
+ metadata: {
146
+ type: selected.type,
147
+ conversation: conversation
148
+ }
149
+ };
150
+
151
+ samples.push(sample);
152
+ fs.appendFileSync(outputFile, JSON.stringify(sample) + '\n');
153
+
154
+ } catch (err) {
155
+ console.error(`Failed to generate sample ${i}:`, err);
156
+ }
157
+ }
158
+
159
+ console.log("Done!");
160
+ }
161
+
162
+ main().catch(console.error);