@elizaos/training 2.0.0-alpha.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Dockerfile +75 -0
- package/LICENSE +21 -0
- package/Makefile +374 -0
- package/README.md +346 -0
- package/config/rubrics.json +137 -0
- package/docker-compose.test.yml +57 -0
- package/package.json +57 -0
- package/python/config/babylon_atropos.yaml +90 -0
- package/python/config/profiles/12gb.json +11 -0
- package/python/config/profiles/16gb.json +10 -0
- package/python/config/profiles/24gb.json +10 -0
- package/python/config/profiles/48gb.json +10 -0
- package/python/config/profiles/cpu.json +11 -0
- package/python/config/profiles/l40-2gpu-safe.json +20 -0
- package/python/config/profiles/l40-2gpu.json +22 -0
- package/python/config/profiles/l40-4gpu.json +21 -0
- package/python/config/profiles/l40.json +17 -0
- package/python/config/tinker_training.yaml +143 -0
- package/python/curriculum_state.json +165 -0
- package/python/env.template +86 -0
- package/python/env.training.template +46 -0
- package/python/pyproject.toml +41 -0
- package/python/requirements-ci.txt +31 -0
- package/python/requirements.txt +87 -0
- package/python/scripts/__init__.py +4 -0
- package/python/scripts/benchmark_should_respond.py +190 -0
- package/python/scripts/debug_inference.py +62 -0
- package/python/scripts/import_json_trajectories.py +412 -0
- package/python/scripts/local-finetune/README.md +63 -0
- package/python/scripts/local-finetune/ingest_and_score.py +139 -0
- package/python/scripts/local-finetune/merge_model.py +32 -0
- package/python/scripts/local-finetune/test_adapter.py +91 -0
- package/python/scripts/local-finetune/train_from_csv.py +132 -0
- package/python/scripts/merge_trajectories.py +318 -0
- package/python/scripts/optimize_prompt_grpo.py +269 -0
- package/python/scripts/run_ab_test.py +143 -0
- package/python/scripts/run_full_pipeline.py +544 -0
- package/python/scripts/run_tinker_training.py +192 -0
- package/python/scripts/run_training.py +914 -0
- package/python/scripts/test_generation.py +29 -0
- package/python/scripts/test_judge.py +155 -0
- package/python/scripts/test_pipeline.py +356 -0
- package/python/scripts/test_trained_model.py +380 -0
- package/python/scripts/train_grpo.py +360 -0
- package/python/scripts/train_jsonl.py +223 -0
- package/python/scripts/train_local.py +528 -0
- package/python/setup.py +20 -0
- package/python/src/__init__.py +190 -0
- package/python/src/data_bridge/__init__.py +24 -0
- package/python/src/data_bridge/converter.py +435 -0
- package/python/src/data_bridge/reader.py +393 -0
- package/python/src/models.py +283 -0
- package/python/src/training/__init__.py +605 -0
- package/python/src/training/ab_testing.py +404 -0
- package/python/src/training/action_executor.py +621 -0
- package/python/src/training/archetype_trainer.py +347 -0
- package/python/src/training/atropos_trainer.py +980 -0
- package/python/src/training/babylon_env.py +1254 -0
- package/python/src/training/error_recovery.py +647 -0
- package/python/src/training/evaluation.py +856 -0
- package/python/src/training/fast_simulator.py +880 -0
- package/python/src/training/format_validator.py +584 -0
- package/python/src/training/hybrid_env.py +522 -0
- package/python/src/training/kl_controller.py +628 -0
- package/python/src/training/multi_prompt_dataset.py +883 -0
- package/python/src/training/multi_turn.py +656 -0
- package/python/src/training/online_env.py +1084 -0
- package/python/src/training/quality_scorer.py +391 -0
- package/python/src/training/quality_utils.py +633 -0
- package/python/src/training/rewards.py +1344 -0
- package/python/src/training/rlaif_env.py +17 -0
- package/python/src/training/rollout_generator.py +502 -0
- package/python/src/training/rubric_loader.py +198 -0
- package/python/src/training/scenario_pool.py +1072 -0
- package/python/src/training/schemas.py +481 -0
- package/python/src/training/service_manager.py +552 -0
- package/python/src/training/simulation_bridge.py +535 -0
- package/python/src/training/tick_reward_attribution.py +399 -0
- package/python/src/training/tinker_client.py +575 -0
- package/python/src/training/tinker_trainer.py +646 -0
- package/python/src/training/tokenization_utils.py +402 -0
- package/python/tests/e2e/__init__.py +13 -0
- package/python/tests/e2e/conftest.py +258 -0
- package/python/tests/e2e/test_full_pipeline.py +643 -0
- package/python/tests/e2e/test_online_training_e2e.py +365 -0
- package/python/tests/integration/__init__.py +12 -0
- package/python/tests/integration/conftest.py +383 -0
- package/python/tests/integration/test_db_integration.py +649 -0
- package/python/tests/integration/test_json_mode_integration.py +554 -0
- package/python/tests/test_action_executor.py +594 -0
- package/python/tests/test_archetype_scoring.py +1027 -0
- package/python/tests/test_atropos_integration.py +360 -0
- package/python/tests/test_evaluation.py +727 -0
- package/python/tests/test_format_validator.py +486 -0
- package/python/tests/test_kl_controller.py +432 -0
- package/python/tests/test_lr_scheduler.py +579 -0
- package/python/tests/test_multi_turn.py +590 -0
- package/python/tests/test_online_env.py +519 -0
- package/python/tests/test_quality_scorer.py +474 -0
- package/python/tests/test_scenario_pool.py +735 -0
- package/python/tests/test_service_manager.py +585 -0
- package/python/tests/test_simulation_rollout.py +581 -0
- package/python/tests/test_tokenization_utils.py +501 -0
- package/python/tests/test_training_orchestrator.py +497 -0
- package/python/tests/test_training_output_structure.py +661 -0
- package/research-output/training-runs/training-run-1770772042899.json +26 -0
- package/research-output/training-runs/training-run-1770930079670.json +32 -0
- package/research-output/training-runs/training-run-1770930143700.json +44 -0
- package/research-output/training-runs/training-run-1770930183638.json +38 -0
- package/research-output/training-runs/training-run-1770930442049.json +38 -0
- package/research-output/training-runs/training-run-1770930793243.json +38 -0
- package/research-output/training-runs/training-run-1771276293257.json +38 -0
- package/research-output/training-runs/training-run-1771276389280.json +38 -0
- package/research-output/training-runs/training-run-1771276502776.json +38 -0
- package/research-output/training-runs/training-run-1771277340748.json +38 -0
- package/research-output/training-runs/training-run-1773013658993.json +38 -0
- package/research-output/training-runs/training-run-1773013861014.json +38 -0
- package/research-output/training-runs/training-run-1773014215983.json +38 -0
- package/scripts/assess-training-data.ts +422 -0
- package/scripts/e2e-training-test.ts +550 -0
- package/scripts/export-rubrics.ts +64 -0
- package/scripts/generate-research-report.ts +1523 -0
- package/scripts/generate_dataset.sh +173 -0
- package/scripts/generate_should_respond.ts +267 -0
- package/scripts/generate_should_respond_dataset.ts +162 -0
- package/scripts/json-mode-benchmark.ts +399 -0
- package/scripts/rank_trajectories.ts +207 -0
- package/scripts/real-archetype-benchmark.ts +210 -0
- package/scripts/run-baseline-comparison.ts +116 -0
- package/scripts/run-full-pipeline.ts +272 -0
- package/scripts/run_rlaif_loop.ts +78 -0
- package/scripts/run_task_benchmark.ts +247 -0
- package/scripts/runpod_setup.sh +137 -0
- package/scripts/runpod_validate.sh +147 -0
- package/scripts/test-model-in-game.ts +955 -0
- package/scripts/test-scoring.ts +73 -0
- package/scripts/test-trained-model.ts +209 -0
- package/scripts/train-and-test.ts +824 -0
- package/scripts/verify-final.ts +118 -0
- package/src/adapter.ts +516 -0
- package/src/archetypes/ArchetypeConfigService.ts +626 -0
- package/src/archetypes/derive-archetype.ts +249 -0
- package/src/archetypes/index.ts +22 -0
- package/src/benchmark/ArchetypeMatchupBenchmark.ts +825 -0
- package/src/benchmark/BenchmarkChartGenerator.ts +748 -0
- package/src/benchmark/BenchmarkDataGenerator.ts +1288 -0
- package/src/benchmark/BenchmarkDataViewer.ts +324 -0
- package/src/benchmark/BenchmarkHistoryService.ts +221 -0
- package/src/benchmark/BenchmarkRunner.ts +685 -0
- package/src/benchmark/BenchmarkValidator.ts +204 -0
- package/src/benchmark/FastEvalRunner.ts +225 -0
- package/src/benchmark/MetricsValidator.ts +165 -0
- package/src/benchmark/MetricsVisualizer.ts +909 -0
- package/src/benchmark/ModelBenchmarkService.ts +611 -0
- package/src/benchmark/ModelRegistry.ts +158 -0
- package/src/benchmark/RulerBenchmarkIntegration.ts +235 -0
- package/src/benchmark/SimulationA2AInterface.ts +1169 -0
- package/src/benchmark/SimulationEngine.ts +832 -0
- package/src/benchmark/TaskRunner.ts +94 -0
- package/src/benchmark/__tests__/BenchmarkRunner.test.ts +534 -0
- package/src/benchmark/__tests__/HeadToHead.test.ts +126 -0
- package/src/benchmark/index.ts +91 -0
- package/src/benchmark/parseSimulationMetrics.ts +124 -0
- package/src/benchmark/simulation-types.ts +78 -0
- package/src/dependencies.ts +475 -0
- package/src/generation/TrajectoryGenerator.ts +387 -0
- package/src/generation/index.ts +12 -0
- package/src/huggingface/HuggingFaceDatasetUploader.ts +636 -0
- package/src/huggingface/HuggingFaceIntegrationService.ts +426 -0
- package/src/huggingface/HuggingFaceModelUploader.ts +532 -0
- package/src/huggingface/index.ts +27 -0
- package/src/huggingface/shared/HuggingFaceUploadUtil.ts +206 -0
- package/src/index.ts +102 -0
- package/src/init-training.ts +53 -0
- package/src/metrics/TrajectoryMetricsExtractor.ts +653 -0
- package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +759 -0
- package/src/metrics/index.ts +8 -0
- package/src/metrics/types.ts +200 -0
- package/src/rubrics/__tests__/index.test.ts +184 -0
- package/src/rubrics/ass-kisser.ts +85 -0
- package/src/rubrics/degen.ts +80 -0
- package/src/rubrics/goody-twoshoes.ts +84 -0
- package/src/rubrics/index.ts +236 -0
- package/src/rubrics/information-trader.ts +84 -0
- package/src/rubrics/infosec.ts +101 -0
- package/src/rubrics/liar.ts +104 -0
- package/src/rubrics/perps-trader.ts +87 -0
- package/src/rubrics/researcher.ts +81 -0
- package/src/rubrics/scammer.ts +82 -0
- package/src/rubrics/social-butterfly.ts +73 -0
- package/src/rubrics/super-predictor.ts +97 -0
- package/src/rubrics/trader.ts +67 -0
- package/src/scoring/ArchetypeScoringService.ts +486 -0
- package/src/scoring/JudgePromptBuilder.ts +556 -0
- package/src/scoring/LLMJudgeCache.ts +401 -0
- package/src/scoring/index.ts +9 -0
- package/src/training/AutomationPipeline.ts +916 -0
- package/src/training/BenchmarkService.ts +518 -0
- package/src/training/ConfigValidator.ts +220 -0
- package/src/training/MarketOutcomesTracker.ts +187 -0
- package/src/training/ModelDeployer.ts +186 -0
- package/src/training/ModelFetcher.ts +76 -0
- package/src/training/ModelSelectionService.ts +341 -0
- package/src/training/ModelUsageVerifier.ts +160 -0
- package/src/training/MultiModelOrchestrator.ts +580 -0
- package/src/training/RLModelConfig.ts +407 -0
- package/src/training/RewardBackpropagationService.ts +149 -0
- package/src/training/RulerScoringService.ts +666 -0
- package/src/training/TrainingMonitor.ts +166 -0
- package/src/training/TrajectoryRecorder.ts +399 -0
- package/src/training/__tests__/TrajectoryRecorder.test.ts +472 -0
- package/src/training/index.ts +100 -0
- package/src/training/logRLConfig.ts +34 -0
- package/src/training/pipeline.ts +129 -0
- package/src/training/storage/ModelStorageService.ts +279 -0
- package/src/training/storage/TrainingDataArchiver.ts +197 -0
- package/src/training/storage/index.ts +17 -0
- package/src/training/types.ts +207 -0
- package/src/training/window-utils.ts +138 -0
- package/src/utils/index.ts +101 -0
- package/src/utils/logger.ts +59 -0
- package/src/utils/snowflake.ts +17 -0
- package/src/utils/synthetic-detector.ts +111 -0
- package/tsconfig.json +20 -0
package/Dockerfile
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# Babylon RL Training Docker Image
|
|
2
|
+
#
|
|
3
|
+
# Supports local dev (12GB GPU) through production (4x L40 192GB)
|
|
4
|
+
#
|
|
5
|
+
# Build:
|
|
6
|
+
# docker build -t babylon-training .
|
|
7
|
+
#
|
|
8
|
+
# Run (single GPU):
|
|
9
|
+
# docker run --gpus all -v $(pwd)/trained_models:/app/trained_models babylon-training \
|
|
10
|
+
# --profile l40 --steps 5000
|
|
11
|
+
#
|
|
12
|
+
# Run (4x GPU tensor parallel):
|
|
13
|
+
# docker run --gpus all -e CUDA_VISIBLE_DEVICES=0,1,2,3 -v $(pwd)/trained_models:/app/trained_models babylon-training \
|
|
14
|
+
# --profile l40-4gpu --steps 10000
|
|
15
|
+
|
|
16
|
+
# nvidia/cuda tags carry a full major.minor.patch CUDA version.
FROM nvidia/cuda:12.1.1-runtime-ubuntu22.04
|
|
17
|
+
|
|
18
|
+
# Prevent interactive prompts
|
|
19
|
+
ENV DEBIAN_FRONTEND=noninteractive
|
|
20
|
+
|
|
21
|
+
# Install system dependencies
|
|
22
|
+
RUN apt-get update && apt-get install -y \
|
|
23
|
+
python3.11 \
|
|
24
|
+
python3.11-venv \
|
|
25
|
+
python3-pip \
|
|
26
|
+
git \
|
|
27
|
+
curl \
|
|
28
|
+
wget \
|
|
29
|
+
&& rm -rf /var/lib/apt/lists/*
|
|
30
|
+
|
|
31
|
+
# Create app directory
|
|
32
|
+
WORKDIR /app
|
|
33
|
+
|
|
34
|
+
# Install Python dependencies
|
|
35
|
+
COPY python/requirements.txt ./requirements.txt
|
|
36
|
+
RUN python3.11 -m pip install --no-cache-dir --upgrade pip && \
|
|
37
|
+
python3.11 -m pip install --no-cache-dir -r requirements.txt
|
|
38
|
+
|
|
39
|
+
# Install vLLM (separate layer for caching)
|
|
40
|
+
# The requirement is quoted: an unquoted ">=" is parsed by the shell as an
# output redirection, which drops the version constraint.
RUN python3.11 -m pip install --no-cache-dir "vllm>=0.4.0"
|
|
41
|
+
|
|
42
|
+
# Install atroposlib
|
|
43
|
+
RUN python3.11 -m pip install --no-cache-dir atroposlib
|
|
44
|
+
|
|
45
|
+
# Install flash-attention (optional, for performance)
|
|
46
|
+
RUN python3.11 -m pip install --no-cache-dir flash-attn --no-build-isolation || echo "Flash attention not available"
|
|
47
|
+
|
|
48
|
+
# Copy application code
|
|
49
|
+
COPY python/ ./python/
|
|
50
|
+
COPY Makefile ./Makefile
|
|
51
|
+
|
|
52
|
+
# Set Python path
|
|
53
|
+
ENV PYTHONPATH=/app/python
|
|
54
|
+
|
|
55
|
+
# Create directories for outputs
|
|
56
|
+
RUN mkdir -p /app/trained_models /app/logs /app/data
|
|
57
|
+
|
|
58
|
+
# Default environment variables
|
|
59
|
+
ENV DATABASE_URL=""
|
|
60
|
+
ENV WANDB_API_KEY=""
|
|
61
|
+
ENV WANDB_PROJECT="babylon-training"
|
|
62
|
+
ENV CUDA_VISIBLE_DEVICES="0"
|
|
63
|
+
|
|
64
|
+
# Health check
|
|
65
|
+
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
|
|
66
|
+
CMD curl -f http://localhost:8000/ || exit 1
|
|
67
|
+
|
|
68
|
+
# Entrypoint
|
|
69
|
+
ENTRYPOINT ["python3.11", "python/scripts/run_training.py"]
|
|
70
|
+
|
|
71
|
+
# Default command (can be overridden)
|
|
72
|
+
CMD ["--profile", "l40", "--steps", "5000"]
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
|
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Shaw Walters and elizaOS Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/Makefile
ADDED
|
@@ -0,0 +1,374 @@
|
|
|
1
|
+
# Babylon Training Pipeline - Developer Makefile
|
|
2
|
+
#
|
|
3
|
+
# Quick reference:
|
|
4
|
+
# make help - Show all commands
|
|
5
|
+
# make tier1 - Run Python unit tests (no infra)
|
|
6
|
+
# make tier2 - Run JSON mode tests
|
|
7
|
+
# make tier3 - Run DB integration tests
|
|
8
|
+
# make tier4 - Run full GPU training
|
|
9
|
+
#
|
|
10
|
+
# Infrastructure:
|
|
11
|
+
# make db-up - Start test PostgreSQL/Redis
|
|
12
|
+
# make db-down - Stop and remove containers
|
|
13
|
+
# make db-migrate - Apply database schema
|
|
14
|
+
|
|
15
|
+
.PHONY: all help tier1 tier2 tier3 tier4 db-up db-down db-migrate \
|
|
16
|
+
train-12gb train-16gb train-24gb train-l40 train-l40-2gpu train-l40-4gpu \
|
|
17
|
+
train-online bridge-server generate-data venv lint test clean \
db-reset tier4-generate tier4-import docker-build train train-hybrid \
train-cloud train-cloud-l40 train-cloud-l40-2gpu train-cloud-l40-4gpu \
train-cloud-online bridge-check ab-test ab-test-quick
|
|
18
|
+
|
|
19
|
+
# Default target
|
|
20
|
+
all: test
|
|
21
|
+
|
|
22
|
+
# Colors for output
|
|
23
|
+
# Store the real ESC byte so recipes do not depend on echo interpreting "\033".
CYAN := $(shell printf '\033[36m')
|
|
24
|
+
# Store the real ESC byte so recipes do not depend on echo interpreting "\033".
GREEN := $(shell printf '\033[32m')
|
|
25
|
+
# Store the real ESC byte so recipes do not depend on echo interpreting "\033".
YELLOW := $(shell printf '\033[33m')
|
|
26
|
+
# Store the real ESC byte so recipes do not depend on echo interpreting "\033".
RESET := $(shell printf '\033[0m')
|
|
27
|
+
|
|
28
|
+
# Paths
|
|
29
|
+
PYTHON_DIR := python
|
|
30
|
+
VENV := venv
|
|
31
|
+
VENV_BIN := $(VENV)/bin
|
|
32
|
+
PYTHON := $(VENV_BIN)/python
|
|
33
|
+
PIP := $(VENV_BIN)/pip
|
|
34
|
+
PYTEST := $(VENV_BIN)/pytest
|
|
35
|
+
|
|
36
|
+
# Database
|
|
37
|
+
DB_URL := postgresql://babylon_test:test_password@localhost:5434/babylon_test
|
|
38
|
+
DB_COMPOSE := docker-compose.test.yml
|
|
39
|
+
|
|
40
|
+
# Default profile (can be overridden: make train PROFILE=24gb)
|
|
41
|
+
PROFILE ?= 12gb
|
|
42
|
+
|
|
43
|
+
#---------------------------------------------------------------------------
|
|
44
|
+
# Help
|
|
45
|
+
#---------------------------------------------------------------------------
|
|
46
|
+
|
|
47
|
+
help:
|
|
48
|
+
@echo ""
|
|
49
|
+
@echo "$(CYAN)Babylon Training Pipeline$(RESET)"
|
|
50
|
+
@echo "=========================="
|
|
51
|
+
@echo ""
|
|
52
|
+
@echo "$(GREEN)Testing Tiers:$(RESET)"
|
|
53
|
+
@echo " make tier1 Python unit tests (no infrastructure)"
|
|
54
|
+
@echo " make tier2 JSON mode integration tests"
|
|
55
|
+
@echo " make tier3 Database integration tests (requires Docker)"
|
|
56
|
+
@echo " make tier4 Full GPU training test"
|
|
57
|
+
@echo ""
|
|
58
|
+
@echo "$(GREEN)Infrastructure:$(RESET)"
|
|
59
|
+
@echo " make db-up Start test PostgreSQL and Redis"
|
|
60
|
+
@echo " make db-down Stop and remove test containers"
|
|
61
|
+
@echo " make db-migrate Apply database schema"
|
|
62
|
+
@echo " make db-reset Stop, clean, start, and migrate"
|
|
63
|
+
@echo ""
|
|
64
|
+
@echo "$(GREEN)Training (with GPU profiles):$(RESET)"
|
|
65
|
+
@echo " make train-12gb Train with 12GB GPU profile (RTX 3060)"
|
|
66
|
+
@echo " make train-16gb Train with 16GB GPU profile (RTX 4080)"
|
|
67
|
+
@echo " make train-24gb Train with 24GB GPU profile (RTX 4090)"
|
|
68
|
+
@echo " make train-l40 Train with L40 (48GB) profile"
|
|
69
|
+
@echo " make train-l40-2gpu Train with 2x L40 (96GB) profile"
|
|
70
|
+
@echo " make train-l40-4gpu Train with 4x L40 (192GB) for Qwen3 30B"
|
|
71
|
+
@echo " make train PROFILE=<name> Train with custom profile"
|
|
72
|
+
@echo ""
|
|
73
|
+
@echo "$(GREEN)Online Training (Phase 3):$(RESET)"
|
|
74
|
+
@echo " make bridge-server Start TypeScript simulation bridge"
|
|
75
|
+
@echo " make bridge-check Check if bridge server is running"
|
|
76
|
+
@echo " make train-online Run online training (requires bridge-server)"
|
|
77
|
+
@echo " make train-hybrid Run hybrid training (mix offline + online)"
|
|
78
|
+
@echo " make generate-data Generate trajectories for offline training"
|
|
79
|
+
@echo ""
|
|
80
|
+
@echo "$(GREEN)Cloud & Production (Phase 4):$(RESET)"
|
|
81
|
+
@echo " make docker-build Build Docker image for cloud deployment"
|
|
82
|
+
@echo " make train-cloud Train with W&B logging enabled"
|
|
83
|
+
@echo " make train-cloud-l40 Cloud training with 1x L40"
|
|
84
|
+
@echo " make train-cloud-l40-2gpu Cloud training with 2x L40"
|
|
85
|
+
@echo " make train-cloud-l40-4gpu Cloud training with 4x L40 (Qwen3 30B)"
|
|
86
|
+
@echo " make train-cloud-online Cloud online training with W&B"
|
|
87
|
+
@echo ""
|
|
88
|
+
@echo "$(GREEN)A/B Testing & Evaluation:$(RESET)"
|
|
89
|
+
@echo " make ab-test Run A/B test (MODEL_A vs MODEL_B)"
|
|
90
|
+
@echo " make ab-test-quick Quick A/B test with trained model"
|
|
91
|
+
@echo ""
|
|
92
|
+
@echo "$(GREEN)Development:$(RESET)"
|
|
93
|
+
@echo " make venv Create/update Python virtual environment"
|
|
94
|
+
@echo " make lint Run linting"
|
|
95
|
+
@echo " make test Run all tests (tier1 + tier2)"
|
|
96
|
+
@echo " make clean Remove generated files"
|
|
97
|
+
@echo ""
|
|
98
|
+
@echo "$(YELLOW)Profiles available:$(RESET)"
|
|
99
|
+
@echo " 12gb, 16gb, 24gb, 48gb, cpu, l40, l40-2gpu, l40-2gpu-safe, l40-4gpu"
|
|
100
|
+
@echo ""
|
|
101
|
+
|
|
102
|
+
#---------------------------------------------------------------------------
|
|
103
|
+
# Virtual Environment
|
|
104
|
+
#---------------------------------------------------------------------------
|
|
105
|
+
|
|
106
|
+
venv:
|
|
107
|
+
@echo "$(CYAN)Setting up Python virtual environment...$(RESET)"
|
|
108
|
+
cd $(PYTHON_DIR) && python3 -m venv $(VENV)
|
|
109
|
+
cd $(PYTHON_DIR) && $(PIP) install --upgrade pip
|
|
110
|
+
cd $(PYTHON_DIR) && $(PIP) install -r requirements.txt
|
|
111
|
+
cd $(PYTHON_DIR) && $(PIP) install -e .
|
|
112
|
+
@echo "$(GREEN)✓ Virtual environment ready$(RESET)"
|
|
113
|
+
@echo " Activate with: source $(PYTHON_DIR)/$(VENV_BIN)/activate"
|
|
114
|
+
|
|
115
|
+
#---------------------------------------------------------------------------
|
|
116
|
+
# Testing Tiers
|
|
117
|
+
#---------------------------------------------------------------------------
|
|
118
|
+
|
|
119
|
+
tier1:
|
|
120
|
+
@echo "$(CYAN)Running Tier 1: Python Unit Tests$(RESET)"
|
|
121
|
+
cd $(PYTHON_DIR) && PYTHONPATH=. $(PYTEST) tests/ -v \
|
|
122
|
+
--ignore=tests/integration/ \
|
|
123
|
+
--ignore=tests/e2e/ \
|
|
124
|
+
-x
|
|
125
|
+
@echo "$(GREEN)✓ Tier 1 passed$(RESET)"
|
|
126
|
+
|
|
127
|
+
tier2:
|
|
128
|
+
@echo "$(CYAN)Running Tier 2: JSON Mode Tests$(RESET)"
|
|
129
|
+
cd $(PYTHON_DIR) && PYTHONPATH=. $(PYTEST) tests/integration/test_json_mode_integration.py -v -x
|
|
130
|
+
@echo "$(GREEN)✓ Tier 2 passed$(RESET)"
|
|
131
|
+
|
|
132
|
+
tier3: db-up db-migrate
|
|
133
|
+
@echo "$(CYAN)Running Tier 3: Database Integration Tests$(RESET)"
|
|
134
|
+
cd $(PYTHON_DIR) && DATABASE_URL=$(DB_URL) PYTHONPATH=. \
|
|
135
|
+
$(PYTEST) tests/integration/test_db_integration.py -v -x
|
|
136
|
+
@echo "$(GREEN)✓ Tier 3 passed$(RESET)"
|
|
137
|
+
|
|
138
|
+
tier4: db-up db-migrate tier4-import
|
|
139
|
+
@echo "$(CYAN)Running Tier 4: Full GPU Training$(RESET)"
|
|
140
|
+
@echo "Using profile: $(PROFILE)"
|
|
141
|
+
cd $(PYTHON_DIR) && DATABASE_URL=$(DB_URL) WANDB_MODE=offline \
|
|
142
|
+
PATH="$(shell pwd)/$(PYTHON_DIR)/$(VENV_BIN):$$PATH" \
|
|
143
|
+
$(PYTHON) scripts/run_training.py \
|
|
144
|
+
--profile $(PROFILE) \
|
|
145
|
+
--steps 1 \
|
|
146
|
+
--no-wandb \
|
|
147
|
+
--skip-validation
|
|
148
|
+
@echo "$(GREEN)✓ Tier 4 passed$(RESET)"
|
|
149
|
+
|
|
150
|
+
# Training data output directory (absolute path from repo root)
|
|
151
|
+
TRAINING_DATA_DIR := $(shell cd ../.. && pwd)/training-data-output
|
|
152
|
+
|
|
153
|
+
tier4-generate:
|
|
154
|
+
@echo "$(CYAN)Generating training data...$(RESET)"
|
|
155
|
+
cd ../.. && bun run packages/engine/examples/generate-training-data.ts \
|
|
156
|
+
--causal --hours 2 --npcs 5 --seed 42
|
|
157
|
+
@echo "$(GREEN)✓ Training data generated$(RESET)"
|
|
158
|
+
|
|
159
|
+
tier4-import:
|
|
160
|
+
@echo "$(CYAN)Importing trajectories to database...$(RESET)"
|
|
161
|
+
@if [ -d "$(TRAINING_DATA_DIR)/trajectories" ]; then \
|
|
162
|
+
cd $(PYTHON_DIR) && DATABASE_URL=$(DB_URL) \
|
|
163
|
+
$(PYTHON) scripts/import_json_trajectories.py \
|
|
164
|
+
--source $(TRAINING_DATA_DIR); \
|
|
165
|
+
else \
|
|
166
|
+
echo "$(YELLOW)Note: No trajectories found. Run 'make tier4-generate' first.$(RESET)"; \
|
|
167
|
+
fi
|
|
168
|
+
|
|
169
|
+
#---------------------------------------------------------------------------
|
|
170
|
+
# Infrastructure
|
|
171
|
+
#---------------------------------------------------------------------------
|
|
172
|
+
|
|
173
|
+
db-up:
|
|
174
|
+
@echo "$(CYAN)Starting test database...$(RESET)"
|
|
175
|
+
docker compose -f $(DB_COMPOSE) up -d
|
|
176
|
+
@sleep 3
|
|
177
|
+
@docker compose -f $(DB_COMPOSE) ps
|
|
178
|
+
@echo "$(GREEN)✓ Database ready$(RESET)"
|
|
179
|
+
|
|
180
|
+
db-down:
|
|
181
|
+
@echo "$(CYAN)Stopping test database...$(RESET)"
|
|
182
|
+
docker compose -f $(DB_COMPOSE) down -v
|
|
183
|
+
@echo "$(GREEN)✓ Database stopped$(RESET)"
|
|
184
|
+
|
|
185
|
+
db-migrate:
|
|
186
|
+
@echo "$(CYAN)Applying database schema...$(RESET)"
|
|
187
|
+
cd ../db && DATABASE_URL=$(DB_URL) bunx drizzle-kit push --force
|
|
188
|
+
@echo "$(GREEN)✓ Schema applied$(RESET)"
|
|
189
|
+
|
|
190
|
+
db-reset: db-down db-up db-migrate
|
|
191
|
+
@echo "$(GREEN)✓ Database reset complete$(RESET)"
|
|
192
|
+
|
|
193
|
+
#---------------------------------------------------------------------------
|
|
194
|
+
# Training Shortcuts
|
|
195
|
+
#---------------------------------------------------------------------------
|
|
196
|
+
|
|
197
|
+
train-12gb:
|
|
198
|
+
$(MAKE) train PROFILE=12gb
|
|
199
|
+
|
|
200
|
+
train-16gb:
|
|
201
|
+
$(MAKE) train PROFILE=16gb
|
|
202
|
+
|
|
203
|
+
train-24gb:
|
|
204
|
+
$(MAKE) train PROFILE=24gb
|
|
205
|
+
|
|
206
|
+
train-l40:
|
|
207
|
+
$(MAKE) train PROFILE=l40
|
|
208
|
+
|
|
209
|
+
train-l40-2gpu:
|
|
210
|
+
$(MAKE) train PROFILE=l40-2gpu
|
|
211
|
+
|
|
212
|
+
train-l40-4gpu:
|
|
213
|
+
$(MAKE) train PROFILE=l40-4gpu
|
|
214
|
+
|
|
215
|
+
#---------------------------------------------------------------------------
|
|
216
|
+
# Phase 4: Cloud & Production
|
|
217
|
+
#---------------------------------------------------------------------------
|
|
218
|
+
|
|
219
|
+
# Build Docker image for cloud deployment
|
|
220
|
+
docker-build:
|
|
221
|
+
@echo "$(CYAN)Building Docker image for cloud deployment...$(RESET)"
|
|
222
|
+
docker build -t babylon-training:latest .
|
|
223
|
+
@echo "$(GREEN)✓ Docker image built: babylon-training:latest$(RESET)"
|
|
224
|
+
|
|
225
|
+
# Production training with W&B logging (requires WANDB_API_KEY)
|
|
226
|
+
train-cloud: db-up db-migrate
|
|
227
|
+
@echo "$(CYAN)Starting production cloud training with W&B logging...$(RESET)"
|
|
228
|
+
@if [ -z "$$WANDB_API_KEY" ]; then \
|
|
229
|
+
echo "$(YELLOW)Warning: WANDB_API_KEY not set. W&B logging will be disabled.$(RESET)"; \
|
|
230
|
+
fi
|
|
231
|
+
cd $(PYTHON_DIR) && DATABASE_URL=$(DB_URL) \
|
|
232
|
+
PATH="$(shell pwd)/$(PYTHON_DIR)/$(VENV_BIN):$$PATH" \
|
|
233
|
+
$(PYTHON) scripts/run_training.py \
|
|
234
|
+
--profile $(PROFILE) \
|
|
235
|
+
--wandb-project $(WANDB_PROJECT) \
|
|
236
|
+
$(if $(WANDB_ENTITY),--wandb-entity $(WANDB_ENTITY),) \
|
|
237
|
+
$(if $(WANDB_RUN_NAME),--wandb-run-name $(WANDB_RUN_NAME),)
|
|
238
|
+
|
|
239
|
+
# Cloud training with specific L40 profiles
|
|
240
|
+
train-cloud-l40:
|
|
241
|
+
$(MAKE) train-cloud PROFILE=l40
|
|
242
|
+
|
|
243
|
+
train-cloud-l40-2gpu:
|
|
244
|
+
$(MAKE) train-cloud PROFILE=l40-2gpu
|
|
245
|
+
|
|
246
|
+
train-cloud-l40-4gpu:
|
|
247
|
+
$(MAKE) train-cloud PROFILE=l40-4gpu
|
|
248
|
+
|
|
249
|
+
# Online cloud training (with bridge)
|
|
250
|
+
train-cloud-online: db-up db-migrate bridge-check
|
|
251
|
+
@echo "$(CYAN)Starting production online training with W&B logging...$(RESET)"
|
|
252
|
+
cd $(PYTHON_DIR) && DATABASE_URL=$(DB_URL) \
|
|
253
|
+
USE_SIMULATION_BRIDGE=1 \
|
|
254
|
+
SIMULATION_BRIDGE_URL=http://localhost:3001 \
|
|
255
|
+
PATH="$(shell pwd)/$(PYTHON_DIR)/$(VENV_BIN):$$PATH" \
|
|
256
|
+
$(PYTHON) scripts/run_training.py \
|
|
257
|
+
--profile $(PROFILE) \
|
|
258
|
+
--mode online \
|
|
259
|
+
--bridge-url http://localhost:3001 \
|
|
260
|
+
--wandb-project $(WANDB_PROJECT) \
|
|
261
|
+
$(if $(WANDB_ENTITY),--wandb-entity $(WANDB_ENTITY),)
|
|
262
|
+
|
|
263
|
+
# Default W&B settings
|
|
264
|
+
WANDB_PROJECT ?= babylon-training
|
|
265
|
+
WANDB_ENTITY ?=
|
|
266
|
+
WANDB_RUN_NAME ?=
|
|
267
|
+
|
|
268
|
+
train: db-up db-migrate
|
|
269
|
+
@echo "$(CYAN)Starting training with profile: $(PROFILE)$(RESET)"
|
|
270
|
+
cd $(PYTHON_DIR) && DATABASE_URL=$(DB_URL) WANDB_MODE=offline \
|
|
271
|
+
PATH="$(shell pwd)/$(PYTHON_DIR)/$(VENV_BIN):$$PATH" \
|
|
272
|
+
$(PYTHON) scripts/run_training.py \
|
|
273
|
+
--profile $(PROFILE) \
|
|
274
|
+
--no-wandb \
|
|
275
|
+
--skip-validation
|
|
276
|
+
|
|
277
|
+
#---------------------------------------------------------------------------
|
|
278
|
+
# Online Training
|
|
279
|
+
#---------------------------------------------------------------------------
|
|
280
|
+
|
|
281
|
+
bridge-server:
|
|
282
|
+
@echo "$(CYAN)Starting TypeScript simulation bridge server...$(RESET)"
|
|
283
|
+
cd ../engine && bun run src/services/simulation-bridge-server.ts
|
|
284
|
+
|
|
285
|
+
# Online mode needs the simulation bridge: check it first so the run fails
# fast instead of after the database has been started and migrated.
train-online: bridge-check db-up db-migrate
|
|
286
|
+
@echo "$(CYAN)Starting online training (requires bridge-server running)$(RESET)"
|
|
287
|
+
@echo "Make sure you've started the bridge server with: make bridge-server"
|
|
288
|
+
cd $(PYTHON_DIR) && DATABASE_URL=$(DB_URL) WANDB_MODE=offline \
|
|
289
|
+
USE_SIMULATION_BRIDGE=1 \
|
|
290
|
+
SIMULATION_BRIDGE_URL=http://localhost:3001 \
|
|
291
|
+
PATH="$(shell pwd)/$(PYTHON_DIR)/$(VENV_BIN):$$PATH" \
|
|
292
|
+
$(PYTHON) scripts/run_training.py \
|
|
293
|
+
--profile $(PROFILE) \
|
|
294
|
+
--mode online \
|
|
295
|
+
--bridge-url http://localhost:3001 \
|
|
296
|
+
--no-wandb
|
|
297
|
+
|
|
298
|
+
# Hybrid mode needs the simulation bridge: check it first so the run fails
# fast instead of after the database has been started and migrated.
train-hybrid: bridge-check db-up db-migrate
|
|
299
|
+
@echo "$(CYAN)Starting hybrid training (requires bridge-server running)$(RESET)"
|
|
300
|
+
@echo "Make sure you've started the bridge server with: make bridge-server"
|
|
301
|
+
@echo "Using online ratio: $(ONLINE_RATIO)"
|
|
302
|
+
cd $(PYTHON_DIR) && DATABASE_URL=$(DB_URL) WANDB_MODE=offline \
|
|
303
|
+
USE_SIMULATION_BRIDGE=1 \
|
|
304
|
+
SIMULATION_BRIDGE_URL=http://localhost:3001 \
|
|
305
|
+
HYBRID_ONLINE_RATIO=$(ONLINE_RATIO) \
|
|
306
|
+
PATH="$(shell pwd)/$(PYTHON_DIR)/$(VENV_BIN):$$PATH" \
|
|
307
|
+
$(PYTHON) scripts/run_training.py \
|
|
308
|
+
--profile $(PROFILE) \
|
|
309
|
+
--mode hybrid \
|
|
310
|
+
--bridge-url http://localhost:3001 \
|
|
311
|
+
--hybrid-online-ratio $(ONLINE_RATIO) \
|
|
312
|
+
--no-wandb
|
|
313
|
+
|
|
314
|
+
# Default online ratio for hybrid mode
|
|
315
|
+
ONLINE_RATIO ?= 0.2
|
|
316
|
+
|
|
317
|
+
# Check if bridge server is running
|
|
318
|
+
bridge-check:
|
|
319
|
+
@curl -sf http://localhost:3001/health > /dev/null 2>&1 && \
|
|
320
|
+
echo "$(GREEN)✓ Simulation bridge is running$(RESET)" || \
|
|
321
|
+
(echo "$(YELLOW)✗ Simulation bridge not running. Start with: make bridge-server$(RESET)" && exit 1)
|
|
322
|
+
|
|
323
|
+
# Runs scripts/generate_dataset.sh, passing HOURS, PARALLEL, NPCS and OUTPUT
# as positional arguments in that order.
# NOTE(review): no defaults for these four variables are visible in this
# Makefile, so leaving an earlier one unset shifts the later values into the
# wrong position — confirm against the script, or set all four together.
generate-data:
|
|
324
|
+
@echo "$(CYAN)Generating training trajectories...$(RESET)"
|
|
325
|
+
./scripts/generate_dataset.sh $(HOURS) $(PARALLEL) $(NPCS) $(OUTPUT)
|
|
326
|
+
@echo "$(GREEN)✓ Data generation complete$(RESET)"
|
|
327
|
+
|
|
328
|
+
#---------------------------------------------------------------------------
|
|
329
|
+
# A/B Testing & Evaluation
|
|
330
|
+
#---------------------------------------------------------------------------
|
|
331
|
+
|
|
332
|
+
# Run A/B test comparing trained model against baseline
|
|
333
|
+
ab-test:
|
|
334
|
+
@echo "$(CYAN)Running A/B test: $(MODEL_A) vs $(MODEL_B)$(RESET)"
|
|
335
|
+
cd $(PYTHON_DIR) && \
|
|
336
|
+
PATH="$(shell pwd)/$(PYTHON_DIR)/$(VENV_BIN):$$PATH" \
|
|
337
|
+
$(PYTHON) scripts/run_ab_test.py \
|
|
338
|
+
--model-a $(MODEL_A) \
|
|
339
|
+
--model-b $(MODEL_B) \
|
|
340
|
+
--num-runs $(AB_RUNS) \
|
|
341
|
+
--output-dir $(AB_OUTPUT) \
|
|
342
|
+
$(if $(AB_ARCHETYPES),--archetypes $(AB_ARCHETYPES),)
|
|
343
|
+
|
|
344
|
+
# Quick A/B test with trained model vs base
|
|
345
|
+
ab-test-quick:
|
|
346
|
+
$(MAKE) ab-test MODEL_B=./trained_models/final_model AB_RUNS=1
|
|
347
|
+
|
|
348
|
+
# Default A/B test settings
|
|
349
|
+
MODEL_A ?= Qwen/Qwen2.5-0.5B-Instruct
|
|
350
|
+
MODEL_B ?= ./trained_models/final_model
|
|
351
|
+
AB_RUNS ?= 3
|
|
352
|
+
AB_OUTPUT ?= ./ab_test_results
|
|
353
|
+
AB_ARCHETYPES ?=
|
|
354
|
+
|
|
355
|
+
#---------------------------------------------------------------------------
|
|
356
|
+
# Development
|
|
357
|
+
#---------------------------------------------------------------------------
|
|
358
|
+
|
|
359
|
+
lint:
|
|
360
|
+
@echo "$(CYAN)Running linting...$(RESET)"
|
|
361
|
+
cd ../.. && bun run lint
|
|
362
|
+
|
|
363
|
+
test: tier1 tier2
|
|
364
|
+
@echo "$(GREEN)✓ All quick tests passed$(RESET)"
|
|
365
|
+
|
|
366
|
+
clean:
|
|
367
|
+
@echo "$(CYAN)Cleaning generated files...$(RESET)"
|
|
368
|
+
rm -rf $(PYTHON_DIR)/logs
|
|
369
|
+
rm -rf $(PYTHON_DIR)/trained_models
|
|
370
|
+
rm -rf $(PYTHON_DIR)/.pytest_cache
|
|
371
|
+
rm -rf $(PYTHON_DIR)/__pycache__
|
|
372
|
+
find $(PYTHON_DIR) -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true
|
|
373
|
+
@echo "$(GREEN)✓ Clean complete$(RESET)"
|
|
374
|
+
|