@elizaos/training 2.0.0-alpha.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Dockerfile +75 -0
- package/Makefile +374 -0
- package/README.md +346 -0
- package/config/rubrics.json +137 -0
- package/data/.gitkeep +0 -0
- package/data/degen/.gitkeep +2 -0
- package/data/trader/.gitkeep +2 -0
- package/docker-compose.test.yml +57 -0
- package/package.json +58 -0
- package/python/config/babylon_atropos.yaml +90 -0
- package/python/config/profiles/12gb.json +11 -0
- package/python/config/profiles/16gb.json +10 -0
- package/python/config/profiles/24gb.json +10 -0
- package/python/config/profiles/48gb.json +10 -0
- package/python/config/profiles/cpu.json +11 -0
- package/python/config/profiles/l40-2gpu-safe.json +20 -0
- package/python/config/profiles/l40-2gpu.json +22 -0
- package/python/config/profiles/l40-4gpu.json +21 -0
- package/python/config/profiles/l40.json +17 -0
- package/python/config/tinker_training.yaml +143 -0
- package/python/curriculum_state.json +165 -0
- package/python/env.template +86 -0
- package/python/env.training.template +46 -0
- package/python/pyproject.toml +41 -0
- package/python/requirements-ci.txt +31 -0
- package/python/requirements.txt +87 -0
- package/python/scripts/__init__.py +4 -0
- package/python/scripts/import_json_trajectories.py +412 -0
- package/python/scripts/local-finetune/README.md +63 -0
- package/python/scripts/local-finetune/ingest_and_score.py +139 -0
- package/python/scripts/local-finetune/merge_model.py +32 -0
- package/python/scripts/local-finetune/test_adapter.py +91 -0
- package/python/scripts/local-finetune/train_from_csv.py +132 -0
- package/python/scripts/merge_trajectories.py +318 -0
- package/python/scripts/run_ab_test.py +143 -0
- package/python/scripts/run_full_pipeline.py +544 -0
- package/python/scripts/run_tinker_training.py +192 -0
- package/python/scripts/run_training.py +914 -0
- package/python/scripts/test_judge.py +155 -0
- package/python/scripts/test_pipeline.py +356 -0
- package/python/scripts/test_trained_model.py +380 -0
- package/python/scripts/train_local.py +528 -0
- package/python/setup.py +20 -0
- package/python/src/__init__.py +190 -0
- package/python/src/data_bridge/__init__.py +24 -0
- package/python/src/data_bridge/converter.py +435 -0
- package/python/src/data_bridge/reader.py +393 -0
- package/python/src/models.py +283 -0
- package/python/src/training/__init__.py +605 -0
- package/python/src/training/ab_testing.py +404 -0
- package/python/src/training/action_executor.py +621 -0
- package/python/src/training/archetype_trainer.py +347 -0
- package/python/src/training/atropos_trainer.py +980 -0
- package/python/src/training/babylon_env.py +1254 -0
- package/python/src/training/error_recovery.py +647 -0
- package/python/src/training/evaluation.py +856 -0
- package/python/src/training/fast_simulator.py +880 -0
- package/python/src/training/format_validator.py +584 -0
- package/python/src/training/hybrid_env.py +522 -0
- package/python/src/training/kl_controller.py +628 -0
- package/python/src/training/multi_prompt_dataset.py +883 -0
- package/python/src/training/multi_turn.py +656 -0
- package/python/src/training/online_env.py +1084 -0
- package/python/src/training/quality_scorer.py +391 -0
- package/python/src/training/quality_utils.py +633 -0
- package/python/src/training/rewards.py +1344 -0
- package/python/src/training/rlaif_env.py +17 -0
- package/python/src/training/rollout_generator.py +502 -0
- package/python/src/training/rubric_loader.py +198 -0
- package/python/src/training/scenario_pool.py +1072 -0
- package/python/src/training/schemas.py +481 -0
- package/python/src/training/service_manager.py +552 -0
- package/python/src/training/simulation_bridge.py +535 -0
- package/python/src/training/tick_reward_attribution.py +399 -0
- package/python/src/training/tinker_client.py +575 -0
- package/python/src/training/tinker_trainer.py +646 -0
- package/python/src/training/tokenization_utils.py +402 -0
- package/python/tests/e2e/__init__.py +13 -0
- package/python/tests/e2e/conftest.py +258 -0
- package/python/tests/e2e/test_full_pipeline.py +643 -0
- package/python/tests/e2e/test_online_training_e2e.py +365 -0
- package/python/tests/integration/__init__.py +12 -0
- package/python/tests/integration/conftest.py +383 -0
- package/python/tests/integration/test_db_integration.py +649 -0
- package/python/tests/integration/test_json_mode_integration.py +554 -0
- package/python/tests/test_action_executor.py +594 -0
- package/python/tests/test_archetype_scoring.py +1027 -0
- package/python/tests/test_atropos_integration.py +360 -0
- package/python/tests/test_evaluation.py +727 -0
- package/python/tests/test_format_validator.py +486 -0
- package/python/tests/test_kl_controller.py +432 -0
- package/python/tests/test_lr_scheduler.py +579 -0
- package/python/tests/test_multi_turn.py +590 -0
- package/python/tests/test_online_env.py +519 -0
- package/python/tests/test_quality_scorer.py +474 -0
- package/python/tests/test_scenario_pool.py +735 -0
- package/python/tests/test_service_manager.py +585 -0
- package/python/tests/test_simulation_rollout.py +581 -0
- package/python/tests/test_tokenization_utils.py +501 -0
- package/python/tests/test_training_orchestrator.py +497 -0
- package/python/tests/test_training_output_structure.py +661 -0
- package/research-output/training-runs/training-run-1770772042899.json +26 -0
- package/research-output/training-runs/training-run-1770930079670.json +32 -0
- package/research-output/training-runs/training-run-1770930143700.json +44 -0
- package/research-output/training-runs/training-run-1770930183638.json +38 -0
- package/research-output/training-runs/training-run-1770930442049.json +38 -0
- package/research-output/training-runs/training-run-1770930793243.json +38 -0
- package/scripts/assess-training-data.ts +422 -0
- package/scripts/e2e-training-test.ts +550 -0
- package/scripts/export-rubrics.ts +64 -0
- package/scripts/generate-research-report.ts +1523 -0
- package/scripts/generate_dataset.sh +173 -0
- package/scripts/json-mode-benchmark.ts +399 -0
- package/scripts/real-archetype-benchmark.ts +210 -0
- package/scripts/run-baseline-comparison.ts +116 -0
- package/scripts/run-full-pipeline.ts +272 -0
- package/scripts/runpod_setup.sh +137 -0
- package/scripts/runpod_validate.sh +147 -0
- package/scripts/test-model-in-game.ts +955 -0
- package/scripts/test-scoring.ts +73 -0
- package/scripts/test-trained-model.ts +209 -0
- package/scripts/train-and-test.ts +824 -0
- package/scripts/verify-final.ts +118 -0
- package/src/adapter.ts +516 -0
- package/src/archetypes/ArchetypeConfigService.ts +626 -0
- package/src/archetypes/derive-archetype.ts +249 -0
- package/src/archetypes/index.ts +22 -0
- package/src/benchmark/ArchetypeMatchupBenchmark.ts +825 -0
- package/src/benchmark/BenchmarkChartGenerator.ts +748 -0
- package/src/benchmark/BenchmarkDataGenerator.ts +1288 -0
- package/src/benchmark/BenchmarkDataViewer.ts +324 -0
- package/src/benchmark/BenchmarkHistoryService.ts +221 -0
- package/src/benchmark/BenchmarkRunner.ts +685 -0
- package/src/benchmark/BenchmarkValidator.ts +206 -0
- package/src/benchmark/FastEvalRunner.ts +225 -0
- package/src/benchmark/MetricsValidator.ts +165 -0
- package/src/benchmark/MetricsVisualizer.ts +909 -0
- package/src/benchmark/ModelBenchmarkService.ts +611 -0
- package/src/benchmark/ModelRegistry.ts +158 -0
- package/src/benchmark/RulerBenchmarkIntegration.ts +235 -0
- package/src/benchmark/SimulationA2AInterface.ts +1169 -0
- package/src/benchmark/SimulationEngine.ts +832 -0
- package/src/benchmark/__tests__/BenchmarkRunner.test.ts +534 -0
- package/src/benchmark/__tests__/HeadToHead.test.ts +126 -0
- package/src/benchmark/index.ts +89 -0
- package/src/benchmark/parseSimulationMetrics.ts +124 -0
- package/src/benchmark/simulation-types.ts +78 -0
- package/src/dependencies.ts +439 -0
- package/src/generation/TrajectoryGenerator.ts +387 -0
- package/src/generation/index.ts +12 -0
- package/src/huggingface/HuggingFaceDatasetUploader.ts +636 -0
- package/src/huggingface/HuggingFaceIntegrationService.ts +426 -0
- package/src/huggingface/HuggingFaceModelUploader.ts +532 -0
- package/src/huggingface/index.ts +27 -0
- package/src/huggingface/shared/HuggingFaceUploadUtil.ts +206 -0
- package/src/index.ts +102 -0
- package/src/init-training.ts +53 -0
- package/src/metrics/TrajectoryMetricsExtractor.ts +653 -0
- package/src/metrics/__tests__/TrajectoryMetricsExtractor.test.ts +759 -0
- package/src/metrics/index.ts +8 -0
- package/src/metrics/types.ts +200 -0
- package/src/rubrics/__tests__/index.test.ts +184 -0
- package/src/rubrics/ass-kisser.ts +85 -0
- package/src/rubrics/degen.ts +80 -0
- package/src/rubrics/goody-twoshoes.ts +84 -0
- package/src/rubrics/index.ts +236 -0
- package/src/rubrics/information-trader.ts +84 -0
- package/src/rubrics/infosec.ts +101 -0
- package/src/rubrics/liar.ts +104 -0
- package/src/rubrics/perps-trader.ts +87 -0
- package/src/rubrics/researcher.ts +81 -0
- package/src/rubrics/scammer.ts +82 -0
- package/src/rubrics/social-butterfly.ts +73 -0
- package/src/rubrics/super-predictor.ts +97 -0
- package/src/rubrics/trader.ts +67 -0
- package/src/scoring/ArchetypeScoringService.ts +486 -0
- package/src/scoring/JudgePromptBuilder.ts +556 -0
- package/src/scoring/LLMJudgeCache.ts +401 -0
- package/src/scoring/index.ts +9 -0
- package/src/training/AutomationPipeline.ts +916 -0
- package/src/training/BenchmarkService.ts +518 -0
- package/src/training/ConfigValidator.ts +220 -0
- package/src/training/MarketOutcomesTracker.ts +187 -0
- package/src/training/ModelDeployer.ts +186 -0
- package/src/training/ModelFetcher.ts +76 -0
- package/src/training/ModelSelectionService.ts +341 -0
- package/src/training/ModelUsageVerifier.ts +160 -0
- package/src/training/MultiModelOrchestrator.ts +580 -0
- package/src/training/RLModelConfig.ts +407 -0
- package/src/training/RewardBackpropagationService.ts +149 -0
- package/src/training/RulerScoringService.ts +666 -0
- package/src/training/TrainingMonitor.ts +166 -0
- package/src/training/TrajectoryRecorder.ts +399 -0
- package/src/training/__tests__/TrajectoryRecorder.test.ts +472 -0
- package/src/training/index.ts +100 -0
- package/src/training/logRLConfig.ts +34 -0
- package/src/training/pipeline.ts +129 -0
- package/src/training/storage/ModelStorageService.ts +279 -0
- package/src/training/storage/TrainingDataArchiver.ts +197 -0
- package/src/training/storage/index.ts +17 -0
- package/src/training/types.ts +207 -0
- package/src/training/window-utils.ts +138 -0
- package/src/utils/index.ts +101 -0
- package/src/utils/logger.ts +59 -0
- package/src/utils/snowflake.ts +17 -0
- package/src/utils/synthetic-detector.ts +111 -0
- package/tsconfig.json +20 -0
|
@@ -0,0 +1,501 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Tests for Tokenization Utilities
|
|
3
|
+
|
|
4
|
+
Tests cover:
|
|
5
|
+
- Proper prompt/completion masking with -100/token_id format
|
|
6
|
+
- Multi-turn conversation masking
|
|
7
|
+
- Mask validation
|
|
8
|
+
- Historical mask fixing
|
|
9
|
+
|
|
10
|
+
MASK FORMAT:
|
|
11
|
+
- mask = -100: Prompt token, ignored in loss calculation
|
|
12
|
+
- mask = token_id: Completion token, trained on
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import pytest
|
|
16
|
+
from unittest.mock import MagicMock, patch
|
|
17
|
+
|
|
18
|
+
from src.training.tokenization_utils import (
|
|
19
|
+
TokenizationResult,
|
|
20
|
+
tokenize_for_trainer,
|
|
21
|
+
tokenize_conversation_for_trainer,
|
|
22
|
+
validate_masks,
|
|
23
|
+
create_masks_from_response_start,
|
|
24
|
+
fix_historical_masks,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# =============================================================================
|
|
29
|
+
# Mock Tokenizer
|
|
30
|
+
# =============================================================================
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class MockTokenizer:
    """Tiny word-level stand-in tokenizer used by the tests in this module.

    Role/control tokens occupy ids 1-4; known content words start at 10;
    unknown words map deterministically to ``100 + len(word)``.
    """

    def __init__(self):
        self.vocab = {
            "<|system|>": 1,
            "<|user|>": 2,
            "<|assistant|>": 3,
            "<|end|>": 4,
            "hello": 10,
            "world": 11,
            "how": 12,
            "are": 13,
            "you": 14,
            "i": 15,
            "am": 16,
            "fine": 17,
            "thanks": 18,
            "for": 19,
            "asking": 20,
        }
        # Inverse lookup table for decode().
        self.reverse_vocab = {tok_id: word for word, tok_id in self.vocab.items()}

    def encode(self, text: str, add_special_tokens: bool = True) -> list:
        """Encode *text* word by word (whitespace split, lowercase)."""
        # Pad special-token markers with spaces so split() isolates them.
        normalized = text.lower().replace("<|", " <|").replace("|>", "|> ")
        token_ids = []
        for raw in normalized.split():
            word = raw.strip()
            # Unknown words get a synthetic id derived from their length.
            token_ids.append(self.vocab.get(word, 100 + len(word)))
        return token_ids

    def decode(self, tokens: list) -> str:
        """Decode token ids back to a space-joined string; unknown ids render as [id]."""
        return " ".join(self.reverse_vocab.get(t, f"[{t}]") for t in tokens)

    def apply_chat_template(
        self,
        messages: list,
        return_tensors=None,
        add_generation_prompt: bool = False,
    ) -> list:
        """Mock chat template application: role token, content tokens, end token."""
        role_ids = {"system": 1, "user": 2, "assistant": 3}
        token_ids = []

        for message in messages:
            role = message.get("role", "user")
            content = message.get("content", "")

            # Role marker first (unknown roles get no marker, matching the
            # original if/elif chain).
            if role in role_ids:
                token_ids.append(role_ids[role])

            # Then the message body.
            token_ids.extend(self.encode(content, add_special_tokens=False))

            # Close the turn with <|end|>.
            token_ids.append(4)

        if add_generation_prompt:
            token_ids.append(3)  # open a fresh assistant turn

        return token_ids
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
# =============================================================================
|
|
112
|
+
# TokenizationResult Tests
|
|
113
|
+
# =============================================================================
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
class TestTokenizationResult:
    """Behaviour of the TokenizationResult dataclass."""

    def test_creation(self):
        # Current mask format: -100 marks prompt positions; completion
        # positions carry the actual token id.
        token_ids = [1, 2, 3, 4, 5]
        label_masks = [-100, -100, 3, 4, 5]  # two prompt tokens, three completion
        result = TokenizationResult(
            tokens=token_ids,
            masks=label_masks,
            prompt_length=2,
            completion_length=3,
            total_length=5,
        )

        assert len(result.tokens) == 5
        assert result.prompt_length == 2
        assert result.completion_length == 3
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
# =============================================================================
|
|
137
|
+
# tokenize_for_trainer Tests
|
|
138
|
+
# =============================================================================
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
class TestTokenizeForTrainer:
    """Behaviour of tokenize_for_trainer."""

    def test_empty_messages(self):
        result = tokenize_for_trainer(MockTokenizer(), [])

        assert result.tokens == []
        assert result.masks == []
        assert result.total_length == 0

    def test_prompt_only(self):
        conversation = [
            {"role": "user", "content": "hello world"},
        ]

        result = tokenize_for_trainer(
            MockTokenizer(), conversation, add_generation_prompt=True
        )

        # With no assistant turn, every position is prompt and masked -100.
        assert all(mask == -100 for mask in result.masks)
        assert result.completion_length == 0

    def test_with_assistant_response(self):
        conversation = [
            {"role": "user", "content": "hello"},
            {"role": "assistant", "content": "world"},
        ]

        result = tokenize_for_trainer(MockTokenizer(), conversation)

        # Expect a mixture: prompt positions are -100, completion positions
        # carry real token ids.
        assert any(mask == -100 for mask in result.masks)
        assert any(mask != -100 for mask in result.masks)
        assert len(result.masks) == len(result.tokens)

        # Every completion mask must echo its token id.
        for i, (token, mask) in enumerate(zip(result.tokens, result.masks)):
            if mask != -100:
                assert mask == token, f"Mask at pos {i} should equal token for completion"

    def test_with_system_prompt(self):
        conversation = [
            {"role": "system", "content": "you are helpful"},
            {"role": "user", "content": "hello"},
            {"role": "assistant", "content": "hi"},
        ]

        result = tokenize_for_trainer(MockTokenizer(), conversation)

        # System + user form the prompt (-100); the assistant turn is the
        # completion (token ids), and the two parts cover the sequence.
        assert result.prompt_length > 0
        assert result.completion_length > 0
        assert result.prompt_length + result.completion_length == result.total_length

    def test_multiple_turns(self):
        conversation = [
            {"role": "user", "content": "hello"},
            {"role": "assistant", "content": "hi"},
            {"role": "user", "content": "how are you"},
            {"role": "assistant", "content": "fine thanks"},
        ]

        result = tokenize_for_trainer(MockTokenizer(), conversation)

        # Only the final assistant turn should be trained on.
        assert result.completion_length > 0
        assert len(result.tokens) > 0
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
# =============================================================================
|
|
217
|
+
# tokenize_conversation_for_trainer Tests
|
|
218
|
+
# =============================================================================
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
class TestTokenizeConversationForTrainer:
    """Behaviour of tokenize_conversation_for_trainer."""

    def test_empty_messages(self):
        result = tokenize_conversation_for_trainer(MockTokenizer(), [])

        assert result.tokens == []
        assert result.masks == []

    def test_single_turn(self):
        conversation = [
            {"role": "user", "content": "hello"},
            {"role": "assistant", "content": "hi"},
        ]

        result = tokenize_conversation_for_trainer(MockTokenizer(), conversation)

        # User turn masked (-100); assistant turn unmasked (token ids).
        assert result.prompt_length > 0
        assert result.completion_length > 0

    def test_multi_turn_all_assistants_unmasked(self):
        conversation = [
            {"role": "user", "content": "hello"},
            {"role": "assistant", "content": "hi"},
            {"role": "user", "content": "how"},
            {"role": "assistant", "content": "fine"},
        ]

        result = tokenize_conversation_for_trainer(MockTokenizer(), conversation)

        # Both assistant turns should contribute trainable tokens.
        assert result.completion_length > 0
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
# =============================================================================
|
|
261
|
+
# validate_masks Tests
|
|
262
|
+
# =============================================================================
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
class TestValidateMasks:
    """Behaviour of validate_masks under the -100/token_id mask format."""

    def test_valid_masks(self):
        token_ids = [1, 2, 3, 4, 5]
        # Prompt positions -100, completion positions echo their token id.
        label_masks = [-100, -100, 3, 4, 5]

        is_valid, issues = validate_masks(token_ids, label_masks, MockTokenizer())

        assert is_valid is True
        assert issues == []

    def test_length_mismatch(self):
        token_ids = [1, 2, 3, 4, 5]
        label_masks = [-100, -100, 3]  # shorter than tokens

        is_valid, issues = validate_masks(token_ids, label_masks, MockTokenizer())

        assert is_valid is False
        assert any("Length mismatch" in issue for issue in issues)

    def test_legacy_format_detected(self):
        token_ids = [1, 2, 3, 4, 5]
        label_masks = [0, 0, 1, 1, 1]  # old 0/1 style must be rejected

        is_valid, issues = validate_masks(token_ids, label_masks, MockTokenizer())

        assert is_valid is False
        assert any("LEGACY MASK FORMAT" in issue for issue in issues)

    def test_all_masked(self):
        token_ids = [1, 2, 3, 4, 5]
        label_masks = [-100] * 5  # nothing left to train on

        is_valid, issues = validate_masks(token_ids, label_masks, MockTokenizer())

        assert is_valid is False
        assert any("No unmasked tokens" in issue for issue in issues)

    def test_all_unmasked(self):
        token_ids = [1, 2, 3, 4, 5]
        # Every mask equals its token, i.e. no prompt region at all.
        label_masks = list(token_ids)

        is_valid, issues = validate_masks(token_ids, label_masks, MockTokenizer())

        assert is_valid is False
        assert any("No masked tokens" in issue for issue in issues)

    def test_mask_token_mismatch(self):
        token_ids = [1, 2, 3, 4, 5]
        # Position 2 holds token 3, but the mask claims 99.
        label_masks = [-100, -100, 99, 4, 5]

        is_valid, issues = validate_masks(token_ids, label_masks, MockTokenizer())

        assert is_valid is False
        assert any("Mask mismatch" in issue for issue in issues)
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
# =============================================================================
|
|
333
|
+
# create_masks_from_response_start Tests
|
|
334
|
+
# =============================================================================
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
class TestCreateMasksFromResponseStart:
    """Behaviour of create_masks_from_response_start with the -100/token_id format."""

    def test_normal_case(self):
        masks = create_masks_from_response_start([1, 2, 3, 4, 5], 3)

        # Positions before the response start are prompt (-100); the rest
        # echo their token ids.
        assert masks == [-100, -100, -100, 4, 5]

    def test_start_at_beginning(self):
        masks = create_masks_from_response_start([1, 2, 3, 4, 5], 0)

        # Response starts at index 0: everything is completion.
        assert masks == [1, 2, 3, 4, 5]

    def test_start_at_end(self):
        masks = create_masks_from_response_start([1, 2, 3, 4, 5], 5)

        # Response starts past the last token: everything is prompt.
        assert masks == [-100, -100, -100, -100, -100]

    def test_negative_start_clamps(self):
        masks = create_masks_from_response_start([1, 2, 3, 4, 5], -10)

        # Negative start clamps to 0, so everything is completion.
        assert masks == [1, 2, 3, 4, 5]

    def test_beyond_end_clamps(self):
        masks = create_masks_from_response_start([1, 2, 3, 4, 5], 100)

        # Start beyond the end clamps to the length, so everything is prompt.
        assert masks == [-100, -100, -100, -100, -100]
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
# =============================================================================
|
|
387
|
+
# fix_historical_masks Tests
|
|
388
|
+
# =============================================================================
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
class TestFixHistoricalMasks:
    """Behaviour of fix_historical_masks."""

    def test_all_ones_detected_and_fixed(self):
        # system "hello", user "world", assistant "how"
        token_ids = [1, 10, 4, 2, 11, 4, 3, 12, 4]
        legacy_masks = [1] * 9  # broken legacy all-ones format
        conversation = [
            {"role": "system", "content": "hello"},
            {"role": "user", "content": "world"},
            {"role": "assistant", "content": "how"},
        ]

        fixed = fix_historical_masks(token_ids, legacy_masks, MockTokenizer(), conversation)

        # Prompt positions must now be -100.
        assert any(m == -100 for m in fixed)

    def test_legacy_zeros_ones_fixed(self):
        # user "hello", assistant "world"
        token_ids = [2, 10, 4, 3, 11, 4]
        legacy_masks = [0, 0, 0, 1, 1, 1]  # old 0/1 format
        conversation = [
            {"role": "user", "content": "hello"},
            {"role": "assistant", "content": "world"},
        ]

        fixed = fix_historical_masks(token_ids, legacy_masks, MockTokenizer(), conversation)

        # Converted into the -100/token_id representation.
        assert any(m == -100 for m in fixed)
        assert any(m != -100 and m > 0 for m in fixed)

    def test_already_valid_unchanged(self):
        # user "hello", assistant "world"
        token_ids = [2, 10, 4, 3, 11, 4]
        # Already in the correct format: -100 prompt, token-id completion.
        correct_masks = [-100, -100, -100, 3, 11, 4]
        conversation = [
            {"role": "user", "content": "hello"},
            {"role": "assistant", "content": "world"},
        ]

        fixed = fix_historical_masks(token_ids, correct_masks, MockTokenizer(), conversation)

        assert fixed == correct_masks
|
|
437
|
+
|
|
438
|
+
|
|
439
|
+
# =============================================================================
|
|
440
|
+
# Integration Tests
|
|
441
|
+
# =============================================================================
|
|
442
|
+
|
|
443
|
+
|
|
444
|
+
class TestIntegration:
    """End-to-end flows combining tokenization, mask repair, and validation."""

    def test_tokenize_validate_flow(self):
        tokenizer = MockTokenizer()
        conversation = [
            {"role": "system", "content": "you are helpful"},
            {"role": "user", "content": "hello world"},
            {"role": "assistant", "content": "hi there"},
        ]

        # Tokenize, then immediately validate the produced masks.
        result = tokenize_for_trainer(tokenizer, conversation)
        is_valid, issues = validate_masks(result.tokens, result.masks, tokenizer)

        assert len(result.tokens) > 0
        assert len(result.masks) == len(result.tokens)
        assert is_valid is True, f"Validation failed: {issues}"

    def test_fix_and_validate_flow(self):
        tokenizer = MockTokenizer()
        conversation = [
            {"role": "user", "content": "hello"},
            {"role": "assistant", "content": "world"},
        ]

        # Simulate broken historical masks (legacy all-ones format).
        token_ids = tokenizer.apply_chat_template(conversation)
        broken_masks = [1] * len(token_ids)

        repaired = fix_historical_masks(token_ids, broken_masks, tokenizer, conversation)

        # Prompt positions are now -100 ...
        assert any(m == -100 for m in repaired)

        # ... and the repaired masks pass validation.
        is_valid, issues = validate_masks(token_ids, repaired, tokenizer)
        assert is_valid is True, f"Fixed masks should be valid: {issues}"

    def test_completion_tokens_match_in_masks(self):
        """Verify that completion masks contain actual token IDs"""
        result = tokenize_for_trainer(
            MockTokenizer(),
            [
                {"role": "user", "content": "hello"},
                {"role": "assistant", "content": "world"},
            ],
        )

        # Every unmasked position must echo its token id.
        for i, (token, mask) in enumerate(zip(result.tokens, result.masks)):
            if mask != -100:
                assert mask == token, \
                    f"Position {i}: mask {mask} should equal token {token}"
|