textpolicy 0.0.1.tar.gz → 0.1.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. textpolicy-0.1.0/PKG-INFO +99 -0
  2. textpolicy-0.1.0/README.md +75 -0
  3. textpolicy-0.1.0/pyproject.toml +30 -0
  4. textpolicy-0.1.0/tests/test_gspo_verification.py +215 -0
  5. textpolicy-0.1.0/tests/test_integration_e2e_training.py +49 -0
  6. textpolicy-0.1.0/tests/test_reward_signatures.py +131 -0
  7. textpolicy-0.1.0/tests/test_rollout_rewards.py +228 -0
  8. textpolicy-0.1.0/tests/test_runner_step_enforcement.py +80 -0
  9. textpolicy-0.1.0/tests/test_validate_installation.py +12 -0
  10. textpolicy-0.1.0/textpolicy/__init__.py +52 -0
  11. textpolicy-0.1.0/textpolicy/__main__.py +8 -0
  12. textpolicy-0.1.0/textpolicy/algorithms/__init__.py +54 -0
  13. textpolicy-0.1.0/textpolicy/algorithms/grpo.py +642 -0
  14. textpolicy-0.1.0/textpolicy/algorithms/gspo.py +582 -0
  15. textpolicy-0.1.0/textpolicy/buffer/__init__.py +23 -0
  16. textpolicy-0.1.0/textpolicy/buffer/buffer.py +244 -0
  17. textpolicy-0.1.0/textpolicy/buffer/episode.py +383 -0
  18. textpolicy-0.1.0/textpolicy/buffer/sampling.py +438 -0
  19. textpolicy-0.1.0/textpolicy/buffer/storage.py +255 -0
  20. textpolicy-0.1.0/textpolicy/cli.py +67 -0
  21. textpolicy-0.1.0/textpolicy/environment/__init__.py +79 -0
  22. textpolicy-0.1.0/textpolicy/environment/base.py +110 -0
  23. textpolicy-0.1.0/textpolicy/environment/environment.py +46 -0
  24. textpolicy-0.1.0/textpolicy/environment/factory.py +103 -0
  25. textpolicy-0.1.0/textpolicy/environment/gym.py +106 -0
  26. textpolicy-0.1.0/textpolicy/environment/task_suites.py +51 -0
  27. textpolicy-0.1.0/textpolicy/environment/text_generation.py +789 -0
  28. textpolicy-0.1.0/textpolicy/environment/vectorized.py +253 -0
  29. textpolicy-0.1.0/textpolicy/generation/__init__.py +62 -0
  30. textpolicy-0.1.0/textpolicy/generation/lora.py +411 -0
  31. textpolicy-0.1.0/textpolicy/generation/mlx_generation.py +557 -0
  32. textpolicy-0.1.0/textpolicy/generation/reload.py +253 -0
  33. textpolicy-0.1.0/textpolicy/rewards/__init__.py +137 -0
  34. textpolicy-0.1.0/textpolicy/rewards/adapters.py +387 -0
  35. textpolicy-0.1.0/textpolicy/rewards/basic.py +214 -0
  36. textpolicy-0.1.0/textpolicy/rewards/integrated_system.py +338 -0
  37. textpolicy-0.1.0/textpolicy/rewards/mlx_batch_processor.py +447 -0
  38. textpolicy-0.1.0/textpolicy/rewards/registry.py +293 -0
  39. textpolicy-0.1.0/textpolicy/rewards/rollout_rewards.py +410 -0
  40. textpolicy-0.1.0/textpolicy/rewards/verifiers.py +369 -0
  41. textpolicy-0.1.0/textpolicy/rollout/__init__.py +44 -0
  42. textpolicy-0.1.0/textpolicy/rollout/aggregator.py +145 -0
  43. textpolicy-0.1.0/textpolicy/rollout/base.py +108 -0
  44. textpolicy-0.1.0/textpolicy/rollout/rollout.py +142 -0
  45. textpolicy-0.1.0/textpolicy/rollout/runner.py +280 -0
  46. textpolicy-0.1.0/textpolicy/rollout/strategy.py +208 -0
  47. textpolicy-0.1.0/textpolicy/rollout/worker.py +194 -0
  48. textpolicy-0.1.0/textpolicy/training/__init__.py +14 -0
  49. textpolicy-0.1.0/textpolicy/training/metrics.py +242 -0
  50. textpolicy-0.1.0/textpolicy/training/rollout_manager.py +78 -0
  51. textpolicy-0.1.0/textpolicy/training/trainer.py +684 -0
  52. textpolicy-0.1.0/textpolicy/utils/__init__.py +40 -0
  53. textpolicy-0.1.0/textpolicy/utils/benchmarking.py +489 -0
  54. textpolicy-0.1.0/textpolicy/utils/data.py +60 -0
  55. textpolicy-0.1.0/textpolicy/utils/debug.py +170 -0
  56. textpolicy-0.1.0/textpolicy/utils/environment.py +349 -0
  57. textpolicy-0.1.0/textpolicy/utils/logging/__init__.py +22 -0
  58. textpolicy-0.1.0/textpolicy/utils/logging/base.py +48 -0
  59. textpolicy-0.1.0/textpolicy/utils/logging/console.py +61 -0
  60. textpolicy-0.1.0/textpolicy/utils/logging/factory.py +133 -0
  61. textpolicy-0.1.0/textpolicy/utils/logging/multi.py +83 -0
  62. textpolicy-0.1.0/textpolicy/utils/logging/tensorboard.py +65 -0
  63. textpolicy-0.1.0/textpolicy/utils/logging/wandb.py +72 -0
  64. textpolicy-0.1.0/textpolicy/utils/memory.py +118 -0
  65. textpolicy-0.1.0/textpolicy/utils/performance.py +464 -0
  66. textpolicy-0.1.0/textpolicy/utils/timing.py +171 -0
  67. textpolicy-0.1.0/textpolicy/validate.py +101 -0
  68. textpolicy-0.1.0/textpolicy/validation/__init__.py +13 -0
  69. textpolicy-0.1.0/textpolicy/validation/logprob_validation.py +315 -0
  70. textpolicy-0.1.0/textpolicy.egg-info/PKG-INFO +99 -0
  71. textpolicy-0.1.0/textpolicy.egg-info/SOURCES.txt +75 -0
  72. textpolicy-0.1.0/textpolicy.egg-info/entry_points.txt +2 -0
  73. textpolicy-0.1.0/textpolicy.egg-info/requires.txt +17 -0
  74. textpolicy-0.0.1/PKG-INFO +0 -10
  75. textpolicy-0.0.1/README.md +0 -1
  76. textpolicy-0.0.1/pyproject.toml +0 -7
  77. textpolicy-0.0.1/textpolicy/__init__.py +0 -0
  78. textpolicy-0.0.1/textpolicy.egg-info/PKG-INFO +0 -10
  79. textpolicy-0.0.1/textpolicy.egg-info/SOURCES.txt +0 -8
  80. {textpolicy-0.0.1 → textpolicy-0.1.0}/LICENSE +0 -0
  81. {textpolicy-0.0.1 → textpolicy-0.1.0}/setup.cfg +0 -0
  82. {textpolicy-0.0.1 → textpolicy-0.1.0}/textpolicy.egg-info/dependency_links.txt +0 -0
  83. {textpolicy-0.0.1 → textpolicy-0.1.0}/textpolicy.egg-info/top_level.txt +0 -0
textpolicy-0.1.0/PKG-INFO
@@ -0,0 +1,99 @@
+ Metadata-Version: 2.4
+ Name: textpolicy
+ Version: 0.1.0
+ Summary: MLX-optimized reward and verification system for text generation RL
+ Requires-Python: >=3.12
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: numpy>=2.3.2
+ Requires-Dist: mlx>=0.21.0
+ Requires-Dist: mlx-lm>=0.21.0
+ Requires-Dist: gymnasium>=0.29.0
+ Requires-Dist: psutil>=7.0.0
+ Requires-Dist: wandb>=0.21.1
+ Requires-Dist: aiohttp>=3.12.15
+ Requires-Dist: pytest>=8.4.1
+ Provides-Extra: external
+ Requires-Dist: aiohttp>=3.8.0; extra == "external"
+ Requires-Dist: pydantic>=2.0.0; extra == "external"
+ Provides-Extra: dev
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
+ Requires-Dist: black>=22.0.0; extra == "dev"
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
+ Dynamic: license-file
+
+ # TextPolicy
+
+ Reinforcement learning toolkit for text generation on MLX (Apple Silicon).
+ TextPolicy provides algorithms (GRPO/GSPO), text-generation environments, a rollout runner,
+ reward functions with a decorator registry, and LoRA/QLoRA utilities.
+
+ ## Install (uv)
+
+ ```bash
+ uv add textpolicy
+ ```
+
+ Optional model integration:
+
+ ```bash
+ uv add mlx mlx-lm
+ ```
+
+ ## Quickstart
+
+ Working example using a real model and tokenizer (mlx-lm required):
+
+ ```python
+ import mlx.core as mx
+ import textpolicy as tp
+ from textpolicy import load_model, create_policy
+ from textpolicy.environment.text_generation import TextGenerationEnv
+ from textpolicy.rollout import RolloutRunner, create_strategy
+
+ # 1) Load model and tokenizer (mlx-lm)
+ model, tokenizer = load_model("Qwen/Qwen3-0.6B")
+
+ # 2) Create a policy (controls generation)
+ generation_params = {"max_tokens": 25, "temperature": 0.7}
+ policy_fn = create_policy(model, tokenizer, generation_params)
+
+ # 3) Define a reward function (env uses this to score responses)
+ @tp.reward
+ def length_reward(prompt: str, completion: str, example: dict, **kwargs) -> float:
+     return float(len(completion.split()))
+
+ # 4) Create an environment (requires a tokenizer)
+ env = TextGenerationEnv(["What is AI?"], length_reward, tokenizer=tokenizer)
+
+ # 5) Collect one rollout step
+ strategy = create_strategy('grpo')
+ runner = RolloutRunner(env, policy=policy_fn, strategy=strategy, max_steps=1)
+ buffer = runner.collect()
+ print(len(buffer.episodes))
+ ```
+
+ Docs:
+ - Quickstart: `docs/QUICKSTART_UV.md`
+ - LoRA/QLoRA: `docs/10_lora_qlora.md`
+ - Full index: `docs/index.md`
+
+ FAQ:
+ - Do I need a model?
+   - Yes for generation with `create_policy`.
+     Use `load_model()` (mlx-lm) to get `(model, tokenizer)`.
+     For reward-only code (no generation), a model is not required.
+ - Do I need a tokenizer?
+   - Yes.
+     Both `TextGenerationEnv` and `TextGenerationEnvironment` require a tokenizer.
+     `load_model()` returns one for mlx-lm models.
+ - How do I control generation?
+   - Pass `generation_params` to `create_policy` (for example, `max_tokens`, `temperature`, `top_p`, `repetition_penalty`).
+ - What does `step()` return?
+   - A dict with `observation`, `reward`, `terminated`, `truncated`, `info`. The runner enforces this.
+
+ Examples:
+ - 01–06: reward functions, batch processing, minimal training
+ - 08: GRPO training with rollout + buffer
+ - 09–10: length reduction (GRPO/GSPO)
+ - 11: LoRA/QLoRA configuration
textpolicy-0.1.0/README.md
@@ -0,0 +1,75 @@
+ # TextPolicy
+
+ Reinforcement learning toolkit for text generation on MLX (Apple Silicon).
+ TextPolicy provides algorithms (GRPO/GSPO), text-generation environments, a rollout runner,
+ reward functions with a decorator registry, and LoRA/QLoRA utilities.
+
+ ## Install (uv)
+
+ ```bash
+ uv add textpolicy
+ ```
+
+ Optional model integration:
+
+ ```bash
+ uv add mlx mlx-lm
+ ```
+
+ ## Quickstart
+
+ Working example using a real model and tokenizer (mlx-lm required):
+
+ ```python
+ import mlx.core as mx
+ import textpolicy as tp
+ from textpolicy import load_model, create_policy
+ from textpolicy.environment.text_generation import TextGenerationEnv
+ from textpolicy.rollout import RolloutRunner, create_strategy
+
+ # 1) Load model and tokenizer (mlx-lm)
+ model, tokenizer = load_model("Qwen/Qwen3-0.6B")
+
+ # 2) Create a policy (controls generation)
+ generation_params = {"max_tokens": 25, "temperature": 0.7}
+ policy_fn = create_policy(model, tokenizer, generation_params)
+
+ # 3) Define a reward function (env uses this to score responses)
+ @tp.reward
+ def length_reward(prompt: str, completion: str, example: dict, **kwargs) -> float:
+     return float(len(completion.split()))
+
+ # 4) Create an environment (requires a tokenizer)
+ env = TextGenerationEnv(["What is AI?"], length_reward, tokenizer=tokenizer)
+
+ # 5) Collect one rollout step
+ strategy = create_strategy('grpo')
+ runner = RolloutRunner(env, policy=policy_fn, strategy=strategy, max_steps=1)
+ buffer = runner.collect()
+ print(len(buffer.episodes))
+ ```
+
+ Docs:
+ - Quickstart: `docs/QUICKSTART_UV.md`
+ - LoRA/QLoRA: `docs/10_lora_qlora.md`
+ - Full index: `docs/index.md`
+
+ FAQ:
+ - Do I need a model?
+   - Yes for generation with `create_policy`.
+     Use `load_model()` (mlx-lm) to get `(model, tokenizer)`.
+     For reward-only code (no generation), a model is not required.
+ - Do I need a tokenizer?
+   - Yes.
+     Both `TextGenerationEnv` and `TextGenerationEnvironment` require a tokenizer.
+     `load_model()` returns one for mlx-lm models.
+ - How do I control generation?
+   - Pass `generation_params` to `create_policy` (for example, `max_tokens`, `temperature`, `top_p`, `repetition_penalty`).
+ - What does `step()` return?
+   - A dict with `observation`, `reward`, `terminated`, `truncated`, `info`. The runner enforces this.
+
+ Examples:
+ - 01–06: reward functions, batch processing, minimal training
+ - 08: GRPO training with rollout + buffer
+ - 09–10: length reduction (GRPO/GSPO)
+ - 11: LoRA/QLoRA configuration
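
The `step()` item in the FAQ above can be made concrete with a small sketch. Only the keys come from the FAQ; the example values and comments are illustrative placeholders, not the exact types the environment emits:

```python
# Shape of a step() result as described in the FAQ above (keys only are authoritative).
step_result = {
    "observation": [101, 2023],   # e.g. token ids for the current state (placeholder)
    "reward": 0.75,               # score produced by the registered reward function
    "terminated": True,           # episode ended normally
    "truncated": False,           # episode cut short (e.g. token budget reached)
    "info": {},                   # auxiliary diagnostics
}

# The rollout runner enforces exactly this dict contract.
required_keys = {"observation", "reward", "terminated", "truncated", "info"}
assert required_keys.issubset(step_result)
```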
textpolicy-0.1.0/pyproject.toml
@@ -0,0 +1,30 @@
+ [build-system]
+ requires = ["setuptools>=61.0", "wheel"]
+ build-backend = "setuptools.build_meta"
+
+ [tool.setuptools.packages.find]
+ include = ["textpolicy*"]
+
+ [project]
+ name = "textpolicy"
+ version = "0.1.0"
+ description = "MLX-optimized reward and verification system for text generation RL"
+ readme = "README.md"
+ requires-python = ">=3.12"
+ dependencies = [
+     "numpy>=2.3.2",
+     "mlx>=0.21.0",      # Core MLX framework for Apple Silicon acceleration
+     "mlx-lm>=0.21.0",   # MLX language models for inference
+     "gymnasium>=0.29.0",
+     "psutil>=7.0.0",
+     "wandb>=0.21.1",
+     "aiohttp>=3.12.15",
+     "pytest>=8.4.1",
+ ]
+
+ [project.scripts]
+ textpolicy = "textpolicy.cli:main"
+
+ [project.optional-dependencies]
+ external = ["aiohttp>=3.8.0", "pydantic>=2.0.0"]
+ dev = ["pytest>=7.0.0", "black>=22.0.0", "ruff>=0.1.0"]
textpolicy-0.1.0/tests/test_gspo_verification.py
@@ -0,0 +1,215 @@
+ """
+ GSPO Verification Tests - Comprehensive Testing of GSPO Implementation
+
+ This test module verifies that GSPO is working correctly by testing:
+ 1. Basic functionality of GSPO components
+ 2. Comparison with GRPO behavior
+ 3. Sequence-level vs token-level importance sampling
+ 4. Mathematical correctness of importance weights
+ 5. Training dynamics and convergence
+ """
+
+ import pytest
+ import mlx.core as mx
+ import mlx.optimizers as optim
+ import numpy as np
+ from textpolicy.algorithms import grpo, gspo
+ from textpolicy.generation.mlx_generation import load_model, create_policy
+ from textpolicy.rollout import RolloutCoordinator
+ from textpolicy.buffer import Buffer
+ from textpolicy.training import Trainer
+
+
+ @pytest.mark.unit
+ @pytest.mark.algorithm
+ class TestGSPOBasicFunctionality:
+     """Test basic GSPO functions work correctly."""
+
+     def test_sequence_importance_weights(self):
+         """Test sequence-level importance weights computation."""
+         # Create test data
+         old_logprobs = mx.array([-1.0, -1.2, -0.8, -1.1, -0.9])  # 5 tokens
+         new_logprobs = mx.array([-1.1, -1.0, -0.9, -1.0, -1.0])  # 5 tokens
+         sequence_lengths = [2, 3]  # Two sequences: 2 tokens + 3 tokens
+
+         # Test sequence-level importance weights
+         seq_weights = gspo.compute_sequence_importance_weights(
+             old_logprobs, new_logprobs, sequence_lengths, clip_ratio=0.2
+         )
+
+         assert len(seq_weights) == len(sequence_lengths), \
+             f"Expected {len(sequence_lengths)} weights, got {len(seq_weights)}"
+         assert all(not mx.isnan(w) and not mx.isinf(w) for w in seq_weights), \
+             "All weights should be finite"
+
+     def test_gspo_policy_loss(self):
+         """Test GSPO policy loss computation."""
+         old_logprobs = mx.array([-1.0, -1.2, -0.8, -1.1, -0.9])
+         new_logprobs = mx.array([-1.1, -1.0, -0.9, -1.0, -1.0])
+         sequence_lengths = [2, 3]
+         advantages = mx.array([0.5, -0.3])  # Advantages for each sequence
+
+         # Test GSPO policy loss
+         loss = gspo.gspo_policy_loss(
+             old_logprobs, new_logprobs, advantages, sequence_lengths, variant="sequence"
+         )
+
+         assert not mx.isnan(loss) and not mx.isinf(loss), "Loss should be finite"
+         assert isinstance(loss, mx.array), "Loss should be an MLX array"
+
+     def test_hybrid_importance_weights(self):
+         """Test hybrid importance weights computation."""
+         old_logprobs = mx.array([-1.0, -1.2, -0.8, -1.1, -0.9])
+         new_logprobs = mx.array([-1.1, -1.0, -0.9, -1.0, -1.0])
+         sequence_lengths = [2, 3]
+
+         # Test hybrid variant
+         hybrid_weights = gspo.compute_hybrid_importance_weights(
+             old_logprobs, new_logprobs, sequence_lengths
+         )
+
+         assert len(hybrid_weights) == len(old_logprobs), \
+             f"Expected {len(old_logprobs)} hybrid weights, got {len(hybrid_weights)}"
+         assert all(not mx.isnan(w) and not mx.isinf(w) for w in hybrid_weights), \
+             "All hybrid weights should be finite"
+
+
+ @pytest.mark.unit
+ @pytest.mark.algorithm
+ class TestGSPOvsGRPO:
+     """Test that GSPO produces different importance weights than GRPO."""
+
+     def test_importance_weight_differences(self):
+         """Test that GSPO produces different importance weights than GRPO."""
+         # Create test data with clear differences between old and new policies
+         old_logprobs = mx.array([-2.0, -2.0, -1.0, -1.0])  # 4 tokens
+         new_logprobs = mx.array([-1.0, -1.0, -2.0, -2.0])  # Policy changed significantly
+         sequence_lengths = [2, 2]  # Two sequences of equal length
+
+         # Compute GSPO sequence-level importance weights
+         gspo_weights = gspo.compute_sequence_importance_weights(
+             old_logprobs, new_logprobs, sequence_lengths, clip_ratio=0.2
+         )
+
+         # Compute GRPO token-level importance ratios for comparison
+         grpo_ratios = mx.exp(new_logprobs - old_logprobs)
+         grpo_ratios_clipped = mx.clip(grpo_ratios, 0.8, 1.2)
+
+         # GSPO should produce sequence-level weights (2 values)
+         # GRPO produces token-level ratios (4 values)
+         assert len(gspo_weights) == len(sequence_lengths), \
+             f"GSPO should produce {len(sequence_lengths)} sequence weights"
+         assert len(grpo_ratios) == len(old_logprobs), \
+             f"GRPO should produce {len(old_logprobs)} token ratios"
+
+         # The approaches should be fundamentally different
+         # GSPO normalizes by sequence length, GRPO doesn't
+         assert len(gspo_weights) != len(grpo_ratios), \
+             "GSPO and GRPO should produce different numbers of weights"
+
+
+ @pytest.mark.unit
+ @pytest.mark.algorithm
+ class TestGSPOClipping:
+     """Test GSPO clipping behavior."""
+
+     def test_clipping_bounds_respected(self):
+         """Test that importance weights respect clipping bounds."""
+         # Test extreme case to verify clipping
+         old_logprobs = mx.array([-10.0, -1.0])  # Extreme difference
+         new_logprobs = mx.array([-1.0, -1.0])
+         sequence_lengths = [2]
+         clip_ratio = 0.2
+
+         # Compute sequence weights
+         weights = gspo.compute_sequence_importance_weights(
+             old_logprobs, new_logprobs, sequence_lengths, clip_ratio=clip_ratio
+         )
+
+         # Weights should be clipped between (1-clip_ratio) and (1+clip_ratio)
+         lower_bound = 1.0 - clip_ratio
+         upper_bound = 1.0 + clip_ratio
+
+         # Use tolerance for floating-point comparisons due to MLX float32 precision
+         # MLX uses float32 by default, which has precision ~1.19e-7
+         tolerance = 1e-6  # Conservative tolerance for float32 precision issues
+
+         for weight in weights:
+             weight_val = float(weight)  # Convert MLX scalar to Python float
+             assert lower_bound - tolerance <= weight_val <= upper_bound + tolerance, \
+                 f"Weight {weight_val} outside clipping bounds [{lower_bound}, {upper_bound}] with tolerance {tolerance}"
+
+     def test_length_normalization_effect(self):
+         """Test that GSPO properly normalizes by sequence length."""
+         # Identical sequences of different lengths should have similar weights
+         old_logprobs_short = mx.array([-1.0, -1.0])  # 2 tokens
+         new_logprobs_short = mx.array([-0.5, -0.5])  # Better by 0.5 per token
+
+         old_logprobs_long = mx.array([-1.0, -1.0, -1.0, -1.0])  # 4 tokens
+         new_logprobs_long = mx.array([-0.5, -0.5, -0.5, -0.5])  # Better by 0.5 per token
+
+         weight_short = gspo.compute_sequence_importance_weights(
+             old_logprobs_short, new_logprobs_short, [2], clip_ratio=1.0  # No clipping
+         )
+         weight_long = gspo.compute_sequence_importance_weights(
+             old_logprobs_long, new_logprobs_long, [4], clip_ratio=1.0  # No clipping
+         )
+
+         # Both should be similar due to length normalization
+         # Short: exp((sum(-0.5) - sum(-1.0)) / 2) = exp((-1.0 - (-2.0)) / 2) = exp(0.5)
+         # Long:  exp((sum(-0.5) - sum(-1.0)) / 4) = exp((-2.0 - (-4.0)) / 4) = exp(0.5)
+         short_val = float(weight_short[0])
+         long_val = float(weight_long[0])
+
+         # They should be approximately equal due to length normalization
+         assert abs(short_val - long_val) < 0.01, \
+             f"Length normalization failed: short={short_val}, long={long_val}"
+
+
+ @pytest.mark.integration
+ @pytest.mark.algorithm
+ @pytest.mark.slow
+ class TestGSPOTraining:
+     """Integration tests for GSPO training."""
+
+     def test_gspo_training_step(self):
+         """Test a complete GSPO training step."""
+         # This is a minimal integration test
+         # Create minimal test data
+         old_logprobs = mx.array([-1.0, -1.0, -1.0, -1.0])
+         new_logprobs = mx.array([-0.8, -0.8, -1.2, -1.2])
+         advantages = mx.array([0.5, -0.3])
+         sequence_lengths = [2, 2]
+
+         # Test that we can compute a complete loss
+         loss = gspo.gspo_policy_loss(
+             old_logprobs, new_logprobs, advantages, sequence_lengths, variant="sequence"
+         )
+
+         assert not mx.isnan(loss) and not mx.isinf(loss), "Training loss should be finite"
+         assert float(loss) != 0.0, "Loss should be non-zero for non-trivial inputs"
+
+     def test_gspo_metrics_computation(self):
+         """Test GSPO metrics computation."""
+         old_logprobs = mx.array([-1.0, -1.0, -1.0, -1.0])
+         new_logprobs = mx.array([-0.8, -0.8, -1.2, -1.2])
+         advantages = mx.array([0.5, -0.3])
+
+         # Test metrics computation
+         metrics_fn = gspo.create_gspo_metrics(variant="sequence")
+         metrics = metrics_fn(old_logprobs, new_logprobs, advantages)
+
+         assert isinstance(metrics, dict), "Metrics should be a dictionary"
+         assert len(metrics) > 0, "Metrics should not be empty"
+
+         # Check for expected metric keys
+         expected_keys = ['mean_advantage', 'std_advantage']
+         for key in expected_keys:
+             assert key in metrics, f"Missing expected metric: {key}"
+             assert isinstance(metrics[key], (int, float)), \
+                 f"Metric {key} should be numeric, got {type(metrics[key])}"
+
+
+ if __name__ == "__main__":
+     # Allow running this file directly for debugging
+     pytest.main([__file__, "-v"])
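
The length-normalization property checked in `test_length_normalization_effect` reduces to simple arithmetic: the sequence-level weight is the exponential of the length-normalized log-probability difference. A plain-Python sketch of that calculation (independent of the library's `compute_sequence_importance_weights`, and only mirroring the worked numbers in the test's comments):

```python
import math

# Sequence-level importance weight: exp((sum(new) - sum(old)) / length).
# Equal per-token improvement gives the same weight regardless of sequence length.
def sequence_weight(old_logprobs, new_logprobs):
    diff = sum(new_logprobs) - sum(old_logprobs)
    return math.exp(diff / len(old_logprobs))

short = sequence_weight([-1.0, -1.0], [-0.5, -0.5])     # exp(0.5) ~ 1.6487
long = sequence_weight([-1.0] * 4, [-0.5] * 4)          # exp(0.5) ~ 1.6487
assert abs(short - long) < 1e-9
```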
textpolicy-0.1.0/tests/test_integration_e2e_training.py
@@ -0,0 +1,49 @@
+ import pytest
+
+
+ @pytest.mark.integration
+ def test_e2e_minimal_rollout_grpo():
+     """
+     Minimal end-to-end rollout + buffer collection using TextGenerationEnv
+     with a dummy tokenizer and a trivial policy. This validates that
+     the environment returns dict-shaped step results and the runner
+     normalization path works as expected.
+
+     Kept intentionally lightweight for CI (no external model downloads).
+     """
+     try:
+         import mlx.core as mx  # type: ignore
+     except Exception:
+         pytest.skip("MLX not available")
+
+     from textpolicy.environment.text_generation import TextGenerationEnv
+     from textpolicy.rollout.runner import RolloutRunner
+     from textpolicy.rollout.strategy import create_strategy
+
+     class DummyTokenizer:
+         def encode(self, text):
+             return [ord(c) % 256 for c in text]
+
+         def decode(self, ids):
+             return "".join(chr(int(i) % 256) for i in ids)
+
+     def reward_fn(prompt, completion, example, **kwargs) -> float:
+         # Simple length reward in words
+         return float(len(completion.split()))
+
+     # Create simple environment
+     env = TextGenerationEnv(["Hello"], reward_fn, tokenizer=DummyTokenizer())
+
+     # Policy returns tokens that decode to 'a b c'
+     def simple_policy(obs_mx, deterministic=False):
+         return mx.array([97, 32, 98, 32, 99], dtype=mx.int32), {}
+
+     strategy = create_strategy('grpo')
+     runner = RolloutRunner(env, policy=simple_policy, strategy=strategy, max_steps=2)
+
+     buffer = runner.collect()
+     assert len(buffer.episodes) >= 1
+     ep = buffer.episodes[0]
+     # Episode stores rewards in `rew`
+     assert len(ep.rew) >= 1
+     assert all(r > 0 for r in ep.rew)
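
For reference, the reward this trivial policy earns can be verified by hand, since the dummy tokenizer is just an `ord`/`chr` round trip (a standalone check, not part of the test file above):

```python
# The fixed action [97, 32, 98, 32, 99] decodes via chr() to "a b c",
# so the word-count reward is 3.0 on every step of the test above.
ids = [97, 32, 98, 32, 99]
text = "".join(chr(i % 256) for i in ids)
assert text == "a b c"
assert float(len(text.split())) == 3.0
```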
textpolicy-0.1.0/tests/test_reward_signatures.py
@@ -0,0 +1,131 @@
+ """
+ Reward Function Signature Tests
+
+ Test reward function signatures and compatibility to ensure proper integration.
+ """
+
+ import pytest
+ from textpolicy.rewards import length_reward, keyword_reward, perplexity_reward, accuracy_reward
+
+
+ @pytest.mark.unit
+ @pytest.mark.reward
+ class TestRewardFunctionSignatures:
+     """Test reward function signatures for compatibility."""
+
+     def test_reward_functions_import(self):
+         """Test that all reward functions can be imported successfully."""
+         # Test that imports work
+         assert callable(length_reward), "length_reward should be callable"
+         assert callable(keyword_reward), "keyword_reward should be callable"
+         assert callable(perplexity_reward), "perplexity_reward should be callable"
+         assert callable(accuracy_reward), "accuracy_reward should be callable"
+
+     def test_length_reward_signature(self):
+         """Test length_reward function signature."""
+         test_prompt = "What is AI?"
+         test_completion = "AI is artificial intelligence technology that enables machines to simulate human thinking."
+         test_example = {"target_length": 15}
+
+         # Test basic call
+         try:
+             reward = length_reward(test_prompt, test_completion, test_example)
+             assert isinstance(reward, (int, float)), "length_reward should return numeric value"
+         except Exception as e:
+             pytest.fail(f"length_reward failed with signature (prompt, completion, example): {e}")
+
+     def test_keyword_reward_signature(self):
+         """Test keyword_reward function signature."""
+         test_prompt = "What is AI?"
+         test_completion = "AI is artificial intelligence technology that enables machines to simulate human thinking."
+         test_example = {"keywords": ["AI", "intelligence"]}
+
+         try:
+             reward = keyword_reward(test_prompt, test_completion, test_example)
+             assert isinstance(reward, (int, float)), "keyword_reward should return numeric value"
+         except Exception as e:
+             pytest.fail(f"keyword_reward failed with signature (prompt, completion, example): {e}")
+
+     def test_perplexity_reward_signature(self):
+         """Test perplexity_reward function signature."""
+         test_prompt = "What is AI?"
+         test_completion = "AI is artificial intelligence technology."
+         test_example = {"max_perplexity": 10.0}
+
+         try:
+             reward = perplexity_reward(test_prompt, test_completion, test_example)
+             assert isinstance(reward, (int, float)), "perplexity_reward should return numeric value"
+         except Exception as e:
+             pytest.fail(f"perplexity_reward failed with signature (prompt, completion, example): {e}")
+
+     def test_accuracy_reward_signature(self):
+         """Test accuracy_reward function signature."""
+         test_prompt = "What is 2+2?"
+         test_completion = "4"
+         test_example = {"correct_answer": "4"}
+
+         try:
+             reward = accuracy_reward(test_prompt, test_completion, test_example)
+             assert isinstance(reward, (int, float)), "accuracy_reward should return numeric value"
+         except Exception as e:
+             pytest.fail(f"accuracy_reward failed with signature (prompt, completion, example): {e}")
+
+     @pytest.mark.parametrize("reward_func,example_data", [
+         (length_reward, {"target_length": 15}),
+         (keyword_reward, {"keywords": ["test", "example"]}),
+         (perplexity_reward, {"max_perplexity": 10.0}),
+         (accuracy_reward, {"correct_answer": "test answer"}),
+     ])
+     def test_reward_function_consistency(self, reward_func, example_data):
+         """Test that all reward functions follow consistent signature patterns."""
+         test_prompt = "Test prompt"
+         test_completion = "Test completion response"
+
+         # All reward functions should accept (prompt, completion, example) signature
+         try:
+             result = reward_func(test_prompt, test_completion, example_data)
+             assert isinstance(result, (int, float)), \
+                 f"{reward_func.__name__} should return numeric value"
+             assert -1.0 <= result <= 1.0, \
+                 f"{reward_func.__name__} should return value in [-1, 1] range, got {result}"
+         except Exception as e:
+             pytest.fail(f"{reward_func.__name__} failed with standard signature: {e}")
+
+
+ @pytest.mark.integration
+ @pytest.mark.reward
+ class TestRewardIntegration:
+     """Test reward function integration with the system."""
+
+     def test_reward_functions_with_realistic_data(self):
+         """Test reward functions with realistic data."""
+         prompt = "Explain what machine learning is in simple terms."
+         completion = "Machine learning is a type of artificial intelligence that allows computers to learn and improve from data without being explicitly programmed for every task."
+
+         # Test length reward
+         length_example = {"target_length": 20}
+         length_result = length_reward(prompt, completion, length_example)
+         assert isinstance(length_result, (int, float))
+
+         # Test keyword reward
+         keyword_example = {"keywords": ["machine", "learning", "artificial", "intelligence"]}
+         keyword_result = keyword_reward(prompt, completion, keyword_example)
+         assert isinstance(keyword_result, (int, float))
+
+         # Test perplexity reward (if model available)
+         perplexity_example = {"max_perplexity": 15.0}
+         try:
+             perplexity_result = perplexity_reward(prompt, completion, perplexity_example)
+             assert isinstance(perplexity_result, (int, float))
+         except Exception:
+             # Perplexity might fail if model not available, which is acceptable
+             pytest.skip("Perplexity reward requires model - skipping")
+
+         # Test accuracy reward
+         accuracy_example = {"correct_answer": "machine learning"}
+         accuracy_result = accuracy_reward(prompt, completion, accuracy_example)
+         assert isinstance(accuracy_result, (int, float))
+
+
+ if __name__ == "__main__":
+     pytest.main([__file__, "-v"])
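
A custom reward that satisfies the contract these tests enforce, i.e. the `(prompt, completion, example)` signature with a numeric result kept in `[-1, 1]`, could look like the following sketch. The function name and the `correct_answer` key are illustrative, not part of the package:

```python
def contains_answer_reward(prompt: str, completion: str, example: dict, **kwargs) -> float:
    """Return 1.0 if the expected answer appears in the completion, else -1.0."""
    answer = str(example.get("correct_answer", "")).lower()
    if not answer:
        return 0.0  # no reference answer provided
    return 1.0 if answer in completion.lower() else -1.0
```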