textpolicy 0.0.1__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. textpolicy/__init__.py +52 -0
  2. textpolicy/__main__.py +8 -0
  3. textpolicy/algorithms/__init__.py +54 -0
  4. textpolicy/algorithms/grpo.py +642 -0
  5. textpolicy/algorithms/gspo.py +582 -0
  6. textpolicy/buffer/__init__.py +23 -0
  7. textpolicy/buffer/buffer.py +244 -0
  8. textpolicy/buffer/episode.py +383 -0
  9. textpolicy/buffer/sampling.py +438 -0
  10. textpolicy/buffer/storage.py +255 -0
  11. textpolicy/cli.py +67 -0
  12. textpolicy/environment/__init__.py +79 -0
  13. textpolicy/environment/base.py +110 -0
  14. textpolicy/environment/environment.py +46 -0
  15. textpolicy/environment/factory.py +103 -0
  16. textpolicy/environment/gym.py +106 -0
  17. textpolicy/environment/task_suites.py +51 -0
  18. textpolicy/environment/text_generation.py +789 -0
  19. textpolicy/environment/vectorized.py +253 -0
  20. textpolicy/generation/__init__.py +62 -0
  21. textpolicy/generation/lora.py +411 -0
  22. textpolicy/generation/mlx_generation.py +557 -0
  23. textpolicy/generation/reload.py +253 -0
  24. textpolicy/rewards/__init__.py +137 -0
  25. textpolicy/rewards/adapters.py +387 -0
  26. textpolicy/rewards/basic.py +214 -0
  27. textpolicy/rewards/integrated_system.py +338 -0
  28. textpolicy/rewards/mlx_batch_processor.py +447 -0
  29. textpolicy/rewards/registry.py +293 -0
  30. textpolicy/rewards/rollout_rewards.py +410 -0
  31. textpolicy/rewards/verifiers.py +369 -0
  32. textpolicy/rollout/__init__.py +44 -0
  33. textpolicy/rollout/aggregator.py +145 -0
  34. textpolicy/rollout/base.py +108 -0
  35. textpolicy/rollout/rollout.py +142 -0
  36. textpolicy/rollout/runner.py +280 -0
  37. textpolicy/rollout/strategy.py +208 -0
  38. textpolicy/rollout/worker.py +194 -0
  39. textpolicy/training/__init__.py +14 -0
  40. textpolicy/training/metrics.py +242 -0
  41. textpolicy/training/rollout_manager.py +78 -0
  42. textpolicy/training/trainer.py +684 -0
  43. textpolicy/utils/__init__.py +40 -0
  44. textpolicy/utils/benchmarking.py +489 -0
  45. textpolicy/utils/data.py +60 -0
  46. textpolicy/utils/debug.py +170 -0
  47. textpolicy/utils/environment.py +349 -0
  48. textpolicy/utils/logging/__init__.py +22 -0
  49. textpolicy/utils/logging/base.py +48 -0
  50. textpolicy/utils/logging/console.py +61 -0
  51. textpolicy/utils/logging/factory.py +133 -0
  52. textpolicy/utils/logging/multi.py +83 -0
  53. textpolicy/utils/logging/tensorboard.py +65 -0
  54. textpolicy/utils/logging/wandb.py +72 -0
  55. textpolicy/utils/memory.py +118 -0
  56. textpolicy/utils/performance.py +464 -0
  57. textpolicy/utils/timing.py +171 -0
  58. textpolicy/validate.py +101 -0
  59. textpolicy/validation/__init__.py +13 -0
  60. textpolicy/validation/logprob_validation.py +315 -0
  61. textpolicy-0.1.0.dist-info/METADATA +99 -0
  62. textpolicy-0.1.0.dist-info/RECORD +66 -0
  63. textpolicy-0.1.0.dist-info/entry_points.txt +2 -0
  64. textpolicy-0.0.1.dist-info/METADATA +0 -10
  65. textpolicy-0.0.1.dist-info/RECORD +0 -6
  66. {textpolicy-0.0.1.dist-info → textpolicy-0.1.0.dist-info}/WHEEL +0 -0
  67. {textpolicy-0.0.1.dist-info → textpolicy-0.1.0.dist-info}/licenses/LICENSE +0 -0
  68. {textpolicy-0.0.1.dist-info → textpolicy-0.1.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,244 @@
+ # textpolicy/buffer/buffer.py
+ """
+ Coordinates storage and sampling for RL training.
+
+ The Buffer class provides a clean interface for episode-centric replay
+ buffer operations, optimized for on-policy RL algorithms.
+ """
+
+ from typing import Optional, Any, Dict
+ from .storage import BufferStorage
+ from .sampling import BufferSampler
+
+
+ class Buffer:
+     """
+     Episode-centric replay buffer for on-policy RL (e.g., PPO).
+
+     Stores full episodes and converts to tensors at sample time.
+     Prevents silent corruption from circular overwrite.
+
+     The buffer enforces clean rollouts:
+     - Episodes are either complete or not stored
+     - Optional fields (logprob, value) must be all-or-nothing
+     - No partial episodes, no fragmented trajectories
+
+     Designed for:
+     - Apple Silicon (MLX, unified memory)
+     - Multiprocessing (not threading)
+     - PPO, GAE, and other on-policy algorithms
+
+     Example:
+         buffer = Buffer(max_episodes=100)
+
+         # Collect data
+         buffer.add(obs=obs, act=act, rew=rew, next_obs=next_obs, done=done)
+
+         # Sample data
+         batch = buffer.sample_latest_steps(2048)  # Last 2k steps
+         batch = buffer.sample_episodes(10, order='desc')  # Last 10 episodes
+     """
+
+     def __init__(self, max_episodes: int = 100):
+         """
+         Initialize the buffer.
+
+         Args:
+             max_episodes: Maximum number of complete episodes to store.
+                 Oldest episodes are dropped when capacity is exceeded.
+         """
+         self.storage = BufferStorage(max_episodes)
+         self.sampler = BufferSampler(self.storage.episodes)
+
+     def add(
+         self,
+         obs: Any,
+         act: Any,
+         rew: Any,
+         next_obs: Any,
+         done: bool,
+         timeout: bool = False,
+         logprob: Optional[Any] = None,
+         value: Optional[Any] = None,
+         entropy: Optional[Any] = None
+     ):
+         """
+         Add a transition to the current episode.
+
+         Completes the episode and stores it if `done` or `timeout` is True.
+
+         Args:
+             obs: Observation
+             act: Action taken
+             rew: Reward received
+             next_obs: Next observation
+             done: Boolean indicating episode termination
+             timeout: Boolean indicating truncation (e.g. time limit)
+             logprob: Log probability of action (optional, all-or-nothing)
+             value: Estimated state value (optional, all-or-nothing)
+             entropy: Action entropy (optional, all-or-nothing)
+
+         Example:
+             buffer.add(
+                 obs=obs,
+                 act=action,
+                 rew=reward,
+                 next_obs=next_obs,
+                 done=done,
+                 timeout=timeout,
+                 logprob=logp.item(),
+                 value=value.item()
+             )
+         """
+         self.storage.add_transition(
+             obs=obs, act=act, rew=rew, next_obs=next_obs,
+             done=done, timeout=timeout,
+             logprob=logprob, value=value, entropy=entropy
+         )
+
+     def sample(self) -> Dict[str, Any]:
+         """
+         Sample all stored episodes as a single concatenated batch.
+
+         Returns:
+             Dict of MLX arrays with all transitions, in chronological order:
+             - Oldest episode → Newest episode
+             - Each episode: first step → last step
+
+         Raises:
+             ValueError: If buffer is empty
+         """
+         return self.sampler.sample_all()
+
+     def sample_latest_steps(self, n: int) -> Dict[str, Any]:
+         """
+         Sample the N most recent transitions across episodes.
+
+         Returns:
+             Dict of MLX arrays with the latest `n` steps,
+             in **chronological order** (oldest → newest).
+
+         Args:
+             n: Number of steps to sample (must be > 0)
+
+         Raises:
+             ValueError: If buffer is empty or n <= 0
+         """
+         return self.sampler.sample_latest_steps(n)
+
+     def sample_episodes(self, k: int, order: str = 'asc') -> Dict[str, Any]:
+         """
+         Sample up to k complete episodes.
+
+         Args:
+             k: Number of episodes to sample (must be > 0)
+             order: 'asc' for oldest first, 'desc' for newest first
+
+         Returns:
+             Dict of MLX arrays with concatenated transitions from selected episodes.
+
+         Raises:
+             ValueError: If buffer is empty, k <= 0, or invalid order
+         """
+         return self.sampler.sample_episodes(k, order)
+
+     def sample_sequences(
+         self,
+         batch_size: int,
+         seq_len: int,
+         recent_first: bool = True,
+         drop_incomplete: bool = True,
+         dreamerv3_mode: bool = False,
+     ) -> Dict[str, Any]:
+         """
+         Sample contiguous sequences of length `seq_len` for DreamerV3 RSSM training.
+
+         Returns tensors shaped [batch, time, ...] and avoids crossing episode boundaries.
+
+         This method is intentionally minimal and efficient to support Apple Silicon
+         memory patterns and avoids padding logic; set `drop_incomplete=True` to skip
+         short episodes.
+         """
+         return self.sampler.sample_sequences(
+             batch_size=batch_size,
+             seq_len=seq_len,
+             recent_first=recent_first,
+             drop_incomplete=drop_incomplete,
+             dreamerv3_mode=dreamerv3_mode,
+         )
+
+     def add_episode_from_dict(self, data: Dict[str, Any]):
+         """
+         Reconstruct and add an episode from a serialized dictionary.
+
+         This is used to deserialize episodes sent from RolloutWorker.
+
+         Args:
+             data: Dictionary containing episode data (e.g. from `episode.to_dict()`)
+                 Must include: obs, act, rew, next_obs, done, timeout
+                 Optional: logprob, value, entropy
+         """
+         self.storage.add_episode_from_dict(data)
+
+     def clear(self):
+         """
+         Reset the buffer: clear all stored episodes and reset the current episode.
+         """
+         self.storage.clear()
+
+     def ready(self, min_episodes: int = 1) -> bool:
+         """
+         Check if the buffer contains at least `min_episodes` complete episodes.
+
+         Args:
+             min_episodes: Minimum number of episodes required (default: 1)
+
+         Returns:
+             True if the buffer has enough episodes, False otherwise
+         """
+         return self.storage.ready(min_episodes)
+
+     def __len__(self) -> int:
+         """
+         Total number of steps in the buffer.
+
+         Returns:
+             Sum of steps across all stored episodes
+         """
+         return len(self.storage)
+
+     @property
+     def episodes(self):
+         """Access to underlying episodes for backwards compatibility."""
+         return self.storage.episodes
+
+     @property
+     def current_episode(self):
+         """Access to the current incomplete episode for backwards compatibility."""
+         return self.storage.current_episode
+
+     @property
+     def episode_count(self) -> int:
+         """Number of complete episodes currently stored."""
+         return self.storage.episode_count
+
+     def print_state(self, label: str = "Buffer State"):
+         """
+         Print current buffer state. Useful for debugging.
+
+         Args:
+             label: Label to display at the top
+         """
+         info = self.storage.get_storage_info()
+         stats = self.sampler.get_episode_statistics()
+
+         print("=" * 50)
+         print(f"{label}")
+         print(f"Episodes stored : {info['episode_count']} (max={info['max_episodes']})")
+         print(f"Total steps : {info['total_steps']}")
+         print(f"Capacity usage : {info['capacity_usage']:.1%}")
+         if info['episode_lengths']:
+             print(f"Episode lengths : {info['episode_lengths']}")
+             print(f"Mean length : {stats['mean_episode_length']:.1f}")
+             print(f"Mean reward : {stats['mean_episode_reward']:.2f}")
+         print("=" * 50)
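
Usage note (not part of the 0.1.0 wheel contents): the docstrings above describe a collect-then-sample workflow. The sketch below shows one way that workflow could look. It assumes `Buffer` is importable from `textpolicy.buffer` (the package's `__init__.py` is listed among the changed files but not shown in this hunk) and uses a `gymnasium` environment with a random placeholder policy; gymnasium's `terminated`/`truncated` flags map onto the buffer's `done`/`timeout` arguments.

    # Illustrative sketch only; assumes Buffer is re-exported by textpolicy.buffer.
    import gymnasium as gym
    from textpolicy.buffer import Buffer

    env = gym.make("CartPole-v1")
    buffer = Buffer(max_episodes=100)

    obs, _ = env.reset()
    while not buffer.ready(min_episodes=5):
        act = env.action_space.sample()  # placeholder for a real policy
        next_obs, rew, terminated, truncated, _ = env.step(act)
        # Optional fields (logprob, value, entropy) are omitted for every step,
        # which is allowed; providing them for only some steps raises ValueError.
        buffer.add(obs=obs, act=act, rew=rew, next_obs=next_obs,
                   done=terminated, timeout=truncated)
        obs = next_obs if not (terminated or truncated) else env.reset()[0]

    batch = buffer.sample()  # dict of MLX arrays, oldest episode -> newest
    print(batch['obs'].shape, batch['rew'].shape)

Because episodes are only stored once complete, `ready()` and `sample()` never see a partially collected trajectory.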
@@ -0,0 +1,383 @@
+ # textpolicy/buffer/episode.py
+ """
+ Single episode trajectory management.
+
+ The Episode class stores transitions as Python lists during rollout,
+ then converts to MLX arrays only at sampling time. This aims to be optimal
+ for Apple Silicon's unified memory architecture.
+ """
+
+ from typing import Optional, Any, Dict
+ import mlx.core as mx  # type: ignore
+
+
+ class Episode:
+     """
+     Represents a single complete episode trajectory.
+
+     Stores transitions as Python lists during rollout, then converts to MLX arrays
+     only at sampling time. This aims to be optimal for Apple Silicon's unified memory.
+
+     All optional fields (e.g. `logprob`, `value`) must be provided for **all steps**
+     or **none** — mixing will raise an error. This ensures tensor shape consistency.
+
+     Example:
+         ep = Episode()
+         ep.append(obs=1, act=0, rew=1, next_obs=2, done=False, logprob=0.1, value=1.5)
+         ep.append(obs=2, act=1, rew=2, next_obs=3, done=True, logprob=0.2, value=2.5)
+
+         batch = ep.to_tensor_dict()  # Returns dict of MLX arrays
+     """
+
+     def __init__(self):
+         """Initialize empty episode with required fields."""
+         # Required fields - always present
+         self.obs: list[Any] = []
+         self.act: list[Any] = []
+         self.rew: list[Any] = []
+         self.next_obs: list[Any] = []
+         self.done: list[bool] = []
+         self.timeout: list[bool] = []
+
+         # Optional fields - all-or-nothing consistency
+         self.logprob: Optional[list[Any]] = None
+         self.value: Optional[list[Any]] = None
+         self.entropy: Optional[list[Any]] = None
+
+     def append(
+         self,
+         obs,
+         act,
+         rew,
+         next_obs,
+         done,
+         timeout=False,
+         logprob=None,
+         value=None,
+         entropy=None
+     ):
+         """
+         Append a single environment transition to the episode.
+
+         Args:
+             obs: Observation from environment
+             act: Action taken
+             rew: Reward received
+             next_obs: Next observation
+             done: Boolean indicating episode termination
+             timeout: Boolean indicating truncation (e.g. time limit)
+             logprob: Log probability of action (optional, but must be all-or-nothing)
+             value: Estimated state value (optional, but must be all-or-nothing)
+             entropy: Action entropy (optional, but must be all-or-nothing)
+
+         Raises:
+             ValueError: If optional fields are inconsistent (some provided, some missing)
+
+         Example:
+             episode.append(obs=obs, act=act, rew=rew, next_obs=next_obs, done=done)
+         """
+         # Store required fields
+         self.obs.append(obs)
+         self.act.append(act)
+         self.rew.append(rew)
+         self.next_obs.append(next_obs)
+         self.done.append(done)
+         self.timeout.append(timeout)
+
+         # Handle logprob: must be all-or-nothing
+         if logprob is not None:
+             if self.logprob is None:
+                 self.logprob = []
+             self.logprob.append(logprob)
+         else:
+             if self.logprob is not None:
+                 raise ValueError(
+                     "This episode includes logprob, but one step is missing it. "
+                     "Either provide logprob for all steps or none."
+                 )
+
+         # Handle value: must be all-or-nothing
+         if value is not None:
+             if self.value is None:
+                 self.value = []
+             self.value.append(value)
+         else:
+             if self.value is not None:
+                 raise ValueError(
+                     "This episode includes value, but one step is missing it. "
+                     "Either provide value for all steps or none."
+                 )
+
+         # Handle entropy: must be all-or-nothing
+         if entropy is not None:
+             if self.entropy is None:
+                 self.entropy = []
+             self.entropy.append(entropy)
+         else:
+             if self.entropy is not None:
+                 raise ValueError(
+                     "This episode includes entropy, but one step is missing it. "
+                     "Either provide entropy for all steps or none."
+                 )
+
+     def __len__(self) -> int:
+         """Return the number of steps in this episode."""
+         return len(self.obs)
+
+     def to_tensor_dict(self) -> Dict[str, mx.array]:
+         """
+         Convert all stored data to MLX arrays for training.
+         Performed once at sample time for efficiency on Apple Silicon and MLX.
+
+         Returns:
+             Dict of MLX arrays with keys:
+             - 'obs': (T, *obs_shape) - observations
+             - 'act': (T, *act_shape) - actions
+             - 'rew': (T,) - rewards
+             - 'next_obs': (T, *obs_shape) - next observations
+             - 'done': (T,) - termination flags
+             - 'timeout': (T,) - truncation flags
+             - 'logprob': (T,) - log probabilities (if provided)
+             - 'value': (T,) - value estimates (if provided)
+             - 'entropy': (T,) - action entropy (if provided)
+
+         Notes:
+             This runs once at sample time and uses batched array conversion.
+         """
+         # Batched array conversion for improved memory efficiency
+         # Convert to numpy first, then a single MLX array
+         import numpy as np
+
+         # Convert required fields to MLX arrays - BATCHED APPROACH
+         try:
+             # Try numpy-based batched conversion first (most efficient)
+             obs_np = np.array(self.obs)
+             next_obs_np = np.array(self.next_obs)
+             act_np = np.array(self.act)
+
+             result = {
+                 'obs': mx.array(obs_np),  # Single batched conversion
+                 'act': mx.array(act_np),  # Single batched conversion
+                 'rew': mx.array(self.rew),  # Already efficient for scalars
+                 'next_obs': mx.array(next_obs_np),  # Single batched conversion
+                 'done': mx.array(self.done),  # Already efficient for booleans
+                 'timeout': mx.array(self.timeout),  # Already efficient for booleans
+             }
+         except (ValueError, TypeError):
+             # Batch conversion fallback with pre-allocation
+             # (for heterogeneous data types or complex structures)
+             try:
+                 # Try batch conversion first (faster for homogeneous data)
+                 import numpy as np
+                 result = {
+                     'obs': mx.array(np.array(self.obs)),
+                     'act': mx.array(np.array(self.act)),
+                     'rew': mx.array(self.rew),
+                     'next_obs': mx.array(np.array(self.next_obs)),
+                     'done': mx.array(self.done),
+                     'timeout': mx.array(self.timeout),
+                 }
+             except:
+                 # Fallback for heterogeneous data - try stacking first
+                 try:
+                     result = {
+                         'obs': mx.stack([mx.array(o) for o in self.obs]),
+                         'act': mx.stack([mx.array(a) for a in self.act]),
+                         'rew': mx.array(self.rew),
+                         'next_obs': mx.stack([mx.array(o) for o in self.next_obs]),
+                         'done': mx.array(self.done),
+                         'timeout': mx.array(self.timeout),
+                     }
+                 except:
+                     # Final fallback for truly heterogeneous shapes - return as list of arrays
+                     # This handles cases where observations have completely different shapes
+                     result = {
+                         'obs': [mx.array(o) for o in self.obs],
+                         'act': [mx.array(a) for a in self.act] if not all(isinstance(a, (int, float)) for a in self.act) else mx.array(self.act),
+                         'rew': mx.array(self.rew),
+                         'next_obs': [mx.array(o) for o in self.next_obs],
+                         'done': mx.array(self.done),
+                         'timeout': mx.array(self.timeout),
+                     }
+
+         # Add optional fields if present - handle variable-length sequences properly
+         if self.logprob is not None:
+             # Handle variable-length logprob sequences (common in text generation)
+             # Each transition may have different response lengths, so we flatten them
+             try:
+                 # Try direct conversion first (for uniform lengths)
+                 result['logprob'] = mx.array(self.logprob)
+             except ValueError as e:
+                 if "non-uniform length" in str(e):
+                     # Handle variable-length sequences by flattening
+                     # This preserves all logprob data while making it MLX-compatible
+                     flattened_logprobs = []
+                     for logprob_item in self.logprob:
+                         if hasattr(logprob_item, 'tolist'):  # MLX array
+                             flattened_logprobs.extend(logprob_item.tolist())
+                         elif isinstance(logprob_item, list):  # Python list
+                             flattened_logprobs.extend(logprob_item)
+                         else:  # Single value
+                             flattened_logprobs.append(float(logprob_item))
+                     result['logprob'] = mx.array(flattened_logprobs) if flattened_logprobs else mx.array([])
+                 else:
+                     # Re-raise other ValueError types
+                     raise
+
+         if self.value is not None:
+             # Apply same variable-length handling to value if needed
+             try:
+                 result['value'] = mx.array(self.value)
+             except ValueError as e:
+                 if "non-uniform length" in str(e):
+                     flattened_values = []
+                     for value_item in self.value:
+                         if hasattr(value_item, 'tolist'):  # MLX array
+                             flattened_values.extend(value_item.tolist())
+                         elif isinstance(value_item, list):  # Python list
+                             flattened_values.extend(value_item)
+                         else:  # Single value
+                             flattened_values.append(float(value_item))
+                     result['value'] = mx.array(flattened_values) if flattened_values else mx.array([])
+                 else:
+                     raise
+
+         if self.entropy is not None:
+             # Apply same variable-length handling to entropy if needed
+             try:
+                 result['entropy'] = mx.array(self.entropy)
+             except ValueError as e:
+                 if "non-uniform length" in str(e):
+                     flattened_entropy = []
+                     for entropy_item in self.entropy:
+                         if hasattr(entropy_item, 'tolist'):  # MLX array
+                             flattened_entropy.extend(entropy_item.tolist())
+                         elif isinstance(entropy_item, list):  # Python list
+                             flattened_entropy.extend(entropy_item)
+                         else:  # Single value
+                             flattened_entropy.append(float(entropy_item))
+                     result['entropy'] = mx.array(flattened_entropy) if flattened_entropy else mx.array([])
+                 else:
+                     raise
+
+         return result
+
+     def to_dict(self) -> Dict[str, Any]:
+         """
+         Convert episode to dictionary for serialization (multiprocessing).
+
+         Used for inter-process communication where MLX arrays can't be shared.
+         This preserves all data as Python-native types for queue transmission.
+
+         Returns:
+             Dictionary representation with all Python-native types.
+             This is the inverse of creating an episode from a dict.
+
+         Example:
+             # In worker process
+             ep_dict = episode.to_dict()
+             queue.put(ep_dict)
+
+             # In trainer process
+             buffer.add_episode_from_dict(ep_dict)
+         """
+         # Always include required fields
+         result = {
+             'obs': self.obs,
+             'act': self.act,
+             'rew': self.rew,
+             'next_obs': self.next_obs,
+             'done': self.done,
+             'timeout': self.timeout,
+         }
+
+         # Add optional fields if present
+         if self.logprob is not None:
+             result['logprob'] = self.logprob
+         if self.value is not None:
+             result['value'] = self.value
+         if self.entropy is not None:
+             result['entropy'] = self.entropy
+
+         return result
+
+     @classmethod
+     def from_dict(cls, data: Dict[str, Any]) -> 'Episode':
+         """
+         Create Episode from dictionary representation (for deserialization).
+
+         This is the inverse of to_dict() - reconstructs an Episode from
+         serialized dictionary data, typically used after inter-process
+         communication where Episode objects are transmitted as dicts.
+
+         Args:
+             data: Dictionary containing episode data with Python-native types
+
+         Returns:
+             New Episode instance with data from the dictionary
+
+         Example:
+             # Reconstruct episode from serialized data
+             episode = Episode.from_dict(ep_dict)
+         """
+         episode = cls()
+
+         # Reconstruct episode by appending each step
+         length = len(data['obs'])
+         for i in range(length):
+             step_data = {
+                 'obs': data['obs'][i],
+                 'act': data['act'][i],
+                 'rew': data['rew'][i],
+                 'next_obs': data['next_obs'][i],
+                 'done': data['done'][i],
+                 'timeout': data['timeout'][i] if i < len(data['timeout']) else False
+             }
+
+             # Add optional fields if present in the data
+             if 'logprob' in data and i < len(data['logprob']):
+                 step_data['logprob'] = data['logprob'][i]
+             if 'value' in data and i < len(data['value']):
+                 step_data['value'] = data['value'][i]
+             if 'entropy' in data and i < len(data['entropy']):
+                 step_data['entropy'] = data['entropy'][i]
+
+             episode.append(**step_data)
+
+         return episode
+
+     def validate_consistency(self):
+         """
+         Validate internal consistency of episode data.
+
+         Checks:
+         - All required fields have same length
+         - Optional fields have correct length if present
+         - Episode has at least one step
+
+         Raises:
+             ValueError: If episode data is inconsistent
+         """
+         if len(self) == 0:
+             raise ValueError("Episode is empty")
+
+         # Check required fields have consistent length
+         required_lengths = [
+             len(self.obs), len(self.act), len(self.rew),
+             len(self.next_obs), len(self.done), len(self.timeout)
+         ]
+
+         if not all(length == required_lengths[0] for length in required_lengths):
+             raise ValueError(f"Inconsistent required field lengths: {required_lengths}")
+
+         # Check optional fields have correct length if present
+         episode_length = len(self.obs)
+
+         if self.logprob is not None and len(self.logprob) != episode_length:
+             raise ValueError(f"logprob length {len(self.logprob)} != episode length {episode_length}")
+
+         if self.value is not None and len(self.value) != episode_length:
+             raise ValueError(f"value length {len(self.value)} != episode length {episode_length}")
+
+         if self.entropy is not None and len(self.entropy) != episode_length:
+             raise ValueError(f"entropy length {len(self.entropy)} != episode length {episode_length}")
+ raise ValueError(f"entropy length {len(self.entropy)} != episode length {episode_length}")