textpolicy-0.1.1-py3-none-any.whl → textpolicy-0.1.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- textpolicy/__init__.py +3 -0
- textpolicy/algorithms/__init__.py +29 -4
- textpolicy/algorithms/grpo.py +771 -361
- textpolicy/algorithms/length_shaping.py +151 -0
- textpolicy/analysis/__init__.py +23 -0
- textpolicy/analysis/emergence_logger.py +248 -0
- textpolicy/analysis/planning_patterns.py +105 -0
- textpolicy/analysis/serialization.py +65 -0
- textpolicy/generation/mlx_generation.py +36 -21
- textpolicy/tasks/__init__.py +7 -0
- textpolicy/tasks/countdown/__init__.py +21 -0
- textpolicy/tasks/countdown/dataset.py +163 -0
- textpolicy/tasks/countdown/evaluator.py +197 -0
- textpolicy/tasks/countdown/prompt.py +89 -0
- textpolicy/tasks/countdown/reward.py +56 -0
- textpolicy/training/trainer.py +41 -21
- {textpolicy-0.1.1.dist-info → textpolicy-0.1.3.dist-info}/METADATA +3 -3
- {textpolicy-0.1.1.dist-info → textpolicy-0.1.3.dist-info}/RECORD +22 -11
- {textpolicy-0.1.1.dist-info → textpolicy-0.1.3.dist-info}/WHEEL +0 -0
- {textpolicy-0.1.1.dist-info → textpolicy-0.1.3.dist-info}/entry_points.txt +0 -0
- {textpolicy-0.1.1.dist-info → textpolicy-0.1.3.dist-info}/licenses/LICENSE +0 -0
- {textpolicy-0.1.1.dist-info → textpolicy-0.1.3.dist-info}/top_level.txt +0 -0
textpolicy/training/trainer.py
CHANGED
@@ -10,6 +10,7 @@ This trainer achieves maximum efficiency through:
 - Direct MLX-LM integration
 """
 
+import logging
 from typing import Callable, Dict, Any, Optional, Union, List, cast
 import mlx.core as mx # type: ignore
 import mlx.nn as nn # type: ignore
@@ -51,11 +52,12 @@ class Trainer:
         compile_training: bool = True,
         buffer: Optional[Buffer] = None,
         data_selector_fn: Optional[Callable] = None,
-        auto_save_lora: Optional[str] = None
+        auto_save_lora: Optional[str] = None,
+        metrics_interval: int = 10
     ):
         """
         Initialize unified trainer with composable algorithm functions.
-
+
         Args:
             model: MLX model (typically from MLX-LM)
             advantage_fn: Pure function for computing advantages
@@ -68,6 +70,10 @@ class Trainer:
             buffer: Optional linked buffer for automatic data selection
             data_selector_fn: Algorithm-specific function to select data from buffer
             auto_save_lora: Optional path to auto-save LoRA adapters after training
+            metrics_interval: Compute detailed metrics every N steps. Setting >1
+                avoids a duplicate model forward pass on non-metric steps.
+                Default 10 balances insight and throughput; set to 1 for
+                every-step metrics when needed.
         """
         self.model = model
         self.advantage_fn = advantage_fn
@@ -76,11 +82,12 @@ class Trainer:
         self.get_logprobs_fn = get_logprobs_fn or self._default_get_logprobs
         self.metrics_fn = metrics_fn
         self.max_grad_norm = max_grad_norm
-
+        self.metrics_interval = max(1, metrics_interval)
+
         # Buffer management
         self.buffer = buffer
         self.data_selector_fn = data_selector_fn or self._default_data_selector
-
+
         # LoRA management - detect auto-reload models
         self.auto_save_lora = auto_save_lora or self._detect_auto_reload_lora(model)
         self._has_lora = self._detect_lora_model(model)
@@ -497,12 +504,15 @@
 
         # Compute metrics if function provided
         metrics = {'loss': loss.item(), 'step': self._step_count}
-        if self.metrics_fn is not None:
+        if self.metrics_fn is not None and self._step_count % self.metrics_interval == 0:
             # Compute new logprobs using the same pipeline as training to ensure consistency
             # This properly handles GRPO data structure with format conversion
+            #
+            # NOTE: This is a second model forward pass (the first happens inside
+            # loss_and_grad_fn). Set metrics_interval > 1 to amortize this cost.
             observations = batch_data['obs']
             actions = batch_data['act']
-
+
             # Use GRPO-specific extraction if episode_lengths available, otherwise fallback
             if 'episode_lengths' in batch_data:
                 episode_lengths = batch_data['episode_lengths']
@@ -515,7 +525,7 @@
                 model_input = observations # Already batched
             model_output = self.model(model_input)
             new_logprobs = self.get_logprobs_fn(model_output, actions)
-
+
             algorithm_metrics = self.metrics_fn(
                 batch_data['logprob'],
                 new_logprobs,
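
The gating added above turns detailed metrics into a simple modulus check on the step counter: with the default interval of 10, only one step in ten pays for the second forward pass. A minimal, self-contained sketch of that check (the function name and the asserts are illustrative, not part of the Trainer API):

    def should_compute_metrics(step_count: int, metrics_interval: int = 10) -> bool:
        """Return True on steps that should run the extra metrics forward pass."""
        interval = max(1, metrics_interval)  # mirrors self.metrics_interval = max(1, metrics_interval)
        return step_count % interval == 0

    # Default interval of 10: roughly 1 step in 10 runs the second forward pass.
    assert should_compute_metrics(20, 10)
    assert not should_compute_metrics(21, 10)
    # metrics_interval=1 restores every-step metrics.
    assert should_compute_metrics(7, 1)
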
@@ -551,27 +561,37 @@
 
         if not episodes:
             raise ValueError("Buffer is empty - no episodes to train on")
-
-        # Extract episode rewards
-
+
+        # Extract episode rewards and lengths
+        # Build reward sums lazily, then evaluate in a single sync barrier
         episode_lengths = []
-
+        pending_sums = []
+
         # Collect all transitions
         all_obs = []
         all_acts = []
         all_logprobs = []
-
+
         for episode in episodes:
-            # Episode
-
-
-
-
+            # Support both Episode objects (attribute access) and dicts
+            rew = episode.rew if hasattr(episode, 'rew') else episode['rew']
+            obs = episode.obs if hasattr(episode, 'obs') else episode['obs']
+            act = episode.act if hasattr(episode, 'act') else episode['act']
+            logprob = episode.logprob if hasattr(episode, 'logprob') else episode['logprob']
+
+            pending_sums.append(mx.sum(mx.array(rew)))
+            episode_lengths.append(len(obs))
+
             # Collect transitions
-            all_obs.append(
-            all_acts.append(
-            all_logprobs.append(
-
+            all_obs.append(mx.array(obs))
+            all_acts.append(mx.array(act))
+            all_logprobs.append(mx.array(logprob))
+
+        # Single sync barrier for all episode rewards
+        reward_stack = mx.stack(pending_sums)
+        mx.eval(reward_stack)
+        episode_rewards = reward_stack.tolist()
+
         # Concatenate all transitions
         batch_data = {
             'obs': mx.concatenate(all_obs),
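
The reward extraction above leans on MLX's lazy evaluation: each mx.sum builds a deferred computation, and a single mx.eval on the stacked results forces one device synchronization for the whole buffer rather than one per episode (as a per-episode .item() call in the loop would). A minimal, self-contained sketch of the pattern with made-up reward data:

    import mlx.core as mx

    # Stand-ins for episode.rew / episode['rew'] from the buffer.
    episodes_rewards = [
        [0.0, 0.0, 1.0],
        [0.5, 0.5],
        [0.0, 1.0, 1.0, 1.0],
    ]

    # Build per-episode sums lazily; nothing is computed yet.
    pending_sums = [mx.sum(mx.array(rew)) for rew in episodes_rewards]

    # Stack and evaluate once: a single sync barrier for all episodes.
    reward_stack = mx.stack(pending_sums)
    mx.eval(reward_stack)
    episode_rewards = reward_stack.tolist()

    print(episode_rewards)  # [1.0, 1.0, 3.0]
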
{textpolicy-0.1.1.dist-info → textpolicy-0.1.3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: textpolicy
-Version: 0.1.1
+Version: 0.1.3
 Summary: Reinforcement learning for text generation on MLX (Apple Silicon): GRPO/GSPO, environments, rollout, rewards, LoRA/QLoRA
 Project-URL: Homepage, https://github.com/teilomillet/textpolicy
 Project-URL: Repository, https://github.com/teilomillet/textpolicy
@@ -16,8 +16,8 @@ Requires-Python: >=3.12
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: numpy>=2.3.2
-Requires-Dist: mlx>=0.
-Requires-Dist: mlx-lm>=0.
+Requires-Dist: mlx>=0.22.0
+Requires-Dist: mlx-lm>=0.22.0
 Requires-Dist: gymnasium>=0.29.0
 Requires-Dist: psutil>=7.0.0
 Requires-Dist: wandb>=0.21.1
{textpolicy-0.1.1.dist-info → textpolicy-0.1.3.dist-info}/RECORD
CHANGED
@@ -1,10 +1,15 @@
-textpolicy/__init__.py,sha256=
+textpolicy/__init__.py,sha256=6DZdg5ZwbqyPYaGrvITONlHeAj7XwCcsxHAwfnmNnhs,1710
 textpolicy/__main__.py,sha256=IlGmjJaW-DJUC7yhxUhbwNOZA3GxkeQGkVbFdS3_wBI,136
 textpolicy/cli.py,sha256=3CcJzrRlin1pgd6Mh312Xp3-EihHtTSvhakyYpdfacs,2107
 textpolicy/validate.py,sha256=lxmegz83B_c-PS3cFHaaL3c9fgWrEaLsDLkpPFtSj8Y,3780
-textpolicy/algorithms/__init__.py,sha256=
-textpolicy/algorithms/grpo.py,sha256=
+textpolicy/algorithms/__init__.py,sha256=bAstxSa_M784I2O-MVxZVHMEF7wAdzpAmrTrNarLwlQ,2082
+textpolicy/algorithms/grpo.py,sha256=QSFLpYr3FlZPvxxelXixOMqDOYr8aO9ETHAQKWThaDo,39223
 textpolicy/algorithms/gspo.py,sha256=OWvJolldTSTEOsCIwio3ER0hTWkYsJ1e0BBJElgJ2mc,23485
+textpolicy/algorithms/length_shaping.py,sha256=SFdkiXxUEgcVc19PBUyx34wrTN26D2Vjrvr6Ptbppu0,4813
+textpolicy/analysis/__init__.py,sha256=6UiZR3PHyiukr_OODk3GXoue_vErp29kDmvudDHWqRk,739
+textpolicy/analysis/emergence_logger.py,sha256=bK1p0fmNxl6w_K4NOxqmyUXOz4qFcLqJVvDpG1M_ROI,8725
+textpolicy/analysis/planning_patterns.py,sha256=SrqdWcnOm6rZdWP6UrpXZFhejqvtcw-QeU0qmM1wXRA,3380
+textpolicy/analysis/serialization.py,sha256=JE8OuqfrJeuTVYEJKWqHfFvNR2CH0IhSbPMbd_2WSAk,1928
 textpolicy/buffer/__init__.py,sha256=bnSkX9Oe1ajau-yqC2PYNF4a4ELVP05zjlkDmIerXlw,569
 textpolicy/buffer/buffer.py,sha256=mDie8ZiWgsjNJ4LiKyfpQNLzN1K0UICxI8XaqQacUMM,7917
 textpolicy/buffer/episode.py,sha256=iNyVqeMLzOMauz1Z3fs9JUyL7g7IEC9t8GN1eypThy4,15875
@@ -20,7 +25,7 @@ textpolicy/environment/text_generation.py,sha256=Jql0pEfrPp9tqNsPOAdIP-UYoAUsfV9
 textpolicy/environment/vectorized.py,sha256=ZROtpmdbh1Oi8c0b0D_vmVzqI16Cp2WZTmkjkRbMoDg,9932
 textpolicy/generation/__init__.py,sha256=J3dc0SPAZChJTsRn47tz8FfIp3XwNgZ-8_H9VBpQYvQ,1266
 textpolicy/generation/lora.py,sha256=xSKRczJY20BrkkU1SSgBtDc30tZjdFE7FhEZPUEoiyg,13747
-textpolicy/generation/mlx_generation.py,sha256=
+textpolicy/generation/mlx_generation.py,sha256=2P2TmZj03Hbgc5YbLwLPgA1RYXYwQLwmOoluWjN_eGI,21309
 textpolicy/generation/reload.py,sha256=-eJE3LXmN-kDatUQjM0--VZp0jjqWgBslYcmNcQZ_A8,7998
 textpolicy/rewards/__init__.py,sha256=mg_wL7oedL_5KLsnaJuPVc_ZHZqZKXRHg9ws2gSifMk,4769
 textpolicy/rewards/adapters.py,sha256=Ffwi9eF_mx6DdCoRRmzl7TdhqNJycpz1TovJXa0XxXk,12843
@@ -37,10 +42,16 @@ textpolicy/rollout/rollout.py,sha256=h3gs_U-NfoIKpBVf1NFeZGInvSki8RDATsq1__ne8Qo
 textpolicy/rollout/runner.py,sha256=9bB0B1GlEGNtr8bhEYQbpY1WBzJQK0MoFrsbZTQ-Lzw,10993
 textpolicy/rollout/strategy.py,sha256=Q97wxgq-FCienL15P1l-pXYEWiUZrh861UmtStj4x3E,7577
 textpolicy/rollout/worker.py,sha256=aXOKRtkivKwDks8g8VtaWUv-wQMPR72idZxPuNtwmSE,6939
+textpolicy/tasks/__init__.py,sha256=RoZkueebtIrEIXjaHy20nzogxe0B8Pf5ZT3XIRNU4wI,195
+textpolicy/tasks/countdown/__init__.py,sha256=wtbntjIbK_4TERtAtsc7XvzNYwRwfm8l9D6XlicCxE8,626
+textpolicy/tasks/countdown/dataset.py,sha256=3Gxzf1HMp_STr20Lxh7yz_2fGtZaKCQiZUcq4iehAoI,5348
+textpolicy/tasks/countdown/evaluator.py,sha256=fZ30lukzmcWfz1F4T2XaTYJK00QhDpwLFdQC-GqF78s,5957
+textpolicy/tasks/countdown/prompt.py,sha256=7JKvzek3jQ5AkkzbaNuH7GwIOEgRd7f2gW9VVf0T53s,2639
+textpolicy/tasks/countdown/reward.py,sha256=ME_ogLrogftBPqYnPVcEqcLoRs6vtSWEuUMA8qfIeC0,1555
 textpolicy/training/__init__.py,sha256=TmcW2BqmwO4DaDDr4n2g1QOtHeVPxgw6xZdeYTmzjD8,282
 textpolicy/training/metrics.py,sha256=fmY1ZBdyEgYrfH18H3fOZ-dieMtjVNzjxjdxd7yo7OU,7582
 textpolicy/training/rollout_manager.py,sha256=ETD7WTbbaQ8uUzrHPBCDX-PawmEJfSK6Kd5N-dvIZRY,2328
-textpolicy/training/trainer.py,sha256=
+textpolicy/training/trainer.py,sha256=WOLaUqpxeiwD0tGzJWkWvY4q62NpM3FoXy30WuIxY2I,30292
 textpolicy/utils/__init__.py,sha256=v0ji-jnegGRydzmAOccKY4XC0nkBbBZqdHXzk-i6ers,1220
 textpolicy/utils/benchmarking.py,sha256=YDN24vU8SL_EsrANQWF1qbmXtfhF4Woj8yjez-h-Io0,18682
 textpolicy/utils/data.py,sha256=KJoPzYWYVAJawvDX1BHzwBZEpCXLSBC168rjud7MSB0,1413
@@ -58,9 +69,9 @@ textpolicy/utils/logging/tensorboard.py,sha256=aY9YMReSJkWEhy6SdAAUlHSB4lzDecivB
 textpolicy/utils/logging/wandb.py,sha256=U4pxuZNOz2l8XiymK8OFbCpiRTBOLNtnZakC_udttfQ,2206
 textpolicy/validation/__init__.py,sha256=KcyppNi91w0bF51gZ0ykUIKEiF7z6TT37uuavMFScnA,328
 textpolicy/validation/logprob_validation.py,sha256=G_CCy5NRDUTmo7WZIChhNVM3NtP1VmWAjdd5z6TIvos,11749
-textpolicy-0.1.
-textpolicy-0.1.
-textpolicy-0.1.
-textpolicy-0.1.
-textpolicy-0.1.
-textpolicy-0.1.
+textpolicy-0.1.3.dist-info/licenses/LICENSE,sha256=AYDHSNRbiqZt4HHH1gaOoQ2hjYjK4bqw4Vd9UyKzx18,1065
+textpolicy-0.1.3.dist-info/METADATA,sha256=1bGvyGC5E3qCqtI0XI6KTyAfpX34gvaCBJxOMHkeDj0,3895
+textpolicy-0.1.3.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+textpolicy-0.1.3.dist-info/entry_points.txt,sha256=d0Cj5boT6k_l_beVPWPt9LZMllsN4kbIUmsNsn1BANE,51
+textpolicy-0.1.3.dist-info/top_level.txt,sha256=Ww6_QEF71dI-AYCaugiGeGcgMoFAixSOszSoRsyX-E0,11
+textpolicy-0.1.3.dist-info/RECORD,,
The remaining dist-info files (WHEEL, entry_points.txt, licenses/LICENSE and top_level.txt) are unchanged apart from the directory rename: 0 additions, 0 deletions each.