textpolicy-0.1.2-py3-none-any.whl → textpolicy-0.1.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- textpolicy/__init__.py +3 -0
- textpolicy/algorithms/__init__.py +29 -4
- textpolicy/algorithms/grpo.py +771 -361
- textpolicy/algorithms/length_shaping.py +151 -0
- textpolicy/analysis/__init__.py +23 -0
- textpolicy/analysis/emergence_logger.py +248 -0
- textpolicy/analysis/planning_patterns.py +105 -0
- textpolicy/analysis/serialization.py +65 -0
- textpolicy/generation/mlx_generation.py +36 -21
- textpolicy/tasks/__init__.py +7 -0
- textpolicy/tasks/countdown/__init__.py +21 -0
- textpolicy/tasks/countdown/dataset.py +163 -0
- textpolicy/tasks/countdown/evaluator.py +197 -0
- textpolicy/tasks/countdown/prompt.py +89 -0
- textpolicy/tasks/countdown/reward.py +56 -0
- textpolicy/training/trainer.py +41 -21
- {textpolicy-0.1.2.dist-info → textpolicy-0.1.3.dist-info}/METADATA +1 -1
- {textpolicy-0.1.2.dist-info → textpolicy-0.1.3.dist-info}/RECORD +22 -11
- {textpolicy-0.1.2.dist-info → textpolicy-0.1.3.dist-info}/WHEEL +0 -0
- {textpolicy-0.1.2.dist-info → textpolicy-0.1.3.dist-info}/entry_points.txt +0 -0
- {textpolicy-0.1.2.dist-info → textpolicy-0.1.3.dist-info}/licenses/LICENSE +0 -0
- {textpolicy-0.1.2.dist-info → textpolicy-0.1.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,151 @@
+# textpolicy/algorithms/length_shaping.py
+"""
+DAPO-style soft overlong penalties and length shaping utilities.
+
+These utilities replace hard truncation with graduated penalties,
+reducing training instability from length-based confusion.
+
+References:
+    DAPO: An Open-Source LLM Reinforcement Learning System at Scale
+    https://arxiv.org/abs/2503.14476
+"""
+
+from __future__ import annotations
+
+from typing import List, Dict, Union
+
+try:
+    import mlx.core as mx  # type: ignore
+except ImportError:
+    mx = None
+
+
+def compute_length_penalty(
+    sequence_length: int,
+    max_length: int,
+    cache_length: int = 100,
+    max_penalty: float = 0.5
+) -> float:
+    """
+    Compute soft penalty for sequences approaching max length.
+
+    Instead of hard cutoffs for max sequence length (which cause truncation
+    that looks like failure to the model), use graduated penalties within
+    an interval before max_length.
+
+    This reduces training instability from length-based confusion and helps
+    the model learn to be concise without hard punishment.
+
+    Args:
+        sequence_length: Current sequence length
+        max_length: Maximum allowed sequence length
+        cache_length: Start penalizing this many tokens before max_length.
+            Must be positive.
+        max_penalty: Maximum penalty at max_length (default 0.5)
+
+    Returns:
+        Penalty value (0.0 for normal lengths, up to -max_penalty at max_length)
+
+    Example:
+        With max_length=512, cache_length=100 (threshold=412):
+        - length=400: penalty=0.0 (below threshold)
+        - length=412: penalty=0.0 (at threshold, progress=0)
+        - length=462: penalty=-0.25 (50/100 * 0.5)
+        - length=512: penalty=-0.5 (at max)
+
+    Raises:
+        ValueError: If cache_length <= 0
+    """
+    if cache_length <= 0:
+        raise ValueError(f"cache_length must be positive, got {cache_length}")
+
+    threshold = max_length - cache_length
+
+    if sequence_length < threshold:
+        return 0.0
+
+    # Linear penalty from 0 to max_penalty as we approach max
+    progress = (sequence_length - threshold) / cache_length
+    progress = min(1.0, progress)  # Clamp at 1.0
+
+    return -max_penalty * progress
+
+
+def apply_length_shaping(
+    rewards: "mx.array",
+    sequence_lengths: List[int],
+    max_length: int,
+    cache_length: int = 100,
+    max_penalty: float = 0.5
+) -> "mx.array":
+    """
+    Apply soft length penalties to rewards.
+
+    Modifies rewards by adding graduated penalties for sequences that
+    approach the maximum length. This provides a smoother learning signal
+    than hard truncation.
+
+    Args:
+        rewards: Original rewards array [batch_size]
+        sequence_lengths: List of sequence lengths for each episode
+        max_length: Maximum allowed sequence length
+        cache_length: Start penalizing this many tokens before max_length
+        max_penalty: Maximum penalty at max_length
+
+    Returns:
+        Rewards with length penalties applied
+
+    Example:
+        >>> rewards = mx.array([1.0, 0.5, 0.0])
+        >>> lengths = [400, 500, 520]  # max_length=512, cache_length=100
+        >>> shaped = apply_length_shaping(rewards, lengths, 512)
+        >>> # shaped ≈ [1.0, 0.06, -0.5]  # last one gets max penalty
+    """
+    penalties = mx.array([
+        compute_length_penalty(length, max_length, cache_length, max_penalty)
+        for length in sequence_lengths
+    ], dtype=mx.float32)
+
+    return rewards + penalties
+
+
+def compute_length_shaping_stats(
+    sequence_lengths: List[int],
+    max_length: int,
+    cache_length: int = 100
+) -> Dict[str, Union[int, float]]:
+    """
+    Compute statistics about length penalties for monitoring.
+
+    Args:
+        sequence_lengths: List of sequence lengths
+        max_length: Maximum allowed sequence length
+        cache_length: Penalty threshold offset
+
+    Returns:
+        Dictionary with length penalty statistics:
+        - mean_length: Average sequence length
+        - max_length_observed: Maximum observed sequence length
+        - truncation_rate: Fraction of sequences at or past max_length
+        - penalty_zone_rate: Fraction of sequences in penalty zone
+    """
+    threshold = max_length - cache_length
+    total = len(sequence_lengths)
+
+    if total == 0:
+        return {
+            'mean_length': 0.0,
+            'max_length_observed': 0,
+            'truncation_rate': 0.0,
+            'penalty_zone_rate': 0.0,
+        }
+
+    truncated = sum(1 for l in sequence_lengths if l >= max_length)
+    in_penalty_zone = sum(1 for l in sequence_lengths if threshold <= l < max_length)
+
+    return {
+        'mean_length': sum(sequence_lengths) / total,
+        'max_length_observed': max(sequence_lengths),
+        'truncation_rate': truncated / total,
+        'penalty_zone_rate': in_penalty_zone / total,
+    }
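Usage sketch (editorial, not part of the diff): assuming MLX is installed, the new shaping utilities compose as below. The reward values and lengths are illustrative; the expected outputs follow from the docstring example above.

    import mlx.core as mx
    from textpolicy.algorithms.length_shaping import (
        apply_length_shaping,
        compute_length_shaping_stats,
    )

    rewards = mx.array([1.0, 0.5, 0.0])   # raw per-episode rewards
    lengths = [400, 500, 520]             # generated sequence lengths
    shaped = apply_length_shaping(rewards, lengths, max_length=512)
    # shaped ≈ [1.0, 0.06, -0.5]: only lengths inside the final
    # cache_length=100 tokens before max_length are penalized.

    stats = compute_length_shaping_stats(lengths, max_length=512)
    print(stats["penalty_zone_rate"], stats["truncation_rate"])  # 0.333..., 0.333...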
@@ -0,0 +1,23 @@
+# textpolicy/analysis/__init__.py
+"""
+Post-hoc analysis tooling for TextPolicy training runs.
+
+Main components:
+- EmergenceLogger: Captures all generations during GRPO training
+- PlanningPatternDetector: Configurable planning-phrase detection
+- PlanningPatternConfig: Pattern configuration dataclass
+- StreamingJSONLWriter: Append-only JSONL writer
+- to_json_safe: MLX/numpy → JSON-native conversion
+"""
+
+from .emergence_logger import EmergenceLogger
+from .planning_patterns import PlanningPatternConfig, PlanningPatternDetector
+from .serialization import StreamingJSONLWriter, to_json_safe
+
+__all__ = [
+    "EmergenceLogger",
+    "PlanningPatternDetector",
+    "PlanningPatternConfig",
+    "StreamingJSONLWriter",
+    "to_json_safe",
+]
@@ -0,0 +1,248 @@
+# textpolicy/analysis/emergence_logger.py
+"""
+Generation logging for emergence analysis during GRPO training.
+
+Captures every generation produced during training and writes two JSONL
+streams: per-generation records (``generations.jsonl``) and per-step
+aggregate statistics (``steps.jsonl``).
+"""
+
+import time
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Optional, Union
+
+from .planning_patterns import PlanningPatternConfig, PlanningPatternDetector
+from .serialization import StreamingJSONLWriter, to_json_safe
+
+
+def _flatten(value: Any) -> list:
+    """Flatten an MLX array, Python list, or scalar to a plain Python list."""
+    if hasattr(value, "tolist"):
+        result = value.tolist()
+        if isinstance(result, list):
+            return result
+        return [result]
+    if isinstance(value, list):
+        return value
+    return [value]
+
+
+def _default_metadata_extractor(
+    example: Optional[dict],
+    reward: float,
+) -> dict:
+    """Extract countdown-task metadata from an example dict.
+
+    Returns ``target``, ``numbers``, and ``correctness`` (reward >= 0.99).
+    """
+    if example is None:
+        return {}
+    meta: Dict[str, Any] = {}
+    if "target" in example:
+        meta["target"] = to_json_safe(example["target"])
+    if "numbers" in example:
+        meta["numbers"] = to_json_safe(example["numbers"])
+    meta["correctness"] = reward >= 0.99
+    return meta
+
+
+class EmergenceLogger:
+    """Logs every generation during training for post-hoc emergence analysis.
+
+    Writes two JSONL files under *output_dir*:
+
+    * ``generations.jsonl`` — one record per generation
+    * ``steps.jsonl`` — one record per training step (aggregated stats)
+
+    Args:
+        output_dir: Directory for JSONL output files (created if needed).
+        planning_config: Optional :class:`PlanningPatternConfig`.
+        metadata_extractor: Optional callable ``(example, reward) -> dict``.
+            Defaults to countdown-task extractor.
+    """
+
+    def __init__(
+        self,
+        output_dir: Union[str, Path],
+        planning_config: Optional[PlanningPatternConfig] = None,
+        metadata_extractor: Optional[Callable] = None,
+    ) -> None:
+        self._output_dir = Path(output_dir)
+        self._output_dir.mkdir(parents=True, exist_ok=True)
+
+        self._gen_writer = StreamingJSONLWriter(self._output_dir / "generations.jsonl")
+        self._step_writer = StreamingJSONLWriter(self._output_dir / "steps.jsonl")
+
+        self._detector = PlanningPatternDetector(planning_config)
+        self._extract_metadata = metadata_extractor or _default_metadata_extractor
+
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+
+    def log_step(
+        self,
+        step: int,
+        episodes: list,
+        tokenizer: Any,
+        examples: Optional[list] = None,
+    ) -> dict:
+        """Log all generations for a single training step.
+
+        Args:
+            step: Current training step index.
+            episodes: List of :class:`Episode` objects (or dicts with the
+                same fields: ``obs``, ``act``, ``rew``, ``logprob``).
+            tokenizer: Tokenizer with a ``decode`` method.
+            examples: Optional parallel list of example dicts (same length
+                as *episodes*). Used by the metadata extractor.
+
+        Returns:
+            Aggregated step statistics dict (also written to ``steps.jsonl``).
+        """
+        t0 = time.perf_counter()
+
+        rewards: List[float] = []
+        completion_lengths: List[int] = []
+        planning_ratios: List[float] = []
+        entropy_values: List[float] = []
+        correct_count = 0
+
+        for idx, ep in enumerate(episodes):
+            record = self._process_episode(
+                step=step,
+                episode=ep,
+                tokenizer=tokenizer,
+                example=examples[idx] if examples and idx < len(examples) else None,
+            )
+            self._gen_writer.write(record)
+
+            # Accumulate for step aggregate
+            rewards.append(record["reward"])
+            completion_lengths.append(len(record["tokens"]))
+            planning_ratios.append(record["planning_token_ratio"])
+            if record["entropy_per_token"]:
+                entropy_values.extend(record["entropy_per_token"])
+            if record.get("metadata", {}).get("correctness", False):
+                correct_count += 1
+
+        elapsed_ms = (time.perf_counter() - t0) * 1000.0
+        total = len(episodes)
+
+        step_record = self._build_step_record(
+            step=step,
+            rewards=rewards,
+            completion_lengths=completion_lengths,
+            planning_ratios=planning_ratios,
+            entropy_values=entropy_values,
+            correct_count=correct_count,
+            total_count=total,
+            elapsed_ms=elapsed_ms,
+        )
+        self._step_writer.write(step_record)
+        return step_record
+
+    def finish(self) -> None:
+        """Close underlying file handles."""
+        self._gen_writer.close()
+        self._step_writer.close()
+
+    # ------------------------------------------------------------------
+    # Internal helpers
+    # ------------------------------------------------------------------
+
+    def _process_episode(
+        self,
+        step: int,
+        episode: Any,
+        tokenizer: Any,
+        example: Optional[dict],
+    ) -> dict:
+        """Build a per-generation record from a single episode."""
+        # Support both Episode objects and plain dicts.
+        # Use isinstance check instead of `or` to avoid falsy empty-list fallthrough.
+        if isinstance(episode, dict):
+            obs = episode.get("obs", [])
+            act = episode.get("act", [])
+            rew = episode.get("rew", [])
+            logprob_raw = episode.get("logprob")
+        else:
+            obs = episode.obs
+            act = episode.act
+            rew = episode.rew
+            logprob_raw = episode.logprob
+
+        # Flatten to plain Python lists
+        prompt_tokens = _flatten(obs[0]) if obs else []
+        completion_tokens = _flatten(act[0]) if act else []
+        reward_val = float(_flatten(rew)[0]) if rew else 0.0
+
+        # Logprobs (may be None)
+        logprobs: List[float] = []
+        if logprob_raw is not None and len(logprob_raw) > 0:
+            logprobs = [float(v) for v in _flatten(logprob_raw[0])]
+
+        # Entropy proxy: -logprob per token
+        entropy_per_token = [-lp for lp in logprobs] if logprobs else []
+
+        # Decode text for pattern detection
+        prompt_text = tokenizer.decode(prompt_tokens) if prompt_tokens else ""
+        completion_text = tokenizer.decode(completion_tokens) if completion_tokens else ""
+
+        # Planning pattern detection
+        planning_phrases = self._detector.detect(completion_text)
+        planning_ratio = self._detector.planning_token_ratio(
+            completion_text, len(completion_tokens)
+        )
+
+        # Metadata
+        metadata = self._extract_metadata(example, reward_val)
+
+        return {
+            "step": step,
+            "prompt": prompt_text,
+            "completion": completion_text,
+            "reward": reward_val,
+            "tokens": completion_tokens,
+            "logprobs": logprobs,
+            "entropy_per_token": entropy_per_token,
+            "planning_phrases_found": planning_phrases,
+            "planning_token_ratio": planning_ratio,
+            "metadata": metadata,
+        }
+
+    @staticmethod
+    def _build_step_record(
+        step: int,
+        rewards: List[float],
+        completion_lengths: List[int],
+        planning_ratios: List[float],
+        entropy_values: List[float],
+        correct_count: int,
+        total_count: int,
+        elapsed_ms: float,
+    ) -> dict:
+        """Compute aggregate statistics for a training step."""
+        import math

+        def _mean(xs: list) -> float:
+            return sum(xs) / len(xs) if xs else 0.0
+
+        def _std(xs: list) -> float:
+            if len(xs) < 2:
+                return 0.0
+            m = _mean(xs)
+            return math.sqrt(sum((x - m) ** 2 for x in xs) / len(xs))
+
+        return {
+            "step": step,
+            "mean_reward": _mean(rewards),
+            "std_reward": _std(rewards),
+            "mean_completion_length": _mean([float(l) for l in completion_lengths]),
+            "planning_token_ratio": _mean(planning_ratios),
+            "entropy_mean": _mean(entropy_values),
+            "entropy_std": _std(entropy_values),
+            "correct_count": correct_count,
+            "total_count": total_count,
+            "logging_overhead_ms": round(elapsed_ms, 2),
+        }
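Usage sketch (editorial, not part of the diff): how the logger could be wired into a training loop. The toy tokenizer and hand-built episode dict are stand-ins for textpolicy's real objects; any tokenizer with a decode method and any episode carrying obs/act/rew/logprob fields works.

    from textpolicy.analysis import EmergenceLogger

    class ToyTokenizer:
        def decode(self, token_ids):
            return " ".join(str(t) for t in token_ids)

    logger = EmergenceLogger("runs/emergence_demo")
    episode = {
        "obs": [[101, 102, 103]],      # prompt token ids
        "act": [[201, 202]],           # completion token ids
        "rew": [1.0],                  # episode reward
        "logprob": [[-0.25, -0.80]],   # per-token logprobs
    }
    step_stats = logger.log_step(step=0, episodes=[episode], tokenizer=ToyTokenizer())
    print(step_stats["mean_reward"], step_stats["mean_completion_length"])  # 1.0 2.0
    logger.finish()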
@@ -0,0 +1,105 @@
+# textpolicy/analysis/planning_patterns.py
+"""
+Planning pattern detection for emergence analysis.
+
+Provides configurable pattern matching to identify reasoning behaviors
+(hesitation, verification, backtracking, etc.) in model generations.
+"""
+
+import re
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional
+
+
+@dataclass
+class PlanningPatternConfig:
+    """Configuration for planning pattern detection.
+
+    Each category maps to a list of literal phrases. The detector builds
+    a single compiled regex from all phrases for efficient matching.
+    """
+
+    hesitation: List[str] = field(
+        default_factory=lambda: [
+            "wait",
+            "hmm",
+            "actually",
+            "let me think",
+            "on second thought",
+        ]
+    )
+    verification: List[str] = field(
+        default_factory=lambda: [
+            "let me check",
+            "verify",
+            "double check",
+            "is this right",
+        ]
+    )
+    backtracking: List[str] = field(
+        default_factory=lambda: [
+            "try another",
+            "different approach",
+            "go back",
+            "start over",
+        ]
+    )
+    alternatives: List[str] = field(
+        default_factory=lambda: [
+            "alternatively",
+            "or we could",
+            "another way",
+        ]
+    )
+    metacognition: List[str] = field(
+        default_factory=lambda: [
+            "notice that",
+            "the key is",
+            "importantly",
+        ]
+    )
+    case_sensitive: bool = False
+
+    @property
+    def all_patterns(self) -> List[str]:
+        """Return a flat list of all patterns across every category."""
+        patterns: List[str] = []
+        for cat in ("hesitation", "verification", "backtracking",
+                    "alternatives", "metacognition"):
+            patterns.extend(getattr(self, cat))
+        return patterns
+
+
+class PlanningPatternDetector:
+    """Efficient planning-phrase detector using a single compiled regex.
+
+    Args:
+        config: Optional pattern configuration. Uses defaults if *None*.
+    """
+
+    def __init__(self, config: Optional[PlanningPatternConfig] = None) -> None:
+        self.config = config or PlanningPatternConfig()
+        flags = 0 if self.config.case_sensitive else re.IGNORECASE
+        # Sort longest-first so greedy alternation prefers longer matches
+        patterns = sorted(self.config.all_patterns, key=len, reverse=True)
+        escaped = [re.escape(p) for p in patterns]
+        # Guard against empty pattern list — an empty regex matches every position
+        self._regex = re.compile("|".join(escaped), flags) if escaped else None
+
+    def detect(self, text: str) -> List[str]:
+        """Return all matched planning phrases found in *text*."""
+        if not text or self._regex is None:
+            return []
+        return [m.group() for m in self._regex.finditer(text)]
+
+    def planning_token_ratio(self, text: str, total_tokens: int) -> float:
+        """Ratio of planning-phrase words to *total_tokens*.
+
+        Uses whitespace word count of matched phrases as numerator.
+        Returns 0.0 when *total_tokens* is zero.
+        """
+        if total_tokens == 0:
+            return 0.0
+        matches = self.detect(text)
+        planning_words = sum(len(m.split()) for m in matches)
+        return planning_words / total_tokens
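Usage sketch (editorial, not part of the diff): the detector also works standalone. The completion text is invented; note the config must be customized before constructing the detector, since the regex is compiled in __init__.

    from textpolicy.analysis import PlanningPatternConfig, PlanningPatternDetector

    config = PlanningPatternConfig()
    config.verification.append("sanity check")  # extend a category before compiling
    detector = PlanningPatternDetector(config)

    text = "Wait, let me check that. Alternatively, try another path."
    print(detector.detect(text))
    # ['Wait', 'let me check', 'Alternatively', 'try another']
    print(detector.planning_token_ratio(text, total_tokens=14))
    # 7 planning-phrase words / 14 tokens = 0.5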
@@ -0,0 +1,65 @@
+# textpolicy/analysis/serialization.py
+"""
+JSON-safe conversion utilities and streaming JSONL writer.
+
+Handles MLX arrays, numpy scalars, and nested structures for
+serialization to JSONL format used by EmergenceLogger.
+"""
+
+import json
+from pathlib import Path
+from typing import Any, Union
+
+
+def to_json_safe(obj: Any) -> Any:
+    """Recursively convert MLX arrays, numpy scalars, etc. to JSON-native types.
+
+    Args:
+        obj: Any Python object that may contain MLX arrays or numpy types.
+
+    Returns:
+        JSON-serializable equivalent.
+    """
+    # MLX array → list
+    if hasattr(obj, "tolist") and callable(obj.tolist):
+        return obj.tolist()
+
+    # numpy scalar → Python scalar
+    if hasattr(obj, "item") and callable(obj.item):
+        return obj.item()
+
+    if isinstance(obj, dict):
+        return {k: to_json_safe(v) for k, v in obj.items()}
+
+    if isinstance(obj, (list, tuple)):
+        return [to_json_safe(v) for v in obj]
+
+    # int, float, str, bool, None pass through
+    return obj
+
+
+class StreamingJSONLWriter:
+    """Append-only JSONL writer with lazy file open and compact serialization.
+
+    Args:
+        path: Destination file path. Parent directories are created on first write.
+    """
+
+    def __init__(self, path: Union[str, Path]) -> None:
+        self._path = Path(path)
+        self._file = None
+
+    def write(self, record: dict) -> None:
+        """Serialize *record* as one compact JSON line, then flush."""
+        if self._file is None:
+            self._path.parent.mkdir(parents=True, exist_ok=True)
+            self._file = open(self._path, "a")
+        line = json.dumps(to_json_safe(record), separators=(",", ":"))
+        self._file.write(line + "\n")
+        self._file.flush()
+
+    def close(self) -> None:
+        """Close the underlying file handle (idempotent)."""
+        if self._file is not None:
+            self._file.close()
+            self._file = None
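Usage sketch (editorial, not part of the diff): the serialization pair in isolation, assuming MLX is installed; the record fields are invented.

    import mlx.core as mx
    from textpolicy.analysis import StreamingJSONLWriter, to_json_safe

    # to_json_safe converts MLX arrays and tuples to JSON-native lists
    print(to_json_safe({"r": mx.array([1.0, 0.5]), "pair": (1, 2)}))
    # {'r': [1.0, 0.5], 'pair': [1, 2]}

    writer = StreamingJSONLWriter("runs/demo/records.jsonl")
    writer.write({"step": 0, "rewards": mx.array([1.0, 0.5])})  # flushed immediately
    writer.close()  # safe to call more than once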