PyPI - textpolicy - Versions diffs - 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl - Mend

textpolicy 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

textpolicy/__init__.py +3 -0
textpolicy/algorithms/__init__.py +29 -4
textpolicy/algorithms/grpo.py +771 -361
textpolicy/algorithms/length_shaping.py +151 -0
textpolicy/analysis/__init__.py +23 -0
textpolicy/analysis/emergence_logger.py +248 -0
textpolicy/analysis/planning_patterns.py +105 -0
textpolicy/analysis/serialization.py +65 -0
textpolicy/generation/mlx_generation.py +36 -21
textpolicy/tasks/__init__.py +7 -0
textpolicy/tasks/countdown/__init__.py +21 -0
textpolicy/tasks/countdown/dataset.py +163 -0
textpolicy/tasks/countdown/evaluator.py +197 -0
textpolicy/tasks/countdown/prompt.py +89 -0
textpolicy/tasks/countdown/reward.py +56 -0
textpolicy/training/trainer.py +41 -21
{textpolicy-0.1.2.dist-info → textpolicy-0.1.3.dist-info}/METADATA +1 -1
{textpolicy-0.1.2.dist-info → textpolicy-0.1.3.dist-info}/RECORD +22 -11
{textpolicy-0.1.2.dist-info → textpolicy-0.1.3.dist-info}/WHEEL +0 -0
{textpolicy-0.1.2.dist-info → textpolicy-0.1.3.dist-info}/entry_points.txt +0 -0
{textpolicy-0.1.2.dist-info → textpolicy-0.1.3.dist-info}/licenses/LICENSE +0 -0
{textpolicy-0.1.2.dist-info → textpolicy-0.1.3.dist-info}/top_level.txt +0 -0

textpolicy/generation/mlx_generation.py CHANGED Viewed

@@ -27,8 +27,8 @@ try:
     from mlx_lm.sample_utils import make_sampler, make_logits_processors
 # sampling utilities fallback when sample_utils is unavailable
 except ImportError:
-    _make_sampler = None
-    _make_logits_processors = None
+    make_sampler = None
+    make_logits_processors = None
 def _get_eos_configs_for_model(
@@ -194,7 +194,9 @@ def generate_tokens(
         return _simple_generate(model, prompt_tokens, max_tokens, temperature)
     prompt_list = prompt_tokens.tolist()
+    response_token_list: list = []
+    response_logprob_list: list = []
     # Use stream_generate instead of generate to get proper EOS token handling
     # This is the core fix - stream_generate respects EOS tokens, generate() does not
     try:
@@ -216,19 +218,20 @@ def generate_tokens(
             logits_processors=logits_processors,
         ))
-        # Extract tokens from response segments and detect natural EOS stopping
-        response_token_list = []
+        # Extract tokens and logprobs from response segments
         for segment in response_segments:
             response_token_list.append(segment.token)
+            # Capture per-token logprob inline to avoid a redundant forward pass
+            if segment.logprobs is not None:
+                response_logprob_list.append(float(segment.logprobs[segment.token]))
             # Check if this segment indicates natural stopping (EOS token)
             if hasattr(segment, 'finish_reason') and segment.finish_reason == "stop":
                 break
         # Convert to MLX array
         response_tokens = mx.array(response_token_list) if response_token_list else mx.array([])
     except ImportError:
         # Fallback to original generate method if stream_generate unavailable
         print("WARNING: stream_generate not available, using fallback generate method")
@@ -248,8 +251,12 @@ def generate_tokens(
         response_tokens = _extract_response_tokens(response, prompt_list, tokenizer)
-    # Compute logprobs for the response tokens
-    logprobs = compute_logprobs(model, prompt_tokens, response_tokens)
+    # Use inline logprobs captured during generation when available,
+    # falling back to a full forward pass only if logprobs were missing.
+    if response_logprob_list and len(response_logprob_list) == len(response_token_list):
+        logprobs = mx.array(response_logprob_list)
+    else:
+        logprobs = compute_logprobs(model, prompt_tokens, response_tokens)
     return response_tokens, {'logprob': logprobs}
@@ -292,35 +299,43 @@ def _simple_generate(
     """
     current_tokens = prompt_tokens
     generated = []
+    generated_logprobs = []
     for _ in range(max_tokens):
         # Model forward pass
         logits = model(current_tokens[None])  # Add batch dimension
         next_token_logits = logits[0, -1, :]  # Last token logits
         # Temperature scaling
         if temperature > 0:
             scaled_logits = next_token_logits / temperature
         else:
             scaled_logits = next_token_logits
         # Sample next token
         probs = mx.softmax(scaled_logits)
         next_token = mx.random.categorical(probs[None])[0]
+        # Capture logprob inline: log_softmax of the *unscaled* logits at the selected token
+        log_probs = next_token_logits - mx.logsumexp(next_token_logits)
+        generated_logprobs.append(float(log_probs[next_token]))
         # Add to sequence
         generated.append(next_token)
         current_tokens = mx.concatenate([current_tokens, next_token[None]])
         # Stop on EOS (approximate) - avoid .item() calls
         if len(generated) > 5 and next_token < 5:  # Simple stop condition
             break
     response_tokens = mx.array(generated) if generated else mx.array([2])
-    # Compute simple logprobs
-    logprobs = compute_logprobs(model, prompt_tokens, response_tokens)
+    # Use inline logprobs captured during generation (avoids redundant forward pass)
+    if generated_logprobs and len(generated_logprobs) == len(generated):
+        logprobs = mx.array(generated_logprobs)
+    else:
+        logprobs = compute_logprobs(model, prompt_tokens, response_tokens)
     return response_tokens, {'logprob': logprobs}

textpolicy/tasks/__init__.py ADDED Viewed

@@ -0,0 +1,7 @@
+"""
+Task implementations for TextPolicy.
+Importing this package triggers auto-registration of task reward functions.
+"""
+from . import countdown  # noqa: F401 — triggers @reward registration

textpolicy/tasks/countdown/__init__.py ADDED Viewed

@@ -0,0 +1,21 @@
+"""
+Countdown Numbers Game task for TextPolicy.
+Importing this module registers the 'countdown' reward function.
+"""
+from .evaluator import ExpressionError, EvalResult, evaluate_expression
+from .prompt import format_countdown_prompt, extract_expression_from_completion
+from .reward import countdown_reward
+from .dataset import generate_countdown_problems, load_countdown_dataset
+__all__ = [
+    "ExpressionError",
+    "EvalResult",
+    "evaluate_expression",
+    "format_countdown_prompt",
+    "extract_expression_from_completion",
+    "countdown_reward",
+    "generate_countdown_problems",
+    "load_countdown_dataset",
+]

textpolicy/tasks/countdown/dataset.py ADDED Viewed

@@ -0,0 +1,163 @@
+"""
+Problem generation and HuggingFace dataset loading for the Countdown task.
+"""
+import ast
+import itertools
+import random
+from typing import Dict, List, Optional, Tuple
+def generate_countdown_problems(
+    num_problems: int,
+    num_numbers: int = 4,
+    number_range: Tuple[int, int] = (1, 25),
+    target_range: Tuple[int, int] = (10, 100),
+    ensure_solvable: bool = True,
+    seed: Optional[int] = None,
+    max_attempts: Optional[int] = None,
+) -> List[Dict]:
+    """
+    Build a list of randomly drawn Countdown Numbers Game problems.
+    Every candidate draws `num_numbers` integers from `number_range` and then
+    one target from `target_range`, all from a private `random.Random(seed)`,
+    so the same seed and arguments reproduce the same list.
+    Args:
+        num_problems: How many problems to return.
+        num_numbers: How many numbers each problem offers (3 or 4 recommended).
+        number_range: Inclusive (min, max) bounds for the offered numbers.
+        target_range: Inclusive (min, max) bounds for the target.
+        ensure_solvable: When True, candidates rejected by `_is_solvable`
+                         are discarded.
+        seed: Seed for the random generator.
+        max_attempts: Cap on the number of candidates drawn. When None it is
+                      num_problems * 100 if ensure_solvable is True, otherwise
+                      num_problems.
+    Returns:
+        List of dicts with keys 'target' and 'numbers'.
+    Raises:
+        RuntimeError: If the attempt cap is reached before enough problems
+                      have been accepted.
+    """
+    rng = random.Random(seed)
+    if max_attempts is None:
+        # Unfiltered candidates are always accepted, so one try each suffices.
+        max_attempts = num_problems * 100 if ensure_solvable else num_problems
+    problems: List[Dict] = []
+    tried = 0
+    while True:
+        # Success is checked before the budget, so asking for zero problems
+        # never raises even with an empty budget.
+        if len(problems) >= num_problems:
+            return problems
+        if tried >= max_attempts:
+            raise RuntimeError(
+                f"Could not generate {num_problems} problems within "
+                f"{max_attempts} attempts (got {len(problems)}). "
+                f"Try wider number_range/target_range or increase max_attempts."
+            )
+        # Draw order (numbers first, then target) fixes the seeded sequence.
+        drawn = [rng.randint(*number_range) for _ in range(num_numbers)]
+        goal = rng.randint(*target_range)
+        tried += 1
+        if not ensure_solvable or _is_solvable(drawn, goal):
+            problems.append({"target": goal, "numbers": drawn})
+def load_countdown_dataset(
+    split: str = "train",
+    max_examples: Optional[int] = None,
+) -> List[Dict]:
+    """
+    Load the Countdown task dataset from HuggingFace.
+    Requires the `datasets` library (optional dependency). Rows are read from
+    the "Jiayi-Pan/Countdown-Tasks-3to4" dataset; the numbers are taken from
+    the 'nums' field, falling back to 'numbers', and rows missing either the
+    target or the numbers are skipped.
+    Args:
+        split: Dataset split to load ('train', 'test', etc.).
+        max_examples: Maximum number of examples to return. None means no
+                      limit; zero or a negative value returns an empty list.
+    Returns:
+        List of dicts with keys 'target' (int) and 'numbers' (list).
+    Raises:
+        ImportError: If the `datasets` library is not installed.
+    """
+    try:
+        from datasets import load_dataset
+    except ImportError as exc:
+        # Chain the original error so the real import failure stays visible.
+        raise ImportError(
+            "The 'datasets' library is required to load HuggingFace datasets. "
+            "Install it with: pip install datasets"
+        ) from exc
+    # The limit used to be checked only after the first append, so a limit of
+    # zero still produced one example. Answer before fetching anything.
+    if max_examples is not None and max_examples <= 0:
+        return []
+    ds = load_dataset("Jiayi-Pan/Countdown-Tasks-3to4", split=split)
+    examples: List[Dict] = []
+    for item in ds:
+        target = item.get("target")
+        numbers = item.get("nums") or item.get("numbers")
+        if target is None or numbers is None:
+            continue
+        if isinstance(numbers, str):
+            # Some rows may store the list as its repr; literal_eval parses
+            # it without executing code.
+            numbers = ast.literal_eval(numbers)
+        examples.append({"target": int(target), "numbers": list(numbers)})
+        if max_examples is not None and len(examples) >= max_examples:
+            break
+    return examples
+# ---------------------------------------------------------------------------
+# Brute-force solvability check
+# ---------------------------------------------------------------------------
+# Order-insensitive operations: applying them to one ordering of a pair is enough.
+_COMMUTATIVE_OPS = [
+    lambda left, right: left + right,
+    lambda left, right: left * right,
+]
+# Order-sensitive operations: callers must apply them to both orderings.
+# Division reports None instead of raising when the divisor is zero.
+_NON_COMMUTATIVE_OPS = [
+    lambda left, right: left - right,
+    lambda left, right: None if right == 0 else left / right,
+]
+def _is_solvable(numbers: List[int], target: int) -> bool:
+    """Return True when `target` can be reached from `numbers`.
+    The inputs are converted to floats and passed to the recursive `_solve`
+    search, which may use any subset of the numbers.
+    """
+    as_floats = [float(n) for n in numbers]
+    return _solve(as_floats, float(target))
+def _solve(nums: List[float], target: float) -> bool:
+    """Exhaustively search for a way to combine `nums` into `target`.
+    A value already within 1e-9 of the target counts as solved, which is what
+    allows solutions that use only a subset of the numbers. Otherwise every
+    unordered pair is replaced by the result of one operation and the search
+    recurses on the shorter list. Order-sensitive operations (-, /) are tried
+    with both operand orders.
+    """
+    # Base case: some value in the current list already hits the target.
+    if any(abs(value - target) < 1e-9 for value in nums):
+        return True
+    if len(nums) < 2:
+        return False
+    # combinations() yields index pairs (i, j) with i < j.
+    for i, j in itertools.combinations(range(len(nums)), 2):
+        a, b = nums[i], nums[j]
+        rest = [value for k, value in enumerate(nums) if k != i and k != j]
+        for op in _COMMUTATIVE_OPS:
+            if _solve(rest + [op(a, b)], target):
+                return True
+        for op in _NON_COMMUTATIVE_OPS:
+            for x, y in ((a, b), (b, a)):
+                outcome = op(x, y)
+                # None marks an undefined result (division by zero).
+                if outcome is not None and _solve(rest + [outcome], target):
+                    return True
+    return False

textpolicy/tasks/countdown/evaluator.py ADDED Viewed

@@ -0,0 +1,197 @@
+"""
+Safe arithmetic expression evaluator using recursive descent parsing.
+No eval(), no ast module. Handles +, -, *, /, parentheses, integers only.
+"""
+import re
+from dataclasses import dataclass, field
+from typing import List, Optional
+class ExpressionError(Exception):
+    """Signals that an expression is malformed or cannot be evaluated."""
+@dataclass
+class EvalResult:
+    """Result of evaluating an arithmetic expression.
+    Returned by `evaluate_expression`; bundles the computed value with the
+    integer literals the expression contained.
+    """
+    # Numeric value of the whole expression.
+    value: float
+    # Integer literals found while parsing, in parse order, duplicates kept.
+    numbers_used: List[int] = field(default_factory=list)
+# Allowed characters in expressions: ASCII digits, whitespace, the four
+# operators and parentheses. evaluate_expression rejects anything else
+# before tokenizing.
+_ALLOWED_CHARS = re.compile(r'^[0-9\s+\-*/()]+$')
+def evaluate_expression(
+    expression: str, available_numbers: Optional[List[int]] = None
+) -> EvalResult:
+    """
+    Safely evaluate an arithmetic expression using recursive descent parsing.
+    The input is stripped, checked against the allowed-character whitelist,
+    tokenized, and parsed. Every failure, including pathological input such as
+    extreme nesting or oversized literals, is reported as ExpressionError so
+    callers need to catch only that one type.
+    Args:
+        expression: Arithmetic expression string (e.g. "(2+3)*4")
+        available_numbers: If provided, validates that only these numbers are used
+                          (each at most once).
+    Returns:
+        EvalResult with the computed value and list of numbers used.
+    Raises:
+        ExpressionError: On syntax errors, division by zero, disallowed chars,
+                        number reuse/unavailability, nesting deeper than the
+                        interpreter's recursion limit, or number literals too
+                        large to evaluate.
+    """
+    if not expression or not expression.strip():
+        raise ExpressionError("Empty expression")
+    expr = expression.strip()
+    if not _ALLOWED_CHARS.match(expr):
+        raise ExpressionError(
+            f"Expression contains disallowed characters: {expr!r}"
+        )
+    tokens = _tokenize(expr)
+    if not tokens:
+        raise ExpressionError("Empty expression after tokenization")
+    parser = _Parser(tokens)
+    try:
+        value = parser.parse_expression()
+    except RecursionError:
+        # The parser recurses once per nesting level, so input like
+        # thousands of "(" would otherwise escape as RecursionError.
+        raise ExpressionError("Expression is nested too deeply") from None
+    except (OverflowError, ValueError):
+        # Raised when an integer literal cannot be converted to int/float.
+        raise ExpressionError(
+            "Expression contains a number literal that is too large"
+        ) from None
+    if parser.pos < len(parser.tokens):
+        # Leftover tokens mean the input was not a single expression.
+        raise ExpressionError(
+            f"Unexpected token after end of expression: "
+            f"{parser.tokens[parser.pos]!r}"
+        )
+    numbers_used = parser.numbers_used
+    if available_numbers is not None:
+        _validate_numbers(numbers_used, available_numbers)
+    return EvalResult(value=value, numbers_used=numbers_used)
+def _tokenize(expr: str) -> List[str]:
+    """Split an expression into number, operator, and parenthesis tokens.
+    Whitespace separates tokens and is dropped. A run of consecutive digits
+    becomes one token. Any other character raises ExpressionError.
+    """
+    tokens: List[str] = []
+    pending: List[str] = []  # digits of the number currently being read
+    for ch in expr:
+        if ch.isdigit():
+            pending.append(ch)
+            continue
+        # Any non-digit ends the number in progress.
+        if pending:
+            tokens.append("".join(pending))
+            pending = []
+        if ch.isspace():
+            continue
+        if ch in '+-*/()':
+            tokens.append(ch)
+        else:
+            raise ExpressionError(f"Unexpected character: {ch!r}")
+    if pending:
+        tokens.append("".join(pending))
+    return tokens
+class _Parser:
+    """Recursive descent parser for arithmetic expressions.
+    Grammar:
+        expression := term (('+' | '-') term)*
+        term       := factor (('*' | '/') factor)*
+        factor     := NUMBER | '(' expression ')' | ('+' | '-') factor
+    The parser walks a token list left to right. Each integer literal it
+    consumes is appended to `numbers_used`, and `pos` is left at the index of
+    the first token that was not consumed.
+    Raises (from the parse methods):
+        ExpressionError: On malformed input, division by zero, or a number
+            literal that cannot be converted to a float.
+    """
+    def __init__(self, tokens: List[str]):
+        self.tokens = tokens
+        self.pos = 0
+        self.numbers_used: List[int] = []
+    def _peek(self) -> Optional[str]:
+        """Return the current token without consuming it, or None at the end."""
+        if self.pos < len(self.tokens):
+            return self.tokens[self.pos]
+        return None
+    def _consume(self) -> str:
+        """Return the current token and advance past it."""
+        token = self.tokens[self.pos]
+        self.pos += 1
+        return token
+    def parse_expression(self) -> float:
+        """Parse an expression: term (('+' | '-') term)*"""
+        result = self._parse_term()
+        while self._peek() in ('+', '-'):
+            op = self._consume()
+            right = self._parse_term()
+            if op == '+':
+                result = result + right
+            else:
+                result = result - right
+        return result
+    def _parse_term(self) -> float:
+        """Parse a term: factor (('*' | '/') factor)*"""
+        result = self._parse_factor()
+        while self._peek() in ('*', '/'):
+            op = self._consume()
+            right = self._parse_factor()
+            if op == '*':
+                result = result * right
+            else:
+                if right == 0:
+                    raise ExpressionError("Division by zero")
+                result = result / right
+        return result
+    def _parse_factor(self) -> float:
+        """Parse a factor: NUMBER | '(' expression ')' | unary +/-"""
+        token = self._peek()
+        if token is None:
+            raise ExpressionError("Unexpected end of expression")
+        # Unary plus/minus
+        if token in ('+', '-'):
+            op = self._consume()
+            value = self._parse_factor()
+            return value if op == '+' else -value
+        # Parenthesized expression
+        if token == '(':
+            self._consume()  # eat '('
+            value = self.parse_expression()
+            if self._peek() != ')':
+                raise ExpressionError("Unmatched opening parenthesis")
+            self._consume()  # eat ')'
+            return value
+        if token == ')':
+            raise ExpressionError("Unmatched closing parenthesis")
+        # Number. The conversion is guarded because int() can reject a digit
+        # token (very long strings, non-decimal digit characters) and float()
+        # overflows for integers beyond float range. Both are reported as
+        # ExpressionError so callers see a single failure type.
+        if token.isdigit():
+            self._consume()
+            try:
+                num = int(token)
+                as_float = float(num)
+            except (ValueError, OverflowError):
+                raise ExpressionError(
+                    f"Invalid or out-of-range number literal: {token[:32]!r}"
+                ) from None
+            self.numbers_used.append(num)
+            return as_float
+        raise ExpressionError(f"Unexpected token: {token!r}")
+def _validate_numbers(
+    numbers_used: List[int], available_numbers: List[int]
+) -> None:
+    """Check that each used number matches a distinct available number.
+    A working copy of `available_numbers` is consumed one entry per used
+    number, so repeats are allowed only as often as they are available.
+    Raises:
+        ExpressionError: When a used number has no remaining match.
+    """
+    pool = list(available_numbers)  # the caller's list is left untouched
+    for num in numbers_used:
+        try:
+            slot = pool.index(num)
+        except ValueError:
+            slot = -1
+        if slot < 0:
+            raise ExpressionError(
+                f"Number {num} is not available or has been used too many times. "
+                f"Available: {available_numbers}"
+            )
+        del pool[slot]

textpolicy/tasks/countdown/prompt.py ADDED Viewed

@@ -0,0 +1,89 @@
+"""
+Prompt formatting and expression extraction for the Countdown task.
+"""
+import re
+from typing import List
+def format_countdown_prompt(target: int, numbers: List[int]) -> str:
+    """
+    Build the instruction text for one Countdown problem.
+    Args:
+        target: The number the expression must equal.
+        numbers: The numbers the solver may draw from.
+    Returns:
+        A single-paragraph prompt stating the numbers, the target, and the
+        rules (each number at most once; +, -, *, / and parentheses only).
+    """
+    task_part = (
+        f"Using the numbers {numbers}, create an arithmetic expression "
+        f"that equals {target}. You may use each number at most once. "
+    )
+    rules_part = (
+        f"Use only +, -, *, / and parentheses. "
+        f"Provide your expression on its own line."
+    )
+    return task_part + rules_part
+# Pattern matching pure arithmetic expressions (digits, operators, parens, spaces).
+# Digits are spelled 0-9 rather than \d: in a str pattern \d also matches
+# non-ASCII decimal digits, which the expression evaluator never accepts, so
+# such text could only ever be extracted to be rejected.
+_EXPR_PATTERN = re.compile(r'^[0-9\s+\-*/()]+$')
+# Pattern for lines with delimiters like "= ...", ": ...", "answer ..."
+# Group 1 is the arithmetic-looking text that follows the delimiter.
+_DELIMITER_PATTERN = re.compile(
+    r'(?:=|:|answer\s*(?:is|:)?)\s*([0-9\s+\-*/()]+)',
+    re.IGNORECASE,
+)
+# Find arithmetic-like substrings (callers pick the longest valid one)
+_ARITH_SUBSTRING = re.compile(r'[0-9\s+\-*/()]+')
+def extract_expression_from_completion(completion: str) -> str:
+    """
+    Extract an arithmetic expression from model output.
+    Strategies are tried in order and the first hit wins:
+    1. The first line that is a pure arithmetic expression
+    2. The first =, :, or 'answer' delimiter whose following text contains
+       a digit and an operator (every delimiter is inspected, not only the
+       first one in the text)
+    3. Longest arithmetic-like substring with a digit and an operator
+    If nothing matches, the stripped completion is returned as-is.
+    Args:
+        completion: The raw model output.
+    Returns:
+        The extracted expression string (may still be invalid), or "" for
+        empty or whitespace-only input.
+    """
+    if not completion or not completion.strip():
+        return ""
+    text = completion.strip()
+    # Strategy 1: Find lines that are pure arithmetic expressions
+    for line in text.splitlines():
+        line = line.strip()
+        if line and _EXPR_PATTERN.match(line) and _has_digit_and_operator(line):
+            return line
+    # Strategy 2: Look for delimiters. Checking only the first match let an
+    # early delimiter followed by a bare number (e.g. "Target: 24") hide a
+    # later "Answer: 8*3", so walk all matches until one is usable.
+    for match in _DELIMITER_PATTERN.finditer(text):
+        candidate = match.group(1).strip()
+        if candidate and _has_digit_and_operator(candidate):
+            return candidate
+    # Strategy 3: Longest arithmetic-like substring containing at least
+    # one digit and one operator
+    candidates = _ARITH_SUBSTRING.findall(text)
+    valid = [c.strip() for c in candidates if _has_digit_and_operator(c.strip())]
+    if valid:
+        return max(valid, key=len)
+    # Last resort: return the whole text stripped
+    return text
+def _has_digit_and_operator(s: str) -> bool:
+    """Return True when `s` contains a digit and one of + - * /."""
+    seen_digit = False
+    seen_op = False
+    for ch in s:
+        if ch.isdigit():
+            seen_digit = True
+        elif ch in '+-*/':
+            seen_op = True
+        if seen_digit and seen_op:
+            return True
+    return False

textpolicy/tasks/countdown/reward.py ADDED Viewed

@@ -0,0 +1,56 @@
+"""
+Countdown reward function for GRPO training.
+"""
+import logging
+from typing import Any, Dict
+from textpolicy.rewards.registry import reward
+from .evaluator import ExpressionError, evaluate_expression
+from .prompt import extract_expression_from_completion
+logger = logging.getLogger(__name__)
+@reward(name="countdown")
+def countdown_reward(
+    prompt: str,
+    completion: str,
+    example: Dict[str, Any],
+    **kwargs,
+) -> float:
+    """
+    Reward function for the Countdown Numbers Game.
+    Scoring:
+        1.0  — expression equals target with valid numbers
+        0.0  — evaluates but wrong answer, or malformed example
+               (missing or non-numeric 'target', missing 'numbers')
+       -0.5  — syntax error, empty, unparseable, number reuse, or invalid numbers
+    The example dict must contain 'target' (int) and 'numbers' (list of int).
+    `prompt` and any extra keyword arguments are accepted but not used.
+    """
+    # Extract task parameters
+    target = example.get("target")
+    numbers = example.get("numbers")
+    if target is None or numbers is None:
+        logger.warning("Malformed example: missing 'target' or 'numbers'")
+        return 0.0
+    # A target that is not a number would otherwise raise TypeError at the
+    # final comparison; score it as a malformed example instead.
+    try:
+        target_value = float(target)
+    except (TypeError, ValueError, OverflowError):
+        logger.warning("Malformed example: non-numeric 'target' %r", target)
+        return 0.0
+    # Extract expression from completion
+    expression = extract_expression_from_completion(completion)
+    if not expression:
+        return -0.5
+    # Evaluate
+    try:
+        result = evaluate_expression(expression, available_numbers=numbers)
+    except ExpressionError as e:
+        # Lazy %-formatting: the message is built only if debug is enabled.
+        logger.debug("Expression error: %s", e)
+        return -0.5
+    # Check if result matches target (use tolerance for float comparison)
+    if abs(result.value - target_value) < 1e-9:
+        return 1.0
+    return 0.0

textpolicy 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl

textpolicy 0.1.2__py3-none-any.whl → 0.1.3__py3-none-any.whl