bead 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231)
  1. bead/__init__.py +11 -0
  2. bead/__main__.py +11 -0
  3. bead/active_learning/__init__.py +15 -0
  4. bead/active_learning/config.py +231 -0
  5. bead/active_learning/loop.py +566 -0
  6. bead/active_learning/models/__init__.py +24 -0
  7. bead/active_learning/models/base.py +852 -0
  8. bead/active_learning/models/binary.py +910 -0
  9. bead/active_learning/models/categorical.py +943 -0
  10. bead/active_learning/models/cloze.py +862 -0
  11. bead/active_learning/models/forced_choice.py +956 -0
  12. bead/active_learning/models/free_text.py +773 -0
  13. bead/active_learning/models/lora.py +365 -0
  14. bead/active_learning/models/magnitude.py +835 -0
  15. bead/active_learning/models/multi_select.py +795 -0
  16. bead/active_learning/models/ordinal_scale.py +811 -0
  17. bead/active_learning/models/peft_adapter.py +155 -0
  18. bead/active_learning/models/random_effects.py +639 -0
  19. bead/active_learning/selection.py +354 -0
  20. bead/active_learning/strategies.py +391 -0
  21. bead/active_learning/trainers/__init__.py +26 -0
  22. bead/active_learning/trainers/base.py +210 -0
  23. bead/active_learning/trainers/data_collator.py +172 -0
  24. bead/active_learning/trainers/dataset_utils.py +261 -0
  25. bead/active_learning/trainers/huggingface.py +304 -0
  26. bead/active_learning/trainers/lightning.py +324 -0
  27. bead/active_learning/trainers/metrics.py +424 -0
  28. bead/active_learning/trainers/mixed_effects.py +551 -0
  29. bead/active_learning/trainers/model_wrapper.py +509 -0
  30. bead/active_learning/trainers/registry.py +104 -0
  31. bead/adapters/__init__.py +11 -0
  32. bead/adapters/huggingface.py +61 -0
  33. bead/behavioral/__init__.py +116 -0
  34. bead/behavioral/analytics.py +646 -0
  35. bead/behavioral/extraction.py +343 -0
  36. bead/behavioral/merging.py +343 -0
  37. bead/cli/__init__.py +11 -0
  38. bead/cli/active_learning.py +513 -0
  39. bead/cli/active_learning_commands.py +779 -0
  40. bead/cli/completion.py +359 -0
  41. bead/cli/config.py +624 -0
  42. bead/cli/constraint_builders.py +286 -0
  43. bead/cli/deployment.py +859 -0
  44. bead/cli/deployment_trials.py +493 -0
  45. bead/cli/deployment_ui.py +332 -0
  46. bead/cli/display.py +378 -0
  47. bead/cli/items.py +960 -0
  48. bead/cli/items_factories.py +776 -0
  49. bead/cli/list_constraints.py +714 -0
  50. bead/cli/lists.py +490 -0
  51. bead/cli/main.py +430 -0
  52. bead/cli/models.py +877 -0
  53. bead/cli/resource_loaders.py +621 -0
  54. bead/cli/resources.py +1036 -0
  55. bead/cli/shell.py +356 -0
  56. bead/cli/simulate.py +840 -0
  57. bead/cli/templates.py +1158 -0
  58. bead/cli/training.py +1080 -0
  59. bead/cli/utils.py +614 -0
  60. bead/cli/workflow.py +1273 -0
  61. bead/config/__init__.py +68 -0
  62. bead/config/active_learning.py +1009 -0
  63. bead/config/config.py +192 -0
  64. bead/config/defaults.py +118 -0
  65. bead/config/deployment.py +217 -0
  66. bead/config/env.py +147 -0
  67. bead/config/item.py +45 -0
  68. bead/config/list.py +193 -0
  69. bead/config/loader.py +149 -0
  70. bead/config/logging.py +42 -0
  71. bead/config/model.py +49 -0
  72. bead/config/paths.py +46 -0
  73. bead/config/profiles.py +320 -0
  74. bead/config/resources.py +47 -0
  75. bead/config/serialization.py +210 -0
  76. bead/config/simulation.py +206 -0
  77. bead/config/template.py +238 -0
  78. bead/config/validation.py +267 -0
  79. bead/data/__init__.py +65 -0
  80. bead/data/base.py +87 -0
  81. bead/data/identifiers.py +97 -0
  82. bead/data/language_codes.py +61 -0
  83. bead/data/metadata.py +270 -0
  84. bead/data/range.py +123 -0
  85. bead/data/repository.py +358 -0
  86. bead/data/serialization.py +249 -0
  87. bead/data/timestamps.py +89 -0
  88. bead/data/validation.py +349 -0
  89. bead/data_collection/__init__.py +11 -0
  90. bead/data_collection/jatos.py +223 -0
  91. bead/data_collection/merger.py +154 -0
  92. bead/data_collection/prolific.py +198 -0
  93. bead/deployment/__init__.py +5 -0
  94. bead/deployment/distribution.py +402 -0
  95. bead/deployment/jatos/__init__.py +1 -0
  96. bead/deployment/jatos/api.py +200 -0
  97. bead/deployment/jatos/exporter.py +210 -0
  98. bead/deployment/jspsych/__init__.py +9 -0
  99. bead/deployment/jspsych/biome.json +44 -0
  100. bead/deployment/jspsych/config.py +411 -0
  101. bead/deployment/jspsych/generator.py +598 -0
  102. bead/deployment/jspsych/package.json +51 -0
  103. bead/deployment/jspsych/pnpm-lock.yaml +2141 -0
  104. bead/deployment/jspsych/randomizer.py +299 -0
  105. bead/deployment/jspsych/src/lib/list-distributor.test.ts +327 -0
  106. bead/deployment/jspsych/src/lib/list-distributor.ts +1282 -0
  107. bead/deployment/jspsych/src/lib/randomizer.test.ts +232 -0
  108. bead/deployment/jspsych/src/lib/randomizer.ts +367 -0
  109. bead/deployment/jspsych/src/plugins/cloze-dropdown.ts +252 -0
  110. bead/deployment/jspsych/src/plugins/forced-choice.ts +265 -0
  111. bead/deployment/jspsych/src/plugins/plugins.test.ts +141 -0
  112. bead/deployment/jspsych/src/plugins/rating.ts +248 -0
  113. bead/deployment/jspsych/src/slopit/index.ts +9 -0
  114. bead/deployment/jspsych/src/types/jatos.d.ts +256 -0
  115. bead/deployment/jspsych/src/types/jspsych.d.ts +228 -0
  116. bead/deployment/jspsych/templates/experiment.css +1 -0
  117. bead/deployment/jspsych/templates/experiment.js.template +289 -0
  118. bead/deployment/jspsych/templates/index.html +51 -0
  119. bead/deployment/jspsych/templates/randomizer.js +241 -0
  120. bead/deployment/jspsych/templates/randomizer.js.template +313 -0
  121. bead/deployment/jspsych/trials.py +723 -0
  122. bead/deployment/jspsych/tsconfig.json +23 -0
  123. bead/deployment/jspsych/tsup.config.ts +30 -0
  124. bead/deployment/jspsych/ui/__init__.py +1 -0
  125. bead/deployment/jspsych/ui/components.py +383 -0
  126. bead/deployment/jspsych/ui/styles.py +411 -0
  127. bead/dsl/__init__.py +80 -0
  128. bead/dsl/ast.py +168 -0
  129. bead/dsl/context.py +178 -0
  130. bead/dsl/errors.py +71 -0
  131. bead/dsl/evaluator.py +570 -0
  132. bead/dsl/grammar.lark +81 -0
  133. bead/dsl/parser.py +231 -0
  134. bead/dsl/stdlib.py +929 -0
  135. bead/evaluation/__init__.py +13 -0
  136. bead/evaluation/convergence.py +485 -0
  137. bead/evaluation/interannotator.py +398 -0
  138. bead/items/__init__.py +40 -0
  139. bead/items/adapters/__init__.py +70 -0
  140. bead/items/adapters/anthropic.py +224 -0
  141. bead/items/adapters/api_utils.py +167 -0
  142. bead/items/adapters/base.py +216 -0
  143. bead/items/adapters/google.py +259 -0
  144. bead/items/adapters/huggingface.py +1074 -0
  145. bead/items/adapters/openai.py +323 -0
  146. bead/items/adapters/registry.py +202 -0
  147. bead/items/adapters/sentence_transformers.py +224 -0
  148. bead/items/adapters/togetherai.py +309 -0
  149. bead/items/binary.py +515 -0
  150. bead/items/cache.py +558 -0
  151. bead/items/categorical.py +593 -0
  152. bead/items/cloze.py +757 -0
  153. bead/items/constructor.py +784 -0
  154. bead/items/forced_choice.py +413 -0
  155. bead/items/free_text.py +681 -0
  156. bead/items/generation.py +432 -0
  157. bead/items/item.py +396 -0
  158. bead/items/item_template.py +787 -0
  159. bead/items/magnitude.py +573 -0
  160. bead/items/multi_select.py +621 -0
  161. bead/items/ordinal_scale.py +569 -0
  162. bead/items/scoring.py +448 -0
  163. bead/items/validation.py +723 -0
  164. bead/lists/__init__.py +30 -0
  165. bead/lists/balancer.py +263 -0
  166. bead/lists/constraints.py +1067 -0
  167. bead/lists/experiment_list.py +286 -0
  168. bead/lists/list_collection.py +378 -0
  169. bead/lists/partitioner.py +1141 -0
  170. bead/lists/stratification.py +254 -0
  171. bead/participants/__init__.py +73 -0
  172. bead/participants/collection.py +699 -0
  173. bead/participants/merging.py +312 -0
  174. bead/participants/metadata_spec.py +491 -0
  175. bead/participants/models.py +276 -0
  176. bead/resources/__init__.py +29 -0
  177. bead/resources/adapters/__init__.py +19 -0
  178. bead/resources/adapters/base.py +104 -0
  179. bead/resources/adapters/cache.py +128 -0
  180. bead/resources/adapters/glazing.py +508 -0
  181. bead/resources/adapters/registry.py +117 -0
  182. bead/resources/adapters/unimorph.py +796 -0
  183. bead/resources/classification.py +856 -0
  184. bead/resources/constraint_builders.py +329 -0
  185. bead/resources/constraints.py +165 -0
  186. bead/resources/lexical_item.py +223 -0
  187. bead/resources/lexicon.py +744 -0
  188. bead/resources/loaders.py +209 -0
  189. bead/resources/template.py +441 -0
  190. bead/resources/template_collection.py +707 -0
  191. bead/resources/template_generation.py +349 -0
  192. bead/simulation/__init__.py +29 -0
  193. bead/simulation/annotators/__init__.py +15 -0
  194. bead/simulation/annotators/base.py +175 -0
  195. bead/simulation/annotators/distance_based.py +135 -0
  196. bead/simulation/annotators/lm_based.py +114 -0
  197. bead/simulation/annotators/oracle.py +182 -0
  198. bead/simulation/annotators/random.py +181 -0
  199. bead/simulation/dsl_extension/__init__.py +3 -0
  200. bead/simulation/noise_models/__init__.py +13 -0
  201. bead/simulation/noise_models/base.py +42 -0
  202. bead/simulation/noise_models/random_noise.py +82 -0
  203. bead/simulation/noise_models/systematic.py +132 -0
  204. bead/simulation/noise_models/temperature.py +86 -0
  205. bead/simulation/runner.py +144 -0
  206. bead/simulation/strategies/__init__.py +23 -0
  207. bead/simulation/strategies/base.py +123 -0
  208. bead/simulation/strategies/binary.py +103 -0
  209. bead/simulation/strategies/categorical.py +123 -0
  210. bead/simulation/strategies/cloze.py +224 -0
  211. bead/simulation/strategies/forced_choice.py +127 -0
  212. bead/simulation/strategies/free_text.py +105 -0
  213. bead/simulation/strategies/magnitude.py +116 -0
  214. bead/simulation/strategies/multi_select.py +129 -0
  215. bead/simulation/strategies/ordinal_scale.py +131 -0
  216. bead/templates/__init__.py +27 -0
  217. bead/templates/adapters/__init__.py +17 -0
  218. bead/templates/adapters/base.py +128 -0
  219. bead/templates/adapters/cache.py +178 -0
  220. bead/templates/adapters/huggingface.py +312 -0
  221. bead/templates/combinatorics.py +103 -0
  222. bead/templates/filler.py +605 -0
  223. bead/templates/renderers.py +177 -0
  224. bead/templates/resolver.py +178 -0
  225. bead/templates/strategies.py +1806 -0
  226. bead/templates/streaming.py +195 -0
  227. bead-0.1.0.dist-info/METADATA +212 -0
  228. bead-0.1.0.dist-info/RECORD +231 -0
  229. bead-0.1.0.dist-info/WHEEL +4 -0
  230. bead-0.1.0.dist-info/entry_points.txt +2 -0
  231. bead-0.1.0.dist-info/licenses/LICENSE +21 -0
bead/active_learning/trainers/metrics.py
@@ -0,0 +1,424 @@
+ """Metrics computation using HuggingFace evaluate library.
+
+ This module provides metric computation functions for use with HuggingFace Trainer.
+ It uses the evaluate library for standardized, well-tested metrics.
+ """
+
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING
+
+ import evaluate
+ import numpy as np
+
+ if TYPE_CHECKING:
+     from transformers import EvalPrediction, PreTrainedTokenizerBase
+
+
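The functions below are designed to be handed to a HuggingFace Trainer as its compute_metrics callback. A minimal, hedged wiring sketch follows; the names build_trainer, model, train_ds, and eval_ds are illustrative placeholders, not objects defined in this package, and the import path is assumed from the wheel's file layout.

from transformers import Trainer, TrainingArguments

from bead.active_learning.trainers.metrics import compute_binary_metrics

def build_trainer(model, train_ds, eval_ds):
    """Hypothetical helper: plug a metrics function into a Trainer."""
    return Trainer(
        model=model,                               # assumed: a binary classification model
        args=TrainingArguments(output_dir="out"),  # minimal training arguments
        train_dataset=train_ds,                    # assumed: tokenized datasets
        eval_dataset=eval_ds,
        compute_metrics=compute_binary_metrics,    # called with an EvalPrediction per eval pass
    )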
+ def compute_binary_metrics(eval_pred: EvalPrediction) -> dict[str, float]:
+     """Compute metrics for binary classification tasks.
+
+     Uses HuggingFace evaluate library for accuracy, precision, recall, and F1.
+
+     Parameters
+     ----------
+     eval_pred : EvalPrediction
+         EvalPrediction object with predictions and label_ids attributes.
+         predictions: array of shape (n_samples,) with logits
+         label_ids: array of shape (n_samples,) with true labels (0 or 1)
+
+     Returns
+     -------
+     dict[str, float]
+         Dictionary with accuracy, precision, recall, and f1 metrics.
+
+     Examples
+     --------
+     >>> from transformers import EvalPrediction
+     >>> import numpy as np
+     >>> predictions = np.array([0.8, 0.3, 0.9, 0.2])  # Logits
+     >>> labels = np.array([1.0, 0.0, 1.0, 0.0])
+     >>> eval_pred = EvalPrediction(predictions=predictions, label_ids=labels)
+     >>> metrics = compute_binary_metrics(eval_pred)
+     >>> "accuracy" in metrics
+     True
+     """
+     # Load metrics from evaluate library
+     accuracy_metric = evaluate.load("accuracy")
+     precision_metric = evaluate.load("precision")
+     recall_metric = evaluate.load("recall")
+     f1_metric = evaluate.load("f1")
+
+     # Extract predictions and labels
+     predictions = eval_pred.predictions
+     labels = eval_pred.label_ids
+
+     # Convert logits to predictions (binary: apply sigmoid and threshold)
+     if predictions.ndim == 1:
+         # Single logit per sample
+         preds = (1 / (1 + np.exp(-predictions)) > 0.5).astype(int)
+     else:
+         # Multiple logits (shouldn't happen for binary, but handle it)
+         preds = np.argmax(predictions, axis=-1)
+
+     # Ensure labels are integers
+     labels = labels.astype(int)
+
+     # Compute metrics
+     accuracy = accuracy_metric.compute(predictions=preds, references=labels)["accuracy"]
+     precision = precision_metric.compute(
+         predictions=preds, references=labels, average="binary", zero_division=0
+     )["precision"]
+     recall = recall_metric.compute(
+         predictions=preds, references=labels, average="binary", zero_division=0
+     )["recall"]
+     f1 = f1_metric.compute(
+         predictions=preds, references=labels, average="binary", zero_division=0
+     )["f1"]
+
+     return {
+         "accuracy": accuracy,
+         "precision": precision,
+         "recall": recall,
+         "f1": f1,
+     }
+
+
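A side note on the logit-to-label step above: since the sigmoid is monotonic, sigmoid(x) > 0.5 holds exactly when x > 0, so thresholding the raw logits at zero yields the same hard predictions. A small illustrative check (not part of the package):

import numpy as np

logits = np.array([0.8, -0.3, 0.9, -0.2])
via_sigmoid = (1 / (1 + np.exp(-logits)) > 0.5).astype(int)  # as in compute_binary_metrics
via_sign = (logits > 0).astype(int)                          # equivalent zero-threshold shortcut
assert (via_sigmoid == via_sign).all()                       # both give [1, 0, 1, 0]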
+ def compute_regression_metrics(eval_pred: EvalPrediction) -> dict[str, float]:
+     """Compute metrics for regression tasks.
+
+     Uses HuggingFace evaluate library for MSE, MAE, and R².
+
+     Parameters
+     ----------
+     eval_pred : EvalPrediction
+         EvalPrediction object with predictions and label_ids attributes.
+         predictions: array of shape (n_samples, 1) with continuous values
+         label_ids: array of shape (n_samples,) with true continuous values
+
+     Returns
+     -------
+     dict[str, float]
+         Dictionary with mse, mae, and r2 metrics.
+
+     Examples
+     --------
+     >>> from transformers import EvalPrediction
+     >>> import numpy as np
+     >>> predictions = np.array([[250.5], [300.2], [275.0]])  # Continuous values
+     >>> labels = np.array([250.0, 300.0, 275.0])
+     >>> eval_pred = EvalPrediction(predictions=predictions, label_ids=labels)
+     >>> metrics = compute_regression_metrics(eval_pred)
+     >>> "mse" in metrics
+     True
+     """
+     # Load metrics from evaluate library
+     mse_metric = evaluate.load("mse")
+     mae_metric = evaluate.load("mae")
+
+     # Extract predictions and labels
+     predictions = eval_pred.predictions
+     labels = eval_pred.label_ids
+
+     # Handle predictions shape: (n_samples, 1) -> (n_samples,)
+     if predictions.ndim == 2 and predictions.shape[1] == 1:
+         predictions = predictions.squeeze(1)
+     elif predictions.ndim > 2:
+         # Flatten if needed
+         predictions = predictions.flatten()
+
+     # Ensure labels are 1D
+     if labels.ndim > 1:
+         labels = labels.flatten()
+
+     # Compute metrics
+     mse = mse_metric.compute(predictions=predictions, references=labels)["mse"]
+     mae = mae_metric.compute(predictions=predictions, references=labels)["mae"]
+
+     # Compute R² manually (evaluate library doesn't have r2)
+     ss_res = np.sum((labels - predictions) ** 2)
+     ss_tot = np.sum((labels - np.mean(labels)) ** 2)
+     r2 = 1 - (ss_res / ss_tot) if ss_tot > 0 else 0.0
+
+     return {
+         "mse": mse,
+         "mae": mae,
+         "r2": r2,
+     }
+
+
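The R² term above is computed by hand because, as the inline comment notes, the evaluate library does not provide it. A quick numeric check of the formula r2 = 1 - ss_res / ss_tot, using illustrative values rather than package data:

import numpy as np

labels = np.array([1.0, 2.0, 3.0])
preds = np.array([1.1, 1.9, 3.2])
ss_res = np.sum((labels - preds) ** 2)           # 0.01 + 0.01 + 0.04 = 0.06
ss_tot = np.sum((labels - labels.mean()) ** 2)   # 1.0 + 0.0 + 1.0 = 2.0
r2 = 1 - ss_res / ss_tot                         # 1 - 0.03 = 0.97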
+ def compute_multiclass_metrics(
+     eval_pred: EvalPrediction, num_labels: int
+ ) -> dict[str, float]:
+     """Compute metrics for multi-class classification tasks.
+
+     Uses HuggingFace evaluate library for accuracy, precision, recall, and F1.
+
+     Parameters
+     ----------
+     eval_pred : EvalPrediction
+         EvalPrediction object with predictions and label_ids attributes.
+         predictions: array of shape (n_samples, n_classes) with logits
+         label_ids: array of shape (n_samples,) with true labels
+     num_labels : int
+         Number of classes.
+
+     Returns
+     -------
+     dict[str, float]
+         Dictionary with accuracy, precision, recall, and f1 metrics.
+
+     Examples
+     --------
+     >>> from transformers import EvalPrediction
+     >>> import numpy as np
+     >>> predictions = np.array([[0.1, 0.8, 0.1], [0.7, 0.2, 0.1]])  # Logits
+     >>> labels = np.array([1, 0])
+     >>> eval_pred = EvalPrediction(predictions=predictions, label_ids=labels)
+     >>> metrics = compute_multiclass_metrics(eval_pred, num_labels=3)
+     >>> "accuracy" in metrics
+     True
+     """
+     # Load metrics
+     accuracy_metric = evaluate.load("accuracy")
+     precision_metric = evaluate.load("precision")
+     recall_metric = evaluate.load("recall")
+     f1_metric = evaluate.load("f1")
+
+     # Extract predictions and labels
+     predictions = eval_pred.predictions
+     labels = eval_pred.label_ids
+
+     # Convert logits to predictions
+     if predictions.ndim == 1:
+         # Single logit per sample (shouldn't happen for multi-class)
+         preds = predictions.astype(int)
+     else:
+         # Multiple logits: take argmax
+         preds = np.argmax(predictions, axis=-1)
+
+     # Ensure labels are integers
+     labels = labels.astype(int)
+
+     # Compute metrics with macro averaging
+     accuracy = accuracy_metric.compute(predictions=preds, references=labels)["accuracy"]
+     precision = precision_metric.compute(
+         predictions=preds,
+         references=labels,
+         average="macro",
+         zero_division=0,
+     )["precision"]
+     recall = recall_metric.compute(
+         predictions=preds,
+         references=labels,
+         average="macro",
+         zero_division=0,
+     )["recall"]
+     f1 = f1_metric.compute(
+         predictions=preds,
+         references=labels,
+         average="macro",
+         zero_division=0,
+     )["f1"]
+
+     return {
+         "accuracy": accuracy,
+         "precision": precision,
+         "recall": recall,
+         "f1": f1,
+     }
+
+
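Unlike the other functions in this module, compute_multiclass_metrics takes an extra num_labels argument, while Trainer invokes compute_metrics with a single EvalPrediction. A hedged usage sketch: bind the extra argument first, for example with functools.partial (an assumed calling convention, not prescribed by the package; the import path is taken from the wheel's file layout):

from functools import partial

from bead.active_learning.trainers.metrics import compute_multiclass_metrics

# Fix num_labels up front so the result matches Trainer's expected
# one-argument compute_metrics signature.
compute_metrics = partial(compute_multiclass_metrics, num_labels=3)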
+ def compute_cloze_metrics(
+     eval_pred: EvalPrediction, tokenizer: PreTrainedTokenizerBase
+ ) -> dict[str, float]:
+     """Compute metrics for cloze (MLM) tasks.
+
+     Computes token-level metrics at masked positions:
+     - accuracy: Whether predicted token exactly matches target
+     - top_3_accuracy: Whether target is in top 3 predictions
+     - top_5_accuracy: Whether target is in top 5 predictions
+     - perplexity: Exponentiated average cross-entropy at masked positions
+
+     Parameters
+     ----------
+     eval_pred : EvalPrediction
+         EvalPrediction object with:
+         - predictions: array of shape (n_samples, seq_len, vocab_size) with logits
+         - label_ids: array of shape (n_samples, seq_len) with target_token_ids at
+           masked positions, -100 elsewhere (HuggingFace ignore index)
+     tokenizer : PreTrainedTokenizerBase
+         HuggingFace tokenizer. Used for type checking and potential future extensions.
+
+     Returns
+     -------
+     dict[str, float]
+         Dictionary with accuracy, top_3_accuracy, top_5_accuracy, and perplexity.
+
+     Notes
+     -----
+     This function expects labels encoded in HuggingFace's MLM convention:
+     - Target token IDs at positions to evaluate
+     - -100 (ignore index) at all other positions
+
+     The ClozeMLMTrainer's prediction_step() creates this encoding from
+     masked_positions and target_token_ids in the dataset.
+
+     Examples
+     --------
+     >>> from transformers import EvalPrediction, AutoTokenizer
+     >>> import numpy as np
+     >>> tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
+     >>> # Simulate: 2 samples, 5 positions, 100 vocab (simplified)
+     >>> predictions = np.zeros((2, 5, 100))
+     >>> predictions[0, 2, 42] = 10.0  # High logit for token 42 at pos 2
+     >>> predictions[1, 1, 17] = 10.0  # High logit for token 17 at pos 1
+     >>> labels = np.full((2, 5), -100)
+     >>> labels[0, 2] = 42  # Target at pos 2
+     >>> labels[1, 1] = 17  # Target at pos 1
+     >>> eval_pred = EvalPrediction(predictions=predictions, label_ids=labels)
+     >>> metrics = compute_cloze_metrics(eval_pred, tokenizer)
+     >>> metrics["accuracy"]
+     1.0
+     """
+     predictions = eval_pred.predictions
+     labels = eval_pred.label_ids
+
+     # Handle empty or invalid inputs
+     if predictions is None or predictions.size == 0:
+         return {
+             "accuracy": 0.0,
+             "top_3_accuracy": 0.0,
+             "top_5_accuracy": 0.0,
+             "perplexity": float("inf"),
+         }
+
+     if labels is None:
+         return {
+             "accuracy": 0.0,
+             "top_3_accuracy": 0.0,
+             "top_5_accuracy": 0.0,
+             "perplexity": float("inf"),
+         }
+
+     # Validate shapes
+     if predictions.ndim != 3:
+         # Unexpected shape, return defaults
+         return {
+             "accuracy": 0.0,
+             "top_3_accuracy": 0.0,
+             "top_5_accuracy": 0.0,
+             "perplexity": float("inf"),
+         }
+
+     if labels.ndim != 2:
+         return {
+             "accuracy": 0.0,
+             "top_3_accuracy": 0.0,
+             "top_5_accuracy": 0.0,
+             "perplexity": float("inf"),
+         }
+
+     # Check shape compatibility
+     if predictions.shape[:2] != labels.shape:
+         return {
+             "accuracy": 0.0,
+             "top_3_accuracy": 0.0,
+             "top_5_accuracy": 0.0,
+             "perplexity": float("inf"),
+         }
+
+     # Find masked positions (where label != -100)
+     mask = labels != -100
+
+     # Handle case with no masked positions
+     if not mask.any():
+         return {
+             "accuracy": 0.0,
+             "top_3_accuracy": 0.0,
+             "top_5_accuracy": 0.0,
+             "perplexity": float("inf"),
+         }
+
+     n_total = int(mask.sum())
+
+     # Compute top-1 accuracy
+     pred_tokens = np.argmax(predictions, axis=-1)  # (n_samples, seq_len)
+     correct = (pred_tokens == labels) & mask
+     n_correct = int(correct.sum())
+     accuracy = float(n_correct) / float(n_total)
+
+     # Compute top-k accuracy using argpartition (efficient for large vocab)
+     def compute_topk_accuracy(k: int) -> float:
+         """Compute top-k accuracy at masked positions."""
+         vocab_size = predictions.shape[2]
+         if k >= vocab_size:
+             # All tokens are in top-k
+             return 1.0
+
+         # Get top-k indices: shape (n_samples, seq_len, k)
+         topk_indices = np.argpartition(predictions, -k, axis=-1)[..., -k:]
+
+         # Expand labels for comparison: (n_samples, seq_len, 1)
+         labels_expanded = labels[..., np.newaxis]
+
+         # Check if label is in top-k for each position
+         in_topk = (topk_indices == labels_expanded).any(axis=-1)
+
+         # Apply mask and compute accuracy
+         correct_topk = in_topk & mask
+         n_correct_k = int(correct_topk.sum())
+         return float(n_correct_k) / float(n_total)
+
+     top_3_accuracy = compute_topk_accuracy(3)
+     top_5_accuracy = compute_topk_accuracy(5)
+
+     # Compute perplexity
+     # Perplexity = exp(average cross-entropy loss)
+     def compute_perplexity() -> float:
+         """Compute perplexity at masked positions."""
+         # Numerically stable softmax using log-sum-exp trick
+         max_logits = predictions.max(axis=-1, keepdims=True)
+         shifted = predictions - max_logits
+         exp_logits = np.exp(shifted)
+         sum_exp = exp_logits.sum(axis=-1, keepdims=True)
+         log_probs = shifted - np.log(sum_exp)  # log softmax
+
+         # Get log probabilities at label positions
+         n_samples, seq_len, _ = predictions.shape
+
+         # Create indices for gathering
+         batch_indices = np.arange(n_samples)[:, np.newaxis]
+         seq_indices = np.arange(seq_len)[np.newaxis, :]
+
+         # Handle -100 labels by replacing with 0 temporarily (they'll be masked out)
+         safe_labels = np.where(labels >= 0, labels, 0)
+
+         # Gather log probs: log_probs[i, j, labels[i, j]]
+         target_log_probs = log_probs[batch_indices, seq_indices, safe_labels]
+
+         # Cross-entropy is negative log prob
+         cross_entropy = -target_log_probs  # (n_samples, seq_len)
+
+         # Average over masked positions only
+         masked_ce = cross_entropy[mask]
+         if len(masked_ce) == 0:
+             return float("inf")
+
+         avg_ce = float(masked_ce.mean())
+
+         # Perplexity = exp(average cross-entropy)
+         # Clip to avoid overflow
+         if avg_ce > 100:
+             return float("inf")
+
+         return float(np.exp(avg_ce))
+
+     perplexity = compute_perplexity()
+
+     return {
+         "accuracy": accuracy,
+         "top_3_accuracy": top_3_accuracy,
+         "top_5_accuracy": top_5_accuracy,
+         "perplexity": perplexity,
+     }
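As a sanity check on the perplexity definition used in compute_cloze_metrics (the exponential of the mean cross-entropy at masked positions): with uniform logits over a vocabulary of size V, the cross-entropy at every masked position is log(V), so the perplexity equals V. A small illustrative check with assumed inputs, not package code:

import numpy as np

V = 100
logits = np.zeros((1, 4, V))                 # uniform logits over the vocabulary
labels = np.full((1, 4), -100)
labels[0, 2] = 7                             # one masked position with target token 7

# log-softmax of uniform logits is -log(V) everywhere
log_probs = logits - np.log(np.exp(logits).sum(axis=-1, keepdims=True))
cross_entropy = -log_probs[0, 2, labels[0, 2]]   # = log(V)
assert np.isclose(np.exp(cross_entropy), V)      # perplexity equals the vocab size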