PyPI - wisent - Versions diffs - 0.7.701__py3-none-any.whl → 0.7.1045__py3-none-any.whl - Mend

wisent 0.7.701__py3-none-any.whl → 0.7.1045__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (391) hide show

wisent/core/cli/preview_pairs.py ADDED Viewed

@@ -0,0 +1,203 @@
+"""Preview contrastive pairs from benchmarks with different extraction strategies."""
+import sys
+import json
+import argparse
+from typing import Optional
+def execute_preview_pairs(args):
+    """Preview contrastive pairs from a benchmark with different strategies applied."""
+    from wisent.core.contrastive_pairs.lm_eval_pairs.lm_task_pairs_generation import (
+        lm_build_contrastive_pairs,
+    )
+    from wisent.core.contrastive_pairs.huggingface_pairs.hf_extractor_manifest import HF_EXTRACTORS
+    from wisent.core.activations.extraction_strategy import (
+        ExtractionStrategy,
+        build_extraction_texts,
+        get_strategy_for_model,
+    )
+    task_name = args.task_name
+    limit = args.limit or 5
+    strategies = args.strategies or ['chat_last', 'mc_balanced', 'completion_last']
+    print(f"\n{'='*80}")
+    print(f"Preview Contrastive Pairs: {task_name}")
+    print(f"{'='*80}")
+    # Load pairs
+    print(f"\nLoading {limit} pairs from '{task_name}'...")
+    try:
+        task_name_lower = task_name.lower()
+        is_hf_task = task_name_lower in {k.lower() for k in HF_EXTRACTORS.keys()}
+        if is_hf_task:
+            pairs = lm_build_contrastive_pairs(
+                task_name=task_name,
+                lm_eval_task=None,
+                limit=limit,
+            )
+        else:
+            from wisent.core.data_loaders.loaders.lm_loader import LMEvalDataLoader
+            loader = LMEvalDataLoader()
+            task_obj = loader.load_lm_eval_task(task_name)
+            if isinstance(task_obj, dict):
+                if len(task_obj) != 1:
+                    keys = ", ".join(sorted(task_obj.keys()))
+                    print(f"Task '{task_name}' has subtasks: {keys}")
+                    print("Please specify a subtask.")
+                    sys.exit(1)
+                (subname, task), = task_obj.items()
+                task_name = subname
+            else:
+                task = task_obj
+            pairs = lm_build_contrastive_pairs(
+                task_name=task_name,
+                lm_eval_task=task,
+                limit=limit,
+            )
+        print(f"Loaded {len(pairs)} pairs\n")
+    except Exception as e:
+        print(f"Error loading task: {e}")
+        sys.exit(1)
+    # Mock tokenizer for preview
+    class PreviewTokenizer:
+        def apply_chat_template(self, messages, tokenize=False, add_generation_prompt=False):
+            if len(messages) == 1:
+                return f"<|user|>\n{messages[0]['content']}\n<|assistant|>\n"
+            elif len(messages) == 2:
+                return f"<|user|>\n{messages[0]['content']}\n<|assistant|>\n{messages[1]['content']}<|end|>"
+            return str(messages)
+        def __call__(self, text, add_special_tokens=False):
+            return {"input_ids": text.split()}
+    tokenizer = PreviewTokenizer()
+    # Show pairs with strategies
+    for i, pair in enumerate(pairs):
+        print(f"\n{'='*80}")
+        print(f"PAIR {i+1}/{len(pairs)}")
+        print(f"{'='*80}")
+        print(f"\n--- RAW DATA (from extractor) ---")
+        print(f"Prompt: {pair.prompt[:300]}{'...' if len(pair.prompt) > 300 else ''}")
+        print(f"Correct: {pair.positive_response.model_response[:100]}{'...' if len(pair.positive_response.model_response) > 100 else ''}")
+        print(f"Incorrect: {pair.negative_response.model_response[:100]}{'...' if len(pair.negative_response.model_response) > 100 else ''}")
+        for strategy_name in strategies:
+            try:
+                strategy = ExtractionStrategy(strategy_name)
+            except ValueError:
+                print(f"\n--- {strategy_name.upper()} --- (invalid strategy)")
+                continue
+            print(f"\n--- {strategy_name.upper()} ---")
+            try:
+                # Build texts for positive response
+                if strategy in (ExtractionStrategy.MC_BALANCED, ExtractionStrategy.MC_COMPLETION):
+                    full_text, answer, prompt_only = build_extraction_texts(
+                        strategy,
+                        pair.prompt,
+                        pair.positive_response.model_response,
+                        tokenizer,
+                        other_response=pair.negative_response.model_response,
+                        is_positive=True,
+                        auto_convert_strategy=False,
+                    )
+                else:
+                    full_text, answer, prompt_only = build_extraction_texts(
+                        strategy,
+                        pair.prompt,
+                        pair.positive_response.model_response,
+                        tokenizer,
+                        auto_convert_strategy=False,
+                    )
+                print(f"Full text (positive):")
+                print(f"  {full_text[:400]}{'...' if len(full_text) > 400 else ''}")
+                print(f"Answer token: {answer}")
+            except Exception as e:
+                print(f"  Error: {e}")
+    # Summary
+    print(f"\n{'='*80}")
+    print("SUMMARY")
+    print(f"{'='*80}")
+    print(f"Task: {task_name}")
+    print(f"Pairs shown: {len(pairs)}")
+    print(f"Strategies: {', '.join(strategies)}")
+    print()
+    # Save to JSON if requested
+    if args.output:
+        output_data = {
+            "task_name": task_name,
+            "num_pairs": len(pairs),
+            "strategies": strategies,
+            "pairs": []
+        }
+        for pair in pairs:
+            pair_data = {
+                "raw": {
+                    "prompt": pair.prompt,
+                    "correct": pair.positive_response.model_response,
+                    "incorrect": pair.negative_response.model_response,
+                },
+                "formatted": {}
+            }
+            for strategy_name in strategies:
+                try:
+                    strategy = ExtractionStrategy(strategy_name)
+                    if strategy in (ExtractionStrategy.MC_BALANCED, ExtractionStrategy.MC_COMPLETION):
+                        full_text, answer, _ = build_extraction_texts(
+                            strategy, pair.prompt, pair.positive_response.model_response,
+                            tokenizer, other_response=pair.negative_response.model_response,
+                            is_positive=True, auto_convert_strategy=False,
+                        )
+                    else:
+                        full_text, answer, _ = build_extraction_texts(
+                            strategy, pair.prompt, pair.positive_response.model_response,
+                            tokenizer, auto_convert_strategy=False,
+                        )
+                    pair_data["formatted"][strategy_name] = {
+                        "full_text": full_text,
+                        "answer": answer,
+                    }
+                except Exception as e:
+                    pair_data["formatted"][strategy_name] = {"error": str(e)}
+            output_data["pairs"].append(pair_data)
+        with open(args.output, 'w') as f:
+            json.dump(output_data, f, indent=2)
+        print(f"Saved to: {args.output}")
+def main():
+    parser = argparse.ArgumentParser(description="Preview contrastive pairs with different strategies")
+    parser.add_argument("task_name", help="Task/benchmark name (e.g., boolq, mmlu, hellaswag)")
+    parser.add_argument("--limit", "-n", type=int, default=5, help="Number of pairs to show (default: 5)")
+    parser.add_argument("--strategies", "-s", nargs="+",
+                        default=["chat_last", "mc_balanced", "completion_last"],
+                        help="Strategies to preview")
+    parser.add_argument("--output", "-o", help="Save to JSON file")
+    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
+    args = parser.parse_args()
+    execute_preview_pairs(args)
+# Script entry point: run the CLI only when this module is executed directly.
+if __name__ == "__main__":
+    main()

wisent/core/cli/steering_method_trainer.py CHANGED Viewed

@@ -156,7 +156,7 @@ def collect_activations_for_pair_set(
     Returns:
         Updated ContrastivePairSet with activations attached
     """
-    collector = ActivationCollector(model=model, store_device="cpu")
+    collector = ActivationCollector(model=model)
     updated_pairs = []
     for pair in pair_set.pairs:
@@ -320,7 +320,7 @@ class UnifiedSteeringTrainer:
     @property
     def collector(self) -> ActivationCollector:
         if self._collector is None:
-            self._collector = ActivationCollector(model=self.model, store_device="cpu")
+            self._collector = ActivationCollector(model=self.model)
         return self._collector
     def train_for_layer(
@@ -595,7 +595,7 @@ def get_optimal_steering_plan(
     method_name = config["method"]
     # Collect activations for the optimal layer
-    collector = ActivationCollector(model=model, store_device="cpu")
+    collector = ActivationCollector(model=model)
     layer_str = str(layer)
     pos_acts = []

wisent/core/cli/tasks.py CHANGED Viewed

@@ -414,7 +414,7 @@ def execute_tasks(args):
     print(f"\n🧠 Extracting activations from layer {layer}...")
     # 5. Collect activations for all pairs
-    collector = ActivationCollector(model=model, store_device="cpu")
+    collector = ActivationCollector(model=model)
     # Get extraction strategy from args (already an ExtractionStrategy value string)
     extraction_strategy = ExtractionStrategy(getattr(args, 'extraction_strategy', 'chat_last'))
@@ -581,13 +581,6 @@ def execute_tasks(args):
         expected = pair.positive_response.model_response
         choices = [pair.negative_response.model_response, pair.positive_response.model_response]
-        # Extract test_code from pair metadata for coding tasks
-        test_code = None
-        starter_code = None
-        if hasattr(pair, 'metadata') and pair.metadata:
-            test_code = pair.metadata.get('test_code')
-            starter_code = pair.metadata.get('starter_code')
         # Generate response from unsteered model
         messages = [{"role": "user", "content": question}]
@@ -597,6 +590,7 @@ def execute_tasks(args):
         )[0]
         # Evaluate the response using Wisent evaluator
+        # Pass all pair metadata to evaluator - each evaluator uses what it needs
         eval_kwargs = {
             'response': response,
             'expected': expected,
@@ -605,16 +599,16 @@ def execute_tasks(args):
             'choices': choices,
             'task_name': task_name,
         }
-        # Add test_code for coding tasks (livecodebench, humaneval, mbpp, etc.)
-        if test_code:
-            eval_kwargs['test_code'] = test_code
-        if starter_code:
-            eval_kwargs['starter_code'] = starter_code
+        # Add all pair metadata to eval_kwargs (test_code, correct_answers, etc.)
+        if hasattr(pair, 'metadata') and pair.metadata:
+            for key, value in pair.metadata.items():
+                if value is not None and key not in eval_kwargs:
+                    eval_kwargs[key] = value
         eval_result = evaluator.evaluate(**eval_kwargs)
         # Get activation for this generation
         # Use ActivationCollector to collect activations from the generated text
-        gen_collector = ActivationCollector(model=model, store_device="cpu")
+        gen_collector = ActivationCollector(model=model)
         # Create a pair with the generated response
         from wisent.core.contrastive_pairs.core.response import PositiveResponse, NegativeResponse
         from wisent.core.contrastive_pairs.core.pair import ContrastivePair
@@ -631,56 +625,20 @@ def execute_tasks(args):
         # Collect activation - ActivationCollector will re-run the model with prompt+response
         # First, collect with full sequence to get token-by-token activations
         collected_full = gen_collector.collect(
-            temp_pair, strategy=aggregation_strategy,
-            return_full_sequence=True,
-            normalize_layers=False,
-            prompt_strategy=prompt_strategy
+            temp_pair, strategy=extraction_strategy,
         )
         # Access the collected activations
         import torch
         if collected_full.positive_response.layers_activations:
-            layer_activations_full = collected_full.positive_response.layers_activations
-            if layer_str in layer_activations_full:
-                activation_full_seq = layer_activations_full[layer_str]
-                if activation_full_seq is not None and isinstance(activation_full_seq, torch.Tensor):
-                    # activation_full_seq shape: (num_tokens, hidden_dim)
-                    # Apply aggregation manually to get single vector for classifier
-                    if aggregation_strategy.name == 'MEAN_POOLING':
-                        activation_agg = activation_full_seq.mean(dim=0)
-                    elif aggregation_strategy.name == 'LAST_TOKEN':
-                        activation_agg = activation_full_seq[-1]
-                    elif aggregation_strategy.name == 'FIRST_TOKEN':
-                        activation_agg = activation_full_seq[0]
-                    elif aggregation_strategy.name == 'MAX_POOLING':
-                        activation_agg = activation_full_seq.max(dim=0)[0]
-                    elif aggregation_strategy.name == 'MIN_POOLING':
-                        activation_agg = activation_full_seq.min(dim=0)[0]
-                    else:
-                        # Default to mean
-                        activation_agg = activation_full_seq.mean(dim=0)
-                    # Compute per-token classifier scores first (needed for max_score aggregation)
-                    token_scores = []
-                    for token_idx in range(activation_full_seq.shape[0]):
-                        token_act = activation_full_seq[token_idx].unsqueeze(0).float()
-                        token_proba_result = classifier.predict_proba(token_act)
-                        token_proba = token_proba_result if isinstance(token_proba_result, float) else token_proba_result[0]
-                        token_scores.append(float(token_proba))
-                    # Get classifier prediction - either from aggregated vector or min token score
-                    if use_max_token_score:
-                        # Use MINIMUM token score as response-level classification
-                        # Score = P(TRUTHFUL), so min score = most suspicious token
-                        # If ANY token has low truthfulness probability, flag the response
-                        pred_proba = min(token_scores) if token_scores else 0.5
-                    else:
-                        # Standard: classify aggregated activation vector
-                        act_tensor = activation_agg.unsqueeze(0).float()
-                        pred_proba_result = classifier.predict_proba(act_tensor)
-                        pred_proba = pred_proba_result if isinstance(pred_proba_result, float) else pred_proba_result[0]
+            layer_activations = collected_full.positive_response.layers_activations
+            if layer_str in layer_activations:
+                activation = layer_activations[layer_str]
+                if activation is not None and isinstance(activation, torch.Tensor):
+                    # activation shape: (hidden_dim,) - already aggregated by extraction strategy
+                    act_tensor = activation.unsqueeze(0).float()
+                    pred_proba_result = classifier.predict_proba(act_tensor)
+                    pred_proba = pred_proba_result if isinstance(pred_proba_result, float) else pred_proba_result[0]
                     pred_label = int(pred_proba > args.detection_threshold)
                     # Update detection stats
@@ -753,14 +711,6 @@ def execute_tasks(args):
                     # Ground truth from evaluator
                     ground_truth = 1 if eval_result.ground_truth == "TRUTHFUL" else 0
-                    # token_scores = P(TRUTHFUL) for each token
-                    # min_token_score = most suspicious token (lowest P(TRUTHFUL))
-                    # max_token_score = most confident token (highest P(TRUTHFUL))
-                    min_token_score = min(token_scores) if token_scores else 0.0
-                    min_token_idx = token_scores.index(min_token_score) if token_scores else -1
-                    max_token_score = max(token_scores) if token_scores else 0.0
-                    max_token_idx = token_scores.index(max_token_score) if token_scores else -1
                     generation_results.append({
                         'question': question,
                         'response': response,
@@ -770,13 +720,6 @@ def execute_tasks(args):
                         'classifier_pred': pred_label,
                         'classifier_proba': float(pred_proba),
                         'correct': pred_label == ground_truth,
-                        'token_scores': token_scores,  # Per-token P(TRUTHFUL) probabilities
-                        'min_token_score': min_token_score,  # Most suspicious token - lowest P(TRUTHFUL)
-                        'min_token_idx': min_token_idx,  # Index of most suspicious token
-                        'max_token_score': max_token_score,  # Most confident token - highest P(TRUTHFUL) (kept for backward compat)
-                        'max_token_idx': max_token_idx,  # Index of most confident token
-                        'num_tokens': len(token_scores),
-                        'aggregation_method': 'max_score' if use_max_token_score else args.token_aggregation,
                         'quality_score': quality_score,
                         'issue_detected': issue_detected,
                         'detection_type': detection_type,
@@ -852,7 +795,7 @@ def execute_tasks(args):
             classifier_type=args.classifier_type,
             training_accuracy=report.final.accuracy,
             training_samples=len(X),
-            token_aggregation=args.token_aggregation,
+            token_aggregation=extraction_strategy.value,
             detection_threshold=args.detection_threshold
         )
@@ -884,7 +827,7 @@ def execute_tasks(args):
                     'task': args.task_names,
                     'model': args.model,
                     'layer': layer,
-                    'aggregation': args.token_aggregation,
+                    'aggregation': extraction_strategy.value,
                     'threshold': args.detection_threshold,
                     'num_generations': len(generation_results),
                     'detection_stats': detection_stats,

wisent/core/cli/train_unified_goodness.py CHANGED Viewed

@@ -325,11 +325,11 @@ def execute_train_unified_goodness(args):
         'final': ExtractionStrategy.CHAT_LAST,
         'first': ExtractionStrategy.CHAT_FIRST,
         'max': ExtractionStrategy.CHAT_MAX_NORM,
-        'continuation': ExtractionStrategy.CHAT_GEN_POINT,
+        'continuation': ExtractionStrategy.CHAT_FIRST,  # First answer token
     }
     aggregation_strategy = aggregation_map.get(
         args.token_aggregation,
-        ExtractionStrategy.CHAT_GEN_POINT
+        ExtractionStrategy.CHAT_LAST
     )
     # Map prompt strategy
@@ -353,7 +353,7 @@ def execute_train_unified_goodness(args):
         negative_activations = activations_checkpoint['negative_activations']
         print(f"   ✓ Loaded activations from checkpoint ({len(positive_activations[layers[0]])} pairs)")
     else:
-        collector = ActivationCollector(model=model, store_device="cpu")
+        collector = ActivationCollector(model=model)
         # Collect activations for all training pairs using batched processing
         positive_activations = {layer: [] for layer in layers}

wisent/core/contrastive_pairs/diagnostics/control_vectors.py CHANGED Viewed

@@ -95,7 +95,7 @@ def run_control_vector_diagnostics(
             )
             continue
-        flat = detached.to(dtype=torch.float32, device="cpu").reshape(-1)
+        flat = detached.to(device="cpu").reshape(-1)
         if not torch.isfinite(flat).all():
             non_finite = (~torch.isfinite(flat)).sum().item()
@@ -1549,7 +1549,7 @@ def _detect_sparse_structure(
     sorted_abs = abs_diff.sort().values
     n = len(sorted_abs)
     cumsum = sorted_abs.cumsum(0)
-    gini = (2 * torch.arange(1, n + 1, dtype=torch.float32) @ sorted_abs - (n + 1) * sorted_abs.sum()) / (n * sorted_abs.sum() + 1e-10)
+    gini = (2 * torch.arange(1, n + 1, dtype=sorted_abs.dtype, device=sorted_abs.device) @ sorted_abs - (n + 1) * sorted_abs.sum()) / (n * sorted_abs.sum() + 1e-10)
     # Sparse score: high if few dimensions are active
     sparse_score = 0.4 * (1 - float(l1_l2_ratio)) + 0.3 * (1 - float(active_fraction)) + 0.3 * float(gini)
@@ -1632,11 +1632,11 @@ def _compute_dip_statistic(data: torch.Tensor) -> float:
         return 0.0
     # Empirical CDF
-    ecdf = torch.arange(1, n + 1, dtype=torch.float32) / n
+    ecdf = torch.arange(1, n + 1, dtype=sorted_data.dtype, device=sorted_data.device) / n
     # Greatest convex minorant and least concave majorant
     # Simplified: measure deviation from uniform
-    uniform = torch.linspace(0, 1, n)
+    uniform = torch.linspace(0, 1, n, dtype=sorted_data.dtype, device=sorted_data.device)
     # Kolmogorov-Smirnov like statistic
     ks_stat = (ecdf - uniform).abs().max()

wisent/core/contrastive_pairs/diagnostics/linearity.py CHANGED Viewed

@@ -188,6 +188,12 @@ def check_linearity(
                 linear_score = result.all_scores["linear"].score
                 linear_details = result.all_scores["linear"].details
+                # Include all structure scores
+                structure_scores = {
+                    name: {"score": score.score, "confidence": score.confidence}
+                    for name, score in result.all_scores.items()
+                }
                 all_results.append({
                     "extraction_strategy": strategy.value,
                     "normalize": normalize,
@@ -196,6 +202,7 @@ def check_linearity(
                     "cohens_d": linear_details.get("cohens_d", 0),
                     "variance_explained": linear_details.get("variance_explained", 0),
                     "best_structure": result.best_structure.value,
+                    "all_structure_scores": structure_scores,
                 })
     if not all_results:

wisent 0.7.701__py3-none-any.whl → 0.7.1045__py3-none-any.whl

wisent 0.7.701__py3-none-any.whl → 0.7.1045__py3-none-any.whl