wisent-0.5.13-py3-none-any.whl → wisent-0.5.15-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of wisent might be problematic.

Files changed (62)
  1. wisent/__init__.py +1 -1
  2. wisent/cli.py +114 -0
  3. wisent/core/activations/activations_collector.py +19 -11
  4. wisent/core/agent/__init__.py +1 -18
  5. wisent/core/agent/diagnose/__init__.py +1 -55
  6. wisent/core/cli/__init__.py +3 -1
  7. wisent/core/cli/create_steering_vector.py +60 -18
  8. wisent/core/cli/evaluate_responses.py +14 -8
  9. wisent/core/cli/generate_pairs_from_task.py +18 -5
  10. wisent/core/cli/get_activations.py +1 -1
  11. wisent/core/cli/multi_steer.py +108 -0
  12. wisent/core/cli/optimize_classification.py +187 -285
  13. wisent/core/cli/optimize_sample_size.py +78 -0
  14. wisent/core/cli/optimize_steering.py +354 -53
  15. wisent/core/cli/tasks.py +274 -9
  16. wisent/core/errors/__init__.py +0 -0
  17. wisent/core/errors/error_handler.py +134 -0
  18. wisent/core/evaluators/benchmark_specific/log_likelihoods_evaluator.py +152 -295
  19. wisent/core/evaluators/rotator.py +22 -8
  20. wisent/core/main.py +5 -1
  21. wisent/core/model_persistence.py +4 -19
  22. wisent/core/models/wisent_model.py +11 -3
  23. wisent/core/parser.py +4 -3
  24. wisent/core/parser_arguments/main_parser.py +1 -1
  25. wisent/core/parser_arguments/multi_steer_parser.py +4 -3
  26. wisent/core/parser_arguments/optimize_steering_parser.py +4 -0
  27. wisent/core/sample_size_optimizer_v2.py +1 -1
  28. wisent/core/steering_optimizer.py +2 -2
  29. wisent/tests/__init__.py +0 -0
  30. wisent/tests/examples/__init__.py +0 -0
  31. wisent/tests/examples/cli/__init__.py +0 -0
  32. wisent/tests/examples/cli/activations/__init__.py +0 -0
  33. wisent/tests/examples/cli/activations/test_get_activations.py +127 -0
  34. wisent/tests/examples/cli/classifier/__init__.py +0 -0
  35. wisent/tests/examples/cli/classifier/test_classifier_examples.py +141 -0
  36. wisent/tests/examples/cli/contrastive_pairs/__init__.py +0 -0
  37. wisent/tests/examples/cli/contrastive_pairs/test_generate_pairs.py +89 -0
  38. wisent/tests/examples/cli/evaluation/__init__.py +0 -0
  39. wisent/tests/examples/cli/evaluation/test_evaluation_examples.py +117 -0
  40. wisent/tests/examples/cli/generate/__init__.py +0 -0
  41. wisent/tests/examples/cli/generate/test_generate_with_classifier.py +146 -0
  42. wisent/tests/examples/cli/generate/test_generate_with_steering.py +149 -0
  43. wisent/tests/examples/cli/generate/test_only_generate.py +110 -0
  44. wisent/tests/examples/cli/multi_steering/__init__.py +0 -0
  45. wisent/tests/examples/cli/multi_steering/test_multi_steer_from_trained_vectors.py +210 -0
  46. wisent/tests/examples/cli/multi_steering/test_multi_steer_with_different_parameters.py +205 -0
  47. wisent/tests/examples/cli/multi_steering/test_train_and_multi_steer.py +174 -0
  48. wisent/tests/examples/cli/optimizer/__init__.py +0 -0
  49. wisent/tests/examples/cli/optimizer/test_optimize_sample_size.py +102 -0
  50. wisent/tests/examples/cli/optimizer/test_optimizer_examples.py +59 -0
  51. wisent/tests/examples/cli/steering/__init__.py +0 -0
  52. wisent/tests/examples/cli/steering/test_create_steering_vectors.py +135 -0
  53. wisent/tests/examples/cli/synthetic/__init__.py +0 -0
  54. wisent/tests/examples/cli/synthetic/test_synthetic_pairs.py +45 -0
  55. {wisent-0.5.13.dist-info → wisent-0.5.15.dist-info}/METADATA +3 -1
  56. {wisent-0.5.13.dist-info → wisent-0.5.15.dist-info}/RECORD +61 -31
  57. wisent/core/agent/diagnose/test_synthetic_classifier.py +0 -71
  58. /wisent/core/parser_arguments/{test_nonsense_parser.py → nonsense_parser.py} +0 -0
  59. {wisent-0.5.13.dist-info → wisent-0.5.15.dist-info}/WHEEL +0 -0
  60. {wisent-0.5.13.dist-info → wisent-0.5.15.dist-info}/entry_points.txt +0 -0
  61. {wisent-0.5.13.dist-info → wisent-0.5.15.dist-info}/licenses/LICENSE +0 -0
  62. {wisent-0.5.13.dist-info → wisent-0.5.15.dist-info}/top_level.txt +0 -0
wisent/core/cli/tasks.py CHANGED
@@ -17,6 +17,37 @@ def execute_tasks(args):
     from wisent.core.classifiers.classifiers.core.atoms import ClassifierTrainConfig
     from wisent.core.model_persistence import ModelPersistence, create_classifier_metadata
 
+    # Check if this is inference-only mode with steering vector
+    if args.inference_only and args.load_steering_vector:
+        import torch
+        print(f"\n🎯 Starting inference with steering vector")
+        print(f" Loading vector from: {args.load_steering_vector}")
+
+        # Load steering vector
+        vector_data = torch.load(args.load_steering_vector)
+        steering_vector = vector_data['vector']
+        layer = vector_data['layer']
+
+        print(f" ✓ Loaded steering vector for layer {layer}")
+        print(f" Model: {vector_data.get('model', 'unknown')}")
+        print(f" Method: {vector_data.get('method', 'unknown')}")
+
+        # For now, just load and validate - actual inference would require more implementation
+        print(f"\n✅ Steering vector loaded successfully!\n")
+        print(f"Note: Inference with steering vector requires additional implementation")
+
+        # Return results for programmatic access
+        return {
+            "steering_vector_loaded": True,
+            "vector_path": args.load_steering_vector,
+            "layer": layer,
+            "method": vector_data.get('method', 'unknown'),
+            "test_accuracy": None,
+            "test_f1_score": None,
+            "training_time": 0.0,
+            "evaluation_results": {}
+        }
+
     print(f"\n🎯 Starting classifier training on task: {args.task_names}")
     print(f" Model: {args.model}")
     print(f" Layer: {args.layer}")
@@ -24,10 +55,11 @@ def execute_tasks(args):
 
     try:
         # 1. Load task data using LMEvalDataLoader
-        print(f"\n📊 Loading task '{args.task_names}'...")
+        task_name = args.task_names[0] if isinstance(args.task_names, list) else args.task_names
+        print(f"\n📊 Loading task '{task_name}'...")
         loader = LMEvalDataLoader()
         result = loader._load_one_task(
-            task_name=args.task_names,
+            task_name=task_name,
             split_ratio=args.split_ratio,
             seed=args.seed,
             limit=args.limit,
@@ -94,6 +126,70 @@ def execute_tasks(args):
 
         print(f"\n ✓ Collected {len(positive_activations)} positive and {len(negative_activations)} negative activations")
 
+        # Check if steering vector mode is requested
+        if args.save_steering_vector and args.train_only:
+            import torch
+            from wisent.core.steering_methods.methods.caa import CAAMethod
+
+            print(f"\n🎯 Training steering vector using {args.steering_method} method...")
+
+            # Convert activations to tensors
+            pos_tensors = [torch.from_numpy(act).float() for act in positive_activations]
+            neg_tensors = [torch.from_numpy(act).float() for act in negative_activations]
+
+            # Create steering method
+            steering_method = CAAMethod(normalize=True)
+
+            # Train steering vector
+            steering_vector = steering_method.train_for_layer(pos_tensors, neg_tensors)
+
+            # Save steering vector
+            print(f"\n💾 Saving steering vector to '{args.save_steering_vector}'...")
+            os.makedirs(os.path.dirname(args.save_steering_vector) or '.', exist_ok=True)
+            torch.save({
+                'steering_vector': steering_vector,
+                'layer_index': layer,
+                'method': args.steering_method,
+                'model': args.model,
+                'task': args.task_names,
+                # Legacy keys for backward compatibility
+                'vector': steering_vector,
+                'layer': layer,
+            }, args.save_steering_vector)
+            print(f" ✓ Steering vector saved to: {args.save_steering_vector}")
+
+            # Save output artifacts if requested
+            if args.output:
+                print(f"\n📁 Saving artifacts to '{args.output}'...")
+                os.makedirs(args.output, exist_ok=True)
+                report_path = os.path.join(args.output, 'training_report.json')
+                with open(report_path, 'w') as f:
+                    json.dump({
+                        'method': args.steering_method,
+                        'layer': layer,
+                        'num_positive': len(positive_activations),
+                        'num_negative': len(negative_activations),
+                        'vector_shape': list(steering_vector.shape)
+                    }, f, indent=2)
+                print(f" ✓ Training report saved to: {report_path}")
+
+            print(f"\n✅ Steering vector training completed successfully!\n")
+
+            # Return results for programmatic access
+            return {
+                "steering_vector_saved": True,
+                "vector_path": args.save_steering_vector,
+                "layer": layer,
+                "method": args.steering_method,
+                "num_positive": len(positive_activations),
+                "num_negative": len(negative_activations),
+                "vector_shape": list(steering_vector.shape),
+                "test_accuracy": None,
+                "test_f1_score": None,
+                "training_time": 0.0,
+                "evaluation_results": {}
+            }
+
         # 6. Prepare training data
         print(f"\n🎯 Preparing training data...")
         X_positive = np.array(positive_activations)
@@ -126,15 +222,155 @@ def execute_tasks(args):
         # Train the classifier
         report = classifier.fit(X, y, config=train_config)
 
-        # 8. Print results
+        # 8. Print training completion
        print(f"\n📈 Training completed!")
         print(f" Best epoch: {report.best_epoch}/{report.epochs_ran}")
-        print(f" Final metrics:")
-        print(f" • Accuracy: {report.final.accuracy:.4f}")
-        print(f" Precision: {report.final.precision:.4f}")
-        print(f" • Recall: {report.final.recall:.4f}")
-        print(f" • F1 Score: {report.final.f1:.4f}")
-        print(f" • AUC: {report.final.auc:.4f}")
+
+        # 8.5. PROPER EVALUATION: Test classifier on real model generations
+        print(f"\n🎯 Evaluating classifier on real model generations...")
+
+        # Get test pairs
+        test_pairs = result['test_qa_pairs']
+        print(f" Generating responses for {len(test_pairs.pairs)} test questions...")
+
+        # Initialize evaluator for this task
+        from wisent.core.evaluators.rotator import EvaluatorRotator
+        # Discover both oracles and benchmark_specific evaluators
+        EvaluatorRotator.discover_evaluators("wisent.core.evaluators.oracles")
+        EvaluatorRotator.discover_evaluators("wisent.core.evaluators.benchmark_specific")
+        evaluator = EvaluatorRotator(evaluator=None, task_name=task_name, autoload=False)
+        print(f" Using evaluator: {evaluator._evaluator.name}")
+
+        # Generate responses and collect activations
+        generation_results = []
+        for i, pair in enumerate(test_pairs.pairs):
+            if i % 10 == 0:
+                print(f" Processing {i+1}/{len(test_pairs.pairs)}...", end='\r')
+
+            question = pair.prompt
+            expected = pair.positive_response.model_response
+            choices = [pair.negative_response.model_response, pair.positive_response.model_response]
+
+            # Generate response from unsteered model
+            response = model.generate(
+                [[{"role": "user", "content": question}]],
+                max_new_tokens=100,
+                do_sample=False  # Deterministic (greedy decoding) for evaluation
+            )[0]
+
+            # Evaluate the response using Wisent evaluator
+            eval_result = evaluator.evaluate(
+                response=response,
+                expected=expected,
+                model=model,
+                question=question,
+                choices=choices,
+                task_name=task_name
+            )
+
+            # Get activation for this generation
+            # Use ActivationCollector to collect activations from the generated text
+            gen_collector = ActivationCollector(model=model, store_device="cpu")
+            # Create a pair with the generated response
+            from wisent.core.contrastive_pairs.core.response import PositiveResponse, NegativeResponse
+            from wisent.core.contrastive_pairs.core.pair import ContrastivePair
+            temp_pos_response = PositiveResponse(model_response=response, layers_activations={})
+            temp_neg_response = NegativeResponse(model_response="placeholder", layers_activations={})  # Not used
+            temp_pair = ContrastivePair(
+                prompt=question,
+                positive_response=temp_pos_response,
+                negative_response=temp_neg_response,
+                label=None,
+                trait_description=None
+            )
+
+            # Collect activation - ActivationCollector will re-run the model with prompt+response
+            # First, collect with full sequence to get token-by-token activations
+            collected_full = gen_collector.collect_for_pair(
+                temp_pair,
+                layers=[layer_str],
+                aggregation=aggregation_strategy,
+                return_full_sequence=True,
+                normalize_layers=False
+            )
+
+            # Access the collected activations
+            import torch
+            if collected_full.positive_response.layers_activations:
+                layer_activations_full = collected_full.positive_response.layers_activations
+                if layer_str in layer_activations_full:
+                    activation_full_seq = layer_activations_full[layer_str]
+                    if activation_full_seq is not None and isinstance(activation_full_seq, torch.Tensor):
+                        # activation_full_seq shape: (num_tokens, hidden_dim)
+
+                        # Apply aggregation manually to get single vector for classifier
+                        if aggregation_strategy.name == 'MEAN_POOLING':
+                            activation_agg = activation_full_seq.mean(dim=0)
+                        elif aggregation_strategy.name == 'LAST_TOKEN':
+                            activation_agg = activation_full_seq[-1]
+                        elif aggregation_strategy.name == 'FIRST_TOKEN':
+                            activation_agg = activation_full_seq[0]
+                        elif aggregation_strategy.name == 'MAX_POOLING':
+                            activation_agg = activation_full_seq.max(dim=0)[0]
+                        else:
+                            # Default to mean
+                            activation_agg = activation_full_seq.mean(dim=0)
+
+                        # Get classifier prediction on aggregated vector
+                        act_tensor = activation_agg.unsqueeze(0).float()
+                        pred_proba_result = classifier.predict_proba(act_tensor)
+                        # Handle both float (single sample) and list return types
+                        pred_proba = pred_proba_result if isinstance(pred_proba_result, float) else pred_proba_result[0]
+                        pred_label = int(pred_proba > args.detection_threshold)
+
+                        # Ground truth from evaluator
+                        ground_truth = 1 if eval_result.ground_truth == "TRUTHFUL" else 0
+
+                        # Compute per-token classifier scores
+                        # For each token, get classifier probability
+                        token_scores = []
+                        for token_idx in range(activation_full_seq.shape[0]):
+                            token_act = activation_full_seq[token_idx].unsqueeze(0).float()
+                            token_proba_result = classifier.predict_proba(token_act)
+                            token_proba = token_proba_result if isinstance(token_proba_result, float) else token_proba_result[0]
+                            token_scores.append(float(token_proba))
+
+                        generation_results.append({
+                            'question': question,
+                            'response': response,
+                            'expected': expected,
+                            'eval_result': eval_result.ground_truth,
+                            'classifier_pred': pred_label,
+                            'classifier_proba': float(pred_proba),
+                            'correct': pred_label == ground_truth,
+                            'token_scores': token_scores,  # Per-token classifier probabilities
+                            'num_tokens': len(token_scores)
+                        })
+
+        print(f"\n ✓ Evaluated {len(generation_results)} generations")
+
+        # Calculate real-world metrics
+        if generation_results:
+            correct_predictions = sum(1 for r in generation_results if r['correct'])
+            real_accuracy = correct_predictions / len(generation_results)
+
+            # Calculate precision, recall, F1 on real generations
+            true_positives = sum(1 for r in generation_results if r['classifier_pred'] == 1 and r['eval_result'] == 'TRUTHFUL')
+            false_positives = sum(1 for r in generation_results if r['classifier_pred'] == 1 and r['eval_result'] == 'UNTRUTHFUL')
+            false_negatives = sum(1 for r in generation_results if r['classifier_pred'] == 0 and r['eval_result'] == 'TRUTHFUL')
+
+            real_precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
+            real_recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
+            real_f1 = 2 * (real_precision * real_recall) / (real_precision + real_recall) if (real_precision + real_recall) > 0 else 0
+
+            print(f"\n 📊 Real-world performance (on actual generations):")
+            print(f" • Accuracy: {real_accuracy:.4f}")
+            print(f" • Precision: {real_precision:.4f}")
+            print(f" • Recall: {real_recall:.4f}")
+            print(f" • F1 Score: {real_f1:.4f}")
+        else:
+            real_accuracy = real_f1 = real_precision = real_recall = 0.0
+            generation_results = []
 
         # 9. Save classifier if requested
         if args.save_classifier:
@@ -172,8 +408,37 @@ def execute_tasks(args):
                 json.dump(report.asdict(), f, indent=2)
             print(f" ✓ Training report saved to: {report_path}")
 
+            # Save generation details with token scores
+            if generation_results:
+                generation_path = os.path.join(args.output, 'generation_details.json')
+                with open(generation_path, 'w') as f:
+                    json.dump({
+                        'task': args.task_names,
+                        'model': args.model,
+                        'layer': layer,
+                        'aggregation': args.token_aggregation,
+                        'threshold': args.detection_threshold,
+                        'num_generations': len(generation_results),
+                        'generations': generation_results
+                    }, f, indent=2)
+                print(f" ✓ Generation details (with token scores) saved to: {generation_path}")
+
         print(f"\n✅ Task completed successfully!\n")
 
+        # Return results for programmatic access
+        return {
+            # Real-world metrics (on actual generations) - THE ONLY METRICS THAT MATTER
+            "accuracy": float(real_accuracy),
+            "f1_score": float(real_f1),
+            "precision": float(real_precision),
+            "recall": float(real_recall),
+            "generation_count": len(generation_results),
+            # Metadata
+            "best_epoch": report.best_epoch,
+            "epochs_ran": report.epochs_ran,
+            "generation_details": generation_results
+        }
+
     except Exception as e:
         print(f"\n❌ Error: {str(e)}", file=sys.stderr)
         if args.verbose:
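
For orientation, a minimal sketch of how the steering-vector file written by the new save-steering-vector branch above could be read back. The file name is a placeholder and the shape comment is an assumption based on the CAA difference-of-means construction; only the dictionary keys come from the diff.

```python
# Sketch only: inspect a vector saved by the torch.save(...) call in the diff above.
# "steering_vector.pt" is a placeholder path, not a file shipped with the package.
import torch

data = torch.load("steering_vector.pt", map_location="cpu")

vector = data["steering_vector"]   # new-style key; typically a (hidden_dim,) tensor
layer = data["layer_index"]
method = data["method"]

# Legacy keys are written alongside the new ones; the inference-only branch
# (args.inference_only + args.load_steering_vector) reads 'vector' and 'layer'.
assert torch.equal(data["vector"], vector)
assert data["layer"] == layer

print(f"method={method}, layer={layer}, shape={tuple(vector.shape)}")
```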
wisent/core/errors/error_handler.py ADDED
@@ -0,0 +1,134 @@
+"""Comprehensive error handling for Wisent.
+
+This module provides informative error classes and utilities for proper error handling
+throughout the codebase. NO FALLBACKS - errors should be raised immediately with
+detailed information about what went wrong and how to fix it.
+"""
+
+import logging
+from typing import Optional, Any, Dict
+
+logger = logging.getLogger(__name__)
+
+
+class WisentError(Exception):
+    """Base exception for all Wisent errors."""
+
+    def __init__(self, message: str, details: Optional[Dict[str, Any]] = None):
+        self.message = message
+        self.details = details or {}
+        super().__init__(self.message)
+
+    def __str__(self):
+        if self.details:
+            details_str = "\n".join(f" - {k}: {v}" for k, v in self.details.items())
+            return f"{self.message}\nDetails:\n{details_str}"
+        return self.message
+
+
+class EvaluationError(WisentError):
+    """Raised when evaluation fails."""
+    pass
+
+
+class MissingParameterError(EvaluationError):
+    """Raised when required parameters are missing for evaluation."""
+
+    def __init__(self, missing_params: list, evaluator_name: str, task_name: Optional[str] = None):
+        message = f"Evaluator '{evaluator_name}' requires missing parameters: {', '.join(missing_params)}"
+        details = {
+            "evaluator": evaluator_name,
+            "missing_parameters": missing_params,
+            "task": task_name or "unknown"
+        }
+        super().__init__(message, details)
+
+
+class InvalidChoicesError(EvaluationError):
+    """Raised when choices are invalid or missing for multiple choice evaluation."""
+
+    def __init__(self, reason: str, task_name: str, choices: Optional[list] = None):
+        message = f"Invalid choices for task '{task_name}': {reason}"
+        details = {
+            "task": task_name,
+            "reason": reason,
+            "choices_provided": choices
+        }
+        super().__init__(message, details)
+
+
+class ModelNotProvidedError(EvaluationError):
+    """Raised when model is required but not provided."""
+
+    def __init__(self, evaluator_name: str, task_name: str):
+        message = (
+            f"Evaluator '{evaluator_name}' requires a model for log likelihood computation, "
+            f"but none was provided for task '{task_name}'. "
+            f"Pass model=<WisentModel> in kwargs to evaluate()."
+        )
+        details = {
+            "evaluator": evaluator_name,
+            "task": task_name,
+            "solution": "Pass model parameter in kwargs"
+        }
+        super().__init__(message, details)
+
+
+def require_all_parameters(params: Dict[str, Any], context: str, task_name: Optional[str] = None):
+    """Raise error if any required parameters are None or missing.
+
+    Args:
+        params: Dict of parameter_name -> value
+        context: Context where parameters are required
+        task_name: Optional task name for better error messages
+
+    Raises:
+        MissingParameterError: If any parameters are None
+    """
+    missing = [name for name, value in params.items() if value is None]
+    if missing:
+        raise MissingParameterError(
+            missing_params=missing,
+            evaluator_name=context,
+            task_name=task_name
+        )
+
+
+def validate_choices(choices: Optional[list], task_name: str, min_choices: int = 2):
+    """Validate that choices are provided and valid.
+
+    Args:
+        choices: List of answer choices
+        task_name: Name of the task
+        min_choices: Minimum number of choices required
+
+    Raises:
+        InvalidChoicesError: If choices are invalid
+    """
+    if choices is None:
+        raise InvalidChoicesError(
+            reason="No choices provided",
+            task_name=task_name,
+            choices=None
+        )
+
+    if not isinstance(choices, list):
+        raise InvalidChoicesError(
+            reason=f"Choices must be a list, got {type(choices).__name__}",
+            task_name=task_name,
+            choices=choices
+        )
+
+    if len(choices) < min_choices:
+        raise InvalidChoicesError(
+            reason=f"Need at least {min_choices} choices, got {len(choices)}",
+            task_name=task_name,
+            choices=choices
+        )
+
+    if any(not isinstance(c, str) or not c.strip() for c in choices):
+        raise InvalidChoicesError(
+            reason="All choices must be non-empty strings",
+            task_name=task_name,
+            choices=choices
+        )
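
To make the intended call pattern of the new error module concrete, here is a small usage sketch. The module path and helper names come from this diff; the evaluator/context name, task name, and argument values are illustrative stand-ins, not code taken from the package.

```python
# Hypothetical call site using the helpers added in error_handler.py.
from wisent.core.errors.error_handler import (
    MissingParameterError,
    require_all_parameters,
    validate_choices,
)

def evaluate(response, expected=None, choices=None, task_name="example_task"):
    # Fail fast with MissingParameterError if any required argument is None.
    require_all_parameters(
        {"response": response, "expected": expected},
        context="example_evaluator",
        task_name=task_name,
    )
    # Fail fast with InvalidChoicesError if choices are missing or malformed.
    validate_choices(choices, task_name=task_name, min_choices=2)
    return True

try:
    evaluate("Paris is the capital of France.")
except MissingParameterError as err:
    # __str__ prints the message plus the "Details:" block built by WisentError.
    print(err)
```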