PyPI - wisent - Versions diffs - 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl - Mend

wisent 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1020) hide show

wisent/core/cli/tasks.py CHANGED Viewed

@@ -14,8 +14,8 @@ def execute_tasks(args):
     from wisent.core.data_loaders.loaders.lm_loader import LMEvalDataLoader
     from wisent.core.models.wisent_model import WisentModel
     from wisent.core.activations.activations_collector import ActivationCollector
-    from wisent.core.activations.core.atoms import ActivationAggregationStrategy
-    from wisent.core.activations.prompt_construction_strategy import PromptConstructionStrategy
+    from wisent.core.activations.extraction_strategy import ExtractionStrategy
     from wisent.core.classifiers.classifiers.models.logistic import LogisticClassifier
     from wisent.core.classifiers.classifiers.models.mlp import MLPClassifier
     from wisent.core.classifiers.classifiers.core.atoms import ClassifierTrainConfig
@@ -161,12 +161,10 @@ def execute_tasks(args):
         # Determine layer range
         if hasattr(args, 'optimize_layers') and args.optimize_layers == 'all':
-            layer_range = list(range(1, num_layers + 1))
+            layer_range = list(range(num_layers))
         else:
-            # Default: test middle 50% of layers
-            start = num_layers // 4
-            end = (3 * num_layers) // 4
-            layer_range = list(range(start, end + 1))
+            # Default: test ALL layers (0-indexed)
+            layer_range = list(range(num_layers))
         # Create optimization config
         config = OptimizationConfig(
@@ -416,34 +414,11 @@ def execute_tasks(args):
     print(f"\n🧠 Extracting activations from layer {layer}...")
     # 5. Collect activations for all pairs
-    collector = ActivationCollector(model=model, store_device="cpu")
-    # Map parser values to enum members
-    aggregation_map = {
-        'average': 'MEAN_POOLING',
-        'final': 'LAST_TOKEN',
-        'first': 'FIRST_TOKEN',
-        'max': 'MAX_POOLING',
-        'min': 'MAX_POOLING',  # Fallback to MAX_POOLING for min
-        'max_score': 'MEAN_POOLING',  # Will use mean for training, but max token score for inference
-    }
-    aggregation_key = aggregation_map.get(args.token_aggregation.lower(), 'MEAN_POOLING')
-    aggregation_strategy = ActivationAggregationStrategy[aggregation_key]
-    use_max_token_score = args.token_aggregation.lower() == 'max_score'
-    # Map prompt construction strategy from CLI to enum
-    prompt_strategy_map = {
-        'multiple_choice': PromptConstructionStrategy.MULTIPLE_CHOICE,
-        'role_playing': PromptConstructionStrategy.ROLE_PLAYING,
-        'direct_completion': PromptConstructionStrategy.DIRECT_COMPLETION,
-        'instruction_following': PromptConstructionStrategy.INSTRUCTION_FOLLOWING,
-        'chat_template': PromptConstructionStrategy.CHAT_TEMPLATE,
-    }
-    prompt_strategy = prompt_strategy_map.get(
-        getattr(args, 'prompt_construction_strategy', 'chat_template'),
-        PromptConstructionStrategy.CHAT_TEMPLATE
-    )
-    print(f"   Prompt construction strategy: {prompt_strategy.value}")
+    collector = ActivationCollector(model=model)
+    # Get extraction strategy from args (already an ExtractionStrategy value string)
+    extraction_strategy = ExtractionStrategy(getattr(args, 'extraction_strategy', 'chat_last'))
+    print(f"   Extraction strategy: {extraction_strategy.value}")
     positive_activations = []
     negative_activations = []
@@ -456,13 +431,9 @@ def execute_tasks(args):
             print(f"   Processing pair {i+1}/{len(pair_set.pairs)}...", end='\r')
         # Collect for positive (correct) response
-        updated_pair = collector.collect_for_pair(
-            pair,
+        updated_pair = collector.collect(
+            pair, strategy=extraction_strategy,
             layers=[layer_str],
-            aggregation=aggregation_strategy,
-            return_full_sequence=False,
-            normalize_layers=False,
-            prompt_strategy=prompt_strategy
         )
         # Extract activations from positive and negative responses
@@ -610,13 +581,6 @@ def execute_tasks(args):
         expected = pair.positive_response.model_response
         choices = [pair.negative_response.model_response, pair.positive_response.model_response]
-        # Extract test_code from pair metadata for coding tasks
-        test_code = None
-        starter_code = None
-        if hasattr(pair, 'metadata') and pair.metadata:
-            test_code = pair.metadata.get('test_code')
-            starter_code = pair.metadata.get('starter_code')
         # Generate response from unsteered model
         messages = [{"role": "user", "content": question}]
@@ -626,6 +590,7 @@ def execute_tasks(args):
         )[0]
         # Evaluate the response using Wisent evaluator
+        # Pass all pair metadata to evaluator - each evaluator uses what it needs
         eval_kwargs = {
             'response': response,
             'expected': expected,
@@ -634,16 +599,16 @@ def execute_tasks(args):
             'choices': choices,
             'task_name': task_name,
         }
-        # Add test_code for coding tasks (livecodebench, humaneval, mbpp, etc.)
-        if test_code:
-            eval_kwargs['test_code'] = test_code
-        if starter_code:
-            eval_kwargs['starter_code'] = starter_code
+        # Add all pair metadata to eval_kwargs (test_code, correct_answers, etc.)
+        if hasattr(pair, 'metadata') and pair.metadata:
+            for key, value in pair.metadata.items():
+                if value is not None and key not in eval_kwargs:
+                    eval_kwargs[key] = value
         eval_result = evaluator.evaluate(**eval_kwargs)
         # Get activation for this generation
         # Use ActivationCollector to collect activations from the generated text
-        gen_collector = ActivationCollector(model=model, store_device="cpu")
+        gen_collector = ActivationCollector(model=model)
         # Create a pair with the generated response
         from wisent.core.contrastive_pairs.core.response import PositiveResponse, NegativeResponse
         from wisent.core.contrastive_pairs.core.pair import ContrastivePair
@@ -659,57 +624,21 @@ def execute_tasks(args):
         # Collect activation - ActivationCollector will re-run the model with prompt+response
         # First, collect with full sequence to get token-by-token activations
-        collected_full = gen_collector.collect_for_pair(
-            temp_pair,
-            layers=[layer_str],
-            aggregation=aggregation_strategy,
-            return_full_sequence=True,
-            normalize_layers=False,
-            prompt_strategy=prompt_strategy
+        collected_full = gen_collector.collect(
+            temp_pair, strategy=extraction_strategy,
         )
         # Access the collected activations
         import torch
         if collected_full.positive_response.layers_activations:
-            layer_activations_full = collected_full.positive_response.layers_activations
-            if layer_str in layer_activations_full:
-                activation_full_seq = layer_activations_full[layer_str]
-                if activation_full_seq is not None and isinstance(activation_full_seq, torch.Tensor):
-                    # activation_full_seq shape: (num_tokens, hidden_dim)
-                    # Apply aggregation manually to get single vector for classifier
-                    if aggregation_strategy.name == 'MEAN_POOLING':
-                        activation_agg = activation_full_seq.mean(dim=0)
-                    elif aggregation_strategy.name == 'LAST_TOKEN':
-                        activation_agg = activation_full_seq[-1]
-                    elif aggregation_strategy.name == 'FIRST_TOKEN':
-                        activation_agg = activation_full_seq[0]
-                    elif aggregation_strategy.name == 'MAX_POOLING':
-                        activation_agg = activation_full_seq.max(dim=0)[0]
-                    else:
-                        # Default to mean
-                        activation_agg = activation_full_seq.mean(dim=0)
-                    # Compute per-token classifier scores first (needed for max_score aggregation)
-                    token_scores = []
-                    for token_idx in range(activation_full_seq.shape[0]):
-                        token_act = activation_full_seq[token_idx].unsqueeze(0).float()
-                        token_proba_result = classifier.predict_proba(token_act)
-                        token_proba = token_proba_result if isinstance(token_proba_result, float) else token_proba_result[0]
-                        token_scores.append(float(token_proba))
-                    # Get classifier prediction - either from aggregated vector or min token score
-                    if use_max_token_score:
-                        # Use MINIMUM token score as response-level classification
-                        # Score = P(TRUTHFUL), so min score = most suspicious token
-                        # If ANY token has low truthfulness probability, flag the response
-                        pred_proba = min(token_scores) if token_scores else 0.5
-                    else:
-                        # Standard: classify aggregated activation vector
-                        act_tensor = activation_agg.unsqueeze(0).float()
-                        pred_proba_result = classifier.predict_proba(act_tensor)
-                        pred_proba = pred_proba_result if isinstance(pred_proba_result, float) else pred_proba_result[0]
+            layer_activations = collected_full.positive_response.layers_activations
+            if layer_str in layer_activations:
+                activation = layer_activations[layer_str]
+                if activation is not None and isinstance(activation, torch.Tensor):
+                    # activation shape: (hidden_dim,) - already aggregated by extraction strategy
+                    act_tensor = activation.unsqueeze(0).float()
+                    pred_proba_result = classifier.predict_proba(act_tensor)
+                    pred_proba = pred_proba_result if isinstance(pred_proba_result, float) else pred_proba_result[0]
                     pred_label = int(pred_proba > args.detection_threshold)
                     # Update detection stats
@@ -782,14 +711,6 @@ def execute_tasks(args):
                     # Ground truth from evaluator
                     ground_truth = 1 if eval_result.ground_truth == "TRUTHFUL" else 0
-                    # token_scores = P(TRUTHFUL) for each token
-                    # min_token_score = most suspicious token (lowest P(TRUTHFUL))
-                    # max_token_score = most confident token (highest P(TRUTHFUL))
-                    min_token_score = min(token_scores) if token_scores else 0.0
-                    min_token_idx = token_scores.index(min_token_score) if token_scores else -1
-                    max_token_score = max(token_scores) if token_scores else 0.0
-                    max_token_idx = token_scores.index(max_token_score) if token_scores else -1
                     generation_results.append({
                         'question': question,
                         'response': response,
@@ -799,13 +720,6 @@ def execute_tasks(args):
                         'classifier_pred': pred_label,
                         'classifier_proba': float(pred_proba),
                         'correct': pred_label == ground_truth,
-                        'token_scores': token_scores,  # Per-token P(TRUTHFUL) probabilities
-                        'min_token_score': min_token_score,  # Most suspicious token - lowest P(TRUTHFUL)
-                        'min_token_idx': min_token_idx,  # Index of most suspicious token
-                        'max_token_score': max_token_score,  # Most confident token - highest P(TRUTHFUL) (kept for backward compat)
-                        'max_token_idx': max_token_idx,  # Index of most confident token
-                        'num_tokens': len(token_scores),
-                        'aggregation_method': 'max_score' if use_max_token_score else args.token_aggregation,
                         'quality_score': quality_score,
                         'issue_detected': issue_detected,
                         'detection_type': detection_type,
@@ -881,7 +795,7 @@ def execute_tasks(args):
             classifier_type=args.classifier_type,
             training_accuracy=report.final.accuracy,
             training_samples=len(X),
-            token_aggregation=args.token_aggregation,
+            token_aggregation=extraction_strategy.value,
             detection_threshold=args.detection_threshold
         )
@@ -913,7 +827,7 @@ def execute_tasks(args):
                     'task': args.task_names,
                     'model': args.model,
                     'layer': layer,
-                    'aggregation': args.token_aggregation,
+                    'aggregation': extraction_strategy.value,
                     'threshold': args.detection_threshold,
                     'num_generations': len(generation_results),
                     'detection_stats': detection_stats,

wisent/core/cli/train_unified_goodness.py CHANGED Viewed

@@ -79,8 +79,8 @@ def execute_train_unified_goodness(args):
     from wisent.core.data_loaders.loaders.lm_loader import LMEvalDataLoader
     from wisent.core.models.wisent_model import WisentModel
     from wisent.core.activations.activations_collector import ActivationCollector
-    from wisent.core.activations.core.atoms import ActivationAggregationStrategy
-    from wisent.core.activations.prompt_construction_strategy import PromptConstructionStrategy
+    from wisent.core.activations.extraction_strategy import ExtractionStrategy
     from wisent.core.contrastive_pairs.core.set import ContrastivePairSet
     from wisent.core.contrastive_pairs.lm_eval_pairs.lm_task_pairs_generation import lm_build_contrastive_pairs
     from wisent.core.steering_methods.methods.caa import CAAMethod
@@ -163,10 +163,9 @@ def execute_train_unified_goodness(args):
             else:
                 layers.append(part)
     else:
-        # Use middle layer by default
-        middle_layer = model.num_layers // 2
-        layers = [str(middle_layer)]
-        print(f"   Using middle layer: {middle_layer}")
+        # Use ALL layers by default
+        layers = [str(i) for i in range(model.num_layers)]
+        print(f"   Using ALL layers: 0 to {model.num_layers - 1}")
     print(f"   ✓ Target layers: {layers}")
@@ -322,28 +321,28 @@ def execute_train_unified_goodness(args):
     # Map aggregation strategy
     aggregation_map = {
-        'average': ActivationAggregationStrategy.MEAN_POOLING,
-        'final': ActivationAggregationStrategy.LAST_TOKEN,
-        'first': ActivationAggregationStrategy.FIRST_TOKEN,
-        'max': ActivationAggregationStrategy.MAX_POOLING,
-        'continuation': ActivationAggregationStrategy.CONTINUATION_TOKEN,
+        'average': ExtractionStrategy.CHAT_MEAN,
+        'final': ExtractionStrategy.CHAT_LAST,
+        'first': ExtractionStrategy.CHAT_FIRST,
+        'max': ExtractionStrategy.CHAT_MAX_NORM,
+        'continuation': ExtractionStrategy.CHAT_FIRST,  # First answer token
     }
     aggregation_strategy = aggregation_map.get(
         args.token_aggregation,
-        ActivationAggregationStrategy.CONTINUATION_TOKEN
+        ExtractionStrategy.CHAT_LAST
     )
     # Map prompt strategy
     prompt_strategy_map = {
-        'chat_template': PromptConstructionStrategy.CHAT_TEMPLATE,
-        'direct_completion': PromptConstructionStrategy.DIRECT_COMPLETION,
-        'instruction_following': PromptConstructionStrategy.INSTRUCTION_FOLLOWING,
-        'multiple_choice': PromptConstructionStrategy.MULTIPLE_CHOICE,
-        'role_playing': PromptConstructionStrategy.ROLE_PLAYING,
+        'chat_template': ExtractionStrategy.CHAT_LAST,
+        'direct_completion': ExtractionStrategy.CHAT_LAST,
+        'instruction_following': ExtractionStrategy.CHAT_LAST,
+        'multiple_choice': ExtractionStrategy.MC_BALANCED,
+        'role_playing': ExtractionStrategy.ROLE_PLAY,
     }
     prompt_strategy = prompt_strategy_map.get(
         args.prompt_strategy,
-        PromptConstructionStrategy.CHAT_TEMPLATE
+        ExtractionStrategy.CHAT_LAST
     )
     # Try to load activations from checkpoint
@@ -354,7 +353,7 @@ def execute_train_unified_goodness(args):
         negative_activations = activations_checkpoint['negative_activations']
         print(f"   ✓ Loaded activations from checkpoint ({len(positive_activations[layers[0]])} pairs)")
     else:
-        collector = ActivationCollector(model=model, store_device="cpu")
+        collector = ActivationCollector(model=model)
         # Collect activations for all training pairs using batched processing
         positive_activations = {layer: [] for layer in layers}

wisent 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl

wisent 0.7.379__py3-none-any.whl → 0.7.901__py3-none-any.whl