wisent 0.1.1__py3-none-any.whl → 0.5.1__py3-none-any.whl
- wisent/__init__.py +1 -8
- wisent/benchmarks/__init__.py +0 -0
- wisent/benchmarks/coding/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
- wisent/benchmarks/coding/metrics/evaluator.py +275 -0
- wisent/benchmarks/coding/metrics/passk.py +66 -0
- wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
- wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
- wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
- wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
- wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
- wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
- wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
- wisent/benchmarks/coding/providers/__init__.py +18 -0
- wisent/benchmarks/coding/providers/core/__init__.py +0 -0
- wisent/benchmarks/coding/providers/core/atoms.py +31 -0
- wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
- wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
- wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
- wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
- wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
- wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
- wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
- wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
- wisent/classifiers/__init__.py +0 -0
- wisent/classifiers/core/__init__.py +0 -0
- wisent/classifiers/core/atoms.py +747 -0
- wisent/classifiers/models/__init__.py +0 -0
- wisent/classifiers/models/logistic.py +29 -0
- wisent/classifiers/models/mlp.py +47 -0
- wisent/cli/__init__.py +0 -0
- wisent/cli/classifiers/__init__.py +0 -0
- wisent/cli/classifiers/classifier_rotator.py +137 -0
- wisent/cli/cli_logger.py +142 -0
- wisent/cli/data_loaders/__init__.py +0 -0
- wisent/cli/data_loaders/data_loader_rotator.py +96 -0
- wisent/cli/evaluators/__init__.py +0 -0
- wisent/cli/evaluators/evaluator_rotator.py +148 -0
- wisent/cli/steering_methods/__init__.py +0 -0
- wisent/cli/steering_methods/steering_rotator.py +110 -0
- wisent/cli/wisent_cli/__init__.py +0 -0
- wisent/cli/wisent_cli/commands/__init__.py +0 -0
- wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
- wisent/cli/wisent_cli/commands/listing.py +154 -0
- wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
- wisent/cli/wisent_cli/main.py +93 -0
- wisent/cli/wisent_cli/shell.py +80 -0
- wisent/cli/wisent_cli/ui.py +69 -0
- wisent/cli/wisent_cli/util/__init__.py +0 -0
- wisent/cli/wisent_cli/util/aggregations.py +43 -0
- wisent/cli/wisent_cli/util/parsing.py +126 -0
- wisent/cli/wisent_cli/version.py +4 -0
- wisent/core/__init__.py +27 -0
- wisent/core/activations/__init__.py +0 -0
- wisent/core/activations/activations_collector.py +338 -0
- wisent/core/activations/core/__init__.py +0 -0
- wisent/core/activations/core/atoms.py +216 -0
- wisent/core/agent/__init__.py +18 -0
- wisent/core/agent/budget.py +638 -0
- wisent/core/agent/device_benchmarks.py +685 -0
- wisent/core/agent/diagnose/__init__.py +55 -0
- wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
- wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
- wisent/core/agent/diagnose/create_classifier.py +1154 -0
- wisent/core/agent/diagnose/response_diagnostics.py +268 -0
- wisent/core/agent/diagnose/select_classifiers.py +506 -0
- wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
- wisent/core/agent/diagnose/tasks/__init__.py +33 -0
- wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
- wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
- wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
- wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
- wisent/core/agent/diagnose.py +242 -0
- wisent/core/agent/steer.py +212 -0
- wisent/core/agent/timeout.py +134 -0
- wisent/core/autonomous_agent.py +1234 -0
- wisent/core/bigcode_integration.py +583 -0
- wisent/core/contrastive_pairs/__init__.py +15 -0
- wisent/core/contrastive_pairs/core/__init__.py +0 -0
- wisent/core/contrastive_pairs/core/atoms.py +45 -0
- wisent/core/contrastive_pairs/core/buliders.py +59 -0
- wisent/core/contrastive_pairs/core/pair.py +178 -0
- wisent/core/contrastive_pairs/core/response.py +152 -0
- wisent/core/contrastive_pairs/core/serialization.py +300 -0
- wisent/core/contrastive_pairs/core/set.py +133 -0
- wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
- wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
- wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
- wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
- wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
- wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
- wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
- wisent/core/data_loaders/__init__.py +0 -0
- wisent/core/data_loaders/core/__init__.py +0 -0
- wisent/core/data_loaders/core/atoms.py +98 -0
- wisent/core/data_loaders/loaders/__init__.py +0 -0
- wisent/core/data_loaders/loaders/custom.py +120 -0
- wisent/core/data_loaders/loaders/lm_loader.py +218 -0
- wisent/core/detection_handling.py +257 -0
- wisent/core/download_full_benchmarks.py +1386 -0
- wisent/core/evaluators/__init__.py +0 -0
- wisent/core/evaluators/oracles/__init__.py +0 -0
- wisent/core/evaluators/oracles/interactive.py +73 -0
- wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
- wisent/core/evaluators/oracles/user_specified.py +67 -0
- wisent/core/hyperparameter_optimizer.py +429 -0
- wisent/core/lm_eval_harness_ground_truth.py +1396 -0
- wisent/core/log_likelihoods_evaluator.py +321 -0
- wisent/core/managed_cached_benchmarks.py +595 -0
- wisent/core/mixed_benchmark_sampler.py +364 -0
- wisent/core/model_config_manager.py +330 -0
- wisent/core/model_persistence.py +317 -0
- wisent/core/models/__init__.py +0 -0
- wisent/core/models/core/__init__.py +0 -0
- wisent/core/models/core/atoms.py +460 -0
- wisent/core/models/wisent_model.py +727 -0
- wisent/core/multi_steering.py +316 -0
- wisent/core/optuna/__init__.py +57 -0
- wisent/core/optuna/classifier/__init__.py +25 -0
- wisent/core/optuna/classifier/activation_generator.py +349 -0
- wisent/core/optuna/classifier/classifier_cache.py +509 -0
- wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
- wisent/core/optuna/steering/__init__.py +0 -0
- wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
- wisent/core/optuna/steering/data_utils.py +342 -0
- wisent/core/optuna/steering/metrics.py +474 -0
- wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
- wisent/core/optuna/steering/steering_optimization.py +1111 -0
- wisent/core/parser.py +1668 -0
- wisent/core/prompts/__init__.py +0 -0
- wisent/core/prompts/core/__init__.py +0 -0
- wisent/core/prompts/core/atom.py +57 -0
- wisent/core/prompts/core/prompt_formater.py +157 -0
- wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
- wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
- wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
- wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
- wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
- wisent/core/representation.py +5 -0
- wisent/core/sample_size_optimizer.py +648 -0
- wisent/core/sample_size_optimizer_v2.py +355 -0
- wisent/core/save_results.py +277 -0
- wisent/core/steering.py +652 -0
- wisent/core/steering_method.py +26 -0
- wisent/core/steering_methods/__init__.py +0 -0
- wisent/core/steering_methods/core/__init__.py +0 -0
- wisent/core/steering_methods/core/atoms.py +153 -0
- wisent/core/steering_methods/methods/__init__.py +0 -0
- wisent/core/steering_methods/methods/caa.py +44 -0
- wisent/core/steering_optimizer.py +1297 -0
- wisent/core/task_interface.py +132 -0
- wisent/core/task_selector.py +189 -0
- wisent/core/tasks/__init__.py +175 -0
- wisent/core/tasks/aime_task.py +141 -0
- wisent/core/tasks/file_task.py +211 -0
- wisent/core/tasks/hle_task.py +180 -0
- wisent/core/tasks/hmmt_task.py +119 -0
- wisent/core/tasks/livecodebench_task.py +201 -0
- wisent/core/tasks/livemathbench_task.py +158 -0
- wisent/core/tasks/lm_eval_task.py +455 -0
- wisent/core/tasks/math500_task.py +84 -0
- wisent/core/tasks/polymath_task.py +146 -0
- wisent/core/tasks/supergpqa_task.py +220 -0
- wisent/core/time_estimator.py +149 -0
- wisent/core/timing_calibration.py +174 -0
- wisent/core/tracking/__init__.py +54 -0
- wisent/core/tracking/latency.py +618 -0
- wisent/core/tracking/memory.py +359 -0
- wisent/core/trainers/__init__.py +0 -0
- wisent/core/trainers/core/__init__.py +11 -0
- wisent/core/trainers/core/atoms.py +45 -0
- wisent/core/trainers/steering_trainer.py +271 -0
- wisent/core/user_model_config.py +158 -0
- wisent/opti/__init__.py +0 -0
- wisent/opti/core/__init__.py +0 -0
- wisent/opti/core/atoms.py +175 -0
- wisent/opti/methods/__init__.py +0 -0
- wisent/opti/methods/opti_classificator.py +172 -0
- wisent/opti/methods/opti_steering.py +138 -0
- wisent/synthetic/__init__.py +0 -0
- wisent/synthetic/cleaners/__init__.py +0 -0
- wisent/synthetic/cleaners/core/__init__.py +0 -0
- wisent/synthetic/cleaners/core/atoms.py +58 -0
- wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
- wisent/synthetic/cleaners/methods/__init__.py +0 -0
- wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
- wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
- wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
- wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
- wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
- wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
- wisent/synthetic/db_instructions/__init__.py +0 -0
- wisent/synthetic/db_instructions/core/__init__.py +0 -0
- wisent/synthetic/db_instructions/core/atoms.py +25 -0
- wisent/synthetic/db_instructions/mini_dp.py +37 -0
- wisent/synthetic/generators/__init__.py +0 -0
- wisent/synthetic/generators/core/__init__.py +0 -0
- wisent/synthetic/generators/core/atoms.py +73 -0
- wisent/synthetic/generators/diversities/__init__.py +0 -0
- wisent/synthetic/generators/diversities/core/__init__.py +0 -0
- wisent/synthetic/generators/diversities/core/core.py +68 -0
- wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
- wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
- wisent/synthetic/generators/pairs_generator.py +179 -0
- wisent-0.5.1.dist-info/METADATA +67 -0
- wisent-0.5.1.dist-info/RECORD +218 -0
- {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/WHEEL +1 -1
- {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info/licenses}/LICENSE +2 -2
- wisent/activations/__init__.py +0 -9
- wisent/activations/client.py +0 -97
- wisent/activations/extractor.py +0 -251
- wisent/activations/models.py +0 -95
- wisent/client.py +0 -45
- wisent/control_vector/__init__.py +0 -9
- wisent/control_vector/client.py +0 -85
- wisent/control_vector/manager.py +0 -168
- wisent/control_vector/models.py +0 -70
- wisent/inference/__init__.py +0 -9
- wisent/inference/client.py +0 -103
- wisent/inference/inferencer.py +0 -250
- wisent/inference/models.py +0 -66
- wisent/utils/__init__.py +0 -3
- wisent/utils/auth.py +0 -30
- wisent/utils/http.py +0 -228
- wisent/version.py +0 -3
- wisent-0.1.1.dist-info/METADATA +0 -142
- wisent-0.1.1.dist-info/RECORD +0 -23
- {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,321 @@
+"""
+Log-Likelihoods Ground Truth Evaluator
+
+This module handles ground truth evaluation for log-likelihoods based tasks,
+typically used for multiple choice questions. Instead of generating text,
+it loads the multiple choice options from lm-eval tasks and runs the classifier
+directly on each choice to evaluate performance against known ground truth.
+"""
+
+import logging
+from typing import Any, Dict, Optional
+
+from wisent_guard.core.activations import ActivationAggregationStrategy, Activations
+from wisent_guard.core.layer import Layer
+
+logger = logging.getLogger(__name__)
+
+
+class LogLikelihoodsEvaluator:
+    """
+    Evaluator for log-likelihoods based ground truth assessment.
+
+    This evaluator loads multiple choice options from lm-eval tasks and runs
+    the classifier on each choice to evaluate performance against known ground truth.
+    No text generation is performed - only direct classification evaluation.
+    """
+
+    def __init__(self, task_name: Optional[str] = None, model=None):
+        """
+        Initialize the log-likelihoods evaluator.
+
+        Args:
+            task_name: Name of the task (e.g., "truthfulqa_mc1", "mmlu", etc.)
+            model: The model instance used to extract activations
+        """
+        self.task_name = task_name
+        self.model = model
+
+    def evaluate_classifier_on_task(
+        self,
+        classifier,
+        task_name: str,
+        num_samples: int = 100,
+        model=None,
+        layer: int = 15,
+        token_aggregation: str = "average",
+    ) -> Dict[str, Any]:
+        """
+        Evaluate a classifier on a log-likelihoods task by running it on multiple choice options.
+
+        Args:
+            classifier: The classifier to evaluate
+            task_name: Name of the lm-eval task
+            num_samples: Number of samples to evaluate (default: 100)
+            model: The model instance (overrides self.model if provided)
+            layer: Layer to extract activations from (default: 15)
+            token_aggregation: Token aggregation method ("average", "final", "first", "max", "min")
+
+        Returns:
+            Dict containing evaluation results
+        """
+        try:
+            # Use provided model or fall back to self.model
+            evaluation_model = model or self.model
+            if evaluation_model is None:
+                return self._error_result("No model provided for activation extraction")
+
+            logger.info(f"Loading task data for {task_name}...")
+
+            # Use existing task loading infrastructure
+            task_data = evaluation_model.load_lm_eval_task(task_name, shots=0, limit=num_samples)
+            docs, _ = evaluation_model.split_task_data(task_data, split_ratio=1.0)  # Use all for evaluation
+
+            if not docs:
+                return self._error_result(f"No documents retrieved from task: {task_name}")
+
+            logger.info(f"Retrieved {len(docs)} documents from {task_name}")
+
+            # Use existing QA extraction infrastructure (task-agnostic)
+            from .contrastive_pairs.contrastive_pair_set import ContrastivePairSet
+
+            qa_pairs = ContrastivePairSet.extract_qa_pairs_from_task_docs(task_name, task_data, docs)
+
+            if not qa_pairs:
+                return self._error_result(f"No QA pairs could be extracted from task: {task_name}")
+
+            logger.info(f"Extracted {len(qa_pairs)} QA pairs from {task_name}")
+
+            # Use existing contrastive pair creation infrastructure
+            from wisent_guard.core.activations.activation_collection_method import (
+                ActivationCollectionLogic,
+            )
+            from wisent_guard.core.activations.prompts import PromptConstructionStrategy
+
+            collector = ActivationCollectionLogic(model=evaluation_model)
+
+            # For evaluation, use DIRECT_COMPLETION instead of MULTIPLE_CHOICE
+            # This creates prompts like "Q" -> "good_resp"/"bad_resp" instead of "Which is better: Q A. bad B. good"
+            logger.info("🔍 EVALUATION MODE: Using DIRECT_COMPLETION prompt strategy instead of MULTIPLE_CHOICE")
+            contrastive_pairs = collector.create_batch_contrastive_pairs(
+                qa_pairs, prompt_strategy=PromptConstructionStrategy.DIRECT_COMPLETION
+            )
+
+            if not contrastive_pairs:
+                return self._error_result("No contrastive pairs could be created from QA pairs")
+
+            logger.info(f"Created {len(contrastive_pairs)} contrastive pairs")
+
+            # Map token aggregation to token targeting strategy for evaluation
+            targeting_strategy_mapping = {  # TODO Refactor - we should stay with one standard
+                "average": ActivationAggregationStrategy.MEAN_POOLING,
+                "final": ActivationAggregationStrategy.LAST_TOKEN,
+                "first": ActivationAggregationStrategy.FIRST_TOKEN,
+                "max": ActivationAggregationStrategy.MAX_POOLING,
+                "min": ActivationAggregationStrategy.MEAN_POOLING,  # Fallback to mean
+            }
+
+            targeting_strategy = targeting_strategy_mapping.get(
+                token_aggregation, ActivationAggregationStrategy.MEAN_POOLING
+            )
+
+            logger.info(
+                f"🔍 EVALUATION MODE: Using {targeting_strategy.value} targeting strategy (from token_aggregation: {token_aggregation})"
+            )
+            logger.info("🎯 ACTIVATION COLLECTION PARAMS:")
+            logger.info(f"   • Layer: {layer}")
+            logger.info(f"   • Device: {evaluation_model.device}")
+            logger.info(f"   • Token targeting: {targeting_strategy.value}")
+            logger.info(f"   • Pairs count: {len(contrastive_pairs)}")
+
+            processed_pairs = collector.collect_activations_batch(
+                pairs=contrastive_pairs,
+                layer_index=layer,
+                device=evaluation_model.device,
+                token_targeting_strategy=targeting_strategy,
+            )
+
+            if not processed_pairs:
+                return self._error_result("No activations could be extracted from contrastive pairs")
+
+            logger.info(f"Extracted activations from {len(processed_pairs)} pairs")
+
+            # Debug: Show where activations are collected from
+            if processed_pairs:
+                sample_pair = processed_pairs[0]
+                logger.info("📍 DETAILED ACTIVATION COLLECTION ANALYSIS:")
+                logger.info(f"   🔧 Sample pair type: {type(sample_pair).__name__}")
+                logger.info(
+                    f"   🔧 Pair attributes: {[attr for attr in dir(sample_pair) if not attr.startswith('_')][:8]}..."
+                )
+
+                if hasattr(sample_pair, "positive_activations") and sample_pair.positive_activations is not None:
+                    logger.info(f"   ✅ Positive activations shape: {sample_pair.positive_activations.shape}")
+                if hasattr(sample_pair, "negative_activations") and sample_pair.negative_activations is not None:
+                    logger.info(f"   ✅ Negative activations shape: {sample_pair.negative_activations.shape}")
+
+                if hasattr(sample_pair, "_prompt_pair") and sample_pair._prompt_pair:
+                    logger.debug(f"   🔸 Positive prompt: {sample_pair._prompt_pair.positive_prompt[:100]}...")
+                    logger.debug(f"   🔸 Negative prompt: {sample_pair._prompt_pair.negative_prompt[:100]}...")
+                    logger.debug(f"   🎯 Target token: {sample_pair._prompt_pair.target_token}")
+                    logger.debug(f"   📊 Prompt strategy: {sample_pair._prompt_strategy.value}")
+                    logger.info(f"   🔍 Token targeting: {targeting_strategy.value} (evaluation mode)")
+                elif hasattr(sample_pair, "prompt") and hasattr(sample_pair, "positive_response"):
+                    logger.debug(f"   🔸 Question prompt: {sample_pair.prompt[:100]}...")
+                    logger.debug(f"   ✅ Positive response: {sample_pair.positive_response[:50]}...")
+                    logger.debug(f"   ❌ Negative response: {sample_pair.negative_response[:50]}...")
+                    logger.debug(
+                        f"   🔍 Token targeting used: {targeting_strategy.value} (from CLI token_aggregation: {token_aggregation})"
+                    )
+                else:
+                    logger.info("   📍 ACTIVATION COLLECTION: Unknown format - investigating...")
+                    logger.info(
+                        f"   🔧 All attributes: {[attr for attr in dir(sample_pair) if not attr.startswith('__')]}"
+                    )
+
+            # Map token aggregation to activation method
+            activation_method = token_aggregation
+            # Handle both string and enum types
+            method_name = activation_method.value if hasattr(activation_method, 'value') else str(activation_method)
+            logger.info(
+                f"🎯 Using activation aggregation method: {method_name} (from token_aggregation: {token_aggregation})"
+            )
+
+            # Evaluate classifier on each sample
+            results = []
+            total_correct = 0
+            total_samples = 0
+
+            for i, pair in enumerate(processed_pairs):
+                try:
+                    sample_result = self._evaluate_classifier_on_sample(
+                        classifier, pair, qa_pairs[i], activation_method
+                    )
+                    results.append(sample_result)
+
+                    if sample_result.get("classifier_correct", False):
+                        total_correct += 1
+                    total_samples += 1
+
+                except Exception as e:
+                    logger.error(f"Error evaluating sample {i}: {e}")
+                    continue
+
+            # Calculate overall metrics
+            accuracy = total_correct / total_samples if total_samples > 0 else 0.0
+
+            return {
+                "ground_truth": "EVALUATED",
+                "method_used": "log-likelihoods-classifier",
+                "confidence": accuracy,
+                "details": f"Evaluated {total_samples} samples with {total_correct} correct predictions",
+                "task_name": task_name,
+                "evaluation_method": "log-likelihoods",
+                "lm_eval_metrics": {
+                    "accuracy": accuracy,
+                    "correct_predictions": total_correct,
+                    "total_samples": total_samples,
+                },
+                "sample_results": results[:10],  # First 10 for debugging
+            }
+
+        except Exception as e:
+            import traceback
+
+            logger.error(f"Error evaluating classifier on task {task_name}: {e}")
+            logger.error(f"Traceback: {traceback.format_exc()}")
+            return self._error_result(f"Evaluation error: {e!s}")
+
+    def _evaluate_classifier_on_sample(
+        self, classifier, processed_pair, qa_pair: Dict[str, Any], activation_method
+    ) -> Dict[str, Any]:
+        """
+        Evaluate the classifier on a single processed contrastive pair.
+
+        Args:
+            classifier: The classifier to evaluate
+            processed_pair: ContrastivePair with activations already extracted
+            qa_pair: Original QA pair data for reference
+            activation_method:
+
+        Returns:
+            Dict containing evaluation results for this sample
+        """
+        try:
+            # Extract activations from the processed pair
+            positive_activations = processed_pair.positive_activations  # B choice (correct)
+            negative_activations = processed_pair.negative_activations  # A choice (incorrect)
+
+            if positive_activations is None or negative_activations is None:
+                return {
+                    "question": qa_pair["question"],
+                    "correct_answer": qa_pair["correct_answer"],
+                    "incorrect_answer": qa_pair["incorrect_answer"],
+                    "classifier_correct": False,
+                    "error": "Missing activations",
+                }
+
+            layer_obj = Layer(index=15, type="transformer")
+
+            # Process positive (correct) choice using CLI token aggregation method
+            positive_act = Activations(
+                tensor=positive_activations, layer=layer_obj, aggregation_strategy=activation_method
+            )
+            positive_features = positive_act.extract_features_for_classifier()
+            positive_prediction = classifier.predict_proba([positive_features.cpu().numpy()])
+            if isinstance(positive_prediction, (list, tuple)) and len(positive_prediction) > 0:
+                positive_prediction = positive_prediction[0]
+
+            # Process negative (incorrect) choice using CLI token aggregation method
+            negative_act = Activations(
+                tensor=negative_activations, layer=layer_obj, aggregation_strategy=activation_method
+            )
+            negative_features = negative_act.extract_features_for_classifier()
+            negative_prediction = classifier.predict_proba([negative_features.cpu().numpy()])
+            if isinstance(negative_prediction, (list, tuple)) and len(negative_prediction) > 0:
+                negative_prediction = negative_prediction[0]
+
+            # Evaluate classifier performance
+            # For truthfulness: lower scores = more truthful/correct
+            # Classifier should give lower score to correct answer, higher to incorrect
+            classifier_correct = positive_prediction < negative_prediction
+
+            # Enhanced logging to show classifier vs normal predictions
+            logger.info("📋 Sample Evaluation:")
+            logger.debug(f"   🔸 Question: {qa_pair['question'][:100]}...")
+            logger.info("   🧠 CLASSIFIER PREDICTION:")
+            logger.debug(f"      ✅ Correct answer score: {float(positive_prediction):.3f} (lower = more truthful)")
+            logger.debug(f"      ❌ Incorrect answer score: {float(negative_prediction):.3f} (higher = less truthful)")
+            logger.debug(
+                f"      🎯 Classifier judgment: {'CORRECT' if classifier_correct else 'INCORRECT'} (positive {'<' if classifier_correct else '>='} negative)"
+            )
+
+            return {
+                "question": qa_pair["question"],
+                "correct_answer": qa_pair["correct_answer"],
+                "incorrect_answer": qa_pair["incorrect_answer"],
+                "positive_prediction": float(positive_prediction),
+                "negative_prediction": float(negative_prediction),
+                "classifier_correct": classifier_correct,
+            }
+
+        except Exception as e:
+            logger.debug(f"Error evaluating sample: {e}")
+            return {
+                "question": qa_pair.get("question", "Unknown"),
+                "correct_answer": qa_pair.get("correct_answer", "Unknown"),
+                "incorrect_answer": qa_pair.get("incorrect_answer", "Unknown"),
+                "classifier_correct": False,
+                "error": str(e),
+            }
+
+    def _error_result(self, error_msg: str) -> Dict[str, Any]:
+        """Return an error result."""
+        return {
+            "ground_truth": "UNKNOWN",
+            "method_used": "log-likelihoods-error",
+            "confidence": 0.0,
+            "details": error_msg,
+            "task_name": self.task_name or "unknown",
+            "evaluation_method": "log-likelihoods",
+            "lm_eval_metrics": {"accuracy": 0.0, "correct_predictions": 0, "total_samples": 0},
+        }
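
The hunk above corresponds to the newly added wisent/core/log_likelihoods_evaluator.py (the only file in the listing with +321 lines). Below is a minimal usage sketch, not taken from the package documentation: only the LogLikelihoodsEvaluator class and the evaluate_classifier_on_task signature come from the diff; the model wrapper and classifier objects are assumptions supplied by the caller (any model exposing load_lm_eval_task, split_task_data, and device, and any classifier exposing predict_proba over activation features).

# Hypothetical usage sketch -- only LogLikelihoodsEvaluator and the
# evaluate_classifier_on_task signature are taken from the diff above;
# `model` and `classifier` are assumed to be supplied by the caller.
from wisent.core.log_likelihoods_evaluator import LogLikelihoodsEvaluator


def evaluate(model, classifier, task="truthfulqa_mc1"):
    # model: a wrapper exposing load_lm_eval_task, split_task_data, and device
    # classifier: any object exposing predict_proba over activation features
    evaluator = LogLikelihoodsEvaluator(task_name=task, model=model)
    results = evaluator.evaluate_classifier_on_task(
        classifier,
        task_name=task,
        num_samples=50,
        layer=15,                     # layer the activations are read from
        token_aggregation="average",  # mapped to MEAN_POOLING internally
    )
    # Accuracy is reported both as "confidence" and under "lm_eval_metrics".
    print(results["details"])
    return results["lm_eval_metrics"]

Note the scoring convention in the new module: a sample counts as correct when the classifier assigns a lower score to the correct answer than to the incorrect one, i.e. lower scores are treated as more truthful.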