wisent 0.1.1__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of wisent might be problematic.

Files changed (237)
  1. wisent/__init__.py +1 -8
  2. wisent/benchmarks/__init__.py +0 -0
  3. wisent/benchmarks/coding/__init__.py +0 -0
  4. wisent/benchmarks/coding/metrics/__init__.py +0 -0
  5. wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
  6. wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
  7. wisent/benchmarks/coding/metrics/evaluator.py +275 -0
  8. wisent/benchmarks/coding/metrics/passk.py +66 -0
  9. wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
  10. wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
  11. wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
  12. wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
  13. wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
  14. wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
  15. wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
  16. wisent/benchmarks/coding/providers/__init__.py +18 -0
  17. wisent/benchmarks/coding/providers/core/__init__.py +0 -0
  18. wisent/benchmarks/coding/providers/core/atoms.py +31 -0
  19. wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
  20. wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
  21. wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
  22. wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
  23. wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
  24. wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
  25. wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
  26. wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
  27. wisent/classifiers/__init__.py +0 -0
  28. wisent/classifiers/core/__init__.py +0 -0
  29. wisent/classifiers/core/atoms.py +747 -0
  30. wisent/classifiers/models/__init__.py +0 -0
  31. wisent/classifiers/models/logistic.py +29 -0
  32. wisent/classifiers/models/mlp.py +47 -0
  33. wisent/cli/__init__.py +0 -0
  34. wisent/cli/classifiers/__init__.py +0 -0
  35. wisent/cli/classifiers/classifier_rotator.py +137 -0
  36. wisent/cli/cli_logger.py +142 -0
  37. wisent/cli/data_loaders/__init__.py +0 -0
  38. wisent/cli/data_loaders/data_loader_rotator.py +96 -0
  39. wisent/cli/evaluators/__init__.py +0 -0
  40. wisent/cli/evaluators/evaluator_rotator.py +148 -0
  41. wisent/cli/steering_methods/__init__.py +0 -0
  42. wisent/cli/steering_methods/steering_rotator.py +110 -0
  43. wisent/cli/wisent_cli/__init__.py +0 -0
  44. wisent/cli/wisent_cli/commands/__init__.py +0 -0
  45. wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
  46. wisent/cli/wisent_cli/commands/listing.py +154 -0
  47. wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
  48. wisent/cli/wisent_cli/main.py +93 -0
  49. wisent/cli/wisent_cli/shell.py +80 -0
  50. wisent/cli/wisent_cli/ui.py +69 -0
  51. wisent/cli/wisent_cli/util/__init__.py +0 -0
  52. wisent/cli/wisent_cli/util/aggregations.py +43 -0
  53. wisent/cli/wisent_cli/util/parsing.py +126 -0
  54. wisent/cli/wisent_cli/version.py +4 -0
  55. wisent/core/__init__.py +27 -0
  56. wisent/core/activations/__init__.py +0 -0
  57. wisent/core/activations/activations_collector.py +338 -0
  58. wisent/core/activations/core/__init__.py +0 -0
  59. wisent/core/activations/core/atoms.py +216 -0
  60. wisent/core/agent/__init__.py +18 -0
  61. wisent/core/agent/budget.py +638 -0
  62. wisent/core/agent/device_benchmarks.py +685 -0
  63. wisent/core/agent/diagnose/__init__.py +55 -0
  64. wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
  65. wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
  66. wisent/core/agent/diagnose/create_classifier.py +1154 -0
  67. wisent/core/agent/diagnose/response_diagnostics.py +268 -0
  68. wisent/core/agent/diagnose/select_classifiers.py +506 -0
  69. wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
  70. wisent/core/agent/diagnose/tasks/__init__.py +33 -0
  71. wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
  72. wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
  73. wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
  74. wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
  75. wisent/core/agent/diagnose.py +242 -0
  76. wisent/core/agent/steer.py +212 -0
  77. wisent/core/agent/timeout.py +134 -0
  78. wisent/core/autonomous_agent.py +1234 -0
  79. wisent/core/bigcode_integration.py +583 -0
  80. wisent/core/contrastive_pairs/__init__.py +15 -0
  81. wisent/core/contrastive_pairs/core/__init__.py +0 -0
  82. wisent/core/contrastive_pairs/core/atoms.py +45 -0
  83. wisent/core/contrastive_pairs/core/buliders.py +59 -0
  84. wisent/core/contrastive_pairs/core/pair.py +178 -0
  85. wisent/core/contrastive_pairs/core/response.py +152 -0
  86. wisent/core/contrastive_pairs/core/serialization.py +300 -0
  87. wisent/core/contrastive_pairs/core/set.py +133 -0
  88. wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
  89. wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
  90. wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
  91. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
  92. wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
  93. wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
  94. wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
  95. wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
  96. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
  102. wisent/core/data_loaders/__init__.py +0 -0
  103. wisent/core/data_loaders/core/__init__.py +0 -0
  104. wisent/core/data_loaders/core/atoms.py +98 -0
  105. wisent/core/data_loaders/loaders/__init__.py +0 -0
  106. wisent/core/data_loaders/loaders/custom.py +120 -0
  107. wisent/core/data_loaders/loaders/lm_loader.py +218 -0
  108. wisent/core/detection_handling.py +257 -0
  109. wisent/core/download_full_benchmarks.py +1386 -0
  110. wisent/core/evaluators/__init__.py +0 -0
  111. wisent/core/evaluators/oracles/__init__.py +0 -0
  112. wisent/core/evaluators/oracles/interactive.py +73 -0
  113. wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
  114. wisent/core/evaluators/oracles/user_specified.py +67 -0
  115. wisent/core/hyperparameter_optimizer.py +429 -0
  116. wisent/core/lm_eval_harness_ground_truth.py +1396 -0
  117. wisent/core/log_likelihoods_evaluator.py +321 -0
  118. wisent/core/managed_cached_benchmarks.py +595 -0
  119. wisent/core/mixed_benchmark_sampler.py +364 -0
  120. wisent/core/model_config_manager.py +330 -0
  121. wisent/core/model_persistence.py +317 -0
  122. wisent/core/models/__init__.py +0 -0
  123. wisent/core/models/core/__init__.py +0 -0
  124. wisent/core/models/core/atoms.py +460 -0
  125. wisent/core/models/wisent_model.py +727 -0
  126. wisent/core/multi_steering.py +316 -0
  127. wisent/core/optuna/__init__.py +57 -0
  128. wisent/core/optuna/classifier/__init__.py +25 -0
  129. wisent/core/optuna/classifier/activation_generator.py +349 -0
  130. wisent/core/optuna/classifier/classifier_cache.py +509 -0
  131. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
  132. wisent/core/optuna/steering/__init__.py +0 -0
  133. wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
  134. wisent/core/optuna/steering/data_utils.py +342 -0
  135. wisent/core/optuna/steering/metrics.py +474 -0
  136. wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
  137. wisent/core/optuna/steering/steering_optimization.py +1111 -0
  138. wisent/core/parser.py +1668 -0
  139. wisent/core/prompts/__init__.py +0 -0
  140. wisent/core/prompts/core/__init__.py +0 -0
  141. wisent/core/prompts/core/atom.py +57 -0
  142. wisent/core/prompts/core/prompt_formater.py +157 -0
  143. wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
  144. wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
  145. wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
  146. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
  147. wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
  148. wisent/core/representation.py +5 -0
  149. wisent/core/sample_size_optimizer.py +648 -0
  150. wisent/core/sample_size_optimizer_v2.py +355 -0
  151. wisent/core/save_results.py +277 -0
  152. wisent/core/steering.py +652 -0
  153. wisent/core/steering_method.py +26 -0
  154. wisent/core/steering_methods/__init__.py +0 -0
  155. wisent/core/steering_methods/core/__init__.py +0 -0
  156. wisent/core/steering_methods/core/atoms.py +153 -0
  157. wisent/core/steering_methods/methods/__init__.py +0 -0
  158. wisent/core/steering_methods/methods/caa.py +44 -0
  159. wisent/core/steering_optimizer.py +1297 -0
  160. wisent/core/task_interface.py +132 -0
  161. wisent/core/task_selector.py +189 -0
  162. wisent/core/tasks/__init__.py +175 -0
  163. wisent/core/tasks/aime_task.py +141 -0
  164. wisent/core/tasks/file_task.py +211 -0
  165. wisent/core/tasks/hle_task.py +180 -0
  166. wisent/core/tasks/hmmt_task.py +119 -0
  167. wisent/core/tasks/livecodebench_task.py +201 -0
  168. wisent/core/tasks/livemathbench_task.py +158 -0
  169. wisent/core/tasks/lm_eval_task.py +455 -0
  170. wisent/core/tasks/math500_task.py +84 -0
  171. wisent/core/tasks/polymath_task.py +146 -0
  172. wisent/core/tasks/supergpqa_task.py +220 -0
  173. wisent/core/time_estimator.py +149 -0
  174. wisent/core/timing_calibration.py +174 -0
  175. wisent/core/tracking/__init__.py +54 -0
  176. wisent/core/tracking/latency.py +618 -0
  177. wisent/core/tracking/memory.py +359 -0
  178. wisent/core/trainers/__init__.py +0 -0
  179. wisent/core/trainers/core/__init__.py +11 -0
  180. wisent/core/trainers/core/atoms.py +45 -0
  181. wisent/core/trainers/steering_trainer.py +271 -0
  182. wisent/core/user_model_config.py +158 -0
  183. wisent/opti/__init__.py +0 -0
  184. wisent/opti/core/__init__.py +0 -0
  185. wisent/opti/core/atoms.py +175 -0
  186. wisent/opti/methods/__init__.py +0 -0
  187. wisent/opti/methods/opti_classificator.py +172 -0
  188. wisent/opti/methods/opti_steering.py +138 -0
  189. wisent/synthetic/__init__.py +0 -0
  190. wisent/synthetic/cleaners/__init__.py +0 -0
  191. wisent/synthetic/cleaners/core/__init__.py +0 -0
  192. wisent/synthetic/cleaners/core/atoms.py +58 -0
  193. wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
  194. wisent/synthetic/cleaners/methods/__init__.py +0 -0
  195. wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
  196. wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
  197. wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
  198. wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
  199. wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
  200. wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
  201. wisent/synthetic/db_instructions/__init__.py +0 -0
  202. wisent/synthetic/db_instructions/core/__init__.py +0 -0
  203. wisent/synthetic/db_instructions/core/atoms.py +25 -0
  204. wisent/synthetic/db_instructions/mini_dp.py +37 -0
  205. wisent/synthetic/generators/__init__.py +0 -0
  206. wisent/synthetic/generators/core/__init__.py +0 -0
  207. wisent/synthetic/generators/core/atoms.py +73 -0
  208. wisent/synthetic/generators/diversities/__init__.py +0 -0
  209. wisent/synthetic/generators/diversities/core/__init__.py +0 -0
  210. wisent/synthetic/generators/diversities/core/core.py +68 -0
  211. wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
  212. wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
  213. wisent/synthetic/generators/pairs_generator.py +179 -0
  214. wisent-0.5.2.dist-info/METADATA +67 -0
  215. wisent-0.5.2.dist-info/RECORD +218 -0
  216. {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info}/WHEEL +1 -1
  217. {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info/licenses}/LICENSE +2 -2
  218. wisent/activations/__init__.py +0 -9
  219. wisent/activations/client.py +0 -97
  220. wisent/activations/extractor.py +0 -251
  221. wisent/activations/models.py +0 -95
  222. wisent/client.py +0 -45
  223. wisent/control_vector/__init__.py +0 -9
  224. wisent/control_vector/client.py +0 -85
  225. wisent/control_vector/manager.py +0 -168
  226. wisent/control_vector/models.py +0 -70
  227. wisent/inference/__init__.py +0 -9
  228. wisent/inference/client.py +0 -103
  229. wisent/inference/inferencer.py +0 -250
  230. wisent/inference/models.py +0 -66
  231. wisent/utils/__init__.py +0 -3
  232. wisent/utils/auth.py +0 -30
  233. wisent/utils/http.py +0 -228
  234. wisent/version.py +0 -3
  235. wisent-0.1.1.dist-info/METADATA +0 -142
  236. wisent-0.1.1.dist-info/RECORD +0 -23
  237. {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info}/top_level.txt +0 -0
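
wisent/core/log_likelihoods_evaluator.py +321 -0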
@@ -0,0 +1,321 @@
+"""
+Log-Likelihoods Ground Truth Evaluator
+
+This module handles ground truth evaluation for log-likelihoods based tasks,
+typically used for multiple choice questions. Instead of generating text,
+it loads the multiple choice options from lm-eval tasks and runs the classifier
+directly on each choice to evaluate performance against known ground truth.
+"""
+
+import logging
+from typing import Any, Dict, Optional
+
+from wisent.core.activations import ActivationAggregationStrategy, Activations
+from wisent.core.layer import Layer
+
+logger = logging.getLogger(__name__)
+
+
+class LogLikelihoodsEvaluator:
+    """
+    Evaluator for log-likelihoods based ground truth assessment.
+
+    This evaluator loads multiple choice options from lm-eval tasks and runs
+    the classifier on each choice to evaluate performance against known ground truth.
+    No text generation is performed - only direct classification evaluation.
+    """
+
+    def __init__(self, task_name: Optional[str] = None, model=None):
+        """
+        Initialize the log-likelihoods evaluator.
+
+        Args:
+            task_name: Name of the task (e.g., "truthfulqa_mc1", "mmlu", etc.)
+            model: The model instance used to extract activations
+        """
+        self.task_name = task_name
+        self.model = model
+
+    def evaluate_classifier_on_task(
+        self,
+        classifier,
+        task_name: str,
+        num_samples: int = 100,
+        model=None,
+        layer: int = 15,
+        token_aggregation: str = "average",
+    ) -> Dict[str, Any]:
+        """
+        Evaluate a classifier on a log-likelihoods task by running it on multiple choice options.
+
+        Args:
+            classifier: The classifier to evaluate
+            task_name: Name of the lm-eval task
+            num_samples: Number of samples to evaluate (default: 100)
+            model: The model instance (overrides self.model if provided)
+            layer: Layer to extract activations from (default: 15)
+            token_aggregation: Token aggregation method ("average", "final", "first", "max", "min")
+
+        Returns:
+            Dict containing evaluation results
+        """
+        try:
+            # Use provided model or fall back to self.model
+            evaluation_model = model or self.model
+            if evaluation_model is None:
+                return self._error_result("No model provided for activation extraction")
+
+            logger.info(f"Loading task data for {task_name}...")
+
+            # Use existing task loading infrastructure
+            task_data = evaluation_model.load_lm_eval_task(task_name, shots=0, limit=num_samples)
+            docs, _ = evaluation_model.split_task_data(task_data, split_ratio=1.0)  # Use all for evaluation
+
+            if not docs:
+                return self._error_result(f"No documents retrieved from task: {task_name}")
+
+            logger.info(f"Retrieved {len(docs)} documents from {task_name}")
+
+            # Use existing QA extraction infrastructure (task-agnostic)
+            from .contrastive_pairs.contrastive_pair_set import ContrastivePairSet
+
+            qa_pairs = ContrastivePairSet.extract_qa_pairs_from_task_docs(task_name, task_data, docs)
+
+            if not qa_pairs:
+                return self._error_result(f"No QA pairs could be extracted from task: {task_name}")
+
+            logger.info(f"Extracted {len(qa_pairs)} QA pairs from {task_name}")
+
+            # Use existing contrastive pair creation infrastructure
+            from wisent.core.activations.activation_collection_method import (
+                ActivationCollectionLogic,
+            )
+            from wisent.core.activations.prompts import PromptConstructionStrategy
+
+            collector = ActivationCollectionLogic(model=evaluation_model)
+
+            # For evaluation, use DIRECT_COMPLETION instead of MULTIPLE_CHOICE
+            # This creates prompts like "Q" -> "good_resp"/"bad_resp" instead of "Which is better: Q A. bad B. good"
+            logger.info("🔍 EVALUATION MODE: Using DIRECT_COMPLETION prompt strategy instead of MULTIPLE_CHOICE")
+            contrastive_pairs = collector.create_batch_contrastive_pairs(
+                qa_pairs, prompt_strategy=PromptConstructionStrategy.DIRECT_COMPLETION
+            )
+
+            if not contrastive_pairs:
+                return self._error_result("No contrastive pairs could be created from QA pairs")
+
+            logger.info(f"Created {len(contrastive_pairs)} contrastive pairs")
+
+            # Map token aggregation to token targeting strategy for evaluation
+            targeting_strategy_mapping = {  # TODO Refactor - we should stay with one standard
+                "average": ActivationAggregationStrategy.MEAN_POOLING,
+                "final": ActivationAggregationStrategy.LAST_TOKEN,
+                "first": ActivationAggregationStrategy.FIRST_TOKEN,
+                "max": ActivationAggregationStrategy.MAX_POOLING,
+                "min": ActivationAggregationStrategy.MEAN_POOLING,  # Fallback to mean
+            }
+
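+            # Note: unknown aggregation names silently fall back to mean pooling.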
+            targeting_strategy = targeting_strategy_mapping.get(
+                token_aggregation, ActivationAggregationStrategy.MEAN_POOLING
+            )
+
+            logger.info(
+                f"🔍 EVALUATION MODE: Using {targeting_strategy.value} targeting strategy (from token_aggregation: {token_aggregation})"
+            )
+            logger.info("🎯 ACTIVATION COLLECTION PARAMS:")
+            logger.info(f" • Layer: {layer}")
+            logger.info(f" • Device: {evaluation_model.device}")
+            logger.info(f" • Token targeting: {targeting_strategy.value}")
+            logger.info(f" • Pairs count: {len(contrastive_pairs)}")
+
+            processed_pairs = collector.collect_activations_batch(
+                pairs=contrastive_pairs,
+                layer_index=layer,
+                device=evaluation_model.device,
+                token_targeting_strategy=targeting_strategy,
+            )
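+            # Each processed pair should now expose positive_activations /
+            # negative_activations tensors collected at the requested layer.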
+
+            if not processed_pairs:
+                return self._error_result("No activations could be extracted from contrastive pairs")
+
+            logger.info(f"Extracted activations from {len(processed_pairs)} pairs")
+
+            # Debug: Show where activations are collected from
+            if processed_pairs:
+                sample_pair = processed_pairs[0]
+                logger.info("📍 DETAILED ACTIVATION COLLECTION ANALYSIS:")
+                logger.info(f" 🔧 Sample pair type: {type(sample_pair).__name__}")
+                logger.info(
+                    f" 🔧 Pair attributes: {[attr for attr in dir(sample_pair) if not attr.startswith('_')][:8]}..."
+                )
+
+                if hasattr(sample_pair, "positive_activations") and sample_pair.positive_activations is not None:
+                    logger.info(f" ✅ Positive activations shape: {sample_pair.positive_activations.shape}")
+                if hasattr(sample_pair, "negative_activations") and sample_pair.negative_activations is not None:
+                    logger.info(f" ✅ Negative activations shape: {sample_pair.negative_activations.shape}")
+
+                if hasattr(sample_pair, "_prompt_pair") and sample_pair._prompt_pair:
+                    logger.debug(f" 🔸 Positive prompt: {sample_pair._prompt_pair.positive_prompt[:100]}...")
+                    logger.debug(f" 🔸 Negative prompt: {sample_pair._prompt_pair.negative_prompt[:100]}...")
+                    logger.debug(f" 🎯 Target token: {sample_pair._prompt_pair.target_token}")
+                    logger.debug(f" 📊 Prompt strategy: {sample_pair._prompt_strategy.value}")
+                    logger.info(f" 🔍 Token targeting: {targeting_strategy.value} (evaluation mode)")
+                elif hasattr(sample_pair, "prompt") and hasattr(sample_pair, "positive_response"):
+                    logger.debug(f" 🔸 Question prompt: {sample_pair.prompt[:100]}...")
+                    logger.debug(f" ✅ Positive response: {sample_pair.positive_response[:50]}...")
+                    logger.debug(f" ❌ Negative response: {sample_pair.negative_response[:50]}...")
+                    logger.debug(
+                        f" 🔍 Token targeting used: {targeting_strategy.value} (from CLI token_aggregation: {token_aggregation})"
+                    )
+                else:
+                    logger.info(" 📍 ACTIVATION COLLECTION: Unknown format - investigating...")
+                    logger.info(
+                        f" 🔧 All attributes: {[attr for attr in dir(sample_pair) if not attr.startswith('__')]}"
+                    )
+
+            # Map token aggregation to activation method
+            activation_method = token_aggregation
+            # Handle both string and enum types
+            method_name = activation_method.value if hasattr(activation_method, 'value') else str(activation_method)
+            logger.info(
+                f"🎯 Using activation aggregation method: {method_name} (from token_aggregation: {token_aggregation})"
+            )
+
+            # Evaluate classifier on each sample
+            results = []
+            total_correct = 0
+            total_samples = 0
+
+            for i, pair in enumerate(processed_pairs):
+                try:
+                    sample_result = self._evaluate_classifier_on_sample(
+                        classifier, pair, qa_pairs[i], activation_method, layer=layer
+                    )
+                    results.append(sample_result)
+
+                    if sample_result.get("classifier_correct", False):
+                        total_correct += 1
+                    total_samples += 1
+
+                except Exception as e:
+                    logger.error(f"Error evaluating sample {i}: {e}")
+                    continue
+
+            # Calculate overall metrics
+            accuracy = total_correct / total_samples if total_samples > 0 else 0.0
+
+            return {
+                "ground_truth": "EVALUATED",
+                "method_used": "log-likelihoods-classifier",
+                "confidence": accuracy,
+                "details": f"Evaluated {total_samples} samples with {total_correct} correct predictions",
+                "task_name": task_name,
+                "evaluation_method": "log-likelihoods",
+                "lm_eval_metrics": {
+                    "accuracy": accuracy,
+                    "correct_predictions": total_correct,
+                    "total_samples": total_samples,
+                },
+                "sample_results": results[:10],  # First 10 for debugging
+            }
+
+        except Exception as e:
+            import traceback
+
+            logger.error(f"Error evaluating classifier on task {task_name}: {e}")
+            logger.error(f"Traceback: {traceback.format_exc()}")
+            return self._error_result(f"Evaluation error: {e!s}")
+
+    def _evaluate_classifier_on_sample(
+        self, classifier, processed_pair, qa_pair: Dict[str, Any], activation_method, layer: int = 15
+    ) -> Dict[str, Any]:
+        """
+        Evaluate the classifier on a single processed contrastive pair.
+
+        Args:
+            classifier: The classifier to evaluate
+            processed_pair: ContrastivePair with activations already extracted
+            qa_pair: Original QA pair data for reference
+            activation_method: Aggregation strategy applied when building Activations for each choice
+            layer: Index of the layer the activations were collected from
+
+        Returns:
+            Dict containing evaluation results for this sample
+        """
+        try:
+            # Extract activations from the processed pair
+            positive_activations = processed_pair.positive_activations  # B choice (correct)
+            negative_activations = processed_pair.negative_activations  # A choice (incorrect)
+
+            if positive_activations is None or negative_activations is None:
+                return {
+                    "question": qa_pair["question"],
+                    "correct_answer": qa_pair["correct_answer"],
+                    "incorrect_answer": qa_pair["incorrect_answer"],
+                    "classifier_correct": False,
+                    "error": "Missing activations",
+                }
+
+            layer_obj = Layer(index=layer, type="transformer")
+
+            # Process positive (correct) choice using CLI token aggregation method
+            positive_act = Activations(
+                tensor=positive_activations, layer=layer_obj, aggregation_strategy=activation_method
+            )
+            positive_features = positive_act.extract_features_for_classifier()
+            positive_prediction = classifier.predict_proba([positive_features.cpu().numpy()])
+            if isinstance(positive_prediction, (list, tuple)) and len(positive_prediction) > 0:
+                positive_prediction = positive_prediction[0]
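+            # predict_proba is assumed to follow the sklearn batch convention
+            # (one score per input row), so the single sample's score is
+            # unwrapped from the returned sequence above.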
+
+            # Process negative (incorrect) choice using CLI token aggregation method
+            negative_act = Activations(
+                tensor=negative_activations, layer=layer_obj, aggregation_strategy=activation_method
+            )
+            negative_features = negative_act.extract_features_for_classifier()
+            negative_prediction = classifier.predict_proba([negative_features.cpu().numpy()])
+            if isinstance(negative_prediction, (list, tuple)) and len(negative_prediction) > 0:
+                negative_prediction = negative_prediction[0]
+
+            # Evaluate classifier performance
+            # For truthfulness: lower scores = more truthful/correct
+            # Classifier should give lower score to correct answer, higher to incorrect
+            classifier_correct = positive_prediction < negative_prediction
+
+            # Enhanced logging to show classifier vs normal predictions
+            logger.info("📋 Sample Evaluation:")
+            logger.debug(f" 🔸 Question: {qa_pair['question'][:100]}...")
+            logger.info(" 🧠 CLASSIFIER PREDICTION:")
+            logger.debug(f" ✅ Correct answer score: {float(positive_prediction):.3f} (lower = more truthful)")
+            logger.debug(f" ❌ Incorrect answer score: {float(negative_prediction):.3f} (higher = less truthful)")
+            logger.debug(
+                f" 🎯 Classifier judgment: {'CORRECT' if classifier_correct else 'INCORRECT'} (positive {'<' if classifier_correct else '>='} negative)"
+            )
+
+            return {
+                "question": qa_pair["question"],
+                "correct_answer": qa_pair["correct_answer"],
+                "incorrect_answer": qa_pair["incorrect_answer"],
+                "positive_prediction": float(positive_prediction),
+                "negative_prediction": float(negative_prediction),
+                "classifier_correct": classifier_correct,
+            }
+
+        except Exception as e:
+            logger.debug(f"Error evaluating sample: {e}")
+            return {
+                "question": qa_pair.get("question", "Unknown"),
+                "correct_answer": qa_pair.get("correct_answer", "Unknown"),
+                "incorrect_answer": qa_pair.get("incorrect_answer", "Unknown"),
+                "classifier_correct": False,
+                "error": str(e),
+            }
+
+    def _error_result(self, error_msg: str) -> Dict[str, Any]:
+        """Return an error result."""
+        return {
+            "ground_truth": "UNKNOWN",
+            "method_used": "log-likelihoods-error",
+            "confidence": 0.0,
+            "details": error_msg,
+            "task_name": self.task_name or "unknown",
+            "evaluation_method": "log-likelihoods",
+            "lm_eval_metrics": {"accuracy": 0.0, "correct_predictions": 0, "total_samples": 0},
+        }
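
A minimal usage sketch for the evaluator added above (illustrative only: my_model stands in for a wisent model wrapper exposing load_lm_eval_task / split_task_data, and my_classifier for a trained classifier exposing predict_proba; neither object is defined in this diff):

from wisent.core.log_likelihoods_evaluator import LogLikelihoodsEvaluator

evaluator = LogLikelihoodsEvaluator(task_name="truthfulqa_mc1", model=my_model)
results = evaluator.evaluate_classifier_on_task(
    classifier=my_classifier,
    task_name="truthfulqa_mc1",
    num_samples=50,
    layer=15,
    token_aggregation="average",
)
# Accuracy here is the fraction of samples where the correct choice scored
# lower (more truthful) than the incorrect one.
print(results["lm_eval_metrics"]["accuracy"])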