wisent 0.1.1__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of wisent might be problematic.

Files changed (237)
  1. wisent/__init__.py +1 -8
  2. wisent/benchmarks/__init__.py +0 -0
  3. wisent/benchmarks/coding/__init__.py +0 -0
  4. wisent/benchmarks/coding/metrics/__init__.py +0 -0
  5. wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
  6. wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
  7. wisent/benchmarks/coding/metrics/evaluator.py +275 -0
  8. wisent/benchmarks/coding/metrics/passk.py +66 -0
  9. wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
  10. wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
  11. wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
  12. wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
  13. wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
  14. wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
  15. wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
  16. wisent/benchmarks/coding/providers/__init__.py +18 -0
  17. wisent/benchmarks/coding/providers/core/__init__.py +0 -0
  18. wisent/benchmarks/coding/providers/core/atoms.py +31 -0
  19. wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
  20. wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
  21. wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
  22. wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
  23. wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
  24. wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
  25. wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
  26. wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
  27. wisent/classifiers/__init__.py +0 -0
  28. wisent/classifiers/core/__init__.py +0 -0
  29. wisent/classifiers/core/atoms.py +747 -0
  30. wisent/classifiers/models/__init__.py +0 -0
  31. wisent/classifiers/models/logistic.py +29 -0
  32. wisent/classifiers/models/mlp.py +47 -0
  33. wisent/cli/__init__.py +0 -0
  34. wisent/cli/classifiers/__init__.py +0 -0
  35. wisent/cli/classifiers/classifier_rotator.py +137 -0
  36. wisent/cli/cli_logger.py +142 -0
  37. wisent/cli/data_loaders/__init__.py +0 -0
  38. wisent/cli/data_loaders/data_loader_rotator.py +96 -0
  39. wisent/cli/evaluators/__init__.py +0 -0
  40. wisent/cli/evaluators/evaluator_rotator.py +148 -0
  41. wisent/cli/steering_methods/__init__.py +0 -0
  42. wisent/cli/steering_methods/steering_rotator.py +110 -0
  43. wisent/cli/wisent_cli/__init__.py +0 -0
  44. wisent/cli/wisent_cli/commands/__init__.py +0 -0
  45. wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
  46. wisent/cli/wisent_cli/commands/listing.py +154 -0
  47. wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
  48. wisent/cli/wisent_cli/main.py +93 -0
  49. wisent/cli/wisent_cli/shell.py +80 -0
  50. wisent/cli/wisent_cli/ui.py +69 -0
  51. wisent/cli/wisent_cli/util/__init__.py +0 -0
  52. wisent/cli/wisent_cli/util/aggregations.py +43 -0
  53. wisent/cli/wisent_cli/util/parsing.py +126 -0
  54. wisent/cli/wisent_cli/version.py +4 -0
  55. wisent/core/__init__.py +27 -0
  56. wisent/core/activations/__init__.py +0 -0
  57. wisent/core/activations/activations_collector.py +338 -0
  58. wisent/core/activations/core/__init__.py +0 -0
  59. wisent/core/activations/core/atoms.py +216 -0
  60. wisent/core/agent/__init__.py +18 -0
  61. wisent/core/agent/budget.py +638 -0
  62. wisent/core/agent/device_benchmarks.py +685 -0
  63. wisent/core/agent/diagnose/__init__.py +55 -0
  64. wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
  65. wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
  66. wisent/core/agent/diagnose/create_classifier.py +1154 -0
  67. wisent/core/agent/diagnose/response_diagnostics.py +268 -0
  68. wisent/core/agent/diagnose/select_classifiers.py +506 -0
  69. wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
  70. wisent/core/agent/diagnose/tasks/__init__.py +33 -0
  71. wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
  72. wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
  73. wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
  74. wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
  75. wisent/core/agent/diagnose.py +242 -0
  76. wisent/core/agent/steer.py +212 -0
  77. wisent/core/agent/timeout.py +134 -0
  78. wisent/core/autonomous_agent.py +1234 -0
  79. wisent/core/bigcode_integration.py +583 -0
  80. wisent/core/contrastive_pairs/__init__.py +15 -0
  81. wisent/core/contrastive_pairs/core/__init__.py +0 -0
  82. wisent/core/contrastive_pairs/core/atoms.py +45 -0
  83. wisent/core/contrastive_pairs/core/buliders.py +59 -0
  84. wisent/core/contrastive_pairs/core/pair.py +178 -0
  85. wisent/core/contrastive_pairs/core/response.py +152 -0
  86. wisent/core/contrastive_pairs/core/serialization.py +300 -0
  87. wisent/core/contrastive_pairs/core/set.py +133 -0
  88. wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
  89. wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
  90. wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
  91. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
  92. wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
  93. wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
  94. wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
  95. wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
  96. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
  102. wisent/core/data_loaders/__init__.py +0 -0
  103. wisent/core/data_loaders/core/__init__.py +0 -0
  104. wisent/core/data_loaders/core/atoms.py +98 -0
  105. wisent/core/data_loaders/loaders/__init__.py +0 -0
  106. wisent/core/data_loaders/loaders/custom.py +120 -0
  107. wisent/core/data_loaders/loaders/lm_loader.py +218 -0
  108. wisent/core/detection_handling.py +257 -0
  109. wisent/core/download_full_benchmarks.py +1386 -0
  110. wisent/core/evaluators/__init__.py +0 -0
  111. wisent/core/evaluators/oracles/__init__.py +0 -0
  112. wisent/core/evaluators/oracles/interactive.py +73 -0
  113. wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
  114. wisent/core/evaluators/oracles/user_specified.py +67 -0
  115. wisent/core/hyperparameter_optimizer.py +429 -0
  116. wisent/core/lm_eval_harness_ground_truth.py +1396 -0
  117. wisent/core/log_likelihoods_evaluator.py +321 -0
  118. wisent/core/managed_cached_benchmarks.py +595 -0
  119. wisent/core/mixed_benchmark_sampler.py +364 -0
  120. wisent/core/model_config_manager.py +330 -0
  121. wisent/core/model_persistence.py +317 -0
  122. wisent/core/models/__init__.py +0 -0
  123. wisent/core/models/core/__init__.py +0 -0
  124. wisent/core/models/core/atoms.py +460 -0
  125. wisent/core/models/wisent_model.py +727 -0
  126. wisent/core/multi_steering.py +316 -0
  127. wisent/core/optuna/__init__.py +57 -0
  128. wisent/core/optuna/classifier/__init__.py +25 -0
  129. wisent/core/optuna/classifier/activation_generator.py +349 -0
  130. wisent/core/optuna/classifier/classifier_cache.py +509 -0
  131. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
  132. wisent/core/optuna/steering/__init__.py +0 -0
  133. wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
  134. wisent/core/optuna/steering/data_utils.py +342 -0
  135. wisent/core/optuna/steering/metrics.py +474 -0
  136. wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
  137. wisent/core/optuna/steering/steering_optimization.py +1111 -0
  138. wisent/core/parser.py +1668 -0
  139. wisent/core/prompts/__init__.py +0 -0
  140. wisent/core/prompts/core/__init__.py +0 -0
  141. wisent/core/prompts/core/atom.py +57 -0
  142. wisent/core/prompts/core/prompt_formater.py +157 -0
  143. wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
  144. wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
  145. wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
  146. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
  147. wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
  148. wisent/core/representation.py +5 -0
  149. wisent/core/sample_size_optimizer.py +648 -0
  150. wisent/core/sample_size_optimizer_v2.py +355 -0
  151. wisent/core/save_results.py +277 -0
  152. wisent/core/steering.py +652 -0
  153. wisent/core/steering_method.py +26 -0
  154. wisent/core/steering_methods/__init__.py +0 -0
  155. wisent/core/steering_methods/core/__init__.py +0 -0
  156. wisent/core/steering_methods/core/atoms.py +153 -0
  157. wisent/core/steering_methods/methods/__init__.py +0 -0
  158. wisent/core/steering_methods/methods/caa.py +44 -0
  159. wisent/core/steering_optimizer.py +1297 -0
  160. wisent/core/task_interface.py +132 -0
  161. wisent/core/task_selector.py +189 -0
  162. wisent/core/tasks/__init__.py +175 -0
  163. wisent/core/tasks/aime_task.py +141 -0
  164. wisent/core/tasks/file_task.py +211 -0
  165. wisent/core/tasks/hle_task.py +180 -0
  166. wisent/core/tasks/hmmt_task.py +119 -0
  167. wisent/core/tasks/livecodebench_task.py +201 -0
  168. wisent/core/tasks/livemathbench_task.py +158 -0
  169. wisent/core/tasks/lm_eval_task.py +455 -0
  170. wisent/core/tasks/math500_task.py +84 -0
  171. wisent/core/tasks/polymath_task.py +146 -0
  172. wisent/core/tasks/supergpqa_task.py +220 -0
  173. wisent/core/time_estimator.py +149 -0
  174. wisent/core/timing_calibration.py +174 -0
  175. wisent/core/tracking/__init__.py +54 -0
  176. wisent/core/tracking/latency.py +618 -0
  177. wisent/core/tracking/memory.py +359 -0
  178. wisent/core/trainers/__init__.py +0 -0
  179. wisent/core/trainers/core/__init__.py +11 -0
  180. wisent/core/trainers/core/atoms.py +45 -0
  181. wisent/core/trainers/steering_trainer.py +271 -0
  182. wisent/core/user_model_config.py +158 -0
  183. wisent/opti/__init__.py +0 -0
  184. wisent/opti/core/__init__.py +0 -0
  185. wisent/opti/core/atoms.py +175 -0
  186. wisent/opti/methods/__init__.py +0 -0
  187. wisent/opti/methods/opti_classificator.py +172 -0
  188. wisent/opti/methods/opti_steering.py +138 -0
  189. wisent/synthetic/__init__.py +0 -0
  190. wisent/synthetic/cleaners/__init__.py +0 -0
  191. wisent/synthetic/cleaners/core/__init__.py +0 -0
  192. wisent/synthetic/cleaners/core/atoms.py +58 -0
  193. wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
  194. wisent/synthetic/cleaners/methods/__init__.py +0 -0
  195. wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
  196. wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
  197. wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
  198. wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
  199. wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
  200. wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
  201. wisent/synthetic/db_instructions/__init__.py +0 -0
  202. wisent/synthetic/db_instructions/core/__init__.py +0 -0
  203. wisent/synthetic/db_instructions/core/atoms.py +25 -0
  204. wisent/synthetic/db_instructions/mini_dp.py +37 -0
  205. wisent/synthetic/generators/__init__.py +0 -0
  206. wisent/synthetic/generators/core/__init__.py +0 -0
  207. wisent/synthetic/generators/core/atoms.py +73 -0
  208. wisent/synthetic/generators/diversities/__init__.py +0 -0
  209. wisent/synthetic/generators/diversities/core/__init__.py +0 -0
  210. wisent/synthetic/generators/diversities/core/core.py +68 -0
  211. wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
  212. wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
  213. wisent/synthetic/generators/pairs_generator.py +179 -0
  214. wisent-0.5.1.dist-info/METADATA +67 -0
  215. wisent-0.5.1.dist-info/RECORD +218 -0
  216. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/WHEEL +1 -1
  217. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info/licenses}/LICENSE +2 -2
  218. wisent/activations/__init__.py +0 -9
  219. wisent/activations/client.py +0 -97
  220. wisent/activations/extractor.py +0 -251
  221. wisent/activations/models.py +0 -95
  222. wisent/client.py +0 -45
  223. wisent/control_vector/__init__.py +0 -9
  224. wisent/control_vector/client.py +0 -85
  225. wisent/control_vector/manager.py +0 -168
  226. wisent/control_vector/models.py +0 -70
  227. wisent/inference/__init__.py +0 -9
  228. wisent/inference/client.py +0 -103
  229. wisent/inference/inferencer.py +0 -250
  230. wisent/inference/models.py +0 -66
  231. wisent/utils/__init__.py +0 -3
  232. wisent/utils/auth.py +0 -30
  233. wisent/utils/http.py +0 -228
  234. wisent/version.py +0 -3
  235. wisent-0.1.1.dist-info/METADATA +0 -142
  236. wisent-0.1.1.dist-info/RECORD +0 -23
  237. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1396 @@
1
+ """
2
+ LM-Eval-Harness Ground Truth Evaluation
3
+
4
+ This module provides ground truth evaluation using the lm-eval-harness framework.
5
+ """
6
+
7
+ import logging
8
+ from typing import Any, Dict
9
+
10
+ from wisent_guard.core.activations import ActivationAggregationStrategy, Activations
11
+ from wisent_guard.core.layer import Layer
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ class LMEvalHarnessGroundTruth:
17
+ """
18
+ Ground truth evaluator using lm-eval-harness tasks.
19
+
20
+ This class orchestrates the evaluation of classifiers on lm-eval-harness tasks
21
+ by routing to appropriate evaluation methods based on the task type.
22
+ """
23
+
24
+ def __init__(self, task_name: str, evaluation_method: str = None, model=None):
25
+ """
26
+ Initialize the LM-eval-harness ground truth evaluator.
27
+
28
+ Args:
29
+ task_name: Name of the lm-eval task
30
+ evaluation_method: Evaluation method ("log-likelihoods", "text-generation", "perplexity", "code-execution")
31
+ model: The model instance for activation extraction
32
+ """
33
+ self.task_name = task_name
34
+ self.evaluation_method = evaluation_method
35
+ self.model = model
36
+
37
+ # Load evaluation method from benchmark configuration if not provided
38
+ if not self.evaluation_method:
39
+ self.evaluation_method = self._get_evaluation_method_for_task(task_name)
40
+
41
+ def evaluate_classifier_on_task(
42
+ self,
43
+ classifier,
44
+ task_name: str,
45
+ num_samples: int = 100,
46
+ model=None,
47
+ layer: int = 15,
48
+ token_aggregation: str = "average",
49
+ ) -> Dict[str, Any]:
50
+ """
51
+ Evaluate a classifier on the specified lm-eval task.
52
+
53
+ Args:
54
+ classifier: The classifier to evaluate
55
+ task_name: Name of the lm-eval task
56
+ num_samples: Number of samples to evaluate
57
+ model: The model instance (overrides self.model if provided)
58
+ layer: Layer to extract activations from
59
+ token_aggregation: Token aggregation method ("average", "final", "first", "max", "min")
60
+
61
+ Returns:
62
+ Dict containing evaluation results
63
+ """
64
+ # Use provided model or fall back to self.model
65
+ evaluation_model = model or self.model
66
+
67
+ # Route to appropriate evaluation method
68
+ if self.evaluation_method == "log-likelihoods":
69
+ return self._evaluate_log_likelihoods(
70
+ classifier, task_name, num_samples, evaluation_model, layer, token_aggregation
71
+ )
72
+ if self.evaluation_method == "text-generation":
73
+ return self._evaluate_text_generation(
74
+ classifier, task_name, num_samples, evaluation_model, layer, token_aggregation
75
+ )
76
+ if self.evaluation_method == "perplexity":
77
+ return self._evaluate_perplexity(
78
+ classifier, task_name, num_samples, evaluation_model, layer, token_aggregation
79
+ )
80
+ if self.evaluation_method == "code-execution":
81
+ return self._evaluate_code_execution(
82
+ classifier, task_name, num_samples, evaluation_model, layer, token_aggregation
83
+ )
84
+ return {
85
+ "ground_truth": "UNKNOWN",
86
+ "method_used": "lm-eval-harness-unsupported",
87
+ "confidence": 0.0,
88
+ "details": f"Unsupported evaluation method: {self.evaluation_method}",
89
+ "task_name": task_name,
90
+ "evaluation_method": self.evaluation_method,
91
+ }
92
+
93
+ def _evaluate_log_likelihoods(
94
+ self, classifier, task_name: str, num_samples: int, model, layer: int, token_aggregation: str = "average"
95
+ ) -> Dict[str, Any]:
96
+ """Evaluate classifier using log-likelihoods approach."""
97
+ try:
98
+ from .log_likelihoods_evaluator import LogLikelihoodsEvaluator
99
+
100
+ # Create evaluator with model
101
+ evaluator = LogLikelihoodsEvaluator(task_name, model=model)
102
+
103
+ # Evaluate classifier
104
+ results = evaluator.evaluate_classifier_on_task(
105
+ classifier,
106
+ task_name,
107
+ num_samples=num_samples,
108
+ model=model,
109
+ layer=layer,
110
+ token_aggregation=token_aggregation,
111
+ )
112
+
113
+ return results
114
+
115
+ except Exception as e:
116
+ logger.error(f"Error in log-likelihoods evaluation: {e}")
117
+ return {
118
+ "ground_truth": "UNKNOWN",
119
+ "method_used": "lm-eval-harness-error",
120
+ "confidence": 0.0,
121
+ "details": f"Log-likelihoods evaluation failed: {e!s}",
122
+ "task_name": task_name,
123
+ "evaluation_method": "log-likelihoods",
124
+ }
125
+
126
+ def _evaluate_text_generation(
127
+ self, classifier, task_name: str, num_samples: int, model, layer: int, token_aggregation: str = "average"
128
+ ) -> Dict[str, Any]:
129
+ """Evaluate classifier using text generation approach."""
130
+ try:
131
+ logger.info(f"🎯 TEXT GENERATION EVALUATION: {task_name}")
132
+
133
+ # TODO In general LMEvalHarness should be rebuilt to be BenchmarkGroundTruth
134
+ # Check if this is a TaskInterface task
135
+ if self._is_task_interface_task(task_name):
136
+ docs, task_data = self._load_task_interface_data(task_name, num_samples)
137
+ else:
138
+ # Use existing lm-eval task loading infrastructure
139
+ task_data = model.load_lm_eval_task(task_name, shots=0, limit=num_samples)
140
+ docs, _ = model.split_task_data(task_data, split_ratio=1.0) # Use all for evaluation
141
+
142
+ if not docs:
143
+ return self._error_result(f"No documents retrieved from task: {task_name}")
144
+
145
+ logger.info(f"📝 Retrieved {len(docs)} documents from {task_name}")
146
+
147
+ # Generate responses using the model
148
+ generated_responses = []
149
+ ground_truth_responses = []
150
+
151
+ for i, doc in enumerate(docs):
152
+ try:
153
+ # Extract question from document
154
+ if hasattr(task_data, "doc_to_text"):
155
+ question = task_data.doc_to_text(doc)
156
+ else:
157
+ question = str(doc.get("question", doc.get("text", "")))
158
+
159
+ # Generate response using model
160
+ logger.debug(f"🔸 Generating response for: {question[:100]}...")
161
+ generated_response, _ = model.generate(
162
+ prompt=question, layer_index=layer, max_new_tokens=150, temperature=0.1
163
+ )
164
+
165
+ # Extract ground truth answer
166
+ # HLE task handling
167
+ if task_name.startswith("hle") or task_name in ["math500", "math", "hendrycks_math"]:
168
+ ground_truth = doc.get("answer", "")
169
+ # AIME task handling
170
+ elif task_name.startswith("aime"):
171
+ ground_truth = str(doc.get("Answer", "") or doc.get("answer", ""))
172
+ # FIXED: For DROP task, use raw document data to preserve structured format
173
+ elif task_name == "drop":
174
+ # Use raw answer field which contains the structured data
175
+ ground_truth = doc.get("answer", {})
176
+ elif hasattr(task_data, "doc_to_target"):
177
+ ground_truth = task_data.doc_to_target(doc)
178
+ else:
179
+ ground_truth = str(doc.get("answer", doc.get("target", "")))
180
+
181
+ generated_responses.append(
182
+ {
183
+ "question": question,
184
+ "generated_response": generated_response,
185
+ "ground_truth": ground_truth,
186
+ "doc": doc,
187
+ }
188
+ )
189
+
190
+ logger.debug(f" 📝 Generated: {generated_response[:100]}...")
191
+ # FIXED: Handle ground_truth as int or string for logging
192
+ gt_str = str(ground_truth)
193
+ logger.debug(f" ✅ Ground truth: {gt_str[:100]}...")
194
+
195
+ except Exception as e:
196
+ logger.error(f"Error generating response for doc {i}: {e}")
197
+ continue
198
+
199
+ # Evaluate using lm-eval-harness metrics
200
+ logger.info(f"🎯 Evaluating {len(generated_responses)} generated responses using lm-eval metrics...")
201
+
202
+ # Use lm-eval-harness's actual evaluation for this task
203
+ evaluation_results = self._evaluate_with_lm_eval_metrics(task_name, generated_responses, task_data)
204
+
205
+ # Now classify the generated responses to see if classifier agrees
206
+ classification_results = []
207
+ for response_data in generated_responses:
208
+ try:
209
+ layer_obj = Layer(index=layer, type="transformer")
210
+
211
+ # Extract activations from generated response
212
+ activation_tensor = model.extract_activations(response_data["generated_response"], layer_obj)
213
+ activation_method = self._map_token_aggregation_to_activation_method(token_aggregation)
214
+
215
+ activation_obj = Activations(
216
+ tensor=activation_tensor, layer=layer_obj, aggregation_strategy=activation_method
217
+ )
218
+
219
+ # Get classifier prediction
220
+ features = activation_obj.extract_features_for_classifier()
221
+
222
+ # Handle different classifier return formats
223
+ try:
224
+ # Try predict_proba first (returns probabilities)
225
+ features_numpy = features.cpu().numpy()
226
+ logger.debug(f"🔧 Features shape: {features_numpy.shape}")
227
+
228
+ prediction_proba = classifier.predict_proba([features_numpy])
229
+ logger.debug(f"🔧 predict_proba returned: {prediction_proba} (type: {type(prediction_proba)})")
230
+
231
+ # Handle different return formats
232
+ if isinstance(prediction_proba, (list, tuple)):
233
+ if len(prediction_proba) > 0:
234
+ prediction = prediction_proba[0]
235
+ logger.debug(f"🔧 Extracted from list/tuple: {prediction} (type: {type(prediction)})")
236
+ else:
237
+ prediction = 0.5 # Default neutral score
238
+ else:
239
+ prediction = prediction_proba
240
+
241
+ # If prediction is array-like, get first element
242
+ if hasattr(prediction, "__len__") and not isinstance(prediction, str):
243
+ if len(prediction) > 0:
244
+ prediction = prediction[0]
245
+ logger.debug(f"🔧 Extracted from array: {prediction} (type: {type(prediction)})")
246
+ else:
247
+ prediction = 0.5
248
+
249
+ # Ensure we have a float
250
+ prediction = float(prediction)
251
+ logger.debug(f"🔧 Final prediction: {prediction}")
252
+
253
+ except Exception as proba_error:
254
+ logger.warning(f"predict_proba failed: {proba_error}, trying predict...")
255
+
256
+ # Fallback to predict method
257
+ try:
258
+ predictions = classifier.predict([features.cpu().numpy()])
259
+ if len(predictions) > 0:
260
+ prediction = float(predictions[0])
261
+ else:
262
+ logger.warning("Classifier predict returned empty array")
263
+ prediction = 0.5
264
+ except Exception as predict_error:
265
+ logger.error(f"Both predict_proba and predict failed: {predict_error}")
266
+ prediction = 0.5 # Default neutral score
267
+
268
+ classification_results.append(
269
+ {
270
+ "question": response_data["question"],
271
+ "generated_response": response_data["generated_response"],
272
+ "classifier_score": prediction,
273
+ "classifier_truthful": prediction < 0.5, # Lower = more truthful
274
+ }
275
+ )
276
+
277
+ logger.debug("📋 Generated Response Classification:")
278
+ logger.debug(f" 🔸 Question: {response_data['question'][:100]}...")
279
+ logger.debug(f" 🧠 Generated: {response_data['generated_response'][:100]}...")
280
+ logger.debug(f" 🎯 Classifier score: {prediction:.3f} (lower = more truthful)")
281
+ logger.debug(f" ✅ Classifier judgment: {'TRUTHFUL' if prediction < 0.5 else 'UNTRUTHFUL'}")
282
+
283
+ except Exception as e:
284
+ import traceback
285
+
286
+ logger.error(f"Error classifying generated response: {e}")
287
+ logger.error(f"Full traceback: {traceback.format_exc()}")
288
+ classification_results.append(
289
+ {
290
+ "question": response_data["question"],
291
+ "generated_response": response_data["generated_response"],
292
+ "classifier_score": 0.5, # Default neutral score
293
+ "classifier_truthful": False,
294
+ "error": str(e),
295
+ }
296
+ )
297
+ continue
298
+
299
+ return {
300
+ "ground_truth": "EVALUATED",
301
+ "method_used": "lm-eval-harness-text-generation",
302
+ "confidence": evaluation_results.get("accuracy", 0.0),
303
+ "details": f"Generated and evaluated {len(generated_responses)} responses using lm-eval metrics",
304
+ "task_name": task_name,
305
+ "evaluation_method": "text-generation",
306
+ "lm_eval_metrics": evaluation_results,
307
+ "classification_results": classification_results,
308
+ "total_samples": len(generated_responses),
309
+ }
310
+
311
+ except Exception as e:
312
+ logger.error(f"Error in text generation evaluation: {e}")
313
+ return self._error_result(f"Text generation evaluation error: {e!s}")
314
+
315
+ def _evaluate_perplexity(
316
+ self, classifier, task_name: str, num_samples: int, model, layer: int, token_aggregation: str = "average"
317
+ ) -> Dict[str, Any]:
318
+ """Evaluate classifier using perplexity approach."""
319
+ try:
320
+ logger.info(f"🎯 PERPLEXITY EVALUATION: {task_name}")
321
+
322
+ # Use existing task loading infrastructure
323
+ task_data = model.load_lm_eval_task(task_name, shots=0, limit=num_samples)
324
+ docs, _ = model.split_task_data(task_data, split_ratio=1.0) # Use all for evaluation
325
+
326
+ if not docs:
327
+ return self._error_result(f"No documents retrieved from task: {task_name}")
328
+
329
+ logger.info(f"📝 Retrieved {len(docs)} documents from {task_name}")
330
+
331
+ # Calculate perplexity scores for different responses
332
+ perplexity_results = []
333
+
334
+ for i, doc in enumerate(docs):
335
+ try:
336
+ # For WikiText and other pure language modeling tasks
337
+ if task_name == "wikitext":
338
+ # Get the full text for perplexity calculation
339
+ text = doc.get("page", doc.get("text", ""))
340
+ if not text:
341
+ logger.warning(f"No text found in WikiText document {i}")
342
+ continue
343
+
344
+ logger.debug(f"🔸 Calculating perplexity for WikiText document {i} ({len(text)} chars)...")
345
+
346
+ # Calculate perplexity on the full text
347
+ perplexity = self._calculate_perplexity(model, text)
348
+
349
+ # Extract activations from the text for classifier
350
+ try:
351
+ layer_obj = Layer(index=layer, type="transformer")
352
+
353
+ # Use a truncated version for activation extraction if text is too long
354
+ activation_text = text[:1000] if len(text) > 1000 else text
355
+ activation_tensor = model.extract_activations(activation_text, layer_obj)
356
+ activation_method = self._map_token_aggregation_to_activation_method(token_aggregation)
357
+
358
+ activation_obj = Activations(
359
+ tensor=activation_tensor, layer=layer_obj, aggregation_strategy=activation_method
360
+ )
361
+
362
+ # Get classifier prediction (only if classifier is provided)
363
+ if classifier is not None:
364
+ features = activation_obj.extract_features_for_classifier()
365
+
366
+ # Handle different classifier return formats
367
+ try:
368
+ prediction_proba = classifier.predict_proba([features.cpu().numpy()])
369
+
370
+ if isinstance(prediction_proba, (list, tuple)) and len(prediction_proba) > 0:
371
+ classification_score = float(prediction_proba[0])
372
+ else:
373
+ classification_score = float(prediction_proba)
374
+
375
+ if hasattr(classification_score, "__len__") and not isinstance(
376
+ classification_score, str
377
+ ):
378
+ classification_score = float(classification_score[0])
379
+
380
+ except Exception as proba_error:
381
+ logger.warning(f"predict_proba failed: {proba_error}, trying predict...")
382
+ try:
383
+ predictions = classifier.predict([features.cpu().numpy()])
384
+ if len(predictions) > 0:
385
+ classification_score = float(predictions[0])
386
+ else:
387
+ logger.warning("Classifier predict returned empty array")
388
+ classification_score = 0.5
389
+ except Exception as predict_error:
390
+ logger.error(f"Both predict_proba and predict failed: {predict_error}")
391
+ classification_score = 0.5
392
+ else:
393
+ # No classifier provided - use default neutral score for perplexity-only evaluation
394
+ classification_score = 0.5
395
+
396
+ except Exception as e:
397
+ logger.error(f"Error classifying WikiText document: {e}")
398
+ classification_score = None
399
+
400
+ result = {
401
+ "document_idx": i,
402
+ "text_preview": text[:200] + "..." if len(text) > 200 else text,
403
+ "text_length": len(text),
404
+ "perplexity": perplexity,
405
+ "classifier_score": classification_score,
406
+ }
407
+
408
+ perplexity_results.append(result)
409
+
410
+ logger.debug("📋 WikiText Perplexity Analysis:")
411
+ logger.debug(f" 📊 Document {i}: {len(text)} chars")
412
+ logger.debug(f" 🎯 Perplexity: {perplexity:.3f}")
413
+ if classification_score is not None:
414
+ logger.debug(f" 🧠 Classifier score: {classification_score:.3f} (lower = more truthful)")
415
+
416
+ continue # Skip the rest of the loop for WikiText
417
+
418
+ # Extract question/prompt and possible completions for other tasks
419
+ if hasattr(task_data, "doc_to_text"):
420
+ prompt = task_data.doc_to_text(doc)
421
+ else:
422
+ prompt = str(doc.get("question", doc.get("text", "")))
423
+
424
+ # For multiple choice tasks, get all choices
425
+ choices = []
426
+ if hasattr(task_data, "doc_to_choice"):
427
+ choices = [
428
+ task_data.doc_to_choice(doc, choice_idx)
429
+ for choice_idx in range(len(doc.get("choices", [])))
430
+ ]
431
+ elif "choices" in doc:
432
+ choices = doc["choices"]
433
+ else:
434
+ # For non-multiple choice, generate a response and calculate its perplexity
435
+ generated_response, _ = model.generate(
436
+ prompt=prompt, layer_index=layer, max_new_tokens=100, temperature=0.1
437
+ )
438
+ choices = [generated_response]
439
+
440
+ logger.debug(f"🔸 Calculating perplexity for: {prompt[:100]}...")
441
+
442
+ # Calculate perplexity for each choice
443
+ choice_perplexities = []
444
+ for choice_idx, choice in enumerate(choices):
445
+ try:
446
+ # Calculate perplexity of the choice given the prompt
447
+ full_text = f"{prompt} {choice}"
448
+ perplexity = self._calculate_perplexity(model, full_text)
449
+
450
+ choice_perplexities.append(
451
+ {"choice_idx": choice_idx, "choice_text": choice, "perplexity": perplexity}
452
+ )
453
+
454
+ logger.debug(f" 📊 Choice {choice_idx}: {choice[:50]}... (perplexity: {perplexity:.3f})")
455
+
456
+ except Exception as e:
457
+ logger.error(f"Error calculating perplexity for choice {choice_idx}: {e}")
458
+ continue
459
+
460
+ # Get ground truth answer index
461
+ ground_truth_idx = None
462
+ if hasattr(task_data, "doc_to_target"):
463
+ ground_truth = task_data.doc_to_target(doc)
464
+ try:
465
+ ground_truth_idx = int(ground_truth)
466
+ except:
467
+ ground_truth_idx = None
468
+ elif "answer" in doc:
469
+ ground_truth_idx = doc["answer"]
470
+
471
+ # Find the choice with lowest perplexity (most likely)
472
+ if choice_perplexities:
473
+ best_choice = min(choice_perplexities, key=lambda x: x["perplexity"])
474
+
475
+ # Classify the best choice using the classifier
476
+ classification_score = None
477
+ try:
478
+ layer_obj = Layer(index=layer, type="transformer")
479
+
480
+ # Extract activations from the best choice
481
+ activation_tensor = model.extract_activations(best_choice["choice_text"], layer_obj)
482
+ activation_method = self._map_token_aggregation_to_activation_method(token_aggregation)
483
+
484
+ activation_obj = Activations(
485
+ tensor=activation_tensor, layer=layer_obj, aggregation_strategy=activation_method
486
+ )
487
+
488
+ # Get classifier prediction
489
+ features = activation_obj.extract_features_for_classifier()
490
+
491
+ # Handle different classifier return formats
492
+ try:
493
+ # Try predict_proba first (returns probabilities)
494
+ prediction_proba = classifier.predict_proba([features.cpu().numpy()])
495
+
496
+ # Handle different return formats
497
+ if isinstance(prediction_proba, (list, tuple)):
498
+ if len(prediction_proba) > 0:
499
+ classification_score = prediction_proba[0]
500
+ else:
501
+ classification_score = 0.5 # Default neutral score
502
+ else:
503
+ classification_score = prediction_proba
504
+
505
+ # If prediction is array-like, get first element
506
+ if hasattr(classification_score, "__len__") and not isinstance(
507
+ classification_score, str
508
+ ):
509
+ if len(classification_score) > 0:
510
+ classification_score = classification_score[0]
511
+ else:
512
+ classification_score = 0.5
513
+
514
+ # Ensure we have a float
515
+ classification_score = float(classification_score)
516
+
517
+ except Exception as proba_error:
518
+ logger.warning(f"predict_proba failed: {proba_error}, trying predict...")
519
+
520
+ # Fallback to predict method
521
+ try:
522
+ predictions = classifier.predict([features.cpu().numpy()])
523
+ if len(predictions) > 0:
524
+ classification_score = float(predictions[0])
525
+ else:
526
+ logger.warning("Classifier predict returned empty array")
527
+ classification_score = 0.5
528
+ except Exception as predict_error:
529
+ logger.error(f"Both predict_proba and predict failed: {predict_error}")
530
+ classification_score = 0.5 # Default neutral score
531
+
532
+ except Exception as e:
533
+ logger.error(f"Error classifying best choice: {e}")
534
+
535
+ result = {
536
+ "question": prompt,
537
+ "choices": choice_perplexities,
538
+ "best_choice_idx": best_choice["choice_idx"],
539
+ "best_choice_text": best_choice["choice_text"],
540
+ "best_choice_perplexity": best_choice["perplexity"],
541
+ "ground_truth_idx": ground_truth_idx,
542
+ "classifier_score": classification_score,
543
+ "perplexity_correct": best_choice["choice_idx"] == ground_truth_idx
544
+ if ground_truth_idx is not None
545
+ else None,
546
+ }
547
+
548
+ perplexity_results.append(result)
549
+
550
+ logger.debug("📋 Perplexity Analysis:")
551
+ logger.debug(f" 🔸 Question: {prompt[:100]}...")
552
+ logger.debug(f" 📊 Best choice (lowest perplexity): {best_choice['choice_text'][:100]}...")
553
+ logger.debug(f" 🎯 Perplexity: {best_choice['perplexity']:.3f}")
554
+ logger.debug(
555
+ f" 🧠 Classifier score: {classification_score:.3f} (lower = more truthful)"
556
+ if classification_score is not None
557
+ else " 🧠 Classifier score: N/A"
558
+ )
559
+ logger.debug(f" ✅ Perplexity correct: {result['perplexity_correct']}")
560
+
561
+ except Exception as e:
562
+ logger.error(f"Error processing doc {i}: {e}")
563
+ continue
564
+
565
+ # Calculate overall metrics
566
+ total_samples = len(perplexity_results)
567
+
568
+ if task_name == "wikitext":
569
+ # For WikiText, we don't have correct/incorrect, just perplexity values
570
+ perplexities = [r["perplexity"] for r in perplexity_results if r["perplexity"] != float("inf")]
571
+ avg_perplexity = sum(perplexities) / len(perplexities) if perplexities else float("inf")
572
+
573
+ # Average classifier score
574
+ classifier_scores = [
575
+ r["classifier_score"] for r in perplexity_results if r["classifier_score"] is not None
576
+ ]
577
+ avg_classifier_score = sum(classifier_scores) / len(classifier_scores) if classifier_scores else None
578
+
579
+ perplexity_accuracy = 1.0 if avg_perplexity < 100 else 0.0 # Arbitrary threshold for "good" perplexity
580
+ correct_perplexity = sum(1 for r in perplexity_results if r["perplexity"] < 100)
581
+ else:
582
+ correct_perplexity = sum(1 for r in perplexity_results if r.get("perplexity_correct") == True)
583
+ perplexity_accuracy = correct_perplexity / total_samples if total_samples > 0 else 0.0
584
+
585
+ # Average classifier score
586
+ classifier_scores = [
587
+ r["classifier_score"] for r in perplexity_results if r["classifier_score"] is not None
588
+ ]
589
+ avg_classifier_score = sum(classifier_scores) / len(classifier_scores) if classifier_scores else None
590
+
591
+ logger.info("📊 PERPLEXITY EVALUATION RESULTS:")
592
+ logger.info(f" • Total samples: {total_samples}")
593
+ if task_name == "wikitext":
594
+ logger.info(f" • Average perplexity: {avg_perplexity:.3f}")
595
+ logger.info(f" • Documents with perplexity < 100: {correct_perplexity}")
596
+ else:
597
+ logger.info(f" • Perplexity accuracy: {perplexity_accuracy:.3f}")
598
+ logger.info(
599
+ f" • Average classifier score: {avg_classifier_score:.3f}"
600
+ if avg_classifier_score is not None
601
+ else " • Average classifier score: N/A"
602
+ )
603
+
604
+ result_dict = {
605
+ "ground_truth": "EVALUATED",
606
+ "method_used": "lm-eval-harness-perplexity",
607
+ "confidence": perplexity_accuracy,
608
+ "details": f"Calculated perplexity for {total_samples} samples",
609
+ "task_name": task_name,
610
+ "evaluation_method": "perplexity",
611
+ "perplexity_accuracy": perplexity_accuracy,
612
+ "average_classifier_score": avg_classifier_score,
613
+ "total_samples": total_samples,
614
+ "correct_perplexity": correct_perplexity,
615
+ "perplexity_results": perplexity_results[:10], # First 10 for debugging
616
+ }
617
+
618
+ if task_name == "wikitext":
619
+ result_dict["average_perplexity"] = avg_perplexity
620
+ result_dict["details"] = (
621
+ f"Calculated perplexity for {total_samples} WikiText documents, avg perplexity: {avg_perplexity:.3f}"
622
+ )
623
+ else:
624
+ result_dict["details"] = (
625
+ f"Calculated perplexity for {total_samples} samples, accuracy: {perplexity_accuracy:.3f}"
626
+ )
627
+
628
+ return result_dict
629
+
630
+ except Exception as e:
631
+ logger.error(f"Error in perplexity evaluation: {e}")
632
+ return self._error_result(f"Perplexity evaluation error: {e!s}")
633
+
634
+ def _get_evaluation_method_for_task(self, task_name: str) -> str:
635
+ """Get the evaluation method for a task from the benchmark configuration."""
636
+ try:
637
+ import json
638
+
639
+ eval_methods_path = "wisent_guard/parameters/benchmarks/benchmark_evaluation_methods.json"
640
+ with open(eval_methods_path) as f:
641
+ benchmark_methods = json.load(f)
642
+ return benchmark_methods.get(task_name, "text-generation")
643
+ except Exception as e:
644
+ logger.debug(f"Could not load benchmark evaluation methods: {e}")
645
+ return "text-generation"
646
+
647
+ def _error_result(self, error_message: str) -> Dict[str, Any]:
648
+ """Return a standardized error result."""
649
+ return {
650
+ "ground_truth": "ERROR",
651
+ "method_used": "lm-eval-harness-error",
652
+ "confidence": 0.0,
653
+ "details": error_message,
654
+ "task_name": self.task_name,
655
+ "evaluation_method": self.evaluation_method,
656
+ }
657
+
658
+ def _map_token_aggregation_to_activation_method(self, token_aggregation: str):
659
+ """Map token aggregation string to activation method."""
660
+
661
+ mapping = {  # TODO This should be refactored; why do we use strings for token aggregation?
662
+ "average": ActivationAggregationStrategy.MEAN_POOLING,
663
+ "mean": ActivationAggregationStrategy.MEAN_POOLING,
664
+ "last": ActivationAggregationStrategy.LAST_TOKEN,
665
+ "max": ActivationAggregationStrategy.MAX_POOLING,
666
+ }
667
+
668
+ return mapping.get(token_aggregation.lower(), ActivationAggregationStrategy.MEAN_POOLING)
669
+
670
+ def _is_task_interface_task(self, task_name: str) -> bool:
671
+ """Check if this is a TaskInterface task (not an lm-eval task)."""
672
+ # List of known TaskInterface tasks
673
+ task_interface_tasks = {
674
+ "hle",
675
+ "hle_exact_match",
676
+ "hle_multiple_choice",
677
+ "livecodebench",
678
+ "math500",
679
+ "math",
680
+ "hendrycks_math",
681
+ "aime",
682
+ "aime2025",
683
+ "aime2024",
684
+ "hmmt",
685
+ "hmmt_feb_2025",
686
+ "polymath",
687
+ "polymath_en_medium",
688
+ "polymath_zh_medium",
689
+ "polymath_en_high",
690
+ "polymath_zh_high",
691
+ "livemathbench",
692
+ "livemathbench_cnmo_en",
693
+ "livemathbench_cnmo_zh",
694
+ }
695
+ return task_name in task_interface_tasks
696
+
697
+ def _load_task_interface_data(self, task_name: str, num_samples: int):
698
+ """Load data from TaskInterface tasks."""
699
+ try:
700
+ from .task_interface import get_task
701
+
702
+ # Get the task instance
703
+ task = get_task(task_name)
704
+
705
+ # Load data
706
+ docs = task.load_data(limit=num_samples)
707
+
708
+ return docs, task
709
+
710
+ except Exception as e:
711
+ logger.error(f"Failed to load TaskInterface task {task_name}: {e}")
712
+ return [], None
713
+
714
+ def _calculate_perplexity(self, model, text: str) -> float:
715
+ """Calculate perplexity of text using the model."""
716
+ try:
717
+ import numpy as np
718
+ import torch
719
+
720
+ # Use the model's prepare_activations method to get outputs
721
+ prepared = model.prepare_activations(text)
722
+ outputs = prepared["outputs"]
723
+ inputs = prepared["inputs"]
724
+
725
+ # Get input IDs
726
+ input_ids = inputs["input_ids"]
727
+
728
+ # Get logits from the outputs
729
+ logits = outputs.logits
730
+
731
+ # Compute log probabilities
732
+ log_probs = torch.log_softmax(logits, dim=-1)
733
+
734
+ # Get log probabilities for actual tokens (shifted for next-token prediction)
735
+ # input_ids shape: [batch_size, sequence_length]
736
+ # logits shape: [batch_size, sequence_length, vocab_size]
737
+ # We need to match targets with predictions
738
+
739
+ if input_ids.shape[1] > 1:
740
+ # Get log probabilities for the target tokens
741
+ target_ids = input_ids[0, 1:] # Skip first token (no prediction for it)
742
+ prediction_logits = log_probs[0, :-1, :] # Skip last prediction (no target for it)
743
+
744
+ # Get log probabilities for actual tokens
745
+ token_log_probs = prediction_logits.gather(dim=-1, index=target_ids.unsqueeze(-1)).squeeze(-1)
746
+
747
+ # Compute average log probability
748
+ avg_log_prob = token_log_probs.mean().item()
749
+
750
+ # Compute perplexity
751
+ perplexity = np.exp(-avg_log_prob)
752
+ else:
753
+ # Single token, cannot compute perplexity
754
+ perplexity = float("inf")
755
+
756
+ return perplexity
757
+
758
+ except Exception as e:
759
+ logger.error(f"Error calculating perplexity: {e}")
760
+ return float("inf")
761
+
762
+ def _evaluate_generic_code_execution(
763
+ self, classifier, task_name: str, num_samples: int, model, layer: int, token_aggregation: str = "average"
764
+ ) -> Dict[str, Any]:
765
+ """Evaluate generic code execution tasks (non-BigCode) like LiveCodeBench."""
766
+ try:
767
+ logger.info(f"🎯 GENERIC CODE EXECUTION EVALUATION: {task_name}")
768
+
769
+ # Get secure code evaluator
770
+ from .secure_code_evaluator import SecureCodeEvaluator
771
+
772
+ secure_evaluator = SecureCodeEvaluator()
773
+
774
+ # Load task data
775
+ task_data = model.load_lm_eval_task(task_name, shots=0, limit=num_samples)
776
+
777
+ if hasattr(task_data, "test_docs"):
778
+ docs = task_data.test_docs()
779
+ else:
780
+ docs, _ = model.split_task_data(task_data, split_ratio=1.0)
781
+
782
+ if not docs:
783
+ return self._error_result(f"No documents retrieved from task: {task_name}")
784
+
785
+ logger.info(f"📝 Retrieved {len(docs)} documents from {task_name}")
786
+
787
+ # Generate code for each sample
788
+ generated_codes = []
789
+ evaluation_results = []
790
+
791
+ for i, doc in enumerate(docs):
792
+ try:
793
+ # Get prompt
794
+ if hasattr(task_data, "doc_to_text"):
795
+ prompt = task_data.doc_to_text(doc)
796
+ else:
797
+ # For LiveCodeBench
798
+ question = doc.get("question_content", doc.get("text", ""))
799
+ starter_code = doc.get("starter_code", "")
800
+ prompt = f"{question}\n\n{starter_code}" if starter_code else question
801
+
802
+ logger.debug(f"📋 Prompt for sample {i + 1}:\n{prompt[:200]}...\n")
803
+
804
+ # Generate code using model
805
+ logger.debug(f"🔸 Generating code for sample {i + 1}/{len(docs)}...")
806
+ generated_code, _ = model.generate(
807
+ prompt=prompt,
808
+ layer_index=layer,
809
+ max_new_tokens=500, # More tokens for code generation
810
+ temperature=0.1,
811
+ )
812
+
813
+ generated_codes.append(generated_code)
814
+ logger.debug(f" 📝 Generated code:\n{generated_code}\n")
815
+
816
+ # Evaluate generated code
817
+ eval_result = secure_evaluator.evaluate_response(task_name, doc, generated_code)
818
+ evaluation_results.append(eval_result)
819
+
820
+ logger.debug(
821
+ f" ✅ Evaluation result: {'PASSED' if eval_result.get('passed', False) else 'FAILED'}"
822
+ )
823
+ if "pass_rate" in eval_result:
824
+ logger.debug(f" 📊 Pass rate: {eval_result['pass_rate']:.2%}")
825
+
826
+ except Exception as e:
827
+ logger.error(f"Error processing sample {i}: {e}")
828
+ generated_codes.append("")
829
+ evaluation_results.append({"passed": False, "error": str(e), "success": False})
830
+
831
+ # Aggregate results
832
+ total_passed = sum(1 for r in evaluation_results if r.get("passed", False))
833
+ accuracy = total_passed / len(evaluation_results) if evaluation_results else 0.0
834
+
835
+ logger.info(
836
+ f"📊 CODE EXECUTION COMPLETED: {total_passed}/{len(evaluation_results)} passed ({accuracy:.2%})"
837
+ )
838
+
839
+ # Clean up Docker resources
840
+ secure_evaluator.cleanup()
841
+
842
+ return {
843
+ "ground_truth": "EVALUATED",
844
+ "method_used": f"generic-code-execution-{task_name}",
845
+ "confidence": accuracy,
846
+ "accuracy": accuracy,
847
+ "details": f"Executed and evaluated {len(generated_codes)} code samples",
848
+ "task_name": task_name,
849
+ "evaluation_method": "code-execution",
850
+ "total_samples": len(generated_codes),
851
+ "passed_samples": total_passed,
852
+ "evaluation_results": evaluation_results,
853
+ }
854
+
855
+ except Exception as e:
856
+ logger.error(f"Error in generic code execution evaluation: {e}")
857
+ import traceback
858
+
859
+ logger.error(f"Traceback: {traceback.format_exc()}")
860
+ return self._error_result(f"Generic code execution evaluation error: {e!s}")
861
+
862
+ def _evaluate_with_lm_eval_metrics(self, task_name: str, response_data: list, task_data) -> Dict[str, Any]:
863
+ """Evaluate responses using task-specific evaluation metrics."""
864
+ try:
865
+ correct = 0
866
+ total = len(response_data)
867
+ evaluation_details = []
868
+
869
+ for response in response_data:
870
+ generated = response["generated_response"]
871
+ ground_truth = response["ground_truth"]
872
+
873
+ # Task-specific evaluation logic
874
+ if task_name == "gsm8k":
875
+ # GSM8K uses exact match on numerical answer
876
+ is_correct = self._evaluate_gsm8k_response(generated, ground_truth)
877
+ elif task_name.startswith("math") or task_name in ["hendrycks_math"]:
878
+ # MATH-500 and related benchmarks use same evaluation as GSM8K (numerical extraction)
879
+ is_correct = self._evaluate_gsm8k_response(generated, ground_truth)
880
+ elif task_name in ["arc_easy", "arc_challenge"]:
881
+ # ARC uses exact match on choice letter/number
882
+ is_correct = self._evaluate_arc_response(generated, ground_truth)
883
+ elif task_name == "hellaswag":
884
+ # HellaSwag uses exact match on choice index
885
+ is_correct = self._evaluate_hellaswag_response(generated, ground_truth)
886
+ elif task_name == "mathqa":
887
+ # MATH_QA uses exact match on choice index (0, 1, 2, 3)
888
+ is_correct = self._evaluate_mathqa_response(generated, ground_truth)
889
+ elif task_name == "drop":
890
+ # DROP uses structured answer format with numbers, spans, and dates
891
+ is_correct = self._evaluate_drop_response(generated, ground_truth)
892
+ elif task_name.startswith("gpqa"):
893
+ # GPQA uses multiple-choice answer extraction (A, B, C, D)
894
+ is_correct = self._evaluate_multiple_choice_response(generated, ground_truth)
895
+ elif task_name.startswith("hle") and "multiple_choice" in task_name:
896
+ # HLE multiple choice uses letter extraction (A, B, C, D, E)
897
+ is_correct = self._evaluate_multiple_choice_response(generated, ground_truth)
898
+ elif task_name.startswith("truthfulqa") or task_name == "truthfulqa_mc1":
899
+ # TruthfulQA uses multiple-choice answer extraction (A, B, C, D)
900
+ is_correct = self._evaluate_multiple_choice_response(generated, ground_truth)
901
+ else:
902
+ # Default: string matching with some flexibility
903
+ is_correct = self._evaluate_default_response(generated, ground_truth)
904
+
905
+ if is_correct:
906
+ correct += 1
907
+
908
+ evaluation_details.append(
909
+ {
910
+ "question": response["question"][:100],
911
+ "generated": generated[-50:],
912
+ "ground_truth": ground_truth,
913
+ "correct": is_correct,
914
+ }
915
+ )
916
+
917
+ logger.debug(f"📊 Evaluation: {response['question'][:50]}...")
918
+ logger.debug(f" Generated: {generated[:50]}...")
919
+ logger.debug(f" Ground Truth: {ground_truth}")
920
+ logger.debug(f" Correct: {is_correct}")
921
+
922
+ accuracy = correct / total if total > 0 else 0.0
923
+
924
+ return {
925
+ "accuracy": accuracy,
926
+ "correct_predictions": correct,
927
+ "total_samples": total,
928
+ "evaluation_details": evaluation_details[:5], # First 5 for debugging
929
+ "task_name": task_name,
930
+ }
931
+
932
+ except Exception as e:
933
+ logger.error(f"Error in metrics evaluation: {e}")
934
+ return {"accuracy": 0.0, "correct_predictions": 0, "total_samples": len(response_data), "error": str(e)}
935
+
936
+ def _evaluate_gsm8k_response(self, generated: str, ground_truth) -> bool:
937
+ """Evaluate GSM8K response using numerical answer extraction."""
938
+ try:
939
+ # Extract numerical answer from generated response
940
+ # GSM8K answers are typically in format "#### 42" or just the number
941
+ generated_answer = self._extract_numerical_answer(generated)
942
+ ground_truth_answer = self._extract_numerical_answer(str(ground_truth))
943
+
944
+ # Compare numerical values
945
+ if generated_answer is not None and ground_truth_answer is not None:
946
+ return abs(generated_answer - ground_truth_answer) < 1e-6
947
+
948
+ # Fallback to string matching
949
+ return generated.strip().lower() == str(ground_truth).strip().lower()
950
+
951
+ except Exception as e:
952
+ logger.error(f"Error evaluating GSM8K response: {e}")
953
+ return False
954
+
955
+ def _extract_numerical_answer(self, text: str) -> float:
956
+ """Extract numerical answer from text."""
957
+ try:
958
+ import re
959
+
960
+ # Look for #### pattern (GSM8K format)
961
+ pattern = r"####\s*([+-]?\d+(?:\.\d+)?)"
962
+ match = re.search(pattern, text)
963
+ if match:
964
+ return float(match.group(1))
965
+
966
+ # Look for last number in text
967
+ numbers = re.findall(r"[+-]?\d+(?:\.\d+)?", text)
968
+ if numbers:
969
+ return float(numbers[-1])
970
+
971
+ return None
972
+
973
+ except Exception as e:
974
+ logger.error(f"Error extracting numerical answer: {e}")
975
+ return None
976
+
977
+ def _evaluate_arc_response(self, generated: str, ground_truth) -> bool:
978
+ """Evaluate ARC response using exact match."""
979
+ try:
980
+ # Normalize responses
981
+ gen_clean = generated.strip().lower()
982
+ gt_clean = str(ground_truth).strip().lower()
983
+
984
+ # Direct match
985
+ if gen_clean == gt_clean:
986
+ return True
987
+
988
+ # Check if generated contains the ground truth
989
+ if gt_clean in gen_clean:
990
+ return True
991
+
992
+ # Check for choice letter/number patterns
993
+ import re
994
+
995
+ gen_match = re.search(r"[abcd]|\d+", gen_clean)
996
+ gt_match = re.search(r"[abcd]|\d+", gt_clean)
997
+
998
+ if gen_match and gt_match:
999
+ return gen_match.group() == gt_match.group()
1000
+
1001
+ return False
1002
+
1003
+ except Exception as e:
1004
+ logger.error(f"Error evaluating ARC response: {e}")
1005
+ return False
1006
+
1007
+ def _evaluate_hellaswag_response(self, generated: str, ground_truth) -> bool:
1008
+ """Evaluate HellaSwag response using exact match."""
1009
+ try:
1010
+ # Normalize and compare
1011
+ gen_clean = generated.strip().lower()
1012
+ gt_clean = str(ground_truth).strip().lower()
1013
+
1014
+ return gen_clean == gt_clean or gt_clean in gen_clean
1015
+
1016
+ except Exception as e:
1017
+ logger.error(f"Error evaluating HellaSwag response: {e}")
1018
+ return False
1019
+
1020
+ def _evaluate_mathqa_response(self, generated: str, ground_truth) -> bool:
1021
+ """Evaluate MATH_QA response using choice matching."""
1022
+ try:
1023
+ import re
1024
+
1025
+ # Ground truth is typically 0, 1, 2, or 3 (choice index)
1026
+ gt_str = str(ground_truth).strip()
1027
+
1028
+ # Look for choice patterns in generated response
1029
+ gen_clean = generated.strip().lower()
1030
+
1031
+ # Direct match with choice index
1032
+ if gt_str in gen_clean:
1033
+ return True
1034
+
1035
+ # Look for choice letter patterns (a=0, b=1, c=2, d=3)
1036
+ choice_map = {"a": "0", "b": "1", "c": "2", "d": "3"}
1037
+ for letter, index in choice_map.items():
1038
+ if index == gt_str and letter in gen_clean:
1039
+ return True
1040
+
1041
+ # Look for explicit choice pattern like "The answer is 1" or "Choice B"
1042
+ choice_patterns = [
1043
+ rf"\b{gt_str}\b", # Exact number match
1044
+ rf"choice\s*{choice_map.get(gt_str, gt_str)}", # "choice 1"
1045
+ rf"answer\s*is\s*{gt_str}", # "answer is 1"
1046
+ rf"option\s*{gt_str}", # "option 1"
1047
+ ]
1048
+
1049
+ for pattern in choice_patterns:
1050
+ if re.search(pattern, gen_clean):
1051
+ return True
1052
+
1053
+ return False
1054
+
1055
+ except Exception as e:
1056
+ logger.error(f"Error evaluating MATH_QA response: {e}")
1057
+ return False
1058
+
1059
+ def _evaluate_drop_response(self, generated: str, ground_truth) -> bool:
1060
+ """Evaluate DROP response using structured answer format."""
1061
+ try:
1062
+ import json
1063
+ import re
1064
+
1065
+ # Parse ground truth if it's a string representation of a dict
1066
+ if isinstance(ground_truth, str):
1067
+ try:
1068
+ # Try to parse as JSON first
1069
+ if ground_truth.startswith("{"):
1070
+ gt_dict = json.loads(ground_truth)
1071
+ else:
1072
+ # Handle malformed string representations
1073
+ return False
1074
+ except:
1075
+ return False
1076
+ elif isinstance(ground_truth, dict):
1077
+ gt_dict = ground_truth
1078
+ else:
1079
+ return False
1080
+
1081
+ gen_clean = generated.strip().lower()
1082
+
1083
+ # Check number field
1084
+ if gt_dict.get("number"):
1085
+ number_str = str(gt_dict["number"]).strip()
1086
+ if number_str:
1087
+ # Direct number match
1088
+ if number_str.lower() in gen_clean:
1089
+ return True
1090
+
1091
+ # Try to extract numbers from generated response
1092
+ gen_numbers = re.findall(r"\b\d+\b", generated)
1093
+ if number_str in gen_numbers:
1094
+ return True
1095
+
1096
+ # Word number matching (e.g., "two" vs "2")
1097
+ number_words = {
1098
+ "0": ["zero", "none"],
1099
+ "1": ["one"],
1100
+ "2": ["two"],
1101
+ "3": ["three"],
1102
+ "4": ["four"],
1103
+ "5": ["five"],
1104
+ "6": ["six"],
1105
+ "7": ["seven"],
1106
+ "8": ["eight"],
1107
+ "9": ["nine"],
1108
+ "10": ["ten"],
1109
+ }
1110
+ if number_str in number_words:
1111
+ for word in number_words[number_str]:
1112
+ if word in gen_clean:
1113
+ return True
1114
+
1115
+ # Check spans field
1116
+ if gt_dict.get("spans"):
1117
+ spans = gt_dict["spans"]
1118
+ if isinstance(spans, list):
1119
+ for span in spans:
1120
+ span_clean = str(span).strip().lower()
1121
+ if span_clean and span_clean in gen_clean:
1122
+ return True
1123
+ elif isinstance(spans, str):
1124
+ span_clean = spans.strip().lower()
1125
+ if span_clean and span_clean in gen_clean:
1126
+ return True
1127
+
1128
+ # Check date field (less common but possible)
1129
+ if gt_dict.get("date"):
1130
+ date_obj = gt_dict["date"]
1131
+ if isinstance(date_obj, dict):
1132
+ # Check individual date components
1133
+ for component in ["day", "month", "year"]:
1134
+ if date_obj.get(component):
1135
+ date_val = str(date_obj[component]).strip().lower()
1136
+ if date_val and date_val in gen_clean:
1137
+ return True
1138
+
1139
+ return False
1140
+
1141
+ except Exception as e:
1142
+ logger.error(f"Error evaluating DROP response: {e}")
1143
+ return False
1144
+
1145
+    def _evaluate_default_response(self, generated: str, ground_truth) -> bool:
+        """Default evaluation using flexible string matching."""
+        try:
+            gen_clean = generated.strip().lower()
+
+            # Handle list ground truth (e.g., COQA format)
+            if isinstance(ground_truth, list):
+                # Check if generated response matches any of the acceptable answers
+                for gt_option in ground_truth:
+                    gt_clean = str(gt_option).strip().lower()
+
+                    # Exact match
+                    if gen_clean == gt_clean:
+                        return True
+
+                    # Contains match
+                    if gt_clean in gen_clean or gen_clean in gt_clean:
+                        return True
+
+                return False
+            # Handle string ground truth
+            gt_clean = str(ground_truth).strip().lower()
+
+            # Exact match
+            if gen_clean == gt_clean:
+                return True
+
+            # Contains match
+            if gt_clean in gen_clean or gen_clean in gt_clean:
+                return True
+
+            return False
+
+        except Exception as e:
+            logger.error(f"Error in default evaluation: {e}")
+            return False
+
+    def _evaluate_multiple_choice_response(self, generated: str, ground_truth) -> bool:
+        """Evaluate multiple choice response by extracting choice letter (A, B, C, D, E)."""
+        import re
+
+        try:
+            # Clean the generated response
+            gen_clean = generated.strip()
+
+            # Convert ground truth to string and extract expected letter
+            gt_str = str(ground_truth).strip()
+            expected_letter = None
+
+            # Extract letter from ground truth (could be "(A)", "A", etc.)
+            gt_match = re.search(r"[ABCDE]", gt_str.upper())
+            if gt_match:
+                expected_letter = gt_match.group()
+            else:
+                return False
+
+            # Try multiple strict patterns to extract answer from generated response
+            # These patterns require clear context indicating an intentional choice
+            patterns = [
+                # Fixed pattern to avoid matching 'A' in "Answer:" alone
+                r"(?:answer|choice|option)\s*(?:is\s+|:\s*)(?:\()?([ABCDE])(?:\))?",  # "Answer: A" or "Answer is (B)" - requires letter after
+                r"the\s+(?:correct\s+)?answer\s+is\s*(?:\()?([ABCDE])(?:\))?",  # "The answer is A" - requires "the answer is"
+                r"(?:select|choose)\s+(?:\()?([ABCDE])(?:\))?",  # "Select A" or "Choose A" - requires the action word
+                r"(?:^|\n)([ABCDE])(?:\s*$)",  # Letter at start of line followed by whitespace/end only
+                r"^([ABCDE])[.,;!?)\s]*$",  # Just the letter with optional punctuation and whitespace
+                r"^(?:\()?([ABCDE])(?:\))?\s*$",  # Just the letter with optional parentheses
+            ]
+
+            # Try each pattern - only accept clear, intentional responses
+            for pattern in patterns:
+                matches = re.finditer(pattern, gen_clean.upper(), re.IGNORECASE | re.MULTILINE)
+                for match in matches:
+                    extracted_letter = match.group(1).upper()
+                    if extracted_letter == expected_letter:
+                        return True
+
+            # No more fallback - if we can't clearly identify the choice, it's wrong
+            return False
+
+        except Exception as e:
+            logger.error(f"Error evaluating multiple choice response: {e}")
+            return False
+
+    def _evaluate_code_execution(
+        self, classifier, task_name: str, num_samples: int, model, layer: int, token_aggregation: str = "average"
+    ) -> Dict[str, Any]:
+        """Evaluate classifier using code execution approach for BigCode tasks."""
+        try:
+            logger.debug(f"🎯 CODE EXECUTION EVALUATION: {task_name}")
+
+            # Check if it's a BigCode task
+            from .bigcode_integration import get_bigcode_evaluator, is_bigcode_task, load_bigcode_task
+            from .secure_code_evaluator import SecureCodeEvaluator
+
+            if not is_bigcode_task(task_name):
+                # Check if it's still a code execution task (like LiveCodeBench)
+                if SecureCodeEvaluator.is_code_execution_task(task_name):
+                    logger.info(f"Task {task_name} is a non-BigCode code execution task")
+                    return self._evaluate_generic_code_execution(
+                        classifier, task_name, num_samples, model, layer, token_aggregation
+                    )
+                logger.warning(f"Task {task_name} is not a code execution task, falling back to text generation")
+                return self._evaluate_text_generation(
+                    classifier, task_name, num_samples, model, layer, token_aggregation
+                )
+
+            # Load BigCode task
+            bigcode_task = load_bigcode_task(task_name, limit=num_samples)
+            logger.info(f"📝 Loaded BigCode task {task_name} with {len(bigcode_task)} samples")
+
+            # Generate code for each sample
+            generated_codes = []
+            for i, sample in enumerate(bigcode_task.get_samples()):
+                try:
+                    # Get prompt
+                    prompt = bigcode_task.doc_to_text(sample)
+                    logger.debug(f"📋 Prompt for sample {i + 1}:\n{prompt}\n")
+
+                    # Generate code using model
+                    logger.debug(f"🔸 Generating code for sample {i + 1}/{len(bigcode_task)}...")
+                    generated_code, _ = model.generate(
+                        prompt=prompt,
+                        layer_index=layer,
+                        max_new_tokens=300,  # More tokens for code generation
+                        temperature=0.1,
+                        # Note: stop_sequences not supported by all models
+                    )
+
+                    generated_codes.append(generated_code)
+                    logger.debug(f" 📝 Generated: {generated_code[:100]}...")
+                    logger.debug(f" 📝 Full generated code:\n{generated_code}\n")
+
+                except Exception as e:
+                    logger.error(f"Error generating code for sample {i}: {e}")
+                    generated_codes.append("")  # Empty code for failed generation
+
+            # Evaluate generated code using BigCode evaluator
+            logger.info(f"🎯 Evaluating {len(generated_codes)} generated code samples...")
+
+            # Get Docker executor if available
+            docker_executor = None
+            try:
+                from .docker import OptimizedDockerExecutor
+
+                docker_executor = OptimizedDockerExecutor()
+            except Exception as e:
+                logger.warning(f"Docker executor not available: {e}")
+
+            # Use BigCode evaluator
+            evaluator = get_bigcode_evaluator(docker_executor)
+
+            # Prepare generations in expected format (list of lists)
+            generations_for_eval = [[code] for code in generated_codes]
+
+            # Run evaluation
+            evaluation_results = evaluator.evaluate(
+                bigcode_task,
+                generations_for_eval,
+                k_values=[1],  # Just pass@1 for now
+            )
+
+            # Extract pass rate
+            pass_rate = evaluation_results.get("pass_at_k", {}).get("pass@1", 0.0)
+
+            logger.info(f"✅ Code execution pass@1: {pass_rate:.2%}")
+
+            # Now classify the generated code to see if classifier agrees
+            classification_results = []
+            for i, code in enumerate(generated_codes):
+                try:
+                    layer_obj = Layer(index=layer, type="transformer")
+
+                    # Extract activations from generated code
+                    activation_tensor = model.extract_activations(code, layer_obj)
+                    activation_method = self._map_token_aggregation_to_activation_method(token_aggregation)
+
+                    activation_obj = Activations(
+                        tensor=activation_tensor, layer=layer_obj, aggregation_strategy=activation_method
+                    )
+
+                    # Get classifier prediction
+                    features = activation_obj.extract_features_for_classifier()
+                    features_numpy = features.cpu().numpy()
+
+                    # Get prediction probability
+                    try:
+                        prediction_proba = classifier.predict_proba([features_numpy])
+                        if isinstance(prediction_proba, (list, tuple)) and len(prediction_proba) > 0:
+                            prediction = float(prediction_proba[0])
+                        else:
+                            prediction = float(prediction_proba)
+                    except Exception:
+                        predictions = classifier.predict([features_numpy])
+                        if len(predictions) > 0:
+                            prediction = float(predictions[0])
+                        else:
+                            logger.warning("Classifier predict returned empty array")
+                            prediction = 0.5
+
+                    # Check if code passed tests
+                    code_passed = False
+                    if i < len(evaluation_results.get("execution_results", [])):
+                        sample_results = evaluation_results["execution_results"][i].get("results", [])
+                        if sample_results:
+                            code_passed = sample_results[0].get("passed", False)
+
+                    classification_results.append(
+                        {"classifier_score": prediction, "code_passed": code_passed, "code_snippet": code[:200]}
+                    )
+
+                except Exception as e:
+                    logger.error(f"Error classifying generated code {i}: {e}")
+                    classification_results.append({"classifier_score": 0.5, "code_passed": False, "error": str(e)})
+
+            # Analyze classifier performance
+            correct_predictions = 0
+            for result in classification_results:
+                # Classifier should predict high score (>0.5) for passing code
+                if (result["classifier_score"] > 0.5 and result["code_passed"]) or (
+                    result["classifier_score"] <= 0.5 and not result["code_passed"]
+                ):
+                    correct_predictions += 1
+
+            classifier_accuracy = correct_predictions / len(classification_results) if classification_results else 0.0
+
+            return {
+                "ground_truth": "CODE_EXECUTION",
+                "method_used": "bigcode-evaluation",
+                "confidence": classifier_accuracy,
+                "pass_rate": pass_rate,
+                "classifier_accuracy": classifier_accuracy,
+                "total_samples": len(generated_codes),
+                "passing_samples": int(pass_rate * len(generated_codes)),
+                "details": f"Pass@1: {pass_rate:.2%}, Classifier accuracy: {classifier_accuracy:.2%}",
+                "task_name": task_name,
+                "evaluation_method": "code-execution",
+                "execution_results": evaluation_results,
+            }
+
+        except Exception as e:
+            logger.error(f"Error in code execution evaluation: {e}")
+            import traceback
+
+            traceback.print_exc()
+            return {
+                "ground_truth": "ERROR",
+                "method_used": "code-execution-error",
+                "confidence": 0.0,
+                "details": f"Code execution evaluation failed: {e!s}",
+                "task_name": task_name,
+                "evaluation_method": "code-execution",
+            }
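The classifier accuracy computed near the end of the method above is simply the fraction of samples where a score above 0.5 agrees with the execution verdict. A condensed form of that loop as a sketch; `classifier_agreement` and the sample records are invented for the example:

def classifier_agreement(results) -> float:
    """results: dicts with a float 'classifier_score' and a bool 'code_passed'."""
    if not results:
        return 0.0
    correct = sum(1 for r in results if (r["classifier_score"] > 0.5) == r["code_passed"])
    return correct / len(results)

samples = [
    {"classifier_score": 0.9, "code_passed": True},   # agree
    {"classifier_score": 0.2, "code_passed": False},  # agree
    {"classifier_score": 0.8, "code_passed": False},  # disagree
]
print(classifier_agreement(samples))  # 0.666...

Collapsing the two-branch condition to (score > 0.5) == passed gives the same result, including the score == 0.5 edge case, which the loop above treats as a "fail" prediction.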