wisent 0.1.1__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of wisent might be problematic.

Files changed (237)
  1. wisent/__init__.py +1 -8
  2. wisent/benchmarks/__init__.py +0 -0
  3. wisent/benchmarks/coding/__init__.py +0 -0
  4. wisent/benchmarks/coding/metrics/__init__.py +0 -0
  5. wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
  6. wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
  7. wisent/benchmarks/coding/metrics/evaluator.py +275 -0
  8. wisent/benchmarks/coding/metrics/passk.py +66 -0
  9. wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
  10. wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
  11. wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
  12. wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
  13. wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
  14. wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
  15. wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
  16. wisent/benchmarks/coding/providers/__init__.py +18 -0
  17. wisent/benchmarks/coding/providers/core/__init__.py +0 -0
  18. wisent/benchmarks/coding/providers/core/atoms.py +31 -0
  19. wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
  20. wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
  21. wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
  22. wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
  23. wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
  24. wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
  25. wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
  26. wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
  27. wisent/classifiers/__init__.py +0 -0
  28. wisent/classifiers/core/__init__.py +0 -0
  29. wisent/classifiers/core/atoms.py +747 -0
  30. wisent/classifiers/models/__init__.py +0 -0
  31. wisent/classifiers/models/logistic.py +29 -0
  32. wisent/classifiers/models/mlp.py +47 -0
  33. wisent/cli/__init__.py +0 -0
  34. wisent/cli/classifiers/__init__.py +0 -0
  35. wisent/cli/classifiers/classifier_rotator.py +137 -0
  36. wisent/cli/cli_logger.py +142 -0
  37. wisent/cli/data_loaders/__init__.py +0 -0
  38. wisent/cli/data_loaders/data_loader_rotator.py +96 -0
  39. wisent/cli/evaluators/__init__.py +0 -0
  40. wisent/cli/evaluators/evaluator_rotator.py +148 -0
  41. wisent/cli/steering_methods/__init__.py +0 -0
  42. wisent/cli/steering_methods/steering_rotator.py +110 -0
  43. wisent/cli/wisent_cli/__init__.py +0 -0
  44. wisent/cli/wisent_cli/commands/__init__.py +0 -0
  45. wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
  46. wisent/cli/wisent_cli/commands/listing.py +154 -0
  47. wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
  48. wisent/cli/wisent_cli/main.py +93 -0
  49. wisent/cli/wisent_cli/shell.py +80 -0
  50. wisent/cli/wisent_cli/ui.py +69 -0
  51. wisent/cli/wisent_cli/util/__init__.py +0 -0
  52. wisent/cli/wisent_cli/util/aggregations.py +43 -0
  53. wisent/cli/wisent_cli/util/parsing.py +126 -0
  54. wisent/cli/wisent_cli/version.py +4 -0
  55. wisent/core/__init__.py +27 -0
  56. wisent/core/activations/__init__.py +0 -0
  57. wisent/core/activations/activations_collector.py +338 -0
  58. wisent/core/activations/core/__init__.py +0 -0
  59. wisent/core/activations/core/atoms.py +216 -0
  60. wisent/core/agent/__init__.py +18 -0
  61. wisent/core/agent/budget.py +638 -0
  62. wisent/core/agent/device_benchmarks.py +685 -0
  63. wisent/core/agent/diagnose/__init__.py +55 -0
  64. wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
  65. wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
  66. wisent/core/agent/diagnose/create_classifier.py +1154 -0
  67. wisent/core/agent/diagnose/response_diagnostics.py +268 -0
  68. wisent/core/agent/diagnose/select_classifiers.py +506 -0
  69. wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
  70. wisent/core/agent/diagnose/tasks/__init__.py +33 -0
  71. wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
  72. wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
  73. wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
  74. wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
  75. wisent/core/agent/diagnose.py +242 -0
  76. wisent/core/agent/steer.py +212 -0
  77. wisent/core/agent/timeout.py +134 -0
  78. wisent/core/autonomous_agent.py +1234 -0
  79. wisent/core/bigcode_integration.py +583 -0
  80. wisent/core/contrastive_pairs/__init__.py +15 -0
  81. wisent/core/contrastive_pairs/core/__init__.py +0 -0
  82. wisent/core/contrastive_pairs/core/atoms.py +45 -0
  83. wisent/core/contrastive_pairs/core/buliders.py +59 -0
  84. wisent/core/contrastive_pairs/core/pair.py +178 -0
  85. wisent/core/contrastive_pairs/core/response.py +152 -0
  86. wisent/core/contrastive_pairs/core/serialization.py +300 -0
  87. wisent/core/contrastive_pairs/core/set.py +133 -0
  88. wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
  89. wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
  90. wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
  91. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
  92. wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
  93. wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
  94. wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
  95. wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
  96. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
  102. wisent/core/data_loaders/__init__.py +0 -0
  103. wisent/core/data_loaders/core/__init__.py +0 -0
  104. wisent/core/data_loaders/core/atoms.py +98 -0
  105. wisent/core/data_loaders/loaders/__init__.py +0 -0
  106. wisent/core/data_loaders/loaders/custom.py +120 -0
  107. wisent/core/data_loaders/loaders/lm_loader.py +218 -0
  108. wisent/core/detection_handling.py +257 -0
  109. wisent/core/download_full_benchmarks.py +1386 -0
  110. wisent/core/evaluators/__init__.py +0 -0
  111. wisent/core/evaluators/oracles/__init__.py +0 -0
  112. wisent/core/evaluators/oracles/interactive.py +73 -0
  113. wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
  114. wisent/core/evaluators/oracles/user_specified.py +67 -0
  115. wisent/core/hyperparameter_optimizer.py +429 -0
  116. wisent/core/lm_eval_harness_ground_truth.py +1396 -0
  117. wisent/core/log_likelihoods_evaluator.py +321 -0
  118. wisent/core/managed_cached_benchmarks.py +595 -0
  119. wisent/core/mixed_benchmark_sampler.py +364 -0
  120. wisent/core/model_config_manager.py +330 -0
  121. wisent/core/model_persistence.py +317 -0
  122. wisent/core/models/__init__.py +0 -0
  123. wisent/core/models/core/__init__.py +0 -0
  124. wisent/core/models/core/atoms.py +460 -0
  125. wisent/core/models/wisent_model.py +727 -0
  126. wisent/core/multi_steering.py +316 -0
  127. wisent/core/optuna/__init__.py +57 -0
  128. wisent/core/optuna/classifier/__init__.py +25 -0
  129. wisent/core/optuna/classifier/activation_generator.py +349 -0
  130. wisent/core/optuna/classifier/classifier_cache.py +509 -0
  131. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
  132. wisent/core/optuna/steering/__init__.py +0 -0
  133. wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
  134. wisent/core/optuna/steering/data_utils.py +342 -0
  135. wisent/core/optuna/steering/metrics.py +474 -0
  136. wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
  137. wisent/core/optuna/steering/steering_optimization.py +1111 -0
  138. wisent/core/parser.py +1668 -0
  139. wisent/core/prompts/__init__.py +0 -0
  140. wisent/core/prompts/core/__init__.py +0 -0
  141. wisent/core/prompts/core/atom.py +57 -0
  142. wisent/core/prompts/core/prompt_formater.py +157 -0
  143. wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
  144. wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
  145. wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
  146. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
  147. wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
  148. wisent/core/representation.py +5 -0
  149. wisent/core/sample_size_optimizer.py +648 -0
  150. wisent/core/sample_size_optimizer_v2.py +355 -0
  151. wisent/core/save_results.py +277 -0
  152. wisent/core/steering.py +652 -0
  153. wisent/core/steering_method.py +26 -0
  154. wisent/core/steering_methods/__init__.py +0 -0
  155. wisent/core/steering_methods/core/__init__.py +0 -0
  156. wisent/core/steering_methods/core/atoms.py +153 -0
  157. wisent/core/steering_methods/methods/__init__.py +0 -0
  158. wisent/core/steering_methods/methods/caa.py +44 -0
  159. wisent/core/steering_optimizer.py +1297 -0
  160. wisent/core/task_interface.py +132 -0
  161. wisent/core/task_selector.py +189 -0
  162. wisent/core/tasks/__init__.py +175 -0
  163. wisent/core/tasks/aime_task.py +141 -0
  164. wisent/core/tasks/file_task.py +211 -0
  165. wisent/core/tasks/hle_task.py +180 -0
  166. wisent/core/tasks/hmmt_task.py +119 -0
  167. wisent/core/tasks/livecodebench_task.py +201 -0
  168. wisent/core/tasks/livemathbench_task.py +158 -0
  169. wisent/core/tasks/lm_eval_task.py +455 -0
  170. wisent/core/tasks/math500_task.py +84 -0
  171. wisent/core/tasks/polymath_task.py +146 -0
  172. wisent/core/tasks/supergpqa_task.py +220 -0
  173. wisent/core/time_estimator.py +149 -0
  174. wisent/core/timing_calibration.py +174 -0
  175. wisent/core/tracking/__init__.py +54 -0
  176. wisent/core/tracking/latency.py +618 -0
  177. wisent/core/tracking/memory.py +359 -0
  178. wisent/core/trainers/__init__.py +0 -0
  179. wisent/core/trainers/core/__init__.py +11 -0
  180. wisent/core/trainers/core/atoms.py +45 -0
  181. wisent/core/trainers/steering_trainer.py +271 -0
  182. wisent/core/user_model_config.py +158 -0
  183. wisent/opti/__init__.py +0 -0
  184. wisent/opti/core/__init__.py +0 -0
  185. wisent/opti/core/atoms.py +175 -0
  186. wisent/opti/methods/__init__.py +0 -0
  187. wisent/opti/methods/opti_classificator.py +172 -0
  188. wisent/opti/methods/opti_steering.py +138 -0
  189. wisent/synthetic/__init__.py +0 -0
  190. wisent/synthetic/cleaners/__init__.py +0 -0
  191. wisent/synthetic/cleaners/core/__init__.py +0 -0
  192. wisent/synthetic/cleaners/core/atoms.py +58 -0
  193. wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
  194. wisent/synthetic/cleaners/methods/__init__.py +0 -0
  195. wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
  196. wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
  197. wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
  198. wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
  199. wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
  200. wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
  201. wisent/synthetic/db_instructions/__init__.py +0 -0
  202. wisent/synthetic/db_instructions/core/__init__.py +0 -0
  203. wisent/synthetic/db_instructions/core/atoms.py +25 -0
  204. wisent/synthetic/db_instructions/mini_dp.py +37 -0
  205. wisent/synthetic/generators/__init__.py +0 -0
  206. wisent/synthetic/generators/core/__init__.py +0 -0
  207. wisent/synthetic/generators/core/atoms.py +73 -0
  208. wisent/synthetic/generators/diversities/__init__.py +0 -0
  209. wisent/synthetic/generators/diversities/core/__init__.py +0 -0
  210. wisent/synthetic/generators/diversities/core/core.py +68 -0
  211. wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
  212. wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
  213. wisent/synthetic/generators/pairs_generator.py +179 -0
  214. wisent-0.5.1.dist-info/METADATA +67 -0
  215. wisent-0.5.1.dist-info/RECORD +218 -0
  216. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/WHEEL +1 -1
  217. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info/licenses}/LICENSE +2 -2
  218. wisent/activations/__init__.py +0 -9
  219. wisent/activations/client.py +0 -97
  220. wisent/activations/extractor.py +0 -251
  221. wisent/activations/models.py +0 -95
  222. wisent/client.py +0 -45
  223. wisent/control_vector/__init__.py +0 -9
  224. wisent/control_vector/client.py +0 -85
  225. wisent/control_vector/manager.py +0 -168
  226. wisent/control_vector/models.py +0 -70
  227. wisent/inference/__init__.py +0 -9
  228. wisent/inference/client.py +0 -103
  229. wisent/inference/inferencer.py +0 -250
  230. wisent/inference/models.py +0 -66
  231. wisent/utils/__init__.py +0 -3
  232. wisent/utils/auth.py +0 -30
  233. wisent/utils/http.py +0 -228
  234. wisent/version.py +0 -3
  235. wisent-0.1.1.dist-info/METADATA +0 -142
  236. wisent-0.1.1.dist-info/RECORD +0 -23
  237. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/top_level.txt +0 -0
wisent/core/bigcode_integration.py
@@ -0,0 +1,583 @@
+ """
+ BigCode Evaluation Harness integration for Wisent Guard.
+
+ This module provides integration with bigcode-evaluation-harness for code generation benchmarks.
+ """
+
+ import json
+ import logging
+ import os
+ import re
+ import subprocess
+ import sys
+ import tempfile
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional
+
+ logger = logging.getLogger(__name__)
+
+
+ class BigCodeTaskLoader:
+     """Loads and manages BigCode evaluation tasks."""
+
+     # Mapping of our task names to BigCode task names
+     TASK_MAPPING = {
+         # === DIRECT MATCHES ===
+         "humaneval": "humaneval",
+         "mbpp": "mbpp",
+         "conala": "conala",
+         "concode": "concode",
+         "mercury": "mercury",
+         # === CORRECTED MAPPINGS ===
+         "humaneval_plus": "humanevalplus",
+         "instructhumaneval": "instruct-humaneval",
+         "mbpp_plus": "mbppplus",
+         "apps": "apps-introductory",
+         "ds1000": "ds1000-all-completion",
+         # === MULTI-LANGUAGE TASKS ===
+         "multiple_py": "multiple-py",
+         "multiple_js": "multiple-js",
+         "multiple_java": "multiple-java",
+         "multiple_cpp": "multiple-cpp",
+         "multiple_rs": "multiple-rs",
+         "multiple_go": "multiple-go",
+         # === CODE-TO-TEXT TASKS ===
+         "codexglue_code_to_text_python": "codexglue_code_to_text-python",
+         "codexglue_code_to_text_go": "codexglue_code_to_text-go",
+         "codexglue_code_to_text_java": "codexglue_code_to_text-java",
+         "codexglue_code_to_text_javascript": "codexglue_code_to_text-javascript",
+         "codexglue_code_to_text_php": "codexglue_code_to_text-php",
+         "codexglue_code_to_text_ruby": "codexglue_code_to_text-ruby",
+         # === FIXED PROBLEMATIC MAPPINGS ===
+         "recode": "perturbed-humaneval-natgen-num_seeds_1",
+         "humanevalpack": None,  # ❌ REMOVED - no simple mapping exists, only complex variants
+     }
+
+     def __init__(self):
+         """Initialize the BigCode task loader."""
+         self._bigcode_available = self._check_bigcode_available()
+         self._task_cache = {}
+
+     def _check_bigcode_available(self) -> bool:
+         """Check whether bigcode-evaluation-harness is importable."""
+         try:
+             import bigcode_eval
+
+             return True
+         except ImportError:
+             logger.warning("bigcode-evaluation-harness not installed")
+             return False
+
+     def is_bigcode_task(self, task_name: str) -> bool:
+         """Check if a task is a BigCode task."""
+         return task_name in self.TASK_MAPPING
+
+     def load_task(self, task_name: str, limit: Optional[int] = None) -> "BigCodeTask":
+         """
+         Load a BigCode task.
+
+         Args:
+             task_name: Name of the task (our naming convention)
+             limit: Optional limit on the number of samples
+
+         Returns:
+             BigCodeTask object
+         """
+         if not self._bigcode_available:
+             raise ImportError("bigcode-evaluation-harness not installed. Run: pip install bigcode-evaluation-harness")
+
+         if task_name not in self.TASK_MAPPING:
+             raise ValueError(f"Unknown BigCode task: {task_name}")
+
+         bigcode_task_name = self.TASK_MAPPING[task_name]
+
+         # Handle removed tasks with a None mapping
+         if bigcode_task_name is None:
+             raise ValueError(f"Task '{task_name}' has been removed - no BigCode mapping available")
+
+         # Check the cache
+         cache_key = f"{task_name}:{limit}"
+         if cache_key in self._task_cache:
+             return self._task_cache[cache_key]
+
+         # Create the task object
+         task = BigCodeTask(task_name, bigcode_task_name, limit)
+         self._task_cache[cache_key] = task
+
+         return task
+
+
+ class BigCodeTask:
+     """Represents a BigCode evaluation task."""
+
+     def __init__(self, task_name: str, bigcode_task_name: str, limit: Optional[int] = None):
+         """
+         Initialize a BigCode task.
+
+         Args:
+             task_name: Our task name
+             bigcode_task_name: BigCode's task name
+             limit: Optional limit on samples
+         """
+         self.task_name = task_name
+         self.bigcode_task_name = bigcode_task_name
+         self.limit = limit
+         self._limit = limit  # Also stored as a private attribute
+         self._data = None
+         self._task_obj = None
+         self._load_data()
+
+     def _load_data(self):
+         """Load task data from BigCode."""
+         try:
+             # Import BigCode modules
+             from bigcode_eval.tasks import get_task
+
+             # Get the task
+             task = get_task(self.bigcode_task_name)
+             self._task_obj = task
+
+             # Get the dataset - BigCode uses the get_dataset() method
+             dataset = task.get_dataset()
+
+             # Convert to a list if needed
+             if hasattr(dataset, "__iter__"):
+                 dataset = list(dataset)
+
+             # Apply the limit if specified
+             if self.limit:
+                 dataset = dataset[: self.limit]
+
+             self._data = dataset
+
+         except Exception as e:
+             logger.error(f"Failed to load BigCode task {self.bigcode_task_name}: {e}")
+             # Fall back to loading from files if available
+             self._load_from_files()
+
+     # Methods to match the lm-eval interface
+     def has_validation_docs(self) -> bool:
+         """Check if the task has validation documents."""
+         return False  # BigCode tasks don't have separate validation sets
+
+     def has_test_docs(self) -> bool:
+         """Check if the task has test documents."""
+         return True  # All samples are considered test docs
+
+     def test_docs(self) -> List[Dict[str, Any]]:
+         """Get test documents."""
+         return self.get_samples()
+
+     def validation_docs(self) -> List[Dict[str, Any]]:
+         """Get validation documents."""
+         return []  # No separate validation set
+
+     def doc_to_text(self, doc: Dict[str, Any]) -> str:
+         """Convert a document to a text prompt."""
+         # Handle the different BigCode formats
+         if "prompt" in doc:
+             return doc["prompt"]
+         if "text" in doc:
+             return doc["text"]
+         if "question" in doc:
+             return doc["question"]
+         if "problem" in doc:
+             return doc["problem"]
+         # Fallback - try the task object if available
+         if self._task_obj and hasattr(self._task_obj, "get_prompt"):
+             return self._task_obj.get_prompt(doc)
+         return str(doc)
+
+     def _load_from_files(self):
+         """Load task data from local files as a fallback."""
+         # Try standard locations
+         data_paths = [
+             f"~/.cache/bigcode_eval/{self.bigcode_task_name}",
+             f"data/{self.bigcode_task_name}",
+             f"bigcode_eval/tasks/{self.bigcode_task_name}",
+         ]
+
+         for path in data_paths:
+             expanded_path = os.path.expanduser(path)
+             if os.path.exists(expanded_path):
+                 self._load_from_path(expanded_path)
+                 return
+
+         # If no data was found, raise an error
+         raise ValueError(f"No data found for task {self.task_name}. Please provide valid benchmark data.")
+
+     def _load_from_path(self, path: str):
+         """Load data from a specific path."""
+         data = []
+
+         # Look for JSON/JSONL files
+         for file in Path(path).glob("*.json*"):
+             with open(file) as f:
+                 if file.suffix == ".jsonl":
+                     for line in f:
+                         data.append(json.loads(line))
+                 else:
+                     file_data = json.load(f)
+                     if isinstance(file_data, list):
+                         data.extend(file_data)
+                     else:
+                         data.append(file_data)
+
+         if self.limit:
+             data = data[: self.limit]
+
+         self._data = data
+
+     def get_samples(self) -> List[Dict[str, Any]]:
+         """Get all samples from the task."""
+         return self._data if self._data else []
+
+     def __len__(self):
+         """Get the number of samples."""
+         return len(self._data) if self._data else 0
+
+     def __iter__(self):
+         """Iterate over samples."""
+         return iter(self.get_samples())
+
+
+ class BigCodeEvaluator:
+     """Evaluates model outputs on BigCode benchmarks."""
+
+     def __init__(self, docker_executor=None):
+         """
+         Initialize the evaluator.
+
+         Args:
+             docker_executor: Optional Docker executor for secure code execution
+         """
+         self.docker_executor = docker_executor
+
+     def evaluate(self, task: BigCodeTask, generations: List[List[str]], k_values: List[int] = [1, 10, 100]) -> Dict[str, Any]:
+         """
+         Evaluate generations on a BigCode task.
+
+         Args:
+             task: BigCodeTask object
+             generations: One list of generated code solutions per task sample
+             k_values: k values for the pass@k metric
+
+         Returns:
+             Evaluation results dict
+         """
+         results = {
+             "task": task.task_name,
+             "num_samples": len(task),
+             "num_generations": len(generations),
+             "pass_at_k": {},
+         }
+
+         # Code generation tasks need to be executed and tested
+         if self._is_code_execution_task(task.task_name):
+             results["execution_results"] = self._evaluate_code_execution(task, generations)
+
+             # Calculate pass@k; k cannot exceed the number of generations per sample
+             for k in k_values:
+                 if generations and k <= len(generations[0]):
+                     pass_rate = self._calculate_pass_at_k(results["execution_results"], k)
+                     results["pass_at_k"][f"pass@{k}"] = pass_rate
+
+         else:
+             # For non-execution tasks (e.g., code-to-text), use BLEU or other metrics
+             results["bleu_scores"] = self._evaluate_text_generation(task, generations)
+
+         return results
+
+     def _is_code_execution_task(self, task_name: str) -> bool:
+         """Check whether a task requires code execution."""
+         non_execution_tasks = {
+             "codexglue_code_to_text",
+             "codexglue_code_to_text_python",
+             "codexglue_code_to_text_go",
+             "codexglue_code_to_text_ruby",
+             "codexglue_code_to_text_java",
+             "codexglue_code_to_text_javascript",
+             "codexglue_code_to_text_php",
+         }
+         return task_name not in non_execution_tasks
+
+     def _evaluate_code_execution(self, task: BigCodeTask, generations: List[List[str]]) -> List[Dict]:
+         """Evaluate code by executing it."""
+         results = []
+
+         for i, sample in enumerate(task.get_samples()):
+             sample_results = []
+
+             # generations[i] holds the candidate solutions for sample i
+             for generation in (generations[i] if i < len(generations) else []):
+                 result = self._execute_and_test(sample, generation, task.task_name)
+                 sample_results.append(result)
+
+             results.append({"sample_id": i, "results": sample_results})
+
+         return results
+
+     def _execute_and_test(self, sample: Dict, generation: str, task_name: str) -> Dict:
+         """Execute generated code and run its tests."""
+         if self.docker_executor:
+             # Use Docker for secure execution
+             return self._execute_in_docker(sample, generation, task_name)
+         # Fall back to a subprocess (less secure)
+         return self._execute_in_subprocess(sample, generation, task_name)
+
+     def _execute_in_docker(self, sample: Dict, generation: str, task_name: str) -> Dict:
+         """Execute code in a Docker container."""
+         # TODO: Implement Docker execution
+         logger.warning("Docker execution not yet implemented, using subprocess")
+         return self._execute_in_subprocess(sample, generation, task_name)
+
+     def _execute_in_subprocess(self, sample: Dict, generation: str, task_name: str) -> Dict:
+         """Execute code in a subprocess (less secure)."""
+         result = {"passed": False, "error": None, "output": None}
+
+         try:
+             # Create the test script
+             test_script = self._create_test_script(sample, generation, task_name)
+
+             # Write it to a temp file
+             with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
+                 f.write(test_script)
+                 temp_path = f.name
+
+             try:
+                 # Execute
+                 proc = subprocess.run([sys.executable, temp_path], capture_output=True, text=True, timeout=10)
+
+                 if proc.returncode == 0:
+                     result["passed"] = True
+                     result["output"] = proc.stdout
+                     logger.debug(f"✅ Code execution PASSED. Output: {proc.stdout[:200]}")
+                 else:
+                     result["error"] = proc.stderr or proc.stdout
+                     logger.debug(f"❌ Code execution FAILED. Error: {result['error'][:500]}")
+
+             finally:
+                 # Clean up
+                 os.unlink(temp_path)
+
+         except subprocess.TimeoutExpired:
+             result["error"] = "Timeout"
+         except Exception as e:
+             result["error"] = str(e)
+
+         return result
+
+     def _create_test_script(self, sample: Dict, generation: str, task_name: str) -> str:
+         """Create a test script for the sample."""
+         if "humaneval" in task_name:
+             script = self._create_humaneval_test_script(sample, generation)
+         elif "mbpp" in task_name:
+             script = self._create_mbpp_test_script(sample, generation)
+         elif "apps" in task_name:
+             script = self._create_apps_test_script(sample, generation)
+         else:
+             # Default format
+             script = self._create_humaneval_test_script(sample, generation)
+
+         logger.debug(f"📝 Test script for {task_name}:\n{script}\n")
+         return script
+
+     def _create_humaneval_test_script(self, sample: Dict, generation: str) -> str:
+         """Create a test script for the HumanEval format."""
+         entry_point = sample.get("entry_point", "solution")
+         test_code = sample.get("test", "")
+         prompt = sample.get("prompt", "")
+
+         # The prompt contains the function signature and the generation should be
+         # the function body, so the two are concatenated directly
+         script = f"""
+ {prompt}{generation}
+
+ {test_code}
+
+ if __name__ == "__main__":
+     check({entry_point})
+     print("All tests passed!")
+ """
+         return script
+
+     def _create_mbpp_test_script(self, sample: Dict, generation: str) -> str:
+         """Create a test script for the MBPP format."""
+         test_imports = sample.get("test_imports", [])
+         test_list = sample.get("test_list", [])
+
+         # Fix function name mismatches before creating the test script
+         fixed_generation = self._fix_function_name_mismatch(generation, test_list)
+
+         imports = "\n".join(test_imports)
+         tests = "\n    ".join(test_list)
+
+         script = f"""
+ {imports}
+
+ {fixed_generation}
+
+ if __name__ == "__main__":
+     {tests}
+     print("All tests passed!")
+ """
+         return script
+
+     def _create_apps_test_script(self, sample: Dict, generation: str) -> str:
+         """Create a test script for the APPS format."""
+         # APPS has input/output pairs
+         io_data = json.loads(sample.get("input_output", "{}"))
+         inputs = io_data.get("inputs", [])
+         outputs = io_data.get("outputs", [])
+
+         tests = []
+         for inp, out in zip(inputs, outputs):
+             tests.append(f"assert str(solution({inp})) == '{out}'")
+
+         test_code = "\n    ".join(tests)
+
+         script = f"""
+ {generation}
+
+ if __name__ == "__main__":
+     {test_code}
+     print("All tests passed!")
+ """
+         return script
+
+     def _fix_function_name_mismatch(self, code: str, test_list: List[str]) -> str:
+         """
+         Fix function name mismatches between generated code and test cases.
+
+         Uses a wrapper-function approach for robustness across different code structures.
+
+         Args:
+             code: Generated code that may have the wrong function name
+             test_list: Test assertions that specify the expected function name
+
+         Returns:
+             Fixed code with a wrapper function if needed
+         """
+         if not test_list or not code.strip():
+             return code
+
+         # Extract the expected function name from the test assertions
+         expected_name = None
+         # Built-in functions to skip when looking for the target function
+         builtin_functions = {
+             "set",
+             "len",
+             "str",
+             "int",
+             "float",
+             "list",
+             "tuple",
+             "dict",
+             "sum",
+             "max",
+             "min",
+             "abs",
+             "round",
+             "sorted",
+             "reversed",
+         }
+
+         for test in test_list:
+             # Find all function calls in the assert statement
+             function_calls = re.findall(r"(\w+)\s*\(", test)
+
+             for func_name in function_calls:
+                 # Skip built-in functions and common test helpers
+                 if func_name not in builtin_functions and func_name not in {
+                     "assert",
+                     "assertEqual",
+                     "assertTrue",
+                     "assertFalse",
+                 }:
+                     expected_name = func_name
+                     break
+
+             if expected_name:
+                 break
+
+         if not expected_name:
+             return code  # No function name found in the tests
+
+         # Extract the actual function name from the generated code
+         actual_name = None
+         func_match = re.search(r"def\s+(\w+)\s*\(", code)
+         if func_match:
+             actual_name = func_match.group(1)
+
+         if not actual_name:
+             return code  # No function definition found
+
+         if actual_name == expected_name:
+             return code  # Names already match
+
+         logger.debug(f"🔧 Function name mismatch detected: {actual_name} → {expected_name}")
+         logger.debug("   Adding wrapper function for compatibility")
+
+         # Add a wrapper function to bridge the name gap
+         wrapper = f"""
+ # Wrapper function for test compatibility
+ def {expected_name}(*args, **kwargs):
+     return {actual_name}(*args, **kwargs)
+ """
+
+         return code + wrapper
+
+     def _calculate_pass_at_k(self, execution_results: List[Dict], k: int) -> float:
+         """Calculate the pass@k metric."""
+         total_passed = 0
+         total_samples = len(execution_results)
+
+         for result in execution_results:
+             # A sample counts as solved if any of its first k candidates passed
+             sample_results = result["results"][:k]
+             if any(r["passed"] for r in sample_results):
+                 total_passed += 1
+
+         return total_passed / total_samples if total_samples > 0 else 0.0
+
+     def _evaluate_text_generation(self, task: BigCodeTask, generations: List[List[str]]) -> List[float]:
+         """Evaluate text generation tasks (e.g., code-to-text)."""
+         # TODO: Implement BLEU scoring
+         logger.warning("Text generation evaluation not yet implemented")
+         return [0.0] * len(generations)
+
+
+ # Main interface for BigCode integration
+ _loader = None
+ _evaluator = None
+
+
+ def get_bigcode_loader() -> BigCodeTaskLoader:
+     """Get the global BigCode task loader."""
+     global _loader
+     if _loader is None:
+         _loader = BigCodeTaskLoader()
+     return _loader
+
+
+ def get_bigcode_evaluator(docker_executor=None) -> BigCodeEvaluator:
+     """Get the global BigCode evaluator."""
+     global _evaluator
+     if _evaluator is None:
+         _evaluator = BigCodeEvaluator(docker_executor)
+     return _evaluator
+
+
+ def is_bigcode_task(task_name: str) -> bool:
+     """Check if a task is from BigCode."""
+     return get_bigcode_loader().is_bigcode_task(task_name)
+
+
+ def load_bigcode_task(task_name: str, limit: Optional[int] = None) -> BigCodeTask:
+     """Load a BigCode task."""
+     return get_bigcode_loader().load_task(task_name, limit)
+
+
+ def evaluate_bigcode_task(task: BigCodeTask, generations: List[List[str]], docker_executor=None) -> Dict[str, Any]:
+     """Evaluate generations on a BigCode task."""
+     evaluator = get_bigcode_evaluator(docker_executor)
+     return evaluator.evaluate(task, generations)
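
The module-level helpers above are the intended entry points. A minimal sketch of how they compose, assuming bigcode-evaluation-harness is installed; my_generate is a hypothetical stand-in for a real model call, and generations is nested one list per sample, as BigCodeEvaluator.evaluate expects:

from wisent.core.bigcode_integration import (
    evaluate_bigcode_task,
    is_bigcode_task,
    load_bigcode_task,
)


def my_generate(prompt: str, n: int = 10) -> list[str]:
    """Hypothetical stand-in: return n candidate completions for a prompt."""
    raise NotImplementedError


if is_bigcode_task("humaneval"):
    task = load_bigcode_task("humaneval", limit=5)  # first 5 problems only
    generations = [my_generate(task.doc_to_text(doc)) for doc in task.test_docs()]
    results = evaluate_bigcode_task(task, generations)  # runs tests, computes pass@k
    print(results["pass_at_k"])

Note that _calculate_pass_at_k reports the empirical rate at which any of the first k candidates passes. The unbiased pass@k estimator from the HumanEval paper (Chen et al., 2021) instead averages 1 - C(n-c, k)/C(n, k) over samples, where n is the number of candidates per sample and c the number that passed. A drop-in sketch of that estimator, for comparison:

from math import comb


def unbiased_pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k from Chen et al. (2021): 1 - C(n-c, k) / C(n, k)."""
    if n - c < k:  # every size-k draw must contain a passing candidate
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)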
wisent/core/contrastive_pairs/__init__.py
@@ -0,0 +1,15 @@
+ """Public interface for contrastive pair utilities."""
+
+ from .core.pair import ContrastivePair
+ from .core.set import ContrastivePairSet
+ from .core.buliders import from_phrase_pairs
+ from .diagnostics import DiagnosticsConfig, DiagnosticsReport, run_all_diagnostics
+
+ __all__ = [
+     "ContrastivePair",
+     "ContrastivePairSet",
+     "from_phrase_pairs",
+     "DiagnosticsConfig",
+     "DiagnosticsReport",
+     "run_all_diagnostics",
+ ]
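
These re-exports define the package's public surface for contrastive pairs. The signatures behind them are not shown in this diff, so the sketch below is built on assumptions from the names alone: from_phrase_pairs presumably builds a ContrastivePairSet from (positive, negative) phrase tuples, and run_all_diagnostics presumably runs the diagnostic checks listed in the file table (coverage, duplicates, divergence, control vectors) over a set:

from wisent.core.contrastive_pairs import (
    DiagnosticsConfig,
    from_phrase_pairs,
    run_all_diagnostics,
)

# Assumed shape: one (positive, negative) tuple per pair.
pair_set = from_phrase_pairs([
    ("The capital of France is Paris.", "The capital of France is Rome."),
])

# Assumed: returns a DiagnosticsReport aggregating all diagnostic checks.
report = run_all_diagnostics(pair_set, DiagnosticsConfig())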
wisent/core/contrastive_pairs/core/__init__.py (file without changes)
wisent/core/contrastive_pairs/core/atoms.py
@@ -0,0 +1,45 @@
+ from __future__ import annotations
+
+ from abc import ABC, abstractmethod
+
+ from typing import Iterable, TYPE_CHECKING
+
+ if TYPE_CHECKING:
+     from wisent.core.activations.core.atoms import LayerActivations
+
+ __all__ = [
+     "AtomResponse",
+     "AtomContrastivePair",
+     "AtomContrastivePairSet",
+ ]
+
+
+ class AtomResponse(ABC):
+     """Abstract base for a single model response."""
+
+     model_response: str
+     layers_activations: LayerActivations | None = None
+     label: str | None = None
+
+
+ class AtomContrastivePair(ABC):
+     """Abstract base for a (prompt, positive, negative) trio."""
+
+     prompt: str
+     positive_response: AtomResponse
+     negative_response: AtomResponse
+     label: str | None
+     trait_description: str | None
+
+
+ class AtomContrastivePairSet(ABC):
+     """Abstract base for a named collection of pairs."""
+
+     name: str
+     pairs: list[AtomContrastivePair]
+     task_type: str | None
+
+     @abstractmethod
+     def add(self, pair: AtomContrastivePair) -> None: ...
+
+     @abstractmethod
+     def extend(self, pairs: Iterable[AtomContrastivePair]) -> None: ...
+
+     def __len__(self) -> int:
+         return len(self.pairs)
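
The atoms carry data annotations plus two abstract mutators, so a concrete class only needs to store the fields and implement add and extend; __len__ comes for free. The real implementations live in core/pair.py, core/response.py, and core/set.py (listed in the file table but not shown in this diff); the class below is a hypothetical minimal sketch of what satisfying the contract looks like:

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Iterable

from wisent.core.contrastive_pairs.core.atoms import (
    AtomContrastivePair,
    AtomContrastivePairSet,
)


@dataclass
class MiniPairSet(AtomContrastivePairSet):
    """Toy concrete set that keeps pairs in a plain list."""

    name: str
    pairs: list[AtomContrastivePair] = field(default_factory=list)
    task_type: str | None = None

    def add(self, pair: AtomContrastivePair) -> None:
        self.pairs.append(pair)

    def extend(self, pairs: Iterable[AtomContrastivePair]) -> None:
        self.pairs.extend(pairs)


s = MiniPairSet(name="demo")
print(len(s))  # 0, via the inherited AtomContrastivePairSet.__len__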