wisent 0.1.1__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of wisent might be problematic. Click here for more details.

Files changed (237) hide show
  1. wisent/__init__.py +1 -8
  2. wisent/benchmarks/__init__.py +0 -0
  3. wisent/benchmarks/coding/__init__.py +0 -0
  4. wisent/benchmarks/coding/metrics/__init__.py +0 -0
  5. wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
  6. wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
  7. wisent/benchmarks/coding/metrics/evaluator.py +275 -0
  8. wisent/benchmarks/coding/metrics/passk.py +66 -0
  9. wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
  10. wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
  11. wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
  12. wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
  13. wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
  14. wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
  15. wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
  16. wisent/benchmarks/coding/providers/__init__.py +18 -0
  17. wisent/benchmarks/coding/providers/core/__init__.py +0 -0
  18. wisent/benchmarks/coding/providers/core/atoms.py +31 -0
  19. wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
  20. wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
  21. wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
  22. wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
  23. wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
  24. wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
  25. wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
  26. wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
  27. wisent/classifiers/__init__.py +0 -0
  28. wisent/classifiers/core/__init__.py +0 -0
  29. wisent/classifiers/core/atoms.py +747 -0
  30. wisent/classifiers/models/__init__.py +0 -0
  31. wisent/classifiers/models/logistic.py +29 -0
  32. wisent/classifiers/models/mlp.py +47 -0
  33. wisent/cli/__init__.py +0 -0
  34. wisent/cli/classifiers/__init__.py +0 -0
  35. wisent/cli/classifiers/classifier_rotator.py +137 -0
  36. wisent/cli/cli_logger.py +142 -0
  37. wisent/cli/data_loaders/__init__.py +0 -0
  38. wisent/cli/data_loaders/data_loader_rotator.py +96 -0
  39. wisent/cli/evaluators/__init__.py +0 -0
  40. wisent/cli/evaluators/evaluator_rotator.py +148 -0
  41. wisent/cli/steering_methods/__init__.py +0 -0
  42. wisent/cli/steering_methods/steering_rotator.py +110 -0
  43. wisent/cli/wisent_cli/__init__.py +0 -0
  44. wisent/cli/wisent_cli/commands/__init__.py +0 -0
  45. wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
  46. wisent/cli/wisent_cli/commands/listing.py +154 -0
  47. wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
  48. wisent/cli/wisent_cli/main.py +93 -0
  49. wisent/cli/wisent_cli/shell.py +80 -0
  50. wisent/cli/wisent_cli/ui.py +69 -0
  51. wisent/cli/wisent_cli/util/__init__.py +0 -0
  52. wisent/cli/wisent_cli/util/aggregations.py +43 -0
  53. wisent/cli/wisent_cli/util/parsing.py +126 -0
  54. wisent/cli/wisent_cli/version.py +4 -0
  55. wisent/core/__init__.py +27 -0
  56. wisent/core/activations/__init__.py +0 -0
  57. wisent/core/activations/activations_collector.py +338 -0
  58. wisent/core/activations/core/__init__.py +0 -0
  59. wisent/core/activations/core/atoms.py +216 -0
  60. wisent/core/agent/__init__.py +18 -0
  61. wisent/core/agent/budget.py +638 -0
  62. wisent/core/agent/device_benchmarks.py +685 -0
  63. wisent/core/agent/diagnose/__init__.py +55 -0
  64. wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
  65. wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
  66. wisent/core/agent/diagnose/create_classifier.py +1154 -0
  67. wisent/core/agent/diagnose/response_diagnostics.py +268 -0
  68. wisent/core/agent/diagnose/select_classifiers.py +506 -0
  69. wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
  70. wisent/core/agent/diagnose/tasks/__init__.py +33 -0
  71. wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
  72. wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
  73. wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
  74. wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
  75. wisent/core/agent/diagnose.py +242 -0
  76. wisent/core/agent/steer.py +212 -0
  77. wisent/core/agent/timeout.py +134 -0
  78. wisent/core/autonomous_agent.py +1234 -0
  79. wisent/core/bigcode_integration.py +583 -0
  80. wisent/core/contrastive_pairs/__init__.py +15 -0
  81. wisent/core/contrastive_pairs/core/__init__.py +0 -0
  82. wisent/core/contrastive_pairs/core/atoms.py +45 -0
  83. wisent/core/contrastive_pairs/core/buliders.py +59 -0
  84. wisent/core/contrastive_pairs/core/pair.py +178 -0
  85. wisent/core/contrastive_pairs/core/response.py +152 -0
  86. wisent/core/contrastive_pairs/core/serialization.py +300 -0
  87. wisent/core/contrastive_pairs/core/set.py +133 -0
  88. wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
  89. wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
  90. wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
  91. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
  92. wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
  93. wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
  94. wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
  95. wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
  96. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
  102. wisent/core/data_loaders/__init__.py +0 -0
  103. wisent/core/data_loaders/core/__init__.py +0 -0
  104. wisent/core/data_loaders/core/atoms.py +98 -0
  105. wisent/core/data_loaders/loaders/__init__.py +0 -0
  106. wisent/core/data_loaders/loaders/custom.py +120 -0
  107. wisent/core/data_loaders/loaders/lm_loader.py +218 -0
  108. wisent/core/detection_handling.py +257 -0
  109. wisent/core/download_full_benchmarks.py +1386 -0
  110. wisent/core/evaluators/__init__.py +0 -0
  111. wisent/core/evaluators/oracles/__init__.py +0 -0
  112. wisent/core/evaluators/oracles/interactive.py +73 -0
  113. wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
  114. wisent/core/evaluators/oracles/user_specified.py +67 -0
  115. wisent/core/hyperparameter_optimizer.py +429 -0
  116. wisent/core/lm_eval_harness_ground_truth.py +1396 -0
  117. wisent/core/log_likelihoods_evaluator.py +321 -0
  118. wisent/core/managed_cached_benchmarks.py +595 -0
  119. wisent/core/mixed_benchmark_sampler.py +364 -0
  120. wisent/core/model_config_manager.py +330 -0
  121. wisent/core/model_persistence.py +317 -0
  122. wisent/core/models/__init__.py +0 -0
  123. wisent/core/models/core/__init__.py +0 -0
  124. wisent/core/models/core/atoms.py +460 -0
  125. wisent/core/models/wisent_model.py +727 -0
  126. wisent/core/multi_steering.py +316 -0
  127. wisent/core/optuna/__init__.py +57 -0
  128. wisent/core/optuna/classifier/__init__.py +25 -0
  129. wisent/core/optuna/classifier/activation_generator.py +349 -0
  130. wisent/core/optuna/classifier/classifier_cache.py +509 -0
  131. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
  132. wisent/core/optuna/steering/__init__.py +0 -0
  133. wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
  134. wisent/core/optuna/steering/data_utils.py +342 -0
  135. wisent/core/optuna/steering/metrics.py +474 -0
  136. wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
  137. wisent/core/optuna/steering/steering_optimization.py +1111 -0
  138. wisent/core/parser.py +1668 -0
  139. wisent/core/prompts/__init__.py +0 -0
  140. wisent/core/prompts/core/__init__.py +0 -0
  141. wisent/core/prompts/core/atom.py +57 -0
  142. wisent/core/prompts/core/prompt_formater.py +157 -0
  143. wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
  144. wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
  145. wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
  146. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
  147. wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
  148. wisent/core/representation.py +5 -0
  149. wisent/core/sample_size_optimizer.py +648 -0
  150. wisent/core/sample_size_optimizer_v2.py +355 -0
  151. wisent/core/save_results.py +277 -0
  152. wisent/core/steering.py +652 -0
  153. wisent/core/steering_method.py +26 -0
  154. wisent/core/steering_methods/__init__.py +0 -0
  155. wisent/core/steering_methods/core/__init__.py +0 -0
  156. wisent/core/steering_methods/core/atoms.py +153 -0
  157. wisent/core/steering_methods/methods/__init__.py +0 -0
  158. wisent/core/steering_methods/methods/caa.py +44 -0
  159. wisent/core/steering_optimizer.py +1297 -0
  160. wisent/core/task_interface.py +132 -0
  161. wisent/core/task_selector.py +189 -0
  162. wisent/core/tasks/__init__.py +175 -0
  163. wisent/core/tasks/aime_task.py +141 -0
  164. wisent/core/tasks/file_task.py +211 -0
  165. wisent/core/tasks/hle_task.py +180 -0
  166. wisent/core/tasks/hmmt_task.py +119 -0
  167. wisent/core/tasks/livecodebench_task.py +201 -0
  168. wisent/core/tasks/livemathbench_task.py +158 -0
  169. wisent/core/tasks/lm_eval_task.py +455 -0
  170. wisent/core/tasks/math500_task.py +84 -0
  171. wisent/core/tasks/polymath_task.py +146 -0
  172. wisent/core/tasks/supergpqa_task.py +220 -0
  173. wisent/core/time_estimator.py +149 -0
  174. wisent/core/timing_calibration.py +174 -0
  175. wisent/core/tracking/__init__.py +54 -0
  176. wisent/core/tracking/latency.py +618 -0
  177. wisent/core/tracking/memory.py +359 -0
  178. wisent/core/trainers/__init__.py +0 -0
  179. wisent/core/trainers/core/__init__.py +11 -0
  180. wisent/core/trainers/core/atoms.py +45 -0
  181. wisent/core/trainers/steering_trainer.py +271 -0
  182. wisent/core/user_model_config.py +158 -0
  183. wisent/opti/__init__.py +0 -0
  184. wisent/opti/core/__init__.py +0 -0
  185. wisent/opti/core/atoms.py +175 -0
  186. wisent/opti/methods/__init__.py +0 -0
  187. wisent/opti/methods/opti_classificator.py +172 -0
  188. wisent/opti/methods/opti_steering.py +138 -0
  189. wisent/synthetic/__init__.py +0 -0
  190. wisent/synthetic/cleaners/__init__.py +0 -0
  191. wisent/synthetic/cleaners/core/__init__.py +0 -0
  192. wisent/synthetic/cleaners/core/atoms.py +58 -0
  193. wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
  194. wisent/synthetic/cleaners/methods/__init__.py +0 -0
  195. wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
  196. wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
  197. wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
  198. wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
  199. wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
  200. wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
  201. wisent/synthetic/db_instructions/__init__.py +0 -0
  202. wisent/synthetic/db_instructions/core/__init__.py +0 -0
  203. wisent/synthetic/db_instructions/core/atoms.py +25 -0
  204. wisent/synthetic/db_instructions/mini_dp.py +37 -0
  205. wisent/synthetic/generators/__init__.py +0 -0
  206. wisent/synthetic/generators/core/__init__.py +0 -0
  207. wisent/synthetic/generators/core/atoms.py +73 -0
  208. wisent/synthetic/generators/diversities/__init__.py +0 -0
  209. wisent/synthetic/generators/diversities/core/__init__.py +0 -0
  210. wisent/synthetic/generators/diversities/core/core.py +68 -0
  211. wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
  212. wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
  213. wisent/synthetic/generators/pairs_generator.py +179 -0
  214. wisent-0.5.1.dist-info/METADATA +67 -0
  215. wisent-0.5.1.dist-info/RECORD +218 -0
  216. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/WHEEL +1 -1
  217. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info/licenses}/LICENSE +2 -2
  218. wisent/activations/__init__.py +0 -9
  219. wisent/activations/client.py +0 -97
  220. wisent/activations/extractor.py +0 -251
  221. wisent/activations/models.py +0 -95
  222. wisent/client.py +0 -45
  223. wisent/control_vector/__init__.py +0 -9
  224. wisent/control_vector/client.py +0 -85
  225. wisent/control_vector/manager.py +0 -168
  226. wisent/control_vector/models.py +0 -70
  227. wisent/inference/__init__.py +0 -9
  228. wisent/inference/client.py +0 -103
  229. wisent/inference/inferencer.py +0 -250
  230. wisent/inference/models.py +0 -66
  231. wisent/utils/__init__.py +0 -3
  232. wisent/utils/auth.py +0 -30
  233. wisent/utils/http.py +0 -228
  234. wisent/version.py +0 -3
  235. wisent-0.1.1.dist-info/METADATA +0 -142
  236. wisent-0.1.1.dist-info/RECORD +0 -23
  237. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,220 @@
1
+ """
2
+ SuperGPQA task implementation for task-agnostic architecture.
3
+ """
4
+
5
+ from typing import Dict, Any, List, Optional
6
+ from datasets import load_dataset
7
+ from ..task_interface import TaskInterface
8
+ from ..benchmark_extractors import SuperGPQAExtractor
9
+
10
+
11
class SuperGPQATask(TaskInterface):
    """Task wrapper around the ``m-a-p/SuperGPQA`` multiple-choice dataset.

    Rows are read from the ``train`` split with ``datasets.load_dataset``,
    optionally narrowed by discipline, field, difficulty and calculation flag,
    and converted into plain dictionaries.
    """

    def __init__(self, discipline_filter: Optional[str] = None, difficulty_filter: Optional[str] = None,
                 calculation_only: Optional[bool] = None, limit: Optional[int] = None):
        """Store the filter configuration and create the extractor.

        Args:
            discipline_filter: Keep only rows whose ``discipline`` equals this value.
            difficulty_filter: Keep only rows whose ``difficulty`` equals this value.
            calculation_only: ``True`` keeps only rows flagged ``is_calculation``,
                ``False`` keeps only rows that are not flagged, ``None`` keeps both.
            limit: Default maximum number of rows returned by ``load_data``.
        """
        self.dataset_name = "m-a-p/SuperGPQA"

        # Filter configuration; a falsy string filter means "do not filter".
        self.discipline_filter = discipline_filter
        self.difficulty_filter = difficulty_filter
        self.field_filter = None  # Can be set by subclasses
        self.calculation_only = calculation_only
        self.limit = limit

        self._extractor = SuperGPQAExtractor()
        self._data = None  # Filled lazily by test_docs()

    def load_data(self, limit: Optional[int] = None) -> List[Dict[str, Any]]:
        """Load, filter and convert the dataset, then truncate it.

        Args:
            limit: Maximum number of rows to return. A falsy value (``None`` or
                ``0``) falls back to the limit given at construction time; if
                that is falsy as well, every matching row is returned.

        Returns:
            The converted rows, in dataset order.
        """
        raw_rows = load_dataset(self.dataset_name, split="train")
        rows = self._filter_and_process(raw_rows)

        cap = limit or self.limit
        return rows[:cap] if cap else rows

    def _matches_filters(self, item) -> bool:
        """Return True when ``item`` passes every configured filter."""
        if self.discipline_filter and item.get('discipline') != self.discipline_filter:
            return False

        # Field filter is only set by subject-specific subclasses.
        if self.field_filter and item.get('field') != self.field_filter:
            return False

        if self.difficulty_filter and item.get('difficulty') != self.difficulty_filter:
            return False

        if self.calculation_only is None:
            return True
        # Keep the row only when its calculation flag agrees with the request.
        return bool(self.calculation_only) == bool(item.get('is_calculation', False))

    def _to_record(self, item) -> Dict[str, Any]:
        """Copy the fields of a dataset row into the internal dictionary layout."""
        return {
            'uuid': item.get('uuid', ''),
            'question': item.get('question', ''),
            'options': item.get('options', []),
            'answer': item.get('answer', ''),
            'answer_letter': item.get('answer_letter', ''),
            'discipline': item.get('discipline', ''),
            'field': item.get('field', ''),
            'subfield': item.get('subfield', ''),
            'difficulty': item.get('difficulty', ''),
            'is_calculation': item.get('is_calculation', False),
            'metadata': {
                'dataset': self.dataset_name
            }
        }

    def _filter_and_process(self, dataset) -> List[Dict[str, Any]]:
        """Drop rows rejected by the filters and convert the rest to internal format."""
        return [self._to_record(item) for item in dataset if self._matches_filters(item)]

    def get_extractor(self) -> SuperGPQAExtractor:
        """Return the extractor instance created in the constructor."""
        return self._extractor

    def get_name(self) -> str:
        """Build the task name from ``supergpqa`` plus one suffix per active filter."""
        parts = ["supergpqa"]
        if self.discipline_filter:
            parts.append(self.discipline_filter.lower())
        if self.difficulty_filter:
            parts.append(self.difficulty_filter.lower())
        if self.calculation_only is not None:
            parts.append("calc" if self.calculation_only else "nocalc")
        return "_".join(parts)

    def get_description(self) -> str:
        """Return a one-line description, listing the active filters if any."""
        base = "SuperGPQA: Large-scale dataset of scientific multiple-choice questions across disciplines"

        active = []
        if self.discipline_filter:
            active.append(f"discipline: {self.discipline_filter}")
        if self.difficulty_filter:
            active.append(f"difficulty: {self.difficulty_filter}")
        if self.calculation_only is not None:
            if self.calculation_only:
                active.append("calculation problems only")
            else:
                active.append("non-calculation problems only")

        if not active:
            return base
        return base + f" (filtered: {', '.join(active)})"

    def get_categories(self) -> List[str]:
        """Return the fixed list of category tags for this task."""
        return ["science", "reasoning", "multiple_choice", "knowledge"]

    def get_task_info(self) -> Dict[str, Any]:
        """Return a summary dictionary: name, description, source, type and filters."""
        active_filters = {
            "discipline": self.discipline_filter,
            "difficulty": self.difficulty_filter,
            "calculation_only": self.calculation_only
        }
        return {
            "task_name": self.get_name(),
            "description": self.get_description(),
            "source": self.dataset_name,
            "task_type": "multiple_choice",
            "evaluation_method": "exact_match",
            "filters": active_filters
        }

    def validate_sample(self, sample: Dict[str, Any]) -> bool:
        """Return True when ``sample`` contains every required key."""
        required = ("question", "options", "answer", "answer_letter")
        return not any(name not in sample for name in required)

    # Methods to match lm-eval interface
    def has_validation_docs(self) -> bool:
        """Always False: this task exposes no validation documents."""
        return False

    def has_test_docs(self) -> bool:
        """Always True: every loaded sample is served as a test document."""
        return True

    def test_docs(self) -> List[Dict[str, Any]]:
        """Return the loaded samples, calling ``load_data`` on first use and caching the result."""
        if self._data is None:
            self._data = self.load_data()
        return self._data

    def validation_docs(self) -> List[Dict[str, Any]]:
        """Return an empty list; there is no validation set."""
        return []

    def doc_to_text(self, doc: Dict[str, Any]) -> str:
        """Render a document as a prompt.

        When the document has options, each one is written on its own line as
        ``<letter>. <option>`` (letters start at ``A``) below the question,
        separated from it by a blank line. Without options the question alone
        is returned.
        """
        question = doc.get('question', '')
        options = doc.get('options', [])

        if not options:
            return question

        lettered = [f"{chr(ord('A') + idx)}. {text}" for idx, text in enumerate(options)]
        return f"{question}\n\n" + "\n".join(lettered)
179
+
180
+
181
class SuperGPQAPhysicsTask(SuperGPQATask):
    """SuperGPQA restricted to discipline ``Science`` and field ``Physics``."""

    def __init__(
        self,
        difficulty_filter: Optional[str] = None,
        calculation_only: Optional[bool] = None,
        limit: Optional[int] = None,
    ):
        """Configure the parent task for Science, then narrow it to Physics.

        Args:
            difficulty_filter: Forwarded unchanged to the parent constructor.
            calculation_only: Forwarded unchanged to the parent constructor.
            limit: Forwarded unchanged to the parent constructor.
        """
        super().__init__(
            discipline_filter="Science",
            difficulty_filter=difficulty_filter,
            calculation_only=calculation_only,
            limit=limit,
        )
        # The parent constructor leaves field_filter as None; set it afterwards.
        self.field_filter = "Physics"

    def get_name(self) -> str:
        """Return the fixed task name; active filters are not reflected in it."""
        return "supergpqa_physics"
193
+
194
+
195
class SuperGPQAChemistryTask(SuperGPQATask):
    """SuperGPQA restricted to discipline ``Science`` and field ``Chemistry``."""

    def __init__(
        self,
        difficulty_filter: Optional[str] = None,
        calculation_only: Optional[bool] = None,
        limit: Optional[int] = None,
    ):
        """Configure the parent task for Science, then narrow it to Chemistry.

        Args:
            difficulty_filter: Forwarded unchanged to the parent constructor.
            calculation_only: Forwarded unchanged to the parent constructor.
            limit: Forwarded unchanged to the parent constructor.
        """
        super().__init__(
            discipline_filter="Science",
            difficulty_filter=difficulty_filter,
            calculation_only=calculation_only,
            limit=limit,
        )
        # The parent constructor leaves field_filter as None; set it afterwards.
        self.field_filter = "Chemistry"

    def get_name(self) -> str:
        """Return the fixed task name; active filters are not reflected in it."""
        return "supergpqa_chemistry"
207
+
208
+
209
class SuperGPQABiologyTask(SuperGPQATask):
    """SuperGPQA restricted to discipline ``Science`` and field ``Biology``."""

    def __init__(
        self,
        difficulty_filter: Optional[str] = None,
        calculation_only: Optional[bool] = None,
        limit: Optional[int] = None,
    ):
        """Configure the parent task for Science, then narrow it to Biology.

        Args:
            difficulty_filter: Forwarded unchanged to the parent constructor.
            calculation_only: Forwarded unchanged to the parent constructor.
            limit: Forwarded unchanged to the parent constructor.
        """
        super().__init__(
            discipline_filter="Science",
            difficulty_filter=difficulty_filter,
            calculation_only=calculation_only,
            limit=limit,
        )
        # The parent constructor leaves field_filter as None; set it afterwards.
        self.field_filter = "Biology"

    def get_name(self) -> str:
        """Return the fixed task name; active filters are not reflected in it."""
        return "supergpqa_biology"
@@ -0,0 +1,149 @@
1
+ """Time estimation for optimization operations using runtime calibration"""
2
+ import time
3
+ from typing import Dict, Tuple, Optional
4
+ from pathlib import Path
5
+
6
+ from .timing_calibration import TimingCalibrator
7
+
8
+
9
class OptimizationTimeEstimator:
    """Estimates time required for optimization operations using calibration.

    On construction the estimator reads the model's layer count and obtains
    calibration timings, either from ``calibration_file`` or by running
    ``TimingCalibrator.run_calibration``. The ``estimate_*`` methods then
    delegate to ``TimingCalibrator.estimate_optimization_time``.
    """

    def __init__(
        self,
        model_name: str,
        verbose: bool = True,
        skip_calibration: bool = False,
        calibration_file: Optional[Path] = None,
        calibrate_only: bool = False
    ):
        """Load the model configuration and obtain calibration timings.

        Args:
            model_name: Name passed to ``Model`` and to the calibrator.
            verbose: Print progress messages.
            skip_calibration: Not supported; passing True raises ``RuntimeError``.
            calibration_file: If it exists, timings are loaded from it; otherwise
                calibration is run and, when a path was given, saved to it.
            calibrate_only: Stored on the instance as ``self.calibrate_only``.

        Raises:
            RuntimeError: If ``skip_calibration`` is True, the layer count cannot
                be determined, or the calibration file cannot be loaded or has
                no ``training_time``.
        """
        # Fail fast: this is unconditional, so do not load a model first.
        if skip_calibration:
            raise RuntimeError("Calibration cannot be skipped. Accurate timing requires calibration.")

        self.model_name = model_name
        self.verbose = verbose
        self.calibrate_only = calibrate_only
        self.calibrator = TimingCalibrator(verbose=verbose)

        # Get number of layers in the model
        from . import Model
        model = Model(name=model_name)
        if hasattr(model, 'model') and hasattr(model.model, 'config'):
            if hasattr(model.model.config, 'num_hidden_layers'):
                self.total_layers = model.model.config.num_hidden_layers
            elif hasattr(model.model.config, 'n_layer'):
                self.total_layers = model.model.config.n_layer
            else:
                raise RuntimeError(f"Cannot determine number of layers for model {model_name}")
        else:
            raise RuntimeError(f"Cannot access model configuration for {model_name}")

        # Accept plain string paths as well as Path objects.
        if calibration_file:
            calibration_file = Path(calibration_file)

        if calibration_file and calibration_file.exists():
            # Load from file
            if not self.calibrator.load_from_file(calibration_file):
                raise RuntimeError(f"Failed to load calibration from {calibration_file}")
            self.timing = self.calibrator.timings
            # Only training_time is mandatory. run_calibration always stores
            # steering_time as None, so requiring it would reject every file
            # written by save_to_file.
            if not isinstance(self.timing, dict) or self.timing.get("training_time") is None:
                raise RuntimeError(f"Calibration file {calibration_file} contains invalid data")
        else:
            # Run calibration
            if verbose:
                print(f"\nšŸ”§ Running timing calibration for {model_name}...")

            self.timing = self.calibrator.run_calibration(model_name)

            # Save calibration if file path provided
            if calibration_file:
                self.calibrator.save_to_file(calibration_file)

    def estimate_classification_time(
        self,
        num_tasks: int,
        sample_limit: int = 200,
        layers: Optional[list] = None
    ) -> Tuple[float, Dict[str, float]]:
        """
        Estimate time for classification optimization.

        Args:
            num_tasks: Number of tasks to optimize.
            sample_limit: Samples per task.
            layers: Layers to test; when empty or None, ``min(5, total_layers)``
                layers are assumed.

        Returns:
            Tuple of (total_seconds, breakdown) where breakdown has the single
            key ``"classification"``.
        """
        num_layers = len(layers) if layers else min(5, self.total_layers)

        # All optional phases are disabled, so the total is the classification time.
        total_time, _ = self.calibrator.estimate_optimization_time(
            num_tasks=num_tasks,
            num_layers=num_layers,
            samples_per_task=sample_limit,
            include_sample_size_opt=False,
            include_classifier_training=False,
            include_control_vectors=False
        )

        return total_time, {"classification": total_time}

    def estimate_full_optimization_time(
        self,
        num_tasks: int,
        classification_limit: int = 200,
        sample_sizes: Optional[list] = None,
        sample_size_limit: int = 1000,
        include_sample_size_opt: bool = True,
        include_classifier_training: bool = True,
        include_control_vectors: bool = True
    ) -> Tuple[float, Dict[str, float]]:
        """
        Estimate time for full optimization pipeline.

        Args:
            num_tasks: Number of tasks to optimize.
            classification_limit: Samples per task for classification.
            sample_sizes: Sample sizes tested during sample-size optimization;
                must not be None.
            sample_size_limit: Upper bound passed through to the calibrator.
            include_sample_size_opt: Include the sample-size phase.
            include_classifier_training: Include the classifier-training phase.
            include_control_vectors: Include the control-vector phase.

        Returns:
            Tuple of (total_seconds, breakdown)

        Raises:
            RuntimeError: If ``sample_sizes`` is None.
        """
        # Sample sizes must be provided
        if sample_sizes is None:
            raise RuntimeError("sample_sizes must be provided for full optimization time estimation")

        # Typical number of layers tested in classification
        num_layers = min(5, self.total_layers)

        # Control vectors typically test more layers
        cv_layers = min(10, self.total_layers)

        return self.calibrator.estimate_optimization_time(
            num_tasks=num_tasks,
            num_layers=num_layers,
            samples_per_task=classification_limit,
            sample_sizes=sample_sizes,
            sample_size_limit=sample_size_limit,
            include_sample_size_opt=include_sample_size_opt,
            include_classifier_training=include_classifier_training,
            include_control_vectors=include_control_vectors,
            num_cv_layers=cv_layers
        )

    @staticmethod
    def format_time(seconds: float) -> str:
        """Format a duration in seconds as seconds, minutes, or hours and minutes.

        Values are rounded to the nearest whole unit. Hours are derived from
        the rounded minute total with integer division, so 5400 seconds is
        "1 hours 30 minutes" rather than a rounded-up hour count.
        """
        try:
            whole_seconds = round(seconds)
        except (ValueError, OverflowError):
            # NaN and infinity cannot be rounded to an int; echo the raw value.
            return f"{seconds:.0f} seconds"

        if whole_seconds < 60:
            return f"{whole_seconds} seconds"

        total_minutes = round(seconds / 60)
        if total_minutes < 60:
            return f"{total_minutes} minutes"

        hours, minutes = divmod(total_minutes, 60)
        if minutes:
            return f"{hours} hours {minutes} minutes"
        return f"{hours} hours"

    def print_time_breakdown(self, total_time: float, breakdown: Dict[str, float]):
        """Print the total estimate and, if there are several phases, each non-zero phase."""
        print(f"\nā±ļø ESTIMATED OPTIMIZATION TIME:")
        print(f" Total: {self.format_time(total_time)}")

        if len(breakdown) > 1:
            print("\n Breakdown:")
            for phase, time_sec in breakdown.items():
                if time_sec > 0:
                    print(f" - {phase.replace('_', ' ').title()}: {self.format_time(time_sec)}")
@@ -0,0 +1,174 @@
1
+ """Runtime timing calibration for optimization time estimation"""
2
+ import time
3
+ import json
4
+ import subprocess
5
+ import sys
6
+ from typing import Dict, Optional, Tuple
7
+ from pathlib import Path
8
+
9
+
10
class TimingCalibrator:
    """Measures actual optimization timing on the current system.

    ``timings`` holds two entries: ``training_time`` (seconds taken by the
    calibration subprocess) and ``steering_time`` (currently never measured,
    so it stays ``None``).
    """

    def __init__(self, verbose: bool = True):
        """Create a calibrator with no measurements yet.

        Args:
            verbose: Print progress messages.
        """
        self.verbose = verbose
        self.timings: Dict[str, Optional[float]] = {
            "training_time": None,  # Time for training command
            "steering_time": None,  # Time for steering command
        }

    def run_calibration(self, model_name: str) -> Dict[str, Optional[float]]:
        """
        Run calibration by timing one classification-optimization subprocess.
        Uses exactly one task (arc_easy), one layer (the middle one), and 10 samples.

        Args:
            model_name: Model to calibrate timing for

        Returns:
            Dictionary with training_time (seconds) and steering_time (None,
            because steering calibration is skipped)

        Raises:
            RuntimeError: If the layer count cannot be read from the model
                configuration or the subprocess exits with a non-zero code.
        """
        # Get model layer count
        from . import Model
        model = Model(name=model_name)
        if hasattr(model, 'model') and hasattr(model.model, 'config'):
            if hasattr(model.model.config, 'num_hidden_layers'):
                total_layers = model.model.config.num_hidden_layers
            elif hasattr(model.model.config, 'n_layer'):
                total_layers = model.model.config.n_layer
            else:
                raise RuntimeError(f"Cannot determine number of layers for model {model_name}")
        else:
            raise RuntimeError(f"Cannot access model config for {model_name}")

        # Use middle layer for calibration
        calibration_layer = total_layers // 2

        if self.verbose:
            print(f"\nšŸ”§ Running timing calibration for {model_name}...")
            print(" Task: arc_easy")
            print(f" Layer: {calibration_layer}")
            print(" Samples: 10")

        # 1. Measure training time
        if self.verbose:
            print("\nšŸ“Š Measuring training time...")

        # NOTE(review): the module path "wisent_guard.cli" is hard-coded here;
        # confirm it is importable in the environment this package is
        # installed into, otherwise the subprocess fails.
        cmd = [
            sys.executable, "-m", "wisent_guard.cli",
            "optimize-classification",
            model_name,
            "--tasks", "arc_easy",
            "--limit", "10",
            "--layer-range", f"{calibration_layer},{calibration_layer}",
            "--no-save",
            "--skip-timing-estimation"
        ]

        # perf_counter is monotonic, so the interval is immune to wall-clock jumps.
        start_time = time.perf_counter()
        result = subprocess.run(cmd, capture_output=True, text=True)
        elapsed = time.perf_counter() - start_time

        if result.returncode != 0:
            raise RuntimeError(f"Training calibration failed:\n{result.stderr}")

        self.timings["training_time"] = elapsed

        # 2. Skip steering calibration - it's too complex for quick calibration
        # Users can run steering separately if needed
        if self.verbose:
            print("\nšŸ“Š Skipping steering calibration (too complex for quick estimate)")

        self.timings["steering_time"] = None

        if self.verbose:
            print("\nāœ… Calibration complete!")
            print(f" Training time: {self.timings['training_time']:.3f}s")
            # steering_time is None here; formatting None with ':.3f' raises
            # TypeError, so report it as skipped instead.
            steering_time = self.timings["steering_time"]
            if steering_time is None:
                print(" Steering time: skipped")
            else:
                print(f" Steering time: {steering_time:.3f}s")

        return self.timings

    def save_to_file(self, filepath: Path):
        """Save calibration results to ``filepath`` as JSON."""
        with open(filepath, 'w', encoding="utf-8") as f:
            json.dump(self.timings, f, indent=2)
        if self.verbose:
            print(f"šŸ’¾ Saved calibration to {filepath}")

    def load_from_file(self, filepath: Path) -> bool:
        """Load calibration results from a JSON file.

        The file must contain a JSON object. Missing ``training_time`` or
        ``steering_time`` keys are filled with ``None``; other keys are kept.
        On failure ``self.timings`` is left unchanged.

        Returns:
            True if the file was read and contained a JSON object, else False.
        """
        filepath = Path(filepath)
        if not filepath.exists():
            return False

        try:
            with open(filepath, 'r', encoding="utf-8") as f:
                loaded = json.load(f)
        except (OSError, ValueError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
            if self.verbose:
                print(f"āš ļø Failed to load calibration: {e}")
            return False

        if not isinstance(loaded, dict):
            if self.verbose:
                print(f"āš ļø Failed to load calibration: expected a JSON object in {filepath}")
            return False

        # Guarantee both expected keys exist so later lookups cannot fail.
        self.timings = {"training_time": None, "steering_time": None, **loaded}
        if self.verbose:
            print(f"šŸ“‚ Loaded calibration from {filepath}")
        return True

    def estimate_optimization_time(
        self,
        num_tasks: int,
        num_layers: int,
        samples_per_task: int = 1000,
        sample_sizes: Optional[list] = None,
        sample_size_limit: int = 1000,
        include_sample_size_opt: bool = True,
        include_classifier_training: bool = True,
        include_control_vectors: bool = True,
        num_cv_layers: Optional[int] = None
    ) -> Tuple[float, Dict[str, float]]:
        """
        Estimate total optimization time based on calibration.

        Linear scaling from base measurements: 1 task, 1 layer, 10 samples.

        Args:
            num_tasks: Number of tasks.
            num_layers: Number of layers tested during classification.
            samples_per_task: Samples per task.
            sample_sizes: Sample sizes tested in sample-size optimization; the
                phase is skipped when this is empty or None.
            sample_size_limit: Cap applied to the average sample size.
            include_sample_size_opt: Include the sample-size phase.
            include_classifier_training: Include the classifier-training phase.
            include_control_vectors: Include the control-vector phase (only
                counted when a steering time is available).
            num_cv_layers: Layers for control vectors; falls back to
                ``num_layers`` when falsy.

        Returns:
            Tuple of (total_seconds, breakdown_dict). The breakdown always has
            the keys classification, sample_size, classifier_training and
            control_vectors; skipped phases are 0.

        Raises:
            RuntimeError: If no training time has been measured or loaded.
        """
        # Base measurements from calibration (1 task, 1 layer, 10 samples)
        base_training = self.timings.get("training_time")
        base_steering = self.timings.get("steering_time")

        if base_training is None:
            raise RuntimeError("No calibration data available. Run calibration first.")

        sample_scale = samples_per_task / 10
        breakdown = {}

        # Classification optimization: scales linearly with tasks, layers, and samples
        breakdown["classification"] = base_training * num_tasks * num_layers * sample_scale

        # Sample size optimization: tests multiple sample sizes on ONE layer per task
        if include_sample_size_opt and sample_sizes:
            avg_sample_size = sum(sample_sizes) / len(sample_sizes)
            # The average size is capped at sample_size_limit.
            breakdown["sample_size"] = (
                base_training * num_tasks * len(sample_sizes) * (min(avg_sample_size, sample_size_limit) / 10)
            )
        else:
            breakdown["sample_size"] = 0

        # Classifier training: one run per task with full samples
        if include_classifier_training:
            breakdown["classifier_training"] = base_training * num_tasks * sample_scale
        else:
            breakdown["classifier_training"] = 0

        # Control vector generation: skip if no steering calibration
        if include_control_vectors and base_steering is not None:
            cv_layers = num_cv_layers or num_layers
            breakdown["control_vectors"] = base_steering * num_tasks * cv_layers * sample_scale
        else:
            breakdown["control_vectors"] = 0

        total_time = sum(breakdown.values())

        return total_time, breakdown
@@ -0,0 +1,54 @@
1
+ """
2
+ Performance tracking module for wisent-guard.
3
+
4
+ This module provides comprehensive monitoring capabilities including:
5
+ - Memory usage tracking (CPU and GPU)
6
+ - Latency/timing analysis
7
+ - Performance profiling and optimization insights
8
+ """
9
+
10
+ from .memory import (
11
+ MemoryTracker,
12
+ MemorySnapshot,
13
+ MemoryStats,
14
+ get_global_tracker as get_global_memory_tracker,
15
+ track_memory,
16
+ get_memory_info,
17
+ format_memory_usage
18
+ )
19
+
20
+ from .latency import (
21
+ LatencyTracker,
22
+ TimingEvent,
23
+ LatencyStats,
24
+ get_global_tracker as get_global_latency_tracker,
25
+ time_function,
26
+ time_operation,
27
+ get_timing_summary,
28
+ format_timing_summary,
29
+ reset_timing,
30
+ Operations
31
+ )
32
+
33
+ __all__ = [
34
+ # Memory tracking
35
+ "MemoryTracker",
36
+ "MemorySnapshot",
37
+ "MemoryStats",
38
+ "get_global_memory_tracker",
39
+ "track_memory",
40
+ "get_memory_info",
41
+ "format_memory_usage",
42
+
43
+ # Latency tracking
44
+ "LatencyTracker",
45
+ "TimingEvent",
46
+ "LatencyStats",
47
+ "get_global_latency_tracker",
48
+ "time_function",
49
+ "time_operation",
50
+ "get_timing_summary",
51
+ "format_timing_summary",
52
+ "reset_timing",
53
+ "Operations"
54
+ ]