wisent 0.1.1__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of wisent might be problematic. Click here for more details.
- wisent/__init__.py +1 -8
- wisent/benchmarks/__init__.py +0 -0
- wisent/benchmarks/coding/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
- wisent/benchmarks/coding/metrics/evaluator.py +275 -0
- wisent/benchmarks/coding/metrics/passk.py +66 -0
- wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
- wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
- wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
- wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
- wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
- wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
- wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
- wisent/benchmarks/coding/providers/__init__.py +18 -0
- wisent/benchmarks/coding/providers/core/__init__.py +0 -0
- wisent/benchmarks/coding/providers/core/atoms.py +31 -0
- wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
- wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
- wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
- wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
- wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
- wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
- wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
- wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
- wisent/classifiers/__init__.py +0 -0
- wisent/classifiers/core/__init__.py +0 -0
- wisent/classifiers/core/atoms.py +747 -0
- wisent/classifiers/models/__init__.py +0 -0
- wisent/classifiers/models/logistic.py +29 -0
- wisent/classifiers/models/mlp.py +47 -0
- wisent/cli/__init__.py +0 -0
- wisent/cli/classifiers/__init__.py +0 -0
- wisent/cli/classifiers/classifier_rotator.py +137 -0
- wisent/cli/cli_logger.py +142 -0
- wisent/cli/data_loaders/__init__.py +0 -0
- wisent/cli/data_loaders/data_loader_rotator.py +96 -0
- wisent/cli/evaluators/__init__.py +0 -0
- wisent/cli/evaluators/evaluator_rotator.py +148 -0
- wisent/cli/steering_methods/__init__.py +0 -0
- wisent/cli/steering_methods/steering_rotator.py +110 -0
- wisent/cli/wisent_cli/__init__.py +0 -0
- wisent/cli/wisent_cli/commands/__init__.py +0 -0
- wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
- wisent/cli/wisent_cli/commands/listing.py +154 -0
- wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
- wisent/cli/wisent_cli/main.py +93 -0
- wisent/cli/wisent_cli/shell.py +80 -0
- wisent/cli/wisent_cli/ui.py +69 -0
- wisent/cli/wisent_cli/util/__init__.py +0 -0
- wisent/cli/wisent_cli/util/aggregations.py +43 -0
- wisent/cli/wisent_cli/util/parsing.py +126 -0
- wisent/cli/wisent_cli/version.py +4 -0
- wisent/core/__init__.py +27 -0
- wisent/core/activations/__init__.py +0 -0
- wisent/core/activations/activations_collector.py +338 -0
- wisent/core/activations/core/__init__.py +0 -0
- wisent/core/activations/core/atoms.py +216 -0
- wisent/core/agent/__init__.py +18 -0
- wisent/core/agent/budget.py +638 -0
- wisent/core/agent/device_benchmarks.py +685 -0
- wisent/core/agent/diagnose/__init__.py +55 -0
- wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
- wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
- wisent/core/agent/diagnose/create_classifier.py +1154 -0
- wisent/core/agent/diagnose/response_diagnostics.py +268 -0
- wisent/core/agent/diagnose/select_classifiers.py +506 -0
- wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
- wisent/core/agent/diagnose/tasks/__init__.py +33 -0
- wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
- wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
- wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
- wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
- wisent/core/agent/diagnose.py +242 -0
- wisent/core/agent/steer.py +212 -0
- wisent/core/agent/timeout.py +134 -0
- wisent/core/autonomous_agent.py +1234 -0
- wisent/core/bigcode_integration.py +583 -0
- wisent/core/contrastive_pairs/__init__.py +15 -0
- wisent/core/contrastive_pairs/core/__init__.py +0 -0
- wisent/core/contrastive_pairs/core/atoms.py +45 -0
- wisent/core/contrastive_pairs/core/buliders.py +59 -0
- wisent/core/contrastive_pairs/core/pair.py +178 -0
- wisent/core/contrastive_pairs/core/response.py +152 -0
- wisent/core/contrastive_pairs/core/serialization.py +300 -0
- wisent/core/contrastive_pairs/core/set.py +133 -0
- wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
- wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
- wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
- wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
- wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
- wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
- wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
- wisent/core/data_loaders/__init__.py +0 -0
- wisent/core/data_loaders/core/__init__.py +0 -0
- wisent/core/data_loaders/core/atoms.py +98 -0
- wisent/core/data_loaders/loaders/__init__.py +0 -0
- wisent/core/data_loaders/loaders/custom.py +120 -0
- wisent/core/data_loaders/loaders/lm_loader.py +218 -0
- wisent/core/detection_handling.py +257 -0
- wisent/core/download_full_benchmarks.py +1386 -0
- wisent/core/evaluators/__init__.py +0 -0
- wisent/core/evaluators/oracles/__init__.py +0 -0
- wisent/core/evaluators/oracles/interactive.py +73 -0
- wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
- wisent/core/evaluators/oracles/user_specified.py +67 -0
- wisent/core/hyperparameter_optimizer.py +429 -0
- wisent/core/lm_eval_harness_ground_truth.py +1396 -0
- wisent/core/log_likelihoods_evaluator.py +321 -0
- wisent/core/managed_cached_benchmarks.py +595 -0
- wisent/core/mixed_benchmark_sampler.py +364 -0
- wisent/core/model_config_manager.py +330 -0
- wisent/core/model_persistence.py +317 -0
- wisent/core/models/__init__.py +0 -0
- wisent/core/models/core/__init__.py +0 -0
- wisent/core/models/core/atoms.py +460 -0
- wisent/core/models/wisent_model.py +727 -0
- wisent/core/multi_steering.py +316 -0
- wisent/core/optuna/__init__.py +57 -0
- wisent/core/optuna/classifier/__init__.py +25 -0
- wisent/core/optuna/classifier/activation_generator.py +349 -0
- wisent/core/optuna/classifier/classifier_cache.py +509 -0
- wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
- wisent/core/optuna/steering/__init__.py +0 -0
- wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
- wisent/core/optuna/steering/data_utils.py +342 -0
- wisent/core/optuna/steering/metrics.py +474 -0
- wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
- wisent/core/optuna/steering/steering_optimization.py +1111 -0
- wisent/core/parser.py +1668 -0
- wisent/core/prompts/__init__.py +0 -0
- wisent/core/prompts/core/__init__.py +0 -0
- wisent/core/prompts/core/atom.py +57 -0
- wisent/core/prompts/core/prompt_formater.py +157 -0
- wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
- wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
- wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
- wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
- wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
- wisent/core/representation.py +5 -0
- wisent/core/sample_size_optimizer.py +648 -0
- wisent/core/sample_size_optimizer_v2.py +355 -0
- wisent/core/save_results.py +277 -0
- wisent/core/steering.py +652 -0
- wisent/core/steering_method.py +26 -0
- wisent/core/steering_methods/__init__.py +0 -0
- wisent/core/steering_methods/core/__init__.py +0 -0
- wisent/core/steering_methods/core/atoms.py +153 -0
- wisent/core/steering_methods/methods/__init__.py +0 -0
- wisent/core/steering_methods/methods/caa.py +44 -0
- wisent/core/steering_optimizer.py +1297 -0
- wisent/core/task_interface.py +132 -0
- wisent/core/task_selector.py +189 -0
- wisent/core/tasks/__init__.py +175 -0
- wisent/core/tasks/aime_task.py +141 -0
- wisent/core/tasks/file_task.py +211 -0
- wisent/core/tasks/hle_task.py +180 -0
- wisent/core/tasks/hmmt_task.py +119 -0
- wisent/core/tasks/livecodebench_task.py +201 -0
- wisent/core/tasks/livemathbench_task.py +158 -0
- wisent/core/tasks/lm_eval_task.py +455 -0
- wisent/core/tasks/math500_task.py +84 -0
- wisent/core/tasks/polymath_task.py +146 -0
- wisent/core/tasks/supergpqa_task.py +220 -0
- wisent/core/time_estimator.py +149 -0
- wisent/core/timing_calibration.py +174 -0
- wisent/core/tracking/__init__.py +54 -0
- wisent/core/tracking/latency.py +618 -0
- wisent/core/tracking/memory.py +359 -0
- wisent/core/trainers/__init__.py +0 -0
- wisent/core/trainers/core/__init__.py +11 -0
- wisent/core/trainers/core/atoms.py +45 -0
- wisent/core/trainers/steering_trainer.py +271 -0
- wisent/core/user_model_config.py +158 -0
- wisent/opti/__init__.py +0 -0
- wisent/opti/core/__init__.py +0 -0
- wisent/opti/core/atoms.py +175 -0
- wisent/opti/methods/__init__.py +0 -0
- wisent/opti/methods/opti_classificator.py +172 -0
- wisent/opti/methods/opti_steering.py +138 -0
- wisent/synthetic/__init__.py +0 -0
- wisent/synthetic/cleaners/__init__.py +0 -0
- wisent/synthetic/cleaners/core/__init__.py +0 -0
- wisent/synthetic/cleaners/core/atoms.py +58 -0
- wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
- wisent/synthetic/cleaners/methods/__init__.py +0 -0
- wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
- wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
- wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
- wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
- wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
- wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
- wisent/synthetic/db_instructions/__init__.py +0 -0
- wisent/synthetic/db_instructions/core/__init__.py +0 -0
- wisent/synthetic/db_instructions/core/atoms.py +25 -0
- wisent/synthetic/db_instructions/mini_dp.py +37 -0
- wisent/synthetic/generators/__init__.py +0 -0
- wisent/synthetic/generators/core/__init__.py +0 -0
- wisent/synthetic/generators/core/atoms.py +73 -0
- wisent/synthetic/generators/diversities/__init__.py +0 -0
- wisent/synthetic/generators/diversities/core/__init__.py +0 -0
- wisent/synthetic/generators/diversities/core/core.py +68 -0
- wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
- wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
- wisent/synthetic/generators/pairs_generator.py +179 -0
- wisent-0.5.2.dist-info/METADATA +67 -0
- wisent-0.5.2.dist-info/RECORD +218 -0
- {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info}/WHEEL +1 -1
- {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info/licenses}/LICENSE +2 -2
- wisent/activations/__init__.py +0 -9
- wisent/activations/client.py +0 -97
- wisent/activations/extractor.py +0 -251
- wisent/activations/models.py +0 -95
- wisent/client.py +0 -45
- wisent/control_vector/__init__.py +0 -9
- wisent/control_vector/client.py +0 -85
- wisent/control_vector/manager.py +0 -168
- wisent/control_vector/models.py +0 -70
- wisent/inference/__init__.py +0 -9
- wisent/inference/client.py +0 -103
- wisent/inference/inferencer.py +0 -250
- wisent/inference/models.py +0 -66
- wisent/utils/__init__.py +0 -3
- wisent/utils/auth.py +0 -30
- wisent/utils/http.py +0 -228
- wisent/version.py +0 -3
- wisent-0.1.1.dist-info/METADATA +0 -142
- wisent-0.1.1.dist-info/RECORD +0 -23
- {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SuperGPQA task implementation for task-agnostic architecture.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from typing import Dict, Any, List, Optional
|
|
6
|
+
from datasets import load_dataset
|
|
7
|
+
from ..task_interface import TaskInterface
|
|
8
|
+
from ..benchmark_extractors import SuperGPQAExtractor
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class SuperGPQATask(TaskInterface):
    """Task wrapper around the ``m-a-p/SuperGPQA`` multiple-choice dataset.

    Rows are loaded from the ``train`` split, optionally filtered by
    discipline, field, difficulty and calculation flag, and converted to
    plain dictionaries.
    """

    def __init__(self, discipline_filter: Optional[str] = None, difficulty_filter: Optional[str] = None,
                 calculation_only: Optional[bool] = None, limit: Optional[int] = None):
        """Store the filter settings and create the extractor.

        Args:
            discipline_filter: Keep only rows whose ``discipline`` equals this value.
            difficulty_filter: Keep only rows whose ``difficulty`` equals this value.
            calculation_only: True keeps calculation rows only, False keeps
                non-calculation rows only, None disables the filter.
            limit: Default cap on the number of rows returned by ``load_data``.
        """
        # Lazily populated by test_docs().
        self._data = None
        self._extractor = SuperGPQAExtractor()

        # Subclasses overwrite this after calling super().__init__().
        self.field_filter = None

        self.limit = limit
        self.calculation_only = calculation_only
        self.difficulty_filter = difficulty_filter
        self.discipline_filter = discipline_filter
        self.dataset_name = "m-a-p/SuperGPQA"

    def load_data(self, limit: Optional[int] = None) -> List[Dict[str, Any]]:
        """Load the dataset, apply the configured filters, then cap the result.

        A truthy ``limit`` argument takes precedence over ``self.limit``; when
        neither is truthy every matching row is returned.
        """
        records = self._filter_and_process(load_dataset(self.dataset_name, split="train"))
        cap = limit if limit else self.limit
        return records[:cap] if cap else records

    def _filter_and_process(self, dataset) -> List[Dict[str, Any]]:
        """Drop rows that fail any active filter and convert the rest to the internal format."""
        # Only filters with a truthy value are active.
        candidate_filters = (
            ('discipline', self.discipline_filter),
            ('field', self.field_filter),
            ('difficulty', self.difficulty_filter),
        )
        equality_filters = [(key, wanted) for key, wanted in candidate_filters if wanted]
        wants_calculation = self.calculation_only

        records = []
        for row in dataset:
            if any(row.get(key) != wanted for key, wanted in equality_filters):
                continue

            # The row's calculation flag must agree with the requested one.
            if wants_calculation is not None:
                if bool(wants_calculation) != bool(row.get('is_calculation', False)):
                    continue

            # Built stepwise so the key order matches the documented layout.
            record = {name: row.get(name, '') for name in ('uuid', 'question')}
            record['options'] = row.get('options', [])
            record.update(
                (name, row.get(name, ''))
                for name in ('answer', 'answer_letter', 'discipline', 'field', 'subfield', 'difficulty')
            )
            record['is_calculation'] = row.get('is_calculation', False)
            record['metadata'] = {'dataset': self.dataset_name}
            records.append(record)

        return records

    def get_extractor(self) -> SuperGPQAExtractor:
        """Return the extractor instance created in ``__init__``."""
        return self._extractor

    def get_name(self) -> str:
        """Return ``supergpqa`` followed by one suffix per active filter."""
        suffixes = []
        if self.discipline_filter:
            suffixes.append(f"_{self.discipline_filter.lower()}")
        if self.difficulty_filter:
            suffixes.append(f"_{self.difficulty_filter.lower()}")
        if self.calculation_only is not None:
            suffixes.append("_calc" if self.calculation_only else "_nocalc")
        return "supergpqa" + "".join(suffixes)

    def get_description(self) -> str:
        """Return a one-line description, listing the active filters if any."""
        desc = "SuperGPQA: Large-scale dataset of scientific multiple-choice questions across disciplines"

        active = []
        if self.discipline_filter:
            active.append(f"discipline: {self.discipline_filter}")
        if self.difficulty_filter:
            active.append(f"difficulty: {self.difficulty_filter}")
        if self.calculation_only is not None:
            active.append("calculation problems only" if self.calculation_only else "non-calculation problems only")

        if not active:
            return desc
        return desc + f" (filtered: {', '.join(active)})"

    def get_categories(self) -> List[str]:
        """Return the fixed list of category tags for this task."""
        return ["science", "reasoning", "multiple_choice", "knowledge"]

    def get_task_info(self) -> Dict[str, Any]:
        """Return a summary dictionary: name, description, source, type and filters."""
        active_filters = {
            "discipline": self.discipline_filter,
            "difficulty": self.difficulty_filter,
            "calculation_only": self.calculation_only
        }
        info = {
            "task_name": self.get_name(),
            "description": self.get_description(),
            "source": self.dataset_name,
            "task_type": "multiple_choice",
            "evaluation_method": "exact_match",
        }
        info["filters"] = active_filters
        return info

    def validate_sample(self, sample: Dict[str, Any]) -> bool:
        """Return True when the sample contains every required key."""
        for name in ("question", "options", "answer", "answer_letter"):
            if name not in sample:
                return False
        return True

    # Methods to match lm-eval interface
    def has_validation_docs(self) -> bool:
        """Return False: this task exposes no validation documents."""
        return False

    def has_test_docs(self) -> bool:
        """Return True: every loaded sample is served as a test document."""
        return True

    def test_docs(self) -> List[Dict[str, Any]]:
        """Return the loaded samples, loading and caching them on first use."""
        if self._data is None:
            self._data = self.load_data()
        return self._data

    def validation_docs(self) -> List[Dict[str, Any]]:
        """Return an empty list (no validation documents)."""
        return []

    def doc_to_text(self, doc: Dict[str, Any]) -> str:
        """Render the question followed by its options lettered from ``A``."""
        question = doc.get('question', '')
        options = doc.get('options', [])

        if not options:
            return question

        lettered = [f"{chr(ord('A') + index)}. {option}" for index, option in enumerate(options)]
        return f"{question}\n\n" + "\n".join(lettered)
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
class SuperGPQAPhysicsTask(SuperGPQATask):
    """SuperGPQA restricted to rows with discipline ``Science`` and field ``Physics``."""

    def __init__(self, difficulty_filter: Optional[str] = None, calculation_only: Optional[bool] = None,
                 limit: Optional[int] = None):
        """Forward the optional filters to the base task with the discipline pinned to Science."""
        base_settings = {
            "discipline_filter": "Science",
            "difficulty_filter": difficulty_filter,
            "calculation_only": calculation_only,
            "limit": limit,
        }
        super().__init__(**base_settings)
        # Set after the base initialiser, which resets field_filter to None.
        self.field_filter = "Physics"

    def get_name(self) -> str:
        """Return the fixed task name, independent of the other filters."""
        return "supergpqa_physics"
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
class SuperGPQAChemistryTask(SuperGPQATask):
    """SuperGPQA restricted to rows with discipline ``Science`` and field ``Chemistry``."""

    def __init__(self, difficulty_filter: Optional[str] = None, calculation_only: Optional[bool] = None,
                 limit: Optional[int] = None):
        """Forward the optional filters to the base task with the discipline pinned to Science."""
        base_settings = {
            "discipline_filter": "Science",
            "difficulty_filter": difficulty_filter,
            "calculation_only": calculation_only,
            "limit": limit,
        }
        super().__init__(**base_settings)
        # Set after the base initialiser, which resets field_filter to None.
        self.field_filter = "Chemistry"

    def get_name(self) -> str:
        """Return the fixed task name, independent of the other filters."""
        return "supergpqa_chemistry"
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
class SuperGPQABiologyTask(SuperGPQATask):
    """SuperGPQA restricted to rows with discipline ``Science`` and field ``Biology``."""

    def __init__(self, difficulty_filter: Optional[str] = None, calculation_only: Optional[bool] = None,
                 limit: Optional[int] = None):
        """Forward the optional filters to the base task with the discipline pinned to Science."""
        base_settings = {
            "discipline_filter": "Science",
            "difficulty_filter": difficulty_filter,
            "calculation_only": calculation_only,
            "limit": limit,
        }
        super().__init__(**base_settings)
        # Set after the base initialiser, which resets field_filter to None.
        self.field_filter = "Biology"

    def get_name(self) -> str:
        """Return the fixed task name, independent of the other filters."""
        return "supergpqa_biology"
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
"""Time estimation for optimization operations using runtime calibration"""
|
|
2
|
+
import time
|
|
3
|
+
from typing import Dict, Tuple, Optional
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from .timing_calibration import TimingCalibrator
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class OptimizationTimeEstimator:
    """Estimate wall-clock time for optimization runs from a calibration measurement.

    On construction the estimator reads the model's layer count and obtains
    calibration timings, either from ``calibration_file`` when that file exists
    or by running ``TimingCalibrator.run_calibration``. The estimate methods
    delegate the arithmetic to ``TimingCalibrator.estimate_optimization_time``.
    """

    def __init__(
        self,
        model_name: str,
        verbose: bool = True,
        skip_calibration: bool = False,
        calibration_file: Optional[Path] = None,
        calibrate_only: bool = False
    ):
        """Load the layer count and calibration timings for ``model_name``.

        Args:
            model_name: Name passed to ``Model`` and to the calibrator.
            verbose: Whether progress messages are printed.
            skip_calibration: Unsupported; passing True raises.
            calibration_file: Optional path to load timings from (when it
                exists) or to save freshly measured timings to.
            calibrate_only: Stored on the instance; not used by this class.

        Raises:
            RuntimeError: If ``skip_calibration`` is True, the model layer count
                cannot be determined, or the calibration file cannot be loaded
                or has no training measurement.
        """
        # Fail fast: rejecting the unsupported flag does not need a model load.
        if skip_calibration:
            raise RuntimeError("Calibration cannot be skipped. Accurate timing requires calibration.")

        self.model_name = model_name
        self.verbose = verbose
        self.calibrate_only = calibrate_only
        self.calibrator = TimingCalibrator(verbose=verbose)
        self.total_layers = self._detect_total_layers(model_name)

        if calibration_file and calibration_file.exists():
            if not self.calibrator.load_from_file(calibration_file):
                raise RuntimeError(f"Failed to load calibration from {calibration_file}")
            self.timing = self.calibrator.timings
            # Only the training measurement is required. The calibrator's
            # run_calibration stores steering_time as None, so also demanding a
            # steering value would reject every file saved after calibration.
            if not isinstance(self.timing, dict) or self.timing.get("training_time") is None:
                raise RuntimeError(f"Calibration file {calibration_file} contains invalid data")
        else:
            if verbose:
                print(f"\nš§ Running timing calibration for {model_name}...")

            self.timing = self.calibrator.run_calibration(model_name)

            # Persist the measurement so later runs can skip calibration.
            if calibration_file:
                self.calibrator.save_to_file(calibration_file)

    @staticmethod
    def _detect_total_layers(model_name: str) -> int:
        """Return the layer count read from the loaded model's configuration."""
        from . import Model

        model = Model(name=model_name)
        if not (hasattr(model, 'model') and hasattr(model.model, 'config')):
            raise RuntimeError(f"Cannot access model configuration for {model_name}")

        config = model.model.config
        # Configurations expose the layer count under one of these two names.
        for attribute in ('num_hidden_layers', 'n_layer'):
            if hasattr(config, attribute):
                return getattr(config, attribute)
        raise RuntimeError(f"Cannot determine number of layers for model {model_name}")

    def estimate_classification_time(
        self,
        num_tasks: int,
        sample_limit: int = 200,
        layers: Optional[list] = None
    ) -> Tuple[float, Dict[str, float]]:
        """
        Estimate time for classification optimization.

        Args:
            num_tasks: Number of tasks to optimize.
            sample_limit: Samples per task.
            layers: Layers to test; when empty or None, ``min(5, total_layers)``
                layers are assumed.

        Returns:
            Tuple of (total_seconds, breakdown) where the breakdown holds the
            single key ``"classification"``.
        """
        num_layers = len(layers) if layers else min(5, self.total_layers)

        # The calibrator's own breakdown is discarded in favour of a single entry.
        total_time, _ = self.calibrator.estimate_optimization_time(
            num_tasks=num_tasks,
            num_layers=num_layers,
            samples_per_task=sample_limit,
            include_sample_size_opt=False,
            include_classifier_training=False,
            include_control_vectors=False
        )

        return total_time, {"classification": total_time}

    def estimate_full_optimization_time(
        self,
        num_tasks: int,
        classification_limit: int = 200,
        sample_sizes: Optional[list] = None,
        sample_size_limit: int = 1000,
        include_sample_size_opt: bool = True,
        include_classifier_training: bool = True,
        include_control_vectors: bool = True
    ) -> Tuple[float, Dict[str, float]]:
        """
        Estimate time for full optimization pipeline.

        Args:
            num_tasks: Number of tasks to optimize.
            classification_limit: Samples per task for classification.
            sample_sizes: Sample sizes to evaluate; required.
            sample_size_limit: Forwarded to the calibrator.
            include_sample_size_opt: Forwarded to the calibrator.
            include_classifier_training: Forwarded to the calibrator.
            include_control_vectors: Forwarded to the calibrator.

        Returns:
            Tuple of (total_seconds, breakdown)

        Raises:
            RuntimeError: If ``sample_sizes`` is None.
        """
        if sample_sizes is None:
            raise RuntimeError("sample_sizes must be provided for full optimization time estimation")

        # Classification assumes up to 5 layers, control vectors up to 10.
        num_layers = min(5, self.total_layers)
        cv_layers = min(10, self.total_layers)

        return self.calibrator.estimate_optimization_time(
            num_tasks=num_tasks,
            num_layers=num_layers,
            samples_per_task=classification_limit,
            sample_sizes=sample_sizes,
            sample_size_limit=sample_size_limit,
            include_sample_size_opt=include_sample_size_opt,
            include_classifier_training=include_classifier_training,
            include_control_vectors=include_control_vectors,
            num_cv_layers=cv_layers
        )

    @staticmethod
    def format_time(seconds: float) -> str:
        """Format a duration in seconds as seconds, minutes, or hours and minutes.

        The value is rounded to whole seconds, then to whole minutes, and the
        minutes are split with ``divmod`` so the hour count is never rounded up
        (5400 seconds is "1 hours 30 minutes", not "2 hours 30 minutes").
        """
        whole_seconds = round(seconds)
        if whole_seconds < 60:
            return f"{whole_seconds} seconds"

        whole_minutes = round(whole_seconds / 60)
        if whole_minutes < 60:
            return f"{whole_minutes} minutes"

        hours, minutes = divmod(whole_minutes, 60)
        if minutes:
            return f"{hours} hours {minutes} minutes"
        return f"{hours} hours"

    def print_time_breakdown(self, total_time: float, breakdown: Dict[str, float]):
        """Print the total estimate and, when there are several phases, each positive phase."""
        print(f"\nā±ļø ESTIMATED OPTIMIZATION TIME:")
        print(f" Total: {self.format_time(total_time)}")

        # A single-entry breakdown would only repeat the total.
        if len(breakdown) > 1:
            print("\n Breakdown:")
            for phase, time_sec in breakdown.items():
                if time_sec > 0:
                    print(f" - {phase.replace('_', ' ').title()}: {self.format_time(time_sec)}")
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
"""Runtime timing calibration for optimization time estimation"""
|
|
2
|
+
import time
|
|
3
|
+
import json
|
|
4
|
+
import subprocess
|
|
5
|
+
import sys
|
|
6
|
+
from typing import Dict, Optional, Tuple
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class TimingCalibrator:
|
|
11
|
+
"""Measures actual optimization timing on the current system"""
|
|
12
|
+
|
|
13
|
+
def __init__(self, verbose: bool = True):
|
|
14
|
+
self.verbose = verbose
|
|
15
|
+
self.timings = {
|
|
16
|
+
"training_time": None, # Time for training command
|
|
17
|
+
"steering_time": None, # Time for steering command
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
def run_calibration(self, model_name: str) -> Dict[str, Optional[float]]:
    """
    Measure how long one small training run takes on this machine.

    Launches the ``optimize-classification`` CLI command in a subprocess for
    the ``arc_easy`` task, limited to 10 samples and a single layer (the
    middle layer of the model), and records the elapsed wall-clock time as
    ``training_time``. Steering is not measured: ``steering_time`` is set to
    None.

    Args:
        model_name: Model to calibrate timing for

    Returns:
        The timings dictionary with ``training_time`` in seconds and
        ``steering_time`` set to None.

    Raises:
        RuntimeError: If the layer count cannot be read from the model
            configuration, or the calibration subprocess exits non-zero.
    """
    # Get model layer count
    from . import Model

    model = Model(name=model_name)
    if not (hasattr(model, 'model') and hasattr(model.model, 'config')):
        raise RuntimeError(f"Cannot access model config for {model_name}")

    config = model.model.config
    if hasattr(config, 'num_hidden_layers'):
        total_layers = config.num_hidden_layers
    elif hasattr(config, 'n_layer'):
        total_layers = config.n_layer
    else:
        raise RuntimeError(f"Cannot determine number of layers for model {model_name}")

    # Use middle layer for calibration
    calibration_layer = total_layers // 2

    if self.verbose:
        print(f"\nš§ Running timing calibration for {model_name}...")
        print(" Task: arc_easy")
        print(f" Layer: {calibration_layer}")
        print(" Samples: 10")

    # 1. Measure training time
    if self.verbose:
        print("\nš Measuring training time...")

    # NOTE(review): the command targets the module `wisent_guard.cli`; confirm
    # that module is importable in the environment this package is installed in.
    cmd = [
        sys.executable, "-m", "wisent_guard.cli",
        "optimize-classification",
        model_name,
        "--tasks", "arc_easy",
        "--limit", "10",
        "--layer-range", f"{calibration_layer},{calibration_layer}",
        "--no-save",
        "--skip-timing-estimation"
    ]

    # perf_counter is monotonic, so the measurement cannot be skewed by
    # system clock adjustments while the subprocess runs.
    start_time = time.perf_counter()
    result = subprocess.run(cmd, capture_output=True, text=True)
    elapsed = time.perf_counter() - start_time

    if result.returncode != 0:
        raise RuntimeError(f"Training calibration failed:\n{result.stderr}")

    self.timings["training_time"] = elapsed

    # 2. Skip steering calibration - it's too complex for quick calibration
    # Users can run steering separately if needed
    if self.verbose:
        print("\nš Skipping steering calibration (too complex for quick estimate)")

    self.timings["steering_time"] = None

    if self.verbose:
        print("\n✅ Calibration complete!")
        print(f" Training time: {self.timings['training_time']:.3f}s")
        # steering_time is None here; applying ':.3f' to None raises TypeError,
        # so it is rendered as text instead.
        steering_time = self.timings["steering_time"]
        steering_text = f"{steering_time:.3f}s" if steering_time is not None else "skipped"
        print(f" Steering time: {steering_text}")

    return self.timings
|
|
90
|
+
|
|
91
|
+
def save_to_file(self, filepath: Path):
    """Write the current calibration timings to ``filepath`` as indented JSON.

    Args:
        filepath: Destination file; it is opened in text write mode, so any
            existing content is replaced.
    """
    with open(filepath, 'w') as out_file:
        json.dump(self.timings, out_file, indent=2)

    # Nothing left to do unless progress output was requested.
    if not self.verbose:
        return
    print(f"š¾ Saved calibration to {filepath}")
|
|
97
|
+
|
|
98
|
+
def load_from_file(self, filepath: Path) -> bool:
    """Load calibration results from a JSON file into ``self.timings``.

    Args:
        filepath: Path of a calibration file holding a JSON object.

    Returns:
        True when the file was read and contained a JSON object, in which
        case ``self.timings`` is replaced by it. False when the file does
        not exist, cannot be read or parsed, or holds anything other than a
        JSON object; ``self.timings`` is left untouched in those cases.
    """
    if not filepath.exists():
        return False

    try:
        with open(filepath, 'r') as f:
            loaded = json.load(f)
        # estimate_optimization_time looks timings up by key, so a list,
        # number or string payload is rejected here instead of being
        # reported as a successful load and failing later.
        if not isinstance(loaded, dict):
            raise ValueError(
                f"expected a JSON object, got {type(loaded).__name__}"
            )
    except Exception as e:
        # Deliberately broad: loading is best-effort, and any failure simply
        # means "no usable calibration on disk".
        if self.verbose:
            print(f"ā ļø Failed to load calibration: {e}")
        return False

    # Only replace the in-memory timings once the payload is known to be good.
    self.timings = loaded
    if self.verbose:
        print(f"š Loaded calibration from {filepath}")
    return True
|
|
113
|
+
|
|
114
|
+
def estimate_optimization_time(
    self,
    num_tasks: int,
    num_layers: int,
    samples_per_task: int = 1000,
    sample_sizes: list = None,
    sample_size_limit: int = 1000,
    include_sample_size_opt: bool = True,
    include_classifier_training: bool = True,
    include_control_vectors: bool = True,
    num_cv_layers: int = None
) -> Tuple[float, Dict[str, float]]:
    """
    Estimate total optimization time based on calibration.

    Linear scaling from base measurements: 1 task, 1 layer, 10 samples.

    Args:
        num_tasks: Number of tasks to be optimized.
        num_layers: Number of layers searched during classification
            optimization.
        samples_per_task: Samples used per task for classification,
            classifier training and control vectors.
        sample_sizes: Sample sizes to be tried by sample-size optimization;
            ``None`` or an empty list disables that phase.
        sample_size_limit: Upper bound applied to the average sample size
            when estimating the sample-size phase.
        include_sample_size_opt: Whether to include the sample-size phase.
        include_classifier_training: Whether to include the final
            classifier training phase.
        include_control_vectors: Whether to include control vector
            generation (only estimated when a steering time is available).
        num_cv_layers: Layers used for control vectors; any falsy value
            (``None`` or ``0``) falls back to ``num_layers``.

    Returns:
        Tuple of (total_seconds, breakdown_dict), where the breakdown has
        the keys ``classification``, ``sample_size``,
        ``classifier_training`` and ``control_vectors``.

    Raises:
        RuntimeError: If no training-time calibration is available.
    """
    # .get() rather than indexing: timings restored from disk may lack a
    # key, and that must surface as the documented RuntimeError (training)
    # or as "no steering calibration" instead of a KeyError.
    base_training = self.timings.get("training_time")
    if base_training is None:
        raise RuntimeError("No calibration data available. Run calibration first.")
    base_steering = self.timings.get("steering_time")

    # The base measurements are taken as the cost of 10 samples, so every
    # estimate is scaled by (samples / 10).
    calibration_samples = 10
    sample_scale = samples_per_task / calibration_samples

    breakdown = {}

    # Classification optimization: scales linearly with tasks, layers, and samples
    breakdown["classification"] = base_training * num_tasks * num_layers * sample_scale

    # Sample size optimization: tests multiple sample sizes on ONE layer per task
    if include_sample_size_opt and sample_sizes:
        avg_sample_size = sum(sample_sizes) / len(sample_sizes)
        # The average sample size is capped at sample_size_limit
        capped_scale = min(avg_sample_size, sample_size_limit) / calibration_samples
        breakdown["sample_size"] = base_training * num_tasks * len(sample_sizes) * capped_scale
    else:
        breakdown["sample_size"] = 0

    # Classifier training: one run per task with full samples
    if include_classifier_training:
        breakdown["classifier_training"] = base_training * num_tasks * sample_scale
    else:
        breakdown["classifier_training"] = 0

    # Control vector generation: skip if no steering calibration
    if include_control_vectors and base_steering is not None:
        cv_layers = num_cv_layers or num_layers
        breakdown["control_vectors"] = base_steering * num_tasks * cv_layers * sample_scale
    else:
        breakdown["control_vectors"] = 0

    total_time = sum(breakdown.values())

    return total_time, breakdown
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Performance tracking module for wisent-guard.
|
|
3
|
+
|
|
4
|
+
This module provides comprehensive monitoring capabilities including:
|
|
5
|
+
- Memory usage tracking (CPU and GPU)
|
|
6
|
+
- Latency/timing analysis
|
|
7
|
+
- Performance profiling and optimization insights
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from .memory import (
|
|
11
|
+
MemoryTracker,
|
|
12
|
+
MemorySnapshot,
|
|
13
|
+
MemoryStats,
|
|
14
|
+
get_global_tracker as get_global_memory_tracker,
|
|
15
|
+
track_memory,
|
|
16
|
+
get_memory_info,
|
|
17
|
+
format_memory_usage
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
from .latency import (
|
|
21
|
+
LatencyTracker,
|
|
22
|
+
TimingEvent,
|
|
23
|
+
LatencyStats,
|
|
24
|
+
get_global_tracker as get_global_latency_tracker,
|
|
25
|
+
time_function,
|
|
26
|
+
time_operation,
|
|
27
|
+
get_timing_summary,
|
|
28
|
+
format_timing_summary,
|
|
29
|
+
reset_timing,
|
|
30
|
+
Operations
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
__all__ = [
|
|
34
|
+
# Memory tracking
|
|
35
|
+
"MemoryTracker",
|
|
36
|
+
"MemorySnapshot",
|
|
37
|
+
"MemoryStats",
|
|
38
|
+
"get_global_memory_tracker",
|
|
39
|
+
"track_memory",
|
|
40
|
+
"get_memory_info",
|
|
41
|
+
"format_memory_usage",
|
|
42
|
+
|
|
43
|
+
# Latency tracking
|
|
44
|
+
"LatencyTracker",
|
|
45
|
+
"TimingEvent",
|
|
46
|
+
"LatencyStats",
|
|
47
|
+
"get_global_latency_tracker",
|
|
48
|
+
"time_function",
|
|
49
|
+
"time_operation",
|
|
50
|
+
"get_timing_summary",
|
|
51
|
+
"format_timing_summary",
|
|
52
|
+
"reset_timing",
|
|
53
|
+
"Operations"
|
|
54
|
+
]
|