wisent 0.1.1__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of wisent might be problematic. Click here for more details.
- wisent/__init__.py +1 -8
- wisent/benchmarks/__init__.py +0 -0
- wisent/benchmarks/coding/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
- wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
- wisent/benchmarks/coding/metrics/evaluator.py +275 -0
- wisent/benchmarks/coding/metrics/passk.py +66 -0
- wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
- wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
- wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
- wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
- wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
- wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
- wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
- wisent/benchmarks/coding/providers/__init__.py +18 -0
- wisent/benchmarks/coding/providers/core/__init__.py +0 -0
- wisent/benchmarks/coding/providers/core/atoms.py +31 -0
- wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
- wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
- wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
- wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
- wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
- wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
- wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
- wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
- wisent/classifiers/__init__.py +0 -0
- wisent/classifiers/core/__init__.py +0 -0
- wisent/classifiers/core/atoms.py +747 -0
- wisent/classifiers/models/__init__.py +0 -0
- wisent/classifiers/models/logistic.py +29 -0
- wisent/classifiers/models/mlp.py +47 -0
- wisent/cli/__init__.py +0 -0
- wisent/cli/classifiers/__init__.py +0 -0
- wisent/cli/classifiers/classifier_rotator.py +137 -0
- wisent/cli/cli_logger.py +142 -0
- wisent/cli/data_loaders/__init__.py +0 -0
- wisent/cli/data_loaders/data_loader_rotator.py +96 -0
- wisent/cli/evaluators/__init__.py +0 -0
- wisent/cli/evaluators/evaluator_rotator.py +148 -0
- wisent/cli/steering_methods/__init__.py +0 -0
- wisent/cli/steering_methods/steering_rotator.py +110 -0
- wisent/cli/wisent_cli/__init__.py +0 -0
- wisent/cli/wisent_cli/commands/__init__.py +0 -0
- wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
- wisent/cli/wisent_cli/commands/listing.py +154 -0
- wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
- wisent/cli/wisent_cli/main.py +93 -0
- wisent/cli/wisent_cli/shell.py +80 -0
- wisent/cli/wisent_cli/ui.py +69 -0
- wisent/cli/wisent_cli/util/__init__.py +0 -0
- wisent/cli/wisent_cli/util/aggregations.py +43 -0
- wisent/cli/wisent_cli/util/parsing.py +126 -0
- wisent/cli/wisent_cli/version.py +4 -0
- wisent/core/__init__.py +27 -0
- wisent/core/activations/__init__.py +0 -0
- wisent/core/activations/activations_collector.py +338 -0
- wisent/core/activations/core/__init__.py +0 -0
- wisent/core/activations/core/atoms.py +216 -0
- wisent/core/agent/__init__.py +18 -0
- wisent/core/agent/budget.py +638 -0
- wisent/core/agent/device_benchmarks.py +685 -0
- wisent/core/agent/diagnose/__init__.py +55 -0
- wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
- wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
- wisent/core/agent/diagnose/create_classifier.py +1154 -0
- wisent/core/agent/diagnose/response_diagnostics.py +268 -0
- wisent/core/agent/diagnose/select_classifiers.py +506 -0
- wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
- wisent/core/agent/diagnose/tasks/__init__.py +33 -0
- wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
- wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
- wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
- wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
- wisent/core/agent/diagnose.py +242 -0
- wisent/core/agent/steer.py +212 -0
- wisent/core/agent/timeout.py +134 -0
- wisent/core/autonomous_agent.py +1234 -0
- wisent/core/bigcode_integration.py +583 -0
- wisent/core/contrastive_pairs/__init__.py +15 -0
- wisent/core/contrastive_pairs/core/__init__.py +0 -0
- wisent/core/contrastive_pairs/core/atoms.py +45 -0
- wisent/core/contrastive_pairs/core/buliders.py +59 -0
- wisent/core/contrastive_pairs/core/pair.py +178 -0
- wisent/core/contrastive_pairs/core/response.py +152 -0
- wisent/core/contrastive_pairs/core/serialization.py +300 -0
- wisent/core/contrastive_pairs/core/set.py +133 -0
- wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
- wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
- wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
- wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
- wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
- wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
- wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
- wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
- wisent/core/data_loaders/__init__.py +0 -0
- wisent/core/data_loaders/core/__init__.py +0 -0
- wisent/core/data_loaders/core/atoms.py +98 -0
- wisent/core/data_loaders/loaders/__init__.py +0 -0
- wisent/core/data_loaders/loaders/custom.py +120 -0
- wisent/core/data_loaders/loaders/lm_loader.py +218 -0
- wisent/core/detection_handling.py +257 -0
- wisent/core/download_full_benchmarks.py +1386 -0
- wisent/core/evaluators/__init__.py +0 -0
- wisent/core/evaluators/oracles/__init__.py +0 -0
- wisent/core/evaluators/oracles/interactive.py +73 -0
- wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
- wisent/core/evaluators/oracles/user_specified.py +67 -0
- wisent/core/hyperparameter_optimizer.py +429 -0
- wisent/core/lm_eval_harness_ground_truth.py +1396 -0
- wisent/core/log_likelihoods_evaluator.py +321 -0
- wisent/core/managed_cached_benchmarks.py +595 -0
- wisent/core/mixed_benchmark_sampler.py +364 -0
- wisent/core/model_config_manager.py +330 -0
- wisent/core/model_persistence.py +317 -0
- wisent/core/models/__init__.py +0 -0
- wisent/core/models/core/__init__.py +0 -0
- wisent/core/models/core/atoms.py +460 -0
- wisent/core/models/wisent_model.py +727 -0
- wisent/core/multi_steering.py +316 -0
- wisent/core/optuna/__init__.py +57 -0
- wisent/core/optuna/classifier/__init__.py +25 -0
- wisent/core/optuna/classifier/activation_generator.py +349 -0
- wisent/core/optuna/classifier/classifier_cache.py +509 -0
- wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
- wisent/core/optuna/steering/__init__.py +0 -0
- wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
- wisent/core/optuna/steering/data_utils.py +342 -0
- wisent/core/optuna/steering/metrics.py +474 -0
- wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
- wisent/core/optuna/steering/steering_optimization.py +1111 -0
- wisent/core/parser.py +1668 -0
- wisent/core/prompts/__init__.py +0 -0
- wisent/core/prompts/core/__init__.py +0 -0
- wisent/core/prompts/core/atom.py +57 -0
- wisent/core/prompts/core/prompt_formater.py +157 -0
- wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
- wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
- wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
- wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
- wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
- wisent/core/representation.py +5 -0
- wisent/core/sample_size_optimizer.py +648 -0
- wisent/core/sample_size_optimizer_v2.py +355 -0
- wisent/core/save_results.py +277 -0
- wisent/core/steering.py +652 -0
- wisent/core/steering_method.py +26 -0
- wisent/core/steering_methods/__init__.py +0 -0
- wisent/core/steering_methods/core/__init__.py +0 -0
- wisent/core/steering_methods/core/atoms.py +153 -0
- wisent/core/steering_methods/methods/__init__.py +0 -0
- wisent/core/steering_methods/methods/caa.py +44 -0
- wisent/core/steering_optimizer.py +1297 -0
- wisent/core/task_interface.py +132 -0
- wisent/core/task_selector.py +189 -0
- wisent/core/tasks/__init__.py +175 -0
- wisent/core/tasks/aime_task.py +141 -0
- wisent/core/tasks/file_task.py +211 -0
- wisent/core/tasks/hle_task.py +180 -0
- wisent/core/tasks/hmmt_task.py +119 -0
- wisent/core/tasks/livecodebench_task.py +201 -0
- wisent/core/tasks/livemathbench_task.py +158 -0
- wisent/core/tasks/lm_eval_task.py +455 -0
- wisent/core/tasks/math500_task.py +84 -0
- wisent/core/tasks/polymath_task.py +146 -0
- wisent/core/tasks/supergpqa_task.py +220 -0
- wisent/core/time_estimator.py +149 -0
- wisent/core/timing_calibration.py +174 -0
- wisent/core/tracking/__init__.py +54 -0
- wisent/core/tracking/latency.py +618 -0
- wisent/core/tracking/memory.py +359 -0
- wisent/core/trainers/__init__.py +0 -0
- wisent/core/trainers/core/__init__.py +11 -0
- wisent/core/trainers/core/atoms.py +45 -0
- wisent/core/trainers/steering_trainer.py +271 -0
- wisent/core/user_model_config.py +158 -0
- wisent/opti/__init__.py +0 -0
- wisent/opti/core/__init__.py +0 -0
- wisent/opti/core/atoms.py +175 -0
- wisent/opti/methods/__init__.py +0 -0
- wisent/opti/methods/opti_classificator.py +172 -0
- wisent/opti/methods/opti_steering.py +138 -0
- wisent/synthetic/__init__.py +0 -0
- wisent/synthetic/cleaners/__init__.py +0 -0
- wisent/synthetic/cleaners/core/__init__.py +0 -0
- wisent/synthetic/cleaners/core/atoms.py +58 -0
- wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
- wisent/synthetic/cleaners/methods/__init__.py +0 -0
- wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
- wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
- wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
- wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
- wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
- wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
- wisent/synthetic/db_instructions/__init__.py +0 -0
- wisent/synthetic/db_instructions/core/__init__.py +0 -0
- wisent/synthetic/db_instructions/core/atoms.py +25 -0
- wisent/synthetic/db_instructions/mini_dp.py +37 -0
- wisent/synthetic/generators/__init__.py +0 -0
- wisent/synthetic/generators/core/__init__.py +0 -0
- wisent/synthetic/generators/core/atoms.py +73 -0
- wisent/synthetic/generators/diversities/__init__.py +0 -0
- wisent/synthetic/generators/diversities/core/__init__.py +0 -0
- wisent/synthetic/generators/diversities/core/core.py +68 -0
- wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
- wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
- wisent/synthetic/generators/pairs_generator.py +179 -0
- wisent-0.5.2.dist-info/METADATA +67 -0
- wisent-0.5.2.dist-info/RECORD +218 -0
- {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info}/WHEEL +1 -1
- {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info/licenses}/LICENSE +2 -2
- wisent/activations/__init__.py +0 -9
- wisent/activations/client.py +0 -97
- wisent/activations/extractor.py +0 -251
- wisent/activations/models.py +0 -95
- wisent/client.py +0 -45
- wisent/control_vector/__init__.py +0 -9
- wisent/control_vector/client.py +0 -85
- wisent/control_vector/manager.py +0 -168
- wisent/control_vector/models.py +0 -70
- wisent/inference/__init__.py +0 -9
- wisent/inference/client.py +0 -103
- wisent/inference/inferencer.py +0 -250
- wisent/inference/models.py +0 -66
- wisent/utils/__init__.py +0 -3
- wisent/utils/auth.py +0 -30
- wisent/utils/http.py +0 -228
- wisent/version.py +0 -3
- wisent-0.1.1.dist-info/METADATA +0 -142
- wisent-0.1.1.dist-info/RECORD +0 -23
- {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,554 @@
|
|
|
1
|
+
from typing import List, Dict, Any, Optional, Tuple
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
import os
|
|
4
|
+
import json
|
|
5
|
+
import pickle
|
|
6
|
+
import time
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
import numpy as np
|
|
9
|
+
|
|
10
|
+
from wisent.core.utils.device import resolve_default_device
|
|
11
|
+
|
|
12
|
+
@dataclass
class ClassifierListing:
    """Marketplace record describing a single classifier.

    Instances are built in this module either from a classifier file found on
    disk or from the result of creating a classifier on demand.
    """

    path: str  # file path of the classifier
    layer: int  # layer index associated with the classifier
    issue_type: str  # issue category label, e.g. "hallucination" or "quality_<benchmark>"
    threshold: float
    quality_score: float  # 0.0 to 1.0, higher is better
    training_samples: int
    model_family: str
    created_at: str  # timestamp string; ISO format wherever this module generates it
    training_time_seconds: float
    metadata: Dict[str, Any]  # raw metadata the listing was derived from

    def to_config(self) -> Dict[str, Any]:
        """Return the classifier-config view of this listing.

        Returns:
            A dict with exactly the keys ``path``, ``layer``, ``issue_type``
            and ``threshold``, in that order.
        """
        config_keys = ("path", "layer", "issue_type", "threshold")
        return {key: getattr(self, key) for key in config_keys}
|
|
34
|
+
|
|
35
|
+
@dataclass
class ClassifierCreationEstimate:
    """Projected cost and outcome of training a new classifier for an issue type."""

    issue_type: str
    estimated_training_time_minutes: float  # already scaled by the hardware multiplier where this module builds it
    estimated_quality_score: float  # Predicted based on issue type complexity
    training_samples_needed: int
    optimal_layer: int
    confidence: float  # How confident we are in the estimate
|
|
44
|
+
|
|
45
|
+
class ClassifierMarketplace:
    """
    Marketplace-style facade over classifiers.

    Lets an agent discover classifiers stored on disk, rank and filter them,
    and estimate or trigger the creation of new ones.
    """

    def __init__(self, model, search_paths: List[str] = None):
        """Store the model and the directories to scan for classifiers.

        Args:
            model: Model object used by the other methods (they call its
                ``generate`` and ``get_available_tasks`` methods and read
                ``model_name``).
            search_paths: Directories to scan. Any falsy value (``None`` or an
                empty list) selects the built-in default locations.
        """
        default_locations = [
            "./models/",
            "./classifiers/",
            "./wisent_guard/models/",
            "./wisent_guard/classifiers/",
            "./wisent_guard/core/classifiers/"
        ]
        self.model = model
        self.search_paths = search_paths if search_paths else default_locations
        # Filled by discover_available_classifiers / create_classifier_on_demand.
        self.available_classifiers: List[ClassifierListing] = []
        self._training_time_cache = {}
|
|
62
|
+
|
|
63
|
+
def discover_available_classifiers(self) -> List[ClassifierListing]:
    """
    Scan every search path for ``.pkl`` classifiers and rebuild the listing cache.

    Search paths containing ``wisent_guard/core/classifiers`` are searched
    recursively (nested ``{model}/{benchmark}/layer_{n}.pkl`` layout); all
    other paths are scanned one level deep. Search paths that do not exist or
    are not directories are skipped.

    Returns:
        ``self.available_classifiers`` after it has been rebuilt and sorted
        by quality score, best first.
    """
    import glob

    print("šŖ Discovering available classifiers in marketplace...")

    self.available_classifiers = []

    for search_path in self.search_paths:
        # isdir rather than exists: a search path that points at a regular
        # file must be skipped, otherwise os.listdir below would raise.
        if not os.path.isdir(search_path):
            continue

        if "wisent_guard/core/classifiers" in search_path:
            # Nested layout: look for pickles at any depth.
            pattern = os.path.join(search_path, "**", "*.pkl")
            candidates = glob.glob(pattern, recursive=True)
        else:
            # Flat layout: only direct children of the directory.
            candidates = [
                os.path.join(search_path, filename)
                for filename in os.listdir(search_path)
                if filename.endswith('.pkl')
            ]

        # Sorted so discovery order (and therefore the order of listings with
        # equal quality scores) does not depend on filesystem enumeration order.
        for filepath in sorted(candidates):
            listing = self._create_classifier_listing(filepath)
            if listing:
                self.available_classifiers.append(listing)

    # Sort by quality score (best first)
    self.available_classifiers.sort(key=lambda x: x.quality_score, reverse=True)

    print(f" š Found {len(self.available_classifiers)} classifiers in marketplace")
    return self.available_classifiers
|
|
101
|
+
|
|
102
|
+
def _create_classifier_listing(self, filepath: str) -> Optional[ClassifierListing]:
|
|
103
|
+
"""Create a marketplace listing for a classifier file."""
|
|
104
|
+
try:
|
|
105
|
+
# Load metadata
|
|
106
|
+
metadata = self._load_metadata(filepath)
|
|
107
|
+
|
|
108
|
+
# Parse filename for layer and issue type
|
|
109
|
+
layer, issue_type = self._parse_filename(filepath)
|
|
110
|
+
|
|
111
|
+
# Calculate quality score
|
|
112
|
+
quality_score = self._calculate_quality_score(metadata)
|
|
113
|
+
|
|
114
|
+
# Extract other info
|
|
115
|
+
threshold = metadata.get('threshold', 0.5)
|
|
116
|
+
training_samples = metadata.get('training_samples', 0)
|
|
117
|
+
model_family = self._extract_model_family(metadata.get('model_name', ''))
|
|
118
|
+
created_at = metadata.get('created_at', datetime.now().isoformat())
|
|
119
|
+
training_time = metadata.get('training_time_seconds', 0.0)
|
|
120
|
+
|
|
121
|
+
return ClassifierListing(
|
|
122
|
+
path=filepath,
|
|
123
|
+
layer=layer,
|
|
124
|
+
issue_type=issue_type,
|
|
125
|
+
threshold=threshold,
|
|
126
|
+
quality_score=quality_score,
|
|
127
|
+
training_samples=training_samples,
|
|
128
|
+
model_family=model_family,
|
|
129
|
+
created_at=created_at,
|
|
130
|
+
training_time_seconds=training_time,
|
|
131
|
+
metadata=metadata
|
|
132
|
+
)
|
|
133
|
+
|
|
134
|
+
except Exception as e:
|
|
135
|
+
print(f" ā ļø Could not create listing for {filepath}: {e}")
|
|
136
|
+
return None
|
|
137
|
+
|
|
138
|
+
def _load_metadata(self, filepath: str) -> Dict[str, Any]:
|
|
139
|
+
"""Load metadata for a classifier."""
|
|
140
|
+
# Try to load companion JSON file first
|
|
141
|
+
json_path = filepath.replace('.pkl', '.json')
|
|
142
|
+
if os.path.exists(json_path):
|
|
143
|
+
try:
|
|
144
|
+
with open(json_path, 'r') as f:
|
|
145
|
+
return json.load(f)
|
|
146
|
+
except:
|
|
147
|
+
pass
|
|
148
|
+
|
|
149
|
+
# Try to load metadata from the pickle file itself
|
|
150
|
+
try:
|
|
151
|
+
with open(filepath, 'rb') as f:
|
|
152
|
+
data = pickle.load(f)
|
|
153
|
+
if isinstance(data, dict) and 'metadata' in data:
|
|
154
|
+
return data['metadata']
|
|
155
|
+
elif hasattr(data, 'metadata'):
|
|
156
|
+
return data.metadata
|
|
157
|
+
except:
|
|
158
|
+
pass
|
|
159
|
+
|
|
160
|
+
return {}
|
|
161
|
+
|
|
162
|
+
def _parse_filename(self, filepath: str) -> Tuple[int, str]:
|
|
163
|
+
"""Parse layer and issue type from filename."""
|
|
164
|
+
filename = os.path.basename(filepath).lower()
|
|
165
|
+
|
|
166
|
+
# Check if this is from wisent_guard/core/classifiers with nested structure
|
|
167
|
+
if "wisent_guard/core/classifiers" in filepath:
|
|
168
|
+
# Extract from path structure: wisent_guard/core/classifiers/{model}/{benchmark}/layer_{layer}.pkl
|
|
169
|
+
path_parts = filepath.split(os.sep)
|
|
170
|
+
|
|
171
|
+
# Find the benchmark name (second to last directory)
|
|
172
|
+
if len(path_parts) >= 2:
|
|
173
|
+
benchmark_name = path_parts[-2] # Directory containing the classifier file
|
|
174
|
+
|
|
175
|
+
# Extract layer from filename like "layer_15.pkl"
|
|
176
|
+
import re
|
|
177
|
+
layer_match = re.search(r'layer_(\d+)\.pkl', filename)
|
|
178
|
+
layer = int(layer_match.group(1)) if layer_match else 15
|
|
179
|
+
|
|
180
|
+
# Use benchmark name as issue type for generated classifiers
|
|
181
|
+
issue_type = f"quality_{benchmark_name}"
|
|
182
|
+
|
|
183
|
+
return layer, issue_type
|
|
184
|
+
|
|
185
|
+
# Original parsing logic for other classifiers
|
|
186
|
+
filename = os.path.basename(filepath).lower()
|
|
187
|
+
|
|
188
|
+
# Extract layer
|
|
189
|
+
layer = 15 # default
|
|
190
|
+
for part in filename.replace('_', ' ').replace('-', ' ').split():
|
|
191
|
+
if part.startswith('l') and part[1:].isdigit():
|
|
192
|
+
layer = int(part[1:])
|
|
193
|
+
break
|
|
194
|
+
elif part.startswith('layer') and len(part) > 5:
|
|
195
|
+
try:
|
|
196
|
+
layer = int(part[5:])
|
|
197
|
+
break
|
|
198
|
+
except:
|
|
199
|
+
pass
|
|
200
|
+
elif 'layer' in filename:
|
|
201
|
+
import re
|
|
202
|
+
match = re.search(r'layer[_\s]*(\d+)', filename)
|
|
203
|
+
if match:
|
|
204
|
+
layer = int(match.group(1))
|
|
205
|
+
break
|
|
206
|
+
|
|
207
|
+
# Extract issue type using model
|
|
208
|
+
issue_type = self._get_model_issue_type(filename)
|
|
209
|
+
|
|
210
|
+
return layer, issue_type
|
|
211
|
+
|
|
212
|
+
def _get_model_issue_type(self, filename: str) -> str:
|
|
213
|
+
"""Extract issue type from filename using model decisions."""
|
|
214
|
+
prompt = f"""What AI safety issue type is this classifier filename related to?
|
|
215
|
+
|
|
216
|
+
Filename: {filename}
|
|
217
|
+
|
|
218
|
+
Common issue types include:
|
|
219
|
+
- hallucination (false information, factual errors)
|
|
220
|
+
- quality (output quality, coherence)
|
|
221
|
+
- harmful (toxic content, safety violations)
|
|
222
|
+
- bias (unfairness, discrimination)
|
|
223
|
+
- coherence (logical consistency)
|
|
224
|
+
|
|
225
|
+
Respond with just the issue type (one word):"""
|
|
226
|
+
|
|
227
|
+
try:
|
|
228
|
+
response = self.model.generate(prompt, layer_index=15, max_new_tokens=15, temperature=0.1)
|
|
229
|
+
issue_type = response.strip().lower()
|
|
230
|
+
|
|
231
|
+
# Clean up response to single word
|
|
232
|
+
import re
|
|
233
|
+
match = re.search(r'(hallucination|quality|harmful|bias|coherence|unknown)', issue_type)
|
|
234
|
+
if match:
|
|
235
|
+
return match.group(1)
|
|
236
|
+
return "unknown"
|
|
237
|
+
except:
|
|
238
|
+
return "unknown"
|
|
239
|
+
|
|
240
|
+
def _calculate_quality_score(self, metadata: Dict[str, Any]) -> float:
|
|
241
|
+
"""Calculate a comprehensive quality score for the classifier."""
|
|
242
|
+
score = 0.0
|
|
243
|
+
|
|
244
|
+
# Primary performance metrics (70% of score)
|
|
245
|
+
f1_score = metadata.get('f1', metadata.get('training_f1', 0.0))
|
|
246
|
+
accuracy = metadata.get('accuracy', metadata.get('training_accuracy', 0.0))
|
|
247
|
+
|
|
248
|
+
if f1_score > 0:
|
|
249
|
+
score += f1_score * 0.5
|
|
250
|
+
if accuracy > 0:
|
|
251
|
+
score += accuracy * 0.2
|
|
252
|
+
|
|
253
|
+
# Training data quality (20% of score)
|
|
254
|
+
training_samples = metadata.get('training_samples', 0)
|
|
255
|
+
if training_samples > 0:
|
|
256
|
+
data_quality = min(training_samples / 1000, 1.0) * 0.2
|
|
257
|
+
score += data_quality
|
|
258
|
+
|
|
259
|
+
# Recency bonus (10% of score)
|
|
260
|
+
try:
|
|
261
|
+
created_at = datetime.fromisoformat(metadata.get('created_at', ''))
|
|
262
|
+
days_old = (datetime.now() - created_at).days
|
|
263
|
+
recency_score = max(0, (90 - days_old) / 90) * 0.1 # Decays over 90 days
|
|
264
|
+
score += recency_score
|
|
265
|
+
except:
|
|
266
|
+
pass
|
|
267
|
+
|
|
268
|
+
return min(score, 1.0)
|
|
269
|
+
|
|
270
|
+
def _extract_model_family(self, model_name: str) -> str:
|
|
271
|
+
"""Extract model family from model name using model decisions."""
|
|
272
|
+
if not model_name:
|
|
273
|
+
return "unknown"
|
|
274
|
+
|
|
275
|
+
prompt = f"""What model family is this model name from?
|
|
276
|
+
|
|
277
|
+
Model name: {model_name}
|
|
278
|
+
|
|
279
|
+
Common families include: llama, mistral, gemma, qwen, gpt, claude, other
|
|
280
|
+
|
|
281
|
+
Respond with just the family name (one word):"""
|
|
282
|
+
|
|
283
|
+
try:
|
|
284
|
+
response = self.model.generate(prompt, layer_index=15, max_new_tokens=10, temperature=0.1)
|
|
285
|
+
family = response.strip().lower()
|
|
286
|
+
|
|
287
|
+
# Clean up response
|
|
288
|
+
import re
|
|
289
|
+
match = re.search(r'(llama|mistral|gemma|qwen|gpt|claude|other|unknown)', family)
|
|
290
|
+
if match:
|
|
291
|
+
return match.group(1)
|
|
292
|
+
return "unknown"
|
|
293
|
+
except:
|
|
294
|
+
return "unknown"
|
|
295
|
+
|
|
296
|
+
def get_creation_estimate(self, issue_type: str) -> ClassifierCreationEstimate:
    """
    Estimate what it would take to train a new classifier for an issue type.

    When relevant benchmarks are found, training time, expected quality and
    sample count scale with the number of benchmarks (quality capped at 0.80,
    samples at 500) and the layer comes from
    ``_estimate_optimal_layer_for_issue``. Otherwise fixed figures for
    synthetic data generation are used.

    Args:
        issue_type: The type of issue to create a classifier for.

    Returns:
        The estimate produced by ``_complete_creation_estimate`` from those
        base figures.
    """
    available_benchmarks = self._find_available_benchmarks_for_issue(issue_type)

    if not available_benchmarks:
        # No benchmark data: plan for synthetic generation instead.
        base = {
            "training_time_minutes": 6.0,  # Synthetic is faster but less data
            "quality_score": 0.55,  # Lower expectation for synthetic
            "samples_needed": 50,  # Fewer samples for synthetic
            "optimal_layer": 14  # General-purpose layer
        }
        print(f" š¤ Using synthetic generation for {issue_type}")
        return self._complete_creation_estimate(base, available_benchmarks, issue_type)

    # Benchmark data exists: every figure grows with the benchmark count.
    benchmark_count = len(available_benchmarks)
    base = {
        "training_time_minutes": 8.0 + (benchmark_count * 2.0),
        "quality_score": min(0.80, 0.60 + (benchmark_count * 0.05)),
        "samples_needed": min(500, 100 + (benchmark_count * 30)),
        "optimal_layer": self._estimate_optimal_layer_for_issue(issue_type)
    }
    print(f" š Using {benchmark_count} benchmarks for {issue_type}")
    return self._complete_creation_estimate(base, available_benchmarks, issue_type)
|
|
331
|
+
|
|
332
|
+
def _find_available_benchmarks_for_issue(self, issue_type: str) -> List[str]:
|
|
333
|
+
"""Find available benchmarks using dynamic semantic analysis."""
|
|
334
|
+
available_tasks = self.model.get_available_tasks()
|
|
335
|
+
|
|
336
|
+
# Use semantic similarity to find relevant benchmarks
|
|
337
|
+
relevant = []
|
|
338
|
+
issue_lower = issue_type.lower()
|
|
339
|
+
|
|
340
|
+
for task in available_tasks[:1000]: # Limit search for speed
|
|
341
|
+
task_lower = task.lower()
|
|
342
|
+
|
|
343
|
+
# Calculate semantic similarity score
|
|
344
|
+
similarity_score = self._calculate_task_similarity(issue_lower, task_lower)
|
|
345
|
+
|
|
346
|
+
if similarity_score > 0:
|
|
347
|
+
relevant.append((task, similarity_score))
|
|
348
|
+
if len(relevant) >= 30: # Get more candidates for ranking
|
|
349
|
+
break
|
|
350
|
+
|
|
351
|
+
# Sort by similarity score and return top matches
|
|
352
|
+
relevant.sort(key=lambda x: x[1], reverse=True)
|
|
353
|
+
return [task for task, score in relevant[:15]] # Return top 15
|
|
354
|
+
|
|
355
|
+
def _calculate_task_similarity(self, issue_type: str, task_name: str) -> float:
|
|
356
|
+
"""Calculate similarity between issue type and task name using model decisions."""
|
|
357
|
+
prompt = f"""Rate the similarity between this issue type and evaluation task for training AI safety classifiers.
|
|
358
|
+
|
|
359
|
+
Issue Type: {issue_type}
|
|
360
|
+
Task: {task_name}
|
|
361
|
+
|
|
362
|
+
Rate similarity from 0.0 to 10.0 (10.0 = highly similar, 0.0 = not similar).
|
|
363
|
+
Respond with only the number:"""
|
|
364
|
+
|
|
365
|
+
try:
|
|
366
|
+
response = self.model.generate(prompt, layer_index=15, max_new_tokens=10, temperature=0.1)
|
|
367
|
+
score_str = response.strip()
|
|
368
|
+
|
|
369
|
+
import re
|
|
370
|
+
match = re.search(r'(\d+\.?\d*)', score_str)
|
|
371
|
+
if match:
|
|
372
|
+
score = float(match.group(1))
|
|
373
|
+
return min(10.0, max(0.0, score))
|
|
374
|
+
return 0.0
|
|
375
|
+
except:
|
|
376
|
+
return 0.0
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
def _estimate_optimal_layer_for_issue(self, issue_type: str) -> int:
|
|
381
|
+
"""Estimate optimal layer using model analysis of issue complexity."""
|
|
382
|
+
prompt = f"""What transformer layer would be optimal for detecting this AI safety issue?
|
|
383
|
+
|
|
384
|
+
Issue Type: {issue_type}
|
|
385
|
+
|
|
386
|
+
Consider:
|
|
387
|
+
- Simple issues (formatting, basic patterns) ā early layers (8-12)
|
|
388
|
+
- Complex semantic issues (truthfulness, bias) ā middle layers (12-16)
|
|
389
|
+
- Abstract conceptual issues (coherence, quality) ā deeper layers (16-20)
|
|
390
|
+
|
|
391
|
+
Respond with just the layer number (8-20):"""
|
|
392
|
+
|
|
393
|
+
try:
|
|
394
|
+
response = self.model.generate(prompt, layer_index=15, max_new_tokens=10, temperature=0.1)
|
|
395
|
+
layer_str = response.strip()
|
|
396
|
+
|
|
397
|
+
import re
|
|
398
|
+
match = re.search(r'(\d+)', layer_str)
|
|
399
|
+
if match:
|
|
400
|
+
layer = int(match.group(1))
|
|
401
|
+
return max(8, min(20, layer)) # Clamp to valid range
|
|
402
|
+
return 14 # Default middle layer
|
|
403
|
+
except:
|
|
404
|
+
return 14
|
|
405
|
+
|
|
406
|
+
def _complete_creation_estimate(self, base: Dict[str, Any], available_benchmarks: List[str], issue_type: str) -> ClassifierCreationEstimate:
    """Turn base figures into a finished ``ClassifierCreationEstimate``.

    Args:
        base: Dict with the keys ``training_time_minutes``, ``quality_score``,
            ``samples_needed`` and ``optimal_layer``.
        available_benchmarks: Benchmarks found for the issue type; only
            whether the list is empty matters here.
        issue_type: Issue type the estimate is for.

    Returns:
        The estimate, with the training time scaled by the hardware speed
        multiplier and a confidence of 0.8 when benchmarks are available,
        0.6 otherwise.
    """
    speed_factor = self._estimate_hardware_speed()
    scaled_minutes = base["training_time_minutes"] * speed_factor

    # Benchmark-backed estimates are trusted more than synthetic ones.
    if available_benchmarks:
        certainty = 0.8
    else:
        certainty = 0.6

    return ClassifierCreationEstimate(
        issue_type=issue_type,
        estimated_training_time_minutes=scaled_minutes,
        estimated_quality_score=base["quality_score"],
        training_samples_needed=base["samples_needed"],
        optimal_layer=base["optimal_layer"],
        confidence=certainty
    )
|
|
423
|
+
|
|
424
|
+
def _estimate_hardware_speed(self) -> float:
    """Return the training-time multiplier for the default device.

    Returns:
        0.3 when ``resolve_default_device()`` reports ``"cuda"``, 0.5 for
        ``"mps"``, and 1.0 for anything else.
    """
    detected = resolve_default_device()
    # Smaller multiplier = shorter estimated training time.
    for kind, multiplier in (("cuda", 0.3), ("mps", 0.5)):
        if detected == kind:
            return multiplier
    return 1.0
|
|
432
|
+
|
|
433
|
+
def get_marketplace_summary(self) -> str:
    """Render a human-readable overview of the known classifiers.

    Runs discovery first when no classifiers are cached. The overview lists
    the total count and, per issue type, the number of classifiers together
    with details of the one with the highest quality score.

    Returns:
        The summary text, or a one-line notice when no classifiers exist.
    """
    if not self.available_classifiers:
        self.discover_available_classifiers()
        if not self.available_classifiers:
            return "šŖ Classifier Marketplace: No classifiers available"

    pieces = [
        "\nšŖ Classifier Marketplace Summary\n",
        f"{'='*50}\n",
        f"Available Classifiers: {len(self.available_classifiers)}\n\n",
    ]

    # Bucket the listings by issue type, keeping first-seen order.
    grouped = {}
    for entry in self.available_classifiers:
        grouped.setdefault(entry.issue_type, []).append(entry)

    for label, members in grouped.items():
        top = max(members, key=lambda x: x.quality_score)
        pieces.append(f"š {label.upper()}: {len(members)} available\n")
        pieces.append(f" Best: {os.path.basename(top.path)} ")
        pieces.append(f"(Quality: {top.quality_score:.3f}, Layer: {top.layer})\n")
        pieces.append(f" Samples: {top.training_samples}, ")
        pieces.append(f"Model: {top.model_family}\n\n")

    return "".join(pieces)
|
|
462
|
+
|
|
463
|
+
def filter_classifiers(self,
                       issue_types: List[str] = None,
                       min_quality: float = 0.0,
                       model_family: str = None,
                       layers: List[int] = None) -> List[ClassifierListing]:
    """
    Select the cached classifier listings that satisfy every given criterion.

    A criterion left at its default (or otherwise falsy; for ``min_quality``
    any value not above zero) is not applied.

    Args:
        issue_types: Issue types to keep.
        min_quality: Lowest acceptable quality score (inclusive).
        model_family: Model family a listing must have.
        layers: Layers to keep.

    Returns:
        The matching listings in their cached order. When no criterion is
        active, the cached list object itself is returned.
    """
    checks = []
    if issue_types:
        checks.append(lambda c: c.issue_type in issue_types)
    if min_quality > 0:
        checks.append(lambda c: c.quality_score >= min_quality)
    if model_family:
        checks.append(lambda c: c.model_family == model_family)
    if layers:
        checks.append(lambda c: c.layer in layers)

    if not checks:
        return self.available_classifiers

    return [
        candidate
        for candidate in self.available_classifiers
        if all(check(candidate) for check in checks)
    ]
|
|
495
|
+
|
|
496
|
+
async def create_classifier_on_demand(self,
                                      issue_type: str,
                                      custom_layer: int = None) -> ClassifierListing:
    """
    Create a new classifier on demand and add it to the marketplace.

    The classifier is written under ``./models/`` with a file name built from
    the issue type, the layer and a timestamp. Its listing is appended to
    ``self.available_classifiers``, which is then re-sorted by quality score.

    Args:
        issue_type: Type of issue to create classifier for.
        custom_layer: Layer to train on. ``None`` selects the estimated
            optimal layer; any integer, including 0, is used as given.

    Returns:
        Listing of the newly created classifier. Its quality score is the
        ``f1`` entry of the creation result's performance metrics.
    """
    # Aliased because the imported function has the same name as this method.
    from .create_classifier import create_classifier_on_demand as _build_classifier

    print(f"šļø Creating new classifier for {issue_type}...")

    # Get creation estimate
    estimate = self.get_creation_estimate(issue_type)
    # "is not None" rather than "or": an explicit layer 0 must not be
    # replaced by the estimate.
    layer = custom_layer if custom_layer is not None else estimate.optimal_layer

    # Create save path
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    save_path = f"./models/agent_created_{issue_type}_layer{layer}_{timestamp}.pkl"
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    # Create classifier.
    # NOTE(review): this call is not awaited, so it appears to run
    # synchronously inside the coroutine — confirm that is intended.
    start_time = time.time()
    result = _build_classifier(
        model=self.model,
        issue_type=issue_type,
        layer=layer,
        save_path=save_path,
        optimize=True
    )
    training_time = time.time() - start_time

    # Create listing for the new classifier
    listing = ClassifierListing(
        path=result.save_path,
        layer=result.config.layer,
        issue_type=issue_type,
        threshold=result.config.threshold,
        quality_score=result.performance_metrics.get('f1', 0.0),
        training_samples=result.performance_metrics.get('training_samples', 0),
        model_family=self._extract_model_family(self.model.model_name),
        created_at=datetime.now().isoformat(),
        training_time_seconds=training_time,
        metadata=result.performance_metrics
    )

    # Add to available classifiers, keeping them ordered best first
    self.available_classifiers.append(listing)
    self.available_classifiers.sort(key=lambda x: x.quality_score, reverse=True)

    print(f" ✅ Created classifier in {training_time/60:.1f} minutes")
    print(f" š Quality score: {listing.quality_score:.3f}")

    return listing
|