wisent 0.1.1__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of wisent might be problematic.

Files changed (237)
  1. wisent/__init__.py +1 -8
  2. wisent/benchmarks/__init__.py +0 -0
  3. wisent/benchmarks/coding/__init__.py +0 -0
  4. wisent/benchmarks/coding/metrics/__init__.py +0 -0
  5. wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
  6. wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
  7. wisent/benchmarks/coding/metrics/evaluator.py +275 -0
  8. wisent/benchmarks/coding/metrics/passk.py +66 -0
  9. wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
  10. wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
  11. wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
  12. wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
  13. wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
  14. wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
  15. wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
  16. wisent/benchmarks/coding/providers/__init__.py +18 -0
  17. wisent/benchmarks/coding/providers/core/__init__.py +0 -0
  18. wisent/benchmarks/coding/providers/core/atoms.py +31 -0
  19. wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
  20. wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
  21. wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
  22. wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
  23. wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
  24. wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
  25. wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
  26. wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
  27. wisent/classifiers/__init__.py +0 -0
  28. wisent/classifiers/core/__init__.py +0 -0
  29. wisent/classifiers/core/atoms.py +747 -0
  30. wisent/classifiers/models/__init__.py +0 -0
  31. wisent/classifiers/models/logistic.py +29 -0
  32. wisent/classifiers/models/mlp.py +47 -0
  33. wisent/cli/__init__.py +0 -0
  34. wisent/cli/classifiers/__init__.py +0 -0
  35. wisent/cli/classifiers/classifier_rotator.py +137 -0
  36. wisent/cli/cli_logger.py +142 -0
  37. wisent/cli/data_loaders/__init__.py +0 -0
  38. wisent/cli/data_loaders/data_loader_rotator.py +96 -0
  39. wisent/cli/evaluators/__init__.py +0 -0
  40. wisent/cli/evaluators/evaluator_rotator.py +148 -0
  41. wisent/cli/steering_methods/__init__.py +0 -0
  42. wisent/cli/steering_methods/steering_rotator.py +110 -0
  43. wisent/cli/wisent_cli/__init__.py +0 -0
  44. wisent/cli/wisent_cli/commands/__init__.py +0 -0
  45. wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
  46. wisent/cli/wisent_cli/commands/listing.py +154 -0
  47. wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
  48. wisent/cli/wisent_cli/main.py +93 -0
  49. wisent/cli/wisent_cli/shell.py +80 -0
  50. wisent/cli/wisent_cli/ui.py +69 -0
  51. wisent/cli/wisent_cli/util/__init__.py +0 -0
  52. wisent/cli/wisent_cli/util/aggregations.py +43 -0
  53. wisent/cli/wisent_cli/util/parsing.py +126 -0
  54. wisent/cli/wisent_cli/version.py +4 -0
  55. wisent/core/__init__.py +27 -0
  56. wisent/core/activations/__init__.py +0 -0
  57. wisent/core/activations/activations_collector.py +338 -0
  58. wisent/core/activations/core/__init__.py +0 -0
  59. wisent/core/activations/core/atoms.py +216 -0
  60. wisent/core/agent/__init__.py +18 -0
  61. wisent/core/agent/budget.py +638 -0
  62. wisent/core/agent/device_benchmarks.py +685 -0
  63. wisent/core/agent/diagnose/__init__.py +55 -0
  64. wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
  65. wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
  66. wisent/core/agent/diagnose/create_classifier.py +1154 -0
  67. wisent/core/agent/diagnose/response_diagnostics.py +268 -0
  68. wisent/core/agent/diagnose/select_classifiers.py +506 -0
  69. wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
  70. wisent/core/agent/diagnose/tasks/__init__.py +33 -0
  71. wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
  72. wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
  73. wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
  74. wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
  75. wisent/core/agent/diagnose.py +242 -0
  76. wisent/core/agent/steer.py +212 -0
  77. wisent/core/agent/timeout.py +134 -0
  78. wisent/core/autonomous_agent.py +1234 -0
  79. wisent/core/bigcode_integration.py +583 -0
  80. wisent/core/contrastive_pairs/__init__.py +15 -0
  81. wisent/core/contrastive_pairs/core/__init__.py +0 -0
  82. wisent/core/contrastive_pairs/core/atoms.py +45 -0
  83. wisent/core/contrastive_pairs/core/buliders.py +59 -0
  84. wisent/core/contrastive_pairs/core/pair.py +178 -0
  85. wisent/core/contrastive_pairs/core/response.py +152 -0
  86. wisent/core/contrastive_pairs/core/serialization.py +300 -0
  87. wisent/core/contrastive_pairs/core/set.py +133 -0
  88. wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
  89. wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
  90. wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
  91. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
  92. wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
  93. wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
  94. wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
  95. wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
  96. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
  102. wisent/core/data_loaders/__init__.py +0 -0
  103. wisent/core/data_loaders/core/__init__.py +0 -0
  104. wisent/core/data_loaders/core/atoms.py +98 -0
  105. wisent/core/data_loaders/loaders/__init__.py +0 -0
  106. wisent/core/data_loaders/loaders/custom.py +120 -0
  107. wisent/core/data_loaders/loaders/lm_loader.py +218 -0
  108. wisent/core/detection_handling.py +257 -0
  109. wisent/core/download_full_benchmarks.py +1386 -0
  110. wisent/core/evaluators/__init__.py +0 -0
  111. wisent/core/evaluators/oracles/__init__.py +0 -0
  112. wisent/core/evaluators/oracles/interactive.py +73 -0
  113. wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
  114. wisent/core/evaluators/oracles/user_specified.py +67 -0
  115. wisent/core/hyperparameter_optimizer.py +429 -0
  116. wisent/core/lm_eval_harness_ground_truth.py +1396 -0
  117. wisent/core/log_likelihoods_evaluator.py +321 -0
  118. wisent/core/managed_cached_benchmarks.py +595 -0
  119. wisent/core/mixed_benchmark_sampler.py +364 -0
  120. wisent/core/model_config_manager.py +330 -0
  121. wisent/core/model_persistence.py +317 -0
  122. wisent/core/models/__init__.py +0 -0
  123. wisent/core/models/core/__init__.py +0 -0
  124. wisent/core/models/core/atoms.py +460 -0
  125. wisent/core/models/wisent_model.py +727 -0
  126. wisent/core/multi_steering.py +316 -0
  127. wisent/core/optuna/__init__.py +57 -0
  128. wisent/core/optuna/classifier/__init__.py +25 -0
  129. wisent/core/optuna/classifier/activation_generator.py +349 -0
  130. wisent/core/optuna/classifier/classifier_cache.py +509 -0
  131. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
  132. wisent/core/optuna/steering/__init__.py +0 -0
  133. wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
  134. wisent/core/optuna/steering/data_utils.py +342 -0
  135. wisent/core/optuna/steering/metrics.py +474 -0
  136. wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
  137. wisent/core/optuna/steering/steering_optimization.py +1111 -0
  138. wisent/core/parser.py +1668 -0
  139. wisent/core/prompts/__init__.py +0 -0
  140. wisent/core/prompts/core/__init__.py +0 -0
  141. wisent/core/prompts/core/atom.py +57 -0
  142. wisent/core/prompts/core/prompt_formater.py +157 -0
  143. wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
  144. wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
  145. wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
  146. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
  147. wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
  148. wisent/core/representation.py +5 -0
  149. wisent/core/sample_size_optimizer.py +648 -0
  150. wisent/core/sample_size_optimizer_v2.py +355 -0
  151. wisent/core/save_results.py +277 -0
  152. wisent/core/steering.py +652 -0
  153. wisent/core/steering_method.py +26 -0
  154. wisent/core/steering_methods/__init__.py +0 -0
  155. wisent/core/steering_methods/core/__init__.py +0 -0
  156. wisent/core/steering_methods/core/atoms.py +153 -0
  157. wisent/core/steering_methods/methods/__init__.py +0 -0
  158. wisent/core/steering_methods/methods/caa.py +44 -0
  159. wisent/core/steering_optimizer.py +1297 -0
  160. wisent/core/task_interface.py +132 -0
  161. wisent/core/task_selector.py +189 -0
  162. wisent/core/tasks/__init__.py +175 -0
  163. wisent/core/tasks/aime_task.py +141 -0
  164. wisent/core/tasks/file_task.py +211 -0
  165. wisent/core/tasks/hle_task.py +180 -0
  166. wisent/core/tasks/hmmt_task.py +119 -0
  167. wisent/core/tasks/livecodebench_task.py +201 -0
  168. wisent/core/tasks/livemathbench_task.py +158 -0
  169. wisent/core/tasks/lm_eval_task.py +455 -0
  170. wisent/core/tasks/math500_task.py +84 -0
  171. wisent/core/tasks/polymath_task.py +146 -0
  172. wisent/core/tasks/supergpqa_task.py +220 -0
  173. wisent/core/time_estimator.py +149 -0
  174. wisent/core/timing_calibration.py +174 -0
  175. wisent/core/tracking/__init__.py +54 -0
  176. wisent/core/tracking/latency.py +618 -0
  177. wisent/core/tracking/memory.py +359 -0
  178. wisent/core/trainers/__init__.py +0 -0
  179. wisent/core/trainers/core/__init__.py +11 -0
  180. wisent/core/trainers/core/atoms.py +45 -0
  181. wisent/core/trainers/steering_trainer.py +271 -0
  182. wisent/core/user_model_config.py +158 -0
  183. wisent/opti/__init__.py +0 -0
  184. wisent/opti/core/__init__.py +0 -0
  185. wisent/opti/core/atoms.py +175 -0
  186. wisent/opti/methods/__init__.py +0 -0
  187. wisent/opti/methods/opti_classificator.py +172 -0
  188. wisent/opti/methods/opti_steering.py +138 -0
  189. wisent/synthetic/__init__.py +0 -0
  190. wisent/synthetic/cleaners/__init__.py +0 -0
  191. wisent/synthetic/cleaners/core/__init__.py +0 -0
  192. wisent/synthetic/cleaners/core/atoms.py +58 -0
  193. wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
  194. wisent/synthetic/cleaners/methods/__init__.py +0 -0
  195. wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
  196. wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
  197. wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
  198. wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
  199. wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
  200. wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
  201. wisent/synthetic/db_instructions/__init__.py +0 -0
  202. wisent/synthetic/db_instructions/core/__init__.py +0 -0
  203. wisent/synthetic/db_instructions/core/atoms.py +25 -0
  204. wisent/synthetic/db_instructions/mini_dp.py +37 -0
  205. wisent/synthetic/generators/__init__.py +0 -0
  206. wisent/synthetic/generators/core/__init__.py +0 -0
  207. wisent/synthetic/generators/core/atoms.py +73 -0
  208. wisent/synthetic/generators/diversities/__init__.py +0 -0
  209. wisent/synthetic/generators/diversities/core/__init__.py +0 -0
  210. wisent/synthetic/generators/diversities/core/core.py +68 -0
  211. wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
  212. wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
  213. wisent/synthetic/generators/pairs_generator.py +179 -0
  214. wisent-0.5.2.dist-info/METADATA +67 -0
  215. wisent-0.5.2.dist-info/RECORD +218 -0
  216. {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info}/WHEEL +1 -1
  217. {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info/licenses}/LICENSE +2 -2
  218. wisent/activations/__init__.py +0 -9
  219. wisent/activations/client.py +0 -97
  220. wisent/activations/extractor.py +0 -251
  221. wisent/activations/models.py +0 -95
  222. wisent/client.py +0 -45
  223. wisent/control_vector/__init__.py +0 -9
  224. wisent/control_vector/client.py +0 -85
  225. wisent/control_vector/manager.py +0 -168
  226. wisent/control_vector/models.py +0 -70
  227. wisent/inference/__init__.py +0 -9
  228. wisent/inference/client.py +0 -103
  229. wisent/inference/inferencer.py +0 -250
  230. wisent/inference/models.py +0 -66
  231. wisent/utils/__init__.py +0 -3
  232. wisent/utils/auth.py +0 -30
  233. wisent/utils/http.py +0 -228
  234. wisent/version.py +0 -3
  235. wisent-0.1.1.dist-info/METADATA +0 -142
  236. wisent-0.1.1.dist-info/RECORD +0 -23
  237. {wisent-0.1.1.dist-info → wisent-0.5.2.dist-info}/top_level.txt +0 -0
wisent/core/agent/diagnose/classifier_marketplace.py (new file)
@@ -0,0 +1,554 @@
+from typing import List, Dict, Any, Optional, Tuple
+from dataclasses import dataclass
+import os
+import json
+import pickle
+import time
+from datetime import datetime
+import numpy as np
+
+from wisent.core.utils.device import resolve_default_device
+
+@dataclass
+class ClassifierListing:
+    """A classifier available in the marketplace."""
+    path: str
+    layer: int
+    issue_type: str
+    threshold: float
+    quality_score: float  # 0.0 to 1.0, higher is better
+    training_samples: int
+    model_family: str
+    created_at: str
+    training_time_seconds: float
+    metadata: Dict[str, Any]
+
+    def to_config(self) -> Dict[str, Any]:
+        """Convert to classifier config format."""
+        return {
+            "path": self.path,
+            "layer": self.layer,
+            "issue_type": self.issue_type,
+            "threshold": self.threshold
+        }
+
+@dataclass
+class ClassifierCreationEstimate:
+    """Estimate for creating a new classifier."""
+    issue_type: str
+    estimated_training_time_minutes: float
+    estimated_quality_score: float  # Predicted based on issue type complexity
+    training_samples_needed: int
+    optimal_layer: int
+    confidence: float  # How confident we are in the estimate
+
+class ClassifierMarketplace:
+    """
+    A marketplace interface for classifiers that gives the agent full autonomy
+    to discover, evaluate, and create classifiers based on its needs.
+    """
+
+    def __init__(self, model, search_paths: List[str] = None):
+        self.model = model
+        self.search_paths = search_paths or [
+            "./models/",
+            "./classifiers/",
+            "./wisent_guard/models/",
+            "./wisent_guard/classifiers/",
+            "./wisent_guard/core/classifiers/"
+        ]
+        self.available_classifiers: List[ClassifierListing] = []
+        self._training_time_cache = {}
+
+    def discover_available_classifiers(self) -> List[ClassifierListing]:
+        """
+        Discover all available classifiers and return them as marketplace listings.
+
+        Returns:
+            List of classifier listings with quality scores and metadata
+        """
+        print("šŸŖ Discovering available classifiers in marketplace...")
+
+        self.available_classifiers = []
+
+        for search_path in self.search_paths:
+            if not os.path.exists(search_path):
+                continue
+
+            # For wisent_guard/core/classifiers, search recursively for the nested structure
+            if "wisent_guard/core/classifiers" in search_path:
+                import glob
+                pattern = os.path.join(search_path, "**", "*.pkl")
+                classifier_files = glob.glob(pattern, recursive=True)
+                for filepath in classifier_files:
+                    listing = self._create_classifier_listing(filepath)
+                    if listing:
+                        self.available_classifiers.append(listing)
+            else:
+                # Original behavior for other directories
+                for filename in os.listdir(search_path):
+                    if filename.endswith('.pkl'):
+                        filepath = os.path.join(search_path, filename)
+                        listing = self._create_classifier_listing(filepath)
+                        if listing:
+                            self.available_classifiers.append(listing)
+
+        # Sort by quality score (best first)
+        self.available_classifiers.sort(key=lambda x: x.quality_score, reverse=True)
+
+        print(f" šŸ“Š Found {len(self.available_classifiers)} classifiers in marketplace")
+        return self.available_classifiers
+
+    def _create_classifier_listing(self, filepath: str) -> Optional[ClassifierListing]:
+        """Create a marketplace listing for a classifier file."""
+        try:
+            # Load metadata
+            metadata = self._load_metadata(filepath)
+
+            # Parse filename for layer and issue type
+            layer, issue_type = self._parse_filename(filepath)
+
+            # Calculate quality score
+            quality_score = self._calculate_quality_score(metadata)
+
+            # Extract other info
+            threshold = metadata.get('threshold', 0.5)
+            training_samples = metadata.get('training_samples', 0)
+            model_family = self._extract_model_family(metadata.get('model_name', ''))
+            created_at = metadata.get('created_at', datetime.now().isoformat())
+            training_time = metadata.get('training_time_seconds', 0.0)
+
+            return ClassifierListing(
+                path=filepath,
+                layer=layer,
+                issue_type=issue_type,
+                threshold=threshold,
+                quality_score=quality_score,
+                training_samples=training_samples,
+                model_family=model_family,
+                created_at=created_at,
+                training_time_seconds=training_time,
+                metadata=metadata
+            )
+
+        except Exception as e:
+            print(f" āš ļø Could not create listing for {filepath}: {e}")
+            return None
+
+    def _load_metadata(self, filepath: str) -> Dict[str, Any]:
+        """Load metadata for a classifier."""
+        # Try to load companion JSON file first
+        json_path = filepath.replace('.pkl', '.json')
+        if os.path.exists(json_path):
+            try:
+                with open(json_path, 'r') as f:
+                    return json.load(f)
+            except:
+                pass
+
+        # Try to load metadata from the pickle file itself
+        try:
+            with open(filepath, 'rb') as f:
+                data = pickle.load(f)
+                if isinstance(data, dict) and 'metadata' in data:
+                    return data['metadata']
+                elif hasattr(data, 'metadata'):
+                    return data.metadata
+        except:
+            pass
+
+        return {}
+
+    def _parse_filename(self, filepath: str) -> Tuple[int, str]:
+        """Parse layer and issue type from filename."""
+        filename = os.path.basename(filepath).lower()
+
+        # Check if this is from wisent_guard/core/classifiers with nested structure
+        if "wisent_guard/core/classifiers" in filepath:
+            # Extract from path structure: wisent_guard/core/classifiers/{model}/{benchmark}/layer_{layer}.pkl
+            path_parts = filepath.split(os.sep)
+
+            # Find the benchmark name (second to last directory)
+            if len(path_parts) >= 2:
+                benchmark_name = path_parts[-2]  # Directory containing the classifier file
+
+                # Extract layer from filename like "layer_15.pkl"
+                import re
+                layer_match = re.search(r'layer_(\d+)\.pkl', filename)
+                layer = int(layer_match.group(1)) if layer_match else 15
+
+                # Use benchmark name as issue type for generated classifiers
+                issue_type = f"quality_{benchmark_name}"
+
+                return layer, issue_type
+
+        # Original parsing logic for other classifiers
+        filename = os.path.basename(filepath).lower()
+
+        # Extract layer
+        layer = 15  # default
+        for part in filename.replace('_', ' ').replace('-', ' ').split():
+            if part.startswith('l') and part[1:].isdigit():
+                layer = int(part[1:])
+                break
+            elif part.startswith('layer') and len(part) > 5:
+                try:
+                    layer = int(part[5:])
+                    break
+                except:
+                    pass
+            elif 'layer' in filename:
+                import re
+                match = re.search(r'layer[_\s]*(\d+)', filename)
+                if match:
+                    layer = int(match.group(1))
+                    break
+
+        # Extract issue type using model
+        issue_type = self._get_model_issue_type(filename)
+
+        return layer, issue_type
+
+    def _get_model_issue_type(self, filename: str) -> str:
+        """Extract issue type from filename using model decisions."""
+        prompt = f"""What AI safety issue type is this classifier filename related to?
+
+Filename: {filename}
+
+Common issue types include:
+- hallucination (false information, factual errors)
+- quality (output quality, coherence)
+- harmful (toxic content, safety violations)
+- bias (unfairness, discrimination)
+- coherence (logical consistency)
+
+Respond with just the issue type (one word):"""
+
+        try:
+            response = self.model.generate(prompt, layer_index=15, max_new_tokens=15, temperature=0.1)
+            issue_type = response.strip().lower()
+
+            # Clean up response to single word
+            import re
+            match = re.search(r'(hallucination|quality|harmful|bias|coherence|unknown)', issue_type)
+            if match:
+                return match.group(1)
+            return "unknown"
+        except:
+            return "unknown"
+
+    def _calculate_quality_score(self, metadata: Dict[str, Any]) -> float:
+        """Calculate a comprehensive quality score for the classifier."""
+        score = 0.0
+
+        # Primary performance metrics (70% of score)
+        f1_score = metadata.get('f1', metadata.get('training_f1', 0.0))
+        accuracy = metadata.get('accuracy', metadata.get('training_accuracy', 0.0))
+
+        if f1_score > 0:
+            score += f1_score * 0.5
+        if accuracy > 0:
+            score += accuracy * 0.2
+
+        # Training data quality (20% of score)
+        training_samples = metadata.get('training_samples', 0)
+        if training_samples > 0:
+            data_quality = min(training_samples / 1000, 1.0) * 0.2
+            score += data_quality
+
+        # Recency bonus (10% of score)
+        try:
+            created_at = datetime.fromisoformat(metadata.get('created_at', ''))
+            days_old = (datetime.now() - created_at).days
+            recency_score = max(0, (90 - days_old) / 90) * 0.1  # Decays over 90 days
+            score += recency_score
+        except:
+            pass
+
+        return min(score, 1.0)
+
+    def _extract_model_family(self, model_name: str) -> str:
+        """Extract model family from model name using model decisions."""
+        if not model_name:
+            return "unknown"
+
+        prompt = f"""What model family is this model name from?
+
+Model name: {model_name}
+
+Common families include: llama, mistral, gemma, qwen, gpt, claude, other
+
+Respond with just the family name (one word):"""
+
+        try:
+            response = self.model.generate(prompt, layer_index=15, max_new_tokens=10, temperature=0.1)
+            family = response.strip().lower()
+
+            # Clean up response
+            import re
+            match = re.search(r'(llama|mistral|gemma|qwen|gpt|claude|other|unknown)', family)
+            if match:
+                return match.group(1)
+            return "unknown"
+        except:
+            return "unknown"
+
+    def get_creation_estimate(self, issue_type: str) -> ClassifierCreationEstimate:
+        """
+        Get an estimate for creating a new classifier for the given issue type.
+
+        Args:
+            issue_type: The type of issue to create a classifier for
+
+        Returns:
+            Estimate including time, quality, and confidence
+        """
+        # Dynamic estimates based on available benchmark data
+        # Check if we have relevant benchmarks for this issue type
+        available_benchmarks = self._find_available_benchmarks_for_issue(issue_type)
+
+        if available_benchmarks:
+            # We have relevant benchmark data - better quality expected
+            benchmark_count = len(available_benchmarks)
+            base = {
+                "training_time_minutes": 8.0 + (benchmark_count * 2.0),  # More benchmarks = more time
+                "quality_score": min(0.80, 0.60 + (benchmark_count * 0.05)),  # Better with more data
+                "samples_needed": min(500, 100 + (benchmark_count * 30)),  # Scale with available data
+                "optimal_layer": self._estimate_optimal_layer_for_issue(issue_type)
+            }
+            print(f" šŸ“Š Using {benchmark_count} benchmarks for {issue_type}")
+        else:
+            # Fall back to synthetic generation
+            base = {
+                "training_time_minutes": 6.0,  # Synthetic is faster but less data
+                "quality_score": 0.55,  # Lower expectation for synthetic
+                "samples_needed": 50,  # Fewer samples for synthetic
+                "optimal_layer": 14  # General-purpose layer
+            }
+            print(f" šŸ¤– Using synthetic generation for {issue_type}")
+
+        return self._complete_creation_estimate(base, available_benchmarks, issue_type)
+
+    def _find_available_benchmarks_for_issue(self, issue_type: str) -> List[str]:
+        """Find available benchmarks using dynamic semantic analysis."""
+        available_tasks = self.model.get_available_tasks()
+
+        # Use semantic similarity to find relevant benchmarks
+        relevant = []
+        issue_lower = issue_type.lower()
+
+        for task in available_tasks[:1000]:  # Limit search for speed
+            task_lower = task.lower()
+
+            # Calculate semantic similarity score
+            similarity_score = self._calculate_task_similarity(issue_lower, task_lower)
+
+            if similarity_score > 0:
+                relevant.append((task, similarity_score))
+                if len(relevant) >= 30:  # Get more candidates for ranking
+                    break
+
+        # Sort by similarity score and return top matches
+        relevant.sort(key=lambda x: x[1], reverse=True)
+        return [task for task, score in relevant[:15]]  # Return top 15
+
+    def _calculate_task_similarity(self, issue_type: str, task_name: str) -> float:
+        """Calculate similarity between issue type and task name using model decisions."""
+        prompt = f"""Rate the similarity between this issue type and evaluation task for training AI safety classifiers.
+
+Issue Type: {issue_type}
+Task: {task_name}
+
+Rate similarity from 0.0 to 10.0 (10.0 = highly similar, 0.0 = not similar).
+Respond with only the number:"""
+
+        try:
+            response = self.model.generate(prompt, layer_index=15, max_new_tokens=10, temperature=0.1)
+            score_str = response.strip()
+
+            import re
+            match = re.search(r'(\d+\.?\d*)', score_str)
+            if match:
+                score = float(match.group(1))
+                return min(10.0, max(0.0, score))
+            return 0.0
+        except:
+            return 0.0
+
+
+
+    def _estimate_optimal_layer_for_issue(self, issue_type: str) -> int:
+        """Estimate optimal layer using model analysis of issue complexity."""
+        prompt = f"""What transformer layer would be optimal for detecting this AI safety issue?
+
+Issue Type: {issue_type}
+
+Consider:
+- Simple issues (formatting, basic patterns) → early layers (8-12)
+- Complex semantic issues (truthfulness, bias) → middle layers (12-16)
+- Abstract conceptual issues (coherence, quality) → deeper layers (16-20)
+
+Respond with just the layer number (8-20):"""
+
+        try:
+            response = self.model.generate(prompt, layer_index=15, max_new_tokens=10, temperature=0.1)
+            layer_str = response.strip()
+
+            import re
+            match = re.search(r'(\d+)', layer_str)
+            if match:
+                layer = int(match.group(1))
+                return max(8, min(20, layer))  # Clamp to valid range
+            return 14  # Default middle layer
+        except:
+            return 14
+
+    def _complete_creation_estimate(self, base: Dict[str, Any], available_benchmarks: List[str], issue_type: str) -> ClassifierCreationEstimate:
+        """Complete the creation estimate with hardware adjustments."""
+        # Adjust based on model and hardware
+        hardware_multiplier = self._estimate_hardware_speed()
+        training_time = base["training_time_minutes"] * hardware_multiplier
+
+        # Confidence based on data availability
+        confidence = 0.8 if available_benchmarks else 0.6  # Higher confidence with benchmark data
+
+        return ClassifierCreationEstimate(
+            issue_type=issue_type,
+            estimated_training_time_minutes=training_time,
+            estimated_quality_score=base["quality_score"],
+            training_samples_needed=base["samples_needed"],
+            optimal_layer=base["optimal_layer"],
+            confidence=confidence
+        )
+
+    def _estimate_hardware_speed(self) -> float:
+        """Estimate hardware speed multiplier for training time."""
+        device_kind = resolve_default_device()
+        if device_kind == "cuda":
+            return 0.3
+        if device_kind == "mps":
+            return 0.5
+        return 1.0
+
+    def get_marketplace_summary(self) -> str:
+        """Get a summary of the classifier marketplace."""
+        if not self.available_classifiers:
+            self.discover_available_classifiers()
+
+        if not self.available_classifiers:
+            return "šŸŖ Classifier Marketplace: No classifiers available"
+
+        summary = f"\nšŸŖ Classifier Marketplace Summary\n"
+        summary += f"{'='*50}\n"
+        summary += f"Available Classifiers: {len(self.available_classifiers)}\n\n"
+
+        # Group by issue type
+        by_issue_type = {}
+        for classifier in self.available_classifiers:
+            issue_type = classifier.issue_type
+            if issue_type not in by_issue_type:
+                by_issue_type[issue_type] = []
+            by_issue_type[issue_type].append(classifier)
+
+        for issue_type, classifiers in by_issue_type.items():
+            best_classifier = max(classifiers, key=lambda x: x.quality_score)
+            summary += f"šŸ“Š {issue_type.upper()}: {len(classifiers)} available\n"
+            summary += f" Best: {os.path.basename(best_classifier.path)} "
+            summary += f"(Quality: {best_classifier.quality_score:.3f}, Layer: {best_classifier.layer})\n"
+            summary += f" Samples: {best_classifier.training_samples}, "
+            summary += f"Model: {best_classifier.model_family}\n\n"
+
+        return summary
+
+    def filter_classifiers(self,
+                           issue_types: List[str] = None,
+                           min_quality: float = 0.0,
+                           model_family: str = None,
+                           layers: List[int] = None) -> List[ClassifierListing]:
+        """
+        Filter available classifiers by criteria.
+
+        Args:
+            issue_types: List of issue types to include
+            min_quality: Minimum quality score
+            model_family: Required model family
+            layers: Allowed layers
+
+        Returns:
+            Filtered list of classifier listings
+        """
+        filtered = self.available_classifiers
+
+        if issue_types:
+            filtered = [c for c in filtered if c.issue_type in issue_types]
+
+        if min_quality > 0:
+            filtered = [c for c in filtered if c.quality_score >= min_quality]
+
+        if model_family:
+            filtered = [c for c in filtered if c.model_family == model_family]
+
+        if layers:
+            filtered = [c for c in filtered if c.layer in layers]
+
+        return filtered
+
+    async def create_classifier_on_demand(self,
+                                          issue_type: str,
+                                          custom_layer: int = None) -> ClassifierListing:
+        """
+        Create a new classifier on demand.
+
+        Args:
+            issue_type: Type of issue to create classifier for
+            custom_layer: Optional custom layer (otherwise uses optimal)
+
+        Returns:
+            Newly created classifier listing
+        """
+        from .create_classifier import create_classifier_on_demand
+
+        print(f"šŸ—ļø Creating new classifier for {issue_type}...")
+
+        # Get creation estimate
+        estimate = self.get_creation_estimate(issue_type)
+        layer = custom_layer or estimate.optimal_layer
+
+        # Create save path
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        save_path = f"./models/agent_created_{issue_type}_layer{layer}_{timestamp}.pkl"
+        os.makedirs(os.path.dirname(save_path), exist_ok=True)
+
+        # Create classifier
+        start_time = time.time()
+        result = create_classifier_on_demand(
+            model=self.model,
+            issue_type=issue_type,
+            layer=layer,
+            save_path=save_path,
+            optimize=True
+        )
+        training_time = time.time() - start_time
+
+        # Create listing for the new classifier
+        listing = ClassifierListing(
+            path=result.save_path,
+            layer=result.config.layer,
+            issue_type=issue_type,
+            threshold=result.config.threshold,
+            quality_score=result.performance_metrics.get('f1', 0.0),
+            training_samples=result.performance_metrics.get('training_samples', 0),
+            model_family=self._extract_model_family(self.model.model_name),
+            created_at=datetime.now().isoformat(),
+            training_time_seconds=training_time,
+            metadata=result.performance_metrics
+        )
+
+        # Add to available classifiers
+        self.available_classifiers.append(listing)
+        self.available_classifiers.sort(key=lambda x: x.quality_score, reverse=True)
+
+        print(f" āœ… Created classifier in {training_time/60:.1f} minutes")
+        print(f" šŸ“Š Quality score: {listing.quality_score:.3f}")
+
+        return listing
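
For orientation, a minimal usage sketch of the ClassifierMarketplace class introduced in this release is shown below. It is not part of the diff: the StubModel class and its canned responses are assumptions standing in for a real wisent model wrapper that exposes generate() and get_available_tasks(), which is what the class calls internally.

# Hypothetical usage sketch (not shipped in the package); StubModel is an assumed
# stand-in for a model wrapper with generate() and get_available_tasks().
from wisent.core.agent.diagnose.classifier_marketplace import ClassifierMarketplace

class StubModel:
    model_name = "llama-3-8b"

    def generate(self, prompt, layer_index=15, max_new_tokens=10, temperature=0.1):
        return "hallucination"  # canned answer for the marketplace's prompts

    def get_available_tasks(self):
        return ["truthfulqa_mc1", "winogrande"]

marketplace = ClassifierMarketplace(model=StubModel(), search_paths=["./classifiers/"])

listings = marketplace.discover_available_classifiers()   # scan search paths for *.pkl files
print(marketplace.get_marketplace_summary())               # grouped, best-first overview

good = marketplace.filter_classifiers(issue_types=["hallucination"], min_quality=0.6)
if not good:
    estimate = marketplace.get_creation_estimate("hallucination")
    print(estimate.estimated_training_time_minutes, estimate.optimal_layer)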