wisent 0.1.1__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.


This version of wisent might be problematic.

Files changed (237)
  1. wisent/__init__.py +1 -8
  2. wisent/benchmarks/__init__.py +0 -0
  3. wisent/benchmarks/coding/__init__.py +0 -0
  4. wisent/benchmarks/coding/metrics/__init__.py +0 -0
  5. wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
  6. wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
  7. wisent/benchmarks/coding/metrics/evaluator.py +275 -0
  8. wisent/benchmarks/coding/metrics/passk.py +66 -0
  9. wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
  10. wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
  11. wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
  12. wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
  13. wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
  14. wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
  15. wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
  16. wisent/benchmarks/coding/providers/__init__.py +18 -0
  17. wisent/benchmarks/coding/providers/core/__init__.py +0 -0
  18. wisent/benchmarks/coding/providers/core/atoms.py +31 -0
  19. wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
  20. wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
  21. wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
  22. wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
  23. wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
  24. wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
  25. wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
  26. wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
  27. wisent/classifiers/__init__.py +0 -0
  28. wisent/classifiers/core/__init__.py +0 -0
  29. wisent/classifiers/core/atoms.py +747 -0
  30. wisent/classifiers/models/__init__.py +0 -0
  31. wisent/classifiers/models/logistic.py +29 -0
  32. wisent/classifiers/models/mlp.py +47 -0
  33. wisent/cli/__init__.py +0 -0
  34. wisent/cli/classifiers/__init__.py +0 -0
  35. wisent/cli/classifiers/classifier_rotator.py +137 -0
  36. wisent/cli/cli_logger.py +142 -0
  37. wisent/cli/data_loaders/__init__.py +0 -0
  38. wisent/cli/data_loaders/data_loader_rotator.py +96 -0
  39. wisent/cli/evaluators/__init__.py +0 -0
  40. wisent/cli/evaluators/evaluator_rotator.py +148 -0
  41. wisent/cli/steering_methods/__init__.py +0 -0
  42. wisent/cli/steering_methods/steering_rotator.py +110 -0
  43. wisent/cli/wisent_cli/__init__.py +0 -0
  44. wisent/cli/wisent_cli/commands/__init__.py +0 -0
  45. wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
  46. wisent/cli/wisent_cli/commands/listing.py +154 -0
  47. wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
  48. wisent/cli/wisent_cli/main.py +93 -0
  49. wisent/cli/wisent_cli/shell.py +80 -0
  50. wisent/cli/wisent_cli/ui.py +69 -0
  51. wisent/cli/wisent_cli/util/__init__.py +0 -0
  52. wisent/cli/wisent_cli/util/aggregations.py +43 -0
  53. wisent/cli/wisent_cli/util/parsing.py +126 -0
  54. wisent/cli/wisent_cli/version.py +4 -0
  55. wisent/core/__init__.py +27 -0
  56. wisent/core/activations/__init__.py +0 -0
  57. wisent/core/activations/activations_collector.py +338 -0
  58. wisent/core/activations/core/__init__.py +0 -0
  59. wisent/core/activations/core/atoms.py +216 -0
  60. wisent/core/agent/__init__.py +18 -0
  61. wisent/core/agent/budget.py +638 -0
  62. wisent/core/agent/device_benchmarks.py +685 -0
  63. wisent/core/agent/diagnose/__init__.py +55 -0
  64. wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
  65. wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
  66. wisent/core/agent/diagnose/create_classifier.py +1154 -0
  67. wisent/core/agent/diagnose/response_diagnostics.py +268 -0
  68. wisent/core/agent/diagnose/select_classifiers.py +506 -0
  69. wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
  70. wisent/core/agent/diagnose/tasks/__init__.py +33 -0
  71. wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
  72. wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
  73. wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
  74. wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
  75. wisent/core/agent/diagnose.py +242 -0
  76. wisent/core/agent/steer.py +212 -0
  77. wisent/core/agent/timeout.py +134 -0
  78. wisent/core/autonomous_agent.py +1234 -0
  79. wisent/core/bigcode_integration.py +583 -0
  80. wisent/core/contrastive_pairs/__init__.py +15 -0
  81. wisent/core/contrastive_pairs/core/__init__.py +0 -0
  82. wisent/core/contrastive_pairs/core/atoms.py +45 -0
  83. wisent/core/contrastive_pairs/core/buliders.py +59 -0
  84. wisent/core/contrastive_pairs/core/pair.py +178 -0
  85. wisent/core/contrastive_pairs/core/response.py +152 -0
  86. wisent/core/contrastive_pairs/core/serialization.py +300 -0
  87. wisent/core/contrastive_pairs/core/set.py +133 -0
  88. wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
  89. wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
  90. wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
  91. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
  92. wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
  93. wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
  94. wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
  95. wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
  96. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
  102. wisent/core/data_loaders/__init__.py +0 -0
  103. wisent/core/data_loaders/core/__init__.py +0 -0
  104. wisent/core/data_loaders/core/atoms.py +98 -0
  105. wisent/core/data_loaders/loaders/__init__.py +0 -0
  106. wisent/core/data_loaders/loaders/custom.py +120 -0
  107. wisent/core/data_loaders/loaders/lm_loader.py +218 -0
  108. wisent/core/detection_handling.py +257 -0
  109. wisent/core/download_full_benchmarks.py +1386 -0
  110. wisent/core/evaluators/__init__.py +0 -0
  111. wisent/core/evaluators/oracles/__init__.py +0 -0
  112. wisent/core/evaluators/oracles/interactive.py +73 -0
  113. wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
  114. wisent/core/evaluators/oracles/user_specified.py +67 -0
  115. wisent/core/hyperparameter_optimizer.py +429 -0
  116. wisent/core/lm_eval_harness_ground_truth.py +1396 -0
  117. wisent/core/log_likelihoods_evaluator.py +321 -0
  118. wisent/core/managed_cached_benchmarks.py +595 -0
  119. wisent/core/mixed_benchmark_sampler.py +364 -0
  120. wisent/core/model_config_manager.py +330 -0
  121. wisent/core/model_persistence.py +317 -0
  122. wisent/core/models/__init__.py +0 -0
  123. wisent/core/models/core/__init__.py +0 -0
  124. wisent/core/models/core/atoms.py +460 -0
  125. wisent/core/models/wisent_model.py +727 -0
  126. wisent/core/multi_steering.py +316 -0
  127. wisent/core/optuna/__init__.py +57 -0
  128. wisent/core/optuna/classifier/__init__.py +25 -0
  129. wisent/core/optuna/classifier/activation_generator.py +349 -0
  130. wisent/core/optuna/classifier/classifier_cache.py +509 -0
  131. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
  132. wisent/core/optuna/steering/__init__.py +0 -0
  133. wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
  134. wisent/core/optuna/steering/data_utils.py +342 -0
  135. wisent/core/optuna/steering/metrics.py +474 -0
  136. wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
  137. wisent/core/optuna/steering/steering_optimization.py +1111 -0
  138. wisent/core/parser.py +1668 -0
  139. wisent/core/prompts/__init__.py +0 -0
  140. wisent/core/prompts/core/__init__.py +0 -0
  141. wisent/core/prompts/core/atom.py +57 -0
  142. wisent/core/prompts/core/prompt_formater.py +157 -0
  143. wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
  144. wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
  145. wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
  146. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
  147. wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
  148. wisent/core/representation.py +5 -0
  149. wisent/core/sample_size_optimizer.py +648 -0
  150. wisent/core/sample_size_optimizer_v2.py +355 -0
  151. wisent/core/save_results.py +277 -0
  152. wisent/core/steering.py +652 -0
  153. wisent/core/steering_method.py +26 -0
  154. wisent/core/steering_methods/__init__.py +0 -0
  155. wisent/core/steering_methods/core/__init__.py +0 -0
  156. wisent/core/steering_methods/core/atoms.py +153 -0
  157. wisent/core/steering_methods/methods/__init__.py +0 -0
  158. wisent/core/steering_methods/methods/caa.py +44 -0
  159. wisent/core/steering_optimizer.py +1297 -0
  160. wisent/core/task_interface.py +132 -0
  161. wisent/core/task_selector.py +189 -0
  162. wisent/core/tasks/__init__.py +175 -0
  163. wisent/core/tasks/aime_task.py +141 -0
  164. wisent/core/tasks/file_task.py +211 -0
  165. wisent/core/tasks/hle_task.py +180 -0
  166. wisent/core/tasks/hmmt_task.py +119 -0
  167. wisent/core/tasks/livecodebench_task.py +201 -0
  168. wisent/core/tasks/livemathbench_task.py +158 -0
  169. wisent/core/tasks/lm_eval_task.py +455 -0
  170. wisent/core/tasks/math500_task.py +84 -0
  171. wisent/core/tasks/polymath_task.py +146 -0
  172. wisent/core/tasks/supergpqa_task.py +220 -0
  173. wisent/core/time_estimator.py +149 -0
  174. wisent/core/timing_calibration.py +174 -0
  175. wisent/core/tracking/__init__.py +54 -0
  176. wisent/core/tracking/latency.py +618 -0
  177. wisent/core/tracking/memory.py +359 -0
  178. wisent/core/trainers/__init__.py +0 -0
  179. wisent/core/trainers/core/__init__.py +11 -0
  180. wisent/core/trainers/core/atoms.py +45 -0
  181. wisent/core/trainers/steering_trainer.py +271 -0
  182. wisent/core/user_model_config.py +158 -0
  183. wisent/opti/__init__.py +0 -0
  184. wisent/opti/core/__init__.py +0 -0
  185. wisent/opti/core/atoms.py +175 -0
  186. wisent/opti/methods/__init__.py +0 -0
  187. wisent/opti/methods/opti_classificator.py +172 -0
  188. wisent/opti/methods/opti_steering.py +138 -0
  189. wisent/synthetic/__init__.py +0 -0
  190. wisent/synthetic/cleaners/__init__.py +0 -0
  191. wisent/synthetic/cleaners/core/__init__.py +0 -0
  192. wisent/synthetic/cleaners/core/atoms.py +58 -0
  193. wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
  194. wisent/synthetic/cleaners/methods/__init__.py +0 -0
  195. wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
  196. wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
  197. wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
  198. wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
  199. wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
  200. wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
  201. wisent/synthetic/db_instructions/__init__.py +0 -0
  202. wisent/synthetic/db_instructions/core/__init__.py +0 -0
  203. wisent/synthetic/db_instructions/core/atoms.py +25 -0
  204. wisent/synthetic/db_instructions/mini_dp.py +37 -0
  205. wisent/synthetic/generators/__init__.py +0 -0
  206. wisent/synthetic/generators/core/__init__.py +0 -0
  207. wisent/synthetic/generators/core/atoms.py +73 -0
  208. wisent/synthetic/generators/diversities/__init__.py +0 -0
  209. wisent/synthetic/generators/diversities/core/__init__.py +0 -0
  210. wisent/synthetic/generators/diversities/core/core.py +68 -0
  211. wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
  212. wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
  213. wisent/synthetic/generators/pairs_generator.py +179 -0
  214. wisent-0.5.1.dist-info/METADATA +67 -0
  215. wisent-0.5.1.dist-info/RECORD +218 -0
  216. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/WHEEL +1 -1
  217. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info/licenses}/LICENSE +2 -2
  218. wisent/activations/__init__.py +0 -9
  219. wisent/activations/client.py +0 -97
  220. wisent/activations/extractor.py +0 -251
  221. wisent/activations/models.py +0 -95
  222. wisent/client.py +0 -45
  223. wisent/control_vector/__init__.py +0 -9
  224. wisent/control_vector/client.py +0 -85
  225. wisent/control_vector/manager.py +0 -168
  226. wisent/control_vector/models.py +0 -70
  227. wisent/inference/__init__.py +0 -9
  228. wisent/inference/client.py +0 -103
  229. wisent/inference/inferencer.py +0 -250
  230. wisent/inference/models.py +0 -66
  231. wisent/utils/__init__.py +0 -3
  232. wisent/utils/auth.py +0 -30
  233. wisent/utils/http.py +0 -228
  234. wisent/version.py +0 -3
  235. wisent-0.1.1.dist-info/METADATA +0 -142
  236. wisent-0.1.1.dist-info/RECORD +0 -23
  237. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/top_level.txt +0 -0
wisent/core/agent/diagnose/agent_classifier_decision.py
@@ -0,0 +1,641 @@
+ from typing import List, Dict, Any, Optional, Set
+ from dataclasses import dataclass
+ import re
+ import asyncio
+ import time
+ import sys
+ import os
+
+ # Add the lm-harness-integration path for benchmark selection
+ sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'lm-harness-integration'))
+
+ from .classifier_marketplace import ClassifierMarketplace, ClassifierListing, ClassifierCreationEstimate
+ from ..budget import get_budget_manager, track_task_performance, ResourceType
+
+ @dataclass
+ class TaskAnalysis:
+     """Analysis of what classifiers might be needed for a task."""
+     prompt_content: str
+     relevant_benchmarks: List[Dict[str, Any]] = None # Selected benchmarks for training and steering
+
+ @dataclass
+ class ClassifierDecision:
+     """A decision about whether to use an existing classifier or create a new one."""
+     benchmark_name: str
+     action: str # "use_existing", "create_new", "skip"
+     selected_classifier: Optional[ClassifierListing] = None
+     creation_estimate: Optional[ClassifierCreationEstimate] = None
+     reasoning: str = ""
+     confidence: float = 0.0
+
+ @dataclass
+ class SingleClassifierDecision:
+     """Decision about creating one combined classifier from multiple benchmarks."""
+     benchmark_names: List[str]
+     action: str # "use_existing", "create_new", "skip"
+     selected_classifier: Optional[ClassifierListing] = None
+     creation_estimate: Optional[ClassifierCreationEstimate] = None
+     reasoning: str = ""
+     confidence: float = 0.0
+
+ @dataclass
+ class ClassifierParams:
+     """Model-determined classifier parameters."""
+     optimal_layer: int # 8-20: Based on semantic complexity needed
+     classification_threshold: float # 0.1-0.9: Based on quality strictness required
+     training_samples: int # 10-50: Based on complexity and time constraints
+     classifier_type: str # logistic/svm/neural: Based on data characteristics
+     reasoning: str = ""
+     model_name: str = "unknown" # Model name for matching existing classifiers
+
+     # Additional classifier configuration parameters
+     aggregation_method: str = "last_token" # last_token/mean/max for activation aggregation
+     token_aggregation: str = "average" # average/final/first/max/min for token score aggregation
+     num_epochs: int = 50
+     batch_size: int = 32
+     learning_rate: float = 0.001
+     early_stopping_patience: int = 10
+     hidden_dim: int = 128
+
+ @dataclass
+ class SteeringParams:
+     """Model-determined steering parameters."""
+     steering_method: str # CAA/HPR/DAC/BiPO/KSteering: Best fit for prompt type
+     initial_strength: float # 0.1-2.0: How aggressive to start
+     increment: float # 0.1-0.5: How much to increase per failed attempt
+     maximum_strength: float # 0.5-3.0: Upper limit to prevent over-steering
+     method_specific_params: Dict[str, Any] = None # Beta values, thresholds, etc.
+     reasoning: str = ""
+
+ @dataclass
+ class QualityResult:
+     """Result of quality evaluation."""
+     score: float # Classifier prediction score
+     acceptable: bool # Model judgment if quality is acceptable
+     reasoning: str = ""
+
+ @dataclass
+ class QualityControlledResponse:
+     """Final response with complete metadata."""
+     response_text: str
+     final_quality_score: float
+     attempts_needed: int
+     classifier_params_used: ClassifierParams
+     steering_params_used: Optional[SteeringParams] = None
+     quality_progression: List[float] = None # Quality scores for each attempt
+     total_time_seconds: float = 0.0
+
+ class AgentClassifierDecisionSystem:
+     """
+     Intelligent system that helps the agent make autonomous decisions about
+     which classifiers to use based on task analysis and cost-benefit considerations.
+     """
+
+     def __init__(self, marketplace: ClassifierMarketplace):
+         self.marketplace = marketplace
+         self.decision_history: List[ClassifierDecision] = []
+
+     def analyze_task_requirements(self, prompt: str, context: str = "",
+                                   priority: str = "all", fast_only: bool = False,
+                                   time_budget_minutes: float = 5.0, max_benchmarks: int = 1) -> TaskAnalysis:
+         """
+         Analyze a task/prompt to select relevant benchmarks for training and steering.
+
+         Args:
+             prompt: The prompt or task to analyze
+             context: Additional context about the task
+             priority: Priority level for benchmark selection
+             fast_only: Only use fast benchmarks
+             time_budget_minutes: Time budget for benchmark selection
+             max_benchmarks: Maximum number of benchmarks to select
+             prefer_fast: Prefer fast benchmarks
+
+         Returns:
+             Analysis with relevant benchmarks for direct use
+         """
+         print(f"šŸ” Analyzing task requirements for prompt...")
+
+         # Get relevant benchmarks for the prompt using priority-aware selection
+         existing_model = getattr(self.marketplace, 'model', None)
+         relevant_benchmarks = self._get_relevant_benchmarks_for_prompt(
+             prompt,
+             existing_model=existing_model,
+             priority=priority,
+             fast_only=fast_only,
+             time_budget_minutes=time_budget_minutes,
+             max_benchmarks=max_benchmarks
+         )
+         print(f" šŸ“Š Found {len(relevant_benchmarks)} relevant benchmarks")
+
+         return TaskAnalysis(
+             prompt_content=prompt,
+             relevant_benchmarks=relevant_benchmarks
+         )
+
+     def _get_relevant_benchmarks_for_prompt(self, prompt: str, existing_model=None,
+                                             priority: str = "all", fast_only: bool = False,
+                                             time_budget_minutes: float = 5.0, max_benchmarks: int = 1) -> List[Dict[str, Any]]:
+         """Get relevant benchmarks for the prompt using the intelligent selection system with priority awareness."""
+         try:
+             # Import the benchmark selection function from the correct location
+             sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..', 'lm-harness-integration'))
+             from populate_tasks import get_relevant_benchmarks_for_prompt
+
+             # Use priority-aware selection with provided parameters
+             relevant_benchmarks = get_relevant_benchmarks_for_prompt(
+                 prompt=prompt,
+                 max_benchmarks=max_benchmarks,
+                 existing_model=existing_model,
+                 priority=priority,
+                 fast_only=fast_only,
+                 time_budget_minutes=time_budget_minutes
+             )
+
+             return relevant_benchmarks
+         except Exception as e:
+             print(f" āš ļø Failed to get relevant benchmarks: {e}")
+             # Fallback to basic high-priority benchmarks
+             return [
+                 {'benchmark': 'mmlu', 'explanation': 'General knowledge benchmark', 'relevance_score': 1, 'priority': 'high', 'loading_time': 9.5},
+                 {'benchmark': 'truthfulqa_mc1', 'explanation': 'Truthfulness benchmark', 'relevance_score': 2, 'priority': 'high', 'loading_time': 11.2},
+                 {'benchmark': 'hellaswag', 'explanation': 'Commonsense reasoning benchmark', 'relevance_score': 3, 'priority': 'high', 'loading_time': 12.8}
+             ]
+
+
+
+
+
+     async def create_single_quality_classifier(self,
+                                                task_analysis: TaskAnalysis,
+                                                classifier_params: 'ClassifierParams',
+                                                quality_threshold: float = 0.3,
+                                                time_budget_minutes: float = 10.0) -> SingleClassifierDecision:
+         """
+         Create a single classifier trained on one benchmark.
+
+         Args:
+             task_analysis: Analysis with relevant benchmarks
+             classifier_params: Model-determined classifier parameters
+             quality_threshold: Minimum quality score to accept existing classifiers
+             time_budget_minutes: Maximum time budget for creating new classifiers
+
+         Returns:
+             Single classifier decision for the selected benchmark
+         """
+         print(f"šŸ” Creating single quality classifier from {len(task_analysis.relevant_benchmarks)} benchmark(s)...")
+
+         # Extract benchmark names (should be just one now)
+         benchmark_names = [b['benchmark'] for b in task_analysis.relevant_benchmarks]
+
+         if not benchmark_names:
+             return SingleClassifierDecision(
+                 benchmark_names=[],
+                 action="skip",
+                 reasoning="No benchmarks selected for classifier training",
+                 confidence=0.0
+             )
+
+         # Use first (and should be only) benchmark
+         benchmark_name = benchmark_names[0]
+         print(f" šŸ“Š Using benchmark: {benchmark_name}")
+
+         # Set up budget manager
+         budget_manager = get_budget_manager()
+         budget_manager.set_time_budget(time_budget_minutes)
+
+         # Look for existing classifier for this exact model/layer/benchmark combination
+         available_classifiers = self.marketplace.discover_available_classifiers()
+         model_name = classifier_params.model_name if hasattr(classifier_params, 'model_name') else "unknown"
+         layer = classifier_params.optimal_layer
+
+         # Create specific classifier identifier
+         classifier_id = f"{model_name}_{benchmark_name}_layer_{layer}"
+
+         print(f" šŸ” Checking for existing classifier: {classifier_id}")
+
+         # Find existing classifier with exact match
+         existing_classifier = None
+         for classifier in available_classifiers:
+             # Check if classifier matches our exact requirements
+             if (benchmark_name.lower() in classifier.path.lower() and
+                 str(layer) in classifier.path and
+                 classifier.layer == layer):
+                 existing_classifier = classifier
+                 print(f" āœ… Found existing classifier: {classifier.path}")
+                 break
+
+         # Decision logic for single benchmark classifier
+         if existing_classifier and existing_classifier.quality_score >= quality_threshold:
+             return SingleClassifierDecision(
+                 benchmark_names=[benchmark_name],
+                 action="use_existing",
+                 selected_classifier=existing_classifier,
+                 reasoning=f"Found existing classifier for {benchmark_name} at layer {layer} with quality {existing_classifier.quality_score:.2f}",
+                 confidence=existing_classifier.quality_score
+             )
+
+         # Get creation estimate for single benchmark classifier
+         creation_estimate = self.marketplace.get_creation_estimate(benchmark_name)
+
+         # Check if we can afford to create new classifier
+         training_time_seconds = creation_estimate.estimated_training_time_minutes * 60
+         time_budget = budget_manager.get_budget(ResourceType.TIME)
+
+         if time_budget.can_afford(training_time_seconds):
+             return SingleClassifierDecision(
+                 benchmark_names=[benchmark_name],
+                 action="create_new",
+                 creation_estimate=creation_estimate,
+                 reasoning=f"Creating new classifier for {benchmark_name} at layer {layer}",
+                 confidence=creation_estimate.confidence
+             )
+         else:
+             return SingleClassifierDecision(
+                 benchmark_names=[benchmark_name],
+                 action="skip",
+                 reasoning=f"Insufficient time budget for creation (need {creation_estimate.estimated_training_time_minutes:.1f}min)",
+                 confidence=0.0
+             )
+
+     async def execute_single_classifier_decision(self, decision: SingleClassifierDecision, classifier_params: 'ClassifierParams') -> Optional[Any]:
+         """
+         Execute the single classifier decision to create or use the benchmark classifier.
+
+         Args:
+             decision: The single classifier decision to execute
+             classifier_params: Model-determined classifier parameters
+
+         Returns:
+             The trained classifier instance or None if skipped
+         """
+         if decision.action == "skip":
+             print(f" ā¹ļø Skipping classifier creation: {decision.reasoning}")
+             return None
+
+         elif decision.action == "use_existing":
+             print(f" šŸ“¦ Using existing classifier: {decision.selected_classifier.path}")
+             print(f" Quality: {decision.selected_classifier.quality_score:.3f}")
+             print(f" Layer: {decision.selected_classifier.layer}")
+             return decision.selected_classifier
+
+         elif decision.action == "create_new":
+             benchmark_name = decision.benchmark_names[0] if decision.benchmark_names else "unknown"
+             print(f" šŸ—ļø Creating new classifier for benchmark: {benchmark_name}")
+             start_time = time.time()
+             try:
+                 # Create classifier using single benchmark training data
+                 new_classifier = await self._create_single_benchmark_classifier(
+                     benchmark_name=benchmark_name,
+                     classifier_params=classifier_params
+                 )
+
+                 creation_time = time.time() - start_time
+                 print(f" āœ… Classifier created successfully in {creation_time:.1f}s")
+                 return new_classifier
+
+             except Exception as e:
+                 print(f" āŒ Failed to create classifier: {e}")
+                 return None
+
+         return None
+
+     async def _create_single_benchmark_classifier(self, benchmark_name: str, classifier_params: 'ClassifierParams') -> Optional[Any]:
+         """
+         Create a classifier for a single benchmark.
+
+         Args:
+             benchmark_name: Name of the benchmark to use for training
+             classifier_params: Model-determined classifier parameters
+
+         Returns:
+             The trained classifier instance or None if failed
+         """
+         from .create_classifier import ClassifierCreator
+         from ...training_config import TrainingConfig
+
+         try:
+             # Create training config
+             config = TrainingConfig(
+                 issue_type=f"quality_{benchmark_name}",
+                 layer=classifier_params.optimal_layer,
+                 classifier_type=classifier_params.classifier_type,
+                 threshold=classifier_params.classification_threshold,
+                 training_samples=classifier_params.training_samples,
+                 model_name=self.marketplace.model.name if self.marketplace.model else "unknown"
+             )
+
+             # Create classifier creator
+             creator = ClassifierCreator(self.marketplace.model)
+
+             # Create classifier using benchmark-specific training data
+             result = await creator.create_classifier_for_issue_with_benchmarks(
+                 issue_type=f"quality_{benchmark_name}",
+                 relevant_benchmarks=[benchmark_name],
+                 layer=classifier_params.optimal_layer,
+                 num_samples=classifier_params.training_samples,
+                 config=config
+             )
+
+             return result.classifier if result else None
+
+         except Exception as e:
+             print(f" āŒ Error in single benchmark classifier creation: {e}")
+             raise
+
+     async def _create_combined_classifier(self, benchmark_names: List[str], classifier_params: 'ClassifierParams'):
+         """
+         Create a classifier using combined training data from multiple benchmarks.
+
+         Args:
+             benchmark_names: List of benchmark names to combine
+             classifier_params: Model-determined parameters for classifier creation
+
+         Returns:
+             Trained classifier instance
+         """
+         from .create_classifier import ClassifierCreator
+
+         try:
+             # Initialize classifier creator
+             creator = ClassifierCreator(self.marketplace.model)
+
+             # Create classifier using combined benchmark training data
+             print(f" šŸ“Š Loading combined training data from benchmarks: {benchmark_names}")
+             classifier = await creator.create_combined_benchmark_classifier(
+                 benchmark_names=benchmark_names,
+                 classifier_params=classifier_params
+             )
+
+             return classifier
+
+         except Exception as e:
+             print(f" āŒ Error in combined classifier creation: {e}")
+             raise
+
+     async def make_classifier_decisions(self,
+                                         task_analysis: TaskAnalysis,
+                                         quality_threshold: float = 0.3,
+                                         time_budget_minutes: float = 10.0,
+                                         max_classifiers: int = None) -> List[ClassifierDecision]:
+         """
+         Make decisions about which benchmark-specific classifiers to create or use.
+
+         Args:
+             task_analysis: Analysis with relevant benchmarks
+             quality_threshold: Minimum quality score to accept existing classifiers
+             time_budget_minutes: Maximum time budget for creating new classifiers
+             max_classifiers: Maximum number of classifiers to use (None = no limit)
+
+         Returns:
+             List of classifier decisions for each benchmark
+         """
+         # Set up budget manager
+         budget_manager = get_budget_manager()
+         budget_manager.set_time_budget(time_budget_minutes)
+
+         # Discover available classifiers
+         await asyncio.sleep(0) # Make this async-compatible
+         available_classifiers = self.marketplace.discover_available_classifiers()
+
+         decisions = []
+         classifier_count = 0
+
+         # Create one classifier per relevant benchmark
+         for benchmark_info in task_analysis.relevant_benchmarks:
+             if max_classifiers and classifier_count >= max_classifiers:
+                 print(f" ā¹ļø Reached maximum classifier limit ({max_classifiers})")
+                 break
+
+             benchmark_name = benchmark_info['benchmark']
+             print(f"\n šŸ” Analyzing classifier for benchmark: {benchmark_name}")
+
+             # Look for existing benchmark-specific classifier
+             existing_options = [c for c in available_classifiers if benchmark_name.lower() in c.path.lower()]
+             best_existing = max(existing_options, key=lambda x: x.quality_score) if existing_options else None
+
+             # Get creation estimate for this benchmark
+             creation_estimate = self.marketplace.get_creation_estimate(benchmark_name)
+
+             # Make decision based on multiple factors
+             decision = self._evaluate_benchmark_classifier_options(
+                 benchmark_name=benchmark_name,
+                 best_existing=best_existing,
+                 creation_estimate=creation_estimate,
+                 quality_threshold=quality_threshold,
+                 budget_manager=budget_manager
+             )
+
+             decisions.append(decision)
+
+             # Update budget and count
+             if decision.action == "create_new":
+                 training_time_seconds = creation_estimate.estimated_training_time_minutes * 60
+                 budget_manager.get_budget(ResourceType.TIME).spend(training_time_seconds)
+                 classifier_count += 1
+                 remaining_minutes = budget_manager.get_budget(ResourceType.TIME).remaining_budget / 60
+                 print(f" ā±ļø Remaining time budget: {remaining_minutes:.1f} minutes")
+             elif decision.action == "use_existing":
+                 classifier_count += 1
+
+             print(f" āœ… Decision: {decision.action} - {decision.reasoning}")
+
+         # Store decisions in history
+         self.decision_history.extend(decisions)
+
+         return decisions
+
+     def _evaluate_benchmark_classifier_options(self,
+                                                benchmark_name: str,
+                                                best_existing: Optional[ClassifierListing],
+                                                creation_estimate: ClassifierCreationEstimate,
+                                                quality_threshold: float,
+                                                budget_manager) -> ClassifierDecision:
+         """Evaluate whether to use existing, create new, or skip a benchmark-specific classifier."""
+
+         # Factor 1: Existing classifier quality
+         existing_quality = best_existing.quality_score if best_existing else 0.0
+
+         # Factor 2: Time constraints
+         time_budget = budget_manager.get_budget(ResourceType.TIME)
+         training_time_seconds = creation_estimate.estimated_training_time_minutes * 60
+         can_afford_creation = time_budget.can_afford(training_time_seconds)
+
+         # Factor 3: Expected benefit vs cost
+         creation_benefit = creation_estimate.estimated_quality_score
+         existing_benefit = existing_quality
+
+         # Decision logic
+         if best_existing and existing_quality >= quality_threshold:
+             if existing_quality >= creation_benefit or not can_afford_creation:
+                 return ClassifierDecision(
+                     benchmark_name=benchmark_name,
+                     action="use_existing",
+                     selected_classifier=best_existing,
+                     reasoning=f"Existing classifier quality {existing_quality:.2f} meets threshold",
+                     confidence=existing_quality
+                 )
+
+         if can_afford_creation and creation_benefit > existing_benefit:
+             return ClassifierDecision(
+                 benchmark_name=benchmark_name,
+                 action="create_new",
+                 creation_estimate=creation_estimate,
+                 reasoning=f"Creating new classifier (est. quality {creation_benefit:.2f} > existing {existing_benefit:.2f})",
+                 confidence=creation_estimate.confidence
+             )
+
+         if best_existing:
+             return ClassifierDecision(
+                 benchmark_name=benchmark_name,
+                 action="use_existing",
+                 selected_classifier=best_existing,
+                 reasoning=f"Using existing despite low quality - time/budget constraints",
+                 confidence=existing_quality * 0.7 # Penalty for low quality
+             )
+
+         return ClassifierDecision(
+             benchmark_name=benchmark_name,
+             action="skip",
+             reasoning="No suitable existing classifier and cannot create new within budget",
+             confidence=0.0
+         )
+
+     async def execute_decisions(self, decisions: List[ClassifierDecision]) -> List[Dict[str, Any]]:
+         """
+         Execute the classifier decisions and return the final classifier configs.
+
+         Args:
+             decisions: List of decisions to execute
+
+         Returns:
+             List of classifier configurations ready for use
+         """
+         classifier_configs = []
+
+         for decision in decisions:
+             if decision.action == "skip":
+                 continue
+
+             elif decision.action == "use_existing":
+                 config = decision.selected_classifier.to_config()
+                 classifier_configs.append(config)
+                 print(f" šŸ“Ž Using existing {decision.issue_type} classifier: {config['path']}")
+
+             elif decision.action == "create_new":
+                 print(f" šŸ—ļø Creating new classifier for benchmark: {decision.benchmark_name}...")
+                 start_time = time.time()
+                 try:
+                     # Create benchmark-specific classifier
+                     new_classifier = await self._create_classifier_for_benchmark(
+                         benchmark_name=decision.benchmark_name
+                     )
+
+                     end_time = time.time()
+
+                     # Track performance for future budget estimates
+                     track_task_performance(
+                         task_name=f"classifier_training_{decision.benchmark_name}",
+                         start_time=start_time,
+                         end_time=end_time
+                     )
+
+                     config = new_classifier.to_config()
+                     config['benchmark'] = decision.benchmark_name
+                     classifier_configs.append(config)
+                     print(f" āœ… Created: {config['path']} (took {end_time - start_time:.1f}s)")
+                 except Exception as e:
+                     print(f" āŒ Failed to create {decision.benchmark_name} classifier: {e}")
+                     continue
+
+         return classifier_configs
+
+     async def _create_classifier_for_benchmark(self, benchmark_name: str):
+         """
+         Create a classifier trained specifically on a benchmark dataset.
+
+         Args:
+             benchmark_name: Name of the benchmark to train on
+
+         Returns:
+             Trained classifier instance
+         """
+         from .create_classifier import ClassifierCreator
+
+         try:
+             # Initialize classifier creator
+             creator = ClassifierCreator(self.marketplace.model)
+
+             # Create classifier using benchmark-specific training data
+             print(f" šŸ“Š Loading training data from benchmark: {benchmark_name}")
+             classifier = await creator.create_classifier_for_issue_with_benchmarks(
+                 issue_type=benchmark_name, # Use benchmark name as issue type
+                 relevant_benchmarks=[benchmark_name],
+                 num_samples=50
+             )
+
+             return classifier
+
+         except Exception as e:
+             print(f" āš ļø Benchmark-based creation failed: {e}")
+             raise e
+
+     def get_decision_summary(self) -> str:
+         """Get a summary of recent classifier decisions."""
+         if not self.decision_history:
+             return "No classifier decisions made yet."
+
+         recent_decisions = self.decision_history[-10:] # Last 10 decisions
+
+         summary = "\nšŸ¤– Recent Classifier Decisions\n"
+         summary += "=" * 40 + "\n"
+
+         action_counts = {}
+         for decision in recent_decisions:
+             action_counts[decision.action] = action_counts.get(decision.action, 0) + 1
+
+         summary += f"Actions taken: {dict(action_counts)}\n\n"
+
+         for decision in recent_decisions[-5:]: # Show last 5
+             summary += f"• {decision.benchmark_name}: {decision.action}\n"
+             summary += f" Reasoning: {decision.reasoning}\n"
+             summary += f" Confidence: {decision.confidence:.2f}\n\n"
+
+         return summary
+
+     async def smart_classifier_selection(self,
+                                          prompt: str,
+                                          context: str = "",
+                                          quality_threshold: float = 0.3,
+                                          time_budget_minutes: float = 10.0,
+                                          max_classifiers: int = None) -> List[Dict[str, Any]]:
+         """
+         One-stop method for intelligent classifier selection.
+
+         Args:
+             prompt: The task/prompt to analyze
+             context: Additional context
+             quality_threshold: Minimum quality for existing classifiers
+             time_budget_minutes: Time budget for creating new classifiers
+             max_classifiers: Maximum number of classifiers to use
+
+         Returns:
+             List of classifier configurations ready for use
+         """
+         print(f"🧠 Smart classifier selection for task...")
+
+         # Step 1: Analyze task requirements
+         task_analysis = self.analyze_task_requirements(prompt, context)
+
+         # Step 2: Make decisions about classifiers
+         decisions = await self.make_classifier_decisions(
+             task_analysis=task_analysis,
+             quality_threshold=quality_threshold,
+             time_budget_minutes=time_budget_minutes,
+             max_classifiers=max_classifiers
+         )
+
+         # Step 3: Execute decisions
+         classifier_configs = await self.execute_decisions(decisions)
+
+         print(f"šŸŽÆ Selected {len(classifier_configs)} classifiers for the task")
+         return classifier_configs
+
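
For orientation, a minimal usage sketch of the decision system added in this release. Only AgentClassifierDecisionSystem and its smart_classifier_selection entry point appear in the diff above; the ClassifierMarketplace constructor arguments below are assumptions, not something this diff shows.

import asyncio

from wisent.core.agent.diagnose.classifier_marketplace import ClassifierMarketplace
from wisent.core.agent.diagnose.agent_classifier_decision import AgentClassifierDecisionSystem

async def main():
    # Assumption: the marketplace wraps a loaded model handle; its real constructor is not part of this diff.
    marketplace = ClassifierMarketplace(model=None)
    system = AgentClassifierDecisionSystem(marketplace)

    # One-stop selection: analyze the prompt, decide per benchmark, then build or reuse classifiers.
    configs = await system.smart_classifier_selection(
        prompt="Explain why the sky appears blue.",
        quality_threshold=0.3,
        time_budget_minutes=10.0,
        max_classifiers=1,
    )
    print(configs)

asyncio.run(main())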