wisent 0.1.1__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of wisent might be problematic.

Files changed (237)
  1. wisent/__init__.py +1 -8
  2. wisent/benchmarks/__init__.py +0 -0
  3. wisent/benchmarks/coding/__init__.py +0 -0
  4. wisent/benchmarks/coding/metrics/__init__.py +0 -0
  5. wisent/benchmarks/coding/metrics/core/__init__.py +0 -0
  6. wisent/benchmarks/coding/metrics/core/atoms.py +36 -0
  7. wisent/benchmarks/coding/metrics/evaluator.py +275 -0
  8. wisent/benchmarks/coding/metrics/passk.py +66 -0
  9. wisent/benchmarks/coding/output_sanitizer/__init__.py +0 -0
  10. wisent/benchmarks/coding/output_sanitizer/core/__init__.py +0 -0
  11. wisent/benchmarks/coding/output_sanitizer/core/atoms.py +27 -0
  12. wisent/benchmarks/coding/output_sanitizer/cpp_sanitizer.py +62 -0
  13. wisent/benchmarks/coding/output_sanitizer/java_sanitizer.py +78 -0
  14. wisent/benchmarks/coding/output_sanitizer/python_sanitizer.py +94 -0
  15. wisent/benchmarks/coding/output_sanitizer/utils.py +107 -0
  16. wisent/benchmarks/coding/providers/__init__.py +18 -0
  17. wisent/benchmarks/coding/providers/core/__init__.py +0 -0
  18. wisent/benchmarks/coding/providers/core/atoms.py +31 -0
  19. wisent/benchmarks/coding/providers/livecodebench/__init__.py +0 -0
  20. wisent/benchmarks/coding/providers/livecodebench/provider.py +53 -0
  21. wisent/benchmarks/coding/safe_docker/__init__.py +0 -0
  22. wisent/benchmarks/coding/safe_docker/core/__init__.py +0 -0
  23. wisent/benchmarks/coding/safe_docker/core/atoms.py +105 -0
  24. wisent/benchmarks/coding/safe_docker/core/runtime.py +118 -0
  25. wisent/benchmarks/coding/safe_docker/entrypoint.py +123 -0
  26. wisent/benchmarks/coding/safe_docker/recipes.py +60 -0
  27. wisent/classifiers/__init__.py +0 -0
  28. wisent/classifiers/core/__init__.py +0 -0
  29. wisent/classifiers/core/atoms.py +747 -0
  30. wisent/classifiers/models/__init__.py +0 -0
  31. wisent/classifiers/models/logistic.py +29 -0
  32. wisent/classifiers/models/mlp.py +47 -0
  33. wisent/cli/__init__.py +0 -0
  34. wisent/cli/classifiers/__init__.py +0 -0
  35. wisent/cli/classifiers/classifier_rotator.py +137 -0
  36. wisent/cli/cli_logger.py +142 -0
  37. wisent/cli/data_loaders/__init__.py +0 -0
  38. wisent/cli/data_loaders/data_loader_rotator.py +96 -0
  39. wisent/cli/evaluators/__init__.py +0 -0
  40. wisent/cli/evaluators/evaluator_rotator.py +148 -0
  41. wisent/cli/steering_methods/__init__.py +0 -0
  42. wisent/cli/steering_methods/steering_rotator.py +110 -0
  43. wisent/cli/wisent_cli/__init__.py +0 -0
  44. wisent/cli/wisent_cli/commands/__init__.py +0 -0
  45. wisent/cli/wisent_cli/commands/help_cmd.py +52 -0
  46. wisent/cli/wisent_cli/commands/listing.py +154 -0
  47. wisent/cli/wisent_cli/commands/train_cmd.py +322 -0
  48. wisent/cli/wisent_cli/main.py +93 -0
  49. wisent/cli/wisent_cli/shell.py +80 -0
  50. wisent/cli/wisent_cli/ui.py +69 -0
  51. wisent/cli/wisent_cli/util/__init__.py +0 -0
  52. wisent/cli/wisent_cli/util/aggregations.py +43 -0
  53. wisent/cli/wisent_cli/util/parsing.py +126 -0
  54. wisent/cli/wisent_cli/version.py +4 -0
  55. wisent/core/__init__.py +27 -0
  56. wisent/core/activations/__init__.py +0 -0
  57. wisent/core/activations/activations_collector.py +338 -0
  58. wisent/core/activations/core/__init__.py +0 -0
  59. wisent/core/activations/core/atoms.py +216 -0
  60. wisent/core/agent/__init__.py +18 -0
  61. wisent/core/agent/budget.py +638 -0
  62. wisent/core/agent/device_benchmarks.py +685 -0
  63. wisent/core/agent/diagnose/__init__.py +55 -0
  64. wisent/core/agent/diagnose/agent_classifier_decision.py +641 -0
  65. wisent/core/agent/diagnose/classifier_marketplace.py +554 -0
  66. wisent/core/agent/diagnose/create_classifier.py +1154 -0
  67. wisent/core/agent/diagnose/response_diagnostics.py +268 -0
  68. wisent/core/agent/diagnose/select_classifiers.py +506 -0
  69. wisent/core/agent/diagnose/synthetic_classifier_option.py +754 -0
  70. wisent/core/agent/diagnose/tasks/__init__.py +33 -0
  71. wisent/core/agent/diagnose/tasks/task_manager.py +1456 -0
  72. wisent/core/agent/diagnose/tasks/task_relevance.py +94 -0
  73. wisent/core/agent/diagnose/tasks/task_selector.py +151 -0
  74. wisent/core/agent/diagnose/test_synthetic_classifier.py +71 -0
  75. wisent/core/agent/diagnose.py +242 -0
  76. wisent/core/agent/steer.py +212 -0
  77. wisent/core/agent/timeout.py +134 -0
  78. wisent/core/autonomous_agent.py +1234 -0
  79. wisent/core/bigcode_integration.py +583 -0
  80. wisent/core/contrastive_pairs/__init__.py +15 -0
  81. wisent/core/contrastive_pairs/core/__init__.py +0 -0
  82. wisent/core/contrastive_pairs/core/atoms.py +45 -0
  83. wisent/core/contrastive_pairs/core/buliders.py +59 -0
  84. wisent/core/contrastive_pairs/core/pair.py +178 -0
  85. wisent/core/contrastive_pairs/core/response.py +152 -0
  86. wisent/core/contrastive_pairs/core/serialization.py +300 -0
  87. wisent/core/contrastive_pairs/core/set.py +133 -0
  88. wisent/core/contrastive_pairs/diagnostics/__init__.py +45 -0
  89. wisent/core/contrastive_pairs/diagnostics/activations.py +53 -0
  90. wisent/core/contrastive_pairs/diagnostics/base.py +73 -0
  91. wisent/core/contrastive_pairs/diagnostics/control_vectors.py +169 -0
  92. wisent/core/contrastive_pairs/diagnostics/coverage.py +79 -0
  93. wisent/core/contrastive_pairs/diagnostics/divergence.py +98 -0
  94. wisent/core/contrastive_pairs/diagnostics/duplicates.py +116 -0
  95. wisent/core/contrastive_pairs/lm_eval_pairs/__init__.py +0 -0
  96. wisent/core/contrastive_pairs/lm_eval_pairs/atoms.py +238 -0
  97. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_manifest.py +8 -0
  98. wisent/core/contrastive_pairs/lm_eval_pairs/lm_extractor_registry.py +132 -0
  99. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/__init__.py +0 -0
  100. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_extractors/winogrande.py +115 -0
  101. wisent/core/contrastive_pairs/lm_eval_pairs/lm_task_pairs_generation.py +50 -0
  102. wisent/core/data_loaders/__init__.py +0 -0
  103. wisent/core/data_loaders/core/__init__.py +0 -0
  104. wisent/core/data_loaders/core/atoms.py +98 -0
  105. wisent/core/data_loaders/loaders/__init__.py +0 -0
  106. wisent/core/data_loaders/loaders/custom.py +120 -0
  107. wisent/core/data_loaders/loaders/lm_loader.py +218 -0
  108. wisent/core/detection_handling.py +257 -0
  109. wisent/core/download_full_benchmarks.py +1386 -0
  110. wisent/core/evaluators/__init__.py +0 -0
  111. wisent/core/evaluators/oracles/__init__.py +0 -0
  112. wisent/core/evaluators/oracles/interactive.py +73 -0
  113. wisent/core/evaluators/oracles/nlp_evaluator.py +440 -0
  114. wisent/core/evaluators/oracles/user_specified.py +67 -0
  115. wisent/core/hyperparameter_optimizer.py +429 -0
  116. wisent/core/lm_eval_harness_ground_truth.py +1396 -0
  117. wisent/core/log_likelihoods_evaluator.py +321 -0
  118. wisent/core/managed_cached_benchmarks.py +595 -0
  119. wisent/core/mixed_benchmark_sampler.py +364 -0
  120. wisent/core/model_config_manager.py +330 -0
  121. wisent/core/model_persistence.py +317 -0
  122. wisent/core/models/__init__.py +0 -0
  123. wisent/core/models/core/__init__.py +0 -0
  124. wisent/core/models/core/atoms.py +460 -0
  125. wisent/core/models/wisent_model.py +727 -0
  126. wisent/core/multi_steering.py +316 -0
  127. wisent/core/optuna/__init__.py +57 -0
  128. wisent/core/optuna/classifier/__init__.py +25 -0
  129. wisent/core/optuna/classifier/activation_generator.py +349 -0
  130. wisent/core/optuna/classifier/classifier_cache.py +509 -0
  131. wisent/core/optuna/classifier/optuna_classifier_optimizer.py +606 -0
  132. wisent/core/optuna/steering/__init__.py +0 -0
  133. wisent/core/optuna/steering/bigcode_evaluator_wrapper.py +188 -0
  134. wisent/core/optuna/steering/data_utils.py +342 -0
  135. wisent/core/optuna/steering/metrics.py +474 -0
  136. wisent/core/optuna/steering/optuna_pipeline.py +1738 -0
  137. wisent/core/optuna/steering/steering_optimization.py +1111 -0
  138. wisent/core/parser.py +1668 -0
  139. wisent/core/prompts/__init__.py +0 -0
  140. wisent/core/prompts/core/__init__.py +0 -0
  141. wisent/core/prompts/core/atom.py +57 -0
  142. wisent/core/prompts/core/prompt_formater.py +157 -0
  143. wisent/core/prompts/prompt_stratiegies/__init__.py +0 -0
  144. wisent/core/prompts/prompt_stratiegies/direct_completion.py +24 -0
  145. wisent/core/prompts/prompt_stratiegies/instruction_following.py +24 -0
  146. wisent/core/prompts/prompt_stratiegies/multiple_choice.py +29 -0
  147. wisent/core/prompts/prompt_stratiegies/role_playing.py +31 -0
  148. wisent/core/representation.py +5 -0
  149. wisent/core/sample_size_optimizer.py +648 -0
  150. wisent/core/sample_size_optimizer_v2.py +355 -0
  151. wisent/core/save_results.py +277 -0
  152. wisent/core/steering.py +652 -0
  153. wisent/core/steering_method.py +26 -0
  154. wisent/core/steering_methods/__init__.py +0 -0
  155. wisent/core/steering_methods/core/__init__.py +0 -0
  156. wisent/core/steering_methods/core/atoms.py +153 -0
  157. wisent/core/steering_methods/methods/__init__.py +0 -0
  158. wisent/core/steering_methods/methods/caa.py +44 -0
  159. wisent/core/steering_optimizer.py +1297 -0
  160. wisent/core/task_interface.py +132 -0
  161. wisent/core/task_selector.py +189 -0
  162. wisent/core/tasks/__init__.py +175 -0
  163. wisent/core/tasks/aime_task.py +141 -0
  164. wisent/core/tasks/file_task.py +211 -0
  165. wisent/core/tasks/hle_task.py +180 -0
  166. wisent/core/tasks/hmmt_task.py +119 -0
  167. wisent/core/tasks/livecodebench_task.py +201 -0
  168. wisent/core/tasks/livemathbench_task.py +158 -0
  169. wisent/core/tasks/lm_eval_task.py +455 -0
  170. wisent/core/tasks/math500_task.py +84 -0
  171. wisent/core/tasks/polymath_task.py +146 -0
  172. wisent/core/tasks/supergpqa_task.py +220 -0
  173. wisent/core/time_estimator.py +149 -0
  174. wisent/core/timing_calibration.py +174 -0
  175. wisent/core/tracking/__init__.py +54 -0
  176. wisent/core/tracking/latency.py +618 -0
  177. wisent/core/tracking/memory.py +359 -0
  178. wisent/core/trainers/__init__.py +0 -0
  179. wisent/core/trainers/core/__init__.py +11 -0
  180. wisent/core/trainers/core/atoms.py +45 -0
  181. wisent/core/trainers/steering_trainer.py +271 -0
  182. wisent/core/user_model_config.py +158 -0
  183. wisent/opti/__init__.py +0 -0
  184. wisent/opti/core/__init__.py +0 -0
  185. wisent/opti/core/atoms.py +175 -0
  186. wisent/opti/methods/__init__.py +0 -0
  187. wisent/opti/methods/opti_classificator.py +172 -0
  188. wisent/opti/methods/opti_steering.py +138 -0
  189. wisent/synthetic/__init__.py +0 -0
  190. wisent/synthetic/cleaners/__init__.py +0 -0
  191. wisent/synthetic/cleaners/core/__init__.py +0 -0
  192. wisent/synthetic/cleaners/core/atoms.py +58 -0
  193. wisent/synthetic/cleaners/deduper_cleaner.py +53 -0
  194. wisent/synthetic/cleaners/methods/__init__.py +0 -0
  195. wisent/synthetic/cleaners/methods/base_dedupers.py +320 -0
  196. wisent/synthetic/cleaners/methods/base_refusalers.py +286 -0
  197. wisent/synthetic/cleaners/methods/core/__init__.py +0 -0
  198. wisent/synthetic/cleaners/methods/core/atoms.py +47 -0
  199. wisent/synthetic/cleaners/pairs_cleaner.py +90 -0
  200. wisent/synthetic/cleaners/refusaler_cleaner.py +133 -0
  201. wisent/synthetic/db_instructions/__init__.py +0 -0
  202. wisent/synthetic/db_instructions/core/__init__.py +0 -0
  203. wisent/synthetic/db_instructions/core/atoms.py +25 -0
  204. wisent/synthetic/db_instructions/mini_dp.py +37 -0
  205. wisent/synthetic/generators/__init__.py +0 -0
  206. wisent/synthetic/generators/core/__init__.py +0 -0
  207. wisent/synthetic/generators/core/atoms.py +73 -0
  208. wisent/synthetic/generators/diversities/__init__.py +0 -0
  209. wisent/synthetic/generators/diversities/core/__init__.py +0 -0
  210. wisent/synthetic/generators/diversities/core/core.py +68 -0
  211. wisent/synthetic/generators/diversities/methods/__init__.py +0 -0
  212. wisent/synthetic/generators/diversities/methods/fast_diversity.py +249 -0
  213. wisent/synthetic/generators/pairs_generator.py +179 -0
  214. wisent-0.5.1.dist-info/METADATA +67 -0
  215. wisent-0.5.1.dist-info/RECORD +218 -0
  216. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/WHEEL +1 -1
  217. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info/licenses}/LICENSE +2 -2
  218. wisent/activations/__init__.py +0 -9
  219. wisent/activations/client.py +0 -97
  220. wisent/activations/extractor.py +0 -251
  221. wisent/activations/models.py +0 -95
  222. wisent/client.py +0 -45
  223. wisent/control_vector/__init__.py +0 -9
  224. wisent/control_vector/client.py +0 -85
  225. wisent/control_vector/manager.py +0 -168
  226. wisent/control_vector/models.py +0 -70
  227. wisent/inference/__init__.py +0 -9
  228. wisent/inference/client.py +0 -103
  229. wisent/inference/inferencer.py +0 -250
  230. wisent/inference/models.py +0 -66
  231. wisent/utils/__init__.py +0 -3
  232. wisent/utils/auth.py +0 -30
  233. wisent/utils/http.py +0 -228
  234. wisent/version.py +0 -3
  235. wisent-0.1.1.dist-info/METADATA +0 -142
  236. wisent-0.1.1.dist-info/RECORD +0 -23
  237. {wisent-0.1.1.dist-info → wisent-0.5.1.dist-info}/top_level.txt +0 -0
wisent/core/autonomous_agent.py (new file)
@@ -0,0 +1,1234 @@
+ #!/usr/bin/env python3
+ """
+ Autonomous Wisent-Guard Agent
+
+ A model that can autonomously use wisent-guard capabilities on itself:
+ - Generate responses
+ - Analyze its own outputs for issues
+ - Auto-discover or create classifiers on demand
+ - Apply corrections to improve future responses
+ """
+
+ import asyncio
+ from typing import Any, Dict, List, Optional
+
+ from wisent_guard.core.activations import ActivationAggregationStrategy, Activations
+
+ from .agent.diagnose import AgentClassifierDecisionSystem, AnalysisResult, ClassifierMarketplace, ResponseDiagnostics
+ from .agent.steer import ImprovementResult, ResponseSteering
+ from .model import Model
+
+
+ class AutonomousAgent:
+     """
+     An autonomous agent that can generate responses, analyze them for issues,
+     and improve them using activation-based steering and correction techniques.
+
+     The agent now uses a marketplace-based system to intelligently select
+     classifiers based on task analysis, with no hardcoded requirements.
+     """
+
+     def __init__(
+         self,
+         model_name: str = "meta-llama/Llama-3.1-8B-Instruct",
+         layer_override: int = None,
+         enable_tracking: bool = True,
+         steering_method: str = "CAA",
+         steering_strength: float = 1.0,
+         steering_mode: bool = False,
+         normalization_method: str = "none",
+         target_norm: Optional[float] = None,
+         hpr_beta: float = 1.0,
+         dac_dynamic_control: bool = False,
+         dac_entropy_threshold: float = 1.0,
+         bipo_beta: float = 0.1,
+         bipo_learning_rate: float = 5e-4,
+         bipo_epochs: int = 100,
+         ksteering_num_labels: int = 6,
+         ksteering_hidden_dim: int = 512,
+         ksteering_learning_rate: float = 1e-3,
+         ksteering_classifier_epochs: int = 100,
+         ksteering_target_labels: str = "0",
+         ksteering_avoid_labels: str = "",
+         ksteering_alpha: float = 50.0,
+         # Priority-aware benchmark selection parameters
+         priority: str = "all",
+         fast_only: bool = False,
+         time_budget_minutes: float = None,
+         max_benchmarks: int = None,
+         smart_selection: bool = False,
+     ):
+         """
+         Initialize the autonomous agent.
+
+         Args:
+             model_name: Name of the model to use
+             layer_override: Layer override from CLI (None to use parameter file)
+             enable_tracking: Whether to track improvement history
+             steering_method: Steering method to use (CAA, HPR, DAC, BiPO, KSteering)
+             steering_strength: Strength of steering application
+             steering_mode: Whether to enable steering mode
+             priority: Priority level for benchmark selection ("all", "high", "medium", "low")
+             fast_only: Only use fast benchmarks (high priority)
+             time_budget_minutes: Time budget in minutes for benchmark selection
+             max_benchmarks: Maximum number of benchmarks to select
+             smart_selection: Use smart benchmark selection based on relevance and priority
+             prefer_fast: Prefer fast benchmarks in selection
+             (... other steering parameters ...)
+         """
+         self.model_name = model_name
+         self.model: Optional[Model] = None
+         self.layer_override = layer_override
+         self.enable_tracking = enable_tracking
+
+         # Load model parameters first
+         from .parameters import load_model_parameters
+
+         self.params = load_model_parameters(model_name, layer_override)
+
+         # Store steering parameters and load method-specific configs from parameter file
+         self.steering_method = steering_method
+         self.steering_strength = steering_strength
+         self.steering_mode = steering_mode
+         self.normalization_method = normalization_method
+         self.target_norm = target_norm
+
+         # Load method-specific parameters from parameter file, with CLI overrides
+         steering_config = self.params.get_steering_config(steering_method)
+
+         self.hpr_beta = hpr_beta if hpr_beta != 1.0 else steering_config.get("beta", 1.0)
+         self.dac_dynamic_control = (
+             dac_dynamic_control if dac_dynamic_control else steering_config.get("dynamic_control", False)
+         )
+         self.dac_entropy_threshold = (
+             dac_entropy_threshold if dac_entropy_threshold != 1.0 else steering_config.get("entropy_threshold", 1.0)
+         )
+         self.bipo_beta = bipo_beta if bipo_beta != 0.1 else steering_config.get("beta", 0.1)
+         self.bipo_learning_rate = (
+             bipo_learning_rate if bipo_learning_rate != 5e-4 else steering_config.get("learning_rate", 5e-4)
+         )
+         self.bipo_epochs = bipo_epochs if bipo_epochs != 100 else steering_config.get("num_epochs", 100)
+         self.ksteering_num_labels = (
+             ksteering_num_labels if ksteering_num_labels != 6 else steering_config.get("num_labels", 6)
+         )
+         self.ksteering_hidden_dim = (
+             ksteering_hidden_dim if ksteering_hidden_dim != 512 else steering_config.get("hidden_dim", 512)
+         )
+         self.ksteering_learning_rate = (
+             ksteering_learning_rate if ksteering_learning_rate != 1e-3 else steering_config.get("learning_rate", 1e-3)
+         )
+         self.ksteering_classifier_epochs = (
+             ksteering_classifier_epochs
+             if ksteering_classifier_epochs != 100
+             else steering_config.get("classifier_epochs", 100)
+         )
+         self.ksteering_target_labels = (
+             ksteering_target_labels
+             if ksteering_target_labels != "0"
+             else ",".join(map(str, steering_config.get("target_labels", [0])))
+         )
+         self.ksteering_avoid_labels = (
+             ksteering_avoid_labels
+             if ksteering_avoid_labels != ""
+             else ",".join(map(str, steering_config.get("avoid_labels", [])))
+         )
+         self.ksteering_alpha = ksteering_alpha if ksteering_alpha != 50.0 else steering_config.get("alpha", 50.0)
+
+         # Priority-aware benchmark selection parameters
+         self.priority = priority
+         self.fast_only = fast_only
+         self.time_budget_minutes = time_budget_minutes
+         self.max_benchmarks = max_benchmarks
+         self.smart_selection = smart_selection
+
+         # New marketplace-based system
+         self.marketplace: Optional[ClassifierMarketplace] = None
+         self.decision_system: Optional[AgentClassifierDecisionSystem] = None
+         self.diagnostics: Optional[ResponseDiagnostics] = None
+         self.steering: Optional[ResponseSteering] = None
+
+         # Tracking
+         self.improvement_history: List[ImprovementResult] = []
+         self.analysis_history: List[AnalysisResult] = []
+
+         print(f"šŸ¤– Autonomous Agent initialized with {model_name}")
+         print(" šŸŽÆ Using marketplace-based classifier selection")
+         print(f" šŸŽ›ļø Steering: {steering_method} (strength: {steering_strength})")
+         if steering_mode:
+             print(f" šŸ”§ Steering mode enabled with {normalization_method} normalization")
+         print(self.params.get_summary())
+
+     async def initialize(
+         self,
+         classifier_search_paths: Optional[List[str]] = None,
+         quality_threshold: float = 0.3,
+         default_time_budget_minutes: float = 10.0,
+     ):
+         """
+         Initialize the autonomous agent with intelligent classifier management.
+
+         Args:
+             classifier_search_paths: Paths to search for existing classifiers
+             quality_threshold: Minimum quality threshold for existing classifiers
+             default_time_budget_minutes: Default time budget for creating new classifiers
+         """
+         print("šŸš€ Initializing Autonomous Agent...")
+
+         # Load model
+         print(" šŸ“¦ Loading model...")
+         self.model = Model(self.model_name)
+
+         # Initialize marketplace
+         print(" šŸŖ Setting up classifier marketplace...")
+         self.marketplace = ClassifierMarketplace(model=self.model, search_paths=classifier_search_paths)
+
+         # Initialize decision system
+         print(" 🧠 Setting up intelligent decision system...")
+         self.decision_system = AgentClassifierDecisionSystem(self.marketplace)
+
+         # Store configuration
+         self.quality_threshold = quality_threshold
+         self.default_time_budget_minutes = default_time_budget_minutes
+
+         # Show marketplace summary
+         summary = self.marketplace.get_marketplace_summary()
+         print(summary)
+
+         print(" āœ… Autonomous Agent ready!")
+
+     async def respond_autonomously(
+         self,
+         prompt: str,
+         max_attempts: int = 3,
+         quality_threshold: float = None,
+         time_budget_minutes: float = None,
+         max_classifiers: int = None,
+     ) -> Dict[str, Any]:
+         """
+         Generate a response and autonomously improve it if needed.
+         The agent will intelligently select classifiers based on the prompt.
+
+         Args:
+             prompt: The prompt to respond to
+             max_attempts: Maximum improvement attempts
+             quality_threshold: Quality threshold for classifiers (uses default if None)
+             time_budget_minutes: Time budget for creating classifiers (uses default if None)
+             max_classifiers: Maximum classifiers to use (None = no limit)
+
+         Returns:
+             Dictionary with response and improvement details
+         """
+         print(f"\nšŸŽÆ AUTONOMOUS RESPONSE TO: {prompt[:100]}...")
+
+         # Use defaults if not specified
+         quality_threshold = quality_threshold or self.quality_threshold
+         time_budget_minutes = time_budget_minutes or self.default_time_budget_minutes
+
+         # Step 1: Intelligent classifier selection based on the prompt
+         print("\n🧠 Analyzing task and selecting classifiers...")
+         classifier_configs = await self.decision_system.smart_classifier_selection(
+             prompt=prompt,
+             quality_threshold=quality_threshold,
+             time_budget_minutes=time_budget_minutes,
+             max_classifiers=max_classifiers,
+         )
+
+         # Step 2: Initialize diagnostics and steering with selected classifiers
+         if classifier_configs:
+             print(f" šŸ“Š Initializing diagnostics with {len(classifier_configs)} classifiers")
+             self.diagnostics = ResponseDiagnostics(model=self.model, classifier_configs=classifier_configs)
+
+             self.steering = ResponseSteering(
+                 generate_response_func=self._generate_response, analyze_response_func=self.diagnostics.analyze_response
+             )
+         else:
+             print(" āš ļø No classifiers selected - proceeding without advanced diagnostics")
+             # Could fall back to basic text analysis or skip diagnostics
+             return {
+                 "final_response": await self._generate_response(prompt),
+                 "attempts": 1,
+                 "improvement_chain": [],
+                 "classifier_info": "No classifiers used",
+             }
+
+         # Step 3: Generate and improve response
+         attempt = 0
+         current_response = None
+         improvement_chain = []
+
+         while attempt < max_attempts:
+             attempt += 1
+             print(f"\n--- Attempt {attempt} ---")
+
+             # Generate response
+             if current_response is None:
+                 print("šŸ’­ Generating initial response...")
+                 current_response = await self._generate_response(prompt)
+                 print(f" Response: {current_response[:100]}...")
+
+             # Analyze response using selected classifiers
+             print("šŸ” Analyzing response...")
+             analysis = await self.diagnostics.analyze_response(current_response, prompt)
+
+             print(f" Issues found: {analysis.issues_found}")
+             print(f" Quality score: {analysis.quality_score:.2f}")
+             print(f" Confidence: {analysis.confidence:.2f}")
+
+             # Track analysis
+             if self.enable_tracking:
+                 self.analysis_history.append(analysis)
+
+             # Decide if improvement is needed
+             needs_improvement = self._decide_if_improvement_needed(analysis)
+
+             if not needs_improvement:
+                 print("āœ… Response quality acceptable, no improvement needed")
+                 break
+
+             # Attempt improvement
+             print("šŸ› ļø Attempting to improve response...")
+             improvement = await self.steering.improve_response(prompt, current_response, analysis)
+
+             if improvement.success:
+                 print(f" Improvement successful! Score: {improvement.improvement_score:.2f}")
+                 current_response = improvement.improved_response
+                 improvement_chain.append(improvement)
+
+                 if self.enable_tracking:
+                     self.improvement_history.append(improvement)
+             else:
+                 print(" Improvement failed, keeping original response")
+                 break
+
+         return {
+             "final_response": current_response,
+             "attempts": attempt,
+             "improvement_chain": improvement_chain,
+             "final_analysis": analysis,
+             "classifier_info": {
+                 "count": len(classifier_configs),
+                 "types": [c.get("issue_type", "unknown") for c in classifier_configs],
+                 "decision_summary": self.decision_system.get_decision_summary(),
+             },
+         }
+
+     async def _generate_response(self, prompt: str) -> str:
+         """Generate a response to the prompt with optional steering."""
+         if self.steering_mode:
+             # Use actual activation steering
+             print(f" šŸŽ›ļø Applying {self.steering_method} steering...")
+             try:
+                 # Use actual steering methods from steering_methods folder
+                 from ..inference import generate_with_classification_and_handling
+
+                 # Create steering method object based on configuration
+                 steering_method = self._create_steering_method()
+
+                 response, _, _, _ = generate_with_classification_and_handling(
+                     self.model,
+                     prompt,
+                     self.params.layer,
+                     max_new_tokens=200,
+                     steering_method=steering_method,
+                     token_aggregation="average",
+                     threshold=0.6,
+                     verbose=False,
+                     detection_handler=None,
+                 )
+                 return response
+
+             except Exception as e:
+                 print(f" āš ļø Steering failed, falling back to basic generation: {e}")
+                 # Fall through to basic generation
+
+         # Basic generation without steering
+         result = self.model.generate(prompt, self.params.layer, max_new_tokens=200)
+         # Handle both 2 and 3 return values
+         if isinstance(result, tuple) and len(result) == 3:
+             response, _, _ = result
+         elif isinstance(result, tuple) and len(result) == 2:
+             response, _ = result
+         else:
+             response = result
+         return response
+
+     def _create_steering_method(self):
+         """Create a steering method object based on configuration."""
+         # Import actual steering methods
+         from .steering_methods import CAA, DAC, HPR, BiPO, KSteering
+
+         # Create the appropriate steering method with parameters
+         if self.steering_method == "CAA":
+             steering_method = CAA(device=None)
+         elif self.steering_method == "HPR":
+             steering_method = HPR(device=None, beta=self.hpr_beta)
+         elif self.steering_method == "DAC":
+             steering_method = DAC(
+                 device=None, dynamic_control=self.dac_dynamic_control, entropy_threshold=self.dac_entropy_threshold
+             )
+         elif self.steering_method == "BiPO":
+             steering_method = BiPO(
+                 device=None, beta=self.bipo_beta, learning_rate=self.bipo_learning_rate, num_epochs=self.bipo_epochs
+             )
+         elif self.steering_method == "KSteering":
+             # Parse target and avoid labels
+             target_labels = [int(x.strip()) for x in self.ksteering_target_labels.split(",") if x.strip()]
+             avoid_labels = [int(x.strip()) for x in self.ksteering_avoid_labels.split(",") if x.strip()]
+
+             steering_method = KSteering(
+                 device=None,
+                 num_labels=self.ksteering_num_labels,
+                 hidden_dim=self.ksteering_hidden_dim,
+                 learning_rate=self.ksteering_learning_rate,
+                 classifier_epochs=self.ksteering_classifier_epochs,
+                 target_labels=target_labels,
+                 avoid_labels=avoid_labels,
+                 alpha=self.ksteering_alpha,
+             )
+         else:
+             # Default to CAA
+             steering_method = CAA(device=None)
+
+         return steering_method
+
+     async def evaluate_response_quality(
+         self, response: str, classifier, classifier_params: "ClassifierParams"
+     ) -> "QualityResult":
+         """
+         Evaluate response quality using classifier + model judgment for threshold determination.
+
+         Args:
+             response: The response to evaluate
+             classifier: The trained classifier to use
+             classifier_params: Parameters used for classifier training
+
+         Returns:
+             QualityResult with score and acceptability judgment
+         """
+         from .agent.diagnose.agent_classifier_decision import QualityResult
+
+         # Extract activations and get classifier score
+         activations_tensor = self.model.extract_activations(response, classifier_params.optimal_layer)
+         from .layer import Layer
+
+         activations = Activations(
+             tensor=activations_tensor,
+             layer=Layer(index=classifier_params.optimal_layer, type="transformer"),
+             aggregation_strategy=ActivationAggregationStrategy.LAST_TOKEN,
+         )
+
+         features = activations.extract_features_for_classifier()
+         features_cpu = features.cpu() if hasattr(features, "cpu") else features
+
+         # Get classifier prediction
+         prob_result = classifier.predict_proba([features_cpu.numpy()])
+         score = prob_result[0] if isinstance(prob_result, (list, tuple)) else prob_result
+
+         # Use model to determine if this score represents acceptable quality
+         acceptable = await self._model_determines_quality_threshold(response, float(score))
+
+         return QualityResult(
+             score=float(score),
+             acceptable=acceptable,
+             reasoning=f"Model judged score {score:.3f} as {'acceptable' if acceptable else 'unacceptable'}",
+         )
+
+     async def _model_determines_quality_threshold(self, response: str, classifier_score: float) -> bool:
+         """
+         Use model to determine if classifier score represents acceptable quality.
+
+         Args:
+             response: The response to evaluate
+             classifier_score: Score from the classifier (0-1, where 1 typically means more problematic)
+
+         Returns:
+             Boolean indicating if quality is acceptable
+         """
+         threshold_prompt = f"""
+ Evaluate if this response quality is acceptable for a user:
+
+ Response: "{response}"
+
+ Quality Classifier Score: {classifier_score:.3f}
+ (Note: Score ranges 0-1, where 0 = high quality, 1 = low quality/problematic)
+
+ Consider:
+ - Is the response helpful and informative?
+ - Are there factual errors or concerning content?
+ - Is the response appropriate length and detail?
+ - Would a typical user be satisfied with this response?
+
+ Respond ONLY with:
+ "ACCEPTABLE" if the response quality is good enough for the user
+ "UNACCEPTABLE" if the response needs improvement
+ """
+
+         # Generate model judgment
+         result = self.model.generate(threshold_prompt, layer_index=15, max_new_tokens=20)
+         judgment = result[0] if isinstance(result, tuple) else result
+         judgment = judgment.strip().upper()
+
+         return "ACCEPTABLE" in judgment
+
+     async def _determine_classifier_parameters(self, prompt: str, benchmark_names: List[str]) -> "ClassifierParams":
+         """
+         Use model to determine optimal classifier parameters based on prompt analysis.
+
+         Args:
+             prompt: The user prompt to analyze
+             benchmark_names: Selected benchmarks for training
+
+         Returns:
+             ClassifierParams with model-determined parameters
+         """
+
+         parameter_prompt = f"""
+ Analyze this prompt and determine optimal classifier parameters:
+
+ Prompt: "{prompt}"
+ Selected Benchmarks: {benchmark_names}
+
+ Consider:
+ - Prompt complexity (simple conversational vs complex technical)
+ - Domain type (technical/casual/creative/factual)
+ - Expected response length and detail needs
+ - Quality requirements and safety considerations
+
+ Determine optimal parameters:
+ 1. Optimal Layer (8-20): What layer captures the right semantic complexity?
+ - Simple prompts: layers 8-12
+ - Medium complexity: layers 12-16
+ - Complex technical: layers 16-20
+
+ 2. Classification Threshold (0.1-0.9): How strict should quality detection be?
+ - Lenient (casual conversation): 0.1-0.3
+ - Moderate (general use): 0.4-0.6
+ - Strict (important/technical): 0.7-0.9
+
+ 3. Training Samples (10-50): How many samples needed for good training?
+ - Simple patterns: 10-20 samples
+ - Medium complexity: 20-35 samples
+ - Complex patterns: 35-50 samples
+
+ 4. Classifier Type: What classifier works best for this data?
+ - logistic: Simple patterns, fast training
+ - svm: Medium complexity, robust
+ - neural: Complex patterns, more data needed
+
+ Format your response as:
+ LAYER: [number]
+ THRESHOLD: [number]
+ SAMPLES: [number]
+ TYPE: [logistic/svm/neural]
+ REASONING: [one sentence explanation]
+ """
+
+         # Generate model response
+         result = self.model.generate(parameter_prompt, layer_index=15, max_new_tokens=150)
+         response = result[0] if isinstance(result, tuple) else result
+
+         # Parse the response
+         return self._parse_classifier_params(response)
+
+     def _parse_classifier_params(self, response: str) -> "ClassifierParams":
+         """Parse model response to extract classifier parameters."""
+         from .agent.diagnose.agent_classifier_decision import ClassifierParams
+
+         # Default values in case parsing fails
+         layer = 15
+         threshold = 0.5
+         samples = 25
+         classifier_type = "logistic"
+         reasoning = "Using default parameters due to parsing failure"
+
+         try:
+             lines = response.strip().split("\n")
+             for line in lines:
+                 line = line.strip()
+                 if line.startswith("LAYER:"):
+                     layer = int(line.split(":")[1].strip())
+                 elif line.startswith("THRESHOLD:"):
+                     threshold = float(line.split(":")[1].strip())
+                 elif line.startswith("SAMPLES:"):
+                     samples = int(line.split(":")[1].strip())
+                 elif line.startswith("TYPE:"):
+                     classifier_type = line.split(":")[1].strip().lower()
+                 elif line.startswith("REASONING:"):
+                     reasoning = line.split(":", 1)[1].strip()
+         except Exception as e:
+             print(f" āš ļø Failed to parse classifier parameters: {e}")
+             print(
+                 f" šŸ“‹ Using defaults: layer={layer}, threshold={threshold}, samples={samples}, type={classifier_type}"
+             )
+
+         # Validate ranges
+         layer = max(8, min(20, layer))
+         threshold = max(0.1, min(0.9, threshold))
+         samples = max(10, min(50, samples))
+         if classifier_type not in ["logistic", "svm", "neural"]:
+             classifier_type = "logistic"
+
+         return ClassifierParams(
+             optimal_layer=layer,
+             classification_threshold=threshold,
+             training_samples=samples,
+             classifier_type=classifier_type,
+             reasoning=reasoning,
+             model_name=self.model_name,
+             aggregation_method="last_token",  # Default for model-determined params
+             token_aggregation="average",  # Default for model-determined params
+             num_epochs=50,
+             batch_size=32,
+             learning_rate=0.001,
+             early_stopping_patience=10,
+             hidden_dim=128,
+         )
+
+     async def _determine_steering_parameters(
+         self, prompt: str, current_quality: float, attempt_number: int
+     ) -> "SteeringParams":
+         """
+         Use model to determine optimal steering parameters based on current quality and prompt.
+
+         Args:
+             prompt: The original user prompt
+             current_quality: Current quality score from classifier
+             attempt_number: Which attempt this is (1, 2, 3...)
+
+         Returns:
+             SteeringParams with model-determined parameters
+         """
+
+         steering_prompt = f"""
+ Determine optimal steering parameters for improving this response:
+
+ Original Prompt: "{prompt}"
+ Current Quality Score: {current_quality:.3f} (0=good, 1=bad)
+ Attempt Number: {attempt_number}
+
+ Available Steering Methods:
+ - CAA: Gentle activation steering, good for general improvements
+ - HPR: Precise harmfulness reduction, good for safety issues
+ - DAC: Dynamic adaptive control, good for complex patterns
+ - BiPO: Bidirectional preference optimization, good for quality/preference
+ - KSteering: K-label steering, good for specific categorization issues
+
+ Consider:
+ - How much improvement is needed? (quality gap: {1.0 - current_quality:.2f})
+ - What type of improvement? (accuracy/safety/coherence/detail)
+ - Should we be more aggressive since this is attempt #{attempt_number}?
+ - Prompt characteristics (technical/casual/creative/safety-sensitive)
+
+ Determine parameters:
+ 1. Steering Method: Which method fits best?
+ 2. Initial Strength (0.1-2.0): How aggressive to start?
+ 3. Increment (0.1-0.5): How much to increase if this fails?
+ 4. Maximum Strength (0.5-3.0): Upper limit to prevent over-steering?
+
+ Format response as:
+ METHOD: [CAA/HPR/DAC/BiPO/KSteering]
+ INITIAL: [number]
+ INCREMENT: [number]
+ MAXIMUM: [number]
+ REASONING: [one sentence explanation]
+ """
+
+         # Generate model response
+         result = self.model.generate(steering_prompt, layer_index=15, max_new_tokens=150)
+         response = result[0] if isinstance(result, tuple) else result
+
+         # Parse the response
+         return self._parse_steering_params(response)
+
+     def _parse_steering_params(self, response: str) -> "SteeringParams":
+         """Parse model response to extract steering parameters."""
+         from .agent.diagnose.agent_classifier_decision import SteeringParams
+
+         # Default values
+         method = "CAA"
+         initial = 0.5
+         increment = 0.2
+         maximum = 1.5
+         reasoning = "Using default parameters due to parsing failure"
+
+         try:
+             lines = response.strip().split("\n")
+             for line in lines:
+                 line = line.strip()
+                 if line.startswith("METHOD:"):
+                     method = line.split(":")[1].strip()
+                 elif line.startswith("INITIAL:"):
+                     initial = float(line.split(":")[1].strip())
+                 elif line.startswith("INCREMENT:"):
+                     increment = float(line.split(":")[1].strip())
+                 elif line.startswith("MAXIMUM:"):
+                     maximum = float(line.split(":")[1].strip())
+                 elif line.startswith("REASONING:"):
+                     reasoning = line.split(":", 1)[1].strip()
+         except Exception as e:
+             print(f" āš ļø Failed to parse steering parameters: {e}")
+             print(f" šŸ“‹ Using defaults: method={method}, initial={initial}, increment={increment}, max={maximum}")
+
+         # Validate ranges and values
+         if method not in ["CAA", "HPR", "DAC", "BiPO", "KSteering"]:
+             method = "CAA"
+         initial = max(0.1, min(2.0, initial))
+         increment = max(0.1, min(0.5, increment))
+         maximum = max(0.5, min(3.0, maximum))
+
+         return SteeringParams(
+             steering_method=method,
+             initial_strength=initial,
+             increment=increment,
+             maximum_strength=maximum,
+             method_specific_params={},  # Can be expanded later
+             reasoning=reasoning,
+         )
+
+     async def _get_or_determine_classifier_parameters(
+         self, prompt: str, benchmark_names: List[str]
+     ) -> "ClassifierParams":
+         """
+         Get classifier parameters from memory or determine them fresh.
+
+         Args:
+             prompt: The user prompt to analyze
+             benchmark_names: Selected benchmarks for training
+
+         Returns:
+             ClassifierParams from memory or freshly determined
+         """
+         # Step 1: Try to get from parameter memory
+         stored_params = self._get_stored_classifier_parameters(prompt)
+
+         if stored_params:
+             stored_params.reasoning = f"Retrieved from parameter memory: {stored_params.reasoning}"
+             print(f" šŸ“š Using stored parameters (success rate: {getattr(stored_params, 'success_rate', 0.0):.2%})")
+             return stored_params
+
+         # Step 2: Fall back to model determination
+         print(" 🧠 No stored parameters found, using model determination...")
+         fresh_params = await self._determine_classifier_parameters(prompt, benchmark_names)
+         fresh_params.reasoning = f"Model-determined: {fresh_params.reasoning}"
+
+         return fresh_params
+
+     async def _get_or_determine_steering_parameters(
+         self, prompt: str, current_quality: float, attempt_number: int
+     ) -> "SteeringParams":
+         """
+         Get steering parameters from memory or determine them fresh.
+
+         Args:
+             prompt: The original user prompt
+             current_quality: Current quality score from classifier
+             attempt_number: Which attempt this is (1, 2, 3...)
+
+         Returns:
+             SteeringParams from memory or freshly determined
+         """
+         # Step 1: Try to get from parameter memory
+         stored_params = self._get_stored_steering_parameters(prompt, attempt_number)
+
+         if stored_params:
+             # Adjust strength based on attempt number and current quality
+             adjusted_strength = stored_params.initial_strength + (stored_params.increment * (attempt_number - 1))
+             adjusted_strength = min(adjusted_strength, stored_params.maximum_strength)
+
+             stored_params.initial_strength = adjusted_strength
+             stored_params.reasoning = f"Retrieved from memory (adjusted): {stored_params.reasoning}"
+             print(
+                 f" šŸ“š Using stored steering parameters (success rate: {getattr(stored_params, 'success_rate', 0.0):.2%})"
+             )
+             return stored_params
+
+         # Step 2: Fall back to model determination
+         print(" 🧠 No stored steering parameters found, using model determination...")
+         fresh_params = await self._determine_steering_parameters(prompt, current_quality, attempt_number)
+         fresh_params.reasoning = f"Model-determined: {fresh_params.reasoning}"
+
+         return fresh_params
+
+     def _get_stored_classifier_parameters(self, prompt: str) -> "ClassifierParams":
+         """
+         Retrieve classifier parameters from the parameter file.
+
+         Args:
+             prompt: The user prompt (not used in simplified version)
+
+         Returns:
+             ClassifierParams from parameter file, None if not found
+         """
+         from .agent.diagnose.agent_classifier_decision import ClassifierParams
+
+         try:
+             # Get classifier config from parameters
+             classifier_config = self.params._params.get("classifier", {})
+
+             if not classifier_config:
+                 return None
+
+             # Create ClassifierParams from stored data
+             params = ClassifierParams(
+                 optimal_layer=classifier_config.get("layer", 15),
+                 classification_threshold=classifier_config.get("threshold", 0.5),
+                 training_samples=classifier_config.get("samples", 25),
+                 classifier_type=classifier_config.get("type", "logistic"),
+                 reasoning="Using parameters from configuration file",
+                 model_name=self.model_name,
+             )
+
+             # Store additional classifier parameters for later use
+             params.aggregation_method = classifier_config.get("aggregation_method", "last_token")
+             params.token_aggregation = classifier_config.get("token_aggregation", "average")
+             params.num_epochs = classifier_config.get("num_epochs", 50)
+             params.batch_size = classifier_config.get("batch_size", 32)
+             params.learning_rate = classifier_config.get("learning_rate", 0.001)
+             params.early_stopping_patience = classifier_config.get("early_stopping_patience", 10)
+             params.hidden_dim = classifier_config.get("hidden_dim", 128)
+
+             return params
+
+         except Exception as e:
+             print(f" āš ļø Failed to retrieve stored parameters: {e}")
+             return None
+
+     def _get_stored_steering_parameters(self, prompt: str, attempt_number: int) -> "SteeringParams":
+         """
+         For now, return None to always use model determination for steering parameters.
+
+         Args:
+             prompt: The user prompt
+             attempt_number: Current attempt number
+
+         Returns:
+             None (always use model determination)
+         """
+         # For simplicity, always use model determination for steering parameters
+         return None
+
+     def _classify_prompt_type(self, prompt: str) -> str:
+         """
+         Classify the prompt into a known type for parameter retrieval.
+
+         Args:
+             prompt: The user prompt to classify
+
+         Returns:
+             Prompt type string or None if no match
+         """
+         try:
+             # Get quality control config from parameters
+             quality_config = self.params.config.get("quality_control", {})
+             prompt_classification = quality_config.get("prompt_classification", {})
+
+             # Convert prompt to lowercase for matching
+             prompt_lower = prompt.lower()
+
+             # Score each prompt type based on keyword matches
+             scores = {}
+             for prompt_type, keywords in prompt_classification.items():
+                 score = sum(1 for keyword in keywords if keyword.lower() in prompt_lower)
+                 if score > 0:
+                     scores[prompt_type] = score
+
+             # Return the type with highest score, if any
+             if scores:
+                 best_type = max(scores.keys(), key=lambda x: scores[x])
+                 print(f" šŸ·ļø Classified as '{best_type}' (score: {scores[best_type]})")
+                 return best_type
+
+             return None
+
+         except Exception as e:
+             print(f" āš ļø Failed to classify prompt type: {e}")
+             return None
+
+     def _store_successful_parameters(
+         self,
+         prompt: str,
+         classifier_params: "ClassifierParams",
+         steering_params: "SteeringParams",
+         final_quality: float,
+     ):
+         """
+         Store successful parameter combinations for future use.
+
+         Args:
+             prompt: The user prompt that was processed
+             classifier_params: The classifier parameters that worked
+             steering_params: The steering parameters that worked (if any)
+             final_quality: The final quality score achieved
+         """
+         try:
+             # Only store if quality is acceptable (>= 0.7)
+             if final_quality < 0.7:
+                 return
+
+             prompt_type = self._classify_prompt_type(prompt)
+             if not prompt_type:
+                 print(" šŸ’¾ Could not classify prompt for storage")
+                 return
+
+             print(f" šŸ’¾ Storing successful parameters for '{prompt_type}' (quality: {final_quality:.3f})")
+
+             # This would update the parameter file
+             # Implementation would involve updating the JSON file with new averages
+             # For now, just log that we would store it
+             print(" šŸ“ Would update parameter file with successful combination")
+
+         except Exception as e:
+             print(f" āš ļø Failed to store parameters: {e}")
+
+     async def respond_with_quality_control(
+         self, prompt: str, max_attempts: int = 5, time_budget_minutes: float = None
+     ) -> "QualityControlledResponse":
+         """
+         Generate response with iterative quality control and adaptive steering.
+
+         This is the new main method that implements the complete quality control flow:
+         1. Analyze prompt and determine classifier parameters
+         2. Train single combined classifier on relevant benchmarks
+         3. Generate initial response without steering
+         4. Iteratively improve using model-determined steering until acceptable
+
+         Args:
+             prompt: The user prompt to respond to
+             max_attempts: Maximum attempts to achieve acceptable quality
+             time_budget_minutes: Time budget for classifier creation
+
+         Returns:
+             QualityControlledResponse with final response and complete metadata
+         """
+         import time
+
+         from .agent.diagnose.agent_classifier_decision import QualityControlledResponse
+         from .agent.timeout import TimeoutError, timeout_context
+
+         start_time = time.time()
+         time_budget = time_budget_minutes or self.default_time_budget_minutes
+
+         print(f"\nšŸŽÆ QUALITY-CONTROLLED RESPONSE TO: {prompt[:100]}...")
+         print(f"ā° Hard timeout enforced: {time_budget:.1f} minutes")
+
+         try:
+             async with timeout_context(time_budget) as timeout_mgr:
+                 return await self._respond_with_quality_control_impl(
+                     prompt, max_attempts, time_budget, timeout_mgr, start_time
+                 )
+         except TimeoutError as e:
+             print(f"ā° OPERATION TIMED OUT: {e}")
+             print(f" Elapsed: {e.elapsed_time:.1f}s / Budget: {e.budget_time:.1f}s")
+
+             # Return partial result with timeout indication
+             return QualityControlledResponse(
+                 response_text=f"[TIMEOUT] Operation exceeded {time_budget:.1f}min budget. Partial response may be available.",
+                 final_quality_score=0.0,
+                 attempts_needed=0,
+                 classifier_params_used=None,
+                 total_time_seconds=e.elapsed_time,
+             )
+
+     async def _respond_with_quality_control_impl(
+         self, prompt: str, max_attempts: int, time_budget: float, timeout_mgr, start_time: float
+     ) -> "QualityControlledResponse":
+         """Implementation of quality control with timeout checking."""
+         from .agent.diagnose.agent_classifier_decision import QualityControlledResponse
+
+         # Step 1: Analyze prompt and select relevant benchmarks
+         print("\nšŸ“Š Step 1: Analyzing task and selecting benchmarks...")
+         timeout_mgr.check_timeout()
+
+         task_analysis = self.decision_system.analyze_task_requirements(
+             prompt,
+             priority=self.priority,
+             fast_only=self.fast_only,
+             time_budget_minutes=self.time_budget_minutes or time_budget,
+             max_benchmarks=self.max_benchmarks or 1,
+         )
+
+         # Check timeout after benchmark selection
+         timeout_mgr.check_timeout()
+         benchmark_names = [b["benchmark"] for b in task_analysis.relevant_benchmarks]
+         print(f" šŸŽÆ Selected benchmarks: {benchmark_names}")
+         print(f" ā° Remaining time: {timeout_mgr.get_remaining_time():.1f}s")
+
+         # Step 2: Determine optimal classifier parameters (with memory)
+         print("\n🧠 Step 2: Determining optimal classifier parameters...")
+         timeout_mgr.check_timeout()
+
+         classifier_params = await self._get_or_determine_classifier_parameters(prompt, benchmark_names)
+         print(
+             f" šŸ“‹ Parameters: Layer {classifier_params.optimal_layer}, "
+             f"Threshold {classifier_params.classification_threshold}, "
+             f"{classifier_params.training_samples} samples, "
+             f"{classifier_params.classifier_type} classifier"
+         )
+         print(f" šŸ’­ Reasoning: {classifier_params.reasoning}")
+         print(f" ā° Remaining time: {timeout_mgr.get_remaining_time():.1f}s")
+
+         # Step 3: Create single combined classifier
+         print("\nšŸ—ļø Step 3: Training combined classifier...")
+         timeout_mgr.check_timeout()
+
+         # Adjust classifier time budget based on remaining time
+         remaining_minutes = timeout_mgr.get_remaining_time() / 60.0
+         classifier_time_budget = min(time_budget, remaining_minutes)
+
+         classifier_decision = await self.decision_system.create_single_quality_classifier(
+             task_analysis, classifier_params, time_budget_minutes=classifier_time_budget
+         )
+
+         if classifier_decision.action == "skip":
+             print(f" ā¹ļø Skipping classifier creation: {classifier_decision.reasoning}")
+             # Fall back to basic generation
+             response = await self._generate_response(prompt)
+             return QualityControlledResponse(
+                 response_text=response,
+                 final_quality_score=0.5,  # Unknown quality
+                 attempts_needed=1,
+                 classifier_params_used=classifier_params,
+                 total_time_seconds=time.time() - start_time,
+             )
+
+         classifier = await self.decision_system.execute_single_classifier_decision(
+             classifier_decision, classifier_params
+         )
+         print(f" ā° Remaining time: {timeout_mgr.get_remaining_time():.1f}s")
+
+         if classifier is None:
+             print(" āŒ Failed to create classifier, falling back to basic generation")
+             response = await self._generate_response(prompt)
+             return QualityControlledResponse(
+                 response_text=response,
+                 final_quality_score=0.5,  # Unknown quality
+                 attempts_needed=1,
+                 classifier_params_used=classifier_params,
+                 total_time_seconds=time.time() - start_time,
+             )
+
+         # Step 4: Generate initial response (no steering)
+         print("\nšŸ“ Step 4: Generating initial response...")
+         timeout_mgr.check_timeout()
+
+         current_response = await self._generate_response(prompt)
+         print(f" Initial response: {current_response[:100]}...")
+         print(f" ā° Remaining time: {timeout_mgr.get_remaining_time():.1f}s")
+
+         # Step 5: Iterative quality improvement loop
+         print("\nšŸ”„ Step 5: Quality improvement loop...")
+         quality_progression = []
+         steering_params_used = None
+
+         for attempt in range(1, max_attempts + 1):
+             print(f"\n--- Attempt {attempt}/{max_attempts} ---")
+             timeout_mgr.check_timeout()  # Hard timeout check each attempt
+
+             # Break immediately if time is up
+             if timeout_mgr.get_remaining_time() <= 0:
+                 print(" ā° TIME UP! Breaking immediately.")
+                 break
+
+             # Evaluate current quality
+             quality_result = await self.evaluate_response_quality(current_response, classifier, classifier_params)
+             quality_progression.append(quality_result.score)
+
+             print(f" šŸ” Quality score: {quality_result.score:.3f}")
+             print(f" šŸ¤– Model judgment: {quality_result.reasoning}")
+
+             # Check if quality is acceptable
+             if quality_result.acceptable:
+                 print(" āœ… Quality acceptable! Stopping improvement loop.")
+                 break
+
+             if attempt >= max_attempts:
+                 print(" šŸ›‘ Maximum attempts reached. Using current response.")
+                 break
+
+             # Determine steering parameters for improvement (with memory)
+             print(" 🧠 Determining steering parameters...")
+             steering_params = await self._get_or_determine_steering_parameters(prompt, quality_result.score, attempt)
+             steering_params_used = steering_params
+
+             print(f" šŸŽ›ļø Steering: {steering_params.steering_method} (strength {steering_params.initial_strength})")
+             print(f" šŸ’­ Reasoning: {steering_params.reasoning}")
+
+             # Apply steering and regenerate
+             print(" šŸŽ›ļø Applying steering and regenerating...")
+             try:
+                 steered_response = await self._generate_with_steering(prompt, steering_params)
+                 current_response = steered_response
+                 print(f" šŸ“ New response: {current_response[:100]}...")
+
+             except Exception as e:
+                 print(f" āš ļø Steering failed: {e}")
+                 print(" šŸ“ Keeping previous response")
+                 break
+
+         # Final quality evaluation
+         print("\nšŸ” Final quality evaluation...")
+         timeout_mgr.check_timeout()
+
+         final_quality = await self.evaluate_response_quality(current_response, classifier, classifier_params)
+
+         total_time = time.time() - start_time
+
+         result = QualityControlledResponse(
+             response_text=current_response,
+             final_quality_score=final_quality.score,
+             attempts_needed=len(quality_progression),
+             classifier_params_used=classifier_params,
+             steering_params_used=steering_params_used,
+             quality_progression=quality_progression,
+             total_time_seconds=total_time,
+         )
+
+         # Store successful parameter combinations for future use
+         if final_quality.acceptable:
+             self._store_successful_parameters(prompt, classifier_params, steering_params_used, final_quality.score)
+
+         print("\nāœ… QUALITY CONTROL COMPLETE")
+         print(f" šŸ“ Final response: {result.response_text[:100]}...")
+         print(f" šŸ“Š Final quality: {result.final_quality_score:.3f}")
+         print(f" šŸ”„ Attempts: {result.attempts_needed}")
+         print(f" ā±ļø Total time: {result.total_time_seconds:.1f}s")
+         print(f" ā° Time used: {timeout_mgr.get_elapsed_time():.1f}s / {time_budget * 60:.1f}s")
+
+         return result
+
+     async def _generate_with_steering(self, prompt: str, steering_params: "SteeringParams") -> str:
+         """
+         Generate response with specified steering parameters.
+
+         Args:
+             prompt: The prompt to respond to
+             steering_params: Model-determined steering parameters
+
+         Returns:
+             Generated response with steering applied
+         """
+         print(
+             f" šŸŽ›ļø Applying {steering_params.steering_method} steering with strength {steering_params.initial_strength}"
+         )
+
+         # Set steering parameters for this generation
+         original_method = self.steering_method
+         original_strength = self.steering_strength
+         original_mode = self.steering_mode
+
+         try:
+             # Temporarily update steering configuration
+             self.steering_method = steering_params.steering_method
+             self.steering_strength = steering_params.initial_strength
+             self.steering_mode = True  # Enable steering for this generation
+
+             # Update method-specific parameters if needed
+             if steering_params.method_specific_params:
+                 for param, value in steering_params.method_specific_params.items():
+                     if hasattr(self, param):
+                         setattr(self, param, value)
+
+             # Generate with steering
+             response = await self._generate_response(prompt)
+
+             return response
+
+         finally:
+             # Restore original settings
+             self.steering_method = original_method
+             self.steering_strength = original_strength
+             self.steering_mode = original_mode
+
+     def _decide_if_improvement_needed(self, analysis: AnalysisResult) -> bool:
+         """Decide if the response needs improvement based on analysis."""
+         # If significant issues were found with high confidence, improve
+         if analysis.issues_found and analysis.confidence > 0.6:
+             return True
+
+         # If quality is low and we're confident about it, improve
+         if analysis.quality_score < 0.5 and analysis.confidence > 0.5:
+             return True
+
+         return False
+
+     def get_performance_summary(self) -> Dict[str, Any]:
+         """Get a summary of the agent's performance."""
+         if not self.enable_tracking:
+             return {"tracking_disabled": True}
+
+         total_improvements = len(self.improvement_history)
+         successful_improvements = sum(1 for imp in self.improvement_history if imp.success)
+         success_rate = successful_improvements / total_improvements if total_improvements > 0 else 0
+
+         return {
+             "total_improvements_attempted": total_improvements,
+             "successful_improvements": successful_improvements,
+             "success_rate": success_rate,
+             "total_analyses": len(self.analysis_history),
+             "decision_summary": self.decision_system.get_decision_summary()
+             if self.decision_system
+             else "No decisions made",
+         }
+
+     def get_marketplace_info(self) -> str:
+         """Get information about the classifier marketplace."""
+         if not self.marketplace:
+             return "Marketplace not initialized"
+         return self.marketplace.get_marketplace_summary()
+
+
+ async def demo_autonomous_agent():
+     """
+     Demo function showing the new autonomous agent with intelligent classifier selection.
+     """
+     print("šŸŽ­ AUTONOMOUS AGENT DEMO - Intelligent Classifier Selection")
+     print("=" * 60)
+
+     # Initialize agent
+     agent = AutonomousAgent()
+
+     try:
+         await agent.initialize(
+             quality_threshold=0.3,
+             default_time_budget_minutes=5.0,  # Short budget for demo
+         )
+
+         # Test prompts that should trigger different classifier selections
+         test_prompts = [
+             "Tell me about the history of the moon landing",  # Should detect hallucination risk
+             "What's the best way to lose weight quickly?",  # Should detect harmful/quality issues
+             "Are men naturally better at math than women?",  # Should detect bias
+             "Write a poem about spring flowers",  # Should detect quality needs
+             "Explain quantum physics in simple terms",  # Should detect quality/coherence needs
+         ]
+
+         for i, prompt in enumerate(test_prompts, 1):
+             print(f"\n{'=' * 20} Test {i} {'=' * 20}")
+
+             result = await agent.respond_autonomously(
+                 prompt=prompt,
+                 max_attempts=2,
+                 time_budget_minutes=2.0,  # Very short for demo
+             )
+
+             print("\nšŸ“‹ RESULT SUMMARY:")
+             print(f" Final Response: {result['final_response'][:100]}...")
+             print(f" Attempts: {result['attempts']}")
+             print(f" Improvements: {len(result['improvement_chain'])}")
+             print(f" Classifiers Used: {result['classifier_info']['count']}")
+             print(f" Classifier Types: {result['classifier_info']['types']}")
+
+         # Show overall performance
+         print("\nšŸ“Š OVERALL PERFORMANCE:")
+         summary = agent.get_performance_summary()
+         print(f" Total Improvements: {summary.get('total_improvements_attempted', 0)}")
+         print(f" Success Rate: {summary.get('success_rate', 0):.2%}")
+
+     except Exception as e:
+         print(f"āŒ Demo failed: {e}")
+         print("This is expected if no classifiers are available in the marketplace.")
+         print("The agent will create classifiers on demand when given sufficient time budget.")
+
+
+ if __name__ == "__main__":
+     asyncio.run(demo_autonomous_agent())